Explorar o código

Implement VP9 loop filtering (#550)

Unmerged PR from OG Ryujinx (#4367). From @gdkchan:

> The main goal of this change is porting the loop filtering from
libvpx, which should fix the block artifacts on some VP9 videos on games
using NVDEC to decode them. In addition to that, there are two other
changes:
> 
> - The remaining decoder code required to decode a VP9 video (with
headers included) has been added. That was done because it's much better
to test the decoder standalone with a video file. I decided to keep that
code on the emulator, even if some of it is unused, since it makes
standalone testing easier in the future too, and we can include unit
tests with video files.
> - Large refactoring of both new and existing code to conform with our
coding style, done by @TSRBerry (thanks!). Some of it has been
automated.
> 
> Since we had no loop filtering before, this change will make video
decoding slower. That may cause frame drops, etc., if the decoder is not
fast enough in some games. I plan to optimize the decoder more in the
future to make up for that. If possible, I'd prefer not to do it as
part of this PR, but if the perf loss is too severe I might consider it.
> 
> This will need to be tested on games that had the block artifacts. It
would be nice to confirm whether they match hardware now, and to get some
before/after screenshots, etc.

Comment from @Bjorn29512:

> Significantly improves the block artifacts in FE: Engage.
> 
> Before:
>
> ![](https://user-images.githubusercontent.com/110204265/216882414-ec88dbda-7544-4490-8a47-37f074056ae3.png)
> 
> After:
>
> ![](https://user-images.githubusercontent.com/110204265/216882478-4e81fead-1033-4877-b282-f9cac6d6aa3b.png)

---------

Co-authored-by: gdkchan <gab.dark.100@gmail.com>
Co-authored-by: TSR Berry <20988865+TSRBerry@users.noreply.github.com>
Keaton hai 1 ano
pai
achega
f91cd05260
Modificáronse 79 ficheiros con 11004 adicións e 2678 borrados
  1. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs
  2. 71 52
      src/Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs
  3. 2 2
      src/Ryujinx.Graphics.Nvdec.Vp9/Common/BitUtils.cs
  4. 6 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryAllocator.cs
  5. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryUtil.cs
  6. 37 6
      src/Ryujinx.Graphics.Nvdec.Vp9/Constants.cs
  7. 47 0
      src/Ryujinx.Graphics.Nvdec.Vp9/DSubExp.cs
  8. 904 120
      src/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs
  9. 160 179
      src/Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs
  10. 51 65
      src/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs
  11. 37 41
      src/Ryujinx.Graphics.Nvdec.Vp9/Detokenize.cs
  12. 83 81
      src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs
  13. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs
  14. 180 175
      src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs
  15. 232 245
      src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs
  16. 229 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterAuto.cs
  17. 1093 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterScalar.cs
  18. 1837 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterSse2.cs
  19. 15 19
      src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs
  20. 106 32
      src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs
  21. 36 36
      src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs
  22. 623 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Entropy.cs
  23. 400 0
      src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMode.cs
  24. 165 0
      src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMv.cs
  25. 79 0
      src/Ryujinx.Graphics.Nvdec.Vp9/FrameBuffers.cs
  26. 97 84
      src/Ryujinx.Graphics.Nvdec.Vp9/Idct.cs
  27. 2 2
      src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs
  28. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs
  29. 1706 148
      src/Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs
  30. 432 403
      src/Ryujinx.Graphics.Nvdec.Vp9/Luts.cs
  31. 92 48
      src/Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs
  32. 94 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Prob.cs
  33. 104 166
      src/Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs
  34. 84 0
      src/Ryujinx.Graphics.Nvdec.Vp9/ReadBitBuffer.cs
  35. 23 50
      src/Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs
  36. 80 183
      src/Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs
  37. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs
  38. 12 1
      src/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs
  39. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs
  40. 11 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/BitstreamProfile.cs
  41. 17 17
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs
  42. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs
  43. 18 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/BufferPool.cs
  44. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs
  45. 14 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs
  46. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs
  47. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs
  48. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs
  49. 44 33
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs
  50. 2 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs
  51. 49 8
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs
  52. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs
  53. 80 85
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs
  54. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs
  55. 12 12
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs
  56. 5 5
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs
  57. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs
  58. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs
  59. 5 5
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs
  60. 1 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs
  61. 17 17
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs
  62. 4 1
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs
  63. 12 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefCntBuffer.cs
  64. 6 6
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs
  65. 84 224
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs
  66. 7 7
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs
  67. 102 9
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs
  68. 189 33
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs
  69. 4 4
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs
  70. 8 8
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs
  71. 7 7
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs
  72. 7 7
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs
  73. 700 32
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs
  74. 410 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Decoder.cs
  75. 10 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxCodecFrameBuffer.cs
  76. 11 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorRange.cs
  77. 29 0
      src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorSpace.cs
  78. 2 0
      src/Ryujinx.Graphics.Nvdec/Types/Vp9/PictureInfo.cs
  79. 3 1
      src/Ryujinx.Graphics.Video/Vp9PictureInfo.cs

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs

@@ -6,4 +6,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         Bits10 = 10, // < 10 bits
         Bits12 = 12, // < 12 bits
     }
-}
+}

+ 71 - 52
src/Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs

@@ -1,56 +1,75 @@
-namespace Ryujinx.Graphics.Nvdec.Vp9
+namespace Ryujinx.Graphics.Nvdec.Vp9
 {
     internal enum CodecErr
     {
-        /*!\brief Operation completed without error */
-        CodecOk,
-
-        /*!\brief Unspecified error */
-        CodecError,
-
-        /*!\brief Memory operation failed */
-        CodecMemError,
-
-        /*!\brief ABI version mismatch */
-        CodecAbiMismatch,
-
-        /*!\brief Algorithm does not have required capability */
-        CodecIncapable,
-
-        /*!\brief The given bitstream is not supported.
-         *
-         * The bitstream was unable to be parsed at the highest level. The decoder
-         * is unable to proceed. This error \ref SHOULD be treated as fatal to the
-         * stream. */
-        CodecUnsupBitstream,
-
-        /*!\brief Encoded bitstream uses an unsupported feature
-         *
-         * The decoder does not implement a feature required by the encoder. This
-         * return code should only be used for features that prevent future
-         * pictures from being properly decoded. This error \ref MAY be treated as
-         * fatal to the stream or \ref MAY be treated as fatal to the current GOP.
-         */
-        CodecUnsupFeature,
-
-        /*!\brief The coded data for this stream is corrupt or incomplete
-         *
-         * There was a problem decoding the current frame.  This return code
-         * should only be used for failures that prevent future pictures from
-         * being properly decoded. This error \ref MAY be treated as fatal to the
-         * stream or \ref MAY be treated as fatal to the current GOP. If decoding
-         * is continued for the current GOP, artifacts may be present.
-         */
-        CodecCorruptFrame,
-
-        /*!\brief An application-supplied parameter is not valid.
-         *
-         */
-        CodecInvalidParam,
-
-        /*!\brief An iterator reached the end of list.
-         *
-         */
-        CodecListEnd,
+        /// <summary>
+        /// Operation completed without error
+        /// </summary>
+        Ok,
+
+        /// <summary>
+        /// Unspecified error
+        /// </summary>
+        Error,
+
+        /// <summary>
+        /// Memory operation failed
+        /// </summary>
+        MemError,
+
+        /// <summary>
+        /// ABI version mismatch
+        /// </summary>
+        AbiMismatch,
+
+        /// <summary>
+        /// Algorithm does not have required capability
+        /// </summary>
+        Incapable,
+
+        /// <summary>
+        /// The given bitstream is not supported.
+        /// </summary>
+        /// <remarks>
+        /// The bitstream was unable to be parsed at the highest level.<br/>
+        /// The decoder is unable to proceed.<br/>
+        /// This error SHOULD be treated as fatal to the stream.
+        /// </remarks>
+        UnsupBitstream,
+
+        /// <summary>
+        /// Encoded bitstream uses an unsupported feature
+        /// </summary>
+        /// <remarks>
+        /// The decoder does not implement a feature required by the encoder.<br/>
+        /// This return code should only be used for features that prevent future
+        /// pictures from being properly decoded.<br/>
+        /// <br/>
+        /// This error MAY be treated as fatal to the stream or MAY be treated as fatal to the current GOP.
+        /// </remarks>
+        UnsupFeature,
+
+         /// <summary>
+         /// The coded data for this stream is corrupt or incomplete.
+         /// </summary>
+         /// <remarks>
+         /// There was a problem decoding the current frame.<br/>
+         /// This return code should only be used
+         /// for failures that prevent future pictures from being properly decoded.<br/>
+         /// <br/>
+         /// This error MAY be treated as fatal to the stream or MAY be treated as fatal to the current GOP.<br/>
+         /// If decoding is continued for the current GOP, artifacts may be present.
+         /// </remarks>
+         CorruptFrame,
+
+        /// <summary>
+        /// An application-supplied parameter is not valid.
+        /// </summary>
+        InvalidParam,
+
+        /// <summary>
+        /// An iterator reached the end of list.
+        /// </summary>
+        ListEnd
     }
-}
+}

+ 2 - 2
src/Ryujinx.Graphics.Nvdec.Vp9/Common/BitUtils.cs

@@ -10,7 +10,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static byte ClipPixel(int val)
         {
-            return (byte)((val > 255) ? 255 : (val < 0) ? 0 : val);
+            return (byte)(val > 255 ? 255 : val < 0 ? 0 : val);
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -56,4 +56,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common
             return numValues > 0 ? GetMsb(numValues) + 1 : 0;
         }
     }
-}
+}

+ 6 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryAllocator.cs

@@ -51,6 +51,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common
                         {
                             Marshal.FreeHGlobal(item.Pointer);
                         }
+
                         item.Pointer = ptr;
                         item.Length = lengthInBytes;
                         break;
@@ -58,7 +59,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common
                 }
             }
 
-            return new ArrayPtr<T>(ptr, length);
+            ArrayPtr<T> allocation = new ArrayPtr<T>(ptr, length);
+
+            allocation.AsSpan().Fill(default);
+
+            return allocation;
         }
 
         public unsafe void Free<T>(ArrayPtr<T> arr) where T : unmanaged

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryUtil.cs

@@ -20,4 +20,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Common
             new Span<T>(ptr, length).Fill(value);
         }
     }
-}
+}

+ 37 - 6
src/Ryujinx.Graphics.Nvdec.Vp9/Constants.cs

@@ -1,8 +1,10 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+
 namespace Ryujinx.Graphics.Nvdec.Vp9
 {
     internal static class Constants
     {
-        public const int Vp9InterpExtend = 4;
+        public const int InterpExtend = 4;
 
         public const int MaxMbPlane = 3;
 
@@ -25,6 +27,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         /* Segment Feature Masks */
         public const int MaxMvRefCandidates = 2;
 
+        public const int IntraInterContexts = 4;
         public const int CompInterContexts = 5;
         public const int RefContexts = 5;
 
@@ -32,12 +35,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         public const int EightTapSmooth = 1;
         public const int EightTapSharp = 2;
         public const int SwitchableFilters = 3; /* Number of switchable filters */
+
         public const int Bilinear = 3;
-        public const int Switchable = 4; /* should be the last one */
+
+        // The codec can operate in four possible inter prediction filter modes:
+        // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+        public const int SwitchableFilterContexts = SwitchableFilters + 1;
+        public const int Switchable = 4; /* Should be the last one */
 
         // Frame
         public const int RefsPerFrame = 3;
 
+        public const int RefFramesLog2 = 3;
+        public const int RefFrames = 1 << RefFramesLog2;
+
+        // 1 scratch frame for the new frame, 3 for scaled references on the encoder.
+        public const int FrameBuffers = RefFrames + 4;
+
+        public const int FrameContextsLog2 = 2;
+        public const int FrameContexts = 1 << FrameContextsLog2;
+
         public const int NumPingPongBuffers = 2;
 
         public const int Class0Bits = 1; /* bits at integer precision for class 0 */
@@ -48,9 +65,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         public const int MvLow = -(1 << MvInUseBits);
 
         // Coefficient token alphabet
-        public const int ZeroToken = 0; // 0 Extra Bits 0+0
-        public const int OneToken = 1; // 1 Extra Bits 0+1
-        public const int TwoToken = 2; // 2 Extra Bits 0+1
+        public const int ZeroToken = 0; // 0     Extra Bits 0+0
+        public const int OneToken = 1; // 1     Extra Bits 0+1
+        public const int TwoToken = 2; // 2     Extra Bits 0+1
 
         public const int PivotNode = 2;
 
@@ -65,5 +82,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
         public const int SegmentAbsData = 1;
         public const int MaxSegments = 8;
+
+        public const int PartitionTypes = (int)PartitionType.PartitionTypes;
+
+        public const int PartitionPlOffset = 4; // Number of probability models per block size
+        public const int PartitionContexts = 4 * PartitionPlOffset;
+
+        public const int PlaneTypes = (int)PlaneType.PlaneTypes;
+
+        public const int IntraModes = (int)PredictionMode.TmPred + 1;
+
+        public const int InterModes = 1 + (int)PredictionMode.NewMv - (int)PredictionMode.NearestMv;
+
+        public const int SkipContexts = 3;
+        public const int InterModeContexts = 7;
     }
-}
+}

+ 47 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/DSubExp.cs

@@ -0,0 +1,47 @@
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class DSubExp
+    {
+        public static int InvRecenterNonneg(int v, int m)
+        {
+            if (v > 2 * m)
+            {
+                return v;
+            }
+
+            return (v & 1) != 0 ? m - ((v + 1) >> 1) : m + (v >> 1);
+        }
+
+        private static readonly byte[] InvMapTable =
+        {
+            7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176, 189, 202, 215, 228, 241, 254, 1, 2, 3, 4,
+            5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 34,
+            35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62,
+            63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 90,
+            91, 92, 93, 94, 95, 96, 97, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114,
+            115, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
+            138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+            160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 177, 178, 179, 180, 181, 182,
+            183, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205,
+            206, 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227,
+            229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250,
+            251, 252, 253, 253
+        };
+
+        public static int InvRemapProb(int v, int m)
+        {
+            Debug.Assert(v < InvMapTable.Length / sizeof(byte));
+
+            v = InvMapTable[v];
+            m--;
+            if (m << 1 <= Prob.MaxProb)
+            {
+                return 1 + InvRecenterNonneg(v, m);
+            }
+
+            return Prob.MaxProb - InvRecenterNonneg(v, Prob.MaxProb - 1 - m);
+        }
+    }
+}

+ 904 - 120
src/Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Nvdec.Vp9.Common;
 using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
@@ -12,14 +12,96 @@ using System.Threading.Tasks;
 
 namespace Ryujinx.Graphics.Nvdec.Vp9
 {
-    static class DecodeFrame
+    internal static class DecodeFrame
     {
         private static bool ReadIsValid(ArrayPtr<byte> start, int len)
         {
             return len != 0 && len <= start.Length;
         }
 
-        private static void InverseTransformBlockInter(ref MacroBlockD xd, int plane, TxSize txSize, Span<byte> dst, int stride, int eob)
+        private static void ReadTxModeProbs(ref Vp9EntropyProbs txProbs, ref Reader r)
+        {
+            for (int i = 0; i < EntropyMode.TxSizeContexts; ++i)
+            {
+                for (int j = 0; j < (int)TxSize.TxSizes - 3; ++j)
+                {
+                    r.DiffUpdateProb(ref txProbs.Tx8x8Prob[i][j]);
+                }
+            }
+
+            for (int i = 0; i < EntropyMode.TxSizeContexts; ++i)
+            {
+                for (int j = 0; j < (int)TxSize.TxSizes - 2; ++j)
+                {
+                    r.DiffUpdateProb(ref txProbs.Tx16x16Prob[i][j]);
+                }
+            }
+
+            for (int i = 0; i < EntropyMode.TxSizeContexts; ++i)
+            {
+                for (int j = 0; j < (int)TxSize.TxSizes - 1; ++j)
+                {
+                    r.DiffUpdateProb(ref txProbs.Tx32x32Prob[i][j]);
+                }
+            }
+        }
+
+        private static void ReadSwitchableInterpProbs(ref Vp9EntropyProbs fc, ref Reader r)
+        {
+            for (int j = 0; j < Constants.SwitchableFilterContexts; ++j)
+            {
+                for (int i = 0; i < Constants.SwitchableFilters - 1; ++i)
+                {
+                    r.DiffUpdateProb(ref fc.SwitchableInterpProb[j][i]);
+                }
+            }
+        }
+
+        private static void ReadInterModeProbs(ref Vp9EntropyProbs fc, ref Reader r)
+        {
+            for (int i = 0; i < Constants.InterModeContexts; ++i)
+            {
+                for (int j = 0; j < Constants.InterModes - 1; ++j)
+                {
+                    r.DiffUpdateProb( ref fc.InterModeProb[i][j]);
+                }
+            }
+        }
+
+        private static void ReadMvProbs(ref Vp9EntropyProbs ctx, bool allowHp, ref Reader r)
+        {
+            r.UpdateMvProbs(ctx.Joints.AsSpan(), EntropyMv.Joints - 1);
+
+            for (int i = 0; i < 2; ++i)
+            {
+                r.UpdateMvProbs(MemoryMarshal.CreateSpan(ref ctx.Sign[i], 1), 1);
+                r.UpdateMvProbs(ctx.Classes[i].AsSpan(), EntropyMv.Classes - 1);
+                r.UpdateMvProbs(ctx.Class0[i].AsSpan(), EntropyMv.Class0Size - 1);
+                r.UpdateMvProbs(ctx.Bits[i].AsSpan(), EntropyMv.OffsetBits);
+            }
+
+            for (int i = 0; i < 2; ++i)
+            {
+                for (int j = 0; j < EntropyMv.Class0Size; ++j)
+                {
+                    r.UpdateMvProbs(ctx.Class0Fp[i][j].AsSpan(), EntropyMv.FpSize - 1);
+                }
+
+                r.UpdateMvProbs(ctx.Fp[i].AsSpan(), 3);
+            }
+
+            if (allowHp)
+            {
+                for (int i = 0; i < 2; ++i)
+                {
+                    r.UpdateMvProbs(MemoryMarshal.CreateSpan(ref ctx.Class0Hp[i], 1), 1);
+                    r.UpdateMvProbs(MemoryMarshal.CreateSpan(ref ctx.Hp[i], 1), 1);
+                }
+            }
+        }
+
+        private static void InverseTransformBlockInter(ref MacroBlockD xd, int plane, TxSize txSize, Span<byte> dst,
+            int stride, int eob)
         {
             ref MacroBlockDPlane pd = ref xd.Plane[plane];
             ArrayPtr<int> dqcoeff = pd.DqCoeff;
@@ -90,15 +172,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             {
                 if (txSize <= TxSize.Tx16x16 && eob <= 10)
                 {
-                    dqcoeff.AsSpan()[..(4 * (4 << (int)txSize))].Clear();
+                    dqcoeff.AsSpan().Slice(0, 4 * (4 << (int)txSize)).Clear();
                 }
                 else if (txSize == TxSize.Tx32x32 && eob <= 34)
                 {
-                    dqcoeff.AsSpan()[..256].Clear();
+                    dqcoeff.AsSpan().Slice(0, 256).Clear();
                 }
                 else
                 {
-                    dqcoeff.AsSpan()[..(16 << ((int)txSize << 1))].Clear();
+                    dqcoeff.AsSpan().Slice(0, 16 << ((int)txSize << 1)).Clear();
                 }
             }
         }
@@ -181,15 +263,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             {
                 if (txType == TxType.DctDct && txSize <= TxSize.Tx16x16 && eob <= 10)
                 {
-                    dqcoeff.AsSpan()[..(4 * (4 << (int)txSize))].Clear();
+                    dqcoeff.AsSpan().Slice(0, 4 * (4 << (int)txSize)).Clear();
                 }
                 else if (txSize == TxSize.Tx32x32 && eob <= 34)
                 {
-                    dqcoeff.AsSpan()[..256].Clear();
+                    dqcoeff.AsSpan().Slice(0, 256).Clear();
                 }
                 else
                 {
-                    dqcoeff.AsSpan()[..(16 << ((int)txSize << 1))].Clear();
+                    dqcoeff.AsSpan().Slice(0, 16 << ((int)txSize << 1)).Clear();
                 }
             }
         }
@@ -204,10 +286,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         {
             ref MacroBlockD xd = ref twd.Xd;
             ref MacroBlockDPlane pd = ref xd.Plane[plane];
-            PredictionMode mode = (plane == 0) ? mi.Mode : mi.UvMode;
-            int dstOffset = 4 * row * pd.Dst.Stride + 4 * col;
+            PredictionMode mode = plane == 0 ? mi.Mode : mi.UvMode;
+            int dstOffset = (4 * row * pd.Dst.Stride) + (4 * col);
             byte* dst = &pd.Dst.Buf.ToPointer()[dstOffset];
-            Span<byte> dstSpan = pd.Dst.Buf.AsSpan()[dstOffset..];
+            Span<byte> dstSpan = pd.Dst.Buf.AsSpan().Slice(dstOffset);
 
             if (mi.SbType < BlockSize.Block8x8)
             {
@@ -217,15 +299,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 }
             }
 
-            ReconIntra.PredictIntraBlock(ref xd, pd.N4Wl, txSize, mode, dst, pd.Dst.Stride, dst, pd.Dst.Stride, col, row, plane);
+            ReconIntra.PredictIntraBlock(ref xd, pd.N4Wl, txSize, mode, dst, pd.Dst.Stride, dst, pd.Dst.Stride, col,
+                row, plane);
 
             if (mi.Skip == 0)
             {
                 TxType txType =
-                    (plane != 0 || xd.Lossless) ? TxType.DctDct : ReconIntra.IntraModeToTxTypeLookup[(int)mode];
-                var sc = (plane != 0 || xd.Lossless)
-                    ? Luts.Vp9DefaultScanOrders[(int)txSize]
-                    : Luts.Vp9ScanOrders[(int)txSize][(int)txType];
+                    plane != 0 || xd.Lossless ? TxType.DctDct : ReconIntra.IntraModeToTxTypeLookup[(int)mode];
+                Luts.ScanOrder sc = plane != 0 || xd.Lossless
+                    ? Luts.DefaultScanOrders[(int)txSize]
+                    : Luts.ScanOrders[(int)txSize][(int)txType];
                 int eob = Detokenize.DecodeBlockTokens(ref twd, plane, sc, col, row, txSize, mi.SegmentId);
                 if (eob > 0)
                 {
@@ -244,14 +327,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         {
             ref MacroBlockD xd = ref twd.Xd;
             ref MacroBlockDPlane pd = ref xd.Plane[plane];
-            var sc = Luts.Vp9DefaultScanOrders[(int)txSize];
+            Luts.ScanOrder sc = Luts.DefaultScanOrders[(int)txSize];
             int eob = Detokenize.DecodeBlockTokens(ref twd, plane, sc, col, row, txSize, mi.SegmentId);
-            Span<byte> dst = pd.Dst.Buf.AsSpan()[(4 * row * pd.Dst.Stride + 4 * col)..];
+            Span<byte> dst = pd.Dst.Buf.AsSpan().Slice((4 * row * pd.Dst.Stride) + (4 * col));
 
             if (eob > 0)
             {
                 InverseTransformBlockInter(ref xd, plane, txSize, dst, pd.Dst.Stride, eob);
             }
+
             return eob;
         }
 
@@ -268,7 +352,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int h)
         {
             // Get a pointer to the start of the real data for this row.
-            byte* refRow = src - x - y * srcStride;
+            byte* refRow = src - x - (y * srcStride);
 
             if (y >= h)
             {
@@ -340,7 +424,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         {
             // Get a pointer to the start of the real data for this row.
             ushort* src = (ushort*)src8;
-            ushort* refRow = src - x - y * srcStride;
+            ushort* refRow = src - x - (y * srcStride);
 
             if (y >= h)
             {
@@ -483,9 +567,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int refr)
         {
             ref MacroBlockDPlane pd = ref xd.Plane[plane];
-            byte* dst = dstBuf.Buf.ToPointer() + dstBuf.Stride * y + x;
+            byte* dst = dstBuf.Buf.ToPointer() + (dstBuf.Stride * y) + x;
             Mv32 scaledMv;
-            int xs, ys, x0, y0, x0_16, y0_16, frameWidth, frameHeight, bufStride, subpelX, subpelY;
+            int xs, ys, x0, y0, x016, y016, frameWidth, frameHeight, bufStride, subpelX, subpelY;
             byte* refFrame;
             byte* bufPtr;
 
@@ -507,16 +591,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             {
                 Mv mvQ4 = ReconInter.ClampMvToUmvBorderSb(ref xd, ref mv, bw, bh, pd.SubsamplingX, pd.SubsamplingY);
                 // Co-ordinate of containing block to pixel precision.
-                int xStart = (-xd.MbToLeftEdge >> (3 + pd.SubsamplingX));
-                int yStart = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY));
+                int xStart = -xd.MbToLeftEdge >> (3 + pd.SubsamplingX);
+                int yStart = -xd.MbToTopEdge >> (3 + pd.SubsamplingY);
                 // Co-ordinate of the block to 1/16th pixel precision.
-                x0_16 = (xStart + x) << Filter.SubpelBits;
-                y0_16 = (yStart + y) << Filter.SubpelBits;
+                x016 = (xStart + x) << Filter.SubpelBits;
+                y016 = (yStart + y) << Filter.SubpelBits;
 
                 // Co-ordinate of current block in reference frame
                 // to 1/16th pixel precision.
-                x0_16 = sf.ScaleValueX(x0_16);
-                y0_16 = sf.ScaleValueY(y0_16);
+                x016 = sf.ScaleValueX(x016);
+                y016 = sf.ScaleValueY(y016);
 
                 // Map the top left corner of the block into the reference frame.
                 x0 = sf.ScaleValueX(xStart + x);
@@ -535,13 +619,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 y0 = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY)) + y;
 
                 // Co-ordinate of the block to 1/16th pixel precision.
-                x0_16 = x0 << Filter.SubpelBits;
-                y0_16 = y0 << Filter.SubpelBits;
+                x016 = x0 << Filter.SubpelBits;
+                y016 = y0 << Filter.SubpelBits;
 
                 scaledMv.Row = mv.Row * (1 << (1 - pd.SubsamplingY));
                 scaledMv.Col = mv.Col * (1 << (1 - pd.SubsamplingX));
                 xs = ys = 16;
             }
+
             subpelX = scaledMv.Col & Filter.SubpelMask;
             subpelY = scaledMv.Row & Filter.SubpelMask;
 
@@ -549,34 +634,35 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // reference frame.
             x0 += scaledMv.Col >> Filter.SubpelBits;
             y0 += scaledMv.Row >> Filter.SubpelBits;
-            x0_16 += scaledMv.Col;
-            y0_16 += scaledMv.Row;
+            x016 += scaledMv.Col;
+            y016 += scaledMv.Row;
 
             // Get reference block pointer.
-            bufPtr = refFrame + y0 * preBuf.Stride + x0;
+            bufPtr = refFrame + (y0 * preBuf.Stride) + x0;
             bufStride = preBuf.Stride;
 
             // Do border extension if there is motion or the
             // width/height is not a multiple of 8 pixels.
-            if (isScaled || scaledMv.Col != 0 || scaledMv.Row != 0 || (frameWidth & 0x7) != 0 || (frameHeight & 0x7) != 0)
+            if (isScaled || scaledMv.Col != 0 || scaledMv.Row != 0 || (frameWidth & 0x7) != 0 ||
+                (frameHeight & 0x7) != 0)
             {
-                int y1 = ((y0_16 + (h - 1) * ys) >> Filter.SubpelBits) + 1;
+                int y1 = ((y016 + ((h - 1) * ys)) >> Filter.SubpelBits) + 1;
 
                 // Get reference block bottom right horizontal coordinate.
-                int x1 = ((x0_16 + (w - 1) * xs) >> Filter.SubpelBits) + 1;
+                int x1 = ((x016 + ((w - 1) * xs)) >> Filter.SubpelBits) + 1;
                 int xPad = 0, yPad = 0;
 
-                if (subpelX != 0 || (sf.XStepQ4 != Filter.SubpelShifts))
+                if (subpelX != 0 || sf.XStepQ4 != Filter.SubpelShifts)
                 {
-                    x0 -= Constants.Vp9InterpExtend - 1;
-                    x1 += Constants.Vp9InterpExtend;
+                    x0 -= Constants.InterpExtend - 1;
+                    x1 += Constants.InterpExtend;
                     xPad = 1;
                 }
 
-                if (subpelY != 0 || (sf.YStepQ4 != Filter.SubpelShifts))
+                if (subpelY != 0 || sf.YStepQ4 != Filter.SubpelShifts)
                 {
-                    y0 -= Constants.Vp9InterpExtend - 1;
-                    y1 += Constants.Vp9InterpExtend;
+                    y0 -= Constants.InterpExtend - 1;
+                    y1 += Constants.InterpExtend;
                     yPad = 1;
                 }
 
@@ -585,10 +671,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     y0 < 0 || y0 > frameHeight - 1 || y1 < 0 || y1 > frameHeight - 1)
                 {
                     // Extend the border.
-                    byte* bufPtr1 = refFrame + y0 * bufStride + x0;
+                    byte* bufPtr1 = refFrame + (y0 * bufStride) + x0;
                     int bW = x1 - x0 + 1;
                     int bH = y1 - y0 + 1;
-                    int borderOffset = yPad * 3 * bW + xPad * 3;
+                    int borderOffset = (yPad * 3 * bW) + (xPad * 3);
 
                     ExtendAndPredict(
                         bufPtr1,
@@ -612,7 +698,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         refr,
                         xs,
                         ys);
-
                     return;
                 }
             }
@@ -660,7 +745,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int miX = miCol * Constants.MiSize;
             int miY = miRow * Constants.MiSize;
             ref ModeInfo mi = ref xd.Mi[0].Value;
-            Array8<short>[] kernel = Luts.Vp9FilterKernels[mi.InterpFilter];
+            Array8<short>[] kernel = Luts.FilterKernels[mi.InterpFilter];
             BlockSize sbType = mi.SbType;
             int isCompound = mi.HasSecondRef() ? 1 : 0;
             int refr;
@@ -675,11 +760,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
                 if (!sf.IsValidScale())
                 {
-                    xd.ErrorInfo.Value.InternalError(CodecErr.CodecUnsupBitstream, "Reference frame has invalid dimensions");
+                    xd.ErrorInfo.Value.InternalError(CodecErr.UnsupBitstream,
+                        "Reference frame has invalid dimensions");
                 }
 
                 isScaled = sf.IsScaled();
-                ReconInter.SetupPrePlanes(ref xd, refr, ref refFrameBuf, miRow, miCol, isScaled ? new Ptr<ScaleFactors>(ref sf) : Ptr<ScaleFactors>.Null);
+                ReconInter.SetupPrePlanes(ref xd, refr, ref refFrameBuf, miRow, miCol,
+                    isScaled ? new Ptr<ScaleFactors>(ref sf) : Ptr<ScaleFactors>.Null);
                 xd.BlockRefs[refr] = new Ptr<RefBuffer>(ref refBuf);
 
                 if (sbType < BlockSize.Block8x8)
@@ -693,10 +780,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         int n4Wx4 = 4 * num4x4W;
                         int n4Hx4 = 4 * num4x4H;
                         ref Buf2D preBuf = ref pd.Pre[refr];
-                        int i = 0, x, y;
-                        for (y = 0; y < num4x4H; ++y)
+                        int i = 0;
+                        for (int y = 0; y < num4x4H; ++y)
                         {
-                            for (x = 0; x < num4x4W; ++x)
+                            for (int x = 0; x < num4x4W; ++x)
                             {
                                 Mv mv = ReconInter.AverageSplitMvs(ref pd, ref mi, refr, i++);
                                 DecBuildInterPredictors(
@@ -758,21 +845,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
         }
 
-        private static unsafe void DecResetSkipContext(ref MacroBlockD xd)
-        {
-            int i;
-            for (i = 0; i < Constants.MaxMbPlane; i++)
-            {
-                ref MacroBlockDPlane pd = ref xd.Plane[i];
-                MemoryUtil.Fill(pd.AboveContext.ToPointer(), (sbyte)0, pd.N4W);
-                MemoryUtil.Fill(pd.LeftContext.ToPointer(), (sbyte)0, pd.N4H);
-            }
-        }
-
         private static void SetPlaneN4(ref MacroBlockD xd, int bw, int bh, int bwl, int bhl)
         {
-            int i;
-            for (i = 0; i < Constants.MaxMbPlane; i++)
+            for (int i = 0; i < Constants.MaxMbPlane; i++)
             {
                 xd.Plane[i].N4W = (ushort)((bw << 1) >> xd.Plane[i].SubsamplingX);
                 xd.Plane[i].N4H = (ushort)((bh << 1) >> xd.Plane[i].SubsamplingY);
@@ -794,18 +869,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int bwl,
             int bhl)
         {
-            int offset = miRow * cm.MiStride + miCol;
-            int x, y;
+            int offset = (miRow * cm.MiStride) + miCol;
+
             ref TileInfo tile = ref xd.Tile;
 
             xd.Mi = cm.MiGridVisible.Slice(offset);
             xd.Mi[0] = new Ptr<ModeInfo>(ref cm.Mi[offset]);
             xd.Mi[0].Value.SbType = bsize;
-            for (y = 0; y < yMis; ++y)
+            for (int y = 0; y < yMis; ++y)
             {
-                for (x = y == 0 ? 1 : 0; x < xMis; ++x)
+                for (int x = y == 0 ? 1 : 0; x < xMis; ++x)
                 {
-                    xd.Mi[y * cm.MiStride + x] = xd.Mi[0];
+                    xd.Mi[(y * cm.MiStride) + x] = xd.Mi[0];
                 }
             }
 
@@ -818,7 +893,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             xd.SetMiRowCol(ref tile, miRow, bh, miCol, bw, cm.MiRows, cm.MiCols);
 
             ReconInter.SetupDstPlanes(ref xd.Plane, ref xd.CurBuf, miRow, miCol);
-
             return ref xd.Mi[0].Value;
         }
 
@@ -846,7 +920,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 BlockSize uvSubsize = Luts.SsSizeLookup[(int)bsize][cm.SubsamplingX][cm.SubsamplingY];
                 if (uvSubsize == BlockSize.BlockInvalid)
                 {
-                    xd.ErrorInfo.Value.InternalError(CodecErr.CodecCorruptFrame, "Invalid block size.");
+                    xd.ErrorInfo.Value.InternalError(CodecErr.CorruptFrame, "Invalid block size.");
                 }
             }
 
@@ -854,7 +928,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
             if (mi.Skip != 0)
             {
-                DecResetSkipContext(ref xd);
+                xd.DecResetSkipContext();
             }
 
             if (!mi.IsInterBlock())
@@ -868,8 +942,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     int num4x4H = pd.N4H;
                     int step = 1 << (int)txSize;
                     int row, col;
-                    int maxBlocksWide = num4x4W + (xd.MbToRightEdge >= 0 ? 0 : xd.MbToRightEdge >> (5 + pd.SubsamplingX));
-                    int maxBlocksHigh = num4x4H + (xd.MbToBottomEdge >= 0 ? 0 : xd.MbToBottomEdge >> (5 + pd.SubsamplingY));
+                    int maxBlocksWide =
+                        num4x4W + (xd.MbToRightEdge >= 0 ? 0 : xd.MbToRightEdge >> (5 + pd.SubsamplingX));
+                    int maxBlocksHigh =
+                        num4x4H + (xd.MbToBottomEdge >= 0 ? 0 : xd.MbToBottomEdge >> (5 + pd.SubsamplingY));
 
                     xd.MaxBlocksWide = (uint)(xd.MbToRightEdge >= 0 ? 0 : maxBlocksWide);
                     xd.MaxBlocksHigh = (uint)(xd.MbToBottomEdge >= 0 ? 0 : maxBlocksHigh);
@@ -902,8 +978,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         int num4x4H = pd.N4H;
                         int step = 1 << (int)txSize;
                         int row, col;
-                        int maxBlocksWide = num4x4W + (xd.MbToRightEdge >= 0 ? 0 : xd.MbToRightEdge >> (5 + pd.SubsamplingX));
-                        int maxBlocksHigh = num4x4H + (xd.MbToBottomEdge >= 0 ? 0 : xd.MbToBottomEdge >> (5 + pd.SubsamplingY));
+                        int maxBlocksWide =
+                            num4x4W + (xd.MbToRightEdge >= 0 ? 0 : xd.MbToRightEdge >> (5 + pd.SubsamplingX));
+                        int maxBlocksHigh = num4x4H +
+                                            (xd.MbToBottomEdge >= 0 ? 0 : xd.MbToBottomEdge >> (5 + pd.SubsamplingY));
 
                         xd.MaxBlocksWide = (uint)(xd.MbToRightEdge >= 0 ? 0 : maxBlocksWide);
                         xd.MaxBlocksHigh = (uint)(xd.MbToBottomEdge >= 0 ? 0 : maxBlocksHigh);
@@ -932,15 +1010,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
         }
 
-        private static int DecPartitionPlaneContext(ref TileWorkerData twd, int miRow, int miCol, int bsl)
-        {
-            ref sbyte aboveCtx = ref twd.Xd.AboveSegContext[miCol];
-            ref sbyte leftCtx = ref twd.Xd.LeftSegContext[miRow & Constants.MiMask];
-            int above = (aboveCtx >> bsl) & 1, left = (leftCtx >> bsl) & 1;
-
-            return (left * 2 + above) + bsl * Constants.PartitionPloffset;
-        }
-
         private static void DecUpdatePartitionContext(
             ref TileWorkerData twd,
             int miRow,
@@ -949,13 +1018,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int bw)
         {
             Span<sbyte> aboveCtx = twd.Xd.AboveSegContext.Slice(miCol).AsSpan();
-            Span<sbyte> leftCtx = MemoryMarshal.CreateSpan(ref twd.Xd.LeftSegContext[miRow & Constants.MiMask], 8 - (miRow & Constants.MiMask));
+            Span<sbyte> leftCtx = MemoryMarshal.CreateSpan(ref twd.Xd.LeftSegContext[miRow & Constants.MiMask],
+                8 - (miRow & Constants.MiMask));
 
             // Update the partition context at the end notes. Set partition bits
             // of block sizes larger than the current one to be one, and partition
             // bits of smaller block sizes to be zero.
-            aboveCtx[..bw].Fill(Luts.PartitionContextLookup[(int)subsize].Above);
-            leftCtx[..bw].Fill(Luts.PartitionContextLookup[(int)subsize].Left);
+            aboveCtx.Slice(0, bw).Fill(Luts.PartitionContextLookup[(int)subsize].Above);
+            leftCtx.Slice(0, bw).Fill(Luts.PartitionContextLookup[(int)subsize].Left);
         }
 
         private static PartitionType ReadPartition(
@@ -966,14 +1036,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int hasCols,
             int bsl)
         {
-            int ctx = DecPartitionPlaneContext(ref twd, miRow, miCol, bsl);
+            int ctx = twd.DecPartitionPlaneContext(miRow, miCol, bsl);
             ReadOnlySpan<byte> probs = MemoryMarshal.CreateReadOnlySpan(ref twd.Xd.PartitionProbs[ctx][0], 3);
             PartitionType p;
             ref Reader r = ref twd.BitReader;
 
             if (hasRows != 0 && hasCols != 0)
             {
-                p = (PartitionType)r.ReadTree(Luts.Vp9PartitionTree, probs);
+                p = (PartitionType)r.ReadTree(Luts.PartitionTree, probs);
             }
             else if (hasRows == 0 && hasCols != 0)
             {
@@ -1009,8 +1079,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int hbs = num8x8Wh >> 1;
             PartitionType partition;
             BlockSize subsize;
-            bool hasRows = (miRow + hbs) < cm.MiRows;
-            bool hasCols = (miCol + hbs) < cm.MiCols;
+            bool hasRows = miRow + hbs < cm.MiRows;
+            bool hasCols = miCol + hbs < cm.MiCols;
             ref MacroBlockD xd = ref twd.Xd;
 
             if (miRow >= cm.MiRows || miCol >= cm.MiCols)
@@ -1063,7 +1133,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
 
             // Update partition context
-            if (bsize >= BlockSize.Block8x8 && (bsize == BlockSize.Block8x8 || partition != PartitionType.PartitionSplit))
+            if (bsize >= BlockSize.Block8x8 &&
+                (bsize == BlockSize.Block8x8 || partition != PartitionType.PartitionSplit))
             {
                 DecUpdatePartitionContext(ref twd, miRow, miCol, subsize, num8x8Wh);
             }
@@ -1079,13 +1150,255 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // partition can't be fully read then throw an error.
             if (!ReadIsValid(data, readSize))
             {
-                errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile length");
+                errorInfo.InternalError(CodecErr.CorruptFrame, "Truncated packet or corrupt tile length");
             }
 
             if (r.Init(data, readSize))
             {
-                errorInfo.InternalError(CodecErr.CodecMemError, "Failed to allocate bool decoder 1");
+                errorInfo.InternalError(CodecErr.MemError, "Failed to allocate bool decoder 1");
+            }
+        }
+
+        // Reads the coefficient probability updates for one transform size.
+        // A leading bit says whether any update follows; when it is clear the
+        // table is left untouched. The txSize argument is not consulted here.
+        private static void ReadCoefProbsCommon(ref Array2<Array2<Array6<Array6<Array3<byte>>>>> coefProbs,
+            ref Reader r, int txSize)
+        {
+            if (r.ReadBit() == 0)
+            {
+                return;
+            }
+
+            for (int plane = 0; plane < Constants.PlaneTypes; ++plane)
+            {
+                for (int refType = 0; refType < Entropy.RefTypes; ++refType)
+                {
+                    for (int band = 0; band < Entropy.CoefBands; ++band)
+                    {
+                        // The number of contexts is queried per band.
+                        for (int ctx = 0; ctx < Entropy.BAND_COEFF_CONTEXTS(band); ++ctx)
+                        {
+                            for (int node = 0; node < Entropy.UnconstrainedNodes; ++node)
+                            {
+                                r.DiffUpdateProb(ref coefProbs[plane][refType][band][ctx][node]);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Reads coefficient probability updates for every transform size from Tx4x4 up to
+        // the largest size that Luts.TxModeToBiggestTxSize gives for txMode (inclusive).
+        private static void ReadCoefProbs(ref Vp9EntropyProbs fc, TxMode txMode, ref Reader r)
+        {
+            int largest = (int)Luts.TxModeToBiggestTxSize[(int)txMode];
+            int size = (int)TxSize.Tx4x4;
+
+            while (size <= largest)
+            {
+                ReadCoefProbsCommon(ref fc.CoefProbs[size], ref r, size);
+                ++size;
+            }
+        }
+
+        // Fills the loop filter parameters from the bit buffer: a 6-bit filter level, a 3-bit
+        // sharpness level, and optionally the per-reference and per-mode deltas.
+        private static void SetupLoopfilter(ref Types.LoopFilter lf, ref ReadBitBuffer rb)
+        {
+            lf.FilterLevel = rb.ReadLiteral(6);
+            lf.SharpnessLevel = rb.ReadLiteral(3);
+
+            // Loop filter deltas are applied at the MB level based on mode or ref frame.
+            // The update flag defaults to false and is only read when deltas are enabled.
+            lf.ModeRefDeltaUpdate = false;
+
+            lf.ModeRefDeltaEnabled = rb.ReadBit() != 0;
+            if (!lf.ModeRefDeltaEnabled)
+            {
+                return;
+            }
+
+            lf.ModeRefDeltaUpdate = rb.ReadBit() != 0;
+            if (!lf.ModeRefDeltaUpdate)
+            {
+                return;
+            }
+
+            // Each delta is preceded by a bit telling whether a new value is present;
+            // entries without that bit keep their previous value.
+            for (int refIdx = 0; refIdx < LoopFilter.MaxRefLfDeltas; refIdx++)
+            {
+                if (rb.ReadBit() != 0)
+                {
+                    lf.RefDeltas[refIdx] = (sbyte)rb.ReadSignedLiteral(6);
+                }
+            }
+
+            for (int modeIdx = 0; modeIdx < LoopFilter.MaxModeLfDeltas; modeIdx++)
+            {
+                if (rb.ReadBit() != 0)
+                {
+                    lf.ModeDeltas[modeIdx] = (sbyte)rb.ReadSignedLiteral(6);
+                }
+            }
+        }
+
+        // Reads the base quantizer index and the three delta values from the bit buffer,
+        // then derives the lossless flag and the bit depth stored on the macroblock.
+        private static void SetupQuantization(ref Vp9Common cm, ref MacroBlockD xd, ref ReadBitBuffer rb)
+        {
+            cm.BaseQindex = rb.ReadLiteral(QuantCommon.QindexBits);
+            cm.YDcDeltaQ = rb.ReadDeltaQ();
+            cm.UvDcDeltaQ = rb.ReadDeltaQ();
+            cm.UvAcDeltaQ = rb.ReadDeltaQ();
+            cm.DequantBitDepth = cm.BitDepth;
+
+            // Lossless only when the base index and every delta are zero.
+            bool anyNonZero = cm.BaseQindex != 0 ||
+                              cm.YDcDeltaQ != 0 ||
+                              cm.UvDcDeltaQ != 0 ||
+                              cm.UvAcDeltaQ != 0;
+            xd.Lossless = !anyNonZero;
+
+            xd.Bd = (int)cm.BitDepth;
+        }
+
+        // Lookup table used by ReadInterpFilter: indexed by the 2-bit literal read from the
+        // bit buffer, it yields the matching interpolation filter constant.
+        private static readonly byte[] LiteralToFilter =
+        {
+            Constants.EightTapSmooth, Constants.EightTap, Constants.EightTapSharp, Constants.Bilinear
+        };
+
+        // Reads the interpolation filter selection. A set first bit selects the switchable
+        // filter; otherwise a 2-bit literal is read and mapped through LiteralToFilter.
+        private static byte ReadInterpFilter(ref ReadBitBuffer rb)
+        {
+            if (rb.ReadBit() != 0)
+            {
+                return (byte)Constants.Switchable;
+            }
+
+            return LiteralToFilter[rb.ReadLiteral(2)];
+        }
+
+        // Sets the render size. It defaults to the frame size and is overwritten with an
+        // explicitly coded size when the next bit in the buffer is set.
+        private static void SetupRenderSize(ref Vp9Common cm, ref ReadBitBuffer rb)
+        {
+            cm.RenderWidth = cm.Width;
+            cm.RenderHeight = cm.Height;
+
+            bool hasExplicitSize = rb.ReadBit() != 0;
+            if (!hasExplicitSize)
+            {
+                return;
+            }
+
+            rb.ReadFrameSize(out cm.RenderWidth, out cm.RenderHeight);
+        }
+
+        // Reads the frame dimensions from the bit buffer, passes them to ResizeContextBuffers,
+        // reads the render size, then reallocates the new frame buffer and copies the current
+        // format information (subsampling, bit depth, color info, render size) onto it.
+        private static void SetupFrameSize(MemoryAllocator allocator, ref Vp9Common cm, ref ReadBitBuffer rb)
+        {
+            ref BufferPool pool = ref cm.BufferPool.Value;
+
+            rb.ReadFrameSize(out int width, out int height);
+            cm.ResizeContextBuffers(allocator, width, height);
+            SetupRenderSize(ref cm, ref rb);
+
+            // A non-zero result from the reallocation is reported as a memory error.
+            bool reallocFailed = cm.GetFrameNewBuffer().ReallocFrameBuffer(
+                allocator,
+                cm.Width,
+                cm.Height,
+                cm.SubsamplingX,
+                cm.SubsamplingY,
+                cm.UseHighBitDepth,
+                Surface.DecBorderInPixels,
+                cm.ByteAlignment,
+                new Ptr<VpxCodecFrameBuffer>(ref pool.FrameBufs[cm.NewFbIdx].RawFrameBuffer),
+                FrameBuffers.GetFrameBuffer,
+                pool.CbPriv) != 0;
+
+            if (reallocFailed)
+            {
+                cm.Error.InternalError(CodecErr.MemError, "Failed to allocate frame buffer");
+            }
+
+            ref RefCntBuffer newFb = ref pool.FrameBufs[cm.NewFbIdx];
+            newFb.Released = 0;
+            newFb.Buf.SubsamplingX = cm.SubsamplingX;
+            newFb.Buf.SubsamplingY = cm.SubsamplingY;
+            newFb.Buf.BitDepth = (uint)cm.BitDepth;
+            newFb.Buf.ColorSpace = cm.ColorSpace;
+            newFb.Buf.ColorRange = cm.ColorRange;
+            newFb.Buf.RenderWidth = cm.RenderWidth;
+            newFb.Buf.RenderHeight = cm.RenderHeight;
+        }
+
+        // Returns true when a reference frame has the same bit depth and the same
+        // horizontal/vertical subsampling as the current frame.
+        private static bool ValidRefFrameImgFmt(
+            BitDepth refBitDepth,
+            int refXss, int refYss,
+            BitDepth thisBitDepth,
+            int thisXss,
+            int thisYss)
+        {
+            if (refBitDepth != thisBitDepth)
+            {
+                return false;
+            }
+
+            if (refXss != thisXss)
+            {
+                return false;
+            }
+
+            return refYss == thisYss;
+        }
+
+        // Determines the frame size for a frame that may inherit it from one of its
+        // references, validates the references against that size and the current image
+        // format, then resizes the context buffers, reads the render size and reallocates
+        // the new frame buffer.
+        private static void SetupFrameSizeWithRefs(MemoryAllocator allocator, ref Vp9Common cm,
+            ref ReadBitBuffer rb)
+        {
+            int width = 0, height = 0;
+            bool found = false;
+
+            bool hasValidRefFrame = false;
+            ref BufferPool pool = ref cm.BufferPool.Value;
+            // One bit is read per reference until a set bit on a valid reference is found;
+            // that reference's YCropWidth/YCropHeight become the frame size.
+            for (int i = 0; i < Constants.RefsPerFrame; ++i)
+            {
+                if (rb.ReadBit() != 0)
+                {
+                    if (cm.FrameRefs[i].Idx != RefBuffer.InvalidIdx)
+                    {
+                        ref Surface buf = ref cm.FrameRefs[i].Buf;
+                        width = buf.YCropWidth;
+                        height = buf.YCropHeight;
+                        found = true;
+                        break;
+                    }
+
+                    // The bit was set but the reference slot is invalid.
+                    // NOTE(review): if InternalError returns instead of unwinding, the loop
+                    // goes on to the next reference - confirm.
+                    cm.Error.InternalError(CodecErr.CorruptFrame, "Failed to decode frame size");
+                }
+            }
+
+            // No reference supplied the size: read it explicitly from the bit buffer.
+            if (!found)
+            {
+                rb.ReadFrameSize(out width, out height);
+            }
+
+            if (width <= 0 || height <= 0)
+            {
+                cm.Error.InternalError(CodecErr.CorruptFrame, "Invalid frame size");
+            }
+
+            // Check to make sure at least one of frames that this frame references
+            // has valid dimensions.
+            for (int i = 0; i < Constants.RefsPerFrame; ++i)
+            {
+                ref RefBuffer refFrame = ref cm.FrameRefs[i];
+                hasValidRefFrame |=
+                    refFrame.Idx != RefBuffer.InvalidIdx &&
+                    ScaleFactors.ValidRefFrameSize(refFrame.Buf.YCropWidth, refFrame.Buf.YCropHeight, width,
+                        height);
             }
+
+            if (!hasValidRefFrame)
+            {
+                cm.Error.InternalError(CodecErr.CorruptFrame, "Referenced frame has invalid size");
+            }
+
+            // Unlike the size check above, every reference must pass this one: an invalid
+            // slot or a mismatch in bit depth or subsampling is reported as an error.
+            for (int i = 0; i < Constants.RefsPerFrame; ++i)
+            {
+                ref RefBuffer refFrame = ref cm.FrameRefs[i];
+                if (refFrame.Idx == RefBuffer.InvalidIdx ||
+                    !ValidRefFrameImgFmt(
+                        (BitDepth)refFrame.Buf.BitDepth,
+                        refFrame.Buf.SubsamplingX,
+                        refFrame.Buf.SubsamplingY,
+                        cm.BitDepth,
+                        cm.SubsamplingX,
+                        cm.SubsamplingY))
+                {
+                    cm.Error.InternalError(CodecErr.CorruptFrame,
+                        "Referenced frame has incompatible color format");
+                }
+            }
+
+            cm.ResizeContextBuffers(allocator, width, height);
+            SetupRenderSize(ref cm, ref rb);
+
+            // A non-zero result from the reallocation is reported as a memory error.
+            if (cm.GetFrameNewBuffer().ReallocFrameBuffer(
+                    allocator,
+                    cm.Width,
+                    cm.Height,
+                    cm.SubsamplingX,
+                    cm.SubsamplingY,
+                    cm.UseHighBitDepth,
+                    Surface.DecBorderInPixels,
+                    cm.ByteAlignment,
+                    new Ptr<VpxCodecFrameBuffer>(ref pool.FrameBufs[cm.NewFbIdx].RawFrameBuffer),
+                    FrameBuffers.GetFrameBuffer,
+                    pool.CbPriv) != 0)
+            {
+                cm.Error.InternalError(CodecErr.MemError, "Failed to allocate frame buffer");
+            }
+
+            // Copy the current format information onto the new frame buffer entry.
+            pool.FrameBufs[cm.NewFbIdx].Released = 0;
+            pool.FrameBufs[cm.NewFbIdx].Buf.SubsamplingX = cm.SubsamplingX;
+            pool.FrameBufs[cm.NewFbIdx].Buf.SubsamplingY = cm.SubsamplingY;
+            pool.FrameBufs[cm.NewFbIdx].Buf.BitDepth = (uint)cm.BitDepth;
+            pool.FrameBufs[cm.NewFbIdx].Buf.ColorSpace = cm.ColorSpace;
+            pool.FrameBufs[cm.NewFbIdx].Buf.ColorRange = cm.ColorRange;
+            pool.FrameBufs[cm.NewFbIdx].Buf.RenderWidth = cm.RenderWidth;
+            pool.FrameBufs[cm.NewFbIdx].Buf.RenderHeight = cm.RenderHeight;
         }
 
         // Reads the next tile returning its size and adjusting '*data' accordingly
@@ -1102,7 +1415,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             {
                 if (!ReadIsValid(data, 4))
                 {
-                    errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile length");
+                    errorInfo.InternalError(CodecErr.CorruptFrame, "Truncated packet or corrupt tile length");
                 }
 
                 size = BinaryPrimitives.ReadInt32BigEndian(data.AsSpan());
@@ -1110,7 +1423,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
                 if (size > data.Length)
                 {
-                    errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile size");
+                    errorInfo.InternalError(CodecErr.CorruptFrame, "Truncated packet or corrupt tile size");
                 }
             }
             else
@@ -1124,11 +1437,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             data = data.Slice(size);
         }
 
-        private static void GetTileBuffers(ref Vp9Common cm, ArrayPtr<byte> data, int tileCols, ref Array64<TileBuffer> tileBuffers)
+        private static void GetTileBuffers(ref Vp9Common cm, ArrayPtr<byte> data, int tileCols,
+            ref Array64<TileBuffer> tileBuffers)
         {
-            int c;
-
-            for (c = 0; c < tileCols; ++c)
+            for (int c = 0; c < tileCols; ++c)
             {
                 bool isLast = c == tileCols - 1;
                 ref TileBuffer buf = ref tileBuffers[c];
@@ -1144,13 +1456,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int tileRows,
             ref Array4<Array64<TileBuffer>> tileBuffers)
         {
-            int r, c;
-
-            for (r = 0; r < tileRows; ++r)
+            for (int r = 0; r < tileRows; ++r)
             {
-                for (c = 0; c < tileCols; ++c)
+                for (int c = 0; c < tileCols; ++c)
                 {
-                    bool isLast = (r == tileRows - 1) && (c == tileCols - 1);
+                    bool isLast = r == tileRows - 1 && c == tileCols - 1;
                     ref TileBuffer buf = ref tileBuffers[r][c];
                     GetTileBuffer(isLast, ref cm.Error, ref data, ref buf);
                 }
@@ -1167,7 +1477,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int miRow, miCol;
 
             Debug.Assert(tileRows <= 4);
-            Debug.Assert(tileCols <= (1 << 6));
+            Debug.Assert(tileCols <= 1 << 6);
 
             // Note: this memset assumes above_context[0], [1] and [2]
             // are allocated as part of the same buffer.
@@ -1183,7 +1493,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 for (tileCol = 0; tileCol < tileCols; ++tileCol)
                 {
                     ref TileBuffer buf = ref tileBuffers[tileRow][tileCol];
-                    ref TileWorkerData tileData = ref cm.TileWorkerData[tileCols * tileRow + tileCol];
+                    ref TileWorkerData tileData = ref cm.TileWorkerData[(tileCols * tileRow) + tileCol];
                     tileData.Xd = cm.Mb;
                     tileData.Xd.Corrupted = false;
                     tileData.Xd.Counts = cm.Counts;
@@ -1203,7 +1513,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     for (tileCol = 0; tileCol < tileCols; ++tileCol)
                     {
                         int col = tileCol;
-                        ref TileWorkerData tileData = ref cm.TileWorkerData[tileCols * tileRow + col];
+                        ref TileWorkerData tileData = ref cm.TileWorkerData[(tileCols * tileRow) + col];
                         tile.SetCol(ref cm, col);
                         tileData.Xd.LeftContext = new Array3<Array16<sbyte>>();
                         tileData.Xd.LeftSegContext = new Array8<sbyte>();
@@ -1211,20 +1521,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         {
                             DecodePartition(ref tileData, ref cm, miRow, miCol, BlockSize.Block64x64, 4);
                         }
+
                         cm.Mb.Corrupted |= tileData.Xd.Corrupted;
                         if (cm.Mb.Corrupted)
                         {
-                            cm.Error.InternalError(CodecErr.CodecCorruptFrame, "Failed to decode tile data");
+                            cm.Error.InternalError(CodecErr.CorruptFrame, "Failed to decode tile data");
                         }
                     }
                 }
             }
 
             // Get last tile data.
-            return cm.TileWorkerData[tileCols * tileRows - 1].BitReader.FindEnd();
+            return cm.TileWorkerData[(tileCols * tileRows) - 1].BitReader.FindEnd();
         }
 
-        private static bool DecodeTileCol(ref TileWorkerData tileData, ref Vp9Common cm, ref Array64<TileBuffer> tileBuffers)
+        private static bool DecodeTileCol(ref TileWorkerData tileData, ref Vp9Common cm,
+            ref Array64<TileBuffer> tileBuffers)
         {
             ref TileInfo tile = ref tileData.Xd.Tile;
             int finalCol = (1 << cm.Log2TileCols) - 1;
@@ -1262,7 +1574,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             } while (!tileData.Xd.Corrupted && ++n <= tileData.BufEnd);
 
             tileData.DataEnd = bitReaderEnd;
-
             return !tileData.Xd.Corrupted;
         }
 
@@ -1276,9 +1587,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int numWorkers = Math.Min(maxThreads, tileCols);
             int n;
 
-            Debug.Assert(tileCols <= (1 << 6));
+            Debug.Assert(tileCols <= 1 << 6);
             Debug.Assert(tileRows == 1);
 
+            LoopFilter.ResetLfm(ref cm);
+
             cm.AboveContext.AsSpan().Clear();
             cm.AboveSegContext.AsSpan().Clear();
 
@@ -1295,13 +1608,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
             GetTileBuffers(ref cm, data, tileCols, ref tileBuffers);
 
-            tileBuffers.AsSpan()[..tileCols].Sort(CompareTileBuffers);
+            tileBuffers.AsSpan().Slice(0, tileCols).Sort(CompareTileBuffers);
 
             if (numWorkers == tileCols)
             {
                 TileBuffer largest = tileBuffers[0];
                 Span<TileBuffer> buffers = tileBuffers.AsSpan();
-                buffers[1..].CopyTo(buffers[..(tileBuffers.Length - 1)]);
+                buffers.Slice(1).CopyTo(buffers.Slice(0, tileBuffers.Length - 1));
                 tileBuffers[tileCols - 1] = largest;
             }
             else
@@ -1327,7 +1640,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
             for (n = 0; n < numWorkers; ++n)
             {
-                int count = baseVal + (remain + n) / numWorkers;
+                int count = baseVal + ((remain + n) / numWorkers);
                 ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles];
 
                 tileData.BufStart = bufStart;
@@ -1364,7 +1677,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
 
             Debug.Assert(!bitReaderEnd.IsNull || cm.Mb.Corrupted);
-
             return bitReaderEnd;
         }
 
@@ -1383,5 +1695,477 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 a[i] += c[i];
             }
         }
+
+        /// <summary>
+        /// Raises a <c>CodecErr.CorruptFrame</c> error with the message "Truncated packet" on the
+        /// error state of the common decoder data that <paramref name="data"/> points to.
+        /// </summary>
+        /// <param name="data">Pointer to the common decoder state whose error handler is invoked</param>
+        private static void ErrorHandler(Ptr<Vp9Common> data)
+        {
+            data.Value.Error.InternalError(CodecErr.CorruptFrame, "Truncated packet");
+        }
+
+        /// <summary>
+        /// When the current frame is a key frame and <c>CurrentVideoFrame</c> is greater than zero,
+        /// zeroes the reference count of every frame buffer in the pool except the one at
+        /// <c>NewFbIdx</c>, releasing each of them that has not been released yet.
+        /// </summary>
+        /// <param name="cm">Common decoder state that owns the buffer pool</param>
+        private static void FlushAllFbOnKey(ref Vp9Common cm)
+        {
+            // Only key frames with a non-zero frame counter trigger a flush.
+            if (cm.FrameType != FrameType.KeyFrame || cm.CurrentVideoFrame <= 0)
+            {
+                return;
+            }
+
+            ref BufferPool pool = ref cm.BufferPool.Value;
+            ref Array12<RefCntBuffer> frameBufs = ref pool.FrameBufs;
+
+            for (int bufIdx = 0; bufIdx < Constants.FrameBuffers; ++bufIdx)
+            {
+                // The buffer selected for the new frame is left untouched.
+                if (bufIdx != cm.NewFbIdx)
+                {
+                    frameBufs[bufIdx].RefCount = 0;
+
+                    if (frameBufs[bufIdx].Released == 0)
+                    {
+                        FrameBuffers.ReleaseFrameBuffer(pool.CbPriv, ref frameBufs[bufIdx].RawFrameBuffer);
+                        frameBufs[bufIdx].Released = 1;
+                    }
+                }
+            }
+        }
+
+        private const int SyncCode0 = 0x49; // Expected value of the first 8-bit literal checked by ReadSyncCode.
+        private const int SyncCode1 = 0x83; // Expected value of the second 8-bit literal checked by ReadSyncCode.
+        private const int SyncCode2 = 0x42; // Expected value of the third 8-bit literal checked by ReadSyncCode.
+
+        private const int FrameMarker = 0x2; // Expected value of the 2-bit literal that opens the uncompressed header.
+
+        /// <summary>
+        /// Reads up to three 8-bit literals from <paramref name="rb"/> and compares them, in order,
+        /// against <c>SyncCode0</c>, <c>SyncCode1</c> and <c>SyncCode2</c>.
+        /// </summary>
+        /// <param name="rb">Bit buffer to read from; reading stops after the first mismatching byte</param>
+        /// <returns>True if all three bytes match, false otherwise</returns>
+        private static bool ReadSyncCode(ref ReadBitBuffer rb)
+        {
+            // Each check only consumes its byte when all previous bytes matched.
+            if (rb.ReadLiteral(8) != SyncCode0)
+            {
+                return false;
+            }
+
+            if (rb.ReadLiteral(8) != SyncCode1)
+            {
+                return false;
+            }
+
+            return rb.ReadLiteral(8) == SyncCode2;
+        }
+
+        /// <summary>
+        /// Re-points <paramref name="idx"/> at buffer <paramref name="newIdx"/>: the reference count of
+        /// the previously referenced buffer is decremented (when the old index is non-negative and its
+        /// count is above zero) and the reference count of the new buffer is incremented.
+        /// </summary>
+        /// <param name="bufs">Reference-counted frame buffers</param>
+        /// <param name="idx">Buffer index to update; receives <paramref name="newIdx"/></param>
+        /// <param name="newIdx">Index of the buffer that is referenced from now on</param>
+        private static void RefCntFb(ref Array12<RefCntBuffer> bufs, ref int idx, int newIdx)
+        {
+            int previousIdx = idx;
+            bool previousIsReferenced = previousIdx >= 0 && bufs[previousIdx].RefCount > 0;
+
+            if (previousIsReferenced)
+            {
+                --bufs[previousIdx].RefCount;
+            }
+
+            idx = newIdx;
+            ++bufs[newIdx].RefCount;
+        }
+
+        private static ulong ReadUncompressedHeader(MemoryAllocator allocator, ref Vp9Decoder pbi,
+            ref ReadBitBuffer rb)
+        { // Parses the uncompressed frame header from rb into pbi.Common; returns the 16-bit size read last, or 0 when an existing frame is shown.
+            ref Vp9Common cm = ref pbi.Common;
+            ref BufferPool pool = ref cm.BufferPool.Value;
+            ref Array12<RefCntBuffer> frameBufs = ref pool.FrameBufs;
+            int mask, refIndex = 0;
+            ulong sz;
+
+            cm.LastFrameType = cm.FrameType; // Saved before FrameType is overwritten below; Decode reads it for UsePrevFrameMvs.
+            cm.LastIntraOnly = cm.IntraOnly; // Saved before IntraOnly is overwritten below; Decode reads it for UsePrevFrameMvs.
+
+            if (rb.ReadLiteral(2) != FrameMarker)
+            {
+                cm.Error.InternalError(CodecErr.UnsupBitstream, "Invalid frame marker");
+            }
+
+            cm.Profile = rb.ReadProfile();
+            if (cm.Profile >= BitstreamProfile.MaxProfiles)
+            {
+                cm.Error.InternalError(CodecErr.UnsupBitstream, "Unsupported bitstream profile");
+            }
+
+            cm.ShowExistingFrame = rb.ReadBit();
+            if (cm.ShowExistingFrame != 0)
+            {
+                // Show an existing frame directly.
+                int frameToShow = cm.RefFrameMap[rb.ReadLiteral(3)]; // A 3-bit literal selects the RefFrameMap slot.
+                if (frameToShow < 0 || frameBufs[frameToShow].RefCount < 1)
+                {
+                    cm.Error.InternalError(CodecErr.UnsupBitstream,
+                        $"Buffer {frameToShow} does not contain a decoded frame");
+                }
+
+                RefCntFb(ref frameBufs, ref cm.NewFbIdx, frameToShow);
+                pbi.RefreshFrameFlags = 0;
+                cm.Lf.FilterLevel = 0;
+                cm.ShowFrame = 1;
+
+                return 0; // Decode treats a zero size as "showing a frame directly".
+            }
+
+            cm.FrameType = (FrameType)rb.ReadBit();
+            cm.ShowFrame = rb.ReadBit();
+            cm.ErrorResilientMode = rb.ReadBit();
+
+            if (cm.FrameType == FrameType.KeyFrame)
+            {
+                if (!ReadSyncCode(ref rb))
+                {
+                    cm.Error.InternalError(CodecErr.UnsupBitstream, "Invalid frame sync code");
+                }
+
+                cm.ReadBitdepthColorspaceSampling(ref rb);
+                pbi.RefreshFrameFlags = (1 << Constants.RefFrames) - 1; // Sets the low Constants.RefFrames bits, one per slot visited by the mask loop below.
+
+                for (int i = 0; i < Constants.RefsPerFrame; ++i)
+                {
+                    cm.FrameRefs[i].Idx = RefBuffer.InvalidIdx;
+                    cm.FrameRefs[i].Buf = default;
+                }
+
+                SetupFrameSize(allocator, ref cm, ref rb);
+                if (pbi.NeedResync != 0)
+                {
+                    cm.RefFrameMap.AsSpan().Fill(-1);
+                    FlushAllFbOnKey(ref cm);
+                    pbi.NeedResync = 0;
+                }
+            }
+            else
+            {
+                cm.IntraOnly = (cm.ShowFrame != 0 ? 0 : rb.ReadBit()) != 0; // The bit is only read when the frame is not shown.
+
+                cm.ResetFrameContext = cm.ErrorResilientMode != 0 ? 0 : rb.ReadLiteral(2); // Only read when error resilient mode is off.
+
+                if (cm.IntraOnly)
+                {
+                    if (!ReadSyncCode(ref rb))
+                    {
+                        cm.Error.InternalError(CodecErr.UnsupBitstream, "Invalid frame sync code");
+                    }
+
+                    if (cm.Profile > BitstreamProfile.Profile0)
+                    {
+                        cm.ReadBitdepthColorspaceSampling(ref rb);
+                    }
+                    else
+                    {
+                        // NOTE: The intra-only frame header does not include the specification
+                        // of either the color format or color sub-sampling in profile 0. VP9
+                        // specifies that the default color format should be YUV 4:2:0 in this
+                        // case (normative).
+                        cm.ColorSpace = VpxColorSpace.Bt601;
+                        cm.ColorRange = VpxColorRange.Studio;
+                        cm.SubsamplingY = cm.SubsamplingX = 1;
+                        cm.BitDepth = BitDepth.Bits8;
+                        cm.UseHighBitDepth = false;
+                    }
+
+                    pbi.RefreshFrameFlags = rb.ReadLiteral(Constants.RefFrames);
+                    SetupFrameSize(allocator, ref cm, ref rb);
+                    if (pbi.NeedResync != 0)
+                    {
+                        cm.RefFrameMap.AsSpan().Fill(-1);
+                        pbi.NeedResync = 0;
+                    }
+                }
+                else if (pbi.NeedResync != 1)
+                {
+                    /* Skip if need resync */
+                    pbi.RefreshFrameFlags = rb.ReadLiteral(Constants.RefFrames);
+                    for (int i = 0; i < Constants.RefsPerFrame; ++i)
+                    {
+                        int refr = rb.ReadLiteral(Constants.RefFramesLog2);
+                        int idx = cm.RefFrameMap[refr];
+                        ref RefBuffer refFrame = ref cm.FrameRefs[i];
+                        refFrame.Idx = idx;
+                        refFrame.Buf = frameBufs[idx].Buf; // NOTE(review): idx is used unchecked; RefFrameMap is filled with -1 on resync, confirm it cannot be negative here.
+                        cm.RefFrameSignBias[Constants.LastFrame + i] = (sbyte)rb.ReadBit();
+                    }
+
+                    SetupFrameSizeWithRefs(allocator, ref cm, ref rb);
+
+                    cm.AllowHighPrecisionMv = rb.ReadBit() != 0;
+                    cm.InterpFilter = ReadInterpFilter(ref rb);
+
+                    for (int i = 0; i < Constants.RefsPerFrame; ++i)
+                    {
+                        ref RefBuffer refBuf = ref cm.FrameRefs[i];
+                        refBuf.Sf.SetupScaleFactorsForFrame(
+                            refBuf.Buf.YCropWidth,
+                            refBuf.Buf.YCropHeight,
+                            cm.Width,
+                            cm.Height);
+                    }
+                }
+            }
+
+            cm.GetFrameNewBuffer().BitDepth = (uint)cm.BitDepth; // Copy the parsed format properties onto the new frame buffer.
+            cm.GetFrameNewBuffer().ColorSpace = cm.ColorSpace;
+            cm.GetFrameNewBuffer().ColorRange = cm.ColorRange;
+            cm.GetFrameNewBuffer().RenderWidth = cm.RenderWidth;
+            cm.GetFrameNewBuffer().RenderHeight = cm.RenderHeight;
+
+            if (pbi.NeedResync != 0)
+            {
+                cm.Error.InternalError(CodecErr.CorruptFrame,
+                    "Keyframe / intra-only frame required to reset decoder state");
+            }
+
+            if (cm.ErrorResilientMode == 0)
+            {
+                cm.RefreshFrameContext = rb.ReadBit();
+                cm.FrameParallelDecodingMode = rb.ReadBit();
+                if (cm.FrameParallelDecodingMode == 0)
+                {
+                    cm.Counts.Value = new Vp9BackwardUpdates(); // Start from fresh counts when frame parallel decoding mode is off.
+                }
+            }
+            else
+            {
+                cm.RefreshFrameContext = 0;
+                cm.FrameParallelDecodingMode = 1;
+            }
+
+            // This flag will be overridden by the call to SetupPastIndependence
+            // below, forcing the use of context 0 for those frame types.
+            cm.FrameContextIdx = (uint)rb.ReadLiteral(Constants.FrameContextsLog2);
+
+            // Generate next_ref_frame_map.
+            for (mask = pbi.RefreshFrameFlags; mask != 0; mask >>= 1) // One bit per reference slot, lowest bit first.
+            {
+                if ((mask & 1) != 0)
+                {
+                    cm.NextRefFrameMap[refIndex] = cm.NewFbIdx;
+                    ++frameBufs[cm.NewFbIdx].RefCount;
+                }
+                else
+                {
+                    cm.NextRefFrameMap[refIndex] = cm.RefFrameMap[refIndex];
+                }
+
+                // Current thread holds the reference frame.
+                if (cm.RefFrameMap[refIndex] >= 0)
+                {
+                    ++frameBufs[cm.RefFrameMap[refIndex]].RefCount;
+                }
+
+                ++refIndex;
+            }
+
+            for (; refIndex < Constants.RefFrames; ++refIndex) // Remaining slots (no refresh bit left in the mask) keep their current mapping.
+            {
+                cm.NextRefFrameMap[refIndex] = cm.RefFrameMap[refIndex];
+                // Current thread holds the reference frame.
+                if (cm.RefFrameMap[refIndex] >= 0)
+                {
+                    ++frameBufs[cm.RefFrameMap[refIndex]].RefCount;
+                }
+            }
+
+            pbi.HoldRefBuf = 1;
+
+            if (cm.FrameIsIntraOnly() || cm.ErrorResilientMode != 0)
+            {
+                EntropyMode.SetupPastIndependence(ref cm);
+            }
+
+            SetupLoopfilter(ref cm.Lf, ref rb);
+            SetupQuantization(ref cm, ref cm.Mb, ref rb);
+            cm.Seg.SetupSegmentation(ref cm.Fc.Value, ref rb);
+            cm.SetupSegmentationDequant();
+
+            cm.SetupTileInfo(ref rb);
+            sz = (ulong)rb.ReadLiteral(16); // Decode uses this value as the size of the first partition (the compressed header).
+
+            if (sz == 0)
+            {
+                cm.Error.InternalError(CodecErr.CorruptFrame, "Invalid header size");
+            }
+
+            return sz;
+        }
+
+        /// <summary>
+        /// Reads the compressed header: the transform mode, then the probability updates that are
+        /// applied to the current frame context (<c>cm.Fc</c>).
+        /// </summary>
+        /// <param name="pbi">Decoder whose common state receives the parsed values</param>
+        /// <param name="data">Data positioned at the start of the compressed header</param>
+        /// <param name="partitionSize">Size passed to the reader for the compressed header</param>
+        /// <returns>The value of <c>HasError()</c> on the reader after everything has been read</returns>
+        private static bool ReadCompressedHeader(ref Vp9Decoder pbi, ArrayPtr<byte> data, ulong partitionSize)
+        {
+            ref Vp9Common cm = ref pbi.Common;
+            ref Vp9EntropyProbs fc = ref cm.Fc.Value;
+            Reader r = new();
+
+            bool initFailed = r.Init(data, (int)partitionSize);
+            if (initFailed)
+            {
+                cm.Error.InternalError(CodecErr.MemError, "Failed to allocate bool decoder 0");
+            }
+
+            // Lossless frames always use 4x4 transforms, so no transform mode is read for them.
+            if (cm.Mb.Lossless)
+            {
+                cm.TxMode = TxMode.Only4x4;
+            }
+            else
+            {
+                cm.TxMode = r.ReadTxMode();
+            }
+
+            if (cm.TxMode == TxMode.TxModeSelect)
+            {
+                ReadTxModeProbs(ref fc, ref r);
+            }
+
+            ReadCoefProbs(ref fc, cm.TxMode, ref r);
+
+            for (int ctx = 0; ctx < Constants.SkipContexts; ++ctx)
+            {
+                r.DiffUpdateProb(ref fc.SkipProb[ctx]);
+            }
+
+            // Everything that follows only exists for frames that are not intra-only.
+            if (cm.FrameIsIntraOnly())
+            {
+                return r.HasError();
+            }
+
+            ReadInterModeProbs(ref fc, ref r);
+
+            if (cm.InterpFilter == Constants.Switchable)
+            {
+                ReadSwitchableInterpProbs(ref fc, ref r);
+            }
+
+            for (int ctx = 0; ctx < Constants.IntraInterContexts; ++ctx)
+            {
+                r.DiffUpdateProb(ref fc.IntraInterProb[ctx]);
+            }
+
+            cm.ReferenceMode = cm.ReadFrameReferenceMode(ref r);
+            if (cm.ReferenceMode != ReferenceMode.Single)
+            {
+                cm.SetupCompoundReferenceMode();
+            }
+
+            cm.ReadFrameReferenceModeProbs(ref r);
+
+            for (int group = 0; group < EntropyMode.BlockSizeGroups; ++group)
+            {
+                for (int mode = 0; mode < Constants.IntraModes - 1; ++mode)
+                {
+                    r.DiffUpdateProb(ref fc.YModeProb[group][mode]);
+                }
+            }
+
+            for (int ctx = 0; ctx < Constants.PartitionContexts; ++ctx)
+            {
+                for (int type = 0; type < Constants.PartitionTypes - 1; ++type)
+                {
+                    r.DiffUpdateProb(ref fc.PartitionProb[ctx][type]);
+                }
+            }
+
+            ReadMvProbs(ref fc, cm.AllowHighPrecisionMv, ref r);
+
+            return r.HasError();
+        }
+
+        private static ref ReadBitBuffer InitReadBitBuffer(ref ReadBitBuffer rb, ReadOnlySpan<byte> data)
+        { // Points rb at data with the bit offset reset to zero, and hands the same buffer back by reference.
+            rb.BitOffset = 0;
+            rb.BitBuffer = data;
+            return ref rb; // Returned by ref so the call can be passed straight into a ref parameter (see Decode).
+        }
+
+        /// <summary>
+        /// Decodes one frame: parses the uncompressed and compressed headers, decodes the tiles,
+        /// runs the loop filter and, when enabled by the headers, adapts the probabilities and stores
+        /// the resulting frame context.
+        /// </summary>
+        /// <param name="allocator">Allocator used for frame-size dependent buffers and tile worker data</param>
+        /// <param name="pbi">Decoder state</param>
+        /// <param name="data">Frame data, starting at the uncompressed header</param>
+        /// <param name="pDataEnd">Receives the position in the data where decoding of this frame ended</param>
+        /// <param name="multithreaded">True to decode tiles and run the loop filter on multiple threads</param>
+        public static unsafe void Decode(MemoryAllocator allocator,
+            ref Vp9Decoder pbi,
+            ArrayPtr<byte> data,
+            out ArrayPtr<byte> pDataEnd,
+            bool multithreaded = true)
+        {
+            ref Vp9Common cm = ref pbi.Common;
+            ref MacroBlockD xd = ref cm.Mb;
+            ReadBitBuffer rb = new();
+            ulong firstPartitionSize =
+                ReadUncompressedHeader(allocator, ref pbi, ref InitReadBitBuffer(ref rb, data.AsSpan()));
+            int tileRows = 1 << cm.Log2TileRows;
+            int tileCols = 1 << cm.Log2TileCols;
+            ref Surface newFb = ref cm.GetFrameNewBuffer();
+            xd.CurBuf = newFb;
+
+            if (firstPartitionSize == 0)
+            {
+                // Showing a frame directly: there is nothing to decode, only the header bytes are skipped.
+                pDataEnd = data.Slice(cm.Profile <= BitstreamProfile.Profile2 ? 1 : 2);
+                return;
+            }
+
+            // Skip past the uncompressed header; data now starts at the compressed header.
+            data = data.Slice((int)rb.BytesRead());
+            if (!ReadIsValid(data, (int)firstPartitionSize))
+            {
+                cm.Error.InternalError(CodecErr.CorruptFrame, "Truncated packet or corrupt header length");
+            }
+
+            cm.UsePrevFrameMvs =
+                cm.ErrorResilientMode == 0 &&
+                cm.Width == cm.LastWidth &&
+                cm.Height == cm.LastHeight &&
+                !cm.LastIntraOnly &&
+                cm.LastShowFrame != 0 &&
+                cm.LastFrameType != FrameType.KeyFrame;
+
+            xd.SetupBlockPlanes(cm.SubsamplingX, cm.SubsamplingY);
+
+            cm.Fc = new Ptr<Vp9EntropyProbs>(ref cm.FrameContexts[(int)cm.FrameContextIdx]);
+
+            xd.Corrupted = false;
+            newFb.Corrupted = ReadCompressedHeader(ref pbi, data, firstPartitionSize) ? 1 : 0;
+            if (newFb.Corrupted != 0)
+            {
+                cm.Error.InternalError(CodecErr.CorruptFrame, "Decode failed. Frame data header is corrupted.");
+            }
+
+            if (cm.Lf.FilterLevel != 0 && cm.SkipLoopFilter == 0)
+            {
+                LoopFilter.LoopFilterFrameInit(ref cm, cm.Lf.FilterLevel);
+            }
+
+            int threadCount = multithreaded ? Math.Max(1, Environment.ProcessorCount / 2) : 0;
+
+            // NOTE(review): the worker data is sized from threadCount but only reallocated when the
+            // tile count changes. If a caller switches multithreaded from false to true while the
+            // tile count stays the same, the extra per-thread entries are presumably missing; confirm
+            // that callers keep the flag constant for the lifetime of the decoder.
+            if (cm.TileWorkerData.IsNull || tileCols * tileRows != cm.TotalTiles)
+            {
+                int numTileWorkers = (tileCols * tileRows) + threadCount;
+                if (!cm.TileWorkerData.IsNull)
+                {
+                    allocator.Free(cm.TileWorkerData);
+                }
+
+                cm.CheckMemError(ref cm.TileWorkerData, allocator.Allocate<TileWorkerData>(numTileWorkers));
+                cm.TotalTiles = tileRows * tileCols;
+            }
+
+            // The tile data follows the compressed header.
+            if (multithreaded)
+            {
+                pDataEnd = DecodeTilesMt(ref pbi.Common, data.Slice((int)firstPartitionSize), threadCount);
+
+                LoopFilter.LoopFilterFrameMt(
+                    ref cm.Mb.CurBuf,
+                    ref cm,
+                    ref cm.Mb,
+                    cm.Lf.FilterLevel,
+                    false,
+                    false,
+                    threadCount);
+            }
+            else
+            {
+                pDataEnd = DecodeTiles(ref pbi.Common, data.Slice((int)firstPartitionSize));
+
+                LoopFilter.LoopFilterFrame(ref cm.Mb.CurBuf, ref cm, ref cm.Mb, cm.Lf.FilterLevel, false, false);
+            }
+
+            if (!xd.Corrupted)
+            {
+                // Backward adaptation only runs when neither error resilient nor frame parallel mode is set.
+                if (cm.ErrorResilientMode == 0 && cm.FrameParallelDecodingMode == 0)
+                {
+                    cm.AdaptCoefProbs();
+
+                    if (!cm.FrameIsIntraOnly())
+                    {
+                        cm.AdaptModeProbs();
+                        cm.AdaptMvProbs(cm.AllowHighPrecisionMv);
+                    }
+                }
+            }
+            else
+            {
+                cm.Error.InternalError(CodecErr.CorruptFrame, "Decode failed. Frame data is corrupted.");
+            }
+
+            // Store the (possibly adapted) probabilities back into the selected frame context slot.
+            if (cm.RefreshFrameContext != 0)
+            {
+                cm.FrameContexts[(int)cm.FrameContextIdx] = cm.Fc.Value;
+            }
+        }
     }
-}
+}

+ 160 - 179
src/Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using Ryujinx.Graphics.Video;
@@ -10,11 +10,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 {
     internal static class DecodeMv
     {
-        private const int MvrefNeighbours = 8;
+        private const int RefNeighbours = 8;
 
         private static PredictionMode ReadIntraMode(ref Reader r, ReadOnlySpan<byte> p)
         {
-            return (PredictionMode)r.ReadTree(Luts.Vp9IntraModeTree, p);
+            return (PredictionMode)r.ReadTree(Luts.IntraModeTree, p);
         }
 
         private static PredictionMode ReadIntraModeY(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r, int sizeGroup)
@@ -41,7 +41,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
         private static PredictionMode ReadInterMode(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r, int ctx)
         {
-            int mode = r.ReadTree(Luts.Vp9InterModeTree, cm.Fc.Value.InterModeProb[ctx].AsSpan());
+            int mode = r.ReadTree(Luts.InterModeTree, cm.Fc.Value.InterModeProb[ctx].AsSpan());
             if (!xd.Counts.IsNull)
             {
                 ++xd.Counts.Value.InterMode[ctx][mode];
@@ -52,22 +52,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
         private static int ReadSegmentId(ref Reader r, ref Array7<byte> segTreeProbs)
         {
-            return r.ReadTree(Luts.Vp9SegmentTree, segTreeProbs.AsSpan());
+            return r.ReadTree(Luts.SegmentTree, segTreeProbs.AsSpan());
         }
 
         private static ReadOnlySpan<byte> GetTxProbs(ref Vp9EntropyProbs fc, TxSize maxTxSize, int ctx)
         {
             switch (maxTxSize)
             {
-                case TxSize.Tx8x8:
-                    return fc.Tx8x8Prob[ctx].AsSpan();
-                case TxSize.Tx16x16:
-                    return fc.Tx16x16Prob[ctx].AsSpan();
-                case TxSize.Tx32x32:
-                    return fc.Tx32x32Prob[ctx].AsSpan();
+                case TxSize.Tx8x8: return fc.Tx8x8Prob[ctx].AsSpan();
+                case TxSize.Tx16x16: return fc.Tx16x16Prob[ctx].AsSpan();
+                case TxSize.Tx32x32: return fc.Tx32x32Prob[ctx].AsSpan();
                 default:
                     Debug.Assert(false, "Invalid maxTxSize.");
-
                     return ReadOnlySpan<byte>.Empty;
             }
         }
@@ -76,15 +72,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         {
             switch (maxTxSize)
             {
-                case TxSize.Tx8x8:
-                    return counts.Tx8x8[ctx].AsSpan();
-                case TxSize.Tx16x16:
-                    return counts.Tx16x16[ctx].AsSpan();
-                case TxSize.Tx32x32:
-                    return counts.Tx32x32[ctx].AsSpan();
+                case TxSize.Tx8x8: return counts.Tx8x8[ctx].AsSpan();
+                case TxSize.Tx16x16: return counts.Tx16x16[ctx].AsSpan();
+                case TxSize.Tx32x32: return counts.Tx32x32[ctx].AsSpan();
                 default:
                     Debug.Assert(false, "Invalid maxTxSize.");
-
                     return Span<uint>.Empty;
             }
         }
@@ -124,34 +116,32 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             return (TxSize)Math.Min((int)maxTxSize, (int)Luts.TxModeToBiggestTxSize[(int)txMode]);
         }
 
-        private static int DecGetSegmentId(ref Vp9Common cm, ArrayPtr<byte> segmentIds, int miOffset, int xMis, int yMis)
+        private static int DecGetSegmentId(ref Vp9Common cm, ArrayPtr<byte> segmentIds, int miOffset, int xMis,
+            int yMis)
         {
-            int x, y, segmentId = int.MaxValue;
+            int segmentId = int.MaxValue;
 
-            for (y = 0; y < yMis; y++)
+            for (int y = 0; y < yMis; y++)
             {
-                for (x = 0; x < xMis; x++)
+                for (int x = 0; x < xMis; x++)
                 {
-                    segmentId = Math.Min(segmentId, segmentIds[miOffset + y * cm.MiCols + x]);
+                    segmentId = Math.Min(segmentId, segmentIds[miOffset + (y * cm.MiCols) + x]);
                 }
             }
 
             Debug.Assert(segmentId >= 0 && segmentId < Constants.MaxSegments);
-
             return segmentId;
         }
 
         private static void SetSegmentId(ref Vp9Common cm, int miOffset, int xMis, int yMis, int segmentId)
         {
-            int x, y;
-
             Debug.Assert(segmentId >= 0 && segmentId < Constants.MaxSegments);
 
-            for (y = 0; y < yMis; y++)
+            for (int y = 0; y < yMis; y++)
             {
-                for (x = 0; x < xMis; x++)
+                for (int x = 0; x < xMis; x++)
                 {
-                    cm.CurrentFrameSegMap[miOffset + y * cm.MiCols + x] = (byte)segmentId;
+                    cm.CurrentFrameSegMap[miOffset + (y * cm.MiCols) + x] = (byte)segmentId;
                 }
             }
         }
@@ -164,13 +154,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int xMis,
             int yMis)
         {
-            int x, y;
-
-            for (y = 0; y < yMis; y++)
+            for (int y = 0; y < yMis; y++)
             {
-                for (x = 0; x < xMis; x++)
+                for (int x = 0; x < xMis; x++)
                 {
-                    currentSegmentIds[miOffset + y * cm.MiCols + x] = (byte)(!lastSegmentIds.IsNull ? lastSegmentIds[miOffset + y * cm.MiCols + x] : 0);
+                    currentSegmentIds[miOffset + (y * cm.MiCols) + x] = (byte)(!lastSegmentIds.IsNull
+                        ? lastSegmentIds[miOffset + (y * cm.MiCols) + x]
+                        : 0);
                 }
             }
         }
@@ -188,13 +178,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             if (!seg.UpdateMap)
             {
                 CopySegmentId(ref cm, cm.LastFrameSegMap, cm.CurrentFrameSegMap, miOffset, xMis, yMis);
-
                 return 0;
             }
 
             segmentId = ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb);
             SetSegmentId(ref cm, miOffset, xMis, yMis, segmentId);
-
             return segmentId;
         }
 
@@ -210,7 +198,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             ref Segmentation seg = ref cm.Seg;
             ref ModeInfo mi = ref xd.Mi[0].Value;
             int predictedSegmentId, segmentId;
-            int miOffset = miRow * cm.MiCols + miCol;
+            int miOffset = (miRow * cm.MiCols) + miCol;
 
             if (!seg.Enabled)
             {
@@ -224,7 +212,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             if (!seg.UpdateMap)
             {
                 CopySegmentId(ref cm, cm.LastFrameSegMap, cm.CurrentFrameSegMap, miOffset, xMis, yMis);
-
                 return predictedSegmentId;
             }
 
@@ -232,20 +219,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             {
                 byte predProb = Segmentation.GetPredProbSegId(ref cm.Fc.Value.SegPredProb, ref xd);
                 mi.SegIdPredicted = (sbyte)r.Read(predProb);
-                segmentId = mi.SegIdPredicted != 0 ? predictedSegmentId : ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb);
+                segmentId = mi.SegIdPredicted != 0
+                    ? predictedSegmentId
+                    : ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb);
             }
             else
             {
                 segmentId = ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb);
             }
-            SetSegmentId(ref cm, miOffset, xMis, yMis, segmentId);
 
+            SetSegmentId(ref cm, miOffset, xMis, yMis, segmentId);
             return segmentId;
         }
 
         private static int ReadSkip(ref Vp9Common cm, ref MacroBlockD xd, int segmentId, ref Reader r)
         {
-            if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlSkip) != 0)
+            if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.Skip) != 0)
             {
                 return 1;
             }
@@ -260,12 +249,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             return skip;
         }
 
-        private static int ReadMvComponent(ref Reader r, ref Vp9EntropyProbs fc, int mvcomp, bool usehp)
+        private static int ReadComponent(ref Reader r, ref Vp9EntropyProbs fc, int mvcomp, bool usehp)
         {
             int mag, d, fr, hp;
             bool sign = r.Read(fc.Sign[mvcomp]) != 0;
-            MvClassType mvClass = (MvClassType)r.ReadTree(Luts.Vp9MvClassTree, fc.Classes[mvcomp].AsSpan());
-            bool class0 = mvClass == MvClassType.MvClass0;
+            MvClassType mvClass = (MvClassType)r.ReadTree(Luts.MvClassTree, fc.Classes[mvcomp].AsSpan());
+            bool class0 = mvClass == MvClassType.Class0;
 
             // Integer part
             if (class0)
@@ -275,11 +264,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
             else
             {
-                int i;
                 int n = (int)mvClass + Constants.Class0Bits - 1; // Number of bits
 
                 d = 0;
-                for (i = 0; i < n; ++i)
+                for (int i = 0; i < n; ++i)
                 {
                     d |= r.Read(fc.Bits[mvcomp][i]) << i;
                 }
@@ -288,40 +276,39 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
 
             // Fractional part
-            fr = r.ReadTree(Luts.Vp9MvFPTree, class0 ? fc.Class0Fp[mvcomp][d].AsSpan() : fc.Fp[mvcomp].AsSpan());
+            fr = r.ReadTree(Luts.MvFPTree, class0 ? fc.Class0Fp[mvcomp][d].AsSpan() : fc.Fp[mvcomp].AsSpan());
 
             // High precision part (if hp is not used, the default value of the hp is 1)
             hp = usehp ? r.Read(class0 ? fc.Class0Hp[mvcomp] : fc.Hp[mvcomp]) : 1;
 
             // Result
             mag += ((d << 3) | (fr << 1) | hp) + 1;
-
             return sign ? -mag : mag;
         }
 
-        private static void ReadMv(
+        private static void Read(
             ref Reader r,
             ref Mv mv,
             ref Mv refr,
             ref Vp9EntropyProbs fc,
             Ptr<Vp9BackwardUpdates> counts,
-            bool allowHP)
+            bool allowHp)
         {
-            MvJointType jointType = (MvJointType)r.ReadTree(Luts.Vp9MvJointTree, fc.Joints.AsSpan());
-            bool useHP = allowHP && refr.UseMvHp();
+            MvJointType jointType = (MvJointType)r.ReadTree(Luts.MvJointTree, fc.Joints.AsSpan());
+            bool useHp = allowHp && refr.UseHp();
             Mv diff = new();
 
-            if (Mv.MvJointVertical(jointType))
+            if (Mv.JointVertical(jointType))
             {
-                diff.Row = (short)ReadMvComponent(ref r, ref fc, 0, useHP);
+                diff.Row = (short)ReadComponent(ref r, ref fc, 0, useHp);
             }
 
-            if (Mv.MvJointHorizontal(jointType))
+            if (Mv.JointHorizontal(jointType))
             {
-                diff.Col = (short)ReadMvComponent(ref r, ref fc, 1, useHP);
+                diff.Col = (short)ReadComponent(ref r, ref fc, 1, useHp);
             }
 
-            diff.IncMv(counts);
+            diff.Inc(counts);
 
             mv.Row = (short)(refr.Row + diff.Row);
             mv.Col = (short)(refr.Col + diff.Col);
@@ -329,7 +316,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
         private static ReferenceMode ReadBlockReferenceMode(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r)
         {
-            if (cm.ReferenceMode == ReferenceMode.ReferenceModeSelect)
+            if (cm.ReferenceMode == ReferenceMode.Select)
             {
                 int ctx = PredCommon.GetReferenceModeContext(ref cm, ref xd);
                 ReferenceMode mode = (ReferenceMode)r.Read(cm.Fc.Value.CompInterProb[ctx]);
@@ -354,15 +341,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         {
             ref Vp9EntropyProbs fc = ref cm.Fc.Value;
 
-            if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlRefFrame) != 0)
+            if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.RefFrame) != 0)
             {
-                refFrame[0] = (sbyte)cm.Seg.GetSegData(segmentId, SegLvlFeatures.SegLvlRefFrame);
+                refFrame[0] = (sbyte)cm.Seg.GetSegData(segmentId, SegLvlFeatures.RefFrame);
                 refFrame[1] = Constants.None;
             }
             else
             {
                 ReferenceMode mode = ReadBlockReferenceMode(ref cm, ref xd, ref r);
-                if (mode == ReferenceMode.CompoundReference)
+                if (mode == ReferenceMode.Compound)
                 {
                     int idx = cm.RefFrameSignBias[cm.CompFixedRef];
                     int ctx = PredCommon.GetPredContextCompRefP(ref cm, ref xd);
@@ -375,7 +362,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     refFrame[idx] = cm.CompFixedRef;
                     refFrame[idx == 0 ? 1 : 0] = cm.CompVarRef[bit];
                 }
-                else if (mode == ReferenceMode.SingleReference)
+                else if (mode == ReferenceMode.Single)
                 {
                     int ctx0 = PredCommon.GetPredContextSingleRefP1(ref xd);
                     int bit0 = r.Read(fc.SingleRefProb[ctx0][0]);
@@ -412,7 +399,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         private static byte ReadSwitchableInterpFilter(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r)
         {
             int ctx = xd.GetPredContextSwitchableInterp();
-            byte type = (byte)r.ReadTree(Luts.Vp9SwitchableInterpTree, cm.Fc.Value.SwitchableInterpProb[ctx].AsSpan());
+            byte type = (byte)r.ReadTree(Luts.SwitchableInterpTree, cm.Fc.Value.SwitchableInterpProb[ctx].AsSpan());
             if (!xd.Counts.IsNull)
             {
                 ++xd.Counts.Value.SwitchableInterp[ctx][type];
@@ -424,12 +411,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         private static void ReadIntraBlockModeInfo(ref Vp9Common cm, ref MacroBlockD xd, ref ModeInfo mi, ref Reader r)
         {
             BlockSize bsize = mi.SbType;
-            int i;
+
 
             switch (bsize)
             {
                 case BlockSize.Block4x4:
-                    for (i = 0; i < 4; ++i)
+                    for (int i = 0; i < 4; ++i)
                     {
                         mi.Bmi[i].Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0);
                     }
@@ -459,27 +446,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             mi.RefFrame[1] = Constants.None;
         }
 
-        private static bool IsMvValid(ref Mv mv)
-        {
-            return mv.Row > Constants.MvLow &&
-                   mv.Row < Constants.MvUpp &&
-                   mv.Col > Constants.MvLow &&
-                   mv.Col < Constants.MvUpp;
-        }
-
-        private static void CopyMvPair(ref Array2<Mv> dst, ref Array2<Mv> src)
+        private static void CopyPair(ref Array2<Mv> dst, ref Array2<Mv> src)
         {
             dst[0] = src[0];
             dst[1] = src[1];
         }
 
-        private static void ZeroMvPair(ref Array2<Mv> dst)
+        private static void ZeroPair(ref Array2<Mv> dst)
         {
             dst[0] = new Mv();
             dst[1] = new Mv();
         }
 
-        private static bool AssignMv(
+        private static bool Assign(
             ref Vp9Common cm,
             ref MacroBlockD xd,
             PredictionMode mode,
@@ -487,45 +466,45 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             ref Array2<Mv> refMv,
             ref Array2<Mv> nearNearestMv,
             int isCompound,
-            bool allowHP,
+            bool allowHp,
             ref Reader r)
         {
-            int i;
             bool ret = true;
 
             switch (mode)
             {
                 case PredictionMode.NewMv:
                     {
-                        for (i = 0; i < 1 + isCompound; ++i)
+                        for (int i = 0; i < 1 + isCompound; ++i)
                         {
-                            ReadMv(ref r, ref mv[i], ref refMv[i], ref cm.Fc.Value, xd.Counts, allowHP);
-                            ret = ret && IsMvValid(ref mv[i]);
+                            Read(ref r, ref mv[i], ref refMv[i], ref cm.Fc.Value, xd.Counts, allowHp);
+                            ret = ret && mv[i].IsValid();
                         }
+
                         break;
                     }
                 case PredictionMode.NearMv:
                 case PredictionMode.NearestMv:
                     {
-                        CopyMvPair(ref mv, ref nearNearestMv);
+                        CopyPair(ref mv, ref nearNearestMv);
                         break;
                     }
                 case PredictionMode.ZeroMv:
                     {
-                        ZeroMvPair(ref mv);
+                        ZeroPair(ref mv);
                         break;
                     }
-                default:
-                    return false;
+                default: return false;
             }
+
             return ret;
         }
 
         private static bool ReadIsInterBlock(ref Vp9Common cm, ref MacroBlockD xd, int segmentId, ref Reader r)
         {
-            if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlRefFrame) != 0)
+            if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.RefFrame) != 0)
             {
-                return cm.Seg.GetSegData(segmentId, SegLvlFeatures.SegLvlRefFrame) != Constants.IntraFrame;
+                return cm.Seg.GetSegData(segmentId, SegLvlFeatures.RefFrame) != Constants.IntraFrame;
             }
 
             int ctx = xd.GetIntraInterContext();
@@ -538,33 +517,30 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             return isInter;
         }
 
-        private static void DecFindBestRefMvs(bool allowHP, Span<Mv> mvlist, ref Mv bestMv, int refmvCount)
+        private static void DecFindBestRefs(bool allowHp, Span<Mv> mvlist, ref Mv bestMv, int refmvCount)
         {
-            int i;
-
             // Make sure all the candidates are properly clamped etc
-            for (i = 0; i < refmvCount; ++i)
+            for (int i = 0; i < refmvCount; ++i)
             {
-                mvlist[i].LowerMvPrecision(allowHP);
+                mvlist[i].LowerPrecision(allowHp);
                 bestMv = mvlist[i];
             }
         }
 
-        private static bool AddMvRefListEb(Mv mv, ref int refMvCount, Span<Mv> mvRefList, bool earlyBreak)
+        private static bool AddRefListEb(Mv mv, ref int refCount, Span<Mv> mvRefList, bool earlyBreak)
         {
-            if (refMvCount != 0)
+            if (refCount != 0)
             {
                 if (Unsafe.As<Mv, int>(ref mv) != Unsafe.As<Mv, int>(ref mvRefList[0]))
                 {
-                    mvRefList[refMvCount] = mv;
-                    refMvCount++;
-
+                    mvRefList[refCount] = mv;
+                    refCount++;
                     return true;
                 }
             }
             else
             {
-                mvRefList[refMvCount++] = mv;
+                mvRefList[refCount++] = mv;
                 if (earlyBreak)
                 {
                     return true;
@@ -574,19 +550,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             return false;
         }
 
-        // Performs mv sign inversion if indicated by the reference frame combination.
-        private static Mv ScaleMv(ref ModeInfo mi, int refr, sbyte thisRefFrame, ref Array4<sbyte> refSignBias)
-        {
-            Mv mv = mi.Mv[refr];
-            if (refSignBias[mi.RefFrame[refr]] != refSignBias[thisRefFrame])
-            {
-                mv.Row *= -1;
-                mv.Col *= -1;
-            }
-            return mv;
-        }
-
-        private static bool IsDiffRefFrameAddMvEb(
+        private static bool IsDiffRefFrameAddEb(
             ref ModeInfo mbmi,
             sbyte refFrame,
             ref Array4<sbyte> refSignBias,
@@ -598,26 +562,30 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             {
                 if (mbmi.RefFrame[0] != refFrame)
                 {
-                    if (AddMvRefListEb(ScaleMv(ref mbmi, 0, refFrame, ref refSignBias), ref refmvCount, mvRefList, earlyBreak))
+                    if (AddRefListEb(mbmi.ScaleMv(0, refFrame, ref refSignBias), ref refmvCount, mvRefList,
+                            earlyBreak))
                     {
                         return true;
                     }
                 }
-                if (mbmi.HasSecondRef() && mbmi.RefFrame[1] != refFrame && Unsafe.As<Mv, int>(ref mbmi.Mv[1]) != Unsafe.As<Mv, int>(ref mbmi.Mv[0]))
+
+                if (mbmi.HasSecondRef() && mbmi.RefFrame[1] != refFrame &&
+                    Unsafe.As<Mv, int>(ref mbmi.Mv[1]) != Unsafe.As<Mv, int>(ref mbmi.Mv[0]))
                 {
-                    if (AddMvRefListEb(ScaleMv(ref mbmi, 1, refFrame, ref refSignBias), ref refmvCount, mvRefList, earlyBreak))
+                    if (AddRefListEb(mbmi.ScaleMv(1, refFrame, ref refSignBias), ref refmvCount, mvRefList,
+                            earlyBreak))
                     {
                         return true;
                     }
                 }
-
             }
+
             return false;
         }
 
         // This function searches the neighborhood of a given MB/SB
         // to try and find candidate reference vectors.
-        private static int DecFindMvRefs(
+        private static int DecFindRefs(
             ref Vp9Common cm,
             ref MacroBlockD xd,
             PredictionMode mode,
@@ -627,22 +595,24 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int miRow,
             int miCol,
             int block,
-            int isSub8X8)
+            int isSub8x8)
         {
             ref Array4<sbyte> refSignBias = ref cm.RefFrameSignBias;
             int i, refmvCount = 0;
             bool differentRefFound = false;
-            Ptr<MvRef> prevFrameMvs = cm.UsePrevFrameMvs ? new Ptr<MvRef>(ref cm.PrevFrameMvs[miRow * cm.MiCols + miCol]) : Ptr<MvRef>.Null;
+            Ptr<MvRef> prevFrameMvs = cm.UsePrevFrameMvs
+                ? new Ptr<MvRef>(ref cm.PrevFrameMvs[(miRow * cm.MiCols) + miCol])
+                : Ptr<MvRef>.Null;
             ref TileInfo tile = ref xd.Tile;
             // If mode is nearestmv or newmv (uses nearestmv as a reference) then stop
             // searching after the first mv is found.
             bool earlyBreak = mode != PredictionMode.NearMv;
 
             // Blank the reference vector list
-            mvRefList[..Constants.MaxMvRefCandidates].Clear();
+            mvRefList.Slice(0, Constants.MaxMvRefCandidates).Fill(new Mv());
 
             i = 0;
-            if (isSub8X8 != 0)
+            if (isSub8x8 != 0)
             {
                 // If the size < 8x8 we get the mv from the bmi substructure for the
                 // nearest two blocks.
@@ -651,19 +621,21 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     ref Position mvRef = ref mvRefSearch[i];
                     if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef))
                     {
-                        ref ModeInfo candidateMi = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value;
+                        ref ModeInfo candidateMi = ref xd.Mi[mvRef.Col + (mvRef.Row * xd.MiStride)].Value;
                         differentRefFound = true;
 
                         if (candidateMi.RefFrame[0] == refFrame)
                         {
-                            if (AddMvRefListEb(candidateMi.GetSubBlockMv(0, mvRef.Col, block), ref refmvCount, mvRefList, earlyBreak))
+                            if (AddRefListEb(candidateMi.GetSubBlockMv(0, mvRef.Col, block), ref refmvCount,
+                                    mvRefList, earlyBreak))
                             {
                                 goto Done;
                             }
                         }
                         else if (candidateMi.RefFrame[1] == refFrame)
                         {
-                            if (AddMvRefListEb(candidateMi.GetSubBlockMv(1, mvRef.Col, block), ref refmvCount, mvRefList, earlyBreak))
+                            if (AddRefListEb(candidateMi.GetSubBlockMv(1, mvRef.Col, block), ref refmvCount,
+                                    mvRefList, earlyBreak))
                             {
                                 goto Done;
                             }
@@ -675,24 +647,24 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // Check the rest of the neighbors in much the same way
             // as before except we don't need to keep track of sub blocks or
             // mode counts.
-            for (; i < MvrefNeighbours; ++i)
+            for (; i < RefNeighbours; ++i)
             {
                 ref Position mvRef = ref mvRefSearch[i];
                 if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef))
                 {
-                    ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value;
+                    ref ModeInfo candidate = ref xd.Mi[mvRef.Col + (mvRef.Row * xd.MiStride)].Value;
                     differentRefFound = true;
 
                     if (candidate.RefFrame[0] == refFrame)
                     {
-                        if (AddMvRefListEb(candidate.Mv[0], ref refmvCount, mvRefList, earlyBreak))
+                        if (AddRefListEb(candidate.Mv[0], ref refmvCount, mvRefList, earlyBreak))
                         {
                             goto Done;
                         }
                     }
                     else if (candidate.RefFrame[1] == refFrame)
                     {
-                        if (AddMvRefListEb(candidate.Mv[1], ref refmvCount, mvRefList, earlyBreak))
+                        if (AddRefListEb(candidate.Mv[1], ref refmvCount, mvRefList, earlyBreak))
                         {
                             goto Done;
                         }
@@ -705,14 +677,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             {
                 if (prevFrameMvs.Value.RefFrame[0] == refFrame)
                 {
-                    if (AddMvRefListEb(prevFrameMvs.Value.Mv[0], ref refmvCount, mvRefList, earlyBreak))
+                    if (AddRefListEb(prevFrameMvs.Value.Mv[0], ref refmvCount, mvRefList, earlyBreak))
                     {
                         goto Done;
                     }
                 }
                 else if (prevFrameMvs.Value.RefFrame[1] == refFrame)
                 {
-                    if (AddMvRefListEb(prevFrameMvs.Value.Mv[1], ref refmvCount, mvRefList, earlyBreak))
+                    if (AddRefListEb(prevFrameMvs.Value.Mv[1], ref refmvCount, mvRefList, earlyBreak))
                     {
                         goto Done;
                     }
@@ -724,15 +696,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // different reference frames.
             if (differentRefFound)
             {
-                for (i = 0; i < MvrefNeighbours; ++i)
+                for (i = 0; i < RefNeighbours; ++i)
                 {
                     ref Position mvRef = ref mvRefSearch[i];
                     if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef))
                     {
-                        ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value;
+                        ref ModeInfo candidate = ref xd.Mi[mvRef.Col + (mvRef.Row * xd.MiStride)].Value;
 
                         // If the candidate is Intra we don't want to consider its mv.
-                        if (IsDiffRefFrameAddMvEb(ref candidate, refFrame, ref refSignBias, ref refmvCount, mvRefList, earlyBreak))
+                        if (IsDiffRefFrameAddEb(ref candidate, refFrame, ref refSignBias, ref refmvCount, mvRefList,
+                                earlyBreak))
                         {
                             goto Done;
                         }
@@ -751,7 +724,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         mv.Row *= -1;
                         mv.Col *= -1;
                     }
-                    if (AddMvRefListEb(mv, ref refmvCount, mvRefList, earlyBreak))
+
+                    if (AddRefListEb(mv, ref refmvCount, mvRefList, earlyBreak))
                     {
                         goto Done;
                     }
@@ -759,7 +733,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
                 if (prevFrameMvs.Value.RefFrame[1] > Constants.IntraFrame &&
                     prevFrameMvs.Value.RefFrame[1] != refFrame &&
-                    Unsafe.As<Mv, int>(ref prevFrameMvs.Value.Mv[1]) != Unsafe.As<Mv, int>(ref prevFrameMvs.Value.Mv[0]))
+                    Unsafe.As<Mv, int>(ref prevFrameMvs.Value.Mv[1]) !=
+                    Unsafe.As<Mv, int>(ref prevFrameMvs.Value.Mv[0]))
                 {
                     Mv mv = prevFrameMvs.Value.Mv[1];
                     if (refSignBias[prevFrameMvs.Value.RefFrame[1]] != refSignBias[refFrame])
@@ -767,7 +742,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         mv.Row *= -1;
                         mv.Col *= -1;
                     }
-                    if (AddMvRefListEb(mv, ref refmvCount, mvRefList, earlyBreak))
+
+                    if (AddRefListEb(mv, ref refmvCount, mvRefList, earlyBreak))
                     {
                         goto Done;
                     }
@@ -784,17 +760,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 refmvCount = 1;
             }
 
-        Done:
+            Done:
             // Clamp vectors
             for (i = 0; i < refmvCount; ++i)
             {
-                mvRefList[i].ClampMvRef(ref xd);
+                mvRefList[i].ClampRef(ref xd);
             }
 
             return refmvCount;
         }
 
-        private static void AppendSub8x8MvsForIdx(
+        private static void AppendSub8x8ForIdx(
             ref Vp9Common cm,
             ref MacroBlockD xd,
             Span<Position> mvRefSearch,
@@ -808,12 +784,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             Span<Mv> mvList = stackalloc Mv[Constants.MaxMvRefCandidates];
             ref ModeInfo mi = ref xd.Mi[0].Value;
             ref Array4<BModeInfo> bmi = ref mi.Bmi;
-            int n;
             int refmvCount;
 
             Debug.Assert(Constants.MaxMvRefCandidates == 2);
 
-            refmvCount = DecFindMvRefs(ref cm, ref xd, bMode, mi.RefFrame[refr], mvRefSearch, mvList, miRow, miCol, block, 1);
+            refmvCount = DecFindRefs(ref cm, ref xd, bMode, mi.RefFrame[refr], mvRefSearch, mvList, miRow, miCol,
+                block, 1);
 
             switch (block)
             {
@@ -829,7 +805,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     else
                     {
                         bestSub8x8 = new Mv();
-                        for (n = 0; n < refmvCount; ++n)
+                        for (int n = 0; n < refmvCount; ++n)
                         {
                             if (Unsafe.As<Mv, int>(ref bmi[0].Mv[refr]) != Unsafe.As<Mv, int>(ref mvList[n]))
                             {
@@ -838,6 +814,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             }
                         }
                     }
+
                     break;
                 case 3:
                     if (bMode == PredictionMode.NearestMv)
@@ -852,7 +829,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         candidates[2] = mvList[0];
                         candidates[3] = mvList[1];
                         bestSub8x8 = new Mv();
-                        for (n = 0; n < 2 + Constants.MaxMvRefCandidates; ++n)
+                        for (int n = 0; n < 2 + Constants.MaxMvRefCandidates; ++n)
                         {
                             if (Unsafe.As<Mv, int>(ref bmi[2].Mv[refr]) != Unsafe.As<Mv, int>(ref candidates[n]))
                             {
@@ -861,6 +838,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             }
                         }
                     }
+
                     break;
                 default:
                     Debug.Assert(false, "Invalid block index.");
@@ -868,19 +846,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
         }
 
-        private static byte GetModeContext(ref Vp9Common cm, ref MacroBlockD xd, Span<Position> mvRefSearch, int miRow, int miCol)
+        private static byte GetModeContext(ref Vp9Common cm, ref MacroBlockD xd, Span<Position> mvRefSearch, int miRow,
+            int miCol)
         {
-            int i;
             int contextCounter = 0;
             ref TileInfo tile = ref xd.Tile;
 
             // Get mode count from nearest 2 blocks
-            for (i = 0; i < 2; ++i)
+            for (int i = 0; i < 2; ++i)
             {
                 ref Position mvRef = ref mvRefSearch[i];
                 if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef))
                 {
-                    ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value;
+                    ref ModeInfo candidate = ref xd.Mi[mvRef.Col + (mvRef.Row * xd.MiStride)].Value;
                     // Keep counts for entropy encoding.
                     contextCounter += Luts.Mode2Counter[(int)candidate.Mode];
                 }
@@ -898,7 +876,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             ref Reader r)
         {
             BlockSize bsize = mi.SbType;
-            bool allowHP = cm.AllowHighPrecisionMv;
+            bool allowHp = cm.AllowHighPrecisionMv;
             Array2<Mv> bestRefMvs = new();
             int refr, isCompound;
             byte interModeCtx;
@@ -908,13 +886,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             isCompound = mi.HasSecondRef() ? 1 : 0;
             interModeCtx = GetModeContext(ref cm, ref xd, mvRefSearch, miRow, miCol);
 
-            if (cm.Seg.IsSegFeatureActive(mi.SegmentId, SegLvlFeatures.SegLvlSkip) != 0)
+            if (cm.Seg.IsSegFeatureActive(mi.SegmentId, SegLvlFeatures.Skip) != 0)
             {
                 mi.Mode = PredictionMode.ZeroMv;
                 if (bsize < BlockSize.Block8x8)
                 {
-                    xd.ErrorInfo.Value.InternalError(CodecErr.CodecUnsupBitstream, "Invalid usage of segement feature on small blocks");
-
+                    xd.ErrorInfo.Value.InternalError(CodecErr.UnsupBitstream,
+                        "Invalid usage of segement feature on small blocks");
                     return;
                 }
             }
@@ -942,53 +920,58 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         sbyte frame = mi.RefFrame[refr];
                         int refmvCount;
 
-                        refmvCount = DecFindMvRefs(ref cm, ref xd, mi.Mode, frame, mvRefSearch, tmpMvs, miRow, miCol, -1, 0);
+                        refmvCount = DecFindRefs(ref cm, ref xd, mi.Mode, frame, mvRefSearch, tmpMvs, miRow, miCol,
+                            -1, 0);
 
-                        DecFindBestRefMvs(allowHP, tmpMvs, ref bestRefMvs[refr], refmvCount);
+                        DecFindBestRefs(allowHp, tmpMvs, ref bestRefMvs[refr], refmvCount);
                     }
                 }
             }
 
-            mi.InterpFilter = (cm.InterpFilter == Constants.Switchable) ? ReadSwitchableInterpFilter(ref cm, ref xd, ref r) : cm.InterpFilter;
+            mi.InterpFilter = cm.InterpFilter == Constants.Switchable
+                ? ReadSwitchableInterpFilter(ref cm, ref xd, ref r)
+                : cm.InterpFilter;
 
             if (bsize < BlockSize.Block8x8)
             {
-                int num4X4W = 1 << xd.BmodeBlocksWl;
-                int num4X4H = 1 << xd.BmodeBlocksHl;
+                int num4x4W = 1 << xd.BmodeBlocksWl;
+                int num4x4H = 1 << xd.BmodeBlocksHl;
                 int idx, idy;
                 PredictionMode bMode = 0;
                 Array2<Mv> bestSub8x8 = new();
-                const uint InvalidMv = 0x80008000;
+                const uint invalidMv = 0x80008000;
                 // Initialize the 2nd element as even though it won't be used meaningfully
                 // if isCompound is false.
-                Unsafe.As<Mv, uint>(ref bestSub8x8[1]) = InvalidMv;
-                for (idy = 0; idy < 2; idy += num4X4H)
+                Unsafe.As<Mv, uint>(ref bestSub8x8[1]) = invalidMv;
+                for (idy = 0; idy < 2; idy += num4x4H)
                 {
-                    for (idx = 0; idx < 2; idx += num4X4W)
+                    for (idx = 0; idx < 2; idx += num4x4W)
                     {
-                        int j = idy * 2 + idx;
+                        int j = (idy * 2) + idx;
                         bMode = ReadInterMode(ref cm, ref xd, ref r, interModeCtx);
 
                         if (bMode == PredictionMode.NearestMv || bMode == PredictionMode.NearMv)
                         {
                             for (refr = 0; refr < 1 + isCompound; ++refr)
                             {
-                                AppendSub8x8MvsForIdx(ref cm, ref xd, mvRefSearch, bMode, j, refr, miRow, miCol, ref bestSub8x8[refr]);
+                                AppendSub8x8ForIdx(ref cm, ref xd, mvRefSearch, bMode, j, refr, miRow, miCol,
+                                    ref bestSub8x8[refr]);
                             }
                         }
 
-                        if (!AssignMv(ref cm, ref xd, bMode, ref mi.Bmi[j].Mv, ref bestRefMvs, ref bestSub8x8, isCompound, allowHP, ref r))
+                        if (!Assign(ref cm, ref xd, bMode, ref mi.Bmi[j].Mv, ref bestRefMvs, ref bestSub8x8,
+                                isCompound, allowHp, ref r))
                         {
                             xd.Corrupted |= true;
                             break;
                         }
 
-                        if (num4X4H == 2)
+                        if (num4x4H == 2)
                         {
                             mi.Bmi[j + 2] = mi.Bmi[j];
                         }
 
-                        if (num4X4W == 2)
+                        if (num4x4W == 2)
                         {
                             mi.Bmi[j + 1] = mi.Bmi[j];
                         }
@@ -997,11 +980,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
                 mi.Mode = bMode;
 
-                CopyMvPair(ref mi.Mv, ref mi.Bmi[3].Mv);
+                CopyPair(ref mi.Mv, ref mi.Bmi[3].Mv);
             }
             else
             {
-                xd.Corrupted |= !AssignMv(ref cm, ref xd, mi.Mode, ref mi.Mv, ref bestRefMvs, ref bestRefMvs, isCompound, allowHP, ref r);
+                xd.Corrupted |= !Assign(ref cm, ref xd, mi.Mode, ref mi.Mv, ref bestRefMvs, ref bestRefMvs,
+                    isCompound, allowHp, ref r);
             }
         }
 
@@ -1045,7 +1029,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
 
             Debug.Assert(b == 1 || b == 3);
-
             return curMi.Value.Bmi[b - 1].Mode;
         }
 
@@ -1062,7 +1045,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
 
             Debug.Assert(b == 2 || b == 3);
-
             return curMi.Value.Bmi[b - 2].Mode;
         }
 
@@ -1075,7 +1057,6 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         {
             PredictionMode above = AboveBlockMode(mi, aboveMi, block);
             PredictionMode left = LeftBlockMode(mi, leftMi, block);
-
             return fc.KfYModeProb[(int)above][(int)left].AsSpan();
         }
 
@@ -1092,8 +1073,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             Ptr<ModeInfo> aboveMi = xd.AboveMi;
             Ptr<ModeInfo> leftMi = xd.LeftMi;
             BlockSize bsize = mi.Value.SbType;
-            int i;
-            int miOffset = miRow * cm.MiCols + miCol;
+
+            int miOffset = (miRow * cm.MiCols) + miCol;
 
             mi.Value.SegmentId = (sbyte)ReadIntraSegmentId(ref cm, miOffset, xMis, yMis, ref r);
             mi.Value.Skip = (sbyte)ReadSkip(ref cm, ref xd, mi.Value.SegmentId, ref r);
@@ -1104,7 +1085,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             switch (bsize)
             {
                 case BlockSize.Block4x4:
-                    for (i = 0; i < 4; ++i)
+                    for (int i = 0; i < 4; ++i)
                     {
                         mi.Value.Bmi[i].Mode =
                             ReadIntraMode(ref r, GetYModeProbs(ref cm.Fc.Value, mi, aboveMi, leftMi, i));
@@ -1149,8 +1130,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             ref Reader r = ref twd.BitReader;
             ref MacroBlockD xd = ref twd.Xd;
             ref ModeInfo mi = ref xd.Mi[0].Value;
-            ArrayPtr<MvRef> frameMvs = cm.CurFrameMvs.Slice(miRow * cm.MiCols + miCol);
-            int w, h;
+            ArrayPtr<MvRef> frameMvs = cm.CurFrameMvs.Slice((miRow * cm.MiCols) + miCol);
 
             if (cm.FrameIsIntraOnly())
             {
@@ -1160,17 +1140,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             {
                 ReadInterFrameModeInfo(ref cm, ref xd, miRow, miCol, ref r, xMis, yMis);
 
-                for (h = 0; h < yMis; ++h)
+                for (int h = 0; h < yMis; ++h)
                 {
-                    for (w = 0; w < xMis; ++w)
+                    for (int w = 0; w < xMis; ++w)
                     {
                         ref MvRef mv = ref frameMvs[w];
                         CopyRefFramePair(ref mv.RefFrame, ref mi.RefFrame);
-                        CopyMvPair(ref mv.Mv, ref mi.Mv);
+                        CopyPair(ref mv.Mv, ref mi.Mv);
                     }
+
                     frameMvs = frameMvs.Slice(cm.MiCols);
                 }
             }
         }
     }
-}
+}

+ 51 - 65
src/Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Nvdec.Vp9.Common;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using Ryujinx.Graphics.Video;
@@ -12,14 +12,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
         private readonly MemoryAllocator _allocator = new();
 
-        public ISurface CreateSurface(int width, int height) => new Surface(width, height);
+        public ISurface CreateSurface(int width, int height)
+        {
+            return new Surface(width, height);
+        }
 
         private static ReadOnlySpan<byte> LiteralToFilter => new byte[]
         {
-            Constants.EightTapSmooth,
-            Constants.EightTap,
-            Constants.EightTapSharp,
-            Constants.Bilinear,
+            Constants.EightTapSmooth, Constants.EightTap, Constants.EightTapSharp, Constants.Bilinear
         };
 
         public unsafe bool Decode(
@@ -29,25 +29,24 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             ReadOnlySpan<Vp9MvRef> mvsIn,
             Span<Vp9MvRef> mvsOut)
         {
-            Vp9Common cm = new()
-            {
-                FrameType = pictureInfo.IsKeyFrame ? FrameType.KeyFrame : FrameType.InterFrame,
-                IntraOnly = pictureInfo.IntraOnly,
+            Vp9Common cm = new();
+
+            cm.FrameType = pictureInfo.IsKeyFrame ? FrameType.KeyFrame : FrameType.InterFrame;
+            cm.IntraOnly = pictureInfo.IntraOnly;
 
-                Width = output.Width,
-                Height = output.Height,
-                SubsamplingX = 1,
-                SubsamplingY = 1,
+            cm.Width = output.Width;
+            cm.Height = output.Height;
+            cm.SubsamplingX = 1;
+            cm.SubsamplingY = 1;
 
-                UsePrevFrameMvs = pictureInfo.UsePrevInFindMvRefs,
+            cm.UsePrevFrameMvs = pictureInfo.UsePrevInFindMvRefs;
 
-                RefFrameSignBias = pictureInfo.RefFrameSignBias,
+            cm.RefFrameSignBias = pictureInfo.RefFrameSignBias;
 
-                BaseQindex = pictureInfo.BaseQIndex,
-                YDcDeltaQ = pictureInfo.YDcDeltaQ,
-                UvAcDeltaQ = pictureInfo.UvAcDeltaQ,
-                UvDcDeltaQ = pictureInfo.UvDcDeltaQ,
-            };
+            cm.BaseQindex = pictureInfo.BaseQIndex;
+            cm.YDcDeltaQ = pictureInfo.YDcDeltaQ;
+            cm.UvAcDeltaQ = pictureInfo.UvAcDeltaQ;
+            cm.UvDcDeltaQ = pictureInfo.UvDcDeltaQ;
 
             cm.Mb.Lossless = pictureInfo.Lossless;
             cm.Mb.Bd = 8;
@@ -68,6 +67,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             cm.CompFixedRef = pictureInfo.CompFixedRef;
             cm.CompVarRef = pictureInfo.CompVarRef;
 
+            cm.BitDepth = BitDepth.Bits8;
+
             cm.Log2TileCols = pictureInfo.Log2TileCols;
             cm.Log2TileRows = pictureInfo.Log2TileRows;
 
@@ -78,6 +79,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             cm.Seg.FeatureMask = pictureInfo.SegmentFeatureEnable;
             cm.Seg.FeatureData = pictureInfo.SegmentFeatureData;
 
+            cm.Lf.FilterLevel = pictureInfo.LoopFilterLevel;
+            cm.Lf.SharpnessLevel = pictureInfo.LoopFilterSharpnessLevel;
             cm.Lf.ModeRefDeltaEnabled = pictureInfo.ModeRefDeltaEnabled;
             cm.Lf.RefDeltas = pictureInfo.RefDeltas;
             cm.Lf.ModeDeltas = pictureInfo.ModeDeltas;
@@ -105,7 +108,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             cm.SetupSegmentationDequant();
             cm.SetupScaleFactors();
 
-            SetMvs(ref cm, mvsIn);
+            cm.SetMvs(mvsIn);
+
+            if (cm.Lf.FilterLevel != 0 && cm.SkipLoopFilter == 0)
+            {
+                LoopFilter.LoopFilterFrameInit(ref cm, cm.Lf.FilterLevel);
+            }
 
             fixed (byte* dataPtr = bitstream)
             {
@@ -114,10 +122,27 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     if (maxThreads > 1 && tileRows == 1 && tileCols > 1)
                     {
                         DecodeFrame.DecodeTilesMt(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length), maxThreads);
+
+                        LoopFilter.LoopFilterFrameMt(
+                            ref cm.Mb.CurBuf,
+                            ref cm,
+                            ref cm.Mb,
+                            cm.Lf.FilterLevel,
+                            false,
+                            false,
+                            maxThreads);
                     }
                     else
                     {
                         DecodeFrame.DecodeTiles(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length));
+
+                        LoopFilter.LoopFilterFrame(
+                            ref cm.Mb.CurBuf,
+                            ref cm,
+                            ref cm.Mb,
+                            cm.Lf.FilterLevel,
+                            false,
+                            false);
                     }
                 }
                 catch (InternalErrorException)
@@ -126,7 +151,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 }
             }
 
-            GetMvs(ref cm, mvsOut);
+            cm.GetMvs(mvsOut);
 
             cm.FreeTileWorkerData(_allocator);
             cm.FreeContextBuffers(_allocator);
@@ -134,48 +159,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             return true;
         }
 
-        private static void SetMvs(ref Vp9Common cm, ReadOnlySpan<Vp9MvRef> mvs)
-        {
-            if (mvs.Length > cm.PrevFrameMvs.Length)
-            {
-                throw new ArgumentException($"Size mismatch, expected: {cm.PrevFrameMvs.Length}, but got: {mvs.Length}.");
-            }
-
-            for (int i = 0; i < mvs.Length; i++)
-            {
-                ref var mv = ref cm.PrevFrameMvs[i];
-
-                mv.Mv[0].Row = mvs[i].Mvs[0].Row;
-                mv.Mv[0].Col = mvs[i].Mvs[0].Col;
-                mv.Mv[1].Row = mvs[i].Mvs[1].Row;
-                mv.Mv[1].Col = mvs[i].Mvs[1].Col;
-
-                mv.RefFrame[0] = (sbyte)mvs[i].RefFrames[0];
-                mv.RefFrame[1] = (sbyte)mvs[i].RefFrames[1];
-            }
-        }
-
-        private static void GetMvs(ref Vp9Common cm, Span<Vp9MvRef> mvs)
+        public void Dispose()
         {
-            if (mvs.Length > cm.CurFrameMvs.Length)
-            {
-                throw new ArgumentException($"Size mismatch, expected: {cm.CurFrameMvs.Length}, but got: {mvs.Length}.");
-            }
-
-            for (int i = 0; i < mvs.Length; i++)
-            {
-                ref var mv = ref cm.CurFrameMvs[i];
-
-                mvs[i].Mvs[0].Row = mv.Mv[0].Row;
-                mvs[i].Mvs[0].Col = mv.Mv[0].Col;
-                mvs[i].Mvs[1].Row = mv.Mv[1].Row;
-                mvs[i].Mvs[1].Col = mv.Mv[1].Col;
-
-                mvs[i].RefFrames[0] = mv.RefFrame[0];
-                mvs[i].RefFrames[1] = mv.RefFrame[1];
-            }
+            _allocator.Dispose();
         }
-
-        public void Dispose() => _allocator.Dispose();
     }
-}
+}

+ 37 - 41
src/Ryujinx.Graphics.Nvdec.Vp9/Detokenize.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using Ryujinx.Graphics.Video;
@@ -17,26 +17,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
         private static int GetCoefContext(ReadOnlySpan<short> neighbors, ReadOnlySpan<byte> tokenCache, int c)
         {
-            const int MaxNeighbors = 2;
+            const int maxNeighbors = 2;
 
-            return (1 + tokenCache[neighbors[MaxNeighbors * c + 0]] + tokenCache[neighbors[MaxNeighbors * c + 1]]) >> 1;
-        }
-
-        private static int ReadCoeff(
-            ref Reader r,
-            ReadOnlySpan<byte> probs,
-            int n,
-            ref ulong value,
-            ref int count,
-            ref uint range)
-        {
-            int i, val = 0;
-            for (i = 0; i < n; ++i)
-            {
-                val = (val << 1) | r.ReadBool(probs[i], ref value, ref count, ref range);
-            }
-
-            return val;
+            return (1 + tokenCache[neighbors[(maxNeighbors * c) + 0]] +
+                    tokenCache[neighbors[(maxNeighbors * c) + 1]]) >> 1;
         }
 
         private static int DecodeCoefs(
@@ -58,13 +42,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             ref Array6<Array6<Array3<byte>>> coefProbs = ref fc.CoefProbs[(int)txSize][(int)type][refr];
             Span<byte> tokenCache = stackalloc byte[32 * 32];
             ReadOnlySpan<byte> bandTranslate = Luts.GetBandTranslate(txSize);
-            int dqShift = (txSize == TxSize.Tx32x32) ? 1 : 0;
+            int dqShift = txSize == TxSize.Tx32x32 ? 1 : 0;
             int v;
             short dqv = dq[0];
-            ReadOnlySpan<byte> cat6Prob = (xd.Bd == 12)
-                ? Luts.Vp9Cat6ProbHigh12
-                : (xd.Bd == 10) ? Luts.Vp9Cat6ProbHigh12[2..] : Luts.Vp9Cat6Prob;
-            int cat6Bits = (xd.Bd == 12) ? 18 : (xd.Bd == 10) ? 16 : 14;
+            ReadOnlySpan<byte> cat6Prob = xd.Bd == 12
+                ? Luts.Cat6ProbHigh12
+                : xd.Bd == 10
+                    ? Luts.Cat6ProbHigh12.Slice(2)
+                    : Luts.Cat6Prob;
+            int cat6Bits = xd.Bd == 12 ? 18 : xd.Bd == 10 ? 16 : 14;
             // Keep value, range, and count as locals.  The compiler produces better
             // results with the locals than using r directly.
             ulong value = r.Value;
@@ -75,7 +61,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             {
                 int val = -1;
                 band = bandTranslate[0];
-                bandTranslate = bandTranslate[1..];
+                bandTranslate = bandTranslate.Slice(1);
                 ref Array3<byte> prob = ref coefProbs[band][ctx];
                 if (!xd.Counts.IsNull)
                 {
@@ -107,18 +93,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         r.Value = value;
                         r.Range = range;
                         r.Count = count;
-
                         return c; // Zero tokens at the end (no eob token)
                     }
+
                     ctx = GetCoefContext(nb, tokenCache, c);
                     band = bandTranslate[0];
-                    bandTranslate = bandTranslate[1..];
+                    bandTranslate = bandTranslate.Slice(1);
                     prob = ref coefProbs[band][ctx];
                 }
 
                 if (r.ReadBool(prob[OneContextNode], ref value, ref count, ref range) != 0)
                 {
-                    ReadOnlySpan<byte> p = Luts.Vp9Pareto8Full[prob[Constants.PivotNode] - 1];
+                    ReadOnlySpan<byte> p = Luts.Pareto8Full[prob[Constants.PivotNode] - 1];
                     if (!xd.Counts.IsNull)
                     {
                         ++counts.Coef[(int)txSize][(int)type][refr][band][ctx][Constants.TwoToken];
@@ -133,20 +119,24 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             {
                                 if (r.ReadBool(p[7], ref value, ref count, ref range) != 0)
                                 {
-                                    val = Constants.Cat6MinVal + ReadCoeff(ref r, cat6Prob, cat6Bits, ref value, ref count, ref range);
+                                    val = Constants.Cat6MinVal + r.ReadCoeff(cat6Prob, cat6Bits, ref value,
+                                        ref count, ref range);
                                 }
                                 else
                                 {
-                                    val = Constants.Cat5MinVal + ReadCoeff(ref r, Luts.Vp9Cat5Prob, 5, ref value, ref count, ref range);
+                                    val = Constants.Cat5MinVal + r.ReadCoeff(Luts.Cat5Prob, 5, ref value,
+                                        ref count, ref range);
                                 }
                             }
                             else if (r.ReadBool(p[6], ref value, ref count, ref range) != 0)
                             {
-                                val = Constants.Cat4MinVal + ReadCoeff(ref r, Luts.Vp9Cat4Prob, 4, ref value, ref count, ref range);
+                                val = Constants.Cat4MinVal + r.ReadCoeff(Luts.Cat4Prob, 4, ref value, ref count,
+                                    ref range);
                             }
                             else
                             {
-                                val = Constants.Cat3MinVal + ReadCoeff(ref r, Luts.Vp9Cat3Prob, 3, ref value, ref count, ref range);
+                                val = Constants.Cat3MinVal + r.ReadCoeff(Luts.Cat3Prob, 3, ref value, ref count,
+                                    ref range);
                             }
                         }
                         else
@@ -154,13 +144,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             tokenCache[scan[c]] = 4;
                             if (r.ReadBool(p[4], ref value, ref count, ref range) != 0)
                             {
-                                val = Constants.Cat2MinVal + ReadCoeff(ref r, Luts.Vp9Cat2Prob, 2, ref value, ref count, ref range);
+                                val = Constants.Cat2MinVal + r.ReadCoeff(Luts.Cat2Prob, 2, ref value, ref count,
+                                    ref range);
                             }
                             else
                             {
-                                val = Constants.Cat1MinVal + ReadCoeff(ref r, Luts.Vp9Cat1Prob, 1, ref value, ref count, ref range);
+                                val = Constants.Cat1MinVal + r.ReadCoeff(Luts.Cat1Prob, 1, ref value, ref count,
+                                    ref range);
                             }
                         }
+
                         // Val may use 18-bits
                         v = (int)(((long)val * dqv) >> dqShift);
                     }
@@ -188,7 +181,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     tokenCache[scan[c]] = 1;
                     v = dqv >> dqShift;
                 }
-                dqcoeff[scan[c]] = (int)HighbdCheckRange(r.ReadBool(128, ref value, ref count, ref range) != 0 ? -v : v, xd.Bd);
+
+                dqcoeff[scan[c]] = (int)HighbdCheckRange(r.ReadBool(128, ref value, ref count, ref range) != 0 ? -v : v,
+                    xd.Bd);
                 ++c;
                 ctx = GetCoefContext(nb, tokenCache, c);
                 dqv = dq[1];
@@ -197,11 +192,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             r.Value = value;
             r.Range = range;
             r.Count = count;
-
             return c;
         }
 
-        private static void GetCtxShift(ref MacroBlockD xd, ref int ctxShiftA, ref int ctxShiftL, int x, int y, uint txSizeInBlocks)
+        private static void GetCtxShift(ref MacroBlockD xd, ref int ctxShiftA, ref int ctxShiftL, int x, int y,
+            uint txSizeInBlocks)
         {
             if (xd.MaxBlocksWide != 0)
             {
@@ -210,6 +205,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     ctxShiftA = (int)(txSizeInBlocks - (xd.MaxBlocksWide - x)) * 8;
                 }
             }
+
             if (xd.MaxBlocksHigh != 0)
             {
                 if (txSizeInBlocks + y > xd.MaxBlocksHigh)
@@ -238,8 +234,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             ref MacroBlockDPlane pd = ref xd.Plane[plane];
             ref Array2<short> dequant = ref pd.SegDequant[segId];
             int eob;
-            Span<sbyte> a = pd.AboveContext.AsSpan()[x..];
-            Span<sbyte> l = pd.LeftContext.AsSpan()[y..];
+            Span<sbyte> a = pd.AboveContext.AsSpan().Slice(x);
+            Span<sbyte> l = pd.LeftContext.AsSpan().Slice(y);
             int ctx;
             int ctxShiftA = 0;
             int ctxShiftL = 0;
@@ -324,4 +320,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             return eob;
         }
     }
-}
+}

+ 83 - 81
src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Nvdec.Vp9.Common;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
@@ -75,17 +75,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Vector128<int> zero = Vector128<int>.Zero;
             Vector128<int> const64 = Vector128.Create(64);
 
-            ulong x, y;
-            src -= SubpelTaps / 2 - 1;
+            src -= (SubpelTaps / 2) - 1;
 
             fixed (Array8<short>* xFilter = xFilters)
             {
-                Vector128<short> vfilter = Sse2.LoadVector128((short*)xFilter + (uint)(x0Q4 & SubpelMask) * 8);
+                Vector128<short> vfilter = Sse2.LoadVector128((short*)xFilter + ((uint)(x0Q4 & SubpelMask) * 8));
 
-                for (y = 0; y < (uint)h; ++y)
+                for (ulong y = 0; y < (uint)h; ++y)
                 {
                     ulong srcOffset = (uint)x0Q4 >> SubpelBits;
-                    for (x = 0; x < (uint)w; x += 4)
+                    for (ulong x = 0; x < (uint)w; x += 4)
                     {
                         Vector128<short> vsrc0 = Sse41.ConvertToVector128Int16(&src[srcOffset + x]);
                         Vector128<short> vsrc1 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 1]);
@@ -94,8 +93,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
                         Vector128<int> sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);
 
-                        Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
+                        Sse.StoreScalar((float*)&dst[x],
+                            PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
                     }
+
                     src += srcStride;
                     dst += dstStride;
                 }
@@ -117,22 +118,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             if (Sse41.IsSupported && UseIntrinsics && xStepQ4 == 1 << SubpelBits)
             {
                 ConvolveHorizSse41(src, srcStride, dst, dstStride, xFilters, x0Q4, w, h);
-
                 return;
             }
 
-            int x, y;
-            src -= SubpelTaps / 2 - 1;
+            src -= (SubpelTaps / 2) - 1;
 
-            for (y = 0; y < h; ++y)
+            for (int y = 0; y < h; ++y)
             {
                 int xQ4 = x0Q4;
-                for (x = 0; x < w; ++x)
+                for (int x = 0; x < w; ++x)
                 {
                     byte* srcX = &src[xQ4 >> SubpelBits];
                     ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
-                    int k, sum = 0;
-                    for (k = 0; k < SubpelTaps; ++k)
+                    int sum = 0;
+                    for (int k = 0; k < SubpelTaps; ++k)
                     {
                         sum += srcX[k] * xFilter[k];
                     }
@@ -140,6 +139,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
                     dst[x] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
                     xQ4 += xStepQ4;
                 }
+
                 src += srcStride;
                 dst += dstStride;
             }
@@ -156,25 +156,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             int w,
             int h)
         {
-            int x, y;
-            src -= SubpelTaps / 2 - 1;
+            src -= (SubpelTaps / 2) - 1;
 
-            for (y = 0; y < h; ++y)
+            for (int y = 0; y < h; ++y)
             {
                 int xQ4 = x0Q4;
-                for (x = 0; x < w; ++x)
+                for (int x = 0; x < w; ++x)
                 {
                     byte* srcX = &src[xQ4 >> SubpelBits];
                     ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
-                    int k, sum = 0;
-                    for (k = 0; k < SubpelTaps; ++k)
+                    int sum = 0;
+                    for (int k = 0; k < SubpelTaps; ++k)
                     {
                         sum += srcX[k] * xFilter[k];
                     }
 
-                    dst[x] = (byte)BitUtils.RoundPowerOfTwo(dst[x] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)), 1);
+                    dst[x] = (byte)BitUtils.RoundPowerOfTwo(
+                        dst[x] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)), 1);
                     xQ4 += xStepQ4;
                 }
+
                 src += srcStride;
                 dst += dstStride;
             }
@@ -203,18 +204,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
                 srcStride * 6,
                 srcStride * 7);
 
-            ulong x, y;
-            src -= srcStride * (SubpelTaps / 2 - 1);
+            src -= srcStride * ((SubpelTaps / 2) - 1);
 
             fixed (Array8<short>* yFilter = yFilters)
             {
-                Vector128<short> vfilter = Sse2.LoadVector128((short*)yFilter + (uint)(y0Q4 & SubpelMask) * 8);
+                Vector128<short> vfilter = Sse2.LoadVector128((short*)yFilter + ((uint)(y0Q4 & SubpelMask) * 8));
 
                 ulong srcBaseY = (uint)y0Q4 >> SubpelBits;
-                for (y = 0; y < (uint)h; ++y)
+                for (ulong y = 0; y < (uint)h; ++y)
                 {
                     ulong srcOffset = (srcBaseY + y) * (uint)srcStride;
-                    for (x = 0; x < (uint)w; x += 4)
+                    for (ulong x = 0; x < (uint)w; x += 4)
                     {
                         Vector256<int> vsrc = Avx2.GatherVector256((uint*)&src[srcOffset + x], indices, 1).AsInt32();
 
@@ -240,8 +240,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
                         Vector128<int> sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);
 
-                        Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
+                        Sse.StoreScalar((float*)&dst[x],
+                            PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
                     }
+
                     dst += dstStride;
                 }
             }
@@ -262,22 +264,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             if (Avx2.IsSupported && UseIntrinsics && yStepQ4 == 1 << SubpelBits)
             {
                 ConvolveVertAvx2(src, srcStride, dst, dstStride, yFilters, y0Q4, w, h);
-
                 return;
             }
 
-            int x, y;
-            src -= srcStride * (SubpelTaps / 2 - 1);
+            src -= srcStride * ((SubpelTaps / 2) - 1);
 
-            for (x = 0; x < w; ++x)
+            for (int x = 0; x < w; ++x)
             {
                 int yQ4 = y0Q4;
-                for (y = 0; y < h; ++y)
+                for (int y = 0; y < h; ++y)
                 {
                     byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
                     ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
-                    int k, sum = 0;
-                    for (k = 0; k < SubpelTaps; ++k)
+                    int sum = 0;
+                    for (int k = 0; k < SubpelTaps; ++k)
                     {
                         sum += srcY[k * srcStride] * yFilter[k];
                     }
@@ -285,6 +285,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
                     dst[y * dstStride] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
                     yQ4 += yStepQ4;
                 }
+
                 ++src;
                 ++dst;
             }
@@ -301,18 +302,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             int w,
             int h)
         {
-            int x, y;
-            src -= srcStride * (SubpelTaps / 2 - 1);
+            src -= srcStride * ((SubpelTaps / 2) - 1);
 
-            for (x = 0; x < w; ++x)
+            for (int x = 0; x < w; ++x)
             {
                 int yQ4 = y0Q4;
-                for (y = 0; y < h; ++y)
+                for (int y = 0; y < h; ++y)
                 {
                     byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
                     ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
-                    int k, sum = 0;
-                    for (k = 0; k < SubpelTaps; ++k)
+                    int sum = 0;
+                    for (int k = 0; k < SubpelTaps; ++k)
                     {
                         sum += srcY[k * srcStride] * yFilter[k];
                     }
@@ -321,6 +321,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
                         dst[y * dstStride] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits)), 1);
                     yQ4 += yStepQ4;
                 }
+
                 ++src;
                 ++dst;
             }
@@ -420,15 +421,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             // ==> yStepQ4 = 64. Since w and h are at most 16, the temp buffer is still
             // big enough.
             byte* temp = stackalloc byte[64 * 135];
-            int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;
+            int intermediateHeight = ((((h - 1) * yStepQ4) + y0Q4) >> SubpelBits) + SubpelTaps;
 
             Debug.Assert(w <= 64);
             Debug.Assert(h <= 64);
             Debug.Assert(yStepQ4 <= 32 || (yStepQ4 <= 64 && h <= 32));
             Debug.Assert(xStepQ4 <= 64);
 
-            ConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, 64, filter, x0Q4, xStepQ4, w, intermediateHeight);
-            ConvolveVert(temp + 64 * (SubpelTaps / 2 - 1), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
+            ConvolveHoriz(src - (srcStride * ((SubpelTaps / 2) - 1)), srcStride, temp, 64, filter, x0Q4, xStepQ4, w,
+                intermediateHeight);
+            ConvolveVert(temp + (64 * ((SubpelTaps / 2) - 1)), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
         }
 
         public static unsafe void Convolve8Avg(
@@ -489,11 +491,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             int w,
             int h)
         {
-            int x, y;
-
-            for (y = 0; y < h; ++y)
+            for (int y = 0; y < h; ++y)
             {
-                for (x = 0; x < w; ++x)
+                for (int x = 0; x < w; ++x)
                 {
                     dst[x] = (byte)BitUtils.RoundPowerOfTwo(dst[x] + src[x], 1);
                 }
@@ -611,18 +611,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             int h,
             int bd)
         {
-            int x, y;
-            src -= SubpelTaps / 2 - 1;
+            src -= (SubpelTaps / 2) - 1;
 
-            for (y = 0; y < h; ++y)
+            for (int y = 0; y < h; ++y)
             {
                 int xQ4 = x0Q4;
-                for (x = 0; x < w; ++x)
+                for (int x = 0; x < w; ++x)
                 {
                     ushort* srcX = &src[xQ4 >> SubpelBits];
                     ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
-                    int k, sum = 0;
-                    for (k = 0; k < SubpelTaps; ++k)
+                    int sum = 0;
+                    for (int k = 0; k < SubpelTaps; ++k)
                     {
                         sum += srcX[k] * xFilter[k];
                     }
@@ -630,6 +629,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
                     dst[x] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd);
                     xQ4 += xStepQ4;
                 }
+
                 src += srcStride;
                 dst += dstStride;
             }
@@ -647,25 +647,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             int h,
             int bd)
         {
-            int x, y;
-            src -= SubpelTaps / 2 - 1;
+            src -= (SubpelTaps / 2) - 1;
 
-            for (y = 0; y < h; ++y)
+            for (int y = 0; y < h; ++y)
             {
                 int xQ4 = x0Q4;
-                for (x = 0; x < w; ++x)
+                for (int x = 0; x < w; ++x)
                 {
                     ushort* srcX = &src[xQ4 >> SubpelBits];
                     ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
-                    int k, sum = 0;
-                    for (k = 0; k < SubpelTaps; ++k)
+                    int sum = 0;
+                    for (int k = 0; k < SubpelTaps; ++k)
                     {
                         sum += srcX[k] * xFilter[k];
                     }
 
-                    dst[x] = (ushort)BitUtils.RoundPowerOfTwo(dst[x] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), 1);
+                    dst[x] = (ushort)BitUtils.RoundPowerOfTwo(
+                        dst[x] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), 1);
                     xQ4 += xStepQ4;
                 }
+
                 src += srcStride;
                 dst += dstStride;
             }
@@ -683,18 +684,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             int h,
             int bd)
         {
-            int x, y;
-            src -= srcStride * (SubpelTaps / 2 - 1);
+            src -= srcStride * ((SubpelTaps / 2) - 1);
 
-            for (x = 0; x < w; ++x)
+            for (int x = 0; x < w; ++x)
             {
                 int yQ4 = y0Q4;
-                for (y = 0; y < h; ++y)
+                for (int y = 0; y < h; ++y)
                 {
                     ushort* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
                     ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
-                    int k, sum = 0;
-                    for (k = 0; k < SubpelTaps; ++k)
+                    int sum = 0;
+                    for (int k = 0; k < SubpelTaps; ++k)
                     {
                         sum += srcY[k * srcStride] * yFilter[k];
                     }
@@ -702,6 +702,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
                     dst[y * dstStride] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd);
                     yQ4 += yStepQ4;
                 }
+
                 ++src;
                 ++dst;
             }
@@ -719,26 +720,27 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             int h,
             int bd)
         {
-            int x, y;
-            src -= srcStride * (SubpelTaps / 2 - 1);
+            src -= srcStride * ((SubpelTaps / 2) - 1);
 
-            for (x = 0; x < w; ++x)
+            for (int x = 0; x < w; ++x)
             {
                 int yQ4 = y0Q4;
-                for (y = 0; y < h; ++y)
+                for (int y = 0; y < h; ++y)
                 {
                     ushort* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
                     ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
-                    int k, sum = 0;
-                    for (k = 0; k < SubpelTaps; ++k)
+                    int sum = 0;
+                    for (int k = 0; k < SubpelTaps; ++k)
                     {
                         sum += srcY[k * srcStride] * yFilter[k];
                     }
 
                     dst[y * dstStride] = (ushort)BitUtils.RoundPowerOfTwo(
-                        dst[y * dstStride] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd), 1);
+                        dst[y * dstStride] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(sum, FilterBits), bd),
+                        1);
                     yQ4 += yStepQ4;
                 }
+
                 ++src;
                 ++dst;
             }
@@ -771,15 +773,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             // --Require an additional SubpelTaps rows for the 8-tap filter tails.
             // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
             ushort* temp = stackalloc ushort[64 * 135];
-            int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;
+            int intermediateHeight = ((((h - 1) * yStepQ4) + y0Q4) >> SubpelBits) + SubpelTaps;
 
             Debug.Assert(w <= 64);
             Debug.Assert(h <= 64);
             Debug.Assert(yStepQ4 <= 32);
             Debug.Assert(xStepQ4 <= 32);
 
-            HighbdConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, 64, filter, x0Q4, xStepQ4, w, intermediateHeight, bd);
-            HighbdConvolveVert(temp + 64 * (SubpelTaps / 2 - 1), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
+            HighbdConvolveHoriz(src - (srcStride * ((SubpelTaps / 2) - 1)), srcStride, temp, 64, filter, x0Q4, xStepQ4,
+                w, intermediateHeight, bd);
+            HighbdConvolveVert(temp + (64 * ((SubpelTaps / 2) - 1)), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h,
+                bd);
         }
 
         public static unsafe void HighbdConvolve8Horiz(
@@ -928,11 +932,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             int h,
             int bd)
         {
-            int x, y;
-
-            for (y = 0; y < h; ++y)
+            for (int y = 0; y < h; ++y)
             {
-                for (x = 0; x < w; ++x)
+                for (int x = 0; x < w; ++x)
                 {
                     dst[x] = (ushort)BitUtils.RoundPowerOfTwo(dst[x] + src[x], 1);
                 }
@@ -942,4 +944,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
         }
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs

@@ -9,4 +9,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
         public const int SubpelShifts = 1 << SubpelBits;
         public const int SubpelTaps = 8;
     }
-}
+}

+ 180 - 175
src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
 
 namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 {
@@ -6,22 +6,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
     {
         private static unsafe ref byte Dst(byte* dst, int stride, int x, int y)
         {
-            return ref dst[x + y * stride];
+            return ref dst[x + (y * stride)];
         }
 
         private static unsafe ref ushort Dst(ushort* dst, int stride, int x, int y)
         {
-            return ref dst[x + y * stride];
+            return ref dst[x + (y * stride)];
         }
 
         private static byte Avg3(byte a, byte b, byte c)
         {
-            return (byte)((a + 2 * b + c + 2) >> 2);
+            return (byte)((a + (2 * b) + c + 2) >> 2);
         }
 
         private static ushort Avg3(ushort a, ushort b, ushort c)
         {
-            return (ushort)((a + 2 * b + c + 2) >> 2);
+            return (ushort)((a + (2 * b) + c + 2) >> 2);
         }
 
         private static byte Avg2(byte a, byte b)
@@ -51,9 +51,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void D207Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int r, c;
             // First column
-            for (r = 0; r < bs - 1; ++r)
+            for (int r = 0; r < bs - 1; ++r)
             {
                 dst[r * stride] = Avg2(left[r], left[r + 1]);
             }
@@ -62,7 +61,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             dst++;
 
             // Second column
-            for (r = 0; r < bs - 2; ++r)
+            for (int r = 0; r < bs - 2; ++r)
             {
                 dst[r * stride] = Avg3(left[r], left[r + 1], left[r + 2]);
             }
@@ -72,16 +71,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             dst++;
 
             // Rest of last row
-            for (c = 0; c < bs - 2; ++c)
+            for (int c = 0; c < bs - 2; ++c)
             {
-                dst[(bs - 1) * stride + c] = left[bs - 1];
+                dst[((bs - 1) * stride) + c] = left[bs - 1];
             }
 
-            for (r = bs - 2; r >= 0; --r)
+            for (int r = bs - 2; r >= 0; --r)
             {
-                for (c = 0; c < bs - 2; ++c)
+                for (int c = 0; c < bs - 2; ++c)
                 {
-                    dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
+                    dst[(r * stride) + c] = dst[((r + 1) * stride) + c - 2];
                 }
             }
         }
@@ -103,19 +102,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void D63Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int r, c;
-            int size;
-            for (c = 0; c < bs; ++c)
+            for (int c = 0; c < bs; ++c)
             {
                 dst[c] = Avg2(above[c], above[c + 1]);
                 dst[stride + c] = Avg3(above[c], above[c + 1], above[c + 2]);
             }
-            for (r = 2, size = bs - 2; r < bs; r += 2, --size)
+
+            for (int r = 2, size = bs - 2; r < bs; r += 2, --size)
             {
-                MemoryUtil.Copy(dst + (r + 0) * stride, dst + (r >> 1), size);
-                MemoryUtil.Fill(dst + (r + 0) * stride + size, above[bs - 1], bs - size);
-                MemoryUtil.Copy(dst + (r + 1) * stride, dst + stride + (r >> 1), size);
-                MemoryUtil.Fill(dst + (r + 1) * stride + size, above[bs - 1], bs - size);
+                MemoryUtil.Copy(dst + ((r + 0) * stride), dst + (r >> 1), size);
+                MemoryUtil.Fill(dst + ((r + 0) * stride) + size, above[bs - 1], bs - size);
+                MemoryUtil.Copy(dst + ((r + 1) * stride), dst + stride + (r >> 1), size);
+                MemoryUtil.Fill(dst + ((r + 1) * stride) + size, above[bs - 1], bs - size);
             }
         }
 
@@ -138,15 +136,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
         {
             byte aboveRight = above[bs - 1];
             byte* dstRow0 = dst;
-            int x, size;
 
-            for (x = 0; x < bs - 1; ++x)
+            for (int x = 0; x < bs - 1; ++x)
             {
                 dst[x] = Avg3(above[x], above[x + 1], above[x + 2]);
             }
+
             dst[bs - 1] = aboveRight;
             dst += stride;
-            for (x = 1, size = bs - 2; x < bs; ++x, --size)
+            for (int x = 1, size = bs - 2; x < bs; ++x, --size)
             {
                 MemoryUtil.Copy(dst, dstRow0 + x, size);
                 MemoryUtil.Fill(dst + size, aboveRight, x + 1);
@@ -171,10 +169,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void D117Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int r, c;
-
             // First row
-            for (c = 0; c < bs; c++)
+            for (int c = 0; c < bs; c++)
             {
                 dst[c] = Avg2(above[c - 1], above[c]);
             }
@@ -183,7 +179,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
             // Second row
             dst[0] = Avg3(left[0], above[-1], above[0]);
-            for (c = 1; c < bs; c++)
+            for (int c = 1; c < bs; c++)
             {
                 dst[c] = Avg3(above[c - 2], above[c - 1], above[c]);
             }
@@ -192,17 +188,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
             // The rest of first col
             dst[0] = Avg3(above[-1], left[0], left[1]);
-            for (r = 3; r < bs; ++r)
+            for (int r = 3; r < bs; ++r)
             {
                 dst[(r - 2) * stride] = Avg3(left[r - 3], left[r - 2], left[r - 1]);
             }
 
             // The rest of the block
-            for (r = 2; r < bs; ++r)
+            for (int r = 2; r < bs; ++r)
             {
-                for (c = 1; c < bs; c++)
+                for (int c = 1; c < bs; c++)
                 {
-                    dst[c] = dst[-2 * stride + c - 1];
+                    dst[c] = dst[(-2 * stride) + c - 1];
                 }
 
                 dst += stride;
@@ -226,26 +222,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void D135Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int i;
             byte* border = stackalloc byte[32 + 32 - 1]; // outer border from bottom-left to top-right
 
             // Dst(dst, stride, bs, bs - 2)[0], i.e., border starting at bottom-left
-            for (i = 0; i < bs - 2; ++i)
+            for (int i = 0; i < bs - 2; ++i)
             {
                 border[i] = Avg3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]);
             }
+
             border[bs - 2] = Avg3(above[-1], left[0], left[1]);
             border[bs - 1] = Avg3(left[0], above[-1], above[0]);
             border[bs - 0] = Avg3(above[-1], above[0], above[1]);
             // dst[0][2, size), i.e., remaining top border ascending
-            for (i = 0; i < bs - 2; ++i)
+            for (int i = 0; i < bs - 2; ++i)
             {
                 border[bs + 1 + i] = Avg3(above[i], above[i + 1], above[i + 2]);
             }
 
-            for (i = 0; i < bs; ++i)
+            for (int i = 0; i < bs; ++i)
             {
-                MemoryUtil.Copy(dst + i * stride, border + bs - 1 - i, bs);
+                MemoryUtil.Copy(dst + (i * stride), border + bs - 1 - i, bs);
             }
         }
 
@@ -266,9 +262,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void D153Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int r, c;
             dst[0] = Avg2(above[-1], left[0]);
-            for (r = 1; r < bs; r++)
+            for (int r = 1; r < bs; r++)
             {
                 dst[r * stride] = Avg2(left[r - 1], left[r]);
             }
@@ -277,23 +272,23 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
             dst[0] = Avg3(left[0], above[-1], above[0]);
             dst[stride] = Avg3(above[-1], left[0], left[1]);
-            for (r = 2; r < bs; r++)
+            for (int r = 2; r < bs; r++)
             {
                 dst[r * stride] = Avg3(left[r - 2], left[r - 1], left[r]);
             }
 
             dst++;
 
-            for (c = 0; c < bs - 2; c++)
+            for (int c = 0; c < bs - 2; c++)
             {
                 dst[c] = Avg3(above[c - 1], above[c], above[c + 1]);
             }
 
             dst += stride;
 
-            for (r = 1; r < bs; ++r)
+            for (int r = 1; r < bs; ++r)
             {
-                for (c = 0; c < bs - 2; c++)
+                for (int c = 0; c < bs - 2; c++)
                 {
                     dst[c] = dst[-stride + c - 2];
                 }
@@ -324,9 +319,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void VPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int r;
-
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Copy(dst, above, bs);
                 dst += stride;
@@ -355,43 +348,40 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void HPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int r;
-
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Fill(dst, left[r], bs);
                 dst += stride;
             }
         }
 
-        public static unsafe void TMPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+        public static unsafe void TmPredictor4x4(byte* dst, int stride, byte* above, byte* left)
         {
-            TMPredictor(dst, stride, 4, above, left);
+            TmPredictor(dst, stride, 4, above, left);
         }
 
-        public static unsafe void TMPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+        public static unsafe void TmPredictor8x8(byte* dst, int stride, byte* above, byte* left)
         {
-            TMPredictor(dst, stride, 8, above, left);
+            TmPredictor(dst, stride, 8, above, left);
         }
 
-        public static unsafe void TMPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+        public static unsafe void TmPredictor16x16(byte* dst, int stride, byte* above, byte* left)
         {
-            TMPredictor(dst, stride, 16, above, left);
+            TmPredictor(dst, stride, 16, above, left);
         }
 
-        public static unsafe void TMPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+        public static unsafe void TmPredictor32x32(byte* dst, int stride, byte* above, byte* left)
         {
-            TMPredictor(dst, stride, 32, above, left);
+            TmPredictor(dst, stride, 32, above, left);
         }
 
-        private static unsafe void TMPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        private static unsafe void TmPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int r, c;
             int yTopLeft = above[-1];
 
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
-                for (c = 0; c < bs; c++)
+                for (int c = 0; c < bs; c++)
                 {
                     dst[c] = BitUtils.ClipPixel(left[r] + above[c] - yTopLeft);
                 }
@@ -422,9 +412,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void Dc128Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int r;
-
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Fill(dst, (byte)128, bs);
                 dst += stride;
@@ -453,16 +441,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void DcLeftPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int i, r, expectedDc, sum = 0;
+            int expectedDc, sum = 0;
 
-            for (i = 0; i < bs; i++)
+            for (int i = 0; i < bs; i++)
             {
                 sum += left[i];
             }
 
             expectedDc = (sum + (bs >> 1)) / bs;
 
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Fill(dst, (byte)expectedDc, bs);
                 dst += stride;
@@ -491,16 +479,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void DcTopPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int i, r, expectedDc, sum = 0;
+            int expectedDc, sum = 0;
 
-            for (i = 0; i < bs; i++)
+            for (int i = 0; i < bs; i++)
             {
                 sum += above[i];
             }
 
             expectedDc = (sum + (bs >> 1)) / bs;
 
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Fill(dst, (byte)expectedDc, bs);
                 dst += stride;
@@ -529,10 +517,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         private static unsafe void DcPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
         {
-            int i, r, expectedDc, sum = 0;
+            int expectedDc, sum = 0;
             int count = 2 * bs;
 
-            for (i = 0; i < bs; i++)
+            for (int i = 0; i < bs; i++)
             {
                 sum += above[i];
                 sum += left[i];
@@ -540,7 +528,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
             expectedDc = (sum + (count >> 1)) / count;
 
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Fill(dst, (byte)expectedDc, bs);
                 dst += stride;
@@ -555,10 +543,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             byte k = left[2];
             byte l = left[3];
 
-            MemoryUtil.Fill(dst + stride * 0, Avg3(h, I, j), 4);
-            MemoryUtil.Fill(dst + stride * 1, Avg3(I, j, k), 4);
-            MemoryUtil.Fill(dst + stride * 2, Avg3(j, k, l), 4);
-            MemoryUtil.Fill(dst + stride * 3, Avg3(k, l, l), 4);
+            MemoryUtil.Fill(dst + (stride * 0), Avg3(h, I, j), 4);
+            MemoryUtil.Fill(dst + (stride * 1), Avg3(I, j, k), 4);
+            MemoryUtil.Fill(dst + (stride * 2), Avg3(j, k, l), 4);
+            MemoryUtil.Fill(dst + (stride * 3), Avg3(k, l, l), 4);
         }
 
         public static unsafe void VePredictor4x4(byte* dst, int stride, byte* above, byte* left)
@@ -574,9 +562,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             dst[1] = Avg3(I, j, k);
             dst[2] = Avg3(j, k, l);
             dst[3] = Avg3(k, l, m);
-            MemoryUtil.Copy(dst + stride * 1, dst, 4);
-            MemoryUtil.Copy(dst + stride * 2, dst, 4);
-            MemoryUtil.Copy(dst + stride * 3, dst, 4);
+            MemoryUtil.Copy(dst + (stride * 1), dst, 4);
+            MemoryUtil.Copy(dst + (stride * 2), dst, 4);
+            MemoryUtil.Copy(dst + (stride * 3), dst, 4);
         }
 
         public static unsafe void D207Predictor4x4(byte* dst, int stride, byte* above, byte* left)
@@ -591,7 +579,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Dst(dst, stride, 1, 0) = Avg3(I, j, k);
             Dst(dst, stride, 3, 0) = Dst(dst, stride, 1, 1) = Avg3(j, k, l);
             Dst(dst, stride, 3, 1) = Dst(dst, stride, 1, 2) = Avg3(k, l, l);
-            Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 0, 3) = Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = l;
+            Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 0, 3) =
+                Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = l;
         }
 
         public static unsafe void D63Predictor4x4(byte* dst, int stride, byte* above, byte* left)
@@ -616,7 +605,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Dst(dst, stride, 3, 3) = Avg3(e, f, g); // Differs from vp8
         }
 
-        public static unsafe void D63ePredictor4x4(byte* dst, int stride, byte* above, byte* left)
+        public static unsafe void D63EPredictor4x4(byte* dst, int stride, byte* above, byte* left)
         {
             byte a = above[0];
             byte b = above[1];
@@ -652,13 +641,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Dst(dst, stride, 0, 0) = Avg3(a, b, c);
             Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 1) = Avg3(b, c, d);
             Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 2) = Avg3(c, d, e);
-            Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f);
+            Dst(dst, stride, 3, 0) =
+                Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f);
             Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 3) = Avg3(e, f, g);
             Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 3) = Avg3(f, g, h);
             Dst(dst, stride, 3, 3) = h; // differs from vp8
         }
 
-        public static unsafe void D45ePredictor4x4(byte* dst, int stride, byte* above, byte* left)
+        public static unsafe void D45EPredictor4x4(byte* dst, int stride, byte* above, byte* left)
         {
             byte a = above[0];
             byte b = above[1];
@@ -671,7 +661,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Dst(dst, stride, 0, 0) = Avg3(a, b, c);
             Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 1) = Avg3(b, c, d);
             Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 2) = Avg3(c, d, e);
-            Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f);
+            Dst(dst, stride, 3, 0) =
+                Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f);
             Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 3) = Avg3(e, f, g);
             Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 3) = Avg3(f, g, h);
             Dst(dst, stride, 3, 3) = Avg3(g, h, h);
@@ -714,7 +705,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Dst(dst, stride, 0, 3) = Avg3(j, k, l);
             Dst(dst, stride, 1, 3) = Dst(dst, stride, 0, 2) = Avg3(I, j, k);
             Dst(dst, stride, 2, 3) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 1) = Avg3(x, I, j);
-            Dst(dst, stride, 3, 3) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = Avg3(a, x, I);
+            Dst(dst, stride, 3, 3) =
+                Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = Avg3(a, x, I);
             Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 0) = Avg3(b, a, x);
             Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 0) = Avg3(c, b, a);
             Dst(dst, stride, 3, 0) = Avg3(d, c, b);
@@ -758,38 +750,39 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdD207Predictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdD207Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdD207Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int r, c;
-
             // First column.
-            for (r = 0; r < bs - 1; ++r)
+            for (int r = 0; r < bs - 1; ++r)
             {
                 dst[r * stride] = Avg2(left[r], left[r + 1]);
             }
+
             dst[(bs - 1) * stride] = left[bs - 1];
             dst++;
 
             // Second column.
-            for (r = 0; r < bs - 2; ++r)
+            for (int r = 0; r < bs - 2; ++r)
             {
                 dst[r * stride] = Avg3(left[r], left[r + 1], left[r + 2]);
             }
+
             dst[(bs - 2) * stride] = Avg3(left[bs - 2], left[bs - 1], left[bs - 1]);
             dst[(bs - 1) * stride] = left[bs - 1];
             dst++;
 
             // Rest of last row.
-            for (c = 0; c < bs - 2; ++c)
+            for (int c = 0; c < bs - 2; ++c)
             {
-                dst[(bs - 1) * stride + c] = left[bs - 1];
+                dst[((bs - 1) * stride) + c] = left[bs - 1];
             }
 
-            for (r = bs - 2; r >= 0; --r)
+            for (int r = bs - 2; r >= 0; --r)
             {
-                for (c = 0; c < bs - 2; ++c)
+                for (int c = 0; c < bs - 2; ++c)
                 {
-                    dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
+                    dst[(r * stride) + c] = dst[((r + 1) * stride) + c - 2];
                 }
             }
         }
@@ -809,21 +802,21 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdD63Predictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdD63Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdD63Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int r, c;
-            int size;
-            for (c = 0; c < bs; ++c)
+            for (int c = 0; c < bs; ++c)
             {
                 dst[c] = Avg2(above[c], above[c + 1]);
                 dst[stride + c] = Avg3(above[c], above[c + 1], above[c + 2]);
             }
-            for (r = 2, size = bs - 2; r < bs; r += 2, --size)
+
+            for (int r = 2, size = bs - 2; r < bs; r += 2, --size)
             {
-                MemoryUtil.Copy(dst + (r + 0) * stride, dst + (r >> 1), size);
-                MemoryUtil.Fill(dst + (r + 0) * stride + size, above[bs - 1], bs - size);
-                MemoryUtil.Copy(dst + (r + 1) * stride, dst + stride + (r >> 1), size);
-                MemoryUtil.Fill(dst + (r + 1) * stride + size, above[bs - 1], bs - size);
+                MemoryUtil.Copy(dst + ((r + 0) * stride), dst + (r >> 1), size);
+                MemoryUtil.Fill(dst + ((r + 0) * stride) + size, above[bs - 1], bs - size);
+                MemoryUtil.Copy(dst + ((r + 1) * stride), dst + stride + (r >> 1), size);
+                MemoryUtil.Fill(dst + ((r + 1) * stride) + size, above[bs - 1], bs - size);
             }
         }
 
@@ -842,19 +835,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdD45Predictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdD45Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdD45Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
             ushort aboveRight = above[bs - 1];
             ushort* dstRow0 = dst;
-            int x, size;
 
-            for (x = 0; x < bs - 1; ++x)
+            for (int x = 0; x < bs - 1; ++x)
             {
                 dst[x] = Avg3(above[x], above[x + 1], above[x + 2]);
             }
+
             dst[bs - 1] = aboveRight;
             dst += stride;
-            for (x = 1, size = bs - 2; x < bs; ++x, --size)
+            for (int x = 1, size = bs - 2; x < bs; ++x, --size)
             {
                 MemoryUtil.Copy(dst, dstRow0 + x, size);
                 MemoryUtil.Fill(dst + size, aboveRight, x + 1);
@@ -877,12 +871,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdD117Predictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdD117Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdD117Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int r, c;
-
             // First row
-            for (c = 0; c < bs; c++)
+            for (int c = 0; c < bs; c++)
             {
                 dst[c] = Avg2(above[c - 1], above[c]);
             }
@@ -891,7 +884,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
             // Second row
             dst[0] = Avg3(left[0], above[-1], above[0]);
-            for (c = 1; c < bs; c++)
+            for (int c = 1; c < bs; c++)
             {
                 dst[c] = Avg3(above[c - 2], above[c - 1], above[c]);
             }
@@ -900,17 +893,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
             // The rest of first col
             dst[0] = Avg3(above[-1], left[0], left[1]);
-            for (r = 3; r < bs; ++r)
+            for (int r = 3; r < bs; ++r)
             {
                 dst[(r - 2) * stride] = Avg3(left[r - 3], left[r - 2], left[r - 1]);
             }
 
             // The rest of the block
-            for (r = 2; r < bs; ++r)
+            for (int r = 2; r < bs; ++r)
             {
-                for (c = 1; c < bs; c++)
+                for (int c = 1; c < bs; c++)
                 {
-                    dst[c] = dst[-2 * stride + c - 1];
+                    dst[c] = dst[(-2 * stride) + c - 1];
                 }
 
                 dst += stride;
@@ -932,28 +925,29 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdD135Predictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdD135Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdD135Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int i;
             ushort* border = stackalloc ushort[32 + 32 - 1]; // Outer border from bottom-left to top-right
 
             // Dst(dst, stride, bs, bs - 2)[0], i.e., border starting at bottom-left
-            for (i = 0; i < bs - 2; ++i)
+            for (int i = 0; i < bs - 2; ++i)
             {
                 border[i] = Avg3(left[bs - 3 - i], left[bs - 2 - i], left[bs - 1 - i]);
             }
+
             border[bs - 2] = Avg3(above[-1], left[0], left[1]);
             border[bs - 1] = Avg3(left[0], above[-1], above[0]);
             border[bs - 0] = Avg3(above[-1], above[0], above[1]);
             // dst[0][2, size), i.e., remaining top border ascending
-            for (i = 0; i < bs - 2; ++i)
+            for (int i = 0; i < bs - 2; ++i)
             {
                 border[bs + 1 + i] = Avg3(above[i], above[i + 1], above[i + 2]);
             }
 
-            for (i = 0; i < bs; ++i)
+            for (int i = 0; i < bs; ++i)
             {
-                MemoryUtil.Copy(dst + i * stride, border + bs - 1 - i, bs);
+                MemoryUtil.Copy(dst + (i * stride), border + bs - 1 - i, bs);
             }
         }
 
@@ -972,11 +966,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdD153Predictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdD153Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdD153Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int r, c;
             dst[0] = Avg2(above[-1], left[0]);
-            for (r = 1; r < bs; r++)
+            for (int r = 1; r < bs; r++)
             {
                 dst[r * stride] = Avg2(left[r - 1], left[r]);
             }
@@ -985,23 +979,23 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
             dst[0] = Avg3(left[0], above[-1], above[0]);
             dst[stride] = Avg3(above[-1], left[0], left[1]);
-            for (r = 2; r < bs; r++)
+            for (int r = 2; r < bs; r++)
             {
                 dst[r * stride] = Avg3(left[r - 2], left[r - 1], left[r]);
             }
 
             dst++;
 
-            for (c = 0; c < bs - 2; c++)
+            for (int c = 0; c < bs - 2; c++)
             {
                 dst[c] = Avg3(above[c - 1], above[c], above[c + 1]);
             }
 
             dst += stride;
 
-            for (r = 1; r < bs; ++r)
+            for (int r = 1; r < bs; ++r)
             {
-                for (c = 0; c < bs - 2; c++)
+                for (int c = 0; c < bs - 2; c++)
                 {
                     dst[c] = dst[-stride + c - 2];
                 }
@@ -1030,10 +1024,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdVPredictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdVPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdVPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int r;
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Copy(dst, above, bs);
                 dst += stride;
@@ -1060,44 +1054,44 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdHPredictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdHPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdHPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int r;
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Fill(dst, left[r], bs);
                 dst += stride;
             }
         }
 
-        public static unsafe void HighbdTMPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        public static unsafe void HighbdTmPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
         {
-            HighbdTMPredictor(dst, stride, 4, above, left, bd);
+            HighbdTmPredictor(dst, stride, 4, above, left, bd);
         }
 
-        public static unsafe void HighbdTMPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        public static unsafe void HighbdTmPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
         {
-            HighbdTMPredictor(dst, stride, 8, above, left, bd);
+            HighbdTmPredictor(dst, stride, 8, above, left, bd);
         }
 
-        public static unsafe void HighbdTMPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        public static unsafe void HighbdTmPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
         {
-            HighbdTMPredictor(dst, stride, 16, above, left, bd);
+            HighbdTmPredictor(dst, stride, 16, above, left, bd);
         }
 
-        public static unsafe void HighbdTMPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        public static unsafe void HighbdTmPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
         {
-            HighbdTMPredictor(dst, stride, 32, above, left, bd);
+            HighbdTmPredictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdTMPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdTmPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int r, c;
             int yTopLeft = above[-1];
 
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
-                for (c = 0; c < bs; c++)
+                for (int c = 0; c < bs; c++)
                 {
                     dst[c] = BitUtils.ClipPixelHighbd(left[r] + above[c] - yTopLeft, bd);
                 }
@@ -1116,21 +1110,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdDc128Predictor(dst, stride, 8, above, left, bd);
         }
 
-        public static unsafe void HighbdDc128Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        public static unsafe void HighbdDc128Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left,
+            int bd)
         {
             HighbdDc128Predictor(dst, stride, 16, above, left, bd);
         }
 
-        public static unsafe void HighbdDc128Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        public static unsafe void HighbdDc128Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left,
+            int bd)
         {
             HighbdDc128Predictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdDc128Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdDc128Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int r;
-
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Fill(dst, (ushort)(128 << (bd - 8)), bs);
                 dst += stride;
@@ -1147,28 +1142,31 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdDcLeftPredictor(dst, stride, 8, above, left, bd);
         }
 
-        public static unsafe void HighbdDcLeftPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        public static unsafe void HighbdDcLeftPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left,
+            int bd)
         {
             HighbdDcLeftPredictor(dst, stride, 16, above, left, bd);
         }
 
-        public static unsafe void HighbdDcLeftPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        public static unsafe void HighbdDcLeftPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left,
+            int bd)
         {
             HighbdDcLeftPredictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdDcLeftPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdDcLeftPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int i, r, expectedDc, sum = 0;
+            int expectedDc, sum = 0;
 
-            for (i = 0; i < bs; i++)
+            for (int i = 0; i < bs; i++)
             {
                 sum += left[i];
             }
 
             expectedDc = (sum + (bs >> 1)) / bs;
 
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Fill(dst, (ushort)expectedDc, bs);
                 dst += stride;
@@ -1185,28 +1183,31 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdDcTopPredictor(dst, stride, 8, above, left, bd);
         }
 
-        public static unsafe void HighbdDcTopPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        public static unsafe void HighbdDcTopPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left,
+            int bd)
         {
             HighbdDcTopPredictor(dst, stride, 16, above, left, bd);
         }
 
-        public static unsafe void HighbdDcTopPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        public static unsafe void HighbdDcTopPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left,
+            int bd)
         {
             HighbdDcTopPredictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdDcTopPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdDcTopPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int i, r, expectedDc, sum = 0;
+            int expectedDc, sum = 0;
 
-            for (i = 0; i < bs; i++)
+            for (int i = 0; i < bs; i++)
             {
                 sum += above[i];
             }
 
             expectedDc = (sum + (bs >> 1)) / bs;
 
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Fill(dst, (ushort)expectedDc, bs);
                 dst += stride;
@@ -1233,12 +1234,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             HighbdDcPredictor(dst, stride, 32, above, left, bd);
         }
 
-        private static unsafe void HighbdDcPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        private static unsafe void HighbdDcPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left,
+            int bd)
         {
-            int i, r, expectedDc, sum = 0;
+            int expectedDc, sum = 0;
             int count = 2 * bs;
 
-            for (i = 0; i < bs; i++)
+            for (int i = 0; i < bs; i++)
             {
                 sum += above[i];
                 sum += left[i];
@@ -1246,7 +1248,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
             expectedDc = (sum + (count >> 1)) / count;
 
-            for (r = 0; r < bs; r++)
+            for (int r = 0; r < bs; r++)
             {
                 MemoryUtil.Fill(dst, (ushort)expectedDc, bs);
                 dst += stride;
@@ -1265,7 +1267,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Dst(dst, stride, 1, 0) = Avg3(I, j, k);
             Dst(dst, stride, 3, 0) = Dst(dst, stride, 1, 1) = Avg3(j, k, l);
             Dst(dst, stride, 3, 1) = Dst(dst, stride, 1, 2) = Avg3(k, l, l);
-            Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 0, 3) = Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = l;
+            Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 0, 3) =
+                Dst(dst, stride, 1, 3) = Dst(dst, stride, 2, 3) = Dst(dst, stride, 3, 3) = l;
         }
 
         public static unsafe void HighbdD63Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
@@ -1303,7 +1306,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Dst(dst, stride, 0, 0) = Avg3(a, b, c);
             Dst(dst, stride, 1, 0) = Dst(dst, stride, 0, 1) = Avg3(b, c, d);
             Dst(dst, stride, 2, 0) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 2) = Avg3(c, d, e);
-            Dst(dst, stride, 3, 0) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f);
+            Dst(dst, stride, 3, 0) =
+                Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 3) = Avg3(d, e, f);
             Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 3) = Avg3(e, f, g);
             Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 3) = Avg3(f, g, h);
             Dst(dst, stride, 3, 3) = h; // Differs from vp8
@@ -1346,7 +1350,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Dst(dst, stride, 0, 3) = Avg3(j, k, l);
             Dst(dst, stride, 1, 3) = Dst(dst, stride, 0, 2) = Avg3(I, j, k);
             Dst(dst, stride, 2, 3) = Dst(dst, stride, 1, 2) = Dst(dst, stride, 0, 1) = Avg3(x, I, j);
-            Dst(dst, stride, 3, 3) = Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = Avg3(a, x, I);
+            Dst(dst, stride, 3, 3) =
+                Dst(dst, stride, 2, 2) = Dst(dst, stride, 1, 1) = Dst(dst, stride, 0, 0) = Avg3(a, x, I);
             Dst(dst, stride, 3, 2) = Dst(dst, stride, 2, 1) = Dst(dst, stride, 1, 0) = Avg3(b, a, x);
             Dst(dst, stride, 3, 1) = Dst(dst, stride, 2, 0) = Avg3(c, b, a);
             Dst(dst, stride, 3, 0) = Avg3(d, c, b);
@@ -1376,4 +1381,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Dst(dst, stride, 1, 3) = Avg3(l, k, j);
         }
     }
-}
+}

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 232 - 245
src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs


+ 229 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterAuto.cs

@@ -0,0 +1,229 @@
+using Ryujinx.Common.Memory;
+using System;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+    internal class LoopFilterAuto // Picks, on every call, between the LoopFilterSse2 and LoopFilterScalar implementations.
+    {
+        public static void LpfHorizontal4( // Forwards to LoopFilterSse2.LpfHorizontal4 when Sse2.IsSupported, else to the scalar version.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfHorizontal4(s, pitch, blimit, limit, thresh);
+            }
+            else // Scalar fallback: only element 0 of each span is passed on.
+            {
+                LoopFilterScalar.LpfHorizontal4(s, pitch, blimit[0], limit[0], thresh[0]);
+            }
+        }
+
+        public static void LpfHorizontal4Dual( // Same dispatch as above, with two independent (blimit, limit, thresh) sets.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit0,
+            ReadOnlySpan<byte> limit0,
+            ReadOnlySpan<byte> thresh0,
+            ReadOnlySpan<byte> blimit1,
+            ReadOnlySpan<byte> limit1,
+            ReadOnlySpan<byte> thresh1)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfHorizontal4Dual(s, pitch, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
+            }
+            else // Scalar fallback: only element 0 of each of the six spans is passed on.
+            {
+                LoopFilterScalar.LpfHorizontal4Dual(s, pitch, blimit0[0], limit0[0], thresh0[0], blimit1[0], limit1[0],
+                    thresh1[0]);
+            }
+        }
+
+        public static void LpfHorizontal8( // Forwards to LoopFilterSse2.LpfHorizontal8 when Sse2.IsSupported, else to the scalar version.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfHorizontal8(s, pitch, blimit, limit, thresh);
+            }
+            else // Scalar fallback: only element 0 of each span is passed on.
+            {
+                LoopFilterScalar.LpfHorizontal8(s, pitch, blimit[0], limit[0], thresh[0]);
+            }
+        }
+
+        public static void LpfHorizontal8Dual( // Dual variant: two independent (blimit, limit, thresh) sets.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit0,
+            ReadOnlySpan<byte> limit0,
+            ReadOnlySpan<byte> thresh0,
+            ReadOnlySpan<byte> blimit1,
+            ReadOnlySpan<byte> limit1,
+            ReadOnlySpan<byte> thresh1)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfHorizontal8Dual(s, pitch, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
+            }
+            else // Scalar fallback: only element 0 of each of the six spans is passed on.
+            {
+                LoopFilterScalar.LpfHorizontal8Dual(s, pitch, blimit0[0], limit0[0], thresh0[0], blimit1[0], limit1[0],
+                    thresh1[0]);
+            }
+        }
+
+        public static void LpfHorizontal16( // Forwards to LoopFilterSse2.LpfHorizontal16 when Sse2.IsSupported, else to the scalar version.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfHorizontal16(s, pitch, blimit, limit, thresh);
+            }
+            else // Scalar fallback: only element 0 of each span is passed on.
+            {
+                LoopFilterScalar.LpfHorizontal16(s, pitch, blimit[0], limit[0], thresh[0]);
+            }
+        }
+
+        public static void LpfHorizontal16Dual( // Unlike the 4/8 dual variants, this one takes a single (blimit, limit, thresh) set.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfHorizontal16Dual(s, pitch, blimit, limit, thresh);
+            }
+            else // Scalar fallback: only element 0 of each span is passed on.
+            {
+                LoopFilterScalar.LpfHorizontal16Dual(s, pitch, blimit[0], limit[0], thresh[0]);
+            }
+        }
+
+        public static void LpfVertical4( // Forwards to LoopFilterSse2.LpfVertical4 when Sse2.IsSupported, else to the scalar version.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfVertical4(s, pitch, blimit, limit, thresh);
+            }
+            else // Scalar fallback: only element 0 of each span is passed on.
+            {
+                LoopFilterScalar.LpfVertical4(s, pitch, blimit[0], limit[0], thresh[0]);
+            }
+        }
+
+        public static void LpfVertical4Dual( // Dual variant: two independent (blimit, limit, thresh) sets.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit0,
+            ReadOnlySpan<byte> limit0,
+            ReadOnlySpan<byte> thresh0,
+            ReadOnlySpan<byte> blimit1,
+            ReadOnlySpan<byte> limit1,
+            ReadOnlySpan<byte> thresh1)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfVertical4Dual(s, pitch, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
+            }
+            else // Scalar fallback: only element 0 of each of the six spans is passed on.
+            {
+                LoopFilterScalar.LpfVertical4Dual(s, pitch, blimit0[0], limit0[0], thresh0[0], blimit1[0], limit1[0],
+                    thresh1[0]);
+            }
+        }
+
+        public static void LpfVertical8( // Forwards to LoopFilterSse2.LpfVertical8 when Sse2.IsSupported, else to the scalar version.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfVertical8(s, pitch, blimit, limit, thresh);
+            }
+            else // Scalar fallback: only element 0 of each span is passed on.
+            {
+                LoopFilterScalar.LpfVertical8(s, pitch, blimit[0], limit[0], thresh[0]);
+            }
+        }
+
+        public static void LpfVertical8Dual( // Dual variant: two independent (blimit, limit, thresh) sets.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit0,
+            ReadOnlySpan<byte> limit0,
+            ReadOnlySpan<byte> thresh0,
+            ReadOnlySpan<byte> blimit1,
+            ReadOnlySpan<byte> limit1,
+            ReadOnlySpan<byte> thresh1)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfVertical8Dual(s, pitch, blimit0, limit0, thresh0, blimit1, limit1, thresh1);
+            }
+            else // Scalar fallback: only element 0 of each of the six spans is passed on.
+            {
+                LoopFilterScalar.LpfVertical8Dual(s, pitch, blimit0[0], limit0[0], thresh0[0], blimit1[0], limit1[0],
+                    thresh1[0]);
+            }
+        }
+
+        public static void LpfVertical16( // Forwards to LoopFilterSse2.LpfVertical16 when Sse2.IsSupported, else to the scalar version.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfVertical16(s, pitch, blimit, limit, thresh);
+            }
+            else // Scalar fallback: only element 0 of each span is passed on.
+            {
+                LoopFilterScalar.LpfVertical16(s, pitch, blimit[0], limit[0], thresh[0]);
+            }
+        }
+
+        public static void LpfVertical16Dual( // Unlike the 4/8 dual variants, this one takes a single (blimit, limit, thresh) set.
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            if (Sse2.IsSupported)
+            {
+                LoopFilterSse2.LpfVertical16Dual(s, pitch, blimit, limit, thresh);
+            }
+            else // Scalar fallback: only element 0 of each span is passed on.
+            {
+                LoopFilterScalar.LpfVertical16Dual(s, pitch, blimit[0], limit[0], thresh[0]);
+            }
+        }
+    }
+}

+ 1093 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterScalar.cs

@@ -0,0 +1,1093 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using System;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+    internal static class LoopFilterScalar
+    {
+        private static sbyte ClampSbyte(int t) // Saturates t to [-128, 127], then narrows it to sbyte.
+        {
+            return (sbyte)Math.Clamp(t, -128, 127);
+        }
+
+        private static short ClampSbyteHigh(int t, int bd) // Saturates t to a signed range selected by bd, returned as short.
+        {
+            return bd switch
+            {
+                10 => (short)Math.Clamp(t, -128 * 4, (128 * 4) - 1), // bd == 10: [-512, 511]
+                12 => (short)Math.Clamp(t, -128 * 16, (128 * 16) - 1), // bd == 12: [-2048, 2047]
+                _ => (short)Math.Clamp(t, -128, 128 - 1) // Any other bd (presumably 8 - confirm with callers): [-128, 127]
+            };
+        }
+
+        // Should we apply any filter at all: 11111111 yes, 00000000 no
+        private static sbyte FilterMask( // Returns -1 only when every check below passes; note limit comes before blimit here.
+            byte limit,
+            byte blimit,
+            byte p3,
+            byte p2,
+            byte p1,
+            byte p0,
+            byte q0,
+            byte q1,
+            byte q2,
+            byte q3)
+        {
+            int mask = 0; // Becomes -1 (all bits set) as soon as any comparison below exceeds its bound.
+            mask |= Math.Abs(p3 - p2) > limit ? -1 : 0; // Adjacent-sample differences on the p side...
+            mask |= Math.Abs(p2 - p1) > limit ? -1 : 0;
+            mask |= Math.Abs(p1 - p0) > limit ? -1 : 0;
+            mask |= Math.Abs(q1 - q0) > limit ? -1 : 0; // ...and on the q side, each compared against limit.
+            mask |= Math.Abs(q2 - q1) > limit ? -1 : 0;
+            mask |= Math.Abs(q3 - q2) > limit ? -1 : 0;
+            mask |= (Math.Abs(p0 - q0) * 2) + (Math.Abs(p1 - q1) / 2) > blimit ? -1 : 0; // Step across p0/q0, compared against blimit.
+            return (sbyte)~mask; // Invert: -1 when nothing exceeded its bound, 0 otherwise.
+        }
+
+        private static sbyte FlatMask4( // Returns -1 when p1..p3 are all within thresh of p0 and q1..q3 within thresh of q0, else 0.
+            byte thresh,
+            byte p3,
+            byte p2,
+            byte p1,
+            byte p0,
+            byte q0,
+            byte q1,
+            byte q2,
+            byte q3)
+        {
+            int mask = 0; // Becomes -1 as soon as any difference below exceeds thresh.
+            mask |= Math.Abs(p1 - p0) > thresh ? -1 : 0;
+            mask |= Math.Abs(q1 - q0) > thresh ? -1 : 0;
+            mask |= Math.Abs(p2 - p0) > thresh ? -1 : 0;
+            mask |= Math.Abs(q2 - q0) > thresh ? -1 : 0;
+            mask |= Math.Abs(p3 - p0) > thresh ? -1 : 0;
+            mask |= Math.Abs(q3 - q0) > thresh ? -1 : 0;
+            return (sbyte)~mask; // Invert: -1 when nothing exceeded thresh, 0 otherwise.
+        }
+
+        private static sbyte FlatMask5( // FlatMask4 extended with one more sample per side (p4 against p0, q4 against q0).
+            byte thresh,
+            byte p4,
+            byte p3,
+            byte p2,
+            byte p1,
+            byte p0,
+            byte q0,
+            byte q1,
+            byte q2,
+            byte q3,
+            byte q4)
+        {
+            int mask = ~FlatMask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); // Undo FlatMask4's final inversion: -1 here means a check failed.
+            mask |= Math.Abs(p4 - p0) > thresh ? -1 : 0;
+            mask |= Math.Abs(q4 - q0) > thresh ? -1 : 0;
+            return (sbyte)~mask; // Invert again: -1 when all checks passed, 0 otherwise.
+        }
+
+        // Is there high edge variance internal edge: 11111111 yes, 00000000 no
+        private static sbyte HevMask( // Returns -1 when |p1 - p0| or |q1 - q0| exceeds thresh, else 0.
+            byte thresh,
+            byte p1,
+            byte p0,
+            byte q0,
+            byte q1)
+        {
+            int hev = 0;
+            hev |= Math.Abs(p1 - p0) > thresh ? -1 : 0;
+            hev |= Math.Abs(q1 - q0) > thresh ? -1 : 0;
+            return (sbyte)hev; // Not inverted, unlike the filter/flat masks.
+        }
+
+        private static void Filter4( // Adjusts the four samples op1, op0, oq0, oq1 in place.
+            sbyte mask, // ANDed into the filter value below, so a mask of 0 zeroes it.
+            byte thresh, // Only used to compute the high-edge-variance mask.
+            ref byte op1,
+            ref byte op0,
+            ref byte oq0,
+            ref byte oq1)
+        {
+            sbyte filter1, filter2;
+
+            sbyte ps1 = (sbyte)(op1 ^ 0x80); // XOR with 0x80 re-biases an unsigned 0..255 sample to signed -128..127.
+            sbyte ps0 = (sbyte)(op0 ^ 0x80);
+            sbyte qs0 = (sbyte)(oq0 ^ 0x80);
+            sbyte qs1 = (sbyte)(oq1 ^ 0x80);
+            sbyte hev = HevMask(thresh, op1, op0, oq0, oq1); // -1 or 0, used as a bit mask below.
+
+            // add outer taps if we have high edge variance
+            sbyte filter = (sbyte)(ClampSbyte(ps1 - qs1) & hev);
+
+            // inner taps
+            filter = (sbyte)(ClampSbyte(filter + (3 * (qs0 - ps0))) & mask);
+
+            // save bottom 3 bits so that we round one side +4 and the other +3
+            // if it equals 4 we'll set it to adjust by -1 to account for the fact
+            // we'd round it by 3 the other way
+            filter1 = (sbyte)(ClampSbyte(filter + 4) >> 3);
+            filter2 = (sbyte)(ClampSbyte(filter + 3) >> 3);
+
+            oq0 = (byte)(ClampSbyte(qs0 - filter1) ^ 0x80); // XOR with 0x80 again converts back to the unsigned sample.
+            op0 = (byte)(ClampSbyte(ps0 + filter2) ^ 0x80);
+
+            // outer tap adjustments
+            filter = (sbyte)(BitUtils.RoundPowerOfTwo(filter1, 1) & ~hev); // & ~hev zeroes the adjustment when hev is set.
+
+            oq1 = (byte)(ClampSbyte(qs1 - filter) ^ 0x80);
+            op1 = (byte)(ClampSbyte(ps1 + filter) ^ 0x80);
+        }
+
+        public static void LpfHorizontal4( // Runs Filter4 at 8 consecutive positions starting at s; samples are pitch elements apart.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh)
+        {
+            // loop filter designed to work using chars so that we can make maximum use
+            // of 8 bit simd instructions.
+            for (int i = 0; i < 8; ++i)
+            {
+                byte p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], p0 = s[-pitch]; // p samples: negative offsets from s.
+                byte q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], q3 = s[3 * pitch]; // q samples: s itself and positive offsets.
+                sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); // FilterMask takes limit before blimit.
+                Filter4(mask, thresh, ref s[-2 * pitch], ref s[-1 * pitch], ref s[0], ref s[1 * pitch]);
+                s = s.Slice(1); // Advance one element to the next position.
+            }
+        }
+
+        public static void LpfHorizontal4Dual( // Two LpfHorizontal4 passes: set 0 at s, set 1 starting 8 elements further on.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit0,
+            byte limit0,
+            byte thresh0,
+            byte blimit1,
+            byte limit1,
+            byte thresh1)
+        {
+            LpfHorizontal4(s, pitch, blimit0, limit0, thresh0);
+            LpfHorizontal4(s.Slice(8), pitch, blimit1, limit1, thresh1); // Offset 8 matches the 8 positions covered by the first call.
+        }
+
+        public static void LpfVertical4( // Runs Filter4 8 times on adjacent elements around s, stepping by pitch between runs.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh)
+        {
+            // loop filter designed to work using chars so that we can make maximum use
+            // of 8 bit simd instructions.
+            for (int i = 0; i < 8; ++i)
+            {
+                byte p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; // p samples: the four elements before s.
+                byte q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; // q samples: s and the three elements after it.
+                sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); // FilterMask takes limit before blimit.
+                Filter4(mask, thresh, ref s[-2], ref s[-1], ref s[0], ref s[1]);
+                s = s.Slice(pitch); // Advance by pitch elements to the next position.
+            }
+        }
+
+        public static void LpfVertical4Dual( // Two LpfVertical4 passes: set 0 at s, set 1 starting 8 * pitch elements further on.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit0,
+            byte limit0,
+            byte thresh0,
+            byte blimit1,
+            byte limit1,
+            byte thresh1)
+        {
+            LpfVertical4(s, pitch, blimit0, limit0, thresh0);
+            LpfVertical4(s.Slice(8 * pitch), pitch, blimit1, limit1, thresh1); // Offset matches the 8 pitch-steps taken by the first call.
+        }
+
+        private static void Filter8( // Applies the 7-tap smoothing when flat and mask are both set; otherwise falls back to Filter4.
+            sbyte mask,
+            byte thresh,
+            bool flat,
+            ref byte op3, // Read only: never assigned in this method.
+            ref byte op2,
+            ref byte op1,
+            ref byte op0,
+            ref byte oq0,
+            ref byte oq1,
+            ref byte oq2,
+            ref byte oq3) // Read only: never assigned in this method.
+        {
+            if (flat && mask != 0)
+            {
+                byte p3 = op3, p2 = op2, p1 = op1, p0 = op0; // Snapshot the inputs: the refs are overwritten below while still needed.
+                byte q0 = oq0, q1 = oq1, q2 = oq2, q3 = oq3;
+
+                // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+                op2 = (byte)BitUtils.RoundPowerOfTwo(p3 + p3 + p3 + (2 * p2) + p1 + p0 + q0, 3); // p3 is repeated where the window would extend past it.
+                op1 = (byte)BitUtils.RoundPowerOfTwo(p3 + p3 + p2 + (2 * p1) + p0 + q0 + q1, 3);
+                op0 = (byte)BitUtils.RoundPowerOfTwo(p3 + p2 + p1 + (2 * p0) + q0 + q1 + q2, 3);
+                oq0 = (byte)BitUtils.RoundPowerOfTwo(p2 + p1 + p0 + (2 * q0) + q1 + q2 + q3, 3);
+                oq1 = (byte)BitUtils.RoundPowerOfTwo(p1 + p0 + q0 + (2 * q1) + q2 + q3 + q3, 3);
+                oq2 = (byte)BitUtils.RoundPowerOfTwo(p0 + q0 + q1 + (2 * q2) + q3 + q3 + q3, 3); // q3 is repeated likewise on the other side.
+            }
+            else
+            {
+                Filter4(mask, thresh, ref op1, ref op0, ref oq0, ref oq1); // Only the four inner samples can change on this path.
+            }
+        }
+
+        public static void LpfHorizontal8( // Runs Filter8 at 8 consecutive positions starting at s; samples are pitch elements apart.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh)
+        {
+            // loop filter designed to work using chars so that we can make maximum use
+            // of 8 bit simd instructions.
+            for (int i = 0; i < 8; ++i)
+            {
+                byte p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], p0 = s[-pitch];
+                byte q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], q3 = s[3 * pitch];
+
+                sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); // FilterMask takes limit before blimit.
+                sbyte flat = FlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3); // Flatness uses a fixed threshold of 1, not thresh.
+                Filter8(
+                    mask,
+                    thresh,
+                    flat != 0,
+                    ref s[-4 * pitch],
+                    ref s[-3 * pitch],
+                    ref s[-2 * pitch],
+                    ref s[-1 * pitch],
+                    ref s[0],
+                    ref s[1 * pitch],
+                    ref s[2 * pitch],
+                    ref s[3 * pitch]);
+                s = s.Slice(1); // Advance one element to the next position.
+            }
+        }
+
+        public static void LpfHorizontal8Dual( // Two LpfHorizontal8 passes: set 0 at s, set 1 starting 8 elements further on.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit0,
+            byte limit0,
+            byte thresh0,
+            byte blimit1,
+            byte limit1,
+            byte thresh1)
+        {
+            LpfHorizontal8(s, pitch, blimit0, limit0, thresh0);
+            LpfHorizontal8(s.Slice(8), pitch, blimit1, limit1, thresh1); // Offset 8 matches the 8 positions covered by the first call.
+        }
+
+        public static void LpfVertical8( // Runs Filter8 8 times on adjacent elements around s, stepping by pitch between runs.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh)
+        {
+            for (int i = 0; i < 8; ++i)
+            {
+                byte p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+                byte q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+                sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); // FilterMask takes limit before blimit.
+                sbyte flat = FlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3); // Flatness uses a fixed threshold of 1, not thresh.
+                Filter8(
+                    mask,
+                    thresh,
+                    flat != 0,
+                    ref s[-4],
+                    ref s[-3],
+                    ref s[-2],
+                    ref s[-1],
+                    ref s[0],
+                    ref s[1],
+                    ref s[2],
+                    ref s[3]);
+                s = s.Slice(pitch); // Advance by pitch elements to the next position.
+            }
+        }
+
+        public static void LpfVertical8Dual( // Two LpfVertical8 passes: set 0 at s, set 1 starting 8 * pitch elements further on.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit0,
+            byte limit0,
+            byte thresh0,
+            byte blimit1,
+            byte limit1,
+            byte thresh1)
+        {
+            LpfVertical8(s, pitch, blimit0, limit0, thresh0);
+            LpfVertical8(s.Slice(8 * pitch), pitch, blimit1, limit1, thresh1); // Offset matches the 8 pitch-steps taken by the first call.
+        }
+
+        private static void Filter16( // Applies the 15-tap smoothing when flat2, flat and mask are all set; otherwise defers to Filter8.
+            sbyte mask,
+            byte thresh,
+            bool flat,
+            bool flat2,
+            ref byte op7, // Read only: never assigned in this method.
+            ref byte op6,
+            ref byte op5,
+            ref byte op4,
+            ref byte op3,
+            ref byte op2,
+            ref byte op1,
+            ref byte op0,
+            ref byte oq0,
+            ref byte oq1,
+            ref byte oq2,
+            ref byte oq3,
+            ref byte oq4,
+            ref byte oq5,
+            ref byte oq6,
+            ref byte oq7) // Read only: never assigned in this method.
+        {
+            if (flat2 && flat && mask != 0)
+            {
+                byte p7 = op7, p6 = op6, p5 = op5, p4 = op4, p3 = op3, p2 = op2, p1 = op1, p0 = op0; // Snapshot the inputs: the refs are overwritten below while still needed.
+                byte q0 = oq0, q1 = oq1, q2 = oq2, q3 = oq3, q4 = oq4, q5 = oq5, q6 = oq6, q7 = oq7;
+
+                // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+                op6 = (byte)BitUtils.RoundPowerOfTwo( // p7's weight stands in for the taps that would fall beyond p7.
+                    (p7 * 7) + (p6 * 2) + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
+                op5 = (byte)BitUtils.RoundPowerOfTwo(
+                    (p7 * 6) + p6 + (p5 * 2) + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
+                op4 = (byte)BitUtils.RoundPowerOfTwo(
+                    (p7 * 5) + p6 + p5 + (p4 * 2) + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
+                op3 = (byte)BitUtils.RoundPowerOfTwo(
+                    (p7 * 4) + p6 + p5 + p4 + (p3 * 2) + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
+                op2 = (byte)BitUtils.RoundPowerOfTwo(
+                    (p7 * 3) + p6 + p5 + p4 + p3 + (p2 * 2) + p1 + p0 + q0 + q1 + q2 + q3 + q4, 4);
+                op1 = (byte)BitUtils.RoundPowerOfTwo(
+                    (p7 * 2) + p6 + p5 + p4 + p3 + p2 + (p1 * 2) + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4);
+                op0 = (byte)BitUtils.RoundPowerOfTwo(
+                    p7 + p6 + p5 + p4 + p3 + p2 + p1 + (p0 * 2) + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+                oq0 = (byte)BitUtils.RoundPowerOfTwo(
+                    p6 + p5 + p4 + p3 + p2 + p1 + p0 + (q0 * 2) + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+                oq1 = (byte)BitUtils.RoundPowerOfTwo(
+                    p5 + p4 + p3 + p2 + p1 + p0 + q0 + (q1 * 2) + q2 + q3 + q4 + q5 + q6 + (q7 * 2), 4);
+                oq2 = (byte)BitUtils.RoundPowerOfTwo(
+                    p4 + p3 + p2 + p1 + p0 + q0 + q1 + (q2 * 2) + q3 + q4 + q5 + q6 + (q7 * 3), 4);
+                oq3 = (byte)BitUtils.RoundPowerOfTwo(
+                    p3 + p2 + p1 + p0 + q0 + q1 + q2 + (q3 * 2) + q4 + q5 + q6 + (q7 * 4), 4);
+                oq4 = (byte)BitUtils.RoundPowerOfTwo(
+                    p2 + p1 + p0 + q0 + q1 + q2 + q3 + (q4 * 2) + q5 + q6 + (q7 * 5), 4);
+                oq5 = (byte)BitUtils.RoundPowerOfTwo(
+                    p1 + p0 + q0 + q1 + q2 + q3 + q4 + (q5 * 2) + q6 + (q7 * 6), 4);
+                oq6 = (byte)BitUtils.RoundPowerOfTwo( // q7's weight mirrors p7's on the other side.
+                    p0 + q0 + q1 + q2 + q3 + q4 + q5 + (q6 * 2) + (q7 * 7), 4);
+            }
+            else
+            {
+                Filter8(mask, thresh, flat, ref op3, ref op2, ref op1, ref op0, ref oq0, ref oq1, ref oq2, ref oq3); // Only the eight inner samples are passed on.
+            }
+        }
+
+        private static void MbLpfHorizontalEdgeW( // Runs Filter16 at 8 * count consecutive positions; samples are pitch elements apart.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int count) // Number of 8-position groups to process.
+        {
+            // loop filter designed to work using chars so that we can make maximum use
+            // of 8 bit simd instructions.
+            for (int i = 0; i < 8 * count; ++i)
+            {
+                byte p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], p0 = s[-pitch];
+                byte q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], q3 = s[3 * pitch];
+                sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); // FilterMask takes limit before blimit.
+                sbyte flat = FlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3); // Inner flatness, fixed threshold of 1.
+                sbyte flat2 = FlatMask5( // Outer flatness: samples at offsets -8..-5 against p0 and 4..7 against q0.
+                    1,
+                    s[-8 * pitch],
+                    s[-7 * pitch],
+                    s[-6 * pitch],
+                    s[-5 * pitch],
+                    p0,
+                    q0,
+                    s[4 * pitch],
+                    s[5 * pitch],
+                    s[6 * pitch],
+                    s[7 * pitch]);
+
+                Filter16(
+                    mask,
+                    thresh,
+                    flat != 0,
+                    flat2 != 0,
+                    ref s[-8 * pitch],
+                    ref s[-7 * pitch],
+                    ref s[-6 * pitch],
+                    ref s[-5 * pitch],
+                    ref s[-4 * pitch],
+                    ref s[-3 * pitch],
+                    ref s[-2 * pitch],
+                    ref s[-1 * pitch],
+                    ref s[0],
+                    ref s[1 * pitch],
+                    ref s[2 * pitch],
+                    ref s[3 * pitch],
+                    ref s[4 * pitch],
+                    ref s[5 * pitch],
+                    ref s[6 * pitch],
+                    ref s[7 * pitch]);
+                s = s.Slice(1); // Advance one element to the next position.
+            }
+        }
+
+        public static void LpfHorizontal16( // MbLpfHorizontalEdgeW with count 1, i.e. 8 positions.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh)
+        {
+            MbLpfHorizontalEdgeW(s, pitch, blimit, limit, thresh, 1);
+        }
+
+        public static void LpfHorizontal16Dual( // MbLpfHorizontalEdgeW with count 2, i.e. 16 positions with one shared threshold set.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh)
+        {
+            MbLpfHorizontalEdgeW(s, pitch, blimit, limit, thresh, 2);
+        }
+
+        private static void MbLpfVerticalEdgeW( // Runs Filter16 count times on adjacent elements around s, stepping by pitch between runs.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int count) // Exact iteration count (not multiplied by 8 as in MbLpfHorizontalEdgeW).
+        {
+            for (int i = 0; i < count; ++i)
+            {
+                byte p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+                byte q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+                sbyte mask = FilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3); // FilterMask takes limit before blimit.
+                sbyte flat = FlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3); // Inner flatness, fixed threshold of 1.
+                sbyte flat2 = FlatMask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7]); // Outer flatness against p0/q0.
+
+                Filter16(
+                    mask,
+                    thresh,
+                    flat != 0,
+                    flat2 != 0,
+                    ref s[-8],
+                    ref s[-7],
+                    ref s[-6],
+                    ref s[-5],
+                    ref s[-4],
+                    ref s[-3],
+                    ref s[-2],
+                    ref s[-1],
+                    ref s[0],
+                    ref s[1],
+                    ref s[2],
+                    ref s[3],
+                    ref s[4],
+                    ref s[5],
+                    ref s[6],
+                    ref s[7]);
+                s = s.Slice(pitch); // Advance by pitch elements to the next position.
+            }
+        }
+
+        public static void LpfVertical16( // MbLpfVerticalEdgeW with 8 iterations.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh)
+        {
+            MbLpfVerticalEdgeW(s, pitch, blimit, limit, thresh, 8);
+        }
+
+        public static void LpfVertical16Dual( // MbLpfVerticalEdgeW with 16 iterations and one shared threshold set.
+            ArrayPtr<byte> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh)
+        {
+            MbLpfVerticalEdgeW(s, pitch, blimit, limit, thresh, 16);
+        }
+
+        // Should we apply any filter at all: 11111111 yes, 00000000 no ?
+        private static sbyte HighBdFilterMask( // Same checks as FilterMask on ushort samples, with the limits shifted left by (bd - 8).
+            byte limit,
+            byte blimit,
+            ushort p3,
+            ushort p2,
+            ushort p1,
+            ushort p0,
+            ushort q0,
+            ushort q1,
+            ushort q2,
+            ushort q3,
+            int bd) // Presumably the sample bit depth (8, 10 or 12) - confirm with callers; bd below 8 would give a negative shift.
+        {
+            int mask = 0; // Becomes -1 (all bits set) as soon as any comparison below exceeds its bound.
+            short limit16 = (short)(limit << (bd - 8)); // Scale the byte-sized limits up by (bd - 8) bits.
+            short blimit16 = (short)(blimit << (bd - 8));
+            mask |= Math.Abs(p3 - p2) > limit16 ? -1 : 0;
+            mask |= Math.Abs(p2 - p1) > limit16 ? -1 : 0;
+            mask |= Math.Abs(p1 - p0) > limit16 ? -1 : 0;
+            mask |= Math.Abs(q1 - q0) > limit16 ? -1 : 0;
+            mask |= Math.Abs(q2 - q1) > limit16 ? -1 : 0;
+            mask |= Math.Abs(q3 - q2) > limit16 ? -1 : 0;
+            mask |= (Math.Abs(p0 - q0) * 2) + (Math.Abs(p1 - q1) / 2) > blimit16 ? -1 : 0; // Step across p0/q0, compared against blimit16.
+            return (sbyte)~mask; // Invert: -1 when nothing exceeded its bound, 0 otherwise.
+        }
+
+        /// <summary>
+        /// Tests whether the three outer samples on each side stay close to the sample nearest the edge
+        /// (p1..p3 against p0, q1..q3 against q0).
+        /// <paramref name="thresh"/> is shifted left by (<paramref name="bd"/> - 8) before the comparison.
+        /// </summary>
+        /// <returns>-1 (all bits set) when every difference is within the threshold, 0 otherwise</returns>
+        private static sbyte HighBdFlatMask4(
+            byte thresh,
+            ushort p3,
+            ushort p2,
+            ushort p1,
+            ushort p0,
+            ushort q0,
+            ushort q1,
+            ushort q2,
+            ushort q3,
+            int bd)
+        {
+            short scaledThresh = (short)(thresh << (bd - 8));
+
+            bool notFlat =
+                Math.Abs(p1 - p0) > scaledThresh ||
+                Math.Abs(q1 - q0) > scaledThresh ||
+                Math.Abs(p2 - p0) > scaledThresh ||
+                Math.Abs(q2 - q0) > scaledThresh ||
+                Math.Abs(p3 - p0) > scaledThresh ||
+                Math.Abs(q3 - q0) > scaledThresh;
+
+            return notFlat ? (sbyte)0 : (sbyte)(-1);
+        }
+
+        /// <summary>
+        /// Extends <see cref="HighBdFlatMask4"/> by one more sample on each side: the result is flat only if
+        /// the inner check passes and p4/q4 are also within the scaled threshold of p0/q0.
+        /// </summary>
+        /// <returns>-1 (all bits set) when flat, 0 otherwise</returns>
+        private static sbyte HighBdFlatMask5(
+            byte thresh,
+            ushort p4,
+            ushort p3,
+            ushort p2,
+            ushort p1,
+            ushort p0,
+            ushort q0,
+            ushort q1,
+            ushort q2,
+            ushort q3,
+            ushort q4,
+            int bd)
+        {
+            // Start from the inverted inner result: non-zero bits here mean "not flat".
+            int notFlat = ~HighBdFlatMask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+            short scaledThresh = (short)(thresh << (bd - 8));
+
+            if (Math.Abs(p4 - p0) > scaledThresh || Math.Abs(q4 - q0) > scaledThresh)
+            {
+                notFlat = -1;
+            }
+
+            return (sbyte)~notFlat;
+        }
+
+        /// <summary>
+        /// Detects high edge variance: whether p1/p0 or q1/q0 differ by more than
+        /// <paramref name="thresh"/> shifted left by (<paramref name="bd"/> - 8).
+        /// </summary>
+        /// <returns>-1 (all 16 bits set) when the variance is high, 0 otherwise</returns>
+        private static short HighBdHevMask(
+            byte thresh,
+            ushort p1,
+            ushort p0,
+            ushort q0,
+            ushort q1,
+            int bd)
+        {
+            short scaledThresh = (short)(thresh << (bd - 8));
+            bool highVariance = Math.Abs(p1 - p0) > scaledThresh || Math.Abs(q1 - q0) > scaledThresh;
+
+            return highVariance ? (short)(-1) : (short)0;
+        }
+
+        /// <summary>
+        /// Narrow filter that adjusts the two samples on each side of an edge (p1, p0, q0, q1) in place.
+        /// </summary>
+        /// <param name="mask">Filter mask (0 or -1); it is ANDed into the computed filter value</param>
+        /// <param name="thresh">8-bit threshold forwarded to <see cref="HighBdHevMask"/></param>
+        /// <param name="op1">Second sample before the edge, updated in place</param>
+        /// <param name="op0">Sample immediately before the edge, updated in place</param>
+        /// <param name="oq0">Sample immediately after the edge, updated in place</param>
+        /// <param name="oq1">Second sample after the edge, updated in place</param>
+        /// <param name="bd">Used as (bd - 8) to scale the 0x80 offset; presumably the sample bit depth</param>
+        private static void HighBdFilter4(
+            sbyte mask,
+            byte thresh,
+            ref ushort op1,
+            ref ushort op0,
+            ref ushort oq0,
+            ref ushort oq1,
+            int bd)
+        {
+            short filter1, filter2;
+            // Subtract the mid-point (0x80 shifted by bd - 8) so the samples become signed values
+            // centred on zero; with bd == 8 this turns 0..255 into -128..+127.
+            int shift = bd - 8;
+            short ps1 = (short)((short)op1 - (0x80 << shift));
+            short ps0 = (short)((short)op0 - (0x80 << shift));
+            short qs0 = (short)((short)oq0 - (0x80 << shift));
+            short qs1 = (short)((short)oq1 - (0x80 << shift));
+            short hev = HighBdHevMask(thresh, op1, op0, oq0, oq1, bd);
+
+            // Add outer taps if we have high edge variance.
+            short filter = (short)(ClampSbyteHigh(ps1 - qs1, bd) & hev);
+
+            // Inner taps.
+            filter = (short)(ClampSbyteHigh(filter + (3 * (qs0 - ps0)), bd) & mask);
+
+            // Save bottom 3 bits so that we round one side +4 and the other +3
+            // if it equals 4 we'll set it to adjust by -1 to account for the fact
+            // we'd round it by 3 the other way.
+            filter1 = (short)(ClampSbyteHigh(filter + 4, bd) >> 3);
+            filter2 = (short)(ClampSbyteHigh(filter + 3, bd) >> 3);
+
+            // Apply the adjustment in opposite directions on the two sides, then add the offset back.
+            oq0 = (ushort)(ClampSbyteHigh(qs0 - filter1, bd) + (0x80 << shift));
+            op0 = (ushort)(ClampSbyteHigh(ps0 + filter2, bd) + (0x80 << shift));
+
+            // Outer tap adjustments: half of filter1 (rounded), zeroed when hev is set.
+            filter = (short)(BitUtils.RoundPowerOfTwo(filter1, 1) & ~hev);
+
+            oq1 = (ushort)(ClampSbyteHigh(qs1 - filter, bd) + (0x80 << shift));
+            op1 = (ushort)(ClampSbyteHigh(ps1 + filter, bd) + (0x80 << shift));
+        }
+
+        /// <summary>
+        /// Applies the narrow filter across a horizontal edge for 8 adjacent columns.
+        /// Samples at negative multiples of <paramref name="pitch"/> are before the edge,
+        /// samples at 0 and positive multiples are after it.
+        /// </summary>
+        public static void HighBdLpfHorizontal4(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int bd)
+        {
+            for (int col = 0; col < 8; col++)
+            {
+                // Read all eight taps up front; the filter below only rewrites the inner four.
+                ushort p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], p0 = s[-pitch];
+                ushort q0 = s[0], q1 = s[pitch], q2 = s[2 * pitch], q3 = s[3 * pitch];
+
+                sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+
+                HighBdFilter4(mask, thresh, ref s[-2 * pitch], ref s[-pitch], ref s[0], ref s[pitch], bd);
+
+                s = s.Slice(1);
+            }
+        }
+
+        /// <summary>
+        /// Runs <see cref="HighBdLpfHorizontal4"/> on two adjacent 8-column groups, each with its own limits:
+        /// the first at <paramref name="s"/>, the second 8 elements further on.
+        /// </summary>
+        public static void HighBdLpfHorizontal4Dual(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit0,
+            byte limit0,
+            byte thresh0,
+            byte blimit1,
+            byte limit1,
+            byte thresh1,
+            int bd)
+        {
+            HighBdLpfHorizontal4(s, pitch, blimit0, limit0, thresh0, bd);
+
+            ArrayPtr<ushort> secondGroup = s.Slice(8);
+            HighBdLpfHorizontal4(secondGroup, pitch, blimit1, limit1, thresh1, bd);
+        }
+
+        /// <summary>
+        /// Applies the narrow filter across a vertical edge for 8 consecutive lines.
+        /// Offsets -4..-1 from <paramref name="s"/> are before the edge, 0..3 are after it;
+        /// <paramref name="s"/> is advanced by <paramref name="pitch"/> after each line.
+        /// </summary>
+        public static void HighBdLpfVertical4(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int bd)
+        {
+            for (int line = 0; line < 8; line++)
+            {
+                ushort p3 = s[-4];
+                ushort p2 = s[-3];
+                ushort p1 = s[-2];
+                ushort p0 = s[-1];
+                ushort q0 = s[0];
+                ushort q1 = s[1];
+                ushort q2 = s[2];
+                ushort q3 = s[3];
+
+                sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+
+                HighBdFilter4(mask, thresh, ref s[-2], ref s[-1], ref s[0], ref s[1], bd);
+
+                s = s.Slice(pitch);
+            }
+        }
+
+        /// <summary>
+        /// Runs <see cref="HighBdLpfVertical4"/> on two stacked 8-line groups, each with its own limits:
+        /// the first at <paramref name="s"/>, the second 8 * <paramref name="pitch"/> elements further on.
+        /// </summary>
+        public static void HighBdLpfVertical4Dual(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit0,
+            byte limit0,
+            byte thresh0,
+            byte blimit1,
+            byte limit1,
+            byte thresh1,
+            int bd)
+        {
+            HighBdLpfVertical4(s, pitch, blimit0, limit0, thresh0, bd);
+
+            ArrayPtr<ushort> secondGroup = s.Slice(8 * pitch);
+            HighBdLpfVertical4(secondGroup, pitch, blimit1, limit1, thresh1, bd);
+        }
+
+        /// <summary>
+        /// Medium-width filter: when the region is flat and the mask is set, rewrites p2..q2 with a 7-tap
+        /// average; otherwise defers to <see cref="HighBdFilter4"/>.
+        /// </summary>
+        /// <remarks><paramref name="op3"/> and <paramref name="oq3"/> are only read, never written.</remarks>
+        private static void HighBdFilter8(
+            sbyte mask,
+            byte thresh,
+            bool flat,
+            ref ushort op3,
+            ref ushort op2,
+            ref ushort op1,
+            ref ushort op0,
+            ref ushort oq0,
+            ref ushort oq1,
+            ref ushort oq2,
+            ref ushort oq3,
+            int bd)
+        {
+            if (!flat || mask == 0)
+            {
+                HighBdFilter4(mask, thresh, ref op1, ref op0, ref oq0, ref oq1, bd);
+
+                return;
+            }
+
+            // Copy every input first: the outputs alias the inputs, and each result must be
+            // computed from the unfiltered values.
+            ushort p3 = op3;
+            ushort p2 = op2;
+            ushort p1 = op1;
+            ushort p0 = op0;
+            ushort q0 = oq0;
+            ushort q1 = oq1;
+            ushort q2 = oq2;
+            ushort q3 = oq3;
+
+            // 7-tap filter [1, 1, 1, 2, 1, 1, 1]; the outermost sample is repeated where the window
+            // would otherwise run past it.
+            op2 = (ushort)BitUtils.RoundPowerOfTwo((3 * p3) + (2 * p2) + p1 + p0 + q0, 3);
+            op1 = (ushort)BitUtils.RoundPowerOfTwo((2 * p3) + p2 + (2 * p1) + p0 + q0 + q1, 3);
+            op0 = (ushort)BitUtils.RoundPowerOfTwo(p3 + p2 + p1 + (2 * p0) + q0 + q1 + q2, 3);
+            oq0 = (ushort)BitUtils.RoundPowerOfTwo(p2 + p1 + p0 + (2 * q0) + q1 + q2 + q3, 3);
+            oq1 = (ushort)BitUtils.RoundPowerOfTwo(p1 + p0 + q0 + (2 * q1) + q2 + (2 * q3), 3);
+            oq2 = (ushort)BitUtils.RoundPowerOfTwo(p0 + q0 + q1 + (2 * q2) + (3 * q3), 3);
+        }
+
+        /// <summary>
+        /// Applies the medium-width filter across a horizontal edge for 8 adjacent columns.
+        /// Samples at negative multiples of <paramref name="pitch"/> are before the edge,
+        /// samples at 0 and positive multiples are after it.
+        /// </summary>
+        public static void HighBdLpfHorizontal8(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int bd)
+        {
+            for (int col = 0; col < 8; col++)
+            {
+                ushort p3 = s[-4 * pitch];
+                ushort p2 = s[-3 * pitch];
+                ushort p1 = s[-2 * pitch];
+                ushort p0 = s[-pitch];
+                ushort q0 = s[0];
+                ushort q1 = s[pitch];
+                ushort q2 = s[2 * pitch];
+                ushort q3 = s[3 * pitch];
+
+                sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+                bool flat = HighBdFlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd) != 0;
+
+                HighBdFilter8(
+                    mask, thresh, flat,
+                    ref s[-4 * pitch], ref s[-3 * pitch], ref s[-2 * pitch], ref s[-pitch],
+                    ref s[0], ref s[pitch], ref s[2 * pitch], ref s[3 * pitch],
+                    bd);
+
+                s = s.Slice(1);
+            }
+        }
+
+        /// <summary>
+        /// Runs <see cref="HighBdLpfHorizontal8"/> on two adjacent 8-column groups, each with its own limits:
+        /// the first at <paramref name="s"/>, the second 8 elements further on.
+        /// </summary>
+        public static void HighBdLpfHorizontal8Dual(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit0,
+            byte limit0,
+            byte thresh0,
+            byte blimit1,
+            byte limit1,
+            byte thresh1,
+            int bd)
+        {
+            HighBdLpfHorizontal8(s, pitch, blimit0, limit0, thresh0, bd);
+
+            ArrayPtr<ushort> secondGroup = s.Slice(8);
+            HighBdLpfHorizontal8(secondGroup, pitch, blimit1, limit1, thresh1, bd);
+        }
+
+        /// <summary>
+        /// Applies the medium-width filter across a vertical edge for 8 consecutive lines.
+        /// Offsets -4..-1 from <paramref name="s"/> are before the edge, 0..3 are after it;
+        /// <paramref name="s"/> is advanced by <paramref name="pitch"/> after each line.
+        /// </summary>
+        public static void HighBdLpfVertical8(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int bd)
+        {
+            for (int line = 0; line < 8; line++)
+            {
+                ushort p3 = s[-4];
+                ushort p2 = s[-3];
+                ushort p1 = s[-2];
+                ushort p0 = s[-1];
+                ushort q0 = s[0];
+                ushort q1 = s[1];
+                ushort q2 = s[2];
+                ushort q3 = s[3];
+
+                sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+                bool flat = HighBdFlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd) != 0;
+
+                HighBdFilter8(
+                    mask, thresh, flat,
+                    ref s[-4], ref s[-3], ref s[-2], ref s[-1],
+                    ref s[0], ref s[1], ref s[2], ref s[3],
+                    bd);
+
+                s = s.Slice(pitch);
+            }
+        }
+
+        /// <summary>
+        /// Runs <see cref="HighBdLpfVertical8"/> on two stacked 8-line groups, each with its own limits:
+        /// the first at <paramref name="s"/>, the second 8 * <paramref name="pitch"/> elements further on.
+        /// </summary>
+        public static void HighBdLpfVertical8Dual(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit0,
+            byte limit0,
+            byte thresh0,
+            byte blimit1,
+            byte limit1,
+            byte thresh1,
+            int bd)
+        {
+            HighBdLpfVertical8(s, pitch, blimit0, limit0, thresh0, bd);
+
+            ArrayPtr<ushort> secondGroup = s.Slice(8 * pitch);
+            HighBdLpfVertical8(secondGroup, pitch, blimit1, limit1, thresh1, bd);
+        }
+
+        /// <summary>
+        /// Widest filter: when both flatness flags and the mask are set, rewrites p6..q6 with a 15-tap
+        /// average; otherwise defers to <see cref="HighBdFilter8"/> on the inner eight samples.
+        /// </summary>
+        /// <param name="mask">Filter mask; the wide path is only taken when it is non-zero</param>
+        /// <param name="thresh">Threshold forwarded to <see cref="HighBdFilter8"/> on the fallback path</param>
+        /// <param name="flat">Flatness of the inner samples; required for the wide path, forwarded on fallback</param>
+        /// <param name="flat2">Flatness of the outer samples; required for the wide path</param>
+        /// <param name="bd">Forwarded to <see cref="HighBdFilter8"/> on the fallback path</param>
+        /// <remarks>
+        /// <paramref name="op7"/> and <paramref name="oq7"/> are only read, never written.
+        /// </remarks>
+        private static void HighBdFilter16(
+            sbyte mask,
+            byte thresh,
+            bool flat,
+            bool flat2,
+            ref ushort op7,
+            ref ushort op6,
+            ref ushort op5,
+            ref ushort op4,
+            ref ushort op3,
+            ref ushort op2,
+            ref ushort op1,
+            ref ushort op0,
+            ref ushort oq0,
+            ref ushort oq1,
+            ref ushort oq2,
+            ref ushort oq3,
+            ref ushort oq4,
+            ref ushort oq5,
+            ref ushort oq6,
+            ref ushort oq7,
+            int bd)
+        {
+            if (flat2 && flat && mask != 0)
+            {
+                // Copy every input first: the outputs alias the inputs, and each result below
+                // must be computed from the unfiltered values.
+                ushort p7 = op7;
+                ushort p6 = op6;
+                ushort p5 = op5;
+                ushort p4 = op4;
+                ushort p3 = op3;
+                ushort p2 = op2;
+                ushort p1 = op1;
+                ushort p0 = op0;
+                ushort q0 = oq0;
+                ushort q1 = oq1;
+                ushort q2 = oq2;
+                ushort q3 = oq3;
+                ushort q4 = oq4;
+                ushort q5 = oq5;
+                ushort q6 = oq6;
+                ushort q7 = oq7;
+
+                // 15-tap filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1]
+                // Where the window would run past p7 or q7, that sample's weight is increased instead,
+                // so the weights of every line sum to 16 (hence the rounding shift by 4).
+                op6 = (ushort)BitUtils.RoundPowerOfTwo(
+                    (p7 * 7) + (p6 * 2) + p5 + p4 + p3 + p2 + p1 + p0 + q0, 4);
+                op5 = (ushort)BitUtils.RoundPowerOfTwo(
+                    (p7 * 6) + p6 + (p5 * 2) + p4 + p3 + p2 + p1 + p0 + q0 + q1, 4);
+                op4 = (ushort)BitUtils.RoundPowerOfTwo(
+                    (p7 * 5) + p6 + p5 + (p4 * 2) + p3 + p2 + p1 + p0 + q0 + q1 + q2, 4);
+                op3 = (ushort)BitUtils.RoundPowerOfTwo(
+                    (p7 * 4) + p6 + p5 + p4 + (p3 * 2) + p2 + p1 + p0 + q0 + q1 + q2 + q3, 4);
+                op2 = (ushort)BitUtils.RoundPowerOfTwo(
+                    (p7 * 3) + p6 + p5 + p4 + p3 + (p2 * 2) + p1 + p0 + q0 + q1 + q2 + q3 + q4, 4);
+                op1 = (ushort)BitUtils.RoundPowerOfTwo(
+                    (p7 * 2) + p6 + p5 + p4 + p3 + p2 + (p1 * 2) + p0 + q0 + q1 + q2 + q3 + q4 + q5, 4);
+                op0 = (ushort)BitUtils.RoundPowerOfTwo(
+                    p7 + p6 + p5 + p4 + p3 + p2 + p1 + (p0 * 2) + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4);
+                oq0 = (ushort)BitUtils.RoundPowerOfTwo(
+                    p6 + p5 + p4 + p3 + p2 + p1 + p0 + (q0 * 2) + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4);
+                oq1 = (ushort)BitUtils.RoundPowerOfTwo(
+                    p5 + p4 + p3 + p2 + p1 + p0 + q0 + (q1 * 2) + q2 + q3 + q4 + q5 + q6 + (q7 * 2), 4);
+                oq2 = (ushort)BitUtils.RoundPowerOfTwo(
+                    p4 + p3 + p2 + p1 + p0 + q0 + q1 + (q2 * 2) + q3 + q4 + q5 + q6 + (q7 * 3), 4);
+                oq3 = (ushort)BitUtils.RoundPowerOfTwo(
+                    p3 + p2 + p1 + p0 + q0 + q1 + q2 + (q3 * 2) + q4 + q5 + q6 + (q7 * 4), 4);
+                oq4 = (ushort)BitUtils.RoundPowerOfTwo(
+                    p2 + p1 + p0 + q0 + q1 + q2 + q3 + (q4 * 2) + q5 + q6 + (q7 * 5), 4);
+                oq5 = (ushort)BitUtils.RoundPowerOfTwo(
+                    p1 + p0 + q0 + q1 + q2 + q3 + q4 + (q5 * 2) + q6 + (q7 * 6), 4);
+                oq6 = (ushort)BitUtils.RoundPowerOfTwo(
+                    p0 + q0 + q1 + q2 + q3 + q4 + q5 + (q6 * 2) + (q7 * 7), 4);
+            }
+            else
+            {
+                // Not flat enough for the wide filter: let the 8-sample filter decide what to do.
+                HighBdFilter8(mask, thresh, flat, ref op3, ref op2, ref op1, ref op0, ref oq0, ref oq1, ref oq2,
+                    ref oq3, bd);
+            }
+        }
+
+        /// <summary>
+        /// Applies the widest filter across a horizontal edge for 8 * <paramref name="count"/> adjacent columns.
+        /// Samples at negative multiples of <paramref name="pitch"/> are before the edge,
+        /// samples at 0 and positive multiples are after it.
+        /// </summary>
+        private static void HighBdMbLpfHorizontalEdgeW(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int count,
+            int bd)
+        {
+            int columns = 8 * count;
+
+            for (int col = 0; col < columns; col++)
+            {
+                // Inner four samples on each side; read before the filter gets references to them.
+                ushort p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], p0 = s[-pitch];
+                ushort q0 = s[0], q1 = s[pitch], q2 = s[2 * pitch], q3 = s[3 * pitch];
+
+                sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+                bool flat = HighBdFlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd) != 0;
+
+                // The outer flatness check compares samples 4..7 on each side against p0/q0.
+                bool flat2 = HighBdFlatMask5(
+                    1,
+                    s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch],
+                    p0,
+                    q0,
+                    s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch],
+                    bd) != 0;
+
+                HighBdFilter16(
+                    mask, thresh, flat, flat2,
+                    ref s[-8 * pitch], ref s[-7 * pitch], ref s[-6 * pitch], ref s[-5 * pitch],
+                    ref s[-4 * pitch], ref s[-3 * pitch], ref s[-2 * pitch], ref s[-pitch],
+                    ref s[0], ref s[pitch], ref s[2 * pitch], ref s[3 * pitch],
+                    ref s[4 * pitch], ref s[5 * pitch], ref s[6 * pitch], ref s[7 * pitch],
+                    bd);
+
+                s = s.Slice(1);
+            }
+        }
+
+        /// <summary>
+        /// Applies the widest horizontal-edge filter to a single group of 8 columns.
+        /// </summary>
+        public static void HighBdLpfHorizontal16(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int bd)
+        {
+            const int GroupCount = 1;
+
+            HighBdMbLpfHorizontalEdgeW(s, pitch, blimit, limit, thresh, GroupCount, bd);
+        }
+
+        /// <summary>
+        /// Applies the widest horizontal-edge filter to two groups of 8 columns (16 columns in total),
+        /// using the same limits for both.
+        /// </summary>
+        public static void HighBdLpfHorizontal16Dual(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int bd)
+        {
+            const int GroupCount = 2;
+
+            HighBdMbLpfHorizontalEdgeW(s, pitch, blimit, limit, thresh, GroupCount, bd);
+        }
+
+        /// <summary>
+        /// Applies the widest filter across a vertical edge for <paramref name="count"/> consecutive lines.
+        /// Offsets -8..-1 from <paramref name="s"/> are before the edge, 0..7 are after it;
+        /// <paramref name="s"/> is advanced by <paramref name="pitch"/> after each line.
+        /// </summary>
+        private static void HighBdMbLpfVerticalEdgeW(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int count,
+            int bd)
+        {
+            for (int line = 0; line < count; line++)
+            {
+                // Inner four samples on each side; read before the filter gets references to them.
+                ushort p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+                ushort q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+
+                sbyte mask = HighBdFilterMask(limit, blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+                bool flat = HighBdFlatMask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd) != 0;
+                bool flat2 = HighBdFlatMask5(1, s[-8], s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], s[7], bd) != 0;
+
+                HighBdFilter16(
+                    mask, thresh, flat, flat2,
+                    ref s[-8], ref s[-7], ref s[-6], ref s[-5],
+                    ref s[-4], ref s[-3], ref s[-2], ref s[-1],
+                    ref s[0], ref s[1], ref s[2], ref s[3],
+                    ref s[4], ref s[5], ref s[6], ref s[7],
+                    bd);
+
+                s = s.Slice(pitch);
+            }
+        }
+
+        /// <summary>
+        /// Applies the widest vertical-edge filter to 8 consecutive lines starting at <paramref name="s"/>.
+        /// </summary>
+        public static void HighBdLpfVertical16(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int bd)
+        {
+            const int LineCount = 8;
+
+            HighBdMbLpfVerticalEdgeW(s, pitch, blimit, limit, thresh, LineCount, bd);
+        }
+
+        /// <summary>
+        /// Applies the widest vertical-edge filter to 16 consecutive lines starting at <paramref name="s"/>,
+        /// using the same limits for all of them.
+        /// </summary>
+        public static void HighBdLpfVertical16Dual(
+            ArrayPtr<ushort> s,
+            int pitch,
+            byte blimit,
+            byte limit,
+            byte thresh,
+            int bd)
+        {
+            const int LineCount = 16;
+
+            HighBdMbLpfVerticalEdgeW(s, pitch, blimit, limit, thresh, LineCount, bd);
+        }
+    }
+}

+ 1837 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/LoopFilterSse2.cs

@@ -0,0 +1,1837 @@
+using Ryujinx.Common.Memory;
+using System;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+    internal static class LoopFilterSse2
+    {
+        /// <summary>
+        /// Per-byte absolute difference of two unsigned vectors.
+        /// </summary>
+        private static Vector128<byte> AbsDiff(Vector128<byte> a, Vector128<byte> b)
+        {
+            // Each saturating subtraction yields zero in lanes where its result would be negative,
+            // so at most one of the two is non-zero per lane and OR-ing them gives |a - b|.
+            Vector128<byte> aMinusB = Sse2.SubtractSaturate(a, b);
+            Vector128<byte> bMinusA = Sse2.SubtractSaturate(b, a);
+
+            return Sse2.Or(aMinusB, bMinusA);
+        }
+
+        /// <summary>
+        /// Computes the high-edge-variance flags and the filter mask for 8 pixel positions at once.
+        /// </summary>
+        /// <remarks>
+        /// Each "xY" input packs two 8-byte rows into one vector: row Y in the low 64 bits and row x in the
+        /// high 64 bits (this is how the caller in this class builds them).
+        /// <paramref name="limitV"/> holds blimit in its low half and limit in its high half;
+        /// <paramref name="threshV"/> holds the thresholds widened to 16-bit lanes.
+        /// On return <paramref name="hev"/> has the same 8 flag bytes in both halves, and
+        /// <paramref name="mask"/> has its 8 result bytes in the low half (the high half is zero).
+        /// </remarks>
+        private static void FilterHevMask(
+            Vector128<byte> q1P1,
+            Vector128<byte> q0P0,
+            Vector128<byte> p3P2,
+            Vector128<byte> p2P1,
+            Vector128<byte> p1P0,
+            Vector128<byte> q3Q2,
+            Vector128<byte> q2Q1,
+            Vector128<byte> q1Q0,
+            Vector128<byte> limitV,
+            Vector128<byte> threshV,
+            out Vector128<byte> hev,
+            out Vector128<byte> mask)
+        {
+            /* abs(q1 - q0), abs(p1 - p0) */
+            Vector128<byte> flat = AbsDiff(q1P1, q0P0);
+            /* abs(p1 - q1), abs(p0 - q0) */
+            Vector128<byte> absP1Q1P0Q0 = AbsDiff(p1P0, q1Q0);
+            Vector128<byte> absP0Q0, absP1Q1, work;
+
+            /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+            // Take the larger of the two differences, widen it to 16 bits so the signed compare against
+            // threshV is safe for values above 127, then pack the result back down to bytes.
+            hev = Sse2.UnpackLow(Sse2.Max(flat, Sse2.ShiftRightLogical128BitLane(flat, 8)), Vector128<byte>.Zero);
+            hev = Sse2.CompareGreaterThan(hev.AsInt16(), threshV.AsInt16()).AsByte();
+            hev = Sse2.PackSignedSaturate(hev.AsInt16(), hev.AsInt16()).AsByte();
+
+            /* const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); */
+            absP0Q0 = Sse2.AddSaturate(absP1Q1P0Q0, absP1Q1P0Q0); /* abs(p0 - q0) * 2 */
+            absP1Q1 = Sse2.UnpackHigh(absP1Q1P0Q0, absP1Q1P0Q0); /* abs(p1 - q1) */
+            // Each 16-bit lane now holds the byte duplicated (v * 257); shifting right by 9 leaves v / 2.
+            absP1Q1 = Sse2.ShiftRightLogical(absP1Q1.AsInt16(), 9).AsByte();
+            absP1Q1 = Sse2.PackSignedSaturate(absP1Q1.AsInt16(), absP1Q1.AsInt16()).AsByte(); /* abs(p1 - q1) / 2 */
+            /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+            mask = Sse2.AddSaturate(absP0Q0, absP1Q1);
+            /* abs(p3 - p2), abs(p2 - p1) */
+            work = AbsDiff(p3P2, p2P1);
+            flat = Sse2.Max(work, flat);
+            /* abs(q3 - q2), abs(q2 - q1) */
+            work = AbsDiff(q3Q2, q2Q1);
+            flat = Sse2.Max(work, flat);
+            // Fold the two halves so the low 8 bytes hold the largest interior difference per position.
+            flat = Sse2.Max(flat, Sse2.ShiftRightLogical128BitLane(flat, 8));
+            // Low half: edge sum (checked against blimit). High half: interior maximum (checked against limit).
+            mask = Sse2.UnpackLow(mask.AsInt64(), flat.AsInt64()).AsByte();
+            // A saturating subtract that ends at zero means "value <= limit".
+            mask = Sse2.SubtractSaturate(mask, limitV);
+            mask = Sse2.CompareEqual(mask, Vector128<byte>.Zero);
+            // Both conditions must hold: AND the high half into the low half.
+            mask = Sse2.And(mask, Sse2.ShiftRightLogical128BitLane(mask, 8));
+        }
+
+        /// <summary>
+        /// Vectorised narrow filter: computes the filtered p1/p0 and q1/q0 rows for 8 pixel positions at once.
+        /// </summary>
+        /// <param name="p1P0">p0 row in the low 64 bits, p1 row in the high 64 bits</param>
+        /// <param name="q1Q0">q0 row in the low 64 bits, q1 row in the high 64 bits</param>
+        /// <param name="hev">High-edge-variance flags, as produced by <see cref="FilterHevMask"/></param>
+        /// <param name="mask">Filter mask, as produced by <see cref="FilterHevMask"/></param>
+        /// <param name="ff">Vector of -1 bytes (callers in this class pass all bits set); subtracting it adds 1</param>
+        /// <param name="ps1Ps0">Filtered p rows, packed like <paramref name="p1P0"/></param>
+        /// <param name="qs1Qs0">Filtered q rows, packed like <paramref name="q1Q0"/></param>
+        private static void Filter4(
+            Vector128<byte> p1P0,
+            Vector128<byte> q1Q0,
+            Vector128<byte> hev,
+            Vector128<byte> mask,
+            Vector128<byte> ff,
+            out Vector128<byte> ps1Ps0,
+            out Vector128<byte> qs1Qs0)
+        {
+            // Rounding constants: +4 in the low 8 bytes (filter1), +3 in the high 8 bytes (filter2).
+            Vector128<byte> t3T4 = Vector128.Create(
+                4, 4, 4, 4,
+                4, 4, 4, 4,
+                3, 3, 3, 3,
+                3, 3, 3, (byte)3);
+            Vector128<byte> t80 = Vector128.Create((byte)0x80);
+            Vector128<byte> filter, filter2Filter1, work;
+
+            // Flipping the top bit converts the unsigned samples to signed values centred on zero.
+            ps1Ps0 = Sse2.Xor(p1P0, t80); /* ^ 0x80 */
+            qs1Qs0 = Sse2.Xor(q1Q0, t80);
+
+            /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+            // work: (ps0 - qs0) in the low half, (ps1 - qs1) in the high half.
+            work = Sse2.SubtractSaturate(ps1Ps0.AsSByte(), qs1Qs0.AsSByte()).AsByte();
+            filter = Sse2.And(Sse2.ShiftRightLogical128BitLane(work, 8), hev);
+            /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+            // Subtracting (ps0 - qs0) three times is the same as adding 3 * (qs0 - ps0).
+            filter = Sse2.SubtractSaturate(filter.AsSByte(), work.AsSByte()).AsByte();
+            filter = Sse2.SubtractSaturate(filter.AsSByte(), work.AsSByte()).AsByte();
+            filter = Sse2.SubtractSaturate(filter.AsSByte(), work.AsSByte()).AsByte(); /* + 3 * (qs0 - ps0) */
+            filter = Sse2.And(filter, mask); /* & mask */
+            // Duplicate the low half so both rounding variants can be computed in one add.
+            filter = Sse2.UnpackLow(filter.AsInt64(), filter.AsInt64()).AsByte();
+
+            /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+            /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+            filter2Filter1 = Sse2.AddSaturate(filter.AsSByte(), t3T4.AsSByte()).AsByte(); /* signed_char_clamp */
+            // There is no 8-bit arithmetic shift: duplicate each byte into a 16-bit lane and shift by
+            // 8 + 3 so the sign-extended byte >> 3 remains, then pack back to bytes.
+            filter = Sse2.UnpackHigh(filter2Filter1, filter2Filter1);
+            filter2Filter1 = Sse2.UnpackLow(filter2Filter1, filter2Filter1);
+            filter2Filter1 = Sse2.ShiftRightArithmetic(filter2Filter1.AsInt16(), 11).AsByte(); /* >> 3 */
+            filter = Sse2.ShiftRightArithmetic(filter.AsInt16(), 11).AsByte(); /* >> 3 */
+            filter2Filter1 = Sse2.PackSignedSaturate(filter2Filter1.AsInt16(), filter.AsInt16()).AsByte();
+
+            /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+            filter = Sse2.SubtractSaturate(filter2Filter1.AsSByte(), ff.AsSByte()).AsByte(); /* + 1 */
+            filter = Sse2.UnpackLow(filter, filter);
+            filter = Sse2.ShiftRightArithmetic(filter.AsInt16(), 9).AsByte(); /* round */
+            filter = Sse2.PackSignedSaturate(filter.AsInt16(), filter.AsInt16()).AsByte();
+            filter = Sse2.AndNot(hev, filter);
+
+            // Re-pack the adjustments to match the row layout: hev is reused as scratch and now holds
+            // (filter2 low, filter high) for the p rows; filter2Filter1 holds (filter1 low, filter high)
+            // for the q rows.
+            hev = Sse2.UnpackHigh(filter2Filter1.AsInt64(), filter.AsInt64()).AsByte();
+            filter2Filter1 = Sse2.UnpackLow(filter2Filter1.AsInt64(), filter.AsInt64()).AsByte();
+
+            /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+            qs1Qs0 = Sse2.SubtractSaturate(qs1Qs0.AsSByte(), filter2Filter1.AsSByte()).AsByte();
+            /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+            ps1Ps0 = Sse2.AddSaturate(ps1Ps0.AsSByte(), hev.AsSByte()).AsByte();
+            qs1Qs0 = Sse2.Xor(qs1Qs0, t80); /* ^ 0x80 */
+            ps1Ps0 = Sse2.Xor(ps1Ps0, t80); /* ^ 0x80 */
+        }
+
+        /// <summary>
+        /// SSE2 narrow filter across a horizontal edge, processing 8 adjacent columns in one pass.
+        /// Reads the four rows before and the four rows after <paramref name="s"/> and rewrites the
+        /// inner four (rows -2, -1, 0 and +1).
+        /// </summary>
+        /// <param name="s">Points at the first row after the edge</param>
+        /// <param name="pitch">Distance between rows, in bytes</param>
+        /// <param name="blimit">Edge limit; 8 bytes are loaded from the start of the span</param>
+        /// <param name="limit">Interior limit; 8 bytes are loaded from the start of the span</param>
+        /// <param name="thresh">High-edge-variance threshold; 8 bytes are loaded from the start of the span</param>
+        /// <remarks>
+        /// The three spans are read with 64-bit loads without a length check, so each must provide at
+        /// least 8 readable bytes.
+        /// </remarks>
+        public static unsafe void LpfHorizontal4(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            Vector128<byte> zero = Vector128<byte>.Zero;
+            Vector128<byte> limitV, threshV;
+
+            fixed (byte* pBLimit = blimit, pLimit = limit, pThresh = thresh)
+            {
+                // limitV: blimit in the low 64 bits, limit in the high 64 bits.
+                limitV = Sse2.UnpackLow(
+                    Sse2.LoadScalarVector128((long*)pBLimit),
+                    Sse2.LoadScalarVector128((long*)pLimit)).AsByte();
+                // threshV: thresh bytes zero-extended to 16-bit lanes.
+                threshV = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)pThresh).AsByte(), zero);
+            }
+
+            // All bits set (every byte is -1).
+            Vector128<byte> ff = Sse2.CompareEqual(zero, zero);
+            Vector128<byte> q1P1, q0P0, p3P2, p2P1, p1P0, q3Q2, q2Q1, q1Q0, ps1Ps0, qs1Qs0;
+            Vector128<byte> mask, hev;
+
+            // Load two 8-byte rows per vector; "xY" means row Y in the low half, row x in the high half.
+            p3P2 = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)(s.ToPointer() - (3 * pitch))),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (4 * pitch)))).AsByte();
+            q1P1 = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)(s.ToPointer() - (2 * pitch))),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (1 * pitch)))).AsByte();
+            q0P0 = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)(s.ToPointer() - (1 * pitch))),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (0 * pitch)))).AsByte();
+            // Note: this one holds q2 in the low half and q3 in the high half.
+            q3Q2 = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)(s.ToPointer() + (2 * pitch))),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (3 * pitch)))).AsByte();
+            // Derive the remaining row pairings from the vectors already loaded.
+            p1P0 = Sse2.UnpackLow(q0P0.AsInt64(), q1P1.AsInt64()).AsByte();
+            p2P1 = Sse2.UnpackLow(q1P1.AsInt64(), p3P2.AsInt64()).AsByte();
+            q1Q0 = Sse2.UnpackHigh(q0P0.AsInt64(), q1P1.AsInt64()).AsByte();
+            q2Q1 = Sse2.UnpackLow(Sse2.ShiftRightLogical128BitLane(q1P1, 8).AsInt64(), q3Q2.AsInt64()).AsByte();
+
+            FilterHevMask(q1P1, q0P0, p3P2, p2P1, p1P0, q3Q2, q2Q1, q1Q0, limitV, threshV, out hev, out mask);
+            Filter4(p1P0, q1Q0, hev, mask, ff, out ps1Ps0, out qs1Qs0);
+
+            // The high halves carry the outer rows (p1/q1), the low halves the inner rows (p0/q0).
+            Sse.StoreHigh((float*)(s.ToPointer() - (2 * pitch)), ps1Ps0.AsSingle()); // *op1
+            Sse2.StoreScalar((long*)(s.ToPointer() - (1 * pitch)), ps1Ps0.AsInt64()); // *op0
+            Sse2.StoreScalar((long*)(s.ToPointer() + (0 * pitch)), qs1Qs0.AsInt64()); // *oq0
+            Sse.StoreHigh((float*)(s.ToPointer() + (1 * pitch)), qs1Qs0.AsSingle()); // *oq1
+        }
+
+        public static unsafe void LpfVertical4(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            Vector128<byte> zero = Vector128<byte>.Zero;
+            Vector128<byte> limitV, threshV;
+
+            fixed (byte* pBLimit = blimit, pLimit = limit, pThresh = thresh)
+            {
+                limitV = Sse2.UnpackLow(
+                    Sse2.LoadScalarVector128((long*)pBLimit).AsInt64(),
+                    Sse2.LoadScalarVector128((long*)pLimit).AsInt64()).AsByte();
+                threshV = Sse2.UnpackLow(Sse2.LoadScalarVector128((long*)pThresh).AsByte(), zero);
+            }
+
+            Vector128<byte> ff = Sse2.CompareEqual(zero, zero);
+            Vector128<byte> x0, x1, x2, x3;
+            Vector128<byte> q1P1, q0P0, p3P2, p2P1, p1P0, q3Q2, q2Q1, q1Q0, ps1Ps0, qs1Qs0;
+            Vector128<byte> mask, hev;
+
+            // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+            q1Q0 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (0 * pitch) - 4)).AsByte(),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (1 * pitch) - 4)).AsByte());
+
+            // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+            x1 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (2 * pitch) - 4)).AsByte(),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (3 * pitch) - 4)).AsByte());
+
+            // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+            x2 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (4 * pitch) - 4)).AsByte(),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (5 * pitch) - 4)).AsByte());
+
+            // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+            x3 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (6 * pitch) - 4)).AsByte(),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (7 * pitch) - 4)).AsByte());
+
+            // Transpose 8x8
+            // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+            p1P0 = Sse2.UnpackLow(q1Q0.AsInt16(), x1.AsInt16()).AsByte();
+            // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
+            x0 = Sse2.UnpackLow(x2.AsInt16(), x3.AsInt16()).AsByte();
+            // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
+            p3P2 = Sse2.UnpackLow(p1P0.AsInt32(), x0.AsInt32()).AsByte();
+            // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
+            p1P0 = Sse2.UnpackHigh(p1P0.AsInt32(), x0.AsInt32()).AsByte();
+            p3P2 = Sse2.UnpackHigh(p3P2.AsInt64(), Sse2.ShiftLeftLogical128BitLane(p3P2, 8).AsInt64())
+                .AsByte(); // swap lo and high
+            p1P0 = Sse2.UnpackHigh(p1P0.AsInt64(), Sse2.ShiftLeftLogical128BitLane(p1P0, 8).AsInt64())
+                .AsByte(); // swap lo and high
+
+            // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+            q1Q0 = Sse2.UnpackHigh(q1Q0.AsInt16(), x1.AsInt16()).AsByte();
+            // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
+            x2 = Sse2.UnpackHigh(x2.AsInt16(), x3.AsInt16()).AsByte();
+            // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
+            q3Q2 = Sse2.UnpackHigh(q1Q0.AsInt32(), x2.AsInt32()).AsByte();
+            // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
+            q1Q0 = Sse2.UnpackLow(q1Q0.AsInt32(), x2.AsInt32()).AsByte();
+
+            q0P0 = Sse2.UnpackLow(p1P0.AsInt64(), q1Q0.AsInt64()).AsByte();
+            q1P1 = Sse2.UnpackHigh(p1P0.AsInt64(), q1Q0.AsInt64()).AsByte();
+            p1P0 = Sse2.UnpackLow(q0P0.AsInt64(), q1P1.AsInt64()).AsByte();
+            p2P1 = Sse2.UnpackLow(q1P1.AsInt64(), p3P2.AsInt64()).AsByte();
+            q2Q1 = Sse2.UnpackLow(Sse2.ShiftRightLogical128BitLane(q1P1, 8).AsInt64(), q3Q2.AsInt64()).AsByte();
+
+            FilterHevMask(q1P1, q0P0, p3P2, p2P1, p1P0, q3Q2, q2Q1, q1Q0, limitV, threshV, out hev, out mask);
+            Filter4(p1P0, q1Q0, hev, mask, ff, out ps1Ps0, out qs1Qs0);
+
+            // Transpose 8x4 to 4x8
+            // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
+            // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
+            // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+            ps1Ps0 = Sse2.UnpackHigh(ps1Ps0.AsInt64(), Sse2.ShiftLeftLogical128BitLane(ps1Ps0, 8).AsInt64()).AsByte();
+            // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
+            x0 = Sse2.UnpackHigh(ps1Ps0, qs1Qs0);
+            // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
+            ps1Ps0 = Sse2.UnpackLow(ps1Ps0, qs1Qs0);
+            // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
+            qs1Qs0 = Sse2.UnpackHigh(ps1Ps0, x0);
+            // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
+            ps1Ps0 = Sse2.UnpackLow(ps1Ps0, x0);
+
+            *(int*)(s.ToPointer() + (0 * pitch) - 2) = ps1Ps0.AsInt32().GetElement(0);
+            ps1Ps0 = Sse2.ShiftRightLogical128BitLane(ps1Ps0, 4);
+            *(int*)(s.ToPointer() + (1 * pitch) - 2) = ps1Ps0.AsInt32().GetElement(0);
+            ps1Ps0 = Sse2.ShiftRightLogical128BitLane(ps1Ps0, 4);
+            *(int*)(s.ToPointer() + (2 * pitch) - 2) = ps1Ps0.AsInt32().GetElement(0);
+            ps1Ps0 = Sse2.ShiftRightLogical128BitLane(ps1Ps0, 4);
+            *(int*)(s.ToPointer() + (3 * pitch) - 2) = ps1Ps0.AsInt32().GetElement(0);
+
+            *(int*)(s.ToPointer() + (4 * pitch) - 2) = qs1Qs0.AsInt32().GetElement(0);
+            qs1Qs0 = Sse2.ShiftRightLogical128BitLane(qs1Qs0, 4);
+            *(int*)(s.ToPointer() + (5 * pitch) - 2) = qs1Qs0.AsInt32().GetElement(0);
+            qs1Qs0 = Sse2.ShiftRightLogical128BitLane(qs1Qs0, 4);
+            *(int*)(s.ToPointer() + (6 * pitch) - 2) = qs1Qs0.AsInt32().GetElement(0);
+            qs1Qs0 = Sse2.ShiftRightLogical128BitLane(qs1Qs0, 4);
+            *(int*)(s.ToPointer() + (7 * pitch) - 2) = qs1Qs0.AsInt32().GetElement(0);
+        }
+
+        /// <summary>
+        /// SSE2 loop filter applied across the row boundary between (s - pitch) and s, 8 pixel columns at a time.
+        /// Rows above the boundary are named p0..p7 (p0 = s - 1 * pitch), rows at and below it q0..q7 (q0 = s).
+        /// Rows p7..q7 are read (8 bytes each); rows p6..q6 are written back (8 bytes each).
+        /// </summary>
+        /// <param name="s">Pointer to the first filtered byte of row q0.</param>
+        /// <param name="pitch">Byte distance between two consecutive rows.</param>
+        /// <param name="blimit">Threshold compared against abs(p0 - q0) * 2 + abs(p1 - q1) / 2; 16 bytes are loaded.</param>
+        /// <param name="limit">Threshold compared against the neighbouring-row differences; 16 bytes are loaded.</param>
+        /// <param name="thresh">Threshold used to build the hev mask; 16 bytes are loaded.</param>
+        public static unsafe void LpfHorizontal16(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            Vector128<byte> zero = Vector128<byte>.Zero;
+            Vector128<byte> one = Vector128.Create((byte)1);
+            Vector128<byte> blimitV, limitV, threshV;
+
+            // LoadVector128 reads a full 16 bytes from each span.
+            // NOTE(review): assumes every span is at least 16 bytes long - confirm against the callers.
+            fixed (byte* pBLimit = blimit, pLimit = limit, pThresh = thresh)
+            {
+                blimitV = Sse2.LoadVector128(pBLimit);
+                limitV = Sse2.LoadVector128(pLimit);
+                threshV = Sse2.LoadVector128(pThresh);
+            }
+
+            Vector128<byte> mask, hev, flat, flat2;
+            Vector128<byte> q7P7, q6P6, q5P5, q4P4, q3P3, q2P2, q1P1, q0P0, p0Q0, p1Q1;
+            Vector128<byte> absP1P0;
+
+            // Each qNPN vector holds 8 bytes of row pN (s - (N + 1) * pitch) in its low half
+            // and 8 bytes of row qN (s + N * pitch) in its high half.
+            q4P4 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (5 * pitch))).AsByte();
+            q4P4 = Sse.LoadHigh(q4P4.AsSingle(), (float*)(s.ToPointer() + (4 * pitch))).AsByte();
+            q3P3 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (4 * pitch))).AsByte();
+            q3P3 = Sse.LoadHigh(q3P3.AsSingle(), (float*)(s.ToPointer() + (3 * pitch))).AsByte();
+            q2P2 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (3 * pitch))).AsByte();
+            q2P2 = Sse.LoadHigh(q2P2.AsSingle(), (float*)(s.ToPointer() + (2 * pitch))).AsByte();
+            q1P1 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (2 * pitch))).AsByte();
+            q1P1 = Sse.LoadHigh(q1P1.AsSingle(), (float*)(s.ToPointer() + (1 * pitch))).AsByte();
+            // Shuffle control 78 (0b01_00_11_10) swaps the two 64-bit halves: q1 ends up low, p1 high.
+            p1Q1 = Sse2.Shuffle(q1P1.AsUInt32(), 78).AsByte();
+            q0P0 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (1 * pitch))).AsByte();
+            q0P0 = Sse.LoadHigh(q0P0.AsSingle(), (float*)(s.ToPointer() - (0 * pitch))).AsByte();
+            p0Q0 = Sse2.Shuffle(q0P0.AsUInt32(), 78).AsByte();
+
+            // Build the hev mask and the filter-enable mask.
+            {
+                Vector128<byte> absP1Q1, absP0Q0, absQ1Q0, fe, ff, work;
+                // Low half: abs(p1 - p0); high half: abs(q1 - q0).
+                absP1P0 = AbsDiff(q1P1, q0P0);
+                // Moves abs(q1 - q0) down into the low half (the high half becomes zero).
+                absQ1Q0 = Sse2.ShiftRightLogical128BitLane(absP1P0, 8);
+                fe = Vector128.Create((byte)0xfe);
+                // Comparing a value with itself gives all bits set.
+                ff = Sse2.CompareEqual(absP1P0, absP1P0);
+                absP0Q0 = AbsDiff(q0P0, p0Q0);
+                absP1Q1 = AbsDiff(q1P1, p1Q1);
+                flat = Sse2.Max(absP1P0, absQ1Q0);
+                // hev = 0xFF in bytes where the maximum above exceeds thresh, 0x00 otherwise.
+                hev = Sse2.SubtractSaturate(flat, threshV);
+                hev = Sse2.Xor(Sse2.CompareEqual(hev, zero), ff);
+
+                absP0Q0 = Sse2.AddSaturate(absP0Q0, absP0Q0);
+                // Clearing bit 0 of every byte first keeps the 16-bit shift from leaking a bit into the neighbouring byte.
+                absP1Q1 = Sse2.ShiftRightLogical(Sse2.And(absP1Q1, fe).AsInt16(), 1).AsByte();
+                mask = Sse2.SubtractSaturate(Sse2.AddSaturate(absP0Q0, absP1Q1), blimitV);
+                mask = Sse2.Xor(Sse2.CompareEqual(mask, zero), ff);
+                // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+                mask = Sse2.Max(absP1P0, mask);
+                // mask |= (abs(p1 - p0) > limit) * -1;
+                // mask |= (abs(q1 - q0) > limit) * -1;
+
+                work = Sse2.Max(AbsDiff(q2P2, q1P1), AbsDiff(q3P3, q2P2));
+                mask = Sse2.Max(work, mask);
+                // Fold the q-side values (high half) into the low half.
+                mask = Sse2.Max(mask, Sse2.ShiftRightLogical128BitLane(mask, 8));
+                // Saturating subtract + compare with zero: 0xFF where the accumulated maximum is <= limit.
+                mask = Sse2.SubtractSaturate(mask, limitV);
+                mask = Sse2.CompareEqual(mask, zero);
+            }
+
+            // lp filter
+            {
+                Vector128<byte> t4 = Vector128.Create((byte)4);
+                Vector128<byte> t3 = Vector128.Create((byte)3);
+                Vector128<byte> t80 = Vector128.Create((byte)0x80);
+                Vector128<ushort> t1 = Vector128.Create((ushort)0x1);
+                // Xor with 0x80 flips the top bit so the bytes can be processed with signed saturating arithmetic;
+                // the same Xor is applied again when the results are produced.
+                Vector128<byte> qs1Ps1 = Sse2.Xor(q1P1, t80);
+                Vector128<byte> qs0Ps0 = Sse2.Xor(q0P0, t80);
+                Vector128<byte> qs0 = Sse2.Xor(p0Q0, t80);
+                Vector128<byte> qs1 = Sse2.Xor(p1Q1, t80);
+                Vector128<byte> filt;
+                Vector128<byte> workA;
+                Vector128<byte> filter1, filter2;
+                Vector128<byte> flat2Q6P6, flat2Q5P5, flat2Q4P4, flat2Q3P3, flat2Q2P2;
+                Vector128<byte> flat2Q1P1, flat2Q0P0, flatQ2P2, flatQ1P1, flatQ0P0;
+
+                // Low half: (ps1 - qs1) & hev. Only the low half of filt is consumed below (via UnpackLow).
+                filt = Sse2.And(Sse2.SubtractSaturate(qs1Ps1.AsSByte(), qs1.AsSByte()).AsByte(), hev);
+                workA = Sse2.SubtractSaturate(qs0.AsSByte(), qs0Ps0.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                // (vpx_filter + 3 * (qs0 - ps0)) & mask
+                filt = Sse2.And(filt, mask);
+
+                filter1 = Sse2.AddSaturate(filt.AsSByte(), t4.AsSByte()).AsByte();
+                filter2 = Sse2.AddSaturate(filt.AsSByte(), t3.AsSByte()).AsByte();
+
+                // UnpackLow(zero, x) places each byte in the upper byte of a 16-bit lane, so the arithmetic
+                // shift by 0xB (8 + 3) produces the sign-extended byte shifted right by 3.
+                filter1 = Sse2.UnpackLow(zero, filter1);
+                filter1 = Sse2.ShiftRightArithmetic(filter1.AsInt16(), 0xB).AsByte();
+                filter2 = Sse2.UnpackLow(zero, filter2);
+                filter2 = Sse2.ShiftRightArithmetic(filter2.AsInt16(), 0xB).AsByte();
+
+                // Filter1 >> 3
+                // Packed as +filter2 in the low half (added to ps0) and -filter1 in the high half (added to qs0).
+                filt = Sse2.PackSignedSaturate(filter2.AsInt16(),
+                    Sse2.SubtractSaturate(zero.AsInt16(), filter1.AsInt16())).AsByte();
+                qs0Ps0 = Sse2.Xor(Sse2.AddSaturate(qs0Ps0.AsSByte(), filt.AsSByte()).AsByte(), t80);
+
+                // filt >> 1
+                // (filter1 + 1) >> 1, zeroed where hev is set, then packed as +filt (low half, ps1) and -filt (high half, qs1).
+                filt = Sse2.AddSaturate(filter1.AsInt16(), t1.AsInt16()).AsByte();
+                filt = Sse2.ShiftRightArithmetic(filt.AsInt16(), 1).AsByte();
+                filt = Sse2.AndNot(Sse2.ShiftRightArithmetic(Sse2.UnpackLow(zero, hev).AsInt16(), 0x8), filt.AsInt16())
+                    .AsByte();
+                filt = Sse2.PackSignedSaturate(filt.AsInt16(), Sse2.SubtractSaturate(zero.AsInt16(), filt.AsInt16()))
+                    .AsByte();
+                qs1Ps1 = Sse2.Xor(Sse2.AddSaturate(qs1Ps1.AsSByte(), filt.AsSByte()).AsByte(), t80);
+                // loopfilter done
+
+                // Build the flat and flat2 masks, loading the outer rows (5..7) on the way.
+                {
+                    Vector128<byte> work;
+                    // flat = 0xFF (low half) where rows 1..3 on both sides differ from row 0 by at most 1, and mask is set.
+                    flat = Sse2.Max(AbsDiff(q2P2, q0P0), AbsDiff(q3P3, q0P0));
+                    flat = Sse2.Max(absP1P0, flat);
+                    flat = Sse2.Max(flat, Sse2.ShiftRightLogical128BitLane(flat, 8));
+                    flat = Sse2.SubtractSaturate(flat, one);
+                    flat = Sse2.CompareEqual(flat, zero);
+                    flat = Sse2.And(flat, mask);
+
+                    q5P5 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (6 * pitch))).AsByte();
+                    q5P5 = Sse.LoadHigh(q5P5.AsSingle(), (float*)(s.ToPointer() + (5 * pitch))).AsByte();
+
+                    q6P6 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (7 * pitch))).AsByte();
+                    q6P6 = Sse.LoadHigh(q6P6.AsSingle(), (float*)(s.ToPointer() + (6 * pitch))).AsByte();
+                    flat2 = Sse2.Max(AbsDiff(q4P4, q0P0), AbsDiff(q5P5, q0P0));
+
+                    q7P7 = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (8 * pitch))).AsByte();
+                    q7P7 = Sse.LoadHigh(q7P7.AsSingle(), (float*)(s.ToPointer() + (7 * pitch))).AsByte();
+                    // flat2 applies the same "difference from row 0 is at most 1" test to rows 4..7.
+                    work = Sse2.Max(AbsDiff(q6P6, q0P0), AbsDiff(q7P7, q0P0));
+                    flat2 = Sse2.Max(work, flat2);
+                    flat2 = Sse2.Max(flat2, Sse2.ShiftRightLogical128BitLane(flat2, 8));
+                    flat2 = Sse2.SubtractSaturate(flat2, one);
+                    flat2 = Sse2.CompareEqual(flat2, zero);
+                    flat2 = Sse2.And(flat2, flat); // flat2 & flat & mask
+                }
+
+                // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                // flat and wide flat calculations
+                // All sums below are kept in 16-bit lanes; results shifted by 4 feed the flat2 outputs,
+                // results shifted by 3 feed the flat outputs.
+                {
+                    Vector128<short> eight = Vector128.Create((short)8);
+                    Vector128<short> four = Vector128.Create((short)4);
+                    Vector128<short> p716, p616, p516, p416, p316, p216, p116, p016;
+                    Vector128<short> q716, q616, q516, q416, q316, q216, q116, q016;
+                    Vector128<short> pixelFilterP, pixelFilterQ;
+                    Vector128<short> pixetFilterP2P1P0, pixetFilterQ2Q1Q0;
+                    Vector128<short> sumP7, sumQ7, sumP3, sumQ3, resP, resQ;
+
+                    // Zero-extend the p rows (low halves) and q rows (high halves) to 16 bits.
+                    p716 = Sse2.UnpackLow(q7P7, zero).AsInt16();
+                    p616 = Sse2.UnpackLow(q6P6, zero).AsInt16();
+                    p516 = Sse2.UnpackLow(q5P5, zero).AsInt16();
+                    p416 = Sse2.UnpackLow(q4P4, zero).AsInt16();
+                    p316 = Sse2.UnpackLow(q3P3, zero).AsInt16();
+                    p216 = Sse2.UnpackLow(q2P2, zero).AsInt16();
+                    p116 = Sse2.UnpackLow(q1P1, zero).AsInt16();
+                    p016 = Sse2.UnpackLow(q0P0, zero).AsInt16();
+                    q016 = Sse2.UnpackHigh(q0P0, zero).AsInt16();
+                    q116 = Sse2.UnpackHigh(q1P1, zero).AsInt16();
+                    q216 = Sse2.UnpackHigh(q2P2, zero).AsInt16();
+                    q316 = Sse2.UnpackHigh(q3P3, zero).AsInt16();
+                    q416 = Sse2.UnpackHigh(q4P4, zero).AsInt16();
+                    q516 = Sse2.UnpackHigh(q5P5, zero).AsInt16();
+                    q616 = Sse2.UnpackHigh(q6P6, zero).AsInt16();
+                    q716 = Sse2.UnpackHigh(q7P7, zero).AsInt16();
+
+                    pixelFilterP = Sse2.Add(Sse2.Add(p616, p516), Sse2.Add(p416, p316));
+                    pixelFilterQ = Sse2.Add(Sse2.Add(q616, q516), Sse2.Add(q416, q316));
+
+                    pixetFilterP2P1P0 = Sse2.Add(p016, Sse2.Add(p216, p116));
+                    pixelFilterP = Sse2.Add(pixelFilterP, pixetFilterP2P1P0);
+
+                    pixetFilterQ2Q1Q0 = Sse2.Add(q016, Sse2.Add(q216, q116));
+                    pixelFilterQ = Sse2.Add(pixelFilterQ, pixetFilterQ2Q1Q0);
+                    // pixelFilterP = 8 + (p6 + ... + p0) + (q0 + ... + q6); the 8 rounds the later shift by 4.
+                    pixelFilterP = Sse2.Add(eight, Sse2.Add(pixelFilterP, pixelFilterQ));
+                    // pixetFilterP2P1P0 = 4 + (p2 + p1 + p0) + (q0 + q1 + q2); the 4 rounds the later shift by 3.
+                    pixetFilterP2P1P0 = Sse2.Add(four, Sse2.Add(pixetFilterP2P1P0, pixetFilterQ2Q1Q0));
+                    resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(p716, p016)), 4);
+                    resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(q716, q016)), 4);
+                    flat2Q0P0 = Sse2.PackUnsignedSaturate(resP, resQ);
+                    resP = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterP2P1P0, Sse2.Add(p316, p016)), 3);
+                    resQ = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterP2P1P0, Sse2.Add(q316, q016)), 3);
+
+                    flatQ0P0 = Sse2.PackUnsignedSaturate(resP, resQ);
+
+                    // From here on the running sums are updated incrementally: each step drops one row from the
+                    // far side and adds one more copy of the outermost row (p7/q7 or p3/q3) through sumX7/sumX3.
+                    sumP7 = Sse2.Add(p716, p716);
+                    sumQ7 = Sse2.Add(q716, q716);
+                    sumP3 = Sse2.Add(p316, p316);
+                    sumQ3 = Sse2.Add(q316, q316);
+
+                    pixelFilterQ = Sse2.Subtract(pixelFilterP, p616);
+                    pixelFilterP = Sse2.Subtract(pixelFilterP, q616);
+                    resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p116)), 4);
+                    resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q116)), 4);
+                    flat2Q1P1 = Sse2.PackUnsignedSaturate(resP, resQ);
+
+                    pixetFilterQ2Q1Q0 = Sse2.Subtract(pixetFilterP2P1P0, p216);
+                    pixetFilterP2P1P0 = Sse2.Subtract(pixetFilterP2P1P0, q216);
+                    resP = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterP2P1P0, Sse2.Add(sumP3, p116)), 3);
+                    resQ = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterQ2Q1Q0, Sse2.Add(sumQ3, q116)), 3);
+                    flatQ1P1 = Sse2.PackUnsignedSaturate(resP, resQ);
+
+                    sumP7 = Sse2.Add(sumP7, p716);
+                    sumQ7 = Sse2.Add(sumQ7, q716);
+                    sumP3 = Sse2.Add(sumP3, p316);
+                    sumQ3 = Sse2.Add(sumQ3, q316);
+
+                    pixelFilterP = Sse2.Subtract(pixelFilterP, q516);
+                    pixelFilterQ = Sse2.Subtract(pixelFilterQ, p516);
+                    resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p216)), 4);
+                    resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q216)), 4);
+                    flat2Q2P2 = Sse2.PackUnsignedSaturate(resP, resQ);
+
+                    pixetFilterP2P1P0 = Sse2.Subtract(pixetFilterP2P1P0, q116);
+                    pixetFilterQ2Q1Q0 = Sse2.Subtract(pixetFilterQ2Q1Q0, p116);
+
+                    resP = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterP2P1P0, Sse2.Add(sumP3, p216)), 3);
+                    resQ = Sse2.ShiftRightLogical(Sse2.Add(pixetFilterQ2Q1Q0, Sse2.Add(sumQ3, q216)), 3);
+                    flatQ2P2 = Sse2.PackUnsignedSaturate(resP, resQ);
+
+                    sumP7 = Sse2.Add(sumP7, p716);
+                    sumQ7 = Sse2.Add(sumQ7, q716);
+                    pixelFilterP = Sse2.Subtract(pixelFilterP, q416);
+                    pixelFilterQ = Sse2.Subtract(pixelFilterQ, p416);
+                    resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p316)), 4);
+                    resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q316)), 4);
+                    flat2Q3P3 = Sse2.PackUnsignedSaturate(resP, resQ);
+
+                    sumP7 = Sse2.Add(sumP7, p716);
+                    sumQ7 = Sse2.Add(sumQ7, q716);
+                    pixelFilterP = Sse2.Subtract(pixelFilterP, q316);
+                    pixelFilterQ = Sse2.Subtract(pixelFilterQ, p316);
+                    resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p416)), 4);
+                    resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q416)), 4);
+                    flat2Q4P4 = Sse2.PackUnsignedSaturate(resP, resQ);
+
+                    sumP7 = Sse2.Add(sumP7, p716);
+                    sumQ7 = Sse2.Add(sumQ7, q716);
+                    pixelFilterP = Sse2.Subtract(pixelFilterP, q216);
+                    pixelFilterQ = Sse2.Subtract(pixelFilterQ, p216);
+                    resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p516)), 4);
+                    resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q516)), 4);
+                    flat2Q5P5 = Sse2.PackUnsignedSaturate(resP, resQ);
+
+                    sumP7 = Sse2.Add(sumP7, p716);
+                    sumQ7 = Sse2.Add(sumQ7, q716);
+                    pixelFilterP = Sse2.Subtract(pixelFilterP, q116);
+                    pixelFilterQ = Sse2.Subtract(pixelFilterQ, p116);
+                    resP = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterP, Sse2.Add(sumP7, p616)), 4);
+                    resQ = Sse2.ShiftRightLogical(Sse2.Add(pixelFilterQ, Sse2.Add(sumQ7, q616)), 4);
+                    flat2Q6P6 = Sse2.PackUnsignedSaturate(resP, resQ);
+                }
+                // wide flat
+                // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+                // Shuffle control 68 (0b01_00_01_00) copies the low 64 bits into both halves, so the masks
+                // computed in the low half apply to the p half and the q half alike.
+                flat = Sse2.Shuffle(flat.AsInt32(), 68).AsByte();
+                flat2 = Sse2.Shuffle(flat2.AsInt32(), 68).AsByte();
+
+                // Each AndNot/And/Or triple is a bitwise select: mask bits set take the second source,
+                // mask bits clear keep the first one.
+                q2P2 = Sse2.AndNot(flat, q2P2);
+                flatQ2P2 = Sse2.And(flat, flatQ2P2);
+                q2P2 = Sse2.Or(q2P2, flatQ2P2);
+
+                // Rows 1 and 0 fall back to the values produced by the "lp filter" section above.
+                qs1Ps1 = Sse2.AndNot(flat, qs1Ps1);
+                flatQ1P1 = Sse2.And(flat, flatQ1P1);
+                q1P1 = Sse2.Or(qs1Ps1, flatQ1P1);
+
+                qs0Ps0 = Sse2.AndNot(flat, qs0Ps0);
+                flatQ0P0 = Sse2.And(flat, flatQ0P0);
+                q0P0 = Sse2.Or(qs0Ps0, flatQ0P0);
+
+                // Second selection pass with flat2, storing each row pair as soon as it is final:
+                // low 8 bytes go to the p row, high 8 bytes to the q row. Rows p7 and q7 are never written.
+                q6P6 = Sse2.AndNot(flat2, q6P6);
+                flat2Q6P6 = Sse2.And(flat2, flat2Q6P6);
+                q6P6 = Sse2.Or(q6P6, flat2Q6P6);
+                Sse2.StoreScalar((long*)(s.ToPointer() - (7 * pitch)), q6P6.AsInt64());
+                Sse.StoreHigh((float*)(s.ToPointer() + (6 * pitch)), q6P6.AsSingle());
+
+                q5P5 = Sse2.AndNot(flat2, q5P5);
+                flat2Q5P5 = Sse2.And(flat2, flat2Q5P5);
+                q5P5 = Sse2.Or(q5P5, flat2Q5P5);
+                Sse2.StoreScalar((long*)(s.ToPointer() - (6 * pitch)), q5P5.AsInt64());
+                Sse.StoreHigh((float*)(s.ToPointer() + (5 * pitch)), q5P5.AsSingle());
+
+                q4P4 = Sse2.AndNot(flat2, q4P4);
+                flat2Q4P4 = Sse2.And(flat2, flat2Q4P4);
+                q4P4 = Sse2.Or(q4P4, flat2Q4P4);
+                Sse2.StoreScalar((long*)(s.ToPointer() - (5 * pitch)), q4P4.AsInt64());
+                Sse.StoreHigh((float*)(s.ToPointer() + (4 * pitch)), q4P4.AsSingle());
+
+                // Row 3 has no flat-only candidate: it is either the flat2 result or the original pixels.
+                q3P3 = Sse2.AndNot(flat2, q3P3);
+                flat2Q3P3 = Sse2.And(flat2, flat2Q3P3);
+                q3P3 = Sse2.Or(q3P3, flat2Q3P3);
+                Sse2.StoreScalar((long*)(s.ToPointer() - (4 * pitch)), q3P3.AsInt64());
+                Sse.StoreHigh((float*)(s.ToPointer() + (3 * pitch)), q3P3.AsSingle());
+
+                q2P2 = Sse2.AndNot(flat2, q2P2);
+                flat2Q2P2 = Sse2.And(flat2, flat2Q2P2);
+                q2P2 = Sse2.Or(q2P2, flat2Q2P2);
+                Sse2.StoreScalar((long*)(s.ToPointer() - (3 * pitch)), q2P2.AsInt64());
+                Sse.StoreHigh((float*)(s.ToPointer() + (2 * pitch)), q2P2.AsSingle());
+
+                q1P1 = Sse2.AndNot(flat2, q1P1);
+                flat2Q1P1 = Sse2.And(flat2, flat2Q1P1);
+                q1P1 = Sse2.Or(q1P1, flat2Q1P1);
+                Sse2.StoreScalar((long*)(s.ToPointer() - (2 * pitch)), q1P1.AsInt64());
+                Sse.StoreHigh((float*)(s.ToPointer() + (1 * pitch)), q1P1.AsSingle());
+
+                q0P0 = Sse2.AndNot(flat2, q0P0);
+                flat2Q0P0 = Sse2.And(flat2, flat2Q0P0);
+                q0P0 = Sse2.Or(q0P0, flat2Q0P0);
+                Sse2.StoreScalar((long*)(s.ToPointer() - (1 * pitch)), q0P0.AsInt64());
+                Sse.StoreHigh((float*)(s.ToPointer() - (0 * pitch)), q0P0.AsSingle());
+            }
+        }
+
+        /// <summary>
+        /// Updates a running 16-bit sum lane by lane: the result is total + a1 - (s1 + s2) + a2.
+        /// </summary>
+        /// <param name="total">Current running sum.</param>
+        /// <param name="a1">First term to add.</param>
+        /// <param name="a2">Second term to add.</param>
+        /// <param name="s1">First term to remove.</param>
+        /// <param name="s2">Second term to remove.</param>
+        /// <returns>The updated running sum.</returns>
+        private static Vector128<short> FilterAdd2Sub2(
+            Vector128<short> total,
+            Vector128<short> a1,
+            Vector128<short> a2,
+            Vector128<short> s1,
+            Vector128<short> s2)
+        {
+            // The two outgoing terms are combined first so they can be removed with one subtraction.
+            Vector128<short> outgoing = Sse2.Add(s1, s2);
+            Vector128<short> withFirst = Sse2.Add(a1, total);
+            Vector128<short> reduced = Sse2.Subtract(withFirst, outgoing);
+
+            return Sse2.Add(reduced, a2);
+        }
+
+        /// <summary>
+        /// Shifts the 16-bit sums right by 3, packs them to bytes with unsigned saturation and bitwise-selects
+        /// between that result and <paramref name="otherFilt"/> using <paramref name="flat"/>.
+        /// </summary>
+        /// <param name="flat">Selection mask: set bits take the packed sums, clear bits keep <paramref name="otherFilt"/>.</param>
+        /// <param name="otherFilt">Value kept where <paramref name="flat"/> is clear.</param>
+        /// <param name="f8Lo">16-bit sums for the low 8 bytes of the result.</param>
+        /// <param name="f8Hi">16-bit sums for the high 8 bytes of the result.</param>
+        /// <returns>The blended byte vector.</returns>
+        private static Vector128<byte> Filter8Mask(
+            Vector128<byte> flat,
+            Vector128<byte> otherFilt,
+            Vector128<short> f8Lo,
+            Vector128<short> f8Hi)
+        {
+            Vector128<short> scaledLo = Sse2.ShiftRightLogical(f8Lo, 3);
+            Vector128<short> scaledHi = Sse2.ShiftRightLogical(f8Hi, 3);
+            Vector128<byte> packed = Sse2.PackUnsignedSaturate(scaledLo, scaledHi);
+
+            Vector128<byte> kept = Sse2.AndNot(flat, otherFilt);
+            Vector128<byte> chosen = Sse2.And(flat, packed);
+
+            return Sse2.Or(kept, chosen);
+        }
+
+        /// <summary>
+        /// Shifts the 16-bit sums right by 4, packs them to bytes with unsigned saturation and bitwise-selects
+        /// between that result and <paramref name="otherFilt"/> using <paramref name="flat"/>.
+        /// </summary>
+        /// <param name="flat">Selection mask: set bits take the packed sums, clear bits keep <paramref name="otherFilt"/>.</param>
+        /// <param name="otherFilt">Value kept where <paramref name="flat"/> is clear.</param>
+        /// <param name="fLo">16-bit sums for the low 8 bytes of the result.</param>
+        /// <param name="fHi">16-bit sums for the high 8 bytes of the result.</param>
+        /// <returns>The blended byte vector.</returns>
+        private static Vector128<byte> Filter16Mask(
+            Vector128<byte> flat,
+            Vector128<byte> otherFilt,
+            Vector128<short> fLo,
+            Vector128<short> fHi)
+        {
+            Vector128<short> scaledLo = Sse2.ShiftRightLogical(fLo, 4);
+            Vector128<short> scaledHi = Sse2.ShiftRightLogical(fHi, 4);
+            Vector128<byte> packed = Sse2.PackUnsignedSaturate(scaledLo, scaledHi);
+
+            Vector128<byte> kept = Sse2.AndNot(flat, otherFilt);
+            Vector128<byte> chosen = Sse2.And(flat, packed);
+
+            return Sse2.Or(kept, chosen);
+        }
+
+        public static unsafe void LpfHorizontal16Dual(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            Vector128<byte> zero = Vector128<byte>.Zero;
+            Vector128<byte> one = Vector128.Create((byte)1);
+            Vector128<byte> blimitV, limitV, threshV;
+
+            fixed (byte* pBLimit = blimit, pLimit = limit, pThresh = thresh)
+            {
+                blimitV = Sse2.LoadVector128(pBLimit);
+                limitV = Sse2.LoadVector128(pLimit);
+                threshV = Sse2.LoadVector128(pThresh);
+            }
+
+            Vector128<byte> mask, hev, flat, flat2;
+            Vector128<byte> p7, p6, p5;
+            Vector128<byte> p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+            Vector128<byte> q5, q6, q7;
+
+            Vector128<byte> op2, op1, op0, oq0, oq1, oq2;
+
+            Vector128<byte> maxAbsP1P0Q1Q0;
+
+            p7 = Sse2.LoadVector128(s.ToPointer() - (8 * pitch));
+            p6 = Sse2.LoadVector128(s.ToPointer() - (7 * pitch));
+            p5 = Sse2.LoadVector128(s.ToPointer() - (6 * pitch));
+            p4 = Sse2.LoadVector128(s.ToPointer() - (5 * pitch));
+            p3 = Sse2.LoadVector128(s.ToPointer() - (4 * pitch));
+            p2 = Sse2.LoadVector128(s.ToPointer() - (3 * pitch));
+            p1 = Sse2.LoadVector128(s.ToPointer() - (2 * pitch));
+            p0 = Sse2.LoadVector128(s.ToPointer() - (1 * pitch));
+            q0 = Sse2.LoadVector128(s.ToPointer() - (0 * pitch));
+            q1 = Sse2.LoadVector128(s.ToPointer() + (1 * pitch));
+            q2 = Sse2.LoadVector128(s.ToPointer() + (2 * pitch));
+            q3 = Sse2.LoadVector128(s.ToPointer() + (3 * pitch));
+            q4 = Sse2.LoadVector128(s.ToPointer() + (4 * pitch));
+            q5 = Sse2.LoadVector128(s.ToPointer() + (5 * pitch));
+            q6 = Sse2.LoadVector128(s.ToPointer() + (6 * pitch));
+            q7 = Sse2.LoadVector128(s.ToPointer() + (7 * pitch));
+
+            {
+                Vector128<byte> absP1P0 = AbsDiff(p1, p0);
+                Vector128<byte> absQ1Q0 = AbsDiff(q1, q0);
+                Vector128<byte> fe = Vector128.Create((byte)0xfe);
+                Vector128<byte> ff = Sse2.CompareEqual(zero, zero);
+                Vector128<byte> absP0Q0 = AbsDiff(p0, q0);
+                Vector128<byte> absP1Q1 = AbsDiff(p1, q1);
+                Vector128<byte> work;
+                maxAbsP1P0Q1Q0 = Sse2.Max(absP1P0, absQ1Q0);
+
+                absP0Q0 = Sse2.AddSaturate(absP0Q0, absP0Q0);
+                absP1Q1 = Sse2.ShiftRightLogical(Sse2.And(absP1Q1, fe).AsInt16(), 1).AsByte();
+                mask = Sse2.SubtractSaturate(Sse2.AddSaturate(absP0Q0, absP1Q1), blimitV);
+                mask = Sse2.Xor(Sse2.CompareEqual(mask, zero), ff);
+                // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+                mask = Sse2.Max(maxAbsP1P0Q1Q0, mask);
+                // mask |= (abs(p1 - p0) > limit) * -1;
+                // mask |= (abs(q1 - q0) > limit) * -1;
+                work = Sse2.Max(AbsDiff(p2, p1), AbsDiff(p3, p2));
+                mask = Sse2.Max(work, mask);
+                work = Sse2.Max(AbsDiff(q2, q1), AbsDiff(q3, q2));
+                mask = Sse2.Max(work, mask);
+                mask = Sse2.SubtractSaturate(mask, limitV);
+                mask = Sse2.CompareEqual(mask, zero);
+            }
+
+            {
+                Vector128<byte> work;
+                work = Sse2.Max(AbsDiff(p2, p0), AbsDiff(q2, q0));
+                flat = Sse2.Max(work, maxAbsP1P0Q1Q0);
+                work = Sse2.Max(AbsDiff(p3, p0), AbsDiff(q3, q0));
+                flat = Sse2.Max(work, flat);
+                work = Sse2.Max(AbsDiff(p4, p0), AbsDiff(q4, q0));
+                flat = Sse2.SubtractSaturate(flat, one);
+                flat = Sse2.CompareEqual(flat, zero);
+                flat = Sse2.And(flat, mask);
+                flat2 = Sse2.Max(AbsDiff(p5, p0), AbsDiff(q5, q0));
+                flat2 = Sse2.Max(work, flat2);
+                work = Sse2.Max(AbsDiff(p6, p0), AbsDiff(q6, q0));
+                flat2 = Sse2.Max(work, flat2);
+                work = Sse2.Max(AbsDiff(p7, p0), AbsDiff(q7, q0));
+                flat2 = Sse2.Max(work, flat2);
+                flat2 = Sse2.SubtractSaturate(flat2, one);
+                flat2 = Sse2.CompareEqual(flat2, zero);
+                flat2 = Sse2.And(flat2, flat); // flat2 & flat & mask
+            }
+
+            // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+            // filter4
+            {
+                Vector128<byte> t4 = Vector128.Create((byte)4);
+                Vector128<byte> t3 = Vector128.Create((byte)3);
+                Vector128<byte> t80 = Vector128.Create((byte)0x80);
+                Vector128<byte> te0 = Vector128.Create((byte)0xe0);
+                Vector128<byte> t1F = Vector128.Create((byte)0x1f);
+                Vector128<byte> t1 = Vector128.Create((byte)0x1);
+                Vector128<byte> t7F = Vector128.Create((byte)0x7f);
+                Vector128<byte> ff = Sse2.CompareEqual(t4, t4);
+
+                Vector128<byte> filt;
+                Vector128<byte> workA;
+                Vector128<byte> filter1, filter2;
+
+                op1 = Sse2.Xor(p1, t80);
+                op0 = Sse2.Xor(p0, t80);
+                oq0 = Sse2.Xor(q0, t80);
+                oq1 = Sse2.Xor(q1, t80);
+
+                hev = Sse2.SubtractSaturate(maxAbsP1P0Q1Q0, threshV);
+                hev = Sse2.Xor(Sse2.CompareEqual(hev, zero), ff);
+                filt = Sse2.And(Sse2.SubtractSaturate(op1.AsSByte(), oq1.AsSByte()).AsByte(), hev);
+
+                workA = Sse2.SubtractSaturate(oq0.AsSByte(), op0.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                // (vpx_filter + 3 * (qs0 - ps0)) & mask
+                filt = Sse2.And(filt, mask);
+                filter1 = Sse2.AddSaturate(filt.AsSByte(), t4.AsSByte()).AsByte();
+                filter2 = Sse2.AddSaturate(filt.AsSByte(), t3.AsSByte()).AsByte();
+
+                // Filter1 >> 3
+                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter1.AsSByte()).AsByte();
+                filter1 = Sse2.ShiftRightLogical(filter1.AsInt16(), 3).AsByte();
+                workA = Sse2.And(workA, te0);
+                filter1 = Sse2.And(filter1, t1F);
+                filter1 = Sse2.Or(filter1, workA);
+                oq0 = Sse2.Xor(Sse2.SubtractSaturate(oq0.AsSByte(), filter1.AsSByte()).AsByte(), t80);
+
+                // Filter2 >> 3
+                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter2.AsSByte()).AsByte();
+                filter2 = Sse2.ShiftRightLogical(filter2.AsInt16(), 3).AsByte();
+                workA = Sse2.And(workA, te0);
+                filter2 = Sse2.And(filter2, t1F);
+                filter2 = Sse2.Or(filter2, workA);
+                op0 = Sse2.Xor(Sse2.AddSaturate(op0.AsSByte(), filter2.AsSByte()).AsByte(), t80);
+
+                // filt >> 1
+                filt = Sse2.AddSaturate(filter1.AsSByte(), t1.AsSByte()).AsByte();
+                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filt.AsSByte()).AsByte();
+                filt = Sse2.ShiftRightLogical(filt.AsInt16(), 1).AsByte();
+                workA = Sse2.And(workA, t80);
+                filt = Sse2.And(filt, t7F);
+                filt = Sse2.Or(filt, workA);
+                filt = Sse2.AndNot(hev, filt);
+                op1 = Sse2.Xor(Sse2.AddSaturate(op1.AsSByte(), filt.AsSByte()).AsByte(), t80);
+                oq1 = Sse2.Xor(Sse2.SubtractSaturate(oq1.AsSByte(), filt.AsSByte()).AsByte(), t80);
+                // loopfilter done
+
+                // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                // filter8
+                {
+                    Vector128<short> four = Vector128.Create((short)4);
+                    Vector128<short> p3Lo = Sse2.UnpackLow(p3, zero).AsInt16();
+                    Vector128<short> p2Lo = Sse2.UnpackLow(p2, zero).AsInt16();
+                    Vector128<short> p1Lo = Sse2.UnpackLow(p1, zero).AsInt16();
+                    Vector128<short> p0Lo = Sse2.UnpackLow(p0, zero).AsInt16();
+                    Vector128<short> q0Lo = Sse2.UnpackLow(q0, zero).AsInt16();
+                    Vector128<short> q1Lo = Sse2.UnpackLow(q1, zero).AsInt16();
+                    Vector128<short> q2Lo = Sse2.UnpackLow(q2, zero).AsInt16();
+                    Vector128<short> q3Lo = Sse2.UnpackLow(q3, zero).AsInt16();
+
+                    Vector128<short> p3Hi = Sse2.UnpackHigh(p3, zero).AsInt16();
+                    Vector128<short> p2Hi = Sse2.UnpackHigh(p2, zero).AsInt16();
+                    Vector128<short> p1Hi = Sse2.UnpackHigh(p1, zero).AsInt16();
+                    Vector128<short> p0Hi = Sse2.UnpackHigh(p0, zero).AsInt16();
+                    Vector128<short> q0Hi = Sse2.UnpackHigh(q0, zero).AsInt16();
+                    Vector128<short> q1Hi = Sse2.UnpackHigh(q1, zero).AsInt16();
+                    Vector128<short> q2Hi = Sse2.UnpackHigh(q2, zero).AsInt16();
+                    Vector128<short> q3Hi = Sse2.UnpackHigh(q3, zero).AsInt16();
+                    Vector128<short> f8Lo, f8Hi;
+
+                    f8Lo = Sse2.Add(Sse2.Add(p3Lo, four), Sse2.Add(p3Lo, p2Lo));
+                    f8Lo = Sse2.Add(Sse2.Add(p3Lo, f8Lo), Sse2.Add(p2Lo, p1Lo));
+                    f8Lo = Sse2.Add(Sse2.Add(p0Lo, q0Lo), f8Lo);
+
+                    f8Hi = Sse2.Add(Sse2.Add(p3Hi, four), Sse2.Add(p3Hi, p2Hi));
+                    f8Hi = Sse2.Add(Sse2.Add(p3Hi, f8Hi), Sse2.Add(p2Hi, p1Hi));
+                    f8Hi = Sse2.Add(Sse2.Add(p0Hi, q0Hi), f8Hi);
+
+                    op2 = Filter8Mask(flat, p2, f8Lo, f8Hi);
+
+                    f8Lo = FilterAdd2Sub2(f8Lo, q1Lo, p1Lo, p2Lo, p3Lo);
+                    f8Hi = FilterAdd2Sub2(f8Hi, q1Hi, p1Hi, p2Hi, p3Hi);
+                    op1 = Filter8Mask(flat, op1, f8Lo, f8Hi);
+
+                    f8Lo = FilterAdd2Sub2(f8Lo, q2Lo, p0Lo, p1Lo, p3Lo);
+                    f8Hi = FilterAdd2Sub2(f8Hi, q2Hi, p0Hi, p1Hi, p3Hi);
+                    op0 = Filter8Mask(flat, op0, f8Lo, f8Hi);
+
+                    f8Lo = FilterAdd2Sub2(f8Lo, q3Lo, q0Lo, p0Lo, p3Lo);
+                    f8Hi = FilterAdd2Sub2(f8Hi, q3Hi, q0Hi, p0Hi, p3Hi);
+                    oq0 = Filter8Mask(flat, oq0, f8Lo, f8Hi);
+
+                    f8Lo = FilterAdd2Sub2(f8Lo, q3Lo, q1Lo, q0Lo, p2Lo);
+                    f8Hi = FilterAdd2Sub2(f8Hi, q3Hi, q1Hi, q0Hi, p2Hi);
+                    oq1 = Filter8Mask(flat, oq1, f8Lo, f8Hi);
+
+                    f8Lo = FilterAdd2Sub2(f8Lo, q3Lo, q2Lo, q1Lo, p1Lo);
+                    f8Hi = FilterAdd2Sub2(f8Hi, q3Hi, q2Hi, q1Hi, p1Hi);
+                    oq2 = Filter8Mask(flat, q2, f8Lo, f8Hi);
+                }
+
+                // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+                // wide flat calculations
+                {
+                    Vector128<short> eight = Vector128.Create((short)8);
+                    Vector128<short> p7Lo = Sse2.UnpackLow(p7, zero).AsInt16();
+                    Vector128<short> p6Lo = Sse2.UnpackLow(p6, zero).AsInt16();
+                    Vector128<short> p5Lo = Sse2.UnpackLow(p5, zero).AsInt16();
+                    Vector128<short> p4Lo = Sse2.UnpackLow(p4, zero).AsInt16();
+                    Vector128<short> p3Lo = Sse2.UnpackLow(p3, zero).AsInt16();
+                    Vector128<short> p2Lo = Sse2.UnpackLow(p2, zero).AsInt16();
+                    Vector128<short> p1Lo = Sse2.UnpackLow(p1, zero).AsInt16();
+                    Vector128<short> p0Lo = Sse2.UnpackLow(p0, zero).AsInt16();
+                    Vector128<short> q0Lo = Sse2.UnpackLow(q0, zero).AsInt16();
+                    Vector128<short> q1Lo = Sse2.UnpackLow(q1, zero).AsInt16();
+                    Vector128<short> q2Lo = Sse2.UnpackLow(q2, zero).AsInt16();
+                    Vector128<short> q3Lo = Sse2.UnpackLow(q3, zero).AsInt16();
+                    Vector128<short> q4Lo = Sse2.UnpackLow(q4, zero).AsInt16();
+                    Vector128<short> q5Lo = Sse2.UnpackLow(q5, zero).AsInt16();
+                    Vector128<short> q6Lo = Sse2.UnpackLow(q6, zero).AsInt16();
+                    Vector128<short> q7Lo = Sse2.UnpackLow(q7, zero).AsInt16();
+
+                    Vector128<short> p7Hi = Sse2.UnpackHigh(p7, zero).AsInt16();
+                    Vector128<short> p6Hi = Sse2.UnpackHigh(p6, zero).AsInt16();
+                    Vector128<short> p5Hi = Sse2.UnpackHigh(p5, zero).AsInt16();
+                    Vector128<short> p4Hi = Sse2.UnpackHigh(p4, zero).AsInt16();
+                    Vector128<short> p3Hi = Sse2.UnpackHigh(p3, zero).AsInt16();
+                    Vector128<short> p2Hi = Sse2.UnpackHigh(p2, zero).AsInt16();
+                    Vector128<short> p1Hi = Sse2.UnpackHigh(p1, zero).AsInt16();
+                    Vector128<short> p0Hi = Sse2.UnpackHigh(p0, zero).AsInt16();
+                    Vector128<short> q0Hi = Sse2.UnpackHigh(q0, zero).AsInt16();
+                    Vector128<short> q1Hi = Sse2.UnpackHigh(q1, zero).AsInt16();
+                    Vector128<short> q2Hi = Sse2.UnpackHigh(q2, zero).AsInt16();
+                    Vector128<short> q3Hi = Sse2.UnpackHigh(q3, zero).AsInt16();
+                    Vector128<short> q4Hi = Sse2.UnpackHigh(q4, zero).AsInt16();
+                    Vector128<short> q5Hi = Sse2.UnpackHigh(q5, zero).AsInt16();
+                    Vector128<short> q6Hi = Sse2.UnpackHigh(q6, zero).AsInt16();
+                    Vector128<short> q7Hi = Sse2.UnpackHigh(q7, zero).AsInt16();
+
+                    Vector128<short> fLo;
+                    Vector128<short> fHi;
+
+                    fLo = Sse2.Subtract(Sse2.ShiftLeftLogical(p7Lo, 3), p7Lo); // p7 * 7
+                    fLo = Sse2.Add(Sse2.ShiftLeftLogical(p6Lo, 1), Sse2.Add(p4Lo, fLo));
+                    fLo = Sse2.Add(Sse2.Add(p3Lo, fLo), Sse2.Add(p2Lo, p1Lo));
+                    fLo = Sse2.Add(Sse2.Add(p0Lo, q0Lo), fLo);
+                    fLo = Sse2.Add(Sse2.Add(p5Lo, eight), fLo);
+
+                    fHi = Sse2.Subtract(Sse2.ShiftLeftLogical(p7Hi, 3), p7Hi); // p7 * 7
+                    fHi = Sse2.Add(Sse2.ShiftLeftLogical(p6Hi, 1), Sse2.Add(p4Hi, fHi));
+                    fHi = Sse2.Add(Sse2.Add(p3Hi, fHi), Sse2.Add(p2Hi, p1Hi));
+                    fHi = Sse2.Add(Sse2.Add(p0Hi, q0Hi), fHi);
+                    fHi = Sse2.Add(Sse2.Add(p5Hi, eight), fHi);
+
+                    p6 = Filter16Mask(flat2, p6, fLo, fHi);
+                    Sse2.Store(s.ToPointer() - (7 * pitch), p6);
+
+                    fLo = FilterAdd2Sub2(fLo, q1Lo, p5Lo, p6Lo, p7Lo);
+                    fHi = FilterAdd2Sub2(fHi, q1Hi, p5Hi, p6Hi, p7Hi);
+                    p5 = Filter16Mask(flat2, p5, fLo, fHi);
+                    Sse2.Store(s.ToPointer() - (6 * pitch), p5);
+
+                    fLo = FilterAdd2Sub2(fLo, q2Lo, p4Lo, p5Lo, p7Lo);
+                    fHi = FilterAdd2Sub2(fHi, q2Hi, p4Hi, p5Hi, p7Hi);
+                    p4 = Filter16Mask(flat2, p4, fLo, fHi);
+                    Sse2.Store(s.ToPointer() - (5 * pitch), p4);
+
+                    fLo = FilterAdd2Sub2(fLo, q3Lo, p3Lo, p4Lo, p7Lo);
+                    fHi = FilterAdd2Sub2(fHi, q3Hi, p3Hi, p4Hi, p7Hi);
+                    p3 = Filter16Mask(flat2, p3, fLo, fHi);
+                    Sse2.Store(s.ToPointer() - (4 * pitch), p3);
+
+                    fLo = FilterAdd2Sub2(fLo, q4Lo, p2Lo, p3Lo, p7Lo);
+                    fHi = FilterAdd2Sub2(fHi, q4Hi, p2Hi, p3Hi, p7Hi);
+                    op2 = Filter16Mask(flat2, op2, fLo, fHi);
+                    Sse2.Store(s.ToPointer() - (3 * pitch), op2);
+
+                    fLo = FilterAdd2Sub2(fLo, q5Lo, p1Lo, p2Lo, p7Lo);
+                    fHi = FilterAdd2Sub2(fHi, q5Hi, p1Hi, p2Hi, p7Hi);
+                    op1 = Filter16Mask(flat2, op1, fLo, fHi);
+                    Sse2.Store(s.ToPointer() - (2 * pitch), op1);
+
+                    fLo = FilterAdd2Sub2(fLo, q6Lo, p0Lo, p1Lo, p7Lo);
+                    fHi = FilterAdd2Sub2(fHi, q6Hi, p0Hi, p1Hi, p7Hi);
+                    op0 = Filter16Mask(flat2, op0, fLo, fHi);
+                    Sse2.Store(s.ToPointer() - (1 * pitch), op0);
+
+                    fLo = FilterAdd2Sub2(fLo, q7Lo, q0Lo, p0Lo, p7Lo);
+                    fHi = FilterAdd2Sub2(fHi, q7Hi, q0Hi, p0Hi, p7Hi);
+                    oq0 = Filter16Mask(flat2, oq0, fLo, fHi);
+                    Sse2.Store(s.ToPointer() - (0 * pitch), oq0);
+
+                    fLo = FilterAdd2Sub2(fLo, q7Lo, q1Lo, p6Lo, q0Lo);
+                    fHi = FilterAdd2Sub2(fHi, q7Hi, q1Hi, p6Hi, q0Hi);
+                    oq1 = Filter16Mask(flat2, oq1, fLo, fHi);
+                    Sse2.Store(s.ToPointer() + (1 * pitch), oq1);
+
+                    fLo = FilterAdd2Sub2(fLo, q7Lo, q2Lo, p5Lo, q1Lo);
+                    fHi = FilterAdd2Sub2(fHi, q7Hi, q2Hi, p5Hi, q1Hi);
+                    oq2 = Filter16Mask(flat2, oq2, fLo, fHi);
+                    Sse2.Store(s.ToPointer() + (2 * pitch), oq2);
+
+                    fLo = FilterAdd2Sub2(fLo, q7Lo, q3Lo, p4Lo, q2Lo);
+                    fHi = FilterAdd2Sub2(fHi, q7Hi, q3Hi, p4Hi, q2Hi);
+                    q3 = Filter16Mask(flat2, q3, fLo, fHi);
+                    Sse2.Store(s.ToPointer() + (3 * pitch), q3);
+
+                    fLo = FilterAdd2Sub2(fLo, q7Lo, q4Lo, p3Lo, q3Lo);
+                    fHi = FilterAdd2Sub2(fHi, q7Hi, q4Hi, p3Hi, q3Hi);
+                    q4 = Filter16Mask(flat2, q4, fLo, fHi);
+                    Sse2.Store(s.ToPointer() + (4 * pitch), q4);
+
+                    fLo = FilterAdd2Sub2(fLo, q7Lo, q5Lo, p2Lo, q4Lo);
+                    fHi = FilterAdd2Sub2(fHi, q7Hi, q5Hi, p2Hi, q4Hi);
+                    q5 = Filter16Mask(flat2, q5, fLo, fHi);
+                    Sse2.Store(s.ToPointer() + (5 * pitch), q5);
+
+                    fLo = FilterAdd2Sub2(fLo, q7Lo, q6Lo, p1Lo, q5Lo);
+                    fHi = FilterAdd2Sub2(fHi, q7Hi, q6Hi, p1Hi, q5Hi);
+                    q6 = Filter16Mask(flat2, q6, fLo, fHi);
+                    Sse2.Store(s.ToPointer() + (6 * pitch), q6);
+                }
+                // wide flat
+                // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+            }
+        }
+
+        /// <summary>
+        /// SSE2 loop filter for one run of 8 pixels, operating on the rows directly above and below <paramref name="s"/>.
+        /// </summary>
+        /// <remarks>
+        /// Reads 8 bytes from each of the eight rows <c>s - 4 * pitch</c> .. <c>s + 3 * pitch</c> and writes 8 bytes
+        /// back to the six rows <c>s - 3 * pitch</c> .. <c>s + 2 * pitch</c>.
+        /// For every pixel column, the output is selected as follows:
+        /// columns where <c>flat</c> is set take the rounded 8-sample averages (<c>(sum + 4) >> 3</c>);
+        /// all other columns take the narrow filter result for p1, p0, q0 and q1, and keep p2 and q2 unchanged.
+        /// </remarks>
+        /// <param name="s">Pointer to the q0 row; p rows are at negative multiples of <paramref name="pitch"/>, q rows at positive ones</param>
+        /// <param name="pitch">Distance in bytes between two consecutive rows</param>
+        /// <param name="blimit">Edge limit, compared against <c>2 * |p0 - q0| + |p1 - q1| / 2</c>; 16 bytes are loaded from it</param>
+        /// <param name="limit">Limit compared against the differences of neighbouring samples; 16 bytes are loaded from it</param>
+        /// <param name="thresh">High edge variance threshold, compared against <c>max(|p1 - p0|, |q1 - q0|)</c>; 16 bytes are loaded from it</param>
+        public static unsafe void LpfHorizontal8(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            // Scratch storage for the "flat" filter results. Only the low 8 bytes of each are written and read back.
+            Vector128<byte> flatOp2;
+            Vector128<byte> flatOp1;
+            Vector128<byte> flatOp0;
+            Vector128<byte> flatOq2;
+            Vector128<byte> flatOq1;
+            Vector128<byte> flatOq0;
+            Vector128<byte> zero = Vector128<byte>.Zero;
+            Vector128<byte> blimitV, limitV, threshV;
+
+            fixed (byte* pBLimit = blimit, pLimit = limit, pThresh = thresh)
+            {
+                blimitV = Sse2.LoadVector128(pBLimit);
+                limitV = Sse2.LoadVector128(pLimit);
+                threshV = Sse2.LoadVector128(pThresh);
+            }
+
+            Vector128<byte> mask, hev, flat;
+            Vector128<byte> p3, p2, p1, p0, q0, q1, q2, q3;
+            Vector128<byte> q3P3, q2P2, q1P1, q0P0, p1Q1, p0Q0;
+
+            // Pack each p row (low 64 bits) together with its mirrored q row (high 64 bits),
+            // so that one 128-bit operation handles both sides of the edge.
+            q3P3 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (4 * pitch))),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (3 * pitch)))).AsByte();
+            q2P2 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (3 * pitch))),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (2 * pitch)))).AsByte();
+            q1P1 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (2 * pitch))),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() + (1 * pitch)))).AsByte();
+            q0P0 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (1 * pitch))),
+                Sse2.LoadScalarVector128((long*)(s.ToPointer() - (0 * pitch)))).AsByte();
+            // Shuffle control 78 (0b01_00_11_10) swaps the two 64-bit halves: q in the low half, p in the high half.
+            p1Q1 = Sse2.Shuffle(q1P1.AsInt32(), 78).AsByte();
+            p0Q0 = Sse2.Shuffle(q0P0.AsInt32(), 78).AsByte();
+
+            {
+                // filter_mask and hev_mask
+                Vector128<byte> one = Vector128.Create((byte)1);
+                Vector128<byte> fe = Vector128.Create((byte)0xfe);
+                Vector128<byte> ff = Sse2.CompareEqual(fe, fe);
+                Vector128<byte> absP1Q1, absP0Q0, absQ1Q0, absP1P0, work;
+                // Low half: p1 vs p0, high half: q1 vs q0. The byte shift brings the q half down to the low half.
+                absP1P0 = AbsDiff(q1P1, q0P0);
+                absQ1Q0 = Sse2.ShiftRightLogical128BitLane(absP1P0, 8);
+
+                absP0Q0 = AbsDiff(q0P0, p0Q0);
+                absP1Q1 = AbsDiff(q1P1, p1Q1);
+                flat = Sse2.Max(absP1P0, absQ1Q0);
+                // hev lanes become 0xff where the saturated difference (flat - thresh) is non-zero.
+                hev = Sse2.SubtractSaturate(flat, threshV);
+                hev = Sse2.Xor(Sse2.CompareEqual(hev, zero), ff);
+
+                absP0Q0 = Sse2.AddSaturate(absP0Q0, absP0Q0);
+                // Clearing bit 0 first keeps the 16-bit shift from leaking a bit into the neighbouring byte.
+                absP1Q1 = Sse2.ShiftRightLogical(Sse2.And(absP1Q1, fe).AsInt16(), 1).AsByte();
+                mask = Sse2.SubtractSaturate(Sse2.AddSaturate(absP0Q0, absP1Q1), blimitV);
+                mask = Sse2.Xor(Sse2.CompareEqual(mask, zero), ff);
+                // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+                mask = Sse2.Max(absP1P0, mask);
+                // mask |= (abs(p1 - p0) > limit) * -1;
+                // mask |= (abs(q1 - q0) > limit) * -1;
+
+                work = Sse2.Max(AbsDiff(q2P2, q1P1), AbsDiff(q3P3, q2P2));
+                mask = Sse2.Max(work, mask);
+                // Fold the q half onto the p half so the low 8 lanes hold the maximum over both sides.
+                mask = Sse2.Max(mask, Sse2.ShiftRightLogical128BitLane(mask, 8));
+                mask = Sse2.SubtractSaturate(mask, limitV);
+                mask = Sse2.CompareEqual(mask, zero);
+
+                // flat_mask4
+
+                flat = Sse2.Max(AbsDiff(q2P2, q0P0), AbsDiff(q3P3, q0P0));
+                flat = Sse2.Max(absP1P0, flat);
+                flat = Sse2.Max(flat, Sse2.ShiftRightLogical128BitLane(flat, 8));
+                // flat lanes become 0xff where every compared difference is at most 1, and only where mask is set.
+                flat = Sse2.SubtractSaturate(flat, one);
+                flat = Sse2.CompareEqual(flat, zero);
+                flat = Sse2.And(flat, mask);
+            }
+
+            {
+                Vector128<short> four = Vector128.Create((short)4);
+                {
+                    Vector128<short> workpA, workpB, workpShft;
+                    // Widen each row to 16-bit lanes (zero extension) so the running sums cannot overflow.
+                    p3 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(s.ToPointer() - (4 * pitch))).AsByte(), zero);
+                    p2 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(s.ToPointer() - (3 * pitch))).AsByte(), zero);
+                    p1 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(s.ToPointer() - (2 * pitch))).AsByte(), zero);
+                    p0 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(s.ToPointer() - (1 * pitch))).AsByte(), zero);
+                    q0 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(s.ToPointer() - (0 * pitch))).AsByte(), zero);
+                    q1 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(s.ToPointer() + (1 * pitch))).AsByte(), zero);
+                    q2 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(s.ToPointer() + (2 * pitch))).AsByte(), zero);
+                    q3 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(s.ToPointer() + (3 * pitch))).AsByte(), zero);
+
+                    // op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
+                    // The sum is split in two parts (workpA / workpB) that are updated incrementally
+                    // for each following output instead of being recomputed from scratch.
+                    workpA = Sse2.Add(Sse2.Add(p3.AsInt16(), p3.AsInt16()), Sse2.Add(p2.AsInt16(), p1.AsInt16()));
+                    workpA = Sse2.Add(Sse2.Add(workpA, four), p0.AsInt16());
+                    workpB = Sse2.Add(Sse2.Add(q0.AsInt16(), p2.AsInt16()), p3.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOp2, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    workpB = Sse2.Add(Sse2.Add(q0.AsInt16(), q1.AsInt16()), p1.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOp1, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    workpA = Sse2.Add(Sse2.Subtract(workpA, p3.AsInt16()), q2.AsInt16());
+                    workpB = Sse2.Add(Sse2.Subtract(workpB, p1.AsInt16()), p0.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOp0, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    workpA = Sse2.Add(Sse2.Subtract(workpA, p3.AsInt16()), q3.AsInt16());
+                    workpB = Sse2.Add(Sse2.Subtract(workpB, p0.AsInt16()), q0.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOq0, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    workpA = Sse2.Add(Sse2.Subtract(workpA, p2.AsInt16()), q3.AsInt16());
+                    workpB = Sse2.Add(Sse2.Subtract(workpB, q0.AsInt16()), q1.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOq1, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    workpA = Sse2.Add(Sse2.Subtract(workpA, p1.AsInt16()), q3.AsInt16());
+                    workpB = Sse2.Add(Sse2.Subtract(workpB, q1.AsInt16()), q2.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOq2, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+                }
+            }
+            // lp filter
+            {
+                Vector128<byte> t4 = Vector128.Create((byte)4);
+                Vector128<byte> t3 = Vector128.Create((byte)3);
+                Vector128<byte> t80 = Vector128.Create((byte)0x80);
+                Vector128<byte> t1 = Vector128.Create((byte)0x1);
+                // XOR with 0x80 maps the unsigned pixels to signed bytes for the saturating signed arithmetic below.
+                Vector128<byte> ps1 =
+                    Sse2.Xor(Sse2.LoadScalarVector128((long*)(s.ToPointer() - (2 * pitch))).AsByte(),
+                        t80);
+                Vector128<byte> ps0 =
+                    Sse2.Xor(Sse2.LoadScalarVector128((long*)(s.ToPointer() - (1 * pitch))).AsByte(),
+                        t80);
+                Vector128<byte> qs0 =
+                    Sse2.Xor(Sse2.LoadScalarVector128((long*)(s.ToPointer() + (0 * pitch))).AsByte(),
+                        t80);
+                Vector128<byte> qs1 =
+                    Sse2.Xor(Sse2.LoadScalarVector128((long*)(s.ToPointer() + (1 * pitch))).AsByte(),
+                        t80);
+                Vector128<byte> filt;
+                Vector128<byte> workA;
+                Vector128<byte> filter1, filter2;
+
+                filt = Sse2.And(Sse2.SubtractSaturate(ps1.AsSByte(), qs1.AsSByte()).AsByte(), hev);
+                workA = Sse2.SubtractSaturate(qs0.AsSByte(), ps0.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                // (vpx_filter + 3 * (qs0 - ps0)) & mask
+                filt = Sse2.And(filt, mask);
+
+                filter1 = Sse2.AddSaturate(filt.AsSByte(), t4.AsSByte()).AsByte();
+                filter2 = Sse2.AddSaturate(filt.AsSByte(), t3.AsSByte()).AsByte();
+
+                // SSE2 has no per-byte arithmetic shift. Each byte is placed in the high byte of a 16-bit lane,
+                // shifted arithmetically by 8 + n, and packed back to signed bytes.
+
+                // Filter1 >> 3
+                filter1 = Sse2.UnpackLow(zero, filter1);
+                filter1 = Sse2.ShiftRightArithmetic(filter1.AsInt16(), 11).AsByte();
+                filter1 = Sse2.PackSignedSaturate(filter1.AsInt16(), filter1.AsInt16()).AsByte();
+
+                // Filter2 >> 3
+                filter2 = Sse2.UnpackLow(zero, filter2);
+                filter2 = Sse2.ShiftRightArithmetic(filter2.AsInt16(), 11).AsByte();
+                filter2 = Sse2.PackSignedSaturate(filter2.AsInt16(), zero.AsInt16()).AsByte();
+
+                // filt >> 1
+                filt = Sse2.AddSaturate(filter1.AsSByte(), t1.AsSByte()).AsByte();
+                filt = Sse2.UnpackLow(zero, filt);
+                filt = Sse2.ShiftRightArithmetic(filt.AsInt16(), 9).AsByte();
+                filt = Sse2.PackSignedSaturate(filt.AsInt16(), zero.AsInt16()).AsByte();
+
+                // The outer taps (p1 / q1) are only adjusted where hev is not set.
+                filt = Sse2.AndNot(hev, filt);
+
+                // For each row: select the flat result where flat is set, the narrow result (or the
+                // untouched pixel for p2 / q2) elsewhere. The XOR with 0x80 converts back to unsigned.
+                workA = Sse2.Xor(Sse2.SubtractSaturate(qs0.AsSByte(), filter1.AsSByte()).AsByte(), t80);
+                q0 = Sse2.LoadScalarVector128((long*)&flatOq0).AsByte();
+                workA = Sse2.AndNot(flat, workA);
+                q0 = Sse2.And(flat, q0);
+                q0 = Sse2.Or(workA, q0);
+
+                workA = Sse2.Xor(Sse2.SubtractSaturate(qs1.AsSByte(), filt.AsSByte()).AsByte(), t80);
+                q1 = Sse2.LoadScalarVector128((long*)&flatOq1).AsByte();
+                workA = Sse2.AndNot(flat, workA);
+                q1 = Sse2.And(flat, q1);
+                q1 = Sse2.Or(workA, q1);
+
+                // Only the low 8 bytes are stored below, so an 8-byte load is sufficient here;
+                // a full 16-byte load would read 8 bytes beyond the pixels being filtered.
+                workA = Sse2.LoadScalarVector128((long*)(s.ToPointer() + (2 * pitch))).AsByte();
+                q2 = Sse2.LoadScalarVector128((long*)&flatOq2).AsByte();
+                workA = Sse2.AndNot(flat, workA);
+                q2 = Sse2.And(flat, q2);
+                q2 = Sse2.Or(workA, q2);
+
+                workA = Sse2.Xor(Sse2.AddSaturate(ps0.AsSByte(), filter2.AsSByte()).AsByte(), t80);
+                p0 = Sse2.LoadScalarVector128((long*)&flatOp0).AsByte();
+                workA = Sse2.AndNot(flat, workA);
+                p0 = Sse2.And(flat, p0);
+                p0 = Sse2.Or(workA, p0);
+
+                workA = Sse2.Xor(Sse2.AddSaturate(ps1.AsSByte(), filt.AsSByte()).AsByte(), t80);
+                p1 = Sse2.LoadScalarVector128((long*)&flatOp1).AsByte();
+                workA = Sse2.AndNot(flat, workA);
+                p1 = Sse2.And(flat, p1);
+                p1 = Sse2.Or(workA, p1);
+
+                // Same as for q2: 8-byte load, since only 8 bytes of this row are consumed.
+                workA = Sse2.LoadScalarVector128((long*)(s.ToPointer() - (3 * pitch))).AsByte();
+                p2 = Sse2.LoadScalarVector128((long*)&flatOp2).AsByte();
+                workA = Sse2.AndNot(flat, workA);
+                p2 = Sse2.And(flat, p2);
+                p2 = Sse2.Or(workA, p2);
+
+                Sse2.StoreScalar((long*)(s.ToPointer() - (3 * pitch)), p2.AsInt64());
+                Sse2.StoreScalar((long*)(s.ToPointer() - (2 * pitch)), p1.AsInt64());
+                Sse2.StoreScalar((long*)(s.ToPointer() - (1 * pitch)), p0.AsInt64());
+                Sse2.StoreScalar((long*)(s.ToPointer() + (0 * pitch)), q0.AsInt64());
+                Sse2.StoreScalar((long*)(s.ToPointer() + (1 * pitch)), q1.AsInt64());
+                Sse2.StoreScalar((long*)(s.ToPointer() + (2 * pitch)), q2.AsInt64());
+            }
+        }
+
+        public static unsafe void LpfHorizontal8Dual(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit0,
+            ReadOnlySpan<byte> limit0,
+            ReadOnlySpan<byte> thresh0,
+            ReadOnlySpan<byte> blimit1,
+            ReadOnlySpan<byte> limit1,
+            ReadOnlySpan<byte> thresh1)
+        {
+            Vector128<byte> flatOp2;
+            Vector128<byte> flatOp1;
+            Vector128<byte> flatOp0;
+            Vector128<byte> flatOq2;
+            Vector128<byte> flatOq1;
+            Vector128<byte> flatOq0;
+            Vector128<byte> zero = Vector128<byte>.Zero;
+            Vector128<byte> blimit, limit, thresh;
+
+            fixed (byte* pBLimit0 = blimit0, pLimit0 = limit0, pThresh0 = thresh0,
+                   pBLimit1 = blimit1, pLimit1 = limit1, pThresh1 = thresh1)
+            {
+                blimit = Sse2.UnpackLow(Sse2.LoadVector128(pBLimit0).AsInt64(), Sse2.LoadVector128(pBLimit1).AsInt64())
+                    .AsByte();
+                limit = Sse2.UnpackLow(Sse2.LoadVector128(pLimit0).AsInt64(), Sse2.LoadVector128(pLimit1).AsInt64())
+                    .AsByte();
+                thresh = Sse2.UnpackLow(Sse2.LoadVector128(pThresh0).AsInt64(), Sse2.LoadVector128(pThresh1).AsInt64())
+                    .AsByte();
+            }
+
+            Vector128<byte> mask, hev, flat;
+            Vector128<byte> p3, p2, p1, p0, q0, q1, q2, q3;
+
+            p3 = Sse2.LoadVector128(s.ToPointer() - (4 * pitch));
+            p2 = Sse2.LoadVector128(s.ToPointer() - (3 * pitch));
+            p1 = Sse2.LoadVector128(s.ToPointer() - (2 * pitch));
+            p0 = Sse2.LoadVector128(s.ToPointer() - (1 * pitch));
+            q0 = Sse2.LoadVector128(s.ToPointer() - (0 * pitch));
+            q1 = Sse2.LoadVector128(s.ToPointer() + (1 * pitch));
+            q2 = Sse2.LoadVector128(s.ToPointer() + (2 * pitch));
+            q3 = Sse2.LoadVector128(s.ToPointer() + (3 * pitch));
+            {
+                Vector128<byte> absP1P0 = Sse2.Or(Sse2.SubtractSaturate(p1, p0), Sse2.SubtractSaturate(p0, p1));
+                Vector128<byte> absQ1Q0 = Sse2.Or(Sse2.SubtractSaturate(q1, q0), Sse2.SubtractSaturate(q0, q1));
+                Vector128<byte> one = Vector128.Create((byte)1);
+                Vector128<byte> fe = Vector128.Create((byte)0xfe);
+                Vector128<byte> ff = Sse2.CompareEqual(absP1P0, absP1P0);
+                Vector128<byte> absP0Q0 = Sse2.Or(Sse2.SubtractSaturate(p0, q0), Sse2.SubtractSaturate(q0, p0));
+                Vector128<byte> absP1Q1 = Sse2.Or(Sse2.SubtractSaturate(p1, q1), Sse2.SubtractSaturate(q1, p1));
+                Vector128<byte> work;
+
+                // filter_mask and hev_mask
+                flat = Sse2.Max(absP1P0, absQ1Q0);
+                hev = Sse2.SubtractSaturate(flat, thresh);
+                hev = Sse2.Xor(Sse2.CompareEqual(hev, zero), ff);
+
+                absP0Q0 = Sse2.AddSaturate(absP0Q0, absP0Q0);
+                absP1Q1 = Sse2.ShiftRightLogical(Sse2.And(absP1Q1, fe).AsInt16(), 1).AsByte();
+                mask = Sse2.SubtractSaturate(Sse2.AddSaturate(absP0Q0, absP1Q1), blimit);
+                mask = Sse2.Xor(Sse2.CompareEqual(mask, zero), ff);
+                // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+                mask = Sse2.Max(flat, mask);
+                // mask |= (abs(p1 - p0) > limit) * -1;
+                // mask |= (abs(q1 - q0) > limit) * -1;
+                work = Sse2.Max(
+                    Sse2.Or(Sse2.SubtractSaturate(p2, p1), Sse2.SubtractSaturate(p1, p2)),
+                    Sse2.Or(Sse2.SubtractSaturate(p3, p2), Sse2.SubtractSaturate(p2, p3)));
+                mask = Sse2.Max(work, mask);
+                work = Sse2.Max(
+                    Sse2.Or(Sse2.SubtractSaturate(q2, q1), Sse2.SubtractSaturate(q1, q2)),
+                    Sse2.Or(Sse2.SubtractSaturate(q3, q2), Sse2.SubtractSaturate(q2, q3)));
+                mask = Sse2.Max(work, mask);
+                mask = Sse2.SubtractSaturate(mask, limit);
+                mask = Sse2.CompareEqual(mask, zero);
+
+                // flat_mask4
+                work = Sse2.Max(
+                    Sse2.Or(Sse2.SubtractSaturate(p2, p0), Sse2.SubtractSaturate(p0, p2)),
+                    Sse2.Or(Sse2.SubtractSaturate(q2, q0), Sse2.SubtractSaturate(q0, q2)));
+                flat = Sse2.Max(work, flat);
+                work = Sse2.Max(
+                    Sse2.Or(Sse2.SubtractSaturate(p3, p0), Sse2.SubtractSaturate(p0, p3)),
+                    Sse2.Or(Sse2.SubtractSaturate(q3, q0), Sse2.SubtractSaturate(q0, q3)));
+                flat = Sse2.Max(work, flat);
+                flat = Sse2.SubtractSaturate(flat, one);
+                flat = Sse2.CompareEqual(flat, zero);
+                flat = Sse2.And(flat, mask);
+            }
+            {
+                Vector128<short> four = Vector128.Create((short)4);
+                ArrayPtr<byte> src = s;
+                int i = 0;
+
+                do
+                {
+                    Vector128<short> workpA, workpB, workpShft;
+                    p3 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(src.ToPointer() - (4 * pitch))).AsByte(), zero);
+                    p2 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(src.ToPointer() - (3 * pitch))).AsByte(), zero);
+                    p1 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(src.ToPointer() - (2 * pitch))).AsByte(), zero);
+                    p0 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(src.ToPointer() - (1 * pitch))).AsByte(), zero);
+                    q0 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(src.ToPointer() - (0 * pitch))).AsByte(), zero);
+                    q1 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(src.ToPointer() + (1 * pitch))).AsByte(), zero);
+                    q2 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(src.ToPointer() + (2 * pitch))).AsByte(), zero);
+                    q3 = Sse2.UnpackLow(
+                        Sse2.LoadScalarVector128((long*)(src.ToPointer() + (3 * pitch))).AsByte(), zero);
+
+                    workpA = Sse2.Add(Sse2.Add(p3.AsInt16(), p3.AsInt16()), Sse2.Add(p2.AsInt16(), p1.AsInt16()));
+                    workpA = Sse2.Add(Sse2.Add(workpA, four), p0.AsInt16());
+                    workpB = Sse2.Add(Sse2.Add(q0.AsInt16(), p2.AsInt16()), p3.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOp2 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    workpB = Sse2.Add(Sse2.Add(q0.AsInt16(), q1.AsInt16()), p1.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOp1 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    workpA = Sse2.Add(Sse2.Subtract(workpA, p3.AsInt16()), q2.AsInt16());
+                    workpB = Sse2.Add(Sse2.Subtract(workpB, p1.AsInt16()), p0.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOp0 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    workpA = Sse2.Add(Sse2.Subtract(workpA, p3.AsInt16()), q3.AsInt16());
+                    workpB = Sse2.Add(Sse2.Subtract(workpB, p0.AsInt16()), q0.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOq0 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    workpA = Sse2.Add(Sse2.Subtract(workpA, p2.AsInt16()), q3.AsInt16());
+                    workpB = Sse2.Add(Sse2.Subtract(workpB, q0.AsInt16()), q1.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOq1 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    workpA = Sse2.Add(Sse2.Subtract(workpA, p1.AsInt16()), q3.AsInt16());
+                    workpB = Sse2.Add(Sse2.Subtract(workpB, q1.AsInt16()), q2.AsInt16());
+                    workpShft = Sse2.ShiftRightLogical(Sse2.Add(workpA, workpB), 3);
+                    Sse2.StoreScalar((long*)&flatOq2 + i, Sse2.PackUnsignedSaturate(workpShft, workpShft).AsInt64());
+
+                    src = src.Slice(8);
+                } while (++i < 2);
+            }
+            // lp filter
+            {
+                Vector128<byte> t4 = Vector128.Create((byte)4);
+                Vector128<byte> t3 = Vector128.Create((byte)3);
+                Vector128<byte> t80 = Vector128.Create((byte)0x80);
+                Vector128<byte> te0 = Vector128.Create((byte)0xe0);
+                Vector128<byte> t1F = Vector128.Create((byte)0x1f);
+                Vector128<byte> t1 = Vector128.Create((byte)0x1);
+                Vector128<byte> t7F = Vector128.Create((byte)0x7f);
+
+                Vector128<byte> ps1 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() - (2 * pitch)), t80);
+                Vector128<byte> ps0 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() - (1 * pitch)), t80);
+                Vector128<byte> qs0 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() + (0 * pitch)), t80);
+                Vector128<byte> qs1 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() + (1 * pitch)), t80);
+                Vector128<byte> filt;
+                Vector128<byte> workA;
+                Vector128<byte> filter1, filter2;
+
+                filt = Sse2.And(Sse2.SubtractSaturate(ps1.AsSByte(), qs1.AsSByte()).AsByte(), hev);
+                workA = Sse2.SubtractSaturate(qs0.AsSByte(), ps0.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                // (vpx_filter + 3 * (qs0 - ps0)) & mask
+                filt = Sse2.And(filt, mask);
+
+                filter1 = Sse2.AddSaturate(filt.AsSByte(), t4.AsSByte()).AsByte();
+                filter2 = Sse2.AddSaturate(filt.AsSByte(), t3.AsSByte()).AsByte();
+
+                // Filter1 >> 3
+                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter1.AsSByte()).AsByte();
+                filter1 = Sse2.ShiftRightLogical(filter1.AsInt16(), 3).AsByte();
+                workA = Sse2.And(workA, te0);
+                filter1 = Sse2.And(filter1, t1F);
+                filter1 = Sse2.Or(filter1, workA);
+
+                // Filter2 >> 3
+                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter2.AsSByte()).AsByte();
+                filter2 = Sse2.ShiftRightLogical(filter2.AsInt16(), 3).AsByte();
+                workA = Sse2.And(workA, te0);
+                filter2 = Sse2.And(filter2, t1F);
+                filter2 = Sse2.Or(filter2, workA);
+
+                // filt >> 1
+                filt = Sse2.AddSaturate(filter1.AsSByte(), t1.AsSByte()).AsByte();
+                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filt.AsSByte()).AsByte();
+                filt = Sse2.ShiftRightLogical(filt.AsInt16(), 1).AsByte();
+                workA = Sse2.And(workA, t80);
+                filt = Sse2.And(filt, t7F);
+                filt = Sse2.Or(filt, workA);
+
+                filt = Sse2.AndNot(hev, filt);
+
+                workA = Sse2.Xor(Sse2.SubtractSaturate(qs0.AsSByte(), filter1.AsSByte()).AsByte(), t80);
+                q0 = Sse2.LoadVector128((byte*)&flatOq0);
+                workA = Sse2.AndNot(flat, workA);
+                q0 = Sse2.And(flat, q0);
+                q0 = Sse2.Or(workA, q0);
+
+                workA = Sse2.Xor(Sse2.SubtractSaturate(qs1.AsSByte(), filt.AsSByte()).AsByte(), t80);
+                q1 = Sse2.LoadVector128((byte*)&flatOq1);
+                workA = Sse2.AndNot(flat, workA);
+                q1 = Sse2.And(flat, q1);
+                q1 = Sse2.Or(workA, q1);
+
+                workA = Sse2.LoadVector128(s.ToPointer() + (2 * pitch));
+                q2 = Sse2.LoadVector128((byte*)&flatOq2);
+                workA = Sse2.AndNot(flat, workA);
+                q2 = Sse2.And(flat, q2);
+                q2 = Sse2.Or(workA, q2);
+
+                workA = Sse2.Xor(Sse2.AddSaturate(ps0.AsSByte(), filter2.AsSByte()).AsByte(), t80);
+                p0 = Sse2.LoadVector128((byte*)&flatOp0);
+                workA = Sse2.AndNot(flat, workA);
+                p0 = Sse2.And(flat, p0);
+                p0 = Sse2.Or(workA, p0);
+
+                workA = Sse2.Xor(Sse2.AddSaturate(ps1.AsSByte(), filt.AsSByte()).AsByte(), t80);
+                p1 = Sse2.LoadVector128((byte*)&flatOp1);
+                workA = Sse2.AndNot(flat, workA);
+                p1 = Sse2.And(flat, p1);
+                p1 = Sse2.Or(workA, p1);
+
+                workA = Sse2.LoadVector128(s.ToPointer() - (3 * pitch));
+                p2 = Sse2.LoadVector128((byte*)&flatOp2);
+                workA = Sse2.AndNot(flat, workA);
+                p2 = Sse2.And(flat, p2);
+                p2 = Sse2.Or(workA, p2);
+
+                Sse2.Store(s.ToPointer() - (3 * pitch), p2);
+                Sse2.Store(s.ToPointer() - (2 * pitch), p1);
+                Sse2.Store(s.ToPointer() - (1 * pitch), p0);
+                Sse2.Store(s.ToPointer() + (0 * pitch), q0);
+                Sse2.Store(s.ToPointer() + (1 * pitch), q1);
+                Sse2.Store(s.ToPointer() + (2 * pitch), q2);
+            }
+        }
+        /*
+         * Applies the "filter4" loop filter to a 16-pixel-wide edge whose rows are pitch bytes apart.
+         * Rows p3..p0 are read from s - 4 * pitch .. s - 1 * pitch and rows q0..q3 from s .. s + 3 * pitch;
+         * only rows p1, p0, q0 and q1 are written back.
+         * Lanes 0-7 are filtered with blimit0/limit0/thresh0 and lanes 8-15 with blimit1/limit1/thresh1.
+         * Every threshold span is read with a 16-byte vector load, so each one must expose at least
+         * 16 readable bytes.
+         */
+        public static unsafe void LpfHorizontal4Dual(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit0,
+            ReadOnlySpan<byte> limit0,
+            ReadOnlySpan<byte> thresh0,
+            ReadOnlySpan<byte> blimit1,
+            ReadOnlySpan<byte> limit1,
+            ReadOnlySpan<byte> thresh1)
+        {
+            Vector128<byte> blimit, limit, thresh;
+            // UnpackLow on 64-bit lanes: bytes 0-7 come from the *0 parameters, bytes 8-15 from the *1 parameters.
+            fixed (byte* pBLimit0 = blimit0, pLimit0 = limit0, pThresh0 = thresh0,
+                   pBLimit1 = blimit1, pLimit1 = limit1, pThresh1 = thresh1)
+            {
+                blimit = Sse2.UnpackLow(Sse2.LoadVector128(pBLimit0).AsInt64(), Sse2.LoadVector128(pBLimit1).AsInt64())
+                    .AsByte();
+                limit = Sse2.UnpackLow(Sse2.LoadVector128(pLimit0).AsInt64(), Sse2.LoadVector128(pLimit1).AsInt64())
+                    .AsByte();
+                thresh = Sse2.UnpackLow(Sse2.LoadVector128(pThresh0).AsInt64(), Sse2.LoadVector128(pThresh1).AsInt64())
+                    .AsByte();
+            }
+
+            Vector128<byte> zero = Vector128<byte>.Zero;
+            Vector128<byte> p3, p2, p1, p0, q0, q1, q2, q3;
+            Vector128<byte> mask, hev, flat;
+            // Load the eight 16-byte rows around the edge: p3..p0 precede s, q0..q3 start at s.
+            p3 = Sse2.LoadVector128(s.ToPointer() - (4 * pitch));
+            p2 = Sse2.LoadVector128(s.ToPointer() - (3 * pitch));
+            p1 = Sse2.LoadVector128(s.ToPointer() - (2 * pitch));
+            p0 = Sse2.LoadVector128(s.ToPointer() - (1 * pitch));
+            q0 = Sse2.LoadVector128(s.ToPointer() - (0 * pitch));
+            q1 = Sse2.LoadVector128(s.ToPointer() + (1 * pitch));
+            q2 = Sse2.LoadVector128(s.ToPointer() + (2 * pitch));
+            q3 = Sse2.LoadVector128(s.ToPointer() + (3 * pitch));
+
+            // filter_mask and hev_mask. |a - b| on unsigned bytes is built as SubtractSaturate(a, b) | SubtractSaturate(b, a).
+            {
+                Vector128<byte> absP1P0 = Sse2.Or(Sse2.SubtractSaturate(p1, p0), Sse2.SubtractSaturate(p0, p1));
+                Vector128<byte> absQ1Q0 = Sse2.Or(Sse2.SubtractSaturate(q1, q0), Sse2.SubtractSaturate(q0, q1));
+                Vector128<byte> fe = Vector128.Create((byte)0xfe);
+                Vector128<byte> ff = Sse2.CompareEqual(absP1P0, absP1P0); // all ones
+                Vector128<byte> absP0Q0 = Sse2.Or(Sse2.SubtractSaturate(p0, q0), Sse2.SubtractSaturate(q0, p0));
+                Vector128<byte> absP1Q1 = Sse2.Or(Sse2.SubtractSaturate(p1, q1), Sse2.SubtractSaturate(q1, p1));
+                Vector128<byte> work;
+                // hev = 0xff in lanes where max(|p1 - p0|, |q1 - q0|) > thresh, 0 elsewhere.
+                flat = Sse2.Max(absP1P0, absQ1Q0);
+                hev = Sse2.SubtractSaturate(flat, thresh);
+                hev = Sse2.Xor(Sse2.CompareEqual(hev, zero), ff);
+                // The 0xfe mask clears bit 0 of every byte so the 16-bit shift cannot leak a bit into the neighbouring byte.
+                absP0Q0 = Sse2.AddSaturate(absP0Q0, absP0Q0);
+                absP1Q1 = Sse2.ShiftRightLogical(Sse2.And(absP1Q1, fe).AsInt16(), 1).AsByte();
+                mask = Sse2.SubtractSaturate(Sse2.AddSaturate(absP0Q0, absP1Q1), blimit);
+                mask = Sse2.Xor(Sse2.CompareEqual(mask, zero), ff);
+                // mask: 0xff where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit, else 0. Start a running maximum with flat.
+                mask = Sse2.Max(flat, mask);
+                // Fold |p2 - p1|, |p3 - p2|, |q2 - q1| and |q3 - q2| into the running maximum; a lane passes only
+                // if that maximum is <= limit (a lane forced to 0xff by the blimit test fails for any limit below 255).
+                work = Sse2.Max(
+                    Sse2.Or(Sse2.SubtractSaturate(p2, p1), Sse2.SubtractSaturate(p1, p2)),
+                    Sse2.Or(Sse2.SubtractSaturate(p3, p2), Sse2.SubtractSaturate(p2, p3)));
+                mask = Sse2.Max(work, mask);
+                work = Sse2.Max(
+                    Sse2.Or(Sse2.SubtractSaturate(q2, q1), Sse2.SubtractSaturate(q1, q2)),
+                    Sse2.Or(Sse2.SubtractSaturate(q3, q2), Sse2.SubtractSaturate(q2, q3)));
+                mask = Sse2.Max(work, mask);
+                mask = Sse2.SubtractSaturate(mask, limit);
+                mask = Sse2.CompareEqual(mask, zero); // 0xff = lane is filtered, 0 = lane is left untouched
+            }
+
+            // filter4: compute signed adjustments for p1, p0, q0 and q1 and apply them with saturation.
+            {
+                Vector128<byte> t4 = Vector128.Create((byte)4);
+                Vector128<byte> t3 = Vector128.Create((byte)3);
+                Vector128<byte> t80 = Vector128.Create((byte)0x80);
+                Vector128<byte> te0 = Vector128.Create((byte)0xe0);
+                Vector128<byte> t1F = Vector128.Create((byte)0x1f);
+                Vector128<byte> t1 = Vector128.Create((byte)0x1);
+                Vector128<byte> t7F = Vector128.Create((byte)0x7f);
+                // XOR with 0x80 re-biases the unsigned pixels [0, 255] to signed bytes [-128, 127] for the signed saturating math.
+                Vector128<byte> ps1 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() - (2 * pitch)), t80);
+                Vector128<byte> ps0 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() - (1 * pitch)), t80);
+                Vector128<byte> qs0 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() + (0 * pitch)), t80);
+                Vector128<byte> qs1 = Sse2.Xor(Sse2.LoadVector128(s.ToPointer() + (1 * pitch)), t80);
+                Vector128<byte> filt;
+                Vector128<byte> workA;
+                Vector128<byte> filter1, filter2;
+
+                filt = Sse2.And(Sse2.SubtractSaturate(ps1.AsSByte(), qs1.AsSByte()).AsByte(), hev);
+                workA = Sse2.SubtractSaturate(qs0.AsSByte(), ps0.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                filt = Sse2.AddSaturate(filt.AsSByte(), workA.AsSByte()).AsByte();
+                // filt = (((ps1 - qs1) & hev) + 3 * (qs0 - ps0)) & mask, saturating after every addition.
+                filt = Sse2.And(filt, mask);
+
+                filter1 = Sse2.AddSaturate(filt.AsSByte(), t4.AsSByte()).AsByte();
+                filter2 = Sse2.AddSaturate(filt.AsSByte(), t3.AsSByte()).AsByte();
+                // The shifts below operate on 16-bit lanes, so bits from the neighbouring byte leak into the top of each byte.
+                // Filter1 >> 3 (arithmetic): keep the low 5 bits (0x1f) and OR 0xe0 into lanes that were negative.
+                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter1.AsSByte()).AsByte();
+                filter1 = Sse2.ShiftRightLogical(filter1.AsInt16(), 3).AsByte();
+                workA = Sse2.And(workA, te0);
+                filter1 = Sse2.And(filter1, t1F);
+                filter1 = Sse2.Or(filter1, workA);
+
+                // Filter2 >> 3 (arithmetic), emulated the same way as for Filter1.
+                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filter2.AsSByte()).AsByte();
+                filter2 = Sse2.ShiftRightLogical(filter2.AsInt16(), 3).AsByte();
+                workA = Sse2.And(workA, te0);
+                filter2 = Sse2.And(filter2, t1F);
+                filter2 = Sse2.Or(filter2, workA);
+
+                // filt = (Filter1 + 1) >> 1 (arithmetic): keep the low 7 bits (0x7f) and restore the sign bit (0x80).
+                filt = Sse2.AddSaturate(filter1.AsSByte(), t1.AsSByte()).AsByte();
+                workA = Sse2.CompareGreaterThan(zero.AsSByte(), filt.AsSByte()).AsByte();
+                filt = Sse2.ShiftRightLogical(filt.AsInt16(), 1).AsByte();
+                workA = Sse2.And(workA, t80);
+                filt = Sse2.And(filt, t7F);
+                filt = Sse2.Or(filt, workA);
+                // AndNot(hev, filt) = ~hev & filt: the outer taps (p1, q1) are only adjusted in lanes where hev is clear.
+                filt = Sse2.AndNot(hev, filt);
+                // Apply the adjustments with signed saturation and XOR with 0x80 to return to unsigned pixels.
+                q0 = Sse2.Xor(Sse2.SubtractSaturate(qs0.AsSByte(), filter1.AsSByte()).AsByte(), t80);
+                q1 = Sse2.Xor(Sse2.SubtractSaturate(qs1.AsSByte(), filt.AsSByte()).AsByte(), t80);
+                p0 = Sse2.Xor(Sse2.AddSaturate(ps0.AsSByte(), filter2.AsSByte()).AsByte(), t80);
+                p1 = Sse2.Xor(Sse2.AddSaturate(ps1.AsSByte(), filt.AsSByte()).AsByte(), t80);
+                // Write back the four rows nearest the edge; p3, p2, q2 and q3 were only inputs to the mask.
+                Sse2.Store(s.ToPointer() - (2 * pitch), p1);
+                Sse2.Store(s.ToPointer() - (1 * pitch), p0);
+                Sse2.Store(s.ToPointer() + (0 * pitch), q0);
+                Sse2.Store(s.ToPointer() + (1 * pitch), q1);
+            }
+        }
+
+        /// <summary>
+        /// Transposes two 8x8 byte blocks into eight rows of 16 bytes.
+        /// Eight rows of 8 bytes are read from <paramref name="in0"/> and eight from <paramref name="in1"/>,
+        /// both with a row stride of <paramref name="inP"/> bytes. Output row c (stride <paramref name="outP"/>)
+        /// receives column c of in0 in its low 8 bytes and column c of in1 in its high 8 bytes.
+        /// </summary>
+        private static unsafe void Transpose8x16(
+            ArrayPtr<byte> in0,
+            ArrayPtr<byte> in1,
+            int inP,
+            ArrayPtr<byte> output,
+            int outP)
+        {
+            byte* first = in0.ToPointer();
+            byte* second = in1.ToPointer();
+            byte* dest = output.ToPointer();
+
+            // Stage 1: interleave the bytes of adjacent row pairs (all input is read before anything is stored).
+            Vector128<byte> a01 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)first).AsByte(),
+                Sse2.LoadScalarVector128((long*)(first + inP)).AsByte());
+            Vector128<byte> a23 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(first + (2 * inP))).AsByte(),
+                Sse2.LoadScalarVector128((long*)(first + (3 * inP))).AsByte());
+            Vector128<byte> a45 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(first + (4 * inP))).AsByte(),
+                Sse2.LoadScalarVector128((long*)(first + (5 * inP))).AsByte());
+            Vector128<byte> a67 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(first + (6 * inP))).AsByte(),
+                Sse2.LoadScalarVector128((long*)(first + (7 * inP))).AsByte());
+
+            Vector128<byte> b01 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)second).AsByte(),
+                Sse2.LoadScalarVector128((long*)(second + inP)).AsByte());
+            Vector128<byte> b23 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(second + (2 * inP))).AsByte(),
+                Sse2.LoadScalarVector128((long*)(second + (3 * inP))).AsByte());
+            Vector128<byte> b45 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(second + (4 * inP))).AsByte(),
+                Sse2.LoadScalarVector128((long*)(second + (5 * inP))).AsByte());
+            Vector128<byte> b67 = Sse2.UnpackLow(
+                Sse2.LoadScalarVector128((long*)(second + (6 * inP))).AsByte(),
+                Sse2.LoadScalarVector128((long*)(second + (7 * inP))).AsByte());
+
+            // Stage 2: interleave 16-bit pairs. "Lo" covers columns 0-3, "Hi" covers columns 4-7.
+            Vector128<byte> aLo0123 = Sse2.UnpackLow(a01.AsInt16(), a23.AsInt16()).AsByte();
+            Vector128<byte> aLo4567 = Sse2.UnpackLow(a45.AsInt16(), a67.AsInt16()).AsByte();
+            Vector128<byte> bLo0123 = Sse2.UnpackLow(b01.AsInt16(), b23.AsInt16()).AsByte();
+            Vector128<byte> bLo4567 = Sse2.UnpackLow(b45.AsInt16(), b67.AsInt16()).AsByte();
+            Vector128<byte> aHi0123 = Sse2.UnpackHigh(a01.AsInt16(), a23.AsInt16()).AsByte();
+            Vector128<byte> aHi4567 = Sse2.UnpackHigh(a45.AsInt16(), a67.AsInt16()).AsByte();
+            Vector128<byte> bHi0123 = Sse2.UnpackHigh(b01.AsInt16(), b23.AsInt16()).AsByte();
+            Vector128<byte> bHi4567 = Sse2.UnpackHigh(b45.AsInt16(), b67.AsInt16()).AsByte();
+
+            // Stage 3: interleave 32-bit groups so every register holds two complete 8-byte columns.
+            Vector128<byte> aCols01 = Sse2.UnpackLow(aLo0123.AsInt32(), aLo4567.AsInt32()).AsByte();
+            Vector128<byte> aCols23 = Sse2.UnpackHigh(aLo0123.AsInt32(), aLo4567.AsInt32()).AsByte();
+            Vector128<byte> bCols01 = Sse2.UnpackLow(bLo0123.AsInt32(), bLo4567.AsInt32()).AsByte();
+            Vector128<byte> bCols23 = Sse2.UnpackHigh(bLo0123.AsInt32(), bLo4567.AsInt32()).AsByte();
+            Vector128<byte> aCols45 = Sse2.UnpackLow(aHi0123.AsInt32(), aHi4567.AsInt32()).AsByte();
+            Vector128<byte> aCols67 = Sse2.UnpackHigh(aHi0123.AsInt32(), aHi4567.AsInt32()).AsByte();
+            Vector128<byte> bCols45 = Sse2.UnpackLow(bHi0123.AsInt32(), bHi4567.AsInt32()).AsByte();
+            Vector128<byte> bCols67 = Sse2.UnpackHigh(bHi0123.AsInt32(), bHi4567.AsInt32()).AsByte();
+
+            // Stage 4: pair each in0 column with the matching in1 column and store output rows 0 to 7 in order.
+            Sse2.Store(dest, Sse2.UnpackLow(aCols01.AsInt64(), bCols01.AsInt64()).AsByte());
+            Sse2.Store(dest + outP, Sse2.UnpackHigh(aCols01.AsInt64(), bCols01.AsInt64()).AsByte());
+            Sse2.Store(dest + (2 * outP), Sse2.UnpackLow(aCols23.AsInt64(), bCols23.AsInt64()).AsByte());
+            Sse2.Store(dest + (3 * outP), Sse2.UnpackHigh(aCols23.AsInt64(), bCols23.AsInt64()).AsByte());
+            Sse2.Store(dest + (4 * outP), Sse2.UnpackLow(aCols45.AsInt64(), bCols45.AsInt64()).AsByte());
+            Sse2.Store(dest + (5 * outP), Sse2.UnpackHigh(aCols45.AsInt64(), bCols45.AsInt64()).AsByte());
+            Sse2.Store(dest + (6 * outP), Sse2.UnpackLow(aCols67.AsInt64(), bCols67.AsInt64()).AsByte());
+            Sse2.Store(dest + (7 * outP), Sse2.UnpackHigh(aCols67.AsInt64(), bCols67.AsInt64()).AsByte());
+        }
+
+        /// <summary>
+        /// Transposes <paramref name="num8x8ToTranspose"/> independent 8x8 byte blocks.
+        /// Block i is read from src[i] with row stride <paramref name="inP"/> and its transpose is written
+        /// to dst[i] with row stride <paramref name="outP"/>. At least one block is always processed.
+        /// </summary>
+        private static unsafe void Transpose(
+            ReadOnlySpan<ArrayPtr<byte>> src,
+            int inP,
+            ReadOnlySpan<ArrayPtr<byte>> dst,
+            int outP,
+            int num8x8ToTranspose)
+        {
+            int block = 0;
+
+            do
+            {
+                byte* from = src[block].ToPointer();
+                byte* to = dst[block].ToPointer();
+
+                // Byte interleave of row pairs, e.g. rows01 = 00 10 01 11 02 12 ... 07 17 (digits are row, column).
+                Vector128<byte> rows01 = Sse2.UnpackLow(
+                    Sse2.LoadScalarVector128((long*)(from + (0 * inP))).AsByte(),
+                    Sse2.LoadScalarVector128((long*)(from + (1 * inP))).AsByte());
+                Vector128<byte> rows23 = Sse2.UnpackLow(
+                    Sse2.LoadScalarVector128((long*)(from + (2 * inP))).AsByte(),
+                    Sse2.LoadScalarVector128((long*)(from + (3 * inP))).AsByte());
+                Vector128<byte> rows45 = Sse2.UnpackLow(
+                    Sse2.LoadScalarVector128((long*)(from + (4 * inP))).AsByte(),
+                    Sse2.LoadScalarVector128((long*)(from + (5 * inP))).AsByte());
+                Vector128<byte> rows67 = Sse2.UnpackLow(
+                    Sse2.LoadScalarVector128((long*)(from + (6 * inP))).AsByte(),
+                    Sse2.LoadScalarVector128((long*)(from + (7 * inP))).AsByte());
+
+                // 16-bit interleave: the low halves carry columns 0-3, the high halves columns 4-7.
+                Vector128<byte> low0123 = Sse2.UnpackLow(rows01.AsInt16(), rows23.AsInt16()).AsByte();
+                Vector128<byte> low4567 = Sse2.UnpackLow(rows45.AsInt16(), rows67.AsInt16()).AsByte();
+                Vector128<byte> high0123 = Sse2.UnpackHigh(rows01.AsInt16(), rows23.AsInt16()).AsByte();
+                Vector128<byte> high4567 = Sse2.UnpackHigh(rows45.AsInt16(), rows67.AsInt16()).AsByte();
+
+                // 32-bit interleave: each register now holds two finished output rows (low and high 8 bytes).
+                Vector128<byte> cols01 = Sse2.UnpackLow(low0123.AsInt32(), low4567.AsInt32()).AsByte();
+                Vector128<byte> cols23 = Sse2.UnpackHigh(low0123.AsInt32(), low4567.AsInt32()).AsByte();
+                Vector128<byte> cols45 = Sse2.UnpackLow(high0123.AsInt32(), high4567.AsInt32()).AsByte();
+                Vector128<byte> cols67 = Sse2.UnpackHigh(high0123.AsInt32(), high4567.AsInt32()).AsByte();
+
+                // Low 8 bytes go to the even output row, high 8 bytes to the odd one.
+                Sse2.StoreScalar((long*)(to + (0 * outP)), cols01.AsInt64());
+                Sse2.StoreHigh((double*)(to + (1 * outP)), cols01.AsDouble());
+                Sse2.StoreScalar((long*)(to + (2 * outP)), cols23.AsInt64());
+                Sse2.StoreHigh((double*)(to + (3 * outP)), cols23.AsDouble());
+                Sse2.StoreScalar((long*)(to + (4 * outP)), cols45.AsInt64());
+                Sse2.StoreHigh((double*)(to + (5 * outP)), cols45.AsDouble());
+                Sse2.StoreScalar((long*)(to + (6 * outP)), cols67.AsInt64());
+                Sse2.StoreHigh((double*)(to + (7 * outP)), cols67.AsDouble());
+
+                block++;
+            } while (block < num8x8ToTranspose);
+        }
+
+        /// <summary>
+        /// Filters a 16-row vertical edge by transposing the 8 columns around it into a 16-byte-wide scratch
+        /// buffer, running <see cref="LpfHorizontal4Dual"/> on that buffer and transposing the result back.
+        /// Rows 0-7 use the *0 thresholds and rows 8-15 the *1 thresholds.
+        /// </summary>
+        public static unsafe void LpfVertical4Dual(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit0,
+            ReadOnlySpan<byte> limit0,
+            ReadOnlySpan<byte> thresh0,
+            ReadOnlySpan<byte> blimit1,
+            ReadOnlySpan<byte> limit1,
+            ReadOnlySpan<byte> thresh1)
+        {
+            const int ScratchStride = 16;
+            const int ScratchRows = 8;
+
+            ulong* scratchStorage = stackalloc ulong[16];
+            ArrayPtr<byte> scratch = new((byte*)scratchStorage, ScratchStride * ScratchRows);
+
+            // The 8 columns straddling the edge, for the first and the second group of 8 rows.
+            ArrayPtr<byte> firstRows = s.Slice(-4);
+            ArrayPtr<byte> secondRows = s.Slice(-4 + (pitch * 8));
+
+            // Picture columns become scratch rows.
+            Transpose8x16(firstRows, secondRows, pitch, scratch, ScratchStride);
+
+            // In the scratch buffer the edge sits between rows 3 and 4.
+            LpfHorizontal4Dual(
+                scratch.Slice(4 * ScratchStride),
+                ScratchStride,
+                blimit0,
+                limit0,
+                thresh0,
+                blimit1,
+                limit1,
+                thresh1);
+
+            // Copy the filtered data back as two 8x8 blocks.
+            Span<ArrayPtr<byte>> filtered = stackalloc ArrayPtr<byte>[2];
+            Span<ArrayPtr<byte>> picture = stackalloc ArrayPtr<byte>[2];
+            filtered[0] = scratch;
+            filtered[1] = scratch.Slice(8);
+            picture[0] = firstRows;
+            picture[1] = secondRows;
+
+            Transpose(filtered, ScratchStride, picture, pitch, 2);
+        }
+
+        /// <summary>
+        /// Filters an 8-row vertical edge: the 8x8 block starting 4 bytes before <paramref name="s"/> is
+        /// transposed into a scratch buffer, filtered with <see cref="LpfHorizontal8"/> and transposed back.
+        /// </summary>
+        public static unsafe void LpfVertical8(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            const int BlockSize = 8;
+
+            ulong* scratchStorage = stackalloc ulong[8];
+            ArrayPtr<byte> scratch = new((byte*)scratchStorage, BlockSize * BlockSize);
+
+            Span<ArrayPtr<byte>> picture = stackalloc ArrayPtr<byte>[1];
+            Span<ArrayPtr<byte>> transposed = stackalloc ArrayPtr<byte>[1];
+            picture[0] = s.Slice(-4);
+            transposed[0] = scratch;
+
+            // Picture -> scratch (columns become rows).
+            Transpose(picture, pitch, transposed, BlockSize, 1);
+
+            // The edge now lies between scratch rows 3 and 4.
+            LpfHorizontal8(scratch.Slice(4 * BlockSize), BlockSize, blimit, limit, thresh);
+
+            // Scratch -> picture.
+            Transpose(transposed, BlockSize, picture, pitch, 1);
+        }
+
+        /// <summary>
+        /// Filters a 16-row vertical edge by transposing the 8 columns around it into a 16-byte-wide scratch
+        /// buffer, running <see cref="LpfHorizontal8Dual"/> on that buffer and transposing the result back.
+        /// Both threshold sets are forwarded unchanged to the horizontal filter.
+        /// </summary>
+        public static unsafe void LpfVertical8Dual(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit0,
+            ReadOnlySpan<byte> limit0,
+            ReadOnlySpan<byte> thresh0,
+            ReadOnlySpan<byte> blimit1,
+            ReadOnlySpan<byte> limit1,
+            ReadOnlySpan<byte> thresh1)
+        {
+            const int ScratchStride = 16;
+            const int ScratchRows = 8;
+
+            ulong* scratchStorage = stackalloc ulong[16];
+            ArrayPtr<byte> scratch = new((byte*)scratchStorage, ScratchStride * ScratchRows);
+
+            // The 8 columns straddling the edge, for the first and the second group of 8 rows.
+            ArrayPtr<byte> firstRows = s.Slice(-4);
+            ArrayPtr<byte> secondRows = s.Slice(-4 + (pitch * 8));
+
+            // Picture columns become scratch rows.
+            Transpose8x16(firstRows, secondRows, pitch, scratch, ScratchStride);
+
+            // In the scratch buffer the edge sits between rows 3 and 4.
+            LpfHorizontal8Dual(
+                scratch.Slice(4 * ScratchStride),
+                ScratchStride,
+                blimit0,
+                limit0,
+                thresh0,
+                blimit1,
+                limit1,
+                thresh1);
+
+            // Copy the filtered data back as two 8x8 blocks.
+            Span<ArrayPtr<byte>> filtered = stackalloc ArrayPtr<byte>[2];
+            Span<ArrayPtr<byte>> picture = stackalloc ArrayPtr<byte>[2];
+            filtered[0] = scratch;
+            filtered[1] = scratch.Slice(8);
+            picture[0] = firstRows;
+            picture[1] = secondRows;
+
+            Transpose(filtered, ScratchStride, picture, pitch, 2);
+        }
+
+        /// <summary>
+        /// Filters an 8-row vertical edge with the 16-wide filter: the 8 columns on each side of
+        /// <paramref name="s"/> are transposed into a 16-row scratch buffer, filtered with
+        /// <see cref="LpfHorizontal16"/> and transposed back.
+        /// </summary>
+        public static unsafe void LpfVertical16(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            const int ScratchStride = 8;
+            const int HalfBytes = 8 * ScratchStride;
+
+            ulong* scratchStorage = stackalloc ulong[16];
+            ArrayPtr<byte> scratch = new((byte*)scratchStorage, 2 * HalfBytes);
+
+            // Second half of the scratch buffer; its first row is the first row after the edge.
+            ArrayPtr<byte> scratchEdge = scratch.Slice(HalfBytes);
+
+            Span<ArrayPtr<byte>> picture = stackalloc ArrayPtr<byte>[2];
+            Span<ArrayPtr<byte>> transposed = stackalloc ArrayPtr<byte>[2];
+            picture[0] = s.Slice(-8);
+            picture[1] = s;
+            transposed[0] = scratch;
+            transposed[1] = scratchEdge;
+
+            // Picture -> scratch: 16 columns become 16 rows of 8 bytes.
+            Transpose(picture, pitch, transposed, ScratchStride, 2);
+
+            LpfHorizontal16(scratchEdge, ScratchStride, blimit, limit, thresh);
+
+            // Scratch -> picture.
+            Transpose(transposed, ScratchStride, picture, pitch, 2);
+        }
+
+        /// <summary>
+        /// Filters a 16-row vertical edge with the 16-wide filter: the 16x16 block centred on the edge is
+        /// transposed into a scratch buffer, filtered with <see cref="LpfHorizontal16Dual"/> and transposed back.
+        /// </summary>
+        public static unsafe void LpfVertical16Dual(
+            ArrayPtr<byte> s,
+            int pitch,
+            ReadOnlySpan<byte> blimit,
+            ReadOnlySpan<byte> limit,
+            ReadOnlySpan<byte> thresh)
+        {
+            const int ScratchStride = 16;
+            const int HalfBytes = 8 * ScratchStride;
+
+            Vector128<byte>* scratchStorage = stackalloc Vector128<byte>[16];
+            ArrayPtr<byte> scratch = new((byte*)scratchStorage, 256);
+
+            // Second half of the scratch buffer; its first row is the first row after the edge.
+            ArrayPtr<byte> scratchEdge = scratch.Slice(HalfBytes);
+
+            // The 8 columns before the edge, for the first and the second group of 8 rows.
+            ArrayPtr<byte> beforeFirst = s.Slice(-8);
+            ArrayPtr<byte> beforeSecond = s.Slice(-8 + (8 * pitch));
+
+            // Picture -> scratch: columns before the edge fill scratch rows 0-7, columns after it rows 8-15.
+            Transpose8x16(beforeFirst, beforeSecond, pitch, scratch, ScratchStride);
+            Transpose8x16(s, s.Slice(8 * pitch), pitch, scratchEdge, ScratchStride);
+
+            LpfHorizontal16Dual(scratchEdge, ScratchStride, blimit, limit, thresh);
+
+            // Scratch -> picture: each call rebuilds 8 picture rows of 16 bytes from 8 scratch columns.
+            Transpose8x16(scratch, scratchEdge, ScratchStride, beforeFirst, pitch);
+            Transpose8x16(scratch.Slice(8), scratch.Slice(8 + HalfBytes), ScratchStride, beforeSecond, pitch);
+        }
+    }
+}

+ 15 - 19
src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
 using System;
 using System.Diagnostics;
 
@@ -12,10 +12,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
         {
             Debug.Assert(den != 0);
             {
-                int p = (int)(((ulong)num * 256 + (den >> 1)) / den);
+                int p = (int)((((ulong)num * 256) + (den >> 1)) / den);
                 // (p > 255) ? 255 : (p < 1) ? 1 : p;
                 int clippedProb = p | ((255 - p) >> 23) | (p == 0 ? 1 : 0);
-
                 return (byte)clippedProb;
             }
         }
@@ -23,13 +22,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
         /* This function assumes prob1 and prob2 are already within [1,255] range. */
         public static byte WeightedProb(int prob1, int prob2, int factor)
         {
-            return (byte)BitUtils.RoundPowerOfTwo(prob1 * (256 - factor) + prob2 * factor, 8);
+            return (byte)BitUtils.RoundPowerOfTwo((prob1 * (256 - factor)) + (prob2 * factor), 8);
         }
 
         // MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
-        private static readonly uint[] _countToUpdateFactor = {
-            0,  6,  12, 19, 25, 32,  38,  44,  51,  57, 64,
-            70, 76, 83, 89, 96, 102, 108, 115, 121, 128,
+        private static readonly uint[] CountToUpdateFactor =
+        {
+            0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, 70, 76, 83, 89, 96, 102, 108, 115, 121, 128
         };
 
         private const int ModeMvCountSat = 20;
@@ -41,14 +40,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             {
                 return preProb;
             }
-            else
-            {
-                uint count = Math.Min(den, ModeMvCountSat);
-                uint factor = _countToUpdateFactor[(int)count];
-                byte prob = GetProb(ct0, den);
 
-                return WeightedProb(preProb, prob, (int)factor);
-            }
+            uint count = Math.Min(den, ModeMvCountSat);
+            uint factor = CountToUpdateFactor[(int)count];
+            byte prob = GetProb(ct0, den);
+            return WeightedProb(preProb, prob, (int)factor);
         }
 
         private static uint TreeMergeProbsImpl(
@@ -59,17 +55,17 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<byte> probs)
         {
             int l = tree[i];
-            uint leftCount = (l <= 0) ? counts[-l] : TreeMergeProbsImpl((uint)l, tree, preProbs, counts, probs);
+            uint leftCount = l <= 0 ? counts[-l] : TreeMergeProbsImpl((uint)l, tree, preProbs, counts, probs);
             int r = tree[i + 1];
-            uint rightCount = (r <= 0) ? counts[-r] : TreeMergeProbsImpl((uint)r, tree, preProbs, counts, probs);
+            uint rightCount = r <= 0 ? counts[-r] : TreeMergeProbsImpl((uint)r, tree, preProbs, counts, probs);
             probs[(int)(i >> 1)] = ModeMvMergeProbs(preProbs[(int)(i >> 1)], leftCount, rightCount);
-
             return leftCount + rightCount;
         }
 
-        public static void TreeMergeProbs(sbyte[] tree, ReadOnlySpan<byte> preProbs, ReadOnlySpan<uint> counts, Span<byte> probs)
+        public static void TreeMergeProbs(sbyte[] tree, ReadOnlySpan<byte> preProbs, ReadOnlySpan<uint> counts,
+            Span<byte> probs)
         {
             TreeMergeProbsImpl(0, tree, preProbs, counts, probs);
         }
     }
-}
+}

+ 106 - 32
src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs

@@ -1,4 +1,5 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using System;
 using System.Buffers.Binary;
 
@@ -6,18 +7,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 {
     internal struct Reader
     {
-        private static readonly byte[] _norm = {
-            0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-            3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        private static readonly byte[] Norm =
+        {
+            0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2,
+            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
         };
+
         private const int BdValueSize = sizeof(ulong) * 8;
 
         // This is meant to be a large, positive constant that can still be efficiently
@@ -36,16 +37,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             {
                 return true;
             }
-            else
-            {
-                _buffer = new ArrayPtr<byte>(ref buffer[0], size);
-                Value = 0;
-                Count = -8;
-                Range = 255;
-                Fill();
 
-                return ReadBit() != 0; // Marker bit
-            }
+            _buffer = new ArrayPtr<byte>(ref buffer[0], size);
+            Value = 0;
+            Count = -8;
+            Range = 255;
+            Fill();
+            return ReadBit() != 0; // Marker bit
         }
 
         private void Fill()
@@ -65,7 +63,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
                 ulong bigEndianValues = BinaryPrimitives.ReadUInt64BigEndian(buffer);
                 nv = bigEndianValues >> (BdValueSize - bits);
                 count += bits;
-                buffer = buffer[(bits >> 3)..];
+                buffer = buffer.Slice(bits >> 3);
                 value = Value | (nv << (shift & 0x7));
             }
             else
@@ -84,7 +82,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
                     {
                         count += 8;
                         value |= (ulong)buffer[0] << shift;
-                        buffer = buffer[1..];
+                        buffer = buffer.Slice(1);
                         shift -= 8;
                     }
                 }
@@ -98,7 +96,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Count = count;
         }
 
-        public readonly bool HasError()
+        public bool HasError()
         {
             // Check if we have reached the end of the buffer.
             //
@@ -124,7 +122,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             ulong bigsplit;
             int count;
             uint range;
-            uint split = (Range * (uint)prob + (256 - (uint)prob)) >> 8;
+            uint split = ((Range * (uint)prob) + (256 - (uint)prob)) >> 8;
 
             if (Count < 0)
             {
@@ -146,7 +144,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
 
             {
-                int shift = _norm[range];
+                int shift = Norm[range];
                 range <<= shift;
                 value <<= shift;
                 count -= shift;
@@ -188,7 +186,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
         public int ReadBool(int prob, ref ulong value, ref int count, ref uint range)
         {
-            uint split = (range * (uint)prob + (256 - (uint)prob)) >> 8;
+            uint split = ((range * (uint)prob) + (256 - (uint)prob)) >> 8;
             ulong bigsplit = (ulong)split << (BdValueSize - 8);
 
             if (count < 0)
@@ -202,19 +200,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
 
             if (value >= bigsplit)
             {
-                range -= split;
-                value -= bigsplit;
+                range = range - split;
+                value = value - bigsplit;
                 {
-                    int shift = _norm[range];
+                    int shift = Norm[range];
                     range <<= shift;
                     value <<= shift;
                     count -= shift;
                 }
                 return 1;
             }
+
             range = split;
             {
-                int shift = _norm[range];
+                int shift = Norm[range];
                 range <<= shift;
                 value <<= shift;
                 count -= shift;
@@ -230,7 +229,82 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
                 Count -= 8;
                 _buffer = _buffer.Slice(-1);
             }
+
             return _buffer;
         }
+
+        // Decodes a near-uniform value: reads a 7-bit literal and, when it is at or above
+        // the threshold (256 - 191 = 65), reads one extra bit to extend the code.
+        // Literals below the threshold are returned unchanged.
+        private int DecodeUniform()
+        {
+            const int bits = 8;
+            const int threshold = (1 << bits) - 191;
+
+            int literal = ReadLiteral(bits - 1);
+            if (literal < threshold)
+            {
+                return literal;
+            }
+
+            // The extra bit is only consumed on this path.
+            return (literal << 1) - threshold + ReadBit();
+        }
+
+        // Decodes a terminated sub-exponential code. Up to three escape bits select
+        // the bucket; a zero bit stops the prefix early:
+        //   0 escapes -> 4-bit literal
+        //   1 escape  -> 4-bit literal + 16
+        //   2 escapes -> 5-bit literal + 32
+        //   3 escapes -> DecodeUniform() + 64
+        public int DecodeTermSubexp()
+        {
+            int escapes = 0;
+
+            // Stop at the first zero bit; never read more than three prefix bits.
+            while (escapes < 3 && ReadBit() != 0)
+            {
+                escapes++;
+            }
+
+            switch (escapes)
+            {
+                case 0:
+                    return ReadLiteral(4);
+                case 1:
+                    return ReadLiteral(4) + 16;
+                case 2:
+                    return ReadLiteral(5) + 32;
+                default:
+                    return DecodeUniform() + 64;
+            }
+        }
+
+        // Reads the transform mode: a 2-bit literal, followed by one extra bit
+        // that is added to the mode only when the literal equals TxMode.Allow32x32.
+        public TxMode ReadTxMode()
+        {
+            TxMode mode = (TxMode)ReadLiteral(2);
+
+            if (mode != TxMode.Allow32x32)
+            {
+                return mode;
+            }
+
+            mode += ReadBit();
+            return mode;
+        }
+
+        // Reads n boolean-coded bits, most significant first, and packs them into an int.
+        // Bit i is decoded with probability probs[i]; value, count and range are the
+        // caller-held decoder state that ReadBool updates in place.
+        public int ReadCoeff(
+            ReadOnlySpan<byte> probs,
+            int n,
+            ref ulong value,
+            ref int count,
+            ref uint range)
+        {
+            int result = 0;
+            int index = 0;
+
+            while (index < n)
+            {
+                int bit = ReadBool(probs[index], ref value, ref count, ref range);
+                result = (result << 1) | bit;
+                index++;
+            }
+
+            return result;
+        }
+
+        // Conditionally updates a probability in place. An update flag is read with
+        // probability Entropy.DiffUpdateProb; when it is set, a sub-exponential delta is
+        // decoded and DSubExp.InvRemapProb maps it, together with the old value, to the new one.
+        public void DiffUpdateProb(ref byte p)
+        {
+            bool updateFlagged = Read(Entropy.DiffUpdateProb) != 0;
+            if (!updateFlagged)
+            {
+                return;
+            }
+
+            int delta = DecodeTermSubexp();
+            p = (byte)DSubExp.InvRemapProb(delta, p);
+        }
+
+        // Walks the first n probabilities in p. For each one an update flag is read with
+        // probability EntropyMv.UpdateProb; when set, the entry is replaced by a 7-bit
+        // literal shifted left by one with the low bit forced to 1 (always an odd value).
+        public void UpdateMvProbs(Span<byte> p, int n)
+        {
+            int index = 0;
+
+            while (index < n)
+            {
+                bool updateFlagged = Read(EntropyMv.UpdateProb) != 0;
+                if (updateFlagged)
+                {
+                    p[index] = (byte)((ReadLiteral(7) << 1) | 1);
+                }
+
+                index++;
+            }
+        }
     }
-}
+}

+ 36 - 36
src/Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs

@@ -13,42 +13,42 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
         //  for (int i = 1; i < 32; ++i)
         //    Console.WriteLine("public const short CosPi{0}_64 = {1};", i, MathF.Round(16384 * MathF.Cos(i * MathF.PI / 64)));
         // Note: sin(k * Pi / 64) = cos((32 - k) * Pi / 64)
-        public const short CosPi1_64 = 16364;
-        public const short CosPi2_64 = 16305;
-        public const short CosPi3_64 = 16207;
-        public const short CosPi4_64 = 16069;
-        public const short CosPi5_64 = 15893;
-        public const short CosPi6_64 = 15679;
-        public const short CosPi7_64 = 15426;
-        public const short CosPi8_64 = 15137;
-        public const short CosPi9_64 = 14811;
-        public const short CosPi10_64 = 14449;
-        public const short CosPi11_64 = 14053;
-        public const short CosPi12_64 = 13623;
-        public const short CosPi13_64 = 13160;
-        public const short CosPi14_64 = 12665;
-        public const short CosPi15_64 = 12140;
-        public const short CosPi16_64 = 11585;
-        public const short CosPi17_64 = 11003;
-        public const short CosPi18_64 = 10394;
-        public const short CosPi19_64 = 9760;
-        public const short CosPi20_64 = 9102;
-        public const short CosPi21_64 = 8423;
-        public const short CosPi22_64 = 7723;
-        public const short CosPi23_64 = 7005;
-        public const short CosPi24_64 = 6270;
-        public const short CosPi25_64 = 5520;
-        public const short CosPi26_64 = 4756;
-        public const short CosPi27_64 = 3981;
-        public const short CosPi28_64 = 3196;
-        public const short CosPi29_64 = 2404;
-        public const short CosPi30_64 = 1606;
-        public const short CosPi31_64 = 804;
+        public const short CosPi164 = 16364;
+        public const short CosPi264 = 16305;
+        public const short CosPi364 = 16207;
+        public const short CosPi464 = 16069;
+        public const short CosPi564 = 15893;
+        public const short CosPi664 = 15679;
+        public const short CosPi764 = 15426;
+        public const short CosPi864 = 15137;
+        public const short CosPi964 = 14811;
+        public const short CosPi1064 = 14449;
+        public const short CosPi1164 = 14053;
+        public const short CosPi1264 = 13623;
+        public const short CosPi1364 = 13160;
+        public const short CosPi1464 = 12665;
+        public const short CosPi1564 = 12140;
+        public const short CosPi1664 = 11585;
+        public const short CosPi1764 = 11003;
+        public const short CosPi1864 = 10394;
+        public const short CosPi1964 = 9760;
+        public const short CosPi2064 = 9102;
+        public const short CosPi2164 = 8423;
+        public const short CosPi2264 = 7723;
+        public const short CosPi2364 = 7005;
+        public const short CosPi2464 = 6270;
+        public const short CosPi2564 = 5520;
+        public const short CosPi2664 = 4756;
+        public const short CosPi2764 = 3981;
+        public const short CosPi2864 = 3196;
+        public const short CosPi2964 = 2404;
+        public const short CosPi3064 = 1606;
+        public const short CosPi3164 = 804;
 
         //  16384 * sqrt(2) * sin(kPi / 9) * 2 / 3
-        public const short SinPi1_9 = 5283;
-        public const short SinPi2_9 = 9929;
-        public const short SinPi3_9 = 13377;
-        public const short SinPi4_9 = 15212;
+        public const short SinPi19 = 5283;
+        public const short SinPi29 = 9929;
+        public const short SinPi39 = 13377;
+        public const short SinPi49 = 15212;
     }
-}
+}

+ 623 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Entropy.cs

@@ -0,0 +1,623 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class Entropy
+    {
+        public const int DiffUpdateProb = 252;
+
+        // Coefficient token alphabet
+        public const int ZeroToken = 0; // 0     Extra Bits 0+0
+        public const int OneToken = 1; // 1     Extra Bits 0+1
+        public const int TwoToken = 2; // 2     Extra Bits 0+1
+        public const int ThreeToken = 3; // 3     Extra Bits 0+1
+        public const int FourToken = 4; // 4     Extra Bits 0+1
+        public const int Category1Token = 5; // 5-6   Extra Bits 1+1
+        public const int Category2Token = 6; // 7-10  Extra Bits 2+1
+        public const int Category3Token = 7; // 11-18 Extra Bits 3+1
+        public const int Category4Token = 8; // 19-34 Extra Bits 4+1
+        public const int Category5Token = 9; // 35-66 Extra Bits 5+1
+        public const int Category6Token = 10; // 67+   Extra Bits 14+1
+        public const int EobToken = 11; // EOB   Extra Bits 0+0
+
+        public const int EntropyTokens = 12;
+
+        public const int RefTypes = 2; // intra=0, inter=1
+
+        /* Middle dimension reflects the coefficient position within the transform. */
+        public const int CoefBands = 6;
+
+        /* Inside dimension is a measure of nearby complexity that reflects the energy
+           of nearby nonzero coefficients.  For the first coefficient (DC, unless
+           block type is 0), we look at the (already encoded) blocks above and to the
+           left of the current block.  The context index is then the number (0,1,or 2)
+           of these blocks having nonzero coefficients.
+           After decoding a coefficient, the measure is determined by the size of the
+           most recently decoded coefficient.
+           Note that the intuitive meaning of this measure changes as coefficients
+           are decoded, e.g., prior to the first token, a zero means that my neighbors
+           are empty while, after the first token, because of the use of end-of-block,
+           a zero means we just decoded a zero and hence guarantees that a non-zero
+           coefficient will appear later in this block.  However, this shift
+           in meaning is perfectly OK because our context depends also on the
+           coefficient band (and since zigzag positions 0, 1, and 2 are in
+           distinct bands). */
+
+        public const int CoeffContexts = 6;
+
+        // Number of coefficient contexts for a band: 3 for band 0,
+        // CoeffContexts for every other band.
+        public static int BAND_COEFF_CONTEXTS(int band)
+        {
+            if (band != 0)
+            {
+                return CoeffContexts;
+            }
+
+            return 3;
+        }
+
+        public const int UnconstrainedNodes = 3;
+
+        public const int PivotNode = 2;
+
+        public const int Cat1MinVal = 5;
+        public const int Cat2MinVal = 7;
+        public const int Cat3MinVal = 11;
+        public const int Cat4MinVal = 19;
+        public const int Cat5MinVal = 35;
+        public const int Cat6MinVal = 67;
+
+        public static readonly byte[] Cat1Prob = { 159 };
+        public static readonly byte[] Cat2Prob = { 165, 145 };
+        public static readonly byte[] Cat3Prob = { 173, 148, 140 };
+        public static readonly byte[] Cat4Prob = { 176, 155, 140, 135 };
+        public static readonly byte[] Cat5Prob = { 180, 157, 141, 134, 130 };
+
+        public static readonly byte[] Cat6Prob =
+        {
+            254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+        };
+
+        // Per-bit probabilities for the Category6 extra bits, 18 entries
+        // (presumably the 12-bit-depth variant, per the name; confirm against the caller).
+        // The last 14 entries are identical to Cat6Prob; the first four are 255.
+        // Entry 6 was previously 54, a typo for 254 that broke that correspondence.
+        public static readonly byte[] Cat6ProbHigh12 =
+        {
+            255, 255, 255, 255, 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+        };
+
+        public const int EobModelToken = 3;
+
+        private static readonly byte[] CoefbandTrans8x8Plus =
+        {
+            0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
+            // beyond MAXBAND_INDEX+1 all values are filled as 5
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+        };
+
+        private static readonly byte[] CoefbandTrans4x4 = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5 };
+
+        public static readonly byte[][] Pareto8Full =
+        {
+            new byte[] { 3, 86, 128, 6, 86, 23, 88, 29 }, new byte[] { 6, 86, 128, 11, 87, 42, 91, 52 },
+            new byte[] { 9, 86, 129, 17, 88, 61, 94, 76 }, new byte[] { 12, 86, 129, 22, 88, 77, 97, 93 },
+            new byte[] { 15, 87, 129, 28, 89, 93, 100, 110 }, new byte[] { 17, 87, 129, 33, 90, 105, 103, 123 },
+            new byte[] { 20, 88, 130, 38, 91, 118, 106, 136 }, new byte[] { 23, 88, 130, 43, 91, 128, 108, 146 },
+            new byte[] { 26, 89, 131, 48, 92, 139, 111, 156 }, new byte[] { 28, 89, 131, 53, 93, 147, 114, 163 },
+            new byte[] { 31, 90, 131, 58, 94, 156, 117, 171 }, new byte[] { 34, 90, 131, 62, 94, 163, 119, 177 },
+            new byte[] { 37, 90, 132, 66, 95, 171, 122, 184 }, new byte[] { 39, 90, 132, 70, 96, 177, 124, 189 },
+            new byte[] { 42, 91, 132, 75, 97, 183, 127, 194 }, new byte[] { 44, 91, 132, 79, 97, 188, 129, 198 },
+            new byte[] { 47, 92, 133, 83, 98, 193, 132, 202 }, new byte[] { 49, 92, 133, 86, 99, 197, 134, 205 },
+            new byte[] { 52, 93, 133, 90, 100, 201, 137, 208 }, new byte[] { 54, 93, 133, 94, 100, 204, 139, 211 },
+            new byte[] { 57, 94, 134, 98, 101, 208, 142, 214 }, new byte[] { 59, 94, 134, 101, 102, 211, 144, 216 },
+            new byte[] { 62, 94, 135, 105, 103, 214, 146, 218 },
+            new byte[] { 64, 94, 135, 108, 103, 216, 148, 220 },
+            new byte[] { 66, 95, 135, 111, 104, 219, 151, 222 },
+            new byte[] { 68, 95, 135, 114, 105, 221, 153, 223 },
+            new byte[] { 71, 96, 136, 117, 106, 224, 155, 225 },
+            new byte[] { 73, 96, 136, 120, 106, 225, 157, 226 },
+            new byte[] { 76, 97, 136, 123, 107, 227, 159, 228 },
+            new byte[] { 78, 97, 136, 126, 108, 229, 160, 229 },
+            new byte[] { 80, 98, 137, 129, 109, 231, 162, 231 },
+            new byte[] { 82, 98, 137, 131, 109, 232, 164, 232 },
+            new byte[] { 84, 98, 138, 134, 110, 234, 166, 233 },
+            new byte[] { 86, 98, 138, 137, 111, 235, 168, 234 },
+            new byte[] { 89, 99, 138, 140, 112, 236, 170, 235 },
+            new byte[] { 91, 99, 138, 142, 112, 237, 171, 235 },
+            new byte[] { 93, 100, 139, 145, 113, 238, 173, 236 },
+            new byte[] { 95, 100, 139, 147, 114, 239, 174, 237 },
+            new byte[] { 97, 101, 140, 149, 115, 240, 176, 238 },
+            new byte[] { 99, 101, 140, 151, 115, 241, 177, 238 },
+            new byte[] { 101, 102, 140, 154, 116, 242, 179, 239 },
+            new byte[] { 103, 102, 140, 156, 117, 242, 180, 239 },
+            new byte[] { 105, 103, 141, 158, 118, 243, 182, 240 },
+            new byte[] { 107, 103, 141, 160, 118, 243, 183, 240 },
+            new byte[] { 109, 104, 141, 162, 119, 244, 185, 241 },
+            new byte[] { 111, 104, 141, 164, 119, 244, 186, 241 },
+            new byte[] { 113, 104, 142, 166, 120, 245, 187, 242 },
+            new byte[] { 114, 104, 142, 168, 121, 245, 188, 242 },
+            new byte[] { 116, 105, 143, 170, 122, 246, 190, 243 },
+            new byte[] { 118, 105, 143, 171, 122, 246, 191, 243 },
+            new byte[] { 120, 106, 143, 173, 123, 247, 192, 244 },
+            new byte[] { 121, 106, 143, 175, 124, 247, 193, 244 },
+            new byte[] { 123, 107, 144, 177, 125, 248, 195, 244 },
+            new byte[] { 125, 107, 144, 178, 125, 248, 196, 244 },
+            new byte[] { 127, 108, 145, 180, 126, 249, 197, 245 },
+            new byte[] { 128, 108, 145, 181, 127, 249, 198, 245 },
+            new byte[] { 130, 109, 145, 183, 128, 249, 199, 245 },
+            new byte[] { 132, 109, 145, 184, 128, 249, 200, 245 },
+            new byte[] { 134, 110, 146, 186, 129, 250, 201, 246 },
+            new byte[] { 135, 110, 146, 187, 130, 250, 202, 246 },
+            new byte[] { 137, 111, 147, 189, 131, 251, 203, 246 },
+            new byte[] { 138, 111, 147, 190, 131, 251, 204, 246 },
+            new byte[] { 140, 112, 147, 192, 132, 251, 205, 247 },
+            new byte[] { 141, 112, 147, 193, 132, 251, 206, 247 },
+            new byte[] { 143, 113, 148, 194, 133, 251, 207, 247 },
+            new byte[] { 144, 113, 148, 195, 134, 251, 207, 247 },
+            new byte[] { 146, 114, 149, 197, 135, 252, 208, 248 },
+            new byte[] { 147, 114, 149, 198, 135, 252, 209, 248 },
+            new byte[] { 149, 115, 149, 199, 136, 252, 210, 248 },
+            new byte[] { 150, 115, 149, 200, 137, 252, 210, 248 },
+            new byte[] { 152, 115, 150, 201, 138, 252, 211, 248 },
+            new byte[] { 153, 115, 150, 202, 138, 252, 212, 248 },
+            new byte[] { 155, 116, 151, 204, 139, 253, 213, 249 },
+            new byte[] { 156, 116, 151, 205, 139, 253, 213, 249 },
+            new byte[] { 158, 117, 151, 206, 140, 253, 214, 249 },
+            new byte[] { 159, 117, 151, 207, 141, 253, 215, 249 },
+            new byte[] { 161, 118, 152, 208, 142, 253, 216, 249 },
+            new byte[] { 162, 118, 152, 209, 142, 253, 216, 249 },
+            new byte[] { 163, 119, 153, 210, 143, 253, 217, 249 },
+            new byte[] { 164, 119, 153, 211, 143, 253, 217, 249 },
+            new byte[] { 166, 120, 153, 212, 144, 254, 218, 250 },
+            new byte[] { 167, 120, 153, 212, 145, 254, 219, 250 },
+            new byte[] { 168, 121, 154, 213, 146, 254, 220, 250 },
+            new byte[] { 169, 121, 154, 214, 146, 254, 220, 250 },
+            new byte[] { 171, 122, 155, 215, 147, 254, 221, 250 },
+            new byte[] { 172, 122, 155, 216, 147, 254, 221, 250 },
+            new byte[] { 173, 123, 155, 217, 148, 254, 222, 250 },
+            new byte[] { 174, 123, 155, 217, 149, 254, 222, 250 },
+            new byte[] { 176, 124, 156, 218, 150, 254, 223, 250 },
+            new byte[] { 177, 124, 156, 219, 150, 254, 223, 250 },
+            new byte[] { 178, 125, 157, 220, 151, 254, 224, 251 },
+            new byte[] { 179, 125, 157, 220, 151, 254, 224, 251 },
+            new byte[] { 180, 126, 157, 221, 152, 254, 225, 251 },
+            new byte[] { 181, 126, 157, 221, 152, 254, 225, 251 },
+            new byte[] { 183, 127, 158, 222, 153, 254, 226, 251 },
+            new byte[] { 184, 127, 158, 223, 154, 254, 226, 251 },
+            new byte[] { 185, 128, 159, 224, 155, 255, 227, 251 },
+            new byte[] { 186, 128, 159, 224, 155, 255, 227, 251 },
+            new byte[] { 187, 129, 160, 225, 156, 255, 228, 251 },
+            new byte[] { 188, 130, 160, 225, 156, 255, 228, 251 },
+            new byte[] { 189, 131, 160, 226, 157, 255, 228, 251 },
+            new byte[] { 190, 131, 160, 226, 158, 255, 228, 251 },
+            new byte[] { 191, 132, 161, 227, 159, 255, 229, 251 },
+            new byte[] { 192, 132, 161, 227, 159, 255, 229, 251 },
+            new byte[] { 193, 133, 162, 228, 160, 255, 230, 252 },
+            new byte[] { 194, 133, 162, 229, 160, 255, 230, 252 },
+            new byte[] { 195, 134, 163, 230, 161, 255, 231, 252 },
+            new byte[] { 196, 134, 163, 230, 161, 255, 231, 252 },
+            new byte[] { 197, 135, 163, 231, 162, 255, 231, 252 },
+            new byte[] { 198, 135, 163, 231, 162, 255, 231, 252 },
+            new byte[] { 199, 136, 164, 232, 163, 255, 232, 252 },
+            new byte[] { 200, 136, 164, 232, 164, 255, 232, 252 },
+            new byte[] { 201, 137, 165, 233, 165, 255, 233, 252 },
+            new byte[] { 201, 137, 165, 233, 165, 255, 233, 252 },
+            new byte[] { 202, 138, 166, 233, 166, 255, 233, 252 },
+            new byte[] { 203, 138, 166, 233, 166, 255, 233, 252 },
+            new byte[] { 204, 139, 166, 234, 167, 255, 234, 252 },
+            new byte[] { 205, 139, 166, 234, 167, 255, 234, 252 },
+            new byte[] { 206, 140, 167, 235, 168, 255, 235, 252 },
+            new byte[] { 206, 140, 167, 235, 168, 255, 235, 252 },
+            new byte[] { 207, 141, 168, 236, 169, 255, 235, 252 },
+            new byte[] { 208, 141, 168, 236, 170, 255, 235, 252 },
+            new byte[] { 209, 142, 169, 237, 171, 255, 236, 252 },
+            new byte[] { 209, 143, 169, 237, 171, 255, 236, 252 },
+            new byte[] { 210, 144, 169, 237, 172, 255, 236, 252 },
+            new byte[] { 211, 144, 169, 237, 172, 255, 236, 252 },
+            new byte[] { 212, 145, 170, 238, 173, 255, 237, 252 },
+            new byte[] { 213, 145, 170, 238, 173, 255, 237, 252 },
+            new byte[] { 214, 146, 171, 239, 174, 255, 237, 253 },
+            new byte[] { 214, 146, 171, 239, 174, 255, 237, 253 },
+            new byte[] { 215, 147, 172, 240, 175, 255, 238, 253 },
+            new byte[] { 215, 147, 172, 240, 175, 255, 238, 253 },
+            new byte[] { 216, 148, 173, 240, 176, 255, 238, 253 },
+            new byte[] { 217, 148, 173, 240, 176, 255, 238, 253 },
+            new byte[] { 218, 149, 173, 241, 177, 255, 239, 253 },
+            new byte[] { 218, 149, 173, 241, 178, 255, 239, 253 },
+            new byte[] { 219, 150, 174, 241, 179, 255, 239, 253 },
+            new byte[] { 219, 151, 174, 241, 179, 255, 239, 253 },
+            new byte[] { 220, 152, 175, 242, 180, 255, 240, 253 },
+            new byte[] { 221, 152, 175, 242, 180, 255, 240, 253 },
+            new byte[] { 222, 153, 176, 242, 181, 255, 240, 253 },
+            new byte[] { 222, 153, 176, 242, 181, 255, 240, 253 },
+            new byte[] { 223, 154, 177, 243, 182, 255, 240, 253 },
+            new byte[] { 223, 154, 177, 243, 182, 255, 240, 253 },
+            new byte[] { 224, 155, 178, 244, 183, 255, 241, 253 },
+            new byte[] { 224, 155, 178, 244, 183, 255, 241, 253 },
+            new byte[] { 225, 156, 178, 244, 184, 255, 241, 253 },
+            new byte[] { 225, 157, 178, 244, 184, 255, 241, 253 },
+            new byte[] { 226, 158, 179, 244, 185, 255, 242, 253 },
+            new byte[] { 227, 158, 179, 244, 185, 255, 242, 253 },
+            new byte[] { 228, 159, 180, 245, 186, 255, 242, 253 },
+            new byte[] { 228, 159, 180, 245, 186, 255, 242, 253 },
+            new byte[] { 229, 160, 181, 245, 187, 255, 242, 253 },
+            new byte[] { 229, 160, 181, 245, 187, 255, 242, 253 },
+            new byte[] { 230, 161, 182, 246, 188, 255, 243, 253 },
+            new byte[] { 230, 162, 182, 246, 188, 255, 243, 253 },
+            new byte[] { 231, 163, 183, 246, 189, 255, 243, 253 },
+            new byte[] { 231, 163, 183, 246, 189, 255, 243, 253 },
+            new byte[] { 232, 164, 184, 247, 190, 255, 243, 253 },
+            new byte[] { 232, 164, 184, 247, 190, 255, 243, 253 },
+            new byte[] { 233, 165, 185, 247, 191, 255, 244, 253 },
+            new byte[] { 233, 165, 185, 247, 191, 255, 244, 253 },
+            new byte[] { 234, 166, 185, 247, 192, 255, 244, 253 },
+            new byte[] { 234, 167, 185, 247, 192, 255, 244, 253 },
+            new byte[] { 235, 168, 186, 248, 193, 255, 244, 253 },
+            new byte[] { 235, 168, 186, 248, 193, 255, 244, 253 },
+            new byte[] { 236, 169, 187, 248, 194, 255, 244, 253 },
+            new byte[] { 236, 169, 187, 248, 194, 255, 244, 253 },
+            new byte[] { 236, 170, 188, 248, 195, 255, 245, 253 },
+            new byte[] { 236, 170, 188, 248, 195, 255, 245, 253 },
+            new byte[] { 237, 171, 189, 249, 196, 255, 245, 254 },
+            new byte[] { 237, 172, 189, 249, 196, 255, 245, 254 },
+            new byte[] { 238, 173, 190, 249, 197, 255, 245, 254 },
+            new byte[] { 238, 173, 190, 249, 197, 255, 245, 254 },
+            new byte[] { 239, 174, 191, 249, 198, 255, 245, 254 },
+            new byte[] { 239, 174, 191, 249, 198, 255, 245, 254 },
+            new byte[] { 240, 175, 192, 249, 199, 255, 246, 254 },
+            new byte[] { 240, 176, 192, 249, 199, 255, 246, 254 },
+            new byte[] { 240, 177, 193, 250, 200, 255, 246, 254 },
+            new byte[] { 240, 177, 193, 250, 200, 255, 246, 254 },
+            new byte[] { 241, 178, 194, 250, 201, 255, 246, 254 },
+            new byte[] { 241, 178, 194, 250, 201, 255, 246, 254 },
+            new byte[] { 242, 179, 195, 250, 202, 255, 246, 254 },
+            new byte[] { 242, 180, 195, 250, 202, 255, 246, 254 },
+            new byte[] { 242, 181, 196, 250, 203, 255, 247, 254 },
+            new byte[] { 242, 181, 196, 250, 203, 255, 247, 254 },
+            new byte[] { 243, 182, 197, 251, 204, 255, 247, 254 },
+            new byte[] { 243, 183, 197, 251, 204, 255, 247, 254 },
+            new byte[] { 244, 184, 198, 251, 205, 255, 247, 254 },
+            new byte[] { 244, 184, 198, 251, 205, 255, 247, 254 },
+            new byte[] { 244, 185, 199, 251, 206, 255, 247, 254 },
+            new byte[] { 244, 185, 199, 251, 206, 255, 247, 254 },
+            new byte[] { 245, 186, 200, 251, 207, 255, 247, 254 },
+            new byte[] { 245, 187, 200, 251, 207, 255, 247, 254 },
+            new byte[] { 246, 188, 201, 252, 207, 255, 248, 254 },
+            new byte[] { 246, 188, 201, 252, 207, 255, 248, 254 },
+            new byte[] { 246, 189, 202, 252, 208, 255, 248, 254 },
+            new byte[] { 246, 190, 202, 252, 208, 255, 248, 254 },
+            new byte[] { 247, 191, 203, 252, 209, 255, 248, 254 },
+            new byte[] { 247, 191, 203, 252, 209, 255, 248, 254 },
+            new byte[] { 247, 192, 204, 252, 210, 255, 248, 254 },
+            new byte[] { 247, 193, 204, 252, 210, 255, 248, 254 },
+            new byte[] { 248, 194, 205, 252, 211, 255, 248, 254 },
+            new byte[] { 248, 194, 205, 252, 211, 255, 248, 254 },
+            new byte[] { 248, 195, 206, 252, 212, 255, 249, 254 },
+            new byte[] { 248, 196, 206, 252, 212, 255, 249, 254 },
+            new byte[] { 249, 197, 207, 253, 213, 255, 249, 254 },
+            new byte[] { 249, 197, 207, 253, 213, 255, 249, 254 },
+            new byte[] { 249, 198, 208, 253, 214, 255, 249, 254 },
+            new byte[] { 249, 199, 209, 253, 214, 255, 249, 254 },
+            new byte[] { 250, 200, 210, 253, 215, 255, 249, 254 },
+            new byte[] { 250, 200, 210, 253, 215, 255, 249, 254 },
+            new byte[] { 250, 201, 211, 253, 215, 255, 249, 254 },
+            new byte[] { 250, 202, 211, 253, 215, 255, 249, 254 },
+            new byte[] { 250, 203, 212, 253, 216, 255, 249, 254 },
+            new byte[] { 250, 203, 212, 253, 216, 255, 249, 254 },
+            new byte[] { 251, 204, 213, 253, 217, 255, 250, 254 },
+            new byte[] { 251, 205, 213, 253, 217, 255, 250, 254 },
+            new byte[] { 251, 206, 214, 254, 218, 255, 250, 254 },
+            new byte[] { 251, 206, 215, 254, 218, 255, 250, 254 },
+            new byte[] { 252, 207, 216, 254, 219, 255, 250, 254 },
+            new byte[] { 252, 208, 216, 254, 219, 255, 250, 254 },
+            new byte[] { 252, 209, 217, 254, 220, 255, 250, 254 },
+            new byte[] { 252, 210, 217, 254, 220, 255, 250, 254 },
+            new byte[] { 252, 211, 218, 254, 221, 255, 250, 254 },
+            new byte[] { 252, 212, 218, 254, 221, 255, 250, 254 },
+            new byte[] { 253, 213, 219, 254, 222, 255, 250, 254 },
+            new byte[] { 253, 213, 220, 254, 222, 255, 250, 254 },
+            new byte[] { 253, 214, 221, 254, 223, 255, 250, 254 },
+            new byte[] { 253, 215, 221, 254, 223, 255, 250, 254 },
+            new byte[] { 253, 216, 222, 254, 224, 255, 251, 254 },
+            new byte[] { 253, 217, 223, 254, 224, 255, 251, 254 },
+            new byte[] { 253, 218, 224, 254, 225, 255, 251, 254 },
+            new byte[] { 253, 219, 224, 254, 225, 255, 251, 254 },
+            new byte[] { 254, 220, 225, 254, 225, 255, 251, 254 },
+            new byte[] { 254, 221, 226, 254, 225, 255, 251, 254 },
+            new byte[] { 254, 222, 227, 255, 226, 255, 251, 254 },
+            new byte[] { 254, 223, 227, 255, 226, 255, 251, 254 },
+            new byte[] { 254, 224, 228, 255, 227, 255, 251, 254 },
+            new byte[] { 254, 225, 229, 255, 227, 255, 251, 254 },
+            new byte[] { 254, 226, 230, 255, 228, 255, 251, 254 },
+            new byte[] { 254, 227, 230, 255, 229, 255, 251, 254 },
+            new byte[] { 255, 228, 231, 255, 230, 255, 251, 254 },
+            new byte[] { 255, 229, 232, 255, 230, 255, 251, 254 },
+            new byte[] { 255, 230, 233, 255, 231, 255, 252, 254 },
+            new byte[] { 255, 231, 234, 255, 231, 255, 252, 254 },
+            new byte[] { 255, 232, 235, 255, 232, 255, 252, 254 },
+            new byte[] { 255, 233, 236, 255, 232, 255, 252, 254 },
+            new byte[] { 255, 235, 237, 255, 233, 255, 252, 254 },
+            new byte[] { 255, 236, 238, 255, 234, 255, 252, 254 },
+            new byte[] { 255, 238, 240, 255, 235, 255, 252, 255 },
+            new byte[] { 255, 239, 241, 255, 235, 255, 252, 254 },
+            new byte[] { 255, 241, 243, 255, 236, 255, 252, 254 },
+            new byte[] { 255, 243, 245, 255, 237, 255, 252, 254 },
+            new byte[] { 255, 246, 247, 255, 239, 255, 253, 255 }
+        };
+
+        internal static readonly byte[] DefaultCoefProbs4x4 =
+        {
+            // Y plane
+            // Intra
+            // Band 0
+            195, 29, 183, 84, 49, 136, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            31, 107, 169, 35, 99, 159, 17, 82, 140, 8, 66, 114, 2, 44, 76, 1, 19, 32,
+            // Band 2
+            40, 132, 201, 29, 114, 187, 13, 91, 157, 7, 75, 127, 3, 58, 95, 1, 28, 47,
+            // Band 3
+            69, 142, 221, 42, 122, 201, 15, 91, 159, 6, 67, 121, 1, 42, 77, 1, 17, 31,
+            // Band 4
+            102, 148, 228, 67, 117, 204, 17, 82, 154, 6, 59, 114, 2, 39, 75, 1, 15, 29,
+            // Band 5
+            156, 57, 233, 119, 57, 212, 58, 48, 163, 29, 40, 124, 12, 30, 81, 3, 12, 31,
+            // Inter
+            // Band 0
+            191, 107, 226, 124, 117, 204, 25, 99, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            29, 148, 210, 37, 126, 194, 8, 93, 157, 2, 68, 118, 1, 39, 69, 1, 17, 33,
+            // Band 2
+            41, 151, 213, 27, 123, 193, 3, 82, 144, 1, 58, 105, 1, 32, 60, 1, 13, 26,
+            // Band 3
+            59, 159, 220, 23, 126, 198, 4, 88, 151, 1, 66, 114, 1, 38, 71, 1, 18, 34,
+            // Band 4
+            114, 136, 232, 51, 114, 207, 11, 83, 155, 3, 56, 105, 1, 33, 65, 1, 17, 34,
+            // Band 5
+            149, 65, 234, 121, 57, 215, 61, 49, 166, 28, 36, 114, 12, 25, 76, 3, 16, 42,
+            // UV plane
+            // Intra
+            // Band 0
+            214, 49, 220, 132, 63, 188, 42, 65, 137, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            85, 137, 221, 104, 131, 216, 49, 111, 192, 21, 87, 155, 2, 49, 87, 1, 16, 28,
+            // Band 2
+            89, 163, 230, 90, 137, 220, 29, 100, 183, 10, 70, 135, 2, 42, 81, 1, 17, 33,
+            // Band 3
+            108, 167, 237, 55, 133, 222, 15, 97, 179, 4, 72, 135, 1, 45, 85, 1, 19, 38,
+            // Band 4
+            124, 146, 240, 66, 124, 224, 17, 88, 175, 4, 58, 122, 1, 36, 75, 1, 18, 37,
+            // Band 5
+            141, 79, 241, 126, 70, 227, 66, 58, 182, 30, 44, 136, 12, 34, 96, 2, 20, 47,
+            // Inter
+            // Band 0
+            229, 99, 249, 143, 111, 235, 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            82, 158, 236, 94, 146, 224, 25, 117, 191, 9, 87, 149, 3, 56, 99, 1, 33, 57,
+            // Band 2
+            83, 167, 237, 68, 145, 222, 10, 103, 177, 2, 72, 131, 1, 41, 79, 1, 20, 39,
+            // Band 3
+            99, 167, 239, 47, 141, 224, 10, 104, 178, 2, 73, 133, 1, 44, 85, 1, 22, 47,
+            // Band 4
+            127, 145, 243, 71, 129, 228, 17, 93, 177, 3, 61, 124, 1, 41, 84, 1, 21, 52,
+            // Band 5
+            157, 78, 244, 140, 72, 231, 69, 58, 184, 31, 44, 137, 14, 38, 105, 8, 23, 61
+        };
+
+        internal static readonly byte[] DefaultCoefProbs8x8 =
+        {
+            // Y plane
+            // Intra
+            // Band 0
+            125, 34, 187, 52, 41, 133, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            37, 109, 153, 51, 102, 147, 23, 87, 128, 8, 67, 101, 1, 41, 63, 1, 19, 29,
+            // Band 2
+            31, 154, 185, 17, 127, 175, 6, 96, 145, 2, 73, 114, 1, 51, 82, 1, 28, 45,
+            // Band 3
+            23, 163, 200, 10, 131, 185, 2, 93, 148, 1, 67, 111, 1, 41, 69, 1, 14, 24,
+            // Band 4
+            29, 176, 217, 12, 145, 201, 3, 101, 156, 1, 69, 111, 1, 39, 63, 1, 14, 23,
+            // Band 5
+            57, 192, 233, 25, 154, 215, 6, 109, 167, 3, 78, 118, 1, 48, 69, 1, 21, 29,
+            // Inter
+            // Band 0
+            202, 105, 245, 108, 106, 216, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            33, 172, 219, 64, 149, 206, 14, 117, 177, 5, 90, 141, 2, 61, 95, 1, 37, 57,
+            // Band 2
+            33, 179, 220, 11, 140, 198, 1, 89, 148, 1, 60, 104, 1, 33, 57, 1, 12, 21,
+            // Band 3
+            30, 181, 221, 8, 141, 198, 1, 87, 145, 1, 58, 100, 1, 31, 55, 1, 12, 20,
+            // Band 4
+            32, 186, 224, 7, 142, 198, 1, 86, 143, 1, 58, 100, 1, 31, 55, 1, 12, 22,
+            // Band 5
+            57, 192, 227, 20, 143, 204, 3, 96, 154, 1, 68, 112, 1, 42, 69, 1, 19, 32,
+            // UV plane
+            // Intra
+            // Band 0
+            212, 35, 215, 113, 47, 169, 29, 48, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            74, 129, 203, 106, 120, 203, 49, 107, 178, 19, 84, 144, 4, 50, 84, 1, 15, 25,
+            // Band 2
+            71, 172, 217, 44, 141, 209, 15, 102, 173, 6, 76, 133, 2, 51, 89, 1, 24, 42,
+            // Band 3
+            64, 185, 231, 31, 148, 216, 8, 103, 175, 3, 74, 131, 1, 46, 81, 1, 18, 30,
+            // Band 4
+            65, 196, 235, 25, 157, 221, 5, 105, 174, 1, 67, 120, 1, 38, 69, 1, 15, 30,
+            // Band 5
+            65, 204, 238, 30, 156, 224, 7, 107, 177, 2, 70, 124, 1, 42, 73, 1, 18, 34,
+            // Inter
+            // Band 0
+            225, 86, 251, 144, 104, 235, 42, 99, 181, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            85, 175, 239, 112, 165, 229, 29, 136, 200, 12, 103, 162, 6, 77, 123, 2, 53, 84,
+            // Band 2
+            75, 183, 239, 30, 155, 221, 3, 106, 171, 1, 74, 128, 1, 44, 76, 1, 17, 28,
+            // Band 3
+            73, 185, 240, 27, 159, 222, 2, 107, 172, 1, 75, 127, 1, 42, 73, 1, 17, 29,
+            // Band 4
+            62, 190, 238, 21, 159, 222, 2, 107, 172, 1, 72, 122, 1, 40, 71, 1, 18, 32,
+            // Band 5
+            61, 199, 240, 27, 161, 226, 4, 113, 180, 1, 76, 129, 1, 46, 80, 1, 23, 41
+        };
+
+        internal static readonly byte[] DefaultCoefProbs16x16 =
+        {
+            // Y plane
+            // Intra
+            // Band 0
+            7, 27, 153, 5, 30, 95, 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            50, 75, 127, 57, 75, 124, 27, 67, 108, 10, 54, 86, 1, 33, 52, 1, 12, 18,
+            // Band 2
+            43, 125, 151, 26, 108, 148, 7, 83, 122, 2, 59, 89, 1, 38, 60, 1, 17, 27,
+            // Band 3
+            23, 144, 163, 13, 112, 154, 2, 75, 117, 1, 50, 81, 1, 31, 51, 1, 14, 23,
+            // Band 4
+            18, 162, 185, 6, 123, 171, 1, 78, 125, 1, 51, 86, 1, 31, 54, 1, 14, 23,
+            // Band 5
+            15, 199, 227, 3, 150, 204, 1, 91, 146, 1, 55, 95, 1, 30, 53, 1, 11, 20,
+            // Inter
+            // Band 0
+            19, 55, 240, 19, 59, 196, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            41, 166, 207, 104, 153, 199, 31, 123, 181, 14, 101, 152, 5, 72, 106, 1, 36, 52,
+            // Band 2
+            35, 176, 211, 12, 131, 190, 2, 88, 144, 1, 60, 101, 1, 36, 60, 1, 16, 28,
+            // Band 3
+            28, 183, 213, 8, 134, 191, 1, 86, 142, 1, 56, 96, 1, 30, 53, 1, 12, 20,
+            // Band 4
+            20, 190, 215, 4, 135, 192, 1, 84, 139, 1, 53, 91, 1, 28, 49, 1, 11, 20,
+            // Band 5
+            13, 196, 216, 2, 137, 192, 1, 86, 143, 1, 57, 99, 1, 32, 56, 1, 13, 24,
+            // UV plane
+            // Intra
+            // Band 0
+            211, 29, 217, 96, 47, 156, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            78, 120, 193, 111, 116, 186, 46, 102, 164, 15, 80, 128, 2, 49, 76, 1, 18, 28,
+            // Band 2
+            71, 161, 203, 42, 132, 192, 10, 98, 150, 3, 69, 109, 1, 44, 70, 1, 18, 29,
+            // Band 3
+            57, 186, 211, 30, 140, 196, 4, 93, 146, 1, 62, 102, 1, 38, 65, 1, 16, 27,
+            // Band 4
+            47, 199, 217, 14, 145, 196, 1, 88, 142, 1, 57, 98, 1, 36, 62, 1, 15, 26,
+            // Band 5
+            26, 219, 229, 5, 155, 207, 1, 94, 151, 1, 60, 104, 1, 36, 62, 1, 16, 28,
+            // Inter
+            // Band 0
+            233, 29, 248, 146, 47, 220, 43, 52, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            100, 163, 232, 179, 161, 222, 63, 142, 204, 37, 113, 174, 26, 89, 137, 18, 68, 97,
+            // Band 2
+            85, 181, 230, 32, 146, 209, 7, 100, 164, 3, 71, 121, 1, 45, 77, 1, 18, 30,
+            // Band 3
+            65, 187, 230, 20, 148, 207, 2, 97, 159, 1, 68, 116, 1, 40, 70, 1, 14, 29,
+            // Band 4
+            40, 194, 227, 8, 147, 204, 1, 94, 155, 1, 65, 112, 1, 39, 66, 1, 14, 26,
+            // Band 5
+            16, 208, 228, 3, 151, 207, 1, 98, 160, 1, 67, 117, 1, 41, 74, 1, 17, 31
+        };
+
+        internal static readonly byte[] DefaultCoefProbs32x32 =
+        {
+            // Y plane
+            // Intra
+            // Band 0
+            17, 38, 140, 7, 34, 80, 1, 17, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            37, 75, 128, 41, 76, 128, 26, 66, 116, 12, 52, 94, 2, 32, 55, 1, 10, 16,
+            // Band 2
+            50, 127, 154, 37, 109, 152, 16, 82, 121, 5, 59, 85, 1, 35, 54, 1, 13, 20,
+            // Band 3
+            40, 142, 167, 17, 110, 157, 2, 71, 112, 1, 44, 72, 1, 27, 45, 1, 11, 17,
+            // Band 4
+            30, 175, 188, 9, 124, 169, 1, 74, 116, 1, 48, 78, 1, 30, 49, 1, 11, 18,
+            // Band 5
+            10, 222, 223, 2, 150, 194, 1, 83, 128, 1, 48, 79, 1, 27, 45, 1, 11, 17,
+            // Inter
+            // Band 0
+            36, 41, 235, 29, 36, 193, 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            85, 165, 222, 177, 162, 215, 110, 135, 195, 57, 113, 168, 23, 83, 120, 10, 49, 61,
+            // Band 2
+            85, 190, 223, 36, 139, 200, 5, 90, 146, 1, 60, 103, 1, 38, 65, 1, 18, 30,
+            // Band 3
+            72, 202, 223, 23, 141, 199, 2, 86, 140, 1, 56, 97, 1, 36, 61, 1, 16, 27,
+            // Band 4
+            55, 218, 225, 13, 145, 200, 1, 86, 141, 1, 57, 99, 1, 35, 61, 1, 13, 22,
+            // Band 5
+            15, 235, 212, 1, 132, 184, 1, 84, 139, 1, 57, 97, 1, 34, 56, 1, 14, 23,
+            // UV plane
+            // Intra
+            // Band 0
+            181, 21, 201, 61, 37, 123, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            47, 106, 172, 95, 104, 173, 42, 93, 159, 18, 77, 131, 4, 50, 81, 1, 17, 23,
+            // Band 2
+            62, 147, 199, 44, 130, 189, 28, 102, 154, 18, 75, 115, 2, 44, 65, 1, 12, 19,
+            // Band 3
+            55, 153, 210, 24, 130, 194, 3, 93, 146, 1, 61, 97, 1, 31, 50, 1, 10, 16,
+            // Band 4
+            49, 186, 223, 17, 148, 204, 1, 96, 142, 1, 53, 83, 1, 26, 44, 1, 11, 17,
+            // Band 5
+            13, 217, 212, 2, 136, 180, 1, 78, 124, 1, 50, 83, 1, 29, 49, 1, 14, 23,
+            // Inter
+            // Band 0
+            197, 13, 247, 82, 17, 222, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            // Band 1
+            126, 186, 247, 234, 191, 243, 176, 177, 234, 104, 158, 220, 66, 128, 186, 55, 90, 137,
+            // Band 2
+            111, 197, 242, 46, 158, 219, 9, 104, 171, 2, 65, 125, 1, 44, 80, 1, 17, 91,
+            // Band 3
+            104, 208, 245, 39, 168, 224, 3, 109, 162, 1, 79, 124, 1, 50, 102, 1, 43, 102,
+            // Band 4
+            84, 220, 246, 31, 177, 231, 2, 115, 180, 1, 79, 134, 1, 55, 77, 1, 60, 79,
+            // Band 5
+            43, 243, 240, 8, 180, 217, 1, 115, 166, 1, 84, 121, 1, 51, 67, 1, 16, 6
+        };
+
+        // Picks the coefficient band lookup table for a transform size.
+        // txSize is compared as an integer against TxSize.Tx4x4: that value selects
+        // CoefbandTrans4x4, every other value selects CoefbandTrans8x8Plus.
+        public static byte[] GetBandTranslate(int txSize)
+        {
+            if (txSize == (int)TxSize.Tx4x4)
+            {
+                return CoefbandTrans4x4;
+            }
+
+            return CoefbandTrans8x8Plus;
+        }
+
+        // Overwrites the raw bytes of dest with the contents of probs.
+        //
+        // dest:  value whose in-memory representation is replaced.
+        // probs: source bytes; its length must equal the size of T in bytes.
+        //
+        // Throws ArgumentException when the sizes differ; nothing is copied in that case.
+        public static void CopyProbs<T>(ref T dest, ReadOnlySpan<byte> probs) where T : unmanaged
+        {
+            int destSize = Unsafe.SizeOf<T>();
+
+            if (destSize != probs.Length)
+            {
+                // ArgumentException derives from Exception, so callers catching Exception are unaffected.
+                throw new ArgumentException("size mismatch expected: " + probs.Length + " got: " + destSize, nameof(probs));
+            }
+
+            // View dest as a span over its own bytes and copy the table on top of it.
+            probs.CopyTo(MemoryMarshal.Cast<T, byte>(MemoryMarshal.CreateSpan(ref dest, 1)));
+        }
+
+        // Three (count saturation, maximum update factor) pairs: a default pair, a "Key" pair
+        // and an "AfterKey" pair. Only the AfterKey update factor (128) differs from the others (112).
+        // NOTE(review): presumably consumed by the backward adaptation of coefficient
+        // probabilities, chosen by frame kind — confirm at the use site.
+        internal const int CoefCountSat = 24;
+        internal const int CoefMaxUpdateFactor = 112;
+        internal const int CoefCountSatKey = 24;
+        internal const int CoefMaxUpdateFactorKey = 112;
+        internal const int CoefCountSatAfterKey = 24;
+        internal const int CoefMaxUpdateFactorAfterKey = 128;
+    }
+}

+ 400 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMode.cs

@@ -0,0 +1,400 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using Ryujinx.Graphics.Video;
+using System;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal class EntropyMode
+    {
+        // NOTE(review): presumably the number of block-size groups used for Y mode
+        // probabilities (DefaultIfYProbs in this class has four 9-byte rows) — confirm.
+        public const int BlockSizeGroups = 4;
+
+        // NOTE(review): presumably the number of contexts used when coding the transform size — confirm.
+        public const int TxSizeContexts = 2;
+
+        public static readonly byte[][][] KfYModeProb =
+        {
+            new[]
+            {
+                // above = dc
+                new byte[] { 137, 30, 42, 148, 151, 207, 70, 52, 91 }, // left = dc
+                new byte[] { 92, 45, 102, 136, 116, 180, 74, 90, 100 }, // left = v
+                new byte[] { 73, 32, 19, 187, 222, 215, 46, 34, 100 }, // left = h
+                new byte[] { 91, 30, 32, 116, 121, 186, 93, 86, 94 }, // left = d45
+                new byte[] { 72, 35, 36, 149, 68, 206, 68, 63, 105 }, // left = d135
+                new byte[] { 73, 31, 28, 138, 57, 124, 55, 122, 151 }, // left = d117
+                new byte[] { 67, 23, 21, 140, 126, 197, 40, 37, 171 }, // left = d153
+                new byte[] { 86, 27, 28, 128, 154, 212, 45, 43, 53 }, // left = d207
+                new byte[] { 74, 32, 27, 107, 86, 160, 63, 134, 102 }, // left = d63
+                new byte[] { 59, 67, 44, 140, 161, 202, 78, 67, 119 } // left = tm
+            },
+            new[]
+            {
+                // above = v
+                new byte[] { 63, 36, 126, 146, 123, 158, 60, 90, 96 }, // left = dc
+                new byte[] { 43, 46, 168, 134, 107, 128, 69, 142, 92 }, // left = v
+                new byte[] { 44, 29, 68, 159, 201, 177, 50, 57, 77 }, // left = h
+                new byte[] { 58, 38, 76, 114, 97, 172, 78, 133, 92 }, // left = d45
+                new byte[] { 46, 41, 76, 140, 63, 184, 69, 112, 57 }, // left = d135
+                new byte[] { 38, 32, 85, 140, 46, 112, 54, 151, 133 }, // left = d117
+                new byte[] { 39, 27, 61, 131, 110, 175, 44, 75, 136 }, // left = d153
+                new byte[] { 52, 30, 74, 113, 130, 175, 51, 64, 58 }, // left = d207
+                new byte[] { 47, 35, 80, 100, 74, 143, 64, 163, 74 }, // left = d63
+                new byte[] { 36, 61, 116, 114, 128, 162, 80, 125, 82 } // left = tm
+            },
+            new[]
+            {
+                // above = h
+                new byte[] { 82, 26, 26, 171, 208, 204, 44, 32, 105 }, // left = dc
+                new byte[] { 55, 44, 68, 166, 179, 192, 57, 57, 108 }, // left = v
+                new byte[] { 42, 26, 11, 199, 241, 228, 23, 15, 85 }, // left = h
+                new byte[] { 68, 42, 19, 131, 160, 199, 55, 52, 83 }, // left = d45
+                new byte[] { 58, 50, 25, 139, 115, 232, 39, 52, 118 }, // left = d135
+                new byte[] { 50, 35, 33, 153, 104, 162, 64, 59, 131 }, // left = d117
+                new byte[] { 44, 24, 16, 150, 177, 202, 33, 19, 156 }, // left = d153
+                new byte[] { 55, 27, 12, 153, 203, 218, 26, 27, 49 }, // left = d207
+                new byte[] { 53, 49, 21, 110, 116, 168, 59, 80, 76 }, // left = d63
+                new byte[] { 38, 72, 19, 168, 203, 212, 50, 50, 107 } // left = tm
+            },
+            new[]
+            {
+                // above = d45
+                new byte[] { 103, 26, 36, 129, 132, 201, 83, 80, 93 }, // left = dc
+                new byte[] { 59, 38, 83, 112, 103, 162, 98, 136, 90 }, // left = v
+                new byte[] { 62, 30, 23, 158, 200, 207, 59, 57, 50 }, // left = h
+                new byte[] { 67, 30, 29, 84, 86, 191, 102, 91, 59 }, // left = d45
+                new byte[] { 60, 32, 33, 112, 71, 220, 64, 89, 104 }, // left = d135
+                new byte[] { 53, 26, 34, 130, 56, 149, 84, 120, 103 }, // left = d117
+                new byte[] { 53, 21, 23, 133, 109, 210, 56, 77, 172 }, // left = d153
+                new byte[] { 77, 19, 29, 112, 142, 228, 55, 66, 36 }, // left = d207
+                new byte[] { 61, 29, 29, 93, 97, 165, 83, 175, 162 }, // left = d63
+                new byte[] { 47, 47, 43, 114, 137, 181, 100, 99, 95 } // left = tm
+            },
+            new[]
+            {
+                // above = d135
+                new byte[] { 69, 23, 29, 128, 83, 199, 46, 44, 101 }, // left = dc
+                new byte[] { 53, 40, 55, 139, 69, 183, 61, 80, 110 }, // left = v
+                new byte[] { 40, 29, 19, 161, 180, 207, 43, 24, 91 }, // left = h
+                new byte[] { 60, 34, 19, 105, 61, 198, 53, 64, 89 }, // left = d45
+                new byte[] { 52, 31, 22, 158, 40, 209, 58, 62, 89 }, // left = d135
+                new byte[] { 44, 31, 29, 147, 46, 158, 56, 102, 198 }, // left = d117
+                new byte[] { 35, 19, 12, 135, 87, 209, 41, 45, 167 }, // left = d153
+                new byte[] { 55, 25, 21, 118, 95, 215, 38, 39, 66 }, // left = d207
+                new byte[] { 51, 38, 25, 113, 58, 164, 70, 93, 97 }, // left = d63
+                new byte[] { 47, 54, 34, 146, 108, 203, 72, 103, 151 } // left = tm
+            },
+            new[]
+            {
+                // above = d117
+                new byte[] { 64, 19, 37, 156, 66, 138, 49, 95, 133 }, // left = dc
+                new byte[] { 46, 27, 80, 150, 55, 124, 55, 121, 135 }, // left = v
+                new byte[] { 36, 23, 27, 165, 149, 166, 54, 64, 118 }, // left = h
+                new byte[] { 53, 21, 36, 131, 63, 163, 60, 109, 81 }, // left = d45
+                new byte[] { 40, 26, 35, 154, 40, 185, 51, 97, 123 }, // left = d135
+                new byte[] { 35, 19, 34, 179, 19, 97, 48, 129, 124 }, // left = d117
+                new byte[] { 36, 20, 26, 136, 62, 164, 33, 77, 154 }, // left = d153
+                new byte[] { 45, 18, 32, 130, 90, 157, 40, 79, 91 }, // left = d207
+                new byte[] { 45, 26, 28, 129, 45, 129, 49, 147, 123 }, // left = d63
+                new byte[] { 38, 44, 51, 136, 74, 162, 57, 97, 121 } // left = tm
+            },
+            new[]
+            {
+                // above = d153
+                new byte[] { 75, 17, 22, 136, 138, 185, 32, 34, 166 }, // left = dc
+                new byte[] { 56, 39, 58, 133, 117, 173, 48, 53, 187 }, // left = v
+                new byte[] { 35, 21, 12, 161, 212, 207, 20, 23, 145 }, // left = h
+                new byte[] { 56, 29, 19, 117, 109, 181, 55, 68, 112 }, // left = d45
+                new byte[] { 47, 29, 17, 153, 64, 220, 59, 51, 114 }, // left = d135
+                new byte[] { 46, 16, 24, 136, 76, 147, 41, 64, 172 }, // left = d117
+                new byte[] { 34, 17, 11, 108, 152, 187, 13, 15, 209 }, // left = d153
+                new byte[] { 51, 24, 14, 115, 133, 209, 32, 26, 104 }, // left = d207
+                new byte[] { 55, 30, 18, 122, 79, 179, 44, 88, 116 }, // left = d63
+                new byte[] { 37, 49, 25, 129, 168, 164, 41, 54, 148 } // left = tm
+            },
+            new[]
+            {
+                // above = d207
+                new byte[] { 82, 22, 32, 127, 143, 213, 39, 41, 70 }, // left = dc
+                new byte[] { 62, 44, 61, 123, 105, 189, 48, 57, 64 }, // left = v
+                new byte[] { 47, 25, 17, 175, 222, 220, 24, 30, 86 }, // left = h
+                new byte[] { 68, 36, 17, 106, 102, 206, 59, 74, 74 }, // left = d45
+                new byte[] { 57, 39, 23, 151, 68, 216, 55, 63, 58 }, // left = d135
+                new byte[] { 49, 30, 35, 141, 70, 168, 82, 40, 115 }, // left = d117
+                new byte[] { 51, 25, 15, 136, 129, 202, 38, 35, 139 }, // left = d153
+                new byte[] { 68, 26, 16, 111, 141, 215, 29, 28, 28 }, // left = d207
+                new byte[] { 59, 39, 19, 114, 75, 180, 77, 104, 42 }, // left = d63
+                new byte[] { 40, 61, 26, 126, 152, 206, 61, 59, 93 } // left = tm
+            },
+            new[]
+            {
+                // above = d63
+                new byte[] { 78, 23, 39, 111, 117, 170, 74, 124, 94 }, // left = dc
+                new byte[] { 48, 34, 86, 101, 92, 146, 78, 179, 134 }, // left = v
+                new byte[] { 47, 22, 24, 138, 187, 178, 68, 69, 59 }, // left = h
+                new byte[] { 56, 25, 33, 105, 112, 187, 95, 177, 129 }, // left = d45
+                new byte[] { 48, 31, 27, 114, 63, 183, 82, 116, 56 }, // left = d135
+                new byte[] { 43, 28, 37, 121, 63, 123, 61, 192, 169 }, // left = d117
+                new byte[] { 42, 17, 24, 109, 97, 177, 56, 76, 122 }, // left = d153
+                new byte[] { 58, 18, 28, 105, 139, 182, 70, 92, 63 }, // left = d207
+                new byte[] { 46, 23, 32, 74, 86, 150, 67, 183, 88 }, // left = d63
+                new byte[] { 36, 38, 48, 92, 122, 165, 88, 137, 91 } // left = tm
+            },
+            new[]
+            {
+                // above = tm
+                new byte[] { 65, 70, 60, 155, 159, 199, 61, 60, 81 }, // left = dc
+                new byte[] { 44, 78, 115, 132, 119, 173, 71, 112, 93 }, // left = v
+                new byte[] { 39, 38, 21, 184, 227, 206, 42, 32, 64 }, // left = h
+                new byte[] { 58, 47, 36, 124, 137, 193, 80, 82, 78 }, // left = d45
+                new byte[] { 49, 50, 35, 144, 95, 205, 63, 78, 59 }, // left = d135
+                new byte[] { 41, 53, 52, 148, 71, 142, 65, 128, 51 }, // left = d117
+                new byte[] { 40, 36, 28, 143, 143, 202, 40, 55, 137 }, // left = d153
+                new byte[] { 52, 34, 29, 129, 183, 227, 42, 35, 43 }, // left = d207
+                new byte[] { 42, 44, 44, 104, 105, 164, 64, 130, 80 }, // left = d63
+                new byte[] { 43, 81, 53, 140, 169, 204, 68, 84, 72 } // left = tm
+            }
+        };
+
+        // UV mode probabilities: ten rows of 9 bytes, one row per Y prediction mode
+        // (the trailing comment on each row names that Y mode).
+        // NOTE(review): the "Kf" prefix suggests this table is the key-frame variant — confirm at the use site.
+        public static readonly byte[][] KfUvModeProb =
+        {
+            new byte[] { 144, 11, 54, 157, 195, 130, 46, 58, 108 }, // y = dc
+            new byte[] { 118, 15, 123, 148, 131, 101, 44, 93, 131 }, // y = v
+            new byte[] { 113, 12, 23, 188, 226, 142, 26, 32, 125 }, // y = h
+            new byte[] { 120, 11, 50, 123, 163, 135, 64, 77, 103 }, // y = d45
+            new byte[] { 113, 9, 36, 155, 111, 157, 32, 44, 161 }, // y = d135
+            new byte[] { 116, 9, 55, 176, 76, 96, 37, 61, 149 }, // y = d117
+            new byte[] { 115, 9, 28, 141, 161, 167, 21, 25, 193 }, // y = d153
+            new byte[] { 120, 12, 32, 145, 195, 142, 32, 38, 86 }, // y = d207
+            new byte[] { 116, 12, 64, 120, 140, 125, 49, 115, 121 }, // y = d63
+            new byte[] { 102, 19, 66, 162, 182, 122, 35, 59, 128 } // y = tm
+        };
+
+        // Default Y mode probabilities, stored flat: four rows of 9 bytes, one row per
+        // block-size class (see the row comments). InitModeProbs copies the whole array
+        // into Vp9EntropyProbs.YModeProb.
+        private static readonly byte[] DefaultIfYProbs =
+        {
+            65, 32, 18, 144, 162, 194, 41, 51, 98, // block_size < 8x8
+            132, 68, 18, 165, 217, 196, 45, 40, 78, // block_size < 16x16
+            173, 80, 19, 176, 240, 193, 64, 35, 46, // block_size < 32x32
+            221, 135, 38, 194, 248, 121, 96, 85, 29 // block_size >= 32x32
+        };
+
+        // Default UV mode probabilities, stored flat: ten rows of 9 bytes, one row per
+        // Y prediction mode (see the row comments). InitModeProbs copies the whole array
+        // into Vp9EntropyProbs.UvModeProb.
+        private static readonly byte[] DefaultIfUvProbs =
+        {
+            120, 7, 76, 176, 208, 126, 28, 54, 103, // y = dc
+            48, 12, 154, 155, 139, 90, 34, 117, 119, // y = v
+            67, 6, 25, 204, 243, 158, 13, 21, 96, // y = h
+            97, 5, 44, 131, 176, 139, 48, 68, 97, // y = d45
+            83, 5, 42, 156, 111, 152, 26, 49, 152, // y = d135
+            80, 5, 58, 178, 74, 83, 33, 62, 145, // y = d117
+            86, 5, 32, 154, 192, 168, 14, 22, 163, // y = d153
+            85, 5, 32, 156, 216, 148, 19, 29, 73, // y = d207
+            77, 7, 64, 116, 132, 122, 37, 126, 120, // y = d63
+            101, 21, 107, 181, 192, 103, 19, 67, 125 // y = tm
+        };
+
+        // Default partition probabilities, stored flat: 16 rows of 3 bytes. Rows are grouped
+        // by block-size step (four groups), and within each group by whether the above (a)
+        // and left (l) neighbours are split. InitModeProbs copies the whole array into
+        // Vp9EntropyProbs.PartitionProb.
+        private static readonly byte[] DefaultPartitionProbs =
+        {
+            // 8x8 -> 4x4
+            199, 122, 141, // a/l both not split
+            147, 63, 159, // a split, l not split
+            148, 133, 118, // l split, a not split
+            121, 104, 114, // a/l both split
+            // 16x16 -> 8x8
+            174, 73, 87, // a/l both not split
+            92, 41, 83, // a split, l not split
+            82, 99, 50, // l split, a not split
+            53, 39, 39, // a/l both split
+            // 32x32 -> 16x16
+            177, 58, 59, // a/l both not split
+            68, 26, 63, // a split, l not split
+            52, 79, 25, // l split, a not split
+            17, 14, 12, // a/l both split
+            // 64x64 -> 32x32
+            222, 34, 30, // a/l both not split
+            72, 16, 44, // a split, l not split
+            58, 32, 12, // l split, a not split
+            10, 7, 6 // a/l both split
+        };
+
+        // Default inter mode probabilities, stored flat: seven rows of 3 bytes, one row per
+        // neighbour context (the row comments number the contexts 0-6). InitModeProbs copies
+        // the whole array into Vp9EntropyProbs.InterModeProb.
+        private static readonly byte[] DefaultInterModeProbs =
+        {
+            2, 173, 34, // 0 = both zero mv
+            7, 145, 85, // 1 = one zero mv + one a predicted mv
+            7, 166, 63, // 2 = two predicted mvs
+            7, 94, 66, // 3 = one predicted/zero and one new mv
+            8, 64, 46, // 4 = two new mvs
+            17, 81, 31, // 5 = one intra neighbour + x
+            25, 29, 30 // 6 = two intra neighbours
+        };
+
+        // Array indices are identical to previously-existing INTRAMODECONTEXTNODES.
+        // Laid out one tree node (a pair of entries) per line; the trailing comment gives
+        // the node number and name.
+        public static readonly sbyte[] IntraModeTree =
+        {
+            -(int)PredictionMode.DcPred, 2, // 0 = DC_NODE
+            -(int)PredictionMode.TmPred, 4, // 1 = TM_NODE
+            -(int)PredictionMode.VPred, 6, // 2 = V_NODE
+            8, 12, // 3 = COM_NODE
+            -(int)PredictionMode.HPred, 10, // 4 = H_NODE
+            -(int)PredictionMode.D135Pred, -(int)PredictionMode.D117Pred, // 5 = D135_NODE
+            -(int)PredictionMode.D45Pred, 14, // 6 = D45_NODE
+            -(int)PredictionMode.D63Pred, 16, // 7 = D63_NODE
+            -(int)PredictionMode.D153Pred, -(int)PredictionMode.D207Pred // 8 = D153_NODE
+        };
+
+        // Tree over the four inter prediction modes. Each mode is stored as its negated
+        // offset from PredictionMode.NearestMv, so NearestMv itself is stored as 0.
+        // NOTE(review): the positive entries (2, 4) look like indices of the next node
+        // pair, as in the other trees of this class — confirm against the tree reader.
+        public static readonly sbyte[] InterModeTree =
+        {
+            -((int)PredictionMode.ZeroMv - (int)PredictionMode.NearestMv), 2,
+            -((int)PredictionMode.NearestMv - (int)PredictionMode.NearestMv), 4,
+            -((int)PredictionMode.NearMv - (int)PredictionMode.NearestMv),
+            -((int)PredictionMode.NewMv - (int)PredictionMode.NearestMv)
+        };
+
+        // Tree over the four partition types, one pair of entries per line.
+        // Partition types are stored negated.
+        public static readonly sbyte[] PartitionTree =
+        {
+            -(sbyte)PartitionType.PartitionNone, 2,
+            -(sbyte)PartitionType.PartitionHorz, 4,
+            -(sbyte)PartitionType.PartitionVert, -(sbyte)PartitionType.PartitionSplit
+        };
+
+        // Tree over the three switchable interpolation filters, one pair of entries per line.
+        // Filter constants are stored negated.
+        public static readonly sbyte[] SwitchableInterpTree =
+        {
+            -Constants.EightTap, 2,
+            -Constants.EightTapSmooth, -Constants.EightTapSharp
+        };
+
+        // Default probability tables that InitModeProbs copies into Vp9EntropyProbs:
+        // IntraInterProb, CompInterProb, CompRefProb and SingleRefProb respectively.
+        private static readonly byte[] DefaultIntraInterP = { 9, 102, 187, 225 };
+        private static readonly byte[] DefaultCompInterP = { 239, 183, 119, 96, 41 };
+        private static readonly byte[] DefaultCompRefP = { 50, 126, 123, 221, 226 };
+        private static readonly byte[] DefaultSingleRefP = { 33, 16, 77, 74, 142, 142, 172, 170, 238, 247 };
+        // Three tables packed back to back; InitModeProbs slices them apart:
+        // bytes 0-5 -> Tx32x32Prob, bytes 6-9 -> Tx16x16Prob, bytes 10-11 -> Tx8x8Prob.
+        private static readonly byte[] DefaultTxProbs = { 3, 136, 37, 5, 52, 13, 20, 152, 15, 101, 100, 66 };
+
+        // Partition probabilities: 16 rows of 3 bytes, grouped by block-size step (four
+        // groups) and, within each group, by whether the above (a) and left (l) neighbours
+        // are split.
+        // Exposed as a static field like KfYModeProb and KfUvModeProb. As a local variable
+        // of the static constructor the table was allocated and then discarded, so no code
+        // could ever read it.
+        // NOTE(review): the "Kf" prefix suggests this is the key-frame variant of
+        // DefaultPartitionProbs — confirm where it should be consumed.
+        public static readonly byte[][] KfPartitionProbs =
+        {
+            // 8x8 -> 4x4
+            new byte[] { 158, 97, 94 }, // a/l both not split
+            new byte[] { 93, 24, 99 }, // a split, l not split
+            new byte[] { 85, 119, 44 }, // l split, a not split
+            new byte[] { 62, 59, 67 }, // a/l both split
+
+            // 16x16 -> 8x8
+            new byte[] { 149, 53, 53 }, // a/l both not split
+            new byte[] { 94, 20, 48 }, // a split, l not split
+            new byte[] { 83, 53, 24 }, // l split, a not split
+            new byte[] { 52, 18, 18 }, // a/l both split
+
+            // 32x32 -> 16x16
+            new byte[] { 150, 40, 39 }, // a/l both not split
+            new byte[] { 78, 12, 26 }, // a split, l not split
+            new byte[] { 67, 33, 11 }, // l split, a not split
+            new byte[] { 24, 7, 5 }, // a/l both split
+
+            // 64x64 -> 32x32
+            new byte[] { 174, 35, 49 }, // a/l both not split
+            new byte[] { 68, 11, 27 }, // a split, l not split
+            new byte[] { 57, 15, 9 }, // l split, a not split
+            new byte[] { 12, 3, 3 } // a/l both split
+        };
+
+        // Default table that InitModeProbs copies into Vp9EntropyProbs.SkipProb.
+        private static readonly byte[] DefaultSkipProbs = { 192, 128, 64 };
+
+        // Default table that InitModeProbs copies into Vp9EntropyProbs.SwitchableInterpProb.
+        private static readonly byte[] DefaultSwitchableInterpProb = { 235, 162, 36, 255, 34, 3, 149, 144 };
+
+        // Loads every mode-related probability field of fc with its built-in default table.
+        // Each copy goes through Entropy.CopyProbs with the destination field and the source bytes.
+        private static void InitModeProbs(ref Vp9EntropyProbs fc)
+        {
+            // DefaultTxProbs holds the 32x32, 16x16 and 8x8 tables back to back (6 + 4 + 2 bytes).
+            ReadOnlySpan<byte> txProbs = DefaultTxProbs;
+
+            // Prediction mode, filter and partition tables.
+            Entropy.CopyProbs(ref fc.UvModeProb, DefaultIfUvProbs);
+            Entropy.CopyProbs(ref fc.YModeProb, DefaultIfYProbs);
+            Entropy.CopyProbs(ref fc.SwitchableInterpProb, DefaultSwitchableInterpProb);
+            Entropy.CopyProbs(ref fc.PartitionProb, DefaultPartitionProbs);
+
+            // Reference selection tables.
+            Entropy.CopyProbs(ref fc.IntraInterProb, DefaultIntraInterP);
+            Entropy.CopyProbs(ref fc.CompInterProb, DefaultCompInterP);
+            Entropy.CopyProbs(ref fc.CompRefProb, DefaultCompRefP);
+            Entropy.CopyProbs(ref fc.SingleRefProb, DefaultSingleRefP);
+
+            // Transform size tables, cut out of the packed array.
+            Entropy.CopyProbs(ref fc.Tx32x32Prob, txProbs.Slice(0, 6));
+            Entropy.CopyProbs(ref fc.Tx16x16Prob, txProbs.Slice(6, 4));
+            Entropy.CopyProbs(ref fc.Tx8x8Prob, txProbs.Slice(10, 2));
+
+            // Skip flag and inter mode tables.
+            Entropy.CopyProbs(ref fc.SkipProb, DefaultSkipProbs);
+            Entropy.CopyProbs(ref fc.InterModeProb, DefaultInterModeProbs);
+        }
+
+        // Turns per-transform-size counts (indexed by TxSize) into three pairs of branch counts.
+        // In pair k, entry [0] is the count of the k-th size and entry [1] is the sum of the
+        // counts of all larger sizes.
+        internal static void TxCountsToBranchCounts32x32(ReadOnlySpan<uint> txCount32x32P,
+            ref Array3<Array2<uint>> ct32x32P)
+        {
+            uint count4x4 = txCount32x32P[(int)TxSize.Tx4x4];
+            uint count8x8 = txCount32x32P[(int)TxSize.Tx8x8];
+            uint count16x16 = txCount32x32P[(int)TxSize.Tx16x16];
+            uint count32x32 = txCount32x32P[(int)TxSize.Tx32x32];
+
+            // Pair 0: 4x4 against everything larger.
+            ct32x32P[0][0] = count4x4;
+            ct32x32P[0][1] = count8x8 + count16x16 + count32x32;
+
+            // Pair 1: 8x8 against 16x16 and 32x32.
+            ct32x32P[1][0] = count8x8;
+            ct32x32P[1][1] = count16x16 + count32x32;
+
+            // Pair 2: 16x16 against 32x32.
+            ct32x32P[2][0] = count16x16;
+            ct32x32P[2][1] = count32x32;
+        }
+
+        // Turns per-transform-size counts (indexed by TxSize, up to Tx16x16) into two pairs
+        // of branch counts. In pair k, entry [0] is the count of the k-th size and entry [1]
+        // is the sum of the counts of all larger sizes.
+        internal static void TxCountsToBranchCounts16x16(ReadOnlySpan<uint> txCount16x16P,
+            ref Array2<Array2<uint>> ct16x16P)
+        {
+            uint count4x4 = txCount16x16P[(int)TxSize.Tx4x4];
+            uint count8x8 = txCount16x16P[(int)TxSize.Tx8x8];
+            uint count16x16 = txCount16x16P[(int)TxSize.Tx16x16];
+
+            // Pair 0: 4x4 against everything larger.
+            ct16x16P[0][0] = count4x4;
+            ct16x16P[0][1] = count8x8 + count16x16;
+
+            // Pair 1: 8x8 against 16x16.
+            ct16x16P[1][0] = count8x8;
+            ct16x16P[1][1] = count16x16;
+        }
+
+        // Copies the 4x4 and 8x8 transform-size counts (indexed by TxSize) into the single
+        // pair of branch counts: [0] receives the 4x4 count, [1] the 8x8 count.
+        internal static void TxCountsToBranchCounts8x8(ReadOnlySpan<uint> txCount8x8P,
+            ref Array1<Array2<uint>> ct8x8P)
+        {
+            uint count4x4 = txCount8x8P[(int)TxSize.Tx4x4];
+            uint count8x8 = txCount8x8P[(int)TxSize.Tx8x8];
+
+            ct8x8P[0][0] = count4x4;
+            ct8x8P[0][1] = count8x8;
+        }
+
+        // Resets the decoder state in cm that would otherwise carry over from earlier frames:
+        // segmentation features, both segment maps, loop filter deltas, the working
+        // probability context (cm.Fc), optionally the saved frame contexts, the reference
+        // sign biases and the frame context index.
+        public static unsafe void SetupPastIndependence(ref Vp9Common cm)
+        {
+            // Reset the segment feature data to the default stats:
+            // Features disabled, 0, with delta coding (Default state).
+            ref Types.LoopFilter lf = ref cm.Lf;
+
+            cm.Seg.ClearAllSegFeatures();
+            cm.Seg.AbsDelta = Segmentation.SegmentDeltadata;
+
+            // Zero both segment maps (one byte per mode-info cell), when they are allocated.
+            if (!cm.LastFrameSegMap.IsNull)
+            {
+                MemoryUtil.Fill(cm.LastFrameSegMap.ToPointer(), (byte)0, cm.MiRows * cm.MiCols);
+            }
+
+            if (!cm.CurrentFrameSegMap.IsNull)
+            {
+                MemoryUtil.Fill(cm.CurrentFrameSegMap.ToPointer(), (byte)0, cm.MiRows * cm.MiCols);
+            }
+
+            // Reset the mode ref deltas for loop filter
+            lf.LastRefDeltas = new Array4<sbyte>();
+            lf.LastModeDeltas = new Array2<sbyte>();
+            lf.SetDefaultLfDeltas();
+
+            // To force update of the sharpness
+            lf.LastSharpnessLevel = -1;
+
+            // Load the default probabilities into the working context (cm.Fc) before it is
+            // copied into the saved frame contexts below.
+            cm.DefaultCoefProbs();
+            InitModeProbs(ref cm.Fc.Value);
+            cm.InitMvProbs();
+
+            if (cm.FrameType == FrameType.KeyFrame || cm.ErrorResilientMode != 0 || cm.ResetFrameContext == 3)
+            {
+                // Reset all frame contexts.
+                for (int i = 0; i < Constants.FrameContexts; ++i)
+                {
+                    cm.FrameContexts[i] = cm.Fc.Value;
+                }
+            }
+            else if (cm.ResetFrameContext == 2)
+            {
+                // Reset only the frame context specified in the frame header.
+                cm.FrameContexts[(int)cm.FrameContextIdx] = cm.Fc.Value;
+            }
+
+            // prev_mip will only be allocated in encoder.
+            // NOTE(review): the guard tests PrevMip but the assignment clears a single
+            // ModeInfo through PrevMi — confirm that resetting one entry is intended.
+            if (cm.FrameIsIntraOnly() && !cm.PrevMip.IsNull)
+            {
+                cm.PrevMi.Value = new ModeInfo();
+            }
+
+            cm.RefFrameSignBias = new Array4<sbyte>();
+
+            // Must stay after the ResetFrameContext == 2 branch above, which reads the index.
+            cm.FrameContextIdx = 0;
+        }
+    }
+}

+ 165 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/EntropyMv.cs

@@ -0,0 +1,165 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using Ryujinx.Graphics.Video;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class EntropyMv
+    {
+        // NOTE(review): presumably the probability used when coding motion vector
+        // probability updates - confirm at the use site.
+        public const int UpdateProb = 252;
+
+        /* Symbols for coding which components are zero jointly */
+        public const int Joints = 4;
+
+
+        // Tree over the four MvJointType symbols.
+        // NOTE(review): entries look like the usual token-tree encoding (negated enum value = leaf,
+        // positive value = index of the next node pair) - confirm against the tree reader.
+        public static readonly sbyte[] JointTree =
+        {
+            -(sbyte)MvJointType.Zero, 2, -(sbyte)MvJointType.Hnzvz, 4,
+            -(sbyte)MvJointType.Hzvnz, -(sbyte)MvJointType.Hnzvnz
+        };
+
+        // Tree over the eleven MvClassType symbols (Class0 .. Class10), same encoding as JointTree.
+        public static readonly sbyte[] ClassTree =
+        {
+            -(sbyte)MvClassType.Class0, 2, -(sbyte)MvClassType.Class1, 4, 6, 8, -(sbyte)MvClassType.Class2,
+            -(sbyte)MvClassType.Class3, 10, 12, -(sbyte)MvClassType.Class4, -(sbyte)MvClassType.Class5,
+            -(sbyte)MvClassType.Class6, 14, 16, 18, -(sbyte)MvClassType.Class7, -(sbyte)MvClassType.Class8,
+            -(sbyte)MvClassType.Class9, -(sbyte)MvClassType.Class10
+        };
+
+        // Two-leaf tree (values 0 and 1).
+        public static readonly sbyte[] Class0Tree = { -0, -1 };
+
+        // Four-leaf tree (values 0..3); FpSize below is 4.
+        public static readonly sbyte[] FpTree = { -0, 2, -1, 4, -2, -3 };
+
+        // True for the joint types whose row component is counted by Inc (Hzvnz and Hnzvnz).
+        private static bool JointVertical(MvJointType type)
+        {
+            switch (type)
+            {
+                case MvJointType.Hzvnz:
+                case MvJointType.Hnzvnz:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        // True for the joint types whose column component is counted by Inc (Hnzvz and Hnzvnz).
+        private static bool JointHorizontal(MvJointType type)
+        {
+            switch (type)
+            {
+                case MvJointType.Hnzvz:
+                case MvJointType.Hnzvnz:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        private static readonly byte[] LogInBase2 =
+        {
+            0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
+            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 10
+        };
+
+        // Smallest (magnitude - 1) value belonging to class c: 0 for class 0,
+        // otherwise Class0Size shifted left by (c + 2).
+        private static int ClassBase(MvClassType c)
+        {
+            if (c == 0)
+            {
+                return 0;
+            }
+
+            int shift = (int)c + 2;
+            return Class0Size << shift;
+        }
+
+        // Maps z (magnitude - 1) to its class. When offset is not null it also receives
+        // z minus the base value of that class.
+        private static MvClassType GetClass(int z, Ptr<int> offset)
+        {
+            MvClassType mvClass;
+            if (z >= Class0Size * 4096)
+            {
+                // Anything at or beyond this bound is clamped to the last class.
+                mvClass = MvClassType.Class10;
+            }
+            else
+            {
+                mvClass = (MvClassType)LogInBase2[z >> 3];
+            }
+
+            if (offset.IsNull)
+            {
+                return mvClass;
+            }
+
+            offset.Value = z - ClassBase(mvClass);
+            return mvClass;
+        }
+
+        // Adds incr to every counter touched by one nonzero motion vector component v.
+        // compIndex selects the component slot in compCounts (Inc passes 0 for Row, 1 for Col);
+        // the high precision counters advance by usehp * incr.
+        private static void IncComponent(int v, ref Vp9BackwardUpdates compCounts, int compIndex, int incr, int usehp)
+        {
+            Debug.Assert(v != 0); /* should not be zero */
+
+            uint step = (uint)incr;
+            uint hpStep = (uint)(usehp * incr);
+
+            bool negative = v < 0;
+            int signIndex = negative ? 1 : 0;
+            compCounts.Sign[compIndex][signIndex] += step;
+
+            int magnitudeMinusOne = (negative ? -v : v) - 1;
+
+            int offset = 0;
+            int mvClass = (int)GetClass(magnitudeMinusOne, new Ptr<int>(ref offset));
+            compCounts.Classes[compIndex][mvClass] += step;
+
+            int intPart = offset >> 3; /* int mv data */
+            int fracPart = (offset >> 1) & 3; /* fractional pel mv data */
+            int hpPart = offset & 1; /* high precision mv data */
+
+            if (mvClass != (int)MvClassType.Class0)
+            {
+                // One counter per integer bit; the bit count grows with the class.
+                int bitCount = mvClass + Class0Bits - 1;
+                for (int bit = 0; bit < bitCount; ++bit)
+                {
+                    compCounts.Bits[compIndex][bit][(intPart >> bit) & 1] += step;
+                }
+
+                compCounts.Fp[compIndex][fracPart] += step;
+                compCounts.Hp[compIndex][hpPart] += hpStep;
+                return;
+            }
+
+            compCounts.Class0[compIndex][intPart] += step;
+            compCounts.Class0Fp[compIndex][intPart][fracPart] += step;
+            compCounts.Class0Hp[compIndex][hpPart] += hpStep;
+        }
+
+        // Records mv in the backward-update counters; does nothing when counts is null.
+        // The joint counter is always bumped; Row and Col are only counted when the
+        // joint type reports them as present.
+        public static void Inc(ref Mv mv, Ptr<Vp9BackwardUpdates> counts)
+        {
+            if (counts.IsNull)
+            {
+                return;
+            }
+
+            MvJointType joint = mv.GetJoint();
+            ++counts.Value.Joints[(int)joint];
+
+            if (JointVertical(joint))
+            {
+                IncComponent(mv.Row, ref counts.Value, 0, 1, 1);
+            }
+
+            if (JointHorizontal(joint))
+            {
+                IncComponent(mv.Col, ref counts.Value, 1, 1, 1);
+            }
+        }
+
+        /* Symbols for coding magnitude class of nonzero components */
+        public const int Classes = 11;
+
+        public const int Class0Bits = 1; /* bits at integer precision for class 0 */
+        public const int Class0Size = 1 << Class0Bits; // = 2
+        public const int OffsetBits = Classes + Class0Bits - 2; // = 10
+        public const int FpSize = 4;
+
+        public const int MaxBits = Classes + Class0Bits + 2; // = 14
+        public const int Max = (1 << MaxBits) - 1; // = 16383
+        public const int Vals = (Max << 1) + 1; // = 32767
+
+        public const int InUseBits = 14;
+        public const int Upp = (1 << InUseBits) - 1; // = 16383
+        public const int Low = -(1 << InUseBits); // = -16384
+    }
+}

+ 79 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/FrameBuffers.cs

@@ -0,0 +1,79 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    // One pooled frame buffer slot handed out by FrameBuffers.GetFrameBuffer.
+    internal struct InternalFrameBuffer
+    {
+        // Backing storage; (re)allocated by GetFrameBuffer when it is smaller than the requested size.
+        public ArrayPtr<byte> Data;
+        // Set by GetFrameBuffer when the slot is handed out, cleared by ReleaseFrameBuffer.
+        public bool InUse;
+    }
+
+    // The pool of frame buffer slots searched by FrameBuffers.GetFrameBuffer.
+    internal struct InternalFrameBufferList
+    {
+        // All slots; GetFrameBuffer scans this linearly for the first one not in use.
+        public ArrayPtr<InternalFrameBuffer> IntFb;
+    }
+
+    // Callbacks that hand out and reclaim frame buffers from an InternalFrameBufferList pool.
+    internal static class FrameBuffers
+    {
+        /// <summary>
+        /// Finds a free slot in the pool, grows its storage to at least <paramref name="minSize"/> bytes
+        /// if needed, marks it as in use and exposes it through <paramref name="fb"/>.
+        /// </summary>
+        /// <param name="allocator">Allocator used to free and allocate the slot storage</param>
+        /// <param name="cbPriv">Pointer to the frame buffer pool</param>
+        /// <param name="minSize">Minimum required size of the buffer, in bytes</param>
+        /// <param name="fb">Receives the buffer data and a pointer to the chosen slot</param>
+        /// <returns>0 on success, -1 if the pool is null, no slot is free, the size is not
+        /// representable, or the allocation failed</returns>
+        public static int GetFrameBuffer(MemoryAllocator allocator, Ptr<InternalFrameBufferList> cbPriv, ulong minSize,
+            ref VpxCodecFrameBuffer fb)
+        {
+            int i;
+            Ptr<InternalFrameBufferList> intFbList = cbPriv;
+            if (intFbList.IsNull)
+            {
+                return -1;
+            }
+
+            // Find a free frame buffer.
+            for (i = 0; i < intFbList.Value.IntFb.Length; ++i)
+            {
+                if (!intFbList.Value.IntFb[i].InUse)
+                {
+                    break;
+                }
+            }
+
+            if (i == intFbList.Value.IntFb.Length)
+            {
+                return -1;
+            }
+
+            if ((ulong)intFbList.Value.IntFb[i].Data.Length < minSize)
+            {
+                // The allocator takes an int length. Reject sizes the cast below would silently
+                // truncate, before the existing storage is released.
+                if (minSize > int.MaxValue)
+                {
+                    return -1;
+                }
+
+                if (!intFbList.Value.IntFb[i].Data.IsNull)
+                {
+                    allocator.Free(intFbList.Value.IntFb[i].Data);
+                }
+
+                // The data must be zeroed to fix a valgrind error from the C loop filter
+                // due to access uninitialized memory in frame border. It could be
+                // skipped if border were totally removed.
+                // NOTE(review): this relies on allocator.Allocate returning zeroed memory - confirm.
+                intFbList.Value.IntFb[i].Data = allocator.Allocate<byte>((int)minSize);
+                if (intFbList.Value.IntFb[i].Data.IsNull)
+                {
+                    return -1;
+                }
+            }
+
+            fb.Data = intFbList.Value.IntFb[i].Data;
+            intFbList.Value.IntFb[i].InUse = true;
+
+            // Set the frame buffer's private data to point at the internal frame buffer.
+            fb.Priv = new Ptr<InternalFrameBuffer>(ref intFbList.Value.IntFb[i]);
+            return 0;
+        }
+
+        /// <summary>
+        /// Marks the slot referenced by <paramref name="fb"/> as free again. The storage is kept
+        /// so a later GetFrameBuffer call can reuse it.
+        /// </summary>
+        /// <param name="cbPriv">Pointer to the frame buffer pool (not used here)</param>
+        /// <param name="fb">Frame buffer previously filled in by GetFrameBuffer</param>
+        /// <returns>Always 0</returns>
+        public static int ReleaseFrameBuffer(Ptr<InternalFrameBufferList> cbPriv, ref VpxCodecFrameBuffer fb)
+        {
+            if (!fb.Priv.IsNull)
+            {
+                fb.Priv.Value.InUse = false;
+            }
+
+            return 0;
+        }
+    }
+}

+ 97 - 84
src/Ryujinx.Graphics.Nvdec.Vp9/Idct.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using System;
 using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.InvTxfm;
@@ -8,11 +8,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
     internal static class Idct
     {
         private delegate void Transform1D(ReadOnlySpan<int> input, Span<int> output);
+
         private delegate void HighbdTransform1D(ReadOnlySpan<int> input, Span<int> output, int bd);
 
         private struct Transform2D
         {
-            public Transform1D Cols, Rows; // Vertical and horizontal
+            public readonly Transform1D Cols; // Vertical and horizontal
+            public readonly Transform1D Rows; // Vertical and horizontal
 
             public Transform2D(Transform1D cols, Transform1D rows)
             {
@@ -23,7 +25,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
         private struct HighbdTransform2D
         {
-            public HighbdTransform1D Cols, Rows; // Vertical and horizontal
+            public readonly HighbdTransform1D Cols; // Vertical and horizontal
+            public readonly HighbdTransform1D Rows; // Vertical and horizontal
 
             public HighbdTransform2D(HighbdTransform1D cols, HighbdTransform1D rows)
             {
@@ -32,121 +35,124 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
         }
 
-        private static readonly Transform2D[] _iht4 = {
+        // 4-point 1-D transform pairs, indexed by txType; constructor arguments are (cols, rows).
+        private static readonly Transform2D[] Iht4 =
+        {
             new(Idct4, Idct4), // DCT_DCT  = 0
             new(Iadst4, Idct4), // ADST_DCT = 1
             new(Idct4, Iadst4), // DCT_ADST = 2
-            new(Iadst4, Iadst4), // ADST_ADST = 3
+            new(Iadst4, Iadst4) // ADST_ADST = 3
         };
 
         public static void Iht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType)
         {
-            int i, j;
+            // 4x4 inverse hybrid transform: rows are transformed into a 4x4 intermediate, then each
+            // column is transformed, rounded by 4 bits and combined into dest through ClipPixelAdd.
+            // txType selects the row/column transform pair from Iht4.
             Span<int> output = stackalloc int[4 * 4];
             Span<int> outptr = output;
             Span<int> tempIn = stackalloc int[4];
             Span<int> tempOut = stackalloc int[4];
 
             // Inverse transform row vectors
-            for (i = 0; i < 4; ++i)
+            for (int i = 0; i < 4; ++i)
             {
-                _iht4[txType].Rows(input, outptr);
-                input = input[4..];
-                outptr = outptr[4..];
+                Iht4[txType].Rows(input, outptr);
+                input = input.Slice(4);
+                outptr = outptr.Slice(4);
             }
 
             // Inverse transform column vectors
-            for (i = 0; i < 4; ++i)
+            for (int i = 0; i < 4; ++i)
             {
-                for (j = 0; j < 4; ++j)
+                // Gather column i of the intermediate into a contiguous buffer.
+                for (int j = 0; j < 4; ++j)
                 {
-                    tempIn[j] = output[j * 4 + i];
+                    tempIn[j] = output[(j * 4) + i];
                 }
 
-                _iht4[txType].Cols(tempIn, tempOut);
-                for (j = 0; j < 4; ++j)
+                Iht4[txType].Cols(tempIn, tempOut);
+                for (int j = 0; j < 4; ++j)
                 {
-                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4));
+                    dest[(j * stride) + i] =
+                        ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4));
                 }
             }
         }
 
-        private static readonly Transform2D[] _iht8 = {
+        // 8-point 1-D transform pairs, indexed by txType; constructor arguments are (cols, rows).
+        private static readonly Transform2D[] Iht8 =
+        {
             new(Idct8, Idct8), // DCT_DCT  = 0
             new(Iadst8, Idct8), // ADST_DCT = 1
             new(Idct8, Iadst8), // DCT_ADST = 2
-            new(Iadst8, Iadst8), // ADST_ADST = 3
+            new(Iadst8, Iadst8) // ADST_ADST = 3
         };
 
         public static void Iht8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType)
         {
-            int i, j;
+            // 8x8 inverse hybrid transform: rows are transformed into an 8x8 intermediate, then each
+            // column is transformed, rounded by 5 bits and combined into dest through ClipPixelAdd.
+            // txType selects the row/column transform pair from Iht8.
             Span<int> output = stackalloc int[8 * 8];
             Span<int> outptr = output;
             Span<int> tempIn = stackalloc int[8];
             Span<int> tempOut = stackalloc int[8];
-            Transform2D ht = _iht8[txType];
+            Transform2D ht = Iht8[txType];
 
             // Inverse transform row vectors
-            for (i = 0; i < 8; ++i)
+            for (int i = 0; i < 8; ++i)
             {
                 ht.Rows(input, outptr);
-                input = input[8..];
-                outptr = outptr[8..];
+                input = input.Slice(8);
+                outptr = outptr.Slice(8);
             }
 
             // Inverse transform column vectors
-            for (i = 0; i < 8; ++i)
+            for (int i = 0; i < 8; ++i)
             {
-                for (j = 0; j < 8; ++j)
+                // Gather column i of the intermediate into a contiguous buffer.
+                for (int j = 0; j < 8; ++j)
                 {
-                    tempIn[j] = output[j * 8 + i];
+                    tempIn[j] = output[(j * 8) + i];
                 }
 
                 ht.Cols(tempIn, tempOut);
-                for (j = 0; j < 8; ++j)
+                for (int j = 0; j < 8; ++j)
                 {
-                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5));
+                    dest[(j * stride) + i] =
+                        ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5));
                 }
             }
         }
 
-        private static readonly Transform2D[] _iht16 = {
+        // 16-point 1-D transform pairs, indexed by txType; constructor arguments are (cols, rows).
+        private static readonly Transform2D[] Iht16 =
+        {
             new(Idct16, Idct16), // DCT_DCT  = 0
             new(Iadst16, Idct16), // ADST_DCT = 1
             new(Idct16, Iadst16), // DCT_ADST = 2
-            new(Iadst16, Iadst16), // ADST_ADST = 3
+            new(Iadst16, Iadst16) // ADST_ADST = 3
         };
 
         public static void Iht16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType)
         {
-            int i, j;
+            // 16x16 inverse hybrid transform: rows are transformed into a 16x16 intermediate, then each
+            // column is transformed, rounded by 6 bits and combined into dest through ClipPixelAdd.
+            // txType selects the row/column transform pair from Iht16.
             Span<int> output = stackalloc int[16 * 16];
             Span<int> outptr = output;
             Span<int> tempIn = stackalloc int[16];
             Span<int> tempOut = stackalloc int[16];
-            Transform2D ht = _iht16[txType];
+            Transform2D ht = Iht16[txType];
 
             // Rows
-            for (i = 0; i < 16; ++i)
+            for (int i = 0; i < 16; ++i)
             {
                 ht.Rows(input, outptr);
-                input = input[16..];
-                outptr = outptr[16..];
+                input = input.Slice(16);
+                outptr = outptr.Slice(16);
             }
 
             // Columns
-            for (i = 0; i < 16; ++i)
+            for (int i = 0; i < 16; ++i)
             {
-                for (j = 0; j < 16; ++j)
+                // Gather column i of the intermediate into a contiguous buffer.
+                for (int j = 0; j < 16; ++j)
                 {
-                    tempIn[j] = output[j * 16 + i];
+                    tempIn[j] = output[(j * 16) + i];
                 }
 
                 ht.Cols(tempIn, tempOut);
-                for (j = 0; j < 16; ++j)
+                for (int j = 0; j < 16; ++j)
                 {
-                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
+                    dest[(j * stride) + i] =
+                        ClipPixelAdd(dest[(j * stride) + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
                 }
             }
         }
@@ -268,7 +274,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         }
 
         public static void Iht16x16Add(TxType txType, ReadOnlySpan<int> input, Span<byte> dest,
-                              int stride, int eob)
+            int stride, int eob)
         {
             if (txType == TxType.DctDct)
             {
@@ -280,121 +286,125 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
         }
 
-        private static readonly HighbdTransform2D[] _highbdIht4 = {
+        // High bit depth 4-point 1-D transform pairs, indexed by txType; constructor arguments are (cols, rows).
+        private static readonly HighbdTransform2D[] HighbdIht4 =
+        {
             new(HighbdIdct4, HighbdIdct4), // DCT_DCT  = 0
             new(HighbdIadst4, HighbdIdct4), // ADST_DCT = 1
             new(HighbdIdct4, HighbdIadst4), // DCT_ADST = 2
-            new(HighbdIadst4, HighbdIadst4), // ADST_ADST = 3
+            new(HighbdIadst4, HighbdIadst4) // ADST_ADST = 3
         };
 
         public static void HighbdIht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd)
         {
-            int i, j;
+            // High bit depth 4x4 inverse hybrid transform: rows are transformed into a 4x4 intermediate,
+            // then each column is transformed, rounded by 4 bits and combined into dest through
+            // HighbdClipPixelAdd. bd is forwarded to the 1-D transforms and to the clip.
             Span<int> output = stackalloc int[4 * 4];
             Span<int> outptr = output;
             Span<int> tempIn = stackalloc int[4];
             Span<int> tempOut = stackalloc int[4];
 
             // Inverse transform row vectors.
-            for (i = 0; i < 4; ++i)
+            for (int i = 0; i < 4; ++i)
             {
-                _highbdIht4[txType].Rows(input, outptr, bd);
-                input = input[4..];
-                outptr = outptr[4..];
+                HighbdIht4[txType].Rows(input, outptr, bd);
+                input = input.Slice(4);
+                outptr = outptr.Slice(4);
             }
 
             // Inverse transform column vectors.
-            for (i = 0; i < 4; ++i)
+            for (int i = 0; i < 4; ++i)
             {
-                for (j = 0; j < 4; ++j)
+                // Gather column i of the intermediate into a contiguous buffer.
+                for (int j = 0; j < 4; ++j)
                 {
-                    tempIn[j] = output[j * 4 + i];
+                    tempIn[j] = output[(j * 4) + i];
                 }
 
-                _highbdIht4[txType].Cols(tempIn, tempOut, bd);
-                for (j = 0; j < 4; ++j)
+                HighbdIht4[txType].Cols(tempIn, tempOut, bd);
+                for (int j = 0; j < 4; ++j)
                 {
-                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4), bd);
+                    dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i],
+                        BitUtils.RoundPowerOfTwo(tempOut[j], 4), bd);
                 }
             }
         }
 
-        private static readonly HighbdTransform2D[] _highIht8 = {
+        // High bit depth 8-point 1-D transform pairs, indexed by txType; constructor arguments are (cols, rows).
+        private static readonly HighbdTransform2D[] HighIht8 =
+        {
             new(HighbdIdct8, HighbdIdct8), // DCT_DCT  = 0
             new(HighbdIadst8, HighbdIdct8), // ADST_DCT = 1
             new(HighbdIdct8, HighbdIadst8), // DCT_ADST = 2
-            new(HighbdIadst8, HighbdIadst8), // ADST_ADST = 3
+            new(HighbdIadst8, HighbdIadst8) // ADST_ADST = 3
         };
 
         public static void HighbdIht8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd)
         {
-            int i, j;
+            // High bit depth 8x8 inverse hybrid transform: rows are transformed into an 8x8 intermediate,
+            // then each column is transformed, rounded by 5 bits and combined into dest through
+            // HighbdClipPixelAdd. bd is forwarded to the 1-D transforms and to the clip.
             Span<int> output = stackalloc int[8 * 8];
             Span<int> outptr = output;
             Span<int> tempIn = stackalloc int[8];
             Span<int> tempOut = stackalloc int[8];
-            HighbdTransform2D ht = _highIht8[txType];
+            HighbdTransform2D ht = HighIht8[txType];
 
             // Inverse transform row vectors.
-            for (i = 0; i < 8; ++i)
+            for (int i = 0; i < 8; ++i)
             {
                 ht.Rows(input, outptr, bd);
-                input = input[8..];
-                outptr = output[8..];
+                input = input.Slice(8);
+                // Advance relative to the current row. Slicing `output` here would pin every
+                // later row onto row 1 and leave rows 2..7 of the intermediate untouched.
+                outptr = outptr.Slice(8);
             }
 
             // Inverse transform column vectors.
-            for (i = 0; i < 8; ++i)
+            for (int i = 0; i < 8; ++i)
             {
-                for (j = 0; j < 8; ++j)
+                // Gather column i of the intermediate into a contiguous buffer.
+                for (int j = 0; j < 8; ++j)
                 {
-                    tempIn[j] = output[j * 8 + i];
+                    tempIn[j] = output[(j * 8) + i];
                 }
 
                 ht.Cols(tempIn, tempOut, bd);
-                for (j = 0; j < 8; ++j)
+                for (int j = 0; j < 8; ++j)
                 {
-                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd);
+                    dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i],
+                        BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd);
                 }
             }
         }
 
-        private static readonly HighbdTransform2D[] _highIht16 = {
+        // High bit depth 16-point 1-D transform pairs, indexed by txType; constructor arguments are (cols, rows).
+        private static readonly HighbdTransform2D[] HighIht16 =
+        {
             new(HighbdIdct16, HighbdIdct16), // DCT_DCT  = 0
             new(HighbdIadst16, HighbdIdct16), // ADST_DCT = 1
             new(HighbdIdct16, HighbdIadst16), // DCT_ADST = 2
-            new(HighbdIadst16, HighbdIadst16), // ADST_ADST = 3
+            new(HighbdIadst16, HighbdIadst16) // ADST_ADST = 3
         };
 
-        public static void HighbdIht16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd)
+        public static void HighbdIht16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType,
+            int bd)
         {
-            int i, j;
+            // High bit depth 16x16 inverse hybrid transform: rows are transformed into a 16x16
+            // intermediate, then each column is transformed, rounded by 6 bits and combined into dest
+            // through HighbdClipPixelAdd. bd is forwarded to the 1-D transforms and to the clip.
             Span<int> output = stackalloc int[16 * 16];
             Span<int> outptr = output;
             Span<int> tempIn = stackalloc int[16];
             Span<int> tempOut = stackalloc int[16];
-            HighbdTransform2D ht = _highIht16[txType];
+            HighbdTransform2D ht = HighIht16[txType];
 
             // Rows
-            for (i = 0; i < 16; ++i)
+            for (int i = 0; i < 16; ++i)
             {
                 ht.Rows(input, outptr, bd);
-                input = input[16..];
-                outptr = output[16..];
+                input = input.Slice(16);
+                // Advance relative to the current row. Slicing `output` here would pin every
+                // later row onto row 1 and leave rows 2..15 of the intermediate untouched.
+                outptr = outptr.Slice(16);
             }
 
             // Columns
-            for (i = 0; i < 16; ++i)
+            for (int i = 0; i < 16; ++i)
             {
-                for (j = 0; j < 16; ++j)
+                // Gather column i of the intermediate into a contiguous buffer.
+                for (int j = 0; j < 16; ++j)
                 {
-                    tempIn[j] = output[j * 16 + i];
+                    tempIn[j] = output[(j * 16) + i];
                 }
 
                 ht.Cols(tempIn, tempOut, bd);
-                for (j = 0; j < 16; ++j)
+                for (int j = 0; j < 16; ++j)
                 {
-                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
+                    dest[(j * stride) + i] = HighbdClipPixelAdd(dest[(j * stride) + i],
+                        BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
                 }
             }
         }
@@ -434,7 +444,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // DC only DCT coefficient
             if (eob == 1)
             {
-                Vpx_Highbdidct8x8_1_add_c(input, dest, stride, bd);
+                VpxHighbdidct8x81AddC(input, dest, stride, bd);
             }
             else if (eob <= 12)
             {
@@ -491,7 +501,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         }
 
         // Iht
-        public static void HighbdIht4x4Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        public static void HighbdIht4x4Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride,
+            int eob, int bd)
         {
             if (txType == TxType.DctDct)
             {
@@ -503,7 +514,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
         }
 
-        public static void HighbdIht8x8Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        public static void HighbdIht8x8Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride,
+            int eob, int bd)
         {
             if (txType == TxType.DctDct)
             {
@@ -515,7 +527,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
         }
 
-        public static void HighbdIht16x16Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        public static void HighbdIht16x16Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride,
+            int eob, int bd)
         {
             if (txType == TxType.DctDct)
             {
@@ -527,4 +540,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
         }
     }
-}
+}

+ 2 - 2
src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs

@@ -2,7 +2,7 @@ using System;
 
 namespace Ryujinx.Graphics.Nvdec.Vp9
 {
-    class InternalErrorException : Exception
+    internal class InternalErrorException : Exception
     {
         public InternalErrorException(string message) : base(message)
         {
@@ -12,4 +12,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         {
         }
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs

@@ -11,4 +11,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             throw new InternalErrorException(message);
         }
     }
-}
+}

+ 1706 - 148
src/Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs

@@ -1,8 +1,12 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using System;
+using System.Diagnostics;
 using System.Runtime.InteropServices;
+using System.Threading;
+using System.Threading.Tasks;
 
 namespace Ryujinx.Graphics.Nvdec.Vp9
 {
@@ -13,11 +17,119 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         public const int MaxRefLfDeltas = 4;
         public const int MaxModeLfDeltas = 2;
 
+        // Per-row progress tracking used to order loop filter work between superblock rows.
+        // A row publishes how far it has progressed with SyncWrite; the row below blocks in
+        // SyncRead until the row above is far enough ahead. All methods other than Initialize
+        // are no-ops until Initialize has been called.
+        private struct LfSync
+        {
+            // Last published column for each superblock row (-1 after Initialize).
+            private int[] _curSbCol;
+            // One monitor object per superblock row, guarding the matching _curSbCol entry.
+            private object[] _syncObjects;
+            // Column granularity at which rows synchronize; chosen from the frame width.
+            private int _syncRange;
+
+            private static int GetSyncRange(int width)
+            {
+                // nsync numbers are picked by testing. For example, for 4k
+                // video, using 4 gives best performance.
+                if (width > 4096)
+                {
+                    return 8;
+                }
+
+                if (width > 1280)
+                {
+                    return 4;
+                }
+
+                return width >= 640 ? 2 : 1;
+            }
+
+            // Prepares the state for a frame with sbRows superblock rows; arrays are only
+            // reallocated when the row count changes.
+            public void Initialize(int width, int sbRows)
+            {
+                bool needsAllocation = _curSbCol == null || _curSbCol.Length != sbRows;
+
+                if (needsAllocation)
+                {
+                    _curSbCol = new int[sbRows];
+                    _syncObjects = new object[sbRows];
+
+                    int row = 0;
+                    while (row < sbRows)
+                    {
+                        _syncObjects[row] = new object();
+                        row++;
+                    }
+                }
+
+                _curSbCol.AsSpan().Fill(-1);
+                _syncRange = GetSyncRange(width);
+            }
+
+            // Blocks until row r - 1 has published a column more than _syncRange ahead of c.
+            // Only checked for rows other than the first, and only on columns that are a
+            // multiple of the sync range.
+            public void SyncRead(int r, int c)
+            {
+                if (_curSbCol == null)
+                {
+                    return;
+                }
+
+                int nsync = _syncRange;
+
+                if (r == 0 || (c & (nsync - 1)) != 0)
+                {
+                    return;
+                }
+
+                int rowAbove = r - 1;
+                object gate = _syncObjects[rowAbove];
+
+                lock (gate)
+                {
+                    while (_curSbCol[rowAbove] - nsync < c)
+                    {
+                        Monitor.Wait(gate);
+                    }
+                }
+            }
+
+            // Publishes progress for row r after column c has been processed.
+            public void SyncWrite(int r, int c, int sbCols)
+            {
+                if (_curSbCol == null)
+                {
+                    return;
+                }
+
+                int nsync = _syncRange;
+                bool isLastColumn = c >= sbCols - 1;
+
+                // Only signal when there are enough filtered SB for next row to run.
+                if (!isLastColumn && c % nsync != 0)
+                {
+                    return;
+                }
+
+                // On the last column, publish a value past the end so a waiter on any column is released.
+                int progress = isLastColumn ? sbCols + nsync : c;
+                object gate = _syncObjects[r];
+
+                lock (gate)
+                {
+                    _curSbCol[r] = progress;
+
+                    Monitor.Pulse(gate);
+                }
+            }
+        }
+
         // 64 bit masks for left transform size. Each 1 represents a position where
         // we should apply a loop filter across the left border of an 8x8 block
         // boundary.
         //
-        // In the case of TX_16X16 ->  ( in low order byte first we end up with
+        // In the case of TxSize.Tx16x16 (low order byte first) we end up with
         // a mask that looks like this
         //
         //    10101010
@@ -30,18 +142,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         //    10101010
         //
         // A loopfilter should be applied to every other 8x8 horizontally.
-        private static readonly ulong[] _left64X64TxformMask = {
-            0xffffffffffffffffUL, // TX_4X4
-            0xffffffffffffffffUL, // TX_8x8
-            0x5555555555555555UL, // TX_16x16
-            0x1111111111111111UL, // TX_32x32
+        private static readonly ulong[] Left64x64TxformMask =
+        {
+            0xffffffffffffffffUL, // (int)TxSize.Tx4x4
+            0xffffffffffffffffUL, // (int)TxSize.Tx8x8
+            0x5555555555555555UL, // (int)TxSize.Tx16x16
+            0x1111111111111111UL // (int)TxSize.Tx32x32
         };
 
         // 64 bit masks for above transform size. Each 1 represents a position where
         // we should apply a loop filter across the top border of an 8x8 block
         // boundary.
         //
-        // In the case of TX_32x32 ->  ( in low order byte first we end up with
+        // In the case of TxSize.Tx32x32 (in low order byte first) we end up with
         // a mask that looks like this
         //
         //    11111111
@@ -54,18 +167,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         //    00000000
         //
         // A loopfilter should be applied to every 4th row vertically.
-        private static readonly ulong[] _above64X64TxformMask = {
-            0xffffffffffffffffUL, // TX_4X4
-            0xffffffffffffffffUL, // TX_8x8
-            0x00ff00ff00ff00ffUL, // TX_16x16
-            0x000000ff000000ffUL, // TX_32x32
+        private static readonly ulong[] Above64x64TxformMask =
+        {
+            0xffffffffffffffffUL, // (int)TxSize.Tx4x4
+            0xffffffffffffffffUL, // (int)TxSize.Tx8x8
+            0x00ff00ff00ff00ffUL, // (int)TxSize.Tx16x16
+            0x000000ff000000ffUL // (int)TxSize.Tx32x32
         };
 
         // 64 bit masks for prediction sizes (left). Each 1 represents a position
         // on the left border of an 8x8 block. These are aligned to the right most
         // appropriate bit, and then shifted into place.
         //
-        // In the case of TX_16x32 ->  ( low order byte first ) we end up with
+        // In the case of BLOCK_16x32 (low order byte first) we end up with
         // a mask that looks like this :
         //
         //  10000000
@@ -76,157 +190,163 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         //  00000000
         //  00000000
         //  00000000
-        private static readonly ulong[] _leftPredictionMask = {
-            0x0000000000000001UL, // BLOCK_4X4,
-            0x0000000000000001UL, // BLOCK_4X8,
-            0x0000000000000001UL, // BLOCK_8X4,
-            0x0000000000000001UL, // BLOCK_8X8,
-            0x0000000000000101UL, // BLOCK_8X16,
-            0x0000000000000001UL, // BLOCK_16X8,
-            0x0000000000000101UL, // BLOCK_16X16,
-            0x0000000001010101UL, // BLOCK_16X32,
-            0x0000000000000101UL, // BLOCK_32X16,
-            0x0000000001010101UL, // BLOCK_32X32,
-            0x0101010101010101UL, // BLOCK_32X64,
-            0x0000000001010101UL, // BLOCK_64X32,
-            0x0101010101010101UL, // BLOCK_64X64
+        private static readonly ulong[] LeftPredictionMask =
+        {
+            0x0000000000000001UL, // BLOCK_4x4,
+            0x0000000000000001UL, // BLOCK_4x8,
+            0x0000000000000001UL, // BLOCK_8x4,
+            0x0000000000000001UL, // BLOCK_8x8,
+            0x0000000000000101UL, // BLOCK_8x16,
+            0x0000000000000001UL, // BLOCK_16x8,
+            0x0000000000000101UL, // BLOCK_16x16,
+            0x0000000001010101UL, // BLOCK_16x32,
+            0x0000000000000101UL, // BLOCK_32x16,
+            0x0000000001010101UL, // BLOCK_32x32,
+            0x0101010101010101UL, // BLOCK_32x64,
+            0x0000000001010101UL, // BLOCK_64x32,
+            0x0101010101010101UL // BLOCK_64x64
         };
 
         // 64 bit mask to shift and set for each prediction size.
-        private static readonly ulong[] _abovePredictionMask = {
-            0x0000000000000001UL, // BLOCK_4X4
-            0x0000000000000001UL, // BLOCK_4X8
-            0x0000000000000001UL, // BLOCK_8X4
-            0x0000000000000001UL, // BLOCK_8X8
-            0x0000000000000001UL, // BLOCK_8X16,
-            0x0000000000000003UL, // BLOCK_16X8
-            0x0000000000000003UL, // BLOCK_16X16
-            0x0000000000000003UL, // BLOCK_16X32,
-            0x000000000000000fUL, // BLOCK_32X16,
-            0x000000000000000fUL, // BLOCK_32X32,
-            0x000000000000000fUL, // BLOCK_32X64,
-            0x00000000000000ffUL, // BLOCK_64X32,
-            0x00000000000000ffUL, // BLOCK_64X64
+        private static readonly ulong[] AbovePredictionMask =
+        {
+            0x0000000000000001UL, // BLOCK_4x4
+            0x0000000000000001UL, // BLOCK_4x8
+            0x0000000000000001UL, // BLOCK_8x4
+            0x0000000000000001UL, // BLOCK_8x8
+            0x0000000000000001UL, // BLOCK_8x16,
+            0x0000000000000003UL, // BLOCK_16x8
+            0x0000000000000003UL, // BLOCK_16x16
+            0x0000000000000003UL, // BLOCK_16x32,
+            0x000000000000000fUL, // BLOCK_32x16,
+            0x000000000000000fUL, // BLOCK_32x32,
+            0x000000000000000fUL, // BLOCK_32x64,
+            0x00000000000000ffUL, // BLOCK_64x32,
+            0x00000000000000ffUL // BLOCK_64x64
         };
 
         // 64 bit mask to shift and set for each prediction size. A bit is set for
         // each 8x8 block that would be in the left most block of the given block
         // size in the 64x64 block.
-        private static readonly ulong[] _sizeMask = {
-            0x0000000000000001UL, // BLOCK_4X4
-            0x0000000000000001UL, // BLOCK_4X8
-            0x0000000000000001UL, // BLOCK_8X4
-            0x0000000000000001UL, // BLOCK_8X8
-            0x0000000000000101UL, // BLOCK_8X16,
-            0x0000000000000003UL, // BLOCK_16X8
-            0x0000000000000303UL, // BLOCK_16X16
-            0x0000000003030303UL, // BLOCK_16X32,
-            0x0000000000000f0fUL, // BLOCK_32X16,
-            0x000000000f0f0f0fUL, // BLOCK_32X32,
-            0x0f0f0f0f0f0f0f0fUL, // BLOCK_32X64,
-            0x00000000ffffffffUL, // BLOCK_64X32,
-            0xffffffffffffffffUL, // BLOCK_64X64
+        private static readonly ulong[] SizeMask =
+        {
+            0x0000000000000001UL, // BLOCK_4x4
+            0x0000000000000001UL, // BLOCK_4x8
+            0x0000000000000001UL, // BLOCK_8x4
+            0x0000000000000001UL, // BLOCK_8x8
+            0x0000000000000101UL, // BLOCK_8x16,
+            0x0000000000000003UL, // BLOCK_16x8
+            0x0000000000000303UL, // BLOCK_16x16
+            0x0000000003030303UL, // BLOCK_16x32,
+            0x0000000000000f0fUL, // BLOCK_32x16,
+            0x000000000f0f0f0fUL, // BLOCK_32x32,
+            0x0f0f0f0f0f0f0f0fUL, // BLOCK_32x64,
+            0x00000000ffffffffUL, // BLOCK_64x32,
+            0xffffffffffffffffUL // BLOCK_64x64
         };
 
         // These are used for masking the left and above borders.
-#pragma warning disable IDE0051 // Remove unused private member
         private const ulong LeftBorder = 0x1111111111111111UL;
         private const ulong AboveBorder = 0x000000ff000000ffUL;
-#pragma warning restore IDE0051
 
         // 16 bit masks for uv transform sizes.
-        private static readonly ushort[] _left64X64TxformMaskUv = {
-            0xffff, // TX_4X4
-            0xffff, // TX_8x8
-            0x5555, // TX_16x16
-            0x1111, // TX_32x32
+        private static readonly ushort[] Left64x64TxformMaskUv =
+        {
+            0xffff, // (int)TxSize.Tx4x4
+            0xffff, // (int)TxSize.Tx8x8
+            0x5555, // (int)TxSize.Tx16x16
+            0x1111 // (int)TxSize.Tx32x32
         };
 
-        private static readonly ushort[] _above64X64TxformMaskUv = {
-            0xffff, // TX_4X4
-            0xffff, // TX_8x8
-            0x0f0f, // TX_16x16
-            0x000f, // TX_32x32
+        private static readonly ushort[] Above64x64TxformMaskUv =
+        {
+            0xffff, // (int)TxSize.Tx4x4
+            0xffff, // (int)TxSize.Tx8x8
+            0x0f0f, // (int)TxSize.Tx16x16
+            0x000f // (int)TxSize.Tx32x32
         };
 
         // 16 bit left mask to shift and set for each uv prediction size.
-        private static readonly ushort[] _leftPredictionMaskUv = {
-            0x0001, // BLOCK_4X4,
-            0x0001, // BLOCK_4X8,
-            0x0001, // BLOCK_8X4,
-            0x0001, // BLOCK_8X8,
-            0x0001, // BLOCK_8X16,
-            0x0001, // BLOCK_16X8,
-            0x0001, // BLOCK_16X16,
-            0x0011, // BLOCK_16X32,
-            0x0001, // BLOCK_32X16,
-            0x0011, // BLOCK_32X32,
-            0x1111, // BLOCK_32X64
-            0x0011, // BLOCK_64X32,
-            0x1111, // BLOCK_64X64
+        private static readonly ushort[] LeftPredictionMaskUv =
+        {
+            0x0001, // BLOCK_4x4,
+            0x0001, // BLOCK_4x8,
+            0x0001, // BLOCK_8x4,
+            0x0001, // BLOCK_8x8,
+            0x0001, // BLOCK_8x16,
+            0x0001, // BLOCK_16x8,
+            0x0001, // BLOCK_16x16,
+            0x0011, // BLOCK_16x32,
+            0x0001, // BLOCK_32x16,
+            0x0011, // BLOCK_32x32,
+            0x1111, // BLOCK_32x64
+            0x0011, // BLOCK_64x32,
+            0x1111 // BLOCK_64x64
         };
 
         // 16 bit above mask to shift and set for each uv prediction size.
-        private static readonly ushort[] _abovePredictionMaskUv = {
-            0x0001, // BLOCK_4X4
-            0x0001, // BLOCK_4X8
-            0x0001, // BLOCK_8X4
-            0x0001, // BLOCK_8X8
-            0x0001, // BLOCK_8X16,
-            0x0001, // BLOCK_16X8
-            0x0001, // BLOCK_16X16
-            0x0001, // BLOCK_16X32,
-            0x0003, // BLOCK_32X16,
-            0x0003, // BLOCK_32X32,
-            0x0003, // BLOCK_32X64,
-            0x000f, // BLOCK_64X32,
-            0x000f, // BLOCK_64X64
+        private static readonly ushort[] AbovePredictionMaskUv =
+        {
+            0x0001, // BLOCK_4x4
+            0x0001, // BLOCK_4x8
+            0x0001, // BLOCK_8x4
+            0x0001, // BLOCK_8x8
+            0x0001, // BLOCK_8x16,
+            0x0001, // BLOCK_16x8
+            0x0001, // BLOCK_16x16
+            0x0001, // BLOCK_16x32,
+            0x0003, // BLOCK_32x16,
+            0x0003, // BLOCK_32x32,
+            0x0003, // BLOCK_32x64,
+            0x000f, // BLOCK_64x32,
+            0x000f // BLOCK_64x64
         };
 
         // 16 bit mask to shift and set for each uv prediction size
-        private static readonly ushort[] _sizeMaskUv = {
-            0x0001, // BLOCK_4X4
-            0x0001, // BLOCK_4X8
-            0x0001, // BLOCK_8X4
-            0x0001, // BLOCK_8X8
-            0x0001, // BLOCK_8X16,
-            0x0001, // BLOCK_16X8
-            0x0001, // BLOCK_16X16
-            0x0011, // BLOCK_16X32,
-            0x0003, // BLOCK_32X16,
-            0x0033, // BLOCK_32X32,
-            0x3333, // BLOCK_32X64,
-            0x00ff, // BLOCK_64X32,
-            0xffff, // BLOCK_64X64
+        private static readonly ushort[] SizeMaskUv =
+        {
+            0x0001, // BLOCK_4x4
+            0x0001, // BLOCK_4x8
+            0x0001, // BLOCK_8x4
+            0x0001, // BLOCK_8x8
+            0x0001, // BLOCK_8x16,
+            0x0001, // BLOCK_16x8
+            0x0001, // BLOCK_16x16
+            0x0011, // BLOCK_16x32,
+            0x0003, // BLOCK_32x16,
+            0x0033, // BLOCK_32x32,
+            0x3333, // BLOCK_32x64,
+            0x00ff, // BLOCK_64x32,
+            0xffff // BLOCK_64x64
         };
 
-#pragma warning disable IDE0051 // Remove unused private member
         private const ushort LeftBorderUv = 0x1111;
         private const ushort AboveBorderUv = 0x000f;
-#pragma warning restore IDE0051
 
-        private static readonly int[] _modeLfLut = {
+        private static readonly int[] ModeLfLut =
+        {
             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
-            1, 1, 0, 1, // INTER_MODES (ZEROMV == 0)
+            1, 1, 0, 1 // INTER_MODES (ZEROMV == 0)
         };
 
         private static byte GetFilterLevel(ref LoopFilterInfoN lfiN, ref ModeInfo mi)
         {
-            return lfiN.Lvl[mi.SegmentId][mi.RefFrame[0]][_modeLfLut[(int)mi.Mode]];
+            return lfiN.Lvl[mi.SegmentId][mi.RefFrame[0]][ModeLfLut[(int)mi.Mode]]; // Level looked up by segment id, first reference frame, and the ModeLfLut entry for the prediction mode.
         }
 
-        private static ref LoopFilterMask GetLfm(ref Types.LoopFilter lf, int miRow, int miCol)
+        private static Span<LoopFilterMask> GetLfm(ref Types.LoopFilter lf, int miRow, int miCol)
         {
-            return ref lf.Lfm[(miCol >> 3) + ((miRow >> 3) * lf.LfmStride)];
+            return lf.Lfm.AsSpan().Slice((miCol >> 3) + ((miRow >> 3) * lf.LfmStride)); // Span beginning at the mask for (miRow, miCol): one mask per 8 MI units in each direction, LfmStride masks per mask row.
         }
 
         // 8x8 blocks in a superblock. A "1" represents the first block in a 16x16
         // or greater area.
-        private static readonly byte[][] _firstBlockIn16X16 = {
-            new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 },
+        private static readonly byte[][] FirstBlockIn16x16 =
+        {
             new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 },
             new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 },
             new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 },
+            new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 }
         };
 
         // This function sets up the bit masks for a block represented
@@ -238,18 +358,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             ref LoopFilterInfoN lfiN = ref cm.LfInfo;
             int filterLevel = GetFilterLevel(ref lfiN, ref mi);
             TxSize txSizeUv = Luts.UvTxsizeLookup[(int)blockSize][(int)txSizeY][1][1];
-            ref LoopFilterMask lfm = ref GetLfm(ref cm.Lf, miRow, miCol);
+            ref LoopFilterMask lfm = ref GetLfm(ref cm.Lf, miRow, miCol)[0];
             ref ulong leftY = ref lfm.LeftY[(int)txSizeY];
             ref ulong aboveY = ref lfm.AboveY[(int)txSizeY];
-            ref ulong int4X4Y = ref lfm.Int4x4Y;
+            ref ulong int4x4Y = ref lfm.Int4x4Y;
             ref ushort leftUv = ref lfm.LeftUv[(int)txSizeUv];
             ref ushort aboveUv = ref lfm.AboveUv[(int)txSizeUv];
-            ref ushort int4X4Uv = ref lfm.Int4x4Uv;
-            int rowInSb = (miRow & 7);
-            int colInSb = (miCol & 7);
+            ref ushort int4x4Uv = ref lfm.Int4x4Uv;
+            int rowInSb = miRow & 7;
+            int colInSb = miCol & 7;
             int shiftY = colInSb + (rowInSb << 3);
             int shiftUv = (colInSb >> 1) + ((rowInSb >> 1) << 2);
-            int buildUv = _firstBlockIn16X16[rowInSb][colInSb];
+            int buildUv = FirstBlockIn16x16[rowInSb][colInSb];
 
             if (filterLevel == 0)
             {
@@ -257,10 +377,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
 
             int index = shiftY;
-            int i;
-            for (i = 0; i < bh; i++)
+
+            for (int i = 0; i < bh; i++)
             {
-                MemoryMarshal.CreateSpan(ref lfm.LflY[index], 64 - index)[..bw].Fill((byte)filterLevel);
+                MemoryMarshal.CreateSpan(ref lfm.LflY[index], 64 - index).Slice(0, bw).Fill((byte)filterLevel);
                 index += 8;
             }
 
@@ -276,13 +396,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             //
             // U and V set things on a 16 bit scale.
             //
-            aboveY |= _abovePredictionMask[(int)blockSize] << shiftY;
-            leftY |= _leftPredictionMask[(int)blockSize] << shiftY;
+            aboveY |= AbovePredictionMask[(int)blockSize] << shiftY;
+            leftY |= LeftPredictionMask[(int)blockSize] << shiftY;
 
             if (buildUv != 0)
             {
-                aboveUv |= (ushort)(_abovePredictionMaskUv[(int)blockSize] << shiftUv);
-                leftUv |= (ushort)(_leftPredictionMaskUv[(int)blockSize] << shiftUv);
+                aboveUv |= (ushort)(AbovePredictionMaskUv[(int)blockSize] << shiftUv);
+                leftUv |= (ushort)(LeftPredictionMaskUv[(int)blockSize] << shiftUv);
             }
 
             // If the block has no coefficients and is not intra we skip applying
@@ -295,13 +415,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // Add a mask for the transform size. The transform size mask is set to
             // be correct for a 64x64 prediction block size. Mask to match the size of
             // the block we are working on and then shift it into place.
-            aboveY |= (_sizeMask[(int)blockSize] & _above64X64TxformMask[(int)txSizeY]) << shiftY;
-            leftY |= (_sizeMask[(int)blockSize] & _left64X64TxformMask[(int)txSizeY]) << shiftY;
+            aboveY |= (SizeMask[(int)blockSize] & Above64x64TxformMask[(int)txSizeY]) << shiftY;
+            leftY |= (SizeMask[(int)blockSize] & Left64x64TxformMask[(int)txSizeY]) << shiftY;
 
             if (buildUv != 0)
             {
-                aboveUv |= (ushort)((_sizeMaskUv[(int)blockSize] & _above64X64TxformMaskUv[(int)txSizeUv]) << shiftUv);
-                leftUv |= (ushort)((_sizeMaskUv[(int)blockSize] & _left64X64TxformMaskUv[(int)txSizeUv]) << shiftUv);
+                aboveUv |= (ushort)((SizeMaskUv[(int)blockSize] & Above64x64TxformMaskUv[(int)txSizeUv]) << shiftUv);
+                leftUv |= (ushort)((SizeMaskUv[(int)blockSize] & Left64x64TxformMaskUv[(int)txSizeUv]) << shiftUv);
             }
 
             // Try to determine what to do with the internal 4x4 block boundaries. These
@@ -309,20 +429,154 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // internal ones can be skipped and don't depend on the prediction block size.
             if (txSizeY == TxSize.Tx4x4)
             {
-                int4X4Y |= _sizeMask[(int)blockSize] << shiftY;
+                int4x4Y |= SizeMask[(int)blockSize] << shiftY;
             }
 
             if (buildUv != 0 && txSizeUv == TxSize.Tx4x4)
             {
-                int4X4Uv |= (ushort)((_sizeMaskUv[(int)blockSize] & 0xffff) << shiftUv);
+                int4x4Uv |= (ushort)((SizeMaskUv[(int)blockSize] & 0xffff) << shiftUv);
+            }
+        }
+
+        private static void AdjustMask(ref Vp9Common cm, int miRow, int miCol, ref LoopFilterMask lfm)
+        { // Post-processes lfm in place: merges 32x32 bits into 16x16, moves border 4x4 bits to 8x8, and clears bits beyond cm.MiRows / cm.MiCols.
+            const ulong leftBorder = 0x1111111111111111UL; // Same values as the class-level LeftBorder / AboveBorder / LeftBorderUv / AboveBorderUv constants.
+            const ulong aboveBorder = 0x000000ff000000ffUL;
+            const ushort leftBorderUv = 0x1111;
+            const ushort aboveBorderUv = 0x000f;
+
+
+            // The largest loopfilter we have is 16x16 so we use the 16x16 mask
+            // for 32x32 transforms also.
+            lfm.LeftY[(int)TxSize.Tx16x16] |= lfm.LeftY[(int)TxSize.Tx32x32];
+            lfm.AboveY[(int)TxSize.Tx16x16] |= lfm.AboveY[(int)TxSize.Tx32x32];
+            lfm.LeftUv[(int)TxSize.Tx16x16] |= lfm.LeftUv[(int)TxSize.Tx32x32];
+            lfm.AboveUv[(int)TxSize.Tx16x16] |= lfm.AboveUv[(int)TxSize.Tx32x32];
+
+            // We do at least 8 tap filter on every 32x32 even if the transform size
+            // is 4x4. So if the 4x4 is set on a border pixel add it to the 8x8 and
+            // remove it from the 4x4.
+            lfm.LeftY[(int)TxSize.Tx8x8] |= lfm.LeftY[(int)TxSize.Tx4x4] & leftBorder;
+            lfm.LeftY[(int)TxSize.Tx4x4] &= ~leftBorder;
+            lfm.AboveY[(int)TxSize.Tx8x8] |= lfm.AboveY[(int)TxSize.Tx4x4] & aboveBorder;
+            lfm.AboveY[(int)TxSize.Tx4x4] &= ~aboveBorder;
+            lfm.LeftUv[(int)TxSize.Tx8x8] |= (ushort)(lfm.LeftUv[(int)TxSize.Tx4x4] & leftBorderUv);
+            lfm.LeftUv[(int)TxSize.Tx4x4] &= unchecked((ushort)~leftBorderUv);
+            lfm.AboveUv[(int)TxSize.Tx8x8] |= (ushort)(lfm.AboveUv[(int)TxSize.Tx4x4] & aboveBorderUv);
+            lfm.AboveUv[(int)TxSize.Tx4x4] &= unchecked((ushort)~aboveBorderUv);
+
+            // We do some special edge handling.
+            if (miRow + Constants.MiBlockSize > cm.MiRows)
+            {
+                int rows = cm.MiRows - miRow; // Rows of this block that lie inside the frame.
+
+                // Each pixel inside the border gets a 1,
+                ulong maskY = (1UL << (rows << 3)) - 1; // 8 mask bits per luma row.
+                ushort maskUv = (ushort)((1 << (((rows + 1) >> 1) << 2)) - 1); // 4 mask bits per chroma row; (rows + 1) >> 1 rounds up to whole chroma rows.
+
+                // Remove values completely outside our border.
+                for (int i = 0; i < (int)TxSize.Tx32x32; i++) // Stops before the Tx32x32 entry, which was folded into Tx16x16 above.
+                {
+                    lfm.LeftY[i] &= maskY;
+                    lfm.AboveY[i] &= maskY;
+                    lfm.LeftUv[i] &= maskUv;
+                    lfm.AboveUv[i] &= maskUv;
+                }
+
+                lfm.Int4x4Y &= maskY;
+                lfm.Int4x4Uv &= maskUv;
+
+                // We don't apply a wide loop filter on the last uv block row. If set
+                // apply the shorter one instead.
+                if (rows == 1)
+                {
+                    lfm.AboveUv[(int)TxSize.Tx8x8] |= lfm.AboveUv[(int)TxSize.Tx16x16];
+                    lfm.AboveUv[(int)TxSize.Tx16x16] = 0;
+                }
+
+                if (rows == 5) // NOTE(review): 0xff00 covers the upper two 4-bit chroma rows; presumably targets the last (third) chroma row here - confirm.
+                {
+                    lfm.AboveUv[(int)TxSize.Tx8x8] |= (ushort)(lfm.AboveUv[(int)TxSize.Tx16x16] & 0xff00);
+                    lfm.AboveUv[(int)TxSize.Tx16x16] &= (ushort)~(lfm.AboveUv[(int)TxSize.Tx16x16] & 0xff00);
+                }
+            }
+
+            if (miCol + Constants.MiBlockSize > cm.MiCols)
+            {
+                int columns = cm.MiCols - miCol; // Columns of this block that lie inside the frame.
+
+                // Each pixel inside the border gets a 1, the multiply copies the border
+                // to where we need it.
+                ulong maskY = ((1UL << columns) - 1) * 0x0101010101010101UL; // Replicates the column bits into each of the eight 8-bit rows.
+                ushort maskUv = (ushort)(((1 << ((columns + 1) >> 1)) - 1) * 0x1111); // Replicates into each of the four 4-bit chroma rows.
+
+                // Internal edges are not applied on the last column of the image so
+                // we mask 1 more for the internal edges
+                ushort maskUvInt = (ushort)(((1 << (columns >> 1)) - 1) * 0x1111);
+
+                // Remove the bits outside the image edge.
+                for (int i = 0; i < (int)TxSize.Tx32x32; i++)
+                {
+                    lfm.LeftY[i] &= maskY;
+                    lfm.AboveY[i] &= maskY;
+                    lfm.LeftUv[i] &= maskUv;
+                    lfm.AboveUv[i] &= maskUv;
+                }
+
+                lfm.Int4x4Y &= maskY;
+                lfm.Int4x4Uv &= maskUvInt;
+
+                // We don't apply a wide loop filter on the last uv column. If set
+                // apply the shorter one instead.
+                if (columns == 1)
+                {
+                    lfm.LeftUv[(int)TxSize.Tx8x8] |= lfm.LeftUv[(int)TxSize.Tx16x16];
+                    lfm.LeftUv[(int)TxSize.Tx16x16] = 0;
+                }
+
+                if (columns == 5) // NOTE(review): 0xcccc covers the upper two bits of each 4-bit chroma row; presumably targets the last (third) chroma column here - confirm.
+                {
+                    lfm.LeftUv[(int)TxSize.Tx8x8] |= (ushort)(lfm.LeftUv[(int)TxSize.Tx16x16] & 0xcccc);
+                    lfm.LeftUv[(int)TxSize.Tx16x16] &= (ushort)~(lfm.LeftUv[(int)TxSize.Tx16x16] & 0xcccc);
+                }
             }
+
+            // We don't apply a loop filter on the first column in the image, mask that
+            // out.
+            if (miCol == 0)
+            {
+                for (int i = 0; i < (int)TxSize.Tx32x32; i++)
+                {
+                    lfm.LeftY[i] &= 0xfefefefefefefefeUL; // Clears bit 0 of every 8-bit row.
+                    lfm.LeftUv[i] &= 0xeeee; // Clears bit 0 of every 4-bit row.
+                }
+            }
+
+            // Assert if we try to apply 2 different loop filters at the same position.
+            Debug.Assert((lfm.LeftY[(int)TxSize.Tx16x16] & lfm.LeftY[(int)TxSize.Tx8x8]) == 0);
+            Debug.Assert((lfm.LeftY[(int)TxSize.Tx16x16] & lfm.LeftY[(int)TxSize.Tx4x4]) == 0);
+            Debug.Assert((lfm.LeftY[(int)TxSize.Tx8x8] & lfm.LeftY[(int)TxSize.Tx4x4]) == 0);
+            Debug.Assert((lfm.Int4x4Y & lfm.LeftY[(int)TxSize.Tx16x16]) == 0);
+            Debug.Assert((lfm.LeftUv[(int)TxSize.Tx16x16] & lfm.LeftUv[(int)TxSize.Tx8x8]) == 0);
+            Debug.Assert((lfm.LeftUv[(int)TxSize.Tx16x16] & lfm.LeftUv[(int)TxSize.Tx4x4]) == 0);
+            Debug.Assert((lfm.LeftUv[(int)TxSize.Tx8x8] & lfm.LeftUv[(int)TxSize.Tx4x4]) == 0);
+            Debug.Assert((lfm.Int4x4Uv & lfm.LeftUv[(int)TxSize.Tx16x16]) == 0);
+            Debug.Assert((lfm.AboveY[(int)TxSize.Tx16x16] & lfm.AboveY[(int)TxSize.Tx8x8]) == 0);
+            Debug.Assert((lfm.AboveY[(int)TxSize.Tx16x16] & lfm.AboveY[(int)TxSize.Tx4x4]) == 0);
+            Debug.Assert((lfm.AboveY[(int)TxSize.Tx8x8] & lfm.AboveY[(int)TxSize.Tx4x4]) == 0);
+            Debug.Assert((lfm.Int4x4Y & lfm.AboveY[(int)TxSize.Tx16x16]) == 0);
+            Debug.Assert((lfm.AboveUv[(int)TxSize.Tx16x16] & lfm.AboveUv[(int)TxSize.Tx8x8]) == 0);
+            Debug.Assert((lfm.AboveUv[(int)TxSize.Tx16x16] & lfm.AboveUv[(int)TxSize.Tx4x4]) == 0);
+            Debug.Assert((lfm.AboveUv[(int)TxSize.Tx8x8] & lfm.AboveUv[(int)TxSize.Tx4x4]) == 0);
+            Debug.Assert((lfm.Int4x4Uv & lfm.AboveUv[(int)TxSize.Tx16x16]) == 0);
         }
 
         public static unsafe void ResetLfm(ref Vp9Common cm)
         {
             if (cm.Lf.FilterLevel != 0) // Skipped entirely when FilterLevel is 0.
             {
-                MemoryUtil.Fill(cm.Lf.Lfm.ToPointer(), new LoopFilterMask(), ((cm.MiRows + (Constants.MiBlockSize - 1)) >> 3) * cm.Lf.LfmStride);
+                MemoryUtil.Fill(cm.Lf.Lfm.ToPointer(), new LoopFilterMask(), // Presumably overwrites the mask array with default-initialized entries - confirm against MemoryUtil.Fill.
+                    ((cm.MiRows + (Constants.MiBlockSize - 1)) >> 3) * cm.Lf.LfmStride); // Count: MiRows rounded up and shifted by 3, times LfmStride; assumes MiBlockSize == 8 - TODO confirm.
             }
         }
 
@@ -338,9 +592,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
                 if (sharpnessLvl > 0)
                 {
-                    if (blockInsideLimit > (9 - sharpnessLvl))
+                    if (blockInsideLimit > 9 - sharpnessLvl)
                     {
-                        blockInsideLimit = (9 - sharpnessLvl);
+                        blockInsideLimit = 9 - sharpnessLvl;
                     }
                 }
 
@@ -350,7 +604,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 }
 
                 lfi.Lfthr[lvl].Lim.AsSpan().Fill((byte)blockInsideLimit);
-                lfi.Lfthr[lvl].Mblim.AsSpan().Fill((byte)(2 * (lvl + 2) + blockInsideLimit));
+                lfi.Lfthr[lvl].Mblim.AsSpan().Fill((byte)((2 * (lvl + 2)) + blockInsideLimit));
             }
         }
 
@@ -375,10 +629,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             for (segId = 0; segId < Constants.MaxSegments; segId++)
             {
                 int lvlSeg = defaultFiltLvl;
-                if (seg.IsSegFeatureActive(segId, SegLvlFeatures.SegLvlAltLf) != 0)
+                if (seg.IsSegFeatureActive(segId, SegLvlFeatures.AltLf) != 0)
                 {
-                    int data = seg.GetSegData(segId, SegLvlFeatures.SegLvlAltLf);
-                    lvlSeg = Math.Clamp(seg.AbsDelta == Constants.SegmentAbsData ? data : defaultFiltLvl + data, 0, MaxLoopFilter);
+                    int data = seg.GetSegData(segId, SegLvlFeatures.AltLf);
+                    lvlSeg = Math.Clamp(seg.AbsDelta == Constants.SegmentAbsData ? data : defaultFiltLvl + data, 0,
+                        MaxLoopFilter);
                 }
 
                 if (!lf.ModeRefDeltaEnabled)
@@ -390,19 +645,1322 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 else
                 {
                     int refr, mode;
-                    int intraLvl = lvlSeg + lf.RefDeltas[Constants.IntraFrame] * scale;
+                    int intraLvl = lvlSeg + (lf.RefDeltas[Constants.IntraFrame] * scale);
                     lfi.Lvl[segId][Constants.IntraFrame][0] = (byte)Math.Clamp(intraLvl, 0, MaxLoopFilter);
 
                     for (refr = Constants.LastFrame; refr < Constants.MaxRefFrames; ++refr)
                     {
                         for (mode = 0; mode < MaxModeLfDeltas; ++mode)
                         {
-                            int interLvl = lvlSeg + lf.RefDeltas[refr] * scale + lf.ModeDeltas[mode] * scale;
+                            int interLvl = lvlSeg + (lf.RefDeltas[refr] * scale) + (lf.ModeDeltas[mode] * scale);
                             lfi.Lvl[segId][refr][mode] = (byte)Math.Clamp(interLvl, 0, MaxLoopFilter);
                         }
                     }
                 }
             }
         }
+
+        private static void FilterSelectivelyVertRow2( // Applies the vertical-edge filters for two row bands (8 * pitch apart) in one pass.
+            int subsamplingFactor, // Non-zero: second band's bits start at bit 4 (8 mask bits used); zero: bit 8 (16 mask bits used).
+            ArrayPtr<byte> s, // Start of the first band; advanced by 8 elements per mask bit.
+            int pitch, // Forwarded to the filters; 8 * pitch is the offset from the first band to the second.
+            uint mask16x16, // Each mask packs both bands: bit i = first band, bit i + lflForward = second band.
+            uint mask8x8,
+            uint mask4x4,
+            uint mask4x4Int, // Same layout; its filter runs at an extra offset of 4 elements (Slice(4)).
+            ReadOnlySpan<LoopFilterThresh> lfthr, // Threshold table indexed by filter level.
+            ReadOnlySpan<byte> lfl) // Filter levels: lfl[0] for the first band, lfl[lflForward] for the second.
+        {
+            uint dualMaskCutoff = subsamplingFactor != 0 ? 0xffu : 0xffffu; // Keeps only the bits belonging to the two bands handled here.
+            int lflForward = subsamplingFactor != 0 ? 4 : 8; // Bit/index distance between the first and second band.
+            uint dualOne = 1u | (1u << lflForward); // Current column's bit in both bands.
+            Span<ArrayPtr<byte>> ss = stackalloc ArrayPtr<byte>[2]; // ss[0] = first band position, ss[1] = second band position.
+            Span<LoopFilterThresh> lfis = stackalloc LoopFilterThresh[2]; // Thresholds for the first/second band at the current column.
+            ss[0] = s;
+
+            for (uint mask = (mask16x16 | mask8x8 | mask4x4 | mask4x4Int) & dualMaskCutoff;
+                 mask != 0;
+                 mask = (mask & ~dualOne) >> 1) // Drop the pair just handled, then move to the next column.
+            {
+                if ((mask & dualOne) != 0) // At least one band has an edge to filter at this column.
+                {
+                    lfis[0] = lfthr[lfl[0]];
+                    lfis[1] = lfthr[lfl[lflForward]];
+                    ss[1] = ss[0].Slice(8 * pitch);
+
+                    if ((mask16x16 & dualOne) != 0)
+                    {
+                        if ((mask16x16 & dualOne) == dualOne) // Both bands set: one dual call (only lfis[0] thresholds are passed).
+                        {
+                            LoopFilterAuto.LpfVertical16Dual(ss[0], pitch, lfis[0].Mblim.AsSpan(), lfis[0].Lim.AsSpan(),
+                                lfis[0].HevThr.AsSpan());
+                        }
+                        else
+                        {
+                            ref LoopFilterThresh lfi = ref lfis[(mask16x16 & 1) == 0 ? 1 : 0]; // Bit 0 clear means only the second band is set.
+                            LoopFilterAuto.LpfVertical16(ss[(mask16x16 & 1) == 0 ? 1 : 0], pitch, lfi.Mblim.AsSpan(),
+                                lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
+                        }
+                    }
+
+                    if ((mask8x8 & dualOne) != 0)
+                    {
+                        if ((mask8x8 & dualOne) == dualOne) // Both bands set: dual call with each band's own thresholds.
+                        {
+                            LoopFilterAuto.LpfVertical8Dual(
+                                ss[0],
+                                pitch,
+                                lfis[0].Mblim.AsSpan(),
+                                lfis[0].Lim.AsSpan(),
+                                lfis[0].HevThr.AsSpan(),
+                                lfis[1].Mblim.AsSpan(),
+                                lfis[1].Lim.AsSpan(),
+                                lfis[1].HevThr.AsSpan());
+                        }
+                        else
+                        {
+                            ref LoopFilterThresh lfi = ref lfis[(mask8x8 & 1) == 0 ? 1 : 0]; // Pick whichever single band is set.
+                            LoopFilterAuto.LpfVertical8(
+                                ss[(mask8x8 & 1) == 0 ? 1 : 0],
+                                pitch,
+                                lfi.Mblim.AsSpan(),
+                                lfi.Lim.AsSpan(),
+                                lfi.HevThr.AsSpan());
+                        }
+                    }
+
+                    if ((mask4x4 & dualOne) != 0)
+                    {
+                        if ((mask4x4 & dualOne) == dualOne) // Both bands set: dual call with each band's own thresholds.
+                        {
+                            LoopFilterAuto.LpfVertical4Dual(
+                                ss[0],
+                                pitch,
+                                lfis[0].Mblim.AsSpan(),
+                                lfis[0].Lim.AsSpan(),
+                                lfis[0].HevThr.AsSpan(),
+                                lfis[1].Mblim.AsSpan(),
+                                lfis[1].Lim.AsSpan(),
+                                lfis[1].HevThr.AsSpan());
+                        }
+                        else
+                        {
+                            ref LoopFilterThresh lfi = ref lfis[(mask4x4 & 1) == 0 ? 1 : 0]; // Pick whichever single band is set.
+                            LoopFilterAuto.LpfVertical4(ss[(mask4x4 & 1) == 0 ? 1 : 0], pitch, lfi.Mblim.AsSpan(),
+                                lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
+                        }
+                    }
+
+                    if ((mask4x4Int & dualOne) != 0) // Same 4-wide filter as above, applied 4 elements further along.
+                    {
+                        if ((mask4x4Int & dualOne) == dualOne)
+                        {
+                            LoopFilterAuto.LpfVertical4Dual(
+                                ss[0].Slice(4),
+                                pitch,
+                                lfis[0].Mblim.AsSpan(),
+                                lfis[0].Lim.AsSpan(),
+                                lfis[0].HevThr.AsSpan(),
+                                lfis[1].Mblim.AsSpan(),
+                                lfis[1].Lim.AsSpan(),
+                                lfis[1].HevThr.AsSpan());
+                        }
+                        else
+                        {
+                            ref LoopFilterThresh lfi = ref lfis[(mask4x4Int & 1) == 0 ? 1 : 0]; // Pick whichever single band is set.
+                            LoopFilterAuto.LpfVertical4(ss[(mask4x4Int & 1) == 0 ? 1 : 0].Slice(4), pitch,
+                                lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
+                        }
+                    }
+                }
+
+                ss[0] = ss[0].Slice(8); // Advance position, level list and all masks together by one column.
+                lfl = lfl.Slice(1);
+                mask16x16 >>= 1;
+                mask8x8 >>= 1;
+                mask4x4 >>= 1;
+                mask4x4Int >>= 1;
+            }
+        }
+
+        private static void HighbdFilterSelectivelyVertRow2( // ushort-sample variant: vertical-edge filters for two row bands (8 * pitch apart) per pass.
+            int subsamplingFactor, // Non-zero: second band's bits start at bit 4 (8 mask bits used); zero: bit 8 (16 mask bits used).
+            ArrayPtr<ushort> s, // Start of the first band; advanced by 8 elements per mask bit.
+            int pitch, // Forwarded to the filters; 8 * pitch is the offset from the first band to the second.
+            uint mask16x16, // Each mask packs both bands: bit i = first band, bit i + lflForward = second band.
+            uint mask8x8,
+            uint mask4x4,
+            uint mask4x4Int, // Same layout; its filter runs at an extra offset of 4 elements (Slice(4)).
+            ReadOnlySpan<LoopFilterThresh> lfthr, // Threshold table indexed by filter level.
+            ReadOnlySpan<byte> lfl, // Filter levels: lfl[0] for the first band, lfl[lflForward] for the second.
+            int bd) // Passed unchanged to every HighBd filter call (presumably the bit depth - confirm against LoopFilterScalar).
+        {
+            uint dualMaskCutoff = subsamplingFactor != 0 ? 0xffu : 0xffffu; // Keeps only the bits belonging to the two bands handled here.
+            int lflForward = subsamplingFactor != 0 ? 4 : 8; // Bit/index distance between the first and second band.
+            uint dualOne = 1u | (1u << lflForward); // Current column's bit in both bands.
+            Span<ArrayPtr<ushort>> ss = stackalloc ArrayPtr<ushort>[2]; // ss[0] = first band position, ss[1] = second band position.
+            Span<LoopFilterThresh> lfis = stackalloc LoopFilterThresh[2]; // Thresholds for the first/second band at the current column.
+            ss[0] = s;
+
+            for (uint mask = (mask16x16 | mask8x8 | mask4x4 | mask4x4Int) & dualMaskCutoff;
+                 mask != 0;
+                 mask = (mask & ~dualOne) >> 1) // Drop the pair just handled, then move to the next column.
+            {
+                if ((mask & dualOne) != 0) // At least one band has an edge to filter at this column.
+                {
+                    lfis[0] = lfthr[lfl[0]];
+                    lfis[1] = lfthr[lfl[lflForward]];
+                    ss[1] = ss[0].Slice(8 * pitch);
+
+                    if ((mask16x16 & dualOne) != 0)
+                    {
+                        if ((mask16x16 & dualOne) == dualOne) // Both bands set: one dual call (only lfis[0] thresholds are passed).
+                        {
+                            LoopFilterScalar.HighBdLpfVertical16Dual(ss[0], pitch, lfis[0].Mblim[0], lfis[0].Lim[0],
+                                lfis[0].HevThr[0], bd);
+                        }
+                        else
+                        {
+                            ref LoopFilterThresh lfi = ref lfis[(mask16x16 & 1) == 0 ? 1 : 0]; // Bit 0 clear means only the second band is set.
+                            LoopFilterScalar.HighBdLpfVertical16(ss[(mask16x16 & 1) == 0 ? 1 : 0], pitch, lfi.Mblim[0],
+                                lfi.Lim[0], lfi.HevThr[0], bd);
+                        }
+                    }
+
+                    if ((mask8x8 & dualOne) != 0)
+                    {
+                        if ((mask8x8 & dualOne) == dualOne) // Both bands set: dual call with each band's own thresholds.
+                        {
+                            LoopFilterScalar.HighBdLpfVertical8Dual(
+                                ss[0],
+                                pitch,
+                                lfis[0].Mblim[0],
+                                lfis[0].Lim[0],
+                                lfis[0].HevThr[0],
+                                lfis[1].Mblim[0],
+                                lfis[1].Lim[0],
+                                lfis[1].HevThr[0],
+                                bd);
+                        }
+                        else
+                        {
+                            ref LoopFilterThresh lfi = ref lfis[(mask8x8 & 1) == 0 ? 1 : 0]; // Pick whichever single band is set.
+                            LoopFilterScalar.HighBdLpfVertical8(
+                                ss[(mask8x8 & 1) == 0 ? 1 : 0],
+                                pitch,
+                                lfi.Mblim[0],
+                                lfi.Lim[0],
+                                lfi.HevThr[0],
+                                bd);
+                        }
+                    }
+
+                    if ((mask4x4 & dualOne) != 0)
+                    {
+                        if ((mask4x4 & dualOne) == dualOne) // Both bands set: dual call with each band's own thresholds.
+                        {
+                            LoopFilterScalar.HighBdLpfVertical4Dual(
+                                ss[0],
+                                pitch,
+                                lfis[0].Mblim[0],
+                                lfis[0].Lim[0],
+                                lfis[0].HevThr[0],
+                                lfis[1].Mblim[0],
+                                lfis[1].Lim[0],
+                                lfis[1].HevThr[0],
+                                bd);
+                        }
+                        else
+                        {
+                            ref LoopFilterThresh lfi = ref lfis[(mask4x4 & 1) == 0 ? 1 : 0]; // Pick whichever single band is set.
+                            LoopFilterScalar.HighBdLpfVertical4(ss[(mask4x4 & 1) == 0 ? 1 : 0], pitch, lfi.Mblim[0],
+                                lfi.Lim[0], lfi.HevThr[0], bd);
+                        }
+                    }
+
+                    if ((mask4x4Int & dualOne) != 0) // Same 4-wide filter as above, applied 4 elements further along.
+                    {
+                        if ((mask4x4Int & dualOne) == dualOne)
+                        {
+                            LoopFilterScalar.HighBdLpfVertical4Dual(
+                                ss[0].Slice(4),
+                                pitch,
+                                lfis[0].Mblim[0],
+                                lfis[0].Lim[0],
+                                lfis[0].HevThr[0],
+                                lfis[1].Mblim[0],
+                                lfis[1].Lim[0],
+                                lfis[1].HevThr[0],
+                                bd);
+                        }
+                        else
+                        {
+                            ref LoopFilterThresh lfi = ref lfis[(mask4x4Int & 1) == 0 ? 1 : 0]; // Pick whichever single band is set.
+                            LoopFilterScalar.HighBdLpfVertical4(ss[(mask4x4Int & 1) == 0 ? 1 : 0].Slice(4), pitch,
+                                lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], bd);
+                        }
+                    }
+                }
+
+                ss[0] = ss[0].Slice(8); // Advance position, level list and all masks together by one column.
+                lfl = lfl.Slice(1);
+                mask16x16 >>= 1;
+                mask8x8 >>= 1;
+                mask4x4 >>= 1;
+                mask4x4Int >>= 1;
+            }
+        }
+
+        private static void FilterSelectivelyHoriz( // Applies the horizontal-edge filters selected by the masks, one bit per 8-element step.
+            ArrayPtr<byte> s, // Position of the first unit; advanced by 8 elements per consumed mask bit.
+            int pitch, // Forwarded to the filters; 4 * pitch is the offset used for the mask4x4Int edge.
+            uint mask16x16, // Checked first; the first matching mask among 16x16, 8x8, 4x4 decides the filter.
+            uint mask8x8,
+            uint mask4x4,
+            uint mask4x4Int, // Selects an additional 4-wide filter at s + 4 * pitch.
+            ReadOnlySpan<LoopFilterThresh> lfthr, // Threshold table indexed by filter level.
+            ReadOnlySpan<byte> lfl) // Filter level per unit; lfl[0] is the current unit, lfl[1] the next one.
+        {
+            int count; // Number of units (mask bits) consumed by the current iteration: 1, or 2 after a dual call.
+
+            for (uint mask = mask16x16 | mask8x8 | mask4x4 | mask4x4Int; mask != 0; mask >>= count)
+            {
+                count = 1;
+                if ((mask & 1) != 0)
+                {
+                    LoopFilterThresh lfi = lfthr[lfl[0]]; // Thresholds of the current unit.
+
+                    if ((mask16x16 & 1) != 0)
+                    {
+                        if ((mask16x16 & 3) == 3) // Current and next unit both set: one dual call covers both.
+                        {
+                            LoopFilterAuto.LpfHorizontal16Dual(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
+                                lfi.HevThr.AsSpan());
+                            count = 2;
+                        }
+                        else
+                        {
+                            LoopFilterAuto.LpfHorizontal16(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
+                                lfi.HevThr.AsSpan());
+                        }
+                    }
+                    else if ((mask8x8 & 1) != 0)
+                    {
+                        if ((mask8x8 & 3) == 3) // Current and next unit both set: dual path.
+                        {
+                            // Next block's thresholds.
+                            LoopFilterThresh lfin = lfthr[lfl[1]];
+
+                            LoopFilterAuto.LpfHorizontal8Dual(
+                                s,
+                                pitch,
+                                lfi.Mblim.AsSpan(),
+                                lfi.Lim.AsSpan(),
+                                lfi.HevThr.AsSpan(),
+                                lfin.Mblim.AsSpan(),
+                                lfin.Lim.AsSpan(),
+                                lfin.HevThr.AsSpan());
+
+                            if ((mask4x4Int & 3) == 3) // Internal edge set for both units.
+                            {
+                                LoopFilterAuto.LpfHorizontal4Dual(
+                                    s.Slice(4 * pitch),
+                                    pitch,
+                                    lfi.Mblim.AsSpan(),
+                                    lfi.Lim.AsSpan(),
+                                    lfi.HevThr.AsSpan(),
+                                    lfin.Mblim.AsSpan(),
+                                    lfin.Lim.AsSpan(),
+                                    lfin.HevThr.AsSpan());
+                            }
+                            else if ((mask4x4Int & 1) != 0) // Internal edge set for the current unit only.
+                            {
+                                LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(),
+                                    lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
+                            }
+                            else if ((mask4x4Int & 2) != 0) // Internal edge set for the next unit only (8 elements further).
+                            {
+                                LoopFilterAuto.LpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, lfin.Mblim.AsSpan(),
+                                    lfin.Lim.AsSpan(), lfin.HevThr.AsSpan());
+                            }
+
+                            count = 2;
+                        }
+                        else
+                        {
+                            LoopFilterAuto.LpfHorizontal8(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
+                                lfi.HevThr.AsSpan());
+
+                            if ((mask4x4Int & 1) != 0)
+                            {
+                                LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(),
+                                    lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
+                            }
+                        }
+                    }
+                    else if ((mask4x4 & 1) != 0)
+                    {
+                        if ((mask4x4 & 3) == 3) // Current and next unit both set: dual path.
+                        {
+                            // Next block's thresholds.
+                            LoopFilterThresh lfin = lfthr[lfl[1]];
+
+                            LoopFilterAuto.LpfHorizontal4Dual(
+                                s,
+                                pitch,
+                                lfi.Mblim.AsSpan(),
+                                lfi.Lim.AsSpan(),
+                                lfi.HevThr.AsSpan(),
+                                lfin.Mblim.AsSpan(),
+                                lfin.Lim.AsSpan(),
+                                lfin.HevThr.AsSpan());
+
+                            if ((mask4x4Int & 3) == 3) // Internal edge set for both units.
+                            {
+                                LoopFilterAuto.LpfHorizontal4Dual(
+                                    s.Slice(4 * pitch),
+                                    pitch,
+                                    lfi.Mblim.AsSpan(),
+                                    lfi.Lim.AsSpan(),
+                                    lfi.HevThr.AsSpan(),
+                                    lfin.Mblim.AsSpan(),
+                                    lfin.Lim.AsSpan(),
+                                    lfin.HevThr.AsSpan());
+                            }
+                            else if ((mask4x4Int & 1) != 0) // Internal edge set for the current unit only.
+                            {
+                                LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(),
+                                    lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
+                            }
+                            else if ((mask4x4Int & 2) != 0) // Internal edge set for the next unit only (8 elements further).
+                            {
+                                LoopFilterAuto.LpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, lfin.Mblim.AsSpan(),
+                                    lfin.Lim.AsSpan(), lfin.HevThr.AsSpan());
+                            }
+
+                            count = 2;
+                        }
+                        else
+                        {
+                            LoopFilterAuto.LpfHorizontal4(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
+                                lfi.HevThr.AsSpan());
+
+                            if ((mask4x4Int & 1) != 0)
+                            {
+                                LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(),
+                                    lfi.Lim.AsSpan(), lfi.HevThr.AsSpan());
+                            }
+                        }
+                    }
+                    else // Only mask4x4Int has bit 0 set: filter just the internal edge.
+                    {
+                        LoopFilterAuto.LpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
+                            lfi.HevThr.AsSpan());
+                    }
+                }
+
+                s = s.Slice(8 * count); // Skip every unit consumed above and keep levels and masks in step.
+                lfl = lfl.Slice(count);
+                mask16x16 >>= count;
+                mask8x8 >>= count;
+                mask4x4 >>= count;
+                mask4x4Int >>= count;
+            }
+        }
+
+        private static void HighbdFilterSelectivelyHoriz( // ushort-sample variant: horizontal-edge filters selected by the masks, one bit per 8-element step.
+            ArrayPtr<ushort> s, // Position of the first unit; advanced by 8 elements per consumed mask bit.
+            int pitch, // Forwarded to the filters; 4 * pitch is the offset used for the mask4x4Int edge.
+            uint mask16x16, // Checked first; the first matching mask among 16x16, 8x8, 4x4 decides the filter.
+            uint mask8x8,
+            uint mask4x4,
+            uint mask4x4Int, // Selects an additional 4-wide filter at s + 4 * pitch.
+            ReadOnlySpan<LoopFilterThresh> lfthr, // Threshold table indexed by filter level.
+            ReadOnlySpan<byte> lfl, // Filter level per unit; lfl[0] is the current unit, lfl[1] the next one.
+            int bd) // Passed unchanged to every HighBd filter call (presumably the bit depth - confirm against LoopFilterScalar).
+        {
+            int count; // Number of units (mask bits) consumed by the current iteration: 1, or 2 after a dual call.
+
+            for (uint mask = mask16x16 | mask8x8 | mask4x4 | mask4x4Int; mask != 0; mask >>= count)
+            {
+                count = 1;
+                if ((mask & 1) != 0)
+                {
+                    LoopFilterThresh lfi = lfthr[lfl[0]]; // Thresholds of the current unit.
+
+                    if ((mask16x16 & 1) != 0)
+                    {
+                        if ((mask16x16 & 3) == 3) // Current and next unit both set: one dual call covers both.
+                        {
+                            LoopFilterScalar.HighBdLpfHorizontal16Dual(s, pitch, lfi.Mblim[0], lfi.Lim[0],
+                                lfi.HevThr[0], bd);
+                            count = 2;
+                        }
+                        else
+                        {
+                            LoopFilterScalar.HighBdLpfHorizontal16(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0],
+                                bd);
+                        }
+                    }
+                    else if ((mask8x8 & 1) != 0)
+                    {
+                        if ((mask8x8 & 3) == 3) // Current and next unit both set: dual path.
+                        {
+                            // Next block's thresholds.
+                            LoopFilterThresh lfin = lfthr[lfl[1]];
+
+                            LoopFilterScalar.HighBdLpfHorizontal8Dual(
+                                s,
+                                pitch,
+                                lfi.Mblim[0],
+                                lfi.Lim[0],
+                                lfi.HevThr[0],
+                                lfin.Mblim[0],
+                                lfin.Lim[0],
+                                lfin.HevThr[0],
+                                bd);
+
+                            if ((mask4x4Int & 3) == 3) // Internal edge set for both units.
+                            {
+                                LoopFilterScalar.HighBdLpfHorizontal4Dual(
+                                    s.Slice(4 * pitch),
+                                    pitch,
+                                    lfi.Mblim[0],
+                                    lfi.Lim[0],
+                                    lfi.HevThr[0],
+                                    lfin.Mblim[0],
+                                    lfin.Lim[0],
+                                    lfin.HevThr[0],
+                                    bd);
+                            }
+                            else if ((mask4x4Int & 1) != 0) // Internal edge set for the current unit only.
+                            {
+                                LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim[0],
+                                    lfi.Lim[0], lfi.HevThr[0], bd);
+                            }
+                            else if ((mask4x4Int & 2) != 0) // Internal edge set for the next unit only (8 elements further).
+                            {
+                                LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, lfin.Mblim[0],
+                                    lfin.Lim[0], lfin.HevThr[0], bd);
+                            }
+
+                            count = 2;
+                        }
+                        else
+                        {
+                            LoopFilterScalar.HighBdLpfHorizontal8(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0],
+                                bd);
+
+                            if ((mask4x4Int & 1) != 0)
+                            {
+                                LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim[0],
+                                    lfi.Lim[0], lfi.HevThr[0], bd);
+                            }
+                        }
+                    }
+                    else if ((mask4x4 & 1) != 0)
+                    {
+                        if ((mask4x4 & 3) == 3) // Current and next unit both set: dual path.
+                        {
+                            // Next block's thresholds.
+                            LoopFilterThresh lfin = lfthr[lfl[1]];
+
+                            LoopFilterScalar.HighBdLpfHorizontal4Dual(
+                                s,
+                                pitch,
+                                lfi.Mblim[0],
+                                lfi.Lim[0],
+                                lfi.HevThr[0],
+                                lfin.Mblim[0],
+                                lfin.Lim[0],
+                                lfin.HevThr[0],
+                                bd);
+
+                            if ((mask4x4Int & 3) == 3) // Internal edge set for both units.
+                            {
+                                LoopFilterScalar.HighBdLpfHorizontal4Dual(
+                                    s.Slice(4 * pitch),
+                                    pitch,
+                                    lfi.Mblim[0],
+                                    lfi.Lim[0],
+                                    lfi.HevThr[0],
+                                    lfin.Mblim[0],
+                                    lfin.Lim[0],
+                                    lfin.HevThr[0],
+                                    bd);
+                            }
+                            else if ((mask4x4Int & 1) != 0) // Internal edge set for the current unit only.
+                            {
+                                LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim[0],
+                                    lfi.Lim[0], lfi.HevThr[0], bd);
+                            }
+                            else if ((mask4x4Int & 2) != 0) // Internal edge set for the next unit only (8 elements further).
+                            {
+                                LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(8 + (4 * pitch)), pitch, lfin.Mblim[0],
+                                    lfin.Lim[0], lfin.HevThr[0], bd);
+                            }
+
+                            count = 2;
+                        }
+                        else
+                        {
+                            LoopFilterScalar.HighBdLpfHorizontal4(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0],
+                                bd);
+
+                            if ((mask4x4Int & 1) != 0)
+                            {
+                                LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim[0],
+                                    lfi.Lim[0], lfi.HevThr[0], bd);
+                            }
+                        }
+                    }
+                    else // Only mask4x4Int has bit 0 set: filter just the internal edge.
+                    {
+                        LoopFilterScalar.HighBdLpfHorizontal4(s.Slice(4 * pitch), pitch, lfi.Mblim[0], lfi.Lim[0],
+                            lfi.HevThr[0], bd);
+                    }
+                }
+
+                s = s.Slice(8 * count); // Skip every unit consumed above and keep levels and masks in step.
+                lfl = lfl.Slice(count);
+                mask16x16 >>= count;
+                mask8x8 >>= count;
+                mask4x4 >>= count;
+                mask4x4Int >>= count;
+            }
+        }
+
+        private static void FilterSelectivelyVert( // Applies the vertical-edge filters selected by the masks, one mask bit per 8-element step.
+            ArrayPtr<byte> s, // Position of the first unit; advanced by 8 elements per mask bit.
+            int pitch, // Forwarded unchanged to the filters.
+            uint mask16x16, // Checked first; the first matching mask among 16x16, 8x8, 4x4 decides the filter.
+            uint mask8x8,
+            uint mask4x4,
+            uint mask4x4Int, // Selects an additional 4-wide filter at an offset of 4 elements (Slice(4)).
+            ReadOnlySpan<LoopFilterThresh> lfthr, // Threshold table indexed by filter level.
+            ReadOnlySpan<byte> lfl) // Filter level per unit; lfl[0] is the current unit.
+        {
+            for (uint mask = mask16x16 | mask8x8 | mask4x4 | mask4x4Int; mask != 0; mask >>= 1)
+            {
+                LoopFilterThresh lfi = lfthr[lfl[0]]; // Looked up on every iteration; shared by both filter branches below.
+
+                if ((mask & 1) != 0)
+                {
+                    if ((mask16x16 & 1) != 0)
+                    {
+                        LoopFilterAuto.LpfVertical16(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
+                            lfi.HevThr.AsSpan());
+                    }
+                    else if ((mask8x8 & 1) != 0)
+                    {
+                        LoopFilterAuto.LpfVertical8(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
+                            lfi.HevThr.AsSpan());
+                    }
+                    else if ((mask4x4 & 1) != 0)
+                    {
+                        LoopFilterAuto.LpfVertical4(s, pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
+                            lfi.HevThr.AsSpan());
+                    }
+                }
+
+                if ((mask4x4Int & 1) != 0) // Independent of the branch above: may run in addition to it.
+                {
+                    LoopFilterAuto.LpfVertical4(s.Slice(4), pitch, lfi.Mblim.AsSpan(), lfi.Lim.AsSpan(),
+                        lfi.HevThr.AsSpan());
+                }
+
+                s = s.Slice(8); // Advance position, level list and all masks together by one unit.
+                lfl = lfl.Slice(1);
+                mask16x16 >>= 1;
+                mask8x8 >>= 1;
+                mask4x4 >>= 1;
+                mask4x4Int >>= 1;
+            }
+        }
+
+        private static void HighbdFilterSelectivelyVert( // ushort-sample variant: vertical-edge filters selected by the masks, one bit per 8-element step.
+            ArrayPtr<ushort> s, // Position of the first unit; advanced by 8 elements per mask bit.
+            int pitch, // Forwarded unchanged to the filters.
+            uint mask16x16, // Checked first; the first matching mask among 16x16, 8x8, 4x4 decides the filter.
+            uint mask8x8,
+            uint mask4x4,
+            uint mask4x4Int, // Selects an additional 4-wide filter at an offset of 4 elements (Slice(4)).
+            ReadOnlySpan<LoopFilterThresh> lfthr, // Threshold table indexed by filter level.
+            ReadOnlySpan<byte> lfl, // Filter level per unit; lfl[0] is the current unit.
+            int bd) // Passed unchanged to every HighBd filter call (presumably the bit depth - confirm against LoopFilterScalar).
+        {
+            for (uint mask = mask16x16 | mask8x8 | mask4x4 | mask4x4Int; mask != 0; mask >>= 1)
+            {
+                LoopFilterThresh lfi = lfthr[lfl[0]]; // Looked up on every iteration; shared by both filter branches below.
+
+                if ((mask & 1) != 0)
+                {
+                    if ((mask16x16 & 1) != 0)
+                    {
+                        LoopFilterScalar.HighBdLpfVertical16(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], bd);
+                    }
+                    else if ((mask8x8 & 1) != 0)
+                    {
+                        LoopFilterScalar.HighBdLpfVertical8(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], bd);
+                    }
+                    else if ((mask4x4 & 1) != 0)
+                    {
+                        LoopFilterScalar.HighBdLpfVertical4(s, pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], bd);
+                    }
+                }
+
+                if ((mask4x4Int & 1) != 0) // Independent of the branch above: may run in addition to it.
+                {
+                    LoopFilterScalar.HighBdLpfVertical4(s.Slice(4), pitch, lfi.Mblim[0], lfi.Lim[0], lfi.HevThr[0], bd);
+                }
+
+                s = s.Slice(8); // Advance position, level list and all masks together by one unit.
+                lfl = lfl.Slice(1);
+                mask16x16 >>= 1;
+                mask8x8 >>= 1;
+                mask4x4 >>= 1;
+                mask4x4Int >>= 1;
+            }
+        }
+
+        private static readonly byte[] Num4x4BlocksWideLookup = { 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16 }; // 13-entry tables indexed with (int)BlockSize; per the name: block width in 4x4 units.
+        private static readonly byte[] Num4x4BlocksHighLookup = { 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16 }; // Per the name: block height in 4x4 units.
+        private static readonly byte[] Num8x8BlocksWideLookup = { 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8 }; // Per the name: block width in 8x8 units; value - 1 is used as a column alignment mask.
+        private static readonly byte[] Num8x8BlocksHighLookup = { 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8 }; // Per the name: block height in 8x8 units; value - 1 is used as a row alignment mask.
+
+        // Loop filters one superblock of a plane by rebuilding the edge masks directly from the
+        // mode info grid, instead of using a precomputed LoopFilterMask. LoopFilterRows uses this
+        // for the LfPathSlow case (chroma subsampling that is neither 1/1 nor 0/0).
+        //
+        //   cm     - frame state (dimensions in mode-info units, filter thresholds, bit depth).
+        //   plane  - plane to filter; plane.Dst.Buf is advanced while filtering and is left
+        //            pointing past the last row processed by the horizontal pass.
+        //   mi8x8  - mode info pointers, positioned at (miRow, miCol).
+        //   miRow, miCol - position of the superblock in mode-info units.
+        //
+        // Vertical edges are filtered row by row as the masks are built; the horizontal edge
+        // masks are collected per row and applied in a second pass over the same rows.
+        private static void FilterBlockPlaneNon420(
+            ref Vp9Common cm,
+            ref MacroBlockDPlane plane,
+            ArrayPtr<Ptr<ModeInfo>> mi8x8,
+            int miRow,
+            int miCol)
+        {
+            int ssX = plane.SubsamplingX;
+            int ssY = plane.SubsamplingY;
+            // With subsampling, only every (1 << ss)-th mode-info row/column maps to this plane.
+            int rowStep = 1 << ssY;
+            int colStep = 1 << ssX;
+            int rowStepStride = cm.MiStride * rowStep;
+            ref Buf2D dst = ref plane.Dst;
+            ArrayPtr<byte> dst0 = dst.Buf;
+            // Per-row masks for the horizontal pass, one bit per (subsampled) column.
+            // They are only ever OR-ed into, so they rely on starting out as zero.
+            // NOTE(review): this assumes stackalloc memory is zero-initialized here - confirm
+            // that locals-init is not skipped for this assembly.
+            Span<int> mask16x16 = stackalloc int[Constants.MiBlockSize];
+            Span<int> mask8x8 = stackalloc int[Constants.MiBlockSize];
+            Span<int> mask4x4 = stackalloc int[Constants.MiBlockSize];
+            Span<int> mask4x4Int = stackalloc int[Constants.MiBlockSize];
+            // Filter level per block; rows are 8 entries apart (indexed as (r << 3) + column).
+            Span<byte> lfl = stackalloc byte[Constants.MiBlockSize * Constants.MiBlockSize];
+
+
+            for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += rowStep)
+            {
+                // Vertical edge masks for this row only; consumed at the end of the iteration.
+                uint mask16x16C = 0;
+                uint mask8x8C = 0;
+                uint mask4x4C = 0;
+                uint borderMask;
+
+                // Determine the vertical edges that need filtering
+                for (int c = 0; c < Constants.MiBlockSize && miCol + c < cm.MiCols; c += colStep)
+                {
+                    ref ModeInfo mi = ref mi8x8[c].Value;
+                    BlockSize sbType = mi.SbType;
+                    // Skipped inter blocks are only filtered on their outer block edges.
+                    bool skipThis = mi.Skip != 0 && mi.IsInterBlock();
+                    // left edge of current unit is block/partition edge -> no skip
+                    bool blockEdgeLeft = Num4x4BlocksWideLookup[(int)sbType] <= 1 || (c & (Num8x8BlocksWideLookup[(int)sbType] - 1)) == 0;
+                    bool skipThisC = skipThis && !blockEdgeLeft;
+                    // top edge of current unit is block/partition edge -> no skip
+                    bool blockEdgeAbove = Num4x4BlocksHighLookup[(int)sbType] <= 1 || (r & (Num8x8BlocksHighLookup[(int)sbType] - 1)) == 0;
+                    bool skipThisR = skipThis && !blockEdgeAbove;
+                    TxSize txSize = mi.GetUvTxSize(ref plane);
+                    // On a subsampled axis, the last mode-info column/row of the frame gets a
+                    // narrower filter (16x16 demoted to 8x8) and no interior 4x4 filtering.
+                    bool skipBorder4x4C = ssX != 0 && miCol + c == cm.MiCols - 1;
+                    bool skipBorder4x4R = ssY != 0 && miRow + r == cm.MiRows - 1;
+
+                    // Filter level can vary per MI
+                    if ((lfl[(r << 3) + (c >> ssX)] = GetFilterLevel(ref cm.LfInfo, ref mi)) == 0)
+                    {
+                        continue;
+                    }
+
+                    // Build masks based on the transform size of each block
+                    if (txSize == TxSize.Tx32x32)
+                    {
+                        // Only every 4th column/row of the plane starts a 32x32 transform.
+                        if (!skipThisC && ((c >> ssX) & 3) == 0)
+                        {
+                            if (!skipBorder4x4C)
+                            {
+                                mask16x16C |= 1u << (c >> ssX);
+                            }
+                            else
+                            {
+                                mask8x8C |= 1u << (c >> ssX);
+                            }
+                        }
+
+                        if (!skipThisR && ((r >> ssY) & 3) == 0)
+                        {
+                            if (!skipBorder4x4R)
+                            {
+                                mask16x16[r] |= 1 << (c >> ssX);
+                            }
+                            else
+                            {
+                                mask8x8[r] |= 1 << (c >> ssX);
+                            }
+                        }
+                    }
+                    else if (txSize == TxSize.Tx16x16)
+                    {
+                        // Only every 2nd column/row of the plane starts a 16x16 transform.
+                        if (!skipThisC && ((c >> ssX) & 1) == 0)
+                        {
+                            if (!skipBorder4x4C)
+                            {
+                                mask16x16C |= 1u << (c >> ssX);
+                            }
+                            else
+                            {
+                                mask8x8C |= 1u << (c >> ssX);
+                            }
+                        }
+
+                        if (!skipThisR && ((r >> ssY) & 1) == 0)
+                        {
+                            if (!skipBorder4x4R)
+                            {
+                                mask16x16[r] |= 1 << (c >> ssX);
+                            }
+                            else
+                            {
+                                mask8x8[r] |= 1 << (c >> ssX);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        // force 8x8 filtering on 32x32 boundaries
+                        if (!skipThisC)
+                        {
+                            if (txSize == TxSize.Tx8x8 || ((c >> ssX) & 3) == 0)
+                            {
+                                mask8x8C |= 1u << (c >> ssX);
+                            }
+                            else
+                            {
+                                mask4x4C |= 1u << (c >> ssX);
+                            }
+                        }
+
+                        if (!skipThisR)
+                        {
+                            if (txSize == TxSize.Tx8x8 || ((r >> ssY) & 3) == 0)
+                            {
+                                mask8x8[r] |= 1 << (c >> ssX);
+                            }
+                            else
+                            {
+                                mask4x4[r] |= 1 << (c >> ssX);
+                            }
+                        }
+
+                        // Interior 4x4 edges exist only for transforms smaller than 8x8.
+                        if (!skipThis && txSize < TxSize.Tx8x8 && !skipBorder4x4C)
+                        {
+                            mask4x4Int[r] |= 1 << (c >> ssX);
+                        }
+                    }
+                }
+
+                // Disable filtering on the leftmost column
+                borderMask = ~(miCol == 0 ? 1u : 0u);
+
+                if (cm.UseHighBitDepth)
+                {
+                    HighbdFilterSelectivelyVert(
+                        ConvertToUshortPtr(dst.Buf),
+                        dst.Stride,
+                        mask16x16C & borderMask,
+                        mask8x8C & borderMask,
+                        mask4x4C & borderMask,
+                        (uint)mask4x4Int[r],
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lfl.Slice(r << 3),
+                        (int)cm.BitDepth);
+                }
+                else
+                {
+                    FilterSelectivelyVert(
+                        dst.Buf,
+                        dst.Stride,
+                        mask16x16C & borderMask,
+                        mask8x8C & borderMask,
+                        mask4x4C & borderMask,
+                        (uint)mask4x4Int[r],
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lfl.Slice(r << 3));
+                }
+
+                // Advance by 8 pixel rows in the plane and by rowStep rows in the mode info grid.
+                dst.Buf = dst.Buf.Slice(8 * dst.Stride);
+                mi8x8 = mi8x8.Slice(rowStepStride);
+            }
+
+            // Now do horizontal pass
+            dst.Buf = dst0;
+            for (int r = 0; r < Constants.MiBlockSize && miRow + r < cm.MiRows; r += rowStep)
+            {
+                bool skipBorder4x4R = ssY != 0 && miRow + r == cm.MiRows - 1;
+                uint mask4x4IntR = skipBorder4x4R ? 0u : (uint)mask4x4Int[r];
+
+                uint mask16x16R;
+                uint mask8x8R;
+                uint mask4x4R;
+
+                // The top edge of the frame has no row above it, so only the interior 4x4
+                // mask is kept for the first row.
+                if (miRow + r == 0)
+                {
+                    mask16x16R = 0;
+                    mask8x8R = 0;
+                    mask4x4R = 0;
+                }
+                else
+                {
+                    mask16x16R = (uint)mask16x16[r];
+                    mask8x8R = (uint)mask8x8[r];
+                    mask4x4R = (uint)mask4x4[r];
+                }
+
+                if (cm.UseHighBitDepth)
+                {
+                    HighbdFilterSelectivelyHoriz(
+                        ConvertToUshortPtr(dst.Buf),
+                        dst.Stride,
+                        mask16x16R,
+                        mask8x8R,
+                        mask4x4R,
+                        mask4x4IntR,
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lfl.Slice(r << 3),
+                        (int)cm.BitDepth);
+                }
+                else
+                {
+                    FilterSelectivelyHoriz(
+                        dst.Buf,
+                        dst.Stride,
+                        mask16x16R,
+                        mask8x8R,
+                        mask4x4R,
+                        mask4x4IntR,
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lfl.Slice(r << 3));
+                }
+
+                dst.Buf = dst.Buf.Slice(8 * dst.Stride);
+            }
+        }
+
+        // Loop filters one superblock of a plane that is not subsampled, using the luma masks
+        // and per-block filter levels already stored in lfm.
+        //
+        //   cm    - frame state (row count in mode-info units, thresholds, bit depth).
+        //   plane - plane to filter; plane.Dst.Buf is advanced while filtering and is not
+        //           restored before returning.
+        //   miRow - first mode-info row of the superblock.
+        //   lfm   - masks for this superblock; 8 bits per mode-info row, lowest row first.
+        //
+        // Left (vertical) edges are filtered first, two rows per call; above (horizontal) edges
+        // follow, one row per call.
+        private static void FilterBlockPlaneSs00(ref Vp9Common cm, ref MacroBlockDPlane plane, int miRow,
+            ref LoopFilterMask lfm)
+        {
+            Debug.Assert(plane.SubsamplingX == 0 && plane.SubsamplingY == 0);
+
+            // Stop at the end of the superblock or at the bottom of the frame, whichever is first.
+            int rowLimit = Math.Min(Constants.MiBlockSize, cm.MiRows - miRow);
+
+            ref Buf2D target = ref plane.Dst;
+            ArrayPtr<byte> firstRow = target.Buf;
+
+            ulong left16x16 = lfm.LeftY[(int)TxSize.Tx16x16];
+            ulong left8x8 = lfm.LeftY[(int)TxSize.Tx8x8];
+            ulong left4x4 = lfm.LeftY[(int)TxSize.Tx4x4];
+            ulong leftInt4x4 = lfm.Int4x4Y;
+
+            // Vertical pass: each call consumes the low 16 mask bits, i.e. two rows.
+            for (int row = 0; row < rowLimit; row += 2)
+            {
+                if (!cm.UseHighBitDepth)
+                {
+                    FilterSelectivelyVertRow2(
+                        plane.SubsamplingX,
+                        target.Buf,
+                        target.Stride,
+                        (uint)left16x16,
+                        (uint)left8x8,
+                        (uint)left4x4,
+                        (uint)leftInt4x4,
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lfm.LflY.AsSpan().Slice(row << 3));
+                }
+                else
+                {
+                    HighbdFilterSelectivelyVertRow2(
+                        plane.SubsamplingX,
+                        ConvertToUshortPtr(target.Buf),
+                        target.Stride,
+                        (uint)left16x16,
+                        (uint)left8x8,
+                        (uint)left4x4,
+                        (uint)leftInt4x4,
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lfm.LflY.AsSpan().Slice(row << 3),
+                        (int)cm.BitDepth);
+                }
+
+                target.Buf = target.Buf.Slice(16 * target.Stride);
+                left16x16 >>= 16;
+                left8x8 >>= 16;
+                left4x4 >>= 16;
+                leftInt4x4 >>= 16;
+            }
+
+            // Horizontal pass: rewind to the top of the superblock and walk one row at a time.
+            target.Buf = firstRow;
+
+            ulong above16x16 = lfm.AboveY[(int)TxSize.Tx16x16];
+            ulong above8x8 = lfm.AboveY[(int)TxSize.Tx8x8];
+            ulong above4x4 = lfm.AboveY[(int)TxSize.Tx4x4];
+            ulong aboveInt4x4 = lfm.Int4x4Y;
+
+            for (int row = 0; row < rowLimit; row++)
+            {
+                // The first row of the frame has nothing above it, so its block edges are not
+                // filtered; the interior 4x4 mask is still applied.
+                bool atFrameTop = miRow + row == 0;
+                uint row16x16 = atFrameTop ? 0u : (uint)above16x16 & 0xff;
+                uint row8x8 = atFrameTop ? 0u : (uint)above8x8 & 0xff;
+                uint row4x4 = atFrameTop ? 0u : (uint)above4x4 & 0xff;
+
+                if (!cm.UseHighBitDepth)
+                {
+                    FilterSelectivelyHoriz(
+                        target.Buf,
+                        target.Stride,
+                        row16x16,
+                        row8x8,
+                        row4x4,
+                        (uint)aboveInt4x4 & 0xff,
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lfm.LflY.AsSpan().Slice(row << 3));
+                }
+                else
+                {
+                    HighbdFilterSelectivelyHoriz(
+                        ConvertToUshortPtr(target.Buf),
+                        target.Stride,
+                        row16x16,
+                        row8x8,
+                        row4x4,
+                        (uint)aboveInt4x4 & 0xff,
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lfm.LflY.AsSpan().Slice(row << 3),
+                        (int)cm.BitDepth);
+                }
+
+                target.Buf = target.Buf.Slice(8 * target.Stride);
+                above16x16 >>= 8;
+                above8x8 >>= 8;
+                above4x4 >>= 8;
+                aboveInt4x4 >>= 8;
+            }
+        }
+
+        // Loop filters one superblock of a plane that is subsampled by two in both directions,
+        // using the chroma masks stored in lfm and filter levels sampled from the luma levels.
+        //
+        //   cm    - frame state (row count in mode-info units, thresholds, bit depth).
+        //   plane - plane to filter. The destination descriptor is copied into a local, so
+        //           the buffer position stored in plane is not modified by this method.
+        //   miRow - first mode-info row of the superblock.
+        //   lfm   - masks for this superblock; 4 bits per chroma row, lowest row first.
+        private static void FilterBlockPlaneSs11(ref Vp9Common cm, ref MacroBlockDPlane plane, int miRow,
+            ref LoopFilterMask lfm)
+        {
+            Debug.Assert(plane.SubsamplingX == 1 && plane.SubsamplingY == 1);
+
+            // Stop at the end of the superblock or at the bottom of the frame, whichever is first.
+            int rowLimit = Math.Min(Constants.MiBlockSize, cm.MiRows - miRow);
+            int chromaCols = Constants.MiBlockSize >> 1;
+
+            Buf2D target = plane.Dst;
+            ArrayPtr<byte> firstRow = target.Buf;
+
+            // Chroma filter levels, 4 entries per chroma row.
+            Span<byte> lflUv = stackalloc byte[16];
+
+            ushort left16x16 = lfm.LeftUv[(int)TxSize.Tx16x16];
+            ushort left8x8 = lfm.LeftUv[(int)TxSize.Tx8x8];
+            ushort left4x4 = lfm.LeftUv[(int)TxSize.Tx4x4];
+            ushort leftInt4x4 = lfm.Int4x4Uv;
+
+            // Vertical pass: two chroma rows (four mode-info rows) per call.
+            for (int row = 0; row < rowLimit; row += 4)
+            {
+                int uvUpper = row << 1;
+                int uvLower = (row + 2) << 1;
+                int yUpper = row << 3;
+                int yLower = (row + 2) << 3;
+
+                // Take the level of every second luma block on every second luma row.
+                for (int col = 0; col < chromaCols; col++)
+                {
+                    int yCol = col << 1;
+                    lflUv[uvUpper + col] = lfm.LflY[yUpper + yCol];
+                    lflUv[uvLower + col] = lfm.LflY[yLower + yCol];
+                }
+
+                if (!cm.UseHighBitDepth)
+                {
+                    FilterSelectivelyVertRow2(
+                        plane.SubsamplingX,
+                        target.Buf,
+                        target.Stride,
+                        left16x16,
+                        left8x8,
+                        left4x4,
+                        leftInt4x4,
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lflUv.Slice(row << 1));
+                }
+                else
+                {
+                    HighbdFilterSelectivelyVertRow2(
+                        plane.SubsamplingX,
+                        ConvertToUshortPtr(target.Buf),
+                        target.Stride,
+                        left16x16,
+                        left8x8,
+                        left4x4,
+                        leftInt4x4,
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lflUv.Slice(row << 1),
+                        (int)cm.BitDepth);
+                }
+
+                target.Buf = target.Buf.Slice(16 * target.Stride);
+                left16x16 >>= 8;
+                left8x8 >>= 8;
+                left4x4 >>= 8;
+                leftInt4x4 >>= 8;
+            }
+
+            // Horizontal pass: rewind and process one chroma row (two mode-info rows) per call.
+            target.Buf = firstRow;
+
+            ushort above16x16 = lfm.AboveUv[(int)TxSize.Tx16x16];
+            ushort above8x8 = lfm.AboveUv[(int)TxSize.Tx8x8];
+            ushort above4x4 = lfm.AboveUv[(int)TxSize.Tx4x4];
+            ushort aboveInt4x4 = lfm.Int4x4Uv;
+
+            for (int row = 0; row < rowLimit; row += 2)
+            {
+                // No interior 4x4 filtering when this is the last mode-info row of the frame.
+                bool atFrameBottom = miRow + row == cm.MiRows - 1;
+                uint rowInt4x4 = atFrameBottom ? 0u : (uint)aboveInt4x4 & 0xf;
+
+                // The first row of the frame has nothing above it to filter against.
+                bool atFrameTop = miRow + row == 0;
+                uint row16x16 = atFrameTop ? 0u : (uint)above16x16 & 0xf;
+                uint row8x8 = atFrameTop ? 0u : (uint)above8x8 & 0xf;
+                uint row4x4 = atFrameTop ? 0u : (uint)above4x4 & 0xf;
+
+                if (!cm.UseHighBitDepth)
+                {
+                    FilterSelectivelyHoriz(
+                        target.Buf,
+                        target.Stride,
+                        row16x16,
+                        row8x8,
+                        row4x4,
+                        rowInt4x4,
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lflUv.Slice(row << 1));
+                }
+                else
+                {
+                    HighbdFilterSelectivelyHoriz(
+                        ConvertToUshortPtr(target.Buf),
+                        target.Stride,
+                        row16x16,
+                        row8x8,
+                        row4x4,
+                        rowInt4x4,
+                        cm.LfInfo.Lfthr.AsSpan(),
+                        lflUv.Slice(row << 1),
+                        (int)cm.BitDepth);
+                }
+
+                target.Buf = target.Buf.Slice(8 * target.Stride);
+                above16x16 >>= 4;
+                above8x8 >>= 4;
+                above4x4 >>= 4;
+                aboveInt4x4 >>= 4;
+            }
+        }
+
+        // Selects the routine LoopFilterRows uses for the chroma planes of a superblock.
+        private enum LfPath
+        {
+            // Any other subsampling combination: FilterBlockPlaneNon420.
+            LfPathSlow = 0,
+            // Chroma subsampled in both directions: FilterBlockPlaneSs11.
+            LfPath420 = 1,
+            // No chroma subsampling, or luma only: FilterBlockPlaneSs00.
+            LfPath444 = 2
+        }
+
+        // Loop filters the superblock rows start, start + step, ... below stop.
+        //
+        //   frameBuffer - surface the destination planes are set up from.
+        //   cm          - frame state (mode info grid, loop filter masks, dimensions).
+        //   planes      - received by value, so the destination setup done here stays local
+        //                 to this call.
+        //   start, stop - mode-info row range; stop is exclusive.
+        //   step        - distance between the rows handled by this call, in mode-info rows.
+        //   yOnly       - when true only plane 0 is filtered.
+        //   lfSync      - SyncRead is called before and SyncWrite after each superblock.
+        //                 NOTE(review): presumably this orders work between rows that are
+        //                 filtered concurrently - confirm in LfSync.
+        private static void LoopFilterRows(
+            ref Surface frameBuffer,
+            ref Vp9Common cm,
+            Array3<MacroBlockDPlane> planes,
+            int start,
+            int stop,
+            int step,
+            bool yOnly,
+            LfSync lfSync)
+        {
+            int planeCount = yOnly ? 1 : Constants.MaxMbPlane;
+            int sbCols = TileInfo.MiColsAlignedToSb(cm.MiCols) >> Constants.MiBlockSizeLog2;
+
+            // The chroma routine depends only on the subsampling of plane 1, so pick it once.
+            LfPath chromaPath = LfPath.LfPathSlow;
+            if (yOnly)
+            {
+                chromaPath = LfPath.LfPath444;
+            }
+            else
+            {
+                int chromaSsX = planes[1].SubsamplingX;
+                int chromaSsY = planes[1].SubsamplingY;
+
+                if (chromaSsY == 1 && chromaSsX == 1)
+                {
+                    chromaPath = LfPath.LfPath420;
+                }
+                else if (chromaSsY == 0 && chromaSsX == 0)
+                {
+                    chromaPath = LfPath.LfPath444;
+                }
+            }
+
+            for (int miRow = start; miRow < stop; miRow += step)
+            {
+                int sbRow = miRow >> Constants.MiBlockSizeLog2;
+                ArrayPtr<Ptr<ModeInfo>> rowModeInfo = cm.MiGridVisible.Slice(miRow * cm.MiStride);
+                Span<LoopFilterMask> masks = GetLfm(ref cm.Lf, miRow, 0);
+
+                int miCol = 0;
+                while (miCol < cm.MiCols)
+                {
+                    int sbCol = miCol >> Constants.MiBlockSizeLog2;
+
+                    lfSync.SyncRead(sbRow, sbCol);
+
+                    ReconInter.SetupDstPlanes(ref planes, ref frameBuffer, miRow, miCol);
+
+                    AdjustMask(ref cm, miRow, miCol, ref masks[0]);
+
+                    // Plane 0 always takes the non-subsampled path.
+                    FilterBlockPlaneSs00(ref cm, ref planes[0], miRow, ref masks[0]);
+
+                    for (int p = 1; p < planeCount; ++p)
+                    {
+                        if (chromaPath == LfPath.LfPath420)
+                        {
+                            FilterBlockPlaneSs11(ref cm, ref planes[p], miRow, ref masks[0]);
+                        }
+                        else if (chromaPath == LfPath.LfPath444)
+                        {
+                            FilterBlockPlaneSs00(ref cm, ref planes[p], miRow, ref masks[0]);
+                        }
+                        else if (chromaPath == LfPath.LfPathSlow)
+                        {
+                            FilterBlockPlaneNon420(ref cm, ref planes[p], rowModeInfo.Slice(miCol), miRow,
+                                miCol);
+                        }
+                    }
+
+                    lfSync.SyncWrite(sbRow, sbCol, sbCols);
+
+                    // Move on to the next superblock column and its mask.
+                    miCol += Constants.MiBlockSize;
+                    masks = masks.Slice(1);
+                }
+            }
+        }
+
+        // Loop filters a frame on the calling thread.
+        //
+        //   frame            - surface to filter in place.
+        //   cm               - frame state.
+        //   xd               - supplies the plane descriptors (xd.Plane).
+        //   frameFilterLevel - nothing is done when this is 0.
+        //   yOnly            - filter plane 0 only.
+        //   partialFrame     - when true and the frame has more than 8 mode-info rows, only a
+        //                      band starting at the superblock-aligned middle row is filtered.
+        //
+        // Unlike LoopFilterFrameMt, this method does not call LoopFilterFrameInit.
+        public static void LoopFilterFrame(
+            ref Surface frame,
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            int frameFilterLevel,
+            bool yOnly,
+            bool partialFrame)
+        {
+            if (frameFilterLevel == 0)
+            {
+                return;
+            }
+
+            bool bandOnly = partialFrame && cm.MiRows > 8;
+
+            // Band: start at the middle row rounded down to a multiple of 8 and cover
+            // an eighth of the frame, but never fewer than 8 rows.
+            int firstMiRow = bandOnly ? (cm.MiRows >> 1) & ~7 : 0;
+            int miRowCount = bandOnly ? Math.Max(cm.MiRows / 8, 8) : cm.MiRows;
+
+            LoopFilterRows(
+                ref frame,
+                ref cm,
+                xd.Plane,
+                firstMiRow,
+                firstMiRow + miRowCount,
+                Constants.MiBlockSize,
+                yOnly,
+                default);
+        }
+
+        // Splits the loop filtering of rows [start, stop) across parallel workers.
+        // Worker i begins at superblock row i of the range and then skips ahead by the number
+        // of workers, so the rows are interleaved between them. Every worker receives the same
+        // LfSync instance and its own by-value copy of planes.
+        // The worker count is capped by threadCount, the number of tile columns and the number
+        // of superblock rows.
+        private static void LoopFilterRowsMt(
+            ref Surface frameBuffer,
+            ref Vp9Common cm,
+            Array3<MacroBlockDPlane> planes,
+            int start,
+            int stop,
+            bool yOnly,
+            int threadCount)
+        {
+            int sbRows = TileInfo.MiColsAlignedToSb(cm.MiRows) >> Constants.MiBlockSizeLog2;
+            int tileCols = 1 << cm.Log2TileCols;
+            int workerCount = Math.Min(threadCount, Math.Min(tileCols, sbRows));
+            int workerStride = workerCount * Constants.MiBlockSize;
+
+            LfSync rowSync = new();
+            rowSync.Initialize(cm.Width, sbRows);
+
+            // ref parameters cannot be captured by the lambda, so wrap them in pointers.
+            Ptr<Surface> surfacePtr = new(ref frameBuffer);
+            Ptr<Vp9Common> commonPtr = new(ref cm);
+
+            Parallel.For(0, workerCount, workerIndex =>
+            {
+                int firstRow = start + (workerIndex * Constants.MiBlockSize);
+
+                LoopFilterRows(
+                    ref surfacePtr.Value,
+                    ref commonPtr.Value,
+                    planes,
+                    firstRow,
+                    stop,
+                    workerStride,
+                    yOnly,
+                    rowSync);
+            });
+        }
+
+        // Loop filters a frame using up to threadCount parallel workers.
+        //
+        //   frame            - surface to filter in place.
+        //   cm               - frame state; LoopFilterFrameInit is run on it before filtering.
+        //   xd               - supplies the plane descriptors (xd.Plane).
+        //   frameFilterLevel - nothing is done when this is 0.
+        //   yOnly            - filter plane 0 only.
+        //   partialFrame     - when true and the frame has more than 8 mode-info rows, only a
+        //                      band starting at the superblock-aligned middle row is filtered.
+        //   threadCount      - upper bound on the number of workers.
+        public static void LoopFilterFrameMt(
+            ref Surface frame,
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            int frameFilterLevel,
+            bool yOnly,
+            bool partialFrame,
+            int threadCount)
+        {
+            if (frameFilterLevel == 0)
+            {
+                return;
+            }
+
+            bool bandOnly = partialFrame && cm.MiRows > 8;
+
+            // Band: start at the middle row rounded down to a multiple of 8 and cover
+            // an eighth of the frame, but never fewer than 8 rows.
+            int firstMiRow = bandOnly ? (cm.MiRows >> 1) & ~7 : 0;
+            int miRowCount = bandOnly ? Math.Max(cm.MiRows / 8, 8) : cm.MiRows;
+            int lastMiRow = firstMiRow + miRowCount;
+
+            LoopFilterFrameInit(ref cm, frameFilterLevel);
+            LoopFilterRowsMt(ref frame, ref cm, xd.Plane, firstMiRow, lastMiRow, yOnly, threadCount);
+        }
+
+        // Reinterprets a byte buffer as 16-bit samples: same start address, half the length.
+        // Used by the high bit depth filter paths.
+        private static unsafe ArrayPtr<ushort> ConvertToUshortPtr(ArrayPtr<byte> s)
+        {
+            ushort* samples = (ushort*)s.ToPointer();
+            int sampleCount = s.Length / 2;
+
+            return new ArrayPtr<ushort>(samples, sampleCount);
+        }
     }
-}
+}

+ 432 - 403
src/Ryujinx.Graphics.Nvdec.Vp9/Luts.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using System;
 
@@ -8,170 +8,175 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
     {
         // 13 byte entries with values 0-3; presumably one per block size (4X4 ... 64X64),
         // like the other 13-entry tables in this class - TODO confirm against callers.
         public static ReadOnlySpan<byte> SizeGroupLookup => new byte[] { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 };
 
-        public static readonly BlockSize[][] SubsizeLookup = {
-            new[]
+        public static readonly BlockSize[][] SubsizeLookup = new BlockSize[][]
+        { // Indexed [partition type][block size]; BlockInvalid marks combinations that have no sub-size
+            new BlockSize[]
             { // PARTITION_NONE
                 BlockSize.Block4x4, BlockSize.Block4x8, BlockSize.Block8x4, BlockSize.Block8x8, BlockSize.Block8x16, BlockSize.Block16x8,
                 BlockSize.Block16x16, BlockSize.Block16x32, BlockSize.Block32x16, BlockSize.Block32x32, BlockSize.Block32x64,
-                BlockSize.Block64x32, BlockSize.Block64x64,
+                BlockSize.Block64x32, BlockSize.Block64x64
             },
-            new[]
+            new BlockSize[]
             { // PARTITION_HORZ
                 BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block8x4, BlockSize.BlockInvalid,
                 BlockSize.BlockInvalid, BlockSize.Block16x8, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block32x16,
-                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block64x32,
+                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block64x32
             },
-            new[]
+            new BlockSize[]
             { // PARTITION_VERT
                 BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block4x8, BlockSize.BlockInvalid,
                 BlockSize.BlockInvalid, BlockSize.Block8x16, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block16x32,
-                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block32x64,
+                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block32x64
             },
-            new[]
+            new BlockSize[]
             { // PARTITION_SPLIT
                 BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block4x4, BlockSize.BlockInvalid,
                 BlockSize.BlockInvalid, BlockSize.Block8x8, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block16x16,
-                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block32x32,
-            },
+                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block32x32
+            }
         };
 
-        public static readonly TxSize[] MaxTxSizeLookup = {
+        public static readonly TxSize[] MaxTxSizeLookup = new TxSize[]
+        { // 13 entries; presumably the largest transform size for each block size (4X4 ... 64X64) - TODO confirm
             TxSize.Tx4x4,   TxSize.Tx4x4,   TxSize.Tx4x4,   TxSize.Tx8x8,   TxSize.Tx8x8,   TxSize.Tx8x8,  TxSize.Tx16x16,
-            TxSize.Tx16x16, TxSize.Tx16x16, TxSize.Tx32x32, TxSize.Tx32x32, TxSize.Tx32x32, TxSize.Tx32x32,
+            TxSize.Tx16x16, TxSize.Tx16x16, TxSize.Tx32x32, TxSize.Tx32x32, TxSize.Tx32x32, TxSize.Tx32x32
         };
 
-        public static readonly TxSize[] TxModeToBiggestTxSize = {
-            TxSize.Tx4x4, // ONLY_4X4
-            TxSize.Tx8x8, // ALLOW_8X8
-            TxSize.Tx16x16, // ALLOW_16X16
-            TxSize.Tx32x32, // ALLOW_32X32
-            TxSize.Tx32x32, // TX_MODE_SELECT
+        public static readonly TxSize[] TxModeToBiggestTxSize = new TxSize[]
+        { // One entry per transform mode, in the order named by the per-entry comments
+            TxSize.Tx4x4,    // ONLY_4X4
+            TxSize.Tx8x8,    // ALLOW_8X8
+            TxSize.Tx16x16,  // ALLOW_16X16
+            TxSize.Tx32x32,  // ALLOW_32X32
+            TxSize.Tx32x32,  // TX_MODE_SELECT
         };
 
-        public static readonly BlockSize[][][] SsSizeLookup = {
+        public static readonly BlockSize[][][] SsSizeLookup = new BlockSize[][][]
+        { // Indexed [block size][ss_x][ss_y]; one row per block size, Block4x4 through Block64x64
             //  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
             //  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
-            new[] { new[] { BlockSize.Block4x4, BlockSize.BlockInvalid }, new[] { BlockSize.BlockInvalid, BlockSize.BlockInvalid } },
-            new[] { new[] { BlockSize.Block4x8, BlockSize.Block4x4 }, new[] { BlockSize.BlockInvalid, BlockSize.BlockInvalid } },
-            new[] { new[] { BlockSize.Block8x4, BlockSize.BlockInvalid }, new[] { BlockSize.Block4x4, BlockSize.BlockInvalid } },
-            new[] { new[] { BlockSize.Block8x8, BlockSize.Block8x4 }, new[] { BlockSize.Block4x8, BlockSize.Block4x4 } },
-            new[] { new[] { BlockSize.Block8x16, BlockSize.Block8x8 }, new[] { BlockSize.BlockInvalid, BlockSize.Block4x8 } },
-            new[] { new[] { BlockSize.Block16x8, BlockSize.BlockInvalid }, new[] { BlockSize.Block8x8, BlockSize.Block8x4 } },
-            new[] { new[] { BlockSize.Block16x16, BlockSize.Block16x8 }, new[] { BlockSize.Block8x16, BlockSize.Block8x8 } },
-            new[] { new[] { BlockSize.Block16x32, BlockSize.Block16x16 }, new[] { BlockSize.BlockInvalid, BlockSize.Block8x16 } },
-            new[] { new[] { BlockSize.Block32x16, BlockSize.BlockInvalid }, new[] { BlockSize.Block16x16, BlockSize.Block16x8 } },
-            new[] { new[] { BlockSize.Block32x32, BlockSize.Block32x16 }, new[] { BlockSize.Block16x32, BlockSize.Block16x16 } },
-            new[] { new[] { BlockSize.Block32x64, BlockSize.Block32x32 }, new[] { BlockSize.BlockInvalid, BlockSize.Block16x32 } },
-            new[] { new[] { BlockSize.Block64x32, BlockSize.BlockInvalid }, new[] { BlockSize.Block32x32, BlockSize.Block32x16 } },
-            new[] { new[] { BlockSize.Block64x64, BlockSize.Block64x32 }, new[] { BlockSize.Block32x64, BlockSize.Block32x32 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block4x4, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.BlockInvalid } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block4x8, BlockSize.Block4x4 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.BlockInvalid } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block8x4, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block4x4, BlockSize.BlockInvalid } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block8x8, BlockSize.Block8x4 }, new BlockSize[] { BlockSize.Block4x8, BlockSize.Block4x4 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block8x16, BlockSize.Block8x8 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.Block4x8 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block16x8, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block8x8, BlockSize.Block8x4 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block16x16, BlockSize.Block16x8 }, new BlockSize[] { BlockSize.Block8x16, BlockSize.Block8x8 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block16x32, BlockSize.Block16x16 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.Block8x16 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block32x16, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block16x16, BlockSize.Block16x8 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block32x32, BlockSize.Block32x16 }, new BlockSize[] { BlockSize.Block16x32, BlockSize.Block16x16 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block32x64, BlockSize.Block32x32 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.Block16x32 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block64x32, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block32x32, BlockSize.Block32x16 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block64x64, BlockSize.Block64x32 }, new BlockSize[] { BlockSize.Block32x64, BlockSize.Block32x32 } },
         };
 
-        public static readonly TxSize[][][][] UvTxsizeLookup = {
+        public static readonly TxSize[][][][] UvTxsizeLookup = new TxSize[][][][]
+        {
           //  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
           //  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
-          new[]
+          new TxSize[][][]
           {
               // BLOCK_4X4
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
           },
-          new[]
+          new TxSize[][][]
           {
-              // BLOCK_4X8
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              // BLOCK_4x8
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
           },
-          new[]
+          new TxSize[][][]
           {
-              // BLOCK_8X4
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              // BLOCK_8x4
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
           },
-          new[]
+          new TxSize[][][]
           {
               // BLOCK_8X8
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
           },
-          new[]
+          new TxSize[][][]
           {
-              // BLOCK_8X16
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              // BLOCK_8x16
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
           },
-          new[]
+          new TxSize[][][]
           {
-              // BLOCK_16X8
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new[] { TxSize.Tx8x8, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              // BLOCK_16x8
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
           },
-          new[]
+          new TxSize[][][]
           {
               // BLOCK_16X16
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
           },
-          new[]
+          new TxSize[][][]
           {
-              // BLOCK_16X32
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              // BLOCK_16x32
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
           },
-          new[]
+          new TxSize[][][]
           {
-              // BLOCK_32X16
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new[] { TxSize.Tx16x16, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new[] { TxSize.Tx16x16, TxSize.Tx8x8 } },
+              // BLOCK_32x16
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 } },
           },
-          new[]
+          new TxSize[][][]
           {
               // BLOCK_32X32
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
-              new[] { new[] { TxSize.Tx32x32, TxSize.Tx16x16 }, new[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
           },
-          new[]
+          new TxSize[][][]
           {
-              // BLOCK_32X64
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
-              new[] { new[] { TxSize.Tx32x32, TxSize.Tx32x32 }, new[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+              // BLOCK_32x64
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx32x32 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
           },
-          new[]
+          new TxSize[][][]
           {
-              // BLOCK_64X32
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
-              new[] { new[] { TxSize.Tx32x32, TxSize.Tx16x16 }, new[] { TxSize.Tx32x32, TxSize.Tx16x16 } },
+              // BLOCK_64x32
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx32x32, TxSize.Tx16x16 } },
           },
-          new[]
+          new TxSize[][][]
           {
-              // BLOCK_64X64
-              new[] { new[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
-              new[] { new[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
-              new[] { new[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
-              new[] { new[] { TxSize.Tx32x32, TxSize.Tx32x32 }, new[] { TxSize.Tx32x32, TxSize.Tx32x32 } },
+              // BLOCK_64x64
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx32x32 }, new TxSize[] { TxSize.Tx32x32, TxSize.Tx32x32 } },
           },
         };
 
@@ -190,25 +195,27 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         // Generates a 4-bit field in which each bit set to 1 represents
         // a block size partition. 1111 means we split 64x64, 32x32, 16x16
         // and 8x8. 1000 means we just split the 64x64 to 32x32.
-        public static readonly PartitionContextPair[] PartitionContextLookup = {
-            new(15, 15), // 4X4   - {0b1111, 0b1111}
-            new(15, 14), // 4X8   - {0b1111, 0b1110}
-            new(14, 15), // 8X4   - {0b1110, 0b1111}
-            new(14, 14), // 8X8   - {0b1110, 0b1110}
-            new(14, 12), // 8X16  - {0b1110, 0b1100}
-            new(12, 14), // 16X8  - {0b1100, 0b1110}
-            new(12, 12), // 16X16 - {0b1100, 0b1100}
-            new(12, 8), // 16X32 - {0b1100, 0b1000}
-            new(8, 12), // 32X16 - {0b1000, 0b1100}
-            new(8, 8), // 32X32 - {0b1000, 0b1000}
-            new(8, 0), // 32X64 - {0b1000, 0b0000}
-            new(0, 8), // 64X32 - {0b0000, 0b1000}
-            new(0, 0), // 64X64 - {0b0000, 0b0000}
+        public static readonly PartitionContextPair[] PartitionContextLookup = new PartitionContextPair[]
+        {
+            new PartitionContextPair(15, 15),  // 4X4   - {0b1111, 0b1111}
+            new PartitionContextPair(15, 14),  // 4X8   - {0b1111, 0b1110}
+            new PartitionContextPair(14, 15),  // 8X4   - {0b1110, 0b1111}
+            new PartitionContextPair(14, 14),  // 8X8   - {0b1110, 0b1110}
+            new PartitionContextPair(14, 12),  // 8X16  - {0b1110, 0b1100}
+            new PartitionContextPair(12, 14),  // 16X8  - {0b1100, 0b1110}
+            new PartitionContextPair(12, 12),  // 16X16 - {0b1100, 0b1100}
+            new PartitionContextPair(12, 8),   // 16X32 - {0b1100, 0b1000}
+            new PartitionContextPair(8, 12),   // 32X16 - {0b1000, 0b1100}
+            new PartitionContextPair(8, 8),    // 32X32 - {0b1000, 0b1000}
+            new PartitionContextPair(8, 0),    // 32X64 - {0b1000, 0b0000}
+            new PartitionContextPair(0, 8),    // 64X32 - {0b0000, 0b1000}
+            new PartitionContextPair(0, 0),    // 64X64 - {0b0000, 0b0000}
         };
 
         // Filter
 
-        private static readonly Array8<short>[] _bilinearFilters = {
+        private static readonly Array8<short>[] BilinearFilters = new Array8<short>[]
+        {
             NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0),  NewArray8Short(0, 0, 0, 120, 8, 0, 0, 0),
             NewArray8Short(0, 0, 0, 112, 16, 0, 0, 0), NewArray8Short(0, 0, 0, 104, 24, 0, 0, 0),
             NewArray8Short(0, 0, 0, 96, 32, 0, 0, 0),  NewArray8Short(0, 0, 0, 88, 40, 0, 0, 0),
@@ -216,11 +223,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             NewArray8Short(0, 0, 0, 64, 64, 0, 0, 0),  NewArray8Short(0, 0, 0, 56, 72, 0, 0, 0),
             NewArray8Short(0, 0, 0, 48, 80, 0, 0, 0),  NewArray8Short(0, 0, 0, 40, 88, 0, 0, 0),
             NewArray8Short(0, 0, 0, 32, 96, 0, 0, 0),  NewArray8Short(0, 0, 0, 24, 104, 0, 0, 0),
-            NewArray8Short(0, 0, 0, 16, 112, 0, 0, 0), NewArray8Short(0, 0, 0, 8, 120, 0, 0, 0),
+            NewArray8Short(0, 0, 0, 16, 112, 0, 0, 0), NewArray8Short(0, 0, 0, 8, 120, 0, 0, 0)
         };
 
         // Lagrangian interpolation filter
-        private static readonly Array8<short>[] _subPelFilters8 = {
+        private static readonly Array8<short>[] SubPelFilters8 = new Array8<short>[]
+        {
             NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0),        NewArray8Short(0, 1, -5, 126, 8, -3, 1, 0),
             NewArray8Short(-1, 3, -10, 122, 18, -6, 2, 0),   NewArray8Short(-1, 4, -13, 118, 27, -9, 3, -1),
             NewArray8Short(-1, 4, -16, 112, 37, -11, 4, -1), NewArray8Short(-1, 5, -18, 105, 48, -14, 4, -1),
@@ -228,11 +236,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             NewArray8Short(-1, 6, -19, 78, 78, -19, 6, -1),  NewArray8Short(-1, 5, -18, 68, 88, -19, 6, -1),
             NewArray8Short(-1, 5, -16, 58, 97, -19, 5, -1),  NewArray8Short(-1, 4, -14, 48, 105, -18, 5, -1),
             NewArray8Short(-1, 4, -11, 37, 112, -16, 4, -1), NewArray8Short(-1, 3, -9, 27, 118, -13, 4, -1),
-            NewArray8Short(0, 2, -6, 18, 122, -10, 3, -1),   NewArray8Short(0, 1, -3, 8, 126, -5, 1, 0),
+            NewArray8Short(0, 2, -6, 18, 122, -10, 3, -1),   NewArray8Short(0, 1, -3, 8, 126, -5, 1, 0)
         };
 
         // DCT based filter
-        private static readonly Array8<short>[] _subPelFilters8S = {
+        private static readonly Array8<short>[] SubPelFilters8S = new Array8<short>[]
+        {
             NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0),         NewArray8Short(-1, 3, -7, 127, 8, -3, 1, 0),
             NewArray8Short(-2, 5, -13, 125, 17, -6, 3, -1),   NewArray8Short(-3, 7, -17, 121, 27, -10, 5, -2),
             NewArray8Short(-4, 9, -20, 115, 37, -13, 6, -2),  NewArray8Short(-4, 10, -23, 108, 48, -16, 8, -3),
@@ -240,11 +249,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             NewArray8Short(-4, 11, -23, 80, 80, -23, 11, -4), NewArray8Short(-4, 10, -21, 70, 90, -24, 11, -4),
             NewArray8Short(-3, 9, -19, 59, 100, -24, 10, -4), NewArray8Short(-3, 8, -16, 48, 108, -23, 10, -4),
             NewArray8Short(-2, 6, -13, 37, 115, -20, 9, -4),  NewArray8Short(-2, 5, -10, 27, 121, -17, 7, -3),
-            NewArray8Short(-1, 3, -6, 17, 125, -13, 5, -2),   NewArray8Short(0, 1, -3, 8, 127, -7, 3, -1),
+            NewArray8Short(-1, 3, -6, 17, 125, -13, 5, -2),   NewArray8Short(0, 1, -3, 8, 127, -7, 3, -1)
         };
 
         // freqmultiplier = 0.5
-        private static readonly Array8<short>[] _subPelFilters8Lp = {
+        private static readonly Array8<short>[] SubPelFilters8Lp = new Array8<short>[]
+        {
             NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0),       NewArray8Short(-3, -1, 32, 64, 38, 1, -3, 0),
             NewArray8Short(-2, -2, 29, 63, 41, 2, -3, 0),   NewArray8Short(-2, -2, 26, 63, 43, 4, -4, 0),
             NewArray8Short(-2, -3, 24, 62, 46, 5, -4, 0),   NewArray8Short(-2, -3, 21, 60, 49, 7, -4, 0),
@@ -252,12 +262,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             NewArray8Short(-1, -4, 14, 55, 55, 14, -4, -1), NewArray8Short(-1, -4, 12, 53, 57, 16, -4, -1),
             NewArray8Short(0, -4, 9, 51, 59, 18, -4, -1),   NewArray8Short(0, -4, 7, 49, 60, 21, -3, -2),
             NewArray8Short(0, -4, 5, 46, 62, 24, -3, -2),   NewArray8Short(0, -4, 4, 43, 63, 26, -2, -2),
-            NewArray8Short(0, -3, 2, 41, 63, 29, -2, -2),   NewArray8Short(0, -3, 1, 38, 64, 32, -1, -3),
+            NewArray8Short(0, -3, 2, 41, 63, 29, -2, -2),   NewArray8Short(0, -3, 1, 38, 64, 32, -1, -3)
         };
 
         private static Array8<short> NewArray8Short(short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7)
         {
-            Array8<short> output = new();
+            Array8<short> output = new Array8<short>();
 
             output[0] = e0;
             output[1] = e1;
@@ -271,46 +281,54 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             return output;
         }
 
-        public static readonly Array8<short>[][] Vp9FilterKernels = {
-            _subPelFilters8, _subPelFilters8Lp, _subPelFilters8S, _bilinearFilters,
+        public static readonly Array8<short>[][] FilterKernels = new Array8<short>[][]
+        { // The four kernel tables declared above; presumably indexed by interpolation filter type - TODO confirm against callers
+            SubPelFilters8, SubPelFilters8Lp, SubPelFilters8S, BilinearFilters
         };
 
         // Scan
 
-        private static readonly short[] _defaultScan4X4 = {
+        private static readonly short[] DefaultScan4X4 = new short[]
+        { // Default scan order for a 4x4 block: a permutation of the indices 0-15
             0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15,
         };
 
-        private static readonly short[] _colScan4X4 = {
+        private static readonly short[] ColScan4X4 = new short[]
+        { // Column scan order for a 4x4 block: a permutation of the indices 0-15
             0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15,
         };
 
-        private static readonly short[] _rowScan4X4 = {
+        private static readonly short[] RowScan4X4 = new short[]
+        { // Row scan order for a 4x4 block: a permutation of the indices 0-15
             0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15,
         };
 
-        private static readonly short[] _defaultScan8X8 = {
+        private static readonly short[] DefaultScan8X8 = new short[]
+        { // Default scan order for an 8x8 block: 64 position indices
             0,  8,  1,  16, 9,  2,  17, 24, 10, 3,  18, 25, 32, 11, 4,  26,
             33, 19, 40, 12, 34, 27, 5,  41, 20, 48, 13, 35, 42, 28, 21, 6,
             49, 56, 36, 43, 29, 7,  14, 50, 57, 44, 22, 37, 15, 51, 58, 30,
             45, 23, 52, 59, 38, 31, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63,
         };
 
-        private static readonly short[] _colScan8X8 = {
+        private static readonly short[] ColScan8X8 = new short[]
+        { // Column scan order for an 8x8 block: 64 position indices
             0,  8,  16, 1,  24, 9,  32, 17, 2,  40, 25, 10, 33, 18, 48, 3,
             26, 41, 11, 56, 19, 34, 4,  49, 27, 42, 12, 35, 20, 57, 50, 28,
             5,  43, 13, 36, 58, 51, 21, 44, 6,  29, 59, 37, 14, 52, 22, 7,
             45, 60, 30, 15, 38, 53, 23, 46, 31, 61, 39, 54, 47, 62, 55, 63,
         };
 
-        private static readonly short[] _rowScan8X8 = {
+        private static readonly short[] RowScan8X8 = new short[]
+        { // Row scan order for an 8x8 block: 64 position indices
             0,  1,  2,  8,  9,  3,  16, 10, 4,  17, 11, 24, 5,  18, 25, 12,
             19, 26, 32, 6,  13, 20, 33, 27, 7,  34, 40, 21, 28, 41, 14, 35,
             48, 42, 29, 36, 49, 22, 43, 15, 56, 37, 50, 44, 30, 57, 23, 51,
             58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63,
         };
 
-        private static readonly short[] _defaultScan16X16 = {
+        private static readonly short[] DefaultScan16X16 = new short[]
+        {
             0,   16,  1,   32,  17,  2,   48,  33,  18,  3,   64,  34,  49,  19,  65,
             80,  50,  4,   35,  66,  20,  81,  96,  51,  5,   36,  82,  97,  67,  112,
             21,  52,  98,  37,  83,  113, 6,   68,  128, 53,  22,  99,  114, 84,  7,
@@ -331,7 +349,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             255,
         };
 
-        private static readonly short[] _colScan16X16 = {
+        private static readonly short[] ColScan16X16 = new short[]
+        {
             0,   16,  32,  48,  1,   64,  17,  80,  33,  96,  49,  2,   65,  112, 18,
             81,  34,  128, 50,  97,  3,   66,  144, 19,  113, 35,  82,  160, 98,  51,
             129, 4,   67,  176, 20,  114, 145, 83,  36,  99,  130, 52,  192, 5,   161,
@@ -352,7 +371,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             255,
         };
 
-        private static readonly short[] _rowScan16X16 = {
+        private static readonly short[] RowScan16X16 = new short[]
+        {
             0,   1,   2,   16,  3,   17,  4,   18,  32,  5,   33,  19,  6,   34,  48,
             20,  49,  7,   35,  21,  50,  64,  8,   36,  65,  22,  51,  37,  80,  9,
             66,  52,  23,  38,  81,  67,  10,  53,  24,  82,  68,  96,  39,  11,  54,
@@ -373,7 +393,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             255,
         };
 
-        private static readonly short[] _defaultScan32X32 = {
+        private static readonly short[] DefaultScan32X32 = new short[]
+        {
             0,    32,   1,    64,  33,   2,    96,   65,   34,   128,  3,    97,   66,
             160,  129,  35,   98,  4,    67,   130,  161,  192,  36,   99,   224,  5,
             162,  193,  68,   131, 37,   100,  225,  194,  256,  163,  69,   132,  6,
@@ -457,22 +478,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
         // Neighborhood 2-tuples for various scans and blocksizes,
         // in {top, left} order for each position in corresponding scan order.
-        private static readonly short[] _defaultScan4X4Neighbors = {
+        private static readonly short[] DefaultScan4X4Neighbors = new short[]
+        {
             0, 0, 0, 0, 0,  0, 1, 4, 4, 4,  1,  1, 8,  8,  5,  8, 2,
             2, 2, 5, 9, 12, 6, 9, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
         };
 
-        private static readonly short[] _colScan4X4Neighbors = {
+        private static readonly short[] ColScan4X4Neighbors = new short[]
+        {
             0, 0, 0, 0, 4, 4, 0, 0, 8, 8,  1,  1, 5, 5,  1,  1, 9,
             9, 2, 2, 6, 6, 2, 2, 3, 3, 10, 10, 7, 7, 11, 11, 0, 0,
         };
 
-        private static readonly short[] _rowScan4X4Neighbors = {
+        private static readonly short[] RowScan4X4Neighbors = new short[]
+        {
             0, 0, 0, 0, 0, 0, 1, 1,  4,  4,  2,  2,  5,  5,  4,  4, 8,
             8, 6, 6, 8, 8, 9, 9, 12, 12, 10, 10, 13, 13, 14, 14, 0, 0,
         };
 
-        private static readonly short[] _colScan8X8Neighbors = {
+        private static readonly short[] ColScan8X8Neighbors = new short[]
+        {
             0,  0,  0,  0,  8,  8,  0,  0,  16, 16, 1,  1,  24, 24, 9,  9,  1,  1,  32,
             32, 17, 17, 2,  2,  25, 25, 10, 10, 40, 40, 2,  2,  18, 18, 33, 33, 3,  3,
             48, 48, 11, 11, 26, 26, 3,  3,  41, 41, 19, 19, 34, 34, 4,  4,  27, 27, 12,
@@ -482,7 +507,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             53, 53, 31, 31, 46, 46, 39, 39, 54, 54, 47, 47, 55, 55, 0,  0,
         };
 
-        private static readonly short[] _rowScan8X8Neighbors = {
+        private static readonly short[] RowScan8X8Neighbors = new short[]
+        {
             0,  0,  0,  0,  1,  1,  0,  0,  8,  8,  2,  2,  8,  8,  9,  9,  3,  3,  16,
             16, 10, 10, 16, 16, 4,  4,  17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24,
             5,  5,  12, 12, 19, 19, 32, 32, 26, 26, 6,  6,  33, 33, 32, 32, 20, 20, 27,
@@ -492,7 +518,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             38, 38, 60, 60, 46, 46, 53, 53, 54, 54, 61, 61, 62, 62, 0,  0,
         };
 
-        private static readonly short[] _defaultScan8X8Neighbors = {
+        private static readonly short[] DefaultScan8X8Neighbors = new short[]
+        {
             0,  0,  0,  0,  0,  0,  8,  8,  1,  8,  1,  1,  9,  16, 16, 16, 2,  9,  2,
             2,  10, 17, 17, 24, 24, 24, 3,  10, 3,  3,  18, 25, 25, 32, 11, 18, 32, 32,
             4,  11, 26, 33, 19, 26, 4,  4,  33, 40, 12, 19, 40, 40, 5,  12, 27, 34, 34,
@@ -502,7 +529,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0,  0,
         };
 
-        private static readonly short[] _colScan16X16Neighbors = {
+        private static readonly short[] ColScan16X16Neighbors = new short[]
+        {
             0,   0,   0,   0,   16,  16,  32,  32,  0,   0,   48,  48,  1,   1,   64,
             64,  17,  17,  80,  80,  33,  33,  1,   1,   49,  49,  96,  96,  2,   2,
             65,  65,  18,  18,  112, 112, 34,  34,  81,  81,  2,   2,   50,  50,  128,
@@ -540,7 +568,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             239, 239, 0,   0,
         };
 
-        private static readonly short[] _rowScan16X16Neighbors = {
+        private static readonly short[] RowScan16X16Neighbors = new short[]
+        {
             0,   0,   0,   0,   1,   1,   0,   0,   2,   2,   16,  16,  3,   3,   17,
             17,  16,  16,  4,   4,   32,  32,  18,  18,  5,   5,   33,  33,  32,  32,
             19,  19,  48,  48,  6,   6,   34,  34,  20,  20,  49,  49,  48,  48,  7,
@@ -578,7 +607,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             254, 254, 0,   0,
         };
 
-        private static readonly short[] _defaultScan16X16Neighbors = {
+        private static readonly short[] DefaultScan16X16Neighbors = new short[]
+        {
             0,   0,   0,   0,   0,   0,   16,  16,  1,   16,  1,   1,   32,  32,  17,
             32,  2,   17,  2,   2,   48,  48,  18,  33,  33,  48,  3,   18,  49,  64,
             64,  64,  34,  49,  3,   3,   19,  34,  50,  65,  4,   19,  65,  80,  80,
@@ -616,7 +646,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             239, 254, 0,   0,
         };
 
-        private static readonly short[] _defaultScan32X32Neighbors = {
+        private static readonly short[] DefaultScan32X32Neighbors = new short[]
+        {
             0,   0,    0,   0,    0,   0,    32,  32,   1,   32,  1,   1,    64,  64,
             33,  64,   2,   33,   96,  96,   2,   2,    65,  96,  34,  65,   128, 128,
             97,  128,  3,   34,   66,  97,   3,   3,    35,  66,  98,  129,  129, 160,
@@ -766,40 +797,47 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             959, 990,  991, 1022, 0,   0,
         };
 
-        private static readonly short[] _vp9DefaultIscan4X4 = {
+        private static readonly short[] DefaultIscan4X4 = new short[]
+        {
             0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
         };
 
-        private static readonly short[] _vp9ColIscan4X4 = {
+        private static readonly short[] ColIscan4X4 = new short[]
+        {
             0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
         };
 
-        private static readonly short[] _vp9RowIscan4X4 = {
+        private static readonly short[] RowIscan4X4 = new short[]
+        {
             0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
         };
 
-        private static readonly short[] _vp9ColIscan8X8 = {
+        private static readonly short[] ColIscan8X8 = new short[]
+        {
             0,  3,  8,  15, 22, 32, 40, 47, 1,  5,  11, 18, 26, 34, 44, 51,
             2,  7,  13, 20, 28, 38, 46, 54, 4,  10, 16, 24, 31, 41, 50, 56,
             6,  12, 21, 27, 35, 43, 52, 58, 9,  17, 25, 33, 39, 48, 55, 60,
             14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
         };
 
-        private static readonly short[] _vp9RowIscan8X8 = {
+        private static readonly short[] RowIscan8X8 = new short[]
+        {
             0,  1,  2,  5,  8,  12, 19, 24, 3,  4,  7,  10, 15, 20, 30, 39,
             6,  9,  13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
             18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
             32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
         };
 
-        private static readonly short[] _vp9DefaultIscan8X8 = {
+        private static readonly short[] DefaultIscan8X8 = new short[]
+        {
             0,  2,  5,  9,  14, 22, 31, 37, 1,  4,  8,  13, 19, 26, 38, 44,
             3,  6,  10, 17, 24, 30, 42, 49, 7,  11, 15, 21, 29, 36, 47, 53,
             12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
             25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
         };
 
-        private static readonly short[] _vp9ColIscan16X16 = {
+        private static readonly short[] ColIscan16X16 = new short[]
+        {
             0,  4,  11,  20,  31,  43,  59,  75,  85,  109, 130, 150, 165, 181, 195, 198,
             1,  6,  14,  23,  34,  47,  64,  81,  95,  114, 135, 153, 171, 188, 201, 212,
             2,  8,  16,  25,  38,  52,  67,  83,  101, 116, 136, 157, 172, 190, 205, 216,
@@ -818,7 +856,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
         };
 
-        private static readonly short[] _vp9RowIscan16X16 = {
+        private static readonly short[] RowIscan16X16 = new short[]
+        {
             0,   1,   2,   4,   6,   9,   12,  17,  22,  29,  36,  43,  54,  64,  76,
             86,  3,   5,   7,   11,  15,  19,  25,  32,  38,  48,  59,  68,  84,  99,
             115, 130, 8,   10,  13,  18,  23,  27,  33,  42,  51,  60,  72,  88,  103,
@@ -839,7 +878,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             255,
         };
 
-        private static readonly short[] _vp9DefaultIscan16X16 = {
+        private static readonly short[] DefaultIscan16X16 = new short[]
+        {
             0,   2,   5,   9,   17,  24,  36,  44,  55,  72,  88,  104, 128, 143, 166,
             179, 1,   4,   8,   13,  20,  30,  40,  54,  66,  79,  96,  113, 141, 154,
             178, 196, 3,   7,   11,  18,  25,  33,  46,  57,  71,  86,  101, 119, 148,
@@ -860,7 +900,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             255,
         };
 
-        private static readonly short[] _vp9DefaultIscan32X32 = {
+        private static readonly short[] DefaultIscan32X32 = new short[]
+        {
             0,    2,    5,    10,   17,   25,   38,   47,   62,   83,   101,  121,  145,
             170,  193,  204,  210,  219,  229,  233,  245,  257,  275,  299,  342,  356,
             377,  405,  455,  471,  495,  527,  1,    4,    8,    15,   22,   30,   45,
@@ -956,90 +997,95 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             }
         }
 
-        public static readonly ScanOrder[] Vp9DefaultScanOrders = {
-            new(_defaultScan4X4, _vp9DefaultIscan4X4, _defaultScan4X4Neighbors),
-            new(_defaultScan8X8, _vp9DefaultIscan8X8, _defaultScan8X8Neighbors),
-            new(_defaultScan16X16, _vp9DefaultIscan16X16, _defaultScan16X16Neighbors),
-            new(_defaultScan32X32, _vp9DefaultIscan32X32, _defaultScan32X32Neighbors),
+        public static readonly ScanOrder[] DefaultScanOrders = new ScanOrder[]
+        {
+            new ScanOrder(DefaultScan4X4, DefaultIscan4X4, DefaultScan4X4Neighbors),
+            new ScanOrder(DefaultScan8X8, DefaultIscan8X8, DefaultScan8X8Neighbors),
+            new ScanOrder(DefaultScan16X16, DefaultIscan16X16, DefaultScan16X16Neighbors),
+            new ScanOrder(DefaultScan32X32, DefaultIscan32X32, DefaultScan32X32Neighbors)
         };
 
-        public static readonly ScanOrder[][] Vp9ScanOrders = {
+        public static readonly ScanOrder[][] ScanOrders = new ScanOrder[][]
+        {
             new ScanOrder[]
             { // TX_4X4
-                new(_defaultScan4X4, _vp9DefaultIscan4X4, _defaultScan4X4Neighbors),
-                new(_rowScan4X4, _vp9RowIscan4X4, _rowScan4X4Neighbors),
-                new(_colScan4X4, _vp9ColIscan4X4, _colScan4X4Neighbors),
-                new(_defaultScan4X4, _vp9DefaultIscan4X4, _defaultScan4X4Neighbors),
+                new ScanOrder(DefaultScan4X4, DefaultIscan4X4, DefaultScan4X4Neighbors),
+                new ScanOrder(RowScan4X4, RowIscan4X4, RowScan4X4Neighbors),
+                new ScanOrder(ColScan4X4, ColIscan4X4, ColScan4X4Neighbors),
+                new ScanOrder(DefaultScan4X4, DefaultIscan4X4, DefaultScan4X4Neighbors)
             },
             new ScanOrder[]
             { // TX_8X8
-                new(_defaultScan8X8, _vp9DefaultIscan8X8, _defaultScan8X8Neighbors),
-                new(_rowScan8X8, _vp9RowIscan8X8, _rowScan8X8Neighbors),
-                new(_colScan8X8, _vp9ColIscan8X8, _colScan8X8Neighbors),
-                new(_defaultScan8X8, _vp9DefaultIscan8X8, _defaultScan8X8Neighbors),
+                new ScanOrder(DefaultScan8X8, DefaultIscan8X8, DefaultScan8X8Neighbors),
+                new ScanOrder(RowScan8X8, RowIscan8X8, RowScan8X8Neighbors),
+                new ScanOrder(ColScan8X8, ColIscan8X8, ColScan8X8Neighbors),
+                new ScanOrder(DefaultScan8X8, DefaultIscan8X8, DefaultScan8X8Neighbors)
             },
             new ScanOrder[]
             { // TX_16X16
-                new(_defaultScan16X16, _vp9DefaultIscan16X16, _defaultScan16X16Neighbors),
-                new(_rowScan16X16, _vp9RowIscan16X16, _rowScan16X16Neighbors),
-                new(_colScan16X16, _vp9ColIscan16X16, _colScan16X16Neighbors),
-                new(_defaultScan16X16, _vp9DefaultIscan16X16, _defaultScan16X16Neighbors),
+                new ScanOrder(DefaultScan16X16, DefaultIscan16X16, DefaultScan16X16Neighbors),
+                new ScanOrder(RowScan16X16, RowIscan16X16, RowScan16X16Neighbors),
+                new ScanOrder(ColScan16X16, ColIscan16X16, ColScan16X16Neighbors),
+                new ScanOrder(DefaultScan16X16, DefaultIscan16X16, DefaultScan16X16Neighbors)
             },
             new ScanOrder[]
             { // TX_32X32
-                new(_defaultScan32X32, _vp9DefaultIscan32X32, _defaultScan32X32Neighbors),
-                new(_defaultScan32X32, _vp9DefaultIscan32X32, _defaultScan32X32Neighbors),
-                new(_defaultScan32X32, _vp9DefaultIscan32X32, _defaultScan32X32Neighbors),
-                new(_defaultScan32X32, _vp9DefaultIscan32X32, _defaultScan32X32Neighbors),
-            },
+                new ScanOrder(DefaultScan32X32, DefaultIscan32X32, DefaultScan32X32Neighbors),
+                new ScanOrder(DefaultScan32X32, DefaultIscan32X32, DefaultScan32X32Neighbors),
+                new ScanOrder(DefaultScan32X32, DefaultIscan32X32, DefaultScan32X32Neighbors),
+                new ScanOrder(DefaultScan32X32, DefaultIscan32X32, DefaultScan32X32Neighbors)
+            }
         };
 
         // Entropy MV
 
-        public static readonly sbyte[] Vp9MvJointTree = {
-            -(sbyte)MvJointType.MvJointZero, 2, -(sbyte)MvJointType.MvJointHnzvz, 4, -(sbyte)MvJointType.MvJointHzvnz, -(sbyte)MvJointType.MvJointHnzvnz,
+        public static readonly sbyte[] MvJointTree = new sbyte[]
+        {
+            -(sbyte)MvJointType.Zero, 2, -(sbyte)MvJointType.Hnzvz, 4, -(sbyte)MvJointType.Hzvnz, -(sbyte)MvJointType.Hnzvnz
         };
 
-        public static readonly sbyte[] Vp9MvClassTree = {
-            -(sbyte)MvClassType.MvClass0,
+        public static readonly sbyte[] MvClassTree = new sbyte[]
+        {
+            -(sbyte)MvClassType.Class0,
             2,
-            -(sbyte)MvClassType.MvClass1,
+            -(sbyte)MvClassType.Class1,
             4,
             6,
             8,
-            -(sbyte)MvClassType.MvClass2,
-            -(sbyte)MvClassType.MvClass3,
+            -(sbyte)MvClassType.Class2,
+            -(sbyte)MvClassType.Class3,
             10,
             12,
-            -(sbyte)MvClassType.MvClass4,
-            -(sbyte)MvClassType.MvClass5,
-            -(sbyte)MvClassType.MvClass6,
+            -(sbyte)MvClassType.Class4,
+            -(sbyte)MvClassType.Class5,
+            -(sbyte)MvClassType.Class6,
             14,
             16,
             18,
-            -(sbyte)MvClassType.MvClass7,
-            -(sbyte)MvClassType.MvClass8,
-            -(sbyte)MvClassType.MvClass9,
-            -(sbyte)MvClassType.MvClass10,
+            -(sbyte)MvClassType.Class7,
+            -(sbyte)MvClassType.Class8,
+            -(sbyte)MvClassType.Class9,
+            -(sbyte)MvClassType.Class10
         };
 
-        public static ReadOnlySpan<sbyte> Vp9MvFPTree => new sbyte[] { -0, 2, -1, 4, -2, -3 };
+        public static ReadOnlySpan<sbyte> MvFPTree => new sbyte[] { -0, 2, -1, 4, -2, -3 };
 
         // Entropy
 
-        public static ReadOnlySpan<byte> Vp9Cat1Prob => new byte[] { 159 };
-        public static ReadOnlySpan<byte> Vp9Cat2Prob => new byte[] { 165, 145 };
-        public static ReadOnlySpan<byte> Vp9Cat3Prob => new byte[] { 173, 148, 140 };
-        public static ReadOnlySpan<byte> Vp9Cat4Prob => new byte[] { 176, 155, 140, 135 };
-        public static ReadOnlySpan<byte> Vp9Cat5Prob => new byte[] { 180, 157, 141, 134, 130 };
-        public static ReadOnlySpan<byte> Vp9Cat6Prob => new byte[] { 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 };
+        public static ReadOnlySpan<byte> Cat1Prob => new byte[] { 159 };
+        public static ReadOnlySpan<byte> Cat2Prob => new byte[] { 165, 145 };
+        public static ReadOnlySpan<byte> Cat3Prob => new byte[] { 173, 148, 140 };
+        public static ReadOnlySpan<byte> Cat4Prob => new byte[] { 176, 155, 140, 135 };
+        public static ReadOnlySpan<byte> Cat5Prob => new byte[] { 180, 157, 141, 134, 130 };
+        public static ReadOnlySpan<byte> Cat6Prob => new byte[] { 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 };
 
-        public static ReadOnlySpan<byte> Vp9Cat6ProbHigh12 => new byte[]
+        public static ReadOnlySpan<byte> Cat6ProbHigh12 => new byte[]
         {
-            255, 255, 255, 255, 254, 254, 54, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129,
+            255, 255, 255, 255, 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 // Four leading 255 entries, then the same 14 values as Cat6Prob.
         };
 
-        private static readonly byte[] _vp9CoefbandTrans8X8Plus = {
+        private static readonly byte[] CoefbandTrans8X8Plus = new byte[]
+        {
             0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
             // Beyond MAXBAND_INDEX+1 all values are filled as 5
             5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
@@ -1083,17 +1129,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         };
 
-        private static ReadOnlySpan<byte> Vp9CoefbandTrans4X4 => new byte[]
+        private static ReadOnlySpan<byte> CoefbandTrans4X4 => new byte[]
         {
             0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
         };
 
         public static ReadOnlySpan<byte> GetBandTranslate(TxSize txSize)
         {
-            return txSize == TxSize.Tx4x4 ? Vp9CoefbandTrans4X4 : _vp9CoefbandTrans8X8Plus;
+            return txSize == TxSize.Tx4x4 ? CoefbandTrans4X4 : CoefbandTrans8X8Plus;
         }
 
-        public static readonly byte[][] Vp9Pareto8Full = {
+        public static readonly byte[][] Pareto8Full = new byte[][]
+        {
             new byte[] { 3, 86, 128, 6, 86, 23, 88, 29 },
             new byte[] { 6, 86, 128, 11, 87, 42, 91, 52 },
             new byte[] { 9, 86, 129, 17, 88, 61, 94, 76 },
@@ -1351,36 +1398,41 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             new byte[] { 255, 246, 247, 255, 239, 255, 253, 255 },
         };
 
-        // Array indices are identical to previously-existing INTRAMODECONTEXTNODES.
-        public static readonly sbyte[] Vp9IntraModeTree = {
-            -(sbyte)PredictionMode.DcPred,   2,                                 // 0 = DC_NODE
-            -(sbyte)PredictionMode.TmPred,   4,                                 // 1 = TM_NODE
-            -(sbyte)PredictionMode.VPred,    6,                                 // 2 = V_NODE
-            8,                               12,                                // 3 = COM_NODE
-            -(sbyte)PredictionMode.HPred,    10,                                // 4 = H_NODE
-            -(sbyte)PredictionMode.D135Pred, -(sbyte)PredictionMode.D117Pred,   // 5 = D135_NODE
-            -(sbyte)PredictionMode.D45Pred,  14,                                // 6 = D45_NODE
-            -(sbyte)PredictionMode.D63Pred,  16,                                // 7 = D63_NODE
-            -(sbyte)PredictionMode.D153Pred, -(sbyte)PredictionMode.D207Pred,   // 8 = D153_NODE
+        /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
+        public static readonly sbyte[] IntraModeTree = new sbyte[]
+        {
+            -(sbyte)PredictionMode.DcPred,   2,                                 /* 0 = DC_NODE */
+            -(sbyte)PredictionMode.TmPred,   4,                                 /* 1 = TM_NODE */
+            -(sbyte)PredictionMode.VPred,    6,                                 /* 2 = V_NODE */
+            8,                                 12,                                /* 3 = COM_NODE */
+            -(sbyte)PredictionMode.HPred,    10,                                /* 4 = H_NODE */
+            -(sbyte)PredictionMode.D135Pred, -(sbyte)PredictionMode.D117Pred, /* 5 = D135_NODE */
+            -(sbyte)PredictionMode.D45Pred,  14,                                /* 6 = D45_NODE */
+            -(sbyte)PredictionMode.D63Pred,  16,                                /* 7 = D63_NODE */
+            -(sbyte)PredictionMode.D153Pred, -(sbyte)PredictionMode.D207Pred  /* 8 = D153_NODE */
         };
 
-        public static readonly sbyte[] Vp9InterModeTree = {
+        public static readonly sbyte[] InterModeTree = new sbyte[]
+        {
             -((sbyte)PredictionMode.ZeroMv - (sbyte)PredictionMode. NearestMv), 2,
             -((sbyte)PredictionMode.NearestMv - (sbyte)PredictionMode.NearestMv), 4,
             -((sbyte)PredictionMode.NearMv - (sbyte)PredictionMode.NearestMv),
-            -((sbyte)PredictionMode.NewMv - (sbyte)PredictionMode.NearestMv),
+            -((sbyte)PredictionMode.NewMv - (sbyte)PredictionMode.NearestMv)
         };
 
-        public static readonly sbyte[] Vp9PartitionTree = {
-            -(sbyte)PartitionType.PartitionNone, 2, -(sbyte)PartitionType.PartitionHorz, 4, -(sbyte)PartitionType.PartitionVert, -(sbyte)PartitionType.PartitionSplit,
+        public static readonly sbyte[] PartitionTree = new sbyte[]
+        {
+            -(sbyte)PartitionType.PartitionNone, 2, -(sbyte)PartitionType.PartitionHorz, 4, -(sbyte)PartitionType.PartitionVert, -(sbyte)PartitionType.PartitionSplit
         };
 
-        public static readonly sbyte[] Vp9SwitchableInterpTree = {
-            -Constants.EightTap, 2, -Constants.EightTapSmooth, -Constants.EightTapSharp,
+        public static readonly sbyte[] SwitchableInterpTree = new sbyte[]
+        {
+            -Constants.EightTap, 2, -Constants.EightTapSmooth, -Constants.EightTapSharp
         };
 
-        public static readonly sbyte[] Vp9SegmentTree = {
-            2, 4, 6, 8, 10, 12, 0, -1, -2, -3, -4, -5, -6, -7,
+        public static readonly sbyte[] SegmentTree = new sbyte[]
+        {
+            2, 4, 6, 8, 10, 12, 0, -1, -2, -3, -4, -5, -6, -7
         };
 
         // MV Ref
@@ -1390,192 +1442,169 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         // adding 9 for each intra block, 3 for each zero mv and 1 for each new
         // motion vector. This single number is then converted into a context
         // with a single lookup ( CounterToContext ).
-        public static readonly int[] Mode2Counter = {
-            9, // DC_PRED
-            9, // V_PRED
-            9, // H_PRED
-            9, // D45_PRED
-            9, // D135_PRED
-            9, // D117_PRED
-            9, // D153_PRED
-            9, // D207_PRED
-            9, // D63_PRED
-            9, // TM_PRED
-            0, // NEARESTMV
-            0, // NEARMV
-            3, // ZEROMV
-            1, // NEWMV
+        public static readonly int[] Mode2Counter = new int[]
+        {
+            9,  // DC_PRED
+            9,  // V_PRED
+            9,  // H_PRED
+            9,  // D45_PRED
+            9,  // D135_PRED
+            9,  // D117_PRED
+            9,  // D153_PRED
+            9,  // D207_PRED
+            9,  // D63_PRED
+            9,  // TM_PRED
+            0,  // NEARESTMV
+            0,  // NEARMV
+            3,  // ZEROMV
+            1,  // NEWMV
         };
 
         // There are 3^3 different combinations of 3 counts that can be either 0,1 or
         // 2. However the actual count can never be greater than 2 so the highest
         // counter we need is 18. 9 is an invalid counter that's never used.
-        public static readonly MotionVectorContext[] CounterToContext = {
-            MotionVectorContext.BothPredicted, // 0
-            MotionVectorContext.NewPlusNonIntra, // 1
-            MotionVectorContext.BothNew, // 2
+        public static readonly MotionVectorContext[] CounterToContext = new MotionVectorContext[]
+        {
+            MotionVectorContext.BothPredicted,     // 0
+            MotionVectorContext.NewPlusNonIntra,   // 1
+            MotionVectorContext.BothNew,           // 2
             MotionVectorContext.ZeroPlusPredicted, // 3
-            MotionVectorContext.NewPlusNonIntra, // 4
-            MotionVectorContext.InvalidCase, // 5
-            MotionVectorContext.BothZero, // 6
-            MotionVectorContext.InvalidCase, // 7
-            MotionVectorContext.InvalidCase, // 8
+            MotionVectorContext.NewPlusNonIntra,   // 4
+            MotionVectorContext.InvalidCase,       // 5
+            MotionVectorContext.BothZero,          // 6
+            MotionVectorContext.InvalidCase,       // 7
+            MotionVectorContext.InvalidCase,       // 8
             MotionVectorContext.IntraPlusNonIntra, // 9
             MotionVectorContext.IntraPlusNonIntra, // 10
-            MotionVectorContext.InvalidCase, // 11
+            MotionVectorContext.InvalidCase,       // 11
             MotionVectorContext.IntraPlusNonIntra, // 12
-            MotionVectorContext.InvalidCase, // 13
-            MotionVectorContext.InvalidCase, // 14
-            MotionVectorContext.InvalidCase, // 15
-            MotionVectorContext.InvalidCase, // 16
-            MotionVectorContext.InvalidCase, // 17
-            MotionVectorContext.BothIntra, // 18
+            MotionVectorContext.InvalidCase,       // 13
+            MotionVectorContext.InvalidCase,       // 14
+            MotionVectorContext.InvalidCase,       // 15
+            MotionVectorContext.InvalidCase,       // 16
+            MotionVectorContext.InvalidCase,       // 17
+            MotionVectorContext.BothIntra          // 18
         };
 
-        public static readonly Position[][] MvRefBlocks = {
+        public static readonly Position[][] MvRefBlocks = new Position[][]
+        {
             // 4X4
-            new Position[] {
-                new(-1, 0),
-                new(0, -1),
-                new(-1, -1),
-                new(-2, 0),
-                new(0, -2),
-                new(-2, -1),
-                new(-1, -2),
-                new(-2, -2),
-            },
-            // 4X8
-            new Position[] {
-                new(-1, 0),
-                new(0, -1),
-                new(-1, -1),
-                new(-2, 0),
-                new(0, -2),
-                new(-2, -1),
-                new(-1, -2),
-                new(-2, -2),
-            },
-            // 8X4
-            new Position[] {
-                new(-1, 0),
-                new(0, -1),
-                new(-1, -1),
-                new(-2, 0),
-                new(0, -2),
-                new(-2, -1),
-                new(-1, -2),
-                new(-2, -2),
-            },
+            new Position[] { new Position( -1, 0 ),
+            new Position( 0, -1 ),
+            new Position( -1, -1 ),
+            new Position( -2, 0 ),
+            new Position( 0, -2 ),
+            new Position( -2, -1 ),
+            new Position( -1, -2 ),
+            new Position( -2, -2 ) },
+            // 4x8
+            new Position[] { new Position( -1, 0 ),
+            new Position( 0, -1 ),
+            new Position( -1, -1 ),
+            new Position( -2, 0 ),
+            new Position( 0, -2 ),
+            new Position( -2, -1 ),
+            new Position( -1, -2 ),
+            new Position( -2, -2 ) },
+            // 8x4
+            new Position[] { new Position( -1, 0 ),
+            new Position( 0, -1 ),
+            new Position( -1, -1 ),
+            new Position( -2, 0 ),
+            new Position( 0, -2 ),
+            new Position( -2, -1 ),
+            new Position( -1, -2 ),
+            new Position( -2, -2 ) },
             // 8X8
-            new Position[] {
-                new(-1, 0),
-                new(0, -1),
-                new(-1, -1),
-                new(-2, 0),
-                new(0, -2),
-                new(-2, -1),
-                new(-1, -2),
-                new(-2, -2),
-            },
-            // 8X16
-            new Position[] {
-                new(0, -1),
-                new(-1, 0),
-                new(1, -1),
-                new(-1, -1),
-                new(0, -2),
-                new(-2, 0),
-                new(-2, -1),
-                new(-1, -2),
-            },
-            // 16X8
-            new Position[] {
-                new(-1, 0),
-                new(0, -1),
-                new(-1, 1),
-                new(-1, -1),
-                new(-2, 0),
-                new(0, -2),
-                new(-1, -2),
-                new(-2, -1),
-            },
+            new Position[] { new Position( -1, 0 ),
+            new Position( 0, -1 ),
+            new Position( -1, -1 ),
+            new Position( -2, 0 ),
+            new Position( 0, -2 ),
+            new Position( -2, -1 ),
+            new Position( -1, -2 ),
+            new Position( -2, -2 ) },
+            // 8x16
+            new Position[] { new Position( 0, -1 ),
+            new Position( -1, 0 ),
+            new Position( 1, -1 ),
+            new Position( -1, -1 ),
+            new Position( 0, -2 ),
+            new Position( -2, 0 ),
+            new Position( -2, -1 ),
+            new Position( -1, -2 ) },
+            // 16x8
+            new Position[] { new Position( -1, 0 ),
+            new Position( 0, -1 ),
+            new Position( -1, 1 ),
+            new Position( -1, -1 ),
+            new Position( -2, 0 ),
+            new Position( 0, -2 ),
+            new Position( -1, -2 ),
+            new Position( -2, -1 ) },
             // 16X16
-            new Position[] {
-                new(-1, 0),
-                new(0, -1),
-                new(-1, 1),
-                new(1, -1),
-                new(-1, -1),
-                new(-3, 0),
-                new(0, -3),
-                new(-3, -3),
-            },
-            // 16X32
-            new Position[] {
-                new(0, -1),
-                new(-1, 0),
-                new(2, -1),
-                new(-1, -1),
-                new(-1, 1),
-                new(0, -3),
-                new(-3, 0),
-                new(-3, -3),
-            },
-            // 32X16
-            new Position[] {
-                new(-1, 0),
-                new(0, -1),
-                new(-1, 2),
-                new(-1, -1),
-                new(1, -1),
-                new(-3, 0),
-                new(0, -3),
-                new(-3, -3),
-            },
+            new Position[] { new Position( -1, 0 ),
+            new Position( 0, -1 ),
+            new Position( -1, 1 ),
+            new Position( 1, -1 ),
+            new Position( -1, -1 ),
+            new Position( -3, 0 ),
+            new Position( 0, -3 ),
+            new Position( -3, -3 ) },
+            // 16x32
+            new Position[] { new Position( 0, -1 ),
+            new Position( -1, 0 ),
+            new Position( 2, -1 ),
+            new Position( -1, -1 ),
+            new Position( -1, 1 ),
+            new Position( 0, -3 ),
+            new Position( -3, 0 ),
+            new Position( -3, -3 ) },
+            // 32x16
+            new Position[] { new Position( -1, 0 ),
+            new Position( 0, -1 ),
+            new Position( -1, 2 ),
+            new Position( -1, -1 ),
+            new Position( 1, -1 ),
+            new Position( -3, 0 ),
+            new Position( 0, -3 ),
+            new Position( -3, -3 ) },
             // 32X32
-            new Position[] {
-                new(-1, 1),
-                new(1, -1),
-                new(-1, 2),
-                new(2, -1),
-                new(-1, -1),
-                new(-3, 0),
-                new(0, -3),
-                new(-3, -3),
-            },
-            // 32X64
-            new Position[] {
-                new(0, -1),
-                new(-1, 0),
-                new(4, -1),
-                new(-1, 2),
-                new(-1, -1),
-                new(0, -3),
-                new(-3, 0),
-                new(2, -1),
-            },
-            // 64X32
-            new Position[] {
-                new(-1, 0),
-                new(0, -1),
-                new(-1, 4),
-                new(2, -1),
-                new(-1, -1),
-                new(-3, 0),
-                new(0, -3),
-                new(-1, 2),
-            },
-            // 64X64
-            new Position[] {
-                new(-1, 3),
-                new(3, -1),
-                new(-1, 4),
-                new(4, -1),
-                new(-1, -1),
-                new(-1, 0),
-                new(0, -1),
-                new(-1, 6),
-            },
+            new Position[] { new Position( -1, 1 ),
+            new Position( 1, -1 ),
+            new Position( -1, 2 ),
+            new Position( 2, -1 ),
+            new Position( -1, -1 ),
+            new Position( -3, 0 ),
+            new Position( 0, -3 ),
+            new Position( -3, -3 ) },
+            // 32x64
+            new Position[] { new Position( 0, -1 ),
+            new Position( -1, 0 ),
+            new Position( 4, -1 ),
+            new Position( -1, 2 ),
+            new Position( -1, -1 ),
+            new Position( 0, -3 ),
+            new Position( -3, 0 ),
+            new Position( 2, -1 ) },
+            // 64x32
+            new Position[] { new Position( -1, 0 ),
+            new Position( 0, -1 ),
+            new Position( -1, 4 ),
+            new Position( 2, -1 ),
+            new Position( -1, -1 ),
+            new Position( -3, 0 ),
+            new Position( 0, -3 ),
+            new Position( -1, 2 ) },
+            // 64x64
+            new Position[] { new Position( -1, 3 ),
+            new Position( 3, -1 ),
+            new Position( -1, 4 ),
+            new Position( 4, -1 ),
+            new Position( -1, -1 ),
+            new Position( -1, 0 ),
+            new Position( 0, -1 ),
+            new Position( -1, 6 ) }
         };
     }
 }

+ 92 - 48
src/Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using System.Diagnostics;
 
 namespace Ryujinx.Graphics.Nvdec.Vp9
@@ -13,7 +13,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // left of the entries corresponding to real macroblocks.
             // The prediction flags in these dummy entries are initialized to 0.
             if (!xd.AboveMi.IsNull && !xd.LeftMi.IsNull)
-            { // both edges available
+            {
+                // both edges available
                 if (!xd.AboveMi.Value.HasSecondRef() && !xd.LeftMi.Value.HasSecondRef())
                 {
                     // Neither edge uses comp pred (0/1)
@@ -23,12 +24,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 else if (!xd.AboveMi.Value.HasSecondRef())
                 {
                     // One of two edges uses comp pred (2/3)
-                    ctx = 2 + (xd.AboveMi.Value.RefFrame[0] == cm.CompFixedRef || !xd.AboveMi.Value.IsInterBlock() ? 1 : 0);
+                    ctx = 2 + (xd.AboveMi.Value.RefFrame[0] == cm.CompFixedRef || !xd.AboveMi.Value.IsInterBlock()
+                        ? 1
+                        : 0);
                 }
                 else if (!xd.LeftMi.Value.HasSecondRef())
                 {
                     // One of two edges uses comp pred (2/3)
-                    ctx = 2 + (xd.LeftMi.Value.RefFrame[0] == cm.CompFixedRef || !xd.LeftMi.Value.IsInterBlock() ? 1 : 0);
+                    ctx = 2 +
+                          (xd.LeftMi.Value.RefFrame[0] == cm.CompFixedRef || !xd.LeftMi.Value.IsInterBlock() ? 1 : 0);
                 }
                 else // Both edges use comp pred (4)
                 {
@@ -36,7 +40,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 }
             }
             else if (!xd.AboveMi.IsNull || !xd.LeftMi.IsNull)
-            { // One edge available
+            {
+                // One edge available
                 ref ModeInfo edgeMi = ref !xd.AboveMi.IsNull ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;
 
                 if (!edgeMi.HasSecondRef())
@@ -51,11 +56,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 }
             }
             else
-            { // No edges available (1)
+            {
+                // No edges available (1)
                 ctx = 1;
             }
-            Debug.Assert(ctx >= 0 && ctx < Constants.CompInterContexts);
 
+            Debug.Assert(ctx >= 0 && ctx < Constants.CompInterContexts);
             return ctx;
         }
 
@@ -71,29 +77,33 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int varRefIdx = fixRefIdx == 0 ? 1 : 0;
 
             if (!xd.AboveMi.IsNull && !xd.LeftMi.IsNull)
-            { // Both edges available
+            {
+                // Both edges available
                 bool aboveIntra = !xd.AboveMi.Value.IsInterBlock();
                 bool leftIntra = !xd.LeftMi.Value.IsInterBlock();
 
                 if (aboveIntra && leftIntra)
-                { // Intra/Intra (2)
+                {
+                    // Intra/Intra (2)
                     predContext = 2;
                 }
                 else if (aboveIntra || leftIntra)
-                { // Intra/Inter
+                {
+                    // Intra/Inter
                     ref ModeInfo edgeMi = ref aboveIntra ? ref xd.LeftMi.Value : ref xd.AboveMi.Value;
 
                     if (!edgeMi.HasSecondRef()) // single pred (1/3)
                     {
-                        predContext = 1 + 2 * (edgeMi.RefFrame[0] != cm.CompVarRef[1] ? 1 : 0);
+                        predContext = 1 + (2 * (edgeMi.RefFrame[0] != cm.CompVarRef[1] ? 1 : 0));
                     }
                     else // Comp pred (1/3)
                     {
-                        predContext = 1 + 2 * (edgeMi.RefFrame[varRefIdx] != cm.CompVarRef[1] ? 1 : 0);
+                        predContext = 1 + (2 * (edgeMi.RefFrame[varRefIdx] != cm.CompVarRef[1] ? 1 : 0));
                     }
                 }
                 else
-                { // Inter/Inter
+                {
+                    // Inter/Inter
                     bool lSg = !xd.LeftMi.Value.HasSecondRef();
                     bool aSg = !xd.AboveMi.Value.HasSecondRef();
                     sbyte vrfa = aSg ? xd.AboveMi.Value.RefFrame[0] : xd.AboveMi.Value.RefFrame[varRefIdx];
@@ -104,7 +114,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         predContext = 0;
                     }
                     else if (lSg && aSg)
-                    { // Single/Single
+                    {
+                        // Single/Single
                         if ((vrfa == cm.CompFixedRef && vrfl == cm.CompVarRef[0]) ||
                             (vrfl == cm.CompFixedRef && vrfa == cm.CompVarRef[0]))
                         {
@@ -120,7 +131,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         }
                     }
                     else if (lSg || aSg)
-                    { // Single/Comp
+                    {
+                        // Single/Comp
                         sbyte vrfc = lSg ? vrfa : vrfl;
                         sbyte rfs = aSg ? vrfa : vrfl;
                         if (vrfc == cm.CompVarRef[1] && rfs != cm.CompVarRef[1])
@@ -137,7 +149,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         }
                     }
                     else if (vrfa == vrfl)
-                    { // Comp/Comp
+                    {
+                        // Comp/Comp
                         predContext = 4;
                     }
                     else
@@ -147,7 +160,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 }
             }
             else if (!xd.AboveMi.IsNull || !xd.LeftMi.IsNull)
-            { // One edge available
+            {
+                // One edge available
                 ref ModeInfo edgeMi = ref !xd.AboveMi.IsNull ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;
 
                 if (!edgeMi.IsInterBlock())
@@ -167,11 +181,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 }
             }
             else
-            { // No edges available (2)
+            {
+                // No edges available (2)
                 predContext = 2;
             }
-            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
 
+            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
             return predContext;
         }
 
@@ -183,16 +198,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // left of the entries corresponding to real macroblocks.
             // The prediction flags in these dummy entries are initialized to 0.
             if (!xd.AboveMi.IsNull && !xd.LeftMi.IsNull)
-            { // Both edges available
+            {
+                // Both edges available
                 bool aboveIntra = !xd.AboveMi.Value.IsInterBlock();
                 bool leftIntra = !xd.LeftMi.Value.IsInterBlock();
 
                 if (aboveIntra && leftIntra)
-                { // Intra/Intra
+                {
+                    // Intra/Intra
                     predContext = 2;
                 }
                 else if (aboveIntra || leftIntra)
-                { // Intra/Inter or Inter/Intra
+                {
+                    // Intra/Inter or Inter/Intra
                     ref ModeInfo edgeMi = ref aboveIntra ? ref xd.LeftMi.Value : ref xd.AboveMi.Value;
                     if (!edgeMi.HasSecondRef())
                     {
@@ -201,11 +219,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     else
                     {
                         predContext = 1 + (edgeMi.RefFrame[0] == Constants.LastFrame ||
-                                           edgeMi.RefFrame[1] == Constants.LastFrame ? 1 : 0);
+                                           edgeMi.RefFrame[1] == Constants.LastFrame
+                            ? 1
+                            : 0);
                     }
                 }
                 else
-                { // Inter/Inter
+                {
+                    // Inter/Inter
                     bool aboveHasSecond = xd.AboveMi.Value.HasSecondRef();
                     bool leftHasSecond = xd.LeftMi.Value.HasSecondRef();
                     sbyte above0 = xd.AboveMi.Value.RefFrame[0];
@@ -216,7 +237,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     if (aboveHasSecond && leftHasSecond)
                     {
                         predContext = 1 + (above0 == Constants.LastFrame || above1 == Constants.LastFrame ||
-                                            left0 == Constants.LastFrame || left1 == Constants.LastFrame ? 1 : 0);
+                                           left0 == Constants.LastFrame || left1 == Constants.LastFrame
+                            ? 1
+                            : 0);
                     }
                     else if (aboveHasSecond || leftHasSecond)
                     {
@@ -230,24 +253,28 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         }
                         else
                         {
-                            predContext = (crf1 == Constants.LastFrame || crf2 == Constants.LastFrame ? 1 : 0);
+                            predContext = crf1 == Constants.LastFrame || crf2 == Constants.LastFrame ? 1 : 0;
                         }
                     }
                     else
                     {
-                        predContext = 2 * (above0 == Constants.LastFrame ? 1 : 0) + 2 * (left0 == Constants.LastFrame ? 1 : 0);
+                        predContext = (2 * (above0 == Constants.LastFrame ? 1 : 0)) +
+                                      (2 * (left0 == Constants.LastFrame ? 1 : 0));
                     }
                 }
             }
             else if (!xd.AboveMi.IsNull || !xd.LeftMi.IsNull)
-            { // One edge available
+            {
+                // One edge available
                 ref ModeInfo edgeMi = ref !xd.AboveMi.IsNull ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;
                 if (!edgeMi.IsInterBlock())
-                { // Intra
+                {
+                    // Intra
                     predContext = 2;
                 }
                 else
-                { // Inter
+                {
+                    // Inter
                     if (!edgeMi.HasSecondRef())
                     {
                         predContext = 4 * (edgeMi.RefFrame[0] == Constants.LastFrame ? 1 : 0);
@@ -255,16 +282,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     else
                     {
                         predContext = 1 + (edgeMi.RefFrame[0] == Constants.LastFrame ||
-                                           edgeMi.RefFrame[1] == Constants.LastFrame ? 1 : 0);
+                                           edgeMi.RefFrame[1] == Constants.LastFrame
+                            ? 1
+                            : 0);
                     }
                 }
             }
             else
-            { // No edges available
+            {
+                // No edges available
                 predContext = 2;
             }
-            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
 
+            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
             return predContext;
         }
 
@@ -277,16 +307,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // left of the entries corresponding to real macroblocks.
             // The prediction flags in these dummy entries are initialized to 0.
             if (!xd.AboveMi.IsNull && !xd.LeftMi.IsNull)
-            { // Both edges available
+            {
+                // Both edges available
                 bool aboveIntra = !xd.AboveMi.Value.IsInterBlock();
                 bool leftIntra = !xd.LeftMi.Value.IsInterBlock();
 
                 if (aboveIntra && leftIntra)
-                { // Intra/Intra
+                {
+                    // Intra/Intra
                     predContext = 2;
                 }
                 else if (aboveIntra || leftIntra)
-                { // Intra/Inter or Inter/Intra
+                {
+                    // Intra/Inter or Inter/Intra
                     ref ModeInfo edgeMi = ref aboveIntra ? ref xd.LeftMi.Value : ref xd.AboveMi.Value;
                     if (!edgeMi.HasSecondRef())
                     {
@@ -301,12 +334,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     }
                     else
                     {
-                        predContext = 1 + 2 * (edgeMi.RefFrame[0] == Constants.GoldenFrame ||
-                                               edgeMi.RefFrame[1] == Constants.GoldenFrame ? 1 : 0);
+                        predContext = 1 + (2 * (edgeMi.RefFrame[0] == Constants.GoldenFrame ||
+                                                edgeMi.RefFrame[1] == Constants.GoldenFrame
+                            ? 1
+                            : 0));
                     }
                 }
                 else
-                { // Inter/Inter
+                {
+                    // Inter/Inter
                     bool aboveHasSecond = xd.AboveMi.Value.HasSecondRef();
                     bool leftHasSecond = xd.LeftMi.Value.HasSecondRef();
                     sbyte above0 = xd.AboveMi.Value.RefFrame[0];
@@ -319,7 +355,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         if (above0 == left0 && above1 == left1)
                         {
                             predContext = 3 * (above0 == Constants.GoldenFrame || above1 == Constants.GoldenFrame ||
-                                                left0 == Constants.GoldenFrame || left1 == Constants.GoldenFrame ? 1 : 0);
+                                               left0 == Constants.GoldenFrame || left1 == Constants.GoldenFrame
+                                ? 1
+                                : 0);
                         }
                         else
                         {
@@ -342,7 +380,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         }
                         else
                         {
-                            predContext = 1 + 2 * (crf1 == Constants.GoldenFrame || crf2 == Constants.GoldenFrame ? 1 : 0);
+                            predContext =
+                                1 + (2 * (crf1 == Constants.GoldenFrame || crf2 == Constants.GoldenFrame ? 1 : 0));
                         }
                     }
                     else
@@ -353,18 +392,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         }
                         else if (above0 == Constants.LastFrame || left0 == Constants.LastFrame)
                         {
-                            sbyte edge0 = (above0 == Constants.LastFrame) ? left0 : above0;
+                            sbyte edge0 = above0 == Constants.LastFrame ? left0 : above0;
                             predContext = 4 * (edge0 == Constants.GoldenFrame ? 1 : 0);
                         }
                         else
                         {
-                            predContext = 2 * (above0 == Constants.GoldenFrame ? 1 : 0) + 2 * (left0 == Constants.GoldenFrame ? 1 : 0);
+                            predContext = (2 * (above0 == Constants.GoldenFrame ? 1 : 0)) +
+                                          (2 * (left0 == Constants.GoldenFrame ? 1 : 0));
                         }
                     }
                 }
             }
             else if (!xd.AboveMi.IsNull || !xd.LeftMi.IsNull)
-            { // One edge available
+            {
+                // One edge available
                 ref ModeInfo edgeMi = ref !xd.AboveMi.IsNull ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;
 
                 if (!edgeMi.IsInterBlock() || (edgeMi.RefFrame[0] == Constants.LastFrame && !edgeMi.HasSecondRef()))
@@ -378,16 +419,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 else
                 {
                     predContext = 3 * (edgeMi.RefFrame[0] == Constants.GoldenFrame ||
-                                       edgeMi.RefFrame[1] == Constants.GoldenFrame ? 1 : 0);
+                                       edgeMi.RefFrame[1] == Constants.GoldenFrame
+                        ? 1
+                        : 0);
                 }
             }
             else
-            { // No edges available (2)
+            {
+                // No edges available (2)
                 predContext = 2;
             }
-            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
 
+            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
             return predContext;
         }
     }
-}
+}

+ 94 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Prob.cs

@@ -0,0 +1,94 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using System;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    public static class Prob // Helpers that derive 8-bit probabilities from counts and blend them with previous probabilities.
+    {
+        public const int MaxProb = 255; // Same value as the upper clamp applied in GetProb.
+
+        private static byte GetProb(uint num, uint den) // Returns num / den scaled by 256, rounded, clamped to a byte.
+        {
+            Debug.Assert(den != 0); // Callers must rule out a zero denominator (GetBinaryProb and ModeMvMergeProbs both check first).
+            {
+                int p = (int)((((ulong)num * 256) + (den >> 1)) / den); // Widened to ulong before scaling; adding den / 2 rounds to nearest.
+                // Branchless clamp, intended to match: (p > 255) ? 255 : (p < 1) ? 1 : p;
+                int clippedProb = p | ((255 - p) >> 23) | (p == 0 ? 1 : 0); // (255 - p) is negative when p > 255, so the shift sets the low bits; p == 0 is bumped to 1.
+                return (byte)clippedProb; // The cast keeps only the low 8 bits.
+            }
+        }
+
+        private static byte GetBinaryProb(uint n0, uint n1) // Probability of the first outcome given the two outcome counts.
+        {
+            uint den = n0 + n1;
+            if (den == 0)
+            {
+                return 128; // No observations: return the midpoint value.
+            }
+
+            return GetProb(n0, den);
+        }
+
+        /* Blends prob1 and prob2 using weights (256 - factor) and factor. This function assumes prob1 and prob2 are already within [1,255] range. */
+        public static byte WeightedProb(int prob1, int prob2, int factor)
+        {
+            return (byte)BitUtils.RoundPowerOfTwo((prob1 * (256 - factor)) + (prob2 * factor), 8); // NOTE(review): RoundPowerOfTwo is presumably a rounded right shift by 8 bits - confirm in BitUtils.
+        }
+
+        public static byte MergeProbs(byte preProb, ref Array2<uint> ct, uint countSat, uint maxUpdateFactor) // Blends preProb with the probability implied by the counts in ct.
+        {
+            byte prob = GetBinaryProb(ct[0], ct[1]);
+            uint count = Math.Min(ct[0] + ct[1], countSat); // Total count, saturated at countSat.
+            uint factor = maxUpdateFactor * count / countSat; // Blend weight grows with the saturated count, up to maxUpdateFactor.
+            return WeightedProb(preProb, prob, (int)factor);
+        }
+
+        // Precomputed per-count blend factors, indexed by count 0..20: MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
+        private static readonly uint[] CountToUpdateFactor =
+        {
+            0, 6, 12, 19, 25, 32, 38, 44, 51, 57, 64, 70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+        };
+
+        private const int ModeMvCountSat = 20; // Saturation limit for the count; also the last valid index of CountToUpdateFactor.
+
+        public static byte ModeMvMergeProbs(byte preProb, ref Array2<uint> ct) // Like MergeProbs, but with the fixed saturation of 20 and the table-driven factor.
+        {
+            uint den = ct[0] + ct[1];
+            if (den == 0)
+            {
+                return preProb; // No observations: keep the previous probability unchanged.
+            }
+
+            uint count = Math.Min(den, ModeMvCountSat); // Saturate so the table lookup below stays in range.
+            uint factor = CountToUpdateFactor[(int)count];
+            byte prob = GetProb(ct[0], den); // Uses the unsaturated total as the denominator.
+            return WeightedProb(preProb, prob, (int)factor);
+        }
+
+        private static uint TreeMergeProbsImpl( // Recursively merges the node at tree index i and returns the total count beneath it.
+            uint i,
+            sbyte[] tree,
+            ReadOnlySpan<byte> preProbs,
+            ReadOnlySpan<uint> counts,
+            Span<byte> probs)
+        {
+            int l = tree[i]; // Left child entry of this node.
+            uint leftCount = l <= 0 ? counts[-l] : TreeMergeProbsImpl((uint)l, tree, preProbs, counts, probs); // Entries <= 0 are leaves: the negated value indexes counts; positive entries are child node indices.
+            int r = tree[i + 1]; // Right child entry of this node.
+            uint rightCount = r <= 0 ? counts[-r] : TreeMergeProbsImpl((uint)r, tree, preProbs, counts, probs);
+            Array2<uint> ct = new();
+            ct[0] = leftCount;
+            ct[1] = rightCount;
+            probs[(int)(i >> 1)] = ModeMvMergeProbs(preProbs[(int)(i >> 1)], ref ct); // Each node spans two tree entries, so its probability lives at index i / 2.
+            return leftCount + rightCount;
+        }
+
+        public static void VpxTreeMergeProbs(sbyte[] tree, ReadOnlySpan<byte> preProbs, ReadOnlySpan<uint> counts, // Writes the merged probability of every tree node into probs.
+            Span<byte> probs)
+        {
+            TreeMergeProbsImpl(0, tree, preProbs, counts, probs); // Start at tree index 0; the returned total count is discarded.
+        }
+    }
+}

+ 104 - 166
src/Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs

@@ -1,172 +1,127 @@
-using Ryujinx.Graphics.Nvdec.Vp9.Types;
-using System;
+using System;
 using System.Diagnostics;
 
 namespace Ryujinx.Graphics.Nvdec.Vp9
 {
     internal static class QuantCommon
     {
-        public const int MinQ = 0;
         public const int MaxQ = 255;
+        public const int QindexBits = 8;
 
-        private static readonly short[] _dcQlookup = {
-            4,    8,    8,    9,    10,  11,  12,  12,  13,  14,  15,   16,   17,   18,
-            19,   19,   20,   21,   22,  23,  24,  25,  26,  26,  27,   28,   29,   30,
-            31,   32,   32,   33,   34,  35,  36,  37,  38,  38,  39,   40,   41,   42,
-            43,   43,   44,   45,   46,  47,  48,  48,  49,  50,  51,   52,   53,   53,
-            54,   55,   56,   57,   57,  58,  59,  60,  61,  62,  62,   63,   64,   65,
-            66,   66,   67,   68,   69,  70,  70,  71,  72,  73,  74,   74,   75,   76,
-            77,   78,   78,   79,   80,  81,  81,  82,  83,  84,  85,   85,   87,   88,
-            90,   92,   93,   95,   96,  98,  99,  101, 102, 104, 105,  107,  108,  110,
-            111,  113,  114,  116,  117, 118, 120, 121, 123, 125, 127,  129,  131,  134,
-            136,  138,  140,  142,  144, 146, 148, 150, 152, 154, 156,  158,  161,  164,
-            166,  169,  172,  174,  177, 180, 182, 185, 187, 190, 192,  195,  199,  202,
-            205,  208,  211,  214,  217, 220, 223, 226, 230, 233, 237,  240,  243,  247,
-            250,  253,  257,  261,  265, 269, 272, 276, 280, 284, 288,  292,  296,  300,
-            304,  309,  313,  317,  322, 326, 330, 335, 340, 344, 349,  354,  359,  364,
-            369,  374,  379,  384,  389, 395, 400, 406, 411, 417, 423,  429,  435,  441,
-            447,  454,  461,  467,  475, 482, 489, 497, 505, 513, 522,  530,  539,  549,
-            559,  569,  579,  590,  602, 614, 626, 640, 654, 668, 684,  700,  717,  736,
-            755,  775,  796,  819,  843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139,
-            1184, 1232, 1282, 1336,
+        private static readonly short[] DcQlookup =
+        {
+            4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29,
+            30, 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51,
+            52, 53, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 66, 67, 68, 69, 70, 70, 71, 72,
+            73, 74, 74, 75, 76, 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88, 90, 92, 93, 95, 96, 98, 99,
+            101, 102, 104, 105, 107, 108, 110, 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134,
+            136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164, 166, 169, 172, 174, 177, 180, 182,
+            185, 187, 190, 192, 195, 199, 202, 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247,
+            250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300, 304, 309, 313, 317, 322, 326, 330,
+            335, 340, 344, 349, 354, 359, 364, 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441,
+            447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549, 559, 569, 579, 590, 602, 614, 626,
+            640, 654, 668, 684, 700, 717, 736, 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098,
+            1139, 1184, 1232, 1282, 1336
         };
 
-        private static readonly short[] _dcQlookup10 = {
-            4,    9,    10,   13,   15,   17,   20,   22,   25,   28,   31,   34,   37,
-            40,   43,   47,   50,   53,   57,   60,   64,   68,   71,   75,   78,   82,
-            86,   90,   93,   97,   101,  105,  109,  113,  116,  120,  124,  128,  132,
-            136,  140,  143,  147,  151,  155,  159,  163,  166,  170,  174,  178,  182,
-            185,  189,  193,  197,  200,  204,  208,  212,  215,  219,  223,  226,  230,
-            233,  237,  241,  244,  248,  251,  255,  259,  262,  266,  269,  273,  276,
-            280,  283,  287,  290,  293,  297,  300,  304,  307,  310,  314,  317,  321,
-            324,  327,  331,  334,  337,  343,  350,  356,  362,  369,  375,  381,  387,
-            394,  400,  406,  412,  418,  424,  430,  436,  442,  448,  454,  460,  466,
-            472,  478,  484,  490,  499,  507,  516,  525,  533,  542,  550,  559,  567,
-            576,  584,  592,  601,  609,  617,  625,  634,  644,  655,  666,  676,  687,
-            698,  708,  718,  729,  739,  749,  759,  770,  782,  795,  807,  819,  831,
-            844,  856,  868,  880,  891,  906,  920,  933,  947,  961,  975,  988,  1001,
-            1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202,
-            1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436,
-            1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717,
-            1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088,
-            2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675,
-            2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823,
-            3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347,
+        private static readonly short[] DcQlookup10 =
+        {
+            4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37, 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82,
+            86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132, 136, 140, 143, 147, 151, 155, 159, 163,
+            166, 170, 174, 178, 182, 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230, 233, 237, 241,
+            244, 248, 251, 255, 259, 262, 266, 269, 273, 276, 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314,
+            317, 321, 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387, 394, 400, 406, 412, 418, 424,
+            430, 436, 442, 448, 454, 460, 466, 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567, 576,
+            584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687, 698, 708, 718, 729, 739, 749, 759, 770, 782,
+            795, 807, 819, 831, 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001, 1015, 1030, 1045,
+            1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202, 1218, 1236, 1253, 1271, 1288, 1306, 1323,
+            1342, 1361, 1379, 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670,
+            1692, 1717, 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088, 2123, 2159,
+            2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102,
+            3188, 3280, 3375, 3478, 3586, 3702, 3823, 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347
         };
 
-        private static readonly short[] _dcQlookup12 = {
-            4,     12,    18,    25,    33,    41,    50,    60,    70,    80,    91,
-            103,   115,   127,   140,   153,   166,   180,   194,   208,   222,   237,
-            251,   266,   281,   296,   312,   327,   343,   358,   374,   390,   405,
-            421,   437,   453,   469,   484,   500,   516,   532,   548,   564,   580,
-            596,   611,   627,   643,   659,   674,   690,   706,   721,   737,   752,
-            768,   783,   798,   814,   829,   844,   859,   874,   889,   904,   919,
-            934,   949,   964,   978,   993,   1008,  1022,  1037,  1051,  1065,  1080,
-            1094,  1108,  1122,  1136,  1151,  1165,  1179,  1192,  1206,  1220,  1234,
-            1248,  1261,  1275,  1288,  1302,  1315,  1329,  1342,  1368,  1393,  1419,
-            1444,  1469,  1494,  1519,  1544,  1569,  1594,  1618,  1643,  1668,  1692,
-            1717,  1741,  1765,  1789,  1814,  1838,  1862,  1885,  1909,  1933,  1957,
-            1992,  2027,  2061,  2096,  2130,  2165,  2199,  2233,  2267,  2300,  2334,
-            2367,  2400,  2434,  2467,  2499,  2532,  2575,  2618,  2661,  2704,  2746,
-            2788,  2830,  2872,  2913,  2954,  2995,  3036,  3076,  3127,  3177,  3226,
-            3275,  3324,  3373,  3421,  3469,  3517,  3565,  3621,  3677,  3733,  3788,
-            3843,  3897,  3951,  4005,  4058,  4119,  4181,  4241,  4301,  4361,  4420,
-            4479,  4546,  4612,  4677,  4742,  4807,  4871,  4942,  5013,  5083,  5153,
-            5222,  5291,  5367,  5442,  5517,  5591,  5665,  5745,  5825,  5905,  5984,
-            6063,  6149,  6234,  6319,  6404,  6495,  6587,  6678,  6769,  6867,  6966,
-            7064,  7163,  7269,  7376,  7483,  7599,  7715,  7832,  7958,  8085,  8214,
-            8352,  8492,  8635,  8788,  8945,  9104,  9275,  9450,  9639,  9832,  10031,
-            10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118,
-            13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949,
-            19718, 20521, 21387,
+        private static readonly short[] DcQlookup12 =
+        {
+            4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91, 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237, 251,
+            266, 281, 296, 312, 327, 343, 358, 374, 390, 405, 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580,
+            596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752, 768, 783, 798, 814, 829, 844, 859, 874, 889, 904,
+            919, 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122, 1136, 1151, 1165,
+            1179, 1192, 1206, 1220, 1234, 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419, 1444,
+            1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741, 1765, 1789, 1814, 1838, 1862,
+            1885, 1909, 1933, 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334, 2367, 2400,
+            2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076,
+            3127, 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951,
+            4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013,
+            5083, 5153, 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149, 6234, 6319,
+            6404, 6495, 6587, 6678, 6769, 6867, 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085,
+            8214, 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245, 10465, 10702, 10946,
+            11210, 11482, 11776, 12081, 12409, 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943,
+            17575, 18237, 18949, 19718, 20521, 21387
         };
 
-        private static readonly short[] _acQlookup = {
-            4,    8,    9,    10,   11,   12,   13,   14,   15,   16,   17,   18,   19,
-            20,   21,   22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
-            33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,   45,
-            46,   47,   48,   49,   50,   51,   52,   53,   54,   55,   56,   57,   58,
-            59,   60,   61,   62,   63,   64,   65,   66,   67,   68,   69,   70,   71,
-            72,   73,   74,   75,   76,   77,   78,   79,   80,   81,   82,   83,   84,
-            85,   86,   87,   88,   89,   90,   91,   92,   93,   94,   95,   96,   97,
-            98,   99,   100,  101,  102,  104,  106,  108,  110,  112,  114,  116,  118,
-            120,  122,  124,  126,  128,  130,  132,  134,  136,  138,  140,  142,  144,
-            146,  148,  150,  152,  155,  158,  161,  164,  167,  170,  173,  176,  179,
-            182,  185,  188,  191,  194,  197,  200,  203,  207,  211,  215,  219,  223,
-            227,  231,  235,  239,  243,  247,  251,  255,  260,  265,  270,  275,  280,
-            285,  290,  295,  300,  305,  311,  317,  323,  329,  335,  341,  347,  353,
-            359,  366,  373,  380,  387,  394,  401,  408,  416,  424,  432,  440,  448,
-            456,  465,  474,  483,  492,  501,  510,  520,  530,  540,  550,  560,  571,
-            582,  593,  604,  615,  627,  639,  651,  663,  676,  689,  702,  715,  729,
-            743,  757,  771,  786,  801,  816,  832,  848,  864,  881,  898,  915,  933,
-            951,  969,  988,  1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196,
-            1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537,
-            1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+        // Lookup table returned for BitDepth.Bits8 by the switch over BitDepth further down in this file,
+        // which indexes it with Math.Clamp(qindex + delta, 0, MaxQ). Entries rise from 4 to 1828.
+        // NOTE(review): presumably the 8-bit AC quantizer steps ported from libvpx - confirm.
+        private static readonly short[] AcQlookup =
+        {
+            4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
+            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+            60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
+            86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114,
+            116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, 150, 152, 155, 158,
+            161, 164, 167, 170, 173, 176, 179, 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223, 227,
+            231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280, 285, 290, 295, 300, 305, 311, 317, 323, 329,
+            335, 341, 347, 353, 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448, 456, 465, 474, 483,
+            492, 501, 510, 520, 530, 540, 550, 560, 571, 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715,
+            729, 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933, 951, 969, 988, 1007, 1026, 1046,
+            1066, 1087, 1108, 1129, 1151, 1173, 1196, 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451,
+            1479, 1508, 1537, 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828
         };
 
-        private static readonly short[] _acQlookup10 = {
-            4,    9,    11,   13,   16,   18,   21,   24,   27,   30,   33,   37,   40,
-            44,   48,   51,   55,   59,   63,   67,   71,   75,   79,   83,   88,   92,
-            96,   100,  105,  109,  114,  118,  122,  127,  131,  136,  140,  145,  149,
-            154,  158,  163,  168,  172,  177,  181,  186,  190,  195,  199,  204,  208,
-            213,  217,  222,  226,  231,  235,  240,  244,  249,  253,  258,  262,  267,
-            271,  275,  280,  284,  289,  293,  297,  302,  306,  311,  315,  319,  324,
-            328,  332,  337,  341,  345,  349,  354,  358,  362,  367,  371,  375,  379,
-            384,  388,  392,  396,  401,  409,  417,  425,  433,  441,  449,  458,  466,
-            474,  482,  490,  498,  506,  514,  523,  531,  539,  547,  555,  563,  571,
-            579,  588,  596,  604,  616,  628,  640,  652,  664,  676,  688,  700,  713,
-            725,  737,  749,  761,  773,  785,  797,  809,  825,  841,  857,  873,  889,
-            905,  922,  938,  954,  970,  986,  1002, 1018, 1038, 1058, 1078, 1098, 1118,
-            1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411,
-            1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791,
-            1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283,
-            2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915,
-            2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731,
-            3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784,
-            4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148,
-            6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312,
+        // Lookup table returned for BitDepth.Bits10 by the switch over BitDepth further down in this file,
+        // which indexes it with Math.Clamp(qindex + delta, 0, MaxQ). Entries rise from 4 to 7312.
+        // NOTE(review): presumably the 10-bit AC quantizer steps ported from libvpx - confirm.
+        private static readonly short[] AcQlookup10 =
+        {
+            4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40, 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92,
+            96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149, 154, 158, 163, 168, 172, 177, 181, 186,
+            190, 195, 199, 204, 208, 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267, 271, 275, 280,
+            284, 289, 293, 297, 302, 306, 311, 315, 319, 324, 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371,
+            375, 379, 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466, 474, 482, 490, 498, 506, 514,
+            523, 531, 539, 547, 555, 563, 571, 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713, 725,
+            737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889, 905, 922, 938, 954, 970, 986, 1002, 1018,
+            1038, 1058, 1078, 1098, 1118, 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386,
+            1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791, 1823, 1859, 1895,
+            1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, 2555, 2603,
+            2651, 2703, 2755, 2807, 2859, 2915, 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591,
+            3659, 3731, 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784, 4876, 4972,
+            5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, 6388, 6512, 6640, 6768, 6900,
+            7036, 7172, 7312
         };
 
-        private static readonly short[] _acQlookup12 = {
-            4,     13,    19,    27,    35,    44,    54,    64,    75,    87,    99,
-            112,   126,   139,   154,   168,   183,   199,   214,   230,   247,   263,
-            280,   297,   314,   331,   349,   366,   384,   402,   420,   438,   456,
-            475,   493,   511,   530,   548,   567,   586,   604,   623,   642,   660,
-            679,   698,   716,   735,   753,   772,   791,   809,   828,   846,   865,
-            884,   902,   920,   939,   957,   976,   994,   1012,  1030,  1049,  1067,
-            1085,  1103,  1121,  1139,  1157,  1175,  1193,  1211,  1229,  1246,  1264,
-            1282,  1299,  1317,  1335,  1352,  1370,  1387,  1405,  1422,  1440,  1457,
-            1474,  1491,  1509,  1526,  1543,  1560,  1577,  1595,  1627,  1660,  1693,
-            1725,  1758,  1791,  1824,  1856,  1889,  1922,  1954,  1987,  2020,  2052,
-            2085,  2118,  2150,  2183,  2216,  2248,  2281,  2313,  2346,  2378,  2411,
-            2459,  2508,  2556,  2605,  2653,  2701,  2750,  2798,  2847,  2895,  2943,
-            2992,  3040,  3088,  3137,  3185,  3234,  3298,  3362,  3426,  3491,  3555,
-            3619,  3684,  3748,  3812,  3876,  3941,  4005,  4069,  4149,  4230,  4310,
-            4390,  4470,  4550,  4631,  4711,  4791,  4871,  4967,  5064,  5160,  5256,
-            5352,  5448,  5544,  5641,  5737,  5849,  5961,  6073,  6185,  6297,  6410,
-            6522,  6650,  6778,  6906,  7034,  7162,  7290,  7435,  7579,  7723,  7867,
-            8011,  8155,  8315,  8475,  8635,  8795,  8956,  9132,  9308,  9484,  9660,
-            9836,  10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885,
-            12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637,
-            14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062,
-            18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334,
-            22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599,
-            28143, 28687, 29247,
+        // Lookup table returned for BitDepth.Bits12 by the switch over BitDepth further down in this file,
+        // which indexes it with Math.Clamp(qindex + delta, 0, MaxQ). Entries rise from 4 to 29247.
+        // NOTE(review): presumably the 12-bit AC quantizer steps ported from libvpx - confirm.
+        private static readonly short[] AcQlookup12 =
+        {
+            4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99, 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263, 280,
+            297, 314, 331, 349, 366, 384, 402, 420, 438, 456, 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660,
+            679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865, 884, 902, 920, 939, 957, 976, 994, 1012, 1030,
+            1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317, 1335,
+            1352, 1370, 1387, 1405, 1422, 1440, 1457, 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660,
+            1693, 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118, 2150, 2183, 2216,
+            2248, 2281, 2313, 2346, 2378, 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943,
+            2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619, 3684, 3748, 3812, 3876, 3941,
+            4005, 4069, 4149, 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256, 5352,
+            5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, 6522, 6650, 6778, 6906, 7034, 7162, 7290,
+            7435, 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028,
+            10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885, 12109, 12333, 12573, 12813, 13053, 13309,
+            13565, 13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726,
+            18062, 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334, 22766, 23214, 23662,
+            24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247
         };
 
         /// <summary>
         /// Looks up a DC table entry for the given bit depth.
         /// </summary>
         /// <param name="qindex">Base index; it is added to <paramref name="delta"/> before the lookup.</param>
         /// <param name="delta">Offset added to <paramref name="qindex"/>; the sum is clamped to [0, MaxQ].</param>
         /// <param name="bitDepth">Selects the 8-bit, 10-bit or 12-bit DC lookup table.</param>
         /// <returns>
         /// The entry at the clamped index, or -1 (after a failed debug assertion) when
         /// <paramref name="bitDepth"/> is not one of the three handled values.
         /// </returns>
         public static short DcQuant(int qindex, int delta, BitDepth bitDepth)
         {
             switch (bitDepth)
             {
-                case BitDepth.Bits8:
-                    return _dcQlookup[Math.Clamp(qindex + delta, 0, MaxQ)];
-                case BitDepth.Bits10:
-                    return _dcQlookup10[Math.Clamp(qindex + delta, 0, MaxQ)];
-                case BitDepth.Bits12:
-                    return _dcQlookup12[Math.Clamp(qindex + delta, 0, MaxQ)];
+                case BitDepth.Bits8: return DcQlookup[Math.Clamp(qindex + delta, 0, MaxQ)];
+                case BitDepth.Bits10: return DcQlookup10[Math.Clamp(qindex + delta, 0, MaxQ)];
+                case BitDepth.Bits12: return DcQlookup12[Math.Clamp(qindex + delta, 0, MaxQ)];
                 default:
-                    Debug.Assert(false, "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
-
+                    Debug.Assert(false, "bitDepth should be Bits8, Bits10 or Bits12");
                     return -1;
             }
         }
@@ -175,30 +130,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         {
             switch (bitDepth)
             {
-                case BitDepth.Bits8:
-                    return _acQlookup[Math.Clamp(qindex + delta, 0, MaxQ)];
-                case BitDepth.Bits10:
-                    return _acQlookup10[Math.Clamp(qindex + delta, 0, MaxQ)];
-                case BitDepth.Bits12:
-                    return _acQlookup12[Math.Clamp(qindex + delta, 0, MaxQ)];
+                case BitDepth.Bits8: return AcQlookup[Math.Clamp(qindex + delta, 0, MaxQ)];
+                case BitDepth.Bits10: return AcQlookup10[Math.Clamp(qindex + delta, 0, MaxQ)];
+                case BitDepth.Bits12: return AcQlookup12[Math.Clamp(qindex + delta, 0, MaxQ)];
                 default:
-                    Debug.Assert(false, "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
-
+                    Debug.Assert(false, "bitDepth should be Bits8, Bits10 or Bits12");
                     return -1;
             }
         }
-
-        public static int GetQIndex(ref Segmentation seg, int segmentId, int baseQIndex)
-        {
-            if (seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlAltQ) != 0)
-            {
-                int data = seg.GetSegData(segmentId, SegLvlFeatures.SegLvlAltQ);
-                int segQIndex = seg.AbsDelta == Constants.SegmentAbsData ? data : baseQIndex + data;
-
-                return Math.Clamp(segQIndex, 0, MaxQ);
-            }
-
-            return baseQIndex;
-        }
     }
-}
+}

+ 84 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/ReadBitBuffer.cs

@@ -0,0 +1,84 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using System;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    /// <summary>
+    /// Sequential bit reader over a read-only byte span. Bits are taken from each byte
+    /// starting with its most significant bit.
+    /// </summary>
+    public ref struct ReadBitBuffer
+    {
+        // Bytes that bits are read from.
+        public ReadOnlySpan<byte> BitBuffer;
+
+        // Position of the next bit to read, counted in bits from the start of BitBuffer.
+        public ulong BitOffset;
+
+        // Not referenced by any method of this struct.
+        // NOTE(review): presumably context for an error callback used by callers - confirm.
+        public object ErrorHandlerData;
+
+        /// <summary>
+        /// Reads a literal whose width comes from BitUtils.GetUnsignedBits(max) and caps the
+        /// result at <paramref name="max"/>.
+        /// </summary>
+        public int DecodeUnsignedMax(int max)
+        {
+            int width = BitUtils.GetUnsignedBits((uint)max);
+            int decoded = ReadLiteral(width);
+
+            if (decoded > max)
+            {
+                return max;
+            }
+
+            return decoded;
+        }
+
+        /// <summary>
+        /// Number of bytes touched so far: the bit offset divided by 8, rounded up.
+        /// </summary>
+        public ulong BytesRead()
+        {
+            ulong roundedUpBits = BitOffset + 7;
+            return roundedUpBits >> 3;
+        }
+
+        /// <summary>
+        /// Reads the bit at the current offset and advances by one bit. When the offset lies
+        /// past the end of the buffer, returns 0 and leaves the offset unchanged.
+        /// </summary>
+        public int ReadBit()
+        {
+            ulong position = BitOffset;
+            ulong byteIndex = position >> 3;
+
+            if (byteIndex >= (ulong)BitBuffer.Length)
+            {
+                return 0;
+            }
+
+            // Bit 7 of a byte is consumed first, bit 0 last.
+            int shift = 7 - (int)(position & 0x7);
+            int result = (BitBuffer[(int)byteIndex] >> shift) & 1;
+            BitOffset = position + 1;
+
+            return result;
+        }
+
+        /// <summary>
+        /// Reads <paramref name="bits"/> bits and assembles them into an integer, first bit
+        /// read becoming the highest bit of the result.
+        /// </summary>
+        public int ReadLiteral(int bits)
+        {
+            int result = 0;
+            int shift = bits - 1;
+
+            while (shift >= 0)
+            {
+                result |= ReadBit() << shift;
+                shift--;
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Reads a <paramref name="bits"/>-bit magnitude followed by one sign bit; a set sign
+        /// bit negates the magnitude.
+        /// </summary>
+        public int ReadSignedLiteral(int bits)
+        {
+            int magnitude = ReadLiteral(bits);
+            bool isNegative = ReadBit() != 0;
+
+            if (isNegative)
+            {
+                return -magnitude;
+            }
+
+            return magnitude;
+        }
+
+        /// <summary>
+        /// Forwards to <see cref="ReadSignedLiteral"/> with the same bit count.
+        /// </summary>
+        public int ReadInvSignedLiteral(int bits)
+        {
+            int result = ReadSignedLiteral(bits);
+            return result;
+        }
+
+        /// <summary>
+        /// Reads one flag bit; when it is set, reads and returns a 4-bit signed literal,
+        /// otherwise returns 0.
+        /// </summary>
+        public int ReadDeltaQ()
+        {
+            if (ReadBit() == 0)
+            {
+                return 0;
+            }
+
+            return ReadSignedLiteral(4);
+        }
+
+        /// <summary>
+        /// Reads two 16-bit literals, width first and then height, each stored as value minus one.
+        /// </summary>
+        public void ReadFrameSize(out int width, out int height)
+        {
+            int widthMinusOne = ReadLiteral(16);
+            width = widthMinusOne + 1;
+
+            int heightMinusOne = ReadLiteral(16);
+            height = heightMinusOne + 1;
+        }
+
+        /// <summary>
+        /// Reads the profile: two bits, low bit first, plus one extra bit that is added to the
+        /// value when the two-bit value exceeds 2.
+        /// </summary>
+        public BitstreamProfile ReadProfile()
+        {
+            int lowBit = ReadBit();
+            int highBit = ReadBit();
+            int profile = lowBit | (highBit << 1);
+
+            if (profile > 2)
+            {
+                profile += ReadBit();
+            }
+
+            return (BitstreamProfile)profile;
+        }
+    }
+}

+ 23 - 50
src/Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using System;
 using System.Diagnostics;
@@ -77,65 +77,38 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 bd);
         }
 
-        private static int RoundMvCompQ4(int value)
+        // Divides value by 4, rounding to the nearest integer with halves rounded away from zero:
+        // 2 is added with the sign of value before the truncating integer division.
+        public static int RoundMvCompQ4(int value)
         {
             return (value < 0 ? value - 2 : value + 2) / 4;
         }
 
-        private static Mv MiMvPredQ4(ref ModeInfo mi, int idx)
-        {
-            return new Mv
-            {
-                Row = (short)RoundMvCompQ4(
-                    mi.Bmi[0].Mv[idx].Row + mi.Bmi[1].Mv[idx].Row +
-                    mi.Bmi[2].Mv[idx].Row + mi.Bmi[3].Mv[idx].Row),
-                Col = (short)RoundMvCompQ4(
-                    mi.Bmi[0].Mv[idx].Col + mi.Bmi[1].Mv[idx].Col +
-                    mi.Bmi[2].Mv[idx].Col + mi.Bmi[3].Mv[idx].Col),
-            };
-        }
-
-        private static int RoundMvCompQ2(int value)
+        // Divides value by 2, rounding halves away from zero: 1 is added with the sign of
+        // value before the truncating integer division.
+        public static int RoundMvCompQ2(int value)
         {
             return (value < 0 ? value - 1 : value + 1) / 2;
         }
 
-        private static Mv MiMvPredQ2(ref ModeInfo mi, int idx, int block0, int block1)
-        {
-            return new Mv
-            {
-                Row = (short)RoundMvCompQ2(
-                    mi.Bmi[block0].Mv[idx].Row +
-                    mi.Bmi[block1].Mv[idx].Row),
-                Col = (short)RoundMvCompQ2(
-                    mi.Bmi[block0].Mv[idx].Col +
-                    mi.Bmi[block1].Mv[idx].Col),
-            };
-        }
-
         /// <summary>
         /// Builds a copy of <paramref name="srcMv"/> scaled by (1 &lt;&lt; (1 - ssY)) for the row and
         /// (1 &lt;&lt; (1 - ssX)) for the column, then clamps it to a window derived from the block edge
         /// distances in <paramref name="xd"/>, scaled the same way and widened by a margin computed
         /// from <paramref name="bw"/> and <paramref name="bh"/>.
         /// </summary>
         /// <param name="xd">Supplies MbToLeftEdge, MbToRightEdge, MbToTopEdge and MbToBottomEdge.</param>
         /// <param name="srcMv">Motion vector to scale and clamp; it is only read here.</param>
         /// <param name="bw">Added to the interpolation extend constant to form the left/right margin.</param>
         /// <param name="bh">Added to the interpolation extend constant to form the top/bottom margin.</param>
         /// <param name="ssX">Horizontal subsampling shift; asserted to be at most 1.</param>
         /// <param name="ssY">Vertical subsampling shift; asserted to be at most 1.</param>
         /// <returns>The scaled and clamped motion vector.</returns>
         public static Mv ClampMvToUmvBorderSb(ref MacroBlockD xd, ref Mv srcMv, int bw, int bh, int ssX, int ssY)
         {
             // If the MV points so far into the UMV border that no visible pixels
             // are used for reconstruction, the subpel part of the MV can be
             // discarded and the MV limited to 16 pixels with equivalent results.
-            int spelLeft = (Constants.Vp9InterpExtend + bw) << SubpelBits;
+            int spelLeft = (Constants.InterpExtend + bw) << SubpelBits;
             int spelRight = spelLeft - SubpelShifts;
-            int spelTop = (Constants.Vp9InterpExtend + bh) << SubpelBits;
+            int spelTop = (Constants.InterpExtend + bh) << SubpelBits;
             int spelBottom = spelTop - SubpelShifts;
             Mv clampedMv = new()
             {
-                Row = (short)(srcMv.Row * (1 << (1 - ssY))),
-                Col = (short)(srcMv.Col * (1 << (1 - ssX))),
+                Row = (short)(srcMv.Row * (1 << (1 - ssY))), Col = (short)(srcMv.Col * (1 << (1 - ssX)))
             };
 
             Debug.Assert(ssX <= 1);
             Debug.Assert(ssY <= 1);
 
-            clampedMv.ClampMv(
-               xd.MbToLeftEdge * (1 << (1 - ssX)) - spelLeft,
-               xd.MbToRightEdge * (1 << (1 - ssX)) + spelRight,
-               xd.MbToTopEdge * (1 << (1 - ssY)) - spelTop,
-               xd.MbToBottomEdge * (1 << (1 - ssY)) + spelBottom);
+            clampedMv.Clamp(
+                (xd.MbToLeftEdge * (1 << (1 - ssX))) - spelLeft,
+                (xd.MbToRightEdge * (1 << (1 - ssX))) + spelRight,
+                (xd.MbToTopEdge * (1 << (1 - ssY))) - spelTop,
+                (xd.MbToBottomEdge * (1 << (1 - ssY))) + spelBottom);
 
             return clampedMv;
         }
@@ -150,18 +123,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     res = mi.Bmi[block].Mv[refr];
                     break;
                 case 1:
-                    res = MiMvPredQ2(ref mi, refr, block, block + 2);
+                    res = mi.MvPredQ2(refr, block, block + 2);
                     break;
                 case 2:
-                    res = MiMvPredQ2(ref mi, refr, block, block + 1);
+                    res = mi.MvPredQ2(refr, block, block + 1);
                     break;
                 case 3:
-                    res = MiMvPredQ4(ref mi, refr);
+                    res = mi.MvPredQ4(refr);
                     break;
                 default:
                     Debug.Assert(ssIdx <= 3 && ssIdx >= 0);
                     break;
             }
+
             return res;
         }
 
@@ -169,8 +143,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         {
             int x = !sf.IsNull ? sf.Value.ScaleValueX(xOffset) : xOffset;
             int y = !sf.IsNull ? sf.Value.ScaleValueY(yOffset) : yOffset;
-
-            return y * stride + x;
+            return (y * stride) + x;
         }
 
         private static void SetupPredPlanes(
@@ -203,12 +176,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             strides[0] = src.Stride;
             strides[1] = src.UvStride;
             strides[2] = src.UvStride;
-            int i;
 
-            for (i = 0; i < Constants.MaxMbPlane; ++i)
+            for (int i = 0; i < Constants.MaxMbPlane; ++i)
             {
                 ref MacroBlockDPlane pd = ref planes[i];
-                SetupPredPlanes(ref pd.Dst, buffers[i], strides[i], miRow, miCol, Ptr<ScaleFactors>.Null, pd.SubsamplingX, pd.SubsamplingY);
+                SetupPredPlanes(ref pd.Dst, buffers[i], strides[i], miRow, miCol, Ptr<ScaleFactors>.Null,
+                    pd.SubsamplingX, pd.SubsamplingY);
             }
         }
 
@@ -230,14 +203,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 strides[0] = src.Stride;
                 strides[1] = src.UvStride;
                 strides[2] = src.UvStride;
-                int i;
 
-                for (i = 0; i < Constants.MaxMbPlane; ++i)
+                for (int i = 0; i < Constants.MaxMbPlane; ++i)
                 {
                     ref MacroBlockDPlane pd = ref xd.Plane[i];
-                    SetupPredPlanes(ref pd.Pre[idx], buffers[i], strides[i], miRow, miCol, sf, pd.SubsamplingX, pd.SubsamplingY);
+                    SetupPredPlanes(ref pd.Pre[idx], buffers[i], strides[i], miRow, miCol, sf, pd.SubsamplingX,
+                        pd.SubsamplingY);
                 }
             }
         }
     }
-}
+}

+ 80 - 183
src/Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using System;
 using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.IntraPred;
@@ -7,7 +7,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 {
     internal static class ReconIntra
     {
-        public static readonly TxType[] IntraModeToTxTypeLookup = {
+        public static readonly TxType[] IntraModeToTxTypeLookup =
+        {
             TxType.DctDct, // DC
             TxType.AdstDct, // V
             TxType.DctAdst, // H
@@ -17,7 +18,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             TxType.DctAdst, // D153
             TxType.DctAdst, // D207
             TxType.AdstDct, // D63
-            TxType.AdstAdst, // TM
+            TxType.AdstAdst // TM
         };
 
         private const int NeedLeft = 1 << 1;
@@ -35,231 +36,123 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             NeedLeft | NeedAbove, // D153
             NeedLeft, // D207
             NeedAboveRight, // D63
-            NeedLeft | NeedAbove, // TM
+            NeedLeft | NeedAbove // TM
         };
 
         private unsafe delegate void IntraPredFn(byte* dst, int stride, byte* above, byte* left);
 
-        private static readonly unsafe IntraPredFn[][] _pred = {
-            new IntraPredFn[]
-            {
-                null,
-                null,
-                null,
-                null,
-            },
-            new IntraPredFn[]
-            {
-                VPredictor4x4,
-                VPredictor8x8,
-                VPredictor16x16,
-                VPredictor32x32,
-            },
-            new IntraPredFn[]
-            {
-                HPredictor4x4,
-                HPredictor8x8,
-                HPredictor16x16,
-                HPredictor32x32,
-            },
-            new IntraPredFn[]
-            {
-                D45Predictor4x4,
-                D45Predictor8x8,
-                D45Predictor16x16,
-                D45Predictor32x32,
-            },
-            new IntraPredFn[]
-            {
-                D135Predictor4x4,
-                D135Predictor8x8,
-                D135Predictor16x16,
-                D135Predictor32x32,
-            },
-            new IntraPredFn[]
-            {
-                D117Predictor4x4,
-                D117Predictor8x8,
-                D117Predictor16x16,
-                D117Predictor32x32,
-            },
-            new IntraPredFn[]
-            {
-                D153Predictor4x4,
-                D153Predictor8x8,
-                D153Predictor16x16,
-                D153Predictor32x32,
-            },
-            new IntraPredFn[]
-            {
-                D207Predictor4x4,
-                D207Predictor8x8,
-                D207Predictor16x16,
-                D207Predictor32x32,
-            },
-            new IntraPredFn[]
-            {
-                D63Predictor4x4,
-                D63Predictor8x8,
-                D63Predictor16x16,
-                D63Predictor32x32,
-            },
-            new IntraPredFn[]
-            {
-                TMPredictor4x4,
-                TMPredictor8x8,
-                TMPredictor16x16,
-                TMPredictor32x32,
-            },
+        // 8-bit intra predictor table. Rows, in order: a null placeholder, V, H, D45, D135, D117, D153,
+        // D207, D63, TM. Columns, in order: the 4x4, 8x8, 16x16 and 32x32 variant of that predictor.
+        // NOTE(review): the first row is presumably null because DC prediction goes through DcPred - confirm.
+        private static readonly unsafe IntraPredFn[][] Pred =
+        {
+            new IntraPredFn[] { null, null, null, null },
+            new IntraPredFn[] { VPredictor4x4, VPredictor8x8, VPredictor16x16, VPredictor32x32 },
+            new IntraPredFn[] { HPredictor4x4, HPredictor8x8, HPredictor16x16, HPredictor32x32 },
+            new IntraPredFn[] { D45Predictor4x4, D45Predictor8x8, D45Predictor16x16, D45Predictor32x32 },
+            new IntraPredFn[] { D135Predictor4x4, D135Predictor8x8, D135Predictor16x16, D135Predictor32x32 },
+            new IntraPredFn[] { D117Predictor4x4, D117Predictor8x8, D117Predictor16x16, D117Predictor32x32 },
+            new IntraPredFn[] { D153Predictor4x4, D153Predictor8x8, D153Predictor16x16, D153Predictor32x32 },
+            new IntraPredFn[] { D207Predictor4x4, D207Predictor8x8, D207Predictor16x16, D207Predictor32x32 },
+            new IntraPredFn[] { D63Predictor4x4, D63Predictor8x8, D63Predictor16x16, D63Predictor32x32 },
+            new IntraPredFn[] { TmPredictor4x4, TmPredictor8x8, TmPredictor16x16, TmPredictor32x32 }
         };
 
-        private static readonly unsafe IntraPredFn[][][] _dcPred = {
+        // 8-bit DC predictor table laid out as [a][b][size]: [0][0] = Dc128, [0][1] = DcTop,
+        // [1][0] = DcLeft, [1][1] = Dc; the last index picks the 4x4, 8x8, 16x16 or 32x32 variant.
+        // NOTE(review): a and b presumably flag whether left and above neighbours are available - confirm.
+        private static readonly unsafe IntraPredFn[][][] DcPred =
+        {
             new[]
             {
                 new IntraPredFn[]
                 {
-                    Dc128Predictor4x4,
-                    Dc128Predictor8x8,
-                    Dc128Predictor16x16,
-                    Dc128Predictor32x32,
+                    Dc128Predictor4x4, Dc128Predictor8x8, Dc128Predictor16x16, Dc128Predictor32x32
                 },
                 new IntraPredFn[]
                 {
-                    DcTopPredictor4x4,
-                    DcTopPredictor8x8,
-                    DcTopPredictor16x16,
-                    DcTopPredictor32x32,
-                },
+                    DcTopPredictor4x4, DcTopPredictor8x8, DcTopPredictor16x16, DcTopPredictor32x32
+                }
             },
             new[]
             {
                 new IntraPredFn[]
                 {
-                    DcLeftPredictor4x4,
-                    DcLeftPredictor8x8,
-                    DcLeftPredictor16x16,
-                    DcLeftPredictor32x32,
+                    DcLeftPredictor4x4, DcLeftPredictor8x8, DcLeftPredictor16x16, DcLeftPredictor32x32
                 },
-                new IntraPredFn[]
-                {
-                    DcPredictor4x4,
-                    DcPredictor8x8,
-                    DcPredictor16x16,
-                    DcPredictor32x32,
-                },
-            },
+                new IntraPredFn[] { DcPredictor4x4, DcPredictor8x8, DcPredictor16x16, DcPredictor32x32 }
+            }
         };
 
         private unsafe delegate void IntraHighPredFn(ushort* dst, int stride, ushort* above, ushort* left, int bd);
 
-        private static readonly unsafe IntraHighPredFn[][] _predHigh = {
-            new IntraHighPredFn[]
-            {
-                null,
-                null,
-                null,
-                null,
-            },
+        // High bit depth (ushort sample) counterpart of the intra predictor table. Rows, in order: a null
+        // placeholder, V, H, D45, D135, D117, D153, D207, D63, TM. Columns: 4x4, 8x8, 16x16, 32x32.
+        // NOTE(review): the first row is presumably null because DC prediction goes through DcPredHigh - confirm.
+        private static readonly unsafe IntraHighPredFn[][] PredHigh =
+        {
+            new IntraHighPredFn[] { null, null, null, null },
             new IntraHighPredFn[]
             {
-                HighbdVPredictor4x4,
-                HighbdVPredictor8x8,
-                HighbdVPredictor16x16,
-                HighbdVPredictor32x32,
+                HighbdVPredictor4x4, HighbdVPredictor8x8, HighbdVPredictor16x16, HighbdVPredictor32x32
             },
             new IntraHighPredFn[]
             {
-                HighbdHPredictor4x4,
-                HighbdHPredictor8x8,
-                HighbdHPredictor16x16,
-                HighbdHPredictor32x32,
+                HighbdHPredictor4x4, HighbdHPredictor8x8, HighbdHPredictor16x16, HighbdHPredictor32x32
             },
             new IntraHighPredFn[]
             {
-                HighbdD45Predictor4x4,
-                HighbdD45Predictor8x8,
-                HighbdD45Predictor16x16,
-                HighbdD45Predictor32x32,
+                HighbdD45Predictor4x4, HighbdD45Predictor8x8, HighbdD45Predictor16x16, HighbdD45Predictor32x32
             },
             new IntraHighPredFn[]
             {
-                HighbdD135Predictor4x4,
-                HighbdD135Predictor8x8,
-                HighbdD135Predictor16x16,
-                HighbdD135Predictor32x32,
+                HighbdD135Predictor4x4, HighbdD135Predictor8x8, HighbdD135Predictor16x16,
+                HighbdD135Predictor32x32
             },
             new IntraHighPredFn[]
             {
-                HighbdD117Predictor4x4,
-                HighbdD117Predictor8x8,
-                HighbdD117Predictor16x16,
-                HighbdD117Predictor32x32,
+                HighbdD117Predictor4x4, HighbdD117Predictor8x8, HighbdD117Predictor16x16,
+                HighbdD117Predictor32x32
             },
             new IntraHighPredFn[]
             {
-                HighbdD153Predictor4x4,
-                HighbdD153Predictor8x8,
-                HighbdD153Predictor16x16,
-                HighbdD153Predictor32x32,
+                HighbdD153Predictor4x4, HighbdD153Predictor8x8, HighbdD153Predictor16x16,
+                HighbdD153Predictor32x32
             },
             new IntraHighPredFn[]
             {
-                HighbdD207Predictor4x4,
-                HighbdD207Predictor8x8,
-                HighbdD207Predictor16x16,
-                HighbdD207Predictor32x32,
+                HighbdD207Predictor4x4, HighbdD207Predictor8x8, HighbdD207Predictor16x16,
+                HighbdD207Predictor32x32
             },
             new IntraHighPredFn[]
             {
-                HighbdD63Predictor4x4,
-                HighbdD63Predictor8x8,
-                HighbdD63Predictor16x16,
-                HighbdD63Predictor32x32,
+                HighbdD63Predictor4x4, HighbdD63Predictor8x8, HighbdD63Predictor16x16, HighbdD63Predictor32x32
             },
             new IntraHighPredFn[]
             {
-                HighbdTMPredictor4x4,
-                HighbdTMPredictor8x8,
-                HighbdTMPredictor16x16,
-                HighbdTMPredictor32x32,
-            },
+                HighbdTmPredictor4x4, HighbdTmPredictor8x8, HighbdTmPredictor16x16, HighbdTmPredictor32x32
+            }
         };
 
-        private static readonly unsafe IntraHighPredFn[][][] _dcPredHigh = {
+        private static readonly unsafe IntraHighPredFn[][][] DcPredHigh =
+        {
             new[]
             {
                 new IntraHighPredFn[]
                 {
-                    HighbdDc128Predictor4x4,
-                    HighbdDc128Predictor8x8,
-                    HighbdDc128Predictor16x16,
-                    HighbdDc128Predictor32x32,
+                    HighbdDc128Predictor4x4, HighbdDc128Predictor8x8, HighbdDc128Predictor16x16,
+                    HighbdDc128Predictor32x32
                 },
                 new IntraHighPredFn[]
                 {
-                    HighbdDcTopPredictor4x4,
-                    HighbdDcTopPredictor8x8,
-                    HighbdDcTopPredictor16x16,
-                    HighbdDcTopPredictor32x32,
-                },
+                    HighbdDcTopPredictor4x4, HighbdDcTopPredictor8x8, HighbdDcTopPredictor16x16,
+                    HighbdDcTopPredictor32x32
+                }
             },
             new[]
             {
                 new IntraHighPredFn[]
                 {
-                    HighbdDcLeftPredictor4x4,
-                    HighbdDcLeftPredictor8x8,
-                    HighbdDcLeftPredictor16x16,
-                    HighbdDcLeftPredictor32x32,
+                    HighbdDcLeftPredictor4x4, HighbdDcLeftPredictor8x8, HighbdDcLeftPredictor16x16,
+                    HighbdDcLeftPredictor32x32
                 },
                 new IntraHighPredFn[]
                 {
-                    HighbdDcPredictor4x4,
-                    HighbdDcPredictor8x8,
-                    HighbdDcPredictor16x16,
-                    HighbdDcPredictor32x32,
-                },
-            },
+                    HighbdDcPredictor4x4, HighbdDcPredictor8x8, HighbdDcPredictor16x16,
+                    HighbdDcPredictor32x32
+                }
+            }
         };
 
         private static unsafe void BuildIntraPredictorsHigh(
@@ -327,7 +220,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         {
                             for (i = 0; i < bs; ++i)
                             {
-                                leftCol[i] = refr[i * refStride - 1];
+                                leftCol[i] = refr[(i * refStride) - 1];
                             }
                         }
                         else
@@ -335,12 +228,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             int extendBottom = frameHeight - y0;
                             for (i = 0; i < extendBottom; ++i)
                             {
-                                leftCol[i] = refr[i * refStride - 1];
+                                leftCol[i] = refr[(i * refStride) - 1];
                             }
 
                             for (; i < bs; ++i)
                             {
-                                leftCol[i] = refr[(extendBottom - 1) * refStride - 1];
+                                leftCol[i] = refr[((extendBottom - 1) * refStride) - 1];
                             }
                         }
                     }
@@ -349,7 +242,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         /* faster path if the block does not need extension */
                         for (i = 0; i < bs; ++i)
                         {
-                            leftCol[i] = refr[i * refStride - 1];
+                            leftCol[i] = refr[(i * refStride) - 1];
                         }
                     }
                 }
@@ -391,6 +284,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             MemoryUtil.Copy(aboveRow, aboveRef, bs);
                         }
                     }
+
                     aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (ushort)(baseVal + 1);
                 }
                 else
@@ -409,7 +303,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     if (xd.MbToRightEdge < 0)
                     {
                         /* slower path if the block needs border extension */
-                        if (x0 + 2 * bs <= frameWidth)
+                        if (x0 + (2 * bs) <= frameWidth)
                         {
                             if (rightAvailable != 0 && bs == 4)
                             {
@@ -427,7 +321,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             if (rightAvailable != 0 && bs == 4)
                             {
                                 MemoryUtil.Copy(aboveRow, aboveRef, r);
-                                MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth);
+                                MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + (2 * bs) - frameWidth);
                             }
                             else
                             {
@@ -439,8 +333,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         {
                             int r = frameWidth - x0;
                             MemoryUtil.Copy(aboveRow, aboveRef, r);
-                            MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth);
+                            MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + (2 * bs) - frameWidth);
                         }
+
                         aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (ushort)(baseVal + 1);
                     }
                     else
@@ -476,11 +371,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // Predict
             if (mode == PredictionMode.DcPred)
             {
-                _dcPredHigh[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd);
+                DcPredHigh[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd);
             }
             else
             {
-                _predHigh[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd);
+                PredHigh[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd);
             }
         }
 
@@ -544,7 +439,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         {
                             for (i = 0; i < bs; ++i)
                             {
-                                leftCol[i] = refr[i * refStride - 1];
+                                leftCol[i] = refr[(i * refStride) - 1];
                             }
                         }
                         else
@@ -552,12 +447,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             int extendBottom = frameHeight - y0;
                             for (i = 0; i < extendBottom; ++i)
                             {
-                                leftCol[i] = refr[i * refStride - 1];
+                                leftCol[i] = refr[(i * refStride) - 1];
                             }
 
                             for (; i < bs; ++i)
                             {
-                                leftCol[i] = refr[(extendBottom - 1) * refStride - 1];
+                                leftCol[i] = refr[((extendBottom - 1) * refStride) - 1];
                             }
                         }
                     }
@@ -566,7 +461,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         /* Faster path if the block does not need extension */
                         for (i = 0; i < bs; ++i)
                         {
-                            leftCol[i] = refr[i * refStride - 1];
+                            leftCol[i] = refr[(i * refStride) - 1];
                         }
                     }
                 }
@@ -608,6 +503,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             MemoryUtil.Copy(aboveRow, aboveRef, bs);
                         }
                     }
+
                     aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (byte)129;
                 }
                 else
@@ -626,7 +522,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     if (xd.MbToRightEdge < 0)
                     {
                         /* Slower path if the block needs border extension */
-                        if (x0 + 2 * bs <= frameWidth)
+                        if (x0 + (2 * bs) <= frameWidth)
                         {
                             if (rightAvailable != 0 && bs == 4)
                             {
@@ -644,7 +540,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             if (rightAvailable != 0 && bs == 4)
                             {
                                 MemoryUtil.Copy(aboveRow, aboveRef, r);
-                                MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth);
+                                MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + (2 * bs) - frameWidth);
                             }
                             else
                             {
@@ -656,7 +552,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                         {
                             int r = frameWidth - x0;
                             MemoryUtil.Copy(aboveRow, aboveRef, r);
-                            MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth);
+                            MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + (2 * bs) - frameWidth);
                         }
                     }
                     else
@@ -679,6 +575,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                             }
                         }
                     }
+
                     aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (byte)129;
                 }
                 else
@@ -691,11 +588,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // Predict
             if (mode == PredictionMode.DcPred)
             {
-                _dcPred[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol);
+                DcPred[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol);
             }
             else
             {
-                _pred[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol);
+                Pred[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol);
             }
         }
 
@@ -716,7 +613,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             int txw = 1 << (int)txSize;
             int haveTop = loff != 0 || !xd.AboveMi.IsNull ? 1 : 0;
             int haveLeft = aoff != 0 || !xd.LeftMi.IsNull ? 1 : 0;
-            int haveRight = (aoff + txw) < bw ? 1 : 0;
+            int haveRight = aoff + txw < bw ? 1 : 0;
             int x = aoff * 4;
             int y = loff * 4;
 
@@ -736,9 +633,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                     x,
                     y,
                     plane);
-
                 return;
             }
+
             BuildIntraPredictors(
                 ref xd,
                 refr,
@@ -755,4 +652,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
                 plane);
         }
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs

@@ -8,4 +8,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         public ArrayPtr<byte> Data;
         public int Size;
     }
-}
+}

+ 12 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs

@@ -12,9 +12,20 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
         public int BufEnd;
         public Reader BitReader;
         public Vp9BackwardUpdates Counts;
+
         public MacroBlockD Xd;
+
         /* dqcoeff are shared by all the planes. So planes must be decoded serially */
         public Array32<Array32<int>> Dqcoeff;
         public InternalErrorInfo ErrorInfo;
+
+        // Returns a context index derived from the above/left segmentation contexts
+        // stored in Xd for the position (miRow, miCol).
+        // Bit number bsl is extracted from each context entry; the result is
+        // (left * 2) + above, offset by bsl * Constants.PartitionPloffset.
+        public int DecPartitionPlaneContext(int miRow, int miCol, int bsl)
+        {
+            // Above context is indexed by column; left context by the row masked with Constants.MiMask.
+            ref sbyte aboveCtx = ref Xd.AboveSegContext[miCol];
+            ref sbyte leftCtx = ref Xd.LeftSegContext[miRow & Constants.MiMask];
+            // Each of above/left is a single bit (0 or 1).
+            int above = (aboveCtx >> bsl) & 1, left = (leftCtx >> bsl) & 1;
+
+            return (left * 2) + above + (bsl * Constants.PartitionPloffset);
+        }
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs

@@ -7,4 +7,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public PredictionMode Mode;
         public Array2<Mv> Mv; // First, second inter predictor motion vectors
     }
-}
+}

+ 11 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Types/BitstreamProfile.cs

@@ -0,0 +1,11 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Bitstream profile identifiers. Values are assigned implicitly, so
+    // Profile0..Profile3 are 0..3.
+    public enum BitstreamProfile
+    {
+        Profile0,
+        Profile1,
+        Profile2,
+        Profile3,
+        // Follows the last profile, so its value (4) equals the number of profiles above.
+        MaxProfiles
+    }
+}

+ 17 - 17
src/Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs

@@ -1,21 +1,21 @@
-namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
     internal enum BlockSize
     {
-        Block4x4 = 0,
-        Block4x8 = 1,
-        Block8x4 = 2,
-        Block8x8 = 3,
-        Block8x16 = 4,
-        Block16x8 = 5,
-        Block16x16 = 6,
-        Block16x32 = 7,
-        Block32x16 = 8,
-        Block32x32 = 9,
-        Block32x64 = 10,
-        Block64x32 = 11,
-        Block64x64 = 12,
-        BlockSizes = 13,
-        BlockInvalid = BlockSizes,
+        Block4x4,
+        Block4x8,
+        Block8x4,
+        Block8x8,
+        Block8x16,
+        Block16x8,
+        Block16x16,
+        Block16x32,
+        Block32x16,
+        Block32x32,
+        Block32x64,
+        Block64x32,
+        Block64x64,
+        BlockSizes,
+        BlockInvalid = BlockSizes
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs

@@ -7,4 +7,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public ArrayPtr<byte> Buf;
         public int Stride;
     }
-}
+}

+ 18 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Types/BufferPool.cs

@@ -0,0 +1,18 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Groups the frame buffer storage used by the decoder: a set of reference-counted
+    // frame buffers, the internally allocated frame buffer list, and a pointer to the
+    // private data handed to frame buffer callbacks.
+    internal struct BufferPool
+    {
+        // Private data associated with the frame buffer callbacks.
+        public Ptr<InternalFrameBufferList> CbPriv;
+
+        // The two callback members below are kept only as a commented-out reference;
+        // this struct declares no callback fields.
+        // vpx_get_frame_buffer_cb_fn_t get_fb_cb;
+        // vpx_release_frame_buffer_cb_fn_t release_fb_cb;
+
+        // Reference-counted frame buffers (presumably 12 slots, per the Array12 type - confirm).
+        public Array12<RefCntBuffer> FrameBufs;
+
+        // Frame buffers allocated internally by the codec.
+        public InternalFrameBufferList IntFrameBuffers;
+    }
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs

@@ -5,4 +5,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         KeyFrame = 0,
         InterFrame = 1,
     }
-}
+}

+ 14 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs

@@ -23,5 +23,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
         public ArrayPtr<LoopFilterMask> Lfm;
         public int LfmStride;
+
+        // Resets the loop filter delta state to its default values:
+        // delta adjustment is enabled and flagged as updated, the per-reference
+        // deltas are +1 (intra), 0 (last), -1 (golden) and -1 (alt-ref), and both
+        // mode deltas are 0.
+        public void SetDefaultLfDeltas()
+        {
+            ModeRefDeltaEnabled = true;
+            ModeRefDeltaUpdate = true;
+
+            // Per-reference-frame deltas, indexed by the reference frame constants.
+            RefDeltas[Constants.IntraFrame] = 1;
+            RefDeltas[Constants.LastFrame] = 0;
+            RefDeltas[Constants.GoldenFrame] = -1;
+            RefDeltas[Constants.AltRefFrame] = -1;
+            // Per-mode deltas.
+            ModeDeltas[0] = 0;
+            ModeDeltas[1] = 0;
+        }
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs

@@ -7,4 +7,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public Array64<LoopFilterThresh> Lfthr;
         public Array8<Array4<Array2<byte>>> Lvl;
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs

@@ -21,4 +21,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public ushort Int4x4Uv;
         public Array64<byte> LflY;
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs

@@ -12,4 +12,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public Array16<byte> HevThr;
 #pragma warning restore CS0649
     }
-}
+}

+ 44 - 33
src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs

@@ -1,4 +1,5 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
 using Ryujinx.Graphics.Video;
 
 namespace Ryujinx.Graphics.Nvdec.Vp9.Types
@@ -54,7 +55,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
         public Ptr<InternalErrorInfo> ErrorInfo;
 
-        public readonly int GetPredContextSegId()
+        public int GetPredContextSegId()
         {
             sbyte aboveSip = !AboveMi.IsNull ? AboveMi.Value.SegIdPredicted : (sbyte)0;
             sbyte leftSip = !LeftMi.IsNull ? LeftMi.Value.SegIdPredicted : (sbyte)0;
@@ -62,15 +63,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             return aboveSip + leftSip;
         }
 
-        public readonly int GetSkipContext()
+        public int GetSkipContext()
         {
             int aboveSkip = !AboveMi.IsNull ? AboveMi.Value.Skip : 0;
             int leftSkip = !LeftMi.IsNull ? LeftMi.Value.Skip : 0;
-
             return aboveSkip + leftSkip;
         }
 
-        public readonly int GetPredContextSwitchableInterp()
+        public int GetPredContextSwitchableInterp()
         {
             // Note:
             // The mode info data structure has a one element border above and to the
@@ -83,18 +83,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             {
                 return leftType;
             }
-            else if (leftType == Constants.SwitchableFilters)
+
+            if (leftType == Constants.SwitchableFilters)
             {
                 return aboveType;
             }
-            else if (aboveType == Constants.SwitchableFilters)
+
+            if (aboveType == Constants.SwitchableFilters)
             {
                 return leftType;
             }
-            else
-            {
-                return Constants.SwitchableFilters;
-            }
+
+            return Constants.SwitchableFilters;
         }
 
         // The mode info data structure has a one element border above and to the
@@ -104,20 +104,22 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         // 1 - intra/inter, inter/intra
         // 2 - intra/--, --/intra
         // 3 - intra/intra
-        public readonly int GetIntraInterContext()
+        public int GetIntraInterContext()
         {
             if (!AboveMi.IsNull && !LeftMi.IsNull)
-            { // Both edges available
+            {
+                // Both edges available
                 bool aboveIntra = !AboveMi.Value.IsInterBlock();
                 bool leftIntra = !LeftMi.Value.IsInterBlock();
-
-                return leftIntra && aboveIntra ? 3 : (leftIntra || aboveIntra ? 1 : 0);
+                return leftIntra && aboveIntra ? 3 : leftIntra || aboveIntra ? 1 : 0;
             }
 
             if (!AboveMi.IsNull || !LeftMi.IsNull)
-            { // One edge available
+            {
+                // One edge available
                 return 2 * (!(!AboveMi.IsNull ? AboveMi.Value : LeftMi.Value).IsInterBlock() ? 1 : 0);
             }
+
             return 0;
         }
 
@@ -125,11 +127,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         // The mode info data structure has a one element border above and to the
         // left of the entries corresponding to real blocks.
         // The prediction flags in these dummy entries are initialized to 0.
-        public readonly int GetTxSizeContext()
+        public int GetTxSizeContext()
         {
             int maxTxSize = (int)Luts.MaxTxSizeLookup[(int)Mi[0].Value.SbType];
-            int aboveCtx = (!AboveMi.IsNull && AboveMi.Value.Skip == 0) ? (int)AboveMi.Value.TxSize : maxTxSize;
-            int leftCtx = (!LeftMi.IsNull && LeftMi.Value.Skip == 0) ? (int)LeftMi.Value.TxSize : maxTxSize;
+            int aboveCtx = !AboveMi.IsNull && AboveMi.Value.Skip == 0 ? (int)AboveMi.Value.TxSize : maxTxSize;
+            int leftCtx = !LeftMi.IsNull && LeftMi.Value.Skip == 0 ? (int)LeftMi.Value.TxSize : maxTxSize;
             if (LeftMi.IsNull)
             {
                 leftCtx = aboveCtx;
@@ -140,14 +142,12 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
                 aboveCtx = leftCtx;
             }
 
-            return (aboveCtx + leftCtx) > maxTxSize ? 1 : 0;
+            return aboveCtx + leftCtx > maxTxSize ? 1 : 0;
         }
 
         public void SetupBlockPlanes(int ssX, int ssY)
         {
-            int i;
-
-            for (i = 0; i < Constants.MaxMbPlane; i++)
+            for (int i = 0; i < Constants.MaxMbPlane; i++)
             {
                 Plane[i].SubsamplingX = i != 0 ? ssX : 0;
                 Plane[i].SubsamplingY = i != 0 ? ssY : 0;
@@ -158,25 +158,36 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         {
             int aboveIdx = miCol * 2;
             int leftIdx = (miRow * 2) & 15;
-            int i;
-            for (i = 0; i < Constants.MaxMbPlane; ++i)
+
+            for (int i = 0; i < Constants.MaxMbPlane; ++i)
             {
                 ref MacroBlockDPlane pd = ref Plane[i];
                 pd.AboveContext = AboveContext[i].Slice(aboveIdx >> pd.SubsamplingX);
-                pd.LeftContext = new ArrayPtr<sbyte>(ref LeftContext[i][leftIdx >> pd.SubsamplingY], 16 - (leftIdx >> pd.SubsamplingY));
+                pd.LeftContext = new ArrayPtr<sbyte>(ref LeftContext[i][leftIdx >> pd.SubsamplingY],
+                    16 - (leftIdx >> pd.SubsamplingY));
             }
         }
 
         internal void SetMiRowCol(ref TileInfo tile, int miRow, int bh, int miCol, int bw, int miRows, int miCols)
         {
-            MbToTopEdge = -((miRow * Constants.MiSize) * 8);
-            MbToBottomEdge = ((miRows - bh - miRow) * Constants.MiSize) * 8;
-            MbToLeftEdge = -((miCol * Constants.MiSize) * 8);
-            MbToRightEdge = ((miCols - bw - miCol) * Constants.MiSize) * 8;
+            MbToTopEdge = -(miRow * Constants.MiSize * 8);
+            MbToBottomEdge = (miRows - bh - miRow) * Constants.MiSize * 8;
+            MbToLeftEdge = -(miCol * Constants.MiSize * 8);
+            MbToRightEdge = (miCols - bw - miCol) * Constants.MiSize * 8;
 
             // Are edges available for intra prediction?
-            AboveMi = (miRow != 0) ? Mi[-MiStride] : Ptr<ModeInfo>.Null;
-            LeftMi = (miCol > tile.MiColStart) ? Mi[-1] : Ptr<ModeInfo>.Null;
+            AboveMi = miRow != 0 ? Mi[-MiStride] : Ptr<ModeInfo>.Null;
+            LeftMi = miCol > tile.MiColStart ? Mi[-1] : Ptr<ModeInfo>.Null;
+        }
+
+        // Clears the above and left contexts of every plane (all Constants.MaxMbPlane of them).
+        // For each plane, AboveContext is filled with 0 using N4W as the length argument and
+        // LeftContext is filled with 0 using N4H as the length argument.
+        // NOTE(review): assumes MemoryUtil.Fill takes (pointer, value, element count) - confirm.
+        public unsafe void DecResetSkipContext()
+        {
+            for (int i = 0; i < Constants.MaxMbPlane; i++)
+            {
+                ref MacroBlockDPlane pd = ref Plane[i];
+                MemoryUtil.Fill(pd.AboveContext.ToPointer(), (sbyte)0, pd.N4W);
+                MemoryUtil.Fill(pd.LeftContext.ToPointer(), (sbyte)0, pd.N4H);
+            }
         }
     }
-}
+}

+ 2 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs

@@ -15,7 +15,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
         // Number of 4x4s in current block
         public ushort N4W, N4H;
+
         // Log2 of N4W, N4H
         public byte N4Wl, N4Hl;
     }
-}
+}

+ 49 - 8
src/Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using System.Diagnostics;
 
 namespace Ryujinx.Graphics.Nvdec.Vp9.Types
@@ -32,11 +32,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             return SbType < BlockSize.Block8x8 ? Bmi[block].Mode : Mode;
         }
 
-        public readonly TxSize GetUvTxSize(ref MacroBlockDPlane pd)
+        public TxSize GetUvTxSize(ref MacroBlockDPlane pd)
         {
             Debug.Assert(SbType < BlockSize.Block8x8 ||
-                Luts.SsSizeLookup[(int)SbType][pd.SubsamplingX][pd.SubsamplingY] != BlockSize.BlockInvalid);
-
+                         Luts.SsSizeLookup[(int)SbType][pd.SubsamplingX][pd.SubsamplingY] != BlockSize.BlockInvalid);
             return Luts.UvTxsizeLookup[(int)SbType][(int)TxSize][pd.SubsamplingX][pd.SubsamplingY];
         }
 
@@ -50,8 +49,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             return RefFrame[1] > Constants.IntraFrame;
         }
 
-        private static readonly int[][] _idxNColumnToSubblock = {
-            new[] { 1, 2 }, new[] { 1, 3 }, new[] { 3, 2 }, new[] { 3, 3 },
+        private static readonly int[][] IdxNColumnToSubblock =
+        {
+            new[] { 1, 2 }, new[] { 1, 3 }, new[] { 3, 2 }, new[] { 3, 3 }
         };
 
         // This function returns either the appropriate sub block or block's mv
@@ -59,8 +59,49 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public Mv GetSubBlockMv(int whichMv, int searchCol, int blockIdx)
         {
             return blockIdx >= 0 && SbType < BlockSize.Block8x8
-                ? Bmi[_idxNColumnToSubblock[blockIdx][searchCol == 0 ? 1 : 0]].Mv[whichMv]
+                ? Bmi[IdxNColumnToSubblock[blockIdx][searchCol == 0 ? 1 : 0]].Mv[whichMv]
                 : Mv[whichMv];
         }
+
+        // Builds a motion vector from the four sub-block entries Bmi[0..3].
+        // For the motion vector selected by idx, the Row values of all four entries are
+        // summed and passed through ReconInter.RoundMvCompQ4; Col is handled the same way.
+        // NOTE(review): RoundMvCompQ4 is presumably a rounded divide-by-4 - confirm in ReconInter.
+        public Mv MvPredQ4(int idx)
+        {
+            Mv res = new()
+            {
+                Row = (short)ReconInter.RoundMvCompQ4(
+                    Bmi[0].Mv[idx].Row + Bmi[1].Mv[idx].Row +
+                    Bmi[2].Mv[idx].Row + Bmi[3].Mv[idx].Row),
+                Col = (short)ReconInter.RoundMvCompQ4(
+                    Bmi[0].Mv[idx].Col + Bmi[1].Mv[idx].Col +
+                    Bmi[2].Mv[idx].Col + Bmi[3].Mv[idx].Col)
+            };
+            return res;
+        }
+
+        // Builds a motion vector from two sub-block entries, Bmi[block0] and Bmi[block1].
+        // For the motion vector selected by idx, the two Row values are summed and passed
+        // through ReconInter.RoundMvCompQ2; Col is handled the same way.
+        // NOTE(review): RoundMvCompQ2 is presumably a rounded divide-by-2 - confirm in ReconInter.
+        public Mv MvPredQ2(int idx, int block0, int block1)
+        {
+            Mv res = new()
+            {
+                Row = (short)ReconInter.RoundMvCompQ2(
+                    Bmi[block0].Mv[idx].Row +
+                    Bmi[block1].Mv[idx].Row),
+                Col = (short)ReconInter.RoundMvCompQ2(
+                    Bmi[block0].Mv[idx].Col +
+                    Bmi[block1].Mv[idx].Col)
+            };
+            return res;
+        }
+
+        // Performs mv sign inversion if indicated by the reference frame combination.
+        // Reads Mv[refr] into a local and returns it. When the sign bias entry for this
+        // block's RefFrame[refr] differs from the entry for thisRefFrame, both Row and Col
+        // of the local are negated before it is returned.
+        public Mv ScaleMv(int refr, sbyte thisRefFrame, ref Array4<sbyte> refSignBias)
+        {
+            Mv mv = Mv[refr];
+            // Differing sign bias between the two reference frames triggers the inversion.
+            if (refSignBias[RefFrame[refr]] != refSignBias[thisRefFrame])
+            {
+                mv.Row *= -1;
+                mv.Col *= -1;
+            }
+
+            return mv;
+        }
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs

@@ -11,4 +11,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         BothIntra = 6,
         InvalidCase = 9,
     }
-}
+}

+ 80 - 85
src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Video;
 using System;
 using System.Diagnostics;
@@ -12,96 +12,86 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
         private static ReadOnlySpan<byte> LogInBase2 => new byte[]
         {
-            0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
-            4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
-            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
-            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
-            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
-            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10,
+            0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6,
+            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 10
         };
 
-        public readonly bool UseMvHp()
+        public bool UseHp()
         {
-            const int KMvRefThresh = 64; // Threshold for use of high-precision 1/8 mv
-            return Math.Abs(Row) < KMvRefThresh && Math.Abs(Col) < KMvRefThresh;
+            const int kMvRefThresh = 64; // Threshold for use of high-precision 1/8 mv
+            return Math.Abs(Row) < kMvRefThresh && Math.Abs(Col) < kMvRefThresh;
         }
 
-        public static bool MvJointVertical(MvJointType type)
+        public static bool JointVertical(MvJointType type)
         {
-            return type == MvJointType.MvJointHzvnz || type == MvJointType.MvJointHnzvnz;
+            return type == MvJointType.Hzvnz || type == MvJointType.Hnzvnz;
         }
 
-        public static bool MvJointHorizontal(MvJointType type)
+        public static bool JointHorizontal(MvJointType type)
         {
-            return type == MvJointType.MvJointHnzvz || type == MvJointType.MvJointHnzvnz;
+            return type == MvJointType.Hnzvz || type == MvJointType.Hnzvnz;
         }
 
-        private static int MvClassBase(MvClassType c)
+        private static int ClassBase(MvClassType c)
         {
             return c != 0 ? Constants.Class0Size << ((int)c + 2) : 0;
         }
 
-        private static MvClassType GetMvClass(int z, Ptr<int> offset)
+        private static MvClassType GetClass(int z, Ptr<int> offset)
         {
-            MvClassType c = (z >= Constants.Class0Size * 4096) ? MvClassType.MvClass10 : (MvClassType)LogInBase2[z >> 3];
+            MvClassType c = z >= Constants.Class0Size * 4096 ? MvClassType.Class10 : (MvClassType)LogInBase2[z >> 3];
             if (!offset.IsNull)
             {
-                offset.Value = z - MvClassBase(c);
+                offset.Value = z - ClassBase(c);
             }
 
             return c;
         }
 
-        private static void IncMvComponent(int v, ref Vp9BackwardUpdates counts, int comp, int incr, int usehp)
+        private static void IncComponent(int v, ref Vp9BackwardUpdates counts, int comp, int incr, int usehp)
         {
-            int s, z, c, o = 0, d, e, f;
+            int o = 0;
             Debug.Assert(v != 0); /* Should not be zero */
-            s = v < 0 ? 1 : 0;
+            int s = v < 0 ? 1 : 0;
             counts.Sign[comp][s] += (uint)incr;
-            z = (s != 0 ? -v : v) - 1; /* Magnitude - 1 */
+            int z = (s != 0 ? -v : v) - 1 /* Magnitude - 1 */;
 
-            c = (int)GetMvClass(z, new Ptr<int>(ref o));
+            int c = (int)GetClass(z, new Ptr<int>(ref o));
             counts.Classes[comp][c] += (uint)incr;
 
-            d = (o >> 3);     /* Int mv data */
-            f = (o >> 1) & 3; /* Fractional pel mv data */
-            e = (o & 1);      /* High precision mv data */
+            int d = o >> 3 /* Int mv data */;
+            int f = (o >> 1) & 3 /* Fractional pel mv data */;
+            int e = o & 1 /* High precision mv data */;
 
-            if (c == (int)MvClassType.MvClass0)
+            if (c == (int)MvClassType.Class0)
             {
                 counts.Class0[comp][d] += (uint)incr;
                 counts.Class0Fp[comp][d][f] += (uint)incr;
@@ -109,11 +99,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             }
             else
             {
-                int i;
                 int b = c + Constants.Class0Bits - 1; // Number of bits
-                for (i = 0; i < b; ++i)
+                for (int i = 0; i < b; ++i)
                 {
-                    counts.Bits[comp][i][((d >> i) & 1)] += (uint)incr;
+                    counts.Bits[comp][i][(d >> i) & 1] += (uint)incr;
                 }
 
                 counts.Fp[comp][f] += (uint)incr;
@@ -121,56 +110,56 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             }
         }
 
-        private readonly MvJointType GetMvJoint()
+        public MvJointType GetJoint() // Classifies this vector by which of Row and Col are zero.
         {
             if (Row == 0)
             {
-                return Col == 0 ? MvJointType.MvJointZero : MvJointType.MvJointHnzvz;
+                return Col == 0 ? MvJointType.Zero : MvJointType.Hnzvz; // Row is zero here: either the zero vector or a vector with only Col set.
             }
 
-            return Col == 0 ? MvJointType.MvJointHzvnz : MvJointType.MvJointHnzvnz;
+            return Col == 0 ? MvJointType.Hzvnz : MvJointType.Hnzvnz; // Row is nonzero here: only Row set, or both components set.
         }
 
-        internal readonly void IncMv(Ptr<Vp9BackwardUpdates> counts)
+        internal void Inc(Ptr<Vp9BackwardUpdates> counts) // Records this vector in counts: its joint type first, then Row and/or Col as selected by that joint. A null counts pointer makes this a no-op.
         {
             if (!counts.IsNull)
             {
-                MvJointType j = GetMvJoint();
+                MvJointType j = GetJoint(); // The joint type is also used as the index into counts.Value.Joints.
                 ++counts.Value.Joints[(int)j];
 
-                if (MvJointVertical(j))
+                if (JointVertical(j))
                 {
-                    IncMvComponent(Row, ref counts.Value, 0, 1, 1);
+                    IncComponent(Row, ref counts.Value, 0, 1, 1); // comp = 0 (Row), incr = 1, usehp = 1.
                 }
 
-                if (MvJointHorizontal(j))
+                if (JointHorizontal(j))
                 {
-                    IncMvComponent(Col, ref counts.Value, 1, 1, 1);
+                    IncComponent(Col, ref counts.Value, 1, 1, 1); // comp = 1 (Col), incr = 1, usehp = 1.
                 }
             }
         }
 
-        public void ClampMv(int minCol, int maxCol, int minRow, int maxRow)
+        public void Clamp(int minCol, int maxCol, int minRow, int maxRow) // Clamps Col to [minCol, maxCol] and Row to [minRow, maxRow] in place; results are narrowed back to short.
         {
             Col = (short)Math.Clamp(Col, minCol, maxCol);
             Row = (short)Math.Clamp(Row, minRow, maxRow);
         }
 
-        private const int MvBorder = (16 << 3); // Allow 16 pels in 1/8th pel units
+        private const int Border = 16 << 3; // Allow 16 pels in 1/8th pel units
 
-        public void ClampMvRef(ref MacroBlockD xd)
+        public void ClampRef(ref MacroBlockD xd) // Clamps this vector to the four edge distances held in xd, each extended outward by Border.
         {
-            ClampMv(
-                xd.MbToLeftEdge - MvBorder,
-                xd.MbToRightEdge + MvBorder,
-                xd.MbToTopEdge - MvBorder,
-                xd.MbToBottomEdge + MvBorder);
+            Clamp( // Argument order is minCol, maxCol, minRow, maxRow.
+                xd.MbToLeftEdge - Border,
+                xd.MbToRightEdge + Border,
+                xd.MbToTopEdge - Border,
+                xd.MbToBottomEdge + Border);
         }
 
-        public void LowerMvPrecision(bool allowHP)
+        public void LowerPrecision(bool allowHp)
         {
-            bool useHP = allowHP && UseMvHp();
-            if (!useHP)
+            bool useHp = allowHp && UseHp();
+            if (!useHp)
             {
                 if ((Row & 1) != 0)
                 {
@@ -183,5 +172,11 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
                 }
             }
         }
+
+        public bool IsValid() // True when Row and Col each lie strictly between Constants.MvLow and Constants.MvUpp.
+        {
+            return Row is > Constants.MvLow and < Constants.MvUpp &&
+                   Col is > Constants.MvLow and < Constants.MvUpp; // Both bounds are exclusive for both components.
+        }
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs

@@ -5,4 +5,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public int Row;
         public int Col;
     }
-}
+}

+ 12 - 12
src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs

@@ -2,16 +2,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
     internal enum MvClassType
     {
-        MvClass0 = 0,   /* (0, 2]     integer pel */
-        MvClass1 = 1,   /* (2, 4]     integer pel */
-        MvClass2 = 2,   /* (4, 8]     integer pel */
-        MvClass3 = 3,   /* (8, 16]    integer pel */
-        MvClass4 = 4,   /* (16, 32]   integer pel */
-        MvClass5 = 5,   /* (32, 64]   integer pel */
-        MvClass6 = 6,   /* (64, 128]  integer pel */
-        MvClass7 = 7,   /* (128, 256] integer pel */
-        MvClass8 = 8,   /* (256, 512] integer pel */
-        MvClass9 = 9,   /* (512, 1024] integer pel */
-        MvClass10 = 10, /* (1024,2048] integer pel */
+        Class0, /* (0, 2]     integer pel */
+        Class1, /* (2, 4]     integer pel */
+        Class2, /* (4, 8]     integer pel */
+        Class3, /* (8, 16]    integer pel */
+        Class4, /* (16, 32]   integer pel */
+        Class5, /* (32, 64]   integer pel */
+        Class6, /* (64, 128]  integer pel */
+        Class7, /* (128, 256] integer pel */
+        Class8, /* (256, 512] integer pel */
+        Class9, /* (512, 1024] integer pel */
+        Class10 /* (1024,2048] integer pel */
     }
-}
+}

+ 5 - 5
src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs

@@ -2,9 +2,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
     internal enum MvJointType
     {
-        MvJointZero = 0,   /* Zero vector */
-        MvJointHnzvz = 1,  /* Vert zero, hor nonzero */
-        MvJointHzvnz = 2,  /* Hor zero, vert nonzero */
-        MvJointHnzvnz = 3, /* Both components nonzero */
+        Zero, /* Zero vector */
+        Hnzvz, /* Vert zero, hor nonzero */
+        Hzvnz, /* Hor zero, vert nonzero */
+        Hnzvnz /* Both components nonzero */
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs

@@ -7,4 +7,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public Array2<Mv> Mv;
         public Array2<sbyte> RefFrame;
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs

@@ -9,4 +9,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         PartitionTypes,
         PartitionInvalid = PartitionTypes,
     }
-}
+}

+ 5 - 5
src/Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs

@@ -1,9 +1,9 @@
-namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
     internal enum PlaneType
     {
-        Y = 0,
-        Uv = 1,
-        PlaneTypes,
+        Y,
+        Uv,
+        PlaneTypes
     }
-}
+}

+ 1 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs

@@ -11,4 +11,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             Col = col;
         }
     }
-}
+}

+ 17 - 17
src/Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs

@@ -1,21 +1,21 @@
-namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
     internal enum PredictionMode
     {
-        DcPred = 0, // Average of above and left pixels
-        VPred = 1, // Vertical
-        HPred = 2, // Horizontal
-        D45Pred = 3, // Directional 45  deg = round(arctan(1 / 1) * 180 / pi)
-        D135Pred = 4, // Directional 135 deg = 180 - 45
-        D117Pred = 5, // Directional 117 deg = 180 - 63
-        D153Pred = 6, // Directional 153 deg = 180 - 27
-        D207Pred = 7, // Directional 207 deg = 180 + 27
-        D63Pred = 8, // Directional 63  deg = round(arctan(2 / 1) * 180 / pi)
-        TmPred = 9, // True-motion
-        NearestMv = 10,
-        NearMv = 11,
-        ZeroMv = 12,
-        NewMv = 13,
-        MbModeCount = 14,
+        DcPred, // Average of above and left pixels
+        VPred, // Vertical
+        HPred, // Horizontal
+        D45Pred, // Directional 45  deg = round(arctan(1 / 1) * 180 / pi)
+        D135Pred, // Directional 135 deg = 180 - 45
+        D117Pred, // Directional 117 deg = 180 - 63
+        D153Pred, // Directional 153 deg = 180 - 27
+        D207Pred, // Directional 207 deg = 180 + 27
+        D63Pred, // Directional 63  deg = round(arctan(2 / 1) * 180 / pi)
+        TmPred, // True-motion
+        NearestMv,
+        NearMv,
+        ZeroMv,
+        NewMv,
+        MbModeCount
     }
-}
+}

+ 4 - 1
src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs

@@ -2,7 +2,10 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
     internal struct RefBuffer
     {
+        public const int InvalidIdx = -1; // Invalid buffer index.
+
+        public int Idx;
         public Surface Buf;
         public ScaleFactors Sf;
     }
-}
+}

+ 12 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Types/RefCntBuffer.cs

@@ -0,0 +1,12 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    internal struct RefCntBuffer // NOTE(review): presumably one slot of a reference-counted frame buffer pool — confirm against the code that reads RefCount.
+    {
+        public int RefCount;
+        public int MiRows;
+        public int MiCols;
+        public byte Released; // NOTE(review): stored as a byte, presumably used as a flag — confirm with the code that sets it.
+        public VpxCodecFrameBuffer RawFrameBuffer;
+        public Surface Buf;
+    }
+}

+ 6 - 6
src/Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs

@@ -1,10 +1,10 @@
-namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
     internal enum ReferenceMode
     {
-        SingleReference = 0,
-        CompoundReference = 1,
-        ReferenceModeSelect = 2,
-        ReferenceModes = 3,
+        Single,
+        Compound,
+        Select,
+        ReferenceModes
     }
-}
+}

+ 84 - 224
src/Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs

@@ -1,4 +1,4 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using System.Runtime.CompilerServices;
 using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Convolve;
 using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;
@@ -8,7 +8,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
     internal struct ScaleFactors
     {
         private const int RefScaleShift = 14;
-        private const int RefNoScale = (1 << RefScaleShift);
+        private const int RefNoScale = 1 << RefScaleShift;
         private const int RefInvalidScale = -1;
 
         private unsafe delegate void ConvolveFn(
@@ -38,255 +38,114 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             int h,
             int bd);
 
-        private static readonly unsafe ConvolveFn[][][] _predictX16Y16 = {
+        private static readonly unsafe ConvolveFn[][][] PredictX16Y16 =
+        {
             new[]
             {
-                new ConvolveFn[]
-                {
-                    ConvolveCopy,
-                    ConvolveAvg,
-                },
-                new ConvolveFn[]
-                {
-                    Convolve8Vert,
-                    Convolve8AvgVert,
-                },
+                new ConvolveFn[] { ConvolveCopy, ConvolveAvg },
+                new ConvolveFn[] { Convolve8Vert, Convolve8AvgVert }
             },
             new[]
             {
-                new ConvolveFn[]
-                {
-                    Convolve8Horiz,
-                    Convolve8AvgHoriz,
-                },
-                new ConvolveFn[]
-                {
-                    Convolve8,
-                    Convolve8Avg,
-                },
-            },
+                new ConvolveFn[] { Convolve8Horiz, Convolve8AvgHoriz },
+                new ConvolveFn[] { Convolve8, Convolve8Avg }
+            }
         };
 
-        private static readonly unsafe ConvolveFn[][][] _predictX16 = {
-            new[]
-            {
-                new ConvolveFn[]
-                {
-                    ScaledVert,
-                    ScaledAvgVert,
-                },
-                new ConvolveFn[]
-                {
-                    ScaledVert,
-                    ScaledAvgVert,
-                },
-            },
+        private static readonly unsafe ConvolveFn[][][] PredictX16 =
+        {
             new[]
             {
-                new ConvolveFn[]
-                {
-                    Scaled2D,
-                    ScaledAvg2D,
-                },
-                new ConvolveFn[]
-                {
-                    Scaled2D,
-                    ScaledAvg2D,
-                },
+                new ConvolveFn[] { ScaledVert, ScaledAvgVert }, new ConvolveFn[] { ScaledVert, ScaledAvgVert }
             },
+            new[] { new ConvolveFn[] { Scaled2D, ScaledAvg2D }, new ConvolveFn[] { Scaled2D, ScaledAvg2D } }
         };
 
-        private static readonly unsafe ConvolveFn[][][] _predictY16 = {
-            new[]
-            {
-                new ConvolveFn[]
-                {
-                    ScaledHoriz,
-                    ScaledAvgHoriz,
-                },
-                new ConvolveFn[]
-                {
-                    Scaled2D,
-                    ScaledAvg2D,
-                },
-            },
-            new[]
-            {
-                new ConvolveFn[]
-                {
-                    ScaledHoriz,
-                    ScaledAvgHoriz,
-                },
-                new ConvolveFn[]
-                {
-                    Scaled2D,
-                    ScaledAvg2D,
-                },
-            },
+        private static readonly unsafe ConvolveFn[][][] PredictY16 =
+        {
+            new[] { new ConvolveFn[] { ScaledHoriz, ScaledAvgHoriz }, new ConvolveFn[] { Scaled2D, ScaledAvg2D } },
+            new[] { new ConvolveFn[] { ScaledHoriz, ScaledAvgHoriz }, new ConvolveFn[] { Scaled2D, ScaledAvg2D } }
         };
 
-        private static readonly unsafe ConvolveFn[][][] _predict = {
-            new[]
-            {
-                new ConvolveFn[]
-                {
-                    Scaled2D,
-                    ScaledAvg2D,
-                },
-                new ConvolveFn[]
-                {
-                    Scaled2D,
-                    ScaledAvg2D,
-                },
-            },
-            new[]
-            {
-                new ConvolveFn[]
-                {
-                    Scaled2D,
-                    ScaledAvg2D,
-                },
-                new ConvolveFn[]
-                {
-                    Scaled2D,
-                    ScaledAvg2D,
-                },
-            },
+        private static readonly unsafe ConvolveFn[][][] Predict =
+        {
+            new[] { new ConvolveFn[] { Scaled2D, ScaledAvg2D }, new ConvolveFn[] { Scaled2D, ScaledAvg2D } },
+            new[] { new ConvolveFn[] { Scaled2D, ScaledAvg2D }, new ConvolveFn[] { Scaled2D, ScaledAvg2D } }
         };
 
-        private static readonly unsafe HighbdConvolveFn[][][] _highbdPredictX16Y16 = {
+        private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictX16Y16 =
+        {
             new[]
             {
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolveCopy,
-                    HighbdConvolveAvg,
-                },
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8Vert,
-                    HighbdConvolve8AvgVert,
-                },
+                new HighbdConvolveFn[] { HighbdConvolveCopy, HighbdConvolveAvg },
+                new HighbdConvolveFn[] { HighbdConvolve8Vert, HighbdConvolve8AvgVert }
             },
             new[]
             {
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8Horiz,
-                    HighbdConvolve8AvgHoriz,
-                },
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8,
-                    HighbdConvolve8Avg,
-                },
-            },
+                new HighbdConvolveFn[] { HighbdConvolve8Horiz, HighbdConvolve8AvgHoriz },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
+            }
         };
 
-        private static readonly unsafe HighbdConvolveFn[][][] _highbdPredictX16 = {
+        private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictX16 =
+        {
             new[]
             {
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8Vert,
-                    HighbdConvolve8AvgVert,
-                },
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8Vert,
-                    HighbdConvolve8AvgVert,
-                },
+                new HighbdConvolveFn[] { HighbdConvolve8Vert, HighbdConvolve8AvgVert },
+                new HighbdConvolveFn[] { HighbdConvolve8Vert, HighbdConvolve8AvgVert }
             },
             new[]
             {
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8,
-                    HighbdConvolve8Avg,
-                },
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8,
-                    HighbdConvolve8Avg,
-                },
-            },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
+            }
         };
 
-        private static readonly unsafe HighbdConvolveFn[][][] _highbdPredictY16 = {
+        private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictY16 =
+        {
             new[]
             {
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8Horiz,
-                    HighbdConvolve8AvgHoriz,
-                },
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8,
-                    HighbdConvolve8Avg,
-                },
+                new HighbdConvolveFn[] { HighbdConvolve8Horiz, HighbdConvolve8AvgHoriz },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
             },
             new[]
             {
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8Horiz,
-                    HighbdConvolve8AvgHoriz,
-                },
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8,
-                    HighbdConvolve8Avg,
-                },
-            },
+                new HighbdConvolveFn[] { HighbdConvolve8Horiz, HighbdConvolve8AvgHoriz },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
+            }
         };
 
-        private static readonly unsafe HighbdConvolveFn[][][] _highbdPredict = {
+        private static readonly unsafe HighbdConvolveFn[][][] HighbdPredict =
+        {
             new[]
             {
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8,
-                    HighbdConvolve8Avg,
-                },
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8,
-                    HighbdConvolve8Avg,
-                },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
             },
             new[]
             {
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8,
-                    HighbdConvolve8Avg,
-                },
-                new HighbdConvolveFn[]
-                {
-                    HighbdConvolve8,
-                    HighbdConvolve8Avg,
-                },
-            },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
+            }
         };
 
-        public int XScaleFP; // Horizontal fixed point scale factor
-        public int YScaleFP; // Vertical fixed point scale factor
+        public int XScaleFp; // Horizontal fixed point scale factor
+        public int YScaleFp; // Vertical fixed point scale factor
         public int XStepQ4;
         public int YStepQ4;
 
-        public readonly int ScaleValueX(int val)
+        public int ScaleValueX(int val) // Returns ScaledX(val) when IsScaled() holds; otherwise returns val unchanged.
         {
             return IsScaled() ? ScaledX(val) : val;
         }
 
-        public readonly int ScaleValueY(int val)
+        public int ScaleValueY(int val) // Returns ScaledY(val) when IsScaled() holds; otherwise returns val unchanged.
         {
             return IsScaled() ? ScaledY(val) : val;
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public readonly unsafe void InterPredict(
+        public unsafe void InterPredict(
             int horiz,
             int vert,
             int avg,
@@ -307,12 +166,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
                 if (YStepQ4 == 16)
                 {
                     // No scaling in either direction.
-                    _predictX16Y16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h);
+                    PredictX16Y16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w,
+                        h);
                 }
                 else
                 {
                     // No scaling in x direction. Must always scale in the y direction.
-                    _predictX16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h);
+                    PredictX16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w,
+                        h);
                 }
             }
             else
@@ -320,18 +181,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
                 if (YStepQ4 == 16)
                 {
                     // No scaling in the y direction. Must always scale in the x direction.
-                    _predictY16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h);
+                    PredictY16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w,
+                        h);
                 }
                 else
                 {
                     // Must always scale in both directions.
-                    _predict[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h);
+                    Predict[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h);
                 }
             }
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public readonly unsafe void HighbdInterPredict(
+        public unsafe void HighbdInterPredict(
             int horiz,
             int vert,
             int avg,
@@ -353,12 +215,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
                 if (YStepQ4 == 16)
                 {
                     // No scaling in either direction.
-                    _highbdPredictX16Y16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd);
+                    HighbdPredictX16Y16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY,
+                        ys, w, h, bd);
                 }
                 else
                 {
                     // No scaling in x direction. Must always scale in the y direction.
-                    _highbdPredictX16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd);
+                    HighbdPredictX16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys,
+                        w, h, bd);
                 }
             }
             else
@@ -366,24 +230,26 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
                 if (YStepQ4 == 16)
                 {
                     // No scaling in the y direction. Must always scale in the x direction.
-                    _highbdPredictY16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd);
+                    HighbdPredictY16[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys,
+                        w, h, bd);
                 }
                 else
                 {
                     // Must always scale in both directions.
-                    _highbdPredict[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd);
+                    HighbdPredict[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w,
+                        h, bd);
                 }
             }
         }
 
-        private readonly int ScaledX(int val)
+        private int ScaledX(int val) // Applies the horizontal fixed-point scale factor XScaleFp to val.
         {
-            return (int)((long)val * XScaleFP >> RefScaleShift);
+            return (int)(((long)val * XScaleFp) >> RefScaleShift); // Multiply in 64-bit arithmetic, then shift out the RefScaleShift fractional bits.
         }
 
-        private readonly int ScaledY(int val)
+        private int ScaledY(int val) // Applies the vertical fixed-point scale factor YScaleFp to val.
         {
-            return (int)((long)val * YScaleFP >> RefScaleShift);
+            return (int)(((long)val * YScaleFp) >> RefScaleShift); // Multiply in 64-bit arithmetic, then shift out the RefScaleShift fractional bits.
         }
 
         private static int GetFixedPointScaleFactor(int otherSize, int thisSize)
@@ -399,23 +265,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         {
             int xOffQ4 = ScaledX(x << SubpelBits) & SubpelMask;
             int yOffQ4 = ScaledY(y << SubpelBits) & SubpelMask;
-            Mv32 res = new()
-            {
-                Row = ScaledY(mv.Row) + yOffQ4,
-                Col = ScaledX(mv.Col) + xOffQ4,
-            };
-
+            Mv32 res = new() { Row = ScaledY(mv.Row) + yOffQ4, Col = ScaledX(mv.Col) + xOffQ4 };
             return res;
         }
 
-        public readonly bool IsValidScale()
+        public bool IsValidScale() // True when neither scale factor holds the RefInvalidScale sentinel.
         {
-            return XScaleFP != RefInvalidScale && YScaleFP != RefInvalidScale;
+            return XScaleFp != RefInvalidScale && YScaleFp != RefInvalidScale; // RefInvalidScale is -1.
         }
 
-        public readonly bool IsScaled()
+        public bool IsScaled() // True when the scale is valid and at least one axis differs from RefNoScale.
         {
-            return IsValidScale() && (XScaleFP != RefNoScale || YScaleFP != RefNoScale);
+            return IsValidScale() && (XScaleFp != RefNoScale || YScaleFp != RefNoScale); // RefNoScale is 1 << RefScaleShift, the fixed-point form of 1.
         }
 
         public static bool ValidRefFrameSize(int refWidth, int refHeight, int thisWidth, int thisHeight)
@@ -430,16 +291,15 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         {
             if (!ValidRefFrameSize(otherW, otherH, thisW, thisH))
             {
-                XScaleFP = RefInvalidScale;
-                YScaleFP = RefInvalidScale;
-
+                XScaleFp = RefInvalidScale;
+                YScaleFp = RefInvalidScale;
                 return;
             }
 
-            XScaleFP = GetFixedPointScaleFactor(otherW, thisW);
-            YScaleFP = GetFixedPointScaleFactor(otherH, thisH);
+            XScaleFp = GetFixedPointScaleFactor(otherW, thisW);
+            YScaleFp = GetFixedPointScaleFactor(otherH, thisH);
             XStepQ4 = ScaledX(16);
             YStepQ4 = ScaledY(16);
         }
     }
-}
+}

+ 7 - 7
src/Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs

@@ -1,11 +1,11 @@
-namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
     internal enum SegLvlFeatures
     {
-        SegLvlAltQ = 0, // Use alternate Quantizer ....
-        SegLvlAltLf = 1, // Use alternate loop filter value...
-        SegLvlRefFrame = 2, // Optional Segment reference frame
-        SegLvlSkip = 3, // Optional Segment (0,0) + skip mode
-        SegLvlMax = 4, // Number of features supported
+        AltQ, // Use alternate Quantizer ....
+        AltLf, // Use alternate loop filter value...
+        RefFrame, // Optional Segment reference frame
+        Skip, // Optional Segment (0,0) + skip mode
+        Max // Number of features supported
     }
-}
+}

+ 102 - 9
src/Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs

@@ -1,4 +1,6 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Video;
+using System;
 using System.Diagnostics;
 using System.Runtime.InteropServices;
 
@@ -6,8 +8,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
     internal struct Segmentation
     {
-        private static readonly int[] _segFeatureDataSigned = { 1, 1, 0, 0 };
-        private static readonly int[] _segFeatureDataMax = { QuantCommon.MaxQ, Vp9.LoopFilter.MaxLoopFilter, 3, 0 };
+        public const int SegmentDeltadata = 0;
+        public const int SegmentAbsdata = 1;
+
+        public const int MaxSegments = 8;
+        public const int SegTreeProbs = MaxSegments - 1;
+
+        public const int PredictionProbs = 3;
+
+        private static readonly int[] SegFeatureDataSigned = { 1, 1, 0, 0 };
+        private static readonly int[] SegFeatureDataMax = { QuantCommon.MaxQ, Vp9.LoopFilter.MaxLoopFilter, 3, 0 };
 
         public bool Enabled;
         public bool UpdateMap;
@@ -38,21 +48,21 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
         internal static int FeatureDataMax(SegLvlFeatures featureId)
         {
-            return _segFeatureDataMax[(int)featureId];
+            return SegFeatureDataMax[(int)featureId]; // Table lookup: the feature id, cast to int, is the index into SegFeatureDataMax.
         }
 
         internal static int IsSegFeatureSigned(SegLvlFeatures featureId)
         {
-            return _segFeatureDataSigned[(int)featureId];
+            return SegFeatureDataSigned[(int)featureId]; // Table lookup returning an int flag (entries are 1 or 0), not a bool.
         }
 
         internal void SetSegData(int segmentId, SegLvlFeatures featureId, int segData)
         {
-            Debug.Assert(segData <= _segFeatureDataMax[(int)featureId]);
+            Debug.Assert(segData <= SegFeatureDataMax[(int)featureId]);
             if (segData < 0)
             {
-                Debug.Assert(_segFeatureDataSigned[(int)featureId] != 0);
-                Debug.Assert(-segData <= _segFeatureDataMax[(int)featureId]);
+                Debug.Assert(SegFeatureDataSigned[(int)featureId] != 0);
+                Debug.Assert(-segData <= SegFeatureDataMax[(int)featureId]);
             }
 
             FeatureData[segmentId][(int)featureId] = (short)segData;
@@ -67,5 +77,88 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         {
             return FeatureData[segmentId][(int)featureId];
         }
+
+        public int GetQIndex(int segmentId, int baseQIndex) // Returns the quantizer index for segmentId: baseQIndex unless the AltQ feature is active for that segment.
+        {
+            if (IsSegFeatureActive(segmentId, SegLvlFeatures.AltQ) != 0)
+            {
+                int data = GetSegData(segmentId, SegLvlFeatures.AltQ);
+                int segQIndex = AbsDelta == Constants.SegmentAbsData ? data : baseQIndex + data; // The AltQ data is used as-is in absolute mode, otherwise added to baseQIndex as a delta.
+                return Math.Clamp(segQIndex, 0, QuantCommon.MaxQ); // Keep the result within [0, QuantCommon.MaxQ].
+            }
+
+            return baseQIndex;
+        }
+
+        public void SetupSegmentation(ref Vp9EntropyProbs fc, ref ReadBitBuffer rb) // Reads the segmentation parameters from rb into this struct, and the segment tree/prediction probabilities into fc.
+        {
+            UpdateMap = false; // Both update flags are reset first, so they stay cleared when segmentation turns out to be disabled.
+            UpdateData = 0;
+
+            Enabled = rb.ReadBit() != 0;
+            if (!Enabled)
+            {
+                return; // Nothing else is read when segmentation is disabled.
+            }
+
+            // Segmentation map update
+            UpdateMap = rb.ReadBit() != 0;
+            if (UpdateMap)
+            {
+                for (int i = 0; i < SegTreeProbs; i++)
+                {
+                    fc.SegTreeProb[i] = rb.ReadBit() != 0 // A flag bit precedes each probability: set means an explicit 8-bit value follows.
+                        ? (byte)rb.ReadLiteral(8)
+                        : (byte)Prob.MaxProb; // Flag clear: fall back to Prob.MaxProb.
+                }
+
+                TemporalUpdate = rb.ReadBit() != 0;
+                if (TemporalUpdate)
+                {
+                    for (int i = 0; i < PredictionProbs; i++)
+                    {
+                        fc.SegPredProb[i] = rb.ReadBit() != 0 // Same flag-then-value encoding as the tree probabilities above.
+                            ? (byte)rb.ReadLiteral(8)
+                            : (byte)Prob.MaxProb;
+                    }
+                }
+                else
+                {
+                    for (int i = 0; i < PredictionProbs; i++)
+                    {
+                        fc.SegPredProb[i] = Prob.MaxProb; // No temporal update: every prediction probability takes Prob.MaxProb and no bits are read.
+                    }
+                }
+            }
+
+            // Segmentation data update
+            UpdateData = (byte)rb.ReadBit();
+            if (UpdateData != 0)
+            {
+                AbsDelta = (byte)rb.ReadBit();
+
+                ClearAllSegFeatures(); // NOTE(review): helper not shown here; presumably resets all feature state before it is re-read below — confirm.
+
+                for (int i = 0; i < Constants.MaxSegments; i++)
+                {
+                    for (int j = 0; j < (int)SegLvlFeatures.Max; j++)
+                    {
+                        int data = 0; // Stays 0 for features whose enable bit is clear.
+                        int featureEnabled = rb.ReadBit();
+                        if (featureEnabled != 0)
+                        {
+                            EnableSegFeature(i, (SegLvlFeatures)j);
+                            data = rb.DecodeUnsignedMax(FeatureDataMax((SegLvlFeatures)j)); // The magnitude is decoded with the feature's maximum passed as the bound.
+                            if (IsSegFeatureSigned((SegLvlFeatures)j) != 0)
+                            {
+                                data = rb.ReadBit() != 0 ? -data : data; // Signed features carry one extra bit; set means negative.
+                            }
+                        }
+
+                        SetSegData(i, (SegLvlFeatures)j, data); // Stored for every segment/feature pair, enabled or not.
+                    }
+                }
+            }
+        }
     }
-}
+}

+ 189 - 33
src/Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs

@@ -1,11 +1,23 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
 using Ryujinx.Graphics.Video;
+using System.Diagnostics;
 using System.Runtime.InteropServices;
 
 namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
+    // Callback through which Surface.ReallocFrameBuffer requests frame storage. It is invoked with the
+    // allocator, the caller's private data and the minimum size required; a negative return value is
+    // treated as failure, and on success the storage is read back from fb.Data.
+    internal delegate int VpxGetFrameBufferCbFnT(MemoryAllocator allocator, Ptr<InternalFrameBufferList> cbPriv,
+        ulong minSize, ref VpxCodecFrameBuffer fb);
+
     internal struct Surface : ISurface
     {
+        public const int Innerborderinpixels = 96;
+        public const int InterpExtend = 4;
+        public const int EncBorderInPixels = 160;
+        public const int DecBorderInPixels = 32;
+
+        public const int Yv12FlagHighbitdepth = 8;
+
         public ArrayPtr<byte> YBuffer;
         public ArrayPtr<byte> UBuffer;
         public ArrayPtr<byte> VBuffer;
@@ -14,43 +26,62 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public readonly unsafe Plane UPlane => new((nint)UBuffer.ToPointer(), UBuffer.Length);
         public readonly unsafe Plane VPlane => new((nint)VBuffer.ToPointer(), VBuffer.Length);
 
-        public readonly FrameField Field => FrameField.Progressive;
+        public FrameField Field => FrameField.Progressive;
+
+        public int Width { get; private set; }
+        public int Height { get; private set; }
+        public int AlignedWidth { get; private set; }
+        public int AlignedHeight { get; private set; }
+        public int Stride { get; private set; }
+        public int UvWidth { get; private set; }
+        public int UvHeight { get; private set; }
+        public int UvAlignedWidth { get; private set; }
+        public int UvAlignedHeight { get; private set; }
+        public int UvStride { get; private set; }
+        public bool HighBd { get; private set; }
 
-        public int Width { get; }
-        public int Height { get; }
-        public int AlignedWidth { get; }
-        public int AlignedHeight { get; }
-        public int Stride { get; }
-        public int UvWidth { get; }
-        public int UvHeight { get; }
-        public int UvAlignedWidth { get; }
-        public int UvAlignedHeight { get; }
-        public int UvStride { get; }
+        public int FrameSize { get; private set; }
+        public int Border { get; private set; }
 
-        public bool HighBd { get; }
+        public int YCropWidth => Width;
+        public int YCropHeight => Height;
+        public int UvCropWidth => UvWidth;
+        public int UvCropHeight => UvHeight;
+
+        public ArrayPtr<byte> BufferAlloc;
+        public int BufferAllocSz;
+        public int SubsamplingX;
+        public int SubsamplingY;
+        public uint BitDepth;
+        public VpxColorSpace ColorSpace;
+        public VpxColorRange ColorRange;
+        public int RenderWidth;
+        public int RenderHeight;
+
+        public int Corrupted;
+        public int Flags;
 
         private readonly nint _pointer;
 
         public Surface(int width, int height)
         {
-            HighBd = false;
-
-            const int Border = 32;
-            const int SsX = 1;
-            const int SsY = 1;
+            const int border = 32;
+            const int ssX = 1;
+            const int ssY = 1;
+            const bool highbd = false;
 
             int alignedWidth = (width + 7) & ~7;
             int alignedHeight = (height + 7) & ~7;
-            int yStride = ((alignedWidth + 2 * Border) + 31) & ~31;
-            int yplaneSize = (alignedHeight + 2 * Border) * yStride;
-            int uvWidth = alignedWidth >> SsX;
-            int uvHeight = alignedHeight >> SsY;
-            int uvStride = yStride >> SsX;
-            int uvBorderW = Border >> SsX;
-            int uvBorderH = Border >> SsY;
-            int uvplaneSize = (uvHeight + 2 * uvBorderH) * uvStride;
+            int yStride = (alignedWidth + (2 * border) + 31) & ~31;
+            int yplaneSize = (alignedHeight + (2 * border)) * yStride;
+            int uvWidth = alignedWidth >> ssX;
+            int uvHeight = alignedHeight >> ssY;
+            int uvStride = yStride >> ssX;
+            int uvBorderW = border >> ssX;
+            int uvBorderH = border >> ssY;
+            int uvplaneSize = (uvHeight + (2 * uvBorderH)) * uvStride;
 
-            int frameSize = (HighBd ? 2 : 1) * (yplaneSize + 2 * uvplaneSize);
+            int frameSize = (highbd ? 2 : 1) * (yplaneSize + (2 * uvplaneSize));
 
             nint pointer = Marshal.AllocHGlobal(frameSize);
             _pointer = pointer;
@@ -59,23 +90,148 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             AlignedWidth = alignedWidth;
             AlignedHeight = alignedHeight;
             Stride = yStride;
-            UvWidth = (width + SsX) >> SsX;
-            UvHeight = (height + SsY) >> SsY;
+            UvWidth = (width + ssX) >> ssX;
+            UvHeight = (height + ssY) >> ssY;
             UvAlignedWidth = uvWidth;
             UvAlignedHeight = uvHeight;
             UvStride = uvStride;
 
-            ArrayPtr<byte> NewPlane(int start, int size, int planeBorder)
+            ArrayPtr<byte> NewPlane(int start, int size, int border)
             {
-                return new ArrayPtr<byte>(pointer + start + planeBorder, size - planeBorder);
+                return new ArrayPtr<byte>(pointer + start + border, size - border);
             }
 
-            YBuffer = NewPlane(0, yplaneSize, (Border * yStride) + Border);
+            YBuffer = NewPlane(0, yplaneSize, (border * yStride) + border);
             UBuffer = NewPlane(yplaneSize, uvplaneSize, (uvBorderH * uvStride) + uvBorderW);
             VBuffer = NewPlane(yplaneSize + uvplaneSize, uvplaneSize, (uvBorderH * uvStride) + uvBorderW);
         }
 
-        public readonly void Dispose()
+        /// <summary>
+        /// Configures this surface for a frame of the given dimensions, obtaining backing storage either
+        /// from <paramref name="cb"/> or from <paramref name="allocator"/>, and points the Y/U/V plane
+        /// views at the first visible pixel of each plane inside that storage.
+        /// </summary>
+        /// <param name="allocator">Allocator used to free/allocate <see cref="BufferAlloc"/>; also forwarded to <paramref name="cb"/>.</param>
+        /// <param name="width">Frame width in pixels; rounded up to a multiple of 8 for the aligned width.</param>
+        /// <param name="height">Frame height in pixels; rounded up to a multiple of 8 for the aligned height.</param>
+        /// <param name="ssX">Right-shift applied to horizontal luma quantities to get the chroma ones.</param>
+        /// <param name="ssY">Right-shift applied to vertical luma quantities to get the chroma ones.</param>
+        /// <param name="useHighbitdepth">When true the frame size is doubled and <see cref="Flags"/> is set to <see cref="Yv12FlagHighbitdepth"/>.</param>
+        /// <param name="border">Border around each plane, in pixels. Must be a multiple of 32.</param>
+        /// <param name="byteAlignment">Extra bytes added to the size of every plane.</param>
+        /// <param name="fb">Frame buffer descriptor handed to <paramref name="cb"/>; must not be null when <paramref name="cb"/> is set.</param>
+        /// <param name="cb">Optional callback that supplies the storage. When null, storage comes from <paramref name="allocator"/>.</param>
+        /// <param name="cbPriv">Private data forwarded to <paramref name="cb"/>.</param>
+        /// <returns>
+        /// 0 on success; -1 when the frame does not fit in an int, the callback fails or returns too little
+        /// storage, or the allocation fails; -3 when <paramref name="border"/> is not a multiple of 32.
+        /// </returns>
+        public unsafe int ReallocFrameBuffer(
+            MemoryAllocator allocator,
+            int width,
+            int height,
+            int ssX,
+            int ssY,
+            bool useHighbitdepth,
+            int border,
+            int byteAlignment,
+            Ptr<VpxCodecFrameBuffer> fb,
+            VpxGetFrameBufferCbFnT cb,
+            Ptr<InternalFrameBufferList> cbPriv)
+        {
+            // byteAlignment only pads the plane sizes below; plane start addresses are not aligned to it.
+            // TODO: Is it safe to ignore the alignment?
+            int alignedWidth = (width + 7) & ~7;
+            int alignedHeight = (height + 7) & ~7;
+            int yStride = (alignedWidth + (2 * border) + 31) & ~31;
+            ulong yplaneSize =
+                ((ulong)(alignedHeight + (2 * border)) * (ulong)yStride) + (ulong)byteAlignment;
+            int uvWidth = alignedWidth >> ssX;
+            int uvHeight = alignedHeight >> ssY;
+            int uvStride = yStride >> ssX;
+            int uvBorderW = border >> ssX;
+            int uvBorderH = border >> ssY;
+            ulong uvplaneSize =
+                ((ulong)(uvHeight + (2 * uvBorderH)) * (ulong)uvStride) + (ulong)byteAlignment;
+
+            ulong frameSize = (ulong)(1 + (useHighbitdepth ? 1 : 0)) * (yplaneSize + (2 * uvplaneSize));
+
+            // frame_size is stored in buffer_alloc_sz, which is an int. If it won't
+            // fit, fail early.
+            if (frameSize > int.MaxValue)
+            {
+                return -1;
+            }
+
+            /* Only support allocating buffers that have a border that's a multiple
+             * of 32. The border restriction is required to get 16-byte alignment of
+             * the start of the chroma rows without introducing an arbitrary gap
+             * between planes, which would break the semantics of things like
+             * vpx_img_set_rect(). */
+            // Rejected before touching any storage: failing after the old buffer was freed or replaced
+            // would leave the plane views referring to memory this surface no longer owns.
+            if ((border & 0x1f) != 0)
+            {
+                return -3;
+            }
+
+            if (cb != null)
+            {
+                const int alignAddrExtraSize = 31;
+                ulong externalFrameSize = frameSize + alignAddrExtraSize;
+
+                Debug.Assert(!fb.IsNull);
+
+                // Allocation to hold larger frame, or first allocation.
+                if (cb(allocator, cbPriv, externalFrameSize, ref fb.Value) < 0)
+                {
+                    return -1;
+                }
+
+                if (fb.Value.Data.IsNull || (ulong)fb.Value.Data.Length < externalFrameSize)
+                {
+                    return -1;
+                }
+
+                BufferAlloc = fb.Value.Data;
+            }
+            else if (frameSize > (ulong)BufferAllocSz)
+            {
+                // Allocation to hold larger frame, or first allocation.
+                allocator.Free(BufferAlloc);
+                BufferAlloc = ArrayPtr<byte>.Null;
+                // Keep the recorded size in step with the (now empty) buffer so that a failed
+                // allocation below cannot leave a stale size paired with a null buffer.
+                BufferAllocSz = 0;
+
+                BufferAlloc = allocator.Allocate<byte>((int)frameSize);
+                if (BufferAlloc.IsNull)
+                {
+                    return -1;
+                }
+
+                BufferAllocSz = (int)frameSize;
+
+                // This memset is needed for fixing valgrind error from C loop filter
+                // due to access uninitialized memory in frame border. It could be
+                // removed if border is totally removed.
+                MemoryUtil.Fill(BufferAlloc.ToPointer(), (byte)0, BufferAllocSz);
+            }
+
+            Width = width;
+            Height = height;
+            AlignedWidth = alignedWidth;
+            AlignedHeight = alignedHeight;
+            Stride = yStride;
+
+            UvWidth = (width + ssX) >> ssX;
+            UvHeight = (height + ssY) >> ssY;
+            UvAlignedWidth = uvWidth;
+            UvAlignedHeight = uvHeight;
+            UvStride = uvStride;
+
+            Border = border;
+            FrameSize = (int)frameSize;
+            SubsamplingX = ssX;
+            SubsamplingY = ssY;
+
+            // The same byte buffer backs both 8-bit and 16-bit frames; only the flag differs.
+            Flags = useHighbitdepth ? Yv12FlagHighbitdepth : 0;
+
+            // Each plane view starts past its top border rows and left border columns.
+            ArrayPtr<byte> buf = BufferAlloc;
+            int uvBorderOffset = (uvBorderH * uvStride) + uvBorderW;
+
+            YBuffer = buf.Slice((border * yStride) + border);
+            UBuffer = buf.Slice((int)yplaneSize + uvBorderOffset);
+            VBuffer = buf.Slice((int)yplaneSize + (int)uvplaneSize + uvBorderOffset);
+
+            Corrupted = 0; /* assume not corrupted by errors */
+            return 0;
+        }
+
+        // Frees the unmanaged block the constructor obtained from Marshal.AllocHGlobal (_pointer).
+        // Storage held in BufferAlloc is not released here.
+        public void Dispose()
         {
             Marshal.FreeHGlobal(_pointer);
         }

+ 4 - 4
src/Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs

@@ -56,7 +56,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         private static int GetMinLog2TileCols(int sb64Cols)
         {
             int minLog2 = 0;
-            while ((MaxTileWidthB64 << minLog2) < sb64Cols)
+            while (MaxTileWidthB64 << minLog2 < sb64Cols)
             {
                 ++minLog2;
             }
@@ -67,7 +67,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         private static int GetMaxLog2TileCols(int sb64Cols)
         {
             int maxLog2 = 1;
-            while ((sb64Cols >> maxLog2) >= MinTileWidthB64)
+            while (sb64Cols >> maxLog2 >= MinTileWidthB64)
             {
                 ++maxLog2;
             }
@@ -75,7 +75,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             return maxLog2 - 1;
         }
 
-        public static void GetTileNBits(int miCols, ref int minLog2TileCols, ref int maxLog2TileCols)
+        public static void GetTileNBits(int miCols, out int minLog2TileCols, out int maxLog2TileCols)
         {
             int sb64Cols = MiColsAlignedToSb(miCols) >> Constants.MiBlockSizeLog2;
             minLog2TileCols = GetMinLog2TileCols(sb64Cols);
@@ -83,4 +83,4 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             Debug.Assert(minLog2TileCols <= maxLog2TileCols);
         }
     }
-}
+}

+ 8 - 8
src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs

@@ -1,12 +1,12 @@
-namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
+    // Members are numbered implicitly from 0 in declaration order, which yields the same values
+    // (0 through 5) that the removed explicit assignments spelled out.
     public enum TxMode
     {
-        Only4X4 = 0, // Only 4x4 transform used
-        Allow8X8 = 1, // Allow block transform size up to 8x8
-        Allow16X16 = 2, // Allow block transform size up to 16x16
-        Allow32X32 = 3, // Allow block transform size up to 32x32
-        TxModeSelect = 4, // Transform specified for each block
-        TxModes = 5,
+        Only4x4, // Only 4x4 transform used
+        Allow8x8, // Allow block transform size up to 8x8
+        Allow16x16, // Allow block transform size up to 16x16
+        Allow32x32, // Allow block transform size up to 32x32
+        TxModeSelect, // Transform specified for each block
+        TxModes
     }
-}
+}

+ 7 - 7
src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs

@@ -1,11 +1,11 @@
-namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
+    // Members are numbered implicitly from 0 in declaration order, which yields the same values
+    // (0 through 4) that the removed explicit assignments spelled out.
     public enum TxSize
     {
-        Tx4x4 = 0, // 4x4 transform
-        Tx8x8 = 1, // 8x8 transform
-        Tx16x16 = 2, // 16x16 transform
-        Tx32x32 = 3, // 32x32 transform
-        TxSizes = 4,
+        Tx4x4, // 4x4 transform
+        Tx8x8, // 8x8 transform
+        Tx16x16, // 16x16 transform
+        Tx32x32, // 32x32 transform
+        TxSizes
     }
-}
+}

+ 7 - 7
src/Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs

@@ -1,11 +1,11 @@
-namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
+    // Members are numbered implicitly from 0 in declaration order, which yields the same values
+    // (0 through 4) that the removed explicit assignments spelled out.
     internal enum TxType
     {
-        DctDct = 0, // DCT  in both horizontal and vertical
-        AdstDct = 1, // ADST in vertical, DCT in horizontal
-        DctAdst = 2, // DCT  in vertical, ADST in horizontal
-        AdstAdst = 3, // ADST in both directions
-        TxTypes = 4,
+        DctDct, // DCT  in both horizontal and vertical
+        AdstDct, // ADST in vertical, DCT in horizontal
+        DctAdst, // DCT  in vertical, ADST in horizontal
+        AdstAdst, // ADST in both directions
+        TxTypes
     }
-}
+}

+ 700 - 32
src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs

@@ -1,6 +1,8 @@
-using Ryujinx.Common.Memory;
+using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
 using Ryujinx.Graphics.Video;
+using System;
 
 namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 {
@@ -9,27 +11,62 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public MacroBlockD Mb;
 
         public ArrayPtr<TileWorkerData> TileWorkerData;
+        public int TotalTiles;
 
         public InternalErrorInfo Error;
 
+        public VpxColorSpace ColorSpace;
+        public VpxColorRange ColorRange;
+
         public int Width;
         public int Height;
 
+        public int RenderWidth;
+        public int RenderHeight;
+
+        public int LastWidth;
+        public int LastHeight;
+
         public int SubsamplingX;
         public int SubsamplingY;
 
+        public bool UseHighBitDepth;
+
         public ArrayPtr<MvRef> PrevFrameMvs;
         public ArrayPtr<MvRef> CurFrameMvs;
 
+        public Ptr<Surface> FrameToShow;
+        public Ptr<RefCntBuffer> PrevFrame;
+
+        public Ptr<RefCntBuffer> CurFrame;
+
+        public Array8<int> RefFrameMap; /* maps fb_idx to reference slot */
+
+        // Prepare ref_frame_map for the next frame.
+        // Only used in frame parallel decode.
+        public Array8<int> NextRefFrameMap;
+
         public Array3<RefBuffer> FrameRefs;
 
+        public int NewFbIdx;
+
+        public int CurShowFrameFbIdx;
+
+        public FrameType LastFrameType;
         public FrameType FrameType;
 
+        public int ShowFrame;
+        public int LastShowFrame;
+        public int ShowExistingFrame;
+
         // Flag signaling that the frame is encoded using only Intra modes.
         public bool IntraOnly;
+        public bool LastIntraOnly;
 
         public bool AllowHighPrecisionMv;
 
+        public int ResetFrameContext;
+
         // MBs, MbRows/Cols is in 16-pixel units; MiRows/Cols is in
         // ModeInfo (8-pixel) units.
         public int MBs;
@@ -49,8 +86,13 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
         /* We allocate a ModeInfo struct for each macroblock, together with
            an extra row on top and column on the left to simplify prediction. */
+        public int MiAllocSize;
         public ArrayPtr<ModeInfo> Mip; /* Base of allocated array */
-        public ArrayPtr<ModeInfo> Mi;  /* Corresponds to upper left visible macroblock */
+        public ArrayPtr<ModeInfo> Mi; /* Corresponds to upper left visible macroblock */
+
+        // prev_mip and prev_mi will only be allocated in VP9 encoder.
+        public Ptr<ModeInfo> PrevMip; /* MODE_INFO array 'mip' from last decoded frame */
+        public Ptr<ModeInfo> PrevMi; /* 'mi' from last frame (points into prev_mip) */
 
         public ArrayPtr<Ptr<ModeInfo>> MiGridBase;
         public ArrayPtr<Ptr<ModeInfo>> MiGridVisible;
@@ -70,6 +112,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
         public LoopFilterInfoN LfInfo;
 
+        public int RefreshFrameContext; /* Two state 0 = NO, 1 = YES */
+
         public Array4<sbyte> RefFrameSignBias; /* Two state 0, 1 */
 
         public LoopFilter Lf;
@@ -81,22 +125,37 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
         public ReferenceMode ReferenceMode;
 
         public Ptr<Vp9EntropyProbs> Fc;
+        public ArrayPtr<Vp9EntropyProbs> FrameContexts; // FRAME_CONTEXTS
+        public uint FrameContextIdx; /* Context to use/update */
         public Ptr<Vp9BackwardUpdates> Counts;
 
+        public uint CurrentVideoFrame;
+        public BitstreamProfile Profile;
+
+        public BitDepth BitDepth;
+        public BitDepth DequantBitDepth; // bit_depth of current dequantizer
+
+        public int ErrorResilientMode;
+        public int FrameParallelDecodingMode;
+
         public int Log2TileCols, Log2TileRows;
 
+        public int ByteAlignment;
+        public int SkipLoopFilter;
+
+        public Ptr<BufferPool> BufferPool;
+
         public ArrayPtr<sbyte> AboveSegContext;
         public ArrayPtr<sbyte> AboveContext;
 
-        public readonly bool FrameIsIntraOnly()
+        // True when the current frame is a key frame or has the IntraOnly flag set.
+        public bool FrameIsIntraOnly()
         {
             return FrameType == FrameType.KeyFrame || IntraOnly;
         }
 
         public bool CompoundReferenceAllowed()
         {
-            int i;
-            for (i = 1; i < Constants.RefsPerFrame; ++i)
+            for (int i = 1; i < Constants.RefsPerFrame; ++i)
             {
                 if (RefFrameSignBias[i + 1] != RefFrameSignBias[1])
                 {
@@ -107,6 +166,47 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             return false;
         }
 
+        // Gives access to the surface of the pool frame buffer selected by NewFbIdx.
+        public ref Surface GetFrameNewBuffer() => ref BufferPool.Value.FrameBufs[NewFbIdx].Buf;
+
+        // Claims the first frame buffer in the pool whose reference count is zero: its RefCount is
+        // set to 1 and its index is returned. Returns RefBuffer.InvalidIdx when every buffer is in use.
+        public int GetFreeFb()
+        {
+            ref Array12<RefCntBuffer> pool = ref BufferPool.Value.FrameBufs;
+
+            for (int idx = 0; idx < Constants.FrameBuffers; ++idx)
+            {
+                if (pool[idx].RefCount != 0)
+                {
+                    continue;
+                }
+
+                pool[idx].RefCount = 1;
+                return idx;
+            }
+
+            // No free buffer found.
+            return RefBuffer.InvalidIdx;
+        }
+
+        // Exchanges the roles of the two segmentation maps: the indices are swapped and the
+        // current/last map views are re-read from SegMapArray using the new indices.
+        public void SwapCurrentAndLastSegMap()
+        {
+            var formerPrevIdx = PrevSegMapIdx;
+            PrevSegMapIdx = SegMapIdx;
+            SegMapIdx = formerPrevIdx;
+
+            LastFrameSegMap = SegMapArray[PrevSegMapIdx];
+            CurrentFrameSegMap = SegMapArray[SegMapIdx];
+        }
+
         private static int CalcMiSize(int len)
         {
             // Len is in mi units.
@@ -129,19 +229,18 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
+        // Allocates one TileWorkerData entry per tile (tileCols * tileRows), plus maxThreads
+        // additional entries when maxThreads is greater than 1.
         public void AllocTileWorkerData(MemoryAllocator allocator, int tileCols, int tileRows, int maxThreads)
         {
-            TileWorkerData = allocator.Allocate<TileWorkerData>(tileCols * tileRows + (maxThreads > 1 ? maxThreads : 0));
+            TileWorkerData =
+                allocator.Allocate<TileWorkerData>((tileCols * tileRows) + (maxThreads > 1 ? maxThreads : 0));
         }
 
-        public readonly void FreeTileWorkerData(MemoryAllocator allocator)
+        // Hands the TileWorkerData array back to the allocator. The field itself is not reset here.
+        public void FreeTileWorkerData(MemoryAllocator allocator)
         {
             allocator.Free(TileWorkerData);
         }
 
         private void AllocSegMap(MemoryAllocator allocator, int segMapSize)
         {
-            int i;
-
-            for (i = 0; i < Constants.NumPingPongBuffers; ++i)
+            for (int i = 0; i < Constants.NumPingPongBuffers; ++i)
             {
                 SegMapArray[i] = allocator.Allocate<byte>(segMapSize);
             }
@@ -156,9 +255,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
         private void FreeSegMap(MemoryAllocator allocator)
         {
-            int i;
-
-            for (i = 0; i < Constants.NumPingPongBuffers; ++i)
+            for (int i = 0; i < Constants.NumPingPongBuffers; ++i)
             {
                 allocator.Free(SegMapArray[i]);
                 SegMapArray[i] = ArrayPtr<byte>.Null;
@@ -194,6 +291,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             Lf.Lfm = ArrayPtr<LoopFilterMask>.Null;
             allocator.Free(CurFrameMvs);
             CurFrameMvs = ArrayPtr<MvRef>.Null;
+
             if (UsePrevFrameMvs)
             {
                 allocator.Free(PrevFrameMvs);
@@ -209,7 +307,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             Lf.Lfm = allocator.Allocate<LoopFilterMask>(((MiRows + (Constants.MiBlockSize - 1)) >> 3) * Lf.LfmStride);
         }
 
-        public void AllocContextBuffers(MemoryAllocator allocator, int width, int height)
+        public bool AllocContextBuffers(MemoryAllocator allocator, int width, int height)
         {
             SetMbMi(width, height);
             int newMiSize = MiStride * CalcMiSize(MiRows);
@@ -239,6 +337,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             {
                 PrevFrameMvs = allocator.Allocate<MvRef>(MiRows * MiCols);
             }
+
+            return false;
         }
 
         private unsafe void DecSetupMi()
@@ -257,7 +357,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             }
         }
 
-        private readonly void SetPartitionProbs(ref MacroBlockD xd)
+        private void SetPartitionProbs(ref MacroBlockD xd)
         {
             xd.PartitionProbs = FrameIsIntraOnly()
                 ? new ArrayPtr<Array3<byte>>(ref Fc.Value.KfPartitionProb[0], 16)
@@ -266,9 +366,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
         internal void InitMacroBlockD(ref MacroBlockD xd, ArrayPtr<int> dqcoeff)
         {
-            int i;
-
-            for (i = 0; i < Constants.MaxMbPlane; ++i)
+            for (int i = 0; i < Constants.MaxMbPlane; ++i)
             {
                 xd.Plane[i].DqCoeff = dqcoeff;
                 xd.AboveContext[i] = AboveContext.Slice(i * 2 * TileInfo.MiColsAlignedToSb(MiCols));
@@ -281,6 +379,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
                 {
                     MemoryUtil.Copy(ref xd.Plane[i].SegDequant, ref UvDequant);
                 }
+
                 xd.Fc = new Ptr<Vp9EntropyProbs>(ref Fc.Value);
             }
 
@@ -293,29 +392,27 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
 
+        // Fills the YDequant/UvDequant tables: entry [0] of each row receives the DC quantizer and
+        // entry [1] the AC quantizer, computed from the q index, the delta-Q fields and BitDepth.
+        // With segmentation enabled every segment gets its own q index; otherwise only row 0 is written.
         public void SetupSegmentationDequant()
         {
-            const BitDepth BitDepth = BitDepth.Bits8; // TODO: Configurable
             // Build y/uv dequant values based on segmentation.
             if (Seg.Enabled)
             {
-                int i;
-                for (i = 0; i < Constants.MaxSegments; ++i)
+                for (int i = 0; i < Constants.MaxSegments; ++i)
                 {
-                    int qIndex = QuantCommon.GetQIndex(ref Seg, i, BaseQindex);
-                    YDequant[i][0] = QuantCommon.DcQuant(qIndex, YDcDeltaQ, BitDepth);
-                    YDequant[i][1] = QuantCommon.AcQuant(qIndex, 0, BitDepth);
-                    UvDequant[i][0] = QuantCommon.DcQuant(qIndex, UvDcDeltaQ, BitDepth);
-                    UvDequant[i][1] = QuantCommon.AcQuant(qIndex, UvAcDeltaQ, BitDepth);
+                    int qindex = Seg.GetQIndex(i, BaseQindex);
+                    YDequant[i][0] = QuantCommon.DcQuant(qindex, YDcDeltaQ, BitDepth);
+                    YDequant[i][1] = QuantCommon.AcQuant(qindex, 0, BitDepth);
+                    UvDequant[i][0] = QuantCommon.DcQuant(qindex, UvDcDeltaQ, BitDepth);
+                    UvDequant[i][1] = QuantCommon.AcQuant(qindex, UvAcDeltaQ, BitDepth);
                 }
             }
             else
             {
-                int qIndex = BaseQindex;
+                int qindex = BaseQindex;
                 // When segmentation is disabled, only the first value is used.  The
                 // remaining are don't cares.
-                YDequant[0][0] = QuantCommon.DcQuant(qIndex, YDcDeltaQ, BitDepth);
-                YDequant[0][1] = QuantCommon.AcQuant(qIndex, 0, BitDepth);
-                UvDequant[0][0] = QuantCommon.DcQuant(qIndex, UvDcDeltaQ, BitDepth);
-                UvDequant[0][1] = QuantCommon.AcQuant(qIndex, UvAcDeltaQ, BitDepth);
+                YDequant[0][0] = QuantCommon.DcQuant(qindex, YDcDeltaQ, BitDepth);
+                YDequant[0][1] = QuantCommon.AcQuant(qindex, 0, BitDepth);
+                UvDequant[0][0] = QuantCommon.DcQuant(qindex, UvDcDeltaQ, BitDepth);
+                UvDequant[0][1] = QuantCommon.AcQuant(qindex, UvAcDeltaQ, BitDepth);
             }
         }
 
@@ -327,5 +424,576 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
                 refBuf.Sf.SetupScaleFactorsForFrame(refBuf.Buf.Width, refBuf.Buf.Height, Width, Height);
             }
         }
+
+        // Reads probability updates for the reference-mode related tables of the current frame
+        // context. Which tables are read depends on ReferenceMode:
+        //   Select          -> CompInterProb
+        //   anything but Compound -> SingleRefProb (both entries per context)
+        //   anything but Single   -> CompRefProb
+        // The tables are always visited in that order.
+        public void ReadFrameReferenceModeProbs(ref Reader r)
+        {
+            ref Vp9EntropyProbs probs = ref Fc.Value;
+
+            bool readsCompInter = ReferenceMode == ReferenceMode.Select;
+            bool readsSingleRef = ReferenceMode != ReferenceMode.Compound;
+            bool readsCompRef = ReferenceMode != ReferenceMode.Single;
+
+            if (readsCompInter)
+            {
+                for (int ctx = 0; ctx < Constants.CompInterContexts; ++ctx)
+                {
+                    r.DiffUpdateProb(ref probs.CompInterProb[ctx]);
+                }
+            }
+
+            if (readsSingleRef)
+            {
+                for (int ctx = 0; ctx < Constants.RefContexts; ++ctx)
+                {
+                    r.DiffUpdateProb(ref probs.SingleRefProb[ctx][0]);
+                    r.DiffUpdateProb(ref probs.SingleRefProb[ctx][1]);
+                }
+            }
+
+            if (readsCompRef)
+            {
+                for (int ctx = 0; ctx < Constants.RefContexts; ++ctx)
+                {
+                    r.DiffUpdateProb(ref probs.CompRefProb[ctx]);
+                }
+            }
+        }
+
+        // Decodes the frame reference mode. Nothing is read from the stream unless compound
+        // reference is allowed; otherwise a first bit of 0 means Single, and a second bit then
+        // picks Select (1) or Compound (0).
+        public ReferenceMode ReadFrameReferenceMode(ref Reader r)
+        {
+            if (!CompoundReferenceAllowed())
+            {
+                return ReferenceMode.Single;
+            }
+
+            if (r.ReadBit() == 0)
+            {
+                return ReferenceMode.Single;
+            }
+
+            if (r.ReadBit() != 0)
+            {
+                return ReferenceMode.Select;
+            }
+
+            return ReferenceMode.Compound;
+        }
+
+        // Chooses the fixed and the two variable compound references by comparing the sign bias of
+        // the last frame with that of the golden and alt-ref frames: the two frames sharing a sign
+        // bias become the variable pair and the remaining one the fixed reference.
+        public void SetupCompoundReferenceMode()
+        {
+            sbyte lastBias = RefFrameSignBias[Constants.LastFrame];
+            bool lastMatchesGolden = lastBias == RefFrameSignBias[Constants.GoldenFrame];
+            bool lastMatchesAltRef = lastBias == RefFrameSignBias[Constants.AltRefFrame];
+
+            if (lastMatchesGolden)
+            {
+                CompFixedRef = Constants.AltRefFrame;
+                CompVarRef[0] = Constants.LastFrame;
+                CompVarRef[1] = Constants.GoldenFrame;
+            }
+            else if (lastMatchesAltRef)
+            {
+                CompFixedRef = Constants.GoldenFrame;
+                CompVarRef[0] = Constants.LastFrame;
+                CompVarRef[1] = Constants.AltRefFrame;
+            }
+            else
+            {
+                CompFixedRef = Constants.LastFrame;
+                CompVarRef[0] = Constants.GoldenFrame;
+                CompVarRef[1] = Constants.AltRefFrame;
+            }
+        }
+
+        // Writes the initial motion-vector probabilities into the current frame context (Fc).
+        // The two components (index 0 and 1) differ only in their Classes and Class0 entries;
+        // every other table receives the same values for both.
+        public void InitMvProbs()
+        {
+            ref Vp9EntropyProbs fc = ref Fc.Value;
+
+            fc.Joints[0] = 32;
+            fc.Joints[1] = 64;
+            fc.Joints[2] = 96;
+
+            // Component 0: class probabilities.
+            fc.Classes[0][0] = 224;
+            fc.Classes[0][1] = 144;
+            fc.Classes[0][2] = 192;
+            fc.Classes[0][3] = 168;
+            fc.Classes[0][4] = 192;
+            fc.Classes[0][5] = 176;
+            fc.Classes[0][6] = 192;
+            fc.Classes[0][7] = 198;
+            fc.Classes[0][8] = 198;
+            fc.Classes[0][9] = 245;
+            fc.Class0[0][0] = 216;
+
+            // Component 1: class probabilities.
+            fc.Classes[1][0] = 216;
+            fc.Classes[1][1] = 128;
+            fc.Classes[1][2] = 176;
+            fc.Classes[1][3] = 160;
+            fc.Classes[1][4] = 176;
+            fc.Classes[1][5] = 176;
+            fc.Classes[1][6] = 192;
+            fc.Classes[1][7] = 198;
+            fc.Classes[1][8] = 198;
+            fc.Classes[1][9] = 208;
+            fc.Class0[1][0] = 208;
+
+            // Tables whose initial values are identical for both components.
+            for (int comp = 0; comp < 2; ++comp)
+            {
+                fc.Sign[comp] = 128;
+
+                fc.Bits[comp][0] = 136;
+                fc.Bits[comp][1] = 140;
+                fc.Bits[comp][2] = 148;
+                fc.Bits[comp][3] = 160;
+                fc.Bits[comp][4] = 176;
+                fc.Bits[comp][5] = 192;
+                fc.Bits[comp][6] = 224;
+                fc.Bits[comp][7] = 234;
+                fc.Bits[comp][8] = 234;
+                fc.Bits[comp][9] = 240;
+
+                fc.Class0Fp[comp][0][0] = 128;
+                fc.Class0Fp[comp][0][1] = 128;
+                fc.Class0Fp[comp][0][2] = 64;
+                fc.Class0Fp[comp][1][0] = 96;
+                fc.Class0Fp[comp][1][1] = 112;
+                fc.Class0Fp[comp][1][2] = 64;
+
+                fc.Fp[comp][0] = 64;
+                fc.Fp[comp][1] = 96;
+                fc.Fp[comp][2] = 64;
+
+                fc.Class0Hp[comp] = 160;
+                fc.Hp[comp] = 128;
+            }
+        }
+
+        // Recomputes the motion-vector probabilities of the current frame context (Fc) by merging
+        // the probabilities stored in FrameContexts[FrameContextIdx] with the symbol counts in Counts.
+        // The Class0Hp and Hp entries are only updated when allowHp is true.
+        public void AdaptMvProbs(bool allowHp)
+        {
+            ref Vp9EntropyProbs current = ref Fc.Value;
+            ref Vp9EntropyProbs previous = ref FrameContexts[(int)FrameContextIdx];
+            ref Vp9BackwardUpdates tally = ref Counts.Value;
+
+            Prob.VpxTreeMergeProbs(
+                EntropyMv.JointTree,
+                previous.Joints.AsSpan(),
+                tally.Joints.AsSpan(),
+                current.Joints.AsSpan());
+
+            for (int comp = 0; comp < 2; ++comp)
+            {
+                current.Sign[comp] = Prob.ModeMvMergeProbs(previous.Sign[comp], ref tally.Sign[comp]);
+
+                Prob.VpxTreeMergeProbs(
+                    EntropyMv.ClassTree,
+                    previous.Classes[comp].AsSpan(),
+                    tally.Classes[comp].AsSpan(),
+                    current.Classes[comp].AsSpan());
+
+                Prob.VpxTreeMergeProbs(
+                    EntropyMv.Class0Tree,
+                    previous.Class0[comp].AsSpan(),
+                    tally.Class0[comp].AsSpan(),
+                    current.Class0[comp].AsSpan());
+
+                for (int bit = 0; bit < EntropyMv.OffsetBits; ++bit)
+                {
+                    current.Bits[comp][bit] =
+                        Prob.ModeMvMergeProbs(previous.Bits[comp][bit], ref tally.Bits[comp][bit]);
+                }
+
+                for (int c0 = 0; c0 < EntropyMv.Class0Size; ++c0)
+                {
+                    Prob.VpxTreeMergeProbs(
+                        EntropyMv.FpTree,
+                        previous.Class0Fp[comp][c0].AsSpan(),
+                        tally.Class0Fp[comp][c0].AsSpan(),
+                        current.Class0Fp[comp][c0].AsSpan());
+                }
+
+                Prob.VpxTreeMergeProbs(
+                    EntropyMv.FpTree,
+                    previous.Fp[comp].AsSpan(),
+                    tally.Fp[comp].AsSpan(),
+                    current.Fp[comp].AsSpan());
+
+                if (!allowHp)
+                {
+                    continue;
+                }
+
+                current.Class0Hp[comp] = Prob.ModeMvMergeProbs(previous.Class0Hp[comp], ref tally.Class0Hp[comp]);
+                current.Hp[comp] = Prob.ModeMvMergeProbs(previous.Hp[comp], ref tally.Hp[comp]);
+            }
+        }
+
+        // Brings the context buffers and the current frame's motion vector buffer in line
+        // with a frame of width x height pixels.
+        public void ResizeContextBuffers(MemoryAllocator allocator, int width, int height)
+        {
+            bool dimensionsChanged = Width != width || Height != height;
+
+            if (dimensionsChanged)
+            {
+                int miRowsNeeded = BitUtils.AlignPowerOfTwo(height, Constants.MiSizeLog2) >> Constants.MiSizeLog2;
+                int miColsNeeded = BitUtils.AlignPowerOfTwo(width, Constants.MiSizeLog2) >> Constants.MiSizeLog2;
+
+                // Allocations in AllocContextBuffers() depend on individual
+                // dimensions as well as the overall size, so both are compared.
+                bool fitsCurrentBuffers = miColsNeeded <= MiCols && miRowsNeeded <= MiRows;
+
+                if (fitsCurrentBuffers)
+                {
+                    SetMbMi(width, height);
+                }
+                else if (AllocContextBuffers(allocator, width, height))
+                {
+                    // The Mi* values have been cleared and any existing context
+                    // buffers have been freed. Clear Width and Height to be
+                    // consistent and to force a realloc next time.
+                    Width = 0;
+                    Height = 0;
+                    Error.InternalError(CodecErr.MemError, "Failed to allocate context buffers");
+                }
+
+                InitContextBuffers();
+                Width = width;
+                Height = height;
+            }
+
+            // The motion vector buffer is (re)created when it is missing or when the
+            // current frame was sized for fewer mode-info rows/columns than needed now.
+            bool mvBufferTooSmall =
+                CurFrameMvs.IsNull ||
+                MiRows > CurFrame.Value.MiRows ||
+                MiCols > CurFrame.Value.MiCols;
+
+            if (mvBufferTooSmall)
+            {
+                ResizeMvBuffer(allocator);
+            }
+        }
+
+        // Stores expr into lval, then reports CodecErr.MemError through Error when the
+        // stored array pointer is null.
+        public void CheckMemError<T>(ref ArrayPtr<T> lval, ArrayPtr<T> expr)
+            where T : unmanaged
+        {
+            lval = expr;
+
+            if (!lval.IsNull)
+            {
+                return;
+            }
+
+            Error.InternalError(CodecErr.MemError, "Failed to allocate");
+        }
+
+        // Replaces CurFrameMvs with a buffer holding one MvRef per mode-info unit
+        // (MiRows * MiCols) and records those dimensions on the current frame.
+        private void ResizeMvBuffer(MemoryAllocator allocator)
+        {
+            allocator.Free(CurFrameMvs);
+
+            ref RefCntBuffer frame = ref CurFrame.Value;
+            frame.MiRows = MiRows;
+            frame.MiCols = MiCols;
+
+            int entryCount = MiRows * MiCols;
+            CheckMemError(ref CurFrameMvs, allocator.Allocate<MvRef>(entryCount));
+        }
+
+        // Stores expr into lval, then reports CodecErr.MemError through Error when the
+        // stored pointer is null.
+        public void CheckMemError<T>(ref Ptr<T> lval, Ptr<T> expr) where T : unmanaged
+        {
+            lval = expr;
+
+            if (!lval.IsNull)
+            {
+                return;
+            }
+
+            Error.InternalError(CodecErr.MemError, "Failed to allocate");
+        }
+
+        // Reads the tile layout from the bit buffer into Log2TileCols and Log2TileRows.
+        public void SetupTileInfo(ref ReadBitBuffer rb)
+        {
+            TileInfo.GetTileNBits(MiCols, out int minLog2Cols, out int maxLog2Cols);
+
+            // columns: start at the minimum and add one for every 1 bit read, reading at
+            // most (max - min) bits and stopping at the first 0 bit.
+            Log2TileCols = minLog2Cols;
+            for (int onesLeft = maxLog2Cols - minLog2Cols; onesLeft != 0 && rb.ReadBit() != 0; onesLeft--)
+            {
+                Log2TileCols++;
+            }
+
+            if (Log2TileCols > 6)
+            {
+                Error.InternalError(CodecErr.CorruptFrame, "Invalid number of tile columns");
+            }
+
+            // rows: a second bit is only read when the first one is set.
+            Log2TileRows = rb.ReadBit();
+            if (Log2TileRows == 0)
+            {
+                return;
+            }
+
+            Log2TileRows += rb.ReadBit();
+        }
+
+        // Reads bit depth, color space, color range and chroma subsampling from the bit
+        // buffer, reporting CodecErr.UnsupBitstream for combinations rejected below.
+        public void ReadBitdepthColorspaceSampling(ref ReadBitBuffer rb)
+        {
+            bool highBitDepth = Profile >= BitstreamProfile.Profile2;
+            if (!highBitDepth)
+            {
+                BitDepth = BitDepth.Bits8;
+            }
+            else
+            {
+                // One bit selects between 12 (set) and 10 (clear) bits.
+                BitDepth = rb.ReadBit() != 0 ? BitDepth.Bits12 : BitDepth.Bits10;
+            }
+
+            UseHighBitDepth = highBitDepth;
+
+            // Only profiles 1 and 3 carry explicit subsampling / reserved bits.
+            bool explicitSampling = Profile == BitstreamProfile.Profile1 || Profile == BitstreamProfile.Profile3;
+
+            ColorSpace = (VpxColorSpace)rb.ReadLiteral(3);
+
+            if (ColorSpace == VpxColorSpace.Srgb)
+            {
+                ColorRange = VpxColorRange.Full;
+
+                if (!explicitSampling)
+                {
+                    Error.InternalError(CodecErr.UnsupBitstream, "4:4:4 color not supported in profile 0 or 2");
+                }
+                else
+                {
+                    // Note if colorspace is SRGB then 4:4:4 chroma sampling is assumed.
+                    // 4:2:2 or 4:4:0 chroma sampling is not allowed.
+                    SubsamplingY = SubsamplingX = 0;
+                    if (rb.ReadBit() != 0)
+                    {
+                        Error.InternalError(CodecErr.UnsupBitstream, "Reserved bit set");
+                    }
+                }
+            }
+            else
+            {
+                ColorRange = (VpxColorRange)rb.ReadBit();
+
+                if (!explicitSampling)
+                {
+                    SubsamplingY = SubsamplingX = 1;
+                }
+                else
+                {
+                    SubsamplingX = rb.ReadBit();
+                    SubsamplingY = rb.ReadBit();
+
+                    bool is420 = SubsamplingX == 1 && SubsamplingY == 1;
+                    if (is420)
+                    {
+                        Error.InternalError(CodecErr.UnsupBitstream,
+                            "4:2:0 color not supported in profile 1 or 3");
+                    }
+
+                    if (rb.ReadBit() != 0)
+                    {
+                        Error.InternalError(CodecErr.UnsupBitstream, "Reserved bit set");
+                    }
+                }
+            }
+        }
+
+        // Backward adaptation of the mode-related probabilities: every table in Fc is
+        // recomputed from the saved frame context selected by FrameContextIdx and the
+        // counts collected in Counts.
+        public void AdaptModeProbs()
+        {
+            ref Vp9EntropyProbs cur = ref Fc.Value;
+            ref Vp9EntropyProbs prev = ref FrameContexts[(int)FrameContextIdx];
+            ref Vp9BackwardUpdates stats = ref Counts.Value;
+
+            for (int ctx = 0; ctx < Constants.IntraInterContexts; ctx++)
+            {
+                cur.IntraInterProb[ctx] = Prob.ModeMvMergeProbs(prev.IntraInterProb[ctx], ref stats.IntraInter[ctx]);
+            }
+
+            for (int ctx = 0; ctx < Constants.CompInterContexts; ctx++)
+            {
+                cur.CompInterProb[ctx] = Prob.ModeMvMergeProbs(prev.CompInterProb[ctx], ref stats.CompInter[ctx]);
+            }
+
+            for (int ctx = 0; ctx < Constants.RefContexts; ctx++)
+            {
+                cur.CompRefProb[ctx] = Prob.ModeMvMergeProbs(prev.CompRefProb[ctx], ref stats.CompRef[ctx]);
+            }
+
+            for (int ctx = 0; ctx < Constants.RefContexts; ctx++)
+            {
+                for (int node = 0; node < 2; node++)
+                {
+                    cur.SingleRefProb[ctx][node] =
+                        Prob.ModeMvMergeProbs(prev.SingleRefProb[ctx][node], ref stats.SingleRef[ctx][node]);
+                }
+            }
+
+            for (int ctx = 0; ctx < Constants.InterModeContexts; ctx++)
+            {
+                Prob.VpxTreeMergeProbs(EntropyMode.InterModeTree, prev.InterModeProb[ctx].AsSpan(),
+                    stats.InterMode[ctx].AsSpan(), cur.InterModeProb[ctx].AsSpan());
+            }
+
+            for (int group = 0; group < EntropyMode.BlockSizeGroups; group++)
+            {
+                Prob.VpxTreeMergeProbs(EntropyMode.IntraModeTree, prev.YModeProb[group].AsSpan(),
+                    stats.YMode[group].AsSpan(), cur.YModeProb[group].AsSpan());
+            }
+
+            for (int mode = 0; mode < Constants.IntraModes; ++mode)
+            {
+                Prob.VpxTreeMergeProbs(EntropyMode.IntraModeTree, prev.UvModeProb[mode].AsSpan(),
+                    stats.UvMode[mode].AsSpan(), cur.UvModeProb[mode].AsSpan());
+            }
+
+            for (int ctx = 0; ctx < Constants.PartitionContexts; ctx++)
+            {
+                Prob.VpxTreeMergeProbs(EntropyMode.PartitionTree, prev.PartitionProb[ctx].AsSpan(),
+                    stats.Partition[ctx].AsSpan(), cur.PartitionProb[ctx].AsSpan());
+            }
+
+            // The interpolation filter probabilities are only adapted for switchable filters.
+            if (InterpFilter == Constants.Switchable)
+            {
+                for (int ctx = 0; ctx < Constants.SwitchableFilterContexts; ctx++)
+                {
+                    Prob.VpxTreeMergeProbs(EntropyMode.SwitchableInterpTree, prev.SwitchableInterpProb[ctx].AsSpan(),
+                        stats.SwitchableInterp[ctx].AsSpan(), cur.SwitchableInterpProb[ctx].AsSpan());
+                }
+            }
+
+            // The transform size probabilities are only adapted in TxModeSelect mode.
+            if (TxMode == TxMode.TxModeSelect)
+            {
+                int nodes8x8 = (int)TxSize.TxSizes - 3;
+                int nodes16x16 = (int)TxSize.TxSizes - 2;
+                int nodes32x32 = (int)TxSize.TxSizes - 1;
+
+                Array1<Array2<uint>> branch8x8 = new();
+                Array2<Array2<uint>> branch16x16 = new();
+                Array3<Array2<uint>> branch32x32 = new();
+
+                for (int ctx = 0; ctx < EntropyMode.TxSizeContexts; ++ctx)
+                {
+                    // Raw per-size counts are first converted to per-node branch counts.
+                    EntropyMode.TxCountsToBranchCounts8x8(stats.Tx8x8[ctx].AsSpan(), ref branch8x8);
+                    for (int node = 0; node < nodes8x8; ++node)
+                    {
+                        cur.Tx8x8Prob[ctx][node] =
+                            Prob.ModeMvMergeProbs(prev.Tx8x8Prob[ctx][node], ref branch8x8[node]);
+                    }
+
+                    EntropyMode.TxCountsToBranchCounts16x16(stats.Tx16x16[ctx].AsSpan(), ref branch16x16);
+                    for (int node = 0; node < nodes16x16; ++node)
+                    {
+                        cur.Tx16x16Prob[ctx][node] =
+                            Prob.ModeMvMergeProbs(prev.Tx16x16Prob[ctx][node], ref branch16x16[node]);
+                    }
+
+                    EntropyMode.TxCountsToBranchCounts32x32(stats.Tx32x32[ctx].AsSpan(), ref branch32x32);
+                    for (int node = 0; node < nodes32x32; ++node)
+                    {
+                        cur.Tx32x32Prob[ctx][node] =
+                            Prob.ModeMvMergeProbs(prev.Tx32x32Prob[ctx][node], ref branch32x32[node]);
+                    }
+                }
+            }
+
+            for (int ctx = 0; ctx < Constants.SkipContexts; ++ctx)
+            {
+                cur.SkipProb[ctx] = Prob.ModeMvMergeProbs(prev.SkipProb[ctx], ref stats.Skip[ctx]);
+            }
+        }
+
+        // Adapts the coefficient probabilities of every transform size from Tx4x4 to
+        // Tx32x32, choosing the count saturation and update factor from the frame kind.
+        public void AdaptCoefProbs()
+        {
+            uint maxUpdateFactor;
+            uint saturation;
+
+            if (FrameIsIntraOnly())
+            {
+                maxUpdateFactor = Entropy.CoefMaxUpdateFactorKey;
+                saturation = Entropy.CoefCountSatKey;
+            }
+            else if (LastFrameType == FrameType.KeyFrame)
+            {
+                // adapt quickly right after a key frame
+                maxUpdateFactor = Entropy.CoefMaxUpdateFactorAfterKey;
+                saturation = Entropy.CoefCountSatAfterKey;
+            }
+            else
+            {
+                maxUpdateFactor = Entropy.CoefMaxUpdateFactor;
+                saturation = Entropy.CoefCountSat;
+            }
+
+            for (byte txSize = (byte)TxSize.Tx4x4; txSize <= (byte)TxSize.Tx32x32; txSize++)
+            {
+                AdaptCoefProbs(txSize, saturation, maxUpdateFactor);
+            }
+        }
+
+        // Copies the supplied motion vector references into PrevFrameMvs, element by
+        // element. Throws ArgumentException when mvs has more entries than PrevFrameMvs.
+        public void SetMvs(ReadOnlySpan<Vp9MvRef> mvs)
+        {
+            if (mvs.Length > PrevFrameMvs.Length)
+            {
+                throw new ArgumentException(
+                    $"Size mismatch, expected: {PrevFrameMvs.Length}, but got: {mvs.Length}.");
+            }
+
+            for (int index = 0; index < mvs.Length; index++)
+            {
+                Vp9MvRef src = mvs[index];
+                ref MvRef dst = ref PrevFrameMvs[index];
+
+                // Each entry carries two motion vectors with one reference frame each.
+                for (int slot = 0; slot < 2; slot++)
+                {
+                    dst.Mv[slot].Row = src.Mvs[slot].Row;
+                    dst.Mv[slot].Col = src.Mvs[slot].Col;
+                    dst.RefFrame[slot] = (sbyte)src.RefFrames[slot];
+                }
+            }
+        }
+
+        // Copies CurFrameMvs into the supplied span, element by element.
+        // Throws ArgumentException when mvs has more entries than CurFrameMvs.
+        public void GetMvs(Span<Vp9MvRef> mvs)
+        {
+            if (mvs.Length > CurFrameMvs.Length)
+            {
+                throw new ArgumentException(
+                    $"Size mismatch, expected: {CurFrameMvs.Length}, but got: {mvs.Length}.");
+            }
+
+            for (int index = 0; index < mvs.Length; index++)
+            {
+                ref MvRef src = ref CurFrameMvs[index];
+                ref Vp9MvRef dst = ref mvs[index];
+
+                // Each entry carries two motion vectors with one reference frame each.
+                for (int slot = 0; slot < 2; slot++)
+                {
+                    dst.Mvs[slot].Row = src.Mv[slot].Row;
+                    dst.Mvs[slot].Col = src.Mv[slot].Col;
+                    dst.RefFrames[slot] = src.RefFrame[slot];
+                }
+            }
+        }
+
+        // Adapts the coefficient probabilities of one transform size.
+        // txSize: index into the per-transform-size tables.
+        // countSat / updateFactor: forwarded unchanged to Prob.MergeProbs.
+        private void AdaptCoefProbs(byte txSize, uint countSat, uint updateFactor)
+        {
+            ref Vp9EntropyProbs preFc = ref FrameContexts[(int)FrameContextIdx];
+            ref Array2<Array2<Array6<Array6<Array3<byte>>>>> probs = ref Fc.Value.CoefProbs[txSize];
+            ref Array2<Array2<Array6<Array6<Array3<byte>>>>> preProbs = ref preFc.CoefProbs[txSize];
+            ref Array2<Array2<Array6<Array6<Array4<uint>>>>> counts = ref Counts.Value.Coef[txSize];
+            ref Array2<Array2<Array6<Array6<uint>>>> eobCounts = ref Counts.Value.EobBranch[txSize];
+
+            for (int plane = 0; plane < Constants.PlaneTypes; ++plane)
+            {
+                for (int refType = 0; refType < Entropy.RefTypes; ++refType)
+                {
+                    for (int band = 0; band < Entropy.CoefBands; ++band)
+                    {
+                        int contexts = Entropy.BAND_COEFF_CONTEXTS(band);
+
+                        for (int ctx = 0; ctx < contexts; ++ctx)
+                        {
+                            ref Array4<uint> tokenCounts = ref counts[plane][refType][band][ctx];
+                            ref Array3<byte> merged = ref probs[plane][refType][band][ctx];
+                            ref Array3<byte> prior = ref preProbs[plane][refType][band][ctx];
+
+                            int zeros = (int)tokenCounts[Entropy.ZeroToken];
+                            int ones = (int)tokenCounts[Entropy.OneToken];
+                            int twos = (int)tokenCounts[Entropy.TwoToken];
+                            int eobModel = (int)tokenCounts[Entropy.EobModelToken];
+
+                            // One { left, right } count pair per unconstrained tree node.
+                            Array3<Array2<uint>> branchCt = new();
+                            branchCt[0][0] = (uint)eobModel;
+                            branchCt[0][1] = (uint)(eobCounts[plane][refType][band][ctx] - eobModel);
+                            branchCt[1][0] = (uint)zeros;
+                            branchCt[1][1] = (uint)(ones + twos);
+                            branchCt[2][0] = (uint)ones;
+                            branchCt[2][1] = (uint)twos;
+
+                            for (int node = 0; node < Entropy.UnconstrainedNodes; ++node)
+                            {
+                                merged[node] = Prob.MergeProbs(prior[node], ref branchCt[node], countSat, updateFactor);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // Loads the default coefficient probability tables for all four transform
+        // sizes into Fc.
+        public void DefaultCoefProbs()
+        {
+            ref var coefProbs = ref Fc.Value.CoefProbs;
+
+            Entropy.CopyProbs(ref coefProbs[(int)TxSize.Tx4x4], Entropy.DefaultCoefProbs4x4);
+            Entropy.CopyProbs(ref coefProbs[(int)TxSize.Tx8x8], Entropy.DefaultCoefProbs8x8);
+            Entropy.CopyProbs(ref coefProbs[(int)TxSize.Tx16x16], Entropy.DefaultCoefProbs16x16);
+            Entropy.CopyProbs(ref coefProbs[(int)TxSize.Tx32x32], Entropy.DefaultCoefProbs32x32);
+        }
     }
-}
+}

+ 410 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Decoder.cs

@@ -0,0 +1,410 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Video;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    internal struct Vp9Decoder
+    {
+        // Shared decoder state; every method of this struct works through it.
+        public Vp9Common Common;
+
+        // 1 while the decoder can accept input: set by Create and GetRawFrame,
+        // cleared at the start of ReceiveCompressedData.
+        public int ReadyForNewData;
+
+        // Bit mask walked by SwapFrameBuffers starting at bit 0; a set bit i causes an
+        // additional reference release for the buffer currently in RefFrameMap[i].
+        public int RefreshFrameFlags;
+
+        public int NeedResync; // Wait for key/intra-only frame.
+        public int HoldRefBuf; // Hold the reference buffer.
+
+        // Drops one reference from frame buffer idx and hands the underlying raw buffer
+        // back through FrameBuffers.ReleaseFrameBuffer once nothing refers to it.
+        // Does nothing for a negative idx or a buffer whose RefCount is not positive.
+        private static void DecreaseRefCount(int idx, ref Array12<RefCntBuffer> frameBufs, ref BufferPool pool)
+        {
+            if (idx < 0)
+            {
+                return;
+            }
+
+            ref RefCntBuffer buffer = ref frameBufs[idx];
+            if (buffer.RefCount <= 0)
+            {
+                return;
+            }
+
+            --buffer.RefCount;
+
+            // A worker may only get a free framebuffer index when calling GetFreeFb.
+            // But the private buffer is not set up until finish decoding header.
+            // So any error happens during decoding header, the frame_bufs will not
+            // have valid priv buffer.
+            bool releasable =
+                buffer.Released == 0 &&
+                buffer.RefCount == 0 &&
+                !buffer.RawFrameBuffer.Priv.IsNull;
+
+            if (releasable)
+            {
+                FrameBuffers.ReleaseFrameBuffer(pool.CbPriv, ref buffer.RawFrameBuffer);
+                buffer.Released = 1;
+            }
+        }
+
+        // Initializes the decoder: allocates the entropy contexts and the backward
+        // update counters, loads the key-frame probability tables into Fc, resets the
+        // reference frame maps and attaches the frame buffer pool.
+        // allocator: source of the context allocations.
+        // pool: frame buffer pool; a pointer to it is stored in Common.BufferPool.
+        public void Create(MemoryAllocator allocator, ref BufferPool pool)
+        {
+            ref Vp9Common cm = ref Common;
+
+            cm.CheckMemError(ref cm.Fc,
+                new Ptr<Vp9EntropyProbs>(ref allocator.Allocate<Vp9EntropyProbs>(1)[0]));
+            cm.CheckMemError(ref cm.FrameContexts,
+                allocator.Allocate<Vp9EntropyProbs>(Constants.FrameContexts));
+
+            for (int i = 0; i < EntropyMode.KfYModeProb.Length; i++)
+            {
+                for (int j = 0; j < EntropyMode.KfYModeProb[i].Length; j++)
+                {
+                    for (int k = 0; k < EntropyMode.KfYModeProb[i][j].Length; k++)
+                    {
+                        cm.Fc.Value.KfYModeProb[i][j][k] = EntropyMode.KfYModeProb[i][j][k];
+                    }
+                }
+            }
+
+            for (int i = 0; i < EntropyMode.KfUvModeProb.Length; i++)
+            {
+                for (int j = 0; j < EntropyMode.KfUvModeProb[i].Length; j++)
+                {
+                    cm.Fc.Value.KfUvModeProb[i][j] = EntropyMode.KfUvModeProb[i][j];
+                }
+            }
+
+            // Key-frame partition probabilities: four block-size groups with four
+            // above/left split contexts each.
+            byte[][] kfPartitionProbs =
+            {
+                // 8x8 . 4x4
+                new byte[] { 158, 97, 94 }, // a/l both not split
+                new byte[] { 93, 24, 99 }, // a split, l not split
+                new byte[] { 85, 119, 44 }, // l split, a not split
+                new byte[] { 62, 59, 67 }, // a/l both split
+
+                // 16x16 . 8x8
+                new byte[] { 149, 53, 53 }, // a/l both not split
+                new byte[] { 94, 20, 48 }, // a split, l not split
+                new byte[] { 83, 53, 24 }, // l split, a not split
+                new byte[] { 52, 18, 18 }, // a/l both split
+
+                // 32x32 . 16x16
+                new byte[] { 150, 40, 39 }, // a/l both not split
+                new byte[] { 78, 12, 26 }, // a split, l not split
+                new byte[] { 67, 33, 11 }, // l split, a not split
+                new byte[] { 24, 7, 5 }, // a/l both split
+
+                // 64x64 . 32x32
+                new byte[] { 174, 35, 49 }, // a/l both not split
+                new byte[] { 68, 11, 27 }, // a split, l not split
+                new byte[] { 57, 15, 9 }, // l split, a not split
+                new byte[] { 12, 3, 3 } // a/l both split
+            };
+
+            for (int i = 0; i < kfPartitionProbs.Length; i++)
+            {
+                for (int j = 0; j < kfPartitionProbs[i].Length; j++)
+                {
+                    cm.Fc.Value.KfPartitionProb[i][j] = kfPartitionProbs[i][j];
+                }
+            }
+
+            // Checked like the Fc and FrameContexts allocations above so that a failed
+            // allocation is reported instead of leaving a null Counts pointer behind.
+            cm.CheckMemError(ref cm.Counts,
+                new Ptr<Vp9BackwardUpdates>(ref allocator.Allocate<Vp9BackwardUpdates>(1)[0]));
+
+            NeedResync = 1;
+
+            // Initialize the references to not point to any frame buffers.
+            for (int i = 0; i < Constants.RefFrames; i++)
+            {
+                cm.RefFrameMap[i] = -1;
+                cm.NextRefFrameMap[i] = -1;
+            }
+
+            cm.CurrentVideoFrame = 0;
+            ReadyForNewData = 1;
+            cm.BufferPool = new Ptr<BufferPool>(ref pool);
+
+            cm.BitDepth = BitDepth.Bits8;
+            cm.DequantBitDepth = BitDepth.Bits8;
+
+            // vp9_loop_filter_init(ref cm);
+        }
+
+        /* If any buffer updating is signaled it should be done here. */
+        // Commits NextRefFrameMap into RefFrameMap, releasing the references held on the
+        // buffers that are replaced, publishes the newly decoded buffer as FrameToShow
+        // and invalidates the three FrameRefs entries.
+        private void SwapFrameBuffers()
+        {
+            int refIndex = 0, mask;
+            ref Vp9Common cm = ref Common;
+            ref BufferPool pool = ref cm.BufferPool.Value;
+            ref Array12<RefCntBuffer> frameBufs = ref cm.BufferPool.Value.FrameBufs;
+
+            // Walk the refresh mask from bit 0 upwards; the loop ends at the highest set
+            // bit, and refIndex keeps its value for the second loop below.
+            for (mask = RefreshFrameFlags; mask != 0; mask >>= 1)
+            {
+                int oldIdx = cm.RefFrameMap[refIndex];
+                // Current thread releases the holding of reference frame.
+                DecreaseRefCount(oldIdx, ref frameBufs, ref pool);
+
+                // Release the reference frame in reference map.
+                if ((mask & 1) != 0)
+                {
+                    DecreaseRefCount(oldIdx, ref frameBufs, ref pool);
+                }
+
+                cm.RefFrameMap[refIndex] = cm.NextRefFrameMap[refIndex];
+                ++refIndex;
+            }
+
+            // Current thread releases the holding of reference frame.
+            // Covers the remaining map slots; skipped entirely when ShowExistingFrame is set.
+            for (; refIndex < Constants.RefFrames && cm.ShowExistingFrame == 0; ++refIndex)
+            {
+                int oldIdx = cm.RefFrameMap[refIndex];
+                DecreaseRefCount(oldIdx, ref frameBufs, ref pool);
+                cm.RefFrameMap[refIndex] = cm.NextRefFrameMap[refIndex];
+            }
+
+            HoldRefBuf = 0;
+            cm.FrameToShow = new Ptr<Surface>(ref cm.GetFrameNewBuffer());
+
+            // Decremented directly rather than through DecreaseRefCount, so the buffer is
+            // not released here even if the count reaches zero.
+            --frameBufs[cm.NewFbIdx].RefCount;
+
+            // Invalidate these references until the next frame starts.
+            for (refIndex = 0; refIndex < 3; refIndex++)
+            {
+                cm.FrameRefs[refIndex].Idx = RefBuffer.InvalidIdx;
+            }
+        }
+
+        // Decodes one compressed frame and updates the reference/display bookkeeping.
+        // allocator: used by the frame decoder and for the PrevFrameMvs buffer.
+        // size: number of bytes of psource that belong to this frame.
+        // psource: start of the frame data; overwritten by DecodeFrame.Decode through
+        //          its out parameter.
+        // Returns the error code set by the free-buffer failure path, otherwise retcode,
+        // which is initialized to 0 and never reassigned in this method.
+        public CodecErr ReceiveCompressedData(MemoryAllocator allocator, ulong size, ref ArrayPtr<byte> psource)
+        {
+            ref Vp9Common cm = ref Common;
+            ref BufferPool pool = ref cm.BufferPool.Value;
+            ref Array12<RefCntBuffer> frameBufs = ref cm.BufferPool.Value.FrameBufs;
+            ArrayPtr<byte> source = psource;
+            CodecErr retcode = 0;
+            cm.Error.ErrorCode = CodecErr.Ok;
+
+            if (size == 0)
+            {
+                // This is used to signal that we are missing frames.
+                // We do not know if the missing frame(s) was supposed to update
+                // any of the reference buffers, but we act conservative and
+                // mark only the last buffer as corrupted.
+
+                if (cm.FrameRefs[0].Idx > 0)
+                {
+                    cm.FrameRefs[0].Buf.Corrupted = 1;
+                }
+            }
+
+            ReadyForNewData = 0;
+
+            // Check if the previous frame was a frame without any references to it.
+            if (cm.NewFbIdx >= 0 && frameBufs[cm.NewFbIdx].RefCount == 0 &&
+                frameBufs[cm.NewFbIdx].Released == 0)
+            {
+                FrameBuffers.ReleaseFrameBuffer(pool.CbPriv, ref frameBufs[cm.NewFbIdx].RawFrameBuffer);
+                frameBufs[cm.NewFbIdx].Released = 1;
+            }
+
+            // Find a free frame buffer. Return error if can not find any.
+            cm.NewFbIdx = cm.GetFreeFb();
+            if (cm.NewFbIdx == RefBuffer.InvalidIdx)
+            {
+                ReadyForNewData = 1;
+                cm.Error.InternalError(CodecErr.MemError, "Unable to find free frame buffer");
+
+                return cm.Error.ErrorCode;
+            }
+
+            // Assign a MV array to the frame buffer.
+            cm.CurFrame = new Ptr<RefCntBuffer>(ref pool.FrameBufs[cm.NewFbIdx]);
+
+            HoldRefBuf = 0;
+
+            // NOTE(review): source[0] is taken even when size == 0 (the "missing frame"
+            // case handled above) - confirm that this is safe for an empty source.
+            DecodeFrame.Decode(allocator, ref this, new ArrayPtr<byte>(ref source[0], (int)size), out psource);
+
+            SwapFrameBuffers();
+
+            // vpx_clear_system_state();
+
+            if (cm.ShowExistingFrame == 0)
+            {
+                cm.LastShowFrame = cm.ShowFrame;
+                cm.PrevFrame = cm.CurFrame;
+
+                // Keep PrevFrameMvs the same length as CurFrameMvs, then snapshot the
+                // current motion vectors into it.
+                if (cm.PrevFrameMvs.IsNull || cm.PrevFrameMvs.Length != cm.CurFrameMvs.Length)
+                {
+                    allocator.Free(cm.PrevFrameMvs);
+                    cm.PrevFrameMvs = allocator.Allocate<MvRef>(cm.CurFrameMvs.Length);
+                }
+
+                cm.CurFrameMvs.AsSpan().CopyTo(cm.PrevFrameMvs.AsSpan());
+                if (cm.Seg.Enabled)
+                {
+                    cm.SwapCurrentAndLastSegMap();
+                }
+            }
+
+            if (cm.ShowFrame != 0)
+            {
+                cm.CurShowFrameFbIdx = cm.NewFbIdx;
+            }
+
+            // Update progress in frame parallel decode.
+            cm.LastWidth = cm.Width;
+            cm.LastHeight = cm.Height;
+            if (cm.ShowFrame != 0)
+            {
+                cm.CurrentVideoFrame++;
+            }
+
+            // NOTE(review): cm.Error.ErrorCode is not consulted here; presumably decode
+            // errors surface another way (e.g. InternalError throwing) - verify.
+            return retcode;
+        }
+
+        // Hands out the most recently decoded frame.
+        // sd: receives Common.FrameToShow when a frame is available.
+        // Returns 0 when sd was written, -1 when no decoded data is pending
+        // (ReadyForNewData was already 1) or the decoded frame is not shown.
+        public int GetRawFrame(ref Surface sd)
+        {
+            const int NoFrame = -1;
+            const int FrameReturned = 0;
+
+            if (ReadyForNewData == 1)
+            {
+                return NoFrame;
+            }
+
+            // From here on the decoder accepts new data, whether or not a frame is shown.
+            ReadyForNewData = 1;
+
+            if (Common.ShowFrame == 0)
+            {
+                return NoFrame;
+            }
+
+            sd = Common.FrameToShow.Value;
+
+            return FrameReturned;
+        }
+
+        // Decodes every frame contained in data. When data ends with a superframe index
+        // the frames are decoded using the sizes listed there; otherwise data is
+        // consumed frame by frame, skipping zero bytes between frames.
+        // data must not be empty: ParseSuperframeIndex asserts a non-zero size.
+        // Returns the first non-Ok code reported by parsing or decoding, else CodecErr.Ok.
+        public CodecErr Decode(MemoryAllocator allocator, ArrayPtr<byte> data)
+        {
+            Array8<uint> frameSizes = new();
+
+            CodecErr status = Types.Decoder.ParseSuperframeIndex(
+                data,
+                (ulong)data.Length,
+                ref frameSizes,
+                out int frameCount);
+
+            if (status != CodecErr.Ok)
+            {
+                return status;
+            }
+
+            ArrayPtr<byte> remaining = data;
+
+            if (frameCount <= 0)
+            {
+                // No superframe index: each call consumes what it needs from remaining.
+                while (remaining.Length != 0)
+                {
+                    uint frameSize = (uint)remaining.Length;
+
+                    status = ReceiveCompressedData(allocator, frameSize, ref remaining);
+                    if (status != CodecErr.Ok)
+                    {
+                        return status;
+                    }
+
+                    // Account for suboptimal termination by the encoder.
+                    while (remaining.Length != 0 && Types.Decoder.ReadMarker(remaining) == 0)
+                    {
+                        remaining = remaining.Slice(1);
+                    }
+                }
+
+                return status;
+            }
+
+            // Decode in serial mode.
+            for (int frame = 0; frame < frameCount; ++frame)
+            {
+                // The callee advances its own copy; the position is advanced below by
+                // the size taken from the index instead.
+                ArrayPtr<byte> frameData = remaining;
+                uint frameSize = frameSizes[frame];
+
+                if (frameSize > (uint)remaining.Length)
+                {
+                    return CodecErr.CorruptFrame;
+                }
+
+                status = ReceiveCompressedData(allocator, frameSize, ref frameData);
+                if (status != CodecErr.Ok)
+                {
+                    return status;
+                }
+
+                remaining = remaining.Slice((int)frameSize);
+            }
+
+            return status;
+        }
+    }
+
+    // Stateless helpers for locating frames inside a compressed chunk.
+    internal static class Decoder
+    {
+        // Returns the first byte of data.
+        public static byte ReadMarker(ArrayPtr<byte> data) => data[0];
+
+        // Looks for a superframe index at the end of data.
+        // data / dataSz: the chunk and its size in bytes; dataSz must not be 0.
+        // sizes: receives one size per frame when an index is found.
+        // count: number of frames listed in the index, 0 when there is none.
+        // Returns CodecErr.CorruptFrame for a malformed index, otherwise CodecErr.Ok.
+        public static CodecErr ParseSuperframeIndex(ArrayPtr<byte> data, ulong dataSz, ref Array8<uint> sizes, out int count)
+        {
+            // A chunk ending with a byte matching 0xc0 is an invalid chunk unless
+            // it is a super frame index. If the last byte of real video compression
+            // data is 0xc0 the encoder must add a 0 byte. If we have the marker but
+            // not the associated matching marker byte at the front of the index we have
+            // an invalid bitstream and need to return an error.
+
+            Debug.Assert(dataSz != 0);
+            byte trailer = ReadMarker(data.Slice((int)dataSz - 1));
+            count = 0;
+
+            bool hasIndexMarker = (trailer & 0xe0) == 0xc0;
+            if (!hasIndexMarker)
+            {
+                return CodecErr.Ok;
+            }
+
+            // The low 3 bits hold (frame count - 1), the next 2 bits hold
+            // (bytes per size entry - 1).
+            uint frames = (uint)(trailer & 0x7) + 1;
+            uint mag = (uint)((trailer >> 3) & 0x3) + 1;
+            ulong indexSz = 2 + (mag * frames);
+
+            // This chunk is marked as having a superframe index but doesn't have
+            // enough data for it, thus it's an invalid superframe index.
+            if (dataSz < indexSz)
+            {
+                return CodecErr.CorruptFrame;
+            }
+
+            // This chunk is marked as having a superframe index but doesn't have
+            // the matching marker byte at the front of the index therefore it's an
+            // invalid chunk.
+            byte leader = ReadMarker(data.Slice((int)(dataSz - indexSz)));
+            if (trailer != leader)
+            {
+                return CodecErr.CorruptFrame;
+            }
+
+            // Found a valid superframe index; the sizes start right after the
+            // leading marker and are stored least significant byte first.
+            ArrayPtr<byte> cursor = data.Slice((int)(dataSz - indexSz + 1));
+
+            for (int frame = 0; frame < frames; ++frame)
+            {
+                uint frameSz = 0;
+
+                for (int shift = 0; shift < mag * 8; shift += 8)
+                {
+                    frameSz |= (uint)cursor[0] << shift;
+                    cursor = cursor.Slice(1);
+                }
+
+                sizes[frame] = frameSz;
+            }
+
+            count = (int)frames;
+
+            return CodecErr.Ok;
+        }
+    }
+}

+ 10 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxCodecFrameBuffer.cs

@@ -0,0 +1,10 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Pairs a byte buffer with a pointer to the InternalFrameBuffer behind it.
+    internal struct VpxCodecFrameBuffer
+    {
+        // The buffer's bytes.
+        public ArrayPtr<byte> Data;
+        // Pointer to the associated InternalFrameBuffer.
+        // NOTE(review): presumably null until a buffer has been assigned - confirm
+        // against the frame buffer callbacks.
+        public Ptr<InternalFrameBuffer> Priv;
+    }
+}

+ 11 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorRange.cs

@@ -0,0 +1,11 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Value range of the decoded samples. The numeric values are spelled out because
+    // the enum is meant to be produced by casting an integer.
+    internal enum VpxColorRange
+    {
+        // Y [16..235], UV [16..240]
+        Studio = 0,
+
+        // YUV/RGB [0..255]
+        Full = 1
+    }
+}

+ 29 - 0
src/Ryujinx.Graphics.Nvdec.Vp9/Types/VpxColorSpace.cs

@@ -0,0 +1,29 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Color space identifiers. The numeric values are spelled out because the enum is
+    // meant to be produced by casting an integer.
+    internal enum VpxColorSpace
+    {
+        // Unknown
+        Unknown = 0,
+
+        // BT.601
+        Bt601 = 1,
+
+        // BT.709
+        Bt709 = 2,
+
+        // SMPTE.170
+        Smpte170 = 3,
+
+        // SMPTE.240
+        Smpte240 = 4,
+
+        // BT.2020
+        Bt2020 = 5,
+
+        // Reserved
+        Reserved = 6,
+
+        // sRGB
+        Srgb = 7
+    }
+}

+ 2 - 0
src/Ryujinx.Graphics.Nvdec/Types/Vp9/PictureInfo.cs

@@ -59,6 +59,8 @@ namespace Ryujinx.Graphics.Nvdec.Types.Vp9
                     Flags.HasFlag(FrameFlags.LastShowFrame) &&
                     !Flags.HasFlag(FrameFlags.LastFrameIsKeyFrame),
                 RefFrameSignBias = RefFrameSignBias,
+                LoopFilterLevel = FirstLevel,
+                LoopFilterSharpnessLevel = SharpnessLevel,
                 BaseQIndex = BaseQIndex,
                 YDcDeltaQ = YDcDeltaQ,
                 UvDcDeltaQ = UvDcDeltaQ,

+ 3 - 1
src/Ryujinx.Graphics.Video/Vp9PictureInfo.cs

@@ -10,6 +10,8 @@ namespace Ryujinx.Graphics.Video
         public bool IsKeyFrame;
         public bool IntraOnly;
         public Array4<sbyte> RefFrameSignBias;
+        public int LoopFilterLevel;
+        public int LoopFilterSharpnessLevel;
         public int BaseQIndex;
         public int YDcDeltaQ;
         public int UvDcDeltaQ;
@@ -36,4 +38,4 @@ namespace Ryujinx.Graphics.Video
         public Vp9EntropyProbs Entropy;
         public Vp9BackwardUpdates BackwardUpdateCounts;
     }
-}
+}

Algúns arquivos non se mostraron porque demasiados arquivos cambiaron neste cambio