ソースを参照

New NVDEC and VIC implementation (#1384)

* Initial NVDEC and VIC implementation

* Update FFmpeg.AutoGen to 4.3.0

* Add nvdec dependencies for Windows

* Unify some VP9 structures

* Rename VP9 structure fields

* Improvements to Video API

* XML docs for Common.Memory

* Remove now unused or redundant overloads from MemoryAccessor

* NVDEC UV surface read/write scalar paths

* Add FIXME comments about hacky things/stuff that will need to be fixed in the future

* Cleaned up VP9 memory allocation

* Remove some debug logs

* Rename some VP9 structs

* Remove unused struct

* No need to compile Ryujinx.Graphics.Host1x with unsafe anymore

* Name AsyncWorkQueue threads to make debugging easier

* Make Vp9PictureInfo a ref struct

* LayoutConverter no longer needs the depth argument (broken by rebase)

* Pooling of VP9 buffers, plus fix a memory leak on VP9

* Really wish VS could rename projects properly...

* Address feedback

* Remove using

* Catch OperationCanceledException

* Add licensing information

* Add THIRDPARTY.md to release too

Co-authored-by: Thog <me@thog.eu>
gdkchan 5 年 前
コミット
4d02a2d2c0
100 ファイル変更16553 行追加46 行削除
  1. 6 0
      README.md
  2. 100 0
      Ryujinx.Common/AsyncWorkQueue.cs
  3. 4 1
      Ryujinx.Common/Logging/LogClass.cs
  4. 123 0
      Ryujinx.Common/Memory/ArrayPtr.cs
  5. 21 0
      Ryujinx.Common/Memory/IArray.cs
  6. 68 0
      Ryujinx.Common/Memory/Ptr.cs
  7. 518 0
      Ryujinx.Common/Memory/StructArrayHelpers.cs
  8. 32 0
      Ryujinx.Cpu/MemoryManager.cs
  9. 29 0
      Ryujinx.Cpu/WritableRegion.cs
  10. 10 0
      Ryujinx.Graphics.Device/AccessControl.cs
  11. 124 0
      Ryujinx.Graphics.Device/DeviceState.cs
  12. 8 0
      Ryujinx.Graphics.Device/IDeviceState.cs
  13. 15 0
      Ryujinx.Graphics.Device/RegisterAttribute.cs
  14. 16 0
      Ryujinx.Graphics.Device/RwCallback.cs
  15. 7 0
      Ryujinx.Graphics.Device/Ryujinx.Graphics.Device.csproj
  16. 63 0
      Ryujinx.Graphics.Device/SizeCalculator.cs
  17. 1 1
      Ryujinx.Graphics.Gpu/Engine/Compute.cs
  18. 5 5
      Ryujinx.Graphics.Gpu/Engine/MethodConditionalRendering.cs
  19. 1 1
      Ryujinx.Graphics.Gpu/Engine/Methods.cs
  20. 1 1
      Ryujinx.Graphics.Gpu/GpuContext.cs
  21. 2 0
      Ryujinx.Graphics.Gpu/Image/TextureManager.cs
  22. 0 36
      Ryujinx.Graphics.Gpu/Memory/MemoryAccessor.cs
  23. 59 1
      Ryujinx.Graphics.Gpu/Memory/MemoryManager.cs
  24. 12 0
      Ryujinx.Graphics.Gpu/Memory/PhysicalMemory.cs
  25. 20 0
      Ryujinx.Graphics.Host1x/ClassId.cs
  26. 32 0
      Ryujinx.Graphics.Host1x/Devices.cs
  27. 33 0
      Ryujinx.Graphics.Host1x/Host1xClass.cs
  28. 41 0
      Ryujinx.Graphics.Host1x/Host1xClassRegisters.cs
  29. 123 0
      Ryujinx.Graphics.Host1x/Host1xDevice.cs
  30. 21 0
      Ryujinx.Graphics.Host1x/OpCode.cs
  31. 20 0
      Ryujinx.Graphics.Host1x/Ryujinx.Graphics.Host1x.csproj
  32. 99 0
      Ryujinx.Graphics.Host1x/SyncptIncrManager.cs
  33. 96 0
      Ryujinx.Graphics.Host1x/ThiDevice.cs
  34. 22 0
      Ryujinx.Graphics.Host1x/ThiRegisters.cs
  35. 40 0
      Ryujinx.Graphics.Nvdec.H264/Decoder.cs
  36. 51 0
      Ryujinx.Graphics.Nvdec.H264/FFmpegContext.cs
  37. 121 0
      Ryujinx.Graphics.Nvdec.H264/H264BitStreamWriter.cs
  38. 23 0
      Ryujinx.Graphics.Nvdec.H264/Ryujinx.Graphics.Nvdec.H264.csproj
  39. 159 0
      Ryujinx.Graphics.Nvdec.H264/SpsAndPpsReconstruction.cs
  40. 33 0
      Ryujinx.Graphics.Nvdec.H264/Surface.cs
  41. 9 0
      Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs
  42. 56 0
      Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs
  43. 59 0
      Ryujinx.Graphics.Nvdec.Vp9/Common/BitUtils.cs
  44. 94 0
      Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryAllocator.cs
  45. 25 0
      Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryUtil.cs
  46. 71 0
      Ryujinx.Graphics.Nvdec.Vp9/Constants.cs
  47. 1190 0
      Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs
  48. 1159 0
      Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs
  49. 164 0
      Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs
  50. 325 0
      Ryujinx.Graphics.Nvdec.Vp9/Detokenize.cs
  51. 949 0
      Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs
  52. 12 0
      Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs
  53. 1379 0
      Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs
  54. 2868 0
      Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs
  55. 73 0
      Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs
  56. 237 0
      Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs
  57. 54 0
      Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs
  58. 536 0
      Ryujinx.Graphics.Nvdec.Vp9/Idct.cs
  59. 15 0
      Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs
  60. 14 0
      Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs
  61. 418 0
      Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs
  62. 1612 0
      Ryujinx.Graphics.Nvdec.Vp9/Luts.cs
  63. 389 0
      Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs
  64. 203 0
      Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs
  65. 234 0
      Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs
  66. 761 0
      Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs
  67. 20 0
      Ryujinx.Graphics.Nvdec.Vp9/Ryujinx.Graphics.Nvdec.Vp9.csproj
  68. 10 0
      Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs
  69. 15 0
      Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs
  70. 10 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs
  71. 21 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs
  72. 10 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs
  73. 8 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs
  74. 27 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs
  75. 10 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs
  76. 24 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs
  77. 13 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs
  78. 179 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs
  79. 21 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs
  80. 66 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs
  81. 14 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs
  82. 189 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs
  83. 8 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs
  84. 17 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs
  85. 10 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs
  86. 10 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs
  87. 12 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs
  88. 9 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs
  89. 14 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs
  90. 21 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs
  91. 8 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs
  92. 10 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs
  93. 451 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs
  94. 11 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs
  95. 71 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs
  96. 80 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs
  97. 85 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs
  98. 12 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs
  99. 11 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs
  100. 11 0
      Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs

+ 6 - 0
README.md

@@ -112,3 +112,9 @@ If you need help with setting up Ryujinx, you can ask questions in the #support
 If you have contributions, need support, have suggestions, or just want to get in touch with the team, join our [Discord server](https://discord.gg/N2FmfVc)!
 
 If you'd like to donate, please take a look at our [Patreon](https://www.patreon.com/ryujinx).
+
+## License
+
+This software is licensed under the terms of the MIT license.
+This project makes use of code authored by the libvpx project, licensed under BSD, and the FFmpeg project, licensed under LGPLv3.
+See [LICENSE.txt](LICENSE.txt) and [THIRDPARTY.md](Ryujinx/THIRDPARTY.md) for more details.

+ 100 - 0
Ryujinx.Common/AsyncWorkQueue.cs

@@ -0,0 +1,100 @@
+using System;
+using System.Collections.Concurrent;
+using System.Threading;
+
+namespace Ryujinx.Common
+{
+    /// <summary>
+    /// Queue that hands work items, one at a time, to a callback running on a dedicated background thread.
+    /// </summary>
+    /// <typeparam name="T">Type of the work items</typeparam>
+    public sealed class AsyncWorkQueue<T> : IDisposable
+    {
+        private readonly Thread _workerThread;
+        private readonly CancellationTokenSource _cts;
+        private readonly Action<T> _workerAction;
+        private readonly BlockingCollection<T> _queue;
+
+        // Set to 1 by the first Dispose call so that any later call is a no-op.
+        private int _disposed;
+
+        /// <summary>
+        /// True if cancellation of the worker has been requested, false otherwise.
+        /// </summary>
+        public bool IsCancellationRequested => _cts.IsCancellationRequested;
+
+        /// <summary>
+        /// Creates a new work queue backed by a new <see cref="BlockingCollection{T}"/>.
+        /// </summary>
+        /// <param name="callback">Action invoked on the worker thread for each work item</param>
+        /// <param name="name">Name given to the worker thread, may be null</param>
+        /// <exception cref="ArgumentNullException"><paramref name="callback"/> is null</exception>
+        public AsyncWorkQueue(Action<T> callback, string name = null) : this(callback, name, new BlockingCollection<T>())
+        {
+        }
+
+        /// <summary>
+        /// Creates a new work queue that consumes items from the given collection.
+        /// </summary>
+        /// <param name="callback">Action invoked on the worker thread for each work item</param>
+        /// <param name="name">Name given to the worker thread, may be null</param>
+        /// <param name="collection">Collection the work items are added to and consumed from</param>
+        /// <exception cref="ArgumentNullException"><paramref name="callback"/> or <paramref name="collection"/> is null</exception>
+        public AsyncWorkQueue(Action<T> callback, string name, BlockingCollection<T> collection)
+        {
+            // Fail on the calling thread instead of with a NullReferenceException on the worker thread.
+            _workerAction = callback ?? throw new ArgumentNullException(nameof(callback));
+            _queue = collection ?? throw new ArgumentNullException(nameof(collection));
+            _cts = new CancellationTokenSource();
+            _workerThread = new Thread(DoWork) { Name = name, IsBackground = true };
+
+            _workerThread.Start();
+        }
+
+        /// <summary>
+        /// Worker thread entry point, runs the callback for each queued item until
+        /// adding is completed or cancellation is requested.
+        /// </summary>
+        private void DoWork()
+        {
+            try
+            {
+                foreach (var item in _queue.GetConsumingEnumerable(_cts.Token))
+                {
+                    _workerAction(item);
+                }
+            }
+            catch (OperationCanceledException)
+            {
+                // Cancellation is the expected way to stop the worker, so just let the thread exit.
+            }
+        }
+
+        /// <summary>
+        /// Requests cancellation of the worker.
+        /// </summary>
+        public void Cancel()
+        {
+            _cts.Cancel();
+        }
+
+        /// <summary>
+        /// Requests cancellation of the worker after the given delay.
+        /// </summary>
+        /// <param name="millisecondsDelay">Delay in milliseconds before cancellation is requested</param>
+        public void CancelAfter(int millisecondsDelay)
+        {
+            _cts.CancelAfter(millisecondsDelay);
+        }
+
+        /// <summary>
+        /// Requests cancellation of the worker after the given delay.
+        /// </summary>
+        /// <param name="delay">Delay before cancellation is requested</param>
+        public void CancelAfter(TimeSpan delay)
+        {
+            _cts.CancelAfter(delay);
+        }
+
+        /// <summary>
+        /// Adds a work item to the queue.
+        /// </summary>
+        /// <param name="workItem">Item to be added</param>
+        public void Add(T workItem)
+        {
+            _queue.Add(workItem);
+        }
+
+        /// <summary>
+        /// Adds a work item to the queue, observing a cancellation token.
+        /// </summary>
+        /// <param name="workItem">Item to be added</param>
+        /// <param name="cancellationToken">Token observed while adding</param>
+        public void Add(T workItem, CancellationToken cancellationToken)
+        {
+            _queue.Add(workItem, cancellationToken);
+        }
+
+        /// <summary>
+        /// Tries to add a work item to the queue.
+        /// </summary>
+        /// <param name="workItem">Item to be added</param>
+        /// <returns>True if the item was added, false otherwise</returns>
+        public bool TryAdd(T workItem)
+        {
+            return _queue.TryAdd(workItem);
+        }
+
+        /// <summary>
+        /// Tries to add a work item to the queue within the given time.
+        /// </summary>
+        /// <param name="workItem">Item to be added</param>
+        /// <param name="millisecondsDelay">Maximum time to wait, in milliseconds</param>
+        /// <returns>True if the item was added, false otherwise</returns>
+        public bool TryAdd(T workItem, int millisecondsDelay)
+        {
+            return _queue.TryAdd(workItem, millisecondsDelay);
+        }
+
+        /// <summary>
+        /// Tries to add a work item to the queue within the given time, observing a cancellation token.
+        /// </summary>
+        /// <param name="workItem">Item to be added</param>
+        /// <param name="millisecondsDelay">Maximum time to wait, in milliseconds</param>
+        /// <param name="cancellationToken">Token observed while adding</param>
+        /// <returns>True if the item was added, false otherwise</returns>
+        public bool TryAdd(T workItem, int millisecondsDelay, CancellationToken cancellationToken)
+        {
+            return _queue.TryAdd(workItem, millisecondsDelay, cancellationToken);
+        }
+
+        /// <summary>
+        /// Tries to add a work item to the queue within the given time.
+        /// </summary>
+        /// <param name="workItem">Item to be added</param>
+        /// <param name="timeout">Maximum time to wait</param>
+        /// <returns>True if the item was added, false otherwise</returns>
+        public bool TryAdd(T workItem, TimeSpan timeout)
+        {
+            return _queue.TryAdd(workItem, timeout);
+        }
+
+        /// <summary>
+        /// Stops the worker thread, waits for it to exit and releases the queue resources.
+        /// Calling this more than once has no effect.
+        /// </summary>
+        public void Dispose()
+        {
+            // Only the first caller tears the queue down; a repeated call would
+            // otherwise throw ObjectDisposedException from the disposed collection.
+            if (Interlocked.Exchange(ref _disposed, 1) != 0)
+            {
+                return;
+            }
+
+            _queue.CompleteAdding();
+            _cts.Cancel();
+            _workerThread.Join();
+
+            _queue.Dispose();
+            _cts.Dispose();
+        }
+    }
+}

+ 4 - 1
Ryujinx.Common/Logging/LogClass.cs

@@ -9,12 +9,14 @@ namespace Ryujinx.Common.Logging
         Emulation,
         Gpu,
         Hid,
+        Host1x,
         Kernel,
         KernelIpc,
         KernelScheduler,
         KernelSvc,
         Loader,
         ModLoader,
+        Nvdec,
         Ptc,
         Service,
         ServiceAcc,
@@ -50,6 +52,7 @@ namespace Ryujinx.Common.Logging
         ServiceSss,
         ServiceTime,
         ServiceVi,
-        SurfaceFlinger
+        SurfaceFlinger,
+        Vic
     }
 }

+ 123 - 0
Ryujinx.Common/Memory/ArrayPtr.cs

@@ -0,0 +1,123 @@
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Common.Memory
+{
+    /// <summary>
+    /// Base address of an unmanaged array together with its element count.
+    /// </summary>
+    /// <typeparam name="T">Array element type</typeparam>
+    public unsafe struct ArrayPtr<T> : IEquatable<ArrayPtr<T>>, IArray<T> where T : unmanaged
+    {
+        private IntPtr _ptr;
+
+        /// <summary>
+        /// Array with a zero base address and no elements.
+        /// </summary>
+        public static ArrayPtr<T> Null => new ArrayPtr<T>(IntPtr.Zero, 0);
+
+        /// <summary>
+        /// Whether the base address is zero.
+        /// </summary>
+        public bool IsNull
+        {
+            get { return IntPtr.Zero == _ptr; }
+        }
+
+        /// <summary>
+        /// Element count of the array.
+        /// </summary>
+        public int Length { get; }
+
+        /// <summary>
+        /// Returns a reference to the element located <paramref name="index"/> elements past the base address.
+        /// </summary>
+        /// <remarks>
+        /// The index is not validated against <see cref="Length"/>. Negative indices are
+        /// therefore accepted, and an out of bounds index yields a reference outside of the array.
+        /// </remarks>
+        /// <param name="index">Index of the element</param>
+        /// <returns>Reference to the element at the given index</returns>
+        public ref T this[int index]
+        {
+            get
+            {
+                T* element = (T*)_ptr + index;
+
+                return ref Unsafe.AsRef<T>(element);
+            }
+        }
+
+        /// <summary>
+        /// Creates an array that starts at the location of the given reference.
+        /// </summary>
+        /// <remarks>
+        /// Only the address is stored. Data living on the managed heap must stay pinned
+        /// for as long as the array is used, otherwise memory corruption and crashes will follow.
+        /// </remarks>
+        /// <param name="value">Reference of the first array element</param>
+        /// <param name="length">Number of elements on the array</param>
+        public ArrayPtr(ref T value, int length)
+        {
+            void* address = Unsafe.AsPointer(ref value);
+
+            _ptr = new IntPtr(address);
+            Length = length;
+        }
+
+        /// <summary>
+        /// Creates an array from a typed base pointer.
+        /// </summary>
+        /// <param name="ptr">Array base pointer</param>
+        /// <param name="length">Number of elements on the array</param>
+        public ArrayPtr(T* ptr, int length)
+        {
+            _ptr = new IntPtr(ptr);
+            Length = length;
+        }
+
+        /// <summary>
+        /// Creates an array from an untyped base address.
+        /// </summary>
+        /// <param name="ptr">Array base pointer</param>
+        /// <param name="length">Number of elements on the array</param>
+        public ArrayPtr(IntPtr ptr, int length)
+        {
+            _ptr = ptr;
+            Length = length;
+        }
+
+        /// <summary>
+        /// Creates an array that begins at the given element and runs to the end of this array.
+        /// </summary>
+        /// <param name="start">Index where the new array should start</param>
+        /// <returns>New array starting at the specified position</returns>
+        public ArrayPtr<T> Slice(int start)
+        {
+            return new ArrayPtr<T>((T*)_ptr + start, Length - start);
+        }
+
+        /// <summary>
+        /// Creates a span covering all elements of the array.
+        /// </summary>
+        /// <returns>Span of the array, or an empty span if the array has no elements</returns>
+        public Span<T> ToSpan()
+        {
+            if (Length == 0)
+            {
+                return Span<T>.Empty;
+            }
+
+            return MemoryMarshal.CreateSpan(ref this[0], Length);
+        }
+
+        /// <summary>
+        /// Returns the base address as a typed pointer.
+        /// </summary>
+        /// <returns>Base pointer</returns>
+        public T* ToPointer()
+        {
+            return (T*)_ptr;
+        }
+
+        public override bool Equals(object obj)
+        {
+            if (obj is ArrayPtr<T> other)
+            {
+                return Equals(other);
+            }
+
+            return false;
+        }
+
+        public bool Equals([AllowNull] ArrayPtr<T> other) => Length == other.Length && _ptr == other._ptr;
+
+        public override int GetHashCode() => HashCode.Combine(_ptr, Length);
+
+        public static bool operator ==(ArrayPtr<T> left, ArrayPtr<T> right) => left.Equals(right);
+
+        public static bool operator !=(ArrayPtr<T> left, ArrayPtr<T> right) => !left.Equals(right);
+    }
+}

+ 21 - 0
Ryujinx.Common/Memory/IArray.cs

@@ -0,0 +1,21 @@
+namespace Ryujinx.Common.Memory
+{
+    /// <summary>
+    /// Contract shared by the array-like types: an element count and by-reference element access.
+    /// </summary>
+    /// <typeparam name="T">Element type</typeparam>
+    public interface IArray<T> where T : unmanaged
+    {
+        /// <summary>
+        /// Element count of the array.
+        /// </summary>
+        int Length { get; }
+
+        /// <summary>
+        /// Accesses an element of the array by reference.
+        /// </summary>
+        /// <param name="index">Element index</param>
+        /// <returns>Reference to the element at the specified index</returns>
+        ref T this[int index] { get; }
+    }
+}

+ 68 - 0
Ryujinx.Common/Memory/Ptr.cs

@@ -0,0 +1,68 @@
+using System;
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.CompilerServices;
+
+namespace Ryujinx.Common.Memory
+{
+    /// <summary>
+    /// Wraps the address of a single unmanaged value.
+    /// </summary>
+    /// <typeparam name="T">Type of the unmanaged resource</typeparam>
+    public unsafe struct Ptr<T> : IEquatable<Ptr<T>> where T : unmanaged
+    {
+        private IntPtr _ptr;
+
+        /// <summary>
+        /// Pointer whose address is zero.
+        /// </summary>
+        public static Ptr<T> Null => default;
+
+        /// <summary>
+        /// Whether the stored address is zero.
+        /// </summary>
+        public bool IsNull
+        {
+            get { return IntPtr.Zero == _ptr; }
+        }
+
+        /// <summary>
+        /// Reference to the value located at the stored address.
+        /// </summary>
+        public ref T Value
+        {
+            get { return ref Unsafe.AsRef<T>(_ptr.ToPointer()); }
+        }
+
+        /// <summary>
+        /// Creates a pointer holding the address of the given reference.
+        /// </summary>
+        /// <remarks>
+        /// Only the address is stored. Data living on the managed heap must stay pinned
+        /// for as long as the pointer is used, otherwise memory corruption and crashes will follow.
+        /// </remarks>
+        /// <param name="value">Reference to the unmanaged resource</param>
+        public Ptr(ref T value)
+        {
+            void* address = Unsafe.AsPointer(ref value);
+
+            _ptr = new IntPtr(address);
+        }
+
+        public override bool Equals(object obj)
+        {
+            if (obj is Ptr<T> other)
+            {
+                return Equals(other);
+            }
+
+            return false;
+        }
+
+        public bool Equals([AllowNull] Ptr<T> other) => other._ptr == _ptr;
+
+        public override int GetHashCode() => _ptr.GetHashCode();
+
+        public static bool operator ==(Ptr<T> left, Ptr<T> right) => left.Equals(right);
+
+        public static bool operator !=(Ptr<T> left, Ptr<T> right) => !left.Equals(right);
+    }
+}

+ 518 - 0
Ryujinx.Common/Memory/StructArrayHelpers.cs

@@ -0,0 +1,518 @@
+using System;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Common.Memory
+{
+    /// <summary>Inline array of 1 <typeparamref name="T"/> element.</summary>
+    public struct Array1<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Sole element; ToSpan is created over this field.
+
+        public int Length { get { return 1; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 2 <typeparamref name="T"/> elements.</summary>
+    public struct Array2<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array1<T> _rest; // Never read by name; the 2-element span over _first relies on it following in memory.
+
+        public int Length { get { return 2; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 3 <typeparamref name="T"/> elements.</summary>
+    public struct Array3<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array2<T> _rest; // Never read by name; the 3-element span over _first relies on it following in memory.
+
+        public int Length { get { return 3; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 4 <typeparamref name="T"/> elements.</summary>
+    public struct Array4<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array3<T> _rest; // Never read by name; the 4-element span over _first relies on it following in memory.
+
+        public int Length { get { return 4; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 5 <typeparamref name="T"/> elements.</summary>
+    public struct Array5<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array4<T> _rest; // Never read by name; the 5-element span over _first relies on it following in memory.
+
+        public int Length { get { return 5; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 6 <typeparamref name="T"/> elements.</summary>
+    public struct Array6<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array5<T> _rest; // Never read by name; the 6-element span over _first relies on it following in memory.
+
+        public int Length { get { return 6; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 7 <typeparamref name="T"/> elements.</summary>
+    public struct Array7<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array6<T> _rest; // Never read by name; the 7-element span over _first relies on it following in memory.
+
+        public int Length { get { return 7; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 8 <typeparamref name="T"/> elements.</summary>
+    public struct Array8<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array7<T> _rest; // Never read by name; the 8-element span over _first relies on it following in memory.
+
+        public int Length { get { return 8; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 9 <typeparamref name="T"/> elements.</summary>
+    public struct Array9<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array8<T> _rest; // Never read by name; the 9-element span over _first relies on it following in memory.
+
+        public int Length { get { return 9; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 10 <typeparamref name="T"/> elements.</summary>
+    public struct Array10<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array9<T> _rest; // Never read by name; the 10-element span over _first relies on it following in memory.
+
+        public int Length { get { return 10; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 11 <typeparamref name="T"/> elements.</summary>
+    public struct Array11<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array10<T> _rest; // Never read by name; the 11-element span over _first relies on it following in memory.
+
+        public int Length { get { return 11; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 12 <typeparamref name="T"/> elements.</summary>
+    public struct Array12<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array11<T> _rest; // Never read by name; the 12-element span over _first relies on it following in memory.
+
+        public int Length { get { return 12; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 13 <typeparamref name="T"/> elements.</summary>
+    public struct Array13<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array12<T> _rest; // Never read by name; the 13-element span over _first relies on it following in memory.
+
+        public int Length { get { return 13; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 14 <typeparamref name="T"/> elements.</summary>
+    public struct Array14<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array13<T> _rest; // Never read by name; the 14-element span over _first relies on it following in memory.
+
+        public int Length { get { return 14; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 15 <typeparamref name="T"/> elements.</summary>
+    public struct Array15<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array14<T> _rest; // Never read by name; the 15-element span over _first relies on it following in memory.
+
+        public int Length { get { return 15; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 16 <typeparamref name="T"/> elements.</summary>
+    public struct Array16<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array15<T> _rest; // Never read by name; the 16-element span over _first relies on it following in memory.
+
+        public int Length { get { return 16; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 17 <typeparamref name="T"/> elements.</summary>
+    public struct Array17<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array16<T> _rest; // Never read by name; the 17-element span over _first relies on it following in memory.
+
+        public int Length { get { return 17; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 18 <typeparamref name="T"/> elements.</summary>
+    public struct Array18<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array17<T> _rest; // Never read by name; the 18-element span over _first relies on it following in memory.
+
+        public int Length { get { return 18; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 19 <typeparamref name="T"/> elements.</summary>
+    public struct Array19<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array18<T> _rest; // Never read by name; the 19-element span over _first relies on it following in memory.
+
+        public int Length { get { return 19; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 20 <typeparamref name="T"/> elements.</summary>
+    public struct Array20<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array19<T> _rest; // Never read by name; the 20-element span over _first relies on it following in memory.
+
+        public int Length { get { return 20; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 21 <typeparamref name="T"/> elements.</summary>
+    public struct Array21<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array20<T> _rest; // Never read by name; the 21-element span over _first relies on it following in memory.
+
+        public int Length { get { return 21; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 22 <typeparamref name="T"/> elements.</summary>
+    public struct Array22<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array21<T> _rest; // Never read by name; the 22-element span over _first relies on it following in memory.
+
+        public int Length { get { return 22; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 23 <typeparamref name="T"/> elements.</summary>
+    public struct Array23<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array22<T> _rest; // Never read by name; the 23-element span over _first relies on it following in memory.
+
+        public int Length { get { return 23; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 24 <typeparamref name="T"/> elements.</summary>
+    public struct Array24<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array23<T> _rest; // Never read by name; the 24-element span over _first relies on it following in memory.
+
+        public int Length { get { return 24; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 25 <typeparamref name="T"/> elements.</summary>
+    public struct Array25<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array24<T> _rest; // Never read by name; the 25-element span over _first relies on it following in memory.
+
+        public int Length { get { return 25; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 26 <typeparamref name="T"/> elements.</summary>
+    public struct Array26<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array25<T> _rest; // Never read by name; the 26-element span over _first relies on it following in memory.
+
+        public int Length { get { return 26; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 27 <typeparamref name="T"/> elements.</summary>
+    public struct Array27<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array26<T> _rest; // Never read by name; the 27-element span over _first relies on it following in memory.
+
+        public int Length { get { return 27; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 28 <typeparamref name="T"/> elements.</summary>
+    public struct Array28<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array27<T> _rest; // Never read by name; the 28-element span over _first relies on it following in memory.
+
+        public int Length { get { return 28; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 29 <typeparamref name="T"/> elements.</summary>
+    public struct Array29<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array28<T> _rest; // Never read by name; the 29-element span over _first relies on it following in memory.
+
+        public int Length { get { return 29; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 30 <typeparamref name="T"/> elements.</summary>
+    public struct Array30<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array29<T> _rest; // Never read by name; the 30-element span over _first relies on it following in memory.
+
+        public int Length { get { return 30; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 31 <typeparamref name="T"/> elements.</summary>
+    public struct Array31<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array30<T> _rest; // Never read by name; the 31-element span over _first relies on it following in memory.
+
+        public int Length { get { return 31; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 32 <typeparamref name="T"/> elements.</summary>
+    public struct Array32<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array31<T> _rest; // Never read by name; the 32-element span over _first relies on it following in memory.
+
+        public int Length { get { return 32; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 33 <typeparamref name="T"/> elements.</summary>
+    public struct Array33<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array32<T> _rest; // Never read by name; the 33-element span over _first relies on it following in memory.
+
+        public int Length { get { return 33; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 34 <typeparamref name="T"/> elements.</summary>
+    public struct Array34<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array33<T> _rest; // Never read by name; the 34-element span over _first relies on it following in memory.
+
+        public int Length { get { return 34; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 35 <typeparamref name="T"/> elements.</summary>
+    public struct Array35<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array34<T> _rest; // Never read by name; the 35-element span over _first relies on it following in memory.
+
+        public int Length { get { return 35; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 36 <typeparamref name="T"/> elements.</summary>
+    public struct Array36<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array35<T> _rest; // Never read by name; the 36-element span over _first relies on it following in memory.
+
+        public int Length { get { return 36; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 37 <typeparamref name="T"/> elements.</summary>
+    public struct Array37<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array36<T> _rest; // Never read by name; the 37-element span over _first relies on it following in memory.
+
+        public int Length { get { return 37; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 38 <typeparamref name="T"/> elements.</summary>
+    public struct Array38<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array37<T> _rest; // Never read by name; the 38-element span over _first relies on it following in memory.
+
+        public int Length { get { return 38; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 39 <typeparamref name="T"/> elements.</summary>
+    public struct Array39<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array38<T> _rest; // Never read by name; the 39-element span over _first relies on it following in memory.
+
+        public int Length { get { return 39; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 40 <typeparamref name="T"/> elements.</summary>
+    public struct Array40<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array39<T> _rest; // Never read by name; the 40-element span over _first relies on it following in memory.
+
+        public int Length { get { return 40; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 41 <typeparamref name="T"/> elements.</summary>
+    public struct Array41<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array40<T> _rest; // Never read by name; the 41-element span over _first relies on it following in memory.
+
+        public int Length { get { return 41; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 42 <typeparamref name="T"/> elements.</summary>
+    public struct Array42<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array41<T> _rest; // Never read by name; the 42-element span over _first relies on it following in memory.
+
+        public int Length { get { return 42; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 43 <typeparamref name="T"/> elements.</summary>
+    public struct Array43<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array42<T> _rest; // Never read by name; the 43-element span over _first relies on it following in memory.
+
+        public int Length { get { return 43; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 44 <typeparamref name="T"/> elements.</summary>
+    public struct Array44<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array43<T> _rest; // Never read by name; the 44-element span over _first relies on it following in memory.
+
+        public int Length { get { return 44; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 45 <typeparamref name="T"/> elements.</summary>
+    public struct Array45<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array44<T> _rest; // Never read by name; the 45-element span over _first relies on it following in memory.
+
+        public int Length { get { return 45; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 46 <typeparamref name="T"/> elements.</summary>
+    public struct Array46<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array45<T> _rest; // Never read by name; the 46-element span over _first relies on it following in memory.
+
+        public int Length { get { return 46; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 47 <typeparamref name="T"/> elements.</summary>
+    public struct Array47<T> : IArray<T> where T : unmanaged
+    {
+        private T _first; // Element 0; ToSpan starts here.
+        private Array46<T> _rest; // Never read by name; the 47-element span over _first relies on it following in memory.
+
+        public int Length { get { return 47; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _first, Length); }
+    }
+    /// <summary>Inline array of 48 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array47{T}"/>.</summary>
+    public struct Array48<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array47<T> _other;
+        public int Length { get { return 48; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 48); }
+    }
+    /// <summary>Inline array of 49 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array48{T}"/>.</summary>
+    public struct Array49<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array48<T> _other;
+        public int Length { get { return 49; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 49); }
+    }
+    /// <summary>Inline array of 50 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array49{T}"/>.</summary>
+    public struct Array50<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array49<T> _other;
+        public int Length { get { return 50; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 50); }
+    }
+    /// <summary>Inline array of 51 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array50{T}"/>.</summary>
+    public struct Array51<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array50<T> _other;
+        public int Length { get { return 51; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 51); }
+    }
+    /// <summary>Inline array of 52 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array51{T}"/>.</summary>
+    public struct Array52<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array51<T> _other;
+        public int Length { get { return 52; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 52); }
+    }
+    /// <summary>Inline array of 53 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array52{T}"/>.</summary>
+    public struct Array53<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array52<T> _other;
+        public int Length { get { return 53; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 53); }
+    }
+    /// <summary>Inline array of 54 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array53{T}"/>.</summary>
+    public struct Array54<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array53<T> _other;
+        public int Length { get { return 54; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 54); }
+    }
+    /// <summary>Inline array of 55 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array54{T}"/>.</summary>
+    public struct Array55<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array54<T> _other;
+        public int Length { get { return 55; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 55); }
+    }
+    /// <summary>Inline array of 56 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array55{T}"/>.</summary>
+    public struct Array56<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array55<T> _other;
+        public int Length { get { return 56; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 56); }
+    }
+    /// <summary>Inline array of 57 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array56{T}"/>.</summary>
+    public struct Array57<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array56<T> _other;
+        public int Length { get { return 57; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 57); }
+    }
+    /// <summary>Inline array of 58 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array57{T}"/>.</summary>
+    public struct Array58<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array57<T> _other;
+        public int Length { get { return 58; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 58); }
+    }
+    /// <summary>Inline array of 59 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array58{T}"/>.</summary>
+    public struct Array59<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array58<T> _other;
+        public int Length { get { return 59; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 59); }
+    }
+    /// <summary>Inline array of 60 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array59{T}"/>.</summary>
+    public struct Array60<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array59<T> _other;
+        public int Length { get { return 60; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 60); }
+    }
+    /// <summary>Inline array of 61 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array60{T}"/>.</summary>
+    public struct Array61<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array60<T> _other;
+        public int Length { get { return 61; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 61); }
+    }
+    /// <summary>Inline array of 62 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array61{T}"/>.</summary>
+    public struct Array62<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array61<T> _other;
+        public int Length { get { return 62; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 62); }
+    }
+    /// <summary>Inline array of 63 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array62{T}"/>.</summary>
+    public struct Array63<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array62<T> _other;
+        public int Length { get { return 63; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 63); }
+    }
+    /// <summary>Inline array of 64 <typeparamref name="T"/> elements, declared as a head element followed by an <see cref="Array63{T}"/>.</summary>
+    public struct Array64<T> : IArray<T> where T : unmanaged
+    {
+        T _e0;
+        Array63<T> _other;
+        public int Length { get { return 64; } }
+        public ref T this[int index] { get { return ref ToSpan()[index]; } }
+        public Span<T> ToSpan() { return MemoryMarshal.CreateSpan(ref _e0, 64); }
+    }
+
+}

+ 32 - 0
Ryujinx.Cpu/MemoryManager.cs

@@ -193,6 +193,38 @@ namespace Ryujinx.Cpu
             }
         }
 
+        /// <summary>
+        /// Obtains a writable view of a range of guest memory.
+        /// </summary>
+        /// <remarks>
+        /// A range that is contiguous in physical memory is exposed directly from the
+        /// backing memory. Otherwise the current data is copied into a newly allocated
+        /// buffer, which is written back to guest memory when the region is disposed.
+        /// </remarks>
+        /// <param name="va">Virtual address of the data</param>
+        /// <param name="size">Size of the data</param>
+        /// <returns>A writable region of memory containing the data</returns>
+        public WritableRegion GetWritableRegion(ulong va, int size)
+        {
+            // An empty range needs neither backing memory nor a write-back.
+            if (size == 0)
+            {
+                return new WritableRegion(null, va, Memory<byte>.Empty);
+            }
+
+            if (!IsContiguous(va, size))
+            {
+                // Work on a copy; handing over this manager makes the region flush it on disposal.
+                byte[] copy = new byte[size];
+
+                GetSpan(va, size).CopyTo(copy);
+
+                return new WritableRegion(this, va, copy);
+            }
+
+            // Contiguous range: writes go straight to the backing memory, so no write-back is needed.
+            Memory<byte> direct = _backingMemory.GetMemory(GetPhysicalAddressInternal(va), size);
+
+            return new WritableRegion(null, va, direct);
+        }
+
         /// <summary>
         /// Gets a reference for the given type at the specified virtual memory address.
         /// </summary>

+ 29 - 0
Ryujinx.Cpu/WritableRegion.cs

@@ -0,0 +1,29 @@
+using System;
+
+namespace Ryujinx.Cpu
+{
+    /// <summary>
+    /// A block of memory that can be written to, and that is optionally written back
+    /// through a memory manager when disposed.
+    /// </summary>
+    public sealed class WritableRegion : IDisposable
+    {
+        // Memory manager used for the write-back; null when no write-back is required.
+        private readonly MemoryManager _mm;
+        private readonly ulong _va;
+
+        private bool NeedsWriteback
+        {
+            get { return _mm != null; }
+        }
+
+        /// <summary>
+        /// Memory holding the data of the region.
+        /// </summary>
+        public Memory<byte> Memory { get; }
+
+        /// <summary>
+        /// Creates a new writable region.
+        /// </summary>
+        /// <param name="mm">Memory manager to write the data back to on disposal, or null for no write-back</param>
+        /// <param name="va">Virtual address of the region</param>
+        /// <param name="memory">Memory holding the data of the region</param>
+        internal WritableRegion(MemoryManager mm, ulong va, Memory<byte> memory)
+        {
+            Memory = memory;
+            _va = va;
+            _mm = mm;
+        }
+
+        /// <summary>
+        /// Writes the data back at the region's virtual address if a memory manager was supplied.
+        /// </summary>
+        public void Dispose()
+        {
+            if (!NeedsWriteback)
+            {
+                return;
+            }
+
+            _mm.Write(_va, Memory.Span);
+        }
+    }
+}

+ 10 - 0
Ryujinx.Graphics.Device/AccessControl.cs

@@ -0,0 +1,10 @@
+namespace Ryujinx.Graphics.Device
+{
+    /// <summary>
+    /// Access permissions of a device register.
+    /// The values are bit flags: <see cref="ReadWrite"/> is the combination of
+    /// <see cref="ReadOnly"/> and <see cref="WriteOnly"/>.
+    /// </summary>
+    [System.Flags]
+    public enum AccessControl
+    {
+        None      = 0,
+        ReadOnly  = 1 << 0,
+        WriteOnly = 1 << 1,
+        ReadWrite = ReadOnly | WriteOnly
+    }
+}

+ 124 - 0
Ryujinx.Graphics.Device/DeviceState.cs

@@ -0,0 +1,124 @@
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Linq;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+
+namespace Ryujinx.Graphics.Device
+{
+    /// <summary>
+    /// Register state of a device, laid out as described by <typeparamref name="TState"/>.
+    /// </summary>
+    /// <typeparam name="TState">Unmanaged structure whose public fields define the registers</typeparam>
+    public class DeviceState<TState> : IDeviceState where TState : unmanaged
+    {
+        // Registers are accessed in units of 32 bits.
+        private const int RegisterSize = sizeof(int);
+
+        /// <summary>
+        /// Raw register values.
+        /// </summary>
+        public TState State;
+
+        // One bit per 32-bit register, set when the register can be read or written.
+        private readonly BitArray _readableRegisters;
+        private readonly BitArray _writableRegisters;
+
+        // Callbacks keyed by the byte offset of the field they were registered for.
+        private readonly Dictionary<int, Func<int>> _readCallbacks;
+        private readonly Dictionary<int, Action<int>> _writeCallbacks;
+
+        /// <summary>
+        /// Creates a new device state, deriving the access permissions of each register
+        /// from the <see cref="RegisterAttribute"/> of the <typeparamref name="TState"/> fields.
+        /// Fields without the attribute are both readable and writable.
+        /// </summary>
+        /// <param name="callbacks">Optional read/write callbacks, keyed by the name of the field they handle</param>
+        public DeviceState(IReadOnlyDictionary<string, RwCallback> callbacks = null)
+        {
+            int size = (Unsafe.SizeOf<TState>() + RegisterSize - 1) / RegisterSize;
+
+            _readableRegisters = new BitArray(size);
+            _writableRegisters = new BitArray(size);
+
+            _readCallbacks = new Dictionary<int, Func<int>>();
+            _writeCallbacks = new Dictionary<int, Action<int>>();
+
+            var fields = typeof(TState).GetFields();
+            int offset = 0;
+
+            for (int fieldIndex = 0; fieldIndex < fields.Length; fieldIndex++)
+            {
+                var field = fields[fieldIndex];
+                var regAttr = field.GetCustomAttributes<RegisterAttribute>(false).FirstOrDefault();
+
+                int sizeOfField = SizeCalculator.SizeOf(field.FieldType);
+
+                // The permissions are the same for every register covered by the field.
+                bool readable = regAttr?.AccessControl.HasFlag(AccessControl.ReadOnly)  ?? true;
+                bool writable = regAttr?.AccessControl.HasFlag(AccessControl.WriteOnly) ?? true;
+
+                for (int i = 0; i < ((sizeOfField + 3) & ~3); i += 4)
+                {
+                    _readableRegisters[(offset + i) / RegisterSize] = readable;
+                    _writableRegisters[(offset + i) / RegisterSize] = writable;
+                }
+
+                if (callbacks != null && callbacks.TryGetValue(field.Name, out var cb))
+                {
+                    if (cb.Read != null)
+                    {
+                        _readCallbacks.Add(offset, cb.Read);
+                    }
+
+                    if (cb.Write != null)
+                    {
+                        _writeCallbacks.Add(offset, cb.Write);
+                    }
+                }
+
+                offset += sizeOfField;
+            }
+
+            // The summed field sizes must match the real size of the structure,
+            // otherwise the offsets computed above would not match the layout.
+            Debug.Assert(offset == Unsafe.SizeOf<TState>());
+        }
+
+        /// <summary>
+        /// Reads the register at the specified offset.
+        /// </summary>
+        /// <param name="offset">Byte offset of the register, rounded down to the register size</param>
+        /// <returns>
+        /// The value returned by the read callback registered at that offset, or the stored value
+        /// if there is none. Zero if the offset is out of range or the register is not readable
+        /// </returns>
+        public virtual int Read(int offset)
+        {
+            if (Check(offset) && _readableRegisters[offset / RegisterSize])
+            {
+                int alignedOffset = Align(offset);
+
+                if (_readCallbacks.TryGetValue(alignedOffset, out Func<int> read))
+                {
+                    return read();
+                }
+                else
+                {
+                    return GetRef<int>(alignedOffset);
+                }
+            }
+
+            return 0;
+        }
+
+        /// <summary>
+        /// Writes the register at the specified offset.
+        /// The write is ignored if the offset is out of range or the register is not writable.
+        /// </summary>
+        /// <param name="offset">Byte offset of the register, rounded down to the register size</param>
+        /// <param name="data">Value passed to the write callback registered at that offset, or stored if there is none</param>
+        public virtual void Write(int offset, int data)
+        {
+            if (Check(offset) && _writableRegisters[offset / RegisterSize])
+            {
+                int alignedOffset = Align(offset);
+
+                if (_writeCallbacks.TryGetValue(alignedOffset, out Action<int> write))
+                {
+                    write(data);
+                }
+                else
+                {
+                    GetRef<int>(alignedOffset) = data;
+                }
+            }
+        }
+
+        // The unsigned comparison also rejects negative offsets.
+        private bool Check(int offset)
+        {
+            return (uint)Align(offset) < Unsafe.SizeOf<TState>();
+        }
+
+        /// <summary>
+        /// Gets a reference to a value of type <typeparamref name="T"/> inside the state.
+        /// </summary>
+        /// <typeparam name="T">Type of the value</typeparam>
+        /// <param name="offset">Byte offset of the value inside the state</param>
+        /// <returns>A reference to the value at the specified offset</returns>
+        /// <exception cref="ArgumentOutOfRangeException">The value does not fit entirely inside the state</exception>
+        public ref T GetRef<T>(int offset) where T : unmanaged
+        {
+            // Reject negative offsets explicitly and do the sum in 64 bits: with a 32-bit sum,
+            // offsets in [-sizeof(T), -1] wrap to a small value and pass the bounds check.
+            if (offset < 0 || (long)offset + Unsafe.SizeOf<T>() > Unsafe.SizeOf<TState>())
+            {
+                throw new ArgumentOutOfRangeException(nameof(offset));
+            }
+
+            return ref Unsafe.As<TState, T>(ref Unsafe.AddByteOffset(ref State, (IntPtr)offset));
+        }
+
+        // Rounds the offset down to a multiple of the register size.
+        private static int Align(int offset)
+        {
+            return offset & ~(RegisterSize - 1);
+        }
+    }
+}

+ 8 - 0
Ryujinx.Graphics.Device/IDeviceState.cs

@@ -0,0 +1,8 @@
+namespace Ryujinx.Graphics.Device
+{
+    /// <summary>
+    /// State of a device that is accessed through integer reads and writes at a given offset.
+    /// </summary>
+    public interface IDeviceState
+    {
+        /// <summary>
+        /// Reads a value from the device state.
+        /// </summary>
+        /// <param name="offset">Offset of the value to read</param>
+        /// <returns>The value at the specified offset</returns>
+        int Read(int offset);
+
+        /// <summary>
+        /// Writes a value to the device state.
+        /// </summary>
+        /// <param name="offset">Offset to write the value at</param>
+        /// <param name="data">Value to write</param>
+        void Write(int offset, int data);
+    }
+}

+ 15 - 0
Ryujinx.Graphics.Device/RegisterAttribute.cs

@@ -0,0 +1,15 @@
+using System;
+
+namespace Ryujinx.Graphics.Device
+{
+    /// <summary>
+    /// Specifies the access permissions of the register described by a field.
+    /// </summary>
+    [AttributeUsage(AttributeTargets.Field, AllowMultiple = false)]
+    public sealed class RegisterAttribute : Attribute
+    {
+        /// <summary>
+        /// Access permissions of the register.
+        /// </summary>
+        public AccessControl AccessControl { get; }
+
+        /// <summary>
+        /// Creates a new register attribute.
+        /// </summary>
+        /// <param name="ac">Access permissions of the register</param>
+        public RegisterAttribute(AccessControl ac)
+        {
+            AccessControl = ac;
+        }
+    }
+}

+ 16 - 0
Ryujinx.Graphics.Device/RwCallback.cs

@@ -0,0 +1,16 @@
+using System;
+
+namespace Ryujinx.Graphics.Device
+{
+    /// <summary>
+    /// Pair of optional callbacks that handle writes to and reads from a register.
+    /// </summary>
+    public readonly struct RwCallback
+    {
+        /// <summary>
+        /// Callback receiving the value written to the register, or null if there is none.
+        /// </summary>
+        public Action<int> Write { get; }
+
+        /// <summary>
+        /// Callback providing the value read from the register, or null if there is none.
+        /// </summary>
+        public Func<int> Read { get; }
+
+        /// <summary>
+        /// Creates a new pair of register callbacks.
+        /// </summary>
+        /// <param name="write">Write callback, can be null</param>
+        /// <param name="read">Read callback, can be null</param>
+        public RwCallback(Action<int> write, Func<int> read)
+        {
+            Write = write;
+            Read = read;
+        }
+    }
+}

+ 7 - 0
Ryujinx.Graphics.Device/Ryujinx.Graphics.Device.csproj

@@ -0,0 +1,7 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>netcoreapp3.1</TargetFramework>
+  </PropertyGroup>
+
+</Project>

+ 63 - 0
Ryujinx.Graphics.Device/SizeCalculator.cs

@@ -0,0 +1,63 @@
+using System;
+using System.Reflection;
+
+namespace Ryujinx.Graphics.Device
+{
+    /// <summary>
+    /// Computes the size of types through reflection.
+    /// </summary>
+    static class SizeCalculator
+    {
+        /// <summary>
+        /// Calculates the size in bytes of a type.
+        /// </summary>
+        /// <remarks>
+        /// For a structure without an explicit layout size, the result is the plain sum
+        /// of the sizes of all its instance fields; no padding is accounted for.
+        /// </remarks>
+        /// <param name="type">Type to calculate the size of</param>
+        /// <returns>Size of the type in bytes</returns>
+        /// <exception cref="ArgumentException">The size of the type is not known</exception>
+        public static int SizeOf(Type type)
+        {
+            // Enums are measured through their underlying integer type.
+            Type actualType = type.IsEnum ? type.GetEnumUnderlyingType() : type;
+
+            // Pointers and native sized integers.
+            if (actualType.IsPointer || actualType == typeof(IntPtr) || actualType == typeof(UIntPtr))
+            {
+                return IntPtr.Size;
+            }
+
+            // Anything that is not a structure is looked up by type code.
+            if (!actualType.IsValueType || actualType.IsPrimitive)
+            {
+                return SizeOfPrimitive(actualType);
+            }
+
+            // A structure with an explicit size uses that size as is.
+            if (actualType.StructLayoutAttribute.Size != 0)
+            {
+                return actualType.StructLayoutAttribute.Size;
+            }
+
+            // Otherwise add up the sizes of all instance fields.
+            int total = 0;
+
+            foreach (FieldInfo field in actualType.GetFields(BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance))
+            {
+                total += SizeOf(field.FieldType);
+            }
+
+            return total;
+        }
+
+        // Maps the type code of a type to its size in bytes, throws for types without a known size.
+        private static int SizeOfPrimitive(Type type)
+        {
+            switch (Type.GetTypeCode(type))
+            {
+                case TypeCode.SByte:   return sizeof(sbyte);
+                case TypeCode.Byte:    return sizeof(byte);
+                case TypeCode.Int16:   return sizeof(short);
+                case TypeCode.UInt16:  return sizeof(ushort);
+                case TypeCode.Int32:   return sizeof(int);
+                case TypeCode.UInt32:  return sizeof(uint);
+                case TypeCode.Int64:   return sizeof(long);
+                case TypeCode.UInt64:  return sizeof(ulong);
+                case TypeCode.Char:    return sizeof(char);
+                case TypeCode.Single:  return sizeof(float);
+                case TypeCode.Double:  return sizeof(double);
+                case TypeCode.Decimal: return sizeof(decimal);
+                case TypeCode.Boolean: return sizeof(bool);
+                default:
+                    throw new ArgumentException($"Length for type \"{type.Name}\" is unknown.");
+            }
+        }
+    }
+}

+ 1 - 1
Ryujinx.Graphics.Gpu/Engine/Compute.cs

@@ -67,7 +67,7 @@ namespace Ryujinx.Graphics.Gpu.Engine
 
             TextureManager.SetComputeTextureBufferIndex(state.Get<int>(MethodOffset.TextureBufferIndex));
 
-            ShaderProgramInfo info = cs.Shaders[0].Program.Info;            
+            ShaderProgramInfo info = cs.Shaders[0].Program.Info;
 
             for (int index = 0; index < info.CBuffers.Count; index++)
             {

+ 5 - 5
Ryujinx.Graphics.Gpu/Engine/MethodConditionalRendering.cs

@@ -63,7 +63,7 @@ namespace Ryujinx.Graphics.Gpu.Engine
             else
             {
                 evt.Flush();
-                return (_context.MemoryAccessor.ReadUInt64(gpuVa) != 0) ? ConditionalRenderEnabled.True : ConditionalRenderEnabled.False;
+                return (_context.MemoryAccessor.Read<ulong>(gpuVa) != 0) ? ConditionalRenderEnabled.True : ConditionalRenderEnabled.False;
             }
         }
 
@@ -87,11 +87,11 @@ namespace Ryujinx.Graphics.Gpu.Engine
 
             if (evt != null && evt2 == null)
             {
-                useHost = _context.Renderer.Pipeline.TryHostConditionalRendering(evt, _context.MemoryAccessor.ReadUInt64(gpuVa + 16), isEqual);
+                useHost = _context.Renderer.Pipeline.TryHostConditionalRendering(evt, _context.MemoryAccessor.Read<ulong>(gpuVa + 16), isEqual);
             }
             else if (evt == null && evt2 != null)
             {
-                useHost = _context.Renderer.Pipeline.TryHostConditionalRendering(evt2, _context.MemoryAccessor.ReadUInt64(gpuVa), isEqual);
+                useHost = _context.Renderer.Pipeline.TryHostConditionalRendering(evt2, _context.MemoryAccessor.Read<ulong>(gpuVa), isEqual);
             }
             else
             {
@@ -107,8 +107,8 @@ namespace Ryujinx.Graphics.Gpu.Engine
                 evt?.Flush();
                 evt2?.Flush();
 
-                ulong x = _context.MemoryAccessor.ReadUInt64(gpuVa);
-                ulong y = _context.MemoryAccessor.ReadUInt64(gpuVa + 16);
+                ulong x = _context.MemoryAccessor.Read<ulong>(gpuVa);
+                ulong y = _context.MemoryAccessor.Read<ulong>(gpuVa + 16);
 
                 return (isEqual ? x == y : x != y) ? ConditionalRenderEnabled.True : ConditionalRenderEnabled.False;
             }

+ 1 - 1
Ryujinx.Graphics.Gpu/Engine/Methods.cs

@@ -466,7 +466,7 @@ namespace Ryujinx.Graphics.Gpu.Engine
 
             bool   flipY  = yControl.HasFlag(YControl.NegateY);
             Origin origin = yControl.HasFlag(YControl.TriangleRastFlip) ? Origin.LowerLeft : Origin.UpperLeft;
-            
+
             _context.Renderer.Pipeline.SetOrigin(origin);
 
             // The triangle rast flip flag only affects rasterization, the viewport is not flipped.

+ 1 - 1
Ryujinx.Graphics.Gpu/GpuContext.cs

@@ -77,7 +77,7 @@ namespace Ryujinx.Graphics.Gpu
         {
             Renderer = renderer;
 
-            MemoryManager = new MemoryManager();
+            MemoryManager = new MemoryManager(this);
 
             MemoryAccessor = new MemoryAccessor(this);
 

+ 2 - 0
Ryujinx.Graphics.Gpu/Image/TextureManager.cs

@@ -643,6 +643,8 @@ namespace Ryujinx.Graphics.Gpu.Image
                         overlap.ChangeSize(info.Width, info.Height, info.DepthOrLayers);
                     }
 
+                    overlap.SynchronizeMemory();
+
                     return overlap;
                 }
             }

+ 0 - 36
Ryujinx.Graphics.Gpu/Memory/MemoryAccessor.cs

@@ -58,42 +58,6 @@ namespace Ryujinx.Graphics.Gpu.Memory
             return MemoryMarshal.Cast<byte, T>(_context.PhysicalMemory.GetSpan(processVa, Unsafe.SizeOf<T>()))[0];
         }
 
-        /// <summary>
-        /// Reads a 32-bits signed integer from GPU mapped memory.
-        /// </summary>
-        /// <param name="gpuVa">GPU virtual address where the value is located</param>
-        /// <returns>The value at the specified memory location</returns>
-        public int ReadInt32(ulong gpuVa)
-        {
-            ulong processVa = _context.MemoryManager.Translate(gpuVa);
-
-            return _context.PhysicalMemory.Read<int>(processVa);
-        }
-
-        /// <summary>
-        /// Reads a 64-bits unsigned integer from GPU mapped memory.
-        /// </summary>
-        /// <param name="gpuVa">GPU virtual address where the value is located</param>
-        /// <returns>The value at the specified memory location</returns>
-        public ulong ReadUInt64(ulong gpuVa)
-        {
-            ulong processVa = _context.MemoryManager.Translate(gpuVa);
-
-            return _context.PhysicalMemory.Read<ulong>(processVa);
-        }
-
-        /// <summary>
-        /// Reads a 8-bits unsigned integer from GPU mapped memory.
-        /// </summary>
-        /// <param name="gpuVa">GPU virtual address where the value is located</param>
-        /// <param name="value">The value to be written</param>
-        public void WriteByte(ulong gpuVa, byte value)
-        {
-            ulong processVa = _context.MemoryManager.Translate(gpuVa);
-
-            _context.PhysicalMemory.Write(processVa, MemoryMarshal.CreateSpan(ref value, 1));
-        }
-
         /// <summary>
         /// Writes a 32-bits signed integer to GPU mapped memory.
         /// </summary>

+ 59 - 1
Ryujinx.Graphics.Gpu/Memory/MemoryManager.cs

@@ -1,4 +1,7 @@
+using Ryujinx.Cpu;
 using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 
 namespace Ryujinx.Graphics.Gpu.Memory
 {
@@ -33,14 +36,69 @@ namespace Ryujinx.Graphics.Gpu.Memory
 
         public event EventHandler<UnmapEventArgs> MemoryUnmapped;
 
+        private GpuContext _context;
+
         /// <summary>
         /// Creates a new instance of the GPU memory manager.
         /// </summary>
+        /// <param name="context">GPU context whose physical memory is used for the memory accesses</param>
-        public MemoryManager()
+        public MemoryManager(GpuContext context)
         {
+            _context = context;
             _pageTable = new ulong[PtLvl0Size][];
         }
 
+        /// <summary>
+        /// Reads a value of an unmanaged type from GPU mapped memory.
+        /// </summary>
+        /// <typeparam name="T">Type of the data</typeparam>
+        /// <param name="gpuVa">GPU virtual address where the data is located</param>
+        /// <returns>The data at the specified memory location</returns>
+        public T Read<T>(ulong gpuVa) where T : unmanaged
+        {
+            // Fetch exactly as many bytes as the value occupies, then reinterpret them as T.
+            ReadOnlySpan<byte> data = GetSpan(gpuVa, Unsafe.SizeOf<T>());
+
+            return MemoryMarshal.Cast<byte, T>(data)[0];
+        }
+
+        /// <summary>
+        /// Gets a read-only span of data from GPU mapped memory.
+        /// This reads as much data as possible, up to the specified maximum size.
+        /// </summary>
+        /// <param name="gpuVa">GPU virtual address where the data is located</param>
+        /// <param name="size">Size of the data</param>
+        /// <returns>The span of the data at the specified memory location</returns>
+        public ReadOnlySpan<byte> GetSpan(ulong gpuVa, int size)
+        {
+            // The GPU address is translated first, the data is then read from the physical memory.
+            return _context.PhysicalMemory.GetSpan(Translate(gpuVa), size);
+        }
+
+        /// <summary>
+        /// Gets a writable region from GPU mapped memory.
+        /// </summary>
+        /// <param name="gpuVa">GPU virtual address where the region starts</param>
+        /// <param name="size">Size in bytes of the region</param>
+        /// <returns>A writable region with the data at the specified memory location</returns>
+        public WritableRegion GetWritableRegion(ulong gpuVa, int size)
+        {
+            // The GPU address is translated first, the region is then taken from the physical memory.
+            return _context.PhysicalMemory.GetWritableRegion(Translate(gpuVa), size);
+        }
+
+        /// <summary>
+        /// Writes data to GPU mapped memory.
+        /// </summary>
+        /// <param name="gpuVa">GPU virtual address to write the data into</param>
+        /// <param name="data">The data to be written</param>
+        public void Write(ulong gpuVa, ReadOnlySpan<byte> data)
+        {
+            // The GPU address is translated first, the data is then written to the physical memory.
+            _context.PhysicalMemory.Write(Translate(gpuVa), data);
+        }
+
         /// <summary>
         /// Maps a given range of pages to the specified CPU virtual address.
         /// </summary>

+ 12 - 0
Ryujinx.Graphics.Gpu/Memory/PhysicalMemory.cs

@@ -1,3 +1,4 @@
+using Ryujinx.Cpu;
 using System;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
@@ -34,6 +35,17 @@ namespace Ryujinx.Graphics.Gpu.Memory
             return _cpuMemory.GetSpan(address, size);
         }
 
+        /// <summary>
+        /// Gets a writable region from the application process.
+        /// </summary>
+        /// <param name="address">Start address of the range</param>
+        /// <param name="size">Size in bytes of the range</param>
+        /// <returns>A writable region with the data at the specified memory location</returns>
+        public WritableRegion GetWritableRegion(ulong address, int size) => _cpuMemory.GetWritableRegion(address, size);
+
         /// <summary>
         /// Reads data from the application process.
         /// </summary>

+ 20 - 0
Ryujinx.Graphics.Host1x/ClassId.cs

@@ -0,0 +1,20 @@
+namespace Ryujinx.Graphics.Host1x
+{
+    /// <summary>
+    /// Class IDs used to select a device, listed in ascending numeric order.
+    /// </summary>
+    public enum ClassId
+    {
+        Host1x = 0x1,
+        Mpeg   = 0x20,
+        Nvenc  = 0x21,
+        Vi     = 0x30,
+        Isp    = 0x32,
+        Ispb   = 0x34,
+        Vii2c  = 0x36,
+        Vic    = 0x5d,
+        Gr3d   = 0x60,
+        Gpu    = 0x61,
+        Nvjpg  = 0xc0,
+        Tsec   = 0xe0,
+        Tsecb  = 0xe1,
+        Nvdec  = 0xf0
+    }
+}

+ 32 - 0
Ryujinx.Graphics.Host1x/Devices.cs

@@ -0,0 +1,32 @@
+using Ryujinx.Graphics.Device;
+using System;
+using System.Collections.Generic;
+
+namespace Ryujinx.Graphics.Host1x
+{
+    /// <summary>
+    /// Collection of devices, looked up by class ID.
+    /// </summary>
+    class Devices : IDisposable
+    {
+        private readonly Dictionary<ClassId, IDeviceState> _devices = new Dictionary<ClassId, IDeviceState>();
+
+        /// <summary>
+        /// Registers a device for a class ID, replacing any device previously registered for it.
+        /// </summary>
+        /// <param name="classId">Class ID of the device</param>
+        /// <param name="device">Device to register</param>
+        public void RegisterDevice(ClassId classId, IDeviceState device) => _devices[classId] = device;
+
+        /// <summary>
+        /// Gets the device registered for a class ID.
+        /// </summary>
+        /// <param name="classId">Class ID of the device</param>
+        /// <returns>The registered device, or null if there is none for the class ID</returns>
+        public IDeviceState GetDevice(ClassId classId)
+        {
+            // On a miss the out value is left at its default, which is null.
+            _devices.TryGetValue(classId, out IDeviceState found);
+
+            return found;
+        }
+
+        /// <summary>
+        /// Disposes all registered devices that are <see cref="ThiDevice"/> instances.
+        /// </summary>
+        public void Dispose()
+        {
+            foreach (IDeviceState registered in _devices.Values)
+            {
+                (registered as ThiDevice)?.Dispose();
+            }
+        }
+    }
+}

+ 33 - 0
Ryujinx.Graphics.Host1x/Host1xClass.cs

@@ -0,0 +1,33 @@
+using Ryujinx.Graphics.Device;
+using Ryujinx.Graphics.Gpu.Synchronization;
+using System.Collections.Generic;
+using System.Threading;
+
+namespace Ryujinx.Graphics.Host1x
+{
+    /// <summary>
+    /// Host1x class device. Its registers are kept in a <see cref="Host1xClassRegisters"/> state,
+    /// with a write callback installed on the WaitSyncpt32 register.
+    /// </summary>
+    public class Host1xClass : IDeviceState
+    {
+        private readonly SynchronizationManager _syncMgr;
+        private readonly DeviceState<Host1xClassRegisters> _state;
+
+        /// <summary>
+        /// Creates a new instance of the Host1x class.
+        /// </summary>
+        /// <param name="syncMgr">Synchronization manager used to wait on syncpoints</param>
+        public Host1xClass(SynchronizationManager syncMgr)
+        {
+            var callbacks = new Dictionary<string, RwCallback>();
+
+            callbacks.Add(nameof(Host1xClassRegisters.WaitSyncpt32), new RwCallback(WaitSyncpt32, null));
+
+            _syncMgr = syncMgr;
+            _state = new DeviceState<Host1xClassRegisters>(callbacks);
+        }
+
+        /// <summary>
+        /// Reads a register of the class.
+        /// </summary>
+        /// <param name="offset">Offset of the register</param>
+        /// <returns>Value read from the register state</returns>
+        public int Read(int offset)
+        {
+            return _state.Read(offset);
+        }
+
+        /// <summary>
+        /// Writes a register of the class.
+        /// </summary>
+        /// <param name="offset">Offset of the register</param>
+        /// <param name="data">Value to write</param>
+        public void Write(int offset, int data)
+        {
+            _state.Write(offset, data);
+        }
+
+        // Write callback of WaitSyncpt32: waits, with no timeout, on the syncpoint given by the
+        // low 8 bits of the written value, using the LoadSyncptPayload32 register as threshold.
+        private void WaitSyncpt32(int data)
+        {
+            uint threshold = _state.State.LoadSyncptPayload32;
+            uint syncpointId = (uint)(data & 0xFF);
+
+            _syncMgr.WaitOnSyncpoint(syncpointId, threshold, Timeout.InfiniteTimeSpan);
+        }
+    }
+}

+ 41 - 0
Ryujinx.Graphics.Host1x/Host1xClassRegisters.cs

@@ -0,0 +1,41 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Host1x
+{
+    /// <summary>
+    /// Registers of the Host1x class.
+    /// </summary>
+    /// <remarks>
+    /// This structure is used as the state of a DeviceState, which derives the offset of each
+    /// register from the order and size of the fields, so fields must not be reordered or resized.
+    /// </remarks>
+    struct Host1xClassRegisters
+    {
+        public uint IncrSyncpt;
+        public uint IncrSyncptCntrl;
+        public uint IncrSyncptError;
+        // Five reserved words; presumably starting at byte offset 0xC as the name suggests.
+        public Array5<uint> ReservedC;
+        public uint WaitSyncpt;
+        public uint WaitSyncptBase;
+        public uint WaitSyncptIncr;
+        public uint LoadSyncptBase;
+        public uint IncrSyncptBase;
+        public uint Clear;
+        public uint Wait;
+        public uint WaitWithIntr;
+        public uint DelayUsec;
+        public uint TickcountHi;
+        public uint TickcountLo;
+        public uint Tickctrl;
+        // 23 reserved words; presumably starting at byte offset 0x50 as the name suggests.
+        public Array23<uint> Reserved50;
+        public uint Indctrl;
+        public uint Indoff2;
+        public uint Indoff;
+        public Array31<uint> Inddata;
+        // One reserved word; presumably at byte offset 0x134 as the name suggests.
+        public uint Reserved134;
+        public uint LoadSyncptPayload32;
+        public uint Stallctrl;
+        public uint WaitSyncpt32;
+        public uint WaitSyncptBase32;
+        public uint LoadSyncptBase32;
+        public uint IncrSyncptBase32;
+        public uint StallcountHi;
+        public uint StallcountLo;
+        public uint Xrefctrl;
+        public uint ChannelXrefHi;
+        public uint ChannelXrefLo;
+    }
+}

+ 123 - 0
Ryujinx.Graphics.Host1x/Host1xDevice.cs

@@ -0,0 +1,123 @@
+using Ryujinx.Common;
+using Ryujinx.Common.Logging;
+using Ryujinx.Graphics.Device;
+using Ryujinx.Graphics.Gpu.Synchronization;
+using System;
+using System.Numerics;
+
+namespace Ryujinx.Graphics.Host1x
+{
+    /// <summary>
+    /// Host1x device. Decodes submitted command buffers and forwards the register
+    /// writes they contain to the device of the currently selected class.
+    /// </summary>
+    public sealed class Host1xDevice : IDisposable
+    {
+        private readonly SyncptIncrManager _syncptIncrMgr;
+        private readonly AsyncWorkQueue<int[]> _commandQueue;
+
+        private readonly Devices _devices = new Devices();
+
+        /// <summary>
+        /// Host1x class device, registered under the Host1x class ID.
+        /// </summary>
+        public Host1xClass Class { get; }
+
+        // Device selected by the last SetClass opcode, null if none was found for the class ID.
+        private IDeviceState _device;
+
+        // Decoder state carried over between command words.
+        private int _count;        // Remaining data words of the current Incr/NonIncr command
+        private int _offset;       // Register offset (in words) the next data word is written to
+        private int _mask;         // Pending register mask of the current SetClass/Mask command
+        private bool _incrementing; // Whether the offset advances after each data word
+
+        /// <summary>
+        /// Creates a new instance of the Host1x device.
+        /// </summary>
+        /// <param name="syncMgr">Synchronization manager passed to the syncpoint increment manager and the Host1x class</param>
+        public Host1xDevice(SynchronizationManager syncMgr)
+        {
+            _syncptIncrMgr = new SyncptIncrManager(syncMgr);
+            _commandQueue = new AsyncWorkQueue<int[]>(Process, "Ryujinx.Host1xProcessor");
+
+            Class = new Host1xClass(syncMgr);
+
+            _devices.RegisterDevice(ClassId.Host1x, Class);
+        }
+
+        /// <summary>
+        /// Registers a device for a class ID. The device is wrapped in a <see cref="ThiDevice"/>.
+        /// </summary>
+        /// <param name="classId">Class ID of the device</param>
+        /// <param name="device">Device to register</param>
+        /// <exception cref="ArgumentNullException"><paramref name="device"/> is null</exception>
+        public void RegisterDevice(ClassId classId, IDeviceState device)
+        {
+            var thi = new ThiDevice(classId, device ?? throw new ArgumentNullException(nameof(device)), _syncptIncrMgr);
+            _devices.RegisterDevice(classId, thi);
+        }
+
+        /// <summary>
+        /// Queues a command buffer for processing. The commands are copied, so the
+        /// span does not need to remain valid after this returns.
+        /// </summary>
+        /// <param name="commandBuffer">Command words to process</param>
+        public void Submit(ReadOnlySpan<int> commandBuffer)
+        {
+            _commandQueue.Add(commandBuffer.ToArray());
+        }
+
+        // Work queue callback; decodes the command words one by one, in order.
+        private void Process(int[] commandBuffer)
+        {
+            for (int index = 0; index < commandBuffer.Length; index++)
+            {
+                Step(commandBuffer[index]);
+            }
+        }
+
+        // Decodes a single command word, which is either a data word of the command
+        // that is still in progress, or a new command.
+        private void Step(int value)
+        {
+            if (_mask != 0)
+            {
+                // Masked write in progress: the word is the data for the register
+                // selected by the lowest bit still set on the mask.
+                int lbs = BitOperations.TrailingZeroCount(_mask);
+
+                _mask &= ~(1 << lbs);
+
+                DeviceWrite(_offset + lbs, value);
+
+                return;
+            }
+            else if (_count != 0)
+            {
+                // Incr/NonIncr in progress: the word is the next data word.
+                _count--;
+
+                DeviceWrite(_offset, value);
+
+                if (_incrementing)
+                {
+                    _offset++;
+                }
+
+                return;
+            }
+
+            // New command, the opcode is on the 4 most significant bits.
+            OpCode opCode = (OpCode)((value >> 28) & 0xf);
+
+            switch (opCode)
+            {
+                case OpCode.SetClass:
+                    // Mask on bits 0-5, class ID on bits 6-15, offset on bits 16-27.
+                    _mask = value & 0x3f;
+                    ClassId classId = (ClassId)((value >> 6) & 0x3ff);
+                    _offset = (value >> 16) & 0xfff;
+                    _device = _devices.GetDevice(classId);
+                    break;
+                case OpCode.Incr:
+                case OpCode.NonIncr:
+                    // Data word count on bits 0-15, offset on bits 16-27.
+                    _count = value & 0xffff;
+                    _offset = (value >> 16) & 0xfff;
+                    _incrementing = opCode == OpCode.Incr;
+                    break;
+                case OpCode.Mask:
+                    // Register mask on bits 0-15, offset on bits 16-27.
+                    _mask = value & 0xffff;
+                    _offset = (value >> 16) & 0xfff;
+                    break;
+                case OpCode.Imm:
+                    // NOTE(review): only the low 12 bits are taken as immediate data here, while the
+                    // other opcodes use a 16-bit low field - confirm the width against the Host1x opcode encoding.
+                    int data = value & 0xfff;
+                    _offset = (value >> 16) & 0xfff;
+                    DeviceWrite(_offset, data);
+                    break;
+                default:
+                    Logger.PrintError(LogClass.Host1x, $"Unsupported opcode \"{opCode}\".");
+                    break;
+            }
+        }
+
+        // Writes to the selected device, converting the offset from words to bytes.
+        // The write is dropped if no device is selected.
+        private void DeviceWrite(int offset, int data)
+        {
+            _device?.Write(offset * 4, data);
+        }
+
+        /// <summary>
+        /// Disposes the command queue, then the registered devices.
+        /// </summary>
+        public void Dispose()
+        {
+            _commandQueue.Dispose();
+            _devices.Dispose();
+        }
+    }
+}

+ 21 - 0
Ryujinx.Graphics.Host1x/OpCode.cs

@@ -0,0 +1,21 @@
+namespace Ryujinx.Graphics.Host1x
+{
+    /// <summary>
+    /// Host1x command opcodes. The numeric value of each member is the raw opcode number,
+    /// so the declaration order (and therefore the values below) must not change.
+    /// </summary>
+    enum OpCode
+    {
+        SetClass  = 0,
+        Incr      = 1,
+        NonIncr   = 2,
+        Mask      = 3,
+        Imm       = 4,
+        Restart   = 5,
+        Gather    = 6,
+        SetStrmId = 7,
+        SetAppId  = 8,
+        SetPyld   = 9,
+        IncrW     = 10,
+        NonIncrW  = 11,
+        GatherW   = 12,
+        RestartW  = 13,
+        Extend    = 14
+    }
+}

+ 20 - 0
Ryujinx.Graphics.Host1x/Ryujinx.Graphics.Host1x.csproj

@@ -0,0 +1,20 @@
+<!-- Build definition for the Ryujinx.Graphics.Host1x project. -->
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>netcoreapp3.1</TargetFramework>
+  </PropertyGroup>
+
+  <!-- Unsafe code is explicitly disabled for both the Release and Debug AnyCPU configurations. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'">
+    <AllowUnsafeBlocks>false</AllowUnsafeBlocks>
+  </PropertyGroup>
+
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
+    <AllowUnsafeBlocks>false</AllowUnsafeBlocks>
+  </PropertyGroup>
+
+  <!-- Project dependencies: the device abstraction layer and the GPU project. -->
+  <ItemGroup>
+    <ProjectReference Include="..\Ryujinx.Graphics.Device\Ryujinx.Graphics.Device.csproj" />
+    <ProjectReference Include="..\Ryujinx.Graphics.Gpu\Ryujinx.Graphics.Gpu.csproj" />
+  </ItemGroup>
+
+</Project>

+ 99 - 0
Ryujinx.Graphics.Host1x/SyncptIncrManager.cs

@@ -0,0 +1,99 @@
+using Ryujinx.Graphics.Gpu.Synchronization;
+using System.Collections.Generic;
+
+namespace Ryujinx.Graphics.Host1x
+{
+    /// <summary>
+    /// Keeps a queue of syncpoint increments and applies them through the synchronization manager
+    /// in queue order, stopping at the first queued entry that has not been flagged as done.
+    /// </summary>
+    class SyncptIncrManager
+    {
+        private readonly SynchronizationManager _syncMgr;
+
+        /// <summary>
+        /// Immutable description of one queued increment.
+        /// </summary>
+        private struct SyncptIncr
+        {
+            public uint Id { get; }
+            public ClassId ClassId { get; }
+            public uint SyncptId { get; }
+            public bool Done { get; }
+
+            public SyncptIncr(uint id, ClassId classId, uint syncptId, bool done = false)
+            {
+                Id = id;
+                ClassId = classId;
+                SyncptId = syncptId;
+                Done = done;
+            }
+        }
+
+        // Pending increments, oldest first. Also used as the lock object guarding all state below.
+        private readonly List<SyncptIncr> _incrs = new List<SyncptIncr>();
+
+        // Next handle value returned by IncrementWhenDone.
+        private uint _currentId;
+
+        public SyncptIncrManager(SynchronizationManager syncMgr)
+        {
+            _syncMgr = syncMgr;
+        }
+
+        /// <summary>
+        /// Queues an increment that is already done, then flushes every leading done entry.
+        /// </summary>
+        /// <param name="id">Syncpoint to increment</param>
+        public void Increment(uint id)
+        {
+            lock (_incrs)
+            {
+                SyncptIncr immediate = new SyncptIncr(0, 0, id, true);
+
+                _incrs.Add(immediate);
+
+                IncrementAllDone();
+            }
+        }
+
+        /// <summary>
+        /// Queues an increment that stays pending until <see cref="SignalDone"/> is called with the returned handle.
+        /// </summary>
+        /// <param name="classId">Class that requested the increment</param>
+        /// <param name="id">Syncpoint to increment</param>
+        /// <returns>Handle identifying the pending increment</returns>
+        public uint IncrementWhenDone(ClassId classId, uint id)
+        {
+            lock (_incrs)
+            {
+                uint handle = _currentId;
+
+                _currentId++;
+
+                _incrs.Add(new SyncptIncr(handle, classId, id));
+
+                return handle;
+            }
+        }
+
+        /// <summary>
+        /// Flags the first queued increment carrying the given handle as done, then flushes every leading done entry.
+        /// </summary>
+        /// <param name="handle">Handle previously returned by <see cref="IncrementWhenDone"/></param>
+        public void SignalDone(uint handle)
+        {
+            lock (_incrs)
+            {
+                int index = _incrs.FindIndex(entry => entry.Id == handle);
+
+                if (index >= 0)
+                {
+                    SyncptIncr pending = _incrs[index];
+
+                    // The struct is immutable, so the entry is replaced by a copy with Done set.
+                    _incrs[index] = new SyncptIncr(pending.Id, pending.ClassId, pending.SyncptId, true);
+                }
+
+                IncrementAllDone();
+            }
+        }
+
+        /// <summary>
+        /// Applies and removes the run of done entries at the front of the queue.
+        /// </summary>
+        private void IncrementAllDone()
+        {
+            lock (_incrs)
+            {
+                int doneCount = 0;
+
+                while (doneCount < _incrs.Count && _incrs[doneCount].Done)
+                {
+                    _syncMgr.IncrementSyncpoint(_incrs[doneCount].SyncptId);
+
+                    doneCount++;
+                }
+
+                _incrs.RemoveRange(0, doneCount);
+            }
+        }
+    }
+}

+ 96 - 0
Ryujinx.Graphics.Host1x/ThiDevice.cs

@@ -0,0 +1,96 @@
+using Ryujinx.Common;
+using Ryujinx.Graphics.Device;
+using System;
+using System.Collections.Generic;
+
+namespace Ryujinx.Graphics.Host1x
+{
+    /// <summary>
+    /// Register front-end placed in front of another device. Writes to the Method1 register and
+    /// "when done" syncpoint increments are turned into actions and handed to an asynchronous work queue.
+    /// </summary>
+    class ThiDevice : IDeviceState, IDisposable
+    {
+        private readonly ClassId _classId;
+        private readonly IDeviceState _device;
+
+        private readonly SyncptIncrManager _syncptIncrMgr;
+
+        /// <summary>
+        /// Base type of everything placed on the command queue.
+        /// </summary>
+        private class CommandAction
+        {
+            public int Data { get; }
+
+            public CommandAction(int data)
+            {
+                Data = data;
+            }
+        }
+
+        /// <summary>
+        /// Write of <see cref="CommandAction.Data"/> to the wrapped device at offset <see cref="Method"/>.
+        /// </summary>
+        private class MethodCallAction : CommandAction
+        {
+            public int Method { get; }
+
+            public MethodCallAction(int method, int data) : base(data)
+            {
+                Method = method;
+            }
+        }
+
+        /// <summary>
+        /// Marks a pending syncpoint increment as done; the handle is carried in <see cref="CommandAction.Data"/>.
+        /// </summary>
+        private class SyncptIncrAction : CommandAction
+        {
+            public SyncptIncrAction(uint syncptIncrHandle) : base((int)syncptIncrHandle)
+            {
+            }
+        }
+
+        private readonly AsyncWorkQueue<CommandAction> _commandQueue;
+
+        private readonly DeviceState<ThiRegisters> _state;
+
+        public ThiDevice(ClassId classId, IDeviceState device, SyncptIncrManager syncptIncrMgr)
+        {
+            _classId = classId;
+            _device = device;
+            _syncptIncrMgr = syncptIncrMgr;
+
+            _commandQueue = new AsyncWorkQueue<CommandAction>(Process, $"Ryujinx.{classId}Processor");
+
+            Dictionary<string, RwCallback> callbacks = new Dictionary<string, RwCallback>
+            {
+                [nameof(ThiRegisters.IncrSyncpt)] = new RwCallback(IncrSyncpt, null),
+                [nameof(ThiRegisters.Method1)] = new RwCallback(Method1, null)
+            };
+
+            _state = new DeviceState<ThiRegisters>(callbacks);
+        }
+
+        public int Read(int offset)
+        {
+            return _state.Read(offset);
+        }
+
+        public void Write(int offset, int data)
+        {
+            _state.Write(offset, data);
+        }
+
+        /// <summary>
+        /// Callback for the IncrSyncpt register: bits 0-7 select the syncpoint, bits 8-15 the condition.
+        /// </summary>
+        private void IncrSyncpt(int data)
+        {
+            uint syncpointId = (uint)(data & 0xFF);
+            uint cond = (uint)((data >> 8) & 0xFF); // 0 = Immediate, 1 = Done
+
+            if (cond == 0)
+            {
+                _syncptIncrMgr.Increment(syncpointId);
+
+                return;
+            }
+
+            uint handle = _syncptIncrMgr.IncrementWhenDone(_classId, syncpointId);
+
+            _commandQueue.Add(new SyncptIncrAction(handle));
+        }
+
+        /// <summary>
+        /// Callback for the Method1 register: queues a write of <paramref name="data"/> at the offset
+        /// given by the current Method0 register value multiplied by 4.
+        /// </summary>
+        private void Method1(int data)
+        {
+            int method = (int)_state.State.Method0 * 4;
+
+            _commandQueue.Add(new MethodCallAction(method, data));
+        }
+
+        /// <summary>
+        /// Work queue handler; actions of any other type are ignored.
+        /// </summary>
+        private void Process(CommandAction cmdAction)
+        {
+            switch (cmdAction)
+            {
+                case SyncptIncrAction syncptIncrAction:
+                    _syncptIncrMgr.SignalDone((uint)syncptIncrAction.Data);
+                    break;
+                case MethodCallAction methodCallAction:
+                    _device.Write(methodCallAction.Method, methodCallAction.Data);
+                    break;
+            }
+        }
+
+        public void Dispose()
+        {
+            _commandQueue.Dispose();
+        }
+    }
+}

+ 22 - 0
Ryujinx.Graphics.Host1x/ThiRegisters.cs

@@ -0,0 +1,22 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Host1x
+{
+    // Register block used as the state type of DeviceState<ThiRegisters> in ThiDevice.
+    // Field order defines the layout, so fields must not be reordered.
+    // NOTE(review): the hex suffixes of the Reserved* fields equal each field's byte offset if every
+    // ArrayN<uint> is N consecutive 32-bit values - confirm against the ArrayN definitions.
+    struct ThiRegisters
+    {
+        public uint IncrSyncpt; // ThiDevice registers a callback for this register.
+        public uint Reserved4;
+        public uint IncrSyncptErr;
+        public uint CtxswIncrSyncpt;
+        public Array4<uint> Reserved10;
+        public uint Ctxsw;
+        public uint Reserved24;
+        public uint ContSyncptEof;
+        public Array5<uint> Reserved2C;
+        public uint Method0; // Read by ThiDevice (multiplied by 4) as the target offset of a method call.
+        public uint Method1; // ThiDevice registers a callback for this register; its value is the method call data.
+        public Array12<uint> Reserved48;
+        public uint IntStatus;
+        public uint IntMask;
+    }
+}

+ 40 - 0
Ryujinx.Graphics.Nvdec.H264/Decoder.cs

@@ -0,0 +1,40 @@
+using Ryujinx.Graphics.Video;
+using System;
+
+namespace Ryujinx.Graphics.Nvdec.H264
+{
+    /// <summary>
+    /// Software H264 decoder built on top of <see cref="FFmpegContext"/>.
+    /// </summary>
+    public class Decoder : IH264Decoder
+    {
+        public bool IsHardwareAccelerated => false;
+
+        private const int WorkBufferSize = 0x200;
+
+        // Scratch space where the reconstructed SPS/PPS is written on every decode call.
+        private readonly byte[] _workBuffer = new byte[WorkBufferSize];
+
+        private readonly FFmpegContext _context = new FFmpegContext();
+
+        /// <summary>
+        /// Creates a new output surface. The requested dimensions are not used here.
+        /// </summary>
+        public ISurface CreateSurface(int width, int height)
+        {
+            return new Surface();
+        }
+
+        /// <summary>
+        /// Decodes one frame, feeding the decoder the reconstructed parameter sets followed by the bitstream.
+        /// </summary>
+        /// <param name="pictureInfo">Picture information used to rebuild the SPS and PPS</param>
+        /// <param name="output">Surface that receives the frame; must be a <see cref="Surface"/></param>
+        /// <param name="bitstream">Frame bitstream data</param>
+        /// <returns>True if the decoder reported success (return code 0)</returns>
+        public bool Decode(ref H264PictureInfo pictureInfo, ISurface output, ReadOnlySpan<byte> bitstream)
+        {
+            ReadOnlySpan<byte> parameterSets = SpsAndPpsReconstruction.Reconstruct(ref pictureInfo, _workBuffer);
+
+            byte[] packetData = Prepend(bitstream, parameterSets);
+
+            int result = _context.DecodeFrame((Surface)output, packetData);
+
+            return result == 0;
+        }
+
+        /// <summary>
+        /// Returns a new array holding <paramref name="prep"/> immediately followed by <paramref name="data"/>.
+        /// </summary>
+        private static byte[] Prepend(ReadOnlySpan<byte> data, ReadOnlySpan<byte> prep)
+        {
+            byte[] combined = new byte[data.Length + prep.Length];
+
+            Span<byte> destination = combined;
+
+            prep.CopyTo(destination);
+            data.CopyTo(destination.Slice(prep.Length));
+
+            return combined;
+        }
+
+        public void Dispose()
+        {
+            _context.Dispose();
+        }
+    }
+}

+ 51 - 0
Ryujinx.Graphics.Nvdec.H264/FFmpegContext.cs

@@ -0,0 +1,51 @@
+using FFmpeg.AutoGen;
+using System;
+
+namespace Ryujinx.Graphics.Nvdec.H264
+{
+    /// <summary>
+    /// Owns an FFmpeg H264 decoder context and exposes a single frame decode operation.
+    /// </summary>
+    unsafe class FFmpegContext : IDisposable
+    {
+        private readonly AVCodec* _codec;
+        private AVCodecContext* _context;
+
+        /// <summary>
+        /// Looks up the H264 decoder, allocates a codec context for it and opens it.
+        /// </summary>
+        public FFmpegContext()
+        {
+            _codec = ffmpeg.avcodec_find_decoder(AVCodecID.AV_CODEC_ID_H264);
+            _context = ffmpeg.avcodec_alloc_context3(_codec);
+
+            // NOTE(review): the return code of avcodec_open2 is not checked here.
+            ffmpeg.avcodec_open2(_context, _codec, null);
+        }
+
+        /// <summary>
+        /// Sends the bitstream to the decoder as one packet and receives a frame into the output surface.
+        /// </summary>
+        /// <param name="output">Surface whose frame receives the decoded picture</param>
+        /// <param name="bitstream">Data to decode</param>
+        /// <returns>
+        /// The non-zero code from avcodec_send_packet if sending failed,
+        /// otherwise the code returned by avcodec_receive_frame
+        /// </returns>
+        public int DecodeFrame(Surface output, ReadOnlySpan<byte> bitstream)
+        {
+            AVPacket packet;
+
+            ffmpeg.av_init_packet(&packet);
+
+            // The span only needs to stay pinned while the packet that points into it is being sent.
+            fixed (byte* ptr = bitstream)
+            {
+                packet.data = ptr;
+                packet.size = bitstream.Length;
+
+                int rc = ffmpeg.avcodec_send_packet(_context, &packet);
+
+                if (rc != 0)
+                {
+                    return rc;
+                }
+            }
+
+            return ffmpeg.avcodec_receive_frame(_context, output.Frame);
+        }
+
+        /// <summary>
+        /// Closes and frees the codec context.
+        /// </summary>
+        public void Dispose()
+        {
+            ffmpeg.avcodec_close(_context);
+
+            // avcodec_free_context takes a pointer to the context pointer, so the field itself is pinned.
+            fixed (AVCodecContext** ppContext = &_context)
+            {
+                ffmpeg.avcodec_free_context(ppContext);
+            }
+        }
+    }
+}

+ 121 - 0
Ryujinx.Graphics.Nvdec.H264/H264BitStreamWriter.cs

@@ -0,0 +1,121 @@
+using System;
+using System.Numerics;
+
+namespace Ryujinx.Graphics.Nvdec.H264
+{
+    /// <summary>
+    /// Most-significant-bit-first bit writer that accumulates bits into a single byte
+    /// and stores completed bytes in a caller supplied buffer.
+    /// </summary>
+    struct H264BitStreamWriter
+    {
+        // Number of bits accumulated before a byte is stored.
+        private const int BufferSize = 8;
+
+        private readonly byte[] _workBuffer;
+
+        private int _offset;    // Next free index in _workBuffer.
+        private int _buffer;    // Bits accumulated for the current byte.
+        private int _bufferPos; // Number of bits of _buffer already used.
+
+        public H264BitStreamWriter(byte[] workBuffer)
+        {
+            _workBuffer = workBuffer;
+            _offset = 0;
+            _buffer = 0;
+            _bufferPos = 0;
+        }
+
+        /// <summary>
+        /// Writes a single bit.
+        /// </summary>
+        public void WriteBit(bool value)
+        {
+            int bit = value ? 1 : 0;
+
+            WriteBits(bit, 1);
+        }
+
+        /// <summary>
+        /// Writes the low <paramref name="valueSize"/> bits of <paramref name="value"/>, highest bit first.
+        /// </summary>
+        public void WriteBits(int value, int valueSize)
+        {
+            int bitsLeft = valueSize;
+
+            while (bitsLeft > 0)
+            {
+                // Take as many bits as still fit in the current byte.
+                int chunkSize = Math.Min(bitsLeft, GetFreeBufferBits());
+                int chunkMask = (1 << chunkSize) - 1;
+
+                int srcShift = bitsLeft - chunkSize;
+                int dstShift = BufferSize - _bufferPos - chunkSize;
+
+                int chunk = (value >> srcShift) & chunkMask;
+
+                _buffer |= chunk << dstShift;
+
+                _bufferPos += chunkSize;
+                bitsLeft -= chunkSize;
+            }
+        }
+
+        /// <summary>
+        /// Returns how many bits are free in the current byte, storing it first when it is full.
+        /// </summary>
+        private int GetFreeBufferBits()
+        {
+            if (_bufferPos == BufferSize)
+            {
+                Flush();
+            }
+
+            return BufferSize - _bufferPos;
+        }
+
+        /// <summary>
+        /// Stores the current byte (unused low bits are zero) if it holds at least one bit.
+        /// </summary>
+        public void Flush()
+        {
+            if (_bufferPos == 0)
+            {
+                return;
+            }
+
+            _workBuffer[_offset] = (byte)_buffer;
+            _offset++;
+
+            _buffer = 0;
+            _bufferPos = 0;
+        }
+
+        /// <summary>
+        /// Writes a single set bit and flushes the current byte.
+        /// </summary>
+        public void End()
+        {
+            WriteBit(true);
+
+            Flush();
+        }
+
+        /// <summary>
+        /// Returns the bytes stored so far.
+        /// </summary>
+        public Span<byte> AsSpan()
+        {
+            return new Span<byte>(_workBuffer, 0, _offset);
+        }
+
+        public void WriteU(uint value, int valueSize)
+        {
+            WriteBits((int)value, valueSize);
+        }
+
+        public void WriteSe(int value)
+        {
+            WriteExpGolombCodedInt(value);
+        }
+
+        public void WriteUe(uint value)
+        {
+            WriteExpGolombCodedUInt(value);
+        }
+
+        /// <summary>
+        /// Maps a signed value to an unsigned code number (positive v to 2v - 1, otherwise to -2v) and writes it.
+        /// </summary>
+        private void WriteExpGolombCodedInt(int value)
+        {
+            int codeNum = value > 0 ? (value << 1) - 1 : (-value) << 1;
+
+            WriteExpGolombCodedUInt((uint)codeNum);
+        }
+
+        /// <summary>
+        /// Writes an unsigned Exp-Golomb code: a zero prefix terminated by a one bit, followed by the suffix bits.
+        /// </summary>
+        private void WriteExpGolombCodedUInt(uint value)
+        {
+            // Number of significant bits of (value + 1).
+            int bitCount = 32 - BitOperations.LeadingZeroCount(value + 1);
+
+            WriteBits(1, bitCount);
+
+            uint suffix = value - ((1u << (bitCount - 1)) - 1);
+
+            WriteBits((int)suffix, bitCount - 1);
+        }
+    }
+}

+ 23 - 0
Ryujinx.Graphics.Nvdec.H264/Ryujinx.Graphics.Nvdec.H264.csproj

@@ -0,0 +1,23 @@
+<!-- Build definition for the Ryujinx.Graphics.Nvdec.H264 project. -->
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>netcoreapp3.1</TargetFramework>
+  </PropertyGroup>
+
+  <!-- Unsafe code is enabled for both the Release and Debug AnyCPU configurations. -->
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|AnyCPU'">
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|AnyCPU'">
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+
+  <!-- NuGet dependency providing the FFmpeg bindings. -->
+  <ItemGroup>
+    <PackageReference Include="FFmpeg.AutoGen" Version="4.3.0" />
+  </ItemGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\Ryujinx.Graphics.Video\Ryujinx.Graphics.Video.csproj" />
+  </ItemGroup>
+
+</Project>

+ 159 - 0
Ryujinx.Graphics.Nvdec.H264/SpsAndPpsReconstruction.cs

@@ -0,0 +1,159 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Video;
+using System;
+
+namespace Ryujinx.Graphics.Nvdec.H264
+{
+    /// <summary>
+    /// Rebuilds the H264 Sequence Parameter Set and Picture Parameter Set NAL units from picture information.
+    /// </summary>
+    static class SpsAndPpsReconstruction
+    {
+        /// <summary>
+        /// Writes an SPS NAL unit followed by a PPS NAL unit, each preceded by a 00 00 01 start code.
+        /// </summary>
+        /// <param name="pictureInfo">Picture information the parameter sets are derived from</param>
+        /// <param name="workBuffer">Buffer the NAL units are written into, starting at index 0</param>
+        /// <returns>The written part of <paramref name="workBuffer"/></returns>
+        public static Span<byte> Reconstruct(ref H264PictureInfo pictureInfo, byte[] workBuffer)
+        {
+            H264BitStreamWriter writer = new H264BitStreamWriter(workBuffer);
+
+            // Sequence Parameter Set.
+            writer.WriteU(1, 24); // Start code
+            writer.WriteU(0, 1); // Forbidden zero bit
+            writer.WriteU(3, 2); // NAL ref idc
+            writer.WriteU(7, 5); // NAL unit type (SPS)
+            writer.WriteU(100, 8); // Profile idc
+            writer.WriteU(0, 8); // Reserved
+            writer.WriteU(31, 8); // Level idc
+            writer.WriteUe(0); // Seq parameter set id
+            writer.WriteUe(pictureInfo.ChromaFormatIdc);
+
+            if (pictureInfo.ChromaFormatIdc == 3)
+            {
+                writer.WriteBit(false); // Separate colour plane flag
+            }
+
+            writer.WriteUe(0); // Bit depth luma minus 8
+            writer.WriteUe(0); // Bit depth chroma minus 8
+            writer.WriteBit(pictureInfo.QpprimeYZeroTransformBypassFlag);
+            writer.WriteBit(false); // Scaling matrix present flag
+
+            writer.WriteUe(pictureInfo.Log2MaxFrameNumMinus4);
+            writer.WriteUe(pictureInfo.PicOrderCntType);
+
+            if (pictureInfo.PicOrderCntType == 0)
+            {
+                writer.WriteUe(pictureInfo.Log2MaxPicOrderCntLsbMinus4);
+            }
+            else if (pictureInfo.PicOrderCntType == 1)
+            {
+                writer.WriteBit(pictureInfo.DeltaPicOrderAlwaysZeroFlag);
+
+                writer.WriteSe(0); // Offset for non-ref pic
+                writer.WriteSe(0); // Offset for top to bottom field
+                writer.WriteUe(0); // Num ref frames in pic order cnt cycle
+            }
+
+            writer.WriteUe(16); // Max num ref frames
+            writer.WriteBit(false); // Gaps in frame num value allowed flag
+            writer.WriteUe(pictureInfo.PicWidthInMbsMinus1);
+            writer.WriteUe(pictureInfo.PicHeightInMapUnitsMinus1);
+            writer.WriteBit(pictureInfo.FrameMbsOnlyFlag);
+
+            if (!pictureInfo.FrameMbsOnlyFlag)
+            {
+                writer.WriteBit(pictureInfo.MbAdaptiveFrameFieldFlag);
+            }
+
+            writer.WriteBit(pictureInfo.Direct8x8InferenceFlag);
+            writer.WriteBit(false); // Frame cropping flag
+            writer.WriteBit(false); // VUI parameter present flag
+
+            writer.End();
+
+            // Picture Parameter Set.
+            writer.WriteU(1, 24); // Start code
+            writer.WriteU(0, 1); // Forbidden zero bit
+            writer.WriteU(3, 2); // NAL ref idc
+            writer.WriteU(8, 5); // NAL unit type (PPS)
+
+            writer.WriteUe(0); // Pic parameter set id
+            writer.WriteUe(0); // Seq parameter set id
+
+            writer.WriteBit(pictureInfo.EntropyCodingModeFlag);
+            writer.WriteBit(false); // Bottom field pic order in frame present flag
+            writer.WriteUe(0); // Num slice groups minus 1
+            writer.WriteUe(pictureInfo.NumRefIdxL0ActiveMinus1);
+            writer.WriteUe(pictureInfo.NumRefIdxL1ActiveMinus1);
+            writer.WriteBit(pictureInfo.WeightedPredFlag);
+            writer.WriteU(pictureInfo.WeightedBipredIdc, 2);
+            writer.WriteSe(pictureInfo.PicInitQpMinus26);
+            writer.WriteSe(0); // Pic init qs minus 26
+            writer.WriteSe(pictureInfo.ChromaQpIndexOffset);
+            writer.WriteBit(pictureInfo.DeblockingFilterControlPresentFlag);
+            writer.WriteBit(pictureInfo.ConstrainedIntraPredFlag);
+            writer.WriteBit(pictureInfo.RedundantPicCntPresentFlag);
+            writer.WriteBit(pictureInfo.Transform8x8ModeFlag);
+
+            writer.WriteBit(pictureInfo.ScalingMatrixPresent);
+
+            if (pictureInfo.ScalingMatrixPresent)
+            {
+                for (int index = 0; index < 6; index++)
+                {
+                    writer.WriteBit(true); // Scaling list present flag
+
+                    WriteScalingList(ref writer, pictureInfo.ScalingLists4x4[index]);
+                }
+
+                if (pictureInfo.Transform8x8ModeFlag)
+                {
+                    for (int index = 0; index < 2; index++)
+                    {
+                        writer.WriteBit(true); // Scaling list present flag
+
+                        WriteScalingList(ref writer, pictureInfo.ScalingLists8x8[index]);
+                    }
+                }
+            }
+
+            writer.WriteSe(pictureInfo.SecondChromaQpIndexOffset);
+
+            writer.End();
+
+            return writer.AsSpan();
+        }
+
+        // ZigZag LUTs from libavcodec.
+        private static readonly byte[] ZigZagDirect = new byte[]
+        {
+            0,   1,  8, 16,  9,  2,  3, 10,
+            17, 24, 32, 25, 18, 11,  4,  5,
+            12, 19, 26, 33, 40, 48, 41, 34,
+            27, 20, 13,  6,  7, 14, 21, 28,
+            35, 42, 49, 56, 57, 50, 43, 36,
+            29, 22, 15, 23, 30, 37, 44, 51,
+            58, 59, 52, 45, 38, 31, 39, 46,
+            53, 60, 61, 54, 47, 55, 62, 63
+        };
+
+        private static readonly byte[] ZigZagScan = new byte[]
+        {
+            0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4,
+            1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4,
+            1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4,
+            3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4
+        };
+
+        /// <summary>
+        /// Writes a scaling list as a sequence of signed deltas, visiting the entries in zig-zag order.
+        /// </summary>
+        /// <param name="writer">Writer receiving the deltas</param>
+        /// <param name="list">Scaling list; a length of 16 selects the 4x4 scan, anything else the 8x8 scan</param>
+        private static void WriteScalingList(ref H264BitStreamWriter writer, IArray<byte> list)
+        {
+            byte[] scan = list.Length == 16 ? ZigZagScan : ZigZagDirect;
+
+            int lastScale = 8;
+
+            for (int index = 0; index < list.Length; index++)
+            {
+                byte value = list[scan[index]];
+
+                // delta_scale must lie in [-128, 127]; the decoder reconstructs each entry as
+                // (lastScale + delta_scale + 256) % 256, so the difference is wrapped to a signed
+                // 8-bit value instead of being emitted as an out of range delta.
+                int deltaScale = unchecked((sbyte)(value - lastScale));
+
+                writer.WriteSe(deltaScale);
+
+                lastScale = value;
+            }
+        }
+    }
+}

+ 33 - 0
Ryujinx.Graphics.Nvdec.H264/Surface.cs

@@ -0,0 +1,33 @@
+using FFmpeg.AutoGen;
+using Ryujinx.Graphics.Video;
+using System;
+
+namespace Ryujinx.Graphics.Nvdec.H264
+{
+    /// <summary>
+    /// Surface whose storage is an FFmpeg frame allocated on construction and released on dispose.
+    /// All dimensions and plane pointers are read from the frame each time they are requested.
+    /// </summary>
+    unsafe class Surface : ISurface
+    {
+        public AVFrame* Frame { get; }
+
+        public Plane YPlane
+        {
+            get { return new Plane((IntPtr)Frame->data[0], Stride * Height); }
+        }
+
+        public Plane UPlane
+        {
+            get { return new Plane((IntPtr)Frame->data[1], UvStride * UvHeight); }
+        }
+
+        public Plane VPlane
+        {
+            get { return new Plane((IntPtr)Frame->data[2], UvStride * UvHeight); }
+        }
+
+        public int Width
+        {
+            get { return Frame->width; }
+        }
+
+        public int Height
+        {
+            get { return Frame->height; }
+        }
+
+        public int Stride
+        {
+            get { return Frame->linesize[0]; }
+        }
+
+        // Chroma dimensions are half the frame dimensions, rounded up.
+        public int UvWidth
+        {
+            get { return (Frame->width + 1) >> 1; }
+        }
+
+        public int UvHeight
+        {
+            get { return (Frame->height + 1) >> 1; }
+        }
+
+        public int UvStride
+        {
+            get { return Frame->linesize[1]; }
+        }
+
+        public Surface()
+        {
+            Frame = ffmpeg.av_frame_alloc();
+        }
+
+        /// <summary>
+        /// Unreferences the frame and then frees the frame structure itself.
+        /// </summary>
+        public void Dispose()
+        {
+            ffmpeg.av_frame_unref(Frame);
+            ffmpeg.av_free(Frame);
+        }
+    }
+}

+ 9 - 0
Ryujinx.Graphics.Nvdec.Vp9/BitDepth.cs

@@ -0,0 +1,9 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    // Supported sample bit depths; each member's numeric value equals its number of bits.
+    internal enum BitDepth
+    {
+        Bits8 = 8,   /**<  8 bits */
+        Bits10 = 10, /**< 10 bits */
+        Bits12 = 12, /**< 12 bits */
+    }
+}

+ 56 - 0
Ryujinx.Graphics.Nvdec.Vp9/CodecErr.cs

@@ -0,0 +1,56 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    // Codec status codes. Values are assigned implicitly, starting at 0 for CodecOk,
+    // so the declaration order must be preserved.
+    internal enum CodecErr
+    {
+        /*!\brief Operation completed without error */
+        CodecOk,
+
+        /*!\brief Unspecified error */
+        CodecError,
+
+        /*!\brief Memory operation failed */
+        CodecMemError,
+
+        /*!\brief ABI version mismatch */
+        CodecAbiMismatch,
+
+        /*!\brief Algorithm does not have required capability */
+        CodecIncapable,
+
+        /*!\brief The given bitstream is not supported.
+         *
+         * The bitstream was unable to be parsed at the highest level. The decoder
+         * is unable to proceed. This error \ref SHOULD be treated as fatal to the
+         * stream. */
+        CodecUnsupBitstream,
+
+        /*!\brief Encoded bitstream uses an unsupported feature
+         *
+         * The decoder does not implement a feature required by the encoder. This
+         * return code should only be used for features that prevent future
+         * pictures from being properly decoded. This error \ref MAY be treated as
+         * fatal to the stream or \ref MAY be treated as fatal to the current GOP.
+         */
+        CodecUnsupFeature,
+
+        /*!\brief The coded data for this stream is corrupt or incomplete
+         *
+         * There was a problem decoding the current frame.  This return code
+         * should only be used for failures that prevent future pictures from
+         * being properly decoded. This error \ref MAY be treated as fatal to the
+         * stream or \ref MAY be treated as fatal to the current GOP. If decoding
+         * is continued for the current GOP, artifacts may be present.
+         */
+        CodecCorruptFrame,
+
+        /*!\brief An application-supplied parameter is not valid.
+         *
+         */
+        CodecInvalidParam,
+
+        /*!\brief An iterator reached the end of list.
+         *
+         */
+        CodecListEnd
+    }
+}

+ 59 - 0
Ryujinx.Graphics.Nvdec.Vp9/Common/BitUtils.cs

@@ -0,0 +1,59 @@
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Common
+{
+    /// <summary>
+    /// Small integer helpers: pixel clamping, power of two rounding/alignment and bit counting.
+    /// </summary>
+    internal static class BitUtils
+    {
+        /// <summary>
+        /// Clamps a value to the 8-bit pixel range [0, 255].
+        /// </summary>
+        // FIXME: Enable inlining here after AVX2 gather bug is fixed.
+        // [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static byte ClipPixel(int val)
+        {
+            if (val > 255)
+            {
+                return 255;
+            }
+
+            if (val < 0)
+            {
+                return 0;
+            }
+
+            return (byte)val;
+        }
+
+        /// <summary>
+        /// Clamps a value to the pixel range of the given bit depth.
+        /// Depths other than 10 and 12 use the 8-bit range.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static ushort ClipPixelHighbd(int val, int bd)
+        {
+            int maxValue;
+
+            switch (bd)
+            {
+                case 10:
+                    maxValue = 1023;
+                    break;
+                case 12:
+                    maxValue = 4095;
+                    break;
+                default:
+                    maxValue = 255;
+                    break;
+            }
+
+            return (ushort)Math.Clamp(val, 0, maxValue);
+        }
+
+        /// <summary>
+        /// Adds half of 2^n to the value and shifts the sum right by n bits.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int RoundPowerOfTwo(int value, int n)
+        {
+            int half = 1 << (n - 1);
+
+            return (value + half) >> n;
+        }
+
+        /// <summary>
+        /// 64-bit variant of <see cref="RoundPowerOfTwo(int, int)"/>.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static long RoundPowerOfTwo(long value, int n)
+        {
+            long half = 1L << (n - 1);
+
+            return (value + half) >> n;
+        }
+
+        /// <summary>
+        /// Rounds the value up to the next multiple of 2^n.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int AlignPowerOfTwo(int value, int n)
+        {
+            int lowBits = (1 << n) - 1;
+
+            return (value + lowBits) & ~lowBits;
+        }
+
+        /// <summary>
+        /// Returns the index of the highest set bit of a non-zero value.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int GetMsb(uint n)
+        {
+            Debug.Assert(n != 0);
+
+            int leadingZeros = BitOperations.LeadingZeroCount(n);
+
+            return 31 ^ leadingZeros;
+        }
+
+        /// <summary>
+        /// Returns the position of the highest set bit plus one, or 0 when the input is 0.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int GetUnsignedBits(uint numValues)
+        {
+            if (numValues > 0)
+            {
+                return GetMsb(numValues) + 1;
+            }
+
+            return 0;
+        }
+    }
+}

+ 94 - 0
Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryAllocator.cs

@@ -0,0 +1,94 @@
+using Ryujinx.Common.Memory;
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Common
+{
+    /// <summary>
+    /// Pooling allocator for native memory. Buffers returned with <see cref="Free{T}"/> stay in the
+    /// pool and are handed out again to a later request of exactly the same size in bytes.
+    /// Not synchronized; callers must not use one instance from several threads at once.
+    /// </summary>
+    internal class MemoryAllocator : IDisposable
+    {
+        // Initial number of pool slots; the pool grows when every slot is in use.
+        private const int PoolEntries = 10;
+
+        private struct PoolItem
+        {
+            public IntPtr Pointer;
+            public int Length;
+            public bool InUse;
+        }
+
+        private PoolItem[] _pool = new PoolItem[PoolEntries];
+
+        /// <summary>
+        /// Returns a buffer able to hold <paramref name="length"/> elements of type <typeparamref name="T"/>.
+        /// </summary>
+        /// <typeparam name="T">Element type</typeparam>
+        /// <param name="length">Number of elements</param>
+        /// <returns>Array pointer to a pooled or newly allocated buffer</returns>
+        public ArrayPtr<T> Allocate<T>(int length) where T : unmanaged
+        {
+            int lengthInBytes = Unsafe.SizeOf<T>() * length;
+
+            // Reuse an idle buffer of the exact same size. Slots that hold no buffer are skipped,
+            // otherwise a zero byte request would claim an empty slot without owning any memory.
+            for (int i = 0; i < _pool.Length; i++)
+            {
+                ref PoolItem item = ref _pool[i];
+
+                if (!item.InUse && item.Pointer != IntPtr.Zero && item.Length == lengthInBytes)
+                {
+                    item.InUse = true;
+
+                    return new ArrayPtr<T>(item.Pointer, length);
+                }
+            }
+
+            IntPtr ptr = Marshal.AllocHGlobal(lengthInBytes);
+
+            // Every allocation is tracked by a slot so that Free and Dispose can always find it.
+            ref PoolItem slot = ref _pool[AcquireIdleSlot()];
+
+            if (slot.Pointer != IntPtr.Zero)
+            {
+                // Evict the idle buffer of a different size that occupied this slot.
+                Marshal.FreeHGlobal(slot.Pointer);
+            }
+
+            slot.InUse = true;
+            slot.Pointer = ptr;
+            slot.Length = lengthInBytes;
+
+            return new ArrayPtr<T>(ptr, length);
+        }
+
+        /// <summary>
+        /// Returns the index of the first slot that is not in use, growing the pool if there is none.
+        /// </summary>
+        private int AcquireIdleSlot()
+        {
+            for (int i = 0; i < _pool.Length; i++)
+            {
+                if (!_pool[i].InUse)
+                {
+                    return i;
+                }
+            }
+
+            int firstNewSlot = _pool.Length;
+
+            Array.Resize(ref _pool, firstNewSlot * 2);
+
+            return firstNewSlot;
+        }
+
+        /// <summary>
+        /// Marks the buffer as idle so it can be reused; the memory itself stays allocated.
+        /// Pointers that are not tracked by the pool are ignored.
+        /// </summary>
+        /// <typeparam name="T">Element type</typeparam>
+        /// <param name="arr">Buffer previously returned by <see cref="Allocate{T}"/></param>
+        public unsafe void Free<T>(ArrayPtr<T> arr) where T : unmanaged
+        {
+            IntPtr ptr = (IntPtr)arr.ToPointer();
+
+            for (int i = 0; i < _pool.Length; i++)
+            {
+                ref PoolItem item = ref _pool[i];
+
+                if (item.Pointer == ptr)
+                {
+                    item.InUse = false;
+                    break;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Releases every buffer held by the pool, whether it is in use or not.
+        /// </summary>
+        public void Dispose()
+        {
+            for (int i = 0; i < _pool.Length; i++)
+            {
+                ref PoolItem item = ref _pool[i];
+
+                if (item.Pointer != IntPtr.Zero)
+                {
+                    Marshal.FreeHGlobal(item.Pointer);
+                    item.Pointer = IntPtr.Zero;
+                }
+            }
+        }
+    }
+}

+ 25 - 0
Ryujinx.Graphics.Nvdec.Vp9/Common/MemoryUtil.cs

@@ -0,0 +1,25 @@
+using Ryujinx.Common.Memory;
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Common
+{
+    /// <summary>
+    /// Copy and fill helpers for unmanaged values.
+    /// </summary>
+    internal static class MemoryUtil
+    {
+        /// <summary>
+        /// Copies <paramref name="length"/> elements from <paramref name="source"/> to <paramref name="dest"/>.
+        /// </summary>
+        public static unsafe void Copy<T>(T* dest, T* source, int length) where T : unmanaged
+        {
+            Span<T> from = new Span<T>(source, length);
+            Span<T> to = new Span<T>(dest, length);
+
+            from.CopyTo(to);
+        }
+
+        /// <summary>
+        /// Copies a single value from <paramref name="source"/> to <paramref name="dest"/>.
+        /// </summary>
+        public static void Copy<T>(ref T dest, ref T source) where T : unmanaged
+        {
+            dest = source;
+        }
+
+        /// <summary>
+        /// Sets <paramref name="length"/> elements starting at <paramref name="ptr"/> to <paramref name="value"/>.
+        /// </summary>
+        public static unsafe void Fill<T>(T* ptr, T value, int length) where T : unmanaged
+        {
+            Span<T> target = new Span<T>(ptr, length);
+
+            target.Fill(value);
+        }
+    }
+}

+ 71 - 0
Ryujinx.Graphics.Nvdec.Vp9/Constants.cs

@@ -0,0 +1,71 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    // Compile-time constants shared by the VP9 decoder code.
+    internal static class Constants
+    {
+        public const int Vp9InterpExtend = 4;
+
+        public const int MaxMbPlane = 3;
+
+        // Reference frame identifiers.
+        public const int None = -1;
+        public const int IntraFrame = 0;
+        public const int LastFrame = 1;
+        public const int GoldenFrame = 2;
+        public const int AltRefFrame = 3;
+        public const int MaxRefFrames = 4;
+
+        public const int MiSizeLog2 = 3;
+        public const int MiBlockSizeLog2 = 6 - MiSizeLog2;  // 64 = 2^6
+
+        public const int MiSize = 1 << MiSizeLog2;              // pixels per mi-unit
+        public const int MiBlockSize = 1 << MiBlockSizeLog2;  // mi-units per max block
+        public const int MiMask = MiBlockSize - 1;
+
+        public const int PartitionPloffset = 4;  // number of probability models per block size
+
+        /* Segment Feature Masks */
+        public const int MaxMvRefCandidates = 2;
+
+        public const int CompInterContexts = 5;
+        public const int RefContexts = 5;
+
+        // Interpolation filter identifiers. Bilinear shares the value 3 with SwitchableFilters.
+        public const int EightTap = 0;
+        public const int EightTapSmooth = 1;
+        public const int EightTapSharp = 2;
+        public const int SwitchableFilters = 3; /* Number of switchable filters */
+        public const int Bilinear = 3;
+        public const int Switchable = 4; /* should be the last one */
+
+        // Frame
+        public const int RefsPerFrame = 3;
+
+        public const int NumPingPongBuffers = 2;
+
+        public const int Class0Bits = 1; /* bits at integer precision for class 0 */
+        public const int Class0Size = 1 << Class0Bits;
+
+        // Motion vector limits derived from MvInUseBits: MvUpp = 2^14 - 1, MvLow = -2^14.
+        public const int MvInUseBits = 14;
+        public const int MvUpp = (1 << MvInUseBits) - 1;
+        public const int MvLow = -(1 << MvInUseBits);
+
+        // Coefficient token alphabet
+        public const int ZeroToken = 0;        // 0     Extra Bits 0+0
+        public const int OneToken = 1;         // 1     Extra Bits 0+1
+        public const int TwoToken = 2;         // 2     Extra Bits 0+1
+
+        public const int PivotNode = 2;
+
+        public const int Cat1MinVal = 5;
+        public const int Cat2MinVal = 7;
+        public const int Cat3MinVal = 11;
+        public const int Cat4MinVal = 19;
+        public const int Cat5MinVal = 35;
+        public const int Cat6MinVal = 67;
+
+        public const int EobModelToken = 3;
+
+        public const int SegmentAbsData = 1;
+        public const int MaxSegments = 8;
+    }
+}

+ 1190 - 0
Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs

@@ -0,0 +1,1190 @@
+using Ryujinx.Common.Memory;
+using System;
+using System.Buffers.Binary;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using Ryujinx.Graphics.Video;
+using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    static class DecodeFrame
+    {
+        /// <summary>
+        /// Checks whether reading <paramref name="len"/> bytes from <paramref name="start"/> is valid.
+        /// </summary>
+        /// <param name="start">Buffer the read would start at; its length is the amount of data available</param>
+        /// <param name="len">Number of bytes that would be read</param>
+        /// <returns>True if the length is positive and no larger than the available data, false otherwise</returns>
+        private static bool ReadIsValid(ArrayPtr<byte> start, int len)
+        {
+            // The length is a signed integer, so a negative value has to be rejected explicitly:
+            // testing only for "not zero" would let it through the upper bound comparison.
+            return len > 0 && len <= start.Length;
+        }
+
+        /// <summary>
+        /// Runs the inverse transform for a block of an inter predicted plane, adding the result
+        /// onto <paramref name="dst"/>, and then zeroes the dequantized coefficients again.
+        /// </summary>
+        /// <param name="xd">Macroblock descriptor holding the per-plane coefficient buffers</param>
+        /// <param name="plane">Index of the plane being reconstructed</param>
+        /// <param name="txSize">Transform size of the block</param>
+        /// <param name="dst">Destination pixels, reinterpreted as 16-bit samples for high bit depth</param>
+        /// <param name="stride">Destination stride passed through to the transform</param>
+        /// <param name="eob">End of block position, must be greater than zero</param>
+        private static void InverseTransformBlockInter(ref MacroBlockD xd, int plane, TxSize txSize, Span<byte> dst, int stride, int eob)
+        {
+            ArrayPtr<int> dqcoeff = xd.Plane[plane].DqCoeff;
+            Debug.Assert(eob > 0);
+            Span<int> coeffs = dqcoeff.ToSpan();
+
+            if (!xd.CurBuf.HighBd)
+            {
+                if (xd.Lossless)
+                {
+                    Idct.Iwht4x4Add(coeffs, dst, stride, eob);
+                }
+                else if (txSize == TxSize.Tx4x4)
+                {
+                    Idct.Idct4x4Add(coeffs, dst, stride, eob);
+                }
+                else if (txSize == TxSize.Tx8x8)
+                {
+                    Idct.Idct8x8Add(coeffs, dst, stride, eob);
+                }
+                else if (txSize == TxSize.Tx16x16)
+                {
+                    Idct.Idct16x16Add(coeffs, dst, stride, eob);
+                }
+                else if (txSize == TxSize.Tx32x32)
+                {
+                    Idct.Idct32x32Add(coeffs, dst, stride, eob);
+                }
+                else
+                {
+                    // Unknown size on the 8-bit path: bail out without touching the coefficients.
+                    Debug.Assert(false, "Invalid transform size");
+                    return;
+                }
+            }
+            else
+            {
+                Span<ushort> dst16 = MemoryMarshal.Cast<byte, ushort>(dst);
+
+                if (xd.Lossless)
+                {
+                    Idct.HighbdIwht4x4Add(coeffs, dst16, stride, eob, xd.Bd);
+                }
+                else if (txSize == TxSize.Tx4x4)
+                {
+                    Idct.HighbdIdct4x4Add(coeffs, dst16, stride, eob, xd.Bd);
+                }
+                else if (txSize == TxSize.Tx8x8)
+                {
+                    Idct.HighbdIdct8x8Add(coeffs, dst16, stride, eob, xd.Bd);
+                }
+                else if (txSize == TxSize.Tx16x16)
+                {
+                    Idct.HighbdIdct16x16Add(coeffs, dst16, stride, eob, xd.Bd);
+                }
+                else if (txSize == TxSize.Tx32x32)
+                {
+                    Idct.HighbdIdct32x32Add(coeffs, dst16, stride, eob, xd.Bd);
+                }
+                else
+                {
+                    // Unlike the 8-bit path, this one still falls through to the clearing below.
+                    Debug.Assert(false, "Invalid transform size");
+                }
+            }
+
+            // With a single coefficient, only the first entry needs to be reset.
+            if (eob == 1)
+            {
+                coeffs[0] = 0;
+                return;
+            }
+
+            // Otherwise pick how many leading coefficients get cleared.
+            int clearCount;
+            if (txSize <= TxSize.Tx16x16 && eob <= 10)
+            {
+                clearCount = 4 * (4 << (int)txSize);
+            }
+            else if (txSize == TxSize.Tx32x32 && eob <= 34)
+            {
+                clearCount = 256;
+            }
+            else
+            {
+                clearCount = 16 << ((int)txSize << 1);
+            }
+
+            coeffs.Slice(0, clearCount).Fill(0);
+        }
+
+        /// <summary>
+        /// Runs the inverse transform for a block of an intra predicted plane, adding the result
+        /// onto <paramref name="dst"/>, and then zeroes the dequantized coefficients again.
+        /// </summary>
+        /// <param name="xd">Macroblock descriptor holding the per-plane coefficient buffers</param>
+        /// <param name="plane">Index of the plane being reconstructed</param>
+        /// <param name="txType">Transform type, used by the 4x4, 8x8 and 16x16 hybrid transforms</param>
+        /// <param name="txSize">Transform size of the block</param>
+        /// <param name="dst">Destination pixels, reinterpreted as 16-bit samples for high bit depth</param>
+        /// <param name="stride">Destination stride passed through to the transform</param>
+        /// <param name="eob">End of block position, must be greater than zero</param>
+        private static void InverseTransformBlockIntra(
+            ref MacroBlockD xd,
+            int plane,
+            TxType txType,
+            TxSize txSize,
+            Span<byte> dst,
+            int stride,
+            int eob)
+        {
+            ArrayPtr<int> dqcoeff = xd.Plane[plane].DqCoeff;
+            Debug.Assert(eob > 0);
+            Span<int> coeffs = dqcoeff.ToSpan();
+
+            if (!xd.CurBuf.HighBd)
+            {
+                if (xd.Lossless)
+                {
+                    Idct.Iwht4x4Add(coeffs, dst, stride, eob);
+                }
+                else if (txSize == TxSize.Tx4x4)
+                {
+                    Idct.Iht4x4Add(txType, coeffs, dst, stride, eob);
+                }
+                else if (txSize == TxSize.Tx8x8)
+                {
+                    Idct.Iht8x8Add(txType, coeffs, dst, stride, eob);
+                }
+                else if (txSize == TxSize.Tx16x16)
+                {
+                    Idct.Iht16x16Add(txType, coeffs, dst, stride, eob);
+                }
+                else if (txSize == TxSize.Tx32x32)
+                {
+                    // The 32x32 size has no hybrid variant here, the transform type is not used.
+                    Idct.Idct32x32Add(coeffs, dst, stride, eob);
+                }
+                else
+                {
+                    // Unknown size on the 8-bit path: bail out without touching the coefficients.
+                    Debug.Assert(false, "Invalid transform size");
+                    return;
+                }
+            }
+            else
+            {
+                Span<ushort> dst16 = MemoryMarshal.Cast<byte, ushort>(dst);
+
+                if (xd.Lossless)
+                {
+                    Idct.HighbdIwht4x4Add(coeffs, dst16, stride, eob, xd.Bd);
+                }
+                else if (txSize == TxSize.Tx4x4)
+                {
+                    Idct.HighbdIht4x4Add(txType, coeffs, dst16, stride, eob, xd.Bd);
+                }
+                else if (txSize == TxSize.Tx8x8)
+                {
+                    Idct.HighbdIht8x8Add(txType, coeffs, dst16, stride, eob, xd.Bd);
+                }
+                else if (txSize == TxSize.Tx16x16)
+                {
+                    Idct.HighbdIht16x16Add(txType, coeffs, dst16, stride, eob, xd.Bd);
+                }
+                else if (txSize == TxSize.Tx32x32)
+                {
+                    Idct.HighbdIdct32x32Add(coeffs, dst16, stride, eob, xd.Bd);
+                }
+                else
+                {
+                    // Unlike the 8-bit path, this one still falls through to the clearing below.
+                    Debug.Assert(false, "Invalid transform size");
+                }
+            }
+
+            // With a single coefficient, only the first entry needs to be reset.
+            if (eob == 1)
+            {
+                coeffs[0] = 0;
+                return;
+            }
+
+            // Otherwise pick how many leading coefficients get cleared; the short form for
+            // sizes up to 16x16 is only used with the plain DCT in both directions.
+            int clearCount;
+            if (txType == TxType.DctDct && txSize <= TxSize.Tx16x16 && eob <= 10)
+            {
+                clearCount = 4 * (4 << (int)txSize);
+            }
+            else if (txSize == TxSize.Tx32x32 && eob <= 34)
+            {
+                clearCount = 256;
+            }
+            else
+            {
+                clearCount = 16 << ((int)txSize << 1);
+            }
+
+            coeffs.Slice(0, clearCount).Fill(0);
+        }
+
+        /// <summary>
+        /// Performs intra prediction for one transform block in place on the destination plane and,
+        /// unless the block is marked as skipped, decodes its tokens and adds the residual.
+        /// </summary>
+        /// <param name="twd">Tile worker data, provides the macroblock descriptor and token decoding state</param>
+        /// <param name="mi">Mode info of the block being decoded</param>
+        /// <param name="plane">Index of the plane being reconstructed</param>
+        /// <param name="row">Row of the transform block, in units of 4 pixels</param>
+        /// <param name="col">Column of the transform block, in units of 4 pixels</param>
+        /// <param name="txSize">Transform size of the block</param>
+        private static unsafe void PredictAndReconstructIntraBlock(
+            ref TileWorkerData twd,
+            ref ModeInfo mi,
+            int plane,
+            int row,
+            int col,
+            TxSize txSize)
+        {
+            ref MacroBlockD xd = ref twd.Xd;
+            ref MacroBlockDPlane pd = ref xd.Plane[plane];
+            ref Buf2D dstBuf = ref pd.Dst;
+
+            // The same destination location is needed both as a raw pointer and as a span.
+            int dstOffset = 4 * row * dstBuf.Stride + 4 * col;
+            byte* dst = &dstBuf.Buf.ToPointer()[dstOffset];
+            Span<byte> dstSpan = dstBuf.Buf.ToSpan().Slice(dstOffset);
+
+            PredictionMode mode;
+            if (plane != 0)
+            {
+                mode = mi.UvMode;
+            }
+            else if (mi.SbType < BlockSize.Block8x8)
+            {
+                // Blocks below 8x8 carry one mode per sub-block on the first plane.
+                mode = xd.Mi[0].Value.Bmi[(row << 1) + col].Mode;
+            }
+            else
+            {
+                mode = mi.Mode;
+            }
+
+            ReconIntra.PredictIntraBlock(ref xd, pd.N4Wl, txSize, mode, dst, dstBuf.Stride, dst, dstBuf.Stride, col, row, plane);
+
+            if (mi.Skip != 0)
+            {
+                return;
+            }
+
+            // Only the first plane of non-lossless content uses a mode dependent transform type.
+            bool useDefaults = plane != 0 || xd.Lossless;
+            TxType txType = useDefaults ? TxType.DctDct : ReconIntra.IntraModeToTxTypeLookup[(int)mode];
+            var scanOrder = useDefaults
+                ? Luts.Vp9DefaultScanOrders[(int)txSize]
+                : Luts.Vp9ScanOrders[(int)txSize][(int)txType];
+
+            int eob = Detokenize.DecodeBlockTokens(ref twd, plane, scanOrder, col, row, txSize, mi.SegmentId);
+            if (eob > 0)
+            {
+                InverseTransformBlockIntra(ref xd, plane, txType, txSize, dstSpan, dstBuf.Stride, eob);
+            }
+        }
+
+        /// <summary>
+        /// Decodes the tokens of one transform block of an inter predicted plane and, when there
+        /// are coefficients, adds the inverse transformed residual onto the destination plane.
+        /// </summary>
+        /// <param name="twd">Tile worker data, provides the macroblock descriptor and token decoding state</param>
+        /// <param name="mi">Mode info of the block being decoded</param>
+        /// <param name="plane">Index of the plane being reconstructed</param>
+        /// <param name="row">Row of the transform block, in units of 4 pixels</param>
+        /// <param name="col">Column of the transform block, in units of 4 pixels</param>
+        /// <param name="txSize">Transform size of the block</param>
+        /// <returns>The end of block position returned by the token decoder</returns>
+        private static int ReconstructInterBlock(
+            ref TileWorkerData twd,
+            ref ModeInfo mi,
+            int plane,
+            int row,
+            int col,
+            TxSize txSize)
+        {
+            ref MacroBlockD xd = ref twd.Xd;
+            ref Buf2D dstBuf = ref xd.Plane[plane].Dst;
+
+            int eob = Detokenize.DecodeBlockTokens(
+                ref twd,
+                plane,
+                Luts.Vp9DefaultScanOrders[(int)txSize],
+                col,
+                row,
+                txSize,
+                mi.SegmentId);
+
+            int dstOffset = 4 * row * dstBuf.Stride + 4 * col;
+            Span<byte> dst = dstBuf.Buf.ToSpan().Slice(dstOffset);
+
+            if (eob > 0)
+            {
+                InverseTransformBlockInter(ref xd, plane, txSize, dst, dstBuf.Stride, eob);
+            }
+
+            return eob;
+        }
+
+        /// <summary>
+        /// Copies a block of bW x bH bytes located at (x, y) of a w x h image into
+        /// <paramref name="dst"/>, replicating the edge pixels for the parts of the block
+        /// that lie outside of the image.
+        /// </summary>
+        /// <param name="src">Pointer to the position (x, y) of the source image</param>
+        /// <param name="srcStride">Stride of the source image</param>
+        /// <param name="dst">Destination buffer</param>
+        /// <param name="dstStride">Stride of the destination buffer</param>
+        /// <param name="x">Horizontal position of the block on the source image, may be negative</param>
+        /// <param name="y">Vertical position of the block on the source image, may be negative</param>
+        /// <param name="bW">Width of the block</param>
+        /// <param name="bH">Height of the block</param>
+        /// <param name="w">Width of the source image</param>
+        /// <param name="h">Height of the source image</param>
+        private static unsafe void BuildMcBorder(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            int x,
+            int y,
+            int bW,
+            int bH,
+            int w,
+            int h)
+        {
+            // Move back to the start of the image, then forward to the first row that will be read:
+            // the last row when starting below the image, row y when inside, row 0 when above.
+            byte* refRow = src - x - y * srcStride;
+            int firstRow = y >= h ? h - 1 : (y > 0 ? y : 0);
+            refRow += firstRow * srcStride;
+
+            // The horizontal split is the same for every row:
+            // pixels replicated on the left, pixels copied, pixels replicated on the right.
+            int left = Math.Min(x < 0 ? -x : 0, bW);
+            int right = Math.Min(x + bW > w ? x + bW - w : 0, bW);
+            int copy = bW - left - right;
+
+            do
+            {
+                if (left != 0)
+                {
+                    MemoryUtil.Fill(dst, refRow[0], left);
+                }
+
+                if (copy != 0)
+                {
+                    MemoryUtil.Copy(dst + left, refRow + x + left, copy);
+                }
+
+                if (right != 0)
+                {
+                    MemoryUtil.Fill(dst + left + copy, refRow[w - 1], right);
+                }
+
+                dst += dstStride;
+                y++;
+
+                // Only advance the source row while inside of the image, so that the
+                // first and last rows are repeated above and below it.
+                if (y > 0 && y < h)
+                {
+                    refRow += srcStride;
+                }
+
+                bH--;
+            } while (bH != 0);
+        }
+
+        /// <summary>
+        /// Copies a block of bW x bH 16-bit samples located at (x, y) of a w x h image into
+        /// <paramref name="dst"/>, replicating the edge samples for the parts of the block
+        /// that lie outside of the image.
+        /// </summary>
+        /// <param name="src8">Pointer to the position (x, y) of the source image, read as 16-bit samples</param>
+        /// <param name="srcStride">Stride of the source image, applied in 16-bit samples</param>
+        /// <param name="dst">Destination buffer</param>
+        /// <param name="dstStride">Stride of the destination buffer, in 16-bit samples</param>
+        /// <param name="x">Horizontal position of the block on the source image, may be negative</param>
+        /// <param name="y">Vertical position of the block on the source image, may be negative</param>
+        /// <param name="bW">Width of the block</param>
+        /// <param name="bH">Height of the block</param>
+        /// <param name="w">Width of the source image</param>
+        /// <param name="h">Height of the source image</param>
+        private static unsafe void HighBuildMcBorder(
+            byte* src8,
+            int srcStride,
+            ushort* dst,
+            int dstStride,
+            int x,
+            int y,
+            int bW,
+            int bH,
+            int w,
+            int h)
+        {
+            ushort* src = (ushort*)src8;
+
+            // Move back to the start of the image, then forward to the first row that will be read:
+            // the last row when starting below the image, row y when inside, row 0 when above.
+            ushort* refRow = src - x - y * srcStride;
+            int firstRow = y >= h ? h - 1 : (y > 0 ? y : 0);
+            refRow += firstRow * srcStride;
+
+            // The horizontal split is the same for every row:
+            // samples replicated on the left, samples copied, samples replicated on the right.
+            int left = Math.Min(x < 0 ? -x : 0, bW);
+            int right = Math.Min(x + bW > w ? x + bW - w : 0, bW);
+            int copy = bW - left - right;
+
+            do
+            {
+                if (left != 0)
+                {
+                    MemoryUtil.Fill(dst, refRow[0], left);
+                }
+
+                if (copy != 0)
+                {
+                    MemoryUtil.Copy(dst + left, refRow + x + left, copy);
+                }
+
+                if (right != 0)
+                {
+                    MemoryUtil.Fill(dst + left + copy, refRow[w - 1], right);
+                }
+
+                dst += dstStride;
+                y++;
+
+                // Only advance the source row while inside of the image, so that the
+                // first and last rows are repeated above and below it.
+                if (y > 0 && y < h)
+                {
+                    refRow += srcStride;
+                }
+
+                bH--;
+            } while (bH != 0);
+        }
+
+        // Uninitialized scratch storage used by ExtendAndPredict to hold the border extended
+        // reference block. It is accessed as ushort elements for high bit depth content and as
+        // bytes otherwise, so it must have room for 80 * 2 * 80 * 2 elements of the wider (ushort)
+        // type; sizing it as that many bytes leaves only half of the room on the high bit depth path.
+        [StructLayout(LayoutKind.Sequential, Size = 80 * 2 * 80 * 2 * sizeof(ushort))]
+        struct McBufHigh
+        {
+        }
+
+        /// <summary>
+        /// Builds a border extended copy of the reference block on a stack scratch buffer,
+        /// and runs the inter predictor from that copy into <paramref name="dst"/>.
+        /// </summary>
+        /// <remarks>
+        /// The 16-bit border builder and predictor are used when the current buffer is
+        /// high bit depth, the byte ones otherwise. The scratch buffer stride is the block width.
+        /// </remarks>
+        private static unsafe void ExtendAndPredict(
+            byte* bufPtr1,
+            int preBufStride,
+            int x0,
+            int y0,
+            int bW,
+            int bH,
+            int frameWidth,
+            int frameHeight,
+            int borderOffset,
+            byte* dst,
+            int dstBufStride,
+            int subpelX,
+            int subpelY,
+            Array8<short>[] kernel,
+            ref ScaleFactors sf,
+            ref MacroBlockD xd,
+            int w,
+            int h,
+            int refr,
+            int xs,
+            int ys)
+        {
+            // Taking a pointer to an unassigned local avoids the zero initialization of the buffer.
+            McBufHigh scratch;
+            ushort* scratch16 = (ushort*)Unsafe.AsPointer(ref scratch);
+
+            if (!xd.CurBuf.HighBd)
+            {
+                byte* scratch8 = (byte*)scratch16;
+
+                BuildMcBorder(bufPtr1, preBufStride, scratch8, bW, x0, y0, bW, bH, frameWidth, frameHeight);
+                ReconInter.InterPredictor(
+                    scratch8 + borderOffset,
+                    bW,
+                    dst,
+                    dstBufStride,
+                    subpelX,
+                    subpelY,
+                    ref sf,
+                    w,
+                    h,
+                    refr,
+                    kernel,
+                    xs,
+                    ys);
+
+                return;
+            }
+
+            HighBuildMcBorder(bufPtr1, preBufStride, scratch16, bW, x0, y0, bW, bH, frameWidth, frameHeight);
+            ReconInter.HighbdInterPredictor(
+                scratch16 + borderOffset,
+                bW,
+                (ushort*)dst,
+                dstBufStride,
+                subpelX,
+                subpelY,
+                ref sf,
+                w,
+                h,
+                refr,
+                kernel,
+                xs,
+                ys,
+                xd.Bd);
+        }
+
+        /// <summary>
+        /// Builds the inter prediction of a w x h region of one plane, located at (x, y) inside
+        /// of the current block, from the given reference frame.
+        /// </summary>
+        /// <remarks>
+        /// The motion vector is scaled (or only adjusted for chroma subsampling when the reference
+        /// is not scaled), the matching position on the reference frame is computed, and the
+        /// prediction is then made either directly from the reference frame, or from a border
+        /// extended copy when the area read by the filter reaches outside of the frame.
+        /// </remarks>
+        /// <param name="xd">Macroblock descriptor of the block being predicted</param>
+        /// <param name="plane">Index of the plane being predicted (0 selects the Y buffer, 1 the U buffer, anything else the V buffer)</param>
+        /// <param name="bw">Block width passed to the motion vector clamping, only used for scaled references</param>
+        /// <param name="bh">Block height passed to the motion vector clamping, only used for scaled references</param>
+        /// <param name="x">Horizontal offset of the region inside of the block, in pixels</param>
+        /// <param name="y">Vertical offset of the region inside of the block, in pixels</param>
+        /// <param name="w">Width of the region to predict</param>
+        /// <param name="h">Height of the region to predict</param>
+        /// <param name="miX">Horizontal position of the block, used when scaling the motion vector</param>
+        /// <param name="miY">Vertical position of the block, used when scaling the motion vector</param>
+        /// <param name="kernel">Interpolation filter kernels passed through to the predictor</param>
+        /// <param name="sf">Scale factors of the reference frame</param>
+        /// <param name="preBuf">Reference plane buffer, only its stride is used here</param>
+        /// <param name="dstBuf">Destination plane buffer</param>
+        /// <param name="mv">Motion vector of the region</param>
+        /// <param name="refFrameBuf">Reference frame surface, provides the plane pointers and dimensions</param>
+        /// <param name="isScaled">True if the reference frame is scaled relative to the current frame</param>
+        /// <param name="refr">Reference index, passed through to the predictor</param>
+        private static unsafe void DecBuildInterPredictors(
+            ref MacroBlockD xd,
+            int plane,
+            int bw,
+            int bh,
+            int x,
+            int y,
+            int w,
+            int h,
+            int miX,
+            int miY,
+            Array8<short>[] kernel,
+            ref ScaleFactors sf,
+            ref Buf2D preBuf,
+            ref Buf2D dstBuf,
+            ref Mv mv,
+            ref Surface refFrameBuf,
+            bool isScaled,
+            int refr)
+        {
+            ref MacroBlockDPlane pd = ref xd.Plane[plane];
+            byte* dst = dstBuf.Buf.ToPointer() + dstBuf.Stride * y + x;
+            Mv32 scaledMv;
+            int xs, ys, x0, y0, x0_16, y0_16, frameWidth, frameHeight, bufStride, subpelX, subpelY;
+            byte* refFrame;
+            byte* bufPtr;
+
+            // Get reference frame pointer, width and height.
+            if (plane == 0)
+            {
+                frameWidth = refFrameBuf.Width;
+                frameHeight = refFrameBuf.Height;
+                refFrame = refFrameBuf.YBuffer.ToPointer();
+            }
+            else
+            {
+                frameWidth = refFrameBuf.UvWidth;
+                frameHeight = refFrameBuf.UvHeight;
+                refFrame = plane == 1 ? refFrameBuf.UBuffer.ToPointer() : refFrameBuf.VBuffer.ToPointer();
+            }
+
+            if (isScaled)
+            {
+                Mv mvQ4 = ReconInter.ClampMvToUmvBorderSb(ref xd, ref mv, bw, bh, pd.SubsamplingX, pd.SubsamplingY);
+                // Co-ordinate of containing block to pixel precision.
+                int xStart = (-xd.MbToLeftEdge >> (3 + pd.SubsamplingX));
+                int yStart = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY));
+                // Co-ordinate of the block to 1/16th pixel precision.
+                x0_16 = (xStart + x) << Filter.SubpelBits;
+                y0_16 = (yStart + y) << Filter.SubpelBits;
+
+                // Co-ordinate of current block in reference frame
+                // to 1/16th pixel precision.
+                x0_16 = sf.ScaleValueX(x0_16);
+                y0_16 = sf.ScaleValueY(y0_16);
+
+                // Map the top left corner of the block into the reference frame.
+                x0 = sf.ScaleValueX(xStart + x);
+                y0 = sf.ScaleValueY(yStart + y);
+
+                // Scale the MV and incorporate the sub-pixel offset of the block
+                // in the reference frame.
+                scaledMv = sf.ScaleMv(ref mvQ4, miX + x, miY + y);
+                xs = sf.XStepQ4;
+                ys = sf.YStepQ4;
+            }
+            else
+            {
+                // Co-ordinate of containing block to pixel precision.
+                x0 = (-xd.MbToLeftEdge >> (3 + pd.SubsamplingX)) + x;
+                y0 = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY)) + y;
+
+                // Co-ordinate of the block to 1/16th pixel precision.
+                x0_16 = x0 << Filter.SubpelBits;
+                y0_16 = y0 << Filter.SubpelBits;
+
+                // No scaling: the vector is only doubled on planes that are not subsampled,
+                // and the step between samples is fixed at 16.
+                scaledMv.Row = mv.Row * (1 << (1 - pd.SubsamplingY));
+                scaledMv.Col = mv.Col * (1 << (1 - pd.SubsamplingX));
+                xs = ys = 16;
+            }
+            // Fractional part of the motion vector.
+            subpelX = scaledMv.Col & Filter.SubpelMask;
+            subpelY = scaledMv.Row & Filter.SubpelMask;
+
+            // Calculate the top left corner of the best matching block in the
+            // reference frame.
+            x0 += scaledMv.Col >> Filter.SubpelBits;
+            y0 += scaledMv.Row >> Filter.SubpelBits;
+            x0_16 += scaledMv.Col;
+            y0_16 += scaledMv.Row;
+
+            // Get reference block pointer.
+            bufPtr = refFrame + y0 * preBuf.Stride + x0;
+            bufStride = preBuf.Stride;
+
+            // Do border extension if there is motion or the
+            // width/height is not a multiple of 8 pixels.
+            if (isScaled || scaledMv.Col != 0 || scaledMv.Row != 0 || (frameWidth & 0x7) != 0 || (frameHeight & 0x7) != 0)
+            {
+                // Get reference block bottom right vertical coordinate.
+                int y1 = ((y0_16 + (h - 1) * ys) >> Filter.SubpelBits) + 1;
+
+                // Get reference block bottom right horizontal coordinate.
+                int x1 = ((x0_16 + (w - 1) * xs) >> Filter.SubpelBits) + 1;
+                int xPad = 0, yPad = 0;
+
+                // Grow the area read on each axis where interpolation takes place.
+                if (subpelX != 0 || (sf.XStepQ4 != Filter.SubpelShifts))
+                {
+                    x0 -= Constants.Vp9InterpExtend - 1;
+                    x1 += Constants.Vp9InterpExtend;
+                    xPad = 1;
+                }
+
+                if (subpelY != 0 || (sf.YStepQ4 != Filter.SubpelShifts))
+                {
+                    y0 -= Constants.Vp9InterpExtend - 1;
+                    y1 += Constants.Vp9InterpExtend;
+                    yPad = 1;
+                }
+
+                // Skip border extension if block is inside the frame.
+                if (x0 < 0 || x0 > frameWidth - 1 || x1 < 0 || x1 > frameWidth - 1 ||
+                    y0 < 0 || y0 > frameHeight - 1 || y1 < 0 || y1 > frameHeight - 1)
+                {
+                    // Extend the border.
+                    byte* bufPtr1 = refFrame + y0 * bufStride + x0;
+                    int bW = x1 - x0 + 1;
+                    int bH = y1 - y0 + 1;
+                    // Offset, inside of the extended copy, that skips the padding added above.
+                    int borderOffset = yPad * 3 * bW + xPad * 3;
+
+                    ExtendAndPredict(
+                        bufPtr1,
+                        bufStride,
+                        x0,
+                        y0,
+                        bW,
+                        bH,
+                        frameWidth,
+                        frameHeight,
+                        borderOffset,
+                        dst,
+                        dstBuf.Stride,
+                        subpelX,
+                        subpelY,
+                        kernel,
+                        ref sf,
+                        ref xd,
+                        w,
+                        h,
+                        refr,
+                        xs,
+                        ys);
+                    return;
+                }
+            }
+            // No border extension needed, predict directly from the reference frame.
+            if (xd.CurBuf.HighBd)
+            {
+                ReconInter.HighbdInterPredictor(
+                    (ushort*)bufPtr,
+                    bufStride,
+                    (ushort*)dst,
+                    dstBuf.Stride,
+                    subpelX,
+                    subpelY,
+                    ref sf,
+                    w,
+                    h,
+                    refr,
+                    kernel,
+                    xs,
+                    ys,
+                    xd.Bd);
+            }
+            else
+            {
+                ReconInter.InterPredictor(
+                    bufPtr,
+                    bufStride,
+                    dst,
+                    dstBuf.Stride,
+                    subpelX,
+                    subpelY,
+                    ref sf,
+                    w,
+                    h,
+                    refr,
+                    kernel,
+                    xs,
+                    ys);
+            }
+        }
+
+        /// <summary>
+        /// Builds the inter prediction of all planes of the current block, once for each
+        /// reference frame that the block uses.
+        /// </summary>
+        /// <remarks>
+        /// Blocks smaller than 8x8 are predicted in 4x4 units, each with its own motion vector.
+        /// Larger blocks are predicted as a whole using the block motion vector.
+        /// </remarks>
+        /// <param name="cm">Common decoder state, provides the reference frames</param>
+        /// <param name="xd">Macroblock descriptor of the block being predicted</param>
+        /// <param name="miRow">Row of the block, in mode info units</param>
+        /// <param name="miCol">Column of the block, in mode info units</param>
+        private static void DecBuildInterPredictorsSb(ref Vp9Common cm, ref MacroBlockD xd, int miRow, int miCol)
+        {
+            int miX = miCol * Constants.MiSize;
+            int miY = miRow * Constants.MiSize;
+            ref ModeInfo mi = ref xd.Mi[0].Value;
+            Array8<short>[] kernel = Luts.Vp9FilterKernels[mi.InterpFilter];
+            bool isSub8x8 = mi.SbType < BlockSize.Block8x8;
+            int refCount = mi.HasSecondRef() ? 2 : 1;
+
+            for (int refr = 0; refr < refCount; ++refr)
+            {
+                ref RefBuffer refBuf = ref cm.FrameRefs[mi.RefFrame[refr] - Constants.LastFrame];
+                ref ScaleFactors sf = ref refBuf.Sf;
+                ref Surface refFrameBuf = ref refBuf.Buf;
+
+                if (!sf.IsValidScale())
+                {
+                    xd.ErrorInfo.Value.InternalError(CodecErr.CodecUnsupBitstream, "Reference frame has invalid dimensions");
+                }
+
+                bool isScaled = sf.IsScaled();
+
+                // The scale factors are only handed over when the reference is actually scaled.
+                Ptr<ScaleFactors> scalePtr = isScaled ? new Ptr<ScaleFactors>(ref sf) : Ptr<ScaleFactors>.Null;
+                ReconInter.SetupPrePlanes(ref xd, refr, ref refFrameBuf, miRow, miCol, scalePtr);
+                xd.BlockRefs[refr] = new Ptr<RefBuffer>(ref refBuf);
+
+                if (!isSub8x8)
+                {
+                    // One motion vector for the whole block, shared by all planes.
+                    Mv blockMv = mi.Mv[refr];
+
+                    for (int plane = 0; plane < Constants.MaxMbPlane; ++plane)
+                    {
+                        ref MacroBlockDPlane pd = ref xd.Plane[plane];
+                        ref Buf2D dstBuf = ref pd.Dst;
+                        int width = 4 * pd.N4W;
+                        int height = 4 * pd.N4H;
+                        ref Buf2D preBuf = ref pd.Pre[refr];
+
+                        DecBuildInterPredictors(
+                            ref xd,
+                            plane,
+                            width,
+                            height,
+                            0,
+                            0,
+                            width,
+                            height,
+                            miX,
+                            miY,
+                            kernel,
+                            ref sf,
+                            ref preBuf,
+                            ref dstBuf,
+                            ref blockMv,
+                            ref refFrameBuf,
+                            isScaled,
+                            refr);
+                    }
+
+                    continue;
+                }
+
+                for (int plane = 0; plane < Constants.MaxMbPlane; ++plane)
+                {
+                    ref MacroBlockDPlane pd = ref xd.Plane[plane];
+                    ref Buf2D dstBuf = ref pd.Dst;
+                    int blocksWide = pd.N4W;
+                    int blocksHigh = pd.N4H;
+                    int width = 4 * blocksWide;
+                    int height = 4 * blocksHigh;
+                    ref Buf2D preBuf = ref pd.Pre[refr];
+
+                    for (int by = 0; by < blocksHigh; ++by)
+                    {
+                        for (int bx = 0; bx < blocksWide; ++bx)
+                        {
+                            // Sub-blocks are numbered in raster order.
+                            Mv subMv = ReconInter.AverageSplitMvs(ref pd, ref mi, refr, by * blocksWide + bx);
+
+                            DecBuildInterPredictors(
+                                ref xd,
+                                plane,
+                                width,
+                                height,
+                                4 * bx,
+                                4 * by,
+                                4,
+                                4,
+                                miX,
+                                miY,
+                                kernel,
+                                ref sf,
+                                ref preBuf,
+                                ref dstBuf,
+                                ref subMv,
+                                ref refFrameBuf,
+                                isScaled,
+                                refr);
+                        }
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Zeroes the above and left contexts of every plane, over the width and height
+        /// (in 4x4 units) of the current block.
+        /// </summary>
+        /// <param name="xd">Macroblock descriptor whose plane contexts are cleared</param>
+        private static unsafe void DecResetSkipContext(ref MacroBlockD xd)
+        {
+            const sbyte Cleared = 0;
+
+            for (int plane = 0; plane < Constants.MaxMbPlane; plane++)
+            {
+                ref MacroBlockDPlane pd = ref xd.Plane[plane];
+
+                MemoryUtil.Fill(pd.AboveContext.ToPointer(), Cleared, pd.N4W);
+                MemoryUtil.Fill(pd.LeftContext.ToPointer(), Cleared, pd.N4H);
+            }
+        }
+
+        /// <summary>
+        /// Updates the block dimensions of every plane, in 4x4 units and as its log2,
+        /// taking the subsampling of each plane into account.
+        /// </summary>
+        /// <param name="xd">Macroblock descriptor whose planes are updated</param>
+        /// <param name="bw">Block width; twice this value, shifted right by the horizontal subsampling, is stored</param>
+        /// <param name="bh">Block height; twice this value, shifted right by the vertical subsampling, is stored</param>
+        /// <param name="bwl">Log2 of the width; stored minus the horizontal subsampling</param>
+        /// <param name="bhl">Log2 of the height; stored minus the vertical subsampling</param>
+        private static void SetPlaneN4(ref MacroBlockD xd, int bw, int bh, int bwl, int bhl)
+        {
+            int bw4 = bw << 1;
+            int bh4 = bh << 1;
+
+            for (int plane = 0; plane < Constants.MaxMbPlane; plane++)
+            {
+                ref MacroBlockDPlane pd = ref xd.Plane[plane];
+
+                pd.N4W = (ushort)(bw4 >> pd.SubsamplingX);
+                pd.N4H = (ushort)(bh4 >> pd.SubsamplingY);
+                pd.N4Wl = (byte)(bwl - pd.SubsamplingX);
+                pd.N4Hl = (byte)(bhl - pd.SubsamplingY);
+            }
+        }
+
+        /// <summary>
+        /// Points the macroblock descriptor at the block located at the given mode info position,
+        /// and sets up the plane dimensions, contexts, edge distances and destination planes for it.
+        /// </summary>
+        /// <param name="cm">Common decoder state, provides the mode info storage and grid</param>
+        /// <param name="xd">Macroblock descriptor to set up</param>
+        /// <param name="bsize">Size of the block, stored on its mode info</param>
+        /// <param name="miRow">Row of the block, in mode info units</param>
+        /// <param name="miCol">Column of the block, in mode info units</param>
+        /// <param name="bw">Block width, in mode info units</param>
+        /// <param name="bh">Block height, in mode info units</param>
+        /// <param name="xMis">Number of grid columns that are made to point at the block mode info</param>
+        /// <param name="yMis">Number of grid rows that are made to point at the block mode info</param>
+        /// <param name="bwl">Log2 of the block width, forwarded to the plane setup</param>
+        /// <param name="bhl">Log2 of the block height, forwarded to the plane setup</param>
+        /// <returns>A reference to the mode info of the block</returns>
+        private static ref ModeInfo SetOffsets(
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            BlockSize bsize,
+            int miRow,
+            int miCol,
+            int bw,
+            int bh,
+            int xMis,
+            int yMis,
+            int bwl,
+            int bhl)
+        {
+            ref TileInfo tile = ref xd.Tile;
+            int offset = miRow * cm.MiStride + miCol;
+
+            xd.Mi = cm.MiGridVisible.Slice(offset);
+            xd.Mi[0] = new Ptr<ModeInfo>(ref cm.Mi[offset]);
+            xd.Mi[0].Value.SbType = bsize;
+
+            // Every grid entry covered by the block shares the mode info of its first entry.
+            for (int gridRow = 0; gridRow < yMis; ++gridRow)
+            {
+                // The first entry of the first row was already set above.
+                int gridCol = gridRow == 0 ? 1 : 0;
+
+                for (; gridCol < xMis; ++gridCol)
+                {
+                    xd.Mi[gridRow * cm.MiStride + gridCol] = xd.Mi[0];
+                }
+            }
+
+            SetPlaneN4(ref xd, bw, bh, bwl, bhl);
+
+            xd.SetSkipContext(miRow, miCol);
+
+            // Distance of Mb to the various image edges. These are specified to 8th pel
+            // as they are always compared to values that are in 1/8th pel units
+            xd.SetMiRowCol(ref tile, miRow, bh, miCol, bw, cm.MiRows, cm.MiCols);
+
+            ReconInter.SetupDstPlanes(ref xd.Plane, ref xd.CurBuf, miRow, miCol);
+
+            return ref xd.Mi[0].Value;
+        }
+
+        /// <summary>
+        /// Decodes one block: reads its mode info, then predicts and reconstructs every plane,
+        /// and finally builds the loop filter mask for it when the loop filter is enabled.
+        /// </summary>
+        /// <param name="twd">Tile worker data, provides the bit reader and macroblock descriptor</param>
+        /// <param name="cm">Common decoder state</param>
+        /// <param name="miRow">Row of the block, in mode info units</param>
+        /// <param name="miCol">Column of the block, in mode info units</param>
+        /// <param name="bsize">Size of the block</param>
+        /// <param name="bwl">Log2 of the block width; the width used here is 1 &lt;&lt; (bwl - 1)</param>
+        /// <param name="bhl">Log2 of the block height; the height used here is 1 &lt;&lt; (bhl - 1)</param>
+        private static void DecodeBlock(
+            ref TileWorkerData twd,
+            ref Vp9Common cm,
+            int miRow,
+            int miCol,
+            BlockSize bsize,
+            int bwl,
+            int bhl)
+        {
+            bool isSub8x8 = bsize < BlockSize.Block8x8;
+            int bw = 1 << (bwl - 1);
+            int bh = 1 << (bhl - 1);
+
+            // Clamp the block dimensions to the part that is inside of the frame.
+            int xMis = Math.Min(bw, cm.MiCols - miCol);
+            int yMis = Math.Min(bh, cm.MiRows - miRow);
+
+            ref Reader r = ref twd.BitReader;
+            ref MacroBlockD xd = ref twd.Xd;
+
+            ref ModeInfo mi = ref SetOffsets(ref cm, ref xd, bsize, miRow, miCol, bw, bh, xMis, yMis, bwl, bhl);
+
+            if (!isSub8x8 && (cm.SubsamplingX != 0 || cm.SubsamplingY != 0))
+            {
+                if (Luts.SsSizeLookup[(int)bsize][cm.SubsamplingX][cm.SubsamplingY] == BlockSize.BlockInvalid)
+                {
+                    xd.ErrorInfo.Value.InternalError(CodecErr.CodecCorruptFrame, "Invalid block size.");
+                }
+            }
+
+            DecodeMv.ReadModeInfo(ref twd, ref cm, miRow, miCol, xMis, yMis);
+
+            if (mi.Skip != 0)
+            {
+                DecResetSkipContext(ref xd);
+            }
+
+            if (mi.IsInterBlock())
+            {
+                // Prediction
+                DecBuildInterPredictorsSb(ref cm, ref xd, miRow, miCol);
+
+                // Reconstruction
+                if (mi.Skip == 0)
+                {
+                    int eobTotal = 0;
+
+                    for (int plane = 0; plane < Constants.MaxMbPlane; ++plane)
+                    {
+                        ref MacroBlockDPlane pd = ref xd.Plane[plane];
+                        TxSize txSize = plane == 0 ? mi.TxSize : mi.GetUvTxSize(ref pd);
+                        int step = 1 << (int)txSize;
+
+                        // When the block crosses the right or bottom edge of the frame,
+                        // only the 4x4 blocks that are inside of the frame are processed.
+                        int maxBlocksWide = pd.N4W;
+                        if (xd.MbToRightEdge < 0)
+                        {
+                            maxBlocksWide += xd.MbToRightEdge >> (5 + pd.SubsamplingX);
+                            xd.MaxBlocksWide = (uint)maxBlocksWide;
+                        }
+                        else
+                        {
+                            xd.MaxBlocksWide = 0;
+                        }
+
+                        int maxBlocksHigh = pd.N4H;
+                        if (xd.MbToBottomEdge < 0)
+                        {
+                            maxBlocksHigh += xd.MbToBottomEdge >> (5 + pd.SubsamplingY);
+                            xd.MaxBlocksHigh = (uint)maxBlocksHigh;
+                        }
+                        else
+                        {
+                            xd.MaxBlocksHigh = 0;
+                        }
+
+                        for (int row = 0; row < maxBlocksHigh; row += step)
+                        {
+                            for (int col = 0; col < maxBlocksWide; col += step)
+                            {
+                                eobTotal += ReconstructInterBlock(ref twd, ref mi, plane, row, col, txSize);
+                            }
+                        }
+                    }
+
+                    if (eobTotal == 0 && !isSub8x8)
+                    {
+                        mi.Skip = 1;  // Skip loopfilter
+                    }
+                }
+            }
+            else
+            {
+                for (int plane = 0; plane < Constants.MaxMbPlane; ++plane)
+                {
+                    ref MacroBlockDPlane pd = ref xd.Plane[plane];
+                    TxSize txSize = plane == 0 ? mi.TxSize : mi.GetUvTxSize(ref pd);
+                    int step = 1 << (int)txSize;
+
+                    // When the block crosses the right or bottom edge of the frame,
+                    // only the 4x4 blocks that are inside of the frame are processed.
+                    int maxBlocksWide = pd.N4W;
+                    if (xd.MbToRightEdge < 0)
+                    {
+                        maxBlocksWide += xd.MbToRightEdge >> (5 + pd.SubsamplingX);
+                        xd.MaxBlocksWide = (uint)maxBlocksWide;
+                    }
+                    else
+                    {
+                        xd.MaxBlocksWide = 0;
+                    }
+
+                    int maxBlocksHigh = pd.N4H;
+                    if (xd.MbToBottomEdge < 0)
+                    {
+                        maxBlocksHigh += xd.MbToBottomEdge >> (5 + pd.SubsamplingY);
+                        xd.MaxBlocksHigh = (uint)maxBlocksHigh;
+                    }
+                    else
+                    {
+                        xd.MaxBlocksHigh = 0;
+                    }
+
+                    for (int row = 0; row < maxBlocksHigh; row += step)
+                    {
+                        for (int col = 0; col < maxBlocksWide; col += step)
+                        {
+                            PredictAndReconstructIntraBlock(ref twd, ref mi, plane, row, col, txSize);
+                        }
+                    }
+                }
+            }
+
+            xd.Corrupted |= r.HasError();
+
+            if (cm.Lf.FilterLevel != 0)
+            {
+                LoopFilter.BuildMask(ref cm, ref mi, miRow, miCol, bw, bh);
+            }
+        }
+
+        // Builds the context index used to pick the partition probabilities for the block at
+        // (miRow, miCol). Bit 'bsl' of the above entry (indexed by miCol) and of the left entry
+        // (indexed by miRow & Constants.MiMask) are combined as (left * 2 + above) and offset by
+        // bsl * Constants.PartitionPloffset.
+        private static int DecPartitionPlaneContext(ref TileWorkerData twd, int miRow, int miCol, int bsl)
+        {
+            int aboveBit = (twd.Xd.AboveSegContext[miCol] >> bsl) & 1;
+            int leftBit = (twd.Xd.LeftSegContext[miRow & Constants.MiMask] >> bsl) & 1;
+            int neighbourBits = leftBit * 2 + aboveBit;
+
+            return neighbourBits + bsl * Constants.PartitionPloffset;
+        }
+
+        // Writes the partition context of 'subsize' into the above and left context arrays.
+        // 'bw' entries are filled in each array, starting at miCol (above) and at
+        // miRow & Constants.MiMask (left).
+        private static void DecUpdatePartitionContext(
+            ref TileWorkerData twd,
+            int miRow,
+            int miCol,
+            BlockSize subsize,
+            int bw)
+        {
+            int leftStart = miRow & Constants.MiMask;
+            Span<sbyte> aboveRun = twd.Xd.AboveSegContext.Slice(miCol).ToSpan();
+            Span<sbyte> leftRun = MemoryMarshal.CreateSpan(ref twd.Xd.LeftSegContext[leftStart], 8 - leftStart);
+
+            // Update the partition context at the end nodes. Partition bits of block sizes
+            // larger than the current one are set to one, and partition bits of smaller
+            // block sizes are set to zero (both values come from Luts.PartitionContextLookup).
+            aboveRun.Slice(0, bw).Fill(Luts.PartitionContextLookup[(int)subsize].Above);
+            leftRun.Slice(0, bw).Fill(Luts.PartitionContextLookup[(int)subsize].Left);
+        }
+
+        // Reads the partition type of the block at (miRow, miCol) from the tile's bit reader.
+        // hasRows / hasCols are non-zero when the second half of the block (in that direction)
+        // is still inside the frame:
+        //  - both set: the full partition tree is read;
+        //  - only hasCols: one bit chooses between split and horizontal;
+        //  - only hasRows: one bit chooses between split and vertical;
+        //  - neither: split is implied and nothing is read.
+        // The choice is counted in the backward-update counters when they are present.
+        private static PartitionType ReadPartition(
+            ref TileWorkerData twd,
+            int miRow,
+            int miCol,
+            int hasRows,
+            int hasCols,
+            int bsl)
+        {
+            int ctx = DecPartitionPlaneContext(ref twd, miRow, miCol, bsl);
+            ReadOnlySpan<byte> probs = MemoryMarshal.CreateReadOnlySpan(ref twd.Xd.PartitionProbs[ctx][0], 3);
+            ref Reader reader = ref twd.BitReader;
+            bool rowsAvailable = hasRows != 0;
+            bool colsAvailable = hasCols != 0;
+            PartitionType partition;
+
+            if (rowsAvailable && colsAvailable)
+            {
+                partition = (PartitionType)reader.ReadTree(Luts.Vp9PartitionTree, probs);
+            }
+            else if (colsAvailable)
+            {
+                partition = reader.Read(probs[1]) != 0 ? PartitionType.PartitionSplit : PartitionType.PartitionHorz;
+            }
+            else if (rowsAvailable)
+            {
+                partition = reader.Read(probs[2]) != 0 ? PartitionType.PartitionSplit : PartitionType.PartitionVert;
+            }
+            else
+            {
+                partition = PartitionType.PartitionSplit;
+            }
+
+            if (!twd.Xd.Counts.IsNull)
+            {
+                ++twd.Xd.Counts.Value.Partition[ctx][(int)partition];
+            }
+
+            return partition;
+        }
+
+        // Reads the partition type of the square block 'bsize' located at (miRow, miCol) and
+        // decodes its contents: directly through DecodeBlock for none/horizontal/vertical
+        // partitions, or by recursing into the four quadrants for a split partition.
+        // Nothing is read or decoded when the position lies outside cm.MiRows / cm.MiCols.
+        // 'n4x4L2' is passed on to DecodeBlock as a block dimension and reduced by one for each
+        // recursion level (presumably log2 of the block size in 4x4 units -- confirm against
+        // DecodeBlock).
+        private static void DecodePartition(
+            ref TileWorkerData twd,
+            ref Vp9Common cm,
+            int miRow,
+            int miCol,
+            BlockSize bsize,
+            int n4x4L2)
+        {
+            int n8x8L2 = n4x4L2 - 1;
+            int num8x8Wh = 1 << n8x8L2;
+            // Half of the block size; offset of the second half / quadrants.
+            int hbs = num8x8Wh >> 1;
+            PartitionType partition;
+            BlockSize subsize;
+            // True when the second half of the block still starts inside the frame.
+            bool hasRows = (miRow + hbs) < cm.MiRows;
+            bool hasCols = (miCol + hbs) < cm.MiCols;
+            ref MacroBlockD xd = ref twd.Xd;
+
+            if (miRow >= cm.MiRows || miCol >= cm.MiCols)
+            {
+                return;
+            }
+
+            partition = ReadPartition(ref twd, miRow, miCol, hasRows ? 1 : 0, hasCols ? 1 : 0, n8x8L2);
+            subsize = Luts.SubsizeLookup[(int)partition][(int)bsize];
+            if (hbs == 0)
+            {
+                // Smallest level: the block cannot be halved any further, so a single
+                // DecodeBlock call handles every partition type.
+                // Calculate bmode block dimensions (log 2)
+                xd.BmodeBlocksWl = (byte)(1 >> ((partition & PartitionType.PartitionVert) != 0 ? 1 : 0));
+                xd.BmodeBlocksHl = (byte)(1 >> ((partition & PartitionType.PartitionHorz) != 0 ? 1 : 0));
+                DecodeBlock(ref twd, ref cm, miRow, miCol, subsize, 1, 1);
+            }
+            else
+            {
+                switch (partition)
+                {
+                    case PartitionType.PartitionNone:
+                        DecodeBlock(ref twd, ref cm, miRow, miCol, subsize, n4x4L2, n4x4L2);
+                        break;
+                    case PartitionType.PartitionHorz:
+                        DecodeBlock(ref twd, ref cm, miRow, miCol, subsize, n4x4L2, n8x8L2);
+                        // The bottom half is only decoded when it is inside the frame.
+                        if (hasRows)
+                        {
+                            DecodeBlock(ref twd, ref cm, miRow + hbs, miCol, subsize, n4x4L2, n8x8L2);
+                        }
+
+                        break;
+                    case PartitionType.PartitionVert:
+                        DecodeBlock(ref twd, ref cm, miRow, miCol, subsize, n8x8L2, n4x4L2);
+                        // The right half is only decoded when it is inside the frame.
+                        if (hasCols)
+                        {
+                            DecodeBlock(ref twd, ref cm, miRow, miCol + hbs, subsize, n8x8L2, n4x4L2);
+                        }
+
+                        break;
+                    case PartitionType.PartitionSplit:
+                        // Quadrants outside the frame return immediately from the recursive call.
+                        DecodePartition(ref twd, ref cm, miRow, miCol, subsize, n8x8L2);
+                        DecodePartition(ref twd, ref cm, miRow, miCol + hbs, subsize, n8x8L2);
+                        DecodePartition(ref twd, ref cm, miRow + hbs, miCol, subsize, n8x8L2);
+                        DecodePartition(ref twd, ref cm, miRow + hbs, miCol + hbs, subsize, n8x8L2);
+                        break;
+                    default: Debug.Assert(false, "Invalid partition type"); break;
+                }
+            }
+
+            // Update partition context
+            // For a split of a block larger than 8x8 the recursive calls have already
+            // updated the context, so it is skipped here.
+            if (bsize >= BlockSize.Block8x8 && (bsize == BlockSize.Block8x8 || partition != PartitionType.PartitionSplit))
+            {
+                DecUpdatePartitionContext(ref twd, miRow, miCol, subsize, num8x8Wh);
+            }
+        }
+
+        // Points the bit reader 'r' at a tile's data.
+        // Reports CodecCorruptFrame through 'errorInfo' when ReadIsValid rejects the
+        // (data, readSize) pair, and CodecMemError when Reader.Init returns true.
+        private static void SetupTokenDecoder(
+            ArrayPtr<byte> data,
+            int readSize,
+            ref InternalErrorInfo errorInfo,
+            ref Reader r)
+        {
+            // The partition described by readSize must be fully readable from 'data'.
+            bool partitionReadable = ReadIsValid(data, readSize);
+            if (!partitionReadable)
+            {
+                errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile length");
+            }
+
+            bool initFailed = r.Init(data, readSize);
+            if (initFailed)
+            {
+                errorInfo.InternalError(CodecErr.CodecMemError, "Failed to allocate bool decoder 1");
+            }
+        }
+
+        // Reads the next tile from 'data', storing its start and size in 'buf' and advancing
+        // 'data' past the tile.
+        // Every tile except the last one is preceded by a 4-byte big-endian size field; the
+        // last tile (isLast == true) takes all of the remaining bytes.
+        // A missing size field, or a size that does not fit in the remaining data, is reported
+        // through errorInfo.InternalError with CodecCorruptFrame.
+        private static void GetTileBuffer(
+            bool isLast,
+            ref InternalErrorInfo errorInfo,
+            ref ArrayPtr<byte> data,
+            ref TileBuffer buf)
+        {
+            int size;
+
+            if (!isLast)
+            {
+                if (!ReadIsValid(data, 4))
+                {
+                    errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile length");
+                }
+
+                size = BinaryPrimitives.ReadInt32BigEndian(data.ToSpan());
+                data = data.Slice(4);
+
+                // The size field is an unsigned 32-bit quantity in the bitstream. A value with
+                // the top bit set reads back as a negative int, which would slip past a plain
+                // "size > data.Length" comparison and then be used to slice the buffer, so
+                // negative sizes are rejected here as well.
+                if (size < 0 || size > data.Length)
+                {
+                    errorInfo.InternalError(CodecErr.CodecCorruptFrame, "Truncated packet or corrupt tile size");
+                }
+            }
+            else
+            {
+                size = data.Length;
+            }
+
+            buf.Data = data;
+            buf.Size = size;
+
+            data = data.Slice(size);
+        }
+
+        // Splits 'data' into one TileBuffer per tile, in row-major order, by calling
+        // GetTileBuffer for each (row, column) pair. Only the bottom-right tile is flagged as
+        // the last one. Errors are reported through cm.Error.
+        private static void GetTileBuffers(
+            ref Vp9Common cm,
+            ArrayPtr<byte> data,
+            int tileCols,
+            int tileRows,
+            ref Array4<Array64<TileBuffer>> tileBuffers)
+        {
+            int lastRow = tileRows - 1;
+            int lastCol = tileCols - 1;
+
+            for (int row = 0; row < tileRows; ++row)
+            {
+                for (int col = 0; col < tileCols; ++col)
+                {
+                    bool isLast = row == lastRow && col == lastCol;
+                    GetTileBuffer(isLast, ref cm.Error, ref data, ref tileBuffers[row][col]);
+                }
+            }
+        }
+
+        // Decodes every tile of the current frame from 'data'.
+        // Steps: clear the above contexts, reset the loop filter masks, split 'data' into
+        // per-tile buffers, initialise one TileWorkerData per tile, then decode the tiles
+        // row by row in steps of Constants.MiBlockSize.
+        // Returns the result of FindEnd() on the bit reader of the last tile.
+        // Corruption found while decoding is reported through cm.Error.InternalError.
+        public static unsafe ArrayPtr<byte> DecodeTiles(ref Vp9Common cm, ArrayPtr<byte> data)
+        {
+            int alignedCols = TileInfo.MiColsAlignedToSb(cm.MiCols);
+            int tileCols = 1 << cm.Log2TileCols;
+            int tileRows = 1 << cm.Log2TileRows;
+            Array4<Array64<TileBuffer>> tileBuffers = new Array4<Array64<TileBuffer>>();
+            int tileRow, tileCol;
+            int miRow, miCol;
+
+            // tileBuffers is a fixed 4 x 64 array, so the tile counts must fit in it.
+            Debug.Assert(tileRows <= 4);
+            Debug.Assert(tileCols <= (1 << 6));
+
+            // Note: this memset assumes above_context[0], [1] and [2]
+            // are allocated as part of the same buffer.
+            MemoryUtil.Fill(cm.AboveContext.ToPointer(), (sbyte)0, Constants.MaxMbPlane * 2 * alignedCols);
+            MemoryUtil.Fill(cm.AboveSegContext.ToPointer(), (sbyte)0, alignedCols);
+
+            LoopFilter.ResetLfm(ref cm);
+
+            GetTileBuffers(ref cm, data, tileCols, tileRows, ref tileBuffers);
+            // Load all tile information into tile_data.
+            for (tileRow = 0; tileRow < tileRows; ++tileRow)
+            {
+                for (tileCol = 0; tileCol < tileCols; ++tileCol)
+                {
+                    ref TileBuffer buf = ref tileBuffers[tileRow][tileCol];
+                    ref TileWorkerData tileData = ref cm.TileWorkerData[tileCols * tileRow + tileCol];
+                    // Each tile starts from a copy of the common macroblock state.
+                    tileData.Xd = cm.Mb;
+                    tileData.Xd.Corrupted = false;
+                    // No counters are collected in frame parallel decoding mode.
+                    tileData.Xd.Counts = cm.FrameParallelDecodingMode ? Ptr<Vp9BackwardUpdates>.Null : cm.Counts;
+                    tileData.Dqcoeff = new Array32<Array32<int>>();
+                    tileData.Xd.Tile.Init(ref cm, tileRow, tileCol);
+                    SetupTokenDecoder(buf.Data, buf.Size, ref cm.Error, ref tileData.BitReader);
+                    cm.InitMacroBlockD(ref tileData.Xd, new ArrayPtr<int>(ref tileData.Dqcoeff[0][0], 32 * 32));
+                }
+            }
+
+            for (tileRow = 0; tileRow < tileRows; ++tileRow)
+            {
+                TileInfo tile = new TileInfo();
+                tile.SetRow(ref cm, tileRow);
+                for (miRow = tile.MiRowStart; miRow < tile.MiRowEnd; miRow += Constants.MiBlockSize)
+                {
+                    for (tileCol = 0; tileCol < tileCols; ++tileCol)
+                    {
+                        int col = tileCol;
+                        ref TileWorkerData tileData = ref cm.TileWorkerData[tileCols * tileRow + col];
+                        tile.SetCol(ref cm, col);
+                        // The left contexts are cleared at the start of each row of blocks in a tile.
+                        tileData.Xd.LeftContext = new Array3<Array16<sbyte>>();
+                        tileData.Xd.LeftSegContext = new Array8<sbyte>();
+                        for (miCol = tile.MiColStart; miCol < tile.MiColEnd; miCol += Constants.MiBlockSize)
+                        {
+                            DecodePartition(ref tileData, ref cm, miRow, miCol, BlockSize.Block64x64, 4);
+                        }
+                        // Corruption found in any tile marks the whole frame as corrupted.
+                        cm.Mb.Corrupted |= tileData.Xd.Corrupted;
+                        if (cm.Mb.Corrupted)
+                        {
+                            cm.Error.InternalError(CodecErr.CodecCorruptFrame, "Failed to decode tile data");
+                        };
+                    }
+                }
+            }
+
+            // Get last tile data.
+            return cm.TileWorkerData[tileCols * tileRows - 1].BitReader.FindEnd();
+        }
+    }
+}

+ 1159 - 0
Ryujinx.Graphics.Nvdec.Vp9/DecodeMv.cs

@@ -0,0 +1,1159 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using Ryujinx.Graphics.Video;
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv;
+using MvRef = Ryujinx.Graphics.Nvdec.Vp9.Types.MvRef;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class DecodeMv
+    {
+        private const int MvrefNeighbours = 8;
+
+        // Reads an intra prediction mode from the bit reader using the intra mode tree and
+        // the probabilities in 'p'.
+        private static PredictionMode ReadIntraMode(ref Reader r, ReadOnlySpan<byte> p)
+        {
+            int treeValue = r.ReadTree(Luts.Vp9IntraModeTree, p);
+
+            return (PredictionMode)treeValue;
+        }
+
+        // Reads a luma intra mode using the Y mode probabilities of 'sizeGroup' and counts
+        // the result in the backward-update counters when they are present.
+        private static PredictionMode ReadIntraModeY(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r, int sizeGroup)
+        {
+            ReadOnlySpan<byte> probs = cm.Fc.Value.YModeProb[sizeGroup].ToSpan();
+            PredictionMode mode = ReadIntraMode(ref r, probs);
+
+            if (xd.Counts.IsNull)
+            {
+                return mode;
+            }
+
+            ++xd.Counts.Value.YMode[sizeGroup][(int)mode];
+            return mode;
+        }
+
+        // Reads a chroma intra mode using the UV mode probabilities selected by the luma mode
+        // 'yMode' and counts the result in the backward-update counters when they are present.
+        private static PredictionMode ReadIntraModeUv(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r, byte yMode)
+        {
+            ReadOnlySpan<byte> probs = cm.Fc.Value.UvModeProb[yMode].ToSpan();
+            PredictionMode mode = ReadIntraMode(ref r, probs);
+
+            if (xd.Counts.IsNull)
+            {
+                return mode;
+            }
+
+            ++xd.Counts.Value.UvMode[yMode][(int)mode];
+            return mode;
+        }
+
+        // Reads an inter prediction mode for context 'ctx'. The value read from the inter mode
+        // tree is an offset from PredictionMode.NearestMv. The offset is counted in the
+        // backward-update counters when they are present.
+        private static PredictionMode ReadInterMode(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r, int ctx)
+        {
+            ReadOnlySpan<byte> probs = cm.Fc.Value.InterModeProb[ctx].ToSpan();
+            int modeOffset = r.ReadTree(Luts.Vp9InterModeTree, probs);
+
+            if (!xd.Counts.IsNull)
+            {
+                ++xd.Counts.Value.InterMode[ctx][modeOffset];
+            }
+
+            PredictionMode result = PredictionMode.NearestMv + modeOffset;
+            return result;
+        }
+
+        // Reads a segment id from the bit reader using the segment tree and 'segTreeProbs'.
+        private static int ReadSegmentId(ref Reader r, ref Array7<byte> segTreeProbs)
+        {
+            ReadOnlySpan<byte> probs = segTreeProbs.ToSpan();
+
+            return r.ReadTree(Luts.Vp9SegmentTree, probs);
+        }
+
+        // Returns the transform size probabilities for context 'ctx' that belong to the given
+        // maximum transform size. Any size other than 8x8, 16x16 or 32x32 trips a debug
+        // assertion and yields an empty span.
+        private static ReadOnlySpan<byte> GetTxProbs(ref Vp9EntropyProbs fc, TxSize maxTxSize, int ctx)
+        {
+            if (maxTxSize == TxSize.Tx8x8)
+            {
+                return fc.Tx8x8Prob[ctx].ToSpan();
+            }
+
+            if (maxTxSize == TxSize.Tx16x16)
+            {
+                return fc.Tx16x16Prob[ctx].ToSpan();
+            }
+
+            if (maxTxSize == TxSize.Tx32x32)
+            {
+                return fc.Tx32x32Prob[ctx].ToSpan();
+            }
+
+            Debug.Assert(false, "Invalid maxTxSize.");
+            return ReadOnlySpan<byte>.Empty;
+        }
+
+        // Returns the transform size counters for context 'ctx' that belong to the given
+        // maximum transform size. Any size other than 8x8, 16x16 or 32x32 trips a debug
+        // assertion and yields an empty span.
+        private static Span<uint> GetTxCounts(ref Vp9BackwardUpdates counts, TxSize maxTxSize, int ctx)
+        {
+            if (maxTxSize == TxSize.Tx8x8)
+            {
+                return counts.Tx8x8[ctx].ToSpan();
+            }
+
+            if (maxTxSize == TxSize.Tx16x16)
+            {
+                return counts.Tx16x16[ctx].ToSpan();
+            }
+
+            if (maxTxSize == TxSize.Tx32x32)
+            {
+                return counts.Tx32x32[ctx].ToSpan();
+            }
+
+            Debug.Assert(false, "Invalid maxTxSize.");
+            return Span<uint>.Empty;
+        }
+
+        // Reads an explicitly coded transform size, no larger than 'maxTxSize'.
+        // The size is built up one bit at a time: a further bit is only read while the previous
+        // bit asked for a larger size and 'maxTxSize' still allows one. The result is counted
+        // in the backward-update counters when they are present.
+        private static TxSize ReadSelectedTxSize(ref Vp9Common cm, ref MacroBlockD xd, TxSize maxTxSize, ref Reader r)
+        {
+            int ctx = xd.GetTxSizeContext();
+            ReadOnlySpan<byte> probs = GetTxProbs(ref cm.Fc.Value, maxTxSize, ctx);
+            bool allows16x16 = maxTxSize >= TxSize.Tx16x16;
+            bool allows32x32 = maxTxSize >= TxSize.Tx32x32;
+
+            TxSize selected = (TxSize)r.Read(probs[0]);
+            if (selected != TxSize.Tx4x4 && allows16x16)
+            {
+                selected += r.Read(probs[1]);
+                if (selected != TxSize.Tx8x8 && allows32x32)
+                {
+                    selected += r.Read(probs[2]);
+                }
+            }
+
+            if (xd.Counts.IsNull)
+            {
+                return selected;
+            }
+
+            ++GetTxCounts(ref xd.Counts.Value, maxTxSize, ctx)[(int)selected];
+            return selected;
+        }
+
+        // Determines the transform size of the current block (xd.Mi[0]).
+        // The size is read from the bitstream only when selection is allowed, the frame uses
+        // TxModeSelect and the block is at least 8x8. Otherwise it is the smaller of the
+        // largest size for the block and the largest size for the frame's transform mode.
+        private static TxSize ReadTxSize(ref Vp9Common cm, ref MacroBlockD xd, bool allowSelect, ref Reader r)
+        {
+            TxMode txMode = cm.TxMode;
+            BlockSize bsize = xd.Mi[0].Value.SbType;
+            TxSize maxTxSize = Luts.MaxTxSizeLookup[(int)bsize];
+            bool codedInStream = allowSelect && txMode == TxMode.TxModeSelect && bsize >= BlockSize.Block8x8;
+
+            if (!codedInStream)
+            {
+                int blockLimit = (int)maxTxSize;
+                int modeLimit = (int)Luts.TxModeToBiggestTxSize[(int)txMode];
+
+                return (TxSize)Math.Min(blockLimit, modeLimit);
+            }
+
+            return ReadSelectedTxSize(ref cm, ref xd, maxTxSize, ref r);
+        }
+
+        // Returns the smallest segment id found in the xMis x yMis rectangle of 'segmentIds'
+        // that starts at 'miOffset'. Rows of the map are cm.MiCols entries apart.
+        private static int DecGetSegmentId(ref Vp9Common cm, ArrayPtr<byte> segmentIds, int miOffset, int xMis, int yMis)
+        {
+            int smallest = int.MaxValue;
+
+            for (int row = 0; row < yMis; row++)
+            {
+                int rowStart = miOffset + row * cm.MiCols;
+
+                for (int col = 0; col < xMis; col++)
+                {
+                    smallest = Math.Min(smallest, segmentIds[rowStart + col]);
+                }
+            }
+
+            Debug.Assert(smallest >= 0 && smallest < Constants.MaxSegments);
+            return smallest;
+        }
+
+        // Stores 'segmentId' in every entry of the xMis x yMis rectangle of
+        // cm.CurrentFrameSegMap that starts at 'miOffset'. Rows are cm.MiCols entries apart.
+        private static void SetSegmentId(ref Vp9Common cm, int miOffset, int xMis, int yMis, int segmentId)
+        {
+            Debug.Assert(segmentId >= 0 && segmentId < Constants.MaxSegments);
+
+            for (int row = 0; row < yMis; row++)
+            {
+                int rowStart = miOffset + row * cm.MiCols;
+
+                for (int col = 0; col < xMis; col++)
+                {
+                    cm.CurrentFrameSegMap[rowStart + col] = (byte)segmentId;
+                }
+            }
+        }
+
+        // Copies the xMis x yMis rectangle starting at 'miOffset' from 'lastSegmentIds' into
+        // 'currentSegmentIds'. When 'lastSegmentIds' is null the rectangle is filled with zero.
+        // Rows are cm.MiCols entries apart.
+        private static void CopySegmentId(
+            ref Vp9Common cm,
+            ArrayPtr<byte> lastSegmentIds,
+            ArrayPtr<byte> currentSegmentIds,
+            int miOffset,
+            int xMis,
+            int yMis)
+        {
+            for (int row = 0; row < yMis; row++)
+            {
+                for (int col = 0; col < xMis; col++)
+                {
+                    int index = miOffset + row * cm.MiCols + col;
+                    byte segmentId = 0;
+
+                    if (!lastSegmentIds.IsNull)
+                    {
+                        segmentId = lastSegmentIds[index];
+                    }
+
+                    currentSegmentIds[index] = segmentId;
+                }
+            }
+        }
+
+        // Determines the segment id of a block in an intra-only context.
+        //  - Segmentation disabled: returns 0.
+        //  - Map not updated: copies the previous frame's ids for the block into the current
+        //    map and returns 0.
+        //  - Otherwise: reads the id from the bitstream, stores it in the current map and
+        //    returns it.
+        private static int ReadIntraSegmentId(ref Vp9Common cm, int miOffset, int xMis, int yMis, ref Reader r)
+        {
+            if (!cm.Seg.Enabled)
+            {
+                return 0;  // Default for disabled segmentation
+            }
+
+            if (cm.Seg.UpdateMap)
+            {
+                int codedId = ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb);
+                SetSegmentId(ref cm, miOffset, xMis, yMis, codedId);
+                return codedId;
+            }
+
+            CopySegmentId(ref cm, cm.LastFrameSegMap, cm.CurrentFrameSegMap, miOffset, xMis, yMis);
+            return 0;
+        }
+
+        // Determines the segment id of the block at (miRow, miCol) in an inter frame.
+        //  - Segmentation disabled: returns 0.
+        //  - Map not updated: copies the previous frame's ids into the current map and returns
+        //    the predicted id (minimum id of the block in the previous map, or 0 without one).
+        //  - Temporal update: a flag read from the bitstream selects between the predicted id
+        //    and an explicitly coded id; the flag is stored in the block's SegIdPredicted.
+        //  - Otherwise: the id is read from the bitstream.
+        // In the last two cases the id is also written to the current segmentation map.
+        private static int ReadInterSegmentId(
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            int miRow,
+            int miCol,
+            ref Reader r,
+            int xMis,
+            int yMis)
+        {
+            ref Segmentation seg = ref cm.Seg;
+            ref ModeInfo mi = ref xd.Mi[0].Value;
+            int miOffset = miRow * cm.MiCols + miCol;
+
+            if (!seg.Enabled)
+            {
+                return 0;  // Default for disabled segmentation
+            }
+
+            int predictedSegmentId = 0;
+            if (!cm.LastFrameSegMap.IsNull)
+            {
+                predictedSegmentId = DecGetSegmentId(ref cm, cm.LastFrameSegMap, miOffset, xMis, yMis);
+            }
+
+            if (!seg.UpdateMap)
+            {
+                CopySegmentId(ref cm, cm.LastFrameSegMap, cm.CurrentFrameSegMap, miOffset, xMis, yMis);
+                return predictedSegmentId;
+            }
+
+            bool usePrediction = false;
+            if (seg.TemporalUpdate)
+            {
+                byte predProb = Segmentation.GetPredProbSegId(ref cm.Fc.Value.SegPredProb, ref xd);
+                mi.SegIdPredicted = (sbyte)r.Read(predProb);
+                usePrediction = mi.SegIdPredicted != 0;
+            }
+
+            int segmentId = usePrediction
+                ? predictedSegmentId
+                : ReadSegmentId(ref r, ref cm.Fc.Value.SegTreeProb);
+
+            SetSegmentId(ref cm, miOffset, xMis, yMis, segmentId);
+            return segmentId;
+        }
+
+        // Returns the skip flag of the current block. The flag is 1 without reading anything
+        // when the skip feature is active for 'segmentId'. Otherwise it is read from the
+        // bitstream and counted in the backward-update counters when they are present.
+        private static int ReadSkip(ref Vp9Common cm, ref MacroBlockD xd, int segmentId, ref Reader r)
+        {
+            bool forcedBySegment = cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlSkip) != 0;
+            if (forcedBySegment)
+            {
+                return 1;
+            }
+
+            int ctx = xd.GetSkipContext();
+            int skipFlag = r.Read(cm.Fc.Value.SkipProb[ctx]);
+
+            if (!xd.Counts.IsNull)
+            {
+                ++xd.Counts.Value.Skip[ctx][skipFlag];
+            }
+
+            return skipFlag;
+        }
+
+        // Reads one motion vector difference component ('mvcomp' selects the probability set)
+        // and returns it as a signed value.
+        // Read order: sign, magnitude class, integer bits, fractional part, and -- only when
+        // 'usehp' is set -- the high precision bit.
+        private static int ReadMvComponent(ref Reader r, ref Vp9EntropyProbs fc, int mvcomp, bool usehp)
+        {
+            bool negative = r.Read(fc.Sign[mvcomp]) != 0;
+            MvClassType mvClass = (MvClassType)r.ReadTree(Luts.Vp9MvClassTree, fc.Classes[mvcomp].ToSpan());
+            bool isClass0 = mvClass == MvClassType.MvClass0;
+            int magnitude;
+            int integerBits;
+
+            // Integer part
+            if (isClass0)
+            {
+                integerBits = r.Read(fc.Class0[mvcomp][0]);
+                magnitude = 0;
+            }
+            else
+            {
+                int bitCount = (int)mvClass + Constants.Class0Bits - 1;  // Number of bits
+
+                integerBits = 0;
+                for (int bit = 0; bit < bitCount; ++bit)
+                {
+                    integerBits |= r.Read(fc.Bits[mvcomp][bit]) << bit;
+                }
+
+                magnitude = Constants.Class0Size << ((int)mvClass + 2);
+            }
+
+            // Fractional part
+            int fraction = r.ReadTree(
+                Luts.Vp9MvFPTree,
+                isClass0 ? fc.Class0Fp[mvcomp][integerBits].ToSpan() : fc.Fp[mvcomp].ToSpan());
+
+            // High precision part (if hp is not used, the default value of the hp is 1)
+            int highPrecision = 1;
+            if (usehp)
+            {
+                highPrecision = r.Read(isClass0 ? fc.Class0Hp[mvcomp] : fc.Hp[mvcomp]);
+            }
+
+            // Result
+            magnitude += ((integerBits << 3) | (fraction << 1) | highPrecision) + 1;
+
+            if (negative)
+            {
+                return -magnitude;
+            }
+
+            return magnitude;
+        }
+
+        // Reads a motion vector difference from the bitstream and stores 'refr' plus that
+        // difference in 'mv'. The joint type decides which components are coded; uncoded
+        // components stay zero. High precision is used only when 'allowHP' is set and
+        // refr.UseMvHp() agrees. The difference is passed to IncMv together with 'counts'.
+        private static void ReadMv(
+            ref Reader r,
+            ref Mv mv,
+            ref Mv refr,
+            ref Vp9EntropyProbs fc,
+            Ptr<Vp9BackwardUpdates> counts,
+            bool allowHP)
+        {
+            MvJointType joint = (MvJointType)r.ReadTree(Luts.Vp9MvJointTree, fc.Joints.ToSpan());
+            bool highPrecision = allowHP && refr.UseMvHp();
+            Mv delta = new Mv();
+
+            bool hasRowDelta = Mv.MvJointVertical(joint);
+            if (hasRowDelta)
+            {
+                delta.Row = (short)ReadMvComponent(ref r, ref fc, 0, highPrecision);
+            }
+
+            bool hasColDelta = Mv.MvJointHorizontal(joint);
+            if (hasColDelta)
+            {
+                delta.Col = (short)ReadMvComponent(ref r, ref fc, 1, highPrecision);
+            }
+
+            delta.IncMv(counts);
+
+            mv.Row = (short)(refr.Row + delta.Row);
+            mv.Col = (short)(refr.Col + delta.Col);
+        }
+
+        // Returns the reference mode of the current block. When the frame uses
+        // ReferenceModeSelect the mode is read from the bitstream (and counted when counters
+        // are present); otherwise the frame-level mode is returned unchanged.
+        private static ReferenceMode ReadBlockReferenceMode(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r)
+        {
+            if (cm.ReferenceMode != ReferenceMode.ReferenceModeSelect)
+            {
+                return cm.ReferenceMode;
+            }
+
+            int ctx = PredCommon.GetReferenceModeContext(ref cm, ref xd);
+            ReferenceMode blockMode = (ReferenceMode)r.Read(cm.Fc.Value.CompInterProb[ctx]);
+
+            if (!xd.Counts.IsNull)
+            {
+                ++xd.Counts.Value.CompInter[ctx][(int)blockMode];
+            }
+
+            return blockMode;  // SingleReference or CompoundReference
+        }
+
+        // Reads the reference frame(s) of the current block into 'refFrame'.
+        //  - Reference frame feature active for 'segmentId': the frame comes from the segment
+        //    data and the second entry is Constants.None.
+        //  - Compound reference: one bit selects the variable reference; the fixed reference
+        //    goes into the slot given by its sign bias.
+        //  - Single reference: up to two bits choose between last, golden and alt-ref, and
+        //    the second entry is Constants.None.
+        // Every bit read is counted in the backward-update counters when they are present.
+        private static void ReadRefFrames(
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            ref Reader r,
+            int segmentId,
+            ref Array2<sbyte> refFrame)
+        {
+            ref Vp9EntropyProbs fc = ref cm.Fc.Value;
+
+            if (cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlRefFrame) != 0)
+            {
+                refFrame[0] = (sbyte)cm.Seg.GetSegData(segmentId, SegLvlFeatures.SegLvlRefFrame);
+                refFrame[1] = Constants.None;
+                return;
+            }
+
+            ReferenceMode mode = ReadBlockReferenceMode(ref cm, ref xd, ref r);
+
+            switch (mode)
+            {
+                case ReferenceMode.CompoundReference:
+                    {
+                        int fixedSlot = cm.RefFrameSignBias[cm.CompFixedRef];
+                        int ctx = PredCommon.GetPredContextCompRefP(ref cm, ref xd);
+                        int bit = r.Read(fc.CompRefProb[ctx]);
+                        if (!xd.Counts.IsNull)
+                        {
+                            ++xd.Counts.Value.CompRef[ctx][bit];
+                        }
+
+                        refFrame[fixedSlot] = cm.CompFixedRef;
+                        refFrame[fixedSlot == 0 ? 1 : 0] = cm.CompVarRef[bit];
+                        break;
+                    }
+                case ReferenceMode.SingleReference:
+                    {
+                        int ctx0 = PredCommon.GetPredContextSingleRefP1(ref xd);
+                        int bit0 = r.Read(fc.SingleRefProb[ctx0][0]);
+                        if (!xd.Counts.IsNull)
+                        {
+                            ++xd.Counts.Value.SingleRef[ctx0][0][bit0];
+                        }
+
+                        if (bit0 == 0)
+                        {
+                            refFrame[0] = Constants.LastFrame;
+                        }
+                        else
+                        {
+                            int ctx1 = PredCommon.GetPredContextSingleRefP2(ref xd);
+                            int bit1 = r.Read(fc.SingleRefProb[ctx1][1]);
+                            if (!xd.Counts.IsNull)
+                            {
+                                ++xd.Counts.Value.SingleRef[ctx1][1][bit1];
+                            }
+
+                            refFrame[0] = (sbyte)(bit1 != 0 ? Constants.AltRefFrame : Constants.GoldenFrame);
+                        }
+
+                        refFrame[1] = Constants.None;
+                        break;
+                    }
+                default:
+                    Debug.Assert(false, "Invalid prediction mode.");
+                    break;
+            }
+        }
+
+        // Reads the interpolation filter type of the current block from the switchable
+        // interpolation tree and counts it in the backward-update counters when present.
+        private static byte ReadSwitchableInterpFilter(ref Vp9Common cm, ref MacroBlockD xd, ref Reader r)
+        {
+            int ctx = xd.GetPredContextSwitchableInterp();
+            ReadOnlySpan<byte> probs = cm.Fc.Value.SwitchableInterpProb[ctx].ToSpan();
+            byte filterType = (byte)r.ReadTree(Luts.Vp9SwitchableInterpTree, probs);
+
+            if (xd.Counts.IsNull)
+            {
+                return filterType;
+            }
+
+            ++xd.Counts.Value.SwitchableInterp[ctx][filterType];
+            return filterType;
+        }
+
+        // Reads the intra prediction modes of an intra block inside an inter frame.
+        // Blocks smaller than 8x8 carry one luma mode per sub-block (four for 4x4, two for
+        // 4x8 and 8x4); the block mode is that of the last sub-block. Larger blocks carry a
+        // single luma mode read with the size group of the block. The chroma mode is read
+        // afterwards and the block is marked as intra-coded.
+        private static void ReadIntraBlockModeInfo(ref Vp9Common cm, ref MacroBlockD xd, ref ModeInfo mi, ref Reader r)
+        {
+            BlockSize bsize = mi.SbType;
+
+            if (bsize == BlockSize.Block4x4)
+            {
+                for (int sub = 0; sub < 4; ++sub)
+                {
+                    mi.Bmi[sub].Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0);
+                }
+
+                mi.Mode = mi.Bmi[3].Mode;
+            }
+            else if (bsize == BlockSize.Block4x8)
+            {
+                // Left column (sub-blocks 0 and 2), then right column (1 and 3).
+                mi.Bmi[0].Mode = mi.Bmi[2].Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0);
+                mi.Bmi[1].Mode = mi.Bmi[3].Mode = mi.Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0);
+            }
+            else if (bsize == BlockSize.Block8x4)
+            {
+                // Top row (sub-blocks 0 and 1), then bottom row (2 and 3).
+                mi.Bmi[0].Mode = mi.Bmi[1].Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0);
+                mi.Bmi[2].Mode = mi.Bmi[3].Mode = mi.Mode = ReadIntraModeY(ref cm, ref xd, ref r, 0);
+            }
+            else
+            {
+                mi.Mode = ReadIntraModeY(ref cm, ref xd, ref r, Luts.SizeGroupLookup[(int)bsize]);
+            }
+
+            mi.UvMode = ReadIntraModeUv(ref cm, ref xd, ref r, (byte)mi.Mode);
+
+            // Initialize interp_filter here so we do not have to check for inter block
+            // modes in GetPredContextSwitchableInterp()
+            mi.InterpFilter = Constants.SwitchableFilters;
+
+            mi.RefFrame[0] = Constants.IntraFrame;
+            mi.RefFrame[1] = Constants.None;
+        }
+
+        // Returns true when both components of 'mv' lie strictly between Constants.MvLow and
+        // Constants.MvUpp.
+        private static bool IsMvValid(ref Mv mv)
+        {
+            bool rowInRange = mv.Row > Constants.MvLow && mv.Row < Constants.MvUpp;
+            bool colInRange = mv.Col > Constants.MvLow && mv.Col < Constants.MvUpp;
+
+            return rowInRange && colInRange;
+        }
+
+        // Copies both motion vectors of 'src' into 'dst'.
+        private static void CopyMvPair(ref Array2<Mv> dst, ref Array2<Mv> src)
+        {
+            for (int i = 0; i < 2; i++)
+            {
+                dst[i] = src[i];
+            }
+        }
+
+        // Resets both motion vectors of 'dst' to a default-constructed Mv.
+        private static void ZeroMvPair(ref Array2<Mv> dst)
+        {
+            for (int i = 0; i < 2; i++)
+            {
+                dst[i] = new Mv();
+            }
+        }
+
+        // Fills 'mv' according to the inter prediction 'mode':
+        //  - NewMv: reads one vector (two when 'isCompound' is 1) relative to 'refMv';
+        //  - NearMv / NearestMv: copies 'nearNearestMv';
+        //  - ZeroMv: zeroes both vectors.
+        // Returns false for any other mode, or when a vector read for NewMv fails IsMvValid;
+        // returns true otherwise.
+        private static bool AssignMv(
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            PredictionMode mode,
+            ref Array2<Mv> mv,
+            ref Array2<Mv> refMv,
+            ref Array2<Mv> nearNearestMv,
+            int isCompound,
+            bool allowHP,
+            ref Reader r)
+        {
+            switch (mode)
+            {
+                case PredictionMode.NewMv:
+                    {
+                        bool allValid = true;
+                        int vectorCount = 1 + isCompound;
+
+                        for (int index = 0; index < vectorCount; ++index)
+                        {
+                            ReadMv(ref r, ref mv[index], ref refMv[index], ref cm.Fc.Value, xd.Counts, allowHP);
+                            allValid = allValid && IsMvValid(ref mv[index]);
+                        }
+
+                        return allValid;
+                    }
+                case PredictionMode.NearMv:
+                case PredictionMode.NearestMv:
+                    {
+                        CopyMvPair(ref mv, ref nearNearestMv);
+                        return true;
+                    }
+                case PredictionMode.ZeroMv:
+                    {
+                        ZeroMvPair(ref mv);
+                        return true;
+                    }
+                default:
+                    return false;
+            }
+        }
+
+        /// <summary>
+        /// Determines whether the current block is inter predicted.
+        /// </summary>
+        /// <param name="cm">Frame-level decoder state</param>
+        /// <param name="xd">Block-level decoder state</param>
+        /// <param name="segmentId">Segment the block belongs to</param>
+        /// <param name="r">Bitstream reader</param>
+        /// <returns>True if the block is an inter block</returns>
+        private static bool ReadIsInterBlock(ref Vp9Common cm, ref MacroBlockD xd, int segmentId, ref Reader r)
+        {
+            bool refFrameFromSegment = cm.Seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlRefFrame) != 0;
+
+            if (refFrameFromSegment)
+            {
+                // The segment dictates the reference frame, nothing is read from the bitstream.
+                return cm.Seg.GetSegData(segmentId, SegLvlFeatures.SegLvlRefFrame) != Constants.IntraFrame;
+            }
+
+            int context = xd.GetIntraInterContext();
+            bool isInter = r.Read(cm.Fc.Value.IntraInterProb[context]) != 0;
+
+            if (!xd.Counts.IsNull)
+            {
+                int bucket = isInter ? 1 : 0;
+                ++xd.Counts.Value.IntraInter[context][bucket];
+            }
+
+            return isInter;
+        }
+
+        /// <summary>
+        /// Lowers the precision of the first <paramref name="refmvCount"/> candidates in place and
+        /// leaves the last processed candidate in <paramref name="bestMv"/>.
+        /// </summary>
+        /// <param name="allowHP">Forwarded to <c>LowerMvPrecision</c></param>
+        /// <param name="mvlist">Candidate list, modified in place</param>
+        /// <param name="bestMv">Receives the last processed candidate; untouched when the count is zero</param>
+        /// <param name="refmvCount">Number of candidates to process</param>
+        private static void DecFindBestRefMvs(bool allowHP, Span<Mv> mvlist, ref Mv bestMv, int refmvCount)
+        {
+            for (int index = 0; index < refmvCount; index++)
+            {
+                ref Mv candidate = ref mvlist[index];
+
+                candidate.LowerMvPrecision(allowHP);
+                bestMv = candidate;
+            }
+        }
+
+        /// <summary>
+        /// Appends a candidate to the reference list and tells the caller whether the search is done.
+        /// </summary>
+        /// <param name="mv">Candidate motion vector</param>
+        /// <param name="refMvCount">Number of candidates already stored, updated on insertion</param>
+        /// <param name="mvRefList">Candidate list</param>
+        /// <param name="earlyBreak">Whether the search should stop after the first candidate</param>
+        /// <returns>
+        /// True if the caller should stop searching: either a second, different candidate was stored,
+        /// or the first candidate was stored and <paramref name="earlyBreak"/> is set
+        /// </returns>
+        private static bool AddMvRefListEb(Mv mv, ref int refMvCount, Span<Mv> mvRefList, bool earlyBreak)
+        {
+            if (refMvCount == 0)
+            {
+                // First candidate is always accepted.
+                mvRefList[refMvCount++] = mv;
+
+                return earlyBreak;
+            }
+
+            // Compare both components at once through their raw 32-bit representation.
+            bool sameAsFirst = Unsafe.As<Mv, int>(ref mv) == Unsafe.As<Mv, int>(ref mvRefList[0]);
+
+            if (sameAsFirst)
+            {
+                return false;
+            }
+
+            mvRefList[refMvCount] = mv;
+            refMvCount++;
+
+            return true;
+        }
+
+        /// <summary>
+        /// Returns a copy of one of the block's motion vectors, with both components negated when the
+        /// sign bias of the vector's reference frame differs from the sign bias of <paramref name="thisRefFrame"/>.
+        /// </summary>
+        /// <param name="mi">Mode info holding the vector</param>
+        /// <param name="refr">Index (0 or 1) of the vector/reference to use</param>
+        /// <param name="thisRefFrame">Reference frame the result is meant for</param>
+        /// <param name="refSignBias">Sign bias table indexed by reference frame</param>
+        /// <returns>The possibly inverted motion vector</returns>
+        private static Mv ScaleMv(ref ModeInfo mi, int refr, sbyte thisRefFrame, ref Array4<sbyte> refSignBias)
+        {
+            Mv result = mi.Mv[refr];
+            sbyte candidateBias = refSignBias[mi.RefFrame[refr]];
+            sbyte targetBias = refSignBias[thisRefFrame];
+
+            if (candidateBias != targetBias)
+            {
+                result.Row *= -1;
+                result.Col *= -1;
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Adds the motion vectors of an inter block that point to a reference frame other than
+        /// <paramref name="refFrame"/>, sign-adjusted through <see cref="ScaleMv"/>.
+        /// </summary>
+        /// <param name="mbmi">Candidate block</param>
+        /// <param name="refFrame">Reference frame being searched for</param>
+        /// <param name="refSignBias">Sign bias table indexed by reference frame</param>
+        /// <param name="refmvCount">Number of candidates already stored, updated on insertion</param>
+        /// <param name="mvRefList">Candidate list</param>
+        /// <param name="earlyBreak">Whether the search should stop after the first candidate</param>
+        /// <returns>True if the caller should stop searching</returns>
+        private static bool IsDiffRefFrameAddMvEb(
+            ref ModeInfo mbmi,
+            sbyte refFrame,
+            ref Array4<sbyte> refSignBias,
+            ref int refmvCount,
+            Span<Mv> mvRefList,
+            bool earlyBreak)
+        {
+            // Intra blocks carry no usable motion vector.
+            if (!mbmi.IsInterBlock())
+            {
+                return false;
+            }
+
+            if (mbmi.RefFrame[0] != refFrame &&
+                AddMvRefListEb(ScaleMv(ref mbmi, 0, refFrame, ref refSignBias), ref refmvCount, mvRefList, earlyBreak))
+            {
+                return true;
+            }
+
+            // The second vector is only considered when it differs from the first one.
+            bool secondIsCandidate =
+                mbmi.HasSecondRef() &&
+                mbmi.RefFrame[1] != refFrame &&
+                Unsafe.As<Mv, int>(ref mbmi.Mv[1]) != Unsafe.As<Mv, int>(ref mbmi.Mv[0]);
+
+            return secondIsCandidate &&
+                   AddMvRefListEb(ScaleMv(ref mbmi, 1, refFrame, ref refSignBias), ref refmvCount, mvRefList, earlyBreak);
+        }
+
+        // This function searches the neighborhood of a given MB/SB
+        // to try and find candidate reference vectors.
+        //
+        // The search runs in phases and jumps to Done as soon as AddMvRefListEb
+        // (or IsDiffRefFrameAddMvEb) reports that enough candidates were collected:
+        //   1. (sub 8x8 only) sub-block vectors of the first two neighbors,
+        //   2. vectors of the remaining neighbors that use refFrame,
+        //   3. the co-located previous frame vectors that use refFrame,
+        //   4. vectors of the neighbors that use a different reference frame,
+        //   5. previous frame vectors that use a different reference frame.
+        //
+        // mvRefSearch holds the neighbor offsets, mvRefList receives the candidates
+        // (its first Constants.MaxMvRefCandidates entries are cleared first), block is
+        // only used for the sub-block lookup when isSub8X8 is non-zero.
+        //
+        // Returns the number of entries of mvRefList the caller should use. All of
+        // them have been clamped with ClampMvRef.
+        private static unsafe int DecFindMvRefs(
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            PredictionMode mode,
+            sbyte refFrame,
+            Span<Position> mvRefSearch,
+            Span<Mv> mvRefList,
+            int miRow,
+            int miCol,
+            int block,
+            int isSub8X8)
+        {
+            ref Array4<sbyte> refSignBias = ref cm.RefFrameSignBias;
+            int i, refmvCount = 0;
+            bool differentRefFound = false;
+            // Co-located entry of the previous frame, or null when previous frame mvs are not used.
+            Ptr<MvRef> prevFrameMvs = cm.UsePrevFrameMvs ? new Ptr<MvRef>(ref cm.PrevFrameMvs[miRow * cm.MiCols + miCol]) : Ptr<MvRef>.Null;
+            ref TileInfo tile = ref xd.Tile;
+            // If mode is nearestmv or newmv (uses nearestmv as a reference) then stop
+            // searching after the first mv is found.
+            bool earlyBreak = mode != PredictionMode.NearMv;
+
+            // Blank the reference vector list
+            mvRefList.Slice(0, Constants.MaxMvRefCandidates).Fill(new Mv());
+
+            i = 0;
+            if (isSub8X8 != 0)
+            {
+                // If the size < 8x8 we get the mv from the bmi substructure for the
+                // nearest two blocks.
+                for (i = 0; i < 2; ++i)
+                {
+                    ref Position mvRef = ref mvRefSearch[i];
+                    if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef))
+                    {
+                        ref ModeInfo candidateMi = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value;
+                        differentRefFound = true;
+
+                        if (candidateMi.RefFrame[0] == refFrame)
+                        {
+                            if (AddMvRefListEb(candidateMi.GetSubBlockMv(0, mvRef.Col, block), ref refmvCount, mvRefList, earlyBreak))
+                            {
+                                goto Done;
+                            }
+                        }
+                        else if (candidateMi.RefFrame[1] == refFrame)
+                        {
+                            if (AddMvRefListEb(candidateMi.GetSubBlockMv(1, mvRef.Col, block), ref refmvCount, mvRefList, earlyBreak))
+                            {
+                                goto Done;
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Check the rest of the neighbors in much the same way
+            // as before except we don't need to keep track of sub blocks or
+            // mode counts.
+            // Note: i continues from 2 here when the sub 8x8 loop above ran.
+            for (; i < MvrefNeighbours; ++i)
+            {
+                ref Position mvRef = ref mvRefSearch[i];
+                if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef))
+                {
+                    ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value;
+                    differentRefFound = true;
+
+                    if (candidate.RefFrame[0] == refFrame)
+                    {
+                        if (AddMvRefListEb(candidate.Mv[0], ref refmvCount, mvRefList, earlyBreak))
+                        {
+                            goto Done;
+                        }
+                    }
+                    else if (candidate.RefFrame[1] == refFrame)
+                    {
+                        if (AddMvRefListEb(candidate.Mv[1], ref refmvCount, mvRefList, earlyBreak))
+                        {
+                            goto Done;
+                        }
+                    }
+                }
+            }
+
+            // Check the last frame's mode and mv info.
+            if (!prevFrameMvs.IsNull)
+            {
+                if (prevFrameMvs.Value.RefFrame[0] == refFrame)
+                {
+                    if (AddMvRefListEb(prevFrameMvs.Value.Mv[0], ref refmvCount, mvRefList, earlyBreak))
+                    {
+                        goto Done;
+                    }
+                }
+                else if (prevFrameMvs.Value.RefFrame[1] == refFrame)
+                {
+                    if (AddMvRefListEb(prevFrameMvs.Value.Mv[1], ref refmvCount, mvRefList, earlyBreak))
+                    {
+                        goto Done;
+                    }
+                }
+            }
+
+            // Since we couldn't find 2 mvs from the same reference frame
+            // go back through the neighbors and find motion vectors from
+            // different reference frames.
+            // differentRefFound is set as soon as any neighbor was inside the tile.
+            if (differentRefFound)
+            {
+                for (i = 0; i < MvrefNeighbours; ++i)
+                {
+                    ref Position mvRef = ref mvRefSearch[i];
+                    if (tile.IsInside(miCol, miRow, cm.MiRows, ref mvRef))
+                    {
+                        ref ModeInfo candidate = ref xd.Mi[mvRef.Col + mvRef.Row * xd.MiStride].Value;
+
+                        // If the candidate is Intra we don't want to consider its mv.
+                        if (IsDiffRefFrameAddMvEb(ref candidate, refFrame, ref refSignBias, ref refmvCount, mvRefList, earlyBreak))
+                        {
+                            goto Done;
+                        }
+                    }
+                }
+            }
+
+            // Since we still don't have a candidate we'll try the last frame.
+            if (!prevFrameMvs.IsNull)
+            {
+                if (prevFrameMvs.Value.RefFrame[0] != refFrame && prevFrameMvs.Value.RefFrame[0] > Constants.IntraFrame)
+                {
+                    Mv mv = prevFrameMvs.Value.Mv[0];
+                    // Invert the vector when the sign bias of its reference differs from refFrame's.
+                    if (refSignBias[prevFrameMvs.Value.RefFrame[0]] != refSignBias[refFrame])
+                    {
+                        mv.Row *= -1;
+                        mv.Col *= -1;
+                    }
+                    if (AddMvRefListEb(mv, ref refmvCount, mvRefList, earlyBreak))
+                    {
+                        goto Done;
+                    }
+                }
+
+                // The second vector is skipped when it is identical to the first one.
+                if (prevFrameMvs.Value.RefFrame[1] > Constants.IntraFrame &&
+                    prevFrameMvs.Value.RefFrame[1] != refFrame &&
+                    Unsafe.As<Mv, int>(ref prevFrameMvs.Value.Mv[1]) != Unsafe.As<Mv, int>(ref prevFrameMvs.Value.Mv[0]))
+                {
+                    Mv mv = prevFrameMvs.Value.Mv[1];
+                    if (refSignBias[prevFrameMvs.Value.RefFrame[1]] != refSignBias[refFrame])
+                    {
+                        mv.Row *= -1;
+                        mv.Col *= -1;
+                    }
+                    if (AddMvRefListEb(mv, ref refmvCount, mvRefList, earlyBreak))
+                    {
+                        goto Done;
+                    }
+                }
+            }
+
+            // Every phase was exhausted without an early exit: the count is forced to a fixed
+            // value, so entries that were never filled keep the zero vector written above.
+            if (mode == PredictionMode.NearMv)
+            {
+                refmvCount = Constants.MaxMvRefCandidates;
+            }
+            else
+            {
+                // We only care about the nearestmv for the remaining modes
+                refmvCount = 1;
+            }
+
+        Done:
+            // Clamp vectors
+            for (i = 0; i < refmvCount; ++i)
+            {
+                mvRefList[i].ClampMvRef(ref xd);
+            }
+
+            return refmvCount;
+        }
+
+        // Selects the near/nearest reference vector for one 4x4 sub-block of a sub 8x8 block.
+        //
+        // block is the sub-block index (0 to 3), refr the reference index (0 or 1) and bMode
+        // the prediction mode that was read for the sub-block. The result is written to
+        // bestSub8x8. Sub-block 0 takes its vector from the candidate search; the other
+        // sub-blocks reuse vectors already decoded for earlier sub-blocks of the same block
+        // (mi.Bmi), falling back to the search candidates.
+        private static void AppendSub8x8MvsForIdx(
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            Span<Position> mvRefSearch,
+            PredictionMode bMode,
+            int block,
+            int refr,
+            int miRow,
+            int miCol,
+            ref Mv bestSub8x8)
+        {
+            Span<Mv> mvList = stackalloc Mv[Constants.MaxMvRefCandidates];
+            ref ModeInfo mi = ref xd.Mi[0].Value;
+            ref Array4<BModeInfo> bmi = ref mi.Bmi;
+            int n;
+            int refmvCount;
+
+            // The block == 3 case below reads mvList[0] and mvList[1] directly.
+            Debug.Assert(Constants.MaxMvRefCandidates == 2);
+
+            refmvCount = DecFindMvRefs(ref cm, ref xd, bMode, mi.RefFrame[refr], mvRefSearch, mvList, miRow, miCol, block, 1);
+
+            switch (block)
+            {
+                // First sub-block: use the last candidate returned by the search.
+                case 0: bestSub8x8 = mvList[refmvCount - 1]; break;
+                case 1:
+                case 2:
+                    if (bMode == PredictionMode.NearestMv)
+                    {
+                        bestSub8x8 = bmi[0].Mv[refr];
+                    }
+                    else
+                    {
+                        // Pick the first search candidate that differs from sub-block 0's vector,
+                        // or the zero vector when there is none.
+                        bestSub8x8 = new Mv();
+                        for (n = 0; n < refmvCount; ++n)
+                        {
+                            if (Unsafe.As<Mv, int>(ref bmi[0].Mv[refr]) != Unsafe.As<Mv, int>(ref mvList[n]))
+                            {
+                                bestSub8x8 = mvList[n];
+                                break;
+                            }
+                        }
+                    }
+                    break;
+                case 3:
+                    if (bMode == PredictionMode.NearestMv)
+                    {
+                        bestSub8x8 = bmi[2].Mv[refr];
+                    }
+                    else
+                    {
+                        // Candidates in priority order: sub-blocks 1 and 0, then the search results.
+                        // Pick the first one that differs from sub-block 2's vector, or zero.
+                        Span<Mv> candidates = stackalloc Mv[2 + Constants.MaxMvRefCandidates];
+                        candidates[0] = bmi[1].Mv[refr];
+                        candidates[1] = bmi[0].Mv[refr];
+                        candidates[2] = mvList[0];
+                        candidates[3] = mvList[1];
+                        bestSub8x8 = new Mv();
+                        for (n = 0; n < 2 + Constants.MaxMvRefCandidates; ++n)
+                        {
+                            if (Unsafe.As<Mv, int>(ref bmi[2].Mv[refr]) != Unsafe.As<Mv, int>(ref candidates[n]))
+                            {
+                                bestSub8x8 = candidates[n];
+                                break;
+                            }
+                        }
+                    }
+                    break;
+                default: Debug.Assert(false, "Invalid block index."); break;
+            }
+        }
+
+        /// <summary>
+        /// Derives the inter mode context from the prediction modes of the first two neighbor
+        /// positions of <paramref name="mvRefSearch"/> that lie inside the current tile.
+        /// </summary>
+        /// <param name="cm">Frame-level decoder state</param>
+        /// <param name="xd">Block-level decoder state</param>
+        /// <param name="mvRefSearch">Neighbor offsets for the current block size</param>
+        /// <param name="miRow">Row of the block, in mode info units</param>
+        /// <param name="miCol">Column of the block, in mode info units</param>
+        /// <returns>Context looked up in <c>Luts.CounterToContext</c></returns>
+        private static byte GetModeContext(ref Vp9Common cm, ref MacroBlockD xd, Span<Position> mvRefSearch, int miRow, int miCol)
+        {
+            ref TileInfo tile = ref xd.Tile;
+            int counter = 0;
+
+            for (int n = 0; n < 2; n++)
+            {
+                ref Position offset = ref mvRefSearch[n];
+
+                if (!tile.IsInside(miCol, miRow, cm.MiRows, ref offset))
+                {
+                    continue;
+                }
+
+                // Accumulate the per-mode weight of the neighbor.
+                ref ModeInfo neighbor = ref xd.Mi[offset.Col + offset.Row * xd.MiStride].Value;
+                counter += Luts.Mode2Counter[(int)neighbor.Mode];
+            }
+
+            return (byte)Luts.CounterToContext[counter];
+        }
+
+        // Reads the mode info of an inter predicted block: reference frames, prediction mode,
+        // interpolation filter and motion vectors. For blocks smaller than 8x8 a mode and a
+        // vector pair are read per sub-block and stored in mi.Bmi.
+        //
+        // Sets xd.Corrupted when AssignMv reports a failure.
+        private static void ReadInterBlockModeInfo(
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            ref ModeInfo mi,
+            int miRow,
+            int miCol,
+            ref Reader r)
+        {
+            BlockSize bsize = mi.SbType;
+            bool allowHP = cm.AllowHighPrecisionMv;
+            Array2<Mv> bestRefMvs = new Array2<Mv>();
+            int refr, isCompound;
+            byte interModeCtx;
+            // Neighbor offsets to scan for this block size.
+            Span<Position> mvRefSearch = Luts.MvRefBlocks[(int)bsize];
+
+            ReadRefFrames(ref cm, ref xd, ref r, mi.SegmentId, ref mi.RefFrame);
+            isCompound = mi.HasSecondRef() ? 1 : 0;
+            interModeCtx = GetModeContext(ref cm, ref xd, mvRefSearch, miRow, miCol);
+
+            if (cm.Seg.IsSegFeatureActive(mi.SegmentId, SegLvlFeatures.SegLvlSkip) != 0)
+            {
+                // The skip feature forces ZeroMv; no mode is read from the bitstream.
+                mi.Mode = PredictionMode.ZeroMv;
+                if (bsize < BlockSize.Block8x8)
+                {
+                    // NOTE(review): InternalError presumably throws InternalErrorException - confirm.
+                    xd.ErrorInfo.Value.InternalError(CodecErr.CodecUnsupBitstream, "Invalid usage of segement feature on small blocks");
+                    return;
+                }
+            }
+            else
+            {
+                if (bsize >= BlockSize.Block8x8)
+                {
+                    mi.Mode = ReadInterMode(ref cm, ref xd, ref r, interModeCtx);
+                }
+                else
+                {
+                    // Sub 8x8 blocks use the nearestmv as a ref_mv if the bMode is NewMv.
+                    // Setting mode to NearestMv forces the search to stop after the nearestmv
+                    // has been found. After bModes have been read, mode will be overwritten
+                    // by the last bMode.
+                    mi.Mode = PredictionMode.NearestMv;
+                }
+
+                if (mi.Mode != PredictionMode.ZeroMv)
+                {
+                    // Find the best reference vector for each reference frame in use.
+                    for (refr = 0; refr < 1 + isCompound; ++refr)
+                    {
+                        Span<Mv> tmpMvs = stackalloc Mv[Constants.MaxMvRefCandidates];
+                        sbyte frame = mi.RefFrame[refr];
+                        int refmvCount;
+
+                        refmvCount = DecFindMvRefs(ref cm, ref xd, mi.Mode, frame, mvRefSearch, tmpMvs, miRow, miCol, -1, 0);
+
+                        DecFindBestRefMvs(allowHP, tmpMvs, ref bestRefMvs[refr], refmvCount);
+                    }
+                }
+            }
+
+            // The filter is only read from the bitstream when the frame leaves it switchable.
+            mi.InterpFilter = (cm.InterpFilter == Constants.Switchable) ? ReadSwitchableInterpFilter(ref cm, ref xd, ref r) : cm.InterpFilter;
+
+            if (bsize < BlockSize.Block8x8)
+            {
+                int num4X4W = 1 << xd.BmodeBlocksWl;
+                int num4X4H = 1 << xd.BmodeBlocksHl;
+                int idx, idy;
+                PredictionMode bMode = 0;
+                Array2<Mv> bestSub8x8 = new Array2<Mv>();
+                const uint invalidMv = 0x80008000;
+                // Initialize the 2nd element even though it won't be used meaningfully
+                // if isCompound is false.
+                Unsafe.As<Mv, uint>(ref bestSub8x8[1]) = invalidMv;
+                for (idy = 0; idy < 2; idy += num4X4H)
+                {
+                    for (idx = 0; idx < 2; idx += num4X4W)
+                    {
+                        // Index of the 4x4 sub-block inside the 8x8 block (raster order).
+                        int j = idy * 2 + idx;
+                        bMode = ReadInterMode(ref cm, ref xd, ref r, interModeCtx);
+
+                        if (bMode == PredictionMode.NearestMv || bMode == PredictionMode.NearMv)
+                        {
+                            for (refr = 0; refr < 1 + isCompound; ++refr)
+                            {
+                                AppendSub8x8MvsForIdx(ref cm, ref xd, mvRefSearch, bMode, j, refr, miRow, miCol, ref bestSub8x8[refr]);
+                            }
+                        }
+
+                        if (!AssignMv(ref cm, ref xd, bMode, ref mi.Bmi[j].Mv, ref bestRefMvs, ref bestSub8x8, isCompound, allowHP, ref r))
+                        {
+                            xd.Corrupted |= true;
+                            // Only leaves the inner (idx) loop; the outer loop keeps going.
+                            break;
+                        }
+
+                        // Replicate the result to the sub-blocks covered by a 4x8 / 8x4 partition.
+                        if (num4X4H == 2)
+                        {
+                            mi.Bmi[j + 2] = mi.Bmi[j];
+                        }
+
+                        if (num4X4W == 2)
+                        {
+                            mi.Bmi[j + 1] = mi.Bmi[j];
+                        }
+                    }
+                }
+
+                // The block level mode and vectors mirror those of the last sub-block.
+                mi.Mode = bMode;
+
+                CopyMvPair(ref mi.Mv, ref mi.Bmi[3].Mv);
+            }
+            else
+            {
+                xd.Corrupted |= !AssignMv(ref cm, ref xd, mi.Mode, ref mi.Mv, ref bestRefMvs, ref bestRefMvs, isCompound, allowHP, ref r);
+            }
+        }
+
+        /// <summary>
+        /// Reads the mode info of the current block on a frame that is not intra only:
+        /// segment id, skip flag, intra/inter decision, transform size and the mode specific data.
+        /// </summary>
+        /// <param name="cm">Frame-level decoder state</param>
+        /// <param name="xd">Block-level decoder state</param>
+        /// <param name="miRow">Row of the block, in mode info units</param>
+        /// <param name="miCol">Column of the block, in mode info units</param>
+        /// <param name="r">Bitstream reader</param>
+        /// <param name="xMis">Forwarded to <c>ReadInterSegmentId</c></param>
+        /// <param name="yMis">Forwarded to <c>ReadInterSegmentId</c></param>
+        private static void ReadInterFrameModeInfo(
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            int miRow,
+            int miCol,
+            ref Reader r,
+            int xMis,
+            int yMis)
+        {
+            ref ModeInfo mi = ref xd.Mi[0].Value;
+
+            mi.SegmentId = (sbyte)ReadInterSegmentId(ref cm, ref xd, miRow, miCol, ref r, xMis, yMis);
+            mi.Skip = (sbyte)ReadSkip(ref cm, ref xd, mi.SegmentId, ref r);
+
+            bool isInter = ReadIsInterBlock(ref cm, ref xd, mi.SegmentId, ref r);
+            bool allowSelect = mi.Skip == 0 || !isInter;
+
+            mi.TxSize = ReadTxSize(ref cm, ref xd, allowSelect, ref r);
+
+            if (!isInter)
+            {
+                ReadIntraBlockModeInfo(ref cm, ref xd, ref mi, ref r);
+                return;
+            }
+
+            ReadInterBlockModeInfo(ref cm, ref xd, ref mi, miRow, miCol, ref r);
+        }
+
+        /// <summary>
+        /// Gets the prediction mode of the 4x4 sub-block located to the left of sub-block <paramref name="b"/>.
+        /// </summary>
+        /// <param name="curMi">Mode info of the current block</param>
+        /// <param name="leftMi">Mode info of the left block, may be null</param>
+        /// <param name="b">Sub-block index (0 to 3)</param>
+        /// <returns>
+        /// The left mode; <see cref="PredictionMode.DcPred"/> when the left block is missing or inter predicted
+        /// </returns>
+        private static PredictionMode LeftBlockMode(Ptr<ModeInfo> curMi, Ptr<ModeInfo> leftMi, int b)
+        {
+            bool inLeftColumn = b == 0 || b == 2;
+
+            if (!inLeftColumn)
+            {
+                // The left neighbor is a sub-block of the current block.
+                Debug.Assert(b == 1 || b == 3);
+                return curMi.Value.Bmi[b - 1].Mode;
+            }
+
+            bool leftIsIntra = !leftMi.IsNull && !leftMi.Value.IsInterBlock();
+
+            return leftIsIntra ? leftMi.Value.GetYMode(b + 1) : PredictionMode.DcPred;
+        }
+
+        /// <summary>
+        /// Gets the prediction mode of the 4x4 sub-block located above sub-block <paramref name="b"/>.
+        /// </summary>
+        /// <param name="curMi">Mode info of the current block</param>
+        /// <param name="aboveMi">Mode info of the above block, may be null</param>
+        /// <param name="b">Sub-block index (0 to 3)</param>
+        /// <returns>
+        /// The above mode; <see cref="PredictionMode.DcPred"/> when the above block is missing or inter predicted
+        /// </returns>
+        private static PredictionMode AboveBlockMode(Ptr<ModeInfo> curMi, Ptr<ModeInfo> aboveMi, int b)
+        {
+            bool inTopRow = b == 0 || b == 1;
+
+            if (!inTopRow)
+            {
+                // The above neighbor is a sub-block of the current block.
+                Debug.Assert(b == 2 || b == 3);
+                return curMi.Value.Bmi[b - 2].Mode;
+            }
+
+            bool aboveIsIntra = !aboveMi.IsNull && !aboveMi.Value.IsInterBlock();
+
+            return aboveIsIntra ? aboveMi.Value.GetYMode(b + 2) : PredictionMode.DcPred;
+        }
+
+        /// <summary>
+        /// Selects the luma intra mode probabilities for a sub-block from the modes of its
+        /// above and left neighbors.
+        /// </summary>
+        /// <param name="fc">Entropy probabilities</param>
+        /// <param name="mi">Mode info of the current block</param>
+        /// <param name="aboveMi">Mode info of the above block, may be null</param>
+        /// <param name="leftMi">Mode info of the left block, may be null</param>
+        /// <param name="block">Sub-block index (0 to 3)</param>
+        /// <returns>Row of <c>KfYModeProb</c> indexed by the above and left modes</returns>
+        private static ReadOnlySpan<byte> GetYModeProbs(
+            ref Vp9EntropyProbs fc,
+            Ptr<ModeInfo> mi,
+            Ptr<ModeInfo> aboveMi,
+            Ptr<ModeInfo> leftMi,
+            int block)
+        {
+            int aboveIndex = (int)AboveBlockMode(mi, aboveMi, block);
+            int leftIndex = (int)LeftBlockMode(mi, leftMi, block);
+
+            return fc.KfYModeProb[aboveIndex][leftIndex].ToSpan();
+        }
+
+        /// <summary>
+        /// Reads the mode info of the current block on an intra only frame: segment id, skip flag,
+        /// transform size, luma mode(s) and chroma mode.
+        /// </summary>
+        /// <param name="cm">Frame-level decoder state</param>
+        /// <param name="xd">Block-level decoder state</param>
+        /// <param name="miRow">Row of the block, in mode info units</param>
+        /// <param name="miCol">Column of the block, in mode info units</param>
+        /// <param name="r">Bitstream reader</param>
+        /// <param name="xMis">Forwarded to <c>ReadIntraSegmentId</c></param>
+        /// <param name="yMis">Forwarded to <c>ReadIntraSegmentId</c></param>
+        private static void ReadIntraFrameModeInfo(
+            ref Vp9Common cm,
+            ref MacroBlockD xd,
+            int miRow,
+            int miCol,
+            ref Reader r,
+            int xMis,
+            int yMis)
+        {
+            Ptr<ModeInfo> mi = xd.Mi[0];
+            Ptr<ModeInfo> aboveMi = xd.AboveMi;
+            Ptr<ModeInfo> leftMi = xd.LeftMi;
+            ref ModeInfo cur = ref mi.Value;
+            BlockSize bsize = cur.SbType;
+            int miOffset = miRow * cm.MiCols + miCol;
+
+            cur.SegmentId = (sbyte)ReadIntraSegmentId(ref cm, miOffset, xMis, yMis, ref r);
+            cur.Skip = (sbyte)ReadSkip(ref cm, ref xd, cur.SegmentId, ref r);
+            cur.TxSize = ReadTxSize(ref cm, ref xd, true, ref r);
+            cur.RefFrame[0] = Constants.IntraFrame;
+            cur.RefFrame[1] = Constants.None;
+
+            if (bsize == BlockSize.Block4x4)
+            {
+                // One mode per 4x4 sub-block; later sub-blocks depend on the earlier ones.
+                for (int sub = 0; sub < 4; sub++)
+                {
+                    cur.Bmi[sub].Mode = ReadIntraMode(ref r, GetYModeProbs(ref cm.Fc.Value, mi, aboveMi, leftMi, sub));
+                }
+
+                cur.Mode = cur.Bmi[3].Mode;
+            }
+            else if (bsize == BlockSize.Block4x8)
+            {
+                // Two modes, each shared by a column of sub-blocks.
+                PredictionMode leftColumn = ReadIntraMode(ref r, GetYModeProbs(ref cm.Fc.Value, mi, aboveMi, leftMi, 0));
+                cur.Bmi[2].Mode = leftColumn;
+                cur.Bmi[0].Mode = leftColumn;
+
+                PredictionMode rightColumn = ReadIntraMode(ref r, GetYModeProbs(ref cm.Fc.Value, mi, aboveMi, leftMi, 1));
+                cur.Mode = rightColumn;
+                cur.Bmi[3].Mode = rightColumn;
+                cur.Bmi[1].Mode = rightColumn;
+            }
+            else if (bsize == BlockSize.Block8x4)
+            {
+                // Two modes, each shared by a row of sub-blocks.
+                PredictionMode topRow = ReadIntraMode(ref r, GetYModeProbs(ref cm.Fc.Value, mi, aboveMi, leftMi, 0));
+                cur.Bmi[1].Mode = topRow;
+                cur.Bmi[0].Mode = topRow;
+
+                PredictionMode bottomRow = ReadIntraMode(ref r, GetYModeProbs(ref cm.Fc.Value, mi, aboveMi, leftMi, 2));
+                cur.Mode = bottomRow;
+                cur.Bmi[3].Mode = bottomRow;
+                cur.Bmi[2].Mode = bottomRow;
+            }
+            else
+            {
+                cur.Mode = ReadIntraMode(ref r, GetYModeProbs(ref cm.Fc.Value, mi, aboveMi, leftMi, 0));
+            }
+
+            cur.UvMode = ReadIntraMode(ref r, cm.Fc.Value.KfUvModeProb[(int)cur.Mode].ToSpan());
+        }
+
+        /// <summary>
+        /// Copies both reference frame indices of <paramref name="src"/> into <paramref name="dst"/>.
+        /// </summary>
+        /// <param name="dst">Destination pair</param>
+        /// <param name="src">Source pair</param>
+        private static void CopyRefFramePair(ref Array2<sbyte> dst, ref Array2<sbyte> src)
+        {
+            for (int index = 0; index < 2; index++)
+            {
+                dst[index] = src[index];
+            }
+        }
+
+        /// <summary>
+        /// Reads the mode info of the block at (<paramref name="miRow"/>, <paramref name="miCol"/>) and,
+        /// on frames that are not intra only, stores its reference frames and motion vectors in the
+        /// current frame motion vector buffer for every mode info unit it covers.
+        /// </summary>
+        /// <param name="twd">Tile worker data holding the bit reader and the block state</param>
+        /// <param name="cm">Frame-level decoder state</param>
+        /// <param name="miRow">Row of the block, in mode info units</param>
+        /// <param name="miCol">Column of the block, in mode info units</param>
+        /// <param name="xMis">Number of columns of the motion vector buffer to fill</param>
+        /// <param name="yMis">Number of rows of the motion vector buffer to fill</param>
+        public static void ReadModeInfo(
+            ref TileWorkerData twd,
+            ref Vp9Common cm,
+            int miRow,
+            int miCol,
+            int xMis,
+            int yMis)
+        {
+            ref Reader r = ref twd.BitReader;
+            ref MacroBlockD xd = ref twd.Xd;
+            ref ModeInfo mi = ref xd.Mi[0].Value;
+            ArrayPtr<MvRef> rowMvs = cm.CurFrameMvs.Slice(miRow * cm.MiCols + miCol);
+
+            if (cm.FrameIsIntraOnly())
+            {
+                ReadIntraFrameModeInfo(ref cm, ref xd, miRow, miCol, ref r, xMis, yMis);
+                return;
+            }
+
+            ReadInterFrameModeInfo(ref cm, ref xd, miRow, miCol, ref r, xMis, yMis);
+
+            for (int row = 0; row < yMis; row++)
+            {
+                for (int col = 0; col < xMis; col++)
+                {
+                    ref MvRef entry = ref rowMvs[col];
+
+                    CopyRefFramePair(ref entry.RefFrame, ref mi.RefFrame);
+                    CopyMvPair(ref entry.Mv, ref mi.Mv);
+                }
+
+                // Advance to the same column of the next row.
+                rowMvs = rowMvs.Slice(cm.MiCols);
+            }
+        }
+    }
+}

+ 164 - 0
Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs

@@ -0,0 +1,164 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using Ryujinx.Graphics.Video;
+using System;
+using Vp9MvRef = Ryujinx.Graphics.Video.Vp9MvRef;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    /// <summary>
+    /// Software VP9 frame decoder.
+    /// </summary>
+    public class Decoder : IVp9Decoder
+    {
+        /// <summary>
+        /// Always false, decoding is done on the CPU.
+        /// </summary>
+        public bool IsHardwareAccelerated => false;
+
+        // Allocator for the per-frame working buffers, released in Dispose.
+        private readonly MemoryAllocator _allocator = new MemoryAllocator();
+
+        /// <summary>
+        /// Creates a surface that can be used as decode target or reference by this decoder.
+        /// </summary>
+        /// <param name="width">Surface width</param>
+        /// <param name="height">Surface height</param>
+        /// <returns>The new surface</returns>
+        public ISurface CreateSurface(int width, int height) => new Surface(width, height);
+
+        // Maps the interpolation filter value of the picture info to the decoder's filter constants.
+        private static readonly byte[] LiteralToFilter = new byte[]
+        {
+            Constants.EightTapSmooth,
+            Constants.EightTap,
+            Constants.EightTapSharp,
+            Constants.Bilinear
+        };
+
+        /// <summary>
+        /// Decodes a single VP9 frame.
+        /// </summary>
+        /// <param name="pictureInfo">Frame header information, entropy probabilities and reference surfaces</param>
+        /// <param name="output">Surface that receives the decoded frame</param>
+        /// <param name="bitstream">Compressed frame data</param>
+        /// <param name="mvsIn">Motion vectors of the previous frame</param>
+        /// <param name="mvsOut">Receives the motion vectors of the decoded frame</param>
+        /// <returns>True on success, false if the tile decoder raised an internal error</returns>
+        /// <exception cref="ArgumentException">
+        /// <paramref name="mvsIn"/> or <paramref name="mvsOut"/> is larger than the frame's motion vector buffer
+        /// </exception>
+        public unsafe bool Decode(
+            ref Vp9PictureInfo pictureInfo,
+            ISurface output,
+            ReadOnlySpan<byte> bitstream,
+            ReadOnlySpan<Vp9MvRef> mvsIn,
+            Span<Vp9MvRef> mvsOut)
+        {
+            Vp9Common cm = new Vp9Common();
+
+            cm.FrameType = pictureInfo.IsKeyFrame ? FrameType.KeyFrame : FrameType.InterFrame;
+            cm.IntraOnly = pictureInfo.IntraOnly;
+
+            cm.Width = output.Width;
+            cm.Height = output.Height;
+
+            cm.UsePrevFrameMvs = pictureInfo.UsePrevInFindMvRefs;
+
+            cm.RefFrameSignBias = pictureInfo.RefFrameSignBias;
+
+            cm.BaseQindex = pictureInfo.BaseQIndex;
+            cm.YDcDeltaQ = pictureInfo.YDcDeltaQ;
+            cm.UvAcDeltaQ = pictureInfo.UvAcDeltaQ;
+            cm.UvDcDeltaQ = pictureInfo.UvDcDeltaQ;
+
+            cm.Mb.Lossless = pictureInfo.Lossless;
+
+            cm.TxMode = (TxMode)pictureInfo.TransformMode;
+
+            cm.AllowHighPrecisionMv = pictureInfo.AllowHighPrecisionMv;
+
+            cm.InterpFilter = (byte)pictureInfo.InterpFilter;
+
+            if (cm.InterpFilter != Constants.Switchable)
+            {
+                cm.InterpFilter = LiteralToFilter[cm.InterpFilter];
+            }
+
+            cm.ReferenceMode = (ReferenceMode)pictureInfo.ReferenceMode;
+
+            cm.CompFixedRef = pictureInfo.CompFixedRef;
+            cm.CompVarRef = pictureInfo.CompVarRef;
+
+            cm.Log2TileCols = pictureInfo.Log2TileCols;
+            cm.Log2TileRows = pictureInfo.Log2TileRows;
+
+            cm.Seg.Enabled = pictureInfo.SegmentEnabled;
+            cm.Seg.UpdateMap = pictureInfo.SegmentMapUpdate;
+            cm.Seg.TemporalUpdate = pictureInfo.SegmentMapTemporalUpdate;
+            cm.Seg.AbsDelta = (byte)pictureInfo.SegmentAbsDelta;
+            cm.Seg.FeatureMask = pictureInfo.SegmentFeatureEnable;
+            cm.Seg.FeatureData = pictureInfo.SegmentFeatureData;
+
+            cm.Lf.ModeRefDeltaEnabled = pictureInfo.ModeRefDeltaEnabled;
+            cm.Lf.RefDeltas = pictureInfo.RefDeltas;
+            cm.Lf.ModeDeltas = pictureInfo.ModeDeltas;
+
+            cm.Fc = new Ptr<Vp9EntropyProbs>(ref pictureInfo.Entropy);
+            cm.Counts = new Ptr<Vp9BackwardUpdates>(ref pictureInfo.BackwardUpdateCounts);
+
+            cm.FrameRefs[0].Buf = (Surface)pictureInfo.LastReference;
+            cm.FrameRefs[1].Buf = (Surface)pictureInfo.GoldenReference;
+            cm.FrameRefs[2].Buf = (Surface)pictureInfo.AltReference;
+            cm.Mb.CurBuf = (Surface)output;
+
+            cm.Mb.SetupBlockPlanes(1, 1);
+
+            cm.AllocTileWorkerData(_allocator, 1 << pictureInfo.Log2TileCols, 1 << pictureInfo.Log2TileRows);
+            cm.AllocContextBuffers(_allocator, output.Width, output.Height);
+
+            // From here on the working buffers are owned by this call and must be handed back to
+            // the allocator on every exit path, including decode errors and size mismatches.
+            try
+            {
+                cm.InitContextBuffers();
+                cm.SetupSegmentationDequant();
+                cm.SetupScaleFactors();
+
+                SetMvs(ref cm, mvsIn);
+
+                fixed (byte* dataPtr = bitstream)
+                {
+                    try
+                    {
+                        DecodeFrame.DecodeTiles(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length));
+                    }
+                    catch (InternalErrorException)
+                    {
+                        // The output motion vectors are left untouched on failure.
+                        return false;
+                    }
+                }
+
+                GetMvs(ref cm, mvsOut);
+
+                return true;
+            }
+            finally
+            {
+                cm.FreeTileWorkerData(_allocator);
+                cm.FreeContextBuffers(_allocator);
+            }
+        }
+
+        /// <summary>
+        /// Copies the previous frame motion vectors supplied by the caller into the decoder state.
+        /// </summary>
+        /// <param name="cm">Frame-level decoder state</param>
+        /// <param name="mvs">Motion vectors to copy</param>
+        /// <exception cref="ArgumentException"><paramref name="mvs"/> is larger than the decoder buffer</exception>
+        private static void SetMvs(ref Vp9Common cm, ReadOnlySpan<Vp9MvRef> mvs)
+        {
+            if (mvs.Length > cm.PrevFrameMvs.Length)
+            {
+                throw new ArgumentException($"Size mismatch, expected: {cm.PrevFrameMvs.Length}, but got: {mvs.Length}.");
+            }
+
+            for (int i = 0; i < mvs.Length; i++)
+            {
+                ref var mv = ref cm.PrevFrameMvs[i];
+
+                mv.Mv[0].Row = mvs[i].Mvs[0].Row;
+                mv.Mv[0].Col = mvs[i].Mvs[0].Col;
+                mv.Mv[1].Row = mvs[i].Mvs[1].Row;
+                mv.Mv[1].Col = mvs[i].Mvs[1].Col;
+
+                mv.RefFrame[0] = (sbyte)mvs[i].RefFrames[0];
+                mv.RefFrame[1] = (sbyte)mvs[i].RefFrames[1];
+            }
+        }
+
+        /// <summary>
+        /// Copies the motion vectors of the decoded frame out of the decoder state.
+        /// </summary>
+        /// <param name="cm">Frame-level decoder state</param>
+        /// <param name="mvs">Destination for the motion vectors</param>
+        /// <exception cref="ArgumentException"><paramref name="mvs"/> is larger than the decoder buffer</exception>
+        private static void GetMvs(ref Vp9Common cm, Span<Vp9MvRef> mvs)
+        {
+            if (mvs.Length > cm.CurFrameMvs.Length)
+            {
+                throw new ArgumentException($"Size mismatch, expected: {cm.CurFrameMvs.Length}, but got: {mvs.Length}.");
+            }
+
+            for (int i = 0; i < mvs.Length; i++)
+            {
+                ref var mv = ref cm.CurFrameMvs[i];
+
+                mvs[i].Mvs[0].Row = mv.Mv[0].Row;
+                mvs[i].Mvs[0].Col = mv.Mv[0].Col;
+                mvs[i].Mvs[1].Row = mv.Mv[1].Row;
+                mvs[i].Mvs[1].Col = mv.Mv[1].Col;
+
+                mvs[i].RefFrames[0] = mv.RefFrame[0];
+                mvs[i].RefFrames[1] = mv.RefFrame[1];
+            }
+        }
+
+        /// <summary>
+        /// Releases the memory held by the decoder's allocator.
+        /// </summary>
+        public void Dispose() => _allocator.Dispose();
+    }
+}

+ 325 - 0
Ryujinx.Graphics.Nvdec.Vp9/Detokenize.cs

@@ -0,0 +1,325 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using Ryujinx.Graphics.Video;
+using System;
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.InvTxfm;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class Detokenize
+    {
+        // Indices into the three-entry probability array ("prob") consulted by DecodeCoefs
+        // when reading the end-of-block, zero and one decisions of a token.
+        private const int EobContextNode = 0;
+        private const int ZeroContextNode = 1;
+        private const int OneContextNode = 2;
+
+        /// <summary>
+        /// Computes the context for coefficient position <paramref name="c"/> as the rounded-up
+        /// average of the cached token values of its two neighbors.
+        /// </summary>
+        /// <param name="neighbors">Neighbor table holding two position entries per coefficient</param>
+        /// <param name="tokenCache">Token values recorded so far, indexed by position</param>
+        /// <param name="c">Coefficient index into the neighbor table</param>
+        /// <returns>(1 + first neighbor token + second neighbor token) / 2</returns>
+        private static int GetCoefContext(ReadOnlySpan<short> neighbors, ReadOnlySpan<byte> tokenCache, int c)
+        {
+            const int neighborsPerCoef = 2;
+
+            int first = neighbors[neighborsPerCoef * c];
+            int second = neighbors[neighborsPerCoef * c + 1];
+
+            return (tokenCache[first] + tokenCache[second] + 1) >> 1;
+        }
+
+        /// <summary>
+        /// Reads <paramref name="n"/> bits from the bool reader, most significant first, using
+        /// one probability from <paramref name="probs"/> per bit.
+        /// </summary>
+        /// <param name="r">Bool reader to read from</param>
+        /// <param name="probs">Per-bit probabilities; the first <paramref name="n"/> entries are used</param>
+        /// <param name="n">Number of bits to read</param>
+        /// <param name="value">Reader value state kept in a caller local</param>
+        /// <param name="count">Reader count state kept in a caller local</param>
+        /// <param name="range">Reader range state kept in a caller local</param>
+        /// <returns>The bits read, assembled into an integer</returns>
+        private static int ReadCoeff(
+            ref Reader r,
+            ReadOnlySpan<byte> probs,
+            int n,
+            ref ulong value,
+            ref int count,
+            ref uint range)
+        {
+            int result = 0;
+
+            for (int bit = 0; bit < n; bit++)
+            {
+                result <<= 1;
+                result |= r.ReadBool(probs[bit], ref value, ref count, ref range);
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Decodes the coefficient tokens of one transform block from the bool reader and writes
+        /// the dequantized values into <paramref name="dqcoeff"/> at the positions given by <paramref name="scan"/>.
+        /// </summary>
+        /// <param name="xd">Macroblock state; supplies the probabilities, the optional counters and the bit depth</param>
+        /// <param name="type">Plane type used to select the probability and counter tables</param>
+        /// <param name="dqcoeff">Destination for the dequantized coefficients</param>
+        /// <param name="txSize">Transform size; at most 16 &lt;&lt; (2 * txSize) coefficients are decoded</param>
+        /// <param name="dq">Dequantization factors: entry 0 applies until the first token has been processed, entry 1 afterwards</param>
+        /// <param name="ctx">Initial context for the first coefficient</param>
+        /// <param name="scan">Maps the coefficient index to its position in the block</param>
+        /// <param name="nb">Neighbor table passed to <see cref="GetCoefContext"/></param>
+        /// <param name="r">Bool reader; its Value, Range and Count are written back before returning</param>
+        /// <returns>Number of coefficient positions processed before the end of block token or the size limit</returns>
+        private static int DecodeCoefs(
+            ref MacroBlockD xd,
+            PlaneType type,
+            Span<int> dqcoeff,
+            TxSize txSize,
+            ref Array2<short> dq,
+            int ctx,
+            ReadOnlySpan<short> scan,
+            ReadOnlySpan<short> nb,
+            ref Reader r)
+        {
+            ref Vp9BackwardUpdates counts = ref xd.Counts.Value;
+            int maxEob = 16 << ((int)txSize << 1);
+            ref Vp9EntropyProbs fc = ref xd.Fc.Value;
+            int refr = xd.Mi[0].Value.IsInterBlock() ? 1 : 0;
+            int band, c = 0;
+            ref Array6<Array6<Array3<byte>>> coefProbs = ref fc.CoefProbs[(int)txSize][(int)type][refr];
+            // Token value (0 to 5) recorded per position; read back by GetCoefContext.
+            Span<byte> tokenCache = stackalloc byte[32 * 32];
+            ReadOnlySpan<byte> bandTranslate = Luts.get_band_translate(txSize);
+            int dqShift = (txSize == TxSize.Tx32x32) ? 1 : 0;
+            int v;
+            short dqv = dq[0];
+            // The category 6 probabilities and bit count depend on the bit depth (12, 10, or anything else).
+            ReadOnlySpan<byte> cat6Prob = (xd.Bd == 12)
+                ? Luts.Vp9Cat6ProbHigh12
+                : (xd.Bd == 10) ? new ReadOnlySpan<byte>(Luts.Vp9Cat6ProbHigh12).Slice(2) : Luts.Vp9Cat6Prob;
+            int cat6Bits = (xd.Bd == 12) ? 18 : (xd.Bd == 10) ? 16 : 14;
+            // Keep value, range, and count as locals.  The compiler produces better
+            // results with the locals than using r directly.
+            ulong value = r.Value;
+            uint range = r.Range;
+            int count = r.Count;
+
+            while (c < maxEob)
+            {
+                int val = -1;
+                // Consume one band entry per coefficient position.
+                band = bandTranslate[0];
+                bandTranslate = bandTranslate.Slice(1);
+                ref Array3<byte> prob = ref coefProbs[band][ctx];
+                if (!xd.Counts.IsNull)
+                {
+                    ++counts.EobBranch[(int)txSize][(int)type][refr][band][ctx];
+                }
+
+                // A zero bit here is the end of block token: stop decoding.
+                if (r.ReadBool(prob[EobContextNode], ref value, ref count, ref range) == 0)
+                {
+                    if (!xd.Counts.IsNull)
+                    {
+                        ++counts.Coef[(int)txSize][(int)type][refr][band][ctx][Constants.EobModelToken];
+                    }
+
+                    break;
+                }
+
+                // Run of zero tokens: each one advances the position and refreshes ctx, band and prob.
+                while (r.ReadBool(prob[ZeroContextNode], ref value, ref count, ref range) == 0)
+                {
+                    if (!xd.Counts.IsNull)
+                    {
+                        ++counts.Coef[(int)txSize][(int)type][refr][band][ctx][Constants.ZeroToken];
+                    }
+
+                    dqv = dq[1];
+                    tokenCache[scan[c]] = 0;
+                    ++c;
+                    if (c >= maxEob)
+                    {
+                        r.Value = value;
+                        r.Range = range;
+                        r.Count = count;
+                        return c;  // Zero tokens at the end (no eob token)
+                    }
+                    ctx = GetCoefContext(nb, tokenCache, c);
+                    band = bandTranslate[0];
+                    bandTranslate = bandTranslate.Slice(1);
+                    prob = ref coefProbs[band][ctx];
+                }
+
+                // Non-zero token: a set bit means the magnitude is at least two and is
+                // resolved by walking the probability row selected by the pivot node.
+                if (r.ReadBool(prob[OneContextNode], ref value, ref count, ref range) != 0)
+                {
+                    ReadOnlySpan<byte> p = Luts.Vp9Pareto8Full[prob[Constants.PivotNode] - 1];
+                    if (!xd.Counts.IsNull)
+                    {
+                        ++counts.Coef[(int)txSize][(int)type][refr][band][ctx][Constants.TwoToken];
+                    }
+
+                    if (r.ReadBool(p[0], ref value, ref count, ref range) != 0)
+                    {
+                        if (r.ReadBool(p[3], ref value, ref count, ref range) != 0)
+                        {
+                            tokenCache[scan[c]] = 5;
+                            if (r.ReadBool(p[5], ref value, ref count, ref range) != 0)
+                            {
+                                if (r.ReadBool(p[7], ref value, ref count, ref range) != 0)
+                                {
+                                    val = Constants.Cat6MinVal + ReadCoeff(ref r, cat6Prob, cat6Bits, ref value, ref count, ref range);
+                                }
+                                else
+                                {
+                                    val = Constants.Cat5MinVal + ReadCoeff(ref r, Luts.Vp9Cat5Prob, 5, ref value, ref count, ref range);
+                                }
+                            }
+                            else if (r.ReadBool(p[6], ref value, ref count, ref range) != 0)
+                            {
+                                val = Constants.Cat4MinVal + ReadCoeff(ref r, Luts.Vp9Cat4Prob, 4, ref value, ref count, ref range);
+                            }
+                            else
+                            {
+                                val = Constants.Cat3MinVal + ReadCoeff(ref r, Luts.Vp9Cat3Prob, 3, ref value, ref count, ref range);
+                            }
+                        }
+                        else
+                        {
+                            tokenCache[scan[c]] = 4;
+                            if (r.ReadBool(p[4], ref value, ref count, ref range) != 0)
+                            {
+                                val = Constants.Cat2MinVal + ReadCoeff(ref r, Luts.Vp9Cat2Prob, 2, ref value, ref count, ref range);
+                            }
+                            else
+                            {
+                                val = Constants.Cat1MinVal + ReadCoeff(ref r, Luts.Vp9Cat1Prob, 1, ref value, ref count, ref range);
+                            }
+                        }
+                        // Val may use 18-bits
+                        v = (int)(((long)val * dqv) >> dqShift);
+                    }
+                    else
+                    {
+                        if (r.ReadBool(p[1], ref value, ref count, ref range) != 0)
+                        {
+                            tokenCache[scan[c]] = 3;
+                            v = ((3 + r.ReadBool(p[2], ref value, ref count, ref range)) * dqv) >> dqShift;
+                        }
+                        else
+                        {
+                            tokenCache[scan[c]] = 2;
+                            v = (2 * dqv) >> dqShift;
+                        }
+                    }
+                }
+                else
+                {
+                    if (!xd.Counts.IsNull)
+                    {
+                        ++counts.Coef[(int)txSize][(int)type][refr][band][ctx][Constants.OneToken];
+                    }
+
+                    tokenCache[scan[c]] = 1;
+                    v = dqv >> dqShift;
+                }
+                // The sign is read with an even (128) probability; a set bit negates the value.
+                dqcoeff[scan[c]] = (int)HighbdCheckRange(r.ReadBool(128, ref value, ref count, ref range) != 0 ? -v : v, xd.Bd);
+                ++c;
+                ctx = GetCoefContext(nb, tokenCache, c);
+                dqv = dq[1];
+            }
+
+            // Publish the locally tracked reader state back to the reader.
+            r.Value = value;
+            r.Range = range;
+            r.Count = count;
+            return c;
+        }
+
+        /// <summary>
+        /// Computes the above/left context shifts for a transform that extends past the
+        /// block limits stored in <paramref name="xd"/>: 8 bits for every block beyond the limit.
+        /// The outputs are left untouched when the limit is zero or is not exceeded.
+        /// </summary>
+        /// <param name="xd">Macroblock state providing <c>MaxBlocksWide</c> and <c>MaxBlocksHigh</c></param>
+        /// <param name="ctxShiftA">Receives the shift for the above context</param>
+        /// <param name="ctxShiftL">Receives the shift for the left context</param>
+        /// <param name="x">Horizontal block position of the transform</param>
+        /// <param name="y">Vertical block position of the transform</param>
+        /// <param name="txSizeInBlocks">Transform size measured in blocks</param>
+        private static void GetCtxShift(ref MacroBlockD xd, ref int ctxShiftA, ref int ctxShiftL, int x, int y, uint txSizeInBlocks)
+        {
+            bool exceedsWidth = xd.MaxBlocksWide != 0 && txSizeInBlocks + x > xd.MaxBlocksWide;
+            if (exceedsWidth)
+            {
+                ctxShiftA = (int)(txSizeInBlocks - (xd.MaxBlocksWide - x)) * 8;
+            }
+
+            bool exceedsHeight = xd.MaxBlocksHigh != 0 && txSizeInBlocks + y > xd.MaxBlocksHigh;
+            if (exceedsHeight)
+            {
+                ctxShiftL = (int)(txSizeInBlocks - (xd.MaxBlocksHigh - y)) * 8;
+            }
+        }
+
+        /// <summary>
+        /// Maps a plane index to a plane type: any index above zero yields the value 1, otherwise 0.
+        /// </summary>
+        /// <param name="plane">Plane index</param>
+        /// <returns>The plane type with numeric value 0 or 1</returns>
+        private static PlaneType GetPlaneType(int plane)
+        {
+            int typeValue = plane > 0 ? 1 : 0;
+
+            return (PlaneType)typeValue;
+        }
+
+        /// <summary>
+        /// Decodes the coefficient tokens of one transform block and updates the above/left
+        /// entropy contexts of the plane to reflect whether the block has any coefficients.
+        /// </summary>
+        /// <param name="twd">Tile worker data providing the bit reader and the macroblock state</param>
+        /// <param name="plane">Index of the plane being decoded</param>
+        /// <param name="sc">Scan order supplying the scan and neighbor tables</param>
+        /// <param name="x">Offset of the block into the above context</param>
+        /// <param name="y">Offset of the block into the left context</param>
+        /// <param name="txSize">Transform size of the block</param>
+        /// <param name="segId">Segment index used to select the dequantization factors</param>
+        /// <returns>The value returned by <see cref="DecodeCoefs"/>, or 0 for an invalid transform size</returns>
+        public static int DecodeBlockTokens(
+            ref TileWorkerData twd,
+            int plane,
+            Luts.ScanOrder sc,
+            int x,
+            int y,
+            TxSize txSize,
+            int segId)
+        {
+            ref Reader r = ref twd.BitReader;
+            ref MacroBlockD xd = ref twd.Xd;
+            ref MacroBlockDPlane pd = ref xd.Plane[plane];
+            ref Array2<short> dequant = ref pd.SegDequant[segId];
+            int eob;
+            Span<sbyte> a = pd.AboveContext.ToSpan().Slice(x);
+            Span<sbyte> l = pd.LeftContext.ToSpan().Slice(y);
+            int ctx;
+            int ctxShiftA = 0;
+            int ctxShiftL = 0;
+
+            // Every case follows the same pattern: ctx counts how many of the above/left
+            // contexts are non-zero (0 to 2), and after decoding each covered context byte is
+            // set to 1 when eob > 0 and to 0 otherwise. The larger transforms read and write
+            // 2, 4 or 8 context bytes at once by reinterpreting the span as a wider integer,
+            // and shift the written pattern right by ctxShiftA/ctxShiftL bits.
+            switch (txSize)
+            {
+                case TxSize.Tx4x4:
+                    ctx = a[0] != 0 ? 1 : 0;
+                    ctx += l[0] != 0 ? 1 : 0;
+                    eob = DecodeCoefs(
+                        ref xd,
+                        GetPlaneType(plane),
+                        pd.DqCoeff.ToSpan(),
+                        txSize,
+                        ref dequant,
+                        ctx,
+                        sc.Scan,
+                        sc.Neighbors,
+                        ref r);
+                    a[0] = l[0] = (sbyte)(eob > 0 ? 1 : 0);
+                    break;
+                case TxSize.Tx8x8:
+                    GetCtxShift(ref xd, ref ctxShiftA, ref ctxShiftL, x, y, 1 << (int)TxSize.Tx8x8);
+                    ctx = MemoryMarshal.Cast<sbyte, ushort>(a)[0] != 0 ? 1 : 0;
+                    ctx += MemoryMarshal.Cast<sbyte, ushort>(l)[0] != 0 ? 1 : 0;
+                    eob = DecodeCoefs(
+                        ref xd,
+                        GetPlaneType(plane),
+                        pd.DqCoeff.ToSpan(),
+                        txSize,
+                        ref dequant,
+                        ctx,
+                        sc.Scan,
+                        sc.Neighbors,
+                        ref r);
+                    MemoryMarshal.Cast<sbyte, ushort>(a)[0] = (ushort)((eob > 0 ? 0x0101 : 0) >> ctxShiftA);
+                    MemoryMarshal.Cast<sbyte, ushort>(l)[0] = (ushort)((eob > 0 ? 0x0101 : 0) >> ctxShiftL);
+                    break;
+                case TxSize.Tx16x16:
+                    GetCtxShift(ref xd, ref ctxShiftA, ref ctxShiftL, x, y, 1 << (int)TxSize.Tx16x16);
+                    ctx = MemoryMarshal.Cast<sbyte, uint>(a)[0] != 0 ? 1 : 0;
+                    ctx += MemoryMarshal.Cast<sbyte, uint>(l)[0] != 0 ? 1 : 0;
+                    eob = DecodeCoefs(
+                        ref xd,
+                        GetPlaneType(plane),
+                        pd.DqCoeff.ToSpan(),
+                        txSize,
+                        ref dequant,
+                        ctx,
+                        sc.Scan,
+                        sc.Neighbors,
+                        ref r);
+                    MemoryMarshal.Cast<sbyte, uint>(a)[0] = (uint)((eob > 0 ? 0x01010101 : 0) >> ctxShiftA);
+                    MemoryMarshal.Cast<sbyte, uint>(l)[0] = (uint)((eob > 0 ? 0x01010101 : 0) >> ctxShiftL);
+                    break;
+                case TxSize.Tx32x32:
+                    GetCtxShift(ref xd, ref ctxShiftA, ref ctxShiftL, x, y, 1 << (int)TxSize.Tx32x32);
+                    // NOTE: Casting to ulong here is safe because the default memory
+                    // alignment is at least 8 bytes and the Tx32x32 is aligned on 8 byte
+                    // boundaries.
+                    ctx = MemoryMarshal.Cast<sbyte, ulong>(a)[0] != 0 ? 1 : 0;
+                    ctx += MemoryMarshal.Cast<sbyte, ulong>(l)[0] != 0 ? 1 : 0;
+                    eob = DecodeCoefs(
+                        ref xd,
+                        GetPlaneType(plane),
+                        pd.DqCoeff.ToSpan(),
+                        txSize,
+                        ref dequant,
+                        ctx,
+                        sc.Scan,
+                        sc.Neighbors,
+                        ref r);
+                    MemoryMarshal.Cast<sbyte, ulong>(a)[0] = (eob > 0 ? 0x0101010101010101UL : 0) >> ctxShiftA;
+                    MemoryMarshal.Cast<sbyte, ulong>(l)[0] = (eob > 0 ? 0x0101010101010101UL : 0) >> ctxShiftL;
+                    break;
+                default:
+                    Debug.Assert(false, "Invalid transform size.");
+                    eob = 0;
+                    break;
+            }
+
+            return eob;
+        }
+    }
+}

+ 949 - 0
Ryujinx.Graphics.Nvdec.Vp9/Dsp/Convolve.cs

@@ -0,0 +1,949 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+    internal static class Convolve
+    {
+        // Gates the SSE4.1 path in ConvolveHoriz and the AVX2 path in ConvolveVert;
+        // when false, only the scalar loops are used.
+        private const bool UseIntrinsics = true;
+
+        /// <summary>
+        /// Computes four dot products at once: each of the four source vectors (eight 16-bit
+        /// lanes) is multiplied lane-wise with <paramref name="vfilter"/> and summed into a
+        /// single 32-bit value.
+        /// </summary>
+        /// <param name="vsrc0">Source samples for result lane 0</param>
+        /// <param name="vsrc1">Source samples for result lane 1</param>
+        /// <param name="vsrc2">Source samples for result lane 2</param>
+        /// <param name="vsrc3">Source samples for result lane 3</param>
+        /// <param name="vfilter">Eight 16-bit filter coefficients shared by all four products</param>
+        /// <param name="zero">An all-zero vector used as the second operand of the horizontal adds</param>
+        /// <returns>The four sums, with the sum for <paramref name="vsrc0"/> in the lowest lane</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<int> MultiplyAddAdjacent(
+            Vector128<short> vsrc0,
+            Vector128<short> vsrc1,
+            Vector128<short> vsrc2,
+            Vector128<short> vsrc3,
+            Vector128<short> vfilter,
+            Vector128<int> zero)
+        {
+            // The lane diagrams below list the highest lane first.
+            // < sumN, sumN, sumN, sumN >
+            Vector128<int> sum0 = Sse2.MultiplyAddAdjacent(vsrc0, vfilter);
+            Vector128<int> sum1 = Sse2.MultiplyAddAdjacent(vsrc1, vfilter);
+            Vector128<int> sum2 = Sse2.MultiplyAddAdjacent(vsrc2, vfilter);
+            Vector128<int> sum3 = Sse2.MultiplyAddAdjacent(vsrc3, vfilter);
+
+            // < 0, 0, sumN, sumN >
+            sum0 = Ssse3.HorizontalAdd(sum0, zero);
+            sum1 = Ssse3.HorizontalAdd(sum1, zero);
+            sum2 = Ssse3.HorizontalAdd(sum2, zero);
+            sum3 = Ssse3.HorizontalAdd(sum3, zero);
+
+            // < 0, 0, 0, sumN >
+            sum0 = Ssse3.HorizontalAdd(sum0, zero);
+            sum1 = Ssse3.HorizontalAdd(sum1, zero);
+            sum2 = Ssse3.HorizontalAdd(sum2, zero);
+            sum3 = Ssse3.HorizontalAdd(sum3, zero);
+
+            // < 0, 0, sum1, sum0 >
+            Vector128<int> sum01 = Sse2.UnpackLow(sum0, sum1);
+
+            // < 0, 0, sum3, sum2 >
+            Vector128<int> sum23 = Sse2.UnpackLow(sum2, sum3);
+
+            // < sum3, sum2, sum1, sum0 >
+            return Sse.MoveLowToHigh(sum01.AsSingle(), sum23.AsSingle()).AsInt32();
+        }
+
+        /// <summary>
+        /// Adds the rounding constant to every lane and arithmetically shifts the result
+        /// right by <c>FilterBits</c>.
+        /// </summary>
+        /// <param name="value">Lanes to round</param>
+        /// <param name="const64">Rounding constant added to every lane before the shift</param>
+        /// <returns>The rounded and shifted lanes</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<int> RoundShift(Vector128<int> value, Vector128<int> const64)
+        {
+            Vector128<int> biased = Sse2.Add(value, const64);
+
+            return Sse2.ShiftRightArithmetic(biased, FilterBits);
+        }
+
+        /// <summary>
+        /// Narrows four 32-bit lanes to bytes with unsigned saturation, in two steps
+        /// (32-bit to 16-bit, then 16-bit to 8-bit).
+        /// </summary>
+        /// <param name="value">Lanes to narrow</param>
+        /// <param name="zero">An all-zero vector used to fill the unused upper lanes</param>
+        /// <returns>A vector whose four lowest bytes hold the saturated values</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
+        {
+            Vector128<ushort> words = Sse41.PackUnsignedSaturate(value, zero);
+
+            return Sse2.PackUnsignedSaturate(words.AsInt16(), zero.AsInt16());
+        }
+
+        /// <summary>
+        /// SSE4.1 horizontal convolution that uses one filter row for the whole block.
+        /// ConvolveHoriz only dispatches here when the horizontal step is exactly
+        /// 1 &lt;&lt; SubpelBits, so the sub-pixel phase never changes.
+        /// </summary>
+        /// <param name="src">Pointer to the first source pixel of the block</param>
+        /// <param name="srcStride">Offset between consecutive source rows</param>
+        /// <param name="dst">Pointer to the first destination pixel</param>
+        /// <param name="dstStride">Offset between consecutive destination rows</param>
+        /// <param name="xFilters">Filter table; the row is selected by the sub-pixel bits of <paramref name="x0Q4"/></param>
+        /// <param name="x0Q4">Starting horizontal position; the low bits select the filter row, the remaining bits the pixel offset</param>
+        /// <param name="w">Block width; output is produced four pixels at a time</param>
+        /// <param name="h">Number of rows to process</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void ConvolveHorizSse41(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] xFilters,
+            int x0Q4,
+            int w,
+            int h)
+        {
+            Vector128<int> zero = Vector128<int>.Zero;
+            Vector128<int> const64 = Vector128.Create(64);
+
+            ulong x, y;
+            // Step back so the filter taps are positioned around the output pixel.
+            src -= SubpelTaps / 2 - 1;
+
+            fixed (Array8<short>* xFilter = xFilters)
+            {
+                // Each filter row is 8 shorts; load the row selected by the sub-pixel phase.
+                Vector128<short> vfilter = Sse2.LoadVector128((short*)xFilter + (uint)(x0Q4 & SubpelMask) * 8);
+
+                for (y = 0; y < (uint)h; ++y)
+                {
+                    ulong srcOffset = (uint)x0Q4 >> SubpelBits;
+                    for (x = 0; x < (uint)w; x += 4)
+                    {
+                        // Four overlapping 8-byte windows, one per output pixel, widened to 16 bits.
+                        Vector128<short> vsrc0 = Sse41.ConvertToVector128Int16(&src[srcOffset + x]);
+                        Vector128<short> vsrc1 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 1]);
+                        Vector128<short> vsrc2 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 2]);
+                        Vector128<short> vsrc3 = Sse41.ConvertToVector128Int16(&src[srcOffset + x + 3]);
+
+                        Vector128<int> sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);
+
+                        // Stores 4 bytes at once (the lowest 32 bits of the packed result).
+                        // NOTE(review): this always writes a full group of 4 pixels, so w is
+                        // presumably expected to be a multiple of 4 - confirm against callers.
+                        Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
+                    }
+                    src += srcStride;
+                    dst += dstStride;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Applies the horizontal sub-pixel filter to a block, writing the clipped result to
+        /// <paramref name="dst"/>. Uses the SSE4.1 path when it is available and the step is
+        /// exactly 1 &lt;&lt; SubpelBits, and a scalar loop otherwise.
+        /// </summary>
+        /// <param name="src">Pointer to the first source pixel of the block</param>
+        /// <param name="srcStride">Offset between consecutive source rows</param>
+        /// <param name="dst">Pointer to the first destination pixel</param>
+        /// <param name="dstStride">Offset between consecutive destination rows</param>
+        /// <param name="xFilters">Filter table indexed by the sub-pixel bits of the position</param>
+        /// <param name="x0Q4">Starting horizontal position, including the sub-pixel bits</param>
+        /// <param name="xStepQ4">Position increment per output pixel</param>
+        /// <param name="w">Block width in pixels</param>
+        /// <param name="h">Block height in rows</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void ConvolveHoriz(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] xFilters,
+            int x0Q4,
+            int xStepQ4,
+            int w,
+            int h)
+        {
+            if (Sse41.IsSupported && UseIntrinsics && xStepQ4 == 1 << SubpelBits)
+            {
+                ConvolveHorizSse41(src, srcStride, dst, dstStride, xFilters, x0Q4, w, h);
+                return;
+            }
+
+            // Step back so the filter taps are positioned around the output pixel.
+            byte* srcRow = src - (SubpelTaps / 2 - 1);
+            byte* dstRow = dst;
+
+            for (int row = 0; row < h; row++)
+            {
+                int posQ4 = x0Q4;
+
+                for (int col = 0; col < w; col++)
+                {
+                    byte* taps = srcRow + (posQ4 >> SubpelBits);
+                    ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];
+
+                    int acc = 0;
+                    for (int tap = 0; tap < SubpelTaps; tap++)
+                    {
+                        acc += taps[tap] * kernel[tap];
+                    }
+
+                    dstRow[col] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
+                    posQ4 += xStepQ4;
+                }
+
+                srcRow += srcStride;
+                dstRow += dstStride;
+            }
+        }
+
+        /// <summary>
+        /// Applies the horizontal sub-pixel filter to a block and averages the clipped result
+        /// with the pixel already present in <paramref name="dst"/>.
+        /// </summary>
+        /// <param name="src">Pointer to the first source pixel of the block</param>
+        /// <param name="srcStride">Offset between consecutive source rows</param>
+        /// <param name="dst">Pointer to the first destination pixel; read and then overwritten</param>
+        /// <param name="dstStride">Offset between consecutive destination rows</param>
+        /// <param name="xFilters">Filter table indexed by the sub-pixel bits of the position</param>
+        /// <param name="x0Q4">Starting horizontal position, including the sub-pixel bits</param>
+        /// <param name="xStepQ4">Position increment per output pixel</param>
+        /// <param name="w">Block width in pixels</param>
+        /// <param name="h">Block height in rows</param>
+        private static unsafe void ConvolveAvgHoriz(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] xFilters,
+            int x0Q4,
+            int xStepQ4,
+            int w,
+            int h)
+        {
+            // Step back so the filter taps are positioned around the output pixel.
+            byte* srcRow = src - (SubpelTaps / 2 - 1);
+            byte* dstRow = dst;
+
+            for (int row = 0; row < h; row++)
+            {
+                int posQ4 = x0Q4;
+
+                for (int col = 0; col < w; col++)
+                {
+                    byte* taps = srcRow + (posQ4 >> SubpelBits);
+                    ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];
+
+                    int acc = 0;
+                    for (int tap = 0; tap < SubpelTaps; tap++)
+                    {
+                        acc += taps[tap] * kernel[tap];
+                    }
+
+                    // Average the existing destination pixel with the filtered one, rounding up.
+                    dstRow[col] = (byte)BitUtils.RoundPowerOfTwo(dstRow[col] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits)), 1);
+                    posQ4 += xStepQ4;
+                }
+
+                srcRow += srcStride;
+                dstRow += dstStride;
+            }
+        }
+
+        /// <summary>
+        /// AVX2 vertical convolution that uses one filter row for the whole block.
+        /// ConvolveVert only dispatches here when the vertical step is exactly
+        /// 1 &lt;&lt; SubpelBits, so the sub-pixel phase never changes.
+        /// </summary>
+        /// <param name="src">Pointer to the first source pixel of the block</param>
+        /// <param name="srcStride">Offset between consecutive source rows</param>
+        /// <param name="dst">Pointer to the first destination pixel</param>
+        /// <param name="dstStride">Offset between consecutive destination rows</param>
+        /// <param name="yFilters">Filter table; the row is selected by the sub-pixel bits of <paramref name="y0Q4"/></param>
+        /// <param name="y0Q4">Starting vertical position; the low bits select the filter row, the remaining bits the row offset</param>
+        /// <param name="w">Block width; output is produced four pixels at a time</param>
+        /// <param name="h">Number of rows to produce</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void ConvolveVertAvx2(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] yFilters,
+            int y0Q4,
+            int w,
+            int h)
+        {
+            Vector128<int> zero = Vector128<int>.Zero;
+            Vector128<int> const64 = Vector128.Create(64);
+            // Byte offsets of eight consecutive rows, used as gather indices.
+            Vector256<int> indices = Vector256.Create(
+                0,
+                srcStride,
+                srcStride * 2,
+                srcStride * 3,
+                srcStride * 4,
+                srcStride * 5,
+                srcStride * 6,
+                srcStride * 7);
+
+            ulong x, y;
+            // Step back so the filter taps are positioned around the output row.
+            src -= srcStride * (SubpelTaps / 2 - 1);
+
+            fixed (Array8<short>* yFilter = yFilters)
+            {
+                // Each filter row is 8 shorts; load the row selected by the sub-pixel phase.
+                Vector128<short> vfilter = Sse2.LoadVector128((short*)yFilter + (uint)(y0Q4 & SubpelMask) * 8);
+
+                ulong srcBaseY = (uint)y0Q4 >> SubpelBits;
+                for (y = 0; y < (uint)h; ++y)
+                {
+                    // src is not advanced per row; the row offset is recomputed from y instead.
+                    ulong srcOffset = (srcBaseY + y) * (uint)srcStride;
+                    for (x = 0; x < (uint)w; x += 4)
+                    {
+                        // Gather 4 adjacent pixels from each of the 8 rows (one 32-bit lane per row).
+                        Vector256<int> vsrc = Avx2.GatherVector256((uint*)&src[srcOffset + x], indices, 1).AsInt32();
+
+                        Vector128<int> vsrcL = vsrc.GetLower();
+                        Vector128<int> vsrcH = vsrc.GetUpper();
+
+                        // Three rounds of byte interleaving transpose the 8x4 tile so that the
+                        // 8 vertical taps of each column end up contiguous.
+                        Vector128<byte> vsrcUnpck11 = Sse2.UnpackLow(vsrcL.AsByte(), vsrcH.AsByte());
+                        Vector128<byte> vsrcUnpck12 = Sse2.UnpackHigh(vsrcL.AsByte(), vsrcH.AsByte());
+
+                        Vector128<byte> vsrcUnpck21 = Sse2.UnpackLow(vsrcUnpck11, vsrcUnpck12);
+                        Vector128<byte> vsrcUnpck22 = Sse2.UnpackHigh(vsrcUnpck11, vsrcUnpck12);
+
+                        // vsrc01 holds columns 0 and 1, vsrc23 holds columns 2 and 3.
+                        Vector128<byte> vsrc01 = Sse2.UnpackLow(vsrcUnpck21, vsrcUnpck22);
+                        Vector128<byte> vsrc23 = Sse2.UnpackHigh(vsrcUnpck21, vsrcUnpck22);
+
+                        // Move the upper column of each pair into the low half for widening.
+                        Vector128<byte> vsrc11 = Sse.MoveHighToLow(vsrc01.AsSingle(), vsrc01.AsSingle()).AsByte();
+                        Vector128<byte> vsrc33 = Sse.MoveHighToLow(vsrc23.AsSingle(), vsrc23.AsSingle()).AsByte();
+
+                        Vector128<short> vsrc0 = Sse41.ConvertToVector128Int16(vsrc01);
+                        Vector128<short> vsrc1 = Sse41.ConvertToVector128Int16(vsrc11);
+                        Vector128<short> vsrc2 = Sse41.ConvertToVector128Int16(vsrc23);
+                        Vector128<short> vsrc3 = Sse41.ConvertToVector128Int16(vsrc33);
+
+                        Vector128<int> sum0123 = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);
+
+                        // Stores 4 bytes at once (the lowest 32 bits of the packed result).
+                        Sse.StoreScalar((float*)&dst[x], PackUnsignedSaturate(RoundShift(sum0123, const64), zero).AsSingle());
+                    }
+                    dst += dstStride;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Applies the vertical sub-pixel filter to a block, writing the clipped result to
+        /// <paramref name="dst"/>. Uses the AVX2 path when it is available and the step is
+        /// exactly 1 &lt;&lt; SubpelBits, and a scalar column-by-column loop otherwise.
+        /// </summary>
+        /// <param name="src">Pointer to the first source pixel of the block</param>
+        /// <param name="srcStride">Offset between consecutive source rows</param>
+        /// <param name="dst">Pointer to the first destination pixel</param>
+        /// <param name="dstStride">Offset between consecutive destination rows</param>
+        /// <param name="yFilters">Filter table indexed by the sub-pixel bits of the position</param>
+        /// <param name="y0Q4">Starting vertical position, including the sub-pixel bits</param>
+        /// <param name="yStepQ4">Position increment per output row</param>
+        /// <param name="w">Block width in pixels</param>
+        /// <param name="h">Block height in rows</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void ConvolveVert(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] yFilters,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h)
+        {
+            if (Avx2.IsSupported && UseIntrinsics && yStepQ4 == 1 << SubpelBits)
+            {
+                ConvolveVertAvx2(src, srcStride, dst, dstStride, yFilters, y0Q4, w, h);
+                return;
+            }
+
+            // Step back so the filter taps are positioned around the output row.
+            byte* srcCol = src - srcStride * (SubpelTaps / 2 - 1);
+            byte* dstCol = dst;
+
+            for (int col = 0; col < w; col++)
+            {
+                int posQ4 = y0Q4;
+
+                for (int row = 0; row < h; row++)
+                {
+                    byte* taps = srcCol + (posQ4 >> SubpelBits) * srcStride;
+                    ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];
+
+                    int acc = 0;
+                    for (int tap = 0; tap < SubpelTaps; tap++)
+                    {
+                        acc += taps[tap * srcStride] * kernel[tap];
+                    }
+
+                    dstCol[row * dstStride] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
+                    posQ4 += yStepQ4;
+                }
+
+                srcCol++;
+                dstCol++;
+            }
+        }
+
+        /// <summary>
+        /// Applies the vertical sub-pixel filter to a block and averages the clipped result
+        /// with the pixel already present in <paramref name="dst"/>.
+        /// </summary>
+        /// <param name="src">Pointer to the first source pixel of the block</param>
+        /// <param name="srcStride">Offset between consecutive source rows</param>
+        /// <param name="dst">Pointer to the first destination pixel; read and then overwritten</param>
+        /// <param name="dstStride">Offset between consecutive destination rows</param>
+        /// <param name="yFilters">Filter table indexed by the sub-pixel bits of the position</param>
+        /// <param name="y0Q4">Starting vertical position, including the sub-pixel bits</param>
+        /// <param name="yStepQ4">Position increment per output row</param>
+        /// <param name="w">Block width in pixels</param>
+        /// <param name="h">Block height in rows</param>
+        private static unsafe void ConvolveAvgVert(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] yFilters,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h)
+        {
+            // Step back so the filter taps are positioned around the output row.
+            byte* srcCol = src - srcStride * (SubpelTaps / 2 - 1);
+            byte* dstCol = dst;
+
+            for (int col = 0; col < w; col++)
+            {
+                int posQ4 = y0Q4;
+
+                for (int row = 0; row < h; row++)
+                {
+                    byte* taps = srcCol + (posQ4 >> SubpelBits) * srcStride;
+                    ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];
+
+                    int acc = 0;
+                    for (int tap = 0; tap < SubpelTaps; tap++)
+                    {
+                        acc += taps[tap * srcStride] * kernel[tap];
+                    }
+
+                    // Average the existing destination pixel with the filtered one, rounding up.
+                    dstCol[row * dstStride] = (byte)BitUtils.RoundPowerOfTwo(
+                        dstCol[row * dstStride] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits)), 1);
+                    posQ4 += yStepQ4;
+                }
+
+                srcCol++;
+                dstCol++;
+            }
+        }
+
+        /// <summary>
+        /// Horizontal-only filtering entry point; forwards to <see cref="ConvolveHoriz"/>.
+        /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are not used by this method.
+        /// </summary>
+        public static unsafe void Convolve8Horiz(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] filter,
+            int x0Q4,
+            int xStepQ4,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h) => ConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
+
+        /// <summary>
+        /// Horizontal-only filtering with destination averaging; forwards to <see cref="ConvolveAvgHoriz"/>.
+        /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are not used by this method.
+        /// </summary>
+        public static unsafe void Convolve8AvgHoriz(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] filter,
+            int x0Q4,
+            int xStepQ4,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h) => ConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
+
+        /// <summary>
+        /// Vertical-only filtering entry point; forwards to <see cref="ConvolveVert"/>.
+        /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are not used by this method.
+        /// </summary>
+        public static unsafe void Convolve8Vert(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] filter,
+            int x0Q4,
+            int xStepQ4,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h) => ConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
+
+        /// <summary>
+        /// Vertical-only filtering with destination averaging; forwards to <see cref="ConvolveAvgVert"/>.
+        /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are not used by this method.
+        /// </summary>
+        public static unsafe void Convolve8AvgVert(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] filter,
+            int x0Q4,
+            int xStepQ4,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h) => ConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
+
+        // Field-less struct with an explicit size of 64 * 135 bytes. Convolve8 declares a local
+        // of this type and takes its address to obtain its intermediate "temp" buffer.
+        [StructLayout(LayoutKind.Sequential, Size = 64 * 135)]
+        struct Temp
+        {
+        }
+
+        /// <summary>
+        /// Two-dimensional sub-pixel filtering: a horizontal pass into a stack buffer with a
+        /// stride of 64, followed by a vertical pass from that buffer into <paramref name="dst"/>.
+        /// </summary>
+        /// <param name="src">Pointer to the first source pixel of the block</param>
+        /// <param name="srcStride">Offset between consecutive source rows</param>
+        /// <param name="dst">Pointer to the first destination pixel</param>
+        /// <param name="dstStride">Offset between consecutive destination rows</param>
+        /// <param name="filter">Filter table used for both passes</param>
+        /// <param name="x0Q4">Starting horizontal position, including the sub-pixel bits</param>
+        /// <param name="xStepQ4">Horizontal position increment per output pixel</param>
+        /// <param name="y0Q4">Starting vertical position, including the sub-pixel bits</param>
+        /// <param name="yStepQ4">Vertical position increment per output row</param>
+        /// <param name="w">Block width in pixels (asserted to be at most 64)</param>
+        /// <param name="h">Block height in rows (asserted to be at most 64)</param>
+        public static unsafe void Convolve8(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] filter,
+            int x0Q4,
+            int xStepQ4,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h)
+        {
+            // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+            // 2d filtering proceeds in 2 steps:
+            //   (1) Interpolate horizontally into an intermediate buffer, temp.
+            //   (2) Interpolate temp vertically to derive the sub-pixel result.
+            // Deriving the maximum number of rows in the temp buffer (135):
+            // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
+            // --Largest block size is 64x64 pixels.
+            // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+            //   original frame (in 1/16th pixel units).
+            // --Must round-up because block may be located at sub-pixel position.
+            // --Require an additional SubpelTaps rows for the 8-tap filter tails.
+            // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+            // When calling in frame scaling function, the smallest scaling factor is x1/4
+            // ==> yStepQ4 = 64. Since w and h are at most 16, the temp buffer is still
+            // big enough.
+            Temp tempStruct;
+            byte* temp = (byte*)Unsafe.AsPointer(ref tempStruct); // Avoid zero initialization.
+            // Rows the horizontal pass must produce so the vertical pass has all of its taps.
+            int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;
+
+            Debug.Assert(w <= 64);
+            Debug.Assert(h <= 64);
+            Debug.Assert(yStepQ4 <= 32 || (yStepQ4 <= 64 && h <= 32));
+            Debug.Assert(xStepQ4 <= 64);
+
+            // The horizontal pass starts SubpelTaps / 2 - 1 rows above the block; the vertical
+            // pass is handed a pointer offset by the same number of rows into temp.
+            ConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, 64, filter, x0Q4, xStepQ4, w, intermediateHeight);
+            ConvolveVert(temp + 64 * (SubpelTaps / 2 - 1), 64, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
+        }
+
+        /// <summary>
+        /// Two-dimensional sub-pixel filtering whose result is averaged with the pixels
+        /// already present in <paramref name="dst"/>. The filtered block is first produced in
+        /// a 64x64 stack buffer by <see cref="Convolve8"/> and then merged by <see cref="ConvolveAvg"/>.
+        /// </summary>
+        public static unsafe void Convolve8Avg(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] filter,
+            int x0Q4,
+            int xStepQ4,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h)
+        {
+            Debug.Assert(w <= 64);
+            Debug.Assert(h <= 64);
+
+            // The fixed-size scratch block is what limits w and h to 64.
+            const int scratchStride = 64;
+            byte* scratch = stackalloc byte[scratchStride * 64];
+
+            Convolve8(src, srcStride, scratch, scratchStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+            ConvolveAvg(scratch, scratchStride, dst, dstStride, null, 0, 0, 0, 0, w, h);
+        }
+
+        /// <summary>
+        /// Copies a block of <paramref name="w"/> by <paramref name="h"/> pixels from
+        /// <paramref name="src"/> to <paramref name="dst"/>, one row at a time. The filter and
+        /// position parameters are not used by this method.
+        /// </summary>
+        public static unsafe void ConvolveCopy(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] filter,
+            int x0Q4,
+            int xStepQ4,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h)
+        {
+            for (int row = 0; row < h; row++)
+            {
+                MemoryUtil.Copy(dst, src, w);
+
+                src += srcStride;
+                dst += dstStride;
+            }
+        }
+
+        /// <summary>
+        /// Averages a block of <paramref name="w"/> by <paramref name="h"/> source pixels with
+        /// the pixels already present in <paramref name="dst"/>. The filter and position
+        /// parameters are not used by this method.
+        /// </summary>
+        public static unsafe void ConvolveAvg(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] filter,
+            int x0Q4,
+            int xStepQ4,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h)
+        {
+            byte* srcRow = src;
+            byte* dstRow = dst;
+
+            for (int row = 0; row < h; row++)
+            {
+                for (int col = 0; col < w; col++)
+                {
+                    dstRow[col] = (byte)BitUtils.RoundPowerOfTwo(dstRow[col] + srcRow[col], 1);
+                }
+
+                srcRow += srcStride;
+                dstRow += dstStride;
+            }
+        }
+
+        // Forwards every argument unchanged to Convolve8Horiz.
+        public static unsafe void ScaledHoriz(
+            byte* src, int srcStride,
+            byte* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h)
+            => Convolve8Horiz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+        // Forwards every argument unchanged to Convolve8Vert.
+        public static unsafe void ScaledVert(
+            byte* src, int srcStride,
+            byte* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h)
+            => Convolve8Vert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+        // Forwards every argument unchanged to Convolve8.
+        public static unsafe void Scaled2D(
+            byte* src, int srcStride,
+            byte* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h)
+            => Convolve8(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+        // Forwards every argument unchanged to Convolve8AvgHoriz.
+        public static unsafe void ScaledAvgHoriz(
+            byte* src, int srcStride,
+            byte* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h)
+            => Convolve8AvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+        // Forwards every argument unchanged to Convolve8AvgVert.
+        public static unsafe void ScaledAvgVert(
+            byte* src, int srcStride,
+            byte* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h)
+            => Convolve8AvgVert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+        // Forwards every argument unchanged to Convolve8Avg.
+        public static unsafe void ScaledAvg2D(
+            byte* src, int srcStride,
+            byte* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h)
+            => Convolve8Avg(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
+
+        // Horizontal filter for 16-bit samples. For each output sample a Q4 position
+        // starts at x0Q4 and advances by xStepQ4: (pos >> SubpelBits) selects the
+        // first source sample and (pos & SubpelMask) selects the kernel in xFilters.
+        // The SubpelTaps products are summed, rounded by FilterBits and passed
+        // through BitUtils.ClipPixelHighbd with bit depth bd.
+        private static unsafe void HighbdConvolveHoriz(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] xFilters,
+            int x0Q4, int xStepQ4,
+            int w, int h,
+            int bd)
+        {
+            // Move the start of the tap window to the left of the requested sample.
+            src -= SubpelTaps / 2 - 1;
+
+            for (int row = 0; row < h; row++)
+            {
+                int posQ4 = x0Q4;
+
+                for (int col = 0; col < w; col++)
+                {
+                    ushort* window = src + (posQ4 >> SubpelBits);
+                    ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];
+
+                    int acc = 0;
+                    for (int tap = 0; tap < SubpelTaps; tap++)
+                    {
+                        acc += window[tap] * kernel[tap];
+                    }
+
+                    dst[col] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
+                    posQ4 += xStepQ4;
+                }
+
+                src += srcStride;
+                dst += dstStride;
+            }
+        }
+
+        // Horizontal filter for 16-bit samples that blends into dst: each filtered,
+        // clipped sample is averaged (rounded, shift of 1) with the value already
+        // stored at the same dst position.
+        private static unsafe void HighbdConvolveAvgHoriz(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] xFilters,
+            int x0Q4, int xStepQ4,
+            int w, int h,
+            int bd)
+        {
+            // Move the start of the tap window to the left of the requested sample.
+            src -= SubpelTaps / 2 - 1;
+
+            for (int row = 0; row < h; row++)
+            {
+                int posQ4 = x0Q4;
+
+                for (int col = 0; col < w; col++)
+                {
+                    ushort* window = src + (posQ4 >> SubpelBits);
+                    ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];
+
+                    int acc = 0;
+                    for (int tap = 0; tap < SubpelTaps; tap++)
+                    {
+                        acc += window[tap] * kernel[tap];
+                    }
+
+                    int filtered = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
+                    dst[col] = (ushort)BitUtils.RoundPowerOfTwo(dst[col] + filtered, 1);
+                    posQ4 += xStepQ4;
+                }
+
+                src += srcStride;
+                dst += dstStride;
+            }
+        }
+
+        // Vertical filter for 16-bit samples, processed column by column. A Q4
+        // position starts at y0Q4 and advances by yStepQ4 per output row:
+        // (pos >> SubpelBits) selects the first source row and (pos & SubpelMask)
+        // selects the kernel in yFilters. Taps are read srcStride apart; the sum is
+        // rounded by FilterBits and clipped by BitUtils.ClipPixelHighbd.
+        private static unsafe void HighbdConvolveVert(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] yFilters,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+        {
+            // Move the start of the tap window above the requested row.
+            src -= srcStride * (SubpelTaps / 2 - 1);
+
+            for (int col = 0; col < w; col++)
+            {
+                ushort* srcCol = src + col;
+                ushort* dstCol = dst + col;
+                int posQ4 = y0Q4;
+
+                for (int row = 0; row < h; row++)
+                {
+                    ushort* window = srcCol + (posQ4 >> SubpelBits) * srcStride;
+                    ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];
+
+                    int acc = 0;
+                    for (int tap = 0; tap < SubpelTaps; tap++)
+                    {
+                        acc += window[tap * srcStride] * kernel[tap];
+                    }
+
+                    dstCol[row * dstStride] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
+                    posQ4 += yStepQ4;
+                }
+            }
+        }
+
+        // Vertical filter for 16-bit samples that blends into dst: each filtered,
+        // clipped sample is averaged (rounded, shift of 1) with the value already
+        // stored at the same dst position. Columns are processed one at a time.
+        private static unsafe void HighConvolveAvgVert(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] yFilters,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+        {
+            // Move the start of the tap window above the requested row.
+            src -= srcStride * (SubpelTaps / 2 - 1);
+
+            for (int col = 0; col < w; col++)
+            {
+                ushort* srcCol = src + col;
+                ushort* dstCol = dst + col;
+                int posQ4 = y0Q4;
+
+                for (int row = 0; row < h; row++)
+                {
+                    ushort* window = srcCol + (posQ4 >> SubpelBits) * srcStride;
+                    ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];
+
+                    int acc = 0;
+                    for (int tap = 0; tap < SubpelTaps; tap++)
+                    {
+                        acc += window[tap * srcStride] * kernel[tap];
+                    }
+
+                    int filtered = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
+                    int outIndex = row * dstStride;
+                    dstCol[outIndex] = (ushort)BitUtils.RoundPowerOfTwo(dstCol[outIndex] + filtered, 1);
+                    posQ4 += yStepQ4;
+                }
+            }
+        }
+
+        // Two-pass 2D filter for 16-bit samples: HighbdConvolveHoriz writes
+        // intermediateHeight rows into a stack buffer, then HighbdConvolveVert
+        // filters that buffer into dst. Size limits are only checked by debug
+        // assertions.
+        private static unsafe void HighbdConvolve(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+        {
+            // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+            // 2d filtering proceeds in 2 steps:
+            //   (1) Interpolate horizontally into an intermediate buffer, temp.
+            //   (2) Interpolate temp vertically to derive the sub-pixel result.
+            // Deriving the maximum number of rows in the temp buffer (135):
+            // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
+            // --Largest block size is 64x64 pixels.
+            // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+            //   original frame (in 1/16th pixel units).
+            // --Must round-up because block may be located at sub-pixel position.
+            // --Require an additional SubpelTaps rows for the 8-tap filter tails.
+            // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+            const int TempStride = 64;
+            const int TapLead = SubpelTaps / 2 - 1;
+
+            Debug.Assert(w <= 64);
+            Debug.Assert(h <= 64);
+            Debug.Assert(yStepQ4 <= 32);
+            Debug.Assert(xStepQ4 <= 32);
+
+            ushort* temp = stackalloc ushort[TempStride * 135];
+            int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;
+
+            // Start TapLead rows early so the vertical pass has rows above its first output.
+            HighbdConvolveHoriz(src - srcStride * TapLead, srcStride, temp, TempStride, filter, x0Q4, xStepQ4, w, intermediateHeight, bd);
+            HighbdConvolveVert(temp + TempStride * TapLead, TempStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
+        }
+
+        // Forwards to HighbdConvolveHoriz; y0Q4 and yStepQ4 are not used.
+        public static unsafe void HighbdConvolve8Horiz(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+            => HighbdConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
+
+        // Forwards to HighbdConvolveAvgHoriz; y0Q4 and yStepQ4 are not used.
+        public static unsafe void HighbdConvolve8AvgHoriz(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+            => HighbdConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
+
+        // Forwards to HighbdConvolveVert; x0Q4 and xStepQ4 are not used.
+        public static unsafe void HighbdConvolve8Vert(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+            => HighbdConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
+
+        // Forwards to HighConvolveAvgVert; x0Q4 and xStepQ4 are not used.
+        public static unsafe void HighbdConvolve8AvgVert(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+            => HighConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
+
+        // Public entry point that forwards every argument unchanged to HighbdConvolve.
+        public static unsafe void HighbdConvolve8(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+            => HighbdConvolve(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
+
+        // Runs HighbdConvolve8 into a 64x64 scratch block on the stack, then blends
+        // that block into dst with HighbdConvolveAvg.
+        // w and h must not exceed 64 because the scratch block has a fixed size;
+        // this is only verified by debug assertions.
+        public static unsafe void HighbdConvolve8Avg(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+        {
+            const int ScratchStride = 64;
+
+            Debug.Assert(w <= 64);
+            Debug.Assert(h <= 64);
+
+            ushort* scratch = stackalloc ushort[ScratchStride * 64];
+
+            HighbdConvolve8(src, srcStride, scratch, ScratchStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
+
+            // The averaging pass receives no filter and zeroed sub-pixel arguments.
+            HighbdConvolveAvg(scratch, ScratchStride, dst, dstStride, null, 0, 0, 0, 0, w, h, bd);
+        }
+
+        // Copies h rows of 16-bit samples from src to dst, passing w as the length of
+        // each row copy and advancing each pointer by its own stride. The filter,
+        // sub-pixel and bd arguments are not read.
+        public static unsafe void HighbdConvolveCopy(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+        {
+            for (int row = 0; row < h; row++)
+            {
+                MemoryUtil.Copy(dst, src, w);
+                dst += dstStride;
+                src += srcStride;
+            }
+        }
+
+        // Replaces every 16-bit sample of the w x h block in dst with the rounded mean
+        // of itself and the matching sample in src (BitUtils.RoundPowerOfTwo with a
+        // shift of 1). The filter, sub-pixel and bd arguments are not read.
+        public static unsafe void HighbdConvolveAvg(
+            ushort* src, int srcStride,
+            ushort* dst, int dstStride,
+            Array8<short>[] filter,
+            int x0Q4, int xStepQ4,
+            int y0Q4, int yStepQ4,
+            int w, int h,
+            int bd)
+        {
+            for (int row = 0; row < h; row++)
+            {
+                for (int col = 0; col < w; col++)
+                {
+                    int pairSum = dst[col] + src[col];
+                    dst[col] = (ushort)BitUtils.RoundPowerOfTwo(pairSum, 1);
+                }
+
+                dst += dstStride;
+                src += srcStride;
+            }
+        }
+    }
+}

+ 12 - 0
Ryujinx.Graphics.Nvdec.Vp9/Dsp/Filter.cs

@@ -0,0 +1,12 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+    // Fixed-point constants used by the convolution filter routines.
+    internal static class Filter
+    {
+        // Shift applied (through RoundPowerOfTwo) to a summed filter result.
+        public const int FilterBits = 7;
+
+        // Number of fractional bits in a Q4 position; `pos >> SubpelBits` is the whole-sample part.
+        public const int SubpelBits = 4;
+
+        // Number of distinct fractional positions (16).
+        public const int SubpelShifts = 1 << SubpelBits;
+
+        // Mask for the fractional part of a Q4 position (15); used to index the kernel table.
+        public const int SubpelMask = SubpelShifts - 1;
+
+        // Number of taps in each filter kernel.
+        public const int SubpelTaps = 8;
+    }
+}

+ 1379 - 0
Ryujinx.Graphics.Nvdec.Vp9/Dsp/IntraPred.cs

@@ -0,0 +1,1379 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+    internal static class IntraPred
+    {
+        // Returns a reference to the byte at column x, row y of a buffer whose rows are stride bytes apart.
+        private static unsafe ref byte Dst(byte* dst, int stride, int x, int y)
+        {
+            int offset = x + y * stride;
+
+            return ref dst[offset];
+        }
+
+        // Returns a reference to the 16-bit sample at column x, row y of a buffer whose rows are stride samples apart.
+        private static unsafe ref ushort Dst(ushort* dst, int stride, int x, int y)
+        {
+            int offset = x + y * stride;
+
+            return ref dst[offset];
+        }
+
+        // Weighted three-sample average (a + 2 * b + c + 2) >> 2.
+        private static byte Avg3(byte a, byte b, byte c)
+        {
+            int weighted = a + (b << 1) + c;
+
+            return (byte)((weighted + 2) >> 2);
+        }
+
+        // Weighted three-sample average (a + 2 * b + c + 2) >> 2 for 16-bit samples.
+        private static ushort Avg3(ushort a, ushort b, ushort c)
+        {
+            int weighted = a + (b << 1) + c;
+
+            return (ushort)((weighted + 2) >> 2);
+        }
+
+        // Two-sample average rounded up on ties: (a + b + 1) >> 1.
+        private static byte Avg2(byte a, byte b)
+        {
+            int total = a + b;
+
+            return (byte)((total + 1) >> 1);
+        }
+
+        // Two-sample average rounded up on ties, (a + b + 1) >> 1, for 16-bit samples.
+        private static ushort Avg2(ushort a, ushort b)
+        {
+            int total = a + b;
+
+            return (ushort)((total + 1) >> 1);
+        }
+
+        // Runs D207Predictor with a block size of 8.
+        public static unsafe void D207Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => D207Predictor(dst, stride, 8, above, left);
+
+        // Runs D207Predictor with a block size of 16.
+        public static unsafe void D207Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => D207Predictor(dst, stride, 16, above, left);
+
+        // Runs D207Predictor with a block size of 32.
+        public static unsafe void D207Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => D207Predictor(dst, stride, 32, above, left);
+
+        // Fills a bs x bs block from the left column only (above is not read).
+        // Column 0 holds two-sample averages of neighbouring left samples, column 1
+        // holds three-sample averages, the remainder of the bottom row repeats
+        // left[bs - 1], and every other sample is copied from one row down, two
+        // columns to the left.
+        private static unsafe void D207Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            int lastRow = bs - 1;
+
+            // Column 0.
+            byte* col0 = dst;
+            for (int row = 0; row < lastRow; row++)
+            {
+                col0[row * stride] = Avg2(left[row], left[row + 1]);
+            }
+
+            col0[lastRow * stride] = left[lastRow];
+
+            // Column 1.
+            byte* col1 = dst + 1;
+            for (int row = 0; row < bs - 2; row++)
+            {
+                col1[row * stride] = Avg3(left[row], left[row + 1], left[row + 2]);
+            }
+
+            col1[(bs - 2) * stride] = Avg3(left[bs - 2], left[lastRow], left[lastRow]);
+            col1[lastRow * stride] = left[lastRow];
+
+            // Columns 2 and onwards: bottom row first, then propagate upwards.
+            byte* body = dst + 2;
+            int bodyWidth = bs - 2;
+
+            for (int col = 0; col < bodyWidth; col++)
+            {
+                body[lastRow * stride + col] = left[lastRow];
+            }
+
+            for (int row = bs - 2; row >= 0; row--)
+            {
+                for (int col = 0; col < bodyWidth; col++)
+                {
+                    body[row * stride + col] = body[(row + 1) * stride + col - 2];
+                }
+            }
+        }
+
+        // Runs D63Predictor with a block size of 8.
+        public static unsafe void D63Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => D63Predictor(dst, stride, 8, above, left);
+
+        // Runs D63Predictor with a block size of 16.
+        public static unsafe void D63Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => D63Predictor(dst, stride, 16, above, left);
+
+        // Runs D63Predictor with a block size of 32.
+        public static unsafe void D63Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => D63Predictor(dst, stride, 32, above, left);
+
+        // Fills a bs x bs block from the above row only (left is not read).
+        // Row 0 holds two-sample averages and row 1 three-sample averages of the
+        // above samples. Each later pair of rows reuses rows 0 and 1 shifted left
+        // by one more sample, padding the right-hand side with above[bs - 1].
+        private static unsafe void D63Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            byte* row1 = dst + stride;
+
+            for (int col = 0; col < bs; col++)
+            {
+                dst[col] = Avg2(above[col], above[col + 1]);
+                row1[col] = Avg3(above[col], above[col + 1], above[col + 2]);
+            }
+
+            int copied = bs - 2;
+
+            for (int row = 2; row < bs; row += 2)
+            {
+                int shift = row >> 1;
+                byte* evenRow = dst + row * stride;
+                byte* oddRow = dst + (row + 1) * stride;
+
+                MemoryUtil.Copy(evenRow, dst + shift, copied);
+                MemoryUtil.Fill(evenRow + copied, above[bs - 1], bs - copied);
+                MemoryUtil.Copy(oddRow, row1 + shift, copied);
+                MemoryUtil.Fill(oddRow + copied, above[bs - 1], bs - copied);
+
+                copied--;
+            }
+        }
+
+        // Runs D45Predictor with a block size of 8.
+        public static unsafe void D45Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => D45Predictor(dst, stride, 8, above, left);
+
+        // Runs D45Predictor with a block size of 16.
+        public static unsafe void D45Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => D45Predictor(dst, stride, 16, above, left);
+
+        // Runs D45Predictor with a block size of 32.
+        public static unsafe void D45Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => D45Predictor(dst, stride, 32, above, left);
+
+        // Fills a bs x bs block from the above row only (left is not read).
+        // Row 0 holds three-sample averages of the above samples and ends with
+        // above[bs - 1]. Each following row is row 0 shifted left by one more
+        // sample, with the vacated right-hand side filled with above[bs - 1].
+        private static unsafe void D45Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            byte aboveRight = above[bs - 1];
+
+            for (int col = 0; col < bs - 1; col++)
+            {
+                dst[col] = Avg3(above[col], above[col + 1], above[col + 2]);
+            }
+
+            dst[bs - 1] = aboveRight;
+
+            byte* row = dst + stride;
+
+            for (int shift = 1; shift < bs; shift++)
+            {
+                // Samples still taken from row 0; the other shift + 1 are padding.
+                int copied = bs - 1 - shift;
+
+                MemoryUtil.Copy(row, dst + shift, copied);
+                MemoryUtil.Fill(row + copied, aboveRight, shift + 1);
+                row += stride;
+            }
+        }
+
+        // Runs D117Predictor with a block size of 8.
+        public static unsafe void D117Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => D117Predictor(dst, stride, 8, above, left);
+
+        // Runs D117Predictor with a block size of 16.
+        public static unsafe void D117Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => D117Predictor(dst, stride, 16, above, left);
+
+        // Runs D117Predictor with a block size of 32.
+        public static unsafe void D117Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => D117Predictor(dst, stride, 32, above, left);
+
+        // Fills a bs x bs block from the above row, the left column and the
+        // top-left sample above[-1].
+        // Row 0 holds two-sample averages and row 1 three-sample averages along the
+        // above row; the first column from row 2 down holds three-sample averages
+        // along the left column; every remaining sample is copied from two rows up,
+        // one column to the left.
+        private static unsafe void D117Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            byte* row0 = dst;
+            byte* row1 = dst + stride;
+            byte* row2 = row1 + stride;
+
+            // Row 0.
+            for (int col = 0; col < bs; col++)
+            {
+                row0[col] = Avg2(above[col - 1], above[col]);
+            }
+
+            // Row 1.
+            row1[0] = Avg3(left[0], above[-1], above[0]);
+            for (int col = 1; col < bs; col++)
+            {
+                row1[col] = Avg3(above[col - 2], above[col - 1], above[col]);
+            }
+
+            // First column, starting at row 2.
+            row2[0] = Avg3(above[-1], left[0], left[1]);
+            for (int k = 1; k < bs - 2; k++)
+            {
+                row2[k * stride] = Avg3(left[k - 1], left[k], left[k + 1]);
+            }
+
+            // Everything else, row by row from row 2.
+            byte* current = row2;
+            for (int row = 2; row < bs; row++)
+            {
+                for (int col = 1; col < bs; col++)
+                {
+                    current[col] = current[-2 * stride + col - 1];
+                }
+
+                current += stride;
+            }
+        }
+
+        // Runs D135Predictor with a block size of 8.
+        public static unsafe void D135Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => D135Predictor(dst, stride, 8, above, left);
+
+        // Runs D135Predictor with a block size of 16.
+        public static unsafe void D135Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => D135Predictor(dst, stride, 16, above, left);
+
+        // Runs D135Predictor with a block size of 32.
+        public static unsafe void D135Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => D135Predictor(dst, stride, 32, above, left);
+
+        // Fills a bs x bs block from the above row, the left column and the
+        // top-left sample above[-1].
+        // A border of 2 * bs - 1 three-sample averages is built on the stack,
+        // running from the bottom of the left column, around the top-left corner,
+        // to the end of the above row. Row i of the block is then the bs border
+        // samples starting at index bs - 1 - i.
+        private static unsafe void D135Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            // Sized for the largest block size used by the wrappers (32).
+            byte* border = stackalloc byte[32 + 32 - 1];
+
+            // Left column, walking upwards from the bottom.
+            for (int j = bs - 1; j >= 2; j--)
+            {
+                border[bs - 1 - j] = Avg3(left[j - 2], left[j - 1], left[j]);
+            }
+
+            // The three entries that straddle the top-left corner.
+            border[bs - 2] = Avg3(above[-1], left[0], left[1]);
+            border[bs - 1] = Avg3(left[0], above[-1], above[0]);
+            border[bs] = Avg3(above[-1], above[0], above[1]);
+
+            // Remainder of the above row, left to right.
+            byte* top = border + bs + 1;
+            for (int i = 0; i < bs - 2; i++)
+            {
+                top[i] = Avg3(above[i], above[i + 1], above[i + 2]);
+            }
+
+            // Each row starts one border entry earlier than the row before it.
+            byte* rowSource = border + bs - 1;
+            byte* rowTarget = dst;
+            for (int row = 0; row < bs; row++)
+            {
+                MemoryUtil.Copy(rowTarget, rowSource, bs);
+                rowTarget += stride;
+                rowSource--;
+            }
+        }
+
+        // Runs D153Predictor with a block size of 8.
+        public static unsafe void D153Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => D153Predictor(dst, stride, 8, above, left);
+
+        // Runs D153Predictor with a block size of 16.
+        public static unsafe void D153Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => D153Predictor(dst, stride, 16, above, left);
+
+        // Runs D153Predictor with a block size of 32.
+        public static unsafe void D153Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => D153Predictor(dst, stride, 32, above, left);
+
+        // Fills a bs x bs block from the above row, the left column and the
+        // top-left sample above[-1].
+        // Column 0 holds two-sample averages and column 1 three-sample averages
+        // along the left column; the rest of row 0 holds three-sample averages along
+        // the above row; every remaining sample is copied from one row up, two
+        // columns to the left.
+        private static unsafe void D153Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            // Column 0.
+            byte* col0 = dst;
+            col0[0] = Avg2(above[-1], left[0]);
+            for (int row = 1; row < bs; row++)
+            {
+                col0[row * stride] = Avg2(left[row - 1], left[row]);
+            }
+
+            // Column 1.
+            byte* col1 = dst + 1;
+            col1[0] = Avg3(left[0], above[-1], above[0]);
+            col1[stride] = Avg3(above[-1], left[0], left[1]);
+            for (int row = 2; row < bs; row++)
+            {
+                col1[row * stride] = Avg3(left[row - 2], left[row - 1], left[row]);
+            }
+
+            // Row 0 from column 2 onwards.
+            byte* body = dst + 2;
+            int bodyWidth = bs - 2;
+            for (int col = 0; col < bodyWidth; col++)
+            {
+                body[col] = Avg3(above[col - 1], above[col], above[col + 1]);
+            }
+
+            // Remaining rows from column 2 onwards.
+            byte* current = body + stride;
+            for (int row = 1; row < bs; row++)
+            {
+                for (int col = 0; col < bodyWidth; col++)
+                {
+                    current[col] = current[-stride + col - 2];
+                }
+
+                current += stride;
+            }
+        }
+
+        // Runs VPredictor with a block size of 4.
+        public static unsafe void VPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+            => VPredictor(dst, stride, 4, above, left);
+
+        // Runs VPredictor with a block size of 8.
+        public static unsafe void VPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => VPredictor(dst, stride, 8, above, left);
+
+        // Runs VPredictor with a block size of 16.
+        public static unsafe void VPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => VPredictor(dst, stride, 16, above, left);
+
+        // Runs VPredictor with a block size of 32.
+        public static unsafe void VPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => VPredictor(dst, stride, 32, above, left);
+
+        // Copies the above row (length bs) into each of the bs rows of dst; left is not read.
+        private static unsafe void VPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            byte* row = dst;
+
+            for (int i = 0; i < bs; i++, row += stride)
+            {
+                MemoryUtil.Copy(row, above, bs);
+            }
+        }
+
+        // Runs HPredictor with a block size of 4.
+        public static unsafe void HPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+            => HPredictor(dst, stride, 4, above, left);
+
+        // Runs HPredictor with a block size of 8.
+        public static unsafe void HPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => HPredictor(dst, stride, 8, above, left);
+
+        // Runs HPredictor with a block size of 16.
+        public static unsafe void HPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => HPredictor(dst, stride, 16, above, left);
+
+        // Runs HPredictor with a block size of 32.
+        public static unsafe void HPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => HPredictor(dst, stride, 32, above, left);
+
+        // Fills row r of the bs x bs block with left[r]; above is not read.
+        private static unsafe void HPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            byte* row = dst;
+
+            for (int r = 0; r < bs; r++, row += stride)
+            {
+                MemoryUtil.Fill(row, left[r], bs);
+            }
+        }
+
+        // Runs TMPredictor with a block size of 4.
+        public static unsafe void TMPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+            => TMPredictor(dst, stride, 4, above, left);
+
+        // Runs TMPredictor with a block size of 8.
+        public static unsafe void TMPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => TMPredictor(dst, stride, 8, above, left);
+
+        // Runs TMPredictor with a block size of 16.
+        public static unsafe void TMPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => TMPredictor(dst, stride, 16, above, left);
+
+        // Runs TMPredictor with a block size of 32.
+        public static unsafe void TMPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => TMPredictor(dst, stride, 32, above, left);
+
+        // Sets each sample (r, c) of the bs x bs block to
+        // left[r] + above[c] - above[-1], passed through BitUtils.ClipPixel.
+        private static unsafe void TMPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            int topLeft = above[-1];
+            byte* row = dst;
+
+            for (int r = 0; r < bs; r++, row += stride)
+            {
+                for (int c = 0; c < bs; c++)
+                {
+                    int gradient = above[c] - topLeft;
+                    row[c] = BitUtils.ClipPixel(left[r] + gradient);
+                }
+            }
+        }
+
+        // Runs Dc128Predictor with a block size of 4.
+        public static unsafe void Dc128Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+            => Dc128Predictor(dst, stride, 4, above, left);
+
+        // Runs Dc128Predictor with a block size of 8.
+        public static unsafe void Dc128Predictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => Dc128Predictor(dst, stride, 8, above, left);
+
+        // Runs Dc128Predictor with a block size of 16.
+        public static unsafe void Dc128Predictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => Dc128Predictor(dst, stride, 16, above, left);
+
+        // Runs Dc128Predictor with a block size of 32.
+        public static unsafe void Dc128Predictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => Dc128Predictor(dst, stride, 32, above, left);
+
+        // Fills the bs x bs block with the constant 128; neither above nor left is read.
+        private static unsafe void Dc128Predictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            const byte FillValue = 128;
+            byte* row = dst;
+
+            for (int i = 0; i < bs; i++, row += stride)
+            {
+                MemoryUtil.Fill(row, FillValue, bs);
+            }
+        }
+
+        // Runs DcLeftPredictor with a block size of 4.
+        public static unsafe void DcLeftPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+            => DcLeftPredictor(dst, stride, 4, above, left);
+
+        // Runs DcLeftPredictor with a block size of 8.
+        public static unsafe void DcLeftPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => DcLeftPredictor(dst, stride, 8, above, left);
+
+        // Runs DcLeftPredictor with a block size of 16.
+        public static unsafe void DcLeftPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => DcLeftPredictor(dst, stride, 16, above, left);
+
+        // Runs DcLeftPredictor with a block size of 32.
+        public static unsafe void DcLeftPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => DcLeftPredictor(dst, stride, 32, above, left);
+
+        // Fills the bs x bs block with the rounded mean of the first bs left samples; above is not read.
+        private static unsafe void DcLeftPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            int total = 0;
+            for (int i = 0; i < bs; i++)
+            {
+                total += left[i];
+            }
+
+            // Adding half the divisor first rounds the mean to nearest.
+            byte dc = (byte)((total + (bs >> 1)) / bs);
+
+            byte* row = dst;
+            for (int r = 0; r < bs; r++, row += stride)
+            {
+                MemoryUtil.Fill(row, dc, bs);
+            }
+        }
+
+        // Runs DcTopPredictor with a block size of 4.
+        public static unsafe void DcTopPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+            => DcTopPredictor(dst, stride, 4, above, left);
+
+        // Runs DcTopPredictor with a block size of 8.
+        public static unsafe void DcTopPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => DcTopPredictor(dst, stride, 8, above, left);
+
+        // Runs DcTopPredictor with a block size of 16.
+        public static unsafe void DcTopPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => DcTopPredictor(dst, stride, 16, above, left);
+
+        // Runs DcTopPredictor with a block size of 32.
+        public static unsafe void DcTopPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => DcTopPredictor(dst, stride, 32, above, left);
+
+        // Fills the bs x bs block with the rounded mean of the first bs above samples; left is not read.
+        private static unsafe void DcTopPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            int total = 0;
+            for (int i = 0; i < bs; i++)
+            {
+                total += above[i];
+            }
+
+            // Adding half the divisor first rounds the mean to nearest.
+            byte dc = (byte)((total + (bs >> 1)) / bs);
+
+            byte* row = dst;
+            for (int r = 0; r < bs; r++, row += stride)
+            {
+                MemoryUtil.Fill(row, dc, bs);
+            }
+        }
+
+        // Runs DcPredictor with a block size of 4.
+        public static unsafe void DcPredictor4x4(byte* dst, int stride, byte* above, byte* left)
+            => DcPredictor(dst, stride, 4, above, left);
+
+        // Runs DcPredictor with a block size of 8.
+        public static unsafe void DcPredictor8x8(byte* dst, int stride, byte* above, byte* left)
+            => DcPredictor(dst, stride, 8, above, left);
+
+        // Runs DcPredictor with a block size of 16.
+        public static unsafe void DcPredictor16x16(byte* dst, int stride, byte* above, byte* left)
+            => DcPredictor(dst, stride, 16, above, left);
+
+        // Runs DcPredictor with a block size of 32.
+        public static unsafe void DcPredictor32x32(byte* dst, int stride, byte* above, byte* left)
+            => DcPredictor(dst, stride, 32, above, left);
+
+        private static unsafe void DcPredictor(byte* dst, int stride, int bs, byte* above, byte* left)
+        {
+            int i, r, expectedDc, sum = 0;
+            int count = 2 * bs;
+
+            for (i = 0; i < bs; i++)
+            {
+                sum += above[i];
+                sum += left[i];
+            }
+
+            expectedDc = (sum + (count >> 1)) / count;
+
+            for (r = 0; r < bs; r++)
+            {
+                MemoryUtil.Fill(dst, (byte)expectedDc, bs);
+                dst += stride;
+            }
+        }
+
+        // Horizontal-edge 4x4 predictor: each row is filled with a 3-tap average of
+        // neighbouring left-column samples (the first row also uses above[-1], and
+        // the last row repeats left[3]).
+        public static unsafe void HePredictor4x4(byte* dst, int stride, byte* above, byte* left)
+        {
+            byte topLeft = above[-1];
+            byte left0 = left[0];
+            byte left1 = left[1];
+            byte left2 = left[2];
+            byte left3 = left[3];
+
+            byte* row = dst;
+            MemoryUtil.Fill(row, Avg3(topLeft, left0, left1), 4);
+
+            row += stride;
+            MemoryUtil.Fill(row, Avg3(left0, left1, left2), 4);
+
+            row += stride;
+            MemoryUtil.Fill(row, Avg3(left1, left2, left3), 4);
+
+            row += stride;
+            MemoryUtil.Fill(row, Avg3(left2, left3, left3), 4);
+        }
+
+        // Vertical-edge 4x4 predictor: the first row holds 3-tap averages of the
+        // samples above[-1..4]; the remaining three rows are copies of that row.
+        // `left` is not read.
+        public static unsafe void VePredictor4x4(byte* dst, int stride, byte* above, byte* left)
+        {
+            byte topLeft = above[-1];
+            byte above0 = above[0];
+            byte above1 = above[1];
+            byte above2 = above[2];
+            byte above3 = above[3];
+            byte above4 = above[4];
+
+            dst[0] = Avg3(topLeft, above0, above1);
+            dst[1] = Avg3(above0, above1, above2);
+            dst[2] = Avg3(above1, above2, above3);
+            dst[3] = Avg3(above2, above3, above4);
+
+            // Replicate the filtered first row into rows 1..3.
+            for (int row = 1; row < 4; row++)
+            {
+                MemoryUtil.Copy(dst + stride * row, dst, 4);
+            }
+        }
+
+        // D207 directional 4x4 predictor built only from the four `left` samples.
+        // The filtered values are computed once and then stored one row at a time.
+        // NOTE(review): Dst is defined elsewhere; its last two arguments are presumably
+        // the column and row of the destination pixel - confirm against its definition.
+        public static unsafe void D207Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+        {
+            byte left0 = left[0];
+            byte left1 = left[1];
+            byte left2 = left[2];
+            byte left3 = left[3];
+
+            byte pair01 = Avg2(left0, left1);
+            byte pair12 = Avg2(left1, left2);
+            byte pair23 = Avg2(left2, left3);
+            byte trio012 = Avg3(left0, left1, left2);
+            byte trio123 = Avg3(left1, left2, left3);
+            byte trio233 = Avg3(left2, left3, left3);
+
+            Dst(dst, stride, 0, 0) = pair01;
+            Dst(dst, stride, 1, 0) = trio012;
+            Dst(dst, stride, 2, 0) = pair12;
+            Dst(dst, stride, 3, 0) = trio123;
+
+            Dst(dst, stride, 0, 1) = pair12;
+            Dst(dst, stride, 1, 1) = trio123;
+            Dst(dst, stride, 2, 1) = pair23;
+            Dst(dst, stride, 3, 1) = trio233;
+
+            Dst(dst, stride, 0, 2) = pair23;
+            Dst(dst, stride, 1, 2) = trio233;
+            Dst(dst, stride, 2, 2) = left3;
+            Dst(dst, stride, 3, 2) = left3;
+
+            // Everything past the end of the left column repeats its last sample.
+            Dst(dst, stride, 0, 3) = left3;
+            Dst(dst, stride, 1, 3) = left3;
+            Dst(dst, stride, 2, 3) = left3;
+            Dst(dst, stride, 3, 3) = left3;
+        }
+
+        // D63 directional 4x4 predictor built from above[0..6]; `left` is not read.
+        // Even rows use 2-tap averages, odd rows use 3-tap averages, and each pair of
+        // rows is shifted one sample further along `above`.
+        public static unsafe void D63Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+        {
+            byte above0 = above[0];
+            byte above1 = above[1];
+            byte above2 = above[2];
+            byte above3 = above[3];
+            byte above4 = above[4];
+            byte above5 = above[5];
+            byte above6 = above[6];
+
+            byte pair0 = Avg2(above0, above1);
+            byte pair1 = Avg2(above1, above2);
+            byte pair2 = Avg2(above2, above3);
+            byte pair3 = Avg2(above3, above4);
+            byte pair4 = Avg2(above4, above5);  // This tap differs from vp8
+
+            byte trio0 = Avg3(above0, above1, above2);
+            byte trio1 = Avg3(above1, above2, above3);
+            byte trio2 = Avg3(above2, above3, above4);
+            byte trio3 = Avg3(above3, above4, above5);
+            byte trio4 = Avg3(above4, above5, above6);  // This tap differs from vp8
+
+            Dst(dst, stride, 0, 0) = pair0;
+            Dst(dst, stride, 1, 0) = pair1;
+            Dst(dst, stride, 2, 0) = pair2;
+            Dst(dst, stride, 3, 0) = pair3;
+
+            Dst(dst, stride, 0, 1) = trio0;
+            Dst(dst, stride, 1, 1) = trio1;
+            Dst(dst, stride, 2, 1) = trio2;
+            Dst(dst, stride, 3, 1) = trio3;
+
+            Dst(dst, stride, 0, 2) = pair1;
+            Dst(dst, stride, 1, 2) = pair2;
+            Dst(dst, stride, 2, 2) = pair3;
+            Dst(dst, stride, 3, 2) = pair4;
+
+            Dst(dst, stride, 0, 3) = trio1;
+            Dst(dst, stride, 1, 3) = trio2;
+            Dst(dst, stride, 2, 3) = trio3;
+            Dst(dst, stride, 3, 3) = trio4;
+        }
+
+        // Variant of the D63 4x4 predictor that reads above[0..7]: the last pixel of
+        // rows 2 and 3 uses a 3-tap average reaching one sample further along `above`.
+        // `left` is not read.
+        public static unsafe void D63ePredictor4x4(byte* dst, int stride, byte* above, byte* left)
+        {
+            byte above0 = above[0];
+            byte above1 = above[1];
+            byte above2 = above[2];
+            byte above3 = above[3];
+            byte above4 = above[4];
+            byte above5 = above[5];
+            byte above6 = above[6];
+            byte above7 = above[7];
+
+            byte pair0 = Avg2(above0, above1);
+            byte pair1 = Avg2(above1, above2);
+            byte pair2 = Avg2(above2, above3);
+            byte pair3 = Avg2(above3, above4);
+            byte row2Tail = Avg3(above4, above5, above6);
+
+            byte trio0 = Avg3(above0, above1, above2);
+            byte trio1 = Avg3(above1, above2, above3);
+            byte trio2 = Avg3(above2, above3, above4);
+            byte trio3 = Avg3(above3, above4, above5);
+            byte row3Tail = Avg3(above5, above6, above7);
+
+            Dst(dst, stride, 0, 0) = pair0;
+            Dst(dst, stride, 1, 0) = pair1;
+            Dst(dst, stride, 2, 0) = pair2;
+            Dst(dst, stride, 3, 0) = pair3;
+
+            Dst(dst, stride, 0, 1) = trio0;
+            Dst(dst, stride, 1, 1) = trio1;
+            Dst(dst, stride, 2, 1) = trio2;
+            Dst(dst, stride, 3, 1) = trio3;
+
+            Dst(dst, stride, 0, 2) = pair1;
+            Dst(dst, stride, 1, 2) = pair2;
+            Dst(dst, stride, 2, 2) = pair3;
+            Dst(dst, stride, 3, 2) = row2Tail;
+
+            Dst(dst, stride, 0, 3) = trio1;
+            Dst(dst, stride, 1, 3) = trio2;
+            Dst(dst, stride, 2, 3) = trio3;
+            Dst(dst, stride, 3, 3) = row3Tail;
+        }
+
+        // D45 directional 4x4 predictor built from above[0..7]; `left` is not read.
+        // Each row is the previous row shifted by one 3-tap average; the final pixel
+        // is the unfiltered above[7].
+        public static unsafe void D45Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+        {
+            byte above0 = above[0];
+            byte above1 = above[1];
+            byte above2 = above[2];
+            byte above3 = above[3];
+            byte above4 = above[4];
+            byte above5 = above[5];
+            byte above6 = above[6];
+            byte above7 = above[7];
+
+            byte trio0 = Avg3(above0, above1, above2);
+            byte trio1 = Avg3(above1, above2, above3);
+            byte trio2 = Avg3(above2, above3, above4);
+            byte trio3 = Avg3(above3, above4, above5);
+            byte trio4 = Avg3(above4, above5, above6);
+            byte trio5 = Avg3(above5, above6, above7);
+
+            Dst(dst, stride, 0, 0) = trio0;
+            Dst(dst, stride, 1, 0) = trio1;
+            Dst(dst, stride, 2, 0) = trio2;
+            Dst(dst, stride, 3, 0) = trio3;
+
+            Dst(dst, stride, 0, 1) = trio1;
+            Dst(dst, stride, 1, 1) = trio2;
+            Dst(dst, stride, 2, 1) = trio3;
+            Dst(dst, stride, 3, 1) = trio4;
+
+            Dst(dst, stride, 0, 2) = trio2;
+            Dst(dst, stride, 1, 2) = trio3;
+            Dst(dst, stride, 2, 2) = trio4;
+            Dst(dst, stride, 3, 2) = trio5;
+
+            Dst(dst, stride, 0, 3) = trio3;
+            Dst(dst, stride, 1, 3) = trio4;
+            Dst(dst, stride, 2, 3) = trio5;
+            Dst(dst, stride, 3, 3) = above7;  // Differs from vp8
+        }
+
+        // Variant of the D45 4x4 predictor: identical except that the final pixel is a
+        // 3-tap average that repeats above[7] instead of the raw sample.
+        // `left` is not read.
+        public static unsafe void D45ePredictor4x4(byte* dst, int stride, byte* above, byte* left)
+        {
+            byte above0 = above[0];
+            byte above1 = above[1];
+            byte above2 = above[2];
+            byte above3 = above[3];
+            byte above4 = above[4];
+            byte above5 = above[5];
+            byte above6 = above[6];
+            byte above7 = above[7];
+
+            byte trio0 = Avg3(above0, above1, above2);
+            byte trio1 = Avg3(above1, above2, above3);
+            byte trio2 = Avg3(above2, above3, above4);
+            byte trio3 = Avg3(above3, above4, above5);
+            byte trio4 = Avg3(above4, above5, above6);
+            byte trio5 = Avg3(above5, above6, above7);
+            byte trio6 = Avg3(above6, above7, above7);
+
+            Dst(dst, stride, 0, 0) = trio0;
+            Dst(dst, stride, 1, 0) = trio1;
+            Dst(dst, stride, 2, 0) = trio2;
+            Dst(dst, stride, 3, 0) = trio3;
+
+            Dst(dst, stride, 0, 1) = trio1;
+            Dst(dst, stride, 1, 1) = trio2;
+            Dst(dst, stride, 2, 1) = trio3;
+            Dst(dst, stride, 3, 1) = trio4;
+
+            Dst(dst, stride, 0, 2) = trio2;
+            Dst(dst, stride, 1, 2) = trio3;
+            Dst(dst, stride, 2, 2) = trio4;
+            Dst(dst, stride, 3, 2) = trio5;
+
+            Dst(dst, stride, 0, 3) = trio3;
+            Dst(dst, stride, 1, 3) = trio4;
+            Dst(dst, stride, 2, 3) = trio5;
+            Dst(dst, stride, 3, 3) = trio6;
+        }
+
+        // D117 directional 4x4 predictor built from left[0..2], above[-1] and above[0..3].
+        // Rows 2 and 3 reuse rows 0 and 1 shifted right by one pixel, with a new
+        // left-column average in their first position.
+        public static unsafe void D117Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+        {
+            byte left0 = left[0];
+            byte left1 = left[1];
+            byte left2 = left[2];
+            byte topLeft = above[-1];
+            byte above0 = above[0];
+            byte above1 = above[1];
+            byte above2 = above[2];
+            byte above3 = above[3];
+
+            byte pairCorner = Avg2(topLeft, above0);
+            byte pair01 = Avg2(above0, above1);
+            byte pair12 = Avg2(above1, above2);
+            byte pair23 = Avg2(above2, above3);
+
+            byte trioLeft = Avg3(left2, left1, left0);
+            byte trioLeftCorner = Avg3(left1, left0, topLeft);
+            byte trioCorner = Avg3(left0, topLeft, above0);
+            byte trioCorner01 = Avg3(topLeft, above0, above1);
+            byte trio012 = Avg3(above0, above1, above2);
+            byte trio123 = Avg3(above1, above2, above3);
+
+            Dst(dst, stride, 0, 0) = pairCorner;
+            Dst(dst, stride, 1, 0) = pair01;
+            Dst(dst, stride, 2, 0) = pair12;
+            Dst(dst, stride, 3, 0) = pair23;
+
+            Dst(dst, stride, 0, 1) = trioCorner;
+            Dst(dst, stride, 1, 1) = trioCorner01;
+            Dst(dst, stride, 2, 1) = trio012;
+            Dst(dst, stride, 3, 1) = trio123;
+
+            Dst(dst, stride, 0, 2) = trioLeftCorner;
+            Dst(dst, stride, 1, 2) = pairCorner;
+            Dst(dst, stride, 2, 2) = pair01;
+            Dst(dst, stride, 3, 2) = pair12;
+
+            Dst(dst, stride, 0, 3) = trioLeft;
+            Dst(dst, stride, 1, 3) = trioCorner;
+            Dst(dst, stride, 2, 3) = trioCorner01;
+            Dst(dst, stride, 3, 3) = trio012;
+        }
+
+        // D135 directional 4x4 predictor built from left[0..3], above[-1] and above[0..3].
+        // Seven 3-tap averages are computed along the border (from the bottom of the left
+        // column to the end of the top row); every pixel then takes the value whose index
+        // is (third Dst argument - fourth Dst argument + 3).
+        public static unsafe void D135Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+        {
+            byte left0 = left[0];
+            byte left1 = left[1];
+            byte left2 = left[2];
+            byte left3 = left[3];
+            byte topLeft = above[-1];
+            byte above0 = above[0];
+            byte above1 = above[1];
+            byte above2 = above[2];
+            byte above3 = above[3];
+
+            byte diag0 = Avg3(left1, left2, left3);
+            byte diag1 = Avg3(left0, left1, left2);
+            byte diag2 = Avg3(topLeft, left0, left1);
+            byte diag3 = Avg3(above0, topLeft, left0);
+            byte diag4 = Avg3(above1, above0, topLeft);
+            byte diag5 = Avg3(above2, above1, above0);
+            byte diag6 = Avg3(above3, above2, above1);
+
+            Dst(dst, stride, 0, 0) = diag3;
+            Dst(dst, stride, 1, 0) = diag4;
+            Dst(dst, stride, 2, 0) = diag5;
+            Dst(dst, stride, 3, 0) = diag6;
+
+            Dst(dst, stride, 0, 1) = diag2;
+            Dst(dst, stride, 1, 1) = diag3;
+            Dst(dst, stride, 2, 1) = diag4;
+            Dst(dst, stride, 3, 1) = diag5;
+
+            Dst(dst, stride, 0, 2) = diag1;
+            Dst(dst, stride, 1, 2) = diag2;
+            Dst(dst, stride, 2, 2) = diag3;
+            Dst(dst, stride, 3, 2) = diag4;
+
+            Dst(dst, stride, 0, 3) = diag0;
+            Dst(dst, stride, 1, 3) = diag1;
+            Dst(dst, stride, 2, 3) = diag2;
+            Dst(dst, stride, 3, 3) = diag3;
+        }
+
+        // D153 directional 4x4 predictor built from left[0..3], above[-1] and above[0..2].
+        // The first two pixels of every row come from the left column (2-tap, then 3-tap
+        // average); the last two pixels repeat the first two of the row before it.
+        public static unsafe void D153Predictor4x4(byte* dst, int stride, byte* above, byte* left)
+        {
+            byte left0 = left[0];
+            byte left1 = left[1];
+            byte left2 = left[2];
+            byte left3 = left[3];
+            byte topLeft = above[-1];
+            byte above0 = above[0];
+            byte above1 = above[1];
+            byte above2 = above[2];
+
+            byte pairRow0 = Avg2(left0, topLeft);
+            byte pairRow1 = Avg2(left1, left0);
+            byte pairRow2 = Avg2(left2, left1);
+            byte pairRow3 = Avg2(left3, left2);
+
+            byte trioTop012 = Avg3(above0, above1, above2);
+            byte trioTopCorner = Avg3(topLeft, above0, above1);
+            byte trioRow0 = Avg3(left0, topLeft, above0);
+            byte trioRow1 = Avg3(left1, left0, topLeft);
+            byte trioRow2 = Avg3(left2, left1, left0);
+            byte trioRow3 = Avg3(left3, left2, left1);
+
+            Dst(dst, stride, 0, 0) = pairRow0;
+            Dst(dst, stride, 1, 0) = trioRow0;
+            Dst(dst, stride, 2, 0) = trioTopCorner;
+            Dst(dst, stride, 3, 0) = trioTop012;
+
+            Dst(dst, stride, 0, 1) = pairRow1;
+            Dst(dst, stride, 1, 1) = trioRow1;
+            Dst(dst, stride, 2, 1) = pairRow0;
+            Dst(dst, stride, 3, 1) = trioRow0;
+
+            Dst(dst, stride, 0, 2) = pairRow2;
+            Dst(dst, stride, 1, 2) = trioRow2;
+            Dst(dst, stride, 2, 2) = pairRow1;
+            Dst(dst, stride, 3, 2) = trioRow1;
+
+            Dst(dst, stride, 0, 3) = pairRow3;
+            Dst(dst, stride, 1, 3) = trioRow3;
+            Dst(dst, stride, 2, 3) = pairRow2;
+            Dst(dst, stride, 3, 3) = trioRow2;
+        }
+
+        // High bit depth D207 predictors for 8x8, 16x16 and 32x32 blocks. Only `left`
+        // is read; `above` and `bd` are part of the common signature but unused.
+        public static unsafe void HighbdD207Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD207Predictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdD207Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD207Predictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdD207Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD207Predictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdD207Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            int lastRow = bs - 1;
+            ushort* firstColumn = dst;
+            ushort* secondColumn = dst + 1;
+            ushort* rest = dst + 2;
+
+            // Column 0: 2-tap averages down the left edge, last row is the raw sample.
+            for (int row = 0; row < lastRow; ++row)
+            {
+                firstColumn[row * stride] = Avg2(left[row], left[row + 1]);
+            }
+
+            firstColumn[lastRow * stride] = left[lastRow];
+
+            // Column 1: 3-tap averages, repeating the last left sample at the bottom.
+            for (int row = 0; row < lastRow - 1; ++row)
+            {
+                secondColumn[row * stride] = Avg3(left[row], left[row + 1], left[row + 2]);
+            }
+
+            secondColumn[(lastRow - 1) * stride] = Avg3(left[lastRow - 1], left[lastRow], left[lastRow]);
+            secondColumn[lastRow * stride] = left[lastRow];
+
+            // Remaining pixels of the last row all take the last left sample.
+            for (int col = 0; col < bs - 2; ++col)
+            {
+                rest[lastRow * stride + col] = left[lastRow];
+            }
+
+            // Walk upwards: every other row equals the row below it shifted right by two pixels.
+            for (int row = bs - 2; row >= 0; --row)
+            {
+                ushort* target = rest + row * stride;
+                ushort* source = rest + (row + 1) * stride - 2;
+
+                for (int col = 0; col < bs - 2; ++col)
+                {
+                    target[col] = source[col];
+                }
+            }
+        }
+
+        // High bit depth D63 predictors for 8x8, 16x16 and 32x32 blocks. Only `above`
+        // is read; `left` and `bd` are part of the common signature but unused.
+        public static unsafe void HighbdD63Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD63Predictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdD63Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD63Predictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdD63Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD63Predictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdD63Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            // Rows 0 and 1 are computed directly: 2-tap averages, then 3-tap averages.
+            ushort* firstRow = dst;
+            ushort* secondRow = dst + stride;
+
+            for (int col = 0; col < bs; ++col)
+            {
+                firstRow[col] = Avg2(above[col], above[col + 1]);
+                secondRow[col] = Avg3(above[col], above[col + 1], above[col + 2]);
+            }
+
+            // Each later pair of rows is rows 0/1 shifted left by (row / 2) pixels; the
+            // tail that runs off the end is padded with above[bs - 1].
+            int copied = bs - 2;
+
+            for (int row = 2; row < bs; row += 2)
+            {
+                int shift = row >> 1;
+                int padded = bs - copied;
+                ushort* evenRow = dst + row * stride;
+                ushort* oddRow = dst + (row + 1) * stride;
+
+                MemoryUtil.Copy(evenRow, dst + shift, copied);
+                MemoryUtil.Fill(evenRow + copied, above[bs - 1], padded);
+                MemoryUtil.Copy(oddRow, dst + stride + shift, copied);
+                MemoryUtil.Fill(oddRow + copied, above[bs - 1], padded);
+
+                --copied;
+            }
+        }
+
+        // High bit depth D45 predictors for 8x8, 16x16 and 32x32 blocks. Only `above`
+        // is read; `left` and `bd` are part of the common signature but unused.
+        public static unsafe void HighbdD45Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD45Predictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdD45Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD45Predictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdD45Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD45Predictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdD45Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            int lastCol = bs - 1;
+            ushort edge = above[lastCol];
+
+            // Row 0: 3-tap averages along `above`, ending with the raw edge sample.
+            for (int col = 0; col < lastCol; ++col)
+            {
+                dst[col] = Avg3(above[col], above[col + 1], above[col + 2]);
+            }
+
+            dst[lastCol] = edge;
+
+            // Row N is row 0 shifted left by N pixels, padded on the right with the edge sample.
+            ushort* line = dst + stride;
+            int kept = bs - 2;
+
+            for (int row = 1; row < bs; ++row)
+            {
+                MemoryUtil.Copy(line, dst + row, kept);
+                MemoryUtil.Fill(line + kept, edge, row + 1);
+
+                line += stride;
+                --kept;
+            }
+        }
+
+        // High bit depth D117 predictors for 8x8, 16x16 and 32x32 blocks. Reads
+        // above[-1..bs-1] and left[0..bs-2]; `bd` is unused.
+        public static unsafe void HighbdD117Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD117Predictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdD117Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD117Predictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdD117Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD117Predictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdD117Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            // Row 0: 2-tap averages along the top edge, starting at the top-left corner.
+            ushort* row0 = dst;
+            for (int col = 0; col < bs; col++)
+            {
+                row0[col] = Avg2(above[col - 1], above[col]);
+            }
+
+            // Row 1: 3-tap averages; the first pixel bridges the left column and the top row.
+            ushort* row1 = row0 + stride;
+            row1[0] = Avg3(left[0], above[-1], above[0]);
+            for (int col = 1; col < bs; col++)
+            {
+                row1[col] = Avg3(above[col - 2], above[col - 1], above[col]);
+            }
+
+            // First column of rows 2 and below: 3-tap averages down the left edge.
+            ushort* row2 = row1 + stride;
+            row2[0] = Avg3(above[-1], left[0], left[1]);
+            for (int row = 3; row < bs; ++row)
+            {
+                row2[(row - 2) * stride] = Avg3(left[row - 3], left[row - 2], left[row - 1]);
+            }
+
+            // Everything else copies the pixel two rows up and one column to the left.
+            ushort* line = row2;
+            for (int row = 2; row < bs; ++row)
+            {
+                ushort* source = line - 2 * stride - 1;
+
+                for (int col = 1; col < bs; col++)
+                {
+                    line[col] = source[col];
+                }
+
+                line += stride;
+            }
+        }
+
+        // High bit depth D135 predictors for 8x8, 16x16 and 32x32 blocks. Reads
+        // above[-1..bs-1] and left[0..bs-1]; `bd` is unused.
+        public static unsafe void HighbdD135Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD135Predictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdD135Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD135Predictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdD135Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD135Predictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        // The scratch buffer is sized for bs <= 32, the largest size passed by the wrappers above.
+        private static unsafe void HighbdD135Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            const int MaxBlockSize = 32;
+
+            // Filtered outer border, ordered from the bottom-left corner to the top-right corner.
+            ushort* border = stackalloc ushort[2 * MaxBlockSize - 1];
+
+            // Left column, walked from the bottom sample upwards.
+            for (int k = bs - 1; k >= 2; --k)
+            {
+                border[bs - 1 - k] = Avg3(left[k - 2], left[k - 1], left[k]);
+            }
+
+            // The three entries that straddle the top-left corner.
+            border[bs - 2] = Avg3(above[-1], left[0], left[1]);
+            border[bs - 1] = Avg3(left[0], above[-1], above[0]);
+            border[bs] = Avg3(above[-1], above[0], above[1]);
+
+            // Remaining top row, ascending.
+            for (int col = 0; col < bs - 2; ++col)
+            {
+                border[bs + 1 + col] = Avg3(above[col], above[col + 1], above[col + 2]);
+            }
+
+            // Each output row is a bs-wide window of the border, starting one entry
+            // earlier for every row further down.
+            ushort* window = border + bs - 1;
+            ushort* line = dst;
+
+            for (int row = 0; row < bs; ++row)
+            {
+                MemoryUtil.Copy(line, window, bs);
+
+                line += stride;
+                --window;
+            }
+        }
+
+        // High bit depth D153 predictors for 8x8, 16x16 and 32x32 blocks. Reads
+        // above[-1..bs-2] and left[0..bs-1]; `bd` is unused.
+        public static unsafe void HighbdD153Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD153Predictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdD153Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD153Predictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdD153Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdD153Predictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdD153Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            ushort* firstColumn = dst;
+            ushort* secondColumn = dst + 1;
+            ushort* rest = dst + 2;
+
+            // Column 0: 2-tap averages down the left edge, starting at the top-left corner.
+            firstColumn[0] = Avg2(above[-1], left[0]);
+            for (int row = 1; row < bs; row++)
+            {
+                firstColumn[row * stride] = Avg2(left[row - 1], left[row]);
+            }
+
+            // Column 1: 3-tap averages; the first two rows involve the corner sample.
+            secondColumn[0] = Avg3(left[0], above[-1], above[0]);
+            secondColumn[stride] = Avg3(above[-1], left[0], left[1]);
+            for (int row = 2; row < bs; row++)
+            {
+                secondColumn[row * stride] = Avg3(left[row - 2], left[row - 1], left[row]);
+            }
+
+            // Rest of row 0: 3-tap averages along the top edge.
+            for (int col = 0; col < bs - 2; col++)
+            {
+                rest[col] = Avg3(above[col - 1], above[col], above[col + 1]);
+            }
+
+            // Rest of the block: copy the pixel one row up and two columns to the left.
+            ushort* line = rest + stride;
+            for (int row = 1; row < bs; ++row)
+            {
+                ushort* source = line - stride - 2;
+
+                for (int col = 0; col < bs - 2; col++)
+                {
+                    line[col] = source[col];
+                }
+
+                line += stride;
+            }
+        }
+
+        // High bit depth vertical predictors: every row of the block is a copy of the
+        // first bs samples of `above`. `left` and `bd` are unused.
+        public static unsafe void HighbdVPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdVPredictor(dst, stride, 4, above, left, bd);
+
+        public static unsafe void HighbdVPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdVPredictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdVPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdVPredictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdVPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdVPredictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdVPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            ushort* line = dst;
+
+            for (int remaining = bs; remaining > 0; remaining--)
+            {
+                MemoryUtil.Copy(line, above, bs);
+                line += stride;
+            }
+        }
+
+        // High bit depth horizontal predictors: row r of the block is filled with
+        // left[r]. `above` and `bd` are unused.
+        public static unsafe void HighbdHPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdHPredictor(dst, stride, 4, above, left, bd);
+
+        public static unsafe void HighbdHPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdHPredictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdHPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdHPredictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdHPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdHPredictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdHPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            ushort* line = dst;
+
+            for (int row = 0; row < bs; row++)
+            {
+                MemoryUtil.Fill(line, left[row], bs);
+                line += stride;
+            }
+        }
+
+        // High bit depth TM predictors: each pixel is left[row] + above[col] - above[-1],
+        // passed through BitUtils.ClipPixelHighbd together with the bit depth `bd`.
+        public static unsafe void HighbdTMPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdTMPredictor(dst, stride, 4, above, left, bd);
+
+        public static unsafe void HighbdTMPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdTMPredictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdTMPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdTMPredictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdTMPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdTMPredictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdTMPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            int topLeft = above[-1];
+            ushort* line = dst;
+
+            for (int row = 0; row < bs; row++)
+            {
+                for (int col = 0; col < bs; col++)
+                {
+                    int predicted = left[row] + above[col] - topLeft;
+                    line[col] = BitUtils.ClipPixelHighbd(predicted, bd);
+                }
+
+                line += stride;
+            }
+        }
+
+        // High bit depth DC_128 predictors: the block is filled with the constant
+        // 128 << (bd - 8). Neither `above` nor `left` is read.
+        public static unsafe void HighbdDc128Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDc128Predictor(dst, stride, 4, above, left, bd);
+
+        public static unsafe void HighbdDc128Predictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDc128Predictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdDc128Predictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDc128Predictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdDc128Predictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDc128Predictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdDc128Predictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            // 128 scaled from 8-bit up to the requested bit depth.
+            ushort fillValue = (ushort)(128 << (bd - 8));
+            ushort* line = dst;
+
+            for (int remaining = bs; remaining > 0; remaining--)
+            {
+                MemoryUtil.Fill(line, fillValue, bs);
+                line += stride;
+            }
+        }
+
+        // High bit depth DC_LEFT predictors: the block is filled with the rounded mean
+        // of the bs samples read from `left`. `above` and `bd` are unused.
+        public static unsafe void HighbdDcLeftPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcLeftPredictor(dst, stride, 4, above, left, bd);
+
+        public static unsafe void HighbdDcLeftPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcLeftPredictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdDcLeftPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcLeftPredictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdDcLeftPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcLeftPredictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdDcLeftPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            int total = 0;
+            for (int row = 0; row < bs; row++)
+            {
+                total += left[row];
+            }
+
+            // Adding half of the divisor first rounds to nearest instead of truncating.
+            ushort dc = (ushort)((total + (bs >> 1)) / bs);
+
+            ushort* line = dst;
+            for (int remaining = bs; remaining > 0; remaining--)
+            {
+                MemoryUtil.Fill(line, dc, bs);
+                line += stride;
+            }
+        }
+
+        // High bit depth DC_TOP predictors: the block is filled with the rounded mean
+        // of the bs samples read from `above`. `left` and `bd` are unused.
+        public static unsafe void HighbdDcTopPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcTopPredictor(dst, stride, 4, above, left, bd);
+
+        public static unsafe void HighbdDcTopPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcTopPredictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdDcTopPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcTopPredictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdDcTopPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcTopPredictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdDcTopPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            int total = 0;
+            for (int col = 0; col < bs; col++)
+            {
+                total += above[col];
+            }
+
+            // Adding half of the divisor first rounds to nearest instead of truncating.
+            ushort dc = (ushort)((total + (bs >> 1)) / bs);
+
+            ushort* line = dst;
+            for (int remaining = bs; remaining > 0; remaining--)
+            {
+                MemoryUtil.Fill(line, dc, bs);
+                line += stride;
+            }
+        }
+
+        // High bit depth DC predictors: the block is filled with the rounded mean of
+        // the bs samples from `above` together with the bs samples from `left`.
+        // `bd` is unused.
+        public static unsafe void HighbdDcPredictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcPredictor(dst, stride, 4, above, left, bd);
+
+        public static unsafe void HighbdDcPredictor8x8(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcPredictor(dst, stride, 8, above, left, bd);
+
+        public static unsafe void HighbdDcPredictor16x16(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcPredictor(dst, stride, 16, above, left, bd);
+
+        public static unsafe void HighbdDcPredictor32x32(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+            => HighbdDcPredictor(dst, stride, 32, above, left, bd);
+
+        // Shared implementation for a bs x bs block; `stride` is counted in ushort elements.
+        private static unsafe void HighbdDcPredictor(ushort* dst, int stride, int bs, ushort* above, ushort* left, int bd)
+        {
+            // Number of samples that take part in the average (top row + left column).
+            int samples = 2 * bs;
+
+            int total = 0;
+            for (int k = 0; k < bs; k++)
+            {
+                total += above[k];
+                total += left[k];
+            }
+
+            // Adding half of the divisor first rounds to nearest instead of truncating.
+            ushort dc = (ushort)((total + (samples >> 1)) / samples);
+
+            ushort* line = dst;
+            for (int remaining = bs; remaining > 0; remaining--)
+            {
+                MemoryUtil.Fill(line, dc, bs);
+                line += stride;
+            }
+        }
+
+        // High bit depth D207 4x4 predictor built only from the four `left` samples;
+        // `above` and `bd` are unused. Filtered values are computed once and then
+        // stored one row at a time.
+        public static unsafe void HighbdD207Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        {
+            ushort left0 = left[0];
+            ushort left1 = left[1];
+            ushort left2 = left[2];
+            ushort left3 = left[3];
+
+            ushort pair01 = Avg2(left0, left1);
+            ushort pair12 = Avg2(left1, left2);
+            ushort pair23 = Avg2(left2, left3);
+            ushort trio012 = Avg3(left0, left1, left2);
+            ushort trio123 = Avg3(left1, left2, left3);
+            ushort trio233 = Avg3(left2, left3, left3);
+
+            Dst(dst, stride, 0, 0) = pair01;
+            Dst(dst, stride, 1, 0) = trio012;
+            Dst(dst, stride, 2, 0) = pair12;
+            Dst(dst, stride, 3, 0) = trio123;
+
+            Dst(dst, stride, 0, 1) = pair12;
+            Dst(dst, stride, 1, 1) = trio123;
+            Dst(dst, stride, 2, 1) = pair23;
+            Dst(dst, stride, 3, 1) = trio233;
+
+            Dst(dst, stride, 0, 2) = pair23;
+            Dst(dst, stride, 1, 2) = trio233;
+            Dst(dst, stride, 2, 2) = left3;
+            Dst(dst, stride, 3, 2) = left3;
+
+            // Everything past the end of the left column repeats its last sample.
+            Dst(dst, stride, 0, 3) = left3;
+            Dst(dst, stride, 1, 3) = left3;
+            Dst(dst, stride, 2, 3) = left3;
+            Dst(dst, stride, 3, 3) = left3;
+        }
+
+        // High bit depth D63 4x4 predictor built from above[0..6]; `left` and `bd` are
+        // unused. Even rows use 2-tap averages, odd rows use 3-tap averages, and each
+        // pair of rows is shifted one sample further along `above`.
+        public static unsafe void HighbdD63Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        {
+            ushort above0 = above[0];
+            ushort above1 = above[1];
+            ushort above2 = above[2];
+            ushort above3 = above[3];
+            ushort above4 = above[4];
+            ushort above5 = above[5];
+            ushort above6 = above[6];
+
+            ushort pair0 = Avg2(above0, above1);
+            ushort pair1 = Avg2(above1, above2);
+            ushort pair2 = Avg2(above2, above3);
+            ushort pair3 = Avg2(above3, above4);
+            ushort pair4 = Avg2(above4, above5);  // This tap differs from vp8
+
+            ushort trio0 = Avg3(above0, above1, above2);
+            ushort trio1 = Avg3(above1, above2, above3);
+            ushort trio2 = Avg3(above2, above3, above4);
+            ushort trio3 = Avg3(above3, above4, above5);
+            ushort trio4 = Avg3(above4, above5, above6);  // This tap differs from vp8
+
+            Dst(dst, stride, 0, 0) = pair0;
+            Dst(dst, stride, 1, 0) = pair1;
+            Dst(dst, stride, 2, 0) = pair2;
+            Dst(dst, stride, 3, 0) = pair3;
+
+            Dst(dst, stride, 0, 1) = trio0;
+            Dst(dst, stride, 1, 1) = trio1;
+            Dst(dst, stride, 2, 1) = trio2;
+            Dst(dst, stride, 3, 1) = trio3;
+
+            Dst(dst, stride, 0, 2) = pair1;
+            Dst(dst, stride, 1, 2) = pair2;
+            Dst(dst, stride, 2, 2) = pair3;
+            Dst(dst, stride, 3, 2) = pair4;
+
+            Dst(dst, stride, 0, 3) = trio1;
+            Dst(dst, stride, 1, 3) = trio2;
+            Dst(dst, stride, 2, 3) = trio3;
+            Dst(dst, stride, 3, 3) = trio4;
+        }
+
+        // High bit depth D45 4x4 predictor built from above[0..7]; `left` and `bd` are
+        // unused. Each row is the previous row shifted by one 3-tap average; the final
+        // pixel is the unfiltered above[7].
+        public static unsafe void HighbdD45Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        {
+            ushort above0 = above[0];
+            ushort above1 = above[1];
+            ushort above2 = above[2];
+            ushort above3 = above[3];
+            ushort above4 = above[4];
+            ushort above5 = above[5];
+            ushort above6 = above[6];
+            ushort above7 = above[7];
+
+            ushort trio0 = Avg3(above0, above1, above2);
+            ushort trio1 = Avg3(above1, above2, above3);
+            ushort trio2 = Avg3(above2, above3, above4);
+            ushort trio3 = Avg3(above3, above4, above5);
+            ushort trio4 = Avg3(above4, above5, above6);
+            ushort trio5 = Avg3(above5, above6, above7);
+
+            Dst(dst, stride, 0, 0) = trio0;
+            Dst(dst, stride, 1, 0) = trio1;
+            Dst(dst, stride, 2, 0) = trio2;
+            Dst(dst, stride, 3, 0) = trio3;
+
+            Dst(dst, stride, 0, 1) = trio1;
+            Dst(dst, stride, 1, 1) = trio2;
+            Dst(dst, stride, 2, 1) = trio3;
+            Dst(dst, stride, 3, 1) = trio4;
+
+            Dst(dst, stride, 0, 2) = trio2;
+            Dst(dst, stride, 1, 2) = trio3;
+            Dst(dst, stride, 2, 2) = trio4;
+            Dst(dst, stride, 3, 2) = trio5;
+
+            Dst(dst, stride, 0, 3) = trio3;
+            Dst(dst, stride, 1, 3) = trio4;
+            Dst(dst, stride, 2, 3) = trio5;
+            Dst(dst, stride, 3, 3) = above7;  // Differs from vp8
+        }
+
+        // High bit depth D117 4x4 predictor built from left[0..2], above[-1] and
+        // above[0..3]; `bd` is unused. Rows 2 and 3 reuse rows 0 and 1 shifted right by
+        // one pixel, with a new left-column average in their first position.
+        public static unsafe void HighbdD117Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        {
+            ushort left0 = left[0];
+            ushort left1 = left[1];
+            ushort left2 = left[2];
+            ushort topLeft = above[-1];
+            ushort above0 = above[0];
+            ushort above1 = above[1];
+            ushort above2 = above[2];
+            ushort above3 = above[3];
+
+            ushort pairCorner = Avg2(topLeft, above0);
+            ushort pair01 = Avg2(above0, above1);
+            ushort pair12 = Avg2(above1, above2);
+            ushort pair23 = Avg2(above2, above3);
+
+            ushort trioLeft = Avg3(left2, left1, left0);
+            ushort trioLeftCorner = Avg3(left1, left0, topLeft);
+            ushort trioCorner = Avg3(left0, topLeft, above0);
+            ushort trioCorner01 = Avg3(topLeft, above0, above1);
+            ushort trio012 = Avg3(above0, above1, above2);
+            ushort trio123 = Avg3(above1, above2, above3);
+
+            Dst(dst, stride, 0, 0) = pairCorner;
+            Dst(dst, stride, 1, 0) = pair01;
+            Dst(dst, stride, 2, 0) = pair12;
+            Dst(dst, stride, 3, 0) = pair23;
+
+            Dst(dst, stride, 0, 1) = trioCorner;
+            Dst(dst, stride, 1, 1) = trioCorner01;
+            Dst(dst, stride, 2, 1) = trio012;
+            Dst(dst, stride, 3, 1) = trio123;
+
+            Dst(dst, stride, 0, 2) = trioLeftCorner;
+            Dst(dst, stride, 1, 2) = pairCorner;
+            Dst(dst, stride, 2, 2) = pair01;
+            Dst(dst, stride, 3, 2) = pair12;
+
+            Dst(dst, stride, 0, 3) = trioLeft;
+            Dst(dst, stride, 1, 3) = trioCorner;
+            Dst(dst, stride, 2, 3) = trioCorner01;
+            Dst(dst, stride, 3, 3) = trio012;
+        }
+
+        // High bit depth D135 4x4 predictor built from left[0..3], above[-1] and
+        // above[0..3]; `bd` is unused. Seven 3-tap averages are computed along the border
+        // (bottom of the left column to the end of the top row); every pixel then takes
+        // the value whose index is (third Dst argument - fourth Dst argument + 3).
+        public static unsafe void HighbdD135Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        {
+            ushort left0 = left[0];
+            ushort left1 = left[1];
+            ushort left2 = left[2];
+            ushort left3 = left[3];
+            ushort topLeft = above[-1];
+            ushort above0 = above[0];
+            ushort above1 = above[1];
+            ushort above2 = above[2];
+            ushort above3 = above[3];
+
+            ushort diag0 = Avg3(left1, left2, left3);
+            ushort diag1 = Avg3(left0, left1, left2);
+            ushort diag2 = Avg3(topLeft, left0, left1);
+            ushort diag3 = Avg3(above0, topLeft, left0);
+            ushort diag4 = Avg3(above1, above0, topLeft);
+            ushort diag5 = Avg3(above2, above1, above0);
+            ushort diag6 = Avg3(above3, above2, above1);
+
+            Dst(dst, stride, 0, 0) = diag3;
+            Dst(dst, stride, 1, 0) = diag4;
+            Dst(dst, stride, 2, 0) = diag5;
+            Dst(dst, stride, 3, 0) = diag6;
+
+            Dst(dst, stride, 0, 1) = diag2;
+            Dst(dst, stride, 1, 1) = diag3;
+            Dst(dst, stride, 2, 1) = diag4;
+            Dst(dst, stride, 3, 1) = diag5;
+
+            Dst(dst, stride, 0, 2) = diag1;
+            Dst(dst, stride, 1, 2) = diag2;
+            Dst(dst, stride, 2, 2) = diag3;
+            Dst(dst, stride, 3, 2) = diag4;
+
+            Dst(dst, stride, 0, 3) = diag0;
+            Dst(dst, stride, 1, 3) = diag1;
+            Dst(dst, stride, 2, 3) = diag2;
+            Dst(dst, stride, 3, 3) = diag3;
+        }
+
+        // High bit depth D153 4x4 predictor built from left[0..3], above[-1] and
+        // above[0..2]; `bd` is unused. The first two pixels of every row come from the
+        // left column (2-tap, then 3-tap average); the last two pixels repeat the first
+        // two of the row before it.
+        public static unsafe void HighbdD153Predictor4x4(ushort* dst, int stride, ushort* above, ushort* left, int bd)
+        {
+            ushort left0 = left[0];
+            ushort left1 = left[1];
+            ushort left2 = left[2];
+            ushort left3 = left[3];
+            ushort topLeft = above[-1];
+            ushort above0 = above[0];
+            ushort above1 = above[1];
+            ushort above2 = above[2];
+
+            ushort pairRow0 = Avg2(left0, topLeft);
+            ushort pairRow1 = Avg2(left1, left0);
+            ushort pairRow2 = Avg2(left2, left1);
+            ushort pairRow3 = Avg2(left3, left2);
+
+            ushort trioTop012 = Avg3(above0, above1, above2);
+            ushort trioTopCorner = Avg3(topLeft, above0, above1);
+            ushort trioRow0 = Avg3(left0, topLeft, above0);
+            ushort trioRow1 = Avg3(left1, left0, topLeft);
+            ushort trioRow2 = Avg3(left2, left1, left0);
+            ushort trioRow3 = Avg3(left3, left2, left1);
+
+            Dst(dst, stride, 0, 0) = pairRow0;
+            Dst(dst, stride, 1, 0) = trioRow0;
+            Dst(dst, stride, 2, 0) = trioTopCorner;
+            Dst(dst, stride, 3, 0) = trioTop012;
+
+            Dst(dst, stride, 0, 1) = pairRow1;
+            Dst(dst, stride, 1, 1) = trioRow1;
+            Dst(dst, stride, 2, 1) = pairRow0;
+            Dst(dst, stride, 3, 1) = trioRow0;
+
+            Dst(dst, stride, 0, 2) = pairRow2;
+            Dst(dst, stride, 1, 2) = trioRow2;
+            Dst(dst, stride, 2, 2) = pairRow1;
+            Dst(dst, stride, 3, 2) = trioRow1;
+
+            Dst(dst, stride, 0, 3) = pairRow3;
+            Dst(dst, stride, 1, 3) = trioRow3;
+            Dst(dst, stride, 2, 3) = pairRow2;
+            Dst(dst, stride, 3, 3) = trioRow2;
+        }
+    }
+}

+ 2868 - 0
Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs

@@ -0,0 +1,2868 @@
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.TxfmCommon;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+    internal static class InvTxfm
+    {
+        // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
+        // transform amplify bits + 1 bit for contingency in rounding and quantizing
+        private const int HighbdValidTxfmMagnitudeRange = (1 << 25);
+
+        /// <summary>
+        /// Scans the first <paramref name="size"/> coefficients of <paramref name="input"/> for a value
+        /// whose magnitude is at least <c>HighbdValidTxfmMagnitudeRange</c>.
+        /// </summary>
+        /// <param name="input">Coefficients to inspect</param>
+        /// <param name="size">Number of leading coefficients to inspect</param>
+        /// <returns>1 if such a coefficient was found, 0 otherwise</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int DetectInvalidHighbdInput(ReadOnlySpan<int> input, int size)
+        {
+            for (int i = 0; i < size; ++i)
+            {
+                int coeff = input[i];
+
+                // Test both bounds explicitly rather than through Math.Abs: Math.Abs(int.MinValue)
+                // throws OverflowException, and int.MinValue is precisely the kind of value this
+                // check is meant to report as invalid.
+                if (coeff >= HighbdValidTxfmMagnitudeRange || coeff <= -HighbdValidTxfmMagnitudeRange)
+                {
+                    return 1;
+                }
+            }
+
+            return 0;
+        }
+
+        /// <summary>
+        /// Debug-build sanity check that <paramref name="input"/> fits in a signed 16 bit integer.
+        /// </summary>
+        /// <param name="input">Intermediate stage coefficient</param>
+        /// <returns><paramref name="input"/>, unchanged</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static long CheckRange(long input)
+        {
+            // Valid VP9 streams keep every intermediate stage coefficient inside the signed 16 bit
+            // range; only invalid/corrupt streams can produce a value outside of it.
+            Debug.Assert(input >= short.MinValue && input <= short.MaxValue);
+
+            return input;
+        }
+
+        /// <summary>
+        /// Debug-build sanity check that <paramref name="input"/> fits in a signed integer of
+        /// (8 + <paramref name="bd"/>) bits.
+        /// </summary>
+        /// <param name="input">Intermediate stage coefficient</param>
+        /// <param name="bd">Bit depth of the stream</param>
+        /// <returns><paramref name="input"/>, unchanged</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static long HighbdCheckRange(long input, int bd)
+        {
+            // Valid high bit depth VP9 streams keep intermediate stage coefficients within:
+            // -  8 bit: signed 16 bit integer
+            // - 10 bit: signed 18 bit integer
+            // - 12 bit: signed 20 bit integer
+            int upperBound = (1 << (7 + bd)) - 1;
+            int lowerBound = -upperBound - 1;
+
+            Debug.Assert(input >= lowerBound && input <= upperBound);
+
+            return input;
+        }
+
+        /// <summary>
+        /// Range-checks <paramref name="x"/> with <see cref="CheckRange"/>, then truncates it to its
+        /// low 16 bits, sign-extended back to an <see cref="int"/>.
+        /// </summary>
+        /// <param name="x">Value to wrap</param>
+        /// <returns>The wrapped value</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int WrapLow(long x) => (short)CheckRange(x);
+
+        /// <summary>
+        /// Range-checks <paramref name="x"/> with <see cref="HighbdCheckRange"/>, then sign-extends
+        /// its low (8 + <paramref name="bd"/>) bits to an <see cref="int"/>.
+        /// </summary>
+        /// <param name="x">Value to wrap</param>
+        /// <param name="bd">Bit depth of the stream</param>
+        /// <returns>The wrapped value</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int HighbdWrapLow(long x, int bd)
+        {
+            int discardedBits = 24 - bd;
+            int narrowed = (int)HighbdCheckRange(x, bd);
+
+            // Shift the unwanted high bits out, then arithmetic-shift back to sign-extend.
+            return (narrowed << discardedBits) >> discardedBits;
+        }
+
+        /// <summary>
+        /// Adds a residual to a pixel: <paramref name="trans"/> is wrapped with <see cref="WrapLow"/>,
+        /// added to <paramref name="dest"/>, and the sum is clipped by <c>BitUtils.ClipPixel</c>.
+        /// </summary>
+        /// <param name="dest">Current pixel value</param>
+        /// <param name="trans">Residual to add</param>
+        /// <returns>The clipped sum</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static byte ClipPixelAdd(byte dest, long trans)
+        {
+            int residual = WrapLow(trans);
+
+            return BitUtils.ClipPixel(dest + residual);
+        }
+
+        /// <summary>
+        /// High bit depth counterpart of <see cref="ClipPixelAdd"/>: <paramref name="trans"/> is wrapped
+        /// with <see cref="HighbdWrapLow"/>, added to <paramref name="dest"/>, and the sum is clipped by
+        /// <c>BitUtils.ClipPixelHighbd</c>.
+        /// </summary>
+        /// <param name="dest">Current pixel value</param>
+        /// <param name="trans">Residual to add</param>
+        /// <param name="bd">Bit depth of the stream</param>
+        /// <returns>The clipped sum</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static ushort HighbdClipPixelAdd(ushort dest, long trans, int bd)
+        {
+            int residual = HighbdWrapLow(trans, bd);
+
+            return BitUtils.ClipPixelHighbd(dest + residual, bd);
+        }
+
+        /// <summary>
+        /// Applies <c>BitUtils.RoundPowerOfTwo</c> to <paramref name="input"/> with a shift of
+        /// <c>DctConstBits</c>.
+        /// </summary>
+        /// <param name="input">Value to round and shift</param>
+        /// <returns>The rounded, shifted value</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static long DctConstRoundShift(long input) => BitUtils.RoundPowerOfTwo(input, DctConstBits);
+
+        /// <summary>
+        /// 4x4 inverse Walsh-Hadamard transform of 16 coefficients; the result is added to the
+        /// 4x4 pixel block at the start of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">16 coefficients, four per row</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance in elements between vertically adjacent destination pixels</param>
+        public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            // 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+            // 0.5 shifts per pixel.
+            Span<int> rowPass = stackalloc int[16];
+
+            // First pass: each group of four consecutive coefficients.
+            for (int row = 0; row < 4; row++)
+            {
+                int offset = row * 4;
+
+                long a = input[offset + 0] >> UnitQuantShift;
+                long c = input[offset + 1] >> UnitQuantShift;
+                long d = input[offset + 2] >> UnitQuantShift;
+                long b = input[offset + 3] >> UnitQuantShift;
+
+                a += c;
+                d -= b;
+                long e = (a - d) >> 1;
+                b = e - b;
+                c = e - c;
+                a -= b;
+                d += c;
+
+                rowPass[offset + 0] = WrapLow(a);
+                rowPass[offset + 1] = WrapLow(b);
+                rowPass[offset + 2] = WrapLow(c);
+                rowPass[offset + 3] = WrapLow(d);
+            }
+
+            // Second pass: each column of the intermediate block, accumulated into dest.
+            for (int col = 0; col < 4; col++)
+            {
+                long a = rowPass[4 * 0 + col];
+                long c = rowPass[4 * 1 + col];
+                long d = rowPass[4 * 2 + col];
+                long b = rowPass[4 * 3 + col];
+
+                a += c;
+                d -= b;
+                long e = (a - d) >> 1;
+                b = e - b;
+                c = e - c;
+                a -= b;
+                d += c;
+
+                dest[col + stride * 0] = ClipPixelAdd(dest[col + stride * 0], WrapLow(a));
+                dest[col + stride * 1] = ClipPixelAdd(dest[col + stride * 1], WrapLow(b));
+                dest[col + stride * 2] = ClipPixelAdd(dest[col + stride * 2], WrapLow(c));
+                dest[col + stride * 3] = ClipPixelAdd(dest[col + stride * 3], WrapLow(d));
+            }
+        }
+
+        /// <summary>
+        /// 4x4 inverse Walsh-Hadamard transform that only reads <c>input[0]</c>; the result is added
+        /// to the 4x4 pixel block at the start of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients; only the first one is read</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance in elements between vertically adjacent destination pixels</param>
+        public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            Span<int> firstRow = stackalloc int[4];
+
+            // First pass on the single coefficient: one half goes to element 0, the other
+            // half is shared by elements 1..3.
+            long whole = input[0] >> UnitQuantShift;
+            long half = whole >> 1;
+            whole -= half;
+
+            firstRow[0] = WrapLow(whole);
+            int wrappedHalf = WrapLow(half);
+            firstRow[3] = wrappedHalf;
+            firstRow[2] = wrappedHalf;
+            firstRow[1] = wrappedHalf;
+
+            // Second pass: split every element the same way down its column.
+            for (int col = 0; col < 4; col++)
+            {
+                long lower = firstRow[col] >> 1;
+                long upper = firstRow[col] - lower;
+
+                dest[col + stride * 0] = ClipPixelAdd(dest[col + stride * 0], upper);
+                dest[col + stride * 1] = ClipPixelAdd(dest[col + stride * 1], lower);
+                dest[col + stride * 2] = ClipPixelAdd(dest[col + stride * 2], lower);
+                dest[col + stride * 3] = ClipPixelAdd(dest[col + stride * 3], lower);
+            }
+        }
+
+        /// <summary>
+        /// 4-point 1-D transform built on the <c>SinPi*_9</c> constants (presumably the inverse ADST,
+        /// going by the name). Reads four values and writes four values.
+        /// </summary>
+        /// <param name="input">Four input coefficients</param>
+        /// <param name="output">Receives the four transformed values</param>
+        public static void Iadst4(ReadOnlySpan<int> input, Span<int> output)
+        {
+            int in0 = input[0];
+            int in1 = input[1];
+            int in2 = input[2];
+            int in3 = input[3];
+
+            // All-zero input gives all-zero output; skip the arithmetic.
+            if ((in0 | in1 | in2 | in3) == 0)
+            {
+                output.Slice(0, 4).Clear();
+                return;
+            }
+
+            // 32-bit result is enough for the following multiplications.
+            long prod0 = SinPi1_9 * in0;
+            long prod1 = SinPi2_9 * in0;
+            long prod2 = SinPi3_9 * in1;
+            long prod3 = SinPi4_9 * in2;
+            long prod4 = SinPi1_9 * in2;
+            long prod5 = SinPi2_9 * in3;
+            long prod6 = SinPi4_9 * in3;
+            long mixed = WrapLow(in0 - in2 + in3);
+
+            long sumA = prod0 + prod3 + prod5;
+            long sumB = prod1 - prod4 - prod6;
+            long sumC = prod2;
+            long sumD = SinPi3_9 * mixed;
+
+            // 1-D transform scaling factor is sqrt(2).
+            // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+            // + 1b (addition) = 29b.
+            // Hence the output bit depth is 15b.
+            output[0] = WrapLow(DctConstRoundShift(sumA + sumC));
+            output[1] = WrapLow(DctConstRoundShift(sumB + sumC));
+            output[2] = WrapLow(DctConstRoundShift(sumD));
+            output[3] = WrapLow(DctConstRoundShift(sumA + sumB - sumC));
+        }
+
+        /// <summary>
+        /// 4-point 1-D transform built on the <c>CosPi*_64</c> constants (presumably the inverse DCT,
+        /// going by the name). Each input is truncated to 16 bits before use.
+        /// </summary>
+        /// <param name="input">Four input coefficients</param>
+        /// <param name="output">Receives the four transformed values</param>
+        public static void Idct4(ReadOnlySpan<int> input, Span<int> output)
+        {
+            short in0 = (short)input[0];
+            short in1 = (short)input[1];
+            short in2 = (short)input[2];
+            short in3 = (short)input[3];
+
+            // stage 1: elements 0 and 2 feed the first pair, elements 1 and 3 the second pair
+            long sum02 = (in0 + in2) * CosPi16_64;
+            long diff02 = (in0 - in2) * CosPi16_64;
+            short even0 = (short)WrapLow(DctConstRoundShift(sum02));
+            short even1 = (short)WrapLow(DctConstRoundShift(diff02));
+
+            long rotLow = in1 * CosPi24_64 - in3 * CosPi8_64;
+            long rotHigh = in1 * CosPi8_64 + in3 * CosPi24_64;
+            short odd0 = (short)WrapLow(DctConstRoundShift(rotLow));
+            short odd1 = (short)WrapLow(DctConstRoundShift(rotHigh));
+
+            // stage 2: final butterfly
+            output[0] = WrapLow(even0 + odd1);
+            output[1] = WrapLow(even1 + odd0);
+            output[2] = WrapLow(even1 - odd0);
+            output[3] = WrapLow(even0 - odd1);
+        }
+
+        /// <summary>
+        /// 4x4 2-D transform: runs <see cref="Idct4"/> over each row of <paramref name="input"/>, then
+        /// over each column of the intermediate result, and adds the rounded output to the 4x4
+        /// pixel block at the start of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">16 coefficients, four per row</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance in elements between vertically adjacent destination pixels</param>
+        public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            const int Size = 4;
+
+            Span<int> rowPass = stackalloc int[Size * Size];
+            Span<int> column = stackalloc int[Size];
+            Span<int> columnOut = stackalloc int[Size];
+
+            // Rows
+            for (int row = 0; row < Size; ++row)
+            {
+                Idct4(input.Slice(row * Size), rowPass.Slice(row * Size));
+            }
+
+            // Columns
+            for (int col = 0; col < Size; ++col)
+            {
+                for (int row = 0; row < Size; ++row)
+                {
+                    column[row] = rowPass[row * Size + col];
+                }
+
+                Idct4(column, columnOut);
+
+                for (int row = 0; row < Size; ++row)
+                {
+                    int pixel = row * stride + col;
+                    dest[pixel] = ClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(columnOut[row], 4));
+                }
+            }
+        }
+
+        /// <summary>
+        /// 4x4 transform that only reads <c>input[0]</c>: a single value is derived from it and
+        /// added to every pixel of the 4x4 block at the start of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients; only the first one is read</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance in elements between vertically adjacent destination pixels</param>
+        public static void Idct4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            // Two successive CosPi16_64 scalings, one per transform pass.
+            int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
+            output = WrapLow(DctConstRoundShift(output * CosPi16_64));
+
+            long a1 = BitUtils.RoundPowerOfTwo(output, 4);
+
+            for (int row = 0; row < 4; row++)
+            {
+                // Slice exactly the four pixels of this row. Advancing dest by a whole stride after
+                // the last row would throw when the span ends inside that row's stride, even though
+                // every pixel that has to be written is in range.
+                Span<byte> line = dest.Slice(row * stride, 4);
+
+                for (int col = 0; col < 4; col++)
+                {
+                    line[col] = ClipPixelAdd(line[col], a1);
+                }
+            }
+        }
+
+        /// <summary>
+        /// 8-point 1-D transform built on the <c>CosPi*_64</c> constants (presumably the inverse ADST,
+        /// going by the name). Reads eight values and writes eight values.
+        /// </summary>
+        /// <param name="input">Eight input coefficients</param>
+        /// <param name="output">Receives the eight transformed values</param>
+        public static void Iadst8(ReadOnlySpan<int> input, Span<int> output)
+        {
+            // The products below are computed in 64 bit and then narrowed to int.
+            int s0, s1, s2, s3, s4, s5, s6, s7;
+            // The inputs are loaded in a permuted order.
+            long x0 = input[7];
+            long x1 = input[0];
+            long x2 = input[5];
+            long x3 = input[2];
+            long x4 = input[3];
+            long x5 = input[4];
+            long x6 = input[1];
+            long x7 = input[6];
+
+            // All-zero input gives all-zero output; skip the arithmetic.
+            if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0)
+            {
+                output.Slice(0, 8).Fill(0);
+                return;
+            }
+
+            // stage 1
+            s0 = (int)(CosPi2_64 * x0 + CosPi30_64 * x1);
+            s1 = (int)(CosPi30_64 * x0 - CosPi2_64 * x1);
+            s2 = (int)(CosPi10_64 * x2 + CosPi22_64 * x3);
+            s3 = (int)(CosPi22_64 * x2 - CosPi10_64 * x3);
+            s4 = (int)(CosPi18_64 * x4 + CosPi14_64 * x5);
+            s5 = (int)(CosPi14_64 * x4 - CosPi18_64 * x5);
+            s6 = (int)(CosPi26_64 * x6 + CosPi6_64 * x7);
+            s7 = (int)(CosPi6_64 * x6 - CosPi26_64 * x7);
+
+            x0 = WrapLow(DctConstRoundShift(s0 + s4));
+            x1 = WrapLow(DctConstRoundShift(s1 + s5));
+            x2 = WrapLow(DctConstRoundShift(s2 + s6));
+            x3 = WrapLow(DctConstRoundShift(s3 + s7));
+            x4 = WrapLow(DctConstRoundShift(s0 - s4));
+            x5 = WrapLow(DctConstRoundShift(s1 - s5));
+            x6 = WrapLow(DctConstRoundShift(s2 - s6));
+            x7 = WrapLow(DctConstRoundShift(s3 - s7));
+
+            // stage 2
+            // x0..x3 pass through unscaled; only x4..x7 are rotated.
+            s0 = (int)x0;
+            s1 = (int)x1;
+            s2 = (int)x2;
+            s3 = (int)x3;
+            s4 = (int)(CosPi8_64 * x4 + CosPi24_64 * x5);
+            s5 = (int)(CosPi24_64 * x4 - CosPi8_64 * x5);
+            s6 = (int)(-CosPi24_64 * x6 + CosPi8_64 * x7);
+            s7 = (int)(CosPi8_64 * x6 + CosPi24_64 * x7);
+
+            x0 = WrapLow(s0 + s2);
+            x1 = WrapLow(s1 + s3);
+            x2 = WrapLow(s0 - s2);
+            x3 = WrapLow(s1 - s3);
+            x4 = WrapLow(DctConstRoundShift(s4 + s6));
+            x5 = WrapLow(DctConstRoundShift(s5 + s7));
+            x6 = WrapLow(DctConstRoundShift(s4 - s6));
+            x7 = WrapLow(DctConstRoundShift(s5 - s7));
+
+            // stage 3
+            // Only x2, x3, x6 and x7 are updated here; the other four are final.
+            s2 = (int)(CosPi16_64 * (x2 + x3));
+            s3 = (int)(CosPi16_64 * (x2 - x3));
+            s6 = (int)(CosPi16_64 * (x6 + x7));
+            s7 = (int)(CosPi16_64 * (x6 - x7));
+
+            x2 = WrapLow(DctConstRoundShift(s2));
+            x3 = WrapLow(DctConstRoundShift(s3));
+            x6 = WrapLow(DctConstRoundShift(s6));
+            x7 = WrapLow(DctConstRoundShift(s7));
+
+            // Results are written in a permuted order; every odd-indexed output is negated.
+            output[0] = WrapLow(x0);
+            output[1] = WrapLow(-x4);
+            output[2] = WrapLow(x6);
+            output[3] = WrapLow(-x2);
+            output[4] = WrapLow(x3);
+            output[5] = WrapLow(-x7);
+            output[6] = WrapLow(x5);
+            output[7] = WrapLow(-x1);
+        }
+
+        /// <summary>
+        /// 8-point 1-D transform built on the <c>CosPi*_64</c> constants (presumably the inverse DCT,
+        /// going by the name). Each input is truncated to 16 bits before use.
+        /// </summary>
+        /// <param name="input">Eight input coefficients</param>
+        /// <param name="output">Receives the eight transformed values</param>
+        public static void Idct8(ReadOnlySpan<int> input, Span<int> output)
+        {
+            // Two scratch buffers used alternately: each stage reads one and writes the other.
+            Span<short> step1 = stackalloc short[8];
+            Span<short> step2 = stackalloc short[8];
+            long temp1, temp2;
+
+            // stage 1
+            // Even-indexed inputs are copied (reordered) into step1[0..3];
+            // odd-indexed inputs are rotated in pairs into step1[4..7].
+            step1[0] = (short)input[0];
+            step1[2] = (short)input[4];
+            step1[1] = (short)input[2];
+            step1[3] = (short)input[6];
+            temp1 = (short)input[1] * CosPi28_64 - (short)input[7] * CosPi4_64;
+            temp2 = (short)input[1] * CosPi4_64 + (short)input[7] * CosPi28_64;
+            step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = (short)input[5] * CosPi12_64 - (short)input[3] * CosPi20_64;
+            temp2 = (short)input[5] * CosPi20_64 + (short)input[3] * CosPi12_64;
+            step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            // stage 2
+            temp1 = (step1[0] + step1[2]) * CosPi16_64;
+            temp2 = (step1[0] - step1[2]) * CosPi16_64;
+            step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = step1[1] * CosPi24_64 - step1[3] * CosPi8_64;
+            temp2 = step1[1] * CosPi8_64 + step1[3] * CosPi24_64;
+            step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
+            step2[4] = (short)WrapLow(step1[4] + step1[5]);
+            step2[5] = (short)WrapLow(step1[4] - step1[5]);
+            step2[6] = (short)WrapLow(-step1[6] + step1[7]);
+            step2[7] = (short)WrapLow(step1[6] + step1[7]);
+
+            // stage 3
+            // Elements 4 and 7 pass through unchanged; 5 and 6 are rotated by CosPi16_64.
+            step1[0] = (short)WrapLow(step2[0] + step2[3]);
+            step1[1] = (short)WrapLow(step2[1] + step2[2]);
+            step1[2] = (short)WrapLow(step2[1] - step2[2]);
+            step1[3] = (short)WrapLow(step2[0] - step2[3]);
+            step1[4] = step2[4];
+            temp1 = (step2[6] - step2[5]) * CosPi16_64;
+            temp2 = (step2[5] + step2[6]) * CosPi16_64;
+            step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+            step1[7] = step2[7];
+
+            // stage 4
+            // Final butterfly: output[k] and output[7 - k] are the sum and difference of
+            // step1[k] and step1[7 - k].
+            output[0] = WrapLow(step1[0] + step1[7]);
+            output[1] = WrapLow(step1[1] + step1[6]);
+            output[2] = WrapLow(step1[2] + step1[5]);
+            output[3] = WrapLow(step1[3] + step1[4]);
+            output[4] = WrapLow(step1[3] - step1[4]);
+            output[5] = WrapLow(step1[2] - step1[5]);
+            output[6] = WrapLow(step1[1] - step1[6]);
+            output[7] = WrapLow(step1[0] - step1[7]);
+        }
+
+        /// <summary>
+        /// 8x8 2-D transform: runs <see cref="Idct8"/> over all eight rows of <paramref name="input"/>,
+        /// then over each column of the intermediate result, and adds the rounded output to the
+        /// 8x8 pixel block at the start of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">64 coefficients, eight per row</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance in elements between vertically adjacent destination pixels</param>
+        public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            const int Size = 8;
+
+            Span<int> rowPass = stackalloc int[Size * Size];
+            Span<int> column = stackalloc int[Size];
+            Span<int> columnOut = stackalloc int[Size];
+
+            // First transform rows
+            for (int row = 0; row < Size; ++row)
+            {
+                Idct8(input.Slice(row * Size), rowPass.Slice(row * Size));
+            }
+
+            // Then transform columns
+            for (int col = 0; col < Size; ++col)
+            {
+                for (int row = 0; row < Size; ++row)
+                {
+                    column[row] = rowPass[row * Size + col];
+                }
+
+                Idct8(column, columnOut);
+
+                for (int row = 0; row < Size; ++row)
+                {
+                    int pixel = row * stride + col;
+                    dest[pixel] = ClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(columnOut[row], 5));
+                }
+            }
+        }
+
+        /// <summary>
+        /// Reduced 8x8 2-D transform: runs <see cref="Idct8"/> over only the first four rows of
+        /// <paramref name="input"/> and treats the remaining rows as zero, then transforms every column
+        /// and adds the rounded output to the 8x8 pixel block at the start of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients, eight per row; only the first four rows are read</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance in elements between vertically adjacent destination pixels</param>
+        public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            int i, j;
+            Span<int> output = stackalloc int[8 * 8];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[8];
+            Span<int> tempOut = stackalloc int[8];
+
+            // Rows 4..7 are never written by the row pass but are read by the column pass.
+            // The content of stackalloc memory is not guaranteed by the language (and is not
+            // zeroed under [SkipLocalsInit]), so zero the buffer explicitly.
+            output.Clear();
+
+            // First transform rows
+            // Only first 4 row has non-zero coefs
+            for (i = 0; i < 4; ++i)
+            {
+                Idct8(input, outptr);
+                input = input.Slice(8);
+                outptr = outptr.Slice(8);
+            }
+
+            // Then transform columns
+            for (i = 0; i < 8; ++i)
+            {
+                for (j = 0; j < 8; ++j)
+                {
+                    tempIn[j] = output[j * 8 + i];
+                }
+
+                Idct8(tempIn, tempOut);
+                for (j = 0; j < 8; ++j)
+                {
+                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i],
+                                                          BitUtils.RoundPowerOfTwo(tempOut[j], 5));
+                }
+            }
+        }
+
+        /// <summary>
+        /// 8x8 transform that only reads <c>input[0]</c>: a single value is derived from it and
+        /// added to every pixel of the 8x8 block at the start of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients; only the first one is read</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance in elements between vertically adjacent destination pixels</param>
+        public static void Idct8x81Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            // Two successive CosPi16_64 scalings, one per transform pass.
+            int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
+            output = WrapLow(DctConstRoundShift(output * CosPi16_64));
+
+            long a1 = BitUtils.RoundPowerOfTwo(output, 5);
+
+            for (int row = 0; row < 8; ++row)
+            {
+                // Slice exactly the eight pixels of this row. Advancing dest by a whole stride after
+                // the last row would throw when the span ends inside that row's stride, even though
+                // every pixel that has to be written is in range.
+                Span<byte> line = dest.Slice(row * stride, 8);
+
+                for (int col = 0; col < 8; ++col)
+                {
+                    line[col] = ClipPixelAdd(line[col], a1);
+                }
+            }
+        }
+
+        /// <summary>
+        /// 16-point 1-D transform built on the <c>CosPi*_64</c> constants (presumably the inverse ADST,
+        /// going by the name). Reads sixteen values and writes sixteen values.
+        /// </summary>
+        /// <param name="input">Sixteen input coefficients</param>
+        /// <param name="output">Receives the sixteen transformed values</param>
+        public static void Iadst16(ReadOnlySpan<int> input, Span<int> output)
+        {
+            long s0, s1, s2, s3, s4, s5, s6, s7, s8;
+            long s9, s10, s11, s12, s13, s14, s15;
+            // The inputs are loaded in a permuted order.
+            long x0 = input[15];
+            long x1 = input[0];
+            long x2 = input[13];
+            long x3 = input[2];
+            long x4 = input[11];
+            long x5 = input[4];
+            long x6 = input[9];
+            long x7 = input[6];
+            long x8 = input[7];
+            long x9 = input[8];
+            long x10 = input[5];
+            long x11 = input[10];
+            long x12 = input[3];
+            long x13 = input[12];
+            long x14 = input[1];
+            long x15 = input[14];
+
+            // All-zero input gives all-zero output; skip the arithmetic.
+            if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0)
+            {
+                output.Slice(0, 16).Fill(0);
+                return;
+            }
+
+            // stage 1
+            s0 = x0 * CosPi1_64 + x1 * CosPi31_64;
+            s1 = x0 * CosPi31_64 - x1 * CosPi1_64;
+            s2 = x2 * CosPi5_64 + x3 * CosPi27_64;
+            s3 = x2 * CosPi27_64 - x3 * CosPi5_64;
+            s4 = x4 * CosPi9_64 + x5 * CosPi23_64;
+            s5 = x4 * CosPi23_64 - x5 * CosPi9_64;
+            s6 = x6 * CosPi13_64 + x7 * CosPi19_64;
+            s7 = x6 * CosPi19_64 - x7 * CosPi13_64;
+            s8 = x8 * CosPi17_64 + x9 * CosPi15_64;
+            s9 = x8 * CosPi15_64 - x9 * CosPi17_64;
+            s10 = x10 * CosPi21_64 + x11 * CosPi11_64;
+            s11 = x10 * CosPi11_64 - x11 * CosPi21_64;
+            s12 = x12 * CosPi25_64 + x13 * CosPi7_64;
+            s13 = x12 * CosPi7_64 - x13 * CosPi25_64;
+            s14 = x14 * CosPi29_64 + x15 * CosPi3_64;
+            s15 = x14 * CosPi3_64 - x15 * CosPi29_64;
+
+            x0 = WrapLow(DctConstRoundShift(s0 + s8));
+            x1 = WrapLow(DctConstRoundShift(s1 + s9));
+            x2 = WrapLow(DctConstRoundShift(s2 + s10));
+            x3 = WrapLow(DctConstRoundShift(s3 + s11));
+            x4 = WrapLow(DctConstRoundShift(s4 + s12));
+            x5 = WrapLow(DctConstRoundShift(s5 + s13));
+            x6 = WrapLow(DctConstRoundShift(s6 + s14));
+            x7 = WrapLow(DctConstRoundShift(s7 + s15));
+            x8 = WrapLow(DctConstRoundShift(s0 - s8));
+            x9 = WrapLow(DctConstRoundShift(s1 - s9));
+            x10 = WrapLow(DctConstRoundShift(s2 - s10));
+            x11 = WrapLow(DctConstRoundShift(s3 - s11));
+            x12 = WrapLow(DctConstRoundShift(s4 - s12));
+            x13 = WrapLow(DctConstRoundShift(s5 - s13));
+            x14 = WrapLow(DctConstRoundShift(s6 - s14));
+            x15 = WrapLow(DctConstRoundShift(s7 - s15));
+
+            // stage 2
+            // x0..x7 pass through unscaled; only x8..x15 are rotated.
+            s0 = x0;
+            s1 = x1;
+            s2 = x2;
+            s3 = x3;
+            s4 = x4;
+            s5 = x5;
+            s6 = x6;
+            s7 = x7;
+            s8 = x8 * CosPi4_64 + x9 * CosPi28_64;
+            s9 = x8 * CosPi28_64 - x9 * CosPi4_64;
+            s10 = x10 * CosPi20_64 + x11 * CosPi12_64;
+            s11 = x10 * CosPi12_64 - x11 * CosPi20_64;
+            s12 = -x12 * CosPi28_64 + x13 * CosPi4_64;
+            s13 = x12 * CosPi4_64 + x13 * CosPi28_64;
+            s14 = -x14 * CosPi12_64 + x15 * CosPi20_64;
+            s15 = x14 * CosPi20_64 + x15 * CosPi12_64;
+
+            x0 = WrapLow(s0 + s4);
+            x1 = WrapLow(s1 + s5);
+            x2 = WrapLow(s2 + s6);
+            x3 = WrapLow(s3 + s7);
+            x4 = WrapLow(s0 - s4);
+            x5 = WrapLow(s1 - s5);
+            x6 = WrapLow(s2 - s6);
+            x7 = WrapLow(s3 - s7);
+            x8 = WrapLow(DctConstRoundShift(s8 + s12));
+            x9 = WrapLow(DctConstRoundShift(s9 + s13));
+            x10 = WrapLow(DctConstRoundShift(s10 + s14));
+            x11 = WrapLow(DctConstRoundShift(s11 + s15));
+            x12 = WrapLow(DctConstRoundShift(s8 - s12));
+            x13 = WrapLow(DctConstRoundShift(s9 - s13));
+            x14 = WrapLow(DctConstRoundShift(s10 - s14));
+            x15 = WrapLow(DctConstRoundShift(s11 - s15));
+
+            // stage 3
+            // x0..x3 and x8..x11 pass through unscaled; x4..x7 and x12..x15 are rotated.
+            s0 = x0;
+            s1 = x1;
+            s2 = x2;
+            s3 = x3;
+            s4 = x4 * CosPi8_64 + x5 * CosPi24_64;
+            s5 = x4 * CosPi24_64 - x5 * CosPi8_64;
+            s6 = -x6 * CosPi24_64 + x7 * CosPi8_64;
+            s7 = x6 * CosPi8_64 + x7 * CosPi24_64;
+            s8 = x8;
+            s9 = x9;
+            s10 = x10;
+            s11 = x11;
+            s12 = x12 * CosPi8_64 + x13 * CosPi24_64;
+            s13 = x12 * CosPi24_64 - x13 * CosPi8_64;
+            s14 = -x14 * CosPi24_64 + x15 * CosPi8_64;
+            s15 = x14 * CosPi8_64 + x15 * CosPi24_64;
+
+            x0 = WrapLow(s0 + s2);
+            x1 = WrapLow(s1 + s3);
+            x2 = WrapLow(s0 - s2);
+            x3 = WrapLow(s1 - s3);
+            x4 = WrapLow(DctConstRoundShift(s4 + s6));
+            x5 = WrapLow(DctConstRoundShift(s5 + s7));
+            x6 = WrapLow(DctConstRoundShift(s4 - s6));
+            x7 = WrapLow(DctConstRoundShift(s5 - s7));
+            x8 = WrapLow(s8 + s10);
+            x9 = WrapLow(s9 + s11);
+            x10 = WrapLow(s8 - s10);
+            x11 = WrapLow(s9 - s11);
+            x12 = WrapLow(DctConstRoundShift(s12 + s14));
+            x13 = WrapLow(DctConstRoundShift(s13 + s15));
+            x14 = WrapLow(DctConstRoundShift(s12 - s14));
+            x15 = WrapLow(DctConstRoundShift(s13 - s15));
+
+            // stage 4
+            // Only x2, x3, x6, x7, x10, x11, x14 and x15 are updated here; the rest are final.
+            s2 = (-CosPi16_64) * (x2 + x3);
+            s3 = CosPi16_64 * (x2 - x3);
+            s6 = CosPi16_64 * (x6 + x7);
+            s7 = CosPi16_64 * (-x6 + x7);
+            s10 = CosPi16_64 * (x10 + x11);
+            s11 = CosPi16_64 * (-x10 + x11);
+            s14 = (-CosPi16_64) * (x14 + x15);
+            s15 = CosPi16_64 * (x14 - x15);
+
+            x2 = WrapLow(DctConstRoundShift(s2));
+            x3 = WrapLow(DctConstRoundShift(s3));
+            x6 = WrapLow(DctConstRoundShift(s6));
+            x7 = WrapLow(DctConstRoundShift(s7));
+            x10 = WrapLow(DctConstRoundShift(s10));
+            x11 = WrapLow(DctConstRoundShift(s11));
+            x14 = WrapLow(DctConstRoundShift(s14));
+            x15 = WrapLow(DctConstRoundShift(s15));
+
+            // Results are written in a permuted order; outputs 1, 3, 13 and 15 are negated.
+            output[0] = WrapLow(x0);
+            output[1] = WrapLow(-x8);
+            output[2] = WrapLow(x12);
+            output[3] = WrapLow(-x4);
+            output[4] = WrapLow(x6);
+            output[5] = WrapLow(x14);
+            output[6] = WrapLow(x10);
+            output[7] = WrapLow(x2);
+            output[8] = WrapLow(x3);
+            output[9] = WrapLow(x11);
+            output[10] = WrapLow(x15);
+            output[11] = WrapLow(x7);
+            output[12] = WrapLow(x5);
+            output[13] = WrapLow(-x13);
+            output[14] = WrapLow(x9);
+            output[15] = WrapLow(-x1);
+        }
+
+        public static void Idct16(ReadOnlySpan<int> input, Span<int> output)
+        {
+            Span<short> step1 = stackalloc short[16];
+            Span<short> step2 = stackalloc short[16];
+            long temp1, temp2;
+
+            // stage 1
+            step1[0] = (short)input[0 / 2];
+            step1[1] = (short)input[16 / 2];
+            step1[2] = (short)input[8 / 2];
+            step1[3] = (short)input[24 / 2];
+            step1[4] = (short)input[4 / 2];
+            step1[5] = (short)input[20 / 2];
+            step1[6] = (short)input[12 / 2];
+            step1[7] = (short)input[28 / 2];
+            step1[8] = (short)input[2 / 2];
+            step1[9] = (short)input[18 / 2];
+            step1[10] = (short)input[10 / 2];
+            step1[11] = (short)input[26 / 2];
+            step1[12] = (short)input[6 / 2];
+            step1[13] = (short)input[22 / 2];
+            step1[14] = (short)input[14 / 2];
+            step1[15] = (short)input[30 / 2];
+
+            // stage 2
+            step2[0] = step1[0];
+            step2[1] = step1[1];
+            step2[2] = step1[2];
+            step2[3] = step1[3];
+            step2[4] = step1[4];
+            step2[5] = step1[5];
+            step2[6] = step1[6];
+            step2[7] = step1[7];
+
+            temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64;
+            temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64;
+            step2[8] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[15] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64;
+            temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64;
+            step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64;
+            temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64;
+            step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64;
+            temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64;
+            step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            // stage 3
+            step1[0] = step2[0];
+            step1[1] = step2[1];
+            step1[2] = step2[2];
+            step1[3] = step2[3];
+
+            temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64;
+            temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64;
+            step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64;
+            temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64;
+            step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            step1[8] = (short)WrapLow(step2[8] + step2[9]);
+            step1[9] = (short)WrapLow(step2[8] - step2[9]);
+            step1[10] = (short)WrapLow(-step2[10] + step2[11]);
+            step1[11] = (short)WrapLow(step2[10] + step2[11]);
+            step1[12] = (short)WrapLow(step2[12] + step2[13]);
+            step1[13] = (short)WrapLow(step2[12] - step2[13]);
+            step1[14] = (short)WrapLow(-step2[14] + step2[15]);
+            step1[15] = (short)WrapLow(step2[14] + step2[15]);
+
+            // stage 4
+            temp1 = (step1[0] + step1[1]) * CosPi16_64;
+            temp2 = (step1[0] - step1[1]) * CosPi16_64;
+            step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64;
+            temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64;
+            step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
+            step2[4] = (short)WrapLow(step1[4] + step1[5]);
+            step2[5] = (short)WrapLow(step1[4] - step1[5]);
+            step2[6] = (short)WrapLow(-step1[6] + step1[7]);
+            step2[7] = (short)WrapLow(step1[6] + step1[7]);
+
+            step2[8] = step1[8];
+            step2[15] = step1[15];
+            temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64;
+            temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64;
+            step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64;
+            temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64;
+            step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+            step2[11] = step1[11];
+            step2[12] = step1[12];
+
+            // stage 5
+            step1[0] = (short)WrapLow(step2[0] + step2[3]);
+            step1[1] = (short)WrapLow(step2[1] + step2[2]);
+            step1[2] = (short)WrapLow(step2[1] - step2[2]);
+            step1[3] = (short)WrapLow(step2[0] - step2[3]);
+            step1[4] = step2[4];
+            temp1 = (step2[6] - step2[5]) * CosPi16_64;
+            temp2 = (step2[5] + step2[6]) * CosPi16_64;
+            step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+            step1[7] = step2[7];
+
+            step1[8] = (short)WrapLow(step2[8] + step2[11]);
+            step1[9] = (short)WrapLow(step2[9] + step2[10]);
+            step1[10] = (short)WrapLow(step2[9] - step2[10]);
+            step1[11] = (short)WrapLow(step2[8] - step2[11]);
+            step1[12] = (short)WrapLow(-step2[12] + step2[15]);
+            step1[13] = (short)WrapLow(-step2[13] + step2[14]);
+            step1[14] = (short)WrapLow(step2[13] + step2[14]);
+            step1[15] = (short)WrapLow(step2[12] + step2[15]);
+
+            // stage 6
+            step2[0] = (short)WrapLow(step1[0] + step1[7]);
+            step2[1] = (short)WrapLow(step1[1] + step1[6]);
+            step2[2] = (short)WrapLow(step1[2] + step1[5]);
+            step2[3] = (short)WrapLow(step1[3] + step1[4]);
+            step2[4] = (short)WrapLow(step1[3] - step1[4]);
+            step2[5] = (short)WrapLow(step1[2] - step1[5]);
+            step2[6] = (short)WrapLow(step1[1] - step1[6]);
+            step2[7] = (short)WrapLow(step1[0] - step1[7]);
+            step2[8] = step1[8];
+            step2[9] = step1[9];
+            temp1 = (-step1[10] + step1[13]) * CosPi16_64;
+            temp2 = (step1[10] + step1[13]) * CosPi16_64;
+            step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = (-step1[11] + step1[12]) * CosPi16_64;
+            temp2 = (step1[11] + step1[12]) * CosPi16_64;
+            step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
+            step2[14] = step1[14];
+            step2[15] = step1[15];
+
+            // stage 7
+            output[0] = WrapLow(step2[0] + step2[15]);
+            output[1] = WrapLow(step2[1] + step2[14]);
+            output[2] = WrapLow(step2[2] + step2[13]);
+            output[3] = WrapLow(step2[3] + step2[12]);
+            output[4] = WrapLow(step2[4] + step2[11]);
+            output[5] = WrapLow(step2[5] + step2[10]);
+            output[6] = WrapLow(step2[6] + step2[9]);
+            output[7] = WrapLow(step2[7] + step2[8]);
+            output[8] = WrapLow(step2[7] - step2[8]);
+            output[9] = WrapLow(step2[6] - step2[9]);
+            output[10] = WrapLow(step2[5] - step2[10]);
+            output[11] = WrapLow(step2[4] - step2[11]);
+            output[12] = WrapLow(step2[3] - step2[12]);
+            output[13] = WrapLow(step2[2] - step2[13]);
+            output[14] = WrapLow(step2[1] - step2[14]);
+            output[15] = WrapLow(step2[0] - step2[15]);
+        }
+
+        /// <summary>
+        /// Full 16x16 inverse transform: runs <c>Idct16</c> over all 16 rows, then over all 16 columns,
+        /// and adds the rounded result (shift by 6) onto the pixels in <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order; 256 elements are consumed.</param>
+        /// <param name="dest">Destination pixels, updated in place.</param>
+        /// <param name="stride">Distance, in elements, between two consecutive rows of <paramref name="dest"/>.</param>
+        public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            const int Dim = 16;
+
+            Span<int> rowPass = stackalloc int[Dim * Dim];
+            Span<int> rowOut = rowPass;
+            Span<int> column = stackalloc int[Dim];
+            Span<int> columnResult = stackalloc int[Dim];
+
+            // Pass 1: transform every row of coefficients into the scratch buffer.
+            for (int row = 0; row < Dim; row++)
+            {
+                Idct16(input, rowOut);
+                input = input.Slice(Dim);
+                rowOut = rowOut.Slice(Dim);
+            }
+
+            // Pass 2: gather each column of the scratch buffer, transform it, and accumulate into dest.
+            for (int x = 0; x < Dim; x++)
+            {
+                for (int y = 0; y < Dim; y++)
+                {
+                    column[y] = rowPass[y * Dim + x];
+                }
+
+                Idct16(column, columnResult);
+
+                for (int y = 0; y < Dim; y++)
+                {
+                    int pixel = y * stride + x;
+                    dest[pixel] = ClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(columnResult[y], 6));
+                }
+            }
+        }
+
+        /// <summary>
+        /// 16x16 inverse transform for blocks whose non-zero coefficients all lie in the upper-left
+        /// 8x8 area: only the first 8 rows go through the row pass, then all 16 columns are
+        /// transformed and the rounded result (shift by 6) is added onto <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order; the first 8 rows (128 elements) are consumed.</param>
+        /// <param name="dest">Destination pixels, updated in place.</param>
+        /// <param name="stride">Distance, in elements, between two consecutive rows of <paramref name="dest"/>.</param>
+        public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            int i, j;
+            Span<int> output = stackalloc int[16 * 16];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[16];
+            Span<int> tempOut = stackalloc int[16];
+
+            // Rows 8..15 are never written by the row pass but are read by the column pass.
+            // stackalloc memory is not guaranteed to be zeroed (e.g. under [SkipLocalsInit]),
+            // so clear the untouched part explicitly instead of relying on it.
+            output.Slice(8 * 16).Clear();
+
+            // First transform rows. Since all non-zero dct coefficients are in
+            // upper-left 8x8 area, we only need to calculate first 8 rows here.
+            for (i = 0; i < 8; ++i)
+            {
+                Idct16(input, outptr);
+                input = input.Slice(16);
+                outptr = outptr.Slice(16);
+            }
+
+            // Then transform columns
+            for (i = 0; i < 16; ++i)
+            {
+                for (j = 0; j < 16; ++j)
+                {
+                    tempIn[j] = output[j * 16 + i];
+                }
+
+                Idct16(tempIn, tempOut);
+                for (j = 0; j < 16; ++j)
+                {
+                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
+                }
+            }
+        }
+
+        /// <summary>
+        /// 16x16 inverse transform for blocks whose non-zero coefficients all lie in the upper-left
+        /// 4x4 area: only the first 4 rows go through the row pass, then all 16 columns are
+        /// transformed and the rounded result (shift by 6) is added onto <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order; the first 4 rows (64 elements) are consumed.</param>
+        /// <param name="dest">Destination pixels, updated in place.</param>
+        /// <param name="stride">Distance, in elements, between two consecutive rows of <paramref name="dest"/>.</param>
+        public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            int i, j;
+            Span<int> output = stackalloc int[16 * 16];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[16];
+            Span<int> tempOut = stackalloc int[16];
+
+            // Rows 4..15 are never written by the row pass but are read by the column pass.
+            // stackalloc memory is not guaranteed to be zeroed (e.g. under [SkipLocalsInit]),
+            // so clear the untouched part explicitly instead of relying on it.
+            output.Slice(4 * 16).Clear();
+
+            // First transform rows. Since all non-zero dct coefficients are in
+            // upper-left 4x4 area, we only need to calculate first 4 rows here.
+            for (i = 0; i < 4; ++i)
+            {
+                Idct16(input, outptr);
+                input = input.Slice(16);
+                outptr = outptr.Slice(16);
+            }
+
+            // Then transform columns
+            for (i = 0; i < 16; ++i)
+            {
+                for (j = 0; j < 16; ++j)
+                {
+                    tempIn[j] = output[j * 16 + i];
+                }
+
+                Idct16(tempIn, tempOut);
+                for (j = 0; j < 16; ++j)
+                {
+                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
+                }
+            }
+        }
+
+        /// <summary>
+        /// 16x16 inverse transform for a block where only <c>input[0]</c> is used: a single value is
+        /// derived from it and added to every pixel of the 16x16 area of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients; only element 0 is read (truncated to 16 bits).</param>
+        /// <param name="dest">Destination pixels, updated in place.</param>
+        /// <param name="stride">Distance, in elements, between two consecutive rows of <paramref name="dest"/>.</param>
+        public static void Idct16x161Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            int i, j;
+            long a1;
+            int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
+
+            // The constant is applied twice: once per pass (rows, then columns) of the 2-D transform.
+            output = WrapLow(DctConstRoundShift(output * CosPi16_64));
+            a1 = BitUtils.RoundPowerOfTwo(output, 6);
+
+            for (j = 0; j < 16; ++j)
+            {
+                // Slice each row from the original span rather than advancing dest after every row:
+                // advancing once more after the final row throws when the span ends less than one
+                // stride past that row, even though every pixel touched is in bounds.
+                Span<byte> row = dest.Slice(j * stride, 16);
+
+                for (i = 0; i < 16; ++i)
+                {
+                    row[i] = ClipPixelAdd(row[i], a1);
+                }
+            }
+        }
+
+        /// <summary>
+        /// 32-point one-dimensional inverse DCT (per the method name), evaluated as a fixed sequence
+        /// of add/subtract and rotate-and-round stages.
+        /// Reads <paramref name="input"/>[0..31] and writes <paramref name="output"/>[0..31].
+        /// </summary>
+        /// <remarks>
+        /// Every input coefficient is truncated to 16 bits with a <c>(short)</c> cast before it is used.
+        /// Intermediate values live in two 32-entry <c>short</c> scratch buffers; each stage reads one
+        /// buffer and writes the other (step1 -> step2 -> step1 ...), so the stage order must not change.
+        /// </remarks>
+        /// <param name="input">Source coefficients; elements 0..31 are read.</param>
+        /// <param name="output">Destination; elements 0..31 are written.</param>
+        public static void Idct32(ReadOnlySpan<int> input, Span<int> output)
+        {
+            Span<short> step1 = stackalloc short[32];
+            Span<short> step2 = stackalloc short[32];
+            long temp1, temp2;
+
+            // stage 1
+            // Even-indexed coefficients are loaded into step1[0..15] in permuted order.
+            step1[0] = (short)input[0];
+            step1[1] = (short)input[16];
+            step1[2] = (short)input[8];
+            step1[3] = (short)input[24];
+            step1[4] = (short)input[4];
+            step1[5] = (short)input[20];
+            step1[6] = (short)input[12];
+            step1[7] = (short)input[28];
+            step1[8] = (short)input[2];
+            step1[9] = (short)input[18];
+            step1[10] = (short)input[10];
+            step1[11] = (short)input[26];
+            step1[12] = (short)input[6];
+            step1[13] = (short)input[22];
+            step1[14] = (short)input[14];
+            step1[15] = (short)input[30];
+
+            // Odd-indexed coefficients are combined in pairs; each pair produces two weighted sums
+            // that are rounded by DctConstRoundShift and stored mirrored in step1[16..31]
+            // (16/31, 17/30, ..., 23/24).
+            temp1 = (short)input[1] * CosPi31_64 - (short)input[31] * CosPi1_64;
+            temp2 = (short)input[1] * CosPi1_64 + (short)input[31] * CosPi31_64;
+            step1[16] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[31] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = (short)input[17] * CosPi15_64 - (short)input[15] * CosPi17_64;
+            temp2 = (short)input[17] * CosPi17_64 + (short)input[15] * CosPi15_64;
+            step1[17] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[30] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = (short)input[9] * CosPi23_64 - (short)input[23] * CosPi9_64;
+            temp2 = (short)input[9] * CosPi9_64 + (short)input[23] * CosPi23_64;
+            step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = (short)input[25] * CosPi7_64 - (short)input[7] * CosPi25_64;
+            temp2 = (short)input[25] * CosPi25_64 + (short)input[7] * CosPi7_64;
+            step1[19] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[28] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = (short)input[5] * CosPi27_64 - (short)input[27] * CosPi5_64;
+            temp2 = (short)input[5] * CosPi5_64 + (short)input[27] * CosPi27_64;
+            step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = (short)input[21] * CosPi11_64 - (short)input[11] * CosPi21_64;
+            temp2 = (short)input[21] * CosPi21_64 + (short)input[11] * CosPi11_64;
+            step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = (short)input[13] * CosPi19_64 - (short)input[19] * CosPi13_64;
+            temp2 = (short)input[13] * CosPi13_64 + (short)input[19] * CosPi19_64;
+            step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = (short)input[29] * CosPi3_64 - (short)input[3] * CosPi29_64;
+            temp2 = (short)input[29] * CosPi29_64 + (short)input[3] * CosPi3_64;
+            step1[23] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[24] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            // stage 2
+            // Entries 0..7 pass through unchanged; 8..15 are rotated in mirrored pairs;
+            // 16..31 are combined by add/subtract of adjacent entries.
+            step2[0] = step1[0];
+            step2[1] = step1[1];
+            step2[2] = step1[2];
+            step2[3] = step1[3];
+            step2[4] = step1[4];
+            step2[5] = step1[5];
+            step2[6] = step1[6];
+            step2[7] = step1[7];
+
+            temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64;
+            temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64;
+            step2[8] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[15] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64;
+            temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64;
+            step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64;
+            temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64;
+            step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64;
+            temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64;
+            step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            step2[16] = (short)WrapLow(step1[16] + step1[17]);
+            step2[17] = (short)WrapLow(step1[16] - step1[17]);
+            step2[18] = (short)WrapLow(-step1[18] + step1[19]);
+            step2[19] = (short)WrapLow(step1[18] + step1[19]);
+            step2[20] = (short)WrapLow(step1[20] + step1[21]);
+            step2[21] = (short)WrapLow(step1[20] - step1[21]);
+            step2[22] = (short)WrapLow(-step1[22] + step1[23]);
+            step2[23] = (short)WrapLow(step1[22] + step1[23]);
+            step2[24] = (short)WrapLow(step1[24] + step1[25]);
+            step2[25] = (short)WrapLow(step1[24] - step1[25]);
+            step2[26] = (short)WrapLow(-step1[26] + step1[27]);
+            step2[27] = (short)WrapLow(step1[26] + step1[27]);
+            step2[28] = (short)WrapLow(step1[28] + step1[29]);
+            step2[29] = (short)WrapLow(step1[28] - step1[29]);
+            step2[30] = (short)WrapLow(-step1[30] + step1[31]);
+            step2[31] = (short)WrapLow(step1[30] + step1[31]);
+
+            // stage 3
+            // Entries 0..3 pass through unchanged.
+            step1[0] = step2[0];
+            step1[1] = step2[1];
+            step1[2] = step2[2];
+            step1[3] = step2[3];
+
+            temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64;
+            temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64;
+            step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64;
+            temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64;
+            step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+
+            step1[8] = (short)WrapLow(step2[8] + step2[9]);
+            step1[9] = (short)WrapLow(step2[8] - step2[9]);
+            step1[10] = (short)WrapLow(-step2[10] + step2[11]);
+            step1[11] = (short)WrapLow(step2[10] + step2[11]);
+            step1[12] = (short)WrapLow(step2[12] + step2[13]);
+            step1[13] = (short)WrapLow(step2[12] - step2[13]);
+            step1[14] = (short)WrapLow(-step2[14] + step2[15]);
+            step1[15] = (short)WrapLow(step2[14] + step2[15]);
+
+            step1[16] = step2[16];
+            step1[31] = step2[31];
+            temp1 = -step2[17] * CosPi4_64 + step2[30] * CosPi28_64;
+            temp2 = step2[17] * CosPi28_64 + step2[30] * CosPi4_64;
+            step1[17] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[30] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = -step2[18] * CosPi28_64 - step2[29] * CosPi4_64;
+            temp2 = -step2[18] * CosPi4_64 + step2[29] * CosPi28_64;
+            step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
+            step1[19] = step2[19];
+            step1[20] = step2[20];
+            temp1 = -step2[21] * CosPi20_64 + step2[26] * CosPi12_64;
+            temp2 = step2[21] * CosPi12_64 + step2[26] * CosPi20_64;
+            step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = -step2[22] * CosPi12_64 - step2[25] * CosPi20_64;
+            temp2 = -step2[22] * CosPi20_64 + step2[25] * CosPi12_64;
+            step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
+            step1[23] = step2[23];
+            step1[24] = step2[24];
+            step1[27] = step2[27];
+            step1[28] = step2[28];
+
+            // stage 4
+            temp1 = (step1[0] + step1[1]) * CosPi16_64;
+            temp2 = (step1[0] - step1[1]) * CosPi16_64;
+            step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64;
+            temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64;
+            step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
+            step2[4] = (short)WrapLow(step1[4] + step1[5]);
+            step2[5] = (short)WrapLow(step1[4] - step1[5]);
+            step2[6] = (short)WrapLow(-step1[6] + step1[7]);
+            step2[7] = (short)WrapLow(step1[6] + step1[7]);
+
+            step2[8] = step1[8];
+            step2[15] = step1[15];
+            temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64;
+            temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64;
+            step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64;
+            temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64;
+            step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+            step2[11] = step1[11];
+            step2[12] = step1[12];
+
+            step2[16] = (short)WrapLow(step1[16] + step1[19]);
+            step2[17] = (short)WrapLow(step1[17] + step1[18]);
+            step2[18] = (short)WrapLow(step1[17] - step1[18]);
+            step2[19] = (short)WrapLow(step1[16] - step1[19]);
+            step2[20] = (short)WrapLow(-step1[20] + step1[23]);
+            step2[21] = (short)WrapLow(-step1[21] + step1[22]);
+            step2[22] = (short)WrapLow(step1[21] + step1[22]);
+            step2[23] = (short)WrapLow(step1[20] + step1[23]);
+
+            step2[24] = (short)WrapLow(step1[24] + step1[27]);
+            step2[25] = (short)WrapLow(step1[25] + step1[26]);
+            step2[26] = (short)WrapLow(step1[25] - step1[26]);
+            step2[27] = (short)WrapLow(step1[24] - step1[27]);
+            step2[28] = (short)WrapLow(-step1[28] + step1[31]);
+            step2[29] = (short)WrapLow(-step1[29] + step1[30]);
+            step2[30] = (short)WrapLow(step1[29] + step1[30]);
+            step2[31] = (short)WrapLow(step1[28] + step1[31]);
+
+            // stage 5
+            step1[0] = (short)WrapLow(step2[0] + step2[3]);
+            step1[1] = (short)WrapLow(step2[1] + step2[2]);
+            step1[2] = (short)WrapLow(step2[1] - step2[2]);
+            step1[3] = (short)WrapLow(step2[0] - step2[3]);
+            step1[4] = step2[4];
+            temp1 = (step2[6] - step2[5]) * CosPi16_64;
+            temp2 = (step2[5] + step2[6]) * CosPi16_64;
+            step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
+            step1[7] = step2[7];
+
+            step1[8] = (short)WrapLow(step2[8] + step2[11]);
+            step1[9] = (short)WrapLow(step2[9] + step2[10]);
+            step1[10] = (short)WrapLow(step2[9] - step2[10]);
+            step1[11] = (short)WrapLow(step2[8] - step2[11]);
+            step1[12] = (short)WrapLow(-step2[12] + step2[15]);
+            step1[13] = (short)WrapLow(-step2[13] + step2[14]);
+            step1[14] = (short)WrapLow(step2[13] + step2[14]);
+            step1[15] = (short)WrapLow(step2[12] + step2[15]);
+
+            step1[16] = step2[16];
+            step1[17] = step2[17];
+            temp1 = -step2[18] * CosPi8_64 + step2[29] * CosPi24_64;
+            temp2 = step2[18] * CosPi24_64 + step2[29] * CosPi8_64;
+            step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = -step2[19] * CosPi8_64 + step2[28] * CosPi24_64;
+            temp2 = step2[19] * CosPi24_64 + step2[28] * CosPi8_64;
+            step1[19] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[28] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = -step2[20] * CosPi24_64 - step2[27] * CosPi8_64;
+            temp2 = -step2[20] * CosPi8_64 + step2[27] * CosPi24_64;
+            step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = -step2[21] * CosPi24_64 - step2[26] * CosPi8_64;
+            temp2 = -step2[21] * CosPi8_64 + step2[26] * CosPi24_64;
+            step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
+            step1[22] = step2[22];
+            step1[23] = step2[23];
+            step1[24] = step2[24];
+            step1[25] = step2[25];
+            step1[30] = step2[30];
+            step1[31] = step2[31];
+
+            // stage 6
+            step2[0] = (short)WrapLow(step1[0] + step1[7]);
+            step2[1] = (short)WrapLow(step1[1] + step1[6]);
+            step2[2] = (short)WrapLow(step1[2] + step1[5]);
+            step2[3] = (short)WrapLow(step1[3] + step1[4]);
+            step2[4] = (short)WrapLow(step1[3] - step1[4]);
+            step2[5] = (short)WrapLow(step1[2] - step1[5]);
+            step2[6] = (short)WrapLow(step1[1] - step1[6]);
+            step2[7] = (short)WrapLow(step1[0] - step1[7]);
+            step2[8] = step1[8];
+            step2[9] = step1[9];
+            temp1 = (-step1[10] + step1[13]) * CosPi16_64;
+            temp2 = (step1[10] + step1[13]) * CosPi16_64;
+            step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = (-step1[11] + step1[12]) * CosPi16_64;
+            temp2 = (step1[11] + step1[12]) * CosPi16_64;
+            step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
+            step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
+            step2[14] = step1[14];
+            step2[15] = step1[15];
+
+            step2[16] = (short)WrapLow(step1[16] + step1[23]);
+            step2[17] = (short)WrapLow(step1[17] + step1[22]);
+            step2[18] = (short)WrapLow(step1[18] + step1[21]);
+            step2[19] = (short)WrapLow(step1[19] + step1[20]);
+            step2[20] = (short)WrapLow(step1[19] - step1[20]);
+            step2[21] = (short)WrapLow(step1[18] - step1[21]);
+            step2[22] = (short)WrapLow(step1[17] - step1[22]);
+            step2[23] = (short)WrapLow(step1[16] - step1[23]);
+
+            step2[24] = (short)WrapLow(-step1[24] + step1[31]);
+            step2[25] = (short)WrapLow(-step1[25] + step1[30]);
+            step2[26] = (short)WrapLow(-step1[26] + step1[29]);
+            step2[27] = (short)WrapLow(-step1[27] + step1[28]);
+            step2[28] = (short)WrapLow(step1[27] + step1[28]);
+            step2[29] = (short)WrapLow(step1[26] + step1[29]);
+            step2[30] = (short)WrapLow(step1[25] + step1[30]);
+            step2[31] = (short)WrapLow(step1[24] + step1[31]);
+
+            // stage 7
+            // Entries 0..15 are mirror-combined: step1[k] = step2[k] + step2[15 - k] for k < 8,
+            // and step1[15 - k] = step2[k] - step2[15 - k].
+            step1[0] = (short)WrapLow(step2[0] + step2[15]);
+            step1[1] = (short)WrapLow(step2[1] + step2[14]);
+            step1[2] = (short)WrapLow(step2[2] + step2[13]);
+            step1[3] = (short)WrapLow(step2[3] + step2[12]);
+            step1[4] = (short)WrapLow(step2[4] + step2[11]);
+            step1[5] = (short)WrapLow(step2[5] + step2[10]);
+            step1[6] = (short)WrapLow(step2[6] + step2[9]);
+            step1[7] = (short)WrapLow(step2[7] + step2[8]);
+            step1[8] = (short)WrapLow(step2[7] - step2[8]);
+            step1[9] = (short)WrapLow(step2[6] - step2[9]);
+            step1[10] = (short)WrapLow(step2[5] - step2[10]);
+            step1[11] = (short)WrapLow(step2[4] - step2[11]);
+            step1[12] = (short)WrapLow(step2[3] - step2[12]);
+            step1[13] = (short)WrapLow(step2[2] - step2[13]);
+            step1[14] = (short)WrapLow(step2[1] - step2[14]);
+            step1[15] = (short)WrapLow(step2[0] - step2[15]);
+
+            step1[16] = step2[16];
+            step1[17] = step2[17];
+            step1[18] = step2[18];
+            step1[19] = step2[19];
+            temp1 = (-step2[20] + step2[27]) * CosPi16_64;
+            temp2 = (step2[20] + step2[27]) * CosPi16_64;
+            step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = (-step2[21] + step2[26]) * CosPi16_64;
+            temp2 = (step2[21] + step2[26]) * CosPi16_64;
+            step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = (-step2[22] + step2[25]) * CosPi16_64;
+            temp2 = (step2[22] + step2[25]) * CosPi16_64;
+            step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
+            temp1 = (-step2[23] + step2[24]) * CosPi16_64;
+            temp2 = (step2[23] + step2[24]) * CosPi16_64;
+            step1[23] = (short)WrapLow(DctConstRoundShift(temp1));
+            step1[24] = (short)WrapLow(DctConstRoundShift(temp2));
+            step1[28] = step2[28];
+            step1[29] = step2[29];
+            step1[30] = step2[30];
+            step1[31] = step2[31];
+
+            // final stage
+            // Mirror-combine into the result: output[k] = step1[k] + step1[31 - k] for k < 16,
+            // and output[31 - k] = step1[k] - step1[31 - k]. These values are not narrowed to short.
+            output[0] = WrapLow(step1[0] + step1[31]);
+            output[1] = WrapLow(step1[1] + step1[30]);
+            output[2] = WrapLow(step1[2] + step1[29]);
+            output[3] = WrapLow(step1[3] + step1[28]);
+            output[4] = WrapLow(step1[4] + step1[27]);
+            output[5] = WrapLow(step1[5] + step1[26]);
+            output[6] = WrapLow(step1[6] + step1[25]);
+            output[7] = WrapLow(step1[7] + step1[24]);
+            output[8] = WrapLow(step1[8] + step1[23]);
+            output[9] = WrapLow(step1[9] + step1[22]);
+            output[10] = WrapLow(step1[10] + step1[21]);
+            output[11] = WrapLow(step1[11] + step1[20]);
+            output[12] = WrapLow(step1[12] + step1[19]);
+            output[13] = WrapLow(step1[13] + step1[18]);
+            output[14] = WrapLow(step1[14] + step1[17]);
+            output[15] = WrapLow(step1[15] + step1[16]);
+            output[16] = WrapLow(step1[15] - step1[16]);
+            output[17] = WrapLow(step1[14] - step1[17]);
+            output[18] = WrapLow(step1[13] - step1[18]);
+            output[19] = WrapLow(step1[12] - step1[19]);
+            output[20] = WrapLow(step1[11] - step1[20]);
+            output[21] = WrapLow(step1[10] - step1[21]);
+            output[22] = WrapLow(step1[9] - step1[22]);
+            output[23] = WrapLow(step1[8] - step1[23]);
+            output[24] = WrapLow(step1[7] - step1[24]);
+            output[25] = WrapLow(step1[6] - step1[25]);
+            output[26] = WrapLow(step1[5] - step1[26]);
+            output[27] = WrapLow(step1[4] - step1[27]);
+            output[28] = WrapLow(step1[3] - step1[28]);
+            output[29] = WrapLow(step1[2] - step1[29]);
+            output[30] = WrapLow(step1[1] - step1[30]);
+            output[31] = WrapLow(step1[0] - step1[31]);
+        }
+
+        /// <summary>
+        /// Full 32x32 inverse transform: runs <c>Idct32</c> over every row (rows whose coefficients
+        /// are all zero in their low 16 bits are written as zeros without calling it), then over
+        /// every column, and adds the rounded result (shift by 6) onto <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order; 1024 elements are consumed.</param>
+        /// <param name="dest">Destination pixels, updated in place.</param>
+        /// <param name="stride">Distance, in elements, between two consecutive rows of <paramref name="dest"/>.</param>
+        public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            const int Dim = 32;
+
+            Span<int> rowPass = stackalloc int[Dim * Dim];
+            Span<int> rowOut = rowPass;
+            Span<int> column = stackalloc int[Dim];
+            Span<int> columnResult = stackalloc int[Dim];
+
+            // Pass 1: rows.
+            for (int row = 0; row < Dim; row++)
+            {
+                // Only the low 16 bits of each coefficient matter, matching the casts done by Idct32.
+                bool rowHasCoeffs = false;
+                for (int col = 0; col < Dim; col++)
+                {
+                    rowHasCoeffs |= (short)input[col] != 0;
+                }
+
+                if (rowHasCoeffs)
+                {
+                    Idct32(input, rowOut);
+                }
+                else
+                {
+                    rowOut.Slice(0, Dim).Clear();
+                }
+
+                input = input.Slice(Dim);
+                rowOut = rowOut.Slice(Dim);
+            }
+
+            // Pass 2: columns.
+            for (int x = 0; x < Dim; x++)
+            {
+                for (int y = 0; y < Dim; y++)
+                {
+                    column[y] = rowPass[y * Dim + x];
+                }
+
+                Idct32(column, columnResult);
+
+                for (int y = 0; y < Dim; y++)
+                {
+                    int pixel = y * stride + x;
+                    dest[pixel] = ClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(columnResult[y], 6));
+                }
+            }
+        }
+
+        /// <summary>
+        /// 32x32 inverse transform for blocks whose non-zero coefficients all lie in the upper-left
+        /// 16x16 area: only the first 16 rows go through the row pass, then all 32 columns are
+        /// transformed and the rounded result (shift by 6) is added onto <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order; the first 16 rows (512 elements) are consumed.</param>
+        /// <param name="dest">Destination pixels, updated in place.</param>
+        /// <param name="stride">Distance, in elements, between two consecutive rows of <paramref name="dest"/>.</param>
+        public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            int i, j;
+            Span<int> output = stackalloc int[32 * 32];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[32];
+            Span<int> tempOut = stackalloc int[32];
+
+            // Rows 16..31 are never written by the row pass but are read by the column pass.
+            // stackalloc memory is not guaranteed to be zeroed (e.g. under [SkipLocalsInit]),
+            // so clear the untouched part explicitly instead of relying on it.
+            output.Slice(16 * 32).Clear();
+
+            // Rows
+            // Only upper-left 16x16 has non-zero coeff
+            for (i = 0; i < 16; ++i)
+            {
+                Idct32(input, outptr);
+                input = input.Slice(32);
+                outptr = outptr.Slice(32);
+            }
+
+            // Columns
+            for (i = 0; i < 32; ++i)
+            {
+                for (j = 0; j < 32; ++j)
+                {
+                    tempIn[j] = output[j * 32 + i];
+                }
+
+                Idct32(tempIn, tempOut);
+                for (j = 0; j < 32; ++j)
+                {
+                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
+                }
+            }
+        }
+
+        /// <summary>
+        /// 32x32 inverse transform for blocks whose non-zero coefficients all lie in the upper-left
+        /// 8x8 area: only the first 8 rows go through the row pass, then all 32 columns are
+        /// transformed and the rounded result (shift by 6) is added onto <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order; the first 8 rows (256 elements) are consumed.</param>
+        /// <param name="dest">Destination pixels, updated in place.</param>
+        /// <param name="stride">Distance, in elements, between two consecutive rows of <paramref name="dest"/>.</param>
+        public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            int i, j;
+            Span<int> output = stackalloc int[32 * 32];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[32];
+            Span<int> tempOut = stackalloc int[32];
+
+            // Rows 8..31 are never written by the row pass but are read by the column pass.
+            // stackalloc memory is not guaranteed to be zeroed (e.g. under [SkipLocalsInit]),
+            // so clear the untouched part explicitly instead of relying on it.
+            output.Slice(8 * 32).Clear();
+
+            // Rows
+            // Only upper-left 8x8 has non-zero coeff
+            for (i = 0; i < 8; ++i)
+            {
+                Idct32(input, outptr);
+                input = input.Slice(32);
+                outptr = outptr.Slice(32);
+            }
+
+            // Columns
+            for (i = 0; i < 32; ++i)
+            {
+                for (j = 0; j < 32; ++j)
+                {
+                    tempIn[j] = output[j * 32 + i];
+                }
+
+                Idct32(tempIn, tempOut);
+                for (j = 0; j < 32; ++j)
+                {
+                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
+                }
+            }
+        }
+
+        /// <summary>
+        /// 32x32 inverse transform for a block where only <c>input[0]</c> is used: a single value is
+        /// derived from it and added to every pixel of the 32x32 area of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients; only element 0 is read (truncated to 16 bits).</param>
+        /// <param name="dest">Destination pixels, updated in place.</param>
+        /// <param name="stride">Distance, in elements, between two consecutive rows of <paramref name="dest"/>.</param>
+        public static void Idct32x321Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
+        {
+            int i, j;
+            long a1;
+            int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
+
+            // The constant is applied twice: once per pass (rows, then columns) of the 2-D transform.
+            output = WrapLow(DctConstRoundShift(output * CosPi16_64));
+            a1 = BitUtils.RoundPowerOfTwo(output, 6);
+
+            for (j = 0; j < 32; ++j)
+            {
+                // Slice each row from the original span rather than advancing dest after every row:
+                // advancing once more after the final row throws when the span ends less than one
+                // stride past that row, even though every pixel touched is in bounds.
+                Span<byte> row = dest.Slice(j * stride, 32);
+
+                for (i = 0; i < 32; ++i)
+                {
+                    row[i] = ClipPixelAdd(row[i], a1);
+                }
+            }
+        }
+
+        /// <summary>
+        /// High bit-depth 4x4 inverse Walsh-Hadamard transform: a horizontal pass over the four
+        /// coefficient rows followed by a vertical pass whose results are added onto
+        /// <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order; 16 elements are consumed.</param>
+        /// <param name="dest">Destination pixels, updated in place.</param>
+        /// <param name="stride">Distance, in elements, between two consecutive rows of <paramref name="dest"/>.</param>
+        /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
+        public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+               0.5 shifts per pixel. */
+            Span<int> rowPass = stackalloc int[16];
+
+            // Horizontal pass: each row of four coefficients is scaled down and transformed.
+            ReadOnlySpan<int> src = input;
+            Span<int> dst = rowPass;
+            for (int row = 0; row < 4; row++)
+            {
+                long a = src[0] >> UnitQuantShift;
+                long c = src[1] >> UnitQuantShift;
+                long d = src[2] >> UnitQuantShift;
+                long b = src[3] >> UnitQuantShift;
+
+                a += c;
+                d -= b;
+                long e = (a - d) >> 1;
+                b = e - b;
+                c = e - c;
+                a -= b;
+                d += c;
+
+                dst[0] = HighbdWrapLow(a, bd);
+                dst[1] = HighbdWrapLow(b, bd);
+                dst[2] = HighbdWrapLow(c, bd);
+                dst[3] = HighbdWrapLow(d, bd);
+
+                src = src.Slice(4);
+                dst = dst.Slice(4);
+            }
+
+            // Vertical pass: same butterfly down each column, accumulated into dest.
+            for (int col = 0; col < 4; col++)
+            {
+                long a = rowPass[col];
+                long c = rowPass[4 + col];
+                long d = rowPass[8 + col];
+                long b = rowPass[12 + col];
+
+                a += c;
+                d -= b;
+                long e = (a - d) >> 1;
+                b = e - b;
+                c = e - c;
+                a -= b;
+                d += c;
+
+                dest[0] = HighbdClipPixelAdd(dest[0], HighbdWrapLow(a, bd), bd);
+                dest[stride] = HighbdClipPixelAdd(dest[stride], HighbdWrapLow(b, bd), bd);
+                dest[stride * 2] = HighbdClipPixelAdd(dest[stride * 2], HighbdWrapLow(c, bd), bd);
+                dest[stride * 3] = HighbdClipPixelAdd(dest[stride * 3], HighbdWrapLow(d, bd), bd);
+
+                dest = dest.Slice(1);
+            }
+        }
+
+        /// <summary>
+        /// Single-coefficient variant of the 4x4 inverse Walsh-Hadamard add: only <c>input[0]</c> is read and
+        /// its contribution is accumulated into a 4x4 area of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients; only the first element is used</param>
+        /// <param name="dest">Destination samples, updated in place through HighbdClipPixelAdd</param>
+        /// <param name="stride">Distance, in elements, between vertically adjacent samples of <paramref name="dest"/></param>
+        /// <param name="bd">Forwarded to HighbdWrapLow and HighbdClipPixelAdd (presumably the sample bit depth)</param>
+        public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            // Horizontal step: split the shifted first coefficient into its halved part and the remainder.
+            long first = input[0] >> UnitQuantShift;
+            long halved = first >> 1;
+            first -= halved;
+
+            Span<int> columnTerms = stackalloc int[4];
+            columnTerms[0] = HighbdWrapLow(first, bd);
+
+            int wrappedHalf = HighbdWrapLow(halved, bd);
+            columnTerms[1] = wrappedHalf;
+            columnTerms[2] = wrappedHalf;
+            columnTerms[3] = wrappedHalf;
+
+            // Vertical step: each column term is split the same way; the remainder goes to the top row
+            // and the halved part to the three rows below it.
+            for (int col = 0; col < 4; col++)
+            {
+                int term = columnTerms[col];
+                long lower = term >> 1;
+                long upper = term - lower;
+
+                dest[stride * 0 + col] = HighbdClipPixelAdd(dest[stride * 0 + col], upper, bd);
+
+                for (int row = 1; row < 4; row++)
+                {
+                    int pixel = stride * row + col;
+                    dest[pixel] = HighbdClipPixelAdd(dest[pixel], lower, bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// One-dimensional 4-point transform kernel built on the SinPi*_9 constants (judging by the name, an
+        /// inverse ADST) for the high bit depth path.
+        /// </summary>
+        /// <remarks>
+        /// The first four outputs are zero-filled, and the method returns early, when DetectInvalidHighbdInput
+        /// reports a problem or when all four inputs are zero.
+        /// </remarks>
+        /// <param name="input">Source coefficients; elements 0 to 3 are read</param>
+        /// <param name="output">Receives four transformed values in elements 0 to 3</param>
+        /// <param name="bd">Forwarded to HighbdWrapLow (presumably the sample bit depth)</param>
+        public static void HighbdIadst4(ReadOnlySpan<int> input, Span<int> output, int bd)
+        {
+            long s0, s1, s2, s3, s4, s5, s6, s7;
+            int x0 = input[0];
+            int x1 = input[1];
+            int x2 = input[2];
+            int x3 = input[3];
+
+            // Invalid input: flagged through Debug.Assert and answered with an all-zero output.
+            if (DetectInvalidHighbdInput(input, 4) != 0)
+            {
+                Debug.Assert(false, "invalid highbd txfm input");
+                output.Slice(0, 4).Fill(0);
+                return;
+            }
+
+            // All-zero input: the output is zero as well, so skip the arithmetic.
+            if ((x0 | x1 | x2 | x3) == 0)
+            {
+                output.Slice(0, 4).Fill(0);
+                return;
+            }
+
+            // Products are formed in 64-bit arithmetic (one operand is cast to long).
+            s0 = (long)SinPi1_9 * x0;
+            s1 = (long)SinPi2_9 * x0;
+            s2 = (long)SinPi3_9 * x1;
+            s3 = (long)SinPi4_9 * x2;
+            s4 = (long)SinPi1_9 * x2;
+            s5 = (long)SinPi2_9 * x3;
+            s6 = (long)SinPi4_9 * x3;
+            // This combination is wrapped before it is scaled by SinPi3_9 further down.
+            s7 = HighbdWrapLow(x0 - x2 + x3, bd);
+
+            // s3 is reused to hold the old s2 before s2 itself is overwritten; keep this order.
+            s0 = s0 + s3 + s5;
+            s1 = s1 - s4 - s6;
+            s3 = s2;
+            s2 = SinPi3_9 * s7;
+
+            // 1-D transform scaling factor is sqrt(2).
+            // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+            // + 1b (addition) = 29b.
+            // Hence the output bit depth is 15b.
+            output[0] = HighbdWrapLow(DctConstRoundShift(s0 + s3), bd);
+            output[1] = HighbdWrapLow(DctConstRoundShift(s1 + s3), bd);
+            output[2] = HighbdWrapLow(DctConstRoundShift(s2), bd);
+            output[3] = HighbdWrapLow(DctConstRoundShift(s0 + s1 - s3), bd);
+        }
+
+        /// <summary>
+        /// One-dimensional 4-point transform kernel built on the CosPi*_64 constants (judging by the name, an
+        /// inverse DCT) for the high bit depth path.
+        /// </summary>
+        /// <remarks>
+        /// All reads of <paramref name="input"/> happen in stage 1 and all writes to <paramref name="output"/>
+        /// happen in stage 2, with the intermediate values kept in a local buffer. HighbdIdct8 relies on this by
+        /// passing the same span for both arguments.
+        /// </remarks>
+        /// <param name="input">Source coefficients; elements 0 to 3 are read</param>
+        /// <param name="output">Receives four transformed values in elements 0 to 3</param>
+        /// <param name="bd">Forwarded to HighbdWrapLow (presumably the sample bit depth)</param>
+        public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
+        {
+            Span<int> step = stackalloc int[4];
+            long temp1, temp2;
+
+            // Invalid input: flagged through Debug.Assert and answered with an all-zero output.
+            if (DetectInvalidHighbdInput(input, 4) != 0)
+            {
+                Debug.Assert(false, "invalid highbd txfm input");
+                output.Slice(0, 4).Fill(0);
+                return;
+            }
+
+            // stage 1
+            // Even inputs (0 and 2): sum and difference, both scaled by CosPi16_64.
+            temp1 = (input[0] + input[2]) * (long)CosPi16_64;
+            temp2 = (input[0] - input[2]) * (long)CosPi16_64;
+            step[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            // Odd inputs (1 and 3): combined with the CosPi24_64 / CosPi8_64 pair.
+            temp1 = input[1] * (long)CosPi24_64 - input[3] * (long)CosPi8_64;
+            temp2 = input[1] * (long)CosPi8_64 + input[3] * (long)CosPi24_64;
+            step[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            // stage 2
+            // Sums fill outputs 0 and 1, the matching differences fill outputs 3 and 2.
+            output[0] = HighbdWrapLow(step[0] + step[3], bd);
+            output[1] = HighbdWrapLow(step[1] + step[2], bd);
+            output[2] = HighbdWrapLow(step[1] - step[2], bd);
+            output[3] = HighbdWrapLow(step[0] - step[3], bd);
+        }
+
+        /// <summary>
+        /// Runs HighbdIdct4 over the four rows and then the four columns of a 4x4 coefficient block and
+        /// accumulates the result into <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients; 16 values are consumed, four per row</param>
+        /// <param name="dest">Destination samples, updated in place through HighbdClipPixelAdd</param>
+        /// <param name="stride">Distance, in elements, between vertically adjacent samples of <paramref name="dest"/></param>
+        /// <param name="bd">Forwarded to HighbdIdct4 and HighbdClipPixelAdd (presumably the sample bit depth)</param>
+        public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            const int Size = 4;
+
+            Span<int> rowPass = stackalloc int[Size * Size];
+            Span<int> column = stackalloc int[Size];
+            Span<int> transformed = stackalloc int[Size];
+
+            // Row pass: every group of four coefficients produces one row of the intermediate block.
+            ReadOnlySpan<int> remaining = input;
+            for (int row = 0; row < Size; ++row)
+            {
+                HighbdIdct4(remaining, rowPass.Slice(row * Size), bd);
+                remaining = remaining.Slice(Size);
+            }
+
+            // Column pass: gather a column, transform it, then add the values passed through
+            // BitUtils.RoundPowerOfTwo(value, 4) to the destination.
+            for (int col = 0; col < Size; ++col)
+            {
+                for (int row = 0; row < Size; ++row)
+                {
+                    column[row] = rowPass[row * Size + col];
+                }
+
+                HighbdIdct4(column, transformed, bd);
+
+                for (int row = 0; row < Size; ++row)
+                {
+                    int pixel = row * stride + col;
+                    dest[pixel] = HighbdClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(transformed[row], 4), bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Single-coefficient variant of the 4x4 inverse transform add: only <c>input[0]</c> is read, scaled
+        /// twice by CosPi16_64, passed through BitUtils.RoundPowerOfTwo with an argument of 4 and added to every
+        /// sample of a 4x4 area of <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients; only the first element is used</param>
+        /// <param name="dest">Destination samples, updated in place through HighbdClipPixelAdd</param>
+        /// <param name="stride">Distance, in elements, between vertically adjacent samples of <paramref name="dest"/></param>
+        /// <param name="bd">Forwarded to HighbdWrapLow and HighbdClipPixelAdd (presumably the sample bit depth)</param>
+        public static void HighbdIdct4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            // The CosPi16_64 scaling is applied twice (presumably once per transform dimension).
+            int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
+
+            output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
+            long a1 = BitUtils.RoundPowerOfTwo(output, 4);
+
+            // Rows are addressed by offset rather than by re-slicing dest after every row: slicing past the
+            // last row would throw when fewer than stride elements remain, although nothing more is written.
+            for (int row = 0; row < 4; row++)
+            {
+                int rowStart = row * stride;
+
+                for (int col = 0; col < 4; col++)
+                {
+                    dest[rowStart + col] = HighbdClipPixelAdd(dest[rowStart + col], a1, bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// One-dimensional 8-point transform kernel (judging by the name, an inverse ADST) for the high bit
+        /// depth path, computed in three stages.
+        /// </summary>
+        /// <remarks>
+        /// The first eight outputs are zero-filled, and the method returns early, when DetectInvalidHighbdInput
+        /// reports a problem or when all eight inputs are zero.
+        /// </remarks>
+        /// <param name="input">Source coefficients; elements 0 to 7 are read</param>
+        /// <param name="output">Receives eight transformed values in elements 0 to 7</param>
+        /// <param name="bd">Forwarded to HighbdWrapLow (presumably the sample bit depth)</param>
+        public static void HighbdIadst8(ReadOnlySpan<int> input, Span<int> output, int bd)
+        {
+            long s0, s1, s2, s3, s4, s5, s6, s7;
+            // Inputs are loaded in the permuted order 7, 0, 5, 2, 3, 4, 1, 6.
+            int x0 = input[7];
+            int x1 = input[0];
+            int x2 = input[5];
+            int x3 = input[2];
+            int x4 = input[3];
+            int x5 = input[4];
+            int x6 = input[1];
+            int x7 = input[6];
+
+            // Invalid input: flagged through Debug.Assert and answered with an all-zero output.
+            if (DetectInvalidHighbdInput(input, 8) != 0)
+            {
+                Debug.Assert(false, "invalid highbd txfm input");
+                output.Slice(0, 8).Fill(0);
+                return;
+            }
+
+            // All-zero input: the output is zero as well, so skip the arithmetic.
+            if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0)
+            {
+                output.Slice(0, 8).Fill(0);
+                return;
+            }
+
+            // stage 1
+            // Each (even, odd) pair of x values is combined with a pair of CosPi constants in 64-bit arithmetic.
+            s0 = (long)CosPi2_64 * x0 + (long)CosPi30_64 * x1;
+            s1 = (long)CosPi30_64 * x0 - (long)CosPi2_64 * x1;
+            s2 = (long)CosPi10_64 * x2 + (long)CosPi22_64 * x3;
+            s3 = (long)CosPi22_64 * x2 - (long)CosPi10_64 * x3;
+            s4 = (long)CosPi18_64 * x4 + (long)CosPi14_64 * x5;
+            s5 = (long)CosPi14_64 * x4 - (long)CosPi18_64 * x5;
+            s6 = (long)CosPi26_64 * x6 + (long)CosPi6_64 * x7;
+            s7 = (long)CosPi6_64 * x6 - (long)CosPi26_64 * x7;
+
+            // Sums of s[n] and s[n + 4] go to x0..x3, the matching differences to x4..x7.
+            x0 = HighbdWrapLow(DctConstRoundShift(s0 + s4), bd);
+            x1 = HighbdWrapLow(DctConstRoundShift(s1 + s5), bd);
+            x2 = HighbdWrapLow(DctConstRoundShift(s2 + s6), bd);
+            x3 = HighbdWrapLow(DctConstRoundShift(s3 + s7), bd);
+            x4 = HighbdWrapLow(DctConstRoundShift(s0 - s4), bd);
+            x5 = HighbdWrapLow(DctConstRoundShift(s1 - s5), bd);
+            x6 = HighbdWrapLow(DctConstRoundShift(s2 - s6), bd);
+            x7 = HighbdWrapLow(DctConstRoundShift(s3 - s7), bd);
+
+            // stage 2
+            // x0..x3 pass through unscaled; x4..x7 are multiplied by the CosPi8_64 / CosPi24_64 pair.
+            s0 = x0;
+            s1 = x1;
+            s2 = x2;
+            s3 = x3;
+            s4 = (long)CosPi8_64 * x4 + (long)CosPi24_64 * x5;
+            s5 = (long)CosPi24_64 * x4 - (long)CosPi8_64 * x5;
+            s6 = (long)(-CosPi24_64) * x6 + (long)CosPi8_64 * x7;
+            s7 = (long)CosPi8_64 * x6 + (long)CosPi24_64 * x7;
+
+            // Only the scaled half (x4..x7) goes through DctConstRoundShift here.
+            x0 = HighbdWrapLow(s0 + s2, bd);
+            x1 = HighbdWrapLow(s1 + s3, bd);
+            x2 = HighbdWrapLow(s0 - s2, bd);
+            x3 = HighbdWrapLow(s1 - s3, bd);
+            x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
+            x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
+            x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
+            x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
+
+            // stage 3
+            // Only x2, x3, x6 and x7 change in this stage; the others are already final.
+            s2 = (long)CosPi16_64 * (x2 + x3);
+            s3 = (long)CosPi16_64 * (x2 - x3);
+            s6 = (long)CosPi16_64 * (x6 + x7);
+            s7 = (long)CosPi16_64 * (x6 - x7);
+
+            x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
+            x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
+            x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
+            x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
+
+            // Outputs are written in permuted order; the odd-numbered outputs are negated.
+            output[0] = HighbdWrapLow(x0, bd);
+            output[1] = HighbdWrapLow(-x4, bd);
+            output[2] = HighbdWrapLow(x6, bd);
+            output[3] = HighbdWrapLow(-x2, bd);
+            output[4] = HighbdWrapLow(x3, bd);
+            output[5] = HighbdWrapLow(-x7, bd);
+            output[6] = HighbdWrapLow(x5, bd);
+            output[7] = HighbdWrapLow(-x1, bd);
+        }
+
+        /// <summary>
+        /// One-dimensional 8-point transform kernel (judging by the name, an inverse DCT) for the high bit depth
+        /// path. The even-indexed inputs are handled by HighbdIdct4; the odd-indexed inputs are processed here.
+        /// </summary>
+        /// <remarks>
+        /// The first eight outputs are zero-filled, and the method returns early, when DetectInvalidHighbdInput
+        /// reports a problem.
+        /// </remarks>
+        /// <param name="input">Source coefficients; elements 0 to 7 are read</param>
+        /// <param name="output">Receives eight transformed values in elements 0 to 7</param>
+        /// <param name="bd">Forwarded to HighbdWrapLow and HighbdIdct4 (presumably the sample bit depth)</param>
+        public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
+        {
+            Span<int> step1 = stackalloc int[8];
+            Span<int> step2 = stackalloc int[8];
+            long temp1, temp2;
+
+            // Invalid input: flagged through Debug.Assert and answered with an all-zero output.
+            if (DetectInvalidHighbdInput(input, 8) != 0)
+            {
+                Debug.Assert(false, "invalid highbd txfm input");
+                output.Slice(0, 8).Fill(0);
+                return;
+            }
+
+            // stage 1
+            // Even-indexed inputs 0, 2, 4, 6 are packed into step1[0..3] for the 4-point kernel below.
+            step1[0] = input[0];
+            step1[2] = input[4];
+            step1[1] = input[2];
+            step1[3] = input[6];
+            // Odd-indexed inputs are combined pairwise: (1, 7) and (5, 3).
+            temp1 = input[1] * (long)CosPi28_64 - input[7] * (long)CosPi4_64;
+            temp2 = input[1] * (long)CosPi4_64 + input[7] * (long)CosPi28_64;
+            step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = input[5] * (long)CosPi12_64 - input[3] * (long)CosPi20_64;
+            temp2 = input[5] * (long)CosPi20_64 + input[3] * (long)CosPi12_64;
+            step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            // stage 2 & stage 3 - even half
+            // In-place call: the same span is passed as input and output, so step1[0..3] is overwritten.
+            HighbdIdct4(step1, step1, bd);
+
+            // stage 2 - odd half
+            step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
+            step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
+            step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
+            step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
+
+            // stage 3 - odd half
+            // Entries 4 and 7 are copied through; 5 and 6 are mixed and scaled by CosPi16_64.
+            step1[4] = step2[4];
+            temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
+            temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
+            step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step1[7] = step2[7];
+
+            // stage 4
+            // Output n is step1[n] + step1[7 - n] for n < 4, and the matching difference for the mirrored index.
+            output[0] = HighbdWrapLow(step1[0] + step1[7], bd);
+            output[1] = HighbdWrapLow(step1[1] + step1[6], bd);
+            output[2] = HighbdWrapLow(step1[2] + step1[5], bd);
+            output[3] = HighbdWrapLow(step1[3] + step1[4], bd);
+            output[4] = HighbdWrapLow(step1[3] - step1[4], bd);
+            output[5] = HighbdWrapLow(step1[2] - step1[5], bd);
+            output[6] = HighbdWrapLow(step1[1] - step1[6], bd);
+            output[7] = HighbdWrapLow(step1[0] - step1[7], bd);
+        }
+
+        /// <summary>
+        /// Runs HighbdIdct8 over the eight rows and then the eight columns of an 8x8 coefficient block and
+        /// accumulates the result into <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients; 64 values are consumed, eight per row</param>
+        /// <param name="dest">Destination samples, updated in place through HighbdClipPixelAdd</param>
+        /// <param name="stride">Distance, in elements, between vertically adjacent samples of <paramref name="dest"/></param>
+        /// <param name="bd">Forwarded to HighbdIdct8 and HighbdClipPixelAdd (presumably the sample bit depth)</param>
+        public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            const int Size = 8;
+
+            Span<int> rowPass = stackalloc int[Size * Size];
+            Span<int> column = stackalloc int[Size];
+            Span<int> transformed = stackalloc int[Size];
+
+            // Row pass: every group of eight coefficients produces one row of the intermediate block.
+            ReadOnlySpan<int> remaining = input;
+            for (int row = 0; row < Size; ++row)
+            {
+                HighbdIdct8(remaining, rowPass.Slice(row * Size), bd);
+                remaining = remaining.Slice(Size);
+            }
+
+            // Column pass: gather a column, transform it, then add the values passed through
+            // BitUtils.RoundPowerOfTwo(value, 5) to the destination.
+            for (int col = 0; col < Size; ++col)
+            {
+                for (int row = 0; row < Size; ++row)
+                {
+                    column[row] = rowPass[row * Size + col];
+                }
+
+                HighbdIdct8(column, transformed, bd);
+
+                for (int row = 0; row < Size; ++row)
+                {
+                    int pixel = row * stride + col;
+                    dest[pixel] = HighbdClipPixelAdd(dest[pixel], BitUtils.RoundPowerOfTwo(transformed[row], 5), bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Reduced variant of the 8x8 inverse transform add: HighbdIdct8 is run on the first four rows of
+        /// coefficients only, then on all eight columns, and the result is accumulated into
+        /// <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients; 32 values are consumed (four rows of eight)</param>
+        /// <param name="dest">Destination samples, updated in place through HighbdClipPixelAdd</param>
+        /// <param name="stride">Distance, in elements, between vertically adjacent samples of <paramref name="dest"/></param>
+        /// <param name="bd">Forwarded to HighbdIdct8 and HighbdClipPixelAdd (presumably the sample bit depth)</param>
+        public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            int i, j;
+            Span<int> output = stackalloc int[8 * 8];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[8];
+            Span<int> tempOut = stackalloc int[8];
+
+            // The row pass below writes rows 0-3 only, while the column pass reads all eight rows.
+            // The contents of stackalloc memory are not guaranteed to be zero, so rows 4-7 are cleared
+            // explicitly instead of relying on the runtime zero-initializing locals.
+            output.Slice(4 * 8).Clear();
+
+            // First transform rows
+            // Only first 4 row has non-zero coefs
+            for (i = 0; i < 4; ++i)
+            {
+                HighbdIdct8(input, outptr, bd);
+                input = input.Slice(8);
+                outptr = outptr.Slice(8);
+            }
+
+            // Then transform columns
+            for (i = 0; i < 8; ++i)
+            {
+                // Gather column i of the intermediate block into a contiguous buffer.
+                for (j = 0; j < 8; ++j)
+                {
+                    tempIn[j] = output[j * 8 + i];
+                }
+
+                HighbdIdct8(tempIn, tempOut, bd);
+                for (j = 0; j < 8; ++j)
+                {
+                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Single-coefficient variant of the 8x8 inverse transform add: only <c>input[0]</c> is read, scaled
+        /// twice by CosPi16_64, passed through BitUtils.RoundPowerOfTwo with an argument of 5 and added to every
+        /// sample of an 8x8 area of <paramref name="dest"/>.
+        /// </summary>
+        /// <remarks>
+        /// The name does not follow the PascalCase pattern of the neighbouring methods; it is left unchanged so
+        /// that existing callers keep compiling.
+        /// </remarks>
+        /// <param name="input">Coefficients; only the first element is used</param>
+        /// <param name="dest">Destination samples, updated in place through HighbdClipPixelAdd</param>
+        /// <param name="stride">Distance, in elements, between vertically adjacent samples of <paramref name="dest"/></param>
+        /// <param name="bd">Forwarded to HighbdWrapLow and HighbdClipPixelAdd (presumably the sample bit depth)</param>
+        public static void vpx_Highbdidct8x8_1_add_c(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            // The CosPi16_64 scaling is applied twice (presumably once per transform dimension).
+            int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
+
+            output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
+            long a1 = BitUtils.RoundPowerOfTwo(output, 5);
+
+            // Rows are addressed by offset rather than by re-slicing dest after every row: slicing past the
+            // last row would throw when fewer than stride elements remain, although nothing more is written.
+            for (int row = 0; row < 8; ++row)
+            {
+                int rowStart = row * stride;
+
+                for (int col = 0; col < 8; ++col)
+                {
+                    dest[rowStart + col] = HighbdClipPixelAdd(dest[rowStart + col], a1, bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// One-dimensional 16-point transform kernel (judging by the name, an inverse ADST) for the high bit
+        /// depth path, computed in four stages.
+        /// </summary>
+        /// <remarks>
+        /// The first sixteen outputs are zero-filled, and the method returns early, when
+        /// DetectInvalidHighbdInput reports a problem or when all sixteen inputs are zero.
+        /// </remarks>
+        /// <param name="input">Source coefficients; elements 0 to 15 are read</param>
+        /// <param name="output">Receives sixteen transformed values in elements 0 to 15</param>
+        /// <param name="bd">Forwarded to HighbdWrapLow (presumably the sample bit depth)</param>
+        public static void HighbdIadst16(ReadOnlySpan<int> input, Span<int> output, int bd)
+        {
+            long s0, s1, s2, s3, s4, s5, s6, s7, s8;
+            long s9, s10, s11, s12, s13, s14, s15;
+            // Inputs are loaded in permuted order: even x indices count down from 15 by two,
+            // odd x indices count up from 0 by two.
+            int x0 = input[15];
+            int x1 = input[0];
+            int x2 = input[13];
+            int x3 = input[2];
+            int x4 = input[11];
+            int x5 = input[4];
+            int x6 = input[9];
+            int x7 = input[6];
+            int x8 = input[7];
+            int x9 = input[8];
+            int x10 = input[5];
+            int x11 = input[10];
+            int x12 = input[3];
+            int x13 = input[12];
+            int x14 = input[1];
+            int x15 = input[14];
+            // Invalid input: flagged through Debug.Assert and answered with an all-zero output.
+            if (DetectInvalidHighbdInput(input, 16) != 0)
+            {
+                Debug.Assert(false, "invalid highbd txfm input");
+                output.Slice(0, 16).Fill(0);
+                return;
+            }
+
+            // All-zero input: the output is zero as well, so skip the arithmetic.
+            if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0)
+            {
+                output.Slice(0, 16).Fill(0);
+                return;
+            }
+
+            // stage 1
+            // Each (even, odd) pair of x values is combined with a pair of CosPi constants in 64-bit arithmetic.
+            s0 = x0 * (long)CosPi1_64 + x1 * (long)CosPi31_64;
+            s1 = x0 * (long)CosPi31_64 - x1 * (long)CosPi1_64;
+            s2 = x2 * (long)CosPi5_64 + x3 * (long)CosPi27_64;
+            s3 = x2 * (long)CosPi27_64 - x3 * (long)CosPi5_64;
+            s4 = x4 * (long)CosPi9_64 + x5 * (long)CosPi23_64;
+            s5 = x4 * (long)CosPi23_64 - x5 * (long)CosPi9_64;
+            s6 = x6 * (long)CosPi13_64 + x7 * (long)CosPi19_64;
+            s7 = x6 * (long)CosPi19_64 - x7 * (long)CosPi13_64;
+            s8 = x8 * (long)CosPi17_64 + x9 * (long)CosPi15_64;
+            s9 = x8 * (long)CosPi15_64 - x9 * (long)CosPi17_64;
+            s10 = x10 * (long)CosPi21_64 + x11 * (long)CosPi11_64;
+            s11 = x10 * (long)CosPi11_64 - x11 * (long)CosPi21_64;
+            s12 = x12 * (long)CosPi25_64 + x13 * (long)CosPi7_64;
+            s13 = x12 * (long)CosPi7_64 - x13 * (long)CosPi25_64;
+            s14 = x14 * (long)CosPi29_64 + x15 * (long)CosPi3_64;
+            s15 = x14 * (long)CosPi3_64 - x15 * (long)CosPi29_64;
+
+            // Sums of s[n] and s[n + 8] go to x0..x7, the matching differences to x8..x15.
+            x0 = HighbdWrapLow(DctConstRoundShift(s0 + s8), bd);
+            x1 = HighbdWrapLow(DctConstRoundShift(s1 + s9), bd);
+            x2 = HighbdWrapLow(DctConstRoundShift(s2 + s10), bd);
+            x3 = HighbdWrapLow(DctConstRoundShift(s3 + s11), bd);
+            x4 = HighbdWrapLow(DctConstRoundShift(s4 + s12), bd);
+            x5 = HighbdWrapLow(DctConstRoundShift(s5 + s13), bd);
+            x6 = HighbdWrapLow(DctConstRoundShift(s6 + s14), bd);
+            x7 = HighbdWrapLow(DctConstRoundShift(s7 + s15), bd);
+            x8 = HighbdWrapLow(DctConstRoundShift(s0 - s8), bd);
+            x9 = HighbdWrapLow(DctConstRoundShift(s1 - s9), bd);
+            x10 = HighbdWrapLow(DctConstRoundShift(s2 - s10), bd);
+            x11 = HighbdWrapLow(DctConstRoundShift(s3 - s11), bd);
+            x12 = HighbdWrapLow(DctConstRoundShift(s4 - s12), bd);
+            x13 = HighbdWrapLow(DctConstRoundShift(s5 - s13), bd);
+            x14 = HighbdWrapLow(DctConstRoundShift(s6 - s14), bd);
+            x15 = HighbdWrapLow(DctConstRoundShift(s7 - s15), bd);
+
+            // stage 2
+            // x0..x7 pass through unscaled; x8..x15 are multiplied by CosPi constant pairs.
+            s0 = x0;
+            s1 = x1;
+            s2 = x2;
+            s3 = x3;
+            s4 = x4;
+            s5 = x5;
+            s6 = x6;
+            s7 = x7;
+            s8 = x8 * (long)CosPi4_64 + x9 * (long)CosPi28_64;
+            s9 = x8 * (long)CosPi28_64 - x9 * (long)CosPi4_64;
+            s10 = x10 * (long)CosPi20_64 + x11 * (long)CosPi12_64;
+            s11 = x10 * (long)CosPi12_64 - x11 * (long)CosPi20_64;
+            s12 = -x12 * (long)CosPi28_64 + x13 * (long)CosPi4_64;
+            s13 = x12 * (long)CosPi4_64 + x13 * (long)CosPi28_64;
+            s14 = -x14 * (long)CosPi12_64 + x15 * (long)CosPi20_64;
+            s15 = x14 * (long)CosPi20_64 + x15 * (long)CosPi12_64;
+
+            // Only the scaled half (x8..x15) goes through DctConstRoundShift here.
+            x0 = HighbdWrapLow(s0 + s4, bd);
+            x1 = HighbdWrapLow(s1 + s5, bd);
+            x2 = HighbdWrapLow(s2 + s6, bd);
+            x3 = HighbdWrapLow(s3 + s7, bd);
+            x4 = HighbdWrapLow(s0 - s4, bd);
+            x5 = HighbdWrapLow(s1 - s5, bd);
+            x6 = HighbdWrapLow(s2 - s6, bd);
+            x7 = HighbdWrapLow(s3 - s7, bd);
+            x8 = HighbdWrapLow(DctConstRoundShift(s8 + s12), bd);
+            x9 = HighbdWrapLow(DctConstRoundShift(s9 + s13), bd);
+            x10 = HighbdWrapLow(DctConstRoundShift(s10 + s14), bd);
+            x11 = HighbdWrapLow(DctConstRoundShift(s11 + s15), bd);
+            x12 = HighbdWrapLow(DctConstRoundShift(s8 - s12), bd);
+            x13 = HighbdWrapLow(DctConstRoundShift(s9 - s13), bd);
+            x14 = HighbdWrapLow(DctConstRoundShift(s10 - s14), bd);
+            x15 = HighbdWrapLow(DctConstRoundShift(s11 - s15), bd);
+
+            // stage 3
+            // x0..x3 and x8..x11 pass through unscaled; x4..x7 and x12..x15 use the CosPi8_64 / CosPi24_64 pair.
+            s0 = x0;
+            s1 = x1;
+            s2 = x2;
+            s3 = x3;
+            s4 = x4 * (long)CosPi8_64 + x5 * (long)CosPi24_64;
+            s5 = x4 * (long)CosPi24_64 - x5 * (long)CosPi8_64;
+            s6 = -x6 * (long)CosPi24_64 + x7 * (long)CosPi8_64;
+            s7 = x6 * (long)CosPi8_64 + x7 * (long)CosPi24_64;
+            s8 = x8;
+            s9 = x9;
+            s10 = x10;
+            s11 = x11;
+            s12 = x12 * (long)CosPi8_64 + x13 * (long)CosPi24_64;
+            s13 = x12 * (long)CosPi24_64 - x13 * (long)CosPi8_64;
+            s14 = -x14 * (long)CosPi24_64 + x15 * (long)CosPi8_64;
+            s15 = x14 * (long)CosPi8_64 + x15 * (long)CosPi24_64;
+
+            x0 = HighbdWrapLow(s0 + s2, bd);
+            x1 = HighbdWrapLow(s1 + s3, bd);
+            x2 = HighbdWrapLow(s0 - s2, bd);
+            x3 = HighbdWrapLow(s1 - s3, bd);
+            x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
+            x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
+            x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
+            x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
+            x8 = HighbdWrapLow(s8 + s10, bd);
+            x9 = HighbdWrapLow(s9 + s11, bd);
+            x10 = HighbdWrapLow(s8 - s10, bd);
+            x11 = HighbdWrapLow(s9 - s11, bd);
+            x12 = HighbdWrapLow(DctConstRoundShift(s12 + s14), bd);
+            x13 = HighbdWrapLow(DctConstRoundShift(s13 + s15), bd);
+            x14 = HighbdWrapLow(DctConstRoundShift(s12 - s14), bd);
+            x15 = HighbdWrapLow(DctConstRoundShift(s13 - s15), bd);
+
+            // stage 4
+            // Only the pairs (2, 3), (6, 7), (10, 11) and (14, 15) change here; note the differing signs per pair.
+            s2 = (long)(-CosPi16_64) * (x2 + x3);
+            s3 = (long)CosPi16_64 * (x2 - x3);
+            s6 = (long)CosPi16_64 * (x6 + x7);
+            s7 = (long)CosPi16_64 * (-x6 + x7);
+            s10 = (long)CosPi16_64 * (x10 + x11);
+            s11 = (long)CosPi16_64 * (-x10 + x11);
+            s14 = (long)(-CosPi16_64) * (x14 + x15);
+            s15 = (long)CosPi16_64 * (x14 - x15);
+
+            x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
+            x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
+            x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
+            x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
+            x10 = HighbdWrapLow(DctConstRoundShift(s10), bd);
+            x11 = HighbdWrapLow(DctConstRoundShift(s11), bd);
+            x14 = HighbdWrapLow(DctConstRoundShift(s14), bd);
+            x15 = HighbdWrapLow(DctConstRoundShift(s15), bd);
+
+            // Outputs are written in permuted order; outputs 1, 3, 13 and 15 are negated.
+            output[0] = HighbdWrapLow(x0, bd);
+            output[1] = HighbdWrapLow(-x8, bd);
+            output[2] = HighbdWrapLow(x12, bd);
+            output[3] = HighbdWrapLow(-x4, bd);
+            output[4] = HighbdWrapLow(x6, bd);
+            output[5] = HighbdWrapLow(x14, bd);
+            output[6] = HighbdWrapLow(x10, bd);
+            output[7] = HighbdWrapLow(x2, bd);
+            output[8] = HighbdWrapLow(x3, bd);
+            output[9] = HighbdWrapLow(x11, bd);
+            output[10] = HighbdWrapLow(x15, bd);
+            output[11] = HighbdWrapLow(x7, bd);
+            output[12] = HighbdWrapLow(x5, bd);
+            output[13] = HighbdWrapLow(-x13, bd);
+            output[14] = HighbdWrapLow(x9, bd);
+            output[15] = HighbdWrapLow(-x1, bd);
+        }
+
+        /// <summary>
+        /// One-dimensional 16-point transform kernel (judging by the name, an inverse DCT) for the high bit
+        /// depth path, computed in seven stages that alternate between two 16-entry scratch buffers.
+        /// </summary>
+        /// <remarks>
+        /// The first sixteen outputs are zero-filled, and the method returns early, when
+        /// DetectInvalidHighbdInput reports a problem.
+        /// </remarks>
+        /// <param name="input">Source coefficients; elements 0 to 15 are read</param>
+        /// <param name="output">Receives sixteen transformed values in elements 0 to 15</param>
+        /// <param name="bd">Forwarded to HighbdWrapLow (presumably the sample bit depth)</param>
+        public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
+        {
+            Span<int> step1 = stackalloc int[16];
+            Span<int> step2 = stackalloc int[16];
+            long temp1, temp2;
+
+            // Invalid input: flagged through Debug.Assert and answered with an all-zero output.
+            if (DetectInvalidHighbdInput(input, 16) != 0)
+            {
+                Debug.Assert(false, "invalid highbd txfm input");
+                output.Slice(0, 16).Fill(0);
+                return;
+            }
+
+            // stage 1
+            // Input reordering. The indices are written as n / 2 and evaluate to
+            // 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15.
+            step1[0] = input[0 / 2];
+            step1[1] = input[16 / 2];
+            step1[2] = input[8 / 2];
+            step1[3] = input[24 / 2];
+            step1[4] = input[4 / 2];
+            step1[5] = input[20 / 2];
+            step1[6] = input[12 / 2];
+            step1[7] = input[28 / 2];
+            step1[8] = input[2 / 2];
+            step1[9] = input[18 / 2];
+            step1[10] = input[10 / 2];
+            step1[11] = input[26 / 2];
+            step1[12] = input[6 / 2];
+            step1[13] = input[22 / 2];
+            step1[14] = input[14 / 2];
+            step1[15] = input[30 / 2];
+
+            // stage 2
+            // Entries 0..7 are copied through; entries 8..15 are combined in mirrored pairs (n, 23 - n).
+            step2[0] = step1[0];
+            step2[1] = step1[1];
+            step2[2] = step1[2];
+            step2[3] = step1[3];
+            step2[4] = step1[4];
+            step2[5] = step1[5];
+            step2[6] = step1[6];
+            step2[7] = step1[7];
+
+            temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
+            temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
+            step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
+            temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
+            step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
+            temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
+            step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
+            temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
+            step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            // stage 3
+            // Entries 0..3 are copied through; 4..7 are scaled in mirrored pairs; 8..15 are summed and
+            // differenced in adjacent pairs.
+            step1[0] = step2[0];
+            step1[1] = step2[1];
+            step1[2] = step2[2];
+            step1[3] = step2[3];
+
+            temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
+            temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
+            step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
+            temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
+            step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
+            step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
+            step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
+            step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
+            step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
+            step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
+            step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
+            step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
+
+            // stage 4
+            // Entries 0..3 are scaled, 4..7 are summed and differenced, and of 8..15 only
+            // 9, 10, 13 and 14 are scaled (the rest are copied through).
+            temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
+            temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
+            step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
+            temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
+            step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
+            step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
+            step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
+            step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
+
+            step2[8] = step1[8];
+            step2[15] = step1[15];
+            temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
+            temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
+            step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
+            temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
+            step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step2[11] = step1[11];
+            step2[12] = step1[12];
+
+            // stage 5
+            // Sums and differences within 0..3 and within 8..15; of 4..7 only 5 and 6 are scaled.
+            step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
+            step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
+            step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
+            step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
+            step1[4] = step2[4];
+            temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
+            temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
+            step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step1[7] = step2[7];
+
+            step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
+            step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
+            step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
+            step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
+            step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
+            step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
+            step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
+            step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
+
+            // stage 6
+            // Entries 0..7 become mirrored sums and differences; of 8..15 only 10..13 are scaled.
+            step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
+            step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
+            step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
+            step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
+            step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
+            step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
+            step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
+            step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
+            step2[8] = step1[8];
+            step2[9] = step1[9];
+            temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
+            temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
+            step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
+            temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
+            step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step2[14] = step1[14];
+            step2[15] = step1[15];
+
+            // stage 7
+            // Output n is step2[n] + step2[15 - n] for n < 8, and the matching difference for the mirrored index.
+            output[0] = HighbdWrapLow(step2[0] + step2[15], bd);
+            output[1] = HighbdWrapLow(step2[1] + step2[14], bd);
+            output[2] = HighbdWrapLow(step2[2] + step2[13], bd);
+            output[3] = HighbdWrapLow(step2[3] + step2[12], bd);
+            output[4] = HighbdWrapLow(step2[4] + step2[11], bd);
+            output[5] = HighbdWrapLow(step2[5] + step2[10], bd);
+            output[6] = HighbdWrapLow(step2[6] + step2[9], bd);
+            output[7] = HighbdWrapLow(step2[7] + step2[8], bd);
+            output[8] = HighbdWrapLow(step2[7] - step2[8], bd);
+            output[9] = HighbdWrapLow(step2[6] - step2[9], bd);
+            output[10] = HighbdWrapLow(step2[5] - step2[10], bd);
+            output[11] = HighbdWrapLow(step2[4] - step2[11], bd);
+            output[12] = HighbdWrapLow(step2[3] - step2[12], bd);
+            output[13] = HighbdWrapLow(step2[2] - step2[13], bd);
+            output[14] = HighbdWrapLow(step2[1] - step2[14], bd);
+            output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
+        }
+
+        /// <summary>
+        /// Runs the 16x16 high bit depth inverse transform over all 16 coefficient rows and adds
+        /// the rounded result to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order, 16 per row; all 16 rows are read</param>
+        /// <param name="dest">Pixels to update; the pixel at row r, column c is <c>dest[r * stride + c]</c></param>
+        /// <param name="stride">Distance, in elements, between the start of consecutive rows of <paramref name="dest"/></param>
+        /// <param name="bd">Value forwarded to the 1-D transform and to the pixel clipping helper (presumably the bit depth)</param>
+        public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            const int Size = 16;
+
+            Span<int> rowPass = stackalloc int[Size * Size];
+            Span<int> column = stackalloc int[Size];
+            Span<int> columnResult = stackalloc int[Size];
+
+            // Pass 1: run the 1-D transform over each coefficient row.
+            for (int row = 0; row < Size; row++)
+            {
+                HighbdIdct16(input.Slice(row * Size), rowPass.Slice(row * Size), bd);
+            }
+
+            // Pass 2: run the 1-D transform over each column of the intermediate block.
+            for (int col = 0; col < Size; col++)
+            {
+                // Gather the column into a contiguous buffer, since the 1-D transform reads consecutive elements.
+                for (int row = 0; row < Size; row++)
+                {
+                    column[row] = rowPass[row * Size + col];
+                }
+
+                HighbdIdct16(column, columnResult, bd);
+
+                // Round away the 6 extra bits and accumulate onto the existing pixels.
+                for (int row = 0; row < Size; row++)
+                {
+                    int pixelIndex = row * stride + col;
+                    dest[pixelIndex] = HighbdClipPixelAdd(dest[pixelIndex], BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Applies the 16x16 high bit depth inverse transform to a block whose non-zero coefficients
+        /// all lie in the upper-left 8x8 area, and adds the rounded result to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order, 16 per row; only the first 8 rows are read</param>
+        /// <param name="dest">Pixels to update; the pixel at row r, column c is <c>dest[r * stride + c]</c></param>
+        /// <param name="stride">Distance, in elements, between the start of consecutive rows of <paramref name="dest"/></param>
+        /// <param name="bd">Value forwarded to the 1-D transform and to the pixel clipping helper (presumably the bit depth)</param>
+        public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            int i, j;
+            Span<int> output = stackalloc int[16 * 16];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[16];
+            Span<int> tempOut = stackalloc int[16];
+
+            // Rows 8..15 are skipped by the row pass below but are still read by the column pass.
+            // The contents of stackalloc memory are not guaranteed to be zero, so clear them explicitly.
+            output.Slice(8 * 16).Clear();
+
+            // First transform rows. Since all non-zero dct coefficients are in
+            // upper-left 8x8 area, we only need to calculate first 8 rows here.
+            for (i = 0; i < 8; ++i)
+            {
+                HighbdIdct16(input, outptr, bd);
+                input = input.Slice(16);
+                outptr = outptr.Slice(16);
+            }
+
+            // Then transform columns
+            for (i = 0; i < 16; ++i)
+            {
+                for (j = 0; j < 16; ++j)
+                {
+                    tempIn[j] = output[j * 16 + i];
+                }
+
+                HighbdIdct16(tempIn, tempOut, bd);
+                for (j = 0; j < 16; ++j)
+                {
+                    // Index from the block origin instead of re-slicing after every row: slicing past
+                    // the last row throws when dest ends right after the final pixel of the block.
+                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Applies the 16x16 high bit depth inverse transform to a block whose non-zero coefficients
+        /// all lie in the upper-left 4x4 area, and adds the rounded result to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order, 16 per row; only the first 4 rows are read</param>
+        /// <param name="dest">Pixels to update; the pixel at row r, column c is <c>dest[r * stride + c]</c></param>
+        /// <param name="stride">Distance, in elements, between the start of consecutive rows of <paramref name="dest"/></param>
+        /// <param name="bd">Value forwarded to the 1-D transform and to the pixel clipping helper (presumably the bit depth)</param>
+        public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            int i, j;
+            Span<int> output = stackalloc int[16 * 16];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[16];
+            Span<int> tempOut = stackalloc int[16];
+
+            // Rows 4..15 are skipped by the row pass below but are still read by the column pass.
+            // The contents of stackalloc memory are not guaranteed to be zero, so clear them explicitly.
+            output.Slice(4 * 16).Clear();
+
+            // First transform rows. Since all non-zero dct coefficients are in
+            // upper-left 4x4 area, we only need to calculate first 4 rows here.
+            for (i = 0; i < 4; ++i)
+            {
+                HighbdIdct16(input, outptr, bd);
+                input = input.Slice(16);
+                outptr = outptr.Slice(16);
+            }
+
+            // Then transform columns
+            for (i = 0; i < 16; ++i)
+            {
+                for (j = 0; j < 16; ++j)
+                {
+                    tempIn[j] = output[j * 16 + i];
+                }
+
+                HighbdIdct16(tempIn, tempOut, bd);
+                for (j = 0; j < 16; ++j)
+                {
+                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// 16x16 high bit depth inverse transform for a block where only <c>input[0]</c> is used:
+        /// a single value is derived from it and added to every pixel of the 16x16 area.
+        /// </summary>
+        /// <param name="input">Coefficients; only element 0 is read</param>
+        /// <param name="dest">Pixels to update; the pixel at row r, column c is <c>dest[r * stride + c]</c></param>
+        /// <param name="stride">Distance, in elements, between the start of consecutive rows of <paramref name="dest"/></param>
+        /// <param name="bd">Value forwarded to the wrapping and pixel clipping helpers (presumably the bit depth)</param>
+        public static void HighbdIdct16x161Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            int i, j;
+            long a1;
+
+            // The CosPi16_64 scaling is applied twice, once per transform dimension.
+            int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
+
+            output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
+            a1 = BitUtils.RoundPowerOfTwo(output, 6);
+            for (j = 0; j < 16; ++j)
+            {
+                for (i = 0; i < 16; ++i)
+                {
+                    // Index from the block origin instead of advancing dest by one stride per row:
+                    // slicing after the last row throws when dest ends right after the final pixel.
+                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], a1, bd);
+                }
+            }
+        }
+
+        public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd)
+        { // 1-D 32-point inverse DCT (per the method name): reads input[0..31] and writes output[0..31].
+            Span<int> step1 = stackalloc int[32]; // step1 and step2 are scratch buffers written alternately, one per stage.
+            Span<int> step2 = stackalloc int[32];
+            long temp1, temp2; // 64-bit accumulators for the value * cosine-constant products.
+
+            if (DetectInvalidHighbdInput(input, 32) != 0) // Helper is defined elsewhere; a non-zero result rejects the input.
+            {
+                Debug.Assert(false, "invalid highbd txfm input");
+                output.Slice(0, 32).Fill(0); // Rejected input yields an all-zero result instead of a transform.
+                return;
+            }
+
+            // stage 1: even-indexed inputs are copied into step1[0..15] in permuted order; odd-indexed inputs are combined in pairs (temp1 = a * c1 - b * c2, temp2 = a * c2 + b * c1) into step1[16..31].
+            step1[0] = input[0];
+            step1[1] = input[16];
+            step1[2] = input[8];
+            step1[3] = input[24];
+            step1[4] = input[4];
+            step1[5] = input[20];
+            step1[6] = input[12];
+            step1[7] = input[28];
+            step1[8] = input[2];
+            step1[9] = input[18];
+            step1[10] = input[10];
+            step1[11] = input[26];
+            step1[12] = input[6];
+            step1[13] = input[22];
+            step1[14] = input[14];
+            step1[15] = input[30];
+
+            temp1 = input[1] * (long)CosPi31_64 - input[31] * (long)CosPi1_64;
+            temp2 = input[1] * (long)CosPi1_64 + input[31] * (long)CosPi31_64;
+            step1[16] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[31] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = input[17] * (long)CosPi15_64 - input[15] * (long)CosPi17_64;
+            temp2 = input[17] * (long)CosPi17_64 + input[15] * (long)CosPi15_64;
+            step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = input[9] * (long)CosPi23_64 - input[23] * (long)CosPi9_64;
+            temp2 = input[9] * (long)CosPi9_64 + input[23] * (long)CosPi23_64;
+            step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = input[25] * (long)CosPi7_64 - input[7] * (long)CosPi25_64;
+            temp2 = input[25] * (long)CosPi25_64 + input[7] * (long)CosPi7_64;
+            step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = input[5] * (long)CosPi27_64 - input[27] * (long)CosPi5_64;
+            temp2 = input[5] * (long)CosPi5_64 + input[27] * (long)CosPi27_64;
+            step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = input[21] * (long)CosPi11_64 - input[11] * (long)CosPi21_64;
+            temp2 = input[21] * (long)CosPi21_64 + input[11] * (long)CosPi11_64;
+            step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = input[13] * (long)CosPi19_64 - input[19] * (long)CosPi13_64;
+            temp2 = input[13] * (long)CosPi13_64 + input[19] * (long)CosPi19_64;
+            step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = input[29] * (long)CosPi3_64 - input[3] * (long)CosPi29_64;
+            temp2 = input[29] * (long)CosPi29_64 + input[3] * (long)CosPi3_64;
+            step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            // stage 2: step2[0..7] are copied; step2[8..15] are pairwise constant products of step1[8..15]; step2[16..31] are sums/differences of adjacent step1 pairs.
+            step2[0] = step1[0];
+            step2[1] = step1[1];
+            step2[2] = step1[2];
+            step2[3] = step1[3];
+            step2[4] = step1[4];
+            step2[5] = step1[5];
+            step2[6] = step1[6];
+            step2[7] = step1[7];
+
+            temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
+            temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
+            step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
+            temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
+            step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
+            temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
+            step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
+            temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
+            step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            step2[16] = HighbdWrapLow(step1[16] + step1[17], bd);
+            step2[17] = HighbdWrapLow(step1[16] - step1[17], bd);
+            step2[18] = HighbdWrapLow(-step1[18] + step1[19], bd);
+            step2[19] = HighbdWrapLow(step1[18] + step1[19], bd);
+            step2[20] = HighbdWrapLow(step1[20] + step1[21], bd);
+            step2[21] = HighbdWrapLow(step1[20] - step1[21], bd);
+            step2[22] = HighbdWrapLow(-step1[22] + step1[23], bd);
+            step2[23] = HighbdWrapLow(step1[22] + step1[23], bd);
+            step2[24] = HighbdWrapLow(step1[24] + step1[25], bd);
+            step2[25] = HighbdWrapLow(step1[24] - step1[25], bd);
+            step2[26] = HighbdWrapLow(-step1[26] + step1[27], bd);
+            step2[27] = HighbdWrapLow(step1[26] + step1[27], bd);
+            step2[28] = HighbdWrapLow(step1[28] + step1[29], bd);
+            step2[29] = HighbdWrapLow(step1[28] - step1[29], bd);
+            step2[30] = HighbdWrapLow(-step1[30] + step1[31], bd);
+            step2[31] = HighbdWrapLow(step1[30] + step1[31], bd);
+
+            // stage 3: step1[0..3] are copied; step1[4..7] are pairwise constant products; step1[8..15] are sums/differences of adjacent pairs; within 16..31, pairs (17,30), (18,29), (21,26), (22,25) are recombined with constants and the rest are copied.
+            step1[0] = step2[0];
+            step1[1] = step2[1];
+            step1[2] = step2[2];
+            step1[3] = step2[3];
+
+            temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
+            temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
+            step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
+            temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
+            step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+
+            step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
+            step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
+            step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
+            step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
+            step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
+            step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
+            step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
+            step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
+
+            step1[16] = step2[16];
+            step1[31] = step2[31];
+            temp1 = -step2[17] * (long)CosPi4_64 + step2[30] * (long)CosPi28_64;
+            temp2 = step2[17] * (long)CosPi28_64 + step2[30] * (long)CosPi4_64;
+            step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = -step2[18] * (long)CosPi28_64 - step2[29] * (long)CosPi4_64;
+            temp2 = -step2[18] * (long)CosPi4_64 + step2[29] * (long)CosPi28_64;
+            step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step1[19] = step2[19];
+            step1[20] = step2[20];
+            temp1 = -step2[21] * (long)CosPi20_64 + step2[26] * (long)CosPi12_64;
+            temp2 = step2[21] * (long)CosPi12_64 + step2[26] * (long)CosPi20_64;
+            step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = -step2[22] * (long)CosPi12_64 - step2[25] * (long)CosPi20_64;
+            temp2 = -step2[22] * (long)CosPi20_64 + step2[25] * (long)CosPi12_64;
+            step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step1[23] = step2[23];
+            step1[24] = step2[24];
+            step1[27] = step2[27];
+            step1[28] = step2[28];
+
+            // stage 4: pairs (0,1), (2,3), (9,14), (10,13) use constant products; 4..7 and 16..31 are sums/differences; 8, 11, 12, 15 are copied.
+            temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
+            temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
+            step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
+            temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
+            step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
+            step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
+            step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
+            step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
+
+            step2[8] = step1[8];
+            step2[15] = step1[15];
+            temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
+            temp2 =  step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
+            step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
+            temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
+            step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step2[11] = step1[11];
+            step2[12] = step1[12];
+
+            step2[16] = HighbdWrapLow(step1[16] + step1[19], bd);
+            step2[17] = HighbdWrapLow(step1[17] + step1[18], bd);
+            step2[18] = HighbdWrapLow(step1[17] - step1[18], bd);
+            step2[19] = HighbdWrapLow(step1[16] - step1[19], bd);
+            step2[20] = HighbdWrapLow(-step1[20] + step1[23], bd);
+            step2[21] = HighbdWrapLow(-step1[21] + step1[22], bd);
+            step2[22] = HighbdWrapLow(step1[21] + step1[22], bd);
+            step2[23] = HighbdWrapLow(step1[20] + step1[23], bd);
+
+            step2[24] = HighbdWrapLow(step1[24] + step1[27], bd);
+            step2[25] = HighbdWrapLow(step1[25] + step1[26], bd);
+            step2[26] = HighbdWrapLow(step1[25] - step1[26], bd);
+            step2[27] = HighbdWrapLow(step1[24] - step1[27], bd);
+            step2[28] = HighbdWrapLow(-step1[28] + step1[31], bd);
+            step2[29] = HighbdWrapLow(-step1[29] + step1[30], bd);
+            step2[30] = HighbdWrapLow(step1[29] + step1[30], bd);
+            step2[31] = HighbdWrapLow(step1[28] + step1[31], bd);
+
+            // stage 5: 0..3 and 8..15 are sums/differences; pairs (5,6), (18,29), (19,28), (20,27), (21,26) use constant products; 4, 7, 16, 17, 22..25, 30, 31 are copied.
+            step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
+            step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
+            step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
+            step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
+            step1[4] = step2[4];
+            temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
+            temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
+            step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step1[7] = step2[7];
+
+            step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
+            step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
+            step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
+            step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
+            step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
+            step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
+            step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
+            step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
+
+            step1[16] = step2[16];
+            step1[17] = step2[17];
+            temp1 = -step2[18] * (long)CosPi8_64 + step2[29] * (long)CosPi24_64;
+            temp2 = step2[18] * (long)CosPi24_64 + step2[29] * (long)CosPi8_64;
+            step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = -step2[19] * (long)CosPi8_64 + step2[28] * (long)CosPi24_64;
+            temp2 = step2[19] * (long)CosPi24_64 + step2[28] * (long)CosPi8_64;
+            step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = -step2[20] * (long)CosPi24_64 - step2[27] * (long)CosPi8_64;
+            temp2 = -step2[20] * (long)CosPi8_64 + step2[27] * (long)CosPi24_64;
+            step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = -step2[21] * (long)CosPi24_64 - step2[26] * (long)CosPi8_64;
+            temp2 = -step2[21] * (long)CosPi8_64 + step2[26] * (long)CosPi24_64;
+            step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step1[22] = step2[22];
+            step1[23] = step2[23];
+            step1[24] = step2[24];
+            step1[25] = step2[25];
+            step1[30] = step2[30];
+            step1[31] = step2[31];
+
+            // stage 6: 0..7 and 16..31 are sums/differences; pairs (10,13), (11,12) are scaled by CosPi16_64; 8, 9, 14, 15 are copied.
+            step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
+            step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
+            step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
+            step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
+            step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
+            step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
+            step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
+            step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
+            step2[8] = step1[8];
+            step2[9] = step1[9];
+            temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
+            temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
+            step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
+            temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
+            step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step2[14] = step1[14];
+            step2[15] = step1[15];
+
+            step2[16] = HighbdWrapLow(step1[16] + step1[23], bd);
+            step2[17] = HighbdWrapLow(step1[17] + step1[22], bd);
+            step2[18] = HighbdWrapLow(step1[18] + step1[21], bd);
+            step2[19] = HighbdWrapLow(step1[19] + step1[20], bd);
+            step2[20] = HighbdWrapLow(step1[19] - step1[20], bd);
+            step2[21] = HighbdWrapLow(step1[18] - step1[21], bd);
+            step2[22] = HighbdWrapLow(step1[17] - step1[22], bd);
+            step2[23] = HighbdWrapLow(step1[16] - step1[23], bd);
+
+            step2[24] = HighbdWrapLow(-step1[24] + step1[31], bd);
+            step2[25] = HighbdWrapLow(-step1[25] + step1[30], bd);
+            step2[26] = HighbdWrapLow(-step1[26] + step1[29], bd);
+            step2[27] = HighbdWrapLow(-step1[27] + step1[28], bd);
+            step2[28] = HighbdWrapLow(step1[27] + step1[28], bd);
+            step2[29] = HighbdWrapLow(step1[26] + step1[29], bd);
+            step2[30] = HighbdWrapLow(step1[25] + step1[30], bd);
+            step2[31] = HighbdWrapLow(step1[24] + step1[31], bd);
+
+            // stage 7: step1[k] and step1[15 - k] (k = 0..7) are the sum and difference of step2[k] and step2[15 - k]; pairs (20,27), (21,26), (22,25), (23,24) are scaled by CosPi16_64; 16..19 and 28..31 are copied.
+            step1[0] = HighbdWrapLow(step2[0] + step2[15], bd);
+            step1[1] = HighbdWrapLow(step2[1] + step2[14], bd);
+            step1[2] = HighbdWrapLow(step2[2] + step2[13], bd);
+            step1[3] = HighbdWrapLow(step2[3] + step2[12], bd);
+            step1[4] = HighbdWrapLow(step2[4] + step2[11], bd);
+            step1[5] = HighbdWrapLow(step2[5] + step2[10], bd);
+            step1[6] = HighbdWrapLow(step2[6] + step2[9], bd);
+            step1[7] = HighbdWrapLow(step2[7] + step2[8], bd);
+            step1[8] = HighbdWrapLow(step2[7] - step2[8], bd);
+            step1[9] = HighbdWrapLow(step2[6] - step2[9], bd);
+            step1[10] = HighbdWrapLow(step2[5] - step2[10], bd);
+            step1[11] = HighbdWrapLow(step2[4] - step2[11], bd);
+            step1[12] = HighbdWrapLow(step2[3] - step2[12], bd);
+            step1[13] = HighbdWrapLow(step2[2] - step2[13], bd);
+            step1[14] = HighbdWrapLow(step2[1] - step2[14], bd);
+            step1[15] = HighbdWrapLow(step2[0] - step2[15], bd);
+
+            step1[16] = step2[16];
+            step1[17] = step2[17];
+            step1[18] = step2[18];
+            step1[19] = step2[19];
+            temp1 = (-step2[20] + step2[27]) * (long)CosPi16_64;
+            temp2 = (step2[20] + step2[27]) * (long)CosPi16_64;
+            step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = (-step2[21] + step2[26]) * (long)CosPi16_64;
+            temp2 = (step2[21] + step2[26]) * (long)CosPi16_64;
+            step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = (-step2[22] + step2[25]) * (long)CosPi16_64;
+            temp2 = (step2[22] + step2[25]) * (long)CosPi16_64;
+            step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            temp1 = (-step2[23] + step2[24]) * (long)CosPi16_64;
+            temp2 = (step2[23] + step2[24]) * (long)CosPi16_64;
+            step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
+            step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
+            step1[28] = step2[28];
+            step1[29] = step2[29];
+            step1[30] = step2[30];
+            step1[31] = step2[31];
+
+            // final stage: output[k] = step1[k] + step1[31 - k] and output[31 - k] = step1[k] - step1[31 - k] for k = 0..15, each passed through HighbdWrapLow.
+            output[0] = HighbdWrapLow(step1[0] + step1[31], bd);
+            output[1] = HighbdWrapLow(step1[1] + step1[30], bd);
+            output[2] = HighbdWrapLow(step1[2] + step1[29], bd);
+            output[3] = HighbdWrapLow(step1[3] + step1[28], bd);
+            output[4] = HighbdWrapLow(step1[4] + step1[27], bd);
+            output[5] = HighbdWrapLow(step1[5] + step1[26], bd);
+            output[6] = HighbdWrapLow(step1[6] + step1[25], bd);
+            output[7] = HighbdWrapLow(step1[7] + step1[24], bd);
+            output[8] = HighbdWrapLow(step1[8] + step1[23], bd);
+            output[9] = HighbdWrapLow(step1[9] + step1[22], bd);
+            output[10] = HighbdWrapLow(step1[10] + step1[21], bd);
+            output[11] = HighbdWrapLow(step1[11] + step1[20], bd);
+            output[12] = HighbdWrapLow(step1[12] + step1[19], bd);
+            output[13] = HighbdWrapLow(step1[13] + step1[18], bd);
+            output[14] = HighbdWrapLow(step1[14] + step1[17], bd);
+            output[15] = HighbdWrapLow(step1[15] + step1[16], bd);
+            output[16] = HighbdWrapLow(step1[15] - step1[16], bd);
+            output[17] = HighbdWrapLow(step1[14] - step1[17], bd);
+            output[18] = HighbdWrapLow(step1[13] - step1[18], bd);
+            output[19] = HighbdWrapLow(step1[12] - step1[19], bd);
+            output[20] = HighbdWrapLow(step1[11] - step1[20], bd);
+            output[21] = HighbdWrapLow(step1[10] - step1[21], bd);
+            output[22] = HighbdWrapLow(step1[9] - step1[22], bd);
+            output[23] = HighbdWrapLow(step1[8] - step1[23], bd);
+            output[24] = HighbdWrapLow(step1[7] - step1[24], bd);
+            output[25] = HighbdWrapLow(step1[6] - step1[25], bd);
+            output[26] = HighbdWrapLow(step1[5] - step1[26], bd);
+            output[27] = HighbdWrapLow(step1[4] - step1[27], bd);
+            output[28] = HighbdWrapLow(step1[3] - step1[28], bd);
+            output[29] = HighbdWrapLow(step1[2] - step1[29], bd);
+            output[30] = HighbdWrapLow(step1[1] - step1[30], bd);
+            output[31] = HighbdWrapLow(step1[0] - step1[31], bd);
+        }
+
+        /// <summary>
+        /// Runs the 32x32 high bit depth inverse transform over all 32 coefficient rows and adds
+        /// the rounded result to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order, 32 per row; all 32 rows are read</param>
+        /// <param name="dest">Pixels to update; the pixel at row r, column c is <c>dest[r * stride + c]</c></param>
+        /// <param name="stride">Distance, in elements, between the start of consecutive rows of <paramref name="dest"/></param>
+        /// <param name="bd">Value forwarded to the 1-D transform and to the pixel clipping helper (presumably the bit depth)</param>
+        public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            const int Size = 32;
+
+            Span<int> rowPass = stackalloc int[Size * Size];
+            Span<int> column = stackalloc int[Size];
+            Span<int> columnResult = stackalloc int[Size];
+
+            // Rows
+            for (int row = 0; row < Size; row++)
+            {
+                ReadOnlySpan<int> rowCoeffs = input.Slice(row * Size);
+
+                // OR the whole row together: the result is zero only when every coefficient is zero.
+                int anyBitSet = 0;
+                for (int k = 0; k < Size; k++)
+                {
+                    anyBitSet |= rowCoeffs[k];
+                }
+
+                if (anyBitSet == 0)
+                {
+                    // An all-zero row is written as zeros without running the transform.
+                    rowPass.Slice(row * Size, Size).Clear();
+                }
+                else
+                {
+                    HighbdIdct32(rowCoeffs, rowPass.Slice(row * Size), bd);
+                }
+            }
+
+            // Columns
+            for (int col = 0; col < Size; col++)
+            {
+                // Gather the column into a contiguous buffer, since the 1-D transform reads consecutive elements.
+                for (int row = 0; row < Size; row++)
+                {
+                    column[row] = rowPass[row * Size + col];
+                }
+
+                HighbdIdct32(column, columnResult, bd);
+
+                // Round away the 6 extra bits and accumulate onto the existing pixels.
+                for (int row = 0; row < Size; row++)
+                {
+                    int pixelIndex = row * stride + col;
+                    dest[pixelIndex] = HighbdClipPixelAdd(dest[pixelIndex], BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Applies the 32x32 high bit depth inverse transform to a block whose non-zero coefficients
+        /// all lie in the upper-left 16x16 area, and adds the rounded result to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order, 32 per row; only the first 16 rows are read</param>
+        /// <param name="dest">Pixels to update; the pixel at row r, column c is <c>dest[r * stride + c]</c></param>
+        /// <param name="stride">Distance, in elements, between the start of consecutive rows of <paramref name="dest"/></param>
+        /// <param name="bd">Value forwarded to the 1-D transform and to the pixel clipping helper (presumably the bit depth)</param>
+        public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            int i, j;
+            Span<int> output = stackalloc int[32 * 32];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[32];
+            Span<int> tempOut = stackalloc int[32];
+
+            // Rows 16..31 are skipped by the row pass below but are still read by the column pass.
+            // The contents of stackalloc memory are not guaranteed to be zero, so clear them explicitly.
+            output.Slice(16 * 32).Clear();
+
+            // Rows
+            // Only upper-left 16x16 has non-zero coeff
+            for (i = 0; i < 16; ++i)
+            {
+                HighbdIdct32(input, outptr, bd);
+                input = input.Slice(32);
+                outptr = outptr.Slice(32);
+            }
+
+            // Columns
+            for (i = 0; i < 32; ++i)
+            {
+                for (j = 0; j < 32; ++j)
+                {
+                    tempIn[j] = output[j * 32 + i];
+                }
+
+                HighbdIdct32(tempIn, tempOut, bd);
+                for (j = 0; j < 32; ++j)
+                {
+                    // Index from the block origin instead of re-slicing after every row: slicing past
+                    // the last row throws when dest ends right after the final pixel of the block.
+                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Applies the 32x32 high bit depth inverse transform to a block whose non-zero coefficients
+        /// all lie in the upper-left 8x8 area, and adds the rounded result to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">Coefficients in row-major order, 32 per row; only the first 8 rows are read</param>
+        /// <param name="dest">Pixels to update; the pixel at row r, column c is <c>dest[r * stride + c]</c></param>
+        /// <param name="stride">Distance, in elements, between the start of consecutive rows of <paramref name="dest"/></param>
+        /// <param name="bd">Value forwarded to the 1-D transform and to the pixel clipping helper (presumably the bit depth)</param>
+        public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            int i, j;
+            Span<int> output = stackalloc int[32 * 32];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[32];
+            Span<int> tempOut = stackalloc int[32];
+
+            // Rows 8..31 are skipped by the row pass below but are still read by the column pass.
+            // The contents of stackalloc memory are not guaranteed to be zero, so clear them explicitly.
+            output.Slice(8 * 32).Clear();
+
+            // Rows
+            // Only upper-left 8x8 has non-zero coeff
+            for (i = 0; i < 8; ++i)
+            {
+                HighbdIdct32(input, outptr, bd);
+                input = input.Slice(32);
+                outptr = outptr.Slice(32);
+            }
+
+            // Columns
+            for (i = 0; i < 32; ++i)
+            {
+                for (j = 0; j < 32; ++j)
+                {
+                    tempIn[j] = output[j * 32 + i];
+                }
+
+                HighbdIdct32(tempIn, tempOut, bd);
+                for (j = 0; j < 32; ++j)
+                {
+                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// 32x32 high bit depth inverse transform for a block where only <c>input[0]</c> is used:
+        /// a single value is derived from it and added to every pixel of the 32x32 area.
+        /// </summary>
+        /// <param name="input">Coefficients; only element 0 is read</param>
+        /// <param name="dest">Pixels to update; the pixel at row r, column c is <c>dest[r * stride + c]</c></param>
+        /// <param name="stride">Distance, in elements, between the start of consecutive rows of <paramref name="dest"/></param>
+        /// <param name="bd">Value forwarded to the wrapping and pixel clipping helpers (presumably the bit depth)</param>
+        public static void HighbdIdct32x321Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
+        {
+            int i, j;
+            int a1;
+
+            // The CosPi16_64 scaling is applied twice, once per transform dimension.
+            int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
+
+            output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
+            a1 = BitUtils.RoundPowerOfTwo(output, 6);
+
+            for (j = 0; j < 32; ++j)
+            {
+                for (i = 0; i < 32; ++i)
+                {
+                    // Index from the block origin instead of advancing dest by one stride per row:
+                    // slicing after the last row throws when dest ends right after the final pixel.
+                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], a1, bd);
+                }
+            }
+        }
+    }
+}

+ 73 - 0
Ryujinx.Graphics.Nvdec.Vp9/Dsp/Prob.cs

@@ -0,0 +1,73 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using System;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+    /// <summary>
+    /// Helpers for deriving and merging 8-bit probabilities from symbol counts.
+    /// </summary>
+    internal static class Prob
+    {
+        public const int MaxProb = 255;
+
+        private const int ModeMvCountSat = 20;
+
+        // MODE_MV_MAX_UPDATE_FACTOR (128) * count / MODE_MV_COUNT_SAT;
+        private static readonly uint[] CountToUpdateFactor = new uint[]
+        {
+            0,  6,  12, 19, 25, 32,  38,  44,  51,  57, 64,
+            70, 76, 83, 89, 96, 102, 108, 115, 121, 128
+        };
+
+        /// <summary>
+        /// Computes round(256 * num / den) and clamps the result as described by the inline formula below.
+        /// </summary>
+        private static byte GetProb(uint num, uint den)
+        {
+            Debug.Assert(den != 0);
+
+            ulong scaled = (ulong)num * 256 + (den >> 1);
+            int p = (int)(scaled / den);
+
+            // Branch-light form of: (p > 255) ? 255 : (p < 1) ? 1 : p;
+            int highMask = (255 - p) >> 23;  // Negative (sign bits set) once p exceeds 255
+            int lowBit = p == 0 ? 1 : 0;     // Lifts a zero result up to 1
+            int clippedProb = p | highMask | lowBit;
+
+            return (byte)clippedProb;
+        }
+
+        /* This function assumes prob1 and prob2 are already within [1,255] range. */
+        /// <summary>
+        /// Blends two probabilities: <paramref name="prob1"/> weighted by (256 - factor) and
+        /// <paramref name="prob2"/> weighted by factor, rounded back down by 8 bits.
+        /// </summary>
+        public static byte WeightedProb(int prob1, int prob2, int factor)
+        {
+            int blended = prob1 * (256 - factor) + prob2 * factor;
+
+            return (byte)BitUtils.RoundPowerOfTwo(blended, 8);
+        }
+
+        /// <summary>
+        /// Moves <paramref name="preProb"/> towards the probability observed in the counts.
+        /// The update weight comes from a table indexed by the total count, saturated at 20.
+        /// </summary>
+        /// <returns><paramref name="preProb"/> unchanged when both counts are zero, the merged probability otherwise</returns>
+        public static byte ModeMvMergeProbs(byte preProb, uint ct0, uint ct1)
+        {
+            uint total = ct0 + ct1;
+            if (total == 0)
+            {
+                // Nothing was observed, keep the previous probability.
+                return preProb;
+            }
+
+            uint saturated = Math.Min(total, ModeMvCountSat);
+            uint updateFactor = CountToUpdateFactor[(int)saturated];
+            byte observed = GetProb(ct0, total);
+
+            return WeightedProb(preProb, observed, (int)updateFactor);
+        }
+
+        /// <summary>
+        /// Recursively accumulates the counts below the tree node at index <paramref name="i"/> and
+        /// writes the merged probability for that node into <paramref name="probs"/>.
+        /// </summary>
+        /// <returns>Sum of the counts of both branches of the node</returns>
+        private static uint TreeMergeProbsImpl(
+            uint i,
+            sbyte[] tree,
+            ReadOnlySpan<byte> preProbs,
+            ReadOnlySpan<uint> counts,
+            Span<byte> probs)
+        {
+            // Entries that are <= 0 are leaves (negated index into counts); positive entries are child nodes.
+            int left = tree[i];
+            uint leftCount;
+            if (left <= 0)
+            {
+                leftCount = counts[-left];
+            }
+            else
+            {
+                leftCount = TreeMergeProbsImpl((uint)left, tree, preProbs, counts, probs);
+            }
+
+            int right = tree[i + 1];
+            uint rightCount;
+            if (right <= 0)
+            {
+                rightCount = counts[-right];
+            }
+            else
+            {
+                rightCount = TreeMergeProbsImpl((uint)right, tree, preProbs, counts, probs);
+            }
+
+            // Each node occupies two tree entries, so its probability slot is i / 2.
+            int slot = (int)(i >> 1);
+            probs[slot] = ModeMvMergeProbs(preProbs[slot], leftCount, rightCount);
+
+            return leftCount + rightCount;
+        }
+
+        /// <summary>
+        /// Merges the probabilities of a whole tree, starting at its root (index 0).
+        /// </summary>
+        public static void TreeMergeProbs(sbyte[] tree, ReadOnlySpan<byte> preProbs, ReadOnlySpan<uint> counts, Span<byte> probs)
+            => TreeMergeProbsImpl(0, tree, preProbs, counts, probs);
+    }
+}

+ 237 - 0
Ryujinx.Graphics.Nvdec.Vp9/Dsp/Reader.cs

@@ -0,0 +1,237 @@
+using System;
+using System.Buffers.Binary;
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+    /// <summary>
+    /// Boolean (range) decoder that reads bits from a byte buffer through a 64-bit value window.
+    /// </summary>
+    internal struct Reader
+    {
+        // Shift needed to renormalize a given range value after a symbol has been decoded.
+        private static readonly byte[] Norm = new byte[]
+        {
+            0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+            3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+        };
+
+        // Width of the value window, in bits.
+        private const int BdValueSize = sizeof(ulong) * 8;
+
+        // This is meant to be a large, positive constant that can still be efficiently
+        // loaded as an immediate (on platforms like ARM, for example).
+        // Even relatively modest values like 100 would work fine.
+        private const int LotsOfBits = 0x40000000;
+
+        public ulong Value;
+        public uint Range;
+        public int Count;
+        private ArrayPtr<byte> _buffer;
+
+        /// <summary>
+        /// Points the reader at a new buffer, primes the value window and consumes the marker bit.
+        /// </summary>
+        /// <param name="buffer">Start of the coded data</param>
+        /// <param name="size">Number of bytes available at <paramref name="buffer"/></param>
+        /// <returns>True when the buffer is null with a non-zero size, or when the marker bit is set</returns>
+        public bool Init(ArrayPtr<byte> buffer, int size)
+        {
+            if (size != 0 && buffer.IsNull)
+            {
+                return true;
+            }
+
+            _buffer = new ArrayPtr<byte>(ref buffer[0], size);
+            Value = 0;
+            Count = -8;
+            Range = 255;
+            Fill();
+
+            // Marker bit
+            return ReadBit() != 0;
+        }
+
+        /// <summary>
+        /// Refills the value window from the buffer and advances the buffer past the consumed bytes.
+        /// </summary>
+        private void Fill()
+        {
+            ReadOnlySpan<byte> start = _buffer.ToSpan();
+            ReadOnlySpan<byte> cursor = start;
+            ulong window = Value;
+            int bitCount = Count;
+            ulong bytesLeft = (ulong)cursor.Length;
+            ulong bitsLeft = bytesLeft * 8;
+            int shift = BdValueSize - 8 - (bitCount + 8);
+
+            if (bitsLeft > BdValueSize)
+            {
+                // Fast path: more than a full window remains, so take whole bytes from one 64-bit big endian load.
+                int bits = (shift & unchecked((int)0xfffffff8)) + 8;
+                ulong nv = BinaryPrimitives.ReadUInt64BigEndian(cursor) >> (BdValueSize - bits);
+
+                bitCount += bits;
+                cursor = cursor.Slice(bits >> 3);
+                window = Value | (nv << (shift & 0x7));
+            }
+            else
+            {
+                // Slow path: near the end of the data, feed the window one byte at a time.
+                int bitsOver = shift + 8 - (int)bitsLeft;
+                int loopEnd = 0;
+
+                if (bitsOver >= 0)
+                {
+                    // The data runs out during this refill; flag that by inflating the count.
+                    bitCount += LotsOfBits;
+                    loopEnd = bitsOver;
+                }
+
+                if (bitsOver < 0 || bitsLeft != 0)
+                {
+                    for (; shift >= loopEnd; shift -= 8)
+                    {
+                        bitCount += 8;
+                        window |= (ulong)cursor[0] << shift;
+                        cursor = cursor.Slice(1);
+                    }
+                }
+            }
+
+            // NOTE: The local cursor may not relate to '_buffer' after decryption,
+            // so '_buffer' is advanced by the distance the cursor moved rather than
+            // being assigned from it.
+            int consumed = start.Length - cursor.Length;
+            _buffer = _buffer.Slice(consumed);
+            Value = window;
+            Count = bitCount;
+        }
+
+        /// <summary>
+        /// Checks whether bits were requested after the end of the data was reached.
+        /// </summary>
+        /// <returns>True on error, false otherwise</returns>
+        public bool HasError()
+        {
+            // Count tracks the number of bits held in the value window, minus 8.
+            // Each byte taken from the buffer adds 8 to it, and LotsOfBits is added on top
+            // once the end of the data is reached. A count above the window size but
+            // still below LotsOfBits is therefore reported as an error.
+            int count = Count;
+
+            return count > BdValueSize && count < LotsOfBits;
+        }
+
+        /// <summary>
+        /// Decodes one boolean using the given probability.
+        /// </summary>
+        /// <param name="prob">Probability used to place the split point inside the current range</param>
+        /// <returns>The decoded bit, 0 or 1</returns>
+        public int Read(int prob)
+        {
+            uint split = (Range * (uint)prob + (256 - (uint)prob)) >> 8;
+
+            if (Count < 0)
+            {
+                Fill();
+            }
+
+            ulong window = Value;
+            int bitCount = Count;
+
+            // The split is compared against the top byte of the window.
+            ulong bigSplit = (ulong)split << (BdValueSize - 8);
+
+            uint newRange;
+            uint bit;
+            if (window >= bigSplit)
+            {
+                newRange = Range - split;
+                window -= bigSplit;
+                bit = 1;
+            }
+            else
+            {
+                newRange = split;
+                bit = 0;
+            }
+
+            // Renormalize the range and drop the consumed bits.
+            int shift = Norm[newRange];
+            Value = window << shift;
+            Count = bitCount - shift;
+            Range = newRange << shift;
+
+            return (int)bit;
+        }
+
+        /// <summary>
+        /// Decodes one bit with a probability of 128.
+        /// </summary>
+        public int ReadBit() => Read(128);  // vpx_prob_half
+
+        /// <summary>
+        /// Decodes an unsigned literal of <paramref name="bits"/> bits, most significant bit first.
+        /// </summary>
+        public int ReadLiteral(int bits)
+        {
+            int literal = 0;
+
+            for (int position = bits - 1; position >= 0; position--)
+            {
+                literal |= ReadBit() << position;
+            }
+
+            return literal;
+        }
+
+        /// <summary>
+        /// Walks a tree, decoding one boolean per node, until a non-positive entry is reached.
+        /// </summary>
+        /// <param name="tree">Tree entries; positive values are indices of further nodes</param>
+        /// <param name="probs">One probability per node, indexed by node index / 2</param>
+        /// <returns>The negated value of the final tree entry</returns>
+        public int ReadTree(ReadOnlySpan<sbyte> tree, ReadOnlySpan<byte> probs)
+        {
+            sbyte node = 0;
+
+            do
+            {
+                int branch = Read(probs[node >> 1]);
+                node = tree[node + branch];
+            }
+            while (node > 0);
+
+            return -node;
+        }
+
+        /// <summary>
+        /// Variant of <see cref="Read"/> that works on caller supplied copies of the decoder state.
+        /// The reader's own fields are only touched when the window has to be refilled.
+        /// </summary>
+        /// <returns>The decoded bit, 0 or 1</returns>
+        public int ReadBool(int prob, ref ulong value, ref int count, ref uint range)
+        {
+            uint split = (range * (uint)prob + (256 - (uint)prob)) >> 8;
+            ulong bigSplit = (ulong)split << (BdValueSize - 8);
+
+            if (count < 0)
+            {
+                // Fill works on the fields, so sync the caller's state in and back out around it.
+                Value = value;
+                Count = count;
+                Fill();
+                value = Value;
+                count = Count;
+            }
+
+            int bit;
+            if (value >= bigSplit)
+            {
+                range -= split;
+                value -= bigSplit;
+                bit = 1;
+            }
+            else
+            {
+                range = split;
+                bit = 0;
+            }
+
+            // Renormalize the range and drop the consumed bits.
+            int shift = Norm[range];
+            range <<= shift;
+            value <<= shift;
+            count -= shift;
+
+            return bit;
+        }
+
+        /// <summary>
+        /// Steps the buffer back by one byte for every 8 bits still held in the window.
+        /// </summary>
+        /// <returns>The adjusted buffer position</returns>
+        public ArrayPtr<byte> FindEnd()
+        {
+            // Find the end of the coded buffer
+            while (!(Count <= 8 || Count >= BdValueSize))
+            {
+                Count -= 8;
+                _buffer = _buffer.Slice(-1);
+            }
+
+            return _buffer;
+        }
+    }
+}

+ 54 - 0
Ryujinx.Graphics.Nvdec.Vp9/Dsp/TxfmCommon.cs

@@ -0,0 +1,54 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
+{
+    /// <summary>
+    /// Fixed-point constants shared by the forward and inverse transform code.
+    /// </summary>
+    internal static class TxfmCommon
+    {
+        // Constants used by all idct/dct functions
+        // Number of fractional bits of the constants below (16384 == 1 << 14).
+        public const int DctConstBits = 14;
+        // Half of 1 << DctConstBits.
+        public const int DctConstRounding = 1 << (DctConstBits - 1);
+
+        public const int UnitQuantShift = 2;
+        public const int UnitQuantFactor = 1 << UnitQuantShift;
+
+        // Constants:
+        //  for (int i = 1; i < 32; ++i)
+        //    Console.WriteLine("public const short CosPi{0}_64 = {1};", i, MathF.Round(16384 * MathF.Cos(i * MathF.PI / 64)));
+        // Note: sin(k * Pi / 64) = cos((32 - k) * Pi / 64)
+        public const short CosPi1_64 = 16364;
+        public const short CosPi2_64 = 16305;
+        public const short CosPi3_64 = 16207;
+        public const short CosPi4_64 = 16069;
+        public const short CosPi5_64 = 15893;
+        public const short CosPi6_64 = 15679;
+        public const short CosPi7_64 = 15426;
+        public const short CosPi8_64 = 15137;
+        public const short CosPi9_64 = 14811;
+        public const short CosPi10_64 = 14449;
+        public const short CosPi11_64 = 14053;
+        public const short CosPi12_64 = 13623;
+        public const short CosPi13_64 = 13160;
+        public const short CosPi14_64 = 12665;
+        public const short CosPi15_64 = 12140;
+        public const short CosPi16_64 = 11585;
+        public const short CosPi17_64 = 11003;
+        public const short CosPi18_64 = 10394;
+        public const short CosPi19_64 = 9760;
+        public const short CosPi20_64 = 9102;
+        public const short CosPi21_64 = 8423;
+        public const short CosPi22_64 = 7723;
+        public const short CosPi23_64 = 7005;
+        public const short CosPi24_64 = 6270;
+        public const short CosPi25_64 = 5520;
+        public const short CosPi26_64 = 4756;
+        public const short CosPi27_64 = 3981;
+        public const short CosPi28_64 = 3196;
+        public const short CosPi29_64 = 2404;
+        public const short CosPi30_64 = 1606;
+        public const short CosPi31_64 = 804;
+
+        //  16384 * sqrt(2) * sin(kPi / 9) * 2 / 3
+        public const short SinPi1_9 = 5283;
+        public const short SinPi2_9 = 9929;
+        public const short SinPi3_9 = 13377;
+        public const short SinPi4_9 = 15212;
+    }
+}

+ 536 - 0
Ryujinx.Graphics.Nvdec.Vp9/Idct.cs

@@ -0,0 +1,536 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using System;
+using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.InvTxfm;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    /// <summary>
+    /// Inverse transform entry points. Each method reconstructs a residual block from its coefficients
+    /// and adds it to the destination pixels, picking a specialised routine where one applies.
+    /// </summary>
+    internal static class Idct
+    {
+        private delegate void Transform1D(ReadOnlySpan<int> input, Span<int> output);
+        private delegate void HighbdTransform1D(ReadOnlySpan<int> input, Span<int> output, int bd);
+
+        /// <summary>
+        /// Pair of 1D transforms that make up a 2D transform.
+        /// </summary>
+        private struct Transform2D
+        {
+            public Transform1D Cols, Rows;  // Vertical and horizontal
+
+            public Transform2D(Transform1D cols, Transform1D rows)
+            {
+                Cols = cols;
+                Rows = rows;
+            }
+        }
+
+        /// <summary>
+        /// Pair of high bit depth 1D transforms that make up a 2D transform.
+        /// </summary>
+        private struct HighbdTransform2D
+        {
+            public HighbdTransform1D Cols, Rows;  // Vertical and horizontal
+
+            public HighbdTransform2D(HighbdTransform1D cols, HighbdTransform1D rows)
+            {
+                Cols = cols;
+                Rows = rows;
+            }
+        }
+
+        // Indexed by transform type.
+        private static readonly Transform2D[] Iht4 = new Transform2D[]
+        {
+            new Transform2D(Idct4, Idct4),   // DCT_DCT  = 0
+            new Transform2D(Iadst4, Idct4),  // ADST_DCT = 1
+            new Transform2D(Idct4, Iadst4),  // DCT_ADST = 2
+            new Transform2D(Iadst4, Iadst4)  // ADST_ADST = 3
+        };
+
+        /// <summary>
+        /// 4x4 inverse hybrid transform; the result is added to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">16 coefficients, 4 per row</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/></param>
+        /// <param name="txType">Index into the transform table (0 to 3)</param>
+        public static void Iht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType)
+        {
+            Span<int> output = stackalloc int[4 * 4];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[4];
+            Span<int> tempOut = stackalloc int[4];
+            Transform2D ht = Iht4[txType];
+
+            // Inverse transform row vectors
+            for (int i = 0; i < 4; ++i)
+            {
+                ht.Rows(input, outptr);
+                input = input.Slice(4);
+                outptr = outptr.Slice(4);
+            }
+
+            // Inverse transform column vectors
+            for (int i = 0; i < 4; ++i)
+            {
+                for (int j = 0; j < 4; ++j)
+                {
+                    tempIn[j] = output[j * 4 + i];
+                }
+
+                ht.Cols(tempIn, tempOut);
+                for (int j = 0; j < 4; ++j)
+                {
+                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4));
+                }
+            }
+        }
+
+        // Indexed by transform type.
+        private static readonly Transform2D[] Iht8 = new Transform2D[]
+        {
+            new Transform2D(Idct8, Idct8),   // DCT_DCT  = 0
+            new Transform2D(Iadst8, Idct8),  // ADST_DCT = 1
+            new Transform2D(Idct8, Iadst8),  // DCT_ADST = 2
+            new Transform2D(Iadst8, Iadst8)  // ADST_ADST = 3
+        };
+
+        /// <summary>
+        /// 8x8 inverse hybrid transform; the result is added to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">64 coefficients, 8 per row</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/></param>
+        /// <param name="txType">Index into the transform table (0 to 3)</param>
+        public static void Iht8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType)
+        {
+            Span<int> output = stackalloc int[8 * 8];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[8];
+            Span<int> tempOut = stackalloc int[8];
+            Transform2D ht = Iht8[txType];
+
+            // Inverse transform row vectors
+            for (int i = 0; i < 8; ++i)
+            {
+                ht.Rows(input, outptr);
+                input = input.Slice(8);
+                outptr = outptr.Slice(8);
+            }
+
+            // Inverse transform column vectors
+            for (int i = 0; i < 8; ++i)
+            {
+                for (int j = 0; j < 8; ++j)
+                {
+                    tempIn[j] = output[j * 8 + i];
+                }
+
+                ht.Cols(tempIn, tempOut);
+                for (int j = 0; j < 8; ++j)
+                {
+                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5));
+                }
+            }
+        }
+
+        // Indexed by transform type.
+        private static readonly Transform2D[] Iht16 = new Transform2D[]
+        {
+            new Transform2D(Idct16, Idct16),   // DCT_DCT  = 0
+            new Transform2D(Iadst16, Idct16),  // ADST_DCT = 1
+            new Transform2D(Idct16, Iadst16),  // DCT_ADST = 2
+            new Transform2D(Iadst16, Iadst16)  // ADST_ADST = 3
+        };
+
+        /// <summary>
+        /// 16x16 inverse hybrid transform; the result is added to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">256 coefficients, 16 per row</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/></param>
+        /// <param name="txType">Index into the transform table (0 to 3)</param>
+        public static void Iht16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int txType)
+        {
+            Span<int> output = stackalloc int[16 * 16];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[16];
+            Span<int> tempOut = stackalloc int[16];
+            Transform2D ht = Iht16[txType];
+
+            // Rows
+            for (int i = 0; i < 16; ++i)
+            {
+                ht.Rows(input, outptr);
+                input = input.Slice(16);
+                outptr = outptr.Slice(16);
+            }
+
+            // Columns
+            for (int i = 0; i < 16; ++i)
+            {
+                for (int j = 0; j < 16; ++j)
+                {
+                    tempIn[j] = output[j * 16 + i];
+                }
+
+                ht.Cols(tempIn, tempOut);
+                for (int j = 0; j < 16; ++j)
+                {
+                    dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
+                }
+            }
+        }
+
+        // Idct
+
+        /// <summary>
+        /// 4x4 inverse DCT; uses the single-coefficient routine when <paramref name="eob"/> is at most 1.
+        /// </summary>
+        public static void Idct4x4Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob)
+        {
+            if (eob > 1)
+            {
+                Idct4x416Add(input, dest, stride);
+            }
+            else
+            {
+                Idct4x41Add(input, dest, stride);
+            }
+        }
+
+        /// <summary>
+        /// 4x4 inverse Walsh-Hadamard transform; uses the single-coefficient routine when <paramref name="eob"/> is at most 1.
+        /// </summary>
+        public static void Iwht4x4Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob)
+        {
+            if (eob > 1)
+            {
+                Iwht4x416Add(input, dest, stride);
+            }
+            else
+            {
+                Iwht4x41Add(input, dest, stride);
+            }
+        }
+
+        /// <summary>
+        /// 8x8 inverse DCT; picks the cheapest routine that covers <paramref name="eob"/>.
+        /// </summary>
+        public static void Idct8x8Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob)
+        {
+            // If dc is 1, then input[0] is the reconstructed value, do not need
+            // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
+
+            // The calculation can be simplified if there are not many non-zero dct
+            // coefficients. Use eobs to decide what to do.
+            if (eob == 1)
+            {
+                // DC only DCT coefficient
+                Idct8x81Add(input, dest, stride);
+            }
+            else if (eob <= 12)
+            {
+                Idct8x812Add(input, dest, stride);
+            }
+            else
+            {
+                Idct8x864Add(input, dest, stride);
+            }
+        }
+
+        /// <summary>
+        /// 16x16 inverse DCT; picks the cheapest routine that covers <paramref name="eob"/>.
+        /// </summary>
+        public static void Idct16x16Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob)
+        {
+            /* The calculation can be simplified if there are not many non-zero dct
+             * coefficients. Use eobs to separate different cases. */
+            if (eob == 1) /* DC only DCT coefficient. */
+            {
+                Idct16x161Add(input, dest, stride);
+            }
+            else if (eob <= 10)
+            {
+                Idct16x1610Add(input, dest, stride);
+            }
+            else if (eob <= 38)
+            {
+                Idct16x1638Add(input, dest, stride);
+            }
+            else
+            {
+                Idct16x16256Add(input, dest, stride);
+            }
+        }
+
+        /// <summary>
+        /// 32x32 inverse DCT; picks the cheapest routine that covers <paramref name="eob"/>.
+        /// </summary>
+        public static void Idct32x32Add(ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob)
+        {
+            if (eob == 1)
+            {
+                Idct32x321Add(input, dest, stride);
+            }
+            else if (eob <= 34)
+            {
+                // Non-zero coeff only in upper-left 8x8
+                Idct32x3234Add(input, dest, stride);
+            }
+            else if (eob <= 135)
+            {
+                // Non-zero coeff only in upper-left 16x16
+                Idct32x32135Add(input, dest, stride);
+            }
+            else
+            {
+                Idct32x321024Add(input, dest, stride);
+            }
+        }
+
+        // Iht
+
+        /// <summary>
+        /// 4x4 inverse transform; DCT/DCT goes through the eob-aware DCT path, everything else through the hybrid transform.
+        /// </summary>
+        public static void Iht4x4Add(TxType txType, ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob)
+        {
+            if (txType == TxType.DctDct)
+            {
+                Idct4x4Add(input, dest, stride, eob);
+            }
+            else
+            {
+                Iht4x416Add(input, dest, stride, (int)txType);
+            }
+        }
+
+        /// <summary>
+        /// 8x8 inverse transform; DCT/DCT goes through the eob-aware DCT path, everything else through the hybrid transform.
+        /// </summary>
+        public static void Iht8x8Add(TxType txType, ReadOnlySpan<int> input, Span<byte> dest, int stride, int eob)
+        {
+            if (txType == TxType.DctDct)
+            {
+                Idct8x8Add(input, dest, stride, eob);
+            }
+            else
+            {
+                Iht8x864Add(input, dest, stride, (int)txType);
+            }
+        }
+
+        /// <summary>
+        /// 16x16 inverse transform; DCT/DCT goes through the eob-aware DCT path, everything else through the hybrid transform.
+        /// </summary>
+        public static void Iht16x16Add(TxType txType, ReadOnlySpan<int> input, Span<byte> dest,
+                              int stride, int eob)
+        {
+            if (txType == TxType.DctDct)
+            {
+                Idct16x16Add(input, dest, stride, eob);
+            }
+            else
+            {
+                Iht16x16256Add(input, dest, stride, (int)txType);
+            }
+        }
+
+        // Indexed by transform type.
+        private static readonly HighbdTransform2D[] HighbdIht4 = new HighbdTransform2D[]
+        {
+            new HighbdTransform2D(HighbdIdct4, HighbdIdct4),   // DCT_DCT  = 0
+            new HighbdTransform2D(HighbdIadst4, HighbdIdct4),  // ADST_DCT = 1
+            new HighbdTransform2D(HighbdIdct4, HighbdIadst4),  // DCT_ADST = 2
+            new HighbdTransform2D(HighbdIadst4, HighbdIadst4)  // ADST_ADST = 3
+        };
+
+        /// <summary>
+        /// High bit depth 4x4 inverse hybrid transform; the result is added to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">16 coefficients, 4 per row</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/></param>
+        /// <param name="txType">Index into the transform table (0 to 3)</param>
+        /// <param name="bd">Bit depth, forwarded to the 1D transforms and to the pixel clip</param>
+        public static void HighbdIht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd)
+        {
+            Span<int> output = stackalloc int[4 * 4];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[4];
+            Span<int> tempOut = stackalloc int[4];
+            HighbdTransform2D ht = HighbdIht4[txType];
+
+            // Inverse transform row vectors.
+            for (int i = 0; i < 4; ++i)
+            {
+                ht.Rows(input, outptr, bd);
+                input = input.Slice(4);
+                outptr = outptr.Slice(4);
+            }
+
+            // Inverse transform column vectors.
+            for (int i = 0; i < 4; ++i)
+            {
+                for (int j = 0; j < 4; ++j)
+                {
+                    tempIn[j] = output[j * 4 + i];
+                }
+
+                ht.Cols(tempIn, tempOut, bd);
+                for (int j = 0; j < 4; ++j)
+                {
+                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4), bd);
+                }
+            }
+        }
+
+        // Indexed by transform type.
+        private static readonly HighbdTransform2D[] HighbdIht8 = new HighbdTransform2D[]
+        {
+            new HighbdTransform2D(HighbdIdct8, HighbdIdct8),   // DCT_DCT  = 0
+            new HighbdTransform2D(HighbdIadst8, HighbdIdct8),  // ADST_DCT = 1
+            new HighbdTransform2D(HighbdIdct8, HighbdIadst8),  // DCT_ADST = 2
+            new HighbdTransform2D(HighbdIadst8, HighbdIadst8)  // ADST_ADST = 3
+        };
+
+        /// <summary>
+        /// High bit depth 8x8 inverse hybrid transform; the result is added to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">64 coefficients, 8 per row</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/></param>
+        /// <param name="txType">Index into the transform table (0 to 3)</param>
+        /// <param name="bd">Bit depth, forwarded to the 1D transforms and to the pixel clip</param>
+        public static void HighbdIht8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd)
+        {
+            Span<int> output = stackalloc int[8 * 8];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[8];
+            Span<int> tempOut = stackalloc int[8];
+            HighbdTransform2D ht = HighbdIht8[txType];
+
+            // Inverse transform row vectors.
+            for (int i = 0; i < 8; ++i)
+            {
+                ht.Rows(input, outptr, bd);
+                input = input.Slice(8);
+
+                // Advance from the current row, not from the start of the buffer,
+                // so that every row is written to its own slot.
+                outptr = outptr.Slice(8);
+            }
+
+            // Inverse transform column vectors.
+            for (int i = 0; i < 8; ++i)
+            {
+                for (int j = 0; j < 8; ++j)
+                {
+                    tempIn[j] = output[j * 8 + i];
+                }
+
+                ht.Cols(tempIn, tempOut, bd);
+                for (int j = 0; j < 8; ++j)
+                {
+                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd);
+                }
+            }
+        }
+
+        // Indexed by transform type.
+        private static readonly HighbdTransform2D[] HighbdIht16 = new HighbdTransform2D[]
+        {
+            new HighbdTransform2D(HighbdIdct16, HighbdIdct16),   // DCT_DCT  = 0
+            new HighbdTransform2D(HighbdIadst16, HighbdIdct16),  // ADST_DCT = 1
+            new HighbdTransform2D(HighbdIdct16, HighbdIadst16),  // DCT_ADST = 2
+            new HighbdTransform2D(HighbdIadst16, HighbdIadst16)  // ADST_ADST = 3
+        };
+
+        /// <summary>
+        /// High bit depth 16x16 inverse hybrid transform; the result is added to <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="input">256 coefficients, 16 per row</param>
+        /// <param name="dest">Destination pixels, updated in place</param>
+        /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/></param>
+        /// <param name="txType">Index into the transform table (0 to 3)</param>
+        /// <param name="bd">Bit depth, forwarded to the 1D transforms and to the pixel clip</param>
+        public static void HighbdIht16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int txType, int bd)
+        {
+            Span<int> output = stackalloc int[16 * 16];
+            Span<int> outptr = output;
+            Span<int> tempIn = stackalloc int[16];
+            Span<int> tempOut = stackalloc int[16];
+            HighbdTransform2D ht = HighbdIht16[txType];
+
+            // Rows
+            for (int i = 0; i < 16; ++i)
+            {
+                ht.Rows(input, outptr, bd);
+                input = input.Slice(16);
+
+                // Advance from the current row, not from the start of the buffer,
+                // so that every row is written to its own slot.
+                outptr = outptr.Slice(16);
+            }
+
+            // Columns
+            for (int i = 0; i < 16; ++i)
+            {
+                for (int j = 0; j < 16; ++j)
+                {
+                    tempIn[j] = output[j * 16 + i];
+                }
+
+                ht.Cols(tempIn, tempOut, bd);
+                for (int j = 0; j < 16; ++j)
+                {
+                    dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
+                }
+            }
+        }
+
+        // Idct
+
+        /// <summary>
+        /// High bit depth 4x4 inverse DCT; uses the single-coefficient routine when <paramref name="eob"/> is at most 1.
+        /// </summary>
+        public static void HighbdIdct4x4Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        {
+            if (eob > 1)
+            {
+                HighbdIdct4x416Add(input, dest, stride, bd);
+            }
+            else
+            {
+                HighbdIdct4x41Add(input, dest, stride, bd);
+            }
+        }
+
+        /// <summary>
+        /// High bit depth 4x4 inverse Walsh-Hadamard transform; uses the single-coefficient routine when <paramref name="eob"/> is at most 1.
+        /// </summary>
+        public static void HighbdIwht4x4Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        {
+            if (eob > 1)
+            {
+                HighbdIwht4x416Add(input, dest, stride, bd);
+            }
+            else
+            {
+                HighbdIwht4x41Add(input, dest, stride, bd);
+            }
+        }
+
+        /// <summary>
+        /// High bit depth 8x8 inverse DCT; picks the cheapest routine that covers <paramref name="eob"/>.
+        /// </summary>
+        public static void HighbdIdct8x8Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        {
+            // If dc is 1, then input[0] is the reconstructed value, do not need
+            // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
+
+            // The calculation can be simplified if there are not many non-zero dct
+            // coefficients. Use eobs to decide what to do.
+            if (eob == 1)
+            {
+                // DC only DCT coefficient
+                // NOTE(review): this callee keeps a C-style name unlike its siblings; it is
+                // declared outside this class, so it is left untouched here.
+                vpx_Highbdidct8x8_1_add_c(input, dest, stride, bd);
+            }
+            else if (eob <= 12)
+            {
+                HighbdIdct8x812Add(input, dest, stride, bd);
+            }
+            else
+            {
+                HighbdIdct8x864Add(input, dest, stride, bd);
+            }
+        }
+
+        /// <summary>
+        /// High bit depth 16x16 inverse DCT; picks the cheapest routine that covers <paramref name="eob"/>.
+        /// </summary>
+        public static void HighbdIdct16x16Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        {
+            // The calculation can be simplified if there are not many non-zero dct
+            // coefficients. Use eobs to separate different cases.
+            if (eob == 1)
+            {
+                // DC only DCT coefficient.
+                HighbdIdct16x161Add(input, dest, stride, bd);
+            }
+            else if (eob <= 10)
+            {
+                HighbdIdct16x1610Add(input, dest, stride, bd);
+            }
+            else if (eob <= 38)
+            {
+                HighbdIdct16x1638Add(input, dest, stride, bd);
+            }
+            else
+            {
+                HighbdIdct16x16256Add(input, dest, stride, bd);
+            }
+        }
+
+        /// <summary>
+        /// High bit depth 32x32 inverse DCT; picks the cheapest routine that covers <paramref name="eob"/>.
+        /// </summary>
+        public static void HighbdIdct32x32Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        {
+            if (eob == 1)
+            {
+                HighbdIdct32x321Add(input, dest, stride, bd);
+            }
+            else if (eob <= 34)
+            {
+                // Non-zero coeff only in upper-left 8x8
+                HighbdIdct32x3234Add(input, dest, stride, bd);
+            }
+            else if (eob <= 135)
+            {
+                // Non-zero coeff only in upper-left 16x16
+                HighbdIdct32x32135Add(input, dest, stride, bd);
+            }
+            else
+            {
+                HighbdIdct32x321024Add(input, dest, stride, bd);
+            }
+        }
+
+        // Iht
+
+        /// <summary>
+        /// High bit depth 4x4 inverse transform; DCT/DCT goes through the eob-aware DCT path, everything else through the hybrid transform.
+        /// </summary>
+        public static void HighbdIht4x4Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        {
+            if (txType == TxType.DctDct)
+            {
+                HighbdIdct4x4Add(input, dest, stride, eob, bd);
+            }
+            else
+            {
+                HighbdIht4x416Add(input, dest, stride, (int)txType, bd);
+            }
+        }
+
+        /// <summary>
+        /// High bit depth 8x8 inverse transform; DCT/DCT goes through the eob-aware DCT path, everything else through the hybrid transform.
+        /// </summary>
+        public static void HighbdIht8x8Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        {
+            if (txType == TxType.DctDct)
+            {
+                HighbdIdct8x8Add(input, dest, stride, eob, bd);
+            }
+            else
+            {
+                HighbdIht8x864Add(input, dest, stride, (int)txType, bd);
+            }
+        }
+
+        /// <summary>
+        /// High bit depth 16x16 inverse transform; DCT/DCT goes through the eob-aware DCT path, everything else through the hybrid transform.
+        /// </summary>
+        public static void HighbdIht16x16Add(TxType txType, ReadOnlySpan<int> input, Span<ushort> dest, int stride, int eob, int bd)
+        {
+            if (txType == TxType.DctDct)
+            {
+                HighbdIdct16x16Add(input, dest, stride, eob, bd);
+            }
+            else
+            {
+                HighbdIht16x16256Add(input, dest, stride, (int)txType, bd);
+            }
+        }
+    }
+}

+ 15 - 0
Ryujinx.Graphics.Nvdec.Vp9/InternalErrorException.cs

@@ -0,0 +1,15 @@
+using System;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    // Exception thrown by InternalErrorInfo.InternalError to abort the current
+    // operation with a descriptive message.
+    internal class InternalErrorException : Exception
+    {
+        // Creates the exception with a message and the exception that caused it.
+        public InternalErrorException(string message, Exception innerException) : base(message, innerException)
+        {
+        }
+
+        // Creates the exception with a message only.
+        public InternalErrorException(string message) : base(message)
+        {
+        }
+    }
+}

+ 14 - 0
Ryujinx.Graphics.Nvdec.Vp9/InternalErrorInfo.cs

@@ -0,0 +1,14 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    // Records the last reported codec error code and converts reported errors
+    // into InternalErrorException throws.
+    internal struct InternalErrorInfo
+    {
+        // Error code stored by the most recent InternalError call.
+        public CodecErr ErrorCode;
+
+        // Stores error in ErrorCode and then always throws an InternalErrorException
+        // carrying message; this method never returns normally.
+        public void InternalError(CodecErr error, string message)
+        {
+            this.ErrorCode = error;
+
+            throw new InternalErrorException(message);
+        }
+    }
+}

+ 418 - 0
Ryujinx.Graphics.Nvdec.Vp9/LoopFilter.cs

@@ -0,0 +1,418 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using System;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class LoopFilter
+    {
+        // Highest loop filter level. Levels are clamped to [0, MaxLoopFilter] and the
+        // sharpness limits are computed for every level in that range.
+        public const int MaxLoopFilter = 63;
+
+        // Number of loop filter delta entries: MaxModeLfDeltas bounds the loop over
+        // ModeDeltas in LoopFilterFrameInit; MaxRefLfDeltas is presumably the matching
+        // count for RefDeltas (not used in this file section - TODO confirm).
+        public const int MaxRefLfDeltas = 4;
+        public const int MaxModeLfDeltas = 2;
+
+        // 64 bit masks for left transform size. Each 1 represents a position where
+        // we should apply a loop filter across the left border of an 8x8 block
+        // boundary.
+        //
+        // In the case of TX_16X16 (low order byte first) we end up with
+        // a mask that looks like this:
+        //
+        //    10101010
+        //    10101010
+        //    10101010
+        //    10101010
+        //    10101010
+        //    10101010
+        //    10101010
+        //    10101010
+        //
+        // A loop filter should be applied to every other 8x8 horizontally.
+        //
+        // Indexed by transform size; BuildMask ANDs the entry with SizeMask and
+        // shifts it into place.
+        private static readonly ulong[] Left64X64TxformMask = new ulong[]
+        {
+            0xffffffffffffffffUL,  // TX_4X4
+            0xffffffffffffffffUL,  // TX_8x8
+            0x5555555555555555UL,  // TX_16x16
+            0x1111111111111111UL,  // TX_32x32
+        };
+
+        // 64 bit masks for above transform size. Each 1 represents a position where
+        // we should apply a loop filter across the top border of an 8x8 block
+        // boundary.
+        //
+        // In the case of TX_32x32 (low order byte first) we end up with
+        // a mask that looks like this:
+        //
+        //    11111111
+        //    00000000
+        //    00000000
+        //    00000000
+        //    11111111
+        //    00000000
+        //    00000000
+        //    00000000
+        //
+        // A loop filter should be applied to every 4th row vertically.
+        //
+        // Indexed by transform size; BuildMask ANDs the entry with SizeMask and
+        // shifts it into place.
+        private static readonly ulong[] Above64X64TxformMask = new ulong[]
+        {
+            0xffffffffffffffffUL,  // TX_4X4
+            0xffffffffffffffffUL,  // TX_8x8
+            0x00ff00ff00ff00ffUL,  // TX_16x16
+            0x000000ff000000ffUL,  // TX_32x32
+        };
+
+        // 64 bit masks for prediction sizes (left). Each 1 represents a position
+        // on the left border of the prediction block, one bit per 8x8 row. These are
+        // aligned to the right most appropriate bit, and then shifted into place.
+        //
+        // In the case of BLOCK_16X32 (low order byte first) we end up with
+        // a mask that looks like this:
+        //
+        //  10000000
+        //  10000000
+        //  10000000
+        //  10000000
+        //  00000000
+        //  00000000
+        //  00000000
+        //  00000000
+        //
+        // Indexed by block size.
+        private static readonly ulong[] LeftPredictionMask = new ulong[]
+        {
+            0x0000000000000001UL,  // BLOCK_4X4,
+            0x0000000000000001UL,  // BLOCK_4X8,
+            0x0000000000000001UL,  // BLOCK_8X4,
+            0x0000000000000001UL,  // BLOCK_8X8,
+            0x0000000000000101UL,  // BLOCK_8X16,
+            0x0000000000000001UL,  // BLOCK_16X8,
+            0x0000000000000101UL,  // BLOCK_16X16,
+            0x0000000001010101UL,  // BLOCK_16X32,
+            0x0000000000000101UL,  // BLOCK_32X16,
+            0x0000000001010101UL,  // BLOCK_32X32,
+            0x0101010101010101UL,  // BLOCK_32X64,
+            0x0000000001010101UL,  // BLOCK_64X32,
+            0x0101010101010101UL,  // BLOCK_64X64
+        };
+
+        // 64 bit mask to shift and set for each prediction size (above). Each 1 marks
+        // an 8x8 column along the top border of the prediction block. Indexed by
+        // block size.
+        private static readonly ulong[] AbovePredictionMask = new ulong[]
+        {
+            0x0000000000000001UL,  // BLOCK_4X4
+            0x0000000000000001UL,  // BLOCK_4X8
+            0x0000000000000001UL,  // BLOCK_8X4
+            0x0000000000000001UL,  // BLOCK_8X8
+            0x0000000000000001UL,  // BLOCK_8X16,
+            0x0000000000000003UL,  // BLOCK_16X8
+            0x0000000000000003UL,  // BLOCK_16X16
+            0x0000000000000003UL,  // BLOCK_16X32,
+            0x000000000000000fUL,  // BLOCK_32X16,
+            0x000000000000000fUL,  // BLOCK_32X32,
+            0x000000000000000fUL,  // BLOCK_32X64,
+            0x00000000000000ffUL,  // BLOCK_64X32,
+            0x00000000000000ffUL,  // BLOCK_64X64
+        };
+
+        // 64 bit mask to shift and set for each prediction size. A bit is set for
+        // each 8x8 block covered by a block of the given size when that block sits
+        // in the top-left corner of the 64x64 block. Indexed by block size.
+        private static readonly ulong[] SizeMask = new ulong[]
+        {
+            0x0000000000000001UL,  // BLOCK_4X4
+            0x0000000000000001UL,  // BLOCK_4X8
+            0x0000000000000001UL,  // BLOCK_8X4
+            0x0000000000000001UL,  // BLOCK_8X8
+            0x0000000000000101UL,  // BLOCK_8X16,
+            0x0000000000000003UL,  // BLOCK_16X8
+            0x0000000000000303UL,  // BLOCK_16X16
+            0x0000000003030303UL,  // BLOCK_16X32,
+            0x0000000000000f0fUL,  // BLOCK_32X16,
+            0x000000000f0f0f0fUL,  // BLOCK_32X32,
+            0x0f0f0f0f0f0f0f0fUL,  // BLOCK_32X64,
+            0x00000000ffffffffUL,  // BLOCK_64X32,
+            0xffffffffffffffffUL,  // BLOCK_64X64
+        };
+
+        // These are used for masking the left and above borders. The values equal the
+        // TX_32x32 entries of Left64X64TxformMask and Above64X64TxformMask.
+        private const ulong LeftBorder = 0x1111111111111111UL;
+        private const ulong AboveBorder = 0x000000ff000000ffUL;
+
+        // 16 bit masks for uv transform sizes (left borders). Indexed by the uv
+        // transform size; one bit per position in a 4x4 grid, low order bit first.
+        private static readonly ushort[] Left64X64TxformMaskUv = new ushort[]
+        {
+            0xffff,  // TX_4X4
+            0xffff,  // TX_8x8
+            0x5555,  // TX_16x16
+            0x1111,  // TX_32x32
+        };
+
+        // 16 bit masks for uv transform sizes (above borders). Indexed by the uv
+        // transform size.
+        private static readonly ushort[] Above64X64TxformMaskUv = new ushort[]
+        {
+            0xffff,  // TX_4X4
+            0xffff,  // TX_8x8
+            0x0f0f,  // TX_16x16
+            0x000f,  // TX_32x32
+        };
+
+        // 16 bit left mask to shift and set for each uv prediction size.
+        // Indexed by block size.
+        private static readonly ushort[] LeftPredictionMaskUv = new ushort[]
+        {
+            0x0001,  // BLOCK_4X4,
+            0x0001,  // BLOCK_4X8,
+            0x0001,  // BLOCK_8X4,
+            0x0001,  // BLOCK_8X8,
+            0x0001,  // BLOCK_8X16,
+            0x0001,  // BLOCK_16X8,
+            0x0001,  // BLOCK_16X16,
+            0x0011,  // BLOCK_16X32,
+            0x0001,  // BLOCK_32X16,
+            0x0011,  // BLOCK_32X32,
+            0x1111,  // BLOCK_32X64
+            0x0011,  // BLOCK_64X32,
+            0x1111,  // BLOCK_64X64
+        };
+
+        // 16 bit above mask to shift and set for each uv prediction size.
+        // Indexed by block size.
+        private static readonly ushort[] AbovePredictionMaskUv = new ushort[]
+        {
+            0x0001,  // BLOCK_4X4
+            0x0001,  // BLOCK_4X8
+            0x0001,  // BLOCK_8X4
+            0x0001,  // BLOCK_8X8
+            0x0001,  // BLOCK_8X16,
+            0x0001,  // BLOCK_16X8
+            0x0001,  // BLOCK_16X16
+            0x0001,  // BLOCK_16X32,
+            0x0003,  // BLOCK_32X16,
+            0x0003,  // BLOCK_32X32,
+            0x0003,  // BLOCK_32X64,
+            0x000f,  // BLOCK_64X32,
+            0x000f,  // BLOCK_64X64
+        };
+
+        // 16 bit mask to shift and set for each uv prediction size.
+        // Indexed by block size.
+        private static readonly ushort[] SizeMaskUv = new ushort[]
+        {
+            0x0001,  // BLOCK_4X4
+            0x0001,  // BLOCK_4X8
+            0x0001,  // BLOCK_8X4
+            0x0001,  // BLOCK_8X8
+            0x0001,  // BLOCK_8X16,
+            0x0001,  // BLOCK_16X8
+            0x0001,  // BLOCK_16X16
+            0x0011,  // BLOCK_16X32,
+            0x0003,  // BLOCK_32X16,
+            0x0033,  // BLOCK_32X32,
+            0x3333,  // BLOCK_32X64,
+            0x00ff,  // BLOCK_64X32,
+            0xffff,  // BLOCK_64X64
+        };
+
+        // 16 bit uv counterparts of LeftBorder and AboveBorder. The values equal the
+        // TX_32x32 entries of Left64X64TxformMaskUv and Above64X64TxformMaskUv.
+        private const ushort LeftBorderUv = 0x1111;
+        private const ushort AboveBorderUv = 0x000f;
+
+        // Maps a prediction mode (cast to int) to the mode index used as the last
+        // index of the filter level table in GetFilterLevel: 0 for every intra mode
+        // and for ZEROMV, 1 for the remaining inter modes.
+        private static readonly int[] ModeLfLut = new int[]
+        {
+            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // INTRA_MODES
+            1, 1, 0, 1                     // INTER_MODES (ZEROMV == 0)
+        };
+
+        // Reads the filter level for a block from the level table, which is indexed by
+        // the block's segment id, its first reference frame and the mode index that
+        // ModeLfLut assigns to its prediction mode.
+        private static byte GetFilterLevel(ref LoopFilterInfoN lfiN, ref ModeInfo mi) =>
+            lfiN.Lvl[mi.SegmentId][mi.RefFrame[0]][ModeLfLut[(int)mi.Mode]];
+
+        // Returns a reference to the LoopFilterMask of the 64x64 region containing the
+        // position (miRow, miCol). Both coordinates are divided by 8 and the row is
+        // scaled by LfmStride to form the index into lf.Lfm.
+        private static ref LoopFilterMask GetLfm(ref Types.LoopFilter lf, int miRow, int miCol)
+        {
+            int regionCol = miCol >> 3;
+            int regionRow = miRow >> 3;
+
+            return ref lf.Lfm[regionCol + (regionRow * lf.LfmStride)];
+        }
+
+        // 8x8 blocks in a superblock. A "1" represents the first block in a 16x16
+        // or greater area. Indexed as [row][column] with both in the range 0..7;
+        // BuildMask only updates the uv masks for positions marked with 1.
+        private static readonly byte[][] FirstBlockIn16x16 = new byte[][]
+        {
+            new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 },
+            new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 },
+            new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 },
+            new byte[] { 1, 0, 1, 0, 1, 0, 1, 0 }, new byte[] { 0, 0, 0, 0, 0, 0, 0, 0 }
+        };
+
+        // Adds the contribution of one block to the loop filter masks of the 64x64
+        // region that contains the position (miRow, miCol).
+        //
+        // The block's filter level is written into bh rows of bw consecutive LflY
+        // entries (rows are 8 entries apart). After that the prediction edge bits,
+        // the transform edge bits and the internal 4x4 edge bits are OR-ed into the
+        // region's masks. Nothing is modified when the filter level is 0.
+        public static void BuildMask(ref Vp9Common cm, ref ModeInfo mi, int miRow, int miCol, int bw, int bh)
+        {
+            BlockSize bsize = mi.SbType;
+            TxSize lumaTx = mi.TxSize;
+            int bsizeIdx = (int)bsize;
+            int lumaTxIdx = (int)lumaTx;
+
+            ref LoopFilterInfoN levelInfo = ref cm.LfInfo;
+            int level = GetFilterLevel(ref levelInfo, ref mi);
+
+            // Chroma transform size for the case where both chroma axes are subsampled.
+            TxSize chromaTx = Luts.UvTxsizeLookup[bsizeIdx][lumaTxIdx][1][1];
+            int chromaTxIdx = (int)chromaTx;
+
+            ref LoopFilterMask lfm = ref GetLfm(ref cm.Lf, miRow, miCol);
+            ref ulong leftY = ref lfm.LeftY[lumaTxIdx];
+            ref ulong aboveY = ref lfm.AboveY[lumaTxIdx];
+            ref ulong int4X4Y = ref lfm.Int4x4Y;
+            ref ushort leftUv = ref lfm.LeftUv[chromaTxIdx];
+            ref ushort aboveUv = ref lfm.AboveUv[chromaTxIdx];
+            ref ushort int4X4Uv = ref lfm.Int4x4Uv;
+
+            // Position of the block inside its 64x64 region, in 8x8 units.
+            int rowInSb = miRow & 7;
+            int colInSb = miCol & 7;
+
+            // Bit offsets of that position in the 8x8 (luma) and 4x4 (chroma) bit grids.
+            int shiftY = colInSb + (rowInSb << 3);
+            int shiftUv = (colInSb >> 1) + ((rowInSb >> 1) << 2);
+            bool buildUv = FirstBlockIn16x16[rowInSb][colInSb] != 0;
+
+            if (level == 0)
+            {
+                return;
+            }
+
+            byte levelByte = (byte)level;
+
+            for (int row = 0, index = shiftY; row < bh; row++, index += 8)
+            {
+                MemoryMarshal.CreateSpan(ref lfm.LflY[index], 64 - index).Slice(0, bw).Fill(levelByte);
+            }
+
+            // Mark the outer edges of the prediction block. For a 32x16 block this sets
+            //    above =   1111
+            //              0000
+            //    and
+            //    left  =   1000
+            //              1000
+            // where the left most bit shown is the lowest bit (1000 is stored as 1).
+            // The U and V masks use the same scheme on a 16 bit scale.
+            aboveY |= AbovePredictionMask[bsizeIdx] << shiftY;
+            leftY |= LeftPredictionMask[bsizeIdx] << shiftY;
+
+            if (buildUv)
+            {
+                aboveUv |= (ushort)(AbovePredictionMaskUv[bsizeIdx] << shiftUv);
+                leftUv |= (ushort)(LeftPredictionMaskUv[bsizeIdx] << shiftUv);
+            }
+
+            // A skipped inter block has no coefficients, so the edges inside the
+            // prediction block are not filtered.
+            if (mi.Skip != 0 && mi.IsInterBlock())
+            {
+                return;
+            }
+
+            // The transform masks are laid out for a 64x64 prediction block: cut them
+            // down to the area of this block, then move them to the block's position.
+            aboveY |= (SizeMask[bsizeIdx] & Above64X64TxformMask[lumaTxIdx]) << shiftY;
+            leftY |= (SizeMask[bsizeIdx] & Left64X64TxformMask[lumaTxIdx]) << shiftY;
+
+            if (buildUv)
+            {
+                aboveUv |= (ushort)((SizeMaskUv[bsizeIdx] & Above64X64TxformMaskUv[chromaTxIdx]) << shiftUv);
+                leftUv |= (ushort)((SizeMaskUv[bsizeIdx] & Left64X64TxformMaskUv[chromaTxIdx]) << shiftUv);
+            }
+
+            // Internal 4x4 boundaries (those inside an 8x8) only exist for 4x4
+            // transforms and do not depend on the prediction block size.
+            if (lumaTx == TxSize.Tx4x4)
+            {
+                int4X4Y |= SizeMask[bsizeIdx] << shiftY;
+            }
+
+            if (buildUv && chromaTx == TxSize.Tx4x4)
+            {
+                int4X4Uv |= (ushort)((SizeMaskUv[bsizeIdx] & 0xffff) << shiftUv);
+            }
+        }
+
+        // Overwrites the frame's LoopFilterMask entries with default-initialized
+        // values. The number of entries written is the count of 8-row groups covering
+        // MiRows (rounded up) times LfmStride. Does nothing when the frame's filter
+        // level is 0.
+        public static unsafe void ResetLfm(ref Vp9Common cm)
+        {
+            if (cm.Lf.FilterLevel == 0)
+            {
+                return;
+            }
+
+            int maskRows = (cm.MiRows + (Constants.MiBlockSize - 1)) >> 3;
+
+            MemoryUtil.Fill(cm.Lf.Lfm.ToPointer(), new LoopFilterMask(), maskRows * cm.Lf.LfmStride);
+        }
+
+        // Recomputes the Lim and Mblim thresholds of every filter level
+        // (0..MaxLoopFilter) for the given sharpness level.
+        private static void UpdateSharpness(ref LoopFilterInfoN lfi, int sharpnessLvl)
+        {
+            // The level is shifted right by 0, 1 or 2 bits depending on sharpness.
+            int shift = (sharpnessLvl > 0 ? 1 : 0) + (sharpnessLvl > 4 ? 1 : 0);
+
+            for (int lvl = 0; lvl <= MaxLoopFilter; lvl++)
+            {
+                int insideLimit = lvl >> shift;
+
+                // A non-zero sharpness also caps the limit at 9 - sharpness.
+                if (sharpnessLvl > 0)
+                {
+                    insideLimit = Math.Min(insideLimit, 9 - sharpnessLvl);
+                }
+
+                // The limit is never allowed to drop below 1.
+                insideLimit = Math.Max(insideLimit, 1);
+
+                lfi.Lfthr[lvl].Lim.ToSpan().Fill((byte)insideLimit);
+                lfi.Lfthr[lvl].Mblim.ToSpan().Fill((byte)(2 * (lvl + 2) + insideLimit));
+            }
+        }
+
+        // Fills the per-segment filter level table (cm.LfInfo.Lvl) for a frame from
+        // the frame's default filter level, the segmentation data and, when enabled,
+        // the reference frame and mode deltas. Also refreshes the sharpness limits
+        // when the sharpness level differs from the last one applied.
+        public static void LoopFilterFrameInit(ref Vp9Common cm, int defaultFiltLvl)
+        {
+            // scale is the multiplier applied to the deltas:
+            // 1 when the default level is between 0 and 31,
+            // 2 when it is between 32 and 63.
+            int scale = 1 << (defaultFiltLvl >> 5);
+            ref LoopFilterInfoN lfi = ref cm.LfInfo;
+            ref Types.LoopFilter lf = ref cm.Lf;
+            ref Segmentation seg = ref cm.Seg;
+
+            // Update limits if sharpness has changed
+            if (lf.LastSharpnessLevel != lf.SharpnessLevel)
+            {
+                UpdateSharpness(ref lfi, lf.SharpnessLevel);
+                lf.LastSharpnessLevel = lf.SharpnessLevel;
+            }
+
+            for (int segId = 0; segId < Constants.MaxSegments; segId++)
+            {
+                int segLevel = defaultFiltLvl;
+
+                if (seg.IsSegFeatureActive(segId, SegLvlFeatures.SegLvlAltLf) != 0)
+                {
+                    // The segment value either replaces the default level or is added to it.
+                    int data = seg.GetSegData(segId, SegLvlFeatures.SegLvlAltLf);
+                    int requested = seg.AbsDelta == Constants.SegmentAbsData ? data : defaultFiltLvl + data;
+
+                    segLevel = Math.Clamp(requested, 0, MaxLoopFilter);
+                }
+
+                if (!lf.ModeRefDeltaEnabled)
+                {
+                    // We could get rid of this if we assume that deltas are set to
+                    // zero when not in use; encoder always uses deltas
+                    MemoryMarshal.Cast<Array2<byte>, byte>(lfi.Lvl[segId].ToSpan()).Fill((byte)segLevel);
+
+                    continue;
+                }
+
+                // Intra blocks only use the reference delta, stored under mode index 0.
+                int intraLvl = segLevel + lf.RefDeltas[Constants.IntraFrame] * scale;
+                lfi.Lvl[segId][Constants.IntraFrame][0] = (byte)Math.Clamp(intraLvl, 0, MaxLoopFilter);
+
+                for (int refr = Constants.LastFrame; refr < Constants.MaxRefFrames; ++refr)
+                {
+                    for (int mode = 0; mode < MaxModeLfDeltas; ++mode)
+                    {
+                        int interLvl = segLevel + lf.RefDeltas[refr] * scale + lf.ModeDeltas[mode] * scale;
+                        lfi.Lvl[segId][refr][mode] = (byte)Math.Clamp(interLvl, 0, MaxLoopFilter);
+                    }
+                }
+            }
+        }
+    }
+}

+ 1612 - 0
Ryujinx.Graphics.Nvdec.Vp9/Luts.cs

@@ -0,0 +1,1612 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class Luts
+    {
+        // Maps each of the 13 table positions to a size group in the range 0..3.
+        // NOTE(review): presumably indexed by BlockSize like the other 13-entry
+        // tables in this class - confirm against callers.
+        public static readonly byte[] SizeGroupLookup = new byte[]
+        {
+            0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3
+        };
+
+        // Block size that results from applying a partition type to a block size.
+        // The first index is the partition type (NONE, HORZ, VERT, SPLIT), the second
+        // has one entry per block size from Block4x4 to Block64x64. Combinations
+        // without a result hold BlockSize.BlockInvalid.
+        public static readonly BlockSize[][] SubsizeLookup = new BlockSize[][]
+        {
+            new BlockSize[]
+            { // PARTITION_NONE
+                BlockSize.Block4x4, BlockSize.Block4x8, BlockSize.Block8x4, BlockSize.Block8x8, BlockSize.Block8x16, BlockSize.Block16x8,
+                BlockSize.Block16x16, BlockSize.Block16x32, BlockSize.Block32x16, BlockSize.Block32x32, BlockSize.Block32x64,
+                BlockSize.Block64x32, BlockSize.Block64x64
+            },
+            new BlockSize[]
+            { // PARTITION_HORZ
+                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block8x4, BlockSize.BlockInvalid,
+                BlockSize.BlockInvalid, BlockSize.Block16x8, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block32x16,
+                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block64x32
+            },
+            new BlockSize[]
+            { // PARTITION_VERT
+                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block4x8, BlockSize.BlockInvalid,
+                BlockSize.BlockInvalid, BlockSize.Block8x16, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block16x32,
+                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block32x64
+            },
+            new BlockSize[]
+            { // PARTITION_SPLIT
+                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block4x4, BlockSize.BlockInvalid,
+                BlockSize.BlockInvalid, BlockSize.Block8x8, BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block16x16,
+                BlockSize.BlockInvalid, BlockSize.BlockInvalid, BlockSize.Block32x32
+            }
+        };
+
+        // Transform size associated with each of the 13 table positions.
+        // NOTE(review): presumably the largest transform size allowed for each
+        // BlockSize, in BlockSize order - confirm against callers.
+        public static readonly TxSize[] MaxTxSizeLookup = new TxSize[]
+        {
+            TxSize.Tx4x4,   TxSize.Tx4x4,   TxSize.Tx4x4,   TxSize.Tx8x8,   TxSize.Tx8x8,   TxSize.Tx8x8,  TxSize.Tx16x16,
+            TxSize.Tx16x16, TxSize.Tx16x16, TxSize.Tx32x32, TxSize.Tx32x32, TxSize.Tx32x32, TxSize.Tx32x32
+        };
+
+        // Largest transform size for each transform mode, in the order given by the
+        // per-entry comments (ONLY_4X4 through TX_MODE_SELECT).
+        public static readonly TxSize[] TxModeToBiggestTxSize = new TxSize[]
+        {
+            TxSize.Tx4x4,    // ONLY_4X4
+            TxSize.Tx8x8,    // ALLOW_8X8
+            TxSize.Tx16x16,  // ALLOW_16X16
+            TxSize.Tx32x32,  // ALLOW_32X32
+            TxSize.Tx32x32,  // TX_MODE_SELECT
+        };
+
+        // Block size after subsampling. Indexed as [block size][ss_x][ss_y], with one
+        // row per block size from Block4x4 to Block64x64. Combinations without a
+        // result hold BlockSize.BlockInvalid.
+        public static readonly BlockSize[][][] SsSizeLookup = new BlockSize[][][]
+        {
+            //  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
+            //  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block4x4, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.BlockInvalid } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block4x8, BlockSize.Block4x4 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.BlockInvalid } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block8x4, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block4x4, BlockSize.BlockInvalid } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block8x8, BlockSize.Block8x4 }, new BlockSize[] { BlockSize.Block4x8, BlockSize.Block4x4 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block8x16, BlockSize.Block8x8 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.Block4x8 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block16x8, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block8x8, BlockSize.Block8x4 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block16x16, BlockSize.Block16x8 }, new BlockSize[] { BlockSize.Block8x16, BlockSize.Block8x8 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block16x32, BlockSize.Block16x16 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.Block8x16 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block32x16, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block16x16, BlockSize.Block16x8 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block32x32, BlockSize.Block32x16 }, new BlockSize[] { BlockSize.Block16x32, BlockSize.Block16x16 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block32x64, BlockSize.Block32x32 }, new BlockSize[] { BlockSize.BlockInvalid, BlockSize.Block16x32 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block64x32, BlockSize.BlockInvalid }, new BlockSize[] { BlockSize.Block32x32, BlockSize.Block32x16 } },
+            new BlockSize[][] { new BlockSize[] { BlockSize.Block64x64, BlockSize.Block64x32 }, new BlockSize[] { BlockSize.Block32x64, BlockSize.Block32x32 } },
+        };
+
+        // Chroma (uv) transform size. Indexed as
+        // [block size][luma transform size][ss_x][ss_y]: each block size group below
+        // holds four rows, one per luma transform size from Tx4x4 to Tx32x32, and
+        // each row holds the four subsampling combinations shown in the column header.
+        public static readonly TxSize[][][][] UvTxsizeLookup = new TxSize[][][][]
+        {
+          //  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
+          //  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
+          new TxSize[][][]
+          {
+              // BLOCK_4X4
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_4X8
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_8X4
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_8X8
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_8X16
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_16X8
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_16X16
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_16X32
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_32X16
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx8x8 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_32X32
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_32X64
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx32x32 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_64X32
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx32x32, TxSize.Tx16x16 } },
+          },
+          new TxSize[][][]
+          {
+              // BLOCK_64X64
+              new TxSize[][] { new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 }, new TxSize[] { TxSize.Tx4x4, TxSize.Tx4x4 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 }, new TxSize[] { TxSize.Tx8x8, TxSize.Tx8x8 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 }, new TxSize[] { TxSize.Tx16x16, TxSize.Tx16x16 } },
+              new TxSize[][] { new TxSize[] { TxSize.Tx32x32, TxSize.Tx32x32 }, new TxSize[] { TxSize.Tx32x32, TxSize.Tx32x32 } },
+          },
+        };
+
+        // Pair of partition context values, one for the above neighbour and one for
+        // the left neighbour. Used as the element type of PartitionContextLookup.
+        public struct PartitionContextPair
+        {
+            public sbyte Above;
+            public sbyte Left;
+
+            // Stores both context values as given.
+            public PartitionContextPair(sbyte above, sbyte left)
+            {
+                (Above, Left) = (above, left);
+            }
+        }
+
+        // Each value is a 4 bit field in which every bit set to 1 represents a block
+        // size partition: 1111 means we split 64x64, 32x32, 16x16 and 8x8, while 1000
+        // means we only split the 64x64 into 32x32. One (above, left) pair per block
+        // size, from 4X4 to 64X64.
+        public static readonly PartitionContextPair[] PartitionContextLookup = new PartitionContextPair[]
+        {
+            new PartitionContextPair(15, 15),  // 4X4   - {0b1111, 0b1111}
+            new PartitionContextPair(15, 14),  // 4X8   - {0b1111, 0b1110}
+            new PartitionContextPair(14, 15),  // 8X4   - {0b1110, 0b1111}
+            new PartitionContextPair(14, 14),  // 8X8   - {0b1110, 0b1110}
+            new PartitionContextPair(14, 12),  // 8X16  - {0b1110, 0b1100}
+            new PartitionContextPair(12, 14),  // 16X8  - {0b1100, 0b1110}
+            new PartitionContextPair(12, 12),  // 16X16 - {0b1100, 0b1100}
+            new PartitionContextPair(12, 8),   // 16X32 - {0b1100, 0b1000}
+            new PartitionContextPair(8, 12),   // 32X16 - {0b1000, 0b1100}
+            new PartitionContextPair(8, 8),    // 32X32 - {0b1000, 0b1000}
+            new PartitionContextPair(8, 0),    // 32X64 - {0b1000, 0b0000}
+            new PartitionContextPair(0, 8),    // 64X32 - {0b0000, 0b1000}
+            new PartitionContextPair(0, 0),    // 64X64 - {0b0000, 0b0000}
+        };
+
+        // Filter
+
+        // Bilinear kernels: 16 entries of 8 taps each. Only taps 3 and 4 are non-zero; tap 3 drops
+        // by 8 per entry (128 down to 8) while tap 4 rises by 8, so each entry sums to 128.
+        // Presumably one entry per sub-pixel position - confirm against the consumer.
+        private static readonly Array8<short>[] BilinearFilters = new Array8<short>[]
+        {
+            NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0),  NewArray8Short(0, 0, 0, 120, 8, 0, 0, 0),
+            NewArray8Short(0, 0, 0, 112, 16, 0, 0, 0), NewArray8Short(0, 0, 0, 104, 24, 0, 0, 0),
+            NewArray8Short(0, 0, 0, 96, 32, 0, 0, 0),  NewArray8Short(0, 0, 0, 88, 40, 0, 0, 0),
+            NewArray8Short(0, 0, 0, 80, 48, 0, 0, 0),  NewArray8Short(0, 0, 0, 72, 56, 0, 0, 0),
+            NewArray8Short(0, 0, 0, 64, 64, 0, 0, 0),  NewArray8Short(0, 0, 0, 56, 72, 0, 0, 0),
+            NewArray8Short(0, 0, 0, 48, 80, 0, 0, 0),  NewArray8Short(0, 0, 0, 40, 88, 0, 0, 0),
+            NewArray8Short(0, 0, 0, 32, 96, 0, 0, 0),  NewArray8Short(0, 0, 0, 24, 104, 0, 0, 0),
+            NewArray8Short(0, 0, 0, 16, 112, 0, 0, 0), NewArray8Short(0, 0, 0, 8, 120, 0, 0, 0)
+        };
+
+        // Lagrangian interpolation filter: 16 kernels of 8 taps each.
+        // The taps of every kernel sum to 128, and entry 0 is the identity kernel (tap 3 = 128).
+        private static readonly Array8<short>[] SubPelFilters8 = new Array8<short>[]
+        {
+            NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0),        NewArray8Short(0, 1, -5, 126, 8, -3, 1, 0),
+            NewArray8Short(-1, 3, -10, 122, 18, -6, 2, 0),   NewArray8Short(-1, 4, -13, 118, 27, -9, 3, -1),
+            NewArray8Short(-1, 4, -16, 112, 37, -11, 4, -1), NewArray8Short(-1, 5, -18, 105, 48, -14, 4, -1),
+            NewArray8Short(-1, 5, -19, 97, 58, -16, 5, -1),  NewArray8Short(-1, 6, -19, 88, 68, -18, 5, -1),
+            NewArray8Short(-1, 6, -19, 78, 78, -19, 6, -1),  NewArray8Short(-1, 5, -18, 68, 88, -19, 6, -1),
+            NewArray8Short(-1, 5, -16, 58, 97, -19, 5, -1),  NewArray8Short(-1, 4, -14, 48, 105, -18, 5, -1),
+            NewArray8Short(-1, 4, -11, 37, 112, -16, 4, -1), NewArray8Short(-1, 3, -9, 27, 118, -13, 4, -1),
+            NewArray8Short(0, 2, -6, 18, 122, -10, 3, -1),   NewArray8Short(0, 1, -3, 8, 126, -5, 1, 0)
+        };
+
+        // DCT based filter: 16 kernels of 8 taps each.
+        // The taps of every kernel sum to 128, and entry 0 is the identity kernel (tap 3 = 128).
+        private static readonly Array8<short>[] SubPelFilters8S = new Array8<short>[]
+        {
+            NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0),         NewArray8Short(-1, 3, -7, 127, 8, -3, 1, 0),
+            NewArray8Short(-2, 5, -13, 125, 17, -6, 3, -1),   NewArray8Short(-3, 7, -17, 121, 27, -10, 5, -2),
+            NewArray8Short(-4, 9, -20, 115, 37, -13, 6, -2),  NewArray8Short(-4, 10, -23, 108, 48, -16, 8, -3),
+            NewArray8Short(-4, 10, -24, 100, 59, -19, 9, -3), NewArray8Short(-4, 11, -24, 90, 70, -21, 10, -4),
+            NewArray8Short(-4, 11, -23, 80, 80, -23, 11, -4), NewArray8Short(-4, 10, -21, 70, 90, -24, 11, -4),
+            NewArray8Short(-3, 9, -19, 59, 100, -24, 10, -4), NewArray8Short(-3, 8, -16, 48, 108, -23, 10, -4),
+            NewArray8Short(-2, 6, -13, 37, 115, -20, 9, -4),  NewArray8Short(-2, 5, -10, 27, 121, -17, 7, -3),
+            NewArray8Short(-1, 3, -6, 17, 125, -13, 5, -2),   NewArray8Short(0, 1, -3, 8, 127, -7, 3, -1)
+        };
+
+        // Kernel set generated with freqmultiplier = 0.5: 16 kernels of 8 taps each.
+        // The taps of every kernel sum to 128, and entry 0 is the identity kernel (tap 3 = 128).
+        // NOTE(review): the "Lp" suffix suggests a low-pass (smoothing) variant - confirm.
+        private static readonly Array8<short>[] SubPelFilters8Lp = new Array8<short>[]
+        {
+            NewArray8Short(0, 0, 0, 128, 0, 0, 0, 0),       NewArray8Short(-3, -1, 32, 64, 38, 1, -3, 0),
+            NewArray8Short(-2, -2, 29, 63, 41, 2, -3, 0),   NewArray8Short(-2, -2, 26, 63, 43, 4, -4, 0),
+            NewArray8Short(-2, -3, 24, 62, 46, 5, -4, 0),   NewArray8Short(-2, -3, 21, 60, 49, 7, -4, 0),
+            NewArray8Short(-1, -4, 18, 59, 51, 9, -4, 0),   NewArray8Short(-1, -4, 16, 57, 53, 12, -4, -1),
+            NewArray8Short(-1, -4, 14, 55, 55, 14, -4, -1), NewArray8Short(-1, -4, 12, 53, 57, 16, -4, -1),
+            NewArray8Short(0, -4, 9, 51, 59, 18, -4, -1),   NewArray8Short(0, -4, 7, 49, 60, 21, -3, -2),
+            NewArray8Short(0, -4, 5, 46, 62, 24, -3, -2),   NewArray8Short(0, -4, 4, 43, 63, 26, -2, -2),
+            NewArray8Short(0, -3, 2, 41, 63, 29, -2, -2),   NewArray8Short(0, -3, 1, 38, 64, 32, -1, -3)
+        };
+
+        // Packs eight values into an Array8<short>, placing e0..e7 at indices 0..7 in that order.
+        private static Array8<short> NewArray8Short(short e0, short e1, short e2, short e3, short e4, short e5, short e6, short e7)
+        {
+            short[] taps = { e0, e1, e2, e3, e4, e5, e6, e7 };
+            Array8<short> kernel = new Array8<short>();
+
+            for (int tap = 0; tap < taps.Length; tap++)
+            {
+                kernel[tap] = taps[tap];
+            }
+
+            return kernel;
+        }
+
+        // The four kernel sets, selectable by index.
+        public static readonly Array8<short>[][] Vp9FilterKernels =
+        {
+            SubPelFilters8,   // [0]
+            SubPelFilters8Lp, // [1]
+            SubPelFilters8S,  // [2]
+            BilinearFilters,  // [3]
+        };
+
+        // Scan
+
+        // Default scan order for 4x4 blocks: a permutation of the 16 positions 0..15.
+        // Presumably raster indices (row * 4 + column) - confirm against the consumer.
+        private static readonly short[] DefaultScan4X4 = new short[]
+        {
+            0, 4, 1, 5, 8, 2, 12, 9, 3, 6, 13, 10, 7, 14, 11, 15,
+        };
+
+        // Column scan order for 4x4 blocks: a permutation of the 16 positions 0..15.
+        private static readonly short[] ColScan4X4 = new short[]
+        {
+            0, 4, 8, 1, 12, 5, 9, 2, 13, 6, 10, 3, 7, 14, 11, 15,
+        };
+
+        // Row scan order for 4x4 blocks: a permutation of the 16 positions 0..15.
+        private static readonly short[] RowScan4X4 = new short[]
+        {
+            0, 1, 4, 2, 5, 3, 6, 8, 9, 7, 12, 10, 13, 11, 14, 15,
+        };
+
+        // Default scan order for 8x8 blocks: 64 entries, one position per scan step.
+        private static readonly short[] DefaultScan8X8 = new short[]
+        {
+            0,  8,  1,  16, 9,  2,  17, 24, 10, 3,  18, 25, 32, 11, 4,  26,
+            33, 19, 40, 12, 34, 27, 5,  41, 20, 48, 13, 35, 42, 28, 21, 6,
+            49, 56, 36, 43, 29, 7,  14, 50, 57, 44, 22, 37, 15, 51, 58, 30,
+            45, 23, 52, 59, 38, 31, 60, 53, 46, 39, 61, 54, 47, 62, 55, 63,
+        };
+
+        // Column scan order for 8x8 blocks: 64 entries, one position per scan step.
+        private static readonly short[] ColScan8X8 = new short[]
+        {
+            0,  8,  16, 1,  24, 9,  32, 17, 2,  40, 25, 10, 33, 18, 48, 3,
+            26, 41, 11, 56, 19, 34, 4,  49, 27, 42, 12, 35, 20, 57, 50, 28,
+            5,  43, 13, 36, 58, 51, 21, 44, 6,  29, 59, 37, 14, 52, 22, 7,
+            45, 60, 30, 15, 38, 53, 23, 46, 31, 61, 39, 54, 47, 62, 55, 63,
+        };
+
+        // Row scan order for 8x8 blocks: 64 entries, one position per scan step.
+        private static readonly short[] RowScan8X8 = new short[]
+        {
+            0,  1,  2,  8,  9,  3,  16, 10, 4,  17, 11, 24, 5,  18, 25, 12,
+            19, 26, 32, 6,  13, 20, 33, 27, 7,  34, 40, 21, 28, 41, 14, 35,
+            48, 42, 29, 36, 49, 22, 43, 15, 56, 37, 50, 44, 30, 57, 23, 51,
+            58, 45, 38, 52, 31, 59, 53, 46, 60, 39, 61, 47, 54, 55, 62, 63,
+        };
+
+        // Default scan order for 16x16 blocks: 256 entries, one position per scan step.
+        private static readonly short[] DefaultScan16X16 = new short[]
+        {
+            0,   16,  1,   32,  17,  2,   48,  33,  18,  3,   64,  34,  49,  19,  65,
+            80,  50,  4,   35,  66,  20,  81,  96,  51,  5,   36,  82,  97,  67,  112,
+            21,  52,  98,  37,  83,  113, 6,   68,  128, 53,  22,  99,  114, 84,  7,
+            129, 38,  69,  100, 115, 144, 130, 85,  54,  23,  8,   145, 39,  70,  116,
+            101, 131, 160, 146, 55,  86,  24,  71,  132, 117, 161, 40,  9,   102, 147,
+            176, 162, 87,  56,  25,  133, 118, 177, 148, 72,  103, 41,  163, 10,  192,
+            178, 88,  57,  134, 149, 119, 26,  164, 73,  104, 193, 42,  179, 208, 11,
+            135, 89,  165, 120, 150, 58,  194, 180, 27,  74,  209, 105, 151, 136, 43,
+            90,  224, 166, 195, 181, 121, 210, 59,  12,  152, 106, 167, 196, 75,  137,
+            225, 211, 240, 182, 122, 91,  28,  197, 13,  226, 168, 183, 153, 44,  212,
+            138, 107, 241, 60,  29,  123, 198, 184, 227, 169, 242, 76,  213, 154, 45,
+            92,  14,  199, 139, 61,  228, 214, 170, 185, 243, 108, 77,  155, 30,  15,
+            200, 229, 124, 215, 244, 93,  46,  186, 171, 201, 109, 140, 230, 62,  216,
+            245, 31,  125, 78,  156, 231, 47,  187, 202, 217, 94,  246, 141, 63,  232,
+            172, 110, 247, 157, 79,  218, 203, 126, 233, 188, 248, 95,  173, 142, 219,
+            111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251,
+            190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239,
+            255,
+        };
+
+        // Column scan order for 16x16 blocks: 256 entries, one position per scan step.
+        private static readonly short[] ColScan16X16 = new short[]
+        {
+            0,   16,  32,  48,  1,   64,  17,  80,  33,  96,  49,  2,   65,  112, 18,
+            81,  34,  128, 50,  97,  3,   66,  144, 19,  113, 35,  82,  160, 98,  51,
+            129, 4,   67,  176, 20,  114, 145, 83,  36,  99,  130, 52,  192, 5,   161,
+            68,  115, 21,  146, 84,  208, 177, 37,  131, 100, 53,  162, 224, 69,  6,
+            116, 193, 147, 85,  22,  240, 132, 38,  178, 101, 163, 54,  209, 117, 70,
+            7,   148, 194, 86,  179, 225, 23,  133, 39,  164, 8,   102, 210, 241, 55,
+            195, 118, 149, 71,  180, 24,  87,  226, 134, 165, 211, 40,  103, 56,  72,
+            150, 196, 242, 119, 9,   181, 227, 88,  166, 25,  135, 41,  104, 212, 57,
+            151, 197, 120, 73,  243, 182, 136, 167, 213, 89,  10,  228, 105, 152, 198,
+            26,  42,  121, 183, 244, 168, 58,  137, 229, 74,  214, 90,  153, 199, 184,
+            11,  106, 245, 27,  122, 230, 169, 43,  215, 59,  200, 138, 185, 246, 75,
+            12,  91,  154, 216, 231, 107, 28,  44,  201, 123, 170, 60,  247, 232, 76,
+            139, 13,  92,  217, 186, 248, 155, 108, 29,  124, 45,  202, 233, 171, 61,
+            14,  77,  140, 15,  249, 93,  30,  187, 156, 218, 46,  109, 125, 62,  172,
+            78,  203, 31,  141, 234, 94,  47,  188, 63,  157, 110, 250, 219, 79,  126,
+            204, 173, 142, 95,  189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236,
+            159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239,
+            255,
+        };
+
+        // Row scan order for 16x16 blocks: 256 entries, one position per scan step.
+        private static readonly short[] RowScan16X16 = new short[]
+        {
+            0,   1,   2,   16,  3,   17,  4,   18,  32,  5,   33,  19,  6,   34,  48,
+            20,  49,  7,   35,  21,  50,  64,  8,   36,  65,  22,  51,  37,  80,  9,
+            66,  52,  23,  38,  81,  67,  10,  53,  24,  82,  68,  96,  39,  11,  54,
+            83,  97,  69,  25,  98,  84,  40,  112, 55,  12,  70,  99,  113, 85,  26,
+            41,  56,  114, 100, 13,  71,  128, 86,  27,  115, 101, 129, 42,  57,  72,
+            116, 14,  87,  130, 102, 144, 73,  131, 117, 28,  58,  15,  88,  43,  145,
+            103, 132, 146, 118, 74,  160, 89,  133, 104, 29,  59,  147, 119, 44,  161,
+            148, 90,  105, 134, 162, 120, 176, 75,  135, 149, 30,  60,  163, 177, 45,
+            121, 91,  106, 164, 178, 150, 192, 136, 165, 179, 31,  151, 193, 76,  122,
+            61,  137, 194, 107, 152, 180, 208, 46,  166, 167, 195, 92,  181, 138, 209,
+            123, 153, 224, 196, 77,  168, 210, 182, 240, 108, 197, 62,  154, 225, 183,
+            169, 211, 47,  139, 93,  184, 226, 212, 241, 198, 170, 124, 155, 199, 78,
+            213, 185, 109, 227, 200, 63,  228, 242, 140, 214, 171, 186, 156, 229, 243,
+            125, 94,  201, 244, 215, 216, 230, 141, 187, 202, 79,  172, 110, 157, 245,
+            217, 231, 95,  246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158, 188,
+            248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175,
+            190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254,
+            255,
+        };
+
+        // Default scan order for 32x32 blocks: 1024 entries, one position per scan step.
+        private static readonly short[] DefaultScan32X32 = new short[]
+        {
+            0,    32,   1,    64,  33,   2,    96,   65,   34,   128,  3,    97,   66,
+            160,  129,  35,   98,  4,    67,   130,  161,  192,  36,   99,   224,  5,
+            162,  193,  68,   131, 37,   100,  225,  194,  256,  163,  69,   132,  6,
+            226,  257,  288,  195, 101,  164,  38,   258,  7,    227,  289,  133,  320,
+            70,   196,  165,  290, 259,  228,  39,   321,  102,  352,  8,    197,  71,
+            134,  322,  291,  260, 353,  384,  229,  166,  103,  40,   354,  323,  292,
+            135,  385,  198,  261, 72,   9,    416,  167,  386,  355,  230,  324,  104,
+            293,  41,   417,  199, 136,  262,  387,  448,  325,  356,  10,   73,   418,
+            231,  168,  449,  294, 388,  105,  419,  263,  42,   200,  357,  450,  137,
+            480,  74,   326,  232, 11,   389,  169,  295,  420,  106,  451,  481,  358,
+            264,  327,  201,  43,  138,  512,  482,  390,  296,  233,  170,  421,  75,
+            452,  359,  12,   513, 265,  483,  328,  107,  202,  514,  544,  422,  391,
+            453,  139,  44,   234, 484,  297,  360,  171,  76,   515,  545,  266,  329,
+            454,  13,   423,  203, 108,  546,  485,  576,  298,  235,  140,  361,  330,
+            172,  547,  45,   455, 267,  577,  486,  77,   204,  362,  608,  14,   299,
+            578,  109,  236,  487, 609,  331,  141,  579,  46,   15,   173,  610,  363,
+            78,   205,  16,   110, 237,  611,  142,  47,   174,  79,   206,  17,   111,
+            238,  48,   143,  80,  175,  112,  207,  49,   18,   239,  81,   113,  19,
+            50,   82,   114,  51,  83,   115,  640,  516,  392,  268,  144,  20,   672,
+            641,  548,  517,  424, 393,  300,  269,  176,  145,  52,   21,   704,  673,
+            642,  580,  549,  518, 456,  425,  394,  332,  301,  270,  208,  177,  146,
+            84,   53,   22,   736, 705,  674,  643,  612,  581,  550,  519,  488,  457,
+            426,  395,  364,  333, 302,  271,  240,  209,  178,  147,  116,  85,   54,
+            23,   737,  706,  675, 613,  582,  551,  489,  458,  427,  365,  334,  303,
+            241,  210,  179,  117, 86,   55,   738,  707,  614,  583,  490,  459,  366,
+            335,  242,  211,  118, 87,   739,  615,  491,  367,  243,  119,  768,  644,
+            520,  396,  272,  148, 24,   800,  769,  676,  645,  552,  521,  428,  397,
+            304,  273,  180,  149, 56,   25,   832,  801,  770,  708,  677,  646,  584,
+            553,  522,  460,  429, 398,  336,  305,  274,  212,  181,  150,  88,   57,
+            26,   864,  833,  802, 771,  740,  709,  678,  647,  616,  585,  554,  523,
+            492,  461,  430,  399, 368,  337,  306,  275,  244,  213,  182,  151,  120,
+            89,   58,   27,   865, 834,  803,  741,  710,  679,  617,  586,  555,  493,
+            462,  431,  369,  338, 307,  245,  214,  183,  121,  90,   59,   866,  835,
+            742,  711,  618,  587, 494,  463,  370,  339,  246,  215,  122,  91,   867,
+            743,  619,  495,  371, 247,  123,  896,  772,  648,  524,  400,  276,  152,
+            28,   928,  897,  804, 773,  680,  649,  556,  525,  432,  401,  308,  277,
+            184,  153,  60,   29,  960,  929,  898,  836,  805,  774,  712,  681,  650,
+            588,  557,  526,  464, 433,  402,  340,  309,  278,  216,  185,  154,  92,
+            61,   30,   992,  961, 930,  899,  868,  837,  806,  775,  744,  713,  682,
+            651,  620,  589,  558, 527,  496,  465,  434,  403,  372,  341,  310,  279,
+            248,  217,  186,  155, 124,  93,   62,   31,   993,  962,  931,  869,  838,
+            807,  745,  714,  683, 621,  590,  559,  497,  466,  435,  373,  342,  311,
+            249,  218,  187,  125, 94,   63,   994,  963,  870,  839,  746,  715,  622,
+            591,  498,  467,  374, 343,  250,  219,  126,  95,   995,  871,  747,  623,
+            499,  375,  251,  127, 900,  776,  652,  528,  404,  280,  156,  932,  901,
+            808,  777,  684,  653, 560,  529,  436,  405,  312,  281,  188,  157,  964,
+            933,  902,  840,  809, 778,  716,  685,  654,  592,  561,  530,  468,  437,
+            406,  344,  313,  282, 220,  189,  158,  996,  965,  934,  903,  872,  841,
+            810,  779,  748,  717, 686,  655,  624,  593,  562,  531,  500,  469,  438,
+            407,  376,  345,  314, 283,  252,  221,  190,  159,  997,  966,  935,  873,
+            842,  811,  749,  718, 687,  625,  594,  563,  501,  470,  439,  377,  346,
+            315,  253,  222,  191, 998,  967,  874,  843,  750,  719,  626,  595,  502,
+            471,  378,  347,  254, 223,  999,  875,  751,  627,  503,  379,  255,  904,
+            780,  656,  532,  408, 284,  936,  905,  812,  781,  688,  657,  564,  533,
+            440,  409,  316,  285, 968,  937,  906,  844,  813,  782,  720,  689,  658,
+            596,  565,  534,  472, 441,  410,  348,  317,  286,  1000, 969,  938,  907,
+            876,  845,  814,  783, 752,  721,  690,  659,  628,  597,  566,  535,  504,
+            473,  442,  411,  380, 349,  318,  287,  1001, 970,  939,  877,  846,  815,
+            753,  722,  691,  629, 598,  567,  505,  474,  443,  381,  350,  319,  1002,
+            971,  878,  847,  754, 723,  630,  599,  506,  475,  382,  351,  1003, 879,
+            755,  631,  507,  383, 908,  784,  660,  536,  412,  940,  909,  816,  785,
+            692,  661,  568,  537, 444,  413,  972,  941,  910,  848,  817,  786,  724,
+            693,  662,  600,  569, 538,  476,  445,  414,  1004, 973,  942,  911,  880,
+            849,  818,  787,  756, 725,  694,  663,  632,  601,  570,  539,  508,  477,
+            446,  415,  1005, 974, 943,  881,  850,  819,  757,  726,  695,  633,  602,
+            571,  509,  478,  447, 1006, 975,  882,  851,  758,  727,  634,  603,  510,
+            479,  1007, 883,  759, 635,  511,  912,  788,  664,  540,  944,  913,  820,
+            789,  696,  665,  572, 541,  976,  945,  914,  852,  821,  790,  728,  697,
+            666,  604,  573,  542, 1008, 977,  946,  915,  884,  853,  822,  791,  760,
+            729,  698,  667,  636, 605,  574,  543,  1009, 978,  947,  885,  854,  823,
+            761,  730,  699,  637, 606,  575,  1010, 979,  886,  855,  762,  731,  638,
+            607,  1011, 887,  763, 639,  916,  792,  668,  948,  917,  824,  793,  700,
+            669,  980,  949,  918, 856,  825,  794,  732,  701,  670,  1012, 981,  950,
+            919,  888,  857,  826, 795,  764,  733,  702,  671,  1013, 982,  951,  889,
+            858,  827,  765,  734, 703,  1014, 983,  890,  859,  766,  735,  1015, 891,
+            767,  920,  796,  952, 921,  828,  797,  984,  953,  922,  860,  829,  798,
+            1016, 985,  954,  923, 892,  861,  830,  799,  1017, 986,  955,  893,  862,
+            831,  1018, 987,  894, 863,  1019, 895,  924,  956,  925,  988,  957,  926,
+            1020, 989,  958,  927, 1021, 990,  959,  1022, 991,  1023,
+        };
+
+        // Neighborhood 2-tuples for the various scans and block sizes,
+        // stored in {top, left} order, one tuple per position of the corresponding scan order.
+        //
+        // Neighbors for the default 4x4 scan: 34 values = 16 tuples plus a final 0, 0 tuple.
+        // NOTE(review): the trailing 0, 0 tuple looks like padding - confirm against the consumer.
+        private static readonly short[] DefaultScan4X4Neighbors = new short[]
+        {
+            0, 0, 0, 0, 0,  0, 1, 4, 4, 4,  1,  1, 8,  8,  5,  8, 2,
+            2, 2, 5, 9, 12, 6, 9, 3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
+        };
+
+        // Neighbor tuples for the 4x4 column scan: 34 values = 16 tuples plus a final 0, 0 tuple.
+        // Both values of every tuple are equal in this table.
+        private static readonly short[] ColScan4X4Neighbors = new short[]
+        {
+            0, 0, 0, 0, 4, 4, 0, 0, 8, 8,  1,  1, 5, 5,  1,  1, 9,
+            9, 2, 2, 6, 6, 2, 2, 3, 3, 10, 10, 7, 7, 11, 11, 0, 0,
+        };
+
+        // Neighbor tuples for the 4x4 row scan: 34 values = 16 tuples plus a final 0, 0 tuple.
+        // Both values of every tuple are equal in this table.
+        private static readonly short[] RowScan4X4Neighbors = new short[]
+        {
+            0, 0, 0, 0, 0, 0, 1, 1,  4,  4,  2,  2,  5,  5,  4,  4, 8,
+            8, 6, 6, 8, 8, 9, 9, 12, 12, 10, 10, 13, 13, 14, 14, 0, 0,
+        };
+
+        // Neighbor tuples for the 8x8 column scan: 130 values = 64 tuples plus a final 0, 0 tuple.
+        private static readonly short[] ColScan8X8Neighbors = new short[]
+        {
+            0,  0,  0,  0,  8,  8,  0,  0,  16, 16, 1,  1,  24, 24, 9,  9,  1,  1,  32,
+            32, 17, 17, 2,  2,  25, 25, 10, 10, 40, 40, 2,  2,  18, 18, 33, 33, 3,  3,
+            48, 48, 11, 11, 26, 26, 3,  3,  41, 41, 19, 19, 34, 34, 4,  4,  27, 27, 12,
+            12, 49, 49, 42, 42, 20, 20, 4,  4,  35, 35, 5,  5,  28, 28, 50, 50, 43, 43,
+            13, 13, 36, 36, 5,  5,  21, 21, 51, 51, 29, 29, 6,  6,  44, 44, 14, 14, 6,
+            6,  37, 37, 52, 52, 22, 22, 7,  7,  30, 30, 45, 45, 15, 15, 38, 38, 23, 23,
+            53, 53, 31, 31, 46, 46, 39, 39, 54, 54, 47, 47, 55, 55, 0,  0,
+        };
+
+        // Neighbor tuples for the 8x8 row scan: 130 values = 64 tuples plus a final 0, 0 tuple.
+        private static readonly short[] RowScan8X8Neighbors = new short[]
+        {
+            0,  0,  0,  0,  1,  1,  0,  0,  8,  8,  2,  2,  8,  8,  9,  9,  3,  3,  16,
+            16, 10, 10, 16, 16, 4,  4,  17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24,
+            5,  5,  12, 12, 19, 19, 32, 32, 26, 26, 6,  6,  33, 33, 32, 32, 20, 20, 27,
+            27, 40, 40, 13, 13, 34, 34, 40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21,
+            42, 42, 14, 14, 48, 48, 36, 36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50,
+            50, 57, 57, 44, 44, 37, 37, 51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59,
+            38, 38, 60, 60, 46, 46, 53, 53, 54, 54, 61, 61, 62, 62, 0,  0,
+        };
+
+        // Neighbor tuples for the default 8x8 scan: 130 values = 64 tuples plus a final 0, 0 tuple.
+        private static readonly short[] DefaultScan8X8Neighbors = new short[]
+        {
+            0,  0,  0,  0,  0,  0,  8,  8,  1,  8,  1,  1,  9,  16, 16, 16, 2,  9,  2,
+            2,  10, 17, 17, 24, 24, 24, 3,  10, 3,  3,  18, 25, 25, 32, 11, 18, 32, 32,
+            4,  11, 26, 33, 19, 26, 4,  4,  33, 40, 12, 19, 40, 40, 5,  12, 27, 34, 34,
+            41, 20, 27, 13, 20, 5,  5,  41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6,  6,
+            6,  13, 42, 49, 49, 56, 36, 43, 14, 21, 29, 36, 7,  14, 43, 50, 50, 57, 22,
+            29, 37, 44, 15, 22, 44, 51, 51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45,
+            31, 38, 53, 60, 46, 53, 39, 46, 54, 61, 47, 54, 55, 62, 0,  0,
+        };
+
+        // Neighbor tuples for the 16x16 column scan: 514 values = 256 tuples plus a final 0, 0 tuple.
+        private static readonly short[] ColScan16X16Neighbors = new short[]
+        {
+            0,   0,   0,   0,   16,  16,  32,  32,  0,   0,   48,  48,  1,   1,   64,
+            64,  17,  17,  80,  80,  33,  33,  1,   1,   49,  49,  96,  96,  2,   2,
+            65,  65,  18,  18,  112, 112, 34,  34,  81,  81,  2,   2,   50,  50,  128,
+            128, 3,   3,   97,  97,  19,  19,  66,  66,  144, 144, 82,  82,  35,  35,
+            113, 113, 3,   3,   51,  51,  160, 160, 4,   4,   98,  98,  129, 129, 67,
+            67,  20,  20,  83,  83,  114, 114, 36,  36,  176, 176, 4,   4,   145, 145,
+            52,  52,  99,  99,  5,   5,   130, 130, 68,  68,  192, 192, 161, 161, 21,
+            21,  115, 115, 84,  84,  37,  37,  146, 146, 208, 208, 53,  53,  5,   5,
+            100, 100, 177, 177, 131, 131, 69,  69,  6,   6,   224, 224, 116, 116, 22,
+            22,  162, 162, 85,  85,  147, 147, 38,  38,  193, 193, 101, 101, 54,  54,
+            6,   6,   132, 132, 178, 178, 70,  70,  163, 163, 209, 209, 7,   7,   117,
+            117, 23,  23,  148, 148, 7,   7,   86,  86,  194, 194, 225, 225, 39,  39,
+            179, 179, 102, 102, 133, 133, 55,  55,  164, 164, 8,   8,   71,  71,  210,
+            210, 118, 118, 149, 149, 195, 195, 24,  24,  87,  87,  40,  40,  56,  56,
+            134, 134, 180, 180, 226, 226, 103, 103, 8,   8,   165, 165, 211, 211, 72,
+            72,  150, 150, 9,   9,   119, 119, 25,  25,  88,  88,  196, 196, 41,  41,
+            135, 135, 181, 181, 104, 104, 57,  57,  227, 227, 166, 166, 120, 120, 151,
+            151, 197, 197, 73,  73,  9,   9,   212, 212, 89,  89,  136, 136, 182, 182,
+            10,  10,  26,  26,  105, 105, 167, 167, 228, 228, 152, 152, 42,  42,  121,
+            121, 213, 213, 58,  58,  198, 198, 74,  74,  137, 137, 183, 183, 168, 168,
+            10,  10,  90,  90,  229, 229, 11,  11,  106, 106, 214, 214, 153, 153, 27,
+            27,  199, 199, 43,  43,  184, 184, 122, 122, 169, 169, 230, 230, 59,  59,
+            11,  11,  75,  75,  138, 138, 200, 200, 215, 215, 91,  91,  12,  12,  28,
+            28,  185, 185, 107, 107, 154, 154, 44,  44,  231, 231, 216, 216, 60,  60,
+            123, 123, 12,  12,  76,  76,  201, 201, 170, 170, 232, 232, 139, 139, 92,
+            92,  13,  13,  108, 108, 29,  29,  186, 186, 217, 217, 155, 155, 45,  45,
+            13,  13,  61,  61,  124, 124, 14,  14,  233, 233, 77,  77,  14,  14,  171,
+            171, 140, 140, 202, 202, 30,  30,  93,  93,  109, 109, 46,  46,  156, 156,
+            62,  62,  187, 187, 15,  15,  125, 125, 218, 218, 78,  78,  31,  31,  172,
+            172, 47,  47,  141, 141, 94,  94,  234, 234, 203, 203, 63,  63,  110, 110,
+            188, 188, 157, 157, 126, 126, 79,  79,  173, 173, 95,  95,  219, 219, 142,
+            142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220, 220,
+            143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221, 175,
+            175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223, 223,
+            239, 239, 0,   0,
+        };
+
+        // Neighbor tuples for the 16x16 row scan: 514 values = 256 tuples plus a final 0, 0 tuple.
+        private static readonly short[] RowScan16X16Neighbors = new short[]
+        {
+            0,   0,   0,   0,   1,   1,   0,   0,   2,   2,   16,  16,  3,   3,   17,
+            17,  16,  16,  4,   4,   32,  32,  18,  18,  5,   5,   33,  33,  32,  32,
+            19,  19,  48,  48,  6,   6,   34,  34,  20,  20,  49,  49,  48,  48,  7,
+            7,   35,  35,  64,  64,  21,  21,  50,  50,  36,  36,  64,  64,  8,   8,
+            65,  65,  51,  51,  22,  22,  37,  37,  80,  80,  66,  66,  9,   9,   52,
+            52,  23,  23,  81,  81,  67,  67,  80,  80,  38,  38,  10,  10,  53,  53,
+            82,  82,  96,  96,  68,  68,  24,  24,  97,  97,  83,  83,  39,  39,  96,
+            96,  54,  54,  11,  11,  69,  69,  98,  98,  112, 112, 84,  84,  25,  25,
+            40,  40,  55,  55,  113, 113, 99,  99,  12,  12,  70,  70,  112, 112, 85,
+            85,  26,  26,  114, 114, 100, 100, 128, 128, 41,  41,  56,  56,  71,  71,
+            115, 115, 13,  13,  86,  86,  129, 129, 101, 101, 128, 128, 72,  72,  130,
+            130, 116, 116, 27,  27,  57,  57,  14,  14,  87,  87,  42,  42,  144, 144,
+            102, 102, 131, 131, 145, 145, 117, 117, 73,  73,  144, 144, 88,  88,  132,
+            132, 103, 103, 28,  28,  58,  58,  146, 146, 118, 118, 43,  43,  160, 160,
+            147, 147, 89,  89,  104, 104, 133, 133, 161, 161, 119, 119, 160, 160, 74,
+            74,  134, 134, 148, 148, 29,  29,  59,  59,  162, 162, 176, 176, 44,  44,
+            120, 120, 90,  90,  105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135,
+            135, 164, 164, 178, 178, 30,  30,  150, 150, 192, 192, 75,  75,  121, 121,
+            60,  60,  136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45,
+            45,  165, 165, 166, 166, 194, 194, 91,  91,  180, 180, 137, 137, 208, 208,
+            122, 122, 152, 152, 208, 208, 195, 195, 76,  76,  167, 167, 209, 209, 181,
+            181, 224, 224, 107, 107, 196, 196, 61,  61,  153, 153, 224, 224, 182, 182,
+            168, 168, 210, 210, 46,  46,  138, 138, 92,  92,  183, 183, 225, 225, 211,
+            211, 240, 240, 197, 197, 169, 169, 123, 123, 154, 154, 198, 198, 77,  77,
+            212, 212, 184, 184, 108, 108, 226, 226, 199, 199, 62,  62,  227, 227, 241,
+            241, 139, 139, 213, 213, 170, 170, 185, 185, 155, 155, 228, 228, 242, 242,
+            124, 124, 93,  93,  200, 200, 243, 243, 214, 214, 215, 215, 229, 229, 140,
+            140, 186, 186, 201, 201, 78,  78,  171, 171, 109, 109, 156, 156, 244, 244,
+            216, 216, 230, 230, 94,  94,  245, 245, 231, 231, 125, 125, 202, 202, 246,
+            246, 232, 232, 172, 172, 217, 217, 141, 141, 110, 110, 157, 157, 187, 187,
+            247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188, 203, 203, 142,
+            142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219, 219, 174, 174,
+            189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235, 206, 206, 236,
+            236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238, 238, 253, 253,
+            254, 254, 0,   0,
+        };
+
+        // Neighbor tuples for the default 16x16 scan: 514 values = 256 tuples plus a final 0, 0 tuple.
+        private static readonly short[] DefaultScan16X16Neighbors = new short[]
+        {
+            0,   0,   0,   0,   0,   0,   16,  16,  1,   16,  1,   1,   32,  32,  17,
+            32,  2,   17,  2,   2,   48,  48,  18,  33,  33,  48,  3,   18,  49,  64,
+            64,  64,  34,  49,  3,   3,   19,  34,  50,  65,  4,   19,  65,  80,  80,
+            80,  35,  50,  4,   4,   20,  35,  66,  81,  81,  96,  51,  66,  96,  96,
+            5,   20,  36,  51,  82,  97,  21,  36,  67,  82,  97,  112, 5,   5,   52,
+            67,  112, 112, 37,  52,  6,   21,  83,  98,  98,  113, 68,  83,  6,   6,
+            113, 128, 22,  37,  53,  68,  84,  99,  99,  114, 128, 128, 114, 129, 69,
+            84,  38,  53,  7,   22,  7,   7,   129, 144, 23,  38,  54,  69,  100, 115,
+            85,  100, 115, 130, 144, 144, 130, 145, 39,  54,  70,  85,  8,   23,  55,
+            70,  116, 131, 101, 116, 145, 160, 24,  39,  8,   8,   86,  101, 131, 146,
+            160, 160, 146, 161, 71,  86,  40,  55,  9,   24,  117, 132, 102, 117, 161,
+            176, 132, 147, 56,  71,  87,  102, 25,  40,  147, 162, 9,   9,   176, 176,
+            162, 177, 72,  87,  41,  56,  118, 133, 133, 148, 103, 118, 10,  25,  148,
+            163, 57,  72,  88,  103, 177, 192, 26,  41,  163, 178, 192, 192, 10,  10,
+            119, 134, 73,  88,  149, 164, 104, 119, 134, 149, 42,  57,  178, 193, 164,
+            179, 11,  26,  58,  73,  193, 208, 89,  104, 135, 150, 120, 135, 27,  42,
+            74,  89,  208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43,
+            58,  11,  11,  136, 151, 90,  105, 151, 166, 180, 195, 59,  74,  121, 136,
+            209, 224, 195, 210, 224, 224, 166, 181, 106, 121, 75,  90,  12,  27,  181,
+            196, 12,  12,  210, 225, 152, 167, 167, 182, 137, 152, 28,  43,  196, 211,
+            122, 137, 91,  106, 225, 240, 44,  59,  13,  28,  107, 122, 182, 197, 168,
+            183, 211, 226, 153, 168, 226, 241, 60,  75,  197, 212, 138, 153, 29,  44,
+            76,  91,  13,  13,  183, 198, 123, 138, 45,  60,  212, 227, 198, 213, 154,
+            169, 169, 184, 227, 242, 92,  107, 61,  76,  139, 154, 14,  29,  14,  14,
+            184, 199, 213, 228, 108, 123, 199, 214, 228, 243, 77,  92,  30,  45,  170,
+            185, 155, 170, 185, 200, 93,  108, 124, 139, 214, 229, 46,  61,  200, 215,
+            229, 244, 15,  30,  109, 124, 62,  77,  140, 155, 215, 230, 31,  46,  171,
+            186, 186, 201, 201, 216, 78,  93,  230, 245, 125, 140, 47,  62,  216, 231,
+            156, 171, 94,  109, 231, 246, 141, 156, 63,  78,  202, 217, 187, 202, 110,
+            125, 217, 232, 172, 187, 232, 247, 79,  94,  157, 172, 126, 141, 203, 218,
+            95,  110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203, 234,
+            249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235, 250,
+            174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205, 236,
+            251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223, 238,
+            239, 254, 0,   0,
+        };
+
+        private static readonly short[] DefaultScan32X32Neighbors = new short[] // Neighbor-index table handed to ScanOrder together with DefaultScan32X32 / Vp9DefaultIscan32X32.
+        { // 2050 values, ending in a (0, 0) pair. NOTE(review): presumably two neighbor positions per scan step, as in libvpx's default_scan_32x32_neighbors - confirm against the consumer.
+            0,   0,    0,   0,    0,   0,    32,  32,   1,   32,  1,   1,    64,  64,
+            33,  64,   2,   33,   96,  96,   2,   2,    65,  96,  34,  65,   128, 128,
+            97,  128,  3,   34,   66,  97,   3,   3,    35,  66,  98,  129,  129, 160,
+            160, 160,  4,   35,   67,  98,   192, 192,  4,   4,   130, 161,  161, 192,
+            36,  67,   99,  130,  5,   36,   68,  99,   193, 224, 162, 193,  224, 224,
+            131, 162,  37,  68,   100, 131,  5,   5,    194, 225, 225, 256,  256, 256,
+            163, 194,  69,  100,  132, 163,  6,   37,   226, 257, 6,   6,    195, 226,
+            257, 288,  101, 132,  288, 288,  38,  69,   164, 195, 133, 164,  258, 289,
+            227, 258,  196, 227,  7,   38,   289, 320,  70,  101, 320, 320,  7,   7,
+            165, 196,  39,  70,   102, 133,  290, 321,  259, 290, 228, 259,  321, 352,
+            352, 352,  197, 228,  134, 165,  71,  102,  8,   39,  322, 353,  291, 322,
+            260, 291,  103, 134,  353, 384,  166, 197,  229, 260, 40,  71,   8,   8,
+            384, 384,  135, 166,  354, 385,  323, 354,  198, 229, 292, 323,  72,  103,
+            261, 292,  9,   40,   385, 416,  167, 198,  104, 135, 230, 261,  355, 386,
+            416, 416,  293, 324,  324, 355,  9,   9,    41,  72,  386, 417,  199, 230,
+            136, 167,  417, 448,  262, 293,  356, 387,  73,  104, 387, 418,  231, 262,
+            10,  41,   168, 199,  325, 356,  418, 449,  105, 136, 448, 448,  42,  73,
+            294, 325,  200, 231,  10,  10,   357, 388,  137, 168, 263, 294,  388, 419,
+            74,  105,  419, 450,  449, 480,  326, 357,  232, 263, 295, 326,  169, 200,
+            11,  42,   106, 137,  480, 480,  450, 481,  358, 389, 264, 295,  201, 232,
+            138, 169,  389, 420,  43,  74,   420, 451,  327, 358, 11,  11,   481, 512,
+            233, 264,  451, 482,  296, 327,  75,  106,  170, 201, 482, 513,  512, 512,
+            390, 421,  359, 390,  421, 452,  107, 138,  12,  43,  202, 233,  452, 483,
+            265, 296,  328, 359,  139, 170,  44,  75,   483, 514, 513, 544,  234, 265,
+            297, 328,  422, 453,  12,  12,   391, 422,  171, 202, 76,  107,  514, 545,
+            453, 484,  544, 544,  266, 297,  203, 234,  108, 139, 329, 360,  298, 329,
+            140, 171,  515, 546,  13,  44,   423, 454,  235, 266, 545, 576,  454, 485,
+            45,  76,   172, 203,  330, 361,  576, 576,  13,  13,  267, 298,  546, 577,
+            77,  108,  204, 235,  455, 486,  577, 608,  299, 330, 109, 140,  547, 578,
+            14,  45,   14,  14,   141, 172,  578, 609,  331, 362, 46,  77,   173, 204,
+            15,  15,   78,  109,  205, 236,  579, 610,  110, 141, 15,  46,   142, 173,
+            47,  78,   174, 205,  16,  16,   79,  110,  206, 237, 16,  47,   111, 142,
+            48,  79,   143, 174,  80,  111,  175, 206,  17,  48,  17,  17,   207, 238,
+            49,  80,   81,  112,  18,  18,   18,  49,   50,  81,  82,  113,  19,  50,
+            51,  82,   83,  114,  608, 608,  484, 515,  360, 391, 236, 267,  112, 143,
+            19,  19,   640, 640,  609, 640,  516, 547,  485, 516, 392, 423,  361, 392,
+            268, 299,  237, 268,  144, 175,  113, 144,  20,  51,  20,  20,   672, 672,
+            641, 672,  610, 641,  548, 579,  517, 548,  486, 517, 424, 455,  393, 424,
+            362, 393,  300, 331,  269, 300,  238, 269,  176, 207, 145, 176,  114, 145,
+            52,  83,   21,  52,   21,  21,   704, 704,  673, 704, 642, 673,  611, 642,
+            580, 611,  549, 580,  518, 549,  487, 518,  456, 487, 425, 456,  394, 425,
+            363, 394,  332, 363,  301, 332,  270, 301,  239, 270, 208, 239,  177, 208,
+            146, 177,  115, 146,  84,  115,  53,  84,   22,  53,  22,  22,   705, 736,
+            674, 705,  643, 674,  581, 612,  550, 581,  519, 550, 457, 488,  426, 457,
+            395, 426,  333, 364,  302, 333,  271, 302,  209, 240, 178, 209,  147, 178,
+            85,  116,  54,  85,   23,  54,   706, 737,  675, 706, 582, 613,  551, 582,
+            458, 489,  427, 458,  334, 365,  303, 334,  210, 241, 179, 210,  86,  117,
+            55,  86,   707, 738,  583, 614,  459, 490,  335, 366, 211, 242,  87,  118,
+            736, 736,  612, 643,  488, 519,  364, 395,  240, 271, 116, 147,  23,  23,
+            768, 768,  737, 768,  644, 675,  613, 644,  520, 551, 489, 520,  396, 427,
+            365, 396,  272, 303,  241, 272,  148, 179,  117, 148, 24,  55,   24,  24,
+            800, 800,  769, 800,  738, 769,  676, 707,  645, 676, 614, 645,  552, 583,
+            521, 552,  490, 521,  428, 459,  397, 428,  366, 397, 304, 335,  273, 304,
+            242, 273,  180, 211,  149, 180,  118, 149,  56,  87,  25,  56,   25,  25,
+            832, 832,  801, 832,  770, 801,  739, 770,  708, 739, 677, 708,  646, 677,
+            615, 646,  584, 615,  553, 584,  522, 553,  491, 522, 460, 491,  429, 460,
+            398, 429,  367, 398,  336, 367,  305, 336,  274, 305, 243, 274,  212, 243,
+            181, 212,  150, 181,  119, 150,  88,  119,  57,  88,  26,  57,   26,  26,
+            833, 864,  802, 833,  771, 802,  709, 740,  678, 709, 647, 678,  585, 616,
+            554, 585,  523, 554,  461, 492,  430, 461,  399, 430, 337, 368,  306, 337,
+            275, 306,  213, 244,  182, 213,  151, 182,  89,  120, 58,  89,   27,  58,
+            834, 865,  803, 834,  710, 741,  679, 710,  586, 617, 555, 586,  462, 493,
+            431, 462,  338, 369,  307, 338,  214, 245,  183, 214, 90,  121,  59,  90,
+            835, 866,  711, 742,  587, 618,  463, 494,  339, 370, 215, 246,  91,  122,
+            864, 864,  740, 771,  616, 647,  492, 523,  368, 399, 244, 275,  120, 151,
+            27,  27,   896, 896,  865, 896,  772, 803,  741, 772, 648, 679,  617, 648,
+            524, 555,  493, 524,  400, 431,  369, 400,  276, 307, 245, 276,  152, 183,
+            121, 152,  28,  59,   28,  28,   928, 928,  897, 928, 866, 897,  804, 835,
+            773, 804,  742, 773,  680, 711,  649, 680,  618, 649, 556, 587,  525, 556,
+            494, 525,  432, 463,  401, 432,  370, 401,  308, 339, 277, 308,  246, 277,
+            184, 215,  153, 184,  122, 153,  60,  91,   29,  60,  29,  29,   960, 960,
+            929, 960,  898, 929,  867, 898,  836, 867,  805, 836, 774, 805,  743, 774,
+            712, 743,  681, 712,  650, 681,  619, 650,  588, 619, 557, 588,  526, 557,
+            495, 526,  464, 495,  433, 464,  402, 433,  371, 402, 340, 371,  309, 340,
+            278, 309,  247, 278,  216, 247,  185, 216,  154, 185, 123, 154,  92,  123,
+            61,  92,   30,  61,   30,  30,   961, 992,  930, 961, 899, 930,  837, 868,
+            806, 837,  775, 806,  713, 744,  682, 713,  651, 682, 589, 620,  558, 589,
+            527, 558,  465, 496,  434, 465,  403, 434,  341, 372, 310, 341,  279, 310,
+            217, 248,  186, 217,  155, 186,  93,  124,  62,  93,  31,  62,   962, 993,
+            931, 962,  838, 869,  807, 838,  714, 745,  683, 714, 590, 621,  559, 590,
+            466, 497,  435, 466,  342, 373,  311, 342,  218, 249, 187, 218,  94,  125,
+            63,  94,   963, 994,  839, 870,  715, 746,  591, 622, 467, 498,  343, 374,
+            219, 250,  95,  126,  868, 899,  744, 775,  620, 651, 496, 527,  372, 403,
+            248, 279,  124, 155,  900, 931,  869, 900,  776, 807, 745, 776,  652, 683,
+            621, 652,  528, 559,  497, 528,  404, 435,  373, 404, 280, 311,  249, 280,
+            156, 187,  125, 156,  932, 963,  901, 932,  870, 901, 808, 839,  777, 808,
+            746, 777,  684, 715,  653, 684,  622, 653,  560, 591, 529, 560,  498, 529,
+            436, 467,  405, 436,  374, 405,  312, 343,  281, 312, 250, 281,  188, 219,
+            157, 188,  126, 157,  964, 995,  933, 964,  902, 933, 871, 902,  840, 871,
+            809, 840,  778, 809,  747, 778,  716, 747,  685, 716, 654, 685,  623, 654,
+            592, 623,  561, 592,  530, 561,  499, 530,  468, 499, 437, 468,  406, 437,
+            375, 406,  344, 375,  313, 344,  282, 313,  251, 282, 220, 251,  189, 220,
+            158, 189,  127, 158,  965, 996,  934, 965,  903, 934, 841, 872,  810, 841,
+            779, 810,  717, 748,  686, 717,  655, 686,  593, 624, 562, 593,  531, 562,
+            469, 500,  438, 469,  407, 438,  345, 376,  314, 345, 283, 314,  221, 252,
+            190, 221,  159, 190,  966, 997,  935, 966,  842, 873, 811, 842,  718, 749,
+            687, 718,  594, 625,  563, 594,  470, 501,  439, 470, 346, 377,  315, 346,
+            222, 253,  191, 222,  967, 998,  843, 874,  719, 750, 595, 626,  471, 502,
+            347, 378,  223, 254,  872, 903,  748, 779,  624, 655, 500, 531,  376, 407,
+            252, 283,  904, 935,  873, 904,  780, 811,  749, 780, 656, 687,  625, 656,
+            532, 563,  501, 532,  408, 439,  377, 408,  284, 315, 253, 284,  936, 967,
+            905, 936,  874, 905,  812, 843,  781, 812,  750, 781, 688, 719,  657, 688,
+            626, 657,  564, 595,  533, 564,  502, 533,  440, 471, 409, 440,  378, 409,
+            316, 347,  285, 316,  254, 285,  968, 999,  937, 968, 906, 937,  875, 906,
+            844, 875,  813, 844,  782, 813,  751, 782,  720, 751, 689, 720,  658, 689,
+            627, 658,  596, 627,  565, 596,  534, 565,  503, 534, 472, 503,  441, 472,
+            410, 441,  379, 410,  348, 379,  317, 348,  286, 317, 255, 286,  969, 1000,
+            938, 969,  907, 938,  845, 876,  814, 845,  783, 814, 721, 752,  690, 721,
+            659, 690,  597, 628,  566, 597,  535, 566,  473, 504, 442, 473,  411, 442,
+            349, 380,  318, 349,  287, 318,  970, 1001, 939, 970, 846, 877,  815, 846,
+            722, 753,  691, 722,  598, 629,  567, 598,  474, 505, 443, 474,  350, 381,
+            319, 350,  971, 1002, 847, 878,  723, 754,  599, 630, 475, 506,  351, 382,
+            876, 907,  752, 783,  628, 659,  504, 535,  380, 411, 908, 939,  877, 908,
+            784, 815,  753, 784,  660, 691,  629, 660,  536, 567, 505, 536,  412, 443,
+            381, 412,  940, 971,  909, 940,  878, 909,  816, 847, 785, 816,  754, 785,
+            692, 723,  661, 692,  630, 661,  568, 599,  537, 568, 506, 537,  444, 475,
+            413, 444,  382, 413,  972, 1003, 941, 972,  910, 941, 879, 910,  848, 879,
+            817, 848,  786, 817,  755, 786,  724, 755,  693, 724, 662, 693,  631, 662,
+            600, 631,  569, 600,  538, 569,  507, 538,  476, 507, 445, 476,  414, 445,
+            383, 414,  973, 1004, 942, 973,  911, 942,  849, 880, 818, 849,  787, 818,
+            725, 756,  694, 725,  663, 694,  601, 632,  570, 601, 539, 570,  477, 508,
+            446, 477,  415, 446,  974, 1005, 943, 974,  850, 881, 819, 850,  726, 757,
+            695, 726,  602, 633,  571, 602,  478, 509,  447, 478, 975, 1006, 851, 882,
+            727, 758,  603, 634,  479, 510,  880, 911,  756, 787, 632, 663,  508, 539,
+            912, 943,  881, 912,  788, 819,  757, 788,  664, 695, 633, 664,  540, 571,
+            509, 540,  944, 975,  913, 944,  882, 913,  820, 851, 789, 820,  758, 789,
+            696, 727,  665, 696,  634, 665,  572, 603,  541, 572, 510, 541,  976, 1007,
+            945, 976,  914, 945,  883, 914,  852, 883,  821, 852, 790, 821,  759, 790,
+            728, 759,  697, 728,  666, 697,  635, 666,  604, 635, 573, 604,  542, 573,
+            511, 542,  977, 1008, 946, 977,  915, 946,  853, 884, 822, 853,  791, 822,
+            729, 760,  698, 729,  667, 698,  605, 636,  574, 605, 543, 574,  978, 1009,
+            947, 978,  854, 885,  823, 854,  730, 761,  699, 730, 606, 637,  575, 606,
+            979, 1010, 855, 886,  731, 762,  607, 638,  884, 915, 760, 791,  636, 667,
+            916, 947,  885, 916,  792, 823,  761, 792,  668, 699, 637, 668,  948, 979,
+            917, 948,  886, 917,  824, 855,  793, 824,  762, 793, 700, 731,  669, 700,
+            638, 669,  980, 1011, 949, 980,  918, 949,  887, 918, 856, 887,  825, 856,
+            794, 825,  763, 794,  732, 763,  701, 732,  670, 701, 639, 670,  981, 1012,
+            950, 981,  919, 950,  857, 888,  826, 857,  795, 826, 733, 764,  702, 733,
+            671, 702,  982, 1013, 951, 982,  858, 889,  827, 858, 734, 765,  703, 734,
+            983, 1014, 859, 890,  735, 766,  888, 919,  764, 795, 920, 951,  889, 920,
+            796, 827,  765, 796,  952, 983,  921, 952,  890, 921, 828, 859,  797, 828,
+            766, 797,  984, 1015, 953, 984,  922, 953,  891, 922, 860, 891,  829, 860,
+            798, 829,  767, 798,  985, 1016, 954, 985,  923, 954, 861, 892,  830, 861,
+            799, 830,  986, 1017, 955, 986,  862, 893,  831, 862, 987, 1018, 863, 894,
+            892, 923,  924, 955,  893, 924,  956, 987,  925, 956, 894, 925,  988, 1019,
+            957, 988,  926, 957,  895, 926,  989, 1020, 958, 989, 927, 958,  990, 1021,
+            959, 990,  991, 1022, 0,   0,
+        };
+
+        // Index table supplied as the IScan member of the default 4x4 scan orders.
+        // Holds each value 0..15 exactly once; shown here as a 4x4 grid.
+        // NOTE(review): presumably the inverse permutation of DefaultScan4X4 - confirm.
+        private static readonly short[] Vp9DefaultIscan4X4 =
+        {
+            0, 2,  5,  8,
+            1, 3,  9,  12,
+            4, 7,  11, 14,
+            6, 10, 13, 15,
+        };
+
+        // Index table supplied as the IScan member of the column 4x4 scan order.
+        // Holds each value 0..15 exactly once; shown here as a 4x4 grid.
+        // NOTE(review): presumably the inverse permutation of ColScan4X4 - confirm.
+        private static readonly short[] Vp9ColIscan4X4 =
+        {
+            0, 3, 7,  11,
+            1, 5, 9,  12,
+            2, 6, 10, 14,
+            4, 8, 13, 15,
+        };
+
+        // Index table supplied as the IScan member of the row 4x4 scan order.
+        // Holds each value 0..15 exactly once; shown here as a 4x4 grid.
+        // NOTE(review): presumably the inverse permutation of RowScan4X4 - confirm.
+        private static readonly short[] Vp9RowIscan4X4 =
+        {
+            0,  1,  3,  5,
+            2,  4,  6,  9,
+            7,  8,  11, 13,
+            10, 12, 14, 15,
+        };
+
+        // Index table supplied as the IScan member of the column 8x8 scan order.
+        // Holds each value 0..63 exactly once; shown here as an 8x8 grid.
+        // NOTE(review): presumably the inverse permutation of ColScan8X8 - confirm.
+        private static readonly short[] Vp9ColIscan8X8 =
+        {
+            0,  3,  8,  15, 22, 32, 40, 47,
+            1,  5,  11, 18, 26, 34, 44, 51,
+            2,  7,  13, 20, 28, 38, 46, 54,
+            4,  10, 16, 24, 31, 41, 50, 56,
+            6,  12, 21, 27, 35, 43, 52, 58,
+            9,  17, 25, 33, 39, 48, 55, 60,
+            14, 23, 30, 37, 45, 53, 59, 62,
+            19, 29, 36, 42, 49, 57, 61, 63,
+        };
+
+        // Index table supplied as the IScan member of the row 8x8 scan order.
+        // Holds each value 0..63 exactly once; shown here as an 8x8 grid.
+        // NOTE(review): presumably the inverse permutation of RowScan8X8 - confirm.
+        private static readonly short[] Vp9RowIscan8X8 =
+        {
+            0,  1,  2,  5,  8,  12, 19, 24,
+            3,  4,  7,  10, 15, 20, 30, 39,
+            6,  9,  13, 16, 21, 27, 37, 46,
+            11, 14, 17, 23, 28, 34, 44, 52,
+            18, 22, 25, 31, 35, 41, 50, 57,
+            26, 29, 33, 38, 43, 49, 55, 59,
+            32, 36, 42, 47, 51, 54, 60, 61,
+            40, 45, 48, 53, 56, 58, 62, 63,
+        };
+
+        // Index table supplied as the IScan member of the default 8x8 scan orders.
+        // Holds each value 0..63 exactly once; shown here as an 8x8 grid.
+        // NOTE(review): presumably the inverse permutation of DefaultScan8X8 - confirm.
+        private static readonly short[] Vp9DefaultIscan8X8 =
+        {
+            0,  2,  5,  9,  14, 22, 31, 37,
+            1,  4,  8,  13, 19, 26, 38, 44,
+            3,  6,  10, 17, 24, 30, 42, 49,
+            7,  11, 15, 21, 29, 36, 47, 53,
+            12, 16, 20, 27, 34, 43, 52, 57,
+            18, 23, 28, 35, 41, 48, 56, 60,
+            25, 32, 39, 45, 50, 55, 59, 62,
+            33, 40, 46, 51, 54, 58, 61, 63,
+        };
+
+        private static readonly short[] Vp9ColIscan16X16 = new short[] // Index table supplied as the IScan member of the column 16x16 scan order.
+        { // 256 values, laid out 16 per line. NOTE(review): presumably the inverse permutation of ColScan16X16 - confirm.
+            0,  4,  11,  20,  31,  43,  59,  75,  85,  109, 130, 150, 165, 181, 195, 198,
+            1,  6,  14,  23,  34,  47,  64,  81,  95,  114, 135, 153, 171, 188, 201, 212,
+            2,  8,  16,  25,  38,  52,  67,  83,  101, 116, 136, 157, 172, 190, 205, 216,
+            3,  10, 18,  29,  41,  55,  71,  89,  103, 119, 141, 159, 176, 194, 208, 218,
+            5,  12, 21,  32,  45,  58,  74,  93,  104, 123, 144, 164, 179, 196, 210, 223,
+            7,  15, 26,  37,  49,  63,  78,  96,  112, 129, 146, 166, 182, 200, 215, 228,
+            9,  19, 28,  39,  54,  69,  86,  102, 117, 132, 151, 170, 187, 206, 220, 230,
+            13, 24, 35,  46,  60,  73,  91,  108, 122, 137, 154, 174, 189, 207, 224, 235,
+            17, 30, 40,  53,  66,  82,  98,  115, 126, 142, 161, 180, 197, 213, 227, 237,
+            22, 36, 48,  62,  76,  92,  105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
+            27, 44, 56,  70,  84,  99,  113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
+            33, 51, 68,  79,  94,  110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
+            42, 61, 77,  90,  106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
+            50, 72, 87,  100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
+            57, 80, 97,  111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
+            65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
+        };
+
+        private static readonly short[] Vp9RowIscan16X16 = new short[] // Index table supplied as the IScan member of the row 16x16 scan order.
+        { // 256 values; the 15-per-line wrapping does not follow the 16-wide block rows. NOTE(review): presumably the inverse permutation of RowScan16X16 - confirm.
+            0,   1,   2,   4,   6,   9,   12,  17,  22,  29,  36,  43,  54,  64,  76,
+            86,  3,   5,   7,   11,  15,  19,  25,  32,  38,  48,  59,  68,  84,  99,
+            115, 130, 8,   10,  13,  18,  23,  27,  33,  42,  51,  60,  72,  88,  103,
+            119, 142, 167, 14,  16,  20,  26,  31,  37,  44,  53,  61,  73,  85,  100,
+            116, 135, 161, 185, 21,  24,  30,  35,  40,  47,  55,  65,  74,  81,  94,
+            112, 133, 154, 179, 205, 28,  34,  39,  45,  50,  58,  67,  77,  87,  96,
+            106, 121, 146, 169, 196, 212, 41,  46,  49,  56,  63,  70,  79,  90,  98,
+            107, 122, 138, 159, 182, 207, 222, 52,  57,  62,  69,  75,  83,  93,  102,
+            110, 120, 134, 150, 176, 195, 215, 226, 66,  71,  78,  82,  91,  97,  108,
+            113, 127, 136, 148, 168, 188, 202, 221, 232, 80,  89,  92,  101, 105, 114,
+            125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95,  104, 109, 117, 123,
+            128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129,
+            140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137,
+            145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149,
+            156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152,
+            163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253,
+            158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254,
+            255,
+        };
+
+        private static readonly short[] Vp9DefaultIscan16X16 = new short[] // Index table supplied as the IScan member of the default 16x16 scan orders.
+        { // 256 values; the 15-per-line wrapping does not follow the 16-wide block rows. NOTE(review): presumably the inverse permutation of DefaultScan16X16 - confirm.
+            0,   2,   5,   9,   17,  24,  36,  44,  55,  72,  88,  104, 128, 143, 166,
+            179, 1,   4,   8,   13,  20,  30,  40,  54,  66,  79,  96,  113, 141, 154,
+            178, 196, 3,   7,   11,  18,  25,  33,  46,  57,  71,  86,  101, 119, 148,
+            164, 186, 201, 6,   12,  16,  23,  31,  39,  53,  64,  78,  92,  110, 127,
+            153, 169, 193, 208, 10,  14,  19,  28,  37,  47,  58,  67,  84,  98,  114,
+            133, 161, 176, 198, 214, 15,  21,  26,  34,  43,  52,  65,  77,  91,  106,
+            120, 140, 165, 185, 205, 221, 22,  27,  32,  41,  48,  60,  73,  85,  99,
+            116, 130, 151, 175, 190, 211, 225, 29,  35,  42,  49,  59,  69,  81,  95,
+            108, 125, 139, 155, 182, 197, 217, 229, 38,  45,  51,  61,  68,  80,  93,
+            105, 118, 134, 150, 168, 191, 207, 223, 234, 50,  56,  63,  74,  83,  94,
+            109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62,  70,  76,  87,  97,
+            107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75,  82,  90,  102,
+            112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89,  100, 111,
+            123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115,
+            126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121,
+            135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254,
+            137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253,
+            255,
+        };
+
+        private static readonly short[] Vp9DefaultIscan32X32 = new short[] // Index table supplied as the IScan member of every 32x32 scan order declared below.
+        { // 1024 values; the 13-per-line wrapping does not follow the 32-wide block rows. NOTE(review): presumably the inverse permutation of DefaultScan32X32 - confirm.
+            0,    2,    5,    10,   17,   25,   38,   47,   62,   83,   101,  121,  145,
+            170,  193,  204,  210,  219,  229,  233,  245,  257,  275,  299,  342,  356,
+            377,  405,  455,  471,  495,  527,  1,    4,    8,    15,   22,   30,   45,
+            58,   74,   92,   112,  133,  158,  184,  203,  215,  222,  228,  234,  237,
+            256,  274,  298,  317,  355,  376,  404,  426,  470,  494,  526,  551,  3,
+            7,    12,   18,   28,   36,   52,   64,   82,   102,  118,  142,  164,  189,
+            208,  217,  224,  231,  235,  238,  273,  297,  316,  329,  375,  403,  425,
+            440,  493,  525,  550,  567,  6,    11,   16,   23,   31,   43,   60,   73,
+            90,   109,  126,  150,  173,  196,  211,  220,  226,  232,  236,  239,  296,
+            315,  328,  335,  402,  424,  439,  447,  524,  549,  566,  575,  9,    14,
+            19,   29,   37,   50,   65,   78,   95,   116,  134,  157,  179,  201,  214,
+            223,  244,  255,  272,  295,  341,  354,  374,  401,  454,  469,  492,  523,
+            582,  596,  617,  645,  13,   20,   26,   35,   44,   54,   72,   85,   105,
+            123,  140,  163,  182,  205,  216,  225,  254,  271,  294,  314,  353,  373,
+            400,  423,  468,  491,  522,  548,  595,  616,  644,  666,  21,   27,   33,
+            42,   53,   63,   80,   94,   113,  132,  151,  172,  190,  209,  218,  227,
+            270,  293,  313,  327,  372,  399,  422,  438,  490,  521,  547,  565,  615,
+            643,  665,  680,  24,   32,   39,   48,   57,   71,   88,   104,  120,  139,
+            159,  178,  197,  212,  221,  230,  292,  312,  326,  334,  398,  421,  437,
+            446,  520,  546,  564,  574,  642,  664,  679,  687,  34,   40,   46,   56,
+            68,   81,   96,   111,  130,  147,  167,  186,  243,  253,  269,  291,  340,
+            352,  371,  397,  453,  467,  489,  519,  581,  594,  614,  641,  693,  705,
+            723,  747,  41,   49,   55,   67,   77,   91,   107,  124,  138,  161,  177,
+            194,  252,  268,  290,  311,  351,  370,  396,  420,  466,  488,  518,  545,
+            593,  613,  640,  663,  704,  722,  746,  765,  51,   59,   66,   76,   89,
+            99,   119,  131,  149,  168,  181,  200,  267,  289,  310,  325,  369,  395,
+            419,  436,  487,  517,  544,  563,  612,  639,  662,  678,  721,  745,  764,
+            777,  61,   69,   75,   87,   100,  114,  129,  144,  162,  180,  191,  207,
+            288,  309,  324,  333,  394,  418,  435,  445,  516,  543,  562,  573,  638,
+            661,  677,  686,  744,  763,  776,  783,  70,   79,   86,   97,   108,  122,
+            137,  155,  242,  251,  266,  287,  339,  350,  368,  393,  452,  465,  486,
+            515,  580,  592,  611,  637,  692,  703,  720,  743,  788,  798,  813,  833,
+            84,   93,   103,  110,  125,  141,  154,  171,  250,  265,  286,  308,  349,
+            367,  392,  417,  464,  485,  514,  542,  591,  610,  636,  660,  702,  719,
+            742,  762,  797,  812,  832,  848,  98,   106,  115,  127,  143,  156,  169,
+            185,  264,  285,  307,  323,  366,  391,  416,  434,  484,  513,  541,  561,
+            609,  635,  659,  676,  718,  741,  761,  775,  811,  831,  847,  858,  117,
+            128,  136,  148,  160,  175,  188,  198,  284,  306,  322,  332,  390,  415,
+            433,  444,  512,  540,  560,  572,  634,  658,  675,  685,  740,  760,  774,
+            782,  830,  846,  857,  863,  135,  146,  152,  165,  241,  249,  263,  283,
+            338,  348,  365,  389,  451,  463,  483,  511,  579,  590,  608,  633,  691,
+            701,  717,  739,  787,  796,  810,  829,  867,  875,  887,  903,  153,  166,
+            174,  183,  248,  262,  282,  305,  347,  364,  388,  414,  462,  482,  510,
+            539,  589,  607,  632,  657,  700,  716,  738,  759,  795,  809,  828,  845,
+            874,  886,  902,  915,  176,  187,  195,  202,  261,  281,  304,  321,  363,
+            387,  413,  432,  481,  509,  538,  559,  606,  631,  656,  674,  715,  737,
+            758,  773,  808,  827,  844,  856,  885,  901,  914,  923,  192,  199,  206,
+            213,  280,  303,  320,  331,  386,  412,  431,  443,  508,  537,  558,  571,
+            630,  655,  673,  684,  736,  757,  772,  781,  826,  843,  855,  862,  900,
+            913,  922,  927,  240,  247,  260,  279,  337,  346,  362,  385,  450,  461,
+            480,  507,  578,  588,  605,  629,  690,  699,  714,  735,  786,  794,  807,
+            825,  866,  873,  884,  899,  930,  936,  945,  957,  246,  259,  278,  302,
+            345,  361,  384,  411,  460,  479,  506,  536,  587,  604,  628,  654,  698,
+            713,  734,  756,  793,  806,  824,  842,  872,  883,  898,  912,  935,  944,
+            956,  966,  258,  277,  301,  319,  360,  383,  410,  430,  478,  505,  535,
+            557,  603,  627,  653,  672,  712,  733,  755,  771,  805,  823,  841,  854,
+            882,  897,  911,  921,  943,  955,  965,  972,  276,  300,  318,  330,  382,
+            409,  429,  442,  504,  534,  556,  570,  626,  652,  671,  683,  732,  754,
+            770,  780,  822,  840,  853,  861,  896,  910,  920,  926,  954,  964,  971,
+            975,  336,  344,  359,  381,  449,  459,  477,  503,  577,  586,  602,  625,
+            689,  697,  711,  731,  785,  792,  804,  821,  865,  871,  881,  895,  929,
+            934,  942,  953,  977,  981,  987,  995,  343,  358,  380,  408,  458,  476,
+            502,  533,  585,  601,  624,  651,  696,  710,  730,  753,  791,  803,  820,
+            839,  870,  880,  894,  909,  933,  941,  952,  963,  980,  986,  994,  1001,
+            357,  379,  407,  428,  475,  501,  532,  555,  600,  623,  650,  670,  709,
+            729,  752,  769,  802,  819,  838,  852,  879,  893,  908,  919,  940,  951,
+            962,  970,  985,  993,  1000, 1005, 378,  406,  427,  441,  500,  531,  554,
+            569,  622,  649,  669,  682,  728,  751,  768,  779,  818,  837,  851,  860,
+            892,  907,  918,  925,  950,  961,  969,  974,  992,  999,  1004, 1007, 448,
+            457,  474,  499,  576,  584,  599,  621,  688,  695,  708,  727,  784,  790,
+            801,  817,  864,  869,  878,  891,  928,  932,  939,  949,  976,  979,  984,
+            991,  1008, 1010, 1013, 1017, 456,  473,  498,  530,  583,  598,  620,  648,
+            694,  707,  726,  750,  789,  800,  816,  836,  868,  877,  890,  906,  931,
+            938,  948,  960,  978,  983,  990,  998,  1009, 1012, 1016, 1020, 472,  497,
+            529,  553,  597,  619,  647,  668,  706,  725,  749,  767,  799,  815,  835,
+            850,  876,  889,  905,  917,  937,  947,  959,  968,  982,  989,  997,  1003,
+            1011, 1015, 1019, 1022, 496,  528,  552,  568,  618,  646,  667,  681,  724,
+            748,  766,  778,  814,  834,  849,  859,  888,  904,  916,  924,  946,  958,
+            967,  973,  988,  996,  1002, 1006, 1014, 1018, 1021, 1023,
+        };
+
+        /// <summary>
+        /// Groups the three lookup tables that describe one scan order.
+        /// The arrays are stored by reference; no copy is made.
+        /// </summary>
+        public class ScanOrder
+        {
+            /// <summary>Table passed as <c>scan</c> at construction.</summary>
+            public short[] Scan { get; }
+
+            /// <summary>Table passed as <c>iScan</c> at construction.</summary>
+            public short[] IScan { get; }
+
+            /// <summary>Table passed as <c>neighbors</c> at construction.</summary>
+            public short[] Neighbors { get; }
+
+            /// <summary>
+            /// Creates a scan order from its three tables.
+            /// </summary>
+            /// <param name="scan">Value for <see cref="Scan"/></param>
+            /// <param name="iScan">Value for <see cref="IScan"/></param>
+            /// <param name="neighbors">Value for <see cref="Neighbors"/></param>
+            public ScanOrder(short[] scan, short[] iScan, short[] neighbors)
+                => (Scan, IScan, Neighbors) = (scan, iScan, neighbors);
+        }
+
+        /// <summary>
+        /// Scan orders built from the default scan tables, one entry per block size
+        /// in the order 4x4, 8x8, 16x16, 32x32.
+        /// </summary>
+        public static readonly ScanOrder[] Vp9DefaultScanOrders =
+        {
+            new ScanOrder(scan: DefaultScan4X4, iScan: Vp9DefaultIscan4X4, neighbors: DefaultScan4X4Neighbors),
+            new ScanOrder(scan: DefaultScan8X8, iScan: Vp9DefaultIscan8X8, neighbors: DefaultScan8X8Neighbors),
+            new ScanOrder(scan: DefaultScan16X16, iScan: Vp9DefaultIscan16X16, neighbors: DefaultScan16X16Neighbors),
+            new ScanOrder(scan: DefaultScan32X32, iScan: Vp9DefaultIscan32X32, neighbors: DefaultScan32X32Neighbors),
+        };
+
+        /// <summary>
+        /// Scan orders as a jagged table: the outer index selects the transform size
+        /// (TX_4X4, TX_8X8, TX_16X16, TX_32X32) and each inner array holds four entries
+        /// in the order default, row, column, default. The 32x32 size uses the default
+        /// tables for all four entries.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): the inner index is presumably the transform type - confirm against callers.
+        /// </remarks>
+        public static readonly ScanOrder[][] Vp9ScanOrders =
+        {
+            // TX_4X4
+            new[]
+            {
+                new ScanOrder(scan: DefaultScan4X4, iScan: Vp9DefaultIscan4X4, neighbors: DefaultScan4X4Neighbors),
+                new ScanOrder(scan: RowScan4X4, iScan: Vp9RowIscan4X4, neighbors: RowScan4X4Neighbors),
+                new ScanOrder(scan: ColScan4X4, iScan: Vp9ColIscan4X4, neighbors: ColScan4X4Neighbors),
+                new ScanOrder(scan: DefaultScan4X4, iScan: Vp9DefaultIscan4X4, neighbors: DefaultScan4X4Neighbors),
+            },
+            // TX_8X8
+            new[]
+            {
+                new ScanOrder(scan: DefaultScan8X8, iScan: Vp9DefaultIscan8X8, neighbors: DefaultScan8X8Neighbors),
+                new ScanOrder(scan: RowScan8X8, iScan: Vp9RowIscan8X8, neighbors: RowScan8X8Neighbors),
+                new ScanOrder(scan: ColScan8X8, iScan: Vp9ColIscan8X8, neighbors: ColScan8X8Neighbors),
+                new ScanOrder(scan: DefaultScan8X8, iScan: Vp9DefaultIscan8X8, neighbors: DefaultScan8X8Neighbors),
+            },
+            // TX_16X16
+            new[]
+            {
+                new ScanOrder(scan: DefaultScan16X16, iScan: Vp9DefaultIscan16X16, neighbors: DefaultScan16X16Neighbors),
+                new ScanOrder(scan: RowScan16X16, iScan: Vp9RowIscan16X16, neighbors: RowScan16X16Neighbors),
+                new ScanOrder(scan: ColScan16X16, iScan: Vp9ColIscan16X16, neighbors: ColScan16X16Neighbors),
+                new ScanOrder(scan: DefaultScan16X16, iScan: Vp9DefaultIscan16X16, neighbors: DefaultScan16X16Neighbors),
+            },
+            // TX_32X32
+            new[]
+            {
+                new ScanOrder(scan: DefaultScan32X32, iScan: Vp9DefaultIscan32X32, neighbors: DefaultScan32X32Neighbors),
+                new ScanOrder(scan: DefaultScan32X32, iScan: Vp9DefaultIscan32X32, neighbors: DefaultScan32X32Neighbors),
+                new ScanOrder(scan: DefaultScan32X32, iScan: Vp9DefaultIscan32X32, neighbors: DefaultScan32X32Neighbors),
+                new ScanOrder(scan: DefaultScan32X32, iScan: Vp9DefaultIscan32X32, neighbors: DefaultScan32X32Neighbors),
+            },
+        };
+
+        // Entropy MV
+
+        /// <summary>
+        /// Flattened tree over the four <c>MvJointType</c> values: six entries that are
+        /// either a negated enum value or a positive array index.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): presumably follows the libvpx tree layout (non-positive = negated leaf,
+        /// positive = index of the next entry pair) - confirm against the tree reader.
+        /// </remarks>
+        public static readonly sbyte[] Vp9MvJointTree =
+        {
+            -(sbyte)MvJointType.MvJointZero, 2,   // entries [0], [1]
+            -(sbyte)MvJointType.MvJointHnzvz, 4,  // entries [2], [3]
+            -(sbyte)MvJointType.MvJointHzvnz,     // entry [4]
+            -(sbyte)MvJointType.MvJointHnzvnz,    // entry [5]
+        };
+
+        /// <summary>
+        /// Flattened tree over the eleven <c>MvClassType</c> values (MvClass0..MvClass10):
+        /// twenty entries that are either a negated enum value or a positive array index.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): presumably follows the libvpx tree layout (non-positive = negated leaf,
+        /// positive = index of the next entry pair) - confirm against the tree reader.
+        /// </remarks>
+        public static readonly sbyte[] Vp9MvClassTree =
+        {
+            -(sbyte)MvClassType.MvClass0, 2,                             // entries [0], [1]
+            -(sbyte)MvClassType.MvClass1, 4,                             // entries [2], [3]
+            6, 8,                                                        // entries [4], [5]
+            -(sbyte)MvClassType.MvClass2, -(sbyte)MvClassType.MvClass3,  // entries [6], [7]
+            10, 12,                                                      // entries [8], [9]
+            -(sbyte)MvClassType.MvClass4, -(sbyte)MvClassType.MvClass5,  // entries [10], [11]
+            -(sbyte)MvClassType.MvClass6, 14,                            // entries [12], [13]
+            16, 18,                                                      // entries [14], [15]
+            -(sbyte)MvClassType.MvClass7, -(sbyte)MvClassType.MvClass8,  // entries [16], [17]
+            -(sbyte)MvClassType.MvClass9, -(sbyte)MvClassType.MvClass10, // entries [18], [19]
+        };
+
+        /// <summary>
+        /// Flattened six-entry tree over the values 0..3, written as negated values
+        /// and positive array indices.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): presumably follows the libvpx tree layout (non-positive = negated leaf,
+        /// positive = index of the next entry pair) - confirm against the tree reader.
+        /// </remarks>
+        public static readonly sbyte[] Vp9MvFPTree =
+        {
+            -0, 2,  // entries [0], [1]
+            -1, 4,  // entries [2], [3]
+            -2, -3, // entries [4], [5]
+        };
+
+        // Entropy
+
+        // Probability tables for the six coefficient categories. Categories 1 to 5 hold
+        // as many entries as their category number; category 6 holds 14.
+        // NOTE(review): presumably one probability per extra bit, as in libvpx's
+        // vp9_catN_prob tables - confirm against the token reader.
+        public static readonly byte[] Vp9Cat1Prob = { 159 };
+        public static readonly byte[] Vp9Cat2Prob = { 165, 145 };
+        public static readonly byte[] Vp9Cat3Prob = { 173, 148, 140 };
+        public static readonly byte[] Vp9Cat4Prob = { 176, 155, 140, 135 };
+        public static readonly byte[] Vp9Cat5Prob = { 180, 157, 141, 134, 130 };
+        public static readonly byte[] Vp9Cat6Prob =
+        {
+            254, 254, 254, 252, 249, 243, 230,
+            196, 177, 153, 140, 133, 130, 129,
+        };
+
+        /// <summary>
+        /// Category 6 probability table for high bit depth: 18 entries whose last 14
+        /// are identical to <c>Vp9Cat6Prob</c>.
+        /// </summary>
+        /// <remarks>
+        /// Entry 6 was previously 54, which broke both the non-increasing order of the table
+        /// and its agreement with <c>Vp9Cat6Prob</c>; it is 254.
+        /// NOTE(review): presumably 12-bit content reads the table from the start while lower
+        /// bit depths skip leading entries - confirm against the token reader.
+        /// </remarks>
+        public static readonly byte[] Vp9Cat6ProbHigh12 = new byte[]
+        {
+            255, 255, 255, 255, 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+        };
+
+        /// <summary>
+        /// 1024-entry band table handed out by <see cref="get_band_translate"/> for every
+        /// transform size other than 4x4.
+        /// </summary>
+        private static readonly byte[] Vp9CoefbandTrans8X8Plus = BuildCoefbandTrans8X8Plus();
+
+        /// <summary>
+        /// Creates the 1024-entry band table: an explicit 22-entry prefix followed by 5 in every
+        /// remaining slot.
+        /// </summary>
+        /// <returns>A newly allocated table of 1024 bytes</returns>
+        private static byte[] BuildCoefbandTrans8X8Plus()
+        {
+            byte[] prefix = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 };
+            byte[] table = new byte[1024];
+
+            prefix.CopyTo(table, 0);
+
+            // Beyond MAXBAND_INDEX+1 all values are filled as 5
+            for (int index = prefix.Length; index < table.Length; index++)
+            {
+                table[index] = 5;
+            }
+
+            return table;
+        }
+
+        /// <summary>
+        /// 16-entry band table handed out by <see cref="get_band_translate"/> for the 4x4
+        /// transform size.
+        /// </summary>
+        private static readonly byte[] Vp9CoefbandTrans4X4 =
+        {
+            0, 1, 1, 2,
+            2, 2, 3, 3,
+            3, 3, 4, 4,
+            4, 5, 5, 5
+        };
+
+        /// <summary>
+        /// Picks the band translation table that matches a transform size.
+        /// </summary>
+        /// <param name="txSize">Transform size to look up</param>
+        /// <returns>
+        /// The 16-entry 4x4 table when <paramref name="txSize"/> is <see cref="TxSize.Tx4x4"/>,
+        /// otherwise the table shared by all other transform sizes. The shared array instance
+        /// itself is returned, not a copy.
+        /// </returns>
+        public static byte[] get_band_translate(TxSize txSize)
+        {
+            if (txSize == TxSize.Tx4x4)
+            {
+                return Vp9CoefbandTrans4X4;
+            }
+
+            return Vp9CoefbandTrans8X8Plus;
+        }
+
+        /// <summary>
+        /// Lookup table of 255 rows, each holding 8 byte values.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): going by the name, each row presumably holds the Pareto-modelled tail
+        /// probabilities of the coefficient token tree, with the row selected by a model
+        /// probability (row index = probability - 1); confirm against the callers.
+        /// </remarks>
+        public static readonly byte[][] Vp9Pareto8Full = new byte[][]
+        {
+            new byte[] { 3, 86, 128, 6, 86, 23, 88, 29 },
+            new byte[] { 6, 86, 128, 11, 87, 42, 91, 52 },
+            new byte[] { 9, 86, 129, 17, 88, 61, 94, 76 },
+            new byte[] { 12, 86, 129, 22, 88, 77, 97, 93 },
+            new byte[] { 15, 87, 129, 28, 89, 93, 100, 110 },
+            new byte[] { 17, 87, 129, 33, 90, 105, 103, 123 },
+            new byte[] { 20, 88, 130, 38, 91, 118, 106, 136 },
+            new byte[] { 23, 88, 130, 43, 91, 128, 108, 146 },
+            new byte[] { 26, 89, 131, 48, 92, 139, 111, 156 },
+            new byte[] { 28, 89, 131, 53, 93, 147, 114, 163 },
+            new byte[] { 31, 90, 131, 58, 94, 156, 117, 171 },
+            new byte[] { 34, 90, 131, 62, 94, 163, 119, 177 },
+            new byte[] { 37, 90, 132, 66, 95, 171, 122, 184 },
+            new byte[] { 39, 90, 132, 70, 96, 177, 124, 189 },
+            new byte[] { 42, 91, 132, 75, 97, 183, 127, 194 },
+            new byte[] { 44, 91, 132, 79, 97, 188, 129, 198 },
+            new byte[] { 47, 92, 133, 83, 98, 193, 132, 202 },
+            new byte[] { 49, 92, 133, 86, 99, 197, 134, 205 },
+            new byte[] { 52, 93, 133, 90, 100, 201, 137, 208 },
+            new byte[] { 54, 93, 133, 94, 100, 204, 139, 211 },
+            new byte[] { 57, 94, 134, 98, 101, 208, 142, 214 },
+            new byte[] { 59, 94, 134, 101, 102, 211, 144, 216 },
+            new byte[] { 62, 94, 135, 105, 103, 214, 146, 218 },
+            new byte[] { 64, 94, 135, 108, 103, 216, 148, 220 },
+            new byte[] { 66, 95, 135, 111, 104, 219, 151, 222 },
+            new byte[] { 68, 95, 135, 114, 105, 221, 153, 223 },
+            new byte[] { 71, 96, 136, 117, 106, 224, 155, 225 },
+            new byte[] { 73, 96, 136, 120, 106, 225, 157, 226 },
+            new byte[] { 76, 97, 136, 123, 107, 227, 159, 228 },
+            new byte[] { 78, 97, 136, 126, 108, 229, 160, 229 },
+            new byte[] { 80, 98, 137, 129, 109, 231, 162, 231 },
+            new byte[] { 82, 98, 137, 131, 109, 232, 164, 232 },
+            new byte[] { 84, 98, 138, 134, 110, 234, 166, 233 },
+            new byte[] { 86, 98, 138, 137, 111, 235, 168, 234 },
+            new byte[] { 89, 99, 138, 140, 112, 236, 170, 235 },
+            new byte[] { 91, 99, 138, 142, 112, 237, 171, 235 },
+            new byte[] { 93, 100, 139, 145, 113, 238, 173, 236 },
+            new byte[] { 95, 100, 139, 147, 114, 239, 174, 237 },
+            new byte[] { 97, 101, 140, 149, 115, 240, 176, 238 },
+            new byte[] { 99, 101, 140, 151, 115, 241, 177, 238 },
+            new byte[] { 101, 102, 140, 154, 116, 242, 179, 239 },
+            new byte[] { 103, 102, 140, 156, 117, 242, 180, 239 },
+            new byte[] { 105, 103, 141, 158, 118, 243, 182, 240 },
+            new byte[] { 107, 103, 141, 160, 118, 243, 183, 240 },
+            new byte[] { 109, 104, 141, 162, 119, 244, 185, 241 },
+            new byte[] { 111, 104, 141, 164, 119, 244, 186, 241 },
+            new byte[] { 113, 104, 142, 166, 120, 245, 187, 242 },
+            new byte[] { 114, 104, 142, 168, 121, 245, 188, 242 },
+            new byte[] { 116, 105, 143, 170, 122, 246, 190, 243 },
+            new byte[] { 118, 105, 143, 171, 122, 246, 191, 243 },
+            new byte[] { 120, 106, 143, 173, 123, 247, 192, 244 },
+            new byte[] { 121, 106, 143, 175, 124, 247, 193, 244 },
+            new byte[] { 123, 107, 144, 177, 125, 248, 195, 244 },
+            new byte[] { 125, 107, 144, 178, 125, 248, 196, 244 },
+            new byte[] { 127, 108, 145, 180, 126, 249, 197, 245 },
+            new byte[] { 128, 108, 145, 181, 127, 249, 198, 245 },
+            new byte[] { 130, 109, 145, 183, 128, 249, 199, 245 },
+            new byte[] { 132, 109, 145, 184, 128, 249, 200, 245 },
+            new byte[] { 134, 110, 146, 186, 129, 250, 201, 246 },
+            new byte[] { 135, 110, 146, 187, 130, 250, 202, 246 },
+            new byte[] { 137, 111, 147, 189, 131, 251, 203, 246 },
+            new byte[] { 138, 111, 147, 190, 131, 251, 204, 246 },
+            new byte[] { 140, 112, 147, 192, 132, 251, 205, 247 },
+            new byte[] { 141, 112, 147, 193, 132, 251, 206, 247 },
+            new byte[] { 143, 113, 148, 194, 133, 251, 207, 247 },
+            new byte[] { 144, 113, 148, 195, 134, 251, 207, 247 },
+            new byte[] { 146, 114, 149, 197, 135, 252, 208, 248 },
+            new byte[] { 147, 114, 149, 198, 135, 252, 209, 248 },
+            new byte[] { 149, 115, 149, 199, 136, 252, 210, 248 },
+            new byte[] { 150, 115, 149, 200, 137, 252, 210, 248 },
+            new byte[] { 152, 115, 150, 201, 138, 252, 211, 248 },
+            new byte[] { 153, 115, 150, 202, 138, 252, 212, 248 },
+            new byte[] { 155, 116, 151, 204, 139, 253, 213, 249 },
+            new byte[] { 156, 116, 151, 205, 139, 253, 213, 249 },
+            new byte[] { 158, 117, 151, 206, 140, 253, 214, 249 },
+            new byte[] { 159, 117, 151, 207, 141, 253, 215, 249 },
+            new byte[] { 161, 118, 152, 208, 142, 253, 216, 249 },
+            new byte[] { 162, 118, 152, 209, 142, 253, 216, 249 },
+            new byte[] { 163, 119, 153, 210, 143, 253, 217, 249 },
+            new byte[] { 164, 119, 153, 211, 143, 253, 217, 249 },
+            new byte[] { 166, 120, 153, 212, 144, 254, 218, 250 },
+            new byte[] { 167, 120, 153, 212, 145, 254, 219, 250 },
+            new byte[] { 168, 121, 154, 213, 146, 254, 220, 250 },
+            new byte[] { 169, 121, 154, 214, 146, 254, 220, 250 },
+            new byte[] { 171, 122, 155, 215, 147, 254, 221, 250 },
+            new byte[] { 172, 122, 155, 216, 147, 254, 221, 250 },
+            new byte[] { 173, 123, 155, 217, 148, 254, 222, 250 },
+            new byte[] { 174, 123, 155, 217, 149, 254, 222, 250 },
+            new byte[] { 176, 124, 156, 218, 150, 254, 223, 250 },
+            new byte[] { 177, 124, 156, 219, 150, 254, 223, 250 },
+            new byte[] { 178, 125, 157, 220, 151, 254, 224, 251 },
+            new byte[] { 179, 125, 157, 220, 151, 254, 224, 251 },
+            new byte[] { 180, 126, 157, 221, 152, 254, 225, 251 },
+            new byte[] { 181, 126, 157, 221, 152, 254, 225, 251 },
+            new byte[] { 183, 127, 158, 222, 153, 254, 226, 251 },
+            new byte[] { 184, 127, 158, 223, 154, 254, 226, 251 },
+            new byte[] { 185, 128, 159, 224, 155, 255, 227, 251 },
+            new byte[] { 186, 128, 159, 224, 155, 255, 227, 251 },
+            new byte[] { 187, 129, 160, 225, 156, 255, 228, 251 },
+            new byte[] { 188, 130, 160, 225, 156, 255, 228, 251 },
+            new byte[] { 189, 131, 160, 226, 157, 255, 228, 251 },
+            new byte[] { 190, 131, 160, 226, 158, 255, 228, 251 },
+            new byte[] { 191, 132, 161, 227, 159, 255, 229, 251 },
+            new byte[] { 192, 132, 161, 227, 159, 255, 229, 251 },
+            new byte[] { 193, 133, 162, 228, 160, 255, 230, 252 },
+            new byte[] { 194, 133, 162, 229, 160, 255, 230, 252 },
+            new byte[] { 195, 134, 163, 230, 161, 255, 231, 252 },
+            new byte[] { 196, 134, 163, 230, 161, 255, 231, 252 },
+            new byte[] { 197, 135, 163, 231, 162, 255, 231, 252 },
+            new byte[] { 198, 135, 163, 231, 162, 255, 231, 252 },
+            new byte[] { 199, 136, 164, 232, 163, 255, 232, 252 },
+            new byte[] { 200, 136, 164, 232, 164, 255, 232, 252 },
+            new byte[] { 201, 137, 165, 233, 165, 255, 233, 252 },
+            new byte[] { 201, 137, 165, 233, 165, 255, 233, 252 },
+            new byte[] { 202, 138, 166, 233, 166, 255, 233, 252 },
+            new byte[] { 203, 138, 166, 233, 166, 255, 233, 252 },
+            new byte[] { 204, 139, 166, 234, 167, 255, 234, 252 },
+            new byte[] { 205, 139, 166, 234, 167, 255, 234, 252 },
+            new byte[] { 206, 140, 167, 235, 168, 255, 235, 252 },
+            new byte[] { 206, 140, 167, 235, 168, 255, 235, 252 },
+            new byte[] { 207, 141, 168, 236, 169, 255, 235, 252 },
+            new byte[] { 208, 141, 168, 236, 170, 255, 235, 252 },
+            new byte[] { 209, 142, 169, 237, 171, 255, 236, 252 },
+            new byte[] { 209, 143, 169, 237, 171, 255, 236, 252 },
+            new byte[] { 210, 144, 169, 237, 172, 255, 236, 252 },
+            new byte[] { 211, 144, 169, 237, 172, 255, 236, 252 },
+            new byte[] { 212, 145, 170, 238, 173, 255, 237, 252 },
+            new byte[] { 213, 145, 170, 238, 173, 255, 237, 252 },
+            new byte[] { 214, 146, 171, 239, 174, 255, 237, 253 },
+            new byte[] { 214, 146, 171, 239, 174, 255, 237, 253 },
+            new byte[] { 215, 147, 172, 240, 175, 255, 238, 253 },
+            new byte[] { 215, 147, 172, 240, 175, 255, 238, 253 },
+            new byte[] { 216, 148, 173, 240, 176, 255, 238, 253 },
+            new byte[] { 217, 148, 173, 240, 176, 255, 238, 253 },
+            new byte[] { 218, 149, 173, 241, 177, 255, 239, 253 },
+            new byte[] { 218, 149, 173, 241, 178, 255, 239, 253 },
+            new byte[] { 219, 150, 174, 241, 179, 255, 239, 253 },
+            new byte[] { 219, 151, 174, 241, 179, 255, 239, 253 },
+            new byte[] { 220, 152, 175, 242, 180, 255, 240, 253 },
+            new byte[] { 221, 152, 175, 242, 180, 255, 240, 253 },
+            new byte[] { 222, 153, 176, 242, 181, 255, 240, 253 },
+            new byte[] { 222, 153, 176, 242, 181, 255, 240, 253 },
+            new byte[] { 223, 154, 177, 243, 182, 255, 240, 253 },
+            new byte[] { 223, 154, 177, 243, 182, 255, 240, 253 },
+            new byte[] { 224, 155, 178, 244, 183, 255, 241, 253 },
+            new byte[] { 224, 155, 178, 244, 183, 255, 241, 253 },
+            new byte[] { 225, 156, 178, 244, 184, 255, 241, 253 },
+            new byte[] { 225, 157, 178, 244, 184, 255, 241, 253 },
+            new byte[] { 226, 158, 179, 244, 185, 255, 242, 253 },
+            new byte[] { 227, 158, 179, 244, 185, 255, 242, 253 },
+            new byte[] { 228, 159, 180, 245, 186, 255, 242, 253 },
+            new byte[] { 228, 159, 180, 245, 186, 255, 242, 253 },
+            new byte[] { 229, 160, 181, 245, 187, 255, 242, 253 },
+            new byte[] { 229, 160, 181, 245, 187, 255, 242, 253 },
+            new byte[] { 230, 161, 182, 246, 188, 255, 243, 253 },
+            new byte[] { 230, 162, 182, 246, 188, 255, 243, 253 },
+            new byte[] { 231, 163, 183, 246, 189, 255, 243, 253 },
+            new byte[] { 231, 163, 183, 246, 189, 255, 243, 253 },
+            new byte[] { 232, 164, 184, 247, 190, 255, 243, 253 },
+            new byte[] { 232, 164, 184, 247, 190, 255, 243, 253 },
+            new byte[] { 233, 165, 185, 247, 191, 255, 244, 253 },
+            new byte[] { 233, 165, 185, 247, 191, 255, 244, 253 },
+            new byte[] { 234, 166, 185, 247, 192, 255, 244, 253 },
+            new byte[] { 234, 167, 185, 247, 192, 255, 244, 253 },
+            new byte[] { 235, 168, 186, 248, 193, 255, 244, 253 },
+            new byte[] { 235, 168, 186, 248, 193, 255, 244, 253 },
+            new byte[] { 236, 169, 187, 248, 194, 255, 244, 253 },
+            new byte[] { 236, 169, 187, 248, 194, 255, 244, 253 },
+            new byte[] { 236, 170, 188, 248, 195, 255, 245, 253 },
+            new byte[] { 236, 170, 188, 248, 195, 255, 245, 253 },
+            new byte[] { 237, 171, 189, 249, 196, 255, 245, 254 },
+            new byte[] { 237, 172, 189, 249, 196, 255, 245, 254 },
+            new byte[] { 238, 173, 190, 249, 197, 255, 245, 254 },
+            new byte[] { 238, 173, 190, 249, 197, 255, 245, 254 },
+            new byte[] { 239, 174, 191, 249, 198, 255, 245, 254 },
+            new byte[] { 239, 174, 191, 249, 198, 255, 245, 254 },
+            new byte[] { 240, 175, 192, 249, 199, 255, 246, 254 },
+            new byte[] { 240, 176, 192, 249, 199, 255, 246, 254 },
+            new byte[] { 240, 177, 193, 250, 200, 255, 246, 254 },
+            new byte[] { 240, 177, 193, 250, 200, 255, 246, 254 },
+            new byte[] { 241, 178, 194, 250, 201, 255, 246, 254 },
+            new byte[] { 241, 178, 194, 250, 201, 255, 246, 254 },
+            new byte[] { 242, 179, 195, 250, 202, 255, 246, 254 },
+            new byte[] { 242, 180, 195, 250, 202, 255, 246, 254 },
+            new byte[] { 242, 181, 196, 250, 203, 255, 247, 254 },
+            new byte[] { 242, 181, 196, 250, 203, 255, 247, 254 },
+            new byte[] { 243, 182, 197, 251, 204, 255, 247, 254 },
+            new byte[] { 243, 183, 197, 251, 204, 255, 247, 254 },
+            new byte[] { 244, 184, 198, 251, 205, 255, 247, 254 },
+            new byte[] { 244, 184, 198, 251, 205, 255, 247, 254 },
+            new byte[] { 244, 185, 199, 251, 206, 255, 247, 254 },
+            new byte[] { 244, 185, 199, 251, 206, 255, 247, 254 },
+            new byte[] { 245, 186, 200, 251, 207, 255, 247, 254 },
+            new byte[] { 245, 187, 200, 251, 207, 255, 247, 254 },
+            new byte[] { 246, 188, 201, 252, 207, 255, 248, 254 },
+            new byte[] { 246, 188, 201, 252, 207, 255, 248, 254 },
+            new byte[] { 246, 189, 202, 252, 208, 255, 248, 254 },
+            new byte[] { 246, 190, 202, 252, 208, 255, 248, 254 },
+            new byte[] { 247, 191, 203, 252, 209, 255, 248, 254 },
+            new byte[] { 247, 191, 203, 252, 209, 255, 248, 254 },
+            new byte[] { 247, 192, 204, 252, 210, 255, 248, 254 },
+            new byte[] { 247, 193, 204, 252, 210, 255, 248, 254 },
+            new byte[] { 248, 194, 205, 252, 211, 255, 248, 254 },
+            new byte[] { 248, 194, 205, 252, 211, 255, 248, 254 },
+            new byte[] { 248, 195, 206, 252, 212, 255, 249, 254 },
+            new byte[] { 248, 196, 206, 252, 212, 255, 249, 254 },
+            new byte[] { 249, 197, 207, 253, 213, 255, 249, 254 },
+            new byte[] { 249, 197, 207, 253, 213, 255, 249, 254 },
+            new byte[] { 249, 198, 208, 253, 214, 255, 249, 254 },
+            new byte[] { 249, 199, 209, 253, 214, 255, 249, 254 },
+            new byte[] { 250, 200, 210, 253, 215, 255, 249, 254 },
+            new byte[] { 250, 200, 210, 253, 215, 255, 249, 254 },
+            new byte[] { 250, 201, 211, 253, 215, 255, 249, 254 },
+            new byte[] { 250, 202, 211, 253, 215, 255, 249, 254 },
+            new byte[] { 250, 203, 212, 253, 216, 255, 249, 254 },
+            new byte[] { 250, 203, 212, 253, 216, 255, 249, 254 },
+            new byte[] { 251, 204, 213, 253, 217, 255, 250, 254 },
+            new byte[] { 251, 205, 213, 253, 217, 255, 250, 254 },
+            new byte[] { 251, 206, 214, 254, 218, 255, 250, 254 },
+            new byte[] { 251, 206, 215, 254, 218, 255, 250, 254 },
+            new byte[] { 252, 207, 216, 254, 219, 255, 250, 254 },
+            new byte[] { 252, 208, 216, 254, 219, 255, 250, 254 },
+            new byte[] { 252, 209, 217, 254, 220, 255, 250, 254 },
+            new byte[] { 252, 210, 217, 254, 220, 255, 250, 254 },
+            new byte[] { 252, 211, 218, 254, 221, 255, 250, 254 },
+            new byte[] { 252, 212, 218, 254, 221, 255, 250, 254 },
+            new byte[] { 253, 213, 219, 254, 222, 255, 250, 254 },
+            new byte[] { 253, 213, 220, 254, 222, 255, 250, 254 },
+            new byte[] { 253, 214, 221, 254, 223, 255, 250, 254 },
+            new byte[] { 253, 215, 221, 254, 223, 255, 250, 254 },
+            new byte[] { 253, 216, 222, 254, 224, 255, 251, 254 },
+            new byte[] { 253, 217, 223, 254, 224, 255, 251, 254 },
+            new byte[] { 253, 218, 224, 254, 225, 255, 251, 254 },
+            new byte[] { 253, 219, 224, 254, 225, 255, 251, 254 },
+            new byte[] { 254, 220, 225, 254, 225, 255, 251, 254 },
+            new byte[] { 254, 221, 226, 254, 225, 255, 251, 254 },
+            new byte[] { 254, 222, 227, 255, 226, 255, 251, 254 },
+            new byte[] { 254, 223, 227, 255, 226, 255, 251, 254 },
+            new byte[] { 254, 224, 228, 255, 227, 255, 251, 254 },
+            new byte[] { 254, 225, 229, 255, 227, 255, 251, 254 },
+            new byte[] { 254, 226, 230, 255, 228, 255, 251, 254 },
+            new byte[] { 254, 227, 230, 255, 229, 255, 251, 254 },
+            new byte[] { 255, 228, 231, 255, 230, 255, 251, 254 },
+            new byte[] { 255, 229, 232, 255, 230, 255, 251, 254 },
+            new byte[] { 255, 230, 233, 255, 231, 255, 252, 254 },
+            new byte[] { 255, 231, 234, 255, 231, 255, 252, 254 },
+            new byte[] { 255, 232, 235, 255, 232, 255, 252, 254 },
+            new byte[] { 255, 233, 236, 255, 232, 255, 252, 254 },
+            new byte[] { 255, 235, 237, 255, 233, 255, 252, 254 },
+            new byte[] { 255, 236, 238, 255, 234, 255, 252, 254 },
+            new byte[] { 255, 238, 240, 255, 235, 255, 252, 255 },
+            new byte[] { 255, 239, 241, 255, 235, 255, 252, 254 },
+            new byte[] { 255, 241, 243, 255, 236, 255, 252, 254 },
+            new byte[] { 255, 243, 245, 255, 237, 255, 252, 254 },
+            new byte[] { 255, 246, 247, 255, 239, 255, 253, 255 },
+        };
+
+        /// <summary>
+        /// Intra prediction mode tree, stored as nine pairs of entries (one pair per node).
+        /// </summary>
+        /// <remarks>
+        /// Array indices are identical to previously-existing INTRAMODECONTEXTNODES.
+        /// NOTE(review): presumably entries that are zero or negative are negated
+        /// <see cref="PredictionMode"/> leaves and positive entries are the array index of the next
+        /// pair; confirm against the tree reader.
+        /// </remarks>
+        public static readonly sbyte[] Vp9IntraModeTree =
+        {
+            // 0 = DC_NODE
+            -(sbyte)PredictionMode.DcPred, 2,
+            // 1 = TM_NODE
+            -(sbyte)PredictionMode.TmPred, 4,
+            // 2 = V_NODE
+            -(sbyte)PredictionMode.VPred, 6,
+            // 3 = COM_NODE
+            8, 12,
+            // 4 = H_NODE
+            -(sbyte)PredictionMode.HPred, 10,
+            // 5 = D135_NODE
+            -(sbyte)PredictionMode.D135Pred, -(sbyte)PredictionMode.D117Pred,
+            // 6 = D45_NODE
+            -(sbyte)PredictionMode.D45Pred, 14,
+            // 7 = D63_NODE
+            -(sbyte)PredictionMode.D63Pred, 16,
+            // 8 = D153_NODE
+            -(sbyte)PredictionMode.D153Pred, -(sbyte)PredictionMode.D207Pred
+        };
+
+        /// <summary>
+        /// Inter prediction mode tree, stored as three pairs of entries.
+        /// </summary>
+        /// <remarks>
+        /// Leaf entries hold the negated offset of an inter mode from
+        /// <see cref="PredictionMode.NearestMv"/>; the literals 2 and 4 are the non-leaf entries.
+        /// </remarks>
+        public static readonly sbyte[] Vp9InterModeTree =
+        {
+            -((sbyte)PredictionMode.ZeroMv - (sbyte)PredictionMode.NearestMv),
+            2,
+            -((sbyte)PredictionMode.NearestMv - (sbyte)PredictionMode.NearestMv),
+            4,
+            -((sbyte)PredictionMode.NearMv - (sbyte)PredictionMode.NearestMv),
+            -((sbyte)PredictionMode.NewMv - (sbyte)PredictionMode.NearestMv)
+        };
+
+        /// <summary>
+        /// Partition type tree, stored as three pairs of entries. Leaf entries are negated
+        /// <see cref="PartitionType"/> values; the literals 2 and 4 are the non-leaf entries.
+        /// </summary>
+        public static readonly sbyte[] Vp9PartitionTree =
+        {
+            -(sbyte)PartitionType.PartitionNone, 2,
+            -(sbyte)PartitionType.PartitionHorz, 4,
+            -(sbyte)PartitionType.PartitionVert, -(sbyte)PartitionType.PartitionSplit
+        };
+
+        /// <summary>
+        /// Switchable interpolation filter tree, stored as two pairs of entries. Leaf entries are
+        /// the negated filter constants from <see cref="Constants"/>; the literal 2 is the only
+        /// non-leaf entry.
+        /// </summary>
+        public static readonly sbyte[] Vp9SwitchableInterpTree =
+        {
+            -Constants.EightTap, 2,
+            -Constants.EightTapSmooth, -Constants.EightTapSharp
+        };
+
+        /// <summary>
+        /// Segment tree, stored as seven pairs of entries.
+        /// NOTE(review): presumably the entries 0 to -7 are the negated segment ids 0 to 7 and the
+        /// positive entries index the next pair; confirm against the tree reader.
+        /// </summary>
+        public static readonly sbyte[] Vp9SegmentTree =
+        {
+            2, 4,
+            6, 8,
+            10, 12,
+            0, -1,
+            -2, -3,
+            -4, -5,
+            -6, -7
+        };
+
+        // MV Ref
+
+        /// <summary>
+        /// Per-prediction-mode weight used to derive a context for the reference blocks.
+        /// </summary>
+        /// <remarks>
+        /// A 3x3x3 array of counts (each 0, 1 or 2) for three choices is flattened into one number
+        /// by adding 9 for each intra block, 3 for each zero motion vector and 1 for each new
+        /// motion vector. That number is then turned into a context with a single lookup
+        /// in <see cref="CounterToContext"/>.
+        /// </remarks>
+        public static readonly int[] Mode2Counter =
+        {
+            // Intra modes
+            9, // DC_PRED
+            9, // V_PRED
+            9, // H_PRED
+            9, // D45_PRED
+            9, // D135_PRED
+            9, // D117_PRED
+            9, // D153_PRED
+            9, // D207_PRED
+            9, // D63_PRED
+            9, // TM_PRED
+
+            // Inter modes
+            0, // NEARESTMV
+            0, // NEARMV
+            3, // ZEROMV
+            1  // NEWMV
+        };
+
+        /// <summary>
+        /// Maps a flattened counter (0 to 18, built by summing <see cref="Mode2Counter"/> weights)
+        /// to a motion vector context.
+        /// </summary>
+        /// <remarks>
+        /// There are 3^3 different combinations of 3 counts that can each be 0, 1 or 2, but the
+        /// actual count can never exceed 2, so 18 is the highest counter needed. Counter values
+        /// that cannot occur map to <see cref="MotionVectorContext.InvalidCase"/>.
+        /// </remarks>
+        public static readonly MotionVectorContext[] CounterToContext =
+        {
+            /*  0 */ MotionVectorContext.BothPredicted,
+            /*  1 */ MotionVectorContext.NewPlusNonIntra,
+            /*  2 */ MotionVectorContext.BothNew,
+            /*  3 */ MotionVectorContext.ZeroPlusPredicted,
+            /*  4 */ MotionVectorContext.NewPlusNonIntra,
+            /*  5 */ MotionVectorContext.InvalidCase,
+            /*  6 */ MotionVectorContext.BothZero,
+            /*  7 */ MotionVectorContext.InvalidCase,
+            /*  8 */ MotionVectorContext.InvalidCase,
+            /*  9 */ MotionVectorContext.IntraPlusNonIntra,
+            /* 10 */ MotionVectorContext.IntraPlusNonIntra,
+            /* 11 */ MotionVectorContext.InvalidCase,
+            /* 12 */ MotionVectorContext.IntraPlusNonIntra,
+            /* 13 */ MotionVectorContext.InvalidCase,
+            /* 14 */ MotionVectorContext.InvalidCase,
+            /* 15 */ MotionVectorContext.InvalidCase,
+            /* 16 */ MotionVectorContext.InvalidCase,
+            /* 17 */ MotionVectorContext.InvalidCase,
+            /* 18 */ MotionVectorContext.BothIntra
+        };
+
+        /// <summary>
+        /// Candidate neighbour positions for motion vector reference search: one row of eight
+        /// <see cref="Position"/> offsets per block size, ordered from 4X4 up to 64X64.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): the meaning of the two constructor arguments (row/column order and units)
+        /// is defined by <see cref="Position"/>, which is not visible here; confirm before editing.
+        /// </remarks>
+        public static readonly Position[][] MvRefBlocks =
+        {
+            // 4X4
+            new Position[]
+            {
+                new Position(-1, 0), new Position(0, -1), new Position(-1, -1), new Position(-2, 0),
+                new Position(0, -2), new Position(-2, -1), new Position(-1, -2), new Position(-2, -2)
+            },
+            // 4X8
+            new Position[]
+            {
+                new Position(-1, 0), new Position(0, -1), new Position(-1, -1), new Position(-2, 0),
+                new Position(0, -2), new Position(-2, -1), new Position(-1, -2), new Position(-2, -2)
+            },
+            // 8X4
+            new Position[]
+            {
+                new Position(-1, 0), new Position(0, -1), new Position(-1, -1), new Position(-2, 0),
+                new Position(0, -2), new Position(-2, -1), new Position(-1, -2), new Position(-2, -2)
+            },
+            // 8X8
+            new Position[]
+            {
+                new Position(-1, 0), new Position(0, -1), new Position(-1, -1), new Position(-2, 0),
+                new Position(0, -2), new Position(-2, -1), new Position(-1, -2), new Position(-2, -2)
+            },
+            // 8X16
+            new Position[]
+            {
+                new Position(0, -1), new Position(-1, 0), new Position(1, -1), new Position(-1, -1),
+                new Position(0, -2), new Position(-2, 0), new Position(-2, -1), new Position(-1, -2)
+            },
+            // 16X8
+            new Position[]
+            {
+                new Position(-1, 0), new Position(0, -1), new Position(-1, 1), new Position(-1, -1),
+                new Position(-2, 0), new Position(0, -2), new Position(-1, -2), new Position(-2, -1)
+            },
+            // 16X16
+            new Position[]
+            {
+                new Position(-1, 0), new Position(0, -1), new Position(-1, 1), new Position(1, -1),
+                new Position(-1, -1), new Position(-3, 0), new Position(0, -3), new Position(-3, -3)
+            },
+            // 16X32
+            new Position[]
+            {
+                new Position(0, -1), new Position(-1, 0), new Position(2, -1), new Position(-1, -1),
+                new Position(-1, 1), new Position(0, -3), new Position(-3, 0), new Position(-3, -3)
+            },
+            // 32X16
+            new Position[]
+            {
+                new Position(-1, 0), new Position(0, -1), new Position(-1, 2), new Position(-1, -1),
+                new Position(1, -1), new Position(-3, 0), new Position(0, -3), new Position(-3, -3)
+            },
+            // 32X32
+            new Position[]
+            {
+                new Position(-1, 1), new Position(1, -1), new Position(-1, 2), new Position(2, -1),
+                new Position(-1, -1), new Position(-3, 0), new Position(0, -3), new Position(-3, -3)
+            },
+            // 32X64
+            new Position[]
+            {
+                new Position(0, -1), new Position(-1, 0), new Position(4, -1), new Position(-1, 2),
+                new Position(-1, -1), new Position(0, -3), new Position(-3, 0), new Position(2, -1)
+            },
+            // 64X32
+            new Position[]
+            {
+                new Position(-1, 0), new Position(0, -1), new Position(-1, 4), new Position(2, -1),
+                new Position(-1, -1), new Position(-3, 0), new Position(0, -3), new Position(-1, 2)
+            },
+            // 64X64
+            new Position[]
+            {
+                new Position(-1, 3), new Position(3, -1), new Position(-1, 4), new Position(4, -1),
+                new Position(-1, -1), new Position(-1, 0), new Position(0, -1), new Position(-1, 6)
+            }
+        };
+    }
+}

+ 389 - 0
Ryujinx.Graphics.Nvdec.Vp9/PredCommon.cs

@@ -0,0 +1,389 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class PredCommon
+    {
+        /// <summary>
+        /// Derives a context index from the above and left neighbours of the current block,
+        /// based on whether each neighbour has a second reference and whether its first
+        /// reference equals <c>cm.CompFixedRef</c>.
+        /// </summary>
+        /// <param name="cm">Frame-level state; only <c>CompFixedRef</c> is read here</param>
+        /// <param name="xd">Block-level state; only <c>AboveMi</c> and <c>LeftMi</c> are read here</param>
+        /// <returns>A value between 0 and 4; asserted to be below <c>Constants.CompInterContexts</c></returns>
+        public static int GetReferenceModeContext(ref Vp9Common cm, ref MacroBlockD xd)
+        {
+            // Note:
+            // The mode info data structure has a one element border above and to the
+            // left of the entries corresponding to real macroblocks.
+            // The prediction flags in these dummy entries are initialized to 0.
+            bool hasAbove = !xd.AboveMi.IsNull;
+            bool hasLeft = !xd.LeftMi.IsNull;
+            int ctx;
+
+            if (hasAbove && hasLeft)
+            {
+                ref ModeInfo above = ref xd.AboveMi.Value;
+                ref ModeInfo left = ref xd.LeftMi.Value;
+                bool aboveSingle = !above.HasSecondRef();
+                bool leftSingle = !left.HasSecondRef();
+
+                if (aboveSingle && leftSingle)
+                {
+                    // Neither neighbour uses a second reference: 1 when exactly one of them
+                    // references the fixed frame, 0 otherwise.
+                    bool aboveFixed = above.RefFrame[0] == cm.CompFixedRef;
+                    bool leftFixed = left.RefFrame[0] == cm.CompFixedRef;
+                    ctx = aboveFixed != leftFixed ? 1 : 0;
+                }
+                else if (aboveSingle)
+                {
+                    // Only the left neighbour uses a second reference (2/3).
+                    ctx = (above.RefFrame[0] == cm.CompFixedRef || !above.IsInterBlock()) ? 3 : 2;
+                }
+                else if (leftSingle)
+                {
+                    // Only the above neighbour uses a second reference (2/3).
+                    ctx = (left.RefFrame[0] == cm.CompFixedRef || !left.IsInterBlock()) ? 3 : 2;
+                }
+                else
+                {
+                    // Both neighbours use a second reference.
+                    ctx = 4;
+                }
+            }
+            else if (hasAbove || hasLeft)
+            {
+                // Exactly one neighbour is available.
+                ref ModeInfo edge = ref hasAbove ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;
+
+                if (edge.HasSecondRef())
+                {
+                    ctx = 3;
+                }
+                else
+                {
+                    ctx = edge.RefFrame[0] == cm.CompFixedRef ? 1 : 0;
+                }
+            }
+            else
+            {
+                // No neighbour is available.
+                ctx = 1;
+            }
+
+            Debug.Assert(ctx >= 0 && ctx < Constants.CompInterContexts);
+            return ctx;
+        }
+
+        /// <summary>
+        /// Returns a context number for the given MB prediction signal, derived from the
+        /// reference frames of the above and left neighbours compared against
+        /// <c>cm.CompVarRef</c> and <c>cm.CompFixedRef</c>.
+        /// </summary>
+        /// <param name="cm">Frame-level state; reads <c>RefFrameSignBias</c>, <c>CompFixedRef</c> and <c>CompVarRef</c></param>
+        /// <param name="xd">Block-level state; reads <c>AboveMi</c> and <c>LeftMi</c></param>
+        /// <returns>A value between 0 and 4; asserted to be below <c>Constants.RefContexts</c></returns>
+        public static int GetPredContextCompRefP(ref Vp9Common cm, ref MacroBlockD xd)
+        {
+            // Note:
+            // The mode info data structure has a one element border above and to the
+            // left of the entries corresponding to real macroblocks.
+            // The prediction flags in these dummy entries are initialized to 0.
+            int fixRefIdx = cm.RefFrameSignBias[cm.CompFixedRef];
+            // The variable reference occupies whichever RefFrame slot the fixed one does not.
+            int varRefIdx = fixRefIdx == 0 ? 1 : 0;
+
+            bool hasAbove = !xd.AboveMi.IsNull;
+            bool hasLeft = !xd.LeftMi.IsNull;
+            int predContext;
+
+            if (hasAbove && hasLeft)
+            {
+                ref ModeInfo above = ref xd.AboveMi.Value;
+                ref ModeInfo left = ref xd.LeftMi.Value;
+                bool aboveIntra = !above.IsInterBlock();
+                bool leftIntra = !left.IsInterBlock();
+
+                if (aboveIntra && leftIntra)
+                {
+                    // Intra/Intra
+                    predContext = 2;
+                }
+                else if (aboveIntra || leftIntra)
+                {
+                    // Intra/Inter: look only at the inter neighbour (1/3).
+                    ref ModeInfo edge = ref aboveIntra ? ref left : ref above;
+                    int refIdx = edge.HasSecondRef() ? varRefIdx : 0;
+
+                    predContext = edge.RefFrame[refIdx] != cm.CompVarRef[1] ? 3 : 1;
+                }
+                else
+                {
+                    // Inter/Inter
+                    bool leftSingle = !left.HasSecondRef();
+                    bool aboveSingle = !above.HasSecondRef();
+                    sbyte aboveVarRef = above.RefFrame[aboveSingle ? 0 : varRefIdx];
+                    sbyte leftVarRef = left.RefFrame[leftSingle ? 0 : varRefIdx];
+
+                    if (aboveVarRef == leftVarRef && cm.CompVarRef[1] == aboveVarRef)
+                    {
+                        predContext = 0;
+                    }
+                    else if (leftSingle && aboveSingle)
+                    {
+                        // Single/Single
+                        bool fixedPlusVar0 =
+                            (aboveVarRef == cm.CompFixedRef && leftVarRef == cm.CompVarRef[0]) ||
+                            (leftVarRef == cm.CompFixedRef && aboveVarRef == cm.CompVarRef[0]);
+
+                        if (fixedPlusVar0)
+                        {
+                            predContext = 4;
+                        }
+                        else
+                        {
+                            predContext = aboveVarRef == leftVarRef ? 3 : 1;
+                        }
+                    }
+                    else if (leftSingle || aboveSingle)
+                    {
+                        // Single/Comp
+                        sbyte compRef = leftSingle ? aboveVarRef : leftVarRef;
+                        sbyte singleRef = aboveSingle ? aboveVarRef : leftVarRef;
+                        bool compIsVar1 = compRef == cm.CompVarRef[1];
+                        bool singleIsVar1 = singleRef == cm.CompVarRef[1];
+
+                        if (compIsVar1 && !singleIsVar1)
+                        {
+                            predContext = 1;
+                        }
+                        else if (singleIsVar1 && !compIsVar1)
+                        {
+                            predContext = 2;
+                        }
+                        else
+                        {
+                            predContext = 4;
+                        }
+                    }
+                    else
+                    {
+                        // Comp/Comp
+                        predContext = aboveVarRef == leftVarRef ? 4 : 2;
+                    }
+                }
+            }
+            else if (hasAbove || hasLeft)
+            {
+                // Exactly one neighbour is available.
+                ref ModeInfo edge = ref hasAbove ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;
+
+                if (!edge.IsInterBlock())
+                {
+                    predContext = 2;
+                }
+                else if (edge.HasSecondRef())
+                {
+                    predContext = edge.RefFrame[varRefIdx] != cm.CompVarRef[1] ? 4 : 0;
+                }
+                else
+                {
+                    predContext = edge.RefFrame[0] != cm.CompVarRef[1] ? 3 : 0;
+                }
+            }
+            else
+            {
+                // No neighbour is available.
+                predContext = 2;
+            }
+
+            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
+            return predContext;
+        }
+
+        /// <summary>
+        /// Derives a context index from how the above and left neighbours reference
+        /// <c>Constants.LastFrame</c>.
+        /// </summary>
+        /// <param name="xd">Block-level state; reads <c>AboveMi</c> and <c>LeftMi</c></param>
+        /// <returns>A value between 0 and 4; asserted to be below <c>Constants.RefContexts</c></returns>
+        public static int GetPredContextSingleRefP1(ref MacroBlockD xd)
+        {
+            // Note:
+            // The mode info data structure has a one element border above and to the
+            // left of the entries corresponding to real macroblocks.
+            // The prediction flags in these dummy entries are initialized to 0.
+            bool hasAbove = !xd.AboveMi.IsNull;
+            bool hasLeft = !xd.LeftMi.IsNull;
+            int predContext;
+
+            if (hasAbove && hasLeft)
+            {
+                ref ModeInfo above = ref xd.AboveMi.Value;
+                ref ModeInfo left = ref xd.LeftMi.Value;
+                bool aboveIntra = !above.IsInterBlock();
+                bool leftIntra = !left.IsInterBlock();
+
+                if (aboveIntra && leftIntra)
+                {
+                    // Intra/Intra
+                    predContext = 2;
+                }
+                else if (aboveIntra || leftIntra)
+                {
+                    // Intra/Inter or Inter/Intra: only the inter neighbour matters.
+                    predContext = GetSingleRefP1EdgeContext(ref aboveIntra ? ref left : ref above);
+                }
+                else
+                {
+                    // Inter/Inter
+                    bool aboveComp = above.HasSecondRef();
+                    bool leftComp = left.HasSecondRef();
+                    sbyte a0 = above.RefFrame[0];
+                    sbyte a1 = above.RefFrame[1];
+                    sbyte l0 = left.RefFrame[0];
+                    sbyte l1 = left.RefFrame[1];
+
+                    if (aboveComp && leftComp)
+                    {
+                        bool anyLast = a0 == Constants.LastFrame || a1 == Constants.LastFrame ||
+                                       l0 == Constants.LastFrame || l1 == Constants.LastFrame;
+                        predContext = anyLast ? 2 : 1;
+                    }
+                    else if (aboveComp || leftComp)
+                    {
+                        // One neighbour has a second reference, the other has only one.
+                        sbyte single;
+                        sbyte comp0;
+                        sbyte comp1;
+
+                        if (aboveComp)
+                        {
+                            single = l0;
+                            comp0 = a0;
+                            comp1 = a1;
+                        }
+                        else
+                        {
+                            single = a0;
+                            comp0 = l0;
+                            comp1 = l1;
+                        }
+
+                        int compUsesLast = (comp0 == Constants.LastFrame || comp1 == Constants.LastFrame) ? 1 : 0;
+                        predContext = (single == Constants.LastFrame ? 3 : 0) + compUsesLast;
+                    }
+                    else
+                    {
+                        int lastCount = (a0 == Constants.LastFrame ? 1 : 0) + (l0 == Constants.LastFrame ? 1 : 0);
+                        predContext = 2 * lastCount;
+                    }
+                }
+            }
+            else if (hasAbove || hasLeft)
+            {
+                // Exactly one neighbour is available.
+                ref ModeInfo edge = ref hasAbove ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;
+
+                if (edge.IsInterBlock())
+                {
+                    predContext = GetSingleRefP1EdgeContext(ref edge);
+                }
+                else
+                {
+                    predContext = 2;
+                }
+            }
+            else
+            {
+                // No neighbour is available.
+                predContext = 2;
+            }
+
+            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
+            return predContext;
+        }
+
+        // Context contributed by a single inter neighbour: 0/4 when it has one reference,
+        // 1/2 when it has two, the higher value meaning LastFrame is among them.
+        private static int GetSingleRefP1EdgeContext(ref ModeInfo edge)
+        {
+            if (!edge.HasSecondRef())
+            {
+                return edge.RefFrame[0] == Constants.LastFrame ? 4 : 0;
+            }
+
+            bool usesLast = edge.RefFrame[0] == Constants.LastFrame ||
+                            edge.RefFrame[1] == Constants.LastFrame;
+            return usesLast ? 2 : 1;
+        }
+
+        /// <summary>
+        /// Derives a context index from how the above and left neighbours reference
+        /// <c>Constants.LastFrame</c>, <c>Constants.GoldenFrame</c> and <c>Constants.AltRefFrame</c>.
+        /// </summary>
+        /// <param name="xd">Block-level state; reads <c>AboveMi</c> and <c>LeftMi</c></param>
+        /// <returns>A value between 0 and 4; asserted to be below <c>Constants.RefContexts</c></returns>
+        public static int GetPredContextSingleRefP2(ref MacroBlockD xd)
+        {
+            // Note:
+            // The mode info data structure has a one element border above and to the
+            // left of the entries corresponding to real macroblocks.
+            // The prediction flags in these dummy entries are initialized to 0.
+            bool hasAbove = !xd.AboveMi.IsNull;
+            bool hasLeft = !xd.LeftMi.IsNull;
+            int predContext;
+
+            if (hasAbove && hasLeft)
+            {
+                ref ModeInfo above = ref xd.AboveMi.Value;
+                ref ModeInfo left = ref xd.LeftMi.Value;
+                bool aboveIntra = !above.IsInterBlock();
+                bool leftIntra = !left.IsInterBlock();
+
+                if (aboveIntra && leftIntra)
+                {
+                    // Intra/Intra
+                    predContext = 2;
+                }
+                else if (aboveIntra || leftIntra)
+                {
+                    // Intra/Inter or Inter/Intra: only the inter neighbour matters.
+                    ref ModeInfo edge = ref aboveIntra ? ref left : ref above;
+
+                    if (edge.HasSecondRef())
+                    {
+                        bool usesGolden = edge.RefFrame[0] == Constants.GoldenFrame ||
+                                          edge.RefFrame[1] == Constants.GoldenFrame;
+                        predContext = usesGolden ? 3 : 1;
+                    }
+                    else if (edge.RefFrame[0] == Constants.LastFrame)
+                    {
+                        predContext = 3;
+                    }
+                    else
+                    {
+                        predContext = edge.RefFrame[0] == Constants.GoldenFrame ? 4 : 0;
+                    }
+                }
+                else
+                {
+                    // Inter/Inter
+                    bool aboveComp = above.HasSecondRef();
+                    bool leftComp = left.HasSecondRef();
+                    sbyte a0 = above.RefFrame[0];
+                    sbyte a1 = above.RefFrame[1];
+                    sbyte l0 = left.RefFrame[0];
+                    sbyte l1 = left.RefFrame[1];
+
+                    if (aboveComp && leftComp)
+                    {
+                        if (a0 != l0 || a1 != l1)
+                        {
+                            predContext = 2;
+                        }
+                        else
+                        {
+                            bool anyGolden = a0 == Constants.GoldenFrame || a1 == Constants.GoldenFrame ||
+                                             l0 == Constants.GoldenFrame || l1 == Constants.GoldenFrame;
+                            predContext = anyGolden ? 3 : 0;
+                        }
+                    }
+                    else if (aboveComp || leftComp)
+                    {
+                        // One neighbour has a second reference, the other has only one.
+                        sbyte single;
+                        sbyte comp0;
+                        sbyte comp1;
+
+                        if (aboveComp)
+                        {
+                            single = l0;
+                            comp0 = a0;
+                            comp1 = a1;
+                        }
+                        else
+                        {
+                            single = a0;
+                            comp0 = l0;
+                            comp1 = l1;
+                        }
+
+                        int compUsesGolden = (comp0 == Constants.GoldenFrame || comp1 == Constants.GoldenFrame) ? 1 : 0;
+
+                        if (single == Constants.GoldenFrame)
+                        {
+                            predContext = 3 + compUsesGolden;
+                        }
+                        else if (single == Constants.AltRefFrame)
+                        {
+                            predContext = compUsesGolden;
+                        }
+                        else
+                        {
+                            predContext = 1 + 2 * compUsesGolden;
+                        }
+                    }
+                    else if (a0 == Constants.LastFrame && l0 == Constants.LastFrame)
+                    {
+                        predContext = 3;
+                    }
+                    else if (a0 == Constants.LastFrame || l0 == Constants.LastFrame)
+                    {
+                        // Exactly one neighbour references LastFrame; classify by the other one.
+                        sbyte other = a0 == Constants.LastFrame ? l0 : a0;
+                        predContext = other == Constants.GoldenFrame ? 4 : 0;
+                    }
+                    else
+                    {
+                        int goldenCount = (a0 == Constants.GoldenFrame ? 1 : 0) + (l0 == Constants.GoldenFrame ? 1 : 0);
+                        predContext = 2 * goldenCount;
+                    }
+                }
+            }
+            else if (hasAbove || hasLeft)
+            {
+                // Exactly one neighbour is available.
+                ref ModeInfo edge = ref hasAbove ? ref xd.AboveMi.Value : ref xd.LeftMi.Value;
+
+                if (!edge.IsInterBlock())
+                {
+                    predContext = 2;
+                }
+                else if (edge.HasSecondRef())
+                {
+                    bool usesGolden = edge.RefFrame[0] == Constants.GoldenFrame ||
+                                      edge.RefFrame[1] == Constants.GoldenFrame;
+                    predContext = usesGolden ? 3 : 0;
+                }
+                else if (edge.RefFrame[0] == Constants.LastFrame)
+                {
+                    predContext = 2;
+                }
+                else
+                {
+                    predContext = edge.RefFrame[0] == Constants.GoldenFrame ? 4 : 0;
+                }
+            }
+            else
+            {
+                // No neighbour is available.
+                predContext = 2;
+            }
+
+            Debug.Assert(predContext >= 0 && predContext < Constants.RefContexts);
+            return predContext;
+        }
+    }
+}

+ 203 - 0
Ryujinx.Graphics.Nvdec.Vp9/QuantCommon.cs

@@ -0,0 +1,203 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using System;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class QuantCommon
+    {
+        public const int MinQ = 0; // NOTE(review): presumably the lowest valid quantizer index; the methods in this class clamp against a literal 0 instead
+        public const int MaxQ = 255; // Upper clamp bound for lookup-table indices in DcQuant/AcQuant and for the result of GetQIndex
+
+        private static readonly short[] DcQlookup = new short[] // Returned by DcQuant for BitDepth.Bits8; indexed by qindex + delta clamped to [0, MaxQ]
+        {
+            4,    8,    8,    9,    10,  11,  12,  12,  13,  14,  15,   16,   17,   18,
+            19,   19,   20,   21,   22,  23,  24,  25,  26,  26,  27,   28,   29,   30,
+            31,   32,   32,   33,   34,  35,  36,  37,  38,  38,  39,   40,   41,   42,
+            43,   43,   44,   45,   46,  47,  48,  48,  49,  50,  51,   52,   53,   53,
+            54,   55,   56,   57,   57,  58,  59,  60,  61,  62,  62,   63,   64,   65,
+            66,   66,   67,   68,   69,  70,  70,  71,  72,  73,  74,   74,   75,   76,
+            77,   78,   78,   79,   80,  81,  81,  82,  83,  84,  85,   85,   87,   88,
+            90,   92,   93,   95,   96,  98,  99,  101, 102, 104, 105,  107,  108,  110,
+            111,  113,  114,  116,  117, 118, 120, 121, 123, 125, 127,  129,  131,  134,
+            136,  138,  140,  142,  144, 146, 148, 150, 152, 154, 156,  158,  161,  164,
+            166,  169,  172,  174,  177, 180, 182, 185, 187, 190, 192,  195,  199,  202,
+            205,  208,  211,  214,  217, 220, 223, 226, 230, 233, 237,  240,  243,  247,
+            250,  253,  257,  261,  265, 269, 272, 276, 280, 284, 288,  292,  296,  300,
+            304,  309,  313,  317,  322, 326, 330, 335, 340, 344, 349,  354,  359,  364,
+            369,  374,  379,  384,  389, 395, 400, 406, 411, 417, 423,  429,  435,  441,
+            447,  454,  461,  467,  475, 482, 489, 497, 505, 513, 522,  530,  539,  549,
+            559,  569,  579,  590,  602, 614, 626, 640, 654, 668, 684,  700,  717,  736,
+            755,  775,  796,  819,  843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139,
+            1184, 1232, 1282, 1336,
+        };
+
+        private static readonly short[] DcQlookup10 = new short[] // Returned by DcQuant for BitDepth.Bits10; indexed by qindex + delta clamped to [0, MaxQ]
+        {
+            4,    9,    10,   13,   15,   17,   20,   22,   25,   28,   31,   34,   37,
+            40,   43,   47,   50,   53,   57,   60,   64,   68,   71,   75,   78,   82,
+            86,   90,   93,   97,   101,  105,  109,  113,  116,  120,  124,  128,  132,
+            136,  140,  143,  147,  151,  155,  159,  163,  166,  170,  174,  178,  182,
+            185,  189,  193,  197,  200,  204,  208,  212,  215,  219,  223,  226,  230,
+            233,  237,  241,  244,  248,  251,  255,  259,  262,  266,  269,  273,  276,
+            280,  283,  287,  290,  293,  297,  300,  304,  307,  310,  314,  317,  321,
+            324,  327,  331,  334,  337,  343,  350,  356,  362,  369,  375,  381,  387,
+            394,  400,  406,  412,  418,  424,  430,  436,  442,  448,  454,  460,  466,
+            472,  478,  484,  490,  499,  507,  516,  525,  533,  542,  550,  559,  567,
+            576,  584,  592,  601,  609,  617,  625,  634,  644,  655,  666,  676,  687,
+            698,  708,  718,  729,  739,  749,  759,  770,  782,  795,  807,  819,  831,
+            844,  856,  868,  880,  891,  906,  920,  933,  947,  961,  975,  988,  1001,
+            1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202,
+            1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436,
+            1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717,
+            1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088,
+            2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675,
+            2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823,
+            3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347,
+        };
+
+        private static readonly short[] DcQlookup12 = new short[] // Returned by DcQuant for BitDepth.Bits12; indexed by qindex + delta clamped to [0, MaxQ]
+        {
+            4,     12,    18,    25,    33,    41,    50,    60,    70,    80,    91,
+            103,   115,   127,   140,   153,   166,   180,   194,   208,   222,   237,
+            251,   266,   281,   296,   312,   327,   343,   358,   374,   390,   405,
+            421,   437,   453,   469,   484,   500,   516,   532,   548,   564,   580,
+            596,   611,   627,   643,   659,   674,   690,   706,   721,   737,   752,
+            768,   783,   798,   814,   829,   844,   859,   874,   889,   904,   919,
+            934,   949,   964,   978,   993,   1008,  1022,  1037,  1051,  1065,  1080,
+            1094,  1108,  1122,  1136,  1151,  1165,  1179,  1192,  1206,  1220,  1234,
+            1248,  1261,  1275,  1288,  1302,  1315,  1329,  1342,  1368,  1393,  1419,
+            1444,  1469,  1494,  1519,  1544,  1569,  1594,  1618,  1643,  1668,  1692,
+            1717,  1741,  1765,  1789,  1814,  1838,  1862,  1885,  1909,  1933,  1957,
+            1992,  2027,  2061,  2096,  2130,  2165,  2199,  2233,  2267,  2300,  2334,
+            2367,  2400,  2434,  2467,  2499,  2532,  2575,  2618,  2661,  2704,  2746,
+            2788,  2830,  2872,  2913,  2954,  2995,  3036,  3076,  3127,  3177,  3226,
+            3275,  3324,  3373,  3421,  3469,  3517,  3565,  3621,  3677,  3733,  3788,
+            3843,  3897,  3951,  4005,  4058,  4119,  4181,  4241,  4301,  4361,  4420,
+            4479,  4546,  4612,  4677,  4742,  4807,  4871,  4942,  5013,  5083,  5153,
+            5222,  5291,  5367,  5442,  5517,  5591,  5665,  5745,  5825,  5905,  5984,
+            6063,  6149,  6234,  6319,  6404,  6495,  6587,  6678,  6769,  6867,  6966,
+            7064,  7163,  7269,  7376,  7483,  7599,  7715,  7832,  7958,  8085,  8214,
+            8352,  8492,  8635,  8788,  8945,  9104,  9275,  9450,  9639,  9832,  10031,
+            10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118,
+            13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949,
+            19718, 20521, 21387,
+        };
+
+        private static readonly short[] AcQlookup = new short[] // Returned by AcQuant for BitDepth.Bits8; indexed by qindex + delta clamped to [0, MaxQ]
+        {
+            4,    8,    9,    10,   11,   12,   13,   14,   15,   16,   17,   18,   19,
+            20,   21,   22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
+            33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,   45,
+            46,   47,   48,   49,   50,   51,   52,   53,   54,   55,   56,   57,   58,
+            59,   60,   61,   62,   63,   64,   65,   66,   67,   68,   69,   70,   71,
+            72,   73,   74,   75,   76,   77,   78,   79,   80,   81,   82,   83,   84,
+            85,   86,   87,   88,   89,   90,   91,   92,   93,   94,   95,   96,   97,
+            98,   99,   100,  101,  102,  104,  106,  108,  110,  112,  114,  116,  118,
+            120,  122,  124,  126,  128,  130,  132,  134,  136,  138,  140,  142,  144,
+            146,  148,  150,  152,  155,  158,  161,  164,  167,  170,  173,  176,  179,
+            182,  185,  188,  191,  194,  197,  200,  203,  207,  211,  215,  219,  223,
+            227,  231,  235,  239,  243,  247,  251,  255,  260,  265,  270,  275,  280,
+            285,  290,  295,  300,  305,  311,  317,  323,  329,  335,  341,  347,  353,
+            359,  366,  373,  380,  387,  394,  401,  408,  416,  424,  432,  440,  448,
+            456,  465,  474,  483,  492,  501,  510,  520,  530,  540,  550,  560,  571,
+            582,  593,  604,  615,  627,  639,  651,  663,  676,  689,  702,  715,  729,
+            743,  757,  771,  786,  801,  816,  832,  848,  864,  881,  898,  915,  933,
+            951,  969,  988,  1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196,
+            1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537,
+            1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+        };
+
+        private static readonly short[] AcQlookup10 = new short[] // Returned by AcQuant for BitDepth.Bits10; indexed by qindex + delta clamped to [0, MaxQ]
+        {
+            4,    9,    11,   13,   16,   18,   21,   24,   27,   30,   33,   37,   40,
+            44,   48,   51,   55,   59,   63,   67,   71,   75,   79,   83,   88,   92,
+            96,   100,  105,  109,  114,  118,  122,  127,  131,  136,  140,  145,  149,
+            154,  158,  163,  168,  172,  177,  181,  186,  190,  195,  199,  204,  208,
+            213,  217,  222,  226,  231,  235,  240,  244,  249,  253,  258,  262,  267,
+            271,  275,  280,  284,  289,  293,  297,  302,  306,  311,  315,  319,  324,
+            328,  332,  337,  341,  345,  349,  354,  358,  362,  367,  371,  375,  379,
+            384,  388,  392,  396,  401,  409,  417,  425,  433,  441,  449,  458,  466,
+            474,  482,  490,  498,  506,  514,  523,  531,  539,  547,  555,  563,  571,
+            579,  588,  596,  604,  616,  628,  640,  652,  664,  676,  688,  700,  713,
+            725,  737,  749,  761,  773,  785,  797,  809,  825,  841,  857,  873,  889,
+            905,  922,  938,  954,  970,  986,  1002, 1018, 1038, 1058, 1078, 1098, 1118,
+            1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411,
+            1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791,
+            1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283,
+            2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915,
+            2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731,
+            3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784,
+            4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148,
+            6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312,
+        };
+
+        private static readonly short[] AcQlookup12 = new short[] // Returned by AcQuant for BitDepth.Bits12; indexed by qindex + delta clamped to [0, MaxQ]
+        {
+            4,     13,    19,    27,    35,    44,    54,    64,    75,    87,    99,
+            112,   126,   139,   154,   168,   183,   199,   214,   230,   247,   263,
+            280,   297,   314,   331,   349,   366,   384,   402,   420,   438,   456,
+            475,   493,   511,   530,   548,   567,   586,   604,   623,   642,   660,
+            679,   698,   716,   735,   753,   772,   791,   809,   828,   846,   865,
+            884,   902,   920,   939,   957,   976,   994,   1012,  1030,  1049,  1067,
+            1085,  1103,  1121,  1139,  1157,  1175,  1193,  1211,  1229,  1246,  1264,
+            1282,  1299,  1317,  1335,  1352,  1370,  1387,  1405,  1422,  1440,  1457,
+            1474,  1491,  1509,  1526,  1543,  1560,  1577,  1595,  1627,  1660,  1693,
+            1725,  1758,  1791,  1824,  1856,  1889,  1922,  1954,  1987,  2020,  2052,
+            2085,  2118,  2150,  2183,  2216,  2248,  2281,  2313,  2346,  2378,  2411,
+            2459,  2508,  2556,  2605,  2653,  2701,  2750,  2798,  2847,  2895,  2943,
+            2992,  3040,  3088,  3137,  3185,  3234,  3298,  3362,  3426,  3491,  3555,
+            3619,  3684,  3748,  3812,  3876,  3941,  4005,  4069,  4149,  4230,  4310,
+            4390,  4470,  4550,  4631,  4711,  4791,  4871,  4967,  5064,  5160,  5256,
+            5352,  5448,  5544,  5641,  5737,  5849,  5961,  6073,  6185,  6297,  6410,
+            6522,  6650,  6778,  6906,  7034,  7162,  7290,  7435,  7579,  7723,  7867,
+            8011,  8155,  8315,  8475,  8635,  8795,  8956,  9132,  9308,  9484,  9660,
+            9836,  10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885,
+            12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637,
+            14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062,
+            18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334,
+            22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599,
+            28143, 28687, 29247,
+        };
+
+        /// <summary>
+        /// Looks up the DC table entry for <paramref name="qindex"/> + <paramref name="delta"/>,
+        /// clamped to [0, <see cref="MaxQ"/>], in the table matching <paramref name="bitDepth"/>.
+        /// </summary>
+        /// <param name="qindex">Base index into the table</param>
+        /// <param name="delta">Offset added to <paramref name="qindex"/> before clamping</param>
+        /// <param name="bitDepth">Selects the 8, 10 or 12 bit table</param>
+        /// <returns>The table entry, or -1 (after a debug assertion) for any other bit depth</returns>
+        public static short DcQuant(int qindex, int delta, BitDepth bitDepth)
+        {
+            short[] table;
+
+            switch (bitDepth)
+            {
+                case BitDepth.Bits8:
+                    table = DcQlookup;
+                    break;
+                case BitDepth.Bits10:
+                    table = DcQlookup10;
+                    break;
+                case BitDepth.Bits12:
+                    table = DcQlookup12;
+                    break;
+                default:
+                    Debug.Assert(false, "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+                    return -1;
+            }
+
+            return table[Math.Clamp(qindex + delta, 0, MaxQ)];
+        }
+
+        /// <summary>
+        /// Looks up the AC table entry for <paramref name="qindex"/> + <paramref name="delta"/>,
+        /// clamped to [0, <see cref="MaxQ"/>], in the table matching <paramref name="bitDepth"/>.
+        /// </summary>
+        /// <param name="qindex">Base index into the table</param>
+        /// <param name="delta">Offset added to <paramref name="qindex"/> before clamping</param>
+        /// <param name="bitDepth">Selects the 8, 10 or 12 bit table</param>
+        /// <returns>The table entry, or -1 (after a debug assertion) for any other bit depth</returns>
+        public static short AcQuant(int qindex, int delta, BitDepth bitDepth)
+        {
+            short[] table;
+
+            switch (bitDepth)
+            {
+                case BitDepth.Bits8:
+                    table = AcQlookup;
+                    break;
+                case BitDepth.Bits10:
+                    table = AcQlookup10;
+                    break;
+                case BitDepth.Bits12:
+                    table = AcQlookup12;
+                    break;
+                default:
+                    Debug.Assert(false, "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+                    return -1;
+            }
+
+            return table[Math.Clamp(qindex + delta, 0, MaxQ)];
+        }
+
+        /// <summary>
+        /// Resolves the index to use for a segment: <paramref name="baseQIndex"/> unless the
+        /// <c>SegLvlAltQ</c> feature is active for <paramref name="segmentId"/>.
+        /// </summary>
+        /// <param name="seg">Segmentation state queried for the feature flag and its data</param>
+        /// <param name="segmentId">Segment whose feature data is read</param>
+        /// <param name="baseQIndex">Index returned unchanged when the feature is inactive</param>
+        /// <returns>
+        /// <paramref name="baseQIndex"/> when the feature is inactive; otherwise the segment data,
+        /// taken as-is when <c>seg.AbsDelta</c> equals <c>Constants.SegmentAbsData</c> or added to
+        /// <paramref name="baseQIndex"/> if not, clamped to [0, <see cref="MaxQ"/>]
+        /// </returns>
+        public static int GetQIndex(ref Segmentation seg, int segmentId, int baseQIndex)
+        {
+            if (seg.IsSegFeatureActive(segmentId, SegLvlFeatures.SegLvlAltQ) == 0)
+            {
+                return baseQIndex;
+            }
+
+            int segQIndex = seg.GetSegData(segmentId, SegLvlFeatures.SegLvlAltQ);
+
+            if (seg.AbsDelta != Constants.SegmentAbsData)
+            {
+                // The segment data is an offset from the base index rather than a replacement.
+                segQIndex += baseQIndex;
+            }
+
+            return Math.Clamp(segQIndex, 0, MaxQ);
+        }
+    }
+}

+ 234 - 0
Ryujinx.Graphics.Nvdec.Vp9/ReconInter.cs

@@ -0,0 +1,234 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class ReconInter
+    {
+        /// <summary>
+        /// Forwards an 8-bit inter prediction request to <paramref name="sf"/>, telling it whether
+        /// each axis has a non-zero sub-pixel offset.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void InterPredictor(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            int subpelX,
+            int subpelY,
+            ref ScaleFactors sf,
+            int w,
+            int h,
+            int refr,
+            Array8<short>[] kernel,
+            int xs,
+            int ys)
+        {
+            // 1 when the corresponding axis needs sub-pixel filtering, 0 otherwise.
+            int hasSubpelX = subpelX != 0 ? 1 : 0;
+            int hasSubpelY = subpelY != 0 ? 1 : 0;
+
+            sf.InterPredict(hasSubpelX, hasSubpelY, refr, src, srcStride, dst, dstStride, subpelX, subpelY, w, h, kernel, xs, ys);
+        }
+
+        /// <summary>
+        /// Forwards a high bit depth (16-bit sample) inter prediction request to <paramref name="sf"/>,
+        /// telling it whether each axis has a non-zero sub-pixel offset.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void HighbdInterPredictor(
+            ushort* src,
+            int srcStride,
+            ushort* dst,
+            int dstStride,
+            int subpelX,
+            int subpelY,
+            ref ScaleFactors sf,
+            int w,
+            int h,
+            int refr,
+            Array8<short>[] kernel,
+            int xs,
+            int ys,
+            int bd)
+        {
+            // 1 when the corresponding axis needs sub-pixel filtering, 0 otherwise.
+            int hasSubpelX = subpelX != 0 ? 1 : 0;
+            int hasSubpelY = subpelY != 0 ? 1 : 0;
+
+            sf.HighbdInterPredict(hasSubpelX, hasSubpelY, refr, src, srcStride, dst, dstStride, subpelX, subpelY, w, h, kernel, xs, ys, bd);
+        }
+
+        /// <summary>
+        /// Divides <paramref name="value"/> by 4, rounding to nearest with ties away from zero.
+        /// </summary>
+        private static int RoundMvCompQ4(int value)
+        {
+            // Bias by half the divisor, in the direction of the sign, before the truncating division.
+            int bias = value < 0 ? -2 : 2;
+            return (value + bias) / 4;
+        }
+
+        /// <summary>
+        /// Averages the motion vectors of the four sub-blocks in <paramref name="mi"/> for
+        /// reference slot <paramref name="idx"/>, rounding each component with <see cref="RoundMvCompQ4"/>.
+        /// </summary>
+        private static Mv MiMvPredQ4(ref ModeInfo mi, int idx)
+        {
+            int rowSum = 0;
+            int colSum = 0;
+
+            for (int block = 0; block < 4; block++)
+            {
+                rowSum += mi.Bmi[block].Mv[idx].Row;
+                colSum += mi.Bmi[block].Mv[idx].Col;
+            }
+
+            return new Mv()
+            {
+                Row = (short)RoundMvCompQ4(rowSum),
+                Col = (short)RoundMvCompQ4(colSum)
+            };
+        }
+
+        /// <summary>
+        /// Divides <paramref name="value"/> by 2, rounding to nearest with ties away from zero.
+        /// </summary>
+        private static int RoundMvCompQ2(int value)
+        {
+            // Bias by half the divisor, in the direction of the sign, before the truncating division.
+            int bias = value < 0 ? -1 : 1;
+            return (value + bias) / 2;
+        }
+
+        /// <summary>
+        /// Averages the motion vectors of sub-blocks <paramref name="block0"/> and <paramref name="block1"/>
+        /// for reference slot <paramref name="idx"/>, rounding each component with <see cref="RoundMvCompQ2"/>.
+        /// </summary>
+        private static Mv MiMvPredQ2(ref ModeInfo mi, int idx, int block0, int block1)
+        {
+            int rowSum = mi.Bmi[block0].Mv[idx].Row + mi.Bmi[block1].Mv[idx].Row;
+            int colSum = mi.Bmi[block0].Mv[idx].Col + mi.Bmi[block1].Mv[idx].Col;
+
+            return new Mv()
+            {
+                Row = (short)RoundMvCompQ2(rowSum),
+                Col = (short)RoundMvCompQ2(colSum)
+            };
+        }
+
+        /// <summary>
+        /// Scales <paramref name="srcMv"/> for the plane's subsampling and clamps it to the
+        /// block-to-edge distances of <paramref name="xd"/> widened by the interpolation border.
+        /// </summary>
+        /// <param name="xd">Supplies the MbTo*Edge distances used as clamp limits.</param>
+        /// <param name="srcMv">Motion vector to scale and clamp; not modified.</param>
+        /// <param name="bw">Block width added to the interpolation extension on the horizontal axis.</param>
+        /// <param name="bh">Block height added to the interpolation extension on the vertical axis.</param>
+        /// <param name="ssX">Horizontal subsampling shift; asserted to be at most 1.</param>
+        /// <param name="ssY">Vertical subsampling shift; asserted to be at most 1.</param>
+        /// <returns>The scaled, clamped motion vector.</returns>
+        public static Mv ClampMvToUmvBorderSb(ref MacroBlockD xd, ref Mv srcMv, int bw, int bh, int ssX, int ssY)
+        {
+            Debug.Assert(ssX <= 1);
+            Debug.Assert(ssY <= 1);
+
+            // A non-subsampled axis (shift 0) doubles the values; a subsampled axis (shift 1) keeps them.
+            int scaleX = 1 << (1 - ssX);
+            int scaleY = 1 << (1 - ssY);
+
+            // If the MV points so far into the UMV border that no visible pixels
+            // are used for reconstruction, the subpel part of the MV can be
+            // discarded and the MV limited to 16 pixels with equivalent results.
+            int borderX = (Constants.Vp9InterpExtend + bw) << SubpelBits;
+            int borderY = (Constants.Vp9InterpExtend + bh) << SubpelBits;
+
+            Mv result = new Mv()
+            {
+                Row = (short)(srcMv.Row * scaleY),
+                Col = (short)(srcMv.Col * scaleX)
+            };
+
+            int minCol = xd.MbToLeftEdge * scaleX - borderX;
+            int maxCol = xd.MbToRightEdge * scaleX + (borderX - SubpelShifts);
+            int minRow = xd.MbToTopEdge * scaleY - borderY;
+            int maxRow = xd.MbToBottomEdge * scaleY + (borderY - SubpelShifts);
+
+            result.ClampMv(minCol, maxCol, minRow, maxRow);
+
+            return result;
+        }
+
+        /// <summary>
+        /// Picks the motion vector for a sub-block of a split block, averaging neighbouring
+        /// sub-block vectors along each axis on which the plane is subsampled.
+        /// </summary>
+        /// <param name="pd">Plane whose subsampling flags decide which vectors are averaged.</param>
+        /// <param name="mi">Mode info holding the per-sub-block motion vectors.</param>
+        /// <param name="refr">Reference slot index into each sub-block's vector pair.</param>
+        /// <param name="block">Sub-block index.</param>
+        /// <returns>The selected or averaged vector.</returns>
+        public static Mv AverageSplitMvs(ref MacroBlockDPlane pd, ref ModeInfo mi, int refr, int block)
+        {
+            // Bit 1: horizontally subsampled, bit 0: vertically subsampled.
+            int ssIdx = 0;
+            if (pd.SubsamplingX > 0)
+            {
+                ssIdx |= 2;
+            }
+
+            if (pd.SubsamplingY > 0)
+            {
+                ssIdx |= 1;
+            }
+
+            switch (ssIdx)
+            {
+                case 0:
+                    return mi.Bmi[block].Mv[refr];
+                case 1:
+                    return MiMvPredQ2(ref mi, refr, block, block + 2);
+                case 2:
+                    return MiMvPredQ2(ref mi, refr, block, block + 1);
+                case 3:
+                    return MiMvPredQ4(ref mi, refr);
+                default:
+                    Debug.Assert(ssIdx <= 3 && ssIdx >= 0);
+                    return new Mv();
+            }
+        }
+
+        /// <summary>
+        /// Converts an (x, y) position into a linear buffer offset, passing both coordinates
+        /// through <paramref name="sf"/> first when a scale factor is supplied.
+        /// </summary>
+        private static int ScaledBufferOffset(int xOffset, int yOffset, int stride, Ptr<ScaleFactors> sf)
+        {
+            if (sf.IsNull)
+            {
+                return yOffset * stride + xOffset;
+            }
+
+            int scaledX = sf.Value.ScaleValueX(xOffset);
+            int scaledY = sf.Value.ScaleValueY(yOffset);
+
+            return scaledY * stride + scaledX;
+        }
+
+        /// <summary>
+        /// Points <paramref name="dst"/> at the sample of <paramref name="src"/> that corresponds
+        /// to mode-info position (<paramref name="miRow"/>, <paramref name="miCol"/>) in a plane.
+        /// </summary>
+        /// <param name="dst">Receives the sliced buffer and its stride.</param>
+        /// <param name="src">Start of the plane.</param>
+        /// <param name="stride">Plane stride, used for the offset and stored in <paramref name="dst"/>.</param>
+        /// <param name="miRow">Row in mode-info units.</param>
+        /// <param name="miCol">Column in mode-info units.</param>
+        /// <param name="scale">Optional scale factors; may be null.</param>
+        /// <param name="subsamplingX">Horizontal subsampling shift of the plane.</param>
+        /// <param name="subsamplingY">Vertical subsampling shift of the plane.</param>
+        private static void SetupPredPlanes(
+            ref Buf2D dst,
+            ArrayPtr<byte> src,
+            int stride,
+            int miRow,
+            int miCol,
+            Ptr<ScaleFactors> scale,
+            int subsamplingX,
+            int subsamplingY)
+        {
+            int planeX = (Constants.MiSize * miCol) >> subsamplingX;
+            int planeY = (Constants.MiSize * miRow) >> subsamplingY;
+            int offset = ScaledBufferOffset(planeX, planeY, stride, scale);
+
+            dst.Buf = src.Slice(offset);
+            dst.Stride = stride;
+        }
+
+        /// <summary>
+        /// Points the destination buffer of every plane at the position of
+        /// (<paramref name="miRow"/>, <paramref name="miCol"/>) inside <paramref name="src"/>, without scaling.
+        /// </summary>
+        /// <param name="planes">Planes whose Dst buffers are updated.</param>
+        /// <param name="src">Surface providing the Y, U and V buffers and strides.</param>
+        /// <param name="miRow">Row in mode-info units.</param>
+        /// <param name="miCol">Column in mode-info units.</param>
+        public static void SetupDstPlanes(
+            ref Array3<MacroBlockDPlane> planes,
+            ref Surface src,
+            int miRow,
+            int miCol)
+        {
+            // Both chroma planes share the UV stride.
+            Span<int> planeStrides = stackalloc int[Constants.MaxMbPlane];
+            planeStrides[0] = src.Stride;
+            planeStrides[1] = src.UvStride;
+            planeStrides[2] = src.UvStride;
+
+            Span<ArrayPtr<byte>> planeBuffers = stackalloc ArrayPtr<byte>[Constants.MaxMbPlane];
+            planeBuffers[0] = src.YBuffer;
+            planeBuffers[1] = src.UBuffer;
+            planeBuffers[2] = src.VBuffer;
+
+            for (int plane = 0; plane < Constants.MaxMbPlane; plane++)
+            {
+                ref MacroBlockDPlane pd = ref planes[plane];
+                SetupPredPlanes(
+                    ref pd.Dst,
+                    planeBuffers[plane],
+                    planeStrides[plane],
+                    miRow,
+                    miCol,
+                    Ptr<ScaleFactors>.Null,
+                    pd.SubsamplingX,
+                    pd.SubsamplingY);
+            }
+        }
+
+        /// <summary>
+        /// Points reference buffer slot <paramref name="idx"/> of every plane in <paramref name="xd"/>
+        /// at the position of (<paramref name="miRow"/>, <paramref name="miCol"/>) inside <paramref name="src"/>.
+        /// Does nothing when any of the surface's three buffers is null.
+        /// </summary>
+        /// <param name="xd">Macroblock descriptor whose Plane[i].Pre[idx] entries are updated.</param>
+        /// <param name="idx">Reference slot to fill.</param>
+        /// <param name="src">Surface providing the Y, U and V buffers and strides.</param>
+        /// <param name="miRow">Row in mode-info units.</param>
+        /// <param name="miCol">Column in mode-info units.</param>
+        /// <param name="sf">Optional scale factors applied to the position; may be null.</param>
+        public static void SetupPrePlanes(
+            ref MacroBlockD xd,
+            int idx,
+            ref Surface src,
+            int miRow,
+            int miCol,
+            Ptr<ScaleFactors> sf)
+        {
+            if (src.YBuffer.IsNull || src.UBuffer.IsNull || src.VBuffer.IsNull)
+            {
+                return;
+            }
+
+            // Both chroma planes share the UV stride.
+            Span<int> planeStrides = stackalloc int[Constants.MaxMbPlane];
+            planeStrides[0] = src.Stride;
+            planeStrides[1] = src.UvStride;
+            planeStrides[2] = src.UvStride;
+
+            Span<ArrayPtr<byte>> planeBuffers = stackalloc ArrayPtr<byte>[Constants.MaxMbPlane];
+            planeBuffers[0] = src.YBuffer;
+            planeBuffers[1] = src.UBuffer;
+            planeBuffers[2] = src.VBuffer;
+
+            for (int plane = 0; plane < Constants.MaxMbPlane; plane++)
+            {
+                ref MacroBlockDPlane pd = ref xd.Plane[plane];
+                SetupPredPlanes(
+                    ref pd.Pre[idx],
+                    planeBuffers[plane],
+                    planeStrides[plane],
+                    miRow,
+                    miCol,
+                    sf,
+                    pd.SubsamplingX,
+                    pd.SubsamplingY);
+            }
+        }
+    }
+}

+ 761 - 0
Ryujinx.Graphics.Nvdec.Vp9/ReconIntra.cs

@@ -0,0 +1,761 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.IntraPred;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    internal static class ReconIntra
+    {
+        /// <summary>
+        /// Transform type associated with each intra prediction mode.
+        /// The entry order follows the mode named in each trailing comment (DC first, TM last).
+        /// </summary>
+        public static readonly TxType[] IntraModeToTxTypeLookup = new TxType[]
+        {
+            TxType.DctDct,    // DC
+            TxType.AdstDct,   // V
+            TxType.DctAdst,   // H
+            TxType.DctDct,    // D45
+            TxType.AdstAdst,  // D135
+            TxType.AdstDct,   // D117
+            TxType.DctAdst,   // D153
+            TxType.DctAdst,   // D207
+            TxType.AdstDct,   // D63
+            TxType.AdstAdst   // TM
+        };
+
+        // Bit flags describing which neighbouring edges a prediction mode reads.
+        private const int NeedLeft = 1 << 1;
+        private const int NeedAbove = 1 << 2;
+        private const int NeedAboveRight = 1 << 3;
+
+        /// <summary>
+        /// Per-mode combination of the Need* flags, indexed by (int)PredictionMode.
+        /// The build functions test these bits to decide which edge buffers to fill.
+        /// </summary>
+        private static readonly byte[] ExtendModes = new byte[]
+        {
+            NeedAbove | NeedLeft,  // DC
+            NeedAbove,             // V
+            NeedLeft,              // H
+            NeedAboveRight,        // D45
+            NeedLeft | NeedAbove,  // D135
+            NeedLeft | NeedAbove,  // D117
+            NeedLeft | NeedAbove,  // D153
+            NeedLeft,              // D207
+            NeedAboveRight,        // D63
+            NeedLeft | NeedAbove,  // TM
+        };
+
+        /// <summary>
+        /// Signature of the 8-bit intra predictors stored in the _pred and _dcPred tables.
+        /// </summary>
+        /// <param name="dst">Destination block.</param>
+        /// <param name="stride">Stride of <paramref name="dst"/>.</param>
+        /// <param name="above">Row of samples above the block.</param>
+        /// <param name="left">Column of samples to the left of the block.</param>
+        private unsafe delegate void IntraPredFn(byte* dst, int stride, byte* above, byte* left);
+
+        /// <summary>
+        /// 8-bit intra predictors indexed by [(int)PredictionMode][(int)TxSize].
+        /// The first row (DC) is intentionally null: DC prediction is dispatched through the
+        /// _dcPred table, which additionally depends on which neighbours are available.
+        /// The table is never reassigned, hence readonly.
+        /// </summary>
+        private static readonly unsafe IntraPredFn[][] _pred = new IntraPredFn[][]
+        {
+            new IntraPredFn[] { null, null, null, null },
+            new IntraPredFn[] { VPredictor4x4, VPredictor8x8, VPredictor16x16, VPredictor32x32 },
+            new IntraPredFn[] { HPredictor4x4, HPredictor8x8, HPredictor16x16, HPredictor32x32 },
+            new IntraPredFn[] { D45Predictor4x4, D45Predictor8x8, D45Predictor16x16, D45Predictor32x32 },
+            new IntraPredFn[] { D135Predictor4x4, D135Predictor8x8, D135Predictor16x16, D135Predictor32x32 },
+            new IntraPredFn[] { D117Predictor4x4, D117Predictor8x8, D117Predictor16x16, D117Predictor32x32 },
+            new IntraPredFn[] { D153Predictor4x4, D153Predictor8x8, D153Predictor16x16, D153Predictor32x32 },
+            new IntraPredFn[] { D207Predictor4x4, D207Predictor8x8, D207Predictor16x16, D207Predictor32x32 },
+            new IntraPredFn[] { D63Predictor4x4, D63Predictor8x8, D63Predictor16x16, D63Predictor32x32 },
+            new IntraPredFn[] { TMPredictor4x4, TMPredictor8x8, TMPredictor16x16, TMPredictor32x32 }
+        };
+
+        /// <summary>
+        /// 8-bit DC predictors indexed by [leftAvailable][upAvailable][(int)TxSize], where the
+        /// availability indices are 0 or 1. The variant is chosen by which neighbours exist:
+        /// none (Dc128), top only (DcTop), left only (DcLeft) or both (Dc).
+        /// The table is never reassigned, hence readonly.
+        /// </summary>
+        private static readonly unsafe IntraPredFn[][][] _dcPred = new IntraPredFn[][][]
+        {
+            // Left neighbour not available.
+            new IntraPredFn[][]
+            {
+                new IntraPredFn[] { Dc128Predictor4x4, Dc128Predictor8x8, Dc128Predictor16x16, Dc128Predictor32x32 },
+                new IntraPredFn[] { DcTopPredictor4x4, DcTopPredictor8x8, DcTopPredictor16x16, DcTopPredictor32x32 }
+            },
+            // Left neighbour available.
+            new IntraPredFn[][]
+            {
+                new IntraPredFn[] { DcLeftPredictor4x4, DcLeftPredictor8x8, DcLeftPredictor16x16, DcLeftPredictor32x32 },
+                new IntraPredFn[] { DcPredictor4x4, DcPredictor8x8, DcPredictor16x16, DcPredictor32x32 }
+            }
+        };
+
+        /// <summary>
+        /// Signature of the high bit depth (16-bit sample) intra predictors stored in the
+        /// _predHigh and _dcPredHigh tables.
+        /// </summary>
+        /// <param name="dst">Destination block.</param>
+        /// <param name="stride">Stride of <paramref name="dst"/>.</param>
+        /// <param name="above">Row of samples above the block.</param>
+        /// <param name="left">Column of samples to the left of the block.</param>
+        /// <param name="bd">Bit depth of the samples.</param>
+        private unsafe delegate void IntraHighPredFn(ushort* dst, int stride, ushort* above, ushort* left, int bd);
+
+        /// <summary>
+        /// High bit depth intra predictors indexed by [(int)PredictionMode][(int)TxSize].
+        /// The first row (DC) is intentionally null: DC prediction is dispatched through the
+        /// _dcPredHigh table, which additionally depends on which neighbours are available.
+        /// The table is never reassigned, hence readonly.
+        /// </summary>
+        private static readonly unsafe IntraHighPredFn[][] _predHigh = new IntraHighPredFn[][]
+        {
+            new IntraHighPredFn[] { null, null, null, null },
+            new IntraHighPredFn[] { HighbdVPredictor4x4, HighbdVPredictor8x8, HighbdVPredictor16x16, HighbdVPredictor32x32 },
+            new IntraHighPredFn[] { HighbdHPredictor4x4, HighbdHPredictor8x8, HighbdHPredictor16x16, HighbdHPredictor32x32 },
+            new IntraHighPredFn[] { HighbdD45Predictor4x4, HighbdD45Predictor8x8, HighbdD45Predictor16x16, HighbdD45Predictor32x32 },
+            new IntraHighPredFn[] { HighbdD135Predictor4x4, HighbdD135Predictor8x8, HighbdD135Predictor16x16, HighbdD135Predictor32x32 },
+            new IntraHighPredFn[] { HighbdD117Predictor4x4, HighbdD117Predictor8x8, HighbdD117Predictor16x16, HighbdD117Predictor32x32 },
+            new IntraHighPredFn[] { HighbdD153Predictor4x4, HighbdD153Predictor8x8, HighbdD153Predictor16x16, HighbdD153Predictor32x32 },
+            new IntraHighPredFn[] { HighbdD207Predictor4x4, HighbdD207Predictor8x8, HighbdD207Predictor16x16, HighbdD207Predictor32x32 },
+            new IntraHighPredFn[] { HighbdD63Predictor4x4, HighbdD63Predictor8x8, HighbdD63Predictor16x16, HighbdD63Predictor32x32 },
+            new IntraHighPredFn[] { HighbdTMPredictor4x4, HighbdTMPredictor8x8, HighbdTMPredictor16x16, HighbdTMPredictor32x32 }
+        };
+
+        /// <summary>
+        /// High bit depth DC predictors indexed by [leftAvailable][upAvailable][(int)TxSize], where
+        /// the availability indices are 0 or 1. The variant is chosen by which neighbours exist:
+        /// none (Dc128), top only (DcTop), left only (DcLeft) or both (Dc).
+        /// The table is never reassigned, hence readonly.
+        /// </summary>
+        private static readonly unsafe IntraHighPredFn[][][] _dcPredHigh = new IntraHighPredFn[][][]
+        {
+            // Left neighbour not available.
+            new IntraHighPredFn[][]
+            {
+                new IntraHighPredFn[] { HighbdDc128Predictor4x4, HighbdDc128Predictor8x8, HighbdDc128Predictor16x16, HighbdDc128Predictor32x32 },
+                new IntraHighPredFn[] { HighbdDcTopPredictor4x4, HighbdDcTopPredictor8x8, HighbdDcTopPredictor16x16, HighbdDcTopPredictor32x32 }
+            },
+            // Left neighbour available.
+            new IntraHighPredFn[][]
+            {
+                new IntraHighPredFn[] { HighbdDcLeftPredictor4x4, HighbdDcLeftPredictor8x8, HighbdDcLeftPredictor16x16, HighbdDcLeftPredictor32x32 },
+                new IntraHighPredFn[] { HighbdDcPredictor4x4, HighbdDcPredictor8x8, HighbdDcPredictor16x16, HighbdDcPredictor32x32 }
+            }
+        };
+
+        /// <summary>
+        /// High bit depth intra prediction: gathers the left column and above row of neighbouring
+        /// samples into local buffers (substituting constants or replicating the last valid sample
+        /// where neighbours are missing or lie past the frame edge), then invokes the predictor
+        /// selected by <paramref name="mode"/> and <paramref name="txSize"/>.
+        /// </summary>
+        /// <param name="xd">Supplies the bit depth, edge distances, plane subsampling and frame dimensions.</param>
+        /// <param name="ref8">Top-left sample of the block in the reference buffer; reinterpreted as ushort*.</param>
+        /// <param name="refStride">Stride of the reference buffer, applied to the ushort pointer.</param>
+        /// <param name="dst8">Destination block; reinterpreted as ushort*.</param>
+        /// <param name="dstStride">Stride passed through to the predictor.</param>
+        /// <param name="mode">Prediction mode; selects the required edges and the predictor row.</param>
+        /// <param name="txSize">Transform size; the block is (4 &lt;&lt; txSize) samples wide and tall.</param>
+        /// <param name="upAvailable">Non-zero when the row above is usable; also indexes the DC table, so expected to be 0 or 1.</param>
+        /// <param name="leftAvailable">Non-zero when the column to the left is usable; also indexes the DC table, so expected to be 0 or 1.</param>
+        /// <param name="rightAvailable">Non-zero when samples above-right are usable; only consulted for 4x4 blocks.</param>
+        /// <param name="x">Horizontal offset of the block, added to the position derived from <paramref name="xd"/>.</param>
+        /// <param name="y">Vertical offset of the block, added to the position derived from <paramref name="xd"/>.</param>
+        /// <param name="plane">Plane index; 0 uses the luma frame dimensions, anything else the chroma ones.</param>
+        private static unsafe void BuildIntraPredictorsHigh(
+            ref MacroBlockD xd,
+            byte* ref8,
+            int refStride,
+            byte* dst8,
+            int dstStride,
+            PredictionMode mode,
+            TxSize txSize,
+            int upAvailable,
+            int leftAvailable,
+            int rightAvailable,
+            int x,
+            int y,
+            int plane)
+        {
+            int i;
+            ushort* dst = (ushort*)dst8;
+            ushort* refr = (ushort*)ref8;
+            ushort* leftCol = stackalloc ushort[32];
+            ushort* aboveData = stackalloc ushort[64 + 16];
+            // Leave room before the row so that aboveRow[-1] (the top-left sample) is addressable.
+            ushort* aboveRow = aboveData + 16;
+            ushort* constAboveRow = aboveRow;
+            int bs = 4 << (int)txSize;
+            int frameWidth, frameHeight;
+            int x0, y0;
+            ref MacroBlockDPlane pd = ref xd.Plane[plane];
+            int needLeft = ExtendModes[(int)mode] & NeedLeft;
+            int needAbove = ExtendModes[(int)mode] & NeedAbove;
+            int needAboveRight = ExtendModes[(int)mode] & NeedAboveRight;
+            // Mid-range value scaled to the bit depth (128 for 8-bit).
+            int baseVal = 128 << (xd.Bd - 8);
+            // 127 127 127 .. 127 127 127 127 127 127
+            // 129  A   B  ..  Y   Z
+            // 129  C   D  ..  W   X
+            // 129  E   F  ..  U   V
+            // 129  G   H  ..  S   T   T   T   T   T
+            // For 10 bit and 12 bit, 127 and 129 are replaced by base -1 and base + 1.
+
+            // Get current frame pointer, width and height.
+            if (plane == 0)
+            {
+                frameWidth = xd.CurBuf.Width;
+                frameHeight = xd.CurBuf.Height;
+            }
+            else
+            {
+                frameWidth = xd.CurBuf.UvWidth;
+                frameHeight = xd.CurBuf.UvHeight;
+            }
+
+            // Get block position in current frame.
+            // NOTE(review): the shift by 3 suggests MbTo*Edge are stored in 1/8 sample units - confirm.
+            x0 = (-xd.MbToLeftEdge >> (3 + pd.SubsamplingX)) + x;
+            y0 = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY)) + y;
+
+            // NEED_LEFT
+            if (needLeft != 0)
+            {
+                if (leftAvailable != 0)
+                {
+                    if (xd.MbToBottomEdge < 0)
+                    {
+                        /* slower path if the block needs border extension */
+                        if (y0 + bs <= frameHeight)
+                        {
+                            for (i = 0; i < bs; ++i)
+                            {
+                                leftCol[i] = refr[i * refStride - 1];
+                            }
+                        }
+                        else
+                        {
+                            // Copy the rows inside the frame, then repeat the last valid one.
+                            int extendBottom = frameHeight - y0;
+                            for (i = 0; i < extendBottom; ++i)
+                            {
+                                leftCol[i] = refr[i * refStride - 1];
+                            }
+
+                            for (; i < bs; ++i)
+                            {
+                                leftCol[i] = refr[(extendBottom - 1) * refStride - 1];
+                            }
+                        }
+                    }
+                    else
+                    {
+                        /* faster path if the block does not need extension */
+                        for (i = 0; i < bs; ++i)
+                        {
+                            leftCol[i] = refr[i * refStride - 1];
+                        }
+                    }
+                }
+                else
+                {
+                    MemoryUtil.Fill(leftCol, (ushort)(baseVal + 1), bs);
+                }
+            }
+
+            // NEED_ABOVE
+            if (needAbove != 0)
+            {
+                if (upAvailable != 0)
+                {
+                    ushort* aboveRef = refr - refStride;
+                    if (xd.MbToRightEdge < 0)
+                    {
+                        /* slower path if the block needs border extension */
+                        if (x0 + bs <= frameWidth)
+                        {
+                            MemoryUtil.Copy(aboveRow, aboveRef, bs);
+                        }
+                        else if (x0 <= frameWidth)
+                        {
+                            // Copy the samples inside the frame, then repeat the last valid one.
+                            int r = frameWidth - x0;
+                            MemoryUtil.Copy(aboveRow, aboveRef, r);
+                            MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + bs - frameWidth);
+                        }
+                    }
+                    else
+                    {
+                        /* faster path if the block does not need extension */
+                        if (bs == 4 && rightAvailable != 0 && leftAvailable != 0)
+                        {
+                            // Predict straight from the reference row, skipping the copy.
+                            constAboveRow = aboveRef;
+                        }
+                        else
+                        {
+                            MemoryUtil.Copy(aboveRow, aboveRef, bs);
+                        }
+                    }
+                    aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (ushort)(baseVal + 1);
+                }
+                else
+                {
+                    MemoryUtil.Fill(aboveRow, (ushort)(baseVal - 1), bs);
+                    aboveRow[-1] = (ushort)(baseVal - 1);
+                }
+            }
+
+            // NEED_ABOVERIGHT
+            // This case fills 2 * bs samples: the row above plus its extension to the right.
+            if (needAboveRight != 0)
+            {
+                if (upAvailable != 0)
+                {
+                    ushort* aboveRef = refr - refStride;
+                    if (xd.MbToRightEdge < 0)
+                    {
+                        /* slower path if the block needs border extension */
+                        if (x0 + 2 * bs <= frameWidth)
+                        {
+                            if (rightAvailable != 0 && bs == 4)
+                            {
+                                MemoryUtil.Copy(aboveRow, aboveRef, 2 * bs);
+                            }
+                            else
+                            {
+                                MemoryUtil.Copy(aboveRow, aboveRef, bs);
+                                MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs);
+                            }
+                        }
+                        else if (x0 + bs <= frameWidth)
+                        {
+                            int r = frameWidth - x0;
+                            if (rightAvailable != 0 && bs == 4)
+                            {
+                                MemoryUtil.Copy(aboveRow, aboveRef, r);
+                                MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth);
+                            }
+                            else
+                            {
+                                MemoryUtil.Copy(aboveRow, aboveRef, bs);
+                                MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs);
+                            }
+                        }
+                        else if (x0 <= frameWidth)
+                        {
+                            int r = frameWidth - x0;
+                            MemoryUtil.Copy(aboveRow, aboveRef, r);
+                            MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth);
+                        }
+                        aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (ushort)(baseVal + 1);
+                    }
+                    else
+                    {
+                        /* faster path if the block does not need extension */
+                        if (bs == 4 && rightAvailable != 0 && leftAvailable != 0)
+                        {
+                            // Predict straight from the reference row; the local buffer is not used.
+                            constAboveRow = aboveRef;
+                        }
+                        else
+                        {
+                            MemoryUtil.Copy(aboveRow, aboveRef, bs);
+                            if (bs == 4 && rightAvailable != 0)
+                            {
+                                MemoryUtil.Copy(aboveRow + bs, aboveRef + bs, bs);
+                            }
+                            else
+                            {
+                                MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs);
+                            }
+
+                            aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (ushort)(baseVal + 1);
+                        }
+                    }
+                }
+                else
+                {
+                    MemoryUtil.Fill(aboveRow, (ushort)(baseVal - 1), bs * 2);
+                    aboveRow[-1] = (ushort)(baseVal - 1);
+                }
+            }
+
+            // Predict
+            if (mode == PredictionMode.DcPred)
+            {
+                // The DC variant depends on which neighbours exist.
+                _dcPredHigh[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd);
+            }
+            else
+            {
+                _predHigh[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol, xd.Bd);
+            }
+        }
+
+        /// <summary>
+        /// 8-bit intra prediction: gathers the left column and above row of neighbouring samples
+        /// into local buffers (substituting 129/127 or replicating the last valid sample where
+        /// neighbours are missing or lie past the frame edge), then invokes the predictor selected
+        /// by <paramref name="mode"/> and <paramref name="txSize"/>.
+        /// </summary>
+        /// <param name="xd">Supplies the edge distances, plane subsampling and frame dimensions.</param>
+        /// <param name="refr">Top-left sample of the block in the reference buffer.</param>
+        /// <param name="refStride">Stride of the reference buffer.</param>
+        /// <param name="dst">Destination block.</param>
+        /// <param name="dstStride">Stride passed through to the predictor.</param>
+        /// <param name="mode">Prediction mode; selects the required edges and the predictor row.</param>
+        /// <param name="txSize">Transform size; the block is (4 &lt;&lt; txSize) samples wide and tall.</param>
+        /// <param name="upAvailable">Non-zero when the row above is usable; also indexes the DC table, so expected to be 0 or 1.</param>
+        /// <param name="leftAvailable">Non-zero when the column to the left is usable; also indexes the DC table, so expected to be 0 or 1.</param>
+        /// <param name="rightAvailable">Non-zero when samples above-right are usable; only consulted for 4x4 blocks.</param>
+        /// <param name="x">Horizontal offset of the block, added to the position derived from <paramref name="xd"/>.</param>
+        /// <param name="y">Vertical offset of the block, added to the position derived from <paramref name="xd"/>.</param>
+        /// <param name="plane">Plane index; 0 uses the luma frame dimensions, anything else the chroma ones.</param>
+        public static unsafe void BuildIntraPredictors(
+            ref MacroBlockD xd,
+            byte* refr,
+            int refStride,
+            byte* dst,
+            int dstStride,
+            PredictionMode mode,
+            TxSize txSize,
+            int upAvailable,
+            int leftAvailable,
+            int rightAvailable,
+            int x,
+            int y,
+            int plane)
+        {
+            int i;
+            byte* leftCol = stackalloc byte[32];
+            byte* aboveData = stackalloc byte[64 + 16];
+            // Leave room before the row so that aboveRow[-1] (the top-left sample) is addressable.
+            byte* aboveRow = aboveData + 16;
+            byte* constAboveRow = aboveRow;
+            int bs = 4 << (int)txSize;
+            int frameWidth, frameHeight;
+            int x0, y0;
+            ref MacroBlockDPlane pd = ref xd.Plane[plane];
+
+            // 127 127 127 .. 127 127 127 127 127 127
+            // 129  A   B  ..  Y   Z
+            // 129  C   D  ..  W   X
+            // 129  E   F  ..  U   V
+            // 129  G   H  ..  S   T   T   T   T   T
+            // ..
+
+            // Get current frame pointer, width and height.
+            if (plane == 0)
+            {
+                frameWidth = xd.CurBuf.Width;
+                frameHeight = xd.CurBuf.Height;
+            }
+            else
+            {
+                frameWidth = xd.CurBuf.UvWidth;
+                frameHeight = xd.CurBuf.UvHeight;
+            }
+
+            // Get block position in current frame.
+            // NOTE(review): the shift by 3 suggests MbTo*Edge are stored in 1/8 sample units - confirm.
+            x0 = (-xd.MbToLeftEdge >> (3 + pd.SubsamplingX)) + x;
+            y0 = (-xd.MbToTopEdge >> (3 + pd.SubsamplingY)) + y;
+
+            // NEED_LEFT
+            if ((ExtendModes[(int)mode] & NeedLeft) != 0)
+            {
+                if (leftAvailable != 0)
+                {
+                    if (xd.MbToBottomEdge < 0)
+                    {
+                        /* Slower path if the block needs border extension */
+                        if (y0 + bs <= frameHeight)
+                        {
+                            for (i = 0; i < bs; ++i)
+                            {
+                                leftCol[i] = refr[i * refStride - 1];
+                            }
+                        }
+                        else
+                        {
+                            // Copy the rows inside the frame, then repeat the last valid one.
+                            int extendBottom = frameHeight - y0;
+                            for (i = 0; i < extendBottom; ++i)
+                            {
+                                leftCol[i] = refr[i * refStride - 1];
+                            }
+
+                            for (; i < bs; ++i)
+                            {
+                                leftCol[i] = refr[(extendBottom - 1) * refStride - 1];
+                            }
+                        }
+                    }
+                    else
+                    {
+                        /* Faster path if the block does not need extension */
+                        for (i = 0; i < bs; ++i)
+                        {
+                            leftCol[i] = refr[i * refStride - 1];
+                        }
+                    }
+                }
+                else
+                {
+                    MemoryUtil.Fill(leftCol, (byte)129, bs);
+                }
+            }
+
+            // NEED_ABOVE
+            if ((ExtendModes[(int)mode] & NeedAbove) != 0)
+            {
+                if (upAvailable != 0)
+                {
+                    byte* aboveRef = refr - refStride;
+                    if (xd.MbToRightEdge < 0)
+                    {
+                        /* Slower path if the block needs border extension */
+                        if (x0 + bs <= frameWidth)
+                        {
+                            MemoryUtil.Copy(aboveRow, aboveRef, bs);
+                        }
+                        else if (x0 <= frameWidth)
+                        {
+                            // Copy the samples inside the frame, then repeat the last valid one.
+                            int r = frameWidth - x0;
+                            MemoryUtil.Copy(aboveRow, aboveRef, r);
+                            MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + bs - frameWidth);
+                        }
+                    }
+                    else
+                    {
+                        /* Faster path if the block does not need extension */
+                        if (bs == 4 && rightAvailable != 0 && leftAvailable != 0)
+                        {
+                            // Predict straight from the reference row, skipping the copy.
+                            constAboveRow = aboveRef;
+                        }
+                        else
+                        {
+                            MemoryUtil.Copy(aboveRow, aboveRef, bs);
+                        }
+                    }
+                    aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (byte)129;
+                }
+                else
+                {
+                    MemoryUtil.Fill(aboveRow, (byte)127, bs);
+                    aboveRow[-1] = 127;
+                }
+            }
+
+            // NEED_ABOVERIGHT
+            // This case fills 2 * bs samples: the row above plus its extension to the right.
+            if ((ExtendModes[(int)mode] & NeedAboveRight) != 0)
+            {
+                if (upAvailable != 0)
+                {
+                    byte* aboveRef = refr - refStride;
+                    if (xd.MbToRightEdge < 0)
+                    {
+                        /* Slower path if the block needs border extension */
+                        if (x0 + 2 * bs <= frameWidth)
+                        {
+                            if (rightAvailable != 0 && bs == 4)
+                            {
+                                MemoryUtil.Copy(aboveRow, aboveRef, 2 * bs);
+                            }
+                            else
+                            {
+                                MemoryUtil.Copy(aboveRow, aboveRef, bs);
+                                MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs);
+                            }
+                        }
+                        else if (x0 + bs <= frameWidth)
+                        {
+                            int r = frameWidth - x0;
+                            if (rightAvailable != 0 && bs == 4)
+                            {
+                                MemoryUtil.Copy(aboveRow, aboveRef, r);
+                                MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth);
+                            }
+                            else
+                            {
+                                MemoryUtil.Copy(aboveRow, aboveRef, bs);
+                                MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs);
+                            }
+                        }
+                        else if (x0 <= frameWidth)
+                        {
+                            int r = frameWidth - x0;
+                            MemoryUtil.Copy(aboveRow, aboveRef, r);
+                            MemoryUtil.Fill(aboveRow + r, aboveRow[r - 1], x0 + 2 * bs - frameWidth);
+                        }
+                    }
+                    else
+                    {
+                        /* Faster path if the block does not need extension */
+                        if (bs == 4 && rightAvailable != 0 && leftAvailable != 0)
+                        {
+                            // Predict straight from the reference row, skipping the copy.
+                            constAboveRow = aboveRef;
+                        }
+                        else
+                        {
+                            MemoryUtil.Copy(aboveRow, aboveRef, bs);
+                            if (bs == 4 && rightAvailable != 0)
+                            {
+                                MemoryUtil.Copy(aboveRow + bs, aboveRef + bs, bs);
+                            }
+                            else
+                            {
+                                MemoryUtil.Fill(aboveRow + bs, aboveRow[bs - 1], bs);
+                            }
+                        }
+                    }
+                    aboveRow[-1] = leftAvailable != 0 ? aboveRef[-1] : (byte)129;
+                }
+                else
+                {
+                    MemoryUtil.Fill(aboveRow, (byte)127, bs * 2);
+                    aboveRow[-1] = 127;
+                }
+            }
+
+            // Predict
+            if (mode == PredictionMode.DcPred)
+            {
+                // The DC variant depends on which neighbours exist.
+                _dcPred[leftAvailable][upAvailable][(int)txSize](dst, dstStride, constAboveRow, leftCol);
+            }
+            else
+            {
+                _pred[(int)mode][(int)txSize](dst, dstStride, constAboveRow, leftCol);
+            }
+        }
+
+        /// <summary>
+        /// Works out which neighbours of a transform block are available and runs the 8-bit or
+        /// high bit depth intra predictor, depending on the current buffer's HighBd flag.
+        /// </summary>
+        /// <param name="xd">Macroblock descriptor; its AboveMi/LeftMi pointers and CurBuf are consulted.</param>
+        /// <param name="bwlIn">Log2 of the block width, in 4-sample units.</param>
+        /// <param name="txSize">Transform size of the block being predicted.</param>
+        /// <param name="mode">Intra prediction mode.</param>
+        /// <param name="refr">Top-left sample of the block in the reference buffer.</param>
+        /// <param name="refStride">Stride of the reference buffer.</param>
+        /// <param name="dst">Destination block.</param>
+        /// <param name="dstStride">Stride of the destination buffer.</param>
+        /// <param name="aoff">Column offset of the transform block, in 4-sample units.</param>
+        /// <param name="loff">Row offset of the transform block, in 4-sample units.</param>
+        /// <param name="plane">Plane index.</param>
+        public static unsafe void PredictIntraBlock(
+            ref MacroBlockD xd,
+            int bwlIn,
+            TxSize txSize,
+            PredictionMode mode,
+            byte* refr,
+            int refStride,
+            byte* dst,
+            int dstStride,
+            int aoff,
+            int loff,
+            int plane)
+        {
+            // A neighbour exists when the transform block is not on the first row/column of its
+            // parent block, or when the parent itself has a neighbouring mode info.
+            bool topAvailable = loff != 0 || !xd.AboveMi.IsNull;
+            bool leftAvailable = aoff != 0 || !xd.LeftMi.IsNull;
+            bool rightAvailable = (aoff + (1 << (int)txSize)) < (1 << bwlIn);
+
+            int haveTop = topAvailable ? 1 : 0;
+            int haveLeft = leftAvailable ? 1 : 0;
+            int haveRight = rightAvailable ? 1 : 0;
+
+            // Offsets are given in 4-sample units.
+            int x = aoff * 4;
+            int y = loff * 4;
+
+            if (xd.CurBuf.HighBd)
+            {
+                BuildIntraPredictorsHigh(ref xd, refr, refStride, dst, dstStride, mode, txSize, haveTop, haveLeft, haveRight, x, y, plane);
+            }
+            else
+            {
+                BuildIntraPredictors(ref xd, refr, refStride, dst, dstStride, mode, txSize, haveTop, haveLeft, haveRight, x, y, plane);
+            }
+        }
+    }
+}

+ 20 - 0
Ryujinx.Graphics.Nvdec.Vp9/Ryujinx.Graphics.Nvdec.Vp9.csproj

@@ -0,0 +1,20 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+  <PropertyGroup>
+    <TargetFramework>netcoreapp3.1</TargetFramework>
+    <!-- This project declares unsafe members (pointer parameters, unsafe delegates), so unsafe
+         code has to be allowed for every configuration/platform combination, not only for
+         Debug|AnyCPU and Release|AnyCPU. -->
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+  </PropertyGroup>
+
+  <ItemGroup>
+    <ProjectReference Include="..\Ryujinx.Common\Ryujinx.Common.csproj" />
+    <ProjectReference Include="..\Ryujinx.Graphics.Video\Ryujinx.Graphics.Video.csproj" />
+  </ItemGroup>
+
+</Project>

+ 10 - 0
Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs

@@ -0,0 +1,10 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    // Pairs a byte buffer reference (Data) with an integer Size.
+    // NOTE(review): presumably describes the compressed data of a single tile and its
+    // length in bytes - confirm against the code that fills it.
+    internal struct TileBuffer
+    {
+        public ArrayPtr<byte> Data;
+        public int Size;
+    }
+}

+ 15 - 0
Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs

@@ -0,0 +1,15 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using Ryujinx.Graphics.Video;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9
+{
+    // Bundles the state used while decoding: a bit reader, a MacroBlockD context and a
+    // 32x32 coefficient buffer.
+    // NOTE(review): presumably one instance exists per tile worker - confirm against callers.
+    internal struct TileWorkerData
+    {
+        public Reader BitReader;
+        public MacroBlockD Xd;
+        /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+        public Array32<Array32<int>> Dqcoeff;
+    }
+}

+ 10 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/BModeInfo.cs

@@ -0,0 +1,10 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Per sub-block mode information: a prediction mode plus two motion vectors.
+    // ModeInfo stores four of these (ModeInfo.Bmi) and reads them when SbType < Block8x8.
+    internal struct BModeInfo
+    {
+        public PredictionMode Mode;
+        public Array2<Mv> Mv;  // First, second inter predictor motion vectors
+    }
+}

+ 21 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/BlockSize.cs

@@ -0,0 +1,21 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Block size identifiers.
+    // The numeric order is significant: other code compares values with relational operators
+    // (e.g. "SbType < BlockSize.Block8x8") and casts them to int to index lookup tables.
+    // NOTE(review): names appear to follow a WidthxHeight convention - confirm.
+    internal enum BlockSize
+    {
+        Block4x4 = 0,
+        Block4x8 = 1,
+        Block8x4 = 2,
+        Block8x8 = 3,
+        Block8x16 = 4,
+        Block16x8 = 5,
+        Block16x16 = 6,
+        Block16x32 = 7,
+        Block32x16 = 8,
+        Block32x32 = 9,
+        Block32x64 = 10,
+        Block64x32 = 11,
+        Block64x64 = 12,
+        // Number of real block sizes above (0..12)
+        BlockSizes = 13,
+        // Sentinel, shares its value with BlockSizes
+        BlockInvalid = BlockSizes
+    }
+}

+ 10 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/Buf2D.cs

@@ -0,0 +1,10 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // A byte buffer reference together with a stride value.
+    // NOTE(review): presumably Stride is the distance between consecutive rows of Buf - confirm.
+    internal struct Buf2D
+    {
+        public ArrayPtr<byte> Buf;
+        public int Stride;
+    }
+}

+ 8 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/FrameType.cs

@@ -0,0 +1,8 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Frame type identifiers: key frame (0) or inter frame (1).
+    internal enum FrameType
+    {
+        KeyFrame = 0,
+        InterFrame = 1
+    }
+}

+ 27 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilter.cs

@@ -0,0 +1,27 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Loop filter state: current and "last" copies of the filter level, sharpness level and
+    // the reference/mode delta tables, plus a reference to an array of LoopFilterMask entries.
+    // NOTE(review): the "Last*" fields presumably hold the values used for the previous
+    // frame so that changes can be detected - confirm against the code that updates them.
+    internal struct LoopFilter
+    {
+        public int FilterLevel;
+        public int LastFiltLevel;
+
+        public int SharpnessLevel;
+        public int LastSharpnessLevel;
+
+        public bool ModeRefDeltaEnabled;
+        public bool ModeRefDeltaUpdate;
+
+        // 0 = Intra, Last, GF, ARF
+        public Array4<sbyte> RefDeltas;
+        public Array4<sbyte> LastRefDeltas;
+
+        // 0 = ZERO_MV, MV
+        public Array2<sbyte> ModeDeltas;
+        public Array2<sbyte> LastModeDeltas;
+
+        // Mask array and its stride.
+        // NOTE(review): presumably LfmStride is the number of masks per row - confirm.
+        public ArrayPtr<LoopFilterMask> Lfm;
+        public int LfmStride;
+    }
+}

+ 10 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterInfoN.cs

@@ -0,0 +1,10 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Loop filter lookup data: 64 LoopFilterThresh entries and an 8 x 4 x 2 table of byte levels.
+    // NOTE(review): presumably Lfthr is indexed by filter level and Lvl by
+    // [segment][reference frame][mode] - confirm against the code that fills them.
+    internal struct LoopFilterInfoN
+    {
+        public Array64<LoopFilterThresh> Lfthr;
+        public Array8<Array4<Array2<byte>>> Lvl;
+    }
+}

+ 24 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterMask.cs

@@ -0,0 +1,24 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // This structure holds bit masks for all 8x8 blocks in a 64x64 region.
+    // Each 1 bit represents a position in which we want to apply the loop filter.
+    // Left* entries refer to whether we apply a filter on the border to the
+    // left of the block. Above* entries refer to whether or not to apply a
+    // filter on the above border. Int* entries refer to whether or not to
+    // apply borders on the 4x4 edges within the 8x8 block that each bit
+    // represents.
+    // Since each transform is accompanied by a potentially different type of
+    // loop filter there is a different entry in the array for each transform size.
+    internal struct LoopFilterMask
+    {
+        // Y plane masks are 64 bits wide, one four-entry array per edge direction
+        public Array4<ulong> LeftY;
+        public Array4<ulong> AboveY;
+        public ulong Int4x4Y;
+        // UV plane masks are 16 bits wide
+        public Array4<ushort> LeftUv;
+        public Array4<ushort> AboveUv;
+        public ushort Int4x4Uv;
+        // NOTE(review): presumably the filter level of each 8x8 block of the Y plane - confirm.
+        public Array64<byte> LflY;
+    }
+}

+ 13 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/LoopFilterThresh.cs

@@ -0,0 +1,13 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Need to align this structure so when it is declared and
+    // passed it can be loaded into vector registers.
+    // NOTE(review): no explicit alignment or layout attribute is applied here; each of the
+    // three fields is a 16-byte array, so the struct is 48 bytes of data - confirm whether
+    // any consumer actually relies on 16-byte alignment.
+    internal struct LoopFilterThresh
+    {
+        public Array16<byte> Mblim;
+        public Array16<byte> Lim;
+        public Array16<byte> HevThr;
+    }
+}

+ 179 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockD.cs

@@ -0,0 +1,179 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Video;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    internal struct MacroBlockD
+    {
+        public Array3<MacroBlockDPlane> Plane;
+        public byte BmodeBlocksWl;
+        public byte BmodeBlocksHl;
+
+        public Ptr<Vp9BackwardUpdates> Counts;
+        public TileInfo Tile;
+
+        public int MiStride;
+
+        // Grid of 8x8 cells is placed over the block.
+        // If some of them belong to the same mbtree-block
+        // they will just have same mi[i][j] value
+        public ArrayPtr<Ptr<ModeInfo>> Mi;
+        public Ptr<ModeInfo> LeftMi;
+        public Ptr<ModeInfo> AboveMi;
+
+        public uint MaxBlocksWide;
+        public uint MaxBlocksHigh;
+
+        public ArrayPtr<Array3<byte>> PartitionProbs;
+
+        /* Distance of MB away from frame edges */
+        public int MbToLeftEdge;
+        public int MbToRightEdge;
+        public int MbToTopEdge;
+        public int MbToBottomEdge;
+
+        public Ptr<Vp9EntropyProbs> Fc;
+
+        /* pointers to reference frames */
+        public Array2<Ptr<RefBuffer>> BlockRefs;
+
+        /* pointer to current frame */
+        public Surface CurBuf;
+
+        public Array3<ArrayPtr<sbyte>> AboveContext;
+        public Array3<Array16<sbyte>> LeftContext;
+
+        public ArrayPtr<sbyte> AboveSegContext;
+        public Array8<sbyte> LeftSegContext;
+
+        /* Bit depth: 8, 10, 12 */
+        public int Bd;
+
+        public bool Lossless;
+        public bool Corrupted;
+
+        public Ptr<InternalErrorInfo> ErrorInfo;
+
+        // Sums the SegIdPredicted flags of the above and left neighbours.
+        // A missing neighbour contributes 0.
+        public int GetPredContextSegId()
+        {
+            int context = 0;
+
+            if (!AboveMi.IsNull)
+            {
+                context += AboveMi.Value.SegIdPredicted;
+            }
+
+            if (!LeftMi.IsNull)
+            {
+                context += LeftMi.Value.SegIdPredicted;
+            }
+
+            return context;
+        }
+
+        // Sums the Skip values of the above and left neighbours.
+        // A missing neighbour contributes 0.
+        public int GetSkipContext()
+        {
+            int context = 0;
+
+            if (!AboveMi.IsNull)
+            {
+                context += AboveMi.Value.Skip;
+            }
+
+            if (!LeftMi.IsNull)
+            {
+                context += LeftMi.Value.Skip;
+            }
+
+            return context;
+        }
+
+        // Derives a context from the interpolation filters of the left and above neighbours.
+        // A missing neighbour counts as Constants.SwitchableFilters. When both agree, or only
+        // one of them carries a real filter, that filter is returned; otherwise the result is
+        // Constants.SwitchableFilters.
+        public int GetPredContextSwitchableInterp()
+        {
+            // Note:
+            // The mode info data structure has a one element border above and to the
+            // left of the entries corresponding to real macroblocks.
+            // The prediction flags in these dummy entries are initialized to 0.
+            int none = Constants.SwitchableFilters;
+            int leftType = LeftMi.IsNull ? none : LeftMi.Value.InterpFilter;
+            int aboveType = AboveMi.IsNull ? none : AboveMi.Value.InterpFilter;
+
+            // Same filter on both sides, or nothing usable above: the left value decides.
+            if (leftType == aboveType || aboveType == none)
+            {
+                return leftType;
+            }
+
+            // Nothing usable on the left: the above value decides.
+            if (leftType == none)
+            {
+                return aboveType;
+            }
+
+            // Two different real filters.
+            return none;
+        }
+
+        // The mode info data structure has a one element border above and to the
+        // left of the entries corresponding to real macroblocks.
+        // The prediction flags in these dummy entries are initialized to 0.
+        // Result (above/left, "--" meaning not available):
+        // 0 - inter/inter, inter/--, --/inter, --/--
+        // 1 - intra/inter, inter/intra
+        // 2 - intra/--, --/intra
+        // 3 - intra/intra
+        public int GetIntraInterContext()
+        {
+            bool hasAbove = !AboveMi.IsNull;
+            bool hasLeft = !LeftMi.IsNull;
+
+            if (hasAbove && hasLeft)
+            {
+                // Both edges available
+                bool aboveIntra = !AboveMi.Value.IsInterBlock();
+                bool leftIntra = !LeftMi.Value.IsInterBlock();
+
+                if (aboveIntra && leftIntra)
+                {
+                    return 3;
+                }
+
+                return (aboveIntra || leftIntra) ? 1 : 0;
+            }
+
+            // At most one edge available from here on
+            if (hasAbove)
+            {
+                return AboveMi.Value.IsInterBlock() ? 0 : 2;
+            }
+
+            if (hasLeft)
+            {
+                return LeftMi.Value.IsInterBlock() ? 0 : 2;
+            }
+
+            return 0;
+        }
+
+        // Returns a context number (0 or 1) for the transform size of the current block.
+        // Each neighbour contributes its TxSize when it exists and is not skipped, and the
+        // maximum transform size of the current block otherwise. A missing neighbour then
+        // takes over the value of the other side.
+        // The mode info data structure has a one element border above and to the
+        // left of the entries corresponding to real blocks.
+        // The prediction flags in these dummy entries are initialized to 0.
+        public int GetTxSizeContext()
+        {
+            int maxTxSize = (int)Luts.MaxTxSizeLookup[(int)Mi[0].Value.SbType];
+            bool hasAbove = !AboveMi.IsNull;
+            bool hasLeft = !LeftMi.IsNull;
+
+            int aboveCtx = maxTxSize;
+            if (hasAbove && AboveMi.Value.Skip == 0)
+            {
+                aboveCtx = (int)AboveMi.Value.TxSize;
+            }
+
+            int leftCtx = maxTxSize;
+            if (hasLeft && LeftMi.Value.Skip == 0)
+            {
+                leftCtx = (int)LeftMi.Value.TxSize;
+            }
+
+            if (!hasLeft)
+            {
+                leftCtx = aboveCtx;
+            }
+
+            if (!hasAbove)
+            {
+                aboveCtx = leftCtx;
+            }
+
+            int sum = aboveCtx + leftCtx;
+
+            return sum > maxTxSize ? 1 : 0;
+        }
+
+        // Stores the subsampling shifts on every plane: plane 0 always gets 0/0,
+        // all other planes get ssX/ssY.
+        public void SetupBlockPlanes(int ssX, int ssY)
+        {
+            for (int plane = 0; plane < Constants.MaxMbPlane; plane++)
+            {
+                if (plane == 0)
+                {
+                    Plane[plane].SubsamplingX = 0;
+                    Plane[plane].SubsamplingY = 0;
+                }
+                else
+                {
+                    Plane[plane].SubsamplingX = ssX;
+                    Plane[plane].SubsamplingY = ssY;
+                }
+            }
+        }
+
+        // Points each plane's AboveContext/LeftContext at the entries that belong to the
+        // block at (miRow, miCol). Offsets are miCol * 2 and (miRow * 2) & 15, shifted right
+        // by the plane's subsampling.
+        public void SetSkipContext(int miRow, int miCol)
+        {
+            int aboveStart = miCol * 2;
+            int leftStart = (miRow * 2) & 15;
+
+            for (int plane = 0; plane < Constants.MaxMbPlane; ++plane)
+            {
+                ref MacroBlockDPlane pd = ref Plane[plane];
+
+                int aboveOffset = aboveStart >> pd.SubsamplingX;
+                int leftOffset = leftStart >> pd.SubsamplingY;
+
+                pd.AboveContext = AboveContext[plane].Slice(aboveOffset);
+                // The left context array has 16 entries, expose only what remains past the offset
+                pd.LeftContext = new ArrayPtr<sbyte>(ref LeftContext[plane][leftOffset], 16 - leftOffset);
+            }
+        }
+
+        // Updates the distances from the block at (miRow, miCol) with size (bw, bh) to the
+        // four frame edges, and refreshes the above/left neighbour pointers.
+        internal void SetMiRowCol(ref TileInfo tile, int miRow, int bh, int miCol, int bw, int miRows, int miCols)
+        {
+            int rowsBelow = miRows - bh - miRow;
+            int colsRight = miCols - bw - miCol;
+
+            MbToTopEdge = -((miRow * Constants.MiSize) * 8);
+            MbToBottomEdge = (rowsBelow * Constants.MiSize) * 8;
+            MbToLeftEdge = -((miCol * Constants.MiSize) * 8);
+            MbToRightEdge = (colsRight * Constants.MiSize) * 8;
+
+            // Are edges available for intra prediction?
+            // Above exists for every row but the first one.
+            if (miRow != 0)
+            {
+                AboveMi = Mi[-MiStride];
+            }
+            else
+            {
+                AboveMi = Ptr<ModeInfo>.Null;
+            }
+
+            // Left exists only past the first column of the tile.
+            if (miCol > tile.MiColStart)
+            {
+                LeftMi = Mi[-1];
+            }
+            else
+            {
+                LeftMi = Ptr<ModeInfo>.Null;
+            }
+        }
+    }
+}

+ 21 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/MacroBlockDPlane.cs

@@ -0,0 +1,21 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Per-plane decoding state held by MacroBlockD (one entry per plane in MacroBlockD.Plane).
+    internal struct MacroBlockDPlane
+    {
+        public ArrayPtr<int> DqCoeff;
+        // Used as right-shift amounts, see MacroBlockD.SetSkipContext
+        public int SubsamplingX;
+        public int SubsamplingY;
+        public Buf2D Dst;
+        public Array2<Buf2D> Pre;
+        // Assigned by MacroBlockD.SetSkipContext
+        public ArrayPtr<sbyte> AboveContext;
+        public ArrayPtr<sbyte> LeftContext;
+        // NOTE(review): presumably [segment][DC/AC] dequantization values - confirm.
+        public Array8<Array2<short>> SegDequant;
+
+        // Number of 4x4s in current block
+        public ushort N4W, N4H;
+        // Log2 of N4W, N4H
+        public byte N4Wl, N4Hl;
+    }
+}

+ 66 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/ModeInfo.cs

@@ -0,0 +1,66 @@
+using Ryujinx.Common.Memory;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Mode information of one block: block size, prediction modes, transform size,
+    // reference frames and motion vectors, plus per sub-block data for blocks below 8x8.
+    internal struct ModeInfo
+    {
+        // Common for both Inter and Intra blocks
+        public BlockSize SbType;
+        public PredictionMode Mode;
+        public TxSize TxSize;
+        public sbyte Skip;
+        public sbyte SegmentId;
+        public sbyte SegIdPredicted;  // Valid only when TemporalUpdate is enabled
+
+        // Only for Intra blocks
+        public PredictionMode UvMode;
+
+        // Only for Inter blocks
+        public byte InterpFilter;
+
+        // if ref_frame[idx] is equal to AltRefFrame then
+        // MacroBlockD.BlockRef[idx] is an altref
+        public Array2<sbyte> RefFrame;
+
+        public Array2<Mv> Mv;
+
+        public Array4<BModeInfo> Bmi;
+
+        // Maps [blockIdx][searchCol == 0 ? 1 : 0] to an index into Bmi, see GetSubBlockMv
+        private static readonly int[][] IdxNColumnToSubblock = new int[][]
+        {
+            new int[] { 1, 2 },
+            new int[] { 1, 3 },
+            new int[] { 3, 2 },
+            new int[] { 3, 3 }
+        };
+
+        // Returns the luma prediction mode: the sub-block mode for blocks smaller
+        // than 8x8, the block mode otherwise.
+        public PredictionMode GetYMode(int block)
+        {
+            if (SbType >= BlockSize.Block8x8)
+            {
+                return Mode;
+            }
+
+            return Bmi[block].Mode;
+        }
+
+        // Looks up the transform size for the given plane from the block size,
+        // the block's transform size and the plane's subsampling.
+        public TxSize GetUvTxSize(ref MacroBlockDPlane pd)
+        {
+            int sbType = (int)SbType;
+
+            Debug.Assert(SbType < BlockSize.Block8x8 ||
+                Luts.SsSizeLookup[sbType][pd.SubsamplingX][pd.SubsamplingY] != BlockSize.BlockInvalid);
+
+            return Luts.UvTxsizeLookup[sbType][(int)TxSize][pd.SubsamplingX][pd.SubsamplingY];
+        }
+
+        // True when the first reference frame is above Constants.IntraFrame
+        public bool IsInterBlock() => RefFrame[0] > Constants.IntraFrame;
+
+        // True when the second reference frame is above Constants.IntraFrame
+        public bool HasSecondRef() => RefFrame[1] > Constants.IntraFrame;
+
+        // Returns the motion vector of the appropriate sub-block when blockIdx is
+        // non-negative and the block is smaller than 8x8, or the block's own motion
+        // vector otherwise.
+        public Mv GetSubBlockMv(int whichMv, int searchCol, int blockIdx)
+        {
+            if (blockIdx < 0 || SbType >= BlockSize.Block8x8)
+            {
+                return Mv[whichMv];
+            }
+
+            int column = searchCol == 0 ? 1 : 0;
+            int subBlock = IdxNColumnToSubblock[blockIdx][column];
+
+            return Bmi[subBlock].Mv[whichMv];
+        }
+    }
+}

+ 14 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/MotionVectorContext.cs

@@ -0,0 +1,14 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Motion vector context identifiers.
+    // Values 0..6 are contiguous; InvalidCase is 9, so 7 and 8 are unused.
+    internal enum MotionVectorContext
+    {
+        BothZero = 0,
+        ZeroPlusPredicted = 1,
+        BothPredicted = 2,
+        NewPlusNonIntra = 3,
+        BothNew = 4,
+        IntraPlusNonIntra = 5,
+        BothIntra = 6,
+        InvalidCase = 9
+    }
+}

+ 189 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/Mv.cs

@@ -0,0 +1,189 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Video;
+using System;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    internal struct Mv
+    {
+        public short Row;
+        public short Col;
+
+        private static readonly byte[] LogInBase2 = new byte[]
+        {
+            0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+            4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+            5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+            8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+            9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10
+        };
+
+        // Returns true when both components are small enough (magnitude below 64)
+        // for high-precision 1/8 motion vectors to be used.
+        public bool UseMvHp()
+        {
+            const int kMvRefThresh = 64;  // Threshold for use of high-precision 1/8 mv
+
+            // Widen to int before taking the absolute value: Math.Abs(short) throws
+            // OverflowException for short.MinValue, which a corrupted stream could produce.
+            return Math.Abs((int)Row) < kMvRefThresh && Math.Abs((int)Col) < kMvRefThresh;
+        }
+
+        // True for joint types whose vertical component is non-zero (Hzvnz and Hnzvnz)
+        public static bool MvJointVertical(MvJointType type)
+        {
+            switch (type)
+            {
+                case MvJointType.MvJointHzvnz:
+                case MvJointType.MvJointHnzvnz:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        // True for joint types whose horizontal component is non-zero (Hnzvz and Hnzvnz)
+        public static bool MvJointHorizontal(MvJointType type)
+        {
+            switch (type)
+            {
+                case MvJointType.MvJointHnzvz:
+                case MvJointType.MvJointHnzvnz:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        // Returns the first magnitude covered by class c: 0 for class 0,
+        // Constants.Class0Size << (c + 2) for every other class.
+        private static int MvClassBase(MvClassType c)
+        {
+            if (c == MvClassType.MvClass0)
+            {
+                return 0;
+            }
+
+            return Constants.Class0Size << ((int)c + 2);
+        }
+
+        // Classifies the magnitude z. Values at or above Constants.Class0Size * 4096 map to
+        // class 10, anything else is looked up in LogInBase2 using z >> 3.
+        // When offset is not null it receives z minus the base of the selected class.
+        private static MvClassType GetMvClass(int z, Ptr<int> offset)
+        {
+            MvClassType mvClass;
+
+            if (z >= Constants.Class0Size * 4096)
+            {
+                mvClass = MvClassType.MvClass10;
+            }
+            else
+            {
+                mvClass = (MvClassType)LogInBase2[z >> 3];
+            }
+
+            if (!offset.IsNull)
+            {
+                offset.Value = z - MvClassBase(mvClass);
+            }
+
+            return mvClass;
+        }
+
+        // Adds incr to the counters that describe the non-zero motion vector component v:
+        // sign, class, and the integer / fractional / high precision parts of the offset
+        // inside the class. comp selects which component's counters are updated.
+        private static void IncMvComponent(int v, ref Vp9BackwardUpdates counts, int comp, int incr, int usehp)
+        {
+            Debug.Assert(v != 0); /* Should not be zero */
+
+            int sign = v < 0 ? 1 : 0;
+            counts.Sign[comp][sign] += (uint)incr;
+
+            int magnitudeMinusOne = (sign != 0 ? -v : v) - 1; /* Magnitude - 1 */
+
+            int offset = 0;
+            int mvClass = (int)GetMvClass(magnitudeMinusOne, new Ptr<int>(ref offset));
+            counts.Classes[comp][mvClass] += (uint)incr;
+
+            int intPart = offset >> 3;        /* Int mv data */
+            int fracPart = (offset >> 1) & 3; /* Fractional pel mv data */
+            int hpBit = offset & 1;           /* High precision mv data */
+
+            if (mvClass != (int)MvClassType.MvClass0)
+            {
+                int bitCount = mvClass + Constants.Class0Bits - 1;  // Number of bits
+
+                for (int bit = 0; bit < bitCount; ++bit)
+                {
+                    counts.Bits[comp][bit][(intPart >> bit) & 1] += (uint)incr;
+                }
+
+                counts.Fp[comp][fracPart] += (uint)incr;
+                counts.Hp[comp][hpBit] += (uint)(usehp * incr);
+            }
+            else
+            {
+                counts.Class0[comp][intPart] += (uint)incr;
+                counts.Class0Fp[comp][intPart][fracPart] += (uint)incr;
+                counts.Class0Hp[comp][hpBit] += (uint)(usehp * incr);
+            }
+        }
+
+        // Classifies this vector by which of its components are non-zero
+        private MvJointType GetMvJoint()
+        {
+            bool hasRow = Row != 0;
+            bool hasCol = Col != 0;
+
+            if (hasRow)
+            {
+                return hasCol ? MvJointType.MvJointHnzvnz : MvJointType.MvJointHzvnz;
+            }
+
+            return hasCol ? MvJointType.MvJointHnzvz : MvJointType.MvJointZero;
+        }
+
+        // Records this motion vector in the given counters: the joint type is always counted,
+        // each non-zero component is then counted through IncMvComponent.
+        // Does nothing when counts is null.
+        internal void IncMv(Ptr<Vp9BackwardUpdates> counts)
+        {
+            if (counts.IsNull)
+            {
+                return;
+            }
+
+            MvJointType joint = GetMvJoint();
+            ++counts.Value.Joints[(int)joint];
+
+            bool countRow = MvJointVertical(joint);
+            if (countRow)
+            {
+                IncMvComponent(Row, ref counts.Value, 0, 1, 1);
+            }
+
+            bool countCol = MvJointHorizontal(joint);
+            if (countCol)
+            {
+                IncMvComponent(Col, ref counts.Value, 1, 1, 1);
+            }
+        }
+
+        // Restricts Col to [minCol, maxCol] and Row to [minRow, maxRow].
+        // Unlike Math.Clamp, this never throws when a caller passes an inverted range
+        // (min > max); for valid ranges the result is the same.
+        public void ClampMv(int minCol, int maxCol, int minRow, int maxRow)
+        {
+            Col = (short)ClampComponent(Col, minCol, maxCol);
+            Row = (short)ClampComponent(Row, minRow, maxRow);
+        }
+
+        // Returns low when value is below it, else high when value is above it, else value
+        private static int ClampComponent(int value, int low, int high)
+        {
+            return value < low ? low : (value > high ? high : value);
+        }
+
+        private const int MvBorder = 16 << 3;  // 16 pels expressed in 1/8th pel units
+
+        // Clamps this vector to the frame edge distances stored in xd,
+        // extended by MvBorder on every side.
+        public void ClampMvRef(ref MacroBlockD xd)
+        {
+            int minCol = xd.MbToLeftEdge - MvBorder;
+            int maxCol = xd.MbToRightEdge + MvBorder;
+            int minRow = xd.MbToTopEdge - MvBorder;
+            int maxRow = xd.MbToBottomEdge + MvBorder;
+
+            ClampMv(minCol, maxCol, minRow, maxRow);
+        }
+
+        // Unless high precision is both allowed and usable for this vector,
+        // moves every odd component one step towards zero so that it becomes even.
+        public void LowerMvPrecision(bool allowHP)
+        {
+            if (allowHP && UseMvHp())
+            {
+                return;
+            }
+
+            bool rowIsOdd = (Row & 1) != 0;
+            if (rowIsOdd)
+            {
+                Row = (short)(Row + (Row > 0 ? -1 : 1));
+            }
+
+            bool colIsOdd = (Col & 1) != 0;
+            if (colIsOdd)
+            {
+                Col = (short)(Col + (Col > 0 ? -1 : 1));
+            }
+        }
+    }
+}

+ 8 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/Mv32.cs

@@ -0,0 +1,8 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Row/column pair with 32-bit integer components
+    // (the Mv struct stores the same pair as 16-bit values).
+    internal struct Mv32
+    {
+        public int Row;
+        public int Col;
+    }
+}

+ 17 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/MvClassType.cs

@@ -0,0 +1,17 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Motion vector magnitude classes. The numeric value is used in arithmetic
+    // (see Mv.MvClassBase), so the explicit numbering must be kept.
+    internal enum MvClassType
+    {
+        MvClass0 = 0,   /* (0, 2]     integer pel */
+        MvClass1 = 1,   /* (2, 4]     integer pel */
+        MvClass2 = 2,   /* (4, 8]     integer pel */
+        MvClass3 = 3,   /* (8, 16]    integer pel */
+        MvClass4 = 4,   /* (16, 32]   integer pel */
+        MvClass5 = 5,   /* (32, 64]   integer pel */
+        MvClass6 = 6,   /* (64, 128]  integer pel */
+        MvClass7 = 7,   /* (128, 256] integer pel */
+        MvClass8 = 8,   /* (256, 512] integer pel */
+        MvClass9 = 9,   /* (512, 1024] integer pel */
+        MvClass10 = 10, /* (1024,2048] integer pel */
+    }
+}

+ 10 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/MvJointType.cs

@@ -0,0 +1,10 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Describes which components of a motion vector are non-zero.
+    // In the names, "H"/"V" stand for horizontal/vertical and "z"/"nz" for zero/non-zero.
+    internal enum MvJointType
+    {
+        MvJointZero = 0,   /* Zero vector */
+        MvJointHnzvz = 1,  /* Vert zero, hor nonzero */
+        MvJointHzvnz = 2,  /* Hor zero, vert nonzero */
+        MvJointHnzvnz = 3, /* Both components nonzero */
+    }
+}

+ 10 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/MvRef.cs

@@ -0,0 +1,10 @@
+using Ryujinx.Common.Memory;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Two motion vectors together with two reference frame identifiers.
+    // NOTE(review): presumably Mv[i] belongs to RefFrame[i] - confirm against users.
+    internal struct MvRef
+    {
+        public Array2<Mv> Mv;
+        public Array2<sbyte> RefFrame;
+    }
+}

+ 12 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/PartitionType.cs

@@ -0,0 +1,12 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Partition type identifiers. Values are assigned implicitly starting at 0.
+    internal enum PartitionType
+    {
+        PartitionNone,
+        PartitionHorz,
+        PartitionVert,
+        PartitionSplit,
+        // Number of real partition types above (4)
+        PartitionTypes,
+        // Sentinel, shares its value with PartitionTypes
+        PartitionInvalid = PartitionTypes
+    }
+}

+ 9 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/PlaneType.cs

@@ -0,0 +1,9 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Plane type identifiers: Y (0) and Uv (1).
+    internal enum PlaneType
+    {
+        Y = 0,
+        Uv = 1,
+        // Number of plane types above (implicitly 2)
+        PlaneTypes
+    }
+}

+ 14 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/Position.cs

@@ -0,0 +1,14 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Simple row/column pair
+    internal struct Position
+    {
+        public int Row;
+        public int Col;
+
+        // Stores the given row and column
+        public Position(int row, int col) => (Row, Col) = (row, col);
+    }
+}

+ 21 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/PredictionMode.cs

@@ -0,0 +1,21 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Prediction mode identifiers. Values 0..9 are the intra prediction modes described by
+    // the comments below, values 10..13 are the motion vector based modes.
+    // The numeric value is used as an array index elsewhere (cast to int), so the explicit
+    // numbering must be kept.
+    internal enum PredictionMode
+    {
+        DcPred = 0,    // Average of above and left pixels
+        VPred = 1,     // Vertical
+        HPred = 2,     // Horizontal
+        D45Pred = 3,   // Directional 45  deg = round(arctan(1 / 1) * 180 / pi)
+        D135Pred = 4,  // Directional 135 deg = 180 - 45
+        D117Pred = 5,  // Directional 117 deg = 180 - 63
+        D153Pred = 6,  // Directional 153 deg = 180 - 27
+        D207Pred = 7,  // Directional 207 deg = 180 + 27
+        D63Pred = 8,   // Directional 63  deg = round(arctan(2 / 1) * 180 / pi)
+        TmPred = 9,    // True-motion
+        NearestMv = 10,
+        NearMv = 11,
+        ZeroMv = 12,
+        NewMv = 13,
+        // Number of modes above (0..13)
+        MbModeCount = 14
+    }
+}

+ 8 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/RefBuffer.cs

@@ -0,0 +1,8 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // A surface together with a set of scale factors.
+    // MacroBlockD.BlockRefs holds pointers to these ("pointers to reference frames").
+    internal struct RefBuffer
+    {
+        public Surface Buf;
+        public ScaleFactors Sf;
+    }
+}

+ 10 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/ReferenceMode.cs

@@ -0,0 +1,10 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    // Reference mode identifiers.
+    internal enum ReferenceMode
+    {
+        SingleReference = 0,
+        CompoundReference = 1,
+        ReferenceModeSelect = 2,
+        // Number of reference modes above (0..2)
+        ReferenceModes = 3
+    }
+}

+ 451 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/ScaleFactors.cs

@@ -0,0 +1,451 @@
+using Ryujinx.Common.Memory;
+using System.Runtime.CompilerServices;
+using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Convolve;
+using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    /// <summary>
+    /// Fixed point scale factors between another ("other") frame size and the current ("this") frame size,
+    /// plus the dispatch of the convolution function matching those factors.
+    /// </summary>
+    internal struct ScaleFactors
+    {
+        // Number of fractional bits of XScaleFP/YScaleFP.
+        private const int RefScaleShift = 14;
+
+        // Fixed point representation of a 1:1 scale.
+        private const int RefNoScale = 1 << RefScaleShift;
+
+        // Marker stored in XScaleFP/YScaleFP when the frame sizes can't be used together.
+        private const int RefInvalidScale = -1;
+
+        // Value of XStepQ4/YStepQ4 for which the unscaled convolution functions are selected.
+        private const int NoScaleStepQ4 = 16;
+
+        /// <summary>
+        /// Signature shared by the 8-bit convolution functions stored on the prediction tables.
+        /// </summary>
+        private unsafe delegate void ConvolveFn(
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            Array8<short>[] filter,
+            int x0Q4,
+            int xStepQ4,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h);
+
+        /// <summary>
+        /// Signature shared by the high bit depth convolution functions stored on the prediction tables.
+        /// </summary>
+        private unsafe delegate void HighbdConvolveFn(
+            ushort* src,
+            int srcStride,
+            ushort* dst,
+            int dstStride,
+            Array8<short>[] filter,
+            int x0Q4,
+            int xStepQ4,
+            int y0Q4,
+            int yStepQ4,
+            int w,
+            int h,
+            int bd);
+
+        // All tables below are indexed as [horiz][vert][avg].
+
+        // XStepQ4 == 16 and YStepQ4 == 16.
+        private static readonly unsafe ConvolveFn[][][] PredictX16Y16 =
+        {
+            new ConvolveFn[][]
+            {
+                new ConvolveFn[] { ConvolveCopy, ConvolveAvg },
+                new ConvolveFn[] { Convolve8Vert, Convolve8AvgVert }
+            },
+            new ConvolveFn[][]
+            {
+                new ConvolveFn[] { Convolve8Horiz, Convolve8AvgHoriz },
+                new ConvolveFn[] { Convolve8, Convolve8Avg }
+            }
+        };
+
+        // XStepQ4 == 16 only.
+        private static readonly unsafe ConvolveFn[][][] PredictX16 =
+        {
+            new ConvolveFn[][]
+            {
+                new ConvolveFn[] { ScaledVert, ScaledAvgVert },
+                new ConvolveFn[] { ScaledVert, ScaledAvgVert }
+            },
+            new ConvolveFn[][]
+            {
+                new ConvolveFn[] { Scaled2D, ScaledAvg2D },
+                new ConvolveFn[] { Scaled2D, ScaledAvg2D }
+            }
+        };
+
+        // YStepQ4 == 16 only.
+        private static readonly unsafe ConvolveFn[][][] PredictY16 =
+        {
+            new ConvolveFn[][]
+            {
+                new ConvolveFn[] { ScaledHoriz, ScaledAvgHoriz },
+                new ConvolveFn[] { Scaled2D, ScaledAvg2D }
+            },
+            new ConvolveFn[][]
+            {
+                new ConvolveFn[] { ScaledHoriz, ScaledAvgHoriz },
+                new ConvolveFn[] { Scaled2D, ScaledAvg2D }
+            }
+        };
+
+        // Neither step is 16.
+        private static readonly unsafe ConvolveFn[][][] Predict =
+        {
+            new ConvolveFn[][]
+            {
+                new ConvolveFn[] { Scaled2D, ScaledAvg2D },
+                new ConvolveFn[] { Scaled2D, ScaledAvg2D }
+            },
+            new ConvolveFn[][]
+            {
+                new ConvolveFn[] { Scaled2D, ScaledAvg2D },
+                new ConvolveFn[] { Scaled2D, ScaledAvg2D }
+            }
+        };
+
+        // XStepQ4 == 16 and YStepQ4 == 16.
+        private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictX16Y16 =
+        {
+            new HighbdConvolveFn[][]
+            {
+                new HighbdConvolveFn[] { HighbdConvolveCopy, HighbdConvolveAvg },
+                new HighbdConvolveFn[] { HighbdConvolve8Vert, HighbdConvolve8AvgVert }
+            },
+            new HighbdConvolveFn[][]
+            {
+                new HighbdConvolveFn[] { HighbdConvolve8Horiz, HighbdConvolve8AvgHoriz },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
+            }
+        };
+
+        // XStepQ4 == 16 only.
+        private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictX16 =
+        {
+            new HighbdConvolveFn[][]
+            {
+                new HighbdConvolveFn[] { HighbdConvolve8Vert, HighbdConvolve8AvgVert },
+                new HighbdConvolveFn[] { HighbdConvolve8Vert, HighbdConvolve8AvgVert }
+            },
+            new HighbdConvolveFn[][]
+            {
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
+            }
+        };
+
+        // YStepQ4 == 16 only.
+        private static readonly unsafe HighbdConvolveFn[][][] HighbdPredictY16 =
+        {
+            new HighbdConvolveFn[][]
+            {
+                new HighbdConvolveFn[] { HighbdConvolve8Horiz, HighbdConvolve8AvgHoriz },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
+            },
+            new HighbdConvolveFn[][]
+            {
+                new HighbdConvolveFn[] { HighbdConvolve8Horiz, HighbdConvolve8AvgHoriz },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
+            }
+        };
+
+        // Neither step is 16.
+        private static readonly unsafe HighbdConvolveFn[][][] HighbdPredict =
+        {
+            new HighbdConvolveFn[][]
+            {
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
+            },
+            new HighbdConvolveFn[][]
+            {
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg },
+                new HighbdConvolveFn[] { HighbdConvolve8, HighbdConvolve8Avg }
+            }
+        };
+
+        public int XScaleFP;  // Horizontal fixed point scale factor
+        public int YScaleFP;  // Vertical fixed point scale factor
+        public int XStepQ4;   // ScaledX(16), set by SetupScaleFactorsForFrame
+        public int YStepQ4;   // ScaledY(16), set by SetupScaleFactorsForFrame
+
+        /// <summary>
+        /// Scales a horizontal value, or returns it unchanged when no (valid) scaling is set up.
+        /// </summary>
+        /// <param name="val">Value to scale</param>
+        /// <returns>The scaled value, or <paramref name="val"/> if <see cref="IsScaled"/> is false</returns>
+        public int ScaleValueX(int val) => IsScaled() ? ScaledX(val) : val;
+
+        /// <summary>
+        /// Scales a vertical value, or returns it unchanged when no (valid) scaling is set up.
+        /// </summary>
+        /// <param name="val">Value to scale</param>
+        /// <returns>The scaled value, or <paramref name="val"/> if <see cref="IsScaled"/> is false</returns>
+        public int ScaleValueY(int val) => IsScaled() ? ScaledY(val) : val;
+
+        /// <summary>
+        /// Calls the 8-bit convolution function selected by the current step values
+        /// and by the <paramref name="horiz"/>, <paramref name="vert"/> and <paramref name="avg"/> table indices.
+        /// </summary>
+        /// <param name="horiz">First table index (0 or 1)</param>
+        /// <param name="vert">Second table index (0 or 1)</param>
+        /// <param name="avg">Third table index (0 or 1), selects the averaging variant when 1</param>
+        /// <param name="src">Source pixels</param>
+        /// <param name="srcStride">Source stride</param>
+        /// <param name="dst">Destination pixels</param>
+        /// <param name="dstStride">Destination stride</param>
+        /// <param name="subpelX">Passed to the function as x0Q4</param>
+        /// <param name="subpelY">Passed to the function as y0Q4</param>
+        /// <param name="w">Block width</param>
+        /// <param name="h">Block height</param>
+        /// <param name="kernel">Filter kernels</param>
+        /// <param name="xs">Passed to the function as xStepQ4</param>
+        /// <param name="ys">Passed to the function as yStepQ4</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public unsafe void InterPredict(
+            int horiz,
+            int vert,
+            int avg,
+            byte* src,
+            int srcStride,
+            byte* dst,
+            int dstStride,
+            int subpelX,
+            int subpelY,
+            int w,
+            int h,
+            Array8<short>[] kernel,
+            int xs,
+            int ys)
+        {
+            ConvolveFn[][][] table;
+
+            if (XStepQ4 == NoScaleStepQ4)
+            {
+                // No scaling in the x direction; the y direction decides between the two tables.
+                table = YStepQ4 == NoScaleStepQ4 ? PredictX16Y16 : PredictX16;
+            }
+            else
+            {
+                // Must always scale in the x direction; the y direction decides between the two tables.
+                table = YStepQ4 == NoScaleStepQ4 ? PredictY16 : Predict;
+            }
+
+            table[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h);
+        }
+
+        /// <summary>
+        /// Calls the high bit depth convolution function selected by the current step values
+        /// and by the <paramref name="horiz"/>, <paramref name="vert"/> and <paramref name="avg"/> table indices.
+        /// </summary>
+        /// <param name="horiz">First table index (0 or 1)</param>
+        /// <param name="vert">Second table index (0 or 1)</param>
+        /// <param name="avg">Third table index (0 or 1), selects the averaging variant when 1</param>
+        /// <param name="src">Source pixels</param>
+        /// <param name="srcStride">Source stride</param>
+        /// <param name="dst">Destination pixels</param>
+        /// <param name="dstStride">Destination stride</param>
+        /// <param name="subpelX">Passed to the function as x0Q4</param>
+        /// <param name="subpelY">Passed to the function as y0Q4</param>
+        /// <param name="w">Block width</param>
+        /// <param name="h">Block height</param>
+        /// <param name="kernel">Filter kernels</param>
+        /// <param name="xs">Passed to the function as xStepQ4</param>
+        /// <param name="ys">Passed to the function as yStepQ4</param>
+        /// <param name="bd">Passed to the function as bd (bit depth)</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public unsafe void HighbdInterPredict(
+            int horiz,
+            int vert,
+            int avg,
+            ushort* src,
+            int srcStride,
+            ushort* dst,
+            int dstStride,
+            int subpelX,
+            int subpelY,
+            int w,
+            int h,
+            Array8<short>[] kernel,
+            int xs,
+            int ys,
+            int bd)
+        {
+            HighbdConvolveFn[][][] table;
+
+            if (XStepQ4 == NoScaleStepQ4)
+            {
+                // No scaling in the x direction; the y direction decides between the two tables.
+                table = YStepQ4 == NoScaleStepQ4 ? HighbdPredictX16Y16 : HighbdPredictX16;
+            }
+            else
+            {
+                // Must always scale in the x direction; the y direction decides between the two tables.
+                table = YStepQ4 == NoScaleStepQ4 ? HighbdPredictY16 : HighbdPredict;
+            }
+
+            table[horiz][vert][avg](src, srcStride, dst, dstStride, kernel, subpelX, xs, subpelY, ys, w, h, bd);
+        }
+
+        // Multiplies by the horizontal scale factor; the product is computed with 64 bits to avoid overflow.
+        private int ScaledX(int val) => (int)((long)val * XScaleFP >> RefScaleShift);
+
+        // Multiplies by the vertical scale factor; the product is computed with 64 bits to avoid overflow.
+        private int ScaledY(int val) => (int)((long)val * YScaleFP >> RefScaleShift);
+
+        private static int GetFixedPointScaleFactor(int otherSize, int thisSize)
+        {
+            // Calculate scaling factor once for each reference frame
+            // and use fixed point scaling factors in decoding and encoding routines.
+            // Hardware implementations can calculate scale factor in device driver
+            // and use multiplication and shifting on hardware instead of division.
+            //
+            // The shift is done with 64 bits, as otherSize << RefScaleShift
+            // does not fit on 32 bits for sizes of 131072 and above.
+            return (int)(((long)otherSize << RefScaleShift) / thisSize);
+        }
+
+        /// <summary>
+        /// Scales a motion vector, adding the sub-pixel part of the scaled block position.
+        /// </summary>
+        /// <param name="mv">Motion vector to scale</param>
+        /// <param name="x">Horizontal position</param>
+        /// <param name="y">Vertical position</param>
+        /// <returns>The scaled motion vector</returns>
+        public Mv32 ScaleMv(ref Mv mv, int x, int y)
+        {
+            int xOffQ4 = ScaledX(x << SubpelBits) & SubpelMask;
+            int yOffQ4 = ScaledY(y << SubpelBits) & SubpelMask;
+
+            return new Mv32()
+            {
+                Row = ScaledY(mv.Row) + yOffQ4,
+                Col = ScaledX(mv.Col) + xOffQ4
+            };
+        }
+
+        /// <summary>
+        /// Checks that neither scale factor holds the invalid scale marker.
+        /// </summary>
+        /// <returns>True if both scale factors are valid, false otherwise</returns>
+        public bool IsValidScale()
+        {
+            return XScaleFP != RefInvalidScale && YScaleFP != RefInvalidScale;
+        }
+
+        /// <summary>
+        /// Checks if the scale factors are valid and at least one of them is not 1:1.
+        /// </summary>
+        /// <returns>True if scaling is required, false otherwise</returns>
+        public bool IsScaled()
+        {
+            if (!IsValidScale())
+            {
+                return false;
+            }
+
+            return XScaleFP != RefNoScale || YScaleFP != RefNoScale;
+        }
+
+        /// <summary>
+        /// Checks if a reference frame size can be used with the current frame size:
+        /// the reference may be at most 2x larger and at most 16x smaller on each dimension.
+        /// </summary>
+        /// <param name="refWidth">Reference frame width</param>
+        /// <param name="refHeight">Reference frame height</param>
+        /// <param name="thisWidth">Current frame width</param>
+        /// <param name="thisHeight">Current frame height</param>
+        /// <returns>True if the sizes are compatible, false otherwise</returns>
+        public static bool ValidRefFrameSize(int refWidth, int refHeight, int thisWidth, int thisHeight)
+        {
+            return 2 * thisWidth >= refWidth &&
+                   2 * thisHeight >= refHeight &&
+                   thisWidth <= 16 * refWidth &&
+                   thisHeight <= 16 * refHeight;
+        }
+
+        /// <summary>
+        /// Calculates the scale factors and steps for a pair of frame sizes.
+        /// When the sizes can't be used together, both scale factors are set to the invalid marker
+        /// and the steps are left untouched.
+        /// </summary>
+        /// <param name="otherW">Width of the other (reference) frame</param>
+        /// <param name="otherH">Height of the other (reference) frame</param>
+        /// <param name="thisW">Width of the current frame</param>
+        /// <param name="thisH">Height of the current frame</param>
+        public void SetupScaleFactorsForFrame(int otherW, int otherH, int thisW, int thisH)
+        {
+            // A size of zero on both frames passes ValidRefFrameSize, but can't be used
+            // as a divisor below, so non-positive sizes are treated as an invalid scale.
+            bool sizesUsable = thisW > 0 && thisH > 0 && ValidRefFrameSize(otherW, otherH, thisW, thisH);
+
+            if (!sizesUsable)
+            {
+                XScaleFP = RefInvalidScale;
+                YScaleFP = RefInvalidScale;
+                return;
+            }
+
+            XScaleFP = GetFixedPointScaleFactor(otherW, thisW);
+            YScaleFP = GetFixedPointScaleFactor(otherH, thisH);
+            XStepQ4 = ScaledX(16);
+            YStepQ4 = ScaledY(16);
+        }
+    }
+}

+ 11 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/SegLvlFeatures.cs

@@ -0,0 +1,11 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    /// <summary>
+    /// Features that can be set per segment.
+    /// </summary>
+    internal enum SegLvlFeatures
+    {
+        /// <summary>Use an alternate quantizer.</summary>
+        SegLvlAltQ = 0,
+
+        /// <summary>Use an alternate loop filter value.</summary>
+        SegLvlAltLf = 1,
+
+        /// <summary>Optional segment reference frame.</summary>
+        SegLvlRefFrame = 2,
+
+        /// <summary>Optional segment (0,0) + skip mode.</summary>
+        SegLvlSkip = 3,
+
+        /// <summary>Number of features supported; not a feature itself.</summary>
+        SegLvlMax = 4
+    }
+}

+ 71 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/Segmentation.cs

@@ -0,0 +1,71 @@
+using Ryujinx.Common.Memory;
+using System.Diagnostics;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    /// <summary>
+    /// Segmentation state: per segment feature data and the mask of features enabled on each segment.
+    /// </summary>
+    internal struct Segmentation
+    {
+        // Both tables are indexed by SegLvlFeatures.
+        private static readonly int[] SegFeatureDataSigned = { 1, 1, 0, 0 };
+        private static readonly int[] SegFeatureDataMax = { QuantCommon.MaxQ, Vp9.LoopFilter.MaxLoopFilter, 3, 0 };
+
+        public bool Enabled;
+        public bool UpdateMap;
+        public byte UpdateData;
+        public byte AbsDelta;
+        public bool TemporalUpdate;
+
+        // Indexed as [segmentId][featureId].
+        public Array8<Array4<short>> FeatureData;
+
+        // One bit per feature (bit index is the SegLvlFeatures value), indexed by segment id.
+        public Array8<uint> FeatureMask;
+        public int AqAvOffset;
+
+        /// <summary>
+        /// Gets the segment id prediction probability for the context of a given block.
+        /// </summary>
+        /// <param name="segPredProbs">Probabilities, indexed by the context</param>
+        /// <param name="xd">Block that provides the context through GetPredContextSegId</param>
+        /// <returns>The probability for the block context</returns>
+        public static byte GetPredProbSegId(ref Array3<byte> segPredProbs, ref MacroBlockD xd)
+        {
+            int context = xd.GetPredContextSegId();
+
+            return segPredProbs[context];
+        }
+
+        /// <summary>
+        /// Zeroes the feature data and feature mask of all segments, and resets <see cref="AqAvOffset"/>.
+        /// </summary>
+        public void ClearAllSegFeatures()
+        {
+            // Views over the whole of FeatureData (8 segments * 4 features) and FeatureMask (8 segments).
+            var data = MemoryMarshal.CreateSpan(ref FeatureData[0][0], 8 * 4);
+            var masks = MemoryMarshal.CreateSpan(ref FeatureMask[0], 8);
+
+            data.Clear();
+            masks.Clear();
+            AqAvOffset = 0;
+        }
+
+        /// <summary>
+        /// Sets the mask bit of a feature on a segment.
+        /// </summary>
+        /// <param name="segmentId">Segment index</param>
+        /// <param name="featureId">Feature to enable</param>
+        internal void EnableSegFeature(int segmentId, SegLvlFeatures featureId)
+        {
+            uint featureBit = 1u << (int)featureId;
+
+            FeatureMask[segmentId] |= featureBit;
+        }
+
+        /// <summary>
+        /// Gets the maximum data value (by magnitude) accepted for a feature.
+        /// </summary>
+        /// <param name="featureId">Feature to query</param>
+        /// <returns>The maximum value</returns>
+        internal static int FeatureDataMax(SegLvlFeatures featureId) => SegFeatureDataMax[(int)featureId];
+
+        /// <summary>
+        /// Gets whether the data of a feature is signed.
+        /// </summary>
+        /// <param name="featureId">Feature to query</param>
+        /// <returns>Non-zero if the feature data is signed, zero otherwise</returns>
+        internal static int IsSegFeatureSigned(SegLvlFeatures featureId) => SegFeatureDataSigned[(int)featureId];
+
+        /// <summary>
+        /// Stores the data of a feature on a segment.
+        /// </summary>
+        /// <param name="segmentId">Segment index</param>
+        /// <param name="featureId">Feature that the data belongs to</param>
+        /// <param name="segData">Data to store, truncated to 16 bits</param>
+        internal void SetSegData(int segmentId, SegLvlFeatures featureId, int segData)
+        {
+            // The table lookups are kept inside the asserts on purpose,
+            // they must not be evaluated on builds where the asserts are compiled out.
+            Debug.Assert(segData <= SegFeatureDataMax[(int)featureId]);
+            Debug.Assert(segData >= 0 || SegFeatureDataSigned[(int)featureId] != 0);
+            Debug.Assert(segData >= 0 || -segData <= SegFeatureDataMax[(int)featureId]);
+
+            FeatureData[segmentId][(int)featureId] = (short)segData;
+        }
+
+        /// <summary>
+        /// Checks if segmentation is enabled and a feature is enabled on a segment.
+        /// </summary>
+        /// <param name="segmentId">Segment index</param>
+        /// <param name="featureId">Feature to check</param>
+        /// <returns>1 if the feature is active, 0 otherwise</returns>
+        internal int IsSegFeatureActive(int segmentId, SegLvlFeatures featureId)
+        {
+            if (!Enabled)
+            {
+                return 0;
+            }
+
+            uint featureBit = 1u << (int)featureId;
+
+            return (FeatureMask[segmentId] & featureBit) != 0 ? 1 : 0;
+        }
+
+        /// <summary>
+        /// Gets the data of a feature on a segment.
+        /// </summary>
+        /// <param name="segmentId">Segment index</param>
+        /// <param name="featureId">Feature that the data belongs to</param>
+        /// <returns>The stored data</returns>
+        internal short GetSegData(int segmentId, SegLvlFeatures featureId) => FeatureData[segmentId][(int)featureId];
+    }
+}

+ 80 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/Surface.cs

@@ -0,0 +1,80 @@
+using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Video;
+using System;
+using System.Runtime.InteropServices;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    /// <summary>
+    /// Frame surface with Y, U and V planes that live on a single unmanaged allocation.
+    /// </summary>
+    /// <remarks>
+    /// This is a struct, so copies share the same allocation; <see cref="Dispose"/> must be called only once
+    /// for all the copies of a given surface.
+    /// </remarks>
+    internal struct Surface : ISurface
+    {
+        // The buffers start past the top/left border of each plane (see the constructor).
+        public ArrayPtr<byte> YBuffer;
+        public ArrayPtr<byte> UBuffer;
+        public ArrayPtr<byte> VBuffer;
+
+        public unsafe Plane YPlane => new Plane((IntPtr)YBuffer.ToPointer(), YBuffer.Length);
+        public unsafe Plane UPlane => new Plane((IntPtr)UBuffer.ToPointer(), UBuffer.Length);
+        public unsafe Plane VPlane => new Plane((IntPtr)VBuffer.ToPointer(), VBuffer.Length);
+
+        public int Width { get; }
+        public int Height { get; }
+        public int AlignedWidth { get; }
+        public int AlignedHeight { get; }
+        public int Stride { get; }
+        public int UvWidth { get; }
+        public int UvHeight { get; }
+        public int UvAlignedWidth { get; }
+        public int UvAlignedHeight { get; }
+        public int UvStride { get; }
+        public bool HighBd => false;
+
+        // Start of the unmanaged allocation that backs the 3 planes.
+        private readonly IntPtr _pointer;
+
+        /// <summary>
+        /// Allocates a surface for a frame with the specified size.
+        /// </summary>
+        /// <param name="width">Frame width</param>
+        /// <param name="height">Frame height</param>
+        public Surface(int width, int height)
+        {
+            const int border = 32;
+            const int ssX = 1;
+            const int ssY = 1;
+            const bool highbd = false;
+
+            // Y plane: size aligned to 8, a border on all 4 sides, stride aligned to 32.
+            int alignedWidth = (width + 7) & ~7;
+            int alignedHeight = (height + 7) & ~7;
+            int yStride = (alignedWidth + (2 * border) + 31) & ~31;
+            int yPlaneSize = (alignedHeight + (2 * border)) * yStride;
+
+            // U and V planes: everything is shifted right by the subsampling amounts.
+            int uvBorderW = border >> ssX;
+            int uvBorderH = border >> ssY;
+            int uvAlignedWidth = alignedWidth >> ssX;
+            int uvAlignedHeight = alignedHeight >> ssY;
+            int uvStride = yStride >> ssX;
+            int uvPlaneSize = (uvAlignedHeight + (2 * uvBorderH)) * uvStride;
+
+            int bytesPerSample = highbd ? 2 : 1;
+            int frameSize = bytesPerSample * (yPlaneSize + (2 * uvPlaneSize));
+
+            // A local is used for the local function below, as it can't capture a field of the struct.
+            IntPtr pointer = Marshal.AllocHGlobal(frameSize);
+
+            // Creates a view of a plane that starts "skip" bytes after the plane start.
+            ArrayPtr<byte> NewPlane(int start, int size, int skip)
+            {
+                return new ArrayPtr<byte>(pointer + start + skip, size - skip);
+            }
+
+            // Skips the top border rows, plus the left border of the first row past them.
+            int ySkip = (border * yStride) + border;
+            int uvSkip = (uvBorderH * uvStride) + uvBorderW;
+
+            YBuffer = NewPlane(0, yPlaneSize, ySkip);
+            UBuffer = NewPlane(yPlaneSize, uvPlaneSize, uvSkip);
+            VBuffer = NewPlane(yPlaneSize + uvPlaneSize, uvPlaneSize, uvSkip);
+
+            _pointer = pointer;
+
+            Width = width;
+            Height = height;
+            AlignedWidth = alignedWidth;
+            AlignedHeight = alignedHeight;
+            Stride = yStride;
+
+            UvWidth = (width + ssX) >> ssX;
+            UvHeight = (height + ssY) >> ssY;
+            UvAlignedWidth = uvAlignedWidth;
+            UvAlignedHeight = uvAlignedHeight;
+            UvStride = uvStride;
+        }
+
+        /// <summary>
+        /// Frees the unmanaged allocation that backs the planes.
+        /// </summary>
+        public void Dispose()
+        {
+            Marshal.FreeHGlobal(_pointer);
+        }
+    }
+}

+ 85 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/TileInfo.cs

@@ -0,0 +1,85 @@
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using System;
+using System.Diagnostics;
+
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    /// <summary>
+    /// Bounds of a tile, in mode info (Mi) units.
+    /// </summary>
+    internal struct TileInfo
+    {
+        // Tile width limits, in units of 64x64 superblocks.
+        private const int MinTileWidthB64 = 4;
+        private const int MaxTileWidthB64 = 64;
+
+        public int MiRowStart;
+        public int MiRowEnd;
+        public int MiColStart;
+        public int MiColEnd;
+
+        /// <summary>
+        /// Aligns a Mi count using BitUtils.AlignPowerOfTwo with Constants.MiBlockSizeLog2.
+        /// </summary>
+        /// <param name="nMis">Mi count to align</param>
+        /// <returns>The aligned Mi count</returns>
+        public static int MiColsAlignedToSb(int nMis)
+        {
+            return BitUtils.AlignPowerOfTwo(nMis, Constants.MiBlockSizeLog2);
+        }
+
+        // Gets the Mi offset where the tile with index "idx" starts, clamped to "mis",
+        // for a dimension with "mis" Mi units that is split in (1 << log2) tiles.
+        private static int GetTileOffset(int idx, int mis, int log2)
+        {
+            int sbCount = MiColsAlignedToSb(mis) >> Constants.MiBlockSizeLog2;
+            int firstSb = (idx * sbCount) >> log2;
+            int offset = firstSb << Constants.MiBlockSizeLog2;
+
+            return offset < mis ? offset : mis;
+        }
+
+        /// <summary>
+        /// Sets the vertical bounds of the tile from a tile row index.
+        /// </summary>
+        /// <param name="cm">Common state, provides MiRows and Log2TileRows</param>
+        /// <param name="row">Tile row index</param>
+        public void SetRow(ref Vp9Common cm, int row)
+        {
+            int miRows = cm.MiRows;
+            int log2Rows = cm.Log2TileRows;
+
+            MiRowStart = GetTileOffset(row, miRows, log2Rows);
+            MiRowEnd = GetTileOffset(row + 1, miRows, log2Rows);
+        }
+
+        /// <summary>
+        /// Sets the horizontal bounds of the tile from a tile column index.
+        /// </summary>
+        /// <param name="cm">Common state, provides MiCols and Log2TileCols</param>
+        /// <param name="col">Tile column index</param>
+        public void SetCol(ref Vp9Common cm, int col)
+        {
+            int miCols = cm.MiCols;
+            int log2Cols = cm.Log2TileCols;
+
+            MiColStart = GetTileOffset(col, miCols, log2Cols);
+            MiColEnd = GetTileOffset(col + 1, miCols, log2Cols);
+        }
+
+        /// <summary>
+        /// Sets all the bounds of the tile from its row and column indices.
+        /// </summary>
+        /// <param name="cm">Common state</param>
+        /// <param name="row">Tile row index</param>
+        /// <param name="col">Tile column index</param>
+        public void Init(ref Vp9Common cm, int row, int col)
+        {
+            SetRow(ref cm, row);
+            SetCol(ref cm, col);
+        }
+
+        /// <summary>
+        /// Checks that the given miRow, miCol and search point are inside the borders of the tile.
+        /// Rows are checked against [0, <paramref name="miRows"/>), columns against the tile column bounds.
+        /// </summary>
+        /// <param name="miCol">Base column</param>
+        /// <param name="miRow">Base row</param>
+        /// <param name="miRows">Total row count</param>
+        /// <param name="miPos">Search point, relative to the base row and column</param>
+        /// <returns>True if the point is inside, false otherwise</returns>
+        public bool IsInside(int miCol, int miRow, int miRows, ref Position miPos)
+        {
+            int row = miRow + miPos.Row;
+            int col = miCol + miPos.Col;
+
+            return row >= 0 &&
+                   col >= MiColStart &&
+                   row < miRows &&
+                   col < MiColEnd;
+        }
+
+        // Smallest log2 tile column count for which MaxTileWidthB64 << log2 is not below the superblock column count.
+        private static int GetMinLog2TileCols(int sb64Cols)
+        {
+            int log2 = 0;
+
+            for (; (MaxTileWidthB64 << log2) < sb64Cols; log2++)
+            {
+            }
+
+            return log2;
+        }
+
+        // Largest log2 tile column count that keeps each tile at least MinTileWidthB64 superblocks wide (0 if none does).
+        private static int GetMaxLog2TileCols(int sb64Cols)
+        {
+            int log2 = 0;
+
+            while ((sb64Cols >> (log2 + 1)) >= MinTileWidthB64)
+            {
+                log2++;
+            }
+
+            return log2;
+        }
+
+        /// <summary>
+        /// Calculates the range allowed for the log2 of the tile column count.
+        /// </summary>
+        /// <param name="miCols">Frame width in Mi units</param>
+        /// <param name="minLog2TileCols">Receives the minimum value (the input value is not read)</param>
+        /// <param name="maxLog2TileCols">Receives the maximum value (the input value is not read)</param>
+        public static void GetTileNBits(int miCols, ref int minLog2TileCols, ref int maxLog2TileCols)
+        {
+            int sb64Cols = MiColsAlignedToSb(miCols) >> Constants.MiBlockSizeLog2;
+
+            minLog2TileCols = GetMinLog2TileCols(sb64Cols);
+            maxLog2TileCols = GetMaxLog2TileCols(sb64Cols);
+
+            Debug.Assert(minLog2TileCols <= maxLog2TileCols);
+        }
+    }
+}

+ 12 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/TxMode.cs

@@ -0,0 +1,12 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    /// <summary>
+    /// Transform modes.
+    /// </summary>
+    public enum TxMode
+    {
+        /// <summary>Only the 4x4 transform is used.</summary>
+        Only4X4 = 0,
+
+        /// <summary>Allow block transform sizes up to 8x8.</summary>
+        Allow8X8 = 1,
+
+        /// <summary>Allow block transform sizes up to 16x16.</summary>
+        Allow16X16 = 2,
+
+        /// <summary>Allow block transform sizes up to 32x32.</summary>
+        Allow32X32 = 3,
+
+        /// <summary>The transform is specified for each block.</summary>
+        TxModeSelect = 4,
+
+        /// <summary>Number of transform modes; not a mode itself.</summary>
+        TxModes = 5
+    }
+}

+ 11 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/TxSize.cs

@@ -0,0 +1,11 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    /// <summary>
+    /// Transform sizes.
+    /// </summary>
+    public enum TxSize
+    {
+        /// <summary>4x4 transform.</summary>
+        Tx4x4 = 0,
+
+        /// <summary>8x8 transform.</summary>
+        Tx8x8 = 1,
+
+        /// <summary>16x16 transform.</summary>
+        Tx16x16 = 2,
+
+        /// <summary>32x32 transform.</summary>
+        Tx32x32 = 3,
+
+        /// <summary>Number of transform sizes; not a size itself.</summary>
+        TxSizes = 4
+    }
+}

+ 11 - 0
Ryujinx.Graphics.Nvdec.Vp9/Types/TxType.cs

@@ -0,0 +1,11 @@
+namespace Ryujinx.Graphics.Nvdec.Vp9.Types
+{
+    /// <summary>
+    /// Transform types.
+    /// </summary>
+    internal enum TxType
+    {
+        /// <summary>DCT in both horizontal and vertical.</summary>
+        DctDct = 0,
+
+        /// <summary>ADST in vertical, DCT in horizontal.</summary>
+        AdstDct = 1,
+
+        /// <summary>DCT in vertical, ADST in horizontal.</summary>
+        DctAdst = 2,
+
+        /// <summary>ADST in both directions.</summary>
+        AdstAdst = 3,
+
+        /// <summary>Number of transform types; not a type itself.</summary>
+        TxTypes = 4
+    }
+}

この差分においてかなりの量のファイルが変更されているため、一部のファイルを表示していません