Просмотр исходного кода

Enable multithreaded VP9 decoding (#2009)

* Enable multithreaded VP9 decoding

* Limit the number of threads used for video decoding
gdkchan 5 лет назад
Родитель
Commit
c465d771dd

+ 176 - 4
Ryujinx.Graphics.Nvdec.Vp9/DecodeFrame.cs

@@ -1,13 +1,14 @@
 using Ryujinx.Common.Memory;
 using Ryujinx.Common.Memory;
+using Ryujinx.Graphics.Nvdec.Vp9.Common;
+using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
+using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using Ryujinx.Graphics.Video;
 using System;
 using System;
 using System.Buffers.Binary;
 using System.Buffers.Binary;
 using System.Diagnostics;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.InteropServices;
-using Ryujinx.Graphics.Nvdec.Vp9.Common;
-using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
-using Ryujinx.Graphics.Nvdec.Vp9.Types;
-using Ryujinx.Graphics.Video;
+using System.Threading.Tasks;
 using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv;
 using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv;
 
 
 namespace Ryujinx.Graphics.Nvdec.Vp9
 namespace Ryujinx.Graphics.Nvdec.Vp9
@@ -1095,6 +1096,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             data = data.Slice(size);
             data = data.Slice(size);
         }
         }
 
 
+        private static void GetTileBuffers(ref Vp9Common cm, ArrayPtr<byte> data, int tileCols, ref Array64<TileBuffer> tileBuffers) // Single-row overload: fills tileBuffers[0..tileCols) from data, one entry per tile column.
+        {
+            int c;
+
+            for (c = 0; c < tileCols; ++c)
+            {
+                bool isLast = c == tileCols - 1; // Tells GetTileBuffer whether this is the final column of the row.
+                ref TileBuffer buf = ref tileBuffers[c];
+                buf.Col = c; // Record the column index on the buffer itself; DecodeTileCol later reads it back via buf.Col.
+                GetTileBuffer(isLast, ref cm.Error, ref data, ref buf); // data is passed by ref; presumably advanced past the tile just read - confirm in GetTileBuffer.
+            }
+        }
+
         private static void GetTileBuffers(
         private static void GetTileBuffers(
             ref Vp9Common cm,
             ref Vp9Common cm,
             ArrayPtr<byte> data,
             ArrayPtr<byte> data,
@@ -1181,5 +1195,163 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             // Get last tile data.
             // Get last tile data.
             return cm.TileWorkerData[tileCols * tileRows - 1].BitReader.FindEnd();
             return cm.TileWorkerData[tileCols * tileRows - 1].BitReader.FindEnd();
         }
         }
+
+        private static bool DecodeTileCol(ref TileWorkerData tileData, ref Vp9Common cm, ref Array64<TileBuffer> tileBuffers) // Worker body: decodes tileBuffers[tileData.BufStart..tileData.BufEnd] in order; returns false if tileData.Xd.Corrupted is set when it stops.
+        {
+            ref TileInfo tile = ref tileData.Xd.Tile;
+            int finalCol = (1 << cm.Log2TileCols) - 1; // Index of the frame's last tile column.
+            ArrayPtr<byte> bitReaderEnd = ArrayPtr<byte>.Null;
+
+            int n = tileData.BufStart;
+
+            tileData.Xd.Corrupted = false;
+
+            do
+            {
+                ref TileBuffer buf = ref tileBuffers[n];
+
+                Debug.Assert(cm.Log2TileRows == 0); // This path only handles frames with a single tile row.
+                tileData.Dqcoeff = new Array32<Array32<int>>(); // Reset the worker's coefficient scratch buffer before each tile.
+                tile.Init(ref cm, 0, buf.Col); // Row 0, and the column recorded on the buffer (buffers were reordered, so n is not the column).
+                SetupTokenDecoder(buf.Data, buf.Size, ref tileData.ErrorInfo, ref tileData.BitReader);
+                cm.InitMacroBlockD(ref tileData.Xd, new ArrayPtr<int>(ref tileData.Dqcoeff[0][0], 32 * 32));
+                tileData.Xd.ErrorInfo = new Ptr<InternalErrorInfo>(ref tileData.ErrorInfo); // Route errors to this worker's own ErrorInfo.
+
+                for (int miRow = tile.MiRowStart; miRow < tile.MiRowEnd; miRow += Constants.MiBlockSize)
+                {
+                    tileData.Xd.LeftContext = new Array3<Array16<sbyte>>(); // Left contexts are cleared at the start of every row of 64x64 blocks.
+                    tileData.Xd.LeftSegContext = new Array8<sbyte>();
+                    for (int miCol = tile.MiColStart; miCol < tile.MiColEnd; miCol += Constants.MiBlockSize)
+                    {
+                        DecodePartition(ref tileData, ref cm, miRow, miCol, BlockSize.Block64x64, 4);
+                    }
+                }
+
+                if (buf.Col == finalCol)
+                {
+                    bitReaderEnd = tileData.BitReader.FindEnd(); // Only the worker that decoded the last column records where its bit reader ended.
+                }
+            } while (!tileData.Xd.Corrupted && ++n <= tileData.BufEnd); // Stop on corruption; otherwise advance to the next assigned buffer (BufEnd is inclusive).
+
+            tileData.DataEnd = bitReaderEnd; // Stays Null unless this worker handled the last column.
+            return !tileData.Xd.Corrupted;
+        }
+
+        public static unsafe ArrayPtr<byte> DecodeTilesMt(ref Vp9Common cm, ArrayPtr<byte> data, int maxThreads) // Decodes a single-tile-row frame with up to maxThreads parallel workers; returns the DataEnd reported by the worker that decoded the last tile column (Null if none did).
+        {
+            ArrayPtr<byte> bitReaderEnd = ArrayPtr<byte>.Null;
+
+            int tileCols = 1 << cm.Log2TileCols;
+            int tileRows = 1 << cm.Log2TileRows;
+            int totalTiles = tileCols * tileRows;
+            int numWorkers = Math.Min(maxThreads, tileCols); // Never use more workers than there are tile columns.
+            int n;
+
+            Debug.Assert(tileCols <= (1 << 6)); // tileBuffers below is an Array64, so at most 64 columns fit.
+            Debug.Assert(tileRows == 1);
+
+            cm.AboveContext.ToSpan().Fill(0);
+            cm.AboveSegContext.ToSpan().Fill(0);
+
+            for (n = 0; n < numWorkers; ++n)
+            {
+                ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles]; // Worker slots are stored after the totalTiles per-tile entries.
+
+                tileData.Xd = cm.Mb; // Seed the worker from the frame's macroblock state (a copy, assuming MacroBlockD is a struct - confirm).
+                tileData.Xd.Counts = new Ptr<Vp9BackwardUpdates>(ref tileData.Counts); // Point the worker at its own private counts...
+                tileData.Counts = new Vp9BackwardUpdates(); // ...and reset them.
+            }
+
+            Array64<TileBuffer> tileBuffers = new Array64<TileBuffer>();
+
+            GetTileBuffers(ref cm, data, tileCols, ref tileBuffers);
+
+            tileBuffers.ToSpan().Slice(0, tileCols).Sort(CompareTileBuffers); // Largest Size first (CompareTileBuffers sorts descending).
+
+            if (numWorkers == tileCols)
+            {
+                TileBuffer largest = tileBuffers[0]; // One tile per worker: rotate the largest tile from the front to slot tileCols - 1.
+                Span<TileBuffer> buffers = tileBuffers.ToSpan();
+                buffers.Slice(1).CopyTo(buffers.Slice(0, tileBuffers.Length - 1)); // Shifts every entry down by one over the whole backing span; only entries below tileCols are used afterwards.
+                tileBuffers[tileCols - 1] = largest;
+            }
+            else
+            {
+                int start = 0, end = tileCols - 2;
+                TileBuffer tmp;
+
+                // Interleave the tiles to distribute the load between threads, assuming a
+                // larger tile implies it is more difficult to decode.
+                while (start < end)
+                {
+                    tmp = tileBuffers[start];
+                    tileBuffers[start] = tileBuffers[end];
+                    tileBuffers[end] = tmp;
+                    start += 2;
+                    end -= 2;
+                }
+            }
+
+            int baseVal = tileCols / numWorkers;
+            int remain = tileCols % numWorkers;
+            int bufStart = 0;
+
+            for (n = 0; n < numWorkers; ++n)
+            {
+                int count = baseVal + (remain + n) / numWorkers; // The last `remain` workers each take one extra tile.
+                ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles];
+
+                tileData.BufStart = bufStart;
+                tileData.BufEnd = bufStart + count - 1; // Inclusive end of this worker's contiguous range in tileBuffers.
+                tileData.DataEnd = data.Slice(data.Length); // Placeholder (empty slice at the end of data); DecodeTileCol overwrites DataEnd.
+                bufStart += count;
+            }
+
+            Ptr<Vp9Common> cmPtr = new Ptr<Vp9Common>(ref cm); // A ref parameter cannot be captured by a lambda, so it is wrapped in a Ptr.
+
+            Parallel.For(0, numWorkers, (n) => // The lambda parameter n shadows the outer n, which keeps the value numWorkers.
+            {
+                ref TileWorkerData tileData = ref cmPtr.Value.TileWorkerData[n + totalTiles];
+
+                if (!DecodeTileCol(ref tileData, ref cmPtr.Value, ref tileBuffers))
+                {
+                    cmPtr.Value.Mb.Corrupted = true; // Every failing worker stores the same value (true) into the shared flag.
+                }
+            });
+
+            for (; n > 0; --n) // n == numWorkers here: walk workers from last to first and keep the first non-null DataEnd.
+            {
+                if (bitReaderEnd.IsNull)
+                {
+                    ref TileWorkerData tileData = ref cm.TileWorkerData[n - 1 + totalTiles];
+                    bitReaderEnd = tileData.DataEnd;
+                }
+            }
+
+            for (n = 0; n < numWorkers; ++n)
+            {
+                ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles];
+                AccumulateFrameCounts(ref cm.Counts.Value, ref tileData.Counts); // Merge each worker's private counts into the frame-wide counts.
+            }
+
+            Debug.Assert(!bitReaderEnd.IsNull || cm.Mb.Corrupted); // A null end is only expected when decoding was flagged as corrupted.
+            return bitReaderEnd;
+        }
+
+        private static int CompareTileBuffers(TileBuffer bufA, TileBuffer bufB) // Comparison for a descending sort by Size: 1 if bufA is smaller, -1 if bufA is larger, 0 if equal.
+        {
+            return (bufA.Size < bufB.Size ? 1 : 0) - (bufA.Size > bufB.Size ? 1 : 0);
+        }
+
+        private static void AccumulateFrameCounts(ref Vp9BackwardUpdates accum, ref Vp9BackwardUpdates counts) // Adds counts into accum element-wise, treating both structs as flat sequences of uint.
+        {
+            Span<uint> a = MemoryMarshal.Cast<Vp9BackwardUpdates, uint>(MemoryMarshal.CreateSpan(ref accum, 1)); // Reinterpret the struct's bytes as uints; assumes Vp9BackwardUpdates holds only uint counters - TODO confirm.
+            Span<uint> c = MemoryMarshal.Cast<Vp9BackwardUpdates, uint>(MemoryMarshal.CreateSpan(ref counts, 1));
+
+            for (int i = 0; i < a.Length; i++)
+            {
+                a[i] += c[i];
+            }
+        }
     }
     }
 }
 }

+ 16 - 2
Ryujinx.Graphics.Nvdec.Vp9/Decoder.cs

@@ -92,7 +92,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 
 
             cm.Mb.SetupBlockPlanes(1, 1);
             cm.Mb.SetupBlockPlanes(1, 1);
 
 
-            cm.AllocTileWorkerData(_allocator, 1 << pictureInfo.Log2TileCols, 1 << pictureInfo.Log2TileRows);
+            int tileCols = 1 << pictureInfo.Log2TileCols;
+            int tileRows = 1 << pictureInfo.Log2TileRows;
+
+            // Videos usually have only 4 tile columns, so more threads won't make a difference for those.
+            // Try to not take all CPU cores for video decoding.
+            int maxThreads = Math.Min(4, Environment.ProcessorCount / 2);
+
+            cm.AllocTileWorkerData(_allocator, tileCols, tileRows, maxThreads);
             cm.AllocContextBuffers(_allocator, output.Width, output.Height);
             cm.AllocContextBuffers(_allocator, output.Width, output.Height);
             cm.InitContextBuffers();
             cm.InitContextBuffers();
             cm.SetupSegmentationDequant();
             cm.SetupSegmentationDequant();
@@ -104,7 +111,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
             {
             {
                 try
                 try
                 {
                 {
-                    DecodeFrame.DecodeTiles(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length));
+                    if (maxThreads > 1 && tileRows == 1 && tileCols > 1)
+                    {
+                        DecodeFrame.DecodeTilesMt(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length), maxThreads);
+                    }
+                    else
+                    {
+                        DecodeFrame.DecodeTiles(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length));
+                    }
                 }
                 }
                 catch (InternalErrorException)
                 catch (InternalErrorException)
                 {
                 {

+ 51 - 1
Ryujinx.Graphics.Nvdec.Vp9/Dsp/InvTxfm.cs

@@ -87,6 +87,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             return rv;
             return rv;
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
             /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
@@ -142,6 +143,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             int i;
             int i;
@@ -209,6 +211,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[3] = WrapLow(DctConstRoundShift(s0 + s1 - s3));
             output[3] = WrapLow(DctConstRoundShift(s0 + s1 - s3));
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct4(ReadOnlySpan<int> input, Span<int> output)
         public static void Idct4(ReadOnlySpan<int> input, Span<int> output)
         {
         {
             Span<short> step = stackalloc short[4];
             Span<short> step = stackalloc short[4];
@@ -231,6 +234,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[3] = WrapLow(step[0] - step[3]);
             output[3] = WrapLow(step[0] - step[3]);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             int i, j;
             int i, j;
@@ -359,6 +363,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[7] = WrapLow(-x1);
             output[7] = WrapLow(-x1);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct8(ReadOnlySpan<int> input, Span<int> output)
         public static void Idct8(ReadOnlySpan<int> input, Span<int> output)
         {
         {
             Span<short> step1 = stackalloc short[8];
             Span<short> step1 = stackalloc short[8];
@@ -416,6 +421,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[7] = WrapLow(step1[0] - step1[7]);
             output[7] = WrapLow(step1[0] - step1[7]);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             int i, j;
             int i, j;
@@ -449,6 +455,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             int i, j;
             int i, j;
@@ -457,6 +464,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<int> tempIn = stackalloc int[8];
             Span<int> tempIn = stackalloc int[8];
             Span<int> tempOut = stackalloc int[8];
             Span<int> tempOut = stackalloc int[8];
 
 
+            output.Fill(0);
+
             // First transform rows
             // First transform rows
             // Only first 4 row has non-zero coefs
             // Only first 4 row has non-zero coefs
             for (i = 0; i < 4; ++i)
             for (i = 0; i < 4; ++i)
@@ -671,6 +680,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[15] = WrapLow(-x1);
             output[15] = WrapLow(-x1);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct16(ReadOnlySpan<int> input, Span<int> output)
         public static void Idct16(ReadOnlySpan<int> input, Span<int> output)
         {
         {
             Span<short> step1 = stackalloc short[16];
             Span<short> step1 = stackalloc short[16];
@@ -838,6 +848,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[15] = WrapLow(step2[0] - step2[15]);
             output[15] = WrapLow(step2[0] - step2[15]);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             int i, j;
             int i, j;
@@ -870,6 +881,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             int i, j;
             int i, j;
@@ -878,6 +890,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<int> tempIn = stackalloc int[16];
             Span<int> tempIn = stackalloc int[16];
             Span<int> tempOut = stackalloc int[16];
             Span<int> tempOut = stackalloc int[16];
 
 
+            output.Fill(0);
+
             // First transform rows. Since all non-zero dct coefficients are in
             // First transform rows. Since all non-zero dct coefficients are in
             // upper-left 8x8 area, we only need to calculate first 8 rows here.
             // upper-left 8x8 area, we only need to calculate first 8 rows here.
             for (i = 0; i < 8; ++i)
             for (i = 0; i < 8; ++i)
@@ -903,6 +917,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             int i, j;
             int i, j;
@@ -911,6 +926,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<int> tempIn = stackalloc int[16];
             Span<int> tempIn = stackalloc int[16];
             Span<int> tempOut = stackalloc int[16];
             Span<int> tempOut = stackalloc int[16];
 
 
+            output.Fill(0);
+
             // First transform rows. Since all non-zero dct coefficients are in
             // First transform rows. Since all non-zero dct coefficients are in
             // upper-left 4x4 area, we only need to calculate first 4 rows here.
             // upper-left 4x4 area, we only need to calculate first 4 rows here.
             for (i = 0; i < 4; ++i)
             for (i = 0; i < 4; ++i)
@@ -955,6 +972,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct32(ReadOnlySpan<int> input, Span<int> output)
         public static void Idct32(ReadOnlySpan<int> input, Span<int> output)
         {
         {
             Span<short> step1 = stackalloc short[32];
             Span<short> step1 = stackalloc short[32];
@@ -1324,6 +1342,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[31] = WrapLow(step1[0] - step1[31]);
             output[31] = WrapLow(step1[0] - step1[31]);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             int i, j;
             int i, j;
@@ -1370,6 +1389,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             int i, j;
             int i, j;
@@ -1378,6 +1398,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<int> tempIn = stackalloc int[32];
             Span<int> tempIn = stackalloc int[32];
             Span<int> tempOut = stackalloc int[32];
             Span<int> tempOut = stackalloc int[32];
 
 
+            output.Fill(0);
+
             // Rows
             // Rows
             // Only upper-left 16x16 has non-zero coeff
             // Only upper-left 16x16 has non-zero coeff
             for (i = 0; i < 16; ++i)
             for (i = 0; i < 16; ++i)
@@ -1403,6 +1425,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
         {
         {
             int i, j;
             int i, j;
@@ -1411,6 +1434,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<int> tempIn = stackalloc int[32];
             Span<int> tempIn = stackalloc int[32];
             Span<int> tempOut = stackalloc int[32];
             Span<int> tempOut = stackalloc int[32];
 
 
+            output.Fill(0);
+
             // Rows
             // Rows
             // Only upper-left 8x8 has non-zero coeff
             // Only upper-left 8x8 has non-zero coeff
             for (i = 0; i < 8; ++i)
             for (i = 0; i < 8; ++i)
@@ -1456,6 +1481,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
             /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
@@ -1511,6 +1537,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             int i;
             int i;
@@ -1584,6 +1611,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[3] = HighbdWrapLow(DctConstRoundShift(s0 + s1 - s3), bd);
             output[3] = HighbdWrapLow(DctConstRoundShift(s0 + s1 - s3), bd);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
         public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
         {
         {
             Span<int> step = stackalloc int[4];
             Span<int> step = stackalloc int[4];
@@ -1613,6 +1641,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[3] = HighbdWrapLow(step[0] - step[3], bd);
             output[3] = HighbdWrapLow(step[0] - step[3], bd);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             int i, j;
             int i, j;
@@ -1748,6 +1777,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[7] = HighbdWrapLow(-x1, bd);
             output[7] = HighbdWrapLow(-x1, bd);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
         public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
         {
         {
             Span<int> step1 = stackalloc int[8];
             Span<int> step1 = stackalloc int[8];
@@ -1803,6 +1833,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[7] = HighbdWrapLow(step1[0] - step1[7], bd);
             output[7] = HighbdWrapLow(step1[0] - step1[7], bd);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             int i, j;
             int i, j;
@@ -1835,6 +1866,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             int i, j;
             int i, j;
@@ -1843,6 +1875,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<int> tempIn = stackalloc int[8];
             Span<int> tempIn = stackalloc int[8];
             Span<int> tempOut = stackalloc int[8];
             Span<int> tempOut = stackalloc int[8];
 
 
+            output.Fill(0);
+
             // First transform rows
             // First transform rows
             // Only first 4 row has non-zero coefs
             // Only first 4 row has non-zero coefs
             for (i = 0; i < 4; ++i)
             for (i = 0; i < 4; ++i)
@@ -2062,6 +2096,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[15] = HighbdWrapLow(-x1, bd);
             output[15] = HighbdWrapLow(-x1, bd);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
         public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
         {
         {
             Span<int> step1 = stackalloc int[16];
             Span<int> step1 = stackalloc int[16];
@@ -2236,6 +2271,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
             output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             int i, j;
             int i, j;
@@ -2268,6 +2304,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             int i, j;
             int i, j;
@@ -2276,6 +2313,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<int> tempIn = stackalloc int[16];
             Span<int> tempIn = stackalloc int[16];
             Span<int> tempOut = stackalloc int[16];
             Span<int> tempOut = stackalloc int[16];
 
 
+            output.Fill(0);
+
             // First transform rows. Since all non-zero dct coefficients are in
             // First transform rows. Since all non-zero dct coefficients are in
             // upper-left 8x8 area, we only need to calculate first 8 rows here.
             // upper-left 8x8 area, we only need to calculate first 8 rows here.
             for (i = 0; i < 8; ++i)
             for (i = 0; i < 8; ++i)
@@ -2303,6 +2342,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             int i, j;
             int i, j;
@@ -2311,6 +2351,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<int> tempIn = stackalloc int[16];
             Span<int> tempIn = stackalloc int[16];
             Span<int> tempOut = stackalloc int[16];
             Span<int> tempOut = stackalloc int[16];
 
 
+            output.Fill(0);
+
             // First transform rows. Since all non-zero dct coefficients are in
             // First transform rows. Since all non-zero dct coefficients are in
             // upper-left 4x4 area, we only need to calculate first 4 rows here.
             // upper-left 4x4 area, we only need to calculate first 4 rows here.
             for (i = 0; i < 4; ++i)
             for (i = 0; i < 4; ++i)
@@ -2355,6 +2397,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd)
         public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd)
         {
         {
             Span<int> step1 = stackalloc int[32];
             Span<int> step1 = stackalloc int[32];
@@ -2539,7 +2582,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             step2[8] = step1[8];
             step2[8] = step1[8];
             step2[15] = step1[15];
             step2[15] = step1[15];
             temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
             temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
-            temp2 =  step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
+            temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
             step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
             step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
             step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
             step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
             temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
             temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
@@ -2731,6 +2774,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             output[31] = HighbdWrapLow(step1[0] - step1[31], bd);
             output[31] = HighbdWrapLow(step1[0] - step1[31], bd);
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             int i, j;
             int i, j;
@@ -2777,6 +2821,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             int i, j;
             int i, j;
@@ -2785,6 +2830,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<int> tempIn = stackalloc int[32];
             Span<int> tempIn = stackalloc int[32];
             Span<int> tempOut = stackalloc int[32];
             Span<int> tempOut = stackalloc int[32];
 
 
+            output.Fill(0);
+
             // Rows
             // Rows
             // Only upper-left 16x16 has non-zero coeff
             // Only upper-left 16x16 has non-zero coeff
             for (i = 0; i < 16; ++i)
             for (i = 0; i < 16; ++i)
@@ -2812,6 +2859,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             }
             }
         }
         }
 
 
+        [SkipLocalsInit]
         public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
         {
         {
             int i, j;
             int i, j;
@@ -2820,6 +2868,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
             Span<int> tempIn = stackalloc int[32];
             Span<int> tempIn = stackalloc int[32];
             Span<int> tempOut = stackalloc int[32];
             Span<int> tempOut = stackalloc int[32];
 
 
+            output.Fill(0);
+
             // Rows
             // Rows
             // Only upper-left 8x8 has non-zero coeff
             // Only upper-left 8x8 has non-zero coeff
             for (i = 0; i < 8; ++i)
             for (i = 0; i < 8; ++i)

+ 1 - 0
Ryujinx.Graphics.Nvdec.Vp9/TileBuffer.cs

@@ -4,6 +4,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
 {
 {
     internal struct TileBuffer
     internal struct TileBuffer
     {
     {
+        public int Col; // Tile column this buffer belongs to; set in GetTileBuffers and passed to tile.Init by DecodeTileCol.
         public ArrayPtr<byte> Data;
         public ArrayPtr<byte> Data;
         public int Size;
         public int Size;
     }
     }

+ 6 - 0
Ryujinx.Graphics.Nvdec.Vp9/TileWorkerData.cs

@@ -1,14 +1,20 @@
 using Ryujinx.Common.Memory;
 using Ryujinx.Common.Memory;
 using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
 using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
 using Ryujinx.Graphics.Nvdec.Vp9.Types;
+using Ryujinx.Graphics.Video;
 
 
 namespace Ryujinx.Graphics.Nvdec.Vp9
 namespace Ryujinx.Graphics.Nvdec.Vp9
 {
 {
     internal struct TileWorkerData
     internal struct TileWorkerData
     {
     {
+        public ArrayPtr<byte> DataEnd; // Set by DecodeTileCol: the bit reader's end position if this worker decoded the last tile column, otherwise Null.
+        public int BufStart; // First index (inclusive) of this worker's range in the tile buffer array.
+        public int BufEnd; // Last index (inclusive) of this worker's range in the tile buffer array.
         public Reader BitReader;
         public Reader BitReader;
+        public Vp9BackwardUpdates Counts; // Per-worker counts; Xd.Counts points here and DecodeTilesMt merges them into cm.Counts.
         public MacroBlockD Xd;
         public MacroBlockD Xd;
         /* dqcoeff are shared by all the planes. So planes must be decoded serially */
         /* dqcoeff are shared by all the planes. So planes must be decoded serially */
         public Array32<Array32<int>> Dqcoeff;
         public Array32<Array32<int>> Dqcoeff;
+        public InternalErrorInfo ErrorInfo; // Per-worker error info; passed to SetupTokenDecoder and referenced by Xd.ErrorInfo.
     }
     }
 }
 }

+ 2 - 2
Ryujinx.Graphics.Nvdec.Vp9/Types/Vp9Common.cs

@@ -127,9 +127,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
             MBs = MbRows * MbCols;
             MBs = MbRows * MbCols;
         }
         }
 
 
-        public void AllocTileWorkerData(MemoryAllocator allocator, int tileCols, int tileRows)
+        public void AllocTileWorkerData(MemoryAllocator allocator, int tileCols, int tileRows, int maxThreads)
         {
         {
-            TileWorkerData = allocator.Allocate<TileWorkerData>(tileCols * tileRows);
+            TileWorkerData = allocator.Allocate<TileWorkerData>(tileCols * tileRows + (maxThreads > 1 ? maxThreads : 0)); // When multithreading is possible, reserve maxThreads extra entries after the per-tile ones; DecodeTilesMt indexes them as [n + totalTiles].
         }
         }
 
 
         public void FreeTileWorkerData(MemoryAllocator allocator)
         public void FreeTileWorkerData(MemoryAllocator allocator)