// Convolve.cs (31 KB)
  1. using Ryujinx.Common.Memory;
  2. using Ryujinx.Graphics.Nvdec.Vp9.Common;
  3. using System.Diagnostics;
  4. using System.Runtime.CompilerServices;
  5. using System.Runtime.InteropServices;
  6. using System.Runtime.Intrinsics;
  7. using System.Runtime.Intrinsics.X86;
  8. using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;
  9. namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
  10. {
  11. internal static class Convolve
  12. {
  /// <summary>
  /// Compile-time switch for the hardware-intrinsic code paths: they are only taken while this
  /// is <c>true</c> and the CPU reports the required instruction set.
  /// </summary>
  private const bool UseIntrinsics = true;

  /// <summary>
  /// Computes the dot product of each of four source vectors with the same filter vector.
  /// </summary>
  /// <param name="vsrc0">Samples producing element 0 of the result.</param>
  /// <param name="vsrc1">Samples producing element 1 of the result.</param>
  /// <param name="vsrc2">Samples producing element 2 of the result.</param>
  /// <param name="vsrc3">Samples producing element 3 of the result.</param>
  /// <param name="vfilter">Filter coefficients shared by all four dot products.</param>
  /// <param name="zero">Second operand of the horizontal adds; callers in this file pass an all-zero vector.</param>
  /// <returns>The four dot products, in the order vsrc0, vsrc1, vsrc2, vsrc3.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static Vector128<int> MultiplyAddAdjacent(
      Vector128<short> vsrc0,
      Vector128<short> vsrc1,
      Vector128<short> vsrc2,
      Vector128<short> vsrc3,
      Vector128<short> vfilter,
      Vector128<int> zero)
  {
      // Multiply element-wise and add adjacent pairs, then fold the four partial sums into
      // element 0 with two horizontal adds (the remaining elements come from 'zero').
      Vector128<int> total0 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc0, vfilter), zero), zero);
      Vector128<int> total1 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc1, vfilter), zero), zero);
      Vector128<int> total2 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc2, vfilter), zero), zero);
      Vector128<int> total3 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc3, vfilter), zero), zero);

      // Interleave the low elements so each pair sits in the low half of its vector.
      Vector128<float> lowerPair = Sse2.UnpackLow(total0, total1).AsSingle();
      Vector128<float> upperPair = Sse2.UnpackLow(total2, total3).AsSingle();

      // Low half of the result is lowerPair's low half, high half is upperPair's low half.
      return Sse.MoveLowToHigh(lowerPair, upperPair).AsInt32();
  }
  /// <summary>
  /// Adds <paramref name="const64"/> to every element of <paramref name="value"/> and then
  /// arithmetically shifts each element right by <c>FilterBits</c>.
  /// </summary>
  /// <param name="value">The 32-bit sums to scale down.</param>
  /// <param name="const64">Bias added before the shift; callers in this file pass a vector of 64s.</param>
  /// <returns>The biased and shifted elements.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static Vector128<int> RoundShift(Vector128<int> value, Vector128<int> const64)
  {
      Vector128<int> biased = Sse2.Add(value, const64);

      return Sse2.ShiftRightArithmetic(biased, FilterBits);
  }
  /// <summary>
  /// Narrows four 32-bit integers to bytes with unsigned saturation (values below 0 become 0,
  /// values above 255 become 255). The four results occupy the first four bytes of the output.
  /// </summary>
  /// <param name="value">The 32-bit values to narrow.</param>
  /// <param name="zero">Filler for the unused half of each pack; callers in this file pass an all-zero vector.</param>
  /// <returns>A byte vector whose first four elements are the saturated values.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
  {
      // First pack: 32-bit -> unsigned 16-bit; second pack: 16-bit -> unsigned 8-bit.
      Vector128<short> words = Sse41.PackUnsignedSaturate(value, zero).AsInt16();

      return Sse2.PackUnsignedSaturate(words, zero.AsInt16());
  }
  /// <summary>
  /// Vectorized horizontal convolution. The same filter (selected by <paramref name="x0Q4"/>)
  /// is applied to every pixel, and the source position advances by exactly one pixel per
  /// output pixel, so the caller must only use this when the horizontal step is one full pixel.
  /// </summary>
  /// <remarks>
  /// Four output pixels are produced and stored per inner iteration, so a full 4-byte group is
  /// written even for the last group of a row.
  /// </remarks>
  /// <param name="src">Pointer to the first source pixel of the block.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Pointer to the first destination pixel.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="xFilters">Filter table, indexed by the sub-pixel part of the position.</param>
  /// <param name="x0Q4">Initial horizontal position (integer part above SubpelBits, filter index below).</param>
  /// <param name="w">Number of output pixels per row.</param>
  /// <param name="h">Number of rows.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveHorizSse41(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int w,
      int h)
  {
      Vector128<int> zeroes = Vector128<int>.Zero;
      Vector128<int> roundingBias = Vector128.Create(64);

      uint columnCount = (uint)w;
      uint rowCount = (uint)h;

      // Integer part of the start position; it is the same for every row.
      ulong firstSample = (uint)x0Q4 >> SubpelBits;

      // Step back so that the filter window is positioned around the output pixel.
      byte* srcRow = src - (SubpelTaps / 2 - 1);
      byte* dstRow = dst;

      fixed (Array8<short>* kernels = xFilters)
      {
          // Load the eight coefficients of the filter selected by the sub-pixel part of x0Q4.
          Vector128<short> taps = Sse2.LoadVector128((short*)kernels + (uint)(x0Q4 & SubpelMask) * 8);

          for (ulong row = 0; row < rowCount; ++row)
          {
              for (ulong col = 0; col < columnCount; col += 4)
              {
                  byte* window = srcRow + (firstSample + col);

                  // Four overlapping 8-pixel windows, each widened from bytes to 16-bit lanes.
                  Vector128<short> window0 = Sse41.ConvertToVector128Int16(window);
                  Vector128<short> window1 = Sse41.ConvertToVector128Int16(window + 1);
                  Vector128<short> window2 = Sse41.ConvertToVector128Int16(window + 2);
                  Vector128<short> window3 = Sse41.ConvertToVector128Int16(window + 3);

                  Vector128<int> sums = MultiplyAddAdjacent(window0, window1, window2, window3, taps, zeroes);
                  Vector128<byte> pixels = PackUnsignedSaturate(RoundShift(sums, roundingBias), zeroes);

                  // Store the low 32 bits, i.e. the four output pixels.
                  Sse.StoreScalar((float*)&dstRow[col], pixels.AsSingle());
              }

              srcRow += srcStride;
              dstRow += dstStride;
          }
      }
  }
  /// <summary>
  /// Horizontal convolution of a <paramref name="w"/> x <paramref name="h"/> block of 8-bit pixels.
  /// Uses the SSE4.1 implementation when it is available, enabled, and the step is exactly one
  /// pixel (<c>1 &lt;&lt; SubpelBits</c>); otherwise falls back to the scalar loop below.
  /// </summary>
  /// <param name="src">Pointer to the first source pixel of the block.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Pointer to the first destination pixel.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="xFilters">Filter table, indexed by the sub-pixel part of the position.</param>
  /// <param name="x0Q4">Initial horizontal position of each row.</param>
  /// <param name="xStepQ4">Amount added to the position for every output pixel.</param>
  /// <param name="w">Number of output pixels per row.</param>
  /// <param name="h">Number of rows.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h)
  {
      if (Sse41.IsSupported && UseIntrinsics && xStepQ4 == 1 << SubpelBits)
      {
          ConvolveHorizSse41(src, srcStride, dst, dstStride, xFilters, x0Q4, w, h);

          return;
      }

      // Step back so that the filter window is positioned around the output pixel.
      byte* srcRow = src - (SubpelTaps / 2 - 1);
      byte* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          int posQ4 = x0Q4;

          for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
          {
              // Integer part picks the window start, fractional part picks the filter.
              byte* window = srcRow + (posQ4 >> SubpelBits);
              ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += window[tap] * kernel[tap];
              }

              dstRow[col] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
          }
      }
  }
  /// <summary>
  /// Horizontal convolution whose result is averaged (with rounding) into the pixel values
  /// already present in <paramref name="dst"/>.
  /// </summary>
  /// <param name="src">Pointer to the first source pixel of the block.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block; read and then overwritten with the average.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="xFilters">Filter table, indexed by the sub-pixel part of the position.</param>
  /// <param name="x0Q4">Initial horizontal position of each row.</param>
  /// <param name="xStepQ4">Amount added to the position for every output pixel.</param>
  /// <param name="w">Number of output pixels per row.</param>
  /// <param name="h">Number of rows.</param>
  private static unsafe void ConvolveAvgHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h)
  {
      // Step back so that the filter window is positioned around the output pixel.
      byte* srcRow = src - (SubpelTaps / 2 - 1);
      byte* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          int posQ4 = x0Q4;

          for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
          {
              byte* window = srcRow + (posQ4 >> SubpelBits);
              ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += window[tap] * kernel[tap];
              }

              // Blend the filtered pixel with the existing destination pixel.
              byte filtered = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
              dstRow[col] = (byte)BitUtils.RoundPowerOfTwo(dstRow[col] + filtered, 1);
          }
      }
  }
  /// <summary>
  /// Vectorized vertical convolution. The same filter (selected by <paramref name="y0Q4"/>) is
  /// applied to every pixel, and the source advances by exactly one row per output row, so the
  /// caller must only use this when the vertical step is one full pixel.
  /// </summary>
  /// <remarks>
  /// Four output pixels are produced per inner iteration: four bytes are gathered from each of
  /// eight consecutive source rows and four bytes are stored to the destination.
  /// </remarks>
  /// <param name="src">Pointer to the first source pixel of the block.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Pointer to the first destination pixel.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="yFilters">Filter table, indexed by the sub-pixel part of the position.</param>
  /// <param name="y0Q4">Initial vertical position (integer part above SubpelBits, filter index below).</param>
  /// <param name="w">Number of output pixels per row.</param>
  /// <param name="h">Number of rows.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveVertAvx2(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int w,
      int h)
  {
      Vector128<int> zeroes = Vector128<int>.Zero;
      Vector128<int> roundingBias = Vector128.Create(64);

      // Byte offsets of the eight consecutive source rows read for every output pixel.
      Vector256<int> rowOffsets = Vector256.Create(
          0,
          srcStride,
          srcStride * 2,
          srcStride * 3,
          srcStride * 4,
          srcStride * 5,
          srcStride * 6,
          srcStride * 7);

      uint columnCount = (uint)w;
      uint rowCount = (uint)h;

      // Step up so that the filter window is positioned around the output pixel.
      byte* srcTop = src - srcStride * (SubpelTaps / 2 - 1);
      byte* dstRow = dst;

      fixed (Array8<short>* kernels = yFilters)
      {
          // Load the eight coefficients of the filter selected by the sub-pixel part of y0Q4.
          Vector128<short> taps = Sse2.LoadVector128((short*)kernels + (uint)(y0Q4 & SubpelMask) * 8);

          ulong firstRow = (uint)y0Q4 >> SubpelBits;

          for (ulong row = 0; row < rowCount; ++row)
          {
              ulong rowStart = (firstRow + row) * (uint)srcStride;

              for (ulong col = 0; col < columnCount; col += 4)
              {
                  // Each gathered 32-bit element holds four horizontally adjacent pixels of one row.
                  Vector256<int> gathered = Avx2.GatherVector256((uint*)&srcTop[rowStart + col], rowOffsets, 1).AsInt32();
                  Vector128<byte> rows0To3 = gathered.GetLower().AsByte();
                  Vector128<byte> rows4To7 = gathered.GetUpper().AsByte();

                  // Three rounds of byte interleaving transpose the 8x4 tile, so that each
                  // 8-byte half ends up holding one column (eight vertically adjacent pixels).
                  Vector128<byte> mixA = Sse2.UnpackLow(rows0To3, rows4To7);
                  Vector128<byte> mixB = Sse2.UnpackHigh(rows0To3, rows4To7);
                  Vector128<byte> mixC = Sse2.UnpackLow(mixA, mixB);
                  Vector128<byte> mixD = Sse2.UnpackHigh(mixA, mixB);
                  Vector128<byte> columns01 = Sse2.UnpackLow(mixC, mixD);
                  Vector128<byte> columns23 = Sse2.UnpackHigh(mixC, mixD);

                  // Bring the odd columns down to the low half so they can be widened too.
                  Vector128<byte> columns11 = Sse.MoveHighToLow(columns01.AsSingle(), columns01.AsSingle()).AsByte();
                  Vector128<byte> columns33 = Sse.MoveHighToLow(columns23.AsSingle(), columns23.AsSingle()).AsByte();

                  // Widen the low eight bytes of each vector to 16-bit lanes.
                  Vector128<short> column0 = Sse41.ConvertToVector128Int16(columns01);
                  Vector128<short> column1 = Sse41.ConvertToVector128Int16(columns11);
                  Vector128<short> column2 = Sse41.ConvertToVector128Int16(columns23);
                  Vector128<short> column3 = Sse41.ConvertToVector128Int16(columns33);

                  Vector128<int> sums = MultiplyAddAdjacent(column0, column1, column2, column3, taps, zeroes);
                  Vector128<byte> pixels = PackUnsignedSaturate(RoundShift(sums, roundingBias), zeroes);

                  // Store the low 32 bits, i.e. the four output pixels.
                  Sse.StoreScalar((float*)&dstRow[col], pixels.AsSingle());
              }

              dstRow += dstStride;
          }
      }
  }
  /// <summary>
  /// Vertical convolution of a <paramref name="w"/> x <paramref name="h"/> block of 8-bit pixels.
  /// Uses the AVX2 implementation when it is available, enabled, and the step is exactly one
  /// pixel (<c>1 &lt;&lt; SubpelBits</c>); otherwise falls back to the scalar loop below.
  /// </summary>
  /// <param name="src">Pointer to the first source pixel of the block.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Pointer to the first destination pixel.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="yFilters">Filter table, indexed by the sub-pixel part of the position.</param>
  /// <param name="y0Q4">Initial vertical position of each column.</param>
  /// <param name="yStepQ4">Amount added to the position for every output row.</param>
  /// <param name="w">Number of output pixels per row.</param>
  /// <param name="h">Number of rows.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      if (Avx2.IsSupported && UseIntrinsics && yStepQ4 == 1 << SubpelBits)
      {
          ConvolveVertAvx2(src, srcStride, dst, dstStride, yFilters, y0Q4, w, h);

          return;
      }

      // Step up so that the filter window is positioned around the output pixel.
      byte* srcColumn = src - srcStride * (SubpelTaps / 2 - 1);
      byte* dstColumn = dst;

      // Column-major traversal: each column is filtered top to bottom before moving right.
      for (int col = 0; col < w; ++col, ++srcColumn, ++dstColumn)
      {
          int posQ4 = y0Q4;

          for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
          {
              // Integer part picks the window start, fractional part picks the filter.
              byte* window = srcColumn + (posQ4 >> SubpelBits) * srcStride;
              ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += window[tap * srcStride] * kernel[tap];
              }

              dstColumn[row * dstStride] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
          }
      }
  }
  /// <summary>
  /// Vertical convolution whose result is averaged (with rounding) into the pixel values
  /// already present in <paramref name="dst"/>.
  /// </summary>
  /// <param name="src">Pointer to the first source pixel of the block.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block; read and then overwritten with the average.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="yFilters">Filter table, indexed by the sub-pixel part of the position.</param>
  /// <param name="y0Q4">Initial vertical position of each column.</param>
  /// <param name="yStepQ4">Amount added to the position for every output row.</param>
  /// <param name="w">Number of output pixels per row.</param>
  /// <param name="h">Number of rows.</param>
  private static unsafe void ConvolveAvgVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // Step up so that the filter window is positioned around the output pixel.
      byte* srcColumn = src - srcStride * (SubpelTaps / 2 - 1);
      byte* dstColumn = dst;

      for (int col = 0; col < w; ++col, ++srcColumn, ++dstColumn)
      {
          int posQ4 = y0Q4;

          for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
          {
              byte* window = srcColumn + (posQ4 >> SubpelBits) * srcStride;
              ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += window[tap * srcStride] * kernel[tap];
              }

              // Blend the filtered pixel with the existing destination pixel.
              byte filtered = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
              byte* target = dstColumn + row * dstStride;
              *target = (byte)BitUtils.RoundPowerOfTwo(*target + filtered, 1);
          }
      }
  }
  /// <summary>
  /// Horizontal-only convolution entry point; forwards to <c>ConvolveHoriz</c>.
  /// </summary>
  /// <remarks>
  /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are accepted but not used here.
  /// </remarks>
  public static unsafe void Convolve8Horiz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
  /// <summary>
  /// Horizontal-only averaging convolution entry point; forwards to <c>ConvolveAvgHoriz</c>.
  /// </summary>
  /// <remarks>
  /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are accepted but not used here.
  /// </remarks>
  public static unsafe void Convolve8AvgHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
  /// <summary>
  /// Vertical-only convolution entry point; forwards to <c>ConvolveVert</c>.
  /// </summary>
  /// <remarks>
  /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are accepted but not used here.
  /// </remarks>
  public static unsafe void Convolve8Vert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Vertical-only averaging convolution entry point; forwards to <c>ConvolveAvgVert</c>.
  /// </summary>
  /// <remarks>
  /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are accepted but not used here.
  /// </remarks>
  public static unsafe void Convolve8AvgVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
  // Geometry of the intermediate buffer used by the 2D convolution: rows are
  // TempBufferStride bytes apart and there is room for TempBufferRows of them.
  private const int TempBufferStride = 64;
  private const int TempBufferRows = 135;

  /// <summary>
  /// Field-less struct whose only purpose is its declared size. A local of this type reserves
  /// TempBufferStride * TempBufferRows bytes of stack for the 2D convolution.
  /// </summary>
  [StructLayout(LayoutKind.Sequential, Size = TempBufferStride * TempBufferRows)]
  struct Temp
  {
  }

  /// <summary>
  /// 2D convolution of a <paramref name="w"/> x <paramref name="h"/> block of 8-bit pixels:
  /// a horizontal pass into an intermediate stack buffer followed by a vertical pass into
  /// <paramref name="dst"/>.
  /// </summary>
  /// <param name="src">Pointer to the first source pixel of the block.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Pointer to the first destination pixel.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="filter">Filter table used for both passes.</param>
  /// <param name="x0Q4">Initial horizontal position.</param>
  /// <param name="xStepQ4">Horizontal step per output pixel (asserted to be at most 64).</param>
  /// <param name="y0Q4">Initial vertical position.</param>
  /// <param name="yStepQ4">Vertical step per output row (see the asserts for the supported range).</param>
  /// <param name="w">Block width (asserted to be at most 64).</param>
  /// <param name="h">Block height (asserted to be at most 64).</param>
  public static unsafe void Convolve8(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // Note: Fixed size intermediate buffer, temp, places limits on parameters.
      // 2d filtering proceeds in 2 steps:
      //   (1) Interpolate horizontally into an intermediate buffer, temp.
      //   (2) Interpolate temp vertically to derive the sub-pixel result.
      // Deriving the maximum number of rows in the temp buffer (135):
      // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
      // --Largest block size is 64x64 pixels.
      // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
      //   original frame (in 1/16th pixel units).
      // --Must round-up because block may be located at sub-pixel position.
      // --Require an additional SubpelTaps rows for the 8-tap filter tails.
      // --ceil(((64 - 1) * 32 + 15) / 16) + 8 = 127 + 8 = 135.
      // When calling in frame scaling function, the smallest scaling factor is x1/4
      // ==> yStepQ4 = 64. Since w and h are at most 16, the temp buffer is still
      // big enough.
      Temp tempStruct;
      byte* temp = (byte*)Unsafe.AsPointer(ref tempStruct); // Avoid zero initialization.

      // Number of rows the horizontal pass must produce so that the vertical pass has
      // every row its filter window touches.
      int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);
      Debug.Assert(yStepQ4 <= 32 || (yStepQ4 <= 64 && h <= 32));
      Debug.Assert(xStepQ4 <= 64);

      // The horizontal pass writes intermediateHeight rows into temp; make sure they fit.
      Debug.Assert(intermediateHeight <= TempBufferRows);

      // Start (SubpelTaps / 2 - 1) rows above the block so the vertical filter has its upper tail.
      ConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, TempBufferStride, filter, x0Q4, xStepQ4, w, intermediateHeight);
      ConvolveVert(temp + TempBufferStride * (SubpelTaps / 2 - 1), TempBufferStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
  }
  /// <summary>
  /// 2D convolution whose result is averaged into <paramref name="dst"/>: the block is first
  /// filtered by <c>Convolve8</c> into a 64x64 stack buffer, which <c>ConvolveAvg</c> then
  /// blends with the destination.
  /// </summary>
  /// <param name="src">Pointer to the first source pixel of the block.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block; read and then overwritten with the average.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="filter">Filter table used for both passes.</param>
  /// <param name="x0Q4">Initial horizontal position.</param>
  /// <param name="xStepQ4">Horizontal step per output pixel.</param>
  /// <param name="y0Q4">Initial vertical position.</param>
  /// <param name="yStepQ4">Vertical step per output row.</param>
  /// <param name="w">Block width (asserted to be at most 64).</param>
  /// <param name="h">Block height (asserted to be at most 64).</param>
  public static unsafe void Convolve8Avg(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // The scratch block has a fixed size, which limits the supported block dimensions.
      const int ScratchStride = 64;

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);

      byte* scratch = stackalloc byte[ScratchStride * 64];

      Convolve8(src, srcStride, scratch, ScratchStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);

      // The filter and position arguments are passed as null/zero for the plain averaging step.
      ConvolveAvg(scratch, ScratchStride, dst, dstStride, null, 0, 0, 0, 0, w, h);
  }
  /// <summary>
  /// Copies a <paramref name="w"/> x <paramref name="h"/> block from <paramref name="src"/>
  /// to <paramref name="dst"/>, one row at a time.
  /// </summary>
  /// <remarks>
  /// The filter, position and step parameters are accepted but not used here.
  /// </remarks>
  public static unsafe void ConvolveCopy(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      byte* srcRow = src;
      byte* dstRow = dst;

      for (int row = 0; row < h; ++row)
      {
          MemoryUtil.Copy(dstRow, srcRow, w);

          srcRow += srcStride;
          dstRow += dstStride;
      }
  }
  /// <summary>
  /// Replaces every pixel of the <paramref name="w"/> x <paramref name="h"/> destination block
  /// with the rounded average of itself and the corresponding source pixel.
  /// </summary>
  /// <remarks>
  /// The filter, position and step parameters are accepted but not used here.
  /// </remarks>
  public static unsafe void ConvolveAvg(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      byte* srcRow = src;
      byte* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          for (int col = 0; col < w; ++col)
          {
              int pairSum = dstRow[col] + srcRow[col];
              dstRow[col] = (byte)BitUtils.RoundPowerOfTwo(pairSum, 1);
          }
      }
  }
  /// <summary>
  /// Scaled horizontal convolution entry point; forwards all arguments unchanged to
  /// <c>Convolve8Horiz</c>.
  /// </summary>
  public static unsafe void ScaledHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8Horiz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled vertical convolution entry point; forwards all arguments unchanged to
  /// <c>Convolve8Vert</c>.
  /// </summary>
  public static unsafe void ScaledVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8Vert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled 2D convolution entry point; forwards all arguments unchanged to <c>Convolve8</c>.
  /// </summary>
  public static unsafe void Scaled2D(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled horizontal averaging convolution entry point; forwards all arguments unchanged to
  /// <c>Convolve8AvgHoriz</c>.
  /// </summary>
  public static unsafe void ScaledAvgHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8AvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled vertical averaging convolution entry point; forwards all arguments unchanged to
  /// <c>Convolve8AvgVert</c>.
  /// </summary>
  public static unsafe void ScaledAvgVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8AvgVert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled 2D averaging convolution entry point; forwards all arguments unchanged to
  /// <c>Convolve8Avg</c>.
  /// </summary>
  public static unsafe void ScaledAvg2D(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8Avg(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Horizontal convolution of a <paramref name="w"/> x <paramref name="h"/> block of 16-bit
  /// samples; results are clipped by <c>BitUtils.ClipPixelHighbd</c> for bit depth
  /// <paramref name="bd"/>.
  /// </summary>
  /// <param name="src">Pointer to the first source sample of the block.</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Pointer to the first destination sample.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="xFilters">Filter table, indexed by the sub-pixel part of the position.</param>
  /// <param name="x0Q4">Initial horizontal position of each row.</param>
  /// <param name="xStepQ4">Amount added to the position for every output sample.</param>
  /// <param name="w">Number of output samples per row.</param>
  /// <param name="h">Number of rows.</param>
  /// <param name="bd">Bit depth passed to the clipping helper.</param>
  private static unsafe void HighbdConvolveHoriz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h,
      int bd)
  {
      // Step back so that the filter window is positioned around the output sample.
      ushort* srcRow = src - (SubpelTaps / 2 - 1);
      ushort* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          int posQ4 = x0Q4;

          for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
          {
              // Integer part picks the window start, fractional part picks the filter.
              ushort* window = srcRow + (posQ4 >> SubpelBits);
              ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += window[tap] * kernel[tap];
              }

              dstRow[col] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
          }
      }
  }
  /// <summary>
  /// High bit depth horizontal convolution whose result is averaged (with rounding) into the
  /// samples already present in <paramref name="dst"/>.
  /// </summary>
  /// <param name="src">Pointer to the first source sample of the block.</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Destination block; read and then overwritten with the average.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="xFilters">Filter table, indexed by the sub-pixel part of the position.</param>
  /// <param name="x0Q4">Initial horizontal position of each row.</param>
  /// <param name="xStepQ4">Amount added to the position for every output sample.</param>
  /// <param name="w">Number of output samples per row.</param>
  /// <param name="h">Number of rows.</param>
  /// <param name="bd">Bit depth passed to the clipping helper.</param>
  private static unsafe void HighbdConvolveAvgHoriz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h,
      int bd)
  {
      // Step back so that the filter window is positioned around the output sample.
      ushort* srcRow = src - (SubpelTaps / 2 - 1);
      ushort* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          int posQ4 = x0Q4;

          for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
          {
              ushort* window = srcRow + (posQ4 >> SubpelBits);
              ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += window[tap] * kernel[tap];
              }

              // Blend the filtered sample with the existing destination sample.
              ushort filtered = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
              dstRow[col] = (ushort)BitUtils.RoundPowerOfTwo(dstRow[col] + filtered, 1);
          }
      }
  }
  /// <summary>
  /// Vertical convolution of a <paramref name="w"/> x <paramref name="h"/> block of 16-bit
  /// samples; results are clipped by <c>BitUtils.ClipPixelHighbd</c> for bit depth
  /// <paramref name="bd"/>.
  /// </summary>
  /// <param name="src">Pointer to the first source sample of the block.</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Pointer to the first destination sample.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="yFilters">Filter table, indexed by the sub-pixel part of the position.</param>
  /// <param name="y0Q4">Initial vertical position of each column.</param>
  /// <param name="yStepQ4">Amount added to the position for every output row.</param>
  /// <param name="w">Number of output samples per row.</param>
  /// <param name="h">Number of rows.</param>
  /// <param name="bd">Bit depth passed to the clipping helper.</param>
  private static unsafe void HighbdConvolveVert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // Step up so that the filter window is positioned around the output sample.
      ushort* srcColumn = src - srcStride * (SubpelTaps / 2 - 1);
      ushort* dstColumn = dst;

      // Column-major traversal: each column is filtered top to bottom before moving right.
      for (int col = 0; col < w; ++col, ++srcColumn, ++dstColumn)
      {
          int posQ4 = y0Q4;

          for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
          {
              // Integer part picks the window start, fractional part picks the filter.
              ushort* window = srcColumn + (posQ4 >> SubpelBits) * srcStride;
              ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += window[tap * srcStride] * kernel[tap];
              }

              dstColumn[row * dstStride] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
          }
      }
  }
  /// <summary>
  /// High bit depth vertical convolution whose result is averaged (with rounding) into the
  /// samples already present in <paramref name="dst"/>.
  /// </summary>
  /// <param name="src">Pointer to the first source sample of the block.</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Destination block; read and then overwritten with the average.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="yFilters">Filter table, indexed by the sub-pixel part of the position.</param>
  /// <param name="y0Q4">Initial vertical position of each column.</param>
  /// <param name="yStepQ4">Amount added to the position for every output row.</param>
  /// <param name="w">Number of output samples per row.</param>
  /// <param name="h">Number of rows.</param>
  /// <param name="bd">Bit depth passed to the clipping helper.</param>
  private static unsafe void HighConvolveAvgVert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // Step up so that the filter window is positioned around the output sample.
      ushort* srcColumn = src - srcStride * (SubpelTaps / 2 - 1);
      ushort* dstColumn = dst;

      for (int col = 0; col < w; ++col, ++srcColumn, ++dstColumn)
      {
          int posQ4 = y0Q4;

          for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
          {
              ushort* window = srcColumn + (posQ4 >> SubpelBits) * srcStride;
              ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += window[tap * srcStride] * kernel[tap];
              }

              // Blend the filtered sample with the existing destination sample.
              ushort filtered = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
              ushort* target = dstColumn + row * dstStride;
              *target = (ushort)BitUtils.RoundPowerOfTwo(*target + filtered, 1);
          }
      }
  }
  /// <summary>
  /// 2D convolution of a <paramref name="w"/> x <paramref name="h"/> block of 16-bit samples:
  /// a horizontal pass into an intermediate stack buffer followed by a vertical pass into
  /// <paramref name="dst"/>.
  /// </summary>
  /// <param name="src">Pointer to the first source sample of the block.</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Pointer to the first destination sample.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="filter">Filter table used for both passes.</param>
  /// <param name="x0Q4">Initial horizontal position.</param>
  /// <param name="xStepQ4">Horizontal step per output sample (asserted to be at most 32).</param>
  /// <param name="y0Q4">Initial vertical position.</param>
  /// <param name="yStepQ4">Vertical step per output row (asserted to be at most 32).</param>
  /// <param name="w">Block width (asserted to be at most 64).</param>
  /// <param name="h">Block height (asserted to be at most 64).</param>
  /// <param name="bd">Bit depth passed to the clipping helper.</param>
  private static unsafe void HighbdConvolve(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // Note: Fixed size intermediate buffer, temp, places limits on parameters.
      // 2d filtering proceeds in 2 steps:
      //   (1) Interpolate horizontally into an intermediate buffer, temp.
      //   (2) Interpolate temp vertically to derive the sub-pixel result.
      // Deriving the maximum number of rows in the temp buffer (135):
      // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
      // --Largest block size is 64x64 pixels.
      // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
      //   original frame (in 1/16th pixel units).
      // --Must round-up because block may be located at sub-pixel position.
      // --Require an additional SubpelTaps rows for the 8-tap filter tails.
      // --ceil(((64 - 1) * 32 + 15) / 16) + 8 = 127 + 8 = 135.
      const int TempStride = 64;
      const int TempRows = 135;

      ushort* temp = stackalloc ushort[TempStride * TempRows];

      // Number of rows the horizontal pass must produce so that the vertical pass has
      // every row its filter window touches.
      int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);
      Debug.Assert(yStepQ4 <= 32);
      Debug.Assert(xStepQ4 <= 32);

      // The horizontal pass writes intermediateHeight rows into temp; make sure they fit.
      Debug.Assert(intermediateHeight <= TempRows);

      // Start (SubpelTaps / 2 - 1) rows above the block so the vertical filter has its upper tail.
      HighbdConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, TempStride, filter, x0Q4, xStepQ4, w, intermediateHeight, bd);
      HighbdConvolveVert(temp + TempStride * (SubpelTaps / 2 - 1), TempStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
  }
  /// <summary>
  /// High bit depth horizontal-only convolution entry point; forwards to
  /// <c>HighbdConvolveHoriz</c>.
  /// </summary>
  /// <remarks>
  /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are accepted but not used here.
  /// </remarks>
  public static unsafe void HighbdConvolve8Horiz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
  /// <summary>
  /// High bit depth horizontal-only averaging convolution entry point; forwards to
  /// <c>HighbdConvolveAvgHoriz</c>.
  /// </summary>
  /// <remarks>
  /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are accepted but not used here.
  /// </remarks>
  public static unsafe void HighbdConvolve8AvgHoriz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
  /// <summary>
  /// High bit depth vertical-only convolution entry point; forwards to
  /// <c>HighbdConvolveVert</c>.
  /// </summary>
  /// <remarks>
  /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are accepted but not used here.
  /// </remarks>
  public static unsafe void HighbdConvolve8Vert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
  /// <summary>
  /// High bit depth vertical-only averaging convolution entry point; forwards to
  /// <c>HighConvolveAvgVert</c>.
  /// </summary>
  /// <remarks>
  /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are accepted but not used here.
  /// </remarks>
  public static unsafe void HighbdConvolve8AvgVert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
  /// <summary>
  /// High bit depth 2D convolution entry point; forwards all arguments unchanged to
  /// <c>HighbdConvolve</c>.
  /// </summary>
  public static unsafe void HighbdConvolve8(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolve(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
  /// <summary>
  /// High bit depth 2D convolution whose result is averaged into <paramref name="dst"/>: the
  /// block is first filtered by <c>HighbdConvolve8</c> into a 64x64 stack buffer, which
  /// <c>HighbdConvolveAvg</c> then blends with the destination.
  /// </summary>
  /// <param name="src">Pointer to the first source sample of the block.</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Destination block; read and then overwritten with the average.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="filter">Filter table used for both passes.</param>
  /// <param name="x0Q4">Initial horizontal position.</param>
  /// <param name="xStepQ4">Horizontal step per output sample.</param>
  /// <param name="y0Q4">Initial vertical position.</param>
  /// <param name="yStepQ4">Vertical step per output row.</param>
  /// <param name="w">Block width (asserted to be at most 64).</param>
  /// <param name="h">Block height (asserted to be at most 64).</param>
  /// <param name="bd">Bit depth passed on to both helpers.</param>
  public static unsafe void HighbdConvolve8Avg(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // The scratch block has a fixed size, which limits the supported block dimensions.
      const int ScratchStride = 64;

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);

      ushort* scratch = stackalloc ushort[ScratchStride * 64];

      HighbdConvolve8(src, srcStride, scratch, ScratchStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);

      // The filter and position arguments are passed as null/zero for the plain averaging step.
      HighbdConvolveAvg(scratch, ScratchStride, dst, dstStride, null, 0, 0, 0, 0, w, h, bd);
  }
  /// <summary>
  /// Copies a <paramref name="w"/> x <paramref name="h"/> block of 16-bit samples from
  /// <paramref name="src"/> to <paramref name="dst"/>, one row at a time.
  /// </summary>
  /// <remarks>
  /// The filter, position, step and bit depth parameters are accepted but not used here.
  /// </remarks>
  public static unsafe void HighbdConvolveCopy(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      ushort* srcRow = src;
      ushort* dstRow = dst;

      for (int row = 0; row < h; ++row)
      {
          MemoryUtil.Copy(dstRow, srcRow, w);

          srcRow += srcStride;
          dstRow += dstStride;
      }
  }
  /// <summary>
  /// Replaces every 16-bit sample of the <paramref name="w"/> x <paramref name="h"/>
  /// destination block with the rounded average of itself and the corresponding source sample.
  /// </summary>
  /// <remarks>
  /// The filter, position, step and bit depth parameters are accepted but not used here.
  /// </remarks>
  public static unsafe void HighbdConvolveAvg(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      ushort* srcRow = src;
      ushort* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          for (int col = 0; col < w; ++col)
          {
              int pairSum = dstRow[col] + srcRow[col];
              dstRow[col] = (ushort)BitUtils.RoundPowerOfTwo(pairSum, 1);
          }
      }
  }
  859. }
  860. }