Convolve.cs 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944
  1. using Ryujinx.Common.Memory;
  2. using Ryujinx.Graphics.Nvdec.Vp9.Common;
  3. using System.Diagnostics;
  4. using System.Runtime.CompilerServices;
  5. using System.Runtime.InteropServices;
  6. using System.Runtime.Intrinsics;
  7. using System.Runtime.Intrinsics.X86;
  8. using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;
  9. namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
  10. {
  11. internal static class Convolve
  12. {
  13. private const bool UseIntrinsics = true;
  /// <summary>
  /// Computes four 8-element dot products at once: lane N of the result is the sum of the
  /// eight 16-bit products of source vector N with <paramref name="vfilter"/>.
  /// </summary>
  /// <param name="vsrc0">Source samples whose dot product lands in lane 0.</param>
  /// <param name="vsrc1">Source samples whose dot product lands in lane 1.</param>
  /// <param name="vsrc2">Source samples whose dot product lands in lane 2.</param>
  /// <param name="vsrc3">Source samples whose dot product lands in lane 3.</param>
  /// <param name="vfilter">The eight filter taps applied to every source vector.</param>
  /// <param name="zero">No longer needed by the reduction; kept so existing call sites compile unchanged.</param>
  /// <returns>A vector laid out as &lt; sum3, sum2, sum1, sum0 &gt; (lane 0 is sum0).</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static Vector128<int> MultiplyAddAdjacent(
      Vector128<short> vsrc0,
      Vector128<short> vsrc1,
      Vector128<short> vsrc2,
      Vector128<short> vsrc3,
      Vector128<short> vfilter,
      Vector128<int> zero)
  {
      // Each multiply-add leaves four partial sums (one per pair of adjacent products).
      Vector128<int> partial0 = Sse2.MultiplyAddAdjacent(vsrc0, vfilter);
      Vector128<int> partial1 = Sse2.MultiplyAddAdjacent(vsrc1, vfilter);
      Vector128<int> partial2 = Sse2.MultiplyAddAdjacent(vsrc2, vfilter);
      Vector128<int> partial3 = Sse2.MultiplyAddAdjacent(vsrc3, vfilter);

      // Reducing two sources per horizontal add (instead of one source against zero)
      // produces the same per-source totals with three adds and no unpack/move step.
      // < p1[2]+p1[3], p1[0]+p1[1], p0[2]+p0[3], p0[0]+p0[1] >
      Vector128<int> pairs01 = Ssse3.HorizontalAdd(partial0, partial1);
      // < p3[2]+p3[3], p3[0]+p3[1], p2[2]+p2[3], p2[0]+p2[1] >
      Vector128<int> pairs23 = Ssse3.HorizontalAdd(partial2, partial3);

      // < sum3, sum2, sum1, sum0 >
      return Ssse3.HorizontalAdd(pairs01, pairs23);
  }
  /// <summary>
  /// Adds <paramref name="const64"/> to every lane of <paramref name="value"/> and then
  /// shifts each lane right (arithmetically) by FilterBits.
  /// </summary>
  /// <param name="value">Filter sums to round.</param>
  /// <param name="const64">Rounding bias; callers in this file pass 64 in every lane.</param>
  /// <returns>The biased and shifted lanes.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static Vector128<int> RoundShift(Vector128<int> value, Vector128<int> const64)
  {
      Vector128<int> biased = Sse2.Add(value, const64);

      return Sse2.ShiftRightArithmetic(biased, FilterBits);
  }
  /// <summary>
  /// Narrows four 32-bit lanes to bytes with unsigned saturation (32 -> 16 -> 8 bits).
  /// The four results occupy the lowest four bytes; the remaining bytes come from
  /// <paramref name="zero"/>.
  /// </summary>
  /// <param name="value">The 32-bit lanes to narrow.</param>
  /// <param name="zero">Filler operand for the upper halves of both pack operations.</param>
  /// <returns>The packed byte vector.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
  {
      // First pack: 32-bit -> unsigned 16-bit.
      Vector128<ushort> words = Sse41.PackUnsignedSaturate(value, zero);

      // Second pack: 16-bit -> unsigned 8-bit.
      return Sse2.PackUnsignedSaturate(words.AsInt16(), zero.AsInt16());
  }
  /// <summary>
  /// SSE4.1 horizontal 8-tap filter for the unscaled case: one filter (selected by
  /// <paramref name="x0Q4"/>) is applied to the whole block, four output pixels per iteration.
  /// </summary>
  /// <remarks>
  /// Every inner iteration stores four bytes to <paramref name="dst"/>, so the caller is
  /// expected to pass a width that is a multiple of four.
  /// </remarks>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveHorizSse41(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int w,
      int h)
  {
      Vector128<int> zero = Vector128<int>.Zero;
      Vector128<int> const64 = Vector128.Create(64);

      ulong rowCount = (uint)h;
      ulong colCount = (uint)w;

      // Step back so the filter window is centred on the output pixel.
      src -= SubpelTaps / 2 - 1;

      fixed (Array8<short>* filterTable = xFilters)
      {
          // The sub-pixel phase picks the filter; each filter is eight shorts.
          short* taps = (short*)filterTable + (uint)(x0Q4 & SubpelMask) * 8;
          Vector128<short> vfilter = Sse2.LoadVector128(taps);

          // Whole-pixel part of the start position, identical for every row.
          ulong firstColumn = (uint)x0Q4 >> SubpelBits;

          for (ulong row = 0; row < rowCount; ++row)
          {
              for (ulong col = 0; col < colCount; col += 4)
              {
                  byte* window = &src[firstColumn + col];

                  // Four overlapping 8-byte windows, widened to 16 bits.
                  Vector128<short> vsrc0 = Sse41.ConvertToVector128Int16(window);
                  Vector128<short> vsrc1 = Sse41.ConvertToVector128Int16(window + 1);
                  Vector128<short> vsrc2 = Sse41.ConvertToVector128Int16(window + 2);
                  Vector128<short> vsrc3 = Sse41.ConvertToVector128Int16(window + 3);

                  Vector128<int> sums = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);
                  Vector128<byte> pixels = PackUnsignedSaturate(RoundShift(sums, const64), zero);

                  // The low 32 bits hold the four output pixels.
                  Sse.StoreScalar((float*)&dst[col], pixels.AsSingle());
              }

              src += srcStride;
              dst += dstStride;
          }
      }
  }
  /// <summary>
  /// Applies the 8-tap horizontal filter to a <paramref name="w"/> x <paramref name="h"/>
  /// block of 8-bit pixels, advancing the source position by <paramref name="xStepQ4"/> per
  /// output pixel.
  /// </summary>
  /// <param name="src">Source block; taps are read starting SubpelTaps / 2 - 1 pixels to its left.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="xFilters">Filter table indexed by the sub-pixel phase (position &amp; SubpelMask).</param>
  /// <param name="x0Q4">Start position of each row in fixed point.</param>
  /// <param name="xStepQ4">Fixed-point step between output pixels.</param>
  /// <param name="w">Block width in pixels.</param>
  /// <param name="h">Block height in pixels.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h)
  {
      // The SSE4.1 kernel handles only the unscaled step and always stores four pixels per
      // iteration. Dispatch to it only when the width is a multiple of four; any other
      // width takes the scalar loop below, which writes exactly w pixels per row.
      bool unitStep = xStepQ4 == 1 << SubpelBits;

      if (Sse41.IsSupported && UseIntrinsics && unitStep && (w & 3) == 0)
      {
          ConvolveHorizSse41(src, srcStride, dst, dstStride, xFilters, x0Q4, w, h);

          return;
      }

      int x, y;

      // Step back so the filter window is centred on the output pixel.
      src -= SubpelTaps / 2 - 1;

      for (y = 0; y < h; ++y)
      {
          int xQ4 = x0Q4;

          for (x = 0; x < w; ++x)
          {
              byte* srcX = &src[xQ4 >> SubpelBits];
              ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
              int k, sum = 0;

              for (k = 0; k < SubpelTaps; ++k)
              {
                  sum += srcX[k] * xFilter[k];
              }

              dst[x] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
              xQ4 += xStepQ4;
          }

          src += srcStride;
          dst += dstStride;
      }
  }
  /// <summary>
  /// Horizontal 8-tap filter whose result is averaged with the pixel already present in
  /// <paramref name="dst"/> (rounded via BitUtils.RoundPowerOfTwo with a shift of 1).
  /// </summary>
  private static unsafe void ConvolveAvgHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h)
  {
      // Step back so the filter window is centred on the output pixel.
      byte* srcRow = src - (SubpelTaps / 2 - 1);
      byte* dstRow = dst;

      for (int row = 0; row < h; row++)
      {
          int posQ4 = x0Q4;

          for (int col = 0; col < w; col++)
          {
              byte* window = srcRow + (posQ4 >> SubpelBits);
              ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; tap++)
              {
                  acc += window[tap] * kernel[tap];
              }

              dstRow[col] = (byte)BitUtils.RoundPowerOfTwo(
                  dstRow[col] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits)),
                  1);

              posQ4 += xStepQ4;
          }

          srcRow += srcStride;
          dstRow += dstStride;
      }
  }
  /// <summary>
  /// AVX2 vertical 8-tap filter for the unscaled case: one filter (selected by
  /// <paramref name="y0Q4"/>) is applied to the whole block, four output pixels per iteration.
  /// </summary>
  /// <remarks>
  /// Each iteration gathers four bytes from eight consecutive rows, transposes them into
  /// four 8-sample columns and stores four bytes to <paramref name="dst"/>, so the caller
  /// is expected to pass a width that is a multiple of four.
  /// </remarks>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveVertAvx2(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int w,
      int h)
  {
      Vector128<int> zero = Vector128<int>.Zero;
      Vector128<int> const64 = Vector128.Create(64);

      // Byte offsets of the eight tap rows relative to the first one.
      Vector256<int> rowOffsets = Vector256.Create(
          0,
          srcStride,
          srcStride * 2,
          srcStride * 3,
          srcStride * 4,
          srcStride * 5,
          srcStride * 6,
          srcStride * 7);

      ulong rowCount = (uint)h;
      ulong colCount = (uint)w;

      // Step back so the filter window is centred on the output row.
      src -= srcStride * (SubpelTaps / 2 - 1);

      fixed (Array8<short>* filterTable = yFilters)
      {
          // The sub-pixel phase picks the filter; each filter is eight shorts.
          short* taps = (short*)filterTable + (uint)(y0Q4 & SubpelMask) * 8;
          Vector128<short> vfilter = Sse2.LoadVector128(taps);

          // Whole-pixel part of the start row.
          ulong firstRow = (uint)y0Q4 >> SubpelBits;

          for (ulong row = 0; row < rowCount; ++row)
          {
              ulong rowStart = (firstRow + row) * (uint)srcStride;

              for (ulong col = 0; col < colCount; col += 4)
              {
                  // One 32-bit element (four pixels) from each of the eight rows.
                  Vector256<int> gathered = Avx2.GatherVector256((uint*)&src[rowStart + col], rowOffsets, 1).AsInt32();
                  Vector128<byte> rows0To3 = gathered.GetLower().AsByte();
                  Vector128<byte> rows4To7 = gathered.GetUpper().AsByte();

                  // Three rounds of byte interleaving transpose the 8x4 tile so that each
                  // 64-bit half holds the eight vertical samples of one column.
                  Vector128<byte> mixLo1 = Sse2.UnpackLow(rows0To3, rows4To7);
                  Vector128<byte> mixHi1 = Sse2.UnpackHigh(rows0To3, rows4To7);
                  Vector128<byte> mixLo2 = Sse2.UnpackLow(mixLo1, mixHi1);
                  Vector128<byte> mixHi2 = Sse2.UnpackHigh(mixLo1, mixHi1);
                  Vector128<byte> cols01 = Sse2.UnpackLow(mixLo2, mixHi2);
                  Vector128<byte> cols23 = Sse2.UnpackHigh(mixLo2, mixHi2);

                  // Bring columns 1 and 3 down into the low half for widening.
                  Vector128<byte> cols11 = Sse.MoveHighToLow(cols01.AsSingle(), cols01.AsSingle()).AsByte();
                  Vector128<byte> cols33 = Sse.MoveHighToLow(cols23.AsSingle(), cols23.AsSingle()).AsByte();

                  Vector128<short> vsrc0 = Sse41.ConvertToVector128Int16(cols01);
                  Vector128<short> vsrc1 = Sse41.ConvertToVector128Int16(cols11);
                  Vector128<short> vsrc2 = Sse41.ConvertToVector128Int16(cols23);
                  Vector128<short> vsrc3 = Sse41.ConvertToVector128Int16(cols33);

                  Vector128<int> sums = MultiplyAddAdjacent(vsrc0, vsrc1, vsrc2, vsrc3, vfilter, zero);
                  Vector128<byte> pixels = PackUnsignedSaturate(RoundShift(sums, const64), zero);

                  // The low 32 bits hold the four output pixels.
                  Sse.StoreScalar((float*)&dst[col], pixels.AsSingle());
              }

              dst += dstStride;
          }
      }
  }
  /// <summary>
  /// Applies the 8-tap vertical filter to a <paramref name="w"/> x <paramref name="h"/>
  /// block of 8-bit pixels, advancing the source position by <paramref name="yStepQ4"/> per
  /// output row.
  /// </summary>
  /// <param name="src">Source block; taps are read starting SubpelTaps / 2 - 1 rows above it.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="yFilters">Filter table indexed by the sub-pixel phase (position &amp; SubpelMask).</param>
  /// <param name="y0Q4">Start position of each column in fixed point.</param>
  /// <param name="yStepQ4">Fixed-point step between output rows.</param>
  /// <param name="w">Block width in pixels.</param>
  /// <param name="h">Block height in pixels.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // The AVX2 kernel handles only the unscaled step and always gathers and stores four
      // pixels per iteration. Dispatch to it only when the width is a multiple of four;
      // any other width takes the scalar loop below, which touches exactly w columns.
      bool unitStep = yStepQ4 == 1 << SubpelBits;

      if (Avx2.IsSupported && UseIntrinsics && unitStep && (w & 3) == 0)
      {
          ConvolveVertAvx2(src, srcStride, dst, dstStride, yFilters, y0Q4, w, h);

          return;
      }

      int x, y;

      // Step back so the filter window is centred on the output row.
      src -= srcStride * (SubpelTaps / 2 - 1);

      for (x = 0; x < w; ++x)
      {
          int yQ4 = y0Q4;

          for (y = 0; y < h; ++y)
          {
              byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
              ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
              int k, sum = 0;

              for (k = 0; k < SubpelTaps; ++k)
              {
                  sum += srcY[k * srcStride] * yFilter[k];
              }

              dst[y * dstStride] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
              yQ4 += yStepQ4;
          }

          // Move on to the next column.
          ++src;
          ++dst;
      }
  }
  /// <summary>
  /// Vertical 8-tap filter whose result is averaged with the pixel already present in
  /// <paramref name="dst"/> (rounded via BitUtils.RoundPowerOfTwo with a shift of 1).
  /// The block is processed column by column.
  /// </summary>
  private static unsafe void ConvolveAvgVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // Step back so the filter window is centred on the output row.
      byte* srcCol = src - srcStride * (SubpelTaps / 2 - 1);
      byte* dstCol = dst;

      for (int col = 0; col < w; col++)
      {
          int posQ4 = y0Q4;

          for (int row = 0; row < h; row++)
          {
              byte* window = srcCol + (posQ4 >> SubpelBits) * srcStride;
              ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; tap++)
              {
                  acc += window[tap * srcStride] * kernel[tap];
              }

              dstCol[row * dstStride] = (byte)BitUtils.RoundPowerOfTwo(
                  dstCol[row * dstStride] + BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits)),
                  1);

              posQ4 += yStepQ4;
          }

          srcCol++;
          dstCol++;
      }
  }
  /// <summary>
  /// Horizontal-only 8-tap filtering of an 8-bit block; forwards to ConvolveHoriz.
  /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are accepted but not used.
  /// </summary>
  public static unsafe void Convolve8Horiz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
  /// <summary>
  /// Horizontal-only 8-tap filtering averaged into the destination; forwards to
  /// ConvolveAvgHoriz. <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are accepted
  /// but not used.
  /// </summary>
  public static unsafe void Convolve8AvgHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
  /// <summary>
  /// Vertical-only 8-tap filtering of an 8-bit block; forwards to ConvolveVert.
  /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are accepted but not used.
  /// </summary>
  public static unsafe void Convolve8Vert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Vertical-only 8-tap filtering averaged into the destination; forwards to
  /// ConvolveAvgVert. <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are accepted
  /// but not used.
  /// </summary>
  public static unsafe void Convolve8AvgVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Two-dimensional 8-tap filtering of an 8-bit block: a horizontal pass into a stack
  /// scratch buffer followed by a vertical pass into <paramref name="dst"/>.
  /// </summary>
  /// <param name="src">Source block.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="filter">Filter table used for both passes.</param>
  /// <param name="x0Q4">Horizontal start position in fixed point.</param>
  /// <param name="xStepQ4">Horizontal fixed-point step.</param>
  /// <param name="y0Q4">Vertical start position in fixed point.</param>
  /// <param name="yStepQ4">Vertical fixed-point step.</param>
  /// <param name="w">Block width in pixels (at most 64).</param>
  /// <param name="h">Block height in pixels (at most 64).</param>
  [SkipLocalsInit]
  public static unsafe void Convolve8(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // Note: Fixed size intermediate buffer, temp, places limits on parameters.
      // 2d filtering proceeds in 2 steps:
      // (1) Interpolate horizontally into an intermediate buffer, temp.
      // (2) Interpolate temp vertically to derive the sub-pixel result.
      // Deriving the maximum number of rows in the temp buffer (135):
      // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
      // --Largest block size is 64x64 pixels.
      // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
      // original frame (in 1/16th pixel units).
      // --Must round-up because block may be located at sub-pixel position.
      // --Require an additional SubpelTaps rows for the 8-tap filter tails.
      // --Rounded up: ceil(((64 - 1) * 32 + 15) / 16) + 8 = 127 + 8 = 135.
      //   (Evaluated with a truncating shift, (((64 - 1) * 32 + 15) >> 4) + 8 is 134,
      //   so the buffer has one spare row for that case.)
      // When calling in frame scaling function, the smallest scaling factor is x1/4
      // ==> yStepQ4 = 64. Since w and h are at most 16, the temp buffer is still
      // big enough.
      const int TempStride = 64;
      const int MaxTempRows = 135;

      byte* temp = stackalloc byte[TempStride * MaxTempRows];
      int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);
      Debug.Assert(yStepQ4 <= 32 || (yStepQ4 <= 64 && h <= 32));
      Debug.Assert(xStepQ4 <= 64);

      // Check the row bound directly: the asserts above do not constrain y0Q4, which
      // also feeds into the number of rows written to temp.
      Debug.Assert(intermediateHeight <= MaxTempRows);

      // Start SubpelTaps / 2 - 1 rows above the block so the vertical pass has its upper taps.
      ConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, TempStride, filter, x0Q4, xStepQ4, w, intermediateHeight);
      ConvolveVert(temp + TempStride * (SubpelTaps / 2 - 1), TempStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
  }
  /// <summary>
  /// Two-dimensional 8-tap filtering whose result is averaged into <paramref name="dst"/>:
  /// Convolve8 writes the filtered block to a stack scratch buffer, then ConvolveAvg
  /// blends that buffer with the destination.
  /// </summary>
  /// <param name="w">Block width in pixels (at most 64).</param>
  /// <param name="h">Block height in pixels (at most 64).</param>
  // SkipLocalsInit (as on Convolve8): the w x h region of temp read by ConvolveAvg is
  // exactly the region Convolve8 writes first, so zeroing the scratch buffer is wasted work.
  [SkipLocalsInit]
  public static unsafe void Convolve8Avg(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // Fixed size intermediate buffer places limits on parameters.
      const int TempStride = 64;

      byte* temp = stackalloc byte[TempStride * 64];

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);

      Convolve8(src, srcStride, temp, TempStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);

      // The filter and position arguments are not used by ConvolveAvg.
      ConvolveAvg(temp, TempStride, dst, dstStride, null, 0, 0, 0, 0, w, h);
  }
  /// <summary>
  /// Copies a <paramref name="w"/> x <paramref name="h"/> block of 8-bit pixels row by row
  /// via MemoryUtil.Copy. The filter and position arguments are accepted but not used.
  /// </summary>
  public static unsafe void ConvolveCopy(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      int rowsLeft = h;

      while (rowsLeft > 0)
      {
          MemoryUtil.Copy(dst, src, w);

          src += srcStride;
          dst += dstStride;
          rowsLeft--;
      }
  }
  /// <summary>
  /// Averages each source pixel with the corresponding destination pixel (rounded via
  /// BitUtils.RoundPowerOfTwo with a shift of 1) and stores the result in
  /// <paramref name="dst"/>. The filter and position arguments are accepted but not used.
  /// </summary>
  public static unsafe void ConvolveAvg(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      byte* srcRow = src;
      byte* dstRow = dst;

      for (int row = 0; row < h; row++)
      {
          for (int col = 0; col < w; col++)
          {
              int pairSum = dstRow[col] + srcRow[col];
              dstRow[col] = (byte)BitUtils.RoundPowerOfTwo(pairSum, 1);
          }

          srcRow += srcStride;
          dstRow += dstStride;
      }
  }
  /// <summary>
  /// Scaled horizontal filtering entry point; forwards every argument unchanged to
  /// Convolve8Horiz.
  /// </summary>
  public static unsafe void ScaledHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8Horiz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled vertical filtering entry point; forwards every argument unchanged to
  /// Convolve8Vert.
  /// </summary>
  public static unsafe void ScaledVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8Vert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled two-dimensional filtering entry point; forwards every argument unchanged to
  /// Convolve8.
  /// </summary>
  public static unsafe void Scaled2D(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled horizontal filtering with destination averaging; forwards every argument
  /// unchanged to Convolve8AvgHoriz.
  /// </summary>
  public static unsafe void ScaledAvgHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8AvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled vertical filtering with destination averaging; forwards every argument
  /// unchanged to Convolve8AvgVert.
  /// </summary>
  public static unsafe void ScaledAvgVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8AvgVert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled two-dimensional filtering with destination averaging; forwards every argument
  /// unchanged to Convolve8Avg.
  /// </summary>
  public static unsafe void ScaledAvg2D(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8Avg(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Horizontal 8-tap filter for 16-bit samples. Each result is rounded by FilterBits and
  /// clipped by BitUtils.ClipPixelHighbd for bit depth <paramref name="bd"/>.
  /// </summary>
  private static unsafe void HighbdConvolveHoriz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h,
      int bd)
  {
      // Step back so the filter window is centred on the output sample.
      ushort* srcRow = src - (SubpelTaps / 2 - 1);
      ushort* dstRow = dst;

      for (int row = 0; row < h; row++)
      {
          int posQ4 = x0Q4;

          for (int col = 0; col < w; col++)
          {
              ushort* window = srcRow + (posQ4 >> SubpelBits);
              ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; tap++)
              {
                  acc += window[tap] * kernel[tap];
              }

              dstRow[col] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
              posQ4 += xStepQ4;
          }

          srcRow += srcStride;
          dstRow += dstStride;
      }
  }
  /// <summary>
  /// Horizontal 8-tap filter for 16-bit samples whose clipped result is averaged with the
  /// sample already present in <paramref name="dst"/>.
  /// </summary>
  private static unsafe void HighbdConvolveAvgHoriz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h,
      int bd)
  {
      // Step back so the filter window is centred on the output sample.
      ushort* srcRow = src - (SubpelTaps / 2 - 1);
      ushort* dstRow = dst;

      for (int row = 0; row < h; row++)
      {
          int posQ4 = x0Q4;

          for (int col = 0; col < w; col++)
          {
              ushort* window = srcRow + (posQ4 >> SubpelBits);
              ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; tap++)
              {
                  acc += window[tap] * kernel[tap];
              }

              dstRow[col] = (ushort)BitUtils.RoundPowerOfTwo(
                  dstRow[col] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd),
                  1);

              posQ4 += xStepQ4;
          }

          srcRow += srcStride;
          dstRow += dstStride;
      }
  }
  /// <summary>
  /// Vertical 8-tap filter for 16-bit samples, processed column by column. Each result is
  /// rounded by FilterBits and clipped by BitUtils.ClipPixelHighbd for bit depth
  /// <paramref name="bd"/>.
  /// </summary>
  private static unsafe void HighbdConvolveVert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // Step back so the filter window is centred on the output row.
      ushort* srcCol = src - srcStride * (SubpelTaps / 2 - 1);
      ushort* dstCol = dst;

      for (int col = 0; col < w; col++)
      {
          int posQ4 = y0Q4;

          for (int row = 0; row < h; row++)
          {
              ushort* window = srcCol + (posQ4 >> SubpelBits) * srcStride;
              ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; tap++)
              {
                  acc += window[tap * srcStride] * kernel[tap];
              }

              dstCol[row * dstStride] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
              posQ4 += yStepQ4;
          }

          srcCol++;
          dstCol++;
      }
  }
  /// <summary>
  /// Vertical 8-tap filter for 16-bit samples whose clipped result is averaged with the
  /// sample already present in <paramref name="dst"/>. Processed column by column.
  /// </summary>
  private static unsafe void HighConvolveAvgVert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // Step back so the filter window is centred on the output row.
      ushort* srcCol = src - srcStride * (SubpelTaps / 2 - 1);
      ushort* dstCol = dst;

      for (int col = 0; col < w; col++)
      {
          int posQ4 = y0Q4;

          for (int row = 0; row < h; row++)
          {
              ushort* window = srcCol + (posQ4 >> SubpelBits) * srcStride;
              ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];

              int acc = 0;
              for (int tap = 0; tap < SubpelTaps; tap++)
              {
                  acc += window[tap * srcStride] * kernel[tap];
              }

              dstCol[row * dstStride] = (ushort)BitUtils.RoundPowerOfTwo(
                  dstCol[row * dstStride] + BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd),
                  1);

              posQ4 += yStepQ4;
          }

          srcCol++;
          dstCol++;
      }
  }
  /// <summary>
  /// Two-dimensional 8-tap filtering of a 16-bit block: a horizontal pass into a stack
  /// scratch buffer followed by a vertical pass into <paramref name="dst"/>.
  /// </summary>
  /// <param name="w">Block width in samples (at most 64).</param>
  /// <param name="h">Block height in samples (at most 64).</param>
  /// <param name="bd">Bit depth passed to BitUtils.ClipPixelHighbd by both passes.</param>
  // SkipLocalsInit (as on Convolve8): every temp sample the vertical pass reads is written
  // by the horizontal pass first, so zeroing the scratch buffer on each call is wasted work.
  [SkipLocalsInit]
  private static unsafe void HighbdConvolve(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // Note: Fixed size intermediate buffer, temp, places limits on parameters.
      // 2d filtering proceeds in 2 steps:
      // (1) Interpolate horizontally into an intermediate buffer, temp.
      // (2) Interpolate temp vertically to derive the sub-pixel result.
      // Deriving the maximum number of rows in the temp buffer (135):
      // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
      // --Largest block size is 64x64 pixels.
      // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
      // original frame (in 1/16th pixel units).
      // --Must round-up because block may be located at sub-pixel position.
      // --Require an additional SubpelTaps rows for the 8-tap filter tails.
      // --Rounded up: ceil(((64 - 1) * 32 + 15) / 16) + 8 = 127 + 8 = 135.
      //   (Evaluated with a truncating shift, (((64 - 1) * 32 + 15) >> 4) + 8 is 134,
      //   so the buffer has one spare row for that case.)
      const int TempStride = 64;
      const int MaxTempRows = 135;

      ushort* temp = stackalloc ushort[TempStride * MaxTempRows];
      int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);
      Debug.Assert(yStepQ4 <= 32);
      Debug.Assert(xStepQ4 <= 32);

      // Check the row bound directly: the asserts above do not constrain y0Q4, which
      // also feeds into the number of rows written to temp.
      Debug.Assert(intermediateHeight <= MaxTempRows);

      // Start SubpelTaps / 2 - 1 rows above the block so the vertical pass has its upper taps.
      HighbdConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, TempStride, filter, x0Q4, xStepQ4, w, intermediateHeight, bd);
      HighbdConvolveVert(temp + TempStride * (SubpelTaps / 2 - 1), TempStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
  }
  /// <summary>
  /// Horizontal-only 8-tap filtering of a 16-bit block; forwards to HighbdConvolveHoriz.
  /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are accepted but not used.
  /// </summary>
  public static unsafe void HighbdConvolve8Horiz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
  /// <summary>
  /// Horizontal-only 8-tap filtering of a 16-bit block averaged into the destination;
  /// forwards to HighbdConvolveAvgHoriz. <paramref name="y0Q4"/> and
  /// <paramref name="yStepQ4"/> are accepted but not used.
  /// </summary>
  public static unsafe void HighbdConvolve8AvgHoriz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
  /// <summary>
  /// Vertical-only 8-tap filtering of a 16-bit block; forwards to HighbdConvolveVert.
  /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are accepted but not used.
  /// </summary>
  public static unsafe void HighbdConvolve8Vert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
  /// <summary>
  /// Vertical-only 8-tap filtering of a 16-bit block averaged into the destination;
  /// forwards to HighConvolveAvgVert. <paramref name="x0Q4"/> and
  /// <paramref name="xStepQ4"/> are accepted but not used.
  /// </summary>
  public static unsafe void HighbdConvolve8AvgVert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
  /// <summary>
  /// Two-dimensional 8-tap filtering of a 16-bit block; forwards every argument unchanged
  /// to HighbdConvolve.
  /// </summary>
  public static unsafe void HighbdConvolve8(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolve(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
  /// <summary>
  /// Two-dimensional 8-tap filtering of a 16-bit block whose result is averaged into
  /// <paramref name="dst"/>: HighbdConvolve8 writes the filtered block to a stack scratch
  /// buffer, then HighbdConvolveAvg blends that buffer with the destination.
  /// </summary>
  /// <param name="w">Block width in samples (at most 64).</param>
  /// <param name="h">Block height in samples (at most 64).</param>
  /// <param name="bd">Bit depth forwarded to both helper calls.</param>
  // SkipLocalsInit (as on Convolve8): the w x h region of temp read by HighbdConvolveAvg is
  // exactly the region HighbdConvolve8 writes first, so zeroing the buffer is wasted work.
  [SkipLocalsInit]
  public static unsafe void HighbdConvolve8Avg(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // Fixed size intermediate buffer places limits on parameters.
      const int TempStride = 64;

      ushort* temp = stackalloc ushort[TempStride * 64];

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);

      HighbdConvolve8(src, srcStride, temp, TempStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);

      // The filter and position arguments are not used by HighbdConvolveAvg.
      HighbdConvolveAvg(temp, TempStride, dst, dstStride, null, 0, 0, 0, 0, w, h, bd);
  }
  /// <summary>
  /// Copies a <paramref name="w"/> x <paramref name="h"/> block of 16-bit samples row by
  /// row via MemoryUtil.Copy. The filter, position and bit-depth arguments are accepted
  /// but not used.
  /// </summary>
  public static unsafe void HighbdConvolveCopy(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      int rowsLeft = h;

      while (rowsLeft > 0)
      {
          MemoryUtil.Copy(dst, src, w);

          src += srcStride;
          dst += dstStride;
          rowsLeft--;
      }
  }
  /// <summary>
  /// Averages each 16-bit source sample with the corresponding destination sample (rounded
  /// via BitUtils.RoundPowerOfTwo with a shift of 1) and stores the result in
  /// <paramref name="dst"/>. The filter, position and bit-depth arguments are accepted but
  /// not used.
  /// </summary>
  public static unsafe void HighbdConvolveAvg(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      ushort* srcRow = src;
      ushort* dstRow = dst;

      for (int row = 0; row < h; row++)
      {
          for (int col = 0; col < w; col++)
          {
              int pairSum = dstRow[col] + srcRow[col];
              dstRow[col] = (ushort)BitUtils.RoundPowerOfTwo(pairSum, 1);
          }

          srcRow += srcStride;
          dstRow += dstStride;
      }
  }
  855. }
  856. }