Convolve.cs 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943
  1. using Ryujinx.Common.Memory;
  2. using Ryujinx.Graphics.Nvdec.Vp9.Common;
  3. using System.Diagnostics;
  4. using System.Runtime.CompilerServices;
  5. using System.Runtime.Intrinsics;
  6. using System.Runtime.Intrinsics.X86;
  7. using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.Filter;
  8. namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
  9. {
  10. internal static class Convolve
  11. {
  /// <summary>
  /// Gate for the hardware-intrinsic code paths. The dispatching methods combine it
  /// with the matching <c>IsSupported</c> check before taking a vectorized route.
  /// </summary>
  private const bool UseIntrinsics = true;

  /// <summary>
  /// Computes four dot products in one call: each source vector is multiplied
  /// element-wise with <paramref name="vfilter"/> and its eight products are summed
  /// into one 32-bit value.
  /// </summary>
  /// <param name="vsrc0">Eight 16-bit samples producing result lane 0.</param>
  /// <param name="vsrc1">Eight 16-bit samples producing result lane 1.</param>
  /// <param name="vsrc2">Eight 16-bit samples producing result lane 2.</param>
  /// <param name="vsrc3">Eight 16-bit samples producing result lane 3.</param>
  /// <param name="vfilter">Eight 16-bit filter coefficients shared by all four inputs.</param>
  /// <param name="zero">Second operand of the horizontal adds; the callers in this file pass an all-zero vector.</param>
  /// <returns>The four sums, with lane 0 holding the sum for <paramref name="vsrc0"/>.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static Vector128<int> MultiplyAddAdjacent(
      Vector128<short> vsrc0,
      Vector128<short> vsrc1,
      Vector128<short> vsrc2,
      Vector128<short> vsrc3,
      Vector128<short> vfilter,
      Vector128<int> zero)
  {
      // The multiply-add yields four partial sums per input. Two horizontal adds
      // against `zero` fold them into lane 0: < s, s, s, s > -> < 0, 0, s, s > -> < 0, 0, 0, s >.
      Vector128<int> dot0 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc0, vfilter), zero), zero);
      Vector128<int> dot1 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc1, vfilter), zero), zero);
      Vector128<int> dot2 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc2, vfilter), zero), zero);
      Vector128<int> dot3 = Ssse3.HorizontalAdd(Ssse3.HorizontalAdd(Sse2.MultiplyAddAdjacent(vsrc3, vfilter), zero), zero);

      // < 0, 0, dot1, dot0 > and < 0, 0, dot3, dot2 >
      Vector128<float> lowPair = Sse2.UnpackLow(dot0, dot1).AsSingle();
      Vector128<float> highPair = Sse2.UnpackLow(dot2, dot3).AsSingle();

      // < dot3, dot2, dot1, dot0 >
      return Sse.MoveLowToHigh(lowPair, highPair).AsInt32();
  }
  /// <summary>
  /// Adds <paramref name="const64"/> to every lane of <paramref name="value"/> and then
  /// shifts each lane right arithmetically by <c>FilterBits</c>.
  /// </summary>
  /// <param name="value">Four 32-bit filter sums.</param>
  /// <param name="const64">Bias added before the shift; the callers in this file pass 64 in every lane.</param>
  /// <returns>The biased and shifted lanes.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static Vector128<int> RoundShift(Vector128<int> value, Vector128<int> const64)
  {
      Vector128<int> biased = Sse2.Add(value, const64);

      return Sse2.ShiftRightArithmetic(biased, FilterBits);
  }
  /// <summary>
  /// Narrows four 32-bit lanes to bytes with unsigned saturation (values below 0
  /// become 0, values above 255 become 255).
  /// </summary>
  /// <param name="value">Four signed 32-bit values to narrow.</param>
  /// <param name="zero">Filler for the upper half of each pack; the callers in this file pass an all-zero vector.</param>
  /// <returns>A vector whose first four bytes are the narrowed values.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static Vector128<byte> PackUnsignedSaturate(Vector128<int> value, Vector128<int> zero)
  {
      // First pack: 32-bit -> unsigned 16-bit. Second pack: 16-bit -> unsigned 8-bit.
      Vector128<short> words = Sse41.PackUnsignedSaturate(value, zero).AsInt16();

      return Sse2.PackUnsignedSaturate(words, zero.AsInt16());
  }
  /// <summary>
  /// SSE4.1 horizontal filter for a unit step: a single kernel, selected by the
  /// fractional bits of <paramref name="x0Q4"/>, is applied to every pixel, and four
  /// output pixels are produced per inner iteration.
  /// </summary>
  /// <remarks>
  /// Every inner iteration stores 4 bytes to <paramref name="dst"/>, so the written
  /// width is effectively <paramref name="w"/> rounded up to a multiple of 4.
  /// </remarks>
  /// <param name="src">Source pixel at the block origin (the filter window starts before it).</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block origin.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="xFilters">Kernel table indexed by the fractional position.</param>
  /// <param name="x0Q4">Fixed-point start position; integer part above <c>SubpelBits</c>.</param>
  /// <param name="w">Block width in pixels.</param>
  /// <param name="h">Block height in rows.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveHorizSse41(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int w,
      int h)
  {
      Vector128<int> zero = Vector128<int>.Zero;
      Vector128<int> rounding = Vector128.Create(64);
      uint width = (uint)w;
      uint height = (uint)h;

      // Step back to the first tap of the filter window.
      src -= SubpelTaps / 2 - 1;

      // The integer part of the start position does not change between rows.
      ulong startColumn = (uint)x0Q4 >> SubpelBits;

      fixed (Array8<short>* filterTable = xFilters)
      {
          // Each table entry spans 8 shorts; pick the one matching the fractional position.
          Vector128<short> kernel = Sse2.LoadVector128((short*)filterTable + (uint)(x0Q4 & SubpelMask) * 8);

          for (ulong row = 0; row < height; ++row)
          {
              for (ulong col = 0; col < width; col += 4)
              {
                  byte* window = src + startColumn + col;

                  // Four overlapping 8-byte windows, each widened to 16-bit lanes.
                  Vector128<short> taps0 = Sse41.ConvertToVector128Int16(window);
                  Vector128<short> taps1 = Sse41.ConvertToVector128Int16(window + 1);
                  Vector128<short> taps2 = Sse41.ConvertToVector128Int16(window + 2);
                  Vector128<short> taps3 = Sse41.ConvertToVector128Int16(window + 3);

                  Vector128<int> sums = MultiplyAddAdjacent(taps0, taps1, taps2, taps3, kernel, zero);
                  Vector128<byte> pixels = PackUnsignedSaturate(RoundShift(sums, rounding), zero);

                  // The low 32 bits hold the four result bytes.
                  Sse.StoreScalar((float*)(dst + col), pixels.AsSingle());
              }

              src += srcStride;
              dst += dstStride;
          }
      }
  }
  /// <summary>
  /// Horizontal filter: every output pixel is the rounded, clipped dot product of
  /// <c>SubpelTaps</c> source pixels with the kernel selected by the fractional bits
  /// of the current position.
  /// </summary>
  /// <remarks>
  /// Uses the SSE4.1 kernel when the step is exactly one pixel and the width is a
  /// multiple of 4; otherwise falls back to the scalar loop. The width condition
  /// matters because the SSE4.1 kernel stores four pixels at a time and would write
  /// past column <paramref name="w"/> for other widths.
  /// </remarks>
  /// <param name="src">Source pixel at the block origin (the filter window starts before it).</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block origin.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="xFilters">Kernel table indexed by the fractional position.</param>
  /// <param name="x0Q4">Fixed-point start position of each row.</param>
  /// <param name="xStepQ4">Fixed-point advance per output pixel.</param>
  /// <param name="w">Block width in pixels.</param>
  /// <param name="h">Block height in rows.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h)
  {
      // (w & 3) == 0: the vector kernel emits 4 pixels per store, so any other
      // width must take the scalar path to avoid writing beyond the block.
      if (Sse41.IsSupported && UseIntrinsics && xStepQ4 == 1 << SubpelBits && (w & 3) == 0)
      {
          ConvolveHorizSse41(src, srcStride, dst, dstStride, xFilters, x0Q4, w, h);

          return;
      }

      // Step back to the first tap of the filter window.
      src -= SubpelTaps / 2 - 1;

      for (int y = 0; y < h; ++y)
      {
          int xQ4 = x0Q4;

          for (int x = 0; x < w; ++x)
          {
              byte* srcX = &src[xQ4 >> SubpelBits];
              ref Array8<short> xFilter = ref xFilters[xQ4 & SubpelMask];
              int sum = 0;

              for (int k = 0; k < SubpelTaps; ++k)
              {
                  sum += srcX[k] * xFilter[k];
              }

              dst[x] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
              xQ4 += xStepQ4;
          }

          src += srcStride;
          dst += dstStride;
      }
  }
  /// <summary>
  /// Horizontal filter whose result is averaged into the destination: each filtered,
  /// clipped pixel is combined with the existing <paramref name="dst"/> value through
  /// <c>RoundPowerOfTwo(existing + filtered, 1)</c>.
  /// </summary>
  /// <param name="src">Source pixel at the block origin (the filter window starts before it).</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block origin; read and overwritten.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="xFilters">Kernel table indexed by the fractional position.</param>
  /// <param name="x0Q4">Fixed-point start position of each row.</param>
  /// <param name="xStepQ4">Fixed-point advance per output pixel.</param>
  /// <param name="w">Block width in pixels.</param>
  /// <param name="h">Block height in rows.</param>
  private static unsafe void ConvolveAvgHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h)
  {
      // Start at the first tap of the filter window.
      byte* srcRow = src - (SubpelTaps / 2 - 1);
      byte* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          int posQ4 = x0Q4;

          for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
          {
              byte* taps = srcRow + (posQ4 >> SubpelBits);
              ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];
              int acc = 0;

              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += taps[tap] * kernel[tap];
              }

              byte filtered = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
              dstRow[col] = (byte)BitUtils.RoundPowerOfTwo(dstRow[col] + filtered, 1);
          }
      }
  }
  /// <summary>
  /// AVX2 vertical filter for a unit step: a single kernel, selected by the
  /// fractional bits of <paramref name="y0Q4"/>, is applied to every pixel, and four
  /// horizontally adjacent output pixels are produced per inner iteration.
  /// </summary>
  /// <remarks>
  /// Each inner iteration gathers 4 bytes from each of 8 source rows and stores
  /// 4 bytes to <paramref name="dst"/>, so the processed width is effectively
  /// <paramref name="w"/> rounded up to a multiple of 4.
  /// </remarks>
  /// <param name="src">Source pixel at the block origin (the filter window starts above it).</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block origin.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="yFilters">Kernel table indexed by the fractional position.</param>
  /// <param name="y0Q4">Fixed-point start position; integer part above <c>SubpelBits</c>.</param>
  /// <param name="w">Block width in pixels.</param>
  /// <param name="h">Block height in rows.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveVertAvx2(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int w,
      int h)
  {
      Vector128<int> zero = Vector128<int>.Zero;
      Vector128<int> rounding = Vector128.Create(64);

      // Byte offsets of the eight vertically adjacent tap rows.
      Vector256<int> rowOffsets = Vector256.Create(
          0,
          srcStride,
          srcStride * 2,
          srcStride * 3,
          srcStride * 4,
          srcStride * 5,
          srcStride * 6,
          srcStride * 7);

      uint width = (uint)w;
      uint height = (uint)h;

      // Step up to the first tap row of the filter window.
      src -= srcStride * (SubpelTaps / 2 - 1);

      fixed (Array8<short>* filterTable = yFilters)
      {
          // Each table entry spans 8 shorts; pick the one matching the fractional position.
          Vector128<short> kernel = Sse2.LoadVector128((short*)filterTable + (uint)(y0Q4 & SubpelMask) * 8);
          ulong firstRow = (uint)y0Q4 >> SubpelBits;

          for (ulong row = 0; row < height; ++row)
          {
              ulong rowStart = (firstRow + row) * (uint)srcStride;

              for (ulong col = 0; col < width; col += 4)
              {
                  // One 32-bit element (4 pixels) from each of the 8 tap rows.
                  Vector256<int> gathered = Avx2.GatherVector256((uint*)(src + rowStart + col), rowOffsets, 1).AsInt32();
                  Vector128<byte> rows0123 = gathered.GetLower().AsByte();
                  Vector128<byte> rows4567 = gathered.GetUpper().AsByte();

                  // Three rounds of byte interleaving transpose the 8x4 tile so that
                  // the 8 taps of each output column become contiguous.
                  Vector128<byte> round1Lo = Sse2.UnpackLow(rows0123, rows4567);
                  Vector128<byte> round1Hi = Sse2.UnpackHigh(rows0123, rows4567);
                  Vector128<byte> round2Lo = Sse2.UnpackLow(round1Lo, round1Hi);
                  Vector128<byte> round2Hi = Sse2.UnpackHigh(round1Lo, round1Hi);
                  Vector128<byte> columns01 = Sse2.UnpackLow(round2Lo, round2Hi);
                  Vector128<byte> columns23 = Sse2.UnpackHigh(round2Lo, round2Hi);

                  // Bring the upper 8 bytes (columns 1 and 3) down so they can be widened.
                  Vector128<byte> columns11 = Sse.MoveHighToLow(columns01.AsSingle(), columns01.AsSingle()).AsByte();
                  Vector128<byte> columns33 = Sse.MoveHighToLow(columns23.AsSingle(), columns23.AsSingle()).AsByte();

                  Vector128<short> taps0 = Sse41.ConvertToVector128Int16(columns01);
                  Vector128<short> taps1 = Sse41.ConvertToVector128Int16(columns11);
                  Vector128<short> taps2 = Sse41.ConvertToVector128Int16(columns23);
                  Vector128<short> taps3 = Sse41.ConvertToVector128Int16(columns33);

                  Vector128<int> sums = MultiplyAddAdjacent(taps0, taps1, taps2, taps3, kernel, zero);
                  Vector128<byte> pixels = PackUnsignedSaturate(RoundShift(sums, rounding), zero);

                  // The low 32 bits hold the four result bytes.
                  Sse.StoreScalar((float*)(dst + col), pixels.AsSingle());
              }

              dst += dstStride;
          }
      }
  }
  /// <summary>
  /// Vertical filter: every output pixel is the rounded, clipped dot product of
  /// <c>SubpelTaps</c> vertically adjacent source pixels with the kernel selected by
  /// the fractional bits of the current position.
  /// </summary>
  /// <remarks>
  /// Uses the AVX2 kernel when the step is exactly one pixel and the width is a
  /// multiple of 4; otherwise falls back to the scalar loop. The width condition
  /// matters because the AVX2 kernel reads and stores four pixels at a time and
  /// would write past column <paramref name="w"/> for other widths.
  /// </remarks>
  /// <param name="src">Source pixel at the block origin (the filter window starts above it).</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block origin.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="yFilters">Kernel table indexed by the fractional position.</param>
  /// <param name="y0Q4">Fixed-point start position of each column.</param>
  /// <param name="yStepQ4">Fixed-point advance per output row.</param>
  /// <param name="w">Block width in pixels.</param>
  /// <param name="h">Block height in rows.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static unsafe void ConvolveVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // (w & 3) == 0: the vector kernel emits 4 pixels per store, so any other
      // width must take the scalar path to avoid writing beyond the block.
      if (Avx2.IsSupported && UseIntrinsics && yStepQ4 == 1 << SubpelBits && (w & 3) == 0)
      {
          ConvolveVertAvx2(src, srcStride, dst, dstStride, yFilters, y0Q4, w, h);

          return;
      }

      // Step up to the first tap row of the filter window.
      src -= srcStride * (SubpelTaps / 2 - 1);

      // The scalar path walks the block column by column.
      for (int x = 0; x < w; ++x)
      {
          int yQ4 = y0Q4;

          for (int y = 0; y < h; ++y)
          {
              byte* srcY = &src[(yQ4 >> SubpelBits) * srcStride];
              ref Array8<short> yFilter = ref yFilters[yQ4 & SubpelMask];
              int sum = 0;

              for (int k = 0; k < SubpelTaps; ++k)
              {
                  sum += srcY[k * srcStride] * yFilter[k];
              }

              dst[y * dstStride] = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(sum, FilterBits));
              yQ4 += yStepQ4;
          }

          ++src;
          ++dst;
      }
  }
  /// <summary>
  /// Vertical filter whose result is averaged into the destination: each filtered,
  /// clipped pixel is combined with the existing <paramref name="dst"/> value through
  /// <c>RoundPowerOfTwo(existing + filtered, 1)</c>.
  /// </summary>
  /// <param name="src">Source pixel at the block origin (the filter window starts above it).</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block origin; read and overwritten.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="yFilters">Kernel table indexed by the fractional position.</param>
  /// <param name="y0Q4">Fixed-point start position of each column.</param>
  /// <param name="yStepQ4">Fixed-point advance per output row.</param>
  /// <param name="w">Block width in pixels.</param>
  /// <param name="h">Block height in rows.</param>
  private static unsafe void ConvolveAvgVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // Start at the first tap row of the filter window; walk column by column.
      byte* srcColumn = src - srcStride * (SubpelTaps / 2 - 1);
      byte* dstColumn = dst;

      for (int col = 0; col < w; ++col, ++srcColumn, ++dstColumn)
      {
          int posQ4 = y0Q4;

          for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
          {
              byte* taps = srcColumn + (posQ4 >> SubpelBits) * srcStride;
              ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];
              int acc = 0;

              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += taps[tap * srcStride] * kernel[tap];
              }

              byte filtered = BitUtils.ClipPixel(BitUtils.RoundPowerOfTwo(acc, FilterBits));
              dstColumn[row * dstStride] = (byte)BitUtils.RoundPowerOfTwo(dstColumn[row * dstStride] + filtered, 1);
          }
      }
  }
  /// <summary>
  /// Horizontal-only convolution entry point; forwards to <c>ConvolveHoriz</c>.
  /// <paramref name="y0Q4"/> and <paramref name="yStepQ4"/> are not used.
  /// </summary>
  public static unsafe void Convolve8Horiz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
  /// <summary>
  /// Horizontal-only convolution averaged into the destination; forwards to
  /// <c>ConvolveAvgHoriz</c>. <paramref name="y0Q4"/> and <paramref name="yStepQ4"/>
  /// are not used.
  /// </summary>
  public static unsafe void Convolve8AvgHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h);
  /// <summary>
  /// Vertical-only convolution entry point; forwards to <c>ConvolveVert</c>.
  /// <paramref name="x0Q4"/> and <paramref name="xStepQ4"/> are not used.
  /// </summary>
  public static unsafe void Convolve8Vert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Vertical-only convolution averaged into the destination; forwards to
  /// <c>ConvolveAvgVert</c>. <paramref name="x0Q4"/> and <paramref name="xStepQ4"/>
  /// are not used.
  /// </summary>
  public static unsafe void Convolve8AvgVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => ConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Two-dimensional convolution done in two passes: the source is first filtered
  /// horizontally into a stack-allocated intermediate buffer, which is then filtered
  /// vertically into <paramref name="dst"/>.
  /// </summary>
  /// <remarks>
  /// The intermediate buffer has a fixed size (64 columns by 135 rows), which limits
  /// the accepted parameters; the limits are checked with debug assertions only.
  /// </remarks>
  /// <param name="src">Source pixel at the block origin.</param>
  /// <param name="srcStride">Distance in bytes between source rows.</param>
  /// <param name="dst">Destination block origin.</param>
  /// <param name="dstStride">Distance in bytes between destination rows.</param>
  /// <param name="filter">Kernel table used for both passes.</param>
  /// <param name="x0Q4">Horizontal fixed-point start position.</param>
  /// <param name="xStepQ4">Horizontal fixed-point step per output pixel.</param>
  /// <param name="y0Q4">Vertical fixed-point start position.</param>
  /// <param name="yStepQ4">Vertical fixed-point step per output row.</param>
  /// <param name="w">Block width in pixels (at most 64).</param>
  /// <param name="h">Block height in rows (at most 64).</param>
  [SkipLocalsInit]
  public static unsafe void Convolve8(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // Note: Fixed size intermediate buffer, temp, places limits on parameters.
      // 2d filtering proceeds in 2 steps:
      //   (1) Interpolate horizontally into an intermediate buffer, temp.
      //   (2) Interpolate temp vertically to derive the sub-pixel result.
      // Deriving the maximum number of rows in the temp buffer (135):
      // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
      // --Largest block size is 64x64 pixels.
      // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
      //   original frame (in 1/16th pixel units).
      // --Must round-up because block may be located at sub-pixel position.
      // --Require an additional SubpelTaps rows for the 8-tap filter tails.
      // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
      // When calling in frame scaling function, the smallest scaling factor is x1/4
      // ==> yStepQ4 = 64. Since w and h are at most 16, the temp buffer is still
      // big enough.
      const int TempStride = 64;
      const int TempRows = 135;

      byte* temp = stackalloc byte[TempStride * TempRows];
      int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);
      Debug.Assert(yStepQ4 <= 32 || (yStepQ4 <= 64 && h <= 32));
      Debug.Assert(xStepQ4 <= 64);

      // The horizontal pass writes intermediateHeight rows into temp; make sure
      // they actually fit (this also catches an out-of-range y0Q4).
      Debug.Assert(intermediateHeight <= TempRows);

      // Pass 1 starts (SubpelTaps / 2 - 1) rows above the block so that the vertical
      // filter in pass 2 has its leading taps available in temp.
      ConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, TempStride, filter, x0Q4, xStepQ4, w, intermediateHeight);
      ConvolveVert(temp + TempStride * (SubpelTaps / 2 - 1), TempStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h);
  }
  /// <summary>
  /// Two-dimensional convolution averaged into the destination: the filtered block
  /// is first produced in a 64x64 stack buffer by <c>Convolve8</c> and then blended
  /// into <paramref name="dst"/> by <c>ConvolveAvg</c>.
  /// </summary>
  /// <param name="w">Block width in pixels (at most 64, checked by a debug assertion).</param>
  /// <param name="h">Block height in rows (at most 64, checked by a debug assertion).</param>
  public static unsafe void Convolve8Avg(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      // The scratch block has a fixed size, which bounds w and h.
      const int TempStride = 64;

      byte* temp = stackalloc byte[TempStride * 64];

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);

      Convolve8(src, srcStride, temp, TempStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);

      // The averaging step ignores its filter and position arguments.
      ConvolveAvg(temp, TempStride, dst, dstStride, null, 0, 0, 0, 0, w, h);
  }
  /// <summary>
  /// Copies a block of <paramref name="w"/> by <paramref name="h"/> pixels from
  /// <paramref name="src"/> to <paramref name="dst"/>, one row per
  /// <c>MemoryUtil.Copy</c> call. The filter and position arguments are not used.
  /// </summary>
  public static unsafe void ConvolveCopy(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      for (int row = 0; row < h; ++row)
      {
          MemoryUtil.Copy(dst, src, w);

          src += srcStride;
          dst += dstStride;
      }
  }
  /// <summary>
  /// Blends <paramref name="src"/> into <paramref name="dst"/>: every destination
  /// pixel becomes <c>RoundPowerOfTwo(dst + src, 1)</c>. The filter and position
  /// arguments are not used.
  /// </summary>
  public static unsafe void ConvolveAvg(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
  {
      byte* srcRow = src;
      byte* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          for (int col = 0; col < w; ++col)
          {
              int pairSum = dstRow[col] + srcRow[col];
              dstRow[col] = (byte)BitUtils.RoundPowerOfTwo(pairSum, 1);
          }
      }
  }
  /// <summary>
  /// Scaled horizontal prediction entry point; forwards every argument unchanged to
  /// <c>Convolve8Horiz</c>.
  /// </summary>
  public static unsafe void ScaledHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8Horiz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled vertical prediction entry point; forwards every argument unchanged to
  /// <c>Convolve8Vert</c>.
  /// </summary>
  public static unsafe void ScaledVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8Vert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled two-dimensional prediction entry point; forwards every argument
  /// unchanged to <c>Convolve8</c>.
  /// </summary>
  public static unsafe void Scaled2D(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled horizontal prediction averaged into the destination; forwards every
  /// argument unchanged to <c>Convolve8AvgHoriz</c>.
  /// </summary>
  public static unsafe void ScaledAvgHoriz(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8AvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled vertical prediction averaged into the destination; forwards every
  /// argument unchanged to <c>Convolve8AvgVert</c>.
  /// </summary>
  public static unsafe void ScaledAvgVert(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8AvgVert(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Scaled two-dimensional prediction averaged into the destination; forwards every
  /// argument unchanged to <c>Convolve8Avg</c>.
  /// </summary>
  public static unsafe void ScaledAvg2D(
      byte* src,
      int srcStride,
      byte* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h)
      => Convolve8Avg(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h);
  /// <summary>
  /// Horizontal filter for 16-bit samples: every output sample is the rounded dot
  /// product of <c>SubpelTaps</c> source samples with the kernel selected by the
  /// fractional bits of the current position, passed through
  /// <c>BitUtils.ClipPixelHighbd</c> with bit depth <paramref name="bd"/>.
  /// </summary>
  /// <param name="src">Source sample at the block origin (the filter window starts before it).</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Destination block origin.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="xFilters">Kernel table indexed by the fractional position.</param>
  /// <param name="x0Q4">Fixed-point start position of each row.</param>
  /// <param name="xStepQ4">Fixed-point advance per output sample.</param>
  /// <param name="w">Block width in samples.</param>
  /// <param name="h">Block height in rows.</param>
  /// <param name="bd">Bit depth forwarded to the clipping helper.</param>
  private static unsafe void HighbdConvolveHoriz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h,
      int bd)
  {
      // Start at the first tap of the filter window.
      ushort* srcRow = src - (SubpelTaps / 2 - 1);
      ushort* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          int posQ4 = x0Q4;

          for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
          {
              ushort* taps = srcRow + (posQ4 >> SubpelBits);
              ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];
              int acc = 0;

              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += taps[tap] * kernel[tap];
              }

              dstRow[col] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
          }
      }
  }
  /// <summary>
  /// Horizontal filter for 16-bit samples whose result is averaged into the
  /// destination: each filtered, clipped sample is combined with the existing
  /// <paramref name="dst"/> value through <c>RoundPowerOfTwo(existing + filtered, 1)</c>.
  /// </summary>
  /// <param name="src">Source sample at the block origin (the filter window starts before it).</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Destination block origin; read and overwritten.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="xFilters">Kernel table indexed by the fractional position.</param>
  /// <param name="x0Q4">Fixed-point start position of each row.</param>
  /// <param name="xStepQ4">Fixed-point advance per output sample.</param>
  /// <param name="w">Block width in samples.</param>
  /// <param name="h">Block height in rows.</param>
  /// <param name="bd">Bit depth forwarded to the clipping helper.</param>
  private static unsafe void HighbdConvolveAvgHoriz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] xFilters,
      int x0Q4,
      int xStepQ4,
      int w,
      int h,
      int bd)
  {
      // Start at the first tap of the filter window.
      ushort* srcRow = src - (SubpelTaps / 2 - 1);
      ushort* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          int posQ4 = x0Q4;

          for (int col = 0; col < w; ++col, posQ4 += xStepQ4)
          {
              ushort* taps = srcRow + (posQ4 >> SubpelBits);
              ref Array8<short> kernel = ref xFilters[posQ4 & SubpelMask];
              int acc = 0;

              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += taps[tap] * kernel[tap];
              }

              ushort filtered = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
              dstRow[col] = (ushort)BitUtils.RoundPowerOfTwo(dstRow[col] + filtered, 1);
          }
      }
  }
  /// <summary>
  /// Vertical filter for 16-bit samples: every output sample is the rounded dot
  /// product of <c>SubpelTaps</c> vertically adjacent source samples with the kernel
  /// selected by the fractional bits of the current position, passed through
  /// <c>BitUtils.ClipPixelHighbd</c> with bit depth <paramref name="bd"/>.
  /// </summary>
  /// <param name="src">Source sample at the block origin (the filter window starts above it).</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Destination block origin.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="yFilters">Kernel table indexed by the fractional position.</param>
  /// <param name="y0Q4">Fixed-point start position of each column.</param>
  /// <param name="yStepQ4">Fixed-point advance per output row.</param>
  /// <param name="w">Block width in samples.</param>
  /// <param name="h">Block height in rows.</param>
  /// <param name="bd">Bit depth forwarded to the clipping helper.</param>
  private static unsafe void HighbdConvolveVert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // Start at the first tap row of the filter window; walk column by column.
      ushort* srcColumn = src - srcStride * (SubpelTaps / 2 - 1);
      ushort* dstColumn = dst;

      for (int col = 0; col < w; ++col, ++srcColumn, ++dstColumn)
      {
          int posQ4 = y0Q4;

          for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
          {
              ushort* taps = srcColumn + (posQ4 >> SubpelBits) * srcStride;
              ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];
              int acc = 0;

              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += taps[tap * srcStride] * kernel[tap];
              }

              dstColumn[row * dstStride] = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
          }
      }
  }
  /// <summary>
  /// Vertical filter for 16-bit samples whose result is averaged into the
  /// destination: each filtered, clipped sample is combined with the existing
  /// <paramref name="dst"/> value through <c>RoundPowerOfTwo(existing + filtered, 1)</c>.
  /// </summary>
  /// <param name="src">Source sample at the block origin (the filter window starts above it).</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Destination block origin; read and overwritten.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="yFilters">Kernel table indexed by the fractional position.</param>
  /// <param name="y0Q4">Fixed-point start position of each column.</param>
  /// <param name="yStepQ4">Fixed-point advance per output row.</param>
  /// <param name="w">Block width in samples.</param>
  /// <param name="h">Block height in rows.</param>
  /// <param name="bd">Bit depth forwarded to the clipping helper.</param>
  private static unsafe void HighConvolveAvgVert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] yFilters,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // Start at the first tap row of the filter window; walk column by column.
      ushort* srcColumn = src - srcStride * (SubpelTaps / 2 - 1);
      ushort* dstColumn = dst;

      for (int col = 0; col < w; ++col, ++srcColumn, ++dstColumn)
      {
          int posQ4 = y0Q4;

          for (int row = 0; row < h; ++row, posQ4 += yStepQ4)
          {
              ushort* taps = srcColumn + (posQ4 >> SubpelBits) * srcStride;
              ref Array8<short> kernel = ref yFilters[posQ4 & SubpelMask];
              int acc = 0;

              for (int tap = 0; tap < SubpelTaps; ++tap)
              {
                  acc += taps[tap * srcStride] * kernel[tap];
              }

              ushort filtered = BitUtils.ClipPixelHighbd(BitUtils.RoundPowerOfTwo(acc, FilterBits), bd);
              dstColumn[row * dstStride] = (ushort)BitUtils.RoundPowerOfTwo(dstColumn[row * dstStride] + filtered, 1);
          }
      }
  }
  /// <summary>
  /// Two-dimensional convolution for 16-bit samples done in two passes: the source
  /// is first filtered horizontally into a stack-allocated intermediate buffer,
  /// which is then filtered vertically into <paramref name="dst"/>.
  /// </summary>
  /// <remarks>
  /// The intermediate buffer has a fixed size (64 columns by 135 rows), which limits
  /// the accepted parameters; the limits are checked with debug assertions only.
  /// </remarks>
  /// <param name="src">Source sample at the block origin.</param>
  /// <param name="srcStride">Distance in samples between source rows.</param>
  /// <param name="dst">Destination block origin.</param>
  /// <param name="dstStride">Distance in samples between destination rows.</param>
  /// <param name="filter">Kernel table used for both passes.</param>
  /// <param name="x0Q4">Horizontal fixed-point start position.</param>
  /// <param name="xStepQ4">Horizontal fixed-point step per output sample (at most 32).</param>
  /// <param name="y0Q4">Vertical fixed-point start position.</param>
  /// <param name="yStepQ4">Vertical fixed-point step per output row (at most 32).</param>
  /// <param name="w">Block width in samples (at most 64).</param>
  /// <param name="h">Block height in rows (at most 64).</param>
  /// <param name="bd">Bit depth forwarded to both passes.</param>
  private static unsafe void HighbdConvolve(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // Note: Fixed size intermediate buffer, temp, places limits on parameters.
      // 2d filtering proceeds in 2 steps:
      //   (1) Interpolate horizontally into an intermediate buffer, temp.
      //   (2) Interpolate temp vertically to derive the sub-pixel result.
      // Deriving the maximum number of rows in the temp buffer (135):
      // --Smallest scaling factor is x1/2 ==> yStepQ4 = 32 (Normative).
      // --Largest block size is 64x64 pixels.
      // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
      //   original frame (in 1/16th pixel units).
      // --Must round-up because block may be located at sub-pixel position.
      // --Require an additional SubpelTaps rows for the 8-tap filter tails.
      // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
      const int TempStride = 64;
      const int TempRows = 135;

      ushort* temp = stackalloc ushort[TempStride * TempRows];
      int intermediateHeight = (((h - 1) * yStepQ4 + y0Q4) >> SubpelBits) + SubpelTaps;

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);
      Debug.Assert(yStepQ4 <= 32);
      Debug.Assert(xStepQ4 <= 32);

      // The horizontal pass writes intermediateHeight rows into temp; make sure
      // they actually fit (this also catches an out-of-range y0Q4).
      Debug.Assert(intermediateHeight <= TempRows);

      // Pass 1 starts (SubpelTaps / 2 - 1) rows above the block so that the vertical
      // filter in pass 2 has its leading taps available in temp.
      HighbdConvolveHoriz(src - srcStride * (SubpelTaps / 2 - 1), srcStride, temp, TempStride, filter, x0Q4, xStepQ4, w, intermediateHeight, bd);
      HighbdConvolveVert(temp + TempStride * (SubpelTaps / 2 - 1), TempStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
  }
  /// <summary>
  /// Horizontal-only convolution entry point for 16-bit samples; forwards to
  /// <c>HighbdConvolveHoriz</c>. <paramref name="y0Q4"/> and
  /// <paramref name="yStepQ4"/> are not used.
  /// </summary>
  public static unsafe void HighbdConvolve8Horiz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolveHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
  /// <summary>
  /// Horizontal-only convolution for 16-bit samples averaged into the destination;
  /// forwards to <c>HighbdConvolveAvgHoriz</c>. <paramref name="y0Q4"/> and
  /// <paramref name="yStepQ4"/> are not used.
  /// </summary>
  public static unsafe void HighbdConvolve8AvgHoriz(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolveAvgHoriz(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, w, h, bd);
  /// <summary>
  /// Vertical-only convolution entry point for 16-bit samples; forwards to
  /// <c>HighbdConvolveVert</c>. <paramref name="x0Q4"/> and
  /// <paramref name="xStepQ4"/> are not used.
  /// </summary>
  public static unsafe void HighbdConvolve8Vert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolveVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
  /// <summary>
  /// Vertical-only convolution for 16-bit samples averaged into the destination;
  /// forwards to <c>HighConvolveAvgVert</c>. <paramref name="x0Q4"/> and
  /// <paramref name="xStepQ4"/> are not used.
  /// </summary>
  public static unsafe void HighbdConvolve8AvgVert(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighConvolveAvgVert(src, srcStride, dst, dstStride, filter, y0Q4, yStepQ4, w, h, bd);
  /// <summary>
  /// Two-dimensional convolution entry point for 16-bit samples; forwards every
  /// argument unchanged to <c>HighbdConvolve</c>.
  /// </summary>
  public static unsafe void HighbdConvolve8(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
      => HighbdConvolve(src, srcStride, dst, dstStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);
  /// <summary>
  /// Two-dimensional convolution for 16-bit samples averaged into the destination:
  /// the filtered block is first produced in a 64x64 stack buffer by
  /// <c>HighbdConvolve8</c> and then blended into <paramref name="dst"/> by
  /// <c>HighbdConvolveAvg</c>.
  /// </summary>
  /// <param name="w">Block width in samples (at most 64, checked by a debug assertion).</param>
  /// <param name="h">Block height in rows (at most 64, checked by a debug assertion).</param>
  public static unsafe void HighbdConvolve8Avg(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      // The scratch block has a fixed size, which bounds w and h.
      const int TempStride = 64;

      ushort* temp = stackalloc ushort[TempStride * 64];

      Debug.Assert(w <= 64);
      Debug.Assert(h <= 64);

      HighbdConvolve8(src, srcStride, temp, TempStride, filter, x0Q4, xStepQ4, y0Q4, yStepQ4, w, h, bd);

      // The averaging step ignores its filter and position arguments.
      HighbdConvolveAvg(temp, TempStride, dst, dstStride, null, 0, 0, 0, 0, w, h, bd);
  }
  /// <summary>
  /// Copies a block of <paramref name="w"/> by <paramref name="h"/> 16-bit samples
  /// from <paramref name="src"/> to <paramref name="dst"/>, one row per
  /// <c>MemoryUtil.Copy</c> call. The filter, position and bit-depth arguments are
  /// not used.
  /// </summary>
  public static unsafe void HighbdConvolveCopy(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      for (int row = 0; row < h; ++row)
      {
          MemoryUtil.Copy(dst, src, w);

          src += srcStride;
          dst += dstStride;
      }
  }
  /// <summary>
  /// Blends <paramref name="src"/> into <paramref name="dst"/> for 16-bit samples:
  /// every destination sample becomes <c>RoundPowerOfTwo(dst + src, 1)</c>. The
  /// filter, position and bit-depth arguments are not used.
  /// </summary>
  public static unsafe void HighbdConvolveAvg(
      ushort* src,
      int srcStride,
      ushort* dst,
      int dstStride,
      Array8<short>[] filter,
      int x0Q4,
      int xStepQ4,
      int y0Q4,
      int yStepQ4,
      int w,
      int h,
      int bd)
  {
      ushort* srcRow = src;
      ushort* dstRow = dst;

      for (int row = 0; row < h; ++row, srcRow += srcStride, dstRow += dstStride)
      {
          for (int col = 0; col < w; ++col)
          {
              int pairSum = dstRow[col] + srcRow[col];
              dstRow[col] = (ushort)BitUtils.RoundPowerOfTwo(pairSum, 1);
          }
      }
  }
  854. }
  855. }