InvTxfm.cs 122 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917
  1. using System;
  2. using System.Diagnostics;
  3. using System.Runtime.CompilerServices;
  4. using Ryujinx.Graphics.Nvdec.Vp9.Common;
  5. using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.TxfmCommon;
  6. namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
  7. {
  8. internal static class InvTxfm
  9. {
  // Exclusive upper bound on the magnitude of a valid high bit depth coefficient:
  // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
  // transform amplify bits + 1 bit for contingency in rounding and quantizing.
  private const int HighbdValidTxfmMagnitudeRange = (1 << 25);

  // Scans the first `size` entries of `input` and returns 1 as soon as one of them
  // has a magnitude of HighbdValidTxfmMagnitudeRange or more; returns 0 otherwise
  // (including when `size` is 0).
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static int DetectInvalidHighbdInput(ReadOnlySpan<int> input, int size)
  {
      for (int i = 0; i < size; ++i)
      {
          int coeff = input[i];

          // Test both bounds directly instead of calling Math.Abs: Math.Abs(int.MinValue)
          // throws OverflowException, which would turn the most extreme invalid
          // coefficient into a crash instead of a "detected" result.
          if (coeff >= HighbdValidTxfmMagnitudeRange || coeff <= -HighbdValidTxfmMagnitudeRange)
          {
              return 1;
          }
      }

      return 0;
  }
  // Debug-only sanity check for intermediate transform coefficients; the value
  // itself is always passed through unchanged.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static long CheckRange(long input)
  {
      // Valid VP9 streams keep every intermediate stage coefficient inside the
      // signed 16 bit range. Only invalid/corrupt streams can leave it, so the
      // bounds are asserted rather than enforced.
      Debug.Assert(input >= short.MinValue);
      Debug.Assert(input <= short.MaxValue);

      return input;
  }
  // Debug-only sanity check for high bit depth intermediate coefficients; the
  // value itself is always passed through unchanged.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static long HighbdCheckRange(long input, int bd)
  {
      // Valid high bit depth VP9 streams keep intermediate stage coefficients
      // within a signed (8 + bd) bit integer:
      // - 8 bit: signed 16 bit integer
      // - 10 bit: signed 18 bit integer
      // - 12 bit: signed 20 bit integer
      int upperBound = (1 << (bd + 7)) - 1;
      int lowerBound = -upperBound - 1;

      Debug.Assert(input >= lowerBound);
      Debug.Assert(input <= upperBound);

      return input;
  }
  // Range-checks x (debug builds only) and then truncates it to a signed 16 bit
  // value, returned widened to int.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static int WrapLow(long x) => (short)CheckRange(x);
  // Range-checks x for bit depth bd (debug builds only), then keeps only its low
  // (8 + bd) bits, sign-extended back to a 32 bit int.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static int HighbdWrapLow(long x, int bd)
  {
      int shift = 24 - bd;
      int value = (int)HighbdCheckRange(x, bd);

      // Shift left then arithmetic-shift right to sign-extend from bit (7 + bd).
      return (value << shift) >> shift;
  }
  // Adds the wrapped residual `trans` to the pixel `dest` and returns the sum
  // as clipped by BitUtils.ClipPixel.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static byte ClipPixelAdd(byte dest, long trans)
  {
      int residual = WrapLow(trans);

      return BitUtils.ClipPixel(dest + residual);
  }
  // High bit depth variant: adds the wrapped residual `trans` to the pixel `dest`
  // and returns the sum as clipped by BitUtils.ClipPixelHighbd for bit depth `bd`.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static ushort HighbdClipPixelAdd(ushort dest, long trans, int bd)
  {
      int residual = HighbdWrapLow(trans, bd);

      return BitUtils.ClipPixelHighbd(dest + residual, bd);
  }
  // Removes the DctConstBits fixed-point scaling from `input` via
  // BitUtils.RoundPowerOfTwo.
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static long DctConstRoundShift(long input) => BitUtils.RoundPowerOfTwo(input, DctConstBits);
  // Inverse 4x4 Walsh-Hadamard transform of the 16 coefficients in `input`; the
  // result is added to the 4x4 pixel block at `dest` (rows `stride` apart).
  [SkipLocalsInit]
  public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
         0.5 shifts per pixel. */
      Span<int> rowPass = stackalloc int[16];

      // Pass 1: transform each row of four coefficients into rowPass.
      for (int row = 0; row < 4; row++)
      {
          int offset = row * 4;

          long a = input[offset] >> UnitQuantShift;
          long c = input[offset + 1] >> UnitQuantShift;
          long d = input[offset + 2] >> UnitQuantShift;
          long b = input[offset + 3] >> UnitQuantShift;

          a += c;
          d -= b;
          long e = (a - d) >> 1;
          b = e - b;
          c = e - c;
          a -= b;
          d += c;

          rowPass[offset] = WrapLow(a);
          rowPass[offset + 1] = WrapLow(b);
          rowPass[offset + 2] = WrapLow(c);
          rowPass[offset + 3] = WrapLow(d);
      }

      // Pass 2: transform each column and accumulate it into the destination.
      for (int col = 0; col < 4; col++)
      {
          long a = rowPass[col];
          long c = rowPass[4 + col];
          long d = rowPass[8 + col];
          long b = rowPass[12 + col];

          a += c;
          d -= b;
          long e = (a - d) >> 1;
          b = e - b;
          c = e - c;
          a -= b;
          d += c;

          dest[col] = ClipPixelAdd(dest[col], WrapLow(a));
          dest[stride + col] = ClipPixelAdd(dest[stride + col], WrapLow(b));
          dest[stride * 2 + col] = ClipPixelAdd(dest[stride * 2 + col], WrapLow(c));
          dest[stride * 3 + col] = ClipPixelAdd(dest[stride * 3 + col], WrapLow(d));
      }
  }
  // Inverse 4x4 Walsh-Hadamard transform that reads only input[0]; the result
  // is added to the 4x4 pixel block at `dest` (rows `stride` apart).
  [SkipLocalsInit]
  public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      Span<int> firstPass = stackalloc int[4];

      // Pass 1: split the scaled first coefficient into its two halves.
      long dc = input[0] >> UnitQuantShift;
      long half = dc >> 1;
      dc -= half;

      firstPass[0] = WrapLow(dc);
      int wrappedHalf = WrapLow(half);
      firstPass[1] = wrappedHalf;
      firstPass[2] = wrappedHalf;
      firstPass[3] = wrappedHalf;

      // Pass 2: split each first-pass value again and add it down its column.
      for (int col = 0; col < 4; col++)
      {
          int value = firstPass[col];
          long lower = value >> 1;
          long upper = value - lower;

          dest[col] = ClipPixelAdd(dest[col], upper);
          dest[stride + col] = ClipPixelAdd(dest[stride + col], lower);
          dest[stride * 2 + col] = ClipPixelAdd(dest[stride * 2 + col], lower);
          dest[stride * 3 + col] = ClipPixelAdd(dest[stride * 3 + col], lower);
      }
  }
  // 4-point inverse ADST: reads input[0..3] and writes output[0..3].
  public static void Iadst4(ReadOnlySpan<int> input, Span<int> output)
  {
      int in0 = input[0];
      int in1 = input[1];
      int in2 = input[2];
      int in3 = input[3];

      // All-zero input produces all-zero output; skip the arithmetic.
      if ((in0 | in1 | in2 | in3) == 0)
      {
          output.Slice(0, 4).Clear();

          return;
      }

      // 32-bit result is enough for the following multiplications.
      long p0 = SinPi1_9 * in0;
      long p1 = SinPi2_9 * in0;
      long p2 = SinPi3_9 * in1;
      long p3 = SinPi4_9 * in2;
      long p4 = SinPi1_9 * in2;
      long p5 = SinPi2_9 * in3;
      long p6 = SinPi4_9 * in3;
      long p7 = WrapLow(in0 - in2 + in3);

      long sumA = p0 + p3 + p5;
      long sumB = p1 - p4 - p6;
      long mid = p2;
      long scaled = SinPi3_9 * p7;

      // 1-D transform scaling factor is sqrt(2).
      // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
      // + 1b (addition) = 29b.
      // Hence the output bit depth is 15b.
      output[0] = WrapLow(DctConstRoundShift(sumA + mid));
      output[1] = WrapLow(DctConstRoundShift(sumB + mid));
      output[2] = WrapLow(DctConstRoundShift(scaled));
      output[3] = WrapLow(DctConstRoundShift(sumA + sumB - mid));
  }
  // 4-point inverse DCT: reads input[0..3] (each truncated to 16 bits) and
  // writes output[0..3].
  [SkipLocalsInit]
  public static void Idct4(ReadOnlySpan<int> input, Span<int> output)
  {
      // Stage 1: rotate the even (0, 2) and odd (1, 3) coefficient pairs.
      long evenSum = ((short)input[0] + (short)input[2]) * CosPi16_64;
      long evenDiff = ((short)input[0] - (short)input[2]) * CosPi16_64;
      short even0 = (short)WrapLow(DctConstRoundShift(evenSum));
      short even1 = (short)WrapLow(DctConstRoundShift(evenDiff));

      long oddLow = (short)input[1] * CosPi24_64 - (short)input[3] * CosPi8_64;
      long oddHigh = (short)input[1] * CosPi8_64 + (short)input[3] * CosPi24_64;
      short odd0 = (short)WrapLow(DctConstRoundShift(oddLow));
      short odd1 = (short)WrapLow(DctConstRoundShift(oddHigh));

      // Stage 2: butterfly the two halves into the output.
      output[0] = WrapLow(even0 + odd1);
      output[1] = WrapLow(even1 + odd0);
      output[2] = WrapLow(even1 - odd0);
      output[3] = WrapLow(even0 - odd1);
  }
  // 2-D inverse DCT of a 4x4 block (all 16 coefficients): Idct4 over rows, then
  // over columns; the rounded result is added to `dest` (rows `stride` apart).
  [SkipLocalsInit]
  public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 4;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> transformed = stackalloc int[Size];

      // Rows
      for (int row = 0; row < Size; ++row)
      {
          int offset = row * Size;

          Idct4(input.Slice(offset), rowPass.Slice(offset));
      }

      // Columns
      for (int col = 0; col < Size; ++col)
      {
          // Gather one column of the row-pass result into contiguous storage.
          for (int row = 0; row < Size; ++row)
          {
              column[row] = rowPass[row * Size + col];
          }

          Idct4(column, transformed);

          for (int row = 0; row < Size; ++row)
          {
              int pos = row * stride + col;

              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(transformed[row], 4));
          }
      }
  }
  // 4x4 inverse DCT that reads only input[0]: one residual value is derived from
  // it and added to every pixel of the 4x4 block at `dest` (rows `stride` apart).
  public static void Idct4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      // Apply the CosPi16_64 scaling once per transform dimension.
      int dc = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      dc = WrapLow(DctConstRoundShift(dc * CosPi16_64));

      long residual = BitUtils.RoundPowerOfTwo(dc, 4);

      // Address rows by offset instead of re-slicing `dest` after every row: the
      // slice taken after the last row needed `stride` elements beyond the final
      // pixel and threw ArgumentOutOfRangeException when `dest` ended right after
      // the block, even though every pixel had already been written.
      for (int row = 0; row < 4; row++)
      {
          int rowStart = row * stride;

          for (int col = 0; col < 4; col++)
          {
              dest[rowStart + col] = ClipPixelAdd(dest[rowStart + col], residual);
          }
      }
  }
  // 8-point inverse ADST: reads input[0..7] and writes output[0..7].
  public static void Iadst8(ReadOnlySpan<int> input, Span<int> output)
  {
      // Load the inputs in the permuted order the butterflies expect.
      long v0 = input[7];
      long v1 = input[0];
      long v2 = input[5];
      long v3 = input[2];
      long v4 = input[3];
      long v5 = input[4];
      long v6 = input[1];
      long v7 = input[6];

      // All-zero input produces all-zero output; skip the arithmetic.
      if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7) == 0)
      {
          output.Slice(0, 8).Clear();

          return;
      }

      // stage 1
      int t0 = (int)(CosPi2_64 * v0 + CosPi30_64 * v1);
      int t1 = (int)(CosPi30_64 * v0 - CosPi2_64 * v1);
      int t2 = (int)(CosPi10_64 * v2 + CosPi22_64 * v3);
      int t3 = (int)(CosPi22_64 * v2 - CosPi10_64 * v3);
      int t4 = (int)(CosPi18_64 * v4 + CosPi14_64 * v5);
      int t5 = (int)(CosPi14_64 * v4 - CosPi18_64 * v5);
      int t6 = (int)(CosPi26_64 * v6 + CosPi6_64 * v7);
      int t7 = (int)(CosPi6_64 * v6 - CosPi26_64 * v7);

      v0 = WrapLow(DctConstRoundShift(t0 + t4));
      v1 = WrapLow(DctConstRoundShift(t1 + t5));
      v2 = WrapLow(DctConstRoundShift(t2 + t6));
      v3 = WrapLow(DctConstRoundShift(t3 + t7));
      v4 = WrapLow(DctConstRoundShift(t0 - t4));
      v5 = WrapLow(DctConstRoundShift(t1 - t5));
      v6 = WrapLow(DctConstRoundShift(t2 - t6));
      v7 = WrapLow(DctConstRoundShift(t3 - t7));

      // stage 2
      t0 = (int)v0;
      t1 = (int)v1;
      t2 = (int)v2;
      t3 = (int)v3;
      t4 = (int)(CosPi8_64 * v4 + CosPi24_64 * v5);
      t5 = (int)(CosPi24_64 * v4 - CosPi8_64 * v5);
      t6 = (int)(-CosPi24_64 * v6 + CosPi8_64 * v7);
      t7 = (int)(CosPi8_64 * v6 + CosPi24_64 * v7);

      v0 = WrapLow(t0 + t2);
      v1 = WrapLow(t1 + t3);
      v2 = WrapLow(t0 - t2);
      v3 = WrapLow(t1 - t3);
      v4 = WrapLow(DctConstRoundShift(t4 + t6));
      v5 = WrapLow(DctConstRoundShift(t5 + t7));
      v6 = WrapLow(DctConstRoundShift(t4 - t6));
      v7 = WrapLow(DctConstRoundShift(t5 - t7));

      // stage 3
      t2 = (int)(CosPi16_64 * (v2 + v3));
      t3 = (int)(CosPi16_64 * (v2 - v3));
      t6 = (int)(CosPi16_64 * (v6 + v7));
      t7 = (int)(CosPi16_64 * (v6 - v7));

      v2 = WrapLow(DctConstRoundShift(t2));
      v3 = WrapLow(DctConstRoundShift(t3));
      v6 = WrapLow(DctConstRoundShift(t6));
      v7 = WrapLow(DctConstRoundShift(t7));

      // Emit the results in output order, negating every odd-indexed entry.
      output[0] = WrapLow(v0);
      output[1] = WrapLow(-v4);
      output[2] = WrapLow(v6);
      output[3] = WrapLow(-v2);
      output[4] = WrapLow(v3);
      output[5] = WrapLow(-v7);
      output[6] = WrapLow(v5);
      output[7] = WrapLow(-v1);
  }
  // 8-point inverse DCT: reads input[0..7] (each truncated to 16 bits) and
  // writes output[0..7]. Two scratch buffers are ping-ponged between stages.
  [SkipLocalsInit]
  public static void Idct8(ReadOnlySpan<int> input, Span<int> output)
  {
      Span<short> bufA = stackalloc short[8];
      Span<short> bufB = stackalloc short[8];
      long termA, termB;

      // stage 1: load the even inputs, rotate the odd input pairs
      bufA[0] = (short)input[0];
      bufA[2] = (short)input[4];
      bufA[1] = (short)input[2];
      bufA[3] = (short)input[6];

      termA = (short)input[1] * CosPi28_64 - (short)input[7] * CosPi4_64;
      termB = (short)input[1] * CosPi4_64 + (short)input[7] * CosPi28_64;
      bufA[4] = (short)WrapLow(DctConstRoundShift(termA));
      bufA[7] = (short)WrapLow(DctConstRoundShift(termB));

      termA = (short)input[5] * CosPi12_64 - (short)input[3] * CosPi20_64;
      termB = (short)input[5] * CosPi20_64 + (short)input[3] * CosPi12_64;
      bufA[5] = (short)WrapLow(DctConstRoundShift(termA));
      bufA[6] = (short)WrapLow(DctConstRoundShift(termB));

      // stage 2
      termA = (bufA[0] + bufA[2]) * CosPi16_64;
      termB = (bufA[0] - bufA[2]) * CosPi16_64;
      bufB[0] = (short)WrapLow(DctConstRoundShift(termA));
      bufB[1] = (short)WrapLow(DctConstRoundShift(termB));

      termA = bufA[1] * CosPi24_64 - bufA[3] * CosPi8_64;
      termB = bufA[1] * CosPi8_64 + bufA[3] * CosPi24_64;
      bufB[2] = (short)WrapLow(DctConstRoundShift(termA));
      bufB[3] = (short)WrapLow(DctConstRoundShift(termB));

      bufB[4] = (short)WrapLow(bufA[4] + bufA[5]);
      bufB[5] = (short)WrapLow(bufA[4] - bufA[5]);
      bufB[6] = (short)WrapLow(-bufA[6] + bufA[7]);
      bufB[7] = (short)WrapLow(bufA[6] + bufA[7]);

      // stage 3
      bufA[0] = (short)WrapLow(bufB[0] + bufB[3]);
      bufA[1] = (short)WrapLow(bufB[1] + bufB[2]);
      bufA[2] = (short)WrapLow(bufB[1] - bufB[2]);
      bufA[3] = (short)WrapLow(bufB[0] - bufB[3]);
      bufA[4] = bufB[4];

      termA = (bufB[6] - bufB[5]) * CosPi16_64;
      termB = (bufB[5] + bufB[6]) * CosPi16_64;
      bufA[5] = (short)WrapLow(DctConstRoundShift(termA));
      bufA[6] = (short)WrapLow(DctConstRoundShift(termB));
      bufA[7] = bufB[7];

      // stage 4: final butterfly, mirrored around the middle of the output
      output[0] = WrapLow(bufA[0] + bufA[7]);
      output[1] = WrapLow(bufA[1] + bufA[6]);
      output[2] = WrapLow(bufA[2] + bufA[5]);
      output[3] = WrapLow(bufA[3] + bufA[4]);
      output[4] = WrapLow(bufA[3] - bufA[4]);
      output[5] = WrapLow(bufA[2] - bufA[5]);
      output[6] = WrapLow(bufA[1] - bufA[6]);
      output[7] = WrapLow(bufA[0] - bufA[7]);
  }
  // 2-D inverse DCT of an 8x8 block (all 64 coefficients): Idct8 over rows, then
  // over columns; the rounded result is added to `dest` (rows `stride` apart).
  [SkipLocalsInit]
  public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 8;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> transformed = stackalloc int[Size];

      // First transform rows
      for (int row = 0; row < Size; ++row)
      {
          int offset = row * Size;

          Idct8(input.Slice(offset), rowPass.Slice(offset));
      }

      // Then transform columns
      for (int col = 0; col < Size; ++col)
      {
          // Gather one column of the row-pass result into contiguous storage.
          for (int row = 0; row < Size; ++row)
          {
              column[row] = rowPass[row * Size + col];
          }

          Idct8(column, transformed);

          for (int row = 0; row < Size; ++row)
          {
              int pos = row * stride + col;

              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(transformed[row], 5));
          }
      }
  }
  // 2-D inverse DCT of an 8x8 block where only the first four coefficient rows
  // are transformed (the rest are taken as zero); the rounded result is added to
  // `dest` (rows `stride` apart).
  [SkipLocalsInit]
  public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 8;
      const int NonZeroRows = 4;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> transformed = stackalloc int[Size];

      // The stack buffer is not zero-initialized ([SkipLocalsInit]) and the row
      // pass below only writes the first four rows, so clear it explicitly.
      rowPass.Clear();

      // First transform rows
      // Only first 4 row has non-zero coefs
      for (int row = 0; row < NonZeroRows; ++row)
      {
          int offset = row * Size;

          Idct8(input.Slice(offset), rowPass.Slice(offset));
      }

      // Then transform columns
      for (int col = 0; col < Size; ++col)
      {
          // Gather one column of the row-pass result into contiguous storage.
          for (int row = 0; row < Size; ++row)
          {
              column[row] = rowPass[row * Size + col];
          }

          Idct8(column, transformed);

          for (int row = 0; row < Size; ++row)
          {
              int pos = row * stride + col;

              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(transformed[row], 5));
          }
      }
  }
  // 8x8 inverse DCT that reads only input[0]: one residual value is derived from
  // it and added to every pixel of the 8x8 block at `dest` (rows `stride` apart).
  public static void Idct8x81Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      // Apply the CosPi16_64 scaling once per transform dimension.
      int dc = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      dc = WrapLow(DctConstRoundShift(dc * CosPi16_64));

      long residual = BitUtils.RoundPowerOfTwo(dc, 5);

      // Address rows by offset instead of re-slicing `dest` after every row: the
      // slice taken after the last row needed `stride` elements beyond the final
      // pixel and threw ArgumentOutOfRangeException when `dest` ended right after
      // the block, even though every pixel had already been written.
      for (int row = 0; row < 8; ++row)
      {
          int rowStart = row * stride;

          for (int col = 0; col < 8; ++col)
          {
              dest[rowStart + col] = ClipPixelAdd(dest[rowStart + col], residual);
          }
      }
  }
  // 16-point inverse ADST: reads input[0..15] and writes output[0..15].
  public static void Iadst16(ReadOnlySpan<int> input, Span<int> output)
  {
      // Load the inputs in the permuted order the butterflies expect.
      long v0 = input[15];
      long v1 = input[0];
      long v2 = input[13];
      long v3 = input[2];
      long v4 = input[11];
      long v5 = input[4];
      long v6 = input[9];
      long v7 = input[6];
      long v8 = input[7];
      long v9 = input[8];
      long v10 = input[5];
      long v11 = input[10];
      long v12 = input[3];
      long v13 = input[12];
      long v14 = input[1];
      long v15 = input[14];

      // All-zero input produces all-zero output; skip the arithmetic.
      if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7 | v8 | v9 | v10 | v11 | v12 | v13 | v14 | v15) == 0)
      {
          output.Slice(0, 16).Clear();

          return;
      }

      // stage 1
      long t0 = v0 * CosPi1_64 + v1 * CosPi31_64;
      long t1 = v0 * CosPi31_64 - v1 * CosPi1_64;
      long t2 = v2 * CosPi5_64 + v3 * CosPi27_64;
      long t3 = v2 * CosPi27_64 - v3 * CosPi5_64;
      long t4 = v4 * CosPi9_64 + v5 * CosPi23_64;
      long t5 = v4 * CosPi23_64 - v5 * CosPi9_64;
      long t6 = v6 * CosPi13_64 + v7 * CosPi19_64;
      long t7 = v6 * CosPi19_64 - v7 * CosPi13_64;
      long t8 = v8 * CosPi17_64 + v9 * CosPi15_64;
      long t9 = v8 * CosPi15_64 - v9 * CosPi17_64;
      long t10 = v10 * CosPi21_64 + v11 * CosPi11_64;
      long t11 = v10 * CosPi11_64 - v11 * CosPi21_64;
      long t12 = v12 * CosPi25_64 + v13 * CosPi7_64;
      long t13 = v12 * CosPi7_64 - v13 * CosPi25_64;
      long t14 = v14 * CosPi29_64 + v15 * CosPi3_64;
      long t15 = v14 * CosPi3_64 - v15 * CosPi29_64;

      v0 = WrapLow(DctConstRoundShift(t0 + t8));
      v1 = WrapLow(DctConstRoundShift(t1 + t9));
      v2 = WrapLow(DctConstRoundShift(t2 + t10));
      v3 = WrapLow(DctConstRoundShift(t3 + t11));
      v4 = WrapLow(DctConstRoundShift(t4 + t12));
      v5 = WrapLow(DctConstRoundShift(t5 + t13));
      v6 = WrapLow(DctConstRoundShift(t6 + t14));
      v7 = WrapLow(DctConstRoundShift(t7 + t15));
      v8 = WrapLow(DctConstRoundShift(t0 - t8));
      v9 = WrapLow(DctConstRoundShift(t1 - t9));
      v10 = WrapLow(DctConstRoundShift(t2 - t10));
      v11 = WrapLow(DctConstRoundShift(t3 - t11));
      v12 = WrapLow(DctConstRoundShift(t4 - t12));
      v13 = WrapLow(DctConstRoundShift(t5 - t13));
      v14 = WrapLow(DctConstRoundShift(t6 - t14));
      v15 = WrapLow(DctConstRoundShift(t7 - t15));

      // stage 2
      t0 = v0;
      t1 = v1;
      t2 = v2;
      t3 = v3;
      t4 = v4;
      t5 = v5;
      t6 = v6;
      t7 = v7;
      t8 = v8 * CosPi4_64 + v9 * CosPi28_64;
      t9 = v8 * CosPi28_64 - v9 * CosPi4_64;
      t10 = v10 * CosPi20_64 + v11 * CosPi12_64;
      t11 = v10 * CosPi12_64 - v11 * CosPi20_64;
      t12 = -v12 * CosPi28_64 + v13 * CosPi4_64;
      t13 = v12 * CosPi4_64 + v13 * CosPi28_64;
      t14 = -v14 * CosPi12_64 + v15 * CosPi20_64;
      t15 = v14 * CosPi20_64 + v15 * CosPi12_64;

      v0 = WrapLow(t0 + t4);
      v1 = WrapLow(t1 + t5);
      v2 = WrapLow(t2 + t6);
      v3 = WrapLow(t3 + t7);
      v4 = WrapLow(t0 - t4);
      v5 = WrapLow(t1 - t5);
      v6 = WrapLow(t2 - t6);
      v7 = WrapLow(t3 - t7);
      v8 = WrapLow(DctConstRoundShift(t8 + t12));
      v9 = WrapLow(DctConstRoundShift(t9 + t13));
      v10 = WrapLow(DctConstRoundShift(t10 + t14));
      v11 = WrapLow(DctConstRoundShift(t11 + t15));
      v12 = WrapLow(DctConstRoundShift(t8 - t12));
      v13 = WrapLow(DctConstRoundShift(t9 - t13));
      v14 = WrapLow(DctConstRoundShift(t10 - t14));
      v15 = WrapLow(DctConstRoundShift(t11 - t15));

      // stage 3
      t0 = v0;
      t1 = v1;
      t2 = v2;
      t3 = v3;
      t4 = v4 * CosPi8_64 + v5 * CosPi24_64;
      t5 = v4 * CosPi24_64 - v5 * CosPi8_64;
      t6 = -v6 * CosPi24_64 + v7 * CosPi8_64;
      t7 = v6 * CosPi8_64 + v7 * CosPi24_64;
      t8 = v8;
      t9 = v9;
      t10 = v10;
      t11 = v11;
      t12 = v12 * CosPi8_64 + v13 * CosPi24_64;
      t13 = v12 * CosPi24_64 - v13 * CosPi8_64;
      t14 = -v14 * CosPi24_64 + v15 * CosPi8_64;
      t15 = v14 * CosPi8_64 + v15 * CosPi24_64;

      v0 = WrapLow(t0 + t2);
      v1 = WrapLow(t1 + t3);
      v2 = WrapLow(t0 - t2);
      v3 = WrapLow(t1 - t3);
      v4 = WrapLow(DctConstRoundShift(t4 + t6));
      v5 = WrapLow(DctConstRoundShift(t5 + t7));
      v6 = WrapLow(DctConstRoundShift(t4 - t6));
      v7 = WrapLow(DctConstRoundShift(t5 - t7));
      v8 = WrapLow(t8 + t10);
      v9 = WrapLow(t9 + t11);
      v10 = WrapLow(t8 - t10);
      v11 = WrapLow(t9 - t11);
      v12 = WrapLow(DctConstRoundShift(t12 + t14));
      v13 = WrapLow(DctConstRoundShift(t13 + t15));
      v14 = WrapLow(DctConstRoundShift(t12 - t14));
      v15 = WrapLow(DctConstRoundShift(t13 - t15));

      // stage 4
      t2 = (-CosPi16_64) * (v2 + v3);
      t3 = CosPi16_64 * (v2 - v3);
      t6 = CosPi16_64 * (v6 + v7);
      t7 = CosPi16_64 * (-v6 + v7);
      t10 = CosPi16_64 * (v10 + v11);
      t11 = CosPi16_64 * (-v10 + v11);
      t14 = (-CosPi16_64) * (v14 + v15);
      t15 = CosPi16_64 * (v14 - v15);

      v2 = WrapLow(DctConstRoundShift(t2));
      v3 = WrapLow(DctConstRoundShift(t3));
      v6 = WrapLow(DctConstRoundShift(t6));
      v7 = WrapLow(DctConstRoundShift(t7));
      v10 = WrapLow(DctConstRoundShift(t10));
      v11 = WrapLow(DctConstRoundShift(t11));
      v14 = WrapLow(DctConstRoundShift(t14));
      v15 = WrapLow(DctConstRoundShift(t15));

      // Emit the results in output order; entries 1, 3, 13 and 15 are negated.
      output[0] = WrapLow(v0);
      output[1] = WrapLow(-v8);
      output[2] = WrapLow(v12);
      output[3] = WrapLow(-v4);
      output[4] = WrapLow(v6);
      output[5] = WrapLow(v14);
      output[6] = WrapLow(v10);
      output[7] = WrapLow(v2);
      output[8] = WrapLow(v3);
      output[9] = WrapLow(v11);
      output[10] = WrapLow(v15);
      output[11] = WrapLow(v7);
      output[12] = WrapLow(v5);
      output[13] = WrapLow(-v13);
      output[14] = WrapLow(v9);
      output[15] = WrapLow(-v1);
  }
  /// <summary>
  /// One-dimensional 16-point inverse transform.
  /// Reads the first 16 entries of <paramref name="input"/> (each truncated to 16 bits)
  /// and writes 16 results to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">Source coefficients; indices 0..15 are read.</param>
  /// <param name="output">Destination; indices 0..15 are written, in ascending order.</param>
  [SkipLocalsInit]
  public static void Idct16(ReadOnlySpan<int> input, Span<int> output)
  {
      // Two scratch buffers used alternately: every stage reads one buffer and
      // writes the other, so statements inside a stage never observe each other.
      // Both are fully written before being read, which is why skipping the
      // zero-initialisation of locals is safe here.
      Span<short> bufA = stackalloc short[16];
      Span<short> bufB = stackalloc short[16];

      // stage 1: load the coefficients in 4-bit bit-reversed index order
      bufA[0] = (short)input[0];
      bufA[1] = (short)input[8];
      bufA[2] = (short)input[4];
      bufA[3] = (short)input[12];
      bufA[4] = (short)input[2];
      bufA[5] = (short)input[10];
      bufA[6] = (short)input[6];
      bufA[7] = (short)input[14];
      bufA[8] = (short)input[1];
      bufA[9] = (short)input[9];
      bufA[10] = (short)input[5];
      bufA[11] = (short)input[13];
      bufA[12] = (short)input[3];
      bufA[13] = (short)input[11];
      bufA[14] = (short)input[7];
      bufA[15] = (short)input[15];

      // stage 2: entries 0..7 pass through, entries 8..15 are rotated in pairs
      bufA.Slice(0, 8).CopyTo(bufB);
      bufB[8] = RoundWrap(bufA[8] * CosPi30_64 - bufA[15] * CosPi2_64);
      bufB[15] = RoundWrap(bufA[8] * CosPi2_64 + bufA[15] * CosPi30_64);
      bufB[9] = RoundWrap(bufA[9] * CosPi14_64 - bufA[14] * CosPi18_64);
      bufB[14] = RoundWrap(bufA[9] * CosPi18_64 + bufA[14] * CosPi14_64);
      bufB[10] = RoundWrap(bufA[10] * CosPi22_64 - bufA[13] * CosPi10_64);
      bufB[13] = RoundWrap(bufA[10] * CosPi10_64 + bufA[13] * CosPi22_64);
      bufB[11] = RoundWrap(bufA[11] * CosPi6_64 - bufA[12] * CosPi26_64);
      bufB[12] = RoundWrap(bufA[11] * CosPi26_64 + bufA[12] * CosPi6_64);

      // stage 3
      bufB.Slice(0, 4).CopyTo(bufA);
      bufA[4] = RoundWrap(bufB[4] * CosPi28_64 - bufB[7] * CosPi4_64);
      bufA[7] = RoundWrap(bufB[4] * CosPi4_64 + bufB[7] * CosPi28_64);
      bufA[5] = RoundWrap(bufB[5] * CosPi12_64 - bufB[6] * CosPi20_64);
      bufA[6] = RoundWrap(bufB[5] * CosPi20_64 + bufB[6] * CosPi12_64);
      for (int k = 8; k < 16; k += 4)
      {
          bufA[k] = Wrap16(bufB[k] + bufB[k + 1]);
          bufA[k + 1] = Wrap16(bufB[k] - bufB[k + 1]);
          bufA[k + 2] = Wrap16(-bufB[k + 2] + bufB[k + 3]);
          bufA[k + 3] = Wrap16(bufB[k + 2] + bufB[k + 3]);
      }

      // stage 4
      bufB[0] = RoundWrap((bufA[0] + bufA[1]) * CosPi16_64);
      bufB[1] = RoundWrap((bufA[0] - bufA[1]) * CosPi16_64);
      bufB[2] = RoundWrap(bufA[2] * CosPi24_64 - bufA[3] * CosPi8_64);
      bufB[3] = RoundWrap(bufA[2] * CosPi8_64 + bufA[3] * CosPi24_64);
      bufB[4] = Wrap16(bufA[4] + bufA[5]);
      bufB[5] = Wrap16(bufA[4] - bufA[5]);
      bufB[6] = Wrap16(-bufA[6] + bufA[7]);
      bufB[7] = Wrap16(bufA[6] + bufA[7]);
      bufB[8] = bufA[8];
      bufB[15] = bufA[15];
      bufB[9] = RoundWrap(-bufA[9] * CosPi8_64 + bufA[14] * CosPi24_64);
      bufB[14] = RoundWrap(bufA[9] * CosPi24_64 + bufA[14] * CosPi8_64);
      bufB[10] = RoundWrap(-bufA[10] * CosPi24_64 - bufA[13] * CosPi8_64);
      bufB[13] = RoundWrap(-bufA[10] * CosPi8_64 + bufA[13] * CosPi24_64);
      bufB[11] = bufA[11];
      bufB[12] = bufA[12];

      // stage 5
      bufA[0] = Wrap16(bufB[0] + bufB[3]);
      bufA[1] = Wrap16(bufB[1] + bufB[2]);
      bufA[2] = Wrap16(bufB[1] - bufB[2]);
      bufA[3] = Wrap16(bufB[0] - bufB[3]);
      bufA[4] = bufB[4];
      bufA[5] = RoundWrap((bufB[6] - bufB[5]) * CosPi16_64);
      bufA[6] = RoundWrap((bufB[5] + bufB[6]) * CosPi16_64);
      bufA[7] = bufB[7];
      bufA[8] = Wrap16(bufB[8] + bufB[11]);
      bufA[9] = Wrap16(bufB[9] + bufB[10]);
      bufA[10] = Wrap16(bufB[9] - bufB[10]);
      bufA[11] = Wrap16(bufB[8] - bufB[11]);
      bufA[12] = Wrap16(-bufB[12] + bufB[15]);
      bufA[13] = Wrap16(-bufB[13] + bufB[14]);
      bufA[14] = Wrap16(bufB[13] + bufB[14]);
      bufA[15] = Wrap16(bufB[12] + bufB[15]);

      // stage 6: mirrored butterfly over entries 0..7, centre rotation on 10..13
      for (int k = 0; k < 4; k++)
      {
          bufB[k] = Wrap16(bufA[k] + bufA[7 - k]);
          bufB[7 - k] = Wrap16(bufA[k] - bufA[7 - k]);
      }

      bufB[8] = bufA[8];
      bufB[9] = bufA[9];
      bufB[10] = RoundWrap((-bufA[10] + bufA[13]) * CosPi16_64);
      bufB[13] = RoundWrap((bufA[10] + bufA[13]) * CosPi16_64);
      bufB[11] = RoundWrap((-bufA[11] + bufA[12]) * CosPi16_64);
      bufB[12] = RoundWrap((bufA[11] + bufA[12]) * CosPi16_64);
      bufB[14] = bufA[14];
      bufB[15] = bufA[15];

      // stage 7: final mirrored butterfly; sums fill the first half of the
      // output, differences the second half (written in ascending index order).
      for (int k = 0; k < 8; k++)
      {
          output[k] = WrapLow(bufB[k] + bufB[15 - k]);
      }

      for (int k = 8; k < 16; k++)
      {
          output[k] = WrapLow(bufB[15 - k] - bufB[k]);
      }

      // Round-shift a scaled intermediate, wrap it and store it as 16 bits.
      static short RoundWrap(long scaled) => (short)WrapLow(DctConstRoundShift(scaled));

      // Wrap an unscaled sum/difference and store it as 16 bits.
      static short Wrap16(int value) => (short)WrapLow(value);
  }
  /// <summary>
  /// Two-dimensional 16x16 inverse transform over all 16 coefficient rows.
  /// The result is added into <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Row-major 16x16 coefficient block.</param>
  /// <param name="dest">Pixel buffer updated in place.</param>
  /// <param name="stride">Distance, in elements, between consecutive pixel rows of <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 16;

      // Every entry of these buffers is written before it is read, so no clearing is needed.
      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> transformed = stackalloc int[Size];

      // Pass 1: transform each coefficient row into the intermediate block.
      for (int row = 0; row < Size; row++)
      {
          Idct16(input.Slice(row * Size), rowPass.Slice(row * Size));
      }

      // Pass 2: transform each column of the intermediate block and add it to the pixels.
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct16(column, transformed);

          for (int y = 0; y < Size; y++)
          {
              ref byte pixel = ref dest[y * stride + x];
              pixel = ClipPixelAdd(pixel, BitUtils.RoundPowerOfTwo(transformed[y], 6));
          }
      }
  }
  /// <summary>
  /// Two-dimensional 16x16 inverse transform that only runs the row pass on the
  /// first 8 coefficient rows. The result is added into <paramref name="dest"/>.
  /// </summary>
  /// <remarks>
  /// Intended for blocks whose non-zero coefficients all lie in the upper-left 8x8
  /// area; rows 8..15 of the intermediate block are treated as zero.
  /// </remarks>
  /// <param name="input">Row-major coefficient block; only the first 8 rows are read.</param>
  /// <param name="dest">Pixel buffer updated in place.</param>
  /// <param name="stride">Distance, in elements, between consecutive pixel rows of <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 16;
      const int NonZeroRows = 8;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> transformed = stackalloc int[Size];

      // Locals are not zero-initialised, and the skipped rows must read as zero.
      rowPass.Clear();

      // Pass 1: only the rows that can hold non-zero coefficients are transformed.
      for (int row = 0; row < NonZeroRows; row++)
      {
          Idct16(input.Slice(row * Size), rowPass.Slice(row * Size));
      }

      // Pass 2: transform every column and add it to the pixels.
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct16(column, transformed);

          for (int y = 0; y < Size; y++)
          {
              ref byte pixel = ref dest[y * stride + x];
              pixel = ClipPixelAdd(pixel, BitUtils.RoundPowerOfTwo(transformed[y], 6));
          }
      }
  }
  /// <summary>
  /// Two-dimensional 16x16 inverse transform that only runs the row pass on the
  /// first 4 coefficient rows. The result is added into <paramref name="dest"/>.
  /// </summary>
  /// <remarks>
  /// Intended for blocks whose non-zero coefficients all lie in the upper-left 4x4
  /// area; rows 4..15 of the intermediate block are treated as zero.
  /// </remarks>
  /// <param name="input">Row-major coefficient block; only the first 4 rows are read.</param>
  /// <param name="dest">Pixel buffer updated in place.</param>
  /// <param name="stride">Distance, in elements, between consecutive pixel rows of <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 16;
      const int NonZeroRows = 4;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> transformed = stackalloc int[Size];

      // Locals are not zero-initialised, and the skipped rows must read as zero.
      rowPass.Clear();

      // Pass 1: only the rows that can hold non-zero coefficients are transformed.
      for (int row = 0; row < NonZeroRows; row++)
      {
          Idct16(input.Slice(row * Size), rowPass.Slice(row * Size));
      }

      // Pass 2: transform every column and add it to the pixels.
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct16(column, transformed);

          for (int y = 0; y < Size; y++)
          {
              ref byte pixel = ref dest[y * stride + x];
              pixel = ClipPixelAdd(pixel, BitUtils.RoundPowerOfTwo(transformed[y], 6));
          }
      }
  }
  /// <summary>
  /// 16x16 inverse transform for a block where only <c>input[0]</c> is used:
  /// one value is derived from it and added to every pixel of the 16x16 area.
  /// </summary>
  /// <param name="input">Coefficient block; only index 0 is read (truncated to 16 bits).</param>
  /// <param name="dest">Pixel buffer updated in place; must cover 15 * stride + 16 elements.</param>
  /// <param name="stride">Distance, in elements, between consecutive pixel rows of <paramref name="dest"/>.</param>
  public static void Idct16x161Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      // The same scaling is applied twice, then the value is rounded down by 6 bits.
      int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      output = WrapLow(DctConstRoundShift(output * CosPi16_64));
      long a1 = BitUtils.RoundPowerOfTwo(output, 6);

      for (int j = 0; j < 16; ++j)
      {
          // Rows are addressed by index rather than by re-slicing dest after each row:
          // slicing past the last row would throw when dest ends less than one
          // full stride beyond the start of row 15, even though every pixel
          // that has to be touched is in range.
          int rowStart = j * stride;

          for (int i = 0; i < 16; ++i)
          {
              ref byte pixel = ref dest[rowStart + i];
              pixel = ClipPixelAdd(pixel, a1);
          }
      }
  }
  /// <summary>
  /// One-dimensional 32-point inverse transform.
  /// Reads the first 32 entries of <paramref name="input"/> (each truncated to 16 bits)
  /// and writes 32 results to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">Source coefficients; indices 0..31 are read.</param>
  /// <param name="output">Destination; indices 0..31 are written, in ascending order.</param>
  [SkipLocalsInit]
  public static void Idct32(ReadOnlySpan<int> input, Span<int> output)
  {
      // Two scratch buffers used alternately: every stage reads one buffer and
      // writes the other, so statements inside a stage never observe each other.
      // Both are fully written before being read, which is why skipping the
      // zero-initialisation of locals is safe here.
      Span<short> bufA = stackalloc short[32];
      Span<short> bufB = stackalloc short[32];

      // stage 1: even-indexed coefficients are loaded directly into 0..15,
      // odd-indexed coefficients are rotated in pairs into 16..31
      bufA[0] = (short)input[0];
      bufA[1] = (short)input[16];
      bufA[2] = (short)input[8];
      bufA[3] = (short)input[24];
      bufA[4] = (short)input[4];
      bufA[5] = (short)input[20];
      bufA[6] = (short)input[12];
      bufA[7] = (short)input[28];
      bufA[8] = (short)input[2];
      bufA[9] = (short)input[18];
      bufA[10] = (short)input[10];
      bufA[11] = (short)input[26];
      bufA[12] = (short)input[6];
      bufA[13] = (short)input[22];
      bufA[14] = (short)input[14];
      bufA[15] = (short)input[30];
      bufA[16] = RoundWrap((short)input[1] * CosPi31_64 - (short)input[31] * CosPi1_64);
      bufA[31] = RoundWrap((short)input[1] * CosPi1_64 + (short)input[31] * CosPi31_64);
      bufA[17] = RoundWrap((short)input[17] * CosPi15_64 - (short)input[15] * CosPi17_64);
      bufA[30] = RoundWrap((short)input[17] * CosPi17_64 + (short)input[15] * CosPi15_64);
      bufA[18] = RoundWrap((short)input[9] * CosPi23_64 - (short)input[23] * CosPi9_64);
      bufA[29] = RoundWrap((short)input[9] * CosPi9_64 + (short)input[23] * CosPi23_64);
      bufA[19] = RoundWrap((short)input[25] * CosPi7_64 - (short)input[7] * CosPi25_64);
      bufA[28] = RoundWrap((short)input[25] * CosPi25_64 + (short)input[7] * CosPi7_64);
      bufA[20] = RoundWrap((short)input[5] * CosPi27_64 - (short)input[27] * CosPi5_64);
      bufA[27] = RoundWrap((short)input[5] * CosPi5_64 + (short)input[27] * CosPi27_64);
      bufA[21] = RoundWrap((short)input[21] * CosPi11_64 - (short)input[11] * CosPi21_64);
      bufA[26] = RoundWrap((short)input[21] * CosPi21_64 + (short)input[11] * CosPi11_64);
      bufA[22] = RoundWrap((short)input[13] * CosPi19_64 - (short)input[19] * CosPi13_64);
      bufA[25] = RoundWrap((short)input[13] * CosPi13_64 + (short)input[19] * CosPi19_64);
      bufA[23] = RoundWrap((short)input[29] * CosPi3_64 - (short)input[3] * CosPi29_64);
      bufA[24] = RoundWrap((short)input[29] * CosPi29_64 + (short)input[3] * CosPi3_64);

      // stage 2
      bufA.Slice(0, 8).CopyTo(bufB);
      bufB[8] = RoundWrap(bufA[8] * CosPi30_64 - bufA[15] * CosPi2_64);
      bufB[15] = RoundWrap(bufA[8] * CosPi2_64 + bufA[15] * CosPi30_64);
      bufB[9] = RoundWrap(bufA[9] * CosPi14_64 - bufA[14] * CosPi18_64);
      bufB[14] = RoundWrap(bufA[9] * CosPi18_64 + bufA[14] * CosPi14_64);
      bufB[10] = RoundWrap(bufA[10] * CosPi22_64 - bufA[13] * CosPi10_64);
      bufB[13] = RoundWrap(bufA[10] * CosPi10_64 + bufA[13] * CosPi22_64);
      bufB[11] = RoundWrap(bufA[11] * CosPi6_64 - bufA[12] * CosPi26_64);
      bufB[12] = RoundWrap(bufA[11] * CosPi26_64 + bufA[12] * CosPi6_64);
      for (int k = 16; k < 32; k += 4)
      {
          bufB[k] = Wrap16(bufA[k] + bufA[k + 1]);
          bufB[k + 1] = Wrap16(bufA[k] - bufA[k + 1]);
          bufB[k + 2] = Wrap16(-bufA[k + 2] + bufA[k + 3]);
          bufB[k + 3] = Wrap16(bufA[k + 2] + bufA[k + 3]);
      }

      // stage 3
      bufB.Slice(0, 4).CopyTo(bufA);
      bufA[4] = RoundWrap(bufB[4] * CosPi28_64 - bufB[7] * CosPi4_64);
      bufA[7] = RoundWrap(bufB[4] * CosPi4_64 + bufB[7] * CosPi28_64);
      bufA[5] = RoundWrap(bufB[5] * CosPi12_64 - bufB[6] * CosPi20_64);
      bufA[6] = RoundWrap(bufB[5] * CosPi20_64 + bufB[6] * CosPi12_64);
      for (int k = 8; k < 16; k += 4)
      {
          bufA[k] = Wrap16(bufB[k] + bufB[k + 1]);
          bufA[k + 1] = Wrap16(bufB[k] - bufB[k + 1]);
          bufA[k + 2] = Wrap16(-bufB[k + 2] + bufB[k + 3]);
          bufA[k + 3] = Wrap16(bufB[k + 2] + bufB[k + 3]);
      }

      bufA[16] = bufB[16];
      bufA[31] = bufB[31];
      bufA[17] = RoundWrap(-bufB[17] * CosPi4_64 + bufB[30] * CosPi28_64);
      bufA[30] = RoundWrap(bufB[17] * CosPi28_64 + bufB[30] * CosPi4_64);
      bufA[18] = RoundWrap(-bufB[18] * CosPi28_64 - bufB[29] * CosPi4_64);
      bufA[29] = RoundWrap(-bufB[18] * CosPi4_64 + bufB[29] * CosPi28_64);
      bufA[19] = bufB[19];
      bufA[20] = bufB[20];
      bufA[21] = RoundWrap(-bufB[21] * CosPi20_64 + bufB[26] * CosPi12_64);
      bufA[26] = RoundWrap(bufB[21] * CosPi12_64 + bufB[26] * CosPi20_64);
      bufA[22] = RoundWrap(-bufB[22] * CosPi12_64 - bufB[25] * CosPi20_64);
      bufA[25] = RoundWrap(-bufB[22] * CosPi20_64 + bufB[25] * CosPi12_64);
      bufA[23] = bufB[23];
      bufA[24] = bufB[24];
      bufA[27] = bufB[27];
      bufA[28] = bufB[28];

      // stage 4
      bufB[0] = RoundWrap((bufA[0] + bufA[1]) * CosPi16_64);
      bufB[1] = RoundWrap((bufA[0] - bufA[1]) * CosPi16_64);
      bufB[2] = RoundWrap(bufA[2] * CosPi24_64 - bufA[3] * CosPi8_64);
      bufB[3] = RoundWrap(bufA[2] * CosPi8_64 + bufA[3] * CosPi24_64);
      bufB[4] = Wrap16(bufA[4] + bufA[5]);
      bufB[5] = Wrap16(bufA[4] - bufA[5]);
      bufB[6] = Wrap16(-bufA[6] + bufA[7]);
      bufB[7] = Wrap16(bufA[6] + bufA[7]);
      bufB[8] = bufA[8];
      bufB[15] = bufA[15];
      bufB[9] = RoundWrap(-bufA[9] * CosPi8_64 + bufA[14] * CosPi24_64);
      bufB[14] = RoundWrap(bufA[9] * CosPi24_64 + bufA[14] * CosPi8_64);
      bufB[10] = RoundWrap(-bufA[10] * CosPi24_64 - bufA[13] * CosPi8_64);
      bufB[13] = RoundWrap(-bufA[10] * CosPi8_64 + bufA[13] * CosPi24_64);
      bufB[11] = bufA[11];
      bufB[12] = bufA[12];
      for (int k = 16; k < 32; k += 8)
      {
          bufB[k] = Wrap16(bufA[k] + bufA[k + 3]);
          bufB[k + 1] = Wrap16(bufA[k + 1] + bufA[k + 2]);
          bufB[k + 2] = Wrap16(bufA[k + 1] - bufA[k + 2]);
          bufB[k + 3] = Wrap16(bufA[k] - bufA[k + 3]);
          bufB[k + 4] = Wrap16(-bufA[k + 4] + bufA[k + 7]);
          bufB[k + 5] = Wrap16(-bufA[k + 5] + bufA[k + 6]);
          bufB[k + 6] = Wrap16(bufA[k + 5] + bufA[k + 6]);
          bufB[k + 7] = Wrap16(bufA[k + 4] + bufA[k + 7]);
      }

      // stage 5
      bufA[0] = Wrap16(bufB[0] + bufB[3]);
      bufA[1] = Wrap16(bufB[1] + bufB[2]);
      bufA[2] = Wrap16(bufB[1] - bufB[2]);
      bufA[3] = Wrap16(bufB[0] - bufB[3]);
      bufA[4] = bufB[4];
      bufA[5] = RoundWrap((bufB[6] - bufB[5]) * CosPi16_64);
      bufA[6] = RoundWrap((bufB[5] + bufB[6]) * CosPi16_64);
      bufA[7] = bufB[7];
      bufA[8] = Wrap16(bufB[8] + bufB[11]);
      bufA[9] = Wrap16(bufB[9] + bufB[10]);
      bufA[10] = Wrap16(bufB[9] - bufB[10]);
      bufA[11] = Wrap16(bufB[8] - bufB[11]);
      bufA[12] = Wrap16(-bufB[12] + bufB[15]);
      bufA[13] = Wrap16(-bufB[13] + bufB[14]);
      bufA[14] = Wrap16(bufB[13] + bufB[14]);
      bufA[15] = Wrap16(bufB[12] + bufB[15]);
      bufA[16] = bufB[16];
      bufA[17] = bufB[17];
      bufA[18] = RoundWrap(-bufB[18] * CosPi8_64 + bufB[29] * CosPi24_64);
      bufA[29] = RoundWrap(bufB[18] * CosPi24_64 + bufB[29] * CosPi8_64);
      bufA[19] = RoundWrap(-bufB[19] * CosPi8_64 + bufB[28] * CosPi24_64);
      bufA[28] = RoundWrap(bufB[19] * CosPi24_64 + bufB[28] * CosPi8_64);
      bufA[20] = RoundWrap(-bufB[20] * CosPi24_64 - bufB[27] * CosPi8_64);
      bufA[27] = RoundWrap(-bufB[20] * CosPi8_64 + bufB[27] * CosPi24_64);
      bufA[21] = RoundWrap(-bufB[21] * CosPi24_64 - bufB[26] * CosPi8_64);
      bufA[26] = RoundWrap(-bufB[21] * CosPi8_64 + bufB[26] * CosPi24_64);
      bufA[22] = bufB[22];
      bufA[23] = bufB[23];
      bufA[24] = bufB[24];
      bufA[25] = bufB[25];
      bufA[30] = bufB[30];
      bufA[31] = bufB[31];

      // stage 6
      for (int k = 0; k < 4; k++)
      {
          bufB[k] = Wrap16(bufA[k] + bufA[7 - k]);
          bufB[7 - k] = Wrap16(bufA[k] - bufA[7 - k]);
      }

      bufB[8] = bufA[8];
      bufB[9] = bufA[9];
      bufB[10] = RoundWrap((-bufA[10] + bufA[13]) * CosPi16_64);
      bufB[13] = RoundWrap((bufA[10] + bufA[13]) * CosPi16_64);
      bufB[11] = RoundWrap((-bufA[11] + bufA[12]) * CosPi16_64);
      bufB[12] = RoundWrap((bufA[11] + bufA[12]) * CosPi16_64);
      bufB[14] = bufA[14];
      bufB[15] = bufA[15];
      for (int k = 0; k < 4; k++)
      {
          bufB[16 + k] = Wrap16(bufA[16 + k] + bufA[23 - k]);
          bufB[23 - k] = Wrap16(bufA[16 + k] - bufA[23 - k]);
          bufB[24 + k] = Wrap16(-bufA[24 + k] + bufA[31 - k]);
          bufB[31 - k] = Wrap16(bufA[24 + k] + bufA[31 - k]);
      }

      // stage 7
      for (int k = 0; k < 8; k++)
      {
          bufA[k] = Wrap16(bufB[k] + bufB[15 - k]);
          bufA[15 - k] = Wrap16(bufB[k] - bufB[15 - k]);
      }

      bufB.Slice(16, 4).CopyTo(bufA.Slice(16));
      for (int k = 0; k < 4; k++)
      {
          bufA[20 + k] = RoundWrap((-bufB[20 + k] + bufB[27 - k]) * CosPi16_64);
          bufA[27 - k] = RoundWrap((bufB[20 + k] + bufB[27 - k]) * CosPi16_64);
      }

      bufB.Slice(28, 4).CopyTo(bufA.Slice(28));

      // final stage: mirrored butterfly; sums fill the first half of the
      // output, differences the second half (written in ascending index order).
      for (int k = 0; k < 16; k++)
      {
          output[k] = WrapLow(bufA[k] + bufA[31 - k]);
      }

      for (int k = 16; k < 32; k++)
      {
          output[k] = WrapLow(bufA[31 - k] - bufA[k]);
      }

      // Round-shift a scaled intermediate, wrap it and store it as 16 bits.
      static short RoundWrap(long scaled) => (short)WrapLow(DctConstRoundShift(scaled));

      // Wrap an unscaled sum/difference and store it as 16 bits.
      static short Wrap16(int value) => (short)WrapLow(value);
  }
  /// <summary>
  /// Two-dimensional 32x32 inverse transform over all 32 coefficient rows.
  /// The result is added into <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Row-major 32x32 coefficient block.</param>
  /// <param name="dest">Pixel buffer updated in place.</param>
  /// <param name="stride">Distance, in elements, between consecutive pixel rows of <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 32;

      // Every entry of these buffers is written before it is read, so no clearing is needed.
      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> transformed = stackalloc int[Size];

      // Rows
      for (int row = 0; row < Size; row++)
      {
          ReadOnlySpan<int> coeffs = input.Slice(row * Size);
          Span<int> rowOut = rowPass.Slice(row * Size, Size);

          // OR together the low 16 bits of the row; the 1-D transform only uses
          // those bits, so a zero result means the whole row transforms to zero.
          short anyBits = 0;
          for (int k = 0; k < Size; k++)
          {
              anyBits |= (short)coeffs[k];
          }

          if (anyBits == 0)
          {
              rowOut.Clear();
          }
          else
          {
              Idct32(coeffs, rowOut);
          }
      }

      // Columns
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct32(column, transformed);

          for (int y = 0; y < Size; y++)
          {
              ref byte pixel = ref dest[y * stride + x];
              pixel = ClipPixelAdd(pixel, BitUtils.RoundPowerOfTwo(transformed[y], 6));
          }
      }
  }
  /// <summary>
  /// Two-dimensional 32x32 inverse transform that only runs the row pass on the
  /// first 16 coefficient rows. The result is added into <paramref name="dest"/>.
  /// </summary>
  /// <remarks>
  /// Intended for blocks whose non-zero coefficients all lie in the upper-left 16x16
  /// area; rows 16..31 of the intermediate block are treated as zero.
  /// </remarks>
  /// <param name="input">Row-major coefficient block; only the first 16 rows are read.</param>
  /// <param name="dest">Pixel buffer updated in place.</param>
  /// <param name="stride">Distance, in elements, between consecutive pixel rows of <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 32;
      const int NonZeroRows = 16;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> transformed = stackalloc int[Size];

      // Locals are not zero-initialised, and the skipped rows must read as zero.
      rowPass.Clear();

      // Rows: only those that can hold non-zero coefficients are transformed.
      for (int row = 0; row < NonZeroRows; row++)
      {
          Idct32(input.Slice(row * Size), rowPass.Slice(row * Size));
      }

      // Columns
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct32(column, transformed);

          for (int y = 0; y < Size; y++)
          {
              ref byte pixel = ref dest[y * stride + x];
              pixel = ClipPixelAdd(pixel, BitUtils.RoundPowerOfTwo(transformed[y], 6));
          }
      }
  }
  /// <summary>
  /// Two-dimensional 32x32 inverse transform that only runs the row pass on the
  /// first 8 coefficient rows. The result is added into <paramref name="dest"/>.
  /// </summary>
  /// <remarks>
  /// Intended for blocks whose non-zero coefficients all lie in the upper-left 8x8
  /// area; rows 8..31 of the intermediate block are treated as zero.
  /// </remarks>
  /// <param name="input">Row-major coefficient block; only the first 8 rows are read.</param>
  /// <param name="dest">Pixel buffer updated in place.</param>
  /// <param name="stride">Distance, in elements, between consecutive pixel rows of <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 32;
      const int NonZeroRows = 8;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> transformed = stackalloc int[Size];

      // Locals are not zero-initialised, and the skipped rows must read as zero.
      rowPass.Clear();

      // Rows: only those that can hold non-zero coefficients are transformed.
      for (int row = 0; row < NonZeroRows; row++)
      {
          Idct32(input.Slice(row * Size), rowPass.Slice(row * Size));
      }

      // Columns
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct32(column, transformed);

          for (int y = 0; y < Size; y++)
          {
              ref byte pixel = ref dest[y * stride + x];
              pixel = ClipPixelAdd(pixel, BitUtils.RoundPowerOfTwo(transformed[y], 6));
          }
      }
  }
  /// <summary>
  /// 32x32 path that reads only <c>input[0]</c>: one value is derived from it and merged into
  /// every pixel of the 32x32 destination area through <c>ClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficient buffer; only element 0 is read.</param>
  /// <param name="dest">Destination pixels, updated in place. Must cover 31 * stride + 32 elements.</param>
  /// <param name="stride">Distance, in elements, between successive rows of <paramref name="dest"/>.</param>
  public static void Idct32x321Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      // The coefficient is truncated to 16 bits, then scaled by CosPi16_64 and rounded twice.
      int dc = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      dc = WrapLow(DctConstRoundShift(dc * CosPi16_64));
      long delta = BitUtils.RoundPowerOfTwo(dc, 6);

      for (int row = 0; row < 32; row++)
      {
          // Slice each row to exactly the 32 pixels that are written. Advancing a running span
          // by `stride` after the last row throws when dest is shorter than 32 * stride, even
          // though nothing past the last row's pixels is ever accessed.
          Span<byte> line = dest.Slice(row * stride, 32);
          for (int col = 0; col < 32; col++)
          {
              line[col] = ClipPixelAdd(line[col], delta);
          }
      }
  }
  /// <summary>
  /// 4x4 inverse Walsh-Hadamard transform on high-bit-depth data: a horizontal pass over the
  /// four input rows into a scratch buffer, then a vertical pass whose results are merged into
  /// <paramref name="dest"/> through <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">16 coefficients in rows of 4; each is shifted right by <c>UnitQuantShift</c> before use.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Distance, in elements, between successive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
  [SkipLocalsInit]
  public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
         0.5 shifts per pixel. */
      Span<int> intermediate = stackalloc int[16];

      // Horizontal pass
      ReadOnlySpan<int> srcRow = input;
      Span<int> dstRow = intermediate;
      for (int row = 0; row < 4; row++)
      {
          // Note the a, c, d, b load order: the lifting steps expect it.
          long a = srcRow[0] >> UnitQuantShift;
          long c = srcRow[1] >> UnitQuantShift;
          long d = srcRow[2] >> UnitQuantShift;
          long b = srcRow[3] >> UnitQuantShift;

          Lift(ref a, ref b, ref c, ref d);

          dstRow[0] = HighbdWrapLow(a, bd);
          dstRow[1] = HighbdWrapLow(b, bd);
          dstRow[2] = HighbdWrapLow(c, bd);
          dstRow[3] = HighbdWrapLow(d, bd);

          srcRow = srcRow.Slice(4);
          dstRow = dstRow.Slice(4);
      }

      // Vertical pass
      Span<ushort> column = dest;
      for (int col = 0; col < 4; col++)
      {
          long a = intermediate[col];
          long c = intermediate[col + 4];
          long d = intermediate[col + 8];
          long b = intermediate[col + 12];

          Lift(ref a, ref b, ref c, ref d);

          column[stride * 0] = HighbdClipPixelAdd(column[stride * 0], HighbdWrapLow(a, bd), bd);
          column[stride * 1] = HighbdClipPixelAdd(column[stride * 1], HighbdWrapLow(b, bd), bd);
          column[stride * 2] = HighbdClipPixelAdd(column[stride * 2], HighbdWrapLow(c, bd), bd);
          column[stride * 3] = HighbdClipPixelAdd(column[stride * 3], HighbdWrapLow(d, bd), bd);

          column = column.Slice(1);
      }

      // Lifting steps shared by both passes; the statement order is significant.
      static void Lift(ref long a1, ref long b1, ref long c1, ref long d1)
      {
          a1 += c1;
          d1 -= b1;
          long e1 = (a1 - d1) >> 1;
          b1 = e1 - b1;
          c1 = e1 - c1;
          a1 -= b1;
          d1 += c1;
      }
  }
  /// <summary>
  /// 4x4 inverse Walsh-Hadamard path that reads only <c>input[0]</c>: the value is split into
  /// two parts for the first pass, each first-pass value is split again, and the results are
  /// merged into <paramref name="dest"/> through <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficient buffer; only element 0 is read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Distance, in elements, between successive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
  [SkipLocalsInit]
  public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> firstPass = stackalloc int[4];

      // First pass: element 0 gets the remainder, elements 1..3 share the halved value.
      long remainder = input[0] >> UnitQuantShift;
      long half = remainder >> 1;
      remainder -= half;

      firstPass[0] = HighbdWrapLow(remainder, bd);
      int wrappedHalf = HighbdWrapLow(half, bd);
      firstPass[1] = wrappedHalf;
      firstPass[2] = wrappedHalf;
      firstPass[3] = wrappedHalf;

      // Second pass: one destination column per first-pass value.
      Span<ushort> column = dest;
      for (int col = 0; col < 4; col++)
      {
          int value = firstPass[col];
          long lower = value >> 1;
          long upper = value - lower;

          column[stride * 0] = HighbdClipPixelAdd(column[stride * 0], upper, bd);
          column[stride * 1] = HighbdClipPixelAdd(column[stride * 1], lower, bd);
          column[stride * 2] = HighbdClipPixelAdd(column[stride * 2], lower, bd);
          column[stride * 3] = HighbdClipPixelAdd(column[stride * 3], lower, bd);

          column = column.Slice(1);
      }
  }
  /// <summary>
  /// 4-point 1-D transform on high-bit-depth coefficients using the <c>SinPi*_9</c> constants.
  /// Writes four zeros to <paramref name="output"/> when <c>DetectInvalidHighbdInput</c> rejects
  /// the input or when all four inputs are zero.
  /// </summary>
  /// <param name="input">At least four coefficients.</param>
  /// <param name="output">Receives four results.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c>.</param>
  public static void HighbdIadst4(ReadOnlySpan<int> input, Span<int> output, int bd)
  {
      int c0 = input[0];
      int c1 = input[1];
      int c2 = input[2];
      int c3 = input[3];

      if (DetectInvalidHighbdInput(input, 4) != 0)
      {
          Debug.Assert(false, "invalid highbd txfm input");
          output.Slice(0, 4).Clear();
          return;
      }

      if ((c0 | c1 | c2 | c3) == 0)
      {
          output.Slice(0, 4).Clear();
          return;
      }

      // Every product is widened to 64 bits before multiplying.
      long a = ((long)SinPi1_9 * c0) + ((long)SinPi4_9 * c2) + ((long)SinPi2_9 * c3);
      long b = ((long)SinPi2_9 * c0) - ((long)SinPi1_9 * c2) - ((long)SinPi4_9 * c3);
      long c = (long)SinPi3_9 * c1;

      // The c0 - c2 + c3 combination is wrapped to the working range before scaling.
      long combined = HighbdWrapLow(c0 - c2 + c3, bd);
      long d = SinPi3_9 * combined;

      // 1-D transform scaling factor is sqrt(2).
      // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
      // + 1b (addition) = 29b.
      // Hence the output bit depth is 15b.
      output[0] = HighbdWrapLow(DctConstRoundShift(a + c), bd);
      output[1] = HighbdWrapLow(DctConstRoundShift(b + c), bd);
      output[2] = HighbdWrapLow(DctConstRoundShift(d), bd);
      output[3] = HighbdWrapLow(DctConstRoundShift(a + b - c), bd);
  }
  /// <summary>
  /// 4-point 1-D transform on high-bit-depth coefficients in two stages: rotations by the
  /// <c>CosPi*_64</c> constants, then a sum/difference butterfly. Writes four zeros to
  /// <paramref name="output"/> when <c>DetectInvalidHighbdInput</c> rejects the input.
  /// </summary>
  /// <param name="input">At least four coefficients.</param>
  /// <param name="output">Receives four results; may be the same memory as <paramref name="input"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c>.</param>
  [SkipLocalsInit]
  public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
  {
      if (DetectInvalidHighbdInput(input, 4) != 0)
      {
          Debug.Assert(false, "invalid highbd txfm input");
          output.Slice(0, 4).Clear();
          return;
      }

      // Stage 1. All intermediates live in locals and nothing is written to output until
      // every input has been read, so in-place use (same span for both arguments) is safe.
      long sum = (input[0] + input[2]) * (long)CosPi16_64;
      long diff = (input[0] - input[2]) * (long)CosPi16_64;
      int even0 = HighbdWrapLow(DctConstRoundShift(sum), bd);
      int even1 = HighbdWrapLow(DctConstRoundShift(diff), bd);

      long rotA = input[1] * (long)CosPi24_64 - input[3] * (long)CosPi8_64;
      long rotB = input[1] * (long)CosPi8_64 + input[3] * (long)CosPi24_64;
      int odd0 = HighbdWrapLow(DctConstRoundShift(rotA), bd);
      int odd1 = HighbdWrapLow(DctConstRoundShift(rotB), bd);

      // Stage 2
      output[0] = HighbdWrapLow(even0 + odd1, bd);
      output[1] = HighbdWrapLow(even1 + odd0, bd);
      output[2] = HighbdWrapLow(even1 - odd0, bd);
      output[3] = HighbdWrapLow(even0 - odd1, bd);
  }
  /// <summary>
  /// Full 2-D 4x4 high-bit-depth inverse transform: <c>HighbdIdct4</c> over the four rows, then
  /// over the four columns, with each result rounded by 4 bits and merged into
  /// <paramref name="dest"/> through <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">16 coefficients in rows of 4.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Distance, in elements, between successive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the 1-D transform and the clipping helper.</param>
  [SkipLocalsInit]
  public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> rowResults = stackalloc int[4 * 4];
      Span<int> columnIn = stackalloc int[4];
      Span<int> columnOut = stackalloc int[4];

      // Rows
      ReadOnlySpan<int> srcRow = input;
      Span<int> dstRow = rowResults;
      for (int row = 0; row < 4; row++)
      {
          HighbdIdct4(srcRow, dstRow, bd);
          srcRow = srcRow.Slice(4);
          dstRow = dstRow.Slice(4);
      }

      // Columns
      for (int col = 0; col < 4; col++)
      {
          for (int row = 0; row < 4; row++)
          {
              columnIn[row] = rowResults[(row * 4) + col];
          }

          HighbdIdct4(columnIn, columnOut, bd);

          for (int row = 0; row < 4; row++)
          {
              int pos = (row * stride) + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 4), bd);
          }
      }
  }
  /// <summary>
  /// 4x4 high-bit-depth path that reads only <c>input[0]</c>: one value is derived from it and
  /// merged into every pixel of the 4x4 destination area through <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficient buffer; only element 0 is read.</param>
  /// <param name="dest">Destination pixels, updated in place. Must cover 3 * stride + 4 elements.</param>
  /// <param name="stride">Distance, in elements, between successive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
  public static void HighbdIdct4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      // Scale by CosPi16_64 and round twice, widening to 64 bits for each product.
      int dc = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      dc = HighbdWrapLow(DctConstRoundShift(dc * (long)CosPi16_64), bd);
      long delta = BitUtils.RoundPowerOfTwo(dc, 4);

      for (int row = 0; row < 4; row++)
      {
          // Slice each row to exactly the 4 pixels that are written. Advancing a running span
          // by `stride` after the last row throws when dest is shorter than 4 * stride, even
          // though nothing past the last row's pixels is ever accessed.
          Span<ushort> line = dest.Slice(row * stride, 4);
          for (int col = 0; col < 4; col++)
          {
              line[col] = HighbdClipPixelAdd(line[col], delta, bd);
          }
      }
  }
  /// <summary>
  /// 8-point 1-D transform on high-bit-depth coefficients (by its name, presumably the inverse
  /// ADST - confirm against callers). Three butterfly stages use the CosPi*_64 constants; every
  /// product is widened to long, rounded with DctConstRoundShift and wrapped with HighbdWrapLow.
  /// Writes eight zeros to <paramref name="output"/> when DetectInvalidHighbdInput rejects the
  /// input or when all eight inputs are zero.
  /// </summary>
  /// <param name="input">At least eight coefficients.</param>
  /// <param name="output">Receives eight results.</param>
  /// <param name="bd">Bit depth, forwarded to HighbdWrapLow.</param>
  1514. public static void HighbdIadst8(ReadOnlySpan<int> input, Span<int> output, int bd)
  1515. {
  1516. long s0, s1, s2, s3, s4, s5, s6, s7;
  // Inputs are loaded in permuted order: 7, 0, 5, 2, 3, 4, 1, 6.
  1517. int x0 = input[7];
  1518. int x1 = input[0];
  1519. int x2 = input[5];
  1520. int x3 = input[2];
  1521. int x4 = input[3];
  1522. int x5 = input[4];
  1523. int x6 = input[1];
  1524. int x7 = input[6];
  // Rejected input: assert in debug builds, then produce an all-zero result.
  1525. if (DetectInvalidHighbdInput(input, 8) != 0)
  1526. {
  1527. Debug.Assert(false, "invalid highbd txfm input");
  1528. output.Slice(0, 8).Fill(0);
  1529. return;
  1530. }
  // Shortcut: all-zero input gives all-zero output.
  1531. if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0)
  1532. {
  1533. output.Slice(0, 8).Fill(0);
  1534. return;
  1535. }
  1536. // stage 1
  // Four rotations of input pairs; the s* products are 64-bit.
  1537. s0 = (long)CosPi2_64 * x0 + (long)CosPi30_64 * x1;
  1538. s1 = (long)CosPi30_64 * x0 - (long)CosPi2_64 * x1;
  1539. s2 = (long)CosPi10_64 * x2 + (long)CosPi22_64 * x3;
  1540. s3 = (long)CosPi22_64 * x2 - (long)CosPi10_64 * x3;
  1541. s4 = (long)CosPi18_64 * x4 + (long)CosPi14_64 * x5;
  1542. s5 = (long)CosPi14_64 * x4 - (long)CosPi18_64 * x5;
  1543. s6 = (long)CosPi26_64 * x6 + (long)CosPi6_64 * x7;
  1544. s7 = (long)CosPi6_64 * x6 - (long)CosPi26_64 * x7;
  // Combine s[k] with s[k + 4]; the x* variables are reused for the stage output.
  1545. x0 = HighbdWrapLow(DctConstRoundShift(s0 + s4), bd);
  1546. x1 = HighbdWrapLow(DctConstRoundShift(s1 + s5), bd);
  1547. x2 = HighbdWrapLow(DctConstRoundShift(s2 + s6), bd);
  1548. x3 = HighbdWrapLow(DctConstRoundShift(s3 + s7), bd);
  1549. x4 = HighbdWrapLow(DctConstRoundShift(s0 - s4), bd);
  1550. x5 = HighbdWrapLow(DctConstRoundShift(s1 - s5), bd);
  1551. x6 = HighbdWrapLow(DctConstRoundShift(s2 - s6), bd);
  1552. x7 = HighbdWrapLow(DctConstRoundShift(s3 - s7), bd);
  1553. // stage 2
  // x0..x3 pass through unscaled (no round-shift below); x4..x7 are rotated.
  1554. s0 = x0;
  1555. s1 = x1;
  1556. s2 = x2;
  1557. s3 = x3;
  1558. s4 = (long)CosPi8_64 * x4 + (long)CosPi24_64 * x5;
  1559. s5 = (long)CosPi24_64 * x4 - (long)CosPi8_64 * x5;
  1560. s6 = (long)(-CosPi24_64) * x6 + (long)CosPi8_64 * x7;
  1561. s7 = (long)CosPi8_64 * x6 + (long)CosPi24_64 * x7;
  1562. x0 = HighbdWrapLow(s0 + s2, bd);
  1563. x1 = HighbdWrapLow(s1 + s3, bd);
  1564. x2 = HighbdWrapLow(s0 - s2, bd);
  1565. x3 = HighbdWrapLow(s1 - s3, bd);
  1566. x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
  1567. x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
  1568. x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
  1569. x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
  1570. // stage 3
  // Only the pairs (x2, x3) and (x6, x7) are scaled by CosPi16_64; x0, x1, x4, x5 are final.
  1571. s2 = (long)CosPi16_64 * (x2 + x3);
  1572. s3 = (long)CosPi16_64 * (x2 - x3);
  1573. s6 = (long)CosPi16_64 * (x6 + x7);
  1574. s7 = (long)CosPi16_64 * (x6 - x7);
  1575. x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
  1576. x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
  1577. x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
  1578. x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
  // Results are written in permuted order, with every odd-indexed output negated.
  1579. output[0] = HighbdWrapLow(x0, bd);
  1580. output[1] = HighbdWrapLow(-x4, bd);
  1581. output[2] = HighbdWrapLow(x6, bd);
  1582. output[3] = HighbdWrapLow(-x2, bd);
  1583. output[4] = HighbdWrapLow(x3, bd);
  1584. output[5] = HighbdWrapLow(-x7, bd);
  1585. output[6] = HighbdWrapLow(x5, bd);
  1586. output[7] = HighbdWrapLow(-x1, bd);
  1587. }
  /// <summary>
  /// 8-point 1-D transform on high-bit-depth coefficients. The even-indexed inputs go through
  /// <c>HighbdIdct4</c>; the odd-indexed inputs go through rotations by the <c>CosPi*_64</c>
  /// constants and a butterfly; a final stage combines both halves. Writes eight zeros to
  /// <paramref name="output"/> when <c>DetectInvalidHighbdInput</c> rejects the input.
  /// </summary>
  /// <param name="input">At least eight coefficients.</param>
  /// <param name="output">Receives eight results.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdIdct4</c>.</param>
  [SkipLocalsInit]
  public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
  {
      Span<int> stage = stackalloc int[8];

      if (DetectInvalidHighbdInput(input, 8) != 0)
      {
          Debug.Assert(false, "invalid highbd txfm input");
          output.Slice(0, 8).Clear();
          return;
      }

      // Stage 1: even-indexed inputs are placed in slots 0..3 for the 4-point transform.
      stage[0] = input[0];
      stage[2] = input[4];
      stage[1] = input[2];
      stage[3] = input[6];

      long rotA = input[1] * (long)CosPi28_64 - input[7] * (long)CosPi4_64;
      long rotB = input[1] * (long)CosPi4_64 + input[7] * (long)CosPi28_64;
      stage[4] = HighbdWrapLow(DctConstRoundShift(rotA), bd);
      stage[7] = HighbdWrapLow(DctConstRoundShift(rotB), bd);

      rotA = input[5] * (long)CosPi12_64 - input[3] * (long)CosPi20_64;
      rotB = input[5] * (long)CosPi20_64 + input[3] * (long)CosPi12_64;
      stage[5] = HighbdWrapLow(DctConstRoundShift(rotA), bd);
      stage[6] = HighbdWrapLow(DctConstRoundShift(rotB), bd);

      // Stages 2 and 3, even half: 4-point transform applied in place to slots 0..3.
      HighbdIdct4(stage, stage, bd);

      // Stage 2, odd half
      int odd4 = HighbdWrapLow(stage[4] + stage[5], bd);
      int odd5 = HighbdWrapLow(stage[4] - stage[5], bd);
      int odd6 = HighbdWrapLow(-stage[6] + stage[7], bd);
      int odd7 = HighbdWrapLow(stage[6] + stage[7], bd);

      // Stage 3, odd half: the outer pair passes through, the inner pair is scaled.
      rotA = (odd6 - odd5) * (long)CosPi16_64;
      rotB = (odd5 + odd6) * (long)CosPi16_64;
      stage[4] = odd4;
      stage[5] = HighbdWrapLow(DctConstRoundShift(rotA), bd);
      stage[6] = HighbdWrapLow(DctConstRoundShift(rotB), bd);
      stage[7] = odd7;

      // Stage 4: output[k] and output[7 - k] are the sum and difference of slots k and 7 - k.
      for (int k = 0; k < 4; k++)
      {
          output[k] = HighbdWrapLow(stage[k] + stage[7 - k], bd);
      }

      for (int k = 4; k < 8; k++)
      {
          output[k] = HighbdWrapLow(stage[7 - k] - stage[k], bd);
      }
  }
  /// <summary>
  /// Full 2-D 8x8 high-bit-depth inverse transform: <c>HighbdIdct8</c> over all eight rows, then
  /// over all eight columns, with each result rounded by 5 bits and merged into
  /// <paramref name="dest"/> through <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">64 coefficients in rows of 8.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Distance, in elements, between successive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the 1-D transform and the clipping helper.</param>
  [SkipLocalsInit]
  public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> rowResults = stackalloc int[8 * 8];
      Span<int> columnIn = stackalloc int[8];
      Span<int> columnOut = stackalloc int[8];

      // First transform rows
      ReadOnlySpan<int> srcRow = input;
      Span<int> dstRow = rowResults;
      for (int row = 0; row < 8; row++)
      {
          HighbdIdct8(srcRow, dstRow, bd);
          srcRow = srcRow.Slice(8);
          dstRow = dstRow.Slice(8);
      }

      // Then transform columns
      for (int col = 0; col < 8; col++)
      {
          for (int row = 0; row < 8; row++)
          {
              columnIn[row] = rowResults[(row * 8) + col];
          }

          HighbdIdct8(columnIn, columnOut, bd);

          for (int row = 0; row < 8; row++)
          {
              int pos = (row * stride) + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 5), bd);
          }
      }
  }
  /// <summary>
  /// 2-D 8x8 high-bit-depth inverse transform for blocks where only the first four rows hold
  /// non-zero coefficients. The row pass runs <c>HighbdIdct8</c> on those four rows, the column
  /// pass runs it on all eight columns, and each result is rounded by 5 bits and merged into
  /// <paramref name="dest"/> through <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients laid out in rows of 8.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Distance, in elements, between successive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the 1-D transform and the clipping helper.</param>
  [SkipLocalsInit]
  public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> rowResults = stackalloc int[8 * 8];
      Span<int> columnIn = stackalloc int[8];
      Span<int> columnOut = stackalloc int[8];

      // Rows 4..7 are never written by the row pass, so the buffer is cleared explicitly.
      rowResults.Clear();

      // First transform rows
      // Only first 4 row has non-zero coefs
      ReadOnlySpan<int> srcRow = input;
      Span<int> dstRow = rowResults;
      for (int row = 0; row < 4; row++)
      {
          HighbdIdct8(srcRow, dstRow, bd);
          srcRow = srcRow.Slice(8);
          dstRow = dstRow.Slice(8);
      }

      // Then transform columns
      for (int col = 0; col < 8; col++)
      {
          for (int row = 0; row < 8; row++)
          {
              columnIn[row] = rowResults[(row * 8) + col];
          }

          HighbdIdct8(columnIn, columnOut, bd);

          for (int row = 0; row < 8; row++)
          {
              int pos = (row * stride) + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 5), bd);
          }
      }
  }
  /// <summary>
  /// 8x8 high-bit-depth path that reads only <c>input[0]</c>: one value is derived from it and
  /// merged into every pixel of the 8x8 destination area through <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <remarks>
  /// The method name does not follow the PascalCase naming used by its siblings; it is left
  /// unchanged here so that existing callers keep compiling.
  /// </remarks>
  /// <param name="input">Coefficient buffer; only element 0 is read.</param>
  /// <param name="dest">Destination pixels, updated in place. Must cover 7 * stride + 8 elements.</param>
  /// <param name="stride">Distance, in elements, between successive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
  public static void vpx_Highbdidct8x8_1_add_c(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      // Scale by CosPi16_64 and round twice, widening to 64 bits for each product.
      int dc = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      dc = HighbdWrapLow(DctConstRoundShift(dc * (long)CosPi16_64), bd);
      long delta = BitUtils.RoundPowerOfTwo(dc, 5);

      for (int row = 0; row < 8; row++)
      {
          // Slice each row to exactly the 8 pixels that are written. Advancing a running span
          // by `stride` after the last row throws when dest is shorter than 8 * stride, even
          // though nothing past the last row's pixels is ever accessed.
          Span<ushort> line = dest.Slice(row * stride, 8);
          for (int col = 0; col < 8; col++)
          {
              line[col] = HighbdClipPixelAdd(line[col], delta, bd);
          }
      }
  }
  /// <summary>
  /// 16-point 1-D transform on high-bit-depth coefficients (by its name, presumably the inverse
  /// ADST - confirm against callers). Four butterfly stages use the CosPi*_64 constants; every
  /// product is widened to long, rounded with DctConstRoundShift and wrapped with HighbdWrapLow.
  /// Writes sixteen zeros to <paramref name="output"/> when DetectInvalidHighbdInput rejects the
  /// input or when all sixteen inputs are zero.
  /// </summary>
  /// <param name="input">At least sixteen coefficients.</param>
  /// <param name="output">Receives sixteen results.</param>
  /// <param name="bd">Bit depth, forwarded to HighbdWrapLow.</param>
  1713. public static void HighbdIadst16(ReadOnlySpan<int> input, Span<int> output, int bd)
  1714. {
  1715. long s0, s1, s2, s3, s4, s5, s6, s7, s8;
  1716. long s9, s10, s11, s12, s13, s14, s15;
  // Inputs are loaded in permuted order: 15, 0, 13, 2, 11, 4, 9, 6, 7, 8, 5, 10, 3, 12, 1, 14.
  1717. int x0 = input[15];
  1718. int x1 = input[0];
  1719. int x2 = input[13];
  1720. int x3 = input[2];
  1721. int x4 = input[11];
  1722. int x5 = input[4];
  1723. int x6 = input[9];
  1724. int x7 = input[6];
  1725. int x8 = input[7];
  1726. int x9 = input[8];
  1727. int x10 = input[5];
  1728. int x11 = input[10];
  1729. int x12 = input[3];
  1730. int x13 = input[12];
  1731. int x14 = input[1];
  1732. int x15 = input[14];
  // Rejected input: assert in debug builds, then produce an all-zero result.
  1733. if (DetectInvalidHighbdInput(input, 16) != 0)
  1734. {
  1735. Debug.Assert(false, "invalid highbd txfm input");
  1736. output.Slice(0, 16).Fill(0);
  1737. return;
  1738. }
  // Shortcut: all-zero input gives all-zero output.
  1739. if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0)
  1740. {
  1741. output.Slice(0, 16).Fill(0);
  1742. return;
  1743. }
  1744. // stage 1
  // Eight rotations of input pairs; the s* products are 64-bit.
  1745. s0 = x0 * (long)CosPi1_64 + x1 * (long)CosPi31_64;
  1746. s1 = x0 * (long)CosPi31_64 - x1 * (long)CosPi1_64;
  1747. s2 = x2 * (long)CosPi5_64 + x3 * (long)CosPi27_64;
  1748. s3 = x2 * (long)CosPi27_64 - x3 * (long)CosPi5_64;
  1749. s4 = x4 * (long)CosPi9_64 + x5 * (long)CosPi23_64;
  1750. s5 = x4 * (long)CosPi23_64 - x5 * (long)CosPi9_64;
  1751. s6 = x6 * (long)CosPi13_64 + x7 * (long)CosPi19_64;
  1752. s7 = x6 * (long)CosPi19_64 - x7 * (long)CosPi13_64;
  1753. s8 = x8 * (long)CosPi17_64 + x9 * (long)CosPi15_64;
  1754. s9 = x8 * (long)CosPi15_64 - x9 * (long)CosPi17_64;
  1755. s10 = x10 * (long)CosPi21_64 + x11 * (long)CosPi11_64;
  1756. s11 = x10 * (long)CosPi11_64 - x11 * (long)CosPi21_64;
  1757. s12 = x12 * (long)CosPi25_64 + x13 * (long)CosPi7_64;
  1758. s13 = x12 * (long)CosPi7_64 - x13 * (long)CosPi25_64;
  1759. s14 = x14 * (long)CosPi29_64 + x15 * (long)CosPi3_64;
  1760. s15 = x14 * (long)CosPi3_64 - x15 * (long)CosPi29_64;
  // Combine s[k] with s[k + 8]; the x* variables are reused for the stage output.
  1761. x0 = HighbdWrapLow(DctConstRoundShift(s0 + s8), bd);
  1762. x1 = HighbdWrapLow(DctConstRoundShift(s1 + s9), bd);
  1763. x2 = HighbdWrapLow(DctConstRoundShift(s2 + s10), bd);
  1764. x3 = HighbdWrapLow(DctConstRoundShift(s3 + s11), bd);
  1765. x4 = HighbdWrapLow(DctConstRoundShift(s4 + s12), bd);
  1766. x5 = HighbdWrapLow(DctConstRoundShift(s5 + s13), bd);
  1767. x6 = HighbdWrapLow(DctConstRoundShift(s6 + s14), bd);
  1768. x7 = HighbdWrapLow(DctConstRoundShift(s7 + s15), bd);
  1769. x8 = HighbdWrapLow(DctConstRoundShift(s0 - s8), bd);
  1770. x9 = HighbdWrapLow(DctConstRoundShift(s1 - s9), bd);
  1771. x10 = HighbdWrapLow(DctConstRoundShift(s2 - s10), bd);
  1772. x11 = HighbdWrapLow(DctConstRoundShift(s3 - s11), bd);
  1773. x12 = HighbdWrapLow(DctConstRoundShift(s4 - s12), bd);
  1774. x13 = HighbdWrapLow(DctConstRoundShift(s5 - s13), bd);
  1775. x14 = HighbdWrapLow(DctConstRoundShift(s6 - s14), bd);
  1776. x15 = HighbdWrapLow(DctConstRoundShift(s7 - s15), bd);
  1777. // stage 2
  // x0..x7 pass through unscaled (no round-shift below); x8..x15 are rotated.
  1778. s0 = x0;
  1779. s1 = x1;
  1780. s2 = x2;
  1781. s3 = x3;
  1782. s4 = x4;
  1783. s5 = x5;
  1784. s6 = x6;
  1785. s7 = x7;
  1786. s8 = x8 * (long)CosPi4_64 + x9 * (long)CosPi28_64;
  1787. s9 = x8 * (long)CosPi28_64 - x9 * (long)CosPi4_64;
  1788. s10 = x10 * (long)CosPi20_64 + x11 * (long)CosPi12_64;
  1789. s11 = x10 * (long)CosPi12_64 - x11 * (long)CosPi20_64;
  1790. s12 = -x12 * (long)CosPi28_64 + x13 * (long)CosPi4_64;
  1791. s13 = x12 * (long)CosPi4_64 + x13 * (long)CosPi28_64;
  1792. s14 = -x14 * (long)CosPi12_64 + x15 * (long)CosPi20_64;
  1793. s15 = x14 * (long)CosPi20_64 + x15 * (long)CosPi12_64;
  1794. x0 = HighbdWrapLow(s0 + s4, bd);
  1795. x1 = HighbdWrapLow(s1 + s5, bd);
  1796. x2 = HighbdWrapLow(s2 + s6, bd);
  1797. x3 = HighbdWrapLow(s3 + s7, bd);
  1798. x4 = HighbdWrapLow(s0 - s4, bd);
  1799. x5 = HighbdWrapLow(s1 - s5, bd);
  1800. x6 = HighbdWrapLow(s2 - s6, bd);
  1801. x7 = HighbdWrapLow(s3 - s7, bd);
  1802. x8 = HighbdWrapLow(DctConstRoundShift(s8 + s12), bd);
  1803. x9 = HighbdWrapLow(DctConstRoundShift(s9 + s13), bd);
  1804. x10 = HighbdWrapLow(DctConstRoundShift(s10 + s14), bd);
  1805. x11 = HighbdWrapLow(DctConstRoundShift(s11 + s15), bd);
  1806. x12 = HighbdWrapLow(DctConstRoundShift(s8 - s12), bd);
  1807. x13 = HighbdWrapLow(DctConstRoundShift(s9 - s13), bd);
  1808. x14 = HighbdWrapLow(DctConstRoundShift(s10 - s14), bd);
  1809. x15 = HighbdWrapLow(DctConstRoundShift(s11 - s15), bd);
  1810. // stage 3
  // Within each group of eight, the first four pass through and the last four are rotated.
  1811. s0 = x0;
  1812. s1 = x1;
  1813. s2 = x2;
  1814. s3 = x3;
  1815. s4 = x4 * (long)CosPi8_64 + x5 * (long)CosPi24_64;
  1816. s5 = x4 * (long)CosPi24_64 - x5 * (long)CosPi8_64;
  1817. s6 = -x6 * (long)CosPi24_64 + x7 * (long)CosPi8_64;
  1818. s7 = x6 * (long)CosPi8_64 + x7 * (long)CosPi24_64;
  1819. s8 = x8;
  1820. s9 = x9;
  1821. s10 = x10;
  1822. s11 = x11;
  1823. s12 = x12 * (long)CosPi8_64 + x13 * (long)CosPi24_64;
  1824. s13 = x12 * (long)CosPi24_64 - x13 * (long)CosPi8_64;
  1825. s14 = -x14 * (long)CosPi24_64 + x15 * (long)CosPi8_64;
  1826. s15 = x14 * (long)CosPi8_64 + x15 * (long)CosPi24_64;
  1827. x0 = HighbdWrapLow(s0 + s2, bd);
  1828. x1 = HighbdWrapLow(s1 + s3, bd);
  1829. x2 = HighbdWrapLow(s0 - s2, bd);
  1830. x3 = HighbdWrapLow(s1 - s3, bd);
  1831. x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
  1832. x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
  1833. x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
  1834. x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
  1835. x8 = HighbdWrapLow(s8 + s10, bd);
  1836. x9 = HighbdWrapLow(s9 + s11, bd);
  1837. x10 = HighbdWrapLow(s8 - s10, bd);
  1838. x11 = HighbdWrapLow(s9 - s11, bd);
  1839. x12 = HighbdWrapLow(DctConstRoundShift(s12 + s14), bd);
  1840. x13 = HighbdWrapLow(DctConstRoundShift(s13 + s15), bd);
  1841. x14 = HighbdWrapLow(DctConstRoundShift(s12 - s14), bd);
  1842. x15 = HighbdWrapLow(DctConstRoundShift(s13 - s15), bd);
  1843. // stage 4
  // Only the pairs (x2, x3), (x6, x7), (x10, x11) and (x14, x15) are scaled by +/-CosPi16_64;
  // note that the sign pattern differs from pair to pair.
  1844. s2 = (long)(-CosPi16_64) * (x2 + x3);
  1845. s3 = (long)CosPi16_64 * (x2 - x3);
  1846. s6 = (long)CosPi16_64 * (x6 + x7);
  1847. s7 = (long)CosPi16_64 * (-x6 + x7);
  1848. s10 = (long)CosPi16_64 * (x10 + x11);
  1849. s11 = (long)CosPi16_64 * (-x10 + x11);
  1850. s14 = (long)(-CosPi16_64) * (x14 + x15);
  1851. s15 = (long)CosPi16_64 * (x14 - x15);
  1852. x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
  1853. x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
  1854. x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
  1855. x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
  1856. x10 = HighbdWrapLow(DctConstRoundShift(s10), bd);
  1857. x11 = HighbdWrapLow(DctConstRoundShift(s11), bd);
  1858. x14 = HighbdWrapLow(DctConstRoundShift(s14), bd);
  1859. x15 = HighbdWrapLow(DctConstRoundShift(s15), bd);
  // Results are written in permuted order; outputs 1, 3, 13 and 15 are negated.
  1860. output[0] = HighbdWrapLow(x0, bd);
  1861. output[1] = HighbdWrapLow(-x8, bd);
  1862. output[2] = HighbdWrapLow(x12, bd);
  1863. output[3] = HighbdWrapLow(-x4, bd);
  1864. output[4] = HighbdWrapLow(x6, bd);
  1865. output[5] = HighbdWrapLow(x14, bd);
  1866. output[6] = HighbdWrapLow(x10, bd);
  1867. output[7] = HighbdWrapLow(x2, bd);
  1868. output[8] = HighbdWrapLow(x3, bd);
  1869. output[9] = HighbdWrapLow(x11, bd);
  1870. output[10] = HighbdWrapLow(x15, bd);
  1871. output[11] = HighbdWrapLow(x7, bd);
  1872. output[12] = HighbdWrapLow(x5, bd);
  1873. output[13] = HighbdWrapLow(-x13, bd);
  1874. output[14] = HighbdWrapLow(x9, bd);
  1875. output[15] = HighbdWrapLow(-x1, bd);
  1876. }
  /// <summary>
  /// 16-point 1-D transform on high-bit-depth coefficients, computed in seven stages that
  /// alternate between two 16-entry scratch buffers (step1 and step2). Rotations use the
  /// CosPi*_64 constants with products widened to long; results are rounded with
  /// DctConstRoundShift and wrapped with HighbdWrapLow. Writes sixteen zeros to
  /// <paramref name="output"/> when DetectInvalidHighbdInput rejects the input.
  /// </summary>
  /// <param name="input">At least sixteen coefficients.</param>
  /// <param name="output">Receives sixteen results.</param>
  /// <param name="bd">Bit depth, forwarded to HighbdWrapLow.</param>
  1877. [SkipLocalsInit]
  1878. public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
  1879. {
  1880. Span<int> step1 = stackalloc int[16];
  1881. Span<int> step2 = stackalloc int[16];
  1882. long temp1, temp2;
  // Rejected input: assert in debug builds, then produce an all-zero result.
  1883. if (DetectInvalidHighbdInput(input, 16) != 0)
  1884. {
  1885. Debug.Assert(false, "invalid highbd txfm input");
  1886. output.Slice(0, 16).Fill(0);
  1887. return;
  1888. }
  1889. // stage 1
  // Input reordering. The "n / 2" indices evaluate to
  // 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15.
  1890. step1[0] = input[0 / 2];
  1891. step1[1] = input[16 / 2];
  1892. step1[2] = input[8 / 2];
  1893. step1[3] = input[24 / 2];
  1894. step1[4] = input[4 / 2];
  1895. step1[5] = input[20 / 2];
  1896. step1[6] = input[12 / 2];
  1897. step1[7] = input[28 / 2];
  1898. step1[8] = input[2 / 2];
  1899. step1[9] = input[18 / 2];
  1900. step1[10] = input[10 / 2];
  1901. step1[11] = input[26 / 2];
  1902. step1[12] = input[6 / 2];
  1903. step1[13] = input[22 / 2];
  1904. step1[14] = input[14 / 2];
  1905. step1[15] = input[30 / 2];
  1906. // stage 2
  // Entries 0..7 are copied; entries 8..15 are rotated in the pairs (8,15), (9,14), (10,13), (11,12).
  1907. step2[0] = step1[0];
  1908. step2[1] = step1[1];
  1909. step2[2] = step1[2];
  1910. step2[3] = step1[3];
  1911. step2[4] = step1[4];
  1912. step2[5] = step1[5];
  1913. step2[6] = step1[6];
  1914. step2[7] = step1[7];
  1915. temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
  1916. temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
  1917. step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1918. step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1919. temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
  1920. temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
  1921. step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1922. step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1923. temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
  1924. temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
  1925. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1926. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1927. temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
  1928. temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
  1929. step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1930. step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1931. // stage 3
  // Entries 0..3 are copied; pairs (4,7) and (5,6) are rotated; entries 8..15 get add/subtract butterflies.
  1932. step1[0] = step2[0];
  1933. step1[1] = step2[1];
  1934. step1[2] = step2[2];
  1935. step1[3] = step2[3];
  1936. temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
  1937. temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
  1938. step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1939. step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1940. temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
  1941. temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
  1942. step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1943. step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1944. step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
  1945. step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
  1946. step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
  1947. step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
  1948. step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
  1949. step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
  1950. step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
  1951. step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
  1952. // stage 4
  // Pairs (0,1) and (2,3) are rotated; entries 4..7 get butterflies; pairs (9,14) and (10,13)
  // are rotated while 8, 11, 12 and 15 are copied.
  1953. temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
  1954. temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
  1955. step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1956. step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1957. temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
  1958. temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
  1959. step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1960. step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1961. step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
  1962. step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
  1963. step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
  1964. step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
  1965. step2[8] = step1[8];
  1966. step2[15] = step1[15];
  1967. temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
  1968. temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
  1969. step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1970. step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1971. temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
  1972. temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
  1973. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1974. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1975. step2[11] = step1[11];
  1976. step2[12] = step1[12];
  1977. // stage 5
  // Entries 0..3 and 8..15 get butterflies; the pair (5,6) is scaled by CosPi16_64; 4 and 7 are copied.
  1978. step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
  1979. step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
  1980. step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
  1981. step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
  1982. step1[4] = step2[4];
  1983. temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
  1984. temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
  1985. step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1986. step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1987. step1[7] = step2[7];
  1988. step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
  1989. step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
  1990. step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
  1991. step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
  1992. step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
  1993. step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
  1994. step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
  1995. step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
  1996. // stage 6
  // Entries 0..7 are mirrored sums/differences; pairs (10,13) and (11,12) are scaled by
  // CosPi16_64; 8, 9, 14 and 15 are copied.
  1997. step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
  1998. step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
  1999. step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
  2000. step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
  2001. step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
  2002. step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
  2003. step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
  2004. step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
  2005. step2[8] = step1[8];
  2006. step2[9] = step1[9];
  2007. temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
  2008. temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
  2009. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2010. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2011. temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
  2012. temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
  2013. step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2014. step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2015. step2[14] = step1[14];
  2016. step2[15] = step1[15];
  2017. // stage 7
  // output[k] = step2[k] + step2[15 - k] for k in 0..7, and
  // output[15 - k] = step2[k] - step2[15 - k] for the mirrored half.
  2018. output[0] = HighbdWrapLow(step2[0] + step2[15], bd);
  2019. output[1] = HighbdWrapLow(step2[1] + step2[14], bd);
  2020. output[2] = HighbdWrapLow(step2[2] + step2[13], bd);
  2021. output[3] = HighbdWrapLow(step2[3] + step2[12], bd);
  2022. output[4] = HighbdWrapLow(step2[4] + step2[11], bd);
  2023. output[5] = HighbdWrapLow(step2[5] + step2[10], bd);
  2024. output[6] = HighbdWrapLow(step2[6] + step2[9], bd);
  2025. output[7] = HighbdWrapLow(step2[7] + step2[8], bd);
  2026. output[8] = HighbdWrapLow(step2[7] - step2[8], bd);
  2027. output[9] = HighbdWrapLow(step2[6] - step2[9], bd);
  2028. output[10] = HighbdWrapLow(step2[5] - step2[10], bd);
  2029. output[11] = HighbdWrapLow(step2[4] - step2[11], bd);
  2030. output[12] = HighbdWrapLow(step2[3] - step2[12], bd);
  2031. output[13] = HighbdWrapLow(step2[2] - step2[13], bd);
  2032. output[14] = HighbdWrapLow(step2[1] - step2[14], bd);
  2033. output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
  2034. }
  /// <summary>
  /// Full 2-D 16x16 high-bit-depth inverse transform: <c>HighbdIdct16</c> over all sixteen rows,
  /// then over all sixteen columns, with each result rounded by 6 bits and merged into
  /// <paramref name="dest"/> through <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">256 coefficients in rows of 16.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Distance, in elements, between successive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the 1-D transform and the clipping helper.</param>
  [SkipLocalsInit]
  public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> rowResults = stackalloc int[16 * 16];
      Span<int> columnIn = stackalloc int[16];
      Span<int> columnOut = stackalloc int[16];

      // First transform rows
      ReadOnlySpan<int> srcRow = input;
      Span<int> dstRow = rowResults;
      for (int row = 0; row < 16; row++)
      {
          HighbdIdct16(srcRow, dstRow, bd);
          srcRow = srcRow.Slice(16);
          dstRow = dstRow.Slice(16);
      }

      // Then transform columns
      for (int col = 0; col < 16; col++)
      {
          for (int row = 0; row < 16; row++)
          {
              columnIn[row] = rowResults[(row * 16) + col];
          }

          HighbdIdct16(columnIn, columnOut, bd);

          for (int row = 0; row < 16; row++)
          {
              int pos = (row * stride) + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 6), bd);
          }
      }
  }
  /// <summary>
  /// 16x16 inverse transform for high bit depth samples, for blocks whose non-zero
  /// coefficients all lie in the upper-left 8x8 area. Only the first 8 rows are
  /// transformed; the remaining intermediate rows stay zero. All 16 columns are then
  /// transformed and the rounded result is added into <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 16 per row; only the first 8 rows are read.</param>
  /// <param name="dest">Destination samples, updated in place.</param>
  /// <param name="stride">Element distance between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the 1-D transform and to the pixel clipping helper.</param>
  [SkipLocalsInit]
  public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      Span<int> output = stackalloc int[16 * 16];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[16];
      Span<int> tempOut = stackalloc int[16];

      // The stack buffer is not zeroed automatically ([SkipLocalsInit]) and rows 8..15
      // are never written by the row pass, so clear it explicitly.
      output.Fill(0);

      // First transform rows. Since all non-zero dct coefficients are in
      // upper-left 8x8 area, we only need to calculate first 8 rows here.
      for (i = 0; i < 8; ++i)
      {
          HighbdIdct16(input, outptr, bd);
          input = input.Slice(16);
          outptr = outptr.Slice(16);
      }

      // Then transform columns
      for (i = 0; i < 16; ++i)
      {
          for (j = 0; j < 16; ++j)
          {
              tempIn[j] = output[j * 16 + i];
          }

          HighbdIdct16(tempIn, tempOut, bd);

          // Address each destination sample directly instead of advancing a span by
          // stride after every row: slicing once more after the last row throws when
          // dest ends right after the final row of the block.
          for (j = 0; j < 16; ++j)
          {
              int pos = j * stride + i;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
          }
      }
  }
  /// <summary>
  /// 16x16 inverse transform for high bit depth samples, for blocks whose non-zero
  /// coefficients all lie in the upper-left 4x4 area. Only the first 4 rows are
  /// transformed; every other intermediate row stays zero. All 16 columns are then
  /// transformed and the rounded result is added into <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 16 per row; only the first 4 rows are read.</param>
  /// <param name="dest">Destination samples, updated in place.</param>
  /// <param name="stride">Element distance between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the 1-D transform and to the pixel clipping helper.</param>
  [SkipLocalsInit]
  public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      const int BlockDim = 16;
      const int CodedRows = 4;

      Span<int> rowPass = stackalloc int[BlockDim * BlockDim];
      Span<int> column = stackalloc int[BlockDim];
      Span<int> columnOut = stackalloc int[BlockDim];

      // Rows past the coded area are never written below, so start from all zeros.
      rowPass.Fill(0);

      // Row pass over the coded rows only.
      ReadOnlySpan<int> srcRow = input;
      Span<int> dstRow = rowPass;
      for (int row = 0; row < CodedRows; ++row)
      {
          HighbdIdct16(srcRow, dstRow, bd);
          srcRow = srcRow.Slice(BlockDim);
          dstRow = dstRow.Slice(BlockDim);
      }

      // Column pass: gather one column, transform it, then accumulate it into dest.
      for (int col = 0; col < BlockDim; ++col)
      {
          for (int row = 0; row < BlockDim; ++row)
          {
              column[row] = rowPass[row * BlockDim + col];
          }

          HighbdIdct16(column, columnOut, bd);

          for (int row = 0; row < BlockDim; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 6), bd);
          }
      }
  }
  /// <summary>
  /// 16x16 inverse transform for high bit depth samples that uses only the first
  /// coefficient, <c>input[0]</c>. That coefficient is scaled twice by
  /// <c>CosPi16_64</c> (once per transform direction), rounded by 6 bits, and the
  /// resulting single value is added to every sample of the 16x16 destination block.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read.</param>
  /// <param name="dest">Destination samples, updated in place.</param>
  /// <param name="stride">Element distance between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the wrapping and pixel clipping helpers.</param>
  public static void HighbdIdct16x161Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      long a1;
      int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
      a1 = BitUtils.RoundPowerOfTwo(output, 6);

      // Index rows directly instead of re-slicing dest by stride after each row:
      // the extra slice after the last row throws when dest ends right after the
      // final row of the block.
      for (j = 0; j < 16; ++j)
      {
          int rowStart = j * stride;
          for (i = 0; i < 16; ++i)
          {
              dest[rowStart + i] = HighbdClipPixelAdd(dest[rowStart + i], a1, bd);
          }
      }
  }
  /// <summary>
  /// 32-point one-dimensional inverse transform for high bit depth data. Reads
  /// <c>input[0..31]</c> and writes <c>output[0..31]</c>.
  /// </summary>
  /// <remarks>
  /// The computation is a fixed sequence of stages that ping-pong between two
  /// 32-entry scratch buffers: every stage reads only the buffer written by the
  /// previous stage and assigns all 32 entries of the other one, so the scratch
  /// buffers never need to be cleared. Rotations multiply in 64-bit arithmetic
  /// (the <c>(long)</c> casts) before the result is passed through
  /// <c>DctConstRoundShift</c> and <c>HighbdWrapLow</c>.
  /// If <c>DetectInvalidHighbdInput(input, 32)</c> returns non-zero, a debug
  /// assertion fires, the first 32 entries of <paramref name="output"/> are set
  /// to zero and the method returns without transforming anything.
  /// </remarks>
  /// <param name="input">The 32 input coefficients.</param>
  /// <param name="output">Receives the 32 transformed values.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c>.</param>
  2144. [SkipLocalsInit]
  2145. public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd)
  2146. {
  2147. Span<int> step1 = stackalloc int[32];
  2148. Span<int> step2 = stackalloc int[32];
  2149. long temp1, temp2;
  // Reject the whole vector up front; the output is zeroed so callers still
  // receive defined values in builds where the assertion is compiled out.
  2150. if (DetectInvalidHighbdInput(input, 32) != 0)
  2151. {
  2152. Debug.Assert(false, "invalid highbd txfm input");
  2153. output.Slice(0, 32).Fill(0);
  2154. return;
  2155. }
  2156. // stage 1
  // Even-indexed inputs are copied into step1[0..15] in permuted order.
  2157. step1[0] = input[0];
  2158. step1[1] = input[16];
  2159. step1[2] = input[8];
  2160. step1[3] = input[24];
  2161. step1[4] = input[4];
  2162. step1[5] = input[20];
  2163. step1[6] = input[12];
  2164. step1[7] = input[28];
  2165. step1[8] = input[2];
  2166. step1[9] = input[18];
  2167. step1[10] = input[10];
  2168. step1[11] = input[26];
  2169. step1[12] = input[6];
  2170. step1[13] = input[22];
  2171. step1[14] = input[14];
  2172. step1[15] = input[30];
  // Odd-indexed inputs are combined in pairs; each pair fills the mirrored
  // slots step1[16 + k] and step1[31 - k].
  2173. temp1 = input[1] * (long)CosPi31_64 - input[31] * (long)CosPi1_64;
  2174. temp2 = input[1] * (long)CosPi1_64 + input[31] * (long)CosPi31_64;
  2175. step1[16] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2176. step1[31] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2177. temp1 = input[17] * (long)CosPi15_64 - input[15] * (long)CosPi17_64;
  2178. temp2 = input[17] * (long)CosPi17_64 + input[15] * (long)CosPi15_64;
  2179. step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2180. step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2181. temp1 = input[9] * (long)CosPi23_64 - input[23] * (long)CosPi9_64;
  2182. temp2 = input[9] * (long)CosPi9_64 + input[23] * (long)CosPi23_64;
  2183. step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2184. step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2185. temp1 = input[25] * (long)CosPi7_64 - input[7] * (long)CosPi25_64;
  2186. temp2 = input[25] * (long)CosPi25_64 + input[7] * (long)CosPi7_64;
  2187. step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2188. step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2189. temp1 = input[5] * (long)CosPi27_64 - input[27] * (long)CosPi5_64;
  2190. temp2 = input[5] * (long)CosPi5_64 + input[27] * (long)CosPi27_64;
  2191. step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2192. step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2193. temp1 = input[21] * (long)CosPi11_64 - input[11] * (long)CosPi21_64;
  2194. temp2 = input[21] * (long)CosPi21_64 + input[11] * (long)CosPi11_64;
  2195. step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2196. step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2197. temp1 = input[13] * (long)CosPi19_64 - input[19] * (long)CosPi13_64;
  2198. temp2 = input[13] * (long)CosPi13_64 + input[19] * (long)CosPi19_64;
  2199. step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2200. step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2201. temp1 = input[29] * (long)CosPi3_64 - input[3] * (long)CosPi29_64;
  2202. temp2 = input[29] * (long)CosPi29_64 + input[3] * (long)CosPi3_64;
  2203. step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2204. step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2205. // stage 2
  // Entries 0..7 pass through, 8..15 are rotated in mirrored pairs, and
  // 16..31 are combined by add/subtract of adjacent entries.
  2206. step2[0] = step1[0];
  2207. step2[1] = step1[1];
  2208. step2[2] = step1[2];
  2209. step2[3] = step1[3];
  2210. step2[4] = step1[4];
  2211. step2[5] = step1[5];
  2212. step2[6] = step1[6];
  2213. step2[7] = step1[7];
  2214. temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
  2215. temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
  2216. step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2217. step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2218. temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
  2219. temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
  2220. step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2221. step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2222. temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
  2223. temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
  2224. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2225. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2226. temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
  2227. temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
  2228. step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2229. step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2230. step2[16] = HighbdWrapLow(step1[16] + step1[17], bd);
  2231. step2[17] = HighbdWrapLow(step1[16] - step1[17], bd);
  2232. step2[18] = HighbdWrapLow(-step1[18] + step1[19], bd);
  2233. step2[19] = HighbdWrapLow(step1[18] + step1[19], bd);
  2234. step2[20] = HighbdWrapLow(step1[20] + step1[21], bd);
  2235. step2[21] = HighbdWrapLow(step1[20] - step1[21], bd);
  2236. step2[22] = HighbdWrapLow(-step1[22] + step1[23], bd);
  2237. step2[23] = HighbdWrapLow(step1[22] + step1[23], bd);
  2238. step2[24] = HighbdWrapLow(step1[24] + step1[25], bd);
  2239. step2[25] = HighbdWrapLow(step1[24] - step1[25], bd);
  2240. step2[26] = HighbdWrapLow(-step1[26] + step1[27], bd);
  2241. step2[27] = HighbdWrapLow(step1[26] + step1[27], bd);
  2242. step2[28] = HighbdWrapLow(step1[28] + step1[29], bd);
  2243. step2[29] = HighbdWrapLow(step1[28] - step1[29], bd);
  2244. step2[30] = HighbdWrapLow(-step1[30] + step1[31], bd);
  2245. step2[31] = HighbdWrapLow(step1[30] + step1[31], bd);
  2246. // stage 3
  // Entries 0..3 pass through, 4..7 are rotated, 8..15 are add/subtract
  // pairs, and in 16..31 four mirrored pairs are rotated while the rest
  // are copied unchanged.
  2247. step1[0] = step2[0];
  2248. step1[1] = step2[1];
  2249. step1[2] = step2[2];
  2250. step1[3] = step2[3];
  2251. temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
  2252. temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
  2253. step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2254. step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2255. temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
  2256. temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
  2257. step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2258. step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2259. step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
  2260. step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
  2261. step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
  2262. step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
  2263. step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
  2264. step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
  2265. step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
  2266. step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
  2267. step1[16] = step2[16];
  2268. step1[31] = step2[31];
  2269. temp1 = -step2[17] * (long)CosPi4_64 + step2[30] * (long)CosPi28_64;
  2270. temp2 = step2[17] * (long)CosPi28_64 + step2[30] * (long)CosPi4_64;
  2271. step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2272. step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2273. temp1 = -step2[18] * (long)CosPi28_64 - step2[29] * (long)CosPi4_64;
  2274. temp2 = -step2[18] * (long)CosPi4_64 + step2[29] * (long)CosPi28_64;
  2275. step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2276. step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2277. step1[19] = step2[19];
  2278. step1[20] = step2[20];
  2279. temp1 = -step2[21] * (long)CosPi20_64 + step2[26] * (long)CosPi12_64;
  2280. temp2 = step2[21] * (long)CosPi12_64 + step2[26] * (long)CosPi20_64;
  2281. step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2282. step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2283. temp1 = -step2[22] * (long)CosPi12_64 - step2[25] * (long)CosPi20_64;
  2284. temp2 = -step2[22] * (long)CosPi20_64 + step2[25] * (long)CosPi12_64;
  2285. step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2286. step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2287. step1[23] = step2[23];
  2288. step1[24] = step2[24];
  2289. step1[27] = step2[27];
  2290. step1[28] = step2[28];
  2291. // stage 4
  // Entries 0..3 are rotated, 4..7 are add/subtract pairs, 9/14 and 10/13
  // are rotated, and 16..31 are add/subtract butterflies within groups of
  // four.
  2292. temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
  2293. temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
  2294. step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2295. step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2296. temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
  2297. temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
  2298. step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2299. step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2300. step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
  2301. step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
  2302. step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
  2303. step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
  2304. step2[8] = step1[8];
  2305. step2[15] = step1[15];
  2306. temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
  2307. temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
  2308. step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2309. step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2310. temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
  2311. temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
  2312. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2313. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2314. step2[11] = step1[11];
  2315. step2[12] = step1[12];
  2316. step2[16] = HighbdWrapLow(step1[16] + step1[19], bd);
  2317. step2[17] = HighbdWrapLow(step1[17] + step1[18], bd);
  2318. step2[18] = HighbdWrapLow(step1[17] - step1[18], bd);
  2319. step2[19] = HighbdWrapLow(step1[16] - step1[19], bd);
  2320. step2[20] = HighbdWrapLow(-step1[20] + step1[23], bd);
  2321. step2[21] = HighbdWrapLow(-step1[21] + step1[22], bd);
  2322. step2[22] = HighbdWrapLow(step1[21] + step1[22], bd);
  2323. step2[23] = HighbdWrapLow(step1[20] + step1[23], bd);
  2324. step2[24] = HighbdWrapLow(step1[24] + step1[27], bd);
  2325. step2[25] = HighbdWrapLow(step1[25] + step1[26], bd);
  2326. step2[26] = HighbdWrapLow(step1[25] - step1[26], bd);
  2327. step2[27] = HighbdWrapLow(step1[24] - step1[27], bd);
  2328. step2[28] = HighbdWrapLow(-step1[28] + step1[31], bd);
  2329. step2[29] = HighbdWrapLow(-step1[29] + step1[30], bd);
  2330. step2[30] = HighbdWrapLow(step1[29] + step1[30], bd);
  2331. step2[31] = HighbdWrapLow(step1[28] + step1[31], bd);
  2332. // stage 5
  // Entries 0..3 and 8..15 are add/subtract butterflies, 5/6 are rotated by
  // CosPi16_64, and in 16..31 the pairs 18/29, 19/28, 20/27 and 21/26 are
  // rotated while the rest are copied unchanged.
  2333. step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
  2334. step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
  2335. step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
  2336. step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
  2337. step1[4] = step2[4];
  2338. temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
  2339. temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
  2340. step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2341. step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2342. step1[7] = step2[7];
  2343. step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
  2344. step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
  2345. step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
  2346. step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
  2347. step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
  2348. step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
  2349. step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
  2350. step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
  2351. step1[16] = step2[16];
  2352. step1[17] = step2[17];
  2353. temp1 = -step2[18] * (long)CosPi8_64 + step2[29] * (long)CosPi24_64;
  2354. temp2 = step2[18] * (long)CosPi24_64 + step2[29] * (long)CosPi8_64;
  2355. step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2356. step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2357. temp1 = -step2[19] * (long)CosPi8_64 + step2[28] * (long)CosPi24_64;
  2358. temp2 = step2[19] * (long)CosPi24_64 + step2[28] * (long)CosPi8_64;
  2359. step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2360. step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2361. temp1 = -step2[20] * (long)CosPi24_64 - step2[27] * (long)CosPi8_64;
  2362. temp2 = -step2[20] * (long)CosPi8_64 + step2[27] * (long)CosPi24_64;
  2363. step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2364. step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2365. temp1 = -step2[21] * (long)CosPi24_64 - step2[26] * (long)CosPi8_64;
  2366. temp2 = -step2[21] * (long)CosPi8_64 + step2[26] * (long)CosPi24_64;
  2367. step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2368. step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2369. step1[22] = step2[22];
  2370. step1[23] = step2[23];
  2371. step1[24] = step2[24];
  2372. step1[25] = step2[25];
  2373. step1[30] = step2[30];
  2374. step1[31] = step2[31];
  2375. // stage 6
  // Entries 0..7 and 16..31 are add/subtract butterflies; 10/13 and 11/12
  // are rotated by CosPi16_64; 8, 9, 14 and 15 are copied unchanged.
  2376. step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
  2377. step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
  2378. step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
  2379. step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
  2380. step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
  2381. step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
  2382. step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
  2383. step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
  2384. step2[8] = step1[8];
  2385. step2[9] = step1[9];
  2386. temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
  2387. temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
  2388. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2389. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2390. temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
  2391. temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
  2392. step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2393. step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2394. step2[14] = step1[14];
  2395. step2[15] = step1[15];
  2396. step2[16] = HighbdWrapLow(step1[16] + step1[23], bd);
  2397. step2[17] = HighbdWrapLow(step1[17] + step1[22], bd);
  2398. step2[18] = HighbdWrapLow(step1[18] + step1[21], bd);
  2399. step2[19] = HighbdWrapLow(step1[19] + step1[20], bd);
  2400. step2[20] = HighbdWrapLow(step1[19] - step1[20], bd);
  2401. step2[21] = HighbdWrapLow(step1[18] - step1[21], bd);
  2402. step2[22] = HighbdWrapLow(step1[17] - step1[22], bd);
  2403. step2[23] = HighbdWrapLow(step1[16] - step1[23], bd);
  2404. step2[24] = HighbdWrapLow(-step1[24] + step1[31], bd);
  2405. step2[25] = HighbdWrapLow(-step1[25] + step1[30], bd);
  2406. step2[26] = HighbdWrapLow(-step1[26] + step1[29], bd);
  2407. step2[27] = HighbdWrapLow(-step1[27] + step1[28], bd);
  2408. step2[28] = HighbdWrapLow(step1[27] + step1[28], bd);
  2409. step2[29] = HighbdWrapLow(step1[26] + step1[29], bd);
  2410. step2[30] = HighbdWrapLow(step1[25] + step1[30], bd);
  2411. step2[31] = HighbdWrapLow(step1[24] + step1[31], bd);
  2412. // stage 7
  // Entries 0..15 are a mirrored add/subtract butterfly (k with 15 - k);
  // in 16..31 the pairs 20/27, 21/26, 22/25 and 23/24 are rotated by
  // CosPi16_64 and the rest are copied unchanged.
  2413. step1[0] = HighbdWrapLow(step2[0] + step2[15], bd);
  2414. step1[1] = HighbdWrapLow(step2[1] + step2[14], bd);
  2415. step1[2] = HighbdWrapLow(step2[2] + step2[13], bd);
  2416. step1[3] = HighbdWrapLow(step2[3] + step2[12], bd);
  2417. step1[4] = HighbdWrapLow(step2[4] + step2[11], bd);
  2418. step1[5] = HighbdWrapLow(step2[5] + step2[10], bd);
  2419. step1[6] = HighbdWrapLow(step2[6] + step2[9], bd);
  2420. step1[7] = HighbdWrapLow(step2[7] + step2[8], bd);
  2421. step1[8] = HighbdWrapLow(step2[7] - step2[8], bd);
  2422. step1[9] = HighbdWrapLow(step2[6] - step2[9], bd);
  2423. step1[10] = HighbdWrapLow(step2[5] - step2[10], bd);
  2424. step1[11] = HighbdWrapLow(step2[4] - step2[11], bd);
  2425. step1[12] = HighbdWrapLow(step2[3] - step2[12], bd);
  2426. step1[13] = HighbdWrapLow(step2[2] - step2[13], bd);
  2427. step1[14] = HighbdWrapLow(step2[1] - step2[14], bd);
  2428. step1[15] = HighbdWrapLow(step2[0] - step2[15], bd);
  2429. step1[16] = step2[16];
  2430. step1[17] = step2[17];
  2431. step1[18] = step2[18];
  2432. step1[19] = step2[19];
  2433. temp1 = (-step2[20] + step2[27]) * (long)CosPi16_64;
  2434. temp2 = (step2[20] + step2[27]) * (long)CosPi16_64;
  2435. step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2436. step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2437. temp1 = (-step2[21] + step2[26]) * (long)CosPi16_64;
  2438. temp2 = (step2[21] + step2[26]) * (long)CosPi16_64;
  2439. step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2440. step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2441. temp1 = (-step2[22] + step2[25]) * (long)CosPi16_64;
  2442. temp2 = (step2[22] + step2[25]) * (long)CosPi16_64;
  2443. step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2444. step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2445. temp1 = (-step2[23] + step2[24]) * (long)CosPi16_64;
  2446. temp2 = (step2[23] + step2[24]) * (long)CosPi16_64;
  2447. step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2448. step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2449. step1[28] = step2[28];
  2450. step1[29] = step2[29];
  2451. step1[30] = step2[30];
  2452. step1[31] = step2[31];
  2453. // final stage
  // Mirrored butterfly across the whole vector: for k in 0..15,
  // output[k] = step1[k] + step1[31 - k] and
  // output[31 - k] = step1[k] - step1[31 - k].
  2454. output[0] = HighbdWrapLow(step1[0] + step1[31], bd);
  2455. output[1] = HighbdWrapLow(step1[1] + step1[30], bd);
  2456. output[2] = HighbdWrapLow(step1[2] + step1[29], bd);
  2457. output[3] = HighbdWrapLow(step1[3] + step1[28], bd);
  2458. output[4] = HighbdWrapLow(step1[4] + step1[27], bd);
  2459. output[5] = HighbdWrapLow(step1[5] + step1[26], bd);
  2460. output[6] = HighbdWrapLow(step1[6] + step1[25], bd);
  2461. output[7] = HighbdWrapLow(step1[7] + step1[24], bd);
  2462. output[8] = HighbdWrapLow(step1[8] + step1[23], bd);
  2463. output[9] = HighbdWrapLow(step1[9] + step1[22], bd);
  2464. output[10] = HighbdWrapLow(step1[10] + step1[21], bd);
  2465. output[11] = HighbdWrapLow(step1[11] + step1[20], bd);
  2466. output[12] = HighbdWrapLow(step1[12] + step1[19], bd);
  2467. output[13] = HighbdWrapLow(step1[13] + step1[18], bd);
  2468. output[14] = HighbdWrapLow(step1[14] + step1[17], bd);
  2469. output[15] = HighbdWrapLow(step1[15] + step1[16], bd);
  2470. output[16] = HighbdWrapLow(step1[15] - step1[16], bd);
  2471. output[17] = HighbdWrapLow(step1[14] - step1[17], bd);
  2472. output[18] = HighbdWrapLow(step1[13] - step1[18], bd);
  2473. output[19] = HighbdWrapLow(step1[12] - step1[19], bd);
  2474. output[20] = HighbdWrapLow(step1[11] - step1[20], bd);
  2475. output[21] = HighbdWrapLow(step1[10] - step1[21], bd);
  2476. output[22] = HighbdWrapLow(step1[9] - step1[22], bd);
  2477. output[23] = HighbdWrapLow(step1[8] - step1[23], bd);
  2478. output[24] = HighbdWrapLow(step1[7] - step1[24], bd);
  2479. output[25] = HighbdWrapLow(step1[6] - step1[25], bd);
  2480. output[26] = HighbdWrapLow(step1[5] - step1[26], bd);
  2481. output[27] = HighbdWrapLow(step1[4] - step1[27], bd);
  2482. output[28] = HighbdWrapLow(step1[3] - step1[28], bd);
  2483. output[29] = HighbdWrapLow(step1[2] - step1[29], bd);
  2484. output[30] = HighbdWrapLow(step1[1] - step1[30], bd);
  2485. output[31] = HighbdWrapLow(step1[0] - step1[31], bd);
  2486. }
  /// <summary>
  /// Full 32x32 inverse transform for high bit depth samples: all 32 rows of
  /// <paramref name="input"/> are processed (rows containing only zeros are
  /// short-circuited to a zero result), then all 32 columns are transformed and the
  /// rounded result is added into <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 32 per row.</param>
  /// <param name="dest">Destination samples, updated in place.</param>
  /// <param name="stride">Element distance between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the 1-D transform and to the pixel clipping helper.</param>
  [SkipLocalsInit]
  public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      const int BlockDim = 32;

      Span<int> rowPass = stackalloc int[BlockDim * BlockDim];
      Span<int> column = stackalloc int[BlockDim];
      Span<int> columnOut = stackalloc int[BlockDim];

      // Row pass.
      ReadOnlySpan<int> srcRow = input;
      Span<int> dstRow = rowPass;
      for (int row = 0; row < BlockDim; ++row)
      {
          // OR all coefficients of the row together; the result is zero only if
          // every coefficient is zero.
          int anyBits = 0;
          for (int k = 0; k < BlockDim; ++k)
          {
              anyBits |= srcRow[k];
          }

          if (anyBits == 0)
          {
              // Nothing to transform: emit a zero row directly.
              dstRow.Slice(0, BlockDim).Fill(0);
          }
          else
          {
              HighbdIdct32(srcRow, dstRow, bd);
          }

          srcRow = srcRow.Slice(BlockDim);
          dstRow = dstRow.Slice(BlockDim);
      }

      // Column pass: gather one column, transform it, then accumulate it into dest.
      for (int col = 0; col < BlockDim; ++col)
      {
          for (int row = 0; row < BlockDim; ++row)
          {
              column[row] = rowPass[row * BlockDim + col];
          }

          HighbdIdct32(column, columnOut, bd);

          for (int row = 0; row < BlockDim; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 6), bd);
          }
      }
  }
  /// <summary>
  /// 32x32 inverse transform for high bit depth samples, for blocks whose non-zero
  /// coefficients all lie in the upper-left 16x16 area. Only the first 16 rows are
  /// transformed; the remaining intermediate rows stay zero. All 32 columns are then
  /// transformed and the rounded result is added into <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 32 per row; only the first 16 rows are read.</param>
  /// <param name="dest">Destination samples, updated in place.</param>
  /// <param name="stride">Element distance between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the 1-D transform and to the pixel clipping helper.</param>
  [SkipLocalsInit]
  public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      Span<int> output = stackalloc int[32 * 32];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[32];
      Span<int> tempOut = stackalloc int[32];

      // The stack buffer is not zeroed automatically ([SkipLocalsInit]) and rows 16..31
      // are never written by the row pass, so clear it explicitly.
      output.Fill(0);

      // Rows
      // Only upper-left 16x16 has non-zero coeff
      for (i = 0; i < 16; ++i)
      {
          HighbdIdct32(input, outptr, bd);
          input = input.Slice(32);
          outptr = outptr.Slice(32);
      }

      // Columns
      for (i = 0; i < 32; ++i)
      {
          for (j = 0; j < 32; ++j)
          {
              tempIn[j] = output[j * 32 + i];
          }

          HighbdIdct32(tempIn, tempOut, bd);

          // Address each destination sample directly instead of advancing a span by
          // stride after every row: slicing once more after the last row throws when
          // dest ends right after the final row of the block.
          for (j = 0; j < 32; ++j)
          {
              int pos = j * stride + i;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
          }
      }
  }
  /// <summary>
  /// 32x32 inverse transform for high bit depth samples, for blocks whose non-zero
  /// coefficients all lie in the upper-left 8x8 area. Only the first 8 rows are
  /// transformed; every other intermediate row stays zero. All 32 columns are then
  /// transformed and the rounded result is added into <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 32 per row; only the first 8 rows are read.</param>
  /// <param name="dest">Destination samples, updated in place.</param>
  /// <param name="stride">Element distance between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the 1-D transform and to the pixel clipping helper.</param>
  [SkipLocalsInit]
  public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      const int BlockDim = 32;
      const int CodedRows = 8;

      Span<int> rowPass = stackalloc int[BlockDim * BlockDim];
      Span<int> column = stackalloc int[BlockDim];
      Span<int> columnOut = stackalloc int[BlockDim];

      // Rows past the coded area are never written below, so start from all zeros.
      rowPass.Fill(0);

      // Row pass over the coded rows only.
      ReadOnlySpan<int> srcRow = input;
      Span<int> dstRow = rowPass;
      for (int row = 0; row < CodedRows; ++row)
      {
          HighbdIdct32(srcRow, dstRow, bd);
          srcRow = srcRow.Slice(BlockDim);
          dstRow = dstRow.Slice(BlockDim);
      }

      // Column pass: gather one column, transform it, then accumulate it into dest.
      for (int col = 0; col < BlockDim; ++col)
      {
          for (int row = 0; row < BlockDim; ++row)
          {
              column[row] = rowPass[row * BlockDim + col];
          }

          HighbdIdct32(column, columnOut, bd);

          for (int row = 0; row < BlockDim; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 6), bd);
          }
      }
  }
  /// <summary>
  /// 32x32 inverse transform for high bit depth samples that uses only the first
  /// coefficient, <c>input[0]</c>. That coefficient is scaled twice by
  /// <c>CosPi16_64</c> (once per transform direction), rounded by 6 bits, and the
  /// resulting single value is added to every sample of the 32x32 destination block.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read.</param>
  /// <param name="dest">Destination samples, updated in place.</param>
  /// <param name="stride">Element distance between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the wrapping and pixel clipping helpers.</param>
  public static void HighbdIdct32x321Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      int a1;
      int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
      a1 = BitUtils.RoundPowerOfTwo(output, 6);

      // Index rows directly instead of re-slicing dest by stride after each row:
      // the extra slice after the last row throws when dest ends right after the
      // final row of the block.
      for (j = 0; j < 32; ++j)
      {
          int rowStart = j * stride;
          for (i = 0; i < 32; ++i)
          {
              dest[rowStart + i] = HighbdClipPixelAdd(dest[rowStart + i], a1, bd);
          }
      }
  }
  2608. }
  2609. }