// InvTxfm.cs
  1. using System;
  2. using System.Diagnostics;
  3. using System.Runtime.CompilerServices;
  4. using Ryujinx.Graphics.Nvdec.Vp9.Common;
  5. using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.TxfmCommon;
  6. namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
  7. {
  8. internal static class InvTxfm
  9. {
  // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
  // transform amplify bits + 1 bit for contingency in rounding and quantizing
  private const int HighbdValidTxfmMagnitudeRange = (1 << 25);

  /// <summary>
  /// Checks whether any of the first <paramref name="size"/> coefficients has a magnitude
  /// of <see cref="HighbdValidTxfmMagnitudeRange"/> or more.
  /// </summary>
  /// <param name="input">Coefficients to scan.</param>
  /// <param name="size">Number of leading elements of <paramref name="input"/> to examine.</param>
  /// <returns>1 if an out-of-range coefficient was found, otherwise 0.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static int DetectInvalidHighbdInput(ReadOnlySpan<int> input, int size)
  {
      for (int i = 0; i < size; ++i)
      {
          int coeff = input[i];

          // Compare against both bounds directly rather than through Math.Abs:
          // Math.Abs(int.MinValue) throws OverflowException, which would turn the
          // most extreme invalid coefficient into a crash instead of a detection.
          if (coeff >= HighbdValidTxfmMagnitudeRange || coeff <= -HighbdValidTxfmMagnitudeRange)
          {
              return 1;
          }
      }

      return 0;
  }
  /// <summary>
  /// Debug-only sanity check that an intermediate coefficient fits in a signed 16-bit integer.
  /// </summary>
  /// <param name="input">Intermediate stage coefficient.</param>
  /// <returns><paramref name="input"/>, unchanged.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static long CheckRange(long input)
  {
      // Valid VP9 streams keep every intermediate coefficient inside the signed
      // 16-bit range; only invalid/corrupt streams can leave it.
      Debug.Assert(input >= short.MinValue && input <= short.MaxValue);

      return input;
  }
  /// <summary>
  /// Debug-only sanity check that an intermediate coefficient fits the signed range
  /// allowed for the given bit depth.
  /// </summary>
  /// <param name="input">Intermediate stage coefficient.</param>
  /// <param name="bd">Bit depth; the permitted range is a signed (8 + bd)-bit integer.</param>
  /// <returns><paramref name="input"/>, unchanged.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static long HighbdCheckRange(long input, int bd)
  {
      // Expected ranges for valid high bit depth streams:
      //   8 bit  -> signed 16 bit integer
      //   10 bit -> signed 18 bit integer
      //   12 bit -> signed 20 bit integer
      int upperBound = (1 << (7 + bd)) - 1;
      int lowerBound = -upperBound - 1;

      Debug.Assert(input >= lowerBound && input <= upperBound);

      return input;
  }
  /// <summary>
  /// Range-checks a coefficient (debug builds) and truncates it to its low 16 bits, sign-extended.
  /// </summary>
  /// <param name="x">Intermediate coefficient.</param>
  /// <returns>The value of <paramref name="x"/> after a cast to <see cref="short"/>.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static int WrapLow(long x)
  {
      long checkedValue = CheckRange(x);

      return (short)checkedValue;
  }
  /// <summary>
  /// Range-checks a coefficient (debug builds) and sign-extends it from its low (8 + bd) bits.
  /// </summary>
  /// <param name="x">Intermediate coefficient.</param>
  /// <param name="bd">Bit depth.</param>
  /// <returns>The wrapped coefficient.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static int HighbdWrapLow(long x, int bd)
  {
      // Shifting left then arithmetically right by (24 - bd) keeps the low
      // 32 - (24 - bd) = 8 + bd bits and replicates the new sign bit.
      int shift = 24 - bd;
      int narrowed = (int)HighbdCheckRange(x, bd);

      return (narrowed << shift) >> shift;
  }
  /// <summary>
  /// Adds a wrapped residual to a pixel and clips the sum with <c>BitUtils.ClipPixel</c>.
  /// </summary>
  /// <param name="dest">Current pixel value.</param>
  /// <param name="trans">Residual produced by the inverse transform.</param>
  /// <returns>The clipped reconstructed pixel.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static byte ClipPixelAdd(byte dest, long trans)
  {
      int residual = WrapLow(trans);

      return BitUtils.ClipPixel(dest + residual);
  }
  /// <summary>
  /// Adds a wrapped residual to a high bit depth pixel and clips the sum with
  /// <c>BitUtils.ClipPixelHighbd</c>.
  /// </summary>
  /// <param name="dest">Current pixel value.</param>
  /// <param name="trans">Residual produced by the inverse transform.</param>
  /// <param name="bd">Bit depth.</param>
  /// <returns>The clipped reconstructed pixel.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static ushort HighbdClipPixelAdd(ushort dest, long trans, int bd)
  {
      int residual = HighbdWrapLow(trans, bd);

      return BitUtils.ClipPixelHighbd(dest + residual, bd);
  }
  /// <summary>
  /// Rounds <paramref name="input"/> by <c>DctConstBits</c> bits via <c>BitUtils.RoundPowerOfTwo</c>.
  /// </summary>
  /// <param name="input">Value to round and shift.</param>
  /// <returns>The rounded, shifted value.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static long DctConstRoundShift(long input)
  {
      return BitUtils.RoundPowerOfTwo(input, DctConstBits);
  }
  /// <summary>
  /// 4x4 inverse Walsh-Hadamard transform of 16 coefficients, added onto <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">16 coefficients in row-major order.</param>
  /// <param name="dest">Destination pixels; the 4x4 block starts at index 0.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      // 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
      // 0.5 shifts per pixel.
      Span<int> rowPass = stackalloc int[16];

      // First pass: transform each row of the (unit-quant scaled) input.
      for (int row = 0; row < 4; row++)
      {
          int o = row * 4;

          long a = input[o] >> UnitQuantShift;
          long c = input[o + 1] >> UnitQuantShift;
          long d = input[o + 2] >> UnitQuantShift;
          long b = input[o + 3] >> UnitQuantShift;

          a += c;
          d -= b;
          long e = (a - d) >> 1;
          b = e - b;
          c = e - c;
          a -= b;
          d += c;

          rowPass[o] = WrapLow(a);
          rowPass[o + 1] = WrapLow(b);
          rowPass[o + 2] = WrapLow(c);
          rowPass[o + 3] = WrapLow(d);
      }

      // Second pass: transform each column and accumulate into the destination.
      for (int col = 0; col < 4; col++)
      {
          long a = rowPass[col];
          long c = rowPass[4 + col];
          long d = rowPass[8 + col];
          long b = rowPass[12 + col];

          a += c;
          d -= b;
          long e = (a - d) >> 1;
          b = e - b;
          c = e - c;
          a -= b;
          d += c;

          dest[col] = ClipPixelAdd(dest[col], WrapLow(a));
          dest[stride + col] = ClipPixelAdd(dest[stride + col], WrapLow(b));
          dest[stride * 2 + col] = ClipPixelAdd(dest[stride * 2 + col], WrapLow(c));
          dest[stride * 3 + col] = ClipPixelAdd(dest[stride * 3 + col], WrapLow(d));
      }
  }
  /// <summary>
  /// 4x4 inverse Walsh-Hadamard transform that reads only <c>input[0]</c>, added onto
  /// <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read.</param>
  /// <param name="dest">Destination pixels; the 4x4 block starts at index 0.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      Span<int> firstPass = stackalloc int[4];

      // Row pass on the single coefficient: element 0 gets the larger share,
      // the remaining three elements all get the halved value.
      long dc = input[0] >> UnitQuantShift;
      long half = dc >> 1;
      dc -= half;
      firstPass[0] = WrapLow(dc);
      firstPass.Slice(1).Fill(WrapLow(half));

      // Column pass: split each first-pass value the same way down the column.
      for (int col = 0; col < 4; col++)
      {
          int value = firstPass[col];
          long lower = value >> 1;
          long top = value - lower;

          dest[col] = ClipPixelAdd(dest[col], top);
          dest[stride + col] = ClipPixelAdd(dest[stride + col], lower);
          dest[stride * 2 + col] = ClipPixelAdd(dest[stride * 2 + col], lower);
          dest[stride * 3 + col] = ClipPixelAdd(dest[stride * 3 + col], lower);
      }
  }
  /// <summary>
  /// 4-point 1-D inverse transform built on the <c>SinPi*_9</c> constants
  /// (the inverse ADST, going by the method name).
  /// </summary>
  /// <param name="input">Four input coefficients.</param>
  /// <param name="output">Receives the four transformed values.</param>
  public static void Iadst4(ReadOnlySpan<int> input, Span<int> output)
  {
      int in0 = input[0];
      int in1 = input[1];
      int in2 = input[2];
      int in3 = input[3];

      // All-zero input produces all-zero output; skip the arithmetic.
      if ((in0 | in1 | in2 | in3) == 0)
      {
          output.Slice(0, 4).Fill(0);

          return;
      }

      // 32-bit result is enough for the following multiplications.
      long p0 = SinPi1_9 * in0;
      long p1 = SinPi2_9 * in0;
      long p2 = SinPi3_9 * in1;
      long p3 = SinPi4_9 * in2;
      long p4 = SinPi1_9 * in2;
      long p5 = SinPi2_9 * in3;
      long p6 = SinPi4_9 * in3;
      long mixed = WrapLow(in0 - in2 + in3);

      long sumA = p0 + p3 + p5;
      long sumB = p1 - p4 - p6;
      long middle = p2;
      long scaledMix = SinPi3_9 * mixed;

      // 1-D transform scaling factor is sqrt(2).
      // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
      // + 1b (addition) = 29b.
      // Hence the output bit depth is 15b.
      output[0] = WrapLow(DctConstRoundShift(sumA + middle));
      output[1] = WrapLow(DctConstRoundShift(sumB + middle));
      output[2] = WrapLow(DctConstRoundShift(scaledMix));
      output[3] = WrapLow(DctConstRoundShift(sumA + sumB - middle));
  }
  /// <summary>
  /// 4-point 1-D inverse transform built on the <c>CosPi*_64</c> constants
  /// (the inverse DCT, going by the method name).
  /// </summary>
  /// <param name="input">Four input coefficients; each is truncated to 16 bits before use.</param>
  /// <param name="output">Receives the four transformed values.</param>
  public static void Idct4(ReadOnlySpan<int> input, Span<int> output)
  {
      // stage 1: even pair (0, 2) and odd pair (1, 3) are rotated separately.
      long evenSum = ((short)input[0] + (short)input[2]) * CosPi16_64;
      long evenDiff = ((short)input[0] - (short)input[2]) * CosPi16_64;
      short step0 = (short)WrapLow(DctConstRoundShift(evenSum));
      short step1 = (short)WrapLow(DctConstRoundShift(evenDiff));

      long oddLow = (short)input[1] * CosPi24_64 - (short)input[3] * CosPi8_64;
      long oddHigh = (short)input[1] * CosPi8_64 + (short)input[3] * CosPi24_64;
      short step2 = (short)WrapLow(DctConstRoundShift(oddLow));
      short step3 = (short)WrapLow(DctConstRoundShift(oddHigh));

      // stage 2: final butterfly.
      output[0] = WrapLow(step0 + step3);
      output[1] = WrapLow(step1 + step2);
      output[2] = WrapLow(step1 - step2);
      output[3] = WrapLow(step0 - step3);
  }
  /// <summary>
  /// 4x4 2-D inverse transform (rows, then columns, each through <see cref="Idct4"/>),
  /// rounded by 4 bits and added onto <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">16 coefficients in row-major order.</param>
  /// <param name="dest">Destination pixels; the 4x4 block starts at index 0.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      Span<int> rowPass = stackalloc int[4 * 4];
      Span<int> column = stackalloc int[4];
      Span<int> transformed = stackalloc int[4];

      // Rows
      for (int row = 0; row < 4; ++row)
      {
          int offset = row * 4;
          Idct4(input.Slice(offset), rowPass.Slice(offset));
      }

      // Columns
      for (int col = 0; col < 4; ++col)
      {
          for (int row = 0; row < 4; ++row)
          {
              column[row] = rowPass[row * 4 + col];
          }

          Idct4(column, transformed);

          for (int row = 0; row < 4; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(transformed[row], 4));
          }
      }
  }
  /// <summary>
  /// 4x4 inverse transform that reads only <c>input[0]</c>: the coefficient is scaled twice
  /// by <c>CosPi16_64</c>, rounded by 4 bits, and the same value is added to all 16 pixels.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read.</param>
  /// <param name="dest">Destination pixels; the 4x4 block starts at index 0.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  public static void Idct4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      int dc = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      dc = WrapLow(DctConstRoundShift(dc * CosPi16_64));
      long a1 = BitUtils.RoundPowerOfTwo(dc, 4);

      // Address rows by offset instead of re-slicing dest by stride after every
      // row: a trailing Slice(stride) after the last row demands stride more
      // elements than the block touches and throws when the block ends the buffer.
      for (int row = 0; row < 4; row++)
      {
          int rowStart = row * stride;

          dest[rowStart] = ClipPixelAdd(dest[rowStart], a1);
          dest[rowStart + 1] = ClipPixelAdd(dest[rowStart + 1], a1);
          dest[rowStart + 2] = ClipPixelAdd(dest[rowStart + 2], a1);
          dest[rowStart + 3] = ClipPixelAdd(dest[rowStart + 3], a1);
      }
  }
  /// <summary>
  /// 8-point 1-D inverse transform computed in three butterfly stages
  /// (the inverse ADST, going by the method name).
  /// </summary>
  /// <param name="input">Eight input coefficients.</param>
  /// <param name="output">Receives the eight transformed values.</param>
  public static void Iadst8(ReadOnlySpan<int> input, Span<int> output)
  {
      // Inputs are consumed in a permuted order.
      long x0 = input[7];
      long x1 = input[0];
      long x2 = input[5];
      long x3 = input[2];
      long x4 = input[3];
      long x5 = input[4];
      long x6 = input[1];
      long x7 = input[6];

      // All-zero input produces all-zero output; skip the arithmetic.
      bool allZero = (x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0;
      if (allZero)
      {
          output.Slice(0, 8).Fill(0);

          return;
      }

      // stage 1 (products are deliberately truncated to 32 bits)
      int s0 = (int)(CosPi2_64 * x0 + CosPi30_64 * x1);
      int s1 = (int)(CosPi30_64 * x0 - CosPi2_64 * x1);
      int s2 = (int)(CosPi10_64 * x2 + CosPi22_64 * x3);
      int s3 = (int)(CosPi22_64 * x2 - CosPi10_64 * x3);
      int s4 = (int)(CosPi18_64 * x4 + CosPi14_64 * x5);
      int s5 = (int)(CosPi14_64 * x4 - CosPi18_64 * x5);
      int s6 = (int)(CosPi26_64 * x6 + CosPi6_64 * x7);
      int s7 = (int)(CosPi6_64 * x6 - CosPi26_64 * x7);

      x0 = WrapLow(DctConstRoundShift(s0 + s4));
      x1 = WrapLow(DctConstRoundShift(s1 + s5));
      x2 = WrapLow(DctConstRoundShift(s2 + s6));
      x3 = WrapLow(DctConstRoundShift(s3 + s7));
      x4 = WrapLow(DctConstRoundShift(s0 - s4));
      x5 = WrapLow(DctConstRoundShift(s1 - s5));
      x6 = WrapLow(DctConstRoundShift(s2 - s6));
      x7 = WrapLow(DctConstRoundShift(s3 - s7));

      // stage 2: the first four values pass through, the last four are rotated.
      s0 = (int)x0;
      s1 = (int)x1;
      s2 = (int)x2;
      s3 = (int)x3;
      s4 = (int)(CosPi8_64 * x4 + CosPi24_64 * x5);
      s5 = (int)(CosPi24_64 * x4 - CosPi8_64 * x5);
      s6 = (int)(-CosPi24_64 * x6 + CosPi8_64 * x7);
      s7 = (int)(CosPi8_64 * x6 + CosPi24_64 * x7);

      x0 = WrapLow(s0 + s2);
      x1 = WrapLow(s1 + s3);
      x2 = WrapLow(s0 - s2);
      x3 = WrapLow(s1 - s3);
      x4 = WrapLow(DctConstRoundShift(s4 + s6));
      x5 = WrapLow(DctConstRoundShift(s5 + s7));
      x6 = WrapLow(DctConstRoundShift(s4 - s6));
      x7 = WrapLow(DctConstRoundShift(s5 - s7));

      // stage 3
      s2 = (int)(CosPi16_64 * (x2 + x3));
      s3 = (int)(CosPi16_64 * (x2 - x3));
      s6 = (int)(CosPi16_64 * (x6 + x7));
      s7 = (int)(CosPi16_64 * (x6 - x7));

      x2 = WrapLow(DctConstRoundShift(s2));
      x3 = WrapLow(DctConstRoundShift(s3));
      x6 = WrapLow(DctConstRoundShift(s6));
      x7 = WrapLow(DctConstRoundShift(s7));

      // Output permutation; odd-indexed outputs are negated.
      output[0] = WrapLow(x0);
      output[1] = WrapLow(-x4);
      output[2] = WrapLow(x6);
      output[3] = WrapLow(-x2);
      output[4] = WrapLow(x3);
      output[5] = WrapLow(-x7);
      output[6] = WrapLow(x5);
      output[7] = WrapLow(-x1);
  }
  /// <summary>
  /// 8-point 1-D inverse transform computed in four butterfly stages
  /// (the inverse DCT, going by the method name).
  /// </summary>
  /// <param name="input">Eight input coefficients; each is truncated to 16 bits before use.</param>
  /// <param name="output">Receives the eight transformed values.</param>
  public static void Idct8(ReadOnlySpan<int> input, Span<int> output)
  {
      // stage 1: even coefficients pass through (reordered), odd ones are rotated.
      short a0 = (short)input[0];
      short a2 = (short)input[4];
      short a1 = (short)input[2];
      short a3 = (short)input[6];

      long rotA = (short)input[1] * CosPi28_64 - (short)input[7] * CosPi4_64;
      long rotB = (short)input[1] * CosPi4_64 + (short)input[7] * CosPi28_64;
      short a4 = (short)WrapLow(DctConstRoundShift(rotA));
      short a7 = (short)WrapLow(DctConstRoundShift(rotB));

      rotA = (short)input[5] * CosPi12_64 - (short)input[3] * CosPi20_64;
      rotB = (short)input[5] * CosPi20_64 + (short)input[3] * CosPi12_64;
      short a5 = (short)WrapLow(DctConstRoundShift(rotA));
      short a6 = (short)WrapLow(DctConstRoundShift(rotB));

      // stage 2
      rotA = (a0 + a2) * CosPi16_64;
      rotB = (a0 - a2) * CosPi16_64;
      short b0 = (short)WrapLow(DctConstRoundShift(rotA));
      short b1 = (short)WrapLow(DctConstRoundShift(rotB));

      rotA = a1 * CosPi24_64 - a3 * CosPi8_64;
      rotB = a1 * CosPi8_64 + a3 * CosPi24_64;
      short b2 = (short)WrapLow(DctConstRoundShift(rotA));
      short b3 = (short)WrapLow(DctConstRoundShift(rotB));

      short b4 = (short)WrapLow(a4 + a5);
      short b5 = (short)WrapLow(a4 - a5);
      short b6 = (short)WrapLow(-a6 + a7);
      short b7 = (short)WrapLow(a6 + a7);

      // stage 3
      short c0 = (short)WrapLow(b0 + b3);
      short c1 = (short)WrapLow(b1 + b2);
      short c2 = (short)WrapLow(b1 - b2);
      short c3 = (short)WrapLow(b0 - b3);
      short c4 = b4;

      rotA = (b6 - b5) * CosPi16_64;
      rotB = (b5 + b6) * CosPi16_64;
      short c5 = (short)WrapLow(DctConstRoundShift(rotA));
      short c6 = (short)WrapLow(DctConstRoundShift(rotB));
      short c7 = b7;

      // stage 4: final butterfly, mirrored around the middle.
      output[0] = WrapLow(c0 + c7);
      output[1] = WrapLow(c1 + c6);
      output[2] = WrapLow(c2 + c5);
      output[3] = WrapLow(c3 + c4);
      output[4] = WrapLow(c3 - c4);
      output[5] = WrapLow(c2 - c5);
      output[6] = WrapLow(c1 - c6);
      output[7] = WrapLow(c0 - c7);
  }
  /// <summary>
  /// 8x8 2-D inverse transform over all 64 coefficients (rows, then columns, each through
  /// <see cref="Idct8"/>), rounded by 5 bits and added onto <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">64 coefficients in row-major order.</param>
  /// <param name="dest">Destination pixels; the 8x8 block starts at index 0.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      Span<int> rowPass = stackalloc int[8 * 8];
      Span<int> column = stackalloc int[8];
      Span<int> transformed = stackalloc int[8];

      // First transform rows
      for (int row = 0; row < 8; ++row)
      {
          int offset = row * 8;
          Idct8(input.Slice(offset), rowPass.Slice(offset));
      }

      // Then transform columns
      for (int col = 0; col < 8; ++col)
      {
          for (int row = 0; row < 8; ++row)
          {
              column[row] = rowPass[row * 8 + col];
          }

          Idct8(column, transformed);

          for (int row = 0; row < 8; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(transformed[row], 5));
          }
      }
  }
  /// <summary>
  /// 8x8 2-D inverse transform for blocks whose non-zero coefficients are confined to the
  /// first four rows: only those rows go through the row pass, then all eight columns are
  /// transformed with <see cref="Idct8"/>, rounded by 5 bits and added onto <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order; only the first 4 rows (32 values) are read.</param>
  /// <param name="dest">Destination pixels; the 8x8 block starts at index 0.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      Span<int> rowPass = stackalloc int[8 * 8];
      Span<int> column = stackalloc int[8];
      Span<int> transformed = stackalloc int[8];

      // The row pass below fills only rows 0-3, yet the column pass reads all
      // eight rows. stackalloc memory is not guaranteed to be zeroed, so the
      // untouched rows 4-7 must be cleared explicitly.
      rowPass.Slice(4 * 8).Clear();

      // First transform rows
      // Only first 4 row has non-zero coefs
      for (int row = 0; row < 4; ++row)
      {
          int offset = row * 8;
          Idct8(input.Slice(offset), rowPass.Slice(offset));
      }

      // Then transform columns
      for (int col = 0; col < 8; ++col)
      {
          for (int row = 0; row < 8; ++row)
          {
              column[row] = rowPass[row * 8 + col];
          }

          Idct8(column, transformed);

          for (int row = 0; row < 8; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(transformed[row], 5));
          }
      }
  }
  /// <summary>
  /// 8x8 inverse transform that reads only <c>input[0]</c>: the coefficient is scaled twice
  /// by <c>CosPi16_64</c>, rounded by 5 bits, and the same value is added to all 64 pixels.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read.</param>
  /// <param name="dest">Destination pixels; the 8x8 block starts at index 0.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  public static void Idct8x81Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      int dc = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      dc = WrapLow(DctConstRoundShift(dc * CosPi16_64));
      long a1 = BitUtils.RoundPowerOfTwo(dc, 5);

      // Address rows by offset instead of re-slicing dest by stride after every
      // row: a trailing Slice(stride) after the last row demands stride more
      // elements than the block touches and throws when the block ends the buffer.
      for (int row = 0; row < 8; ++row)
      {
          int rowStart = row * stride;

          for (int col = 0; col < 8; ++col)
          {
              dest[rowStart + col] = ClipPixelAdd(dest[rowStart + col], a1);
          }
      }
  }
  /// <summary>
  /// 16-point 1-D inverse transform computed in four butterfly stages
  /// (the inverse ADST, going by the method name).
  /// </summary>
  /// <param name="input">Sixteen input coefficients.</param>
  /// <param name="output">Receives the sixteen transformed values.</param>
  public static void Iadst16(ReadOnlySpan<int> input, Span<int> output)
  {
      // Inputs are consumed in a permuted order.
      long x0 = input[15];
      long x1 = input[0];
      long x2 = input[13];
      long x3 = input[2];
      long x4 = input[11];
      long x5 = input[4];
      long x6 = input[9];
      long x7 = input[6];
      long x8 = input[7];
      long x9 = input[8];
      long x10 = input[5];
      long x11 = input[10];
      long x12 = input[3];
      long x13 = input[12];
      long x14 = input[1];
      long x15 = input[14];

      // All-zero input produces all-zero output; skip the arithmetic.
      bool allZero = (x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0;
      if (allZero)
      {
          output.Slice(0, 16).Fill(0);

          return;
      }

      // stage 1
      long s0 = x0 * CosPi1_64 + x1 * CosPi31_64;
      long s1 = x0 * CosPi31_64 - x1 * CosPi1_64;
      long s2 = x2 * CosPi5_64 + x3 * CosPi27_64;
      long s3 = x2 * CosPi27_64 - x3 * CosPi5_64;
      long s4 = x4 * CosPi9_64 + x5 * CosPi23_64;
      long s5 = x4 * CosPi23_64 - x5 * CosPi9_64;
      long s6 = x6 * CosPi13_64 + x7 * CosPi19_64;
      long s7 = x6 * CosPi19_64 - x7 * CosPi13_64;
      long s8 = x8 * CosPi17_64 + x9 * CosPi15_64;
      long s9 = x8 * CosPi15_64 - x9 * CosPi17_64;
      long s10 = x10 * CosPi21_64 + x11 * CosPi11_64;
      long s11 = x10 * CosPi11_64 - x11 * CosPi21_64;
      long s12 = x12 * CosPi25_64 + x13 * CosPi7_64;
      long s13 = x12 * CosPi7_64 - x13 * CosPi25_64;
      long s14 = x14 * CosPi29_64 + x15 * CosPi3_64;
      long s15 = x14 * CosPi3_64 - x15 * CosPi29_64;

      x0 = WrapLow(DctConstRoundShift(s0 + s8));
      x1 = WrapLow(DctConstRoundShift(s1 + s9));
      x2 = WrapLow(DctConstRoundShift(s2 + s10));
      x3 = WrapLow(DctConstRoundShift(s3 + s11));
      x4 = WrapLow(DctConstRoundShift(s4 + s12));
      x5 = WrapLow(DctConstRoundShift(s5 + s13));
      x6 = WrapLow(DctConstRoundShift(s6 + s14));
      x7 = WrapLow(DctConstRoundShift(s7 + s15));
      x8 = WrapLow(DctConstRoundShift(s0 - s8));
      x9 = WrapLow(DctConstRoundShift(s1 - s9));
      x10 = WrapLow(DctConstRoundShift(s2 - s10));
      x11 = WrapLow(DctConstRoundShift(s3 - s11));
      x12 = WrapLow(DctConstRoundShift(s4 - s12));
      x13 = WrapLow(DctConstRoundShift(s5 - s13));
      x14 = WrapLow(DctConstRoundShift(s6 - s14));
      x15 = WrapLow(DctConstRoundShift(s7 - s15));

      // stage 2: the lower half passes through, the upper half is rotated.
      s0 = x0;
      s1 = x1;
      s2 = x2;
      s3 = x3;
      s4 = x4;
      s5 = x5;
      s6 = x6;
      s7 = x7;
      s8 = x8 * CosPi4_64 + x9 * CosPi28_64;
      s9 = x8 * CosPi28_64 - x9 * CosPi4_64;
      s10 = x10 * CosPi20_64 + x11 * CosPi12_64;
      s11 = x10 * CosPi12_64 - x11 * CosPi20_64;
      s12 = -x12 * CosPi28_64 + x13 * CosPi4_64;
      s13 = x12 * CosPi4_64 + x13 * CosPi28_64;
      s14 = -x14 * CosPi12_64 + x15 * CosPi20_64;
      s15 = x14 * CosPi20_64 + x15 * CosPi12_64;

      x0 = WrapLow(s0 + s4);
      x1 = WrapLow(s1 + s5);
      x2 = WrapLow(s2 + s6);
      x3 = WrapLow(s3 + s7);
      x4 = WrapLow(s0 - s4);
      x5 = WrapLow(s1 - s5);
      x6 = WrapLow(s2 - s6);
      x7 = WrapLow(s3 - s7);
      x8 = WrapLow(DctConstRoundShift(s8 + s12));
      x9 = WrapLow(DctConstRoundShift(s9 + s13));
      x10 = WrapLow(DctConstRoundShift(s10 + s14));
      x11 = WrapLow(DctConstRoundShift(s11 + s15));
      x12 = WrapLow(DctConstRoundShift(s8 - s12));
      x13 = WrapLow(DctConstRoundShift(s9 - s13));
      x14 = WrapLow(DctConstRoundShift(s10 - s14));
      x15 = WrapLow(DctConstRoundShift(s11 - s15));

      // stage 3
      s0 = x0;
      s1 = x1;
      s2 = x2;
      s3 = x3;
      s4 = x4 * CosPi8_64 + x5 * CosPi24_64;
      s5 = x4 * CosPi24_64 - x5 * CosPi8_64;
      s6 = -x6 * CosPi24_64 + x7 * CosPi8_64;
      s7 = x6 * CosPi8_64 + x7 * CosPi24_64;
      s8 = x8;
      s9 = x9;
      s10 = x10;
      s11 = x11;
      s12 = x12 * CosPi8_64 + x13 * CosPi24_64;
      s13 = x12 * CosPi24_64 - x13 * CosPi8_64;
      s14 = -x14 * CosPi24_64 + x15 * CosPi8_64;
      s15 = x14 * CosPi8_64 + x15 * CosPi24_64;

      x0 = WrapLow(s0 + s2);
      x1 = WrapLow(s1 + s3);
      x2 = WrapLow(s0 - s2);
      x3 = WrapLow(s1 - s3);
      x4 = WrapLow(DctConstRoundShift(s4 + s6));
      x5 = WrapLow(DctConstRoundShift(s5 + s7));
      x6 = WrapLow(DctConstRoundShift(s4 - s6));
      x7 = WrapLow(DctConstRoundShift(s5 - s7));
      x8 = WrapLow(s8 + s10);
      x9 = WrapLow(s9 + s11);
      x10 = WrapLow(s8 - s10);
      x11 = WrapLow(s9 - s11);
      x12 = WrapLow(DctConstRoundShift(s12 + s14));
      x13 = WrapLow(DctConstRoundShift(s13 + s15));
      x14 = WrapLow(DctConstRoundShift(s12 - s14));
      x15 = WrapLow(DctConstRoundShift(s13 - s15));

      // stage 4
      s2 = (-CosPi16_64) * (x2 + x3);
      s3 = CosPi16_64 * (x2 - x3);
      s6 = CosPi16_64 * (x6 + x7);
      s7 = CosPi16_64 * (-x6 + x7);
      s10 = CosPi16_64 * (x10 + x11);
      s11 = CosPi16_64 * (-x10 + x11);
      s14 = (-CosPi16_64) * (x14 + x15);
      s15 = CosPi16_64 * (x14 - x15);

      x2 = WrapLow(DctConstRoundShift(s2));
      x3 = WrapLow(DctConstRoundShift(s3));
      x6 = WrapLow(DctConstRoundShift(s6));
      x7 = WrapLow(DctConstRoundShift(s7));
      x10 = WrapLow(DctConstRoundShift(s10));
      x11 = WrapLow(DctConstRoundShift(s11));
      x14 = WrapLow(DctConstRoundShift(s14));
      x15 = WrapLow(DctConstRoundShift(s15));

      // Output permutation; outputs 1, 3, 13 and 15 are negated.
      output[0] = WrapLow(x0);
      output[1] = WrapLow(-x8);
      output[2] = WrapLow(x12);
      output[3] = WrapLow(-x4);
      output[4] = WrapLow(x6);
      output[5] = WrapLow(x14);
      output[6] = WrapLow(x10);
      output[7] = WrapLow(x2);
      output[8] = WrapLow(x3);
      output[9] = WrapLow(x11);
      output[10] = WrapLow(x15);
      output[11] = WrapLow(x7);
      output[12] = WrapLow(x5);
      output[13] = WrapLow(-x13);
      output[14] = WrapLow(x9);
      output[15] = WrapLow(-x1);
  }
  /// <summary>
  /// 16-point one-dimensional inverse DCT applied to a single row or column.
  /// </summary>
  /// <remarks>
  /// Every coefficient is narrowed to 16 bits when it is loaded and every intermediate value
  /// is kept in a 16-bit scratch buffer. The two scratch buffers alternate roles from stage to
  /// stage: each stage reads one buffer and writes the other.
  /// </remarks>
  /// <param name="input">Source coefficients; indices 0 through 15 are read.</param>
  /// <param name="output">Destination; indices 0 through 15 are written.</param>
  public static void Idct16(ReadOnlySpan<int> input, Span<int> output)
  {
      Span<short> bufA = stackalloc short[16];
      Span<short> bufB = stackalloc short[16];
      long p, q;

      // stage 1: load the coefficients in permuted order
      bufA[0] = (short)input[0];
      bufA[1] = (short)input[8];
      bufA[2] = (short)input[4];
      bufA[3] = (short)input[12];
      bufA[4] = (short)input[2];
      bufA[5] = (short)input[10];
      bufA[6] = (short)input[6];
      bufA[7] = (short)input[14];
      bufA[8] = (short)input[1];
      bufA[9] = (short)input[9];
      bufA[10] = (short)input[5];
      bufA[11] = (short)input[13];
      bufA[12] = (short)input[3];
      bufA[13] = (short)input[11];
      bufA[14] = (short)input[7];
      bufA[15] = (short)input[15];

      // stage 2: entries 0..7 pass through unchanged
      bufA.Slice(0, 8).CopyTo(bufB);

      p = bufA[8] * CosPi30_64 - bufA[15] * CosPi2_64;
      q = bufA[8] * CosPi2_64 + bufA[15] * CosPi30_64;
      bufB[8] = (short)WrapLow(DctConstRoundShift(p));
      bufB[15] = (short)WrapLow(DctConstRoundShift(q));

      p = bufA[9] * CosPi14_64 - bufA[14] * CosPi18_64;
      q = bufA[9] * CosPi18_64 + bufA[14] * CosPi14_64;
      bufB[9] = (short)WrapLow(DctConstRoundShift(p));
      bufB[14] = (short)WrapLow(DctConstRoundShift(q));

      p = bufA[10] * CosPi22_64 - bufA[13] * CosPi10_64;
      q = bufA[10] * CosPi10_64 + bufA[13] * CosPi22_64;
      bufB[10] = (short)WrapLow(DctConstRoundShift(p));
      bufB[13] = (short)WrapLow(DctConstRoundShift(q));

      p = bufA[11] * CosPi6_64 - bufA[12] * CosPi26_64;
      q = bufA[11] * CosPi26_64 + bufA[12] * CosPi6_64;
      bufB[11] = (short)WrapLow(DctConstRoundShift(p));
      bufB[12] = (short)WrapLow(DctConstRoundShift(q));

      // stage 3: entries 0..3 pass through unchanged
      bufB.Slice(0, 4).CopyTo(bufA);

      p = bufB[4] * CosPi28_64 - bufB[7] * CosPi4_64;
      q = bufB[4] * CosPi4_64 + bufB[7] * CosPi28_64;
      bufA[4] = (short)WrapLow(DctConstRoundShift(p));
      bufA[7] = (short)WrapLow(DctConstRoundShift(q));

      p = bufB[5] * CosPi12_64 - bufB[6] * CosPi20_64;
      q = bufB[5] * CosPi20_64 + bufB[6] * CosPi12_64;
      bufA[5] = (short)WrapLow(DctConstRoundShift(p));
      bufA[6] = (short)WrapLow(DctConstRoundShift(q));

      bufA[8] = (short)WrapLow(bufB[8] + bufB[9]);
      bufA[9] = (short)WrapLow(bufB[8] - bufB[9]);
      bufA[10] = (short)WrapLow(bufB[11] - bufB[10]);
      bufA[11] = (short)WrapLow(bufB[10] + bufB[11]);
      bufA[12] = (short)WrapLow(bufB[12] + bufB[13]);
      bufA[13] = (short)WrapLow(bufB[12] - bufB[13]);
      bufA[14] = (short)WrapLow(bufB[15] - bufB[14]);
      bufA[15] = (short)WrapLow(bufB[14] + bufB[15]);

      // stage 4
      p = (bufA[0] + bufA[1]) * CosPi16_64;
      q = (bufA[0] - bufA[1]) * CosPi16_64;
      bufB[0] = (short)WrapLow(DctConstRoundShift(p));
      bufB[1] = (short)WrapLow(DctConstRoundShift(q));

      p = bufA[2] * CosPi24_64 - bufA[3] * CosPi8_64;
      q = bufA[2] * CosPi8_64 + bufA[3] * CosPi24_64;
      bufB[2] = (short)WrapLow(DctConstRoundShift(p));
      bufB[3] = (short)WrapLow(DctConstRoundShift(q));

      bufB[4] = (short)WrapLow(bufA[4] + bufA[5]);
      bufB[5] = (short)WrapLow(bufA[4] - bufA[5]);
      bufB[6] = (short)WrapLow(bufA[7] - bufA[6]);
      bufB[7] = (short)WrapLow(bufA[6] + bufA[7]);

      bufB[8] = bufA[8];
      bufB[15] = bufA[15];

      p = -bufA[9] * CosPi8_64 + bufA[14] * CosPi24_64;
      q = bufA[9] * CosPi24_64 + bufA[14] * CosPi8_64;
      bufB[9] = (short)WrapLow(DctConstRoundShift(p));
      bufB[14] = (short)WrapLow(DctConstRoundShift(q));

      p = -bufA[10] * CosPi24_64 - bufA[13] * CosPi8_64;
      q = -bufA[10] * CosPi8_64 + bufA[13] * CosPi24_64;
      bufB[10] = (short)WrapLow(DctConstRoundShift(p));
      bufB[13] = (short)WrapLow(DctConstRoundShift(q));

      bufB[11] = bufA[11];
      bufB[12] = bufA[12];

      // stage 5
      bufA[0] = (short)WrapLow(bufB[0] + bufB[3]);
      bufA[1] = (short)WrapLow(bufB[1] + bufB[2]);
      bufA[2] = (short)WrapLow(bufB[1] - bufB[2]);
      bufA[3] = (short)WrapLow(bufB[0] - bufB[3]);
      bufA[4] = bufB[4];

      p = (bufB[6] - bufB[5]) * CosPi16_64;
      q = (bufB[5] + bufB[6]) * CosPi16_64;
      bufA[5] = (short)WrapLow(DctConstRoundShift(p));
      bufA[6] = (short)WrapLow(DctConstRoundShift(q));
      bufA[7] = bufB[7];

      bufA[8] = (short)WrapLow(bufB[8] + bufB[11]);
      bufA[9] = (short)WrapLow(bufB[9] + bufB[10]);
      bufA[10] = (short)WrapLow(bufB[9] - bufB[10]);
      bufA[11] = (short)WrapLow(bufB[8] - bufB[11]);
      bufA[12] = (short)WrapLow(bufB[15] - bufB[12]);
      bufA[13] = (short)WrapLow(bufB[14] - bufB[13]);
      bufA[14] = (short)WrapLow(bufB[13] + bufB[14]);
      bufA[15] = (short)WrapLow(bufB[12] + bufB[15]);

      // stage 6: entries 0..7 are mirrored sums followed by mirrored differences
      for (int k = 0; k < 4; k++)
      {
          bufB[k] = (short)WrapLow(bufA[k] + bufA[7 - k]);
      }

      for (int k = 4; k < 8; k++)
      {
          bufB[k] = (short)WrapLow(bufA[7 - k] - bufA[k]);
      }

      bufB[8] = bufA[8];
      bufB[9] = bufA[9];

      p = (-bufA[10] + bufA[13]) * CosPi16_64;
      q = (bufA[10] + bufA[13]) * CosPi16_64;
      bufB[10] = (short)WrapLow(DctConstRoundShift(p));
      bufB[13] = (short)WrapLow(DctConstRoundShift(q));

      p = (-bufA[11] + bufA[12]) * CosPi16_64;
      q = (bufA[11] + bufA[12]) * CosPi16_64;
      bufB[11] = (short)WrapLow(DctConstRoundShift(p));
      bufB[12] = (short)WrapLow(DctConstRoundShift(q));

      bufB[14] = bufA[14];
      bufB[15] = bufA[15];

      // stage 7: first half holds the mirrored sums, second half the mirrored differences
      for (int k = 0; k < 8; k++)
      {
          output[k] = WrapLow(bufB[k] + bufB[15 - k]);
      }

      for (int k = 8; k < 16; k++)
      {
          output[k] = WrapLow(bufB[15 - k] - bufB[k]);
      }
  }
  /// <summary>
  /// Full 16x16 inverse DCT: transforms all 16 rows, then all 16 columns, and merges the
  /// rounded result into <paramref name="dest"/> through <c>ClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients laid out as 16 consecutive rows of 16 values.</param>
  /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/>.</param>
  public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 16;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> columnResult = stackalloc int[Size];

      // Row pass: every row of the block is transformed.
      for (int row = 0; row < Size; row++)
      {
          Idct16(input.Slice(row * Size), rowPass.Slice(row * Size));
      }

      // Column pass: gather a column, transform it, then fold it into the destination.
      for (int col = 0; col < Size; col++)
      {
          for (int row = 0; row < Size; row++)
          {
              column[row] = rowPass[row * Size + col];
          }

          Idct16(column, columnResult);

          for (int row = 0; row < Size; row++)
          {
              dest[row * stride + col] = ClipPixelAdd(dest[row * stride + col], BitUtils.RoundPowerOfTwo(columnResult[row], 6));
          }
      }
  }
  /// <summary>
  /// 16x16 inverse DCT for blocks whose non-zero coefficients all lie in the upper-left 8x8
  /// area. Only the first 8 rows are transformed; the rounded result is merged into
  /// <paramref name="dest"/> through <c>ClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients in rows of 16 values; only the first 8 rows are read.</param>
  /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/>.</param>
  public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      int i, j;
      Span<int> output = stackalloc int[16 * 16];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[16];
      Span<int> tempOut = stackalloc int[16];

      // The row pass below fills rows 0..7 only, yet the column pass reads all 16 rows.
      // stackalloc memory is not guaranteed to be zeroed, so clear the untouched rows.
      output.Slice(8 * 16).Clear();

      // First transform rows. Since all non-zero dct coefficients are in
      // upper-left 8x8 area, we only need to calculate first 8 rows here.
      for (i = 0; i < 8; ++i)
      {
          Idct16(input, outptr);
          input = input.Slice(16);
          outptr = outptr.Slice(16);
      }

      // Then transform columns
      for (i = 0; i < 16; ++i)
      {
          for (j = 0; j < 16; ++j)
          {
              tempIn[j] = output[j * 16 + i];
          }

          Idct16(tempIn, tempOut);

          for (j = 0; j < 16; ++j)
          {
              dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
          }
      }
  }
  /// <summary>
  /// 16x16 inverse DCT for blocks whose non-zero coefficients all lie in the upper-left 4x4
  /// area. Only the first 4 rows are transformed; the rounded result is merged into
  /// <paramref name="dest"/> through <c>ClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients in rows of 16 values; only the first 4 rows are read.</param>
  /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/>.</param>
  public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      int i, j;
      Span<int> output = stackalloc int[16 * 16];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[16];
      Span<int> tempOut = stackalloc int[16];

      // The row pass below fills rows 0..3 only, yet the column pass reads all 16 rows.
      // stackalloc memory is not guaranteed to be zeroed, so clear the untouched rows.
      output.Slice(4 * 16).Clear();

      // First transform rows. Since all non-zero dct coefficients are in
      // upper-left 4x4 area, we only need to calculate first 4 rows here.
      for (i = 0; i < 4; ++i)
      {
          Idct16(input, outptr);
          input = input.Slice(16);
          outptr = outptr.Slice(16);
      }

      // Then transform columns
      for (i = 0; i < 16; ++i)
      {
          for (j = 0; j < 16; ++j)
          {
              tempIn[j] = output[j * 16 + i];
          }

          Idct16(tempIn, tempOut);

          for (j = 0; j < 16; ++j)
          {
              dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
          }
      }
  }
  /// <summary>
  /// 16x16 inverse transform for a block where only <c>input[0]</c> is used: a single value
  /// is derived from it and merged into every pixel of the 16x16 area through
  /// <c>ClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read (narrowed to 16 bits).</param>
  /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/>.</param>
  public static void Idct16x161Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      // The same scaling is applied twice, once per transform dimension.
      int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      output = WrapLow(DctConstRoundShift(output * CosPi16_64));
      long a1 = BitUtils.RoundPowerOfTwo(output, 6);

      // Address rows by offset instead of re-slicing dest after each row: slicing past the
      // final row throws when fewer than `stride` elements follow it, even though every
      // pixel that is actually written lies inside the span.
      for (int j = 0; j < 16; ++j)
      {
          int rowStart = j * stride;

          for (int i = 0; i < 16; ++i)
          {
              dest[rowStart + i] = ClipPixelAdd(dest[rowStart + i], a1);
          }
      }
  }
  /// <summary>
  /// 32-point one-dimensional inverse DCT applied to a single row or column.
  /// </summary>
  /// <remarks>
  /// Every coefficient is narrowed to 16 bits when it is loaded and every intermediate value
  /// is kept in a 16-bit scratch buffer. The two scratch buffers alternate roles from stage to
  /// stage: each stage reads one buffer and writes the other.
  /// </remarks>
  /// <param name="input">Source coefficients; indices 0 through 31 are read.</param>
  /// <param name="output">Destination; indices 0 through 31 are written.</param>
  public static void Idct32(ReadOnlySpan<int> input, Span<int> output)
  {
      Span<short> bufA = stackalloc short[32];
      Span<short> bufB = stackalloc short[32];
      long p, q;

      // stage 1: even-indexed coefficients are loaded in permuted order
      bufA[0] = (short)input[0];
      bufA[1] = (short)input[16];
      bufA[2] = (short)input[8];
      bufA[3] = (short)input[24];
      bufA[4] = (short)input[4];
      bufA[5] = (short)input[20];
      bufA[6] = (short)input[12];
      bufA[7] = (short)input[28];
      bufA[8] = (short)input[2];
      bufA[9] = (short)input[18];
      bufA[10] = (short)input[10];
      bufA[11] = (short)input[26];
      bufA[12] = (short)input[6];
      bufA[13] = (short)input[22];
      bufA[14] = (short)input[14];
      bufA[15] = (short)input[30];

      // stage 1: odd-indexed coefficients are combined pairwise
      p = (short)input[1] * CosPi31_64 - (short)input[31] * CosPi1_64;
      q = (short)input[1] * CosPi1_64 + (short)input[31] * CosPi31_64;
      bufA[16] = (short)WrapLow(DctConstRoundShift(p));
      bufA[31] = (short)WrapLow(DctConstRoundShift(q));

      p = (short)input[17] * CosPi15_64 - (short)input[15] * CosPi17_64;
      q = (short)input[17] * CosPi17_64 + (short)input[15] * CosPi15_64;
      bufA[17] = (short)WrapLow(DctConstRoundShift(p));
      bufA[30] = (short)WrapLow(DctConstRoundShift(q));

      p = (short)input[9] * CosPi23_64 - (short)input[23] * CosPi9_64;
      q = (short)input[9] * CosPi9_64 + (short)input[23] * CosPi23_64;
      bufA[18] = (short)WrapLow(DctConstRoundShift(p));
      bufA[29] = (short)WrapLow(DctConstRoundShift(q));

      p = (short)input[25] * CosPi7_64 - (short)input[7] * CosPi25_64;
      q = (short)input[25] * CosPi25_64 + (short)input[7] * CosPi7_64;
      bufA[19] = (short)WrapLow(DctConstRoundShift(p));
      bufA[28] = (short)WrapLow(DctConstRoundShift(q));

      p = (short)input[5] * CosPi27_64 - (short)input[27] * CosPi5_64;
      q = (short)input[5] * CosPi5_64 + (short)input[27] * CosPi27_64;
      bufA[20] = (short)WrapLow(DctConstRoundShift(p));
      bufA[27] = (short)WrapLow(DctConstRoundShift(q));

      p = (short)input[21] * CosPi11_64 - (short)input[11] * CosPi21_64;
      q = (short)input[21] * CosPi21_64 + (short)input[11] * CosPi11_64;
      bufA[21] = (short)WrapLow(DctConstRoundShift(p));
      bufA[26] = (short)WrapLow(DctConstRoundShift(q));

      p = (short)input[13] * CosPi19_64 - (short)input[19] * CosPi13_64;
      q = (short)input[13] * CosPi13_64 + (short)input[19] * CosPi19_64;
      bufA[22] = (short)WrapLow(DctConstRoundShift(p));
      bufA[25] = (short)WrapLow(DctConstRoundShift(q));

      p = (short)input[29] * CosPi3_64 - (short)input[3] * CosPi29_64;
      q = (short)input[29] * CosPi29_64 + (short)input[3] * CosPi3_64;
      bufA[23] = (short)WrapLow(DctConstRoundShift(p));
      bufA[24] = (short)WrapLow(DctConstRoundShift(q));

      // stage 2: entries 0..7 pass through unchanged
      bufA.Slice(0, 8).CopyTo(bufB);

      p = bufA[8] * CosPi30_64 - bufA[15] * CosPi2_64;
      q = bufA[8] * CosPi2_64 + bufA[15] * CosPi30_64;
      bufB[8] = (short)WrapLow(DctConstRoundShift(p));
      bufB[15] = (short)WrapLow(DctConstRoundShift(q));

      p = bufA[9] * CosPi14_64 - bufA[14] * CosPi18_64;
      q = bufA[9] * CosPi18_64 + bufA[14] * CosPi14_64;
      bufB[9] = (short)WrapLow(DctConstRoundShift(p));
      bufB[14] = (short)WrapLow(DctConstRoundShift(q));

      p = bufA[10] * CosPi22_64 - bufA[13] * CosPi10_64;
      q = bufA[10] * CosPi10_64 + bufA[13] * CosPi22_64;
      bufB[10] = (short)WrapLow(DctConstRoundShift(p));
      bufB[13] = (short)WrapLow(DctConstRoundShift(q));

      p = bufA[11] * CosPi6_64 - bufA[12] * CosPi26_64;
      q = bufA[11] * CosPi26_64 + bufA[12] * CosPi6_64;
      bufB[11] = (short)WrapLow(DctConstRoundShift(p));
      bufB[12] = (short)WrapLow(DctConstRoundShift(q));

      bufB[16] = (short)WrapLow(bufA[16] + bufA[17]);
      bufB[17] = (short)WrapLow(bufA[16] - bufA[17]);
      bufB[18] = (short)WrapLow(bufA[19] - bufA[18]);
      bufB[19] = (short)WrapLow(bufA[18] + bufA[19]);
      bufB[20] = (short)WrapLow(bufA[20] + bufA[21]);
      bufB[21] = (short)WrapLow(bufA[20] - bufA[21]);
      bufB[22] = (short)WrapLow(bufA[23] - bufA[22]);
      bufB[23] = (short)WrapLow(bufA[22] + bufA[23]);
      bufB[24] = (short)WrapLow(bufA[24] + bufA[25]);
      bufB[25] = (short)WrapLow(bufA[24] - bufA[25]);
      bufB[26] = (short)WrapLow(bufA[27] - bufA[26]);
      bufB[27] = (short)WrapLow(bufA[26] + bufA[27]);
      bufB[28] = (short)WrapLow(bufA[28] + bufA[29]);
      bufB[29] = (short)WrapLow(bufA[28] - bufA[29]);
      bufB[30] = (short)WrapLow(bufA[31] - bufA[30]);
      bufB[31] = (short)WrapLow(bufA[30] + bufA[31]);

      // stage 3: entries 0..3 pass through unchanged
      bufB.Slice(0, 4).CopyTo(bufA);

      p = bufB[4] * CosPi28_64 - bufB[7] * CosPi4_64;
      q = bufB[4] * CosPi4_64 + bufB[7] * CosPi28_64;
      bufA[4] = (short)WrapLow(DctConstRoundShift(p));
      bufA[7] = (short)WrapLow(DctConstRoundShift(q));

      p = bufB[5] * CosPi12_64 - bufB[6] * CosPi20_64;
      q = bufB[5] * CosPi20_64 + bufB[6] * CosPi12_64;
      bufA[5] = (short)WrapLow(DctConstRoundShift(p));
      bufA[6] = (short)WrapLow(DctConstRoundShift(q));

      bufA[8] = (short)WrapLow(bufB[8] + bufB[9]);
      bufA[9] = (short)WrapLow(bufB[8] - bufB[9]);
      bufA[10] = (short)WrapLow(bufB[11] - bufB[10]);
      bufA[11] = (short)WrapLow(bufB[10] + bufB[11]);
      bufA[12] = (short)WrapLow(bufB[12] + bufB[13]);
      bufA[13] = (short)WrapLow(bufB[12] - bufB[13]);
      bufA[14] = (short)WrapLow(bufB[15] - bufB[14]);
      bufA[15] = (short)WrapLow(bufB[14] + bufB[15]);

      bufA[16] = bufB[16];
      bufA[31] = bufB[31];

      p = -bufB[17] * CosPi4_64 + bufB[30] * CosPi28_64;
      q = bufB[17] * CosPi28_64 + bufB[30] * CosPi4_64;
      bufA[17] = (short)WrapLow(DctConstRoundShift(p));
      bufA[30] = (short)WrapLow(DctConstRoundShift(q));

      p = -bufB[18] * CosPi28_64 - bufB[29] * CosPi4_64;
      q = -bufB[18] * CosPi4_64 + bufB[29] * CosPi28_64;
      bufA[18] = (short)WrapLow(DctConstRoundShift(p));
      bufA[29] = (short)WrapLow(DctConstRoundShift(q));

      bufA[19] = bufB[19];
      bufA[20] = bufB[20];

      p = -bufB[21] * CosPi20_64 + bufB[26] * CosPi12_64;
      q = bufB[21] * CosPi12_64 + bufB[26] * CosPi20_64;
      bufA[21] = (short)WrapLow(DctConstRoundShift(p));
      bufA[26] = (short)WrapLow(DctConstRoundShift(q));

      p = -bufB[22] * CosPi12_64 - bufB[25] * CosPi20_64;
      q = -bufB[22] * CosPi20_64 + bufB[25] * CosPi12_64;
      bufA[22] = (short)WrapLow(DctConstRoundShift(p));
      bufA[25] = (short)WrapLow(DctConstRoundShift(q));

      bufA[23] = bufB[23];
      bufA[24] = bufB[24];
      bufA[27] = bufB[27];
      bufA[28] = bufB[28];

      // stage 4
      p = (bufA[0] + bufA[1]) * CosPi16_64;
      q = (bufA[0] - bufA[1]) * CosPi16_64;
      bufB[0] = (short)WrapLow(DctConstRoundShift(p));
      bufB[1] = (short)WrapLow(DctConstRoundShift(q));

      p = bufA[2] * CosPi24_64 - bufA[3] * CosPi8_64;
      q = bufA[2] * CosPi8_64 + bufA[3] * CosPi24_64;
      bufB[2] = (short)WrapLow(DctConstRoundShift(p));
      bufB[3] = (short)WrapLow(DctConstRoundShift(q));

      bufB[4] = (short)WrapLow(bufA[4] + bufA[5]);
      bufB[5] = (short)WrapLow(bufA[4] - bufA[5]);
      bufB[6] = (short)WrapLow(bufA[7] - bufA[6]);
      bufB[7] = (short)WrapLow(bufA[6] + bufA[7]);

      bufB[8] = bufA[8];
      bufB[15] = bufA[15];

      p = -bufA[9] * CosPi8_64 + bufA[14] * CosPi24_64;
      q = bufA[9] * CosPi24_64 + bufA[14] * CosPi8_64;
      bufB[9] = (short)WrapLow(DctConstRoundShift(p));
      bufB[14] = (short)WrapLow(DctConstRoundShift(q));

      p = -bufA[10] * CosPi24_64 - bufA[13] * CosPi8_64;
      q = -bufA[10] * CosPi8_64 + bufA[13] * CosPi24_64;
      bufB[10] = (short)WrapLow(DctConstRoundShift(p));
      bufB[13] = (short)WrapLow(DctConstRoundShift(q));

      bufB[11] = bufA[11];
      bufB[12] = bufA[12];

      bufB[16] = (short)WrapLow(bufA[16] + bufA[19]);
      bufB[17] = (short)WrapLow(bufA[17] + bufA[18]);
      bufB[18] = (short)WrapLow(bufA[17] - bufA[18]);
      bufB[19] = (short)WrapLow(bufA[16] - bufA[19]);
      bufB[20] = (short)WrapLow(bufA[23] - bufA[20]);
      bufB[21] = (short)WrapLow(bufA[22] - bufA[21]);
      bufB[22] = (short)WrapLow(bufA[21] + bufA[22]);
      bufB[23] = (short)WrapLow(bufA[20] + bufA[23]);
      bufB[24] = (short)WrapLow(bufA[24] + bufA[27]);
      bufB[25] = (short)WrapLow(bufA[25] + bufA[26]);
      bufB[26] = (short)WrapLow(bufA[25] - bufA[26]);
      bufB[27] = (short)WrapLow(bufA[24] - bufA[27]);
      bufB[28] = (short)WrapLow(bufA[31] - bufA[28]);
      bufB[29] = (short)WrapLow(bufA[30] - bufA[29]);
      bufB[30] = (short)WrapLow(bufA[29] + bufA[30]);
      bufB[31] = (short)WrapLow(bufA[28] + bufA[31]);

      // stage 5
      bufA[0] = (short)WrapLow(bufB[0] + bufB[3]);
      bufA[1] = (short)WrapLow(bufB[1] + bufB[2]);
      bufA[2] = (short)WrapLow(bufB[1] - bufB[2]);
      bufA[3] = (short)WrapLow(bufB[0] - bufB[3]);
      bufA[4] = bufB[4];

      p = (bufB[6] - bufB[5]) * CosPi16_64;
      q = (bufB[5] + bufB[6]) * CosPi16_64;
      bufA[5] = (short)WrapLow(DctConstRoundShift(p));
      bufA[6] = (short)WrapLow(DctConstRoundShift(q));
      bufA[7] = bufB[7];

      bufA[8] = (short)WrapLow(bufB[8] + bufB[11]);
      bufA[9] = (short)WrapLow(bufB[9] + bufB[10]);
      bufA[10] = (short)WrapLow(bufB[9] - bufB[10]);
      bufA[11] = (short)WrapLow(bufB[8] - bufB[11]);
      bufA[12] = (short)WrapLow(bufB[15] - bufB[12]);
      bufA[13] = (short)WrapLow(bufB[14] - bufB[13]);
      bufA[14] = (short)WrapLow(bufB[13] + bufB[14]);
      bufA[15] = (short)WrapLow(bufB[12] + bufB[15]);

      bufA[16] = bufB[16];
      bufA[17] = bufB[17];

      p = -bufB[18] * CosPi8_64 + bufB[29] * CosPi24_64;
      q = bufB[18] * CosPi24_64 + bufB[29] * CosPi8_64;
      bufA[18] = (short)WrapLow(DctConstRoundShift(p));
      bufA[29] = (short)WrapLow(DctConstRoundShift(q));

      p = -bufB[19] * CosPi8_64 + bufB[28] * CosPi24_64;
      q = bufB[19] * CosPi24_64 + bufB[28] * CosPi8_64;
      bufA[19] = (short)WrapLow(DctConstRoundShift(p));
      bufA[28] = (short)WrapLow(DctConstRoundShift(q));

      p = -bufB[20] * CosPi24_64 - bufB[27] * CosPi8_64;
      q = -bufB[20] * CosPi8_64 + bufB[27] * CosPi24_64;
      bufA[20] = (short)WrapLow(DctConstRoundShift(p));
      bufA[27] = (short)WrapLow(DctConstRoundShift(q));

      p = -bufB[21] * CosPi24_64 - bufB[26] * CosPi8_64;
      q = -bufB[21] * CosPi8_64 + bufB[26] * CosPi24_64;
      bufA[21] = (short)WrapLow(DctConstRoundShift(p));
      bufA[26] = (short)WrapLow(DctConstRoundShift(q));

      bufB.Slice(22, 4).CopyTo(bufA.Slice(22));
      bufA[30] = bufB[30];
      bufA[31] = bufB[31];

      // stage 6: entries 0..7 are mirrored sums followed by mirrored differences
      for (int k = 0; k < 4; k++)
      {
          bufB[k] = (short)WrapLow(bufA[k] + bufA[7 - k]);
      }

      for (int k = 4; k < 8; k++)
      {
          bufB[k] = (short)WrapLow(bufA[7 - k] - bufA[k]);
      }

      bufB[8] = bufA[8];
      bufB[9] = bufA[9];

      p = (-bufA[10] + bufA[13]) * CosPi16_64;
      q = (bufA[10] + bufA[13]) * CosPi16_64;
      bufB[10] = (short)WrapLow(DctConstRoundShift(p));
      bufB[13] = (short)WrapLow(DctConstRoundShift(q));

      p = (-bufA[11] + bufA[12]) * CosPi16_64;
      q = (bufA[11] + bufA[12]) * CosPi16_64;
      bufB[11] = (short)WrapLow(DctConstRoundShift(p));
      bufB[12] = (short)WrapLow(DctConstRoundShift(q));

      bufB[14] = bufA[14];
      bufB[15] = bufA[15];

      // stage 6: entries 16..23 mirror around 19.5, entries 24..31 mirror around 27.5
      for (int k = 16; k < 20; k++)
      {
          bufB[k] = (short)WrapLow(bufA[k] + bufA[39 - k]);
      }

      for (int k = 20; k < 24; k++)
      {
          bufB[k] = (short)WrapLow(bufA[39 - k] - bufA[k]);
      }

      for (int k = 24; k < 28; k++)
      {
          bufB[k] = (short)WrapLow(bufA[55 - k] - bufA[k]);
      }

      for (int k = 28; k < 32; k++)
      {
          bufB[k] = (short)WrapLow(bufA[55 - k] + bufA[k]);
      }

      // stage 7: entries 0..15 are mirrored sums followed by mirrored differences
      for (int k = 0; k < 8; k++)
      {
          bufA[k] = (short)WrapLow(bufB[k] + bufB[15 - k]);
      }

      for (int k = 8; k < 16; k++)
      {
          bufA[k] = (short)WrapLow(bufB[15 - k] - bufB[k]);
      }

      bufB.Slice(16, 4).CopyTo(bufA.Slice(16));

      p = (-bufB[20] + bufB[27]) * CosPi16_64;
      q = (bufB[20] + bufB[27]) * CosPi16_64;
      bufA[20] = (short)WrapLow(DctConstRoundShift(p));
      bufA[27] = (short)WrapLow(DctConstRoundShift(q));

      p = (-bufB[21] + bufB[26]) * CosPi16_64;
      q = (bufB[21] + bufB[26]) * CosPi16_64;
      bufA[21] = (short)WrapLow(DctConstRoundShift(p));
      bufA[26] = (short)WrapLow(DctConstRoundShift(q));

      p = (-bufB[22] + bufB[25]) * CosPi16_64;
      q = (bufB[22] + bufB[25]) * CosPi16_64;
      bufA[22] = (short)WrapLow(DctConstRoundShift(p));
      bufA[25] = (short)WrapLow(DctConstRoundShift(q));

      p = (-bufB[23] + bufB[24]) * CosPi16_64;
      q = (bufB[23] + bufB[24]) * CosPi16_64;
      bufA[23] = (short)WrapLow(DctConstRoundShift(p));
      bufA[24] = (short)WrapLow(DctConstRoundShift(q));

      bufB.Slice(28, 4).CopyTo(bufA.Slice(28));

      // final stage: first half holds the mirrored sums, second half the mirrored differences
      for (int k = 0; k < 16; k++)
      {
          output[k] = WrapLow(bufA[k] + bufA[31 - k]);
      }

      for (int k = 16; k < 32; k++)
      {
          output[k] = WrapLow(bufA[31 - k] - bufA[k]);
      }
  }
  /// <summary>
  /// Full 32x32 inverse DCT: transforms all 32 rows (rows whose coefficients are all zero in
  /// their low 16 bits are zero-filled instead of transformed), then all 32 columns, and
  /// merges the rounded result into <paramref name="dest"/> through <c>ClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients laid out as 32 consecutive rows of 32 values.</param>
  /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/>.</param>
  public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 32;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> columnResult = stackalloc int[Size];

      // Rows
      for (int row = 0; row < Size; row++)
      {
          ReadOnlySpan<int> rowIn = input.Slice(row * Size);
          Span<int> rowOut = rowPass.Slice(row * Size);

          // OR the 16-bit views of the coefficients together; zero means nothing to transform.
          short seenBits = 0;
          for (int k = 0; k < Size; k++)
          {
              seenBits |= (short)rowIn[k];
          }

          if (seenBits == 0)
          {
              rowOut.Slice(0, Size).Clear();
          }
          else
          {
              Idct32(rowIn, rowOut);
          }
      }

      // Columns
      for (int col = 0; col < Size; col++)
      {
          for (int row = 0; row < Size; row++)
          {
              column[row] = rowPass[row * Size + col];
          }

          Idct32(column, columnResult);

          for (int row = 0; row < Size; row++)
          {
              dest[row * stride + col] = ClipPixelAdd(dest[row * stride + col], BitUtils.RoundPowerOfTwo(columnResult[row], 6));
          }
      }
  }
  /// <summary>
  /// 32x32 inverse DCT for blocks whose non-zero coefficients all lie in the upper-left 16x16
  /// area. Only the first 16 rows are transformed; the rounded result is merged into
  /// <paramref name="dest"/> through <c>ClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients in rows of 32 values; only the first 16 rows are read.</param>
  /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/>.</param>
  public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      int i, j;
      Span<int> output = stackalloc int[32 * 32];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[32];
      Span<int> tempOut = stackalloc int[32];

      // The row pass below fills rows 0..15 only, yet the column pass reads all 32 rows.
      // stackalloc memory is not guaranteed to be zeroed, so clear the untouched rows.
      output.Slice(16 * 32).Clear();

      // Rows
      // Only upper-left 16x16 has non-zero coeff
      for (i = 0; i < 16; ++i)
      {
          Idct32(input, outptr);
          input = input.Slice(32);
          outptr = outptr.Slice(32);
      }

      // Columns
      for (i = 0; i < 32; ++i)
      {
          for (j = 0; j < 32; ++j)
          {
              tempIn[j] = output[j * 32 + i];
          }

          Idct32(tempIn, tempOut);

          for (j = 0; j < 32; ++j)
          {
              dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
          }
      }
  }
  /// <summary>
  /// 32x32 inverse DCT for blocks whose non-zero coefficients all lie in the upper-left 8x8
  /// area. Only the first 8 rows are transformed; the rounded result is merged into
  /// <paramref name="dest"/> through <c>ClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients in rows of 32 values; only the first 8 rows are read.</param>
  /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/>.</param>
  public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      int i, j;
      Span<int> output = stackalloc int[32 * 32];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[32];
      Span<int> tempOut = stackalloc int[32];

      // The row pass below fills rows 0..7 only, yet the column pass reads all 32 rows.
      // stackalloc memory is not guaranteed to be zeroed, so clear the untouched rows.
      output.Slice(8 * 32).Clear();

      // Rows
      // Only upper-left 8x8 has non-zero coeff
      for (i = 0; i < 8; ++i)
      {
          Idct32(input, outptr);
          input = input.Slice(32);
          outptr = outptr.Slice(32);
      }

      // Columns
      for (i = 0; i < 32; ++i)
      {
          for (j = 0; j < 32; ++j)
          {
              tempIn[j] = output[j * 32 + i];
          }

          Idct32(tempIn, tempOut);

          for (j = 0; j < 32; ++j)
          {
              dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
          }
      }
  }
  /// <summary>
  /// 32x32 inverse transform for a block where only <c>input[0]</c> is used: a single value
  /// is derived from it and merged into every pixel of the 32x32 area through
  /// <c>ClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read (narrowed to 16 bits).</param>
  /// <param name="dest">Pixels to update; element (row, col) is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance, in elements, between consecutive rows of <paramref name="dest"/>.</param>
  public static void Idct32x321Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      // The same scaling is applied twice, once per transform dimension.
      int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      output = WrapLow(DctConstRoundShift(output * CosPi16_64));
      long a1 = BitUtils.RoundPowerOfTwo(output, 6);

      // Address rows by offset instead of re-slicing dest after each row: slicing past the
      // final row throws when fewer than `stride` elements follow it, even though every
      // pixel that is actually written lies inside the span.
      for (int j = 0; j < 32; ++j)
      {
          int rowStart = j * stride;

          for (int i = 0; i < 32; ++i)
          {
              dest[rowStart + i] = ClipPixelAdd(dest[rowStart + i], a1);
          }
      }
  }
  /// <summary>
  /// High bit depth 4x4 inverse Walsh-Hadamard transform over all 16 coefficients; the
  /// result is added to <paramref name="dest"/> through <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">16 coefficients, 4 per row.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth forwarded to the wrap and clip helpers.</param>
  public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
         0.5 shifts per pixel. */
      Span<int> intermediate = stackalloc int[16];

      // First pass: one row of four coefficients at a time.
      ReadOnlySpan<int> rowIn = input;
      Span<int> rowOut = intermediate;
      for (int row = 0; row < 4; row++)
      {
          long a = rowIn[0] >> UnitQuantShift;
          long c = rowIn[1] >> UnitQuantShift;
          long d = rowIn[2] >> UnitQuantShift;
          long b = rowIn[3] >> UnitQuantShift;

          a += c;
          d -= b;
          long e = (a - d) >> 1;
          b = e - b;
          c = e - c;
          a -= b;
          d += c;

          rowOut[0] = HighbdWrapLow(a, bd);
          rowOut[1] = HighbdWrapLow(b, bd);
          rowOut[2] = HighbdWrapLow(c, bd);
          rowOut[3] = HighbdWrapLow(d, bd);

          rowIn = rowIn.Slice(4);
          rowOut = rowOut.Slice(4);
      }

      // Second pass: one column at a time, accumulating into the destination.
      ReadOnlySpan<int> colIn = intermediate;
      for (int col = 0; col < 4; col++)
      {
          long a = colIn[4 * 0];
          long c = colIn[4 * 1];
          long d = colIn[4 * 2];
          long b = colIn[4 * 3];

          a += c;
          d -= b;
          long e = (a - d) >> 1;
          b = e - b;
          c = e - c;
          a -= b;
          d += c;

          dest[stride * 0] = HighbdClipPixelAdd(dest[stride * 0], HighbdWrapLow(a, bd), bd);
          dest[stride * 1] = HighbdClipPixelAdd(dest[stride * 1], HighbdWrapLow(b, bd), bd);
          dest[stride * 2] = HighbdClipPixelAdd(dest[stride * 2], HighbdWrapLow(c, bd), bd);
          dest[stride * 3] = HighbdClipPixelAdd(dest[stride * 3], HighbdWrapLow(d, bd), bd);

          colIn = colIn.Slice(1);
          dest = dest.Slice(1);
      }
  }
  /// <summary>
  /// High bit depth 4x4 inverse Walsh-Hadamard transform that uses only <c>input[0]</c>;
  /// the result is added to <paramref name="dest"/> through <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth forwarded to the wrap and clip helpers.</param>
  public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> firstPass = stackalloc int[4];

      // First pass: split the single coefficient into a "rest" part for position 0 and a
      // "half" part shared by positions 1..3.
      long rest = input[0] >> UnitQuantShift;
      long half = rest >> 1;
      rest -= half;
      firstPass[0] = HighbdWrapLow(rest, bd);
      firstPass[3] = HighbdWrapLow(half, bd);
      firstPass[2] = firstPass[3];
      firstPass[1] = firstPass[3];

      // Second pass: the same split is applied per column and added down the column.
      ReadOnlySpan<int> column = firstPass;
      for (int col = 0; col < 4; col++)
      {
          long colHalf = column[0] >> 1;
          long colRest = column[0] - colHalf;

          dest[stride * 0] = HighbdClipPixelAdd(dest[stride * 0], colRest, bd);
          for (int row = 1; row < 4; row++)
          {
              dest[stride * row] = HighbdClipPixelAdd(dest[stride * row], colHalf, bd);
          }

          column = column.Slice(1);
          dest = dest.Slice(1);
      }
  }
  /// <summary>
  /// High bit depth 4-point inverse ADST: reads four values from <paramref name="input"/>
  /// and writes four values to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">At least 4 input coefficients.</param>
  /// <param name="output">Receives 4 results; zero-filled when the input is rejected or all zero.</param>
  /// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c>.</param>
  public static void HighbdIadst4(ReadOnlySpan<int> input, Span<int> output, int bd)
  {
      int in0 = input[0];
      int in1 = input[1];
      int in2 = input[2];
      int in3 = input[3];

      // Rejected input: assert in debug builds, produce zeros otherwise.
      if (DetectInvalidHighbdInput(input, 4) != 0)
      {
          Debug.Assert(false, "invalid highbd txfm input");
          output.Slice(0, 4).Clear();
          return;
      }

      // All-zero input gives all-zero output; skip the arithmetic.
      if ((in0 | in1 | in2 | in3) == 0)
      {
          output.Slice(0, 4).Clear();
          return;
      }

      long sumA = (long)SinPi1_9 * in0 + (long)SinPi4_9 * in2 + (long)SinPi2_9 * in3;
      long sumB = (long)SinPi2_9 * in0 - (long)SinPi1_9 * in2 - (long)SinPi4_9 * in3;
      long middle = (long)SinPi3_9 * in1;
      long folded = HighbdWrapLow(in0 - in2 + in3, bd);
      long scaledFolded = SinPi3_9 * folded;

      // 1-D transform scaling factor is sqrt(2).
      // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
      // + 1b (addition) = 29b.
      // Hence the output bit depth is 15b.
      output[0] = HighbdWrapLow(DctConstRoundShift(sumA + middle), bd);
      output[1] = HighbdWrapLow(DctConstRoundShift(sumB + middle), bd);
      output[2] = HighbdWrapLow(DctConstRoundShift(scaledFolded), bd);
      output[3] = HighbdWrapLow(DctConstRoundShift(sumA + sumB - middle), bd);
  }
  /// <summary>
  /// High bit depth 4-point inverse DCT: reads four values from <paramref name="input"/>
  /// and writes four values to <paramref name="output"/>.
  /// </summary>
  /// <remarks>
  /// Every input element is consumed before the first output element is written, so the
  /// same span may be passed for both arguments.
  /// </remarks>
  /// <param name="input">At least 4 input coefficients.</param>
  /// <param name="output">Receives 4 results; zero-filled when the input is rejected.</param>
  /// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c>.</param>
  public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
  {
      // Rejected input: assert in debug builds, produce zeros otherwise.
      if (DetectInvalidHighbdInput(input, 4) != 0)
      {
          Debug.Assert(false, "invalid highbd txfm input");
          output.Slice(0, 4).Clear();
          return;
      }

      // stage 1: even pair (inputs 0 and 2) and odd pair (inputs 1 and 3)
      long evenSum = (input[0] + input[2]) * (long)CosPi16_64;
      long evenDiff = (input[0] - input[2]) * (long)CosPi16_64;
      int even0 = HighbdWrapLow(DctConstRoundShift(evenSum), bd);
      int even1 = HighbdWrapLow(DctConstRoundShift(evenDiff), bd);

      long oddLow = input[1] * (long)CosPi24_64 - input[3] * (long)CosPi8_64;
      long oddHigh = input[1] * (long)CosPi8_64 + input[3] * (long)CosPi24_64;
      int odd0 = HighbdWrapLow(DctConstRoundShift(oddLow), bd);
      int odd1 = HighbdWrapLow(DctConstRoundShift(oddHigh), bd);

      // stage 2: combine even and odd halves
      output[0] = HighbdWrapLow(even0 + odd1, bd);
      output[1] = HighbdWrapLow(even1 + odd0, bd);
      output[2] = HighbdWrapLow(even1 - odd0, bd);
      output[3] = HighbdWrapLow(even0 - odd1, bd);
  }
  /// <summary>
  /// High bit depth 4x4 inverse DCT over all 16 coefficients: a row pass followed by a
  /// column pass, with the result added to <paramref name="dest"/> through
  /// <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">16 coefficients, 4 per row.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth forwarded to the transform and clip helpers.</param>
  public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> rowResults = stackalloc int[4 * 4];
      Span<int> column = stackalloc int[4];
      Span<int> columnResult = stackalloc int[4];

      // Rows
      Span<int> rowOut = rowResults;
      for (int row = 0; row < 4; ++row)
      {
          HighbdIdct4(input, rowOut, bd);
          input = input.Slice(4);
          rowOut = rowOut.Slice(4);
      }

      // Columns
      for (int col = 0; col < 4; ++col)
      {
          // Gather the column into a contiguous buffer.
          for (int row = 0; row < 4; ++row)
          {
              column[row] = rowResults[row * 4 + col];
          }

          HighbdIdct4(column, columnResult, bd);

          // Scale down by 4 bits (rounded) and accumulate into the destination.
          for (int row = 0; row < 4; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnResult[row], 4), bd);
          }
      }
  }
  /// <summary>
  /// High bit depth 4x4 inverse DCT for a block where only <c>input[0]</c> is used: a
  /// single value is derived from it and added to every pixel of the 4x4 destination area.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth forwarded to the wrap and clip helpers.</param>
  public static void HighbdIdct4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      // The same scaling is applied twice, once for each 1-D pass of the 2-D transform.
      int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
      long a1 = BitUtils.RoundPowerOfTwo(output, 4);

      for (int i = 0; i < 4; i++)
      {
          // Slice each row from the original span. Advancing the span by one stride after
          // every row would also advance it after the last one, which throws when the
          // destination ends less than a full stride past the start of the last row.
          Span<ushort> row = dest.Slice(i * stride);

          row[0] = HighbdClipPixelAdd(row[0], a1, bd);
          row[1] = HighbdClipPixelAdd(row[1], a1, bd);
          row[2] = HighbdClipPixelAdd(row[2], a1, bd);
          row[3] = HighbdClipPixelAdd(row[3], a1, bd);
      }
  }
  /// <summary>
  /// High bit depth 8-point inverse ADST: reads eight values from <paramref name="input"/>
  /// and writes eight values to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">At least 8 input coefficients.</param>
  /// <param name="output">Receives 8 results; zero-filled when the input is rejected or all zero.</param>
  /// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c>.</param>
  1491. public static void HighbdIadst8(ReadOnlySpan<int> input, Span<int> output, int bd)
  1492. {
  1493. long s0, s1, s2, s3, s4, s5, s6, s7;
  // The inputs are loaded in a permuted order; x0..x7 are then reused as the working
  // values of every stage.
  1494. int x0 = input[7];
  1495. int x1 = input[0];
  1496. int x2 = input[5];
  1497. int x3 = input[2];
  1498. int x4 = input[3];
  1499. int x5 = input[4];
  1500. int x6 = input[1];
  1501. int x7 = input[6];
  // Rejected input: assert in debug builds, produce zeros otherwise.
  1502. if (DetectInvalidHighbdInput(input, 8) != 0)
  1503. {
  1504. Debug.Assert(false, "invalid highbd txfm input");
  1505. output.Slice(0, 8).Fill(0);
  1506. return;
  1507. }
  // All-zero input gives all-zero output; skip the arithmetic.
  1508. if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0)
  1509. {
  1510. output.Slice(0, 8).Fill(0);
  1511. return;
  1512. }
  1513. // stage 1
  // Products are formed in 64-bit (each constant is cast to long before multiplying).
  1514. s0 = (long)CosPi2_64 * x0 + (long)CosPi30_64 * x1;
  1515. s1 = (long)CosPi30_64 * x0 - (long)CosPi2_64 * x1;
  1516. s2 = (long)CosPi10_64 * x2 + (long)CosPi22_64 * x3;
  1517. s3 = (long)CosPi22_64 * x2 - (long)CosPi10_64 * x3;
  1518. s4 = (long)CosPi18_64 * x4 + (long)CosPi14_64 * x5;
  1519. s5 = (long)CosPi14_64 * x4 - (long)CosPi18_64 * x5;
  1520. s6 = (long)CosPi26_64 * x6 + (long)CosPi6_64 * x7;
  1521. s7 = (long)CosPi6_64 * x6 - (long)CosPi26_64 * x7;
  1522. x0 = HighbdWrapLow(DctConstRoundShift(s0 + s4), bd);
  1523. x1 = HighbdWrapLow(DctConstRoundShift(s1 + s5), bd);
  1524. x2 = HighbdWrapLow(DctConstRoundShift(s2 + s6), bd);
  1525. x3 = HighbdWrapLow(DctConstRoundShift(s3 + s7), bd);
  1526. x4 = HighbdWrapLow(DctConstRoundShift(s0 - s4), bd);
  1527. x5 = HighbdWrapLow(DctConstRoundShift(s1 - s5), bd);
  1528. x6 = HighbdWrapLow(DctConstRoundShift(s2 - s6), bd);
  1529. x7 = HighbdWrapLow(DctConstRoundShift(s3 - s7), bd);
  1530. // stage 2
  // x0..x3 are carried over unscaled, so their sums below need no round-shift;
  // x4..x7 are multiplied by constants and are round-shifted afterwards.
  1531. s0 = x0;
  1532. s1 = x1;
  1533. s2 = x2;
  1534. s3 = x3;
  1535. s4 = (long)CosPi8_64 * x4 + (long)CosPi24_64 * x5;
  1536. s5 = (long)CosPi24_64 * x4 - (long)CosPi8_64 * x5;
  1537. s6 = (long)(-CosPi24_64) * x6 + (long)CosPi8_64 * x7;
  1538. s7 = (long)CosPi8_64 * x6 + (long)CosPi24_64 * x7;
  1539. x0 = HighbdWrapLow(s0 + s2, bd);
  1540. x1 = HighbdWrapLow(s1 + s3, bd);
  1541. x2 = HighbdWrapLow(s0 - s2, bd);
  1542. x3 = HighbdWrapLow(s1 - s3, bd);
  1543. x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
  1544. x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
  1545. x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
  1546. x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
  1547. // stage 3
  // Only x2, x3, x6 and x7 change here; the sums and differences inside the parentheses
  // are evaluated in 32-bit int before being widened by the multiplication.
  1548. s2 = (long)CosPi16_64 * (x2 + x3);
  1549. s3 = (long)CosPi16_64 * (x2 - x3);
  1550. s6 = (long)CosPi16_64 * (x6 + x7);
  1551. s7 = (long)CosPi16_64 * (x6 - x7);
  1552. x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
  1553. x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
  1554. x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
  1555. x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
  // Final permutation; every odd-indexed output is negated.
  1556. output[0] = HighbdWrapLow(x0, bd);
  1557. output[1] = HighbdWrapLow(-x4, bd);
  1558. output[2] = HighbdWrapLow(x6, bd);
  1559. output[3] = HighbdWrapLow(-x2, bd);
  1560. output[4] = HighbdWrapLow(x3, bd);
  1561. output[5] = HighbdWrapLow(-x7, bd);
  1562. output[6] = HighbdWrapLow(x5, bd);
  1563. output[7] = HighbdWrapLow(-x1, bd);
  1564. }
  /// <summary>
  /// High bit depth 8-point inverse DCT: reads eight values from <paramref name="input"/>
  /// and writes eight values to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">At least 8 input coefficients.</param>
  /// <param name="output">Receives 8 results; zero-filled when the input is rejected.</param>
  /// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c> and <c>HighbdIdct4</c>.</param>
  public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
  {
      // Slots 0..3 hold the even half, slots 4..7 the odd half after stage 1.
      Span<int> work = stackalloc int[8];

      // Rejected input: assert in debug builds, produce zeros otherwise.
      if (DetectInvalidHighbdInput(input, 8) != 0)
      {
          Debug.Assert(false, "invalid highbd txfm input");
          output.Slice(0, 8).Clear();
          return;
      }

      // stage 1
      work[0] = input[0];
      work[1] = input[2];
      work[2] = input[4];
      work[3] = input[6];

      long rotLow = input[1] * (long)CosPi28_64 - input[7] * (long)CosPi4_64;
      long rotHigh = input[1] * (long)CosPi4_64 + input[7] * (long)CosPi28_64;
      work[4] = HighbdWrapLow(DctConstRoundShift(rotLow), bd);
      work[7] = HighbdWrapLow(DctConstRoundShift(rotHigh), bd);

      rotLow = input[5] * (long)CosPi12_64 - input[3] * (long)CosPi20_64;
      rotHigh = input[5] * (long)CosPi20_64 + input[3] * (long)CosPi12_64;
      work[5] = HighbdWrapLow(DctConstRoundShift(rotLow), bd);
      work[6] = HighbdWrapLow(DctConstRoundShift(rotHigh), bd);

      // stage 2 & stage 3 - even half
      // In-place call: the 4-point transform replaces work[0..3] with its own result.
      HighbdIdct4(work, work, bd);

      // stage 2 - odd half
      int oddSum45 = HighbdWrapLow(work[4] + work[5], bd);
      int oddDiff45 = HighbdWrapLow(work[4] - work[5], bd);
      int oddDiff76 = HighbdWrapLow(-work[6] + work[7], bd);
      int oddSum67 = HighbdWrapLow(work[6] + work[7], bd);

      // stage 3 - odd half
      long midLow = (oddDiff76 - oddDiff45) * (long)CosPi16_64;
      long midHigh = (oddDiff45 + oddDiff76) * (long)CosPi16_64;
      int odd5 = HighbdWrapLow(DctConstRoundShift(midLow), bd);
      int odd6 = HighbdWrapLow(DctConstRoundShift(midHigh), bd);

      // stage 4
      output[0] = HighbdWrapLow(work[0] + oddSum67, bd);
      output[1] = HighbdWrapLow(work[1] + odd6, bd);
      output[2] = HighbdWrapLow(work[2] + odd5, bd);
      output[3] = HighbdWrapLow(work[3] + oddSum45, bd);
      output[4] = HighbdWrapLow(work[3] - oddSum45, bd);
      output[5] = HighbdWrapLow(work[2] - odd5, bd);
      output[6] = HighbdWrapLow(work[1] - odd6, bd);
      output[7] = HighbdWrapLow(work[0] - oddSum67, bd);
  }
  /// <summary>
  /// High bit depth 8x8 inverse DCT over all 64 coefficients: a row pass followed by a
  /// column pass, with the result added to <paramref name="dest"/> through
  /// <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">64 coefficients, 8 per row.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth forwarded to the transform and clip helpers.</param>
  public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> rowResults = stackalloc int[8 * 8];
      Span<int> column = stackalloc int[8];
      Span<int> columnResult = stackalloc int[8];

      // First transform rows
      Span<int> rowOut = rowResults;
      for (int row = 0; row < 8; ++row)
      {
          HighbdIdct8(input, rowOut, bd);
          input = input.Slice(8);
          rowOut = rowOut.Slice(8);
      }

      // Then transform columns
      for (int col = 0; col < 8; ++col)
      {
          // Gather the column into a contiguous buffer.
          for (int row = 0; row < 8; ++row)
          {
              column[row] = rowResults[row * 8 + col];
          }

          HighbdIdct8(column, columnResult, bd);

          // Scale down by 5 bits (rounded) and accumulate into the destination.
          for (int row = 0; row < 8; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnResult[row], 5), bd);
          }
      }
  }
  /// <summary>
  /// High bit depth 8x8 inverse DCT for blocks where only the first 4 rows hold non-zero
  /// coefficients. The result is added to <paramref name="dest"/> through
  /// <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients laid out 8 per row; only the first 4 rows are transformed.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth forwarded to the transform and clip helpers.</param>
  public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      Span<int> output = stackalloc int[8 * 8];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[8];
      Span<int> tempOut = stackalloc int[8];

      // The column pass below reads all 8 rows of the intermediate buffer, but the row
      // pass only produces 4 of them. The initial content of stackalloc memory is
      // undefined (it is not zeroed when locals-init is skipped), so zero it explicitly.
      output.Clear();

      // First transform rows
      // Only first 4 row has non-zero coefs
      for (i = 0; i < 4; ++i)
      {
          HighbdIdct8(input, outptr, bd);
          input = input.Slice(8);
          outptr = outptr.Slice(8);
      }

      // Then transform columns
      for (i = 0; i < 8; ++i)
      {
          // Gather column i of the row-pass result into a contiguous buffer.
          for (j = 0; j < 8; ++j)
          {
              tempIn[j] = output[j * 8 + i];
          }

          HighbdIdct8(tempIn, tempOut, bd);

          // Scale the result down by 5 bits (rounded) and accumulate into the destination.
          for (j = 0; j < 8; ++j)
          {
              dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd);
          }
      }
  }
  /// <summary>
  /// High bit depth 8x8 inverse DCT for a block where only <c>input[0]</c> is used: a
  /// single value is derived from it and added to every pixel of the 8x8 destination area.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth forwarded to the wrap and clip helpers.</param>
  public static void vpx_Highbdidct8x8_1_add_c(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      // The same scaling is applied twice, once for each 1-D pass of the 2-D transform.
      int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
      long a1 = BitUtils.RoundPowerOfTwo(output, 5);

      for (int j = 0; j < 8; ++j)
      {
          // Slice each row from the original span. Advancing the span by one stride after
          // every row would also advance it after the last one, which throws when the
          // destination ends less than a full stride past the start of the last row.
          Span<ushort> row = dest.Slice(j * stride);

          for (int i = 0; i < 8; ++i)
          {
              row[i] = HighbdClipPixelAdd(row[i], a1, bd);
          }
      }
  }
  /// <summary>
  /// High bit depth 16-point inverse ADST: reads sixteen values from <paramref name="input"/>
  /// and writes sixteen values to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">At least 16 input coefficients.</param>
  /// <param name="output">Receives 16 results; zero-filled when the input is rejected or all zero.</param>
  /// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c>.</param>
  1686. public static void HighbdIadst16(ReadOnlySpan<int> input, Span<int> output, int bd)
  1687. {
  1688. long s0, s1, s2, s3, s4, s5, s6, s7, s8;
  1689. long s9, s10, s11, s12, s13, s14, s15;
  // The inputs are loaded in a permuted order; x0..x15 are then reused as the working
  // values of every stage.
  1690. int x0 = input[15];
  1691. int x1 = input[0];
  1692. int x2 = input[13];
  1693. int x3 = input[2];
  1694. int x4 = input[11];
  1695. int x5 = input[4];
  1696. int x6 = input[9];
  1697. int x7 = input[6];
  1698. int x8 = input[7];
  1699. int x9 = input[8];
  1700. int x10 = input[5];
  1701. int x11 = input[10];
  1702. int x12 = input[3];
  1703. int x13 = input[12];
  1704. int x14 = input[1];
  1705. int x15 = input[14];
  // Rejected input: assert in debug builds, produce zeros otherwise.
  1706. if (DetectInvalidHighbdInput(input, 16) != 0)
  1707. {
  1708. Debug.Assert(false, "invalid highbd txfm input");
  1709. output.Slice(0, 16).Fill(0);
  1710. return;
  1711. }
  // All-zero input gives all-zero output; skip the arithmetic.
  1712. if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0)
  1713. {
  1714. output.Slice(0, 16).Fill(0);
  1715. return;
  1716. }
  1717. // stage 1
  // Products are formed in 64-bit (each constant is cast to long before multiplying).
  1718. s0 = x0 * (long)CosPi1_64 + x1 * (long)CosPi31_64;
  1719. s1 = x0 * (long)CosPi31_64 - x1 * (long)CosPi1_64;
  1720. s2 = x2 * (long)CosPi5_64 + x3 * (long)CosPi27_64;
  1721. s3 = x2 * (long)CosPi27_64 - x3 * (long)CosPi5_64;
  1722. s4 = x4 * (long)CosPi9_64 + x5 * (long)CosPi23_64;
  1723. s5 = x4 * (long)CosPi23_64 - x5 * (long)CosPi9_64;
  1724. s6 = x6 * (long)CosPi13_64 + x7 * (long)CosPi19_64;
  1725. s7 = x6 * (long)CosPi19_64 - x7 * (long)CosPi13_64;
  1726. s8 = x8 * (long)CosPi17_64 + x9 * (long)CosPi15_64;
  1727. s9 = x8 * (long)CosPi15_64 - x9 * (long)CosPi17_64;
  1728. s10 = x10 * (long)CosPi21_64 + x11 * (long)CosPi11_64;
  1729. s11 = x10 * (long)CosPi11_64 - x11 * (long)CosPi21_64;
  1730. s12 = x12 * (long)CosPi25_64 + x13 * (long)CosPi7_64;
  1731. s13 = x12 * (long)CosPi7_64 - x13 * (long)CosPi25_64;
  1732. s14 = x14 * (long)CosPi29_64 + x15 * (long)CosPi3_64;
  1733. s15 = x14 * (long)CosPi3_64 - x15 * (long)CosPi29_64;
  1734. x0 = HighbdWrapLow(DctConstRoundShift(s0 + s8), bd);
  1735. x1 = HighbdWrapLow(DctConstRoundShift(s1 + s9), bd);
  1736. x2 = HighbdWrapLow(DctConstRoundShift(s2 + s10), bd);
  1737. x3 = HighbdWrapLow(DctConstRoundShift(s3 + s11), bd);
  1738. x4 = HighbdWrapLow(DctConstRoundShift(s4 + s12), bd);
  1739. x5 = HighbdWrapLow(DctConstRoundShift(s5 + s13), bd);
  1740. x6 = HighbdWrapLow(DctConstRoundShift(s6 + s14), bd);
  1741. x7 = HighbdWrapLow(DctConstRoundShift(s7 + s15), bd);
  1742. x8 = HighbdWrapLow(DctConstRoundShift(s0 - s8), bd);
  1743. x9 = HighbdWrapLow(DctConstRoundShift(s1 - s9), bd);
  1744. x10 = HighbdWrapLow(DctConstRoundShift(s2 - s10), bd);
  1745. x11 = HighbdWrapLow(DctConstRoundShift(s3 - s11), bd);
  1746. x12 = HighbdWrapLow(DctConstRoundShift(s4 - s12), bd);
  1747. x13 = HighbdWrapLow(DctConstRoundShift(s5 - s13), bd);
  1748. x14 = HighbdWrapLow(DctConstRoundShift(s6 - s14), bd);
  1749. x15 = HighbdWrapLow(DctConstRoundShift(s7 - s15), bd);
  1750. // stage 2
  // x0..x7 are carried over unscaled, so their sums below need no round-shift;
  // x8..x15 are multiplied by constants and are round-shifted afterwards.
  1751. s0 = x0;
  1752. s1 = x1;
  1753. s2 = x2;
  1754. s3 = x3;
  1755. s4 = x4;
  1756. s5 = x5;
  1757. s6 = x6;
  1758. s7 = x7;
  1759. s8 = x8 * (long)CosPi4_64 + x9 * (long)CosPi28_64;
  1760. s9 = x8 * (long)CosPi28_64 - x9 * (long)CosPi4_64;
  1761. s10 = x10 * (long)CosPi20_64 + x11 * (long)CosPi12_64;
  1762. s11 = x10 * (long)CosPi12_64 - x11 * (long)CosPi20_64;
  1763. s12 = -x12 * (long)CosPi28_64 + x13 * (long)CosPi4_64;
  1764. s13 = x12 * (long)CosPi4_64 + x13 * (long)CosPi28_64;
  1765. s14 = -x14 * (long)CosPi12_64 + x15 * (long)CosPi20_64;
  1766. s15 = x14 * (long)CosPi20_64 + x15 * (long)CosPi12_64;
  1767. x0 = HighbdWrapLow(s0 + s4, bd);
  1768. x1 = HighbdWrapLow(s1 + s5, bd);
  1769. x2 = HighbdWrapLow(s2 + s6, bd);
  1770. x3 = HighbdWrapLow(s3 + s7, bd);
  1771. x4 = HighbdWrapLow(s0 - s4, bd);
  1772. x5 = HighbdWrapLow(s1 - s5, bd);
  1773. x6 = HighbdWrapLow(s2 - s6, bd);
  1774. x7 = HighbdWrapLow(s3 - s7, bd);
  1775. x8 = HighbdWrapLow(DctConstRoundShift(s8 + s12), bd);
  1776. x9 = HighbdWrapLow(DctConstRoundShift(s9 + s13), bd);
  1777. x10 = HighbdWrapLow(DctConstRoundShift(s10 + s14), bd);
  1778. x11 = HighbdWrapLow(DctConstRoundShift(s11 + s15), bd);
  1779. x12 = HighbdWrapLow(DctConstRoundShift(s8 - s12), bd);
  1780. x13 = HighbdWrapLow(DctConstRoundShift(s9 - s13), bd);
  1781. x14 = HighbdWrapLow(DctConstRoundShift(s10 - s14), bd);
  1782. x15 = HighbdWrapLow(DctConstRoundShift(s11 - s15), bd);
  1783. // stage 3
  // Same pattern as stage 2 applied within each half: x0..x3 and x8..x11 are carried
  // over unscaled, x4..x7 and x12..x15 are multiplied and round-shifted.
  1784. s0 = x0;
  1785. s1 = x1;
  1786. s2 = x2;
  1787. s3 = x3;
  1788. s4 = x4 * (long)CosPi8_64 + x5 * (long)CosPi24_64;
  1789. s5 = x4 * (long)CosPi24_64 - x5 * (long)CosPi8_64;
  1790. s6 = -x6 * (long)CosPi24_64 + x7 * (long)CosPi8_64;
  1791. s7 = x6 * (long)CosPi8_64 + x7 * (long)CosPi24_64;
  1792. s8 = x8;
  1793. s9 = x9;
  1794. s10 = x10;
  1795. s11 = x11;
  1796. s12 = x12 * (long)CosPi8_64 + x13 * (long)CosPi24_64;
  1797. s13 = x12 * (long)CosPi24_64 - x13 * (long)CosPi8_64;
  1798. s14 = -x14 * (long)CosPi24_64 + x15 * (long)CosPi8_64;
  1799. s15 = x14 * (long)CosPi8_64 + x15 * (long)CosPi24_64;
  1800. x0 = HighbdWrapLow(s0 + s2, bd);
  1801. x1 = HighbdWrapLow(s1 + s3, bd);
  1802. x2 = HighbdWrapLow(s0 - s2, bd);
  1803. x3 = HighbdWrapLow(s1 - s3, bd);
  1804. x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
  1805. x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
  1806. x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
  1807. x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
  1808. x8 = HighbdWrapLow(s8 + s10, bd);
  1809. x9 = HighbdWrapLow(s9 + s11, bd);
  1810. x10 = HighbdWrapLow(s8 - s10, bd);
  1811. x11 = HighbdWrapLow(s9 - s11, bd);
  1812. x12 = HighbdWrapLow(DctConstRoundShift(s12 + s14), bd);
  1813. x13 = HighbdWrapLow(DctConstRoundShift(s13 + s15), bd);
  1814. x14 = HighbdWrapLow(DctConstRoundShift(s12 - s14), bd);
  1815. x15 = HighbdWrapLow(DctConstRoundShift(s13 - s15), bd);
  1816. // stage 4
  // Only x2, x3, x6, x7, x10, x11, x14 and x15 change here; the sums and differences
  // inside the parentheses are evaluated in 32-bit int before being widened.
  1817. s2 = (long)(-CosPi16_64) * (x2 + x3);
  1818. s3 = (long)CosPi16_64 * (x2 - x3);
  1819. s6 = (long)CosPi16_64 * (x6 + x7);
  1820. s7 = (long)CosPi16_64 * (-x6 + x7);
  1821. s10 = (long)CosPi16_64 * (x10 + x11);
  1822. s11 = (long)CosPi16_64 * (-x10 + x11);
  1823. s14 = (long)(-CosPi16_64) * (x14 + x15);
  1824. s15 = (long)CosPi16_64 * (x14 - x15);
  1825. x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
  1826. x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
  1827. x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
  1828. x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
  1829. x10 = HighbdWrapLow(DctConstRoundShift(s10), bd);
  1830. x11 = HighbdWrapLow(DctConstRoundShift(s11), bd);
  1831. x14 = HighbdWrapLow(DctConstRoundShift(s14), bd);
  1832. x15 = HighbdWrapLow(DctConstRoundShift(s15), bd);
  // Final permutation; outputs 1, 3, 13 and 15 are negated.
  1833. output[0] = HighbdWrapLow(x0, bd);
  1834. output[1] = HighbdWrapLow(-x8, bd);
  1835. output[2] = HighbdWrapLow(x12, bd);
  1836. output[3] = HighbdWrapLow(-x4, bd);
  1837. output[4] = HighbdWrapLow(x6, bd);
  1838. output[5] = HighbdWrapLow(x14, bd);
  1839. output[6] = HighbdWrapLow(x10, bd);
  1840. output[7] = HighbdWrapLow(x2, bd);
  1841. output[8] = HighbdWrapLow(x3, bd);
  1842. output[9] = HighbdWrapLow(x11, bd);
  1843. output[10] = HighbdWrapLow(x15, bd);
  1844. output[11] = HighbdWrapLow(x7, bd);
  1845. output[12] = HighbdWrapLow(x5, bd);
  1846. output[13] = HighbdWrapLow(-x13, bd);
  1847. output[14] = HighbdWrapLow(x9, bd);
  1848. output[15] = HighbdWrapLow(-x1, bd);
  1849. }
  /// <summary>
  /// High bit depth 16-point inverse DCT: reads sixteen values from <paramref name="input"/>
  /// and writes sixteen values to <paramref name="output"/>. The computation alternates
  /// between two 16-element scratch buffers over seven stages.
  /// </summary>
  /// <param name="input">At least 16 input coefficients.</param>
  /// <param name="output">Receives 16 results; zero-filled when the input is rejected.</param>
  /// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c>.</param>
  1850. public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
  1851. {
  1852. Span<int> step1 = stackalloc int[16];
  1853. Span<int> step2 = stackalloc int[16];
  1854. long temp1, temp2;
  // Rejected input: assert in debug builds, produce zeros otherwise.
  1855. if (DetectInvalidHighbdInput(input, 16) != 0)
  1856. {
  1857. Debug.Assert(false, "invalid highbd txfm input");
  1858. output.Slice(0, 16).Fill(0);
  1859. return;
  1860. }
  1861. // stage 1
  // Input reordering. The indices are written as constant divisions (n / 2), e.g.
  // 16 / 2 selects input[8]; all of them evaluate to values in 0..15.
  1862. step1[0] = input[0 / 2];
  1863. step1[1] = input[16 / 2];
  1864. step1[2] = input[8 / 2];
  1865. step1[3] = input[24 / 2];
  1866. step1[4] = input[4 / 2];
  1867. step1[5] = input[20 / 2];
  1868. step1[6] = input[12 / 2];
  1869. step1[7] = input[28 / 2];
  1870. step1[8] = input[2 / 2];
  1871. step1[9] = input[18 / 2];
  1872. step1[10] = input[10 / 2];
  1873. step1[11] = input[26 / 2];
  1874. step1[12] = input[6 / 2];
  1875. step1[13] = input[22 / 2];
  1876. step1[14] = input[14 / 2];
  1877. step1[15] = input[30 / 2];
  1878. // stage 2
  // Slots 0..7 are copied unchanged; slots 8..15 are processed in pairs (8,15), (9,14),
  // (10,13), (11,12) with 64-bit products that are round-shifted afterwards.
  1879. step2[0] = step1[0];
  1880. step2[1] = step1[1];
  1881. step2[2] = step1[2];
  1882. step2[3] = step1[3];
  1883. step2[4] = step1[4];
  1884. step2[5] = step1[5];
  1885. step2[6] = step1[6];
  1886. step2[7] = step1[7];
  1887. temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
  1888. temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
  1889. step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1890. step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1891. temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
  1892. temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
  1893. step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1894. step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1895. temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
  1896. temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
  1897. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1898. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1899. temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
  1900. temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
  1901. step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1902. step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1903. // stage 3
  // Slots 0..3 are copied; slots 4..7 are processed in pairs (4,7) and (5,6); slots
  // 8..15 are combined by plain sums and differences of neighbouring pairs.
  1904. step1[0] = step2[0];
  1905. step1[1] = step2[1];
  1906. step1[2] = step2[2];
  1907. step1[3] = step2[3];
  1908. temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
  1909. temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
  1910. step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1911. step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1912. temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
  1913. temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
  1914. step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1915. step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1916. step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
  1917. step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
  1918. step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
  1919. step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
  1920. step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
  1921. step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
  1922. step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
  1923. step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
  1924. // stage 4
  // Slots 8, 11, 12 and 15 are copied; pairs (9,14) and (10,13) are multiplied and
  // round-shifted. The sums and differences in parentheses are 32-bit int operations.
  1925. temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
  1926. temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
  1927. step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1928. step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1929. temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
  1930. temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
  1931. step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1932. step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1933. step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
  1934. step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
  1935. step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
  1936. step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
  1937. step2[8] = step1[8];
  1938. step2[15] = step1[15];
  1939. temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
  1940. temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
  1941. step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1942. step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1943. temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
  1944. temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
  1945. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1946. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1947. step2[11] = step1[11];
  1948. step2[12] = step1[12];
  1949. // stage 5
  // Slots 4 and 7 are copied; slots 5 and 6 are multiplied and round-shifted; the
  // remaining slots are plain sums and differences.
  1950. step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
  1951. step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
  1952. step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
  1953. step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
  1954. step1[4] = step2[4];
  1955. temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
  1956. temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
  1957. step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1958. step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1959. step1[7] = step2[7];
  1960. step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
  1961. step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
  1962. step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
  1963. step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
  1964. step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
  1965. step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
  1966. step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
  1967. step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
  1968. // stage 6
  // Slots 0..7 are mirrored sums and differences (k with 7 - k); slots 8, 9, 14 and 15
  // are copied; pairs (10,13) and (11,12) are multiplied and round-shifted.
  1969. step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
  1970. step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
  1971. step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
  1972. step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
  1973. step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
  1974. step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
  1975. step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
  1976. step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
  1977. step2[8] = step1[8];
  1978. step2[9] = step1[9];
  1979. temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
  1980. temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
  1981. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1982. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1983. temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
  1984. temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
  1985. step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1986. step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1987. step2[14] = step1[14];
  1988. step2[15] = step1[15];
  1989. // stage 7
  // Final mirrored combination: output[k] = step2[k] + step2[15 - k] for k = 0..7 and
  // output[15 - k] = step2[k] - step2[15 - k].
  1990. output[0] = HighbdWrapLow(step2[0] + step2[15], bd);
  1991. output[1] = HighbdWrapLow(step2[1] + step2[14], bd);
  1992. output[2] = HighbdWrapLow(step2[2] + step2[13], bd);
  1993. output[3] = HighbdWrapLow(step2[3] + step2[12], bd);
  1994. output[4] = HighbdWrapLow(step2[4] + step2[11], bd);
  1995. output[5] = HighbdWrapLow(step2[5] + step2[10], bd);
  1996. output[6] = HighbdWrapLow(step2[6] + step2[9], bd);
  1997. output[7] = HighbdWrapLow(step2[7] + step2[8], bd);
  1998. output[8] = HighbdWrapLow(step2[7] - step2[8], bd);
  1999. output[9] = HighbdWrapLow(step2[6] - step2[9], bd);
  2000. output[10] = HighbdWrapLow(step2[5] - step2[10], bd);
  2001. output[11] = HighbdWrapLow(step2[4] - step2[11], bd);
  2002. output[12] = HighbdWrapLow(step2[3] - step2[12], bd);
  2003. output[13] = HighbdWrapLow(step2[2] - step2[13], bd);
  2004. output[14] = HighbdWrapLow(step2[1] - step2[14], bd);
  2005. output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
  2006. }
  /// <summary>
  /// High bit depth 16x16 inverse DCT over all 256 coefficients: a row pass followed by a
  /// column pass, with the result added to <paramref name="dest"/> through
  /// <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">256 coefficients, 16 per row.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth forwarded to the transform and clip helpers.</param>
  public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> rowResults = stackalloc int[16 * 16];
      Span<int> column = stackalloc int[16];
      Span<int> columnResult = stackalloc int[16];

      // First transform rows
      Span<int> rowOut = rowResults;
      for (int row = 0; row < 16; ++row)
      {
          HighbdIdct16(input, rowOut, bd);
          input = input.Slice(16);
          rowOut = rowOut.Slice(16);
      }

      // Then transform columns
      for (int col = 0; col < 16; ++col)
      {
          // Gather the column into a contiguous buffer.
          for (int row = 0; row < 16; ++row)
          {
              column[row] = rowResults[row * 16 + col];
          }

          HighbdIdct16(column, columnResult, bd);

          // Scale down by 6 bits (rounded) and accumulate into the destination.
          for (int row = 0; row < 16; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
          }
      }
  }
  /// <summary>
  /// High bit depth 16x16 inverse DCT for blocks whose non-zero coefficients all lie in the
  /// upper-left 8x8 area. The result is added to <paramref name="dest"/> through
  /// <c>HighbdClipPixelAdd</c>.
  /// </summary>
  /// <param name="input">Coefficients laid out 16 per row; only the first 8 rows are transformed.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth forwarded to the transform and clip helpers.</param>
  public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      Span<int> output = stackalloc int[16 * 16];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[16];
      Span<int> tempOut = stackalloc int[16];

      // The column pass below reads all 16 rows of the intermediate buffer, but the row
      // pass only produces 8 of them. The initial content of stackalloc memory is
      // undefined (it is not zeroed when locals-init is skipped), so zero it explicitly.
      output.Clear();

      // First transform rows. Since all non-zero dct coefficients are in
      // upper-left 8x8 area, we only need to calculate first 8 rows here.
      for (i = 0; i < 8; ++i)
      {
          HighbdIdct16(input, outptr, bd);
          input = input.Slice(16);
          outptr = outptr.Slice(16);
      }

      // Then transform columns
      for (i = 0; i < 16; ++i)
      {
          // Gather column i of the row-pass result into a contiguous buffer.
          for (j = 0; j < 16; ++j)
          {
              tempIn[j] = output[j * 16 + i];
          }

          HighbdIdct16(tempIn, tempOut, bd);

          // Address the destination by index, as the other 16x16 variants do. Walking a
          // span forward by one stride per row also advances it after the last row, which
          // throws when the destination ends less than a full stride past the start of
          // the last row.
          for (j = 0; j < 16; ++j)
          {
              dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
          }
      }
  }
  /// <summary>
  /// 16x16 high-bit-depth inverse transform for blocks whose non-zero coefficients all lie in
  /// the upper-left 4x4 area. Only the first 4 coefficient rows are transformed; the result is
  /// added to the pixels already stored in <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficient block laid out as rows of 16 values; the first 4 rows are read.</param>
  /// <param name="dest">Pixel buffer updated in place; the pixel at (row, col) is <c>dest[row * stride + col]</c>.</param>
  /// <param name="stride">Number of elements between vertically adjacent pixels in <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdIdct16</c> and <c>HighbdClipPixelAdd</c>.</param>
  public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      Span<int> output = stackalloc int[16 * 16];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[16];
      Span<int> tempOut = stackalloc int[16];

      // The row pass below only produces rows 0..3, but the column pass reads all 16 rows.
      // stackalloc memory has undefined contents, so the untouched rows are zeroed explicitly.
      output.Slice(4 * 16).Clear();

      // First transform rows. Since all non-zero dct coefficients are in
      // upper-left 4x4 area, we only need to calculate first 4 rows here.
      for (i = 0; i < 4; ++i)
      {
          HighbdIdct16(input, outptr, bd);
          input = input.Slice(16);
          outptr = outptr.Slice(16);
      }

      // Then transform columns
      for (i = 0; i < 16; ++i)
      {
          for (j = 0; j < 16; ++j)
          {
              tempIn[j] = output[j * 16 + i];
          }

          HighbdIdct16(tempIn, tempOut, bd);

          for (j = 0; j < 16; ++j)
          {
              dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
          }
      }
  }
  /// <summary>
  /// 16x16 high-bit-depth inverse transform that uses only the first coefficient. The
  /// coefficient is scaled by <c>CosPi16_64</c> twice, passed through
  /// <c>BitUtils.RoundPowerOfTwo(..., 6)</c>, and the resulting single value is added to
  /// every pixel of the 16x16 block in <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficient block; only <c>input[0]</c> is read.</param>
  /// <param name="dest">Pixel buffer updated in place; row <c>j</c> starts at <c>dest[j * stride]</c>.</param>
  /// <param name="stride">Number of elements between vertically adjacent pixels in <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
  public static void HighbdIdct16x161Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      long a1;
      int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
      a1 = BitUtils.RoundPowerOfTwo(output, 6);

      for (j = 0; j < 16; ++j)
      {
          // Slice each row from the start of dest. Advancing a running span by one stride
          // after the last row would throw when dest holds fewer than 16 * stride elements.
          Span<ushort> row = dest.Slice(j * stride);
          for (i = 0; i < 16; ++i)
          {
              row[i] = HighbdClipPixelAdd(row[i], a1, bd);
          }
      }
  }
  /// <summary>
  /// High-bit-depth 32-point inverse transform of one row or column.
  /// Reads <paramref name="input"/>[0..31] and writes <paramref name="output"/>[0..31]. The
  /// butterfly network runs in seven stages plus a final mirror stage, alternating between two
  /// 32-entry scratch buffers (<c>step1</c> and <c>step2</c>).
  /// </summary>
  /// <param name="input">Coefficients; elements 0..31 are read.</param>
  /// <param name="output">Destination; elements 0..31 are written.</param>
  /// <param name="bd">Bit depth, forwarded unchanged to <c>HighbdWrapLow</c>.</param>
  /// <remarks>
  /// When <c>DetectInvalidHighbdInput</c> returns non-zero for the 32 inputs, a debug assertion
  /// fires and the first 32 outputs are zero-filled instead of being transformed.
  /// Every multiplication widens one operand to <c>long</c> first; products are reduced with
  /// <c>DctConstRoundShift</c> and then <c>HighbdWrapLow</c>.
  /// </remarks>
  2111. public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd)
  2112. {
  2113. Span<int> step1 = stackalloc int[32];
  2114. Span<int> step2 = stackalloc int[32];
  2115. long temp1, temp2;
  // Reject inputs flagged by DetectInvalidHighbdInput: assert in debug builds, emit zeros.
  2116. if (DetectInvalidHighbdInput(input, 32) != 0)
  2117. {
  2118. Debug.Assert(false, "invalid highbd txfm input");
  2119. output.Slice(0, 32).Fill(0);
  2120. return;
  2121. }
  2122. // stage 1: even-indexed inputs are permuted into step1[0..15]; odd-indexed inputs are combined in pairs into step1[16..31]
  2123. step1[0] = input[0];
  2124. step1[1] = input[16];
  2125. step1[2] = input[8];
  2126. step1[3] = input[24];
  2127. step1[4] = input[4];
  2128. step1[5] = input[20];
  2129. step1[6] = input[12];
  2130. step1[7] = input[28];
  2131. step1[8] = input[2];
  2132. step1[9] = input[18];
  2133. step1[10] = input[10];
  2134. step1[11] = input[26];
  2135. step1[12] = input[6];
  2136. step1[13] = input[22];
  2137. step1[14] = input[14];
  2138. step1[15] = input[30];
  2139. temp1 = input[1] * (long)CosPi31_64 - input[31] * (long)CosPi1_64;
  2140. temp2 = input[1] * (long)CosPi1_64 + input[31] * (long)CosPi31_64;
  2141. step1[16] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2142. step1[31] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2143. temp1 = input[17] * (long)CosPi15_64 - input[15] * (long)CosPi17_64;
  2144. temp2 = input[17] * (long)CosPi17_64 + input[15] * (long)CosPi15_64;
  2145. step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2146. step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2147. temp1 = input[9] * (long)CosPi23_64 - input[23] * (long)CosPi9_64;
  2148. temp2 = input[9] * (long)CosPi9_64 + input[23] * (long)CosPi23_64;
  2149. step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2150. step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2151. temp1 = input[25] * (long)CosPi7_64 - input[7] * (long)CosPi25_64;
  2152. temp2 = input[25] * (long)CosPi25_64 + input[7] * (long)CosPi7_64;
  2153. step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2154. step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2155. temp1 = input[5] * (long)CosPi27_64 - input[27] * (long)CosPi5_64;
  2156. temp2 = input[5] * (long)CosPi5_64 + input[27] * (long)CosPi27_64;
  2157. step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2158. step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2159. temp1 = input[21] * (long)CosPi11_64 - input[11] * (long)CosPi21_64;
  2160. temp2 = input[21] * (long)CosPi21_64 + input[11] * (long)CosPi11_64;
  2161. step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2162. step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2163. temp1 = input[13] * (long)CosPi19_64 - input[19] * (long)CosPi13_64;
  2164. temp2 = input[13] * (long)CosPi13_64 + input[19] * (long)CosPi19_64;
  2165. step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2166. step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2167. temp1 = input[29] * (long)CosPi3_64 - input[3] * (long)CosPi29_64;
  2168. temp2 = input[29] * (long)CosPi29_64 + input[3] * (long)CosPi3_64;
  2169. step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2170. step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2171. // stage 2: [0..7] copied through; [8..15] combined in mirrored pairs with cosine weights; [16..31] adjacent-pair sums and differences
  2172. step2[0] = step1[0];
  2173. step2[1] = step1[1];
  2174. step2[2] = step1[2];
  2175. step2[3] = step1[3];
  2176. step2[4] = step1[4];
  2177. step2[5] = step1[5];
  2178. step2[6] = step1[6];
  2179. step2[7] = step1[7];
  2180. temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
  2181. temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
  2182. step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2183. step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2184. temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
  2185. temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
  2186. step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2187. step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2188. temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
  2189. temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
  2190. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2191. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2192. temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
  2193. temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
  2194. step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2195. step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2196. step2[16] = HighbdWrapLow(step1[16] + step1[17], bd);
  2197. step2[17] = HighbdWrapLow(step1[16] - step1[17], bd);
  2198. step2[18] = HighbdWrapLow(-step1[18] + step1[19], bd);
  2199. step2[19] = HighbdWrapLow(step1[18] + step1[19], bd);
  2200. step2[20] = HighbdWrapLow(step1[20] + step1[21], bd);
  2201. step2[21] = HighbdWrapLow(step1[20] - step1[21], bd);
  2202. step2[22] = HighbdWrapLow(-step1[22] + step1[23], bd);
  2203. step2[23] = HighbdWrapLow(step1[22] + step1[23], bd);
  2204. step2[24] = HighbdWrapLow(step1[24] + step1[25], bd);
  2205. step2[25] = HighbdWrapLow(step1[24] - step1[25], bd);
  2206. step2[26] = HighbdWrapLow(-step1[26] + step1[27], bd);
  2207. step2[27] = HighbdWrapLow(step1[26] + step1[27], bd);
  2208. step2[28] = HighbdWrapLow(step1[28] + step1[29], bd);
  2209. step2[29] = HighbdWrapLow(step1[28] - step1[29], bd);
  2210. step2[30] = HighbdWrapLow(-step1[30] + step1[31], bd);
  2211. step2[31] = HighbdWrapLow(step1[30] + step1[31], bd);
  2212. // stage 3: [0..3] copied through; [4..7] cosine-weighted pairs; [8..15] pair sums and differences; in [16..31] entries 17/30, 18/29, 21/26 and 22/25 are cosine-weighted, the rest copied
  2213. step1[0] = step2[0];
  2214. step1[1] = step2[1];
  2215. step1[2] = step2[2];
  2216. step1[3] = step2[3];
  2217. temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
  2218. temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
  2219. step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2220. step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2221. temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
  2222. temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
  2223. step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2224. step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2225. step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
  2226. step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
  2227. step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
  2228. step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
  2229. step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
  2230. step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
  2231. step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
  2232. step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
  2233. step1[16] = step2[16];
  2234. step1[31] = step2[31];
  2235. temp1 = -step2[17] * (long)CosPi4_64 + step2[30] * (long)CosPi28_64;
  2236. temp2 = step2[17] * (long)CosPi28_64 + step2[30] * (long)CosPi4_64;
  2237. step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2238. step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2239. temp1 = -step2[18] * (long)CosPi28_64 - step2[29] * (long)CosPi4_64;
  2240. temp2 = -step2[18] * (long)CosPi4_64 + step2[29] * (long)CosPi28_64;
  2241. step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2242. step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2243. step1[19] = step2[19];
  2244. step1[20] = step2[20];
  2245. temp1 = -step2[21] * (long)CosPi20_64 + step2[26] * (long)CosPi12_64;
  2246. temp2 = step2[21] * (long)CosPi12_64 + step2[26] * (long)CosPi20_64;
  2247. step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2248. step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2249. temp1 = -step2[22] * (long)CosPi12_64 - step2[25] * (long)CosPi20_64;
  2250. temp2 = -step2[22] * (long)CosPi20_64 + step2[25] * (long)CosPi12_64;
  2251. step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2252. step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2253. step1[23] = step2[23];
  2254. step1[24] = step2[24];
  2255. step1[27] = step2[27];
  2256. step1[28] = step2[28];
  2257. // stage 4: [0..3] cosine-weighted pairs; [4..7] pair sums and differences; in [8..15] entries 9/14 and 10/13 are cosine-weighted, the rest copied; [16..31] sums and differences within groups of four
  2258. temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
  2259. temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
  2260. step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2261. step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2262. temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
  2263. temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
  2264. step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2265. step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2266. step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
  2267. step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
  2268. step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
  2269. step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
  2270. step2[8] = step1[8];
  2271. step2[15] = step1[15];
  2272. temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
  2273. temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
  2274. step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2275. step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2276. temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
  2277. temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
  2278. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2279. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2280. step2[11] = step1[11];
  2281. step2[12] = step1[12];
  2282. step2[16] = HighbdWrapLow(step1[16] + step1[19], bd);
  2283. step2[17] = HighbdWrapLow(step1[17] + step1[18], bd);
  2284. step2[18] = HighbdWrapLow(step1[17] - step1[18], bd);
  2285. step2[19] = HighbdWrapLow(step1[16] - step1[19], bd);
  2286. step2[20] = HighbdWrapLow(-step1[20] + step1[23], bd);
  2287. step2[21] = HighbdWrapLow(-step1[21] + step1[22], bd);
  2288. step2[22] = HighbdWrapLow(step1[21] + step1[22], bd);
  2289. step2[23] = HighbdWrapLow(step1[20] + step1[23], bd);
  2290. step2[24] = HighbdWrapLow(step1[24] + step1[27], bd);
  2291. step2[25] = HighbdWrapLow(step1[25] + step1[26], bd);
  2292. step2[26] = HighbdWrapLow(step1[25] - step1[26], bd);
  2293. step2[27] = HighbdWrapLow(step1[24] - step1[27], bd);
  2294. step2[28] = HighbdWrapLow(-step1[28] + step1[31], bd);
  2295. step2[29] = HighbdWrapLow(-step1[29] + step1[30], bd);
  2296. step2[30] = HighbdWrapLow(step1[29] + step1[30], bd);
  2297. step2[31] = HighbdWrapLow(step1[28] + step1[31], bd);
  2298. // stage 5: [0..3] sums and differences; 4 and 7 copied, 5/6 scaled by CosPi16_64; [8..15] sums and differences; in [16..31] entries 18..21 and 26..29 are cosine-weighted, the rest copied
  2299. step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
  2300. step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
  2301. step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
  2302. step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
  2303. step1[4] = step2[4];
  2304. temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
  2305. temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
  2306. step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2307. step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2308. step1[7] = step2[7];
  2309. step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
  2310. step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
  2311. step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
  2312. step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
  2313. step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
  2314. step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
  2315. step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
  2316. step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
  2317. step1[16] = step2[16];
  2318. step1[17] = step2[17];
  2319. temp1 = -step2[18] * (long)CosPi8_64 + step2[29] * (long)CosPi24_64;
  2320. temp2 = step2[18] * (long)CosPi24_64 + step2[29] * (long)CosPi8_64;
  2321. step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2322. step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2323. temp1 = -step2[19] * (long)CosPi8_64 + step2[28] * (long)CosPi24_64;
  2324. temp2 = step2[19] * (long)CosPi24_64 + step2[28] * (long)CosPi8_64;
  2325. step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2326. step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2327. temp1 = -step2[20] * (long)CosPi24_64 - step2[27] * (long)CosPi8_64;
  2328. temp2 = -step2[20] * (long)CosPi8_64 + step2[27] * (long)CosPi24_64;
  2329. step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2330. step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2331. temp1 = -step2[21] * (long)CosPi24_64 - step2[26] * (long)CosPi8_64;
  2332. temp2 = -step2[21] * (long)CosPi8_64 + step2[26] * (long)CosPi24_64;
  2333. step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2334. step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2335. step1[22] = step2[22];
  2336. step1[23] = step2[23];
  2337. step1[24] = step2[24];
  2338. step1[25] = step2[25];
  2339. step1[30] = step2[30];
  2340. step1[31] = step2[31];
  2341. // stage 6: [0..7] mirrored sums and differences; 8, 9, 14, 15 copied, 10..13 scaled by CosPi16_64; [16..23] and [24..31] mirrored sums and differences
  2342. step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
  2343. step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
  2344. step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
  2345. step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
  2346. step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
  2347. step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
  2348. step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
  2349. step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
  2350. step2[8] = step1[8];
  2351. step2[9] = step1[9];
  2352. temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
  2353. temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
  2354. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2355. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2356. temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
  2357. temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
  2358. step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2359. step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2360. step2[14] = step1[14];
  2361. step2[15] = step1[15];
  2362. step2[16] = HighbdWrapLow(step1[16] + step1[23], bd);
  2363. step2[17] = HighbdWrapLow(step1[17] + step1[22], bd);
  2364. step2[18] = HighbdWrapLow(step1[18] + step1[21], bd);
  2365. step2[19] = HighbdWrapLow(step1[19] + step1[20], bd);
  2366. step2[20] = HighbdWrapLow(step1[19] - step1[20], bd);
  2367. step2[21] = HighbdWrapLow(step1[18] - step1[21], bd);
  2368. step2[22] = HighbdWrapLow(step1[17] - step1[22], bd);
  2369. step2[23] = HighbdWrapLow(step1[16] - step1[23], bd);
  2370. step2[24] = HighbdWrapLow(-step1[24] + step1[31], bd);
  2371. step2[25] = HighbdWrapLow(-step1[25] + step1[30], bd);
  2372. step2[26] = HighbdWrapLow(-step1[26] + step1[29], bd);
  2373. step2[27] = HighbdWrapLow(-step1[27] + step1[28], bd);
  2374. step2[28] = HighbdWrapLow(step1[27] + step1[28], bd);
  2375. step2[29] = HighbdWrapLow(step1[26] + step1[29], bd);
  2376. step2[30] = HighbdWrapLow(step1[25] + step1[30], bd);
  2377. step2[31] = HighbdWrapLow(step1[24] + step1[31], bd);
  2378. // stage 7: [0..15] mirrored sums and differences; 16..19 and 28..31 copied, 20..27 scaled by CosPi16_64
  2379. step1[0] = HighbdWrapLow(step2[0] + step2[15], bd);
  2380. step1[1] = HighbdWrapLow(step2[1] + step2[14], bd);
  2381. step1[2] = HighbdWrapLow(step2[2] + step2[13], bd);
  2382. step1[3] = HighbdWrapLow(step2[3] + step2[12], bd);
  2383. step1[4] = HighbdWrapLow(step2[4] + step2[11], bd);
  2384. step1[5] = HighbdWrapLow(step2[5] + step2[10], bd);
  2385. step1[6] = HighbdWrapLow(step2[6] + step2[9], bd);
  2386. step1[7] = HighbdWrapLow(step2[7] + step2[8], bd);
  2387. step1[8] = HighbdWrapLow(step2[7] - step2[8], bd);
  2388. step1[9] = HighbdWrapLow(step2[6] - step2[9], bd);
  2389. step1[10] = HighbdWrapLow(step2[5] - step2[10], bd);
  2390. step1[11] = HighbdWrapLow(step2[4] - step2[11], bd);
  2391. step1[12] = HighbdWrapLow(step2[3] - step2[12], bd);
  2392. step1[13] = HighbdWrapLow(step2[2] - step2[13], bd);
  2393. step1[14] = HighbdWrapLow(step2[1] - step2[14], bd);
  2394. step1[15] = HighbdWrapLow(step2[0] - step2[15], bd);
  2395. step1[16] = step2[16];
  2396. step1[17] = step2[17];
  2397. step1[18] = step2[18];
  2398. step1[19] = step2[19];
  2399. temp1 = (-step2[20] + step2[27]) * (long)CosPi16_64;
  2400. temp2 = (step2[20] + step2[27]) * (long)CosPi16_64;
  2401. step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2402. step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2403. temp1 = (-step2[21] + step2[26]) * (long)CosPi16_64;
  2404. temp2 = (step2[21] + step2[26]) * (long)CosPi16_64;
  2405. step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2406. step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2407. temp1 = (-step2[22] + step2[25]) * (long)CosPi16_64;
  2408. temp2 = (step2[22] + step2[25]) * (long)CosPi16_64;
  2409. step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2410. step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2411. temp1 = (-step2[23] + step2[24]) * (long)CosPi16_64;
  2412. temp2 = (step2[23] + step2[24]) * (long)CosPi16_64;
  2413. step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2414. step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2415. step1[28] = step2[28];
  2416. step1[29] = step2[29];
  2417. step1[30] = step2[30];
  2418. step1[31] = step2[31];
  2419. // final stage: output[k] = step1[k] + step1[31 - k] for k < 16, and step1[31 - k] - step1[k] for k >= 16
  2420. output[0] = HighbdWrapLow(step1[0] + step1[31], bd);
  2421. output[1] = HighbdWrapLow(step1[1] + step1[30], bd);
  2422. output[2] = HighbdWrapLow(step1[2] + step1[29], bd);
  2423. output[3] = HighbdWrapLow(step1[3] + step1[28], bd);
  2424. output[4] = HighbdWrapLow(step1[4] + step1[27], bd);
  2425. output[5] = HighbdWrapLow(step1[5] + step1[26], bd);
  2426. output[6] = HighbdWrapLow(step1[6] + step1[25], bd);
  2427. output[7] = HighbdWrapLow(step1[7] + step1[24], bd);
  2428. output[8] = HighbdWrapLow(step1[8] + step1[23], bd);
  2429. output[9] = HighbdWrapLow(step1[9] + step1[22], bd);
  2430. output[10] = HighbdWrapLow(step1[10] + step1[21], bd);
  2431. output[11] = HighbdWrapLow(step1[11] + step1[20], bd);
  2432. output[12] = HighbdWrapLow(step1[12] + step1[19], bd);
  2433. output[13] = HighbdWrapLow(step1[13] + step1[18], bd);
  2434. output[14] = HighbdWrapLow(step1[14] + step1[17], bd);
  2435. output[15] = HighbdWrapLow(step1[15] + step1[16], bd);
  2436. output[16] = HighbdWrapLow(step1[15] - step1[16], bd);
  2437. output[17] = HighbdWrapLow(step1[14] - step1[17], bd);
  2438. output[18] = HighbdWrapLow(step1[13] - step1[18], bd);
  2439. output[19] = HighbdWrapLow(step1[12] - step1[19], bd);
  2440. output[20] = HighbdWrapLow(step1[11] - step1[20], bd);
  2441. output[21] = HighbdWrapLow(step1[10] - step1[21], bd);
  2442. output[22] = HighbdWrapLow(step1[9] - step1[22], bd);
  2443. output[23] = HighbdWrapLow(step1[8] - step1[23], bd);
  2444. output[24] = HighbdWrapLow(step1[7] - step1[24], bd);
  2445. output[25] = HighbdWrapLow(step1[6] - step1[25], bd);
  2446. output[26] = HighbdWrapLow(step1[5] - step1[26], bd);
  2447. output[27] = HighbdWrapLow(step1[4] - step1[27], bd);
  2448. output[28] = HighbdWrapLow(step1[3] - step1[28], bd);
  2449. output[29] = HighbdWrapLow(step1[2] - step1[29], bd);
  2450. output[30] = HighbdWrapLow(step1[1] - step1[30], bd);
  2451. output[31] = HighbdWrapLow(step1[0] - step1[31], bd);
  2452. }
  /// <summary>
  /// Runs the 32x32 high-bit-depth inverse transform over all 32 coefficient rows and adds the
  /// result to the pixels already stored in <paramref name="dest"/>. Rows whose 32 coefficients
  /// are all zero are zero-filled directly instead of being passed to <c>HighbdIdct32</c>.
  /// </summary>
  /// <param name="input">Coefficient block laid out as 32 consecutive rows of 32 values.</param>
  /// <param name="dest">Pixel buffer updated in place; the pixel at (row, col) is <c>dest[row * stride + col]</c>.</param>
  /// <param name="stride">Number of elements between vertically adjacent pixels in <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdIdct32</c> and <c>HighbdClipPixelAdd</c>.</param>
  public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      const int Size = 32;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> columnResult = stackalloc int[Size];

      // Rows
      for (int row = 0; row < Size; row++)
      {
          ReadOnlySpan<int> coeffs = input.Slice(row * Size);
          Span<int> transformed = rowPass.Slice(row * Size);

          // OR the 32 coefficients together; the result is zero only when every one is zero.
          int combined = 0;
          for (int k = 0; k < Size; k++)
          {
              combined |= coeffs[k];
          }

          if (combined == 0)
          {
              transformed.Slice(0, Size).Fill(0);
          }
          else
          {
              HighbdIdct32(coeffs, transformed, bd);
          }
      }

      // Columns
      for (int col = 0; col < Size; col++)
      {
          for (int row = 0; row < Size; row++)
          {
              column[row] = rowPass[row * Size + col];
          }

          HighbdIdct32(column, columnResult, bd);

          for (int row = 0; row < Size; row++)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
          }
      }
  }
  /// <summary>
  /// 32x32 high-bit-depth inverse transform for blocks whose non-zero coefficients all lie in
  /// the upper-left 16x16 area. Only the first 16 coefficient rows are transformed; the result
  /// is added to the pixels already stored in <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficient block laid out as rows of 32 values; the first 16 rows are read.</param>
  /// <param name="dest">Pixel buffer updated in place; the pixel at (row, col) is <c>dest[row * stride + col]</c>.</param>
  /// <param name="stride">Number of elements between vertically adjacent pixels in <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdIdct32</c> and <c>HighbdClipPixelAdd</c>.</param>
  public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      Span<int> output = stackalloc int[32 * 32];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[32];
      Span<int> tempOut = stackalloc int[32];

      // The row pass below only produces rows 0..15, but the column pass reads all 32 rows.
      // stackalloc memory has undefined contents, so the untouched rows are zeroed explicitly.
      output.Slice(16 * 32).Clear();

      // Rows
      // Only upper-left 16x16 has non-zero coeff
      for (i = 0; i < 16; ++i)
      {
          HighbdIdct32(input, outptr, bd);
          input = input.Slice(32);
          outptr = outptr.Slice(32);
      }

      // Columns
      for (i = 0; i < 32; ++i)
      {
          for (j = 0; j < 32; ++j)
          {
              tempIn[j] = output[j * 32 + i];
          }

          HighbdIdct32(tempIn, tempOut, bd);

          for (j = 0; j < 32; ++j)
          {
              // Address each pixel by index: advancing a running slice by one stride after
              // the final row would throw when dest ends inside that row.
              dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
          }
      }
  }
  /// <summary>
  /// 32x32 high-bit-depth inverse transform for blocks whose non-zero coefficients all lie in
  /// the upper-left 8x8 area. Only the first 8 coefficient rows are transformed; the result is
  /// added to the pixels already stored in <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficient block laid out as rows of 32 values; the first 8 rows are read.</param>
  /// <param name="dest">Pixel buffer updated in place; the pixel at (row, col) is <c>dest[row * stride + col]</c>.</param>
  /// <param name="stride">Number of elements between vertically adjacent pixels in <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdIdct32</c> and <c>HighbdClipPixelAdd</c>.</param>
  public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      Span<int> output = stackalloc int[32 * 32];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[32];
      Span<int> tempOut = stackalloc int[32];

      // The row pass below only produces rows 0..7, but the column pass reads all 32 rows.
      // stackalloc memory has undefined contents, so the untouched rows are zeroed explicitly.
      output.Slice(8 * 32).Clear();

      // Rows
      // Only upper-left 8x8 has non-zero coeff
      for (i = 0; i < 8; ++i)
      {
          HighbdIdct32(input, outptr, bd);
          input = input.Slice(32);
          outptr = outptr.Slice(32);
      }

      // Columns
      for (i = 0; i < 32; ++i)
      {
          for (j = 0; j < 32; ++j)
          {
              tempIn[j] = output[j * 32 + i];
          }

          HighbdIdct32(tempIn, tempOut, bd);

          for (j = 0; j < 32; ++j)
          {
              dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
          }
      }
  }
  /// <summary>
  /// 32x32 high-bit-depth inverse transform that uses only the first coefficient. The
  /// coefficient is scaled by <c>CosPi16_64</c> twice, passed through
  /// <c>BitUtils.RoundPowerOfTwo(..., 6)</c>, and the resulting single value is added to
  /// every pixel of the 32x32 block in <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficient block; only <c>input[0]</c> is read.</param>
  /// <param name="dest">Pixel buffer updated in place; row <c>j</c> starts at <c>dest[j * stride]</c>.</param>
  /// <param name="stride">Number of elements between vertically adjacent pixels in <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
  public static void HighbdIdct32x321Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      int a1;
      int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
      a1 = BitUtils.RoundPowerOfTwo(output, 6);

      for (j = 0; j < 32; ++j)
      {
          // Slice each row from the start of dest. Advancing a running span by one stride
          // after the last row would throw when dest holds fewer than 32 * stride elements.
          Span<ushort> row = dest.Slice(j * stride);
          for (i = 0; i < 32; ++i)
          {
              row[i] = HighbdClipPixelAdd(row[i], a1, bd);
          }
      }
  }
  2569. }
  2570. }