InvTxfm.cs 122 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
7277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918
  1. using System;
  2. using System.Diagnostics;
  3. using System.Runtime.CompilerServices;
  4. using Ryujinx.Graphics.Nvdec.Vp9.Common;
  5. using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.TxfmCommon;
  6. namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
  7. {
  8. internal static class InvTxfm
  9. {
  // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
  // transform amplify bits + 1 bit for contingency in rounding and quantizing
  private const int HighbdValidTxfmMagnitudeRange = (1 << 25);

  /// <summary>
  /// Scans the first <paramref name="size"/> coefficients of <paramref name="input"/> and reports
  /// whether any of them has a magnitude of at least <see cref="HighbdValidTxfmMagnitudeRange"/>.
  /// </summary>
  /// <param name="input">Coefficients to inspect.</param>
  /// <param name="size">Number of leading coefficients to inspect.</param>
  /// <returns>1 if an out-of-range coefficient is found, otherwise 0.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static int DetectInvalidHighbdInput(ReadOnlySpan<int> input, int size)
  {
      for (int i = 0; i < size; ++i)
      {
          // Widen to long before taking the magnitude: Math.Abs(int) throws
          // OverflowException for int.MinValue, which must be reported as invalid
          // rather than crash the check.
          if (Math.Abs((long)input[i]) >= HighbdValidTxfmMagnitudeRange)
          {
              return 1;
          }
      }

      return 0;
  }
  /// <summary>
  /// Debug-build sanity check on an intermediate coefficient; the value is returned unchanged.
  /// </summary>
  /// <param name="input">Intermediate stage coefficient.</param>
  /// <returns><paramref name="input"/>, unmodified.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static long CheckRange(long input)
  {
      // Valid VP9 streams keep intermediate stage coefficients inside the signed
      // 16-bit range; only invalid/corrupt streams can leave it.
      Debug.Assert(input >= short.MinValue);
      Debug.Assert(input <= short.MaxValue);

      return input;
  }
  /// <summary>
  /// Debug-build sanity check on a high-bit-depth intermediate coefficient; the value is
  /// returned unchanged.
  /// </summary>
  /// <param name="input">Intermediate stage coefficient.</param>
  /// <param name="bd">Bit depth; the asserted range is a signed (8 + bd)-bit integer.</param>
  /// <returns><paramref name="input"/>, unmodified.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static long HighbdCheckRange(long input, int bd)
  {
      // For valid highbitdepth VP9 streams, intermediate stage coefficients will
      // stay within the ranges:
      // - 8 bit: signed 16 bit integer
      // - 10 bit: signed 18 bit integer
      // - 12 bit: signed 20 bit integer
      int upperBound = (1 << (7 + bd)) - 1;
      int lowerBound = -upperBound - 1;

      Debug.Assert(input >= lowerBound);
      Debug.Assert(input <= upperBound);

      return input;
  }
  /// <summary>
  /// Range-checks <paramref name="x"/> (debug builds) and truncates it to a signed 16-bit value,
  /// returned as an int.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static int WrapLow(long x) => (short)CheckRange(x);
  /// <summary>
  /// Range-checks <paramref name="x"/> (debug builds) and sign-extends its low (8 + bd) bits
  /// by shifting left and then arithmetically right by (24 - bd).
  /// </summary>
  /// <param name="x">Value to wrap.</param>
  /// <param name="bd">Bit depth.</param>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static int HighbdWrapLow(long x, int bd)
  {
      int shift = 24 - bd;
      int narrowed = (int)HighbdCheckRange(x, bd);

      return (narrowed << shift) >> shift;
  }
  /// <summary>
  /// Adds a wrapped residual to an 8-bit pixel and clips the sum via
  /// <c>BitUtils.ClipPixel</c>.
  /// </summary>
  /// <param name="dest">Current pixel value.</param>
  /// <param name="trans">Residual to add; wrapped with <c>WrapLow</c> first.</param>
  /// <returns>The clipped pixel value.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static byte ClipPixelAdd(byte dest, long trans)
  {
      int residual = WrapLow(trans);

      return BitUtils.ClipPixel(dest + residual);
  }
  /// <summary>
  /// Adds a wrapped residual to a high-bit-depth pixel and clips the sum via
  /// <c>BitUtils.ClipPixelHighbd</c>.
  /// </summary>
  /// <param name="dest">Current pixel value.</param>
  /// <param name="trans">Residual to add; wrapped with <c>HighbdWrapLow</c> first.</param>
  /// <param name="bd">Bit depth, forwarded to the wrap and clip helpers.</param>
  /// <returns>The clipped pixel value.</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static ushort HighbdClipPixelAdd(ushort dest, long trans, int bd)
  {
      int residual = HighbdWrapLow(trans, bd);

      return BitUtils.ClipPixelHighbd(dest + residual, bd);
  }
  /// <summary>
  /// Rounds <paramref name="input"/> and shifts it right by <c>DctConstBits</c> using
  /// <c>BitUtils.RoundPowerOfTwo</c>.
  /// </summary>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static long DctConstRoundShift(long input) => BitUtils.RoundPowerOfTwo(input, DctConstBits);
  /// <summary>
  /// Runs the 4x4 inverse Walsh-Hadamard transform over <paramref name="input"/> (rows first,
  /// then columns) and adds the result to the 4x4 pixel block at the start of
  /// <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients, read as four consecutive rows of four values.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent destination pixels.</param>
  [SkipLocalsInit]
  public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
         0.5 shifts per pixel. */
      Span<int> rows = stackalloc int[16];

      // Horizontal pass: one row of four coefficients per iteration.
      for (int row = 0; row < 4; row++)
      {
          ReadOnlySpan<int> src = input.Slice(row * 4);
          Span<int> dst = rows.Slice(row * 4);

          long a = src[0] >> UnitQuantShift;
          long c = src[1] >> UnitQuantShift;
          long d = src[2] >> UnitQuantShift;
          long b = src[3] >> UnitQuantShift;

          a += c;
          d -= b;
          long e = (a - d) >> 1;
          b = e - b;
          c = e - c;
          a -= b;
          d += c;

          dst[0] = WrapLow(a);
          dst[1] = WrapLow(b);
          dst[2] = WrapLow(c);
          dst[3] = WrapLow(d);
      }

      // Vertical pass: one column per iteration; dest advances by one pixel each time.
      for (int col = 0; col < 4; col++)
      {
          long a = rows[col];
          long c = rows[col + 4];
          long d = rows[col + 8];
          long b = rows[col + 12];

          a += c;
          d -= b;
          long e = (a - d) >> 1;
          b = e - b;
          c = e - c;
          a -= b;
          d += c;

          dest[0] = ClipPixelAdd(dest[0], WrapLow(a));
          dest[stride] = ClipPixelAdd(dest[stride], WrapLow(b));
          dest[stride * 2] = ClipPixelAdd(dest[stride * 2], WrapLow(c));
          dest[stride * 3] = ClipPixelAdd(dest[stride * 3], WrapLow(d));

          dest = dest.Slice(1);
      }
  }
  /// <summary>
  /// Inverse Walsh-Hadamard path that reads only <c>input[0]</c> and adds the resulting
  /// values to the 4x4 pixel block at the start of <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients; only the first element is read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent destination pixels.</param>
  [SkipLocalsInit]
  public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      Span<int> firstRow = stackalloc int[4];

      // Row pass: split the scaled first coefficient into a top part and a shared remainder.
      long top = input[0] >> UnitQuantShift;
      long rest = top >> 1;
      top -= rest;

      firstRow[0] = WrapLow(top);
      firstRow.Slice(1).Fill(WrapLow(rest));

      // Column pass: the same split, applied to each value from the row pass.
      for (int col = 0; col < 4; col++)
      {
          int value = firstRow[col];
          long lower = value >> 1;
          long upper = value - lower;

          dest[0] = ClipPixelAdd(dest[0], upper);
          dest[stride] = ClipPixelAdd(dest[stride], lower);
          dest[stride * 2] = ClipPixelAdd(dest[stride * 2], lower);
          dest[stride * 3] = ClipPixelAdd(dest[stride * 3], lower);

          dest = dest.Slice(1);
      }
  }
  /// <summary>
  /// 4-point 1-D inverse ADST: reads four coefficients from <paramref name="input"/> and
  /// writes four results to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">Source coefficients (at least 4 elements).</param>
  /// <param name="output">Destination (at least 4 elements).</param>
  public static void Iadst4(ReadOnlySpan<int> input, Span<int> output)
  {
      int x0 = input[0];
      int x1 = input[1];
      int x2 = input[2];
      int x3 = input[3];

      // All-zero input yields all-zero output; skip the arithmetic.
      if (x0 == 0 && x1 == 0 && x2 == 0 && x3 == 0)
      {
          output.Slice(0, 4).Clear();

          return;
      }

      // 32-bit result is enough for the following multiplications.
      long p0 = SinPi1_9 * x0;
      long p1 = SinPi2_9 * x0;
      long p2 = SinPi3_9 * x1;
      long p3 = SinPi4_9 * x2;
      long p4 = SinPi1_9 * x2;
      long p5 = SinPi2_9 * x3;
      long p6 = SinPi4_9 * x3;
      long p7 = WrapLow(x0 - x2 + x3);

      long sumA = p0 + p3 + p5;
      long sumB = p1 - p4 - p6;
      long shared = p2;
      long middle = SinPi3_9 * p7;

      // 1-D transform scaling factor is sqrt(2).
      // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
      // + 1b (addition) = 29b.
      // Hence the output bit depth is 15b.
      output[0] = WrapLow(DctConstRoundShift(sumA + shared));
      output[1] = WrapLow(DctConstRoundShift(sumB + shared));
      output[2] = WrapLow(DctConstRoundShift(middle));
      output[3] = WrapLow(DctConstRoundShift(sumA + sumB - shared));
  }
  /// <summary>
  /// 4-point 1-D inverse DCT: reads four coefficients from <paramref name="input"/> (each
  /// truncated to 16 bits) and writes four results to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">Source coefficients (at least 4 elements).</param>
  /// <param name="output">Destination (at least 4 elements).</param>
  [SkipLocalsInit]
  public static void Idct4(ReadOnlySpan<int> input, Span<int> output)
  {
      // stage 1: even half (inputs 0 and 2)
      short in0 = (short)input[0];
      short in2 = (short)input[2];
      long evenSum = (in0 + in2) * CosPi16_64;
      long evenDiff = (in0 - in2) * CosPi16_64;
      short e0 = (short)WrapLow(DctConstRoundShift(evenSum));
      short e1 = (short)WrapLow(DctConstRoundShift(evenDiff));

      // stage 1: odd half (inputs 1 and 3)
      short in1 = (short)input[1];
      short in3 = (short)input[3];
      long oddA = in1 * CosPi24_64 - in3 * CosPi8_64;
      long oddB = in1 * CosPi8_64 + in3 * CosPi24_64;
      short o0 = (short)WrapLow(DctConstRoundShift(oddA));
      short o1 = (short)WrapLow(DctConstRoundShift(oddB));

      // stage 2: combine the halves
      output[0] = WrapLow(e0 + o1);
      output[1] = WrapLow(e1 + o0);
      output[2] = WrapLow(e1 - o0);
      output[3] = WrapLow(e0 - o1);
  }
  /// <summary>
  /// 4x4 inverse DCT over all 16 coefficients: applies <c>Idct4</c> to each row, then to each
  /// column, and adds the column results (rounded by 4 bits) to the 4x4 pixel block at the
  /// start of <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients, read as four consecutive rows of four values.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent destination pixels.</param>
  [SkipLocalsInit]
  public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      Span<int> rowPass = stackalloc int[4 * 4];
      Span<int> column = stackalloc int[4];
      Span<int> transformed = stackalloc int[4];

      // Rows
      for (int row = 0; row < 4; ++row)
      {
          Idct4(input.Slice(row * 4), rowPass.Slice(row * 4));
      }

      // Columns
      for (int col = 0; col < 4; ++col)
      {
          // Gather the column into a contiguous buffer for the 1-D transform.
          for (int row = 0; row < 4; ++row)
          {
              column[row] = rowPass[row * 4 + col];
          }

          Idct4(column, transformed);

          for (int row = 0; row < 4; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(transformed[row], 4));
          }
      }
  }
  /// <summary>
  /// 4x4 inverse DCT path that reads only <c>input[0]</c>: derives a single residual from
  /// that coefficient and adds it to every pixel of the 4x4 block at the start of
  /// <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients; only the first element is read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent destination pixels.</param>
  public static void Idct4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      // Two scalings by CosPi16_64, one for the row pass and one for the column pass.
      int dc = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      dc = WrapLow(DctConstRoundShift(dc * CosPi16_64));
      long a1 = BitUtils.RoundPowerOfTwo(dc, 4);

      for (int row = 0; row < 4; row++)
      {
          // Slice each row from the block origin instead of advancing dest after every
          // row: advancing past the last row throws ArgumentOutOfRangeException when
          // dest ends fewer than stride elements after the last row's start.
          Span<byte> line = dest.Slice(row * stride);

          for (int col = 0; col < 4; col++)
          {
              line[col] = ClipPixelAdd(line[col], a1);
          }
      }
  }
  /// <summary>
  /// 8-point 1-D inverse ADST: reads eight coefficients from <paramref name="input"/> and
  /// writes eight results to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">Source coefficients (at least 8 elements).</param>
  /// <param name="output">Destination (at least 8 elements).</param>
  public static void Iadst8(ReadOnlySpan<int> input, Span<int> output)
  {
      // Inputs are consumed in a permuted order.
      long x0 = input[7];
      long x1 = input[0];
      long x2 = input[5];
      long x3 = input[2];
      long x4 = input[3];
      long x5 = input[4];
      long x6 = input[1];
      long x7 = input[6];

      // All-zero input yields all-zero output; skip the arithmetic.
      if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0)
      {
          output.Slice(0, 8).Clear();

          return;
      }

      // stage 1 (products are computed in 64 bits, then truncated to 32 bits)
      int s0 = (int)(CosPi2_64 * x0 + CosPi30_64 * x1);
      int s1 = (int)(CosPi30_64 * x0 - CosPi2_64 * x1);
      int s2 = (int)(CosPi10_64 * x2 + CosPi22_64 * x3);
      int s3 = (int)(CosPi22_64 * x2 - CosPi10_64 * x3);
      int s4 = (int)(CosPi18_64 * x4 + CosPi14_64 * x5);
      int s5 = (int)(CosPi14_64 * x4 - CosPi18_64 * x5);
      int s6 = (int)(CosPi26_64 * x6 + CosPi6_64 * x7);
      int s7 = (int)(CosPi6_64 * x6 - CosPi26_64 * x7);

      x0 = WrapLow(DctConstRoundShift(s0 + s4));
      x1 = WrapLow(DctConstRoundShift(s1 + s5));
      x2 = WrapLow(DctConstRoundShift(s2 + s6));
      x3 = WrapLow(DctConstRoundShift(s3 + s7));
      x4 = WrapLow(DctConstRoundShift(s0 - s4));
      x5 = WrapLow(DctConstRoundShift(s1 - s5));
      x6 = WrapLow(DctConstRoundShift(s2 - s6));
      x7 = WrapLow(DctConstRoundShift(s3 - s7));

      // stage 2
      s0 = (int)x0;
      s1 = (int)x1;
      s2 = (int)x2;
      s3 = (int)x3;
      s4 = (int)(CosPi8_64 * x4 + CosPi24_64 * x5);
      s5 = (int)(CosPi24_64 * x4 - CosPi8_64 * x5);
      s6 = (int)(-CosPi24_64 * x6 + CosPi8_64 * x7);
      s7 = (int)(CosPi8_64 * x6 + CosPi24_64 * x7);

      x0 = WrapLow(s0 + s2);
      x1 = WrapLow(s1 + s3);
      x2 = WrapLow(s0 - s2);
      x3 = WrapLow(s1 - s3);
      x4 = WrapLow(DctConstRoundShift(s4 + s6));
      x5 = WrapLow(DctConstRoundShift(s5 + s7));
      x6 = WrapLow(DctConstRoundShift(s4 - s6));
      x7 = WrapLow(DctConstRoundShift(s5 - s7));

      // stage 3
      s2 = (int)(CosPi16_64 * (x2 + x3));
      s3 = (int)(CosPi16_64 * (x2 - x3));
      s6 = (int)(CosPi16_64 * (x6 + x7));
      s7 = (int)(CosPi16_64 * (x6 - x7));

      x2 = WrapLow(DctConstRoundShift(s2));
      x3 = WrapLow(DctConstRoundShift(s3));
      x6 = WrapLow(DctConstRoundShift(s6));
      x7 = WrapLow(DctConstRoundShift(s7));

      // Output permutation with alternating sign flips.
      output[0] = WrapLow(x0);
      output[1] = WrapLow(-x4);
      output[2] = WrapLow(x6);
      output[3] = WrapLow(-x2);
      output[4] = WrapLow(x3);
      output[5] = WrapLow(-x7);
      output[6] = WrapLow(x5);
      output[7] = WrapLow(-x1);
  }
  /// <summary>
  /// 8-point 1-D inverse DCT: reads eight coefficients from <paramref name="input"/> (each
  /// truncated to 16 bits) and writes eight results to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">Source coefficients (at least 8 elements).</param>
  /// <param name="output">Destination (at least 8 elements).</param>
  [SkipLocalsInit]
  public static void Idct8(ReadOnlySpan<int> input, Span<int> output)
  {
      Span<short> step1 = stackalloc short[8];
      Span<short> step2 = stackalloc short[8];

      // stage 1: even inputs are only reordered, odd inputs are rotated in pairs
      step1[0] = (short)input[0];
      step1[2] = (short)input[4];
      step1[1] = (short)input[2];
      step1[3] = (short)input[6];

      short in1 = (short)input[1];
      short in7 = (short)input[7];
      long rotA = in1 * CosPi28_64 - in7 * CosPi4_64;
      long rotB = in1 * CosPi4_64 + in7 * CosPi28_64;
      step1[4] = (short)WrapLow(DctConstRoundShift(rotA));
      step1[7] = (short)WrapLow(DctConstRoundShift(rotB));

      short in5 = (short)input[5];
      short in3 = (short)input[3];
      rotA = in5 * CosPi12_64 - in3 * CosPi20_64;
      rotB = in5 * CosPi20_64 + in3 * CosPi12_64;
      step1[5] = (short)WrapLow(DctConstRoundShift(rotA));
      step1[6] = (short)WrapLow(DctConstRoundShift(rotB));

      // stage 2
      rotA = (step1[0] + step1[2]) * CosPi16_64;
      rotB = (step1[0] - step1[2]) * CosPi16_64;
      step2[0] = (short)WrapLow(DctConstRoundShift(rotA));
      step2[1] = (short)WrapLow(DctConstRoundShift(rotB));

      rotA = step1[1] * CosPi24_64 - step1[3] * CosPi8_64;
      rotB = step1[1] * CosPi8_64 + step1[3] * CosPi24_64;
      step2[2] = (short)WrapLow(DctConstRoundShift(rotA));
      step2[3] = (short)WrapLow(DctConstRoundShift(rotB));

      step2[4] = (short)WrapLow(step1[4] + step1[5]);
      step2[5] = (short)WrapLow(step1[4] - step1[5]);
      step2[6] = (short)WrapLow(-step1[6] + step1[7]);
      step2[7] = (short)WrapLow(step1[6] + step1[7]);

      // stage 3
      step1[0] = (short)WrapLow(step2[0] + step2[3]);
      step1[1] = (short)WrapLow(step2[1] + step2[2]);
      step1[2] = (short)WrapLow(step2[1] - step2[2]);
      step1[3] = (short)WrapLow(step2[0] - step2[3]);
      step1[4] = step2[4];

      rotA = (step2[6] - step2[5]) * CosPi16_64;
      rotB = (step2[5] + step2[6]) * CosPi16_64;
      step1[5] = (short)WrapLow(DctConstRoundShift(rotA));
      step1[6] = (short)WrapLow(DctConstRoundShift(rotB));
      step1[7] = step2[7];

      // stage 4: mirror-symmetric sums and differences
      output[0] = WrapLow(step1[0] + step1[7]);
      output[1] = WrapLow(step1[1] + step1[6]);
      output[2] = WrapLow(step1[2] + step1[5]);
      output[3] = WrapLow(step1[3] + step1[4]);
      output[4] = WrapLow(step1[3] - step1[4]);
      output[5] = WrapLow(step1[2] - step1[5]);
      output[6] = WrapLow(step1[1] - step1[6]);
      output[7] = WrapLow(step1[0] - step1[7]);
  }
  /// <summary>
  /// 8x8 inverse DCT over all 64 coefficients: applies <c>Idct8</c> to each row, then to each
  /// column, and adds the column results (rounded by 5 bits) to the 8x8 pixel block at the
  /// start of <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients, read as eight consecutive rows of eight values.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent destination pixels.</param>
  [SkipLocalsInit]
  public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      Span<int> rowPass = stackalloc int[8 * 8];
      Span<int> column = stackalloc int[8];
      Span<int> transformed = stackalloc int[8];

      // First transform rows
      for (int row = 0; row < 8; ++row)
      {
          Idct8(input.Slice(row * 8), rowPass.Slice(row * 8));
      }

      // Then transform columns
      for (int col = 0; col < 8; ++col)
      {
          // Gather the column into a contiguous buffer for the 1-D transform.
          for (int row = 0; row < 8; ++row)
          {
              column[row] = rowPass[row * 8 + col];
          }

          Idct8(column, transformed);

          for (int row = 0; row < 8; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(transformed[row], 5));
          }
      }
  }
  /// <summary>
  /// 8x8 inverse DCT variant that row-transforms only the first four input rows; the
  /// remaining rows of the intermediate buffer are left zero. All eight columns are then
  /// transformed and added (rounded by 5 bits) to the 8x8 pixel block at the start of
  /// <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients; only the first four rows of eight values are read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent destination pixels.</param>
  [SkipLocalsInit]
  public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      Span<int> rowPass = stackalloc int[8 * 8];
      Span<int> column = stackalloc int[8];
      Span<int> transformed = stackalloc int[8];

      // Locals are not zero-initialised here, so clear the buffer explicitly: rows 4..7
      // are never written by the row pass below.
      rowPass.Clear();

      // First transform rows
      // Only first 4 row has non-zero coefs
      for (int row = 0; row < 4; ++row)
      {
          Idct8(input.Slice(row * 8), rowPass.Slice(row * 8));
      }

      // Then transform columns
      for (int col = 0; col < 8; ++col)
      {
          for (int row = 0; row < 8; ++row)
          {
              column[row] = rowPass[row * 8 + col];
          }

          Idct8(column, transformed);

          for (int row = 0; row < 8; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(transformed[row], 5));
          }
      }
  }
  /// <summary>
  /// 8x8 inverse DCT path that reads only <c>input[0]</c>: derives a single residual from
  /// that coefficient and adds it to every pixel of the 8x8 block at the start of
  /// <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients; only the first element is read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent destination pixels.</param>
  public static void Idct8x81Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      // Two scalings by CosPi16_64, one for the row pass and one for the column pass.
      int dc = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      dc = WrapLow(DctConstRoundShift(dc * CosPi16_64));
      long a1 = BitUtils.RoundPowerOfTwo(dc, 5);

      for (int row = 0; row < 8; ++row)
      {
          // Slice each row from the block origin instead of advancing dest after every
          // row: advancing past the last row throws ArgumentOutOfRangeException when
          // dest ends fewer than stride elements after the last row's start.
          Span<byte> line = dest.Slice(row * stride);

          for (int col = 0; col < 8; ++col)
          {
              line[col] = ClipPixelAdd(line[col], a1);
          }
      }
  }
  /// <summary>
  /// 16-point 1-D inverse ADST: reads sixteen coefficients from <paramref name="input"/> and
  /// writes sixteen results to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">Source coefficients (at least 16 elements).</param>
  /// <param name="output">Destination (at least 16 elements).</param>
  public static void Iadst16(ReadOnlySpan<int> input, Span<int> output)
  {
      // Inputs are consumed in a permuted order.
      long x0 = input[15];
      long x1 = input[0];
      long x2 = input[13];
      long x3 = input[2];
      long x4 = input[11];
      long x5 = input[4];
      long x6 = input[9];
      long x7 = input[6];
      long x8 = input[7];
      long x9 = input[8];
      long x10 = input[5];
      long x11 = input[10];
      long x12 = input[3];
      long x13 = input[12];
      long x14 = input[1];
      long x15 = input[14];

      // All-zero input yields all-zero output; skip the arithmetic.
      if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0)
      {
          output.Slice(0, 16).Clear();

          return;
      }

      // stage 1
      long s0 = x0 * CosPi1_64 + x1 * CosPi31_64;
      long s1 = x0 * CosPi31_64 - x1 * CosPi1_64;
      long s2 = x2 * CosPi5_64 + x3 * CosPi27_64;
      long s3 = x2 * CosPi27_64 - x3 * CosPi5_64;
      long s4 = x4 * CosPi9_64 + x5 * CosPi23_64;
      long s5 = x4 * CosPi23_64 - x5 * CosPi9_64;
      long s6 = x6 * CosPi13_64 + x7 * CosPi19_64;
      long s7 = x6 * CosPi19_64 - x7 * CosPi13_64;
      long s8 = x8 * CosPi17_64 + x9 * CosPi15_64;
      long s9 = x8 * CosPi15_64 - x9 * CosPi17_64;
      long s10 = x10 * CosPi21_64 + x11 * CosPi11_64;
      long s11 = x10 * CosPi11_64 - x11 * CosPi21_64;
      long s12 = x12 * CosPi25_64 + x13 * CosPi7_64;
      long s13 = x12 * CosPi7_64 - x13 * CosPi25_64;
      long s14 = x14 * CosPi29_64 + x15 * CosPi3_64;
      long s15 = x14 * CosPi3_64 - x15 * CosPi29_64;

      x0 = WrapLow(DctConstRoundShift(s0 + s8));
      x1 = WrapLow(DctConstRoundShift(s1 + s9));
      x2 = WrapLow(DctConstRoundShift(s2 + s10));
      x3 = WrapLow(DctConstRoundShift(s3 + s11));
      x4 = WrapLow(DctConstRoundShift(s4 + s12));
      x5 = WrapLow(DctConstRoundShift(s5 + s13));
      x6 = WrapLow(DctConstRoundShift(s6 + s14));
      x7 = WrapLow(DctConstRoundShift(s7 + s15));
      x8 = WrapLow(DctConstRoundShift(s0 - s8));
      x9 = WrapLow(DctConstRoundShift(s1 - s9));
      x10 = WrapLow(DctConstRoundShift(s2 - s10));
      x11 = WrapLow(DctConstRoundShift(s3 - s11));
      x12 = WrapLow(DctConstRoundShift(s4 - s12));
      x13 = WrapLow(DctConstRoundShift(s5 - s13));
      x14 = WrapLow(DctConstRoundShift(s6 - s14));
      x15 = WrapLow(DctConstRoundShift(s7 - s15));

      // stage 2: lower half passes through, upper half is rotated
      s0 = x0;
      s1 = x1;
      s2 = x2;
      s3 = x3;
      s4 = x4;
      s5 = x5;
      s6 = x6;
      s7 = x7;
      s8 = x8 * CosPi4_64 + x9 * CosPi28_64;
      s9 = x8 * CosPi28_64 - x9 * CosPi4_64;
      s10 = x10 * CosPi20_64 + x11 * CosPi12_64;
      s11 = x10 * CosPi12_64 - x11 * CosPi20_64;
      s12 = -x12 * CosPi28_64 + x13 * CosPi4_64;
      s13 = x12 * CosPi4_64 + x13 * CosPi28_64;
      s14 = -x14 * CosPi12_64 + x15 * CosPi20_64;
      s15 = x14 * CosPi20_64 + x15 * CosPi12_64;

      x0 = WrapLow(s0 + s4);
      x1 = WrapLow(s1 + s5);
      x2 = WrapLow(s2 + s6);
      x3 = WrapLow(s3 + s7);
      x4 = WrapLow(s0 - s4);
      x5 = WrapLow(s1 - s5);
      x6 = WrapLow(s2 - s6);
      x7 = WrapLow(s3 - s7);
      x8 = WrapLow(DctConstRoundShift(s8 + s12));
      x9 = WrapLow(DctConstRoundShift(s9 + s13));
      x10 = WrapLow(DctConstRoundShift(s10 + s14));
      x11 = WrapLow(DctConstRoundShift(s11 + s15));
      x12 = WrapLow(DctConstRoundShift(s8 - s12));
      x13 = WrapLow(DctConstRoundShift(s9 - s13));
      x14 = WrapLow(DctConstRoundShift(s10 - s14));
      x15 = WrapLow(DctConstRoundShift(s11 - s15));

      // stage 3
      s0 = x0;
      s1 = x1;
      s2 = x2;
      s3 = x3;
      s4 = x4 * CosPi8_64 + x5 * CosPi24_64;
      s5 = x4 * CosPi24_64 - x5 * CosPi8_64;
      s6 = -x6 * CosPi24_64 + x7 * CosPi8_64;
      s7 = x6 * CosPi8_64 + x7 * CosPi24_64;
      s8 = x8;
      s9 = x9;
      s10 = x10;
      s11 = x11;
      s12 = x12 * CosPi8_64 + x13 * CosPi24_64;
      s13 = x12 * CosPi24_64 - x13 * CosPi8_64;
      s14 = -x14 * CosPi24_64 + x15 * CosPi8_64;
      s15 = x14 * CosPi8_64 + x15 * CosPi24_64;

      x0 = WrapLow(s0 + s2);
      x1 = WrapLow(s1 + s3);
      x2 = WrapLow(s0 - s2);
      x3 = WrapLow(s1 - s3);
      x4 = WrapLow(DctConstRoundShift(s4 + s6));
      x5 = WrapLow(DctConstRoundShift(s5 + s7));
      x6 = WrapLow(DctConstRoundShift(s4 - s6));
      x7 = WrapLow(DctConstRoundShift(s5 - s7));
      x8 = WrapLow(s8 + s10);
      x9 = WrapLow(s9 + s11);
      x10 = WrapLow(s8 - s10);
      x11 = WrapLow(s9 - s11);
      x12 = WrapLow(DctConstRoundShift(s12 + s14));
      x13 = WrapLow(DctConstRoundShift(s13 + s15));
      x14 = WrapLow(DctConstRoundShift(s12 - s14));
      x15 = WrapLow(DctConstRoundShift(s13 - s15));

      // stage 4
      s2 = (-CosPi16_64) * (x2 + x3);
      s3 = CosPi16_64 * (x2 - x3);
      s6 = CosPi16_64 * (x6 + x7);
      s7 = CosPi16_64 * (-x6 + x7);
      s10 = CosPi16_64 * (x10 + x11);
      s11 = CosPi16_64 * (-x10 + x11);
      s14 = (-CosPi16_64) * (x14 + x15);
      s15 = CosPi16_64 * (x14 - x15);

      x2 = WrapLow(DctConstRoundShift(s2));
      x3 = WrapLow(DctConstRoundShift(s3));
      x6 = WrapLow(DctConstRoundShift(s6));
      x7 = WrapLow(DctConstRoundShift(s7));
      x10 = WrapLow(DctConstRoundShift(s10));
      x11 = WrapLow(DctConstRoundShift(s11));
      x14 = WrapLow(DctConstRoundShift(s14));
      x15 = WrapLow(DctConstRoundShift(s15));

      // Output permutation; x8, x4, x13 and x1 are negated.
      output[0] = WrapLow(x0);
      output[1] = WrapLow(-x8);
      output[2] = WrapLow(x12);
      output[3] = WrapLow(-x4);
      output[4] = WrapLow(x6);
      output[5] = WrapLow(x14);
      output[6] = WrapLow(x10);
      output[7] = WrapLow(x2);
      output[8] = WrapLow(x3);
      output[9] = WrapLow(x11);
      output[10] = WrapLow(x15);
      output[11] = WrapLow(x7);
      output[12] = WrapLow(x5);
      output[13] = WrapLow(-x13);
      output[14] = WrapLow(x9);
      output[15] = WrapLow(-x1);
  }
  /// <summary>
  /// One-dimensional 16-point inverse DCT, evaluated as seven butterfly stages that ping-pong
  /// between two 16-entry <see cref="short"/> scratch buffers (step1 and step2).
  /// </summary>
  /// <param name="input">Coefficients; elements 0..15 are read and each is truncated to 16 bits.</param>
  /// <param name="output">Receives the 16 transformed values in elements 0..15.</param>
  /// <remarks>
  /// The method is marked SkipLocalsInit, so the scratch buffers are not zero-filled; that is safe
  /// because stage 1 assigns every element of step1 and stage 2 assigns every element of step2
  /// before anything is read back.
  /// NOTE(review): WrapLow and DctConstRoundShift are defined outside this excerpt; presumably they
  /// wrap to the coefficient width and round/shift by the DCT constant precision - confirm.
  /// </remarks>
  612. [SkipLocalsInit]
  613. public static void Idct16(ReadOnlySpan<int> input, Span<int> output)
  614. {
  615. Span<short> step1 = stackalloc short[16];
  616. Span<short> step2 = stackalloc short[16];
  617. long temp1, temp2;
  618. // stage 1
  // The constant index expressions fold to 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
  // i.e. the inputs are loaded in 4-bit bit-reversed order.
  619. step1[0] = (short)input[0 / 2];
  620. step1[1] = (short)input[16 / 2];
  621. step1[2] = (short)input[8 / 2];
  622. step1[3] = (short)input[24 / 2];
  623. step1[4] = (short)input[4 / 2];
  624. step1[5] = (short)input[20 / 2];
  625. step1[6] = (short)input[12 / 2];
  626. step1[7] = (short)input[28 / 2];
  627. step1[8] = (short)input[2 / 2];
  628. step1[9] = (short)input[18 / 2];
  629. step1[10] = (short)input[10 / 2];
  630. step1[11] = (short)input[26 / 2];
  631. step1[12] = (short)input[6 / 2];
  632. step1[13] = (short)input[22 / 2];
  633. step1[14] = (short)input[14 / 2];
  634. step1[15] = (short)input[30 / 2];
  635. // stage 2
  // Elements 0..7 are copied through unchanged; elements 8..15 (loaded from the odd input
  // indices) are combined in the pairs (8,15), (9,14), (10,13) and (11,12).
  636. step2[0] = step1[0];
  637. step2[1] = step1[1];
  638. step2[2] = step1[2];
  639. step2[3] = step1[3];
  640. step2[4] = step1[4];
  641. step2[5] = step1[5];
  642. step2[6] = step1[6];
  643. step2[7] = step1[7];
  644. temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64;
  645. temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64;
  646. step2[8] = (short)WrapLow(DctConstRoundShift(temp1));
  647. step2[15] = (short)WrapLow(DctConstRoundShift(temp2));
  648. temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64;
  649. temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64;
  650. step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
  651. step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
  652. temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64;
  653. temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64;
  654. step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
  655. step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
  656. temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64;
  657. temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64;
  658. step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
  659. step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
  660. // stage 3
  // Elements 0..3 pass through; pairs (4,7) and (5,6) are combined with cosine weights;
  // elements 8..15 become sums and differences of adjacent stage-2 values.
  661. step1[0] = step2[0];
  662. step1[1] = step2[1];
  663. step1[2] = step2[2];
  664. step1[3] = step2[3];
  665. temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64;
  666. temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64;
  667. step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
  668. step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
  669. temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64;
  670. temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64;
  671. step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
  672. step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
  673. step1[8] = (short)WrapLow(step2[8] + step2[9]);
  674. step1[9] = (short)WrapLow(step2[8] - step2[9]);
  675. step1[10] = (short)WrapLow(-step2[10] + step2[11]);
  676. step1[11] = (short)WrapLow(step2[10] + step2[11]);
  677. step1[12] = (short)WrapLow(step2[12] + step2[13]);
  678. step1[13] = (short)WrapLow(step2[12] - step2[13]);
  679. step1[14] = (short)WrapLow(-step2[14] + step2[15]);
  680. step1[15] = (short)WrapLow(step2[14] + step2[15]);
  681. // stage 4
  // Pairs (0,1) and (2,3) are combined with cosine weights, 4..7 become sums/differences,
  // pairs (9,14) and (10,13) are combined with cosine weights, and 8, 11, 12, 15 pass through.
  682. temp1 = (step1[0] + step1[1]) * CosPi16_64;
  683. temp2 = (step1[0] - step1[1]) * CosPi16_64;
  684. step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
  685. step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
  686. temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64;
  687. temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64;
  688. step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
  689. step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
  690. step2[4] = (short)WrapLow(step1[4] + step1[5]);
  691. step2[5] = (short)WrapLow(step1[4] - step1[5]);
  692. step2[6] = (short)WrapLow(-step1[6] + step1[7]);
  693. step2[7] = (short)WrapLow(step1[6] + step1[7]);
  694. step2[8] = step1[8];
  695. step2[15] = step1[15];
  696. temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64;
  697. temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64;
  698. step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
  699. step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
  700. temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64;
  701. temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64;
  702. step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
  703. step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
  704. step2[11] = step1[11];
  705. step2[12] = step1[12];
  706. // stage 5
  // Butterflies within 0..3 and within 8..15; pair (5,6) is scaled by CosPi16_64; 4 and 7 pass through.
  707. step1[0] = (short)WrapLow(step2[0] + step2[3]);
  708. step1[1] = (short)WrapLow(step2[1] + step2[2]);
  709. step1[2] = (short)WrapLow(step2[1] - step2[2]);
  710. step1[3] = (short)WrapLow(step2[0] - step2[3]);
  711. step1[4] = step2[4];
  712. temp1 = (step2[6] - step2[5]) * CosPi16_64;
  713. temp2 = (step2[5] + step2[6]) * CosPi16_64;
  714. step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
  715. step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
  716. step1[7] = step2[7];
  717. step1[8] = (short)WrapLow(step2[8] + step2[11]);
  718. step1[9] = (short)WrapLow(step2[9] + step2[10]);
  719. step1[10] = (short)WrapLow(step2[9] - step2[10]);
  720. step1[11] = (short)WrapLow(step2[8] - step2[11]);
  721. step1[12] = (short)WrapLow(-step2[12] + step2[15]);
  722. step1[13] = (short)WrapLow(-step2[13] + step2[14]);
  723. step1[14] = (short)WrapLow(step2[13] + step2[14]);
  724. step1[15] = (short)WrapLow(step2[12] + step2[15]);
  725. // stage 6
  // Elements 0..7 are mirrored butterflies (k with 7 - k); pairs (10,13) and (11,12) are scaled
  // by CosPi16_64; 8, 9, 14 and 15 pass through.
  726. step2[0] = (short)WrapLow(step1[0] + step1[7]);
  727. step2[1] = (short)WrapLow(step1[1] + step1[6]);
  728. step2[2] = (short)WrapLow(step1[2] + step1[5]);
  729. step2[3] = (short)WrapLow(step1[3] + step1[4]);
  730. step2[4] = (short)WrapLow(step1[3] - step1[4]);
  731. step2[5] = (short)WrapLow(step1[2] - step1[5]);
  732. step2[6] = (short)WrapLow(step1[1] - step1[6]);
  733. step2[7] = (short)WrapLow(step1[0] - step1[7]);
  734. step2[8] = step1[8];
  735. step2[9] = step1[9];
  736. temp1 = (-step1[10] + step1[13]) * CosPi16_64;
  737. temp2 = (step1[10] + step1[13]) * CosPi16_64;
  738. step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
  739. step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
  740. temp1 = (-step1[11] + step1[12]) * CosPi16_64;
  741. temp2 = (step1[11] + step1[12]) * CosPi16_64;
  742. step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
  743. step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
  744. step2[14] = step1[14];
  745. step2[15] = step1[15];
  746. // stage 7
  // Final mirrored butterfly: output[k] = step2[k] + step2[15 - k] and
  // output[15 - k] = step2[k] - step2[15 - k] for k = 0..7.
  747. output[0] = WrapLow(step2[0] + step2[15]);
  748. output[1] = WrapLow(step2[1] + step2[14]);
  749. output[2] = WrapLow(step2[2] + step2[13]);
  750. output[3] = WrapLow(step2[3] + step2[12]);
  751. output[4] = WrapLow(step2[4] + step2[11]);
  752. output[5] = WrapLow(step2[5] + step2[10]);
  753. output[6] = WrapLow(step2[6] + step2[9]);
  754. output[7] = WrapLow(step2[7] + step2[8]);
  755. output[8] = WrapLow(step2[7] - step2[8]);
  756. output[9] = WrapLow(step2[6] - step2[9]);
  757. output[10] = WrapLow(step2[5] - step2[10]);
  758. output[11] = WrapLow(step2[4] - step2[11]);
  759. output[12] = WrapLow(step2[3] - step2[12]);
  760. output[13] = WrapLow(step2[2] - step2[13]);
  761. output[14] = WrapLow(step2[1] - step2[14]);
  762. output[15] = WrapLow(step2[0] - step2[15]);
  763. }
  /// <summary>
  /// Runs the full two-dimensional 16x16 inverse DCT over <paramref name="input"/> (all 16 rows,
  /// then all 16 columns) and adds the rounded result onto the pixels in <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 16 per row.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent pixels in <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 16;

      // No zero-fill is required: the row pass below writes every element of the intermediate block.
      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> residual = stackalloc int[Size];

      // Pass 1: 1-D transform of every coefficient row.
      for (int row = 0; row < Size; row++)
      {
          int offset = row * Size;
          Idct16(input.Slice(offset), rowPass.Slice(offset));
      }

      // Pass 2: 1-D transform of every column of the intermediate block, then accumulate into dest.
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct16(column, residual);

          for (int y = 0; y < Size; y++)
          {
              int pos = y * stride + x;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(residual[y], 6));
          }
      }
  }
  /// <summary>
  /// 16x16 inverse DCT for blocks whose non-zero coefficients all lie in the upper-left 8x8 area:
  /// only the first 8 rows go through the row pass, the remaining rows of the intermediate block
  /// stay zero, and all 16 columns are then transformed and added onto <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 16 per row; the first 8 rows are read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent pixels in <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 16;
      const int ActiveRows = 8;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> residual = stackalloc int[Size];

      // Locals are not zero-initialised here, and rows past ActiveRows are never written by the row pass.
      rowPass.Clear();

      // Row pass, restricted to the rows that can hold non-zero coefficients.
      for (int row = 0; row < ActiveRows; row++)
      {
          int offset = row * Size;
          Idct16(input.Slice(offset), rowPass.Slice(offset));
      }

      // Column pass over the whole intermediate block.
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct16(column, residual);

          for (int y = 0; y < Size; y++)
          {
              int pos = y * stride + x;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(residual[y], 6));
          }
      }
  }
  /// <summary>
  /// 16x16 inverse DCT for blocks whose non-zero coefficients all lie in the upper-left 4x4 area:
  /// only the first 4 rows go through the row pass, the remaining rows of the intermediate block
  /// stay zero, and all 16 columns are then transformed and added onto <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 16 per row; the first 4 rows are read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent pixels in <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 16;
      const int ActiveRows = 4;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> residual = stackalloc int[Size];

      // Locals are not zero-initialised here, and rows past ActiveRows are never written by the row pass.
      rowPass.Clear();

      // Row pass, restricted to the rows that can hold non-zero coefficients.
      for (int row = 0; row < ActiveRows; row++)
      {
          int offset = row * Size;
          Idct16(input.Slice(offset), rowPass.Slice(offset));
      }

      // Column pass over the whole intermediate block.
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct16(column, residual);

          for (int y = 0; y < Size; y++)
          {
              int pos = y * stride + x;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(residual[y], 6));
          }
      }
  }
  /// <summary>
  /// DC-only 16x16 inverse transform: derives a single residual from <c>input[0]</c> and adds it
  /// to every pixel of a 16x16 area of <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read, truncated to 16 bits.</param>
  /// <param name="dest">Destination pixels, updated in place. Must cover at least 15 * stride + 16 elements.</param>
  /// <param name="stride">Element distance between vertically adjacent pixels in <paramref name="dest"/>.</param>
  public static void Idct16x161Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 16;

      // The DC term is scaled by CosPi16_64 twice, each time followed by a round-shift and wrap.
      int dc = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      dc = WrapLow(DctConstRoundShift(dc * CosPi16_64));
      long a1 = BitUtils.RoundPowerOfTwo(dc, 6);

      for (int row = 0; row < Size; row++)
      {
          // Address each row by offset instead of re-slicing dest after it: a trailing
          // dest.Slice(stride) past the last row throws when the buffer ends with that row.
          int rowStart = row * stride;

          for (int col = 0; col < Size; col++)
          {
              int pos = rowStart + col;
              dest[pos] = ClipPixelAdd(dest[pos], a1);
          }
      }
  }
  /// <summary>
  /// One-dimensional 32-point inverse DCT, evaluated as a sequence of butterfly stages that
  /// ping-pong between two 32-entry <see cref="short"/> scratch buffers (step1 and step2).
  /// </summary>
  /// <param name="input">Coefficients; elements 0..31 are read and each is truncated to 16 bits.</param>
  /// <param name="output">Receives the 32 transformed values in elements 0..31.</param>
  /// <remarks>
  /// The method is marked SkipLocalsInit, so the scratch buffers are not zero-filled; that is safe
  /// because stage 1 assigns every element of step1 and stage 2 assigns every element of step2
  /// before anything is read back.
  /// For elements 0..15, stages 2 through 7 use the same arithmetic as the corresponding stages of
  /// Idct16, except that stage 7 stores into step1 rather than the output.
  /// NOTE(review): WrapLow and DctConstRoundShift are defined outside this excerpt; presumably they
  /// wrap to the coefficient width and round/shift by the DCT constant precision - confirm.
  /// </remarks>
  871. [SkipLocalsInit]
  872. public static void Idct32(ReadOnlySpan<int> input, Span<int> output)
  873. {
  874. Span<short> step1 = stackalloc short[32];
  875. Span<short> step2 = stackalloc short[32];
  876. long temp1, temp2;
  877. // stage 1
  // step1[0..15] take the even-indexed inputs in permuted order; the odd-indexed inputs are
  // combined in pairs with cosine weights and stored in step1[16..31].
  878. step1[0] = (short)input[0];
  879. step1[1] = (short)input[16];
  880. step1[2] = (short)input[8];
  881. step1[3] = (short)input[24];
  882. step1[4] = (short)input[4];
  883. step1[5] = (short)input[20];
  884. step1[6] = (short)input[12];
  885. step1[7] = (short)input[28];
  886. step1[8] = (short)input[2];
  887. step1[9] = (short)input[18];
  888. step1[10] = (short)input[10];
  889. step1[11] = (short)input[26];
  890. step1[12] = (short)input[6];
  891. step1[13] = (short)input[22];
  892. step1[14] = (short)input[14];
  893. step1[15] = (short)input[30];
  894. temp1 = (short)input[1] * CosPi31_64 - (short)input[31] * CosPi1_64;
  895. temp2 = (short)input[1] * CosPi1_64 + (short)input[31] * CosPi31_64;
  896. step1[16] = (short)WrapLow(DctConstRoundShift(temp1));
  897. step1[31] = (short)WrapLow(DctConstRoundShift(temp2));
  898. temp1 = (short)input[17] * CosPi15_64 - (short)input[15] * CosPi17_64;
  899. temp2 = (short)input[17] * CosPi17_64 + (short)input[15] * CosPi15_64;
  900. step1[17] = (short)WrapLow(DctConstRoundShift(temp1));
  901. step1[30] = (short)WrapLow(DctConstRoundShift(temp2));
  902. temp1 = (short)input[9] * CosPi23_64 - (short)input[23] * CosPi9_64;
  903. temp2 = (short)input[9] * CosPi9_64 + (short)input[23] * CosPi23_64;
  904. step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
  905. step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
  906. temp1 = (short)input[25] * CosPi7_64 - (short)input[7] * CosPi25_64;
  907. temp2 = (short)input[25] * CosPi25_64 + (short)input[7] * CosPi7_64;
  908. step1[19] = (short)WrapLow(DctConstRoundShift(temp1));
  909. step1[28] = (short)WrapLow(DctConstRoundShift(temp2));
  910. temp1 = (short)input[5] * CosPi27_64 - (short)input[27] * CosPi5_64;
  911. temp2 = (short)input[5] * CosPi5_64 + (short)input[27] * CosPi27_64;
  912. step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
  913. step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
  914. temp1 = (short)input[21] * CosPi11_64 - (short)input[11] * CosPi21_64;
  915. temp2 = (short)input[21] * CosPi21_64 + (short)input[11] * CosPi11_64;
  916. step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
  917. step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
  918. temp1 = (short)input[13] * CosPi19_64 - (short)input[19] * CosPi13_64;
  919. temp2 = (short)input[13] * CosPi13_64 + (short)input[19] * CosPi19_64;
  920. step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
  921. step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
  922. temp1 = (short)input[29] * CosPi3_64 - (short)input[3] * CosPi29_64;
  923. temp2 = (short)input[29] * CosPi29_64 + (short)input[3] * CosPi3_64;
  924. step1[23] = (short)WrapLow(DctConstRoundShift(temp1));
  925. step1[24] = (short)WrapLow(DctConstRoundShift(temp2));
  926. // stage 2
  // Elements 0..7 pass through; pairs (8,15), (9,14), (10,13), (11,12) are combined with cosine
  // weights; elements 16..31 become sums and differences of adjacent stage-1 values.
  927. step2[0] = step1[0];
  928. step2[1] = step1[1];
  929. step2[2] = step1[2];
  930. step2[3] = step1[3];
  931. step2[4] = step1[4];
  932. step2[5] = step1[5];
  933. step2[6] = step1[6];
  934. step2[7] = step1[7];
  935. temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64;
  936. temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64;
  937. step2[8] = (short)WrapLow(DctConstRoundShift(temp1));
  938. step2[15] = (short)WrapLow(DctConstRoundShift(temp2));
  939. temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64;
  940. temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64;
  941. step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
  942. step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
  943. temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64;
  944. temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64;
  945. step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
  946. step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
  947. temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64;
  948. temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64;
  949. step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
  950. step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
  951. step2[16] = (short)WrapLow(step1[16] + step1[17]);
  952. step2[17] = (short)WrapLow(step1[16] - step1[17]);
  953. step2[18] = (short)WrapLow(-step1[18] + step1[19]);
  954. step2[19] = (short)WrapLow(step1[18] + step1[19]);
  955. step2[20] = (short)WrapLow(step1[20] + step1[21]);
  956. step2[21] = (short)WrapLow(step1[20] - step1[21]);
  957. step2[22] = (short)WrapLow(-step1[22] + step1[23]);
  958. step2[23] = (short)WrapLow(step1[22] + step1[23]);
  959. step2[24] = (short)WrapLow(step1[24] + step1[25]);
  960. step2[25] = (short)WrapLow(step1[24] - step1[25]);
  961. step2[26] = (short)WrapLow(-step1[26] + step1[27]);
  962. step2[27] = (short)WrapLow(step1[26] + step1[27]);
  963. step2[28] = (short)WrapLow(step1[28] + step1[29]);
  964. step2[29] = (short)WrapLow(step1[28] - step1[29]);
  965. step2[30] = (short)WrapLow(-step1[30] + step1[31]);
  966. step2[31] = (short)WrapLow(step1[30] + step1[31]);
  967. // stage 3
  // In the upper half, pairs (17,30), (18,29), (21,26), (22,25) are combined with cosine weights;
  // elements 16, 19, 20, 23, 24, 27, 28 and 31 pass through.
  968. step1[0] = step2[0];
  969. step1[1] = step2[1];
  970. step1[2] = step2[2];
  971. step1[3] = step2[3];
  972. temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64;
  973. temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64;
  974. step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
  975. step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
  976. temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64;
  977. temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64;
  978. step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
  979. step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
  980. step1[8] = (short)WrapLow(step2[8] + step2[9]);
  981. step1[9] = (short)WrapLow(step2[8] - step2[9]);
  982. step1[10] = (short)WrapLow(-step2[10] + step2[11]);
  983. step1[11] = (short)WrapLow(step2[10] + step2[11]);
  984. step1[12] = (short)WrapLow(step2[12] + step2[13]);
  985. step1[13] = (short)WrapLow(step2[12] - step2[13]);
  986. step1[14] = (short)WrapLow(-step2[14] + step2[15]);
  987. step1[15] = (short)WrapLow(step2[14] + step2[15]);
  988. step1[16] = step2[16];
  989. step1[31] = step2[31];
  990. temp1 = -step2[17] * CosPi4_64 + step2[30] * CosPi28_64;
  991. temp2 = step2[17] * CosPi28_64 + step2[30] * CosPi4_64;
  992. step1[17] = (short)WrapLow(DctConstRoundShift(temp1));
  993. step1[30] = (short)WrapLow(DctConstRoundShift(temp2));
  994. temp1 = -step2[18] * CosPi28_64 - step2[29] * CosPi4_64;
  995. temp2 = -step2[18] * CosPi4_64 + step2[29] * CosPi28_64;
  996. step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
  997. step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
  998. step1[19] = step2[19];
  999. step1[20] = step2[20];
  1000. temp1 = -step2[21] * CosPi20_64 + step2[26] * CosPi12_64;
  1001. temp2 = step2[21] * CosPi12_64 + step2[26] * CosPi20_64;
  1002. step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
  1003. step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
  1004. temp1 = -step2[22] * CosPi12_64 - step2[25] * CosPi20_64;
  1005. temp2 = -step2[22] * CosPi20_64 + step2[25] * CosPi12_64;
  1006. step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
  1007. step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
  1008. step1[23] = step2[23];
  1009. step1[24] = step2[24];
  1010. step1[27] = step2[27];
  1011. step1[28] = step2[28];
  1012. // stage 4
  // Elements 16..31 are combined by sums and differences within the groups 16..19, 20..23,
  // 24..27 and 28..31.
  1013. temp1 = (step1[0] + step1[1]) * CosPi16_64;
  1014. temp2 = (step1[0] - step1[1]) * CosPi16_64;
  1015. step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
  1016. step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
  1017. temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64;
  1018. temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64;
  1019. step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
  1020. step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
  1021. step2[4] = (short)WrapLow(step1[4] + step1[5]);
  1022. step2[5] = (short)WrapLow(step1[4] - step1[5]);
  1023. step2[6] = (short)WrapLow(-step1[6] + step1[7]);
  1024. step2[7] = (short)WrapLow(step1[6] + step1[7]);
  1025. step2[8] = step1[8];
  1026. step2[15] = step1[15];
  1027. temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64;
  1028. temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64;
  1029. step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
  1030. step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
  1031. temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64;
  1032. temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64;
  1033. step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
  1034. step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
  1035. step2[11] = step1[11];
  1036. step2[12] = step1[12];
  1037. step2[16] = (short)WrapLow(step1[16] + step1[19]);
  1038. step2[17] = (short)WrapLow(step1[17] + step1[18]);
  1039. step2[18] = (short)WrapLow(step1[17] - step1[18]);
  1040. step2[19] = (short)WrapLow(step1[16] - step1[19]);
  1041. step2[20] = (short)WrapLow(-step1[20] + step1[23]);
  1042. step2[21] = (short)WrapLow(-step1[21] + step1[22]);
  1043. step2[22] = (short)WrapLow(step1[21] + step1[22]);
  1044. step2[23] = (short)WrapLow(step1[20] + step1[23]);
  1045. step2[24] = (short)WrapLow(step1[24] + step1[27]);
  1046. step2[25] = (short)WrapLow(step1[25] + step1[26]);
  1047. step2[26] = (short)WrapLow(step1[25] - step1[26]);
  1048. step2[27] = (short)WrapLow(step1[24] - step1[27]);
  1049. step2[28] = (short)WrapLow(-step1[28] + step1[31]);
  1050. step2[29] = (short)WrapLow(-step1[29] + step1[30]);
  1051. step2[30] = (short)WrapLow(step1[29] + step1[30]);
  1052. step2[31] = (short)WrapLow(step1[28] + step1[31]);
  1053. // stage 5
  // In the upper half, pairs (18,29), (19,28), (20,27), (21,26) are combined with cosine weights;
  // elements 16, 17, 22, 23, 24, 25, 30 and 31 pass through.
  1054. step1[0] = (short)WrapLow(step2[0] + step2[3]);
  1055. step1[1] = (short)WrapLow(step2[1] + step2[2]);
  1056. step1[2] = (short)WrapLow(step2[1] - step2[2]);
  1057. step1[3] = (short)WrapLow(step2[0] - step2[3]);
  1058. step1[4] = step2[4];
  1059. temp1 = (step2[6] - step2[5]) * CosPi16_64;
  1060. temp2 = (step2[5] + step2[6]) * CosPi16_64;
  1061. step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
  1062. step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
  1063. step1[7] = step2[7];
  1064. step1[8] = (short)WrapLow(step2[8] + step2[11]);
  1065. step1[9] = (short)WrapLow(step2[9] + step2[10]);
  1066. step1[10] = (short)WrapLow(step2[9] - step2[10]);
  1067. step1[11] = (short)WrapLow(step2[8] - step2[11]);
  1068. step1[12] = (short)WrapLow(-step2[12] + step2[15]);
  1069. step1[13] = (short)WrapLow(-step2[13] + step2[14]);
  1070. step1[14] = (short)WrapLow(step2[13] + step2[14]);
  1071. step1[15] = (short)WrapLow(step2[12] + step2[15]);
  1072. step1[16] = step2[16];
  1073. step1[17] = step2[17];
  1074. temp1 = -step2[18] * CosPi8_64 + step2[29] * CosPi24_64;
  1075. temp2 = step2[18] * CosPi24_64 + step2[29] * CosPi8_64;
  1076. step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
  1077. step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
  1078. temp1 = -step2[19] * CosPi8_64 + step2[28] * CosPi24_64;
  1079. temp2 = step2[19] * CosPi24_64 + step2[28] * CosPi8_64;
  1080. step1[19] = (short)WrapLow(DctConstRoundShift(temp1));
  1081. step1[28] = (short)WrapLow(DctConstRoundShift(temp2));
  1082. temp1 = -step2[20] * CosPi24_64 - step2[27] * CosPi8_64;
  1083. temp2 = -step2[20] * CosPi8_64 + step2[27] * CosPi24_64;
  1084. step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
  1085. step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
  1086. temp1 = -step2[21] * CosPi24_64 - step2[26] * CosPi8_64;
  1087. temp2 = -step2[21] * CosPi8_64 + step2[26] * CosPi24_64;
  1088. step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
  1089. step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
  1090. step1[22] = step2[22];
  1091. step1[23] = step2[23];
  1092. step1[24] = step2[24];
  1093. step1[25] = step2[25];
  1094. step1[30] = step2[30];
  1095. step1[31] = step2[31];
  1096. // stage 6
  // Elements 16..23 are mirrored butterflies (k with 39 - k) and elements 24..31 likewise
  // (k with 55 - k).
  1097. step2[0] = (short)WrapLow(step1[0] + step1[7]);
  1098. step2[1] = (short)WrapLow(step1[1] + step1[6]);
  1099. step2[2] = (short)WrapLow(step1[2] + step1[5]);
  1100. step2[3] = (short)WrapLow(step1[3] + step1[4]);
  1101. step2[4] = (short)WrapLow(step1[3] - step1[4]);
  1102. step2[5] = (short)WrapLow(step1[2] - step1[5]);
  1103. step2[6] = (short)WrapLow(step1[1] - step1[6]);
  1104. step2[7] = (short)WrapLow(step1[0] - step1[7]);
  1105. step2[8] = step1[8];
  1106. step2[9] = step1[9];
  1107. temp1 = (-step1[10] + step1[13]) * CosPi16_64;
  1108. temp2 = (step1[10] + step1[13]) * CosPi16_64;
  1109. step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
  1110. step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
  1111. temp1 = (-step1[11] + step1[12]) * CosPi16_64;
  1112. temp2 = (step1[11] + step1[12]) * CosPi16_64;
  1113. step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
  1114. step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
  1115. step2[14] = step1[14];
  1116. step2[15] = step1[15];
  1117. step2[16] = (short)WrapLow(step1[16] + step1[23]);
  1118. step2[17] = (short)WrapLow(step1[17] + step1[22]);
  1119. step2[18] = (short)WrapLow(step1[18] + step1[21]);
  1120. step2[19] = (short)WrapLow(step1[19] + step1[20]);
  1121. step2[20] = (short)WrapLow(step1[19] - step1[20]);
  1122. step2[21] = (short)WrapLow(step1[18] - step1[21]);
  1123. step2[22] = (short)WrapLow(step1[17] - step1[22]);
  1124. step2[23] = (short)WrapLow(step1[16] - step1[23]);
  1125. step2[24] = (short)WrapLow(-step1[24] + step1[31]);
  1126. step2[25] = (short)WrapLow(-step1[25] + step1[30]);
  1127. step2[26] = (short)WrapLow(-step1[26] + step1[29]);
  1128. step2[27] = (short)WrapLow(-step1[27] + step1[28]);
  1129. step2[28] = (short)WrapLow(step1[27] + step1[28]);
  1130. step2[29] = (short)WrapLow(step1[26] + step1[29]);
  1131. step2[30] = (short)WrapLow(step1[25] + step1[30]);
  1132. step2[31] = (short)WrapLow(step1[24] + step1[31]);
  1133. // stage 7
  // Elements 0..15 are mirrored butterflies (k with 15 - k); pairs (20,27), (21,26), (22,25),
  // (23,24) are scaled by CosPi16_64; 16..19 and 28..31 pass through.
  1134. step1[0] = (short)WrapLow(step2[0] + step2[15]);
  1135. step1[1] = (short)WrapLow(step2[1] + step2[14]);
  1136. step1[2] = (short)WrapLow(step2[2] + step2[13]);
  1137. step1[3] = (short)WrapLow(step2[3] + step2[12]);
  1138. step1[4] = (short)WrapLow(step2[4] + step2[11]);
  1139. step1[5] = (short)WrapLow(step2[5] + step2[10]);
  1140. step1[6] = (short)WrapLow(step2[6] + step2[9]);
  1141. step1[7] = (short)WrapLow(step2[7] + step2[8]);
  1142. step1[8] = (short)WrapLow(step2[7] - step2[8]);
  1143. step1[9] = (short)WrapLow(step2[6] - step2[9]);
  1144. step1[10] = (short)WrapLow(step2[5] - step2[10]);
  1145. step1[11] = (short)WrapLow(step2[4] - step2[11]);
  1146. step1[12] = (short)WrapLow(step2[3] - step2[12]);
  1147. step1[13] = (short)WrapLow(step2[2] - step2[13]);
  1148. step1[14] = (short)WrapLow(step2[1] - step2[14]);
  1149. step1[15] = (short)WrapLow(step2[0] - step2[15]);
  1150. step1[16] = step2[16];
  1151. step1[17] = step2[17];
  1152. step1[18] = step2[18];
  1153. step1[19] = step2[19];
  1154. temp1 = (-step2[20] + step2[27]) * CosPi16_64;
  1155. temp2 = (step2[20] + step2[27]) * CosPi16_64;
  1156. step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
  1157. step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
  1158. temp1 = (-step2[21] + step2[26]) * CosPi16_64;
  1159. temp2 = (step2[21] + step2[26]) * CosPi16_64;
  1160. step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
  1161. step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
  1162. temp1 = (-step2[22] + step2[25]) * CosPi16_64;
  1163. temp2 = (step2[22] + step2[25]) * CosPi16_64;
  1164. step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
  1165. step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
  1166. temp1 = (-step2[23] + step2[24]) * CosPi16_64;
  1167. temp2 = (step2[23] + step2[24]) * CosPi16_64;
  1168. step1[23] = (short)WrapLow(DctConstRoundShift(temp1));
  1169. step1[24] = (short)WrapLow(DctConstRoundShift(temp2));
  1170. step1[28] = step2[28];
  1171. step1[29] = step2[29];
  1172. step1[30] = step2[30];
  1173. step1[31] = step2[31];
  1174. // final stage
  // Final mirrored butterfly: output[k] = step1[k] + step1[31 - k] and
  // output[31 - k] = step1[k] - step1[31 - k] for k = 0..15.
  1175. output[0] = WrapLow(step1[0] + step1[31]);
  1176. output[1] = WrapLow(step1[1] + step1[30]);
  1177. output[2] = WrapLow(step1[2] + step1[29]);
  1178. output[3] = WrapLow(step1[3] + step1[28]);
  1179. output[4] = WrapLow(step1[4] + step1[27]);
  1180. output[5] = WrapLow(step1[5] + step1[26]);
  1181. output[6] = WrapLow(step1[6] + step1[25]);
  1182. output[7] = WrapLow(step1[7] + step1[24]);
  1183. output[8] = WrapLow(step1[8] + step1[23]);
  1184. output[9] = WrapLow(step1[9] + step1[22]);
  1185. output[10] = WrapLow(step1[10] + step1[21]);
  1186. output[11] = WrapLow(step1[11] + step1[20]);
  1187. output[12] = WrapLow(step1[12] + step1[19]);
  1188. output[13] = WrapLow(step1[13] + step1[18]);
  1189. output[14] = WrapLow(step1[14] + step1[17]);
  1190. output[15] = WrapLow(step1[15] + step1[16]);
  1191. output[16] = WrapLow(step1[15] - step1[16]);
  1192. output[17] = WrapLow(step1[14] - step1[17]);
  1193. output[18] = WrapLow(step1[13] - step1[18]);
  1194. output[19] = WrapLow(step1[12] - step1[19]);
  1195. output[20] = WrapLow(step1[11] - step1[20]);
  1196. output[21] = WrapLow(step1[10] - step1[21]);
  1197. output[22] = WrapLow(step1[9] - step1[22]);
  1198. output[23] = WrapLow(step1[8] - step1[23]);
  1199. output[24] = WrapLow(step1[7] - step1[24]);
  1200. output[25] = WrapLow(step1[6] - step1[25]);
  1201. output[26] = WrapLow(step1[5] - step1[26]);
  1202. output[27] = WrapLow(step1[4] - step1[27]);
  1203. output[28] = WrapLow(step1[3] - step1[28]);
  1204. output[29] = WrapLow(step1[2] - step1[29]);
  1205. output[30] = WrapLow(step1[1] - step1[30]);
  1206. output[31] = WrapLow(step1[0] - step1[31]);
  1207. }
  /// <summary>
  /// Runs the full two-dimensional 32x32 inverse DCT over <paramref name="input"/> (all 32 rows,
  /// then all 32 columns) and adds the rounded result onto the pixels in <paramref name="dest"/>.
  /// Rows whose coefficients are all zero in their low 16 bits skip the row transform.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 32 per row.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent pixels in <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 32;

      // No up-front zero-fill: each row below is either transformed or explicitly cleared.
      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> residual = stackalloc int[Size];

      // Row pass.
      for (int row = 0; row < Size; row++)
      {
          ReadOnlySpan<int> coeffs = input.Slice(row * Size);
          Span<int> target = rowPass.Slice(row * Size);

          // OR the row together; only the low 16 bits count because Idct32 truncates each
          // coefficient to a short before using it.
          int seenBits = 0;
          for (int k = 0; k < Size; k++)
          {
              seenBits |= coeffs[k];
          }

          if ((seenBits & 0xFFFF) == 0)
          {
              target.Slice(0, Size).Clear();
              continue;
          }

          Idct32(coeffs, target);
      }

      // Column pass, accumulating into dest.
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct32(column, residual);

          for (int y = 0; y < Size; y++)
          {
              int pos = y * stride + x;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(residual[y], 6));
          }
      }
  }
  /// <summary>
  /// 32x32 inverse DCT for blocks whose non-zero coefficients all lie in the upper-left 16x16
  /// area: only the first 16 rows go through the row pass, the remaining rows of the intermediate
  /// block stay zero, and all 32 columns are then transformed and added onto <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 32 per row; the first 16 rows are read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent pixels in <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 32;
      const int ActiveRows = 16;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> residual = stackalloc int[Size];

      // Locals are not zero-initialised here, and rows past ActiveRows are never written by the row pass.
      rowPass.Clear();

      // Row pass, restricted to the rows that can hold non-zero coefficients.
      for (int row = 0; row < ActiveRows; row++)
      {
          int offset = row * Size;
          Idct32(input.Slice(offset), rowPass.Slice(offset));
      }

      // Column pass over the whole intermediate block.
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct32(column, residual);

          for (int y = 0; y < Size; y++)
          {
              int pos = y * stride + x;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(residual[y], 6));
          }
      }
  }
  /// <summary>
  /// 32x32 inverse DCT for blocks whose non-zero coefficients all lie in the upper-left 8x8 area:
  /// only the first 8 rows go through the row pass, the remaining rows of the intermediate block
  /// stay zero, and all 32 columns are then transformed and added onto <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients in row-major order, 32 per row; the first 8 rows are read.</param>
  /// <param name="dest">Destination pixels, updated in place.</param>
  /// <param name="stride">Element distance between vertically adjacent pixels in <paramref name="dest"/>.</param>
  [SkipLocalsInit]
  public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      const int Size = 32;
      const int ActiveRows = 8;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> column = stackalloc int[Size];
      Span<int> residual = stackalloc int[Size];

      // Locals are not zero-initialised here, and rows past ActiveRows are never written by the row pass.
      rowPass.Clear();

      // Row pass, restricted to the rows that can hold non-zero coefficients.
      for (int row = 0; row < ActiveRows; row++)
      {
          int offset = row * Size;
          Idct32(input.Slice(offset), rowPass.Slice(offset));
      }

      // Column pass over the whole intermediate block.
      for (int x = 0; x < Size; x++)
      {
          for (int y = 0; y < Size; y++)
          {
              column[y] = rowPass[y * Size + x];
          }

          Idct32(column, residual);

          for (int y = 0; y < Size; y++)
          {
              int pos = y * stride + x;
              dest[pos] = ClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(residual[y], 6));
          }
      }
  }
  /// <summary>
  /// DC-only variant of the 32x32 inverse transform: derives a single value from <c>input[0]</c>
  /// (scaled twice by <c>CosPi16_64</c>, then rounded by 6 bits) and adds it to every pixel of the
  /// 32x32 block that starts at <c>dest[0]</c>.
  /// </summary>
  /// <param name="input">Coefficient buffer; only element 0 is read.</param>
  /// <param name="dest">
  /// Destination pixels, updated in place through <c>ClipPixelAdd</c>. Must hold at least
  /// <c>31 * stride + 32</c> elements; nothing past the last touched pixel is required.
  /// </param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  public static void Idct32x321Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  {
      int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
      output = WrapLow(DctConstRoundShift(output * CosPi16_64));
      long a1 = BitUtils.RoundPowerOfTwo(output, 6);

      for (int row = 0; row < 32; ++row)
      {
          // Each row is addressed from the start of dest instead of advancing the span by
          // stride after every row: that final advance threw ArgumentOutOfRangeException
          // whenever dest ended right after the last row of the block.
          Span<byte> line = dest.Slice(row * stride, 32);

          for (int col = 0; col < 32; ++col)
          {
              line[col] = ClipPixelAdd(line[col], a1);
          }
      }
  }
  /// <summary>
  /// Inverse 4x4 Walsh-Hadamard transform over all 16 coefficients for high-bit-depth content.
  /// A first pass transforms each group of four consecutive inputs, a second pass transforms the
  /// columns of that intermediate result and adds them to <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">16 coefficients; each one is shifted right by <c>UnitQuantShift</c> before use.</param>
  /// <param name="dest">Destination samples, updated in place through <c>HighbdClipPixelAdd</c>.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
  [SkipLocalsInit]
  public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
         0.5 shifts per pixel. */
      Span<int> firstPass = stackalloc int[16];

      // First pass: one 4-point transform per group of four consecutive inputs.
      for (int row = 0; row < 4; row++)
      {
          int at = row * 4;

          // Note the operand order: the fourth value of each group plays the role of "b".
          long a = input[at + 0] >> UnitQuantShift;
          long c = input[at + 1] >> UnitQuantShift;
          long d = input[at + 2] >> UnitQuantShift;
          long b = input[at + 3] >> UnitQuantShift;

          a += c;
          d -= b;
          long e = (a - d) >> 1;
          b = e - b;
          c = e - c;
          a -= b;
          d += c;

          firstPass[at + 0] = HighbdWrapLow(a, bd);
          firstPass[at + 1] = HighbdWrapLow(b, bd);
          firstPass[at + 2] = HighbdWrapLow(c, bd);
          firstPass[at + 3] = HighbdWrapLow(d, bd);
      }

      // Second pass: the same butterfly down each column, accumulated into dest.
      for (int col = 0; col < 4; col++)
      {
          long a = firstPass[4 * 0 + col];
          long c = firstPass[4 * 1 + col];
          long d = firstPass[4 * 2 + col];
          long b = firstPass[4 * 3 + col];

          a += c;
          d -= b;
          long e = (a - d) >> 1;
          b = e - b;
          c = e - c;
          a -= b;
          d += c;

          dest[stride * 0 + col] = HighbdClipPixelAdd(dest[stride * 0 + col], HighbdWrapLow(a, bd), bd);
          dest[stride * 1 + col] = HighbdClipPixelAdd(dest[stride * 1 + col], HighbdWrapLow(b, bd), bd);
          dest[stride * 2 + col] = HighbdClipPixelAdd(dest[stride * 2 + col], HighbdWrapLow(c, bd), bd);
          dest[stride * 3 + col] = HighbdClipPixelAdd(dest[stride * 3 + col], HighbdWrapLow(d, bd), bd);
      }
  }
  /// <summary>
  /// DC-only variant of the inverse 4x4 Walsh-Hadamard transform for high-bit-depth content:
  /// only <c>input[0]</c> contributes, and the result is added to the 4x4 block at the start of
  /// <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficient buffer; only element 0 is read.</param>
  /// <param name="dest">Destination samples, updated in place through <c>HighbdClipPixelAdd</c>.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
  [SkipLocalsInit]
  public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> firstPass = stackalloc int[4];

      // First pass: split the (shifted) DC value into a "half" part and the remainder.
      long dc = input[0] >> UnitQuantShift;
      long half = dc >> 1;
      dc -= half;

      int wrappedHalf = HighbdWrapLow(half, bd);
      firstPass[0] = HighbdWrapLow(dc, bd);
      firstPass[1] = wrappedHalf;
      firstPass[2] = wrappedHalf;
      firstPass[3] = wrappedHalf;

      // Second pass: the same split per column; the top row receives the remainder and the
      // three rows below receive the half.
      for (int col = 0; col < 4; col++)
      {
          int value = firstPass[col];
          long lower = value >> 1;
          long top = value - lower;

          dest[stride * 0 + col] = HighbdClipPixelAdd(dest[stride * 0 + col], top, bd);
          dest[stride * 1 + col] = HighbdClipPixelAdd(dest[stride * 1 + col], lower, bd);
          dest[stride * 2 + col] = HighbdClipPixelAdd(dest[stride * 2 + col], lower, bd);
          dest[stride * 3 + col] = HighbdClipPixelAdd(dest[stride * 3 + col], lower, bd);
      }
  }
  /// <summary>
  /// 4-point high-bit-depth inverse transform kernel ("Iadst4") built on the <c>SinPi*_9</c> constants.
  /// </summary>
  /// <param name="input">Source coefficients; elements 0..3 are read.</param>
  /// <param name="output">Receives the four results in elements 0..3.</param>
  /// <param name="bd">Bit depth, forwarded to every <c>HighbdWrapLow</c> call.</param>
  public static void HighbdIadst4(ReadOnlySpan<int> input, Span<int> output, int bd)
  {
      int in0 = input[0];
      int in1 = input[1];
      int in2 = input[2];
      int in3 = input[3];

      // Input rejected by the range check: flag it in debug builds and emit zeros.
      if (DetectInvalidHighbdInput(input, 4) != 0)
      {
          Debug.Assert(false, "invalid highbd txfm input");
          output.Slice(0, 4).Fill(0);

          return;
      }

      // All-zero input maps to all-zero output; skip the arithmetic.
      if ((in0 | in1 | in2 | in3) == 0)
      {
          output.Slice(0, 4).Fill(0);

          return;
      }

      // Every product is formed in 64-bit arithmetic.
      long sumA = (long)SinPi1_9 * in0 + (long)SinPi4_9 * in2 + (long)SinPi2_9 * in3;
      long sumB = (long)SinPi2_9 * in0 - (long)SinPi1_9 * in2 - (long)SinPi4_9 * in3;
      long mid = (long)SinPi3_9 * in1;
      long wrappedMix = HighbdWrapLow(in0 - in2 + in3, bd);
      long cross = SinPi3_9 * wrappedMix;

      // 1-D transform scaling factor is sqrt(2).
      // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
      // + 1b (addition) = 29b.
      // Hence the output bit depth is 15b.
      output[0] = HighbdWrapLow(DctConstRoundShift(sumA + mid), bd);
      output[1] = HighbdWrapLow(DctConstRoundShift(sumB + mid), bd);
      output[2] = HighbdWrapLow(DctConstRoundShift(cross), bd);
      output[3] = HighbdWrapLow(DctConstRoundShift(sumA + sumB - mid), bd);
  }
  /// <summary>
  /// 4-point high-bit-depth inverse DCT kernel. All four inputs are consumed before any output is
  /// written, so <paramref name="input"/> and <paramref name="output"/> may refer to the same storage
  /// (the 8-point kernel in this file calls it that way).
  /// </summary>
  /// <param name="input">Source coefficients; elements 0..3 are read.</param>
  /// <param name="output">Receives the four results in elements 0..3.</param>
  /// <param name="bd">Bit depth, forwarded to every <c>HighbdWrapLow</c> call.</param>
  [SkipLocalsInit]
  public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
  {
      // Input rejected by the range check: flag it in debug builds and emit zeros.
      if (DetectInvalidHighbdInput(input, 4) != 0)
      {
          Debug.Assert(false, "invalid highbd txfm input");
          output.Slice(0, 4).Fill(0);

          return;
      }

      // stage 1 (products are formed in 64-bit arithmetic)
      long evenSum = (input[0] + input[2]) * (long)CosPi16_64;
      long evenDiff = (input[0] - input[2]) * (long)CosPi16_64;
      int even0 = HighbdWrapLow(DctConstRoundShift(evenSum), bd);
      int even1 = HighbdWrapLow(DctConstRoundShift(evenDiff), bd);

      long oddLow = input[1] * (long)CosPi24_64 - input[3] * (long)CosPi8_64;
      long oddHigh = input[1] * (long)CosPi8_64 + input[3] * (long)CosPi24_64;
      int odd0 = HighbdWrapLow(DctConstRoundShift(oddLow), bd);
      int odd1 = HighbdWrapLow(DctConstRoundShift(oddHigh), bd);

      // stage 2
      output[0] = HighbdWrapLow(even0 + odd1, bd);
      output[1] = HighbdWrapLow(even1 + odd0, bd);
      output[2] = HighbdWrapLow(even1 - odd0, bd);
      output[3] = HighbdWrapLow(even0 - odd1, bd);
  }
  /// <summary>
  /// Full 4x4 high-bit-depth inverse DCT: <c>HighbdIdct4</c> over each row, then over each column,
  /// with the column results rounded by 4 bits and added to <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">16 coefficients stored as four consecutive rows of 4.</param>
  /// <param name="dest">Destination samples, updated in place through <c>HighbdClipPixelAdd</c>.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the kernel and to <c>HighbdClipPixelAdd</c>.</param>
  [SkipLocalsInit]
  public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      const int Size = 4;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> columnIn = stackalloc int[Size];
      Span<int> columnOut = stackalloc int[Size];

      // Rows (every row is produced, so the buffer needs no clearing)
      for (int row = 0; row < Size; ++row)
      {
          int offset = row * Size;
          HighbdIdct4(input.Slice(offset), rowPass.Slice(offset), bd);
      }

      // Columns
      for (int col = 0; col < Size; ++col)
      {
          for (int row = 0; row < Size; ++row)
          {
              columnIn[row] = rowPass[row * Size + col];
          }

          HighbdIdct4(columnIn, columnOut, bd);

          for (int row = 0; row < Size; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 4), bd);
          }
      }
  }
  /// <summary>
  /// DC-only variant of the 4x4 high-bit-depth inverse DCT: derives a single value from
  /// <c>input[0]</c> (scaled twice by <c>CosPi16_64</c>, then rounded by 4 bits) and adds it to every
  /// sample of the 4x4 block that starts at <c>dest[0]</c>.
  /// </summary>
  /// <param name="input">Coefficient buffer; only element 0 is read.</param>
  /// <param name="dest">
  /// Destination samples, updated in place through <c>HighbdClipPixelAdd</c>. Must hold at least
  /// <c>3 * stride + 4</c> elements; nothing past the last touched sample is required.
  /// </param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
  public static void HighbdIdct4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
      long a1 = BitUtils.RoundPowerOfTwo(output, 4);

      for (int row = 0; row < 4; row++)
      {
          // Rows are addressed from the start of dest rather than by advancing the span by
          // stride after each row; the advance after the last row threw
          // ArgumentOutOfRangeException when dest ended right after the block.
          Span<ushort> line = dest.Slice(row * stride, 4);

          line[0] = HighbdClipPixelAdd(line[0], a1, bd);
          line[1] = HighbdClipPixelAdd(line[1], a1, bd);
          line[2] = HighbdClipPixelAdd(line[2], a1, bd);
          line[3] = HighbdClipPixelAdd(line[3], a1, bd);
      }
  }
  /// <summary>
  /// 8-point high-bit-depth inverse transform kernel ("Iadst8"): consumes eight coefficients from
  /// <paramref name="input"/> and writes eight results to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">Source coefficients; elements 0..7 are read.</param>
  /// <param name="output">Receives the eight results in elements 0..7.</param>
  /// <param name="bd">Bit depth, forwarded to every <c>HighbdWrapLow</c> call.</param>
  1515. public static void HighbdIadst8(ReadOnlySpan<int> input, Span<int> output, int bd)
  1516. {
  1517. long s0, s1, s2, s3, s4, s5, s6, s7;
  // The inputs are loaded in permuted order (7, 0, 5, 2, 3, 4, 1, 6).
  1518. int x0 = input[7];
  1519. int x1 = input[0];
  1520. int x2 = input[5];
  1521. int x3 = input[2];
  1522. int x4 = input[3];
  1523. int x5 = input[4];
  1524. int x6 = input[1];
  1525. int x7 = input[6];
  // Input rejected by the range check: flag it in debug builds and emit zeros.
  1526. if (DetectInvalidHighbdInput(input, 8) != 0)
  1527. {
  1528. Debug.Assert(false, "invalid highbd txfm input");
  1529. output.Slice(0, 8).Fill(0);
  1530. return;
  1531. }
  // All-zero input maps to all-zero output; skip the arithmetic.
  1532. if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0)
  1533. {
  1534. output.Slice(0, 8).Fill(0);
  1535. return;
  1536. }
  // Throughout, s* hold 64-bit intermediate products/sums and x* hold the rounded, wrapped
  // values of the current stage; both sets are overwritten from stage to stage.
  1537. // stage 1
  1538. s0 = (long)CosPi2_64 * x0 + (long)CosPi30_64 * x1;
  1539. s1 = (long)CosPi30_64 * x0 - (long)CosPi2_64 * x1;
  1540. s2 = (long)CosPi10_64 * x2 + (long)CosPi22_64 * x3;
  1541. s3 = (long)CosPi22_64 * x2 - (long)CosPi10_64 * x3;
  1542. s4 = (long)CosPi18_64 * x4 + (long)CosPi14_64 * x5;
  1543. s5 = (long)CosPi14_64 * x4 - (long)CosPi18_64 * x5;
  1544. s6 = (long)CosPi26_64 * x6 + (long)CosPi6_64 * x7;
  1545. s7 = (long)CosPi6_64 * x6 - (long)CosPi26_64 * x7;
  1546. x0 = HighbdWrapLow(DctConstRoundShift(s0 + s4), bd);
  1547. x1 = HighbdWrapLow(DctConstRoundShift(s1 + s5), bd);
  1548. x2 = HighbdWrapLow(DctConstRoundShift(s2 + s6), bd);
  1549. x3 = HighbdWrapLow(DctConstRoundShift(s3 + s7), bd);
  1550. x4 = HighbdWrapLow(DctConstRoundShift(s0 - s4), bd);
  1551. x5 = HighbdWrapLow(DctConstRoundShift(s1 - s5), bd);
  1552. x6 = HighbdWrapLow(DctConstRoundShift(s2 - s6), bd);
  1553. x7 = HighbdWrapLow(DctConstRoundShift(s3 - s7), bd);
  1554. // stage 2
  // x0..x3 pass through unscaled (no DctConstRoundShift on their sums below);
  // x4..x7 are multiplied by constants and therefore rounded.
  1555. s0 = x0;
  1556. s1 = x1;
  1557. s2 = x2;
  1558. s3 = x3;
  1559. s4 = (long)CosPi8_64 * x4 + (long)CosPi24_64 * x5;
  1560. s5 = (long)CosPi24_64 * x4 - (long)CosPi8_64 * x5;
  1561. s6 = (long)(-CosPi24_64) * x6 + (long)CosPi8_64 * x7;
  1562. s7 = (long)CosPi8_64 * x6 + (long)CosPi24_64 * x7;
  1563. x0 = HighbdWrapLow(s0 + s2, bd);
  1564. x1 = HighbdWrapLow(s1 + s3, bd);
  1565. x2 = HighbdWrapLow(s0 - s2, bd);
  1566. x3 = HighbdWrapLow(s1 - s3, bd);
  1567. x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
  1568. x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
  1569. x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
  1570. x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
  1571. // stage 3
  // Only x2, x3, x6 and x7 are modified here; x0, x1, x4 and x5 go to the output as they are.
  1572. s2 = (long)CosPi16_64 * (x2 + x3);
  1573. s3 = (long)CosPi16_64 * (x2 - x3);
  1574. s6 = (long)CosPi16_64 * (x6 + x7);
  1575. s7 = (long)CosPi16_64 * (x6 - x7);
  1576. x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
  1577. x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
  1578. x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
  1579. x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
  // Final permutation; the odd-numbered outputs are negated.
  1580. output[0] = HighbdWrapLow(x0, bd);
  1581. output[1] = HighbdWrapLow(-x4, bd);
  1582. output[2] = HighbdWrapLow(x6, bd);
  1583. output[3] = HighbdWrapLow(-x2, bd);
  1584. output[4] = HighbdWrapLow(x3, bd);
  1585. output[5] = HighbdWrapLow(-x7, bd);
  1586. output[6] = HighbdWrapLow(x5, bd);
  1587. output[7] = HighbdWrapLow(-x1, bd);
  1588. }
  /// <summary>
  /// 8-point high-bit-depth inverse DCT kernel. The even half is delegated to <c>HighbdIdct4</c>;
  /// the odd half is computed here and both halves are combined in the last stage.
  /// </summary>
  /// <param name="input">Source coefficients; elements 0..7 are read.</param>
  /// <param name="output">Receives the eight results in elements 0..7.</param>
  /// <param name="bd">Bit depth, forwarded to every <c>HighbdWrapLow</c> call.</param>
  [SkipLocalsInit]
  public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
  {
      Span<int> work = stackalloc int[8];

      // Input rejected by the range check: flag it in debug builds and emit zeros.
      if (DetectInvalidHighbdInput(input, 8) != 0)
      {
          Debug.Assert(false, "invalid highbd txfm input");
          output.Slice(0, 8).Fill(0);

          return;
      }

      // stage 1 - even-indexed inputs are only reordered
      work[0] = input[0];
      work[1] = input[2];
      work[2] = input[4];
      work[3] = input[6];

      // stage 1 - odd-indexed inputs are rotated in pairs (64-bit products)
      long rotA = input[1] * (long)CosPi28_64 - input[7] * (long)CosPi4_64;
      long rotB = input[1] * (long)CosPi4_64 + input[7] * (long)CosPi28_64;
      work[4] = HighbdWrapLow(DctConstRoundShift(rotA), bd);
      work[7] = HighbdWrapLow(DctConstRoundShift(rotB), bd);

      rotA = input[5] * (long)CosPi12_64 - input[3] * (long)CosPi20_64;
      rotB = input[5] * (long)CosPi20_64 + input[3] * (long)CosPi12_64;
      work[5] = HighbdWrapLow(DctConstRoundShift(rotA), bd);
      work[6] = HighbdWrapLow(DctConstRoundShift(rotB), bd);

      // stage 2 & stage 3 - even half, transformed in place (elements 0..3)
      HighbdIdct4(work, work, bd);

      // stage 2 - odd half
      int sum45 = HighbdWrapLow(work[4] + work[5], bd);
      int diff45 = HighbdWrapLow(work[4] - work[5], bd);
      int diff76 = HighbdWrapLow(-work[6] + work[7], bd);
      int sum67 = HighbdWrapLow(work[6] + work[7], bd);

      // stage 3 - odd half
      rotA = (diff76 - diff45) * (long)CosPi16_64;
      rotB = (diff45 + diff76) * (long)CosPi16_64;
      int mid5 = HighbdWrapLow(DctConstRoundShift(rotA), bd);
      int mid6 = HighbdWrapLow(DctConstRoundShift(rotB), bd);

      // stage 4 - mirror the even half against the odd half
      output[0] = HighbdWrapLow(work[0] + sum67, bd);
      output[1] = HighbdWrapLow(work[1] + mid6, bd);
      output[2] = HighbdWrapLow(work[2] + mid5, bd);
      output[3] = HighbdWrapLow(work[3] + sum45, bd);
      output[4] = HighbdWrapLow(work[3] - sum45, bd);
      output[5] = HighbdWrapLow(work[2] - mid5, bd);
      output[6] = HighbdWrapLow(work[1] - mid6, bd);
      output[7] = HighbdWrapLow(work[0] - sum67, bd);
  }
  /// <summary>
  /// Full 8x8 high-bit-depth inverse DCT: <c>HighbdIdct8</c> over each row, then over each column,
  /// with the column results rounded by 5 bits and added to <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">64 coefficients stored as eight consecutive rows of 8.</param>
  /// <param name="dest">Destination samples, updated in place through <c>HighbdClipPixelAdd</c>.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the kernel and to <c>HighbdClipPixelAdd</c>.</param>
  [SkipLocalsInit]
  public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      const int Size = 8;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> columnIn = stackalloc int[Size];
      Span<int> columnOut = stackalloc int[Size];

      // First transform rows (all of them, so the buffer needs no clearing)
      for (int row = 0; row < Size; ++row)
      {
          int offset = row * Size;
          HighbdIdct8(input.Slice(offset), rowPass.Slice(offset), bd);
      }

      // Then transform columns
      for (int col = 0; col < Size; ++col)
      {
          for (int row = 0; row < Size; ++row)
          {
              columnIn[row] = rowPass[row * Size + col];
          }

          HighbdIdct8(columnIn, columnOut, bd);

          for (int row = 0; row < Size; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 5), bd);
          }
      }
  }
  /// <summary>
  /// 8x8 high-bit-depth inverse DCT for blocks in which only the first four rows carry non-zero
  /// coefficients: <c>HighbdIdct8</c> over those rows, then over all eight columns, with the column
  /// results rounded by 5 bits and added to <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">Coefficients stored as consecutive rows of 8; only the first 4 rows are transformed.</param>
  /// <param name="dest">Destination samples, updated in place through <c>HighbdClipPixelAdd</c>.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the kernel and to <c>HighbdClipPixelAdd</c>.</param>
  [SkipLocalsInit]
  public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      const int Size = 8;
      const int CodedRows = 4;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> columnIn = stackalloc int[Size];
      Span<int> columnOut = stackalloc int[Size];

      // Rows 4..7 are never written by the row pass and the stack is not pre-zeroed
      // ([SkipLocalsInit]), hence the explicit fill.
      rowPass.Fill(0);

      // First transform rows: only the first 4 rows have non-zero coefficients.
      for (int row = 0; row < CodedRows; ++row)
      {
          int offset = row * Size;
          HighbdIdct8(input.Slice(offset), rowPass.Slice(offset), bd);
      }

      // Then transform columns
      for (int col = 0; col < Size; ++col)
      {
          for (int row = 0; row < Size; ++row)
          {
              columnIn[row] = rowPass[row * Size + col];
          }

          HighbdIdct8(columnIn, columnOut, bd);

          for (int row = 0; row < Size; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 5), bd);
          }
      }
  }
  /// <summary>
  /// DC-only variant of the 8x8 high-bit-depth inverse DCT: derives a single value from
  /// <c>input[0]</c> (scaled twice by <c>CosPi16_64</c>, then rounded by 5 bits) and adds it to every
  /// sample of the 8x8 block that starts at <c>dest[0]</c>.
  /// </summary>
  /// <param name="input">Coefficient buffer; only element 0 is read.</param>
  /// <param name="dest">
  /// Destination samples, updated in place through <c>HighbdClipPixelAdd</c>. Must hold at least
  /// <c>7 * stride + 8</c> elements; nothing past the last touched sample is required.
  /// </param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to <c>HighbdWrapLow</c> and <c>HighbdClipPixelAdd</c>.</param>
  public static void vpx_Highbdidct8x8_1_add_c(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
      long a1 = BitUtils.RoundPowerOfTwo(output, 5);

      for (int row = 0; row < 8; ++row)
      {
          // Rows are addressed from the start of dest rather than by advancing the span by
          // stride after each row; the advance after the last row threw
          // ArgumentOutOfRangeException when dest ended right after the block.
          Span<ushort> line = dest.Slice(row * stride, 8);

          for (int col = 0; col < 8; ++col)
          {
              line[col] = HighbdClipPixelAdd(line[col], a1, bd);
          }
      }
  }
  /// <summary>
  /// 16-point high-bit-depth inverse transform kernel ("Iadst16"): consumes sixteen coefficients from
  /// <paramref name="input"/> and writes sixteen results to <paramref name="output"/>.
  /// </summary>
  /// <param name="input">Source coefficients; elements 0..15 are read.</param>
  /// <param name="output">Receives the sixteen results in elements 0..15.</param>
  /// <param name="bd">Bit depth, forwarded to every <c>HighbdWrapLow</c> call.</param>
  1714. public static void HighbdIadst16(ReadOnlySpan<int> input, Span<int> output, int bd)
  1715. {
  1716. long s0, s1, s2, s3, s4, s5, s6, s7, s8;
  1717. long s9, s10, s11, s12, s13, s14, s15;
  // The inputs are loaded in permuted order: even-numbered x* take input[15], [13], ... [1],
  // odd-numbered x* take input[0], [2], ... [14].
  1718. int x0 = input[15];
  1719. int x1 = input[0];
  1720. int x2 = input[13];
  1721. int x3 = input[2];
  1722. int x4 = input[11];
  1723. int x5 = input[4];
  1724. int x6 = input[9];
  1725. int x7 = input[6];
  1726. int x8 = input[7];
  1727. int x9 = input[8];
  1728. int x10 = input[5];
  1729. int x11 = input[10];
  1730. int x12 = input[3];
  1731. int x13 = input[12];
  1732. int x14 = input[1];
  1733. int x15 = input[14];
  // Input rejected by the range check: flag it in debug builds and emit zeros.
  1734. if (DetectInvalidHighbdInput(input, 16) != 0)
  1735. {
  1736. Debug.Assert(false, "invalid highbd txfm input");
  1737. output.Slice(0, 16).Fill(0);
  1738. return;
  1739. }
  // All-zero input maps to all-zero output; skip the arithmetic.
  1740. if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0)
  1741. {
  1742. output.Slice(0, 16).Fill(0);
  1743. return;
  1744. }
  // Throughout, s* hold 64-bit intermediate products/sums and x* hold the wrapped values of
  // the current stage; both sets are overwritten from stage to stage, so statement order matters.
  1745. // stage 1
  1746. s0 = x0 * (long)CosPi1_64 + x1 * (long)CosPi31_64;
  1747. s1 = x0 * (long)CosPi31_64 - x1 * (long)CosPi1_64;
  1748. s2 = x2 * (long)CosPi5_64 + x3 * (long)CosPi27_64;
  1749. s3 = x2 * (long)CosPi27_64 - x3 * (long)CosPi5_64;
  1750. s4 = x4 * (long)CosPi9_64 + x5 * (long)CosPi23_64;
  1751. s5 = x4 * (long)CosPi23_64 - x5 * (long)CosPi9_64;
  1752. s6 = x6 * (long)CosPi13_64 + x7 * (long)CosPi19_64;
  1753. s7 = x6 * (long)CosPi19_64 - x7 * (long)CosPi13_64;
  1754. s8 = x8 * (long)CosPi17_64 + x9 * (long)CosPi15_64;
  1755. s9 = x8 * (long)CosPi15_64 - x9 * (long)CosPi17_64;
  1756. s10 = x10 * (long)CosPi21_64 + x11 * (long)CosPi11_64;
  1757. s11 = x10 * (long)CosPi11_64 - x11 * (long)CosPi21_64;
  1758. s12 = x12 * (long)CosPi25_64 + x13 * (long)CosPi7_64;
  1759. s13 = x12 * (long)CosPi7_64 - x13 * (long)CosPi25_64;
  1760. s14 = x14 * (long)CosPi29_64 + x15 * (long)CosPi3_64;
  1761. s15 = x14 * (long)CosPi3_64 - x15 * (long)CosPi29_64;
  1762. x0 = HighbdWrapLow(DctConstRoundShift(s0 + s8), bd);
  1763. x1 = HighbdWrapLow(DctConstRoundShift(s1 + s9), bd);
  1764. x2 = HighbdWrapLow(DctConstRoundShift(s2 + s10), bd);
  1765. x3 = HighbdWrapLow(DctConstRoundShift(s3 + s11), bd);
  1766. x4 = HighbdWrapLow(DctConstRoundShift(s4 + s12), bd);
  1767. x5 = HighbdWrapLow(DctConstRoundShift(s5 + s13), bd);
  1768. x6 = HighbdWrapLow(DctConstRoundShift(s6 + s14), bd);
  1769. x7 = HighbdWrapLow(DctConstRoundShift(s7 + s15), bd);
  1770. x8 = HighbdWrapLow(DctConstRoundShift(s0 - s8), bd);
  1771. x9 = HighbdWrapLow(DctConstRoundShift(s1 - s9), bd);
  1772. x10 = HighbdWrapLow(DctConstRoundShift(s2 - s10), bd);
  1773. x11 = HighbdWrapLow(DctConstRoundShift(s3 - s11), bd);
  1774. x12 = HighbdWrapLow(DctConstRoundShift(s4 - s12), bd);
  1775. x13 = HighbdWrapLow(DctConstRoundShift(s5 - s13), bd);
  1776. x14 = HighbdWrapLow(DctConstRoundShift(s6 - s14), bd);
  1777. x15 = HighbdWrapLow(DctConstRoundShift(s7 - s15), bd);
  1778. // stage 2
  // x0..x7 are copied unscaled (their sums below skip DctConstRoundShift);
  // x8..x15 are multiplied by constants and therefore rounded.
  1779. s0 = x0;
  1780. s1 = x1;
  1781. s2 = x2;
  1782. s3 = x3;
  1783. s4 = x4;
  1784. s5 = x5;
  1785. s6 = x6;
  1786. s7 = x7;
  1787. s8 = x8 * (long)CosPi4_64 + x9 * (long)CosPi28_64;
  1788. s9 = x8 * (long)CosPi28_64 - x9 * (long)CosPi4_64;
  1789. s10 = x10 * (long)CosPi20_64 + x11 * (long)CosPi12_64;
  1790. s11 = x10 * (long)CosPi12_64 - x11 * (long)CosPi20_64;
  1791. s12 = -x12 * (long)CosPi28_64 + x13 * (long)CosPi4_64;
  1792. s13 = x12 * (long)CosPi4_64 + x13 * (long)CosPi28_64;
  1793. s14 = -x14 * (long)CosPi12_64 + x15 * (long)CosPi20_64;
  1794. s15 = x14 * (long)CosPi20_64 + x15 * (long)CosPi12_64;
  1795. x0 = HighbdWrapLow(s0 + s4, bd);
  1796. x1 = HighbdWrapLow(s1 + s5, bd);
  1797. x2 = HighbdWrapLow(s2 + s6, bd);
  1798. x3 = HighbdWrapLow(s3 + s7, bd);
  1799. x4 = HighbdWrapLow(s0 - s4, bd);
  1800. x5 = HighbdWrapLow(s1 - s5, bd);
  1801. x6 = HighbdWrapLow(s2 - s6, bd);
  1802. x7 = HighbdWrapLow(s3 - s7, bd);
  1803. x8 = HighbdWrapLow(DctConstRoundShift(s8 + s12), bd);
  1804. x9 = HighbdWrapLow(DctConstRoundShift(s9 + s13), bd);
  1805. x10 = HighbdWrapLow(DctConstRoundShift(s10 + s14), bd);
  1806. x11 = HighbdWrapLow(DctConstRoundShift(s11 + s15), bd);
  1807. x12 = HighbdWrapLow(DctConstRoundShift(s8 - s12), bd);
  1808. x13 = HighbdWrapLow(DctConstRoundShift(s9 - s13), bd);
  1809. x14 = HighbdWrapLow(DctConstRoundShift(s10 - s14), bd);
  1810. x15 = HighbdWrapLow(DctConstRoundShift(s11 - s15), bd);
  1811. // stage 3
  // Same pattern on groups of four: x0..x3 and x8..x11 are copied unscaled,
  // x4..x7 and x12..x15 are multiplied by constants and rounded.
  1812. s0 = x0;
  1813. s1 = x1;
  1814. s2 = x2;
  1815. s3 = x3;
  1816. s4 = x4 * (long)CosPi8_64 + x5 * (long)CosPi24_64;
  1817. s5 = x4 * (long)CosPi24_64 - x5 * (long)CosPi8_64;
  1818. s6 = -x6 * (long)CosPi24_64 + x7 * (long)CosPi8_64;
  1819. s7 = x6 * (long)CosPi8_64 + x7 * (long)CosPi24_64;
  1820. s8 = x8;
  1821. s9 = x9;
  1822. s10 = x10;
  1823. s11 = x11;
  1824. s12 = x12 * (long)CosPi8_64 + x13 * (long)CosPi24_64;
  1825. s13 = x12 * (long)CosPi24_64 - x13 * (long)CosPi8_64;
  1826. s14 = -x14 * (long)CosPi24_64 + x15 * (long)CosPi8_64;
  1827. s15 = x14 * (long)CosPi8_64 + x15 * (long)CosPi24_64;
  1828. x0 = HighbdWrapLow(s0 + s2, bd);
  1829. x1 = HighbdWrapLow(s1 + s3, bd);
  1830. x2 = HighbdWrapLow(s0 - s2, bd);
  1831. x3 = HighbdWrapLow(s1 - s3, bd);
  1832. x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
  1833. x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
  1834. x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
  1835. x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
  1836. x8 = HighbdWrapLow(s8 + s10, bd);
  1837. x9 = HighbdWrapLow(s9 + s11, bd);
  1838. x10 = HighbdWrapLow(s8 - s10, bd);
  1839. x11 = HighbdWrapLow(s9 - s11, bd);
  1840. x12 = HighbdWrapLow(DctConstRoundShift(s12 + s14), bd);
  1841. x13 = HighbdWrapLow(DctConstRoundShift(s13 + s15), bd);
  1842. x14 = HighbdWrapLow(DctConstRoundShift(s12 - s14), bd);
  1843. x15 = HighbdWrapLow(DctConstRoundShift(s13 - s15), bd);
  1844. // stage 4
  // Only the pairs (x2, x3), (x6, x7), (x10, x11) and (x14, x15) are modified here;
  // the signs on CosPi16_64 and on the operands differ from pair to pair.
  1845. s2 = (long)(-CosPi16_64) * (x2 + x3);
  1846. s3 = (long)CosPi16_64 * (x2 - x3);
  1847. s6 = (long)CosPi16_64 * (x6 + x7);
  1848. s7 = (long)CosPi16_64 * (-x6 + x7);
  1849. s10 = (long)CosPi16_64 * (x10 + x11);
  1850. s11 = (long)CosPi16_64 * (-x10 + x11);
  1851. s14 = (long)(-CosPi16_64) * (x14 + x15);
  1852. s15 = (long)CosPi16_64 * (x14 - x15);
  1853. x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
  1854. x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
  1855. x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
  1856. x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
  1857. x10 = HighbdWrapLow(DctConstRoundShift(s10), bd);
  1858. x11 = HighbdWrapLow(DctConstRoundShift(s11), bd);
  1859. x14 = HighbdWrapLow(DctConstRoundShift(s14), bd);
  1860. x15 = HighbdWrapLow(DctConstRoundShift(s15), bd);
  // Final permutation; outputs 1, 3, 13 and 15 are negated.
  1861. output[0] = HighbdWrapLow(x0, bd);
  1862. output[1] = HighbdWrapLow(-x8, bd);
  1863. output[2] = HighbdWrapLow(x12, bd);
  1864. output[3] = HighbdWrapLow(-x4, bd);
  1865. output[4] = HighbdWrapLow(x6, bd);
  1866. output[5] = HighbdWrapLow(x14, bd);
  1867. output[6] = HighbdWrapLow(x10, bd);
  1868. output[7] = HighbdWrapLow(x2, bd);
  1869. output[8] = HighbdWrapLow(x3, bd);
  1870. output[9] = HighbdWrapLow(x11, bd);
  1871. output[10] = HighbdWrapLow(x15, bd);
  1872. output[11] = HighbdWrapLow(x7, bd);
  1873. output[12] = HighbdWrapLow(x5, bd);
  1874. output[13] = HighbdWrapLow(-x13, bd);
  1875. output[14] = HighbdWrapLow(x9, bd);
  1876. output[15] = HighbdWrapLow(-x1, bd);
  1877. }
  /// <summary>
  /// 16-point high-bit-depth inverse DCT kernel: consumes sixteen coefficients from
  /// <paramref name="input"/> and writes sixteen results to <paramref name="output"/> after seven
  /// stages that alternate between two scratch buffers.
  /// </summary>
  /// <param name="input">Source coefficients; elements 0..15 are read.</param>
  /// <param name="output">Receives the sixteen results in elements 0..15.</param>
  /// <param name="bd">Bit depth, forwarded to every <c>HighbdWrapLow</c> call.</param>
  1878. [SkipLocalsInit]
  1879. public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
  1880. {
  // Scratch buffers; step1 is fully assigned in stage 1 and step2 in stage 2 before either is read.
  1881. Span<int> step1 = stackalloc int[16];
  1882. Span<int> step2 = stackalloc int[16];
  // 64-bit temporaries for the products formed in each rotation.
  1883. long temp1, temp2;
  // Input rejected by the range check: flag it in debug builds and emit zeros.
  1884. if (DetectInvalidHighbdInput(input, 16) != 0)
  1885. {
  1886. Debug.Assert(false, "invalid highbd txfm input");
  1887. output.Slice(0, 16).Fill(0);
  1888. return;
  1889. }
  1890. // stage 1
  // Pure reordering of the inputs; the "n / 2" index expressions are constants
  // (0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15).
  1891. step1[0] = input[0 / 2];
  1892. step1[1] = input[16 / 2];
  1893. step1[2] = input[8 / 2];
  1894. step1[3] = input[24 / 2];
  1895. step1[4] = input[4 / 2];
  1896. step1[5] = input[20 / 2];
  1897. step1[6] = input[12 / 2];
  1898. step1[7] = input[28 / 2];
  1899. step1[8] = input[2 / 2];
  1900. step1[9] = input[18 / 2];
  1901. step1[10] = input[10 / 2];
  1902. step1[11] = input[26 / 2];
  1903. step1[12] = input[6 / 2];
  1904. step1[13] = input[22 / 2];
  1905. step1[14] = input[14 / 2];
  1906. step1[15] = input[30 / 2];
  1907. // stage 2
  // Elements 0..7 are copied through; elements 8..15 are rotated in mirrored pairs
  // (8 with 15, 9 with 14, 10 with 13, 11 with 12).
  1908. step2[0] = step1[0];
  1909. step2[1] = step1[1];
  1910. step2[2] = step1[2];
  1911. step2[3] = step1[3];
  1912. step2[4] = step1[4];
  1913. step2[5] = step1[5];
  1914. step2[6] = step1[6];
  1915. step2[7] = step1[7];
  1916. temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
  1917. temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
  1918. step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1919. step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1920. temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
  1921. temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
  1922. step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1923. step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1924. temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
  1925. temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
  1926. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1927. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1928. temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
  1929. temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
  1930. step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1931. step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1932. // stage 3
  // Elements 0..3 are copied through, 4..7 are rotated in mirrored pairs,
  // 8..15 are combined by plain additions/subtractions of neighbours.
  1933. step1[0] = step2[0];
  1934. step1[1] = step2[1];
  1935. step1[2] = step2[2];
  1936. step1[3] = step2[3];
  1937. temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
  1938. temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
  1939. step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1940. step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1941. temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
  1942. temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
  1943. step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1944. step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1945. step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
  1946. step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
  1947. step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
  1948. step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
  1949. step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
  1950. step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
  1951. step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
  1952. step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
  1953. // stage 4
  // Elements 8, 11, 12 and 15 are copied through unchanged in this stage.
  1954. temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
  1955. temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
  1956. step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1957. step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1958. temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
  1959. temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
  1960. step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1961. step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1962. step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
  1963. step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
  1964. step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
  1965. step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
  1966. step2[8] = step1[8];
  1967. step2[15] = step1[15];
  1968. temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
  1969. temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
  1970. step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1971. step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1972. temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
  1973. temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
  1974. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1975. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1976. step2[11] = step1[11];
  1977. step2[12] = step1[12];
  1978. // stage 5
  // Elements 4 and 7 are copied through; 5 and 6 are the only rotated pair.
  1979. step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
  1980. step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
  1981. step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
  1982. step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
  1983. step1[4] = step2[4];
  1984. temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
  1985. temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
  1986. step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  1987. step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  1988. step1[7] = step2[7];
  1989. step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
  1990. step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
  1991. step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
  1992. step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
  1993. step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
  1994. step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
  1995. step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
  1996. step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
  1997. // stage 6
  // Elements 0..7 are mirrored sums/differences (k with 7 - k); 8, 9, 14 and 15 are copied
  // through; the pairs (10, 13) and (11, 12) are scaled by CosPi16_64.
  1998. step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
  1999. step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
  2000. step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
  2001. step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
  2002. step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
  2003. step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
  2004. step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
  2005. step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
  2006. step2[8] = step1[8];
  2007. step2[9] = step1[9];
  2008. temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
  2009. temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
  2010. step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2011. step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2012. temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
  2013. temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
  2014. step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
  2015. step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
  2016. step2[14] = step1[14];
  2017. step2[15] = step1[15];
  2018. // stage 7
  // output[k] and output[15 - k] are the sum and difference of step2[k] and step2[15 - k].
  2019. output[0] = HighbdWrapLow(step2[0] + step2[15], bd);
  2020. output[1] = HighbdWrapLow(step2[1] + step2[14], bd);
  2021. output[2] = HighbdWrapLow(step2[2] + step2[13], bd);
  2022. output[3] = HighbdWrapLow(step2[3] + step2[12], bd);
  2023. output[4] = HighbdWrapLow(step2[4] + step2[11], bd);
  2024. output[5] = HighbdWrapLow(step2[5] + step2[10], bd);
  2025. output[6] = HighbdWrapLow(step2[6] + step2[9], bd);
  2026. output[7] = HighbdWrapLow(step2[7] + step2[8], bd);
  2027. output[8] = HighbdWrapLow(step2[7] - step2[8], bd);
  2028. output[9] = HighbdWrapLow(step2[6] - step2[9], bd);
  2029. output[10] = HighbdWrapLow(step2[5] - step2[10], bd);
  2030. output[11] = HighbdWrapLow(step2[4] - step2[11], bd);
  2031. output[12] = HighbdWrapLow(step2[3] - step2[12], bd);
  2032. output[13] = HighbdWrapLow(step2[2] - step2[13], bd);
  2033. output[14] = HighbdWrapLow(step2[1] - step2[14], bd);
  2034. output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
  2035. }
  /// <summary>
  /// Full 16x16 high-bit-depth inverse DCT: <c>HighbdIdct16</c> over each row, then over each column,
  /// with the column results rounded by 6 bits and added to <paramref name="dest"/>.
  /// </summary>
  /// <param name="input">256 coefficients stored as sixteen consecutive rows of 16.</param>
  /// <param name="dest">Destination samples, updated in place through <c>HighbdClipPixelAdd</c>.</param>
  /// <param name="stride">Number of elements between the starts of consecutive rows of <paramref name="dest"/>.</param>
  /// <param name="bd">Bit depth, forwarded to the kernel and to <c>HighbdClipPixelAdd</c>.</param>
  [SkipLocalsInit]
  public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      const int Size = 16;

      Span<int> rowPass = stackalloc int[Size * Size];
      Span<int> columnIn = stackalloc int[Size];
      Span<int> columnOut = stackalloc int[Size];

      // First transform rows (all of them, so the buffer needs no clearing)
      for (int row = 0; row < Size; ++row)
      {
          int offset = row * Size;
          HighbdIdct16(input.Slice(offset), rowPass.Slice(offset), bd);
      }

      // Then transform columns
      for (int col = 0; col < Size; ++col)
      {
          for (int row = 0; row < Size; ++row)
          {
              columnIn[row] = rowPass[row * Size + col];
          }

          HighbdIdct16(columnIn, columnOut, bd);

          for (int row = 0; row < Size; ++row)
          {
              int pos = row * stride + col;
              dest[pos] = HighbdClipPixelAdd(dest[pos], BitUtils.RoundPowerOfTwo(columnOut[row], 6), bd);
          }
      }
  }
  /// <summary>
  /// 16x16 high-bit-depth inverse transform + clipped add for blocks whose non-zero
  /// coefficients all lie in the upper-left 8x8 area, so only the first 8 rows are transformed.
  /// </summary>
  /// <param name="input">Coefficients; only the first 8 rows of 16 values are read.</param>
  /// <param name="dest">Destination pixels; element (row, col) of the block is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  /// <param name="bd">Bit depth forwarded to the transform and clipping helpers.</param>
  [SkipLocalsInit]
  public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      Span<int> output = stackalloc int[16 * 16];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[16];
      Span<int> tempOut = stackalloc int[16];

      // Rows 8..15 are never produced by the row pass below, and [SkipLocalsInit] means the
      // stackalloc'd memory is not zeroed for us, so clear the whole buffer explicitly.
      output.Fill(0);

      // First transform rows. Since all non-zero dct coefficients are in
      // upper-left 8x8 area, we only need to calculate first 8 rows here.
      for (i = 0; i < 8; ++i)
      {
          HighbdIdct16(input, outptr, bd);
          input = input.Slice(16);
          outptr = outptr.Slice(16);
      }

      // Then transform columns
      for (i = 0; i < 16; ++i)
      {
          for (j = 0; j < 16; ++j)
          {
              tempIn[j] = output[j * 16 + i];
          }

          HighbdIdct16(tempIn, tempOut, bd);

          for (j = 0; j < 16; ++j)
          {
              // Address each pixel from the block origin instead of re-slicing dest by stride
              // after every row. Slicing once more after the last row throws when dest ends
              // before another full stride is available, even though every pixel of the block
              // was already written. Indexing matches the other 16x16 add variants.
              dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
          }
      }
  }
  /// <summary>
  /// 16x16 high-bit-depth inverse transform + clipped add for blocks whose non-zero
  /// coefficients all lie in the upper-left 4x4 area, so only the first 4 rows are transformed.
  /// </summary>
  /// <param name="input">Coefficients; only the first 4 rows of 16 values are read.</param>
  /// <param name="dest">Destination pixels; element (row, col) of the block is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  /// <param name="bd">Bit depth forwarded to the transform and clipping helpers.</param>
  [SkipLocalsInit]
  public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> rowPass = stackalloc int[16 * 16];
      Span<int> column = stackalloc int[16];
      Span<int> columnResult = stackalloc int[16];

      // The row pass only fills rows 0..3; the remaining rows must read as zero.
      rowPass.Clear();

      // Row pass over the 4 rows that can contain non-zero coefficients.
      ReadOnlySpan<int> coeffRow = input;
      Span<int> resultRow = rowPass;
      for (int row = 0; row < 4; row++, coeffRow = coeffRow.Slice(16), resultRow = resultRow.Slice(16))
      {
          HighbdIdct16(coeffRow, resultRow, bd);
      }

      // Column pass: gather a column, transform it, then add the rounded result to the pixels.
      for (int col = 0; col < 16; col++)
      {
          for (int row = 0; row < 16; row++)
          {
              column[row] = rowPass[row * 16 + col];
          }

          HighbdIdct16(column, columnResult, bd);

          for (int row = 0; row < 16; row++)
          {
              ref ushort pixel = ref dest[row * stride + col];
              pixel = HighbdClipPixelAdd(pixel, BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
          }
      }
  }
  /// <summary>
  /// 16x16 high-bit-depth inverse transform + clipped add for a block where only
  /// <c>input[0]</c> is used: the single coefficient is scaled twice by <c>CosPi16_64</c>,
  /// rounded, and the same resulting offset is added to all 256 destination pixels.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read.</param>
  /// <param name="dest">Destination pixels; element (row, col) of the block is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  /// <param name="bd">Bit depth forwarded to the wrapping and clipping helpers.</param>
  public static void HighbdIdct16x161Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      long a1;
      int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
      a1 = BitUtils.RoundPowerOfTwo(output, 6);

      for (int j = 0; j < 16; ++j)
      {
          // Address rows by offset instead of re-slicing dest by stride after each row.
          // Slicing once more after the final row throws when dest ends before another
          // full stride is available, although the whole block was already updated.
          int rowStart = j * stride;
          for (int i = 0; i < 16; ++i)
          {
              dest[rowStart + i] = HighbdClipPixelAdd(dest[rowStart + i], a1, bd);
          }
      }
  }
  /// <summary>
  /// One-dimensional 32-point high-bit-depth inverse transform. Reads 32 coefficients from
  /// <paramref name="input"/> and writes 32 results to <paramref name="output"/> after seven
  /// butterfly stages and a final combine.
  /// </summary>
  /// <param name="input">Source coefficients; indices 0..31 are read.</param>
  /// <param name="output">Receives 32 results at indices 0..31.</param>
  /// <param name="bd">Bit depth forwarded to <c>HighbdWrapLow</c>.</param>
  /// <remarks>
  /// When <c>DetectInvalidHighbdInput</c> returns non-zero for the first 32 inputs, the method
  /// asserts in debug builds, zero-fills <c>output[0..31]</c> and returns without transforming.
  /// </remarks>
  [SkipLocalsInit]
  public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd)
  {
      // Rounds a 64-bit butterfly term with DctConstRoundShift and wraps it for the bit depth.
      static int RoundWrap(long term, int bitDepth)
      {
          return HighbdWrapLow(DctConstRoundShift(term), bitDepth);
      }

      // a and b alternate as source and destination: each stage reads one and writes the other.
      Span<int> a = stackalloc int[32];
      Span<int> b = stackalloc int[32];

      if (DetectInvalidHighbdInput(input, 32) != 0)
      {
          Debug.Assert(false, "invalid highbd txfm input");
          output.Slice(0, 32).Clear();
          return;
      }

      // stage 1: reorder the even-indexed inputs, rotate the odd-indexed pairs.
      a[0] = input[0];
      a[1] = input[16];
      a[2] = input[8];
      a[3] = input[24];
      a[4] = input[4];
      a[5] = input[20];
      a[6] = input[12];
      a[7] = input[28];
      a[8] = input[2];
      a[9] = input[18];
      a[10] = input[10];
      a[11] = input[26];
      a[12] = input[6];
      a[13] = input[22];
      a[14] = input[14];
      a[15] = input[30];

      a[16] = RoundWrap(input[1] * (long)CosPi31_64 - input[31] * (long)CosPi1_64, bd);
      a[31] = RoundWrap(input[1] * (long)CosPi1_64 + input[31] * (long)CosPi31_64, bd);

      a[17] = RoundWrap(input[17] * (long)CosPi15_64 - input[15] * (long)CosPi17_64, bd);
      a[30] = RoundWrap(input[17] * (long)CosPi17_64 + input[15] * (long)CosPi15_64, bd);

      a[18] = RoundWrap(input[9] * (long)CosPi23_64 - input[23] * (long)CosPi9_64, bd);
      a[29] = RoundWrap(input[9] * (long)CosPi9_64 + input[23] * (long)CosPi23_64, bd);

      a[19] = RoundWrap(input[25] * (long)CosPi7_64 - input[7] * (long)CosPi25_64, bd);
      a[28] = RoundWrap(input[25] * (long)CosPi25_64 + input[7] * (long)CosPi7_64, bd);

      a[20] = RoundWrap(input[5] * (long)CosPi27_64 - input[27] * (long)CosPi5_64, bd);
      a[27] = RoundWrap(input[5] * (long)CosPi5_64 + input[27] * (long)CosPi27_64, bd);

      a[21] = RoundWrap(input[21] * (long)CosPi11_64 - input[11] * (long)CosPi21_64, bd);
      a[26] = RoundWrap(input[21] * (long)CosPi21_64 + input[11] * (long)CosPi11_64, bd);

      a[22] = RoundWrap(input[13] * (long)CosPi19_64 - input[19] * (long)CosPi13_64, bd);
      a[25] = RoundWrap(input[13] * (long)CosPi13_64 + input[19] * (long)CosPi19_64, bd);

      a[23] = RoundWrap(input[29] * (long)CosPi3_64 - input[3] * (long)CosPi29_64, bd);
      a[24] = RoundWrap(input[29] * (long)CosPi29_64 + input[3] * (long)CosPi3_64, bd);

      // stage 2
      a.Slice(0, 8).CopyTo(b); // elements 0..7 pass through unchanged

      b[8] = RoundWrap(a[8] * (long)CosPi30_64 - a[15] * (long)CosPi2_64, bd);
      b[15] = RoundWrap(a[8] * (long)CosPi2_64 + a[15] * (long)CosPi30_64, bd);

      b[9] = RoundWrap(a[9] * (long)CosPi14_64 - a[14] * (long)CosPi18_64, bd);
      b[14] = RoundWrap(a[9] * (long)CosPi18_64 + a[14] * (long)CosPi14_64, bd);

      b[10] = RoundWrap(a[10] * (long)CosPi22_64 - a[13] * (long)CosPi10_64, bd);
      b[13] = RoundWrap(a[10] * (long)CosPi10_64 + a[13] * (long)CosPi22_64, bd);

      b[11] = RoundWrap(a[11] * (long)CosPi6_64 - a[12] * (long)CosPi26_64, bd);
      b[12] = RoundWrap(a[11] * (long)CosPi26_64 + a[12] * (long)CosPi6_64, bd);

      // Elements 16..31 are combined in groups of four adjacent values.
      for (int k = 16; k < 32; k += 4)
      {
          b[k] = HighbdWrapLow(a[k] + a[k + 1], bd);
          b[k + 1] = HighbdWrapLow(a[k] - a[k + 1], bd);
          b[k + 2] = HighbdWrapLow(-a[k + 2] + a[k + 3], bd);
          b[k + 3] = HighbdWrapLow(a[k + 2] + a[k + 3], bd);
      }

      // stage 3
      b.Slice(0, 4).CopyTo(a); // elements 0..3 pass through unchanged

      a[4] = RoundWrap(b[4] * (long)CosPi28_64 - b[7] * (long)CosPi4_64, bd);
      a[7] = RoundWrap(b[4] * (long)CosPi4_64 + b[7] * (long)CosPi28_64, bd);

      a[5] = RoundWrap(b[5] * (long)CosPi12_64 - b[6] * (long)CosPi20_64, bd);
      a[6] = RoundWrap(b[5] * (long)CosPi20_64 + b[6] * (long)CosPi12_64, bd);

      for (int k = 8; k < 16; k += 4)
      {
          a[k] = HighbdWrapLow(b[k] + b[k + 1], bd);
          a[k + 1] = HighbdWrapLow(b[k] - b[k + 1], bd);
          a[k + 2] = HighbdWrapLow(-b[k + 2] + b[k + 3], bd);
          a[k + 3] = HighbdWrapLow(b[k + 2] + b[k + 3], bd);
      }

      a[16] = b[16];
      a[31] = b[31];

      a[17] = RoundWrap(-b[17] * (long)CosPi4_64 + b[30] * (long)CosPi28_64, bd);
      a[30] = RoundWrap(b[17] * (long)CosPi28_64 + b[30] * (long)CosPi4_64, bd);

      a[18] = RoundWrap(-b[18] * (long)CosPi28_64 - b[29] * (long)CosPi4_64, bd);
      a[29] = RoundWrap(-b[18] * (long)CosPi4_64 + b[29] * (long)CosPi28_64, bd);

      a[19] = b[19];
      a[20] = b[20];

      a[21] = RoundWrap(-b[21] * (long)CosPi20_64 + b[26] * (long)CosPi12_64, bd);
      a[26] = RoundWrap(b[21] * (long)CosPi12_64 + b[26] * (long)CosPi20_64, bd);

      a[22] = RoundWrap(-b[22] * (long)CosPi12_64 - b[25] * (long)CosPi20_64, bd);
      a[25] = RoundWrap(-b[22] * (long)CosPi20_64 + b[25] * (long)CosPi12_64, bd);

      a[23] = b[23];
      a[24] = b[24];
      a[27] = b[27];
      a[28] = b[28];

      // stage 4
      b[0] = RoundWrap((a[0] + a[1]) * (long)CosPi16_64, bd);
      b[1] = RoundWrap((a[0] - a[1]) * (long)CosPi16_64, bd);

      b[2] = RoundWrap(a[2] * (long)CosPi24_64 - a[3] * (long)CosPi8_64, bd);
      b[3] = RoundWrap(a[2] * (long)CosPi8_64 + a[3] * (long)CosPi24_64, bd);

      b[4] = HighbdWrapLow(a[4] + a[5], bd);
      b[5] = HighbdWrapLow(a[4] - a[5], bd);
      b[6] = HighbdWrapLow(-a[6] + a[7], bd);
      b[7] = HighbdWrapLow(a[6] + a[7], bd);

      b[8] = a[8];
      b[15] = a[15];

      b[9] = RoundWrap(-a[9] * (long)CosPi8_64 + a[14] * (long)CosPi24_64, bd);
      b[14] = RoundWrap(a[9] * (long)CosPi24_64 + a[14] * (long)CosPi8_64, bd);

      b[10] = RoundWrap(-a[10] * (long)CosPi24_64 - a[13] * (long)CosPi8_64, bd);
      b[13] = RoundWrap(-a[10] * (long)CosPi8_64 + a[13] * (long)CosPi24_64, bd);

      b[11] = a[11];
      b[12] = a[12];

      // Elements 16..31 are combined in two groups of eight.
      for (int k = 16; k < 32; k += 8)
      {
          b[k] = HighbdWrapLow(a[k] + a[k + 3], bd);
          b[k + 1] = HighbdWrapLow(a[k + 1] + a[k + 2], bd);
          b[k + 2] = HighbdWrapLow(a[k + 1] - a[k + 2], bd);
          b[k + 3] = HighbdWrapLow(a[k] - a[k + 3], bd);
          b[k + 4] = HighbdWrapLow(-a[k + 4] + a[k + 7], bd);
          b[k + 5] = HighbdWrapLow(-a[k + 5] + a[k + 6], bd);
          b[k + 6] = HighbdWrapLow(a[k + 5] + a[k + 6], bd);
          b[k + 7] = HighbdWrapLow(a[k + 4] + a[k + 7], bd);
      }

      // stage 5
      a[0] = HighbdWrapLow(b[0] + b[3], bd);
      a[1] = HighbdWrapLow(b[1] + b[2], bd);
      a[2] = HighbdWrapLow(b[1] - b[2], bd);
      a[3] = HighbdWrapLow(b[0] - b[3], bd);

      a[4] = b[4];
      a[5] = RoundWrap((b[6] - b[5]) * (long)CosPi16_64, bd);
      a[6] = RoundWrap((b[5] + b[6]) * (long)CosPi16_64, bd);
      a[7] = b[7];

      a[8] = HighbdWrapLow(b[8] + b[11], bd);
      a[9] = HighbdWrapLow(b[9] + b[10], bd);
      a[10] = HighbdWrapLow(b[9] - b[10], bd);
      a[11] = HighbdWrapLow(b[8] - b[11], bd);
      a[12] = HighbdWrapLow(-b[12] + b[15], bd);
      a[13] = HighbdWrapLow(-b[13] + b[14], bd);
      a[14] = HighbdWrapLow(b[13] + b[14], bd);
      a[15] = HighbdWrapLow(b[12] + b[15], bd);

      a[16] = b[16];
      a[17] = b[17];

      a[18] = RoundWrap(-b[18] * (long)CosPi8_64 + b[29] * (long)CosPi24_64, bd);
      a[29] = RoundWrap(b[18] * (long)CosPi24_64 + b[29] * (long)CosPi8_64, bd);

      a[19] = RoundWrap(-b[19] * (long)CosPi8_64 + b[28] * (long)CosPi24_64, bd);
      a[28] = RoundWrap(b[19] * (long)CosPi24_64 + b[28] * (long)CosPi8_64, bd);

      a[20] = RoundWrap(-b[20] * (long)CosPi24_64 - b[27] * (long)CosPi8_64, bd);
      a[27] = RoundWrap(-b[20] * (long)CosPi8_64 + b[27] * (long)CosPi24_64, bd);

      a[21] = RoundWrap(-b[21] * (long)CosPi24_64 - b[26] * (long)CosPi8_64, bd);
      a[26] = RoundWrap(-b[21] * (long)CosPi8_64 + b[26] * (long)CosPi24_64, bd);

      a[22] = b[22];
      a[23] = b[23];
      a[24] = b[24];
      a[25] = b[25];
      a[30] = b[30];
      a[31] = b[31];

      // stage 6
      for (int k = 0; k < 4; k++)
      {
          b[k] = HighbdWrapLow(a[k] + a[7 - k], bd);
      }

      for (int k = 4; k < 8; k++)
      {
          b[k] = HighbdWrapLow(a[7 - k] - a[k], bd);
      }

      b[8] = a[8];
      b[9] = a[9];

      b[10] = RoundWrap((-a[10] + a[13]) * (long)CosPi16_64, bd);
      b[13] = RoundWrap((a[10] + a[13]) * (long)CosPi16_64, bd);

      b[11] = RoundWrap((-a[11] + a[12]) * (long)CosPi16_64, bd);
      b[12] = RoundWrap((a[11] + a[12]) * (long)CosPi16_64, bd);

      b[14] = a[14];
      b[15] = a[15];

      for (int k = 0; k < 4; k++)
      {
          b[16 + k] = HighbdWrapLow(a[16 + k] + a[23 - k], bd);
      }

      for (int k = 0; k < 4; k++)
      {
          b[20 + k] = HighbdWrapLow(a[19 - k] - a[20 + k], bd);
      }

      for (int k = 0; k < 4; k++)
      {
          b[24 + k] = HighbdWrapLow(-a[24 + k] + a[31 - k], bd);
      }

      for (int k = 0; k < 4; k++)
      {
          b[28 + k] = HighbdWrapLow(a[27 - k] + a[28 + k], bd);
      }

      // stage 7
      for (int k = 0; k < 8; k++)
      {
          a[k] = HighbdWrapLow(b[k] + b[15 - k], bd);
      }

      for (int k = 8; k < 16; k++)
      {
          a[k] = HighbdWrapLow(b[15 - k] - b[k], bd);
      }

      a[16] = b[16];
      a[17] = b[17];
      a[18] = b[18];
      a[19] = b[19];

      a[20] = RoundWrap((-b[20] + b[27]) * (long)CosPi16_64, bd);
      a[27] = RoundWrap((b[20] + b[27]) * (long)CosPi16_64, bd);

      a[21] = RoundWrap((-b[21] + b[26]) * (long)CosPi16_64, bd);
      a[26] = RoundWrap((b[21] + b[26]) * (long)CosPi16_64, bd);

      a[22] = RoundWrap((-b[22] + b[25]) * (long)CosPi16_64, bd);
      a[25] = RoundWrap((b[22] + b[25]) * (long)CosPi16_64, bd);

      a[23] = RoundWrap((-b[23] + b[24]) * (long)CosPi16_64, bd);
      a[24] = RoundWrap((b[23] + b[24]) * (long)CosPi16_64, bd);

      a[28] = b[28];
      a[29] = b[29];
      a[30] = b[30];
      a[31] = b[31];

      // final stage: mirror-combine element k with element 31 - k.
      for (int k = 0; k < 16; k++)
      {
          output[k] = HighbdWrapLow(a[k] + a[31 - k], bd);
      }

      for (int k = 16; k < 32; k++)
      {
          output[k] = HighbdWrapLow(a[31 - k] - a[k], bd);
      }
  }
  /// <summary>
  /// Full 32x32 high-bit-depth inverse transform followed by a clipped add into <paramref name="dest"/>.
  /// Each of the 32 coefficient rows is either transformed with <c>HighbdIdct32</c> or, when all
  /// of its 32 coefficients are zero, written as zeros directly; columns are then transformed
  /// and accumulated into the destination pixels.
  /// </summary>
  /// <param name="input">Coefficients, consumed 32 at a time for 32 rows.</param>
  /// <param name="dest">Destination pixels; element (row, col) of the block is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  /// <param name="bd">Bit depth forwarded to the transform and clipping helpers.</param>
  [SkipLocalsInit]
  public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> rowPass = stackalloc int[32 * 32];
      Span<int> column = stackalloc int[32];
      Span<int> columnResult = stackalloc int[32];

      // Rows
      ReadOnlySpan<int> coeffRow = input;
      Span<int> resultRow = rowPass;
      for (int row = 0; row < 32; row++, coeffRow = coeffRow.Slice(32), resultRow = resultRow.Slice(32))
      {
          // OR all coefficients together: the result is zero only if every one of them is zero.
          int anyBits = 0;
          for (int k = 0; k < 32; k++)
          {
              anyBits |= coeffRow[k];
          }

          if (anyBits == 0)
          {
              // All-zero row: skip the transform and emit zeros.
              resultRow.Slice(0, 32).Clear();
          }
          else
          {
              HighbdIdct32(coeffRow, resultRow, bd);
          }
      }

      // Columns
      for (int col = 0; col < 32; col++)
      {
          for (int row = 0; row < 32; row++)
          {
              column[row] = rowPass[row * 32 + col];
          }

          HighbdIdct32(column, columnResult, bd);

          for (int row = 0; row < 32; row++)
          {
              ref ushort pixel = ref dest[row * stride + col];
              pixel = HighbdClipPixelAdd(pixel, BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
          }
      }
  }
  /// <summary>
  /// 32x32 high-bit-depth inverse transform + clipped add for blocks whose non-zero
  /// coefficients all lie in the upper-left 16x16 area, so only the first 16 rows are transformed.
  /// </summary>
  /// <param name="input">Coefficients; only the first 16 rows of 32 values are read.</param>
  /// <param name="dest">Destination pixels; element (row, col) of the block is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  /// <param name="bd">Bit depth forwarded to the transform and clipping helpers.</param>
  [SkipLocalsInit]
  public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int i, j;
      Span<int> output = stackalloc int[32 * 32];
      Span<int> outptr = output;
      Span<int> tempIn = stackalloc int[32];
      Span<int> tempOut = stackalloc int[32];

      // Rows 16..31 are never produced by the row pass below, and [SkipLocalsInit] means the
      // stackalloc'd memory is not zeroed for us, so clear the whole buffer explicitly.
      output.Fill(0);

      // Rows
      // Only upper-left 16x16 has non-zero coeff
      for (i = 0; i < 16; ++i)
      {
          HighbdIdct32(input, outptr, bd);
          input = input.Slice(32);
          outptr = outptr.Slice(32);
      }

      // Columns
      for (i = 0; i < 32; ++i)
      {
          for (j = 0; j < 32; ++j)
          {
              tempIn[j] = output[j * 32 + i];
          }

          HighbdIdct32(tempIn, tempOut, bd);

          for (j = 0; j < 32; ++j)
          {
              // Address each pixel from the block origin instead of re-slicing dest by stride
              // after every row. Slicing once more after the last row throws when dest ends
              // before another full stride is available, even though every pixel of the block
              // was already written. Indexing matches the other 32x32 add variants.
              dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
          }
      }
  }
  /// <summary>
  /// 32x32 high-bit-depth inverse transform + clipped add for blocks whose non-zero
  /// coefficients all lie in the upper-left 8x8 area, so only the first 8 rows are transformed.
  /// </summary>
  /// <param name="input">Coefficients; only the first 8 rows of 32 values are read.</param>
  /// <param name="dest">Destination pixels; element (row, col) of the block is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  /// <param name="bd">Bit depth forwarded to the transform and clipping helpers.</param>
  [SkipLocalsInit]
  public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      Span<int> rowPass = stackalloc int[32 * 32];
      Span<int> column = stackalloc int[32];
      Span<int> columnResult = stackalloc int[32];

      // The row pass only fills rows 0..7; the remaining rows must read as zero.
      rowPass.Clear();

      // Rows
      // Only upper-left 8x8 has non-zero coeff
      ReadOnlySpan<int> coeffRow = input;
      Span<int> resultRow = rowPass;
      for (int row = 0; row < 8; row++, coeffRow = coeffRow.Slice(32), resultRow = resultRow.Slice(32))
      {
          HighbdIdct32(coeffRow, resultRow, bd);
      }

      // Columns
      for (int col = 0; col < 32; col++)
      {
          for (int row = 0; row < 32; row++)
          {
              column[row] = rowPass[row * 32 + col];
          }

          HighbdIdct32(column, columnResult, bd);

          for (int row = 0; row < 32; row++)
          {
              ref ushort pixel = ref dest[row * stride + col];
              pixel = HighbdClipPixelAdd(pixel, BitUtils.RoundPowerOfTwo(columnResult[row], 6), bd);
          }
      }
  }
  /// <summary>
  /// 32x32 high-bit-depth inverse transform + clipped add for a block where only
  /// <c>input[0]</c> is used: the single coefficient is scaled twice by <c>CosPi16_64</c>,
  /// rounded, and the same resulting offset is added to all 1024 destination pixels.
  /// </summary>
  /// <param name="input">Coefficients; only element 0 is read.</param>
  /// <param name="dest">Destination pixels; element (row, col) of the block is at <c>row * stride + col</c>.</param>
  /// <param name="stride">Distance in elements between consecutive destination rows.</param>
  /// <param name="bd">Bit depth forwarded to the wrapping and clipping helpers.</param>
  public static void HighbdIdct32x321Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
  {
      int a1;
      int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
      output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
      a1 = BitUtils.RoundPowerOfTwo(output, 6);

      for (int j = 0; j < 32; ++j)
      {
          // Address rows by offset instead of re-slicing dest by stride after each row.
          // Slicing once more after the final row throws when dest ends before another
          // full stride is available, although the whole block was already updated.
          int rowStart = j * stride;
          for (int i = 0; i < 32; ++i)
          {
              dest[rowStart + i] = HighbdClipPixelAdd(dest[rowStart + i], a1, bd);
          }
      }
  }
  2609. }
  2610. }