// BCnDecoder.cs
  1. using Ryujinx.Common;
  2. using System;
  3. using System.Buffers.Binary;
  4. using System.Runtime.InteropServices;
  5. using System.Runtime.Intrinsics;
  6. using System.Runtime.Intrinsics.X86;
  7. namespace Ryujinx.Graphics.Texture
  8. {
  9. public static class BCnDecoder
  10. {
  // Width, in texels, of one compressed block; used to convert image widths into block counts.
  private const int BlockWidth = 4;
  // Height, in texels, of one compressed block; used to convert image heights into block counts.
  private const int BlockHeight = 4;
  /// <summary>
  /// Decodes BC1 compressed texture data into an uncompressed buffer with 4 bytes per texel.
  /// </summary>
  /// <remarks>
  /// Blocks are consumed sequentially from <paramref name="data"/>, 8 bytes per 4x4 block, walking
  /// levels first, then layers, then depth slices, then block rows. Width, height and depth are
  /// halved (never below 1) from one level to the next.
  /// </remarks>
  /// <param name="data">Compressed blocks for every level, layer and depth slice</param>
  /// <param name="width">Width of the first level, in texels</param>
  /// <param name="height">Height of the first level, in texels</param>
  /// <param name="depth">Depth of the first level, in slices</param>
  /// <param name="levels">Number of levels to decode</param>
  /// <param name="layers">Number of layers to decode</param>
  /// <returns>Decoded texels of every image, tightly packed one image after another</returns>
  public static byte[] DecodeBC1(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
  {
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelWidth = Math.Max(1, width >> level);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelWidth * levelHeight * levelDepth * layers * 4;
      }

      byte[] output = new byte[totalBytes];

      // Scratch storage for one decoded 4x4 tile, viewed per texel and per 4-texel row.
      Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
      Span<uint> tileTexels = MemoryMarshal.Cast<byte, uint>(tile);
      Span<Vector128<byte>> tileRows = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);

      Span<uint> outputTexels = MemoryMarshal.Cast<byte, uint>(output);

      // Views over the four output lines covered by the current row of blocks.
      Span<Vector128<byte>> row0 = default;
      Span<Vector128<byte>> row1 = default;
      Span<Vector128<byte>> row2 = default;
      Span<Vector128<byte>> row3 = default;

      // Offset, in texels, of the image currently being written.
      int sliceStart = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksWide = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksHigh = BitUtils.DivRoundUp(height, BlockHeight);

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  for (int blockY = 0; blockY < blocksHigh; blockY++)
                  {
                      int texelY = blockY * BlockHeight;
                      int rowsToCopy = Math.Min(BlockHeight, height - texelY);
                      int rowStart = sliceStart + texelY * width;

                      if (rowsToCopy == 4)
                      {
                          row0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart));
                          row1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart + width));
                          row2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart + width * 2));
                          row3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart + width * 3));
                      }

                      for (int blockX = 0; blockX < blocksWide; blockX++)
                      {
                          int texelX = blockX * BlockWidth;
                          int columnsToCopy = Math.Min(BlockWidth, width - texelX);

                          BC1DecodeTileRgb(tile, data);

                          // Both counts are in 1..4 here, so the OR equals 4 only when both are 4.
                          bool isFullTile = (columnsToCopy | rowsToCopy) == 4;

                          if (!isFullTile)
                          {
                              // Partial tile on the right/bottom edge: copy only the texels inside the image.
                              int texelStart = rowStart + texelX;

                              for (int tileY = 0; tileY < rowsToCopy; tileY++)
                              {
                                  Span<uint> source = tileTexels.Slice(tileY * 4, columnsToCopy);
                                  Span<uint> destination = outputTexels.Slice(texelStart + width * tileY, columnsToCopy);

                                  source.CopyTo(destination);
                              }
                          }
                          else
                          {
                              // Full tile: store each 4-texel row with a single 128-bit write.
                              row0[blockX] = tileRows[0];
                              row1[blockX] = tileRows[1];
                              row2[blockX] = tileRows[2];
                              row3[blockX] = tileRows[3];
                          }

                          data = data.Slice(8);
                      }
                  }

                  sliceStart += width * height;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC2 compressed texture data into an uncompressed buffer with 4 bytes per texel.
  /// </summary>
  /// <remarks>
  /// Each 4x4 block takes 16 bytes: the first 8 bytes hold one 4-bit alpha value per texel
  /// (widened to 8 bits by repeating the nibble) and the last 8 bytes hold the colour block.
  /// Blocks are consumed sequentially, walking levels, then layers, then depth slices, then block rows.
  /// </remarks>
  /// <param name="data">Compressed blocks for every level, layer and depth slice</param>
  /// <param name="width">Width of the first level, in texels</param>
  /// <param name="height">Height of the first level, in texels</param>
  /// <param name="depth">Depth of the first level, in slices</param>
  /// <param name="levels">Number of levels to decode</param>
  /// <param name="layers">Number of layers to decode</param>
  /// <returns>Decoded texels of every image, tightly packed one image after another</returns>
  public static byte[] DecodeBC2(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
  {
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelWidth = Math.Max(1, width >> level);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelWidth * levelHeight * levelDepth * layers * 4;
      }

      byte[] output = new byte[totalBytes];

      // Scratch storage for one decoded 4x4 tile, viewed per texel and per 4-texel row.
      Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
      Span<uint> tileTexels = MemoryMarshal.Cast<byte, uint>(tile);
      Span<Vector128<byte>> tileRows = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);

      Span<uint> outputTexels = MemoryMarshal.Cast<byte, uint>(output);

      // Views over the four output lines covered by the current row of blocks.
      Span<Vector128<byte>> row0 = default;
      Span<Vector128<byte>> row1 = default;
      Span<Vector128<byte>> row2 = default;
      Span<Vector128<byte>> row3 = default;

      // Offset, in texels, of the image currently being written.
      int sliceStart = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksWide = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksHigh = BitUtils.DivRoundUp(height, BlockHeight);

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  for (int blockY = 0; blockY < blocksHigh; blockY++)
                  {
                      int texelY = blockY * BlockHeight;
                      int rowsToCopy = Math.Min(BlockHeight, height - texelY);
                      int rowStart = sliceStart + texelY * width;

                      if (rowsToCopy == 4)
                      {
                          row0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart));
                          row1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart + width));
                          row2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart + width * 2));
                          row3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart + width * 3));
                      }

                      for (int blockX = 0; blockX < blocksWide; blockX++)
                      {
                          int texelX = blockX * BlockWidth;
                          int columnsToCopy = Math.Min(BlockWidth, width - texelX);

                          // Colour lives in the second half of the block.
                          BC23DecodeTileRgb(tile, data.Slice(8));

                          // Explicit alpha: sixteen 4-bit values, texel 0 in the lowest nibble.
                          ulong alphaBits = BinaryPrimitives.ReadUInt64LittleEndian(data);

                          for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
                          {
                              uint nibble = (uint)(alphaBits >> (texel * 4)) & 0xf;

                              // Repeat the nibble in both halves of the byte to widen it to 8 bits.
                              tile[texel * 4 + 3] = (byte)(nibble | (nibble << 4));
                          }

                          // Both counts are in 1..4 here, so the OR equals 4 only when both are 4.
                          bool isFullTile = (columnsToCopy | rowsToCopy) == 4;

                          if (!isFullTile)
                          {
                              // Partial tile on the right/bottom edge: copy only the texels inside the image.
                              int texelStart = rowStart + texelX;

                              for (int tileY = 0; tileY < rowsToCopy; tileY++)
                              {
                                  Span<uint> source = tileTexels.Slice(tileY * 4, columnsToCopy);
                                  Span<uint> destination = outputTexels.Slice(texelStart + width * tileY, columnsToCopy);

                                  source.CopyTo(destination);
                              }
                          }
                          else
                          {
                              // Full tile: store each 4-texel row with a single 128-bit write.
                              row0[blockX] = tileRows[0];
                              row1[blockX] = tileRows[1];
                              row2[blockX] = tileRows[2];
                              row3[blockX] = tileRows[3];
                          }

                          data = data.Slice(16);
                      }
                  }

                  sliceStart += width * height;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC3 compressed texture data into an uncompressed buffer with 4 bytes per texel.
  /// </summary>
  /// <remarks>
  /// Each 4x4 block takes 16 bytes: the first 8 bytes hold two 8-bit alpha endpoints followed by
  /// 3-bit per-texel alpha indices, and the last 8 bytes hold the colour block.
  /// Blocks are consumed sequentially, walking levels, then layers, then depth slices, then block rows.
  /// </remarks>
  /// <param name="data">Compressed blocks for every level, layer and depth slice</param>
  /// <param name="width">Width of the first level, in texels</param>
  /// <param name="height">Height of the first level, in texels</param>
  /// <param name="depth">Depth of the first level, in slices</param>
  /// <param name="levels">Number of levels to decode</param>
  /// <param name="layers">Number of layers to decode</param>
  /// <returns>Decoded texels of every image, tightly packed one image after another</returns>
  public static byte[] DecodeBC3(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
  {
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelWidth = Math.Max(1, width >> level);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelWidth * levelHeight * levelDepth * layers * 4;
      }

      byte[] output = new byte[totalBytes];

      // Scratch storage for one decoded 4x4 tile and for the 8-entry alpha palette of a block.
      Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
      Span<byte> alphaPalette = stackalloc byte[8];

      Span<uint> tileTexels = MemoryMarshal.Cast<byte, uint>(tile);
      Span<Vector128<byte>> tileRows = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);

      Span<uint> outputTexels = MemoryMarshal.Cast<byte, uint>(output);

      // Views over the four output lines covered by the current row of blocks.
      Span<Vector128<byte>> row0 = default;
      Span<Vector128<byte>> row1 = default;
      Span<Vector128<byte>> row2 = default;
      Span<Vector128<byte>> row3 = default;

      // Offset, in texels, of the image currently being written.
      int sliceStart = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksWide = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksHigh = BitUtils.DivRoundUp(height, BlockHeight);

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  for (int blockY = 0; blockY < blocksHigh; blockY++)
                  {
                      int texelY = blockY * BlockHeight;
                      int rowsToCopy = Math.Min(BlockHeight, height - texelY);
                      int rowStart = sliceStart + texelY * width;

                      if (rowsToCopy == 4)
                      {
                          row0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart));
                          row1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart + width));
                          row2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart + width * 2));
                          row3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputTexels.Slice(rowStart + width * 3));
                      }

                      for (int blockX = 0; blockX < blocksWide; blockX++)
                      {
                          int texelX = blockX * BlockWidth;
                          int columnsToCopy = Math.Min(BlockWidth, width - texelX);

                          // Colour lives in the second half of the block.
                          BC23DecodeTileRgb(tile, data.Slice(8));

                          // Interpolated alpha: two endpoint bytes, then the index bits.
                          ulong alphaBlock = BinaryPrimitives.ReadUInt64LittleEndian(data);

                          alphaPalette[0] = (byte)alphaBlock;
                          alphaPalette[1] = (byte)(alphaBlock >> 8);

                          BCnLerpAlphaUnorm(alphaPalette);
                          BCnDecodeTileAlphaRgba(tile, alphaPalette, alphaBlock >> 16);

                          // Both counts are in 1..4 here, so the OR equals 4 only when both are 4.
                          bool isFullTile = (columnsToCopy | rowsToCopy) == 4;

                          if (!isFullTile)
                          {
                              // Partial tile on the right/bottom edge: copy only the texels inside the image.
                              int texelStart = rowStart + texelX;

                              for (int tileY = 0; tileY < rowsToCopy; tileY++)
                              {
                                  Span<uint> source = tileTexels.Slice(tileY * 4, columnsToCopy);
                                  Span<uint> destination = outputTexels.Slice(texelStart + width * tileY, columnsToCopy);

                                  source.CopyTo(destination);
                              }
                          }
                          else
                          {
                              // Full tile: store each 4-texel row with a single 128-bit write.
                              row0[blockX] = tileRows[0];
                              row1[blockX] = tileRows[1];
                              row2[blockX] = tileRows[2];
                              row3[blockX] = tileRows[3];
                          }

                          data = data.Slice(16);
                      }
                  }

                  sliceStart += width * height;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC4 compressed texture data into an uncompressed buffer with 1 byte per texel.
  /// </summary>
  /// <remarks>
  /// Each 4x4 block takes 8 bytes: two endpoint bytes followed by 3-bit per-texel indices.
  /// Every output line is padded so that its length is a multiple of 4 bytes.
  /// Blocks are consumed sequentially, walking levels, then layers, then depth slices, then block rows.
  /// </remarks>
  /// <param name="data">Compressed blocks for every level, layer and depth slice</param>
  /// <param name="width">Width of the first level, in texels</param>
  /// <param name="height">Height of the first level, in texels</param>
  /// <param name="depth">Depth of the first level, in slices</param>
  /// <param name="levels">Number of levels to decode</param>
  /// <param name="layers">Number of layers to decode</param>
  /// <param name="signed">True to interpolate the endpoints as signed bytes, false for unsigned bytes</param>
  /// <returns>Decoded texels of every image, one image after another, with padded lines</returns>
  public static byte[] DecodeBC4(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
  {
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelStride = BitUtils.AlignUp(Math.Max(1, width >> level), 4);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelStride * levelHeight * levelDepth * layers;
      }

      // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned.
      int stride = BitUtils.AlignUp(width, 4);

      byte[] output = new byte[totalBytes];
      Span<byte> outputBytes = output.AsSpan();

      // Each block is a single 64-bit word; blockIndex is the next block to consume.
      ReadOnlySpan<ulong> blocks = MemoryMarshal.Cast<byte, ulong>(data);
      int blockIndex = 0;

      // Scratch storage for the 8-entry palette and the decoded 4x4 tile (one byte per texel).
      Span<byte> palette = stackalloc byte[8];
      Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight];
      Span<uint> tileRows = MemoryMarshal.Cast<byte, uint>(tile);

      // Views over the four output lines covered by the current row of blocks.
      Span<uint> row0 = default;
      Span<uint> row1 = default;
      Span<uint> row2 = default;
      Span<uint> row3 = default;

      // Offset, in bytes, of the image currently being written.
      int sliceStart = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksWide = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksHigh = BitUtils.DivRoundUp(height, BlockHeight);

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  for (int blockY = 0; blockY < blocksHigh; blockY++)
                  {
                      int texelY = blockY * BlockHeight;
                      int rowsToCopy = Math.Min(BlockHeight, height - texelY);
                      int rowStart = sliceStart + texelY * stride;

                      if (rowsToCopy == 4)
                      {
                          row0 = MemoryMarshal.Cast<byte, uint>(outputBytes.Slice(rowStart));
                          row1 = MemoryMarshal.Cast<byte, uint>(outputBytes.Slice(rowStart + stride));
                          row2 = MemoryMarshal.Cast<byte, uint>(outputBytes.Slice(rowStart + stride * 2));
                          row3 = MemoryMarshal.Cast<byte, uint>(outputBytes.Slice(rowStart + stride * 3));
                      }

                      for (int blockX = 0; blockX < blocksWide; blockX++)
                      {
                          int texelX = blockX * BlockWidth;
                          int columnsToCopy = Math.Min(BlockWidth, width - texelX);

                          ulong block = blocks[blockIndex];

                          palette[0] = (byte)block;
                          palette[1] = (byte)(block >> 8);

                          if (!signed)
                          {
                              BCnLerpAlphaUnorm(palette);
                          }
                          else
                          {
                              BCnLerpAlphaSnorm(palette);
                          }

                          BCnDecodeTileAlpha(tile, palette, block >> 16);

                          // Both counts are in 1..4 here, so the OR equals 4 only when both are 4.
                          bool isFullTile = (columnsToCopy | rowsToCopy) == 4;

                          if (!isFullTile)
                          {
                              // Partial tile on the right/bottom edge: copy only the texels inside the image.
                              int texelStart = rowStart + texelX;

                              for (int tileY = 0; tileY < rowsToCopy; tileY++)
                              {
                                  Span<byte> source = tile.Slice(tileY * 4, columnsToCopy);
                                  Span<byte> destination = outputBytes.Slice(texelStart + stride * tileY, columnsToCopy);

                                  source.CopyTo(destination);
                              }
                          }
                          else
                          {
                              // Full tile: each 4-texel row is exactly one 32-bit store.
                              row0[blockX] = tileRows[0];
                              row1[blockX] = tileRows[1];
                              row2[blockX] = tileRows[2];
                              row3[blockX] = tileRows[3];
                          }

                          blockIndex++;
                      }
                  }

                  sliceStart += stride * height;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);

          stride = BitUtils.AlignUp(width, 4);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC5 compressed texture data into an uncompressed buffer with 2 bytes per texel.
  /// </summary>
  /// <remarks>
  /// Each 4x4 block takes 16 bytes: two independent 8-byte channel blocks, each made of two
  /// endpoint bytes followed by 3-bit per-texel indices. The first channel is stored in the low
  /// byte and the second in the high byte of every 16-bit output texel. Output lines are padded
  /// to an even number of texels.
  /// Blocks are consumed sequentially, walking levels, then layers, then depth slices, then block rows.
  /// </remarks>
  /// <param name="data">Compressed blocks for every level, layer and depth slice</param>
  /// <param name="width">Width of the first level, in texels</param>
  /// <param name="height">Height of the first level, in texels</param>
  /// <param name="depth">Depth of the first level, in slices</param>
  /// <param name="levels">Number of levels to decode</param>
  /// <param name="layers">Number of layers to decode</param>
  /// <param name="signed">True to interpolate the endpoints as signed bytes, false for unsigned bytes</param>
  /// <returns>Decoded texels of every image, one image after another, with padded lines</returns>
  public static byte[] DecodeBC5(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
  {
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelStride = BitUtils.AlignUp(Math.Max(1, width >> level), 2);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelStride * levelHeight * levelDepth * layers * 2;
      }

      // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned.
      int stride = BitUtils.AlignUp(width, 2);

      byte[] output = new byte[totalBytes];
      Span<ushort> outputTexels = MemoryMarshal.Cast<byte, ushort>(output);

      // Each block is a pair of 64-bit words; blockIndex is the next word to consume.
      ReadOnlySpan<ulong> blocks = MemoryMarshal.Cast<byte, ulong>(data);
      int blockIndex = 0;

      // Scratch storage: one single-channel tile and one 8-entry palette per channel.
      Span<byte> rTile = stackalloc byte[BlockWidth * BlockHeight * 2];
      Span<byte> gTile = stackalloc byte[BlockWidth * BlockHeight * 2];
      Span<byte> rPalette = stackalloc byte[8];
      Span<byte> gPalette = stackalloc byte[8];

      Span<uint> rTileRows = MemoryMarshal.Cast<byte, uint>(rTile);
      Span<uint> gTileRows = MemoryMarshal.Cast<byte, uint>(gTile);

      // Views over the four output lines covered by the current row of blocks.
      Span<ulong> row0 = default;
      Span<ulong> row1 = default;
      Span<ulong> row2 = default;
      Span<ulong> row3 = default;

      // Offset, in 16-bit texels, of the image currently being written.
      int sliceStart = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksWide = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksHigh = BitUtils.DivRoundUp(height, BlockHeight);

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  for (int blockY = 0; blockY < blocksHigh; blockY++)
                  {
                      int texelY = blockY * BlockHeight;
                      int rowsToCopy = Math.Min(BlockHeight, height - texelY);
                      int rowStart = sliceStart + texelY * stride;

                      if (rowsToCopy == 4)
                      {
                          row0 = MemoryMarshal.Cast<ushort, ulong>(outputTexels.Slice(rowStart));
                          row1 = MemoryMarshal.Cast<ushort, ulong>(outputTexels.Slice(rowStart + stride));
                          row2 = MemoryMarshal.Cast<ushort, ulong>(outputTexels.Slice(rowStart + stride * 2));
                          row3 = MemoryMarshal.Cast<ushort, ulong>(outputTexels.Slice(rowStart + stride * 3));
                      }

                      for (int blockX = 0; blockX < blocksWide; blockX++)
                      {
                          int texelX = blockX * BlockWidth;
                          int columnsToCopy = Math.Min(BlockWidth, width - texelX);

                          ulong rBlock = blocks[blockIndex];
                          ulong gBlock = blocks[blockIndex + 1];

                          rPalette[0] = (byte)rBlock;
                          rPalette[1] = (byte)(rBlock >> 8);

                          gPalette[0] = (byte)gBlock;
                          gPalette[1] = (byte)(gBlock >> 8);

                          if (!signed)
                          {
                              BCnLerpAlphaUnorm(rPalette);
                              BCnLerpAlphaUnorm(gPalette);
                          }
                          else
                          {
                              BCnLerpAlphaSnorm(rPalette);
                              BCnLerpAlphaSnorm(gPalette);
                          }

                          BCnDecodeTileAlpha(rTile, rPalette, rBlock >> 16);
                          BCnDecodeTileAlpha(gTile, gPalette, gBlock >> 16);

                          // Both counts are in 1..4 here, so the OR equals 4 only when both are 4.
                          bool isFullTile = (columnsToCopy | rowsToCopy) == 4;

                          if (!isFullTile)
                          {
                              // Partial tile on the right/bottom edge: write texel by texel.
                              int texelStart = rowStart + texelX;

                              for (int tileY = 0; tileY < rowsToCopy; tileY++)
                              {
                                  int lineStart = texelStart + stride * tileY;

                                  for (int tileX = 0; tileX < columnsToCopy; tileX++)
                                  {
                                      int texel = tileY * BlockWidth + tileX;

                                      outputTexels[lineStart + tileX] = (ushort)(rTile[texel] | (gTile[texel] << 8));
                                  }
                              }
                          }
                          else
                          {
                              // Full tile: interleave four texels of both channels into one 64-bit store per row.
                              row0[blockX] = InterleaveBytes(rTileRows[0], gTileRows[0]);
                              row1[blockX] = InterleaveBytes(rTileRows[1], gTileRows[1]);
                              row2[blockX] = InterleaveBytes(rTileRows[2], gTileRows[2]);
                              row3[blockX] = InterleaveBytes(rTileRows[3], gTileRows[3]);
                          }

                          blockIndex += 2;
                      }
                  }

                  sliceStart += stride * height;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);

          stride = BitUtils.AlignUp(width, 2);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC6 compressed texture data into an uncompressed buffer with 8 bytes per texel.
  /// </summary>
  /// <remarks>
  /// Every image (one per level, layer and depth slice) is handed to <c>BC6Decoder.Decode</c>.
  /// The input advances by 16 bytes per 4x4 block and the output by 8 bytes per texel.
  /// </remarks>
  /// <param name="data">Compressed blocks for every level, layer and depth slice</param>
  /// <param name="width">Width of the first level, in texels</param>
  /// <param name="height">Height of the first level, in texels</param>
  /// <param name="depth">Depth of the first level, in slices</param>
  /// <param name="levels">Number of levels to decode</param>
  /// <param name="layers">Number of layers to decode</param>
  /// <param name="signed">Forwarded unchanged to <c>BC6Decoder.Decode</c></param>
  /// <returns>Decoded texels of every image, tightly packed one image after another</returns>
  public static byte[] DecodeBC6(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
  {
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelWidth = Math.Max(1, width >> level);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelWidth * levelHeight * levelDepth * layers * 8;
      }

      byte[] output = new byte[totalBytes];
      Span<byte> outputBytes = output.AsSpan();

      int readPosition = 0;
      int writePosition = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksWide = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksHigh = BitUtils.DivRoundUp(height, BlockHeight);

          // Sizes of one image of this level on the compressed and on the decoded side.
          int compressedImageSize = blocksWide * blocksHigh * 16;
          int decodedImageSize = width * height * 8;

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  BC6Decoder.Decode(outputBytes.Slice(writePosition), data.Slice(readPosition), width, height, signed);

                  readPosition += compressedImageSize;
                  writePosition += decodedImageSize;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC7 compressed texture data into an uncompressed buffer with 4 bytes per texel.
  /// </summary>
  /// <remarks>
  /// Every image (one per level, layer and depth slice) is handed to <c>BC7Decoder.Decode</c>.
  /// The input advances by 16 bytes per 4x4 block and the output by 4 bytes per texel.
  /// </remarks>
  /// <param name="data">Compressed blocks for every level, layer and depth slice</param>
  /// <param name="width">Width of the first level, in texels</param>
  /// <param name="height">Height of the first level, in texels</param>
  /// <param name="depth">Depth of the first level, in slices</param>
  /// <param name="levels">Number of levels to decode</param>
  /// <param name="layers">Number of layers to decode</param>
  /// <returns>Decoded texels of every image, tightly packed one image after another</returns>
  public static byte[] DecodeBC7(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
  {
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelWidth = Math.Max(1, width >> level);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelWidth * levelHeight * levelDepth * layers * 4;
      }

      byte[] output = new byte[totalBytes];
      Span<byte> outputBytes = output.AsSpan();

      int readPosition = 0;
      int writePosition = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksWide = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksHigh = BitUtils.DivRoundUp(height, BlockHeight);

          // Sizes of one image of this level on the compressed and on the decoded side.
          int compressedImageSize = blocksWide * blocksHigh * 16;
          int decodedImageSize = width * height * 4;

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  BC7Decoder.Decode(outputBytes.Slice(writePosition), data.Slice(readPosition), width, height);

                  readPosition += compressedImageSize;
                  writePosition += decodedImageSize;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
      }

      return output;
  }
  /// <summary>
  /// Interleaves the bytes of two 32-bit values into one 64-bit value.
  /// </summary>
  /// <param name="left">Value whose bytes land in the even byte positions (0, 2, 4, 6)</param>
  /// <param name="right">Value whose bytes land in the odd byte positions (1, 3, 5, 7)</param>
  /// <returns>The interleaved 64-bit value</returns>
  private static ulong InterleaveBytes(uint left, uint right)
  {
      ulong evenBytes = InterleaveBytesWithZeros(left);
      ulong oddBytes = InterleaveBytesWithZeros(right) << 8;

      return evenBytes | oddBytes;
  }
  /// <summary>
  /// Spreads the four bytes of a 32-bit value over the even byte positions of a 64-bit value,
  /// leaving every odd byte position zero.
  /// </summary>
  /// <param name="value">Value to spread</param>
  /// <returns>Byte 0 at bits 0-7, byte 1 at bits 16-23, byte 2 at bits 32-39 and byte 3 at bits 48-55</returns>
  private static ulong InterleaveBytesWithZeros(uint value)
  {
      ulong spread = value & 0xffu;

      spread |= (ulong)(value & 0xff00u) << 8;
      spread |= (ulong)(value & 0xff0000u) << 16;
      spread |= (ulong)(value & 0xff000000u) << 24;

      return spread;
  }
  /// <summary>
  /// Fills entries 2 to 7 of an 8-entry palette from the two unsigned endpoints in entries 0 and 1.
  /// </summary>
  /// <remarks>
  /// When the first endpoint is greater than the second, six values are interpolated between them
  /// in sevenths. Otherwise four values are interpolated in fifths and the last two entries are
  /// set to 0 and 0xff. Divisions truncate.
  /// </remarks>
  /// <param name="alpha">Palette of at least 8 bytes; entries 0 and 1 are read, entries 2 to 7 are written</param>
  private static void BCnLerpAlphaUnorm(Span<byte> alpha)
  {
      int first = alpha[0];
      int second = alpha[1];

      if (first > second)
      {
          for (int step = 1; step <= 6; step++)
          {
              alpha[step + 1] = (byte)(((7 - step) * first + step * second) / 7);
          }
      }
      else
      {
          for (int step = 1; step <= 4; step++)
          {
              alpha[step + 1] = (byte)(((5 - step) * first + step * second) / 5);
          }

          alpha[6] = 0;
          alpha[7] = 0xff;
      }
  }
  /// <summary>
  /// Fills entries 2 to 7 of an 8-entry palette from the two endpoints in entries 0 and 1,
  /// treating both endpoints as signed bytes.
  /// </summary>
  /// <remarks>
  /// When the first endpoint is greater than the second (signed comparison), six values are
  /// interpolated between them in sevenths. Otherwise four values are interpolated in fifths and
  /// the last two entries are set to 0x80 and 0x7f. Divisions truncate towards zero.
  /// </remarks>
  /// <param name="alpha">Palette of at least 8 bytes; entries 0 and 1 are read, entries 2 to 7 are written</param>
  private static void BCnLerpAlphaSnorm(Span<byte> alpha)
  {
      int first = (sbyte)alpha[0];
      int second = (sbyte)alpha[1];

      if (first > second)
      {
          for (int step = 1; step <= 6; step++)
          {
              alpha[step + 1] = (byte)(((7 - step) * first + step * second) / 7);
          }
      }
      else
      {
          for (int step = 1; step <= 4; step++)
          {
              alpha[step + 1] = (byte)(((5 - step) * first + step * second) / 5);
          }

          alpha[6] = 0x80;
          alpha[7] = 0x7f;
      }
  }
  /// <summary>
  /// Expands sixteen 3-bit palette indices into a tile of sixteen bytes, one byte per texel.
  /// </summary>
  /// <param name="output">Destination tile; the first 16 bytes are written</param>
  /// <param name="rPal">8-entry palette the indices select from</param>
  /// <param name="rI">Packed indices, 3 bits per texel, texel 0 in the lowest bits</param>
  private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
  {
      if (!Avx2.IsSupported)
      {
          // Scalar fallback: look up one texel at a time.
          for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
          {
              output[texel] = rPal[(int)((rI >> (texel * 3)) & 7)];
          }

          return;
      }

      Vector128<uint> laneShifts = Vector128.Create(0u, 3u, 6u, 9u);
      Vector128<uint> indexMask = Vector128.Create(7u);

      // The whole palette fits in the low 8 bytes of a vector register.
      Vector128<byte> palette;

      fixed (byte* palettePtr = rPal)
      {
          palette = Sse2.LoadScalarVector128((ulong*)palettePtr).AsByte();
      }

      // Index bits of texels 0-7 start at bit 0 and those of texels 8-15 at bit 24.
      Vector128<uint> lowerBits = Avx2.ShiftRightLogicalVariable(Vector128.Create((uint)rI), laneShifts);
      Vector128<uint> upperBits = Avx2.ShiftRightLogicalVariable(Vector128.Create((uint)(rI >> 24)), laneShifts);

      // A further shift by 12 bits (4 texels) yields the second group of four within each half.
      Vector128<uint> texels0To3 = Sse2.And(lowerBits, indexMask);
      Vector128<uint> texels4To7 = Sse2.And(Sse2.ShiftRightLogical(lowerBits, 12), indexMask);
      Vector128<uint> texels8To11 = Sse2.And(upperBits, indexMask);
      Vector128<uint> texels12To15 = Sse2.And(Sse2.ShiftRightLogical(upperBits, 12), indexMask);

      // Narrow the sixteen 32-bit indices down to sixteen bytes, keeping texel order.
      Vector128<ushort> lowerHalf = Sse41.PackUnsignedSaturate(texels0To3.AsInt32(), texels4To7.AsInt32());
      Vector128<ushort> upperHalf = Sse41.PackUnsignedSaturate(texels8To11.AsInt32(), texels12To15.AsInt32());
      Vector128<byte> shuffleIndices = Sse2.PackUnsignedSaturate(lowerHalf.AsInt16(), upperHalf.AsInt16());

      // A byte shuffle performs all sixteen palette look-ups at once.
      Span<Vector128<byte>> outputTile = MemoryMarshal.Cast<byte, Vector128<byte>>(output);

      outputTile[0] = Ssse3.Shuffle(palette, shuffleIndices);
  }
  /// <summary>
  /// Expands sixteen 3-bit palette indices into the fourth byte of every texel of a
  /// 4 bytes-per-texel tile.
  /// </summary>
  /// <remarks>
  /// The vector path ORs the selected value into the top byte of each 32-bit texel, while the
  /// scalar path overwrites byte 3 of each texel.
  /// </remarks>
  /// <param name="output">Tile of 16 texels, 4 bytes each</param>
  /// <param name="rPal">8-entry palette the indices select from</param>
  /// <param name="rI">Packed indices, 3 bits per texel, texel 0 in the lowest bits</param>
  private unsafe static void BCnDecodeTileAlphaRgba(Span<byte> output, Span<byte> rPal, ulong rI)
  {
      if (!Avx2.IsSupported)
      {
          // Scalar fallback: look up one texel at a time.
          for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
          {
              output[texel * 4 + 3] = rPal[(int)((rI >> (texel * 3)) & 7)];
          }

          return;
      }

      Vector128<byte> paletteBytes;

      fixed (byte* palettePtr = rPal)
      {
          paletteBytes = Sse2.LoadScalarVector128((ulong*)palettePtr).AsByte();
      }

      // Widen each palette byte to its own 32-bit lane and move it into the top byte of the lane.
      Vector256<uint> paletteLanes = Avx2.ConvertToVector256Int32(paletteBytes).AsUInt32();

      paletteLanes = Avx2.ShiftLeftLogical(paletteLanes, 24);

      // Index bits of texels 0-7 start at bit 0 and those of texels 8-15 at bit 24.
      Vector256<uint> laneShifts = Vector256.Create(0u, 3u, 6u, 9u, 12u, 15u, 18u, 21u);
      Vector256<uint> lowerIndices = Avx2.ShiftRightLogicalVariable(Vector256.Create((uint)rI), laneShifts);
      Vector256<uint> upperIndices = Avx2.ShiftRightLogicalVariable(Vector256.Create((uint)(rI >> 24)), laneShifts);

      Span<Vector256<uint>> tileHalves = MemoryMarshal.Cast<byte, Vector256<uint>>(output);

      tileHalves[0] = Avx2.Or(tileHalves[0], Avx2.PermuteVar8x32(paletteLanes, lowerIndices));
      tileHalves[1] = Avx2.Or(tileHalves[1], Avx2.PermuteVar8x32(paletteLanes, upperIndices));
  }
  /// <summary>
  /// Decodes one 8-byte BC1 colour block into a tile of sixteen 32-bit texels.
  /// </summary>
  /// <remarks>
  /// The two 16-bit endpoints are converted to 8 bits per channel with the top byte forced to
  /// 0xff; the remaining two table entries depend on how the raw endpoints compare.
  /// </remarks>
  /// <param name="output">Destination tile (64 bytes)</param>
  /// <param name="input">Colour block: two little-endian 16-bit endpoints followed by the indices</param>
  private unsafe static void BC1DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
  {
      uint packedEndpoints = BinaryPrimitives.ReadUInt32LittleEndian(input);

      uint endpoint0 = packedEndpoints & 0xffff;
      uint endpoint1 = packedEndpoints >> 16;

      uint colour0 = ConvertRgb565ToRgb888(endpoint0) | 0xff000000;
      uint colour1 = ConvertRgb565ToRgb888(endpoint1) | 0xff000000;

      Span<uint> table = stackalloc uint[4];

      table[0] = colour0;
      table[1] = colour1;
      table[2] = BC1LerpRgb2(colour0, colour1, endpoint0, endpoint1);
      table[3] = BC1LerpRgb3(colour0, colour1, endpoint0, endpoint1);

      BCnDecodeTileRgb(table, output, input);
  }
  /// <summary>
  /// Decodes the 8-byte colour block used by BC2 and BC3 into a tile of sixteen 32-bit texels.
  /// </summary>
  /// <remarks>
  /// The top byte of every table entry is left at zero, so the callers can fill it in afterwards.
  /// </remarks>
  /// <param name="output">Destination tile (64 bytes)</param>
  /// <param name="input">Colour block: two little-endian 16-bit endpoints followed by the indices</param>
  private unsafe static void BC23DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
  {
      uint packedEndpoints = BinaryPrimitives.ReadUInt32LittleEndian(input);

      uint colour0 = ConvertRgb565ToRgb888(packedEndpoints & 0xffff);
      uint colour1 = ConvertRgb565ToRgb888(packedEndpoints >> 16);

      Span<uint> table = stackalloc uint[4];

      table[0] = colour0;
      table[1] = colour1;
      table[2] = BC23LerpRgb2(colour0, colour1);
      table[3] = BC23LerpRgb3(colour0, colour1);

      BCnDecodeTileRgb(table, output, input);
  }
  /// <summary>
  /// Expands the sixteen 2-bit colour indices of a block into a tile of 32-bit texels using a
  /// 4-entry colour table.
  /// </summary>
  /// <param name="clut">Colour look-up table; entries 0 to 3 are used</param>
  /// <param name="output">Destination tile, written as sixteen consecutive 32-bit texels (64 bytes)</param>
  /// <param name="input">Colour block; the 32 index bits are read little-endian from byte offset 4</param>
  /// <exception cref="ArgumentOutOfRangeException"><paramref name="input"/> is shorter than 8 bytes</exception>
  private static void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input)
  {
      // Read the index bits through the span so a truncated block is rejected with an exception
      // on every code path, rather than being read past its end through a raw pointer.
      uint indices = BinaryPrimitives.ReadUInt32LittleEndian(input.Slice(4));

      if (Avx2.IsSupported)
      {
          Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);

          Vector256<uint> shifts0 = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u);
          Vector256<uint> shifts1 = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u);
          Vector256<uint> masks = Vector256.Create(3u);

          // Bounds-checked 16-byte load of the four table entries. The indices are masked to
          // 0..3 below, so only the lower four lanes are ever selected and the upper half of
          // the register may stay undefined.
          Vector256<uint> vClut = MemoryMarshal.Read<Vector128<uint>>(MemoryMarshal.AsBytes(clut)).ToVector256Unsafe();

          // Every lane gets a copy of the index word, then shifts its own 2-bit field down.
          Vector256<uint> vIndices = Vector256.Create(indices);

          Vector256<uint> indices0 = Avx2.And(Avx2.ShiftRightLogicalVariable(vIndices, shifts0), masks);
          Vector256<uint> indices1 = Avx2.And(Avx2.ShiftRightLogicalVariable(vIndices, shifts1), masks);

          outputAsVector256[0] = Avx2.PermuteVar8x32(vClut, indices0);
          outputAsVector256[1] = Avx2.PermuteVar8x32(vClut, indices1);
      }
      else
      {
          Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);

          for (int i = 0; i < BlockWidth * BlockHeight; i++, indices >>= 2)
          {
              outputAsUint[i] = clut[(int)(indices & 3)];
          }
      }
  }
  /// <summary>
  /// Computes entry 2 of the BC1 colour table.
  /// </summary>
  /// <remarks>
  /// When <paramref name="c0"/> is greater than <paramref name="c1"/> the result is two thirds of
  /// the first colour plus one third of the second. Otherwise it is the per-channel average of
  /// both colours, computed on the packed value. The top byte of the result is always 0xff.
  /// </remarks>
  /// <param name="color0">First endpoint, expanded to 8 bits per channel</param>
  /// <param name="color1">Second endpoint, expanded to 8 bits per channel</param>
  /// <param name="c0">First endpoint as stored in the block</param>
  /// <param name="c1">Second endpoint as stored in the block</param>
  /// <returns>The packed table entry</returns>
  private static uint BC1LerpRgb2(uint color0, uint color1, uint c0, uint c1)
  {
      uint blended;

      if (c0 <= c1)
      {
          // Packed average: the bits common to both inputs plus half of the differing bits,
          // with the mask stopping shifted bits from leaking into the neighbouring channel.
          uint sharedBits = color0 & color1;
          uint halfDifference = ((color0 ^ color1) >> 1) & 0x7f7f7f;

          blended = halfDifference + sharedBits;
      }
      else
      {
          blended = BC23LerpRgb2(color0, color1);
      }

      return blended | 0xff000000;
  }
  /// <summary>
  /// Blends two packed colours per channel as two thirds of the first plus one third of the second.
  /// </summary>
  /// <param name="color0">Colour weighted 2/3; only the three low bytes are used</param>
  /// <param name="color1">Colour weighted 1/3; only the three low bytes are used</param>
  /// <returns>The blended colour in the three low bytes, with the top byte zero</returns>
  private static uint BC23LerpRgb2(uint color0, uint color1)
  {
      uint blended = 0;

      // Handle the channels at bits 0, 8 and 16 one after another.
      for (int shift = 0; shift <= 16; shift += 8)
      {
          uint major = (color0 >> shift) & 0xff;
          uint minor = (color1 >> shift) & 0xff;

          blended |= ((2 * major + minor) / 3) << shift;
      }

      return blended;
  }
  /// <summary>
  /// Computes entry 3 of the BC1 colour table.
  /// </summary>
  /// <remarks>
  /// When <paramref name="c0"/> is greater than <paramref name="c1"/> the result is one third of
  /// the first colour plus two thirds of the second, with the top byte set to 0xff. Otherwise
  /// the entry is zero in all four bytes.
  /// </remarks>
  /// <param name="color0">First endpoint, expanded to 8 bits per channel</param>
  /// <param name="color1">Second endpoint, expanded to 8 bits per channel</param>
  /// <param name="c0">First endpoint as stored in the block</param>
  /// <param name="c1">Second endpoint as stored in the block</param>
  /// <returns>The packed table entry</returns>
  private static uint BC1LerpRgb3(uint color0, uint color1, uint c0, uint c1)
  {
      return c0 > c1 ? BC23LerpRgb3(color0, color1) | 0xff000000 : 0;
  }
  /// <summary>
  /// Blends two packed colours per channel as one third of the first plus two thirds of the second.
  /// </summary>
  /// <param name="color0">Colour weighted 1/3; only the three low bytes are used</param>
  /// <param name="color1">Colour weighted 2/3; only the three low bytes are used</param>
  /// <returns>The blended colour in the three low bytes, with the top byte zero</returns>
  private static uint BC23LerpRgb3(uint color0, uint color1)
  {
      uint blended = 0;

      // Handle the channels at bits 0, 8 and 16 one after another.
      for (int shift = 0; shift <= 16; shift += 8)
      {
          uint minor = (color0 >> shift) & 0xff;
          uint major = (color1 >> shift) & 0xff;

          blended |= ((2 * major + minor) / 3) << shift;
      }

      return blended;
  }
  /// <summary>
  /// Expands a 16-bit 5:6:5 colour to 8 bits per channel.
  /// </summary>
  /// <remarks>
  /// Each channel is widened by repeating its most significant bits in the freed low bits, so
  /// the maximum input value of a channel maps to 255. Bits above bit 15 of the input are ignored.
  /// </remarks>
  /// <param name="value">Colour with one channel in bits 11-15, one in bits 5-10 and one in bits 0-4</param>
  /// <returns>
  /// Packed colour with the bits 11-15 channel in the low byte, the bits 5-10 channel in the
  /// second byte, the bits 0-4 channel in the third byte and the top byte zero
  /// </returns>
  private static uint ConvertRgb565ToRgb888(uint value)
  {
      uint high5 = (value >> 11) & 0x1f;
      uint middle6 = (value >> 5) & 0x3f;
      uint low5 = value & 0x1f;

      uint lowByte = (high5 << 3) | (high5 >> 2);
      uint secondByte = (middle6 << 2) | (middle6 >> 4);
      uint thirdByte = (low5 << 3) | (low5 >> 2);

      return lowByte | (secondByte << 8) | (thirdByte << 16);
  }
  705. }
  706. }