BCnDecoder.cs 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897
  1. using Ryujinx.Common;
  2. using Ryujinx.Common.Memory;
  3. using System;
  4. using System.Buffers.Binary;
  5. using System.Runtime.InteropServices;
  6. using System.Runtime.Intrinsics;
  7. using System.Runtime.Intrinsics.X86;
  8. namespace Ryujinx.Graphics.Texture
  9. {
  10. public static class BCnDecoder
  11. {
  12. private const int BlockWidth = 4;
  13. private const int BlockHeight = 4;
  /// <summary>
  /// Decodes BC1 block-compressed data for every mip level, layer and depth slice
  /// into a rented buffer that holds 4 bytes per texel.
  /// </summary>
  /// <param name="data">Compressed input; 8 bytes are consumed per 4x4 block.</param>
  /// <param name="width">Width of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="height">Height of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="depth">Depth of the base level in slices (halved per level, minimum 1).</param>
  /// <param name="levels">Number of mip levels to decode.</param>
  /// <param name="layers">Number of layers decoded per level.</param>
  /// <returns>The rented buffer containing the decoded texels, levels stored back to back.</returns>
  public static MemoryOwner<byte> DecodeBC1(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
  {
      const int BytesPerBlock = 8;
      const int BytesPerTexel = 4;

      // Sum the decoded size of every level up front so a single buffer can be rented.
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelWidth = Math.Max(1, width >> level);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelWidth * levelHeight * levelDepth * layers * BytesPerTexel;
      }

      MemoryOwner<byte> output = MemoryOwner<byte>.Rent(totalBytes);
      Span<uint> texels = MemoryMarshal.Cast<byte, uint>(output.Span);

      // Scratch storage for one decoded 4x4 block, viewed as texels and as whole rows.
      Span<byte> block = stackalloc byte[BlockWidth * BlockHeight * BytesPerTexel];
      Span<uint> blockTexels = MemoryMarshal.Cast<byte, uint>(block);
      Span<Vector128<byte>> blockRows = MemoryMarshal.Cast<byte, Vector128<byte>>(block);

      Span<Vector128<byte>> row0 = default;
      Span<Vector128<byte>> row1 = default;
      Span<Vector128<byte>> row2 = default;
      Span<Vector128<byte>> row3 = default;

      // Offset (in texels) of the image currently being written.
      int imageStart = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksX = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksY = BitUtils.DivRoundUp(height, BlockHeight);

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  for (int blockY = 0; blockY < blocksY; blockY++)
                  {
                      int firstRow = blockY * BlockHeight;
                      int rowsToCopy = Math.Min(BlockHeight, height - firstRow);
                      int rowStart = imageStart + firstRow * width;

                      if (rowsToCopy == 4)
                      {
                          // All four rows exist, so prepare vector views used by the whole-block store below.
                          row0 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart));
                          row1 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart + width));
                          row2 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart + width * 2));
                          row3 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart + width * 3));
                      }

                      for (int blockX = 0; blockX < blocksX; blockX++)
                      {
                          int firstColumn = blockX * BlockWidth;
                          int columnsToCopy = Math.Min(BlockWidth, width - firstColumn);

                          BC1DecodeTileRgb(block, data);

                          // Neither extent exceeds 4, so the OR is 4 only when no lower bit is set.
                          bool wholeBlock = (columnsToCopy | rowsToCopy) == 4;

                          if (wholeBlock)
                          {
                              row0[blockX] = blockRows[0];
                              row1[blockX] = blockRows[1];
                              row2[blockX] = blockRows[2];
                              row3[blockX] = blockRows[3];
                          }
                          else
                          {
                              // Partial block on the right/bottom edge: copy only the texels inside the image.
                              int texelStart = rowStart + firstColumn;

                              for (int row = 0; row < rowsToCopy; row++)
                              {
                                  Span<uint> source = blockTexels.Slice(row * 4, columnsToCopy);
                                  Span<uint> destination = texels.Slice(texelStart + width * row, columnsToCopy);

                                  source.CopyTo(destination);
                              }
                          }

                          data = data.Slice(BytesPerBlock);
                      }
                  }

                  imageStart += width * height;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC2 block-compressed data for every mip level, layer and depth slice
  /// into a rented buffer that holds 4 bytes per texel.
  /// </summary>
  /// <param name="data">
  /// Compressed input; 16 bytes are consumed per 4x4 block. The first 8 bytes carry one
  /// 4-bit alpha value per texel and the last 8 bytes carry the colour data.
  /// </param>
  /// <param name="width">Width of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="height">Height of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="depth">Depth of the base level in slices (halved per level, minimum 1).</param>
  /// <param name="levels">Number of mip levels to decode.</param>
  /// <param name="layers">Number of layers decoded per level.</param>
  /// <returns>The rented buffer containing the decoded texels, levels stored back to back.</returns>
  public static MemoryOwner<byte> DecodeBC2(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
  {
      const int BytesPerBlock = 16;
      const int ColorOffset = 8;
      const int BytesPerTexel = 4;

      // Sum the decoded size of every level up front so a single buffer can be rented.
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelWidth = Math.Max(1, width >> level);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelWidth * levelHeight * levelDepth * layers * BytesPerTexel;
      }

      MemoryOwner<byte> output = MemoryOwner<byte>.Rent(totalBytes);
      Span<uint> texels = MemoryMarshal.Cast<byte, uint>(output.Span);

      // Scratch storage for one decoded 4x4 block, viewed as texels and as whole rows.
      Span<byte> block = stackalloc byte[BlockWidth * BlockHeight * BytesPerTexel];
      Span<uint> blockTexels = MemoryMarshal.Cast<byte, uint>(block);
      Span<Vector128<byte>> blockRows = MemoryMarshal.Cast<byte, Vector128<byte>>(block);

      Span<Vector128<byte>> row0 = default;
      Span<Vector128<byte>> row1 = default;
      Span<Vector128<byte>> row2 = default;
      Span<Vector128<byte>> row3 = default;

      // Offset (in texels) of the image currently being written.
      int imageStart = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksX = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksY = BitUtils.DivRoundUp(height, BlockHeight);

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  for (int blockY = 0; blockY < blocksY; blockY++)
                  {
                      int firstRow = blockY * BlockHeight;
                      int rowsToCopy = Math.Min(BlockHeight, height - firstRow);
                      int rowStart = imageStart + firstRow * width;

                      if (rowsToCopy == 4)
                      {
                          // All four rows exist, so prepare vector views used by the whole-block store below.
                          row0 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart));
                          row1 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart + width));
                          row2 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart + width * 2));
                          row3 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart + width * 3));
                      }

                      for (int blockX = 0; blockX < blocksX; blockX++)
                      {
                          int firstColumn = blockX * BlockWidth;
                          int columnsToCopy = Math.Min(BlockWidth, width - firstColumn);

                          // Colour first (this fills the whole tile), then alpha is written over byte 3 of each texel.
                          BC23DecodeTileRgb(block, data.Slice(ColorOffset));

                          ulong alphaBits = BinaryPrimitives.ReadUInt64LittleEndian(data);

                          for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
                          {
                              // Widen the 4-bit alpha to 8 bits by repeating the nibble.
                              uint nibble = (uint)(alphaBits >> (texel * 4)) & 0xf;

                              block[texel * BytesPerTexel + 3] = (byte)(nibble | (nibble << 4));
                          }

                          // Neither extent exceeds 4, so the OR is 4 only when no lower bit is set.
                          bool wholeBlock = (columnsToCopy | rowsToCopy) == 4;

                          if (wholeBlock)
                          {
                              row0[blockX] = blockRows[0];
                              row1[blockX] = blockRows[1];
                              row2[blockX] = blockRows[2];
                              row3[blockX] = blockRows[3];
                          }
                          else
                          {
                              // Partial block on the right/bottom edge: copy only the texels inside the image.
                              int texelStart = rowStart + firstColumn;

                              for (int row = 0; row < rowsToCopy; row++)
                              {
                                  Span<uint> source = blockTexels.Slice(row * 4, columnsToCopy);
                                  Span<uint> destination = texels.Slice(texelStart + width * row, columnsToCopy);

                                  source.CopyTo(destination);
                              }
                          }

                          data = data.Slice(BytesPerBlock);
                      }
                  }

                  imageStart += width * height;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC3 block-compressed data for every mip level, layer and depth slice
  /// into a rented buffer that holds 4 bytes per texel.
  /// </summary>
  /// <param name="data">
  /// Compressed input; 16 bytes are consumed per 4x4 block. The first 8 bytes hold two alpha
  /// endpoints followed by 3-bit selectors, and the last 8 bytes carry the colour data.
  /// </param>
  /// <param name="width">Width of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="height">Height of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="depth">Depth of the base level in slices (halved per level, minimum 1).</param>
  /// <param name="levels">Number of mip levels to decode.</param>
  /// <param name="layers">Number of layers decoded per level.</param>
  /// <returns>The rented buffer containing the decoded texels, levels stored back to back.</returns>
  public static MemoryOwner<byte> DecodeBC3(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
  {
      const int BytesPerBlock = 16;
      const int ColorOffset = 8;
      const int BytesPerTexel = 4;

      // Sum the decoded size of every level up front so a single buffer can be rented.
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelWidth = Math.Max(1, width >> level);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelWidth * levelHeight * levelDepth * layers * BytesPerTexel;
      }

      MemoryOwner<byte> output = MemoryOwner<byte>.Rent(totalBytes);
      Span<uint> texels = MemoryMarshal.Cast<byte, uint>(output.Span);

      // Scratch storage for one decoded 4x4 block and for the 8-entry alpha palette.
      Span<byte> block = stackalloc byte[BlockWidth * BlockHeight * BytesPerTexel];
      Span<byte> alphaPalette = stackalloc byte[8];
      Span<uint> blockTexels = MemoryMarshal.Cast<byte, uint>(block);
      Span<Vector128<byte>> blockRows = MemoryMarshal.Cast<byte, Vector128<byte>>(block);

      Span<Vector128<byte>> row0 = default;
      Span<Vector128<byte>> row1 = default;
      Span<Vector128<byte>> row2 = default;
      Span<Vector128<byte>> row3 = default;

      // Offset (in texels) of the image currently being written.
      int imageStart = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksX = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksY = BitUtils.DivRoundUp(height, BlockHeight);

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  for (int blockY = 0; blockY < blocksY; blockY++)
                  {
                      int firstRow = blockY * BlockHeight;
                      int rowsToCopy = Math.Min(BlockHeight, height - firstRow);
                      int rowStart = imageStart + firstRow * width;

                      if (rowsToCopy == 4)
                      {
                          // All four rows exist, so prepare vector views used by the whole-block store below.
                          row0 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart));
                          row1 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart + width));
                          row2 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart + width * 2));
                          row3 = MemoryMarshal.Cast<uint, Vector128<byte>>(texels.Slice(rowStart + width * 3));
                      }

                      for (int blockX = 0; blockX < blocksX; blockX++)
                      {
                          int firstColumn = blockX * BlockWidth;
                          int columnsToCopy = Math.Min(BlockWidth, width - firstColumn);

                          // Colour first (this fills the whole tile), then alpha is merged into each texel.
                          BC23DecodeTileRgb(block, data.Slice(ColorOffset));

                          ulong alphaBlock = BinaryPrimitives.ReadUInt64LittleEndian(data);

                          // The two lowest bytes are the alpha endpoints; the rest of the palette is derived from them.
                          alphaPalette[0] = (byte)alphaBlock;
                          alphaPalette[1] = (byte)(alphaBlock >> 8);

                          BCnLerpAlphaUnorm(alphaPalette);
                          BCnDecodeTileAlphaRgba(block, alphaPalette, alphaBlock >> 16);

                          // Neither extent exceeds 4, so the OR is 4 only when no lower bit is set.
                          bool wholeBlock = (columnsToCopy | rowsToCopy) == 4;

                          if (wholeBlock)
                          {
                              row0[blockX] = blockRows[0];
                              row1[blockX] = blockRows[1];
                              row2[blockX] = blockRows[2];
                              row3[blockX] = blockRows[3];
                          }
                          else
                          {
                              // Partial block on the right/bottom edge: copy only the texels inside the image.
                              int texelStart = rowStart + firstColumn;

                              for (int row = 0; row < rowsToCopy; row++)
                              {
                                  Span<uint> source = blockTexels.Slice(row * 4, columnsToCopy);
                                  Span<uint> destination = texels.Slice(texelStart + width * row, columnsToCopy);

                                  source.CopyTo(destination);
                              }
                          }

                          data = data.Slice(BytesPerBlock);
                      }
                  }

                  imageStart += width * height;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC4 block-compressed data for every mip level, layer and depth slice
  /// into a rented buffer that holds 1 byte per texel, with each row padded to a multiple of 4 bytes.
  /// </summary>
  /// <param name="data">Compressed input; one 64-bit word is consumed per 4x4 block.</param>
  /// <param name="width">Width of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="height">Height of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="depth">Depth of the base level in slices (halved per level, minimum 1).</param>
  /// <param name="levels">Number of mip levels to decode.</param>
  /// <param name="layers">Number of layers decoded per level.</param>
  /// <param name="signed">Selects the signed palette interpolation instead of the unsigned one.</param>
  /// <returns>The rented buffer containing the decoded texels, levels stored back to back.</returns>
  public static MemoryOwner<byte> DecodeBC4(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
  {
      // Sum the decoded size of every level, using the padded row width.
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int paddedWidth = BitUtils.AlignUp(Math.Max(1, width >> level), 4);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += paddedWidth * levelHeight * levelDepth * layers;
      }

      // Backends currently expect a stride alignment of 4 bytes, so the output width must be aligned.
      int stride = BitUtils.AlignUp(width, 4);

      MemoryOwner<byte> output = MemoryOwner<byte>.Rent(totalBytes);
      Span<byte> texels = output.Span;

      ReadOnlySpan<ulong> blocks = MemoryMarshal.Cast<byte, ulong>(data);

      // Scratch storage for one decoded 4x4 block (one uint per row) and for the 8-entry palette.
      Span<byte> block = stackalloc byte[BlockWidth * BlockHeight];
      Span<byte> palette = stackalloc byte[8];
      Span<uint> blockRows = MemoryMarshal.Cast<byte, uint>(block);

      Span<uint> row0 = default;
      Span<uint> row1 = default;
      Span<uint> row2 = default;
      Span<uint> row3 = default;

      // Offset (in bytes) of the image currently being written.
      int imageStart = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksX = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksY = BitUtils.DivRoundUp(height, BlockHeight);

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  for (int blockY = 0; blockY < blocksY; blockY++)
                  {
                      int firstRow = blockY * BlockHeight;
                      int rowsToCopy = Math.Min(BlockHeight, height - firstRow);
                      int rowStart = imageStart + firstRow * stride;

                      if (rowsToCopy == 4)
                      {
                          // All four rows exist, so prepare 32-bit views used by the whole-block store below.
                          row0 = MemoryMarshal.Cast<byte, uint>(texels.Slice(rowStart));
                          row1 = MemoryMarshal.Cast<byte, uint>(texels.Slice(rowStart + stride));
                          row2 = MemoryMarshal.Cast<byte, uint>(texels.Slice(rowStart + stride * 2));
                          row3 = MemoryMarshal.Cast<byte, uint>(texels.Slice(rowStart + stride * 3));
                      }

                      for (int blockX = 0; blockX < blocksX; blockX++)
                      {
                          int firstColumn = blockX * BlockWidth;
                          int columnsToCopy = Math.Min(BlockWidth, width - firstColumn);

                          ulong encoded = blocks[0];

                          // The two lowest bytes are the endpoints; the rest of the palette is derived from them.
                          palette[0] = (byte)encoded;
                          palette[1] = (byte)(encoded >> 8);

                          if (signed)
                          {
                              BCnLerpAlphaSnorm(palette);
                          }
                          else
                          {
                              BCnLerpAlphaUnorm(palette);
                          }

                          BCnDecodeTileAlpha(block, palette, encoded >> 16);

                          // Neither extent exceeds 4, so the OR is 4 only when no lower bit is set.
                          bool wholeBlock = (columnsToCopy | rowsToCopy) == 4;

                          if (wholeBlock)
                          {
                              row0[blockX] = blockRows[0];
                              row1[blockX] = blockRows[1];
                              row2[blockX] = blockRows[2];
                              row3[blockX] = blockRows[3];
                          }
                          else
                          {
                              // Partial block on the right/bottom edge: copy only the texels inside the image.
                              int texelStart = rowStart + firstColumn;

                              for (int row = 0; row < rowsToCopy; row++)
                              {
                                  Span<byte> source = block.Slice(row * 4, columnsToCopy);
                                  Span<byte> destination = texels.Slice(texelStart + stride * row, columnsToCopy);

                                  source.CopyTo(destination);
                              }
                          }

                          blocks = blocks.Slice(1);
                      }
                  }

                  imageStart += stride * height;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
          stride = BitUtils.AlignUp(width, 4);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC5 block-compressed data for every mip level, layer and depth slice
  /// into a rented buffer that holds 2 bytes per texel, with each row padded to an even texel count.
  /// </summary>
  /// <param name="data">
  /// Compressed input; two 64-bit words are consumed per 4x4 block, the first for the
  /// low byte of each texel and the second for the high byte.
  /// </param>
  /// <param name="width">Width of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="height">Height of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="depth">Depth of the base level in slices (halved per level, minimum 1).</param>
  /// <param name="levels">Number of mip levels to decode.</param>
  /// <param name="layers">Number of layers decoded per level.</param>
  /// <param name="signed">Selects the signed palette interpolation instead of the unsigned one.</param>
  /// <returns>The rented buffer containing the decoded texels, levels stored back to back.</returns>
  public static MemoryOwner<byte> DecodeBC5(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
  {
      // Sum the decoded size of every level, using the padded row width.
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int paddedWidth = BitUtils.AlignUp(Math.Max(1, width >> level), 2);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += paddedWidth * levelHeight * levelDepth * layers * 2;
      }

      // Backends currently expect a stride alignment of 4 bytes, so the output width must be aligned.
      int stride = BitUtils.AlignUp(width, 2);

      MemoryOwner<byte> output = MemoryOwner<byte>.Rent(totalBytes);
      Span<ushort> texels = MemoryMarshal.Cast<byte, ushort>(output.Span);

      ReadOnlySpan<ulong> blocks = MemoryMarshal.Cast<byte, ulong>(data);

      // Scratch storage for the two decoded channels of one block and their palettes.
      Span<byte> redBlock = stackalloc byte[BlockWidth * BlockHeight * 2];
      Span<byte> greenBlock = stackalloc byte[BlockWidth * BlockHeight * 2];
      Span<byte> redPalette = stackalloc byte[8];
      Span<byte> greenPalette = stackalloc byte[8];

      Span<uint> redRows = MemoryMarshal.Cast<byte, uint>(redBlock);
      Span<uint> greenRows = MemoryMarshal.Cast<byte, uint>(greenBlock);

      Span<ulong> row0 = default;
      Span<ulong> row1 = default;
      Span<ulong> row2 = default;
      Span<ulong> row3 = default;

      // Offset (in 16-bit texels) of the image currently being written.
      int imageStart = 0;

      for (int level = 0; level < levels; level++)
      {
          int blocksX = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksY = BitUtils.DivRoundUp(height, BlockHeight);

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  for (int blockY = 0; blockY < blocksY; blockY++)
                  {
                      int firstRow = blockY * BlockHeight;
                      int rowsToCopy = Math.Min(BlockHeight, height - firstRow);
                      int rowStart = imageStart + firstRow * stride;

                      if (rowsToCopy == 4)
                      {
                          // All four rows exist, so prepare 64-bit views used by the whole-block store below.
                          row0 = MemoryMarshal.Cast<ushort, ulong>(texels.Slice(rowStart));
                          row1 = MemoryMarshal.Cast<ushort, ulong>(texels.Slice(rowStart + stride));
                          row2 = MemoryMarshal.Cast<ushort, ulong>(texels.Slice(rowStart + stride * 2));
                          row3 = MemoryMarshal.Cast<ushort, ulong>(texels.Slice(rowStart + stride * 3));
                      }

                      for (int blockX = 0; blockX < blocksX; blockX++)
                      {
                          int firstColumn = blockX * BlockWidth;
                          int columnsToCopy = Math.Min(BlockWidth, width - firstColumn);

                          ulong redEncoded = blocks[0];
                          ulong greenEncoded = blocks[1];

                          // The two lowest bytes of each word are that channel's endpoints.
                          redPalette[0] = (byte)redEncoded;
                          redPalette[1] = (byte)(redEncoded >> 8);
                          greenPalette[0] = (byte)greenEncoded;
                          greenPalette[1] = (byte)(greenEncoded >> 8);

                          if (signed)
                          {
                              BCnLerpAlphaSnorm(redPalette);
                              BCnLerpAlphaSnorm(greenPalette);
                          }
                          else
                          {
                              BCnLerpAlphaUnorm(redPalette);
                              BCnLerpAlphaUnorm(greenPalette);
                          }

                          BCnDecodeTileAlpha(redBlock, redPalette, redEncoded >> 16);
                          BCnDecodeTileAlpha(greenBlock, greenPalette, greenEncoded >> 16);

                          // Neither extent exceeds 4, so the OR is 4 only when no lower bit is set.
                          bool wholeBlock = (columnsToCopy | rowsToCopy) == 4;

                          if (wholeBlock)
                          {
                              // Interleave four texels of each channel into one 64-bit store per row.
                              row0[blockX] = InterleaveBytes(redRows[0], greenRows[0]);
                              row1[blockX] = InterleaveBytes(redRows[1], greenRows[1]);
                              row2[blockX] = InterleaveBytes(redRows[2], greenRows[2]);
                              row3[blockX] = InterleaveBytes(redRows[3], greenRows[3]);
                          }
                          else
                          {
                              // Partial block on the right/bottom edge: write only the texels inside the image.
                              int texelStart = rowStart + firstColumn;

                              for (int row = 0; row < rowsToCopy; row++)
                              {
                                  int destination = texelStart + stride * row;

                                  for (int column = 0; column < columnsToCopy; column++)
                                  {
                                      int source = row * BlockWidth + column;

                                      texels[destination + column] = (ushort)(redBlock[source] | (greenBlock[source] << 8));
                                  }
                              }
                          }

                          blocks = blocks.Slice(2);
                      }
                  }

                  imageStart += stride * height;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
          stride = BitUtils.AlignUp(width, 2);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC6 block-compressed data for every mip level, layer and depth slice
  /// into a rented buffer that holds 8 bytes per texel.
  /// </summary>
  /// <param name="data">Compressed input; 16 bytes per 4x4 block.</param>
  /// <param name="width">Width of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="height">Height of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="depth">Depth of the base level in slices (halved per level, minimum 1).</param>
  /// <param name="levels">Number of mip levels to decode.</param>
  /// <param name="layers">Number of layers decoded per level.</param>
  /// <param name="signed">Forwarded unchanged to <c>BC6Decoder.Decode</c>.</param>
  /// <returns>The rented buffer containing the decoded texels, levels stored back to back.</returns>
  public static MemoryOwner<byte> DecodeBC6(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
  {
      const int BytesPerBlock = 16;
      const int BytesPerTexel = 8;

      // Sum the decoded size of every level up front so a single buffer can be rented.
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelWidth = Math.Max(1, width >> level);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelWidth * levelHeight * levelDepth * layers * BytesPerTexel;
      }

      MemoryOwner<byte> output = MemoryOwner<byte>.Rent(totalBytes);
      Span<byte> decoded = output.Span;

      int readPosition = 0;
      int writePosition = 0;

      for (int level = 0; level < levels; level++)
      {
          // Per-image sizes are constant within a level.
          int blocksX = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksY = BitUtils.DivRoundUp(height, BlockHeight);
          int compressedImageSize = blocksX * blocksY * BytesPerBlock;
          int decodedImageSize = width * height * BytesPerTexel;

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  BC6Decoder.Decode(decoded.Slice(writePosition), data.Slice(readPosition), width, height, signed);

                  readPosition += compressedImageSize;
                  writePosition += decodedImageSize;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
      }

      return output;
  }
  /// <summary>
  /// Decodes BC7 block-compressed data for every mip level, layer and depth slice
  /// into a rented buffer that holds 4 bytes per texel.
  /// </summary>
  /// <param name="data">Compressed input; 16 bytes per 4x4 block.</param>
  /// <param name="width">Width of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="height">Height of the base level in texels (halved per level, minimum 1).</param>
  /// <param name="depth">Depth of the base level in slices (halved per level, minimum 1).</param>
  /// <param name="levels">Number of mip levels to decode.</param>
  /// <param name="layers">Number of layers decoded per level.</param>
  /// <returns>The rented buffer containing the decoded texels, levels stored back to back.</returns>
  public static MemoryOwner<byte> DecodeBC7(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
  {
      const int BytesPerBlock = 16;
      const int BytesPerTexel = 4;

      // Sum the decoded size of every level up front so a single buffer can be rented.
      int totalBytes = 0;

      for (int level = 0; level < levels; level++)
      {
          int levelWidth = Math.Max(1, width >> level);
          int levelHeight = Math.Max(1, height >> level);
          int levelDepth = Math.Max(1, depth >> level);

          totalBytes += levelWidth * levelHeight * levelDepth * layers * BytesPerTexel;
      }

      MemoryOwner<byte> output = MemoryOwner<byte>.Rent(totalBytes);
      Span<byte> decoded = output.Span;

      int readPosition = 0;
      int writePosition = 0;

      for (int level = 0; level < levels; level++)
      {
          // Per-image sizes are constant within a level.
          int blocksX = BitUtils.DivRoundUp(width, BlockWidth);
          int blocksY = BitUtils.DivRoundUp(height, BlockHeight);
          int compressedImageSize = blocksX * blocksY * BytesPerBlock;
          int decodedImageSize = width * height * BytesPerTexel;

          for (int layer = 0; layer < layers; layer++)
          {
              for (int slice = 0; slice < depth; slice++)
              {
                  BC7Decoder.Decode(decoded.Slice(writePosition), data.Slice(readPosition), width, height);

                  readPosition += compressedImageSize;
                  writePosition += decodedImageSize;
              }
          }

          width = Math.Max(1, width >> 1);
          height = Math.Max(1, height >> 1);
          depth = Math.Max(1, depth >> 1);
      }

      return output;
  }
  /// <summary>
  /// Interleaves the four bytes of <paramref name="left"/> with the four bytes of
  /// <paramref name="right"/>: bytes of <paramref name="left"/> land on the even byte
  /// positions of the result and bytes of <paramref name="right"/> on the odd ones.
  /// </summary>
  private static ulong InterleaveBytes(uint left, uint right)
  {
      ulong evenBytes = InterleaveBytesWithZeros(left);
      ulong oddBytes = InterleaveBytesWithZeros(right) << 8;

      return evenBytes | oddBytes;
  }
  /// <summary>
  /// Spreads the four bytes of <paramref name="value"/> over a 64-bit word so that each
  /// input byte is followed by a zero byte (byte n of the input becomes byte 2n of the result).
  /// </summary>
  private static ulong InterleaveBytesWithZeros(uint value)
  {
      const ulong HalfWordLanes = 0xffff0000ffffUL;
      const ulong ByteLanes = 0xff00ff00ff00ffUL;

      ulong spread = value;

      // Step 1: move the upper 16 bits up to bit 32, leaving a 16-bit gap.
      ulong shiftedHalves = spread << 16;
      spread = (spread ^ shiftedHalves) & HalfWordLanes;

      // Step 2: within each 32-bit half, move the upper byte up to bit 16.
      ulong shiftedBytes = spread << 8;
      spread = (spread ^ shiftedBytes) & ByteLanes;

      return spread;
  }
  /// <summary>
  /// Completes an 8-entry unsigned palette in place from the two endpoints stored in
  /// <c>alpha[0]</c> and <c>alpha[1]</c>.
  /// </summary>
  /// <param name="alpha">
  /// Palette of at least 8 bytes. When the first endpoint is greater than the second, entries
  /// 2..7 receive six interpolated values; otherwise entries 2..5 receive four interpolated
  /// values and entries 6 and 7 are set to 0 and 0xff.
  /// </param>
  private static void BCnLerpAlphaUnorm(Span<byte> alpha)
  {
      int first = alpha[0];
      int second = alpha[1];

      if (first > second)
      {
          // Six intermediate steps, weights in sevenths.
          for (int step = 1; step <= 6; step++)
          {
              alpha[step + 1] = (byte)(((7 - step) * first + step * second) / 7);
          }

          return;
      }

      // Four intermediate steps, weights in fifths, followed by the two fixed extremes.
      for (int step = 1; step <= 4; step++)
      {
          alpha[step + 1] = (byte)(((5 - step) * first + step * second) / 5);
      }

      alpha[6] = 0;
      alpha[7] = 0xff;
  }
  /// <summary>
  /// Completes an 8-entry signed palette in place from the two endpoints stored in
  /// <c>alpha[0]</c> and <c>alpha[1]</c>, which are reinterpreted as signed bytes.
  /// </summary>
  /// <param name="alpha">
  /// Palette of at least 8 bytes. When the first endpoint is greater than the second, entries
  /// 2..7 receive six interpolated values; otherwise entries 2..5 receive four interpolated
  /// values and entries 6 and 7 are set to 0x80 and 0x7f.
  /// </param>
  private static void BCnLerpAlphaSnorm(Span<byte> alpha)
  {
      int first = (sbyte)alpha[0];
      int second = (sbyte)alpha[1];

      if (first > second)
      {
          // Six intermediate steps, weights in sevenths.
          for (int step = 1; step <= 6; step++)
          {
              alpha[step + 1] = (byte)(((7 - step) * first + step * second) / 7);
          }

          return;
      }

      // Four intermediate steps, weights in fifths, followed by the two fixed extremes.
      for (int step = 1; step <= 4; step++)
      {
          alpha[step + 1] = (byte)(((5 - step) * first + step * second) / 5);
      }

      alpha[6] = 0x80;
      alpha[7] = 0x7f;
  }
  /// <summary>
  /// Expands sixteen 3-bit palette selectors into sixteen bytes, one per texel of a 4x4 block.
  /// </summary>
  /// <param name="output">Destination; the first 16 bytes receive the selected palette entries.</param>
  /// <param name="rPal">Palette of 8 bytes indexed by each selector.</param>
  /// <param name="rI">Selectors packed 3 bits per texel, starting at the least significant bit.</param>
  private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
  {
      if (!Avx2.IsSupported)
      {
          // Scalar fallback: one palette lookup per texel.
          for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
          {
              output[texel] = rPal[(int)((rI >> (texel * 3)) & 7)];
          }

          return;
      }

      // Load the 8 palette bytes into the low half of a vector so they can be used as a shuffle table.
      Vector128<byte> palette;

      fixed (byte* palettePtr = rPal)
      {
          palette = Sse2.LoadScalarVector128((ulong*)palettePtr).AsByte();
      }

      Vector128<uint> laneShifts = Vector128.Create(0u, 3u, 6u, 9u);
      Vector128<uint> selectorMask = Vector128.Create(7u);

      // Selectors 0..7 live in bits 0..23 and selectors 8..15 in bits 24..47.
      Vector128<uint> firstHalf = Vector128.Create((uint)rI);
      Vector128<uint> secondHalf = Vector128.Create((uint)(rI >> 24));

      // Each lane group holds four consecutive selectors; the "B" groups are 12 bits further along.
      Vector128<uint> firstA = Avx2.ShiftRightLogicalVariable(firstHalf, laneShifts);
      Vector128<uint> secondA = Avx2.ShiftRightLogicalVariable(secondHalf, laneShifts);
      Vector128<uint> firstB = Sse2.ShiftRightLogical(firstA, 12);
      Vector128<uint> secondB = Sse2.ShiftRightLogical(secondA, 12);

      firstA = Sse2.And(firstA, selectorMask);
      secondA = Sse2.And(secondA, selectorMask);
      firstB = Sse2.And(firstB, selectorMask);
      secondB = Sse2.And(secondB, selectorMask);

      // Narrow 32-bit lanes to 16 bits and then to bytes, keeping texel order.
      Vector128<ushort> firstPacked = Sse41.PackUnsignedSaturate(firstA.AsInt32(), firstB.AsInt32());
      Vector128<ushort> secondPacked = Sse41.PackUnsignedSaturate(secondA.AsInt32(), secondB.AsInt32());
      Vector128<byte> selectors = Sse2.PackUnsignedSaturate(firstPacked.AsInt16(), secondPacked.AsInt16());

      Span<Vector128<byte>> outputVectors = MemoryMarshal.Cast<byte, Vector128<byte>>(output);

      outputVectors[0] = Ssse3.Shuffle(palette, selectors);
  }
  /// <summary>
  /// Expands sixteen 3-bit palette selectors into the fourth byte of each 4-byte texel
  /// of a 4x4 block.
  /// </summary>
  /// <param name="output">
  /// 64-byte tile. The vectorized path ORs the value into the top byte of each 32-bit texel,
  /// so that byte is expected to be zero on entry; the scalar path overwrites it.
  /// </param>
  /// <param name="rPal">Palette of 8 bytes indexed by each selector.</param>
  /// <param name="rI">Selectors packed 3 bits per texel, starting at the least significant bit.</param>
  private unsafe static void BCnDecodeTileAlphaRgba(Span<byte> output, Span<byte> rPal, ulong rI)
  {
      if (!Avx2.IsSupported)
      {
          // Scalar fallback: one palette lookup per texel, stored in byte 3 of the texel.
          for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
          {
              output[texel * 4 + 3] = rPal[(int)((rI >> (texel * 3)) & 7)];
          }

          return;
      }

      Vector128<uint> paletteBytes;

      fixed (byte* palettePtr = rPal)
      {
          paletteBytes = Sse2.LoadScalarVector128((ulong*)palettePtr).AsUInt32();
      }

      // Widen each palette byte to a 32-bit lane and move it into the top byte of that lane.
      Vector256<uint> palette = Avx2.ConvertToVector256Int32(paletteBytes.AsByte()).AsUInt32();
      palette = Avx2.ShiftLeftLogical(palette, 24);

      Vector256<uint> laneShifts = Vector256.Create(0u, 3u, 6u, 9u, 12u, 15u, 18u, 21u);

      // Selectors 0..7 live in bits 0..23 and selectors 8..15 in bits 24..47.
      Vector256<uint> firstHalf = Avx2.ShiftRightLogicalVariable(Vector256.Create((uint)rI), laneShifts);
      Vector256<uint> secondHalf = Avx2.ShiftRightLogicalVariable(Vector256.Create((uint)(rI >> 24)), laneShifts);

      Span<Vector256<uint>> outputVectors = MemoryMarshal.Cast<byte, Vector256<uint>>(output);

      // The shifted selectors are passed to the permute unmasked, exactly as in the original code.
      outputVectors[0] = Avx2.Or(outputVectors[0], Avx2.PermuteVar8x32(palette, firstHalf));
      outputVectors[1] = Avx2.Or(outputVectors[1], Avx2.PermuteVar8x32(palette, secondHalf));
  }
  /// <summary>
  /// Decodes the colour part of one BC1 block into a 4x4 tile of 32-bit texels.
  /// </summary>
  /// <param name="output">Destination tile (64 bytes).</param>
  /// <param name="input">
  /// Block data: two 16-bit endpoint colours followed by the 32-bit selector word
  /// consumed by <c>BCnDecodeTileRgb</c>.
  /// </param>
  private unsafe static void BC1DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
  {
      const uint OpaqueAlpha = 0xff000000;

      uint endpoints = BinaryPrimitives.ReadUInt32LittleEndian(input);
      uint packed0 = endpoints & 0xffff;
      uint packed1 = endpoints >> 16;

      // Both endpoints are fully opaque; the derived entries depend on how the packed endpoints compare.
      uint color0 = ConvertRgb565ToRgb888(packed0) | OpaqueAlpha;
      uint color1 = ConvertRgb565ToRgb888(packed1) | OpaqueAlpha;

      Span<uint> palette = stackalloc uint[4];

      palette[0] = color0;
      palette[1] = color1;
      palette[2] = BC1LerpRgb2(color0, color1, packed0, packed1);
      palette[3] = BC1LerpRgb3(color0, color1, packed0, packed1);

      BCnDecodeTileRgb(palette, output, input);
  }
  /// <summary>
  /// Decodes the colour part of one BC2/BC3 block into a 4x4 tile of 32-bit texels,
  /// leaving the top byte of every texel zero.
  /// </summary>
  /// <param name="output">Destination tile (64 bytes).</param>
  /// <param name="input">
  /// Colour block data: two 16-bit endpoint colours followed by the 32-bit selector word
  /// consumed by <c>BCnDecodeTileRgb</c>.
  /// </param>
  private unsafe static void BC23DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
  {
      uint endpoints = BinaryPrimitives.ReadUInt32LittleEndian(input);

      uint color0 = ConvertRgb565ToRgb888(endpoints & 0xffff);
      uint color1 = ConvertRgb565ToRgb888(endpoints >> 16);

      Span<uint> palette = stackalloc uint[4];

      palette[0] = color0;
      palette[1] = color1;
      palette[2] = BC23LerpRgb2(color0, color1);
      palette[3] = BC23LerpRgb3(color0, color1);

      BCnDecodeTileRgb(palette, output, input);
  }
  /// <summary>
  /// Expands sixteen 2-bit palette selectors into a 4x4 tile of 32-bit texels.
  /// </summary>
  /// <param name="clut">Four-entry colour palette indexed by each selector.</param>
  /// <param name="output">Destination tile; 64 bytes are written.</param>
  /// <param name="input">
  /// Colour block data. The selectors are the little-endian 32-bit word at byte offset 4,
  /// packed 2 bits per texel starting at the least significant bit.
  /// </param>
  /// <exception cref="ArgumentOutOfRangeException">
  /// <paramref name="input"/> is shorter than 8 bytes.
  /// </exception>
  private unsafe static void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input)
  {
      // Read the selector word with a bounds-checked span read on every path. The vectorized path
      // previously dereferenced a raw pointer at offset 4 without any length check, so truncated
      // block data could be read past the end of the span.
      uint indices = BinaryPrimitives.ReadUInt32LittleEndian(input[4..]);

      if (Avx2.IsSupported)
      {
          Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);

          Vector256<uint> shifts0 = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u);
          Vector256<uint> shifts1 = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u);
          Vector256<uint> masks = Vector256.Create(3u);

          // Only the lower four lanes are loaded; the masked selectors (0..3) never address the rest.
          Vector256<uint> vClut;

          fixed (uint* pClut = &clut[0])
          {
              vClut = Sse2.LoadVector128(pClut).ToVector256Unsafe();
          }

          // Broadcast the selector word, then give every lane its own texel's 2 bits.
          Vector256<uint> broadcast = Vector256.Create(indices);

          Vector256<uint> indices0 = Avx2.ShiftRightLogicalVariable(broadcast, shifts0);
          Vector256<uint> indices1 = Avx2.ShiftRightLogicalVariable(broadcast, shifts1);

          indices0 = Avx2.And(indices0, masks);
          indices1 = Avx2.And(indices1, masks);

          outputAsVector256[0] = Avx2.PermuteVar8x32(vClut, indices0);
          outputAsVector256[1] = Avx2.PermuteVar8x32(vClut, indices1);
      }
      else
      {
          Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);

          for (int i = 0; i < BlockWidth * BlockHeight; i++, indices >>= 2)
          {
              outputAsUint[i] = clut[(int)(indices & 3)];
          }
      }
  }
  /// <summary>
  /// Computes palette entry 2 of a BC1 block, always with the top (alpha) byte set to 0xff.
  /// </summary>
  /// <param name="color0">First endpoint expanded to 8 bits per channel.</param>
  /// <param name="color1">Second endpoint expanded to 8 bits per channel.</param>
  /// <param name="c0">First endpoint in its packed 16-bit form.</param>
  /// <param name="c1">Second endpoint in its packed 16-bit form.</param>
  /// <returns>
  /// The 2:1 blend of the endpoints when <paramref name="c0"/> is greater than
  /// <paramref name="c1"/>; otherwise their per-channel average.
  /// </returns>
  private static uint BC1LerpRgb2(uint color0, uint color1, uint c0, uint c1)
  {
      const uint OpaqueAlpha = 0xff000000;

      if (c0 <= c1)
      {
          // Per-channel average without unpacking: the shared bits plus half of the differing bits.
          uint sharedBits = color0 & color1;
          uint halfDifference = ((color0 ^ color1) >> 1) & 0x7f7f7f;

          return (halfDifference + sharedBits) | OpaqueAlpha;
      }

      return BC23LerpRgb2(color0, color1) | OpaqueAlpha;
  }
  /// <summary>
  /// Blends two colours per channel as (2 * color0 + color1) / 3.
  /// </summary>
  /// <param name="color0">Colour weighted 2/3, one channel in each of the three low bytes.</param>
  /// <param name="color1">Colour weighted 1/3, same layout.</param>
  /// <returns>The blended colour in the three low bytes; the top byte is zero.</returns>
  private static uint BC23LerpRgb2(uint color0, uint color1)
  {
      const uint LowMask = 0xff;
      const uint MidMask = 0xff00;
      const uint HighMask = 0xff0000;

      // Each channel is blended in place; the mask afterwards drops the fraction bits
      // that the division leaves below the channel.
      uint low = (2 * (color0 & LowMask) + (color1 & LowMask)) / 3;
      uint mid = ((2 * (color0 & MidMask) + (color1 & MidMask)) / 3) & MidMask;
      uint high = ((2 * (color0 & HighMask) + (color1 & HighMask)) / 3) & HighMask;

      return low | mid | high;
  }
  /// <summary>
  /// Computes palette entry 3 of a BC1 block.
  /// </summary>
  /// <param name="color0">First endpoint expanded to 8 bits per channel.</param>
  /// <param name="color1">Second endpoint expanded to 8 bits per channel.</param>
  /// <param name="c0">First endpoint in its packed 16-bit form.</param>
  /// <param name="c1">Second endpoint in its packed 16-bit form.</param>
  /// <returns>
  /// The 1:2 blend of the endpoints with the top byte set to 0xff when <paramref name="c0"/>
  /// is greater than <paramref name="c1"/>; otherwise 0.
  /// </returns>
  private static uint BC1LerpRgb3(uint color0, uint color1, uint c0, uint c1)
  {
      if (c0 <= c1)
      {
          return 0;
      }

      return BC23LerpRgb3(color0, color1) | 0xff000000;
  }
  /// <summary>
  /// Blends two colours per channel as (color0 + 2 * color1) / 3.
  /// </summary>
  /// <param name="color0">Colour weighted 1/3, one channel in each of the three low bytes.</param>
  /// <param name="color1">Colour weighted 2/3, same layout.</param>
  /// <returns>The blended colour in the three low bytes; the top byte is zero.</returns>
  private static uint BC23LerpRgb3(uint color0, uint color1)
  {
      const uint LowMask = 0xff;
      const uint MidMask = 0xff00;
      const uint HighMask = 0xff0000;

      // Each channel is blended in place; the mask afterwards drops the fraction bits
      // that the division leaves below the channel.
      uint low = (2 * (color1 & LowMask) + (color0 & LowMask)) / 3;
      uint mid = ((2 * (color1 & MidMask) + (color0 & MidMask)) / 3) & MidMask;
      uint high = ((2 * (color1 & HighMask) + (color0 & HighMask)) / 3) & HighMask;

      return low | mid | high;
  }
  /// <summary>
  /// Expands a packed 5:6:5 colour to 8 bits per channel by replicating each channel's
  /// top bits into the newly created low bits.
  /// </summary>
  /// <param name="value">
  /// Packed colour: bits 11..15 feed the lowest output byte, bits 5..10 the second byte and
  /// bits 0..4 the third byte. Bits above 15 are ignored.
  /// </param>
  /// <returns>The expanded colour in the three low bytes; the top byte is zero.</returns>
  private static uint ConvertRgb565ToRgb888(uint value)
  {
      // Pull the three fields out of the packed value.
      uint red5 = (value >> 11) & 0x1f;
      uint green6 = (value >> 5) & 0x3f;
      uint blue5 = value & 0x1f;

      // Widen to 8 bits, filling the low bits with the field's own high bits.
      uint red8 = (red5 << 3) | (red5 >> 2);
      uint green8 = (green6 << 2) | (green6 >> 4);
      uint blue8 = (blue5 << 3) | (blue5 >> 2);

      return red8 | (green8 << 8) | (blue8 << 16);
  }
  708. }
  709. }