BC7Encoder.cs 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005
  1. using Ryujinx.Graphics.Texture.Utils;
  2. using System;
  3. using System.Diagnostics;
  4. using System.Numerics;
  5. using System.Runtime.CompilerServices;
  6. using System.Runtime.InteropServices;
  7. using System.Runtime.Intrinsics;
  8. using System.Runtime.Intrinsics.X86;
  9. using System.Threading.Tasks;
  10. namespace Ryujinx.Graphics.Texture.Encoders
  11. {
  12. static class BC7Encoder
  13. {
  // Threshold applied to the squared difference between a tile's minimum and maximum
  // colors; EncodeFast compares against it when choosing the block mode.
  private const int MinColorVarianceForModeChange = 160;

  /// <summary>
  /// Compresses an image into BC7 blocks, producing one 128-bit block per 4x4 tile.
  /// </summary>
  /// <param name="outputStorage">Destination buffer. Each block is stored as two consecutive 64-bit words (low, then high), blocks in row-major order.</param>
  /// <param name="data">Source pixels, read as one 32-bit value per pixel with a row stride of <paramref name="width"/> pixels.</param>
  /// <param name="width">Image width in pixels.</param>
  /// <param name="height">Image height in pixels.</param>
  /// <param name="mode">Selects the fast or exhaustive encoder and whether rows of blocks are encoded in parallel.</param>
  public static void Encode(Memory<byte> outputStorage, ReadOnlyMemory<byte> data, int width, int height, EncodeMode mode)
  {
      // Round up so partially covered tiles at the right/bottom edges still get a block.
      int blocksPerRow = (width + 3) / 4;
      int blockRows = (height + 3) / 4;

      bool useFastEncoder = (mode & EncodeMode.ModeMask) == EncodeMode.Fast;

      if (!mode.HasFlag(EncodeMode.Multithreaded))
      {
          Span<ulong> destination = MemoryMarshal.Cast<byte, ulong>(outputStorage.Span);
          ReadOnlySpan<byte> source = data.Span;
          int cursor = 0;

          for (int blockY = 0; blockY < blockRows; blockY++)
          {
              for (int blockX = 0; blockX < blocksPerRow; blockX++)
              {
                  Block encoded = CompressBlock(source, blockX * 4, blockY * 4, width, height, useFastEncoder);

                  destination[cursor] = encoded.Low;
                  destination[cursor + 1] = encoded.High;
                  cursor += 2;
              }
          }

          return;
      }

      // One parallel work item per row of blocks. Spans cannot be captured by the lambda,
      // so they are re-created from the Memory instances inside each work item.
      Parallel.For(0, blockRows, blockY =>
      {
          Span<ulong> destination = MemoryMarshal.Cast<byte, ulong>(outputStorage.Span);
          ReadOnlySpan<byte> source = data.Span;
          int pixelY = blockY * 4;
          int rowStart = blockY * blocksPerRow * 2;

          for (int blockX = 0; blockX < blocksPerRow; blockX++)
          {
              Block encoded = CompressBlock(source, blockX * 4, pixelY, width, height, useFastEncoder);

              int cursor = rowStart + blockX * 2;
              destination[cursor] = encoded.Low;
              destination[cursor + 1] = encoded.High;
          }
      });
  }
  // Partition indices that EncodeFast evaluates when it has to pick a partition
  // (modes 1 and 7); the first entry with the lowest estimated error wins.
  private static readonly int[] _mostFrequentPartitions = { 0, 13, 2, 1, 15, 14, 10, 23 };
  /// <summary>
  /// Copies the tile whose top-left pixel is (<paramref name="x"/>, <paramref name="y"/>) out of the
  /// image and encodes it with the fast or the exhaustive encoder.
  /// </summary>
  /// <param name="data">Image bytes, reinterpreted as one 32-bit value per pixel.</param>
  /// <param name="x">Left pixel coordinate of the tile.</param>
  /// <param name="y">Top pixel coordinate of the tile.</param>
  /// <param name="width">Image width in pixels; also used as the row stride.</param>
  /// <param name="height">Image height in pixels.</param>
  /// <param name="fastMode">True to use <c>EncodeFast</c>, false to use <c>EncodeExhaustive</c>.</param>
  /// <returns>The encoded block.</returns>
  private static Block CompressBlock(ReadOnlySpan<byte> data, int x, int y, int width, int height, bool fastMode)
  {
      // Tiles on the right/bottom edge may be narrower or shorter than 4 pixels.
      int tileWidth = Math.Min(4, width - x);
      int tileHeight = Math.Min(4, height - y);

      ReadOnlySpan<uint> pixels = MemoryMarshal.Cast<byte, uint>(data);

      // The tile is stored tightly packed: its row stride is tileWidth, not 4.
      Span<uint> tile = stackalloc uint[tileWidth * tileHeight];

      int sourceRowStart = y * width + x;
      int tileCursor = 0;

      for (int row = 0; row < tileHeight; row++)
      {
          for (int column = 0; column < tileWidth; column++)
          {
              tile[tileCursor++] = pixels[sourceRowStart + column];
          }

          sourceRowStart += width;
      }

      if (fastMode)
      {
          return EncodeFast(tile, tileWidth, tileHeight);
      }

      return EncodeExhaustive(tile, tileWidth, tileHeight);
  }
  /// <summary>
  /// Encodes a tile with a single mode chosen heuristically from the tile's alpha usage and
  /// the squared difference between its minimum and maximum colors.
  /// </summary>
  /// <param name="tile">Tightly packed tile pixels.</param>
  /// <param name="w">Tile width in pixels.</param>
  /// <param name="h">Tile height in pixels.</param>
  /// <returns>The encoded block.</returns>
  private static Block EncodeFast(ReadOnlySpan<uint> tile, int w, int h)
  {
      (RgbaColor8 lowest, RgbaColor8 highest) = BC67Utils.GetMinMaxColors(tile, w, h);

      bool fullyOpaque = lowest.A == 255 && highest.A == 255;
      int variance = BC67Utils.SquaredDifference(lowest.GetColor32(), highest.GetColor32());
      bool highVariance = variance > MinColorVarianceForModeChange;

      int chosenMode;
      int chosenIndexMode = 0;

      if (fullyOpaque)
      {
          chosenMode = highVariance ? 1 : 6;
      }
      else if (lowest.A == highest.A)
      {
          // Alpha is constant across the tile but not 255.
          chosenMode = highVariance ? 7 : 6;
      }
      else if (!highVariance)
      {
          chosenMode = 5;
      }
      else
      {
          chosenMode = 4;

          // Count the distinct RGB values and the distinct alpha values in the tile;
          // index mode 1 is used when there are more distinct RGB values than alpha values.
          Span<uint> seenRgb = stackalloc uint[16];
          Span<uint> seenAlpha = stackalloc uint[16];
          int seenRgbCount = 0;
          int seenAlphaCount = 0;

          uint rgbBits = new RgbaColor8(255, 255, 255, 0).ToUInt32();
          uint alphaBits = new RgbaColor8(0, 0, 0, 255).ToUInt32();

          foreach (uint texel in tile)
          {
              uint rgb = texel & rgbBits;
              uint alpha = texel & alphaBits;

              if (!seenRgb.Slice(0, seenRgbCount).Contains(rgb))
              {
                  seenRgb[seenRgbCount++] = rgb;
              }

              if (!seenAlpha.Slice(0, seenAlphaCount).Contains(alpha))
              {
                  seenAlpha[seenAlphaCount++] = alpha;
              }
          }

          chosenIndexMode = seenRgbCount > seenAlphaCount ? 1 : 0;
      }

      int chosenPartition = 0;

      if (chosenMode == 1 || chosenMode == 7)
      {
          // Try the short list of candidate partitions and keep the one with the
          // lowest estimated error; the running best is passed in as an early-out bound.
          int bestError = int.MaxValue;

          foreach (int candidate in _mostFrequentPartitions)
          {
              int candidateError = GetEndPointSelectionErrorFast(tile, 2, candidate, w, h, bestError);

              if (candidateError < bestError)
              {
                  bestError = candidateError;
                  chosenPartition = candidate;
              }
          }
      }

      return Encode(chosenMode, chosenPartition, 0, chosenIndexMode, fastMode: true, tile, w, h, out _);
  }
  /// <summary>
  /// Encodes a tile by trying every mode, rotation, index mode and partition, and keeping
  /// the candidate with the lowest error. Ties are broken in favor of fewer subsets.
  /// </summary>
  /// <param name="tile">Tightly packed tile pixels.</param>
  /// <param name="w">Tile width in pixels.</param>
  /// <param name="h">Tile height in pixels.</param>
  /// <returns>The best block found.</returns>
  private static Block EncodeExhaustive(ReadOnlySpan<uint> tile, int w, int h)
  {
      Block winner = default;
      int winnerError = int.MaxValue;
      int winnerSubsetCount = int.MaxValue;

      for (int mode = 0; mode < 8; mode++)
      {
          BC7ModeInfo info = BC67Tables.BC7ModeInfos[mode];

          // Only modes 4 and 5 iterate over rotations; only mode 4 iterates over index modes.
          int rotationCount = (mode == 4 || mode == 5) ? 4 : 1;
          int indexModeCount = mode == 4 ? 2 : 1;
          int partitionCount = 1 << info.PartitionBitCount;
          int subsetCount = info.SubsetCount;

          for (int rotation = 0; rotation < rotationCount; rotation++)
          {
              for (int indexMode = 0; indexMode < indexModeCount; indexMode++)
              {
                  for (int partition = 0; partition < partitionCount; partition++)
                  {
                      Block candidate = Encode(mode, partition, rotation, indexMode, fastMode: false, tile, w, h, out int candidateError);

                      bool lowerError = candidateError < winnerError;
                      bool sameErrorFewerSubsets = candidateError == winnerError && subsetCount < winnerSubsetCount;

                      if (lowerError || sameErrorFewerSubsets)
                      {
                          winner = candidate;
                          winnerError = candidateError;
                          winnerSubsetCount = subsetCount;
                      }
                  }
              }
          }
      }

      return winner;
  }
  /// <summary>
  /// Encodes one tile as a BC7 block using the given mode, partition, rotation and index mode.
  /// </summary>
  /// <param name="mode">Block mode; indexes <c>BC67Tables.BC7ModeInfos</c>.</param>
  /// <param name="partition">Partition index used to look up the partition and fix-up tables.</param>
  /// <param name="rotation">Rotation value; 1, 2 or 3 selects R, G or B as the separately indexed channel, anything else selects A.</param>
  /// <param name="indexMode">Non-zero swaps which index set drives the color channels and which drives the separate channel.</param>
  /// <param name="fastMode">Forwarded to end point selection.</param>
  /// <param name="tile">Tightly packed tile pixels.</param>
  /// <param name="w">Tile width in pixels.</param>
  /// <param name="h">Tile height in pixels.</param>
  /// <param name="errorSum">Sum of the values returned by <c>BC67Utils.SelectIndices</c> for the index sets in use.</param>
  /// <returns>The encoded block.</returns>
  private static Block Encode(
      int mode,
      int partition,
      int rotation,
      int indexMode,
      bool fastMode,
      ReadOnlySpan<uint> tile,
      int w,
      int h,
      out int errorSum)
  {
      BC7ModeInfo info = BC67Tables.BC7ModeInfos[mode];

      int subsetCount = info.SubsetCount;
      int colorDepth = info.ColorDepth;
      int alphaDepth = info.AlphaDepth;
      int pBits = info.PBits;
      int colorIndexBitCount = info.ColorIndexBitCount;
      int alphaIndexBitCount = info.AlphaIndexBitCount;

      bool separateAlphaIndices = alphaIndexBitCount != 0;

      // Channels handled by the second ("alpha") index set. With rotation, one of the
      // color channels takes that role instead of alpha.
      RgbaColor8 secondaryChannels;

      if (!separateAlphaIndices)
      {
          secondaryChannels = new RgbaColor8(0, 0, 0, 0);
      }
      else if (rotation == 1)
      {
          secondaryChannels = new RgbaColor8(255, 0, 0, 0);
      }
      else if (rotation == 2)
      {
          secondaryChannels = new RgbaColor8(0, 255, 0, 0);
      }
      else if (rotation == 3)
      {
          secondaryChannels = new RgbaColor8(0, 0, 255, 0);
      }
      else
      {
          secondaryChannels = new RgbaColor8(0, 0, 0, 255);
      }

      uint alphaMask = secondaryChannels.ToUInt32();

      if (indexMode != 0)
      {
          // Swap the roles of the two index sets.
          alphaMask = ~alphaMask;
      }

      //
      // Select color palette.
      //

      Span<uint> endPoints0 = stackalloc uint[subsetCount];
      Span<uint> endPoints1 = stackalloc uint[subsetCount];

      SelectEndPoints(
          tile,
          w,
          h,
          endPoints0,
          endPoints1,
          subsetCount,
          partition,
          colorIndexBitCount,
          colorDepth,
          alphaDepth,
          ~alphaMask,
          fastMode);

      if (separateAlphaIndices)
      {
          SelectEndPoints(
              tile,
              w,
              h,
              endPoints0,
              endPoints1,
              subsetCount,
              partition,
              alphaIndexBitCount,
              colorDepth,
              alphaDepth,
              alphaMask,
              fastMode);
      }

      // P-bits are either one per subset (shared by both end points) or one per end point.
      bool pBitPerSubset = pBits == subsetCount;
      Span<int> pBitValues = stackalloc int[pBits];

      for (int i = 0; i < pBits; i++)
      {
          if (pBitPerSubset)
          {
              pBitValues[i] = GetPBit(endPoints0[i], endPoints1[i], colorDepth, alphaDepth);
          }
          else
          {
              int owner = i >> 1;
              uint endPoint = (i & 1) == 0 ? endPoints0[owner] : endPoints1[owner];

              pBitValues[i] = GetPBit(endPoint, colorDepth, alphaDepth);
          }
      }

      //
      // Select indices.
      //

      int colorIndexCount = 1 << colorIndexBitCount;
      int alphaIndexCount = 1 << alphaIndexBitCount;

      Span<byte> colorIndices = stackalloc byte[16];
      Span<byte> alphaIndices = stackalloc byte[16];

      errorSum = BC67Utils.SelectIndices(
          tile,
          w,
          h,
          endPoints0,
          endPoints1,
          pBitValues,
          colorIndices,
          subsetCount,
          partition,
          colorIndexBitCount,
          colorIndexCount,
          colorDepth,
          alphaDepth,
          pBits,
          alphaMask);

      if (separateAlphaIndices)
      {
          errorSum += BC67Utils.SelectIndices(
              tile,
              w,
              h,
              endPoints0,
              endPoints1,
              pBitValues,
              alphaIndices,
              subsetCount,
              partition,
              alphaIndexBitCount,
              alphaIndexCount,
              colorDepth,
              alphaDepth,
              pBits,
              ~alphaMask);
      }

      byte[] fixUpTable = BC67Tables.FixUpIndices[subsetCount - 1][partition];
      byte[] partitionMap = BC67Tables.PartitionTable[subsetCount - 1][partition];

      // The index at each fix-up position is stored with one bit less, so its top bit must
      // be zero. When it is not, the subset's end points are swapped and its indices inverted.
      Span<bool> colorSwapSubset = stackalloc bool[3];

      for (int i = 0; i < 3; i++)
      {
          colorSwapSubset[i] = colorIndices[fixUpTable[i]] >= (colorIndexCount >> 1);
      }

      bool alphaSwapSubset = alphaIndices[0] >= (alphaIndexCount >> 1);

      //
      // Write the block.
      //

      Block block = new Block();
      int offset = 0;

      block.Encode(1UL << mode, ref offset, mode + 1);
      block.Encode((ulong)partition, ref offset, info.PartitionBitCount);
      block.Encode((ulong)rotation, ref offset, info.RotationBitCount);
      block.Encode((ulong)indexMode, ref offset, info.IndexModeBitCount);

      // End points are written channel by channel (R, G, B, then A when the mode stores alpha),
      // and within each channel subset by subset.
      int channelCount = alphaDepth != 0 ? 4 : 3;

      for (int channel = 0; channel < channelCount; channel++)
      {
          bool isAlphaSlot = channel == 3;
          int depth = isAlphaSlot ? alphaDepth : colorDepth;

          // Rotation exchanges one color channel with alpha: the rotated color slot reads
          // component 3, and the alpha slot reads the rotated color component.
          int sourceComponent;

          if (isAlphaSlot)
          {
              sourceComponent = (rotation - 1) & 3;
          }
          else if (((channel + 1) & 3) == rotation)
          {
              sourceComponent = 3;
          }
          else
          {
              sourceComponent = channel;
          }

          for (int subset = 0; subset < subsetCount; subset++)
          {
              RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]);
              RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]);

              int pBit0 = -1, pBit1 = -1;

              if (pBitPerSubset)
              {
                  pBit0 = pBit1 = pBitValues[subset];
              }
              else if (pBits != 0)
              {
                  pBit0 = pBitValues[subset * 2];
                  pBit1 = pBitValues[subset * 2 + 1];
              }

              // Pick the swap flag of whichever index set drives this channel.
              bool swap;

              if (isAlphaSlot)
              {
                  swap = (separateAlphaIndices && indexMode == 0) ? alphaSwapSubset : colorSwapSubset[subset];
              }
              else
              {
                  swap = indexMode == 0 ? colorSwapSubset[subset] : alphaSwapSubset;
              }

              RgbaColor8 firstColor = swap ? color1 : color0;
              RgbaColor8 secondColor = swap ? color0 : color1;
              int firstPBit = swap ? pBit1 : pBit0;
              int secondPBit = swap ? pBit0 : pBit1;

              block.Encode(BC67Utils.QuantizeComponent(firstColor.GetComponent(sourceComponent), depth, firstPBit), ref offset, depth);
              block.Encode(BC67Utils.QuantizeComponent(secondColor.GetComponent(sourceComponent), depth, secondPBit), ref offset, depth);
          }
      }

      for (int i = 0; i < pBits; i++)
      {
          block.Encode((ulong)pBitValues[i], ref offset, 1);
      }

      for (int texel = 0; texel < 16; texel++)
      {
          int subset = partitionMap[texel];
          byte index = colorIndices[texel];

          if (colorSwapSubset[subset])
          {
              index = (byte)(index ^ (colorIndexCount - 1));
          }

          int indexBits = texel == fixUpTable[subset] ? colorIndexBitCount - 1 : colorIndexBitCount;

          Debug.Assert(index < (1 << indexBits));

          block.Encode(index, ref offset, indexBits);
      }

      if (separateAlphaIndices)
      {
          for (int texel = 0; texel < 16; texel++)
          {
              byte index = alphaIndices[texel];

              if (alphaSwapSubset)
              {
                  index = (byte)(index ^ (alphaIndexCount - 1));
              }

              // The second index set always has its fix-up position at texel 0.
              int indexBits = texel == 0 ? alphaIndexBitCount - 1 : alphaIndexBitCount;

              Debug.Assert(index < (1 << indexBits));

              block.Encode(index, ref offset, indexBits);
          }
      }

      return block;
  }
  /// <summary>
  /// Estimates the error of encoding a tile with the given partition. End points are picked
  /// with <c>SelectEndPointsFast</c>, quantized to 6 bits per color channel with a P-bit, and
  /// each texel contributes the squared difference to its closest entry of an 8-entry palette.
  /// </summary>
  /// <param name="tile">Tightly packed tile pixels (row stride <paramref name="w"/>); expected to hold <paramref name="w"/> * <paramref name="h"/> texels.</param>
  /// <param name="subsetCount">Number of subsets in the partition.</param>
  /// <param name="partition">Partition index to evaluate.</param>
  /// <param name="w">Tile width in pixels.</param>
  /// <param name="h">Tile height in pixels.</param>
  /// <param name="maxError">Early-out bound: once the running error reaches it, <see cref="int.MaxValue"/> is returned.</param>
  /// <returns>The accumulated error, or <see cref="int.MaxValue"/> when the bound was reached.</returns>
  private static unsafe int GetEndPointSelectionErrorFast(ReadOnlySpan<uint> tile, int subsetCount, int partition, int w, int h, int maxError)
  {
      byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];

      Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
      Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];

      BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);

      Span<uint> endPoints0 = stackalloc uint[subsetCount];
      Span<uint> endPoints1 = stackalloc uint[subsetCount];

      SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, uint.MaxValue);

      Span<RgbaColor32> palette = stackalloc RgbaColor32[8];

      int errorSum = 0;

      for (int subset = 0; subset < subsetCount; subset++)
      {
          uint c0 = endPoints0[subset];
          uint c1 = endPoints1[subset];

          int pBit0 = GetPBit(c0, 6, 0);
          int pBit1 = GetPBit(c1, 6, 0);

          c0 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c0), 6, 0, pBit0).ToUInt32();
          c1 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c1), 6, 0, pBit1).ToUInt32();

          if (Sse41.IsSupported)
          {
              Vector128<byte> c0Rep = Vector128.Create(c0).AsByte();
              Vector128<byte> c1Rep = Vector128.Create(c1).AsByte();

              // Interleave the end point bytes so each (c0, c1) channel pair is adjacent.
              Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

              Vector128<byte> rWeights;
              Vector128<byte> lWeights;

              fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
              {
                  rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
                  lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
              }

              // Build, for each pair of palette entries, a vector with the matching weight
              // pair replicated across all four channels.
              Vector128<byte> iWeights = Sse2.UnpackLow(rWeights, lWeights);
              Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
              Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
              Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
              Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
              Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
              Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();

              static Vector128<short> ShiftRoundToNearest(Vector128<short> x)
              {
                  return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6);
              }

              // Each vector holds two interpolated palette entries (4 channels each, 16-bit lanes).
              Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
              Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
              Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
              Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));

              // The tile is packed with a row stride of w while the partition table is laid
              // out for a full 4x4 block, so the two need separate indices on partial tiles.
              int texel = 0;

              for (int ty = 0; ty < h; ty++)
              {
                  for (int tx = 0; tx < w; tx++, texel++)
                  {
                      if (partitionTable[ty * 4 + tx] != subset)
                      {
                          continue;
                      }

                      uint c = tile[texel];

                      Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());

                      Vector128<short> delta0 = Sse2.Subtract(color, pal0);
                      Vector128<short> delta1 = Sse2.Subtract(color, pal1);
                      Vector128<short> delta2 = Sse2.Subtract(color, pal2);
                      Vector128<short> delta3 = Sse2.Subtract(color, pal3);

                      Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
                      Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
                      Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
                      Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);

                      Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
                      Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);

                      // Eight squared distances (saturated to 16 bits); keep the smallest.
                      Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);

                      Vector128<ushort> min = Sse41.MinHorizontal(delta);

                      errorSum += min.GetElement(0);
                  }
              }
          }
          else
          {
              RgbaColor32 e032 = RgbaColor8.FromUInt32(c0).GetColor32();
              RgbaColor32 e132 = RgbaColor8.FromUInt32(c1).GetColor32();

              palette[0] = e032;
              palette[palette.Length - 1] = e132;

              for (int i = 1; i < palette.Length - 1; i++)
              {
                  palette[i] = BC67Utils.Interpolate(e032, e132, i, 3);
              }

              // Same packed-tile versus 4x4-table indexing as in the vectorized path.
              int texel = 0;

              for (int ty = 0; ty < h; ty++)
              {
                  for (int tx = 0; tx < w; tx++, texel++)
                  {
                      if (partitionTable[ty * 4 + tx] != subset)
                      {
                          continue;
                      }

                      uint c = tile[texel];
                      RgbaColor32 color = Unsafe.As<uint, RgbaColor8>(ref c).GetColor32();

                      int bestMatchScore = int.MaxValue;

                      for (int j = 0; j < palette.Length; j++)
                      {
                          int score = BC67Utils.SquaredDifference(color, palette[j]);

                          if (score < bestMatchScore)
                          {
                              bestMatchScore = score;
                          }
                      }

                      errorSum += bestMatchScore;
                  }
              }
          }

          // No point in continuing if we are already above maximum.
          if (errorSum >= maxError)
          {
              return int.MaxValue;
          }
      }

      return errorSum;
  }
  /// <summary>
  /// Chooses the two end points of every subset of a partition and writes them into the
  /// channels selected by <paramref name="writeMask"/>, leaving the other channels untouched.
  /// </summary>
  /// <param name="tile">Tightly packed tile pixels.</param>
  /// <param name="w">Tile width in pixels.</param>
  /// <param name="h">Tile height in pixels.</param>
  /// <param name="endPoints0">First end point per subset; updated in place.</param>
  /// <param name="endPoints1">Second end point per subset; updated in place.</param>
  /// <param name="subsetCount">Number of subsets in the partition.</param>
  /// <param name="partition">Partition index.</param>
  /// <param name="indexBitCount">Bits per index for the channels being fitted.</param>
  /// <param name="colorDepth">Color channel bit depth of the mode.</param>
  /// <param name="alphaDepth">Alpha channel bit depth of the mode.</param>
  /// <param name="writeMask">Bit mask of the pixel bits this call is responsible for.</param>
  /// <param name="fastMode">True to use <c>SelectEndPointsFast</c>; false to fit end points over the distinct colors of each subset.</param>
  private static void SelectEndPoints(
      ReadOnlySpan<uint> tile,
      int w,
      int h,
      Span<uint> endPoints0,
      Span<uint> endPoints1,
      int subsetCount,
      int partition,
      int indexBitCount,
      int colorDepth,
      int alphaDepth,
      uint writeMask,
      bool fastMode)
  {
      byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
      uint inverseMask = ~writeMask;

      Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
      Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];

      BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);

      // Force every bit outside the write mask to one so those channels do not
      // contribute to the min/max range.
      for (int subset = 0; subset < subsetCount; subset++)
      {
          Unsafe.As<RgbaColor8, uint>(ref minColors[subset]) |= inverseMask;
          Unsafe.As<RgbaColor8, uint>(ref maxColors[subset]) |= inverseMask;
      }

      if (fastMode)
      {
          SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, writeMask);

          return;
      }

      // Gather the distinct (masked) colors of each subset; every subset owns a 16-entry bucket.
      Span<RgbaColor8> distinctColors = stackalloc RgbaColor8[subsetCount * 16];
      Span<byte> distinctCounts = stackalloc byte[subsetCount];

      int texel = 0;

      for (int ty = 0; ty < h; ty++)
      {
          for (int tx = 0; tx < w; tx++)
          {
              int owner = partitionTable[ty * 4 + tx];
              RgbaColor8 color = RgbaColor8.FromUInt32(tile[texel++] | inverseMask);

              ref byte count = ref distinctCounts[owner];
              int bucketStart = owner * 16;
              bool alreadyKnown = false;

              for (int k = 0; k < count; k++)
              {
                  if (distinctColors[bucketStart + k] == color)
                  {
                      alreadyKnown = true;
                      break;
                  }
              }

              if (!alreadyKnown)
              {
                  distinctColors[bucketStart + count++] = color;
              }
          }
      }

      for (int subset = 0; subset < subsetCount; subset++)
      {
          ReadOnlySpan<RgbaColor8> subsetColors = distinctColors.Slice(subset * 16, distinctCounts[subset]);

          (RgbaColor8 first, RgbaColor8 second) = SelectEndPoints(
              subsetColors,
              minColors[subset],
              maxColors[subset],
              indexBitCount,
              colorDepth,
              alphaDepth,
              inverseMask);

          // Merge only the masked channels into the existing end points.
          endPoints0[subset] = (endPoints0[subset] & inverseMask) | (first.ToUInt32() & writeMask);
          endPoints1[subset] = (endPoints1[subset] & inverseMask) | (second.ToUInt32() & writeMask);
      }
  }
  /// <summary>
  /// Picks, for every subset, the two texels with the smallest and largest projection onto the
  /// subset's min-to-max color direction and stores them as end points (masked channels only).
  /// </summary>
  /// <param name="partitionTable">Subset index for each texel position of a 4x4 block.</param>
  /// <param name="tile">Tightly packed tile pixels.</param>
  /// <param name="w">Tile width in pixels.</param>
  /// <param name="h">Tile height in pixels.</param>
  /// <param name="subsetCount">Number of subsets in the partition.</param>
  /// <param name="minColors">Per-subset minimum color.</param>
  /// <param name="maxColors">Per-subset maximum color.</param>
  /// <param name="endPoints0">Receives the texel with the smallest projection per subset.</param>
  /// <param name="endPoints1">Receives the texel with the largest projection per subset.</param>
  /// <param name="writeMask">Bit mask of the end point bits that are overwritten.</param>
  private static unsafe void SelectEndPointsFast(
      ReadOnlySpan<byte> partitionTable,
      ReadOnlySpan<uint> tile,
      int w,
      int h,
      int subsetCount,
      ReadOnlySpan<RgbaColor8> minColors,
      ReadOnlySpan<RgbaColor8> maxColors,
      Span<uint> endPoints0,
      Span<uint> endPoints1,
      uint writeMask)
  {
      uint keepMask = ~writeMask;

      // The vectorized path loads whole 4-texel rows, so it only handles full 4x4 tiles.
      if (!Sse41.IsSupported || w != 4 || h != 4)
      {
          for (int subset = 0; subset < subsetCount; subset++)
          {
              RgbaColor32 range = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
              RgbaColor32 direction = RgbaColor32.DivideGuarded(range << 6, new RgbaColor32(range.R + range.G + range.B + range.A), 0);

              int smallest = int.MaxValue;
              int largest = int.MinValue;
              RgbaColor8 smallestColor = default;
              RgbaColor8 largestColor = default;

              int texel = 0;

              for (int ty = 0; ty < h; ty++)
              {
                  for (int tx = 0; tx < w; tx++, texel++)
                  {
                      if (partitionTable[ty * 4 + tx] == subset)
                      {
                          RgbaColor8 candidate = RgbaColor8.FromUInt32(tile[texel]);
                          int projection = RgbaColor32.Dot(candidate.GetColor32(), direction);

                          if (smallest > projection)
                          {
                              smallest = projection;
                              smallestColor = candidate;
                          }

                          if (largest < projection)
                          {
                              largest = projection;
                              largestColor = candidate;
                          }
                      }
                  }
              }

              endPoints0[subset] = (endPoints0[subset] & keepMask) | (smallestColor.ToUInt32() & writeMask);
              endPoints1[subset] = (endPoints1[subset] & keepMask) | (largestColor.ToUInt32() & writeMask);
          }

          return;
      }

      Vector128<short> allOnes = Vector128<short>.AllBitsSet;
      Vector128<byte> texels0, texels1, texels2, texels3;

      fixed (uint* tilePointer = tile)
      {
          texels0 = Sse2.LoadVector128(tilePointer).AsByte();
          texels1 = Sse2.LoadVector128(tilePointer + 4).AsByte();
          texels2 = Sse2.LoadVector128(tilePointer + 8).AsByte();
          texels3 = Sse2.LoadVector128(tilePointer + 12).AsByte();
      }

      Vector128<byte> subsetOfTexel;

      fixed (byte* partitionPointer = partitionTable)
      {
          subsetOfTexel = Sse2.LoadVector128(partitionPointer);
      }

      for (int subset = 0; subset < subsetCount; subset++)
      {
          RgbaColor32 direction = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
          int componentSum = direction.R + direction.G + direction.B + direction.A;

          if (componentSum != 0)
          {
              direction = (direction << 6) / new RgbaColor32(componentSum);
          }

          Vector128<sbyte> directionBytes = Vector128.Create(direction.GetColor8().ToUInt32()).AsSByte();

          // Per-texel dot product with the direction: multiply-add channel pairs, then add the
          // two halves of each texel. projections01 covers texels 0-7, projections23 texels 8-15.
          Vector128<short> partial0 = Ssse3.MultiplyAddAdjacent(texels0, directionBytes);
          Vector128<short> partial1 = Ssse3.MultiplyAddAdjacent(texels1, directionBytes);
          Vector128<short> partial2 = Ssse3.MultiplyAddAdjacent(texels2, directionBytes);
          Vector128<short> partial3 = Ssse3.MultiplyAddAdjacent(texels3, directionBytes);

          Vector128<short> projections01 = Ssse3.HorizontalAdd(partial0, partial1);
          Vector128<short> projections23 = Ssse3.HorizontalAdd(partial2, partial3);

          // All-ones for texels that are NOT in this subset, widened to one 16-bit lane per texel.
          Vector128<byte> outsideSubset = Sse2.Xor(Sse2.CompareEqual(subsetOfTexel, Vector128.Create((byte)subset)), allOnes.AsByte());

          Vector128<short> outside01 = Sse2.UnpackLow(outsideSubset, outsideSubset).AsInt16();
          Vector128<short> outside23 = Sse2.UnpackHigh(outsideSubset, outsideSubset).AsInt16();

          // Minimum: texels outside the subset are forced to 0xFFFF so they never win.
          Vector128<ushort> lowest01 = Sse41.MinHorizontal(Sse2.Or(projections01, outside01).AsUInt16());
          Vector128<ushort> lowest23 = Sse41.MinHorizontal(Sse2.Or(projections23, outside23).AsUInt16());

          // Maximum: computed as the minimum of the bitwise inverse, with outside texels zeroed
          // first so their inverse is 0xFFFF.
          Vector128<ushort> highest01 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(outside01, projections01), allOnes).AsUInt16());
          Vector128<ushort> highest23 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(outside23, projections23), allOnes).AsUInt16());

          // The low 16 bits hold the value, the bits above hold the winning lane index.
          uint lowestPacked01 = lowest01.AsUInt32().GetElement(0);
          uint lowestPacked23 = lowest23.AsUInt32().GetElement(0);
          uint highestPacked01 = highest01.AsUInt32().GetElement(0);
          uint highestPacked23 = highest23.AsUInt32().GetElement(0);

          uint lowestTexel;

          if ((ushort)lowestPacked23 < (ushort)lowestPacked01)
          {
              lowestTexel = tile[(int)(lowestPacked23 >> 16) + 8];
          }
          else
          {
              lowestTexel = tile[(int)(lowestPacked01 >> 16)];
          }

          // Note that we calculate the maximum as the minimum of the inverse, so less here is actually greater.
          uint highestTexel;

          if ((ushort)highestPacked23 < (ushort)highestPacked01)
          {
              highestTexel = tile[(int)(highestPacked23 >> 16) + 8];
          }
          else
          {
              highestTexel = tile[(int)(highestPacked01 >> 16)];
          }

          endPoints0[subset] = (endPoints0[subset] & keepMask) | (lowestTexel & writeMask);
          endPoints1[subset] = (endPoints1[subset] & keepMask) | (highestTexel & writeMask);
      }
  }
  /// <summary>
  /// Fits a pair of end points to a set of colors. Colors are projected onto the min-to-max
  /// direction, mapped to palette indices for every usable index range, and for each mapping a
  /// least-squares line is fitted per channel. The pair with the lowest error reported by
  /// <c>BC67Utils.SelectIndices</c> is returned.
  /// </summary>
  /// <param name="values">Colors to fit; an empty span yields two default colors.</param>
  /// <param name="minValue">Minimum color of the set.</param>
  /// <param name="maxValue">Maximum color of the set.</param>
  /// <param name="indexBitCount">Bits per index; the palette has 1 &lt;&lt; indexBitCount entries.</param>
  /// <param name="colorDepth">Color channel bit depth of the mode.</param>
  /// <param name="alphaDepth">Alpha channel bit depth of the mode.</param>
  /// <param name="alphaMask">Forwarded unchanged to <c>BC67Utils.SelectIndices</c>.</param>
  /// <returns>The selected end point pair.</returns>
  private static (RgbaColor8, RgbaColor8) SelectEndPoints(
      ReadOnlySpan<RgbaColor8> values,
      RgbaColor8 minValue,
      RgbaColor8 maxValue,
      int indexBitCount,
      int colorDepth,
      int alphaDepth,
      uint alphaMask)
  {
      int n = values.Length;

      if (n == 0)
      {
          return (default, default);
      }

      int numInterpolatedColors = 1 << indexBitCount;
      int numInterpolatedColorsMinus1 = numInterpolatedColors - 1;

      minValue = BC67Utils.Quantize(minValue, colorDepth, alphaDepth);
      maxValue = BC67Utils.Quantize(maxValue, colorDepth, alphaDepth);

      RgbaColor32 blockDir = maxValue.GetColor32() - minValue.GetColor32();
      blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0);

      // Range of the projections of the quantized colors onto the block direction.
      int minDist = int.MaxValue;
      int maxDist = 0;

      for (int i = 0; i < values.Length; i++)
      {
          RgbaColor8 color = values[i];
          int dist = RgbaColor32.Dot(BC67Utils.Quantize(color, colorDepth, alphaDepth).GetColor32(), blockDir);

          if (minDist >= dist)
          {
              minDist = dist;
          }

          if (maxDist <= dist)
          {
              maxDist = dist;
          }
      }

      int distRange = Math.Max(1, maxDist - minDist);

      RgbaColor32 nV = new RgbaColor32(n);

      int bestErrorSum = int.MaxValue;
      RgbaColor8 bestE0 = default;
      RgbaColor8 bestE1 = default;

      Span<int> indices = stackalloc int[n];
      Span<int> normalizedValues = stackalloc int[n];
      Span<RgbaColor32> colors = stackalloc RgbaColor32[n];

      // The widened colors, their normalized position along the block direction (6-bit fixed
      // point) and their sum do not depend on the index range tried below, so compute them once.
      RgbaColor32 sumY = new RgbaColor32(0);

      for (int i = 0; i < values.Length; i++)
      {
          RgbaColor32 color = values[i].GetColor32();
          int dist = RgbaColor32.Dot(color, blockDir);

          colors[i] = color;
          normalizedValues[i] = ((dist - minDist) << 6) / distRange;
          sumY += color;
      }

      // Try every index span (maxIndex) and every placement (start) of that span inside the palette.
      for (int maxIndex = numInterpolatedColorsMinus1; maxIndex >= 1; maxIndex--)
      {
          int sumX = 0;
          int sumXX = 0;
          int sumXXIncrement = 0;

          for (int i = 0; i < indices.Length; i++)
          {
              int texelIndex = (normalizedValues[i] * maxIndex + 32) >> 6;

              indices[i] = texelIndex;

              sumX += texelIndex;
              sumXX += texelIndex * texelIndex;
              sumXXIncrement += 1 + texelIndex * 2;
          }

          for (int start = 0; start < numInterpolatedColors - maxIndex; start++)
          {
              RgbaColor32 sumXY = new RgbaColor32(0);

              for (int i = 0; i < indices.Length; i++)
              {
                  sumXY += new RgbaColor32(start + indices[i]) * colors[i];
              }

              RgbaColor32 sumXV = new RgbaColor32(sumX);
              RgbaColor32 sumXXV = new RgbaColor32(sumXX);

              // Per-channel least-squares line y = m * x + b, with m and b in 6-bit fixed point.
              RgbaColor32 m = RgbaColor32.DivideGuarded((nV * sumXY - sumXV * sumY) << 6, nV * sumXXV - sumXV * sumXV, 0);
              RgbaColor32 b = ((sumY << 6) - m * sumXV) / nV;

              RgbaColor8 candidateE0 = (b >> 6).GetColor8();
              RgbaColor8 candidateE1 = ((b + m * new RgbaColor32(numInterpolatedColorsMinus1)) >> 6).GetColor8();

              int pBit0 = GetPBit(candidateE0.ToUInt32(), colorDepth, alphaDepth);
              int pBit1 = GetPBit(candidateE1.ToUInt32(), colorDepth, alphaDepth);

              int errorSum = BC67Utils.SelectIndices(
                  MemoryMarshal.Cast<RgbaColor8, uint>(values),
                  candidateE0.ToUInt32(),
                  candidateE1.ToUInt32(),
                  pBit0,
                  pBit1,
                  indexBitCount,
                  numInterpolatedColors,
                  colorDepth,
                  alphaDepth,
                  alphaMask);

              if (errorSum <= bestErrorSum)
              {
                  bestErrorSum = errorSum;
                  bestE0 = candidateE0;
                  bestE1 = candidateE1;
              }

              // Shift every index by one for the next placement:
              // sum(x + 1) = sum(x) + n and sum((x + 1)^2) = sum(x^2) + sum(2x + 1).
              sumX += n;
              sumXX += sumXXIncrement;
              sumXXIncrement += 2 * n;
          }
      }

      return (bestE0, bestE1);
  }
  /// <summary>
  /// Chooses the P-bit for an end point by majority: after rounding, the bit just below the
  /// retained depth is inspected in every channel, and the P-bit is 1 when at least two are set.
  /// </summary>
  /// <param name="color">Packed 32-bit end point; the top byte is treated as alpha.</param>
  /// <param name="colorDepth">Bits kept per color channel.</param>
  /// <param name="alphaDepth">Bits kept for alpha; 0 excludes the top byte from the vote.</param>
  /// <returns>0 or 1.</returns>
  private static int GetPBit(uint color, int colorDepth, int alphaDepth)
  {
      bool alphaParticipates = alphaDepth != 0;

      // If alpha is 0, let's assume the color information is not too important and prefer
      // to preserve alpha instead.
      if (alphaParticipates && (color >> 24) == 0)
      {
          return 0;
      }

      uint voteBits = 0x808080u >> colorDepth;

      if (alphaParticipates)
      {
          voteBits |= 0x80000000u >> alphaDepth;
      }

      // Drop each byte's top bit so the rounding addition below cannot carry out of the word.
      uint rounded = (color & 0x7f7f7f7fu) + (voteBits >> 1);
      int votes = BitOperations.PopCount(rounded & voteBits);

      return votes >= 2 ? 1 : 0;
  }

  /// <summary>
  /// Chooses the P-bit shared by a pair of end points. Only <paramref name="c0"/> is considered.
  /// </summary>
  /// <param name="c0">First end point; determines the result.</param>
  /// <param name="c1">Second end point; currently ignored.</param>
  /// <param name="colorDepth">Bits kept per color channel.</param>
  /// <param name="alphaDepth">Bits kept for alpha; 0 excludes the top byte from the vote.</param>
  /// <returns>0 or 1.</returns>
  private static int GetPBit(uint c0, uint c1, int colorDepth, int alphaDepth)
  {
      // Giving preference to the first endpoint yields better results,
      // might be a side effect of the endpoint selection algorithm?
      int fromFirstEndPoint = GetPBit(c0, colorDepth, alphaDepth);

      return fromFirstEndPoint;
  }
  816. }
  817. }