BC67Utils.cs 54 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327
  1. using System;
  2. using System.Diagnostics;
  3. using System.Runtime.CompilerServices;
  4. using System.Runtime.Intrinsics;
  5. using System.Runtime.Intrinsics.X86;
  6. namespace Ryujinx.Graphics.Texture.Utils
  7. {
  8. static class BC67Utils
  9. {
  // Quantization lookup tables, one per component depth from 4 to 8 bits (slot = depth - 4).
  // The first table has 512 entries: the low 8 bits of the index are the component value and
  // bit 8 is forwarded as the third argument of QuantizeComponentForLut (presumably the p-bit).
  // The second table has 256 entries and is built without that third argument.
  private static byte[][] _quantizationLut;
  private static byte[][] _quantizationLutNoPBit;

  /// <summary>
  /// Builds the quantization lookup tables for every supported component depth.
  /// </summary>
  static BC67Utils()
  {
      const int FirstDepth = 4;
      const int DepthCount = 5;

      _quantizationLut = new byte[DepthCount][];
      _quantizationLutNoPBit = new byte[DepthCount][];

      for (int slot = 0; slot < DepthCount; slot++)
      {
          int depth = FirstDepth + slot;

          byte[] withPBit = new byte[512];
          byte[] withoutPBit = new byte[256];

          for (int entry = 0; entry < withPBit.Length; entry++)
          {
              byte value = (byte)entry;
              int pBit = entry >> 8;

              withPBit[entry] = QuantizeComponentForLut(value, depth, pBit);

              // Only the first 256 entries (bit 8 clear) have a counterpart in the smaller table.
              if (pBit == 0)
              {
                  withoutPBit[entry] = QuantizeComponentForLut(value, depth);
              }
          }

          _quantizationLut[slot] = withPBit;
          _quantizationLutNoPBit[slot] = withoutPBit;
      }
  }
  /// <summary>
  /// Computes the per-channel minimum and maximum over all the pixels of a tile.
  /// </summary>
  /// <param name="tile">Tile pixels, each one a packed 32-bit color</param>
  /// <param name="w">Tile width in pixels</param>
  /// <param name="h">Tile height in pixels</param>
  /// <returns>The minimum color followed by the maximum color</returns>
  public static (RgbaColor8, RgbaColor8) GetMinMaxColors(ReadOnlySpan<uint> tile, int w, int h)
  {
      // The vectorized path reads 16 pixels through a raw pointer without any bounds check,
      // so it is only taken when the span really holds a complete 4x4 tile.
      if (Sse41.IsSupported && w == 4 && h == 4 && tile.Length >= 16)
      {
          GetMinMaxColorsOneSubset4x4Sse41(tile, out RgbaColor8 simdMin, out RgbaColor8 simdMax);

          return (simdMin, simdMax);
      }

      // Scalar path: start from the widest possible range and narrow it pixel by pixel.
      // An empty tile therefore yields (255, 255, 255, 255) as minimum and the default color as maximum.
      RgbaColor8 minColor = new RgbaColor8(255, 255, 255, 255);
      RgbaColor8 maxColor = default;

      for (int i = 0; i < tile.Length; i++)
      {
          RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]);

          minColor.R = Math.Min(minColor.R, color.R);
          minColor.G = Math.Min(minColor.G, color.G);
          minColor.B = Math.Min(minColor.B, color.B);
          minColor.A = Math.Min(minColor.A, color.A);

          maxColor.R = Math.Max(maxColor.R, color.R);
          maxColor.G = Math.Max(maxColor.G, color.G);
          maxColor.B = Math.Max(maxColor.B, color.B);
          maxColor.A = Math.Max(maxColor.A, color.A);
      }

      return (minColor, maxColor);
  }
  /// <summary>
  /// Computes the per-channel minimum and maximum color of every subset of a partitioned tile.
  /// </summary>
  /// <param name="partitionTable">Subset number of each pixel, laid out as a 4 pixels wide block</param>
  /// <param name="tile">Tile pixels, each one a packed 32-bit color, stored as <paramref name="h"/> rows of <paramref name="w"/> pixels</param>
  /// <param name="w">Tile width in pixels</param>
  /// <param name="h">Tile height in pixels</param>
  /// <param name="minColors">Receives the minimum color of each subset</param>
  /// <param name="maxColors">Receives the maximum color of each subset</param>
  /// <param name="subsetCount">Number of subsets used by the partition</param>
  public static void GetMinMaxColors(
      ReadOnlySpan<byte> partitionTable,
      ReadOnlySpan<uint> tile,
      int w,
      int h,
      Span<RgbaColor8> minColors,
      Span<RgbaColor8> maxColors,
      int subsetCount)
  {
      // The vectorized paths read 16 pixels (and, for two subsets, 16 partition bytes) through
      // raw pointers, so they are only taken when the spans are large enough for those reads.
      if (Sse41.IsSupported && w == 4 && h == 4 && tile.Length >= 16)
      {
          if (subsetCount == 1)
          {
              GetMinMaxColorsOneSubset4x4Sse41(tile, out minColors[0], out maxColors[0]);
              return;
          }
          else if (subsetCount == 2 && partitionTable.Length >= 16)
          {
              GetMinMaxColorsTwoSubsets4x4Sse41(partitionTable, tile, minColors, maxColors);
              return;
          }
      }

      // Both outputs are reset here so that the result does not depend on what the caller
      // left in the spans, matching the vectorized paths which always start from 255 and 0.
      minColors.Fill(new RgbaColor8(255, 255, 255, 255));
      maxColors.Clear();

      int i = 0;
      for (int ty = 0; ty < h; ty++)
      {
          for (int tx = 0; tx < w; tx++)
          {
              // The partition table is addressed with a fixed row stride of 4, the same way the
              // index selection routines of this class address it, while the tile itself is
              // read sequentially. For partial tiles (w < 4) the two offsets differ.
              int subset = partitionTable[ty * 4 + tx];
              RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++]);

              ref RgbaColor8 min = ref minColors[subset];
              ref RgbaColor8 max = ref maxColors[subset];

              min.R = Math.Min(min.R, color.R);
              min.G = Math.Min(min.G, color.G);
              min.B = Math.Min(min.B, color.B);
              min.A = Math.Min(min.A, color.A);

              max.R = Math.Max(max.R, color.R);
              max.G = Math.Max(max.G, color.G);
              max.B = Math.Max(max.B, color.B);
              max.A = Math.Max(max.A, color.A);
          }
      }
  }
  /// <summary>
  /// Vectorized per-channel minimum and maximum over the 16 pixels of a 4x4 tile.
  /// </summary>
  /// <param name="tile">Tile pixels; 16 values are read through a pointer without bounds checking</param>
  /// <param name="minColor">Receives the per-channel minimum</param>
  /// <param name="maxColor">Receives the per-channel maximum</param>
  private static unsafe void GetMinMaxColorsOneSubset4x4Sse41(ReadOnlySpan<uint> tile, out RgbaColor8 minColor, out RgbaColor8 maxColor)
  {
      Vector128<byte> top, upperMiddle, lowerMiddle, bottom;

      // Each 128-bit load picks up 4 consecutive pixels.
      fixed (uint* pixels = tile)
      {
          top = Sse2.LoadVector128(pixels).AsByte();
          upperMiddle = Sse2.LoadVector128(pixels + 4).AsByte();
          lowerMiddle = Sse2.LoadVector128(pixels + 8).AsByte();
          bottom = Sse2.LoadVector128(pixels + 12).AsByte();
      }

      // Reduce the four vectors byte by byte; every lane ends up holding the extreme of one column.
      Vector128<byte> lowest = Sse2.Min(Sse2.Min(top, upperMiddle), Sse2.Min(lowerMiddle, bottom));
      Vector128<byte> highest = Sse2.Max(Sse2.Max(top, upperMiddle), Sse2.Max(lowerMiddle, bottom));

      // Fold the four 32-bit lanes into a single color.
      minColor = HorizontalMin(lowest);
      maxColor = HorizontalMax(highest);
  }
  /// <summary>
  /// Vectorized per-channel minimum and maximum of both subsets of a two subset 4x4 tile.
  /// </summary>
  /// <param name="partitionTable">Subset number of each pixel; 16 bytes are read through a pointer without bounds checking</param>
  /// <param name="tile">Tile pixels; 16 values are read through a pointer without bounds checking</param>
  /// <param name="minColors">Receives the minimum color of subset 0 and subset 1 at indices 0 and 1</param>
  /// <param name="maxColors">Receives the maximum color of subset 0 and subset 1 at indices 0 and 1</param>
  private static unsafe void GetMinMaxColorsTwoSubsets4x4Sse41(
      ReadOnlySpan<byte> partitionTable,
      ReadOnlySpan<uint> tile,
      Span<RgbaColor8> minColors,
      Span<RgbaColor8> maxColors)
  {
      Vector128<byte> subsetOfPixel;

      fixed (byte* pTable = partitionTable)
      {
          subsetOfPixel = Sse2.LoadVector128(pTable);
      }

      // One byte per pixel: 0xFF where the pixel belongs to subset 0, 0x00 otherwise.
      Vector128<byte> inSubset0 = Sse2.CompareEqual(subsetOfPixel, Vector128<byte>.Zero);

      // Replicate every mask byte 4 times so that it covers all the bytes of its pixel.
      // This produces one mask vector for each group of 4 pixels.
      Vector128<short> doubledLow = Sse2.UnpackLow(inSubset0, inSubset0).AsInt16();
      Vector128<short> doubledHigh = Sse2.UnpackHigh(inSubset0, inSubset0).AsInt16();

      Vector128<byte> mask0 = Sse2.UnpackLow(doubledLow, doubledLow).AsByte();
      Vector128<byte> mask1 = Sse2.UnpackHigh(doubledLow, doubledLow).AsByte();
      Vector128<byte> mask2 = Sse2.UnpackLow(doubledHigh, doubledHigh).AsByte();
      Vector128<byte> mask3 = Sse2.UnpackHigh(doubledHigh, doubledHigh).AsByte();

      Vector128<byte> pixels0, pixels1, pixels2, pixels3;

      fixed (uint* pTile = tile)
      {
          pixels0 = Sse2.LoadVector128(pTile).AsByte();
          pixels1 = Sse2.LoadVector128(pTile + 4).AsByte();
          pixels2 = Sse2.LoadVector128(pTile + 8).AsByte();
          pixels3 = Sse2.LoadVector128(pTile + 12).AsByte();
      }

      Vector128<byte> lowest0 = Vector128<byte>.AllBitsSet;
      Vector128<byte> lowest1 = Vector128<byte>.AllBitsSet;
      Vector128<byte> highest0 = Vector128<byte>.Zero;
      Vector128<byte> highest1 = Vector128<byte>.Zero;

      // For each group of 4 pixels, pixels outside the subset being accumulated are made neutral:
      //  - subset 0 minimum: the blend keeps the running minimum where the mask is clear.
      //  - subset 1 minimum: OR with the mask forces subset 0 pixels to 0xFF.
      //  - subset 0 maximum: AND with the mask forces subset 1 pixels to 0.
      //  - subset 1 maximum: AND with the inverted mask forces subset 0 pixels to 0.
      lowest0 = Sse2.Min(lowest0, Sse41.BlendVariable(lowest0, pixels0, mask0));
      lowest1 = Sse2.Min(lowest1, Sse2.Or(pixels0, mask0));
      highest0 = Sse2.Max(highest0, Sse2.And(pixels0, mask0));
      highest1 = Sse2.Max(highest1, Sse2.AndNot(mask0, pixels0));

      lowest0 = Sse2.Min(lowest0, Sse41.BlendVariable(lowest0, pixels1, mask1));
      lowest1 = Sse2.Min(lowest1, Sse2.Or(pixels1, mask1));
      highest0 = Sse2.Max(highest0, Sse2.And(pixels1, mask1));
      highest1 = Sse2.Max(highest1, Sse2.AndNot(mask1, pixels1));

      lowest0 = Sse2.Min(lowest0, Sse41.BlendVariable(lowest0, pixels2, mask2));
      lowest1 = Sse2.Min(lowest1, Sse2.Or(pixels2, mask2));
      highest0 = Sse2.Max(highest0, Sse2.And(pixels2, mask2));
      highest1 = Sse2.Max(highest1, Sse2.AndNot(mask2, pixels2));

      lowest0 = Sse2.Min(lowest0, Sse41.BlendVariable(lowest0, pixels3, mask3));
      lowest1 = Sse2.Min(lowest1, Sse2.Or(pixels3, mask3));
      highest0 = Sse2.Max(highest0, Sse2.And(pixels3, mask3));
      highest1 = Sse2.Max(highest1, Sse2.AndNot(mask3, pixels3));

      minColors[0] = HorizontalMin(lowest0);
      minColors[1] = HorizontalMin(lowest1);
      maxColors[0] = HorizontalMax(highest0);
      maxColors[1] = HorizontalMax(highest1);
  }
  /// <summary>
  /// Folds the four 32-bit lanes of a vector into one color holding the byte-wise minimum of all lanes.
  /// </summary>
  /// <param name="x">Vector with one packed color per 32-bit lane</param>
  /// <returns>The color built from lane 0 after the reduction</returns>
  private static RgbaColor8 HorizontalMin(Vector128<byte> x)
  {
      // 0x31 puts lane 1 in position 0 and lane 3 in position 2, so after the minimum
      // lane 0 covers lanes 0-1 and lane 2 covers lanes 2-3.
      Vector128<byte> neighbours = Sse2.Shuffle(x.AsInt32(), 0x31).AsByte();
      Vector128<byte> pairwise = Sse2.Min(x, neighbours);

      // 2 puts lane 2 in position 0, which completes the reduction in lane 0.
      Vector128<byte> upperPair = Sse2.Shuffle(pairwise.AsInt32(), 2).AsByte();
      Vector128<byte> reduced = Sse2.Min(pairwise, upperPair);

      return RgbaColor8.FromUInt32(reduced.AsUInt32().GetElement(0));
  }
  /// <summary>
  /// Folds the four 32-bit lanes of a vector into one color holding the byte-wise maximum of all lanes.
  /// </summary>
  /// <param name="x">Vector with one packed color per 32-bit lane</param>
  /// <returns>The color built from lane 0 after the reduction</returns>
  private static RgbaColor8 HorizontalMax(Vector128<byte> x)
  {
      // 0x31 puts lane 1 in position 0 and lane 3 in position 2, so after the maximum
      // lane 0 covers lanes 0-1 and lane 2 covers lanes 2-3.
      Vector128<byte> neighbours = Sse2.Shuffle(x.AsInt32(), 0x31).AsByte();
      Vector128<byte> pairwise = Sse2.Max(x, neighbours);

      // 2 puts lane 2 in position 0, which completes the reduction in lane 0.
      Vector128<byte> upperPair = Sse2.Shuffle(pairwise.AsInt32(), 2).AsByte();
      Vector128<byte> reduced = Sse2.Max(pairwise, upperPair);

      return RgbaColor8.FromUInt32(reduced.AsUInt32().GetElement(0));
  }
  /// <summary>
  /// Computes the total error obtained when every value is replaced by the closest entry of the
  /// palette interpolated between two endpoints.
  /// </summary>
  /// <remarks>
  /// When SSE4.1 is available and the index size is 2, 3 or 4 bits, a vectorized implementation
  /// is used. Every other case goes through the scalar fallback.
  /// </remarks>
  /// <param name="values">Packed 32-bit colors to be matched against the palette</param>
  /// <param name="endPoint0">First endpoint, as a packed 32-bit color</param>
  /// <param name="endPoint1">Second endpoint, as a packed 32-bit color</param>
  /// <param name="pBit0">P-bit value forwarded to the quantization of the first endpoint</param>
  /// <param name="pBit1">P-bit value forwarded to the quantization of the second endpoint</param>
  /// <param name="indexBitCount">Number of bits of each palette index</param>
  /// <param name="indexCount">Number of palette entries</param>
  /// <param name="colorDepth">Color depth forwarded to the endpoint quantization</param>
  /// <param name="alphaDepth">Alpha depth forwarded to the endpoint quantization</param>
  /// <param name="alphaMask">Bits that are forced to 1 on the values and on the palette</param>
  /// <returns>Sum of the smallest error found for each value</returns>
  public static int SelectIndices(
      ReadOnlySpan<uint> values,
      uint endPoint0,
      uint endPoint1,
      int pBit0,
      int pBit1,
      int indexBitCount,
      int indexCount,
      int colorDepth,
      int alphaDepth,
      uint alphaMask)
  {
      if (Sse41.IsSupported)
      {
          switch (indexBitCount)
          {
              case 2:
                  return Select2BitIndicesSse41(
                      values, endPoint0, endPoint1, pBit0, pBit1,
                      indexBitCount, indexCount, colorDepth, alphaDepth, alphaMask);
              case 3:
                  return Select3BitIndicesSse41(
                      values, endPoint0, endPoint1, pBit0, pBit1,
                      indexBitCount, indexCount, colorDepth, alphaDepth, alphaMask);
              case 4:
                  return Select4BitIndicesOneSubsetSse41(
                      values, endPoint0, endPoint1, pBit0, pBit1,
                      indexBitCount, indexCount, colorDepth, alphaDepth, alphaMask);
          }
      }

      return SelectIndicesFallback(
          values, endPoint0, endPoint1, pBit0, pBit1,
          indexBitCount, indexCount, colorDepth, alphaDepth, alphaMask);
  }
  /// <summary>
  /// Vectorized error computation for a 4 entry palette (2-bit indices).
  /// </summary>
  /// <remarks>
  /// The error of a value is the smallest sum of squared channel differences against the palette,
  /// saturated to 16 bits. <paramref name="indexBitCount"/> and <paramref name="indexCount"/> are
  /// not used by this implementation.
  /// </remarks>
  /// <param name="values">Packed 32-bit colors to be matched against the palette</param>
  /// <param name="endPoint0">First endpoint, as a packed 32-bit color</param>
  /// <param name="endPoint1">Second endpoint, as a packed 32-bit color</param>
  /// <param name="pBit0">P-bit value forwarded to the quantization of the first endpoint</param>
  /// <param name="pBit1">P-bit value forwarded to the quantization of the second endpoint</param>
  /// <param name="indexBitCount">Unused</param>
  /// <param name="indexCount">Unused</param>
  /// <param name="colorDepth">Color depth forwarded to the endpoint quantization</param>
  /// <param name="alphaDepth">Alpha depth forwarded to the endpoint quantization</param>
  /// <param name="alphaMask">Bits that are forced to 1 on the values and on the palette</param>
  /// <returns>Sum of the smallest error found for each value</returns>
  private static unsafe int Select2BitIndicesSse41(
      ReadOnlySpan<uint> values,
      uint endPoint0,
      uint endPoint1,
      int pBit0,
      int pBit1,
      int indexBitCount,
      int indexCount,
      int colorDepth,
      int alphaDepth,
      uint alphaMask)
  {
      // With an alpha depth of 0 the palette alpha is additionally forced to 255.
      uint paletteAlphaMask = alphaDepth == 0
          ? alphaMask | new RgbaColor8(0, 0, 0, 255).ToUInt32()
          : alphaMask;

      RgbaColor8 low = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
      RgbaColor8 high = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);

      // Interleave the endpoints channel by channel so that a multiply-add of adjacent bytes
      // with (inverse weight, weight) pairs yields the weighted sum of each channel.
      Vector128<byte> lowRep = Vector128.Create(low.ToUInt32() | paletteAlphaMask).AsByte();
      Vector128<byte> highRep = Vector128.Create(high.ToUInt32() | paletteAlphaMask).AsByte();
      Vector128<byte> endPointPairs = Sse2.UnpackLow(lowRep, highRep);

      Vector128<byte> weights;
      Vector128<byte> inverseWeights;

      // 4 bytes are loaded from each of the two tables.
      fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0])
      {
          weights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte();
          inverseWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte();
      }

      // Broadcast every (inverse weight, weight) pair to the 4 channels of its palette entry.
      Vector128<short> weightPairs = Sse2.UnpackLow(inverseWeights, weights).AsInt16();
      Vector128<short> doubledPairs = Sse2.UnpackLow(weightPairs, weightPairs);
      Vector128<sbyte> weightsFor01 = Sse2.UnpackLow(doubledPairs, doubledPairs).AsSByte();
      Vector128<sbyte> weightsFor23 = Sse2.UnpackHigh(doubledPairs, doubledPairs).AsSByte();

      // Each vector holds two palette entries, 4 16-bit channels each.
      Vector128<short> palette01 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsFor01));
      Vector128<short> palette23 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsFor23));

      int totalError = 0;

      foreach (uint value in values)
      {
          // Widen the color to 16 bits per channel, twice, to line up with the two entries of each palette vector.
          Vector128<short> pixel = Sse41.ConvertToVector128Int16(Vector128.Create(value | alphaMask).AsByte());

          Vector128<short> diff01 = Sse2.Subtract(pixel, palette01);
          Vector128<short> diff23 = Sse2.Subtract(pixel, palette23);

          Vector128<int> squares01 = Sse2.MultiplyAddAdjacent(diff01, diff01);
          Vector128<int> squares23 = Sse2.MultiplyAddAdjacent(diff23, diff23);

          // One squared distance per palette entry.
          Vector128<int> distances = Ssse3.HorizontalAdd(squares01, squares23);

          Vector128<ushort> packed = Sse41.PackUnsignedSaturate(distances, distances);

          // Element 0 of the horizontal minimum is the smallest distance.
          totalError += Sse41.MinHorizontal(packed).GetElement(0);
      }

      return totalError;
  }
  /// <summary>
  /// Vectorized error computation for an 8 entry palette (3-bit indices).
  /// </summary>
  /// <remarks>
  /// The error of a value is the smallest sum of squared channel differences against the palette,
  /// saturated to 16 bits. <paramref name="indexBitCount"/> and <paramref name="indexCount"/> are
  /// not used by this implementation.
  /// </remarks>
  /// <param name="values">Packed 32-bit colors to be matched against the palette</param>
  /// <param name="endPoint0">First endpoint, as a packed 32-bit color</param>
  /// <param name="endPoint1">Second endpoint, as a packed 32-bit color</param>
  /// <param name="pBit0">P-bit value forwarded to the quantization of the first endpoint</param>
  /// <param name="pBit1">P-bit value forwarded to the quantization of the second endpoint</param>
  /// <param name="indexBitCount">Unused</param>
  /// <param name="indexCount">Unused</param>
  /// <param name="colorDepth">Color depth forwarded to the endpoint quantization</param>
  /// <param name="alphaDepth">Alpha depth forwarded to the endpoint quantization</param>
  /// <param name="alphaMask">Bits that are forced to 1 on the values and on the palette</param>
  /// <returns>Sum of the smallest error found for each value</returns>
  private static unsafe int Select3BitIndicesSse41(
      ReadOnlySpan<uint> values,
      uint endPoint0,
      uint endPoint1,
      int pBit0,
      int pBit1,
      int indexBitCount,
      int indexCount,
      int colorDepth,
      int alphaDepth,
      uint alphaMask)
  {
      // With an alpha depth of 0 the palette alpha is additionally forced to 255.
      uint paletteAlphaMask = alphaDepth == 0
          ? alphaMask | new RgbaColor8(0, 0, 0, 255).ToUInt32()
          : alphaMask;

      RgbaColor8 low = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
      RgbaColor8 high = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);

      // Interleave the endpoints channel by channel so that a multiply-add of adjacent bytes
      // with (inverse weight, weight) pairs yields the weighted sum of each channel.
      Vector128<byte> lowRep = Vector128.Create(low.ToUInt32() | paletteAlphaMask).AsByte();
      Vector128<byte> highRep = Vector128.Create(high.ToUInt32() | paletteAlphaMask).AsByte();
      Vector128<byte> endPointPairs = Sse2.UnpackLow(lowRep, highRep);

      Vector128<byte> weights;
      Vector128<byte> inverseWeights;

      // 8 bytes are loaded from each of the two tables.
      fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
      {
          weights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
          inverseWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
      }

      // Broadcast every (inverse weight, weight) pair to the 4 channels of its palette entry.
      Vector128<short> weightPairs = Sse2.UnpackLow(inverseWeights, weights).AsInt16();
      Vector128<short> doubled0To3 = Sse2.UnpackLow(weightPairs, weightPairs);
      Vector128<short> doubled4To7 = Sse2.UnpackHigh(weightPairs, weightPairs);

      Vector128<sbyte> weightsFor01 = Sse2.UnpackLow(doubled0To3, doubled0To3).AsSByte();
      Vector128<sbyte> weightsFor23 = Sse2.UnpackHigh(doubled0To3, doubled0To3).AsSByte();
      Vector128<sbyte> weightsFor45 = Sse2.UnpackLow(doubled4To7, doubled4To7).AsSByte();
      Vector128<sbyte> weightsFor67 = Sse2.UnpackHigh(doubled4To7, doubled4To7).AsSByte();

      // Each vector holds two palette entries, 4 16-bit channels each.
      Vector128<short> palette01 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsFor01));
      Vector128<short> palette23 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsFor23));
      Vector128<short> palette45 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsFor45));
      Vector128<short> palette67 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsFor67));

      int totalError = 0;

      foreach (uint value in values)
      {
          // Widen the color to 16 bits per channel, twice, to line up with the two entries of each palette vector.
          Vector128<short> pixel = Sse41.ConvertToVector128Int16(Vector128.Create(value | alphaMask).AsByte());

          Vector128<short> diff01 = Sse2.Subtract(pixel, palette01);
          Vector128<short> diff23 = Sse2.Subtract(pixel, palette23);
          Vector128<short> diff45 = Sse2.Subtract(pixel, palette45);
          Vector128<short> diff67 = Sse2.Subtract(pixel, palette67);

          Vector128<int> squares01 = Sse2.MultiplyAddAdjacent(diff01, diff01);
          Vector128<int> squares23 = Sse2.MultiplyAddAdjacent(diff23, diff23);
          Vector128<int> squares45 = Sse2.MultiplyAddAdjacent(diff45, diff45);
          Vector128<int> squares67 = Sse2.MultiplyAddAdjacent(diff67, diff67);

          // One squared distance per palette entry, entries 0-3 and 4-7.
          Vector128<int> distances0To3 = Ssse3.HorizontalAdd(squares01, squares23);
          Vector128<int> distances4To7 = Ssse3.HorizontalAdd(squares45, squares67);

          Vector128<ushort> packed = Sse41.PackUnsignedSaturate(distances0To3, distances4To7);

          // Element 0 of the horizontal minimum is the smallest distance.
          totalError += Sse41.MinHorizontal(packed).GetElement(0);
      }

      return totalError;
  }
  /// <summary>
  /// Vectorized error computation for a 16 entry palette (4-bit indices).
  /// </summary>
  /// <remarks>
  /// The error of a value is the smallest sum of squared channel differences against the palette,
  /// saturated to 16 bits. <paramref name="indexBitCount"/> and <paramref name="indexCount"/> are
  /// not used by this implementation.
  /// </remarks>
  /// <param name="values">Packed 32-bit colors to be matched against the palette</param>
  /// <param name="endPoint0">First endpoint, as a packed 32-bit color</param>
  /// <param name="endPoint1">Second endpoint, as a packed 32-bit color</param>
  /// <param name="pBit0">P-bit value forwarded to the quantization of the first endpoint</param>
  /// <param name="pBit1">P-bit value forwarded to the quantization of the second endpoint</param>
  /// <param name="indexBitCount">Unused</param>
  /// <param name="indexCount">Unused</param>
  /// <param name="colorDepth">Color depth forwarded to the endpoint quantization</param>
  /// <param name="alphaDepth">Alpha depth forwarded to the endpoint quantization</param>
  /// <param name="alphaMask">Bits that are forced to 1 on the values and on the palette</param>
  /// <returns>Sum of the smallest error found for each value</returns>
  private static unsafe int Select4BitIndicesOneSubsetSse41(
      ReadOnlySpan<uint> values,
      uint endPoint0,
      uint endPoint1,
      int pBit0,
      int pBit1,
      int indexBitCount,
      int indexCount,
      int colorDepth,
      int alphaDepth,
      uint alphaMask)
  {
      // With an alpha depth of 0 the palette alpha is additionally forced to 255.
      uint paletteAlphaMask = alphaDepth == 0
          ? alphaMask | new RgbaColor8(0, 0, 0, 255).ToUInt32()
          : alphaMask;

      RgbaColor8 low = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
      RgbaColor8 high = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);

      // Interleave the endpoints channel by channel so that a multiply-add of adjacent bytes
      // with (inverse weight, weight) pairs yields the weighted sum of each channel.
      Vector128<byte> lowRep = Vector128.Create(low.ToUInt32() | paletteAlphaMask).AsByte();
      Vector128<byte> highRep = Vector128.Create(high.ToUInt32() | paletteAlphaMask).AsByte();
      Vector128<byte> endPointPairs = Sse2.UnpackLow(lowRep, highRep);

      Vector128<byte> weights;
      Vector128<byte> inverseWeights;

      // 16 bytes are loaded from each of the two tables.
      fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2])
      {
          weights = Sse2.LoadVector128(pWeights);
          inverseWeights = Sse2.LoadVector128(pInvWeights);
      }

      // Broadcast every (inverse weight, weight) pair to the 4 channels of its palette entry.
      Vector128<short> pairs0To7 = Sse2.UnpackLow(inverseWeights, weights).AsInt16();
      Vector128<short> pairs8To15 = Sse2.UnpackHigh(inverseWeights, weights).AsInt16();

      Vector128<short> doubled0To3 = Sse2.UnpackLow(pairs0To7, pairs0To7);
      Vector128<short> doubled4To7 = Sse2.UnpackHigh(pairs0To7, pairs0To7);
      Vector128<short> doubled8To11 = Sse2.UnpackLow(pairs8To15, pairs8To15);
      Vector128<short> doubled12To15 = Sse2.UnpackHigh(pairs8To15, pairs8To15);

      Vector128<sbyte> weightsA = Sse2.UnpackLow(doubled0To3, doubled0To3).AsSByte();
      Vector128<sbyte> weightsB = Sse2.UnpackHigh(doubled0To3, doubled0To3).AsSByte();
      Vector128<sbyte> weightsC = Sse2.UnpackLow(doubled4To7, doubled4To7).AsSByte();
      Vector128<sbyte> weightsD = Sse2.UnpackHigh(doubled4To7, doubled4To7).AsSByte();
      Vector128<sbyte> weightsE = Sse2.UnpackLow(doubled8To11, doubled8To11).AsSByte();
      Vector128<sbyte> weightsF = Sse2.UnpackHigh(doubled8To11, doubled8To11).AsSByte();
      Vector128<sbyte> weightsG = Sse2.UnpackLow(doubled12To15, doubled12To15).AsSByte();
      Vector128<sbyte> weightsH = Sse2.UnpackHigh(doubled12To15, doubled12To15).AsSByte();

      // Each vector holds two consecutive palette entries, 4 16-bit channels each.
      Vector128<short> paletteA = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsA));
      Vector128<short> paletteB = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsB));
      Vector128<short> paletteC = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsC));
      Vector128<short> paletteD = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsD));
      Vector128<short> paletteE = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsE));
      Vector128<short> paletteF = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsF));
      Vector128<short> paletteG = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsG));
      Vector128<short> paletteH = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, weightsH));

      int totalError = 0;

      foreach (uint value in values)
      {
          // Widen the color to 16 bits per channel, twice, to line up with the two entries of each palette vector.
          Vector128<short> pixel = Sse41.ConvertToVector128Int16(Vector128.Create(value | alphaMask).AsByte());

          Vector128<short> diffA = Sse2.Subtract(pixel, paletteA);
          Vector128<short> diffB = Sse2.Subtract(pixel, paletteB);
          Vector128<short> diffC = Sse2.Subtract(pixel, paletteC);
          Vector128<short> diffD = Sse2.Subtract(pixel, paletteD);
          Vector128<short> diffE = Sse2.Subtract(pixel, paletteE);
          Vector128<short> diffF = Sse2.Subtract(pixel, paletteF);
          Vector128<short> diffG = Sse2.Subtract(pixel, paletteG);
          Vector128<short> diffH = Sse2.Subtract(pixel, paletteH);

          Vector128<int> squaresA = Sse2.MultiplyAddAdjacent(diffA, diffA);
          Vector128<int> squaresB = Sse2.MultiplyAddAdjacent(diffB, diffB);
          Vector128<int> squaresC = Sse2.MultiplyAddAdjacent(diffC, diffC);
          Vector128<int> squaresD = Sse2.MultiplyAddAdjacent(diffD, diffD);
          Vector128<int> squaresE = Sse2.MultiplyAddAdjacent(diffE, diffE);
          Vector128<int> squaresF = Sse2.MultiplyAddAdjacent(diffF, diffF);
          Vector128<int> squaresG = Sse2.MultiplyAddAdjacent(diffG, diffG);
          Vector128<int> squaresH = Sse2.MultiplyAddAdjacent(diffH, diffH);

          // One squared distance per palette entry, 4 entries per vector.
          Vector128<int> distancesAB = Ssse3.HorizontalAdd(squaresA, squaresB);
          Vector128<int> distancesCD = Ssse3.HorizontalAdd(squaresC, squaresD);
          Vector128<int> distancesEF = Ssse3.HorizontalAdd(squaresE, squaresF);
          Vector128<int> distancesGH = Ssse3.HorizontalAdd(squaresG, squaresH);

          Vector128<ushort> lowerHalf = Sse41.PackUnsignedSaturate(distancesAB, distancesCD);
          Vector128<ushort> upperHalf = Sse41.PackUnsignedSaturate(distancesEF, distancesGH);

          // Element 0 of each horizontal minimum is the smallest distance of that half of the palette.
          ushort bestLower = Sse41.MinHorizontal(lowerHalf).GetElement(0);
          ushort bestUpper = Sse41.MinHorizontal(upperHalf).GetElement(0);

          totalError += Math.Min(bestLower, bestUpper);
      }

      return totalError;
  }
  /// <summary>
  /// Scalar error computation: builds the palette interpolated between the two quantized
  /// endpoints and sums, for every value, the smallest squared difference against the palette.
  /// </summary>
  /// <param name="values">Packed 32-bit colors to be matched against the palette</param>
  /// <param name="endPoint0">First endpoint, as a packed 32-bit color</param>
  /// <param name="endPoint1">Second endpoint, as a packed 32-bit color</param>
  /// <param name="pBit0">P-bit value forwarded to the quantization of the first endpoint</param>
  /// <param name="pBit1">P-bit value forwarded to the quantization of the second endpoint</param>
  /// <param name="indexBitCount">Number of bits of each palette index, forwarded to the interpolation</param>
  /// <param name="indexCount">Number of palette entries</param>
  /// <param name="colorDepth">Color depth forwarded to the endpoint quantization</param>
  /// <param name="alphaDepth">Alpha depth forwarded to the endpoint quantization</param>
  /// <param name="alphaMask">Bits that are forced to 1 on the values and on the palette endpoints</param>
  /// <returns>Sum of the smallest error found for each value</returns>
  private static int SelectIndicesFallback(
      ReadOnlySpan<uint> values,
      uint endPoint0,
      uint endPoint1,
      int pBit0,
      int pBit1,
      int indexBitCount,
      int indexCount,
      int colorDepth,
      int alphaDepth,
      uint alphaMask)
  {
      int errorSum = 0;

      // With an alpha depth of 0 the palette alpha is additionally forced to 255.
      uint alphaMaskForPalette = alphaMask;
      if (alphaDepth == 0)
      {
          alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
      }

      Span<uint> palette = stackalloc uint[indexCount];

      RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
      RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);

      // Apply the mask on the packed representation of the endpoints, in place.
      Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette;
      Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette;

      // The endpoints are the first and last entries, everything in between is interpolated.
      palette[0] = c0.ToUInt32();
      palette[indexCount - 1] = c1.ToUInt32();

      for (int j = 1; j < indexCount - 1; j++)
      {
          palette[j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32();
      }

      for (int i = 0; i < values.Length; i++)
      {
          // The value does not change while scanning the palette, so it is converted only once.
          var pixel = RgbaColor8.FromUInt32(values[i] | alphaMask).GetColor32();

          // Only the score of the best entry is needed here, its position is not reported.
          int bestMatchScore = int.MaxValue;

          for (int j = 0; j < indexCount; j++)
          {
              int score = SquaredDifference(pixel, RgbaColor8.FromUInt32(palette[j]).GetColor32());

              if (score < bestMatchScore)
              {
                  bestMatchScore = score;
              }
          }

          errorSum += bestMatchScore;
      }

      return errorSum;
  }
  /// <summary>
  /// Selects palette indices for a tile that may be split into several subsets and returns the
  /// total error, dispatching to a vectorized implementation when one is available.
  /// </summary>
  /// <remarks>
  /// When SSE4.1 is available and the index size is 2, 3 or 4 bits, a vectorized implementation
  /// is used; the 4-bit one only handles a single subset. Every other case goes through the
  /// scalar fallback.
  /// </remarks>
  /// <param name="tile">Tile pixels, each one a packed 32-bit color</param>
  /// <param name="w">Tile width in pixels</param>
  /// <param name="h">Tile height in pixels</param>
  /// <param name="endPoints0">First endpoint of each subset</param>
  /// <param name="endPoints1">Second endpoint of each subset</param>
  /// <param name="pBitValues">P-bit values forwarded to the implementation</param>
  /// <param name="indices">Index buffer forwarded to the implementation</param>
  /// <param name="subsetCount">Number of subsets</param>
  /// <param name="partition">Partition number</param>
  /// <param name="indexBitCount">Number of bits of each palette index</param>
  /// <param name="indexCount">Number of palette entries</param>
  /// <param name="colorDepth">Color depth forwarded to the implementation</param>
  /// <param name="alphaDepth">Alpha depth forwarded to the implementation</param>
  /// <param name="pBits">Number of p-bits forwarded to the implementation</param>
  /// <param name="alphaMask">Alpha mask forwarded to the implementation</param>
  /// <returns>The value returned by the selected implementation</returns>
  public static int SelectIndices(
      ReadOnlySpan<uint> tile,
      int w,
      int h,
      ReadOnlySpan<uint> endPoints0,
      ReadOnlySpan<uint> endPoints1,
      ReadOnlySpan<int> pBitValues,
      Span<byte> indices,
      int subsetCount,
      int partition,
      int indexBitCount,
      int indexCount,
      int colorDepth,
      int alphaDepth,
      int pBits,
      uint alphaMask)
  {
      if (Sse41.IsSupported)
      {
          switch (indexBitCount)
          {
              case 2:
                  return Select2BitIndicesSse41(
                      tile, w, h, endPoints0, endPoints1, pBitValues, indices,
                      subsetCount, partition, colorDepth, alphaDepth, pBits, alphaMask);
              case 3:
                  return Select3BitIndicesSse41(
                      tile, w, h, endPoints0, endPoints1, pBitValues, indices,
                      subsetCount, partition, colorDepth, alphaDepth, pBits, alphaMask);
              case 4:
                  // The 4-bit implementation only takes the endpoints of the first subset.
                  Debug.Assert(subsetCount == 1);

                  return Select4BitIndicesOneSubsetSse41(
                      tile, w, h, endPoints0[0], endPoints1[0], pBitValues, indices,
                      partition, colorDepth, alphaDepth, pBits, alphaMask);
          }
      }

      return SelectIndicesFallback(
          tile, w, h, endPoints0, endPoints1, pBitValues, indices,
          subsetCount, partition, indexBitCount, indexCount,
          colorDepth, alphaDepth, pBits, alphaMask);
  }
  /// <summary>
  /// Vectorized selection of 2-bit palette indices for every pixel of a partitioned tile.
  /// </summary>
  /// <remarks>
  /// For each subset a 4 entry palette is built from its quantized endpoints, then every pixel of
  /// that subset gets the index of the closest entry. The error of a pixel is the sum of squared
  /// channel differences against that entry, saturated to 16 bits.
  /// </remarks>
  /// <param name="tile">Tile pixels, each one a packed 32-bit color, stored as <paramref name="h"/> rows of <paramref name="w"/> pixels</param>
  /// <param name="w">Tile width in pixels</param>
  /// <param name="h">Tile height in pixels</param>
  /// <param name="endPoints0">First endpoint of each subset</param>
  /// <param name="endPoints1">Second endpoint of each subset</param>
  /// <param name="pBitValues">P-bit values, one per subset or one per endpoint depending on <paramref name="pBits"/></param>
  /// <param name="indices">Receives the selected index of each pixel at offset y * 4 + x</param>
  /// <param name="subsetCount">Number of subsets</param>
  /// <param name="partition">Partition number, used to pick the partition table</param>
  /// <param name="colorDepth">Color depth forwarded to the endpoint quantization</param>
  /// <param name="alphaDepth">Alpha depth forwarded to the endpoint quantization</param>
  /// <param name="pBits">Number of p-bits; 0 means none, <paramref name="subsetCount"/> means one shared by both endpoints of a subset</param>
  /// <param name="alphaMask">Bits that are forced to 1 on the pixels and on the palette</param>
  /// <returns>Sum of the error of the selected index of every pixel</returns>
  private static unsafe int Select2BitIndicesSse41(
      ReadOnlySpan<uint> tile,
      int w,
      int h,
      ReadOnlySpan<uint> endPoints0,
      ReadOnlySpan<uint> endPoints1,
      ReadOnlySpan<int> pBitValues,
      Span<byte> indices,
      int subsetCount,
      int partition,
      int colorDepth,
      int alphaDepth,
      int pBits,
      uint alphaMask)
  {
      byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];

      // With an alpha depth of 0 the palette alpha is additionally forced to 255.
      uint alphaMaskForPalette = alphaMask;
      if (alphaDepth == 0)
      {
          alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
      }

      // The interpolation weights are the same for every subset, so they are loaded and
      // expanded only once, before the subset loop.
      Vector128<byte> rWeights;
      Vector128<byte> lWeights;

      fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0])
      {
          rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte();
          lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte();
      }

      // Broadcast every (inverse weight, weight) pair to the 4 channels of its palette entry.
      Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
      Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
      Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
      Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();

      int errorSum = 0;

      for (int subset = 0; subset < subsetCount; subset++)
      {
          // -1 is passed to the quantization when no p-bit applies.
          int pBit0 = -1, pBit1 = -1;

          if (pBits == subsetCount)
          {
              pBit0 = pBit1 = pBitValues[subset];
          }
          else if (pBits != 0)
          {
              pBit0 = pBitValues[subset * 2];
              pBit1 = pBitValues[subset * 2 + 1];
          }

          RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
          RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);

          // Interleave the endpoints channel by channel so that a multiply-add of adjacent
          // bytes with the weight pairs yields the weighted sum of each channel.
          Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
          Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
          Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

          // Each vector holds two palette entries, 4 16-bit channels each.
          Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
          Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));

          int i = 0;
          for (int ty = 0; ty < h; ty++)
          {
              for (int tx = 0; tx < w; tx++, i++)
              {
                  // The partition table and the index buffer use a fixed row stride of 4,
                  // while the tile is read sequentially.
                  int tileOffset = ty * 4 + tx;
                  if (partitionTable[tileOffset] != subset)
                  {
                      continue;
                  }

                  uint c = tile[i] | alphaMask;

                  Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());

                  Vector128<short> delta0 = Sse2.Subtract(color, pal0);
                  Vector128<short> delta1 = Sse2.Subtract(color, pal1);

                  Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
                  Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);

                  // One squared distance per palette entry.
                  Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);

                  Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01);

                  Vector128<ushort> min = Sse41.MinHorizontal(delta);

                  // The low 16 bits hold the smallest distance and the next 16 bits its position.
                  uint minPos = min.AsUInt32().GetElement(0);
                  ushort error = (ushort)minPos;
                  uint index = minPos >> 16;

                  indices[tileOffset] = (byte)index;
                  errorSum += error;
              }
          }
      }

      return errorSum;
  }
  /// <summary>
  /// Selects, for every pixel of a tile, the closest entry of an 8 entry (3-bit index) palette
  /// built from the quantized end points of the pixel's subset, using SSE4.1 intrinsics.
  /// </summary>
  /// <param name="tile">Packed pixel colors, stored row by row with <paramref name="w"/> pixels per row</param>
  /// <param name="w">Number of pixels processed per row</param>
  /// <param name="h">Number of rows processed</param>
  /// <param name="endPoints0">First end point of each subset, as a packed color</param>
  /// <param name="endPoints1">Second end point of each subset, as a packed color</param>
  /// <param name="pBitValues">P-bit values, either one per subset or two per subset depending on <paramref name="pBits"/></param>
  /// <param name="indices">Output indices, written at position <c>y * 4 + x</c></param>
  /// <param name="subsetCount">Number of subsets</param>
  /// <param name="partition">Partition number used to look up the subset of each pixel</param>
  /// <param name="colorDepth">Color bit depth forwarded to the end point quantization</param>
  /// <param name="alphaDepth">Alpha bit depth forwarded to the end point quantization, zero forces the palette alpha to 255</param>
  /// <param name="pBits">Number of p-bits: equal to <paramref name="subsetCount"/> for one shared p-bit per subset, any other non-zero value for one p-bit per end point, zero for none</param>
  /// <param name="alphaMask">Mask that is ORed into every pixel and every palette end point</param>
  /// <returns>Sum of the squared error of the selected entry of every pixel, where each per-pixel error is saturated to 65535</returns>
  private static unsafe int Select3BitIndicesSse41(
      ReadOnlySpan<uint> tile,
      int w,
      int h,
      ReadOnlySpan<uint> endPoints0,
      ReadOnlySpan<uint> endPoints1,
      ReadOnlySpan<int> pBitValues,
      Span<byte> indices,
      int subsetCount,
      int partition,
      int colorDepth,
      int alphaDepth,
      int pBits,
      uint alphaMask)
  {
      byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];

      uint alphaMaskForPalette = alphaMask;

      if (alphaDepth == 0)
      {
          alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
      }

      // The interpolation weights do not depend on the subset, so they are loaded and
      // expanded once here instead of on every iteration of the subset loop.
      Vector128<byte> rWeights;
      Vector128<byte> lWeights;

      fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
      {
          // 8 bytes are read from each table, one weight per palette entry.
          rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
          lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
      }

      // Interleave into (inverse weight, weight) byte pairs, then replicate each pair four times
      // (once per color channel) so that each vector holds the weights of two palette entries.
      Vector128<short> iWeights = Sse2.UnpackLow(lWeights, rWeights).AsInt16();
      Vector128<short> iWeights01 = Sse2.UnpackLow(iWeights, iWeights);
      Vector128<short> iWeights23 = Sse2.UnpackHigh(iWeights, iWeights);

      Vector128<sbyte> iWeights0 = Sse2.UnpackLow(iWeights01, iWeights01).AsSByte();
      Vector128<sbyte> iWeights1 = Sse2.UnpackHigh(iWeights01, iWeights01).AsSByte();
      Vector128<sbyte> iWeights2 = Sse2.UnpackLow(iWeights23, iWeights23).AsSByte();
      Vector128<sbyte> iWeights3 = Sse2.UnpackHigh(iWeights23, iWeights23).AsSByte();

      int errorSum = 0;

      for (int subset = 0; subset < subsetCount; subset++)
      {
          int pBit0 = -1, pBit1 = -1;

          if (pBits == subsetCount)
          {
              pBit0 = pBit1 = pBitValues[subset];
          }
          else if (pBits != 0)
          {
              pBit0 = pBitValues[subset * 2];
              pBit1 = pBitValues[subset * 2 + 1];
          }

          RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
          RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);

          // Interleave the channels of both end points so that each adjacent byte pair is
          // (c0 channel, c1 channel), matching the (inverse weight, weight) pair layout.
          Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
          Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();

          Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

          // Each vector holds two interpolated palette colors with 16-bit channels.
          Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0));
          Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1));
          Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2));
          Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3));

          int i = 0;

          for (int ty = 0; ty < h; ty++)
          {
              for (int tx = 0; tx < w; tx++, i++)
              {
                  int tileOffset = ty * 4 + tx;

                  if (partitionTable[tileOffset] != subset)
                  {
                      continue;
                  }

                  uint c = tile[i] | alphaMask;

                  // The pixel is widened to 16-bit channels and duplicated to line up with both colors of each vector.
                  Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());

                  Vector128<short> delta0 = Sse2.Subtract(color, pal0);
                  Vector128<short> delta1 = Sse2.Subtract(color, pal1);
                  Vector128<short> delta2 = Sse2.Subtract(color, pal2);
                  Vector128<short> delta3 = Sse2.Subtract(color, pal3);

                  Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
                  Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
                  Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
                  Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);

                  Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
                  Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);

                  // One saturated 16-bit squared error per palette entry, in palette order.
                  Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);

                  Vector128<ushort> min = Sse41.MinHorizontal(delta);

                  // The low 16 bits hold the minimum error, the next 16 bits hold its position.
                  uint minPos = min.AsUInt32().GetElement(0);
                  ushort error = (ushort)minPos;
                  uint index = minPos >> 16;

                  indices[tileOffset] = (byte)index;
                  errorSum += error;
              }
          }
      }

      return errorSum;
  }
  /// <summary>
  /// Selects, for every pixel of a tile, the closest entry of a 16 entry (4-bit index) palette
  /// built from a single pair of quantized end points, using SSE4.1 intrinsics.
  /// </summary>
  /// <param name="tile">Packed pixel colors, stored row by row with <paramref name="w"/> pixels per row</param>
  /// <param name="w">Number of pixels processed per row</param>
  /// <param name="h">Number of rows processed</param>
  /// <param name="endPoint0">First end point, as a packed color</param>
  /// <param name="endPoint1">Second end point, as a packed color</param>
  /// <param name="pBitValues">P-bit values, the first two elements are read when <paramref name="pBits"/> is not zero</param>
  /// <param name="indices">Output indices, written at position <c>y * 4 + x</c></param>
  /// <param name="partition">Not used by this method</param>
  /// <param name="colorDepth">Color bit depth forwarded to the end point quantization</param>
  /// <param name="alphaDepth">Alpha bit depth forwarded to the end point quantization, zero forces the palette alpha to 255</param>
  /// <param name="pBits">Number of p-bits, zero if the end points have none</param>
  /// <param name="alphaMask">Mask that is ORed into every pixel and both palette end points</param>
  /// <returns>Sum of the squared error of the selected entry of every pixel, where each per-pixel error is saturated to 65535</returns>
  private static unsafe int Select4BitIndicesOneSubsetSse41(
      ReadOnlySpan<uint> tile,
      int w,
      int h,
      uint endPoint0,
      uint endPoint1,
      ReadOnlySpan<int> pBitValues,
      Span<byte> indices,
      int partition,
      int colorDepth,
      int alphaDepth,
      int pBits,
      uint alphaMask)
  {
      uint paletteAlphaMask = alphaDepth == 0
          ? alphaMask | new RgbaColor8(0, 0, 0, 255).ToUInt32()
          : alphaMask;

      bool hasPBits = pBits != 0;
      int firstPBit = hasPBits ? pBitValues[0] : -1;
      int secondPBit = hasPBits ? pBitValues[1] : -1;

      RgbaColor8 quantized0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, firstPBit);
      RgbaColor8 quantized1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, secondPBit);

      // Channels of both end points interleaved as (end point 0, end point 1) byte pairs.
      Vector128<byte> endPointPairs = Sse2.UnpackLow(
          Vector128.Create(quantized0.ToUInt32() | paletteAlphaMask).AsByte(),
          Vector128.Create(quantized1.ToUInt32() | paletteAlphaMask).AsByte());

      Vector128<byte> weights;
      Vector128<byte> inverseWeights;

      fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2])
      {
          // 16 bytes are read from each table, one weight per palette entry.
          weights = Sse2.LoadVector128(pWeights);
          inverseWeights = Sse2.LoadVector128(pInvWeights);
      }

      // (inverse weight, weight) byte pairs, handled as 16-bit lanes while they are replicated.
      Vector128<short> pairsLow = Sse2.UnpackLow(inverseWeights, weights).AsInt16();
      Vector128<short> pairsHigh = Sse2.UnpackHigh(inverseWeights, weights).AsInt16();

      Vector128<short> doubled01 = Sse2.UnpackLow(pairsLow, pairsLow);
      Vector128<short> doubled23 = Sse2.UnpackHigh(pairsLow, pairsLow);
      Vector128<short> doubled45 = Sse2.UnpackLow(pairsHigh, pairsHigh);
      Vector128<short> doubled67 = Sse2.UnpackHigh(pairsHigh, pairsHigh);

      // After the second replication each vector has the weight pairs of two palette entries,
      // repeated once per color channel.
      Vector128<sbyte> channelWeights0 = Sse2.UnpackLow(doubled01, doubled01).AsSByte();
      Vector128<sbyte> channelWeights1 = Sse2.UnpackHigh(doubled01, doubled01).AsSByte();
      Vector128<sbyte> channelWeights2 = Sse2.UnpackLow(doubled23, doubled23).AsSByte();
      Vector128<sbyte> channelWeights3 = Sse2.UnpackHigh(doubled23, doubled23).AsSByte();
      Vector128<sbyte> channelWeights4 = Sse2.UnpackLow(doubled45, doubled45).AsSByte();
      Vector128<sbyte> channelWeights5 = Sse2.UnpackHigh(doubled45, doubled45).AsSByte();
      Vector128<sbyte> channelWeights6 = Sse2.UnpackLow(doubled67, doubled67).AsSByte();
      Vector128<sbyte> channelWeights7 = Sse2.UnpackHigh(doubled67, doubled67).AsSByte();

      // Two interpolated palette colors per vector, with 16-bit channels.
      Vector128<short> palette0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, channelWeights0));
      Vector128<short> palette1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, channelWeights1));
      Vector128<short> palette2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, channelWeights2));
      Vector128<short> palette3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, channelWeights3));
      Vector128<short> palette4 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, channelWeights4));
      Vector128<short> palette5 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, channelWeights5));
      Vector128<short> palette6 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, channelWeights6));
      Vector128<short> palette7 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(endPointPairs, channelWeights7));

      int totalError = 0;
      int pixelIndex = 0;

      for (int y = 0; y < h; y++)
      {
          int rowOffset = y * 4;

          for (int x = 0; x < w; x++)
          {
              uint pixel = tile[pixelIndex++] | alphaMask;

              // The pixel is widened to 16-bit channels and duplicated to line up with both colors of each vector.
              Vector128<short> pixelChannels = Sse41.ConvertToVector128Int16(Vector128.Create(pixel).AsByte());

              Vector128<short> diff0 = Sse2.Subtract(pixelChannels, palette0);
              Vector128<short> diff1 = Sse2.Subtract(pixelChannels, palette1);
              Vector128<short> diff2 = Sse2.Subtract(pixelChannels, palette2);
              Vector128<short> diff3 = Sse2.Subtract(pixelChannels, palette3);
              Vector128<short> diff4 = Sse2.Subtract(pixelChannels, palette4);
              Vector128<short> diff5 = Sse2.Subtract(pixelChannels, palette5);
              Vector128<short> diff6 = Sse2.Subtract(pixelChannels, palette6);
              Vector128<short> diff7 = Sse2.Subtract(pixelChannels, palette7);

              Vector128<int> squares01 = Ssse3.HorizontalAdd(
                  Sse2.MultiplyAddAdjacent(diff0, diff0),
                  Sse2.MultiplyAddAdjacent(diff1, diff1));
              Vector128<int> squares23 = Ssse3.HorizontalAdd(
                  Sse2.MultiplyAddAdjacent(diff2, diff2),
                  Sse2.MultiplyAddAdjacent(diff3, diff3));
              Vector128<int> squares45 = Ssse3.HorizontalAdd(
                  Sse2.MultiplyAddAdjacent(diff4, diff4),
                  Sse2.MultiplyAddAdjacent(diff5, diff5));
              Vector128<int> squares67 = Ssse3.HorizontalAdd(
                  Sse2.MultiplyAddAdjacent(diff6, diff6),
                  Sse2.MultiplyAddAdjacent(diff7, diff7));

              // Saturated 16-bit squared errors of palette entries 0 to 7 and 8 to 15.
              Vector128<ushort> lowerErrors = Sse41.PackUnsignedSaturate(squares01, squares23);
              Vector128<ushort> upperErrors = Sse41.PackUnsignedSaturate(squares45, squares67);

              // For each half, the low 16 bits hold the minimum error and the next 16 bits its position.
              uint lowerMin = Sse41.MinHorizontal(lowerErrors).AsUInt32().GetElement(0);
              uint upperMin = Sse41.MinHorizontal(upperErrors).AsUInt32().GetElement(0);

              ushort lowerError = (ushort)lowerMin;
              ushort upperError = (ushort)upperMin;

              // The lower half wins ties, which keeps the lowest index for equal errors.
              bool useUpperHalf = upperError < lowerError;

              totalError += useUpperHalf ? upperError : lowerError;
              indices[rowOffset + x] = useUpperHalf
                  ? (byte)(8 + (upperMin >> 16))
                  : (byte)(lowerMin >> 16);
          }
      }

      return totalError;
  }
  /// <summary>
  /// Divides every 16-bit lane by 64, rounding to nearest, by adding 32 and shifting right (logically) by 6 bits.
  /// </summary>
  /// <param name="x">Vector with the values to be divided</param>
  /// <returns>Vector with the rounded quotients</returns>
  private static Vector128<short> ShiftRoundToNearest(Vector128<short> x)
  {
      Vector128<short> half = Vector128.Create((short)32);
      Vector128<short> biased = Sse2.Add(x, half);

      return Sse2.ShiftRightLogical(biased, 6);
  }
  /// <summary>
  /// Scalar (non-intrinsic) palette index selection: builds the palette of every subset from its
  /// quantized end points, then picks for each pixel the palette entry with the lowest squared difference.
  /// </summary>
  /// <param name="tile">Packed pixel colors, stored row by row with <paramref name="w"/> pixels per row</param>
  /// <param name="w">Number of pixels processed per row</param>
  /// <param name="h">Number of rows processed</param>
  /// <param name="endPoints0">First end point of each subset, as a packed color</param>
  /// <param name="endPoints1">Second end point of each subset, as a packed color</param>
  /// <param name="pBitValues">P-bit values, either one per subset or two per subset depending on <paramref name="pBits"/></param>
  /// <param name="indices">Output indices, written at position <c>y * 4 + x</c></param>
  /// <param name="subsetCount">Number of subsets</param>
  /// <param name="partition">Partition number used to look up the subset of each pixel</param>
  /// <param name="indexBitCount">Number of bits of each index, forwarded to the interpolation</param>
  /// <param name="indexCount">Number of palette entries of each subset</param>
  /// <param name="colorDepth">Color bit depth forwarded to the end point quantization</param>
  /// <param name="alphaDepth">Alpha bit depth forwarded to the end point quantization, zero forces the end point alpha to 255</param>
  /// <param name="pBits">Number of p-bits: equal to <paramref name="subsetCount"/> for one shared p-bit per subset, any other non-zero value for one p-bit per end point, zero for none</param>
  /// <param name="alphaMask">Mask that is ORed into every pixel and every palette end point</param>
  /// <returns>Sum of the squared difference between every pixel and its selected palette entry</returns>
  private static int SelectIndicesFallback(
      ReadOnlySpan<uint> tile,
      int w,
      int h,
      ReadOnlySpan<uint> endPoints0,
      ReadOnlySpan<uint> endPoints1,
      ReadOnlySpan<int> pBitValues,
      Span<byte> indices,
      int subsetCount,
      int partition,
      int indexBitCount,
      int indexCount,
      int colorDepth,
      int alphaDepth,
      int pBits,
      uint alphaMask)
  {
      int errorSum = 0;

      uint alphaMaskForPalette = alphaMask;

      if (alphaDepth == 0)
      {
          alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
      }

      // Palettes of all subsets are stored back to back, indexCount entries each.
      Span<uint> palette = stackalloc uint[subsetCount * indexCount];

      for (int subset = 0; subset < subsetCount; subset++)
      {
          int palBase = subset * indexCount;

          int pBit0 = -1, pBit1 = -1;

          if (pBits == subsetCount)
          {
              pBit0 = pBit1 = pBitValues[subset];
          }
          else if (pBits != 0)
          {
              pBit0 = pBitValues[subset * 2];
              pBit1 = pBitValues[subset * 2 + 1];
          }

          RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
          RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);

          Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette;
          Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette;

          // The end points are the first and last entries, everything in between is interpolated.
          palette[palBase + 0] = c0.ToUInt32();
          palette[palBase + indexCount - 1] = c1.ToUInt32();

          for (int j = 1; j < indexCount - 1; j++)
          {
              palette[palBase + j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32();
          }
      }

      // The partition table does not change while the tile is processed, so it is looked up once
      // instead of once per pixel, like the SSE4.1 implementations do.
      byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];

      int i = 0;

      for (int ty = 0; ty < h; ty++)
      {
          for (int tx = 0; tx < w; tx++)
          {
              int tileOffset = ty * 4 + tx;
              int palBase = partitionTable[tileOffset] * indexCount;

              // The pixel is converted once, rather than once per palette candidate.
              RgbaColor32 color = RgbaColor8.FromUInt32(tile[i++] | alphaMask).GetColor32();

              int bestMatchScore = int.MaxValue;
              int bestMatchIndex = 0;

              for (int j = 0; j < indexCount; j++)
              {
                  int score = SquaredDifference(color, RgbaColor8.FromUInt32(palette[palBase + j]).GetColor32());

                  // Strict comparison, so the lowest index is kept when scores are equal.
                  if (score < bestMatchScore)
                  {
                      bestMatchScore = score;
                      bestMatchIndex = j;
                  }
              }

              indices[tileOffset] = (byte)bestMatchIndex;
              errorSum += bestMatchScore;
          }
      }

      return errorSum;
  }
  /// <summary>
  /// Calculates the dot product of the difference between two colors with itself.
  /// </summary>
  /// <param name="color1">Color that the other color is subtracted from</param>
  /// <param name="color2">Color that is subtracted</param>
  /// <returns>Dot product of <c>color1 - color2</c> with itself</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static int SquaredDifference(RgbaColor32 color1, RgbaColor32 color2)
  {
      RgbaColor32 difference = color1 - color2;

      return RgbaColor32.Dot(difference, difference);
  }
  /// <summary>
  /// Interpolates two 8-bit per channel colors by widening them, interpolating the wide colors
  /// and narrowing the result again.
  /// </summary>
  /// <param name="color1">Color used with the inverse weight</param>
  /// <param name="color2">Color used with the weight</param>
  /// <param name="weightIndex">Index that the interpolation weight is derived from</param>
  /// <param name="indexBitCount">Number of bits of the index</param>
  /// <returns>The interpolated color</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static RgbaColor8 Interpolate(RgbaColor8 color1, RgbaColor8 color2, int weightIndex, int indexBitCount)
  {
      RgbaColor32 wide1 = color1.GetColor32();
      RgbaColor32 wide2 = color2.GetColor32();

      RgbaColor32 interpolated = Interpolate(wide1, wide2, weightIndex, indexBitCount);

      return interpolated.GetColor8();
  }
  /// <summary>
  /// Interpolates two colors with a weight in the 0 to 64 range that is calculated from the index,
  /// rounding the result to nearest.
  /// </summary>
  /// <param name="color1">Color that is multiplied by <c>64 - weight</c></param>
  /// <param name="color2">Color that is multiplied by the weight</param>
  /// <param name="weightIndex">Index that the weight is calculated from</param>
  /// <param name="indexBitCount">Number of bits of the index, asserted to be between 2 and 4</param>
  /// <returns>The interpolated color</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static RgbaColor32 Interpolate(RgbaColor32 color1, RgbaColor32 color2, int weightIndex, int indexBitCount)
  {
      Debug.Assert(indexBitCount >= 2 && indexBitCount <= 4);

      // weightIndex * 128 / maxIndex is twice the 0 to 64 weight, the add and shift halve it rounding to nearest.
      int maxIndex = (1 << indexBitCount) - 1;
      int doubledWeight = (weightIndex << 7) / maxIndex;
      int weight = (doubledWeight + 1) >> 1;

      RgbaColor32 weighted1 = color1 * new RgbaColor32(64 - weight);
      RgbaColor32 weighted2 = color2 * new RgbaColor32(weight);
      RgbaColor32 rounding = new RgbaColor32(32);

      return (weighted1 + weighted2 + rounding) >> 6;
  }
  /// <summary>
  /// Interpolates two colors using weights from the weight tables, with the alpha channel
  /// having its own weight index and index size.
  /// </summary>
  /// <param name="color1">Color that is multiplied by <c>64 - weight</c></param>
  /// <param name="color2">Color that is multiplied by the weight</param>
  /// <param name="colorWeightIndex">Index of the weight of the color channels</param>
  /// <param name="alphaWeightIndex">Index of the weight of the alpha channel</param>
  /// <param name="colorIndexBitCount">Number of bits of the color index, asserted to be between 2 and 4</param>
  /// <param name="alphaIndexBitCount">Number of bits of the alpha index, asserted to be between 2 and 4</param>
  /// <returns>The interpolated color</returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  public static RgbaColor32 Interpolate(
      RgbaColor32 color1,
      RgbaColor32 color2,
      int colorWeightIndex,
      int alphaWeightIndex,
      int colorIndexBitCount,
      int alphaIndexBitCount)
  {
      Debug.Assert(colorIndexBitCount >= 2 && colorIndexBitCount <= 4);
      Debug.Assert(alphaIndexBitCount >= 2 && alphaIndexBitCount <= 4);

      // The weight tables are ordered by index size, starting at 2 bits.
      int rgbWeight = BC67Tables.Weights[colorIndexBitCount - 2][colorWeightIndex];
      int aWeight = BC67Tables.Weights[alphaIndexBitCount - 2][alphaWeightIndex];

      // All channels start with the color weight, then alpha gets its own.
      RgbaColor32 weights = new RgbaColor32(rgbWeight);
      weights.A = aWeight;

      RgbaColor32 inverseWeights = new RgbaColor32(64) - weights;
      RgbaColor32 rounding = new RgbaColor32(32);

      RgbaColor32 weighted1 = color1 * inverseWeights;
      RgbaColor32 weighted2 = color2 * weights;

      return (weighted1 + weighted2 + rounding) >> 6;
  }
  /// <summary>
  /// Quantizes a color to the specified bit depths and expands it back to 8 bits per channel.
  /// </summary>
  /// <remarks>
  /// When <paramref name="alphaBits"/> is zero, the three color channels are processed together
  /// using the quantization look-up tables and alpha is passed through unchanged.
  /// Otherwise, the work is delegated to <see cref="QuantizeFallback"/>.
  /// </remarks>
  /// <param name="color">Color to be quantized</param>
  /// <param name="colorBits">Number of bits of the quantized color channels, the look-up tables are indexed by <c>colorBits - 4</c></param>
  /// <param name="alphaBits">Number of bits of the quantized alpha channel, zero if alpha is not quantized</param>
  /// <param name="pBit">P-bit value (0 or 1), or a negative value if there is no p-bit</param>
  /// <returns>The color after being quantized and expanded back</returns>
  public static RgbaColor8 Quantize(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1)
  {
      if (alphaBits != 0)
      {
          return QuantizeFallback(color, colorBits, alphaBits, pBit);
      }

      int shift = 8 - colorBits;

      uint packed;

      if (pBit < 0)
      {
          byte[] lut = _quantizationLutNoPBit[colorBits - 4];

          // Selects, on each of the 3 color bytes, the low bits that were left empty by the shift.
          uint lowBitsMask = (0xffu >> colorBits) * 0x10101;

          packed = lut[color.R] | ((uint)lut[color.G] << 8) | ((uint)lut[color.B] << 16);

          // Moves the quantized values to the top of each byte, then fills the low bits with a copy of the top bits.
          packed <<= shift;
          packed |= (packed >> colorBits) & lowBitsMask;
      }
      else
      {
          byte[] lut = _quantizationLut[colorBits - 4];

          Debug.Assert(pBit <= 1);

          // The second half of the table (from index 256) holds the values for a p-bit of 1.
          int pBitOffset = pBit << 8;

          // One bit less than the case without p-bit, as the p-bit takes the bit right below the quantized value.
          uint lowBitsMask = (0xffu >> (colorBits + 1)) * 0x10101;

          packed = lut[color.R | pBitOffset] | ((uint)lut[color.G | pBitOffset] << 8) | ((uint)lut[color.B | pBitOffset] << 16);

          packed <<= shift;
          packed |= (packed >> (colorBits + 1)) & lowBitsMask;
          packed |= ((uint)pBit * 0x10101) << (shift - 1);
      }

      packed |= (uint)color.A << 24;

      return RgbaColor8.FromUInt32(packed);
  }
  /// <summary>
  /// Quantizes a color to the specified bit depths and expands it back to 8 bits per channel,
  /// one channel at a time.
  /// </summary>
  /// <param name="color">Color to be quantized</param>
  /// <param name="colorBits">Number of bits of the quantized color channels</param>
  /// <param name="alphaBits">Number of bits of the quantized alpha channel, zero to keep alpha unchanged</param>
  /// <param name="pBit">P-bit value, or a negative value if there is no p-bit</param>
  /// <returns>The color after being quantized and expanded back</returns>
  private static RgbaColor8 QuantizeFallback(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1)
  {
      // Quantizes a single channel and immediately expands it back.
      byte RoundTrip(byte channel, int bits)
      {
          byte quantized = QuantizeComponent(channel, bits, pBit);

          return UnquantizeComponent(quantized, bits, pBit);
      }

      byte red = RoundTrip(color.R, colorBits);
      byte green = RoundTrip(color.G, colorBits);
      byte blue = RoundTrip(color.B, colorBits);
      byte alpha = color.A;

      if (alphaBits != 0)
      {
          alpha = RoundTrip(alpha, alphaBits);
      }

      return new RgbaColor8(red, green, blue, alpha);
  }
  /// <summary>
  /// Gets the quantized value of a color component from the quantization look-up tables.
  /// </summary>
  /// <param name="component">Component value to be quantized</param>
  /// <param name="bits">Number of bits of the quantized value, the tables are indexed by <c>bits - 4</c></param>
  /// <param name="pBit">P-bit value, or a negative value if there is no p-bit</param>
  /// <returns>The quantized component value</returns>
  public static byte QuantizeComponent(byte component, int bits, int pBit = -1)
  {
      int tableIndex = bits - 4;

      if (pBit < 0)
      {
          return _quantizationLutNoPBit[tableIndex][component];
      }

      // The p-bit selects one of the 256 entry halves of the table.
      return _quantizationLut[tableIndex][component | (pBit << 8)];
  }
  /// <summary>
  /// Quantizes a color component by comparing the truncated value and its two neighbours
  /// (clamped to the valid range), and returning the candidate with the lowest difference
  /// to the original component once expanded back.
  /// </summary>
  /// <remarks>
  /// NOTE(review): going by the name, this is presumably what fills the quantization look-up tables; confirm against the table initialization.
  /// </remarks>
  /// <param name="component">Component value to be quantized</param>
  /// <param name="bits">Number of bits of the quantized value</param>
  /// <param name="pBit">P-bit value (0 or 1), or a negative value if there is no p-bit</param>
  /// <returns>The quantized component value</returns>
  private static byte QuantizeComponentForLut(byte component, int bits, int pBit = -1)
  {
      int shift = 8 - bits;
      int maxQuantized = (1 << bits) - 1;

      // Low bits of the expanded value, taken from the top bits of the component itself for all candidates.
      int lowBits = component >> bits;

      if (pBit >= 0)
      {
          Debug.Assert(pBit <= 1);

          // The p-bit takes the highest of the low bits, the rest moves one bit down.
          lowBits = (lowBits >> 1) | (pBit << (shift - 1));
      }

      int truncated = component >> shift;
      int below = Math.Max(truncated - 1, 0);
      int above = Math.Min(truncated + 1, maxQuantized);

      // Only the first difference is made absolute, the other two are signed.
      int truncatedDelta = FastAbs(((truncated << shift) | lowBits) - component);
      int belowDelta = component - ((below << shift) | lowBits);
      int aboveDelta = ((above << shift) | lowBits) - component;

      if (truncatedDelta < belowDelta && truncatedDelta < aboveDelta)
      {
          return (byte)truncated;
      }

      return belowDelta < aboveDelta ? (byte)below : (byte)above;
  }
  /// <summary>
  /// Calculates the absolute value of an integer without branching.
  /// </summary>
  /// <param name="x">Input value</param>
  /// <returns>The absolute value of <paramref name="x"/></returns>
  [MethodImpl(MethodImplOptions.AggressiveInlining)]
  private static int FastAbs(int x)
  {
      // All ones for negative values, zero otherwise.
      int signMask = x >> 31;

      // Flipping all bits and adding one negates the value, a zero mask leaves it untouched.
      return (x ^ signMask) - signMask;
  }
  /// <summary>
  /// Expands a quantized color component back to 8 bits, filling the low bits
  /// with the p-bit (if any) followed by a copy of the top bits of the value.
  /// </summary>
  /// <param name="component">Quantized component value</param>
  /// <param name="bits">Number of bits of the quantized value</param>
  /// <param name="pBit">P-bit value (0 or 1), or a negative value if there is no p-bit</param>
  /// <returns>The low 8 bits of the expanded value</returns>
  private static byte UnquantizeComponent(byte component, int bits, int pBit)
  {
      int shift = 8 - bits;
      int expanded = component << shift;

      if (pBit < 0)
      {
          return (byte)(expanded | (expanded >> bits));
      }

      Debug.Assert(pBit <= 1);

      // The p-bit goes right below the quantized value, so the copy of the top bits starts one bit lower.
      int replicated = expanded | (expanded >> (bits + 1));

      return (byte)(replicated | (pBit << (shift - 1)));
  }
  1090. }
  1091. }