FastChaCha7539EngineHelper.cs

#if !BESTHTTP_DISABLE_ALTERNATE_SSL && (!UNITY_WEBGL || UNITY_EDITOR) && BESTHTTP_WITH_BURST && (NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER || UNITY_2021_2_OR_NEWER)
using System;
using System.Runtime.CompilerServices;

using Unity.Burst;
using Unity.Burst.Intrinsics;
using static Unity.Burst.Intrinsics.X86;
using static Unity.Burst.Intrinsics.Arm;

// https://github.com/sschoener/burst-simd-exercises/blob/main/Assets/Examples/2-sum-small-numbers-sse3/SumSmallNumbers_SSE3.cs
// https://github.com/jratcliff63367/sse2neon/blob/master/SSE2NEON.h#L789
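// Burst-compiled SIMD implementation of the ChaCha20 (RFC 7539) block function: each
// ProcessBlocks2 call encrypts/decrypts two consecutive 64-byte blocks, dispatching to
// an AVX2, SSE2, NEON or scalar path.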
namespace Best.HTTP.Shared.TLS.Crypto.Impl
{
    [BurstCompile]
    public static unsafe class FastChaCha7539EngineHelper
    {
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static void ProcessBlocks2(ReadOnlySpan<byte> input, Span<byte> output, uint[] state, int rounds, byte[] keyStream)
        {
            // Pin the managed buffers so the Burst-compiled core can work on raw pointers.
            fixed (byte* pinput = input)
            fixed (byte* poutput = output)
            fixed (uint* pstate = state)
            fixed (byte* pkeyStream = keyStream)
                ProcessBlocks2Impl(pinput, input.Length, poutput, output.Length, pstate, state.Length, rounds, pkeyStream);
        }
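
        // Under Burst, IsAvx2Supported/IsSse2Supported/IsNeonSupported are compile-time
        // constants for the build target, so only the matching branch below survives
        // compilation; when the method runs as plain managed code they are all false and
        // the scalar fallback is used.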
        [BurstCompile]
        private static void ProcessBlocks2Impl([NoAlias] byte* input, int inputLen, [NoAlias] byte* output, int outLen, [NoAlias] uint* state, int stateLen, int rounds, [NoAlias] byte* keyStream)
        {
            if (Avx2.IsAvx2Supported)
            {
                var t0 = new v128(state[0], state[1], state[2], state[3]);     // Load128_UInt32(state.AsSpan());
                var t1 = new v128(state[4], state[5], state[6], state[7]);     // Load128_UInt32(state.AsSpan(4));
                var t2 = new v128(state[8], state[9], state[10], state[11]);   // Load128_UInt32(state.AsSpan(8));
                var t3 = new v128(state[12], state[13], state[14], state[15]); // Load128_UInt32(state.AsSpan(12));
                ++state[12]; // advance the block counter for the second block
                var t4 = new v128(state[12], state[13], state[14], state[15]); // Load128_UInt32(state.AsSpan(12));
                ++state[12]; // leave the counter past the two blocks processed here

                // Both blocks are processed at once: the low 128-bit lane of each row holds
                // block 0, the high lane holds block 1 (they differ only in the counter word).
                var x0 = new v256(t0, t0); // Vector256.Create(t0, t0);
                var x1 = new v256(t1, t1); // Vector256.Create(t1, t1);
                var x2 = new v256(t2, t2); // Vector256.Create(t2, t2);
                var x3 = new v256(t3, t4); // Vector256.Create(t3, t4);

                var v0 = x0;
                var v1 = x1;
                var v2 = x2;
                var v3 = x3;
                for (int i = rounds; i > 0; i -= 2)
                {
                    // Column round.
                    v0 = Avx2.mm256_add_epi32(v0, v1);
                    v3 = Avx2.mm256_xor_si256(v3, v0);
                    v3 = Avx2.mm256_xor_si256(Avx2.mm256_slli_epi32(v3, 16), Avx2.mm256_srli_epi32(v3, 16));
                    v2 = Avx2.mm256_add_epi32(v2, v3);
                    v1 = Avx2.mm256_xor_si256(v1, v2);
                    v1 = Avx2.mm256_xor_si256(Avx2.mm256_slli_epi32(v1, 12), Avx2.mm256_srli_epi32(v1, 20));
                    v0 = Avx2.mm256_add_epi32(v0, v1);
                    v3 = Avx2.mm256_xor_si256(v3, v0);
                    v3 = Avx2.mm256_xor_si256(Avx2.mm256_slli_epi32(v3, 8), Avx2.mm256_srli_epi32(v3, 24));
                    v2 = Avx2.mm256_add_epi32(v2, v3);
                    v1 = Avx2.mm256_xor_si256(v1, v2);
                    v1 = Avx2.mm256_xor_si256(Avx2.mm256_slli_epi32(v1, 7), Avx2.mm256_srli_epi32(v1, 25));

                    // Rotate rows 1-3 so the diagonals line up in columns...
                    v1 = Avx2.mm256_shuffle_epi32(v1, 0x39);
                    v2 = Avx2.mm256_shuffle_epi32(v2, 0x4E);
                    v3 = Avx2.mm256_shuffle_epi32(v3, 0x93);

                    // ...then run the diagonal round.
                    v0 = Avx2.mm256_add_epi32(v0, v1);
                    v3 = Avx2.mm256_xor_si256(v3, v0);
                    v3 = Avx2.mm256_xor_si256(Avx2.mm256_slli_epi32(v3, 16), Avx2.mm256_srli_epi32(v3, 16));
                    v2 = Avx2.mm256_add_epi32(v2, v3);
                    v1 = Avx2.mm256_xor_si256(v1, v2);
                    v1 = Avx2.mm256_xor_si256(Avx2.mm256_slli_epi32(v1, 12), Avx2.mm256_srli_epi32(v1, 20));
                    v0 = Avx2.mm256_add_epi32(v0, v1);
                    v3 = Avx2.mm256_xor_si256(v3, v0);
                    v3 = Avx2.mm256_xor_si256(Avx2.mm256_slli_epi32(v3, 8), Avx2.mm256_srli_epi32(v3, 24));
                    v2 = Avx2.mm256_add_epi32(v2, v3);
                    v1 = Avx2.mm256_xor_si256(v1, v2);
                    v1 = Avx2.mm256_xor_si256(Avx2.mm256_slli_epi32(v1, 7), Avx2.mm256_srli_epi32(v1, 25));

                    // Rotate the rows back to column order.
                    v1 = Avx2.mm256_shuffle_epi32(v1, 0x93);
                    v2 = Avx2.mm256_shuffle_epi32(v2, 0x4E);
                    v3 = Avx2.mm256_shuffle_epi32(v3, 0x39);
                }
                // Feed-forward: add the initial state back in (the ChaCha output transform).
                v0 = Avx2.mm256_add_epi32(v0, x0);
                v1 = Avx2.mm256_add_epi32(v1, x1);
                v2 = Avx2.mm256_add_epi32(v2, x2);
                v3 = Avx2.mm256_add_epi32(v3, x3);
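
                // v0..v3 hold block 0 in their low 128-bit lanes and block 1 in their high
                // lanes; mm256_permute2x128_si256 regroups them into two sequential 64-byte blocks.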
                var n0 = Avx2.mm256_permute2x128_si256(v0, v1, 0x20);
                var n1 = Avx2.mm256_permute2x128_si256(v2, v3, 0x20);
                var n2 = Avx2.mm256_permute2x128_si256(v0, v1, 0x31);
                var n3 = Avx2.mm256_permute2x128_si256(v2, v3, 0x31);

                ulong* uInput = (ulong*)input;
                n0 = Avx2.mm256_xor_si256(n0, new v256(uInput[0], uInput[1], uInput[2], uInput[3]));     // Load256_Byte(input)
                n1 = Avx2.mm256_xor_si256(n1, new v256(uInput[4], uInput[5], uInput[6], uInput[7]));     // Load256_Byte(input[0x20..])
                n2 = Avx2.mm256_xor_si256(n2, new v256(uInput[8], uInput[9], uInput[10], uInput[11]));   // Load256_Byte(input[0x40..])
                n3 = Avx2.mm256_xor_si256(n3, new v256(uInput[12], uInput[13], uInput[14], uInput[15])); // Load256_Byte(input[0x60..])

                ulong* uOutput = (ulong*)output;
                uOutput[0] = n0.ULong0; uOutput[1] = n0.ULong1; uOutput[2] = n0.ULong2; uOutput[3] = n0.ULong3;     // Store256_Byte(n0, output);
                uOutput[4] = n1.ULong0; uOutput[5] = n1.ULong1; uOutput[6] = n1.ULong2; uOutput[7] = n1.ULong3;     // Store256_Byte(n1, output[0x20..]);
                uOutput[8] = n2.ULong0; uOutput[9] = n2.ULong1; uOutput[10] = n2.ULong2; uOutput[11] = n2.ULong3;   // Store256_Byte(n2, output[0x40..]);
                uOutput[12] = n3.ULong0; uOutput[13] = n3.ULong1; uOutput[14] = n3.ULong2; uOutput[15] = n3.ULong3; // Store256_Byte(n3, output[0x60..]);
            }
#if !UNITY_ANDROID && !UNITY_IOS
            else if (Sse2.IsSse2Supported)
            {
                var x0 = Sse2.loadu_si128(state);      // Load128_UInt32(state.AsSpan());
                var x1 = Sse2.loadu_si128(state + 4);  // Load128_UInt32(state.AsSpan(4));
                var x2 = Sse2.loadu_si128(state + 8);  // Load128_UInt32(state.AsSpan(8));
                var x3 = Sse2.loadu_si128(state + 12); // Load128_UInt32(state.AsSpan(12));
                ++state[12];

                var v0 = x0;
                var v1 = x1;
                var v2 = x2;
                var v3 = x3;
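
                // Unlike the AVX2 path, SSE2 handles one 64-byte block per pass: run the
                // double rounds, XOR the keystream into the first block, then reload row 3
                // with the incremented counter and repeat everything for the second block.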
                for (int i = rounds; i > 0; i -= 2)
                {
                    v0 = Sse2.add_epi32(v0, v1);
                    v3 = Sse2.xor_si128(v3, v0);
                    v3 = Sse2.xor_si128(Sse2.slli_epi32(v3, 16), Sse2.srli_epi32(v3, 16));
                    v2 = Sse2.add_epi32(v2, v3);
                    v1 = Sse2.xor_si128(v1, v2);
                    v1 = Sse2.xor_si128(Sse2.slli_epi32(v1, 12), Sse2.srli_epi32(v1, 20));
                    v0 = Sse2.add_epi32(v0, v1);
                    v3 = Sse2.xor_si128(v3, v0);
                    v3 = Sse2.xor_si128(Sse2.slli_epi32(v3, 8), Sse2.srli_epi32(v3, 24));
                    v2 = Sse2.add_epi32(v2, v3);
                    v1 = Sse2.xor_si128(v1, v2);
                    v1 = Sse2.xor_si128(Sse2.slli_epi32(v1, 7), Sse2.srli_epi32(v1, 25));
                    v1 = Sse2.shuffle_epi32(v1, 0x39);
                    v2 = Sse2.shuffle_epi32(v2, 0x4E);
                    v3 = Sse2.shuffle_epi32(v3, 0x93);
                    v0 = Sse2.add_epi32(v0, v1);
                    v3 = Sse2.xor_si128(v3, v0);
                    v3 = Sse2.xor_si128(Sse2.slli_epi32(v3, 16), Sse2.srli_epi32(v3, 16));
                    v2 = Sse2.add_epi32(v2, v3);
                    v1 = Sse2.xor_si128(v1, v2);
                    v1 = Sse2.xor_si128(Sse2.slli_epi32(v1, 12), Sse2.srli_epi32(v1, 20));
                    v0 = Sse2.add_epi32(v0, v1);
                    v3 = Sse2.xor_si128(v3, v0);
                    v3 = Sse2.xor_si128(Sse2.slli_epi32(v3, 8), Sse2.srli_epi32(v3, 24));
                    v2 = Sse2.add_epi32(v2, v3);
                    v1 = Sse2.xor_si128(v1, v2);
                    v1 = Sse2.xor_si128(Sse2.slli_epi32(v1, 7), Sse2.srli_epi32(v1, 25));
                    v1 = Sse2.shuffle_epi32(v1, 0x93);
                    v2 = Sse2.shuffle_epi32(v2, 0x4E);
                    v3 = Sse2.shuffle_epi32(v3, 0x39);
                }
                v0 = Sse2.add_epi32(v0, x0);
                v1 = Sse2.add_epi32(v1, x1);
                v2 = Sse2.add_epi32(v2, x2);
                v3 = Sse2.add_epi32(v3, x3);

                var n0 = Sse2.loadu_si128(input + 0x00); // Load128_Byte(input);
                var n1 = Sse2.loadu_si128(input + 0x10); // Load128_Byte(input[0x10..]);
                var n2 = Sse2.loadu_si128(input + 0x20); // Load128_Byte(input[0x20..]);
                var n3 = Sse2.loadu_si128(input + 0x30); // Load128_Byte(input[0x30..]);
                n0 = Sse2.xor_si128(n0, v0);
                n1 = Sse2.xor_si128(n1, v1);
                n2 = Sse2.xor_si128(n2, v2);
                n3 = Sse2.xor_si128(n3, v3);
                Sse2.storeu_si128(output + 0x00, n0); // Store128_Byte(n0, output);
                Sse2.storeu_si128(output + 0x10, n1); // Store128_Byte(n1, output[0x10..]);
                Sse2.storeu_si128(output + 0x20, n2); // Store128_Byte(n2, output[0x20..]);
                Sse2.storeu_si128(output + 0x30, n3); // Store128_Byte(n3, output[0x30..]);

                // Second block: reload row 3 with the incremented counter and repeat.
                x3 = Sse2.loadu_si128(state + 12); // Load128_UInt32(state.AsSpan(12));
                ++state[12];
                v0 = x0;
                v1 = x1;
                v2 = x2;
                v3 = x3;
                for (int i = rounds; i > 0; i -= 2)
                {
                    v0 = Sse2.add_epi32(v0, v1);
                    v3 = Sse2.xor_si128(v3, v0);
                    v3 = Sse2.xor_si128(Sse2.slli_epi32(v3, 16), Sse2.srli_epi32(v3, 16));
                    v2 = Sse2.add_epi32(v2, v3);
                    v1 = Sse2.xor_si128(v1, v2);
                    v1 = Sse2.xor_si128(Sse2.slli_epi32(v1, 12), Sse2.srli_epi32(v1, 20));
                    v0 = Sse2.add_epi32(v0, v1);
                    v3 = Sse2.xor_si128(v3, v0);
                    v3 = Sse2.xor_si128(Sse2.slli_epi32(v3, 8), Sse2.srli_epi32(v3, 24));
                    v2 = Sse2.add_epi32(v2, v3);
                    v1 = Sse2.xor_si128(v1, v2);
                    v1 = Sse2.xor_si128(Sse2.slli_epi32(v1, 7), Sse2.srli_epi32(v1, 25));
                    v1 = Sse2.shuffle_epi32(v1, 0x39);
                    v2 = Sse2.shuffle_epi32(v2, 0x4E);
                    v3 = Sse2.shuffle_epi32(v3, 0x93);
                    v0 = Sse2.add_epi32(v0, v1);
                    v3 = Sse2.xor_si128(v3, v0);
                    v3 = Sse2.xor_si128(Sse2.slli_epi32(v3, 16), Sse2.srli_epi32(v3, 16));
                    v2 = Sse2.add_epi32(v2, v3);
                    v1 = Sse2.xor_si128(v1, v2);
                    v1 = Sse2.xor_si128(Sse2.slli_epi32(v1, 12), Sse2.srli_epi32(v1, 20));
                    v0 = Sse2.add_epi32(v0, v1);
                    v3 = Sse2.xor_si128(v3, v0);
                    v3 = Sse2.xor_si128(Sse2.slli_epi32(v3, 8), Sse2.srli_epi32(v3, 24));
                    v2 = Sse2.add_epi32(v2, v3);
                    v1 = Sse2.xor_si128(v1, v2);
                    v1 = Sse2.xor_si128(Sse2.slli_epi32(v1, 7), Sse2.srli_epi32(v1, 25));
                    v1 = Sse2.shuffle_epi32(v1, 0x93);
                    v2 = Sse2.shuffle_epi32(v2, 0x4E);
                    v3 = Sse2.shuffle_epi32(v3, 0x39);
                }
                v0 = Sse2.add_epi32(v0, x0);
                v1 = Sse2.add_epi32(v1, x1);
                v2 = Sse2.add_epi32(v2, x2);
                v3 = Sse2.add_epi32(v3, x3);

                n0 = Sse2.loadu_si128(input + 0x40); // Load128_Byte(input[0x40..]);
                n1 = Sse2.loadu_si128(input + 0x50); // Load128_Byte(input[0x50..]);
                n2 = Sse2.loadu_si128(input + 0x60); // Load128_Byte(input[0x60..]);
                n3 = Sse2.loadu_si128(input + 0x70); // Load128_Byte(input[0x70..]);
                n0 = Sse2.xor_si128(n0, v0);
                n1 = Sse2.xor_si128(n1, v1);
                n2 = Sse2.xor_si128(n2, v2);
                n3 = Sse2.xor_si128(n3, v3);
                Sse2.storeu_si128(output + 0x40, n0); // Store128_Byte(n0, output[0x40..]);
                Sse2.storeu_si128(output + 0x50, n1); // Store128_Byte(n1, output[0x50..]);
                Sse2.storeu_si128(output + 0x60, n2); // Store128_Byte(n2, output[0x60..]);
                Sse2.storeu_si128(output + 0x70, n3); // Store128_Byte(n3, output[0x70..]);
            }
#endif
            else if (Neon.IsNeonSupported)
            {
                var x0 = Neon.vld1q_u32(state);      // Load128_UInt32(state.AsSpan());
                var x1 = Neon.vld1q_u32(state + 4);  // Load128_UInt32(state.AsSpan(4));
                var x2 = Neon.vld1q_u32(state + 8);  // Load128_UInt32(state.AsSpan(8));
                var x3 = Neon.vld1q_u32(state + 12); // Load128_UInt32(state.AsSpan(12));
                ++state[12];

                var v0 = x0;
                var v1 = x1;
                var v2 = x2;
                var v3 = x3;
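
                // NEON has no single-instruction counterpart to _mm_shuffle_epi32, so the row
                // rotations of the diagonal rounds are emulated lane by lane with
                // vgetq_lane_u32/vsetq_lane_u32 (see the SSE2NEON reference at the top of the file).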
                for (int i = rounds; i > 0; i -= 2)
                {
                    v0 = Neon.vaddq_u32(v0, v1);
                    v3 = Neon.veorq_u32(v3, v0);
                    v3 = Neon.veorq_u32(Neon.vshlq_n_u32(v3, 16), Neon.vshrq_n_u32(v3, 16));
                    v2 = Neon.vaddq_u32(v2, v3);
                    v1 = Neon.veorq_u32(v1, v2);
                    v1 = Neon.veorq_u32(Neon.vshlq_n_u32(v1, 12), Neon.vshrq_n_u32(v1, 20));
                    v0 = Neon.vaddq_u32(v0, v1);
                    v3 = Neon.veorq_u32(v3, v0);
                    v3 = Neon.veorq_u32(Neon.vshlq_n_u32(v3, 8), Neon.vshrq_n_u32(v3, 24));
                    v2 = Neon.vaddq_u32(v2, v3);
                    v1 = Neon.veorq_u32(v1, v2);
                    v1 = Neon.veorq_u32(Neon.vshlq_n_u32(v1, 7), Neon.vshrq_n_u32(v1, 25));

                    // Neon_shuffle_epi32(v1, 0x39, out v1);
                    v128 ret;
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v1, (0x39) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x39) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x39) >> 4) & 0x3), ret, 2);
                    v1 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x39) >> 6) & 0x3), ret, 3);

                    // Neon_shuffle_epi32(v2, 0x4E, out v2);
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v2, (0x4E) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 4) & 0x3), ret, 2);
                    v2 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 6) & 0x3), ret, 3);

                    // Neon_shuffle_epi32(v3, 0x93, out v3);
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v3, (0x93) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x93) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x93) >> 4) & 0x3), ret, 2);
                    v3 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x93) >> 6) & 0x3), ret, 3);

                    v0 = Neon.vaddq_u32(v0, v1);
                    v3 = Neon.veorq_u32(v3, v0);
                    v3 = Neon.veorq_u32(Neon.vshlq_n_u32(v3, 16), Neon.vshrq_n_u32(v3, 16));
                    v2 = Neon.vaddq_u32(v2, v3);
                    v1 = Neon.veorq_u32(v1, v2);
                    v1 = Neon.veorq_u32(Neon.vshlq_n_u32(v1, 12), Neon.vshrq_n_u32(v1, 20));
                    v0 = Neon.vaddq_u32(v0, v1);
                    v3 = Neon.veorq_u32(v3, v0);
                    v3 = Neon.veorq_u32(Neon.vshlq_n_u32(v3, 8), Neon.vshrq_n_u32(v3, 24));
                    v2 = Neon.vaddq_u32(v2, v3);
                    v1 = Neon.veorq_u32(v1, v2);
                    v1 = Neon.veorq_u32(Neon.vshlq_n_u32(v1, 7), Neon.vshrq_n_u32(v1, 25));

                    // Neon_shuffle_epi32(v1, 0x93, out v1);
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v1, (0x93) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x93) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x93) >> 4) & 0x3), ret, 2);
                    v1 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x93) >> 6) & 0x3), ret, 3);

                    // Neon_shuffle_epi32(v2, 0x4E, out v2);
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v2, (0x4E) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 4) & 0x3), ret, 2);
                    v2 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 6) & 0x3), ret, 3);

                    // Neon_shuffle_epi32(v3, 0x39, out v3);
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v3, (0x39) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x39) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x39) >> 4) & 0x3), ret, 2);
                    v3 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x39) >> 6) & 0x3), ret, 3);
                }
                v0 = Neon.vaddq_u32(v0, x0);
                v1 = Neon.vaddq_u32(v1, x1);
                v2 = Neon.vaddq_u32(v2, x2);
                v3 = Neon.vaddq_u32(v3, x3);

                var n0 = Neon.vld1q_u32((uint*)(input + 0x00)); // Load128_Byte(input);
                var n1 = Neon.vld1q_u32((uint*)(input + 0x10)); // Load128_Byte(input[0x10..]);
                var n2 = Neon.vld1q_u32((uint*)(input + 0x20)); // Load128_Byte(input[0x20..]);
                var n3 = Neon.vld1q_u32((uint*)(input + 0x30)); // Load128_Byte(input[0x30..]);
                n0 = Neon.veorq_u32(n0, v0);
                n1 = Neon.veorq_u32(n1, v1);
                n2 = Neon.veorq_u32(n2, v2);
                n3 = Neon.veorq_u32(n3, v3);
                Neon.vst1q_u32((uint*)(output + 0x00), n0); // Store128_Byte(n0, output);
                Neon.vst1q_u32((uint*)(output + 0x10), n1); // Store128_Byte(n1, output[0x10..]);
                Neon.vst1q_u32((uint*)(output + 0x20), n2); // Store128_Byte(n2, output[0x20..]);
                Neon.vst1q_u32((uint*)(output + 0x30), n3); // Store128_Byte(n3, output[0x30..]);

                // Second block: reload row 3 with the incremented counter and repeat.
                x3 = Neon.vld1q_u32(state + 12); // Load128_UInt32(state.AsSpan(12));
                ++state[12];
                v0 = x0;
                v1 = x1;
                v2 = x2;
                v3 = x3;
                for (int i = rounds; i > 0; i -= 2)
                {
                    v0 = Neon.vaddq_u32(v0, v1);
                    v3 = Neon.veorq_u32(v3, v0);
                    v3 = Neon.veorq_u32(Neon.vshlq_n_u32(v3, 16), Neon.vshrq_n_u32(v3, 16));
                    v2 = Neon.vaddq_u32(v2, v3);
                    v1 = Neon.veorq_u32(v1, v2);
                    v1 = Neon.veorq_u32(Neon.vshlq_n_u32(v1, 12), Neon.vshrq_n_u32(v1, 20));
                    v0 = Neon.vaddq_u32(v0, v1);
                    v3 = Neon.veorq_u32(v3, v0);
                    v3 = Neon.veorq_u32(Neon.vshlq_n_u32(v3, 8), Neon.vshrq_n_u32(v3, 24));
                    v2 = Neon.vaddq_u32(v2, v3);
                    v1 = Neon.veorq_u32(v1, v2);
                    v1 = Neon.veorq_u32(Neon.vshlq_n_u32(v1, 7), Neon.vshrq_n_u32(v1, 25));

                    // Neon_shuffle_epi32(v1, 0x39, out v1);
                    v128 ret;
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v1, (0x39) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x39) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x39) >> 4) & 0x3), ret, 2);
                    v1 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x39) >> 6) & 0x3), ret, 3);

                    // Neon_shuffle_epi32(v2, 0x4E, out v2);
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v2, (0x4E) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 4) & 0x3), ret, 2);
                    v2 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 6) & 0x3), ret, 3);

                    // Neon_shuffle_epi32(v3, 0x93, out v3);
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v3, (0x93) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x93) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x93) >> 4) & 0x3), ret, 2);
                    v3 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x93) >> 6) & 0x3), ret, 3);

                    v0 = Neon.vaddq_u32(v0, v1);
                    v3 = Neon.veorq_u32(v3, v0);
                    v3 = Neon.veorq_u32(Neon.vshlq_n_u32(v3, 16), Neon.vshrq_n_u32(v3, 16));
                    v2 = Neon.vaddq_u32(v2, v3);
                    v1 = Neon.veorq_u32(v1, v2);
                    v1 = Neon.veorq_u32(Neon.vshlq_n_u32(v1, 12), Neon.vshrq_n_u32(v1, 20));
                    v0 = Neon.vaddq_u32(v0, v1);
                    v3 = Neon.veorq_u32(v3, v0);
                    v3 = Neon.veorq_u32(Neon.vshlq_n_u32(v3, 8), Neon.vshrq_n_u32(v3, 24));
                    v2 = Neon.vaddq_u32(v2, v3);
                    v1 = Neon.veorq_u32(v1, v2);
                    v1 = Neon.veorq_u32(Neon.vshlq_n_u32(v1, 7), Neon.vshrq_n_u32(v1, 25));

                    // Neon_shuffle_epi32(v1, 0x93, out v1);
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v1, (0x93) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x93) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x93) >> 4) & 0x3), ret, 2);
                    v1 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v1, ((0x93) >> 6) & 0x3), ret, 3);

                    // Neon_shuffle_epi32(v2, 0x4E, out v2);
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v2, (0x4E) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 4) & 0x3), ret, 2);
                    v2 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v2, ((0x4E) >> 6) & 0x3), ret, 3);

                    // Neon_shuffle_epi32(v3, 0x39, out v3);
                    ret = Neon.vmovq_n_u32(Neon.vgetq_lane_u32(v3, (0x39) & 0x3));
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x39) >> 2) & 0x3), ret, 1);
                    ret = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x39) >> 4) & 0x3), ret, 2);
                    v3 = Neon.vsetq_lane_u32(Neon.vgetq_lane_u32(v3, ((0x39) >> 6) & 0x3), ret, 3);
                }
                v0 = Neon.vaddq_u32(v0, x0);
                v1 = Neon.vaddq_u32(v1, x1);
                v2 = Neon.vaddq_u32(v2, x2);
                v3 = Neon.vaddq_u32(v3, x3);

                n0 = Neon.vld1q_u32((uint*)(input + 0x40)); // Load128_Byte(input[0x40..]);
                n1 = Neon.vld1q_u32((uint*)(input + 0x50)); // Load128_Byte(input[0x50..]);
                n2 = Neon.vld1q_u32((uint*)(input + 0x60)); // Load128_Byte(input[0x60..]);
                n3 = Neon.vld1q_u32((uint*)(input + 0x70)); // Load128_Byte(input[0x70..]);
                n0 = Neon.veorq_u32(n0, v0);
                n1 = Neon.veorq_u32(n1, v1);
                n2 = Neon.veorq_u32(n2, v2);
                n3 = Neon.veorq_u32(n3, v3);
                Neon.vst1q_u32((uint*)(output + 0x40), n0); // Store128_Byte(n0, output[0x40..]);
                Neon.vst1q_u32((uint*)(output + 0x50), n1); // Store128_Byte(n1, output[0x50..]);
                Neon.vst1q_u32((uint*)(output + 0x60), n2); // Store128_Byte(n2, output[0x60..]);
                Neon.vst1q_u32((uint*)(output + 0x70), n3); // Store128_Byte(n3, output[0x70..]);
            }
            else
            {
                // Scalar fallback, equivalent to two inlined ImplProcessBlock calls:
                //   ImplProcessBlock(input, output);
                //   ImplProcessBlock(input[64..], output[64..]);
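                // ChachaCoreImpl produces one 64-byte keystream block in keyStream, which is
                // then XORed into the input eight bytes at a time. On Android device builds the
                // ulong fast path is guarded by an alignment check (unaligned 8-byte accesses
                // can fault on 32-bit ARM); misaligned buffers take the byte-wise loop instead.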
                FastChaChaEngineHelper.ChachaCoreImpl(rounds, state, keyStream);
                ++state[12];
#if UNITY_ANDROID && !UNITY_EDITOR
                // The ulong fast path needs 8-byte alignment on every buffer it dereferences.
                if ((((long)input | (long)output | (long)keyStream) % sizeof(ulong)) == 0)
                {
#endif
                var pulinput = (ulong*)input;
                var puloutput = (ulong*)output;
                var pulkeyStream = (ulong*)keyStream;
                puloutput[7] = pulkeyStream[7] ^ pulinput[7];
                puloutput[6] = pulkeyStream[6] ^ pulinput[6];
                puloutput[5] = pulkeyStream[5] ^ pulinput[5];
                puloutput[4] = pulkeyStream[4] ^ pulinput[4];
                puloutput[3] = pulkeyStream[3] ^ pulinput[3];
                puloutput[2] = pulkeyStream[2] ^ pulinput[2];
                puloutput[1] = pulkeyStream[1] ^ pulinput[1];
                puloutput[0] = pulkeyStream[0] ^ pulinput[0];
#if UNITY_ANDROID && !UNITY_EDITOR
                }
                else
                {
                    for (int i = 0; i < 64; ++i)
                        output[i] = (byte)(keyStream[i] ^ input[i]);
                }
#endif

                // Second block: fresh keystream, then XOR the next 64 bytes.
                FastChaChaEngineHelper.ChachaCoreImpl(rounds, state, keyStream);
                ++state[12];
#if UNITY_ANDROID && !UNITY_EDITOR
                if ((((long)input | (long)output | (long)keyStream) % sizeof(ulong)) == 0)
                {
                    ulong* pulinput = null;
                    ulong* puloutput = null;
                    ulong* pulkeyStream = null;
#endif
                pulinput = (ulong*)&input[64];
                puloutput = (ulong*)&output[64];
                pulkeyStream = (ulong*)keyStream;
                puloutput[7] = pulkeyStream[7] ^ pulinput[7];
                puloutput[6] = pulkeyStream[6] ^ pulinput[6];
                puloutput[5] = pulkeyStream[5] ^ pulinput[5];
                puloutput[4] = pulkeyStream[4] ^ pulinput[4];
                puloutput[3] = pulkeyStream[3] ^ pulinput[3];
                puloutput[2] = pulkeyStream[2] ^ pulinput[2];
                puloutput[1] = pulkeyStream[1] ^ pulinput[1];
                puloutput[0] = pulkeyStream[0] ^ pulinput[0];
#if UNITY_ANDROID && !UNITY_EDITOR
                }
                else
                {
                    for (int i = 64; i < 128; ++i)
                        output[i] = (byte)(keyStream[i - 64] ^ input[i]);
                }
#endif
            }
        }
    }
}
#endif