#if !BESTHTTP_DISABLE_ALTERNATE_SSL && (!UNITY_WEBGL || UNITY_EDITOR)
#pragma warning disable
using System;
using System.Diagnostics;
#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER || UNITY_2021_2_OR_NEWER
using System.Runtime.CompilerServices;
#endif
#if NETCOREAPP3_0_OR_GREATER
using System.Buffers.Binary;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif
using Best.HTTP.SecureProtocol.Org.BouncyCastle.Crypto.Utilities;
namespace Best.HTTP.SecureProtocol.Org.BouncyCastle.Crypto.Engines
{
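/// <summary>
/// Implementation of Daniel J. Bernstein's ChaCha stream cipher, in the "ChaCha7539" variant
/// (12-byte nonce, single 32-bit block counter held in state word 12).
/// </summary>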
public class ChaCha7539Engine
: Salsa20Engine
{
/// <summary>
/// Creates a 20 rounds ChaCha engine (the round count is the one selected by the
/// parameterless base-class constructor, which is invoked implicitly).
/// </summary>
public ChaCha7539Engine()
{
}
/// <summary>Gets the fixed algorithm identifier, <c>"ChaCha7539"</c>.</summary>
public override string AlgorithmName => "ChaCha7539";
/// <summary>Gets the required nonce length in bytes (12, i.e. the three state words 13..15).</summary>
protected override int NonceSize => 12;
/// <summary>
/// Increments the block counter stored in state word 12.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The counter wrapped around to zero. The wrapped value has already been stored when this is thrown.
/// </exception>
protected override void AdvanceCounter()
{
    uint bumped = engineState[12] + 1U;
    engineState[12] = bumped;

    if (bumped == 0)
    {
        throw new InvalidOperationException("attempt to increase counter past 2^32.");
    }
}
/// <summary>Sets the block counter (state word 12) back to zero.</summary>
protected override void ResetCounter() => engineState[12] = 0;
/// <summary>
/// Loads key material and nonce into the engine state.
/// </summary>
/// <param name="keyBytes">
/// The key, exactly 32 bytes, or <c>null</c> to keep whatever key words are already in the state
/// and only replace the nonce.
/// </param>
/// <param name="ivBytes">The nonce; its first 12 bytes are read as three little-endian words.</param>
/// <exception cref="ArgumentException"><paramref name="keyBytes"/> is non-null and not 32 bytes long.</exception>
protected override void SetKey(byte[] keyBytes, byte[] ivBytes)
{
    if (keyBytes != null)
    {
        int keyLength = keyBytes.Length;
        if (keyLength != 32)
        {
            throw new ArgumentException(AlgorithmName + " requires 256 bit key");
        }

        // Constant words are written by the base-class helper starting at state index 0.
        PackTauOrSigma(keyLength, engineState, 0);

        // Key: eight little-endian words into state[4..11].
        Pack.LE_To_UInt32(keyBytes, 0, engineState, 4, 8);
    }

    // IV: three little-endian words into state[13..15]; state[12] (the counter) is not touched here.
    Pack.LE_To_UInt32(ivBytes, 0, engineState, 13, 3);
}
/// <summary>
/// Fills <paramref name="output"/> with the key-stream block for the current state by delegating to
/// <c>ChaChaEngine.ChachaCore</c>. The counter is not advanced here.
/// </summary>
/// <param name="output">Destination buffer for the generated key stream.</param>
protected override void GenerateKeyStream(byte[] output) =>
    ChaChaEngine.ChachaCore(rounds, engineState, output);
/// <summary>
/// Processes the last <paramref name="inLen"/> bytes of a message and then resets the block counter
/// (state word 12) to zero. Input is consumed in 128-byte pairs of blocks, then at most one 64-byte
/// block, then a partial block XORed against a freshly generated key-stream block.
/// </summary>
/// <param name="inBuf">Source buffer.</param>
/// <param name="inOff">Offset of the first input byte.</param>
/// <param name="inLen">Number of bytes to process.</param>
/// <param name="outBuf">Destination buffer.</param>
/// <param name="outOff">Offset of the first output byte.</param>
/// <exception cref="InvalidOperationException">
/// The engine is not initialised, or <c>index</c> is non-zero (not block-aligned).
/// </exception>
internal void DoFinal(byte[] inBuf, int inOff, int inLen, byte[] outBuf, int outOff)
{
    if (!initialised)
    {
        throw new InvalidOperationException(AlgorithmName + " not initialised");
    }
    if (index != 0)
    {
        throw new InvalidOperationException(AlgorithmName + " not in block-aligned state");
    }

    Check.DataLength(inBuf, inOff, inLen, "input buffer too short");
    Check.OutputLength(outBuf, outOff, inLen, "output buffer too short");

    // Bulk phase: two blocks per iteration.
    for (; inLen >= 128; inOff += 128, inLen -= 128, outOff += 128)
    {
#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER || UNITY_2021_2_OR_NEWER
        ProcessBlocks2(inBuf.AsSpan(inOff), outBuf.AsSpan(outOff));
#else
        ProcessBlocks2(inBuf, inOff, outBuf, outOff);
#endif
    }

    // At most one whole block can remain after the bulk phase.
    if (inLen >= 64)
    {
#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER || UNITY_2021_2_OR_NEWER
        ImplProcessBlock(inBuf.AsSpan(inOff), outBuf.AsSpan(outOff));
#else
        ImplProcessBlock(inBuf, inOff, outBuf, outOff);
#endif
        inOff += 64;
        inLen -= 64;
        outOff += 64;
    }

    // Trailing partial block: only the first inLen key-stream bytes are used.
    if (inLen > 0)
    {
        GenerateKeyStream(keyStream);
        AdvanceCounter();

        int pos = 0;
        while (pos < inLen)
        {
            outBuf[outOff + pos] = (byte)(inBuf[pos + inOff] ^ keyStream[pos]);
            ++pos;
        }
    }

    // Rewind the block counter so the state is back at block zero.
    engineState[12] = 0;

    // TODO Prevent re-use if encrypting
}
#if NETCOREAPP2_1_OR_GREATER || NETSTANDARD2_1_OR_GREATER || UNITY_2021_2_OR_NEWER
/// <summary>
/// Processes exactly one 64-byte block: XORs the first 64 bytes of <paramref name="input"/> with the
/// next key-stream block into <paramref name="output"/> and advances the block counter.
/// </summary>
/// <param name="input">Source data; the first 64 bytes are read.</param>
/// <param name="output">Destination; the first 64 bytes are written.</param>
/// <exception cref="InvalidOperationException">The engine is not initialised.</exception>
/// <exception cref="MaxBytesExceededException">
/// <c>LimitExceeded(64U)</c> reported that the per-IV byte limit would be exceeded.
/// </exception>
internal void ProcessBlock(ReadOnlySpan<byte> input, Span<byte> output)
{
    if (!initialised)
        throw new InvalidOperationException(AlgorithmName + " not initialised");
    if (LimitExceeded(64U))
        throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");

    // Block-oriented entry point: callers are expected to be at a block boundary.
    Debug.Assert(index == 0);

    ImplProcessBlock(input, output);
}
/// <summary>
/// Processes exactly two consecutive 64-byte blocks (128 bytes). Uses the AVX2 implementation when
/// available, otherwise SSE2, otherwise two scalar single-block passes.
/// </summary>
/// <param name="input">Source data; the first 128 bytes are read.</param>
/// <param name="output">Destination; the first 128 bytes are written.</param>
/// <exception cref="InvalidOperationException">The engine is not initialised.</exception>
/// <exception cref="MaxBytesExceededException">
/// <c>LimitExceeded(128U)</c> reported that the per-IV byte limit would be exceeded.
/// </exception>
internal void ProcessBlocks2(ReadOnlySpan<byte> input, Span<byte> output)
{
    if (!initialised)
        throw new InvalidOperationException(AlgorithmName + " not initialised");
    if (LimitExceeded(128U))
        throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");

    // Block-oriented entry point: callers are expected to be at a block boundary.
    Debug.Assert(index == 0);

#if NETCOREAPP3_0_OR_GREATER
    if (Avx2.IsSupported)
    {
        ImplProcessBlocks2_X86_Avx2(rounds, engineState, input, output);
        return;
    }

    if (Sse2.IsSupported)
    {
        ImplProcessBlocks2_X86_Sse2(rounds, engineState, input, output);
        return;
    }
#endif

    // Portable fallback: each call generates one key-stream block and advances the counter.
    ImplProcessBlock(input, output);
    ImplProcessBlock(input[64..], output[64..]);
}
/// <summary>
/// Unchecked single-block worker: generates one key-stream block into the shared <c>keyStream</c>
/// buffer, advances the counter, then XORs 64 bytes of <paramref name="input"/> into
/// <paramref name="output"/>. Performs no initialisation or limit checks.
/// </summary>
/// <param name="input">Source data; indices 0..63 are read.</param>
/// <param name="output">Destination; indices 0..63 are written.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal void ImplProcessBlock(ReadOnlySpan<byte> input, Span<byte> output)
{
    ChaChaEngine.ChachaCore(rounds, engineState, keyStream);
    AdvanceCounter();

    for (int i = 0; i < 64; ++i)
    {
        output[i] = (byte)(keyStream[i] ^ input[i]);
    }
}
#else
/// <summary>
/// Processes exactly one 64-byte block from <paramref name="inBytes"/> into <paramref name="outBytes"/>
/// and advances the block counter.
/// </summary>
/// <param name="inBytes">Source buffer.</param>
/// <param name="inOff">Offset of the first input byte.</param>
/// <param name="outBytes">Destination buffer.</param>
/// <param name="outOff">Offset of the first output byte.</param>
/// <exception cref="InvalidOperationException">The engine is not initialised.</exception>
/// <exception cref="MaxBytesExceededException">
/// <c>LimitExceeded(64U)</c> reported that the per-IV byte limit would be exceeded.
/// </exception>
internal void ProcessBlock(byte[] inBytes, int inOff, byte[] outBytes, int outOff)
{
    if (!initialised)
    {
        throw new InvalidOperationException(AlgorithmName + " not initialised");
    }

    const uint blockBytes = 64U;
    if (LimitExceeded(blockBytes))
    {
        throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");
    }

    // Block-oriented entry point: callers are expected to be at a block boundary.
    Debug.Assert(index == 0);

    ImplProcessBlock(inBytes, inOff, outBytes, outOff);
}
/// <summary>
/// Processes exactly two consecutive 64-byte blocks (128 bytes) as two scalar single-block passes.
/// </summary>
/// <param name="inBytes">Source buffer.</param>
/// <param name="inOff">Offset of the first input byte.</param>
/// <param name="outBytes">Destination buffer.</param>
/// <param name="outOff">Offset of the first output byte.</param>
/// <exception cref="InvalidOperationException">The engine is not initialised.</exception>
/// <exception cref="MaxBytesExceededException">
/// <c>LimitExceeded(128U)</c> reported that the per-IV byte limit would be exceeded.
/// </exception>
internal void ProcessBlocks2(byte[] inBytes, int inOff, byte[] outBytes, int outOff)
{
    if (!initialised)
    {
        throw new InvalidOperationException(AlgorithmName + " not initialised");
    }

    const uint pairBytes = 128U;
    if (LimitExceeded(pairBytes))
    {
        throw new MaxBytesExceededException("2^38 byte limit per IV would be exceeded; Change IV");
    }

    // Block-oriented entry point: callers are expected to be at a block boundary.
    Debug.Assert(index == 0);

    // First block, then the block that immediately follows it.
    ImplProcessBlock(inBytes, inOff, outBytes, outOff);
    ImplProcessBlock(inBytes, inOff + 64, outBytes, outOff + 64);
}
/// <summary>
/// Unchecked single-block worker: generates one key-stream block into the shared <c>keyStream</c>
/// buffer, advances the counter, then XORs 64 input bytes into the output. Performs no
/// initialisation or limit checks.
/// </summary>
/// <param name="inBuf">Source buffer.</param>
/// <param name="inOff">Offset of the first input byte.</param>
/// <param name="outBuf">Destination buffer.</param>
/// <param name="outOff">Offset of the first output byte.</param>
#if NETSTANDARD1_0_OR_GREATER || NETCOREAPP1_0_OR_GREATER || UNITY_2021_2_OR_NEWER
[MethodImpl(MethodImplOptions.AggressiveInlining)]
#endif
internal void ImplProcessBlock(byte[] inBuf, int inOff, byte[] outBuf, int outOff)
{
    ChaChaEngine.ChachaCore(rounds, engineState, keyStream);
    AdvanceCounter();

    int src = inOff;
    int dst = outOff;
    for (int k = 0; k < 64; ++k)
    {
        outBuf[dst++] = (byte)(keyStream[k] ^ inBuf[src++]);
    }
}
#endif
#if NETCOREAPP3_0_OR_GREATER
/// <summary>
/// AVX2 implementation of a two-block (128-byte) step. Both blocks are computed in parallel: the lower
/// 128-bit lane of every 256-bit vector carries the first block and the upper lane the second, the
/// two lanes differing only in the counter word.
/// </summary>
/// <param name="rounds">Number of rounds; expected to be even (asserted in debug builds).</param>
/// <param name="state">
/// The 16-word state. <c>state[12]</c> is incremented twice; unlike <c>AdvanceCounter</c>, no
/// wrap-around check is made here.
/// </param>
/// <param name="input">Source data; at least 128 bytes.</param>
/// <param name="output">Destination; at least 128 bytes.</param>
/// <exception cref="PlatformNotSupportedException">AVX2 is not supported on this machine.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void ImplProcessBlocks2_X86_Avx2(int rounds, uint[] state, ReadOnlySpan<byte> input,
    Span<byte> output)
{
    if (!Avx2.IsSupported)
        throw new PlatformNotSupportedException();

    Debug.Assert(rounds % 2 == 0);
    Debug.Assert(state.Length >= 16);
    Debug.Assert(input.Length >= 128);
    Debug.Assert(output.Length >= 128);

    // Rows 0..2 are shared by both blocks; row 3 is captured once per counter value.
    var t0 = Load128_UInt32(state.AsSpan());
    var t1 = Load128_UInt32(state.AsSpan(4));
    var t2 = Load128_UInt32(state.AsSpan(8));
    var t3 = Load128_UInt32(state.AsSpan(12));
    ++state[12];
    var t4 = Load128_UInt32(state.AsSpan(12));
    ++state[12];

    // Lower lane = first block, upper lane = second block.
    var x0 = Vector256.Create(t0, t0);
    var x1 = Vector256.Create(t1, t1);
    var x2 = Vector256.Create(t2, t2);
    var x3 = Vector256.Create(t3, t4);

    var v0 = x0;
    var v1 = x1;
    var v2 = x2;
    var v3 = x3;

    // Each iteration performs two rounds; the shift pairs implement 32-bit left rotations by 16, 12, 8 and 7.
    for (int i = rounds; i > 0; i -= 2)
    {
        v0 = Avx2.Add(v0, v1);
        v3 = Avx2.Xor(v3, v0);
        v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 16), Avx2.ShiftRightLogical(v3, 16));
        v2 = Avx2.Add(v2, v3);
        v1 = Avx2.Xor(v1, v2);
        v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 12), Avx2.ShiftRightLogical(v1, 20));
        v0 = Avx2.Add(v0, v1);
        v3 = Avx2.Xor(v3, v0);
        v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 8), Avx2.ShiftRightLogical(v3, 24));
        v2 = Avx2.Add(v2, v3);
        v1 = Avx2.Xor(v1, v2);
        v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 7), Avx2.ShiftRightLogical(v1, 25));

        // Rotate rows 1..3 by one, two and three elements within each lane before the second round.
        v1 = Avx2.Shuffle(v1, 0x39);
        v2 = Avx2.Shuffle(v2, 0x4E);
        v3 = Avx2.Shuffle(v3, 0x93);

        v0 = Avx2.Add(v0, v1);
        v3 = Avx2.Xor(v3, v0);
        v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 16), Avx2.ShiftRightLogical(v3, 16));
        v2 = Avx2.Add(v2, v3);
        v1 = Avx2.Xor(v1, v2);
        v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 12), Avx2.ShiftRightLogical(v1, 20));
        v0 = Avx2.Add(v0, v1);
        v3 = Avx2.Xor(v3, v0);
        v3 = Avx2.Xor(Avx2.ShiftLeftLogical(v3, 8), Avx2.ShiftRightLogical(v3, 24));
        v2 = Avx2.Add(v2, v3);
        v1 = Avx2.Xor(v1, v2);
        v1 = Avx2.Xor(Avx2.ShiftLeftLogical(v1, 7), Avx2.ShiftRightLogical(v1, 25));

        // Undo the row rotation.
        v1 = Avx2.Shuffle(v1, 0x93);
        v2 = Avx2.Shuffle(v2, 0x4E);
        v3 = Avx2.Shuffle(v3, 0x39);
    }

    // Feed-forward: add the initial state to the permuted state.
    v0 = Avx2.Add(v0, x0);
    v1 = Avx2.Add(v1, x1);
    v2 = Avx2.Add(v2, x2);
    v3 = Avx2.Add(v3, x3);

    // Regroup lanes into byte order: 0x20 picks both lower lanes (first block),
    // 0x31 picks both upper lanes (second block).
    var n0 = Avx2.Permute2x128(v0, v1, 0x20).AsByte();
    var n1 = Avx2.Permute2x128(v2, v3, 0x20).AsByte();
    var n2 = Avx2.Permute2x128(v0, v1, 0x31).AsByte();
    var n3 = Avx2.Permute2x128(v2, v3, 0x31).AsByte();

    n0 = Avx2.Xor(n0, Load256_Byte(input));
    n1 = Avx2.Xor(n1, Load256_Byte(input[0x20..]));
    n2 = Avx2.Xor(n2, Load256_Byte(input[0x40..]));
    n3 = Avx2.Xor(n3, Load256_Byte(input[0x60..]));

    Store256_Byte(n0, output);
    Store256_Byte(n1, output[0x20..]);
    Store256_Byte(n2, output[0x40..]);
    Store256_Byte(n3, output[0x60..]);
}
/// <summary>
/// SSE2 implementation of a two-block (128-byte) step. The two blocks are computed one after the
/// other with 128-bit vectors, re-reading row 3 of the state after the counter has been incremented.
/// </summary>
/// <param name="rounds">Number of rounds; expected to be even (asserted in debug builds).</param>
/// <param name="state">
/// The 16-word state. <c>state[12]</c> is incremented twice; unlike <c>AdvanceCounter</c>, no
/// wrap-around check is made here.
/// </param>
/// <param name="input">Source data; at least 128 bytes.</param>
/// <param name="output">Destination; at least 128 bytes.</param>
/// <exception cref="PlatformNotSupportedException">SSE2 is not supported on this machine.</exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void ImplProcessBlocks2_X86_Sse2(int rounds, uint[] state, ReadOnlySpan<byte> input,
    Span<byte> output)
{
    if (!Sse2.IsSupported)
        throw new PlatformNotSupportedException();

    Debug.Assert(rounds % 2 == 0);
    Debug.Assert(state.Length >= 16);
    Debug.Assert(input.Length >= 128);
    Debug.Assert(output.Length >= 128);

    // ---- First block ----
    var x0 = Load128_UInt32(state.AsSpan());
    var x1 = Load128_UInt32(state.AsSpan(4));
    var x2 = Load128_UInt32(state.AsSpan(8));
    var x3 = Load128_UInt32(state.AsSpan(12));
    ++state[12];

    var v0 = x0;
    var v1 = x1;
    var v2 = x2;
    var v3 = x3;

    // Each iteration performs two rounds; the shift pairs implement 32-bit left rotations by 16, 12, 8 and 7.
    for (int i = rounds; i > 0; i -= 2)
    {
        v0 = Sse2.Add(v0, v1);
        v3 = Sse2.Xor(v3, v0);
        v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
        v2 = Sse2.Add(v2, v3);
        v1 = Sse2.Xor(v1, v2);
        v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
        v0 = Sse2.Add(v0, v1);
        v3 = Sse2.Xor(v3, v0);
        v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
        v2 = Sse2.Add(v2, v3);
        v1 = Sse2.Xor(v1, v2);
        v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));

        // Rotate rows 1..3 by one, two and three elements before the second round.
        v1 = Sse2.Shuffle(v1, 0x39);
        v2 = Sse2.Shuffle(v2, 0x4E);
        v3 = Sse2.Shuffle(v3, 0x93);

        v0 = Sse2.Add(v0, v1);
        v3 = Sse2.Xor(v3, v0);
        v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
        v2 = Sse2.Add(v2, v3);
        v1 = Sse2.Xor(v1, v2);
        v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
        v0 = Sse2.Add(v0, v1);
        v3 = Sse2.Xor(v3, v0);
        v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
        v2 = Sse2.Add(v2, v3);
        v1 = Sse2.Xor(v1, v2);
        v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));

        // Undo the row rotation.
        v1 = Sse2.Shuffle(v1, 0x93);
        v2 = Sse2.Shuffle(v2, 0x4E);
        v3 = Sse2.Shuffle(v3, 0x39);
    }

    // Feed-forward: add the initial state to the permuted state.
    v0 = Sse2.Add(v0, x0);
    v1 = Sse2.Add(v1, x1);
    v2 = Sse2.Add(v2, x2);
    v3 = Sse2.Add(v3, x3);

    var n0 = Load128_Byte(input);
    var n1 = Load128_Byte(input[0x10..]);
    var n2 = Load128_Byte(input[0x20..]);
    var n3 = Load128_Byte(input[0x30..]);

    n0 = Sse2.Xor(n0, v0.AsByte());
    n1 = Sse2.Xor(n1, v1.AsByte());
    n2 = Sse2.Xor(n2, v2.AsByte());
    n3 = Sse2.Xor(n3, v3.AsByte());

    Store128_Byte(n0, output);
    Store128_Byte(n1, output[0x10..]);
    Store128_Byte(n2, output[0x20..]);
    Store128_Byte(n3, output[0x30..]);

    // ---- Second block: rows 0..2 are unchanged, row 3 now holds the incremented counter ----
    x3 = Load128_UInt32(state.AsSpan(12));
    ++state[12];

    v0 = x0;
    v1 = x1;
    v2 = x2;
    v3 = x3;

    for (int i = rounds; i > 0; i -= 2)
    {
        v0 = Sse2.Add(v0, v1);
        v3 = Sse2.Xor(v3, v0);
        v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
        v2 = Sse2.Add(v2, v3);
        v1 = Sse2.Xor(v1, v2);
        v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
        v0 = Sse2.Add(v0, v1);
        v3 = Sse2.Xor(v3, v0);
        v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
        v2 = Sse2.Add(v2, v3);
        v1 = Sse2.Xor(v1, v2);
        v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));

        v1 = Sse2.Shuffle(v1, 0x39);
        v2 = Sse2.Shuffle(v2, 0x4E);
        v3 = Sse2.Shuffle(v3, 0x93);

        v0 = Sse2.Add(v0, v1);
        v3 = Sse2.Xor(v3, v0);
        v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 16), Sse2.ShiftRightLogical(v3, 16));
        v2 = Sse2.Add(v2, v3);
        v1 = Sse2.Xor(v1, v2);
        v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 12), Sse2.ShiftRightLogical(v1, 20));
        v0 = Sse2.Add(v0, v1);
        v3 = Sse2.Xor(v3, v0);
        v3 = Sse2.Xor(Sse2.ShiftLeftLogical(v3, 8), Sse2.ShiftRightLogical(v3, 24));
        v2 = Sse2.Add(v2, v3);
        v1 = Sse2.Xor(v1, v2);
        v1 = Sse2.Xor(Sse2.ShiftLeftLogical(v1, 7), Sse2.ShiftRightLogical(v1, 25));

        v1 = Sse2.Shuffle(v1, 0x93);
        v2 = Sse2.Shuffle(v2, 0x4E);
        v3 = Sse2.Shuffle(v3, 0x39);
    }

    v0 = Sse2.Add(v0, x0);
    v1 = Sse2.Add(v1, x1);
    v2 = Sse2.Add(v2, x2);
    v3 = Sse2.Add(v3, x3);

    n0 = Load128_Byte(input[0x40..]);
    n1 = Load128_Byte(input[0x50..]);
    n2 = Load128_Byte(input[0x60..]);
    n3 = Load128_Byte(input[0x70..]);

    n0 = Sse2.Xor(n0, v0.AsByte());
    n1 = Sse2.Xor(n1, v1.AsByte());
    n2 = Sse2.Xor(n2, v2.AsByte());
    n3 = Sse2.Xor(n3, v3.AsByte());

    Store128_Byte(n0, output[0x40..]);
    Store128_Byte(n1, output[0x50..]);
    Store128_Byte(n2, output[0x60..]);
    Store128_Byte(n3, output[0x70..]);
}
/// <summary>
/// Reads the first 16 bytes of <paramref name="t"/> into a <see cref="Vector128{T}"/> of bytes.
/// </summary>
/// <param name="t">Source span; at least 16 bytes.</param>
/// <returns>A vector whose element <c>i</c> equals <c>t[i]</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> Load128_Byte(ReadOnlySpan<byte> t)
{
    // Fast path: on little-endian platforms the in-memory layout can be reinterpreted directly.
    if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<byte>>() == 16)
        return MemoryMarshal.Read<Vector128<byte>>(t);

    // Portable path: assemble from two explicitly little-endian 64-bit reads.
    return Vector128.Create(
        BinaryPrimitives.ReadUInt64LittleEndian(t[..8]),
        BinaryPrimitives.ReadUInt64LittleEndian(t[8..])
    ).AsByte();
}
/// <summary>
/// Reads the first four 32-bit words of <paramref name="t"/> into a <see cref="Vector128{T}"/> of
/// <see cref="uint"/>.
/// </summary>
/// <param name="t">Source span; at least 4 elements.</param>
/// <returns>A vector whose element <c>i</c> equals <c>t[i]</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<uint> Load128_UInt32(ReadOnlySpan<uint> t)
{
    // Fast path: reinterpret the words' bytes directly on little-endian platforms.
    if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<uint>>() == 16)
        return MemoryMarshal.Read<Vector128<uint>>(MemoryMarshal.AsBytes(t));

    // Portable path: element-wise construction.
    return Vector128.Create(t[0], t[1], t[2], t[3]);
}
/// <summary>
/// Reads the first 32 bytes of <paramref name="t"/> into a <see cref="Vector256{T}"/> of bytes.
/// </summary>
/// <param name="t">Source span; at least 32 bytes.</param>
/// <returns>A vector whose element <c>i</c> equals <c>t[i]</c>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<byte> Load256_Byte(ReadOnlySpan<byte> t)
{
    // Fast path: on little-endian platforms the in-memory layout can be reinterpreted directly.
    if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector256<byte>>() == 32)
        return MemoryMarshal.Read<Vector256<byte>>(t);

    // Portable path: assemble from four explicitly little-endian 64-bit reads.
    return Vector256.Create(
        BinaryPrimitives.ReadUInt64LittleEndian(t[ 0.. 8]),
        BinaryPrimitives.ReadUInt64LittleEndian(t[ 8..16]),
        BinaryPrimitives.ReadUInt64LittleEndian(t[16..24]),
        BinaryPrimitives.ReadUInt64LittleEndian(t[24..32])
    ).AsByte();
}
/// <summary>
/// Writes the 16 bytes of <paramref name="s"/> to the start of <paramref name="t"/>.
/// </summary>
/// <param name="s">Vector to store.</param>
/// <param name="t">Destination span; at least 16 bytes.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Store128_Byte(Vector128<byte> s, Span<byte> t)
{
    // Fast path: on little-endian platforms the vector's memory image is already the desired byte order.
    if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector128<byte>>() == 16)
    {
        MemoryMarshal.Write(t, ref s);
        return;
    }

    // Portable path: emit two explicitly little-endian 64-bit halves.
    var u = s.AsUInt64();
    BinaryPrimitives.WriteUInt64LittleEndian(t[..8], u.GetElement(0));
    BinaryPrimitives.WriteUInt64LittleEndian(t[8..], u.GetElement(1));
}
/// <summary>
/// Writes the 32 bytes of <paramref name="s"/> to the start of <paramref name="t"/>.
/// </summary>
/// <param name="s">Vector to store.</param>
/// <param name="t">Destination span; at least 32 bytes.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void Store256_Byte(Vector256<byte> s, Span<byte> t)
{
    // Fast path: on little-endian platforms the vector's memory image is already the desired byte order.
    if (BitConverter.IsLittleEndian && Unsafe.SizeOf<Vector256<byte>>() == 32)
    {
        MemoryMarshal.Write(t, ref s);
        return;
    }

    // Portable path: emit four explicitly little-endian 64-bit quarters.
    var u = s.AsUInt64();
    BinaryPrimitives.WriteUInt64LittleEndian(t[ 0.. 8], u.GetElement(0));
    BinaryPrimitives.WriteUInt64LittleEndian(t[ 8..16], u.GetElement(1));
    BinaryPrimitives.WriteUInt64LittleEndian(t[16..24], u.GetElement(2));
    BinaryPrimitives.WriteUInt64LittleEndian(t[24..32], u.GetElement(3));
}
#endif
}
}
#pragma warning restore
#endif