From 3c0aa4470d145e87618f2fe825bc785bcbc4b580 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Mon, 11 May 2026 12:15:48 +0800 Subject: [PATCH 01/18] Add XChaCha20 algorithm, using a 64-bit counter to support large files. --- CPP/7zip/Archive/7z/7zCompressionMode.h | 4 + CPP/7zip/Archive/7z/7zEncode.cpp | 4 +- CPP/7zip/Archive/7z/7zHandler.cpp | 12 +- CPP/7zip/Archive/7z/7zHandler.h | 1 + CPP/7zip/Archive/7z/7zHandlerOut.cpp | 17 + CPP/7zip/Archive/7z/7zHeader.h | 3 +- CPP/7zip/Archive/7z/7zItem.h | 5 +- CPP/7zip/Archive/7z/7zOut.cpp | 1 + CPP/7zip/Bundles/Format7zF/Arc.mak | 2 + CPP/7zip/Crypto/XChaCha20.cpp | 504 ++++++++++++++++++++++++ CPP/7zip/Crypto/XChaCha20.h | 137 +++++++ CPP/7zip/Crypto/XChaCha20Register.cpp | 17 + CPP/7zip/UI/GUI/CompressDialog.cpp | 18 +- 13 files changed, 717 insertions(+), 8 deletions(-) create mode 100644 CPP/7zip/Crypto/XChaCha20.cpp create mode 100644 CPP/7zip/Crypto/XChaCha20.h create mode 100644 CPP/7zip/Crypto/XChaCha20Register.cpp diff --git a/CPP/7zip/Archive/7z/7zCompressionMode.h b/CPP/7zip/Archive/7z/7zCompressionMode.h index ecfee7cfb..39a9e2395 100644 --- a/CPP/7zip/Archive/7z/7zCompressionMode.h +++ b/CPP/7zip/Archive/7z/7zCompressionMode.h @@ -6,6 +6,8 @@ #include "../../Common/MethodId.h" #include "../../Common/MethodProps.h" +#include "7zHeader.h" + namespace NArchive { namespace N7z { @@ -64,6 +66,7 @@ struct CCompressionMethodMode UString Password; // _Wipe UInt64 MemoryUsageLimit; + CMethodId EncryptionMethodId; bool IsEmpty() const { return (Methods.IsEmpty() && !PasswordIsDefined); } CCompressionMethodMode(): @@ -78,6 +81,7 @@ struct CCompressionMethodMode , NumThreadGroups(0) #endif , MemoryUsageLimit((UInt64)1 << 30) + , EncryptionMethodId(k_AES) {} #ifdef Z7_CPP_IS_SUPPORTED_default diff --git a/CPP/7zip/Archive/7z/7zEncode.cpp b/CPP/7zip/Archive/7z/7zEncode.cpp index 71d1ddb79..d4aa515b0 100644 --- a/CPP/7zip/Archive/7z/7zEncode.cpp +++ b/CPP/7zip/Archive/7z/7zEncode.cpp @@ 
-562,7 +562,7 @@ HRESULT CEncoder::EncoderConstr() throw 1; CMethodFull method; - method.Id = k_AES; + method.Id = _options.EncryptionMethodId; method.NumStreams = 1; _options.Methods.Add(method); @@ -687,7 +687,7 @@ HRESULT CEncoder::EncoderConstr() { CMethodFull method; method.NumStreams = 1; - method.Id = k_AES; + method.Id = _options.EncryptionMethodId; _options.Methods.Add(method); NCoderMixer2::CCoderStreamsInfo cod; diff --git a/CPP/7zip/Archive/7z/7zHandler.cpp b/CPP/7zip/Archive/7z/7zHandler.cpp index 81dd96691..8053fad9b 100644 --- a/CPP/7zip/Archive/7z/7zHandler.cpp +++ b/CPP/7zip/Archive/7z/7zHandler.cpp @@ -307,7 +307,7 @@ bool CHandler::IsFolderEncrypted(CNum folderIndex) const for (unsigned j = 0; j < idSize; j++) id64 = ((id64 << 8) | longID[j]); inByte.SkipDataNoCheck(idSize); - if (id64 == k_AES) + if (id64 == k_AES || id64 == k_XCHACHA20) return true; if ((mainByte & 0x20) != 0) inByte.SkipDataNoCheck(inByte.ReadNum()); @@ -505,6 +505,16 @@ HRESULT CHandler::SetMethodToProp(CNum folderIndex, PROPVARIANT *prop) const ConvertUInt32ToString(numCyclesPower, s); } } + else if (id == k_XCHACHA20) + { + name = "XChaCha20"; + if (propsSize >= 1) + { + const Byte firstByte = props[0]; + const UInt32 numCyclesPower = firstByte & 0x3F; + ConvertUInt32ToString(numCyclesPower, s); + } + } } if (name) diff --git a/CPP/7zip/Archive/7z/7zHandler.h b/CPP/7zip/Archive/7z/7zHandler.h index ed535f74d..76a1e366f 100644 --- a/CPP/7zip/Archive/7z/7zHandler.h +++ b/CPP/7zip/Archive/7z/7zHandler.h @@ -60,6 +60,7 @@ class COutHandler: public CMultiMethodProps UInt32 _decoderCompatibilityVersion; CUIntVector _enabledFilters; CUIntVector _disabledFilters; + CMethodId _encryptionMethodId; void InitSolidFiles() { _numSolidFiles = (UInt64)(Int64)(-1); } void InitSolidSize() { _numSolidBytes = (UInt64)(Int64)(-1); } diff --git a/CPP/7zip/Archive/7z/7zHandlerOut.cpp b/CPP/7zip/Archive/7z/7zHandlerOut.cpp index c1c2b6369..68e898541 100644 --- 
a/CPP/7zip/Archive/7z/7zHandlerOut.cpp +++ b/CPP/7zip/Archive/7z/7zHandlerOut.cpp @@ -749,6 +749,7 @@ Z7_COM7F_IMF(CHandler::UpdateItems(ISequentialOutStream *outStream, UInt32 numIt if (methodMode.PasswordIsDefined) { + methodMode.EncryptionMethodId = _encryptionMethodId; if (_encryptHeadersSpecified) encryptHeaders = _encryptHeaders; #ifndef Z7_NO_CRYPTO @@ -760,6 +761,7 @@ Z7_COM7F_IMF(CHandler::UpdateItems(ISequentialOutStream *outStream, UInt32 numIt { headerMethod.PasswordIsDefined = methodMode.PasswordIsDefined; headerMethod.Password = methodMode.Password; + headerMethod.EncryptionMethodId = _encryptionMethodId; } } @@ -874,6 +876,7 @@ void COutHandler::InitProps7z() _decoderCompatibilityVersion = k_decoderCompatibilityVersion; _enabledFilters.Clear(); _disabledFilters.Clear(); + _encryptionMethodId = k_AES; } void COutHandler::InitProps() @@ -1023,6 +1026,20 @@ HRESULT COutHandler::SetProperty(const wchar_t *nameSpec, const PROPVARIANT &val return S_OK; } + if (name.IsEqualTo("em")) + { + if (value.vt != VT_BSTR) + return E_INVALIDARG; + const wchar_t *m = value.bstrVal; + if (StringsAreEqualNoCase_Ascii(m, "AES256") || StringsAreEqualNoCase_Ascii(m, "AES-256")) + _encryptionMethodId = k_AES; + else if (StringsAreEqualNoCase_Ascii(m, "XChaCha20")) + _encryptionMethodId = k_XCHACHA20; + else + return E_INVALIDARG; + return S_OK; + } + { bool processed; RINOK(TimeOptions.Parse(name, value, processed)) diff --git a/CPP/7zip/Archive/7z/7zHeader.h b/CPP/7zip/Archive/7z/7zHeader.h index 22e796070..449bd2039 100644 --- a/CPP/7zip/Archive/7z/7zHeader.h +++ b/CPP/7zip/Archive/7z/7zHeader.h @@ -122,7 +122,8 @@ const UInt32 k_ARM = 0x3030501; const UInt32 k_ARMT = 0x3030701; const UInt32 k_SPARC = 0x3030805; -const UInt32 k_AES = 0x6F10701; +const UInt32 k_AES = 0x6F10701; +const UInt32 k_XCHACHA20 = 0x6F10702; // const UInt32 k_ZSTD = 0x4015D; // winzip zstd // 0x4F71101, 7z-zstd diff --git a/CPP/7zip/Archive/7z/7zItem.h b/CPP/7zip/Archive/7z/7zItem.h index 
06737c812..edd101bb9 100644 --- a/CPP/7zip/Archive/7z/7zItem.h +++ b/CPP/7zip/Archive/7z/7zItem.h @@ -83,8 +83,11 @@ struct CFolder bool IsEncrypted() const { FOR_VECTOR(i, Coders) - if (Coders[i].MethodID == k_AES) + { + CMethodId id = Coders[i].MethodID; + if (id == k_AES || id == k_XCHACHA20) return true; + } return false; } }; diff --git a/CPP/7zip/Archive/7z/7zOut.cpp b/CPP/7zip/Archive/7z/7zOut.cpp index d0c8cf293..dfedd9091 100644 --- a/CPP/7zip/Archive/7z/7zOut.cpp +++ b/CPP/7zip/Archive/7z/7zOut.cpp @@ -933,6 +933,7 @@ HRESULT COutArchive::WriteDatabase( CCompressionMethodMode encryptOptions; encryptOptions.PasswordIsDefined = options->PasswordIsDefined; encryptOptions.Password = options->Password; + encryptOptions.EncryptionMethodId = options->EncryptionMethodId; CEncoder encoder(headerOptions.CompressMainHeader ? *options : encryptOptions); CRecordVector packSizes; CObjectVector folders; diff --git a/CPP/7zip/Bundles/Format7zF/Arc.mak b/CPP/7zip/Bundles/Format7zF/Arc.mak index b1c6fe229..4d44bceca 100644 --- a/CPP/7zip/Bundles/Format7zF/Arc.mak +++ b/CPP/7zip/Bundles/Format7zF/Arc.mak @@ -244,6 +244,8 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ + $O\XChaCha20.obj \ + $O\XChaCha20Register.obj \ $O\HmacSha1.obj \ $O\HmacSha256.obj \ $O\MyAes.obj \ diff --git a/CPP/7zip/Crypto/XChaCha20.cpp b/CPP/7zip/Crypto/XChaCha20.cpp new file mode 100644 index 000000000..040d852d9 --- /dev/null +++ b/CPP/7zip/Crypto/XChaCha20.cpp @@ -0,0 +1,504 @@ +// XChaCha20.cpp + +#include "StdAfx.h" + +#include "../../../C/CpuArch.h" +#include "../../../C/Sha256.h" + +#include "../../Common/ComTry.h" +#include "../../Common/MyBuffer2.h" + +#ifndef Z7_ST +#include "../../Windows/Synchronization.h" +#endif + +#include "../Common/StreamUtils.h" + +#include "XChaCha20.h" + +#ifndef Z7_EXTRACT_ONLY +#include "RandGen.h" +#endif + +namespace NCrypto { +namespace NXChaCha20 { + +static const unsigned k_NumCyclesPower_Supported_MAX = 24; + +#define ROTL32(v, 
n) (((v) << (n)) | ((v) >> (32 - (n)))) + +#define QUARTERROUND(a, b, c, d) \ + a += b; d ^= a; d = ROTL32(d, 16); \ + c += d; b ^= c; b = ROTL32(b, 12); \ + a += b; d ^= a; d = ROTL32(d, 8); \ + c += d; b ^= c; b = ROTL32(b, 7); + +bool CKeyInfo::IsEqualTo(const CKeyInfo &a) const +{ + if (SaltSize != a.SaltSize || NumCyclesPower != a.NumCyclesPower) + return false; + for (unsigned i = 0; i < SaltSize; i++) + if (Salt[i] != a.Salt[i]) + return false; + return (Password == a.Password); +} + +void CKeyInfo::CalcKey() +{ + if (NumCyclesPower == 0x3F) + { + unsigned pos; + for (pos = 0; pos < SaltSize; pos++) + Key[pos] = Salt[pos]; + for (unsigned i = 0; i < Password.Size() && pos < kKeySize; i++) + Key[pos++] = Password[i]; + for (; pos < kKeySize; pos++) + Key[pos] = 0; + } + else + { + const unsigned kUnrPow = 6; + const UInt32 numUnroll = (UInt32)1 << (NumCyclesPower <= kUnrPow ? (unsigned)NumCyclesPower : kUnrPow); + + const size_t bufSize = 8 + SaltSize + Password.Size(); + const size_t unrollSize = bufSize * numUnroll; + + const size_t shaAllocSize = sizeof(CSha256) + unrollSize + bufSize * 2; + CAlignedBuffer1 sha(shaAllocSize); + Byte *buf = sha + sizeof(CSha256); + + memcpy(buf, Salt, SaltSize); + memcpy(buf + SaltSize, Password, Password.Size()); + memset(buf + bufSize - 8, 0, 8); + + Sha256_Init((CSha256 *)(void *)(Byte *)sha); + + { + { + Byte *dest = buf; + for (UInt32 i = 1; i < numUnroll; i++) + { + dest += bufSize; + memcpy(dest, buf, bufSize); + } + } + + const UInt32 numRounds = (UInt32)1 << NumCyclesPower; + UInt32 r = 0; + do + { + Byte *dest = buf + bufSize - 8; + UInt32 i = r; + r += numUnroll; + do + { + SetUi32(dest, i) i++; dest += bufSize; + } + while (i < r); + Sha256_Update((CSha256 *)(void *)(Byte *)sha, buf, unrollSize); + } + while (r < numRounds); + } + + Sha256_Final((CSha256 *)(void *)(Byte *)sha, Key); + memset(sha, 0, shaAllocSize); + } +} + +bool CKeyInfoCache::GetKey(CKeyInfo &key) +{ + FOR_VECTOR (i, Keys) + { + const CKeyInfo 
&cached = Keys[i]; + if (key.IsEqualTo(cached)) + { + for (unsigned j = 0; j < kKeySize; j++) + key.Key[j] = cached.Key[j]; + if (i != 0) + Keys.MoveToFront(i); + return true; + } + } + return false; +} + +void CKeyInfoCache::FindAndAdd(const CKeyInfo &key) +{ + FOR_VECTOR (i, Keys) + { + const CKeyInfo &cached = Keys[i]; + if (key.IsEqualTo(cached)) + { + if (i != 0) + Keys.MoveToFront(i); + return; + } + } + Add(key); +} + +void CKeyInfoCache::Add(const CKeyInfo &key) +{ + if (Keys.Size() >= Size) + Keys.DeleteBack(); + Keys.Insert(0, key); +} + +static CKeyInfoCache g_GlobalKeyCache(32); + +#ifndef Z7_ST + static NWindows::NSynchronization::CCriticalSection g_GlobalKeyCacheCriticalSection; + #define MT_LOCK NWindows::NSynchronization::CCriticalSectionLock lock(g_GlobalKeyCacheCriticalSection); +#else + #define MT_LOCK +#endif + +CBase::CBase(): + _cachedKeys(16), + _counter(0) +{ + for (unsigned i = 0; i < sizeof(_nonce); i++) + _nonce[i] = 0; +} + +void CBaseCoder::DeriveKey() +{ + HChaCha20Block(_derivedKey, _key.Key, _nonce); + _derivedKeyValid = true; +} + +void CBase::PrepareKey() +{ + MT_LOCK + + bool finded = false; + if (!_cachedKeys.GetKey(_key)) + { + finded = g_GlobalKeyCache.GetKey(_key); + if (!finded) + _key.CalcKey(); + _cachedKeys.Add(_key); + } + if (!finded) + g_GlobalKeyCache.FindAndAdd(_key); +} + +static const Byte kSigma[16] = { + 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', '2', '-', 'b', 'y', 't', 'e', ' ', 'k' +}; + +void CBaseCoder::HChaCha20Block(Byte *output, const Byte *key, const Byte *nonce) +{ + UInt32 x0, x1, x2, x3, x4, x5, x6, x7; + UInt32 x8, x9, x10, x11, x12, x13, x14, x15; + + x0 = GetUi32(kSigma); + x1 = GetUi32(kSigma + 4); + x2 = GetUi32(kSigma + 8); + x3 = GetUi32(kSigma + 12); + + x4 = GetUi32(key); + x5 = GetUi32(key + 4); + x6 = GetUi32(key + 8); + x7 = GetUi32(key + 12); + x8 = GetUi32(key + 16); + x9 = GetUi32(key + 20); + x10 = GetUi32(key + 24); + x11 = GetUi32(key + 28); + + x12 = GetUi32(nonce); + x13 = 
GetUi32(nonce + 4); + x14 = GetUi32(nonce + 8); + x15 = GetUi32(nonce + 12); + +#define DOUBLE_ROUND \ + QUARTERROUND(x0, x4, x8, x12); \ + QUARTERROUND(x1, x5, x9, x13); \ + QUARTERROUND(x2, x6, x10, x14); \ + QUARTERROUND(x3, x7, x11, x15); \ + QUARTERROUND(x0, x5, x10, x15); \ + QUARTERROUND(x1, x6, x11, x12); \ + QUARTERROUND(x2, x7, x8, x13); \ + QUARTERROUND(x3, x4, x9, x14); + + DOUBLE_ROUND; DOUBLE_ROUND; + DOUBLE_ROUND; DOUBLE_ROUND; + DOUBLE_ROUND; DOUBLE_ROUND; + DOUBLE_ROUND; DOUBLE_ROUND; + DOUBLE_ROUND; DOUBLE_ROUND; + +#undef DOUBLE_ROUND + + SetUi32(output, x0); + SetUi32(output + 4, x1); + SetUi32(output + 8, x2); + SetUi32(output + 12, x3); + SetUi32(output + 16, x12); + SetUi32(output + 20, x13); + SetUi32(output + 24, x14); + SetUi32(output + 28, x15); +} + +void CBaseCoder::Chacha20Block(Byte *output, const Byte *key, const Byte *nonce, UInt64 counter) +{ + UInt32 x0, x1, x2, x3, x4, x5, x6, x7; + UInt32 x8, x9, x10, x11, x12, x13, x14, x15; + + x0 = GetUi32(kSigma); + x1 = GetUi32(kSigma + 4); + x2 = GetUi32(kSigma + 8); + x3 = GetUi32(kSigma + 12); + + x4 = GetUi32(key); + x5 = GetUi32(key + 4); + x6 = GetUi32(key + 8); + x7 = GetUi32(key + 12); + x8 = GetUi32(key + 16); + x9 = GetUi32(key + 20); + x10 = GetUi32(key + 24); + x11 = GetUi32(key + 28); + + x12 = (UInt32)(counter & 0xFFFFFFFF); + x13 = (UInt32)(counter >> 32); + x14 = GetUi32(nonce); + x15 = GetUi32(nonce + 4); + +#define DOUBLE_ROUND \ + QUARTERROUND(x0, x4, x8, x12); \ + QUARTERROUND(x1, x5, x9, x13); \ + QUARTERROUND(x2, x6, x10, x14); \ + QUARTERROUND(x3, x7, x11, x15); \ + QUARTERROUND(x0, x5, x10, x15); \ + QUARTERROUND(x1, x6, x11, x12); \ + QUARTERROUND(x2, x7, x8, x13); \ + QUARTERROUND(x3, x4, x9, x14); + + DOUBLE_ROUND; DOUBLE_ROUND; + DOUBLE_ROUND; DOUBLE_ROUND; + DOUBLE_ROUND; DOUBLE_ROUND; + DOUBLE_ROUND; DOUBLE_ROUND; + DOUBLE_ROUND; DOUBLE_ROUND; + +#undef DOUBLE_ROUND + + x0 += GetUi32(kSigma); + x1 += GetUi32(kSigma + 4); + x2 += GetUi32(kSigma + 8); + x3 += 
GetUi32(kSigma + 12); + x4 += GetUi32(key); + x5 += GetUi32(key + 4); + x6 += GetUi32(key + 8); + x7 += GetUi32(key + 12); + x8 += GetUi32(key + 16); + x9 += GetUi32(key + 20); + x10 += GetUi32(key + 24); + x11 += GetUi32(key + 28); + x12 += (UInt32)(counter & 0xFFFFFFFF); + x13 += (UInt32)(counter >> 32); + x14 += GetUi32(nonce); + x15 += GetUi32(nonce + 4); + + SetUi32(output, x0) + SetUi32(output + 4, x1) + SetUi32(output + 8, x2) + SetUi32(output + 12, x3) + SetUi32(output + 16, x4) + SetUi32(output + 20, x5) + SetUi32(output + 24, x6) + SetUi32(output + 28, x7) + SetUi32(output + 32, x8) + SetUi32(output + 36, x9) + SetUi32(output + 40, x10) + SetUi32(output + 44, x11) + SetUi32(output + 48, x12) + SetUi32(output + 52, x13) + SetUi32(output + 56, x14) + SetUi32(output + 60, x15) +} + +void CBaseCoder::ProcessData(Byte *data, UInt32 size) +{ + if (!_derivedKeyValid) + { + DeriveKey(); + } + + while (size > 0) + { + if (_blockPos == 0 || _blockPos >= kBlockSize) + { + Chacha20Block(_block, _derivedKey, _nonce + 16, _counter); + _counter++; + _blockPos = 0; + } + + UInt32 remaining = kBlockSize - _blockPos; + UInt32 toProcess = (size < remaining) ? 
size : remaining; + + Byte *dataPtr = data; + const Byte *blockPtr = _block + _blockPos; + UInt32 count = toProcess; + +#ifdef MY_CPU_64BIT + while (count >= 8) + { + *(UInt64 *)dataPtr ^= *(const UInt64 *)blockPtr; + dataPtr += 8; + blockPtr += 8; + count -= 8; + } +#endif + + while (count >= 4) + { + *(UInt32 *)dataPtr ^= *(const UInt32 *)blockPtr; + dataPtr += 4; + blockPtr += 4; + count -= 4; + } + + while (count--) + *dataPtr++ ^= *blockPtr++; + + data += toProcess; + size -= toProcess; + _blockPos += toProcess; + } +} + +#ifndef Z7_EXTRACT_ONLY + +Z7_COM7F_IMF(CEncoder::ResetInitVector()) +{ + for (unsigned i = 0; i < sizeof(_nonce); i++) + _nonce[i] = 0; + MY_RAND_GEN(_nonce, kNonceSize); + _counter = 0; + _blockPos = kBlockSize; + _derivedKeyValid = false; + return S_OK; +} + +Z7_COM7F_IMF(CEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) +{ + Byte props[2 + sizeof(_key.Salt) + kNonceSize]; + unsigned propsSize = 1; + + const unsigned nonceSizeMinus1 = kNonceSize - 1; + const unsigned nonceHigh = (nonceSizeMinus1 >= 16) ? (1 << 6) : 0; + const unsigned nonceLow = nonceSizeMinus1 & 0x0F; + + props[0] = (Byte)(_key.NumCyclesPower + | (_key.SaltSize == 0 ? 0 : (1 << 7)) + | nonceHigh); + + if (_key.SaltSize != 0) + { + props[1] = (Byte)( + ((_key.SaltSize == 0 ? 
0 : _key.SaltSize - 1) << 4) + | nonceLow); + memcpy(props + 2, _key.Salt, _key.SaltSize); + propsSize = 2 + _key.SaltSize; + memcpy(props + propsSize, _nonce, kNonceSize); + propsSize += kNonceSize; + } + else + { + props[1] = (Byte)(nonceLow); + propsSize = 2; + memcpy(props + propsSize, _nonce, kNonceSize); + propsSize += kNonceSize; + } + + return WriteStream(outStream, props, propsSize); +} + +CEncoder::CEncoder() +{ + _key.NumCyclesPower = 19; + _counter = 0; + _blockPos = kBlockSize; + _derivedKeyValid = false; +} + +#endif + +CDecoder::CDecoder() +{ + _counter = 0; + _blockPos = kBlockSize; + _derivedKeyValid = false; +} + +Z7_COM7F_IMF(CDecoder::SetDecoderProperties2(const Byte *data, UInt32 size)) +{ + _key.ClearProps(); + + _counter = 0; + _blockPos = kBlockSize; + _derivedKeyValid = false; + unsigned i; + for (i = 0; i < sizeof(_nonce); i++) + _nonce[i] = 0; + + if (size == 0) + return S_OK; + + const unsigned b0 = data[0]; + _key.NumCyclesPower = b0 & 0x3F; + if ((b0 & 0xC0) == 0) + return size == 1 ? S_OK : E_INVALIDARG; + if (size <= 1) + return E_INVALIDARG; + + const unsigned b1 = data[1]; + const unsigned saltSize = ((b0 >> 7) & 1) + (b1 >> 4); + const unsigned nonceSizeMinus1 = ((b0 >> 6) & 1) * 16 + (b1 & 0x0F); + const unsigned nonceSize = nonceSizeMinus1 + 1; + + if (size != 2 + saltSize + nonceSize) + return E_INVALIDARG; + _key.SaltSize = saltSize; + data += 2; + for (i = 0; i < saltSize; i++) + _key.Salt[i] = *data++; + for (i = 0; i < nonceSize && i < kNonceSize; i++) + _nonce[i] = *data++; + + return (_key.NumCyclesPower <= k_NumCyclesPower_Supported_MAX + || _key.NumCyclesPower == 0x3F) ? 
S_OK : E_NOTIMPL; +} + + +Z7_COM7F_IMF(CBaseCoder::CryptoSetPassword(const Byte *data, UInt32 size)) +{ + COM_TRY_BEGIN + + _key.Password.Wipe(); + _key.Password.CopyFrom(data, (size_t)size); + _derivedKeyValid = false; + return S_OK; + + COM_TRY_END +} + +Z7_COM7F_IMF(CBaseCoder::Init()) +{ + COM_TRY_BEGIN + + PrepareKey(); + _counter = 0; + _blockPos = kBlockSize; + _derivedKeyValid = false; + return S_OK; + + COM_TRY_END +} + +Z7_COM7F_IMF2(UInt32, CBaseCoder::Filter(Byte *data, UInt32 size)) +{ + ProcessData(data, size); + return size; +} + +}} diff --git a/CPP/7zip/Crypto/XChaCha20.h b/CPP/7zip/Crypto/XChaCha20.h new file mode 100644 index 000000000..8b10fe312 --- /dev/null +++ b/CPP/7zip/Crypto/XChaCha20.h @@ -0,0 +1,137 @@ +// XChaCha20.h + +#ifndef ZIP7_INC_CRYPTO_XCHACHA20_H +#define ZIP7_INC_CRYPTO_XCHACHA20_H + +#include "../../Common/MyBuffer.h" +#include "../../Common/MyCom.h" +#include "../../Common/MyVector.h" + +#include "../ICoder.h" +#include "../IPassword.h" + +namespace NCrypto { +namespace NXChaCha20 { + +const unsigned kKeySize = 32; +const unsigned kNonceSize = 24; +const unsigned kSaltSizeMax = 16; + +class CKeyInfo +{ +public: + unsigned NumCyclesPower; + unsigned SaltSize; + Byte Salt[kSaltSizeMax]; + CByteBuffer Password; + Byte Key[kKeySize]; + + bool IsEqualTo(const CKeyInfo &a) const; + void CalcKey(); + + CKeyInfo() { ClearProps(); } + void ClearProps() + { + NumCyclesPower = 0; + SaltSize = 0; + for (unsigned i = 0; i < sizeof(Salt); i++) + Salt[i] = 0; + } + + void Wipe() + { + Password.Wipe(); + NumCyclesPower = 0; + SaltSize = 0; + Z7_memset_0_ARRAY(Salt); + Z7_memset_0_ARRAY(Key); + } + +#ifdef Z7_CPP_IS_SUPPORTED_default + CKeyInfo(const CKeyInfo &) = default; +#endif + ~CKeyInfo() { Wipe(); } +}; + +class CKeyInfoCache +{ + unsigned Size; + CObjectVector Keys; +public: + CKeyInfoCache(unsigned size): Size(size) {} + bool GetKey(CKeyInfo &key); + void Add(const CKeyInfo &key); + void FindAndAdd(const CKeyInfo &key); +}; + +class 
CBase +{ + CKeyInfoCache _cachedKeys; +protected: + CKeyInfo _key; + Byte _nonce[kNonceSize]; + UInt64 _counter; + + void PrepareKey(); + CBase(); +}; + +class CBaseCoder: + public ICompressFilter, + public ICryptoSetPassword, + public CMyUnknownImp, + public CBase +{ + Z7_IFACE_COM7_IMP(ICompressFilter) + Z7_IFACE_COM7_IMP(ICryptoSetPassword) +protected: + virtual ~CBaseCoder() {} + + static const unsigned kBlockSize = 64; + Byte _block[kBlockSize]; + unsigned _blockPos; + Byte _derivedKey[kKeySize]; + bool _derivedKeyValid; + + void HChaCha20Block(Byte *output, const Byte *key, const Byte *nonce); + void Chacha20Block(Byte *output, const Byte *key, const Byte *nonce, UInt64 counter); + void ProcessData(Byte *data, UInt32 size); + void DeriveKey(); +}; + +#ifndef Z7_EXTRACT_ONLY + +class CEncoder Z7_final: + public CBaseCoder, + public ICompressWriteCoderProperties, + public ICryptoResetInitVector +{ + Z7_COM_UNKNOWN_IMP_4( + ICompressFilter, + ICryptoSetPassword, + ICompressWriteCoderProperties, + ICryptoResetInitVector) + Z7_IFACE_COM7_IMP(ICompressWriteCoderProperties) + Z7_IFACE_COM7_IMP(ICryptoResetInitVector) +public: + CEncoder(); +}; + +#endif + +class CDecoder Z7_final: + public CBaseCoder, + public ICompressSetDecoderProperties2 +{ + Z7_COM_UNKNOWN_IMP_3( + ICompressFilter, + ICryptoSetPassword, + ICompressSetDecoderProperties2) + Z7_IFACE_COM7_IMP(ICompressSetDecoderProperties2) +public: + CDecoder(); +}; + +}} + +#endif diff --git a/CPP/7zip/Crypto/XChaCha20Register.cpp b/CPP/7zip/Crypto/XChaCha20Register.cpp new file mode 100644 index 000000000..5d22ccf73 --- /dev/null +++ b/CPP/7zip/Crypto/XChaCha20Register.cpp @@ -0,0 +1,17 @@ +// XChaCha20Register.cpp + +#include "StdAfx.h" + +#include "../Common/RegisterCodec.h" + +#include "XChaCha20.h" + +namespace NCrypto { +namespace NXChaCha20 { + +REGISTER_FILTER_E(XChaCha20, + CDecoder, + CEncoder, + 0x6F10702, "XChaCha20") + +}} diff --git a/CPP/7zip/UI/GUI/CompressDialog.cpp 
b/CPP/7zip/UI/GUI/CompressDialog.cpp index 53e56fe27..28fff2dae 100644 --- a/CPP/7zip/UI/GUI/CompressDialog.cpp +++ b/CPP/7zip/UI/GUI/CompressDialog.cpp @@ -1725,8 +1725,21 @@ void CCompressDialog::SetEncryptionMethod() const CArcInfoEx &ai = Get_ArcInfoEx(); if (ai.Is_7z()) { + const int index = FindRegistryFormat(ai.Name); + UString encryptionMethod; + if (index >= 0) + { + const NCompression::CFormatOptions &fo = m_RegistryInfo.Formats[index]; + encryptionMethod = fo.EncryptionMethod; + } ComboBox_AddStringAscii(_encryptionMethod, "AES-256"); - _encryptionMethod.SetCurSel(0); + ComboBox_AddStringAscii(_encryptionMethod, "XChaCha20"); + int sel = 0; + if (encryptionMethod.IsPrefixedBy_Ascii_NoCase("xchacha")) + sel = 1; + else if (encryptionMethod.IsPrefixedBy_Ascii_NoCase("aes")) + sel = 0; + _encryptionMethod.SetCurSel(sel); _default_encryptionMethod_Index = 0; } else if (ai.Is_Zip()) @@ -1810,8 +1823,7 @@ bool CCompressDialog::IsMethodEqualTo(const UString &s) UString CCompressDialog::GetEncryptionMethodSpec() { UString s; - if (_encryptionMethod.GetCount() > 0 - && _encryptionMethod.GetCurSel() != _default_encryptionMethod_Index) + if (_encryptionMethod.GetCount() > 0) { _encryptionMethod.GetText(s); s.RemoveChar(L'-'); From cee7e7d6d5201ca816b653b827d2ac453755ad4a Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Tue, 12 May 2026 14:49:34 +0800 Subject: [PATCH 02/18] Password derivation, AES, and XChaCha20 use constant-time comparison. 
--- CPP/7zip/Crypto/7zAes.cpp | 17 +++++++++++++---- CPP/7zip/Crypto/7zAes.h | 4 ++++ CPP/7zip/Crypto/Rar5Aes.h | 19 ++++++++++++++++--- CPP/7zip/Crypto/XChaCha20.cpp | 23 ++++++++++++++++++----- CPP/7zip/Crypto/XChaCha20.h | 10 +++++++++- 5 files changed, 60 insertions(+), 13 deletions(-) diff --git a/CPP/7zip/Crypto/7zAes.cpp b/CPP/7zip/Crypto/7zAes.cpp index b2031050a..af6c708ba 100644 --- a/CPP/7zip/Crypto/7zAes.cpp +++ b/CPP/7zip/Crypto/7zAes.cpp @@ -26,14 +26,23 @@ namespace N7z { static const unsigned k_NumCyclesPower_Supported_MAX = 24; +static bool ConstantTimeCompare(const Byte *a, const Byte *b, size_t size) +{ + volatile Byte result = 0; + for (size_t i = 0; i < size; i++) + result |= a[i] ^ b[i]; + return result == 0; +} + bool CKeyInfo::IsEqualTo(const CKeyInfo &a) const { if (SaltSize != a.SaltSize || NumCyclesPower != a.NumCyclesPower) return false; - for (unsigned i = 0; i < SaltSize; i++) - if (Salt[i] != a.Salt[i]) - return false; - return (Password == a.Password); + if (!ConstantTimeCompare(Salt, a.Salt, SaltSize)) + return false; + if (Password.Size() != a.Password.Size()) + return false; + return ConstantTimeCompare(Password, a.Password, Password.Size()); } void CKeyInfo::CalcKey() diff --git a/CPP/7zip/Crypto/7zAes.h b/CPP/7zip/Crypto/7zAes.h index 8f7bf03eb..75cd79804 100644 --- a/CPP/7zip/Crypto/7zAes.h +++ b/CPP/7zip/Crypto/7zAes.h @@ -74,6 +74,10 @@ class CBase void PrepareKey(); CBase(); + ~CBase() + { + Z7_memset_0_ARRAY(_iv); + } }; class CBaseCoder: diff --git a/CPP/7zip/Crypto/Rar5Aes.h b/CPP/7zip/Crypto/Rar5Aes.h index c6059aa21..81e42e43f 100644 --- a/CPP/7zip/Crypto/Rar5Aes.h +++ b/CPP/7zip/Crypto/Rar5Aes.h @@ -22,6 +22,14 @@ namespace NCryptoFlags const unsigned kUseMAC = 1 << 1; } +inline bool ConstantTimeCompare(const Byte *a, const Byte *b, size_t size) +{ + volatile Byte result = 0; + for (size_t i = 0; i < size; i++) + result |= a[i] ^ b[i]; + return result == 0; +} + struct CKeyBase { protected: @@ -49,9 +57,13 @@ struct 
CKey: public CKeyBase bool IsKeyEqualTo(const CKey &key) { - return _numIterationsLog == key._numIterationsLog - && memcmp(_salt, key._salt, sizeof(_salt)) == 0 - && _password == key._password; + if (_numIterationsLog != key._numIterationsLog) + return false; + if (!ConstantTimeCompare(_salt, key._salt, sizeof(_salt))) + return false; + if (_password.Size() != key._password.Size()) + return false; + return ConstantTimeCompare(_password, key._password, _password.Size()); } CKey(); @@ -79,6 +91,7 @@ class CDecoder Z7_final: Byte _iv[AES_BLOCK_SIZE]; CDecoder(); + ~CDecoder() { Z7_memset_0_ARRAY(_iv); } Z7_COM7F_IMP(Init()) diff --git a/CPP/7zip/Crypto/XChaCha20.cpp b/CPP/7zip/Crypto/XChaCha20.cpp index 040d852d9..457efc95c 100644 --- a/CPP/7zip/Crypto/XChaCha20.cpp +++ b/CPP/7zip/Crypto/XChaCha20.cpp @@ -33,14 +33,23 @@ static const unsigned k_NumCyclesPower_Supported_MAX = 24; a += b; d ^= a; d = ROTL32(d, 8); \ c += d; b ^= c; b = ROTL32(b, 7); +static bool ConstantTimeCompare(const Byte *a, const Byte *b, size_t size) +{ + volatile Byte result = 0; + for (size_t i = 0; i < size; i++) + result |= a[i] ^ b[i]; + return result == 0; +} + bool CKeyInfo::IsEqualTo(const CKeyInfo &a) const { if (SaltSize != a.SaltSize || NumCyclesPower != a.NumCyclesPower) return false; - for (unsigned i = 0; i < SaltSize; i++) - if (Salt[i] != a.Salt[i]) - return false; - return (Password == a.Password); + if (!ConstantTimeCompare(Salt, a.Salt, SaltSize)) + return false; + if (Password.Size() != a.Password.Size()) + return false; + return ConstantTimeCompare(Password, a.Password, Password.Size()); } void CKeyInfo::CalcKey() @@ -328,8 +337,12 @@ void CBaseCoder::ProcessData(Byte *data, UInt32 size) if (_blockPos == 0 || _blockPos >= kBlockSize) { Chacha20Block(_block, _derivedKey, _nonce + 16, _counter); - _counter++; _blockPos = 0; + _counter++; + if (_counter == 0) + { + memset(_block, 0, kBlockSize); + } } UInt32 remaining = kBlockSize - _blockPos; diff --git 
a/CPP/7zip/Crypto/XChaCha20.h b/CPP/7zip/Crypto/XChaCha20.h index 8b10fe312..821aa014c 100644 --- a/CPP/7zip/Crypto/XChaCha20.h +++ b/CPP/7zip/Crypto/XChaCha20.h @@ -74,6 +74,10 @@ class CBase void PrepareKey(); CBase(); + ~CBase() + { + Z7_memset_0_ARRAY(_nonce); + } }; class CBaseCoder: @@ -85,7 +89,11 @@ class CBaseCoder: Z7_IFACE_COM7_IMP(ICompressFilter) Z7_IFACE_COM7_IMP(ICryptoSetPassword) protected: - virtual ~CBaseCoder() {} + virtual ~CBaseCoder() + { + Z7_memset_0_ARRAY(_block); + Z7_memset_0_ARRAY(_derivedKey); + } static const unsigned kBlockSize = 64; Byte _block[kBlockSize]; From 7c405bb8bbf4ee451c0011ec55bf7708494dda06 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Wed, 13 May 2026 12:34:02 +0800 Subject: [PATCH 03/18] Add XChaCha20 algorithm to self-extraction --- CPP/7zip/Bundles/SFXCon/makefile | 2 ++ CPP/7zip/Bundles/SFXWin/makefile | 2 ++ 2 files changed, 4 insertions(+) diff --git a/CPP/7zip/Bundles/SFXCon/makefile b/CPP/7zip/Bundles/SFXCon/makefile index a72e3f8fb..e087c19bb 100644 --- a/CPP/7zip/Bundles/SFXCon/makefile +++ b/CPP/7zip/Bundles/SFXCon/makefile @@ -110,6 +110,8 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\MyAes.obj \ + $O\XChaCha20.obj \ + $O\XChaCha20Register.obj \ C_OBJS = \ $O\7zStream.obj \ diff --git a/CPP/7zip/Bundles/SFXWin/makefile b/CPP/7zip/Bundles/SFXWin/makefile index 806bd0739..e007f07c2 100644 --- a/CPP/7zip/Bundles/SFXWin/makefile +++ b/CPP/7zip/Bundles/SFXWin/makefile @@ -132,6 +132,8 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\MyAes.obj \ + $O\XChaCha20.obj \ + $O\XChaCha20Register.obj \ C_OBJS = \ $O\7zStream.obj \ From f891c32c2a2a530c973dc533918728559f75d721 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Fri, 22 May 2026 00:14:40 +0800 Subject: [PATCH 04/18] The extraction and derivation method is implemented as a generic function. 
--- CPP/7zip/7zip_gcc.mak | 2 + CPP/7zip/Bundles/Alone/Alone.dsp | 8 ++ CPP/7zip/Bundles/Alone/makefile | 1 + CPP/7zip/Bundles/Alone/makefile.gcc | 1 + CPP/7zip/Bundles/Alone7z/Alone.dsp | 8 ++ CPP/7zip/Bundles/Alone7z/makefile | 1 + CPP/7zip/Bundles/Alone7z/makefile.gcc | 1 + CPP/7zip/Bundles/Format7z/makefile | 1 + CPP/7zip/Bundles/Format7zExtract/makefile | 1 + CPP/7zip/Bundles/Format7zF/Arc.mak | 1 + CPP/7zip/Bundles/Format7zF/Arc_gcc.mak | 1 + CPP/7zip/Bundles/Format7zF/Format7z.dsp | 18 +++ CPP/7zip/Bundles/SFXCon/SFXCon.dsp | 8 ++ CPP/7zip/Bundles/SFXCon/makefile | 1 + CPP/7zip/Bundles/SFXCon/makefile.gcc | 1 + CPP/7zip/Bundles/SFXWin/SFXWin.dsp | 8 ++ CPP/7zip/Bundles/SFXWin/makefile | 1 + CPP/7zip/Crypto/7zAes.cpp | 149 ---------------------- CPP/7zip/Crypto/7zAes.h | 59 +-------- CPP/7zip/Crypto/7zKeyDerivation.cpp | 136 ++++++++++++++++++++ CPP/7zip/Crypto/7zKeyDerivation.h | 65 ++++++++++ CPP/7zip/Crypto/XChaCha20.cpp | 122 ------------------ CPP/7zip/Crypto/XChaCha20.h | 56 +------- 23 files changed, 276 insertions(+), 374 deletions(-) create mode 100644 CPP/7zip/Crypto/7zKeyDerivation.cpp create mode 100644 CPP/7zip/Crypto/7zKeyDerivation.h diff --git a/CPP/7zip/7zip_gcc.mak b/CPP/7zip/7zip_gcc.mak index a78c0fab3..4ad4db6c3 100644 --- a/CPP/7zip/7zip_gcc.mak +++ b/CPP/7zip/7zip_gcc.mak @@ -827,6 +827,8 @@ $O/7zAes.o: ../../Crypto/7zAes.cpp $(CXX) $(CXXFLAGS) $< $O/7zAesRegister.o: ../../Crypto/7zAesRegister.cpp $(CXX) $(CXXFLAGS) $< +$O/7zKeyDerivation.o: ../../Crypto/7zKeyDerivation.cpp + $(CXX) $(CXXFLAGS) $< $O/HmacSha1.o: ../../Crypto/HmacSha1.cpp $(CXX) $(CXXFLAGS) $< $O/HmacSha256.o: ../../Crypto/HmacSha256.cpp diff --git a/CPP/7zip/Bundles/Alone/Alone.dsp b/CPP/7zip/Bundles/Alone/Alone.dsp index 6512e4091..489b20c58 100644 --- a/CPP/7zip/Bundles/Alone/Alone.dsp +++ b/CPP/7zip/Bundles/Alone/Alone.dsp @@ -1941,6 +1941,14 @@ SOURCE=..\..\Crypto\7zAesRegister.cpp # End Source File # Begin Source File +SOURCE=..\..\Crypto\7zKeyDerivation.cpp +# End 
Source File +# Begin Source File + +SOURCE=..\..\Crypto\7zKeyDerivation.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\HmacSha1.cpp !IF "$(CFG)" == "Alone - Win32 Release" diff --git a/CPP/7zip/Bundles/Alone/makefile b/CPP/7zip/Bundles/Alone/makefile index 9f81f9e32..67a4d3304 100644 --- a/CPP/7zip/Bundles/Alone/makefile +++ b/CPP/7zip/Bundles/Alone/makefile @@ -186,6 +186,7 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ + $O\7zKeyDerivation.obj \ $O\HmacSha1.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ diff --git a/CPP/7zip/Bundles/Alone/makefile.gcc b/CPP/7zip/Bundles/Alone/makefile.gcc index 822d2526a..2ae3e701f 100644 --- a/CPP/7zip/Bundles/Alone/makefile.gcc +++ b/CPP/7zip/Bundles/Alone/makefile.gcc @@ -277,6 +277,7 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O/7zAes.o \ $O/7zAesRegister.o \ + $O/7zKeyDerivation.o \ $O/HmacSha1.o \ $O/MyAes.o \ $O/MyAesReg.o \ diff --git a/CPP/7zip/Bundles/Alone7z/Alone.dsp b/CPP/7zip/Bundles/Alone7z/Alone.dsp index c5c149fdd..c4ef3198f 100644 --- a/CPP/7zip/Bundles/Alone7z/Alone.dsp +++ b/CPP/7zip/Bundles/Alone7z/Alone.dsp @@ -2067,6 +2067,14 @@ SOURCE=..\..\Crypto\7zAesRegister.cpp # End Source File # Begin Source File +SOURCE=..\..\Crypto\7zKeyDerivation.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\7zKeyDerivation.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\MyAes.cpp # End Source File # Begin Source File diff --git a/CPP/7zip/Bundles/Alone7z/makefile b/CPP/7zip/Bundles/Alone7z/makefile index f0a813acb..5fc6f7a97 100644 --- a/CPP/7zip/Bundles/Alone7z/makefile +++ b/CPP/7zip/Bundles/Alone7z/makefile @@ -125,6 +125,7 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ + $O\7zKeyDerivation.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ $O\RandGen.obj \ diff --git a/CPP/7zip/Bundles/Alone7z/makefile.gcc b/CPP/7zip/Bundles/Alone7z/makefile.gcc index 179bfef5c..5a20ab34c 100644 --- a/CPP/7zip/Bundles/Alone7z/makefile.gcc +++ 
b/CPP/7zip/Bundles/Alone7z/makefile.gcc @@ -220,6 +220,7 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O/7zAes.o \ $O/7zAesRegister.o \ + $O/7zKeyDerivation.o \ $O/MyAes.o \ $O/MyAesReg.o \ $O/RandGen.o \ diff --git a/CPP/7zip/Bundles/Format7z/makefile b/CPP/7zip/Bundles/Format7z/makefile index 3d4754ca6..551d2c87b 100644 --- a/CPP/7zip/Bundles/Format7z/makefile +++ b/CPP/7zip/Bundles/Format7z/makefile @@ -107,6 +107,7 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ + $O\7zKeyDerivation.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ $O\RandGen.obj \ diff --git a/CPP/7zip/Bundles/Format7zExtract/makefile b/CPP/7zip/Bundles/Format7zExtract/makefile index 4e2ed3e80..4aeeaf412 100644 --- a/CPP/7zip/Bundles/Format7zExtract/makefile +++ b/CPP/7zip/Bundles/Format7zExtract/makefile @@ -87,6 +87,7 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ + $O\7zKeyDerivation.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ diff --git a/CPP/7zip/Bundles/Format7zF/Arc.mak b/CPP/7zip/Bundles/Format7zF/Arc.mak index 4d44bceca..9b13c3b1b 100644 --- a/CPP/7zip/Bundles/Format7zF/Arc.mak +++ b/CPP/7zip/Bundles/Format7zF/Arc.mak @@ -244,6 +244,7 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ + $O\7zKeyDerivation.obj \ $O\XChaCha20.obj \ $O\XChaCha20Register.obj \ $O\HmacSha1.obj \ diff --git a/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak b/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak index 746aaff29..b856b9e92 100644 --- a/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak +++ b/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak @@ -298,6 +298,7 @@ endif CRYPTO_OBJS = \ $O/7zAes.o \ $O/7zAesRegister.o \ + $O/7zKeyDerivation.o \ $O/HmacSha1.o \ $O/HmacSha256.o \ $O/MyAes.o \ diff --git a/CPP/7zip/Bundles/Format7zF/Format7z.dsp b/CPP/7zip/Bundles/Format7zF/Format7z.dsp index cf91341d5..55cd72d6c 100644 --- a/CPP/7zip/Bundles/Format7zF/Format7z.dsp +++ b/CPP/7zip/Bundles/Format7zF/Format7z.dsp @@ -1147,6 +1147,24 @@ SOURCE=..\..\Crypto\7zAesRegister.cpp # End Source File # 
Begin Source File +SOURCE=..\..\Crypto\7zKeyDerivation.cpp + +!IF "$(CFG)" == "7z - Win32 Release" + +# ADD CPP /O2 +# SUBTRACT CPP /YX /Yc /Yu + +!ELSEIF "$(CFG)" == "7z - Win32 Debug" + +!ENDIF + +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\7zKeyDerivation.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\HmacSha1.cpp !IF "$(CFG)" == "7z - Win32 Release" diff --git a/CPP/7zip/Bundles/SFXCon/SFXCon.dsp b/CPP/7zip/Bundles/SFXCon/SFXCon.dsp index d49093266..b6392ceb0 100644 --- a/CPP/7zip/Bundles/SFXCon/SFXCon.dsp +++ b/CPP/7zip/Bundles/SFXCon/SFXCon.dsp @@ -357,6 +357,14 @@ SOURCE=..\..\Crypto\7zAesRegister.cpp # End Source File # Begin Source File +SOURCE=..\..\Crypto\7zKeyDerivation.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\7zKeyDerivation.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\MyAes.cpp # End Source File # Begin Source File diff --git a/CPP/7zip/Bundles/SFXCon/makefile b/CPP/7zip/Bundles/SFXCon/makefile index e087c19bb..2a2e98bf5 100644 --- a/CPP/7zip/Bundles/SFXCon/makefile +++ b/CPP/7zip/Bundles/SFXCon/makefile @@ -109,6 +109,7 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ + $O\7zKeyDerivation.obj \ $O\MyAes.obj \ $O\XChaCha20.obj \ $O\XChaCha20Register.obj \ diff --git a/CPP/7zip/Bundles/SFXCon/makefile.gcc b/CPP/7zip/Bundles/SFXCon/makefile.gcc index 28abdf74f..eb4f39fc5 100644 --- a/CPP/7zip/Bundles/SFXCon/makefile.gcc +++ b/CPP/7zip/Bundles/SFXCon/makefile.gcc @@ -171,6 +171,7 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O/7zAes.o \ $O/7zAesRegister.o \ + $O/7zKeyDerivation.o \ $O/MyAes.o \ C_OBJS = \ diff --git a/CPP/7zip/Bundles/SFXWin/SFXWin.dsp b/CPP/7zip/Bundles/SFXWin/SFXWin.dsp index 18db7f834..2e7af3899 100644 --- a/CPP/7zip/Bundles/SFXWin/SFXWin.dsp +++ b/CPP/7zip/Bundles/SFXWin/SFXWin.dsp @@ -313,6 +313,14 @@ SOURCE=..\..\Crypto\7zAesRegister.cpp # End Source File # Begin Source File +SOURCE=..\..\Crypto\7zKeyDerivation.cpp +# End Source File 
+# Begin Source File + +SOURCE=..\..\Crypto\7zKeyDerivation.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\MyAes.cpp # End Source File # Begin Source File diff --git a/CPP/7zip/Bundles/SFXWin/makefile b/CPP/7zip/Bundles/SFXWin/makefile index e007f07c2..b93492195 100644 --- a/CPP/7zip/Bundles/SFXWin/makefile +++ b/CPP/7zip/Bundles/SFXWin/makefile @@ -131,6 +131,7 @@ COMPRESS_OBJS = \ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ + $O\7zKeyDerivation.obj \ $O\MyAes.obj \ $O\XChaCha20.obj \ $O\XChaCha20Register.obj \ diff --git a/CPP/7zip/Crypto/7zAes.cpp b/CPP/7zip/Crypto/7zAes.cpp index af6c708ba..aebc52d6c 100644 --- a/CPP/7zip/Crypto/7zAes.cpp +++ b/CPP/7zip/Crypto/7zAes.cpp @@ -3,10 +3,8 @@ #include "StdAfx.h" #include "../../../C/CpuArch.h" -#include "../../../C/Sha256.h" #include "../../Common/ComTry.h" -#include "../../Common/MyBuffer2.h" #ifndef Z7_ST #include "../../Windows/Synchronization.h" @@ -26,141 +24,6 @@ namespace N7z { static const unsigned k_NumCyclesPower_Supported_MAX = 24; -static bool ConstantTimeCompare(const Byte *a, const Byte *b, size_t size) -{ - volatile Byte result = 0; - for (size_t i = 0; i < size; i++) - result |= a[i] ^ b[i]; - return result == 0; -} - -bool CKeyInfo::IsEqualTo(const CKeyInfo &a) const -{ - if (SaltSize != a.SaltSize || NumCyclesPower != a.NumCyclesPower) - return false; - if (!ConstantTimeCompare(Salt, a.Salt, SaltSize)) - return false; - if (Password.Size() != a.Password.Size()) - return false; - return ConstantTimeCompare(Password, a.Password, Password.Size()); -} - -void CKeyInfo::CalcKey() -{ - if (NumCyclesPower == 0x3F) - { - unsigned pos; - for (pos = 0; pos < SaltSize; pos++) - Key[pos] = Salt[pos]; - for (unsigned i = 0; i < Password.Size() && pos < kKeySize; i++) - Key[pos++] = Password[i]; - for (; pos < kKeySize; pos++) - Key[pos] = 0; - } - else - { - const unsigned kUnrPow = 6; - const UInt32 numUnroll = (UInt32)1 << (NumCyclesPower <= kUnrPow ? 
(unsigned)NumCyclesPower : kUnrPow); - - const size_t bufSize = 8 + SaltSize + Password.Size(); - const size_t unrollSize = bufSize * numUnroll; - - // MY_ALIGN (16) - // CSha256 sha; - const size_t shaAllocSize = sizeof(CSha256) + unrollSize + bufSize * 2; - CAlignedBuffer1 sha(shaAllocSize); - Byte *buf = sha + sizeof(CSha256); - - memcpy(buf, Salt, SaltSize); - memcpy(buf + SaltSize, Password, Password.Size()); - memset(buf + bufSize - 8, 0, 8); - - Sha256_Init((CSha256 *)(void *)(Byte *)sha); - - { - { - Byte *dest = buf; - for (UInt32 i = 1; i < numUnroll; i++) - { - dest += bufSize; - memcpy(dest, buf, bufSize); - } - } - - const UInt32 numRounds = (UInt32)1 << NumCyclesPower; - UInt32 r = 0; - do - { - Byte *dest = buf + bufSize - 8; - UInt32 i = r; - r += numUnroll; - do - { - SetUi32(dest, i) i++; dest += bufSize; - // SetUi32(dest, i) i++; dest += bufSize; - } - while (i < r); - Sha256_Update((CSha256 *)(void *)(Byte *)sha, buf, unrollSize); - } - while (r < numRounds); - } - /* - UInt64 numRounds = (UInt64)1 << NumCyclesPower; - - do - { - Sha256_Update((CSha256 *)(Byte *)sha, buf, bufSize); - for (unsigned i = 0; i < 8; i++) - if (++(ctr[i]) != 0) - break; - } - while (--numRounds != 0); - */ - - Sha256_Final((CSha256 *)(void *)(Byte *)sha, Key); - memset(sha, 0, shaAllocSize); - } -} - -bool CKeyInfoCache::GetKey(CKeyInfo &key) -{ - FOR_VECTOR (i, Keys) - { - const CKeyInfo &cached = Keys[i]; - if (key.IsEqualTo(cached)) - { - for (unsigned j = 0; j < kKeySize; j++) - key.Key[j] = cached.Key[j]; - if (i != 0) - Keys.MoveToFront(i); - return true; - } - } - return false; -} - -void CKeyInfoCache::FindAndAdd(const CKeyInfo &key) -{ - FOR_VECTOR (i, Keys) - { - const CKeyInfo &cached = Keys[i]; - if (key.IsEqualTo(cached)) - { - if (i != 0) - Keys.MoveToFront(i); - return; - } - } - Add(key); -} - -void CKeyInfoCache::Add(const CKeyInfo &key) -{ - if (Keys.Size() >= Size) - Keys.DeleteBack(); - Keys.Insert(0, key); -} - static CKeyInfoCache 
g_GlobalKeyCache(32); #ifndef Z7_ST @@ -180,7 +43,6 @@ CBase::CBase(): void CBase::PrepareKey() { - // BCJ2 threads use same password. So we use long lock. MT_LOCK bool finded = false; @@ -197,15 +59,6 @@ void CBase::PrepareKey() #ifndef Z7_EXTRACT_ONLY -/* -Z7_COM7F_IMF(CEncoder::ResetSalt()) -{ - _key.SaltSize = 4; - g_RandomGenerator.Generate(_key.Salt, _key.SaltSize); - return S_OK; -} -*/ - Z7_COM7F_IMF(CEncoder::ResetInitVector()) { for (unsigned i = 0; i < sizeof(_iv); i++) @@ -240,8 +93,6 @@ Z7_COM7F_IMF(CEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) CEncoder::CEncoder() { - // _key.SaltSize = 4; g_RandomGenerator.Generate(_key.Salt, _key.SaltSize); - // _key.NumCyclesPower = 0x3F; _key.NumCyclesPower = 19; _aesFilter = new CAesCbcEncoder(kKeySize); } diff --git a/CPP/7zip/Crypto/7zAes.h b/CPP/7zip/Crypto/7zAes.h index 75cd79804..66d334138 100644 --- a/CPP/7zip/Crypto/7zAes.h +++ b/CPP/7zip/Crypto/7zAes.h @@ -3,66 +3,22 @@ #ifndef ZIP7_INC_CRYPTO_7Z_AES_H #define ZIP7_INC_CRYPTO_7Z_AES_H -#include "../../Common/MyBuffer.h" #include "../../Common/MyCom.h" -#include "../../Common/MyVector.h" #include "../ICoder.h" #include "../IPassword.h" +#include "7zKeyDerivation.h" + namespace NCrypto { namespace N7z { -const unsigned kKeySize = 32; -const unsigned kSaltSizeMax = 16; -const unsigned kIvSizeMax = 16; // AES_BLOCK_SIZE; - -class CKeyInfo -{ -public: - unsigned NumCyclesPower; - unsigned SaltSize; - Byte Salt[kSaltSizeMax]; - CByteBuffer Password; - Byte Key[kKeySize]; - - bool IsEqualTo(const CKeyInfo &a) const; - void CalcKey(); - - CKeyInfo() { ClearProps(); } - void ClearProps() - { - NumCyclesPower = 0; - SaltSize = 0; - for (unsigned i = 0; i < sizeof(Salt); i++) - Salt[i] = 0; - } - - void Wipe() - { - Password.Wipe(); - NumCyclesPower = 0; - SaltSize = 0; - Z7_memset_0_ARRAY(Salt); - Z7_memset_0_ARRAY(Key); - } +using CKeyInfo = N7zKeyDerivation::CKeyInfo; +using CKeyInfoCache = N7zKeyDerivation::CKeyInfoCache; -#ifdef 
Z7_CPP_IS_SUPPORTED_default - CKeyInfo(const CKeyInfo &) = default; -#endif - ~CKeyInfo() { Wipe(); } -}; +using N7zKeyDerivation::kKeySize; -class CKeyInfoCache -{ - unsigned Size; - CObjectVector Keys; -public: - CKeyInfoCache(unsigned size): Size(size) {} - bool GetKey(CKeyInfo &key); - void Add(const CKeyInfo &key); - void FindAndAdd(const CKeyInfo &key); -}; +const unsigned kIvSizeMax = 16; class CBase { @@ -98,17 +54,14 @@ class CBaseCoder: class CEncoder Z7_final: public CBaseCoder, public ICompressWriteCoderProperties, - // public ICryptoResetSalt, public ICryptoResetInitVector { Z7_COM_UNKNOWN_IMP_4( ICompressFilter, ICryptoSetPassword, ICompressWriteCoderProperties, - // ICryptoResetSalt, ICryptoResetInitVector) Z7_IFACE_COM7_IMP(ICompressWriteCoderProperties) - // Z7_IFACE_COM7_IMP(ICryptoResetSalt) Z7_IFACE_COM7_IMP(ICryptoResetInitVector) public: CEncoder(); diff --git a/CPP/7zip/Crypto/7zKeyDerivation.cpp b/CPP/7zip/Crypto/7zKeyDerivation.cpp new file mode 100644 index 000000000..02d8f8692 --- /dev/null +++ b/CPP/7zip/Crypto/7zKeyDerivation.cpp @@ -0,0 +1,136 @@ +// 7zKeyDerivation.cpp +// Key derivation common module for 7z format + +#include "StdAfx.h" + +#include "../../../C/CpuArch.h" +#include "../../../C/Sha256.h" + +#include "../../Common/MyBuffer2.h" + +#include "7zKeyDerivation.h" + +namespace NCrypto { +namespace N7zKeyDerivation { + +static bool ConstantTimeCompare(const Byte *a, const Byte *b, size_t size) +{ + volatile Byte result = 0; + for (size_t i = 0; i < size; i++) + result |= a[i] ^ b[i]; + return result == 0; +} + +bool CKeyInfo::IsEqualTo(const CKeyInfo &a) const +{ + if (SaltSize != a.SaltSize || NumCyclesPower != a.NumCyclesPower) + return false; + if (!ConstantTimeCompare(Salt, a.Salt, SaltSize)) + return false; + if (Password.Size() != a.Password.Size()) + return false; + return ConstantTimeCompare(Password, a.Password, Password.Size()); +} + +void CKeyInfo::CalcKey() +{ + if (NumCyclesPower == 0x3F) + { + unsigned pos; + for 
(pos = 0; pos < SaltSize; pos++) + Key[pos] = Salt[pos]; + for (unsigned i = 0; i < Password.Size() && pos < kKeySize; i++) + Key[pos++] = Password[i]; + for (; pos < kKeySize; pos++) + Key[pos] = 0; + } + else + { + const unsigned kUnrPow = 6; + const UInt32 numUnroll = (UInt32)1 << (NumCyclesPower <= kUnrPow ? (unsigned)NumCyclesPower : kUnrPow); + + const size_t bufSize = 8 + SaltSize + Password.Size(); + const size_t unrollSize = bufSize * numUnroll; + + const size_t shaAllocSize = sizeof(CSha256) + unrollSize + bufSize * 2; + CAlignedBuffer1 sha(shaAllocSize); + Byte *buf = sha + sizeof(CSha256); + + memcpy(buf, Salt, SaltSize); + memcpy(buf + SaltSize, Password, Password.Size()); + memset(buf + bufSize - 8, 0, 8); + + Sha256_Init((CSha256 *)(void *)(Byte *)sha); + + { + { + Byte *dest = buf; + for (UInt32 i = 1; i < numUnroll; i++) + { + dest += bufSize; + memcpy(dest, buf, bufSize); + } + } + + const UInt32 numRounds = (UInt32)1 << NumCyclesPower; + UInt32 r = 0; + do + { + Byte *dest = buf + bufSize - 8; + UInt32 i = r; + r += numUnroll; + do + { + SetUi32(dest, i) i++; dest += bufSize; + } + while (i < r); + Sha256_Update((CSha256 *)(void *)(Byte *)sha, buf, unrollSize); + } + while (r < numRounds); + } + + Sha256_Final((CSha256 *)(void *)(Byte *)sha, Key); + memset(sha, 0, shaAllocSize); + } +} + +bool CKeyInfoCache::GetKey(CKeyInfo &key) +{ + FOR_VECTOR (i, Keys) + { + const CKeyInfo &cached = Keys[i]; + if (key.IsEqualTo(cached)) + { + for (unsigned j = 0; j < kKeySize; j++) + key.Key[j] = cached.Key[j]; + if (i != 0) + Keys.MoveToFront(i); + return true; + } + } + return false; +} + +void CKeyInfoCache::FindAndAdd(const CKeyInfo &key) +{ + FOR_VECTOR (i, Keys) + { + const CKeyInfo &cached = Keys[i]; + if (key.IsEqualTo(cached)) + { + if (i != 0) + Keys.MoveToFront(i); + return; + } + } + Add(key); +} + +void CKeyInfoCache::Add(const CKeyInfo &key) +{ + if (Keys.Size() >= Size) + Keys.DeleteBack(); + Keys.Insert(0, key); +} + +}} diff --git 
a/CPP/7zip/Crypto/7zKeyDerivation.h b/CPP/7zip/Crypto/7zKeyDerivation.h new file mode 100644 index 000000000..1ae33638b --- /dev/null +++ b/CPP/7zip/Crypto/7zKeyDerivation.h @@ -0,0 +1,65 @@ +// 7zKeyDerivation.h +// Key derivation common module for 7z format + +#ifndef ZIP7_INC_CRYPTO_7Z_KEY_DERIVATION_H +#define ZIP7_INC_CRYPTO_7Z_KEY_DERIVATION_H + +#include "../../Common/MyBuffer.h" +#include "../../Common/MyVector.h" + +namespace NCrypto { +namespace N7zKeyDerivation { + +const unsigned kKeySize = 32; +const unsigned kSaltSizeMax = 16; + +class CKeyInfo +{ +public: + unsigned NumCyclesPower; + unsigned SaltSize; + Byte Salt[kSaltSizeMax]; + CByteBuffer Password; + Byte Key[kKeySize]; + + bool IsEqualTo(const CKeyInfo &a) const; + void CalcKey(); + + CKeyInfo() { ClearProps(); } + void ClearProps() + { + NumCyclesPower = 0; + SaltSize = 0; + for (unsigned i = 0; i < sizeof(Salt); i++) + Salt[i] = 0; + } + + void Wipe() + { + Password.Wipe(); + NumCyclesPower = 0; + SaltSize = 0; + Z7_memset_0_ARRAY(Salt); + Z7_memset_0_ARRAY(Key); + } + +#ifdef Z7_CPP_IS_SUPPORTED_default + CKeyInfo(const CKeyInfo &) = default; +#endif + ~CKeyInfo() { Wipe(); } +}; + +class CKeyInfoCache +{ + unsigned Size; + CObjectVector Keys; +public: + CKeyInfoCache(unsigned size): Size(size) {} + bool GetKey(CKeyInfo &key); + void Add(const CKeyInfo &key); + void FindAndAdd(const CKeyInfo &key); +}; + +}} + +#endif diff --git a/CPP/7zip/Crypto/XChaCha20.cpp b/CPP/7zip/Crypto/XChaCha20.cpp index 457efc95c..484cacfc3 100644 --- a/CPP/7zip/Crypto/XChaCha20.cpp +++ b/CPP/7zip/Crypto/XChaCha20.cpp @@ -3,10 +3,8 @@ #include "StdAfx.h" #include "../../../C/CpuArch.h" -#include "../../../C/Sha256.h" #include "../../Common/ComTry.h" -#include "../../Common/MyBuffer2.h" #ifndef Z7_ST #include "../../Windows/Synchronization.h" @@ -33,126 +31,6 @@ static const unsigned k_NumCyclesPower_Supported_MAX = 24; a += b; d ^= a; d = ROTL32(d, 8); \ c += d; b ^= c; b = ROTL32(b, 7); -static bool 
ConstantTimeCompare(const Byte *a, const Byte *b, size_t size) -{ - volatile Byte result = 0; - for (size_t i = 0; i < size; i++) - result |= a[i] ^ b[i]; - return result == 0; -} - -bool CKeyInfo::IsEqualTo(const CKeyInfo &a) const -{ - if (SaltSize != a.SaltSize || NumCyclesPower != a.NumCyclesPower) - return false; - if (!ConstantTimeCompare(Salt, a.Salt, SaltSize)) - return false; - if (Password.Size() != a.Password.Size()) - return false; - return ConstantTimeCompare(Password, a.Password, Password.Size()); -} - -void CKeyInfo::CalcKey() -{ - if (NumCyclesPower == 0x3F) - { - unsigned pos; - for (pos = 0; pos < SaltSize; pos++) - Key[pos] = Salt[pos]; - for (unsigned i = 0; i < Password.Size() && pos < kKeySize; i++) - Key[pos++] = Password[i]; - for (; pos < kKeySize; pos++) - Key[pos] = 0; - } - else - { - const unsigned kUnrPow = 6; - const UInt32 numUnroll = (UInt32)1 << (NumCyclesPower <= kUnrPow ? (unsigned)NumCyclesPower : kUnrPow); - - const size_t bufSize = 8 + SaltSize + Password.Size(); - const size_t unrollSize = bufSize * numUnroll; - - const size_t shaAllocSize = sizeof(CSha256) + unrollSize + bufSize * 2; - CAlignedBuffer1 sha(shaAllocSize); - Byte *buf = sha + sizeof(CSha256); - - memcpy(buf, Salt, SaltSize); - memcpy(buf + SaltSize, Password, Password.Size()); - memset(buf + bufSize - 8, 0, 8); - - Sha256_Init((CSha256 *)(void *)(Byte *)sha); - - { - { - Byte *dest = buf; - for (UInt32 i = 1; i < numUnroll; i++) - { - dest += bufSize; - memcpy(dest, buf, bufSize); - } - } - - const UInt32 numRounds = (UInt32)1 << NumCyclesPower; - UInt32 r = 0; - do - { - Byte *dest = buf + bufSize - 8; - UInt32 i = r; - r += numUnroll; - do - { - SetUi32(dest, i) i++; dest += bufSize; - } - while (i < r); - Sha256_Update((CSha256 *)(void *)(Byte *)sha, buf, unrollSize); - } - while (r < numRounds); - } - - Sha256_Final((CSha256 *)(void *)(Byte *)sha, Key); - memset(sha, 0, shaAllocSize); - } -} - -bool CKeyInfoCache::GetKey(CKeyInfo &key) -{ - FOR_VECTOR (i, 
Keys) - { - const CKeyInfo &cached = Keys[i]; - if (key.IsEqualTo(cached)) - { - for (unsigned j = 0; j < kKeySize; j++) - key.Key[j] = cached.Key[j]; - if (i != 0) - Keys.MoveToFront(i); - return true; - } - } - return false; -} - -void CKeyInfoCache::FindAndAdd(const CKeyInfo &key) -{ - FOR_VECTOR (i, Keys) - { - const CKeyInfo &cached = Keys[i]; - if (key.IsEqualTo(cached)) - { - if (i != 0) - Keys.MoveToFront(i); - return; - } - } - Add(key); -} - -void CKeyInfoCache::Add(const CKeyInfo &key) -{ - if (Keys.Size() >= Size) - Keys.DeleteBack(); - Keys.Insert(0, key); -} - static CKeyInfoCache g_GlobalKeyCache(32); #ifndef Z7_ST diff --git a/CPP/7zip/Crypto/XChaCha20.h b/CPP/7zip/Crypto/XChaCha20.h index 821aa014c..53b4ba690 100644 --- a/CPP/7zip/Crypto/XChaCha20.h +++ b/CPP/7zip/Crypto/XChaCha20.h @@ -3,66 +3,22 @@ #ifndef ZIP7_INC_CRYPTO_XCHACHA20_H #define ZIP7_INC_CRYPTO_XCHACHA20_H -#include "../../Common/MyBuffer.h" #include "../../Common/MyCom.h" -#include "../../Common/MyVector.h" #include "../ICoder.h" #include "../IPassword.h" +#include "7zKeyDerivation.h" + namespace NCrypto { namespace NXChaCha20 { -const unsigned kKeySize = 32; -const unsigned kNonceSize = 24; -const unsigned kSaltSizeMax = 16; - -class CKeyInfo -{ -public: - unsigned NumCyclesPower; - unsigned SaltSize; - Byte Salt[kSaltSizeMax]; - CByteBuffer Password; - Byte Key[kKeySize]; - - bool IsEqualTo(const CKeyInfo &a) const; - void CalcKey(); - - CKeyInfo() { ClearProps(); } - void ClearProps() - { - NumCyclesPower = 0; - SaltSize = 0; - for (unsigned i = 0; i < sizeof(Salt); i++) - Salt[i] = 0; - } - - void Wipe() - { - Password.Wipe(); - NumCyclesPower = 0; - SaltSize = 0; - Z7_memset_0_ARRAY(Salt); - Z7_memset_0_ARRAY(Key); - } +using CKeyInfo = N7zKeyDerivation::CKeyInfo; +using CKeyInfoCache = N7zKeyDerivation::CKeyInfoCache; -#ifdef Z7_CPP_IS_SUPPORTED_default - CKeyInfo(const CKeyInfo &) = default; -#endif - ~CKeyInfo() { Wipe(); } -}; +using N7zKeyDerivation::kKeySize; -class 
CKeyInfoCache -{ - unsigned Size; - CObjectVector Keys; -public: - CKeyInfoCache(unsigned size): Size(size) {} - bool GetKey(CKeyInfo &key); - void Add(const CKeyInfo &key); - void FindAndAdd(const CKeyInfo &key); -}; +const unsigned kNonceSize = 24; class CBase { From e882d5a7a2986e66693d260287610af1129af006 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Sat, 23 May 2026 12:01:05 +0800 Subject: [PATCH 05/18] Add hardware acceleration --- CPP/7zip/Crypto/XChaCha20.cpp | 546 ++++++++++++++++++++++++++++++++++ 1 file changed, 546 insertions(+) diff --git a/CPP/7zip/Crypto/XChaCha20.cpp b/CPP/7zip/Crypto/XChaCha20.cpp index 484cacfc3..3fdae04c8 100644 --- a/CPP/7zip/Crypto/XChaCha20.cpp +++ b/CPP/7zip/Crypto/XChaCha20.cpp @@ -18,6 +18,14 @@ #include "RandGen.h" #endif +#ifdef MY_CPU_X86_OR_AMD64 +#if defined(_MSC_VER) +#include +#elif defined(__GNUC__) +#include +#endif +#endif + namespace NCrypto { namespace NXChaCha20 { @@ -203,6 +211,487 @@ void CBaseCoder::Chacha20Block(Byte *output, const Byte *key, const Byte *nonce, SetUi32(output + 60, x15) } +#ifdef MY_CPU_X86_OR_AMD64 + +#ifdef MY_CPU_SSE2 + +namespace { + +template +Z7_FORCE_INLINE __m128i RotateLeft_SSE2(const __m128i val) +{ + return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32 - R)); +} + +template <> +Z7_FORCE_INLINE __m128i RotateLeft_SSE2<8>(const __m128i val) +{ +#ifdef __SSSE3__ + const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); + return _mm_shuffle_epi8(val, mask); +#else + return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 24)); +#endif +} + +template <> +Z7_FORCE_INLINE __m128i RotateLeft_SSE2<16>(const __m128i val) +{ +#ifdef __SSSE3__ + const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); + return _mm_shuffle_epi8(val, mask); +#else + return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 16)); +#endif +} + +#define SSE2_QUARTERROUND(a, b, c, d) \ + a = _mm_add_epi32(a, 
b); \ + d = _mm_xor_si128(d, a); \ + d = RotateLeft_SSE2<16>(d); \ + c = _mm_add_epi32(c, d); \ + b = _mm_xor_si128(b, c); \ + b = RotateLeft_SSE2<12>(b); \ + a = _mm_add_epi32(a, b); \ + d = _mm_xor_si128(d, a); \ + d = RotateLeft_SSE2<8>(d); \ + c = _mm_add_epi32(c, d); \ + b = _mm_xor_si128(b, c); \ + b = RotateLeft_SSE2<7>(b); + +Z7_NO_INLINE void ChaCha20_OperateKeystream_SSE2( + const UInt32 *state, + const Byte *input, + Byte *output) +{ + const __m128i state0 = _mm_loadu_si128((const __m128i *)(state + 0)); + const __m128i state1 = _mm_loadu_si128((const __m128i *)(state + 4)); + const __m128i state2 = _mm_loadu_si128((const __m128i *)(state + 8)); + const __m128i state3 = _mm_loadu_si128((const __m128i *)(state + 12)); + + __m128i r0_0 = state0; + __m128i r0_1 = state1; + __m128i r0_2 = state2; + __m128i r0_3 = state3; + + __m128i r1_0 = state0; + __m128i r1_1 = state1; + __m128i r1_2 = state2; + __m128i r1_3 = _mm_add_epi64(state3, _mm_set_epi32(0, 0, 0, 1)); + + __m128i r2_0 = state0; + __m128i r2_1 = state1; + __m128i r2_2 = state2; + __m128i r2_3 = _mm_add_epi64(state3, _mm_set_epi32(0, 0, 0, 2)); + + __m128i r3_0 = state0; + __m128i r3_1 = state1; + __m128i r3_2 = state2; + __m128i r3_3 = _mm_add_epi64(state3, _mm_set_epi32(0, 0, 0, 3)); + + for (int i = 0; i < 10; i++) + { + SSE2_QUARTERROUND(r0_0, r0_1, r0_2, r0_3); + SSE2_QUARTERROUND(r1_0, r1_1, r1_2, r1_3); + SSE2_QUARTERROUND(r2_0, r2_1, r2_2, r2_3); + SSE2_QUARTERROUND(r3_0, r3_1, r3_2, r3_3); + + r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); + r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); + r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1)); + r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); + r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1)); + r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); + r2_3 = _mm_shuffle_epi32(r2_3, 
_MM_SHUFFLE(2, 1, 0, 3)); + + r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1)); + r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); + r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3)); + + SSE2_QUARTERROUND(r0_0, r0_1, r0_2, r0_3); + SSE2_QUARTERROUND(r1_0, r1_1, r1_2, r1_3); + SSE2_QUARTERROUND(r2_0, r2_1, r2_2, r2_3); + SSE2_QUARTERROUND(r3_0, r3_1, r3_2, r3_3); + + r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); + r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); + r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3)); + r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); + r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3)); + r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); + r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3)); + r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); + r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); + } + + r0_0 = _mm_add_epi32(r0_0, state0); + r0_1 = _mm_add_epi32(r0_1, state1); + r0_2 = _mm_add_epi32(r0_2, state2); + r0_3 = _mm_add_epi32(r0_3, state3); + + r1_0 = _mm_add_epi32(r1_0, state0); + r1_1 = _mm_add_epi32(r1_1, state1); + r1_2 = _mm_add_epi32(r1_2, state2); + r1_3 = _mm_add_epi32(r1_3, state3); + r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1)); + + r2_0 = _mm_add_epi32(r2_0, state0); + r2_1 = _mm_add_epi32(r2_1, state1); + r2_2 = _mm_add_epi32(r2_2, state2); + r2_3 = _mm_add_epi32(r2_3, state3); + r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2)); + + r3_0 = _mm_add_epi32(r3_0, state0); + r3_1 = _mm_add_epi32(r3_1, state1); + r3_2 = _mm_add_epi32(r3_2, state2); + r3_3 = _mm_add_epi32(r3_3, state3); + r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); + + if (input) + { + r0_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 0*16)), r0_0); + r0_1 
= _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 1*16)), r0_1); + r0_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 2*16)), r0_2); + r0_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 3*16)), r0_3); + } + + _mm_storeu_si128((__m128i *)(output + 0*16), r0_0); + _mm_storeu_si128((__m128i *)(output + 1*16), r0_1); + _mm_storeu_si128((__m128i *)(output + 2*16), r0_2); + _mm_storeu_si128((__m128i *)(output + 3*16), r0_3); + + if (input) + { + r1_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 4*16)), r1_0); + r1_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 5*16)), r1_1); + r1_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 6*16)), r1_2); + r1_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 7*16)), r1_3); + } + + _mm_storeu_si128((__m128i *)(output + 4*16), r1_0); + _mm_storeu_si128((__m128i *)(output + 5*16), r1_1); + _mm_storeu_si128((__m128i *)(output + 6*16), r1_2); + _mm_storeu_si128((__m128i *)(output + 7*16), r1_3); + + if (input) + { + r2_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 8*16)), r2_0); + r2_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 9*16)), r2_1); + r2_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 10*16)), r2_2); + r2_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 11*16)), r2_3); + } + + _mm_storeu_si128((__m128i *)(output + 8*16), r2_0); + _mm_storeu_si128((__m128i *)(output + 9*16), r2_1); + _mm_storeu_si128((__m128i *)(output + 10*16), r2_2); + _mm_storeu_si128((__m128i *)(output + 11*16), r2_3); + + if (input) + { + r3_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 12*16)), r3_0); + r3_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 13*16)), r3_1); + r3_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 14*16)), r3_2); + r3_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 15*16)), r3_3); + } + + _mm_storeu_si128((__m128i *)(output + 12*16), r3_0); + 
_mm_storeu_si128((__m128i *)(output + 13*16), r3_1); + _mm_storeu_si128((__m128i *)(output + 14*16), r3_2); + _mm_storeu_si128((__m128i *)(output + 15*16), r3_3); +} + +#ifdef MY_CPU_AMD64 + +template +Z7_FORCE_INLINE __m256i RotateLeft_AVX2(const __m256i val) +{ + return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32 - R)); +} + +template <> +Z7_FORCE_INLINE __m256i RotateLeft_AVX2<8>(const __m256i val) +{ + const __m256i mask = _mm256_set_epi8( + 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3, + 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); + return _mm256_shuffle_epi8(val, mask); +} + +template <> +Z7_FORCE_INLINE __m256i RotateLeft_AVX2<16>(const __m256i val) +{ + const __m256i mask = _mm256_set_epi8( + 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, + 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); + return _mm256_shuffle_epi8(val, mask); +} + +#define AVX2_QUARTERROUND(a, b, c, d) \ + a = _mm256_add_epi32(a, b); \ + d = _mm256_xor_si256(d, a); \ + d = RotateLeft_AVX2<16>(d); \ + c = _mm256_add_epi32(c, d); \ + b = _mm256_xor_si256(b, c); \ + b = RotateLeft_AVX2<12>(b); \ + a = _mm256_add_epi32(a, b); \ + d = _mm256_xor_si256(d, a); \ + d = RotateLeft_AVX2<8>(d); \ + c = _mm256_add_epi32(c, d); \ + b = _mm256_xor_si256(b, c); \ + b = RotateLeft_AVX2<7>(b); + +Z7_NO_INLINE void ChaCha20_OperateKeystream_AVX2( + const UInt32 *state, + const Byte *input, + Byte *output) +{ + const __m256i state0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 0))); + const __m256i state1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 4))); + const __m256i state2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 8))); + const __m256i state3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 12))); + + const UInt32 C = 0xFFFFFFFFu - state[12]; + const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4); + const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5); + 
const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6); + const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7); + + __m256i X0_0 = state0; + __m256i X0_1 = state1; + __m256i X0_2 = state2; + __m256i X0_3 = _mm256_add_epi32(state3, CTR0); + + __m256i X1_0 = state0; + __m256i X1_1 = state1; + __m256i X1_2 = state2; + __m256i X1_3 = _mm256_add_epi32(state3, CTR1); + + __m256i X2_0 = state0; + __m256i X2_1 = state1; + __m256i X2_2 = state2; + __m256i X2_3 = _mm256_add_epi32(state3, CTR2); + + __m256i X3_0 = state0; + __m256i X3_1 = state1; + __m256i X3_2 = state2; + __m256i X3_3 = _mm256_add_epi32(state3, CTR3); + + for (int i = 0; i < 10; i++) + { + AVX2_QUARTERROUND(X0_0, X0_1, X0_2, X0_3); + AVX2_QUARTERROUND(X1_0, X1_1, X1_2, X1_3); + AVX2_QUARTERROUND(X2_0, X2_1, X2_2, X2_3); + AVX2_QUARTERROUND(X3_0, X3_1, X3_2, X3_3); + + X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1)); + X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2)); + X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1)); + X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2)); + X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1)); + X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2)); + X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1)); + X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2)); + X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3)); + + AVX2_QUARTERROUND(X0_0, X0_1, X0_2, X0_3); + AVX2_QUARTERROUND(X1_0, X1_1, X1_2, X1_3); + AVX2_QUARTERROUND(X2_0, X2_1, X2_2, X2_3); + AVX2_QUARTERROUND(X3_0, X3_1, X3_2, X3_3); + + X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3)); + X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2)); + X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X1_1 = 
_mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3)); + X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2)); + X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3)); + X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2)); + X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3)); + X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2)); + X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1)); + } + + X0_0 = _mm256_add_epi32(X0_0, state0); + X0_1 = _mm256_add_epi32(X0_1, state1); + X0_2 = _mm256_add_epi32(X0_2, state2); + X0_3 = _mm256_add_epi32(X0_3, state3); + X0_3 = _mm256_add_epi32(X0_3, CTR0); + + X1_0 = _mm256_add_epi32(X1_0, state0); + X1_1 = _mm256_add_epi32(X1_1, state1); + X1_2 = _mm256_add_epi32(X1_2, state2); + X1_3 = _mm256_add_epi32(X1_3, state3); + X1_3 = _mm256_add_epi32(X1_3, CTR1); + + X2_0 = _mm256_add_epi32(X2_0, state0); + X2_1 = _mm256_add_epi32(X2_1, state1); + X2_2 = _mm256_add_epi32(X2_2, state2); + X2_3 = _mm256_add_epi32(X2_3, state3); + X2_3 = _mm256_add_epi32(X2_3, CTR2); + + X3_0 = _mm256_add_epi32(X3_0, state0); + X3_1 = _mm256_add_epi32(X3_1, state1); + X3_2 = _mm256_add_epi32(X3_2, state2); + X3_3 = _mm256_add_epi32(X3_3, state3); + X3_3 = _mm256_add_epi32(X3_3, CTR3); + + if (input) + { + _mm256_storeu_si256((__m256i *)(output + 0*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 0*32)))); + _mm256_storeu_si256((__m256i *)(output + 1*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 1*32)))); + _mm256_storeu_si256((__m256i *)(output + 2*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 2*32)))); + _mm256_storeu_si256((__m256i *)(output + 3*32), + 
_mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 3*32)))); + } + else + { + _mm256_storeu_si256((__m256i *)(output + 0*32), + _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 1*32), + _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 2*32), + _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 3*32), + _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4))); + } + + if (input) + { + _mm256_storeu_si256((__m256i *)(output + 4*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 4*32)))); + _mm256_storeu_si256((__m256i *)(output + 5*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 5*32)))); + _mm256_storeu_si256((__m256i *)(output + 6*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 6*32)))); + _mm256_storeu_si256((__m256i *)(output + 7*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 7*32)))); + } + else + { + _mm256_storeu_si256((__m256i *)(output + 4*32), + _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 5*32), + _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 6*32), + _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 7*32), + _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4))); + } + + if (input) + { + _mm256_storeu_si256((__m256i *)(output + 8*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 8*32)))); + _mm256_storeu_si256((__m256i *)(output + 9*32), + 
_mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 9*32)))); + _mm256_storeu_si256((__m256i *)(output + 10*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 10*32)))); + _mm256_storeu_si256((__m256i *)(output + 11*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 11*32)))); + } + else + { + _mm256_storeu_si256((__m256i *)(output + 8*32), + _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 9*32), + _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 10*32), + _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 11*32), + _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4))); + } + + if (input) + { + _mm256_storeu_si256((__m256i *)(output + 12*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 12*32)))); + _mm256_storeu_si256((__m256i *)(output + 13*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 13*32)))); + _mm256_storeu_si256((__m256i *)(output + 14*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 14*32)))); + _mm256_storeu_si256((__m256i *)(output + 15*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 15*32)))); + } + else + { + _mm256_storeu_si256((__m256i *)(output + 12*32), + _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 13*32), + _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 14*32), + _mm256_permute2x128_si256(X3_0, X3_1, 0 
+ (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 15*32), + _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4))); + } + + _mm256_zeroupper(); +} + +#endif + +} + +static bool g_SSE2Enabled = false; +static bool g_AVX2Enabled = false; +static bool g_SIMDInitialized = false; + +static void InitSIMD() +{ + if (g_SIMDInitialized) + return; + g_SIMDInitialized = true; + +#ifdef MY_CPU_AMD64 + g_SSE2Enabled = true; + g_AVX2Enabled = CPU_IsSupported_AVX2() != 0; +#elif defined(MY_CPU_X86) + g_SSE2Enabled = CPU_IsSupported_SSE2() != 0; +#endif +} + +#endif + +#endif + void CBaseCoder::ProcessData(Byte *data, UInt32 size) { if (!_derivedKeyValid) @@ -210,6 +699,63 @@ void CBaseCoder::ProcessData(Byte *data, UInt32 size) DeriveKey(); } +#ifdef MY_CPU_X86_OR_AMD64 +#ifdef MY_CPU_SSE2 + InitSIMD(); + + if (size >= kBlockSize * 4) + { + UInt32 state[16]; + state[0] = GetUi32(kSigma); + state[1] = GetUi32(kSigma + 4); + state[2] = GetUi32(kSigma + 8); + state[3] = GetUi32(kSigma + 12); + state[4] = GetUi32(_derivedKey); + state[5] = GetUi32(_derivedKey + 4); + state[6] = GetUi32(_derivedKey + 8); + state[7] = GetUi32(_derivedKey + 12); + state[8] = GetUi32(_derivedKey + 16); + state[9] = GetUi32(_derivedKey + 20); + state[10] = GetUi32(_derivedKey + 24); + state[11] = GetUi32(_derivedKey + 28); + state[12] = (UInt32)(_counter & 0xFFFFFFFF); + state[13] = (UInt32)(_counter >> 32); + state[14] = GetUi32(_nonce + 16); + state[15] = GetUi32(_nonce + 20); + +#ifdef MY_CPU_AMD64 + if (g_AVX2Enabled && size >= kBlockSize * 8) + { + while (size >= kBlockSize * 8) + { + ChaCha20_OperateKeystream_AVX2(state, data, data); + state[12] += 8; + if (state[12] < 8) + state[13]++; + data += kBlockSize * 8; + size -= kBlockSize * 8; + } + } +#endif + + if (g_SSE2Enabled && size >= kBlockSize * 4) + { + while (size >= kBlockSize * 4) + { + ChaCha20_OperateKeystream_SSE2(state, data, data); + state[12] += 4; + if (state[12] < 4) + state[13]++; + data += kBlockSize * 4; + size -= kBlockSize * 
4; + } + } + + _counter = (UInt64)state[13] << 32 | state[12]; + } +#endif +#endif + while (size > 0) { if (_blockPos == 0 || _blockPos >= kBlockSize) From 44a13e7f4a9335da7152719b03e3884ecd656fc6 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Wed, 27 May 2026 08:28:24 +0800 Subject: [PATCH 06/18] Add XChaCha20-Poly1305, reuse hardware acceleration, and add additional data validation parameters. --- CPP/7zip/Archive/7z/7zDecode.cpp | 22 +- CPP/7zip/Archive/7z/7zHandler.cpp | 12 +- CPP/7zip/Archive/7z/7zHandlerOut.cpp | 2 + CPP/7zip/Archive/7z/7zHeader.h | 1 + CPP/7zip/Archive/7z/7zItem.h | 2 +- CPP/7zip/Bundles/Format7zF/Arc.mak | 2 + CPP/7zip/Bundles/SFXCon/makefile | 2 + CPP/7zip/Bundles/SFXWin/makefile | 2 + CPP/7zip/Crypto/ChaCha20Simd.h | 500 ++++++++++++++ CPP/7zip/Crypto/XChaCha20.cpp | 535 +-------------- CPP/7zip/Crypto/XChaCha20.h | 8 +- CPP/7zip/Crypto/XChaCha20Poly1305.cpp | 627 ++++++++++++++++++ CPP/7zip/Crypto/XChaCha20Poly1305.h | 124 ++++ CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp | 17 + CPP/7zip/IPassword.h | 4 + CPP/7zip/UI/GUI/CompressDialog.cpp | 6 +- 16 files changed, 1344 insertions(+), 522 deletions(-) create mode 100644 CPP/7zip/Crypto/ChaCha20Simd.h create mode 100644 CPP/7zip/Crypto/XChaCha20Poly1305.cpp create mode 100644 CPP/7zip/Crypto/XChaCha20Poly1305.h create mode 100644 CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp diff --git a/CPP/7zip/Archive/7z/7zDecode.cpp b/CPP/7zip/Archive/7z/7zDecode.cpp index 50cbff621..056f2d0d4 100644 --- a/CPP/7zip/Archive/7z/7zDecode.cpp +++ b/CPP/7zip/Archive/7z/7zDecode.cpp @@ -584,9 +584,27 @@ HRESULT CDecoder::Decode( progress2 = new CDecProgress(compressProgress); ISequentialOutStream *outStreamPointer = outStream; - return _mixer->Code(inStreamPointers, &outStreamPointer, + const HRESULT codeResult = _mixer->Code(inStreamPointers, &outStreamPointer, progress2 ? 
(ICompressProgressInfo *)progress2 : compressProgress, dataAfterEnd_Error); + + if (codeResult == S_OK) + { + for (i = 0; i < folderInfo.Coders.Size(); i++) + { + Z7_DECL_CMyComPtr_QI_FROM( + ICryptoAuthVerify, + authVerify, _mixer->GetCoder(i).GetUnknown()) + if (authVerify) + { + Int32 authResult = 0; + RINOK(authVerify->CryptoAuthVerify(&authResult)) + if (authResult != 0) + return E_FAIL; + } + } + } + return codeResult; } #ifdef USE_MIXER_ST @@ -596,4 +614,4 @@ HRESULT CDecoder::Decode( #endif } -}} +}} \ No newline at end of file diff --git a/CPP/7zip/Archive/7z/7zHandler.cpp b/CPP/7zip/Archive/7z/7zHandler.cpp index 8053fad9b..0da02f57e 100644 --- a/CPP/7zip/Archive/7z/7zHandler.cpp +++ b/CPP/7zip/Archive/7z/7zHandler.cpp @@ -307,7 +307,7 @@ bool CHandler::IsFolderEncrypted(CNum folderIndex) const for (unsigned j = 0; j < idSize; j++) id64 = ((id64 << 8) | longID[j]); inByte.SkipDataNoCheck(idSize); - if (id64 == k_AES || id64 == k_XCHACHA20) + if (id64 == k_AES || id64 == k_XCHACHA20 || id64 == k_XCHACHA20_POLY1305) return true; if ((mainByte & 0x20) != 0) inByte.SkipDataNoCheck(inByte.ReadNum()); @@ -515,6 +515,16 @@ HRESULT CHandler::SetMethodToProp(CNum folderIndex, PROPVARIANT *prop) const ConvertUInt32ToString(numCyclesPower, s); } } + else if (id == k_XCHACHA20_POLY1305) + { + name = "XChaCha20-Poly1305"; + if (propsSize >= 1) + { + const Byte firstByte = props[0]; + const UInt32 numCyclesPower = firstByte & 0x3F; + ConvertUInt32ToString(numCyclesPower, s); + } + } } if (name) diff --git a/CPP/7zip/Archive/7z/7zHandlerOut.cpp b/CPP/7zip/Archive/7z/7zHandlerOut.cpp index 68e898541..3747b294c 100644 --- a/CPP/7zip/Archive/7z/7zHandlerOut.cpp +++ b/CPP/7zip/Archive/7z/7zHandlerOut.cpp @@ -1035,6 +1035,8 @@ HRESULT COutHandler::SetProperty(const wchar_t *nameSpec, const PROPVARIANT &val _encryptionMethodId = k_AES; else if (StringsAreEqualNoCase_Ascii(m, "XChaCha20")) _encryptionMethodId = k_XCHACHA20; + else if (StringsAreEqualNoCase_Ascii(m, 
"XChaCha20Poly1305") || StringsAreEqualNoCase_Ascii(m, "XChaCha20-Poly1305")) + _encryptionMethodId = k_XCHACHA20_POLY1305; else return E_INVALIDARG; return S_OK; diff --git a/CPP/7zip/Archive/7z/7zHeader.h b/CPP/7zip/Archive/7z/7zHeader.h index 449bd2039..fffb327ef 100644 --- a/CPP/7zip/Archive/7z/7zHeader.h +++ b/CPP/7zip/Archive/7z/7zHeader.h @@ -124,6 +124,7 @@ const UInt32 k_SPARC = 0x3030805; const UInt32 k_AES = 0x6F10701; const UInt32 k_XCHACHA20 = 0x6F10702; +const UInt32 k_XCHACHA20_POLY1305 = 0x6F10703; // const UInt32 k_ZSTD = 0x4015D; // winzip zstd // 0x4F71101, 7z-zstd diff --git a/CPP/7zip/Archive/7z/7zItem.h b/CPP/7zip/Archive/7z/7zItem.h index edd101bb9..65eba5c3a 100644 --- a/CPP/7zip/Archive/7z/7zItem.h +++ b/CPP/7zip/Archive/7z/7zItem.h @@ -85,7 +85,7 @@ struct CFolder FOR_VECTOR(i, Coders) { CMethodId id = Coders[i].MethodID; - if (id == k_AES || id == k_XCHACHA20) + if (id == k_AES || id == k_XCHACHA20 || id == k_XCHACHA20_POLY1305) return true; } return false; diff --git a/CPP/7zip/Bundles/Format7zF/Arc.mak b/CPP/7zip/Bundles/Format7zF/Arc.mak index 9b13c3b1b..1da304762 100644 --- a/CPP/7zip/Bundles/Format7zF/Arc.mak +++ b/CPP/7zip/Bundles/Format7zF/Arc.mak @@ -247,6 +247,8 @@ CRYPTO_OBJS = \ $O\7zKeyDerivation.obj \ $O\XChaCha20.obj \ $O\XChaCha20Register.obj \ + $O\XChaCha20Poly1305.obj \ + $O\XChaCha20Poly1305Register.obj \ $O\HmacSha1.obj \ $O\HmacSha256.obj \ $O\MyAes.obj \ diff --git a/CPP/7zip/Bundles/SFXCon/makefile b/CPP/7zip/Bundles/SFXCon/makefile index 2a2e98bf5..5e98a11f5 100644 --- a/CPP/7zip/Bundles/SFXCon/makefile +++ b/CPP/7zip/Bundles/SFXCon/makefile @@ -113,6 +113,8 @@ CRYPTO_OBJS = \ $O\MyAes.obj \ $O\XChaCha20.obj \ $O\XChaCha20Register.obj \ + $O\XChaCha20Poly1305.obj \ + $O\XChaCha20Poly1305Register.obj \ C_OBJS = \ $O\7zStream.obj \ diff --git a/CPP/7zip/Bundles/SFXWin/makefile b/CPP/7zip/Bundles/SFXWin/makefile index b93492195..9b9738e0f 100644 --- a/CPP/7zip/Bundles/SFXWin/makefile +++ 
b/CPP/7zip/Bundles/SFXWin/makefile @@ -135,6 +135,8 @@ CRYPTO_OBJS = \ $O\MyAes.obj \ $O\XChaCha20.obj \ $O\XChaCha20Register.obj \ + $O\XChaCha20Poly1305.obj \ + $O\XChaCha20Poly1305Register.obj \ C_OBJS = \ $O\7zStream.obj \ diff --git a/CPP/7zip/Crypto/ChaCha20Simd.h b/CPP/7zip/Crypto/ChaCha20Simd.h new file mode 100644 index 000000000..736cff901 --- /dev/null +++ b/CPP/7zip/Crypto/ChaCha20Simd.h @@ -0,0 +1,500 @@ +// ChaCha20Simd.h +// Shared SIMD (SSE2/AVX2) acceleration code for ChaCha20/XChaCha20 + +#ifndef ZIP7_CRYPTO_CHACHA20_SIMD_H +#define ZIP7_CRYPTO_CHACHA20_SIMD_H + +#ifdef MY_CPU_X86_OR_AMD64 +#if defined(_MSC_VER) +#include +#elif defined(__GNUC__) +#include +#endif +#endif + +#ifdef MY_CPU_X86_OR_AMD64 + +#ifdef MY_CPU_SSE2 + +static const Byte kSigma[16] = { + 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', '2', '-', 'b', 'y', 't', 'e', ' ', 'k' +}; + +namespace { + +template +Z7_FORCE_INLINE __m128i RotateLeft_SSE2(const __m128i val) +{ + return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32 - R)); +} + +template <> +Z7_FORCE_INLINE __m128i RotateLeft_SSE2<8>(const __m128i val) +{ +#ifdef __SSSE3__ + const __m128i mask = _mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); + return _mm_shuffle_epi8(val, mask); +#else + return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 24)); +#endif +} + +template <> +Z7_FORCE_INLINE __m128i RotateLeft_SSE2<16>(const __m128i val) +{ +#ifdef __SSSE3__ + const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); + return _mm_shuffle_epi8(val, mask); +#else + return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 16)); +#endif +} + +#define SSE2_QUARTERROUND(a, b, c, d) \ + a = _mm_add_epi32(a, b); \ + d = _mm_xor_si128(d, a); \ + d = RotateLeft_SSE2<16>(d); \ + c = _mm_add_epi32(c, d); \ + b = _mm_xor_si128(b, c); \ + b = RotateLeft_SSE2<12>(b); \ + a = _mm_add_epi32(a, b); \ + d = _mm_xor_si128(d, a); \ + d = RotateLeft_SSE2<8>(d); \ + c = _mm_add_epi32(c, d); \ + b 
= _mm_xor_si128(b, c); \ + b = RotateLeft_SSE2<7>(b); + +Z7_NO_INLINE void ChaCha20_OperateKeystream_SSE2( + const UInt32 *state, + const Byte *input, + Byte *output) +{ + const __m128i state0 = _mm_loadu_si128((const __m128i *)(state + 0)); + const __m128i state1 = _mm_loadu_si128((const __m128i *)(state + 4)); + const __m128i state2 = _mm_loadu_si128((const __m128i *)(state + 8)); + const __m128i state3 = _mm_loadu_si128((const __m128i *)(state + 12)); + + __m128i r0_0 = state0; + __m128i r0_1 = state1; + __m128i r0_2 = state2; + __m128i r0_3 = state3; + + __m128i r1_0 = state0; + __m128i r1_1 = state1; + __m128i r1_2 = state2; + __m128i r1_3 = _mm_add_epi64(state3, _mm_set_epi32(0, 0, 0, 1)); + + __m128i r2_0 = state0; + __m128i r2_1 = state1; + __m128i r2_2 = state2; + __m128i r2_3 = _mm_add_epi64(state3, _mm_set_epi32(0, 0, 0, 2)); + + __m128i r3_0 = state0; + __m128i r3_1 = state1; + __m128i r3_2 = state2; + __m128i r3_3 = _mm_add_epi64(state3, _mm_set_epi32(0, 0, 0, 3)); + + for (int i = 0; i < 10; i++) + { + SSE2_QUARTERROUND(r0_0, r0_1, r0_2, r0_3); + SSE2_QUARTERROUND(r1_0, r1_1, r1_2, r1_3); + SSE2_QUARTERROUND(r2_0, r2_1, r2_2, r2_3); + SSE2_QUARTERROUND(r3_0, r3_1, r3_2, r3_3); + + r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); + r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); + r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1)); + r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); + r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1)); + r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); + r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3)); + + r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1)); + r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); + r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3)); + + SSE2_QUARTERROUND(r0_0, r0_1, r0_2, r0_3); + SSE2_QUARTERROUND(r1_0, 
r1_1, r1_2, r1_3); + SSE2_QUARTERROUND(r2_0, r2_1, r2_2, r2_3); + SSE2_QUARTERROUND(r3_0, r3_1, r3_2, r3_3); + + r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); + r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); + r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3)); + r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); + r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3)); + r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); + r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1)); + + r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3)); + r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); + r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); + } + + r0_0 = _mm_add_epi32(r0_0, state0); + r0_1 = _mm_add_epi32(r0_1, state1); + r0_2 = _mm_add_epi32(r0_2, state2); + r0_3 = _mm_add_epi32(r0_3, state3); + + r1_0 = _mm_add_epi32(r1_0, state0); + r1_1 = _mm_add_epi32(r1_1, state1); + r1_2 = _mm_add_epi32(r1_2, state2); + r1_3 = _mm_add_epi32(r1_3, state3); + r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1)); + + r2_0 = _mm_add_epi32(r2_0, state0); + r2_1 = _mm_add_epi32(r2_1, state1); + r2_2 = _mm_add_epi32(r2_2, state2); + r2_3 = _mm_add_epi32(r2_3, state3); + r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2)); + + r3_0 = _mm_add_epi32(r3_0, state0); + r3_1 = _mm_add_epi32(r3_1, state1); + r3_2 = _mm_add_epi32(r3_2, state2); + r3_3 = _mm_add_epi32(r3_3, state3); + r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); + + if (input) + { + r0_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 0*16)), r0_0); + r0_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 1*16)), r0_1); + r0_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 2*16)), r0_2); + r0_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 3*16)), r0_3); + } + + _mm_storeu_si128((__m128i *)(output 
+ 0*16), r0_0); + _mm_storeu_si128((__m128i *)(output + 1*16), r0_1); + _mm_storeu_si128((__m128i *)(output + 2*16), r0_2); + _mm_storeu_si128((__m128i *)(output + 3*16), r0_3); + + if (input) + { + r1_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 4*16)), r1_0); + r1_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 5*16)), r1_1); + r1_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 6*16)), r1_2); + r1_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 7*16)), r1_3); + } + + _mm_storeu_si128((__m128i *)(output + 4*16), r1_0); + _mm_storeu_si128((__m128i *)(output + 5*16), r1_1); + _mm_storeu_si128((__m128i *)(output + 6*16), r1_2); + _mm_storeu_si128((__m128i *)(output + 7*16), r1_3); + + if (input) + { + r2_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 8*16)), r2_0); + r2_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 9*16)), r2_1); + r2_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 10*16)), r2_2); + r2_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 11*16)), r2_3); + } + + _mm_storeu_si128((__m128i *)(output + 8*16), r2_0); + _mm_storeu_si128((__m128i *)(output + 9*16), r2_1); + _mm_storeu_si128((__m128i *)(output + 10*16), r2_2); + _mm_storeu_si128((__m128i *)(output + 11*16), r2_3); + + if (input) + { + r3_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 12*16)), r3_0); + r3_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 13*16)), r3_1); + r3_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 14*16)), r3_2); + r3_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 15*16)), r3_3); + } + + _mm_storeu_si128((__m128i *)(output + 12*16), r3_0); + _mm_storeu_si128((__m128i *)(output + 13*16), r3_1); + _mm_storeu_si128((__m128i *)(output + 14*16), r3_2); + _mm_storeu_si128((__m128i *)(output + 15*16), r3_3); +} + +#ifdef MY_CPU_AMD64 + +template +Z7_FORCE_INLINE __m256i RotateLeft_AVX2(const __m256i val) +{ + return 
_mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32 - R)); +} + +template <> +Z7_FORCE_INLINE __m256i RotateLeft_AVX2<8>(const __m256i val) +{ + const __m256i mask = _mm256_set_epi8( + 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3, + 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); + return _mm256_shuffle_epi8(val, mask); +} + +template <> +Z7_FORCE_INLINE __m256i RotateLeft_AVX2<16>(const __m256i val) +{ + const __m256i mask = _mm256_set_epi8( + 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, + 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); + return _mm256_shuffle_epi8(val, mask); +} + +#define AVX2_QUARTERROUND(a, b, c, d) \ + a = _mm256_add_epi32(a, b); \ + d = _mm256_xor_si256(d, a); \ + d = RotateLeft_AVX2<16>(d); \ + c = _mm256_add_epi32(c, d); \ + b = _mm256_xor_si256(b, c); \ + b = RotateLeft_AVX2<12>(b); \ + a = _mm256_add_epi32(a, b); \ + d = _mm256_xor_si256(d, a); \ + d = RotateLeft_AVX2<8>(d); \ + c = _mm256_add_epi32(c, d); \ + b = _mm256_xor_si256(b, c); \ + b = RotateLeft_AVX2<7>(b); + +Z7_NO_INLINE void ChaCha20_OperateKeystream_AVX2( + const UInt32 *state, + const Byte *input, + Byte *output) +{ + const __m256i state0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 0))); + const __m256i state1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 4))); + const __m256i state2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 8))); + const __m256i state3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 12))); + + const UInt32 C = 0xFFFFFFFFu - state[12]; + const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4); + const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5); + const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6); + const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7); + + __m256i X0_0 = state0; + __m256i X0_1 = state1; + __m256i X0_2 = state2; + __m256i X0_3 = _mm256_add_epi32(state3, CTR0); + + 
__m256i X1_0 = state0; + __m256i X1_1 = state1; + __m256i X1_2 = state2; + __m256i X1_3 = _mm256_add_epi32(state3, CTR1); + + __m256i X2_0 = state0; + __m256i X2_1 = state1; + __m256i X2_2 = state2; + __m256i X2_3 = _mm256_add_epi32(state3, CTR2); + + __m256i X3_0 = state0; + __m256i X3_1 = state1; + __m256i X3_2 = state2; + __m256i X3_3 = _mm256_add_epi32(state3, CTR3); + + for (int i = 0; i < 10; i++) + { + AVX2_QUARTERROUND(X0_0, X0_1, X0_2, X0_3); + AVX2_QUARTERROUND(X1_0, X1_1, X1_2, X1_3); + AVX2_QUARTERROUND(X2_0, X2_1, X2_2, X2_3); + AVX2_QUARTERROUND(X3_0, X3_1, X3_2, X3_3); + + X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1)); + X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2)); + X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1)); + X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2)); + X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1)); + X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2)); + X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3)); + + X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1)); + X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2)); + X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3)); + + AVX2_QUARTERROUND(X0_0, X0_1, X0_2, X0_3); + AVX2_QUARTERROUND(X1_0, X1_1, X1_2, X1_3); + AVX2_QUARTERROUND(X2_0, X2_1, X2_2, X2_3); + AVX2_QUARTERROUND(X3_0, X3_1, X3_2, X3_3); + + X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3)); + X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2)); + X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3)); + X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2)); + X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3)); + X2_2 = 
_mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2)); + X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1)); + + X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3)); + X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2)); + X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1)); + } + + X0_0 = _mm256_add_epi32(X0_0, state0); + X0_1 = _mm256_add_epi32(X0_1, state1); + X0_2 = _mm256_add_epi32(X0_2, state2); + X0_3 = _mm256_add_epi32(X0_3, state3); + X0_3 = _mm256_add_epi32(X0_3, CTR0); + + X1_0 = _mm256_add_epi32(X1_0, state0); + X1_1 = _mm256_add_epi32(X1_1, state1); + X1_2 = _mm256_add_epi32(X1_2, state2); + X1_3 = _mm256_add_epi32(X1_3, state3); + X1_3 = _mm256_add_epi32(X1_3, CTR1); + + X2_0 = _mm256_add_epi32(X2_0, state0); + X2_1 = _mm256_add_epi32(X2_1, state1); + X2_2 = _mm256_add_epi32(X2_2, state2); + X2_3 = _mm256_add_epi32(X2_3, state3); + X2_3 = _mm256_add_epi32(X2_3, CTR2); + + X3_0 = _mm256_add_epi32(X3_0, state0); + X3_1 = _mm256_add_epi32(X3_1, state1); + X3_2 = _mm256_add_epi32(X3_2, state2); + X3_3 = _mm256_add_epi32(X3_3, state3); + X3_3 = _mm256_add_epi32(X3_3, CTR3); + + if (input) + { + _mm256_storeu_si256((__m256i *)(output + 0*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 0*32)))); + _mm256_storeu_si256((__m256i *)(output + 1*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 1*32)))); + _mm256_storeu_si256((__m256i *)(output + 2*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 2*32)))); + _mm256_storeu_si256((__m256i *)(output + 3*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 3*32)))); + } + else + { + _mm256_storeu_si256((__m256i *)(output + 0*32), + _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4))); + 
_mm256_storeu_si256((__m256i *)(output + 1*32), + _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 2*32), + _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 3*32), + _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4))); + } + + if (input) + { + _mm256_storeu_si256((__m256i *)(output + 4*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 4*32)))); + _mm256_storeu_si256((__m256i *)(output + 5*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 5*32)))); + _mm256_storeu_si256((__m256i *)(output + 6*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 6*32)))); + _mm256_storeu_si256((__m256i *)(output + 7*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 7*32)))); + } + else + { + _mm256_storeu_si256((__m256i *)(output + 4*32), + _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 5*32), + _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 6*32), + _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4))); + _mm256_storeu_si256((__m256i *)(output + 7*32), + _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4))); + } + + if (input) + { + _mm256_storeu_si256((__m256i *)(output + 8*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 8*32)))); + _mm256_storeu_si256((__m256i *)(output + 9*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 9*32)))); + _mm256_storeu_si256((__m256i *)(output + 10*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)), + 
_mm256_loadu_si256((const __m256i *)(input + 10*32)))); + _mm256_storeu_si256((__m256i *)(output + 11*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 11*32)))); + } + else + { + _mm256_storeu_si256((__m256i *)(output + 8*32), + _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 9*32), + _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 10*32), + _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 11*32), + _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4))); + } + + if (input) + { + _mm256_storeu_si256((__m256i *)(output + 12*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 12*32)))); + _mm256_storeu_si256((__m256i *)(output + 13*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 13*32)))); + _mm256_storeu_si256((__m256i *)(output + 14*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 14*32)))); + _mm256_storeu_si256((__m256i *)(output + 15*32), + _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)), + _mm256_loadu_si256((const __m256i *)(input + 15*32)))); + } + else + { + _mm256_storeu_si256((__m256i *)(output + 12*32), + _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 13*32), + _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 14*32), + _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4))); + _mm256_storeu_si256((__m256i *)(output + 15*32), + _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4))); + } + + _mm256_zeroupper(); +} + +#endif + +} + +static bool g_SSE2Enabled = false; +static bool g_AVX2Enabled = false; +static 
bool g_SIMDInitialized = false; + +static void InitSIMD() +{ + if (g_SIMDInitialized) + return; + g_SIMDInitialized = true; + +#ifdef MY_CPU_AMD64 + g_SSE2Enabled = true; + g_AVX2Enabled = CPU_IsSupported_AVX2() != 0; +#elif defined(MY_CPU_X86) + g_SSE2Enabled = CPU_IsSupported_SSE2() != 0; +#endif +} + +#endif + +#endif + +#endif \ No newline at end of file diff --git a/CPP/7zip/Crypto/XChaCha20.cpp b/CPP/7zip/Crypto/XChaCha20.cpp index 3fdae04c8..ad18c85a1 100644 --- a/CPP/7zip/Crypto/XChaCha20.cpp +++ b/CPP/7zip/Crypto/XChaCha20.cpp @@ -18,19 +18,11 @@ #include "RandGen.h" #endif -#ifdef MY_CPU_X86_OR_AMD64 -#if defined(_MSC_VER) -#include -#elif defined(__GNUC__) -#include -#endif -#endif +#include "ChaCha20Simd.h" namespace NCrypto { namespace NXChaCha20 { -static const unsigned k_NumCyclesPower_Supported_MAX = 24; - #define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n)))) #define QUARTERROUND(a, b, c, d) \ @@ -58,7 +50,7 @@ CBase::CBase(): void CBaseCoder::DeriveKey() { - HChaCha20Block(_derivedKey, _key.Key, _nonce); + XHChaCha20Block_Core(_derivedKey, _key.Key, _nonce); _derivedKeyValid = true; } @@ -78,11 +70,7 @@ void CBase::PrepareKey() g_GlobalKeyCache.FindAndAdd(_key); } -static const Byte kSigma[16] = { - 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', '2', '-', 'b', 'y', 't', 'e', ' ', 'k' -}; - -void CBaseCoder::HChaCha20Block(Byte *output, const Byte *key, const Byte *nonce) +void XHChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce) { UInt32 x0, x1, x2, x3, x4, x5, x6, x7; UInt32 x8, x9, x10, x11, x12, x13, x14, x15; @@ -107,14 +95,14 @@ void CBaseCoder::HChaCha20Block(Byte *output, const Byte *key, const Byte *nonce x15 = GetUi32(nonce + 12); #define DOUBLE_ROUND \ - QUARTERROUND(x0, x4, x8, x12); \ - QUARTERROUND(x1, x5, x9, x13); \ - QUARTERROUND(x2, x6, x10, x14); \ - QUARTERROUND(x3, x7, x11, x15); \ - QUARTERROUND(x0, x5, x10, x15); \ - QUARTERROUND(x1, x6, x11, x12); \ - QUARTERROUND(x2, x7, x8, x13); \ - QUARTERROUND(x3, x4, x9, 
x14); + QUARTERROUND(x0, x4, x8, x12) \ + QUARTERROUND(x1, x5, x9, x13) \ + QUARTERROUND(x2, x6, x10, x14) \ + QUARTERROUND(x3, x7, x11, x15) \ + QUARTERROUND(x0, x5, x10, x15) \ + QUARTERROUND(x1, x6, x11, x12) \ + QUARTERROUND(x2, x7, x8, x13) \ + QUARTERROUND(x3, x4, x9, x14) DOUBLE_ROUND; DOUBLE_ROUND; DOUBLE_ROUND; DOUBLE_ROUND; @@ -134,7 +122,7 @@ void CBaseCoder::HChaCha20Block(Byte *output, const Byte *key, const Byte *nonce SetUi32(output + 28, x15); } -void CBaseCoder::Chacha20Block(Byte *output, const Byte *key, const Byte *nonce, UInt64 counter) +void XChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce, UInt64 counter) { UInt32 x0, x1, x2, x3, x4, x5, x6, x7; UInt32 x8, x9, x10, x11, x12, x13, x14, x15; @@ -159,14 +147,14 @@ void CBaseCoder::Chacha20Block(Byte *output, const Byte *key, const Byte *nonce, x15 = GetUi32(nonce + 4); #define DOUBLE_ROUND \ - QUARTERROUND(x0, x4, x8, x12); \ - QUARTERROUND(x1, x5, x9, x13); \ - QUARTERROUND(x2, x6, x10, x14); \ - QUARTERROUND(x3, x7, x11, x15); \ - QUARTERROUND(x0, x5, x10, x15); \ - QUARTERROUND(x1, x6, x11, x12); \ - QUARTERROUND(x2, x7, x8, x13); \ - QUARTERROUND(x3, x4, x9, x14); + QUARTERROUND(x0, x4, x8, x12) \ + QUARTERROUND(x1, x5, x9, x13) \ + QUARTERROUND(x2, x6, x10, x14) \ + QUARTERROUND(x3, x7, x11, x15) \ + QUARTERROUND(x0, x5, x10, x15) \ + QUARTERROUND(x1, x6, x11, x12) \ + QUARTERROUND(x2, x7, x8, x13) \ + QUARTERROUND(x3, x4, x9, x14) DOUBLE_ROUND; DOUBLE_ROUND; DOUBLE_ROUND; DOUBLE_ROUND; @@ -211,487 +199,6 @@ void CBaseCoder::Chacha20Block(Byte *output, const Byte *key, const Byte *nonce, SetUi32(output + 60, x15) } -#ifdef MY_CPU_X86_OR_AMD64 - -#ifdef MY_CPU_SSE2 - -namespace { - -template -Z7_FORCE_INLINE __m128i RotateLeft_SSE2(const __m128i val) -{ - return _mm_or_si128(_mm_slli_epi32(val, R), _mm_srli_epi32(val, 32 - R)); -} - -template <> -Z7_FORCE_INLINE __m128i RotateLeft_SSE2<8>(const __m128i val) -{ -#ifdef __SSSE3__ - const __m128i mask = 
_mm_set_epi8(14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); - return _mm_shuffle_epi8(val, mask); -#else - return _mm_or_si128(_mm_slli_epi32(val, 8), _mm_srli_epi32(val, 24)); -#endif -} - -template <> -Z7_FORCE_INLINE __m128i RotateLeft_SSE2<16>(const __m128i val) -{ -#ifdef __SSSE3__ - const __m128i mask = _mm_set_epi8(13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); - return _mm_shuffle_epi8(val, mask); -#else - return _mm_or_si128(_mm_slli_epi32(val, 16), _mm_srli_epi32(val, 16)); -#endif -} - -#define SSE2_QUARTERROUND(a, b, c, d) \ - a = _mm_add_epi32(a, b); \ - d = _mm_xor_si128(d, a); \ - d = RotateLeft_SSE2<16>(d); \ - c = _mm_add_epi32(c, d); \ - b = _mm_xor_si128(b, c); \ - b = RotateLeft_SSE2<12>(b); \ - a = _mm_add_epi32(a, b); \ - d = _mm_xor_si128(d, a); \ - d = RotateLeft_SSE2<8>(d); \ - c = _mm_add_epi32(c, d); \ - b = _mm_xor_si128(b, c); \ - b = RotateLeft_SSE2<7>(b); - -Z7_NO_INLINE void ChaCha20_OperateKeystream_SSE2( - const UInt32 *state, - const Byte *input, - Byte *output) -{ - const __m128i state0 = _mm_loadu_si128((const __m128i *)(state + 0)); - const __m128i state1 = _mm_loadu_si128((const __m128i *)(state + 4)); - const __m128i state2 = _mm_loadu_si128((const __m128i *)(state + 8)); - const __m128i state3 = _mm_loadu_si128((const __m128i *)(state + 12)); - - __m128i r0_0 = state0; - __m128i r0_1 = state1; - __m128i r0_2 = state2; - __m128i r0_3 = state3; - - __m128i r1_0 = state0; - __m128i r1_1 = state1; - __m128i r1_2 = state2; - __m128i r1_3 = _mm_add_epi64(state3, _mm_set_epi32(0, 0, 0, 1)); - - __m128i r2_0 = state0; - __m128i r2_1 = state1; - __m128i r2_2 = state2; - __m128i r2_3 = _mm_add_epi64(state3, _mm_set_epi32(0, 0, 0, 2)); - - __m128i r3_0 = state0; - __m128i r3_1 = state1; - __m128i r3_2 = state2; - __m128i r3_3 = _mm_add_epi64(state3, _mm_set_epi32(0, 0, 0, 3)); - - for (int i = 0; i < 10; i++) - { - SSE2_QUARTERROUND(r0_0, r0_1, r0_2, r0_3); - SSE2_QUARTERROUND(r1_0, r1_1, r1_2, r1_3); - SSE2_QUARTERROUND(r2_0, r2_1, r2_2, 
r2_3); - SSE2_QUARTERROUND(r3_0, r3_1, r3_2, r3_3); - - r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1)); - r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); - r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1)); - r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); - r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1)); - r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); - r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3)); - - r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1)); - r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); - r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3)); - - SSE2_QUARTERROUND(r0_0, r0_1, r0_2, r0_3); - SSE2_QUARTERROUND(r1_0, r1_1, r1_2, r1_3); - SSE2_QUARTERROUND(r2_0, r2_1, r2_2, r2_3); - SSE2_QUARTERROUND(r3_0, r3_1, r3_2, r3_3); - - r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3)); - r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2)); - r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1)); - - r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3)); - r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2)); - r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1)); - - r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3)); - r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2)); - r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1)); - - r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3)); - r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2)); - r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1)); - } - - r0_0 = _mm_add_epi32(r0_0, state0); - r0_1 = _mm_add_epi32(r0_1, state1); - r0_2 = _mm_add_epi32(r0_2, state2); - r0_3 = _mm_add_epi32(r0_3, state3); - - r1_0 = _mm_add_epi32(r1_0, state0); - r1_1 = _mm_add_epi32(r1_1, state1); - r1_2 = _mm_add_epi32(r1_2, state2); - r1_3 = _mm_add_epi32(r1_3, state3); - r1_3 = 
_mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1)); - - r2_0 = _mm_add_epi32(r2_0, state0); - r2_1 = _mm_add_epi32(r2_1, state1); - r2_2 = _mm_add_epi32(r2_2, state2); - r2_3 = _mm_add_epi32(r2_3, state3); - r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2)); - - r3_0 = _mm_add_epi32(r3_0, state0); - r3_1 = _mm_add_epi32(r3_1, state1); - r3_2 = _mm_add_epi32(r3_2, state2); - r3_3 = _mm_add_epi32(r3_3, state3); - r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3)); - - if (input) - { - r0_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 0*16)), r0_0); - r0_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 1*16)), r0_1); - r0_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 2*16)), r0_2); - r0_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 3*16)), r0_3); - } - - _mm_storeu_si128((__m128i *)(output + 0*16), r0_0); - _mm_storeu_si128((__m128i *)(output + 1*16), r0_1); - _mm_storeu_si128((__m128i *)(output + 2*16), r0_2); - _mm_storeu_si128((__m128i *)(output + 3*16), r0_3); - - if (input) - { - r1_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 4*16)), r1_0); - r1_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 5*16)), r1_1); - r1_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 6*16)), r1_2); - r1_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 7*16)), r1_3); - } - - _mm_storeu_si128((__m128i *)(output + 4*16), r1_0); - _mm_storeu_si128((__m128i *)(output + 5*16), r1_1); - _mm_storeu_si128((__m128i *)(output + 6*16), r1_2); - _mm_storeu_si128((__m128i *)(output + 7*16), r1_3); - - if (input) - { - r2_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 8*16)), r2_0); - r2_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 9*16)), r2_1); - r2_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 10*16)), r2_2); - r2_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 11*16)), r2_3); - } - - _mm_storeu_si128((__m128i *)(output + 8*16), r2_0); - 
_mm_storeu_si128((__m128i *)(output + 9*16), r2_1); - _mm_storeu_si128((__m128i *)(output + 10*16), r2_2); - _mm_storeu_si128((__m128i *)(output + 11*16), r2_3); - - if (input) - { - r3_0 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 12*16)), r3_0); - r3_1 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 13*16)), r3_1); - r3_2 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 14*16)), r3_2); - r3_3 = _mm_xor_si128(_mm_loadu_si128((const __m128i *)(input + 15*16)), r3_3); - } - - _mm_storeu_si128((__m128i *)(output + 12*16), r3_0); - _mm_storeu_si128((__m128i *)(output + 13*16), r3_1); - _mm_storeu_si128((__m128i *)(output + 14*16), r3_2); - _mm_storeu_si128((__m128i *)(output + 15*16), r3_3); -} - -#ifdef MY_CPU_AMD64 - -template -Z7_FORCE_INLINE __m256i RotateLeft_AVX2(const __m256i val) -{ - return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32 - R)); -} - -template <> -Z7_FORCE_INLINE __m256i RotateLeft_AVX2<8>(const __m256i val) -{ - const __m256i mask = _mm256_set_epi8( - 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3, - 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3); - return _mm256_shuffle_epi8(val, mask); -} - -template <> -Z7_FORCE_INLINE __m256i RotateLeft_AVX2<16>(const __m256i val) -{ - const __m256i mask = _mm256_set_epi8( - 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, - 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2); - return _mm256_shuffle_epi8(val, mask); -} - -#define AVX2_QUARTERROUND(a, b, c, d) \ - a = _mm256_add_epi32(a, b); \ - d = _mm256_xor_si256(d, a); \ - d = RotateLeft_AVX2<16>(d); \ - c = _mm256_add_epi32(c, d); \ - b = _mm256_xor_si256(b, c); \ - b = RotateLeft_AVX2<12>(b); \ - a = _mm256_add_epi32(a, b); \ - d = _mm256_xor_si256(d, a); \ - d = RotateLeft_AVX2<8>(d); \ - c = _mm256_add_epi32(c, d); \ - b = _mm256_xor_si256(b, c); \ - b = RotateLeft_AVX2<7>(b); - -Z7_NO_INLINE void ChaCha20_OperateKeystream_AVX2( - const UInt32 *state, - const Byte *input, - Byte *output) -{ - const __m256i state0 = 
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 0))); - const __m256i state1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 4))); - const __m256i state2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 8))); - const __m256i state3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)(state + 12))); - - const UInt32 C = 0xFFFFFFFFu - state[12]; - const __m256i CTR0 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, C < 4, 4); - const __m256i CTR1 = _mm256_set_epi32(0, 0, C < 1, 1, 0, 0, C < 5, 5); - const __m256i CTR2 = _mm256_set_epi32(0, 0, C < 2, 2, 0, 0, C < 6, 6); - const __m256i CTR3 = _mm256_set_epi32(0, 0, C < 3, 3, 0, 0, C < 7, 7); - - __m256i X0_0 = state0; - __m256i X0_1 = state1; - __m256i X0_2 = state2; - __m256i X0_3 = _mm256_add_epi32(state3, CTR0); - - __m256i X1_0 = state0; - __m256i X1_1 = state1; - __m256i X1_2 = state2; - __m256i X1_3 = _mm256_add_epi32(state3, CTR1); - - __m256i X2_0 = state0; - __m256i X2_1 = state1; - __m256i X2_2 = state2; - __m256i X2_3 = _mm256_add_epi32(state3, CTR2); - - __m256i X3_0 = state0; - __m256i X3_1 = state1; - __m256i X3_2 = state2; - __m256i X3_3 = _mm256_add_epi32(state3, CTR3); - - for (int i = 0; i < 10; i++) - { - AVX2_QUARTERROUND(X0_0, X0_1, X0_2, X0_3); - AVX2_QUARTERROUND(X1_0, X1_1, X1_2, X1_3); - AVX2_QUARTERROUND(X2_0, X2_1, X2_2, X2_3); - AVX2_QUARTERROUND(X3_0, X3_1, X3_2, X3_3); - - X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(0, 3, 2, 1)); - X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2)); - X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(2, 1, 0, 3)); - - X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(0, 3, 2, 1)); - X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2)); - X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(2, 1, 0, 3)); - - X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(0, 3, 2, 1)); - X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2)); - X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(2, 1, 0, 3)); 
- - X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(0, 3, 2, 1)); - X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2)); - X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(2, 1, 0, 3)); - - AVX2_QUARTERROUND(X0_0, X0_1, X0_2, X0_3); - AVX2_QUARTERROUND(X1_0, X1_1, X1_2, X1_3); - AVX2_QUARTERROUND(X2_0, X2_1, X2_2, X2_3); - AVX2_QUARTERROUND(X3_0, X3_1, X3_2, X3_3); - - X0_1 = _mm256_shuffle_epi32(X0_1, _MM_SHUFFLE(2, 1, 0, 3)); - X0_2 = _mm256_shuffle_epi32(X0_2, _MM_SHUFFLE(1, 0, 3, 2)); - X0_3 = _mm256_shuffle_epi32(X0_3, _MM_SHUFFLE(0, 3, 2, 1)); - - X1_1 = _mm256_shuffle_epi32(X1_1, _MM_SHUFFLE(2, 1, 0, 3)); - X1_2 = _mm256_shuffle_epi32(X1_2, _MM_SHUFFLE(1, 0, 3, 2)); - X1_3 = _mm256_shuffle_epi32(X1_3, _MM_SHUFFLE(0, 3, 2, 1)); - - X2_1 = _mm256_shuffle_epi32(X2_1, _MM_SHUFFLE(2, 1, 0, 3)); - X2_2 = _mm256_shuffle_epi32(X2_2, _MM_SHUFFLE(1, 0, 3, 2)); - X2_3 = _mm256_shuffle_epi32(X2_3, _MM_SHUFFLE(0, 3, 2, 1)); - - X3_1 = _mm256_shuffle_epi32(X3_1, _MM_SHUFFLE(2, 1, 0, 3)); - X3_2 = _mm256_shuffle_epi32(X3_2, _MM_SHUFFLE(1, 0, 3, 2)); - X3_3 = _mm256_shuffle_epi32(X3_3, _MM_SHUFFLE(0, 3, 2, 1)); - } - - X0_0 = _mm256_add_epi32(X0_0, state0); - X0_1 = _mm256_add_epi32(X0_1, state1); - X0_2 = _mm256_add_epi32(X0_2, state2); - X0_3 = _mm256_add_epi32(X0_3, state3); - X0_3 = _mm256_add_epi32(X0_3, CTR0); - - X1_0 = _mm256_add_epi32(X1_0, state0); - X1_1 = _mm256_add_epi32(X1_1, state1); - X1_2 = _mm256_add_epi32(X1_2, state2); - X1_3 = _mm256_add_epi32(X1_3, state3); - X1_3 = _mm256_add_epi32(X1_3, CTR1); - - X2_0 = _mm256_add_epi32(X2_0, state0); - X2_1 = _mm256_add_epi32(X2_1, state1); - X2_2 = _mm256_add_epi32(X2_2, state2); - X2_3 = _mm256_add_epi32(X2_3, state3); - X2_3 = _mm256_add_epi32(X2_3, CTR2); - - X3_0 = _mm256_add_epi32(X3_0, state0); - X3_1 = _mm256_add_epi32(X3_1, state1); - X3_2 = _mm256_add_epi32(X3_2, state2); - X3_3 = _mm256_add_epi32(X3_3, state3); - X3_3 = _mm256_add_epi32(X3_3, CTR3); - - if (input) - { - _mm256_storeu_si256((__m256i 
*)(output + 0*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 0*32)))); - _mm256_storeu_si256((__m256i *)(output + 1*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 1*32)))); - _mm256_storeu_si256((__m256i *)(output + 2*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 2*32)))); - _mm256_storeu_si256((__m256i *)(output + 3*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 3*32)))); - } - else - { - _mm256_storeu_si256((__m256i *)(output + 0*32), - _mm256_permute2x128_si256(X0_0, X0_1, 1 + (3 << 4))); - _mm256_storeu_si256((__m256i *)(output + 1*32), - _mm256_permute2x128_si256(X0_2, X0_3, 1 + (3 << 4))); - _mm256_storeu_si256((__m256i *)(output + 2*32), - _mm256_permute2x128_si256(X1_0, X1_1, 1 + (3 << 4))); - _mm256_storeu_si256((__m256i *)(output + 3*32), - _mm256_permute2x128_si256(X1_2, X1_3, 1 + (3 << 4))); - } - - if (input) - { - _mm256_storeu_si256((__m256i *)(output + 4*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 4*32)))); - _mm256_storeu_si256((__m256i *)(output + 5*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 5*32)))); - _mm256_storeu_si256((__m256i *)(output + 6*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 6*32)))); - _mm256_storeu_si256((__m256i *)(output + 7*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 7*32)))); - } - else - { - _mm256_storeu_si256((__m256i *)(output + 4*32), - _mm256_permute2x128_si256(X2_0, X2_1, 1 + (3 << 4))); - 
_mm256_storeu_si256((__m256i *)(output + 5*32), - _mm256_permute2x128_si256(X2_2, X2_3, 1 + (3 << 4))); - _mm256_storeu_si256((__m256i *)(output + 6*32), - _mm256_permute2x128_si256(X3_0, X3_1, 1 + (3 << 4))); - _mm256_storeu_si256((__m256i *)(output + 7*32), - _mm256_permute2x128_si256(X3_2, X3_3, 1 + (3 << 4))); - } - - if (input) - { - _mm256_storeu_si256((__m256i *)(output + 8*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 8*32)))); - _mm256_storeu_si256((__m256i *)(output + 9*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 9*32)))); - _mm256_storeu_si256((__m256i *)(output + 10*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 10*32)))); - _mm256_storeu_si256((__m256i *)(output + 11*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 11*32)))); - } - else - { - _mm256_storeu_si256((__m256i *)(output + 8*32), - _mm256_permute2x128_si256(X0_0, X0_1, 0 + (2 << 4))); - _mm256_storeu_si256((__m256i *)(output + 9*32), - _mm256_permute2x128_si256(X0_2, X0_3, 0 + (2 << 4))); - _mm256_storeu_si256((__m256i *)(output + 10*32), - _mm256_permute2x128_si256(X1_0, X1_1, 0 + (2 << 4))); - _mm256_storeu_si256((__m256i *)(output + 11*32), - _mm256_permute2x128_si256(X1_2, X1_3, 0 + (2 << 4))); - } - - if (input) - { - _mm256_storeu_si256((__m256i *)(output + 12*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 12*32)))); - _mm256_storeu_si256((__m256i *)(output + 13*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 13*32)))); - _mm256_storeu_si256((__m256i *)(output + 14*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 
<< 4)), - _mm256_loadu_si256((const __m256i *)(input + 14*32)))); - _mm256_storeu_si256((__m256i *)(output + 15*32), - _mm256_xor_si256(_mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4)), - _mm256_loadu_si256((const __m256i *)(input + 15*32)))); - } - else - { - _mm256_storeu_si256((__m256i *)(output + 12*32), - _mm256_permute2x128_si256(X2_0, X2_1, 0 + (2 << 4))); - _mm256_storeu_si256((__m256i *)(output + 13*32), - _mm256_permute2x128_si256(X2_2, X2_3, 0 + (2 << 4))); - _mm256_storeu_si256((__m256i *)(output + 14*32), - _mm256_permute2x128_si256(X3_0, X3_1, 0 + (2 << 4))); - _mm256_storeu_si256((__m256i *)(output + 15*32), - _mm256_permute2x128_si256(X3_2, X3_3, 0 + (2 << 4))); - } - - _mm256_zeroupper(); -} - -#endif - -} - -static bool g_SSE2Enabled = false; -static bool g_AVX2Enabled = false; -static bool g_SIMDInitialized = false; - -static void InitSIMD() -{ - if (g_SIMDInitialized) - return; - g_SIMDInitialized = true; - -#ifdef MY_CPU_AMD64 - g_SSE2Enabled = true; - g_AVX2Enabled = CPU_IsSupported_AVX2() != 0; -#elif defined(MY_CPU_X86) - g_SSE2Enabled = CPU_IsSupported_SSE2() != 0; -#endif -} - -#endif - -#endif - void CBaseCoder::ProcessData(Byte *data, UInt32 size) { if (!_derivedKeyValid) @@ -760,7 +267,7 @@ void CBaseCoder::ProcessData(Byte *data, UInt32 size) { if (_blockPos == 0 || _blockPos >= kBlockSize) { - Chacha20Block(_block, _derivedKey, _nonce + 16, _counter); + XChaCha20Block_Core(_block, _derivedKey, _nonce + 16, _counter); _blockPos = 0; _counter++; if (_counter == 0) diff --git a/CPP/7zip/Crypto/XChaCha20.h b/CPP/7zip/Crypto/XChaCha20.h index 53b4ba690..ecddb9761 100644 --- a/CPP/7zip/Crypto/XChaCha20.h +++ b/CPP/7zip/Crypto/XChaCha20.h @@ -19,6 +19,10 @@ using CKeyInfoCache = N7zKeyDerivation::CKeyInfoCache; using N7zKeyDerivation::kKeySize; const unsigned kNonceSize = 24; +const unsigned k_NumCyclesPower_Supported_MAX = 24; + +void XChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce, UInt64 counter); +void 
XHChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce); class CBase { @@ -57,10 +61,8 @@ class CBaseCoder: Byte _derivedKey[kKeySize]; bool _derivedKeyValid; - void HChaCha20Block(Byte *output, const Byte *key, const Byte *nonce); - void Chacha20Block(Byte *output, const Byte *key, const Byte *nonce, UInt64 counter); void ProcessData(Byte *data, UInt32 size); - void DeriveKey(); + virtual void DeriveKey(); }; #ifndef Z7_EXTRACT_ONLY diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp new file mode 100644 index 000000000..b6061dfc5 --- /dev/null +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp @@ -0,0 +1,627 @@ +// XChaCha20Poly1305.cpp + +#include "StdAfx.h" + +#include "../../../C/CpuArch.h" + +#include "../../Common/ComTry.h" + +#ifndef Z7_ST +#include "../../Windows/Synchronization.h" +#endif + +#include "../Common/StreamUtils.h" + +#include "XChaCha20Poly1305.h" + +#ifndef Z7_EXTRACT_ONLY +#include "RandGen.h" +#endif + +#include "ChaCha20Simd.h" + +namespace NCrypto { +namespace NXChaCha20Poly1305 { + +void CBaseCoder::ComputePolyKey() +{ + Byte polyBlock[64]; + NXChaCha20::XChaCha20Block_Core(polyBlock, _derivedKey, _nonce + 16, 0); + memcpy(_polyKey, polyBlock, kPolyKeySize); + Z7_memset_0_ARRAY(polyBlock); +} + +CPoly1305::CPoly1305() +{ + Reset(); +} + +void CPoly1305::Reset() +{ + memset(_r, 0, sizeof(_r)); + memset(_s, 0, sizeof(_s)); + memset(_h, 0, sizeof(_h)); + memset(_block, 0, sizeof(_block)); + _blockPos = 0; + _totalLen = 0; + memset(_aadBlock, 0, sizeof(_aadBlock)); + _aadBlockPos = 0; + _aadLen = 0; + _finalized = false; +} + +void CPoly1305::SetKey(const Byte *key) +{ + memcpy(_r, key, 16); + _r[3] &= 15; + _r[7] &= 15; + _r[11] &= 15; + _r[15] &= 15; + _r[4] &= 252; + _r[8] &= 252; + _r[12] &= 252; + + memcpy(_s, key + 16, 16); + + memset(_h, 0, sizeof(_h)); + _blockPos = 0; + _totalLen = 0; + _aadBlockPos = 0; + _aadLen = 0; + _finalized = false; +} + +static void Poly1305_ProcessBlock(Byte 
h[16], const Byte r[16], const Byte block[16], bool hasHighBit) +{ + UInt64 d[8] = { 0 }; + UInt64 c; + + for (unsigned i = 0; i < 3; i++) + { + d[i] = (UInt64)GetUi32(h + i * 4); + } + d[3] = ((UInt64)GetUi32(h + 12)) & 0x3FFFFFF; + + for (unsigned i = 0; i < 3; i++) + { + UInt64 t = GetUi32(block + i * 4); + d[i] += t; + } + d[3] += ((UInt64)GetUi32(block + 12)) & 0x3FFFFFF; + + if (hasHighBit) + d[3] |= 0x1000000; + + UInt64 rr[4]; + rr[0] = GetUi32(r) & 0x3FFFFFF; + rr[1] = ((UInt64)GetUi32(r + 3) >> 2) & 0x3FFFF03; + rr[2] = ((UInt64)GetUi32(r + 6) >> 4) & 0x3FFC0FF; + rr[3] = ((UInt64)GetUi32(r + 9) >> 6) & 0x3F03FFF; + + UInt64 m[8] = { 0 }; + for (unsigned i = 0; i < 4; i++) + { + for (unsigned j = 0; j < 4; j++) + { + m[i + j] += d[i] * rr[j]; + } + } + + c = m[0] >> 26; m[0] &= 0x3FFFFFF; + m[1] += c; + c = m[1] >> 26; m[1] &= 0x3FFFFFF; + m[2] += c; c = m[2] >> 26; m[2] &= 0x3FFFFFF; + m[3] += c; c = m[3] >> 26; m[3] &= 0x3FFFFFF; + m[4] += c; c = m[4] >> 26; m[4] &= 0x3FFFFFF; + m[5] += c; c = m[5] >> 26; m[5] &= 0x3FFFFFF; + m[6] += c; c = m[6] >> 26; m[6] &= 0x3FFFFFF; + m[7] += c; + + c = (m[3] >> 26); m[3] &= 0x3FFFFFF; + m[4] += c; + + m[0] += (m[4] >> 26) * 5; m[4] &= 0x3FFFFFF; + m[1] += (m[5] >> 26) * 5; m[5] &= 0x3FFFFFF; + m[2] += (m[6] >> 26) * 5; m[6] &= 0x3FFFFFF; + m[3] += (m[7] >> 26) * 5; m[7] &= 0x3FFFFFF; + + c = m[0] >> 26; m[0] &= 0x3FFFFFF; + m[1] += c; + c = m[1] >> 26; m[1] &= 0x3FFFFFF; + m[2] += c; c = m[2] >> 26; m[2] &= 0x3FFFFFF; + m[3] += c; c = m[3] >> 26; m[3] &= 0x3FFFFFF; + + m[0] += (m[3] >> 26) * 5; m[3] &= 0x3FFFFFF; + + c = m[0] >> 26; m[0] &= 0x3FFFFFF; + m[1] += c; + + SetUi32(h, (UInt32)((m[0]) | (m[1] << 26))); + SetUi32(h + 4, (UInt32)((m[1] >> 6) | (m[2] << 20))); + SetUi32(h + 8, (UInt32)((m[2] >> 12) | (m[3] << 14))); + SetUi32(h + 12, (UInt32)((m[3] >> 18) | (m[4] << 8))); +} + +void CPoly1305::Update(const Byte *data, UInt32 size) +{ + if (_finalized) + return; + _totalLen += size; + + if (_blockPos > 0) + 
{ + unsigned n = 16 - _blockPos; + if (n > size) n = size; + memcpy(_block + _blockPos, data, n); + _blockPos += n; + data += n; + size -= n; + if (_blockPos == 16) + { + Poly1305_ProcessBlock(_h, _r, _block, true); + _blockPos = 0; + } + } + + while (size >= 16) + { + Poly1305_ProcessBlock(_h, _r, data, true); + data += 16; + size -= 16; + } + + if (size > 0) + { + memcpy(_block, data, size); + _blockPos = size; + } +} + +void CPoly1305::UpdateAad(const Byte *data, UInt32 size) +{ + if (_finalized) + return; + _aadLen += size; + + if (_aadBlockPos > 0) + { + unsigned n = 16 - _aadBlockPos; + if (n > size) n = size; + memcpy(_aadBlock + _aadBlockPos, data, n); + _aadBlockPos += n; + data += n; + size -= n; + if (_aadBlockPos == 16) + { + Poly1305_ProcessBlock(_h, _r, _aadBlock, true); + _aadBlockPos = 0; + } + } + + while (size >= 16) + { + Poly1305_ProcessBlock(_h, _r, data, true); + data += 16; + size -= 16; + } + + if (size > 0) + { + memcpy(_aadBlock, data, size); + _aadBlockPos = size; + } +} + +void CPoly1305::Final(Byte *tag) +{ + if (_finalized) + return; + _finalized = true; + + unsigned aadLenMod = (unsigned)(_aadLen & 0xF); + if (aadLenMod != 0) + { + unsigned padLen = 16 - aadLenMod; + memset(_aadBlock + _aadBlockPos, 0, padLen); + Poly1305_ProcessBlock(_h, _r, _aadBlock, true); + } + + unsigned ctLenMod = (unsigned)(_totalLen & 0xF); + + if (ctLenMod != 0) + { + unsigned padLen = 16 - ctLenMod; + memset(_block + _blockPos, 0, padLen); + Poly1305_ProcessBlock(_h, _r, _block, true); + } + + { + Byte lenBlock[16]; + for (unsigned i = 0; i < 8; i++) + lenBlock[i] = (Byte)(_aadLen >> (i * 8)); + for (unsigned i = 0; i < 8; i++) + lenBlock[8 + i] = (Byte)(_totalLen >> (i * 8)); + Poly1305_ProcessBlock(_h, _r, lenBlock, true); + } + + UInt64 h0 = (UInt64)GetUi32(_h); + UInt64 h1 = (UInt64)GetUi32(_h + 4); + UInt64 h2 = (UInt64)GetUi32(_h + 8); + UInt64 h3 = (UInt64)GetUi32(_h + 12) & 0x3FFFFFF; + + UInt64 s0 = (UInt64)GetUi32(_s); + UInt64 s1 = 
(UInt64)GetUi32(_s + 4); + UInt64 s2 = (UInt64)GetUi32(_s + 8); + UInt64 s3 = (UInt64)GetUi32(_s + 12); + + h0 += s0; + UInt64 c = h0 >> 26; h0 &= 0x3FFFFFF; + h1 += s1 + c; c = h1 >> 26; h1 &= 0x3FFFFFF; + h2 += s2 + c; c = h2 >> 26; h2 &= 0x3FFFFFF; + h3 += s3 + c; + + UInt64 g0, g1, g2, g3; + g0 = h0 + 5; + c = g0 >> 26; g0 &= 0x3FFFFFF; + g1 = h1 + c; c = g1 >> 26; g1 &= 0x3FFFFFF; + g2 = h2 + c; c = g2 >> 26; g2 &= 0x3FFFFFF; + g3 = h3 + c - 4; + + UInt64 mask = (g3 >> 63) - 1; + h0 = (h0 & ~mask) | (g0 & mask); + h1 = (h1 & ~mask) | (g1 & mask); + h2 = (h2 & ~mask) | (g2 & mask); + h3 = (h3 & ~mask) | (g3 & mask); + + SetUi32(tag, (UInt32)(h0 | (h1 << 26))); + SetUi32(tag + 4, (UInt32)((h1 >> 6) | (h2 << 20))); + SetUi32(tag + 8, (UInt32)((h2 >> 12) | (h3 << 14))); + SetUi32(tag + 12, (UInt32)(h3 >> 18)); +} + +void CBaseCoder::DeriveKey() +{ + NXChaCha20::XHChaCha20Block_Core(_derivedKey, _key.Key, _nonce); + ComputePolyKey(); + _poly1305.SetKey(_polyKey); + if (_aadSize > 0) + { + _poly1305.UpdateAad(_aad, _aadSize); + } + _derivedKeyValid = true; +} + +void CBaseCoder::ProcessData(Byte *data, UInt32 size) +{ + if (!_derivedKeyValid) + { + DeriveKey(); + } + +#ifdef MY_CPU_X86_OR_AMD64 +#ifdef MY_CPU_SSE2 + InitSIMD(); + + if (size >= kBlockSize * 4) + { + UInt32 state[16]; + state[0] = GetUi32(kSigma); + state[1] = GetUi32(kSigma + 4); + state[2] = GetUi32(kSigma + 8); + state[3] = GetUi32(kSigma + 12); + state[4] = GetUi32(_derivedKey); + state[5] = GetUi32(_derivedKey + 4); + state[6] = GetUi32(_derivedKey + 8); + state[7] = GetUi32(_derivedKey + 12); + state[8] = GetUi32(_derivedKey + 16); + state[9] = GetUi32(_derivedKey + 20); + state[10] = GetUi32(_derivedKey + 24); + state[11] = GetUi32(_derivedKey + 28); + state[12] = (UInt32)(_counter & 0xFFFFFFFF); + state[13] = (UInt32)(_counter >> 32); + state[14] = GetUi32(_nonce + 16); + state[15] = GetUi32(_nonce + 20); + +#ifdef MY_CPU_AMD64 + if (g_AVX2Enabled && size >= kBlockSize * 8) + { + while (size 
>= kBlockSize * 8) + { + ChaCha20_OperateKeystream_AVX2(state, data, data); + state[12] += 8; + if (state[12] < 8) + state[13]++; + data += kBlockSize * 8; + size -= kBlockSize * 8; + } + } +#endif + + if (g_SSE2Enabled && size >= kBlockSize * 4) + { + while (size >= kBlockSize * 4) + { + ChaCha20_OperateKeystream_SSE2(state, data, data); + state[12] += 4; + if (state[12] < 4) + state[13]++; + data += kBlockSize * 4; + size -= kBlockSize * 4; + } + } + + _counter = (UInt64)state[13] << 32 | state[12]; + } +#endif +#endif + + while (size > 0) + { + if (_blockPos == 0 || _blockPos >= kBlockSize) + { + NXChaCha20::XChaCha20Block_Core(_block, _derivedKey, _nonce + 16, _counter); + _blockPos = 0; + _counter++; + if (_counter == 0) + { + memset(_block, 0, kBlockSize); + } + } + + UInt32 remaining = kBlockSize - _blockPos; + UInt32 toProcess = (size < remaining) ? size : remaining; + + Byte *dataPtr = data; + const Byte *blockPtr = _block + _blockPos; + UInt32 count = toProcess; + +#ifdef MY_CPU_64BIT + while (count >= 8) + { + *(UInt64 *)dataPtr ^= *(const UInt64 *)blockPtr; + dataPtr += 8; + blockPtr += 8; + count -= 8; + } +#endif + + while (count >= 4) + { + *(UInt32 *)dataPtr ^= *(const UInt32 *)blockPtr; + dataPtr += 4; + blockPtr += 4; + count -= 4; + } + + while (count--) + *dataPtr++ ^= *blockPtr++; + + data += toProcess; + size -= toProcess; + _blockPos += toProcess; + } +} + +Z7_COM7F_IMF(CBaseCoder::CryptoSetPassword(const Byte *data, UInt32 size)) +{ + COM_TRY_BEGIN + + _key.Password.Wipe(); + _key.Password.CopyFrom(data, (size_t)size); + _derivedKeyValid = false; + return S_OK; + + COM_TRY_END +} + +Z7_COM7F_IMF(CBaseCoder::Init()) +{ + COM_TRY_BEGIN + + PrepareKey(); + _counter = 1; + _blockPos = kBlockSize; + _derivedKeyValid = false; + _poly1305.Reset(); + + return S_OK; + + COM_TRY_END +} + +#ifndef Z7_EXTRACT_ONLY + +Z7_COM7F_IMF(CEncoder::ResetInitVector()) +{ + for (unsigned i = 0; i < sizeof(_nonce); i++) + _nonce[i] = 0; + MY_RAND_GEN(_nonce, 
kNonceSize); + _counter = 1; + _blockPos = kBlockSize; + _derivedKeyValid = false; + _poly1305.Reset(); + + _aadSize = 1; + const unsigned nonceSizeMinus1 = kNonceSize - 1; + const unsigned nonceHigh = (nonceSizeMinus1 >= 16) ? (1 << 6) : 0; + const unsigned nonceLow = nonceSizeMinus1 & 0x0F; + _aad[0] = (Byte)(_key.NumCyclesPower + | (_key.SaltSize == 0 ? 0 : (1 << 7)) + | nonceHigh); + if (_key.SaltSize != 0) + { + _aad[1] = (Byte)(((_key.SaltSize - 1) << 4) | nonceLow); + memcpy(_aad + 2, _key.Salt, _key.SaltSize); + _aadSize = 2 + _key.SaltSize; + memcpy(_aad + _aadSize, _nonce, kNonceSize); + _aadSize += kNonceSize; + } + else + { + _aad[1] = (Byte)(nonceLow); + _aadSize = 2; + memcpy(_aad + _aadSize, _nonce, kNonceSize); + _aadSize += kNonceSize; + } + + _tagReady = false; + memset(_computedTag, 0, kTagSize); + return S_OK; +} + +Z7_COM7F_IMF2(UInt32, CEncoder::Filter(Byte *data, UInt32 size)) +{ + ProcessData(data, size); + _poly1305.Update(data, size); + return size; +} + +Z7_COM7F_IMF(CEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) +{ + Byte props[2 + sizeof(_key.Salt) + kNonceSize + kTagSize]; + unsigned propsSize = 1; + + const unsigned nonceSizeMinus1 = kNonceSize - 1; + const unsigned nonceHigh = (nonceSizeMinus1 >= 16) ? (1 << 6) : 0; + const unsigned nonceLow = nonceSizeMinus1 & 0x0F; + + props[0] = (Byte)(_key.NumCyclesPower + | (_key.SaltSize == 0 ? 
0 : (1 << 7)) + | nonceHigh); + + if (_key.SaltSize != 0) + { + props[1] = (Byte)( + ((_key.SaltSize - 1) << 4) + | nonceLow); + memcpy(props + 2, _key.Salt, _key.SaltSize); + propsSize = 2 + _key.SaltSize; + memcpy(props + propsSize, _nonce, kNonceSize); + propsSize += kNonceSize; + } + else + { + props[1] = (Byte)(nonceLow); + propsSize = 2; + memcpy(props + propsSize, _nonce, kNonceSize); + propsSize += kNonceSize; + } + + if (!_tagReady) + { + _poly1305.Final(_computedTag); + _tagReady = true; + } + + memcpy(props + propsSize, _computedTag, kTagSize); + propsSize += kTagSize; + + return WriteStream(outStream, props, propsSize); +} + +CEncoder::CEncoder() +{ + _key.NumCyclesPower = 19; + _counter = 1; + _blockPos = kBlockSize; + _derivedKeyValid = false; + _aadSize = 0; + _tagReady = false; + memset(_computedTag, 0, kTagSize); +} + +#endif + +CDecoder::CDecoder() +{ + _counter = 1; + _blockPos = kBlockSize; + _derivedKeyValid = false; + _aadSize = 0; + _authChecked = false; + _authResult = 0; + memset(_expectedTag, 0, kTagSize); +} + +Z7_COM7F_IMF2(UInt32, CDecoder::Filter(Byte *data, UInt32 size)) +{ + if (!_derivedKeyValid) + DeriveKey(); + _poly1305.Update(data, size); + ProcessData(data, size); + return size; +} + +Z7_COM7F_IMF(CDecoder::SetDecoderProperties2(const Byte *data, UInt32 size)) +{ + _key.ClearProps(); + + _counter = 1; + _blockPos = kBlockSize; + _derivedKeyValid = false; + _poly1305.Reset(); + _authChecked = false; + _authResult = 0; + memset(_expectedTag, 0, kTagSize); + + for (unsigned i = 0; i < sizeof(_nonce); i++) + _nonce[i] = 0; + + if (size == 0) + return S_OK; + + const unsigned b0 = data[0]; + _key.NumCyclesPower = b0 & 0x3F; + if ((b0 & 0xC0) == 0) + return size == 1 ? 
S_OK : E_INVALIDARG; + if (size <= 1) + return E_INVALIDARG; + + const unsigned b1 = data[1]; + const unsigned saltSize = ((b0 >> 7) & 1) + (b1 >> 4); + const unsigned nonceSizeMinus1 = ((b0 >> 6) & 1) * 16 + (b1 & 0x0F); + const unsigned nonceSize = nonceSizeMinus1 + 1; + + const unsigned totalSize = 2 + saltSize + nonceSize + kTagSize; + + if (size != totalSize) + { + return E_INVALIDARG; + } + + _aadSize = totalSize - kTagSize; + memcpy(_aad, data, _aadSize); + + _key.SaltSize = saltSize; + data += 2; + for (unsigned i = 0; i < saltSize; i++) + _key.Salt[i] = *data++; + for (unsigned i = 0; i < nonceSize && i < kNonceSize; i++) + _nonce[i] = *data++; + + memcpy(_expectedTag, data, kTagSize); + + return (_key.NumCyclesPower <= k_NumCyclesPower_Supported_MAX + || _key.NumCyclesPower == 0x3F) ? S_OK : E_NOTIMPL; +} + +Z7_COM7F_IMF(CDecoder::CryptoAuthVerify(Int32 *result)) +{ + if (_authChecked) + { + *result = _authResult; + return S_OK; + } + _authChecked = true; + + Byte computedTag[kTagSize]; + _poly1305.Final(computedTag); + + _authResult = (memcmp(computedTag, _expectedTag, kTagSize) == 0) ? 
0 : 1; + *result = _authResult; + + Z7_memset_0_ARRAY(computedTag); + return S_OK; +} + +}} // namespace NCrypto::NXChaCha20Poly1305 \ No newline at end of file diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.h b/CPP/7zip/Crypto/XChaCha20Poly1305.h new file mode 100644 index 000000000..104f91520 --- /dev/null +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.h @@ -0,0 +1,124 @@ +// XChaCha20Poly1305.h +// XChaCha20-Poly1305 AEAD coder for 7z format +// Reuses XChaCha20 stream cipher + adds Poly1305 MAC authentication + +#ifndef ZIP7_INC_CRYPTO_XCHACHA20_POLY1305_H +#define ZIP7_INC_CRYPTO_XCHACHA20_POLY1305_H + +#include "../../Common/MyCom.h" + +#include "../ICoder.h" +#include "../IPassword.h" + +#include "XChaCha20.h" + +namespace NCrypto { +namespace NXChaCha20Poly1305 { + +using NXChaCha20::CBase; +using NXChaCha20::kNonceSize; +using NXChaCha20::k_NumCyclesPower_Supported_MAX; +using N7zKeyDerivation::kKeySize; + +const unsigned kTagSize = 16; +const unsigned kPolyKeySize = 32; + +class CPoly1305 +{ + Byte _r[16]; + Byte _s[16]; + Byte _h[16]; + Byte _block[16]; + unsigned _blockPos; + UInt64 _totalLen; + bool _finalized; + Byte _aadBlock[16]; + unsigned _aadBlockPos; + UInt64 _aadLen; +public: + CPoly1305(); + void SetKey(const Byte *key); + void Update(const Byte *data, UInt32 size); + void UpdateAad(const Byte *data, UInt32 size); + void Final(Byte *tag); + void Reset(); +}; + +class CBaseCoder: + public ICompressFilter, + public ICryptoSetPassword, + public CMyUnknownImp, + public NXChaCha20::CBase +{ + Z7_IFACE_COM7_IMP(ICryptoSetPassword) + Z7_COM7F_IMP(Init()) +protected: + virtual ~CBaseCoder() + { + Z7_memset_0_ARRAY(_polyKey); + Z7_memset_0_ARRAY(_aad); + } + + static const unsigned kBlockSize = 64; + Byte _block[kBlockSize]; + unsigned _blockPos; + Byte _derivedKey[kKeySize]; + Byte _polyKey[kPolyKeySize]; + bool _derivedKeyValid; + CPoly1305 _poly1305; + Byte _aad[2 + 16 + kNonceSize]; + unsigned _aadSize; + + void ProcessData(Byte *data, UInt32 size); + 
void DeriveKey(); + void ComputePolyKey(); +}; + +#ifndef Z7_EXTRACT_ONLY + +class CEncoder Z7_final: + public CBaseCoder, + public ICompressWriteCoderProperties, + public ICryptoResetInitVector +{ + Z7_COM_UNKNOWN_IMP_4( + ICompressFilter, + ICryptoSetPassword, + ICompressWriteCoderProperties, + ICryptoResetInitVector) + Z7_IFACE_COM7_IMP(ICompressWriteCoderProperties) + Z7_IFACE_COM7_IMP(ICryptoResetInitVector) + + Byte _computedTag[kTagSize]; + bool _tagReady; + Z7_COM7F_IMP2(UInt32, Filter(Byte *data, UInt32 size)) +public: + CEncoder(); +}; + +#endif + +class CDecoder Z7_final: + public CBaseCoder, + public ICompressSetDecoderProperties2, + public ICryptoAuthVerify +{ + Z7_COM_UNKNOWN_IMP_4( + ICompressFilter, + ICryptoSetPassword, + ICompressSetDecoderProperties2, + ICryptoAuthVerify) + Z7_IFACE_COM7_IMP(ICompressSetDecoderProperties2) + Z7_IFACE_COM7_IMP(ICryptoAuthVerify) + + Byte _expectedTag[kTagSize]; + bool _authChecked; + Int32 _authResult; + Z7_COM7F_IMP2(UInt32, Filter(Byte *data, UInt32 size)) +public: + CDecoder(); +}; + +}} + +#endif \ No newline at end of file diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp new file mode 100644 index 000000000..d41614829 --- /dev/null +++ b/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp @@ -0,0 +1,17 @@ +// XChaCha20Poly1305Register.cpp + +#include "StdAfx.h" + +#include "../Common/RegisterCodec.h" + +#include "XChaCha20Poly1305.h" + +namespace NCrypto { +namespace NXChaCha20Poly1305 { + +REGISTER_FILTER_E(XChaCha20Poly1305, + CDecoder, + CEncoder, + 0x6F10703, "XChaCha20-Poly1305") + +}} \ No newline at end of file diff --git a/CPP/7zip/IPassword.h b/CPP/7zip/IPassword.h index 689f08cb4..963ad6485 100644 --- a/CPP/7zip/IPassword.h +++ b/CPP/7zip/IPassword.h @@ -50,5 +50,9 @@ CryptoGetTextPassword2() x(CryptoGetTextPassword2(Int32 *passwordIsDefined, BSTR *password)) Z7_IFACE_CONSTR_PASSWORD(ICryptoGetTextPassword2, 0x11) +#define 
Z7_IFACEM_ICryptoAuthVerify(x) \ + x(CryptoAuthVerify(Int32 *result)) +Z7_IFACE_CONSTR_PASSWORD(ICryptoAuthVerify, 0x12) + Z7_PURE_INTERFACES_END #endif diff --git a/CPP/7zip/UI/GUI/CompressDialog.cpp b/CPP/7zip/UI/GUI/CompressDialog.cpp index 28fff2dae..e306d12bd 100644 --- a/CPP/7zip/UI/GUI/CompressDialog.cpp +++ b/CPP/7zip/UI/GUI/CompressDialog.cpp @@ -1734,8 +1734,12 @@ void CCompressDialog::SetEncryptionMethod() } ComboBox_AddStringAscii(_encryptionMethod, "AES-256"); ComboBox_AddStringAscii(_encryptionMethod, "XChaCha20"); + ComboBox_AddStringAscii(_encryptionMethod, "XChaCha20-Poly1305"); int sel = 0; - if (encryptionMethod.IsPrefixedBy_Ascii_NoCase("xchacha")) + if (encryptionMethod.IsPrefixedBy_Ascii_NoCase("xchacha20poly1305") + || encryptionMethod.IsPrefixedBy_Ascii_NoCase("xchacha20-poly1305")) + sel = 2; + else if (encryptionMethod.IsPrefixedBy_Ascii_NoCase("xchacha")) sel = 1; else if (encryptionMethod.IsPrefixedBy_Ascii_NoCase("aes")) sel = 0; From 6fcc6afaf01f66477000b64793ed1b91fc65fe37 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Thu, 28 May 2026 08:31:02 +0800 Subject: [PATCH 07/18] Optimize code, improve encryption speed --- CPP/7zip/Crypto/ChaCha20Simd.h | 229 ++++++++++++- CPP/7zip/Crypto/XChaCha20.cpp | 70 ++-- CPP/7zip/Crypto/XChaCha20.h | 4 +- CPP/7zip/Crypto/XChaCha20Poly1305.cpp | 471 ++++++++++++++++---------- CPP/7zip/Crypto/XChaCha20Poly1305.h | 18 +- 5 files changed, 578 insertions(+), 214 deletions(-) diff --git a/CPP/7zip/Crypto/ChaCha20Simd.h b/CPP/7zip/Crypto/ChaCha20Simd.h index 736cff901..175bc234d 100644 --- a/CPP/7zip/Crypto/ChaCha20Simd.h +++ b/CPP/7zip/Crypto/ChaCha20Simd.h @@ -1,5 +1,5 @@ // ChaCha20Simd.h -// Shared SIMD (SSE2/AVX2) acceleration code for ChaCha20/XChaCha20 +// Shared SIMD (SSE2/AVX2/NEON) acceleration code for ChaCha20/XChaCha20 #ifndef ZIP7_CRYPTO_CHACHA20_SIMD_H #define ZIP7_CRYPTO_CHACHA20_SIMD_H @@ -12,14 +12,18 @@ #endif #endif -#ifdef MY_CPU_X86_OR_AMD64 - 
-#ifdef MY_CPU_SSE2 +#ifdef MY_CPU_ARM_OR_ARM64 +#include +#endif static const Byte kSigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', '2', '-', 'b', 'y', 't', 'e', ' ', 'k' }; +#ifdef MY_CPU_X86_OR_AMD64 + +#ifdef MY_CPU_SSE2 + namespace { template @@ -495,6 +499,223 @@ static void InitSIMD() #endif +#elif defined(MY_CPU_ARM_OR_ARM64) + +namespace { + +template +Z7_FORCE_INLINE uint32x4_t RotateLeft_NEON(const uint32x4_t val) +{ + return vorrq_u32(vshlq_n_u32(val, R), vshrq_n_u32(val, 32 - R)); +} + +template <> +Z7_FORCE_INLINE uint32x4_t RotateLeft_NEON<8>(const uint32x4_t val) +{ + const uint8x16_t mask = {3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14}; + return vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); +} + +template <> +Z7_FORCE_INLINE uint32x4_t RotateLeft_NEON<16>(const uint32x4_t val) +{ + return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(val))); +} + +Z7_FORCE_INLINE uint32x4_t Add64_NEON(const uint32x4_t a, const uint32x4_t b) +{ + return vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(a), vreinterpretq_u64_u32(b))); +} + +template +Z7_FORCE_INLINE uint32x4_t Extract_NEON(const uint32x4_t val) +{ + return vextq_u32(val, val, S); +} + +#define NEON_QUARTERROUND(a, b, c, d) \ + a = vaddq_u32(a, b); \ + d = veorq_u32(d, a); \ + d = RotateLeft_NEON<16>(d); \ + c = vaddq_u32(c, d); \ + b = veorq_u32(b, c); \ + b = RotateLeft_NEON<12>(b); \ + a = vaddq_u32(a, b); \ + d = veorq_u32(d, a); \ + d = RotateLeft_NEON<8>(d); \ + c = vaddq_u32(c, d); \ + b = veorq_u32(b, c); \ + b = RotateLeft_NEON<7>(b); + +Z7_NO_INLINE void ChaCha20_OperateKeystream_NEON( + const UInt32 *state, + const Byte *input, + Byte *output) +{ + const uint32x4_t state0 = vld1q_u32(state + 0); + const uint32x4_t state1 = vld1q_u32(state + 4); + const uint32x4_t state2 = vld1q_u32(state + 8); + const uint32x4_t state3 = vld1q_u32(state + 12); + + const UInt32 CTR[12] = {1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}; + const uint32x4_t CTR1 = vld1q_u32(CTR + 0); 
+ const uint32x4_t CTR2 = vld1q_u32(CTR + 4); + const uint32x4_t CTR3 = vld1q_u32(CTR + 8); + + uint32x4_t r0_0 = state0; + uint32x4_t r0_1 = state1; + uint32x4_t r0_2 = state2; + uint32x4_t r0_3 = state3; + + uint32x4_t r1_0 = state0; + uint32x4_t r1_1 = state1; + uint32x4_t r1_2 = state2; + uint32x4_t r1_3 = Add64_NEON(state3, CTR1); + + uint32x4_t r2_0 = state0; + uint32x4_t r2_1 = state1; + uint32x4_t r2_2 = state2; + uint32x4_t r2_3 = Add64_NEON(state3, CTR2); + + uint32x4_t r3_0 = state0; + uint32x4_t r3_1 = state1; + uint32x4_t r3_2 = state2; + uint32x4_t r3_3 = Add64_NEON(state3, CTR3); + + for (int i = 0; i < 10; i++) + { + NEON_QUARTERROUND(r0_0, r0_1, r0_2, r0_3); + NEON_QUARTERROUND(r1_0, r1_1, r1_2, r1_3); + NEON_QUARTERROUND(r2_0, r2_1, r2_2, r2_3); + NEON_QUARTERROUND(r3_0, r3_1, r3_2, r3_3); + + r0_1 = Extract_NEON<1>(r0_1); + r0_2 = Extract_NEON<2>(r0_2); + r0_3 = Extract_NEON<3>(r0_3); + + r1_1 = Extract_NEON<1>(r1_1); + r1_2 = Extract_NEON<2>(r1_2); + r1_3 = Extract_NEON<3>(r1_3); + + r2_1 = Extract_NEON<1>(r2_1); + r2_2 = Extract_NEON<2>(r2_2); + r2_3 = Extract_NEON<3>(r2_3); + + r3_1 = Extract_NEON<1>(r3_1); + r3_2 = Extract_NEON<2>(r3_2); + r3_3 = Extract_NEON<3>(r3_3); + + NEON_QUARTERROUND(r0_0, r0_1, r0_2, r0_3); + NEON_QUARTERROUND(r1_0, r1_1, r1_2, r1_3); + NEON_QUARTERROUND(r2_0, r2_1, r2_2, r2_3); + NEON_QUARTERROUND(r3_0, r3_1, r3_2, r3_3); + + r0_1 = Extract_NEON<3>(r0_1); + r0_2 = Extract_NEON<2>(r0_2); + r0_3 = Extract_NEON<1>(r0_3); + + r1_1 = Extract_NEON<3>(r1_1); + r1_2 = Extract_NEON<2>(r1_2); + r1_3 = Extract_NEON<1>(r1_3); + + r2_1 = Extract_NEON<3>(r2_1); + r2_2 = Extract_NEON<2>(r2_2); + r2_3 = Extract_NEON<1>(r2_3); + + r3_1 = Extract_NEON<3>(r3_1); + r3_2 = Extract_NEON<2>(r3_2); + r3_3 = Extract_NEON<1>(r3_3); + } + + r0_0 = vaddq_u32(r0_0, state0); + r0_1 = vaddq_u32(r0_1, state1); + r0_2 = vaddq_u32(r0_2, state2); + r0_3 = vaddq_u32(r0_3, state3); + + r1_0 = vaddq_u32(r1_0, state0); + r1_1 = vaddq_u32(r1_1, state1); + 
r1_2 = vaddq_u32(r1_2, state2); + r1_3 = vaddq_u32(r1_3, state3); + r1_3 = Add64_NEON(r1_3, CTR1); + + r2_0 = vaddq_u32(r2_0, state0); + r2_1 = vaddq_u32(r2_1, state1); + r2_2 = vaddq_u32(r2_2, state2); + r2_3 = vaddq_u32(r2_3, state3); + r2_3 = Add64_NEON(r2_3, CTR2); + + r3_0 = vaddq_u32(r3_0, state0); + r3_1 = vaddq_u32(r3_1, state1); + r3_2 = vaddq_u32(r3_2, state2); + r3_3 = vaddq_u32(r3_3, state3); + r3_3 = Add64_NEON(r3_3, CTR3); + + if (input) + { + r0_0 = veorq_u32(vld1q_u32((const UInt32 *)(input + 0*16)), r0_0); + r0_1 = veorq_u32(vld1q_u32((const UInt32 *)(input + 1*16)), r0_1); + r0_2 = veorq_u32(vld1q_u32((const UInt32 *)(input + 2*16)), r0_2); + r0_3 = veorq_u32(vld1q_u32((const UInt32 *)(input + 3*16)), r0_3); + } + + vst1q_u32((UInt32 *)(output + 0*16), r0_0); + vst1q_u32((UInt32 *)(output + 1*16), r0_1); + vst1q_u32((UInt32 *)(output + 2*16), r0_2); + vst1q_u32((UInt32 *)(output + 3*16), r0_3); + + if (input) + { + r1_0 = veorq_u32(vld1q_u32((const UInt32 *)(input + 4*16)), r1_0); + r1_1 = veorq_u32(vld1q_u32((const UInt32 *)(input + 5*16)), r1_1); + r1_2 = veorq_u32(vld1q_u32((const UInt32 *)(input + 6*16)), r1_2); + r1_3 = veorq_u32(vld1q_u32((const UInt32 *)(input + 7*16)), r1_3); + } + + vst1q_u32((UInt32 *)(output + 4*16), r1_0); + vst1q_u32((UInt32 *)(output + 5*16), r1_1); + vst1q_u32((UInt32 *)(output + 6*16), r1_2); + vst1q_u32((UInt32 *)(output + 7*16), r1_3); + + if (input) + { + r2_0 = veorq_u32(vld1q_u32((const UInt32 *)(input + 8*16)), r2_0); + r2_1 = veorq_u32(vld1q_u32((const UInt32 *)(input + 9*16)), r2_1); + r2_2 = veorq_u32(vld1q_u32((const UInt32 *)(input + 10*16)), r2_2); + r2_3 = veorq_u32(vld1q_u32((const UInt32 *)(input + 11*16)), r2_3); + } + + vst1q_u32((UInt32 *)(output + 8*16), r2_0); + vst1q_u32((UInt32 *)(output + 9*16), r2_1); + vst1q_u32((UInt32 *)(output + 10*16), r2_2); + vst1q_u32((UInt32 *)(output + 11*16), r2_3); + + if (input) + { + r3_0 = veorq_u32(vld1q_u32((const UInt32 *)(input + 12*16)), r3_0); + r3_1 = 
veorq_u32(vld1q_u32((const UInt32 *)(input + 13*16)), r3_1); + r3_2 = veorq_u32(vld1q_u32((const UInt32 *)(input + 14*16)), r3_2); + r3_3 = veorq_u32(vld1q_u32((const UInt32 *)(input + 15*16)), r3_3); + } + + vst1q_u32((UInt32 *)(output + 12*16), r3_0); + vst1q_u32((UInt32 *)(output + 13*16), r3_1); + vst1q_u32((UInt32 *)(output + 14*16), r3_2); + vst1q_u32((UInt32 *)(output + 15*16), r3_3); +} + +} + +static bool g_NEONEnabled = false; +static bool g_SIMDARMInitialized = false; + +static void InitSIMD() +{ + if (g_SIMDARMInitialized) + return; + g_SIMDARMInitialized = true; + g_NEONEnabled = CPU_IsSupported_NEON() != 0; +} + #endif #endif \ No newline at end of file diff --git a/CPP/7zip/Crypto/XChaCha20.cpp b/CPP/7zip/Crypto/XChaCha20.cpp index ad18c85a1..d45c97853 100644 --- a/CPP/7zip/Crypto/XChaCha20.cpp +++ b/CPP/7zip/Crypto/XChaCha20.cpp @@ -31,6 +31,13 @@ namespace NXChaCha20 { a += b; d ^= a; d = ROTL32(d, 8); \ c += d; b ^= c; b = ROTL32(b, 7); +#define CHACHA20_10_DOUBLE_ROUNDS \ + DOUBLE_ROUND; DOUBLE_ROUND; \ + DOUBLE_ROUND; DOUBLE_ROUND; \ + DOUBLE_ROUND; DOUBLE_ROUND; \ + DOUBLE_ROUND; DOUBLE_ROUND; \ + DOUBLE_ROUND; DOUBLE_ROUND; + static CKeyInfoCache g_GlobalKeyCache(32); #ifndef Z7_ST @@ -104,14 +111,10 @@ void XHChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce) QUARTERROUND(x2, x7, x8, x13) \ QUARTERROUND(x3, x4, x9, x14) - DOUBLE_ROUND; DOUBLE_ROUND; - DOUBLE_ROUND; DOUBLE_ROUND; - DOUBLE_ROUND; DOUBLE_ROUND; - DOUBLE_ROUND; DOUBLE_ROUND; - DOUBLE_ROUND; DOUBLE_ROUND; - + CHACHA20_10_DOUBLE_ROUNDS + #undef DOUBLE_ROUND - + SetUi32(output, x0); SetUi32(output + 4, x1); SetUi32(output + 8, x2); @@ -156,14 +159,10 @@ void XChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce, UInt6 QUARTERROUND(x2, x7, x8, x13) \ QUARTERROUND(x3, x4, x9, x14) - DOUBLE_ROUND; DOUBLE_ROUND; - DOUBLE_ROUND; DOUBLE_ROUND; - DOUBLE_ROUND; DOUBLE_ROUND; - DOUBLE_ROUND; DOUBLE_ROUND; - DOUBLE_ROUND; DOUBLE_ROUND; - + 
CHACHA20_10_DOUBLE_ROUNDS + #undef DOUBLE_ROUND - + x0 += GetUi32(kSigma); x1 += GetUi32(kSigma + 4); x2 += GetUi32(kSigma + 8); @@ -262,6 +261,43 @@ void CBaseCoder::ProcessData(Byte *data, UInt32 size) } #endif #endif + +#ifdef MY_CPU_ARM_OR_ARM64 + InitSIMD(); + + if (g_NEONEnabled && size >= kBlockSize * 4) + { + UInt32 state[16]; + state[0] = GetUi32(kSigma); + state[1] = GetUi32(kSigma + 4); + state[2] = GetUi32(kSigma + 8); + state[3] = GetUi32(kSigma + 12); + state[4] = GetUi32(_derivedKey); + state[5] = GetUi32(_derivedKey + 4); + state[6] = GetUi32(_derivedKey + 8); + state[7] = GetUi32(_derivedKey + 12); + state[8] = GetUi32(_derivedKey + 16); + state[9] = GetUi32(_derivedKey + 20); + state[10] = GetUi32(_derivedKey + 24); + state[11] = GetUi32(_derivedKey + 28); + state[12] = (UInt32)(_counter & 0xFFFFFFFF); + state[13] = (UInt32)(_counter >> 32); + state[14] = GetUi32(_nonce + 16); + state[15] = GetUi32(_nonce + 20); + + while (size >= kBlockSize * 4) + { + ChaCha20_OperateKeystream_NEON(state, data, data); + state[12] += 4; + if (state[12] < 4) + state[13]++; + data += kBlockSize * 4; + size -= kBlockSize * 4; + } + + _counter = (UInt64)state[13] << 32 | state[12]; + } +#endif while (size > 0) { @@ -270,10 +306,6 @@ void CBaseCoder::ProcessData(Byte *data, UInt32 size) XChaCha20Block_Core(_block, _derivedKey, _nonce + 16, _counter); _blockPos = 0; _counter++; - if (_counter == 0) - { - memset(_block, 0, kBlockSize); - } } UInt32 remaining = kBlockSize - _blockPos; @@ -339,7 +371,7 @@ Z7_COM7F_IMF(CEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) if (_key.SaltSize != 0) { props[1] = (Byte)( - ((_key.SaltSize == 0 ? 
0 : _key.SaltSize - 1) << 4) + ((_key.SaltSize - 1) << 4) | nonceLow); memcpy(props + 2, _key.Salt, _key.SaltSize); propsSize = 2 + _key.SaltSize; diff --git a/CPP/7zip/Crypto/XChaCha20.h b/CPP/7zip/Crypto/XChaCha20.h index ecddb9761..652057dc9 100644 --- a/CPP/7zip/Crypto/XChaCha20.h +++ b/CPP/7zip/Crypto/XChaCha20.h @@ -46,8 +46,8 @@ class CBaseCoder: public CMyUnknownImp, public CBase { - Z7_IFACE_COM7_IMP(ICompressFilter) - Z7_IFACE_COM7_IMP(ICryptoSetPassword) + Z7_IFACE_COM7_IMP_NONFINAL(ICompressFilter) + Z7_IFACE_COM7_IMP_NONFINAL(ICryptoSetPassword) protected: virtual ~CBaseCoder() { diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp index b6061dfc5..4c09a04db 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp @@ -18,8 +18,6 @@ #include "RandGen.h" #endif -#include "ChaCha20Simd.h" - namespace NCrypto { namespace NXChaCha20Poly1305 { @@ -52,6 +50,7 @@ void CPoly1305::Reset() void CPoly1305::SetKey(const Byte *key) { + Reset(); memcpy(_r, key, 16); _r[3] &= 15; _r[7] &= 15; @@ -62,50 +61,172 @@ void CPoly1305::SetKey(const Byte *key) _r[12] &= 252; memcpy(_s, key + 16, 16); - - memset(_h, 0, sizeof(_h)); - _blockPos = 0; - _totalLen = 0; - _aadBlockPos = 0; - _aadLen = 0; - _finalized = false; } -static void Poly1305_ProcessBlock(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) +#if defined(__SIZEOF_INT128__) && (__SIZEOF_INT128__ >= 16) + #define Z7_POLY1305_128BIT +#elif defined(_M_AMD64) + #include + #define Z7_POLY1305_128BIT +#endif + +#ifdef Z7_POLY1305_128BIT +static void Poly1305_ProcessBlock_128(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) { - UInt64 d[8] = { 0 }; - UInt64 c; + UInt64 d0 = GetUi32(h); + UInt64 d1 = GetUi32(h + 4); + UInt64 d2 = GetUi32(h + 8); + UInt64 d3 = GetUi32(h + 12) & 0x3FFFFFF; + + UInt64 m0 = d0 & 0x3FFFFFF; + UInt64 m1 = (d0 >> 26) | ((d1 & 0xFFFFF) << 6); + UInt64 m2 = (d1 >> 20) | ((d2 & 
0x3FFF) << 12); + UInt64 m3 = (d2 >> 14) | ((d3 & 0xFF) << 18); + UInt64 m4 = (d3 >> 8) & 0x3FFFF; + + UInt64 msg_lo = GetUi64(block); + UInt64 msg_hi = GetUi64(block + 8); + UInt64 r0 = GetUi64(r); + UInt64 r1 = GetUi64(r + 8); + +#if defined(__SIZEOF_INT128__) && (__SIZEOF_INT128__ >= 16) + typedef unsigned __int128 U128; + + U128 hv = (U128)m0 | ((U128)m1 << 26) | ((U128)m2 << 52) | ((U128)m3 << 78) | ((U128)m4 << 104); + U128 msg = (U128)msg_lo | ((U128)msg_hi << 64); + if (hasHighBit) + msg |= (U128)1 << 128; + hv += msg; - for (unsigned i = 0; i < 3; i++) - { - d[i] = (UInt64)GetUi32(h + i * 4); - } - d[3] = ((UInt64)GetUi32(h + 12)) & 0x3FFFFFF; + U128 rv = (U128)r0 | ((U128)r1 << 64); - for (unsigned i = 0; i < 3; i++) - { - UInt64 t = GetUi32(block + i * 4); - d[i] += t; - } - d[3] += ((UInt64)GetUi32(block + 12)) & 0x3FFFFFF; + U128 product = hv * rv; - if (hasHighBit) - d[3] |= 0x1000000; + UInt64 a0 = (UInt64)product; + UInt64 a1 = (UInt64)(product >> 64); + UInt64 a2 = (UInt64)(product >> 128); + UInt64 a3 = (UInt64)(product >> 192); - UInt64 rr[4]; - rr[0] = GetUi32(r) & 0x3FFFFFF; - rr[1] = ((UInt64)GetUi32(r + 3) >> 2) & 0x3FFFF03; - rr[2] = ((UInt64)GetUi32(r + 6) >> 4) & 0x3FFC0FF; - rr[3] = ((UInt64)GetUi32(r + 9) >> 6) & 0x3F03FFF; + U128 p_lo = (U128)a0 | ((U128)a1 << 64) | ((U128)(a2 & 3) << 128); + U128 p_hi = (a2 >> 2) | ((U128)a3 << 62); - UInt64 m[8] = { 0 }; - for (unsigned i = 0; i < 4; i++) + U128 res = p_lo + p_hi * 5; + + U128 overflow = res >> 130; + while (overflow) { - for (unsigned j = 0; j < 4; j++) + res = (res & (((U128)1 << 130) - 1)) + overflow * 5; + overflow = res >> 130; + } + + UInt64 lo = (UInt64)res; + UInt64 hi = (UInt64)(res >> 64); + UInt32 top = (UInt32)(res >> 128); + + UInt64 limb0 = lo & 0x3FFFFFF; + UInt64 limb1 = (lo >> 26) & 0x3FFFFFF; + UInt64 limb2 = ((lo >> 52) | ((hi & 0x3FFF) << 12)) & 0x3FFFFFF; + UInt64 limb3 = (hi >> 14) & 0x3FFFFFF; + UInt64 limb4 = ((hi >> 40) | ((UInt64)top << 24)) & 0x3FFFFFF; + + 
SetUi32(h, (UInt32)(limb0 | (limb1 << 26))); + SetUi32(h + 4, (UInt32)((limb1 >> 6) | (limb2 << 20))); + SetUi32(h + 8, (UInt32)((limb2 >> 12) | (limb3 << 14))); + SetUi32(h + 12, (UInt32)((limb3 >> 18) | (limb4 << 8))); +#elif defined(_M_AMD64) + { + UInt64 hv0 = m0 | (m1 << 26) | ((m2 & 0xFFF) << 52); + UInt64 hv1 = (m2 >> 12) | (m3 << 14) | (m4 << 40); + UInt64 hv2 = 0; + + unsigned char c; + c = _addcarry_u64(0, hv0, msg_lo, &hv0); + c = _addcarry_u64(c, hv1, msg_hi, &hv1); + hv2 += c + (hasHighBit ? 1 : 0); + + UInt64 d0_hi, d0_lo = _umul128(hv0, r0, &d0_hi); + UInt64 d1a_hi, d1a_lo = _umul128(hv0, r1, &d1a_hi); + UInt64 d1b_hi, d1b_lo = _umul128(hv1, r0, &d1b_hi); + UInt64 d2a_hi, d2a_lo = _umul128(hv1, r1, &d2a_hi); + UInt64 d2b_hi, d2b_lo = _umul128(hv2, r0, &d2b_hi); + UInt64 d3_hi, d3_lo = _umul128(hv2, r1, &d3_hi); + + UInt64 a0 = d0_lo, a1 = d0_hi, a2 = 0, a3 = 0; + c = _addcarry_u64(0, a1, d1a_lo, &a1); + c = _addcarry_u64(c, a2, d1a_hi, &a2); + c = _addcarry_u64(c, a3, 0, &a3); + c = _addcarry_u64(0, a1, d1b_lo, &a1); + c = _addcarry_u64(c, a2, d1b_hi, &a2); + c = _addcarry_u64(c, a3, 0, &a3); + c = _addcarry_u64(0, a2, d2a_lo, &a2); + c = _addcarry_u64(c, a3, d2a_hi, &a3); + c = _addcarry_u64(0, a2, d2b_lo, &a2); + c = _addcarry_u64(c, a3, d2b_hi, &a3); + UInt64 a4 = c; + c = _addcarry_u64(0, a3, d3_lo, &a3); + c = _addcarry_u64(c, a4, d3_hi, &a4); + + UInt64 hi[3]; + hi[0] = (a2 >> 2) | (a3 << 62); + hi[1] = (a3 >> 2) | (a4 << 62); + hi[2] = a4 >> 2; + + UInt64 h5_0_hi, h5_0 = _umul128(hi[0], 5, &h5_0_hi); + UInt64 h5_1_hi, h5_1 = _umul128(hi[1], 5, &h5_1_hi); + UInt64 h5_2 = hi[2] * 5; + + UInt64 lo0 = a0, lo1 = a1, lo2 = a2 & 3, lo3 = 0; + c = _addcarry_u64(0, lo0, h5_0, &lo0); + c = _addcarry_u64(c, lo1, h5_0_hi, &lo1); + c = _addcarry_u64(c, lo2, 0, &lo2); + c = _addcarry_u64(0, lo1, h5_1, &lo1); + c = _addcarry_u64(c, lo2, h5_1_hi, &lo2); + c = _addcarry_u64(c, lo3, 0, &lo3); + c = _addcarry_u64(0, lo2, h5_2, &lo2); + c = _addcarry_u64(c, lo3, 
0, &lo3); + + UInt64 ov0 = lo2 >> 2; + lo2 &= 3; + lo3 = 0; + + UInt64 ov5_lo, ov5_hi; + ov5_lo = _umul128(ov0, 5, &ov5_hi); + c = _addcarry_u64(0, lo0, ov5_lo, &lo0); + c = _addcarry_u64(c, lo1, ov5_hi, &lo1); + c = _addcarry_u64(c, lo2, 0, &lo2); + + ov0 = lo2 >> 2; + if (ov0) { - m[i + j] += d[i] * rr[j]; + lo2 &= 3; + ov5_lo = _umul128(ov0, 5, &ov5_hi); + c = _addcarry_u64(0, lo0, ov5_lo, &lo0); + c = _addcarry_u64(c, lo1, ov5_hi, &lo1); + c = _addcarry_u64(c, lo2, 0, &lo2); } + + UInt64 limb0 = lo0 & 0x3FFFFFF; + UInt64 limb1 = (lo0 >> 26) & 0x3FFFFFF; + UInt64 limb2 = ((lo0 >> 52) | ((lo1 & 0x3FFF) << 12)) & 0x3FFFFFF; + UInt64 limb3 = (lo1 >> 14) & 0x3FFFFFF; + UInt64 limb4 = ((lo1 >> 40) | ((UInt64)lo2 << 24)) & 0x3FFFFFF; + + SetUi32(h, (UInt32)(limb0 | (limb1 << 26))); + SetUi32(h + 4, (UInt32)((limb1 >> 6) | (limb2 << 20))); + SetUi32(h + 8, (UInt32)((limb2 >> 12) | (limb3 << 14))); + SetUi32(h + 12, (UInt32)((limb3 >> 18) | (limb4 << 8))); } +#endif +} +#else +#define Poly1305_ProcessBlock_128 Poly1305_ProcessBlock_32 +#endif + +#ifndef Z7_POLY1305_128BIT + +static void Poly1305_ReduceAndPack(Byte h[16], UInt64 m[8]) +{ + UInt64 c; c = m[0] >> 26; m[0] &= 0x3FFFFFF; m[1] += c; @@ -142,6 +263,129 @@ static void Poly1305_ProcessBlock(Byte h[16], const Byte r[16], const Byte block SetUi32(h + 12, (UInt32)((m[3] >> 18) | (m[4] << 8))); } +#if defined(MY_CPU_X86_OR_AMD64) && defined(MY_CPU_SSE2) + +#include + +static void Poly1305_ProcessBlock_SSE2_4Way(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) +{ + UInt64 d[4]; + + for (unsigned i = 0; i < 3; i++) + d[i] = (UInt64)GetUi32(h + i * 4); + d[3] = ((UInt64)GetUi32(h + 12)) & 0x3FFFFFF; + + for (unsigned i = 0; i < 3; i++) + d[i] += GetUi32(block + i * 4); + d[3] += ((UInt64)GetUi32(block + 12)) & 0x3FFFFFF; + + if (hasHighBit) + d[3] |= 0x1000000; + + UInt64 rr[4]; + rr[0] = GetUi32(r) & 0x3FFFFFF; + rr[1] = ((UInt64)GetUi32(r + 3) >> 2) & 0x3FFFF03; + rr[2] = ((UInt64)GetUi32(r + 6) 
>> 4) & 0x3FFC0FF; + rr[3] = ((UInt64)GetUi32(r + 9) >> 6) & 0x3F03FFF; + + __m128i d_vec = _mm_set_epi32((int)(UInt32)d[3], (int)(UInt32)d[2], + (int)(UInt32)d[1], (int)(UInt32)d[0]); + __m128i d_swap = _mm_shuffle_epi32(d_vec, _MM_SHUFFLE(0, 3, 0, 1)); + + __m128i r_even = _mm_set_epi32(0, (int)(UInt32)rr[2], 0, (int)(UInt32)rr[0]); + __m128i r_odd = _mm_set_epi32(0, (int)(UInt32)rr[3], 0, (int)(UInt32)rr[1]); + __m128i r_cross1 = _mm_set_epi32(0, (int)(UInt32)rr[0], 0, (int)(UInt32)rr[2]); + __m128i r_cross2 = _mm_set_epi32(0, (int)(UInt32)rr[1], 0, (int)(UInt32)rr[3]); + + UInt64 m[8] = { 0 }; + __m128i prod; + UInt64 pLo, pHi; + + #define POLY1305_SSE2_MUL_ACC(d_op, r_op, off_lo, off_hi) \ + prod = _mm_mul_epu32(d_op, r_op); \ + _mm_storel_epi64((__m128i *)&pLo, prod); \ + _mm_storel_epi64((__m128i *)&pHi, _mm_srli_si128(prod, 8)); \ + m[off_lo] += pLo; \ + m[off_hi] += pHi; + + POLY1305_SSE2_MUL_ACC(d_vec, r_even, 0, 4) + POLY1305_SSE2_MUL_ACC(d_vec, r_odd, 1, 5) + POLY1305_SSE2_MUL_ACC(d_swap, r_even, 1, 5) + POLY1305_SSE2_MUL_ACC(d_swap, r_odd, 2, 6) + POLY1305_SSE2_MUL_ACC(d_vec, r_cross1, 2, 2) + POLY1305_SSE2_MUL_ACC(d_vec, r_cross2, 3, 3) + POLY1305_SSE2_MUL_ACC(d_swap, r_cross1, 3, 3) + POLY1305_SSE2_MUL_ACC(d_swap, r_cross2, 4, 4) + + #undef POLY1305_SSE2_MUL_ACC + + Poly1305_ReduceAndPack(h, m); +} + +static void Poly1305_ProcessBlock_SSE2(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) +{ + Poly1305_ProcessBlock_SSE2_4Way(h, r, block, hasHighBit); +} + +#endif + +#if !defined(MY_CPU_X86_OR_AMD64) || !defined(MY_CPU_SSE2) +static void Poly1305_ProcessBlock_32(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) +{ + UInt64 d[8] = { 0 }; + + for (unsigned i = 0; i < 3; i++) + { + d[i] = (UInt64)GetUi32(h + i * 4); + } + d[3] = ((UInt64)GetUi32(h + 12)) & 0x3FFFFFF; + + for (unsigned i = 0; i < 3; i++) + { + UInt64 t = GetUi32(block + i * 4); + d[i] += t; + } + d[3] += ((UInt64)GetUi32(block + 12)) & 0x3FFFFFF; + + 
if (hasHighBit) + d[3] |= 0x1000000; + + UInt64 rr[4]; + rr[0] = GetUi32(r) & 0x3FFFFFF; + rr[1] = ((UInt64)GetUi32(r + 3) >> 2) & 0x3FFFF03; + rr[2] = ((UInt64)GetUi32(r + 6) >> 4) & 0x3FFC0FF; + rr[3] = ((UInt64)GetUi32(r + 9) >> 6) & 0x3F03FFF; + + UInt64 m[8] = { 0 }; + for (unsigned i = 0; i < 4; i++) + { + for (unsigned j = 0; j < 4; j++) + { + m[i + j] += d[i] * rr[j]; + } + } + + Poly1305_ReduceAndPack(h, m); +} +#endif +#endif + +#ifdef Z7_POLY1305_128BIT +static void Poly1305_ProcessBlock(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) +{ + Poly1305_ProcessBlock_128(h, r, block, hasHighBit); +} +#else +static void Poly1305_ProcessBlock(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) +{ +#if defined(MY_CPU_X86_OR_AMD64) && defined(MY_CPU_SSE2) + Poly1305_ProcessBlock_SSE2(h, r, block, hasHighBit); +#else + Poly1305_ProcessBlock_32(h, r, block, hasHighBit); +#endif +} +#endif + void CPoly1305::Update(const Byte *data, UInt32 size) { if (_finalized) @@ -212,28 +456,25 @@ void CPoly1305::UpdateAad(const Byte *data, UInt32 size) } } +void CPoly1305::PadAndProcessBlock(Byte *buf, unsigned bufPos, UInt64 len) +{ + unsigned mod = (unsigned)(len & 0xF); + if (mod != 0) + { + unsigned padLen = 16 - mod; + memset(buf + bufPos, 0, padLen); + Poly1305_ProcessBlock(_h, _r, buf, true); + } +} + void CPoly1305::Final(Byte *tag) { if (_finalized) return; _finalized = true; - unsigned aadLenMod = (unsigned)(_aadLen & 0xF); - if (aadLenMod != 0) - { - unsigned padLen = 16 - aadLenMod; - memset(_aadBlock + _aadBlockPos, 0, padLen); - Poly1305_ProcessBlock(_h, _r, _aadBlock, true); - } - - unsigned ctLenMod = (unsigned)(_totalLen & 0xF); - - if (ctLenMod != 0) - { - unsigned padLen = 16 - ctLenMod; - memset(_block + _blockPos, 0, padLen); - Poly1305_ProcessBlock(_h, _r, _block, true); - } + PadAndProcessBlock(_aadBlock, _aadBlockPos, _aadLen); + PadAndProcessBlock(_block, _blockPos, _totalLen); { Byte lenBlock[16]; @@ -291,129 +532,6 
@@ void CBaseCoder::DeriveKey() _derivedKeyValid = true; } -void CBaseCoder::ProcessData(Byte *data, UInt32 size) -{ - if (!_derivedKeyValid) - { - DeriveKey(); - } - -#ifdef MY_CPU_X86_OR_AMD64 -#ifdef MY_CPU_SSE2 - InitSIMD(); - - if (size >= kBlockSize * 4) - { - UInt32 state[16]; - state[0] = GetUi32(kSigma); - state[1] = GetUi32(kSigma + 4); - state[2] = GetUi32(kSigma + 8); - state[3] = GetUi32(kSigma + 12); - state[4] = GetUi32(_derivedKey); - state[5] = GetUi32(_derivedKey + 4); - state[6] = GetUi32(_derivedKey + 8); - state[7] = GetUi32(_derivedKey + 12); - state[8] = GetUi32(_derivedKey + 16); - state[9] = GetUi32(_derivedKey + 20); - state[10] = GetUi32(_derivedKey + 24); - state[11] = GetUi32(_derivedKey + 28); - state[12] = (UInt32)(_counter & 0xFFFFFFFF); - state[13] = (UInt32)(_counter >> 32); - state[14] = GetUi32(_nonce + 16); - state[15] = GetUi32(_nonce + 20); - -#ifdef MY_CPU_AMD64 - if (g_AVX2Enabled && size >= kBlockSize * 8) - { - while (size >= kBlockSize * 8) - { - ChaCha20_OperateKeystream_AVX2(state, data, data); - state[12] += 8; - if (state[12] < 8) - state[13]++; - data += kBlockSize * 8; - size -= kBlockSize * 8; - } - } -#endif - - if (g_SSE2Enabled && size >= kBlockSize * 4) - { - while (size >= kBlockSize * 4) - { - ChaCha20_OperateKeystream_SSE2(state, data, data); - state[12] += 4; - if (state[12] < 4) - state[13]++; - data += kBlockSize * 4; - size -= kBlockSize * 4; - } - } - - _counter = (UInt64)state[13] << 32 | state[12]; - } -#endif -#endif - - while (size > 0) - { - if (_blockPos == 0 || _blockPos >= kBlockSize) - { - NXChaCha20::XChaCha20Block_Core(_block, _derivedKey, _nonce + 16, _counter); - _blockPos = 0; - _counter++; - if (_counter == 0) - { - memset(_block, 0, kBlockSize); - } - } - - UInt32 remaining = kBlockSize - _blockPos; - UInt32 toProcess = (size < remaining) ? 
size : remaining; - - Byte *dataPtr = data; - const Byte *blockPtr = _block + _blockPos; - UInt32 count = toProcess; - -#ifdef MY_CPU_64BIT - while (count >= 8) - { - *(UInt64 *)dataPtr ^= *(const UInt64 *)blockPtr; - dataPtr += 8; - blockPtr += 8; - count -= 8; - } -#endif - - while (count >= 4) - { - *(UInt32 *)dataPtr ^= *(const UInt32 *)blockPtr; - dataPtr += 4; - blockPtr += 4; - count -= 4; - } - - while (count--) - *dataPtr++ ^= *blockPtr++; - - data += toProcess; - size -= toProcess; - _blockPos += toProcess; - } -} - -Z7_COM7F_IMF(CBaseCoder::CryptoSetPassword(const Byte *data, UInt32 size)) -{ - COM_TRY_BEGIN - - _key.Password.Wipe(); - _key.Password.CopyFrom(data, (size_t)size); - _derivedKeyValid = false; - return S_OK; - - COM_TRY_END -} - Z7_COM7F_IMF(CBaseCoder::Init()) { COM_TRY_BEGIN @@ -617,7 +735,12 @@ Z7_COM7F_IMF(CDecoder::CryptoAuthVerify(Int32 *result)) Byte computedTag[kTagSize]; _poly1305.Final(computedTag); - _authResult = (memcmp(computedTag, _expectedTag, kTagSize) == 0) ? 0 : 1; + { + volatile Byte diff = 0; + for (unsigned i = 0; i < kTagSize; i++) + diff |= computedTag[i] ^ _expectedTag[i]; + _authResult = (diff == 0) ? 
0 : 1; + } *result = _authResult; Z7_memset_0_ARRAY(computedTag); diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.h b/CPP/7zip/Crypto/XChaCha20Poly1305.h index 104f91520..57a84ac9c 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.h +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.h @@ -1,6 +1,4 @@ // XChaCha20Poly1305.h -// XChaCha20-Poly1305 AEAD coder for 7z format -// Reuses XChaCha20 stream cipher + adds Poly1305 MAC authentication #ifndef ZIP7_INC_CRYPTO_XCHACHA20_POLY1305_H #define ZIP7_INC_CRYPTO_XCHACHA20_POLY1305_H @@ -15,10 +13,8 @@ namespace NCrypto { namespace NXChaCha20Poly1305 { -using NXChaCha20::CBase; using NXChaCha20::kNonceSize; using NXChaCha20::k_NumCyclesPower_Supported_MAX; -using N7zKeyDerivation::kKeySize; const unsigned kTagSize = 16; const unsigned kPolyKeySize = 32; @@ -35,6 +31,8 @@ class CPoly1305 Byte _aadBlock[16]; unsigned _aadBlockPos; UInt64 _aadLen; + + void PadAndProcessBlock(Byte *buf, unsigned bufPos, UInt64 len); public: CPoly1305(); void SetKey(const Byte *key); @@ -45,12 +43,8 @@ class CPoly1305 }; class CBaseCoder: - public ICompressFilter, - public ICryptoSetPassword, - public CMyUnknownImp, - public NXChaCha20::CBase + public NXChaCha20::CBaseCoder { - Z7_IFACE_COM7_IMP(ICryptoSetPassword) Z7_COM7F_IMP(Init()) protected: virtual ~CBaseCoder() @@ -59,17 +53,11 @@ class CBaseCoder: Z7_memset_0_ARRAY(_aad); } - static const unsigned kBlockSize = 64; - Byte _block[kBlockSize]; - unsigned _blockPos; - Byte _derivedKey[kKeySize]; Byte _polyKey[kPolyKeySize]; - bool _derivedKeyValid; CPoly1305 _poly1305; Byte _aad[2 + 16 + kNonceSize]; unsigned _aadSize; - void ProcessData(Byte *data, UInt32 size); void DeriveKey(); void ComputePolyKey(); }; From d6db9367d6f45a04c8b7209504ba5bbe4b6a44c6 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Wed, 3 Jun 2026 08:49:35 +0800 Subject: [PATCH 08/18] Resolved crash issues on certain platforms. 
--- CPP/7zip/Crypto/XChaCha20.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CPP/7zip/Crypto/XChaCha20.cpp b/CPP/7zip/Crypto/XChaCha20.cpp index d45c97853..7932ae522 100644 --- a/CPP/7zip/Crypto/XChaCha20.cpp +++ b/CPP/7zip/Crypto/XChaCha20.cpp @@ -315,7 +315,7 @@ void CBaseCoder::ProcessData(Byte *data, UInt32 size) const Byte *blockPtr = _block + _blockPos; UInt32 count = toProcess; -#ifdef MY_CPU_64BIT +#ifdef MY_CPU_LE_UNALIGN_64 while (count >= 8) { *(UInt64 *)dataPtr ^= *(const UInt64 *)blockPtr; @@ -325,6 +325,7 @@ void CBaseCoder::ProcessData(Byte *data, UInt32 size) } #endif +#ifdef MY_CPU_LE_UNALIGN while (count >= 4) { *(UInt32 *)dataPtr ^= *(const UInt32 *)blockPtr; @@ -332,6 +333,7 @@ void CBaseCoder::ProcessData(Byte *data, UInt32 size) blockPtr += 4; count -= 4; } +#endif while (count--) *dataPtr++ ^= *blockPtr++; From 13883b8254f10481559b52ac215af162e814765b Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:06:26 +0800 Subject: [PATCH 09/18] Add cascaded key derivation, cascaded encryption algorithm. 
--- CPP/7zip/7zip.mak | 2 +- CPP/7zip/7zip_gcc.mak | 12 + CPP/7zip/Archive/7z/7zHandler.cpp | 22 +- CPP/7zip/Archive/7z/7zHandlerOut.cpp | 15 +- CPP/7zip/Archive/7z/7zHeader.h | 2 + CPP/7zip/Archive/7z/7zItem.h | 2 +- CPP/7zip/Bundles/Alone/Alone.dsp | 47 + CPP/7zip/Bundles/Alone/makefile | 3 + CPP/7zip/Bundles/Alone/makefile.gcc | 6 + CPP/7zip/Bundles/Alone7z/Alone.dsp | 16 + CPP/7zip/Bundles/Alone7z/makefile | 3 + CPP/7zip/Bundles/Alone7z/makefile.gcc | 6 + CPP/7zip/Bundles/Format7z/makefile | 3 + CPP/7zip/Bundles/Format7zExtract/makefile | 3 + CPP/7zip/Bundles/Format7zF/Arc.mak | 10 +- CPP/7zip/Bundles/Format7zF/Arc_gcc.mak | 6 + CPP/7zip/Bundles/Format7zF/Format7z.dsp | 36 + CPP/7zip/Bundles/SFXCon/SFXCon.dsp | 16 + CPP/7zip/Bundles/SFXCon/makefile | 8 + CPP/7zip/Bundles/SFXCon/makefile.gcc | 5 + CPP/7zip/Bundles/SFXWin/SFXWin.dsp | 16 + CPP/7zip/Bundles/SFXWin/makefile | 8 + CPP/7zip/Crypto/7zKeyDerivation.cpp | 22 +- CPP/7zip/Crypto/7zKeyDerivation.h | 18 +- CPP/7zip/Crypto/Ascon.cpp | 162 +++ CPP/7zip/Crypto/Ascon.h | 31 + CPP/7zip/Crypto/AsconSimd.h | 110 ++ CPP/7zip/Crypto/Cascade.cpp | 1203 +++++++++++++++++ CPP/7zip/Crypto/Cascade.h | 266 ++++ CPP/7zip/Crypto/CascadeRegister.cpp | 29 + CPP/7zip/Crypto/ChaCha20Simd.h | 3 +- CPP/7zip/Crypto/HkdfBlake2sp.cpp | 114 ++ CPP/7zip/Crypto/HkdfBlake2sp.h | 21 + CPP/7zip/Crypto/HmacSha512.cpp | 55 + CPP/7zip/Crypto/HmacSha512.h | 29 + CPP/7zip/Crypto/Pbkdf2HmacSha512.cpp | 63 + CPP/7zip/Crypto/Pbkdf2HmacSha512.h | 20 + CPP/7zip/Crypto/XChaCha20.cpp | 40 +- CPP/7zip/Crypto/XChaCha20.h | 2 + CPP/7zip/Crypto/XChaCha20Poly1305.cpp | 74 +- CPP/7zip/Crypto/XChaCha20Poly1305.h | 3 + CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp | 2 + CPP/7zip/Crypto/XChaCha20Register.cpp | 2 + CPP/7zip/Sha512.mak | 6 + CPP/7zip/UI/GUI/CompressDialog.cpp | 16 +- CPP/7zip/UI/GUI/CompressDialog.rc | 4 +- DOC/Methods.txt | 8 +- 47 files changed, 2451 insertions(+), 99 deletions(-) create mode 100644 CPP/7zip/Crypto/Ascon.cpp create mode 
100644 CPP/7zip/Crypto/Ascon.h create mode 100644 CPP/7zip/Crypto/AsconSimd.h create mode 100644 CPP/7zip/Crypto/Cascade.cpp create mode 100644 CPP/7zip/Crypto/Cascade.h create mode 100644 CPP/7zip/Crypto/CascadeRegister.cpp create mode 100644 CPP/7zip/Crypto/HkdfBlake2sp.cpp create mode 100644 CPP/7zip/Crypto/HkdfBlake2sp.h create mode 100644 CPP/7zip/Crypto/HmacSha512.cpp create mode 100644 CPP/7zip/Crypto/HmacSha512.h create mode 100644 CPP/7zip/Crypto/Pbkdf2HmacSha512.cpp create mode 100644 CPP/7zip/Crypto/Pbkdf2HmacSha512.h create mode 100644 CPP/7zip/Sha512.mak diff --git a/CPP/7zip/7zip.mak b/CPP/7zip/7zip.mak index 8bf7e73e0..8e8cf246a 100644 --- a/CPP/7zip/7zip.mak +++ b/CPP/7zip/7zip.mak @@ -231,7 +231,7 @@ $(C_OBJS): ../../../../C/$(*B).c {../../Compress}.cpp{$O}.obj:: $(COMPLB_O2) {../../Crypto}.cpp{$O}.obj:: - $(COMPLB_O2) + $(COMPLB_O2) -utf-8 {../../../../C}.c{$O}.obj:: $(CCOMPLB) diff --git a/CPP/7zip/7zip_gcc.mak b/CPP/7zip/7zip_gcc.mak index 4ad4db6c3..37a2d8674 100644 --- a/CPP/7zip/7zip_gcc.mak +++ b/CPP/7zip/7zip_gcc.mak @@ -829,16 +829,28 @@ $O/7zAesRegister.o: ../../Crypto/7zAesRegister.cpp $(CXX) $(CXXFLAGS) $< $O/7zKeyDerivation.o: ../../Crypto/7zKeyDerivation.cpp $(CXX) $(CXXFLAGS) $< +$O/Ascon.o: ../../Crypto/Ascon.cpp + $(CXX) $(CXXFLAGS) $< +$O/Cascade.o: ../../Crypto/Cascade.cpp + $(CXX) $(CXXFLAGS) $< +$O/CascadeRegister.o: ../../Crypto/CascadeRegister.cpp + $(CXX) $(CXXFLAGS) $< +$O/HkdfBlake2sp.o: ../../Crypto/HkdfBlake2sp.cpp + $(CXX) $(CXXFLAGS) $< $O/HmacSha1.o: ../../Crypto/HmacSha1.cpp $(CXX) $(CXXFLAGS) $< $O/HmacSha256.o: ../../Crypto/HmacSha256.cpp $(CXX) $(CXXFLAGS) $< +$O/HmacSha512.o: ../../Crypto/HmacSha512.cpp + $(CXX) $(CXXFLAGS) $< $O/MyAes.o: ../../Crypto/MyAes.cpp $(CXX) $(CXXFLAGS) $< $O/MyAesReg.o: ../../Crypto/MyAesReg.cpp $(CXX) $(CXXFLAGS) $< $O/Pbkdf2HmacSha1.o: ../../Crypto/Pbkdf2HmacSha1.cpp $(CXX) $(CXXFLAGS) $< +$O/Pbkdf2HmacSha512.o: ../../Crypto/Pbkdf2HmacSha512.cpp + $(CXX) $(CXXFLAGS) $< $O/RandGen.o: 
../../Crypto/RandGen.cpp $(CXX) $(CXXFLAGS) $< $O/Rar20Crypto.o: ../../Crypto/Rar20Crypto.cpp diff --git a/CPP/7zip/Archive/7z/7zHandler.cpp b/CPP/7zip/Archive/7z/7zHandler.cpp index 0da02f57e..46c15c918 100644 --- a/CPP/7zip/Archive/7z/7zHandler.cpp +++ b/CPP/7zip/Archive/7z/7zHandler.cpp @@ -307,7 +307,7 @@ bool CHandler::IsFolderEncrypted(CNum folderIndex) const for (unsigned j = 0; j < idSize; j++) id64 = ((id64 << 8) | longID[j]); inByte.SkipDataNoCheck(idSize); - if (id64 == k_AES || id64 == k_XCHACHA20 || id64 == k_XCHACHA20_POLY1305) + if (id64 == k_AES || id64 == k_XCHACHA20 || id64 == k_XCHACHA20_POLY1305 || id64 == k_AES_XCHACHA20_ASCON || id64 == k_AES_XCHACHA20_POLY1305) return true; if ((mainByte & 0x20) != 0) inByte.SkipDataNoCheck(inByte.ReadNum()); @@ -525,6 +525,26 @@ HRESULT CHandler::SetMethodToProp(CNum folderIndex, PROPVARIANT *prop) const ConvertUInt32ToString(numCyclesPower, s); } } + else if (id == k_AES_XCHACHA20_ASCON) + { + name = "AES+XChaCha20+Ascon"; + if (propsSize >= 1) + { + const Byte firstByte = props[0]; + const UInt32 numCyclesPower = firstByte & 0x3F; + ConvertUInt32ToString(numCyclesPower, s); + } + } + else if (id == k_AES_XCHACHA20_POLY1305) + { + name = "AES+XChaCha20-Poly1305"; + if (propsSize >= 1) + { + const Byte firstByte = props[0]; + const UInt32 numCyclesPower = firstByte & 0x3F; + ConvertUInt32ToString(numCyclesPower, s); + } + } } if (name) diff --git a/CPP/7zip/Archive/7z/7zHandlerOut.cpp b/CPP/7zip/Archive/7z/7zHandlerOut.cpp index 3747b294c..b2cd3af7d 100644 --- a/CPP/7zip/Archive/7z/7zHandlerOut.cpp +++ b/CPP/7zip/Archive/7z/7zHandlerOut.cpp @@ -1030,13 +1030,20 @@ HRESULT COutHandler::SetProperty(const wchar_t *nameSpec, const PROPVARIANT &val { if (value.vt != VT_BSTR) return E_INVALIDARG; - const wchar_t *m = value.bstrVal; - if (StringsAreEqualNoCase_Ascii(m, "AES256") || StringsAreEqualNoCase_Ascii(m, "AES-256")) + UString m = value.bstrVal; + m.RemoveChar(L'-'); + m.RemoveChar(L'+'); + 
m.MakeLower_Ascii(); + if (m.IsEqualTo("aes256")) _encryptionMethodId = k_AES; - else if (StringsAreEqualNoCase_Ascii(m, "XChaCha20")) + else if (m.IsEqualTo("xchacha20")) _encryptionMethodId = k_XCHACHA20; - else if (StringsAreEqualNoCase_Ascii(m, "XChaCha20Poly1305") || StringsAreEqualNoCase_Ascii(m, "XChaCha20-Poly1305")) + else if (m.IsEqualTo("xchacha20poly1305")) _encryptionMethodId = k_XCHACHA20_POLY1305; + else if (m.IsEqualTo("axa") || m.IsEqualTo("aesxchacha20ascon")) + _encryptionMethodId = k_AES_XCHACHA20_ASCON; + else if (m.IsEqualTo("axp") || m.IsEqualTo("aesxchacha20poly1305")) + _encryptionMethodId = k_AES_XCHACHA20_POLY1305; else return E_INVALIDARG; return S_OK; diff --git a/CPP/7zip/Archive/7z/7zHeader.h b/CPP/7zip/Archive/7z/7zHeader.h index fffb327ef..9fe4f5571 100644 --- a/CPP/7zip/Archive/7z/7zHeader.h +++ b/CPP/7zip/Archive/7z/7zHeader.h @@ -125,6 +125,8 @@ const UInt32 k_SPARC = 0x3030805; const UInt32 k_AES = 0x6F10701; const UInt32 k_XCHACHA20 = 0x6F10702; const UInt32 k_XCHACHA20_POLY1305 = 0x6F10703; +const UInt32 k_AES_XCHACHA20_POLY1305 = 0x6F10704; +const UInt32 k_AES_XCHACHA20_ASCON = 0x6F10705; // const UInt32 k_ZSTD = 0x4015D; // winzip zstd // 0x4F71101, 7z-zstd diff --git a/CPP/7zip/Archive/7z/7zItem.h b/CPP/7zip/Archive/7z/7zItem.h index 65eba5c3a..8e785195c 100644 --- a/CPP/7zip/Archive/7z/7zItem.h +++ b/CPP/7zip/Archive/7z/7zItem.h @@ -85,7 +85,7 @@ struct CFolder FOR_VECTOR(i, Coders) { CMethodId id = Coders[i].MethodID; - if (id == k_AES || id == k_XCHACHA20 || id == k_XCHACHA20_POLY1305) + if (id == k_AES || id == k_XCHACHA20 || id == k_XCHACHA20_POLY1305 || id == k_AES_XCHACHA20_ASCON || id == k_AES_XCHACHA20_POLY1305) return true; } return false; diff --git a/CPP/7zip/Bundles/Alone/Alone.dsp b/CPP/7zip/Bundles/Alone/Alone.dsp index 489b20c58..bc7bede16 100644 --- a/CPP/7zip/Bundles/Alone/Alone.dsp +++ b/CPP/7zip/Bundles/Alone/Alone.dsp @@ -1971,6 +1971,28 @@ SOURCE=..\..\Crypto\HmacSha1.h # End Source File # Begin Source 
File +SOURCE=..\..\Crypto\HmacSha512.cpp + +!IF "$(CFG)" == "Alone - Win32 Release" + +# ADD CPP /O2 +# SUBTRACT CPP /YX /Yc /Yu + +!ELSEIF "$(CFG)" == "Alone - Win32 Debug" + +!ELSEIF "$(CFG)" == "Alone - Win32 ReleaseU" + +!ELSEIF "$(CFG)" == "Alone - Win32 DebugU" + +!ENDIF + +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\HmacSha512.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\MyAes.cpp !IF "$(CFG)" == "Alone - Win32 Release" @@ -2022,6 +2044,31 @@ SOURCE=..\..\Crypto\Pbkdf2HmacSha1.h # End Source File # Begin Source File +SOURCE=..\..\Crypto\Pbkdf2HmacSha512.cpp + +!IF "$(CFG)" == "Alone - Win32 Release" + +# ADD CPP /O2 +# SUBTRACT CPP /YX /Yc /Yu + +!ELSEIF "$(CFG)" == "Alone - Win32 Debug" + +!ELSEIF "$(CFG)" == "Alone - Win32 ReleaseU" + +# ADD CPP /O2 +# SUBTRACT CPP /YX /Yc /Yu + +!ELSEIF "$(CFG)" == "Alone - Win32 DebugU" + +!ENDIF + +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Pbkdf2HmacSha512.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\RandGen.cpp # End Source File # Begin Source File diff --git a/CPP/7zip/Bundles/Alone/makefile b/CPP/7zip/Bundles/Alone/makefile index 67a4d3304..8905c21bd 100644 --- a/CPP/7zip/Bundles/Alone/makefile +++ b/CPP/7zip/Bundles/Alone/makefile @@ -188,9 +188,11 @@ CRYPTO_OBJS = \ $O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ $O\HmacSha1.obj \ + $O\HmacSha512.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ $O\Pbkdf2HmacSha1.obj \ + $O\Pbkdf2HmacSha512.obj \ $O\RandGen.obj \ $O\WzAes.obj \ $O\ZipCrypto.obj \ @@ -241,6 +243,7 @@ C_OBJS = \ !include "../../LzmaDec.mak" !include "../../Sha1.mak" !include "../../Sha256.mak" +!include "../../Sha512.mak" !include "../../Sort.mak" !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/Alone/makefile.gcc b/CPP/7zip/Bundles/Alone/makefile.gcc index 2ae3e701f..8c4f60401 100644 --- a/CPP/7zip/Bundles/Alone/makefile.gcc +++ b/CPP/7zip/Bundles/Alone/makefile.gcc @@ -128,6 +128,8 @@ COMMON_OBJS = \ $O/Sha1Reg.o \ 
$O/Sha256Prepare.o \ $O/Sha256Reg.o \ + $O/Sha512Prepare.o \ + $O/Sha512Reg.o \ $O/StdInStream.o \ $O/StdOutStream.o \ $O/StringConvert.o \ @@ -279,9 +281,11 @@ CRYPTO_OBJS = \ $O/7zAesRegister.o \ $O/7zKeyDerivation.o \ $O/HmacSha1.o \ + $O/HmacSha512.o \ $O/MyAes.o \ $O/MyAesReg.o \ $O/Pbkdf2HmacSha1.o \ + $O/Pbkdf2HmacSha512.o \ $O/RandGen.o \ $O/WzAes.o \ $O/ZipCrypto.o \ @@ -321,6 +325,8 @@ C_OBJS = \ $O/Sha1Opt.o \ $O/Sha256.o \ $O/Sha256Opt.o \ + $O/Sha512.o \ + $O/Sha512Opt.o \ $O/Sort.o \ $O/SwapBytes.o \ $O/Xxh64.o \ diff --git a/CPP/7zip/Bundles/Alone7z/Alone.dsp b/CPP/7zip/Bundles/Alone7z/Alone.dsp index c4ef3198f..755bb4856 100644 --- a/CPP/7zip/Bundles/Alone7z/Alone.dsp +++ b/CPP/7zip/Bundles/Alone7z/Alone.dsp @@ -2075,6 +2075,22 @@ SOURCE=..\..\Crypto\7zKeyDerivation.h # End Source File # Begin Source File +SOURCE=..\..\Crypto\HmacSha512.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\HmacSha512.h +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Pbkdf2HmacSha512.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Pbkdf2HmacSha512.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\MyAes.cpp # End Source File # Begin Source File diff --git a/CPP/7zip/Bundles/Alone7z/makefile b/CPP/7zip/Bundles/Alone7z/makefile index 5fc6f7a97..173f02d66 100644 --- a/CPP/7zip/Bundles/Alone7z/makefile +++ b/CPP/7zip/Bundles/Alone7z/makefile @@ -126,6 +126,8 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ + $O\HmacSha512.obj \ + $O\Pbkdf2HmacSha512.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ $O\RandGen.obj \ @@ -164,6 +166,7 @@ C_OBJS = \ !include "../../LzFindOpt.mak" !include "../../LzmaDec.mak" !include "../../Sha256.mak" +!include "../../Sha512.mak" !include "../../Sort.mak" !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/Alone7z/makefile.gcc b/CPP/7zip/Bundles/Alone7z/makefile.gcc index 5a20ab34c..91dfcd913 100644 --- a/CPP/7zip/Bundles/Alone7z/makefile.gcc +++ 
b/CPP/7zip/Bundles/Alone7z/makefile.gcc @@ -124,6 +124,8 @@ COMMON_OBJS = \ $O/NewHandler.o \ $O/Sha256Prepare.o \ $O/Sha256Reg.o \ + $O/Sha512Prepare.o \ + $O/Sha512Reg.o \ $O/StdInStream.o \ $O/StdOutStream.o \ $O/StringConvert.o \ @@ -221,6 +223,8 @@ CRYPTO_OBJS = \ $O/7zAes.o \ $O/7zAesRegister.o \ $O/7zKeyDerivation.o \ + $O/HmacSha512.o \ + $O/Pbkdf2HmacSha512.o \ $O/MyAes.o \ $O/MyAesReg.o \ $O/RandGen.o \ @@ -249,6 +253,8 @@ C_OBJS = \ $O/MtDec.o \ $O/Sha256.o \ $O/Sha256Opt.o \ + $O/Sha512.o \ + $O/Sha512Opt.o \ $O/SwapBytes.o \ $O/Xz.o \ $O/XzDec.o \ diff --git a/CPP/7zip/Bundles/Format7z/makefile b/CPP/7zip/Bundles/Format7z/makefile index 551d2c87b..62b76dd91 100644 --- a/CPP/7zip/Bundles/Format7z/makefile +++ b/CPP/7zip/Bundles/Format7z/makefile @@ -108,6 +108,8 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ + $O\HmacSha512.obj \ + $O\Pbkdf2HmacSha512.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ $O\RandGen.obj \ @@ -144,6 +146,7 @@ C_OBJS = \ !include "../../LzFindOpt.mak" !include "../../LzmaDec.mak" !include "../../Sha256.mak" +!include "../../Sha512.mak" !include "../../Sort.mak" !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/Format7zExtract/makefile b/CPP/7zip/Bundles/Format7zExtract/makefile index 4aeeaf412..c03da305b 100644 --- a/CPP/7zip/Bundles/Format7zExtract/makefile +++ b/CPP/7zip/Bundles/Format7zExtract/makefile @@ -88,6 +88,8 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ + $O\HmacSha512.obj \ + $O\Pbkdf2HmacSha512.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ @@ -113,5 +115,6 @@ C_OBJS = \ !include "../../Crc.mak" !include "../../LzmaDec.mak" !include "../../Sha256.mak" +!include "../../Sha512.mak" !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/Format7zF/Arc.mak b/CPP/7zip/Bundles/Format7zF/Arc.mak index 1da304762..533da499b 100644 --- a/CPP/7zip/Bundles/Format7zF/Arc.mak +++ b/CPP/7zip/Bundles/Format7zF/Arc.mak @@ -14,7 +14,6 @@ COMMON_OBJS = \ $O\Sha256Reg.obj \ 
$O\Sha3Reg.obj \ $O\Sha512Reg.obj \ - $O\Sha512Prepare.obj \ $O\StringConvert.obj \ $O\StringToInt.obj \ $O\UTFConvert.obj \ @@ -245,15 +244,21 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ + $O\Ascon.obj \ + $O\Cascade.obj \ + $O\CascadeRegister.obj \ + $O\HkdfBlake2sp.obj \ $O\XChaCha20.obj \ $O\XChaCha20Register.obj \ $O\XChaCha20Poly1305.obj \ $O\XChaCha20Poly1305Register.obj \ $O\HmacSha1.obj \ $O\HmacSha256.obj \ + $O\HmacSha512.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ $O\Pbkdf2HmacSha1.obj \ + $O\Pbkdf2HmacSha512.obj \ $O\RandGen.obj \ $O\Rar20Crypto.obj \ $O\Rar5Aes.obj \ @@ -294,8 +299,6 @@ C_OBJS = \ $O\Ppmd8Dec.obj \ $O\Ppmd8Enc.obj \ $O\Sha3.obj \ - $O\Sha512.obj \ - $O\Sha512Opt.obj \ $O\SwapBytes.obj \ $O\Threads.obj \ $O\Xxh64.obj \ @@ -312,4 +315,5 @@ C_OBJS = \ !include "../../LzmaDec.mak" !include "../../Sha1.mak" !include "../../Sha256.mak" +!include "../../Sha512.mak" !include "../../Sort.mak" diff --git a/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak b/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak index b856b9e92..4ca3ae63c 100644 --- a/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak +++ b/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak @@ -299,11 +299,17 @@ CRYPTO_OBJS = \ $O/7zAes.o \ $O/7zAesRegister.o \ $O/7zKeyDerivation.o \ + $O/Ascon.o \ + $O/Cascade.o \ + $O/CascadeRegister.o \ + $O/HkdfBlake2sp.o \ $O/HmacSha1.o \ $O/HmacSha256.o \ + $O/HmacSha512.o \ $O/MyAes.o \ $O/MyAesReg.o \ $O/Pbkdf2HmacSha1.o \ + $O/Pbkdf2HmacSha512.o \ $O/RandGen.o \ $O/WzAes.o \ $O/ZipCrypto.o \ diff --git a/CPP/7zip/Bundles/Format7zF/Format7z.dsp b/CPP/7zip/Bundles/Format7zF/Format7z.dsp index 55cd72d6c..7243f7ec0 100644 --- a/CPP/7zip/Bundles/Format7zF/Format7z.dsp +++ b/CPP/7zip/Bundles/Format7zF/Format7z.dsp @@ -1201,6 +1201,24 @@ SOURCE=..\..\Crypto\HmacSha256.h # End Source File # Begin Source File +SOURCE=..\..\Crypto\HmacSha512.cpp + +!IF "$(CFG)" == "7z - Win32 Release" + +# ADD CPP /O2 +# SUBTRACT CPP /YX /Yc /Yu + +!ELSEIF "$(CFG)" == "7z - Win32 
Debug" + +!ENDIF + +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\HmacSha512.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\MyAes.cpp !IF "$(CFG)" == "7z - Win32 Release" @@ -1251,6 +1269,24 @@ SOURCE=..\..\Crypto\Pbkdf2HmacSha1.h # End Source File # Begin Source File +SOURCE=..\..\Crypto\Pbkdf2HmacSha512.cpp + +!IF "$(CFG)" == "7z - Win32 Release" + +# ADD CPP /O2 +# SUBTRACT CPP /YX /Yc /Yu + +!ELSEIF "$(CFG)" == "7z - Win32 Debug" + +!ENDIF + +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Pbkdf2HmacSha512.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\RandGen.cpp !IF "$(CFG)" == "7z - Win32 Release" diff --git a/CPP/7zip/Bundles/SFXCon/SFXCon.dsp b/CPP/7zip/Bundles/SFXCon/SFXCon.dsp index b6392ceb0..5bb1c5058 100644 --- a/CPP/7zip/Bundles/SFXCon/SFXCon.dsp +++ b/CPP/7zip/Bundles/SFXCon/SFXCon.dsp @@ -365,6 +365,22 @@ SOURCE=..\..\Crypto\7zKeyDerivation.h # End Source File # Begin Source File +SOURCE=..\..\Crypto\HmacSha512.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\HmacSha512.h +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Pbkdf2HmacSha512.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Pbkdf2HmacSha512.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\MyAes.cpp # End Source File # Begin Source File diff --git a/CPP/7zip/Bundles/SFXCon/makefile b/CPP/7zip/Bundles/SFXCon/makefile index 5e98a11f5..ffd0a14de 100644 --- a/CPP/7zip/Bundles/SFXCon/makefile +++ b/CPP/7zip/Bundles/SFXCon/makefile @@ -110,6 +110,12 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ + $O\HkdfBlake2sp.obj \ + $O\HmacSha512.obj \ + $O\Pbkdf2HmacSha512.obj \ + $O\Ascon.obj \ + $O\Cascade.obj \ + $O\CascadeRegister.obj \ $O\MyAes.obj \ $O\XChaCha20.obj \ $O\XChaCha20Register.obj \ @@ -119,6 +125,7 @@ CRYPTO_OBJS = \ C_OBJS = \ $O\7zStream.obj \ $O\Alloc.obj \ + $O\Blake2s.obj \ $O\Bcj2.obj \ $O\Bra.obj \ $O\Bra86.obj \ @@ 
-138,5 +145,6 @@ C_OBJS = \ !include "../../Crc.mak" !include "../../LzmaDec.mak" !include "../../Sha256.mak" +!include "../../Sha512.mak" !include "../../7zip.mak" diff --git a/CPP/7zip/Bundles/SFXCon/makefile.gcc b/CPP/7zip/Bundles/SFXCon/makefile.gcc index eb4f39fc5..d9b996e7e 100644 --- a/CPP/7zip/Bundles/SFXCon/makefile.gcc +++ b/CPP/7zip/Bundles/SFXCon/makefile.gcc @@ -90,6 +90,7 @@ COMMON_OBJS = \ $O/MyVector.o \ $O/NewHandler.o \ $O/Sha256Prepare.o \ + $O/Sha512Prepare.o \ $O/StdInStream.o \ $O/StdOutStream.o \ $O/StringConvert.o \ @@ -172,6 +173,8 @@ CRYPTO_OBJS = \ $O/7zAes.o \ $O/7zAesRegister.o \ $O/7zKeyDerivation.o \ + $O/HmacSha512.o \ + $O/Pbkdf2HmacSha512.o \ $O/MyAes.o \ C_OBJS = \ @@ -191,6 +194,8 @@ C_OBJS = \ $O/Ppmd7Dec.o \ $O/Sha256.o \ $O/Sha256Opt.o \ + $O/Sha512.o \ + $O/Sha512Opt.o \ $O/7zCrc.o \ $O/7zCrcOpt.o \ $O/Aes.o \ diff --git a/CPP/7zip/Bundles/SFXWin/SFXWin.dsp b/CPP/7zip/Bundles/SFXWin/SFXWin.dsp index 2e7af3899..2af028627 100644 --- a/CPP/7zip/Bundles/SFXWin/SFXWin.dsp +++ b/CPP/7zip/Bundles/SFXWin/SFXWin.dsp @@ -321,6 +321,22 @@ SOURCE=..\..\Crypto\7zKeyDerivation.h # End Source File # Begin Source File +SOURCE=..\..\Crypto\HmacSha512.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\HmacSha512.h +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Pbkdf2HmacSha512.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Pbkdf2HmacSha512.h +# End Source File +# Begin Source File + SOURCE=..\..\Crypto\MyAes.cpp # End Source File # Begin Source File diff --git a/CPP/7zip/Bundles/SFXWin/makefile b/CPP/7zip/Bundles/SFXWin/makefile index 9b9738e0f..744ff5ea6 100644 --- a/CPP/7zip/Bundles/SFXWin/makefile +++ b/CPP/7zip/Bundles/SFXWin/makefile @@ -132,6 +132,12 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ + $O\HkdfBlake2sp.obj \ + $O\HmacSha512.obj \ + $O\Pbkdf2HmacSha512.obj \ + $O\Ascon.obj \ + $O\Cascade.obj \ + $O\CascadeRegister.obj \ $O\MyAes.obj \ 
$O\XChaCha20.obj \ $O\XChaCha20Register.obj \ @@ -141,6 +147,7 @@ CRYPTO_OBJS = \ C_OBJS = \ $O\7zStream.obj \ $O\Alloc.obj \ + $O\Blake2s.obj \ $O\Bcj2.obj \ $O\Bra.obj \ $O\Bra86.obj \ @@ -160,5 +167,6 @@ C_OBJS = \ !include "../../Crc.mak" !include "../../LzmaDec.mak" !include "../../Sha256.mak" +!include "../../Sha512.mak" !include "../../7zip.mak" diff --git a/CPP/7zip/Crypto/7zKeyDerivation.cpp b/CPP/7zip/Crypto/7zKeyDerivation.cpp index 02d8f8692..96bed8698 100644 --- a/CPP/7zip/Crypto/7zKeyDerivation.cpp +++ b/CPP/7zip/Crypto/7zKeyDerivation.cpp @@ -9,6 +9,7 @@ #include "../../Common/MyBuffer2.h" #include "7zKeyDerivation.h" +#include "Pbkdf2HmacSha512.h" namespace NCrypto { namespace N7zKeyDerivation { @@ -23,6 +24,8 @@ static bool ConstantTimeCompare(const Byte *a, const Byte *b, size_t size) bool CKeyInfo::IsEqualTo(const CKeyInfo &a) const { + if (DerivMode != a.DerivMode) + return false; if (SaltSize != a.SaltSize || NumCyclesPower != a.NumCyclesPower) return false; if (!ConstantTimeCompare(Salt, a.Salt, SaltSize)) @@ -34,7 +37,18 @@ bool CKeyInfo::IsEqualTo(const CKeyInfo &a) const void CKeyInfo::CalcKey() { - if (NumCyclesPower == 0x3F) + if (DerivMode == kDeriv_Cascade) + { + // PBKDF2-HMAC-SHA512, output 96 bytes + const UInt32 numIterations = (NumCyclesPower == 0x3F) ? 
+ 1 : (UInt32)1 << NumCyclesPower; + NSha512::Pbkdf2Hmac( + Password, Password.Size(), + Salt, SaltSize, + numIterations, + CascadeKey, kCascadeKeySize); + } + else if (NumCyclesPower == 0x3F) { unsigned pos; for (pos = 0; pos < SaltSize; pos++) @@ -101,8 +115,10 @@ bool CKeyInfoCache::GetKey(CKeyInfo &key) const CKeyInfo &cached = Keys[i]; if (key.IsEqualTo(cached)) { - for (unsigned j = 0; j < kKeySize; j++) - key.Key[j] = cached.Key[j]; + if (cached.DerivMode == kDeriv_Cascade) + memcpy(key.CascadeKey, cached.CascadeKey, kCascadeKeySize); + else + memcpy(key.Key, cached.Key, kKeySize); if (i != 0) Keys.MoveToFront(i); return true; diff --git a/CPP/7zip/Crypto/7zKeyDerivation.h b/CPP/7zip/Crypto/7zKeyDerivation.h index 1ae33638b..1d6882332 100644 --- a/CPP/7zip/Crypto/7zKeyDerivation.h +++ b/CPP/7zip/Crypto/7zKeyDerivation.h @@ -11,7 +11,19 @@ namespace NCrypto { namespace N7zKeyDerivation { const unsigned kKeySize = 32; -const unsigned kSaltSizeMax = 16; +const unsigned kSaltSizeMax = 32; + +// for cascade mode +const unsigned kCascadeKeySize = 96; + +// random salt for cascade mode +const unsigned kCascadeSaltSize = 32; + +enum EDerivationMode +{ + kDeriv_Single = 0, // SHA-256 iterative + kDeriv_Cascade = 1 // PBKDF2-HMAC-SHA512 +}; class CKeyInfo { @@ -21,6 +33,8 @@ class CKeyInfo Byte Salt[kSaltSizeMax]; CByteBuffer Password; Byte Key[kKeySize]; + EDerivationMode DerivMode; + Byte CascadeKey[kCascadeKeySize]; bool IsEqualTo(const CKeyInfo &a) const; void CalcKey(); @@ -30,6 +44,7 @@ class CKeyInfo { NumCyclesPower = 0; SaltSize = 0; + DerivMode = kDeriv_Single; for (unsigned i = 0; i < sizeof(Salt); i++) Salt[i] = 0; } @@ -41,6 +56,7 @@ class CKeyInfo SaltSize = 0; Z7_memset_0_ARRAY(Salt); Z7_memset_0_ARRAY(Key); + Z7_memset_0_ARRAY(CascadeKey); } #ifdef Z7_CPP_IS_SUPPORTED_default diff --git a/CPP/7zip/Crypto/Ascon.cpp b/CPP/7zip/Crypto/Ascon.cpp new file mode 100644 index 000000000..873957c91 --- /dev/null +++ b/CPP/7zip/Crypto/Ascon.cpp @@ -0,0 +1,162 @@ 
+// Ascon.cpp +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#include "StdAfx.h" + +#include "../../../C/CpuArch.h" + +#include "Ascon.h" +#include "AsconSimd.h" + +namespace NCrypto { +namespace NAscon { + +#ifdef MY_CPU_X86_OR_AMD64 + +bool g_SSE2Enabled = false; +bool g_AVX512Enabled = false; +bool g_SIMDInitialized = false; + +#ifdef MY_CPU_AMD64 +static UInt64 Ascon_xgetbv(UInt32 num) +{ +#if defined(_MSC_VER) + return _xgetbv(num); +#elif defined(__GNUC__) || defined(__clang__) + UInt32 a, d; + __asm__ __volatile__("xgetbv" : "=a"(a), "=d"(d) : "c"(num) : "cc"); + return ((UInt64)d << 32) | a; +#endif +} +#endif + +void InitSIMD() +{ + if (g_SIMDInitialized) + return; + g_SIMDInitialized = true; + +#ifdef MY_CPU_AMD64 + g_SSE2Enabled = true; +#else + g_SSE2Enabled = CPU_IsSupported_SSE2() != 0; +#endif + +#ifdef MY_CPU_AMD64 + if (CPU_IsSupported_AVX()) + { + if (z7_x86_cpuid_GetMaxFunc() >= 7) + { + UInt32 d[4]; + z7_x86_cpuid(d, 7); + BoolInt avx512f = (d[1] >> 16) & 1; + BoolInt avx512vl = (d[1] >> 31) & 1; + if (avx512f && avx512vl) + { + const UInt32 bm = (UInt32)Ascon_xgetbv(0); + if ((bm & 0xE0) == 0xE0) + g_AVX512Enabled = true; + } + } + } +#endif +} + +#endif + +#define RC0 0xf0 +#define RC1 0xe1 +#define RC2 0xd2 +#define RC3 0xc3 +#define RC4 0xb4 +#define RC5 0xa5 +#define RC6 0x96 +#define RC7 0x87 +#define RC8 0x78 +#define RC9 0x69 +#define RCa 0x5a +#define RCb 0x4b + +static Z7_FORCE_INLINE void AsconRound(UInt64 *st, UInt64 C) +{ + UInt64 x0 = st[0]; + UInt64 x1 = st[1]; + UInt64 x2 = st[2]; + UInt64 x3 = st[3]; + UInt64 x4 = st[4]; + + x2 ^= C; + x0 ^= x4; + x4 ^= x3; + x2 ^= x1; + + UInt64 t0 = x0 ^ (~x1 & x2); + UInt64 t2 = x2 ^ (~x3 & x4); + UInt64 t4 = x4 ^ (~x0 & x1); + UInt64 t1 = x1 ^ (~x2 & x3); + UInt64 t3 = x3 ^ (~x4 & x0); + t1 ^= t0; + t3 ^= t2; + t0 ^= t4; + t2 = ~t2; + + x0 = t0 ^ ASCON_ROR64(t0, 19) ^ ASCON_ROR64(t0, 28); + x1 = t1 ^ ASCON_ROR64(t1, 61) ^ ASCON_ROR64(t1, 39); + x2 
= t2 ^ ASCON_ROR64(t2, 1) ^ ASCON_ROR64(t2, 6); + x3 = t3 ^ ASCON_ROR64(t3, 10) ^ ASCON_ROR64(t3, 17); + x4 = t4 ^ ASCON_ROR64(t4, 7) ^ ASCON_ROR64(t4, 41); + + st[0] = x0; + st[1] = x1; + st[2] = x2; + st[3] = x3; + st[4] = x4; +} + +void AsconP12(UInt64 state[5]) +{ +#ifdef MY_CPU_AMD64 + InitSIMD(); + if (g_AVX512Enabled) + { + UInt64 st[8] = { state[0], state[1], state[2], state[3], state[4] }; + AsconP12_AVX512(st); + state[0] = st[0]; + state[1] = st[1]; + state[2] = st[2]; + state[3] = st[3]; + state[4] = st[4]; + return; + } +#endif + AsconRound(state, RC0); AsconRound(state, RC1); + AsconRound(state, RC2); AsconRound(state, RC3); + AsconRound(state, RC4); AsconRound(state, RC5); + AsconRound(state, RC6); AsconRound(state, RC7); + AsconRound(state, RC8); AsconRound(state, RC9); + AsconRound(state, RCa); AsconRound(state, RCb); +} + +void AsconP8(UInt64 state[5]) +{ +#ifdef MY_CPU_AMD64 + if (g_AVX512Enabled) + { + UInt64 st[8] = { state[0], state[1], state[2], state[3], state[4] }; + AsconP8_AVX512(st); + state[0] = st[0]; + state[1] = st[1]; + state[2] = st[2]; + state[3] = st[3]; + state[4] = st[4]; + return; + } +#endif + AsconRound(state, RC4); AsconRound(state, RC5); + AsconRound(state, RC6); AsconRound(state, RC7); + AsconRound(state, RC8); AsconRound(state, RC9); + AsconRound(state, RCa); AsconRound(state, RCb); +} + +}} diff --git a/CPP/7zip/Crypto/Ascon.h b/CPP/7zip/Crypto/Ascon.h new file mode 100644 index 000000000..4725db194 --- /dev/null +++ b/CPP/7zip/Crypto/Ascon.h @@ -0,0 +1,31 @@ +// Ascon.h +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#ifndef ZIP7_INC_CRYPTO_ASCON_H +#define ZIP7_INC_CRYPTO_ASCON_H + +#include "../../Common/MyCom.h" + +namespace NCrypto { +namespace NAscon { + +const unsigned kKeySize_Ascon = 16; +const unsigned kNonceSize = 16; +const unsigned kTagSize = 16; +const unsigned kRateSize = 16; + +static const UInt64 kIV_Ascon = 0x00001000808C0001ULL; + +#ifdef _MSC_VER +#include 
+#define ASCON_ROR64(v, n) _rotr64((v), (n)) +#else +#define ASCON_ROR64(v, n) (((v) >> (n)) | ((v) << (64 - (n)))) +#endif + +void AsconP12(UInt64 state[5]); +void AsconP8(UInt64 state[5]); + +}} +#endif diff --git a/CPP/7zip/Crypto/AsconSimd.h b/CPP/7zip/Crypto/AsconSimd.h new file mode 100644 index 000000000..c285ffe70 --- /dev/null +++ b/CPP/7zip/Crypto/AsconSimd.h @@ -0,0 +1,110 @@ +// AsconSimd.h +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#ifndef ZIP7_INC_CRYPTO_ASCON_SIMD_H +#define ZIP7_INC_CRYPTO_ASCON_SIMD_H + +#include "Ascon.h" + +#ifdef MY_CPU_SSE2 +#include +#endif + +namespace NCrypto { +namespace NAscon { + +#ifdef MY_CPU_X86_OR_AMD64 + +extern bool g_SSE2Enabled; +extern bool g_AVX512Enabled; +extern bool g_SIMDInitialized; + +void InitSIMD(); + +#endif + +#ifdef MY_CPU_SSE2 + +static Z7_FORCE_INLINE void AsconEncBlock_SSE2(UInt64 state[5], Byte *data) +{ + __m128i ks = _mm_set_epi64x(state[1], state[0]); + __m128i ct = _mm_xor_si128(_mm_loadu_si128((const __m128i*)data), ks); + _mm_storeu_si128((__m128i*)data, ct); + _mm_storel_epi64((__m128i*)&state[0], ct); + _mm_storel_epi64((__m128i*)&state[1], _mm_srli_si128(ct, 8)); +} + +static Z7_FORCE_INLINE void AsconDecBlock_SSE2(UInt64 state[5], Byte *data) +{ + __m128i ks = _mm_set_epi64x(state[1], state[0]); + __m128i ct = _mm_loadu_si128((const __m128i*)data); + __m128i pt = _mm_xor_si128(ct, ks); + _mm_storeu_si128((__m128i*)data, pt); + _mm_storel_epi64((__m128i*)&state[0], ct); + _mm_storel_epi64((__m128i*)&state[1], _mm_srli_si128(ct, 8)); +} + +#endif + +#ifdef MY_CPU_AMD64 + +#include + +static Z7_FORCE_INLINE void AsconRound_AVX512(UInt64 *st, UInt64 C) +{ + const UInt64 z = 0; + const __mmask8 mxor1 = 0x15; + const __mmask8 mxor2 = 0x0b; + const __m512i pxor1 = _mm512_set_epi64(z, z, z, 3, z, 1, z, 4); + const __m512i pxor2 = _mm512_set_epi64(z, z, z, z, 2, z, 0, 4); + const __m512i rc = _mm512_set_epi64(z, z, z, 0, 0, C, 0, 0); + const __m512i 
neg = _mm512_set_epi64(z, z, z, 0, 0, ~(UInt64)0, 0, 0); + const __m512i pchi1 = _mm512_set_epi64(z, z, z, 0, 4, 3, 2, 1); + const __m512i pchi2 = _mm512_set_epi64(z, z, z, 1, 0, 4, 3, 2); + const __m512i rot1 = _mm512_set_epi64(z, z, z, 7, 10, 1, 61, 19); + const __m512i rot2 = _mm512_set_epi64(z, z, z, 41, 17, 6, 39, 28); + + __m512i s = _mm512_loadu_si512((const void*)st); + __m512i t0, t1, t2; + + t0 = _mm512_maskz_permutexvar_epi64(mxor1, pxor1, s); + t0 = _mm512_ternarylogic_epi64(s, t0, rc, 0x96); + + t1 = _mm512_permutexvar_epi64(pchi1, t0); + t2 = _mm512_permutexvar_epi64(pchi2, t0); + t0 = _mm512_ternarylogic_epi64(t0, t1, t2, 0xd2); + + t1 = _mm512_maskz_permutexvar_epi64(mxor2, pxor2, t0); + t0 = _mm512_ternarylogic_epi64(t0, t1, neg, 0x96); + + t1 = _mm512_rorv_epi64(t0, rot1); + t2 = _mm512_rorv_epi64(t0, rot2); + s = _mm512_ternarylogic_epi64(t0, t1, t2, 0x96); + + _mm512_storeu_si512((void*)st, s); +} + +static Z7_FORCE_INLINE void AsconP12_AVX512(UInt64 state[5]) +{ + AsconRound_AVX512(state, 0xf0); AsconRound_AVX512(state, 0xe1); + AsconRound_AVX512(state, 0xd2); AsconRound_AVX512(state, 0xc3); + AsconRound_AVX512(state, 0xb4); AsconRound_AVX512(state, 0xa5); + AsconRound_AVX512(state, 0x96); AsconRound_AVX512(state, 0x87); + AsconRound_AVX512(state, 0x78); AsconRound_AVX512(state, 0x69); + AsconRound_AVX512(state, 0x5a); AsconRound_AVX512(state, 0x4b); +} + +static Z7_FORCE_INLINE void AsconP8_AVX512(UInt64 state[5]) +{ + AsconRound_AVX512(state, 0xb4); AsconRound_AVX512(state, 0xa5); + AsconRound_AVX512(state, 0x96); AsconRound_AVX512(state, 0x87); + AsconRound_AVX512(state, 0x78); AsconRound_AVX512(state, 0x69); + AsconRound_AVX512(state, 0x5a); AsconRound_AVX512(state, 0x4b); +} + +#endif + +}} + +#endif diff --git a/CPP/7zip/Crypto/Cascade.cpp b/CPP/7zip/Crypto/Cascade.cpp new file mode 100644 index 000000000..57b44e035 --- /dev/null +++ b/CPP/7zip/Crypto/Cascade.cpp @@ -0,0 +1,1203 @@ +// Cascade.cpp +// Copyright (C) fzxx Contributor: 
https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#include "StdAfx.h" + +#include "../../../C/CpuArch.h" +#include "../../../C/Aes.h" + +#include "../../Common/ComTry.h" + +#ifndef Z7_ST +#include "../../Windows/Synchronization.h" +#endif + +#include "../Common/StreamUtils.h" + +#include "Cascade.h" +#include "HkdfBlake2sp.h" +#include "XChaCha20.h" +#include "AsconSimd.h" + +#ifndef Z7_EXTRACT_ONLY +#include "RandGen.h" +#endif + +namespace NCrypto { + +static void XorBytes(Byte *dst, const Byte *src, unsigned len) +{ + Byte *d = dst; + const Byte *s = src; + +#ifdef MY_CPU_LE_UNALIGN_64 + while (len >= 8) + { + *(UInt64 *)d ^= *(const UInt64 *)s; + d += 8; + s += 8; + len -= 8; + } +#endif + +#ifdef MY_CPU_LE_UNALIGN + while (len >= 4) + { + *(UInt32 *)d ^= *(const UInt32 *)s; + d += 4; + s += 4; + len -= 4; + } +#endif + + while (len--) + *d++ ^= *s++; +} + +namespace NAXPCascade { + +static CKeyInfoCache g_AXP_GlobalKeyCache(32); + +#ifndef Z7_ST + static NWindows::NSynchronization::CCriticalSection g_AXP_GlobalKeyCacheCriticalSection; + #define AXP_MT_LOCK NWindows::NSynchronization::CCriticalSectionLock lock(g_AXP_GlobalKeyCacheCriticalSection); +#else + #define AXP_MT_LOCK +#endif + +CAXPBase::CAXPBase(): + _cachedKeys(16), + _keyDerived(false), + _xcBlockPos(64), + _xcCounter(0), + _aadSize(0), + _finalized(false), + _authOk(false) +{ + _key.DerivMode = N7zKeyDerivation::kDeriv_Cascade; + Z7_memset_0_ARRAY(_keyAes); + Z7_memset_0_ARRAY(_aesIv); + Z7_memset_0_ARRAY(_aesKeys); + Z7_memset_0_ARRAY(_keyXChaCha20); + Z7_memset_0_ARRAY(_xcNonce); + Z7_memset_0_ARRAY(_xcDerivedKey); + Z7_memset_0_ARRAY(_xcBlock); + Z7_memset_0_ARRAY(_polyKey); +} + +void CAXPBase::PrepareKey() +{ + AXP_MT_LOCK + + bool finded = false; + if (!_cachedKeys.GetKey(_key)) + { + finded = g_AXP_GlobalKeyCache.GetKey(_key); + if (!finded) + _key.CalcKey(); + _cachedKeys.Add(_key); + } + if (!finded) + g_AXP_GlobalKeyCache.FindAndAdd(_key); + _keyDerived = false; +} + +void 
CAXPBase::DeriveAXPKeys() +{ + NHkdfBlake2sp::Derive( + _key.CascadeKey, kCascadeKeySize, + "AES-key", 7, + _keyAes, 32); + Aes_SetKey_Enc(_aesKeys + 4, _keyAes, 32); + memcpy(_aesKeys, _aesIv, 16); + + NHkdfBlake2sp::Derive( + _key.CascadeKey, kCascadeKeySize, + "XChaCha20-key", 13, + _keyXChaCha20, 32); + NXChaCha20::XHChaCha20Block_Core(_xcDerivedKey, _keyXChaCha20, _xcNonce); + + ComputePolyKey(); + _poly1305.SetKey(_polyKey); + if (_aadSize > 0) + _poly1305.UpdateAad(_aad, _aadSize); + + _xcBlockPos = 64; + _xcCounter = 1; + + _keyDerived = true; + _finalized = false; +} + +void CAXPBase::ComputePolyKey() +{ + Byte polyBlock[64]; + NXChaCha20::XChaCha20Block_Core(polyBlock, _xcDerivedKey, _xcNonce + 16, 0); + memcpy(_polyKey, polyBlock, kPolyKeySize); + Z7_memset_0_ARRAY(polyBlock); +} + +void CAXPBase::AesCtrXorData(Byte *data, UInt32 size) +{ + if (size >= AES_BLOCK_SIZE) + { + UInt32 numBlocks = size >> 4; + AesCtr_Code(_aesKeys, data, numBlocks); + data += numBlocks << 4; + size -= numBlocks << 4; + } + if (size > 0) + { + Byte temp[16]; + memset(temp, 0, 16); + AesCtr_Code(_aesKeys, temp, 1); + for (UInt32 i = 0; i < size; i++) + data[i] ^= temp[i]; + Z7_memset_0_ARRAY(temp); + } +} + +void CAXPBase::XChaCha20XorData(Byte *data, UInt32 size) +{ + while (size > 0) + { + if (_xcBlockPos >= kXcBlockSize) + { + NXChaCha20::XChaCha20Block_Core(_xcBlock, _xcDerivedKey, _xcNonce + 16, _xcCounter); + _xcBlockPos = 0; + _xcCounter++; + } + UInt32 avail = kXcBlockSize - _xcBlockPos; + UInt32 toProcess = (size < avail) ? 
size : avail; + XorBytes(data, _xcBlock + _xcBlockPos, toProcess); + data += toProcess; + size -= toProcess; + _xcBlockPos += toProcess; + } +} + +void CAXPBaseCoder::ProcessEnc(Byte *data, UInt32 size) +{ + if (!_keyDerived) + DeriveAXPKeys(); + + AesCtrXorData(data, size); + XChaCha20XorData(data, size); + _poly1305.Update(data, size); +} + +void CAXPBaseCoder::ProcessDec(Byte *data, UInt32 size) +{ + if (!_keyDerived) + DeriveAXPKeys(); + + _poly1305.Update(data, size); + XChaCha20XorData(data, size); + AesCtrXorData(data, size); +} + +Z7_COM7F_IMF(CAXPBaseCoder::CryptoSetPassword(const Byte *data, UInt32 size)) +{ + COM_TRY_BEGIN + + _key.Password.Wipe(); + _key.Password.CopyFrom(data, (size_t)size); + _keyDerived = false; + return S_OK; + + COM_TRY_END +} + +Z7_COM7F_IMF(CAXPBaseCoder::Init()) +{ + COM_TRY_BEGIN + + PrepareKey(); + _keyDerived = false; + _finalized = false; + _authOk = false; + _poly1305.Reset(); + return S_OK; + + COM_TRY_END +} + +Z7_COM7F_IMF2(UInt32, CAXPBaseCoder::Filter(Byte * /* data */, UInt32 size)) +{ + return size; +} + +#ifndef Z7_EXTRACT_ONLY + +CAXPEncoder::CAXPEncoder() +{ + _key.NumCyclesPower = 19; + _key.DerivMode = N7zKeyDerivation::kDeriv_Cascade; + _keyDerived = false; + _aadSize = 0; + _finalized = false; + _xcBlockPos = 64; + _xcCounter = 1; + _tagReady = false; + memset(_computedTag, 0, kTagSize); + Z7_memset_0_ARRAY(_aesIv); + Z7_memset_0_ARRAY(_xcNonce); +} + +Z7_COM7F_IMF(CAXPEncoder::ResetInitVector()) +{ + for (unsigned i = 0; i < sizeof(_aesIv); i++) + _aesIv[i] = 0; + for (unsigned i = 0; i < sizeof(_xcNonce); i++) + _xcNonce[i] = 0; + + MY_RAND_GEN(_key.Salt, N7zKeyDerivation::kCascadeSaltSize); + _key.SaltSize = N7zKeyDerivation::kCascadeSaltSize; + + MY_RAND_GEN(_aesIv, 16); + MY_RAND_GEN(_xcNonce, 24); + _keyDerived = false; + _finalized = false; + _xcBlockPos = 64; + _xcCounter = 1; + _poly1305.Reset(); + _tagReady = false; + memset(_computedTag, 0, kTagSize); + + _aadSize = 1; + _aad[0] = 
(Byte)(_key.NumCyclesPower + | (1 << 7) + | (1 << 6)); + + _aad[1] = (Byte)(((_key.SaltSize - 1) << 3) & 0xF8); + memcpy(_aad + 2, _key.Salt, _key.SaltSize); + _aadSize = 2 + _key.SaltSize; + memcpy(_aad + _aadSize, _aesIv, 16); + _aadSize += 16; + memcpy(_aad + _aadSize, _xcNonce, 24); + _aadSize += 24; + + return S_OK; +} + +Z7_COM7F_IMF2(UInt32, CAXPEncoder::Filter(Byte *data, UInt32 size)) +{ + if (size == 0) + return 0; + + ProcessEnc(data, size); + return size; +} + +Z7_COM7F_IMF(CAXPEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) +{ + Byte props[2 + sizeof(_key.Salt) + 24 + 16 + kTagSize]; + unsigned propsSize = 1; + + props[0] = (Byte)(_key.NumCyclesPower + | (1 << 7) + | (1 << 6)); + + props[1] = (Byte)(((_key.SaltSize - 1) << 3) & 0xF8); + memcpy(props + 2, _key.Salt, _key.SaltSize); + propsSize = 2 + _key.SaltSize; + memcpy(props + propsSize, _aesIv, 16); + propsSize += 16; + memcpy(props + propsSize, _xcNonce, 24); + propsSize += 24; + + if (!_tagReady) + { + if (_finalized) + { + _tagReady = true; + } + else if (_keyDerived) + { + _poly1305.Final(_computedTag); + _finalized = true; + _tagReady = true; + } + else + { + memset(_computedTag, 0, kTagSize); + } + } + + memcpy(props + propsSize, _computedTag, kTagSize); + propsSize += kTagSize; + + return WriteStream(outStream, props, propsSize); +} + +#endif + +CAXPDecoder::CAXPDecoder() +{ + _key.NumCyclesPower = 19; + _key.DerivMode = N7zKeyDerivation::kDeriv_Cascade; + _keyDerived = false; + _finalized = false; + _authOk = false; + _aadSize = 0; + memset(_expectedTag, 0, kTagSize); + _xcBlockPos = 64; + _xcCounter = 1; + Z7_memset_0_ARRAY(_aesIv); + Z7_memset_0_ARRAY(_xcNonce); +} + +Z7_COM7F_IMF2(UInt32, CAXPDecoder::Filter(Byte *data, UInt32 size)) +{ + if (size == 0) + return 0; + + ProcessDec(data, size); + return size; +} + +Z7_COM7F_IMF(CAXPDecoder::SetDecoderProperties2(const Byte *data, UInt32 size)) +{ + _key.ClearProps(); + _key.DerivMode = N7zKeyDerivation::kDeriv_Cascade; + + 
_keyDerived = false; + _finalized = false; + _authOk = false; + _poly1305.Reset(); + memset(_expectedTag, 0, kTagSize); + _xcBlockPos = 64; + _xcCounter = 1; + + for (unsigned i = 0; i < sizeof(_aesIv); i++) + _aesIv[i] = 0; + for (unsigned i = 0; i < sizeof(_xcNonce); i++) + _xcNonce[i] = 0; + + if (size == 0) + return S_OK; + + const unsigned b0 = data[0]; + _key.NumCyclesPower = b0 & 0x3F; + + const bool saltPresent = (b0 & 0x80) != 0; + const unsigned nonceType = (b0 >> 6) & 1; + + if (!saltPresent && nonceType == 0 && size == 1) + return S_OK; + if (size <= 1) + return E_INVALIDARG; + + const unsigned b1 = data[1]; + const unsigned saltSize = saltPresent ? (((b1 >> 3) & 0x1F) + 1) : 0; + const unsigned nonceSize = (nonceType == 0) ? 16 : 24; + + const unsigned minSize = 2 + saltSize + nonceSize + 16; + if (size < minSize) + return E_INVALIDARG; + + const unsigned tagSize = size - minSize; + if (tagSize != kTagSize && tagSize != 0) + return E_INVALIDARG; + + _key.SaltSize = saltSize; + data += 2; + for (unsigned i = 0; i < saltSize; i++) + _key.Salt[i] = *data++; + for (unsigned i = 0; i < 16; i++) + _aesIv[i] = *data++; + for (unsigned i = 0; i < nonceSize && i < 24; i++) + _xcNonce[i] = *data++; + + if (tagSize == kTagSize) + memcpy(_expectedTag, data, kTagSize); + + _aadSize = 1; + _aad[0] = (Byte)(_key.NumCyclesPower + | (saltPresent ? (1 << 7) : 0) + | (nonceType << 6)); + + if (saltPresent) + { + _aad[1] = (Byte)(((_key.SaltSize - 1) << 3) & 0xF8); + memcpy(_aad + 2, _key.Salt, _key.SaltSize); + _aadSize = 2 + _key.SaltSize; + memcpy(_aad + _aadSize, _aesIv, 16); + _aadSize += 16; + memcpy(_aad + _aadSize, _xcNonce, nonceSize); + _aadSize += nonceSize; + } + else + { + _aad[1] = 0; + _aadSize = 2; + memcpy(_aad + _aadSize, _aesIv, 16); + _aadSize += 16; + memcpy(_aad + _aadSize, _xcNonce, nonceSize); + _aadSize += nonceSize; + } + + return (_key.NumCyclesPower <= k_NumCyclesPower_Supported_MAX + || _key.NumCyclesPower == 0x3F) ? 
S_OK : E_NOTIMPL; +} + +Z7_COM7F_IMF(CAXPDecoder::CryptoAuthVerify(Int32 *result)) +{ + if (_authOk) + { + *result = 0; + return S_OK; + } + + if (!_keyDerived) + DeriveAXPKeys(); + + Byte computedTag[kTagSize]; + _poly1305.Final(computedTag); + _finalized = true; + + { + volatile Byte diff = 0; + for (unsigned i = 0; i < kTagSize; i++) + diff |= computedTag[i] ^ _expectedTag[i]; + *result = (diff == 0) ? 0 : 1; + _authOk = (diff == 0); + } + + Z7_memset_0_ARRAY(computedTag); + + return S_OK; +} + +}} + +namespace NCrypto { +namespace NAXACascade { + +static CKeyInfoCache g_GlobalKeyCache(32); + +#ifndef Z7_ST + static NWindows::NSynchronization::CCriticalSection g_GlobalKeyCacheCriticalSection; + #define MT_LOCK NWindows::NSynchronization::CCriticalSectionLock lock(g_GlobalKeyCacheCriticalSection); +#else + #define MT_LOCK +#endif + +#ifdef MY_CPU_X86_OR_AMD64 +#define ASCON_USE_SSE2 (NAscon::g_SSE2Enabled) +#else +#define ASCON_USE_SSE2 false +#endif + +CBase::CBase(): + _cachedKeys(16), + _keyDerived(false), + _xcBlockPos(64), + _xcCounter(0) +{ + _key.DerivMode = N7zKeyDerivation::kDeriv_Cascade; + for (unsigned i = 0; i < sizeof(_nonce); i++) + _nonce[i] = 0; + Z7_memset_0_ARRAY(_keyAscon); + Z7_memset_0_ARRAY(_keyAes); + Z7_memset_0_ARRAY(_aesIv); + Z7_memset_0_ARRAY(_aesKeys); + Z7_memset_0_ARRAY(_keyXChaCha20); + Z7_memset_0_ARRAY(_xcNonce); + Z7_memset_0_ARRAY(_xcDerivedKey); + Z7_memset_0_ARRAY(_xcBlock); +} + +void CBase::PrepareKey() +{ + MT_LOCK + + bool finded = false; + if (!_cachedKeys.GetKey(_key)) + { + finded = g_GlobalKeyCache.GetKey(_key); + if (!finded) + _key.CalcKey(); + _cachedKeys.Add(_key); + } + if (!finded) + g_GlobalKeyCache.FindAndAdd(_key); + _keyDerived = false; +} + +void CBase::DeriveCascadeKeys() +{ + + NHkdfBlake2sp::Derive( + _key.CascadeKey, kCascadeKeySize, + "AES-key", 7, + _keyAes, 32); + Aes_SetKey_Enc(_aesKeys + 4, _keyAes, 32); + memcpy(_aesKeys, _aesIv, 16); + + NHkdfBlake2sp::Derive( + _key.CascadeKey, kCascadeKeySize, 
+ "XChaCha20-key", 13, + _keyXChaCha20, 32); + NXChaCha20::XHChaCha20Block_Core(_xcDerivedKey, _keyXChaCha20, _xcNonce); + + NHkdfBlake2sp::Derive( + _key.CascadeKey, kCascadeKeySize, + "Ascon-key", 9, + _keyAscon, NAscon::kKeySize_Ascon); + + _xcBlockPos = 64; + _xcCounter = 0; + + _keyDerived = true; +} + +void CBase::AesCtrXorData(Byte *data, UInt32 size) +{ + if (size >= AES_BLOCK_SIZE) + { + UInt32 numBlocks = size >> 4; + AesCtr_Code(_aesKeys, data, numBlocks); + data += numBlocks << 4; + size -= numBlocks << 4; + } + if (size > 0) + { + Byte temp[16]; + memset(temp, 0, 16); + AesCtr_Code(_aesKeys, temp, 1); + for (UInt32 i = 0; i < size; i++) + data[i] ^= temp[i]; + Z7_memset_0_ARRAY(temp); + } +} + +void CBase::XChaCha20XorData(Byte *data, UInt32 size) +{ + while (size > 0) + { + if (_xcBlockPos >= kXcBlockSize) + { + NXChaCha20::XChaCha20Block_Core(_xcBlock, _xcDerivedKey, _xcNonce + 16, _xcCounter); + _xcBlockPos = 0; + _xcCounter++; + } + UInt32 avail = kXcBlockSize - _xcBlockPos; + UInt32 toProcess = (size < avail) ? 
size : avail; + XorBytes(data, _xcBlock + _xcBlockPos, toProcess); + data += toProcess; + size -= toProcess; + _xcBlockPos += toProcess; + } +} + +void CBaseCoder::InitState() +{ + Z7_memset_0_ARRAY(_state); + Z7_memset_0_ARRAY(_stateBuf); + _stateBufPos = 0; + _finalized = false; + _authOk = false; +} + +void CBaseCoder::ProcessAad(const Byte *aad, UInt64 aadLen) +{ + const UInt64 keyLo = GetUi64(_keyAscon); + const UInt64 keyHi = GetUi64(_keyAscon + 8); + const UInt64 nonceLo = GetUi64(_nonce); + const UInt64 nonceHi = GetUi64(_nonce + 8); + + _state[0] = NAscon::kIV_Ascon; + _state[1] = keyLo; + _state[2] = keyHi; + _state[3] = nonceLo; + _state[4] = nonceHi; + + NAscon::AsconP12(_state); + _state[3] ^= keyLo; + _state[4] ^= keyHi; + + if (aadLen > 0) + { + UInt64 remaining = aadLen; + const Byte *p = aad; + while (remaining >= NAscon::kRateSize) + { + _state[0] ^= GetUi64(p); + _state[1] ^= GetUi64(p + 8); + NAscon::AsconP8(_state); + p += NAscon::kRateSize; + remaining -= NAscon::kRateSize; + } + if (remaining >= 8) + { + _state[0] ^= GetUi64(p); + _state[1] ^= ((UInt64)0x01 << ((remaining - 8) * 8)); + if (remaining > 8) + { + UInt64 partial = 0; + memcpy(&partial, p + 8, (unsigned)(remaining - 8)); + _state[1] ^= partial; + } + } + else + { + _state[0] ^= ((UInt64)0x01 << (remaining * 8)); + if (remaining > 0) + { + UInt64 partial = 0; + memcpy(&partial, p, (unsigned)remaining); + _state[0] ^= partial; + } + } + NAscon::AsconP8(_state); + } + + _state[4] ^= (UInt64)0x80 << 56; +} + +void CBaseCoder::ProcessEnc(Byte *data, UInt32 size) +{ +#ifdef MY_CPU_X86_OR_AMD64 + NAscon::InitSIMD(); + const bool useSSE2 = ASCON_USE_SSE2; +#else + const bool useSSE2 = false; +#endif + if (!_keyDerived) + { + DeriveCascadeKeys(); + ProcessAad(_aad, _aadSize); + } + + AesCtrXorData(data, size); + XChaCha20XorData(data, size); + + UInt32 remaining = size; + Byte *p = data; + + if (_stateBufPos > 0) + { + UInt32 avail = NAscon::kRateSize - _stateBufPos; + UInt32 toProcess = 
(remaining < avail) ? remaining : avail; + + XorBytes(p, _stateBuf + _stateBufPos, toProcess); + memcpy(_stateBuf + _stateBufPos, p, toProcess); + + _stateBufPos += toProcess; + p += toProcess; + remaining -= toProcess; + + if (_stateBufPos == NAscon::kRateSize) + { + _state[0] = GetUi64(_stateBuf); + _state[1] = GetUi64(_stateBuf + 8); + NAscon::AsconP8(_state); + _stateBufPos = 0; + } + } + + if (remaining >= NAscon::kRateSize) + { +#ifdef MY_CPU_SSE2 + if (useSSE2) + { + do { + NAscon::AsconEncBlock_SSE2(_state, p); + NAscon::AsconP8(_state); + p += NAscon::kRateSize; + remaining -= NAscon::kRateSize; + } while (remaining >= NAscon::kRateSize); + } + else +#endif + { + do { + _state[0] ^= GetUi64(p); + _state[1] ^= GetUi64(p + 8); + SetUi64(p, _state[0]); + SetUi64(p + 8, _state[1]); + NAscon::AsconP8(_state); + p += NAscon::kRateSize; + remaining -= NAscon::kRateSize; + } while (remaining >= NAscon::kRateSize); + } + } + + if (remaining > 0) + { + SetUi64(_stateBuf, _state[0]); + SetUi64(_stateBuf + 8, _state[1]); + XorBytes(p, _stateBuf, remaining); + memcpy(_stateBuf, p, remaining); + memcpy(_state, _stateBuf, remaining); + _stateBufPos = remaining; + } +} + +void CBaseCoder::ProcessDec(Byte *data, UInt32 size) +{ +#ifdef MY_CPU_X86_OR_AMD64 + NAscon::InitSIMD(); + const bool useSSE2 = ASCON_USE_SSE2; +#else + const bool useSSE2 = false; +#endif + if (!_keyDerived) + { + DeriveCascadeKeys(); + ProcessAad(_aad, _aadSize); + } + + { + UInt32 remaining = size; + Byte *p = data; + + if (_stateBufPos > 0) + { + UInt32 avail = NAscon::kRateSize - _stateBufPos; + UInt32 toProcess = (remaining < avail) ? 
remaining : avail; + + { + UInt32 off = _stateBufPos; + UInt32 n = toProcess; + Byte *pd = p; +#ifdef MY_CPU_LE_UNALIGN_64 + while (n >= 8 && (off & 7) == 0) + { + UInt64 ct = GetUi64(pd); + SetUi64(pd, GetUi64(_stateBuf + off) ^ ct); + SetUi64(_stateBuf + off, ct); + pd += 8; off += 8; n -= 8; + } +#endif +#ifdef MY_CPU_LE_UNALIGN + while (n >= 4 && (off & 3) == 0) + { + UInt32 ct = GetUi32(pd); + SetUi32(pd, GetUi32(_stateBuf + off) ^ ct); + SetUi32(_stateBuf + off, ct); + pd += 4; off += 4; n -= 4; + } +#endif + while (n--) + { + const Byte c = *pd; + *pd = _stateBuf[off] ^ c; + _stateBuf[off] = c; + pd++; off++; + } + } + + memcpy((Byte *)_state + _stateBufPos, _stateBuf + _stateBufPos, toProcess); + + _stateBufPos += toProcess; + p += toProcess; + remaining -= toProcess; + + if (_stateBufPos == NAscon::kRateSize) + { + NAscon::AsconP8(_state); + _stateBufPos = 0; + } + } + + if (remaining >= NAscon::kRateSize) + { +#ifdef MY_CPU_SSE2 + if (useSSE2) + { + do { + NAscon::AsconDecBlock_SSE2(_state, p); + NAscon::AsconP8(_state); + p += NAscon::kRateSize; + remaining -= NAscon::kRateSize; + } while (remaining >= NAscon::kRateSize); + } + else +#endif + { + do { + UInt64 c0 = GetUi64(p); + UInt64 c1 = GetUi64(p + 8); + SetUi64(p, _state[0] ^ c0); + SetUi64(p + 8, _state[1] ^ c1); + _state[0] = c0; + _state[1] = c1; + NAscon::AsconP8(_state); + p += NAscon::kRateSize; + remaining -= NAscon::kRateSize; + } while (remaining >= NAscon::kRateSize); + } + } + + if (remaining > 0) + { + SetUi64(_stateBuf, _state[0]); + SetUi64(_stateBuf + 8, _state[1]); + + { + UInt32 n = remaining; + Byte *pd = p; + UInt32 off = 0; +#ifdef MY_CPU_LE_UNALIGN_64 + while (n >= 8) + { + UInt64 ct = GetUi64(pd); + SetUi64(pd, GetUi64(_stateBuf + off) ^ ct); + SetUi64(_stateBuf + off, ct); + pd += 8; off += 8; n -= 8; + } +#endif +#ifdef MY_CPU_LE_UNALIGN + while (n >= 4) + { + UInt32 ct = GetUi32(pd); + SetUi32(pd, GetUi32(_stateBuf + off) ^ ct); + SetUi32(_stateBuf + off, ct); + pd += 4; off 
+= 4; n -= 4; + } +#endif + while (n--) + { + const Byte c = *pd; + *pd = _stateBuf[off] ^ c; + _stateBuf[off] = c; + pd++; off++; + } + } + + memcpy(_state, _stateBuf, remaining); + _stateBufPos = remaining; + } + } + + XChaCha20XorData(data, size); + AesCtrXorData(data, size); +} + +void CBaseCoder::Finalize(Byte *tag) +{ + if (!_finalized) + { + if (_stateBufPos > 0 && _stateBufPos < NAscon::kRateSize) + { + if (_stateBufPos < 8) + _state[0] ^= ((UInt64)0x01 << (_stateBufPos * 8)); + else + _state[1] ^= ((UInt64)0x01 << ((_stateBufPos - 8) * 8)); + } + else + { + _state[0] ^= ((UInt64)0x01); + } + + const UInt64 keyLo = GetUi64(_keyAscon); + const UInt64 keyHi = GetUi64(_keyAscon + 8); + + _state[2] ^= keyLo; + _state[3] ^= keyHi; + NAscon::AsconP12(_state); + _state[3] ^= keyLo; + _state[4] ^= keyHi; + + SetUi64(tag, _state[3]); + SetUi64(tag + 8, _state[4]); + + _finalized = true; + } +} + +Z7_COM7F_IMF(CBaseCoder::CryptoSetPassword(const Byte *data, UInt32 size)) +{ + COM_TRY_BEGIN + + _key.Password.Wipe(); + _key.Password.CopyFrom(data, (size_t)size); + _keyDerived = false; + return S_OK; + + COM_TRY_END +} + +Z7_COM7F_IMF(CBaseCoder::Init()) +{ + COM_TRY_BEGIN + + PrepareKey(); + InitState(); + _keyDerived = false; + return S_OK; + + COM_TRY_END +} + +Z7_COM7F_IMF2(UInt32, CBaseCoder::Filter(Byte * /* data */, UInt32 size)) +{ + return size; +} + +#ifndef Z7_EXTRACT_ONLY + +Z7_COM7F_IMF(CEncoder::ResetInitVector()) +{ + for (unsigned i = 0; i < sizeof(_nonce); i++) + _nonce[i] = 0; + for (unsigned i = 0; i < sizeof(_aesIv); i++) + _aesIv[i] = 0; + for (unsigned i = 0; i < sizeof(_xcNonce); i++) + _xcNonce[i] = 0; + + MY_RAND_GEN(_key.Salt, N7zKeyDerivation::kCascadeSaltSize); + _key.SaltSize = N7zKeyDerivation::kCascadeSaltSize; + + MY_RAND_GEN(_nonce, NAscon::kNonceSize); + MY_RAND_GEN(_aesIv, 16); + MY_RAND_GEN(_xcNonce, 24); + _keyDerived = false; + _stateBufPos = 0; + _finalized = false; + _xcBlockPos = 64; + _xcCounter = 0; + + const unsigned nonceType 
= (NAscon::kNonceSize > 16) ? 1 : 0; + + _aadSize = 1; + _aad[0] = (Byte)(_key.NumCyclesPower + | (1 << 7) + | (nonceType << 6)); + + _aad[1] = (Byte)(((_key.SaltSize - 1) << 3) & 0xF8); + memcpy(_aad + 2, _key.Salt, _key.SaltSize); + _aadSize = 2 + _key.SaltSize; + memcpy(_aad + _aadSize, _aesIv, 16); + _aadSize += 16; + memcpy(_aad + _aadSize, _xcNonce, 24); + _aadSize += 24; + memcpy(_aad + _aadSize, _nonce, NAscon::kNonceSize); + _aadSize += NAscon::kNonceSize; + + return S_OK; +} + +Z7_COM7F_IMF2(UInt32, CEncoder::Filter(Byte *data, UInt32 size)) +{ + if (size == 0) + return 0; + + ProcessEnc(data, size); + return size; +} + +CEncoder::CEncoder() +{ + _key.NumCyclesPower = 19; + _key.DerivMode = N7zKeyDerivation::kDeriv_Cascade; + _keyDerived = false; + _stateBufPos = 0; + _finalized = false; + _aadSize = 0; + _xcBlockPos = 64; + _xcCounter = 0; + _tagReady = false; + memset(_computedTag, 0, NAscon::kTagSize); + Z7_memset_0_ARRAY(_aesIv); + Z7_memset_0_ARRAY(_xcNonce); +} + +Z7_COM7F_IMF(CEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) +{ + Byte props[2 + sizeof(_key.Salt) + NAscon::kNonceSize + 16 + 24 + NAscon::kTagSize]; + unsigned propsSize = 1; + + const unsigned nonceType = (NAscon::kNonceSize > 16) ? 
1 : 0; + + props[0] = (Byte)(_key.NumCyclesPower + | (1 << 7) + | (nonceType << 6)); + + props[1] = (Byte)(((_key.SaltSize - 1) << 3) & 0xF8); + memcpy(props + 2, _key.Salt, _key.SaltSize); + propsSize = 2 + _key.SaltSize; + memcpy(props + propsSize, _aesIv, 16); + propsSize += 16; + memcpy(props + propsSize, _xcNonce, 24); + propsSize += 24; + memcpy(props + propsSize, _nonce, NAscon::kNonceSize); + propsSize += NAscon::kNonceSize; + + if (!_tagReady) + { + if (_finalized) + { + _tagReady = true; + } + else if (_keyDerived) + { + Finalize(_computedTag); + _tagReady = true; + } + else + { + memset(_computedTag, 0, NAscon::kTagSize); + } + } + + memcpy(props + propsSize, _computedTag, NAscon::kTagSize); + propsSize += NAscon::kTagSize; + + return WriteStream(outStream, props, propsSize); +} + +#endif + +CDecoder::CDecoder() +{ + _key.NumCyclesPower = 19; + _key.DerivMode = N7zKeyDerivation::kDeriv_Cascade; + _keyDerived = false; + _stateBufPos = 0; + _finalized = false; + _authOk = false; + _aadSize = 0; + memset(_expectedTag, 0, NAscon::kTagSize); + _xcBlockPos = 64; + _xcCounter = 0; + Z7_memset_0_ARRAY(_aesIv); + Z7_memset_0_ARRAY(_xcNonce); +} + +Z7_COM7F_IMF2(UInt32, CDecoder::Filter(Byte *data, UInt32 size)) +{ + if (size == 0) + return 0; + + ProcessDec(data, size); + return size; +} + +Z7_COM7F_IMF(CDecoder::SetDecoderProperties2(const Byte *data, UInt32 size)) +{ + _key.ClearProps(); + _key.DerivMode = N7zKeyDerivation::kDeriv_Cascade; + + _keyDerived = false; + _stateBufPos = 0; + _finalized = false; + _authOk = false; + memset(_expectedTag, 0, NAscon::kTagSize); + _xcBlockPos = 64; + _xcCounter = 0; + + for (unsigned i = 0; i < sizeof(_nonce); i++) + _nonce[i] = 0; + for (unsigned i = 0; i < sizeof(_aesIv); i++) + _aesIv[i] = 0; + for (unsigned i = 0; i < sizeof(_xcNonce); i++) + _xcNonce[i] = 0; + + if (size == 0) + return S_OK; + + const unsigned b0 = data[0]; + _key.NumCyclesPower = b0 & 0x3F; + + const bool saltPresent = (b0 & 0x80) != 0; + const 
unsigned nonceType = (b0 >> 6) & 1; + + if (!saltPresent && nonceType == 0 && size == 1) + return S_OK; + if (size <= 1) + return E_INVALIDARG; + + const unsigned b1 = data[1]; + const unsigned saltSize = saltPresent ? (((b1 >> 3) & 0x1F) + 1) : 0; + const unsigned nonceSize = (nonceType == 0) ? 16 : 24; + + const unsigned minSize = 2 + saltSize + nonceSize + 16 + 24; + if (size < minSize) + return E_INVALIDARG; + + const unsigned tagSize = size - minSize; + if (tagSize != NAscon::kTagSize && tagSize != 0) + return E_INVALIDARG; + + _key.SaltSize = saltSize; + data += 2; + for (unsigned i = 0; i < saltSize; i++) + _key.Salt[i] = *data++; + for (unsigned i = 0; i < 16; i++) + _aesIv[i] = *data++; + for (unsigned i = 0; i < 24; i++) + _xcNonce[i] = *data++; + for (unsigned i = 0; i < nonceSize && i < NAscon::kNonceSize; i++) + _nonce[i] = *data++; + + if (tagSize == NAscon::kTagSize) + memcpy(_expectedTag, data, NAscon::kTagSize); + + _aadSize = 1; + _aad[0] = (Byte)(_key.NumCyclesPower + | (saltPresent ? (1 << 7) : 0) + | (nonceType << 6)); + + if (saltPresent) + { + _aad[1] = (Byte)(((_key.SaltSize - 1) << 3) & 0xF8); + memcpy(_aad + 2, _key.Salt, _key.SaltSize); + _aadSize = 2 + _key.SaltSize; + memcpy(_aad + _aadSize, _aesIv, 16); + _aadSize += 16; + memcpy(_aad + _aadSize, _xcNonce, 24); + _aadSize += 24; + memcpy(_aad + _aadSize, _nonce, NAscon::kNonceSize); + _aadSize += NAscon::kNonceSize; + } + else + { + _aad[1] = 0; + _aadSize = 2; + memcpy(_aad + _aadSize, _aesIv, 16); + _aadSize += 16; + memcpy(_aad + _aadSize, _xcNonce, 24); + _aadSize += 24; + memcpy(_aad + _aadSize, _nonce, NAscon::kNonceSize); + _aadSize += NAscon::kNonceSize; + } + + return (_key.NumCyclesPower <= k_NumCyclesPower_Supported_MAX + || _key.NumCyclesPower == 0x3F) ? 
S_OK : E_NOTIMPL; +} + +Z7_COM7F_IMF(CDecoder::CryptoAuthVerify(Int32 *result)) +{ + if (_authOk) + { + *result = 0; + return S_OK; + } + + if (!_keyDerived) + { + DeriveCascadeKeys(); + ProcessAad(_aad, _aadSize); + } + + Byte computedTag[NAscon::kTagSize]; + Finalize(computedTag); + + { + volatile Byte diff = 0; + for (unsigned i = 0; i < NAscon::kTagSize; i++) + diff |= computedTag[i] ^ _expectedTag[i]; + *result = (diff == 0) ? 0 : 1; + _authOk = (diff == 0); + } + + Z7_memset_0_ARRAY(computedTag); + + return S_OK; +} + +}} diff --git a/CPP/7zip/Crypto/Cascade.h b/CPP/7zip/Crypto/Cascade.h new file mode 100644 index 000000000..fadd85467 --- /dev/null +++ b/CPP/7zip/Crypto/Cascade.h @@ -0,0 +1,266 @@ +// Cascade.h +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#ifndef ZIP7_INC_CRYPTO_CASCADE_H +#define ZIP7_INC_CRYPTO_CASCADE_H + +#include "../../Common/MyCom.h" + +#include "../ICoder.h" +#include "../IPassword.h" + +#include "7zKeyDerivation.h" +#include "Ascon.h" + +#define AES_BLOCK_SIZE 16 +#define AES_NUM_IVMRK_WORDS ((1 + 1 + 15) * 4) + +namespace NCrypto { +namespace NAXACascade { + +using CKeyInfo = N7zKeyDerivation::CKeyInfo; +using CKeyInfoCache = N7zKeyDerivation::CKeyInfoCache; + +using N7zKeyDerivation::kKeySize; +using N7zKeyDerivation::kCascadeKeySize; + +const unsigned k_NumCyclesPower_Supported_MAX = 24; + +static const unsigned kXcBlockSize = 64; + +class CBase +{ + CKeyInfoCache _cachedKeys; +protected: + CKeyInfo _key; + Byte _nonce[NAscon::kNonceSize]; + Byte _keyAscon[NAscon::kKeySize_Ascon]; + bool _keyDerived; + + Byte _keyAes[32]; + Byte _aesIv[16]; + UInt32 _aesKeys[AES_NUM_IVMRK_WORDS]; + + Byte _keyXChaCha20[32]; + Byte _xcNonce[24]; + Byte _xcDerivedKey[32]; + Byte _xcBlock[64]; + unsigned _xcBlockPos; + UInt64 _xcCounter; + + void PrepareKey(); + void DeriveCascadeKeys(); + void AesCtrXorData(Byte *data, UInt32 size); + void XChaCha20XorData(Byte *data, UInt32 size); + CBase(); + ~CBase() 
+ { + Z7_memset_0_ARRAY(_nonce); + Z7_memset_0_ARRAY(_keyAscon); + Z7_memset_0_ARRAY(_keyAes); + Z7_memset_0_ARRAY(_aesIv); + Z7_memset_0_ARRAY(_aesKeys); + Z7_memset_0_ARRAY(_keyXChaCha20); + Z7_memset_0_ARRAY(_xcNonce); + Z7_memset_0_ARRAY(_xcDerivedKey); + Z7_memset_0_ARRAY(_xcBlock); + } +}; + +class CBaseCoder: + public ICompressFilter, + public ICryptoSetPassword, + public CMyUnknownImp, + public CBase +{ + Z7_IFACE_COM7_IMP_NONFINAL(ICompressFilter) + Z7_IFACE_COM7_IMP_NONFINAL(ICryptoSetPassword) +protected: + virtual ~CBaseCoder() + { + Z7_memset_0_ARRAY(_aad); + Z7_memset_0_ARRAY(_state); + } + + Byte _aad[2 + 32 + NAscon::kNonceSize + 16 + 24]; + unsigned _aadSize; + + UInt64 _state[5]; + Byte _stateBuf[NAscon::kRateSize]; + unsigned _stateBufPos; + bool _finalized; + bool _authOk; + + void InitState(); + void ProcessAad(const Byte *aad, UInt64 aadLen); + void ProcessEnc(Byte *data, UInt32 size); + void ProcessDec(Byte *data, UInt32 size); + void Finalize(Byte *tag); +}; + +#ifndef Z7_EXTRACT_ONLY + +class CEncoder Z7_final: + public CBaseCoder, + public ICompressWriteCoderProperties, + public ICryptoResetInitVector +{ + Z7_COM_UNKNOWN_IMP_4( + ICompressFilter, + ICryptoSetPassword, + ICompressWriteCoderProperties, + ICryptoResetInitVector) + Z7_IFACE_COM7_IMP(ICompressWriteCoderProperties) + Z7_IFACE_COM7_IMP(ICryptoResetInitVector) + + Byte _computedTag[NAscon::kTagSize]; + bool _tagReady; + Z7_COM7F_IMP2(UInt32, Filter(Byte *data, UInt32 size)) +public: + CEncoder(); +}; + +#endif + +class CDecoder Z7_final: + public CBaseCoder, + public ICompressSetDecoderProperties2, + public ICryptoAuthVerify +{ + Z7_COM_UNKNOWN_IMP_4( + ICompressFilter, + ICryptoSetPassword, + ICompressSetDecoderProperties2, + ICryptoAuthVerify) + Z7_IFACE_COM7_IMP(ICompressSetDecoderProperties2) + Z7_IFACE_COM7_IMP(ICryptoAuthVerify) + + Byte _expectedTag[NAscon::kTagSize]; + Z7_COM7F_IMP2(UInt32, Filter(Byte *data, UInt32 size)) +public: + CDecoder(); +}; + +}} + +#include 
"XChaCha20Poly1305.h" + +namespace NCrypto { +namespace NAXPCascade { + +using NAXACascade::CKeyInfo; +using NAXACascade::CKeyInfoCache; +using NAXACascade::k_NumCyclesPower_Supported_MAX; +using NAXACascade::kCascadeKeySize; +using NAXACascade::kXcBlockSize; + +using NXChaCha20Poly1305::kTagSize; +using NXChaCha20Poly1305::kPolyKeySize; + +class CAXPBase +{ + CKeyInfoCache _cachedKeys; +protected: + CKeyInfo _key; + bool _keyDerived; + + Byte _keyAes[32]; + Byte _aesIv[16]; + UInt32 _aesKeys[AES_NUM_IVMRK_WORDS]; + + Byte _keyXChaCha20[32]; + Byte _xcNonce[24]; + Byte _xcDerivedKey[32]; + Byte _xcBlock[64]; + unsigned _xcBlockPos; + UInt64 _xcCounter; + + Byte _polyKey[kPolyKeySize]; + NXChaCha20Poly1305::CPoly1305 _poly1305; + Byte _aad[2 + 32 + 24 + 16]; + unsigned _aadSize; + bool _finalized; + bool _authOk; + + void PrepareKey(); + void DeriveAXPKeys(); + void ComputePolyKey(); + void AesCtrXorData(Byte *data, UInt32 size); + void XChaCha20XorData(Byte *data, UInt32 size); + + CAXPBase(); + ~CAXPBase() + { + Z7_memset_0_ARRAY(_keyAes); + Z7_memset_0_ARRAY(_aesIv); + Z7_memset_0_ARRAY(_aesKeys); + Z7_memset_0_ARRAY(_keyXChaCha20); + Z7_memset_0_ARRAY(_xcNonce); + Z7_memset_0_ARRAY(_xcDerivedKey); + Z7_memset_0_ARRAY(_xcBlock); + Z7_memset_0_ARRAY(_polyKey); + Z7_memset_0_ARRAY(_aad); + } +}; + +class CAXPBaseCoder: + public ICompressFilter, + public ICryptoSetPassword, + public CMyUnknownImp, + public CAXPBase +{ + Z7_IFACE_COM7_IMP_NONFINAL(ICompressFilter) + Z7_IFACE_COM7_IMP_NONFINAL(ICryptoSetPassword) +protected: + virtual ~CAXPBaseCoder() {} + void ProcessEnc(Byte *data, UInt32 size); + void ProcessDec(Byte *data, UInt32 size); +}; + +#ifndef Z7_EXTRACT_ONLY + +class CAXPEncoder Z7_final: + public CAXPBaseCoder, + public ICompressWriteCoderProperties, + public ICryptoResetInitVector +{ + Z7_COM_UNKNOWN_IMP_4( + ICompressFilter, + ICryptoSetPassword, + ICompressWriteCoderProperties, + ICryptoResetInitVector) + 
Z7_IFACE_COM7_IMP(ICompressWriteCoderProperties) + Z7_IFACE_COM7_IMP(ICryptoResetInitVector) + + Byte _computedTag[kTagSize]; + bool _tagReady; + Z7_COM7F_IMP2(UInt32, Filter(Byte *data, UInt32 size)) +public: + CAXPEncoder(); +}; + +#endif + +class CAXPDecoder Z7_final: + public CAXPBaseCoder, + public ICompressSetDecoderProperties2, + public ICryptoAuthVerify +{ + Z7_COM_UNKNOWN_IMP_4( + ICompressFilter, + ICryptoSetPassword, + ICompressSetDecoderProperties2, + ICryptoAuthVerify) + Z7_IFACE_COM7_IMP(ICompressSetDecoderProperties2) + Z7_IFACE_COM7_IMP(ICryptoAuthVerify) + + Byte _expectedTag[kTagSize]; + Z7_COM7F_IMP2(UInt32, Filter(Byte *data, UInt32 size)) +public: + CAXPDecoder(); +}; + +}} + +#endif diff --git a/CPP/7zip/Crypto/CascadeRegister.cpp b/CPP/7zip/Crypto/CascadeRegister.cpp new file mode 100644 index 000000000..cfaa1c7e1 --- /dev/null +++ b/CPP/7zip/Crypto/CascadeRegister.cpp @@ -0,0 +1,29 @@ +// CascadeRegister.cpp +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#include "StdAfx.h" + +#include "../Common/RegisterCodec.h" + +#include "Cascade.h" + +namespace NCrypto { +namespace NAXPCascade { + +REGISTER_FILTER_E(AXP, + CAXPDecoder, + CAXPEncoder, + 0x6F10704, "AES+XChaCha20-Poly1305") + +}} + +namespace NCrypto { +namespace NAXACascade { + +REGISTER_FILTER_E(AXA, + CDecoder, + CEncoder, + 0x6F10705, "AES+XChaCha20+Ascon") + +}} \ No newline at end of file diff --git a/CPP/7zip/Crypto/ChaCha20Simd.h b/CPP/7zip/Crypto/ChaCha20Simd.h index 175bc234d..3ce6f5e65 100644 --- a/CPP/7zip/Crypto/ChaCha20Simd.h +++ b/CPP/7zip/Crypto/ChaCha20Simd.h @@ -1,5 +1,6 @@ // ChaCha20Simd.h -// Shared SIMD (SSE2/AVX2/NEON) acceleration code for ChaCha20/XChaCha20 +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ #ifndef ZIP7_CRYPTO_CHACHA20_SIMD_H #define ZIP7_CRYPTO_CHACHA20_SIMD_H diff --git a/CPP/7zip/Crypto/HkdfBlake2sp.cpp b/CPP/7zip/Crypto/HkdfBlake2sp.cpp new file mode 100644 index 
000000000..ecbf11b7d --- /dev/null +++ b/CPP/7zip/Crypto/HkdfBlake2sp.cpp @@ -0,0 +1,114 @@ +// HkdfBlake2sp.cpp +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#include "StdAfx.h" + +#include "HkdfBlake2sp.h" + +namespace NCrypto { +namespace NHkdfBlake2sp { + +#define BLAKE2SP_BLOCK_SIZE 64 + +// HMAC-BLAKE2sp +static void HmacBlake2sp(const Byte *key, unsigned keySize, + const Byte *message, unsigned messageSize, + Byte *mac) +{ + static bool blake2spPrepared = false; + if (!blake2spPrepared) + { + z7_Black2sp_Prepare(); + blake2spPrepared = true; + } + + Byte ipad[BLAKE2SP_BLOCK_SIZE]; + Byte opad[BLAKE2SP_BLOCK_SIZE]; + + memset(ipad, 0x36, BLAKE2SP_BLOCK_SIZE); + memset(opad, 0x5c, BLAKE2SP_BLOCK_SIZE); + + for (unsigned i = 0; i < keySize && i < BLAKE2SP_BLOCK_SIZE; i++) + { + ipad[i] ^= key[i]; + opad[i] ^= key[i]; + } + + // Inner hash + CAlignedBuffer1 bufInner(sizeof(CBlake2sp)); + CBlake2sp *blake2spInner = (CBlake2sp *)(void *)(Byte *)bufInner; + Blake2sp_Init(blake2spInner); + Blake2sp_SetFunction(blake2spInner, 0); + Blake2sp_Update(blake2spInner, ipad, BLAKE2SP_BLOCK_SIZE); + Blake2sp_Update(blake2spInner, message, messageSize); + + Byte innerHash[Z7_BLAKE2S_DIGEST_SIZE]; + Blake2sp_Final(blake2spInner, innerHash); + + // Outer hash + CAlignedBuffer1 bufOuter(sizeof(CBlake2sp)); + CBlake2sp *blake2spOuter = (CBlake2sp *)(void *)(Byte *)bufOuter; + Blake2sp_Init(blake2spOuter); + Blake2sp_SetFunction(blake2spOuter, 0); + Blake2sp_Update(blake2spOuter, opad, BLAKE2SP_BLOCK_SIZE); + Blake2sp_Update(blake2spOuter, innerHash, Z7_BLAKE2S_DIGEST_SIZE); + Blake2sp_Final(blake2spOuter, mac); + + Z7_memset_0_ARRAY(ipad); + Z7_memset_0_ARRAY(opad); + Z7_memset_0_ARRAY(innerHash); +} + +// HKDF-Expand (RFC 5869) +void Derive(const Byte *prk, unsigned prkSize, + const char *info, unsigned infoLen, + Byte *output, unsigned outSize) +{ + const unsigned n = (outSize + Z7_BLAKE2S_DIGEST_SIZE - 1) / Z7_BLAKE2S_DIGEST_SIZE; + 
+ Byte prevT[Z7_BLAKE2S_DIGEST_SIZE]; + unsigned prevTSize = 0; + + Byte *outPtr = output; + unsigned remaining = outSize; + + for (unsigned i = 1; i <= n; i++) + { + Byte message[Z7_BLAKE2S_DIGEST_SIZE + 256 + 1]; + unsigned messageSize = 0; + + if (prevTSize > 0) + { + memcpy(message + messageSize, prevT, prevTSize); + messageSize += prevTSize; + } + + if (infoLen > 0) + { + memcpy(message + messageSize, info, infoLen); + messageSize += infoLen; + } + + message[messageSize] = (Byte)i; + messageSize += 1; + + Byte ti[Z7_BLAKE2S_DIGEST_SIZE]; + HmacBlake2sp(prk, prkSize, message, messageSize, ti); + + const unsigned copySize = remaining < Z7_BLAKE2S_DIGEST_SIZE ? remaining : Z7_BLAKE2S_DIGEST_SIZE; + memcpy(outPtr, ti, copySize); + outPtr += copySize; + remaining -= copySize; + + memcpy(prevT, ti, Z7_BLAKE2S_DIGEST_SIZE); + prevTSize = Z7_BLAKE2S_DIGEST_SIZE; + + Z7_memset_0_ARRAY(ti); + Z7_memset_0_ARRAY(message); + } + + Z7_memset_0_ARRAY(prevT); +} + +}} diff --git a/CPP/7zip/Crypto/HkdfBlake2sp.h b/CPP/7zip/Crypto/HkdfBlake2sp.h new file mode 100644 index 000000000..827d21bf8 --- /dev/null +++ b/CPP/7zip/Crypto/HkdfBlake2sp.h @@ -0,0 +1,21 @@ +// HkdfBlake2sp.h +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#ifndef ZIP7_INC_CRYPTO_HKDF_BLAKE2SP_H +#define ZIP7_INC_CRYPTO_HKDF_BLAKE2SP_H + +#include "../../../C/Blake2.h" +#include "../../Common/MyBuffer2.h" + +namespace NCrypto { +namespace NHkdfBlake2sp { + +// HKDF-Expand (RFC 5869) using HMAC-BLAKE2sp +void Derive(const Byte *prk, unsigned prkSize, + const char *info, unsigned infoLen, + Byte *output, unsigned outSize); + +}} + +#endif diff --git a/CPP/7zip/Crypto/HmacSha512.cpp b/CPP/7zip/Crypto/HmacSha512.cpp new file mode 100644 index 000000000..ef477bd9b --- /dev/null +++ b/CPP/7zip/Crypto/HmacSha512.cpp @@ -0,0 +1,55 @@ +// HmacSha512.cpp +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#include "StdAfx.h" + +#include + 
+#include "../../../C/CpuArch.h" + +#include "HmacSha512.h" + +namespace NCrypto { +namespace NSha512 { + +void CHmac::SetKey(const Byte *key, size_t keySize) +{ + MY_ALIGN (16) + UInt64 temp[SHA512_NUM_BLOCK_WORDS]; + size_t i; + + for (i = 0; i < SHA512_NUM_BLOCK_WORDS; i++) + temp[i] = 0; + + if (keySize > kBlockSize) + { + Sha512_Init(&_sha, SHA512_DIGEST_SIZE); + Sha512_Update(&_sha, key, keySize); + Sha512_Final(&_sha, (Byte *)temp, SHA512_DIGEST_SIZE); + } + else + memcpy(temp, key, keySize); + + for (i = 0; i < SHA512_NUM_BLOCK_WORDS; i++) + temp[i] ^= UINT64_CONST(0x3636363636363636); + + Sha512_Init(&_sha, SHA512_DIGEST_SIZE); + Sha512_Update(&_sha, (const Byte *)temp, kBlockSize); + + for (i = 0; i < SHA512_NUM_BLOCK_WORDS; i++) + temp[i] ^= UINT64_CONST(0x3636363636363636) ^ UINT64_CONST(0x5C5C5C5C5C5C5C5C); + + Sha512_Init(&_sha2, SHA512_DIGEST_SIZE); + Sha512_Update(&_sha2, (const Byte *)temp, kBlockSize); +} + + +void CHmac::Final(Byte *mac) +{ + Sha512_Final(&_sha, mac, SHA512_DIGEST_SIZE); + Sha512_Update(&_sha2, mac, SHA512_DIGEST_SIZE); + Sha512_Final(&_sha2, mac, SHA512_DIGEST_SIZE); +} + +}} diff --git a/CPP/7zip/Crypto/HmacSha512.h b/CPP/7zip/Crypto/HmacSha512.h new file mode 100644 index 000000000..ce337d91a --- /dev/null +++ b/CPP/7zip/Crypto/HmacSha512.h @@ -0,0 +1,29 @@ +// HmacSha512.h +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#ifndef ZIP7_INC_CRYPTO_HMAC_SHA512_H +#define ZIP7_INC_CRYPTO_HMAC_SHA512_H + +#include "../../../C/Sha512.h" + +namespace NCrypto { +namespace NSha512 { + +const unsigned kBlockSize = SHA512_BLOCK_SIZE; +const unsigned kDigestSize = SHA512_DIGEST_SIZE; +const unsigned kNumDigestWords = SHA512_NUM_DIGEST_WORDS; + +class CHmac +{ + CSha512 _sha; + CSha512 _sha2; +public: + void SetKey(const Byte *key, size_t keySize); + void Update(const Byte *data, size_t dataSize) { Sha512_Update(&_sha, data, dataSize); } + void Final(Byte *mac); +}; + +}} + +#endif diff --git 
a/CPP/7zip/Crypto/Pbkdf2HmacSha512.cpp b/CPP/7zip/Crypto/Pbkdf2HmacSha512.cpp new file mode 100644 index 000000000..55f97285f --- /dev/null +++ b/CPP/7zip/Crypto/Pbkdf2HmacSha512.cpp @@ -0,0 +1,63 @@ +// Pbkdf2HmacSha512.cpp +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#include "StdAfx.h" + +#include + +#include "../../../C/CpuArch.h" + +#include "HmacSha512.h" +#include "Pbkdf2HmacSha512.h" + +namespace NCrypto { +namespace NSha512 { + +void Pbkdf2Hmac(const Byte *pwd, size_t pwdSize, + const Byte *salt, size_t saltSize, + UInt32 numIterations, + Byte *key, size_t keySize) +{ + MY_ALIGN (16) + CHmac baseCtx; + baseCtx.SetKey(pwd, pwdSize); + + for (UInt32 i = 1; keySize != 0; i++) + { + MY_ALIGN (16) + CHmac ctx; + ctx = baseCtx; + ctx.Update(salt, saltSize); + + MY_ALIGN (16) + UInt32 be_i[1]; + SetBe32(be_i, i) + + ctx.Update((const Byte *)be_i, 4); + + MY_ALIGN (16) + Byte u[kDigestSize]; + ctx.Final(u); + + MY_ALIGN (16) + Byte t[kDigestSize]; + memcpy(t, u, kDigestSize); + + for (UInt32 j = 1; j < numIterations; j++) + { + ctx = baseCtx; + ctx.Update(u, kDigestSize); + ctx.Final(u); + for (unsigned k = 0; k < kDigestSize; k++) + t[k] ^= u[k]; + } + + const unsigned curSize = (keySize < kDigestSize) ? 
(unsigned)keySize : kDigestSize; + memcpy(key, t, curSize); + key += curSize; + keySize -= curSize; + } +} + +}} diff --git a/CPP/7zip/Crypto/Pbkdf2HmacSha512.h b/CPP/7zip/Crypto/Pbkdf2HmacSha512.h new file mode 100644 index 000000000..78236bd15 --- /dev/null +++ b/CPP/7zip/Crypto/Pbkdf2HmacSha512.h @@ -0,0 +1,20 @@ +// Pbkdf2HmacSha512.h +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ + +#ifndef ZIP7_INC_CRYPTO_PBKDF2_HMAC_SHA512_H +#define ZIP7_INC_CRYPTO_PBKDF2_HMAC_SHA512_H + +#include + +#include "../../Common/MyTypes.h" + +namespace NCrypto { +namespace NSha512 { + +void Pbkdf2Hmac(const Byte *pwd, size_t pwdSize, const Byte *salt, size_t saltSize, + UInt32 numIterations, Byte *key, size_t keySize); + +}} + +#endif diff --git a/CPP/7zip/Crypto/XChaCha20.cpp b/CPP/7zip/Crypto/XChaCha20.cpp index 7932ae522..f93f8cd0a 100644 --- a/CPP/7zip/Crypto/XChaCha20.cpp +++ b/CPP/7zip/Crypto/XChaCha20.cpp @@ -1,4 +1,6 @@ // XChaCha20.cpp +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ #include "StdAfx.h" @@ -77,6 +79,16 @@ void CBase::PrepareKey() g_GlobalKeyCache.FindAndAdd(_key); } +#define DOUBLE_ROUND \ + QUARTERROUND(x0, x4, x8, x12) \ + QUARTERROUND(x1, x5, x9, x13) \ + QUARTERROUND(x2, x6, x10, x14) \ + QUARTERROUND(x3, x7, x11, x15) \ + QUARTERROUND(x0, x5, x10, x15) \ + QUARTERROUND(x1, x6, x11, x12) \ + QUARTERROUND(x2, x7, x8, x13) \ + QUARTERROUND(x3, x4, x9, x14) + void XHChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce) { UInt32 x0, x1, x2, x3, x4, x5, x6, x7; @@ -101,20 +113,8 @@ void XHChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce) x14 = GetUi32(nonce + 8); x15 = GetUi32(nonce + 12); -#define DOUBLE_ROUND \ - QUARTERROUND(x0, x4, x8, x12) \ - QUARTERROUND(x1, x5, x9, x13) \ - QUARTERROUND(x2, x6, x10, x14) \ - QUARTERROUND(x3, x7, x11, x15) \ - QUARTERROUND(x0, x5, x10, x15) \ - QUARTERROUND(x1, x6, x11, x12) \ - QUARTERROUND(x2, x7, x8, 
x13) \ - QUARTERROUND(x3, x4, x9, x14) - CHACHA20_10_DOUBLE_ROUNDS -#undef DOUBLE_ROUND - SetUi32(output, x0); SetUi32(output + 4, x1); SetUi32(output + 8, x2); @@ -149,20 +149,8 @@ void XChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce, UInt6 x14 = GetUi32(nonce); x15 = GetUi32(nonce + 4); -#define DOUBLE_ROUND \ - QUARTERROUND(x0, x4, x8, x12) \ - QUARTERROUND(x1, x5, x9, x13) \ - QUARTERROUND(x2, x6, x10, x14) \ - QUARTERROUND(x3, x7, x11, x15) \ - QUARTERROUND(x0, x5, x10, x15) \ - QUARTERROUND(x1, x6, x11, x12) \ - QUARTERROUND(x2, x7, x8, x13) \ - QUARTERROUND(x3, x4, x9, x14) - CHACHA20_10_DOUBLE_ROUNDS -#undef DOUBLE_ROUND - x0 += GetUi32(kSigma); x1 += GetUi32(kSigma + 4); x2 += GetUi32(kSigma + 8); @@ -195,9 +183,11 @@ void XChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce, UInt6 SetUi32(output + 48, x12) SetUi32(output + 52, x13) SetUi32(output + 56, x14) - SetUi32(output + 60, x15) + SetUi32(output + 60, x15); } +#undef DOUBLE_ROUND + void CBaseCoder::ProcessData(Byte *data, UInt32 size) { if (!_derivedKeyValid) diff --git a/CPP/7zip/Crypto/XChaCha20.h b/CPP/7zip/Crypto/XChaCha20.h index 652057dc9..01b4c7c6b 100644 --- a/CPP/7zip/Crypto/XChaCha20.h +++ b/CPP/7zip/Crypto/XChaCha20.h @@ -1,4 +1,6 @@ // XChaCha20.h +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ #ifndef ZIP7_INC_CRYPTO_XCHACHA20_H #define ZIP7_INC_CRYPTO_XCHACHA20_H diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp index 4c09a04db..2e03c6794 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp @@ -1,4 +1,6 @@ // XChaCha20Poly1305.cpp +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ #include "StdAfx.h" @@ -322,11 +324,6 @@ static void Poly1305_ProcessBlock_SSE2_4Way(Byte h[16], const Byte r[16], const Poly1305_ReduceAndPack(h, m); } -static void Poly1305_ProcessBlock_SSE2(Byte h[16], const Byte 
r[16], const Byte block[16], bool hasHighBit) -{ - Poly1305_ProcessBlock_SSE2_4Way(h, r, block, hasHighBit); -} - #endif #if !defined(MY_CPU_X86_OR_AMD64) || !defined(MY_CPU_SSE2) @@ -379,31 +376,29 @@ static void Poly1305_ProcessBlock(Byte h[16], const Byte r[16], const Byte block static void Poly1305_ProcessBlock(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) { #if defined(MY_CPU_X86_OR_AMD64) && defined(MY_CPU_SSE2) - Poly1305_ProcessBlock_SSE2(h, r, block, hasHighBit); + Poly1305_ProcessBlock_SSE2_4Way(h, r, block, hasHighBit); #else Poly1305_ProcessBlock_32(h, r, block, hasHighBit); #endif } #endif -void CPoly1305::Update(const Byte *data, UInt32 size) +void CPoly1305::ProcessBlocks(Byte *buf, unsigned &bufPos, UInt64 &len, const Byte *data, UInt32 size) { - if (_finalized) - return; - _totalLen += size; + len += size; - if (_blockPos > 0) + if (bufPos > 0) { - unsigned n = 16 - _blockPos; + unsigned n = 16 - bufPos; if (n > size) n = size; - memcpy(_block + _blockPos, data, n); - _blockPos += n; + memcpy(buf + bufPos, data, n); + bufPos += n; data += n; size -= n; - if (_blockPos == 16) + if (bufPos == 16) { - Poly1305_ProcessBlock(_h, _r, _block, true); - _blockPos = 0; + Poly1305_ProcessBlock(_h, _r, buf, true); + bufPos = 0; } } @@ -416,44 +411,21 @@ void CPoly1305::Update(const Byte *data, UInt32 size) if (size > 0) { - memcpy(_block, data, size); - _blockPos = size; + memcpy(buf, data, size); + bufPos = size; } } -void CPoly1305::UpdateAad(const Byte *data, UInt32 size) +void CPoly1305::Update(const Byte *data, UInt32 size) { - if (_finalized) - return; - _aadLen += size; - - if (_aadBlockPos > 0) - { - unsigned n = 16 - _aadBlockPos; - if (n > size) n = size; - memcpy(_aadBlock + _aadBlockPos, data, n); - _aadBlockPos += n; - data += n; - size -= n; - if (_aadBlockPos == 16) - { - Poly1305_ProcessBlock(_h, _r, _aadBlock, true); - _aadBlockPos = 0; - } - } - - while (size >= 16) - { - Poly1305_ProcessBlock(_h, _r, data, true); - 
data += 16; - size -= 16; - } + if (_finalized) return; + ProcessBlocks(_block, _blockPos, _totalLen, data, size); +} - if (size > 0) - { - memcpy(_aadBlock, data, size); - _aadBlockPos = size; - } +void CPoly1305::UpdateAad(const Byte *data, UInt32 size) +{ + if (_finalized) return; + ProcessBlocks(_aadBlock, _aadBlockPos, _aadLen, data, size); } void CPoly1305::PadAndProcessBlock(Byte *buf, unsigned bufPos, UInt64 len) @@ -747,4 +719,4 @@ Z7_COM7F_IMF(CDecoder::CryptoAuthVerify(Int32 *result)) return S_OK; } -}} // namespace NCrypto::NXChaCha20Poly1305 \ No newline at end of file +}} \ No newline at end of file diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.h b/CPP/7zip/Crypto/XChaCha20Poly1305.h index 57a84ac9c..19cd5129f 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.h +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.h @@ -1,4 +1,6 @@ // XChaCha20Poly1305.h +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ #ifndef ZIP7_INC_CRYPTO_XCHACHA20_POLY1305_H #define ZIP7_INC_CRYPTO_XCHACHA20_POLY1305_H @@ -33,6 +35,7 @@ class CPoly1305 UInt64 _aadLen; void PadAndProcessBlock(Byte *buf, unsigned bufPos, UInt64 len); + void ProcessBlocks(Byte *buf, unsigned &bufPos, UInt64 &len, const Byte *data, UInt32 size); public: CPoly1305(); void SetKey(const Byte *key); diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp index d41614829..c7fe3bd32 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp +++ b/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp @@ -1,4 +1,6 @@ // XChaCha20Poly1305Register.cpp +// Copyright (C) fzxx Contributor: https://github.com/fzxx +// License: GNU LGPL v2.1+ #include "StdAfx.h" diff --git a/CPP/7zip/Crypto/XChaCha20Register.cpp b/CPP/7zip/Crypto/XChaCha20Register.cpp index 5d22ccf73..f429cfb23 100644 --- a/CPP/7zip/Crypto/XChaCha20Register.cpp +++ b/CPP/7zip/Crypto/XChaCha20Register.cpp @@ -1,4 +1,6 @@ // XChaCha20Register.cpp +// Copyright (C) fzxx Contributor: 
https://github.com/fzxx +// License: GNU LGPL v2.1+ #include "StdAfx.h" diff --git a/CPP/7zip/Sha512.mak b/CPP/7zip/Sha512.mak new file mode 100644 index 000000000..f251dee37 --- /dev/null +++ b/CPP/7zip/Sha512.mak @@ -0,0 +1,6 @@ +COMMON_OBJS = $(COMMON_OBJS) \ + $O\Sha512Prepare.obj + +C_OBJS = $(C_OBJS) \ + $O\Sha512.obj \ + $O\Sha512Opt.obj diff --git a/CPP/7zip/UI/GUI/CompressDialog.cpp b/CPP/7zip/UI/GUI/CompressDialog.cpp index e306d12bd..13f364968 100644 --- a/CPP/7zip/UI/GUI/CompressDialog.cpp +++ b/CPP/7zip/UI/GUI/CompressDialog.cpp @@ -1731,17 +1731,25 @@ void CCompressDialog::SetEncryptionMethod() { const NCompression::CFormatOptions &fo = m_RegistryInfo.Formats[index]; encryptionMethod = fo.EncryptionMethod; + encryptionMethod.RemoveChar(L'-'); + encryptionMethod.RemoveChar(L'+'); + encryptionMethod.MakeLower_Ascii(); } ComboBox_AddStringAscii(_encryptionMethod, "AES-256"); ComboBox_AddStringAscii(_encryptionMethod, "XChaCha20"); ComboBox_AddStringAscii(_encryptionMethod, "XChaCha20-Poly1305"); + ComboBox_AddStringAscii(_encryptionMethod, "AES+XChaCha20-Poly1305"); + ComboBox_AddStringAscii(_encryptionMethod, "AES+XChaCha20+Ascon"); int sel = 0; - if (encryptionMethod.IsPrefixedBy_Ascii_NoCase("xchacha20poly1305") - || encryptionMethod.IsPrefixedBy_Ascii_NoCase("xchacha20-poly1305")) + if (encryptionMethod.IsEqualTo_Ascii_NoCase("xchacha20poly1305")) sel = 2; - else if (encryptionMethod.IsPrefixedBy_Ascii_NoCase("xchacha")) + else if (encryptionMethod.IsEqualTo_Ascii_NoCase("xchacha20")) sel = 1; - else if (encryptionMethod.IsPrefixedBy_Ascii_NoCase("aes")) + else if (encryptionMethod.IsEqualTo_Ascii_NoCase("aesxchacha20poly1305") || encryptionMethod.IsEqualTo_Ascii_NoCase("axp")) + sel = 3; + else if (encryptionMethod.IsEqualTo_Ascii_NoCase("aesxchacha20ascon") || encryptionMethod.IsEqualTo_Ascii_NoCase("axa")) + sel = 4; + else if (encryptionMethod.IsEqualTo_Ascii_NoCase("aes256")) sel = 0; _encryptionMethod.SetCurSel(sel); 
_default_encryptionMethod_Index = 0; diff --git a/CPP/7zip/UI/GUI/CompressDialog.rc b/CPP/7zip/UI/GUI/CompressDialog.rc index df1516c35..a1888dd13 100644 --- a/CPP/7zip/UI/GUI/CompressDialog.rc +++ b/CPP/7zip/UI/GUI/CompressDialog.rc @@ -136,8 +136,8 @@ BEGIN CONTROL "Show Password", IDX_PASSWORD_SHOW, MY_CHECKBOX, g4x2, yPsw + 79, g4xs2, 10 - LTEXT "&Encryption method:", IDT_COMPRESS_ENCRYPTION_METHOD, g4x2, yPsw + 95, 100, 8 - COMBOBOX IDC_COMPRESS_ENCRYPTION_METHOD, g4x2 + 100, yPsw + 93, g4xs2 - 100, 198, MY_COMBO + LTEXT "&Encryption method:", IDT_COMPRESS_ENCRYPTION_METHOD, g4x2, yPsw + 95, 70, 8 + COMBOBOX IDC_COMPRESS_ENCRYPTION_METHOD, g4x2 + 70, yPsw + 93, g4xs2 - 70, 198, MY_COMBO CONTROL "Encrypt file &names", IDX_COMPRESS_ENCRYPT_FILE_NAMES, MY_CHECKBOX, g4x2, yPsw + 111, g4xs2, 10 diff --git a/DOC/Methods.txt b/DOC/Methods.txt index 3e5707c73..6f9b2f9a4 100644 --- a/DOC/Methods.txt +++ b/DOC/Methods.txt @@ -1,8 +1,8 @@ 7-Zip method IDs for 7z and xz archives --------------------------------------- -Version: 24.02 -Date: 2024-03-22 +Version: 26.01 +Date: 2026-06-11 Each compression or crypto method in 7z is associated with unique binary value (ID). The length of ID in bytes is arbitrary but it can not exceed 63 bits (8 bytes). 
@@ -171,6 +171,10 @@ List of defined IDs 07 - [7z] 01 - 7zAES (AES-256 + SHA-256) + 02 - XChaCha20 (SHA-256) + 03 - XChaCha20-Poly1305 (SHA-256) + 04 - AES+XChaCha20-Poly1305 (PBKDF2-SHA512 + HKDF-BLAKE2sp) + 05 - AES+XChaCha20+Ascon (PBKDF2-SHA512 + HKDF-BLAKE2sp) --- From 443b611245d510114fd56f438bac41c758db1f80 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Fri, 12 Jun 2026 08:06:43 +0800 Subject: [PATCH 10/18] Update document --- DOC/Methods.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/DOC/Methods.txt b/DOC/Methods.txt index 6f9b2f9a4..11bc9bccc 100644 --- a/DOC/Methods.txt +++ b/DOC/Methods.txt @@ -2,7 +2,7 @@ --------------------------------------- Version: 26.01 -Date: 2026-06-11 +Date: 2026-06-12 Each compression or crypto method in 7z is associated with unique binary value (ID). The length of ID in bytes is arbitrary but it can not exceed 63 bits (8 bytes). @@ -170,11 +170,11 @@ List of defined IDs 03 - Rar29AES (AES-128 + modified SHA-1) 07 - [7z] - 01 - 7zAES (AES-256 + SHA-256) + 01 - 7zAES (AES-256-CBC + SHA-256) 02 - XChaCha20 (SHA-256) 03 - XChaCha20-Poly1305 (SHA-256) - 04 - AES+XChaCha20-Poly1305 (PBKDF2-SHA512 + HKDF-BLAKE2sp) - 05 - AES+XChaCha20+Ascon (PBKDF2-SHA512 + HKDF-BLAKE2sp) + 04 - AES-256-CTR + XChaCha20-Poly1305 (PBKDF2-SHA512 + HKDF-BLAKE2sp) + 05 - AES-256-CTR + XChaCha20 + Ascon (PBKDF2-SHA512 + HKDF-BLAKE2sp) --- From e8f86632912b19d9700c96e3e35f3eb13275b630 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Fri, 12 Jun 2026 15:51:11 +0800 Subject: [PATCH 11/18] Fix omitted registry handling, Fix SIMD calls --- CPP/7zip/Crypto/Ascon.cpp | 1 + CPP/7zip/Crypto/CascadeRegister.cpp | 2 +- CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp | 2 +- CPP/7zip/UI/GUI/CompressDialog.cpp | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CPP/7zip/Crypto/Ascon.cpp b/CPP/7zip/Crypto/Ascon.cpp index 873957c91..e0dbb2e1b 100644 --- 
a/CPP/7zip/Crypto/Ascon.cpp +++ b/CPP/7zip/Crypto/Ascon.cpp @@ -141,6 +141,7 @@ void AsconP12(UInt64 state[5]) void AsconP8(UInt64 state[5]) { #ifdef MY_CPU_AMD64 + InitSIMD(); if (g_AVX512Enabled) { UInt64 st[8] = { state[0], state[1], state[2], state[3], state[4] }; diff --git a/CPP/7zip/Crypto/CascadeRegister.cpp b/CPP/7zip/Crypto/CascadeRegister.cpp index cfaa1c7e1..bdc5c273c 100644 --- a/CPP/7zip/Crypto/CascadeRegister.cpp +++ b/CPP/7zip/Crypto/CascadeRegister.cpp @@ -26,4 +26,4 @@ REGISTER_FILTER_E(AXA, CEncoder, 0x6F10705, "AES+XChaCha20+Ascon") -}} \ No newline at end of file +}} diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp index c7fe3bd32..75bbdc1fe 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp +++ b/CPP/7zip/Crypto/XChaCha20Poly1305Register.cpp @@ -16,4 +16,4 @@ REGISTER_FILTER_E(XChaCha20Poly1305, CEncoder, 0x6F10703, "XChaCha20-Poly1305") -}} \ No newline at end of file +}} diff --git a/CPP/7zip/UI/GUI/CompressDialog.cpp b/CPP/7zip/UI/GUI/CompressDialog.cpp index 13f364968..4baca7bb3 100644 --- a/CPP/7zip/UI/GUI/CompressDialog.cpp +++ b/CPP/7zip/UI/GUI/CompressDialog.cpp @@ -1839,6 +1839,7 @@ UString CCompressDialog::GetEncryptionMethodSpec() { _encryptionMethod.GetText(s); s.RemoveChar(L'-'); + s.RemoveChar(L'+'); } return s; } From 264f0ed2bdf36eadd087ae9e57db4b45d4ccad2d Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Fri, 12 Jun 2026 21:50:46 +0800 Subject: [PATCH 12/18] Fix missing symbols --- CPP/7zip/Crypto/7zKeyDerivation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CPP/7zip/Crypto/7zKeyDerivation.cpp b/CPP/7zip/Crypto/7zKeyDerivation.cpp index 96bed8698..111155d73 100644 --- a/CPP/7zip/Crypto/7zKeyDerivation.cpp +++ b/CPP/7zip/Crypto/7zKeyDerivation.cpp @@ -95,7 +95,7 @@ void CKeyInfo::CalcKey() r += numUnroll; do { - SetUi32(dest, i) i++; dest += bufSize; + SetUi32(dest, i); i++; dest += bufSize; } while (i < 
r); Sha256_Update((CSha256 *)(void *)(Byte *)sha, buf, unrollSize); From f513488348948433a93a7a7e7e4734d67d7f8f04 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Sat, 13 Jun 2026 23:56:22 +0800 Subject: [PATCH 13/18] Fix build failure. --- CPP/7zip/7zip_gcc.mak | 8 ++ CPP/7zip/Bundles/Alone/makefile | 10 ++ CPP/7zip/Bundles/Alone/makefile.gcc | 10 ++ CPP/7zip/Bundles/Alone7z/makefile | 12 +- CPP/7zip/Bundles/Alone7z/makefile.gcc | 12 +- CPP/7zip/Bundles/Format7z/makefile | 12 +- CPP/7zip/Bundles/Format7zExtract/makefile | 12 +- CPP/7zip/Bundles/Format7zF/Arc_gcc.mak | 4 + CPP/7zip/Bundles/SFXCon/makefile.gcc | 12 +- CPP/7zip/Crypto/7zKeyDerivation.cpp | 7 ++ CPP/7zip/Crypto/Ascon.cpp | 5 +- CPP/7zip/Crypto/AsconSimd.h | 17 ++- CPP/7zip/Crypto/Cascade.cpp | 16 +++ CPP/7zip/Crypto/ChaCha20Simd.h | 21 +++- CPP/7zip/Crypto/XChaCha20Poly1305.cpp | 139 ++++++++++------------ CPP/7zip/Crypto/XChaCha20Poly1305.h | 2 +- CPP/Build.mak | 3 + 17 files changed, 208 insertions(+), 94 deletions(-) diff --git a/CPP/7zip/7zip_gcc.mak b/CPP/7zip/7zip_gcc.mak index 37a2d8674..69dc1f695 100644 --- a/CPP/7zip/7zip_gcc.mak +++ b/CPP/7zip/7zip_gcc.mak @@ -865,6 +865,14 @@ $O/ZipCrypto.o: ../../Crypto/ZipCrypto.cpp $(CXX) $(CXXFLAGS) $< $O/ZipStrong.o: ../../Crypto/ZipStrong.cpp $(CXX) $(CXXFLAGS) $< +$O/XChaCha20.o: ../../Crypto/XChaCha20.cpp + $(CXX) $(CXXFLAGS) $< +$O/XChaCha20Register.o: ../../Crypto/XChaCha20Register.cpp + $(CXX) $(CXXFLAGS) $< +$O/XChaCha20Poly1305.o: ../../Crypto/XChaCha20Poly1305.cpp + $(CXX) $(CXXFLAGS) $< +$O/XChaCha20Poly1305Register.o: ../../Crypto/XChaCha20Poly1305Register.cpp + $(CXX) $(CXXFLAGS) $< diff --git a/CPP/7zip/Bundles/Alone/makefile b/CPP/7zip/Bundles/Alone/makefile index 8905c21bd..a7f8c0160 100644 --- a/CPP/7zip/Bundles/Alone/makefile +++ b/CPP/7zip/Bundles/Alone/makefile @@ -187,7 +187,12 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ + $O\Ascon.obj \ + $O\Cascade.obj \ + 
$O\CascadeRegister.obj \ + $O\HkdfBlake2sp.obj \ $O\HmacSha1.obj \ + $O\HmacSha256.obj \ $O\HmacSha512.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ @@ -195,6 +200,10 @@ CRYPTO_OBJS = \ $O\Pbkdf2HmacSha512.obj \ $O\RandGen.obj \ $O\WzAes.obj \ + $O\XChaCha20.obj \ + $O\XChaCha20Register.obj \ + $O\XChaCha20Poly1305.obj \ + $O\XChaCha20Poly1305Register.obj \ $O\ZipCrypto.obj \ $O\ZipStrong.obj \ @@ -203,6 +212,7 @@ C_OBJS = \ $O\Alloc.obj \ $O\Bcj2.obj \ $O\Bcj2Enc.obj \ + $O\Blake2s.obj \ $O\Bra.obj \ $O\Bra86.obj \ $O\BraIA64.obj \ diff --git a/CPP/7zip/Bundles/Alone/makefile.gcc b/CPP/7zip/Bundles/Alone/makefile.gcc index 8c4f60401..5e8ab3f44 100644 --- a/CPP/7zip/Bundles/Alone/makefile.gcc +++ b/CPP/7zip/Bundles/Alone/makefile.gcc @@ -280,7 +280,12 @@ CRYPTO_OBJS = \ $O/7zAes.o \ $O/7zAesRegister.o \ $O/7zKeyDerivation.o \ + $O/Ascon.o \ + $O/Cascade.o \ + $O/CascadeRegister.o \ + $O/HkdfBlake2sp.o \ $O/HmacSha1.o \ + $O/HmacSha256.o \ $O/HmacSha512.o \ $O/MyAes.o \ $O/MyAesReg.o \ @@ -288,6 +293,10 @@ CRYPTO_OBJS = \ $O/Pbkdf2HmacSha512.o \ $O/RandGen.o \ $O/WzAes.o \ + $O/XChaCha20.o \ + $O/XChaCha20Register.o \ + $O/XChaCha20Poly1305.o \ + $O/XChaCha20Poly1305Register.o \ $O/ZipCrypto.o \ $O/ZipStrong.o \ @@ -300,6 +309,7 @@ C_OBJS = \ $O/Alloc.o \ $O/Bcj2.o \ $O/Bcj2Enc.o \ + $O/Blake2s.o \ $O/Bra.o \ $O/Bra86.o \ $O/BraIA64.o \ diff --git a/CPP/7zip/Bundles/Alone7z/makefile b/CPP/7zip/Bundles/Alone7z/makefile index 173f02d66..ea3702cf3 100644 --- a/CPP/7zip/Bundles/Alone7z/makefile +++ b/CPP/7zip/Bundles/Alone7z/makefile @@ -126,17 +126,27 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ + $O\Ascon.obj \ + $O\Cascade.obj \ + $O\CascadeRegister.obj \ + $O\HkdfBlake2sp.obj \ + $O\HmacSha256.obj \ $O\HmacSha512.obj \ - $O\Pbkdf2HmacSha512.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ + $O\Pbkdf2HmacSha512.obj \ $O\RandGen.obj \ + $O\XChaCha20.obj \ + $O\XChaCha20Register.obj \ + $O\XChaCha20Poly1305.obj \ + $O\XChaCha20Poly1305Register.obj \ 
C_OBJS = \ $O\7zStream.obj \ $O\Alloc.obj \ $O\Bcj2.obj \ $O\Bcj2Enc.obj \ + $O\Blake2s.obj \ $O\Bra.obj \ $O\Bra86.obj \ $O\BraIA64.obj \ diff --git a/CPP/7zip/Bundles/Alone7z/makefile.gcc b/CPP/7zip/Bundles/Alone7z/makefile.gcc index 91dfcd913..a37e54677 100644 --- a/CPP/7zip/Bundles/Alone7z/makefile.gcc +++ b/CPP/7zip/Bundles/Alone7z/makefile.gcc @@ -223,11 +223,20 @@ CRYPTO_OBJS = \ $O/7zAes.o \ $O/7zAesRegister.o \ $O/7zKeyDerivation.o \ + $O/Ascon.o \ + $O/Cascade.o \ + $O/CascadeRegister.o \ + $O/HkdfBlake2sp.o \ + $O/HmacSha256.o \ $O/HmacSha512.o \ - $O/Pbkdf2HmacSha512.o \ $O/MyAes.o \ $O/MyAesReg.o \ + $O/Pbkdf2HmacSha512.o \ $O/RandGen.o \ + $O/XChaCha20.o \ + $O/XChaCha20Register.o \ + $O/XChaCha20Poly1305.o \ + $O/XChaCha20Poly1305Register.o \ C_OBJS = \ $O/7zCrc.o \ @@ -238,6 +247,7 @@ C_OBJS = \ $O/Alloc.o \ $O/Bcj2.o \ $O/Bcj2Enc.o \ + $O/Blake2s.o \ $O/Bra.o \ $O/Bra86.o \ $O/BraIA64.o \ diff --git a/CPP/7zip/Bundles/Format7z/makefile b/CPP/7zip/Bundles/Format7z/makefile index 62b76dd91..5c8235904 100644 --- a/CPP/7zip/Bundles/Format7z/makefile +++ b/CPP/7zip/Bundles/Format7z/makefile @@ -108,17 +108,27 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ $O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ + $O\Ascon.obj \ + $O\Cascade.obj \ + $O\CascadeRegister.obj \ + $O\HkdfBlake2sp.obj \ + $O\HmacSha256.obj \ $O\HmacSha512.obj \ - $O\Pbkdf2HmacSha512.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ + $O\Pbkdf2HmacSha512.obj \ $O\RandGen.obj \ + $O\XChaCha20.obj \ + $O\XChaCha20Register.obj \ + $O\XChaCha20Poly1305.obj \ + $O\XChaCha20Poly1305Register.obj \ C_OBJS = \ $O\7zStream.obj \ $O\Alloc.obj \ $O\Bcj2.obj \ $O\Bcj2Enc.obj \ + $O\Blake2s.obj \ $O\Bra.obj \ $O\Bra86.obj \ $O\BraIA64.obj \ diff --git a/CPP/7zip/Bundles/Format7zExtract/makefile b/CPP/7zip/Bundles/Format7zExtract/makefile index c03da305b..b26631a5c 100644 --- a/CPP/7zip/Bundles/Format7zExtract/makefile +++ b/CPP/7zip/Bundles/Format7zExtract/makefile @@ -88,15 +88,25 @@ CRYPTO_OBJS = \ $O\7zAes.obj \ 
$O\7zAesRegister.obj \ $O\7zKeyDerivation.obj \ + $O\Ascon.obj \ + $O\Cascade.obj \ + $O\CascadeRegister.obj \ + $O\HkdfBlake2sp.obj \ + $O\HmacSha256.obj \ $O\HmacSha512.obj \ - $O\Pbkdf2HmacSha512.obj \ $O\MyAes.obj \ $O\MyAesReg.obj \ + $O\Pbkdf2HmacSha512.obj \ + $O\XChaCha20.obj \ + $O\XChaCha20Register.obj \ + $O\XChaCha20Poly1305.obj \ + $O\XChaCha20Poly1305Register.obj \ C_OBJS = \ $O\7zStream.obj \ $O\Alloc.obj \ $O\Bcj2.obj \ + $O\Blake2s.obj \ $O\Bra.obj \ $O\Bra86.obj \ $O\BraIA64.obj \ diff --git a/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak b/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak index 4ca3ae63c..07933d6ab 100644 --- a/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak +++ b/CPP/7zip/Bundles/Format7zF/Arc_gcc.mak @@ -312,6 +312,10 @@ CRYPTO_OBJS = \ $O/Pbkdf2HmacSha512.o \ $O/RandGen.o \ $O/WzAes.o \ + $O/XChaCha20.o \ + $O/XChaCha20Poly1305.o \ + $O/XChaCha20Poly1305Register.o \ + $O/XChaCha20Register.o \ $O/ZipCrypto.o \ $O/ZipStrong.o \ diff --git a/CPP/7zip/Bundles/SFXCon/makefile.gcc b/CPP/7zip/Bundles/SFXCon/makefile.gcc index d9b996e7e..8c4899386 100644 --- a/CPP/7zip/Bundles/SFXCon/makefile.gcc +++ b/CPP/7zip/Bundles/SFXCon/makefile.gcc @@ -173,14 +173,24 @@ CRYPTO_OBJS = \ $O/7zAes.o \ $O/7zAesRegister.o \ $O/7zKeyDerivation.o \ + $O/Ascon.o \ + $O/Cascade.o \ + $O/CascadeRegister.o \ + $O/HkdfBlake2sp.o \ + $O/HmacSha256.o \ $O/HmacSha512.o \ - $O/Pbkdf2HmacSha512.o \ $O/MyAes.o \ + $O/Pbkdf2HmacSha512.o \ + $O/XChaCha20.o \ + $O/XChaCha20Register.o \ + $O/XChaCha20Poly1305.o \ + $O/XChaCha20Poly1305Register.o \ C_OBJS = \ $O/7zStream.o \ $O/Alloc.o \ $O/Bcj2.o \ + $O/Blake2s.o \ $O/Bra.o \ $O/Bra86.o \ $O/BraIA64.o \ diff --git a/CPP/7zip/Crypto/7zKeyDerivation.cpp b/CPP/7zip/Crypto/7zKeyDerivation.cpp index 111155d73..c7b3da3d6 100644 --- a/CPP/7zip/Crypto/7zKeyDerivation.cpp +++ b/CPP/7zip/Crypto/7zKeyDerivation.cpp @@ -16,9 +16,16 @@ namespace N7zKeyDerivation { static bool ConstantTimeCompare(const Byte *a, const Byte *b, size_t size) { +#if 
defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4746) +#endif volatile Byte result = 0; for (size_t i = 0; i < size; i++) result |= a[i] ^ b[i]; +#if defined(_MSC_VER) +#pragma warning(pop) +#endif return result == 0; } diff --git a/CPP/7zip/Crypto/Ascon.cpp b/CPP/7zip/Crypto/Ascon.cpp index e0dbb2e1b..c73f0cc51 100644 --- a/CPP/7zip/Crypto/Ascon.cpp +++ b/CPP/7zip/Crypto/Ascon.cpp @@ -15,10 +15,11 @@ namespace NAscon { #ifdef MY_CPU_X86_OR_AMD64 bool g_SSE2Enabled = false; -bool g_AVX512Enabled = false; bool g_SIMDInitialized = false; #ifdef MY_CPU_AMD64 +bool g_AVX512Enabled = false; + static UInt64 Ascon_xgetbv(UInt32 num) { #if defined(_MSC_VER) @@ -27,6 +28,8 @@ static UInt64 Ascon_xgetbv(UInt32 num) UInt32 a, d; __asm__ __volatile__("xgetbv" : "=a"(a), "=d"(d) : "c"(num) : "cc"); return ((UInt64)d << 32) | a; +#else + return 0; #endif } #endif diff --git a/CPP/7zip/Crypto/AsconSimd.h b/CPP/7zip/Crypto/AsconSimd.h index c285ffe70..e9a16543f 100644 --- a/CPP/7zip/Crypto/AsconSimd.h +++ b/CPP/7zip/Crypto/AsconSimd.h @@ -17,9 +17,12 @@ namespace NAscon { #ifdef MY_CPU_X86_OR_AMD64 extern bool g_SSE2Enabled; -extern bool g_AVX512Enabled; extern bool g_SIMDInitialized; +#ifdef MY_CPU_AMD64 +extern bool g_AVX512Enabled; +#endif + void InitSIMD(); #endif @@ -51,7 +54,13 @@ static Z7_FORCE_INLINE void AsconDecBlock_SSE2(UInt64 state[5], Byte *data) #include -static Z7_FORCE_INLINE void AsconRound_AVX512(UInt64 *st, UInt64 C) +#if defined(__GNUC__) || defined(__clang__) +#define Z7_AVX512_TARGET __attribute__((target("avx512f"))) +#else +#define Z7_AVX512_TARGET +#endif + +static Z7_AVX512_TARGET Z7_FORCE_INLINE void AsconRound_AVX512(UInt64 *st, UInt64 C) { const UInt64 z = 0; const __mmask8 mxor1 = 0x15; @@ -85,7 +94,7 @@ static Z7_FORCE_INLINE void AsconRound_AVX512(UInt64 *st, UInt64 C) _mm512_storeu_si512((void*)st, s); } -static Z7_FORCE_INLINE void AsconP12_AVX512(UInt64 state[5]) +static Z7_AVX512_TARGET inline void AsconP12_AVX512(UInt64 
state[5]) { AsconRound_AVX512(state, 0xf0); AsconRound_AVX512(state, 0xe1); AsconRound_AVX512(state, 0xd2); AsconRound_AVX512(state, 0xc3); @@ -95,7 +104,7 @@ static Z7_FORCE_INLINE void AsconP12_AVX512(UInt64 state[5]) AsconRound_AVX512(state, 0x5a); AsconRound_AVX512(state, 0x4b); } -static Z7_FORCE_INLINE void AsconP8_AVX512(UInt64 state[5]) +static Z7_AVX512_TARGET inline void AsconP8_AVX512(UInt64 state[5]) { AsconRound_AVX512(state, 0xb4); AsconRound_AVX512(state, 0xa5); AsconRound_AVX512(state, 0x96); AsconRound_AVX512(state, 0x87); diff --git a/CPP/7zip/Crypto/Cascade.cpp b/CPP/7zip/Crypto/Cascade.cpp index 57b44e035..26e9764b6 100644 --- a/CPP/7zip/Crypto/Cascade.cpp +++ b/CPP/7zip/Crypto/Cascade.cpp @@ -458,11 +458,18 @@ Z7_COM7F_IMF(CAXPDecoder::CryptoAuthVerify(Int32 *result)) _finalized = true; { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4746) +#endif volatile Byte diff = 0; for (unsigned i = 0; i < kTagSize; i++) diff |= computedTag[i] ^ _expectedTag[i]; *result = (diff == 0) ? 0 : 1; _authOk = (diff == 0); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif } Z7_memset_0_ARRAY(computedTag); @@ -664,6 +671,7 @@ void CBaseCoder::ProcessEnc(Byte *data, UInt32 size) const bool useSSE2 = ASCON_USE_SSE2; #else const bool useSSE2 = false; + (void)useSSE2; #endif if (!_keyDerived) { @@ -743,6 +751,7 @@ void CBaseCoder::ProcessDec(Byte *data, UInt32 size) const bool useSSE2 = ASCON_USE_SSE2; #else const bool useSSE2 = false; + (void)useSSE2; #endif if (!_keyDerived) { @@ -1188,11 +1197,18 @@ Z7_COM7F_IMF(CDecoder::CryptoAuthVerify(Int32 *result)) Finalize(computedTag); { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4746) +#endif volatile Byte diff = 0; for (unsigned i = 0; i < NAscon::kTagSize; i++) diff |= computedTag[i] ^ _expectedTag[i]; *result = (diff == 0) ? 
0 : 1; _authOk = (diff == 0); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif } Z7_memset_0_ARRAY(computedTag); diff --git a/CPP/7zip/Crypto/ChaCha20Simd.h b/CPP/7zip/Crypto/ChaCha20Simd.h index 3ce6f5e65..12efe072e 100644 --- a/CPP/7zip/Crypto/ChaCha20Simd.h +++ b/CPP/7zip/Crypto/ChaCha20Simd.h @@ -222,14 +222,20 @@ Z7_NO_INLINE void ChaCha20_OperateKeystream_SSE2( #ifdef MY_CPU_AMD64 +#if defined(__GNUC__) || defined(__clang__) +#define Z7_AVX2_TARGET_ATTR __attribute__((target("avx2"))) +#else +#define Z7_AVX2_TARGET_ATTR +#endif + template -Z7_FORCE_INLINE __m256i RotateLeft_AVX2(const __m256i val) +Z7_AVX2_TARGET_ATTR Z7_FORCE_INLINE __m256i RotateLeft_AVX2(const __m256i val) { return _mm256_or_si256(_mm256_slli_epi32(val, R), _mm256_srli_epi32(val, 32 - R)); } template <> -Z7_FORCE_INLINE __m256i RotateLeft_AVX2<8>(const __m256i val) +Z7_AVX2_TARGET_ATTR Z7_FORCE_INLINE __m256i RotateLeft_AVX2<8>(const __m256i val) { const __m256i mask = _mm256_set_epi8( 14,13,12,15, 10,9,8,11, 6,5,4,7, 2,1,0,3, @@ -238,7 +244,7 @@ Z7_FORCE_INLINE __m256i RotateLeft_AVX2<8>(const __m256i val) } template <> -Z7_FORCE_INLINE __m256i RotateLeft_AVX2<16>(const __m256i val) +Z7_AVX2_TARGET_ATTR Z7_FORCE_INLINE __m256i RotateLeft_AVX2<16>(const __m256i val) { const __m256i mask = _mm256_set_epi8( 13,12,15,14, 9,8,11,10, 5,4,7,6, 1,0,3,2, @@ -260,7 +266,7 @@ Z7_FORCE_INLINE __m256i RotateLeft_AVX2<16>(const __m256i val) b = _mm256_xor_si256(b, c); \ b = RotateLeft_AVX2<7>(b); -Z7_NO_INLINE void ChaCha20_OperateKeystream_AVX2( +Z7_AVX2_TARGET_ATTR Z7_NO_INLINE void ChaCha20_OperateKeystream_AVX2( const UInt32 *state, const Byte *input, Byte *output) @@ -481,8 +487,10 @@ Z7_NO_INLINE void ChaCha20_OperateKeystream_AVX2( } static bool g_SSE2Enabled = false; -static bool g_AVX2Enabled = false; static bool g_SIMDInitialized = false; +#ifdef MY_CPU_AMD64 +static bool g_AVX2Enabled = false; +#endif static void InitSIMD() { @@ -513,7 +521,8 @@ Z7_FORCE_INLINE uint32x4_t 
RotateLeft_NEON(const uint32x4_t val) template <> Z7_FORCE_INLINE uint32x4_t RotateLeft_NEON<8>(const uint32x4_t val) { - const uint8x16_t mask = {3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14}; + static const uint8_t kMaskRot8[16] = {3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14}; + const uint8x16_t mask = vld1q_u8(kMaskRot8); return vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(val), mask)); } diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp index 2e03c6794..516b58089 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp @@ -20,6 +20,14 @@ #include "RandGen.h" #endif +#if defined(MY_CPU_AMD64) +#if defined(_MSC_VER) +#include +#else +#include +#endif +#endif + namespace NCrypto { namespace NXChaCha20Poly1305 { @@ -65,14 +73,28 @@ void CPoly1305::SetKey(const Byte *key) memcpy(_s, key + 16, 16); } -#if defined(__SIZEOF_INT128__) && (__SIZEOF_INT128__ >= 16) - #define Z7_POLY1305_128BIT -#elif defined(_M_AMD64) - #include +#if defined(MY_CPU_AMD64) #define Z7_POLY1305_128BIT #endif #ifdef Z7_POLY1305_128BIT + +#if defined(MY_CPU_AMD64) && !defined(_MSC_VER) +/* GCC/Clang fallback for _umul128 */ +static inline UInt64 Z7_umul128(UInt64 a, UInt64 b, UInt64 *hi) +{ + unsigned __int128 p = (unsigned __int128)a * b; + *hi = (UInt64)(p >> 64); + return (UInt64)p; +} +#define _umul128 Z7_umul128 +/* GCC/Clang: _addcarry_u64 uses unsigned long long, cast UInt64* to match */ +#define Z7_ADDCARRY_U64(c, x, y, out) \ + _addcarry_u64((c), (unsigned long long)(x), (unsigned long long)(y), (unsigned long long *)(out)) +#elif defined(_MSC_VER) +#define Z7_ADDCARRY_U64(c, x, y, out) _addcarry_u64((c), (x), (y), (out)) +#endif + static void Poly1305_ProcessBlock_128(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) { UInt64 d0 = GetUi32(h); @@ -91,59 +113,15 @@ static void Poly1305_ProcessBlock_128(Byte h[16], const Byte r[16], const Byte b UInt64 r0 = GetUi64(r); UInt64 r1 = 
GetUi64(r + 8); -#if defined(__SIZEOF_INT128__) && (__SIZEOF_INT128__ >= 16) - typedef unsigned __int128 U128; - - U128 hv = (U128)m0 | ((U128)m1 << 26) | ((U128)m2 << 52) | ((U128)m3 << 78) | ((U128)m4 << 104); - U128 msg = (U128)msg_lo | ((U128)msg_hi << 64); - if (hasHighBit) - msg |= (U128)1 << 128; - hv += msg; - - U128 rv = (U128)r0 | ((U128)r1 << 64); - - U128 product = hv * rv; - - UInt64 a0 = (UInt64)product; - UInt64 a1 = (UInt64)(product >> 64); - UInt64 a2 = (UInt64)(product >> 128); - UInt64 a3 = (UInt64)(product >> 192); - - U128 p_lo = (U128)a0 | ((U128)a1 << 64) | ((U128)(a2 & 3) << 128); - U128 p_hi = (a2 >> 2) | ((U128)a3 << 62); - - U128 res = p_lo + p_hi * 5; - - U128 overflow = res >> 130; - while (overflow) - { - res = (res & (((U128)1 << 130) - 1)) + overflow * 5; - overflow = res >> 130; - } - - UInt64 lo = (UInt64)res; - UInt64 hi = (UInt64)(res >> 64); - UInt32 top = (UInt32)(res >> 128); - - UInt64 limb0 = lo & 0x3FFFFFF; - UInt64 limb1 = (lo >> 26) & 0x3FFFFFF; - UInt64 limb2 = ((lo >> 52) | ((hi & 0x3FFF) << 12)) & 0x3FFFFFF; - UInt64 limb3 = (hi >> 14) & 0x3FFFFFF; - UInt64 limb4 = ((hi >> 40) | ((UInt64)top << 24)) & 0x3FFFFFF; - - SetUi32(h, (UInt32)(limb0 | (limb1 << 26))); - SetUi32(h + 4, (UInt32)((limb1 >> 6) | (limb2 << 20))); - SetUi32(h + 8, (UInt32)((limb2 >> 12) | (limb3 << 14))); - SetUi32(h + 12, (UInt32)((limb3 >> 18) | (limb4 << 8))); -#elif defined(_M_AMD64) +#if defined(MY_CPU_AMD64) { UInt64 hv0 = m0 | (m1 << 26) | ((m2 & 0xFFF) << 52); UInt64 hv1 = (m2 >> 12) | (m3 << 14) | (m4 << 40); UInt64 hv2 = 0; unsigned char c; - c = _addcarry_u64(0, hv0, msg_lo, &hv0); - c = _addcarry_u64(c, hv1, msg_hi, &hv1); + c = Z7_ADDCARRY_U64(0, hv0, msg_lo, &hv0); + c = Z7_ADDCARRY_U64(c, hv1, msg_hi, &hv1); hv2 += c + (hasHighBit ? 
1 : 0); UInt64 d0_hi, d0_lo = _umul128(hv0, r0, &d0_hi); @@ -154,19 +132,19 @@ static void Poly1305_ProcessBlock_128(Byte h[16], const Byte r[16], const Byte b UInt64 d3_hi, d3_lo = _umul128(hv2, r1, &d3_hi); UInt64 a0 = d0_lo, a1 = d0_hi, a2 = 0, a3 = 0; - c = _addcarry_u64(0, a1, d1a_lo, &a1); - c = _addcarry_u64(c, a2, d1a_hi, &a2); - c = _addcarry_u64(c, a3, 0, &a3); - c = _addcarry_u64(0, a1, d1b_lo, &a1); - c = _addcarry_u64(c, a2, d1b_hi, &a2); - c = _addcarry_u64(c, a3, 0, &a3); - c = _addcarry_u64(0, a2, d2a_lo, &a2); - c = _addcarry_u64(c, a3, d2a_hi, &a3); - c = _addcarry_u64(0, a2, d2b_lo, &a2); - c = _addcarry_u64(c, a3, d2b_hi, &a3); + c = Z7_ADDCARRY_U64(0, a1, d1a_lo, &a1); + c = Z7_ADDCARRY_U64(c, a2, d1a_hi, &a2); + c = Z7_ADDCARRY_U64(c, a3, 0, &a3); + c = Z7_ADDCARRY_U64(0, a1, d1b_lo, &a1); + c = Z7_ADDCARRY_U64(c, a2, d1b_hi, &a2); + c = Z7_ADDCARRY_U64(c, a3, 0, &a3); + c = Z7_ADDCARRY_U64(0, a2, d2a_lo, &a2); + c = Z7_ADDCARRY_U64(c, a3, d2a_hi, &a3); + c = Z7_ADDCARRY_U64(0, a2, d2b_lo, &a2); + c = Z7_ADDCARRY_U64(c, a3, d2b_hi, &a3); UInt64 a4 = c; - c = _addcarry_u64(0, a3, d3_lo, &a3); - c = _addcarry_u64(c, a4, d3_hi, &a4); + c = Z7_ADDCARRY_U64(0, a3, d3_lo, &a3); + c = Z7_ADDCARRY_U64(c, a4, d3_hi, &a4); UInt64 hi[3]; hi[0] = (a2 >> 2) | (a3 << 62); @@ -178,14 +156,14 @@ static void Poly1305_ProcessBlock_128(Byte h[16], const Byte r[16], const Byte b UInt64 h5_2 = hi[2] * 5; UInt64 lo0 = a0, lo1 = a1, lo2 = a2 & 3, lo3 = 0; - c = _addcarry_u64(0, lo0, h5_0, &lo0); - c = _addcarry_u64(c, lo1, h5_0_hi, &lo1); - c = _addcarry_u64(c, lo2, 0, &lo2); - c = _addcarry_u64(0, lo1, h5_1, &lo1); - c = _addcarry_u64(c, lo2, h5_1_hi, &lo2); - c = _addcarry_u64(c, lo3, 0, &lo3); - c = _addcarry_u64(0, lo2, h5_2, &lo2); - c = _addcarry_u64(c, lo3, 0, &lo3); + c = Z7_ADDCARRY_U64(0, lo0, h5_0, &lo0); + c = Z7_ADDCARRY_U64(c, lo1, h5_0_hi, &lo1); + c = Z7_ADDCARRY_U64(c, lo2, 0, &lo2); + c = Z7_ADDCARRY_U64(0, lo1, h5_1, &lo1); + c = 
Z7_ADDCARRY_U64(c, lo2, h5_1_hi, &lo2); + c = Z7_ADDCARRY_U64(c, lo3, 0, &lo3); + c = Z7_ADDCARRY_U64(0, lo2, h5_2, &lo2); + c = Z7_ADDCARRY_U64(c, lo3, 0, &lo3); UInt64 ov0 = lo2 >> 2; lo2 &= 3; @@ -193,18 +171,18 @@ static void Poly1305_ProcessBlock_128(Byte h[16], const Byte r[16], const Byte b UInt64 ov5_lo, ov5_hi; ov5_lo = _umul128(ov0, 5, &ov5_hi); - c = _addcarry_u64(0, lo0, ov5_lo, &lo0); - c = _addcarry_u64(c, lo1, ov5_hi, &lo1); - c = _addcarry_u64(c, lo2, 0, &lo2); + c = Z7_ADDCARRY_U64(0, lo0, ov5_lo, &lo0); + c = Z7_ADDCARRY_U64(c, lo1, ov5_hi, &lo1); + c = Z7_ADDCARRY_U64(c, lo2, 0, &lo2); ov0 = lo2 >> 2; if (ov0) { lo2 &= 3; ov5_lo = _umul128(ov0, 5, &ov5_hi); - c = _addcarry_u64(0, lo0, ov5_lo, &lo0); - c = _addcarry_u64(c, lo1, ov5_hi, &lo1); - c = _addcarry_u64(c, lo2, 0, &lo2); + c = Z7_ADDCARRY_U64(0, lo0, ov5_lo, &lo0); + c = Z7_ADDCARRY_U64(c, lo1, ov5_hi, &lo1); + c = Z7_ADDCARRY_U64(c, lo2, 0, &lo2); } UInt64 limb0 = lo0 & 0x3FFFFFF; @@ -708,10 +686,17 @@ Z7_COM7F_IMF(CDecoder::CryptoAuthVerify(Int32 *result)) _poly1305.Final(computedTag); { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4746) +#endif volatile Byte diff = 0; for (unsigned i = 0; i < kTagSize; i++) diff |= computedTag[i] ^ _expectedTag[i]; _authResult = (diff == 0) ? 
0 : 1; +#if defined(_MSC_VER) +#pragma warning(pop) +#endif } *result = _authResult; diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.h b/CPP/7zip/Crypto/XChaCha20Poly1305.h index 19cd5129f..058cac155 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.h +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.h @@ -61,7 +61,7 @@ class CBaseCoder: Byte _aad[2 + 16 + kNonceSize]; unsigned _aadSize; - void DeriveKey(); + void DeriveKey() override; void ComputePolyKey(); }; diff --git a/CPP/Build.mak b/CPP/Build.mak index d9fcfb52f..409420da5 100644 --- a/CPP/Build.mak +++ b/CPP/Build.mak @@ -164,6 +164,9 @@ LFLAGS = $(LFLAGS) /FIXED:NO !IF "$(PLATFORM)" == "arm64" # we can get better compression ratio with ARM64 filter if we change alignment to 4096 # LFLAGS = $(LFLAGS) /FILEALIGN:4096 +# ARM64 MSVC compiler doesn't suppress C4746 even with /volatile:ms flag, +# so we explicitly disable warning 4746 (volatile in /volatile:iso mode) +CFLAGS = $(CFLAGS) -wd4746 !ENDIF !IFNDEF DEF_FILE From db1b07eb7ddb94fc3caa6d25b8efc424bb0ca6ce Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Wed, 17 Jun 2026 17:47:38 +0800 Subject: [PATCH 14/18] Fix Poly-1305 bug --- CPP/7zip/Archive/7z/7zDecode.cpp | 2 + CPP/7zip/Crypto/XChaCha20.cpp | 30 +- CPP/7zip/Crypto/XChaCha20Poly1305.cpp | 454 +++++++------------------- CPP/7zip/Crypto/XChaCha20Poly1305.h | 9 +- 4 files changed, 131 insertions(+), 364 deletions(-) diff --git a/CPP/7zip/Archive/7z/7zDecode.cpp b/CPP/7zip/Archive/7z/7zDecode.cpp index 056f2d0d4..04cb7c13a 100644 --- a/CPP/7zip/Archive/7z/7zDecode.cpp +++ b/CPP/7zip/Archive/7z/7zDecode.cpp @@ -590,6 +590,7 @@ HRESULT CDecoder::Decode( if (codeResult == S_OK) { + #ifndef Z7_NO_CRYPTO for (i = 0; i < folderInfo.Coders.Size(); i++) { Z7_DECL_CMyComPtr_QI_FROM( @@ -603,6 +604,7 @@ HRESULT CDecoder::Decode( return E_FAIL; } } + #endif } return codeResult; } diff --git a/CPP/7zip/Crypto/XChaCha20.cpp b/CPP/7zip/Crypto/XChaCha20.cpp index f93f8cd0a..c06de0b2a 100644 --- 
a/CPP/7zip/Crypto/XChaCha20.cpp +++ b/CPP/7zip/Crypto/XChaCha20.cpp @@ -168,21 +168,21 @@ void XChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce, UInt6 x14 += GetUi32(nonce); x15 += GetUi32(nonce + 4); - SetUi32(output, x0) - SetUi32(output + 4, x1) - SetUi32(output + 8, x2) - SetUi32(output + 12, x3) - SetUi32(output + 16, x4) - SetUi32(output + 20, x5) - SetUi32(output + 24, x6) - SetUi32(output + 28, x7) - SetUi32(output + 32, x8) - SetUi32(output + 36, x9) - SetUi32(output + 40, x10) - SetUi32(output + 44, x11) - SetUi32(output + 48, x12) - SetUi32(output + 52, x13) - SetUi32(output + 56, x14) + SetUi32(output, x0); + SetUi32(output + 4, x1); + SetUi32(output + 8, x2); + SetUi32(output + 12, x3); + SetUi32(output + 16, x4); + SetUi32(output + 20, x5); + SetUi32(output + 24, x6); + SetUi32(output + 28, x7); + SetUi32(output + 32, x8); + SetUi32(output + 36, x9); + SetUi32(output + 40, x10); + SetUi32(output + 44, x11); + SetUi32(output + 48, x12); + SetUi32(output + 52, x13); + SetUi32(output + 56, x14); SetUi32(output + 60, x15); } diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp index 516b58089..84ef1d85c 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp @@ -21,11 +21,12 @@ #endif #if defined(MY_CPU_AMD64) -#if defined(_MSC_VER) -#include -#else -#include -#endif + #ifdef _MSC_VER + #include + #else + #include + #endif + #define Z7_POLY1305_128BIT #endif namespace NCrypto { @@ -49,6 +50,7 @@ void CPoly1305::Reset() memset(_r, 0, sizeof(_r)); memset(_s, 0, sizeof(_s)); memset(_h, 0, sizeof(_h)); + memset(_n, 0, sizeof(_n)); memset(_block, 0, sizeof(_block)); _blockPos = 0; _totalLen = 0; @@ -61,305 +63,79 @@ void CPoly1305::Reset() void CPoly1305::SetKey(const Byte *key) { Reset(); - memcpy(_r, key, 16); - _r[3] &= 15; - _r[7] &= 15; - _r[11] &= 15; - _r[15] &= 15; - _r[4] &= 252; - _r[8] &= 252; - _r[12] &= 252; - - memcpy(_s, key + 16, 16); -} - -#if 
defined(MY_CPU_AMD64) - #define Z7_POLY1305_128BIT -#endif - -#ifdef Z7_POLY1305_128BIT - -#if defined(MY_CPU_AMD64) && !defined(_MSC_VER) -/* GCC/Clang fallback for _umul128 */ -static inline UInt64 Z7_umul128(UInt64 a, UInt64 b, UInt64 *hi) -{ - unsigned __int128 p = (unsigned __int128)a * b; - *hi = (UInt64)(p >> 64); - return (UInt64)p; -} -#define _umul128 Z7_umul128 -/* GCC/Clang: _addcarry_u64 uses unsigned long long, cast UInt64* to match */ -#define Z7_ADDCARRY_U64(c, x, y, out) \ - _addcarry_u64((c), (unsigned long long)(x), (unsigned long long)(y), (unsigned long long *)(out)) -#elif defined(_MSC_VER) -#define Z7_ADDCARRY_U64(c, x, y, out) _addcarry_u64((c), (x), (y), (out)) -#endif - -static void Poly1305_ProcessBlock_128(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) -{ - UInt64 d0 = GetUi32(h); - UInt64 d1 = GetUi32(h + 4); - UInt64 d2 = GetUi32(h + 8); - UInt64 d3 = GetUi32(h + 12) & 0x3FFFFFF; - - UInt64 m0 = d0 & 0x3FFFFFF; - UInt64 m1 = (d0 >> 26) | ((d1 & 0xFFFFF) << 6); - UInt64 m2 = (d1 >> 20) | ((d2 & 0x3FFF) << 12); - UInt64 m3 = (d2 >> 14) | ((d3 & 0xFF) << 18); - UInt64 m4 = (d3 >> 8) & 0x3FFFF; - - UInt64 msg_lo = GetUi64(block); - UInt64 msg_hi = GetUi64(block + 8); - UInt64 r0 = GetUi64(r); - UInt64 r1 = GetUi64(r + 8); - -#if defined(MY_CPU_AMD64) - { - UInt64 hv0 = m0 | (m1 << 26) | ((m2 & 0xFFF) << 52); - UInt64 hv1 = (m2 >> 12) | (m3 << 14) | (m4 << 40); - UInt64 hv2 = 0; - - unsigned char c; - c = Z7_ADDCARRY_U64(0, hv0, msg_lo, &hv0); - c = Z7_ADDCARRY_U64(c, hv1, msg_hi, &hv1); - hv2 += c + (hasHighBit ? 
1 : 0); - - UInt64 d0_hi, d0_lo = _umul128(hv0, r0, &d0_hi); - UInt64 d1a_hi, d1a_lo = _umul128(hv0, r1, &d1a_hi); - UInt64 d1b_hi, d1b_lo = _umul128(hv1, r0, &d1b_hi); - UInt64 d2a_hi, d2a_lo = _umul128(hv1, r1, &d2a_hi); - UInt64 d2b_hi, d2b_lo = _umul128(hv2, r0, &d2b_hi); - UInt64 d3_hi, d3_lo = _umul128(hv2, r1, &d3_hi); - - UInt64 a0 = d0_lo, a1 = d0_hi, a2 = 0, a3 = 0; - c = Z7_ADDCARRY_U64(0, a1, d1a_lo, &a1); - c = Z7_ADDCARRY_U64(c, a2, d1a_hi, &a2); - c = Z7_ADDCARRY_U64(c, a3, 0, &a3); - c = Z7_ADDCARRY_U64(0, a1, d1b_lo, &a1); - c = Z7_ADDCARRY_U64(c, a2, d1b_hi, &a2); - c = Z7_ADDCARRY_U64(c, a3, 0, &a3); - c = Z7_ADDCARRY_U64(0, a2, d2a_lo, &a2); - c = Z7_ADDCARRY_U64(c, a3, d2a_hi, &a3); - c = Z7_ADDCARRY_U64(0, a2, d2b_lo, &a2); - c = Z7_ADDCARRY_U64(c, a3, d2b_hi, &a3); - UInt64 a4 = c; - c = Z7_ADDCARRY_U64(0, a3, d3_lo, &a3); - c = Z7_ADDCARRY_U64(c, a4, d3_hi, &a4); - - UInt64 hi[3]; - hi[0] = (a2 >> 2) | (a3 << 62); - hi[1] = (a3 >> 2) | (a4 << 62); - hi[2] = a4 >> 2; - - UInt64 h5_0_hi, h5_0 = _umul128(hi[0], 5, &h5_0_hi); - UInt64 h5_1_hi, h5_1 = _umul128(hi[1], 5, &h5_1_hi); - UInt64 h5_2 = hi[2] * 5; - - UInt64 lo0 = a0, lo1 = a1, lo2 = a2 & 3, lo3 = 0; - c = Z7_ADDCARRY_U64(0, lo0, h5_0, &lo0); - c = Z7_ADDCARRY_U64(c, lo1, h5_0_hi, &lo1); - c = Z7_ADDCARRY_U64(c, lo2, 0, &lo2); - c = Z7_ADDCARRY_U64(0, lo1, h5_1, &lo1); - c = Z7_ADDCARRY_U64(c, lo2, h5_1_hi, &lo2); - c = Z7_ADDCARRY_U64(c, lo3, 0, &lo3); - c = Z7_ADDCARRY_U64(0, lo2, h5_2, &lo2); - c = Z7_ADDCARRY_U64(c, lo3, 0, &lo3); - - UInt64 ov0 = lo2 >> 2; - lo2 &= 3; - lo3 = 0; - - UInt64 ov5_lo, ov5_hi; - ov5_lo = _umul128(ov0, 5, &ov5_hi); - c = Z7_ADDCARRY_U64(0, lo0, ov5_lo, &lo0); - c = Z7_ADDCARRY_U64(c, lo1, ov5_hi, &lo1); - c = Z7_ADDCARRY_U64(c, lo2, 0, &lo2); - - ov0 = lo2 >> 2; - if (ov0) - { - lo2 &= 3; - ov5_lo = _umul128(ov0, 5, &ov5_hi); - c = Z7_ADDCARRY_U64(0, lo0, ov5_lo, &lo0); - c = Z7_ADDCARRY_U64(c, lo1, ov5_hi, &lo1); - c = Z7_ADDCARRY_U64(c, lo2, 0, &lo2); 
- } - - UInt64 limb0 = lo0 & 0x3FFFFFF; - UInt64 limb1 = (lo0 >> 26) & 0x3FFFFFF; - UInt64 limb2 = ((lo0 >> 52) | ((lo1 & 0x3FFF) << 12)) & 0x3FFFFFF; - UInt64 limb3 = (lo1 >> 14) & 0x3FFFFFF; - UInt64 limb4 = ((lo1 >> 40) | ((UInt64)lo2 << 24)) & 0x3FFFFFF; - - SetUi32(h, (UInt32)(limb0 | (limb1 << 26))); - SetUi32(h + 4, (UInt32)((limb1 >> 6) | (limb2 << 20))); - SetUi32(h + 8, (UInt32)((limb2 >> 12) | (limb3 << 14))); - SetUi32(h + 12, (UInt32)((limb3 >> 18) | (limb4 << 8))); - } -#endif -} -#else -#define Poly1305_ProcessBlock_128 Poly1305_ProcessBlock_32 -#endif - -#ifndef Z7_POLY1305_128BIT - -static void Poly1305_ReduceAndPack(Byte h[16], UInt64 m[8]) -{ - UInt64 c; - - c = m[0] >> 26; m[0] &= 0x3FFFFFF; - m[1] += c; - c = m[1] >> 26; m[1] &= 0x3FFFFFF; - m[2] += c; c = m[2] >> 26; m[2] &= 0x3FFFFFF; - m[3] += c; c = m[3] >> 26; m[3] &= 0x3FFFFFF; - m[4] += c; c = m[4] >> 26; m[4] &= 0x3FFFFFF; - m[5] += c; c = m[5] >> 26; m[5] &= 0x3FFFFFF; - m[6] += c; c = m[6] >> 26; m[6] &= 0x3FFFFFF; - m[7] += c; - - c = (m[3] >> 26); m[3] &= 0x3FFFFFF; - m[4] += c; - - m[0] += (m[4] >> 26) * 5; m[4] &= 0x3FFFFFF; - m[1] += (m[5] >> 26) * 5; m[5] &= 0x3FFFFFF; - m[2] += (m[6] >> 26) * 5; m[6] &= 0x3FFFFFF; - m[3] += (m[7] >> 26) * 5; m[7] &= 0x3FFFFFF; - - c = m[0] >> 26; m[0] &= 0x3FFFFFF; - m[1] += c; - c = m[1] >> 26; m[1] &= 0x3FFFFFF; - m[2] += c; c = m[2] >> 26; m[2] &= 0x3FFFFFF; - m[3] += c; c = m[3] >> 26; m[3] &= 0x3FFFFFF; - - m[0] += (m[3] >> 26) * 5; m[3] &= 0x3FFFFFF; - - c = m[0] >> 26; m[0] &= 0x3FFFFFF; - m[1] += c; - - SetUi32(h, (UInt32)((m[0]) | (m[1] << 26))); - SetUi32(h + 4, (UInt32)((m[1] >> 6) | (m[2] << 20))); - SetUi32(h + 8, (UInt32)((m[2] >> 12) | (m[3] << 14))); - SetUi32(h + 12, (UInt32)((m[3] >> 18) | (m[4] << 8))); -} -#if defined(MY_CPU_X86_OR_AMD64) && defined(MY_CPU_SSE2) + _r[0] = GetUi32(key) & 0x0fffffff; + _r[1] = GetUi32(key + 4) & 0x0ffffffc; + _r[2] = GetUi32(key + 8) & 0x0ffffffc; + _r[3] = GetUi32(key + 12) & 0x0ffffffc; 
-#include + _s[1] = _r[1] + (_r[1] >> 2); + _s[2] = _r[2] + (_r[2] >> 2); + _s[3] = _r[3] + (_r[3] >> 2); -static void Poly1305_ProcessBlock_SSE2_4Way(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) -{ - UInt64 d[4]; - - for (unsigned i = 0; i < 3; i++) - d[i] = (UInt64)GetUi32(h + i * 4); - d[3] = ((UInt64)GetUi32(h + 12)) & 0x3FFFFFF; - - for (unsigned i = 0; i < 3; i++) - d[i] += GetUi32(block + i * 4); - d[3] += ((UInt64)GetUi32(block + 12)) & 0x3FFFFFF; - - if (hasHighBit) - d[3] |= 0x1000000; - - UInt64 rr[4]; - rr[0] = GetUi32(r) & 0x3FFFFFF; - rr[1] = ((UInt64)GetUi32(r + 3) >> 2) & 0x3FFFF03; - rr[2] = ((UInt64)GetUi32(r + 6) >> 4) & 0x3FFC0FF; - rr[3] = ((UInt64)GetUi32(r + 9) >> 6) & 0x3F03FFF; - - __m128i d_vec = _mm_set_epi32((int)(UInt32)d[3], (int)(UInt32)d[2], - (int)(UInt32)d[1], (int)(UInt32)d[0]); - __m128i d_swap = _mm_shuffle_epi32(d_vec, _MM_SHUFFLE(0, 3, 0, 1)); - - __m128i r_even = _mm_set_epi32(0, (int)(UInt32)rr[2], 0, (int)(UInt32)rr[0]); - __m128i r_odd = _mm_set_epi32(0, (int)(UInt32)rr[3], 0, (int)(UInt32)rr[1]); - __m128i r_cross1 = _mm_set_epi32(0, (int)(UInt32)rr[0], 0, (int)(UInt32)rr[2]); - __m128i r_cross2 = _mm_set_epi32(0, (int)(UInt32)rr[1], 0, (int)(UInt32)rr[3]); - - UInt64 m[8] = { 0 }; - __m128i prod; - UInt64 pLo, pHi; - - #define POLY1305_SSE2_MUL_ACC(d_op, r_op, off_lo, off_hi) \ - prod = _mm_mul_epu32(d_op, r_op); \ - _mm_storel_epi64((__m128i *)&pLo, prod); \ - _mm_storel_epi64((__m128i *)&pHi, _mm_srli_si128(prod, 8)); \ - m[off_lo] += pLo; \ - m[off_hi] += pHi; - - POLY1305_SSE2_MUL_ACC(d_vec, r_even, 0, 4) - POLY1305_SSE2_MUL_ACC(d_vec, r_odd, 1, 5) - POLY1305_SSE2_MUL_ACC(d_swap, r_even, 1, 5) - POLY1305_SSE2_MUL_ACC(d_swap, r_odd, 2, 6) - POLY1305_SSE2_MUL_ACC(d_vec, r_cross1, 2, 2) - POLY1305_SSE2_MUL_ACC(d_vec, r_cross2, 3, 3) - POLY1305_SSE2_MUL_ACC(d_swap, r_cross1, 3, 3) - POLY1305_SSE2_MUL_ACC(d_swap, r_cross2, 4, 4) - - #undef POLY1305_SSE2_MUL_ACC - - Poly1305_ReduceAndPack(h, m); + 
_n[0] = GetUi32(key + 16); + _n[1] = GetUi32(key + 20); + _n[2] = GetUi32(key + 24); + _n[3] = GetUi32(key + 28); } -#endif - -#if !defined(MY_CPU_X86_OR_AMD64) || !defined(MY_CPU_SSE2) -static void Poly1305_ProcessBlock_32(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) +static inline UInt32 CONSTANT_TIME_CARRY(UInt32 a, UInt32 b) { - UInt64 d[8] = { 0 }; - - for (unsigned i = 0; i < 3; i++) - { - d[i] = (UInt64)GetUi32(h + i * 4); - } - d[3] = ((UInt64)GetUi32(h + 12)) & 0x3FFFFFF; - - for (unsigned i = 0; i < 3; i++) - { - UInt64 t = GetUi32(block + i * 4); - d[i] += t; - } - d[3] += ((UInt64)GetUi32(block + 12)) & 0x3FFFFFF; - - if (hasHighBit) - d[3] |= 0x1000000; - - UInt64 rr[4]; - rr[0] = GetUi32(r) & 0x3FFFFFF; - rr[1] = ((UInt64)GetUi32(r + 3) >> 2) & 0x3FFFF03; - rr[2] = ((UInt64)GetUi32(r + 6) >> 4) & 0x3FFC0FF; - rr[3] = ((UInt64)GetUi32(r + 9) >> 6) & 0x3F03FFF; - - UInt64 m[8] = { 0 }; - for (unsigned i = 0; i < 4; i++) - { - for (unsigned j = 0; j < 4; j++) - { - m[i + j] += d[i] * rr[j]; - } - } - - Poly1305_ReduceAndPack(h, m); + return ((a ^ ((a ^ b) | ((a - b) ^ b))) >> 31); } -#endif -#endif -#ifdef Z7_POLY1305_128BIT -static void Poly1305_ProcessBlock(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) +static void Poly1305_ProcessBlock(UInt32 h[5], const UInt32 r[4], const UInt32 s[4], const Byte block[16], bool hasHighBit) { - Poly1305_ProcessBlock_128(h, r, block, hasHighBit); + UInt32 r0 = r[0], r1 = r[1], r2 = r[2], r3 = r[3]; + UInt32 s1 = s[1], s2 = s[2], s3 = s[3]; + UInt32 h0 = h[0], h1 = h[1], h2 = h[2], h3 = h[3], h4 = h[4]; + UInt64 d0, d1, d2, d3; + UInt32 c; + + h0 = (UInt32)(d0 = (UInt64)h0 + GetUi32(block)); + h1 = (UInt32)(d1 = (UInt64)h1 + (d0 >> 32) + GetUi32(block + 4)); + h2 = (UInt32)(d2 = (UInt64)h2 + (d1 >> 32) + GetUi32(block + 8)); + h3 = (UInt32)(d3 = (UInt64)h3 + (d2 >> 32) + GetUi32(block + 12)); + h4 += (UInt32)(d3 >> 32) + (hasHighBit ? 
1 : 0); + + d0 = ((UInt64)h0 * r0) + + ((UInt64)h1 * s3) + + ((UInt64)h2 * s2) + + ((UInt64)h3 * s1); + d1 = ((UInt64)h0 * r1) + + ((UInt64)h1 * r0) + + ((UInt64)h2 * s3) + + ((UInt64)h3 * s2) + + ((UInt64)h4 * s1); + d2 = ((UInt64)h0 * r2) + + ((UInt64)h1 * r1) + + ((UInt64)h2 * r0) + + ((UInt64)h3 * s3) + + ((UInt64)h4 * s2); + d3 = ((UInt64)h0 * r3) + + ((UInt64)h1 * r2) + + ((UInt64)h2 * r1) + + ((UInt64)h3 * r0) + + ((UInt64)h4 * s3); + h4 = (UInt32)(h4 * r0); + + h0 = (UInt32)d0; + h1 = (UInt32)(d1 += d0 >> 32); + h2 = (UInt32)(d2 += d1 >> 32); + h3 = (UInt32)(d3 += d2 >> 32); + h4 += (UInt32)(d3 >> 32); + + c = (h4 >> 2) + (h4 & ~3U); + h4 &= 3; + h0 += c; + h1 += (c = CONSTANT_TIME_CARRY(h0, c)); + h2 += (c = CONSTANT_TIME_CARRY(h1, c)); + h3 += (c = CONSTANT_TIME_CARRY(h2, c)); + h4 += CONSTANT_TIME_CARRY(h3, c); + + h[0] = h0; h[1] = h1; h[2] = h2; + h[3] = h3; h[4] = h4; } -#else -static void Poly1305_ProcessBlock(Byte h[16], const Byte r[16], const Byte block[16], bool hasHighBit) -{ -#if defined(MY_CPU_X86_OR_AMD64) && defined(MY_CPU_SSE2) - Poly1305_ProcessBlock_SSE2_4Way(h, r, block, hasHighBit); -#else - Poly1305_ProcessBlock_32(h, r, block, hasHighBit); -#endif -} -#endif void CPoly1305::ProcessBlocks(Byte *buf, unsigned &bufPos, UInt64 &len, const Byte *data, UInt32 size) { @@ -375,14 +151,14 @@ void CPoly1305::ProcessBlocks(Byte *buf, unsigned &bufPos, UInt64 &len, const By size -= n; if (bufPos == 16) { - Poly1305_ProcessBlock(_h, _r, buf, true); + Poly1305_ProcessBlock(_h, _r, _s, buf, true); bufPos = 0; } } while (size >= 16) { - Poly1305_ProcessBlock(_h, _r, data, true); + Poly1305_ProcessBlock(_h, _r, _s, data, true); data += 16; size -= 16; } @@ -406,14 +182,13 @@ void CPoly1305::UpdateAad(const Byte *data, UInt32 size) ProcessBlocks(_aadBlock, _aadBlockPos, _aadLen, data, size); } -void CPoly1305::PadAndProcessBlock(Byte *buf, unsigned bufPos, UInt64 len) +void CPoly1305::PadAndProcessBlock(Byte *buf, unsigned bufPos) { - unsigned mod = 
(unsigned)(len & 0xF); - if (mod != 0) + if (bufPos != 0) { - unsigned padLen = 16 - mod; - memset(buf + bufPos, 0, padLen); - Poly1305_ProcessBlock(_h, _r, buf, true); + buf[bufPos] = 1; + memset(buf + bufPos + 1, 0, 16 - bufPos - 1); + Poly1305_ProcessBlock(_h, _r, _s, buf, false); } } @@ -423,8 +198,8 @@ void CPoly1305::Final(Byte *tag) return; _finalized = true; - PadAndProcessBlock(_aadBlock, _aadBlockPos, _aadLen); - PadAndProcessBlock(_block, _blockPos, _totalLen); + PadAndProcessBlock(_aadBlock, _aadBlockPos); + PadAndProcessBlock(_block, _blockPos); { Byte lenBlock[16]; @@ -432,42 +207,38 @@ void CPoly1305::Final(Byte *tag) lenBlock[i] = (Byte)(_aadLen >> (i * 8)); for (unsigned i = 0; i < 8; i++) lenBlock[8 + i] = (Byte)(_totalLen >> (i * 8)); - Poly1305_ProcessBlock(_h, _r, lenBlock, true); + Poly1305_ProcessBlock(_h, _r, _s, lenBlock, true); } - UInt64 h0 = (UInt64)GetUi32(_h); - UInt64 h1 = (UInt64)GetUi32(_h + 4); - UInt64 h2 = (UInt64)GetUi32(_h + 8); - UInt64 h3 = (UInt64)GetUi32(_h + 12) & 0x3FFFFFF; - - UInt64 s0 = (UInt64)GetUi32(_s); - UInt64 s1 = (UInt64)GetUi32(_s + 4); - UInt64 s2 = (UInt64)GetUi32(_s + 8); - UInt64 s3 = (UInt64)GetUi32(_s + 12); - - h0 += s0; - UInt64 c = h0 >> 26; h0 &= 0x3FFFFFF; - h1 += s1 + c; c = h1 >> 26; h1 &= 0x3FFFFFF; - h2 += s2 + c; c = h2 >> 26; h2 &= 0x3FFFFFF; - h3 += s3 + c; - - UInt64 g0, g1, g2, g3; - g0 = h0 + 5; - c = g0 >> 26; g0 &= 0x3FFFFFF; - g1 = h1 + c; c = g1 >> 26; g1 &= 0x3FFFFFF; - g2 = h2 + c; c = g2 >> 26; g2 &= 0x3FFFFFF; - g3 = h3 + c - 4; - - UInt64 mask = (g3 >> 63) - 1; - h0 = (h0 & ~mask) | (g0 & mask); - h1 = (h1 & ~mask) | (g1 & mask); - h2 = (h2 & ~mask) | (g2 & mask); - h3 = (h3 & ~mask) | (g3 & mask); - - SetUi32(tag, (UInt32)(h0 | (h1 << 26))); - SetUi32(tag + 4, (UInt32)((h1 >> 6) | (h2 << 20))); - SetUi32(tag + 8, (UInt32)((h2 >> 12) | (h3 << 14))); - SetUi32(tag + 12, (UInt32)(h3 >> 18)); + UInt32 h0 = _h[0], h1 = _h[1], h2 = _h[2], h3 = _h[3], h4 = _h[4]; + UInt32 g0, g1, g2, 
g3, g4; + UInt32 mask; + UInt64 t; + + g0 = (UInt32)(t = (UInt64)h0 + 5); + g1 = (UInt32)(t = (UInt64)h1 + (t >> 32)); + g2 = (UInt32)(t = (UInt64)h2 + (t >> 32)); + g3 = (UInt32)(t = (UInt64)h3 + (t >> 32)); + g4 = h4 + (UInt32)(t >> 32); + + mask = 0 - (g4 >> 2); + g0 &= mask; g1 &= mask; + g2 &= mask; g3 &= mask; + mask = ~mask; + h0 = (h0 & mask) | g0; + h1 = (h1 & mask) | g1; + h2 = (h2 & mask) | g2; + h3 = (h3 & mask) | g3; + + h0 = (UInt32)(t = (UInt64)h0 + _n[0]); + h1 = (UInt32)(t = (UInt64)h1 + (t >> 32) + _n[1]); + h2 = (UInt32)(t = (UInt64)h2 + (t >> 32) + _n[2]); + h3 = (UInt32)(t = (UInt64)h3 + (t >> 32) + _n[3]); + + SetUi32(tag, h0); + SetUi32(tag + 4, h1); + SetUi32(tag + 8, h2); + SetUi32(tag + 12, h3); } void CBaseCoder::DeriveKey() @@ -686,17 +457,10 @@ Z7_COM7F_IMF(CDecoder::CryptoAuthVerify(Int32 *result)) _poly1305.Final(computedTag); { -#if defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable: 4746) -#endif volatile Byte diff = 0; for (unsigned i = 0; i < kTagSize; i++) diff |= computedTag[i] ^ _expectedTag[i]; _authResult = (diff == 0) ? 
0 : 1; -#if defined(_MSC_VER) -#pragma warning(pop) -#endif } *result = _authResult; diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.h b/CPP/7zip/Crypto/XChaCha20Poly1305.h index 058cac155..f37aa12af 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.h +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.h @@ -23,9 +23,10 @@ const unsigned kPolyKeySize = 32; class CPoly1305 { - Byte _r[16]; - Byte _s[16]; - Byte _h[16]; + UInt32 _r[4]; + UInt32 _s[4]; + UInt32 _h[5]; + UInt32 _n[4]; Byte _block[16]; unsigned _blockPos; UInt64 _totalLen; @@ -34,7 +35,7 @@ class CPoly1305 unsigned _aadBlockPos; UInt64 _aadLen; - void PadAndProcessBlock(Byte *buf, unsigned bufPos, UInt64 len); + void PadAndProcessBlock(Byte *buf, unsigned bufPos); void ProcessBlocks(Byte *buf, unsigned &bufPos, UInt64 &len, const Byte *data, UInt32 size); public: CPoly1305(); From ac39e962bb297654c1f69efd31baa023229e5446 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Thu, 18 Jun 2026 12:26:19 +0800 Subject: [PATCH 15/18] Fix HKDF-BLAKE2sp --- CPP/7zip/Crypto/HkdfBlake2sp.cpp | 100 +++++++++++++++++++------------ 1 file changed, 61 insertions(+), 39 deletions(-) diff --git a/CPP/7zip/Crypto/HkdfBlake2sp.cpp b/CPP/7zip/Crypto/HkdfBlake2sp.cpp index ecbf11b7d..0613f9f93 100644 --- a/CPP/7zip/Crypto/HkdfBlake2sp.cpp +++ b/CPP/7zip/Crypto/HkdfBlake2sp.cpp @@ -11,10 +11,14 @@ namespace NHkdfBlake2sp { #define BLAKE2SP_BLOCK_SIZE 64 -// HMAC-BLAKE2sp -static void HmacBlake2sp(const Byte *key, unsigned keySize, - const Byte *message, unsigned messageSize, - Byte *mac) +static void CloneBlake2spState(CBlake2sp *dest, const CBlake2sp *src) +{ + memcpy(dest, src, sizeof(CBlake2sp)); +} + +void Derive(const Byte *prk, unsigned prkSize, + const char *info, unsigned infoLen, + Byte *output, unsigned outSize) { static bool blake2spPrepared = false; if (!blake2spPrepared) @@ -23,48 +27,50 @@ static void HmacBlake2sp(const Byte *key, unsigned keySize, blake2spPrepared = true; } + Byte 
processedKey[Z7_BLAKE2S_DIGEST_SIZE]; + const Byte *effectiveKey; + unsigned effectiveKeySize; + + if (prkSize > BLAKE2SP_BLOCK_SIZE) + { + CAlignedBuffer1 bufHash(sizeof(CBlake2sp)); + CBlake2sp *blake2spHash = (CBlake2sp *)(void *)(Byte *)bufHash; + Blake2sp_Init(blake2spHash); + Blake2sp_SetFunction(blake2spHash, 0); + Blake2sp_Update(blake2spHash, prk, prkSize); + Blake2sp_Final(blake2spHash, processedKey); + + effectiveKey = processedKey; + effectiveKeySize = Z7_BLAKE2S_DIGEST_SIZE; + } + else + { + effectiveKey = prk; + effectiveKeySize = prkSize; + } + Byte ipad[BLAKE2SP_BLOCK_SIZE]; Byte opad[BLAKE2SP_BLOCK_SIZE]; - memset(ipad, 0x36, BLAKE2SP_BLOCK_SIZE); memset(opad, 0x5c, BLAKE2SP_BLOCK_SIZE); - - for (unsigned i = 0; i < keySize && i < BLAKE2SP_BLOCK_SIZE; i++) + for (unsigned i = 0; i < effectiveKeySize; i++) { - ipad[i] ^= key[i]; - opad[i] ^= key[i]; + ipad[i] ^= effectiveKey[i]; + opad[i] ^= effectiveKey[i]; } - // Inner hash - CAlignedBuffer1 bufInner(sizeof(CBlake2sp)); - CBlake2sp *blake2spInner = (CBlake2sp *)(void *)(Byte *)bufInner; - Blake2sp_Init(blake2spInner); - Blake2sp_SetFunction(blake2spInner, 0); - Blake2sp_Update(blake2spInner, ipad, BLAKE2SP_BLOCK_SIZE); - Blake2sp_Update(blake2spInner, message, messageSize); - - Byte innerHash[Z7_BLAKE2S_DIGEST_SIZE]; - Blake2sp_Final(blake2spInner, innerHash); - - // Outer hash - CAlignedBuffer1 bufOuter(sizeof(CBlake2sp)); - CBlake2sp *blake2spOuter = (CBlake2sp *)(void *)(Byte *)bufOuter; - Blake2sp_Init(blake2spOuter); - Blake2sp_SetFunction(blake2spOuter, 0); - Blake2sp_Update(blake2spOuter, opad, BLAKE2SP_BLOCK_SIZE); - Blake2sp_Update(blake2spOuter, innerHash, Z7_BLAKE2S_DIGEST_SIZE); - Blake2sp_Final(blake2spOuter, mac); + CAlignedBuffer1 bufInnerState(sizeof(CBlake2sp)); + CBlake2sp *innerState = (CBlake2sp *)(void *)(Byte *)bufInnerState; + Blake2sp_Init(innerState); + Blake2sp_SetFunction(innerState, 0); + Blake2sp_Update(innerState, ipad, BLAKE2SP_BLOCK_SIZE); - Z7_memset_0_ARRAY(ipad); 
- Z7_memset_0_ARRAY(opad); - Z7_memset_0_ARRAY(innerHash); -} + CAlignedBuffer1 bufOuterState(sizeof(CBlake2sp)); + CBlake2sp *outerState = (CBlake2sp *)(void *)(Byte *)bufOuterState; + Blake2sp_Init(outerState); + Blake2sp_SetFunction(outerState, 0); + Blake2sp_Update(outerState, opad, BLAKE2SP_BLOCK_SIZE); -// HKDF-Expand (RFC 5869) -void Derive(const Byte *prk, unsigned prkSize, - const char *info, unsigned infoLen, - Byte *output, unsigned outSize) -{ const unsigned n = (outSize + Z7_BLAKE2S_DIGEST_SIZE - 1) / Z7_BLAKE2S_DIGEST_SIZE; Byte prevT[Z7_BLAKE2S_DIGEST_SIZE]; @@ -73,6 +79,11 @@ void Derive(const Byte *prk, unsigned prkSize, Byte *outPtr = output; unsigned remaining = outSize; + CAlignedBuffer1 bufInnerTmp(sizeof(CBlake2sp)); + CAlignedBuffer1 bufOuterTmp(sizeof(CBlake2sp)); + CBlake2sp *innerTmp = (CBlake2sp *)(void *)(Byte *)bufInnerTmp; + CBlake2sp *outerTmp = (CBlake2sp *)(void *)(Byte *)bufOuterTmp; + for (unsigned i = 1; i <= n; i++) { Byte message[Z7_BLAKE2S_DIGEST_SIZE + 256 + 1]; @@ -92,9 +103,16 @@ void Derive(const Byte *prk, unsigned prkSize, message[messageSize] = (Byte)i; messageSize += 1; + CloneBlake2spState(innerTmp, innerState); + Blake2sp_Update(innerTmp, message, messageSize); + + Byte innerHash[Z7_BLAKE2S_DIGEST_SIZE]; + Blake2sp_Final(innerTmp, innerHash); + CloneBlake2spState(outerTmp, outerState); + Blake2sp_Update(outerTmp, innerHash, Z7_BLAKE2S_DIGEST_SIZE); Byte ti[Z7_BLAKE2S_DIGEST_SIZE]; - HmacBlake2sp(prk, prkSize, message, messageSize, ti); + Blake2sp_Final(outerTmp, ti); const unsigned copySize = remaining < Z7_BLAKE2S_DIGEST_SIZE ? 
remaining : Z7_BLAKE2S_DIGEST_SIZE; memcpy(outPtr, ti, copySize); @@ -106,9 +124,13 @@ void Derive(const Byte *prk, unsigned prkSize, Z7_memset_0_ARRAY(ti); Z7_memset_0_ARRAY(message); + Z7_memset_0_ARRAY(innerHash); } Z7_memset_0_ARRAY(prevT); + Z7_memset_0_ARRAY(ipad); + Z7_memset_0_ARRAY(opad); + Z7_memset_0_ARRAY(processedKey); } -}} +}} \ No newline at end of file From b52ada9c2a576fb0dace870ac162b2a99296af37 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Sun, 21 Jun 2026 17:21:25 +0800 Subject: [PATCH 16/18] Fixed tag validation bug --- CPP/7zip/Common/FilterCoder.cpp | 3 +++ CPP/7zip/Common/FilterCoder.h | 4 ++++ CPP/7zip/Crypto/XChaCha20Poly1305.cpp | 19 ++++++++++++++----- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/CPP/7zip/Common/FilterCoder.cpp b/CPP/7zip/Common/FilterCoder.cpp index 8d7e0dccf..0b5d4204e 100644 --- a/CPP/7zip/Common/FilterCoder.cpp +++ b/CPP/7zip/Common/FilterCoder.cpp @@ -546,6 +546,9 @@ Z7_COM7F_IMF(CFilterCoder::SetKey(const Byte *data, UInt32 size)) Z7_COM7F_IMF(CFilterCoder::SetInitVector(const Byte *data, UInt32 size)) { return _cryptoProperties->SetInitVector(data, size); } +Z7_COM7F_IMF(CFilterCoder::CryptoAuthVerify(Int32 *result)) + { return _cryptoAuthVerify->CryptoAuthVerify(result); } + #endif diff --git a/CPP/7zip/Common/FilterCoder.h b/CPP/7zip/Common/FilterCoder.h index 3a588fd52..2e3bf9daa 100644 --- a/CPP/7zip/Common/FilterCoder.h +++ b/CPP/7zip/Common/FilterCoder.h @@ -45,6 +45,7 @@ class CFilterCoder Z7_final : #ifndef Z7_NO_CRYPTO public ICryptoSetPassword, public ICryptoProperties, + public ICryptoAuthVerify, #endif #ifndef Z7_EXTRACT_ONLY @@ -92,6 +93,7 @@ class CFilterCoder Z7_final : #ifndef Z7_NO_CRYPTO CMyComPtr _setPassword; CMyComPtr _cryptoProperties; + CMyComPtr _cryptoAuthVerify; #endif #ifndef Z7_EXTRACT_ONLY @@ -148,6 +150,7 @@ class CFilterCoder Z7_final : #ifndef Z7_NO_CRYPTO Z7_COM_QI_ENTRY_AG(ICryptoSetPassword, Filter, _setPassword) 
Z7_COM_QI_ENTRY_AG(ICryptoProperties, Filter, _cryptoProperties) + Z7_COM_QI_ENTRY_AG(ICryptoAuthVerify, Filter, _cryptoAuthVerify) #endif #ifndef Z7_EXTRACT_ONLY @@ -182,6 +185,7 @@ class CFilterCoder Z7_final : #ifndef Z7_NO_CRYPTO Z7_IFACE_COM7_IMP(ICryptoSetPassword) Z7_IFACE_COM7_IMP(ICryptoProperties) + Z7_IFACE_COM7_IMP(ICryptoAuthVerify) #endif #ifndef Z7_EXTRACT_ONLY diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp index 84ef1d85c..f7bd68520 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp @@ -186,9 +186,9 @@ void CPoly1305::PadAndProcessBlock(Byte *buf, unsigned bufPos) { if (bufPos != 0) { - buf[bufPos] = 1; - memset(buf + bufPos + 1, 0, 16 - bufPos - 1); - Poly1305_ProcessBlock(_h, _r, _s, buf, false); + // RFC 8439: pad16 fills with 0x00 up to the 16-byte boundary; the result is processed as a full block (2^128 is added) + memset(buf + bufPos, 0, 16 - bufPos); + Poly1305_ProcessBlock(_h, _r, _s, buf, true); } } @@ -348,8 +348,17 @@ Z7_COM7F_IMF(CEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) if (!_tagReady) { - _poly1305.Final(_computedTag); - _tagReady = true; + if (_derivedKeyValid) + { + _poly1305.Final(_computedTag); + _tagReady = true; + } + else + { + // Encoding has not started yet (first call from FillProps_from_Coder): write an empty tag + // FillProps_from_Coder calls again after encoding completes; the real tag is computed then + memset(_computedTag, 0, kTagSize); + } } memcpy(props + propsSize, _computedTag, kTagSize); From b4dc54e231ee8234e0e19ab7191581e46dee28a6 Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Mon, 22 Jun 2026 11:56:48 +0800 Subject: [PATCH 17/18] Fixed the tag issue for empty file encryption and the hardware acceleration issue for cascade algorithms. 
--- CPP/7zip/Crypto/Cascade.cpp | 96 ++++++++++-------- CPP/7zip/Crypto/Cascade.h | 15 ++- CPP/7zip/Crypto/XChaCha20.cpp | 140 ++++++++++++++++---------- CPP/7zip/Crypto/XChaCha20.h | 2 + CPP/7zip/Crypto/XChaCha20Poly1305.cpp | 10 +- CPP/7zip/Crypto/XChaCha20Poly1305.h | 1 + 6 files changed, 166 insertions(+), 98 deletions(-) diff --git a/CPP/7zip/Crypto/Cascade.cpp b/CPP/7zip/Crypto/Cascade.cpp index 26e9764b6..a90a701fd 100644 --- a/CPP/7zip/Crypto/Cascade.cpp +++ b/CPP/7zip/Crypto/Cascade.cpp @@ -26,6 +26,32 @@ namespace NCrypto { +static AES_CODE_FUNC s_AesCtrFunc = NULL; + +static void InitAesCtrFunc() +{ + if (s_AesCtrFunc) + return; + + AES_CODE_FUNC func = AesCtr_Code; + +#if defined(MY_CPU_X86_OR_AMD64) + if (CPU_IsSupported_AES()) + { + func = AesCtr_Code_HW; + if (CPU_IsSupported_VAES_AVX2()) + func = AesCtr_Code_HW_256; + } +#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) + if (CPU_IsSupported_AES()) + { + func = AesCtr_Code_HW; + } +#endif + + s_AesCtrFunc = func; +} + static void XorBytes(Byte *dst, const Byte *src, unsigned len) { Byte *d = dst; @@ -69,6 +95,7 @@ static CKeyInfoCache g_AXP_GlobalKeyCache(32); CAXPBase::CAXPBase(): _cachedKeys(16), _keyDerived(false), + _aesKeys(AES_NUM_IVMRK_WORDS * sizeof(UInt32)), _xcBlockPos(64), _xcCounter(0), _aadSize(0), @@ -78,7 +105,7 @@ CAXPBase::CAXPBase(): _key.DerivMode = N7zKeyDerivation::kDeriv_Cascade; Z7_memset_0_ARRAY(_keyAes); Z7_memset_0_ARRAY(_aesIv); - Z7_memset_0_ARRAY(_aesKeys); + memset(_aesKeys, 0, AES_NUM_IVMRK_WORDS * sizeof(UInt32)); Z7_memset_0_ARRAY(_keyXChaCha20); Z7_memset_0_ARRAY(_xcNonce); Z7_memset_0_ARRAY(_xcDerivedKey); @@ -109,8 +136,8 @@ void CAXPBase::DeriveAXPKeys() _key.CascadeKey, kCascadeKeySize, "AES-key", 7, _keyAes, 32); - Aes_SetKey_Enc(_aesKeys + 4, _keyAes, 32); - memcpy(_aesKeys, _aesIv, 16); + Aes_SetKey_Enc(AesKeys() + 4, _keyAes, 32); + memcpy(AesKeys(), _aesIv, 16); NHkdfBlake2sp::Derive( _key.CascadeKey, kCascadeKeySize, @@ -140,10 +167,11 @@ void 
CAXPBase::ComputePolyKey() void CAXPBase::AesCtrXorData(Byte *data, UInt32 size) { + InitAesCtrFunc(); if (size >= AES_BLOCK_SIZE) { UInt32 numBlocks = size >> 4; - AesCtr_Code(_aesKeys, data, numBlocks); + s_AesCtrFunc(AesKeys(), data, numBlocks); data += numBlocks << 4; size -= numBlocks << 4; } @@ -151,7 +179,7 @@ void CAXPBase::AesCtrXorData(Byte *data, UInt32 size) { Byte temp[16]; memset(temp, 0, 16); - AesCtr_Code(_aesKeys, temp, 1); + s_AesCtrFunc(AesKeys(), temp, 1); for (UInt32 i = 0; i < size; i++) data[i] ^= temp[i]; Z7_memset_0_ARRAY(temp); @@ -160,21 +188,7 @@ void CAXPBase::AesCtrXorData(Byte *data, UInt32 size) void CAXPBase::XChaCha20XorData(Byte *data, UInt32 size) { - while (size > 0) - { - if (_xcBlockPos >= kXcBlockSize) - { - NXChaCha20::XChaCha20Block_Core(_xcBlock, _xcDerivedKey, _xcNonce + 16, _xcCounter); - _xcBlockPos = 0; - _xcCounter++; - } - UInt32 avail = kXcBlockSize - _xcBlockPos; - UInt32 toProcess = (size < avail) ? size : avail; - XorBytes(data, _xcBlock + _xcBlockPos, toProcess); - data += toProcess; - size -= toProcess; - _xcBlockPos += toProcess; - } + NXChaCha20::XChaCha20ProcessData(data, size, _xcDerivedKey, _xcNonce, _xcCounter, _xcBlock, _xcBlockPos); } void CAXPBaseCoder::ProcessEnc(Byte *data, UInt32 size) @@ -240,6 +254,7 @@ CAXPEncoder::CAXPEncoder() _xcBlockPos = 64; _xcCounter = 1; _tagReady = false; + _propsWritten = false; memset(_computedTag, 0, kTagSize); Z7_memset_0_ARRAY(_aesIv); Z7_memset_0_ARRAY(_xcNonce); @@ -263,6 +278,7 @@ Z7_COM7F_IMF(CAXPEncoder::ResetInitVector()) _xcCounter = 1; _poly1305.Reset(); _tagReady = false; + _propsWritten = false; memset(_computedTag, 0, kTagSize); _aadSize = 1; @@ -309,6 +325,8 @@ Z7_COM7F_IMF(CAXPEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) if (!_tagReady) { + if (!_keyDerived && _propsWritten) + DeriveAXPKeys(); if (_finalized) { _tagReady = true; @@ -324,6 +342,7 @@ Z7_COM7F_IMF(CAXPEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) 
memset(_computedTag, 0, kTagSize); } } + _propsWritten = true; memcpy(props + propsSize, _computedTag, kTagSize); propsSize += kTagSize; @@ -500,6 +519,7 @@ static CKeyInfoCache g_GlobalKeyCache(32); CBase::CBase(): _cachedKeys(16), _keyDerived(false), + _aesKeys(AES_NUM_IVMRK_WORDS * sizeof(UInt32)), _xcBlockPos(64), _xcCounter(0) { @@ -509,7 +529,7 @@ CBase::CBase(): Z7_memset_0_ARRAY(_keyAscon); Z7_memset_0_ARRAY(_keyAes); Z7_memset_0_ARRAY(_aesIv); - Z7_memset_0_ARRAY(_aesKeys); + memset(_aesKeys, 0, AES_NUM_IVMRK_WORDS * sizeof(UInt32)); Z7_memset_0_ARRAY(_keyXChaCha20); Z7_memset_0_ARRAY(_xcNonce); Z7_memset_0_ARRAY(_xcDerivedKey); @@ -540,8 +560,8 @@ void CBase::DeriveCascadeKeys() _key.CascadeKey, kCascadeKeySize, "AES-key", 7, _keyAes, 32); - Aes_SetKey_Enc(_aesKeys + 4, _keyAes, 32); - memcpy(_aesKeys, _aesIv, 16); + Aes_SetKey_Enc(AesKeys() + 4, _keyAes, 32); + memcpy(AesKeys(), _aesIv, 16); NHkdfBlake2sp::Derive( _key.CascadeKey, kCascadeKeySize, @@ -562,10 +582,11 @@ void CBase::DeriveCascadeKeys() void CBase::AesCtrXorData(Byte *data, UInt32 size) { + InitAesCtrFunc(); if (size >= AES_BLOCK_SIZE) { UInt32 numBlocks = size >> 4; - AesCtr_Code(_aesKeys, data, numBlocks); + s_AesCtrFunc(AesKeys(), data, numBlocks); data += numBlocks << 4; size -= numBlocks << 4; } @@ -573,7 +594,7 @@ void CBase::AesCtrXorData(Byte *data, UInt32 size) { Byte temp[16]; memset(temp, 0, 16); - AesCtr_Code(_aesKeys, temp, 1); + s_AesCtrFunc(AesKeys(), temp, 1); for (UInt32 i = 0; i < size; i++) data[i] ^= temp[i]; Z7_memset_0_ARRAY(temp); @@ -582,21 +603,7 @@ void CBase::AesCtrXorData(Byte *data, UInt32 size) void CBase::XChaCha20XorData(Byte *data, UInt32 size) { - while (size > 0) - { - if (_xcBlockPos >= kXcBlockSize) - { - NXChaCha20::XChaCha20Block_Core(_xcBlock, _xcDerivedKey, _xcNonce + 16, _xcCounter); - _xcBlockPos = 0; - _xcCounter++; - } - UInt32 avail = kXcBlockSize - _xcBlockPos; - UInt32 toProcess = (size < avail) ? 
size : avail; - XorBytes(data, _xcBlock + _xcBlockPos, toProcess); - data += toProcess; - size -= toProcess; - _xcBlockPos += toProcess; - } + NXChaCha20::XChaCha20ProcessData(data, size, _xcDerivedKey, _xcNonce, _xcCounter, _xcBlock, _xcBlockPos); } void CBaseCoder::InitState() @@ -969,6 +976,8 @@ Z7_COM7F_IMF(CEncoder::ResetInitVector()) _finalized = false; _xcBlockPos = 64; _xcCounter = 0; + _tagReady = false; + _propsWritten = false; const unsigned nonceType = (NAscon::kNonceSize > 16) ? 1 : 0; @@ -1010,6 +1019,7 @@ CEncoder::CEncoder() _xcBlockPos = 64; _xcCounter = 0; _tagReady = false; + _propsWritten = false; memset(_computedTag, 0, NAscon::kTagSize); Z7_memset_0_ARRAY(_aesIv); Z7_memset_0_ARRAY(_xcNonce); @@ -1038,6 +1048,11 @@ Z7_COM7F_IMF(CEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) if (!_tagReady) { + if (!_keyDerived && _propsWritten) + { + DeriveCascadeKeys(); + ProcessAad(_aad, _aadSize); + } if (_finalized) { _tagReady = true; @@ -1052,6 +1067,7 @@ Z7_COM7F_IMF(CEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) memset(_computedTag, 0, NAscon::kTagSize); } } + _propsWritten = true; memcpy(props + propsSize, _computedTag, NAscon::kTagSize); propsSize += NAscon::kTagSize; diff --git a/CPP/7zip/Crypto/Cascade.h b/CPP/7zip/Crypto/Cascade.h index fadd85467..50ab5b9c1 100644 --- a/CPP/7zip/Crypto/Cascade.h +++ b/CPP/7zip/Crypto/Cascade.h @@ -6,6 +6,7 @@ #define ZIP7_INC_CRYPTO_CASCADE_H #include "../../Common/MyCom.h" +#include "../../Common/MyBuffer2.h" #include "../ICoder.h" #include "../IPassword.h" @@ -40,7 +41,7 @@ class CBase Byte _keyAes[32]; Byte _aesIv[16]; - UInt32 _aesKeys[AES_NUM_IVMRK_WORDS]; + CAlignedBuffer1 _aesKeys; Byte _keyXChaCha20[32]; Byte _xcNonce[24]; @@ -49,6 +50,8 @@ class CBase unsigned _xcBlockPos; UInt64 _xcCounter; + UInt32 *AesKeys() { return (UInt32 *)(void *)(Byte *)_aesKeys; } + void PrepareKey(); void DeriveCascadeKeys(); void AesCtrXorData(Byte *data, UInt32 size); @@ -60,7 +63,7 @@ 
class CBase Z7_memset_0_ARRAY(_keyAscon); Z7_memset_0_ARRAY(_keyAes); Z7_memset_0_ARRAY(_aesIv); - Z7_memset_0_ARRAY(_aesKeys); + memset(_aesKeys, 0, AES_NUM_IVMRK_WORDS * sizeof(UInt32)); Z7_memset_0_ARRAY(_keyXChaCha20); Z7_memset_0_ARRAY(_xcNonce); Z7_memset_0_ARRAY(_xcDerivedKey); @@ -116,6 +119,7 @@ class CEncoder Z7_final: Byte _computedTag[NAscon::kTagSize]; bool _tagReady; + bool _propsWritten; Z7_COM7F_IMP2(UInt32, Filter(Byte *data, UInt32 size)) public: CEncoder(); @@ -167,7 +171,7 @@ class CAXPBase Byte _keyAes[32]; Byte _aesIv[16]; - UInt32 _aesKeys[AES_NUM_IVMRK_WORDS]; + CAlignedBuffer1 _aesKeys; Byte _keyXChaCha20[32]; Byte _xcNonce[24]; @@ -183,6 +187,8 @@ class CAXPBase bool _finalized; bool _authOk; + UInt32 *AesKeys() { return (UInt32 *)(void *)(Byte *)_aesKeys; } + void PrepareKey(); void DeriveAXPKeys(); void ComputePolyKey(); @@ -194,7 +200,7 @@ class CAXPBase { Z7_memset_0_ARRAY(_keyAes); Z7_memset_0_ARRAY(_aesIv); - Z7_memset_0_ARRAY(_aesKeys); + memset(_aesKeys, 0, AES_NUM_IVMRK_WORDS * sizeof(UInt32)); Z7_memset_0_ARRAY(_keyXChaCha20); Z7_memset_0_ARRAY(_xcNonce); Z7_memset_0_ARRAY(_xcDerivedKey); @@ -235,6 +241,7 @@ class CAXPEncoder Z7_final: Byte _computedTag[kTagSize]; bool _tagReady; + bool _propsWritten; Z7_COM7F_IMP2(UInt32, Filter(Byte *data, UInt32 size)) public: CAXPEncoder(); diff --git a/CPP/7zip/Crypto/XChaCha20.cpp b/CPP/7zip/Crypto/XChaCha20.cpp index c06de0b2a..f4bec76bb 100644 --- a/CPP/7zip/Crypto/XChaCha20.cpp +++ b/CPP/7zip/Crypto/XChaCha20.cpp @@ -188,66 +188,94 @@ void XChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce, UInt6 #undef DOUBLE_ROUND -void CBaseCoder::ProcessData(Byte *data, UInt32 size) +void XChaCha20ProcessData(Byte *data, UInt32 size, const Byte *derivedKey, const Byte *nonce, UInt64 &counter, Byte *block, unsigned &blockPos) { - if (!_derivedKeyValid) + if (blockPos > 0 && blockPos < kBlockBytes) { - DeriveKey(); + UInt32 remaining = kBlockBytes - blockPos; + UInt32 toProcess = 
(size < remaining) ? size : remaining; + Byte *dataPtr = data; + const Byte *blockPtr = block + blockPos; + UInt32 count = toProcess; +#ifdef MY_CPU_LE_UNALIGN_64 + while (count >= 8) + { + *(UInt64 *)dataPtr ^= *(const UInt64 *)blockPtr; + dataPtr += 8; + blockPtr += 8; + count -= 8; + } +#endif +#ifdef MY_CPU_LE_UNALIGN + while (count >= 4) + { + *(UInt32 *)dataPtr ^= *(const UInt32 *)blockPtr; + dataPtr += 4; + blockPtr += 4; + count -= 4; + } +#endif + while (count--) + *dataPtr++ ^= *blockPtr++; + data += toProcess; + size -= toProcess; + blockPos += toProcess; } - + #ifdef MY_CPU_X86_OR_AMD64 #ifdef MY_CPU_SSE2 InitSIMD(); - if (size >= kBlockSize * 4) + if (size >= kBlockBytes * 4) { UInt32 state[16]; state[0] = GetUi32(kSigma); state[1] = GetUi32(kSigma + 4); state[2] = GetUi32(kSigma + 8); state[3] = GetUi32(kSigma + 12); - state[4] = GetUi32(_derivedKey); - state[5] = GetUi32(_derivedKey + 4); - state[6] = GetUi32(_derivedKey + 8); - state[7] = GetUi32(_derivedKey + 12); - state[8] = GetUi32(_derivedKey + 16); - state[9] = GetUi32(_derivedKey + 20); - state[10] = GetUi32(_derivedKey + 24); - state[11] = GetUi32(_derivedKey + 28); - state[12] = (UInt32)(_counter & 0xFFFFFFFF); - state[13] = (UInt32)(_counter >> 32); - state[14] = GetUi32(_nonce + 16); - state[15] = GetUi32(_nonce + 20); + state[4] = GetUi32(derivedKey); + state[5] = GetUi32(derivedKey + 4); + state[6] = GetUi32(derivedKey + 8); + state[7] = GetUi32(derivedKey + 12); + state[8] = GetUi32(derivedKey + 16); + state[9] = GetUi32(derivedKey + 20); + state[10] = GetUi32(derivedKey + 24); + state[11] = GetUi32(derivedKey + 28); + state[12] = (UInt32)(counter & 0xFFFFFFFF); + state[13] = (UInt32)(counter >> 32); + state[14] = GetUi32(nonce + 16); + state[15] = GetUi32(nonce + 20); #ifdef MY_CPU_AMD64 - if (g_AVX2Enabled && size >= kBlockSize * 8) + if (g_AVX2Enabled && size >= kBlockBytes * 8) { - while (size >= kBlockSize * 8) + while (size >= kBlockBytes * 8) { 
ChaCha20_OperateKeystream_AVX2(state, data, data); state[12] += 8; if (state[12] < 8) state[13]++; - data += kBlockSize * 8; - size -= kBlockSize * 8; + data += kBlockBytes * 8; + size -= kBlockBytes * 8; } } #endif - if (g_SSE2Enabled && size >= kBlockSize * 4) + if (g_SSE2Enabled && size >= kBlockBytes * 4) { - while (size >= kBlockSize * 4) + while (size >= kBlockBytes * 4) { ChaCha20_OperateKeystream_SSE2(state, data, data); state[12] += 4; if (state[12] < 4) state[13]++; - data += kBlockSize * 4; - size -= kBlockSize * 4; + data += kBlockBytes * 4; + size -= kBlockBytes * 4; } } - _counter = (UInt64)state[13] << 32 | state[12]; + counter = (UInt64)state[13] << 32 | state[12]; + blockPos = kBlockBytes; } #endif #endif @@ -255,54 +283,55 @@ void CBaseCoder::ProcessData(Byte *data, UInt32 size) #ifdef MY_CPU_ARM_OR_ARM64 InitSIMD(); - if (g_NEONEnabled && size >= kBlockSize * 4) + if (g_NEONEnabled && size >= kBlockBytes * 4) { UInt32 state[16]; state[0] = GetUi32(kSigma); state[1] = GetUi32(kSigma + 4); state[2] = GetUi32(kSigma + 8); state[3] = GetUi32(kSigma + 12); - state[4] = GetUi32(_derivedKey); - state[5] = GetUi32(_derivedKey + 4); - state[6] = GetUi32(_derivedKey + 8); - state[7] = GetUi32(_derivedKey + 12); - state[8] = GetUi32(_derivedKey + 16); - state[9] = GetUi32(_derivedKey + 20); - state[10] = GetUi32(_derivedKey + 24); - state[11] = GetUi32(_derivedKey + 28); - state[12] = (UInt32)(_counter & 0xFFFFFFFF); - state[13] = (UInt32)(_counter >> 32); - state[14] = GetUi32(_nonce + 16); - state[15] = GetUi32(_nonce + 20); - - while (size >= kBlockSize * 4) + state[4] = GetUi32(derivedKey); + state[5] = GetUi32(derivedKey + 4); + state[6] = GetUi32(derivedKey + 8); + state[7] = GetUi32(derivedKey + 12); + state[8] = GetUi32(derivedKey + 16); + state[9] = GetUi32(derivedKey + 20); + state[10] = GetUi32(derivedKey + 24); + state[11] = GetUi32(derivedKey + 28); + state[12] = (UInt32)(counter & 0xFFFFFFFF); + state[13] = (UInt32)(counter >> 32); + state[14] 
= GetUi32(nonce + 16); + state[15] = GetUi32(nonce + 20); + + while (size >= kBlockBytes * 4) { ChaCha20_OperateKeystream_NEON(state, data, data); state[12] += 4; if (state[12] < 4) state[13]++; - data += kBlockSize * 4; - size -= kBlockSize * 4; + data += kBlockBytes * 4; + size -= kBlockBytes * 4; } - _counter = (UInt64)state[13] << 32 | state[12]; + counter = (UInt64)state[13] << 32 | state[12]; + blockPos = kBlockBytes; } #endif while (size > 0) { - if (_blockPos == 0 || _blockPos >= kBlockSize) + if (blockPos == 0 || blockPos >= kBlockBytes) { - XChaCha20Block_Core(_block, _derivedKey, _nonce + 16, _counter); - _blockPos = 0; - _counter++; + XChaCha20Block_Core(block, derivedKey, nonce + 16, counter); + blockPos = 0; + counter++; } - UInt32 remaining = kBlockSize - _blockPos; + UInt32 remaining = kBlockBytes - blockPos; UInt32 toProcess = (size < remaining) ? size : remaining; Byte *dataPtr = data; - const Byte *blockPtr = _block + _blockPos; + const Byte *blockPtr = block + blockPos; UInt32 count = toProcess; #ifdef MY_CPU_LE_UNALIGN_64 @@ -330,8 +359,17 @@ void CBaseCoder::ProcessData(Byte *data, UInt32 size) data += toProcess; size -= toProcess; - _blockPos += toProcess; + blockPos += toProcess; + } +} + +void CBaseCoder::ProcessData(Byte *data, UInt32 size) +{ + if (!_derivedKeyValid) + { + DeriveKey(); } + XChaCha20ProcessData(data, size, _derivedKey, _nonce, _counter, _block, _blockPos); } #ifndef Z7_EXTRACT_ONLY diff --git a/CPP/7zip/Crypto/XChaCha20.h b/CPP/7zip/Crypto/XChaCha20.h index 01b4c7c6b..cc8745d72 100644 --- a/CPP/7zip/Crypto/XChaCha20.h +++ b/CPP/7zip/Crypto/XChaCha20.h @@ -22,9 +22,11 @@ using N7zKeyDerivation::kKeySize; const unsigned kNonceSize = 24; const unsigned k_NumCyclesPower_Supported_MAX = 24; +const unsigned kBlockBytes = 64; void XChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce, UInt64 counter); void XHChaCha20Block_Core(Byte *output, const Byte *key, const Byte *nonce); +void XChaCha20ProcessData(Byte *data, 
UInt32 size, const Byte *derivedKey, const Byte *nonce, UInt64 &counter, Byte *block, unsigned &blockPos); class CBase { diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp index f7bd68520..4b0ccf9dd 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp @@ -186,7 +186,6 @@ void CPoly1305::PadAndProcessBlock(Byte *buf, unsigned bufPos) { if (bufPos != 0) { - // RFC 8439: pad16 用 0x00 填充到 16 字节边界,作为完整块处理(添加 2^128) memset(buf + bufPos, 0, 16 - bufPos); Poly1305_ProcessBlock(_h, _r, _s, buf, true); } @@ -304,6 +303,7 @@ Z7_COM7F_IMF(CEncoder::ResetInitVector()) } _tagReady = false; + _propsWritten = false; memset(_computedTag, 0, kTagSize); return S_OK; } @@ -348,6 +348,8 @@ Z7_COM7F_IMF(CEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) if (!_tagReady) { + if (!_derivedKeyValid && _propsWritten) + DeriveKey(); if (_derivedKeyValid) { _poly1305.Final(_computedTag); @@ -355,11 +357,10 @@ Z7_COM7F_IMF(CEncoder::WriteCoderProperties(ISequentialOutStream *outStream)) } else { - // 编码尚未开始(FillProps_from_Coder 第一次调用),写入空 tag - // 编码完成后 FillProps_from_Coder 会再次调用,此时计算真正的 tag memset(_computedTag, 0, kTagSize); } } + _propsWritten = true; memcpy(props + propsSize, _computedTag, kTagSize); propsSize += kTagSize; @@ -375,6 +376,7 @@ CEncoder::CEncoder() _derivedKeyValid = false; _aadSize = 0; _tagReady = false; + _propsWritten = false; memset(_computedTag, 0, kTagSize); } @@ -461,6 +463,8 @@ Z7_COM7F_IMF(CDecoder::CryptoAuthVerify(Int32 *result)) return S_OK; } _authChecked = true; + if (!_derivedKeyValid) + DeriveKey(); Byte computedTag[kTagSize]; _poly1305.Final(computedTag); diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.h b/CPP/7zip/Crypto/XChaCha20Poly1305.h index f37aa12af..bb402ad4d 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.h +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.h @@ -83,6 +83,7 @@ class CEncoder Z7_final: Byte _computedTag[kTagSize]; bool _tagReady; + bool _propsWritten; 
Z7_COM7F_IMP2(UInt32, Filter(Byte *data, UInt32 size)) public: CEncoder(); From 6aba265d72a740a39c495a317ec55110d139a6bf Mon Sep 17 00:00:00 2001 From: fzxx <53783792+fzxx@users.noreply.github.com> Date: Tue, 23 Jun 2026 21:27:14 +0800 Subject: [PATCH 18/18] Fix tag issues and add ARM hardware acceleration for Ascon. --- CPP/7zip/Archive/7z/7zDecode.cpp | 2 +- CPP/7zip/Bundles/SFXCon/SFXCon.dsp | 87 +++++++++++++++++++++++++++ CPP/7zip/Crypto/Ascon.cpp | 15 +++++ CPP/7zip/Crypto/AsconSimd.h | 37 ++++++++++++ CPP/7zip/Crypto/Cascade.cpp | 47 +++++++++++++++ CPP/7zip/Crypto/HkdfBlake2sp.cpp | 38 +++++------- CPP/7zip/Crypto/HkdfBlake2sp.h | 1 - CPP/7zip/Crypto/XChaCha20Poly1305.cpp | 3 +- 8 files changed, 203 insertions(+), 27 deletions(-) diff --git a/CPP/7zip/Archive/7z/7zDecode.cpp b/CPP/7zip/Archive/7z/7zDecode.cpp index 04cb7c13a..87af11c0b 100644 --- a/CPP/7zip/Archive/7z/7zDecode.cpp +++ b/CPP/7zip/Archive/7z/7zDecode.cpp @@ -601,7 +601,7 @@ HRESULT CDecoder::Decode( Int32 authResult = 0; RINOK(authVerify->CryptoAuthVerify(&authResult)) if (authResult != 0) - return E_FAIL; + return S_FALSE; } } #endif diff --git a/CPP/7zip/Bundles/SFXCon/SFXCon.dsp b/CPP/7zip/Bundles/SFXCon/SFXCon.dsp index 5bb1c5058..2b03996ed 100644 --- a/CPP/7zip/Bundles/SFXCon/SFXCon.dsp +++ b/CPP/7zip/Bundles/SFXCon/SFXCon.dsp @@ -387,6 +387,66 @@ SOURCE=..\..\Crypto\MyAes.cpp SOURCE=..\..\Crypto\MyAes.h # End Source File +# Begin Source File + +SOURCE=..\..\Crypto\XChaCha20.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\XChaCha20.h +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\XChaCha20Register.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\XChaCha20Poly1305.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\XChaCha20Poly1305.h +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\XChaCha20Poly1305Register.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Cascade.cpp +# End Source File +# 
Begin Source File + +SOURCE=..\..\Crypto\Cascade.h +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\CascadeRegister.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Ascon.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\Ascon.h +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\AsconSimd.h +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\ChaCha20Simd.h +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\HkdfBlake2sp.cpp +# End Source File +# Begin Source File + +SOURCE=..\..\Crypto\HkdfBlake2sp.h +# End Source File # End Group # Begin Group "Windows" @@ -553,6 +613,10 @@ SOURCE=..\..\..\Common\Sha256Prepare.cpp # End Source File # Begin Source File +SOURCE=..\..\..\Common\Sha512Prepare.cpp +# End Source File +# Begin Source File + SOURCE=..\..\..\Common\StdInStream.cpp # End Source File # Begin Source File @@ -997,6 +1061,29 @@ SOURCE=..\..\..\..\C\Sha256Opt.c # End Source File # Begin Source File +SOURCE=..\..\..\..\C\Sha512.c +# SUBTRACT CPP /YX /Yc /Yu +# End Source File +# Begin Source File + +SOURCE=..\..\..\..\C\Sha512.h +# End Source File +# Begin Source File + +SOURCE=..\..\..\..\C\Sha512Opt.c +# SUBTRACT CPP /YX /Yc /Yu +# End Source File +# Begin Source File + +SOURCE=..\..\..\..\C\Blake2s.c +# SUBTRACT CPP /YX /Yc /Yu +# End Source File +# Begin Source File + +SOURCE=..\..\..\..\C\Blake2.h +# End Source File +# Begin Source File + SOURCE=..\..\..\..\C\Threads.c # SUBTRACT CPP /YX /Yc /Yu # End Source File diff --git a/CPP/7zip/Crypto/Ascon.cpp b/CPP/7zip/Crypto/Ascon.cpp index c73f0cc51..2ab6b09bd 100644 --- a/CPP/7zip/Crypto/Ascon.cpp +++ b/CPP/7zip/Crypto/Ascon.cpp @@ -68,6 +68,21 @@ void InitSIMD() #endif +#ifdef MY_CPU_ARM_OR_ARM64 + +bool g_NEONEnabled = false; +bool g_SIMDARMInitialized = false; + +void InitSIMD() +{ + if (g_SIMDARMInitialized) + return; + g_SIMDARMInitialized = true; + g_NEONEnabled = CPU_IsSupported_NEON() != 0; +} + +#endif + #define RC0 0xf0 
#define RC1 0xe1 #define RC2 0xd2 diff --git a/CPP/7zip/Crypto/AsconSimd.h b/CPP/7zip/Crypto/AsconSimd.h index e9a16543f..5fdb2a505 100644 --- a/CPP/7zip/Crypto/AsconSimd.h +++ b/CPP/7zip/Crypto/AsconSimd.h @@ -11,6 +11,10 @@ #include #endif +#ifdef MY_CPU_ARM_OR_ARM64 +#include +#endif + namespace NCrypto { namespace NAscon { @@ -27,6 +31,15 @@ void InitSIMD(); #endif +#ifdef MY_CPU_ARM_OR_ARM64 + +extern bool g_NEONEnabled; +extern bool g_SIMDARMInitialized; + +void InitSIMD(); + +#endif + #ifdef MY_CPU_SSE2 static Z7_FORCE_INLINE void AsconEncBlock_SSE2(UInt64 state[5], Byte *data) @@ -50,6 +63,30 @@ static Z7_FORCE_INLINE void AsconDecBlock_SSE2(UInt64 state[5], Byte *data) #endif +#ifdef MY_CPU_ARM_OR_ARM64 + +static Z7_FORCE_INLINE void AsconEncBlock_NEON(UInt64 state[5], Byte *data) +{ + uint64x2_t ks = vld1q_u64((const uint64_t*)state); + uint64x2_t pt = vld1q_u64((const uint64_t*)data); + uint64x2_t ct = veorq_u64(pt, ks); + vst1q_u64((uint64_t*)data, ct); + state[0] = vgetq_lane_u64(ct, 0); + state[1] = vgetq_lane_u64(ct, 1); +} + +static Z7_FORCE_INLINE void AsconDecBlock_NEON(UInt64 state[5], Byte *data) +{ + uint64x2_t ks = vld1q_u64((const uint64_t*)state); + uint64x2_t ct = vld1q_u64((const uint64_t*)data); + uint64x2_t pt = veorq_u64(ct, ks); + vst1q_u64((uint64_t*)data, pt); + state[0] = vgetq_lane_u64(ct, 0); + state[1] = vgetq_lane_u64(ct, 1); +} + +#endif + #ifdef MY_CPU_AMD64 #include diff --git a/CPP/7zip/Crypto/Cascade.cpp b/CPP/7zip/Crypto/Cascade.cpp index a90a701fd..0dd770761 100644 --- a/CPP/7zip/Crypto/Cascade.cpp +++ b/CPP/7zip/Crypto/Cascade.cpp @@ -516,6 +516,12 @@ static CKeyInfoCache g_GlobalKeyCache(32); #define ASCON_USE_SSE2 false #endif +#ifdef MY_CPU_ARM_OR_ARM64 +#define ASCON_USE_NEON (NAscon::g_NEONEnabled) +#else +#define ASCON_USE_NEON false +#endif + CBase::CBase(): _cachedKeys(16), _keyDerived(false), @@ -679,6 +685,13 @@ void CBaseCoder::ProcessEnc(Byte *data, UInt32 size) #else const bool useSSE2 = false; 
(void)useSSE2; +#endif +#ifdef MY_CPU_ARM_OR_ARM64 + NAscon::InitSIMD(); + const bool useNEON = ASCON_USE_NEON; +#else + const bool useNEON = false; + (void)useNEON; #endif if (!_keyDerived) { @@ -699,6 +712,9 @@ void CBaseCoder::ProcessEnc(Byte *data, UInt32 size) XorBytes(p, _stateBuf + _stateBufPos, toProcess); memcpy(_stateBuf + _stateBufPos, p, toProcess); + // Sync _stateBuf into _state so that Finalize computes the tag from the correct state + // Kept consistent with ProcessDec (see ProcessDec, line 809) + memcpy((Byte *)_state + _stateBufPos, _stateBuf + _stateBufPos, toProcess); _stateBufPos += toProcess; p += toProcess; @@ -726,6 +742,18 @@ void CBaseCoder::ProcessEnc(Byte *data, UInt32 size) } while (remaining >= NAscon::kRateSize); } else +#endif +#ifdef MY_CPU_ARM_OR_ARM64 + if (useNEON) + { + do { + NAscon::AsconEncBlock_NEON(_state, p); + NAscon::AsconP8(_state); + p += NAscon::kRateSize; + remaining -= NAscon::kRateSize; + } while (remaining >= NAscon::kRateSize); + } + else #endif { do { @@ -759,6 +787,13 @@ void CBaseCoder::ProcessDec(Byte *data, UInt32 size) #else const bool useSSE2 = false; (void)useSSE2; +#endif +#ifdef MY_CPU_ARM_OR_ARM64 + NAscon::InitSIMD(); + const bool useNEON = ASCON_USE_NEON; +#else + const bool useNEON = false; + (void)useNEON; #endif if (!_keyDerived) { @@ -832,6 +867,18 @@ void CBaseCoder::ProcessDec(Byte *data, UInt32 size) } while (remaining >= NAscon::kRateSize); } else +#endif +#ifdef MY_CPU_ARM_OR_ARM64 + if (useNEON) + { + do { + NAscon::AsconDecBlock_NEON(_state, p); + NAscon::AsconP8(_state); + p += NAscon::kRateSize; + remaining -= NAscon::kRateSize; + } while (remaining >= NAscon::kRateSize); + } + else #endif { do { diff --git a/CPP/7zip/Crypto/HkdfBlake2sp.cpp b/CPP/7zip/Crypto/HkdfBlake2sp.cpp index 0613f9f93..d8d965551 100644 --- a/CPP/7zip/Crypto/HkdfBlake2sp.cpp +++ b/CPP/7zip/Crypto/HkdfBlake2sp.cpp @@ -11,6 +11,13 @@ namespace NHkdfBlake2sp { #define BLAKE2SP_BLOCK_SIZE 64 +#define Z7_HKDF_MAX_OUT_SIZE (255 * Z7_BLAKE2S_DIGEST_SIZE) + +static struct 
CBlake2sp_Prepare +{ + CBlake2sp_Prepare() { z7_Black2sp_Prepare(); } +} g_Blake2sp_Prepare; + static void CloneBlake2spState(CBlake2sp *dest, const CBlake2sp *src) { memcpy(dest, src, sizeof(CBlake2sp)); @@ -20,12 +27,8 @@ void Derive(const Byte *prk, unsigned prkSize, const char *info, unsigned infoLen, Byte *output, unsigned outSize) { - static bool blake2spPrepared = false; - if (!blake2spPrepared) - { - z7_Black2sp_Prepare(); - blake2spPrepared = true; - } + if (outSize > Z7_HKDF_MAX_OUT_SIZE) + return; Byte processedKey[Z7_BLAKE2S_DIGEST_SIZE]; const Byte *effectiveKey; @@ -86,25 +89,13 @@ void Derive(const Byte *prk, unsigned prkSize, for (unsigned i = 1; i <= n; i++) { - Byte message[Z7_BLAKE2S_DIGEST_SIZE + 256 + 1]; - unsigned messageSize = 0; - + CloneBlake2spState(innerTmp, innerState); if (prevTSize > 0) - { - memcpy(message + messageSize, prevT, prevTSize); - messageSize += prevTSize; - } - + Blake2sp_Update(innerTmp, prevT, prevTSize); if (infoLen > 0) - { - memcpy(message + messageSize, info, infoLen); - messageSize += infoLen; - } - - message[messageSize] = (Byte)i; - messageSize += 1; - CloneBlake2spState(innerTmp, innerState); - Blake2sp_Update(innerTmp, message, messageSize); + Blake2sp_Update(innerTmp, (const Byte *)info, infoLen); + const Byte counter = (Byte)i; + Blake2sp_Update(innerTmp, &counter, 1); Byte innerHash[Z7_BLAKE2S_DIGEST_SIZE]; Blake2sp_Final(innerTmp, innerHash); @@ -123,7 +114,6 @@ void Derive(const Byte *prk, unsigned prkSize, prevTSize = Z7_BLAKE2S_DIGEST_SIZE; Z7_memset_0_ARRAY(ti); - Z7_memset_0_ARRAY(message); Z7_memset_0_ARRAY(innerHash); } diff --git a/CPP/7zip/Crypto/HkdfBlake2sp.h b/CPP/7zip/Crypto/HkdfBlake2sp.h index 827d21bf8..2454accee 100644 --- a/CPP/7zip/Crypto/HkdfBlake2sp.h +++ b/CPP/7zip/Crypto/HkdfBlake2sp.h @@ -11,7 +11,6 @@ namespace NCrypto { namespace NHkdfBlake2sp { -// HKDF-Expand (RFC 5869) using HMAC-BLAKE2sp void Derive(const Byte *prk, unsigned prkSize, const char *info, unsigned infoLen, Byte 
*output, unsigned outSize); diff --git a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp index 4b0ccf9dd..591d71d1b 100644 --- a/CPP/7zip/Crypto/XChaCha20Poly1305.cpp +++ b/CPP/7zip/Crypto/XChaCha20Poly1305.cpp @@ -187,7 +187,8 @@ void CPoly1305::PadAndProcessBlock(Byte *buf, unsigned bufPos) if (bufPos != 0) { memset(buf + bufPos, 0, 16 - bufPos); - Poly1305_ProcessBlock(_h, _r, _s, buf, true); + buf[bufPos] = 1; + Poly1305_ProcessBlock(_h, _r, _s, buf, false); } }