7zip/C/Blake2s.c
/* Blake2s.c -- BLAKE2sp Hash
2024-01-29 : Igor Pavlov : Public domain
2015-2019 : Samuel Neves : original code : CC0 1.0 Universal (CC0 1.0). */
#include "Precomp.h"
// #include <stdio.h>
#include <string.h>
#include "Blake2.h"
#include "RotateDefs.h"
#include "Compiler.h"
#include "CpuArch.h"
#if defined(__SSE2__)
#define Z7_BLAKE2S_USE_VECTORS
#elif defined(MY_CPU_X86_OR_AMD64)
#if defined(_MSC_VER) && _MSC_VER > 1200 \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
|| defined(__clang__) \
|| defined(__INTEL_COMPILER)
#define Z7_BLAKE2S_USE_VECTORS
#endif
#endif
#ifdef Z7_BLAKE2S_USE_VECTORS
#define Z7_BLAKE2SP_USE_FUNCTIONS
// define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED, if CBlake2sp can be non-aligned for 32 bytes.
// #define Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
// SSSE3 : for _mm_shuffle_epi8 (pshufb) that improves the performance by 5-15%.
#if defined(__SSSE3__)
#define Z7_BLAKE2S_USE_SSSE3
#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
|| defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
#define Z7_BLAKE2S_USE_SSSE3
#endif
#ifdef Z7_BLAKE2S_USE_SSSE3
/* SSE41 : for _mm_insert_epi32 (pinsrd)
it can slightly reduce code size and improve the performance in some cases.
it's used only for the last 512-1024 bytes, if FAST versions (2 or 3) of the vector algos are used.
it can be used for all blocks in the other algos (4+).
*/
#if defined(__SSE4_1__)
#define Z7_BLAKE2S_USE_SSE41
#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1500) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40300) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40000) \
|| defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 20300) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1000)
#define Z7_BLAKE2S_USE_SSE41
#endif
#endif // SSSE3
#if defined(__GNUC__) || defined(__clang__)
#if defined(Z7_BLAKE2S_USE_SSE41)
#define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse4.1")))
#elif defined(Z7_BLAKE2S_USE_SSSE3)
#define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("ssse3")))
#else
#define BLAKE2S_ATTRIB_128BIT __attribute__((__target__("sse2")))
#endif
#endif
#if defined(__AVX2__)
#define Z7_BLAKE2S_USE_AVX2
#else
#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
|| defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100)
#define Z7_BLAKE2S_USE_AVX2
#ifdef Z7_BLAKE2S_USE_AVX2
#define BLAKE2S_ATTRIB_AVX2 __attribute__((__target__("avx2")))
#endif
#elif defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
#if (Z7_MSC_VER_ORIGINAL == 1900)
#pragma warning(disable : 4752) // found Intel(R) Advanced Vector Extensions; consider using /arch:AVX
#endif
#define Z7_BLAKE2S_USE_AVX2
#endif
#endif
#ifdef Z7_BLAKE2S_USE_SSE41
#include <smmintrin.h> // SSE4.1
#elif defined(Z7_BLAKE2S_USE_SSSE3)
#include <tmmintrin.h> // SSSE3
#else
#include <emmintrin.h> // SSE2
#endif
#ifdef Z7_BLAKE2S_USE_AVX2
#include <immintrin.h>
#if defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#endif
#endif // avx2
#if defined(__AVX512F__) && defined(__AVX512VL__)
// && defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL > 1930)
#define Z7_BLAKE2S_USE_AVX512_ALWAYS
// #pragma message ("=== Blake2s AVX512")
#endif
#define Z7_BLAKE2S_USE_V128_FAST
// for speed optimization for small messages:
// #define Z7_BLAKE2S_USE_V128_WAY2
#ifdef Z7_BLAKE2S_USE_AVX2
// for debug:
// gather is slow
// #define Z7_BLAKE2S_USE_GATHER
#define Z7_BLAKE2S_USE_AVX2_FAST
// for speed optimization for small messages:
// #define Z7_BLAKE2S_USE_AVX2_WAY2
// #define Z7_BLAKE2S_USE_AVX2_WAY4
#if defined(Z7_BLAKE2S_USE_AVX2_WAY2) || \
defined(Z7_BLAKE2S_USE_AVX2_WAY4)
#define Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#endif
#endif
#define Z7_BLAKE2SP_ALGO_DEFAULT 0
#define Z7_BLAKE2SP_ALGO_SCALAR 1
#ifdef Z7_BLAKE2S_USE_V128_FAST
#define Z7_BLAKE2SP_ALGO_V128_FAST 2
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_FAST
#define Z7_BLAKE2SP_ALGO_V256_FAST 3
#endif
#define Z7_BLAKE2SP_ALGO_V128_WAY1 4
#ifdef Z7_BLAKE2S_USE_V128_WAY2
#define Z7_BLAKE2SP_ALGO_V128_WAY2 5
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
#define Z7_BLAKE2SP_ALGO_V256_WAY2 6
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
#define Z7_BLAKE2SP_ALGO_V256_WAY4 7
#endif
#endif // Z7_BLAKE2S_USE_VECTORS
#define BLAKE2S_FINAL_FLAG (~(UInt32)0)
#define NSW Z7_BLAKE2SP_NUM_STRUCT_WORDS
#define SUPER_BLOCK_SIZE (Z7_BLAKE2S_BLOCK_SIZE * Z7_BLAKE2SP_PARALLEL_DEGREE)
#define SUPER_BLOCK_MASK (SUPER_BLOCK_SIZE - 1)
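// one "super block" feeds one 64-byte block to each of the Z7_BLAKE2SP_PARALLEL_DEGREE (8)
// leaf lanes, so SUPER_BLOCK_SIZE is 8 * 64 = 512 bytes of input.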
#define V_INDEX_0_0 0
#define V_INDEX_1_0 1
#define V_INDEX_2_0 2
#define V_INDEX_3_0 3
#define V_INDEX_0_1 4
#define V_INDEX_1_1 5
#define V_INDEX_2_1 6
#define V_INDEX_3_1 7
#define V_INDEX_0_2 8
#define V_INDEX_1_2 9
#define V_INDEX_2_2 10
#define V_INDEX_3_2 11
#define V_INDEX_0_3 12
#define V_INDEX_1_3 13
#define V_INDEX_2_3 14
#define V_INDEX_3_3 15
#define V_INDEX_4_0 0
#define V_INDEX_5_0 1
#define V_INDEX_6_0 2
#define V_INDEX_7_0 3
#define V_INDEX_7_1 4
#define V_INDEX_4_1 5
#define V_INDEX_5_1 6
#define V_INDEX_6_1 7
#define V_INDEX_6_2 8
#define V_INDEX_7_2 9
#define V_INDEX_4_2 10
#define V_INDEX_5_2 11
#define V_INDEX_5_3 12
#define V_INDEX_6_3 13
#define V_INDEX_7_3 14
#define V_INDEX_4_3 15
#define V(row, col) v[V_INDEX_ ## row ## _ ## col]
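// V(a, i) maps the i-th operand of G-call number (a) onto the 16-entry v[] array:
// a = 0..3 select the column calls (G(v0,v4,v8,v12), ...), and a = 4..7 select the
// diagonal calls (G(v0,v5,v10,v15), ...), so a full round can be expressed
// without moving words between the v[] registers.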
#define k_Blake2s_IV_0 0x6A09E667UL
#define k_Blake2s_IV_1 0xBB67AE85UL
#define k_Blake2s_IV_2 0x3C6EF372UL
#define k_Blake2s_IV_3 0xA54FF53AUL
#define k_Blake2s_IV_4 0x510E527FUL
#define k_Blake2s_IV_5 0x9B05688CUL
#define k_Blake2s_IV_6 0x1F83D9ABUL
#define k_Blake2s_IV_7 0x5BE0CD19UL
#define KIV(n) (k_Blake2s_IV_## n)
#ifdef Z7_BLAKE2S_USE_VECTORS
MY_ALIGN(16)
static const UInt32 k_Blake2s_IV[8] =
{
KIV(0), KIV(1), KIV(2), KIV(3), KIV(4), KIV(5), KIV(6), KIV(7)
};
#endif
#define STATE_T(s) ((s) + 8)
#define STATE_F(s) ((s) + 10)
#ifdef Z7_BLAKE2S_USE_VECTORS
#define LOAD_128(p) _mm_load_si128 ((const __m128i *)(const void *)(p))
#define LOADU_128(p) _mm_loadu_si128((const __m128i *)(const void *)(p))
#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
// here we use unaligned loads and stores
// use this branch if CBlake2sp can be unaligned for 16 bytes
#define STOREU_128(p, r) _mm_storeu_si128((__m128i *)(void *)(p), r)
#define LOAD_128_FROM_STRUCT(p) LOADU_128(p)
#define STORE_128_TO_STRUCT(p, r) STOREU_128(p, r)
#else
// here we use aligned loads and stores
// use this branch if CBlake2sp is aligned for 16 bytes
#define STORE_128(p, r) _mm_store_si128((__m128i *)(void *)(p), r)
#define LOAD_128_FROM_STRUCT(p) LOAD_128(p)
#define STORE_128_TO_STRUCT(p, r) STORE_128(p, r)
#endif
#endif // Z7_BLAKE2S_USE_VECTORS
#if 0
static void PrintState(const UInt32 *s, unsigned num)
{
unsigned i;
printf("\n");
for (i = 0; i < num; i++)
printf(" %08x", (unsigned)s[i]);
}
static void PrintStates2(const UInt32 *s, unsigned x, unsigned y)
{
unsigned i;
for (i = 0; i < y; i++)
PrintState(s + i * x, x);
printf("\n");
}
#endif
#define REP8_MACRO(m) { m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) }
#define BLAKE2S_NUM_ROUNDS 10
#if defined(Z7_BLAKE2S_USE_VECTORS)
#define ROUNDS_LOOP(mac) \
{ unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r++) mac(r) }
#endif
/*
#define ROUNDS_LOOP_2(mac) \
{ unsigned r; for (r = 0; r < BLAKE2S_NUM_ROUNDS; r += 2) { mac(r) mac(r + 1) } }
*/
#if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
#define ROUNDS_LOOP_UNROLLED(m) \
{ m(0) m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) }
#endif
#define SIGMA_TABLE(M) \
M( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ), \
M( 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 ), \
M( 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 ), \
M( 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 ), \
M( 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 ), \
M( 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 ), \
M( 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 ), \
M( 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 ), \
M( 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 ), \
M( 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 )
#define SIGMA_TABLE_MULT(m, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
{ a0*m,a1*m,a2*m,a3*m,a4*m,a5*m,a6*m,a7*m,a8*m,a9*m,a10*m,a11*m,a12*m,a13*m,a14*m,a15*m }
#define SIGMA_TABLE_MULT_4( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
SIGMA_TABLE_MULT(4, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
// MY_ALIGN(32)
MY_ALIGN(16)
static const Byte k_Blake2s_Sigma_4[BLAKE2S_NUM_ROUNDS][16] =
{ SIGMA_TABLE(SIGMA_TABLE_MULT_4) };
#define GET_SIGMA_PTR(p, index) \
((const void *)((const Byte *)(const void *)(p) + (index)))
#define GET_STATE_TABLE_PTR_FROM_BYTE_POS(s, pos) \
((UInt32 *)(void *)((Byte *)(void *)(s) + (pos)))
#ifdef Z7_BLAKE2S_USE_VECTORS
#if 0
// use loading constants from memory
// is faster for some compilers.
#define KK4(n) KIV(n), KIV(n), KIV(n), KIV(n)
MY_ALIGN(64)
static const UInt32 k_Blake2s_IV_WAY4[]=
{
KK4(0), KK4(1), KK4(2), KK4(3), KK4(4), KK4(5), KK4(6), KK4(7)
};
#define GET_128_IV_WAY4(i) LOAD_128(k_Blake2s_IV_WAY4 + 4 * (i))
#else
// use constant generation:
#define GET_128_IV_WAY4(i) _mm_set1_epi32((Int32)KIV(i))
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#define GET_CONST_128_FROM_ARRAY32(k) \
_mm_set_epi32((Int32)(k)[3], (Int32)(k)[2], (Int32)(k)[1], (Int32)(k)[0])
#endif
#if 0
#define k_r8 _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
#define k_r16 _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
#define k_inc _mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE)
#define k_iv0_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0)
#define k_iv4_128 GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4)
#else
#if defined(Z7_BLAKE2S_USE_SSSE3) && \
!defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
MY_ALIGN(16) static const Byte k_r8_arr [16] = { 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12 };
MY_ALIGN(16) static const Byte k_r16_arr[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 };
#define k_r8 LOAD_128(k_r8_arr)
#define k_r16 LOAD_128(k_r16_arr)
#endif
MY_ALIGN(16) static const UInt32 k_inc_arr[4] = { Z7_BLAKE2S_BLOCK_SIZE, 0, 0, 0 };
#define k_inc LOAD_128(k_inc_arr)
#define k_iv0_128 LOAD_128(k_Blake2s_IV + 0)
#define k_iv4_128 LOAD_128(k_Blake2s_IV + 4)
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#ifdef Z7_BLAKE2S_USE_AVX2
#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION < 80000)
#define MY_mm256_set_m128i(hi, lo) _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)
#else
#define MY_mm256_set_m128i _mm256_set_m128i
#endif
#define SET_FROM_128(a) MY_mm256_set_m128i(a, a)
#ifndef Z7_BLAKE2S_USE_AVX512_ALWAYS
MY_ALIGN(32) static const Byte k_r8_arr_256 [32] =
{
1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12,
1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8 ,13, 14, 15, 12
};
MY_ALIGN(32) static const Byte k_r16_arr_256[32] =
{
2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13,
2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
};
#define k_r8_256 LOAD_256(k_r8_arr_256)
#define k_r16_256 LOAD_256(k_r16_arr_256)
#endif
// #define k_r8_256 SET_FROM_128(_mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1))
// #define k_r16_256 SET_FROM_128(_mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2))
// #define k_inc_256 SET_FROM_128(_mm_set_epi32(0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE))
// #define k_iv0_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 0))
#define k_iv4_256 SET_FROM_128(GET_CONST_128_FROM_ARRAY32(k_Blake2s_IV + 4))
#endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#endif
/*
IPC(TP) ports:
1 p__5 : skl- : SSE : shufps : _mm_shuffle_ps
2 p_15 : icl+
1 p__5 : nhm-bdw : SSE : xorps : _mm_xor_ps
3 p015 : skl+
3 p015 : SSE2 : pxor : _mm_xor_si128
2 p_15: snb-bdw : SSE2 : padd : _mm_add_epi32
2 p0_5: mrm-wsm :
3 p015 : skl+
2 p_15 : ivb-,icl+ : SSE2 : punpcklqdq, punpckhqdq, punpckldq, punpckhdq
2 p_15 : : SSE2 : pshufd : _mm_shuffle_epi32
2 p_15 : : SSE2 : pshuflw : _mm_shufflelo_epi16
2 p_15 : : SSE2 : psrldq :
2 p_15 : : SSE3 : pshufb : _mm_shuffle_epi8
2 p_15 : : SSE4 : pblendw : _mm_blend_epi16
1 p__5 : hsw-skl : *
1 p0 : SSE2 : pslld (i8) : _mm_slli_si128
2 p01 : skl+ :
2 p_15 : ivb- : SSE3 : palignr
1 p__5 : hsw+
2 p_15 + p23 : ivb-, icl+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, m32, i8)
1 p__5 + p23 : hsw-skl
1 p_15 + p5 : ivb-, ice+ : SSE4 : pinsrd : _mm_insert_epi32(xmm, r32, i8)
0.5 2*p5 : hsw-skl
2 p23 : SSE2 : movd (m32)
3 p23A : adl :
1 p5: : SSE2 : movd (r32)
*/
#if 0 && defined(__XOP__)
// the __XOP__ branch must still be debugged and tested
#include <x86intrin.h>
#include <ammintrin.h>
#define LOAD_ROTATE_CONSTS
#define MM_ROR_EPI32(r, c) _mm_roti_epi32(r, -(c))
#define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
#elif 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
#define LOAD_ROTATE_CONSTS
#define MM_ROR_EPI32(r, c) _mm_ror_epi32(r, c)
#define Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
#else
// MSVC_1937+ uses the "orps" instruction for _mm_or_si128().
// But "orps" has low throughput: TP=1 for bdw-nhm.
// So it can be better to use _mm_add_epi32()/"paddd" (TP=2 for bdw-nhm) instead of "orps".
// But "orps" is fast for modern cpus (skl+).
// So we default to the "or" version:
#if 0 || 0 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1937
// minor optimization for some old cpus, if "orps" is slow.
#define MM128_EPI32_OR_or_ADD _mm_add_epi32
#else
#define MM128_EPI32_OR_or_ADD _mm_or_si128
#endif
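// rotate each 32-bit lane right by (c): (r >> c) | (r << (32 - c)).
// OR can be replaced by ADD here, because the two shifted parts never overlap.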
#define MM_ROR_EPI32_VIA_SHIFT(r, c)( \
MM128_EPI32_OR_or_ADD( \
_mm_srli_epi32((r), (c)), \
_mm_slli_epi32((r), 32-(c))))
#if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
#define LOAD_ROTATE_CONSTS \
const __m128i r8 = k_r8; \
const __m128i r16 = k_r16;
#define MM_ROR_EPI32(r, c) ( \
( 8==(c)) ? _mm_shuffle_epi8(r,r8) \
: (16==(c)) ? _mm_shuffle_epi8(r,r16) \
: MM_ROR_EPI32_VIA_SHIFT(r, c))
#else
#define LOAD_ROTATE_CONSTS
#define MM_ROR_EPI32(r, c) ( \
(16==(c)) ? _mm_shufflehi_epi16(_mm_shufflelo_epi16(r, 0xb1), 0xb1) \
: MM_ROR_EPI32_VIA_SHIFT(r, c))
#endif
#endif
/*
we have 3 main ways to load 4 32-bit integers to __m128i:
1) SSE2: _mm_set_epi32()
2) SSE2: _mm_unpacklo_epi64() / _mm_unpacklo_epi32 / _mm_cvtsi32_si128()
3) SSE41: _mm_insert_epi32() and _mm_cvtsi32_si128()
a good compiler generates these instructions for _mm_set_epi32():
{
movd xmm, [m32]; vpunpckldq; vpunpckldq; vpunpcklqdq;
}
a good new compiler generates one instruction:
{
for _mm_insert_epi32() : { pinsrd xmm, [m32], i }
for _mm_cvtsi32_si128() : { movd xmm, [m32] }
}
but vc2010 generates a slow pair of instructions:
{
for _mm_insert_epi32() : { mov r32, [m32]; pinsrd xmm, r32, i }
for _mm_cvtsi32_si128() : { mov r32, [m32]; movd xmm, r32 }
}
_mm_insert_epi32() (pinsrd) code reduces xmm register pressure
in comparison with _mm_set_epi32() (movd + vpunpckld) code.
Note that the variant with "movd xmm, r32" can be slower,
but register pressure can be more important.
So we can force "pinsrd" always.
*/
// #if !defined(Z7_MSC_VER_ORIGINAL) || Z7_MSC_VER_ORIGINAL > 1600 || defined(MY_CPU_X86)
#ifdef Z7_BLAKE2S_USE_SSE41
/* _mm_set_epi32() can be more effective for GCC and CLANG
_mm_insert_epi32() is more effective for MSVC */
#if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
#define Z7_BLAKE2S_USE_INSERT_INSTRUCTION
#endif
#endif // USE_SSE41
// #endif
#ifdef Z7_BLAKE2S_USE_INSERT_INSTRUCTION
// for SSE4.1
#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
_mm_insert_epi32( \
_mm_insert_epi32( \
_mm_insert_epi32( \
_mm_cvtsi32_si128( \
*(const Int32 *)p0), \
*(const Int32 *)p1, 1), \
*(const Int32 *)p2, 2), \
*(const Int32 *)p3, 3)
#elif 0 || 1 && defined(Z7_MSC_VER_ORIGINAL)
/* MSVC 1400 implements _mm_set_epi32() via slow memory write/read.
Also _mm_unpacklo_epi32 is more effective for other MSVC compilers.
But _mm_set_epi32() is more effective for GCC and CLANG.
So we use _mm_unpacklo_epi32 for MSVC only */
#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
_mm_unpacklo_epi64( \
_mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p0), \
_mm_cvtsi32_si128(*(const Int32 *)p1)), \
_mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const Int32 *)p2), \
_mm_cvtsi32_si128(*(const Int32 *)p3)))
#else
#define MM_LOAD_EPI32_FROM_4_POINTERS(p0, p1, p2, p3) \
_mm_set_epi32( \
*(const Int32 *)p3, \
*(const Int32 *)p2, \
*(const Int32 *)p1, \
*(const Int32 *)p0)
#endif
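// SET_ROW_FROM_SIGMA gathers 4 message words, selected by the sigma permutation,
// from non-contiguous offsets of the input block into one xmm register.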
#define SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3) \
MM_LOAD_EPI32_FROM_4_POINTERS( \
GET_SIGMA_PTR(input, i0), \
GET_SIGMA_PTR(input, i1), \
GET_SIGMA_PTR(input, i2), \
GET_SIGMA_PTR(input, i3))
#define SET_ROW_FROM_SIGMA(input, sigma_index) \
SET_ROW_FROM_SIGMA_BASE(input, \
sigma[(sigma_index) ], \
sigma[(sigma_index) + 2 * 1], \
sigma[(sigma_index) + 2 * 2], \
sigma[(sigma_index) + 2 * 3]) \
#define ADD_128(a, b) _mm_add_epi32(a, b)
#define XOR_128(a, b) _mm_xor_si128(a, b)
#define D_ADD_128(dest, src) dest = ADD_128(dest, src)
#define D_XOR_128(dest, src) dest = XOR_128(dest, src)
#define D_ROR_128(dest, shift) dest = MM_ROR_EPI32(dest, shift)
#define D_ADD_EPI64_128(dest, src) dest = _mm_add_epi64(dest, src)
#define AXR(a, b, d, shift) \
D_ADD_128(a, b); \
D_XOR_128(d, a); \
D_ROR_128(d, shift);
#define AXR2(a, b, c, d, input, sigma_index, shift1, shift2) \
a = _mm_add_epi32 (a, SET_ROW_FROM_SIGMA(input, sigma_index)); \
AXR(a, b, d, shift1) \
AXR(c, d, b, shift2)
#define ROTATE_WORDS_TO_RIGHT(a, n) \
a = _mm_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
#define AXR4(a, b, c, d, input, sigma_index) \
AXR2(a, b, c, d, input, sigma_index, 16, 12) \
AXR2(a, b, c, d, input, sigma_index + 1, 8, 7) \
#define RR2(a, b, c, d, input) \
{ \
AXR4(a, b, c, d, input, 0) \
ROTATE_WORDS_TO_RIGHT(b, 1) \
ROTATE_WORDS_TO_RIGHT(c, 2) \
ROTATE_WORDS_TO_RIGHT(d, 3) \
AXR4(a, b, c, d, input, 8) \
ROTATE_WORDS_TO_RIGHT(b, 3) \
ROTATE_WORDS_TO_RIGHT(c, 2) \
ROTATE_WORDS_TO_RIGHT(d, 1) \
}
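// RR2 is one full BLAKE2s round for a single block held column-wise in 4 xmm registers:
// column G step (AXR4), rotate rows b/c/d into diagonal position, diagonal G step, rotate back.
// the scalar G function being vectorized here is, for reference:
//   a += b + m[x]; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12);
//   a += b + m[y]; d = ror(d ^ a,  8); c += d; b = ror(b ^ c,  7);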
/*
Way1:
per 64-byte block:
10 rounds * 4 iters * (7 + 2) = 360 cycles, if pslld TP=1
                    * (7 + 1) = 320 cycles, if pslld TP=2 (skl+)
additional operations per 7_op_iter :
4 movzx byte mem
1 movd mem
3 pinsrd mem
1.5 pshufd
*/
static
#if 0 || 0 && (defined(Z7_BLAKE2S_USE_V128_WAY2) || \
defined(Z7_BLAKE2S_USE_V256_WAY2))
Z7_NO_INLINE
#else
Z7_FORCE_INLINE
#endif
#ifdef BLAKE2S_ATTRIB_128BIT
BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2s_Compress_V128_Way1(UInt32 * const s, const Byte * const input)
{
__m128i a, b, c, d;
__m128i f0, f1;
LOAD_ROTATE_CONSTS
d = LOAD_128_FROM_STRUCT(STATE_T(s));
c = k_iv0_128;
a = f0 = LOAD_128_FROM_STRUCT(s);
b = f1 = LOAD_128_FROM_STRUCT(s + 4);
D_ADD_EPI64_128(d, k_inc);
STORE_128_TO_STRUCT (STATE_T(s), d);
D_XOR_128(d, k_iv4_128);
#define RR(r) { const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
RR2(a, b, c, d, input) }
ROUNDS_LOOP(RR)
#undef RR
STORE_128_TO_STRUCT(s , XOR_128(f0, XOR_128(a, c)));
STORE_128_TO_STRUCT(s + 4, XOR_128(f1, XOR_128(b, d)));
}
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_V128_Way1(UInt32 *s_items, const Byte *data, const Byte *end)
{
size_t pos = 0;
do
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
Blake2s_Compress_V128_Way1(s, data);
data += Z7_BLAKE2S_BLOCK_SIZE;
pos += Z7_BLAKE2S_BLOCK_SIZE;
pos &= SUPER_BLOCK_MASK;
}
while (data != end);
}
#if defined(Z7_BLAKE2S_USE_V128_WAY2) || \
defined(Z7_BLAKE2S_USE_AVX2_WAY2)
#if 1
#define Z7_BLAKE2S_CompressSingleBlock(s, data) \
Blake2sp_Compress2_V128_Way1(s, data, \
(const Byte *)(const void *)(data) + Z7_BLAKE2S_BLOCK_SIZE)
#else
#define Z7_BLAKE2S_CompressSingleBlock Blake2s_Compress_V128_Way1
#endif
#endif
#if (defined(Z7_BLAKE2S_USE_AVX2_WAY_SLOW) || \
defined(Z7_BLAKE2S_USE_V128_WAY2)) && \
!defined(Z7_BLAKE2S_USE_GATHER)
#define AXR2_LOAD_INDEXES(sigma_index) \
const unsigned i0 = sigma[(sigma_index)]; \
const unsigned i1 = sigma[(sigma_index) + 2 * 1]; \
const unsigned i2 = sigma[(sigma_index) + 2 * 2]; \
const unsigned i3 = sigma[(sigma_index) + 2 * 3]; \
#define SET_ROW_FROM_SIGMA_W(input) \
SET_ROW_FROM_SIGMA_BASE(input, i0, i1, i2, i3)
#endif
#ifdef Z7_BLAKE2S_USE_V128_WAY2
#if 1 || !defined(Z7_BLAKE2S_USE_SSE41)
/* we use SET_ROW_FROM_SIGMA_BASE, which uses
(SSE4) _mm_insert_epi32(), if Z7_BLAKE2S_USE_INSERT_INSTRUCTION is defined,
(SSE2) _mm_set_epi32() otherwise.
MSVC can be faster for this branch:
*/
#define AXR2_W(sigma_index, shift1, shift2) \
{ \
AXR2_LOAD_INDEXES(sigma_index) \
a0 = _mm_add_epi32(a0, SET_ROW_FROM_SIGMA_W(data)); \
a1 = _mm_add_epi32(a1, SET_ROW_FROM_SIGMA_W(data + Z7_BLAKE2S_BLOCK_SIZE)); \
AXR(a0, b0, d0, shift1) \
AXR(a1, b1, d1, shift1) \
AXR(c0, d0, b0, shift2) \
AXR(c1, d1, b1, shift2) \
}
#else
/* we use interleaved _mm_insert_epi32():
GCC can be faster for this branch:
*/
#define AXR2_W_PRE_INSERT(sigma_index, i) \
{ const unsigned ii = sigma[(sigma_index) + i * 2]; \
t0 = _mm_insert_epi32(t0, *(const Int32 *)GET_SIGMA_PTR(data, ii), i); \
t1 = _mm_insert_epi32(t1, *(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii), i); \
}
#define AXR2_W(sigma_index, shift1, shift2) \
{ __m128i t0, t1; \
{ const unsigned ii = sigma[sigma_index]; \
t0 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, ii)); \
t1 = _mm_cvtsi32_si128(*(const Int32 *)GET_SIGMA_PTR(data, Z7_BLAKE2S_BLOCK_SIZE + ii)); \
} \
AXR2_W_PRE_INSERT(sigma_index, 1) \
AXR2_W_PRE_INSERT(sigma_index, 2) \
AXR2_W_PRE_INSERT(sigma_index, 3) \
a0 = _mm_add_epi32(a0, t0); \
a1 = _mm_add_epi32(a1, t1); \
AXR(a0, b0, d0, shift1) \
AXR(a1, b1, d1, shift1) \
AXR(c0, d0, b0, shift2) \
AXR(c1, d1, b1, shift2) \
}
#endif
#define AXR4_W(sigma_index) \
AXR2_W(sigma_index, 16, 12) \
AXR2_W(sigma_index + 1, 8, 7) \
#define WW(r) \
{ const Byte * const sigma = k_Blake2s_Sigma_4[r]; \
AXR4_W(0) \
ROTATE_WORDS_TO_RIGHT(b0, 1) \
ROTATE_WORDS_TO_RIGHT(b1, 1) \
ROTATE_WORDS_TO_RIGHT(c0, 2) \
ROTATE_WORDS_TO_RIGHT(c1, 2) \
ROTATE_WORDS_TO_RIGHT(d0, 3) \
ROTATE_WORDS_TO_RIGHT(d1, 3) \
AXR4_W(8) \
ROTATE_WORDS_TO_RIGHT(b0, 3) \
ROTATE_WORDS_TO_RIGHT(b1, 3) \
ROTATE_WORDS_TO_RIGHT(c0, 2) \
ROTATE_WORDS_TO_RIGHT(c1, 2) \
ROTATE_WORDS_TO_RIGHT(d0, 1) \
ROTATE_WORDS_TO_RIGHT(d1, 1) \
}
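// Way2 interleaves two independent lanes (two 64-byte blocks) per iteration,
// so the dependency chains of the two G sequences can overlap.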
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_V128_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
{
size_t pos = 0;
end -= Z7_BLAKE2S_BLOCK_SIZE;
if (data != end)
{
LOAD_ROTATE_CONSTS
do
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
__m128i a0, b0, c0, d0;
__m128i a1, b1, c1, d1;
{
const __m128i inc = k_inc;
const __m128i temp = k_iv4_128;
d0 = LOAD_128_FROM_STRUCT (STATE_T(s));
d1 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
D_ADD_EPI64_128(d0, inc);
D_ADD_EPI64_128(d1, inc);
STORE_128_TO_STRUCT (STATE_T(s ), d0);
STORE_128_TO_STRUCT (STATE_T(s + NSW), d1);
D_XOR_128(d0, temp);
D_XOR_128(d1, temp);
}
c1 = c0 = k_iv0_128;
a0 = LOAD_128_FROM_STRUCT(s);
b0 = LOAD_128_FROM_STRUCT(s + 4);
a1 = LOAD_128_FROM_STRUCT(s + NSW);
b1 = LOAD_128_FROM_STRUCT(s + NSW + 4);
ROUNDS_LOOP (WW)
#undef WW
D_XOR_128(a0, c0);
D_XOR_128(b0, d0);
D_XOR_128(a1, c1);
D_XOR_128(b1, d1);
D_XOR_128(a0, LOAD_128_FROM_STRUCT(s));
D_XOR_128(b0, LOAD_128_FROM_STRUCT(s + 4));
D_XOR_128(a1, LOAD_128_FROM_STRUCT(s + NSW));
D_XOR_128(b1, LOAD_128_FROM_STRUCT(s + NSW + 4));
STORE_128_TO_STRUCT(s, a0);
STORE_128_TO_STRUCT(s + 4, b0);
STORE_128_TO_STRUCT(s + NSW, a1);
STORE_128_TO_STRUCT(s + NSW + 4, b1);
data += Z7_BLAKE2S_BLOCK_SIZE * 2;
pos += Z7_BLAKE2S_BLOCK_SIZE * 2;
pos &= SUPER_BLOCK_MASK;
}
while (data < end);
if (data != end)
return;
}
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
Z7_BLAKE2S_CompressSingleBlock(s, data);
}
}
#endif // Z7_BLAKE2S_USE_V128_WAY2
#ifdef Z7_BLAKE2S_USE_V128_WAY2
#define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way2
#else
#define Z7_BLAKE2S_Compress2_V128 Blake2sp_Compress2_V128_Way1
#endif
#ifdef Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED
#define ROT_128_8(x) MM_ROR_EPI32(x, 8)
#define ROT_128_16(x) MM_ROR_EPI32(x, 16)
#define ROT_128_7(x) MM_ROR_EPI32(x, 7)
#define ROT_128_12(x) MM_ROR_EPI32(x, 12)
#else
#if defined(Z7_BLAKE2S_USE_SSSE3) || defined(Z7_BLAKE2S_USE_SSE41)
#define ROT_128_8(x) _mm_shuffle_epi8(x, r8) // k_r8
#define ROT_128_16(x) _mm_shuffle_epi8(x, r16) // k_r16
#else
#define ROT_128_8(x) MM_ROR_EPI32_VIA_SHIFT(x, 8)
#define ROT_128_16(x) MM_ROR_EPI32_VIA_SHIFT(x, 16)
#endif
#define ROT_128_7(x) MM_ROR_EPI32_VIA_SHIFT(x, 7)
#define ROT_128_12(x) MM_ROR_EPI32_VIA_SHIFT(x, 12)
#endif
#if 1
// this branch can provide similar speed on x86* in most cases,
// because [base + index*4] provides the same speed as [base + index].
// but some compilers can generate different code with this branch, which can be faster sometimes.
// this branch uses an additional table of 10*16 = 160 bytes.
#define SIGMA_TABLE_MULT_16( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
SIGMA_TABLE_MULT(16, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
MY_ALIGN(16)
static const Byte k_Blake2s_Sigma_16[BLAKE2S_NUM_ROUNDS][16] =
{ SIGMA_TABLE(SIGMA_TABLE_MULT_16) };
#define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_16[r];
#define GET_SIGMA_VAL_128(n) (sigma[n])
#else
#define GET_SIGMA_PTR_128(r) const Byte * const sigma = k_Blake2s_Sigma_4[r];
#define GET_SIGMA_VAL_128(n) (4 * (size_t)sigma[n])
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_FAST
#if 1
#define SIGMA_TABLE_MULT_32( a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
SIGMA_TABLE_MULT(32, a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15)
MY_ALIGN(64)
static const UInt16 k_Blake2s_Sigma_32[BLAKE2S_NUM_ROUNDS][16] =
{ SIGMA_TABLE(SIGMA_TABLE_MULT_32) };
#define GET_SIGMA_PTR_256(r) const UInt16 * const sigma = k_Blake2s_Sigma_32[r];
#define GET_SIGMA_VAL_256(n) (sigma[n])
#else
#define GET_SIGMA_PTR_256(r) const Byte * const sigma = k_Blake2s_Sigma_4[r];
#define GET_SIGMA_VAL_256(n) (8 * (size_t)sigma[n])
#endif
#endif // Z7_BLAKE2S_USE_AVX2_FAST
#define D_ROT_128_7(dest) dest = ROT_128_7(dest)
#define D_ROT_128_8(dest) dest = ROT_128_8(dest)
#define D_ROT_128_12(dest) dest = ROT_128_12(dest)
#define D_ROT_128_16(dest) dest = ROT_128_16(dest)
#define OP_L(a, i) D_ADD_128 (V(a, 0), \
LOAD_128((const Byte *)(w) + GET_SIGMA_VAL_128(2*(a)+(i))));
#define OP_0(a) OP_L(a, 0)
#define OP_7(a) OP_L(a, 1)
#define OP_1(a) D_ADD_128 (V(a, 0), V(a, 1));
#define OP_2(a) D_XOR_128 (V(a, 3), V(a, 0));
#define OP_4(a) D_ADD_128 (V(a, 2), V(a, 3));
#define OP_5(a) D_XOR_128 (V(a, 1), V(a, 2));
#define OP_3(a) D_ROT_128_16 (V(a, 3));
#define OP_6(a) D_ROT_128_12 (V(a, 1));
#define OP_8(a) D_ROT_128_8 (V(a, 3));
#define OP_9(a) D_ROT_128_7 (V(a, 1));
// for 32-bit x86 : interleave mode works slower, because of register pressure.
#if 0 || 1 && (defined(MY_CPU_X86) \
|| defined(__GNUC__) && !defined(__clang__))
// non-interleaved version:
// is fast for x86 32-bit.
// is fast for GCC x86-64.
#define V4G(a) \
OP_0 (a) \
OP_1 (a) \
OP_2 (a) \
OP_3 (a) \
OP_4 (a) \
OP_5 (a) \
OP_6 (a) \
OP_7 (a) \
OP_1 (a) \
OP_2 (a) \
OP_8 (a) \
OP_4 (a) \
OP_5 (a) \
OP_9 (a) \
#define V4R \
{ \
V4G (0) \
V4G (1) \
V4G (2) \
V4G (3) \
V4G (4) \
V4G (5) \
V4G (6) \
V4G (7) \
}
#elif 0 || 1 && defined(MY_CPU_X86)
#define OP_INTER_2(op, a,b) \
op (a) \
op (b) \
#define V4G(a,b) \
OP_INTER_2 (OP_0, a,b) \
OP_INTER_2 (OP_1, a,b) \
OP_INTER_2 (OP_2, a,b) \
OP_INTER_2 (OP_3, a,b) \
OP_INTER_2 (OP_4, a,b) \
OP_INTER_2 (OP_5, a,b) \
OP_INTER_2 (OP_6, a,b) \
OP_INTER_2 (OP_7, a,b) \
OP_INTER_2 (OP_1, a,b) \
OP_INTER_2 (OP_2, a,b) \
OP_INTER_2 (OP_8, a,b) \
OP_INTER_2 (OP_4, a,b) \
OP_INTER_2 (OP_5, a,b) \
OP_INTER_2 (OP_9, a,b) \
#define V4R \
{ \
V4G (0, 1) \
V4G (2, 3) \
V4G (4, 5) \
V4G (6, 7) \
}
#else
// interleave-4 version is fast for x64 (MSVC/CLANG)
#define OP_INTER_4(op, a,b,c,d) \
op (a) \
op (b) \
op (c) \
op (d) \
#define V4G(a,b,c,d) \
OP_INTER_4 (OP_0, a,b,c,d) \
OP_INTER_4 (OP_1, a,b,c,d) \
OP_INTER_4 (OP_2, a,b,c,d) \
OP_INTER_4 (OP_3, a,b,c,d) \
OP_INTER_4 (OP_4, a,b,c,d) \
OP_INTER_4 (OP_5, a,b,c,d) \
OP_INTER_4 (OP_6, a,b,c,d) \
OP_INTER_4 (OP_7, a,b,c,d) \
OP_INTER_4 (OP_1, a,b,c,d) \
OP_INTER_4 (OP_2, a,b,c,d) \
OP_INTER_4 (OP_8, a,b,c,d) \
OP_INTER_4 (OP_4, a,b,c,d) \
OP_INTER_4 (OP_5, a,b,c,d) \
OP_INTER_4 (OP_9, a,b,c,d) \
#define V4R \
{ \
V4G (0, 1, 2, 3) \
V4G (4, 5, 6, 7) \
}
#endif
#define V4_ROUND(r) { GET_SIGMA_PTR_128(r); V4R }
#define V4_LOAD_MSG_1(w, m, i) \
{ \
__m128i m0, m1, m2, m3; \
__m128i t0, t1, t2, t3; \
m0 = LOADU_128((m) + ((i) + 0 * 4) * 16); \
m1 = LOADU_128((m) + ((i) + 1 * 4) * 16); \
m2 = LOADU_128((m) + ((i) + 2 * 4) * 16); \
m3 = LOADU_128((m) + ((i) + 3 * 4) * 16); \
t0 = _mm_unpacklo_epi32(m0, m1); \
t1 = _mm_unpackhi_epi32(m0, m1); \
t2 = _mm_unpacklo_epi32(m2, m3); \
t3 = _mm_unpackhi_epi32(m2, m3); \
w[(i) * 4 + 0] = _mm_unpacklo_epi64(t0, t2); \
w[(i) * 4 + 1] = _mm_unpackhi_epi64(t0, t2); \
w[(i) * 4 + 2] = _mm_unpacklo_epi64(t1, t3); \
w[(i) * 4 + 3] = _mm_unpackhi_epi64(t1, t3); \
}
#define V4_LOAD_MSG(w, m) \
{ \
V4_LOAD_MSG_1 (w, m, 0) \
V4_LOAD_MSG_1 (w, m, 1) \
V4_LOAD_MSG_1 (w, m, 2) \
V4_LOAD_MSG_1 (w, m, 3) \
}
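// V4_LOAD_MSG transposes 4 consecutive 64-byte blocks (one per lane) into w[0..15]:
// after the 4x4 unpack steps, w[i] holds message word (i) from each of the 4 blocks.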
#define V4_LOAD_UNPACK_PAIR_128(src32, i, d0, d1) \
{ \
const __m128i v0 = LOAD_128_FROM_STRUCT((src32) + (i ) * 4); \
const __m128i v1 = LOAD_128_FROM_STRUCT((src32) + (i + 1) * 4); \
d0 = _mm_unpacklo_epi32(v0, v1); \
d1 = _mm_unpackhi_epi32(v0, v1); \
}
#define V4_UNPACK_PAIR_128(dest32, i, s0, s1) \
{ \
STORE_128_TO_STRUCT((dest32) + i * 4 , _mm_unpacklo_epi64(s0, s1)); \
STORE_128_TO_STRUCT((dest32) + i * 4 + 16, _mm_unpackhi_epi64(s0, s1)); \
}
#define V4_UNPACK_STATE(dest32, src32) \
{ \
__m128i t0, t1, t2, t3, t4, t5, t6, t7; \
V4_LOAD_UNPACK_PAIR_128(src32, 0, t0, t1) \
V4_LOAD_UNPACK_PAIR_128(src32, 2, t2, t3) \
V4_LOAD_UNPACK_PAIR_128(src32, 4, t4, t5) \
V4_LOAD_UNPACK_PAIR_128(src32, 6, t6, t7) \
V4_UNPACK_PAIR_128(dest32, 0, t0, t2) \
V4_UNPACK_PAIR_128(dest32, 8, t1, t3) \
V4_UNPACK_PAIR_128(dest32, 1, t4, t6) \
V4_UNPACK_PAIR_128(dest32, 9, t5, t7) \
}
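// the V128_Fast code keeps the leaf states word-interleaved (transposed), so each xmm
// register holds one state word from 4 lanes; one iteration below consumes
// 4 * 64 = 256 bytes and updates 4 lanes at once.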
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_V128_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
{
// PrintStates2(s_items, 8, 16);
size_t pos = 0;
pos /= 2;
do
{
#if defined(Z7_BLAKE2S_USE_SSSE3) && \
!defined(Z7_BLAKE2S_MM_ROR_EPI32_IS_SUPPORTED)
const __m128i r8 = k_r8;
const __m128i r16 = k_r16;
#endif
__m128i w[16];
__m128i v[16];
UInt32 *s;
V4_LOAD_MSG(w, data)
s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
{
__m128i ctr = LOAD_128_FROM_STRUCT(s + 64);
D_ADD_EPI64_128 (ctr, k_inc);
STORE_128_TO_STRUCT(s + 64, ctr);
v[12] = XOR_128 (GET_128_IV_WAY4(4), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
v[13] = XOR_128 (GET_128_IV_WAY4(5), _mm_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
}
v[ 8] = GET_128_IV_WAY4(0);
v[ 9] = GET_128_IV_WAY4(1);
v[10] = GET_128_IV_WAY4(2);
v[11] = GET_128_IV_WAY4(3);
v[14] = GET_128_IV_WAY4(6);
v[15] = GET_128_IV_WAY4(7);
#define LOAD_STATE_128_FROM_STRUCT(i) \
v[i] = LOAD_128_FROM_STRUCT(s + (i) * 4);
#define UPDATE_STATE_128_IN_STRUCT(i) \
STORE_128_TO_STRUCT(s + (i) * 4, XOR_128( \
XOR_128(v[i], v[(i) + 8]), \
LOAD_128_FROM_STRUCT(s + (i) * 4)));
REP8_MACRO (LOAD_STATE_128_FROM_STRUCT)
ROUNDS_LOOP (V4_ROUND)
REP8_MACRO (UPDATE_STATE_128_IN_STRUCT)
data += Z7_BLAKE2S_BLOCK_SIZE * 4;
pos += Z7_BLAKE2S_BLOCK_SIZE * 4 / 2;
pos &= SUPER_BLOCK_SIZE / 2 - 1;
}
while (data != end);
}
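// converts the interleaved state layout of the FAST code back to the normal per-lane
// layout and writes the shared byte counter into each lane, so that the generic
// finalization code can be used afterwards.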
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_Final_V128_Fast(UInt32 *states)
{
const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
// printf("\nBlake2sp_Compress2_V128_Fast_Final4\n");
// PrintStates2(states, 8, 16);
{
ptrdiff_t pos = 8 * 4;
do
{
UInt32 *src32 = states + (size_t)(pos * 1);
UInt32 *dest32 = states + (size_t)(pos * 2);
V4_UNPACK_STATE(dest32, src32)
pos -= 8 * 4;
}
while (pos >= 0);
}
{
unsigned k;
for (k = 0; k < 8; k++)
{
UInt32 *s = states + (size_t)k * 16;
STORE_128_TO_STRUCT (STATE_T(s), ctr);
}
}
// PrintStates2(states, 8, 16);
}
#ifdef Z7_BLAKE2S_USE_AVX2
#define ADD_256(a, b) _mm256_add_epi32(a, b)
#define XOR_256(a, b) _mm256_xor_si256(a, b)
#if 1 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
#define MM256_ROR_EPI32 _mm256_ror_epi32
#define Z7_MM256_ROR_EPI32_IS_SUPPORTED
#define LOAD_ROTATE_CONSTS_256
#else
#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
#define LOAD_ROTATE_CONSTS_256 \
const __m256i r8 = k_r8_256; \
const __m256i r16 = k_r16_256;
#endif // AVX2_WAY2
#define MM256_ROR_EPI32(r, c) ( \
( 8==(c)) ? _mm256_shuffle_epi8(r,r8) \
: (16==(c)) ? _mm256_shuffle_epi8(r,r16) \
: _mm256_or_si256( \
_mm256_srli_epi32((r), (c)), \
_mm256_slli_epi32((r), 32-(c))))
#endif // WAY_SLOW
#endif
#define D_ADD_256(dest, src) dest = ADD_256(dest, src)
#define D_XOR_256(dest, src) dest = XOR_256(dest, src)
#define LOADU_256(p) _mm256_loadu_si256((const __m256i *)(const void *)(p))
#ifdef Z7_BLAKE2S_USE_AVX2_FAST
#ifdef Z7_MM256_ROR_EPI32_IS_SUPPORTED
#define ROT_256_16(x) MM256_ROR_EPI32((x), 16)
#define ROT_256_12(x) MM256_ROR_EPI32((x), 12)
#define ROT_256_8(x) MM256_ROR_EPI32((x), 8)
#define ROT_256_7(x) MM256_ROR_EPI32((x), 7)
#else
#define ROTATE8 _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, \
12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)
#define ROTATE16 _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, \
13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)
#define ROT_256_16(x) _mm256_shuffle_epi8((x), ROTATE16)
#define ROT_256_12(x) _mm256_or_si256(_mm256_srli_epi32((x), 12), _mm256_slli_epi32((x), 20))
#define ROT_256_8(x) _mm256_shuffle_epi8((x), ROTATE8)
#define ROT_256_7(x) _mm256_or_si256(_mm256_srli_epi32((x), 7), _mm256_slli_epi32((x), 25))
#endif
#define D_ROT_256_7(dest) dest = ROT_256_7(dest)
#define D_ROT_256_8(dest) dest = ROT_256_8(dest)
#define D_ROT_256_12(dest) dest = ROT_256_12(dest)
#define D_ROT_256_16(dest) dest = ROT_256_16(dest)
#define LOAD_256(p) _mm256_load_si256((const __m256i *)(const void *)(p))
#ifdef Z7_BLAKE2SP_STRUCT_IS_NOT_ALIGNED
#define STOREU_256(p, r) _mm256_storeu_si256((__m256i *)(void *)(p), r)
#define LOAD_256_FROM_STRUCT(p) LOADU_256(p)
#define STORE_256_TO_STRUCT(p, r) STOREU_256(p, r)
#else
// if the struct is aligned for 32 bytes
#define STORE_256(p, r) _mm256_store_si256((__m256i *)(void *)(p), r)
#define LOAD_256_FROM_STRUCT(p) LOAD_256(p)
#define STORE_256_TO_STRUCT(p, r) STORE_256(p, r)
#endif
#endif // Z7_BLAKE2S_USE_AVX2_FAST
#ifdef Z7_BLAKE2S_USE_AVX2_WAY_SLOW
#if 0
#define DIAG_PERM2(s) \
{ \
const __m256i a = LOAD_256_FROM_STRUCT((s) ); \
const __m256i b = LOAD_256_FROM_STRUCT((s) + NSW); \
STORE_256_TO_STRUCT((s ), _mm256_permute2x128_si256(a, b, 0x20)); \
STORE_256_TO_STRUCT((s + NSW), _mm256_permute2x128_si256(a, b, 0x31)); \
}
#else
#define DIAG_PERM2(s) \
{ \
const __m128i a = LOAD_128_FROM_STRUCT((s) + 4); \
const __m128i b = LOAD_128_FROM_STRUCT((s) + NSW); \
STORE_128_TO_STRUCT((s) + NSW, a); \
STORE_128_TO_STRUCT((s) + 4 , b); \
}
#endif
#define DIAG_PERM8(s_items) \
{ \
DIAG_PERM2(s_items) \
DIAG_PERM2(s_items + NSW * 2) \
DIAG_PERM2(s_items + NSW * 4) \
DIAG_PERM2(s_items + NSW * 6) \
}
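// DIAG_PERM2 swaps rows 4-7 of lane (i) with rows 0-3 of lane (i+1), so that one ymm
// register can load rows 0-3 (or 4-7) of two adjacent lanes at once.
// DIAG_PERM8 applies this to all 8 lanes; calling it a second time undoes the permutation.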
#define AXR256(a, b, d, shift) \
D_ADD_256(a, b); \
D_XOR_256(d, a); \
d = MM256_ROR_EPI32(d, shift); \
#ifdef Z7_BLAKE2S_USE_GATHER
#define TABLE_GATHER_256_4(a0,a1,a2,a3) \
a0,a1,a2,a3, a0+16,a1+16,a2+16,a3+16
#define TABLE_GATHER_256( \
a0,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15) \
{ TABLE_GATHER_256_4(a0,a2,a4,a6), \
TABLE_GATHER_256_4(a1,a3,a5,a7), \
TABLE_GATHER_256_4(a8,a10,a12,a14), \
TABLE_GATHER_256_4(a9,a11,a13,a15) }
MY_ALIGN(64)
static const UInt32 k_Blake2s_Sigma_gather256[BLAKE2S_NUM_ROUNDS][16 * 2] =
{ SIGMA_TABLE(TABLE_GATHER_256) };
#define GET_SIGMA(r) \
const UInt32 * const sigma = k_Blake2s_Sigma_gather256[r];
#define AXR2_LOAD_INDEXES_AVX(sigma_index) \
const __m256i i01234567 = LOAD_256(sigma + (sigma_index));
#define SET_ROW_FROM_SIGMA_AVX(in) \
_mm256_i32gather_epi32((const void *)(in), i01234567, 4)
#define SIGMA_INTERLEAVE 8
#define SIGMA_HALF_ROW_SIZE 16
#else // !Z7_BLAKE2S_USE_GATHER
#define GET_SIGMA(r) \
const Byte * const sigma = k_Blake2s_Sigma_4[r];
#define AXR2_LOAD_INDEXES_AVX(sigma_index) \
AXR2_LOAD_INDEXES(sigma_index)
#define SET_ROW_FROM_SIGMA_AVX(in) \
MY_mm256_set_m128i( \
SET_ROW_FROM_SIGMA_W((in) + Z7_BLAKE2S_BLOCK_SIZE), \
SET_ROW_FROM_SIGMA_W(in))
#define SIGMA_INTERLEAVE 1
#define SIGMA_HALF_ROW_SIZE 8
#endif // !Z7_BLAKE2S_USE_GATHER
#define ROTATE_WORDS_TO_RIGHT_256(a, n) \
a = _mm256_shuffle_epi32(a, _MM_SHUFFLE((3+n)&3, (2+n)&3, (1+n)&3, (0+n)&3));
#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
#define AXR2_A(sigma_index, shift1, shift2) \
AXR2_LOAD_INDEXES_AVX(sigma_index) \
D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
AXR256(a0, b0, d0, shift1) \
AXR256(c0, d0, b0, shift2) \
#define AXR4_A(sigma_index) \
{ AXR2_A(sigma_index, 16, 12) } \
{ AXR2_A(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
#define EE1(r) \
{ GET_SIGMA(r) \
AXR4_A(0) \
ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
AXR4_A(SIGMA_HALF_ROW_SIZE) \
ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
}
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_AVX2
BLAKE2S_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_AVX2_Way2(UInt32 *s_items, const Byte *data, const Byte *end)
{
size_t pos = 0;
end -= Z7_BLAKE2S_BLOCK_SIZE;
if (data != end)
{
LOAD_ROTATE_CONSTS_256
DIAG_PERM8(s_items)
do
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
__m256i a0, b0, c0, d0;
{
const __m128i inc = k_inc;
__m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
__m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
D_ADD_EPI64_128(d0_128, inc);
D_ADD_EPI64_128(d1_128, inc);
STORE_128_TO_STRUCT (STATE_T(s ), d0_128);
STORE_128_TO_STRUCT (STATE_T(s + NSW), d1_128);
d0 = MY_mm256_set_m128i(d1_128, d0_128);
D_XOR_256(d0, k_iv4_256);
}
c0 = SET_FROM_128(k_iv0_128);
a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
ROUNDS_LOOP (EE1)
D_XOR_256(a0, c0);
D_XOR_256(b0, d0);
D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
STORE_256_TO_STRUCT(s + NSW * 0, a0);
STORE_256_TO_STRUCT(s + NSW * 1, b0);
data += Z7_BLAKE2S_BLOCK_SIZE * 2;
pos += Z7_BLAKE2S_BLOCK_SIZE * 2;
pos &= SUPER_BLOCK_MASK;
}
while (data < end);
DIAG_PERM8(s_items)
if (data != end)
return;
}
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
Z7_BLAKE2S_CompressSingleBlock(s, data);
}
}
#endif // Z7_BLAKE2S_USE_AVX2_WAY2
#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
#define AXR2_X(sigma_index, shift1, shift2) \
AXR2_LOAD_INDEXES_AVX(sigma_index) \
D_ADD_256( a0, SET_ROW_FROM_SIGMA_AVX(data)); \
D_ADD_256( a1, SET_ROW_FROM_SIGMA_AVX((data) + Z7_BLAKE2S_BLOCK_SIZE * 2)); \
AXR256(a0, b0, d0, shift1) \
AXR256(a1, b1, d1, shift1) \
AXR256(c0, d0, b0, shift2) \
AXR256(c1, d1, b1, shift2) \
#define AXR4_X(sigma_index) \
{ AXR2_X(sigma_index, 16, 12) } \
{ AXR2_X(sigma_index + SIGMA_INTERLEAVE, 8, 7) }
#define EE2(r) \
{ GET_SIGMA(r) \
AXR4_X(0) \
ROTATE_WORDS_TO_RIGHT_256(b0, 1) \
ROTATE_WORDS_TO_RIGHT_256(b1, 1) \
ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
ROTATE_WORDS_TO_RIGHT_256(d0, 3) \
ROTATE_WORDS_TO_RIGHT_256(d1, 3) \
AXR4_X(SIGMA_HALF_ROW_SIZE) \
ROTATE_WORDS_TO_RIGHT_256(b0, 3) \
ROTATE_WORDS_TO_RIGHT_256(b1, 3) \
ROTATE_WORDS_TO_RIGHT_256(c0, 2) \
ROTATE_WORDS_TO_RIGHT_256(c1, 2) \
ROTATE_WORDS_TO_RIGHT_256(d0, 1) \
ROTATE_WORDS_TO_RIGHT_256(d1, 1) \
}
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_AVX2
BLAKE2S_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_AVX2_Way4(UInt32 *s_items, const Byte *data, const Byte *end)
{
size_t pos = 0;
if ((size_t)(end - data) >= Z7_BLAKE2S_BLOCK_SIZE * 4)
{
#ifndef Z7_MM256_ROR_EPI32_IS_SUPPORTED
const __m256i r8 = k_r8_256;
const __m256i r16 = k_r16_256;
#endif
end -= Z7_BLAKE2S_BLOCK_SIZE * 3;
DIAG_PERM8(s_items)
do
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
__m256i a0, b0, c0, d0;
__m256i a1, b1, c1, d1;
{
const __m128i inc = k_inc;
__m128i d0_128 = LOAD_128_FROM_STRUCT (STATE_T(s));
__m128i d1_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW));
__m128i d2_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 2));
__m128i d3_128 = LOAD_128_FROM_STRUCT (STATE_T(s + NSW * 3));
D_ADD_EPI64_128(d0_128, inc);
D_ADD_EPI64_128(d1_128, inc);
D_ADD_EPI64_128(d2_128, inc);
D_ADD_EPI64_128(d3_128, inc);
STORE_128_TO_STRUCT (STATE_T(s ), d0_128);
STORE_128_TO_STRUCT (STATE_T(s + NSW * 1), d1_128);
STORE_128_TO_STRUCT (STATE_T(s + NSW * 2), d2_128);
STORE_128_TO_STRUCT (STATE_T(s + NSW * 3), d3_128);
d0 = MY_mm256_set_m128i(d1_128, d0_128);
d1 = MY_mm256_set_m128i(d3_128, d2_128);
D_XOR_256(d0, k_iv4_256);
D_XOR_256(d1, k_iv4_256);
}
c1 = c0 = SET_FROM_128(k_iv0_128);
a0 = LOAD_256_FROM_STRUCT(s + NSW * 0);
b0 = LOAD_256_FROM_STRUCT(s + NSW * 1);
a1 = LOAD_256_FROM_STRUCT(s + NSW * 2);
b1 = LOAD_256_FROM_STRUCT(s + NSW * 3);
ROUNDS_LOOP (EE2)
D_XOR_256(a0, c0);
D_XOR_256(b0, d0);
D_XOR_256(a1, c1);
D_XOR_256(b1, d1);
D_XOR_256(a0, LOAD_256_FROM_STRUCT(s + NSW * 0));
D_XOR_256(b0, LOAD_256_FROM_STRUCT(s + NSW * 1));
D_XOR_256(a1, LOAD_256_FROM_STRUCT(s + NSW * 2));
D_XOR_256(b1, LOAD_256_FROM_STRUCT(s + NSW * 3));
STORE_256_TO_STRUCT(s + NSW * 0, a0);
STORE_256_TO_STRUCT(s + NSW * 1, b0);
STORE_256_TO_STRUCT(s + NSW * 2, a1);
STORE_256_TO_STRUCT(s + NSW * 3, b1);
data += Z7_BLAKE2S_BLOCK_SIZE * 4;
pos += Z7_BLAKE2S_BLOCK_SIZE * 4;
pos &= SUPER_BLOCK_MASK;
}
while (data < end);
DIAG_PERM8(s_items)
end += Z7_BLAKE2S_BLOCK_SIZE * 3;
}
if (data == end)
return;
// Z7_BLAKE2S_Compress2_V128(s_items, data, end, pos);
do
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
Z7_BLAKE2S_CompressSingleBlock(s, data);
data += Z7_BLAKE2S_BLOCK_SIZE;
pos += Z7_BLAKE2S_BLOCK_SIZE;
pos &= SUPER_BLOCK_MASK;
}
while (data != end);
}
#endif // Z7_BLAKE2S_USE_AVX2_WAY4
#endif // Z7_BLAKE2S_USE_AVX2_WAY_SLOW
// ---------------------------------------------------------
#ifdef Z7_BLAKE2S_USE_AVX2_FAST
#define OP256_L(a, i) D_ADD_256 (V(a, 0), \
LOAD_256((const Byte *)(w) + GET_SIGMA_VAL_256(2*(a)+(i))));
#define OP256_0(a) OP256_L(a, 0)
#define OP256_7(a) OP256_L(a, 1)
#define OP256_1(a) D_ADD_256 (V(a, 0), V(a, 1));
#define OP256_2(a) D_XOR_256 (V(a, 3), V(a, 0));
#define OP256_4(a) D_ADD_256 (V(a, 2), V(a, 3));
#define OP256_5(a) D_XOR_256 (V(a, 1), V(a, 2));
#define OP256_3(a) D_ROT_256_16 (V(a, 3));
#define OP256_6(a) D_ROT_256_12 (V(a, 1));
#define OP256_8(a) D_ROT_256_8 (V(a, 3));
#define OP256_9(a) D_ROT_256_7 (V(a, 1));
#if 0 || 1 && defined(MY_CPU_X86)
#define V8_G(a) \
OP256_0 (a) \
OP256_1 (a) \
OP256_2 (a) \
OP256_3 (a) \
OP256_4 (a) \
OP256_5 (a) \
OP256_6 (a) \
OP256_7 (a) \
OP256_1 (a) \
OP256_2 (a) \
OP256_8 (a) \
OP256_4 (a) \
OP256_5 (a) \
OP256_9 (a) \
#define V8R { \
V8_G (0); \
V8_G (1); \
V8_G (2); \
V8_G (3); \
V8_G (4); \
V8_G (5); \
V8_G (6); \
V8_G (7); \
}
#else
#define OP256_INTER_4(op, a,b,c,d) \
op (a) \
op (b) \
op (c) \
op (d) \
#define V8_G(a,b,c,d) \
OP256_INTER_4 (OP256_0, a,b,c,d) \
OP256_INTER_4 (OP256_1, a,b,c,d) \
OP256_INTER_4 (OP256_2, a,b,c,d) \
OP256_INTER_4 (OP256_3, a,b,c,d) \
OP256_INTER_4 (OP256_4, a,b,c,d) \
OP256_INTER_4 (OP256_5, a,b,c,d) \
OP256_INTER_4 (OP256_6, a,b,c,d) \
OP256_INTER_4 (OP256_7, a,b,c,d) \
OP256_INTER_4 (OP256_1, a,b,c,d) \
OP256_INTER_4 (OP256_2, a,b,c,d) \
OP256_INTER_4 (OP256_8, a,b,c,d) \
OP256_INTER_4 (OP256_4, a,b,c,d) \
OP256_INTER_4 (OP256_5, a,b,c,d) \
OP256_INTER_4 (OP256_9, a,b,c,d) \
#define V8R { \
V8_G (0, 1, 2, 3) \
V8_G (4, 5, 6, 7) \
}
#endif
#define V8_ROUND(r) { GET_SIGMA_PTR_256(r); V8R }
// for debug:
// #define Z7_BLAKE2S_PERMUTE_WITH_GATHER
#if defined(Z7_BLAKE2S_PERMUTE_WITH_GATHER)
// gather instruction is slow.
#define V8_LOAD_MSG(w, m) \
{ \
unsigned i; \
for (i = 0; i < 16; ++i) { \
w[i] = _mm256_i32gather_epi32( \
(const void *)((m) + i * sizeof(UInt32)),\
_mm256_set_epi32(0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00), \
sizeof(UInt32)); \
} \
}
#else // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
#define V8_LOAD_MSG_2(w, a0, a1) \
{ \
(w)[0] = _mm256_permute2x128_si256(a0, a1, 0x20); \
(w)[4] = _mm256_permute2x128_si256(a0, a1, 0x31); \
}
#define V8_LOAD_MSG_4(w, z0, z1, z2, z3) \
{ \
__m256i s0, s1, s2, s3; \
s0 = _mm256_unpacklo_epi64(z0, z1); \
s1 = _mm256_unpackhi_epi64(z0, z1); \
s2 = _mm256_unpacklo_epi64(z2, z3); \
s3 = _mm256_unpackhi_epi64(z2, z3); \
V8_LOAD_MSG_2((w) + 0, s0, s2) \
V8_LOAD_MSG_2((w) + 1, s1, s3) \
}
#define V8_LOAD_MSG_0(t0, t1, m) \
{ \
__m256i m0, m1; \
m0 = LOADU_256(m); \
m1 = LOADU_256((m) + 2 * 32); \
t0 = _mm256_unpacklo_epi32(m0, m1); \
t1 = _mm256_unpackhi_epi32(m0, m1); \
}
#define V8_LOAD_MSG_8(w, m) \
{ \
__m256i t0, t1, t2, t3, t4, t5, t6, t7; \
V8_LOAD_MSG_0(t0, t4, (m) + 0 * 4 * 32) \
V8_LOAD_MSG_0(t1, t5, (m) + 1 * 4 * 32) \
V8_LOAD_MSG_0(t2, t6, (m) + 2 * 4 * 32) \
V8_LOAD_MSG_0(t3, t7, (m) + 3 * 4 * 32) \
V8_LOAD_MSG_4((w) , t0, t1, t2, t3) \
V8_LOAD_MSG_4((w) + 2, t4, t5, t6, t7) \
}
#define V8_LOAD_MSG(w, m) \
{ \
V8_LOAD_MSG_8(w, m) \
V8_LOAD_MSG_8((w) + 8, (m) + 32) \
}
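// V8_LOAD_MSG transposes 8 consecutive 64-byte blocks (one per lane) into w[0..15]:
// each ymm register ends up holding one 32-bit message word from all 8 blocks.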
#endif // !Z7_BLAKE2S_PERMUTE_WITH_GATHER
#define V8_PERM_PAIR_STORE(u, a0, a2) \
{ \
STORE_256_TO_STRUCT((u), _mm256_permute2x128_si256(a0, a2, 0x20)); \
STORE_256_TO_STRUCT((u) + 8, _mm256_permute2x128_si256(a0, a2, 0x31)); \
}
#define V8_UNPACK_STORE_4(u, z0, z1, z2, z3) \
{ \
__m256i s0, s1, s2, s3; \
s0 = _mm256_unpacklo_epi64(z0, z1); \
s1 = _mm256_unpackhi_epi64(z0, z1); \
s2 = _mm256_unpacklo_epi64(z2, z3); \
s3 = _mm256_unpackhi_epi64(z2, z3); \
V8_PERM_PAIR_STORE(u + 0, s0, s2) \
V8_PERM_PAIR_STORE(u + 2, s1, s3) \
}
#define V8_UNPACK_STORE_0(src32, d0, d1) \
{ \
const __m256i v0 = LOAD_256_FROM_STRUCT ((src32) ); \
const __m256i v1 = LOAD_256_FROM_STRUCT ((src32) + 8); \
d0 = _mm256_unpacklo_epi32(v0, v1); \
d1 = _mm256_unpackhi_epi32(v0, v1); \
}
#define V8_UNPACK_STATE(dest32, src32) \
{ \
__m256i t0, t1, t2, t3, t4, t5, t6, t7; \
V8_UNPACK_STORE_0 ((src32) + 16 * 0, t0, t4) \
V8_UNPACK_STORE_0 ((src32) + 16 * 1, t1, t5) \
V8_UNPACK_STORE_0 ((src32) + 16 * 2, t2, t6) \
V8_UNPACK_STORE_0 ((src32) + 16 * 3, t3, t7) \
V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) , t0, t1, t2, t3) \
V8_UNPACK_STORE_4 ((__m256i *)(void *)(dest32) + 4, t4, t5, t6, t7) \
}
#define V8_LOAD_STATE_256_FROM_STRUCT(i) \
v[i] = LOAD_256_FROM_STRUCT(s_items + (i) * 8);
#if 0 || 0 && defined(MY_CPU_X86)
#define Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
#endif
#ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
// this branch doesn't use the (iv) array,
// so register pressure can be lower.
// it can be faster sometimes.
#define V8_LOAD_STATE_256(i) V8_LOAD_STATE_256_FROM_STRUCT(i)
#define V8_UPDATE_STATE_256(i) \
{ \
STORE_256_TO_STRUCT(s_items + (i) * 8, XOR_256( \
XOR_256(v[i], v[(i) + 8]), \
LOAD_256_FROM_STRUCT(s_items + (i) * 8))); \
}
#else
// it uses more register variables (the iv array).
// it's better for gcc.
// maybe this branch is better, if register pressure is lower (avx512).
#define V8_LOAD_STATE_256(i) { iv[i] = v[i]; }
#define V8_UPDATE_STATE_256(i) { v[i] = XOR_256(XOR_256(v[i], v[i + 8]), iv[i]); }
#define V8_STORE_STATE_256(i) { STORE_256_TO_STRUCT(s_items + (i) * 8, v[i]); }
#endif
#if 0
// use loading constants from memory
#define KK8(n) KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n), KIV(n)
MY_ALIGN(64)
static const UInt32 k_Blake2s_IV_WAY8[]=
{
KK8(0), KK8(1), KK8(2), KK8(3), KK8(4), KK8(5), KK8(6), KK8(7)
};
#define GET_256_IV_WAY8(i) LOAD_256(k_Blake2s_IV_WAY8 + 8 * (i))
#else
// use constant generation:
#define GET_256_IV_WAY8(i) _mm256_set1_epi32((Int32)KIV(i))
#endif
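// the AVX2 FAST code keeps all 8 leaf states word-interleaved, so each ymm register
// holds one state word from all 8 lanes; one iteration below consumes a whole
// super block (8 * 64 = 512 bytes) and updates all 8 lanes at once.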
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_AVX2
BLAKE2S_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
Blake2sp_Compress2_AVX2_Fast(UInt32 *s_items, const Byte *data, const Byte *end)
{
#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
__m256i v[16];
#endif
// PrintStates2(s_items, 8, 16);
#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
REP8_MACRO (V8_LOAD_STATE_256_FROM_STRUCT)
#endif
do
{
__m256i w[16];
#ifdef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
__m256i v[16];
#else
__m256i iv[8];
#endif
V8_LOAD_MSG(w, data)
{
// we use load/store ctr inside loop to reduce register pressure:
#if 1 || 1 && defined(MY_CPU_X86)
const __m256i ctr = _mm256_add_epi64(
LOAD_256_FROM_STRUCT(s_items + 64),
_mm256_set_epi32(
0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE,
0, 0, 0, Z7_BLAKE2S_BLOCK_SIZE));
STORE_256_TO_STRUCT(s_items + 64, ctr);
#else
const UInt64 ctr64 = *(const UInt64 *)(const void *)(s_items + 64)
+ Z7_BLAKE2S_BLOCK_SIZE;
const __m256i ctr = _mm256_set_epi64x(0, (Int64)ctr64, 0, (Int64)ctr64);
*(UInt64 *)(void *)(s_items + 64) = ctr64;
#endif
v[12] = XOR_256 (GET_256_IV_WAY8(4), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(0, 0, 0, 0)));
v[13] = XOR_256 (GET_256_IV_WAY8(5), _mm256_shuffle_epi32(ctr, _MM_SHUFFLE(1, 1, 1, 1)));
}
v[ 8] = GET_256_IV_WAY8(0);
v[ 9] = GET_256_IV_WAY8(1);
v[10] = GET_256_IV_WAY8(2);
v[11] = GET_256_IV_WAY8(3);
v[14] = GET_256_IV_WAY8(6);
v[15] = GET_256_IV_WAY8(7);
REP8_MACRO (V8_LOAD_STATE_256)
ROUNDS_LOOP (V8_ROUND)
REP8_MACRO (V8_UPDATE_STATE_256)
data += SUPER_BLOCK_SIZE;
}
while (data != end);
#ifndef Z7_BLAKE2S_AVX2_FAST_USE_STRUCT
REP8_MACRO (V8_STORE_STATE_256)
#endif
}
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_AVX2
BLAKE2S_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
Blake2sp_Final_AVX2_Fast(UInt32 *states)
{
const __m128i ctr = LOAD_128_FROM_STRUCT(states + 64);
// PrintStates2(states, 8, 16);
V8_UNPACK_STATE(states, states)
// PrintStates2(states, 8, 16);
{
unsigned k;
for (k = 0; k < 8; k++)
{
UInt32 *s = states + (size_t)k * 16;
STORE_128_TO_STRUCT (STATE_T(s), ctr);
}
}
// PrintStates2(states, 8, 16);
// printf("\nafter V8_UNPACK_STATE \n");
}
#endif // Z7_BLAKE2S_USE_AVX2_FAST
#endif // avx2
#endif // vector
/*
#define Blake2s_Increment_Counter(s, inc) \
{ STATE_T(s)[0] += (inc); STATE_T(s)[1] += (STATE_T(s)[0] < (inc)); }
#define Blake2s_Increment_Counter_Small(s, inc) \
{ STATE_T(s)[0] += (inc); }
*/
#define Blake2s_Set_LastBlock(s) \
{ STATE_F(s)[0] = BLAKE2S_FINAL_FLAG; /* STATE_F(s)[1] = p->u.header.lastNode_f1; */ }
#if 0 || 1 && defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL >= 1600
// good for vs2022
#define LOOP_8(mac) { unsigned kkk; for (kkk = 0; kkk < 8; kkk++) mac(kkk) }
#else
// good for Z7_BLAKE2S_UNROLL for GCC9 (arm*/x86*) and MSC_VER_1400-x64.
#define LOOP_8(mac) { REP8_MACRO(mac) }
#endif
static
Z7_FORCE_INLINE
// Z7_NO_INLINE
void
Z7_FASTCALL
Blake2s_Compress(UInt32 *s, const Byte *input)
{
UInt32 m[16];
UInt32 v[16];
{
unsigned i;
for (i = 0; i < 16; i++)
m[i] = GetUi32(input + i * 4);
}
#define INIT_v_FROM_s(i) v[i] = s[i];
LOOP_8(INIT_v_FROM_s)
// Blake2s_Increment_Counter(s, Z7_BLAKE2S_BLOCK_SIZE)
{
const UInt32 t0 = STATE_T(s)[0] + Z7_BLAKE2S_BLOCK_SIZE;
const UInt32 t1 = STATE_T(s)[1] + (t0 < Z7_BLAKE2S_BLOCK_SIZE);
STATE_T(s)[0] = t0;
STATE_T(s)[1] = t1;
v[12] = t0 ^ KIV(4);
v[13] = t1 ^ KIV(5);
}
// v[12] = STATE_T(s)[0] ^ KIV(4);
// v[13] = STATE_T(s)[1] ^ KIV(5);
v[14] = STATE_F(s)[0] ^ KIV(6);
v[15] = STATE_F(s)[1] ^ KIV(7);
v[ 8] = KIV(0);
v[ 9] = KIV(1);
v[10] = KIV(2);
v[11] = KIV(3);
// PrintStates2((const UInt32 *)v, 1, 16);
#define ADD_SIGMA(a, index) V(a, 0) += *(const UInt32 *)GET_SIGMA_PTR(m, sigma[index]);
#define ADD32M(dest, src, a) V(a, dest) += V(a, src);
#define XOR32M(dest, src, a) V(a, dest) ^= V(a, src);
#define RTR32M(dest, shift, a) V(a, dest) = rotrFixed(V(a, dest), shift);
// big interleaving can provide a big performance gain, if scheduler queues are small.
#if 0 || 1 && defined(MY_CPU_X86)
// interleave-1: for a small number of registers (32-bit x86)
#define G2(index, a, x, y) \
ADD_SIGMA (a, (index) + 2 * 0) \
ADD32M (0, 1, a) \
XOR32M (3, 0, a) \
RTR32M (3, x, a) \
ADD32M (2, 3, a) \
XOR32M (1, 2, a) \
RTR32M (1, y, a) \
#define G(a) \
G2(a * 2 , a, 16, 12) \
G2(a * 2 + 1, a, 8, 7) \
#define R2 \
G(0) \
G(1) \
G(2) \
G(3) \
G(4) \
G(5) \
G(6) \
G(7) \
#elif 0 || 1 && defined(MY_CPU_X86_OR_AMD64)
// interleave-2: good if the number of registers is not big (x86-64).
#define REP2(mac, dest, src, a, b) \
mac(dest, src, a) \
mac(dest, src, b)
#define G2(index, a, b, x, y) \
ADD_SIGMA (a, (index) + 2 * 0) \
ADD_SIGMA (b, (index) + 2 * 1) \
REP2 (ADD32M, 0, 1, a, b) \
REP2 (XOR32M, 3, 0, a, b) \
REP2 (RTR32M, 3, x, a, b) \
REP2 (ADD32M, 2, 3, a, b) \
REP2 (XOR32M, 1, 2, a, b) \
REP2 (RTR32M, 1, y, a, b) \
#define G(a, b) \
G2(a * 2 , a, b, 16, 12) \
G2(a * 2 + 1, a, b, 8, 7) \
#define R2 \
G(0, 1) \
G(2, 3) \
G(4, 5) \
G(6, 7) \
#else
// interleave-4:
// it has high register pressure for x86/x64,
// and MSVC compilers for x86/x64 are slow for this branch.
// but if we have a big number of registers, this branch can be faster.
#define REP4(mac, dest, src, a, b, c, d) \
mac(dest, src, a) \
mac(dest, src, b) \
mac(dest, src, c) \
mac(dest, src, d)
#define G2(index, a, b, c, d, x, y) \
ADD_SIGMA (a, (index) + 2 * 0) \
ADD_SIGMA (b, (index) + 2 * 1) \
ADD_SIGMA (c, (index) + 2 * 2) \
ADD_SIGMA (d, (index) + 2 * 3) \
REP4 (ADD32M, 0, 1, a, b, c, d) \
REP4 (XOR32M, 3, 0, a, b, c, d) \
REP4 (RTR32M, 3, x, a, b, c, d) \
REP4 (ADD32M, 2, 3, a, b, c, d) \
REP4 (XOR32M, 1, 2, a, b, c, d) \
REP4 (RTR32M, 1, y, a, b, c, d) \
#define G(a, b, c, d) \
G2(a * 2 , a, b, c, d, 16, 12) \
G2(a * 2 + 1, a, b, c, d, 8, 7) \
#define R2 \
G(0, 1, 2, 3) \
G(4, 5, 6, 7) \
#endif
#define R(r) { const Byte *sigma = k_Blake2s_Sigma_4[r]; R2 }
// Z7_BLAKE2S_UNROLL gives 5-6 KB larger code, but faster:
// 20-40% faster for (x86/x64) VC2010+/GCC/CLANG.
// 30-60% faster for (arm64-arm32) GCC.
// 5-11% faster for (arm64) CLANG-MAC.
// so Z7_BLAKE2S_UNROLL is a good optimization, if there is no vector branch.
// But if there is a vector branch (for x86*), this scalar code will be mostly unused.
// So we want smaller code (without unrolling) in that case (x86*).
#if 0 || 1 && !defined(Z7_BLAKE2S_USE_VECTORS)
#define Z7_BLAKE2S_UNROLL
#endif
#ifdef Z7_BLAKE2S_UNROLL
ROUNDS_LOOP_UNROLLED (R)
#else
ROUNDS_LOOP (R)
#endif
#undef G
#undef G2
#undef R
#undef R2
// printf("\n v after: \n");
// PrintStates2((const UInt32 *)v, 1, 16);
#define XOR_s_PAIR_v(i) s[i] ^= v[i] ^ v[i + 8];
LOOP_8(XOR_s_PAIR_v)
// printf("\n s after:\n");
// PrintStates2((const UInt32 *)s, 1, 16);
}
static
Z7_NO_INLINE
void
Z7_FASTCALL
Blake2sp_Compress2(UInt32 *s_items, const Byte *data, const Byte *end)
{
size_t pos = 0;
// PrintStates2(s_items, 8, 16);
do
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(s_items, pos);
Blake2s_Compress(s, data);
data += Z7_BLAKE2S_BLOCK_SIZE;
pos += Z7_BLAKE2S_BLOCK_SIZE;
pos &= SUPER_BLOCK_MASK;
}
while (data != end);
}
#ifdef Z7_BLAKE2S_USE_VECTORS
static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = Blake2sp_Compress2;
static Z7_BLAKE2SP_FUNC_COMPRESS g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = Blake2sp_Compress2;
static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Init;
static Z7_BLAKE2SP_FUNC_INIT g_Z7_BLAKE2SP_FUNC_INIT_Final;
static unsigned g_z7_Blake2sp_SupportedFlags;
#define Z7_BLAKE2SP_Compress_Fast(p) (p)->u.header.func_Compress_Fast
#define Z7_BLAKE2SP_Compress_Single(p) (p)->u.header.func_Compress_Single
#else
#define Z7_BLAKE2SP_Compress_Fast(p) Blake2sp_Compress2
#define Z7_BLAKE2SP_Compress_Single(p) Blake2sp_Compress2
#endif // Z7_BLAKE2S_USE_VECTORS
#if 1 && defined(MY_CPU_LE)
#define GET_DIGEST(_s, _digest) \
{ memcpy(_digest, _s, Z7_BLAKE2S_DIGEST_SIZE); }
#else
#define GET_DIGEST(_s, _digest) \
{ unsigned _i; for (_i = 0; _i < 8; _i++) \
{ SetUi32((_digest) + 4 * _i, (_s)[_i]) } \
}
#endif
/* ---------- BLAKE2s ---------- */
/*
// we need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0()
typedef struct
{
Byte digest_length;
Byte key_length;
Byte fanout; // = 1 : in sequential mode
Byte depth; // = 1 : in sequential mode
UInt32 leaf_length;
Byte node_offset[6]; // 0 for the first, leftmost, leaf, or in sequential mode
Byte node_depth; // 0 for the leaves, or in sequential mode
Byte inner_length; // [0, 32], 0 in sequential mode
Byte salt[BLAKE2S_SALTBYTES];
Byte personal[BLAKE2S_PERSONALBYTES];
} CBlake2sParam;
*/
#define k_Blake2sp_IV_0 \
(KIV(0) ^ (Z7_BLAKE2S_DIGEST_SIZE | ((UInt32)Z7_BLAKE2SP_PARALLEL_DEGREE << 16) | ((UInt32)2 << 24)))
#define k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth) \
(KIV(3) ^ ((UInt32)(node_depth) << 16) ^ ((UInt32)Z7_BLAKE2S_DIGEST_SIZE << 24))
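// initializes one BLAKE2s instance of the BLAKE2sp tree: the IV xored with the packed
// parameter-block words (digest_length = 32, fanout = 8, depth = 2 in word 0;
// node_offset in word 2; node_depth and inner_length = 32 in word 3).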
Z7_FORCE_INLINE
static void Blake2sp_Init_Spec(UInt32 *s, unsigned node_offset, unsigned node_depth)
{
s[0] = k_Blake2sp_IV_0;
s[1] = KIV(1);
s[2] = KIV(2) ^ (UInt32)node_offset;
s[3] = k_Blake2sp_IV_3_FROM_NODE_DEPTH(node_depth);
s[4] = KIV(4);
s[5] = KIV(5);
s[6] = KIV(6);
s[7] = KIV(7);
STATE_T(s)[0] = 0;
STATE_T(s)[1] = 0;
STATE_F(s)[0] = 0;
STATE_F(s)[1] = 0;
}
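/* The *_Fast code paths below keep the lane states transposed (word-major):
   the same state word of several lanes sits in one vector register, so a single
   128-bit or 256-bit store initializes that word for 4 or 8 lanes at once. */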
#ifdef Z7_BLAKE2S_USE_V128_FAST
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_128BIT
BLAKE2S_ATTRIB_128BIT
#endif
void
Z7_FASTCALL
Blake2sp_InitState_V128_Fast(UInt32 *states)
{
#define STORE_128_PAIR_INIT_STATES_2(i, t0, t1) \
{ STORE_128_TO_STRUCT(states + 0 + 4 * (i), (t0)); \
STORE_128_TO_STRUCT(states + 32 + 4 * (i), (t1)); \
}
#define STORE_128_PAIR_INIT_STATES_1(i, mac) \
{ const __m128i t = mac; \
STORE_128_PAIR_INIT_STATES_2(i, t, t) \
}
#define STORE_128_PAIR_INIT_STATES_IV(i) \
STORE_128_PAIR_INIT_STATES_1(i, GET_128_IV_WAY4(i))
STORE_128_PAIR_INIT_STATES_1 (0, _mm_set1_epi32((Int32)k_Blake2sp_IV_0))
STORE_128_PAIR_INIT_STATES_IV (1)
{
const __m128i t = GET_128_IV_WAY4(2);
STORE_128_PAIR_INIT_STATES_2 (2,
XOR_128(t, _mm_set_epi32(3, 2, 1, 0)),
XOR_128(t, _mm_set_epi32(7, 6, 5, 4)))
}
STORE_128_PAIR_INIT_STATES_1 (3, _mm_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
STORE_128_PAIR_INIT_STATES_IV (4)
STORE_128_PAIR_INIT_STATES_IV (5)
STORE_128_PAIR_INIT_STATES_IV (6)
STORE_128_PAIR_INIT_STATES_IV (7)
STORE_128_PAIR_INIT_STATES_1 (16, _mm_set_epi32(0, 0, 0, 0))
// printf("\n== exit Blake2sp_InitState_V128_Fast ctr=%d\n", states[64]);
}
#endif // Z7_BLAKE2S_USE_V128_FAST
#ifdef Z7_BLAKE2S_USE_AVX2_FAST
static
Z7_NO_INLINE
#ifdef BLAKE2S_ATTRIB_AVX2
BLAKE2S_ATTRIB_AVX2
#endif
void
Z7_FASTCALL
Blake2sp_InitState_AVX2_Fast(UInt32 *states)
{
#define STORE_256_INIT_STATES(i, t) \
STORE_256_TO_STRUCT(states + 8 * (i), t);
#define STORE_256_INIT_STATES_IV(i) \
STORE_256_INIT_STATES(i, GET_256_IV_WAY8(i))
STORE_256_INIT_STATES (0, _mm256_set1_epi32((Int32)k_Blake2sp_IV_0))
STORE_256_INIT_STATES_IV (1)
STORE_256_INIT_STATES (2, XOR_256( GET_256_IV_WAY8(2),
_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)))
STORE_256_INIT_STATES (3, _mm256_set1_epi32((Int32)k_Blake2sp_IV_3_FROM_NODE_DEPTH(0)))
STORE_256_INIT_STATES_IV (4)
STORE_256_INIT_STATES_IV (5)
STORE_256_INIT_STATES_IV (6)
STORE_256_INIT_STATES_IV (7)
STORE_256_INIT_STATES (8, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0))
// printf("\n== exit Blake2sp_InitState_AVX2_Fast\n");
}
#endif // Z7_BLAKE2S_USE_AVX2_FAST
Z7_NO_INLINE
void Blake2sp_InitState(CBlake2sp *p)
{
size_t i;
// memset(p->states, 0, sizeof(p->states)); // for debug
p->u.header.cycPos = 0;
#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
if (p->u.header.func_Init)
{
p->u.header.func_Init(p->states);
return;
}
#endif
for (i = 0; i < Z7_BLAKE2SP_PARALLEL_DEGREE; i++)
Blake2sp_Init_Spec(p->states + i * NSW, (unsigned)i, 0);
}
void Blake2sp_Init(CBlake2sp *p)
{
#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
p->u.header.func_Compress_Fast =
#ifdef Z7_BLAKE2S_USE_VECTORS
g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
#else
NULL;
#endif
p->u.header.func_Compress_Single =
#ifdef Z7_BLAKE2S_USE_VECTORS
g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
#else
NULL;
#endif
p->u.header.func_Init =
#ifdef Z7_BLAKE2S_USE_VECTORS
g_Z7_BLAKE2SP_FUNC_INIT_Init;
#else
NULL;
#endif
p->u.header.func_Final =
#ifdef Z7_BLAKE2S_USE_VECTORS
g_Z7_BLAKE2SP_FUNC_INIT_Final;
#else
NULL;
#endif
#endif
Blake2sp_InitState(p);
}
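/*
Typical call sequence (illustrative sketch; data / size are caller-supplied):

  CBlake2sp ctx;
  Byte digest[Z7_BLAKE2S_DIGEST_SIZE];
  // z7_Black2sp_Prepare();            // optional: call once at startup to enable vector code paths
  Blake2sp_Init(&ctx);
  Blake2sp_Update(&ctx, data, size);   // may be called repeatedly for streamed input
  Blake2sp_Final(&ctx, digest);        // writes the 32-byte BLAKE2sp digest
*/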
void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size)
{
size_t pos;
// printf("\nsize = 0x%6x, cycPos = %5u data = %p\n", (unsigned)size, (unsigned)p->u.header.cycPos, data);
if (size == 0)
return;
pos = p->u.header.cycPos;
// pos < SUPER_BLOCK_SIZE * 2 : is expected
// pos == SUPER_BLOCK_SIZE * 2 : is not expected, but is also supported
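/* Buffering scheme: p->buf32 holds up to two superblocks of input and cycPos counts
   the buffered bytes. Whole superblocks are compressed with the _Fast function, while
   a tail is kept buffered so that Blake2sp_Final() can apply the last-block flags and
   counter corrections to the final (possibly partial) blocks. */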
{
const size_t pos2 = pos & SUPER_BLOCK_MASK;
if (pos2)
{
const size_t rem = SUPER_BLOCK_SIZE - pos2;
if (rem > size)
{
p->u.header.cycPos = (unsigned)(pos + size);
// cycPos < SUPER_BLOCK_SIZE * 2
memcpy((Byte *)(void *)p->buf32 + pos, data, size);
/* to simplify the code here we don't try to process the first superblock,
if (cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE) */
return;
}
// (rem <= size)
memcpy((Byte *)(void *)p->buf32 + pos, data, rem);
pos += rem;
data += rem;
size -= rem;
}
}
// pos <= SUPER_BLOCK_SIZE * 2
// pos % SUPER_BLOCK_SIZE == 0
if (pos)
{
/* pos == SUPER_BLOCK_SIZE ||
pos == SUPER_BLOCK_SIZE * 2 */
size_t end = pos;
if (size > SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE
|| (end -= SUPER_BLOCK_SIZE))
{
Z7_BLAKE2SP_Compress_Fast(p)(p->states,
(const Byte *)(const void *)p->buf32,
(const Byte *)(const void *)p->buf32 + end);
if (pos -= end)
memcpy(p->buf32, (const Byte *)(const void *)p->buf32
+ SUPER_BLOCK_SIZE, SUPER_BLOCK_SIZE);
}
}
// pos == 0 || (pos == SUPER_BLOCK_SIZE && size <= SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE)
if (size > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
{
// pos == 0
const Byte *end;
const size_t size2 = (size - (SUPER_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE + 1))
& ~(size_t)SUPER_BLOCK_MASK;
size -= size2;
// size < SUPER_BLOCK_SIZE * 2
end = data + size2;
Z7_BLAKE2SP_Compress_Fast(p)(p->states, data, end);
data = end;
}
if (size != 0)
{
memcpy((Byte *)(void *)p->buf32 + pos, data, size);
pos += size;
}
p->u.header.cycPos = (unsigned)pos;
// cycPos < SUPER_BLOCK_SIZE * 2
}
void Blake2sp_Final(CBlake2sp *p, Byte *digest)
{
// UInt32 * const R_states = p->states;
// printf("\nBlake2sp_Final \n");
#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
if (p->u.header.func_Final)
p->u.header.func_Final(p->states);
#endif
// printf("\n=====\nBlake2sp_Final \n");
// PrintStates(p->states, 32);
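/* Finalize the 8 leaves: zero-pad the buffered tail, set each lane's last-block flag,
   subtract the padding from the per-lane byte counters, and run one more compression
   pass over the buffered data. */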
// (p->u.header.cycPos == SUPER_BLOCK_SIZE) can be processed in any branch:
if (p->u.header.cycPos <= SUPER_BLOCK_SIZE)
{
unsigned pos;
memset((Byte *)(void *)p->buf32 + p->u.header.cycPos,
0, SUPER_BLOCK_SIZE - p->u.header.cycPos);
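// the last lane (lane 7) is the last leaf of this level: set its last-node flag f[1]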
STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
Blake2s_Set_LastBlock(s)
if (pos + Z7_BLAKE2S_BLOCK_SIZE > p->u.header.cycPos)
{
UInt32 delta = Z7_BLAKE2S_BLOCK_SIZE;
if (pos < p->u.header.cycPos)
delta -= p->u.header.cycPos & (Z7_BLAKE2S_BLOCK_SIZE - 1);
// 0 < delta <= Z7_BLAKE2S_BLOCK_SIZE
{
const UInt32 v = STATE_T(s)[0];
STATE_T(s)[1] -= v < delta; // (v < delta) is the same condition here as (v == 0)
STATE_T(s)[0] = v - delta;
}
}
}
// PrintStates(p->states, 16);
Z7_BLAKE2SP_Compress_Single(p)(p->states,
(Byte *)(void *)p->buf32,
(Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
// PrintStates(p->states, 16);
}
else
{
// (p->u.header.cycPos > SUPER_BLOCK_SIZE)
unsigned pos;
for (pos = 0; pos < SUPER_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos);
if (pos + SUPER_BLOCK_SIZE >= p->u.header.cycPos)
Blake2s_Set_LastBlock(s)
}
if (p->u.header.cycPos <= SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
Z7_BLAKE2SP_Compress_Single(p)(p->states,
(Byte *)(void *)p->buf32,
(Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE);
// if (p->u.header.cycPos > SUPER_BLOCK_SIZE * 2 - Z7_BLAKE2S_BLOCK_SIZE)
STATE_F(&p->states[(Z7_BLAKE2SP_PARALLEL_DEGREE - 1) * NSW])[1] = BLAKE2S_FINAL_FLAG;
// if (p->u.header.cycPos != SUPER_BLOCK_SIZE)
{
pos = SUPER_BLOCK_SIZE;
for (;;)
{
UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, pos & SUPER_BLOCK_MASK);
Blake2s_Set_LastBlock(s)
pos += Z7_BLAKE2S_BLOCK_SIZE;
if (pos >= p->u.header.cycPos)
{
if (pos != p->u.header.cycPos)
{
const UInt32 delta = pos - p->u.header.cycPos;
const UInt32 v = STATE_T(s)[0];
STATE_T(s)[1] -= v < delta;
STATE_T(s)[0] = v - delta;
memset((Byte *)(void *)p->buf32 + p->u.header.cycPos, 0, delta);
}
break;
}
}
Z7_BLAKE2SP_Compress_Single(p)(p->states,
(Byte *)(void *)p->buf32 + SUPER_BLOCK_SIZE,
(Byte *)(void *)p->buf32 + pos);
}
}
{
size_t pos;
for (pos = 0; pos < SUPER_BLOCK_SIZE / 2; pos += Z7_BLAKE2S_BLOCK_SIZE / 2)
{
const UInt32 * const s = GET_STATE_TABLE_PTR_FROM_BYTE_POS(p->states, (pos * 2));
Byte *dest = (Byte *)(void *)p->buf32 + pos;
GET_DIGEST(s, dest)
}
}
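/* Root node: the 8 leaf digests (8 * 32 = 256 bytes, concatenated into buf32 above)
   are hashed by a fresh state with node_offset = 0 and node_depth = 1. */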
Blake2sp_Init_Spec(p->states, 0, 1);
{
size_t pos;
for (pos = 0; pos < (Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2S_DIGEST_SIZE)
- Z7_BLAKE2S_BLOCK_SIZE; pos += Z7_BLAKE2S_BLOCK_SIZE)
{
Z7_BLAKE2SP_Compress_Single(p)(p->states,
(const Byte *)(const void *)p->buf32 + pos,
(const Byte *)(const void *)p->buf32 + pos + Z7_BLAKE2S_BLOCK_SIZE);
}
}
// Blake2s_Final(p->states, 0, digest, p, (Byte *)(void *)p->buf32 + i);
Blake2s_Set_LastBlock(p->states)
STATE_F(p->states)[1] = BLAKE2S_FINAL_FLAG;
{
Z7_BLAKE2SP_Compress_Single(p)(p->states,
(const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE - Z7_BLAKE2S_BLOCK_SIZE,
(const Byte *)(const void *)p->buf32 + Z7_BLAKE2SP_PARALLEL_DEGREE / 2 * Z7_BLAKE2S_BLOCK_SIZE);
}
GET_DIGEST(p->states, digest)
// printf("\n Blake2sp_Final 555 numDataInBufs = %5u\n", (unsigned)p->u.header.numDataInBufs);
}
BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo)
{
// printf("\n========== setfunction = %d ======== \n", algo);
#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
Z7_BLAKE2SP_FUNC_COMPRESS func = NULL;
Z7_BLAKE2SP_FUNC_COMPRESS func_Single = NULL;
Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
#else
UNUSED_VAR(p)
#endif
#ifdef Z7_BLAKE2S_USE_VECTORS
func = func_Single = Blake2sp_Compress2;
if (algo != Z7_BLAKE2SP_ALGO_SCALAR)
{
// printf("\n========== setfunction NON-SCALAR ======== \n");
if (algo == Z7_BLAKE2SP_ALGO_DEFAULT)
{
func = g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast;
func_Single = g_Z7_BLAKE2SP_FUNC_COMPRESS_Single;
func_Init = g_Z7_BLAKE2SP_FUNC_INIT_Init;
func_Final = g_Z7_BLAKE2SP_FUNC_INIT_Final;
}
else
{
if ((g_z7_Blake2sp_SupportedFlags & (1u << algo)) == 0)
return False;
#ifdef Z7_BLAKE2S_USE_AVX2
func_Single =
#if defined(Z7_BLAKE2S_USE_AVX2_WAY2)
Blake2sp_Compress2_AVX2_Way2;
#else
Z7_BLAKE2S_Compress2_V128;
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_FAST
if (algo == Z7_BLAKE2SP_ALGO_V256_FAST)
{
func = Blake2sp_Compress2_AVX2_Fast;
func_Final = Blake2sp_Final_AVX2_Fast;
func_Init = Blake2sp_InitState_AVX2_Fast;
}
else
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
if (algo == Z7_BLAKE2SP_ALGO_V256_WAY2)
func = Blake2sp_Compress2_AVX2_Way2;
else
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
if (algo == Z7_BLAKE2SP_ALGO_V256_WAY4)
{
func_Single = func = Blake2sp_Compress2_AVX2_Way4;
}
else
#endif
#endif // avx2
{
if (algo == Z7_BLAKE2SP_ALGO_V128_FAST)
{
func = Blake2sp_Compress2_V128_Fast;
func_Final = Blake2sp_Final_V128_Fast;
func_Init = Blake2sp_InitState_V128_Fast;
func_Single = Z7_BLAKE2S_Compress2_V128;
}
else
#ifdef Z7_BLAKE2S_USE_V128_WAY2
if (algo == Z7_BLAKE2SP_ALGO_V128_WAY2)
func = func_Single = Blake2sp_Compress2_V128_Way2;
else
#endif
{
if (algo != Z7_BLAKE2SP_ALGO_V128_WAY1)
return False;
func = func_Single = Blake2sp_Compress2_V128_Way1;
}
}
}
}
#else // !VECTORS
if (algo > 1) // Z7_BLAKE2SP_ALGO_SCALAR
return False;
#endif // !VECTORS
#ifdef Z7_BLAKE2SP_USE_FUNCTIONS
p->u.header.func_Compress_Fast = func;
p->u.header.func_Compress_Single = func_Single;
p->u.header.func_Final = func_Final;
p->u.header.func_Init = func_Init;
#endif
// printf("\n p->u.header.func_Compress = %p", p->u.header.func_Compress);
return True;
}
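/*
Selecting a specific code path (illustrative sketch; ctx is a caller-owned CBlake2sp):

  Blake2sp_Init(&ctx);
  if (!Blake2sp_SetFunction(&ctx, Z7_BLAKE2SP_ALGO_V128_FAST))
  {
    // requested algo is not compiled in or not supported by this CPU
    Blake2sp_SetFunction(&ctx, Z7_BLAKE2SP_ALGO_DEFAULT);
  }

Note that Blake2sp_Init() re-applies the defaults chosen by z7_Black2sp_Prepare(),
so an override via Blake2sp_SetFunction() must be done after Blake2sp_Init();
Blake2sp_InitState() keeps the currently selected functions.
*/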
void z7_Black2sp_Prepare(void)
{
#ifdef Z7_BLAKE2S_USE_VECTORS
unsigned flags = 0; // (1u << Z7_BLAKE2SP_ALGO_V128_SCALAR);
Z7_BLAKE2SP_FUNC_COMPRESS func_Fast = Blake2sp_Compress2;
Z7_BLAKE2SP_FUNC_COMPRESS func_Single = Blake2sp_Compress2;
Z7_BLAKE2SP_FUNC_INIT func_Init = NULL;
Z7_BLAKE2SP_FUNC_INIT func_Final = NULL;
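/* Run-time dispatch: probe the CPU, pick the widest supported code path
   (128-bit SSE2 / SSSE3 / SSE4.1, then optionally 256-bit AVX2), and record in
   g_z7_Blake2sp_SupportedFlags which algo values Blake2sp_SetFunction() may accept. */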
#if defined(MY_CPU_X86_OR_AMD64)
#if defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
if (CPU_IsSupported_AVX512F_AVX512VL())
#endif
#if defined(Z7_BLAKE2S_USE_SSE41)
if (CPU_IsSupported_SSE41())
#elif defined(Z7_BLAKE2S_USE_SSSE3)
if (CPU_IsSupported_SSSE3())
#elif !defined(MY_CPU_AMD64)
if (CPU_IsSupported_SSE2())
#endif
#endif
{
#if defined(Z7_BLAKE2S_USE_SSE41)
// printf("\n========== Blake2s SSE41 128-bit\n");
#elif defined(Z7_BLAKE2S_USE_SSSE3)
// printf("\n========== Blake2s SSSE3 128-bit\n");
#else
// printf("\n========== Blake2s SSE2 128-bit\n");
#endif
// func_Fast = f_vector = Blake2sp_Compress2_V128_Way2;
// printf("\n========== Blake2sp_Compress2_V128_Way2\n");
func_Fast =
func_Single = Z7_BLAKE2S_Compress2_V128;
flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY1);
#ifdef Z7_BLAKE2S_USE_V128_WAY2
flags |= (1u << Z7_BLAKE2SP_ALGO_V128_WAY2);
#endif
#ifdef Z7_BLAKE2S_USE_V128_FAST
flags |= (1u << Z7_BLAKE2SP_ALGO_V128_FAST);
func_Fast = Blake2sp_Compress2_V128_Fast;
func_Init = Blake2sp_InitState_V128_Fast;
func_Final = Blake2sp_Final_V128_Fast;
#endif
#ifdef Z7_BLAKE2S_USE_AVX2
#if defined(MY_CPU_X86_OR_AMD64)
if (
#if 0 && defined(Z7_BLAKE2S_USE_AVX512_ALWAYS)
CPU_IsSupported_AVX512F_AVX512VL() &&
#endif
CPU_IsSupported_AVX2()
)
#endif
{
// #pragma message ("=== Blake2s AVX2")
// printf("\n========== Blake2s AVX2\n");
#ifdef Z7_BLAKE2S_USE_AVX2_WAY2
func_Single = Blake2sp_Compress2_AVX2_Way2;
flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY2);
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_WAY4
flags |= (1u << Z7_BLAKE2SP_ALGO_V256_WAY4);
#endif
#ifdef Z7_BLAKE2S_USE_AVX2_FAST
flags |= (1u << Z7_BLAKE2SP_ALGO_V256_FAST);
func_Fast = Blake2sp_Compress2_AVX2_Fast;
func_Init = Blake2sp_InitState_AVX2_Fast;
func_Final = Blake2sp_Final_AVX2_Fast;
#elif defined(Z7_BLAKE2S_USE_AVX2_WAY4)
func_Fast = Blake2sp_Compress2_AVX2_Way4;
#elif defined(Z7_BLAKE2S_USE_AVX2_WAY2)
func_Fast = Blake2sp_Compress2_AVX2_Way2;
#endif
} // avx2
#endif // avx2
} // sse*
g_Z7_BLAKE2SP_FUNC_COMPRESS_Fast = func_Fast;
g_Z7_BLAKE2SP_FUNC_COMPRESS_Single = func_Single;
g_Z7_BLAKE2SP_FUNC_INIT_Init = func_Init;
g_Z7_BLAKE2SP_FUNC_INIT_Final = func_Final;
g_z7_Blake2sp_SupportedFlags = flags;
// printf("\nflags=%x\n", flags);
#endif // vectors
}
/*
#ifdef Z7_BLAKE2S_USE_VECTORS
void align_test2(CBlake2sp *sp);
void align_test2(CBlake2sp *sp)
{
__m128i a = LOAD_128(sp->states);
D_XOR_128(a, LOAD_128(sp->states + 4));
STORE_128(sp->states, a);
}
void align_test2(void);
void align_test2(void)
{
CBlake2sp sp;
Blake2sp_Init(&sp);
Blake2sp_Update(&sp, NULL, 0);
}
#endif
*/