7zip/C/Xxh64.c
/* Xxh64.c -- XXH64 hash calculation
original code: Copyright (c) Yann Collet.
2023-08-18 : modified by Igor Pavlov.
This source code is licensed under BSD 2-Clause License.
*/
#include "Precomp.h"
#include "CpuArch.h"
#include "RotateDefs.h"
#include "Xxh64.h"
#define Z7_XXH_PRIME64_1 UINT64_CONST(0x9E3779B185EBCA87)
#define Z7_XXH_PRIME64_2 UINT64_CONST(0xC2B2AE3D27D4EB4F)
#define Z7_XXH_PRIME64_3 UINT64_CONST(0x165667B19E3779F9)
#define Z7_XXH_PRIME64_4 UINT64_CONST(0x85EBCA77C2B2AE63)
#define Z7_XXH_PRIME64_5 UINT64_CONST(0x27D4EB2F165667C5)
void Xxh64State_Init(CXxh64State *p)
{
const UInt64 seed = 0;
p->v[0] = seed + Z7_XXH_PRIME64_1 + Z7_XXH_PRIME64_2;
p->v[1] = seed + Z7_XXH_PRIME64_2;
p->v[2] = seed;
p->v[3] = seed - Z7_XXH_PRIME64_1;
}
#if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) && defined(_MSC_VER)
#define Z7_XXH64_USE_ASM
#endif
#if !defined(MY_CPU_64BIT) && defined(MY_CPU_X86) \
&& defined(Z7_MSC_VER_ORIGINAL) && Z7_MSC_VER_ORIGINAL > 1200
/* We try to avoid __allmul calls in MSVC for 64-bit multiplication,
   but MSVC6 still emits __allmul for our code.
   So for MSVC6 we use the default 64-bit multiply without this optimization.
*/
#define LOW32(b) ((UInt32)((b) & 0xffffffff))
/* The MSVC compiler (MSVC > 1200) can use the "mul" instruction
   without __allmul for our MY_emulu macro.
   MY_emulu is similar to the __emulu(a, b) intrinsic. */
#define MY_emulu(a, b) ((UInt64)(a) * (b))
#define MY_SET_HIGH32(a) ((UInt64)(a) << 32)
#define MY_MUL32_SET_HIGH32(a, b) MY_SET_HIGH32((UInt32)(a) * (UInt32)(b))
// /*
#define MY_MUL64(a, b) \
( MY_emulu((UInt32)(a), LOW32(b)) + \
MY_SET_HIGH32( \
(UInt32)((a) >> 32) * LOW32(b) + \
(UInt32)(a) * (UInt32)((b) >> 32) \
))
// */
/*
#define MY_MUL64(a, b) \
( MY_emulu((UInt32)(a), LOW32(b)) \
+ MY_MUL32_SET_HIGH32((a) >> 32, LOW32(b)) + \
+ MY_MUL32_SET_HIGH32(a, (b) >> 32) \
)
*/
#define MY_MUL_32_64(a32, b) \
( MY_emulu((UInt32)(a32), LOW32(b)) \
+ MY_MUL32_SET_HIGH32(a32, (b) >> 32) \
)
#else
#define MY_MUL64(a, b) ((a) * (b))
#define MY_MUL_32_64(a32, b) ((a32) * (UInt64)(b))
#endif
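/* A minimal sanity-check sketch (an illustration, not part of the original
   code; the function name is hypothetical). It verifies the identity behind
   MY_MUL64: (a * b) mod 2^64 ==
   lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32),
   where lo()/hi() denote the low/high 32 bits.

static int MyMul64_SelfTest(UInt64 a, UInt64 b)
{
  // 32x32 -> 64 product of the low halves
  const UInt64 low = (UInt64)(UInt32)a * (UInt32)b;
  // the cross products matter only modulo 2^32, since they are shifted up
  const UInt64 high = (UInt64)((UInt32)(a >> 32) * (UInt32)b
      + (UInt32)a * (UInt32)(b >> 32)) << 32;
  return low + high == a * b;
}
*/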
static
Z7_FORCE_INLINE
UInt64 Xxh64_Round(UInt64 acc, UInt64 input)
{
acc += MY_MUL64(input, Z7_XXH_PRIME64_2);
acc = Z7_ROTL64(acc, 31);
return MY_MUL64(acc, Z7_XXH_PRIME64_1);
}
static UInt64 Xxh64_Merge(UInt64 acc, UInt64 val)
{
acc ^= Xxh64_Round(0, val);
return MY_MUL64(acc, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4;
}
#ifdef Z7_XXH64_USE_ASM
#define Z7_XXH_PRIME64_1_HIGH 0x9E3779B1
#define Z7_XXH_PRIME64_1_LOW 0x85EBCA87
#define Z7_XXH_PRIME64_2_HIGH 0xC2B2AE3D
#define Z7_XXH_PRIME64_2_LOW 0x27D4EB4F
void
Z7_NO_INLINE
__declspec(naked)
Z7_FASTCALL
Xxh64State_UpdateBlocks(CXxh64State *p, const void *data, const void *end)
{
#if !defined(__clang__)
UNUSED_VAR(p)
UNUSED_VAR(data)
UNUSED_VAR(end)
#endif
__asm push ebx
__asm push ebp
__asm push esi
__asm push edi
#define STACK_OFFSET 4 * 8
__asm sub esp, STACK_OFFSET
#define COPY_1(n) \
__asm mov eax, [ecx + n * 4] \
__asm mov [esp + n * 4], eax
#define COPY_2(n) \
__asm mov eax, [esp + n * 4] \
__asm mov [ecx + n * 4], eax
COPY_1(0)
__asm mov edi, [ecx + 1 * 4]
COPY_1(2)
COPY_1(3)
COPY_1(4)
COPY_1(5)
COPY_1(6)
COPY_1(7)
__asm mov esi, edx
__asm mov [esp + 0 * 8 + 4], ecx
__asm mov ecx, Z7_XXH_PRIME64_2_LOW
__asm mov ebp, Z7_XXH_PRIME64_1_LOW
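/* R(n): one XXH64 round for lane n, kept in 32-bit registers:
     acc += input * PRIME64_2;  acc = rotl64(acc, 31);  acc *= PRIME64_1.
   Each 64-bit multiply is decomposed into 32-bit mul/imul pairs as in the
   MY_MUL64 macro above; the shld pair implements the 64-bit rotate. */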
#define R(n, state1, state1_reg) \
__asm mov eax, [esi + n * 8] \
__asm imul ebx, eax, Z7_XXH_PRIME64_2_HIGH \
__asm add ebx, state1 \
__asm mul ecx \
__asm add edx, ebx \
__asm mov ebx, [esi + n * 8 + 4] \
__asm imul ebx, ecx \
__asm add eax, [esp + n * 8] \
__asm adc edx, ebx \
__asm mov ebx, eax \
__asm shld eax, edx, 31 \
__asm shld edx, ebx, 31 \
__asm imul state1_reg, eax, Z7_XXH_PRIME64_1_HIGH \
__asm imul edx, ebp \
__asm add state1_reg, edx \
__asm mul ebp \
__asm add state1_reg, edx \
__asm mov [esp + n * 8], eax
#define R2(n) \
R(n, [esp + n * 8 + 4], ebx) \
__asm mov [esp + n * 8 + 4], ebx
__asm align 16
__asm main_loop:
R(0, edi, edi)
R2(1)
R2(2)
R2(3)
__asm add esi, 32
__asm cmp esi, [esp + STACK_OFFSET + 4 * 4 + 4]
__asm jne main_loop
__asm mov ecx, [esp + 0 * 8 + 4]
COPY_2(0)
__asm mov [ecx + 1 * 4], edi
COPY_2(2)
COPY_2(3)
COPY_2(4)
COPY_2(5)
COPY_2(6)
COPY_2(7)
__asm add esp, STACK_OFFSET
__asm pop edi
__asm pop esi
__asm pop ebp
__asm pop ebx
__asm ret 4
}
#else
void
Z7_NO_INLINE
Z7_FASTCALL
Xxh64State_UpdateBlocks(CXxh64State *p, const void *_data, const void *end)
{
const Byte *data = (const Byte *)_data;
UInt64 v[4];
v[0] = p->v[0];
v[1] = p->v[1];
v[2] = p->v[2];
v[3] = p->v[3];
do
{
v[0] = Xxh64_Round(v[0], GetUi64(data)); data += 8;
v[1] = Xxh64_Round(v[1], GetUi64(data)); data += 8;
v[2] = Xxh64_Round(v[2], GetUi64(data)); data += 8;
v[3] = Xxh64_Round(v[3], GetUi64(data)); data += 8;
}
while (data != end);
p->v[0] = v[0];
p->v[1] = v[1];
p->v[2] = v[2];
p->v[3] = v[3];
}
#endif
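/* Note: Xxh64State_UpdateBlocks() consumes exactly the bytes in [data, end),
   which must be a nonzero multiple of 32; there is no partial-block handling
   here. Tail bytes are buffered by Xxh64_Update() below. */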
UInt64 Xxh64State_Digest(const CXxh64State *p, const void *_data, UInt64 count)
{
UInt64 h = p->v[2];
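/* after at least one full 32-byte block, merge the four lane accumulators;
   otherwise start from the seed (v[2]) plus PRIME64_5 */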
if (count >= 32)
{
h = Z7_ROTL64(p->v[0], 1) +
Z7_ROTL64(p->v[1], 7) +
Z7_ROTL64(h, 12) +
Z7_ROTL64(p->v[3], 18);
h = Xxh64_Merge(h, p->v[0]);
h = Xxh64_Merge(h, p->v[1]);
h = Xxh64_Merge(h, p->v[2]);
h = Xxh64_Merge(h, p->v[3]);
}
else
h += Z7_XXH_PRIME64_5;
h += count;
// XXH64_finalize():
{
unsigned cnt = (unsigned)count & 31;
const Byte *data = (const Byte *)_data;
while (cnt >= 8)
{
h ^= Xxh64_Round(0, GetUi64(data));
data += 8;
h = Z7_ROTL64(h, 27);
h = MY_MUL64(h, Z7_XXH_PRIME64_1) + Z7_XXH_PRIME64_4;
cnt -= 8;
}
if (cnt >= 4)
{
const UInt32 v = GetUi32(data);
data += 4;
h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_1);
h = Z7_ROTL64(h, 23);
h = MY_MUL64(h, Z7_XXH_PRIME64_2) + Z7_XXH_PRIME64_3;
cnt -= 4;
}
while (cnt)
{
const UInt32 v = *data++;
h ^= MY_MUL_32_64(v, Z7_XXH_PRIME64_5);
h = Z7_ROTL64(h, 11);
h = MY_MUL64(h, Z7_XXH_PRIME64_1);
cnt--;
}
// XXH64_avalanche(h):
h ^= h >> 33; h = MY_MUL64(h, Z7_XXH_PRIME64_2);
h ^= h >> 29; h = MY_MUL64(h, Z7_XXH_PRIME64_3);
h ^= h >> 32;
return h;
}
}
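/* A minimal one-shot sketch for short inputs (an illustration, not part of
   the original interface; the function name is hypothetical): for fewer than
   32 bytes no block pass is needed, so a freshly initialized state can be
   digested directly.

static UInt64 Xxh64_HashShort(const void *data, unsigned size)
{
  CXxh64State state;
  // requires size < 32: Xxh64State_Digest() finalizes only (count & 31) bytes
  Xxh64State_Init(&state);
  return Xxh64State_Digest(&state, data, size);
}
*/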
void Xxh64_Init(CXxh64 *p)
{
Xxh64State_Init(&p->state);
p->count = 0;
p->buf64[0] = 0;
p->buf64[1] = 0;
p->buf64[2] = 0;
p->buf64[3] = 0;
}
void Xxh64_Update(CXxh64 *p, const void *_data, size_t size)
{
const Byte *data = (const Byte *)_data;
unsigned cnt;
if (size == 0)
return;
cnt = (unsigned)p->count;
p->count += size;
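/* 1) top up a partially filled 32-byte buffer and flush it when it reaches 32 bytes */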
if (cnt &= 31)
{
unsigned rem = 32 - cnt;
Byte *dest = (Byte *)p->buf64 + cnt;
if (rem > size)
rem = (unsigned)size;
size -= rem;
cnt += rem;
// the loop below is equivalent to: memcpy(dest, data, rem);
do
*dest++ = *data++;
while (--rem);
if (cnt != 32)
return;
Xxh64State_UpdateBlocks(&p->state, p->buf64, &p->buf64[4]);
}
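/* 2) hash all whole 32-byte blocks directly from the caller's buffer */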
if (size &= ~(size_t)31)
{
Xxh64State_UpdateBlocks(&p->state, data, data + size);
data += size;
}
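/* 3) stash the remaining tail bytes for the next Update or the final digest */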
cnt = (unsigned)p->count & 31;
if (cnt)
{
// memcpy(p->buf64, data, cnt);
Byte *dest = (Byte *)p->buf64;
do
*dest++ = *data++;
while (--cnt);
}
}
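/* A minimal usage sketch of the streaming interface (an illustration, not
   part of this file; "message" is an arbitrary example buffer). The final
   value is produced from the buffered tail and the total byte count.

#include <stdio.h>

int main(void)
{
  CXxh64 hash;
  const Byte message[] = "xxh64 example";
  Xxh64_Init(&hash);
  Xxh64_Update(&hash, message, sizeof(message) - 1);
  {
    const UInt64 digest = Xxh64State_Digest(&hash.state, hash.buf64, hash.count);
    // print as two 32-bit halves to avoid 64-bit format-specifier portability issues
    printf("%08x%08x\n", (unsigned)(digest >> 32), (unsigned)digest);
  }
  return 0;
}
*/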