/* Sort.c -- Sort functions
2025-07-05 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Sort.h"
#include "CpuArch.h"

#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
   || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
    )
// the code with prefetch is slow for small arrays on x86.
// So we disable prefetch for x86.
#ifndef MY_CPU_X86
// #pragma message("Z7_PREFETCH : __builtin_prefetch")
#define Z7_PREFETCH(a) __builtin_prefetch((a))
#endif

#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)

#include "7zWindows.h"

// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
// For example, clang-cl can generate "prefetcht2" instruction for
// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
// But we want to generate "prefetcht0" instruction.
// So for CLANG/GCC we must use __builtin_prefetch() in code branch above
// instead of PreFetchCacheLine() / _mm_prefetch().

// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call.
// But old x86 cpus don't support "prefetcht0".
// So we use PreFetchCacheLine() only if we are sure that the
// generated instruction is supported by all cpus of that isa.
#if defined(MY_CPU_AMD64) \
 || defined(MY_CPU_ARM64) \
 || defined(MY_CPU_IA64)
// we need to use additional braces for (a) in PreFetchCacheLine call, because
// PreFetchCacheLine macro doesn't use braces:
//   #define PreFetchCacheLine(l, a)  _mm_prefetch((CHAR CONST *) a, l)
// #pragma message("Z7_PREFETCH : PreFetchCacheLine")
#define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
#endif

#endif // _WIN32
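
// If neither branch above defined Z7_PREFETCH, SORT_PREFETCH below expands to
// nothing, and HeapSort is compiled without prefetch hints.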

#define PREFETCH_NO(p,k,s,size)

#ifndef Z7_PREFETCH

#define SORT_PREFETCH(p,k,s,size)

#else

// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes
#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch)
// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch)

#if PREFETCH_LEVEL == 0

#define SORT_PREFETCH(p,k,s,size)

#else // PREFETCH_LEVEL != 0

/*
  if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
    we prefetch one value per cache line.
    Use it if the array is aligned for cache line size (64 bytes)
    or if the array is small (less than L1 cache size).

  if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
    we prefetch all cache lines that can be required.
    it can be faster for big unaligned arrays.
*/

#define USE_PREFETCH_FOR_ALIGNED_ARRAY

// s == k * 2
#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
// x86 supports (lea r1*8+offset)
#define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
#else
#define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
#endif
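// For example, with the default PREFETCH_LEVEL 3 and s == k * 2:
// PREFETCH_OFFSET(k,s) == (k << 4) == (s << 3), the first index 3 tree levels below the child s.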

#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
#define PREFETCH_ADD_OFFSET 0
#else
// last offset that can be required in PREFETCH_LEVEL step:
#define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
#define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2
#endif
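// For PREFETCH_LEVEL 3: PREFETCH_RANGE == 15, so the indices that this lookahead
// step can require lie in [PREFETCH_OFFSET .. PREFETCH_OFFSET + 15]:
// 16 UInt32 items == 64 bytes == one cache line if the array is 64-byte aligned.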

#if PREFETCH_LEVEL <= 3

#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
    Z7_PREFETCH((p + s2)); \
}}
#endif

#else // PREFETCH_LEVEL > 3

#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - 16)); \
    Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
    Z7_PREFETCH((p + s2)); \
}}
#endif

#endif // PREFETCH_LEVEL > 3

#endif // PREFETCH_LEVEL != 0

#endif // Z7_PREFETCH
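
/*
  Illustrative expansion of SORT_PREFETCH for the default settings above
  (Z7_PREFETCH defined, PREFETCH_LEVEL == 3, USE_PREFETCH_FOR_ALIGNED_ARRAY):

    { const size_t s2 = (k << 4) + 0;
      if (s2 <= size) {
        Z7_PREFETCH((p + s2));
    }}
*/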

#if defined(MY_CPU_ARM64) \
    /* || defined(MY_CPU_AMD64) */ \
    /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
// we want to use cmov, if cmov is very fast:
// - this cmov version is slower for clang-x64.
// - this cmov version is faster for gcc-arm64 for some fast arm64 cpus.
#define Z7_FAST_CMOV_SUPPORTED
#endif

#ifdef Z7_FAST_CMOV_SUPPORTED
// we want to use cmov here, if cmov is fast: new arm64 cpus.
// we want the compiler to use a conditional move for this branch:
#define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
#else
// use this branch if the cpu doesn't support fast conditional move.
// it uses a slow array read instead:
#define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
#endif
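
// In both variants n0 ends up as the larger child value:
// the cmov variant compares the two already-loaded values (n0, n1),
// while the fallback variant re-reads the larger child from memory as p[s]
// ("s += n0 < n1;" below has already advanced s to the larger child).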

#define HeapSortDown(p, k, size, temp, macro_prefetch) \
{ \
  for (;;) { \
    UInt32 n0, n1; \
    size_t s = k * 2; \
    if (s >= size) { \
      if (s == size) { \
        n0 = p[s]; \
        p[k] = n0; \
        if (temp < n0) k = s; \
      } \
      break; \
    } \
    n0 = p[k * 2]; \
    n1 = p[k * 2 + 1]; \
    s += n0 < n1; \
    GET_MAX_VAL(n0, n1, p[s]) \
    if (temp >= n0) break; \
    macro_prefetch(p, k, s, size) \
    p[k] = n0; \
    k = s; \
  } \
  p[k] = temp; \
}
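
/*
  Illustrative sift-down with hypothetical values (size == 5, k == 1, temp == 2):
    before: p[1..5] = { _, 9, 7, 3, 8 }   ("_" is the slot being filled)
    step 1: larger child of node 1 is p[2] == 9 > temp : p[1] = 9, k = 2
    step 2: larger child of node 2 is p[5] == 8 > temp : p[2] = 8, k = 5
    node 5 has no children within size, so p[5] = temp
    after:  p[1..5] = { 9, 8, 7, 3, 2 }
*/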

/*
  stage-1 : O(n) :
  we generate an intermediate partially sorted binary tree:

  p[0] : it's an additional item for better alignment of the tree structure in memory.
  p[1]
  p[2]  p[3]
  p[4]  p[5]  p[6]  p[7]
  ...
  p[x] >= p[x * 2]
  p[x] >= p[x * 2 + 1]

  stage-2 : O(n) * log2(n) :
  we move the largest item p[0] from the head of the tree to the end of the array
  and insert the last item into the sorted binary tree.
*/
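
// For example, for x == 2 the invariant above means p[2] >= p[4] and p[2] >= p[5];
// in addition, after stage-1 the extra item p[0] holds the overall maximum.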

// (p) must be aligned for cache line size (64-bytes) for best performance
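
/*
  Minimal usage sketch (hypothetical caller, not part of this file):

    UInt32 a[5] = { 3, 1, 4, 1, 5 };
    HeapSort(a, 5);
    // a[] is now { 1, 1, 3, 4, 5 }  (ascending order)
*/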

void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
{
  if (size < 2)
    return;
  if (size == 2)
  {
    const UInt32 a0 = p[0];
    const UInt32 a1 = p[1];
    const unsigned k = a1 < a0;
    p[k] = a0;
    p[k ^ 1] = a1;
    return;
  }
  {
    // stage-1 : O(n)
    // we transform the array into a partially sorted binary tree.
    size_t i = --size / 2;
    // (size) now is the index of the last item in the tree.
    // if (i)
    {
      do
      {
        const UInt32 temp = p[i];
        size_t k = i;
        HeapSortDown(p, k, size, temp, PREFETCH_NO)
      }
      while (--i);
    }
    {
      const UInt32 temp = p[0];
      const UInt32 a1 = p[1];
      if (temp < a1)
      {
        size_t k = 1;
        p[0] = a1;
        HeapSortDown(p, k, size, temp, PREFETCH_NO)
      }
    }
  }
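
  // Illustrative stage-1 result for the hypothetical input { 3, 1, 4, 1, 5 }:
  // the array becomes { 5, 4, 3, 1, 1 }: p[0] holds the overall maximum and
  // p[x] >= p[x * 2], p[x * 2 + 1] holds inside the tree p[1..size].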

  if (size < 3)
  {
    // size == 2
    const UInt32 a0 = p[0];
    p[0] = p[2];
    p[2] = a0;
    return;
  }
  if (size != 3)
  {
    // stage-2 : O(size) * log2(size) :
    // we move the largest item p[0] from the head to the end of the array,
    // and insert the last item into the sorted binary tree.
    do
    {
      const UInt32 temp = p[size];
      size_t k = p[2] < p[3] ? 3 : 2;
      p[size--] = p[0];
      p[0] = p[1];
      p[1] = p[k];
      HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
    }
    while (size != 3);
  }
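
  // Finish the last 4 items directly: here p[0] >= p[1] >= p[2], p[3],
  // so the two smallest values (p[2], p[3]) go to p[0] and p[1] in order,
  // old p[1] goes to p[2], and old p[0] (the largest of the four) goes to p[3].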

  {
    const UInt32 a2 = p[2];
    const UInt32 a3 = p[3];
    const size_t k = a2 < a3;
    p[2] = p[1];
    p[3] = p[0];
    p[k] = a3;
    p[k ^ 1] = a2;
  }
}