/* Sort.c -- Sort functions : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Sort.h"
#include "CpuArch.h"

#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
    || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
    )
  // the code with prefetch is slow for small arrays on x86.
  // So we disable prefetch for x86.
  #ifndef MY_CPU_X86
    // #pragma message("Z7_PREFETCH : __builtin_prefetch")
    #define Z7_PREFETCH(a) __builtin_prefetch((a))
  #endif
#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)

#include "7zWindows.h"

// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
// For example, clang-cl can generate the "prefetcht2" instruction for a
// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call,
// but we want to generate the "prefetcht0" instruction.
// So for CLANG/GCC we must use __builtin_prefetch() in the code branch above
// instead of PreFetchCacheLine() / _mm_prefetch().
// The new msvc-x86 compiler generates the "prefetcht0" instruction for a PreFetchCacheLine() call,
// but old x86 cpus don't support "prefetcht0".
// So we use PreFetchCacheLine() only if we are sure that the
// generated instruction is supported by all cpus of that isa.
#if defined(MY_CPU_AMD64) \
 || defined(MY_CPU_ARM64) \
 || defined(MY_CPU_IA64)
// we need additional braces for (a) in the PreFetchCacheLine call, because
// the PreFetchCacheLine macro doesn't use braces:
//   #define PreFetchCacheLine(l, a)  _mm_prefetch((CHAR CONST *) a, l)
// #pragma message("Z7_PREFETCH : PreFetchCacheLine")
#define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
#endif

#endif // _WIN32


#define PREFETCH_NO(p,k,s,size)

#ifndef Z7_PREFETCH

#define SORT_PREFETCH(p,k,s,size)

#else

// #define PREFETCH_LEVEL 2 // use it if cache line is 32 bytes
#define PREFETCH_LEVEL 3    // it is fast for most cases (64-bytes cache line prefetch)
// #define PREFETCH_LEVEL 4 // it can be faster for a big array (128-bytes prefetch)

#if PREFETCH_LEVEL == 0

#define SORT_PREFETCH(p,k,s,size)

#else // PREFETCH_LEVEL != 0

/*
if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY):
  we prefetch one value per cache line.
  Use it if the array is aligned for cache line size (64 bytes),
  or if the array is small (less than L1 cache size).
if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY):
  we prefetch all cache lines that can be required.
  It can be faster for big unaligned arrays.
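
A worked example of the prefetch distance (derived from the macros below):
with PREFETCH_LEVEL == 3, the generic PREFETCH_OFFSET(k,s) is
((k) << 4) == k * 16. The descendants of node (k) that lie 4 levels below it
occupy indices [k * 16, k * 16 + 15]: 16 UInt32 values == 64 bytes, i.e.
exactly one cache line when the array is 64-byte aligned. So in the aligned
case a single prefetch per step covers every item that HeapSortDown can
reach a few iterations later.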
*/

#define USE_PREFETCH_FOR_ALIGNED_ARRAY

// s == k * 2
#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
// x86 supports (lea r1*8+offset)
#define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
#else
#define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
#endif

#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
#define PREFETCH_ADD_OFFSET 0
#else
// the last offset that can be required in a PREFETCH_LEVEL step:
#define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
#define PREFETCH_ADD_OFFSET (PREFETCH_RANGE / 2)
#endif

#if PREFETCH_LEVEL <= 3

#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2)); \
  }}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
    Z7_PREFETCH((p + s2)); \
  }}
#endif

#else // PREFETCH_LEVEL > 3

#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - 16)); \
    Z7_PREFETCH((p + s2)); \
  }}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
    Z7_PREFETCH((p + s2)); \
  }}
#endif

#endif // PREFETCH_LEVEL > 3

#endif // PREFETCH_LEVEL != 0
#endif // Z7_PREFETCH


#if defined(MY_CPU_ARM64) \
    /* || defined(MY_CPU_AMD64) */ \
    /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
// we want to use cmov only if cmov is very fast:
// - this cmov version is slower for clang-x64.
// - this cmov version is faster for gcc-arm64 on some fast arm64 cpus.
#define Z7_FAST_CMOV_SUPPORTED
#endif

#ifdef Z7_FAST_CMOV_SUPPORTED
// we use this branch if cmov is fast: new arm64 cpus.
// we want the compiler to use a conditional move here:
#define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
#else
// we use this branch if the cpu doesn't support fast conditional move.
// it uses a slow array access read:
#define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
#endif

#define HeapSortDown(p, k, size, temp, macro_prefetch) \
{ \
  for (;;) { \
    UInt32 n0, n1; \
    size_t s = k * 2; \
    if (s >= size) { \
      if (s == size) { \
        /* only the left child (p[s]) exists: */ \
        n0 = p[s]; \
        p[k] = n0; \
        if (temp < n0) k = s; \
      } \
      break; \
    } \
    n0 = p[k * 2]; \
    n1 = p[k * 2 + 1]; \
    s += n0 < n1;  /* (s) : the index of the larger child */ \
    GET_MAX_VAL(n0, n1, p[s]) \
    if (temp >= n0) break; \
    macro_prefetch(p, k, s, size) \
    p[k] = n0; \
    k = s; \
  } \
  p[k] = temp; \
}

/*
stage-1 : O(n) : we generate an intermediate partially sorted binary tree:
  p[0] : an additional item for better alignment of the tree structure in memory.
  p[1]
  p[2] p[3]
  p[4] p[5] p[6] p[7]
  ...
  p[x] >= p[x * 2]
  p[x] >= p[x * 2 + 1]

stage-2 : O(n) * log2(n) : we move the largest item p[0] from the head of the tree
  to the end of the array, and insert the last item into the sorted binary tree.
*/

// (p) must be aligned for cache line size (64 bytes) for best performance

void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
{
  if (size < 2)
    return;
  if (size == 2)
  {
    const UInt32 a0 = p[0];
    const UInt32 a1 = p[1];
    const unsigned k = a1 < a0;
    p[k] = a0;
    p[k ^ 1] = a1;
    return;
  }
  {
    // stage-1 : O(n)
    // we transform the array to a partially sorted binary tree.
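    // Bottom-up heapify: after the decrement below, (i) starts at the parent
    // of the last item, and each internal node i, i-1, ..., 1 is sifted down
    // toward the leaves.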
    size_t i = --size / 2;
    // (size) now is the index of the last item in the tree.
    // if (i)
    {
      do
      {
        const UInt32 temp = p[i];
        size_t k = i;
        HeapSortDown(p, k, size, temp, PREFETCH_NO)
      }
      while (--i);
    }
    {
      // make the extra item p[0] the largest: sift p[0] down from p[1]:
      const UInt32 temp = p[0];
      const UInt32 a1 = p[1];
      if (temp < a1)
      {
        size_t k = 1;
        p[0] = a1;
        HeapSortDown(p, k, size, temp, PREFETCH_NO)
      }
    }
  }

  if (size < 3)
  {
    // size == 2 : 3 items total with p[0] >= p[1] >= p[2],
    // so one swap sorts them:
    const UInt32 a0 = p[0];
    p[0] = p[2];
    p[2] = a0;
    return;
  }

  if (size != 3)
  {
    // stage-2 : O(size) * log2(size) :
    // we move the largest item p[0] from the head to the end of the array,
    // and insert the last item into the sorted binary tree.
    do
    {
      const UInt32 temp = p[size];
      size_t k = p[2] < p[3] ? 3 : 2;
      p[size--] = p[0];
      p[0] = p[1];
      p[1] = p[k];
      HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
    }
    while (size != 3);
  }

  {
    // final 4 items: p[0] and p[1] are the two largest;
    // place them at the top and order p[2], p[3]:
    const UInt32 a2 = p[2];
    const UInt32 a3 = p[3];
    const size_t k = a2 < a3;
    p[2] = p[1];
    p[3] = p[0];
    p[k] = a3;
    p[k ^ 1] = a2;
  }
}
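
#if 0
// Usage sketch (not part of the original sources): HeapSort() sorts a plain
// UInt32 array in place, in ascending order. The demo function name is
// arbitrary; enable this block and call it from a main() to try it.
#include <stdio.h>
static void HeapSort_Demo(void)
{
  UInt32 a[8] = { 5, 3, 8, 1, 9, 2, 7, 4 };
  size_t i;
  HeapSort(a, 8);
  // prints: 1 2 3 4 5 7 8 9
  for (i = 0; i < 8; i++)
    printf("%u ", (unsigned)a[i]);
  printf("\n");
}
#endif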