; SortTest.asm -- ASM version of HeapSort() function
; Igor Pavlov : Public domain

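; For orientation, this routine mirrors (in registers) roughly the portable C
; interface below -- a sketch; the canonical scalar version lives in C/Sort.c
; of the 7-Zip sources:
;
;   void HeapSort(UInt32 *p, size_t size)
;   {
;     /* build a max-heap over the items, then repeatedly move the current */
;     /* maximum to the end and sift the displaced key back down           */
;   }
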
include ../../../../Asm/x86/7zAsm.asm

MY_ASM_START

ifndef Z7_SORT_ASM_USE_SEGMENT
if (IS_LINUX gt 0)
; Z7_SORT_ASM_USE_SEGMENT equ 1
else
; Z7_SORT_ASM_USE_SEGMENT equ 1
endif
endif

ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT SEGMENT ALIGN(64) 'CODE'
MY_ALIGN macro num:req
        align   num
endm
else
MY_ALIGN macro num:req
        ; We expect that ".text" is 16-byte aligned,
        ; so we don't need larger alignment inside our function.
        align   16
endm
endif

MY_ALIGN_16 macro
        MY_ALIGN 16
endm

MY_ALIGN_32 macro
        MY_ALIGN 32
endm

MY_ALIGN_64 macro
        MY_ALIGN 64
endm

ifdef x64

NUM_PREFETCH_LEVELS equ 3 ; to prefetch one 64-byte line (good for most cases)
; NUM_PREFETCH_LEVELS equ 4 ; to prefetch two 64-byte lines (better for big arrays)

acc        equ x0
k          equ r0
k_x        equ x0

p          equ r1

s          equ r2
s_x        equ x2

a0         equ x3
t0         equ a0

a3         equ x5
qq         equ a3

a1         equ x6
t1         equ a1
t1_r       equ r6

a2         equ x7
t2         equ a2

i          equ r8
e0         equ x8

e1         equ x9

num_last   equ r10
num_last_x equ x10

next4_lim  equ r11
pref_lim   equ r12

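; Register roles in the x64 path (the names above are the r0/x0-style aliases
; from 7zAsm.asm; x* is the 32-bit view, r* the 64-bit view of a register):
;   p                    - pointer to the UInt32 array
;   k / s                - parent node index / child node index (s == 2*k or 2*k + 1)
;   qq                   - the key currently being sifted down
;   num_last             - index of the last valid item
;   next4_lim, pref_lim  - level limits for the leaf phase and the prefetch phase
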
SORT_2_WITH_TEMP_REG macro b0, b1, temp_reg
        mov     temp_reg, b0
        cmp     b0, b1
        cmovae  b0, b1          ; min
        cmovae  b1, temp_reg    ; max
endm

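; A minimal C sketch of this branchless compare-exchange (assuming unsigned
; 32-bit keys; the two cmovae's perform the swap without a branch):
;
;   temp_reg = b0;
;   if (b0 >= b1) { b0 = b1; b1 = temp_reg; }  /* b0 = min, b1 = max */
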
SORT macro b0, b1
        SORT_2_WITH_TEMP_REG b0, b1, acc
endm

LOAD macro dest:req, index:req
        mov     dest, [p + 4 * index]
endm

STORE macro reg:req, index:req
        mov     [p + 4 * index], reg
endm

if (NUM_PREFETCH_LEVELS gt 3)
num_prefetches equ (1 SHL (NUM_PREFETCH_LEVELS - 3))
else
num_prefetches equ 1
endif

PREFETCH_OP macro offs
cur_offset = 7 * 4 ; the average offset within a 64-byte cache line
; cur_offset = 0 ; zero offset can be used, if we are sure the array is 64-byte aligned
 rept num_prefetches
if 1
        prefetcht0 byte ptr [p + offs + cur_offset]
else
        mov     pref_x, dword ptr [p + offs + cur_offset]
endif
cur_offset = cur_offset + 64
 endm
endm

PREFETCH_MY macro
if 1
  if 1
        shl     k, NUM_PREFETCH_LEVELS + 3
  else
        ; we delay the prefetch instruction to prioritize the main loads
        shl     k, NUM_PREFETCH_LEVELS
        shl     k, 3
        ; shl     k, 0
  endif
        PREFETCH_OP k
elseif 1
        shl     k, 3
        PREFETCH_OP k * (1 SHL NUM_PREFETCH_LEVELS) ; change it
endif
endm

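; In intrinsic terms the prefetch target is roughly the following (a sketch;
; the distance is tuned via NUM_PREFETCH_LEVELS, and the +28 matches the
; (7 * 4) mid-line cur_offset used in PREFETCH_OP):
;
;   _mm_prefetch((const char *)p + ((size_t)k << (NUM_PREFETCH_LEVELS + 3)) + 28,
;                _MM_HINT_T0);  /* line(s) holding descendants several levels below k */
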
STEP_1 macro exit_label, prefetch_macro
use_cmov_1 equ 1 ; set 1 for cmov, but it's slower in some cases
                 ; set 0 for LOAD after (adc s, 0)
        cmp     t0, t1
if use_cmov_1
        cmovb   t0, t1
        ; STORE   t0, k
endif
        adc     s, 0
if use_cmov_1 eq 0
        LOAD    t0, s
endif
        cmp     qq, t0
        jae     exit_label
if 1 ; use_cmov_1 eq 0
        STORE   t0, k
endif
        prefetch_macro
        mov     t0, [p + s * 8]
        mov     t1, [p + s * 8 + 4]
        mov     k, s
        add     s, s                    ; slower for some cpus
        ; lea     s, dword ptr [s + s]  ; slower for some cpus
        ; shl     s, 1                  ; faster for some cpus
        ; lea     s, dword ptr [s * 2]  ; faster for some cpus
 rept 0 ; 1000 for debug : 0 for normal
        ; number of calls in the generate stage : ~0.6 of the number of items
        shl     k, 0
 endm
endm

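; One STEP in C terms (a sketch): t0/t1 hold the two children p[s] and
; p[s + 1]; the carry flag of "cmp t0, t1" is folded into the child index
; by the adc:
;
;   s += (t0 < t1);              /* index of the max child          */
;   t0 = (t0 < t1) ? t1 : t0;    /* value of the max child (cmovb)  */
;   if (qq >= t0) goto exit;     /* heap property restored          */
;   p[k] = t0;                   /* move the child up               */
;   k = s; s = 2 * s;            /* descend one level               */
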
STEP_2 macro exit_label, prefetch_macro
use_cmov_2 equ 0 ; set 1 for cmov, but it's slower in some cases
                 ; set 0 for LOAD after (adc s, 0)
        cmp     t0, t1
if use_cmov_2
        mov     t2, t0
        cmovb   t2, t1
        ; STORE   t2, k
endif
        mov     t0, [p + s * 8]
        mov     t1, [p + s * 8 + 4]
        cmovb   t0, [p + s * 8 + 8]
        cmovb   t1, [p + s * 8 + 12]
        adc     s, 0
if use_cmov_2 eq 0
        LOAD    t2, s
endif
        cmp     qq, t2
        jae     exit_label
if 1 ; use_cmov_2 eq 0
        STORE   t2, k
endif
        prefetch_macro
        mov     k, s
        ; add     s, s
        ; lea     s, [s + s]
        shl     s, 1
        ; lea     s, [s * 2]
endm

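; STEP_2 differs from STEP_1 by software pipelining: it loads BOTH candidate
; child pairs of the next level ([p + s * 8] and [p + s * 8 + 8]) before the
; exit test, then uses cmovb (the flags from "cmp t0, t1" are still live) to
; keep the pair that belongs to the max child. This hides load latency at the
; cost of one possibly wasted pair of loads.
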
MOVE_SMALLEST_UP macro STEP, use_prefetch, num_unrolls
LOCAL exit_1, exit_2, leaves, opt_loop, last_nodes

        ; s == k * 2
        ; t0 == (p)[s]
        ; t1 == (p)[s + 1]
        cmp     k, next4_lim
        jae     leaves

 rept num_unrolls
        STEP    exit_2
        cmp     k, next4_lim
        jae     leaves
 endm

if use_prefetch
prefetch_macro equ PREFETCH_MY
pref_lim_2     equ pref_lim
        ; lea     pref_lim, dword ptr [num_last + 1]
        ; shr     pref_lim, NUM_PREFETCH_LEVELS + 1
        cmp     k, pref_lim_2
        jae     last_nodes
else
prefetch_macro equ
pref_lim_2     equ next4_lim
endif

MY_ALIGN_16
opt_loop:
        STEP    exit_2, prefetch_macro
        cmp     k, pref_lim_2
        jb      opt_loop

last_nodes:
        ; k >= pref_lim_2
        ; 2 cases are possible:
        ;   case-1: num_after_prefetch_levels == 0 && next4_lim = pref_lim_2
        ;   case-2: num_after_prefetch_levels == NUM_PREFETCH_LEVELS - 1 &&
        ;           next4_lim = pref_lim_2 / (NUM_PREFETCH_LEVELS - 1)
if use_prefetch
yyy = NUM_PREFETCH_LEVELS - 1
 while yyy
yyy = yyy - 1
        STEP    exit_2
  if yyy
        cmp     k, next4_lim
        jae     leaves
  endif
 endm
endif

leaves:
        ; k >= next4_lim == (num_last + 1) / 4 must be guaranteed by the preceding code.
        ; we always have 2 nodes in the (s) level
        ; we can have some nodes in the (s * 2) level : a low-probability case
        ; we have no nodes in the (s * 4) level
        ; s == k * 2
        ; t0 == (p)[s]
        ; t1 == (p)[s + 1]
        cmp     t0, t1
        cmovb   t0, t1
        adc     s, 0
        STORE   t0, k

        ; t0 == (p)[s]
        ; s / 2 == k : (s) is the index of the max item from (p)[k * 2], (p)[k * 2 + 1]
        ; 3 cases are possible here:
        ;   s * 2 >  num_last : the (s) node has no children
        ;   s * 2 == num_last : the (s) node has 1 leaf child that is the last item of the array
        ;   s * 2 <  num_last : the (s) node has 2 leaf children. We guarantee (s * 4 > num_last)
        ; we check for (s * 2 > num_last) before the "cmp qq, t0" check, because
        ; we will replace the conditional jump with a cmov instruction later.
        lea     t1_r, dword ptr [s + s]
        cmp     t1_r, num_last
        ja      exit_1 ; if (s * 2 > num_last), we have no children : the high-probability branch

        ; the low-probability branch
        ; s * 2 <= num_last
        cmp     qq, t0
        jae     exit_2

        ; qq < t0, so we go to the next level
        ; we check 1 or 2 children in the next level
        mov     t0, [p + s * 8]
        mov     k, s
        mov     s, t1_r
        cmp     t1_r, num_last
        je      @F ; (s == num_last) means that we have a single child in the tree

        ; (s < num_last) : so we must read both children and select the max of them.
        mov     t1, [p + k * 8 + 4]
        cmp     t0, t1
        cmovb   t0, t1
        adc     s, 0
@@:
        STORE   t0, k
exit_1:
        ; t0 == (p)[s], s / 2 == k : (s) is the index of the max item from (p)[k * 2], (p)[k * 2 + 1]
        cmp     qq, t0
        cmovb   k, s
exit_2:
        STORE   qq, k
endm

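; The macro above is one full sift-down in three phases: an optionally
; prefetching inner-node loop (opt_loop), a short straight-line run for the
; last NUM_PREFETCH_LEVELS - 1 prefetched levels (last_nodes), and
; branch-minimized handling of the final one or two levels (leaves), ending
; with the placement of qq at exit_2.
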
ifdef Z7_SORT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
MY_ALIGN_16
endif

MY_PROC HeapSort, 2

if (IS_LINUX gt 0)
        mov     p, REG_ABI_PARAM_0        ; r1 <- r7 : linux
endif
        mov     num_last, REG_ABI_PARAM_1 ; r10 <- r6 : linux
                                          ; r10 <- r2 : win64
        cmp     num_last, 2
        jb      end_1

        ; MY_PUSH_PRESERVED_ABI_REGS
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
        push    r12

        cmp     num_last, 4
        ja      sort_5

        LOAD    a0, 0
        LOAD    a1, 1
        SORT    a0, a1
        cmp     num_last, 3
        jb      end_2

        LOAD    a2, 2
        je      sort_3

        LOAD    a3, 3
        SORT    a2, a3
        SORT    a1, a3
        STORE   a3, 3
sort_3:
        SORT    a0, a2
        SORT    a1, a2
        STORE   a2, 2
        jmp     end_2

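; The (size <= 4) path above is a small sorting network:
;   SORT(0,1); SORT(2,3); SORT(1,3); SORT(0,2); SORT(1,2)
; i.e. the classic 5-comparator network that fully sorts 4 keys (the 2- and
; 3-item cases fall out by skipping comparators).
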
sort_5:
        ; (num_last > 4) is required here
        ; if (num_last >= 6) : we will use the optimized leaf-node loop loop_down_1
        mov     next4_lim, num_last
        shr     next4_lim, 2

        dec     num_last
        mov     k, num_last
        shr     k, 1
        mov     i, num_last
        shr     i, 2
        test    num_last, 1
        jnz     size_even

        ; ODD number of items, so we compare the parent with its single child
        LOAD    t1, num_last
        LOAD    t0, k
        SORT_2_WITH_TEMP_REG t1, t0, t2
        STORE   t1, num_last
        STORE   t0, k
        dec     k

size_even:
        cmp     k, i
        jbe     loop_down ; jump for the (num_last == 4) case

if 0 ; 1 for debug
        mov     r15, k
        mov     r14d, 1 ; 100
loop_benchmark:
endif
        ; optimized loop for leaf nodes:
        mov     t0, [p + k * 8]
        mov     t1, [p + k * 8 + 4]

MY_ALIGN_16
loop_down_1:
        ; we compare the parent with the max of its children:
        ; lea     s, dword ptr [2 * k]
        mov     s, k
        cmp     t0, t1
        cmovb   t0, t1
        adc     s, s
        LOAD    t2, k
        STORE   t0, k
        cmp     t2, t0
        cmovae  s, k
        dec     k
        ; we preload the next items before the STORE to the calculated address
        mov     t0, [p + k * 8]
        mov     t1, [p + k * 8 + 4]
        STORE   t2, s
        cmp     k, i
        jne     loop_down_1

if 0 ; 1 for debug
        mov     k, r15
        dec     r14d
        jnz     loop_benchmark
        ; jmp end_debug
endif

MY_ALIGN_16
loop_down:
        mov     t0, [p + i * 8]
        mov     t1, [p + i * 8 + 4]
        LOAD    qq, i
        mov     k, i
        lea     s, dword ptr [i + i]
        ; jmp end_debug
DOWN_use_prefetch equ 0
DOWN_num_unrolls  equ 0
        MOVE_SMALLEST_UP STEP_1, DOWN_use_prefetch, DOWN_num_unrolls
        sub     i, 1
        jnb     loop_down

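; Heap construction above is the classic bottom-up (Floyd) build, in C terms
; (a sketch):
;
;   for (i = size / 2; i >= 1; i--)
;     SiftDown(p, i, size);
;
; with the upper half of those iterations specialized in loop_down_1, since a
; node whose children are both leaves (k > num_last / 4) can sink at most one
; level.
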
        ; jmp end_debug
        LOAD    e0, 0
        LOAD    e1, 1

LEVEL_3_LIMIT equ 8 ; 8 is the default, but 7 can also work

        cmp     num_last, LEVEL_3_LIMIT + 1
        jb      main_loop_sort_5

MY_ALIGN_16
main_loop_sort:
        ; num_last > LEVEL_3_LIMIT
        ; p[size--] = p[0];
        LOAD    qq, num_last
        STORE   e0, num_last
        mov     e0, e1

        mov     next4_lim, num_last
        shr     next4_lim, 2
        mov     pref_lim, num_last
        shr     pref_lim, NUM_PREFETCH_LEVELS + 1

        dec     num_last
if 0 ; 1 for debug
        ; this optional optimization can improve performance, if the array contains identical items:
        ;   3x improvement    : if all items are identical
        ;   20% improvement   : if items differ by only 1 bit
        ;   1-10% improvement : if items differ by 2+ bits
        ;   no gain           : if the items are all different
        cmp     qq, e1
        jae     next_iter_main
endif
        LOAD    e1, 2
        LOAD    t0, 3
        mov     k_x, 2
        cmp     e1, t0
        cmovb   e1, t0
        mov     t0, [p + 4 * (4 + 0)]
        mov     t1, [p + 4 * (4 + 1)]
        cmovb   t0, [p + 4 * (4 + 2)]
        cmovb   t1, [p + 4 * (4 + 3)]
        adc     k_x, 0
        ; (qq <= e1), because the tree is correctly sorted
        ; also here we could check (qq >= e1) or (qq == e1) for a faster exit
        lea     s, dword ptr [k + k]
MAIN_use_prefetch equ 1
MAIN_num_unrolls  equ 0
        MOVE_SMALLEST_UP STEP_2, MAIN_use_prefetch, MAIN_num_unrolls

next_iter_main:
        cmp     num_last, LEVEL_3_LIMIT
        jne     main_loop_sort

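; The extraction phase above, in C terms (a sketch; e0/e1 cache the top two
; keys in registers across iterations):
;
;   while (num_last > LEVEL_3_LIMIT) {
;     UInt32 qq = p[num_last];   /* key displaced from the last slot       */
;     p[num_last--] = e0;        /* current maximum goes to its final slot */
;     /* sift qq down from the root, starting at the level of indices 2, 3, */
;     /* since the new top (e1) is already known to satisfy qq <= e1        */
;   }
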
        ; num_last == LEVEL_3_LIMIT
main_loop_sort_5:
        ; 4 <= num_last <= LEVEL_3_LIMIT
        ; p[size--] = p[0];
        LOAD    qq, num_last
        STORE   e0, num_last
        mov     e0, e1
        dec     num_last_x

        LOAD    e1, 2
        LOAD    t0, 3
        mov     k_x, 2
        cmp     e1, t0
        cmovb   e1, t0
        adc     k_x, 0

        lea     s_x, dword ptr [k * 2]
        cmp     s_x, num_last_x
        ja      exit_2

        mov     t0, [p + k * 8]
        je      exit_1

        ; s < num_last
        mov     t1, [p + k * 8 + 4]
        cmp     t0, t1
        cmovb   t0, t1
        adc     s_x, 0
exit_1:
        STORE   t0, k
        cmp     qq, t0
        cmovb   k_x, s_x
exit_2:
        STORE   qq, k
        cmp     num_last_x, 3
        jne     main_loop_sort_5

        ; num_last == 3 (real_size == 4)
        LOAD    a0, 2
        LOAD    a1, 3
        STORE   e1, 2
        STORE   e0, 3
        SORT    a0, a1
end_2:
        STORE   a0, 0
        STORE   a1, 1
        ; end_debug:
        ; MY_POP_PRESERVED_ABI_REGS
        pop     r12
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP

else
; ------------ x86 32-bit ------------

ifdef x64
IS_CDECL = 0
endif

acc        equ x0
k          equ r0
k_x        equ acc

p          equ r1

num_last   equ r2
num_last_x equ x2

a0         equ x3
t0         equ a0

a3         equ x5
i          equ r5
e0         equ a3

a1         equ x6
qq         equ a1

a2         equ x7
s          equ r7
s_x        equ a2

SORT macro b0, b1
        cmp     b1, b0
        jae     @F
if 1
        xchg    b0, b1
else
        mov     acc, b0
        mov     b0, b1  ; min
        mov     b1, acc ; max
endif
@@:
endm

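; Unlike the x64 path, this 32-bit SORT uses a forward branch around xchg
; instead of the cmov pair (acc doubles as k_x here, so a branchless swap
; would need another temporary). The disabled "else" variant shows the
; mov-based swap for CPUs where xchg reg,reg is slower.
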
LOAD macro dest:req, index:req
        mov     dest, [p + 4 * index]
endm

STORE macro reg:req, index:req
        mov     [p + 4 * index], reg
endm

STEP_1 macro exit_label
        mov     t0, [p + k * 8]
        cmp     t0, [p + k * 8 + 4]
        adc     s, 0
        LOAD    t0, s
        STORE   t0, k ; we store ahead, for the most expected branch
        cmp     qq, t0
        jae     exit_label
        ; STORE   t0, k ; use if
        mov     k, s
        add     s, s
        ; lea     s, dword ptr [s + s]
        ; shl     s, 1
        ; lea     s, dword ptr [s * 2]
endm

STEP_BRANCH macro exit_label
        mov     t0, [p + k * 8]
        cmp     t0, [p + k * 8 + 4]
        jae     @F
        inc     s
        mov     t0, [p + k * 8 + 4]
@@:
        cmp     qq, t0
        jae     exit_label
        STORE   t0, k
        mov     k, s
        add     s, s
endm

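; STEP_BRANCH replaces the adc-based child selection of STEP_1 with a
; conditional branch; per the "branch data prefetch" comment at
; MAIN_num_unrolls below, this form wins on large arrays (> (1 << 13) items),
; where a predicted branch lets the likely child's load start earlier.
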
MOVE_SMALLEST_UP macro STEP, num_unrolls, exit_2
LOCAL leaves, opt_loop, single

        ; s == k * 2
 rept num_unrolls
        cmp     s, num_last
        jae     leaves
        STEP_1  exit_2
 endm
        cmp     s, num_last
        jb      opt_loop

leaves:
        ; (s >= num_last)
        jne     exit_2
single:
        ; (s == num_last)
        mov     t0, [p + k * 8]
        cmp     qq, t0
        jae     exit_2
        STORE   t0, k
        mov     k, s
        jmp     exit_2

MY_ALIGN_16
opt_loop:
        STEP    exit_2
        cmp     s, num_last
        jb      opt_loop
        je      single
exit_2:
        STORE   qq, k
endm

ifdef Z7_SORT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
MY_ALIGN_16
endif

MY_PROC HeapSort, 2
ifdef x64
  if (IS_LINUX gt 0)
        mov     num_last, REG_ABI_PARAM_1 ; r2 <- r6 : linux
        mov     p, REG_ABI_PARAM_0        ; r1 <- r7 : linux
  endif
elseif (IS_CDECL gt 0)
        mov     num_last, [r4 + REG_SIZE * 2]
        mov     p, [r4 + REG_SIZE * 1]
endif
        cmp     num_last, 2
        jb      end_1
        MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11

        cmp     num_last, 4
        ja      sort_5

        LOAD    a0, 0
        LOAD    a1, 1
        SORT    a0, a1
        cmp     num_last, 3
        jb      end_2

        LOAD    a2, 2
        je      sort_3

        LOAD    a3, 3
        SORT    a2, a3
        SORT    a1, a3
        STORE   a3, 3
sort_3:
        SORT    a0, a2
        SORT    a1, a2
        STORE   a2, 2
        jmp     end_2

sort_5:
        ; num_last > 4
        lea     i, dword ptr [num_last - 2]
        dec     num_last
        test    i, 1
        jz      loop_down

        ; single child
        mov     t0, [p + num_last * 4]
        mov     qq, [p + num_last * 2]
        dec     i
        cmp     qq, t0
        jae     loop_down

        mov     [p + num_last * 2], t0
        mov     [p + num_last * 4], qq

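; Addressing trick used above and in loop_down: items are 4 bytes and the
; parent of node n is n / 2, so for even n the parent's byte offset is
; (n / 2) * 4 == n * 2 -- hence the [p + num_last * 2] and [p + i * 2] forms.
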
MY_ALIGN_16
loop_down:
        mov     t0, [p + i * 4]
        cmp     t0, [p + i * 4 + 4]
        mov     k, i
        mov     qq, [p + i * 2]
        adc     k, 0
        LOAD    t0, k
        cmp     qq, t0
        jae     down_next
        mov     [p + i * 2], t0
        lea     s, dword ptr [k + k]

DOWN_num_unrolls equ 0
        MOVE_SMALLEST_UP STEP_1, DOWN_num_unrolls, down_exit_label
down_next:
        sub     i, 2
        jnb     loop_down
        ; jmp end_debug

        LOAD    e0, 0

MY_ALIGN_16
main_loop_sort:
        ; num_last > 3
        mov     t0, [p + 2 * 4]
        cmp     t0, [p + 3 * 4]
        LOAD    qq, num_last
        STORE   e0, num_last
        LOAD    e0, 1
        mov     s_x, 2
        mov     k_x, 1
        adc     s, 0
        LOAD    t0, s
        dec     num_last
        cmp     qq, t0
        jae     main_exit_label
        STORE   t0, 1
        mov     k, s
        add     s, s
if 1
        ; branch-based data prefetch mode :
        ; it's faster for large arrays (larger than (1 << 13) items).
MAIN_num_unrolls equ 10
STEP_LOOP        equ STEP_BRANCH
else
MAIN_num_unrolls equ 0
STEP_LOOP        equ STEP_1
endif

        MOVE_SMALLEST_UP STEP_LOOP, MAIN_num_unrolls, main_exit_label

        ; jmp end_debug
        cmp     num_last, 3
        jne     main_loop_sort

        ; num_last == 3 (real_size == 4)
        LOAD    a0, 2
        LOAD    a1, 3
        LOAD    a2, 1
        STORE   e0, 3 ; e0 is an alias for a3
        STORE   a2, 2
        SORT    a0, a1
end_2:
        STORE   a0, 0
        STORE   a1, 1
        ; end_debug:
        MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP

endif

ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT ENDS
endif

if 0

LEA_IS_D8 (R64) [R2 * 4 + 16]
  Lat : TP  : ports     : uarch
  2   : 1   :           : adl-e
  2   : 3   : p056      : adl-p
  1   : 2   : p15       : hsw-rocket
  1   : 2   : p01       : snb-ivb
  1   : 1   : p1        : conroe-wsm
  1   : 4   :           : zen3,zen4
  2   : 4   :           : zen1,zen2

LEA_B_IS (R64) [R2 + R3 * 4]
  Lat : TP  : ports     : uarch
  1   : 1   :           : adl-e
  2   : 3   : p056      : adl-p
  1   : 2   : p15       : hsw-rocket
  1   : 2   : p01       : snb-ivb
  1   : 1   : p1        : nhm-wsm
  1   : 1   : p0        : conroe-wsm
  1   : 4   :           : zen3,zen4
  2   : 2,4 :           : zen1,zen2

LEA_B_IS_D8 (R64) [R2 + R3 * 4 + 16]
  Lat : TP  : ports     : uarch
  2   : 1   :           : adl-e
  2   : 3   : p056      : adl-p
  1   : 2   : p15       : ice-rocket
  3   : 1   : p1/p15    : hsw-rocket
  3   : 1   : p01       : snb-ivb
  1   : 1   : p1        : nhm-wsm
  1   : 1   : p0        : conroe-wsm
  2,1 : 2   :           : zen3,zen4
  2   : 2   :           : zen1,zen2

CMOVB (R64, R64)
  Lat : TP  : ports     : uarch
  1,2 : 2   :           : adl-e
  1   : 2   : p06       : adl-p
  1   : 2   : p06       : bwd-rocket
  1,2 : 2   : p0156+p06 : hsw
  1,2 : 1.5 : p015+p05  : snb-ivb
  1,2 : 1   : p015+p05  : nhm
  1   : 1   : 2*p015    : conroe
  1   : 2   :           : zen3,zen4
  1   : 4   :           : zen1,zen2

ADC (R64, 0)
  Lat : TP  : ports     : uarch
  1,2 : 2   :           : adl-e
  1   : 2   : p06       : adl-p
  1   : 2   : p06       : bwd-rocket
  1   : 1.5 : p0156+p06 : hsw
  1   : 1.5 : p015+p05  : snb-ivb
  2   : 1   : 2*p015    : conroe-wsm
  1   : 2   :           : zen1,zen2,zen3,zen4

PREFETCHNTA : fetch data into a non-temporal cache structure close to the processor, minimizing cache pollution.
  L1 : Pentium3
  L2 : NetBurst
  L1, not L2 : Core Duo, Core 2, Atom processors
  L1, not L2, may fetch into L3 with fast replacement : Nehalem, Westmere, Sandy Bridge, ...
  NEHALEM : fills L1/L3, L1 LRU is not updated
  L3 with fast replacement : Xeon processors based on Nehalem, Westmere, Sandy Bridge, ...

PREFETCHT0 : fetch data into all cache levels.
PREFETCHT1 : fetch data into L2 and L3.

endif

end