Compare commits


No commits in common. "main" and "21.07" have entirely different histories.
main ... 21.07

1049 changed files with 40219 additions and 100531 deletions


@@ -1,12 +1,7 @@
; 7zAsm.asm -- ASM macros
; 2023-12-08 : Igor Pavlov : Public domain
; 2021-12-25 : Igor Pavlov : Public domain
; UASM can require these changes
; OPTION FRAMEPRESERVEFLAGS:ON
; OPTION PROLOGUE:NONE
; OPTION EPILOGUE:NONE
ifdef @wordsize
; @wordsize is defined only in JWASM and ASMC and is not defined in MASM
; @wordsize eq 8 for 64-bit x64
@@ -43,7 +38,7 @@ else
endif
endif
OPTION PROLOGUE:NONE
OPTION PROLOGUE:NONE
OPTION EPILOGUE:NONE
MY_ASM_START macro
@@ -121,29 +116,10 @@ endif
x2_H equ DH
x3_H equ BH
; r0_L equ AL
; r1_L equ CL
; r2_L equ DL
; r3_L equ BL
; r0_H equ AH
; r1_H equ CH
; r2_H equ DH
; r3_H equ BH
ifdef x64
x5_L equ BPL
x6_L equ SIL
x7_L equ DIL
x8_L equ r8b
x9_L equ r9b
x10_L equ r10b
x11_L equ r11b
x12_L equ r12b
x13_L equ r13b
x14_L equ r14b
x15_L equ r15b
r0 equ RAX
r1 equ RCX
@@ -172,22 +148,6 @@ else
r7 equ x7
endif
x0_R equ r0
x1_R equ r1
x2_R equ r2
x3_R equ r3
x4_R equ r4
x5_R equ r5
x6_R equ r6
x7_R equ r7
x8_R equ r8
x9_R equ r9
x10_R equ r10
x11_R equ r11
x12_R equ r12
x13_R equ r13
x14_R equ r14
x15_R equ r15
ifdef x64
ifdef ABI_LINUX
@@ -235,14 +195,6 @@ REG_ABI_PARAM_0 equ REG_PARAM_0
REG_ABI_PARAM_1_x equ REG_PARAM_1_x
REG_ABI_PARAM_1 equ REG_PARAM_1
MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
MY_PUSH_4_REGS
endm
MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
MY_POP_4_REGS
endm
else
; x64
@@ -304,25 +256,12 @@ endm
endif ; IS_LINUX
MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
MY_PUSH_PRESERVED_ABI_REGS macro
if (IS_LINUX gt 0)
MY_PUSH_2_REGS
else
MY_PUSH_4_REGS
endif
endm
MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
if (IS_LINUX gt 0)
MY_POP_2_REGS
else
MY_POP_4_REGS
endif
endm
MY_PUSH_PRESERVED_ABI_REGS macro
MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
push r12
push r13
push r14
@@ -335,7 +274,11 @@ MY_POP_PRESERVED_ABI_REGS macro
pop r14
pop r13
pop r12
MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
if (IS_LINUX gt 0)
MY_POP_2_REGS
else
MY_POP_4_REGS
endif
endm
endif ; x64


@@ -1,258 +1,180 @@
; 7zCrcOpt.asm -- CRC32 calculation : optimized version
; 2023-12-08 : Igor Pavlov : Public domain
; 2021-02-07 : Igor Pavlov : Public domain
include 7zAsm.asm
MY_ASM_START
NUM_WORDS equ 3
UNROLL_CNT equ 2
rD equ r2
rN equ r7
rT equ r5
if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
.err <NUM_WORDS_IS_INCORRECT>
endif
if (UNROLL_CNT lt 1)
.err <UNROLL_CNT_IS_INCORRECT>
ifdef x64
num_VAR equ r8
table_VAR equ r9
else
if (IS_CDECL gt 0)
crc_OFFS equ (REG_SIZE * 5)
data_OFFS equ (REG_SIZE + crc_OFFS)
size_OFFS equ (REG_SIZE + data_OFFS)
else
size_OFFS equ (REG_SIZE * 5)
endif
table_OFFS equ (REG_SIZE + size_OFFS)
num_VAR equ [r4 + size_OFFS]
table_VAR equ [r4 + table_OFFS]
endif
rD equ r2
rD_x equ x2
rN equ r7
rT equ r5
ifndef x64
if (IS_CDECL gt 0)
crc_OFFS equ (REG_SIZE * 5)
data_OFFS equ (REG_SIZE + crc_OFFS)
size_OFFS equ (REG_SIZE + data_OFFS)
else
size_OFFS equ (REG_SIZE * 5)
endif
table_OFFS equ (REG_SIZE + size_OFFS)
endif
; rN + rD is the same speed as rD, but we save one instruction in the loop
SRCDAT_1 equ rN + rD * 1 + 1 *
SRCDAT_4 equ rN + rD * 1 + 4 *
SRCDAT equ rD + rN * 1 + 4 *
CRC macro op:req, dest:req, src:req, t:req
op dest, dword ptr [rT + @CatStr(src, _R) * 4 + 0400h * (t)]
op dest, DWORD PTR [rT + src * 4 + 0400h * t]
endm
CRC_XOR macro dest:req, src:req, t:req
CRC xor, dest, src, t
CRC xor, dest, src, t
endm
CRC_MOV macro dest:req, src:req, t:req
CRC mov, dest, src, t
CRC mov, dest, src, t
endm
MOVZXLO macro dest:req, src:req
movzx dest, @CatStr(src, _L)
endm
MOVZXHI macro dest:req, src:req
movzx dest, @CatStr(src, _H)
endm
; movzx x0, x0_L - is slow in some cpus (ivb), if same register for src and dest
; movzx x3, x0_L sometimes is 0 cycles latency (not always)
; movzx x3, x0_L sometimes is 0.5 cycles latency
; movzx x3, x0_H is 2 cycles latency in some cpus
CRC1b macro
movzx x6, byte ptr [rD]
MOVZXLO x3, x0
inc rD
shr x0, 8
xor x6, x3
CRC_XOR x0, x6, 0
dec rN
movzx x6, BYTE PTR [rD]
inc rD
movzx x3, x0_L
xor x6, x3
shr x0, 8
CRC xor, x0, r6, 0
dec rN
endm
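For reference, CRC1b above is the assembly form of the classic one-byte table update that the C side expresses as CRC_UPDATE_BYTE_2 in 7zCrc.c; a minimal C sketch of the same step (stdint.h types standing in for the SDK's 7zTypes.h):

#include <stdint.h>

/* One byte of table-driven CRC32: index the table with the low byte of
   (crc ^ b), then shift the remaining 24 CRC bits down. */
static uint32_t crc_update_byte(uint32_t crc, uint8_t b,
                                const uint32_t table[256])
{
    return table[(crc ^ b) & 0xFF] ^ (crc >> 8);
}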
LOAD_1 macro dest:req, t:req, iter:req, index:req
movzx dest, byte ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
endm
MY_PROLOG macro crc_end:req
LOAD_2 macro dest:req, t:req, iter:req, index:req
movzx dest, word ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
endm
CRC_QUAD macro nn, t:req, iter:req
ifdef x64
; paired memory loads give 1-3% speed gain, but they use more registers
LOAD_2 x3, t, iter, 0
LOAD_2 x9, t, iter, 2
MOVZXLO x6, x3
shr x3, 8
CRC_XOR nn, x6, t * 4 + 3
MOVZXLO x6, x9
shr x9, 8
CRC_XOR nn, x3, t * 4 + 2
CRC_XOR nn, x6, t * 4 + 1
CRC_XOR nn, x9, t * 4 + 0
elseif 0
LOAD_2 x3, t, iter, 0
MOVZXLO x6, x3
shr x3, 8
CRC_XOR nn, x6, t * 4 + 3
CRC_XOR nn, x3, t * 4 + 2
LOAD_2 x3, t, iter, 2
MOVZXLO x6, x3
shr x3, 8
CRC_XOR nn, x6, t * 4 + 1
CRC_XOR nn, x3, t * 4 + 0
elseif 0
LOAD_1 x3, t, iter, 0
LOAD_1 x6, t, iter, 1
CRC_XOR nn, x3, t * 4 + 3
CRC_XOR nn, x6, t * 4 + 2
LOAD_1 x3, t, iter, 2
LOAD_1 x6, t, iter, 3
CRC_XOR nn, x3, t * 4 + 1
CRC_XOR nn, x6, t * 4 + 0
else
; 32-bit load is better if there is only one read port (core2)
; but that code can be slower if there are 2 read ports (snb)
mov x3, dword ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + 0)]
MOVZXLO x6, x3
CRC_XOR nn, x6, t * 4 + 3
MOVZXHI x6, x3
shr x3, 16
CRC_XOR nn, x6, t * 4 + 2
MOVZXLO x6, x3
shr x3, 8
CRC_XOR nn, x6, t * 4 + 1
CRC_XOR nn, x3, t * 4 + 0
endif
endm
LAST equ (4 * (NUM_WORDS - 1))
CRC_ITER macro qq, nn, iter
mov nn, [SRCDAT_4 (NUM_WORDS * (1 + iter))]
i = 0
rept NUM_WORDS - 1
CRC_QUAD nn, i, iter
i = i + 1
endm
MOVZXLO x6, qq
mov x3, qq
shr x3, 24
CRC_XOR nn, x6, LAST + 3
CRC_XOR nn, x3, LAST + 0
ror qq, 16
MOVZXLO x6, qq
shr qq, 24
CRC_XOR nn, x6, LAST + 1
if ((UNROLL_CNT and 1) eq 1) and (iter eq (UNROLL_CNT - 1))
CRC_MOV qq, qq, LAST + 2
xor qq, nn
else
CRC_XOR nn, qq, LAST + 2
endif
endm
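CRC_QUAD and CRC_ITER implement table "slicing": each iteration folds NUM_WORDS 32-bit words through NUM_WORDS * 4 per-byte sub-tables, so the dependency chain advances a word at a time instead of a byte at a time. A hedged C sketch of the 4-byte case, matching the table layout used here (sub-table t starts at dword offset t * 256, and the highest-numbered sub-table takes the lowest byte, as in CRC_XOR):

#include <stddef.h>
#include <stdint.h>

/* Slicing-by-4, little-endian assumed: XOR one aligned 32-bit word into
   the CRC, then rebuild the CRC from four sub-table lookups, one per byte. */
static uint32_t crc_slice4(uint32_t crc, const uint32_t *p, size_t nwords,
                           const uint32_t table[4 * 256])
{
    while (nwords--)
    {
        crc ^= *p++;
        crc = table[0x300 + ( crc        & 0xFF)]
            ^ table[0x200 + ((crc >>  8) & 0xFF)]
            ^ table[0x100 + ((crc >> 16) & 0xFF)]
            ^ table[0x000 + ( crc >> 24        )];
    }
    return crc;
}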
; + 4 for prefetching next 4-bytes after current iteration
NUM_BYTES_LIMIT equ (NUM_WORDS * 4 * UNROLL_CNT + 4)
ALIGN_MASK equ 3
; MY_PROC @CatStr(CrcUpdateT, 12), 4
MY_PROC @CatStr(CrcUpdateT, %(NUM_WORDS * 4)), 4
MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
ifdef x64
mov x0, REG_ABI_PARAM_0_x ; x0 = x1(win) / x7(linux)
mov rT, REG_ABI_PARAM_3 ; r5 = r9(win) / x1(linux)
mov rN, REG_ABI_PARAM_2 ; r7 = r8(win) / r2(linux)
; mov rD, REG_ABI_PARAM_1 ; r2 = r2(win)
if (IS_LINUX gt 0)
MY_PUSH_2_REGS
mov x0, REG_ABI_PARAM_0_x ; x0 = x7
mov rT, REG_ABI_PARAM_3 ; r5 = r1
mov rN, REG_ABI_PARAM_2 ; r7 = r2
mov rD, REG_ABI_PARAM_1 ; r2 = r6
else
MY_PUSH_4_REGS
mov x0, REG_ABI_PARAM_0_x ; x0 = x1
mov rT, REG_ABI_PARAM_3 ; r5 = r9
mov rN, REG_ABI_PARAM_2 ; r7 = r8
; mov rD, REG_ABI_PARAM_1 ; r2 = r2
endif
else
MY_PUSH_4_REGS
if (IS_CDECL gt 0)
mov x0, [r4 + crc_OFFS]
mov rD, [r4 + data_OFFS]
else
mov x0, REG_ABI_PARAM_0_x
endif
mov rN, [r4 + size_OFFS]
mov rT, [r4 + table_OFFS]
mov rN, num_VAR
mov rT, table_VAR
endif
cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
jb crc_end
@@:
test rD_x, ALIGN_MASK ; test rD, ALIGN_MASK
jz @F
CRC1b
jmp @B
@@:
xor x0, dword ptr [rD]
lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
sub rD, rN
align 16
@@:
unr_index = 0
while unr_index lt UNROLL_CNT
if (unr_index and 1) eq 0
CRC_ITER x0, x1, unr_index
else
CRC_ITER x1, x0, unr_index
endif
unr_index = unr_index + 1
test rN, rN
jz crc_end
@@:
test rD, 7
jz @F
CRC1b
jnz @B
@@:
cmp rN, 16
jb crc_end
add rN, rD
mov num_VAR, rN
sub rN, 8
and rN, NOT 7
sub rD, rN
xor x0, [SRCDAT 0]
endm
add rD, NUM_WORDS * 4 * UNROLL_CNT
jnc @B
MY_EPILOG macro crc_end:req
xor x0, [SRCDAT 0]
mov rD, rN
mov rN, num_VAR
sub rN, rD
crc_end:
test rN, rN
jz @F
CRC1b
jmp crc_end
@@:
if (IS_X64 gt 0) and (IS_LINUX gt 0)
MY_POP_2_REGS
else
MY_POP_4_REGS
endif
endm
if 0
; byte version
add rD, rN
xor x0, dword ptr [rD]
add rN, NUM_BYTES_LIMIT - 1
else
; 4-byte version
add rN, 4 * NUM_WORDS * UNROLL_CNT
sub rD, 4 * NUM_WORDS * UNROLL_CNT
@@:
MOVZXLO x3, x0
MOVZXHI x1, x0
shr x0, 16
MOVZXLO x6, x0
shr x0, 8
CRC_MOV x0, x0, 0
CRC_XOR x0, x3, 3
CRC_XOR x0, x1, 2
CRC_XOR x0, x6, 1
MY_PROC CrcUpdateT8, 4
MY_PROLOG crc_end_8
mov x1, [SRCDAT 1]
align 16
main_loop_8:
mov x6, [SRCDAT 2]
movzx x3, x1_L
CRC_XOR x6, r3, 3
movzx x3, x1_H
CRC_XOR x6, r3, 2
shr x1, 16
movzx x3, x1_L
movzx x1, x1_H
CRC_XOR x6, r3, 1
movzx x3, x0_L
CRC_XOR x6, r1, 0
add rD, 4
if (NUM_WORDS * UNROLL_CNT) ne 1
jc @F
xor x0, [SRCDAT_4 0]
jmp @B
@@:
endif
add rD, rN
add rN, 4 - 1
endif
sub rN, rD
crc_end:
test rN, rN
jz func_end
@@:
CRC1b
jnz @B
mov x1, [SRCDAT 3]
CRC_XOR x6, r3, 7
movzx x3, x0_H
shr x0, 16
CRC_XOR x6, r3, 6
movzx x3, x0_L
CRC_XOR x6, r3, 5
movzx x3, x0_H
CRC_MOV x0, r3, 4
xor x0, x6
add rD, 8
jnz main_loop_8
func_end:
MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
MY_EPILOG crc_end_8
MY_ENDP
MY_PROC CrcUpdateT4, 4
MY_PROLOG crc_end_4
align 16
main_loop_4:
movzx x1, x0_L
movzx x3, x0_H
shr x0, 16
movzx x6, x0_H
and x0, 0FFh
CRC_MOV x1, r1, 3
xor x1, [SRCDAT 1]
CRC_XOR x1, r3, 2
CRC_XOR x1, r6, 0
CRC_XOR x1, r0, 1
movzx x0, x1_L
movzx x3, x1_H
shr x1, 16
movzx x6, x1_H
and x1, 0FFh
CRC_MOV x0, r0, 3
xor x0, [SRCDAT 2]
CRC_XOR x0, r3, 2
CRC_XOR x0, r6, 0
CRC_XOR x0, r1, 1
add rD, 8
jnz main_loop_4
MY_EPILOG crc_end_4
MY_ENDP
end


@ -1,5 +1,5 @@
; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function
; 2024-06-18: Igor Pavlov : Public domain
; 2021-07-21: Igor Pavlov : Public domain
;
ifndef x64
@@ -11,31 +11,10 @@ include 7zAsm.asm
MY_ASM_START
ifndef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
if (IS_LINUX gt 0)
Z7_LZ_FIND_OPT_ASM_USE_SEGMENT equ 1
else
Z7_LZ_FIND_OPT_ASM_USE_SEGMENT equ 1
endif
endif
ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'
MY_ALIGN macro num:req
align num
; align 16
endm
else
MY_ALIGN macro num:req
; We expect that ".text" is aligned for 16-bytes.
; So we don't need large alignment inside our function.
align 16
endm
endif
MY_ALIGN_16 macro
MY_ALIGN 16
endm
MY_ALIGN_32 macro
@@ -157,11 +136,7 @@ COPY_VAR_64 macro dest_var, src_var
endm
ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
MY_ALIGN_16
endif
MY_PROC GetMatchesSpecN_2, 13
MY_PUSH_PRESERVED_ABI_REGS
mov r0, RSP
@@ -533,8 +508,6 @@ fin:
MY_POP_PRESERVED_ABI_REGS
MY_ENDP
ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
_TEXT$LZFINDOPT ENDS
endif
end


@@ -1,5 +1,5 @@
; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
; 2024-06-18: Igor Pavlov : Public domain
; 2021-02-23: Igor Pavlov : Public domain
;
; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
; function for check at link time.
@@ -17,41 +17,11 @@ include 7zAsm.asm
MY_ASM_START
; if Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT is defined, we use additional SEGMENT with 64-byte alignment.
; if Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT is not defined, we use default SEGMENT (where default 16-byte alignment of segment is expected).
; The performance is almost identical in our tests.
; But the performance can depend on the position of lzmadec code inside the instruction cache
; or micro-op cache line (depending on low address bits in 32-byte/64-byte cache lines).
; And 64-byte alignment provides a more consistent speed regardless
; of the code's position in the executable.
; But it's also possible that code without Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT can be
; slightly faster than 64-byte aligned code in some cases, if the offset of lzmadec
; code in the 64-byte block after compilation happens to give better speed.
; Note that Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT adds an extra section to the ELF file.
; If you don't want to get that extra section, do not define Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT.
ifndef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
if (IS_LINUX gt 0)
Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT equ 1
else
Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT equ 1
endif
endif
ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
MY_ALIGN macro num:req
align num
; align 16
endm
else
MY_ALIGN macro num:req
; We expect that ".text" is aligned for 16-bytes.
; So we don't need large alignment inside our function.
align 16
endm
endif
MY_ALIGN_16 macro
MY_ALIGN 16
@@ -640,11 +610,7 @@ PARAM_lzma equ REG_ABI_PARAM_0
PARAM_limit equ REG_ABI_PARAM_1
PARAM_bufLimit equ REG_ABI_PARAM_2
ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
MY_ALIGN_16
endif
MY_PROC LzmaDec_DecodeReal_3, 3
MY_PUSH_PRESERVED_ABI_REGS
@@ -1332,8 +1298,6 @@ fin:
MY_POP_PRESERVED_ABI_REGS
MY_ENDP
ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
_TEXT$LZMADECOPT ENDS
endif
end


@@ -1,5 +1,5 @@
; Sha1Opt.asm -- SHA-1 optimized code for SHA-1 x86 hardware instructions
; 2024-06-16 : Igor Pavlov : Public domain
; 2021-03-10 : Igor Pavlov : Public domain
include 7zAsm.asm
@@ -20,7 +20,7 @@ MY_ASM_START
CONST SEGMENT READONLY
CONST SEGMENT
align 16
Reverse_Endian_Mask db 15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0


@@ -1,5 +1,5 @@
; Sha256Opt.asm -- SHA-256 optimized code for SHA-256 x86 hardware instructions
; 2024-06-16 : Igor Pavlov : Public domain
; 2021-03-10 : Igor Pavlov : Public domain
include 7zAsm.asm
@@ -20,7 +20,7 @@ endif
EXTRN K_CONST:xmmword
CONST SEGMENT READONLY
CONST SEGMENT
align 16
Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
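This mask feeds pshufb to turn little-endian 32-bit loads into the big-endian word order SHA-256 expects. A hedged intrinsics sketch of the same shuffle (the helper name is illustrative):

#include <tmmintrin.h>  /* SSSE3: _mm_shuffle_epi8 (pshufb) */

/* Byte-swap each 32-bit lane of a 16-byte block, mirroring
   Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12.
   _mm_set_epi8 lists bytes from lane 15 down to lane 0. */
static __m128i load_be32x4(const unsigned char *p)
{
    const __m128i mask = _mm_set_epi8(12, 13, 14, 15,  8,  9, 10, 11,
                                       4,  5,  6,  7,  0,  1,  2,  3);
    return _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)p), mask);
}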
@@ -54,20 +54,14 @@ ifndef x64
.686
.xmm
endif
; jwasm-based assemblers for linux and linker from new versions of binutils
; can generate incorrect code for load [ARRAY + offset] instructions.
; 22.00: we load K_CONST offset to (rTable) register to avoid jwasm+binutils problem
rTable equ r0
; rTable equ K_CONST
ifdef x64
rNum equ REG_ABI_PARAM_2
if (IS_LINUX eq 0)
LOCAL_SIZE equ (16 * 2)
endif
else
rNum equ r3
rNum equ r0
LOCAL_SIZE equ (16 * 1)
endif
@@ -109,18 +103,15 @@ MY_PROLOG macro
movdqa [r4 + 16], xmm9
endif
else ; x86
push r3
if (IS_CDECL gt 0)
mov rState, [r4 + REG_SIZE * 1]
mov rData, [r4 + REG_SIZE * 2]
mov rNum, [r4 + REG_SIZE * 3]
else ; fastcall
mov rNum, [r4 + REG_SIZE * 1]
endif
push r5
mov r5, r4
NUM_PUSH_REGS equ 2
PARAM_OFFSET equ (REG_SIZE * (1 + NUM_PUSH_REGS))
if (IS_CDECL gt 0)
mov rState, [r4 + PARAM_OFFSET]
mov rData, [r4 + PARAM_OFFSET + REG_SIZE * 1]
mov rNum, [r4 + PARAM_OFFSET + REG_SIZE * 2]
else ; fastcall
mov rNum, [r4 + PARAM_OFFSET]
endif
and r4, -16
sub r4, LOCAL_SIZE
endif
@@ -138,7 +129,6 @@ MY_EPILOG macro
else ; x86
mov r4, r5
pop r5
pop r3
endif
MY_ENDP
endm
@@ -181,7 +171,7 @@ pre2 equ 2
RND4 macro k
movdqa msg, xmmword ptr [rTable + (k) * 16]
movdqa msg, xmmword ptr [K_CONST + (k) * 16]
paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
MY_sha256rnds2 state0_N, state1_N
pshufd msg, msg, 0eH
@@ -220,8 +210,6 @@ endm
MY_PROC Sha256_UpdateBlocks_HW, 3
MY_PROLOG
lea rTable, [K_CONST]
cmp rNum, 0
je end_c


@@ -1,860 +0,0 @@
; SortTest.asm -- ASM version of HeapSort() function
; Igor Pavlov : Public domain
include ../../../../Asm/x86/7zAsm.asm
MY_ASM_START
ifndef Z7_SORT_ASM_USE_SEGMENT
if (IS_LINUX gt 0)
; Z7_SORT_ASM_USE_SEGMENT equ 1
else
; Z7_SORT_ASM_USE_SEGMENT equ 1
endif
endif
ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT SEGMENT ALIGN(64) 'CODE'
MY_ALIGN macro num:req
align num
endm
else
MY_ALIGN macro num:req
; We expect that ".text" is aligned for 16-bytes.
; So we don't need large alignment inside our function.
align 16
endm
endif
MY_ALIGN_16 macro
MY_ALIGN 16
endm
MY_ALIGN_32 macro
MY_ALIGN 32
endm
MY_ALIGN_64 macro
MY_ALIGN 64
endm
ifdef x64
NUM_PREFETCH_LEVELS equ 3 ; to prefetch 1x 64-bytes line (is good for most cases)
; NUM_PREFETCH_LEVELS equ 4 ; to prefetch 2x 64-bytes lines (better for big arrays)
acc equ x0
k equ r0
k_x equ x0
p equ r1
s equ r2
s_x equ x2
a0 equ x3
t0 equ a0
a3 equ x5
qq equ a3
a1 equ x6
t1 equ a1
t1_r equ r6
a2 equ x7
t2 equ a2
i equ r8
e0 equ x8
e1 equ x9
num_last equ r10
num_last_x equ x10
next4_lim equ r11
pref_lim equ r12
SORT_2_WITH_TEMP_REG macro b0, b1, temp_reg
mov temp_reg, b0
cmp b0, b1
cmovae b0, b1 ; min
cmovae b1, temp_reg ; max
endm
SORT macro b0, b1
SORT_2_WITH_TEMP_REG b0, b1, acc
endm
LOAD macro dest:req, index:req
mov dest, [p + 4 * index]
endm
STORE macro reg:req, index:req
mov [p + 4 * index], reg
endm
if (NUM_PREFETCH_LEVELS gt 3)
num_prefetches equ (1 SHL (NUM_PREFETCH_LEVELS - 3))
else
num_prefetches equ 1
endif
PREFETCH_OP macro offs
cur_offset = 7 * 4 ; it's average offset in 64-bytes cache line.
; cur_offset = 0 ; we can use zero offset, if we are sure that array is aligned for 64-bytes.
rept num_prefetches
if 1
prefetcht0 byte ptr [p + offs + cur_offset]
else
mov pref_x, dword ptr [p + offs + cur_offset]
endif
cur_offset = cur_offset + 64
endm
endm
PREFETCH_MY macro
if 1
if 1
shl k, NUM_PREFETCH_LEVELS + 3
else
; we delay prefetch instruction to improve main loads
shl k, NUM_PREFETCH_LEVELS
shl k, 3
; shl k, 0
endif
PREFETCH_OP k
elseif 1
shl k, 3
PREFETCH_OP k * (1 SHL NUM_PREFETCH_LEVELS) ; change it
endif
endm
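PREFETCH_MY touches the cache line that the heap descent will reach NUM_PREFETCH_LEVELS levels below the current node; since descending one level doubles the index, the prefetch address is the current index shifted up by the level count plus the element scale. The same idea in hedged, portable C (GCC/Clang builtin; the helper is illustrative):

#include <stddef.h>

#define NUM_PREFETCH_LEVELS 3  /* as above: prefetch one 64-byte line */

/* Prefetch the heap region that will be visited NUM_PREFETCH_LEVELS
   descents below node k in an array of 4-byte items read in pairs:
   index k << (NUM_PREFETCH_LEVELS + 1) corresponds to the asm's
   byte offset k << (NUM_PREFETCH_LEVELS + 3). */
static inline void prefetch_descendants(const unsigned *p, size_t k)
{
    __builtin_prefetch(p + (k << (NUM_PREFETCH_LEVELS + 1)), 0, 3);
}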
STEP_1 macro exit_label, prefetch_macro
use_cmov_1 equ 1 ; set 1 for cmov, but it's slower in some cases
; set 0 for LOAD after adc s, 0
cmp t0, t1
if use_cmov_1
cmovb t0, t1
; STORE t0, k
endif
adc s, 0
if use_cmov_1 eq 0
LOAD t0, s
endif
cmp qq, t0
jae exit_label
if 1 ; use_cmov_1 eq 0
STORE t0, k
endif
prefetch_macro
mov t0, [p + s * 8]
mov t1, [p + s * 8 + 4]
mov k, s
add s, s ; slower for some cpus
; lea s, dword ptr [s + s] ; slower for some cpus
; shl s, 1 ; faster for some cpus
; lea s, dword ptr [s * 2] ; faster for some cpus
rept 0 ; 1000 for debug : 0 for normal
; number of calls in generate_stage : ~0.6 of number of items
shl k, 0
endm
endm
STEP_2 macro exit_label, prefetch_macro
use_cmov_2 equ 0 ; set 1 for cmov, but it's slower in some cases
; set 0 for LOAD after adc s, 0
cmp t0, t1
if use_cmov_2
mov t2, t0
cmovb t2, t1
; STORE t2, k
endif
mov t0, [p + s * 8]
mov t1, [p + s * 8 + 4]
cmovb t0, [p + s * 8 + 8]
cmovb t1, [p + s * 8 + 12]
adc s, 0
if use_cmov_2 eq 0
LOAD t2, s
endif
cmp qq, t2
jae exit_label
if 1 ; use_cmov_2 eq 0
STORE t2, k
endif
prefetch_macro
mov k, s
; add s, s
; lea s, [s + s]
shl s, 1
; lea s, [s * 2]
endm
MOVE_SMALLEST_UP macro STEP, use_prefetch, num_unrolls
LOCAL exit_1, exit_2, leaves, opt_loop, last_nodes
; s == k * 2
; t0 == (p)[s]
; t1 == (p)[s + 1]
cmp k, next4_lim
jae leaves
rept num_unrolls
STEP exit_2
cmp k, next4_lim
jae leaves
endm
if use_prefetch
prefetch_macro equ PREFETCH_MY
pref_lim_2 equ pref_lim
; lea pref_lim, dword ptr [num_last + 1]
; shr pref_lim, NUM_PREFETCH_LEVELS + 1
cmp k, pref_lim_2
jae last_nodes
else
prefetch_macro equ
pref_lim_2 equ next4_lim
endif
MY_ALIGN_16
opt_loop:
STEP exit_2, prefetch_macro
cmp k, pref_lim_2
jb opt_loop
last_nodes:
; k >= pref_lim_2
; 2 cases are possible:
; case-1: num_after_prefetch_levels == 0 && next4_lim = pref_lim_2
; case-2: num_after_prefetch_levels == NUM_PREFETCH_LEVELS - 1 &&
; next4_lim = pref_lim_2 / (NUM_PREFETCH_LEVELS - 1)
if use_prefetch
yyy = NUM_PREFETCH_LEVELS - 1
while yyy
yyy = yyy - 1
STEP exit_2
if yyy
cmp k, next4_lim
jae leaves
endif
endm
endif
leaves:
; k >= next4_lim == (num_last + 1) / 4 must be ensured by the previous code.
; we have 2 nodes in (s) level : always
; we can have some nodes in (s * 2) level : low probability case
; we have no nodes in (s * 4) level
; s == k * 2
; t0 == (p)[s]
; t1 == (p)[s + 1]
cmp t0, t1
cmovb t0, t1
adc s, 0
STORE t0, k
; t0 == (p)[s]
; s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1]
; we have 3 possible cases here:
; s * 2 > num_last : (s) node has no children
; s * 2 == num_last : (s) node has 1 leaf child that is the last item of the array
; s * 2 < num_last : (s) node has 2 leaf children. We ensure (s * 4 > num_last)
; we check for (s * 2 > num_last) before the "cmp qq, t0" check, because
; we will replace the conditional jump with a cmov instruction later.
lea t1_r, dword ptr [s + s]
cmp t1_r, num_last
ja exit_1 ; if (s * 2 > num_last), we have no children : it's the high-probability branch
; it's low probability branch
; s * 2 <= num_last
cmp qq, t0
jae exit_2
; qq < t0, so we go to next level
; we check 1 or 2 children in the next level
mov t0, [p + s * 8]
mov k, s
mov s, t1_r
cmp t1_r, num_last
je @F ; (s == num_last) means that we have a single child in the tree
; (s < num_last) : so we must read both children and select the max of them.
mov t1, [p + k * 8 + 4]
cmp t0, t1
cmovb t0, t1
adc s, 0
@@:
STORE t0, k
exit_1:
; t0 == (p)[s], s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1]
cmp qq, t0
cmovb k, s
exit_2:
STORE qq, k
endm
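Stripped of the cmov/adc tricks and the prefetch plumbing, MOVE_SMALLEST_UP is the sift-down step of heapsort: the saved root value qq sinks while the larger child is promoted. A minimal C sketch over a 1-based max-heap (a simplification, not the tuned algorithm above):

#include <stddef.h>

/* Sift value q down from node k in the 1-based max-heap p[1..last]. */
static void sift_down(unsigned *p, size_t k, size_t last, unsigned q)
{
    for (;;)
    {
        size_t s = 2 * k;                /* left child */
        if (s > last)
            break;                       /* no children */
        if (s < last && p[s] < p[s + 1])
            s++;                         /* the larger child wins */
        if (q >= p[s])
            break;                       /* heap property restored */
        p[k] = p[s];                     /* promote the child */
        k = s;
    }
    p[k] = q;
}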
ifdef Z7_SORT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
MY_ALIGN_16
endif
MY_PROC HeapSort, 2
if (IS_LINUX gt 0)
mov p, REG_ABI_PARAM_0 ; r1 <- r7 : linux
endif
mov num_last, REG_ABI_PARAM_1 ; r10 <- r6 : linux
; r10 <- r2 : win64
cmp num_last, 2
jb end_1
; MY_PUSH_PRESERVED_ABI_REGS
MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
push r12
cmp num_last, 4
ja sort_5
LOAD a0, 0
LOAD a1, 1
SORT a0, a1
cmp num_last, 3
jb end_2
LOAD a2, 2
je sort_3
LOAD a3, 3
SORT a2, a3
SORT a1, a3
STORE a3, 3
sort_3:
SORT a0, a2
SORT a1, a2
STORE a2, 2
jmp end_2
sort_5:
; (num_last > 4) is required here
; if (num_last >= 6) : we will use optimized loop for leaf nodes loop_down_1
mov next4_lim, num_last
shr next4_lim, 2
dec num_last
mov k, num_last
shr k, 1
mov i, num_last
shr i, 2
test num_last, 1
jnz size_even
; ODD number of items. So we compare parent with single child
LOAD t1, num_last
LOAD t0, k
SORT_2_WITH_TEMP_REG t1, t0, t2
STORE t1, num_last
STORE t0, k
dec k
size_even:
cmp k, i
jbe loop_down ; jump for num_last == 4 case
if 0 ; 1 for debug
mov r15, k
mov r14d, 1 ; 100
loop_benchmark:
endif
; optimized loop for leaf nodes:
mov t0, [p + k * 8]
mov t1, [p + k * 8 + 4]
MY_ALIGN_16
loop_down_1:
; we compare parent with max of children:
; lea s, dword ptr [2 * k]
mov s, k
cmp t0, t1
cmovb t0, t1
adc s, s
LOAD t2, k
STORE t0, k
cmp t2, t0
cmovae s, k
dec k
; we preload next items before STORE operation for calculated address
mov t0, [p + k * 8]
mov t1, [p + k * 8 + 4]
STORE t2, s
cmp k, i
jne loop_down_1
if 0 ; 1 for debug
mov k, r15
dec r14d
jnz loop_benchmark
; jmp end_debug
endif
MY_ALIGN_16
loop_down:
mov t0, [p + i * 8]
mov t1, [p + i * 8 + 4]
LOAD qq, i
mov k, i
lea s, dword ptr [i + i]
; jmp end_debug
DOWN_use_prefetch equ 0
DOWN_num_unrolls equ 0
MOVE_SMALLEST_UP STEP_1, DOWN_use_prefetch, DOWN_num_unrolls
sub i, 1
jnb loop_down
; jmp end_debug
LOAD e0, 0
LOAD e1, 1
LEVEL_3_LIMIT equ 8 ; 8 is default, but 7 also can work
cmp num_last, LEVEL_3_LIMIT + 1
jb main_loop_sort_5
MY_ALIGN_16
main_loop_sort:
; num_last > LEVEL_3_LIMIT
; p[size--] = p[0];
LOAD qq, num_last
STORE e0, num_last
mov e0, e1
mov next4_lim, num_last
shr next4_lim, 2
mov pref_lim, num_last
shr pref_lim, NUM_PREFETCH_LEVELS + 1
dec num_last
if 0 ; 1 for debug
; this optional optimization can improve the performance, if there are identical items in the array
; 3 times improvement : if all items in array are identical
; 20% improvement : if items are different for 1 bit only
; 1-10% improvement : if items are different for (2+) bits
; no gain : if items are different
cmp qq, e1
jae next_iter_main
endif
LOAD e1, 2
LOAD t0, 3
mov k_x, 2
cmp e1, t0
cmovb e1, t0
mov t0, [p + 4 * (4 + 0)]
mov t1, [p + 4 * (4 + 1)]
cmovb t0, [p + 4 * (4 + 2)]
cmovb t1, [p + 4 * (4 + 3)]
adc k_x, 0
; (qq <= e1), because the tree is correctly sorted
; also here we could check (qq >= e1) or (qq == e1) for faster exit
lea s, dword ptr [k + k]
MAIN_use_prefetch equ 1
MAIN_num_unrolls equ 0
MOVE_SMALLEST_UP STEP_2, MAIN_use_prefetch, MAIN_num_unrolls
next_iter_main:
cmp num_last, LEVEL_3_LIMIT
jne main_loop_sort
; num_last == LEVEL_3_LIMIT
main_loop_sort_5:
; 4 <= num_last <= LEVEL_3_LIMIT
; p[size--] = p[0];
LOAD qq, num_last
STORE e0, num_last
mov e0, e1
dec num_last_x
LOAD e1, 2
LOAD t0, 3
mov k_x, 2
cmp e1, t0
cmovb e1, t0
adc k_x, 0
lea s_x, dword ptr [k * 2]
cmp s_x, num_last_x
ja exit_2
mov t0, [p + k * 8]
je exit_1
; s < num_last
mov t1, [p + k * 8 + 4]
cmp t0, t1
cmovb t0, t1
adc s_x, 0
exit_1:
STORE t0, k
cmp qq, t0
cmovb k_x, s_x
exit_2:
STORE qq, k
cmp num_last_x, 3
jne main_loop_sort_5
; num_last == 3 (real_size == 4)
LOAD a0, 2
LOAD a1, 3
STORE e1, 2
STORE e0, 3
SORT a0, a1
end_2:
STORE a0, 0
STORE a1, 1
; end_debug:
; MY_POP_PRESERVED_ABI_REGS
pop r12
MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP
else
; ------------ x86 32-bit ------------
ifdef x64
IS_CDECL = 0
endif
acc equ x0
k equ r0
k_x equ acc
p equ r1
num_last equ r2
num_last_x equ x2
a0 equ x3
t0 equ a0
a3 equ x5
i equ r5
e0 equ a3
a1 equ x6
qq equ a1
a2 equ x7
s equ r7
s_x equ a2
SORT macro b0, b1
cmp b1, b0
jae @F
if 1
xchg b0, b1
else
mov acc, b0
mov b0, b1 ; min
mov b1, acc ; max
endif
@@:
endm
LOAD macro dest:req, index:req
mov dest, [p + 4 * index]
endm
STORE macro reg:req, index:req
mov [p + 4 * index], reg
endm
STEP_1 macro exit_label
mov t0, [p + k * 8]
cmp t0, [p + k * 8 + 4]
adc s, 0
LOAD t0, s
STORE t0, k ; we store ahead for the most expected branch
cmp qq, t0
jae exit_label
; STORE t0, k ; use if
mov k, s
add s, s
; lea s, dword ptr [s + s]
; shl s, 1
; lea s, dword ptr [s * 2]
endm
STEP_BRANCH macro exit_label
mov t0, [p + k * 8]
cmp t0, [p + k * 8 + 4]
jae @F
inc s
mov t0, [p + k * 8 + 4]
@@:
cmp qq, t0
jae exit_label
STORE t0, k
mov k, s
add s, s
endm
MOVE_SMALLEST_UP macro STEP, num_unrolls, exit_2
LOCAL leaves, opt_loop, single
; s == k * 2
rept num_unrolls
cmp s, num_last
jae leaves
STEP_1 exit_2
endm
cmp s, num_last
jb opt_loop
leaves:
; (s >= num_last)
jne exit_2
single:
; (s == num_last)
mov t0, [p + k * 8]
cmp qq, t0
jae exit_2
STORE t0, k
mov k, s
jmp exit_2
MY_ALIGN_16
opt_loop:
STEP exit_2
cmp s, num_last
jb opt_loop
je single
exit_2:
STORE qq, k
endm
ifdef Z7_SORT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
MY_ALIGN_16
endif
MY_PROC HeapSort, 2
ifdef x64
if (IS_LINUX gt 0)
mov num_last, REG_ABI_PARAM_1 ; r2 <- r6 : linux
mov p, REG_ABI_PARAM_0 ; r1 <- r7 : linux
endif
elseif (IS_CDECL gt 0)
mov num_last, [r4 + REG_SIZE * 2]
mov p, [r4 + REG_SIZE * 1]
endif
cmp num_last, 2
jb end_1
MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
cmp num_last, 4
ja sort_5
LOAD a0, 0
LOAD a1, 1
SORT a0, a1
cmp num_last, 3
jb end_2
LOAD a2, 2
je sort_3
LOAD a3, 3
SORT a2, a3
SORT a1, a3
STORE a3, 3
sort_3:
SORT a0, a2
SORT a1, a2
STORE a2, 2
jmp end_2
sort_5:
; num_last > 4
lea i, dword ptr [num_last - 2]
dec num_last
test i, 1
jz loop_down
; single child
mov t0, [p + num_last * 4]
mov qq, [p + num_last * 2]
dec i
cmp qq, t0
jae loop_down
mov [p + num_last * 2], t0
mov [p + num_last * 4], qq
MY_ALIGN_16
loop_down:
mov t0, [p + i * 4]
cmp t0, [p + i * 4 + 4]
mov k, i
mov qq, [p + i * 2]
adc k, 0
LOAD t0, k
cmp qq, t0
jae down_next
mov [p + i * 2], t0
lea s, dword ptr [k + k]
DOWN_num_unrolls equ 0
MOVE_SMALLEST_UP STEP_1, DOWN_num_unrolls, down_exit_label
down_next:
sub i, 2
jnb loop_down
; jmp end_debug
LOAD e0, 0
MY_ALIGN_16
main_loop_sort:
; num_last > 3
mov t0, [p + 2 * 4]
cmp t0, [p + 3 * 4]
LOAD qq, num_last
STORE e0, num_last
LOAD e0, 1
mov s_x, 2
mov k_x, 1
adc s, 0
LOAD t0, s
dec num_last
cmp qq, t0
jae main_exit_label
STORE t0, 1
mov k, s
add s, s
if 1
; for branch data prefetch mode :
; it's faster for large arrays : larger than (1 << 13) items.
MAIN_num_unrolls equ 10
STEP_LOOP equ STEP_BRANCH
else
MAIN_num_unrolls equ 0
STEP_LOOP equ STEP_1
endif
MOVE_SMALLEST_UP STEP_LOOP, MAIN_num_unrolls, main_exit_label
; jmp end_debug
cmp num_last, 3
jne main_loop_sort
; num_last == 3 (real_size == 4)
LOAD a0, 2
LOAD a1, 3
LOAD a2, 1
STORE e0, 3 ; e0 is alias for a3
STORE a2, 2
SORT a0, a1
end_2:
STORE a0, 0
STORE a1, 1
; end_debug:
MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP
endif
ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT ENDS
endif
if 0
LEA_IS_D8 (R64) [R2 * 4 + 16]
Lat : TP
2 : 1 : adl-e
2 : 3 p056 adl-p
1 : 2 : p15 hsw-rocket
1 : 2 : p01 snb-ivb
1 : 1 : p1 conroe-wsm
1 : 4 : zen3,zen4
2 : 4 : zen1,zen2
LEA_B_IS (R64) [R2 + R3 * 4]
Lat : TP
1 : 1 : adl-e
2 : 3 p056 adl-p
1 : 2 : p15 hsw-rocket
1 : 2 : p01 snb-ivb
1 : 1 : p1 nhm-wsm
1 : 1 : p0 conroe-wsm
1 : 4 : zen3,zen4
2 :2,4 : zen1,zen2
LEA_B_IS_D8 (R64) [R2 + R3 * 4 + 16]
Lat : TP
2 : 1 : adl-e
2 : 3 p056 adl-p
1 : 2 : p15 ice-rocket
3 : 1 : p1/p15 hsw-rocket
3 : 1 : p01 snb-ivb
1 : 1 : p1 nhm-wsm
1 : 1 : p0 conroe-wsm
2,1 : 2 : zen3,zen4
2 : 2 : zen1,zen2
CMOVB (R64, R64)
Lat : TP
1,2 : 2 : adl-e
1 : 2 p06 adl-p
1 : 2 : p06 bwd-rocket
1,2 : 2 : p0156+p06 hsw
1,2 :1.5 : p015+p05 snb-ivb
1,2 : 1 : p015+p05 nhm
1 : 1 : 2*p015 conroe
1 : 2 : zen3,zen4
1 : 4 : zen1,zen2
ADC (R64, 0)
Lat : TP
1,2 : 2 : adl-e
1 : 2 p06 adl-p
1 : 2 : p06 bwd-rocket
1 :1.5 : p0156+p06 hsw
1 :1.5 : p015+p05 snb-ivb
2 : 1 : 2*p015 conroe-wsm
1 : 2 : zen1,zen2,zen3,zen4
PREFETCHNTA : fetch data into non-temporal cache close to the processor, minimizing cache pollution.
L1 : Pentium3
L2 : NetBurst
L1, not L2: Core duo, Core 2, Atom processors
L1, not L2, may fetch into L3 with fast replacement: Nehalem, Westmere, Sandy Bridge, ...
NEHALEM: Fills L1/L3, L1 LRU is not updated
L3 with fast replacement: Xeon Processors based on Nehalem, Westmere, Sandy Bridge, ...
PREFETCHT0 : fetch data into all cache levels.
PREFETCHT1 : fetch data into L2 and L3
endif
end


@@ -1,231 +1,113 @@
; XzCrc64Opt.asm -- CRC64 calculation : optimized version
; 2023-12-08 : Igor Pavlov : Public domain
; 2021-02-06 : Igor Pavlov : Public domain
include 7zAsm.asm
MY_ASM_START
NUM_WORDS equ 3
if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
.err <num_words_IS_INCORRECT>
endif
NUM_SKIP_BYTES equ ((NUM_WORDS - 2) * 4)
MOVZXLO macro dest:req, src:req
movzx dest, @CatStr(src, _L)
endm
MOVZXHI macro dest:req, src:req
movzx dest, @CatStr(src, _H)
endm
ifdef x64
rD equ r11
rD equ r9
rN equ r10
rT equ r9
CRC_OP macro op:req, dest:req, src:req, t:req
op dest, QWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t)]
endm
rT equ r5
num_VAR equ r8
SRCDAT4 equ dword ptr [rD + rN * 1]
CRC_XOR macro dest:req, src:req, t:req
CRC_OP xor, dest, src, t
endm
CRC_MOV macro dest:req, src:req, t:req
CRC_OP mov, dest, src, t
xor dest, QWORD PTR [rT + src * 8 + 0800h * t]
endm
CRC1b macro
movzx x6, BYTE PTR [rD]
inc rD
MOVZXLO x3, x0
xor x6, x3
shr r0, 8
CRC_XOR r0, x6, 0
dec rN
movzx x6, BYTE PTR [rD]
inc rD
movzx x3, x0_L
xor x6, x3
shr r0, 8
CRC_XOR r0, r6, 0
dec rN
endm
; ALIGN_MASK is 3 (4-byte alignment) for odd NUM_WORDS, or 7 (8-byte alignment) for even:
ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4)
if NUM_WORDS eq 1
src_rN_offset equ 4
; + 4 for prefetching next 4-bytes after current iteration
NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 4)
SRCDAT4 equ DWORD PTR [rN + rD * 1]
XOR_NEXT macro
mov x1, [rD]
xor r0, r1
MY_PROLOG macro crc_end:req
ifdef ABI_LINUX
MY_PUSH_2_REGS
else
MY_PUSH_4_REGS
endif
mov r0, REG_ABI_PARAM_0
mov rN, REG_ABI_PARAM_2
mov rT, REG_ABI_PARAM_3
mov rD, REG_ABI_PARAM_1
test rN, rN
jz crc_end
@@:
test rD, 3
jz @F
CRC1b
jnz @B
@@:
cmp rN, 8
jb crc_end
add rN, rD
mov num_VAR, rN
sub rN, 4
and rN, NOT 3
sub rD, rN
mov x1, SRCDAT4
xor r0, r1
add rN, 4
endm
else ; NUM_WORDS > 1
src_rN_offset equ 8
; + 8 for prefetching next 8-bytes after current iteration
NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 8)
XOR_NEXT macro
xor r0, QWORD PTR [rD] ; 64-bit read, can be unaligned
MY_EPILOG macro crc_end:req
sub rN, 4
mov x1, SRCDAT4
xor r0, r1
mov rD, rN
mov rN, num_VAR
sub rN, rD
crc_end:
test rN, rN
jz @F
CRC1b
jmp crc_end
@@:
ifdef ABI_LINUX
MY_POP_2_REGS
else
MY_POP_4_REGS
endif
endm
; 32-bit or 64-bit
LOAD_SRC_MULT4 macro dest:req, word_index:req
mov dest, [rN + rD * 1 + 4 * (word_index) - src_rN_offset];
endm
MY_PROC XzCrc64UpdateT4, 4
MY_PROLOG crc_end_4
align 16
main_loop_4:
mov x1, SRCDAT4
movzx x2, x0_L
movzx x3, x0_H
shr r0, 16
movzx x6, x0_L
movzx x7, x0_H
shr r0, 16
CRC_XOR r1, r2, 3
CRC_XOR r0, r3, 2
CRC_XOR r1, r6, 1
CRC_XOR r0, r7, 0
xor r0, r1
endif
add rD, 4
jnz main_loop_4
MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 4
MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
mov r0, REG_ABI_PARAM_0 ; r0 <- r1 / r7
mov rD, REG_ABI_PARAM_1 ; r11 <- r2 / r6
mov rN, REG_ABI_PARAM_2 ; r10 <- r8 / r2
if (IS_LINUX gt 0)
mov rT, REG_ABI_PARAM_3 ; r9 <- r9 / r1
endif
cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
jb crc_end
@@:
test rD, ALIGN_MASK
jz @F
CRC1b
jmp @B
@@:
XOR_NEXT
lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
sub rD, rN
add rN, src_rN_offset
align 16
@@:
if NUM_WORDS eq 1
mov x1, x0
shr x1, 8
MOVZXLO x3, x1
MOVZXLO x2, x0
shr x1, 8
shr r0, 32
xor x0, SRCDAT4
CRC_XOR r0, x2, 3
CRC_XOR r0, x3, 2
MOVZXLO x2, x1
shr x1, 8
CRC_XOR r0, x2, 1
CRC_XOR r0, x1, 0
else ; NUM_WORDS > 1
if NUM_WORDS ne 2
k = 2
while k lt NUM_WORDS
LOAD_SRC_MULT4 x1, k
crc_op1 textequ <xor>
if k eq 2
if (NUM_WORDS and 1)
LOAD_SRC_MULT4 x7, NUM_WORDS ; aligned 32-bit
LOAD_SRC_MULT4 x6, NUM_WORDS + 1 ; aligned 32-bit
shl r6, 32
else
LOAD_SRC_MULT4 r6, NUM_WORDS ; aligned 64-bit
crc_op1 textequ <mov>
endif
endif
table = 4 * (NUM_WORDS - 1 - k)
MOVZXLO x3, x1
CRC_OP crc_op1, r7, x3, 3 + table
MOVZXHI x3, x1
shr x1, 16
CRC_XOR r6, x3, 2 + table
MOVZXLO x3, x1
shr x1, 8
CRC_XOR r7, x3, 1 + table
CRC_XOR r6, x1, 0 + table
k = k + 1
endm
crc_op2 textequ <xor>
else ; NUM_WORDS == 2
LOAD_SRC_MULT4 r6, NUM_WORDS ; aligned 64-bit
crc_op2 textequ <mov>
endif ; NUM_WORDS == 2
MOVZXHI x3, x0
MOVZXLO x2, x0
mov r1, r0
shr r1, 32
shr x0, 16
CRC_XOR r6, x2, NUM_SKIP_BYTES + 7
CRC_OP crc_op2, r7, x3, NUM_SKIP_BYTES + 6
MOVZXLO x2, x0
MOVZXHI x5, x1
MOVZXLO x3, x1
shr x0, 8
shr x1, 16
CRC_XOR r7, x2, NUM_SKIP_BYTES + 5
CRC_XOR r6, x3, NUM_SKIP_BYTES + 3
CRC_XOR r7, x0, NUM_SKIP_BYTES + 4
CRC_XOR r6, x5, NUM_SKIP_BYTES + 2
MOVZXLO x2, x1
shr x1, 8
CRC_XOR r7, x2, NUM_SKIP_BYTES + 1
CRC_MOV r0, x1, NUM_SKIP_BYTES + 0
xor r0, r6
xor r0, r7
endif ; NUM_WORDS > 1
add rD, NUM_WORDS * 4
jnc @B
sub rN, src_rN_offset
add rD, rN
XOR_NEXT
add rN, NUM_BYTES_LIMIT - 1
sub rN, rD
crc_end:
test rN, rN
jz func_end
@@:
CRC1b
jnz @B
func_end:
MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
MY_EPILOG crc_end_4
MY_ENDP
else
; ==================================================================
; x86 (32-bit)
rD equ r7
rN equ r1
rD equ r1
rN equ r7
rT equ r5
xA equ x6
xA_R equ r6
ifdef x64
num_VAR equ r8
else
crc_OFFS equ (REG_SIZE * 5)
if (IS_CDECL gt 0) or (IS_LINUX gt 0)
@@ -251,273 +133,107 @@ else
table_VAR equ [r4 + table_OFFS]
num_VAR equ table_VAR
endif
endif ; x64
SRCDAT4 equ DWORD PTR [rN + rD * 1]
CRC_1 macro op:req, dest:req, src:req, t:req, word_index:req
op dest, DWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t) + (word_index) * 4]
endm
SRCDAT4 equ dword ptr [rD + rN * 1]
CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
CRC_1 op0, dest0, src, t, 0
CRC_1 op1, dest1, src, t, 1
op0 dest0, DWORD PTR [rT + src * 8 + 0800h * t]
op1 dest1, DWORD PTR [rT + src * 8 + 0800h * t + 4]
endm
CRC_XOR macro dest0:req, dest1:req, src:req, t:req
CRC xor, xor, dest0, dest1, src, t
CRC xor, xor, dest0, dest1, src, t
endm
CRC1b macro
movzx xA, BYTE PTR [rD]
inc rD
MOVZXLO x3, x0
xor xA, x3
shrd x0, x2, 8
shr x2, 8
CRC_XOR x0, x2, xA, 0
dec rN
movzx x6, BYTE PTR [rD]
inc rD
movzx x3, x0_L
xor x6, x3
shrd r0, r2, 8
shr r2, 8
CRC_XOR r0, r2, r6, 0
dec rN
endm
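On 32-bit x86 the 64-bit CRC lives in a register pair (x0 low, x2 high) and each 64-bit table entry is read as two 32-bit halves; the shrd/shr pair shifts the combined value right by 8. The same byte step in hedged C form:

#include <stdint.h>

/* One byte of CRC64 with the crc split into lo/hi 32-bit halves,
   mirroring the shrd/shr and paired 32-bit table reads above. */
static void crc64_byte(uint32_t *lo, uint32_t *hi, uint8_t b,
                       const uint64_t table[256])
{
    const uint64_t e = table[(*lo ^ b) & 0xFF];
    const uint32_t new_lo = (*lo >> 8) | (*hi << 24);  /* shrd lo, hi, 8 */
    *hi = (*hi >> 8) ^ (uint32_t)(e >> 32);
    *lo = new_lo ^ (uint32_t)e;
}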
MY_PROLOG macro crc_end:req
MY_PUSH_4_REGS
MY_PROLOG_BASE macro
MY_PUSH_4_REGS
ifdef x64
mov r0, REG_ABI_PARAM_0 ; r0 <- r1 / r7
mov rT, REG_ABI_PARAM_3 ; r5 <- r9 / r1
mov rN, REG_ABI_PARAM_2 ; r1 <- r8 / r2
mov rD, REG_ABI_PARAM_1 ; r7 <- r2 / r6
mov r2, r0
shr r2, 32
mov x0, x0
else
if (IS_CDECL gt 0) or (IS_LINUX gt 0)
proc_numParams = proc_numParams + 2 ; for ABI_LINUX
mov rN, [r4 + size_OFFS]
mov rD, [r4 + data_OFFS]
else
mov rD, REG_ABI_PARAM_0 ; r7 <- r1 : (data)
mov rN, REG_ABI_PARAM_1 ; r1 <- r2 : (size)
endif
mov x0, [r4 + crc_OFFS]
mov x2, [r4 + crc_OFFS + 4]
mov rT, table_VAR
endif
endm
MY_EPILOG_BASE macro crc_end:req, func_end:req
crc_end:
test rN, rN
jz func_end
@@:
CRC1b
jnz @B
func_end:
ifdef x64
shl r2, 32
xor r0, r2
endif
MY_POP_4_REGS
endm
; ALIGN_MASK is 3 (4-byte alignment) for odd NUM_WORDS, or 7 (8-byte alignment) for even:
ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4)
if (NUM_WORDS eq 1)
NUM_BYTES_LIMIT_T4 equ (NUM_WORDS * 4 + 4)
MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
MY_PROLOG_BASE
cmp rN, NUM_BYTES_LIMIT_T4 + ALIGN_MASK
jb crc_end_4
@@:
test rD, ALIGN_MASK
jz @F
CRC1b
jmp @B
@@:
xor x0, [rD]
lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT_T4 - 1)]
sub rD, rN
add rN, 4
MOVZXLO xA, x0
align 16
@@:
mov x3, SRCDAT4
xor x3, x2
shr x0, 8
CRC xor, mov, x3, x2, xA, 3
MOVZXLO xA, x0
shr x0, 8
; MOVZXHI xA, x0
; shr x0, 16
CRC_XOR x3, x2, xA, 2
MOVZXLO xA, x0
shr x0, 8
CRC_XOR x3, x2, xA, 1
CRC_XOR x3, x2, x0, 0
MOVZXLO xA, x3
mov x0, x3
add rD, 4
jnc @B
sub rN, 4
add rD, rN
xor x0, [rD]
add rN, NUM_BYTES_LIMIT_T4 - 1
sub rN, rD
MY_EPILOG_BASE crc_end_4, func_end_4
MY_ENDP
else ; NUM_WORDS > 1
SHR_X macro x, imm
shr x, imm
endm
ITER_1 macro v0, v1, a, off
MOVZXLO xA, a
SHR_X a, 8
CRC_XOR v0, v1, xA, off
endm
ITER_4 macro v0, v1, a, off
if 0 eq 0
ITER_1 v0, v1, a, off + 3
ITER_1 v0, v1, a, off + 2
ITER_1 v0, v1, a, off + 1
CRC_XOR v0, v1, a, off
elseif 0 eq 0
MOVZXLO xA, a
CRC_XOR v0, v1, xA, off + 3
mov xA, a
ror a, 16 ; 32-bit ror
shr xA, 24
CRC_XOR v0, v1, xA, off
MOVZXLO xA, a
SHR_X a, 24
CRC_XOR v0, v1, xA, off + 1
CRC_XOR v0, v1, a, off + 2
else
; MOVZXHI provides smaller code, but MOVZX_HI_BYTE is not fast instruction
MOVZXLO xA, a
CRC_XOR v0, v1, xA, off + 3
MOVZXHI xA, a
SHR_X a, 16
CRC_XOR v0, v1, xA, off + 2
MOVZXLO xA, a
SHR_X a, 8
CRC_XOR v0, v1, xA, off + 1
CRC_XOR v0, v1, a, off
endif
endm
ITER_1_PAIR macro v0, v1, a0, a1, off
ITER_1 v0, v1, a0, off + 4
ITER_1 v0, v1, a1, off
endm
src_rD_offset equ 8
STEP_SIZE equ (NUM_WORDS * 4)
ITER_12_NEXT macro op, index, v0, v1
op v0, DWORD PTR [rD + (index + 1) * STEP_SIZE - src_rD_offset]
op v1, DWORD PTR [rD + (index + 1) * STEP_SIZE + 4 - src_rD_offset]
endm
ITER_12 macro index, a0, a1, v0, v1
if NUM_SKIP_BYTES eq 0
ITER_12_NEXT mov, index, v0, v1
if (IS_CDECL gt 0) or (IS_LINUX gt 0)
proc_numParams = proc_numParams + 2 ; for ABI_LINUX
mov rN, [r4 + size_OFFS]
mov rD, [r4 + data_OFFS]
else
k = 0
while k lt NUM_SKIP_BYTES
movzx xA, BYTE PTR [rD + (index) * STEP_SIZE + k + 8 - src_rD_offset]
if k eq 0
CRC mov, mov, v0, v1, xA, NUM_SKIP_BYTES - 1 - k
else
CRC_XOR v0, v1, xA, NUM_SKIP_BYTES - 1 - k
endif
k = k + 1
endm
ITER_12_NEXT xor, index, v0, v1
mov rN, r2
endif
if 0 eq 0
ITER_4 v0, v1, a0, NUM_SKIP_BYTES + 4
ITER_4 v0, v1, a1, NUM_SKIP_BYTES
else ; interleave version is faster/slower for different processors
ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 3
ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 2
ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 1
CRC_XOR v0, v1, a0, NUM_SKIP_BYTES + 4
CRC_XOR v0, v1, a1, NUM_SKIP_BYTES
endif
mov x0, [r4 + crc_OFFS]
mov x2, [r4 + crc_OFFS + 4]
mov rT, table_VAR
test rN, rN
jz crc_end
@@:
test rD, 3
jz @F
CRC1b
jnz @B
@@:
cmp rN, 8
jb crc_end
add rN, rD
mov num_VAR, rN
sub rN, 4
and rN, NOT 3
sub rD, rN
xor r0, SRCDAT4
add rN, 4
endm
; we use (UNROLL_CNT > 1) to reduce read ports pressure (num_VAR reads)
UNROLL_CNT equ (2 * 1)
NUM_BYTES_LIMIT equ (STEP_SIZE * UNROLL_CNT + 8)
MY_EPILOG macro crc_end:req
sub rN, 4
xor r0, SRCDAT4
MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
MY_PROLOG_BASE
mov rD, rN
mov rN, num_VAR
sub rN, rD
crc_end:
test rN, rN
jz @F
CRC1b
jmp crc_end
@@:
MY_POP_4_REGS
endm
cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
jb crc_end_12
@@:
test rD, ALIGN_MASK
jz @F
CRC1b
jmp @B
@@:
xor x0, [rD]
xor x2, [rD + 4]
add rD, src_rD_offset
lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
mov num_VAR, rN
MY_PROC XzCrc64UpdateT4, 5
MY_PROLOG crc_end_4
movzx x6, x0_L
align 16
main_loop_4:
mov r3, SRCDAT4
xor r3, r2
align 16
@@:
i = 0
rept UNROLL_CNT
if (i and 1) eq 0
ITER_12 i, x0, x2, x1, x3
else
ITER_12 i, x1, x3, x0, x2
endif
i = i + 1
endm
CRC xor, mov, r3, r2, r6, 3
movzx x6, x0_H
shr r0, 16
CRC_XOR r3, r2, r6, 2
if (UNROLL_CNT and 1)
mov x0, x1
mov x2, x3
endif
add rD, STEP_SIZE * UNROLL_CNT
cmp rD, num_VAR
jb @B
movzx x6, x0_L
movzx x0, x0_H
CRC_XOR r3, r2, r6, 1
CRC_XOR r3, r2, r0, 0
movzx x6, x3_L
mov r0, r3
mov rN, num_VAR
add rN, NUM_BYTES_LIMIT - 1
sub rN, rD
sub rD, src_rD_offset
xor x0, [rD]
xor x2, [rD + 4]
add rD, 4
jnz main_loop_4
MY_EPILOG_BASE crc_end_12, func_end_12
MY_EPILOG crc_end_4
MY_ENDP
endif ; (NUM_WORDS > 1)
endif ; ! x64
end

C/7z.h

@@ -1,8 +1,8 @@
/* 7z.h -- 7z interface
2023-04-02 : Igor Pavlov : Public domain */
2018-07-02 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_7Z_H
#define ZIP7_INC_7Z_H
#ifndef __7Z_H
#define __7Z_H
#include "7zTypes.h"
@@ -98,7 +98,7 @@ typedef struct
UInt64 SzAr_GetFolderUnpackSize(const CSzAr *p, UInt32 folderIndex);
SRes SzAr_DecodeFolder(const CSzAr *p, UInt32 folderIndex,
ILookInStreamPtr stream, UInt64 startPos,
ILookInStream *stream, UInt64 startPos,
Byte *outBuffer, size_t outSize,
ISzAllocPtr allocMain);
@@ -174,7 +174,7 @@ UInt16 *SzArEx_GetFullNameUtf16_Back(const CSzArEx *p, size_t fileIndex, UInt16
SRes SzArEx_Extract(
const CSzArEx *db,
ILookInStreamPtr inStream,
ILookInStream *inStream,
UInt32 fileIndex, /* index of file */
UInt32 *blockIndex, /* index of solid block */
Byte **outBuffer, /* pointer to pointer to output buffer (allocated with allocMain) */
@@ -196,7 +196,7 @@ SZ_ERROR_INPUT_EOF
SZ_ERROR_FAIL
*/
SRes SzArEx_Open(CSzArEx *p, ILookInStreamPtr inStream,
SRes SzArEx_Open(CSzArEx *p, ILookInStream *inStream,
ISzAllocPtr allocMain, ISzAllocPtr allocTemp);
EXTERN_C_END


@@ -1,5 +1,5 @@
/* 7zAlloc.c -- Allocation functions for 7z processing
2023-03-04 : Igor Pavlov : Public domain */
/* 7zAlloc.c -- Allocation functions
2017-04-03 : Igor Pavlov : Public domain */
#include "Precomp.h"
@@ -7,83 +7,74 @@
#include "7zAlloc.h"
/* #define SZ_ALLOC_DEBUG */
/* use SZ_ALLOC_DEBUG to debug alloc/free operations */
/* #define _SZ_ALLOC_DEBUG */
/* use _SZ_ALLOC_DEBUG to debug alloc/free operations */
#ifdef SZ_ALLOC_DEBUG
#ifdef _SZ_ALLOC_DEBUG
/*
#ifdef _WIN32
#include "7zWindows.h"
#include <windows.h>
#endif
*/
#include <stdio.h>
static int g_allocCount = 0;
static int g_allocCountTemp = 0;
int g_allocCount = 0;
int g_allocCountTemp = 0;
static void Print_Alloc(const char *s, size_t size, int *counter)
{
const unsigned size2 = (unsigned)size;
fprintf(stderr, "\n%s count = %10d : %10u bytes; ", s, *counter, size2);
(*counter)++;
}
static void Print_Free(const char *s, int *counter)
{
(*counter)--;
fprintf(stderr, "\n%s count = %10d", s, *counter);
}
#endif
void *SzAlloc(ISzAllocPtr p, size_t size)
{
UNUSED_VAR(p)
UNUSED_VAR(p);
if (size == 0)
return 0;
#ifdef SZ_ALLOC_DEBUG
Print_Alloc("Alloc", size, &g_allocCount);
#ifdef _SZ_ALLOC_DEBUG
fprintf(stderr, "\nAlloc %10u bytes; count = %10d", (unsigned)size, g_allocCount);
g_allocCount++;
#endif
return malloc(size);
}
void SzFree(ISzAllocPtr p, void *address)
{
UNUSED_VAR(p)
#ifdef SZ_ALLOC_DEBUG
if (address)
Print_Free("Free ", &g_allocCount);
UNUSED_VAR(p);
#ifdef _SZ_ALLOC_DEBUG
if (address != 0)
{
g_allocCount--;
fprintf(stderr, "\nFree; count = %10d", g_allocCount);
}
#endif
free(address);
}
void *SzAllocTemp(ISzAllocPtr p, size_t size)
{
UNUSED_VAR(p)
UNUSED_VAR(p);
if (size == 0)
return 0;
#ifdef SZ_ALLOC_DEBUG
Print_Alloc("Alloc_temp", size, &g_allocCountTemp);
/*
#ifdef _SZ_ALLOC_DEBUG
fprintf(stderr, "\nAlloc_temp %10u bytes; count = %10d", (unsigned)size, g_allocCountTemp);
g_allocCountTemp++;
#ifdef _WIN32
return HeapAlloc(GetProcessHeap(), 0, size);
#endif
*/
#endif
return malloc(size);
}
void SzFreeTemp(ISzAllocPtr p, void *address)
{
UNUSED_VAR(p)
#ifdef SZ_ALLOC_DEBUG
if (address)
Print_Free("Free_temp ", &g_allocCountTemp);
/*
UNUSED_VAR(p);
#ifdef _SZ_ALLOC_DEBUG
if (address != 0)
{
g_allocCountTemp--;
fprintf(stderr, "\nFree_temp; count = %10d", g_allocCountTemp);
}
#ifdef _WIN32
HeapFree(GetProcessHeap(), 0, address);
return;
#endif
*/
#endif
free(address);
}
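These four functions are meant to be plugged into ISzAlloc vtables and handed to the decoder, as the SDK's example clients do; a typical wiring:

#include "7zAlloc.h"

/* Main allocator for long-lived buffers, temp allocator for scratch. */
static const ISzAlloc g_Alloc     = { SzAlloc,     SzFree     };
static const ISzAlloc g_AllocTemp = { SzAllocTemp, SzFreeTemp };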


@@ -1,8 +1,8 @@
/* 7zAlloc.h -- Allocation functions
2023-03-04 : Igor Pavlov : Public domain */
2017-04-03 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_7Z_ALLOC_H
#define ZIP7_INC_7Z_ALLOC_H
#ifndef __7Z_ALLOC_H
#define __7Z_ALLOC_H
#include "7zTypes.h"

File diff suppressed because it is too large


@@ -1,8 +1,8 @@
/* 7zBuf.h -- Byte Buffer
2023-03-04 : Igor Pavlov : Public domain */
2017-04-03 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_7Z_BUF_H
#define ZIP7_INC_7Z_BUF_H
#ifndef __7Z_BUF_H
#define __7Z_BUF_H
#include "7zTypes.h"

C/7zCrc.c

@@ -1,218 +1,182 @@
/* 7zCrc.c -- CRC32 calculation and init
2024-03-01 : Igor Pavlov : Public domain */
/* 7zCrc.c -- CRC32 init
2021-04-01 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include "7zCrc.h"
#include "CpuArch.h"
// for debug:
// #define __ARM_FEATURE_CRC32 1
#define kCrcPoly 0xEDB88320
#ifdef __ARM_FEATURE_CRC32
// #pragma message("__ARM_FEATURE_CRC32")
#define Z7_CRC_HW_FORCE
#endif
// #define Z7_CRC_DEBUG_BE
#ifdef Z7_CRC_DEBUG_BE
#undef MY_CPU_LE
#define MY_CPU_BE
#endif
#ifdef Z7_CRC_HW_FORCE
#define Z7_CRC_NUM_TABLES_USE 1
#ifdef MY_CPU_LE
#define CRC_NUM_TABLES 8
#else
#ifdef Z7_CRC_NUM_TABLES
#define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES
#else
#define Z7_CRC_NUM_TABLES_USE 12
#endif
#define CRC_NUM_TABLES 9
#define CRC_UINT32_SWAP(v) ((v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24))
UInt32 MY_FAST_CALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table);
UInt32 MY_FAST_CALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table);
#endif
#if Z7_CRC_NUM_TABLES_USE < 1
#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
#ifndef MY_CPU_BE
UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table);
UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table);
#endif
#if defined(MY_CPU_LE) || (Z7_CRC_NUM_TABLES_USE == 1)
#define Z7_CRC_NUM_TABLES_TOTAL Z7_CRC_NUM_TABLES_USE
#else
#define Z7_CRC_NUM_TABLES_TOTAL (Z7_CRC_NUM_TABLES_USE + 1)
#endif
typedef UInt32 (MY_FAST_CALL *CRC_FUNC)(UInt32 v, const void *data, size_t size, const UInt32 *table);
#ifndef Z7_CRC_HW_FORCE
extern
CRC_FUNC g_CrcUpdateT4;
CRC_FUNC g_CrcUpdateT4;
extern
CRC_FUNC g_CrcUpdateT8;
CRC_FUNC g_CrcUpdateT8;
extern
CRC_FUNC g_CrcUpdateT0_32;
CRC_FUNC g_CrcUpdateT0_32;
extern
CRC_FUNC g_CrcUpdateT0_64;
CRC_FUNC g_CrcUpdateT0_64;
extern
CRC_FUNC g_CrcUpdate;
CRC_FUNC g_CrcUpdate;
#if Z7_CRC_NUM_TABLES_USE == 1 \
|| (!defined(MY_CPU_LE) && !defined(MY_CPU_BE))
#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
#define Z7_CRC_UPDATE_T1_FUNC_NAME CrcUpdateGT1
static UInt32 Z7_FASTCALL Z7_CRC_UPDATE_T1_FUNC_NAME(UInt32 v, const void *data, size_t size)
UInt32 g_CrcTable[256 * CRC_NUM_TABLES];
UInt32 MY_FAST_CALL CrcUpdate(UInt32 v, const void *data, size_t size)
{
return g_CrcUpdate(v, data, size, g_CrcTable);
}
UInt32 MY_FAST_CALL CrcCalc(const void *data, size_t size)
{
return g_CrcUpdate(CRC_INIT_VAL, data, size, g_CrcTable) ^ CRC_INIT_VAL;
}
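In both versions, CrcGenerateTable() must run once before the first CrcUpdate/CrcCalc call, since it fills g_CrcTable and selects the update routine; a minimal usage sketch (the lazy-init wrapper is illustrative and not thread-safe):

#include "7zCrc.h"

UInt32 checksum_buffer(const void *data, size_t size)
{
    static int initialized;
    if (!initialized)
    {
        CrcGenerateTable();  /* one-time global init */
        initialized = 1;
    }
    return CrcCalc(data, size);
}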
#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
UInt32 MY_FAST_CALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table);
UInt32 MY_FAST_CALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table)
{
const UInt32 *table = g_CrcTable;
const Byte *p = (const Byte *)data;
const Byte *lim = p + size;
for (; p != lim; p++)
const Byte *pEnd = p + size;
for (; p != pEnd; p++)
v = CRC_UPDATE_BYTE_2(v, *p);
return v;
}
#endif
#if Z7_CRC_NUM_TABLES_USE != 1
#ifndef MY_CPU_BE
#define FUNC_NAME_LE_2(s) CrcUpdateT ## s
#define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s)
#define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC_NUM_TABLES_USE)
UInt32 Z7_FASTCALL FUNC_NAME_LE (UInt32 v, const void *data, size_t size, const UInt32 *table);
#endif
#ifndef MY_CPU_LE
#define FUNC_NAME_BE_2(s) CrcUpdateT1_BeT ## s
#define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s)
#define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC_NUM_TABLES_USE)
UInt32 Z7_FASTCALL FUNC_NAME_BE (UInt32 v, const void *data, size_t size, const UInt32 *table);
#endif
#endif
#endif // Z7_CRC_HW_FORCE
/* ---------- hardware CRC ---------- */
#ifdef MY_CPU_LE
#if defined(MY_CPU_ARM_OR_ARM64)
// #pragma message("ARM*")
#if (defined(__clang__) && (__clang_major__ >= 3)) \
|| defined(__GNUC__) && (__GNUC__ >= 6) && defined(MY_CPU_ARM64) \
|| defined(__GNUC__) && (__GNUC__ >= 8)
#if !defined(__ARM_FEATURE_CRC32)
// #pragma message("!defined(__ARM_FEATURE_CRC32)")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define __ARM_FEATURE_CRC32 1
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_CRC32_WAS_SET
#if defined(__clang__)
#if defined(MY_CPU_ARM64)
#define ATTRIB_CRC __attribute__((__target__("crc")))
#else
#define ATTRIB_CRC __attribute__((__target__("armv8-a,crc")))
#endif
#else
#if defined(MY_CPU_ARM64)
#if !defined(Z7_GCC_VERSION) || (Z7_GCC_VERSION >= 60000)
#define ATTRIB_CRC __attribute__((__target__("+crc")))
#endif
#else
#if !defined(Z7_GCC_VERSION) || (__GNUC__ >= 8)
#if defined(__ARM_FP) && __GNUC__ >= 8
// for -mfloat-abi=hard: similar to <arm_acle.h>
#define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc+simd")))
#else
#define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc")))
#endif
#endif
#endif
#endif
#endif
#if defined(__ARM_FEATURE_CRC32)
// #pragma message("<arm_acle.h>")
/*
arm_acle.h (GGC):
before Nov 17, 2017:
#ifdef __ARM_FEATURE_CRC32
Nov 17, 2017: gcc10.0 (gcc 9.2.0) checked
#if __ARM_ARCH >= 8
#pragma GCC target ("arch=armv8-a+crc")
Aug 22, 2019: GCC 8.4?, 9.2.1, 10.1:
#ifdef __ARM_FEATURE_CRC32
#ifdef __ARM_FP
#pragma GCC target ("arch=armv8-a+crc+simd")
#else
#pragma GCC target ("arch=armv8-a+crc")
#endif
*/
#if defined(__ARM_ARCH) && __ARM_ARCH < 8
#if defined(Z7_GCC_VERSION) && (__GNUC__ == 8) && (Z7_GCC_VERSION < 80400) \
|| defined(Z7_GCC_VERSION) && (__GNUC__ == 9) && (Z7_GCC_VERSION < 90201) \
|| defined(Z7_GCC_VERSION) && (__GNUC__ == 10) && (Z7_GCC_VERSION < 100100)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
// #pragma message("#define __ARM_ARCH 8")
#undef __ARM_ARCH
#define __ARM_ARCH 8
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif
#define Z7_CRC_HW_USE
#include <arm_acle.h>
#endif
#elif defined(_MSC_VER)
#if defined(_MSC_VER)
#if defined(MY_CPU_ARM64)
#if (_MSC_VER >= 1910)
#ifdef __clang__
// #define Z7_CRC_HW_USE
// #include <arm_acle.h>
#else
#define Z7_CRC_HW_USE
#include <intrin.h>
#endif
#define USE_ARM64_CRC
#endif
#endif
#elif (defined(__clang__) && (__clang_major__ >= 3)) \
|| (defined(__GNUC__) && (__GNUC__ > 4))
#if !defined(__ARM_FEATURE_CRC32)
#define __ARM_FEATURE_CRC32 1
#if (!defined(__clang__) || (__clang_major__ > 3)) // fix these numbers
#define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc")))
#endif
#endif
#if defined(__ARM_FEATURE_CRC32)
#define USE_ARM64_CRC
#include <arm_acle.h>
#endif
#endif
#else // non-ARM*
#else
// #define Z7_CRC_HW_USE // for debug : we can test HW-branch of code
#ifdef Z7_CRC_HW_USE
#include "7zCrcEmu.h"
// no hardware CRC
// #define USE_CRC_EMU
#ifdef USE_CRC_EMU
#pragma message("ARM64 CRC emulation")
MY_FORCE_INLINE
UInt32 __crc32b(UInt32 v, UInt32 data)
{
const UInt32 *table = g_CrcTable;
v = CRC_UPDATE_BYTE_2(v, (Byte)data);
return v;
}
MY_FORCE_INLINE
UInt32 __crc32w(UInt32 v, UInt32 data)
{
const UInt32 *table = g_CrcTable;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
return v;
}
MY_FORCE_INLINE
UInt32 __crc32d(UInt32 v, UInt64 data)
{
const UInt32 *table = g_CrcTable;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
return v;
}
#endif // USE_CRC_EMU
#endif // defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
#if defined(USE_ARM64_CRC) || defined(USE_CRC_EMU)
#define T0_32_UNROLL_BYTES (4 * 4)
#define T0_64_UNROLL_BYTES (4 * 8)
#ifndef ATTRIB_CRC
#define ATTRIB_CRC
#endif
#endif // non-ARM*
#if defined(Z7_CRC_HW_USE)
// #pragma message("USE ARM HW CRC")
#ifdef MY_CPU_64BIT
#define CRC_HW_WORD_TYPE UInt64
#define CRC_HW_WORD_FUNC __crc32d
#else
#define CRC_HW_WORD_TYPE UInt32
#define CRC_HW_WORD_FUNC __crc32w
#endif
#define CRC_HW_UNROLL_BYTES (sizeof(CRC_HW_WORD_TYPE) * 4)
#ifdef ATTRIB_CRC
ATTRIB_CRC
#endif
Z7_NO_INLINE
#ifdef Z7_CRC_HW_FORCE
UInt32 Z7_FASTCALL CrcUpdate
#else
static UInt32 Z7_FASTCALL CrcUpdate_HW
#endif
(UInt32 v, const void *data, size_t size)
ATTRIB_CRC
UInt32 MY_FAST_CALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table);
ATTRIB_CRC
UInt32 MY_FAST_CALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table)
{
const Byte *p = (const Byte *)data;
for (; size != 0 && ((unsigned)(ptrdiff_t)p & (CRC_HW_UNROLL_BYTES - 1)) != 0; size--)
UNUSED_VAR(table);
for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_32_UNROLL_BYTES - 1)) != 0; size--)
v = __crc32b(v, *p++);
if (size >= CRC_HW_UNROLL_BYTES)
if (size >= T0_32_UNROLL_BYTES)
{
const Byte *lim = p + size;
size &= CRC_HW_UNROLL_BYTES - 1;
size &= (T0_32_UNROLL_BYTES - 1);
lim -= size;
do
{
v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p));
v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE)));
p += 2 * sizeof(CRC_HW_WORD_TYPE);
v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p));
v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE)));
p += 2 * sizeof(CRC_HW_WORD_TYPE);
v = __crc32w(v, *(const UInt32 *)(const void *)(p));
v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4;
v = __crc32w(v, *(const UInt32 *)(const void *)(p));
v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4;
}
while (p != lim);
}
@@ -223,198 +187,136 @@ Z7_NO_INLINE
return v;
}
#ifdef Z7_ARM_FEATURE_CRC32_WAS_SET
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_CRC32
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#undef Z7_ARM_FEATURE_CRC32_WAS_SET
#endif
ATTRIB_CRC
UInt32 MY_FAST_CALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table);
ATTRIB_CRC
UInt32 MY_FAST_CALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table)
{
const Byte *p = (const Byte *)data;
UNUSED_VAR(table);
for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_64_UNROLL_BYTES - 1)) != 0; size--)
v = __crc32b(v, *p++);
if (size >= T0_64_UNROLL_BYTES)
{
const Byte *lim = p + size;
size &= (T0_64_UNROLL_BYTES - 1);
lim -= size;
do
{
v = __crc32d(v, *(const UInt64 *)(const void *)(p));
v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8;
v = __crc32d(v, *(const UInt64 *)(const void *)(p));
v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8;
}
while (p != lim);
}
for (; size != 0; size--)
v = __crc32b(v, *p++);
return v;
}
#endif // defined(USE_ARM64_CRC) || defined(USE_CRC_EMU)
#endif // defined(Z7_CRC_HW_USE)
#endif // MY_CPU_LE
#ifndef Z7_CRC_HW_FORCE
#if defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME)
/*
typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_WITH_TABLE_FUNC)
(UInt32 v, const void *data, size_t size, const UInt32 *table);
Z7_CRC_UPDATE_WITH_TABLE_FUNC g_CrcUpdate;
*/
static unsigned g_Crc_Algo;
#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE))
static unsigned g_Crc_Be;
#endif
#endif // defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME)
Z7_NO_INLINE
#ifdef Z7_CRC_HW_USE
static UInt32 Z7_FASTCALL CrcUpdate_Base
#else
UInt32 Z7_FASTCALL CrcUpdate
#endif
(UInt32 crc, const void *data, size_t size)
{
#if Z7_CRC_NUM_TABLES_USE == 1
return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size);
#else // Z7_CRC_NUM_TABLES_USE != 1
#ifdef Z7_CRC_UPDATE_T1_FUNC_NAME
if (g_Crc_Algo == 1)
return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size);
#endif
#ifdef MY_CPU_LE
return FUNC_NAME_LE(crc, data, size, g_CrcTable);
#elif defined(MY_CPU_BE)
return FUNC_NAME_BE(crc, data, size, g_CrcTable);
#else
if (g_Crc_Be)
return FUNC_NAME_BE(crc, data, size, g_CrcTable);
else
return FUNC_NAME_LE(crc, data, size, g_CrcTable);
#endif
#endif // Z7_CRC_NUM_TABLES_USE != 1
}
#ifdef Z7_CRC_HW_USE
Z7_NO_INLINE
UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size)
{
if (g_Crc_Algo == 0)
return CrcUpdate_HW(crc, data, size);
return CrcUpdate_Base(crc, data, size);
}
#endif
#endif // !defined(Z7_CRC_HW_FORCE)
UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size)
{
return CrcUpdate(CRC_INIT_VAL, data, size) ^ CRC_INIT_VAL;
}
MY_ALIGN(64)
UInt32 g_CrcTable[256 * Z7_CRC_NUM_TABLES_TOTAL];
void Z7_FASTCALL CrcGenerateTable(void)
void MY_FAST_CALL CrcGenerateTable()
{
UInt32 i;
for (i = 0; i < 256; i++)
{
#if defined(Z7_CRC_HW_FORCE)
g_CrcTable[i] = __crc32b(i, 0);
#else
#define kCrcPoly 0xEDB88320
UInt32 r = i;
unsigned j;
for (j = 0; j < 8; j++)
r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1)));
g_CrcTable[i] = r;
#endif
}
for (i = 256; i < 256 * Z7_CRC_NUM_TABLES_USE; i++)
for (i = 256; i < 256 * CRC_NUM_TABLES; i++)
{
const UInt32 r = g_CrcTable[(size_t)i - 256];
UInt32 r = g_CrcTable[(size_t)i - 256];
g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8);
}
#if !defined(Z7_CRC_HW_FORCE) && \
(defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) || defined(MY_CPU_BE))
#if CRC_NUM_TABLES < 4
g_CrcUpdate = CrcUpdateT1;
#else
#ifdef MY_CPU_LE
#if Z7_CRC_NUM_TABLES_USE <= 1
g_Crc_Algo = 1;
#else // Z7_CRC_NUM_TABLES_USE <= 1
g_CrcUpdateT4 = CrcUpdateT4;
g_CrcUpdate = CrcUpdateT4;
#if defined(MY_CPU_LE)
g_Crc_Algo = Z7_CRC_NUM_TABLES_USE;
#else // !defined(MY_CPU_LE)
#if CRC_NUM_TABLES >= 8
g_CrcUpdateT8 = CrcUpdateT8;
#ifdef MY_CPU_X86_OR_AMD64
if (!CPU_Is_InOrder())
#endif
g_CrcUpdate = CrcUpdateT8;
#endif
#else
{
#ifndef MY_CPU_BE
#ifndef MY_CPU_BE
UInt32 k = 0x01020304;
const Byte *p = (const Byte *)&k;
if (p[0] == 4 && p[1] == 3)
g_Crc_Algo = Z7_CRC_NUM_TABLES_USE;
else if (p[0] != 1 || p[1] != 2)
g_Crc_Algo = 1;
else
#endif // MY_CPU_BE
{
for (i = 256 * Z7_CRC_NUM_TABLES_TOTAL - 1; i >= 256; i--)
g_CrcUpdateT4 = CrcUpdateT4;
g_CrcUpdate = CrcUpdateT4;
#if CRC_NUM_TABLES >= 8
g_CrcUpdateT8 = CrcUpdateT8;
g_CrcUpdate = CrcUpdateT8;
#endif
}
else if (p[0] != 1 || p[1] != 2)
g_CrcUpdate = CrcUpdateT1;
else
#endif
{
for (i = 256 * CRC_NUM_TABLES - 1; i >= 256; i--)
{
const UInt32 x = g_CrcTable[(size_t)i - 256];
g_CrcTable[i] = Z7_BSWAP32(x);
UInt32 x = g_CrcTable[(size_t)i - 256];
g_CrcTable[i] = CRC_UINT32_SWAP(x);
}
#if defined(Z7_CRC_UPDATE_T1_FUNC_NAME)
g_Crc_Algo = Z7_CRC_NUM_TABLES_USE;
#endif
#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE))
g_Crc_Be = 1;
#endif
g_CrcUpdateT4 = CrcUpdateT1_BeT4;
g_CrcUpdate = CrcUpdateT1_BeT4;
#if CRC_NUM_TABLES >= 8
g_CrcUpdateT8 = CrcUpdateT1_BeT8;
g_CrcUpdate = CrcUpdateT1_BeT8;
#endif
}
}
#endif // !defined(MY_CPU_LE)
#ifdef MY_CPU_LE
#ifdef Z7_CRC_HW_USE
if (CPU_IsSupported_CRC32())
g_Crc_Algo = 0;
#endif // Z7_CRC_HW_USE
#endif // MY_CPU_LE
#endif // Z7_CRC_NUM_TABLES_USE <= 1
#endif // g_Crc_Algo was declared
}
Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo)
{
if (algo == 0)
return &CrcUpdate;
#if defined(Z7_CRC_HW_USE)
if (algo == sizeof(CRC_HW_WORD_TYPE) * 8)
{
#ifdef Z7_CRC_HW_FORCE
return &CrcUpdate;
#else
if (g_Crc_Algo == 0)
return &CrcUpdate_HW;
#endif
}
#endif
#ifndef Z7_CRC_HW_FORCE
if (algo == Z7_CRC_NUM_TABLES_USE)
return
#ifdef Z7_CRC_HW_USE
&CrcUpdate_Base;
#else
&CrcUpdate;
#endif
#endif
#endif
return NULL;
#ifdef MY_CPU_LE
#ifdef USE_ARM64_CRC
if (CPU_IsSupported_CRC32())
{
g_CrcUpdateT0_32 = CrcUpdateT0_32;
g_CrcUpdateT0_64 = CrcUpdateT0_64;
g_CrcUpdate =
#if defined(MY_CPU_ARM)
CrcUpdateT0_32;
#else
CrcUpdateT0_64;
#endif
}
#endif
#ifdef USE_CRC_EMU
g_CrcUpdateT0_32 = CrcUpdateT0_32;
g_CrcUpdateT0_64 = CrcUpdateT0_64;
g_CrcUpdate = CrcUpdateT0_64;
#endif
#endif
}
#undef kCrcPoly
#undef Z7_CRC_NUM_TABLES_USE
#undef Z7_CRC_NUM_TABLES_TOTAL
#undef CRC_UPDATE_BYTE_2
#undef FUNC_NAME_LE_2
#undef FUNC_NAME_LE_1
#undef FUNC_NAME_LE
#undef FUNC_NAME_BE_2
#undef FUNC_NAME_BE_1
#undef FUNC_NAME_BE
#undef CRC_HW_UNROLL_BYTES
#undef CRC_HW_WORD_FUNC
#undef CRC_HW_WORD_TYPE
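
The extra tables built above are what make the slicing loops possible: in the little-endian layout, g_CrcTable[256*k + i] is the CRC state after processing byte i followed by k zero bytes, so a whole word can be folded with independent lookups XORed together. A self-check sketch under that reading (assumes CrcGenerateTable has run and numTables is however many tables were generated; illustrative only):

    static int CrcTables_Check(unsigned numTables)
    {
      unsigned k, i;
      for (k = 1; k < numTables; k++)
        for (i = 0; i < 256; i++)
        {
          /* advance the table-(k-1) entry by one zero byte;
             the result must equal the table-k entry */
          const UInt32 r = g_CrcTable[256 * (k - 1) + i];
          if (g_CrcTable[256 * k + i] != (g_CrcTable[r & 0xFF] ^ (r >> 8)))
            return 0;
        }
      return 1;
    }
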


@ -1,8 +1,8 @@
/* 7zCrc.h -- CRC32 calculation
2024-01-22 : Igor Pavlov : Public domain */
2013-01-18 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_7Z_CRC_H
#define ZIP7_INC_7Z_CRC_H
#ifndef __7Z_CRC_H
#define __7Z_CRC_H
#include "7zTypes.h"
@ -11,17 +11,14 @@ EXTERN_C_BEGIN
extern UInt32 g_CrcTable[];
/* Call CrcGenerateTable one time before other CRC functions */
void Z7_FASTCALL CrcGenerateTable(void);
void MY_FAST_CALL CrcGenerateTable(void);
#define CRC_INIT_VAL 0xFFFFFFFF
#define CRC_GET_DIGEST(crc) ((crc) ^ CRC_INIT_VAL)
#define CRC_UPDATE_BYTE(crc, b) (g_CrcTable[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size);
UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size);
typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_FUNC)(UInt32 v, const void *data, size_t size);
Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo);
UInt32 MY_FAST_CALL CrcUpdate(UInt32 crc, const void *data, size_t size);
UInt32 MY_FAST_CALL CrcCalc(const void *data, size_t size);
EXTERN_C_END
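
The streaming form accumulates through CrcUpdate with the macros above (a sketch; part1/part2 and their lengths are placeholders):

    UInt32 crc = CRC_INIT_VAL;
    UInt32 digest;
    crc = CrcUpdate(crc, part1, len1);
    crc = CrcUpdate(crc, part2, len2);
    digest = CRC_GET_DIGEST(crc);   /* equals CrcCalc over the concatenation */
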


@ -1,199 +1,117 @@
/* 7zCrcOpt.c -- CRC32 calculation (optimized functions)
2023-12-07 : Igor Pavlov : Public domain */
/* 7zCrcOpt.c -- CRC32 calculation
2021-02-09 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include "CpuArch.h"
#if !defined(Z7_CRC_NUM_TABLES) || Z7_CRC_NUM_TABLES > 1
// for debug only : define Z7_CRC_DEBUG_BE to test big-endian code in little-endian cpu
// #define Z7_CRC_DEBUG_BE
#ifdef Z7_CRC_DEBUG_BE
#undef MY_CPU_LE
#define MY_CPU_BE
#endif
// the value Z7_CRC_NUM_TABLES_USE must be defined to same value as in 7zCrc.c
#ifdef Z7_CRC_NUM_TABLES
#define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES
#else
#define Z7_CRC_NUM_TABLES_USE 12
#endif
#if Z7_CRC_NUM_TABLES_USE % 4 || \
Z7_CRC_NUM_TABLES_USE < 4 * 1 || \
Z7_CRC_NUM_TABLES_USE > 4 * 6
#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
#endif
#ifndef MY_CPU_BE
#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
#define Q(n, d) \
( (table + ((n) * 4 + 3) * 0x100)[(Byte)(d)] \
^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \
^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \
^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] )
#define R(a) *((const UInt32 *)(const void *)p + (a))
#define CRC_FUNC_PRE_LE2(step) \
UInt32 Z7_FASTCALL CrcUpdateT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table)
#define CRC_FUNC_PRE_LE(step) \
CRC_FUNC_PRE_LE2(step); \
CRC_FUNC_PRE_LE2(step)
CRC_FUNC_PRE_LE(Z7_CRC_NUM_TABLES_USE)
UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table);
UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table)
{
const Byte *p = (const Byte *)data;
const Byte *lim;
for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++)
for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++)
v = CRC_UPDATE_BYTE_2(v, *p);
lim = p + size;
if (size >= Z7_CRC_NUM_TABLES_USE)
for (; size >= 4; size -= 4, p += 4)
{
lim -= Z7_CRC_NUM_TABLES_USE;
do
{
v ^= R(0);
{
#if Z7_CRC_NUM_TABLES_USE == 1 * 4
v = Q(0, v);
#else
#define U2(r, op) \
{ d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); }
UInt32 d, x;
U2(1, =)
#if Z7_CRC_NUM_TABLES_USE >= 3 * 4
#define U(r) U2(r, ^=)
U(2)
#if Z7_CRC_NUM_TABLES_USE >= 4 * 4
U(3)
#if Z7_CRC_NUM_TABLES_USE >= 5 * 4
U(4)
#if Z7_CRC_NUM_TABLES_USE >= 6 * 4
U(5)
#if Z7_CRC_NUM_TABLES_USE >= 7 * 4
#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
#endif
#endif
#endif
#endif
#endif
#undef U
#undef U2
v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v);
#endif
}
p += Z7_CRC_NUM_TABLES_USE;
}
while (p <= lim);
lim += Z7_CRC_NUM_TABLES_USE;
v ^= *(const UInt32 *)(const void *)p;
v =
(table + 0x300)[((v ) & 0xFF)]
^ (table + 0x200)[((v >> 8) & 0xFF)]
^ (table + 0x100)[((v >> 16) & 0xFF)]
^ (table + 0x000)[((v >> 24))];
}
for (; p < lim; p++)
for (; size > 0; size--, p++)
v = CRC_UPDATE_BYTE_2(v, *p);
return v;
}
#undef CRC_UPDATE_BYTE_2
#undef R
#undef Q
#undef CRC_FUNC_PRE_LE
#undef CRC_FUNC_PRE_LE2
UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table);
UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table)
{
const Byte *p = (const Byte *)data;
for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++)
v = CRC_UPDATE_BYTE_2(v, *p);
for (; size >= 8; size -= 8, p += 8)
{
UInt32 d;
v ^= *(const UInt32 *)(const void *)p;
v =
(table + 0x700)[((v ) & 0xFF)]
^ (table + 0x600)[((v >> 8) & 0xFF)]
^ (table + 0x500)[((v >> 16) & 0xFF)]
^ (table + 0x400)[((v >> 24))];
d = *((const UInt32 *)(const void *)p + 1);
v ^=
(table + 0x300)[((d ) & 0xFF)]
^ (table + 0x200)[((d >> 8) & 0xFF)]
^ (table + 0x100)[((d >> 16) & 0xFF)]
^ (table + 0x000)[((d >> 24))];
}
for (; size > 0; size--, p++)
v = CRC_UPDATE_BYTE_2(v, *p);
return v;
}
#endif
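
In the 4-byte step of these little-endian loops, the 32-bit word is XORed into the state and each of its bytes is then looked up in the table matching how many bytes still follow it inside the word: the low byte (three bytes follow) in table 3, up to the high byte in table 0. One iteration is therefore exactly four byte-steps at once:

    /* one slicing-by-4 iteration is equivalent to: */
    v = CRC_UPDATE_BYTE_2(v, p[0]);
    v = CRC_UPDATE_BYTE_2(v, p[1]);
    v = CRC_UPDATE_BYTE_2(v, p[2]);
    v = CRC_UPDATE_BYTE_2(v, p[3]);
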
#ifndef MY_CPU_LE
#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 24) ^ (b)] ^ ((crc) << 8))
#define CRC_UINT32_SWAP(v) ((v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24))
#define Q(n, d) \
( (table + ((n) * 4 + 0) * 0x100)[((d)) & 0xFF] \
^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \
^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \
^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] )
#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[(((crc) >> 24) ^ (b))] ^ ((crc) << 8))
#ifdef Z7_CRC_DEBUG_BE
#define R(a) GetBe32a((const UInt32 *)(const void *)p + (a))
#else
#define R(a) *((const UInt32 *)(const void *)p + (a))
#endif
#define CRC_FUNC_PRE_BE2(step) \
UInt32 Z7_FASTCALL CrcUpdateT1_BeT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table)
#define CRC_FUNC_PRE_BE(step) \
CRC_FUNC_PRE_BE2(step); \
CRC_FUNC_PRE_BE2(step)
CRC_FUNC_PRE_BE(Z7_CRC_NUM_TABLES_USE)
UInt32 MY_FAST_CALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table)
{
const Byte *p = (const Byte *)data;
const Byte *lim;
table += 0x100;
v = Z7_BSWAP32(v);
for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++)
v = CRC_UINT32_SWAP(v);
for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++)
v = CRC_UPDATE_BYTE_2_BE(v, *p);
lim = p + size;
if (size >= Z7_CRC_NUM_TABLES_USE)
for (; size >= 4; size -= 4, p += 4)
{
lim -= Z7_CRC_NUM_TABLES_USE;
do
{
v ^= R(0);
{
#if Z7_CRC_NUM_TABLES_USE == 1 * 4
v = Q(0, v);
#else
#define U2(r, op) \
{ d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); }
UInt32 d, x;
U2(1, =)
#if Z7_CRC_NUM_TABLES_USE >= 3 * 4
#define U(r) U2(r, ^=)
U(2)
#if Z7_CRC_NUM_TABLES_USE >= 4 * 4
U(3)
#if Z7_CRC_NUM_TABLES_USE >= 5 * 4
U(4)
#if Z7_CRC_NUM_TABLES_USE >= 6 * 4
U(5)
#if Z7_CRC_NUM_TABLES_USE >= 7 * 4
#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
#endif
#endif
#endif
#endif
#endif
#undef U
#undef U2
v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v);
#endif
}
p += Z7_CRC_NUM_TABLES_USE;
}
while (p <= lim);
lim += Z7_CRC_NUM_TABLES_USE;
v ^= *(const UInt32 *)(const void *)p;
v =
(table + 0x000)[((v ) & 0xFF)]
^ (table + 0x100)[((v >> 8) & 0xFF)]
^ (table + 0x200)[((v >> 16) & 0xFF)]
^ (table + 0x300)[((v >> 24))];
}
for (; p < lim; p++)
for (; size > 0; size--, p++)
v = CRC_UPDATE_BYTE_2_BE(v, *p);
return Z7_BSWAP32(v);
return CRC_UINT32_SWAP(v);
}
#undef CRC_UPDATE_BYTE_2_BE
#undef R
#undef Q
#undef CRC_FUNC_PRE_BE
#undef CRC_FUNC_PRE_BE2
UInt32 MY_FAST_CALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table)
{
const Byte *p = (const Byte *)data;
table += 0x100;
v = CRC_UINT32_SWAP(v);
for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++)
v = CRC_UPDATE_BYTE_2_BE(v, *p);
for (; size >= 8; size -= 8, p += 8)
{
UInt32 d;
v ^= *(const UInt32 *)(const void *)p;
v =
(table + 0x400)[((v ) & 0xFF)]
^ (table + 0x500)[((v >> 8) & 0xFF)]
^ (table + 0x600)[((v >> 16) & 0xFF)]
^ (table + 0x700)[((v >> 24))];
d = *((const UInt32 *)(const void *)p + 1);
v ^=
(table + 0x000)[((d ) & 0xFF)]
^ (table + 0x100)[((d >> 8) & 0xFF)]
^ (table + 0x200)[((d >> 16) & 0xFF)]
^ (table + 0x300)[((d >> 24))];
}
for (; size > 0; size--, p++)
v = CRC_UPDATE_BYTE_2_BE(v, *p);
return CRC_UINT32_SWAP(v);
}
#endif
#undef Z7_CRC_NUM_TABLES_USE
#endif

C/7zDec.c

@ -1,11 +1,11 @@
/* 7zDec.c -- Decoding from 7z folder
: Igor Pavlov : Public domain */
2021-02-09 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include <string.h>
/* #define Z7_PPMD_SUPPORT */
/* #define _7ZIP_PPMD_SUPPPORT */
#include "7z.h"
#include "7zCrc.h"
@ -16,50 +16,27 @@
#include "Delta.h"
#include "LzmaDec.h"
#include "Lzma2Dec.h"
#ifdef Z7_PPMD_SUPPORT
#ifdef _7ZIP_PPMD_SUPPPORT
#include "Ppmd7.h"
#endif
#define k_Copy 0
#ifndef Z7_NO_METHOD_LZMA2
#ifndef _7Z_NO_METHOD_LZMA2
#define k_LZMA2 0x21
#endif
#define k_LZMA 0x30101
#define k_BCJ2 0x303011B
#if !defined(Z7_NO_METHODS_FILTERS)
#define Z7_USE_BRANCH_FILTER
#endif
#if !defined(Z7_NO_METHODS_FILTERS) || \
defined(Z7_USE_NATIVE_BRANCH_FILTER) && defined(MY_CPU_ARM64)
#define Z7_USE_FILTER_ARM64
#ifndef Z7_USE_BRANCH_FILTER
#define Z7_USE_BRANCH_FILTER
#endif
#define k_ARM64 0xa
#endif
#if !defined(Z7_NO_METHODS_FILTERS) || \
defined(Z7_USE_NATIVE_BRANCH_FILTER) && defined(MY_CPU_ARMT)
#define Z7_USE_FILTER_ARMT
#ifndef Z7_USE_BRANCH_FILTER
#define Z7_USE_BRANCH_FILTER
#endif
#define k_ARMT 0x3030701
#endif
#ifndef Z7_NO_METHODS_FILTERS
#ifndef _7Z_NO_METHODS_FILTERS
#define k_Delta 3
#define k_RISCV 0xb
#define k_BCJ 0x3030103
#define k_PPC 0x3030205
#define k_IA64 0x3030401
#define k_ARM 0x3030501
#define k_ARMT 0x3030701
#define k_SPARC 0x3030805
#endif
#ifdef Z7_PPMD_SUPPORT
#ifdef _7ZIP_PPMD_SUPPPORT
#define k_PPMD 0x30401
@ -72,12 +49,12 @@ typedef struct
UInt64 processed;
BoolInt extra;
SRes res;
ILookInStreamPtr inStream;
const ILookInStream *inStream;
} CByteInToLook;
static Byte ReadByte(IByteInPtr pp)
static Byte ReadByte(const IByteIn *pp)
{
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CByteInToLook)
CByteInToLook *p = CONTAINER_FROM_VTBL(pp, CByteInToLook, vt);
if (p->cur != p->end)
return *p->cur++;
if (p->res == SZ_OK)
@ -90,13 +67,13 @@ static Byte ReadByte(IByteInPtr pp)
p->cur = p->begin;
p->end = p->begin + size;
if (size != 0)
return *p->cur++;
return *p->cur++;;
}
p->extra = True;
return 0;
}
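
ReadByte above is the C-style virtual call used throughout the decoder: the interface struct is embedded as a member (vt), the callback receives a pointer to that member, and CONTAINER_FROM_VTBL (or Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR on the new side) subtracts the member offset to recover the enclosing object. A stripped-down sketch of the same shape (CMemByteIn and its fields are illustrative, 21.07-side signature):

    typedef struct
    {
      IByteIn vt;          /* initialize with vt.Read = MemByteIn_Read */
      const Byte *cur;
      const Byte *end;
    } CMemByteIn;

    static Byte MemByteIn_Read(const IByteIn *pp)
    {
      CMemByteIn *p = CONTAINER_FROM_VTBL(pp, CMemByteIn, vt);
      return (p->cur != p->end) ? *p->cur++ : 0;  /* 0 means EOF, per IByteIn */
    }
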
static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream,
static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, const ILookInStream *inStream,
Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain)
{
CPpmd7 ppmd;
@ -161,14 +138,14 @@ static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, I
#endif
static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream,
static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStream *inStream,
Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain)
{
CLzmaDec state;
SRes res = SZ_OK;
LzmaDec_CONSTRUCT(&state)
RINOK(LzmaDec_AllocateProbs(&state, props, propsSize, allocMain))
LzmaDec_Construct(&state);
RINOK(LzmaDec_AllocateProbs(&state, props, propsSize, allocMain));
state.dic = outBuffer;
state.dicBufSize = outSize;
LzmaDec_Init(&state);
@ -219,18 +196,18 @@ static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, I
}
#ifndef Z7_NO_METHOD_LZMA2
#ifndef _7Z_NO_METHOD_LZMA2
static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream,
static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStream *inStream,
Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain)
{
CLzma2Dec state;
SRes res = SZ_OK;
Lzma2Dec_CONSTRUCT(&state)
Lzma2Dec_Construct(&state);
if (propsSize != 1)
return SZ_ERROR_DATA;
RINOK(Lzma2Dec_AllocateProbs(&state, props[0], allocMain))
RINOK(Lzma2Dec_AllocateProbs(&state, props[0], allocMain));
state.decoder.dic = outBuffer;
state.decoder.dicBufSize = outSize;
Lzma2Dec_Init(&state);
@ -280,7 +257,7 @@ static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize,
#endif
static SRes SzDecodeCopy(UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuffer)
static SRes SzDecodeCopy(UInt64 inSize, ILookInStream *inStream, Byte *outBuffer)
{
while (inSize > 0)
{
@ -288,13 +265,13 @@ static SRes SzDecodeCopy(UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuff
size_t curSize = (1 << 18);
if (curSize > inSize)
curSize = (size_t)inSize;
RINOK(ILookInStream_Look(inStream, &inBuf, &curSize))
RINOK(ILookInStream_Look(inStream, &inBuf, &curSize));
if (curSize == 0)
return SZ_ERROR_INPUT_EOF;
memcpy(outBuffer, inBuf, curSize);
outBuffer += curSize;
inSize -= curSize;
RINOK(ILookInStream_Skip(inStream, curSize))
RINOK(ILookInStream_Skip(inStream, curSize));
}
return SZ_OK;
}
@ -305,16 +282,15 @@ static BoolInt IS_MAIN_METHOD(UInt32 m)
{
case k_Copy:
case k_LZMA:
#ifndef Z7_NO_METHOD_LZMA2
#ifndef _7Z_NO_METHOD_LZMA2
case k_LZMA2:
#endif
#ifdef Z7_PPMD_SUPPORT
#endif
#ifdef _7ZIP_PPMD_SUPPPORT
case k_PPMD:
#endif
#endif
return True;
default:
return False;
}
return False;
}
static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c)
@ -341,7 +317,7 @@ static SRes CheckSupportedFolder(const CSzFolder *f)
}
#if defined(Z7_USE_BRANCH_FILTER)
#ifndef _7Z_NO_METHODS_FILTERS
if (f->NumCoders == 2)
{
@ -357,21 +333,13 @@ static SRes CheckSupportedFolder(const CSzFolder *f)
return SZ_ERROR_UNSUPPORTED;
switch ((UInt32)c->MethodID)
{
#if !defined(Z7_NO_METHODS_FILTERS)
case k_Delta:
case k_BCJ:
case k_PPC:
case k_IA64:
case k_SPARC:
case k_ARM:
case k_RISCV:
#endif
#ifdef Z7_USE_FILTER_ARM64
case k_ARM64:
#endif
#ifdef Z7_USE_FILTER_ARMT
case k_ARMT:
#endif
break;
default:
return SZ_ERROR_UNSUPPORTED;
@ -404,16 +372,15 @@ static SRes CheckSupportedFolder(const CSzFolder *f)
return SZ_ERROR_UNSUPPORTED;
}
#ifndef _7Z_NO_METHODS_FILTERS
#define CASE_BRA_CONV(isa) case k_ ## isa: isa ## _Convert(outBuffer, outSize, 0, 0); break;
#endif
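
CASE_BRA_CONV is token pasting; on the 21.07 side, CASE_BRA_CONV(PPC) expands to

    case k_PPC: PPC_Convert(outBuffer, outSize, 0, 0); break;

while the replacement macro defined further down in this diff routes through the Z7_BRANCH_CONV_DEC naming instead.
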
static SRes SzFolder_Decode2(const CSzFolder *folder,
const Byte *propsData,
const UInt64 *unpackSizes,
const UInt64 *packPositions,
ILookInStreamPtr inStream, UInt64 startPos,
ILookInStream *inStream, UInt64 startPos,
Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain,
Byte *tempBuf[])
{
@ -422,7 +389,7 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
SizeT tempSize3 = 0;
Byte *tempBuf3 = 0;
RINOK(CheckSupportedFolder(folder))
RINOK(CheckSupportedFolder(folder));
for (ci = 0; ci < folder->NumCoders; ci++)
{
@ -437,8 +404,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
SizeT outSizeCur = outSize;
if (folder->NumCoders == 4)
{
const UInt32 indices[] = { 3, 2, 0 };
const UInt64 unpackSize = unpackSizes[ci];
UInt32 indices[] = { 3, 2, 0 };
UInt64 unpackSize = unpackSizes[ci];
si = indices[ci];
if (ci < 2)
{
@ -464,37 +431,37 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
}
offset = packPositions[si];
inSize = packPositions[(size_t)si + 1] - offset;
RINOK(LookInStream_SeekTo(inStream, startPos + offset))
RINOK(LookInStream_SeekTo(inStream, startPos + offset));
if (coder->MethodID == k_Copy)
{
if (inSize != outSizeCur) /* check it */
return SZ_ERROR_DATA;
RINOK(SzDecodeCopy(inSize, inStream, outBufCur))
RINOK(SzDecodeCopy(inSize, inStream, outBufCur));
}
else if (coder->MethodID == k_LZMA)
{
RINOK(SzDecodeLzma(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain))
RINOK(SzDecodeLzma(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain));
}
#ifndef Z7_NO_METHOD_LZMA2
#ifndef _7Z_NO_METHOD_LZMA2
else if (coder->MethodID == k_LZMA2)
{
RINOK(SzDecodeLzma2(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain))
RINOK(SzDecodeLzma2(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain));
}
#endif
#ifdef Z7_PPMD_SUPPORT
#endif
#ifdef _7ZIP_PPMD_SUPPPORT
else if (coder->MethodID == k_PPMD)
{
RINOK(SzDecodePpmd(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain))
RINOK(SzDecodePpmd(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain));
}
#endif
#endif
else
return SZ_ERROR_UNSUPPORTED;
}
else if (coder->MethodID == k_BCJ2)
{
const UInt64 offset = packPositions[1];
const UInt64 s3Size = packPositions[2] - offset;
UInt64 offset = packPositions[1];
UInt64 s3Size = packPositions[2] - offset;
if (ci != 3)
return SZ_ERROR_UNSUPPORTED;
@ -506,8 +473,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
if (!tempBuf[2] && tempSizes[2] != 0)
return SZ_ERROR_MEM;
RINOK(LookInStream_SeekTo(inStream, startPos + offset))
RINOK(SzDecodeCopy(s3Size, inStream, tempBuf[2]))
RINOK(LookInStream_SeekTo(inStream, startPos + offset));
RINOK(SzDecodeCopy(s3Size, inStream, tempBuf[2]));
if ((tempSizes[0] & 3) != 0 ||
(tempSizes[1] & 3) != 0 ||
@ -526,22 +493,26 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
p.destLim = outBuffer + outSize;
Bcj2Dec_Init(&p);
RINOK(Bcj2Dec_Decode(&p))
RINOK(Bcj2Dec_Decode(&p));
{
unsigned i;
for (i = 0; i < 4; i++)
if (p.bufs[i] != p.lims[i])
return SZ_ERROR_DATA;
if (p.dest != p.destLim || !Bcj2Dec_IsMaybeFinished(&p))
if (!Bcj2Dec_IsFinished(&p))
return SZ_ERROR_DATA;
if (p.dest != p.destLim
|| p.state != BCJ2_STREAM_MAIN)
return SZ_ERROR_DATA;
}
}
}
#if defined(Z7_USE_BRANCH_FILTER)
#ifndef _7Z_NO_METHODS_FILTERS
else if (ci == 1)
{
#if !defined(Z7_NO_METHODS_FILTERS)
if (coder->MethodID == k_Delta)
{
if (coder->PropsSize != 1)
@ -551,75 +522,31 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
Delta_Init(state);
Delta_Decode(state, (unsigned)(propsData[coder->PropsOffset]) + 1, outBuffer, outSize);
}
continue;
}
#endif
#ifdef Z7_USE_FILTER_ARM64
if (coder->MethodID == k_ARM64)
{
UInt32 pc = 0;
if (coder->PropsSize == 4)
{
pc = GetUi32(propsData + coder->PropsOffset);
if (pc & 3)
return SZ_ERROR_UNSUPPORTED;
}
else if (coder->PropsSize != 0)
return SZ_ERROR_UNSUPPORTED;
z7_BranchConv_ARM64_Dec(outBuffer, outSize, pc);
continue;
}
#endif
#if !defined(Z7_NO_METHODS_FILTERS)
if (coder->MethodID == k_RISCV)
{
UInt32 pc = 0;
if (coder->PropsSize == 4)
{
pc = GetUi32(propsData + coder->PropsOffset);
if (pc & 1)
return SZ_ERROR_UNSUPPORTED;
}
else if (coder->PropsSize != 0)
return SZ_ERROR_UNSUPPORTED;
z7_BranchConv_RISCV_Dec(outBuffer, outSize, pc);
continue;
}
#endif
#if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT)
else
{
if (coder->PropsSize != 0)
return SZ_ERROR_UNSUPPORTED;
#define CASE_BRA_CONV(isa) case k_ ## isa: Z7_BRANCH_CONV_DEC(isa)(outBuffer, outSize, 0); break; // pc = 0;
switch (coder->MethodID)
{
#if !defined(Z7_NO_METHODS_FILTERS)
case k_BCJ:
{
UInt32 state = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
z7_BranchConvSt_X86_Dec(outBuffer, outSize, 0, &state); // pc = 0
UInt32 state;
x86_Convert_Init(state);
x86_Convert(outBuffer, outSize, 0, &state, 0);
break;
}
case k_PPC: Z7_BRANCH_CONV_DEC_2(BranchConv_PPC)(outBuffer, outSize, 0); break; // pc = 0;
// CASE_BRA_CONV(PPC)
CASE_BRA_CONV(PPC)
CASE_BRA_CONV(IA64)
CASE_BRA_CONV(SPARC)
CASE_BRA_CONV(ARM)
#endif
#if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT)
CASE_BRA_CONV(ARMT)
#endif
default:
return SZ_ERROR_UNSUPPORTED;
}
continue;
}
#endif
} // (c == 1)
#endif // Z7_USE_BRANCH_FILTER
}
#endif
else
return SZ_ERROR_UNSUPPORTED;
}
@ -629,7 +556,7 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
SRes SzAr_DecodeFolder(const CSzAr *p, UInt32 folderIndex,
ILookInStreamPtr inStream, UInt64 startPos,
ILookInStream *inStream, UInt64 startPos,
Byte *outBuffer, size_t outSize,
ISzAllocPtr allocMain)
{


@ -1,5 +1,5 @@
/* 7zFile.c -- File IO
2023-04-02 : Igor Pavlov : Public domain */
2021-04-29 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -268,7 +268,7 @@ WRes File_Write(CSzFile *p, const void *data, size_t *size)
return errno;
if (processed == 0)
break;
data = (const void *)((const Byte *)data + (size_t)processed);
data = (void *)((Byte *)data + (size_t)processed);
originalSize -= (size_t)processed;
*size += (size_t)processed;
}
@ -287,8 +287,7 @@ WRes File_Seek(CSzFile *p, Int64 *pos, ESzSeek origin)
DWORD moveMethod;
UInt32 low = (UInt32)*pos;
LONG high = (LONG)((UInt64)*pos >> 16 >> 16); /* for case when UInt64 is 32-bit only */
// (int) to eliminate clang warning
switch ((int)origin)
switch (origin)
{
case SZ_SEEK_SET: moveMethod = FILE_BEGIN; break;
case SZ_SEEK_CUR: moveMethod = FILE_CURRENT; break;
@ -309,7 +308,7 @@ WRes File_Seek(CSzFile *p, Int64 *pos, ESzSeek origin)
int moveMethod; // = origin;
switch ((int)origin)
switch (origin)
{
case SZ_SEEK_SET: moveMethod = SEEK_SET; break;
case SZ_SEEK_CUR: moveMethod = SEEK_CUR; break;
@ -388,10 +387,10 @@ WRes File_GetLength(CSzFile *p, UInt64 *length)
/* ---------- FileSeqInStream ---------- */
static SRes FileSeqInStream_Read(ISeqInStreamPtr pp, void *buf, size_t *size)
static SRes FileSeqInStream_Read(const ISeqInStream *pp, void *buf, size_t *size)
{
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileSeqInStream)
const WRes wres = File_Read(&p->file, buf, size);
CFileSeqInStream *p = CONTAINER_FROM_VTBL(pp, CFileSeqInStream, vt);
WRes wres = File_Read(&p->file, buf, size);
p->wres = wres;
return (wres == 0) ? SZ_OK : SZ_ERROR_READ;
}
@ -404,18 +403,18 @@ void FileSeqInStream_CreateVTable(CFileSeqInStream *p)
/* ---------- FileInStream ---------- */
static SRes FileInStream_Read(ISeekInStreamPtr pp, void *buf, size_t *size)
static SRes FileInStream_Read(const ISeekInStream *pp, void *buf, size_t *size)
{
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileInStream)
const WRes wres = File_Read(&p->file, buf, size);
CFileInStream *p = CONTAINER_FROM_VTBL(pp, CFileInStream, vt);
WRes wres = File_Read(&p->file, buf, size);
p->wres = wres;
return (wres == 0) ? SZ_OK : SZ_ERROR_READ;
}
static SRes FileInStream_Seek(ISeekInStreamPtr pp, Int64 *pos, ESzSeek origin)
static SRes FileInStream_Seek(const ISeekInStream *pp, Int64 *pos, ESzSeek origin)
{
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileInStream)
const WRes wres = File_Seek(&p->file, pos, origin);
CFileInStream *p = CONTAINER_FROM_VTBL(pp, CFileInStream, vt);
WRes wres = File_Seek(&p->file, pos, origin);
p->wres = wres;
return (wres == 0) ? SZ_OK : SZ_ERROR_READ;
}
@ -429,10 +428,10 @@ void FileInStream_CreateVTable(CFileInStream *p)
/* ---------- FileOutStream ---------- */
static size_t FileOutStream_Write(ISeqOutStreamPtr pp, const void *data, size_t size)
static size_t FileOutStream_Write(const ISeqOutStream *pp, const void *data, size_t size)
{
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileOutStream)
const WRes wres = File_Write(&p->file, data, &size);
CFileOutStream *p = CONTAINER_FROM_VTBL(pp, CFileOutStream, vt);
WRes wres = File_Write(&p->file, data, &size);
p->wres = wres;
return size;
}
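
These wrappers adapt CSzFile to the generic stream interfaces. Typical wiring (a sketch; InFile_Open is the 7zFile.h opener, which this diff does not show):

    CFileInStream s;
    FileInStream_CreateVTable(&s);
    if (InFile_Open(&s.file, "archive.7z") == 0)
    {
      /* s.vt now serves ISeekInStream Read/Seek backed by the file;
         s.wres keeps the last OS error code for diagnostics */
    }
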


@ -1,8 +1,8 @@
/* 7zFile.h -- File IO
2023-03-05 : Igor Pavlov : Public domain */
2021-02-15 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_FILE_H
#define ZIP7_INC_FILE_H
#ifndef __7Z_FILE_H
#define __7Z_FILE_H
#ifdef _WIN32
#define USE_WINDOWS_FILE
@ -10,8 +10,7 @@
#endif
#ifdef USE_WINDOWS_FILE
#include "7zWindows.h"
#include <windows.h>
#else
// note: USE_FOPEN mode is limited to 32-bit file size
// #define USE_FOPEN


@ -1,5 +1,5 @@
/* 7zStream.c -- 7z Stream functions
2023-04-02 : Igor Pavlov : Public domain */
2021-02-09 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -7,33 +7,12 @@
#include "7zTypes.h"
SRes SeqInStream_ReadMax(ISeqInStreamPtr stream, void *buf, size_t *processedSize)
{
size_t size = *processedSize;
*processedSize = 0;
while (size != 0)
{
size_t cur = size;
const SRes res = ISeqInStream_Read(stream, buf, &cur);
*processedSize += cur;
buf = (void *)((Byte *)buf + cur);
size -= cur;
if (res != SZ_OK)
return res;
if (cur == 0)
return SZ_OK;
}
return SZ_OK;
}
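
SeqInStream_ReadMax (new in this revision) keeps calling Read until the buffer is full or the stream ends, so a short read is never reported as an error. A usage sketch (stream/buf are placeholders):

    size_t processed = sizeof(buf);   /* in: capacity; out: bytes actually read */
    RINOK(SeqInStream_ReadMax(stream, buf, &processed))
    if (processed == 0)
    {
      /* end of stream */
    }
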
/*
SRes SeqInStream_Read2(ISeqInStreamPtr stream, void *buf, size_t size, SRes errorType)
SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes errorType)
{
while (size != 0)
{
size_t processed = size;
RINOK(ISeqInStream_Read(stream, buf, &processed))
RINOK(ISeqInStream_Read(stream, buf, &processed));
if (processed == 0)
return errorType;
buf = (void *)((Byte *)buf + processed);
@ -42,44 +21,42 @@ SRes SeqInStream_Read2(ISeqInStreamPtr stream, void *buf, size_t size, SRes erro
return SZ_OK;
}
SRes SeqInStream_Read(ISeqInStreamPtr stream, void *buf, size_t size)
SRes SeqInStream_Read(const ISeqInStream *stream, void *buf, size_t size)
{
return SeqInStream_Read2(stream, buf, size, SZ_ERROR_INPUT_EOF);
}
*/
SRes SeqInStream_ReadByte(ISeqInStreamPtr stream, Byte *buf)
SRes SeqInStream_ReadByte(const ISeqInStream *stream, Byte *buf)
{
size_t processed = 1;
RINOK(ISeqInStream_Read(stream, buf, &processed))
RINOK(ISeqInStream_Read(stream, buf, &processed));
return (processed == 1) ? SZ_OK : SZ_ERROR_INPUT_EOF;
}
SRes LookInStream_SeekTo(ILookInStreamPtr stream, UInt64 offset)
SRes LookInStream_SeekTo(const ILookInStream *stream, UInt64 offset)
{
Int64 t = (Int64)offset;
return ILookInStream_Seek(stream, &t, SZ_SEEK_SET);
}
SRes LookInStream_LookRead(ILookInStreamPtr stream, void *buf, size_t *size)
SRes LookInStream_LookRead(const ILookInStream *stream, void *buf, size_t *size)
{
const void *lookBuf;
if (*size == 0)
return SZ_OK;
RINOK(ILookInStream_Look(stream, &lookBuf, size))
RINOK(ILookInStream_Look(stream, &lookBuf, size));
memcpy(buf, lookBuf, *size);
return ILookInStream_Skip(stream, *size);
}
SRes LookInStream_Read2(ILookInStreamPtr stream, void *buf, size_t size, SRes errorType)
SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRes errorType)
{
while (size != 0)
{
size_t processed = size;
RINOK(ILookInStream_Read(stream, buf, &processed))
RINOK(ILookInStream_Read(stream, buf, &processed));
if (processed == 0)
return errorType;
buf = (void *)((Byte *)buf + processed);
@ -88,16 +65,16 @@ SRes LookInStream_Read2(ILookInStreamPtr stream, void *buf, size_t size, SRes er
return SZ_OK;
}
SRes LookInStream_Read(ILookInStreamPtr stream, void *buf, size_t size)
SRes LookInStream_Read(const ILookInStream *stream, void *buf, size_t size)
{
return LookInStream_Read2(stream, buf, size, SZ_ERROR_INPUT_EOF);
}
#define GET_LookToRead2 Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CLookToRead2)
#define GET_LookToRead2 CLookToRead2 *p = CONTAINER_FROM_VTBL(pp, CLookToRead2, vt);
static SRes LookToRead2_Look_Lookahead(ILookInStreamPtr pp, const void **buf, size_t *size)
static SRes LookToRead2_Look_Lookahead(const ILookInStream *pp, const void **buf, size_t *size)
{
SRes res = SZ_OK;
GET_LookToRead2
@ -116,7 +93,7 @@ static SRes LookToRead2_Look_Lookahead(ILookInStreamPtr pp, const void **buf, si
return res;
}
static SRes LookToRead2_Look_Exact(ILookInStreamPtr pp, const void **buf, size_t *size)
static SRes LookToRead2_Look_Exact(const ILookInStream *pp, const void **buf, size_t *size)
{
SRes res = SZ_OK;
GET_LookToRead2
@ -136,14 +113,14 @@ static SRes LookToRead2_Look_Exact(ILookInStreamPtr pp, const void **buf, size_t
return res;
}
static SRes LookToRead2_Skip(ILookInStreamPtr pp, size_t offset)
static SRes LookToRead2_Skip(const ILookInStream *pp, size_t offset)
{
GET_LookToRead2
p->pos += offset;
return SZ_OK;
}
static SRes LookToRead2_Read(ILookInStreamPtr pp, void *buf, size_t *size)
static SRes LookToRead2_Read(const ILookInStream *pp, void *buf, size_t *size)
{
GET_LookToRead2
size_t rem = p->size - p->pos;
@ -157,7 +134,7 @@ static SRes LookToRead2_Read(ILookInStreamPtr pp, void *buf, size_t *size)
return SZ_OK;
}
static SRes LookToRead2_Seek(ILookInStreamPtr pp, Int64 *pos, ESzSeek origin)
static SRes LookToRead2_Seek(const ILookInStream *pp, Int64 *pos, ESzSeek origin)
{
GET_LookToRead2
p->pos = p->size = 0;
@ -176,9 +153,9 @@ void LookToRead2_CreateVTable(CLookToRead2 *p, int lookahead)
static SRes SecToLook_Read(ISeqInStreamPtr pp, void *buf, size_t *size)
static SRes SecToLook_Read(const ISeqInStream *pp, void *buf, size_t *size)
{
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSecToLook)
CSecToLook *p = CONTAINER_FROM_VTBL(pp, CSecToLook, vt);
return LookInStream_LookRead(p->realStream, buf, size);
}
@ -187,9 +164,9 @@ void SecToLook_CreateVTable(CSecToLook *p)
p->vt.Read = SecToLook_Read;
}
static SRes SecToRead_Read(ISeqInStreamPtr pp, void *buf, size_t *size)
static SRes SecToRead_Read(const ISeqInStream *pp, void *buf, size_t *size)
{
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSecToRead)
CSecToRead *p = CONTAINER_FROM_VTBL(pp, CSecToRead, vt);
return ILookInStream_Read(p->realStream, buf, size);
}


@ -1,8 +1,8 @@
/* 7zTypes.h -- Basic types
2024-01-24 : Igor Pavlov : Public domain */
2021-12-25 : Igor Pavlov : Public domain */
#ifndef ZIP7_7Z_TYPES_H
#define ZIP7_7Z_TYPES_H
#ifndef __7Z_TYPES_H
#define __7Z_TYPES_H
#ifdef _WIN32
/* #include <windows.h> */
@ -52,11 +52,6 @@ typedef int SRes;
#define MY_ALIGN(n)
#endif
#else
/*
// C11/C++11:
#include <stdalign.h>
#define MY_ALIGN(n) alignas(n)
*/
#define MY_ALIGN(n) __attribute__ ((aligned(n)))
#endif
@ -67,7 +62,7 @@ typedef int SRes;
typedef unsigned WRes;
#define MY_SRes_HRESULT_FROM_WRes(x) HRESULT_FROM_WIN32(x)
// #define MY_HRES_ERROR_INTERNAL_ERROR MY_SRes_HRESULT_FROM_WRes(ERROR_INTERNAL_ERROR)
// #define MY_HRES_ERROR__INTERNAL_ERROR MY_SRes_HRESULT_FROM_WRes(ERROR_INTERNAL_ERROR)
#else // _WIN32
@ -75,13 +70,13 @@ typedef unsigned WRes;
typedef int WRes;
// (FACILITY_ERRNO = 0x800) is 7zip's FACILITY constant to represent (errno) errors in HRESULT
#define MY_FACILITY_ERRNO 0x800
#define MY_FACILITY_WIN32 7
#define MY_FACILITY_WRes MY_FACILITY_ERRNO
#define MY__FACILITY_ERRNO 0x800
#define MY__FACILITY_WIN32 7
#define MY__FACILITY__WRes MY__FACILITY_ERRNO
#define MY_HRESULT_FROM_errno_CONST_ERROR(x) ((HRESULT)( \
( (HRESULT)(x) & 0x0000FFFF) \
| (MY_FACILITY_WRes << 16) \
| (MY__FACILITY__WRes << 16) \
| (HRESULT)0x80000000 ))
#define MY_SRes_HRESULT_FROM_WRes(x) \
@ -125,19 +120,23 @@ typedef int WRes;
#define ERROR_INVALID_REPARSE_DATA ((HRESULT)0x80071128L)
#define ERROR_REPARSE_TAG_INVALID ((HRESULT)0x80071129L)
// if (MY_FACILITY_WRes != FACILITY_WIN32),
// if (MY__FACILITY__WRes != FACILITY_WIN32),
// we use FACILITY_WIN32 for COM errors:
#define E_OUTOFMEMORY ((HRESULT)0x8007000EL)
#define E_INVALIDARG ((HRESULT)0x80070057L)
#define MY_E_ERROR_NEGATIVE_SEEK ((HRESULT)0x80070083L)
#define MY__E_ERROR_NEGATIVE_SEEK ((HRESULT)0x80070083L)
/*
// we can use FACILITY_ERRNO for some COM errors, that have errno equivalents:
#define E_OUTOFMEMORY MY_HRESULT_FROM_errno_CONST_ERROR(ENOMEM)
#define E_INVALIDARG MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL)
#define MY_E_ERROR_NEGATIVE_SEEK MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL)
#define MY__E_ERROR_NEGATIVE_SEEK MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL)
*/
// gcc / clang : (sizeof(long) == sizeof(void*)) in 32/64 bits
typedef long INT_PTR;
typedef unsigned long UINT_PTR;
#define TEXT(quote) quote
#define FILE_ATTRIBUTE_READONLY 0x0001
@ -161,18 +160,18 @@ typedef int WRes;
#ifndef RINOK
#define RINOK(x) { const int _result_ = (x); if (_result_ != 0) return _result_; }
#define RINOK(x) { int __result__ = (x); if (__result__ != 0) return __result__; }
#endif
#ifndef RINOK_WRes
#define RINOK_WRes(x) { const WRes _result_ = (x); if (_result_ != 0) return _result_; }
#define RINOK_WRes(x) { WRes __result__ = (x); if (__result__ != 0) return __result__; }
#endif
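
RINOK is the early-return convention for SRes across the C sources; note that call sites on the new side drop the trailing semicolon, since the macro body is already a braced statement. For example (a sketch using declarations shown elsewhere in this diff):

    static SRes ReadTwoBytes(ISeqInStreamPtr stream, Byte *a, Byte *b)
    {
      RINOK(SeqInStream_ReadByte(stream, a))  /* propagates any non-zero SRes */
      return SeqInStream_ReadByte(stream, b);
    }
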
typedef unsigned char Byte;
typedef short Int16;
typedef unsigned short UInt16;
#ifdef Z7_DECL_Int32_AS_long
#ifdef _LZMA_UINT32_IS_ULONG
typedef long Int32;
typedef unsigned long UInt32;
#else
@ -211,51 +210,37 @@ typedef size_t SIZE_T;
#endif // _WIN32
#define MY_HRES_ERROR_INTERNAL_ERROR ((HRESULT)0x8007054FL)
#define MY_HRES_ERROR__INTERNAL_ERROR ((HRESULT)0x8007054FL)
#ifdef Z7_DECL_Int64_AS_long
#ifdef _SZ_NO_INT_64
/* define _SZ_NO_INT_64, if your compiler doesn't support 64-bit integers.
NOTES: Some code will work incorrectly in that case! */
typedef long Int64;
typedef unsigned long UInt64;
#else
#if (defined(_MSC_VER) || defined(__BORLANDC__)) && !defined(__clang__)
#if defined(_MSC_VER) || defined(__BORLANDC__)
typedef __int64 Int64;
typedef unsigned __int64 UInt64;
#else
#if defined(__clang__) || defined(__GNUC__)
#include <stdint.h>
typedef int64_t Int64;
typedef uint64_t UInt64;
#define UINT64_CONST(n) n
#else
typedef long long int Int64;
typedef unsigned long long int UInt64;
// #define UINT64_CONST(n) n ## ULL
#endif
#define UINT64_CONST(n) n ## ULL
#endif
#endif
#define UINT64_CONST(n) n
#ifdef Z7_DECL_SizeT_AS_unsigned_int
typedef unsigned int SizeT;
#ifdef _LZMA_NO_SYSTEM_SIZE_T
typedef UInt32 SizeT;
#else
typedef size_t SizeT;
#endif
/*
#if (defined(_MSC_VER) && _MSC_VER <= 1200)
typedef size_t MY_uintptr_t;
#else
#include <stdint.h>
typedef uintptr_t MY_uintptr_t;
#endif
*/
typedef int BoolInt;
/* typedef BoolInt Bool; */
#define True 1
@ -263,23 +248,23 @@ typedef int BoolInt;
#ifdef _WIN32
#define Z7_STDCALL __stdcall
#define MY_STD_CALL __stdcall
#else
#define Z7_STDCALL
#define MY_STD_CALL
#endif
#ifdef _MSC_VER
#if _MSC_VER >= 1300
#define Z7_NO_INLINE __declspec(noinline)
#define MY_NO_INLINE __declspec(noinline)
#else
#define Z7_NO_INLINE
#define MY_NO_INLINE
#endif
#define Z7_FORCE_INLINE __forceinline
#define MY_FORCE_INLINE __forceinline
#define Z7_CDECL __cdecl
#define Z7_FASTCALL __fastcall
#define MY_CDECL __cdecl
#define MY_FAST_CALL __fastcall
#else // _MSC_VER
@ -287,25 +272,27 @@ typedef int BoolInt;
|| (defined(__clang__) && (__clang_major__ >= 4)) \
|| defined(__INTEL_COMPILER) \
|| defined(__xlC__)
#define Z7_NO_INLINE __attribute__((noinline))
#define Z7_FORCE_INLINE __attribute__((always_inline)) inline
#define MY_NO_INLINE __attribute__((noinline))
// #define MY_FORCE_INLINE __attribute__((always_inline)) inline
#else
#define Z7_NO_INLINE
#define Z7_FORCE_INLINE
#define MY_NO_INLINE
#endif
#define Z7_CDECL
#define MY_FORCE_INLINE
#define MY_CDECL
#if defined(_M_IX86) \
|| defined(__i386__)
// #define Z7_FASTCALL __attribute__((fastcall))
// #define Z7_FASTCALL __attribute__((cdecl))
#define Z7_FASTCALL
// #define MY_FAST_CALL __attribute__((fastcall))
// #define MY_FAST_CALL __attribute__((cdecl))
#define MY_FAST_CALL
#elif defined(MY_CPU_AMD64)
// #define Z7_FASTCALL __attribute__((ms_abi))
#define Z7_FASTCALL
// #define MY_FAST_CALL __attribute__((ms_abi))
#define MY_FAST_CALL
#else
#define Z7_FASTCALL
#define MY_FAST_CALL
#endif
#endif // _MSC_VER
@ -313,49 +300,41 @@ typedef int BoolInt;
/* The following interfaces use first parameter as pointer to structure */
// #define Z7_C_IFACE_CONST_QUAL
#define Z7_C_IFACE_CONST_QUAL const
#define Z7_C_IFACE_DECL(a) \
struct a ## _; \
typedef Z7_C_IFACE_CONST_QUAL struct a ## _ * a ## Ptr; \
typedef struct a ## _ a; \
struct a ## _
Z7_C_IFACE_DECL (IByteIn)
typedef struct IByteIn IByteIn;
struct IByteIn
{
Byte (*Read)(IByteInPtr p); /* reads one byte, returns 0 in case of EOF or error */
Byte (*Read)(const IByteIn *p); /* reads one byte, returns 0 in case of EOF or error */
};
#define IByteIn_Read(p) (p)->Read(p)
Z7_C_IFACE_DECL (IByteOut)
typedef struct IByteOut IByteOut;
struct IByteOut
{
void (*Write)(IByteOutPtr p, Byte b);
void (*Write)(const IByteOut *p, Byte b);
};
#define IByteOut_Write(p, b) (p)->Write(p, b)
Z7_C_IFACE_DECL (ISeqInStream)
typedef struct ISeqInStream ISeqInStream;
struct ISeqInStream
{
SRes (*Read)(ISeqInStreamPtr p, void *buf, size_t *size);
SRes (*Read)(const ISeqInStream *p, void *buf, size_t *size);
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream.
(output(*size) < input(*size)) is allowed */
};
#define ISeqInStream_Read(p, buf, size) (p)->Read(p, buf, size)
/* try to read as much as avail in stream and limited by (*processedSize) */
SRes SeqInStream_ReadMax(ISeqInStreamPtr stream, void *buf, size_t *processedSize);
/* it can return SZ_ERROR_INPUT_EOF */
// SRes SeqInStream_Read(ISeqInStreamPtr stream, void *buf, size_t size);
// SRes SeqInStream_Read2(ISeqInStreamPtr stream, void *buf, size_t size, SRes errorType);
SRes SeqInStream_ReadByte(ISeqInStreamPtr stream, Byte *buf);
SRes SeqInStream_Read(const ISeqInStream *stream, void *buf, size_t size);
SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes errorType);
SRes SeqInStream_ReadByte(const ISeqInStream *stream, Byte *buf);
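
The Read contract above is the key subtlety: a call may return fewer bytes than requested, and SZ_OK with output *size == 0 signals end of stream. A minimal in-memory implementation (illustrative, 21.07-side signature; needs <string.h> for memcpy):

    typedef struct
    {
      ISeqInStream vt;
      const Byte *data;
      size_t rem;
    } CMemInStream;

    static SRes MemInStream_Read(const ISeqInStream *pp, void *buf, size_t *size)
    {
      CMemInStream *p = CONTAINER_FROM_VTBL(pp, CMemInStream, vt);
      size_t n = *size;
      if (n > p->rem)
        n = p->rem;
      memcpy(buf, p->data, n);
      p->data += n;
      p->rem -= n;
      *size = n;            /* 0 here means end_of_stream */
      return SZ_OK;
    }
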
Z7_C_IFACE_DECL (ISeqOutStream)
typedef struct ISeqOutStream ISeqOutStream;
struct ISeqOutStream
{
size_t (*Write)(ISeqOutStreamPtr p, const void *buf, size_t size);
size_t (*Write)(const ISeqOutStream *p, const void *buf, size_t size);
/* Returns: result - the number of actually written bytes.
(result < size) means error */
};
@ -369,26 +348,29 @@ typedef enum
} ESzSeek;
Z7_C_IFACE_DECL (ISeekInStream)
typedef struct ISeekInStream ISeekInStream;
struct ISeekInStream
{
SRes (*Read)(ISeekInStreamPtr p, void *buf, size_t *size); /* same as ISeqInStream::Read */
SRes (*Seek)(ISeekInStreamPtr p, Int64 *pos, ESzSeek origin);
SRes (*Read)(const ISeekInStream *p, void *buf, size_t *size); /* same as ISeqInStream::Read */
SRes (*Seek)(const ISeekInStream *p, Int64 *pos, ESzSeek origin);
};
#define ISeekInStream_Read(p, buf, size) (p)->Read(p, buf, size)
#define ISeekInStream_Seek(p, pos, origin) (p)->Seek(p, pos, origin)
Z7_C_IFACE_DECL (ILookInStream)
typedef struct ILookInStream ILookInStream;
struct ILookInStream
{
SRes (*Look)(ILookInStreamPtr p, const void **buf, size_t *size);
SRes (*Look)(const ILookInStream *p, const void **buf, size_t *size);
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream.
(output(*size) > input(*size)) is not allowed
(output(*size) < input(*size)) is allowed */
SRes (*Skip)(ILookInStreamPtr p, size_t offset);
SRes (*Skip)(const ILookInStream *p, size_t offset);
/* offset must be <= output(*size) of Look */
SRes (*Read)(ILookInStreamPtr p, void *buf, size_t *size);
SRes (*Read)(const ILookInStream *p, void *buf, size_t *size);
/* reads directly (without buffer). It's same as ISeqInStream::Read */
SRes (*Seek)(ILookInStreamPtr p, Int64 *pos, ESzSeek origin);
SRes (*Seek)(const ILookInStream *p, Int64 *pos, ESzSeek origin);
};
#define ILookInStream_Look(p, buf, size) (p)->Look(p, buf, size)
@ -397,18 +379,19 @@ Z7_C_IFACE_DECL (ILookInStream)
#define ILookInStream_Seek(p, pos, origin) (p)->Seek(p, pos, origin)
SRes LookInStream_LookRead(ILookInStreamPtr stream, void *buf, size_t *size);
SRes LookInStream_SeekTo(ILookInStreamPtr stream, UInt64 offset);
SRes LookInStream_LookRead(const ILookInStream *stream, void *buf, size_t *size);
SRes LookInStream_SeekTo(const ILookInStream *stream, UInt64 offset);
/* reads via ILookInStream::Read */
SRes LookInStream_Read2(ILookInStreamPtr stream, void *buf, size_t size, SRes errorType);
SRes LookInStream_Read(ILookInStreamPtr stream, void *buf, size_t size);
SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRes errorType);
SRes LookInStream_Read(const ILookInStream *stream, void *buf, size_t size);
typedef struct
{
ILookInStream vt;
ISeekInStreamPtr realStream;
const ISeekInStream *realStream;
size_t pos;
size_t size; /* it's data size */
@ -420,13 +403,13 @@ typedef struct
void LookToRead2_CreateVTable(CLookToRead2 *p, int lookahead);
#define LookToRead2_INIT(p) { (p)->pos = (p)->size = 0; }
#define LookToRead2_Init(p) { (p)->pos = (p)->size = 0; }
typedef struct
{
ISeqInStream vt;
ILookInStreamPtr realStream;
const ILookInStream *realStream;
} CSecToLook;
void SecToLook_CreateVTable(CSecToLook *p);
@ -436,19 +419,20 @@ void SecToLook_CreateVTable(CSecToLook *p);
typedef struct
{
ISeqInStream vt;
ILookInStreamPtr realStream;
const ILookInStream *realStream;
} CSecToRead;
void SecToRead_CreateVTable(CSecToRead *p);
Z7_C_IFACE_DECL (ICompressProgress)
typedef struct ICompressProgress ICompressProgress;
struct ICompressProgress
{
SRes (*Progress)(ICompressProgressPtr p, UInt64 inSize, UInt64 outSize);
SRes (*Progress)(const ICompressProgress *p, UInt64 inSize, UInt64 outSize);
/* Returns: result. (result != SZ_OK) means break.
Value (UInt64)(Int64)-1 for size means unknown value. */
};
#define ICompressProgress_Progress(p, inSize, outSize) (p)->Progress(p, inSize, outSize)
@ -486,13 +470,13 @@ struct ISzAlloc
#ifndef Z7_container_of
#ifndef MY_container_of
/*
#define Z7_container_of(ptr, type, m) container_of(ptr, type, m)
#define Z7_container_of(ptr, type, m) CONTAINING_RECORD(ptr, type, m)
#define Z7_container_of(ptr, type, m) ((type *)((char *)(ptr) - offsetof(type, m)))
#define Z7_container_of(ptr, type, m) (&((type *)0)->m == (ptr), ((type *)(((char *)(ptr)) - MY_offsetof(type, m))))
#define MY_container_of(ptr, type, m) container_of(ptr, type, m)
#define MY_container_of(ptr, type, m) CONTAINING_RECORD(ptr, type, m)
#define MY_container_of(ptr, type, m) ((type *)((char *)(ptr) - offsetof(type, m)))
#define MY_container_of(ptr, type, m) (&((type *)0)->m == (ptr), ((type *)(((char *)(ptr)) - MY_offsetof(type, m))))
*/
/*
@ -501,64 +485,24 @@ struct ISzAlloc
GCC 4.8.1 : classes with non-public variable members"
*/
#define Z7_container_of(ptr, type, m) \
((type *)(void *)((char *)(void *) \
(1 ? (ptr) : &((type *)NULL)->m) - MY_offsetof(type, m)))
#define MY_container_of(ptr, type, m) ((type *)(void *)((char *)(void *)(1 ? (ptr) : &((type *)0)->m) - MY_offsetof(type, m)))
#define Z7_container_of_CONST(ptr, type, m) \
((const type *)(const void *)((const char *)(const void *) \
(1 ? (ptr) : &((type *)NULL)->m) - MY_offsetof(type, m)))
#endif
#define CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) ((type *)(void *)(ptr))
/*
#define Z7_container_of_NON_CONST_FROM_CONST(ptr, type, m) \
((type *)(void *)(const void *)((const char *)(const void *) \
(1 ? (ptr) : &((type *)NULL)->m) - MY_offsetof(type, m)))
#define CONTAINER_FROM_VTBL(ptr, type, m) CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m)
*/
#define CONTAINER_FROM_VTBL(ptr, type, m) MY_container_of(ptr, type, m)
#endif
#define Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) ((type *)(void *)(ptr))
// #define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m)
#define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_container_of(ptr, type, m)
// #define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_container_of_NON_CONST_FROM_CONST(ptr, type, m)
#define Z7_CONTAINER_FROM_VTBL_CONST(ptr, type, m) Z7_container_of_CONST(ptr, type, m)
#define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m)
#define CONTAINER_FROM_VTBL_CLS(ptr, type, m) CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m)
/*
#define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL(ptr, type, m)
#define CONTAINER_FROM_VTBL_CLS(ptr, type, m) CONTAINER_FROM_VTBL(ptr, type, m)
*/
#if defined (__clang__) || defined(__GNUC__)
#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wcast-qual\"")
#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL \
_Pragma("GCC diagnostic pop")
#else
#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL
#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL
#endif
#define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(ptr, type, m, p) \
Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \
type *p = Z7_CONTAINER_FROM_VTBL(ptr, type, m); \
Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL
#define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(type) \
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(pp, type, vt, p)
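
The expression 1 ? (ptr) : &((type *)NULL)->m inside Z7_container_of never evaluates its second operand; it only forces the compiler to check that ptr is pointer-compatible with the named member before the offset is subtracted. Expanded for the decoder's IByteIn case, the whole macro is plain pointer arithmetic (a rough sketch, ignoring the const-qual pragmas):

    /* Z7_CONTAINER_FROM_VTBL(pp, CByteInToLook, vt) is roughly: */
    (CByteInToLook *)(void *)((char *)(void *)pp - MY_offsetof(CByteInToLook, vt))
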
// #define ZIP7_DECLARE_HANDLE(name) typedef void *name;
#define Z7_DECLARE_HANDLE(name) struct name##_dummy{int unused;}; typedef struct name##_dummy *name;
#define Z7_memset_0_ARRAY(a) memset((a), 0, sizeof(a))
#ifndef Z7_ARRAY_SIZE
#define Z7_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#endif
#define MY_memset_0_ARRAY(a) memset((a), 0, sizeof(a))
#ifdef _WIN32
@ -576,22 +520,6 @@ struct ISzAlloc
#endif
#define k_PropVar_TimePrec_0 0
#define k_PropVar_TimePrec_Unix 1
#define k_PropVar_TimePrec_DOS 2
#define k_PropVar_TimePrec_HighPrec 3
#define k_PropVar_TimePrec_Base 16
#define k_PropVar_TimePrec_100ns (k_PropVar_TimePrec_Base + 7)
#define k_PropVar_TimePrec_1ns (k_PropVar_TimePrec_Base + 9)
EXTERN_C_END
#endif
/*
#ifndef Z7_ST
#ifdef _7ZIP_ST
#define Z7_ST
#endif
#endif
*/


@ -1,7 +1,7 @@
#define MY_VER_MAJOR 25
#define MY_VER_MINOR 1
#define MY_VER_MAJOR 21
#define MY_VER_MINOR 07
#define MY_VER_BUILD 0
#define MY_VERSION_NUMBERS "25.01"
#define MY_VERSION_NUMBERS "21.07"
#define MY_VERSION MY_VERSION_NUMBERS
#ifdef MY_CPU_NAME
@ -10,12 +10,12 @@
#define MY_VERSION_CPU MY_VERSION
#endif
#define MY_DATE "2025-08-03"
#define MY_DATE "2021-12-26"
#undef MY_COPYRIGHT
#undef MY_VERSION_COPYRIGHT_DATE
#define MY_AUTHOR_NAME "Igor Pavlov"
#define MY_COPYRIGHT_PD "Igor Pavlov : Public domain"
#define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov"
#define MY_COPYRIGHT_CR "Copyright (c) 1999-2021 Igor Pavlov"
#ifdef USE_COPYRIGHT_CR
#define MY_COPYRIGHT MY_COPYRIGHT_CR


@ -1,101 +0,0 @@
/* 7zWindows.h -- StdAfx
2023-04-02 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_7Z_WINDOWS_H
#define ZIP7_INC_7Z_WINDOWS_H
#ifdef _WIN32
#if defined(__clang__)
# pragma clang diagnostic push
#endif
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable : 4668) // '_WIN32_WINNT' is not defined as a preprocessor macro, replacing with '0' for '#if/#elif'
#if _MSC_VER == 1900
// for old kit10 versions
// #pragma warning(disable : 4255) // winuser.h(13979): warning C4255: 'GetThreadDpiAwarenessContext':
#endif
// win10 Windows Kit:
#endif // _MSC_VER
#if defined(_MSC_VER) && _MSC_VER <= 1200 && !defined(_WIN64)
// for msvc6 without sdk2003
#define RPC_NO_WINDOWS_H
#endif
#if defined(__MINGW32__) || defined(__MINGW64__)
// #if defined(__GNUC__) && !defined(__clang__)
#include <windows.h>
#else
#include <Windows.h>
#endif
// #include <basetsd.h>
// #include <wtypes.h>
// but if precompiled with clang-cl then we need
// #include <windows.h>
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
#if defined(_MSC_VER) && _MSC_VER <= 1200 && !defined(_WIN64)
#ifndef _W64
typedef long LONG_PTR, *PLONG_PTR;
typedef unsigned long ULONG_PTR, *PULONG_PTR;
typedef ULONG_PTR DWORD_PTR, *PDWORD_PTR;
#define Z7_OLD_WIN_SDK
#endif // _W64
#endif // _MSC_VER == 1200
#ifdef Z7_OLD_WIN_SDK
#ifndef INVALID_FILE_ATTRIBUTES
#define INVALID_FILE_ATTRIBUTES ((DWORD)-1)
#endif
#ifndef INVALID_SET_FILE_POINTER
#define INVALID_SET_FILE_POINTER ((DWORD)-1)
#endif
#ifndef FILE_SPECIAL_ACCESS
#define FILE_SPECIAL_ACCESS (FILE_ANY_ACCESS)
#endif
// ShlObj.h:
// #define BIF_NEWDIALOGSTYLE 0x0040
#pragma warning(disable : 4201)
// #pragma warning(disable : 4115)
#undef VARIANT_TRUE
#define VARIANT_TRUE ((VARIANT_BOOL)-1)
#endif
#endif // Z7_OLD_WIN_SDK
#ifdef UNDER_CE
#undef VARIANT_TRUE
#define VARIANT_TRUE ((VARIANT_BOOL)-1)
#endif
#if defined(_MSC_VER)
#if _MSC_VER >= 1400 && _MSC_VER <= 1600
// BaseTsd.h(148) : 'HandleToULong' : unreferenced inline function has been removed
// string.h
// #pragma warning(disable : 4514)
#endif
#endif
/* #include "7zTypes.h" */
#endif


@ -4,57 +4,24 @@ MY_ARCH_2 = $(MY_ARCH)
MY_ASM = jwasm
MY_ASM = asmc
ifndef RC
#RC=windres.exe --target=pe-x86-64
#RC=windres.exe -F pe-i386
RC=windres.exe
endif
PROGPATH = $(O)/$(PROG)
PROGPATH_STATIC = $(O)/$(PROG)s
ifneq ($(CC), xlc)
CFLAGS_WARN_WALL = -Wall -Werror -Wextra
endif
# for object file
CFLAGS_BASE_LIST = -c
# for ASM file
# CFLAGS_BASE_LIST = -S
FLAGS_FLTO = -flto
FLAGS_FLTO =
CFLAGS_BASE = $(MY_ARCH_2) -O2 $(CFLAGS_BASE_LIST) $(CFLAGS_WARN_WALL) $(CFLAGS_WARN) \
CFLAGS_BASE = $(MY_ARCH_2) -O2 $(CFLAGS_BASE_LIST) -Wall -Werror -Wextra $(CFLAGS_WARN) \
-DNDEBUG -D_REENTRANT -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE
LDFLAGS_STATIC = -DNDEBUG
# -static
ifdef SystemDrive
IS_MINGW = 1
else
ifdef SYSTEMDRIVE
# ifdef OS
IS_MINGW = 1
endif
endif
ifdef IS_MINGW
LDFLAGS_STATIC_2 = -static
else
ifndef DEF_FILE
ifndef IS_NOT_STANDALONE
ifndef MY_DYNAMIC_LINK
ifneq ($(CC), clang)
LDFLAGS_STATIC_2 =
# -static
# -static-libstdc++ -static-libgcc
endif
endif
endif
endif
endif
LDFLAGS_STATIC = -DNDEBUG $(LDFLAGS_STATIC_2)
ifdef DEF_FILE
@ -95,29 +62,22 @@ endif
ifdef IS_MINGW
ifdef MSYSTEM
RM = rm -f
MY_MKDIR=mkdir -p
DEL_OBJ_EXE = -$(RM) $(PROGPATH) $(PROGPATH_STATIC) $(OBJS)
else
RM = del
MY_MKDIR=mkdir
DEL_OBJ_EXE = -$(RM) $(O)\*.o $(O)\$(PROG).exe $(O)\$(PROG).dll
endif
LIB2 = -loleaut32 -luuid -ladvapi32 -lUser32
LIB2 = -lOle32 -loleaut32 -luuid -ladvapi32 -lUser32 -lShell32
CFLAGS_EXTRA = -DUNICODE -D_UNICODE
CXXFLAGS_EXTRA = -DUNICODE -D_UNICODE
# -Wno-delete-non-virtual-dtor
DEL_OBJ_EXE = -$(RM) $(O)\*.o $(O)\$(PROG).exe $(O)\$(PROG).dll
else
RM = rm -f
MY_MKDIR=mkdir -p
# CFLAGS_BASE := $(CFLAGS_BASE) -DZ7_ST
# CFLAGS_EXTRA = -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE
# CFLAGS_BASE := $(CFLAGS_BASE) -D_7ZIP_ST
# CXXFLAGS_EXTRA = -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE
# LOCAL_LIBS=-lpthread
# LOCAL_LIBS_DLL=$(LOCAL_LIBS) -ldl
@ -128,6 +88,10 @@ DEL_OBJ_EXE = -$(RM) $(PROGPATH) $(PROGPATH_STATIC) $(OBJS)
endif
CFLAGS = $(LOCAL_FLAGS) $(CFLAGS_BASE2) $(CFLAGS_BASE) $(CC_SHARED) -o $@
ifdef IS_X64
AFLAGS_ABI = -elf64 -DABI_LINUX
else
@ -138,9 +102,12 @@ AFLAGS_ABI = -elf -DABI_LINUX -DABI_CDECL
endif
AFLAGS = $(AFLAGS_ABI) -Fo$(O)/
C_WARN_FLAGS =
CFLAGS = $(LOCAL_FLAGS) $(CFLAGS_BASE2) $(CFLAGS_BASE) $(CFLAGS_EXTRA) $(C_WARN_FLAGS) $(FLAGS_FLTO) $(CC_SHARED) -o $@
CXX_WARN_FLAGS =
#-Wno-invalid-offsetof
#-Wno-reorder
CXXFLAGS = $(LOCAL_FLAGS) $(CXXFLAGS_BASE2) $(CFLAGS_BASE) $(CXXFLAGS_EXTRA) $(CC_SHARED) -o $@ $(CXX_WARN_FLAGS)
STATIC_TARGET=
ifdef COMPL_STATIC
@ -153,27 +120,18 @@ all: $(O) $(PROGPATH) $(STATIC_TARGET)
$(O):
$(MY_MKDIR) $(O)
ifneq ($(CC), $(CROSS_COMPILE)clang)
LFLAGS_STRIP = -s
endif
LFLAGS_ALL = $(LFLAGS_STRIP) $(MY_ARCH_2) $(LDFLAGS) $(FLAGS_FLTO) $(LD_arch) $(OBJS) $(MY_LIBS) $(LIB2)
LFLAGS_ALL = -s $(MY_ARCH_2) $(LDFLAGS) $(LD_arch) $(OBJS) $(MY_LIBS) $(LIB2)
$(PROGPATH): $(OBJS)
$(CC) -o $(PROGPATH) $(LFLAGS_ALL)
$(CXX) -o $(PROGPATH) $(LFLAGS_ALL)
$(PROGPATH_STATIC): $(OBJS)
$(CC) -static -o $(PROGPATH_STATIC) $(LFLAGS_ALL)
$(CXX) -static -o $(PROGPATH_STATIC) $(LFLAGS_ALL)
ifndef NO_DEFAULT_RES
# old mingw without -FO
# windres.exe $(RFLAGS) resource.rc $O/resource.o
$O/resource.o: resource.rc
$(RC) $(RFLAGS) resource.rc $(O)/resource.o
windres.exe $(RFLAGS) resource.rc $O/resource.o
endif
# windres.exe $(RFLAGS) resource.rc $(O)\resource.o
# windres.exe $(RFLAGS) resource.rc -FO $(O)/resource.o
# $(RC) $(RFLAGS) resource.rc -FO $(O)/resource.o
@ -271,18 +229,10 @@ $O/Sha256.o: ../../../C/Sha256.c
$(CC) $(CFLAGS) $<
$O/Sort.o: ../../../C/Sort.c
$(CC) $(CFLAGS) $<
$O/SwapBytes.o: ../../../C/SwapBytes.c
$(CC) $(CFLAGS) $<
$O/Xz.o: ../../../C/Xz.c
$(CC) $(CFLAGS) $<
$O/XzCrc64.o: ../../../C/XzCrc64.c
$(CC) $(CFLAGS) $<
$O/XzDec.o: ../../../C/XzDec.c
$(CC) $(CFLAGS) $<
$O/XzEnc.o: ../../../C/XzEnc.c
$(CC) $(CFLAGS) $<
$O/XzIn.o: ../../../C/XzIn.c
$(CC) $(CFLAGS) $<
ifdef USE_ASM
@ -329,11 +279,11 @@ endif
ifdef IS_ARM64
$O/LzmaDecOpt.o: ../../../Asm/arm64/LzmaDecOpt.S ../../../Asm/arm64/7zAsm.S
$(CC) $(CFLAGS) $(ASM_FLAGS) $<
$(CC) $(CFLAGS) $<
endif
$O/LzmaDec.o: ../../LzmaDec.c
$(CC) $(CFLAGS) -DZ7_LZMA_DEC_OPT $<
$(CC) $(CFLAGS) -D_LZMA_DEC_OPT $<
else
@ -344,16 +294,19 @@ endif
$O/XzDec.o: ../../../C/XzDec.c
$(CC) $(CFLAGS) $<
$O/XzEnc.o: ../../../C/XzEnc.c
$(CC) $(CFLAGS) $<
$O/XzIn.o: ../../../C/XzIn.c
$(CC) $(CFLAGS) $<
$O/7zMain.o: ../../../C/Util/7z/7zMain.c
$(CC) $(CFLAGS) $<
$O/7zipInstall.o: ../../../C/Util/7zipInstall/7zipInstall.c
$(CC) $(CFLAGS) $<
$O/7zipUninstall.o: ../../../C/Util/7zipUninstall/7zipUninstall.c
$(CC) $(CFLAGS) $<
$O/LzmaUtil.o: ../../../C/Util/Lzma/LzmaUtil.c
$(CC) $(CFLAGS) $<
$O/XzUtil.o: ../../../C/Util/Xz/XzUtil.c
$(CC) $(CFLAGS) $<
clean:
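
This shared gcc makefile is normally driven by per-program wrappers that set PROG, O, OBJS, MY_ARCH and the feature switches seen above (IS_X64, USE_ASM, COMPL_STATIC). An illustrative direct invocation, with the file name and variable values as assumptions rather than anything this diff shows:

    make -f makefile.gcc MY_ARCH=-m64 IS_X64=1 USE_ASM=1
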

C/Aes.c

@ -1,5 +1,5 @@
/* Aes.c -- AES encryption / decryption
2024-03-01 : Igor Pavlov : Public domain */
2021-05-13 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -7,15 +7,13 @@
#include "Aes.h"
AES_CODE_FUNC g_AesCbc_Decode;
#ifndef Z7_SFX
#ifndef _SFX
AES_CODE_FUNC g_AesCbc_Encode;
AES_CODE_FUNC g_AesCtr_Code;
UInt32 g_Aes_SupportedFunctions_Flags;
#endif
MY_ALIGN(64)
static UInt32 T[256 * 4];
MY_ALIGN(64)
static const Byte Sbox[256] = {
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
@ -35,9 +33,7 @@ static const Byte Sbox[256] = {
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
MY_ALIGN(64)
static UInt32 D[256 * 4];
MY_ALIGN(64)
static Byte InvS[256];
#define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF)
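
xtime is multiplication by x (that is, by 2) in AES's GF(2^8) with reduction polynomial x^8 + x^4 + x^3 + x + 1: shift left, then XOR in 0x1B if a bit was shifted out of the byte. Worked values, matching the FIPS-197 examples:

    xtime(0x57) == 0xAE                          /* high bit clear: plain shift */
    xtime(0xAE) == ((0xAE << 1) ^ 0x1B) & 0xFF   /* high bit set: == 0x47 */
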
@ -55,62 +51,32 @@ static Byte InvS[256];
#define DD(x) (D + (x << 8))
// #define Z7_SHOW_AES_STATUS
// #define _SHOW_AES_STATUS
#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1110)
#define USE_HW_AES
#if (__INTEL_COMPILER >= 1900)
#define USE_HW_VAES
#endif
#endif
#elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400)
#define USE_HW_AES
#if defined(__clang__) && (__clang_major__ >= 8) \
|| defined(__GNUC__) && (__GNUC__ >= 8)
#define USE_HW_VAES
#endif
#elif defined(_MSC_VER)
#define USE_HW_AES
#define USE_HW_VAES
#endif
#define USE_HW_AES
#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
#if defined(__ARM_FEATURE_AES) \
|| defined(__ARM_FEATURE_CRYPTO)
#define USE_HW_AES
#else
#if defined(MY_CPU_ARM64) \
|| defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
|| defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(__GNUC__) && (__GNUC__ >= 6) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
|| !defined(Z7_CLANG_VERSION) \
|| defined(__ARM_NEON) && \
(Z7_CLANG_VERSION < 170000 || \
Z7_CLANG_VERSION > 170001)
#if defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define USE_HW_AES
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define USE_HW_AES
#endif
#elif defined(_MSC_VER)
#if _MSC_VER >= 1910
#define USE_HW_AES
#endif
#endif
#endif
#ifdef USE_HW_AES
// #pragma message("=== Aes.c USE_HW_AES === ")
#ifdef Z7_SHOW_AES_STATUS
#ifdef _SHOW_AES_STATUS
#include <stdio.h>
#define PRF(x) x
#define _PRF(x) x
#else
#define PRF(x)
#define _PRF(x)
#endif
#endif
@ -124,23 +90,23 @@ void AesGenTables(void)
for (i = 0; i < 256; i++)
{
{
const UInt32 a1 = Sbox[i];
const UInt32 a2 = xtime(a1);
const UInt32 a3 = a2 ^ a1;
UInt32 a1 = Sbox[i];
UInt32 a2 = xtime(a1);
UInt32 a3 = a2 ^ a1;
TT(0)[i] = Ui32(a2, a1, a1, a3);
TT(1)[i] = Ui32(a3, a2, a1, a1);
TT(2)[i] = Ui32(a1, a3, a2, a1);
TT(3)[i] = Ui32(a1, a1, a3, a2);
}
{
const UInt32 a1 = InvS[i];
const UInt32 a2 = xtime(a1);
const UInt32 a4 = xtime(a2);
const UInt32 a8 = xtime(a4);
const UInt32 a9 = a8 ^ a1;
const UInt32 aB = a8 ^ a2 ^ a1;
const UInt32 aD = a8 ^ a4 ^ a1;
const UInt32 aE = a8 ^ a4 ^ a2;
UInt32 a1 = InvS[i];
UInt32 a2 = xtime(a1);
UInt32 a4 = xtime(a2);
UInt32 a8 = xtime(a4);
UInt32 a9 = a8 ^ a1;
UInt32 aB = a8 ^ a2 ^ a1;
UInt32 aD = a8 ^ a4 ^ a1;
UInt32 aE = a8 ^ a4 ^ a2;
DD(0)[i] = Ui32(aE, a9, aD, aB);
DD(1)[i] = Ui32(aB, aE, a9, aD);
DD(2)[i] = Ui32(aD, aB, aE, a9);
@ -150,7 +116,7 @@ void AesGenTables(void)
{
AES_CODE_FUNC d = AesCbc_Decode;
#ifndef Z7_SFX
#ifndef _SFX
AES_CODE_FUNC e = AesCbc_Encode;
AES_CODE_FUNC c = AesCtr_Code;
UInt32 flags = 0;
@ -160,33 +126,31 @@ void AesGenTables(void)
if (CPU_IsSupported_AES())
{
// #pragma message ("AES HW")
PRF(printf("\n===AES HW\n"));
_PRF(printf("\n===AES HW\n"));
d = AesCbc_Decode_HW;
#ifndef Z7_SFX
#ifndef _SFX
e = AesCbc_Encode_HW;
c = AesCtr_Code_HW;
flags = k_Aes_SupportedFunctions_HW;
#endif
#ifdef MY_CPU_X86_OR_AMD64
#ifdef USE_HW_VAES
if (CPU_IsSupported_VAES_AVX2())
{
PRF(printf("\n===vaes avx2\n"));
_PRF(printf("\n===vaes avx2\n"));
d = AesCbc_Decode_HW_256;
#ifndef Z7_SFX
#ifndef _SFX
c = AesCtr_Code_HW_256;
flags |= k_Aes_SupportedFunctions_HW_256;
#endif
}
#endif
#endif
}
#endif
g_AesCbc_Decode = d;
#ifndef Z7_SFX
#ifndef _SFX
g_AesCbc_Encode = e;
g_AesCtr_Code = c;
g_Aes_SupportedFunctions_Flags = flags;
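AesGenTables() both fills the T[]/D[] lookup tables and selects the software or hardware code path through the g_* function pointers. A minimal caller-side sketch of that dispatch, assuming the Aes.h declarations shown later in this diff (buffer and key handling here is hypothetical; data must be 16-byte aligned and a whole number of AES_BLOCK_SIZE blocks, and real code calls AesGenTables() once at startup):

#include "Aes.h"

static void cbc_decrypt(Byte *data, size_t numBlocks,
    const Byte *key, unsigned keySize, const Byte *iv)
{
  MY_ALIGN(16) UInt32 ivAes[AES_NUM_IVMRK_WORDS];
  AesGenTables();                          /* builds tables, picks SW/HW code */
  Aes_SetKey_Dec(ivAes + 4, key, keySize); /* round keys follow the 4 iv words */
  AesCbc_Init(ivAes, iv);
  g_AesCbc_Decode(ivAes, data, numBlocks); /* runs whichever path was selected */
}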
@ -230,7 +194,7 @@ void AesGenTables(void)
#define FD(i, x) InvS[gb(x, m[(i - x) & 3])]
#define FD4(i) dest[i] = Ui32(FD(i, 0), FD(i, 1), FD(i, 2), FD(i, 3)) ^ w[i];
void Z7_FASTCALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize)
void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize)
{
unsigned i, m;
const UInt32 *wLim;
@ -266,7 +230,7 @@ void Z7_FASTCALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize)
while (++w != wLim);
}
void Z7_FASTCALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize)
void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize)
{
unsigned i, num;
Aes_SetKey_Enc(w, key, keySize);
@ -287,7 +251,7 @@ void Z7_FASTCALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize)
src and dest are pointers to 4 UInt32 words.
src and dest can point to same block */
// Z7_FORCE_INLINE
// MY_FORCE_INLINE
static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src)
{
UInt32 s[4];
@ -301,20 +265,17 @@ static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src)
w += 4;
for (;;)
{
HT16(m, s, 0)
HT16(m, s, 0);
if (--numRounds2 == 0)
break;
HT16(s, m, 4)
HT16(s, m, 4);
w += 8;
}
w += 4;
FT4(0)
FT4(1)
FT4(2)
FT4(3)
FT4(0); FT4(1); FT4(2); FT4(3);
}
Z7_FORCE_INLINE
MY_FORCE_INLINE
static void Aes_Decode(const UInt32 *w, UInt32 *dest, const UInt32 *src)
{
UInt32 s[4];
@ -328,15 +289,12 @@ static void Aes_Decode(const UInt32 *w, UInt32 *dest, const UInt32 *src)
for (;;)
{
w -= 8;
HD16(m, s, 4)
HD16(m, s, 4);
if (--numRounds2 == 0)
break;
HD16(s, m, 0)
HD16(s, m, 0);
}
FD4(0)
FD4(1)
FD4(2)
FD4(3)
FD4(0); FD4(1); FD4(2); FD4(3);
}
void AesCbc_Init(UInt32 *p, const Byte *iv)
@ -346,7 +304,7 @@ void AesCbc_Init(UInt32 *p, const Byte *iv)
p[i] = GetUi32(iv + i * 4);
}
void Z7_FASTCALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks)
void MY_FAST_CALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks)
{
for (; numBlocks != 0; numBlocks--, data += AES_BLOCK_SIZE)
{
@ -357,14 +315,14 @@ void Z7_FASTCALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks)
Aes_Encode(p + 4, p, p);
SetUi32(data, p[0])
SetUi32(data + 4, p[1])
SetUi32(data + 8, p[2])
SetUi32(data + 12, p[3])
SetUi32(data, p[0]);
SetUi32(data + 4, p[1]);
SetUi32(data + 8, p[2]);
SetUi32(data + 12, p[3]);
}
}
void Z7_FASTCALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks)
void MY_FAST_CALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks)
{
UInt32 in[4], out[4];
for (; numBlocks != 0; numBlocks--, data += AES_BLOCK_SIZE)
@ -376,10 +334,10 @@ void Z7_FASTCALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks)
Aes_Decode(p + 4, out, in);
SetUi32(data, p[0] ^ out[0])
SetUi32(data + 4, p[1] ^ out[1])
SetUi32(data + 8, p[2] ^ out[2])
SetUi32(data + 12, p[3] ^ out[3])
SetUi32(data, p[0] ^ out[0]);
SetUi32(data + 4, p[1] ^ out[1]);
SetUi32(data + 8, p[2] ^ out[2]);
SetUi32(data + 12, p[3] ^ out[3]);
p[0] = in[0];
p[1] = in[1];
@ -388,7 +346,7 @@ void Z7_FASTCALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks)
}
}
void Z7_FASTCALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks)
void MY_FAST_CALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks)
{
for (; numBlocks != 0; numBlocks--)
{
@ -402,7 +360,7 @@ void Z7_FASTCALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks)
for (i = 0; i < 4; i++, data += 4)
{
const UInt32 t = temp[i];
UInt32 t = temp[i];
#ifdef MY_CPU_LE_UNALIGN
*((UInt32 *)(void *)data) ^= t;
@ -415,15 +373,3 @@ void Z7_FASTCALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks)
}
}
}
#undef xtime
#undef Ui32
#undef gb0
#undef gb1
#undef gb2
#undef gb3
#undef gb
#undef TT
#undef DD
#undef USE_HW_AES
#undef PRF

C/Aes.h

@ -1,8 +1,8 @@
/* Aes.h -- AES encryption / decryption
2023-04-02 : Igor Pavlov : Public domain */
2018-04-28 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_AES_H
#define ZIP7_INC_AES_H
#ifndef __AES_H
#define __AES_H
#include "7zTypes.h"
@ -20,19 +20,19 @@ void AesGenTables(void);
/* aes - 16-byte aligned pointer to keyMode+roundKeys sequence */
/* keySize = 16 or 24 or 32 (bytes) */
typedef void (Z7_FASTCALL *AES_SET_KEY_FUNC)(UInt32 *aes, const Byte *key, unsigned keySize);
void Z7_FASTCALL Aes_SetKey_Enc(UInt32 *aes, const Byte *key, unsigned keySize);
void Z7_FASTCALL Aes_SetKey_Dec(UInt32 *aes, const Byte *key, unsigned keySize);
typedef void (MY_FAST_CALL *AES_SET_KEY_FUNC)(UInt32 *aes, const Byte *key, unsigned keySize);
void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *aes, const Byte *key, unsigned keySize);
void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *aes, const Byte *key, unsigned keySize);
/* ivAes - 16-byte aligned pointer to iv+keyMode+roundKeys sequence: UInt32[AES_NUM_IVMRK_WORDS] */
void AesCbc_Init(UInt32 *ivAes, const Byte *iv); /* iv size is AES_BLOCK_SIZE */
/* data - 16-byte aligned pointer to data */
/* numBlocks - the number of 16-byte blocks in data array */
typedef void (Z7_FASTCALL *AES_CODE_FUNC)(UInt32 *ivAes, Byte *data, size_t numBlocks);
typedef void (MY_FAST_CALL *AES_CODE_FUNC)(UInt32 *ivAes, Byte *data, size_t numBlocks);
extern AES_CODE_FUNC g_AesCbc_Decode;
#ifndef Z7_SFX
#ifndef _SFX
extern AES_CODE_FUNC g_AesCbc_Encode;
extern AES_CODE_FUNC g_AesCtr_Code;
#define k_Aes_SupportedFunctions_HW (1 << 2)
@ -41,19 +41,19 @@ extern UInt32 g_Aes_SupportedFunctions_Flags;
#endif
#define Z7_DECLARE_AES_CODE_FUNC(funcName) \
void Z7_FASTCALL funcName(UInt32 *ivAes, Byte *data, size_t numBlocks);
#define DECLARE__AES_CODE_FUNC(funcName) \
void MY_FAST_CALL funcName(UInt32 *ivAes, Byte *data, size_t numBlocks);
Z7_DECLARE_AES_CODE_FUNC (AesCbc_Encode)
Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode)
Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code)
DECLARE__AES_CODE_FUNC (AesCbc_Encode)
DECLARE__AES_CODE_FUNC (AesCbc_Decode)
DECLARE__AES_CODE_FUNC (AesCtr_Code)
Z7_DECLARE_AES_CODE_FUNC (AesCbc_Encode_HW)
Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode_HW)
Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code_HW)
DECLARE__AES_CODE_FUNC (AesCbc_Encode_HW)
DECLARE__AES_CODE_FUNC (AesCbc_Decode_HW)
DECLARE__AES_CODE_FUNC (AesCtr_Code_HW)
Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode_HW_256)
Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code_HW_256)
DECLARE__AES_CODE_FUNC (AesCbc_Decode_HW_256)
DECLARE__AES_CODE_FUNC (AesCtr_Code_HW_256)
EXTERN_C_END

File diff suppressed because it is too large

C/Alloc.c

@ -1,53 +1,38 @@
/* Alloc.c -- Memory allocation functions
2024-02-18 : Igor Pavlov : Public domain */
2021-07-13 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include <stdio.h>
#ifdef _WIN32
#include "7zWindows.h"
#include <Windows.h>
#endif
#include <stdlib.h>
#include "Alloc.h"
#if defined(Z7_LARGE_PAGES) && defined(_WIN32) && \
(!defined(Z7_WIN32_WINNT_MIN) || Z7_WIN32_WINNT_MIN < 0x0502) // < Win2003 (xp-64)
#define Z7_USE_DYN_GetLargePageMinimum
#endif
/* #define _SZ_ALLOC_DEBUG */
// for debug:
#if 0
#if defined(__CHERI__) && defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
// #pragma message("=== Z7_ALLOC_NO_OFFSET_ALLOCATOR === ")
#define Z7_ALLOC_NO_OFFSET_ALLOCATOR
#endif
#endif
/* use _SZ_ALLOC_DEBUG to debug alloc/free operations */
#ifdef _SZ_ALLOC_DEBUG
// #define SZ_ALLOC_DEBUG
/* #define SZ_ALLOC_DEBUG */
/* use SZ_ALLOC_DEBUG to debug alloc/free operations */
#ifdef SZ_ALLOC_DEBUG
#include <string.h>
#include <stdio.h>
static int g_allocCount = 0;
#ifdef _WIN32
static int g_allocCountMid = 0;
static int g_allocCountBig = 0;
#endif
int g_allocCount = 0;
int g_allocCountMid = 0;
int g_allocCountBig = 0;
#define CONVERT_INT_TO_STR(charType, tempSize) \
char temp[tempSize]; unsigned i = 0; \
while (val >= 10) { temp[i++] = (char)('0' + (unsigned)(val % 10)); val /= 10; } \
unsigned char temp[tempSize]; unsigned i = 0; \
while (val >= 10) { temp[i++] = (unsigned char)('0' + (unsigned)(val % 10)); val /= 10; } \
*s++ = (charType)('0' + (unsigned)val); \
while (i != 0) { i--; *s++ = temp[i]; } \
*s = 0;
static void ConvertUInt64ToString(UInt64 val, char *s)
{
CONVERT_INT_TO_STR(char, 24)
CONVERT_INT_TO_STR(char, 24);
}
#define GET_HEX_CHAR(t) ((char)(((t < 10) ? ('0' + t) : ('A' + (t - 10)))))
@ -92,7 +77,7 @@ static void PrintAligned(const char *s, size_t align)
Print(s);
}
static void PrintLn(void)
static void PrintLn()
{
Print("\n");
}
@ -104,10 +89,10 @@ static void PrintHex(UInt64 v, size_t align)
PrintAligned(s, align);
}
static void PrintDec(int v, size_t align)
static void PrintDec(UInt64 v, size_t align)
{
char s[32];
ConvertUInt64ToString((unsigned)v, s);
ConvertUInt64ToString(v, s);
PrintAligned(s, align);
}
@ -117,19 +102,12 @@ static void PrintAddr(void *p)
}
#define PRINT_REALLOC(name, cnt, size, ptr) { \
Print(name " "); \
if (!ptr) PrintDec(cnt++, 10); \
PrintHex(size, 10); \
PrintAddr(ptr); \
PrintLn(); }
#define PRINT_ALLOC(name, cnt, size, ptr) { \
#define PRINT_ALLOC(name, cnt, size, ptr) \
Print(name " "); \
PrintDec(cnt++, 10); \
PrintHex(size, 10); \
PrintAddr(ptr); \
PrintLn(); }
PrintLn();
#define PRINT_FREE(name, cnt, ptr) if (ptr) { \
Print(name " "); \
@ -139,45 +117,26 @@ static void PrintAddr(void *p)
#else
#ifdef _WIN32
#define PRINT_ALLOC(name, cnt, size, ptr)
#endif
#define PRINT_FREE(name, cnt, ptr)
#define Print(s)
#define PrintLn()
#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
#define PrintHex(v, align)
#endif
#define PrintAddr(p)
#endif
/*
by specification:
malloc(0) : returns NULL or a unique pointer value that can later be successfully passed to free()
realloc(NULL, size) : the call is equivalent to malloc(size)
realloc(non_NULL, 0) : the call is equivalent to free(ptr)
in main compilers:
malloc(0) : returns non_NULL
realloc(NULL, 0) : returns non_NULL
realloc(non_NULL, 0) : returns NULL
*/
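A short behavior sketch of the normalized size-0 semantics described above (caller-side illustration only):

#include "Alloc.h"

static void alloc_zero_cases(void)
{
  void *p = MyAlloc(0);     /* NULL: malloc(0) may return non-NULL, MyAlloc never does */
  p = MyRealloc(NULL, 16);  /* behaves like MyAlloc(16) */
  p = MyRealloc(p, 0);      /* behaves like MyFree(p), returns NULL */
  MyFree(p);                /* MyFree(NULL) is allowed, like free(NULL) */
}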
void *MyAlloc(size_t size)
{
if (size == 0)
return NULL;
// PRINT_ALLOC("Alloc ", g_allocCount, size, NULL)
#ifdef SZ_ALLOC_DEBUG
PRINT_ALLOC("Alloc ", g_allocCount, size, NULL);
#ifdef _SZ_ALLOC_DEBUG
{
void *p = malloc(size);
if (p)
{
PRINT_ALLOC("Alloc ", g_allocCount, size, p)
}
// PRINT_ALLOC("Alloc ", g_allocCount, size, p);
return p;
}
#else
@ -187,107 +146,71 @@ void *MyAlloc(size_t size)
void MyFree(void *address)
{
PRINT_FREE("Free ", g_allocCount, address)
PRINT_FREE("Free ", g_allocCount, address);
free(address);
}
void *MyRealloc(void *address, size_t size)
{
if (size == 0)
{
MyFree(address);
return NULL;
}
// PRINT_REALLOC("Realloc ", g_allocCount, size, address)
#ifdef SZ_ALLOC_DEBUG
{
void *p = realloc(address, size);
if (p)
{
PRINT_REALLOC("Realloc ", g_allocCount, size, address)
}
return p;
}
#else
return realloc(address, size);
#endif
}
#ifdef _WIN32
void *MidAlloc(size_t size)
{
if (size == 0)
return NULL;
#ifdef SZ_ALLOC_DEBUG
{
void *p = VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE);
if (p)
{
PRINT_ALLOC("Alloc-Mid", g_allocCountMid, size, p)
}
return p;
}
#else
PRINT_ALLOC("Alloc-Mid", g_allocCountMid, size, NULL);
return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE);
#endif
}
void MidFree(void *address)
{
PRINT_FREE("Free-Mid", g_allocCountMid, address)
PRINT_FREE("Free-Mid", g_allocCountMid, address);
if (!address)
return;
VirtualFree(address, 0, MEM_RELEASE);
}
#ifdef Z7_LARGE_PAGES
#ifdef _7ZIP_LARGE_PAGES
#ifdef MEM_LARGE_PAGES
#define MY_MEM_LARGE_PAGES MEM_LARGE_PAGES
#define MY__MEM_LARGE_PAGES MEM_LARGE_PAGES
#else
#define MY_MEM_LARGE_PAGES 0x20000000
#define MY__MEM_LARGE_PAGES 0x20000000
#endif
extern
SIZE_T g_LargePageSize;
SIZE_T g_LargePageSize = 0;
typedef SIZE_T (WINAPI *Func_GetLargePageMinimum)(VOID);
typedef SIZE_T (WINAPI *GetLargePageMinimumP)(VOID);
void SetLargePageSize(void)
#endif // _7ZIP_LARGE_PAGES
void SetLargePageSize()
{
#ifdef _7ZIP_LARGE_PAGES
SIZE_T size;
#ifdef Z7_USE_DYN_GetLargePageMinimum
Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
const
Func_GetLargePageMinimum fn =
(Func_GetLargePageMinimum) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
"GetLargePageMinimum");
if (!fn)
GetLargePageMinimumP largePageMinimum = (GetLargePageMinimumP)
GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "GetLargePageMinimum");
if (!largePageMinimum)
return;
size = fn();
#else
size = GetLargePageMinimum();
#endif
size = largePageMinimum();
if (size == 0 || (size & (size - 1)) != 0)
return;
g_LargePageSize = size;
#endif
}
#endif // Z7_LARGE_PAGES
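The dynamic-lookup branch above exists because GetLargePageMinimum() is missing before Windows Server 2003 / XP-64, so importing it directly would make the binary fail to load there. The probe pattern in isolation, as a hedged sketch:

#ifdef _WIN32
#include <windows.h>

typedef SIZE_T (WINAPI *Func_GetLargePageMinimum)(VOID);

/* returns 0 when the API (and therefore large-page support) is absent */
static SIZE_T query_large_page_size(void)
{
  const Func_GetLargePageMinimum fn = (Func_GetLargePageMinimum)
      GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "GetLargePageMinimum");
  return fn ? fn() : 0;
}
#endif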
void *BigAlloc(size_t size)
{
if (size == 0)
return NULL;
PRINT_ALLOC("Alloc-Big", g_allocCountBig, size, NULL)
#ifdef Z7_LARGE_PAGES
PRINT_ALLOC("Alloc-Big", g_allocCountBig, size, NULL);
#ifdef _7ZIP_LARGE_PAGES
{
SIZE_T ps = g_LargePageSize;
if (ps != 0 && ps <= (1 << 30) && size > (ps / 2))
@ -297,43 +220,56 @@ void *BigAlloc(size_t size)
size2 = (size + ps) & ~ps;
if (size2 >= size)
{
void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY_MEM_LARGE_PAGES, PAGE_READWRITE);
if (p)
{
PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p)
return p;
}
void *res = VirtualAlloc(NULL, size2, MEM_COMMIT | MY__MEM_LARGE_PAGES, PAGE_READWRITE);
if (res)
return res;
}
}
}
#endif
return MidAlloc(size);
return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE);
}
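The large-page path rounds the request up to a whole number of large pages before trying MEM_LARGE_PAGES (the decrement that turns ps into a bit mask sits in lines elided from this hunk). The rounding arithmetic on its own:

/* round size up to a multiple of ps; ps must be a power of two,
   which SetLargePageSize() guarantees for g_LargePageSize */
static size_t round_up_to_page(size_t size, size_t ps)
{
  ps--;                      /* 0x200000 -> 0x1FFFFF bit mask */
  return (size + ps) & ~ps;  /* e.g. 3 MiB with 2 MiB pages -> 4 MiB */
}

The (size2 >= size) test in the function above then rejects the case where this addition wrapped around.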
void BigFree(void *address)
{
PRINT_FREE("Free-Big", g_allocCountBig, address)
MidFree(address);
PRINT_FREE("Free-Big", g_allocCountBig, address);
if (!address)
return;
VirtualFree(address, 0, MEM_RELEASE);
}
#endif // _WIN32
#endif
static void *SzAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return MyAlloc(size); }
static void SzFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) MyFree(address); }
static void *SzAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MyAlloc(size); }
static void SzFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MyFree(address); }
const ISzAlloc g_Alloc = { SzAlloc, SzFree };
#ifdef _WIN32
static void *SzMidAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return MidAlloc(size); }
static void SzMidFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) MidFree(address); }
static void *SzBigAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return BigAlloc(size); }
static void SzBigFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) BigFree(address); }
static void *SzMidAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MidAlloc(size); }
static void SzMidFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MidFree(address); }
static void *SzBigAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return BigAlloc(size); }
static void SzBigFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); BigFree(address); }
const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree };
const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree };
#endif
#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
/*
uintptr_t : <stdint.h> C99 (optional)
: unsupported in VS6
*/
#ifdef _WIN32
typedef UINT_PTR UIntPtr;
#else
/*
typedef uintptr_t UIntPtr;
*/
typedef ptrdiff_t UIntPtr;
#endif
#define ADJUST_ALLOC_SIZE 0
/*
@ -344,36 +280,14 @@ const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree };
MyAlloc() can return address that is NOT multiple of sizeof(void *).
*/
/*
uintptr_t : <stdint.h> C99 (optional)
: unsupported in VS6
#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((char *)(p) - ((size_t)(UIntPtr)(p) & ((align) - 1))))
*/
typedef
#ifdef _WIN32
UINT_PTR
#elif 1
uintptr_t
#else
ptrdiff_t
#endif
MY_uintptr_t;
#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((((UIntPtr)(p)) & ~((UIntPtr)(align) - 1))))
#if 0 \
|| (defined(__CHERI__) \
|| defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ > 8))
// for 128-bit pointers (cheri):
#define MY_ALIGN_PTR_DOWN(p, align) \
((void *)((char *)(p) - ((size_t)(MY_uintptr_t)(p) & ((align) - 1))))
#else
#define MY_ALIGN_PTR_DOWN(p, align) \
((void *)((((MY_uintptr_t)(p)) & ~((MY_uintptr_t)(align) - 1))))
#endif
#endif
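Both variants align a pointer downward to a power-of-two boundary; the mask form clears the low bits, while the CHERI form subtracts the misalignment so capability metadata survives. The same arithmetic on plain integers:

#include <stdint.h>
#include <assert.h>

static uintptr_t align_down(uintptr_t p, uintptr_t align) /* align: power of two */
{
  return p & ~(align - 1);
}

int main(void)
{
  assert(align_down(0x1007, 16) == 0x1000);
  assert(align_down(0x1000, 16) == 0x1000); /* already aligned: unchanged */
  return 0;
}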
#if !defined(_WIN32) \
&& (defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) \
|| defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L))
#if !defined(_WIN32) && defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L)
#define USE_posix_memalign
#endif
@ -413,13 +327,14 @@ static int posix_memalign(void **ptr, size_t align, size_t size)
#define ALLOC_ALIGN_SIZE ((size_t)1 << 7)
void *z7_AlignedAlloc(size_t size)
static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size)
{
#ifndef USE_posix_memalign
#ifndef USE_posix_memalign
void *p;
void *pAligned;
size_t newSize;
UNUSED_VAR(pp);
/* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned
block to prevent cache line sharing with another allocated blocks */
@ -444,9 +359,10 @@ void *z7_AlignedAlloc(size_t size)
return pAligned;
#else
#else
void *p;
UNUSED_VAR(pp);
if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size))
return NULL;
@ -455,37 +371,19 @@ void *z7_AlignedAlloc(size_t size)
return p;
#endif
}
void z7_AlignedFree(void *address)
{
#ifndef USE_posix_memalign
if (address)
MyFree(((void **)address)[-1]);
#else
free(address);
#endif
}
static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size)
{
UNUSED_VAR(pp)
return z7_AlignedAlloc(size);
#endif
}
static void SzAlignedFree(ISzAllocPtr pp, void *address)
{
UNUSED_VAR(pp)
#ifndef USE_posix_memalign
UNUSED_VAR(pp);
#ifndef USE_posix_memalign
if (address)
MyFree(((void **)address)[-1]);
#else
#else
free(address);
#endif
#endif
}
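When posix_memalign is unavailable, the pair above over-allocates, aligns the result up, and hides the original malloc pointer in the void * slot directly below the aligned block; the free path reads it back with ((void **)address)[-1]. The scheme reduced to a minimal standalone form (a sketch, not the exact Alloc.c code):

#include <stdlib.h>
#include <stdint.h>

#define ALIGN 128  /* mirrors ALLOC_ALIGN_SIZE */

static void *aligned_alloc_sketch(size_t size)
{
  void *base = malloc(size + ALIGN);  /* slack for alignment + hidden slot */
  char *p;
  if (!base)
    return NULL;
  /* malloc alignment is at least sizeof(void *), so one pointer slot
     always fits between base and the aligned address */
  p = (char *)(((uintptr_t)base + ALIGN) & ~(uintptr_t)(ALIGN - 1));
  ((void **)p)[-1] = base;
  return p;
}

static void aligned_free_sketch(void *p)
{
  if (p)
    free(((void **)p)[-1]);
}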
@ -493,45 +391,17 @@ const ISzAlloc g_AlignedAlloc = { SzAlignedAlloc, SzAlignedFree };
#define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *))
/* we align ptr to support cases where CAlignOffsetAlloc::offset is not a multiple of sizeof(void *) */
#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
#if 1
#define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *))
#define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1]
#else
// we can use this simplified code,
// if (CAlignOffsetAlloc::offset == (k * sizeof(void *))
#define REAL_BLOCK_PTR_VAR(p) (((void **)(p))[-1])
#endif
#endif
#if 0
#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
#include <stdio.h>
static void PrintPtr(const char *s, const void *p)
{
const Byte *p2 = (const Byte *)&p;
unsigned i;
printf("%s %p ", s, p);
for (i = sizeof(p); i != 0;)
{
i--;
printf("%02x", p2[i]);
}
printf("\n");
}
#endif
#endif
#define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1]
/*
#define REAL_BLOCK_PTR_VAR(p) ((void **)(p))[-1]
*/
static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size)
{
#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR)
UNUSED_VAR(pp)
return z7_AlignedAlloc(size);
#else
const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt);
CAlignOffsetAlloc *p = CONTAINER_FROM_VTBL(pp, CAlignOffsetAlloc, vt);
void *adr;
void *pAligned;
size_t newSize;
@ -559,12 +429,6 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size)
pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr +
alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset;
#if 0
printf("\nalignSize = %6x, offset=%6x, size=%8x \n", (unsigned)alignSize, (unsigned)p->offset, (unsigned)size);
PrintPtr("base", adr);
PrintPtr("alig", pAligned);
#endif
PrintLn();
Print("- Aligned: ");
Print(" size="); PrintHex(size, 8);
@ -576,25 +440,19 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size)
REAL_BLOCK_PTR_VAR(pAligned) = adr;
return pAligned;
#endif
}
static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address)
{
#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR)
UNUSED_VAR(pp)
z7_AlignedFree(address);
#else
if (address)
{
const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt);
CAlignOffsetAlloc *p = CONTAINER_FROM_VTBL(pp, CAlignOffsetAlloc, vt);
PrintLn();
Print("- Aligned Free: ");
PrintLn();
ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address));
}
#endif
}

C/Alloc.h

@ -1,49 +1,31 @@
/* Alloc.h -- Memory allocation functions
2024-01-22 : Igor Pavlov : Public domain */
2021-07-13 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_ALLOC_H
#define ZIP7_INC_ALLOC_H
#ifndef __COMMON_ALLOC_H
#define __COMMON_ALLOC_H
#include "7zTypes.h"
EXTERN_C_BEGIN
/*
MyFree(NULL) : is allowed, as free(NULL)
MyAlloc(0) : returns NULL : but malloc(0) is allowed to return NULL or non_NULL
MyRealloc(NULL, 0) : returns NULL : but realloc(NULL, 0) is allowed to return NULL or non_NULL
MyRealloc() is similar to realloc() for the following cases:
MyRealloc(non_NULL, 0) : returns NULL and always calls MyFree(ptr)
MyRealloc(NULL, non_ZERO) : returns NULL, if allocation failed
MyRealloc(non_NULL, non_ZERO) : returns NULL, if reallocation failed
*/
void *MyAlloc(size_t size);
void MyFree(void *address);
void *MyRealloc(void *address, size_t size);
void *z7_AlignedAlloc(size_t size);
void z7_AlignedFree(void *p);
#ifdef _WIN32
#ifdef Z7_LARGE_PAGES
void SetLargePageSize(void);
#endif
void *MidAlloc(size_t size);
void MidFree(void *address);
void *BigAlloc(size_t size);
void BigFree(void *address);
/* #define Z7_BIG_ALLOC_IS_ZERO_FILLED */
#else
#define MidAlloc(size) z7_AlignedAlloc(size)
#define MidFree(address) z7_AlignedFree(address)
#define BigAlloc(size) z7_AlignedAlloc(size)
#define BigFree(address) z7_AlignedFree(address)
#define MidAlloc(size) MyAlloc(size)
#define MidFree(address) MyFree(address)
#define BigAlloc(size) MyAlloc(size)
#define BigFree(address) MyFree(address)
#endif


@ -1,12 +0,0 @@
!IFDEF ASM_OBJS
!IF "$(PLATFORM)" == "arm64"
$(ASM_OBJS): ../../../Asm/arm64/$(*B).S
$(COMPL_ASM_CLANG)
!ELSEIF "$(PLATFORM)" == "arm"
$(ASM_OBJS): ../../../Asm/arm/$(*B).asm
$(COMPL_ASM)
!ELSEIF "$(PLATFORM)" != "ia64" && "$(PLATFORM)" != "mips"
$(ASM_OBJS): ../../../Asm/x86/$(*B).asm
$(COMPL_ASM)
!ENDIF
!ENDIF

C/Bcj2.c

@ -1,24 +1,29 @@
/* Bcj2.c -- BCJ2 Decoder (Converter for x86 code)
2023-03-01 : Igor Pavlov : Public domain */
2021-02-09 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Bcj2.h"
#include "CpuArch.h"
#define CProb UInt16
#define kTopValue ((UInt32)1 << 24)
#define kNumBitModelTotalBits 11
#define kBitModelTotal (1 << kNumBitModelTotalBits)
#define kNumModelBits 11
#define kBitModelTotal (1 << kNumModelBits)
#define kNumMoveBits 5
// UInt32 bcj2_stats[256 + 2][2];
#define _IF_BIT_0 ttt = *prob; bound = (p->range >> kNumModelBits) * ttt; if (p->code < bound)
#define _UPDATE_0 p->range = bound; *prob = (CProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
#define _UPDATE_1 p->range -= bound; p->code -= bound; *prob = (CProb)(ttt - (ttt >> kNumMoveBits));
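The old macros and the new inline code implement the same adaptive binary decode step: an 11-bit probability scaled into the range picks the branch, and the probability moves toward the observed bit by a shift of kNumMoveBits. The step in isolation (local names; range normalization elided, as in the macros):

#include "7zTypes.h"

typedef UInt16 Prob;

static int decode_bit(UInt32 *range, UInt32 *code, Prob *prob)
{
  const unsigned ttt = *prob;
  const UInt32 bound = (*range >> 11) * ttt;        /* kNumBitModelTotalBits */
  if (*code < bound)
  {
    *range = bound;
    *prob = (Prob)(ttt + (((1 << 11) - ttt) >> 5)); /* raise P(bit = 0) */
    return 0;
  }
  *range -= bound;
  *code -= bound;
  *prob = (Prob)(ttt - (ttt >> 5));                 /* kNumMoveBits = 5 */
  return 1;
}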
void Bcj2Dec_Init(CBcj2Dec *p)
{
unsigned i;
p->state = BCJ2_STREAM_RC; // BCJ2_DEC_STATE_OK;
p->state = BCJ2_DEC_STATE_OK;
p->ip = 0;
p->temp = 0;
p->temp[3] = 0;
p->range = 0;
p->code = 0;
for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++)
@ -27,248 +32,217 @@ void Bcj2Dec_Init(CBcj2Dec *p)
SRes Bcj2Dec_Decode(CBcj2Dec *p)
{
UInt32 v = p->temp;
// const Byte *src;
if (p->range <= 5)
{
UInt32 code = p->code;
p->state = BCJ2_DEC_STATE_ERROR; /* for case if we return SZ_ERROR_DATA; */
p->state = BCJ2_DEC_STATE_OK;
for (; p->range != 5; p->range++)
{
if (p->range == 1 && code != 0)
if (p->range == 1 && p->code != 0)
return SZ_ERROR_DATA;
if (p->bufs[BCJ2_STREAM_RC] == p->lims[BCJ2_STREAM_RC])
{
p->state = BCJ2_STREAM_RC;
return SZ_OK;
}
code = (code << 8) | *(p->bufs[BCJ2_STREAM_RC])++;
p->code = code;
p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++;
}
if (code == 0xffffffff)
if (p->code == 0xFFFFFFFF)
return SZ_ERROR_DATA;
p->range = 0xffffffff;
p->range = 0xFFFFFFFF;
}
// else
else if (p->state >= BCJ2_DEC_STATE_ORIG_0)
{
unsigned state = p->state;
// we check BCJ2_IS_32BIT_STREAM() here instead of check in the main loop
if (BCJ2_IS_32BIT_STREAM(state))
{
const Byte *cur = p->bufs[state];
if (cur == p->lims[state])
return SZ_OK;
p->bufs[state] = cur + 4;
{
const UInt32 ip = p->ip + 4;
v = GetBe32a(cur) - ip;
p->ip = ip;
}
state = BCJ2_DEC_STATE_ORIG_0;
}
if ((unsigned)(state - BCJ2_DEC_STATE_ORIG_0) < 4)
while (p->state <= BCJ2_DEC_STATE_ORIG_3)
{
Byte *dest = p->dest;
for (;;)
{
if (dest == p->destLim)
{
p->state = state;
p->temp = v;
return SZ_OK;
}
*dest++ = (Byte)v;
p->dest = dest;
if (++state == BCJ2_DEC_STATE_ORIG_3 + 1)
break;
v >>= 8;
}
if (dest == p->destLim)
return SZ_OK;
*dest = p->temp[(size_t)p->state - BCJ2_DEC_STATE_ORIG_0];
p->state++;
p->dest = dest + 1;
}
}
// src = p->bufs[BCJ2_STREAM_MAIN];
/*
if (BCJ2_IS_32BIT_STREAM(p->state))
{
const Byte *cur = p->bufs[p->state];
if (cur == p->lims[p->state])
return SZ_OK;
p->bufs[p->state] = cur + 4;
{
UInt32 val;
Byte *dest;
SizeT rem;
p->ip += 4;
val = GetBe32(cur) - p->ip;
dest = p->dest;
rem = p->destLim - dest;
if (rem < 4)
{
SizeT i;
SetUi32(p->temp, val);
for (i = 0; i < rem; i++)
dest[i] = p->temp[i];
p->dest = dest + rem;
p->state = BCJ2_DEC_STATE_ORIG_0 + (unsigned)rem;
return SZ_OK;
}
SetUi32(dest, val);
p->temp[3] = (Byte)(val >> 24);
p->dest = dest + 4;
p->state = BCJ2_DEC_STATE_OK;
}
}
*/
for (;;)
{
/*
if (BCJ2_IS_32BIT_STREAM(p->state))
p->state = BCJ2_DEC_STATE_OK;
else
*/
{
if (p->range < kTopValue)
{
if (p->bufs[BCJ2_STREAM_RC] == p->lims[BCJ2_STREAM_RC])
{
p->state = BCJ2_STREAM_RC;
p->temp = v;
return SZ_OK;
}
p->range <<= 8;
p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++;
}
{
const Byte *src = p->bufs[BCJ2_STREAM_MAIN];
const Byte *srcLim;
Byte *dest = p->dest;
Byte *dest;
SizeT num = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - src);
if (num == 0)
{
const SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - src);
SizeT num = (SizeT)(p->destLim - dest);
if (num >= rem)
num = rem;
#define NUM_ITERS 4
#if (NUM_ITERS & (NUM_ITERS - 1)) == 0
num &= ~((SizeT)NUM_ITERS - 1); // if (NUM_ITERS == (1 << x))
#else
num -= num % NUM_ITERS; // if (NUM_ITERS != (1 << x))
#endif
srcLim = src + num;
p->state = BCJ2_STREAM_MAIN;
return SZ_OK;
}
#define NUM_SHIFT_BITS 24
#define ONE_ITER(indx) { \
const unsigned b = src[indx]; \
*dest++ = (Byte)b; \
v = (v << NUM_SHIFT_BITS) | b; \
if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
/* ++dest */; /* v = b; */ }
if (src != srcLim)
for (;;)
dest = p->dest;
if (num > (SizeT)(p->destLim - dest))
{
/* The 2-cycle dependency chain for the (v) calculation is not a big problem here.
But we could remove the dependency chain with v = b at the end of the loop. */
ONE_ITER(0)
#if (NUM_ITERS > 1)
ONE_ITER(1)
#if (NUM_ITERS > 2)
ONE_ITER(2)
#if (NUM_ITERS > 3)
ONE_ITER(3)
#if (NUM_ITERS > 4)
ONE_ITER(4)
#if (NUM_ITERS > 5)
ONE_ITER(5)
#if (NUM_ITERS > 6)
ONE_ITER(6)
#if (NUM_ITERS > 7)
ONE_ITER(7)
#endif
#endif
#endif
#endif
#endif
#endif
#endif
src += NUM_ITERS;
if (src == srcLim)
break;
}
if (src == srcLim)
#if (NUM_ITERS > 1)
for (;;)
#endif
{
#if (NUM_ITERS > 1)
if (src == p->lims[BCJ2_STREAM_MAIN] || dest == p->destLim)
#endif
num = (SizeT)(p->destLim - dest);
if (num == 0)
{
const SizeT num = (SizeT)(src - p->bufs[BCJ2_STREAM_MAIN]);
p->bufs[BCJ2_STREAM_MAIN] = src;
p->dest = dest;
p->ip += (UInt32)num;
/* state BCJ2_STREAM_MAIN has more priority than BCJ2_STATE_ORIG */
p->state =
src == p->lims[BCJ2_STREAM_MAIN] ?
(unsigned)BCJ2_STREAM_MAIN :
(unsigned)BCJ2_DEC_STATE_ORIG;
p->temp = v;
p->state = BCJ2_DEC_STATE_ORIG;
return SZ_OK;
}
#if (NUM_ITERS > 1)
ONE_ITER(0)
src++;
#endif
}
srcLim = src + num;
if (p->temp[3] == 0x0F && (src[0] & 0xF0) == 0x80)
*dest = src[0];
else for (;;)
{
const SizeT num = (SizeT)(dest - p->dest);
p->dest = dest; // p->dest += num;
p->bufs[BCJ2_STREAM_MAIN] += num; // = src;
p->ip += (UInt32)num;
Byte b = *src;
*dest = b;
if (b != 0x0F)
{
if ((b & 0xFE) == 0xE8)
break;
dest++;
if (++src != srcLim)
continue;
break;
}
dest++;
if (++src == srcLim)
break;
if ((*src & 0xF0) != 0x80)
continue;
*dest = *src;
break;
}
num = (SizeT)(src - p->bufs[BCJ2_STREAM_MAIN]);
if (src == srcLim)
{
p->temp[3] = src[-1];
p->bufs[BCJ2_STREAM_MAIN] = src;
p->ip += (UInt32)num;
p->dest += num;
p->state =
p->bufs[BCJ2_STREAM_MAIN] ==
p->lims[BCJ2_STREAM_MAIN] ?
(unsigned)BCJ2_STREAM_MAIN :
(unsigned)BCJ2_DEC_STATE_ORIG;
return SZ_OK;
}
{
UInt32 bound, ttt;
CBcj2Prob *prob; // unsigned index;
/*
prob = p->probs + (unsigned)((Byte)v == 0xe8 ?
2 + (Byte)(v >> 8) :
((v >> 5) & 1)); // ((Byte)v < 0xe8 ? 0 : 1));
*/
CProb *prob;
Byte b = src[0];
Byte prev = (Byte)(num == 0 ? p->temp[3] : src[-1]);
p->temp[3] = b;
p->bufs[BCJ2_STREAM_MAIN] = src + 1;
num++;
p->ip += (UInt32)num;
p->dest += num;
prob = p->probs + (unsigned)(b == 0xE8 ? 2 + (unsigned)prev : (b == 0xE9 ? 1 : 0));
_IF_BIT_0
{
const unsigned c = ((v + 0x17) >> 6) & 1;
prob = p->probs + (unsigned)
(((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
// (Byte)
// 8x->0 : e9->1 : xxe8->xx+2
// 8x->0x100 : e9->0x101 : xxe8->xx
// (((0x100 - (e & ~v)) & (0x100 | (v >> 8))) + (e & v));
// (((0x101 + (~e | v)) & (0x100 | (v >> 8))) + (e & v));
}
ttt = *prob;
bound = (p->range >> kNumBitModelTotalBits) * ttt;
if (p->code < bound)
{
// bcj2_stats[prob - p->probs][0]++;
p->range = bound;
*prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
_UPDATE_0
continue;
}
{
// bcj2_stats[prob - p->probs][1]++;
p->range -= bound;
p->code -= bound;
*prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
}
_UPDATE_1
}
}
}
{
/* (v == 0xe8 ? 0 : 1) uses setcc instruction with additional zero register usage in x64 MSVC. */
// const unsigned cj = ((Byte)v == 0xe8) ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP;
const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
UInt32 val;
unsigned cj = (p->temp[3] == 0xE8) ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP;
const Byte *cur = p->bufs[cj];
Byte *dest;
SizeT rem;
if (cur == p->lims[cj])
{
p->state = cj;
break;
}
v = GetBe32a(cur);
val = GetBe32(cur);
p->bufs[cj] = cur + 4;
{
const UInt32 ip = p->ip + 4;
v -= ip;
p->ip = ip;
}
p->ip += 4;
val -= p->ip;
dest = p->dest;
rem = (SizeT)(p->destLim - dest);
if (rem < 4)
{
if ((unsigned)rem > 0) { dest[0] = (Byte)v; v >>= 8;
if ((unsigned)rem > 1) { dest[1] = (Byte)v; v >>= 8;
if ((unsigned)rem > 2) { dest[2] = (Byte)v; v >>= 8; }}}
p->temp = v;
p->temp[0] = (Byte)val; if (rem > 0) dest[0] = (Byte)val; val >>= 8;
p->temp[1] = (Byte)val; if (rem > 1) dest[1] = (Byte)val; val >>= 8;
p->temp[2] = (Byte)val; if (rem > 2) dest[2] = (Byte)val; val >>= 8;
p->temp[3] = (Byte)val;
p->dest = dest + rem;
p->state = BCJ2_DEC_STATE_ORIG_0 + (unsigned)rem;
break;
}
SetUi32(dest, v)
v >>= 24;
SetUi32(dest, val);
p->temp[3] = (Byte)(val >> 24);
p->dest = dest + 4;
}
}
@ -278,13 +252,6 @@ SRes Bcj2Dec_Decode(CBcj2Dec *p)
p->range <<= 8;
p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++;
}
return SZ_OK;
}
#undef NUM_ITERS
#undef ONE_ITER
#undef NUM_SHIFT_BITS
#undef kTopValue
#undef kNumBitModelTotalBits
#undef kBitModelTotal
#undef kNumMoveBits

C/Bcj2.h

@ -1,8 +1,8 @@
/* Bcj2.h -- BCJ2 converter for x86 code (Branch CALL/JUMP variant2)
2023-03-02 : Igor Pavlov : Public domain */
/* Bcj2.h -- BCJ2 Converter for x86 code
2014-11-10 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_BCJ2_H
#define ZIP7_INC_BCJ2_H
#ifndef __BCJ2_H
#define __BCJ2_H
#include "7zTypes.h"
@ -26,68 +26,37 @@ enum
BCJ2_DEC_STATE_ORIG_3,
BCJ2_DEC_STATE_ORIG,
BCJ2_DEC_STATE_ERROR /* after detected data error */
BCJ2_DEC_STATE_OK
};
enum
{
BCJ2_ENC_STATE_ORIG = BCJ2_NUM_STREAMS,
BCJ2_ENC_STATE_FINISHED /* it's state after fully encoded stream */
BCJ2_ENC_STATE_OK
};
/* #define BCJ2_IS_32BIT_STREAM(s) ((s) == BCJ2_STREAM_CALL || (s) == BCJ2_STREAM_JUMP) */
#define BCJ2_IS_32BIT_STREAM(s) ((unsigned)((unsigned)(s) - (unsigned)BCJ2_STREAM_CALL) < 2)
#define BCJ2_IS_32BIT_STREAM(s) ((s) == BCJ2_STREAM_CALL || (s) == BCJ2_STREAM_JUMP)
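The new form relies on unsigned wraparound: (s) - BCJ2_STREAM_CALL is below 2 only for the two adjacent enum values BCJ2_STREAM_CALL and BCJ2_STREAM_JUMP, so two comparisons collapse into one. The trick in general form:

#include <assert.h>

/* tests lo <= x < lo + n in a single comparison:
   any x below lo wraps to a huge unsigned value */
static int in_range(unsigned x, unsigned lo, unsigned n)
{
  return x - lo < n;
}

int main(void)
{
  assert( in_range(5, 4, 2));   /* 4 <= 5 < 6 */
  assert(!in_range(3, 4, 2));   /* 3 - 4 wraps to UINT_MAX: fails */
  return 0;
}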
/*
CBcj2Dec / CBcj2Enc
bufs sizes:
BUF_SIZE(n) = lims[n] - bufs[n]
bufs sizes for BCJ2_STREAM_CALL and BCJ2_STREAM_JUMP must be multiply of 4:
bufs sizes for BCJ2_STREAM_CALL and BCJ2_STREAM_JUMP must be mutliply of 4:
(BUF_SIZE(BCJ2_STREAM_CALL) & 3) == 0
(BUF_SIZE(BCJ2_STREAM_JUMP) & 3) == 0
*/
// typedef UInt32 CBcj2Prob;
typedef UInt16 CBcj2Prob;
/*
BCJ2 encoder / decoder internal requirements:
- If last bytes of stream contain marker (e8/e9/0f8x), then
there is also encoded symbol (0 : no conversion) in RC stream.
- One case of overlapped instructions is supported,
if last byte of converted instruction is (0f) and next byte is (8x):
marker [xx xx xx 0f] 8x
then the pair (0f 8x) is treated as marker.
*/
/* ---------- BCJ2 Decoder ---------- */
/*
CBcj2Dec:
(dest) is allowed to overlap with bufs[BCJ2_STREAM_MAIN], with the following conditions:
dest is allowed to overlap with bufs[BCJ2_STREAM_MAIN], with the following conditions:
bufs[BCJ2_STREAM_MAIN] >= dest &&
bufs[BCJ2_STREAM_MAIN] - dest >=
bufs[BCJ2_STREAM_MAIN] - dest >= tempReserv +
BUF_SIZE(BCJ2_STREAM_CALL) +
BUF_SIZE(BCJ2_STREAM_JUMP)
reserve = bufs[BCJ2_STREAM_MAIN] - dest -
( BUF_SIZE(BCJ2_STREAM_CALL) +
BUF_SIZE(BCJ2_STREAM_JUMP) )
and additional conditions:
if (it's first call of Bcj2Dec_Decode() after Bcj2Dec_Init())
{
(reserve != 1) : if (ver < v23.00)
}
else // if there are more than one calls of Bcj2Dec_Decode() after Bcj2Dec_Init())
{
(reserve >= 6) : if (ver < v23.00)
(reserve >= 4) : if (ver >= v23.00)
We need that (reserve) because after first call of Bcj2Dec_Decode(),
CBcj2Dec::temp can contain up to 4 bytes for writing to (dest).
}
(reserve == 0) is allowed, if we decode full stream via single call of Bcj2Dec_Decode().
(reserve == 0) also is allowed in case of multi-call, if we use fixed buffers,
and (reserve) is calculated from full (final) sizes of all streams before first call.
tempReserv = 0 : for first call of Bcj2Dec_Decode
tempReserv = 4 : for any other calls of Bcj2Dec_Decode
overlap with offset = 1 is not allowed
*/
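The caller can verify the reserve rule above before overlapping buffers; a hedged sketch with caller-side names (the tempReserv values follow the v23 comment; pre-v23 rules differ):

#include <stddef.h>

/* dest may overlap bufs[BCJ2_STREAM_MAIN] only if MAIN sits far enough
   above dest to cover the CALL/JUMP expansion plus the temp reserve */
static int bcj2_overlap_ok(const unsigned char *dest,
    const unsigned char *mainBuf, size_t callSize, size_t jumpSize,
    size_t tempReserv /* 0 for the first call, 4 afterwards */)
{
  if (mainBuf < dest)
    return 0;
  return (size_t)(mainBuf - dest) >= tempReserv + callSize + jumpSize;
}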
typedef struct
@ -99,66 +68,22 @@ typedef struct
unsigned state; /* BCJ2_STREAM_MAIN has more priority than BCJ2_STATE_ORIG */
UInt32 ip; /* property of starting base for decoding */
UInt32 temp; /* Byte temp[4]; */
UInt32 ip;
Byte temp[4];
UInt32 range;
UInt32 code;
CBcj2Prob probs[2 + 256];
UInt16 probs[2 + 256];
} CBcj2Dec;
/* Note:
Bcj2Dec_Init() sets (CBcj2Dec::ip = 0)
if (ip != 0) property is required, the caller must set CBcj2Dec::ip after Bcj2Dec_Init()
*/
void Bcj2Dec_Init(CBcj2Dec *p);
/* Bcj2Dec_Decode():
returns:
SZ_OK
SZ_ERROR_DATA : if data in 5 starting bytes of BCJ2_STREAM_RC stream are not correct
*/
/* Returns: SZ_OK or SZ_ERROR_DATA */
SRes Bcj2Dec_Decode(CBcj2Dec *p);
/* To check that decoding was finished, you can compare
sizes of processed streams with sizes known from other sources.
You must do at least one mandatory check from the two following options:
- the check for size of processed output (ORIG) stream.
- the check for size of processed input (MAIN) stream.
additional optional checks:
- the checks for processed sizes of all input streams (MAIN, CALL, JUMP, RC)
- the checks Bcj2Dec_IsMaybeFinished*()
also before actual decoding you can check that the
following condition is met for stream sizes:
( size(ORIG) == size(MAIN) + size(CALL) + size(JUMP) )
*/
/* (state == BCJ2_STREAM_MAIN) means that decoder is ready for
additional input data in BCJ2_STREAM_MAIN stream.
Note that (state == BCJ2_STREAM_MAIN) is allowed for non-finished decoding.
*/
#define Bcj2Dec_IsMaybeFinished_state_MAIN(_p_) ((_p_)->state == BCJ2_STREAM_MAIN)
/* if the stream decoding was finished correctly, then range decoder
part of CBcj2Dec also was finished, and then (CBcj2Dec::code == 0).
Note that (CBcj2Dec::code == 0) is allowed for non-finished decoding.
*/
#define Bcj2Dec_IsMaybeFinished_code(_p_) ((_p_)->code == 0)
/* use Bcj2Dec_IsMaybeFinished() only as additional check
after at least one mandatory check from the two following options:
- the check for size of processed output (ORIG) stream.
- the check for size of processed input (MAIN) stream.
*/
#define Bcj2Dec_IsMaybeFinished(_p_) ( \
Bcj2Dec_IsMaybeFinished_state_MAIN(_p_) && \
Bcj2Dec_IsMaybeFinished_code(_p_))
#define Bcj2Dec_IsFinished(_p_) ((_p_)->code == 0)
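The size relation mentioned above gives a cheap pre-decode sanity check; as a caller-side sketch:

#include "7zTypes.h"

/* every decoded byte comes from MAIN except the 4-byte absolute
   targets, which come from CALL or JUMP */
static int bcj2_sizes_consistent(UInt64 origSize, UInt64 mainSize,
    UInt64 callSize, UInt64 jumpSize)
{
  return origSize == mainSize + callSize + jumpSize;
}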
/* ---------- BCJ2 Encoder ---------- */
typedef enum
{
BCJ2_ENC_FINISH_MODE_CONTINUE,
@ -166,91 +91,6 @@ typedef enum
BCJ2_ENC_FINISH_MODE_END_STREAM
} EBcj2Enc_FinishMode;
/*
BCJ2_ENC_FINISH_MODE_CONTINUE:
process non finished encoding.
It notifies the encoder that additional further calls
can provide more input data (src) than provided by current call.
In that case the CBcj2Enc encoder still can move (src) pointer
up to (srcLim), but CBcj2Enc encoder can store some of the last
processed bytes (up to 4 bytes) from src to internal CBcj2Enc::temp[] buffer.
at return:
(CBcj2Enc::src will point to position that includes
processed data and data copied to (temp[]) buffer)
That data from (temp[]) buffer will be used in further calls.
BCJ2_ENC_FINISH_MODE_END_BLOCK:
finish encoding of current block (ended at srcLim) without RC flushing.
at return: if (CBcj2Enc::state == BCJ2_ENC_STATE_ORIG) &&
CBcj2Enc::src == CBcj2Enc::srcLim)
: it shows that block encoding was finished. And the encoder is
ready for new (src) data or for stream finish operation.
finished block means
{
CBcj2Enc has completed block encoding up to (srcLim).
(1 + 4 bytes) or (2 + 4 bytes) CALL/JUMP sequences will
not cross block boundary at (srcLim).
temporary CBcj2Enc buffer for (ORIG) src data is empty.
3 output uncompressed streams (MAIN, CALL, JUMP) were flushed.
RC stream was not flushed. And RC stream will cross block boundary.
}
Note: some possible implementation of BCJ2 encoder could
write branch marker (e8/e9/0f8x) in one call of Bcj2Enc_Encode(),
and it could calculate symbol for RC in another call of Bcj2Enc_Encode().
BCJ2 encoder uses ip/fileIp/fileSize/relatLimit values to calculate RC symbol.
And these CBcj2Enc variables can have different values in different Bcj2Enc_Encode() calls.
So caller must finish each block with BCJ2_ENC_FINISH_MODE_END_BLOCK
to ensure that RC symbol is calculated and written in proper block.
BCJ2_ENC_FINISH_MODE_END_STREAM
finish encoding of stream (ended at srcLim) fully including RC flushing.
at return: if (CBcj2Enc::state == BCJ2_ENC_STATE_FINISHED)
: it shows that stream encoding was finished fully,
and all output streams were flushed fully.
also Bcj2Enc_IsFinished() can be called.
*/
/*
32-bit relative offset in JUMP/CALL commands is
- (mod 4 GiB) for 32-bit x86 code
- signed Int32 for 64-bit x86-64 code
BCJ2 encoder also does internal relative to absolute address conversions.
And there are 2 possible ways to do it:
before v23: we used 32-bit variables and (mod 4 GiB) conversion
since v23: we use 64-bit variables and (signed Int32 offset) conversion.
The absolute address condition for conversion in v23:
((UInt64)((Int64)ip64 - (Int64)fileIp64 + 5 + (Int32)offset) < (UInt64)fileSize64)
note that if (fileSize64 > 2 GiB), there is a difference between
the old (mod 4 GiB) way (v22) and the new (signed Int32 offset) way (v23).
And the new (v23) way is more suitable for encoding 64-bit x86-64 code when (fileSize64 > 2 GiB).
*/
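The v23 condition from the note above, written out as code (a sketch; variable names follow the comment, not the encoder internals):

#include "7zTypes.h"

/* true when the branch target ip64 + 5 + offset lands inside the file
   [fileIp64, fileIp64 + fileSize64) using signed Int32 offset semantics */
static int target_in_file(UInt64 ip64, UInt64 fileIp64, UInt64 fileSize64,
    Int32 offset)
{
  return (UInt64)((Int64)ip64 - (Int64)fileIp64 + 5 + offset) < fileSize64;
}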
/*
// for old (v22) way for conversion:
typedef UInt32 CBcj2Enc_ip_unsigned;
typedef Int32 CBcj2Enc_ip_signed;
#define BCJ2_ENC_FileSize_MAX ((UInt32)1 << 31)
*/
typedef UInt64 CBcj2Enc_ip_unsigned;
typedef Int64 CBcj2Enc_ip_signed;
/* maximum size of file that can be used for conversion condition */
#define BCJ2_ENC_FileSize_MAX ((CBcj2Enc_ip_unsigned)0 - 2)
/* default value of fileSize64_minus1 variable that means
that absolute address limitation will not be used */
#define BCJ2_ENC_FileSizeField_UNLIMITED ((CBcj2Enc_ip_unsigned)0 - 1)
/* calculate value that later can be set to CBcj2Enc::fileSize64_minus1 */
#define BCJ2_ENC_GET_FileSizeField_VAL_FROM_FileSize(fileSize) \
((CBcj2Enc_ip_unsigned)(fileSize) - 1)
/* set CBcj2Enc::fileSize64_minus1 variable from size of file */
#define Bcj2Enc_SET_FileSize(p, fileSize) \
(p)->fileSize64_minus1 = BCJ2_ENC_GET_FileSizeField_VAL_FROM_FileSize(fileSize);
typedef struct
{
Byte *bufs[BCJ2_NUM_STREAMS];
@ -261,71 +101,45 @@ typedef struct
unsigned state;
EBcj2Enc_FinishMode finishMode;
Byte context;
Byte flushRem;
Byte isFlushState;
Byte prevByte;
Byte cache;
UInt32 range;
UInt64 low;
UInt64 cacheSize;
// UInt32 context; // for marker version, it can include marker flag.
/* (ip64) and (fileIp64) correspond to virtual source stream position
that doesn't include data in temp[] */
CBcj2Enc_ip_unsigned ip64; /* current (ip) position */
CBcj2Enc_ip_unsigned fileIp64; /* start (ip) position of current file */
CBcj2Enc_ip_unsigned fileSize64_minus1; /* size of current file (for conversion limitation) */
UInt32 relatLimit; /* (relatLimit <= ((UInt32)1 << 31)) : 0 means disable_conversion */
// UInt32 relatExcludeBits;
UInt32 ip;
/* 32-bit ralative offset in JUMP/CALL commands is
- (mod 4 GB) in 32-bit mode
- signed Int32 in 64-bit mode
We use (mod 4 GB) check for fileSize.
Use fileSize up to 2 GB, if you want to support 32-bit and 64-bit code conversion. */
UInt32 fileIp;
UInt32 fileSize; /* (fileSize <= ((UInt32)1 << 31)), 0 means no_limit */
UInt32 relatLimit; /* (relatLimit <= ((UInt32)1 << 31)), 0 means desable_conversion */
UInt32 tempTarget;
unsigned tempPos; /* the number of bytes that were copied to temp[] buffer
(tempPos <= 4) outside of Bcj2Enc_Encode() */
// Byte temp[4]; // for marker version
Byte temp[8];
CBcj2Prob probs[2 + 256];
unsigned tempPos;
Byte temp[4 * 2];
unsigned flushPos;
UInt16 probs[2 + 256];
} CBcj2Enc;
void Bcj2Enc_Init(CBcj2Enc *p);
/*
Bcj2Enc_Encode(): at exit:
p->State < BCJ2_NUM_STREAMS : we need more buffer space for output stream
(bufs[p->State] == lims[p->State])
p->State == BCJ2_ENC_STATE_ORIG : we need more data in input src stream
(src == srcLim)
p->State == BCJ2_ENC_STATE_FINISHED : after fully encoded stream
*/
void Bcj2Enc_Encode(CBcj2Enc *p);
/* Bcj2Enc encoder can look ahead for up 4 bytes of source stream.
CBcj2Enc::tempPos : is the number of bytes that were copied from input stream to temp[] buffer.
(CBcj2Enc::src) after Bcj2Enc_Encode() is starting position after
fully processed data and after data copied to temp buffer.
So if the caller needs to get the real number of fully processed input
bytes (without the look-ahead data in the temp buffer),
the caller must subtract the (CBcj2Enc::tempPos) value from the processed-size
value that is calculated based on the current (CBcj2Enc::src):
cur_processed_pos = Calc_Big_Processed_Pos(enc.src)) -
Bcj2Enc_Get_AvailInputSize_in_Temp(&enc);
*/
/* get the size of input data that was stored in temp[] buffer: */
#define Bcj2Enc_Get_AvailInputSize_in_Temp(p) ((p)->tempPos)
#define Bcj2Enc_Get_InputData_Size(p) ((SizeT)((p)->srcLim - (p)->src) + (p)->tempPos)
#define Bcj2Enc_IsFinished(p) ((p)->flushPos == 5)
#define Bcj2Enc_IsFinished(p) ((p)->flushRem == 0)
/* Note : the decoder supports overlapping of marker (0f 80).
But we can eliminate such overlapping cases by setting
the limit for relative offset conversion as
CBcj2Enc::relatLimit <= (0x0f << 24) == (240 MiB)
*/
/* default value for CBcj2Enc::relatLimit */
#define BCJ2_ENC_RELAT_LIMIT_DEFAULT ((UInt32)0x0f << 24)
#define BCJ2_ENC_RELAT_LIMIT_MAX ((UInt32)1 << 31)
// #define BCJ2_RELAT_EXCLUDE_NUM_BITS 5
#define BCJ2_RELAT_LIMIT_NUM_BITS 26
#define BCJ2_RELAT_LIMIT ((UInt32)1 << BCJ2_RELAT_LIMIT_NUM_BITS)
/* limit for CBcj2Enc::fileSize variable */
#define BCJ2_FileSize_MAX ((UInt32)1 << 31)
EXTERN_C_END

C/Bcj2Enc.c

@ -1,62 +1,60 @@
/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)
2023-04-02 : Igor Pavlov : Public domain */
/* Bcj2Enc.c -- BCJ2 Encoder (Converter for x86 code)
2021-02-09 : Igor Pavlov : Public domain */
#include "Precomp.h"
/* #define SHOW_STAT */
#ifdef SHOW_STAT
#include <stdio.h>
#define PRF2(s) printf("%s ip=%8x tempPos=%d src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src));
#define PRF(x) x
#else
#define PRF2(s)
#define PRF(x)
#endif
#include <string.h>
#include "Bcj2.h"
#include "CpuArch.h"
#define CProb UInt16
#define kTopValue ((UInt32)1 << 24)
#define kNumBitModelTotalBits 11
#define kBitModelTotal (1 << kNumBitModelTotalBits)
#define kNumModelBits 11
#define kBitModelTotal (1 << kNumModelBits)
#define kNumMoveBits 5
void Bcj2Enc_Init(CBcj2Enc *p)
{
unsigned i;
p->state = BCJ2_ENC_STATE_ORIG;
p->state = BCJ2_ENC_STATE_OK;
p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
p->context = 0;
p->flushRem = 5;
p->isFlushState = 0;
p->prevByte = 0;
p->cache = 0;
p->range = 0xffffffff;
p->range = 0xFFFFFFFF;
p->low = 0;
p->cacheSize = 1;
p->ip64 = 0;
p->fileIp64 = 0;
p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;
p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;
// p->relatExcludeBits = 0;
p->ip = 0;
p->fileIp = 0;
p->fileSize = 0;
p->relatLimit = BCJ2_RELAT_LIMIT;
p->tempPos = 0;
p->flushPos = 0;
for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++)
p->probs[i] = kBitModelTotal >> 1;
}
// Z7_NO_INLINE
Z7_FORCE_INLINE
static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
static BoolInt MY_FAST_CALL RangeEnc_ShiftLow(CBcj2Enc *p)
{
const UInt32 low = (UInt32)p->low;
const unsigned high = (unsigned)
#if defined(Z7_MSC_VER_ORIGINAL) \
&& defined(MY_CPU_X86) \
&& defined(MY_CPU_LE) \
&& !defined(MY_CPU_64BIT)
// we try to rid of __aullshr() call in MSVS-x86
(((const UInt32 *)&p->low)[1]); // [1] : for little-endian only
#else
(p->low >> 32);
#endif
if (low < (UInt32)0xff000000 || high != 0)
if ((UInt32)p->low < (UInt32)0xFF000000 || (UInt32)(p->low >> 32) != 0)
{
Byte *buf = p->bufs[BCJ2_STREAM_RC];
do
@ -67,440 +65,247 @@ static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
p->bufs[BCJ2_STREAM_RC] = buf;
return True;
}
*buf++ = (Byte)(p->cache + high);
p->cache = 0xff;
*buf++ = (Byte)(p->cache + (Byte)(p->low >> 32));
p->cache = 0xFF;
}
while (--p->cacheSize);
p->bufs[BCJ2_STREAM_RC] = buf;
p->cache = (Byte)(low >> 24);
p->cache = (Byte)((UInt32)p->low >> 24);
}
p->cacheSize++;
p->low = low << 8;
p->low = (UInt32)p->low << 8;
return False;
}
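The shift-low step above is the classic LZMA-style carry handler: the top byte of the 33-bit low accumulator can only be emitted once a carry can no longer reach it, so a run of pending 0xFF bytes is held back in cache/cacheSize. The same step on plain types, with a hypothetical unbounded output pointer and no buffer-limit handling:

#include <stdint.h>

static void shift_low(uint64_t *low, uint8_t *cache, unsigned *cacheSize,
    uint8_t **out)
{
  if ((uint32_t)*low < 0xFF000000u || (unsigned)(*low >> 32) != 0)
  {
    const uint8_t carry = (uint8_t)(*low >> 32);  /* 0 or 1 */
    do
    {
      *(*out)++ = (uint8_t)(*cache + carry);
      *cache = 0xFF;  /* rest of the pending run resolves to 0xFF + carry */
    }
    while (--*cacheSize);
    *cache = (uint8_t)((uint32_t)*low >> 24);
  }
  (*cacheSize)++;
  *low = (uint32_t)*low << 8;
}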
/*
We can use 2 alternative versions of code:
1) non-marker version:
Byte CBcj2Enc::context
Byte temp[8];
Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.
Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction
with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.
2) marker version:
UInt32 CBcj2Enc::context
Byte CBcj2Enc::temp[4];
MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains a found marker.
it's allowed that
one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,
and another call of Bcj2Enc_Encode_2() does offset conversion.
So different values of (fileIp) and (fileSize) are possible
in these different Bcj2Enc_Encode_2() calls.
Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.
So we use non-marker version.
*/
/*
Corner cases with overlap in multi-block.
before v23: there was one corner case, where converted instruction
could start in one sub-stream and finish in next sub-stream.
If multi-block (solid) encoding is used,
and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.
and (0f) is last byte of previous sub-stream
and (8x) is first byte of current sub-stream
then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.
The BCJ2 encoder can convert the 32-bit offset for that (0f 8x) sequence,
if that offset meets limit requirements.
If encoder allows 32-bit offset conversion for such overlap case,
then the data in 3 uncompressed BCJ2 streams for some sub-stream
can depend on data from the previous sub-stream.
That corner case is not a big problem, and it's a rare case.
Since v23.00 we do additional check to prevent conversions in such overlap cases.
*/
/*
Bcj2Enc_Encode_2() output variables at exit:
{
if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))
{
it means that encoder needs more input data.
if (p->srcLim == p->src) at exit, then
{
(p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
all input data were read and processed, and we are ready for
new input data.
}
else
{
(p->srcLim != p->src)
(p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
The encoder has found an e8/e9/0f_8x marker,
and p->src points to the last byte of that marker.
Bcj2Enc_Encode_2() needs more input data to get a total of
5 bytes (the last byte of the marker and the 32-bit branch offset)
as a contiguous array starting from p->src.
The (p->srcLim - p->src < 5) requirement is met after exit.
So the non-processed residue from p->src to p->srcLim is always less than 5 bytes.
}
}
}
*/
Z7_NO_INLINE
static void Bcj2Enc_Encode_2(CBcj2Enc *p)
{
if (!p->isFlushState)
if (BCJ2_IS_32BIT_STREAM(p->state))
{
const Byte *src;
UInt32 v;
Byte *cur = p->bufs[p->state];
if (cur == p->lims[p->state])
return;
SetBe32(cur, p->tempTarget);
p->bufs[p->state] = cur + 4;
}
p->state = BCJ2_ENC_STATE_ORIG;
for (;;)
{
if (p->range < kTopValue)
{
const unsigned state = p->state;
if (BCJ2_IS_32BIT_STREAM(state))
{
Byte *cur = p->bufs[state];
if (cur == p->lims[state])
return;
SetBe32a(cur, p->tempTarget)
p->bufs[state] = cur + 4;
}
if (RangeEnc_ShiftLow(p))
return;
p->range <<= 8;
}
p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit
src = p->src;
v = p->context;
// #define WRITE_CONTEXT p->context = v; // for marker version
#define WRITE_CONTEXT p->context = (Byte)v;
#define WRITE_CONTEXT_AND_SRC p->src = src; WRITE_CONTEXT
for (;;)
{
// const Byte *src;
// UInt32 v;
CBcj2Enc_ip_unsigned ip;
if (p->range < kTopValue)
{
// to reduce register pressure and code size: we save and restore local variables.
WRITE_CONTEXT_AND_SRC
if (Bcj2_RangeEnc_ShiftLow(p))
return;
p->range <<= 8;
src = p->src;
v = p->context;
}
// src = p->src;
// #define MARKER_FLAG ((UInt32)1 << 17)
// if ((v & MARKER_FLAG) == 0) // for marker version
{
const Byte *src = p->src;
const Byte *srcLim;
Byte *dest = p->bufs[BCJ2_STREAM_MAIN];
{
const SizeT remSrc = (SizeT)(p->srcLim - src);
SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
if (rem >= remSrc)
rem = remSrc;
srcLim = src + rem;
}
/* p->context contains context of previous byte:
bits [0 : 7] : src[-1], if (src) was changed in this call
bits [8 : 31] : are undefined for non-marker version
*/
// v = p->context;
#define NUM_SHIFT_BITS 24
#define CONV_FLAG ((UInt32)1 << 16)
#define ONE_ITER { \
b = src[0]; \
*dest++ = (Byte)b; \
v = (v << NUM_SHIFT_BITS) | b; \
if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
src++; if (src == srcLim) { break; } }
Byte *dest;
SizeT num = (SizeT)(p->srcLim - src);
if (src != srcLim)
for (;;)
if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
{
/* clang can generate inefficient code with setne instead of two jcc instructions.
we can use 2 iterations and an external (unsigned b) to avoid that inefficient code generation. */
unsigned b;
ONE_ITER
ONE_ITER
if (num <= 4)
return;
num -= 4;
}
ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);
p->bufs[BCJ2_STREAM_MAIN] = dest;
p->ip64 = ip;
else if (num == 0)
break;
if (src == srcLim)
dest = p->bufs[BCJ2_STREAM_MAIN];
if (num > (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest))
{
WRITE_CONTEXT_AND_SRC
if (src != p->srcLim)
num = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
if (num == 0)
{
p->state = BCJ2_STREAM_MAIN;
return;
}
/* (p->src == p->srcLim)
(p->state == BCJ2_ENC_STATE_ORIG) */
if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
return;
/* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */
// (p->flushRem == 5);
p->isFlushState = 1;
}
srcLim = src + num;
if (p->prevByte == 0x0F && (src[0] & 0xF0) == 0x80)
*dest = src[0];
else for (;;)
{
Byte b = *src;
*dest = b;
if (b != 0x0F)
{
if ((b & 0xFE) == 0xE8)
break;
dest++;
if (++src != srcLim)
continue;
break;
}
dest++;
if (++src == srcLim)
break;
if ((*src & 0xF0) != 0x80)
continue;
*dest = *src;
break;
}
src++;
// p->src = src;
}
// ip = p->ip; // for marker version
/* marker was found */
/* (v) contains marker that was found:
bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]
: value of src[-2] : xx/xx/0f
bits [0 : 7] : value of src[-1] : e8/e9/8x
*/
{
num = (SizeT)(src - p->src);
if (src == srcLim)
{
#if NUM_SHIFT_BITS != 24
v &= ~(UInt32)CONV_FLAG;
#endif
// UInt32 relat = 0;
p->prevByte = src[-1];
p->bufs[BCJ2_STREAM_MAIN] = dest;
p->src = src;
p->ip += (UInt32)num;
continue;
}
{
Byte context = (Byte)(num == 0 ? p->prevByte : src[-1]);
BoolInt needConvert;
p->bufs[BCJ2_STREAM_MAIN] = dest + 1;
p->ip += (UInt32)num + 1;
src++;
needConvert = False;
if ((SizeT)(p->srcLim - src) >= 4)
{
/*
if (relat != 0 || (Byte)v != 0xe8)
BoolInt isBigOffset = True;
*/
const UInt32 relat = GetUi32(src);
/*
#define EXCLUDE_FLAG ((UInt32)1 << 4)
#define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)
if (p->relatExcludeBits != 0)
{
const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);
isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);
}
// isBigOffset = False; // for debug
*/
ip -= p->fileIp64;
// Use the following if check, if (ip) is 64-bit:
if (ip > (((v + 0x20) >> 5) & 1)) // 23.00 : we eliminate milti-block overlap for (Of 80) and (e8/e9)
if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)
if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)
v |= CONV_FLAG;
UInt32 relatVal = GetUi32(src);
if ((p->fileSize == 0 || (UInt32)(p->ip + 4 + relatVal - p->fileIp) < p->fileSize)
&& ((relatVal + p->relatLimit) >> 1) < p->relatLimit)
needConvert = True;
}
else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
{
// (p->srcLim - src < 4)
// /*
// for non-marker version
p->ip64--; // p->ip = ip - 1;
p->bufs[BCJ2_STREAM_MAIN]--;
src--;
v >>= NUM_SHIFT_BITS;
// (0 < p->srcLim - p->src <= 4)
// */
// v |= MARKER_FLAG; // for marker version
/* (p->state == BCJ2_ENC_STATE_ORIG) */
WRITE_CONTEXT_AND_SRC
return;
}
{
const unsigned c = ((v + 0x17) >> 6) & 1;
CBcj2Prob *prob = p->probs + (unsigned)
(((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
/*
((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :
((Byte)v < 0xe8 ? 0 : 1)); // ((v >> 5) & 1));
*/
const unsigned ttt = *prob;
const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;
if ((v & CONV_FLAG) == 0)
UInt32 bound;
unsigned ttt;
Byte b = src[-1];
CProb *prob = p->probs + (unsigned)(b == 0xE8 ? 2 + (unsigned)context : (b == 0xE9 ? 1 : 0));
ttt = *prob;
bound = (p->range >> kNumModelBits) * ttt;
if (!needConvert)
{
// static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);
// v = (Byte)v; // for marker version
p->range = bound;
*prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
// WRITE_CONTEXT_AND_SRC
*prob = (CProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
p->src = src;
p->prevByte = b;
continue;
}
p->low += bound;
p->range -= bound;
*prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
}
// p->context = src[3];
{
// const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);
const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
ip = p->ip64;
v = GetUi32(src); // relat
ip += 4;
p->ip64 = ip;
src += 4;
// p->src = src;
*prob = (CProb)(ttt - (ttt >> kNumMoveBits));
{
const UInt32 absol = (UInt32)ip + v;
Byte *cur = p->bufs[cj];
v >>= 24;
// WRITE_CONTEXT
if (cur == p->lims[cj])
UInt32 relatVal = GetUi32(src);
UInt32 absVal;
p->ip += 4;
absVal = p->ip + relatVal;
p->prevByte = src[3];
src += 4;
p->src = src;
{
p->state = cj;
p->tempTarget = absol;
WRITE_CONTEXT_AND_SRC
return;
unsigned cj = (b == 0xE8) ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP;
Byte *cur = p->bufs[cj];
if (cur == p->lims[cj])
{
p->state = cj;
p->tempTarget = absVal;
return;
}
SetBe32(cur, absVal);
p->bufs[cj] = cur + 4;
}
SetBe32a(cur, absol)
p->bufs[cj] = cur + 4;
}
}
}
}
} // end of loop
}
}
for (; p->flushRem != 0; p->flushRem--)
if (Bcj2_RangeEnc_ShiftLow(p))
if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
return;
for (; p->flushPos < 5; p->flushPos++)
if (RangeEnc_ShiftLow(p))
return;
p->state = BCJ2_ENC_STATE_FINISHED;
p->state = BCJ2_ENC_STATE_OK;
}
/*
The BCJ2 encoder needs a look-ahead of up to 4 bytes in the (src) buffer.
So the base function Bcj2Enc_Encode_2()
in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with
(p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim).
Bcj2Enc_Encode() solves that look-ahead problem by using the p->temp[] buffer,
so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),
then (p->src == p->srcLim).
And the caller's code is simpler with Bcj2Enc_Encode().
*/
Z7_NO_INLINE
void Bcj2Enc_Encode(CBcj2Enc *p)
{
PRF2("\n----")
PRF(printf("\n"));
PRF(printf("---- ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src));
if (p->tempPos != 0)
{
/* extra: number of bytes that were copied from (src) to (temp) buffer in this call */
unsigned extra = 0;
/* We touch only the minimal required number of bytes in the input (src) stream.
So we add input bytes from the (src) stream to temp[] in steps of 1 byte.
We don't add new bytes to temp[] before the Bcj2Enc_Encode_2() call
in the first loop iteration, because
- the previous call of Bcj2Enc_Encode() could have used another (finishMode),
- the previous call could have finished with (p->state != BCJ2_ENC_STATE_ORIG);
the case of a full temp[] buffer (p->tempPos == 4) is possible here.
*/
for (;;)
{
// (0 < p->tempPos <= 5) // in non-marker version
/* p->src : the current src data position including extra bytes
that were copied to temp[] buffer in this call */
const Byte *src = p->src;
const Byte *srcLim = p->srcLim;
const EBcj2Enc_FinishMode finishMode = p->finishMode;
if (src != srcLim)
{
/* if there are some src data after the data copied to temp[],
then we use MODE_CONTINUE for temp data */
p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
}
EBcj2Enc_FinishMode finishMode = p->finishMode;
p->src = p->temp;
p->srcLim = p->temp + p->tempPos;
PRF2(" ")
if (src != srcLim)
p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
PRF(printf(" ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src));
Bcj2Enc_Encode_2(p);
{
const unsigned num = (unsigned)(p->src - p->temp);
const unsigned tempPos = p->tempPos - num;
unsigned num = (unsigned)(p->src - p->temp);
unsigned tempPos = p->tempPos - num;
unsigned i;
p->tempPos = tempPos;
for (i = 0; i < tempPos; i++)
p->temp[i] = p->temp[(SizeT)i + num];
// tempPos : number of bytes in temp buffer
p->temp[i] = p->temp[(size_t)i + num];
p->src = src;
p->srcLim = srcLim;
p->finishMode = finishMode;
if (p->state != BCJ2_ENC_STATE_ORIG)
{
// (p->tempPos <= 4) // in non-marker version
/* if (the reason for the exit from Bcj2Enc_Encode_2()
is not BCJ2_ENC_STATE_ORIG),
then we exit from Bcj2Enc_Encode() with the same reason */
// optional code begin : we rollback (src) and tempPos, if it's possible:
if (extra >= tempPos)
extra = tempPos;
p->src = src - extra;
p->tempPos = tempPos - extra;
// optional code end : rollback of (src) and tempPos
if (p->state != BCJ2_ENC_STATE_ORIG || src == srcLim)
return;
}
/* (p->tempPos <= 4)
(p->state == BCJ2_ENC_STATE_ORIG)
so encoder needs more data than in temp[] */
if (src == srcLim)
return; // src buffer has no more input data.
/* (src != srcLim)
so we can provide more input data from src for Bcj2Enc_Encode_2() */
if (extra >= tempPos)
{
/* (extra >= tempPos) means that temp buffer contains
only data from src buffer of this call.
So now we can encode without temp buffer */
p->src = src - tempPos; // rollback (src)
p->src = src - tempPos;
p->tempPos = 0;
break;
}
// we append one additional extra byte from (src) to temp[] buffer:
p->temp[tempPos] = *src;
p->temp[tempPos] = src[0];
p->tempPos = tempPos + 1;
// (0 < p->tempPos <= 5) // in non-marker version
p->src = src + 1;
extra++;
}
}
}
PRF2("++++")
// (p->tempPos == 0)
PRF(printf("++++ ip = %8d tempPos = %8d src = %8d\n", p->ip, p->tempPos, p->srcLim - p->src));
Bcj2Enc_Encode_2(p);
PRF2("====")
if (p->state == BCJ2_ENC_STATE_ORIG)
{
const Byte *src = p->src;
const Byte *srcLim = p->srcLim;
const unsigned rem = (unsigned)(srcLim - src);
/* (rem <= 4) here.
if (p->src != p->srcLim), then
- we copy non-processed bytes from (p->src) to temp[] buffer,
- we set p->src equal to p->srcLim.
*/
if (rem)
{
unsigned i = 0;
p->src = srcLim;
p->tempPos = rem;
// (0 < p->tempPos <= 4)
do
p->temp[i] = src[i];
while (++i != rem);
}
// (p->tempPos <= 4)
// (p->src == p->srcLim)
unsigned rem = (unsigned)(p->srcLim - src);
unsigned i;
for (i = 0; i < rem; i++)
p->temp[i] = src[i];
p->tempPos = rem;
p->src = src + rem;
}
}
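/* Editor's illustrative sketch (not from the 7-Zip sources): a minimal caller
   of Bcj2Enc_Encode() for one input block. It assumes (p) was already set up
   with Bcj2Enc_Init() and that the caller allocated BCJ2_NUM_STREAMS output
   buffers; only field names that appear above are used. */
static void Bcj2Enc_EncodeBlock_Sketch(CBcj2Enc *p,
    const Byte *src, SizeT srcSize,
    Byte *bufs[BCJ2_NUM_STREAMS], Byte *lims[BCJ2_NUM_STREAMS])
{
  unsigned i;
  p->src = src;
  p->srcLim = src + srcSize;
  for (i = 0; i < BCJ2_NUM_STREAMS; i++)
  {
    p->bufs[i] = bufs[i];
    p->lims[i] = lims[i];
  }
  p->finishMode = BCJ2_ENC_FINISH_MODE_END_STREAM;
  Bcj2Enc_Encode(p);
  /* if p->state names one of the output streams, that buffer ran out of
     space and the call must be repeated with more room for it; on full
     success the encoder flushes and sets the finished state
     (BCJ2_ENC_STATE_FINISHED here, BCJ2_ENC_STATE_OK in 21.07). */
}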
#undef PRF2
#undef CONV_FLAG
#undef MARKER_FLAG
#undef WRITE_CONTEXT
#undef WRITE_CONTEXT_AND_SRC
#undef ONE_ITER
#undef NUM_SHIFT_BITS
#undef kTopValue
#undef kNumBitModelTotalBits
#undef kBitModelTotal
#undef kNumMoveBits


@ -1,104 +1,47 @@
/* Blake2.h -- BLAKE2sp Hash
2024-01-17 : Igor Pavlov : Public domain */
/* Blake2.h -- BLAKE2 Hash
2015-06-30 : Igor Pavlov : Public domain
2015 : Samuel Neves : Public domain */
#ifndef ZIP7_INC_BLAKE2_H
#define ZIP7_INC_BLAKE2_H
#ifndef __BLAKE2_H
#define __BLAKE2_H
#include "7zTypes.h"
#if 0
#include "Compiler.h"
#include "CpuArch.h"
#if defined(MY_CPU_X86_OR_AMD64)
#if defined(__SSE2__) \
|| defined(_MSC_VER) && _MSC_VER > 1200 \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
|| defined(__clang__) \
|| defined(__INTEL_COMPILER)
#include <emmintrin.h> // SSE2
#endif
#if defined(__AVX2__) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
|| defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \
|| defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
#include <immintrin.h>
#if defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#endif
#endif // avx2
#endif // MY_CPU_X86_OR_AMD64
#endif // 0
EXTERN_C_BEGIN
#define Z7_BLAKE2S_BLOCK_SIZE 64
#define Z7_BLAKE2S_DIGEST_SIZE 32
#define Z7_BLAKE2SP_PARALLEL_DEGREE 8
#define Z7_BLAKE2SP_NUM_STRUCT_WORDS 16
#define BLAKE2S_BLOCK_SIZE 64
#define BLAKE2S_DIGEST_SIZE 32
#define BLAKE2SP_PARALLEL_DEGREE 8
#if 1 || defined(Z7_BLAKE2SP_USE_FUNCTIONS)
typedef void (Z7_FASTCALL *Z7_BLAKE2SP_FUNC_COMPRESS)(UInt32 *states, const Byte *data, const Byte *end);
typedef void (Z7_FASTCALL *Z7_BLAKE2SP_FUNC_INIT)(UInt32 *states);
#endif
// CBlake2sp is required to be aligned to 32 bytes,
// because the code can use unaligned access with sse and avx256.
// but 64-byte alignment can be better.
MY_ALIGN(64)
typedef struct
{
union
{
#if 0
#if defined(MY_CPU_X86_OR_AMD64)
#if defined(__SSE2__) \
|| defined(_MSC_VER) && _MSC_VER > 1200 \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
|| defined(__clang__) \
|| defined(__INTEL_COMPILER)
__m128i _pad_align_128bit[4];
#endif // sse2
#if defined(__AVX2__) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
|| defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \
|| defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
__m256i _pad_align_256bit[2];
#endif // avx2
#endif // x86
#endif // 0
UInt32 h[8];
UInt32 t[2];
UInt32 f[2];
Byte buf[BLAKE2S_BLOCK_SIZE];
UInt32 bufPos;
UInt32 lastNode_f1;
UInt32 dummy[2]; /* for sizeof(CBlake2s) alignment */
} CBlake2s;
void * _pad_align_ptr[8];
UInt32 _pad_align_32bit[16];
struct
{
unsigned cycPos;
unsigned _pad_unused;
#if 1 || defined(Z7_BLAKE2SP_USE_FUNCTIONS)
Z7_BLAKE2SP_FUNC_COMPRESS func_Compress_Fast;
Z7_BLAKE2SP_FUNC_COMPRESS func_Compress_Single;
Z7_BLAKE2SP_FUNC_INIT func_Init;
Z7_BLAKE2SP_FUNC_INIT func_Final;
#endif
} header;
} u;
// MY_ALIGN(64)
UInt32 states[Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2SP_NUM_STRUCT_WORDS];
// MY_ALIGN(64)
UInt32 buf32[Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2SP_NUM_STRUCT_WORDS * 2];
/* You need to xor CBlake2s::h[i] with input parameter block after Blake2s_Init0() */
/*
void Blake2s_Init0(CBlake2s *p);
void Blake2s_Update(CBlake2s *p, const Byte *data, size_t size);
void Blake2s_Final(CBlake2s *p, Byte *digest);
*/
typedef struct
{
CBlake2s S[BLAKE2SP_PARALLEL_DEGREE];
unsigned bufPos;
} CBlake2sp;
BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo);
void Blake2sp_Init(CBlake2sp *p);
void Blake2sp_InitState(CBlake2sp *p);
void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size);
void Blake2sp_Final(CBlake2sp *p, Byte *digest);
void z7_Black2sp_Prepare(void);
EXTERN_C_END
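/* Editor's illustrative sketch (not from the 7-Zip sources): one-shot BLAKE2sp
   hashing with the API declared above; the 32-byte digest size matches
   BLAKE2S_DIGEST_SIZE / Z7_BLAKE2S_DIGEST_SIZE in the two versions. */
static void Blake2sp_Hash_Sketch(const Byte *data, size_t size, Byte digest[32])
{
  CBlake2sp p;
  Blake2sp_Init(&p);
  Blake2sp_Update(&p, data, size);
  Blake2sp_Final(&p, digest);
}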

File diff suppressed because it is too large

C/Bra.c

@ -1,709 +1,230 @@
/* Bra.c -- Branch converters for RISC code
2024-01-20 : Igor Pavlov : Public domain */
/* Bra.c -- Converters for RISC code
2021-02-09 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Bra.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#include "Bra.h"
#if defined(MY_CPU_SIZEOF_POINTER) \
&& ( MY_CPU_SIZEOF_POINTER == 4 \
|| MY_CPU_SIZEOF_POINTER == 8)
#define BR_CONV_USE_OPT_PC_PTR
#endif
#ifdef BR_CONV_USE_OPT_PC_PTR
#define BR_PC_INIT pc -= (UInt32)(SizeT)p;
#define BR_PC_GET (pc + (UInt32)(SizeT)p)
#else
#define BR_PC_INIT pc += (UInt32)size;
#define BR_PC_GET (pc - (UInt32)(SizeT)(lim - p))
// #define BR_PC_INIT
// #define BR_PC_GET (pc + (UInt32)(SizeT)(p - data))
#endif
#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c;
// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c;
#define Z7_BRANCH_CONV(name) z7_ ## name
#define Z7_BRANCH_FUNC_MAIN(name) \
static \
Z7_FORCE_INLINE \
Z7_ATTRIB_NO_VECTOR \
Byte *Z7_BRANCH_CONV(name)(Byte *p, SizeT size, UInt32 pc, int encoding)
#define Z7_BRANCH_FUNC_IMP(name, m, encoding) \
Z7_NO_INLINE \
Z7_ATTRIB_NO_VECTOR \
Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \
{ return Z7_BRANCH_CONV(name)(data, size, pc, encoding); } \
#ifdef Z7_EXTRACT_ONLY
#define Z7_BRANCH_FUNCS_IMP(name) \
Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0)
#else
#define Z7_BRANCH_FUNCS_IMP(name) \
Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) \
Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC_2, 1)
#endif
#if defined(__clang__)
#define BR_EXTERNAL_FOR
#define BR_NEXT_ITERATION continue;
#else
#define BR_EXTERNAL_FOR for (;;)
#define BR_NEXT_ITERATION break;
#endif
#if defined(__clang__) && (__clang_major__ >= 8) \
|| defined(__GNUC__) && (__GNUC__ >= 1000) \
// GCC is not good for __builtin_expect() here
/* || defined(_MSC_VER) && (_MSC_VER >= 1920) */
// #define Z7_unlikely [[unlikely]]
// #define Z7_LIKELY(x) (__builtin_expect((x), 1))
#define Z7_UNLIKELY(x) (__builtin_expect((x), 0))
// #define Z7_likely [[likely]]
#else
// #define Z7_LIKELY(x) (x)
#define Z7_UNLIKELY(x) (x)
// #define Z7_likely
#endif
Z7_BRANCH_FUNC_MAIN(BranchConv_ARM64)
SizeT ARM_Convert(Byte *data, SizeT size, UInt32 ip, int encoding)
{
// Byte *p = data;
Byte *p;
const Byte *lim;
const UInt32 flag = (UInt32)1 << (24 - 4);
const UInt32 mask = ((UInt32)1 << 24) - (flag << 1);
size &= ~(SizeT)3;
// if (size == 0) return p;
lim = p + size;
BR_PC_INIT
pc -= 4; // because (p) will point to next instruction
BR_EXTERNAL_FOR
{
// Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
for (;;)
{
UInt32 v;
if Z7_UNLIKELY(p == lim)
return p;
v = GetUi32a(p);
p += 4;
if Z7_UNLIKELY(((v - 0x94000000) & 0xfc000000) == 0)
{
UInt32 c = BR_PC_GET >> 2;
BR_CONVERT_VAL(v, c)
v &= 0x03ffffff;
v |= 0x94000000;
SetUi32a(p - 4, v)
BR_NEXT_ITERATION
}
// v = rotlFixed(v, 8); v += (flag << 8) - 0x90; if Z7_UNLIKELY((v & ((mask << 8) + 0x9f)) == 0)
v -= 0x90000000; if Z7_UNLIKELY((v & 0x9f000000) == 0)
{
UInt32 z, c;
// v = rotrFixed(v, 8);
v += flag; if Z7_UNLIKELY(v & mask) continue;
z = (v & 0xffffffe0) | (v >> 26);
c = (BR_PC_GET >> (12 - 3)) & ~(UInt32)7;
BR_CONVERT_VAL(z, c)
v &= 0x1f;
v |= 0x90000000;
v |= z << 26;
v |= 0x00ffffe0 & ((z & (((flag << 1) - 1))) - flag);
SetUi32a(p - 4, v)
}
}
}
}
Z7_BRANCH_FUNCS_IMP(BranchConv_ARM64)
size &= ~(size_t)3;
ip += 4;
p = data;
lim = data + size;
if (encoding)
Z7_BRANCH_FUNC_MAIN(BranchConv_ARM)
{
// Byte *p = data;
const Byte *lim;
size &= ~(SizeT)3;
lim = p + size;
BR_PC_INIT
/* in ARM: the branch offset is relative to the address two instructions
past the current instruction; (p) will point to the next instruction */
pc += 8 - 4;
for (;;)
{
for (;;)
{
if Z7_UNLIKELY(p >= lim) { return p; } p += 4; if Z7_UNLIKELY(p[-1] == 0xeb) break;
if Z7_UNLIKELY(p >= lim) { return p; } p += 4; if Z7_UNLIKELY(p[-1] == 0xeb) break;
}
{
UInt32 v = GetUi32a(p - 4);
UInt32 c = BR_PC_GET >> 2;
BR_CONVERT_VAL(v, c)
v &= 0x00ffffff;
v |= 0xeb000000;
SetUi32a(p - 4, v)
}
}
}
Z7_BRANCH_FUNCS_IMP(BranchConv_ARM)
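/* Worked example (editor's note, not from the sources): an ARM BL word
   0xEB000100 at file offset 0x1000 (with ip == 0) encodes imm24 = 0x100,
   i.e. a relative branch of 0x400 bytes from (offset + 8), so the target is
   0x1408. Encoding computes abs24 = imm24 + (ip + offset + 8) / 4
   = 0x100 + 0x402 = 0x502, and 0x502 * 4 == 0x1408. Two calls to the same
   target from different places thus produce identical words, which is what
   makes the filtered data more compressible. */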
Z7_BRANCH_FUNC_MAIN(BranchConv_PPC)
{
// Byte *p = data;
const Byte *lim;
size &= ~(SizeT)3;
lim = p + size;
BR_PC_INIT
pc -= 4; // because (p) will point to next instruction
for (;;)
{
UInt32 v;
for (;;)
{
if Z7_UNLIKELY(p == lim)
return p;
// v = GetBe32a(p);
v = *(UInt32 *)(void *)p;
if (p >= lim)
return (SizeT)(p - data);
p += 4;
// if ((v & 0xfc000003) == 0x48000001) break;
// if ((p[-4] & 0xFC) == 0x48 && (p[-1] & 3) == 1) break;
if Z7_UNLIKELY(
((v - Z7_CONV_BE_TO_NATIVE_CONST32(0x48000001))
& Z7_CONV_BE_TO_NATIVE_CONST32(0xfc000003)) == 0) break;
}
{
v = Z7_CONV_NATIVE_TO_BE_32(v);
{
UInt32 c = BR_PC_GET;
BR_CONVERT_VAL(v, c)
}
v &= 0x03ffffff;
v |= 0x48000000;
SetBe32a(p - 4, v)
}
}
}
Z7_BRANCH_FUNCS_IMP(BranchConv_PPC)
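/* Worked example (editor's note, not from the sources): the PPC word
   0x48000041 is "bl" with LK = 1 and a relative byte offset of 0x40 in
   bits [2:25]. At instruction offset 0x100 (ip == 0) the encoder adds the
   byte position: 0x41 + 0x100 = 0x141, giving 0x48000141, which stores the
   absolute target 0x140 = 0x100 + 0x40; the low LK bit survives the addition
   because both offsets are multiples of 4. */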
#ifdef Z7_CPU_FAST_ROTATE_SUPPORTED
#define BR_SPARC_USE_ROTATE
#endif
Z7_BRANCH_FUNC_MAIN(BranchConv_SPARC)
{
// Byte *p = data;
const Byte *lim;
const UInt32 flag = (UInt32)1 << 22;
size &= ~(SizeT)3;
lim = p + size;
BR_PC_INIT
pc -= 4; // because (p) will point to next instruction
for (;;)
{
UInt32 v;
for (;;)
{
if Z7_UNLIKELY(p == lim)
return p;
/* // the code without GetBe32a():
{ const UInt32 v = GetUi16a(p) & 0xc0ff; p += 4; if (v == 0x40 || v == 0xc07f) break; }
*/
v = GetBe32a(p);
p += 4;
#ifdef BR_SPARC_USE_ROTATE
v = rotlFixed(v, 2);
v += (flag << 2) - 1;
if Z7_UNLIKELY((v & (3 - (flag << 3))) == 0)
#else
v += (UInt32)5 << 29;
v ^= (UInt32)7 << 29;
v += flag;
if Z7_UNLIKELY((v & (0 - (flag << 1))) == 0)
#endif
if (p[-1] == 0xEB)
break;
}
{
// UInt32 v = GetBe32a(p - 4);
#ifndef BR_SPARC_USE_ROTATE
UInt32 v = GetUi32(p - 4);
v <<= 2;
#endif
{
UInt32 c = BR_PC_GET;
BR_CONVERT_VAL(v, c)
}
v &= (flag << 3) - 1;
#ifdef BR_SPARC_USE_ROTATE
v -= (flag << 2) - 1;
v = rotrFixed(v, 2);
#else
v -= (flag << 2);
v += ip + (UInt32)(p - data);
v >>= 2;
v |= (UInt32)1 << 30;
#endif
SetBe32a(p - 4, v)
v &= 0x00FFFFFF;
v |= 0xEB000000;
SetUi32(p - 4, v);
}
}
for (;;)
{
for (;;)
{
if (p >= lim)
return (SizeT)(p - data);
p += 4;
if (p[-1] == 0xEB)
break;
}
{
UInt32 v = GetUi32(p - 4);
v <<= 2;
v -= ip + (UInt32)(p - data);
v >>= 2;
v &= 0x00FFFFFF;
v |= 0xEB000000;
SetUi32(p - 4, v);
}
}
}
Z7_BRANCH_FUNCS_IMP(BranchConv_SPARC)
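/* Worked example (editor's note, not from the sources): a SPARC call word
   0x40000010 (disp30 = 0x10 words = 0x40 bytes) at byte offset 0x200
   (ip == 0) encodes to 0x40000090: the displacement is shifted to bytes
   (0x40), the instruction position is added (0x240), and the sign-folding
   steps above pack the absolute word address 0x240 / 4 = 0x90 back into the
   low bits while keeping the 0x40xxxxxx call pattern, so the decoder can
   re-detect and invert the conversion. */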
Z7_BRANCH_FUNC_MAIN(BranchConv_ARMT)
SizeT ARMT_Convert(Byte *data, SizeT size, UInt32 ip, int encoding)
{
// Byte *p = data;
Byte *lim;
size &= ~(SizeT)1;
// if (size == 0) return p;
if (size <= 2) return p;
size -= 2;
lim = p + size;
BR_PC_INIT
/* in ARM (Thumb): the branch offset is relative to the address two
instructions ahead; (p) will point two instructions past the current one */
// pc += 4 - 4;
// if (encoding) pc -= 0xf800 << 1; else pc += 0xf800 << 1;
// #define ARMT_TAIL_PROC { goto armt_tail; }
#define ARMT_TAIL_PROC { return p; }
Byte *p;
const Byte *lim;
size &= ~(size_t)1;
p = data;
lim = data + size - 4;
if (encoding)
do
for (;;)
{
/* in MSVC 32-bit x86 compilers:
UInt32 version : it loads value from memory with movzx
Byte version : it loads value to 8-bit register (AL/CL)
movzx version is slightly faster in some cpus
*/
unsigned b1;
// Byte / unsigned
b1 = p[1];
// optimized version to reduce one (p >= lim) check:
// unsigned a1 = p[1]; b1 = p[3]; p += 2; if Z7_LIKELY((b1 & (a1 ^ 8)) < 0xf8)
UInt32 b1;
for (;;)
{
unsigned b3; // Byte / UInt32
/* (Byte)(b3) normalization can use low byte computations in MSVC.
It gives smaller code, and no loss of speed in some compilers/cpus.
But new MSVC 32-bit x86 compilers use more slow load
from memory to low byte register in that case.
So we try to use full 32-bit computations for faster code.
*/
// if (p >= lim) { ARMT_TAIL_PROC } b3 = b1 + 8; b1 = p[3]; p += 2; if ((b3 & b1) >= 0xf8) break;
if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC } b3 = p[3]; p += 2; if Z7_UNLIKELY((b3 & (b1 ^ 8)) >= 0xf8) break;
if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC } b1 = p[3]; p += 2; if Z7_UNLIKELY((b1 & (b3 ^ 8)) >= 0xf8) break;
UInt32 b3;
if (p > lim)
return (SizeT)(p - data);
b1 = p[1];
b3 = p[3];
p += 2;
b1 ^= 8;
if ((b3 & b1) >= 0xF8)
break;
}
{
/* we can adjust pc for (0xf800) to get rid of the (& 0x7FF) operation.
But gcc/clang for arm64 can use the bfi instruction for full code here */
UInt32 v =
((UInt32)GetUi16a(p - 2) << 11) |
((UInt32)GetUi16a(p) & 0x7FF);
/*
UInt32 v =
((UInt32)p[1 - 2] << 19)
((UInt32)b1 << 19)
+ (((UInt32)p[1] & 0x7) << 8)
+ (((UInt32)p[-2] << 11))
+ (p[0]);
*/
p += 2;
{
UInt32 c = BR_PC_GET >> 1;
BR_CONVERT_VAL(v, c)
UInt32 cur = (ip + (UInt32)(p - data)) >> 1;
v += cur;
}
SetUi16a(p - 4, (UInt16)(((v >> 11) & 0x7ff) | 0xf000))
SetUi16a(p - 2, (UInt16)(v | 0xf800))
/*
p[-4] = (Byte)(v >> 11);
p[-3] = (Byte)(0xf0 | ((v >> 19) & 0x7));
p[-3] = (Byte)(0xF0 | ((v >> 19) & 0x7));
p[-2] = (Byte)v;
p[-1] = (Byte)(0xf8 | (v >> 8));
*/
p[-1] = (Byte)(0xF8 | (v >> 8));
}
}
while (p < lim);
return p;
// armt_tail:
// if ((Byte)((lim[1] & 0xf8)) != 0xf0) { lim += 2; } return lim;
// return (Byte *)(lim + ((Byte)((lim[1] ^ 0xf0) & 0xf8) == 0 ? 0 : 2));
// return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2));
// return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2));
}
Z7_BRANCH_FUNCS_IMP(BranchConv_ARMT)
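/* Worked example (editor's note, not from the sources): a Thumb BL is the
   halfword pair (0xF000 | hi11, 0xF800 | lo11), holding the 22-bit halfword
   displacement ((hi11 << 11) | lo11) relative to the address of the next
   instruction. The code above reassembles that displacement into (v) and
   adds (ip + offset_of_next_instruction) / 2, turning it into an absolute
   halfword address; decoding subtracts the same quantity. */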
// #define BR_IA64_NO_INLINE
Z7_BRANCH_FUNC_MAIN(BranchConv_IA64)
{
// Byte *p = data;
const Byte *lim;
size &= ~(SizeT)15;
lim = p + size;
pc -= 1 << 4;
pc >>= 4 - 1;
// pc -= 1 << 1;
for (;;)
{
unsigned m;
UInt32 b1;
for (;;)
{
if Z7_UNLIKELY(p == lim)
return p;
m = (unsigned)((UInt32)0x334b0000 >> (*p & 0x1e));
p += 16;
pc += 1 << 1;
if (m &= 3)
UInt32 b3;
if (p > lim)
return (SizeT)(p - data);
b1 = p[1];
b3 = p[3];
p += 2;
b1 ^= 8;
if ((b3 & b1) >= 0xF8)
break;
}
{
p += (ptrdiff_t)m * 5 - 20; // negative value is expected here.
do
UInt32 v =
((UInt32)b1 << 19)
+ (((UInt32)p[1] & 0x7) << 8)
+ (((UInt32)p[-2] << 11))
+ (p[0]);
p += 2;
{
const UInt32 t =
#if defined(MY_CPU_X86_OR_AMD64)
// we use 32-bit load here to reduce code size on x86:
GetUi32(p);
#else
GetUi16(p);
#endif
UInt32 z = GetUi32(p + 1) >> m;
p += 5;
if (((t >> m) & (0x70 << 1)) == 0
&& ((z - (0x5000000 << 1)) & (0xf000000 << 1)) == 0)
{
UInt32 v = (UInt32)((0x8fffff << 1) | 1) & z;
z ^= v;
#ifdef BR_IA64_NO_INLINE
v |= (v & ((UInt32)1 << (23 + 1))) >> 3;
{
UInt32 c = pc;
BR_CONVERT_VAL(v, c)
}
v &= (0x1fffff << 1) | 1;
#else
{
if (encoding)
{
// pc &= ~(0xc00000 << 1); // we just need to clear at least 2 bits
pc &= (0x1fffff << 1) | 1;
v += pc;
}
else
{
// pc |= 0xc00000 << 1; // we need to set at least 2 bits
pc |= ~(UInt32)((0x1fffff << 1) | 1);
v -= pc;
}
}
v &= ~(UInt32)(0x600000 << 1);
#endif
v += (0x700000 << 1);
v &= (0x8fffff << 1) | 1;
z |= v;
z <<= m;
SetUi32(p + 1 - 5, z)
}
m++;
UInt32 cur = (ip + (UInt32)(p - data)) >> 1;
v -= cur;
}
while (m &= 3); // while (m < 4);
/*
SetUi16(p - 4, (UInt16)(((v >> 11) & 0x7FF) | 0xF000));
SetUi16(p - 2, (UInt16)(v | 0xF800));
*/
p[-4] = (Byte)(v >> 11);
p[-3] = (Byte)(0xF0 | ((v >> 19) & 0x7));
p[-2] = (Byte)v;
p[-1] = (Byte)(0xF8 | (v >> 8));
}
}
}
Z7_BRANCH_FUNCS_IMP(BranchConv_IA64)
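/* Editor's illustrative sketch (not from the sources): by this reading of
   the code, the constant 0x334B0000 used above is a packed table of sixteen
   2-bit entries indexed by the bundle template (the low 5 bits of the
   bundle's first byte, taken as an even shift count). A nonzero entry is the
   first of the three 41-bit instruction slots that may hold a convertible
   branch; zero means the bundle is skipped. */
static unsigned Ia64_FirstBranchSlot_Sketch(Byte templByte)
{
  return ((UInt32)0x334B0000 >> (templByte & 0x1E)) & 3;
}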
#define BR_CONVERT_VAL_ENC(v) v += BR_PC_GET;
#define BR_CONVERT_VAL_DEC(v) v -= BR_PC_GET;
#if 1 && defined(MY_CPU_LE_UNALIGN)
#define RISCV_USE_UNALIGNED_LOAD
#endif
#ifdef RISCV_USE_UNALIGNED_LOAD
#define RISCV_GET_UI32(p) GetUi32(p)
#define RISCV_SET_UI32(p, v) { SetUi32(p, v) }
#else
#define RISCV_GET_UI32(p) \
((UInt32)GetUi16a(p) + \
((UInt32)GetUi16a((p) + 2) << 16))
#define RISCV_SET_UI32(p, v) { \
SetUi16a(p, (UInt16)(v)) \
SetUi16a((p) + 2, (UInt16)(v >> 16)) }
#endif
#if 1 && defined(MY_CPU_LE)
#define RISCV_USE_16BIT_LOAD
#endif
#ifdef RISCV_USE_16BIT_LOAD
#define RISCV_LOAD_VAL(p) GetUi16a(p)
#else
#define RISCV_LOAD_VAL(p) (*(p))
#endif
#define RISCV_INSTR_SIZE 2
#define RISCV_STEP_1 (4 + RISCV_INSTR_SIZE)
#define RISCV_STEP_2 4
#define RISCV_REG_VAL (2 << 7)
#define RISCV_CMD_VAL 3
#if 1
// for code size optimization:
#define RISCV_DELTA_7F 0x7f
#else
#define RISCV_DELTA_7F 0
#endif
#define RISCV_CHECK_1(v, b) \
(((((b) - RISCV_CMD_VAL) ^ ((v) << 8)) & (0xf8000 + RISCV_CMD_VAL)) == 0)
#if 1
#define RISCV_CHECK_2(v, r) \
((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL | 8)) \
<< 18) \
< ((r) & 0x1d))
#else
// this branch gives larger code, because
// compilers generate larger code for big constants.
#define RISCV_CHECK_2(v, r) \
((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \
& ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \
< ((r) & 0x1d))
#endif
#define RISCV_SCAN_LOOP \
Byte *lim; \
size &= ~(SizeT)(RISCV_INSTR_SIZE - 1); \
if (size <= 6) return p; \
size -= 6; \
lim = p + size; \
BR_PC_INIT \
for (;;) \
{ \
UInt32 a, v; \
/* Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE */ \
for (;;) \
{ \
if Z7_UNLIKELY(p >= lim) { return p; } \
a = (RISCV_LOAD_VAL(p) ^ 0x10u) + 1; \
if ((a & 0x77) == 0) break; \
a = (RISCV_LOAD_VAL(p + RISCV_INSTR_SIZE) ^ 0x10u) + 1; \
p += RISCV_INSTR_SIZE * 2; \
if ((a & 0x77) == 0) \
{ \
p -= RISCV_INSTR_SIZE; \
if Z7_UNLIKELY(p >= lim) { return p; } \
break; \
} \
}
// (xx6f ^ 10) + 1 = xx7f + 1 = xx80 : JAL
// (xxef ^ 10) + 1 = xxff + 1 = xx00 + 100 : JAL
// (xx17 ^ 10) + 1 = xx07 + 1 = xx08 : AUIPC
// (xx97 ^ 10) + 1 = xx87 + 1 = xx88 : AUIPC
Byte * Z7_BRANCH_CONV_ENC(RISCV)(Byte *p, SizeT size, UInt32 pc)
SizeT PPC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding)
{
RISCV_SCAN_LOOP
v = a;
a = RISCV_GET_UI32(p);
#ifndef RISCV_USE_16BIT_LOAD
v += (UInt32)p[1] << 8;
#endif
Byte *p;
const Byte *lim;
size &= ~(size_t)3;
ip -= 4;
p = data;
lim = data + size;
if ((v & 8) == 0) // JAL
for (;;)
{
for (;;)
{
if ((v - (0x100 /* - RISCV_DELTA_7F */)) & 0xd80)
{
p += RISCV_INSTR_SIZE;
continue;
}
{
v = ((a & 1u << 31) >> 11)
| ((a & 0x3ff << 21) >> 20)
| ((a & 1 << 20) >> 9)
| (a & 0xff << 12);
BR_CONVERT_VAL_ENC(v)
// ((v & 1) == 0)
// v: bits [1 : 20] contain offset bits
#if 0 && defined(RISCV_USE_UNALIGNED_LOAD)
a &= 0xfff;
a |= ((UInt32)(v << 23))
| ((UInt32)(v << 7) & ((UInt32)0xff << 16))
| ((UInt32)(v >> 5) & ((UInt32)0xf0 << 8));
RISCV_SET_UI32(p, a)
#else // aligned
#if 0
SetUi16a(p, (UInt16)(((v >> 5) & 0xf000) | (a & 0xfff)))
#else
p[1] = (Byte)(((v >> 13) & 0xf0) | ((a >> 8) & 0xf));
#endif
#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
v <<= 15;
v = Z7_BSWAP32(v);
SetUi16a(p + 2, (UInt16)v)
#else
p[2] = (Byte)(v >> 9);
p[3] = (Byte)(v >> 1);
#endif
#endif // aligned
}
if (p >= lim)
return (SizeT)(p - data);
p += 4;
continue;
} // JAL
{
// AUIPC
if (v & 0xe80) // (not x0) and (not x2)
{
const UInt32 b = RISCV_GET_UI32(p + 4);
if (RISCV_CHECK_1(v, b))
{
{
const UInt32 temp = (b << 12) | (0x17 + RISCV_REG_VAL);
RISCV_SET_UI32(p, temp)
}
a &= 0xfffff000;
{
#if 1
const int t = -1 >> 1;
if (t != -1)
a += (b >> 20) - ((b >> 19) & 0x1000); // arithmetic right shift emulation
else
#endif
a += (UInt32)((Int32)b >> 20); // arithmetic right shift (sign-extension).
}
BR_CONVERT_VAL_ENC(a)
#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
a = Z7_BSWAP32(a);
RISCV_SET_UI32(p + 4, a)
#else
SetBe32(p + 4, a)
#endif
p += 8;
}
else
p += RISCV_STEP_1;
}
else
{
UInt32 r = a >> 27;
if (RISCV_CHECK_2(v, r))
{
v = RISCV_GET_UI32(p + 4);
r = (r << 7) + 0x17 + (v & 0xfffff000);
a = (a >> 12) | (v << 20);
RISCV_SET_UI32(p, r)
RISCV_SET_UI32(p + 4, a)
p += 8;
}
else
p += RISCV_STEP_2;
}
/* if ((v & 0xFC000003) == 0x48000001) */
if ((p[-4] & 0xFC) == 0x48 && (p[-1] & 3) == 1)
break;
}
} // for
{
UInt32 v = GetBe32(p - 4);
if (encoding)
v += ip + (UInt32)(p - data);
else
v -= ip + (UInt32)(p - data);
v &= 0x03FFFFFF;
v |= 0x48000000;
SetBe32(p - 4, v);
}
}
}
Byte * Z7_BRANCH_CONV_DEC(RISCV)(Byte *p, SizeT size, UInt32 pc)
SizeT SPARC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding)
{
RISCV_SCAN_LOOP
#ifdef RISCV_USE_16BIT_LOAD
if ((a & 8) == 0)
{
#else
v = a;
a += (UInt32)p[1] << 8;
if ((v & 8) == 0)
{
#endif
// JAL
a -= 0x100 - RISCV_DELTA_7F;
if (a & 0xd80)
{
p += RISCV_INSTR_SIZE;
continue;
}
{
const UInt32 a_old = (a + (0xef - RISCV_DELTA_7F)) & 0xfff;
#if 0 // unaligned
a = GetUi32(p);
v = (UInt32)(a >> 23) & ((UInt32)0xff << 1)
| (UInt32)(a >> 7) & ((UInt32)0xff << 9)
#elif 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
v = GetUi16a(p + 2);
v = Z7_BSWAP32(v) >> 15
#else
v = (UInt32)p[3] << 1
| (UInt32)p[2] << 9
#endif
| (UInt32)((a & 0xf000) << 5);
BR_CONVERT_VAL_DEC(v)
a = a_old
| (v << 11 & 1u << 31)
| (v << 20 & 0x3ff << 21)
| (v << 9 & 1 << 20)
| (v & 0xff << 12);
RISCV_SET_UI32(p, a)
}
p += 4;
continue;
} // JAL
Byte *p;
const Byte *lim;
size &= ~(size_t)3;
ip -= 4;
p = data;
lim = data + size;
for (;;)
{
for (;;)
{
// AUIPC
v = a;
#if 1 && defined(RISCV_USE_UNALIGNED_LOAD)
a = GetUi32(p);
#else
a |= (UInt32)GetUi16a(p + 2) << 16;
#endif
if ((v & 0xe80) == 0) // x0/x2
{
const UInt32 r = a >> 27;
if (RISCV_CHECK_2(v, r))
{
UInt32 b;
#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
b = RISCV_GET_UI32(p + 4);
b = Z7_BSWAP32(b);
#else
b = GetBe32(p + 4);
#endif
v = a >> 12;
BR_CONVERT_VAL_DEC(b)
a = (r << 7) + 0x17;
a += (b + 0x800) & 0xfffff000;
v |= b << 20;
RISCV_SET_UI32(p, a)
RISCV_SET_UI32(p + 4, v)
p += 8;
}
else
p += RISCV_STEP_2;
}
else
{
const UInt32 b = RISCV_GET_UI32(p + 4);
if (!RISCV_CHECK_1(v, b))
p += RISCV_STEP_1;
else
{
v = (a & 0xfffff000) | (b >> 20);
a = (b << 12) | (0x17 + RISCV_REG_VAL);
RISCV_SET_UI32(p, a)
RISCV_SET_UI32(p + 4, v)
p += 8;
}
}
if (p >= lim)
return (SizeT)(p - data);
/*
v = GetBe32(p);
p += 4;
m = v + ((UInt32)5 << 29);
m ^= (UInt32)7 << 29;
m += (UInt32)1 << 22;
if ((m & ((UInt32)0x1FF << 23)) == 0)
break;
*/
p += 4;
if ((p[-4] == 0x40 && (p[-3] & 0xC0) == 0) ||
(p[-4] == 0x7F && (p[-3] >= 0xC0)))
break;
}
} // for
{
UInt32 v = GetBe32(p - 4);
v <<= 2;
if (encoding)
v += ip + (UInt32)(p - data);
else
v -= ip + (UInt32)(p - data);
v &= 0x01FFFFFF;
v -= (UInt32)1 << 24;
v ^= 0xFF000000;
v >>= 2;
v |= 0x40000000;
SetBe32(p - 4, v);
}
}
}
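/* Editor's illustrative sketch (not from the sources): the shift cascade in
   the JAL paths above undoes the RISC-V J-type immediate permutation. A JAL
   word stores its 21-bit offset as imm[20|10:1|11|19:12] in bits
   31|30:21|20|19:12; this helper reassembles it as a plain (even) byte
   offset, which the converter then adds pc to (encode) or subtracts pc from
   (decode). */
static UInt32 RiscV_JalOffset_Sketch(UInt32 insn)
{
  return ((insn & ((UInt32)1 << 31)) >> 11)
       | ((insn & ((UInt32)0x3ff << 21)) >> 20)
       | ((insn & ((UInt32)1 << 20)) >> 9)
       |  (insn & ((UInt32)0xff << 12));
}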

C/Bra.h

@ -1,105 +1,64 @@
/* Bra.h -- Branch converters for executables
2024-01-20 : Igor Pavlov : Public domain */
2013-01-18 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_BRA_H
#define ZIP7_INC_BRA_H
#ifndef __BRA_H
#define __BRA_H
#include "7zTypes.h"
EXTERN_C_BEGIN
/* #define PPC BAD_PPC_11 // for debug */
#define Z7_BRANCH_CONV_DEC_2(name) z7_ ## name ## _Dec
#define Z7_BRANCH_CONV_ENC_2(name) z7_ ## name ## _Enc
#define Z7_BRANCH_CONV_DEC(name) Z7_BRANCH_CONV_DEC_2(BranchConv_ ## name)
#define Z7_BRANCH_CONV_ENC(name) Z7_BRANCH_CONV_ENC_2(BranchConv_ ## name)
#define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec
#define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc
#define Z7_BRANCH_CONV_DECL(name) Byte * name(Byte *data, SizeT size, UInt32 pc)
#define Z7_BRANCH_CONV_ST_DECL(name) Byte * name(Byte *data, SizeT size, UInt32 pc, UInt32 *state)
typedef Z7_BRANCH_CONV_DECL( (*z7_Func_BranchConv));
typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt));
#define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0
Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_DEC(X86));
Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_ENC(X86));
#define Z7_BRANCH_FUNCS_DECL(name) \
Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_DEC_2(name)); \
Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_ENC_2(name));
Z7_BRANCH_FUNCS_DECL (BranchConv_ARM64)
Z7_BRANCH_FUNCS_DECL (BranchConv_ARM)
Z7_BRANCH_FUNCS_DECL (BranchConv_ARMT)
Z7_BRANCH_FUNCS_DECL (BranchConv_PPC)
Z7_BRANCH_FUNCS_DECL (BranchConv_SPARC)
Z7_BRANCH_FUNCS_DECL (BranchConv_IA64)
Z7_BRANCH_FUNCS_DECL (BranchConv_RISCV)
/*
These functions convert data that contain CPU instructions.
Each such function converts relative addresses to absolute addresses in some
branch instructions: CALL (in all converters) and JUMP (X86 converter only).
Such conversion increases the compression ratio when that data is compressed.
There are 2 types of converters:
Byte * Conv_RISC (Byte *data, SizeT size, UInt32 pc);
Byte * ConvSt_X86(Byte *data, SizeT size, UInt32 pc, UInt32 *state);
Each Converter supports 2 versions: one for encoding
and one for decoding (_Enc/_Dec postfixes in function name).
In params:
data : data buffer
size : size of data
pc : current virtual Program Counter (Instruction Pointer) value
In/Out param:
state : pointer to state variable (for X86 converter only)
Return:
The pointer to the position in the (data) buffer after the last byte that was processed.
If the caller calls the converter again, it must start from that position.
But the caller is allowed to move the data in the buffer, in which case the
pointer to the current processed position changes accordingly for the next call.
The caller must also increase its (pc) value for the next call.
These functions convert relative addresses to absolute addresses
in CALL instructions to increase the compression ratio.
In:
data - data buffer
size - size of data
ip - current virtual Instruction Pointer (IP) value
state - state variable for x86 converter
encoding - 0 (for decoding), 1 (for encoding)
Out:
state - state variable for x86 converter
Returns:
The number of processed bytes. If you call these functions with multiple calls,
you must start the next call at the first byte after the block of processed bytes.
Each converter has some characteristics: Endian, Alignment, LookAhead.
Type   Endian  Alignment  LookAhead
X86    little      1          4
x86    little      1          4
ARMT   little      2          2
RISCV  little      2          6
ARM    little      4          0
ARM64  little      4          0
PPC    big         4          0
SPARC  big         4          0
IA64   little     16          0
(data) must be aligned for (Alignment).
processed size can be calculated as:
SizeT processed = Conv(data, size, pc) - data;
if (processed == 0)
it means that converter needs more data for processing.
If (size < Alignment + LookAhead)
then (processed == 0) is allowed.
size must be >= Alignment + LookAhead, if it's not last block.
If (size < Alignment + LookAhead), converter returns 0.
Example code for conversion in loop:
UInt32 pc = 0;
size = 0;
for (;;)
{
size += Load_more_input_data(data + size);
SizeT processed = Conv(data, size, pc) - data;
if (processed == 0 && no_more_input_data_after_size)
break; // we stop convert loop
data += processed;
size -= processed;
pc += processed;
}
Example:
UInt32 ip = 0;
for (;;)
{
; size must be >= Alignment + LookAhead, if it's not last block
SizeT processed = Convert(data, size, ip, 1);
data += processed;
size -= processed;
ip += processed;
}
*/
#define x86_Convert_Init(state) { state = 0; }
SizeT x86_Convert(Byte *data, SizeT size, UInt32 ip, UInt32 *state, int encoding);
SizeT ARM_Convert(Byte *data, SizeT size, UInt32 ip, int encoding);
SizeT ARMT_Convert(Byte *data, SizeT size, UInt32 ip, int encoding);
SizeT PPC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding);
SizeT SPARC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding);
SizeT IA64_Convert(Byte *data, SizeT size, UInt32 ip, int encoding);
EXTERN_C_END
#endif
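/* Editor's illustrative sketch (not from the sources): the 21.07-era calling
   pattern for the x86 filter declared above, following the loop from the
   comment; a single whole-buffer conversion needs no loop at all. */
static void Bcj_X86_EncodeWholeBuffer_Sketch(Byte *data, SizeT size)
{
  UInt32 state;
  x86_Convert_Init(state)
  x86_Convert(data, size, 0 /* ip */, &state, 1 /* encoding */);
}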

C/Bra86.c

@ -1,187 +1,82 @@
/* Bra86.c -- Branch converter for X86 code (BCJ)
2023-04-02 : Igor Pavlov : Public domain */
/* Bra86.c -- Converter for x86 code (BCJ)
2021-02-09 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Bra.h"
#include "CpuArch.h"
#define Test86MSByte(b) ((((b) + 1) & 0xFE) == 0) /* true for 0x00 and 0xFF: the sign-extension bytes of a short-range call/jump target */
#if defined(MY_CPU_SIZEOF_POINTER) \
&& ( MY_CPU_SIZEOF_POINTER == 4 \
|| MY_CPU_SIZEOF_POINTER == 8)
#define BR_CONV_USE_OPT_PC_PTR
#endif
#ifdef BR_CONV_USE_OPT_PC_PTR
#define BR_PC_INIT pc -= (UInt32)(SizeT)p; // (MY_uintptr_t)
#define BR_PC_GET (pc + (UInt32)(SizeT)p)
#else
#define BR_PC_INIT pc += (UInt32)size;
#define BR_PC_GET (pc - (UInt32)(SizeT)(lim - p))
// #define BR_PC_INIT
// #define BR_PC_GET (pc + (UInt32)(SizeT)(p - data))
#endif
#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c;
// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c;
#define Z7_BRANCH_CONV_ST(name) z7_BranchConvSt_ ## name
#define BR86_NEED_CONV_FOR_MS_BYTE(b) ((((b) + 1) & 0xfe) == 0)
#ifdef MY_CPU_LE_UNALIGN
#define BR86_PREPARE_BCJ_SCAN const UInt32 v = GetUi32(p) ^ 0xe8e8e8e8;
#define BR86_IS_BCJ_BYTE(n) ((v & ((UInt32)0xfe << (n) * 8)) == 0)
#else
#define BR86_PREPARE_BCJ_SCAN
// bad for MSVC X86 (partial write to byte reg):
#define BR86_IS_BCJ_BYTE(n) ((p[n - 4] & 0xfe) == 0xe8)
// bad for old MSVC (partial write to byte reg):
// #define BR86_IS_BCJ_BYTE(n) (((*p ^ 0xe8) & 0xfe) == 0)
#endif
static
Z7_FORCE_INLINE
Z7_ATTRIB_NO_VECTOR
Byte *Z7_BRANCH_CONV_ST(X86)(Byte *p, SizeT size, UInt32 pc, UInt32 *state, int encoding)
SizeT x86_Convert(Byte *data, SizeT size, UInt32 ip, UInt32 *state, int encoding)
{
SizeT pos = 0;
UInt32 mask = *state & 7;
if (size < 5)
return p;
{
// Byte *p = data;
const Byte *lim = p + size - 4;
unsigned mask = (unsigned)*state; // & 7;
#ifdef BR_CONV_USE_OPT_PC_PTR
/* if BR_CONV_USE_OPT_PC_PTR is defined: we need to adjust (pc) for (+4),
because call/jump offset is relative to the next instruction.
if BR_CONV_USE_OPT_PC_PTR is not defined : we don't need to adjust (pc) for (+4),
because BR_PC_GET uses (pc - (lim - p)), and lim was adjusted for (-4) before.
*/
pc += 4;
#endif
BR_PC_INIT
goto start;
return 0;
size -= 4;
ip += 5;
for (;; mask |= 4)
for (;;)
{
// cont: mask |= 4;
start:
if (p >= lim)
goto fin;
{
BR86_PREPARE_BCJ_SCAN
p += 4;
if (BR86_IS_BCJ_BYTE(0)) { goto m0; } mask >>= 1;
if (BR86_IS_BCJ_BYTE(1)) { goto m1; } mask >>= 1;
if (BR86_IS_BCJ_BYTE(2)) { goto m2; } mask = 0;
if (BR86_IS_BCJ_BYTE(3)) { goto a3; }
}
goto main_loop;
Byte *p = data + pos;
const Byte *limit = data + size;
for (; p < limit; p++)
if ((*p & 0xFE) == 0xE8)
break;
m0: p--;
m1: p--;
m2: p--;
if (mask == 0)
goto a3;
if (p > lim)
goto fin_p;
// if (((0x17u >> mask) & 1) == 0)
if (mask > 4 || mask == 3)
{
mask >>= 1;
continue; // goto cont;
}
mask >>= 1;
if (BR86_NEED_CONV_FOR_MS_BYTE(p[mask]))
continue; // goto cont;
// if (!BR86_NEED_CONV_FOR_MS_BYTE(p[3])) continue; // goto cont;
{
UInt32 v = GetUi32(p);
UInt32 c;
v += (1 << 24); if (v & 0xfe000000) continue; // goto cont;
c = BR_PC_GET;
BR_CONVERT_VAL(v, c)
SizeT d = (SizeT)(p - data) - pos;
pos = (SizeT)(p - data);
if (p >= limit)
{
mask <<= 3;
if (BR86_NEED_CONV_FOR_MS_BYTE(v >> mask))
*state = (d > 2 ? 0 : mask >> (unsigned)d);
return pos;
}
if (d > 2)
mask = 0;
else
{
mask >>= (unsigned)d;
if (mask != 0 && (mask > 4 || mask == 3 || Test86MSByte(p[(size_t)(mask >> 1) + 1])))
{
v ^= (((UInt32)0x100 << mask) - 1);
#ifdef MY_CPU_X86
// for X86 : we can recalculate (c) to reduce register pressure
c = BR_PC_GET;
#endif
BR_CONVERT_VAL(v, c)
mask = (mask >> 1) | 4;
pos++;
continue;
}
}
}
if (Test86MSByte(p[4]))
{
UInt32 v = ((UInt32)p[4] << 24) | ((UInt32)p[3] << 16) | ((UInt32)p[2] << 8) | ((UInt32)p[1]);
UInt32 cur = ip + (UInt32)pos;
pos += 5;
if (encoding)
v += cur;
else
v -= cur;
if (mask != 0)
{
unsigned sh = (mask & 6) << 2;
if (Test86MSByte((Byte)(v >> sh)))
{
v ^= (((UInt32)0x100 << sh) - 1);
if (encoding)
v += cur;
else
v -= cur;
}
mask = 0;
}
// v = (v & ((1 << 24) - 1)) - (v & (1 << 24));
v &= (1 << 25) - 1; v -= (1 << 24);
SetUi32(p, v)
p += 4;
goto main_loop;
p[1] = (Byte)v;
p[2] = (Byte)(v >> 8);
p[3] = (Byte)(v >> 16);
p[4] = (Byte)(0 - ((v >> 24) & 1));
}
main_loop:
if (p >= lim)
goto fin;
for (;;)
else
{
BR86_PREPARE_BCJ_SCAN
p += 4;
if (BR86_IS_BCJ_BYTE(0)) { goto a0; }
if (BR86_IS_BCJ_BYTE(1)) { goto a1; }
if (BR86_IS_BCJ_BYTE(2)) { goto a2; }
if (BR86_IS_BCJ_BYTE(3)) { goto a3; }
if (p >= lim)
goto fin;
}
a0: p--;
a1: p--;
a2: p--;
a3:
if (p > lim)
goto fin_p;
// if (!BR86_NEED_CONV_FOR_MS_BYTE(p[3])) continue; // goto cont;
{
UInt32 v = GetUi32(p);
UInt32 c;
v += (1 << 24); if (v & 0xfe000000) continue; // goto cont;
c = BR_PC_GET;
BR_CONVERT_VAL(v, c)
// v = (v & ((1 << 24) - 1)) - (v & (1 << 24));
v &= (1 << 25) - 1; v -= (1 << 24);
SetUi32(p, v)
p += 4;
goto main_loop;
mask = (mask >> 1) | 4;
pos++;
}
}
fin_p:
p--;
fin:
// the following processing for the tail is optional and can be commented out
/*
lim += 4;
for (; p < lim; p++, mask >>= 1)
if ((*p & 0xfe) == 0xe8)
break;
*/
*state = (UInt32)mask;
return p;
}
}
#define Z7_BRANCH_CONV_ST_FUNC_IMP(name, m, encoding) \
Z7_NO_INLINE \
Z7_ATTRIB_NO_VECTOR \
Byte *m(name)(Byte *data, SizeT size, UInt32 pc, UInt32 *state) \
{ return Z7_BRANCH_CONV_ST(name)(data, size, pc, state, encoding); }
Z7_BRANCH_CONV_ST_FUNC_IMP(X86, Z7_BRANCH_CONV_ST_DEC, 0)
#ifndef Z7_EXTRACT_ONLY
Z7_BRANCH_CONV_ST_FUNC_IMP(X86, Z7_BRANCH_CONV_ST_ENC, 1)
#endif


@ -1,14 +1,53 @@
/* BraIA64.c -- Converter for IA-64 code
2023-02-20 : Igor Pavlov : Public domain */
2017-01-26 : Igor Pavlov : Public domain */
#include "Precomp.h"
// the code was moved to Bra.c
#include "CpuArch.h"
#include "Bra.h"
#ifdef _MSC_VER
#pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty
#endif
#if defined(__clang__)
#pragma GCC diagnostic ignored "-Wempty-translation-unit"
#endif
SizeT IA64_Convert(Byte *data, SizeT size, UInt32 ip, int encoding)
{
SizeT i;
if (size < 16)
return 0;
size -= 16;
i = 0;
do
{
unsigned m = ((UInt32)0x334B0000 >> (data[i] & 0x1E)) & 3;
if (m)
{
m++;
do
{
Byte *p = data + (i + (size_t)m * 5 - 8);
if (((p[3] >> m) & 15) == 5
&& (((p[-1] | ((UInt32)p[0] << 8)) >> m) & 0x70) == 0)
{
unsigned raw = GetUi32(p);
unsigned v = raw >> m;
v = (v & 0xFFFFF) | ((v & (1 << 23)) >> 3);
v <<= 4;
if (encoding)
v += ip + (UInt32)i;
else
v -= ip + (UInt32)i;
v >>= 4;
v &= 0x1FFFFF;
v += 0x700000;
v &= 0x8FFFFF;
raw &= ~((UInt32)0x8FFFFF << m);
raw |= (v << m);
SetUi32(p, raw);
}
}
while (++m <= 4);
}
i += 16;
}
while (i <= size);
return i;
}


@ -1,5 +1,5 @@
/* BwtSort.c -- BWT block sorting
: Igor Pavlov : Public domain */
2021-04-01 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -7,44 +7,8 @@
#include "Sort.h"
/* #define BLOCK_SORT_USE_HEAP_SORT */
// #define BLOCK_SORT_USE_HEAP_SORT
#ifdef BLOCK_SORT_USE_HEAP_SORT
#define HeapSortRefDown(p, vals, n, size, temp) \
{ size_t k = n; UInt32 val = vals[temp]; for (;;) { \
size_t s = k << 1; \
if (s > size) break; \
if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \
if (val >= vals[p[s]]) break; \
p[k] = p[s]; k = s; \
} p[k] = temp; }
void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
{
if (size <= 1)
return;
p--;
{
size_t i = size / 2;
do
{
UInt32 temp = p[i];
HeapSortRefDown(p, vals, i, size, temp);
}
while (--i != 0);
}
do
{
UInt32 temp = p[size];
p[size--] = p[1];
HeapSortRefDown(p, vals, 1, size, temp);
}
while (size > 1);
}
#endif // BLOCK_SORT_USE_HEAP_SORT
#define NO_INLINE MY_FAST_CALL
/* Don't change it !!! */
#define kNumHashBytes 2
@ -65,27 +29,26 @@ void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
#else
#define kNumBitsMax 20
#define kIndexMask (((UInt32)1 << kNumBitsMax) - 1)
#define kNumExtraBits (32 - kNumBitsMax)
#define kNumExtra0Bits (kNumExtraBits - 2)
#define kNumExtra0Mask ((1 << kNumExtra0Bits) - 1)
#define kNumBitsMax 20
#define kIndexMask ((1 << kNumBitsMax) - 1)
#define kNumExtraBits (32 - kNumBitsMax)
#define kNumExtra0Bits (kNumExtraBits - 2)
#define kNumExtra0Mask ((1 << kNumExtra0Bits) - 1)
#define SetFinishedGroupSize(p, size) \
{ *(p) |= ((((UInt32)(size) - 1) & kNumExtra0Mask) << kNumBitsMax); \
{ *(p) |= ((((size) - 1) & kNumExtra0Mask) << kNumBitsMax); \
if ((size) > (1 << kNumExtra0Bits)) { \
*(p) |= 0x40000000; \
*((p) + 1) |= (((UInt32)(size) - 1) >> kNumExtra0Bits) << kNumBitsMax; } } \
*(p) |= 0x40000000; *((p) + 1) |= ((((size) - 1)>> kNumExtra0Bits) << kNumBitsMax); } } \
static void SetGroupSize(UInt32 *p, size_t size)
static void SetGroupSize(UInt32 *p, UInt32 size)
{
if (--size == 0)
return;
*p |= 0x80000000 | (((UInt32)size & kNumExtra0Mask) << kNumBitsMax);
*p |= 0x80000000 | ((size & kNumExtra0Mask) << kNumBitsMax);
if (size >= (1 << kNumExtra0Bits))
{
*p |= 0x40000000;
p[1] |= (((UInt32)size >> kNumExtra0Bits) << kNumBitsMax);
p[1] |= ((size >> kNumExtra0Bits) << kNumBitsMax);
}
}
@ -97,15 +60,10 @@ SortGroup - is recursive Range-Sort function with HeapSort optimization for smal
returns: 1 - if there are groups, 0 - no more groups
*/
static
unsigned
Z7_FASTCALL
SortGroup(size_t BlockSize, size_t NumSortedBytes,
size_t groupOffset, size_t groupSize,
unsigned NumRefBits, UInt32 *Indices
#ifndef BLOCK_SORT_USE_HEAP_SORT
, size_t left, size_t range
#endif
static UInt32 NO_INLINE SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 groupSize, int NumRefBits, UInt32 *Indices
#ifndef BLOCK_SORT_USE_HEAP_SORT
, UInt32 left, UInt32 range
#endif
)
{
UInt32 *ind2 = Indices + groupOffset;
@ -114,99 +72,96 @@ SortGroup(size_t BlockSize, size_t NumSortedBytes,
{
/*
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetFinishedGroupSize(ind2, 1)
SetFinishedGroupSize(ind2, 1);
#endif
*/
return 0;
}
Groups = Indices + BlockSize + BS_TEMP_SIZE;
if (groupSize <= ((size_t)1 << NumRefBits)
#ifndef BLOCK_SORT_USE_HEAP_SORT
if (groupSize <= ((UInt32)1 << NumRefBits)
#ifndef BLOCK_SORT_USE_HEAP_SORT
&& groupSize <= range
#endif
#endif
)
{
UInt32 *temp = Indices + BlockSize;
size_t j, group;
UInt32 mask, cg;
unsigned thereAreGroups;
UInt32 j;
UInt32 mask, thereAreGroups, group, cg;
{
UInt32 gPrev;
UInt32 gRes = 0;
{
size_t sp = ind2[0] + NumSortedBytes;
if (sp >= BlockSize)
sp -= BlockSize;
UInt32 sp = ind2[0] + NumSortedBytes;
if (sp >= BlockSize) sp -= BlockSize;
gPrev = Groups[sp];
temp[0] = gPrev << NumRefBits;
temp[0] = (gPrev << NumRefBits);
}
for (j = 1; j < groupSize; j++)
{
size_t sp = ind2[j] + NumSortedBytes;
UInt32 sp = ind2[j] + NumSortedBytes;
UInt32 g;
if (sp >= BlockSize)
sp -= BlockSize;
if (sp >= BlockSize) sp -= BlockSize;
g = Groups[sp];
temp[j] = (g << NumRefBits) | (UInt32)j;
temp[j] = (g << NumRefBits) | j;
gRes |= (gPrev ^ g);
}
if (gRes == 0)
{
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetGroupSize(ind2, groupSize);
#endif
#endif
return 1;
}
}
HeapSort(temp, groupSize);
mask = ((UInt32)1 << NumRefBits) - 1;
mask = (((UInt32)1 << NumRefBits) - 1);
thereAreGroups = 0;
group = groupOffset;
cg = temp[0] >> NumRefBits;
cg = (temp[0] >> NumRefBits);
temp[0] = ind2[temp[0] & mask];
{
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 *Flags = Groups + BlockSize;
#else
size_t prevGroupStart = 0;
#endif
#else
UInt32 prevGroupStart = 0;
#endif
for (j = 1; j < groupSize; j++)
{
const UInt32 val = temp[j];
const UInt32 cgCur = val >> NumRefBits;
UInt32 val = temp[j];
UInt32 cgCur = (val >> NumRefBits);
if (cgCur != cg)
{
cg = cgCur;
group = groupOffset + j;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
{
const size_t t = group - 1;
Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
UInt32 t = group - 1;
Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
}
#else
#else
SetGroupSize(temp + prevGroupStart, j - prevGroupStart);
prevGroupStart = j;
#endif
#endif
}
else
thereAreGroups = 1;
{
const UInt32 ind = ind2[val & mask];
temp[j] = ind;
Groups[ind] = (UInt32)group;
UInt32 ind = ind2[val & mask];
temp[j] = ind;
Groups[ind] = group;
}
}
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetGroupSize(temp + prevGroupStart, j - prevGroupStart);
#endif
#endif
}
for (j = 0; j < groupSize; j++)
@ -216,42 +171,37 @@ SortGroup(size_t BlockSize, size_t NumSortedBytes,
/* Check that all strings are in one group (cannot sort) */
{
UInt32 group;
size_t j;
size_t sp = ind2[0] + NumSortedBytes;
if (sp >= BlockSize)
sp -= BlockSize;
UInt32 group, j;
UInt32 sp = ind2[0] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
group = Groups[sp];
for (j = 1; j < groupSize; j++)
{
sp = ind2[j] + NumSortedBytes;
if (sp >= BlockSize)
sp -= BlockSize;
sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
if (Groups[sp] != group)
break;
}
if (j == groupSize)
{
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetGroupSize(ind2, groupSize);
#endif
#endif
return 1;
}
}
#ifndef BLOCK_SORT_USE_HEAP_SORT
#ifndef BLOCK_SORT_USE_HEAP_SORT
{
/* ---------- Range Sort ---------- */
size_t i;
size_t mid;
UInt32 i;
UInt32 mid;
for (;;)
{
size_t j;
UInt32 j;
if (range <= 1)
{
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetGroupSize(ind2, groupSize);
#endif
#endif
return 1;
}
mid = left + ((range + 1) >> 1);
@ -259,7 +209,7 @@ SortGroup(size_t BlockSize, size_t NumSortedBytes,
i = 0;
do
{
size_t sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
UInt32 sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
if (Groups[sp] >= mid)
{
for (j--; j > i; j--)
@ -287,53 +237,51 @@ SortGroup(size_t BlockSize, size_t NumSortedBytes,
break;
}
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
{
const size_t t = groupOffset + i - 1;
UInt32 t = (groupOffset + i - 1);
UInt32 *Flags = Groups + BlockSize;
Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
}
#endif
#endif
{
size_t j;
UInt32 j;
for (j = i; j < groupSize; j++)
Groups[ind2[j]] = (UInt32)(groupOffset + i);
Groups[ind2[j]] = groupOffset + i;
}
{
unsigned res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left);
return res | SortGroup(BlockSize, NumSortedBytes, groupOffset + i, groupSize - i, NumRefBits, Indices, mid, range - (mid - left));
UInt32 res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left);
return res | SortGroup(BlockSize, NumSortedBytes, groupOffset + i, groupSize - i, NumRefBits, Indices, mid, range - (mid - left));
}
}
#else // BLOCK_SORT_USE_HEAP_SORT
#else
/* ---------- Heap Sort ---------- */
{
size_t j;
UInt32 j;
for (j = 0; j < groupSize; j++)
{
size_t sp = ind2[j] + NumSortedBytes;
if (sp >= BlockSize)
sp -= BlockSize;
ind2[j] = (UInt32)sp;
UInt32 sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
ind2[j] = sp;
}
HeapSortRef(ind2, Groups, groupSize);
/* Write Flags */
{
size_t sp = ind2[0];
UInt32 sp = ind2[0];
UInt32 group = Groups[sp];
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 *Flags = Groups + BlockSize;
#else
size_t prevGroupStart = 0;
#endif
#else
UInt32 prevGroupStart = 0;
#endif
for (j = 1; j < groupSize; j++)
{
@ -341,210 +289,149 @@ SortGroup(size_t BlockSize, size_t NumSortedBytes,
if (Groups[sp] != group)
{
group = Groups[sp];
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
{
const size_t t = groupOffset + j - 1;
UInt32 t = groupOffset + j - 1;
Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
}
#else
#else
SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart);
prevGroupStart = j;
#endif
#endif
}
}
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart);
#endif
#endif
}
{
/* Write new Groups values and Check that there are groups */
unsigned thereAreGroups = 0;
UInt32 thereAreGroups = 0;
for (j = 0; j < groupSize; j++)
{
size_t group = groupOffset + j;
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 group = groupOffset + j;
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 subGroupSize = ((ind2[j] & ~0xC0000000) >> kNumBitsMax);
if (ind2[j] & 0x40000000)
if ((ind2[j] & 0x40000000) != 0)
subGroupSize += ((ind2[(size_t)j + 1] >> kNumBitsMax) << kNumExtra0Bits);
subGroupSize++;
for (;;)
{
const UInt32 original = ind2[j];
size_t sp = original & kIndexMask;
if (sp < NumSortedBytes)
sp += BlockSize;
sp -= NumSortedBytes;
ind2[j] = (UInt32)sp | (original & ~kIndexMask);
Groups[sp] = (UInt32)group;
UInt32 original = ind2[j];
UInt32 sp = original & kIndexMask;
if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes;
ind2[j] = sp | (original & ~kIndexMask);
Groups[sp] = group;
if (--subGroupSize == 0)
break;
j++;
thereAreGroups = 1;
}
#else
#else
UInt32 *Flags = Groups + BlockSize;
for (;;)
{
size_t sp = ind2[j];
if (sp < NumSortedBytes)
sp += BlockSize;
sp -= NumSortedBytes;
ind2[j] = (UInt32)sp;
Groups[sp] = (UInt32)group;
UInt32 sp = ind2[j]; if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes;
ind2[j] = sp;
Groups[sp] = group;
if ((Flags[(groupOffset + j) >> kNumFlagsBits] & (1 << ((groupOffset + j) & kFlagsMask))) == 0)
break;
j++;
thereAreGroups = 1;
}
#endif
#endif
}
return thereAreGroups;
}
}
#endif // BLOCK_SORT_USE_HEAP_SORT
#endif
}
/* conditions: blockSize > 0 */
UInt32 BlockSort(UInt32 *Indices, const Byte *data, size_t blockSize)
UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize)
{
UInt32 *counters = Indices + blockSize;
size_t i;
UInt32 i;
UInt32 *Groups;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 *Flags;
#endif
#endif
/* Radix-Sort for 2 bytes */
// { UInt32 yyy; for (yyy = 0; yyy < 100; yyy++) {
/* Radix-Sort for 2 bytes */
for (i = 0; i < kNumHashValues; i++)
counters[i] = 0;
{
const Byte *data2 = data;
size_t a = data[(size_t)blockSize - 1];
const Byte *data_lim = data + blockSize;
if (blockSize >= 4)
{
data_lim -= 3;
do
{
size_t b;
b = data2[0]; counters[(a << 8) | b]++;
a = data2[1]; counters[(b << 8) | a]++;
b = data2[2]; counters[(a << 8) | b]++;
a = data2[3]; counters[(b << 8) | a]++;
data2 += 4;
}
while (data2 < data_lim);
data_lim += 3;
}
while (data2 != data_lim)
{
size_t b = *data2++;
counters[(a << 8) | b]++;
a = b;
}
}
// }}
for (i = 0; i < blockSize - 1; i++)
counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++;
counters[((UInt32)data[i] << 8) | data[0]]++;
Groups = counters + BS_TEMP_SIZE;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
Flags = Groups + blockSize;
{
const size_t numWords = (blockSize + kFlagsMask) >> kNumFlagsBits;
for (i = 0; i < numWords; i++)
Flags[i] = kAllFlags;
}
#endif
{
UInt32 numWords = (blockSize + kFlagsMask) >> kNumFlagsBits;
for (i = 0; i < numWords; i++)
Flags[i] = kAllFlags;
}
#endif
{
UInt32 sum = 0;
for (i = 0; i < kNumHashValues; i++)
{
const UInt32 groupSize = counters[i];
counters[i] = sum;
sum += groupSize;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
if (groupSize)
UInt32 groupSize = counters[i];
if (groupSize > 0)
{
const UInt32 t = sum - 1;
Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 t = sum + groupSize - 1;
Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
#endif
sum += groupSize;
}
#endif
}
}
for (i = 0; i < blockSize - 1; i++)
Groups[i] = counters[((unsigned)data[i] << 8) | data[(size_t)i + 1]];
Groups[i] = counters[((unsigned)data[i] << 8) | data[0]];
{
#define SET_Indices(a, b, i) \
{ UInt32 c; \
a = (a << 8) | (b); \
c = counters[a]; \
Indices[c] = (UInt32)i++; \
counters[a] = c + 1; \
counters[i] = sum - groupSize;
}
size_t a = data[0];
const Byte *data_ptr = data + 1;
i = 0;
if (blockSize >= 3)
for (i = 0; i < blockSize - 1; i++)
Groups[i] = counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]];
Groups[i] = counters[((UInt32)data[i] << 8) | data[0]];
for (i = 0; i < blockSize - 1; i++)
Indices[counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++] = i;
Indices[counters[((UInt32)data[i] << 8) | data[0]]++] = i;
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
{
blockSize -= 2;
do
{
size_t b;
b = data_ptr[0]; SET_Indices(a, b, i)
a = data_ptr[1]; SET_Indices(b, a, i)
data_ptr += 2;
}
while (i < blockSize);
blockSize += 2;
}
if (i < blockSize - 1)
{
SET_Indices(a, data[(size_t)i + 1], i)
a = (Byte)a;
}
SET_Indices(a, data[0], i)
}
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
{
UInt32 prev = 0;
for (i = 0; i < kNumHashValues; i++)
{
const UInt32 prevGroupSize = counters[i] - prev;
UInt32 prevGroupSize = counters[i] - prev;
if (prevGroupSize == 0)
continue;
SetGroupSize(Indices + prev, prevGroupSize);
prev = counters[i];
}
}
#endif
}
#endif
{
unsigned NumRefBits;
size_t NumSortedBytes;
for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++)
{}
int NumRefBits;
UInt32 NumSortedBytes;
for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++);
NumRefBits = 32 - NumRefBits;
if (NumRefBits > kNumRefBitsMax)
NumRefBits = kNumRefBitsMax;
NumRefBits = kNumRefBitsMax;
for (NumSortedBytes = kNumHashBytes; ; NumSortedBytes <<= 1)
{
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
size_t finishedGroupSize = 0;
#endif
size_t newLimit = 0;
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 finishedGroupSize = 0;
#endif
UInt32 newLimit = 0;
for (i = 0; i < blockSize;)
{
size_t groupSize;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 groupSize;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
if ((Flags[i >> kNumFlagsBits] & (1 << (i & kFlagsMask))) == 0)
{
@ -553,56 +440,56 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, size_t blockSize)
}
for (groupSize = 1;
(Flags[(i + groupSize) >> kNumFlagsBits] & (1 << ((i + groupSize) & kFlagsMask))) != 0;
groupSize++)
{}
groupSize++);
groupSize++;
#else
#else
groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax;
groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax);
{
const BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0);
if (Indices[i] & 0x40000000)
BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0);
if ((Indices[i] & 0x40000000) != 0)
{
groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
Indices[(size_t)i + 1] &= kIndexMask;
}
Indices[i] &= kIndexMask;
groupSize++;
if (finishedGroup || groupSize == 1)
{
Indices[i - finishedGroupSize] &= kIndexMask;
if (finishedGroupSize > 1)
Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask;
{
groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
Indices[(size_t)i + 1] &= kIndexMask;
UInt32 newGroupSize = groupSize + finishedGroupSize;
SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize);
finishedGroupSize = newGroupSize;
}
Indices[i] &= kIndexMask;
groupSize++;
if (finishedGroup || groupSize == 1)
{
Indices[i - finishedGroupSize] &= kIndexMask;
if (finishedGroupSize > 1)
Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask;
{
const size_t newGroupSize = groupSize + finishedGroupSize;
SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize)
finishedGroupSize = newGroupSize;
}
i += groupSize;
continue;
}
finishedGroupSize = 0;
i += groupSize;
continue;
}
finishedGroupSize = 0;
}
#endif
#endif
if (NumSortedBytes >= blockSize)
{
size_t j;
UInt32 j;
for (j = 0; j < groupSize; j++)
{
size_t t = i + j;
UInt32 t = (i + j);
/* Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); */
Groups[Indices[t]] = (UInt32)t;
Groups[Indices[t]] = t;
}
}
else
if (SortGroup(blockSize, NumSortedBytes, i, groupSize, NumRefBits, Indices
#ifndef BLOCK_SORT_USE_HEAP_SORT
, 0, blockSize
#endif
))
#ifndef BLOCK_SORT_USE_HEAP_SORT
, 0, blockSize
#endif
) != 0)
newLimit = i + groupSize;
i += groupSize;
}
@ -610,19 +497,19 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, size_t blockSize)
break;
}
}
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
for (i = 0; i < blockSize;)
{
size_t groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax;
if (Indices[i] & 0x40000000)
UInt32 groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax);
if ((Indices[i] & 0x40000000) != 0)
{
groupSize += (Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits;
groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
Indices[(size_t)i + 1] &= kIndexMask;
}
Indices[i] &= kIndexMask;
groupSize++;
i += groupSize;
}
#endif
#endif
return Groups[0];
}


@ -1,8 +1,8 @@
/* BwtSort.h -- BWT block sorting
: Igor Pavlov : Public domain */
2013-01-18 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_BWT_SORT_H
#define ZIP7_INC_BWT_SORT_H
#ifndef __BWT_SORT_H
#define __BWT_SORT_H
#include "7zTypes.h"
@ -10,17 +10,16 @@ EXTERN_C_BEGIN
/* use BLOCK_SORT_EXTERNAL_FLAGS if blockSize can be > 1M */
/* #define BLOCK_SORT_EXTERNAL_FLAGS */
// #define BLOCK_SORT_EXTERNAL_FLAGS
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
#define BLOCK_SORT_EXTERNAL_SIZE(blockSize) (((blockSize) + 31) >> 5)
#define BLOCK_SORT_EXTERNAL_SIZE(blockSize) ((((blockSize) + 31) >> 5))
#else
#define BLOCK_SORT_EXTERNAL_SIZE(blockSize) 0
#endif
#define BLOCK_SORT_BUF_SIZE(blockSize) ((blockSize) * 2 + BLOCK_SORT_EXTERNAL_SIZE(blockSize) + (1 << 16))
UInt32 BlockSort(UInt32 *indices, const Byte *data, size_t blockSize);
UInt32 BlockSort(UInt32 *indices, const Byte *data, UInt32 blockSize);
EXTERN_C_END
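A minimal usage sketch for this API, assuming the 21.07 signature above, that BLOCK_SORT_BUF_SIZE() counts UInt32 items, and that the return value is the BWT origin index (as suggested by the `return Groups[0];` in BwtSort.c):

/* Usage sketch, not library code: sort one block and return its origin index. */
#include <stdlib.h>
#include "BwtSort.h"

int DoBlockSort(const Byte *data, UInt32 blockSize, UInt32 *origin)
{
  UInt32 *buf = (UInt32 *)malloc(BLOCK_SORT_BUF_SIZE(blockSize) * sizeof(UInt32));
  if (!buf)
    return 1;
  *origin = BlockSort(buf, data, blockSize);  /* sorted indices land in buf[0..blockSize) */
  free(buf);
  return 0;
}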

View file

@ -1,105 +1,12 @@
/* Compiler.h : Compiler specific defines and pragmas
: Igor Pavlov : Public domain */
/* Compiler.h
2021-01-05 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_COMPILER_H
#define ZIP7_INC_COMPILER_H
#ifndef __7Z_COMPILER_H
#define __7Z_COMPILER_H
#if defined(__clang__)
# define Z7_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
#endif
#if defined(__clang__) && defined(__apple_build_version__)
# define Z7_APPLE_CLANG_VERSION Z7_CLANG_VERSION
#elif defined(__clang__)
# define Z7_LLVM_CLANG_VERSION Z7_CLANG_VERSION
#elif defined(__GNUC__)
# define Z7_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#endif
#ifdef _MSC_VER
#if !defined(__clang__) && !defined(__GNUC__)
#define Z7_MSC_VER_ORIGINAL _MSC_VER
#endif
#endif
#if defined(__MINGW32__) || defined(__MINGW64__)
#define Z7_MINGW
#endif
#if defined(__LCC__) && (defined(__MCST__) || defined(__e2k__))
#define Z7_MCST_LCC
#define Z7_MCST_LCC_VERSION (__LCC__ * 100 + __LCC_MINOR__)
#endif
/*
#if defined(__AVX2__) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
|| defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \
|| defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
#define Z7_COMPILER_AVX2_SUPPORTED
#ifdef __clang__
#pragma clang diagnostic ignored "-Wunused-private-field"
#endif
#endif
*/
// #pragma GCC diagnostic ignored "-Wunknown-pragmas"
#ifdef __clang__
// padding size of '' with 4 bytes to alignment boundary
#pragma GCC diagnostic ignored "-Wpadded"
#if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) \
&& defined(__FreeBSD__)
// freebsd:
#pragma GCC diagnostic ignored "-Wexcess-padding"
#endif
#if __clang_major__ >= 16
#pragma GCC diagnostic ignored "-Wunsafe-buffer-usage"
#endif
#if __clang_major__ == 13
#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
// cheri
#pragma GCC diagnostic ignored "-Wcapability-to-integer-cast"
#endif
#endif
#if __clang_major__ == 13
// for <arm_neon.h>
#pragma GCC diagnostic ignored "-Wreserved-identifier"
#endif
#endif // __clang__
#if defined(_WIN32) && defined(__clang__) && __clang_major__ >= 16
// #pragma GCC diagnostic ignored "-Wcast-function-type-strict"
#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION \
_Pragma("GCC diagnostic ignored \"-Wcast-function-type-strict\"")
#else
#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
#endif
typedef void (*Z7_void_Function)(void);
#if defined(__clang__) || defined(__GNUC__)
#define Z7_CAST_FUNC_C (Z7_void_Function)
#elif defined(_MSC_VER) && _MSC_VER > 1920
#define Z7_CAST_FUNC_C (void *)
// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()'
#else
#define Z7_CAST_FUNC_C
#endif
/*
#if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__)
// #pragma GCC diagnostic ignored "-Wcast-function-type"
#endif
*/
#ifdef __GNUC__
#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40000) && (Z7_GCC_VERSION < 70000)
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif
#endif
#ifdef _MSC_VER
@ -110,134 +17,24 @@ typedef void (*Z7_void_Function)(void);
#pragma warning(disable : 4214) // nonstandard extension used : bit field types other than int
#endif
#if defined(_MSC_VER) && _MSC_VER >= 1800
#pragma warning(disable : 4464) // relative include path contains '..'
#endif
#if _MSC_VER >= 1300
#pragma warning(disable : 4996) // This function or variable may be unsafe
#else
#pragma warning(disable : 4511) // copy constructor could not be generated
#pragma warning(disable : 4512) // assignment operator could not be generated
#pragma warning(disable : 4514) // unreferenced inline function has been removed
#pragma warning(disable : 4702) // unreachable code
#pragma warning(disable : 4710) // not inlined
#pragma warning(disable : 4714) // function marked as __forceinline not inlined
#pragma warning(disable : 4786) // identifier was truncated to '255' characters in the debug information
#endif
// == 1200 : -O1 : for __forceinline
// >= 1900 : -O1 : for printf
#pragma warning(disable : 4710) // function not inlined
#ifdef __clang__
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
#pragma clang diagnostic ignored "-Wmicrosoft-exception-spec"
// #pragma clang diagnostic ignored "-Wreserved-id-macro"
#endif
#if _MSC_VER < 1900
// winnt.h: 'Int64ShllMod32'
#pragma warning(disable : 4514) // unreferenced inline function has been removed
#endif
#if _MSC_VER < 1300
// #pragma warning(disable : 4702) // unreachable code
// Bra.c : -O1:
#pragma warning(disable : 4714) // function marked as __forceinline not inlined
#endif
/*
#if _MSC_VER > 1400 && _MSC_VER <= 1900
// strcat: This function or variable may be unsafe
// sysinfoapi.h: kit10: GetVersion was declared deprecated
#pragma warning(disable : 4996)
#endif
*/
#if _MSC_VER > 1200
// -Wall warnings
#pragma warning(disable : 4711) // function selected for automatic inline expansion
#pragma warning(disable : 4820) // '2' bytes padding added after data member
#if _MSC_VER >= 1400 && _MSC_VER < 1920
// 1400: string.h: _DBG_MEMCPY_INLINE_
// 1600 - 191x : smmintrin.h __cplusplus'
// is not defined as a preprocessor macro, replacing with '0' for '#if/#elif'
#pragma warning(disable : 4668)
// 1400 - 1600 : WinDef.h : 'FARPROC' :
// 1900 - 191x : immintrin.h: _readfsbase_u32
// no function prototype given : converting '()' to '(void)'
#pragma warning(disable : 4255)
#endif
#if _MSC_VER >= 1914
// Compiler will insert Spectre mitigation for memory load if /Qspectre switch specified
#pragma warning(disable : 5045)
#endif
#endif // _MSC_VER > 1200
#endif // _MSC_VER
#if defined(__clang__) && (__clang_major__ >= 4)
#define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
_Pragma("clang loop unroll(disable)") \
_Pragma("clang loop vectorize(disable)")
#define Z7_ATTRIB_NO_VECTORIZE
#elif defined(__GNUC__) && (__GNUC__ >= 5) \
&& (!defined(Z7_MCST_LCC_VERSION) || (Z7_MCST_LCC_VERSION >= 12610))
#define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
// __attribute__((optimize("no-unroll-loops")));
#define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
#elif defined(_MSC_VER) && (_MSC_VER >= 1920)
#define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
_Pragma("loop( no_vector )")
#define Z7_ATTRIB_NO_VECTORIZE
#else
#define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
#define Z7_ATTRIB_NO_VECTORIZE
#endif
#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920)
#define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )")
#define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )")
#else
#define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
#define Z7_PRAGMA_OPTIMIZE_DEFAULT
#endif
#if defined(MY_CPU_X86_OR_AMD64) && ( \
defined(__clang__) && (__clang_major__ >= 4) \
|| defined(__GNUC__) && (__GNUC__ >= 5))
#define Z7_ATTRIB_NO_SSE __attribute__((__target__("no-sse")))
#else
#define Z7_ATTRIB_NO_SSE
#endif
#define Z7_ATTRIB_NO_VECTOR \
Z7_ATTRIB_NO_VECTORIZE \
Z7_ATTRIB_NO_SSE
#if defined(__clang__) && (__clang_major__ >= 8) \
|| defined(__GNUC__) && (__GNUC__ >= 1000) \
/* || defined(_MSC_VER) && (_MSC_VER >= 1920) */
// GCC is not good for __builtin_expect()
#define Z7_LIKELY(x) (__builtin_expect((x), 1))
#define Z7_UNLIKELY(x) (__builtin_expect((x), 0))
// #define Z7_unlikely [[unlikely]]
// #define Z7_likely [[likely]]
#else
#define Z7_LIKELY(x) (x)
#define Z7_UNLIKELY(x) (x)
// #define Z7_likely
#endif
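A hedged usage sketch for these hint macros (not library code): Z7_UNLIKELY marks the rare branch so compilers that honor __builtin_expect() lay out the common path as the fall-through.

#include <stddef.h>

/* Sketch: counting a byte value that is assumed rare in typical input. */
static size_t CountZeroBytes(const unsigned char *p, size_t n)
{
  size_t zeros = 0, i;
  for (i = 0; i < n; i++)
    if (Z7_UNLIKELY(p[i] == 0))
      zeros++;
  return zeros;
}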
#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30600))
#if (Z7_CLANG_VERSION < 130000)
#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wreserved-id-macro\"")
#else
#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"")
#endif
#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \
_Pragma("GCC diagnostic pop")
#else
#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#define UNUSED_VAR(x) (void)x;

File diff suppressed because it is too large

View file

@ -1,8 +1,8 @@
/* CpuArch.h -- CPU specific code
Igor Pavlov : Public domain */
2021-07-13 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_CPU_ARCH_H
#define ZIP7_INC_CPU_ARCH_H
#ifndef __CPU_ARCH_H
#define __CPU_ARCH_H
#include "7zTypes.h"
@ -20,7 +20,6 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8)
*/
#if !defined(_M_ARM64EC)
#if defined(_M_X64) \
|| defined(_M_AMD64) \
|| defined(__x86_64__) \
@ -36,7 +35,6 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#endif
#define MY_CPU_64BIT
#endif
#endif
#if defined(_M_IX86) \
@ -47,34 +45,13 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#define MY_CPU_SIZEOF_POINTER 4
#endif
#if defined(__SSE2__) \
|| defined(MY_CPU_AMD64) \
|| defined(_M_IX86_FP) && (_M_IX86_FP >= 2)
#define MY_CPU_SSE2
#endif
#if defined(_M_ARM64) \
|| defined(_M_ARM64EC) \
|| defined(__AARCH64EL__) \
|| defined(__AARCH64EB__) \
|| defined(__aarch64__)
#define MY_CPU_ARM64
#if defined(__ILP32__) \
|| defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
#define MY_CPU_NAME "arm64-32"
#define MY_CPU_SIZEOF_POINTER 4
#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
#define MY_CPU_NAME "arm64-128"
#define MY_CPU_SIZEOF_POINTER 16
#else
#if defined(_M_ARM64EC)
#define MY_CPU_NAME "arm64ec"
#else
#define MY_CPU_NAME "arm64"
#endif
#define MY_CPU_SIZEOF_POINTER 8
#endif
#define MY_CPU_NAME "arm64"
#define MY_CPU_64BIT
#endif
@ -91,10 +68,8 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#define MY_CPU_ARM
#if defined(__thumb__) || defined(__THUMBEL__) || defined(_M_ARMT)
#define MY_CPU_ARMT
#define MY_CPU_NAME "armt"
#else
#define MY_CPU_ARM32
#define MY_CPU_NAME "arm"
#endif
/* #define MY_CPU_32BIT */
@ -128,8 +103,6 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
|| defined(__PPC__) \
|| defined(_POWER)
#define MY_CPU_PPC_OR_PPC64
#if defined(__ppc64__) \
|| defined(__powerpc64__) \
|| defined(_LP64) \
@ -150,76 +123,12 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#endif
#if defined(__sparc__) \
|| defined(__sparc)
#define MY_CPU_SPARC
#if defined(__LP64__) \
|| defined(_LP64) \
|| defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
#define MY_CPU_NAME "sparcv9"
#define MY_CPU_SIZEOF_POINTER 8
#define MY_CPU_64BIT
#elif defined(__sparc_v9__) \
|| defined(__sparcv9)
#define MY_CPU_64BIT
#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
#define MY_CPU_NAME "sparcv9-32"
#else
#define MY_CPU_NAME "sparcv9m"
#endif
#elif defined(__sparc_v8__) \
|| defined(__sparcv8)
#define MY_CPU_NAME "sparcv8"
#define MY_CPU_SIZEOF_POINTER 4
#else
#define MY_CPU_NAME "sparc"
#endif
#endif
#if defined(__riscv) \
|| defined(__riscv__)
#define MY_CPU_RISCV
#if __riscv_xlen == 32
#define MY_CPU_NAME "riscv32"
#elif __riscv_xlen == 64
#define MY_CPU_NAME "riscv64"
#else
#define MY_CPU_NAME "riscv"
#endif
#endif
#if defined(__loongarch__)
#define MY_CPU_LOONGARCH
#if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64)
#define MY_CPU_64BIT
#endif
#if defined(__loongarch64)
#define MY_CPU_NAME "loongarch64"
#define MY_CPU_LOONGARCH64
#else
#define MY_CPU_NAME "loongarch"
#endif
#endif
// #undef MY_CPU_NAME
// #undef MY_CPU_SIZEOF_POINTER
// #define __e2k__
// #define __SIZEOF_POINTER__ 4
#if defined(__e2k__)
#define MY_CPU_E2K
#if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
#define MY_CPU_NAME "e2k-32"
#define MY_CPU_SIZEOF_POINTER 4
#else
#define MY_CPU_NAME "e2k"
#if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
#define MY_CPU_SIZEOF_POINTER 8
#endif
#endif
#if defined(__sparc64__)
#define MY_CPU_NAME "sparc64"
#define MY_CPU_64BIT
#elif defined(__sparc__)
#define MY_CPU_NAME "sparc"
/* #define MY_CPU_32BIT */
#endif
@ -253,7 +162,6 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
|| defined(MY_CPU_ARM_LE) \
|| defined(MY_CPU_ARM64_LE) \
|| defined(MY_CPU_IA64_LE) \
|| defined(_LITTLE_ENDIAN) \
|| defined(__LITTLE_ENDIAN__) \
|| defined(__ARMEL__) \
|| defined(__THUMBEL__) \
@ -286,9 +194,6 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#error Stop_Compiling_Bad_Endian
#endif
#if !defined(MY_CPU_LE) && !defined(MY_CPU_BE)
#error Stop_Compiling_CPU_ENDIAN_must_be_detected_at_compile_time
#endif
#if defined(MY_CPU_32BIT) && defined(MY_CPU_64BIT)
#error Stop_Compiling_Bad_32_64_BIT
@ -330,7 +235,6 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#ifndef MY_CPU_NAME
// #define MY_CPU_IS_UNKNOWN
#ifdef MY_CPU_LE
#define MY_CPU_NAME "LE"
#elif defined(MY_CPU_BE)
@ -346,121 +250,15 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#ifdef __has_builtin
#define Z7_has_builtin(x) __has_builtin(x)
#else
#define Z7_has_builtin(x) 0
#endif
#define Z7_BSWAP32_CONST(v) \
( (((UInt32)(v) << 24) ) \
| (((UInt32)(v) << 8) & (UInt32)0xff0000) \
| (((UInt32)(v) >> 8) & (UInt32)0xff00 ) \
| (((UInt32)(v) >> 24) ))
#if defined(_MSC_VER) && (_MSC_VER >= 1300)
#include <stdlib.h>
/* Note: these macros will use bswap instruction (486), that is unsupported in 386 cpu */
#pragma intrinsic(_byteswap_ushort)
#pragma intrinsic(_byteswap_ulong)
#pragma intrinsic(_byteswap_uint64)
#define Z7_BSWAP16(v) _byteswap_ushort(v)
#define Z7_BSWAP32(v) _byteswap_ulong (v)
#define Z7_BSWAP64(v) _byteswap_uint64(v)
#define Z7_CPU_FAST_BSWAP_SUPPORTED
/* GCC can generate slow code that calls function for __builtin_bswap32() for:
- GCC for RISCV, if Zbb/XTHeadBb extension is not used.
- GCC for SPARC.
The code from CLANG for SPARC also is not fastest.
So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases.
*/
#elif (!defined(MY_CPU_RISCV) || defined (__riscv_zbb) || defined(__riscv_xtheadbb)) \
&& !defined(MY_CPU_SPARC) \
&& ( \
(defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \
|| (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \
)
#define Z7_BSWAP16(v) __builtin_bswap16(v)
#define Z7_BSWAP32(v) __builtin_bswap32(v)
#define Z7_BSWAP64(v) __builtin_bswap64(v)
#define Z7_CPU_FAST_BSWAP_SUPPORTED
#else
#define Z7_BSWAP16(v) ((UInt16) \
( ((UInt32)(v) << 8) \
| ((UInt32)(v) >> 8) \
))
#define Z7_BSWAP32(v) Z7_BSWAP32_CONST(v)
#define Z7_BSWAP64(v) \
( ( ( (UInt64)(v) ) << 8 * 7 ) \
| ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 1) ) << 8 * 5 ) \
| ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 2) ) << 8 * 3 ) \
| ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 3) ) << 8 * 1 ) \
| ( ( (UInt64)(v) >> 8 * 1 ) & ((UInt32)0xff << 8 * 3) ) \
| ( ( (UInt64)(v) >> 8 * 3 ) & ((UInt32)0xff << 8 * 2) ) \
| ( ( (UInt64)(v) >> 8 * 5 ) & ((UInt32)0xff << 8 * 1) ) \
| ( ( (UInt64)(v) >> 8 * 7 ) ) \
)
#endif
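A small self-check sketch (not library code) for the macros selected above: Z7_BSWAP32_CONST is pure arithmetic and folds at compile time, while the Z7_BSWAP* macros resolve to intrinsics where the header detected support and to the shift/mask fallbacks otherwise.

#include <assert.h>

static void BswapSelfCheck(void)
{
  assert(Z7_BSWAP16((UInt16)0xAABB) == 0xBBAA);
  assert(Z7_BSWAP32((UInt32)0x11223344) == 0x44332211);
  assert(Z7_BSWAP32_CONST(0x11223344) == 0x44332211);  /* also usable in #if / initializers */
  assert(Z7_BSWAP64((UInt64)0x1122334455667788ULL) == (UInt64)0x8877665544332211ULL);
}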
#ifdef MY_CPU_LE
#if defined(MY_CPU_X86_OR_AMD64) \
|| defined(MY_CPU_ARM64) \
|| defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \
|| defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6)
|| defined(MY_CPU_ARM64)
#define MY_CPU_LE_UNALIGN
#define MY_CPU_LE_UNALIGN_64
#elif defined(__ARM_FEATURE_UNALIGNED)
/* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions.
Description of problems:
problem-1 : 32-bit ARM architecture:
multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM)
require 32-bit (WORD) alignment (by 32-bit ARM architecture).
So there is "Alignment fault exception", if data is not aligned for 32-bit.
problem-2 : 32-bit kernels and arm64 kernels:
32-bit linux kernels provide fixup for these "paired" instruction "Alignment fault exception".
So unaligned paired-access instructions work via exception handler in kernel in 32-bit linux.
But some arm64 kernels do not handle these faults in 32-bit programs.
So we have unhandled exception for such instructions.
Probably some new arm64 kernels have fixed it, and unaligned
paired-access instructions work in new kernels?
problem-3 : compiler for 32-bit arm:
Compilers use LDRD/STRD/LDM/STM for UInt64 accesses
and for another cases where two 32-bit accesses are fused
to one multi-access instruction.
So UInt64 variables must be aligned for 32-bit, and each
32-bit access must be aligned for 32-bit, if we want to
avoid "Alignment fault" exception (handled or unhandled).
problem-4 : performance:
Even if unaligned access is handled by kernel, it will be slow.
So if we allow unaligned access, we can get fast unaligned
single-access, and slow unaligned paired-access.
We don't allow unaligned access on 32-bit arm, because compiler
generates paired-access instructions that require 32-bit alignment,
and some arm64 kernels have no handler for these instructions.
Also unaligned paired-access instructions will be slow, if kernel handles them.
*/
// it must be disabled:
// #define MY_CPU_LE_UNALIGN
/* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment.
So we can't use unaligned 64-bit operations. */
#define MY_CPU_LE_UNALIGN
#endif
#endif
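Where MY_CPU_LE_UNALIGN stays undefined, the accessors below fall back to byte loads. A common portable alternative — shown here as a sketch, not as this header's code — is memcpy into a local, which compilers lower to a single load on targets that permit unaligned access:

#include <string.h>

/* Sketch: portable unaligned 32-bit load with little-endian meaning. */
static UInt32 LoadU32LE_Portable(const void *p)
{
  UInt32 v;
  memcpy(&v, p, sizeof(v));  /* defined behavior for any alignment */
#ifdef MY_CPU_BE
  v = Z7_BSWAP32(v);         /* native is big-endian: swap to get LE meaning */
#endif
  return v;
}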
@ -471,11 +269,13 @@ problem-4 : performance:
#define GetUi32(p) (*(const UInt32 *)(const void *)(p))
#ifdef MY_CPU_LE_UNALIGN_64
#define GetUi64(p) (*(const UInt64 *)(const void *)(p))
#define SetUi64(p, v) { *(UInt64 *)(void *)(p) = (v); }
#endif
#define SetUi16(p, v) { *(UInt16 *)(void *)(p) = (v); }
#define SetUi32(p, v) { *(UInt32 *)(void *)(p) = (v); }
#ifdef MY_CPU_LE_UNALIGN_64
#define SetUi64(p, v) { *(UInt64 *)(void *)(p) = (v); }
#endif
#else
@ -502,33 +302,50 @@ problem-4 : performance:
#endif
#ifndef GetUi64
#ifndef MY_CPU_LE_UNALIGN_64
#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32))
#endif
#ifndef SetUi64
#define SetUi64(p, v) { Byte *_ppp2_ = (Byte *)(p); UInt64 _vvv2_ = (v); \
SetUi32(_ppp2_ , (UInt32)_vvv2_) \
SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)) }
SetUi32(_ppp2_ , (UInt32)_vvv2_); \
SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)); }
#endif
#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
#if 0
// Z7_BSWAP16 can be slow for x86-msvc
#define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p)))
#ifdef __has_builtin
#define MY__has_builtin(x) __has_builtin(x)
#else
#define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16)
#define MY__has_builtin(x) 0
#endif
#define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p))
#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); }
#if defined(MY_CPU_LE_UNALIGN) && /* defined(_WIN64) && */ defined(_MSC_VER) && (_MSC_VER >= 1300)
#if defined(MY_CPU_LE_UNALIGN_64)
#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p))
#define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); }
#endif
/* Note: we use bswap instruction, that is unsupported in 386 cpu */
#include <stdlib.h>
#pragma intrinsic(_byteswap_ushort)
#pragma intrinsic(_byteswap_ulong)
#pragma intrinsic(_byteswap_uint64)
/* #define GetBe16(p) _byteswap_ushort(*(const UInt16 *)(const Byte *)(p)) */
#define GetBe32(p) _byteswap_ulong (*(const UInt32 *)(const void *)(p))
#define GetBe64(p) _byteswap_uint64(*(const UInt64 *)(const void *)(p))
#define SetBe32(p, v) (*(UInt32 *)(void *)(p)) = _byteswap_ulong(v)
#elif defined(MY_CPU_LE_UNALIGN) && ( \
(defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \
|| (defined(__clang__) && MY__has_builtin(__builtin_bswap16)) )
/* #define GetBe16(p) __builtin_bswap16(*(const UInt16 *)(const void *)(p)) */
#define GetBe32(p) __builtin_bswap32(*(const UInt32 *)(const void *)(p))
#define GetBe64(p) __builtin_bswap64(*(const UInt64 *)(const void *)(p))
#define SetBe32(p, v) (*(UInt32 *)(void *)(p)) = __builtin_bswap32(v)
#else
@ -538,6 +355,8 @@ problem-4 : performance:
((UInt32)((const Byte *)(p))[2] << 8) | \
((const Byte *)(p))[3] )
#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4))
#define SetBe32(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \
_ppp_[0] = (Byte)(_vvv_ >> 24); \
_ppp_[1] = (Byte)(_vvv_ >> 16); \
@ -546,115 +365,53 @@ problem-4 : performance:
#endif
#ifndef GetBe64
#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4))
#endif
#ifndef SetBe64
#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \
_ppp_[0] = (Byte)(_vvv_ >> 56); \
_ppp_[1] = (Byte)(_vvv_ >> 48); \
_ppp_[2] = (Byte)(_vvv_ >> 40); \
_ppp_[3] = (Byte)(_vvv_ >> 32); \
_ppp_[4] = (Byte)(_vvv_ >> 24); \
_ppp_[5] = (Byte)(_vvv_ >> 16); \
_ppp_[6] = (Byte)(_vvv_ >> 8); \
_ppp_[7] = (Byte)_vvv_; }
#endif
#ifndef GetBe16
#ifdef GetBe16_to32
#define GetBe16(p) ( (UInt16) GetBe16_to32(p))
#else
#define GetBe16(p) ( (UInt16) ( \
((UInt16)((const Byte *)(p))[0] << 8) | \
((const Byte *)(p))[1] ))
#endif
#endif
#if defined(MY_CPU_BE)
#define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v)
#define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
#define Z7_CONV_NATIVE_TO_BE_32(v) (v)
// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8))
#elif defined(MY_CPU_LE)
#define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
#define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v)
#define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v)
// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8))
#else
#error Stop_Compiling_Unknown_Endian_CONV
#endif
#if defined(MY_CPU_BE)
#define GetBe64a(p) (*(const UInt64 *)(const void *)(p))
#define GetBe32a(p) (*(const UInt32 *)(const void *)(p))
#define GetBe16a(p) (*(const UInt16 *)(const void *)(p))
#define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); }
#define SetBe16a(p, v) { *(UInt16 *)(void *)(p) = (v); }
#define GetUi64a(p) GetUi64(p)
#define GetUi32a(p) GetUi32(p)
#define GetUi16a(p) GetUi16(p)
#define SetUi32a(p, v) SetUi32(p, v)
#define SetUi16a(p, v) SetUi16(p, v)
#elif defined(MY_CPU_LE)
#define GetUi64a(p) (*(const UInt64 *)(const void *)(p))
#define GetUi32a(p) (*(const UInt32 *)(const void *)(p))
#define GetUi16a(p) (*(const UInt16 *)(const void *)(p))
#define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); }
#define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); }
#define GetBe64a(p) GetBe64(p)
#define GetBe32a(p) GetBe32(p)
#define GetBe16a(p) GetBe16(p)
#define SetBe32a(p, v) SetBe32(p, v)
#define SetBe16a(p, v) SetBe16(p, v)
#else
#error Stop_Compiling_Unknown_Endian_CPU_a
#endif
#ifndef GetBe16_to32
#define GetBe16_to32(p) GetBe16(p)
#endif
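The accessors above hide both endianness and alignment from the caller; a sketch of parsing an invented two-field header with them (the layout is made up for illustration):

typedef struct
{
  UInt32 size;   /* stored little-endian at offset 0 */
  UInt32 crc;    /* stored big-endian at offset 4 */
} CDemoHeader;

/* Sketch, not library code: endian- and alignment-safe by construction. */
static void ParseDemoHeader(const Byte *p, CDemoHeader *h)
{
  h->size = GetUi32(p);
  h->crc  = GetBe32(p + 4);
}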
#if defined(MY_CPU_X86_OR_AMD64) \
|| defined(MY_CPU_ARM_OR_ARM64) \
|| defined(MY_CPU_PPC_OR_PPC64)
#define Z7_CPU_FAST_ROTATE_SUPPORTED
#endif
#ifdef MY_CPU_X86_OR_AMD64
void Z7_FASTCALL z7_x86_cpuid(UInt32 a[4], UInt32 function);
UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void);
#if defined(MY_CPU_AMD64)
#define Z7_IF_X86_CPUID_SUPPORTED
#else
#define Z7_IF_X86_CPUID_SUPPORTED if (z7_x86_cpuid_GetMaxFunc())
#endif
typedef struct
{
UInt32 maxFunc;
UInt32 vendor[3];
UInt32 ver;
UInt32 b;
UInt32 c;
UInt32 d;
} Cx86cpuid;
enum
{
CPU_FIRM_INTEL,
CPU_FIRM_AMD,
CPU_FIRM_VIA
};
void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d);
BoolInt x86cpuid_CheckAndRead(Cx86cpuid *p);
int x86cpuid_GetFirm(const Cx86cpuid *p);
#define x86cpuid_GetFamily(ver) (((ver >> 16) & 0xFF0) | ((ver >> 8) & 0xF))
#define x86cpuid_GetModel(ver) (((ver >> 12) & 0xF0) | ((ver >> 4) & 0xF))
#define x86cpuid_GetStepping(ver) (ver & 0xF)
BoolInt CPU_Is_InOrder(void);
BoolInt CPU_IsSupported_AES(void);
BoolInt CPU_IsSupported_AVX(void);
BoolInt CPU_IsSupported_AVX2(void);
BoolInt CPU_IsSupported_AVX512F_AVX512VL(void);
BoolInt CPU_IsSupported_VAES_AVX2(void);
BoolInt CPU_IsSupported_CMOV(void);
BoolInt CPU_IsSupported_SSE(void);
BoolInt CPU_IsSupported_SSE2(void);
BoolInt CPU_IsSupported_SSSE3(void);
BoolInt CPU_IsSupported_SSE41(void);
BoolInt CPU_IsSupported_SHA(void);
BoolInt CPU_IsSupported_SHA512(void);
BoolInt CPU_IsSupported_PageGB(void);
#elif defined(MY_CPU_ARM_OR_ARM64)
@ -672,13 +429,12 @@ BoolInt CPU_IsSupported_SHA1(void);
BoolInt CPU_IsSupported_SHA2(void);
BoolInt CPU_IsSupported_AES(void);
#endif
BoolInt CPU_IsSupported_SHA512(void);
#endif
#if defined(__APPLE__)
int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize);
int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val);
int My_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize);
int My_sysctlbyname_Get_UInt32(const char *name, UInt32 *val);
#endif
EXTERN_C_END
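These probes are typically called once at startup to select a code path. The sketch below assumes two hypothetical CRC implementations — Crc_Generic and Crc_HwAccel are not library functions:

/* Sketch: one-time dispatch on a CPU feature probe (hypothetical Crc_* impls). */
typedef UInt32 (*CrcFunc)(const void *data, size_t size);
extern UInt32 Crc_Generic(const void *data, size_t size);   /* assumed to exist */
extern UInt32 Crc_HwAccel(const void *data, size_t size);   /* assumed to exist */

static CrcFunc g_crc;

static void Crc_Init(void)
{
#ifdef MY_CPU_X86_OR_AMD64
  g_crc = CPU_IsSupported_SSE41() ? Crc_HwAccel : Crc_Generic;
#else
  g_crc = Crc_Generic;
#endif
}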

View file

@ -1,8 +1,8 @@
/* Delta.h -- Delta converter
2023-03-03 : Igor Pavlov : Public domain */
2013-01-18 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_DELTA_H
#define ZIP7_INC_DELTA_H
#ifndef __DELTA_H
#define __DELTA_H
#include "7zTypes.h"

View file

@ -1,99 +1,110 @@
/* DllSecur.c -- DLL loading security
2023-12-03 : Igor Pavlov : Public domain */
2021-12-25 : Igor Pavlov : Public domain */
#include "Precomp.h"
#ifdef _WIN32
#include "7zWindows.h"
#include <Windows.h>
#include "DllSecur.h"
#ifndef UNDER_CE
Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags);
#define MY_LOAD_LIBRARY_SEARCH_USER_DIRS 0x400
#define MY_LOAD_LIBRARY_SEARCH_SYSTEM32 0x800
#define DELIM "\0"
static const char * const g_Dlls =
"userenv"
DELIM "setupapi"
DELIM "apphelp"
DELIM "propsys"
DELIM "dwmapi"
DELIM "cryptbase"
DELIM "oleacc"
DELIM "clbcatq"
DELIM "version"
#ifndef _CONSOLE
DELIM "uxtheme"
"UXTHEME\0"
#endif
DELIM;
"USERENV\0"
"SETUPAPI\0"
"APPHELP\0"
"PROPSYS\0"
"DWMAPI\0"
"CRYPTBASE\0"
"OLEACC\0"
"CLBCATQ\0"
"VERSION\0"
;
#endif
#ifdef __clang__
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
#if defined (_MSC_VER) && _MSC_VER >= 1900
// sysinfoapi.h: kit10: GetVersion was declared deprecated
#pragma warning(disable : 4996)
#endif
// #define MY_CAST_FUNC (void(*)())
#define MY_CAST_FUNC
#define IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN \
if ((UInt16)GetVersion() != 6) { \
const \
Func_SetDefaultDllDirectories setDllDirs = \
(Func_SetDefaultDllDirectories) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \
"SetDefaultDllDirectories"); \
if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; }
void My_SetDefaultDllDirectories(void)
void My_SetDefaultDllDirectories()
{
#ifndef UNDER_CE
IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN
OSVERSIONINFO vi;
vi.dwOSVersionInfoSize = sizeof(vi);
if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0)
{
Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories)
MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories");
if (setDllDirs)
if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS))
return;
}
#endif
}
void LoadSecurityDlls(void)
void LoadSecurityDlls()
{
#ifndef UNDER_CE
// at Vista (ver 6.0) : CoCreateInstance(CLSID_ShellLink, ...) doesn't work after SetDefaultDllDirectories() : Check it ???
IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN
wchar_t buf[MAX_PATH + 100];
{
wchar_t buf[MAX_PATH + 100];
const char *dll;
unsigned pos = GetSystemDirectoryW(buf, MAX_PATH + 2);
if (pos == 0 || pos > MAX_PATH)
// at Vista (ver 6.0) : CoCreateInstance(CLSID_ShellLink, ...) doesn't work after SetDefaultDllDirectories() : Check it ???
OSVERSIONINFO vi;
vi.dwOSVersionInfoSize = sizeof(vi);
if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0)
{
Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories)
MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories");
if (setDllDirs)
if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS))
return;
}
}
{
unsigned len = GetSystemDirectoryW(buf, MAX_PATH + 2);
if (len == 0 || len > MAX_PATH)
return;
}
{
const char *dll;
unsigned pos = (unsigned)lstrlenW(buf);
if (buf[pos - 1] != '\\')
buf[pos++] = '\\';
for (dll = g_Dlls; *dll != 0;)
for (dll = g_Dlls; dll[0] != 0;)
{
wchar_t *dest = &buf[pos];
unsigned k = 0;
for (;;)
{
const char c = *dll++;
char c = *dll++;
buf[pos + k] = (Byte)c;
k++;
if (c == 0)
break;
*dest++ = (Byte)c;
}
dest[0] = '.';
dest[1] = 'd';
dest[2] = 'l';
dest[3] = 'l';
dest[4] = 0;
// lstrcatW(buf, L".dll");
lstrcatW(buf, L".dll");
LoadLibraryExW(buf, NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
}
}
#endif
}
#endif // _WIN32
#endif
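Both entry points are meant to run before anything that can trigger a delayed DLL load. A usage sketch follows; the call order is an assumption, not taken from 7-Zip's own main:

#include "DllSecur.h"

int main(void)
{
#ifdef _WIN32
  My_SetDefaultDllDirectories();  /* restrict future LoadLibrary() search paths */
  LoadSecurityDlls();             /* pre-load known system DLLs by full path */
#endif
  /* ... application code ... */
  return 0;
}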

View file

@ -1,8 +1,8 @@
/* DllSecur.h -- DLL loading for security
2023-03-03 : Igor Pavlov : Public domain */
2018-02-19 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_DLL_SECUR_H
#define ZIP7_INC_DLL_SECUR_H
#ifndef __DLL_SECUR_H
#define __DLL_SECUR_H
#include "7zTypes.h"

View file

@ -1,125 +1,60 @@
/* HuffEnc.c -- functions for Huffman encoding
Igor Pavlov : Public domain */
2021-02-09 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include <string.h>
#include "HuffEnc.h"
#include "Sort.h"
#include "CpuArch.h"
#define kMaxLen Z7_HUFFMAN_LEN_MAX
#define NUM_BITS 10
#define MASK ((1u << NUM_BITS) - 1)
#define FREQ_MASK (~(UInt32)MASK)
#define NUM_COUNTERS (48 * 2)
#define kMaxLen 16
#define NUM_BITS 10
#define MASK (((unsigned)1 << NUM_BITS) - 1)
#if 1 && (defined(MY_CPU_LE) || defined(MY_CPU_BE))
#if defined(MY_CPU_LE)
#define HI_HALF_OFFSET 1
#else
#define HI_HALF_OFFSET 0
#endif
#define LOAD_PARENT(p) ((unsigned)*((const UInt16 *)(p) + HI_HALF_OFFSET))
#define STORE_PARENT(p, fb, val) *((UInt16 *)(p) + HI_HALF_OFFSET) = (UInt16)(val);
#define STORE_PARENT_DIRECT(p, fb, hi) STORE_PARENT(p, fb, hi)
#define UPDATE_E(eHi) eHi++;
#else
#define LOAD_PARENT(p) ((unsigned)(*(p) >> NUM_BITS))
#define STORE_PARENT_DIRECT(p, fb, hi) *(p) = ((fb) & MASK) | (hi); // set parent field
#define STORE_PARENT(p, fb, val) STORE_PARENT_DIRECT(p, fb, ((UInt32)(val) << NUM_BITS))
#define UPDATE_E(eHi) eHi += 1 << NUM_BITS;
#endif
#define NUM_COUNTERS 64
void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, unsigned numSymbols, unsigned maxLen)
#define HUFFMAN_SPEED_OPT
void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymbols, UInt32 maxLen)
{
#if NUM_COUNTERS > 2
unsigned counters[NUM_COUNTERS];
#endif
#if 1 && NUM_COUNTERS > (kMaxLen + 4) * 2
#define lenCounters (counters)
#define codes (counters + kMaxLen + 4)
#else
unsigned lenCounters[kMaxLen + 1];
UInt32 codes[kMaxLen + 1];
#endif
unsigned num;
UInt32 num = 0;
/* if (maxLen > 10) maxLen = 10; */
{
unsigned i;
// UInt32 sum = 0;
#if NUM_COUNTERS > 2
UInt32 i;
#define CTR_ITEM_FOR_FREQ(freq) \
counters[(freq) >= NUM_COUNTERS - 1 ? NUM_COUNTERS - 1 : (unsigned)(freq)]
#ifdef HUFFMAN_SPEED_OPT
UInt32 counters[NUM_COUNTERS];
for (i = 0; i < NUM_COUNTERS; i++)
counters[i] = 0;
memset(lens, 0, numSymbols);
{
const UInt32 *fp = freqs + numSymbols;
#define NUM_UNROLLS 1
#if NUM_UNROLLS > 1 // use 1 if odd (numSymbols) is possible
if (numSymbols & 1)
{
UInt32 f;
f = *--fp; CTR_ITEM_FOR_FREQ(f)++;
// sum += f;
}
#endif
do
{
UInt32 f;
fp -= NUM_UNROLLS;
f = fp[0]; CTR_ITEM_FOR_FREQ(f)++;
// sum += f;
#if NUM_UNROLLS > 1
f = fp[1]; CTR_ITEM_FOR_FREQ(f)++;
// sum += f;
#endif
}
while (fp != freqs);
}
#if 0
printf("\nsum=%8u numSymbols =%3u ctrs:", sum, numSymbols);
{
unsigned k = 0;
for (k = 0; k < NUM_COUNTERS; k++)
printf(" %u", counters[k]);
}
#endif
num = counters[1];
counters[1] = 0;
for (i = 2; i != NUM_COUNTERS; i += 2)
{
unsigned c;
c = (counters )[i]; (counters )[i] = num; num += c;
c = (counters + 1)[i]; (counters + 1)[i] = num; num += c;
}
counters[0] = num; // we want to write (freq==0) symbols to the end of (p) array
{
i = 0;
do
{
const UInt32 f = freqs[i];
#if 0
if (f == 0) lens[i] = 0; else
#endif
p[CTR_ITEM_FOR_FREQ(f)++] = i | (f << NUM_BITS);
}
while (++i != numSymbols);
}
HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]);
#else // NUM_COUNTERS <= 2
num = 0;
for (i = 0; i < numSymbols; i++)
{
const UInt32 freq = freqs[i];
UInt32 freq = freqs[i];
counters[(freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1]++;
}
for (i = 1; i < NUM_COUNTERS; i++)
{
UInt32 temp = counters[i];
counters[i] = num;
num += temp;
}
for (i = 0; i < numSymbols; i++)
{
UInt32 freq = freqs[i];
if (freq == 0)
lens[i] = 0;
else
p[counters[((freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1)]++] = i | (freq << NUM_BITS);
}
counters[0] = 0;
HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]);
#else
for (i = 0; i < numSymbols; i++)
{
UInt32 freq = freqs[i];
if (freq == 0)
lens[i] = 0;
else
@ -127,27 +62,17 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, unsigned numSy
}
HeapSort(p, num);
#endif
#endif
}
if (num <= 2)
if (num < 2)
{
unsigned minCode = 0;
unsigned maxCode = 1;
if (num)
if (num == 1)
{
maxCode = (unsigned)p[(size_t)num - 1] & MASK;
if (num == 2)
{
minCode = (unsigned)p[0] & MASK;
if (minCode > maxCode)
{
const unsigned temp = minCode;
minCode = maxCode;
maxCode = temp;
}
}
else if (maxCode == 0)
maxCode = (unsigned)p[0] & MASK;
if (maxCode == 0)
maxCode++;
}
p[minCode] = 0;
@ -155,206 +80,69 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, unsigned numSy
lens[minCode] = lens[maxCode] = 1;
return;
}
{
unsigned i;
for (i = 0; i <= kMaxLen; i++)
lenCounters[i] = 0;
lenCounters[1] = 2; // by default root node has 2 child leaves at level 1.
}
// if (num != 2)
{
// num > 2
// the binary tree will contain (num - 1) internal nodes.
// p[num - 2] will be root node of binary tree.
UInt32 *b;
UInt32 *n;
// first node will have two leaf children: p[0] and p[1]:
// p[0] += p[1] & FREQ_MASK; // set frequency sum of child leaves
// if (pi == n) exit(0);
// if (pi != n)
UInt32 b, e, i;
i = b = e = 0;
do
{
UInt32 fb = (p[1] & FREQ_MASK) + p[0];
UInt32 f = p[2] & FREQ_MASK;
const UInt32 *pi = p + 2;
UInt32 *e = p;
UInt32 eHi = 0;
n = p + num;
b = p;
// p[0] = fb;
for (;;)
{
// (b <= e)
UInt32 sum;
e++;
UPDATE_E(eHi)
// (b < e)
// p range : high bits
// [0, b) : parent : processed nodes that have parent and children
// [b, e) : FREQ : non-processed nodes that have no parent but have children
// [e, pi) : FREQ : processed leaves for which parent node was created
// [pi, n) : FREQ : non-processed leaves for which parent node was not created
// first child
// note : (*b < f) is same result as ((*b & FREQ_MASK) < f)
if (fb < f)
{
// node freq is smaller
sum = fb & FREQ_MASK;
STORE_PARENT_DIRECT (b, fb, eHi)
b++;
fb = *b;
if (b == e)
{
if (++pi == n)
break;
sum += f;
fb &= MASK;
fb |= sum;
*e = fb;
f = *pi & FREQ_MASK;
continue;
}
}
else if (++pi == n)
{
STORE_PARENT_DIRECT (b, fb, eHi)
b++;
break;
}
else
{
sum = f;
f = *pi & FREQ_MASK;
}
// (b < e)
// second child
if (fb < f)
{
sum += fb;
sum &= FREQ_MASK;
STORE_PARENT_DIRECT (b, fb, eHi)
b++;
*e = (*e & MASK) | sum; // set frequency sum
// (b <= e) is possible here
fb = *b;
}
else if (++pi == n)
break;
else
{
sum += f;
f = *pi & FREQ_MASK;
*e = (*e & MASK) | sum; // set frequency sum
}
}
UInt32 n, m, freq;
n = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++;
freq = (p[n] & ~MASK);
p[n] = (p[n] & MASK) | (e << NUM_BITS);
m = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++;
freq += (p[m] & ~MASK);
p[m] = (p[m] & MASK) | (e << NUM_BITS);
p[e] = (p[e] & MASK) | freq;
e++;
}
while (num - e > 1);
// printf("\nnum-e=%3u, numSymbols=%3u, num=%3u, b=%3u", n - e, numSymbols, n - p, b - p);
{
n -= 2;
*n &= MASK; // root node : we clear high bits (zero bits mean level == 0)
if (n != b)
UInt32 lenCounters[kMaxLen + 1];
for (i = 0; i <= kMaxLen; i++)
lenCounters[i] = 0;
p[--e] &= MASK;
lenCounters[1] = 2;
while (e > 0)
{
// We go here, if we have some number of non-created nodes up to root.
// We process them in simplified code:
// position of parent for each pair of nodes is known.
// n[-2], n[-1] : current pair of child nodes
// (p1) : parent node for current pair.
UInt32 *p1 = n;
do
{
const unsigned len = LOAD_PARENT(p1) + 1;
p1--;
(lenCounters )[len] -= 2; // we remove 2 leaves from level (len)
(lenCounters + 1)[len] += 2 * 2; // we add 4 leaves at level (len + 1)
n -= 2;
STORE_PARENT (n , n[0], len)
STORE_PARENT (n + 1, n[1], len)
}
while (n != b);
}
}
if (b != p)
{
// we detect level of each node (relative to root),
// and update lenCounters[].
// We process only intermediate nodes and we don't process leaves.
do
{
// if (ii < b) : parent_bits_of (p[ii]) == index of parent node : ii < (p[ii])
// if (ii >= b) : parent_bits_of (p[ii]) == level of this (ii) node in tree
unsigned len;
b--;
len = (unsigned)LOAD_PARENT(p + LOAD_PARENT(b)) + 1;
STORE_PARENT (b, *b, len)
UInt32 len = (p[p[--e] >> NUM_BITS] >> NUM_BITS) + 1;
p[e] = (p[e] & MASK) | (len << NUM_BITS);
if (len >= maxLen)
for (len = maxLen - 1; lenCounters[len] == 0; len--);
lenCounters[len]--;
lenCounters[(size_t)len + 1] += 2;
}
{
UInt32 len;
i = 0;
for (len = maxLen; len != 0; len--)
{
// We are not allowed to create node at level (maxLen) and higher,
// because all leaves must be placed to level (maxLen) or lower.
// We find nearest allowed leaf and place current node to level of that leaf:
for (len = maxLen - 1; lenCounters[len] == 0; len--) {}
UInt32 k;
for (k = lenCounters[len]; k != 0; k--)
lens[p[i++] & MASK] = (Byte)len;
}
lenCounters[len]--; // we remove 1 leaf from level (len)
(lenCounters + 1)[len] += 2; // we add 2 leaves at level (len + 1)
}
while (b != p);
}
}
{
{
unsigned len = maxLen;
const UInt32 *p2 = p;
do
{
unsigned k = lenCounters[len];
if (k)
do
lens[(unsigned)*p2++ & MASK] = (Byte)len;
while (--k);
UInt32 nextCodes[kMaxLen + 1];
{
UInt32 code = 0;
UInt32 len;
for (len = 1; len <= kMaxLen; len++)
nextCodes[len] = code = (code + lenCounters[(size_t)len - 1]) << 1;
}
/* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */
{
UInt32 k;
for (k = 0; k < numSymbols; k++)
p[k] = nextCodes[lens[k]]++;
}
}
while (--len);
}
codes[0] = 0; // we don't want garbage values to be written to p[] array.
// codes[1] = 0;
{
UInt32 code = 0;
unsigned len;
for (len = 0; len < kMaxLen; len++)
(codes + 1)[len] = code = (code + lenCounters[len]) << 1;
}
/* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */
{
const Byte * const limit = lens + numSymbols;
do
{
unsigned len;
UInt32 c;
len = lens[0]; c = codes[len]; p[0] = c; codes[len] = c + 1;
// len = lens[1]; c = codes[len]; p[1] = c; codes[len] = c + 1;
p += 1;
lens += 1;
}
while (lens != limit);
}
}
}
#undef kMaxLen
#undef NUM_BITS
#undef MASK
#undef FREQ_MASK
#undef NUM_COUNTERS
#undef CTR_ITEM_FOR_FREQ
#undef LOAD_PARENT
#undef STORE_PARENT
#undef STORE_PARENT_DIRECT
#undef UPDATE_E
#undef HI_HALF_OFFSET
#undef NUM_UNROLLS
#undef lenCounters
#undef codes
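Both versions of the function end the same way: code lengths are turned into canonical codes by seeding a per-length counter from the counts of all shorter lengths. A standalone restatement of that final step (a sketch, not the library code):

#include <stdint.h>

#define SKETCH_MAX_LEN 16

/* Sketch: canonical Huffman code assignment from code lengths. */
static void AssignCanonicalCodes(const uint8_t *lens, uint32_t *codes, unsigned numSymbols)
{
  uint32_t lenCounters[SKETCH_MAX_LEN + 1] = { 0 };
  uint32_t nextCodes[SKETCH_MAX_LEN + 1];
  uint32_t code = 0;
  unsigned i, len;
  for (i = 0; i < numSymbols; i++)
    lenCounters[lens[i]]++;
  lenCounters[0] = 0;  /* symbols with no code must not shift the numbering */
  for (len = 1; len <= SKETCH_MAX_LEN; len++)
    nextCodes[len] = code = (code + lenCounters[len - 1]) << 1;
  for (i = 0; i < numSymbols; i++)
    if (lens[i])
      codes[i] = nextCodes[lens[i]]++;
}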

View file

@ -1,21 +1,21 @@
/* HuffEnc.h -- Huffman encoding
Igor Pavlov : Public domain */
2013-01-18 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_HUFF_ENC_H
#define ZIP7_INC_HUFF_ENC_H
#ifndef __HUFF_ENC_H
#define __HUFF_ENC_H
#include "7zTypes.h"
EXTERN_C_BEGIN
#define Z7_HUFFMAN_LEN_MAX 16
/*
Conditions:
2 <= num <= 1024 = 2 ^ NUM_BITS
num <= 1024 = 2 ^ NUM_BITS
Sum(freqs) < 4M = 2 ^ (32 - NUM_BITS)
1 <= maxLen <= 16 = Z7_HUFFMAN_LEN_MAX
maxLen <= 16 = kMaxLen
Num_Items(p) >= HUFFMAN_TEMP_SIZE(num)
*/
void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 num, UInt32 maxLen);
EXTERN_C_END
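A usage sketch under the stated conditions; the sizes follow Deflate's literal/length alphabet purely as an example, and the codes[] buffer both serves as temp space and receives the final codes, per the Num_Items(p) condition:

#include "HuffEnc.h"

/* Sketch: build 15-bit-max canonical codes for a 286-symbol alphabet. */
void BuildCodes(const UInt32 freqs[286], UInt32 codes[286], Byte lens[286])
{
  Huffman_Generate(freqs, codes, lens, 286, 15);
}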

File diff suppressed because it is too large

View file

@ -1,8 +1,8 @@
/* LzFind.h -- Match finder for LZ algorithms
2024-01-22 : Igor Pavlov : Public domain */
2021-07-13 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZ_FIND_H
#define ZIP7_INC_LZ_FIND_H
#ifndef __LZ_FIND_H
#define __LZ_FIND_H
#include "7zTypes.h"
@ -10,9 +10,9 @@ EXTERN_C_BEGIN
typedef UInt32 CLzRef;
typedef struct
typedef struct _CMatchFinder
{
const Byte *buffer;
Byte *buffer;
UInt32 pos;
UInt32 posLimit;
UInt32 streamPos; /* wrap over Zero is allowed (streamPos < pos). Use (UInt32)(streamPos - pos) */
@ -32,8 +32,8 @@ typedef struct
UInt32 hashMask;
UInt32 cutValue;
Byte *bufBase;
ISeqInStreamPtr stream;
Byte *bufferBase;
ISeqInStream *stream;
UInt32 blockSize;
UInt32 keepSizeBefore;
@ -43,9 +43,7 @@ typedef struct
size_t directInputRem;
UInt32 historySize;
UInt32 fixedHashSize;
Byte numHashBytes_Min;
Byte numHashOutBits;
Byte _pad2_[2];
UInt32 hashSizeSum;
SRes result;
UInt32 crc[256];
size_t numRefs;
@ -71,45 +69,24 @@ void MatchFinder_ReadIfRequired(CMatchFinder *p);
void MatchFinder_Construct(CMatchFinder *p);
/* (directInput = 0) is default value.
It's required to provide correct (directInput) value
before calling MatchFinder_Create().
You can set (directInput) by any of the following calls:
- MatchFinder_SET_DIRECT_INPUT_BUF()
- MatchFinder_SET_STREAM()
- MatchFinder_SET_STREAM_MODE()
/* Conditions:
historySize <= 3 GB
keepAddBufferBefore + matchMaxLen + keepAddBufferAfter < 511MB
*/
#define MatchFinder_SET_DIRECT_INPUT_BUF(p, _src_, _srcLen_) { \
(p)->stream = NULL; \
(p)->directInput = 1; \
(p)->buffer = (_src_); \
(p)->directInputRem = (_srcLen_); }
/*
#define MatchFinder_SET_STREAM_MODE(p) { \
(p)->directInput = 0; }
*/
#define MatchFinder_SET_STREAM(p, _stream_) { \
(p)->stream = _stream_; \
(p)->directInput = 0; }
int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
ISzAllocPtr alloc);
void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc);
void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems);
// void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue);
/*
#define MatchFinder_INIT_POS(p, val) \
#define Inline_MatchFinder_InitPos(p, val) \
(p)->pos = (val); \
(p)->streamPos = (val);
*/
// void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue);
#define MatchFinder_REDUCE_OFFSETS(p, subValue) \
#define Inline_MatchFinder_ReduceOffsets(p, subValue) \
(p)->pos -= (subValue); \
(p)->streamPos -= (subValue);
@ -130,7 +107,7 @@ typedef const Byte * (*Mf_GetPointerToCurrentPos_Func)(void *object);
typedef UInt32 * (*Mf_GetMatches_Func)(void *object, UInt32 *distances);
typedef void (*Mf_Skip_Func)(void *object, UInt32);
typedef struct
typedef struct _IMatchFinder
{
Mf_Init_Func Init;
Mf_GetNumAvailableBytes_Func GetNumAvailableBytes;
@ -144,8 +121,7 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable);
void MatchFinder_Init_LowHash(CMatchFinder *p);
void MatchFinder_Init_HighHash(CMatchFinder *p);
void MatchFinder_Init_4(CMatchFinder *p);
// void MatchFinder_Init(CMatchFinder *p);
void MatchFinder_Init(void *p);
void MatchFinder_Init(CMatchFinder *p);
UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances);
UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances);

View file

@ -1,5 +1,5 @@
/* LzFindMt.c -- multithreaded Match finder for LZ algorithms
: Igor Pavlov : Public domain */
2021-12-21 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -69,7 +69,7 @@ extern UInt64 g_NumIters_Bytes;
UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); }
#define MT_HASH4_CALC { \
#define __MT_HASH4_CALC { \
UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
h2 = temp & (kHash2Size - 1); \
temp ^= ((UInt32)cur[2] << 8); \
@ -79,16 +79,14 @@ extern UInt64 g_NumIters_Bytes;
*/
Z7_NO_INLINE
MY_NO_INLINE
static void MtSync_Construct(CMtSync *p)
{
p->affinityGroup = -1;
p->affinityInGroup = 0;
p->affinity = 0;
p->wasCreated = False;
p->csWasInitialized = False;
p->csWasEntered = False;
Thread_CONSTRUCT(&p->thread)
Thread_Construct(&p->thread);
Event_Construct(&p->canStart);
Event_Construct(&p->wasStopped);
Semaphore_Construct(&p->freeSemaphore);
@ -96,7 +94,7 @@ static void MtSync_Construct(CMtSync *p)
}
// #define DEBUG_BUFFER_LOCK // define it to debug lock state
#define DEBUG_BUFFER_LOCK // define it to debug lock state
#ifdef DEBUG_BUFFER_LOCK
#include <stdlib.h>
@ -118,7 +116,7 @@ static void MtSync_Construct(CMtSync *p)
(p)->csWasEntered = False; }
Z7_NO_INLINE
MY_NO_INLINE
static UInt32 MtSync_GetNextBlock(CMtSync *p)
{
UInt32 numBlocks = 0;
@ -142,14 +140,14 @@ static UInt32 MtSync_GetNextBlock(CMtSync *p)
// buffer is UNLOCKED here
Semaphore_Wait(&p->filledSemaphore);
LOCK_BUFFER(p)
LOCK_BUFFER(p);
return numBlocks;
}
/* if Writing (Processing) thread was started, we must call MtSync_StopWriting() */
Z7_NO_INLINE
MY_NO_INLINE
static void MtSync_StopWriting(CMtSync *p)
{
if (!Thread_WasCreated(&p->thread) || p->needStart)
@ -187,7 +185,7 @@ static void MtSync_StopWriting(CMtSync *p)
}
Z7_NO_INLINE
MY_NO_INLINE
static void MtSync_Destruct(CMtSync *p)
{
PRF(printf("\nMtSync_Destruct %p\n", p));
@ -222,11 +220,11 @@ static void MtSync_Destruct(CMtSync *p)
// #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; }
// we want to get real system error codes here instead of SZ_ERROR_THREAD
#define RINOK_THREAD(x) RINOK_WRes(x)
#define RINOK_THREAD(x) RINOK(x)
// call it before each new file (when new starting is required):
Z7_NO_INLINE
MY_NO_INLINE
static SRes MtSync_Init(CMtSync *p, UInt32 numBlocks)
{
WRes wres;
@ -247,12 +245,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *
if (p->wasCreated)
return SZ_OK;
RINOK_THREAD(CriticalSection_Init(&p->cs))
RINOK_THREAD(CriticalSection_Init(&p->cs));
p->csWasInitialized = True;
p->csWasEntered = False;
RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart))
RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped))
RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart));
RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped));
p->needStart = True;
p->exit = True; /* p->exit is unused before (canStart) Event.
@ -261,24 +259,18 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *
// return ERROR_TOO_MANY_POSTS; // for debug
// return EINVAL; // for debug
#ifdef _WIN32
if (p->affinityGroup >= 0)
wres = Thread_Create_With_Group(&p->thread, startAddress, obj,
(unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup);
else
#endif
if (p->affinity != 0)
wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity);
else
wres = Thread_Create(&p->thread, startAddress, obj);
RINOK_THREAD(wres)
RINOK_THREAD(wres);
p->wasCreated = True;
return SZ_OK;
}
Z7_NO_INLINE
MY_NO_INLINE
static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj)
{
const WRes wres = MtSync_Create_WRes(p, startAddress, obj);
@ -527,7 +519,7 @@ static void HashThreadFunc(CMatchFinderMt *mt)
if (mf->pos > (UInt32)kMtMaxValForNormalize - num)
{
const UInt32 subValue = (mf->pos - mf->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1);
MatchFinder_REDUCE_OFFSETS(mf, subValue)
Inline_MatchFinder_ReduceOffsets(mf, subValue);
MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1);
}
@ -568,7 +560,7 @@ static void HashThreadFunc(CMatchFinderMt *mt)
*/
UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,
size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,
UInt32 *posRes);
@ -757,7 +749,7 @@ static void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex)
}
Z7_NO_INLINE
MY_NO_INLINE
static void BtThreadFunc(CMatchFinderMt *mt)
{
CMtSync *p = &mt->btSync;
@ -872,22 +864,21 @@ SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddB
if (!MatchFinder_Create(mf, historySize, keepAddBufferBefore, matchMaxLen, keepAddBufferAfter, alloc))
return SZ_ERROR_MEM;
RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p))
RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p))
RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p));
RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p));
return SZ_OK;
}
SRes MatchFinderMt_InitMt(CMatchFinderMt *p)
{
RINOK(MtSync_Init(&p->hashSync, kMtHashNumBlocks))
RINOK(MtSync_Init(&p->hashSync, kMtHashNumBlocks));
return MtSync_Init(&p->btSync, kMtBtNumBlocks);
}
static void MatchFinderMt_Init(void *_p)
static void MatchFinderMt_Init(CMatchFinderMt *p)
{
CMatchFinderMt *p = (CMatchFinderMt *)_p;
CMatchFinder *mf = MF(p);
p->btBufPos =
@ -950,7 +941,7 @@ void MatchFinderMt_ReleaseStream(CMatchFinderMt *p)
}
Z7_NO_INLINE
MY_NO_INLINE
static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p)
{
if (p->failure_LZ_BT)
@ -990,9 +981,8 @@ static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p)
static const Byte * MatchFinderMt_GetPointerToCurrentPos(void *_p)
static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p)
{
CMatchFinderMt *p = (CMatchFinderMt *)_p;
return p->pointerToCurPos;
}
@ -1000,9 +990,8 @@ static const Byte * MatchFinderMt_GetPointerToCurrentPos(void *_p)
#define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p);
static UInt32 MatchFinderMt_GetNumAvailableBytes(void *_p)
static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p)
{
CMatchFinderMt *p = (CMatchFinderMt *)_p;
if (p->btBufPos != p->btBufPosLimit)
return p->btNumAvailBytes;
return MatchFinderMt_GetNextBlock_Bt(p);
@ -1174,7 +1163,7 @@ UInt32* MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d)
*/
static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
{
UInt32 h2, h3, /* h4, */ c2, c3 /* , c4 */;
UInt32 *hash = p->hash;
@ -1190,8 +1179,9 @@ static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
(hash + kFix3HashSize)[h3] = m;
// (hash + kFix4HashSize)[h4] = m;
// #define BT5_USE_H2
// #ifdef BT5_USE_H2
#define _USE_H2
#ifdef _USE_H2
if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
{
d[1] = m - c2 - 1;
@ -1207,8 +1197,8 @@ static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
}
d[0] = 3;
d += 2;
#ifdef BT5_USE_H4
#ifdef _USE_H4
if (c4 >= matchMinPos)
if (
cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] &&
@ -1224,7 +1214,7 @@ static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
d[0] = 2;
d += 2;
}
// #endif
#endif
if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
{
@ -1238,7 +1228,7 @@ static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
d += 2;
}
#ifdef BT5_USE_H4
#ifdef _USE_H4
if (c4 >= matchMinPos)
if (
cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] &&
@ -1254,9 +1244,8 @@ static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
}
static UInt32 * MatchFinderMt2_GetMatches(void *_p, UInt32 *d)
static UInt32* MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d)
{
CMatchFinderMt *p = (CMatchFinderMt *)_p;
const UInt32 *bt = p->btBufPos;
const UInt32 len = *bt++;
const UInt32 *btLim = bt + len;
@ -1279,9 +1268,8 @@ static UInt32 * MatchFinderMt2_GetMatches(void *_p, UInt32 *d)
static UInt32 * MatchFinderMt_GetMatches(void *_p, UInt32 *d)
static UInt32* MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d)
{
CMatchFinderMt *p = (CMatchFinderMt *)_p;
const UInt32 *bt = p->btBufPos;
UInt32 len = *bt++;
const UInt32 avail = p->btNumAvailBytes - 1;
@ -1328,16 +1316,14 @@ static UInt32 * MatchFinderMt_GetMatches(void *_p, UInt32 *d)
#define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash;
#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0);
static void MatchFinderMt0_Skip(void *_p, UInt32 num)
static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num)
{
CMatchFinderMt *p = (CMatchFinderMt *)_p;
SKIP_HEADER2_MT { p->btNumAvailBytes--;
SKIP_FOOTER_MT
}
static void MatchFinderMt2_Skip(void *_p, UInt32 num)
static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num)
{
CMatchFinderMt *p = (CMatchFinderMt *)_p;
SKIP_HEADER_MT(2)
UInt32 h2;
MT_HASH2_CALC
@ -1345,9 +1331,8 @@ static void MatchFinderMt2_Skip(void *_p, UInt32 num)
SKIP_FOOTER_MT
}
static void MatchFinderMt3_Skip(void *_p, UInt32 num)
static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num)
{
CMatchFinderMt *p = (CMatchFinderMt *)_p;
SKIP_HEADER_MT(3)
UInt32 h2, h3;
MT_HASH3_CALC
@ -1377,46 +1362,39 @@ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num)
void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable)
{
vTable->Init = MatchFinderMt_Init;
vTable->GetNumAvailableBytes = MatchFinderMt_GetNumAvailableBytes;
vTable->GetPointerToCurrentPos = MatchFinderMt_GetPointerToCurrentPos;
vTable->GetMatches = MatchFinderMt_GetMatches;
vTable->Init = (Mf_Init_Func)MatchFinderMt_Init;
vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes;
vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos;
vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches;
switch (MF(p)->numHashBytes)
{
case 2:
p->GetHeadsFunc = GetHeads2;
p->MixMatchesFunc = NULL;
vTable->Skip = MatchFinderMt0_Skip;
vTable->GetMatches = MatchFinderMt2_GetMatches;
p->MixMatchesFunc = (Mf_Mix_Matches)NULL;
vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip;
vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches;
break;
case 3:
p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3;
p->MixMatchesFunc = MixMatches2;
vTable->Skip = MatchFinderMt2_Skip;
p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2;
vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip;
break;
case 4:
p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4;
// it's fast inline version of GetMatches()
// vTable->GetMatches = MatchFinderMt_GetMatches_Bt4;
// vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4;
p->MixMatchesFunc = MixMatches3;
vTable->Skip = MatchFinderMt3_Skip;
p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3;
vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip;
break;
default:
p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5;
p->MixMatchesFunc = MixMatches4;
p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4;
vTable->Skip =
MatchFinderMt3_Skip;
// MatchFinderMt4_Skip;
(Mf_Skip_Func)MatchFinderMt3_Skip;
// (Mf_Skip_Func)MatchFinderMt4_Skip;
break;
}
}
#undef RINOK_THREAD
#undef PRF
#undef MF
#undef GetUi24hi_from32
#undef LOCK_BUFFER
#undef UNLOCK_BUFFER

View file

@ -1,21 +1,19 @@
/* LzFindMt.h -- multithreaded Match finder for LZ algorithms
: Igor Pavlov : Public domain */
2021-07-12 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZ_FIND_MT_H
#define ZIP7_INC_LZ_FIND_MT_H
#ifndef __LZ_FIND_MT_H
#define __LZ_FIND_MT_H
#include "LzFind.h"
#include "Threads.h"
EXTERN_C_BEGIN
typedef struct
typedef struct _CMtSync
{
UInt32 numProcessedBlocks;
Int32 affinityGroup;
UInt64 affinityInGroup;
UInt64 affinity;
CThread thread;
UInt64 affinity;
BoolInt wasCreated;
BoolInt needStart;
@ -33,10 +31,7 @@ typedef struct
// UInt32 numBlocks_Sent;
} CMtSync;
struct CMatchFinderMt_;
typedef UInt32 * (*Mf_Mix_Matches)(struct CMatchFinderMt_ *p, UInt32 matchMinPos, UInt32 *distances);
typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distances);
/* kMtCacheLineDummy must be >= size_of_CPU_cache_line */
#define kMtCacheLineDummy 128
@ -44,7 +39,7 @@ typedef UInt32 * (*Mf_Mix_Matches)(struct CMatchFinderMt_ *p, UInt32 matchMinPos
typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos,
UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc);
typedef struct CMatchFinderMt_
typedef struct _CMatchFinderMt
{
/* LZ */
const Byte *pointerToCurPos;

View file

@ -1,5 +1,5 @@
/* LzFindOpt.c -- multithreaded Match finder for LZ algorithms
2023-04-02 : Igor Pavlov : Public domain */
2021-07-13 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -41,8 +41,8 @@ UInt64 g_NumIters_Bytes;
// #define CYC_TO_POS_OFFSET 1 // for debug
/*
Z7_NO_INLINE
UInt32 * Z7_FASTCALL GetMatchesSpecN_1(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
MY_NO_INLINE
UInt32 * MY_FAST_CALL GetMatchesSpecN_1(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 *posRes)
{
do
@ -214,13 +214,13 @@ else
to eliminate "movsx" BUG in old MSVC x64 compiler.
*/
UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,
size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,
UInt32 *posRes);
Z7_NO_INLINE
UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
MY_NO_INLINE
UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,
size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,
UInt32 *posRes)
@ -404,7 +404,7 @@ else
/*
typedef UInt32 uint32plus; // size_t
UInt32 * Z7_FASTCALL GetMatchesSpecN_3(uint32plus lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 * MY_FAST_CALL GetMatchesSpecN_3(uint32plus lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 _cutValue, UInt32 *d, uint32plus _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,
size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,
UInt32 *posRes)
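
The renames in this file (MY_NO_INLINE to Z7_NO_INLINE, MY_FAST_CALL to Z7_FASTCALL) touch only macro spellings; in both generations the macros expand to compiler-specific attributes. A sketch of how such portability macros are typically defined (illustrative definitions, not the SDK's actual ones):

#if defined(_MSC_VER)
  #define MY_NOINLINE  __declspec(noinline)
  #define MY_FASTCALL  __fastcall
#elif defined(__GNUC__)
  #define MY_NOINLINE  __attribute__((noinline))
  #if defined(__i386__)
    #define MY_FASTCALL __attribute__((fastcall))
  #else
    #define MY_FASTCALL          /* calling conventions are a no-op on x64 */
  #endif
#else
  #define MY_NOINLINE
  #define MY_FASTCALL
#endif

MY_NOINLINE static int MY_FASTCALL add(int a, int b) { return a + b; }

int main(void) { return add(2, 2) == 4 ? 0 : 1; }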

View file

@ -1,8 +1,8 @@
/* LzHash.h -- HASH constants for LZ algorithms
2023-03-05 : Igor Pavlov : Public domain */
/* LzHash.h -- HASH functions for LZ algorithms
2019-10-30 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZ_HASH_H
#define ZIP7_INC_LZ_HASH_H
#ifndef __LZ_HASH_H
#define __LZ_HASH_H
/*
(kHash2Size >= (1 << 8)) : Required
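
The guard rename here is not cosmetic: identifiers that begin with two underscores (such as __LZ_HASH_H) are reserved for the implementation in both C and C++, so the newer ZIP7_INC_ prefix moves the include guards into user namespace. The pattern, on a hypothetical header:

/* example.h (hypothetical file): guard in the newer convention */
#ifndef ZIP7_INC_EXAMPLE_H
#define ZIP7_INC_EXAMPLE_H

/* declarations ... */

#endif /* ZIP7_INC_EXAMPLE_H */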

View file

@ -1,5 +1,5 @@
/* Lzma2Dec.c -- LZMA2 Decoder
2024-03-01 : Igor Pavlov : Public domain */
2021-02-09 : Igor Pavlov : Public domain */
/* #define SHOW_DEBUG_INFO */
@ -71,14 +71,14 @@ static SRes Lzma2Dec_GetOldProps(Byte prop, Byte *props)
SRes Lzma2Dec_AllocateProbs(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc)
{
Byte props[LZMA_PROPS_SIZE];
RINOK(Lzma2Dec_GetOldProps(prop, props))
RINOK(Lzma2Dec_GetOldProps(prop, props));
return LzmaDec_AllocateProbs(&p->decoder, props, LZMA_PROPS_SIZE, alloc);
}
SRes Lzma2Dec_Allocate(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc)
{
Byte props[LZMA_PROPS_SIZE];
RINOK(Lzma2Dec_GetOldProps(prop, props))
RINOK(Lzma2Dec_GetOldProps(prop, props));
return LzmaDec_Allocate(&p->decoder, props, LZMA_PROPS_SIZE, alloc);
}
@ -157,10 +157,8 @@ static unsigned Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b)
p->decoder.prop.lp = (Byte)lp;
return LZMA2_STATE_DATA;
}
default:
return LZMA2_STATE_ERROR;
}
return LZMA2_STATE_ERROR;
}
static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size)
@ -476,8 +474,8 @@ SRes Lzma2Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen,
SizeT outSize = *destLen, inSize = *srcLen;
*destLen = *srcLen = 0;
*status = LZMA_STATUS_NOT_SPECIFIED;
Lzma2Dec_CONSTRUCT(&p)
RINOK(Lzma2Dec_AllocateProbs(&p, prop, alloc))
Lzma2Dec_Construct(&p);
RINOK(Lzma2Dec_AllocateProbs(&p, prop, alloc));
p.decoder.dic = dest;
p.decoder.dicBufSize = outSize;
Lzma2Dec_Init(&p);
@ -489,5 +487,3 @@ SRes Lzma2Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen,
Lzma2Dec_FreeProbs(&p, alloc);
return res;
}
#undef PRF
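
Both sides above use RINOK to bubble a failing SRes out of the current function; the visible change is only that the newer call sites drop the trailing semicolon, since the braced macro is already a complete statement. A self-contained sketch of an error-propagation macro in that spirit (illustrative definition; the SDK's real one lives in its common headers):

#include <stdio.h>

typedef int SRes;
#define SZ_OK 0
#define SZ_ERROR_MEM 2

/* early-return on any non-OK result */
#define RINOK(x) { SRes res_ = (x); if (res_ != SZ_OK) return res_; }

static SRes Step(int fail) { return fail ? SZ_ERROR_MEM : SZ_OK; }

static SRes Run(int fail)
{
  RINOK(Step(0))
  RINOK(Step(fail))   /* returns SZ_ERROR_MEM early when fail != 0 */
  return SZ_OK;
}

int main(void)
{
  printf("%d %d\n", Run(0), Run(1));  /* prints: 0 2 */
  return 0;
}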

View file

@ -1,8 +1,8 @@
/* Lzma2Dec.h -- LZMA2 Decoder
2023-03-03 : Igor Pavlov : Public domain */
2018-02-19 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZMA2_DEC_H
#define ZIP7_INC_LZMA2_DEC_H
#ifndef __LZMA2_DEC_H
#define __LZMA2_DEC_H
#include "LzmaDec.h"
@ -22,10 +22,9 @@ typedef struct
CLzmaDec decoder;
} CLzma2Dec;
#define Lzma2Dec_CONSTRUCT(p) LzmaDec_CONSTRUCT(&(p)->decoder)
#define Lzma2Dec_Construct(p) Lzma2Dec_CONSTRUCT(p)
#define Lzma2Dec_FreeProbs(p, alloc) LzmaDec_FreeProbs(&(p)->decoder, alloc)
#define Lzma2Dec_Free(p, alloc) LzmaDec_Free(&(p)->decoder, alloc)
#define Lzma2Dec_Construct(p) LzmaDec_Construct(&(p)->decoder)
#define Lzma2Dec_FreeProbs(p, alloc) LzmaDec_FreeProbs(&(p)->decoder, alloc)
#define Lzma2Dec_Free(p, alloc) LzmaDec_Free(&(p)->decoder, alloc)
SRes Lzma2Dec_AllocateProbs(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc);
SRes Lzma2Dec_Allocate(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc);
@ -91,7 +90,7 @@ Lzma2Dec_GetUnpackExtra() returns the value that shows
at current input position.
*/
#define Lzma2Dec_GetUnpackExtra(p) ((p)->isExtraMode ? (p)->unpackSize : 0)
#define Lzma2Dec_GetUnpackExtra(p) ((p)->isExtraMode ? (p)->unpackSize : 0);
/* ---------- One Call Interface ---------- */
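
One fix in this hunk is easy to miss: the old Lzma2Dec_GetUnpackExtra ended with a stray semicolon, which makes a function-like macro unusable inside larger expressions and in if/else bodies. A tiny illustration (hypothetical macro and struct names):

#define GET_A_BAD(p)  ((p)->a);   /* expression macro with trailing ';' */
#define GET_A_GOOD(p) ((p)->a)

struct S { int a; };

static int use(struct S *p)
{
  /* int x = GET_A_BAD(p) + 1;   would not compile:
     it expands to ((p)->a); + 1;                     */
  int x = GET_A_GOOD(p) + 1;     /* fine */
  return x;
}

int main(void)
{
  struct S s = { 1 };
  return use(&s) == 2 ? 0 : 1;
}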

View file

@ -1,44 +1,44 @@
/* Lzma2DecMt.c -- LZMA2 Decoder Multi-thread
2023-04-13 : Igor Pavlov : Public domain */
2021-04-01 : Igor Pavlov : Public domain */
#include "Precomp.h"
// #define SHOW_DEBUG_INFO
// #define Z7_ST
// #define _7ZIP_ST
#ifdef SHOW_DEBUG_INFO
#include <stdio.h>
#endif
#ifndef _7ZIP_ST
#ifdef SHOW_DEBUG_INFO
#define PRF(x) x
#else
#define PRF(x)
#endif
#define PRF_STR(s) PRF(printf("\n" s "\n"))
#define PRF_STR_INT_2(s, d1, d2) PRF(printf("\n" s " %d %d\n", (unsigned)d1, (unsigned)d2))
#endif
#include "Alloc.h"
#include "Lzma2Dec.h"
#include "Lzma2DecMt.h"
#ifndef Z7_ST
#ifndef _7ZIP_ST
#include "MtDec.h"
#define LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT (1 << 28)
#endif
#ifndef Z7_ST
#ifdef SHOW_DEBUG_INFO
#define PRF(x) x
#else
#define PRF(x)
#endif
#define PRF_STR(s) PRF(printf("\n" s "\n");)
#define PRF_STR_INT_2(s, d1, d2) PRF(printf("\n" s " %d %d\n", (unsigned)d1, (unsigned)d2);)
#endif
void Lzma2DecMtProps_Init(CLzma2DecMtProps *p)
{
p->inBufSize_ST = 1 << 20;
p->outStep_ST = 1 << 20;
#ifndef Z7_ST
#ifndef _7ZIP_ST
p->numThreads = 1;
p->inBufSize_MT = 1 << 18;
p->outBlockMax = LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT;
@ -48,7 +48,7 @@ void Lzma2DecMtProps_Init(CLzma2DecMtProps *p)
#ifndef Z7_ST
#ifndef _7ZIP_ST
/* ---------- CLzma2DecMtThread ---------- */
@ -81,7 +81,7 @@ typedef struct
/* ---------- CLzma2DecMt ---------- */
struct CLzma2DecMt
typedef struct
{
// ISzAllocPtr alloc;
ISzAllocPtr allocMid;
@ -90,9 +90,9 @@ struct CLzma2DecMt
CLzma2DecMtProps props;
Byte prop;
ISeqInStreamPtr inStream;
ISeqOutStreamPtr outStream;
ICompressProgressPtr progress;
ISeqInStream *inStream;
ISeqOutStream *outStream;
ICompressProgress *progress;
BoolInt finishMode;
BoolInt outSize_Defined;
@ -111,13 +111,14 @@ struct CLzma2DecMt
size_t inPos;
size_t inLim;
#ifndef Z7_ST
#ifndef _7ZIP_ST
UInt64 outProcessed_Parse;
BoolInt mtc_WasConstructed;
CMtDec mtc;
CLzma2DecMtThread coders[MTDEC_THREADS_MAX];
CLzma2DecMtThread coders[MTDEC__THREADS_MAX];
#endif
};
} CLzma2DecMt;
@ -141,11 +142,11 @@ CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid)
// Lzma2DecMtProps_Init(&p->props);
#ifndef Z7_ST
#ifndef _7ZIP_ST
p->mtc_WasConstructed = False;
{
unsigned i;
for (i = 0; i < MTDEC_THREADS_MAX; i++)
for (i = 0; i < MTDEC__THREADS_MAX; i++)
{
CLzma2DecMtThread *t = &p->coders[i];
t->dec_created = False;
@ -155,16 +156,16 @@ CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid)
}
#endif
return (CLzma2DecMtHandle)(void *)p;
return p;
}
#ifndef Z7_ST
#ifndef _7ZIP_ST
static void Lzma2DecMt_FreeOutBufs(CLzma2DecMt *p)
{
unsigned i;
for (i = 0; i < MTDEC_THREADS_MAX; i++)
for (i = 0; i < MTDEC__THREADS_MAX; i++)
{
CLzma2DecMtThread *t = &p->coders[i];
if (t->outBuf)
@ -195,15 +196,13 @@ static void Lzma2DecMt_FreeSt(CLzma2DecMt *p)
}
// #define GET_CLzma2DecMt_p CLzma2DecMt *p = (CLzma2DecMt *)(void *)pp;
void Lzma2DecMt_Destroy(CLzma2DecMtHandle p)
void Lzma2DecMt_Destroy(CLzma2DecMtHandle pp)
{
// GET_CLzma2DecMt_p
CLzma2DecMt *p = (CLzma2DecMt *)pp;
Lzma2DecMt_FreeSt(p);
#ifndef Z7_ST
#ifndef _7ZIP_ST
if (p->mtc_WasConstructed)
{
@ -212,7 +211,7 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle p)
}
{
unsigned i;
for (i = 0; i < MTDEC_THREADS_MAX; i++)
for (i = 0; i < MTDEC__THREADS_MAX; i++)
{
CLzma2DecMtThread *t = &p->coders[i];
if (t->dec_created)
@ -227,19 +226,19 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle p)
#endif
ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, p);
ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, pp);
}
#ifndef Z7_ST
#ifndef _7ZIP_ST
static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCallbackInfo *cc)
{
CLzma2DecMt *me = (CLzma2DecMt *)obj;
CLzma2DecMtThread *t = &me->coders[coderIndex];
PRF_STR_INT_2("Parse", coderIndex, cc->srcSize)
PRF_STR_INT_2("Parse", coderIndex, cc->srcSize);
cc->state = MTDEC_PARSE_CONTINUE;
@ -247,7 +246,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa
{
if (!t->dec_created)
{
Lzma2Dec_CONSTRUCT(&t->dec)
Lzma2Dec_Construct(&t->dec);
t->dec_created = True;
AlignOffsetAlloc_CreateVTable(&t->alloc);
{
@ -298,7 +297,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa
// that must be finished at position <= outBlockMax.
{
const size_t srcOrig = cc->srcSize;
const SizeT srcOrig = cc->srcSize;
SizeT srcSize_Point = 0;
SizeT dicPos_Point = 0;
@ -307,10 +306,10 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa
for (;;)
{
SizeT srcCur = (SizeT)(srcOrig - cc->srcSize);
SizeT srcCur = srcOrig - cc->srcSize;
status = Lzma2Dec_Parse(&t->dec,
(SizeT)limit - t->dec.decoder.dicPos,
limit - t->dec.decoder.dicPos,
cc->src + cc->srcSize, &srcCur,
checkFinishBlock);
@ -334,7 +333,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa
if (t->dec.decoder.dicPos >= (1 << 14))
break;
dicPos_Point = t->dec.decoder.dicPos;
srcSize_Point = (SizeT)cc->srcSize;
srcSize_Point = cc->srcSize;
continue;
}
@ -392,7 +391,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa
if (unpackRem != 0)
{
/* we also reserve space for max possible number of output bytes of current LZMA chunk */
size_t rem = limit - dicPos;
SizeT rem = limit - dicPos;
if (rem > unpackRem)
rem = unpackRem;
dicPos += rem;
@ -445,7 +444,7 @@ static SRes Lzma2DecMt_MtCallback_PreCode(void *pp, unsigned coderIndex)
}
t->dec.decoder.dic = dest;
t->dec.decoder.dicBufSize = (SizeT)t->outPreSize;
t->dec.decoder.dicBufSize = t->outPreSize;
t->needInit = True;
@ -463,7 +462,7 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex,
UNUSED_VAR(srcFinished)
PRF_STR_INT_2("Code", coderIndex, srcSize)
PRF_STR_INT_2("Code", coderIndex, srcSize);
*inCodePos = t->inCodeSize;
*outCodePos = 0;
@ -477,13 +476,13 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex,
{
ELzmaStatus status;
SizeT srcProcessed = (SizeT)srcSize;
size_t srcProcessed = srcSize;
BoolInt blockWasFinished =
((int)t->parseStatus == LZMA_STATUS_FINISHED_WITH_MARK
|| t->parseStatus == LZMA2_PARSE_STATUS_NEW_BLOCK);
SRes res = Lzma2Dec_DecodeToDic(&t->dec,
(SizeT)t->outPreSize,
t->outPreSize,
src, &srcProcessed,
blockWasFinished ? LZMA_FINISH_END : LZMA_FINISH_ANY,
&status);
@ -541,7 +540,7 @@ static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex,
UNUSED_VAR(srcSize)
UNUSED_VAR(isCross)
PRF_STR_INT_2("Write", coderIndex, srcSize)
PRF_STR_INT_2("Write", coderIndex, srcSize);
*needContinue = False;
*canRecode = True;
@ -589,7 +588,7 @@ static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex,
*needContinue = needContinue2;
return SZ_OK;
}
RINOK(MtProgress_ProgressAdd(&me->mtc.mtProgress, 0, 0))
RINOK(MtProgress_ProgressAdd(&me->mtc.mtProgress, 0, 0));
}
}
@ -612,11 +611,11 @@ static SRes Lzma2Dec_Prepare_ST(CLzma2DecMt *p)
{
if (!p->dec_created)
{
Lzma2Dec_CONSTRUCT(&p->dec)
Lzma2Dec_Construct(&p->dec);
p->dec_created = True;
}
RINOK(Lzma2Dec_Allocate(&p->dec, p->prop, &p->alignOffsetAlloc.vt))
RINOK(Lzma2Dec_Allocate(&p->dec, p->prop, &p->alignOffsetAlloc.vt));
if (!p->inBuf || p->inBufSize != p->props.inBufSize_ST)
{
@ -635,7 +634,7 @@ static SRes Lzma2Dec_Prepare_ST(CLzma2DecMt *p)
static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
#ifndef Z7_ST
#ifndef _7ZIP_ST
, BoolInt tMode
#endif
)
@ -647,7 +646,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
CLzma2Dec *dec;
#ifndef Z7_ST
#ifndef _7ZIP_ST
if (tMode)
{
Lzma2DecMt_FreeOutBufs(p);
@ -655,7 +654,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
}
#endif
RINOK(Lzma2Dec_Prepare_ST(p))
RINOK(Lzma2Dec_Prepare_ST(p));
dec = &p->dec;
@ -682,7 +681,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
if (inPos == inLim)
{
#ifndef Z7_ST
#ifndef _7ZIP_ST
if (tMode)
{
inData = MtDec_Read(&p->mtc, &inLim);
@ -711,7 +710,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
{
SizeT next = dec->decoder.dicBufSize;
if (next - wrPos > p->props.outStep_ST)
next = wrPos + (SizeT)p->props.outStep_ST;
next = wrPos + p->props.outStep_ST;
size = next - dicPos;
}
@ -727,7 +726,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
}
}
inProcessed = (SizeT)(inLim - inPos);
inProcessed = inLim - inPos;
res = Lzma2Dec_DecodeToDic(dec, dicPos + size, inData + inPos, &inProcessed, finishMode, &status);
@ -756,7 +755,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
dec->decoder.dicPos = 0;
wrPos = dec->decoder.dicPos;
RINOK(res2)
RINOK(res2);
if (needStop)
{
@ -789,7 +788,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
UInt64 outDelta = p->outProcessed - outPrev;
if (inDelta >= (1 << 22) || outDelta >= (1 << 22))
{
RINOK(ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed))
RINOK(ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed));
inPrev = p->inProcessed;
outPrev = p->outProcessed;
}
@ -799,20 +798,20 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp,
Byte prop,
const CLzma2DecMtProps *props,
ISeqOutStreamPtr outStream, const UInt64 *outDataSize, int finishMode,
ISeqOutStream *outStream, const UInt64 *outDataSize, int finishMode,
// Byte *outBuf, size_t *outBufSize,
ISeqInStreamPtr inStream,
ISeqInStream *inStream,
// const Byte *inData, size_t inDataSize,
UInt64 *inProcessed,
// UInt64 *outProcessed,
int *isMT,
ICompressProgressPtr progress)
ICompressProgress *progress)
{
// GET_CLzma2DecMt_p
#ifndef Z7_ST
CLzma2DecMt *p = (CLzma2DecMt *)pp;
#ifndef _7ZIP_ST
BoolInt tMode;
#endif
@ -846,7 +845,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
*isMT = False;
#ifndef Z7_ST
#ifndef _7ZIP_ST
tMode = False;
@ -940,7 +939,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
p->readWasFinished = p->mtc.readWasFinished;
p->inProcessed = p->mtc.inProcessed;
PRF_STR("----- decoding ST -----")
PRF_STR("----- decoding ST -----");
}
}
@ -951,7 +950,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
{
SRes res = Lzma2Dec_Decode_ST(p
#ifndef Z7_ST
#ifndef _7ZIP_ST
, tMode
#endif
);
@ -968,7 +967,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
res = p->readRes;
/*
#ifndef Z7_ST
#ifndef _7ZIP_ST
if (res == SZ_OK && tMode && p->mtc.parseRes != SZ_OK)
res = p->mtc.parseRes;
#endif
@ -981,13 +980,13 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
/* ---------- Read from CLzma2DecMtHandle Interface ---------- */
SRes Lzma2DecMt_Init(CLzma2DecMtHandle p,
SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp,
Byte prop,
const CLzma2DecMtProps *props,
const UInt64 *outDataSize, int finishMode,
ISeqInStreamPtr inStream)
ISeqInStream *inStream)
{
// GET_CLzma2DecMt_p
CLzma2DecMt *p = (CLzma2DecMt *)pp;
if (prop > 40)
return SZ_ERROR_UNSUPPORTED;
@ -1016,11 +1015,11 @@ SRes Lzma2DecMt_Init(CLzma2DecMtHandle p,
}
SRes Lzma2DecMt_Read(CLzma2DecMtHandle p,
SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp,
Byte *data, size_t *outSize,
UInt64 *inStreamProcessed)
{
// GET_CLzma2DecMt_p
CLzma2DecMt *p = (CLzma2DecMt *)pp;
ELzmaFinishMode finishMode;
SRes readRes;
size_t size = *outSize;
@ -1056,8 +1055,8 @@ SRes Lzma2DecMt_Read(CLzma2DecMtHandle p,
readRes = ISeqInStream_Read(p->inStream, p->inBuf, &p->inLim);
}
inCur = (SizeT)(p->inLim - p->inPos);
outCur = (SizeT)size;
inCur = p->inLim - p->inPos;
outCur = size;
res = Lzma2Dec_DecodeToBuf(&p->dec, data, &outCur,
p->inBuf + p->inPos, &inCur, finishMode, &status);
@ -1089,7 +1088,3 @@ SRes Lzma2DecMt_Read(CLzma2DecMtHandle p,
return readRes;
}
}
#undef PRF
#undef PRF_STR
#undef PRF_STR_INT_2

View file

@ -1,8 +1,8 @@
/* Lzma2DecMt.h -- LZMA2 Decoder Multi-thread
2023-04-13 : Igor Pavlov : Public domain */
2018-02-17 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZMA2_DEC_MT_H
#define ZIP7_INC_LZMA2_DEC_MT_H
#ifndef __LZMA2_DEC_MT_H
#define __LZMA2_DEC_MT_H
#include "7zTypes.h"
@ -13,7 +13,7 @@ typedef struct
size_t inBufSize_ST;
size_t outStep_ST;
#ifndef Z7_ST
#ifndef _7ZIP_ST
unsigned numThreads;
size_t inBufSize_MT;
size_t outBlockMax;
@ -38,9 +38,7 @@ SRes:
SZ_ERROR_THREAD - error in multithreading functions (only for Mt version)
*/
typedef struct CLzma2DecMt CLzma2DecMt;
typedef CLzma2DecMt * CLzma2DecMtHandle;
// Z7_DECLARE_HANDLE(CLzma2DecMtHandle)
typedef void * CLzma2DecMtHandle;
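
This typedef pair shows the handle hardening that runs through the whole diff: typedef void * CLzma2DecMtHandle becomes a pointer to a declared-but-undefined struct, so passing the wrong handle type becomes a compile-time error instead of silent corruption. A compact sketch of the opaque-handle pattern, with hypothetical names:

#include <stdlib.h>

/* public view: the tag is declared, the layout stays hidden */
typedef struct Counter Counter;

/* private definition (would normally live in the .c file) */
struct Counter { int total; };

static Counter *Counter_Create(void)
{
  Counter *c = (Counter *)malloc(sizeof *c);
  if (c)
    c->total = 0;
  return c;
}
static void Counter_Add(Counter *c, int v) { c->total += v; }
static int  Counter_Get(const Counter *c)  { return c->total; }
static void Counter_Destroy(Counter *c)    { free(c); }

int main(void)
{
  Counter *c = Counter_Create();
  int ok = 0;
  if (c)
  {
    Counter_Add(c, 42);
    ok = (Counter_Get(c) == 42);  /* a mismatched handle type would not compile */
    Counter_Destroy(c);
  }
  return ok ? 0 : 1;
}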
CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid);
void Lzma2DecMt_Destroy(CLzma2DecMtHandle p);
@ -48,11 +46,11 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle p);
SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
Byte prop,
const CLzma2DecMtProps *props,
ISeqOutStreamPtr outStream,
ISeqOutStream *outStream,
const UInt64 *outDataSize, // NULL means undefined
int finishMode, // 0 - partial unpacking is allowed, 1 - if lzma2 stream must be finished
// Byte *outBuf, size_t *outBufSize,
ISeqInStreamPtr inStream,
ISeqInStream *inStream,
// const Byte *inData, size_t inDataSize,
// out variables:
@ -60,7 +58,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
int *isMT, /* out: (*isMT == 0), if single thread decoding was used */
// UInt64 *outProcessed,
ICompressProgressPtr progress);
ICompressProgress *progress);
/* ---------- Read from CLzma2DecMtHandle Interface ---------- */
@ -69,7 +67,7 @@ SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp,
Byte prop,
const CLzma2DecMtProps *props,
const UInt64 *outDataSize, int finishMode,
ISeqInStreamPtr inStream);
ISeqInStream *inStream);
SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp,
Byte *data, size_t *outSize,

View file

@ -1,18 +1,18 @@
/* Lzma2Enc.c -- LZMA2 Encoder
: Igor Pavlov : Public domain */
2021-02-09 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include <string.h>
/* #define Z7_ST */
/* #define _7ZIP_ST */
#include "Lzma2Enc.h"
#ifndef Z7_ST
#ifndef _7ZIP_ST
#include "MtCoder.h"
#else
#define MTCODER_THREADS_MAX 1
#define MTCODER__THREADS_MAX 1
#endif
#define LZMA2_CONTROL_LZMA (1 << 7)
@ -40,7 +40,7 @@
typedef struct
{
ISeqInStream vt;
ISeqInStreamPtr realStream;
ISeqInStream *realStream;
UInt64 limit;
UInt64 processed;
int finished;
@ -53,15 +53,15 @@ static void LimitedSeqInStream_Init(CLimitedSeqInStream *p)
p->finished = 0;
}
static SRes LimitedSeqInStream_Read(ISeqInStreamPtr pp, void *data, size_t *size)
static SRes LimitedSeqInStream_Read(const ISeqInStream *pp, void *data, size_t *size)
{
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CLimitedSeqInStream)
CLimitedSeqInStream *p = CONTAINER_FROM_VTBL(pp, CLimitedSeqInStream, vt);
size_t size2 = *size;
SRes res = SZ_OK;
if (p->limit != (UInt64)(Int64)-1)
{
const UInt64 rem = p->limit - p->processed;
UInt64 rem = p->limit - p->processed;
if (size2 > rem)
size2 = (size_t)rem;
}
@ -95,8 +95,8 @@ static SRes Lzma2EncInt_InitStream(CLzma2EncInt *p, const CLzma2EncProps *props)
{
SizeT propsSize = LZMA_PROPS_SIZE;
Byte propsEncoded[LZMA_PROPS_SIZE];
RINOK(LzmaEnc_SetProps(p->enc, &props->lzmaProps))
RINOK(LzmaEnc_WriteProperties(p->enc, propsEncoded, &propsSize))
RINOK(LzmaEnc_SetProps(p->enc, &props->lzmaProps));
RINOK(LzmaEnc_WriteProperties(p->enc, propsEncoded, &propsSize));
p->propsByte = propsEncoded[0];
p->propsAreSet = True;
}
@ -111,23 +111,23 @@ static void Lzma2EncInt_InitBlock(CLzma2EncInt *p)
}
SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle p, ISeqInStreamPtr inStream, UInt32 keepWindowSize,
SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, ISeqInStream *inStream, UInt32 keepWindowSize,
ISzAllocPtr alloc, ISzAllocPtr allocBig);
SRes LzmaEnc_MemPrepare(CLzmaEncHandle p, const Byte *src, SizeT srcLen,
SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen,
UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig);
SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle p, BoolInt reInit,
SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit,
Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize);
const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle p);
void LzmaEnc_Finish(CLzmaEncHandle p);
void LzmaEnc_SaveState(CLzmaEncHandle p);
void LzmaEnc_RestoreState(CLzmaEncHandle p);
const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle pp);
void LzmaEnc_Finish(CLzmaEncHandle pp);
void LzmaEnc_SaveState(CLzmaEncHandle pp);
void LzmaEnc_RestoreState(CLzmaEncHandle pp);
/*
UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle p);
UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp);
*/
static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf,
size_t *packSizeRes, ISeqOutStreamPtr outStream)
size_t *packSizeRes, ISeqOutStream *outStream)
{
size_t packSizeLimit = *packSizeRes;
size_t packSize = packSizeLimit;
@ -167,7 +167,7 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf,
while (unpackSize > 0)
{
const UInt32 u = (unpackSize < LZMA2_COPY_CHUNK_SIZE) ? unpackSize : LZMA2_COPY_CHUNK_SIZE;
UInt32 u = (unpackSize < LZMA2_COPY_CHUNK_SIZE) ? unpackSize : LZMA2_COPY_CHUNK_SIZE;
if (packSizeLimit - destPos < u + 3)
return SZ_ERROR_OUTPUT_EOF;
outBuf[destPos++] = (Byte)(p->srcPos == 0 ? LZMA2_CONTROL_COPY_RESET_DIC : LZMA2_CONTROL_COPY_NO_RESET);
@ -196,9 +196,9 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf,
{
size_t destPos = 0;
const UInt32 u = unpackSize - 1;
const UInt32 pm = (UInt32)(packSize - 1);
const unsigned mode = (p->srcPos == 0) ? 3 : (p->needInitState ? (p->needInitProp ? 2 : 1) : 0);
UInt32 u = unpackSize - 1;
UInt32 pm = (UInt32)(packSize - 1);
unsigned mode = (p->srcPos == 0) ? 3 : (p->needInitState ? (p->needInitProp ? 2 : 1) : 0);
PRF(printf(" "));
@ -231,11 +231,10 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf,
void Lzma2EncProps_Init(CLzma2EncProps *p)
{
LzmaEncProps_Init(&p->lzmaProps);
p->blockSize = LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO;
p->blockSize = LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO;
p->numBlockThreads_Reduced = -1;
p->numBlockThreads_Max = -1;
p->numTotalThreads = -1;
p->numThreadGroups = 0;
}
void Lzma2EncProps_Normalize(CLzma2EncProps *p)
@ -252,8 +251,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p)
t2 = p->numBlockThreads_Max;
t3 = p->numTotalThreads;
if (t2 > MTCODER_THREADS_MAX)
t2 = MTCODER_THREADS_MAX;
if (t2 > MTCODER__THREADS_MAX)
t2 = MTCODER__THREADS_MAX;
if (t3 <= 0)
{
@ -269,8 +268,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p)
t1 = 1;
t2 = t3;
}
if (t2 > MTCODER_THREADS_MAX)
t2 = MTCODER_THREADS_MAX;
if (t2 > MTCODER__THREADS_MAX)
t2 = MTCODER__THREADS_MAX;
}
else if (t1 <= 0)
{
@ -287,8 +286,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p)
fileSize = p->lzmaProps.reduceSize;
if ( p->blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID
&& p->blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO
if ( p->blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID
&& p->blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO
&& (p->blockSize < fileSize || fileSize == (UInt64)(Int64)-1))
p->lzmaProps.reduceSize = p->blockSize;
@ -298,19 +297,19 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p)
t1 = p->lzmaProps.numThreads;
if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID)
if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID)
{
t2r = t2 = 1;
t3 = t1;
}
else if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO && t2 <= 1)
else if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO && t2 <= 1)
{
/* if there is no block multi-threading, we use SOLID block */
p->blockSize = LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID;
p->blockSize = LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID;
}
else
{
if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO)
if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO)
{
const UInt32 kMinSize = (UInt32)1 << 20;
const UInt32 kMaxSize = (UInt32)1 << 28;
@ -345,7 +344,7 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p)
}
static SRes Progress(ICompressProgressPtr p, UInt64 inSize, UInt64 outSize)
static SRes Progress(ICompressProgress *p, UInt64 inSize, UInt64 outSize)
{
return (p && ICompressProgress_Progress(p, inSize, outSize) != SZ_OK) ? SZ_ERROR_PROGRESS : SZ_OK;
}
@ -353,7 +352,7 @@ static SRes Progress(ICompressProgressPtr p, UInt64 inSize, UInt64 outSize)
/* ---------- Lzma2 ---------- */
struct CLzma2Enc
typedef struct
{
Byte propEncoded;
CLzma2EncProps props;
@ -364,22 +363,23 @@ struct CLzma2Enc
ISzAllocPtr alloc;
ISzAllocPtr allocBig;
CLzma2EncInt coders[MTCODER_THREADS_MAX];
CLzma2EncInt coders[MTCODER__THREADS_MAX];
#ifndef Z7_ST
#ifndef _7ZIP_ST
ISeqOutStreamPtr outStream;
ISeqOutStream *outStream;
Byte *outBuf;
size_t outBuf_Rem; /* remainder in outBuf */
size_t outBufSize; /* size of allocated outBufs[i] */
size_t outBufsDataSizes[MTCODER_BLOCKS_MAX];
size_t outBufsDataSizes[MTCODER__BLOCKS_MAX];
BoolInt mtCoder_WasConstructed;
CMtCoder mtCoder;
Byte *outBufs[MTCODER_BLOCKS_MAX];
Byte *outBufs[MTCODER__BLOCKS_MAX];
#endif
};
} CLzma2Enc;
@ -396,30 +396,30 @@ CLzma2EncHandle Lzma2Enc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig)
p->allocBig = allocBig;
{
unsigned i;
for (i = 0; i < MTCODER_THREADS_MAX; i++)
for (i = 0; i < MTCODER__THREADS_MAX; i++)
p->coders[i].enc = NULL;
}
#ifndef Z7_ST
#ifndef _7ZIP_ST
p->mtCoder_WasConstructed = False;
{
unsigned i;
for (i = 0; i < MTCODER_BLOCKS_MAX; i++)
for (i = 0; i < MTCODER__BLOCKS_MAX; i++)
p->outBufs[i] = NULL;
p->outBufSize = 0;
}
#endif
return (CLzma2EncHandle)p;
return p;
}
#ifndef Z7_ST
#ifndef _7ZIP_ST
static void Lzma2Enc_FreeOutBufs(CLzma2Enc *p)
{
unsigned i;
for (i = 0; i < MTCODER_BLOCKS_MAX; i++)
for (i = 0; i < MTCODER__BLOCKS_MAX; i++)
if (p->outBufs[i])
{
ISzAlloc_Free(p->alloc, p->outBufs[i]);
@ -430,13 +430,12 @@ static void Lzma2Enc_FreeOutBufs(CLzma2Enc *p)
#endif
// #define GET_CLzma2Enc_p CLzma2Enc *p = (CLzma2Enc *)(void *)p;
void Lzma2Enc_Destroy(CLzma2EncHandle p)
void Lzma2Enc_Destroy(CLzma2EncHandle pp)
{
// GET_CLzma2Enc_p
CLzma2Enc *p = (CLzma2Enc *)pp;
unsigned i;
for (i = 0; i < MTCODER_THREADS_MAX; i++)
for (i = 0; i < MTCODER__THREADS_MAX; i++)
{
CLzma2EncInt *t = &p->coders[i];
if (t->enc)
@ -447,7 +446,7 @@ void Lzma2Enc_Destroy(CLzma2EncHandle p)
}
#ifndef Z7_ST
#ifndef _7ZIP_ST
if (p->mtCoder_WasConstructed)
{
MtCoder_Destruct(&p->mtCoder);
@ -459,13 +458,13 @@ void Lzma2Enc_Destroy(CLzma2EncHandle p)
ISzAlloc_Free(p->alloc, p->tempBufLzma);
p->tempBufLzma = NULL;
ISzAlloc_Free(p->alloc, p);
ISzAlloc_Free(p->alloc, pp);
}
SRes Lzma2Enc_SetProps(CLzma2EncHandle p, const CLzma2EncProps *props)
SRes Lzma2Enc_SetProps(CLzma2EncHandle pp, const CLzma2EncProps *props)
{
// GET_CLzma2Enc_p
CLzma2Enc *p = (CLzma2Enc *)pp;
CLzmaEncProps lzmaProps = props->lzmaProps;
LzmaEncProps_Normalize(&lzmaProps);
if (lzmaProps.lc + lzmaProps.lp > LZMA2_LCLP_MAX)
@ -476,16 +475,16 @@ SRes Lzma2Enc_SetProps(CLzma2EncHandle p, const CLzma2EncProps *props)
}
void Lzma2Enc_SetDataSize(CLzma2EncHandle p, UInt64 expectedDataSiize)
void Lzma2Enc_SetDataSize(CLzmaEncHandle pp, UInt64 expectedDataSiize)
{
// GET_CLzma2Enc_p
CLzma2Enc *p = (CLzma2Enc *)pp;
p->expectedDataSize = expectedDataSiize;
}
Byte Lzma2Enc_WriteProperties(CLzma2EncHandle p)
Byte Lzma2Enc_WriteProperties(CLzma2EncHandle pp)
{
// GET_CLzma2Enc_p
CLzma2Enc *p = (CLzma2Enc *)pp;
unsigned i;
UInt32 dicSize = LzmaEncProps_GetDictSize(&p->props.lzmaProps);
for (i = 0; i < 40; i++)
@ -498,12 +497,12 @@ Byte Lzma2Enc_WriteProperties(CLzma2EncHandle p)
static SRes Lzma2Enc_EncodeMt1(
CLzma2Enc *me,
CLzma2EncInt *p,
ISeqOutStreamPtr outStream,
ISeqOutStream *outStream,
Byte *outBuf, size_t *outBufSize,
ISeqInStreamPtr inStream,
ISeqInStream *inStream,
const Byte *inData, size_t inDataSize,
int finished,
ICompressProgressPtr progress)
ICompressProgress *progress)
{
UInt64 unpackTotal = 0;
UInt64 packTotal = 0;
@ -541,12 +540,12 @@ static SRes Lzma2Enc_EncodeMt1(
}
}
RINOK(Lzma2EncInt_InitStream(p, &me->props))
RINOK(Lzma2EncInt_InitStream(p, &me->props));
for (;;)
{
SRes res = SZ_OK;
SizeT inSizeCur = 0;
size_t inSizeCur = 0;
Lzma2EncInt_InitBlock(p);
@ -560,7 +559,7 @@ static SRes Lzma2Enc_EncodeMt1(
if (me->expectedDataSize != (UInt64)(Int64)-1
&& me->expectedDataSize >= unpackTotal)
expected = me->expectedDataSize - unpackTotal;
if (me->props.blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID
if (me->props.blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID
&& expected > me->props.blockSize)
expected = (size_t)me->props.blockSize;
@ -570,14 +569,14 @@ static SRes Lzma2Enc_EncodeMt1(
&limitedInStream.vt,
LZMA2_KEEP_WINDOW_SIZE,
me->alloc,
me->allocBig))
me->allocBig));
}
else
{
inSizeCur = (SizeT)(inDataSize - (size_t)unpackTotal);
if (me->props.blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID
inSizeCur = inDataSize - (size_t)unpackTotal;
if (me->props.blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID
&& inSizeCur > me->props.blockSize)
inSizeCur = (SizeT)(size_t)me->props.blockSize;
inSizeCur = (size_t)me->props.blockSize;
// LzmaEnc_SetDataSize(p->enc, inSizeCur);
@ -585,7 +584,7 @@ static SRes Lzma2Enc_EncodeMt1(
inData + (size_t)unpackTotal, inSizeCur,
LZMA2_KEEP_WINDOW_SIZE,
me->alloc,
me->allocBig))
me->allocBig));
}
for (;;)
@ -622,7 +621,7 @@ static SRes Lzma2Enc_EncodeMt1(
unpackTotal += p->srcPos;
RINOK(res)
RINOK(res);
if (p->srcPos != (inStream ? limitedInStream.processed : inSizeCur))
return SZ_ERROR_FAIL;
@ -653,12 +652,12 @@ static SRes Lzma2Enc_EncodeMt1(
#ifndef Z7_ST
#ifndef _7ZIP_ST
static SRes Lzma2Enc_MtCallback_Code(void *p, unsigned coderIndex, unsigned outBufIndex,
static SRes Lzma2Enc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBufIndex,
const Byte *src, size_t srcSize, int finished)
{
CLzma2Enc *me = (CLzma2Enc *)p;
CLzma2Enc *me = (CLzma2Enc *)pp;
size_t destSize = me->outBufSize;
SRes res;
CMtProgressThunk progressThunk;
@ -693,9 +692,9 @@ static SRes Lzma2Enc_MtCallback_Code(void *p, unsigned coderIndex, unsigned outB
}
static SRes Lzma2Enc_MtCallback_Write(void *p, unsigned outBufIndex)
static SRes Lzma2Enc_MtCallback_Write(void *pp, unsigned outBufIndex)
{
CLzma2Enc *me = (CLzma2Enc *)p;
CLzma2Enc *me = (CLzma2Enc *)pp;
size_t size = me->outBufsDataSizes[outBufIndex];
const Byte *data = me->outBufs[outBufIndex];
@ -714,14 +713,14 @@ static SRes Lzma2Enc_MtCallback_Write(void *p, unsigned outBufIndex)
SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
ISeqOutStreamPtr outStream,
SRes Lzma2Enc_Encode2(CLzma2EncHandle pp,
ISeqOutStream *outStream,
Byte *outBuf, size_t *outBufSize,
ISeqInStreamPtr inStream,
ISeqInStream *inStream,
const Byte *inData, size_t inDataSize,
ICompressProgressPtr progress)
ICompressProgress *progress)
{
// GET_CLzma2Enc_p
CLzma2Enc *p = (CLzma2Enc *)pp;
if (inStream && inData)
return SZ_ERROR_PARAM;
@ -731,11 +730,11 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
{
unsigned i;
for (i = 0; i < MTCODER_THREADS_MAX; i++)
for (i = 0; i < MTCODER__THREADS_MAX; i++)
p->coders[i].propsAreSet = False;
}
#ifndef Z7_ST
#ifndef _7ZIP_ST
if (p->props.numBlockThreads_Reduced > 1)
{
@ -773,7 +772,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
return SZ_ERROR_PARAM; /* SZ_ERROR_MEM */
{
const size_t destBlockSize = p->mtCoder.blockSize + (p->mtCoder.blockSize >> 10) + 16;
size_t destBlockSize = p->mtCoder.blockSize + (p->mtCoder.blockSize >> 10) + 16;
if (destBlockSize < p->mtCoder.blockSize)
return SZ_ERROR_PARAM;
if (p->outBufSize != destBlockSize)
@ -782,11 +781,10 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
}
p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max;
p->mtCoder.numThreadGroups = p->props.numThreadGroups;
p->mtCoder.expectedDataSize = p->expectedDataSize;
{
const SRes res = MtCoder_Code(&p->mtCoder);
SRes res = MtCoder_Code(&p->mtCoder);
if (!outStream)
*outBufSize = (size_t)(p->outBuf - outBuf);
return res;
@ -803,5 +801,3 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
True, /* finished */
progress);
}
#undef PRF

View file

@ -1,15 +1,15 @@
/* Lzma2Enc.h -- LZMA2 Encoder
2023-04-13 : Igor Pavlov : Public domain */
2017-07-27 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZMA2_ENC_H
#define ZIP7_INC_LZMA2_ENC_H
#ifndef __LZMA2_ENC_H
#define __LZMA2_ENC_H
#include "LzmaEnc.h"
EXTERN_C_BEGIN
#define LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO 0
#define LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID ((UInt64)(Int64)-1)
#define LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO 0
#define LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID ((UInt64)(Int64)-1)
typedef struct
{
@ -18,7 +18,6 @@ typedef struct
int numBlockThreads_Reduced;
int numBlockThreads_Max;
int numTotalThreads;
unsigned numThreadGroups; // 0 : no groups
} CLzma2EncProps;
void Lzma2EncProps_Init(CLzma2EncProps *p);
@ -37,9 +36,7 @@ SRes:
SZ_ERROR_THREAD - error in multithreading functions (only for Mt version)
*/
typedef struct CLzma2Enc CLzma2Enc;
typedef CLzma2Enc * CLzma2EncHandle;
// Z7_DECLARE_HANDLE(CLzma2EncHandle)
typedef void * CLzma2EncHandle;
CLzma2EncHandle Lzma2Enc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig);
void Lzma2Enc_Destroy(CLzma2EncHandle p);
@ -47,11 +44,11 @@ SRes Lzma2Enc_SetProps(CLzma2EncHandle p, const CLzma2EncProps *props);
void Lzma2Enc_SetDataSize(CLzma2EncHandle p, UInt64 expectedDataSiize);
Byte Lzma2Enc_WriteProperties(CLzma2EncHandle p);
SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
ISeqOutStreamPtr outStream,
ISeqOutStream *outStream,
Byte *outBuf, size_t *outBufSize,
ISeqInStreamPtr inStream,
ISeqInStream *inStream,
const Byte *inData, size_t inDataSize,
ICompressProgressPtr progress);
ICompressProgress *progress);
EXTERN_C_END

View file

@ -1,8 +1,8 @@
/* Lzma86.h -- LZMA + x86 (BCJ) Filter
2023-03-03 : Igor Pavlov : Public domain */
2013-01-18 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZMA86_H
#define ZIP7_INC_LZMA86_H
#ifndef __LZMA86_H
#define __LZMA86_H
#include "7zTypes.h"

View file

@ -1,5 +1,5 @@
/* Lzma86Dec.c -- LZMA + x86 (BCJ) Filter Decoder
2023-03-03 : Igor Pavlov : Public domain */
2016-05-16 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -46,8 +46,9 @@ SRes Lzma86_Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen)
return res;
if (useFilter == 1)
{
UInt32 x86State = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
z7_BranchConvSt_X86_Dec(dest, *destLen, 0, &x86State);
UInt32 x86State;
x86_Convert_Init(x86State);
x86_Convert(dest, *destLen, 0, &x86State, 0);
}
return SZ_OK;
}

View file

@ -1,5 +1,5 @@
/* Lzma86Enc.c -- LZMA + x86 (BCJ) Filter Encoder
2023-03-03 : Igor Pavlov : Public domain */
2018-07-04 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -46,8 +46,9 @@ int Lzma86_Encode(Byte *dest, size_t *destLen, const Byte *src, size_t srcLen,
memcpy(filteredStream, src, srcLen);
}
{
UInt32 x86State = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
z7_BranchConvSt_X86_Enc(filteredStream, srcLen, 0, &x86State);
UInt32 x86State;
x86_Convert_Init(x86State);
x86_Convert(filteredStream, srcLen, 0, &x86State, 1);
}
}
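
Both converter generations above do the same thing: before compression, relative x86 CALL/JMP targets are rewritten to absolute addresses so identical call sites become identical byte strings for the match finder, and the decoder applies the inverse transform. A round-trip sketch using the newer entry points exactly as they appear in this diff (assuming they are declared in the SDK's Bra.h, as the older x86_Convert was):

#include "Bra.h"

void FilterRoundTrip(Byte *buf, SizeT size)
{
  UInt32 st;

  st = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
  z7_BranchConvSt_X86_Enc(buf, size, 0, &st);  /* encode: rel -> abs */

  st = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
  z7_BranchConvSt_X86_Dec(buf, size, 0, &st);  /* decode: abs -> rel */
}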

View file

@ -1,5 +1,5 @@
/* LzmaDec.c -- LZMA Decoder
2023-04-07 : Igor Pavlov : Public domain */
2021-04-01 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -8,15 +8,15 @@
/* #include "CpuArch.h" */
#include "LzmaDec.h"
// #define kNumTopBits 24
#define kTopValue ((UInt32)1 << 24)
#define kNumTopBits 24
#define kTopValue ((UInt32)1 << kNumTopBits)
#define kNumBitModelTotalBits 11
#define kBitModelTotal (1 << kNumBitModelTotalBits)
#define RC_INIT_SIZE 5
#ifndef Z7_LZMA_DEC_OPT
#ifndef _LZMA_DEC_OPT
#define kNumMoveBits 5
#define NORMALIZE if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); }
@ -25,14 +25,14 @@
#define UPDATE_0(p) range = bound; *(p) = (CLzmaProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
#define UPDATE_1(p) range -= bound; code -= bound; *(p) = (CLzmaProb)(ttt - (ttt >> kNumMoveBits));
#define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \
{ UPDATE_0(p) i = (i + i); A0; } else \
{ UPDATE_1(p) i = (i + i) + 1; A1; }
{ UPDATE_0(p); i = (i + i); A0; } else \
{ UPDATE_1(p); i = (i + i) + 1; A1; }
#define TREE_GET_BIT(probs, i) { GET_BIT2(probs + i, i, ;, ;); }
#define REV_BIT(p, i, A0, A1) IF_BIT_0(p + i) \
{ UPDATE_0(p + i) A0; } else \
{ UPDATE_1(p + i) A1; }
{ UPDATE_0(p + i); A0; } else \
{ UPDATE_1(p + i); A1; }
#define REV_BIT_VAR( p, i, m) REV_BIT(p, i, i += m; m += m, m += m; i += m; )
#define REV_BIT_CONST(p, i, m) REV_BIT(p, i, i += m; , i += m * 2; )
#define REV_BIT_LAST( p, i, m) REV_BIT(p, i, i -= m , ; )
@ -40,19 +40,19 @@
#define TREE_DECODE(probs, limit, i) \
{ i = 1; do { TREE_GET_BIT(probs, i); } while (i < limit); i -= limit; }
/* #define Z7_LZMA_SIZE_OPT */
/* #define _LZMA_SIZE_OPT */
#ifdef Z7_LZMA_SIZE_OPT
#ifdef _LZMA_SIZE_OPT
#define TREE_6_DECODE(probs, i) TREE_DECODE(probs, (1 << 6), i)
#else
#define TREE_6_DECODE(probs, i) \
{ i = 1; \
TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i); \
TREE_GET_BIT(probs, i); \
TREE_GET_BIT(probs, i); \
TREE_GET_BIT(probs, i); \
TREE_GET_BIT(probs, i); \
TREE_GET_BIT(probs, i); \
i -= 0x40; }
#endif
@ -64,25 +64,25 @@
probLit = prob + (offs + bit + symbol); \
GET_BIT2(probLit, symbol, offs ^= bit; , ;)
#endif // Z7_LZMA_DEC_OPT
#endif // _LZMA_DEC_OPT
#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_INPUT_EOF; range <<= 8; code = (code << 8) | (*buf++); }
#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound)
#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound)
#define UPDATE_0_CHECK range = bound;
#define UPDATE_1_CHECK range -= bound; code -= bound;
#define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \
{ UPDATE_0_CHECK i = (i + i); A0; } else \
{ UPDATE_1_CHECK i = (i + i) + 1; A1; }
{ UPDATE_0_CHECK; i = (i + i); A0; } else \
{ UPDATE_1_CHECK; i = (i + i) + 1; A1; }
#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;)
#define TREE_DECODE_CHECK(probs, limit, i) \
{ i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; }
#define REV_BIT_CHECK(p, i, m) IF_BIT_0_CHECK(p + i) \
{ UPDATE_0_CHECK i += m; m += m; } else \
{ UPDATE_1_CHECK m += m; i += m; }
{ UPDATE_0_CHECK; i += m; m += m; } else \
{ UPDATE_1_CHECK; m += m; i += m; }
#define kNumPosBitsMax 4
@ -224,14 +224,14 @@ Out:
*/
#ifdef Z7_LZMA_DEC_OPT
#ifdef _LZMA_DEC_OPT
int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit);
int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit);
#else
static
int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
{
CLzmaProb *probs = GET_PROBS;
unsigned state = (unsigned)p->state;
@ -263,7 +263,7 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
IF_BIT_0(prob)
{
unsigned symbol;
UPDATE_0(prob)
UPDATE_0(prob);
prob = probs + Literal;
if (processedPos != 0 || checkDicSize != 0)
prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
@ -273,7 +273,7 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
{
state -= (state < 4) ? state : 3;
symbol = 1;
#ifdef Z7_LZMA_SIZE_OPT
#ifdef _LZMA_SIZE_OPT
do { NORMAL_LITER_DEC } while (symbol < 0x100);
#else
NORMAL_LITER_DEC
@ -292,7 +292,7 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
unsigned offs = 0x100;
state -= (state < 10) ? 3 : 6;
symbol = 1;
#ifdef Z7_LZMA_SIZE_OPT
#ifdef _LZMA_SIZE_OPT
do
{
unsigned bit;
@ -321,25 +321,25 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
}
{
UPDATE_1(prob)
UPDATE_1(prob);
prob = probs + IsRep + state;
IF_BIT_0(prob)
{
UPDATE_0(prob)
UPDATE_0(prob);
state += kNumStates;
prob = probs + LenCoder;
}
else
{
UPDATE_1(prob)
UPDATE_1(prob);
prob = probs + IsRepG0 + state;
IF_BIT_0(prob)
{
UPDATE_0(prob)
UPDATE_0(prob);
prob = probs + IsRep0Long + COMBINED_PS_STATE;
IF_BIT_0(prob)
{
UPDATE_0(prob)
UPDATE_0(prob);
// that case was checked before with kBadRepCode
// if (checkDicSize == 0 && processedPos == 0) { len = kMatchSpecLen_Error_Data + 1; break; }
@ -353,30 +353,30 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
state = state < kNumLitStates ? 9 : 11;
continue;
}
UPDATE_1(prob)
UPDATE_1(prob);
}
else
{
UInt32 distance;
UPDATE_1(prob)
UPDATE_1(prob);
prob = probs + IsRepG1 + state;
IF_BIT_0(prob)
{
UPDATE_0(prob)
UPDATE_0(prob);
distance = rep1;
}
else
{
UPDATE_1(prob)
UPDATE_1(prob);
prob = probs + IsRepG2 + state;
IF_BIT_0(prob)
{
UPDATE_0(prob)
UPDATE_0(prob);
distance = rep2;
}
else
{
UPDATE_1(prob)
UPDATE_1(prob);
distance = rep3;
rep3 = rep2;
}
@ -389,37 +389,37 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
prob = probs + RepLenCoder;
}
#ifdef Z7_LZMA_SIZE_OPT
#ifdef _LZMA_SIZE_OPT
{
unsigned lim, offset;
CLzmaProb *probLen = prob + LenChoice;
IF_BIT_0(probLen)
{
UPDATE_0(probLen)
UPDATE_0(probLen);
probLen = prob + LenLow + GET_LEN_STATE;
offset = 0;
lim = (1 << kLenNumLowBits);
}
else
{
UPDATE_1(probLen)
UPDATE_1(probLen);
probLen = prob + LenChoice2;
IF_BIT_0(probLen)
{
UPDATE_0(probLen)
UPDATE_0(probLen);
probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits);
offset = kLenNumLowSymbols;
lim = (1 << kLenNumLowBits);
}
else
{
UPDATE_1(probLen)
UPDATE_1(probLen);
probLen = prob + LenHigh;
offset = kLenNumLowSymbols * 2;
lim = (1 << kLenNumHighBits);
}
}
TREE_DECODE(probLen, lim, len)
TREE_DECODE(probLen, lim, len);
len += offset;
}
#else
@ -427,32 +427,32 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
CLzmaProb *probLen = prob + LenChoice;
IF_BIT_0(probLen)
{
UPDATE_0(probLen)
UPDATE_0(probLen);
probLen = prob + LenLow + GET_LEN_STATE;
len = 1;
TREE_GET_BIT(probLen, len)
TREE_GET_BIT(probLen, len)
TREE_GET_BIT(probLen, len)
TREE_GET_BIT(probLen, len);
TREE_GET_BIT(probLen, len);
TREE_GET_BIT(probLen, len);
len -= 8;
}
else
{
UPDATE_1(probLen)
UPDATE_1(probLen);
probLen = prob + LenChoice2;
IF_BIT_0(probLen)
{
UPDATE_0(probLen)
UPDATE_0(probLen);
probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits);
len = 1;
TREE_GET_BIT(probLen, len)
TREE_GET_BIT(probLen, len)
TREE_GET_BIT(probLen, len)
TREE_GET_BIT(probLen, len);
TREE_GET_BIT(probLen, len);
TREE_GET_BIT(probLen, len);
}
else
{
UPDATE_1(probLen)
UPDATE_1(probLen);
probLen = prob + LenHigh;
TREE_DECODE(probLen, (1 << kLenNumHighBits), len)
TREE_DECODE(probLen, (1 << kLenNumHighBits), len);
len += kLenNumLowSymbols * 2;
}
}
@ -464,7 +464,7 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
UInt32 distance;
prob = probs + PosSlot +
((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
TREE_6_DECODE(prob, distance)
TREE_6_DECODE(prob, distance);
if (distance >= kStartPosModelIndex)
{
unsigned posSlot = (unsigned)distance;
@ -479,7 +479,7 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
distance++;
do
{
REV_BIT_VAR(prob, distance, m)
REV_BIT_VAR(prob, distance, m);
}
while (--numDirectBits);
distance -= m;
@ -514,10 +514,10 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
distance <<= kNumAlignBits;
{
unsigned i = 1;
REV_BIT_CONST(prob, i, 1)
REV_BIT_CONST(prob, i, 2)
REV_BIT_CONST(prob, i, 4)
REV_BIT_LAST (prob, i, 8)
REV_BIT_CONST(prob, i, 1);
REV_BIT_CONST(prob, i, 2);
REV_BIT_CONST(prob, i, 4);
REV_BIT_LAST (prob, i, 8);
distance |= i;
}
if (distance == (UInt32)0xFFFFFFFF)
@ -592,7 +592,7 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
}
while (dicPos < limit && buf < bufLimit);
NORMALIZE
NORMALIZE;
p->buf = buf;
p->range = range;
@ -613,7 +613,7 @@ int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
static void Z7_FASTCALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit)
static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit)
{
unsigned len = (unsigned)p->remainLen;
if (len == 0 /* || len >= kMatchSpecLenStart */)
@ -683,7 +683,7 @@ and we support the following state of (p->checkDicSize):
(p->checkDicSize == p->prop.dicSize)
*/
static int Z7_FASTCALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
static int MY_FAST_CALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
{
if (p->checkDicSize == 0)
{
@ -767,54 +767,54 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byt
else
{
unsigned len;
UPDATE_1_CHECK
UPDATE_1_CHECK;
prob = probs + IsRep + state;
IF_BIT_0_CHECK(prob)
{
UPDATE_0_CHECK
UPDATE_0_CHECK;
state = 0;
prob = probs + LenCoder;
res = DUMMY_MATCH;
}
else
{
UPDATE_1_CHECK
UPDATE_1_CHECK;
res = DUMMY_REP;
prob = probs + IsRepG0 + state;
IF_BIT_0_CHECK(prob)
{
UPDATE_0_CHECK
UPDATE_0_CHECK;
prob = probs + IsRep0Long + COMBINED_PS_STATE;
IF_BIT_0_CHECK(prob)
{
UPDATE_0_CHECK
UPDATE_0_CHECK;
break;
}
else
{
UPDATE_1_CHECK
UPDATE_1_CHECK;
}
}
else
{
UPDATE_1_CHECK
UPDATE_1_CHECK;
prob = probs + IsRepG1 + state;
IF_BIT_0_CHECK(prob)
{
UPDATE_0_CHECK
UPDATE_0_CHECK;
}
else
{
UPDATE_1_CHECK
UPDATE_1_CHECK;
prob = probs + IsRepG2 + state;
IF_BIT_0_CHECK(prob)
{
UPDATE_0_CHECK
UPDATE_0_CHECK;
}
else
{
UPDATE_1_CHECK
UPDATE_1_CHECK;
}
}
}
@ -826,31 +826,31 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byt
const CLzmaProb *probLen = prob + LenChoice;
IF_BIT_0_CHECK(probLen)
{
UPDATE_0_CHECK
UPDATE_0_CHECK;
probLen = prob + LenLow + GET_LEN_STATE;
offset = 0;
limit = 1 << kLenNumLowBits;
}
else
{
UPDATE_1_CHECK
UPDATE_1_CHECK;
probLen = prob + LenChoice2;
IF_BIT_0_CHECK(probLen)
{
UPDATE_0_CHECK
UPDATE_0_CHECK;
probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits);
offset = kLenNumLowSymbols;
limit = 1 << kLenNumLowBits;
}
else
{
UPDATE_1_CHECK
UPDATE_1_CHECK;
probLen = prob + LenHigh;
offset = kLenNumLowSymbols * 2;
limit = 1 << kLenNumHighBits;
}
}
TREE_DECODE_CHECK(probLen, limit, len)
TREE_DECODE_CHECK(probLen, limit, len);
len += offset;
}
@ -860,7 +860,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byt
prob = probs + PosSlot +
((len < kNumLenToPosStates - 1 ? len : kNumLenToPosStates - 1) <<
kNumPosSlotBits);
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot)
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot);
if (posSlot >= kStartPosModelIndex)
{
unsigned numDirectBits = ((posSlot >> 1) - 1);
@ -888,7 +888,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byt
unsigned m = 1;
do
{
REV_BIT_CHECK(prob, i, m)
REV_BIT_CHECK(prob, i, m);
}
while (--numDirectBits);
}
@ -897,7 +897,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byt
}
break;
}
NORMALIZE_CHECK
NORMALIZE_CHECK;
*bufOut = buf;
return res;
@ -943,7 +943,7 @@ When the decoder lookahead, and the lookahead symbol is not end_marker, we have
*/
#define RETURN_NOT_FINISHED_FOR_FINISH \
#define RETURN__NOT_FINISHED__FOR_FINISH \
*status = LZMA_STATUS_NOT_FINISHED; \
return SZ_ERROR_DATA; // for strict mode
// return SZ_OK; // for relaxed mode
@ -1029,7 +1029,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr
}
if (p->remainLen != 0)
{
RETURN_NOT_FINISHED_FOR_FINISH
RETURN__NOT_FINISHED__FOR_FINISH;
}
checkEndMarkNow = 1;
}
@ -1072,7 +1072,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr
for (i = 0; i < (unsigned)dummyProcessed; i++)
p->tempBuf[i] = src[i];
// p->remainLen = kMatchSpecLen_Error_Data;
RETURN_NOT_FINISHED_FOR_FINISH
RETURN__NOT_FINISHED__FOR_FINISH;
}
bufLimit = src;
@ -1150,7 +1150,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr
(*srcLen) += (unsigned)dummyProcessed - p->tempBufSize;
p->tempBufSize = (unsigned)dummyProcessed;
// p->remainLen = kMatchSpecLen_Error_Data;
RETURN_NOT_FINISHED_FOR_FINISH
RETURN__NOT_FINISHED__FOR_FINISH;
}
}
@ -1299,8 +1299,8 @@ static SRes LzmaDec_AllocateProbs2(CLzmaDec *p, const CLzmaProps *propNew, ISzAl
SRes LzmaDec_AllocateProbs(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAllocPtr alloc)
{
CLzmaProps propNew;
RINOK(LzmaProps_Decode(&propNew, props, propsSize))
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc))
RINOK(LzmaProps_Decode(&propNew, props, propsSize));
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc));
p->prop = propNew;
return SZ_OK;
}
@ -1309,14 +1309,14 @@ SRes LzmaDec_Allocate(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAll
{
CLzmaProps propNew;
SizeT dicBufSize;
RINOK(LzmaProps_Decode(&propNew, props, propsSize))
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc))
RINOK(LzmaProps_Decode(&propNew, props, propsSize));
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc));
{
UInt32 dictSize = propNew.dicSize;
SizeT mask = ((UInt32)1 << 12) - 1;
if (dictSize >= ((UInt32)1 << 30)) mask = ((UInt32)1 << 22) - 1;
else if (dictSize >= ((UInt32)1 << 22)) mask = ((UInt32)1 << 20) - 1;
else if (dictSize >= ((UInt32)1 << 22)) mask = ((UInt32)1 << 20) - 1;;
dicBufSize = ((SizeT)dictSize + mask) & ~mask;
if (dicBufSize < dictSize)
dicBufSize = dictSize;
@ -1348,8 +1348,8 @@ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen,
*status = LZMA_STATUS_NOT_SPECIFIED;
if (inSize < RC_INIT_SIZE)
return SZ_ERROR_INPUT_EOF;
LzmaDec_CONSTRUCT(&p)
RINOK(LzmaDec_AllocateProbs(&p, propData, propSize, alloc))
LzmaDec_Construct(&p);
RINOK(LzmaDec_AllocateProbs(&p, propData, propSize, alloc));
p.dic = dest;
p.dicBufSize = outSize;
LzmaDec_Init(&p);
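
The macro soup in this file (NORMALIZE, IF_BIT_0, UPDATE_0, UPDATE_1) is all in service of one operation: decoding a single binary symbol with an adaptive probability. The sketch below restates that one step as a plain function using the constants shown above; it is an illustration of the technique, not the SDK's code.

#include <stdint.h>

#define kTopValue ((uint32_t)1 << 24)
#define kNumBitModelTotalBits 11
#define kBitModelTotal (1u << kNumBitModelTotalBits)
#define kNumMoveBits 5

typedef struct
{
  uint32_t range, code;
  const uint8_t *buf;
} RangeDec;

static int DecodeBit(RangeDec *rc, uint16_t *prob)
{
  uint32_t bound;
  if (rc->range < kTopValue)           /* NORMALIZE */
  {
    rc->range <<= 8;
    rc->code = (rc->code << 8) | *rc->buf++;
  }
  bound = (rc->range >> kNumBitModelTotalBits) * *prob;
  if (rc->code < bound)                /* IF_BIT_0 */
  {
    rc->range = bound;                 /* UPDATE_0: move prob toward 1 */
    *prob = (uint16_t)(*prob + ((kBitModelTotal - *prob) >> kNumMoveBits));
    return 0;
  }
  rc->range -= bound;                  /* UPDATE_1: move prob toward 0 */
  rc->code -= bound;
  *prob = (uint16_t)(*prob - (*prob >> kNumMoveBits));
  return 1;
}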

View file

@ -1,19 +1,19 @@
/* LzmaDec.h -- LZMA Decoder
2023-04-02 : Igor Pavlov : Public domain */
2020-03-19 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZMA_DEC_H
#define ZIP7_INC_LZMA_DEC_H
#ifndef __LZMA_DEC_H
#define __LZMA_DEC_H
#include "7zTypes.h"
EXTERN_C_BEGIN
/* #define Z7_LZMA_PROB32 */
/* Z7_LZMA_PROB32 can increase the speed on some CPUs,
/* #define _LZMA_PROB32 */
/* _LZMA_PROB32 can increase the speed on some CPUs,
but memory usage for CLzmaDec::probs will be doubled in that case */
typedef
#ifdef Z7_LZMA_PROB32
#ifdef _LZMA_PROB32
UInt32
#else
UInt16
@ -25,7 +25,7 @@ typedef
#define LZMA_PROPS_SIZE 5
typedef struct
typedef struct _CLzmaProps
{
Byte lc;
Byte lp;
@ -73,8 +73,7 @@ typedef struct
Byte tempBuf[LZMA_REQUIRED_INPUT_MAX];
} CLzmaDec;
#define LzmaDec_CONSTRUCT(p) { (p)->dic = NULL; (p)->probs = NULL; }
#define LzmaDec_Construct(p) LzmaDec_CONSTRUCT(p)
#define LzmaDec_Construct(p) { (p)->dic = NULL; (p)->probs = NULL; }
void LzmaDec_Init(CLzmaDec *p);

File diff suppressed because it is too large

View file

@ -1,8 +1,8 @@
/* LzmaEnc.h -- LZMA Encoder
: Igor Pavlov : Public domain */
2019-10-30 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZMA_ENC_H
#define ZIP7_INC_LZMA_ENC_H
#ifndef __LZMA_ENC_H
#define __LZMA_ENC_H
#include "7zTypes.h"
@ -10,7 +10,7 @@ EXTERN_C_BEGIN
#define LZMA_PROPS_SIZE 5
typedef struct
typedef struct _CLzmaEncProps
{
int level; /* 0 <= level <= 9 */
UInt32 dictSize; /* (1 << 12) <= dictSize <= (1 << 27) for 32-bit version
@ -23,19 +23,14 @@ typedef struct
int fb; /* 5 <= fb <= 273, default = 32 */
int btMode; /* 0 - hashChain Mode, 1 - binTree mode - normal, default = 1 */
int numHashBytes; /* 2, 3 or 4, default = 4 */
unsigned numHashOutBits; /* default = ? */
UInt32 mc; /* 1 <= mc <= (1 << 30), default = 32 */
unsigned writeEndMark; /* 0 - do not write EOPM, 1 - write EOPM, default = 0 */
int numThreads; /* 1 or 2, default = 2 */
// int _pad;
Int32 affinityGroup;
UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1.
Encoder uses this value to reduce dictionary size */
UInt64 affinity;
UInt64 affinityInGroup;
} CLzmaEncProps;
void LzmaEncProps_Init(CLzmaEncProps *p);
@ -56,9 +51,7 @@ SRes:
SZ_ERROR_THREAD - error in multithreading functions (only for Mt version)
*/
typedef struct CLzmaEnc CLzmaEnc;
typedef CLzmaEnc * CLzmaEncHandle;
// Z7_DECLARE_HANDLE(CLzmaEncHandle)
typedef void * CLzmaEncHandle;
CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc);
void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAllocPtr alloc, ISzAllocPtr allocBig);
@ -68,17 +61,17 @@ void LzmaEnc_SetDataSize(CLzmaEncHandle p, UInt64 expectedDataSiize);
SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, Byte *properties, SizeT *size);
unsigned LzmaEnc_IsWriteEndMark(CLzmaEncHandle p);
SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream,
ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStream *outStream, ISeqInStream *inStream,
ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
SRes LzmaEnc_MemEncode(CLzmaEncHandle p, Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen,
int writeEndMark, ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
int writeEndMark, ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
/* ---------- One Call Interface ---------- */
SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen,
const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark,
ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
EXTERN_C_END

View file

@ -1,14 +1,12 @@
/* LzmaLib.c -- LZMA library wrapper
2023-04-02 : Igor Pavlov : Public domain */
#include "Precomp.h"
2015-06-13 : Igor Pavlov : Public domain */
#include "Alloc.h"
#include "LzmaDec.h"
#include "LzmaEnc.h"
#include "LzmaLib.h"
Z7_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen,
MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen,
unsigned char *outProps, size_t *outPropsSize,
int level, /* 0 <= level <= 9, default = 5 */
unsigned dictSize, /* use (1 << N) or (3 << N). 4 KB < dictSize <= 128 MB */
@ -34,7 +32,7 @@ Z7_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char
}
Z7_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t *srcLen,
MY_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t *srcLen,
const unsigned char *props, size_t propsSize)
{
ELzmaStatus status;

View file

@ -1,14 +1,14 @@
/* LzmaLib.h -- LZMA library interface
2023-04-02 : Igor Pavlov : Public domain */
2021-04-03 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_LZMA_LIB_H
#define ZIP7_INC_LZMA_LIB_H
#ifndef __LZMA_LIB_H
#define __LZMA_LIB_H
#include "7zTypes.h"
EXTERN_C_BEGIN
#define Z7_STDAPI int Z7_STDCALL
#define MY_STDAPI int MY_STD_CALL
#define LZMA_PROPS_SIZE 5
@ -100,7 +100,7 @@ Returns:
SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version)
*/
Z7_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen,
MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen,
unsigned char *outProps, size_t *outPropsSize, /* *outPropsSize must be = 5 */
int level, /* 0 <= level <= 9, default = 5 */
unsigned dictSize, /* default = (1 << 24) */
@ -130,7 +130,7 @@ Returns:
SZ_ERROR_INPUT_EOF - it needs more bytes in input buffer (src)
*/
Z7_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, SizeT *srcLen,
MY_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, SizeT *srcLen,
const unsigned char *props, size_t propsSize);
EXTERN_C_END
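
To close the loop on this wrapper, a minimal round trip through the two calls declared above (a sketch: buffer sizes are illustrative, parameter values are the documented defaults, and SizeT is size_t in default builds).

#include <string.h>
#include "LzmaLib.h"

int RoundTrip(void)
{
  unsigned char src[256], packed[512], out[256], props[LZMA_PROPS_SIZE];
  size_t packedLen = sizeof(packed), propsLen = LZMA_PROPS_SIZE;
  size_t outLen = sizeof(out), inLen;
  int res;
  memset(src, 'a', sizeof(src));
  res = LzmaCompress(packed, &packedLen, src, sizeof(src),
      props, &propsLen,
      5,                 /* level */
      1 << 24,           /* dictSize */
      3, 0, 2, 32, 1);   /* lc, lp, pb, fb, numThreads: defaults */
  if (res != 0)          /* 0 is SZ_OK */
    return res;
  inLen = packedLen;
  res = LzmaUncompress(out, &outLen, packed, &inLen, props, propsLen);
  return res;
}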

C/Md5.c
View file

@ -1,206 +0,0 @@
/* Md5.c -- MD5 Hash
: Igor Pavlov : Public domain
This code is based on Colin Plumb's public domain md5.c code */
#include "Precomp.h"
#include <string.h>
#include "Md5.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#define MD5_UPDATE_BLOCKS(p) Md5_UpdateBlocks
Z7_NO_INLINE
void Md5_Init(CMd5 *p)
{
p->count = 0;
p->state[0] = 0x67452301;
p->state[1] = 0xefcdab89;
p->state[2] = 0x98badcfe;
p->state[3] = 0x10325476;
}
#if 0 && !defined(MY_CPU_LE_UNALIGN)
// optional optimization for Big-endian processors or processors without unaligned access:
// it is intended to reduce the number of complex LE32 memory reading from 64 to 16.
// But some compilers (sparc, armt) are better without this optimization.
#define Z7_MD5_USE_DATA32_ARRAY
#endif
#define LOAD_DATA(i) GetUi32((const UInt32 *)(const void *)data + (i))
#ifdef Z7_MD5_USE_DATA32_ARRAY
#define D(i) data32[i]
#else
#define D(i) LOAD_DATA(i)
#endif
#define F1(x, y, z) (z ^ (x & (y ^ z)))
#define F2(x, y, z) F1(z, x, y)
#define F3(x, y, z) (x ^ y ^ z)
#define F4(x, y, z) (y ^ (x | ~z))
#define R1(i, f, start, step, w, x, y, z, s, k) \
w += D((start + step * (i)) % 16) + k; \
w += f(x, y, z); \
w = rotlFixed(w, s) + x; \
#define R4(i4, f, start, step, s0,s1,s2,s3, k0,k1,k2,k3) \
R1 (i4*4+0, f, start, step, a,b,c,d, s0, k0) \
R1 (i4*4+1, f, start, step, d,a,b,c, s1, k1) \
R1 (i4*4+2, f, start, step, c,d,a,b, s2, k2) \
R1 (i4*4+3, f, start, step, b,c,d,a, s3, k3) \
#define R16(f, start, step, s0,s1,s2,s3, k00,k01,k02,k03, k10,k11,k12,k13, k20,k21,k22,k23, k30,k31,k32,k33) \
R4 (0, f, start, step, s0,s1,s2,s3, k00,k01,k02,k03) \
R4 (1, f, start, step, s0,s1,s2,s3, k10,k11,k12,k13) \
R4 (2, f, start, step, s0,s1,s2,s3, k20,k21,k22,k23) \
R4 (3, f, start, step, s0,s1,s2,s3, k30,k31,k32,k33) \
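For reference, one step of the first round can be expanded by hand (a sketch derived from the R1/R4/R16 macros above, not text from the original file):
// R1(0, F1, 0, 1, a,b,c,d, 7, 0xd76aa478) expands to:
a += D(0) + 0xd76aa478;   /* add message word 0 and the round constant */
a += F1(b, c, d);         /* nonlinear mix of the other three state words */
a = rotlFixed(a, 7) + b;  /* rotate left by 7, then add b */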
static
Z7_NO_INLINE
void Z7_FASTCALL Md5_UpdateBlocks(UInt32 state[4], const Byte *data, size_t numBlocks)
{
UInt32 a, b, c, d;
// if (numBlocks == 0) return;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
do
{
#ifdef Z7_MD5_USE_DATA32_ARRAY
UInt32 data32[MD5_NUM_BLOCK_WORDS];
{
#define LOAD_data32_x4(i) { \
data32[i ] = LOAD_DATA(i ); \
data32[i + 1] = LOAD_DATA(i + 1); \
data32[i + 2] = LOAD_DATA(i + 2); \
data32[i + 3] = LOAD_DATA(i + 3); }
#if 1
LOAD_data32_x4 (0 * 4)
LOAD_data32_x4 (1 * 4)
LOAD_data32_x4 (2 * 4)
LOAD_data32_x4 (3 * 4)
#else
unsigned i;
for (i = 0; i < MD5_NUM_BLOCK_WORDS; i += 4)
{
LOAD_data32_x4(i)
}
#endif
}
#endif
R16 (F1, 0, 1, 7,12,17,22, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821)
R16 (F2, 1, 5, 5, 9,14,20, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a)
R16 (F3, 5, 3, 4,11,16,23, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665)
R16 (F4, 0, 7, 6,10,15,21, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391)
a += state[0];
b += state[1];
c += state[2];
d += state[3];
state[0] = a;
state[1] = b;
state[2] = c;
state[3] = d;
data += MD5_BLOCK_SIZE;
}
while (--numBlocks);
}
#define Md5_UpdateBlock(p) MD5_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
void Md5_Update(CMd5 *p, const Byte *data, size_t size)
{
if (size == 0)
return;
{
const unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1);
const unsigned num = MD5_BLOCK_SIZE - pos;
p->count += size;
if (num > size)
{
memcpy(p->buffer + pos, data, size);
return;
}
if (pos != 0)
{
size -= num;
memcpy(p->buffer + pos, data, num);
data += num;
Md5_UpdateBlock(p);
}
}
{
const size_t numBlocks = size >> 6;
if (numBlocks)
MD5_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= MD5_BLOCK_SIZE - 1;
if (size == 0)
return;
data += (numBlocks << 6);
memcpy(p->buffer, data, size);
}
}
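As a worked example of the buffering logic above (illustrative numbers, assuming a freshly initialized state): a call with 100 bytes gives pos = 0 and num = 64, so the partial-block branch is skipped; numBlocks = 100 >> 6 = 1, one 64-byte block is hashed straight from the caller's buffer, and the remaining 36 bytes are copied into p->buffer to await the next call.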
void Md5_Final(CMd5 *p, Byte *digest)
{
unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1);
p->buffer[pos++] = 0x80;
if (pos > (MD5_BLOCK_SIZE - 4 * 2))
{
while (pos != MD5_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, MD5_BLOCK_SIZE - pos);
Md5_UpdateBlock(p);
pos = 0;
}
memset(&p->buffer[pos], 0, (MD5_BLOCK_SIZE - 4 * 2) - pos);
{
const UInt64 numBits = p->count << 3;
#if defined(MY_CPU_LE_UNALIGN)
SetUi64 (p->buffer + MD5_BLOCK_SIZE - 4 * 2, numBits)
#else
SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 2, (UInt32)(numBits))
SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 1, (UInt32)(numBits >> 32))
#endif
}
Md5_UpdateBlock(p);
SetUi32(digest, p->state[0])
SetUi32(digest + 4, p->state[1])
SetUi32(digest + 8, p->state[2])
SetUi32(digest + 12, p->state[3])
Md5_Init(p);
}
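To make the padding concrete (a worked example, not from the file): after hashing 3 bytes, pos = 3, so buffer[3] = 0x80, bytes 4..55 are zeroed, and the bit count 3 * 8 = 24 is stored as a little-endian 64-bit value in bytes 56..63 before the final block is hashed; a message tail that leaves pos > 56 first flushes an extra all-padding block.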
#undef R1
#undef R4
#undef R16
#undef D
#undef LOAD_DATA
#undef LOAD_data32_x4
#undef F1
#undef F2
#undef F3
#undef F4

34
C/Md5.h
View file

@ -1,34 +0,0 @@
/* Md5.h -- MD5 Hash
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_MD5_H
#define ZIP7_INC_MD5_H
#include "7zTypes.h"
EXTERN_C_BEGIN
#define MD5_NUM_BLOCK_WORDS 16
#define MD5_NUM_DIGEST_WORDS 4
#define MD5_BLOCK_SIZE (MD5_NUM_BLOCK_WORDS * 4)
#define MD5_DIGEST_SIZE (MD5_NUM_DIGEST_WORDS * 4)
typedef struct
{
UInt64 count;
UInt64 _pad_1;
// we want 16-bytes alignment here
UInt32 state[MD5_NUM_DIGEST_WORDS];
UInt64 _pad_2[4];
// we want 64-bytes alignment here
Byte buffer[MD5_BLOCK_SIZE];
} CMd5;
void Md5_Init(CMd5 *p);
void Md5_Update(CMd5 *p, const Byte *data, size_t size);
void Md5_Final(CMd5 *p, Byte *digest);
EXTERN_C_END
#endif
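A minimal usage sketch for this interface (hypothetical caller code, assuming only the declarations above):

#include <stdio.h>
#include "Md5.h"

static void PrintMd5(const Byte *data, size_t size)
{
  CMd5 md5;
  Byte digest[MD5_DIGEST_SIZE];
  unsigned i;
  Md5_Init(&md5);
  Md5_Update(&md5, data, size);  /* may be called repeatedly for streamed input */
  Md5_Final(&md5, digest);       /* writes 16 bytes and re-initializes the state */
  for (i = 0; i < MD5_DIGEST_SIZE; i++)
    printf("%02x", digest[i]);
  printf("\n");
}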

View file

@ -1,28 +1,28 @@
/* MtCoder.c -- Multi-thread Coder
: Igor Pavlov : Public domain */
2021-12-21 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include "MtCoder.h"
#ifndef Z7_ST
#ifndef _7ZIP_ST
static SRes MtProgressThunk_Progress(ICompressProgressPtr pp, UInt64 inSize, UInt64 outSize)
static SRes MtProgressThunk_Progress(const ICompressProgress *pp, UInt64 inSize, UInt64 outSize)
{
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CMtProgressThunk)
CMtProgressThunk *thunk = CONTAINER_FROM_VTBL(pp, CMtProgressThunk, vt);
UInt64 inSize2 = 0;
UInt64 outSize2 = 0;
if (inSize != (UInt64)(Int64)-1)
{
inSize2 = inSize - p->inSize;
p->inSize = inSize;
inSize2 = inSize - thunk->inSize;
thunk->inSize = inSize;
}
if (outSize != (UInt64)(Int64)-1)
{
outSize2 = outSize - p->outSize;
p->outSize = outSize;
outSize2 = outSize - thunk->outSize;
thunk->outSize = outSize;
}
return MtProgress_ProgressAdd(p->mtProgress, inSize2, outSize2);
return MtProgress_ProgressAdd(thunk->mtProgress, inSize2, outSize2);
}
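For example (a worked illustration of the thunk logic above): if a coder previously reported inSize = 100 and now reports inSize = 250, then inSize2 = 150 and thunk->inSize becomes 250, so MtProgress_ProgressAdd receives only the 150-byte delta; the sentinel value (UInt64)(Int64)-1 means "no update" for that counter.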
@ -36,31 +36,25 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p)
#define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; }
static WRes ArEvent_OptCreate_And_Reset(CEvent *p)
{
if (Event_IsCreated(p))
return Event_Reset(p);
return AutoResetEvent_CreateNotSignaled(p);
}
static THREAD_FUNC_DECL ThreadFunc(void *pp);
static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t
#ifdef _WIN32
, CMtCoder * const mtc
#endif
)
static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t)
{
WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent);
// printf("\n====== MtCoderThread_CreateAndStart : \n");
WRes wres = ArEvent_OptCreate_And_Reset(&t->startEvent);
if (wres == 0)
{
t->stop = False;
if (!Thread_WasCreated(&t->thread))
{
#ifdef _WIN32
if (mtc->numThreadGroups)
wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t,
ThreadNextGroup_GetNext(&mtc->nextGroup), // group
0); // affinityMask
else
#endif
wres = Thread_Create(&t->thread, ThreadFunc, t);
}
wres = Thread_Create(&t->thread, ThreadFunc, t);
if (wres == 0)
wres = Event_Set(&t->startEvent);
}
@ -70,7 +64,6 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t
}
Z7_FORCE_INLINE
static void MtCoderThread_Destruct(CMtCoderThread *t)
{
if (Thread_WasCreated(&t->thread))
@ -91,6 +84,24 @@ static void MtCoderThread_Destruct(CMtCoderThread *t)
static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize)
{
size_t size = *processedSize;
*processedSize = 0;
while (size != 0)
{
size_t cur = size;
SRes res = ISeqInStream_Read(stream, data, &cur);
*processedSize += cur;
data += cur;
size -= cur;
RINOK(res);
if (cur == 0)
return SZ_OK;
}
return SZ_OK;
}
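A note on the contract, as inferred from the loop above: FullRead keeps calling ISeqInStream_Read until the caller's buffer is full or the stream returns a zero-length read (end of input), so on success *processedSize can be short only at end of stream; any read error is propagated through RINOK.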
/*
ThreadFunc2() returns:
@ -100,7 +111,7 @@ static void MtCoderThread_Destruct(CMtCoderThread *t)
static SRes ThreadFunc2(CMtCoderThread *t)
{
CMtCoder * const mtc = t->mtCoder;
CMtCoder *mtc = t->mtCoder;
for (;;)
{
@ -141,7 +152,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
}
if (res == SZ_OK)
{
res = SeqInStream_ReadMax(mtc->inStream, t->inBuf, &size);
res = FullRead(mtc->inStream, t->inBuf, &size);
readProcessed = mtc->readProcessed + size;
mtc->readProcessed = readProcessed;
}
@ -200,11 +211,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
if (mtc->numStartedThreads < mtc->numStartedThreadsLimit
&& mtc->expectedDataSize != readProcessed)
{
res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]
#ifdef _WIN32
, mtc
#endif
);
res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]);
if (res == SZ_OK)
mtc->numStartedThreads++;
else
@ -240,13 +247,13 @@ static SRes ThreadFunc2(CMtCoderThread *t)
}
{
CMtCoderBlock * const block = &mtc->blocks[bi];
CMtCoderBlock *block = &mtc->blocks[bi];
block->res = res;
block->bufIndex = bufIndex;
block->finished = finished;
}
#ifdef MTCODER_USE_WRITE_THREAD
#ifdef MTCODER__USE_WRITE_THREAD
RINOK_THREAD(Event_Set(&mtc->writeEvents[bi]))
#else
{
@ -330,7 +337,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
static THREAD_FUNC_DECL ThreadFunc(void *pp)
{
CMtCoderThread * const t = (CMtCoderThread *)pp;
CMtCoderThread *t = (CMtCoderThread *)pp;
for (;;)
{
if (Event_Wait(&t->startEvent) != 0)
@ -338,16 +345,16 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
if (t->stop)
return 0;
{
const SRes res = ThreadFunc2(t);
SRes res = ThreadFunc2(t);
CMtCoder *mtc = t->mtCoder;
if (res != SZ_OK)
{
MtProgress_SetError(&mtc->mtProgress, res);
}
#ifndef MTCODER_USE_WRITE_THREAD
#ifndef MTCODER__USE_WRITE_THREAD
{
const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
if (numFinished == mtc->numStartedThreads)
if (Event_Set(&mtc->finishedEvent) != 0)
return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD;
@ -365,7 +372,6 @@ void MtCoder_Construct(CMtCoder *p)
p->blockSize = 0;
p->numThreadsMax = 0;
p->numThreadGroups = 0;
p->expectedDataSize = (UInt64)(Int64)-1;
p->inStream = NULL;
@ -383,7 +389,7 @@ void MtCoder_Construct(CMtCoder *p)
Event_Construct(&p->readEvent);
Semaphore_Construct(&p->blocksSemaphore);
for (i = 0; i < MTCODER_THREADS_MAX; i++)
for (i = 0; i < MTCODER__THREADS_MAX; i++)
{
CMtCoderThread *t = &p->threads[i];
t->mtCoder = p;
@ -391,11 +397,11 @@ void MtCoder_Construct(CMtCoder *p)
t->inBuf = NULL;
t->stop = False;
Event_Construct(&t->startEvent);
Thread_CONSTRUCT(&t->thread)
Thread_Construct(&t->thread);
}
#ifdef MTCODER_USE_WRITE_THREAD
for (i = 0; i < MTCODER_BLOCKS_MAX; i++)
#ifdef MTCODER__USE_WRITE_THREAD
for (i = 0; i < MTCODER__BLOCKS_MAX; i++)
Event_Construct(&p->writeEvents[i]);
#else
Event_Construct(&p->finishedEvent);
@ -418,14 +424,14 @@ static void MtCoder_Free(CMtCoder *p)
Event_Set(&p->readEvent);
*/
for (i = 0; i < MTCODER_THREADS_MAX; i++)
for (i = 0; i < MTCODER__THREADS_MAX; i++)
MtCoderThread_Destruct(&p->threads[i]);
Event_Close(&p->readEvent);
Semaphore_Close(&p->blocksSemaphore);
#ifdef MTCODER_USE_WRITE_THREAD
for (i = 0; i < MTCODER_BLOCKS_MAX; i++)
#ifdef MTCODER__USE_WRITE_THREAD
for (i = 0; i < MTCODER__BLOCKS_MAX; i++)
Event_Close(&p->writeEvents[i]);
#else
Event_Close(&p->finishedEvent);
@ -449,22 +455,20 @@ SRes MtCoder_Code(CMtCoder *p)
unsigned i;
SRes res = SZ_OK;
// printf("\n====== MtCoder_Code : \n");
if (numThreads > MTCODER_THREADS_MAX)
numThreads = MTCODER_THREADS_MAX;
numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads);
if (numThreads > MTCODER__THREADS_MAX)
numThreads = MTCODER__THREADS_MAX;
numBlocksMax = MTCODER__GET_NUM_BLOCKS_FROM_THREADS(numThreads);
if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++;
if (p->blockSize < ((UInt32)1 << 24)) numBlocksMax++;
if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++;
if (numBlocksMax > MTCODER_BLOCKS_MAX)
numBlocksMax = MTCODER_BLOCKS_MAX;
if (numBlocksMax > MTCODER__BLOCKS_MAX)
numBlocksMax = MTCODER__BLOCKS_MAX;
if (p->blockSize != p->allocatedBufsSize)
{
for (i = 0; i < MTCODER_THREADS_MAX; i++)
for (i = 0; i < MTCODER__THREADS_MAX; i++)
{
CMtCoderThread *t = &p->threads[i];
if (t->inBuf)
@ -480,23 +484,23 @@ SRes MtCoder_Code(CMtCoder *p)
MtProgress_Init(&p->mtProgress, p->progress);
#ifdef MTCODER_USE_WRITE_THREAD
#ifdef MTCODER__USE_WRITE_THREAD
for (i = 0; i < numBlocksMax; i++)
{
RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->writeEvents[i]))
RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->writeEvents[i]));
}
#else
RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->finishedEvent))
RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->finishedEvent));
#endif
{
RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent))
RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, (UInt32)numBlocksMax, (UInt32)numBlocksMax))
RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->readEvent));
RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, numBlocksMax, numBlocksMax));
}
for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++)
for (i = 0; i < MTCODER__BLOCKS_MAX - 1; i++)
p->freeBlockList[i] = i + 1;
p->freeBlockList[MTCODER_BLOCKS_MAX - 1] = (unsigned)(int)-1;
p->freeBlockList[MTCODER__BLOCKS_MAX - 1] = (unsigned)(int)-1;
p->freeBlockHead = 0;
p->readProcessed = 0;
@ -504,37 +508,26 @@ SRes MtCoder_Code(CMtCoder *p)
p->numBlocksMax = numBlocksMax;
p->stopReading = False;
#ifndef MTCODER_USE_WRITE_THREAD
#ifndef MTCODER__USE_WRITE_THREAD
p->writeIndex = 0;
p->writeRes = SZ_OK;
for (i = 0; i < MTCODER_BLOCKS_MAX; i++)
for (i = 0; i < MTCODER__BLOCKS_MAX; i++)
p->ReadyBlocks[i] = False;
p->numFinishedThreads = 0;
#endif
p->numStartedThreadsLimit = numThreads;
p->numStartedThreads = 0;
ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup
// for (i = 0; i < numThreads; i++)
{
// here we create new thread for first block.
// And each new thread will create another new thread after block reading
// until numStartedThreadsLimit is reached.
CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++];
{
const SRes res2 = MtCoderThread_CreateAndStart(nextThread
#ifdef _WIN32
, p
#endif
);
RINOK(res2)
}
RINOK(MtCoderThread_CreateAndStart(nextThread));
}
RINOK_THREAD(Event_Set(&p->readEvent))
#ifdef MTCODER_USE_WRITE_THREAD
#ifdef MTCODER__USE_WRITE_THREAD
{
unsigned bi = 0;
@ -546,9 +539,9 @@ SRes MtCoder_Code(CMtCoder *p)
RINOK_THREAD(Event_Wait(&p->writeEvents[bi]))
{
const CMtCoderBlock * const block = &p->blocks[bi];
const unsigned bufIndex = block->bufIndex;
const BoolInt finished = block->finished;
const CMtCoderBlock *block = &p->blocks[bi];
unsigned bufIndex = block->bufIndex;
BoolInt finished = block->finished;
if (res == SZ_OK && block->res != SZ_OK)
res = block->res;
@ -578,7 +571,7 @@ SRes MtCoder_Code(CMtCoder *p)
}
#else
{
const WRes wres = Event_Wait(&p->finishedEvent);
WRes wres = Event_Wait(&p->finishedEvent);
res = MY_SRes_HRESULT_FROM_WRes(wres);
}
#endif
@ -589,7 +582,7 @@ SRes MtCoder_Code(CMtCoder *p)
if (res == SZ_OK)
res = p->mtProgress.res;
#ifndef MTCODER_USE_WRITE_THREAD
#ifndef MTCODER__USE_WRITE_THREAD
if (res == SZ_OK)
res = p->writeRes;
#endif
@ -600,5 +593,3 @@ SRes MtCoder_Code(CMtCoder *p)
}
#endif
#undef RINOK_THREAD

View file

@ -1,30 +1,30 @@
/* MtCoder.h -- Multi-thread Coder
: Igor Pavlov : Public domain */
2018-07-04 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_MT_CODER_H
#define ZIP7_INC_MT_CODER_H
#ifndef __MT_CODER_H
#define __MT_CODER_H
#include "MtDec.h"
EXTERN_C_BEGIN
/*
if ( defined MTCODER_USE_WRITE_THREAD) : main thread writes all data blocks to output stream
if (not defined MTCODER_USE_WRITE_THREAD) : any coder thread can write data blocks to output stream
if ( defined MTCODER__USE_WRITE_THREAD) : main thread writes all data blocks to output stream
if (not defined MTCODER__USE_WRITE_THREAD) : any coder thread can write data blocks to output stream
*/
/* #define MTCODER_USE_WRITE_THREAD */
/* #define MTCODER__USE_WRITE_THREAD */
#ifndef Z7_ST
#define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1)
#define MTCODER_THREADS_MAX 256
#define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3)
#ifndef _7ZIP_ST
#define MTCODER__GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1)
#define MTCODER__THREADS_MAX 64
#define MTCODER__BLOCKS_MAX (MTCODER__GET_NUM_BLOCKS_FROM_THREADS(MTCODER__THREADS_MAX) + 3)
#else
#define MTCODER_THREADS_MAX 1
#define MTCODER_BLOCKS_MAX 1
#define MTCODER__THREADS_MAX 1
#define MTCODER__BLOCKS_MAX 1
#endif
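Working through the numbers on both sides (a quick check, not in the file): in 21.07, the 64-thread maximum gives MTCODER__GET_NUM_BLOCKS_FROM_THREADS(64) = 64 + 8 + 1 = 73, so MTCODER__BLOCKS_MAX = 73 + 3 = 76; on main, the 256-thread maximum gives 256 + 32 + 1 = 289 and MTCODER_BLOCKS_MAX = 292.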
#ifndef Z7_ST
#ifndef _7ZIP_ST
typedef struct
@ -37,15 +37,15 @@ typedef struct
void MtProgressThunk_CreateVTable(CMtProgressThunk *p);
#define MtProgressThunk_INIT(p) { (p)->inSize = 0; (p)->outSize = 0; }
#define MtProgressThunk_Init(p) { (p)->inSize = 0; (p)->outSize = 0; }
struct CMtCoder_;
struct _CMtCoder;
typedef struct
{
struct CMtCoder_ *mtCoder;
struct _CMtCoder *mtCoder;
unsigned index;
int stop;
Byte *inBuf;
@ -71,20 +71,19 @@ typedef struct
} CMtCoderBlock;
typedef struct CMtCoder_
typedef struct _CMtCoder
{
/* input variables */
size_t blockSize; /* size of input block */
unsigned numThreadsMax;
unsigned numThreadGroups;
UInt64 expectedDataSize;
ISeqInStreamPtr inStream;
ISeqInStream *inStream;
const Byte *inData;
size_t inDataSize;
ICompressProgressPtr progress;
ICompressProgress *progress;
ISzAllocPtr allocBig;
IMtCoderCallback2 *mtCallback;
@ -101,13 +100,13 @@ typedef struct CMtCoder_
BoolInt stopReading;
SRes readRes;
#ifdef MTCODER_USE_WRITE_THREAD
CAutoResetEvent writeEvents[MTCODER_BLOCKS_MAX];
#ifdef MTCODER__USE_WRITE_THREAD
CAutoResetEvent writeEvents[MTCODER__BLOCKS_MAX];
#else
CAutoResetEvent finishedEvent;
SRes writeRes;
unsigned writeIndex;
Byte ReadyBlocks[MTCODER_BLOCKS_MAX];
Byte ReadyBlocks[MTCODER__BLOCKS_MAX];
LONG numFinishedThreads;
#endif
@ -121,13 +120,11 @@ typedef struct CMtCoder_
CCriticalSection cs;
unsigned freeBlockHead;
unsigned freeBlockList[MTCODER_BLOCKS_MAX];
unsigned freeBlockList[MTCODER__BLOCKS_MAX];
CMtProgress mtProgress;
CMtCoderBlock blocks[MTCODER_BLOCKS_MAX];
CMtCoderThread threads[MTCODER_THREADS_MAX];
CThreadNextGroup nextGroup;
CMtCoderBlock blocks[MTCODER__BLOCKS_MAX];
CMtCoderThread threads[MTCODER__THREADS_MAX];
} CMtCoder;

117
C/MtDec.c
View file

@ -1,5 +1,5 @@
/* MtDec.c -- Multi-thread Decoder
2024-02-20 : Igor Pavlov : Public domain */
2021-12-21 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -14,7 +14,7 @@
#include "MtDec.h"
#ifndef Z7_ST
#ifndef _7ZIP_ST
#ifdef SHOW_DEBUG_INFO
#define PRF(x) x
@ -24,7 +24,7 @@
#define PRF_STR_INT(s, d) PRF(printf("\n" s " %d\n", (unsigned)d))
void MtProgress_Init(CMtProgress *p, ICompressProgressPtr progress)
void MtProgress_Init(CMtProgress *p, ICompressProgress *progress)
{
p->progress = progress;
p->res = SZ_OK;
@ -81,28 +81,36 @@ void MtProgress_SetError(CMtProgress *p, SRes res)
#define RINOK_THREAD(x) RINOK_WRes(x)
struct CMtDecBufLink_
static WRes ArEvent_OptCreate_And_Reset(CEvent *p)
{
struct CMtDecBufLink_ *next;
if (Event_IsCreated(p))
return Event_Reset(p);
return AutoResetEvent_CreateNotSignaled(p);
}
struct __CMtDecBufLink
{
struct __CMtDecBufLink *next;
void *pad[3];
};
typedef struct CMtDecBufLink_ CMtDecBufLink;
typedef struct __CMtDecBufLink CMtDecBufLink;
#define MTDEC__LINK_DATA_OFFSET sizeof(CMtDecBufLink)
#define MTDEC__DATA_PTR_FROM_LINK(link) ((Byte *)(link) + MTDEC__LINK_DATA_OFFSET)
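In other words, each decoder buffer is allocated with a small link header in front: MTDEC__LINK_DATA_OFFSET equals sizeof(CMtDecBufLink) — the next pointer plus three pad pointers, i.e. 32 bytes on a 64-bit build (an assumption about the target) — and MTDEC__DATA_PTR_FROM_LINK skips past that header to the usable data.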
static THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp);
static THREAD_FUNC_DECL ThreadFunc(void *pp);
static WRes MtDecThread_CreateEvents(CMtDecThread *t)
{
WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->canWrite);
WRes wres = ArEvent_OptCreate_And_Reset(&t->canWrite);
if (wres == 0)
{
wres = AutoResetEvent_OptCreate_And_Reset(&t->canRead);
wres = ArEvent_OptCreate_And_Reset(&t->canRead);
if (wres == 0)
return SZ_OK;
}
@ -118,7 +126,7 @@ static SRes MtDecThread_CreateAndStart(CMtDecThread *t)
{
if (Thread_WasCreated(&t->thread))
return SZ_OK;
wres = Thread_Create(&t->thread, MtDec_ThreadFunc, t);
wres = Thread_Create(&t->thread, ThreadFunc, t);
if (wres == 0)
return SZ_OK;
}
@ -159,7 +167,7 @@ static void MtDecThread_CloseThread(CMtDecThread *t)
static void MtDec_CloseThreads(CMtDec *p)
{
unsigned i;
for (i = 0; i < MTDEC_THREADS_MAX; i++)
for (i = 0; i < MTDEC__THREADS_MAX; i++)
MtDecThread_CloseThread(&p->threads[i]);
}
@ -171,6 +179,25 @@ static void MtDecThread_Destruct(CMtDecThread *t)
static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize)
{
size_t size = *processedSize;
*processedSize = 0;
while (size != 0)
{
size_t cur = size;
SRes res = ISeqInStream_Read(stream, data, &cur);
*processedSize += cur;
data += cur;
size -= cur;
RINOK(res);
if (cur == 0)
return SZ_OK;
}
return SZ_OK;
}
static SRes MtDec_GetError_Spec(CMtDec *p, UInt64 interruptIndex, BoolInt *wasInterrupted)
{
SRes res;
@ -226,7 +253,7 @@ Byte *MtDec_GetCrossBuff(CMtDec *p)
/*
MtDec_ThreadFunc2() returns:
ThreadFunc2() returns:
0 - in all normal cases (even for stream error or memory allocation error)
(!= 0) - WRes error return by system threading function
*/
@ -234,11 +261,11 @@ Byte *MtDec_GetCrossBuff(CMtDec *p)
// #define MTDEC_ProgessStep (1 << 22)
#define MTDEC_ProgessStep (1 << 0)
static WRes MtDec_ThreadFunc2(CMtDecThread *t)
static WRes ThreadFunc2(CMtDecThread *t)
{
CMtDec *p = t->mtDec;
PRF_STR_INT("MtDec_ThreadFunc2", t->index)
PRF_STR_INT("ThreadFunc2", t->index);
// SetThreadAffinityMask(GetCurrentThread(), 1 << t->index);
@ -268,13 +295,13 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t)
// CMtDecCallbackInfo parse;
CMtDecThread *nextThread;
PRF_STR_INT("=============== Event_Wait(&t->canRead)", t->index)
PRF_STR_INT("=============== Event_Wait(&t->canRead)", t->index);
RINOK_THREAD(Event_Wait(&t->canRead))
RINOK_THREAD(Event_Wait(&t->canRead));
if (p->exitThread)
return 0;
PRF_STR_INT("after Event_Wait(&t->canRead)", t->index)
PRF_STR_INT("after Event_Wait(&t->canRead)", t->index);
// if (t->index == 3) return 19; // for test
@ -346,7 +373,7 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t)
{
size = p->inBufSize;
res = SeqInStream_ReadMax(p->inStream, data, &size);
res = FullRead(p->inStream, data, &size);
// size = 10; // test
@ -588,7 +615,7 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t)
// if ( !finish ) we must call Event_Set(&nextThread->canWrite) in any case
// if ( finish ) we switch to single-thread mode and there are 2 ways at the end of current iteration (current block):
// - if (needContinue) after Write(&needContinue), we restore decoding with new iteration
// - otherwise we stop decoding and exit from MtDec_ThreadFunc2()
// - otherwise we stop decoding and exit from ThreadFunc2()
// Don't change (finish) variable in the further code
@ -661,7 +688,7 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t)
// ---------- WRITE ----------
RINOK_THREAD(Event_Wait(&t->canWrite))
RINOK_THREAD(Event_Wait(&t->canWrite));
{
BoolInt isErrorMode = False;
@ -774,14 +801,14 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t)
if (!finish)
{
RINOK_THREAD(Event_Set(&nextThread->canWrite))
RINOK_THREAD(Event_Set(&nextThread->canWrite));
}
else
{
if (needContinue)
{
// we restore decoding with new iteration
RINOK_THREAD(Event_Set(&p->threads[0].canWrite))
RINOK_THREAD(Event_Set(&p->threads[0].canWrite));
}
else
{
@ -790,7 +817,7 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t)
return SZ_OK;
p->exitThread = True;
}
RINOK_THREAD(Event_Set(&p->threads[0].canRead))
RINOK_THREAD(Event_Set(&p->threads[0].canRead));
}
}
}
@ -809,17 +836,7 @@ static WRes MtDec_ThreadFunc2(CMtDecThread *t)
#endif
typedef
#ifdef _WIN32
UINT_PTR
#elif 1
uintptr_t
#else
ptrdiff_t
#endif
MY_uintptr_t;
static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp)
static THREAD_FUNC_DECL ThreadFunc1(void *pp)
{
WRes res;
@ -828,10 +845,10 @@ static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp)
// fprintf(stdout, "\n%d = %p\n", t->index, &t);
res = MtDec_ThreadFunc2(t);
res = ThreadFunc2(t);
p = t->mtDec;
if (res == 0)
return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)p->exitThreadWRes;
return (THREAD_FUNC_RET_TYPE)(UINT_PTR)p->exitThreadWRes;
{
// it's unexpected situation for some threading function error
if (p->exitThreadWRes == 0)
@ -842,17 +859,17 @@ static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp)
Event_Set(&p->threads[0].canWrite);
MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res));
}
return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)res;
return (THREAD_FUNC_RET_TYPE)(UINT_PTR)res;
}
static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp)
static MY_NO_INLINE THREAD_FUNC_DECL ThreadFunc(void *pp)
{
#ifdef USE_ALLOCA
CMtDecThread *t = (CMtDecThread *)pp;
// fprintf(stderr, "\n%d = %p - before", t->index, &t);
t->allocaPtr = alloca(t->index * 128);
#endif
return MtDec_ThreadFunc1(pp);
return ThreadFunc1(pp);
}
@ -866,7 +883,7 @@ int MtDec_PrepareRead(CMtDec *p)
{
unsigned i;
for (i = 0; i < MTDEC_THREADS_MAX; i++)
for (i = 0; i < MTDEC__THREADS_MAX; i++)
if (i > p->numStartedThreads
|| p->numFilledThreads <=
(i >= p->filledThreadStart ?
@ -970,7 +987,7 @@ void MtDec_Construct(CMtDec *p)
p->allocatedBufsSize = 0;
for (i = 0; i < MTDEC_THREADS_MAX; i++)
for (i = 0; i < MTDEC__THREADS_MAX; i++)
{
CMtDecThread *t = &p->threads[i];
t->mtDec = p;
@ -978,7 +995,7 @@ void MtDec_Construct(CMtDec *p)
t->inBuf = NULL;
Event_Construct(&t->canRead);
Event_Construct(&t->canWrite);
Thread_CONSTRUCT(&t->thread)
Thread_Construct(&t->thread);
}
// Event_Construct(&p->finishedEvent);
@ -993,7 +1010,7 @@ static void MtDec_Free(CMtDec *p)
p->exitThread = True;
for (i = 0; i < MTDEC_THREADS_MAX; i++)
for (i = 0; i < MTDEC__THREADS_MAX; i++)
MtDecThread_Destruct(&p->threads[i]);
// Event_Close(&p->finishedEvent);
@ -1044,15 +1061,15 @@ SRes MtDec_Code(CMtDec *p)
{
unsigned numThreads = p->numThreadsMax;
if (numThreads > MTDEC_THREADS_MAX)
numThreads = MTDEC_THREADS_MAX;
if (numThreads > MTDEC__THREADS_MAX)
numThreads = MTDEC__THREADS_MAX;
p->numStartedThreads_Limit = numThreads;
p->numStartedThreads = 0;
}
if (p->inBufSize != p->allocatedBufsSize)
{
for (i = 0; i < MTDEC_THREADS_MAX; i++)
for (i = 0; i < MTDEC__THREADS_MAX; i++)
{
CMtDecThread *t = &p->threads[i];
if (t->inBuf)
@ -1069,7 +1086,7 @@ SRes MtDec_Code(CMtDec *p)
MtProgress_Init(&p->mtProgress, p->progress);
// RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->finishedEvent))
// RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->finishedEvent));
p->exitThread = False;
p->exitThreadWRes = 0;
@ -1081,8 +1098,8 @@ SRes MtDec_Code(CMtDec *p)
wres = MtDecThread_CreateEvents(nextThread);
if (wres == 0) { wres = Event_Set(&nextThread->canWrite);
if (wres == 0) { wres = Event_Set(&nextThread->canRead);
if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread);
wres = (WRes)(MY_uintptr_t)res;
if (wres == 0) { THREAD_FUNC_RET_TYPE res = ThreadFunc(nextThread);
wres = (WRes)(UINT_PTR)res;
if (wres != 0)
{
p->needContinue = False;
@ -1120,5 +1137,3 @@ SRes MtDec_Code(CMtDec *p)
}
#endif
#undef PRF

View file

@ -1,46 +1,46 @@
/* MtDec.h -- Multi-thread Decoder
2023-04-02 : Igor Pavlov : Public domain */
2020-03-05 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_MT_DEC_H
#define ZIP7_INC_MT_DEC_H
#ifndef __MT_DEC_H
#define __MT_DEC_H
#include "7zTypes.h"
#ifndef Z7_ST
#ifndef _7ZIP_ST
#include "Threads.h"
#endif
EXTERN_C_BEGIN
#ifndef Z7_ST
#ifndef _7ZIP_ST
#ifndef Z7_ST
#define MTDEC_THREADS_MAX 32
#ifndef _7ZIP_ST
#define MTDEC__THREADS_MAX 32
#else
#define MTDEC_THREADS_MAX 1
#define MTDEC__THREADS_MAX 1
#endif
typedef struct
{
ICompressProgressPtr progress;
ICompressProgress *progress;
SRes res;
UInt64 totalInSize;
UInt64 totalOutSize;
CCriticalSection cs;
} CMtProgress;
void MtProgress_Init(CMtProgress *p, ICompressProgressPtr progress);
void MtProgress_Init(CMtProgress *p, ICompressProgress *progress);
SRes MtProgress_Progress_ST(CMtProgress *p);
SRes MtProgress_ProgressAdd(CMtProgress *p, UInt64 inSize, UInt64 outSize);
SRes MtProgress_GetError(CMtProgress *p);
void MtProgress_SetError(CMtProgress *p, SRes res);
struct CMtDec;
struct _CMtDec;
typedef struct
{
struct CMtDec_ *mtDec;
struct _CMtDec *mtDec;
unsigned index;
void *inBuf;
@ -117,7 +117,7 @@ typedef struct
typedef struct CMtDec_
typedef struct _CMtDec
{
/* input variables */
@ -126,11 +126,11 @@ typedef struct CMtDec_
// size_t inBlockMax;
unsigned numThreadsMax_2;
ISeqInStreamPtr inStream;
ISeqInStream *inStream;
// const Byte *inData;
// size_t inDataSize;
ICompressProgressPtr progress;
ICompressProgress *progress;
ISzAllocPtr alloc;
IMtDecCallback2 *mtCallback;
@ -171,11 +171,11 @@ typedef struct CMtDec_
unsigned filledThreadStart;
unsigned numFilledThreads;
#ifndef Z7_ST
#ifndef _7ZIP_ST
BoolInt needInterrupt;
UInt64 interruptIndex;
CMtProgress mtProgress;
CMtDecThread threads[MTDEC_THREADS_MAX];
CMtDecThread threads[MTDEC__THREADS_MAX];
#endif
} CMtDec;

View file

@ -1,9 +1,9 @@
/* Ppmd.h -- PPMD codec common code
2023-03-05 : Igor Pavlov : Public domain
2021-04-13 : Igor Pavlov : Public domain
This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */
#ifndef ZIP7_INC_PPMD_H
#define ZIP7_INC_PPMD_H
#ifndef __PPMD_H
#define __PPMD_H
#include "CpuArch.h"
@ -48,10 +48,8 @@ typedef struct
Byte Count; /* Count to next change of Shift */
} CPpmd_See;
#define Ppmd_See_UPDATE(p) \
{ if ((p)->Shift < PPMD_PERIOD_BITS && --(p)->Count == 0) \
{ (p)->Summ = (UInt16)((p)->Summ << 1); \
(p)->Count = (Byte)(3 << (p)->Shift++); }}
#define Ppmd_See_Update(p) if ((p)->Shift < PPMD_PERIOD_BITS && --(p)->Count == 0) \
{ (p)->Summ = (UInt16)((p)->Summ << 1); (p)->Count = (Byte)(3 << (p)->Shift++); }
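Traced once (an interpretive sketch of the macro above): while Shift < PPMD_PERIOD_BITS, each update decrements Count; when Count reaches 0, Summ is doubled, Count is reloaded with 3 << Shift (the value before the increment), and Shift grows by one, so successive doublings of the escape estimate are spaced progressively further apart.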
typedef struct

217
C/Ppmd7.c
View file

@ -1,5 +1,5 @@
/* Ppmd7.c -- PPMdH codec
2023-09-07 : Igor Pavlov : Public domain
2021-04-13 : Igor Pavlov : Public domain
This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */
#include "Precomp.h"
@ -14,7 +14,7 @@ This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */
MY_ALIGN(16)
static const Byte PPMD7_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 };
MY_ALIGN(16)
static const UInt16 PPMD7_kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051};
static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051};
#define MAX_FREQ 124
#define UNIT_SIZE 12
@ -33,7 +33,7 @@ static const UInt16 PPMD7_kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64
#define ONE_STATE(ctx) Ppmd7Context_OneState(ctx)
#define SUFFIX(ctx) CTX((ctx)->Suffix)
typedef CPpmd7_Context * PPMD7_CTX_PTR;
typedef CPpmd7_Context * CTX_PTR;
struct CPpmd7_Node_;
@ -107,14 +107,14 @@ BoolInt Ppmd7_Alloc(CPpmd7 *p, UInt32 size, ISzAllocPtr alloc)
// ---------- Internal Memory Allocator ----------
/* We can use CPpmd7_Node in list of free units (as in Ppmd8)
But we still need one additional list walk pass in Ppmd7_GlueFreeBlocks().
So we use simple CPpmd_Void_Ref instead of CPpmd7_Node in Ppmd7_InsertNode() / Ppmd7_RemoveNode()
But we still need one additional list walk pass in GlueFreeBlocks().
So we use simple CPpmd_Void_Ref instead of CPpmd7_Node in InsertNode() / RemoveNode()
*/
#define EMPTY_NODE 0
static void Ppmd7_InsertNode(CPpmd7 *p, void *node, unsigned indx)
static void InsertNode(CPpmd7 *p, void *node, unsigned indx)
{
*((CPpmd_Void_Ref *)node) = p->FreeList[indx];
// ((CPpmd7_Node *)node)->Next = (CPpmd7_Node_Ref)p->FreeList[indx];
@ -124,7 +124,7 @@ static void Ppmd7_InsertNode(CPpmd7 *p, void *node, unsigned indx)
}
static void *Ppmd7_RemoveNode(CPpmd7 *p, unsigned indx)
static void *RemoveNode(CPpmd7 *p, unsigned indx)
{
CPpmd_Void_Ref *node = (CPpmd_Void_Ref *)Ppmd7_GetPtr(p, p->FreeList[indx]);
p->FreeList[indx] = *node;
@ -134,32 +134,32 @@ static void *Ppmd7_RemoveNode(CPpmd7 *p, unsigned indx)
}
static void Ppmd7_SplitBlock(CPpmd7 *p, void *ptr, unsigned oldIndx, unsigned newIndx)
static void SplitBlock(CPpmd7 *p, void *ptr, unsigned oldIndx, unsigned newIndx)
{
unsigned i, nu = I2U(oldIndx) - I2U(newIndx);
ptr = (Byte *)ptr + U2B(I2U(newIndx));
if (I2U(i = U2I(nu)) != nu)
{
unsigned k = I2U(--i);
Ppmd7_InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1);
InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1);
}
Ppmd7_InsertNode(p, ptr, i);
InsertNode(p, ptr, i);
}
/* we use CPpmd7_Node_Union union to solve XLC -O2 strict pointer aliasing problem */
typedef union
typedef union _CPpmd7_Node_Union
{
CPpmd7_Node Node;
CPpmd7_Node_Ref NextRef;
} CPpmd7_Node_Union;
/* Original PPmdH (Ppmd7) code uses doubly linked list in Ppmd7_GlueFreeBlocks()
/* Original PPmdH (Ppmd7) code uses doubly linked list in GlueFreeBlocks()
we use single linked list similar to Ppmd8 code */
static void Ppmd7_GlueFreeBlocks(CPpmd7 *p)
static void GlueFreeBlocks(CPpmd7 *p)
{
/*
we use first UInt16 field of 12-bytes UNITs as record type stamp
@ -239,27 +239,27 @@ static void Ppmd7_GlueFreeBlocks(CPpmd7 *p)
if (nu == 0)
continue;
for (; nu > 128; nu -= 128, node += 128)
Ppmd7_InsertNode(p, node, PPMD_NUM_INDEXES - 1);
InsertNode(p, node, PPMD_NUM_INDEXES - 1);
if (I2U(i = U2I(nu)) != nu)
{
unsigned k = I2U(--i);
Ppmd7_InsertNode(p, node + k, (unsigned)nu - k - 1);
InsertNode(p, node + k, (unsigned)nu - k - 1);
}
Ppmd7_InsertNode(p, node, i);
InsertNode(p, node, i);
}
}
Z7_NO_INLINE
static void *Ppmd7_AllocUnitsRare(CPpmd7 *p, unsigned indx)
MY_NO_INLINE
static void *AllocUnitsRare(CPpmd7 *p, unsigned indx)
{
unsigned i;
if (p->GlueCount == 0)
{
Ppmd7_GlueFreeBlocks(p);
GlueFreeBlocks(p);
if (p->FreeList[indx] != 0)
return Ppmd7_RemoveNode(p, indx);
return RemoveNode(p, indx);
}
i = indx;
@ -277,17 +277,17 @@ static void *Ppmd7_AllocUnitsRare(CPpmd7 *p, unsigned indx)
while (p->FreeList[i] == 0);
{
void *block = Ppmd7_RemoveNode(p, i);
Ppmd7_SplitBlock(p, block, i, indx);
void *block = RemoveNode(p, i);
SplitBlock(p, block, i, indx);
return block;
}
}
static void *Ppmd7_AllocUnits(CPpmd7 *p, unsigned indx)
static void *AllocUnits(CPpmd7 *p, unsigned indx)
{
if (p->FreeList[indx] != 0)
return Ppmd7_RemoveNode(p, indx);
return RemoveNode(p, indx);
{
UInt32 numBytes = U2B(I2U(indx));
Byte *lo = p->LoUnit;
@ -297,22 +297,13 @@ static void *Ppmd7_AllocUnits(CPpmd7 *p, unsigned indx)
return lo;
}
}
return Ppmd7_AllocUnitsRare(p, indx);
return AllocUnitsRare(p, indx);
}
#define MEM_12_CPY(dest, src, num) \
{ UInt32 *d = (UInt32 *)(dest); \
const UInt32 *z = (const UInt32 *)(src); \
unsigned n = (num); \
do { \
d[0] = z[0]; \
d[1] = z[1]; \
d[2] = z[2]; \
z += 3; \
d += 3; \
} while (--n); \
}
#define MyMem12Cpy(dest, src, num) \
{ UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \
do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); }
/*
@ -324,12 +315,12 @@ static void *ShrinkUnits(CPpmd7 *p, void *oldPtr, unsigned oldNU, unsigned newNU
return oldPtr;
if (p->FreeList[i1] != 0)
{
void *ptr = Ppmd7_RemoveNode(p, i1);
MEM_12_CPY(ptr, oldPtr, newNU)
Ppmd7_InsertNode(p, oldPtr, i0);
void *ptr = RemoveNode(p, i1);
MyMem12Cpy(ptr, oldPtr, newNU);
InsertNode(p, oldPtr, i0);
return ptr;
}
Ppmd7_SplitBlock(p, oldPtr, i0, i1);
SplitBlock(p, oldPtr, i0, i1);
return oldPtr;
}
*/
@ -338,14 +329,14 @@ static void *ShrinkUnits(CPpmd7 *p, void *oldPtr, unsigned oldNU, unsigned newNU
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
static void SetSuccessor(CPpmd_State *p, CPpmd_Void_Ref v)
{
Ppmd_SET_SUCCESSOR(p, v)
Ppmd_SET_SUCCESSOR(p, v);
}
Z7_NO_INLINE
MY_NO_INLINE
static
void Ppmd7_RestartModel(CPpmd7 *p)
void RestartModel(CPpmd7 *p)
{
unsigned i, k;
@ -361,8 +352,8 @@ void Ppmd7_RestartModel(CPpmd7 *p)
p->PrevSuccess = 0;
{
CPpmd7_Context *mc = (PPMD7_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */
CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* Ppmd7_AllocUnits(p, PPMD_NUM_INDEXES - 1); */
CPpmd7_Context *mc = (CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */
CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* AllocUnits(p, PPMD_NUM_INDEXES - 1); */
p->LoUnit += U2B(256 / 2);
p->MaxContext = p->MinContext = mc;
@ -400,7 +391,7 @@ void Ppmd7_RestartModel(CPpmd7 *p)
{
unsigned m;
UInt16 *dest = p->BinSumm[i] + k;
const UInt16 val = (UInt16)(PPMD_BIN_SCALE - PPMD7_kInitBinEsc[k] / (i + 2));
UInt16 val = (UInt16)(PPMD_BIN_SCALE - kInitBinEsc[k] / (i + 2));
for (m = 0; m < 64; m += 8)
dest[m] = val;
}
@ -432,13 +423,13 @@ void Ppmd7_Init(CPpmd7 *p, unsigned maxOrder)
{
p->MaxOrder = maxOrder;
Ppmd7_RestartModel(p);
RestartModel(p);
}
/*
Ppmd7_CreateSuccessors()
CreateSuccessors()
It's called when (FoundState->Successor) is RAW-Successor,
that is the link to position in Raw text.
So we create Context records and write the links to
@ -454,10 +445,10 @@ void Ppmd7_Init(CPpmd7 *p, unsigned maxOrder)
also it can return pointer to real context of same order,
*/
Z7_NO_INLINE
static PPMD7_CTX_PTR Ppmd7_CreateSuccessors(CPpmd7 *p)
MY_NO_INLINE
static CTX_PTR CreateSuccessors(CPpmd7 *p)
{
PPMD7_CTX_PTR c = p->MinContext;
CTX_PTR c = p->MinContext;
CPpmd_Byte_Ref upBranch = (CPpmd_Byte_Ref)SUCCESSOR(p->FoundState);
Byte newSym, newFreq;
unsigned numPs = 0;
@ -531,15 +522,15 @@ static PPMD7_CTX_PTR Ppmd7_CreateSuccessors(CPpmd7 *p)
do
{
PPMD7_CTX_PTR c1;
CTX_PTR c1;
/* = AllocContext(p); */
if (p->HiUnit != p->LoUnit)
c1 = (PPMD7_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE);
c1 = (CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE);
else if (p->FreeList[0] != 0)
c1 = (PPMD7_CTX_PTR)Ppmd7_RemoveNode(p, 0);
c1 = (CTX_PTR)RemoveNode(p, 0);
else
{
c1 = (PPMD7_CTX_PTR)Ppmd7_AllocUnitsRare(p, 0);
c1 = (CTX_PTR)AllocUnitsRare(p, 0);
if (!c1)
return NULL;
}
@ -559,16 +550,16 @@ static PPMD7_CTX_PTR Ppmd7_CreateSuccessors(CPpmd7 *p)
#define SWAP_STATES(s) \
#define SwapStates(s) \
{ CPpmd_State tmp = s[0]; s[0] = s[-1]; s[-1] = tmp; }
void Ppmd7_UpdateModel(CPpmd7 *p);
Z7_NO_INLINE
MY_NO_INLINE
void Ppmd7_UpdateModel(CPpmd7 *p)
{
CPpmd_Void_Ref maxSuccessor, minSuccessor;
PPMD7_CTX_PTR c, mc;
CTX_PTR c, mc;
unsigned s0, ns;
@ -601,7 +592,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
if (s[0].Freq >= s[-1].Freq)
{
SWAP_STATES(s)
SwapStates(s);
s--;
}
}
@ -619,10 +610,10 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
{
/* MAX ORDER context */
/* (FoundState->Successor) is RAW-Successor. */
p->MaxContext = p->MinContext = Ppmd7_CreateSuccessors(p);
p->MaxContext = p->MinContext = CreateSuccessors(p);
if (!p->MinContext)
{
Ppmd7_RestartModel(p);
RestartModel(p);
return;
}
SetSuccessor(p->FoundState, REF(p->MinContext));
@ -638,7 +629,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
p->Text = text;
if (text >= p->UnitsStart)
{
Ppmd7_RestartModel(p);
RestartModel(p);
return;
}
maxSuccessor = REF(text);
@ -654,10 +645,10 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
if (minSuccessor <= maxSuccessor)
{
// minSuccessor is RAW-Successor. So we will create real context records:
PPMD7_CTX_PTR cs = Ppmd7_CreateSuccessors(p);
CTX_PTR cs = CreateSuccessors(p);
if (!cs)
{
Ppmd7_RestartModel(p);
RestartModel(p);
return;
}
minSuccessor = REF(cs);
@ -720,27 +711,27 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
if ((ns1 & 1) == 0)
{
/* Expand for one UNIT */
const unsigned oldNU = ns1 >> 1;
const unsigned i = U2I(oldNU);
unsigned oldNU = ns1 >> 1;
unsigned i = U2I(oldNU);
if (i != U2I((size_t)oldNU + 1))
{
void *ptr = Ppmd7_AllocUnits(p, i + 1);
void *ptr = AllocUnits(p, i + 1);
void *oldPtr;
if (!ptr)
{
Ppmd7_RestartModel(p);
RestartModel(p);
return;
}
oldPtr = STATS(c);
MEM_12_CPY(ptr, oldPtr, oldNU)
Ppmd7_InsertNode(p, oldPtr, i);
MyMem12Cpy(ptr, oldPtr, oldNU);
InsertNode(p, oldPtr, i);
c->Union4.Stats = STATS_REF(ptr);
}
}
sum = c->Union2.SummFreq;
/* max increase of Escape_Freq is 3 here.
total increase of Union2.SummFreq for all symbols is less than 256 here */
sum += (UInt32)(unsigned)((2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1)));
sum += (UInt32)(2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1));
/* original PPMdH uses 16-bit variable for (sum) here.
But (sum < 0x9000). So we don't truncate (sum) to 16-bit */
// sum = (UInt16)sum;
@ -748,10 +739,10 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
else
{
// instead of One-symbol context we create 2-symbol context
CPpmd_State *s = (CPpmd_State*)Ppmd7_AllocUnits(p, 0);
CPpmd_State *s = (CPpmd_State*)AllocUnits(p, 0);
if (!s)
{
Ppmd7_RestartModel(p);
RestartModel(p);
return;
}
{
@ -770,7 +761,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
// (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context
s->Freq = (Byte)freq;
// max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here
sum = (UInt32)(freq + p->InitEsc + (ns > 3));
sum = freq + p->InitEsc + (ns > 3);
}
}
@ -804,8 +795,8 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
Z7_NO_INLINE
static void Ppmd7_Rescale(CPpmd7 *p)
MY_NO_INLINE
static void Rescale(CPpmd7 *p)
{
unsigned i, adder, sumFreq, escFreq;
CPpmd_State *stats = STATS(p->MinContext);
@ -894,7 +885,7 @@ static void Ppmd7_Rescale(CPpmd7 *p)
*s = *stats;
s->Freq = (Byte)freq; // (freq <= 260 / 4)
p->FoundState = s;
Ppmd7_InsertNode(p, stats, U2I(n0));
InsertNode(p, stats, U2I(n0));
return;
}
@ -908,13 +899,13 @@ static void Ppmd7_Rescale(CPpmd7 *p)
{
if (p->FreeList[i1] != 0)
{
void *ptr = Ppmd7_RemoveNode(p, i1);
void *ptr = RemoveNode(p, i1);
p->MinContext->Union4.Stats = STATS_REF(ptr);
MEM_12_CPY(ptr, (const void *)stats, n1)
Ppmd7_InsertNode(p, stats, i0);
MyMem12Cpy(ptr, (const void *)stats, n1);
InsertNode(p, stats, i0);
}
else
Ppmd7_SplitBlock(p, stats, i0, i1);
SplitBlock(p, stats, i0, i1);
}
}
}
@ -942,10 +933,10 @@ CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq)
p->HiBitsFlag;
{
// if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ
const unsigned summ = (UInt16)see->Summ; // & 0xFFFF
const unsigned r = (summ >> see->Shift);
unsigned summ = (UInt16)see->Summ; // & 0xFFFF
unsigned r = (summ >> see->Shift);
see->Summ = (UInt16)(summ - r);
*escFreq = (UInt32)(r + (r == 0));
*escFreq = r + (r == 0);
}
}
else
@ -957,9 +948,9 @@ CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq)
}
static void Ppmd7_NextContext(CPpmd7 *p)
static void NextContext(CPpmd7 *p)
{
PPMD7_CTX_PTR c = CTX(SUCCESSOR(p->FoundState));
CTX_PTR c = CTX(SUCCESSOR(p->FoundState));
if (p->OrderFall == 0 && (const Byte *)c > p->Text)
p->MaxContext = p->MinContext = c;
else
@ -976,12 +967,12 @@ void Ppmd7_Update1(CPpmd7 *p)
s->Freq = (Byte)freq;
if (freq > s[-1].Freq)
{
SWAP_STATES(s)
SwapStates(s);
p->FoundState = --s;
if (freq > MAX_FREQ)
Ppmd7_Rescale(p);
Rescale(p);
}
Ppmd7_NextContext(p);
NextContext(p);
}
@ -990,15 +981,15 @@ void Ppmd7_Update1_0(CPpmd7 *p)
CPpmd_State *s = p->FoundState;
CPpmd7_Context *mc = p->MinContext;
unsigned freq = s->Freq;
const unsigned summFreq = mc->Union2.SummFreq;
unsigned summFreq = mc->Union2.SummFreq;
p->PrevSuccess = (2 * freq > summFreq);
p->RunLength += (Int32)p->PrevSuccess;
p->RunLength += (int)p->PrevSuccess;
mc->Union2.SummFreq = (UInt16)(summFreq + 4);
freq += 4;
s->Freq = (Byte)freq;
if (freq > MAX_FREQ)
Ppmd7_Rescale(p);
Ppmd7_NextContext(p);
Rescale(p);
NextContext(p);
}
@ -1009,7 +1000,7 @@ void Ppmd7_UpdateBin(CPpmd7 *p)
p->FoundState->Freq = (Byte)(freq + (freq < 128));
p->PrevSuccess = 1;
p->RunLength++;
Ppmd7_NextContext(p);
NextContext(p);
}
*/
@ -1022,7 +1013,7 @@ void Ppmd7_Update2(CPpmd7 *p)
p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4);
s->Freq = (Byte)freq;
if (freq > MAX_FREQ)
Ppmd7_Rescale(p);
Rescale(p);
Ppmd7_UpdateModel(p);
}
@ -1051,8 +1042,8 @@ Last UNIT of array at offset (Size - 12) is root order-0 CPpmd7_Context record.
The code can free UNITs memory blocks that were allocated to store CPpmd_State vectors.
The code doesn't free UNITs allocated for CPpmd7_Context records.
The code calls Ppmd7_RestartModel(), when there is no free memory for allocation.
And Ppmd7_RestartModel() changes the state to the original start state, with a full free block.
The code calls RestartModel(), when there is no free memory for allocation.
And RestartModel() changes the state to the original start state, with a full free block.
The code allocates UNITs with the following order:
@ -1060,14 +1051,14 @@ The code allocates UNITs with the following order:
Allocation of 1 UNIT for Context record
- from free space (HiUnit) down to (LoUnit)
- from FreeList[0]
- Ppmd7_AllocUnitsRare()
- AllocUnitsRare()
Ppmd7_AllocUnits() for CPpmd_State vectors:
AllocUnits() for CPpmd_State vectors:
- from FreeList[i]
- from free space (LoUnit) up to (HiUnit)
- Ppmd7_AllocUnitsRare()
- AllocUnitsRare()
Ppmd7_AllocUnitsRare()
AllocUnitsRare()
- if (GlueCount == 0)
{ Glue lists, GlueCount = 255, allocate from FreeList[i]] }
- loop for all higher sized FreeList[...] lists
@ -1102,8 +1093,8 @@ The PPMd code tries to fulfill the condition:
We have (Sum(Stats[].Freq) <= 256 * 124), because of (MAX_FREQ = 124)
So (4 = 128 - 124) is average reserve for Escape_Freq for each symbol.
If (CPpmd_State::Freq) is not aligned for 4, the reserve can be 5, 6 or 7.
SummFreq and Escape_Freq can be changed in Ppmd7_Rescale() and *Update*() functions.
Ppmd7_Rescale() can remove symbols only from max-order contexts. So Escape_Freq can increase after multiple calls of Ppmd7_Rescale() for
SummFreq and Escape_Freq can be changed in Rescale() and *Update*() functions.
Rescale() can remove symbols only from max-order contexts. So Escape_Freq can increase after multiple calls of Rescale() for
max-order context.
When the PPMd code still breaks the (Total <= RC::Range) condition in the range coder,
@ -1111,21 +1102,3 @@ we have two ways to resolve that problem:
1) we can report error, if we want to keep compatibility with original PPMd code that has no fix for such cases.
2) we can reduce (Total) value to (RC::Range) by reducing (Escape_Freq) part of (Total) value.
*/
#undef MAX_FREQ
#undef UNIT_SIZE
#undef U2B
#undef U2I
#undef I2U
#undef I2U_UInt16
#undef REF
#undef STATS_REF
#undef CTX
#undef STATS
#undef ONE_STATE
#undef SUFFIX
#undef NODE
#undef EMPTY_NODE
#undef MEM_12_CPY
#undef SUCCESSOR
#undef SWAP_STATES

View file

@ -1,11 +1,11 @@
/* Ppmd7.h -- Ppmd7 (PPMdH) compression codec
2023-04-02 : Igor Pavlov : Public domain
2021-04-13 : Igor Pavlov : Public domain
This code is based on:
PPMd var.H (2001): Dmitry Shkarin : Public domain */
#ifndef ZIP7_INC_PPMD7_H
#define ZIP7_INC_PPMD7_H
#ifndef __PPMD7_H
#define __PPMD7_H
#include "Ppmd.h"
@ -55,7 +55,7 @@ typedef struct
UInt32 Range;
UInt32 Code;
UInt32 Low;
IByteInPtr Stream;
IByteIn *Stream;
} CPpmd7_RangeDec;
@ -66,7 +66,7 @@ typedef struct
// Byte _dummy_[3];
UInt64 Low;
UInt64 CacheSize;
IByteOutPtr Stream;
IByteOut *Stream;
} CPpmd7z_RangeEnc;

View file

@ -1,5 +1,5 @@
/* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder
2023-09-07 : Igor Pavlov : Public domain
2021-04-13 : Igor Pavlov : Public domain
This code is based on:
PPMd var.H (2001): Dmitry Shkarin : Public domain */
@ -8,7 +8,7 @@ This code is based on:
#include "Ppmd7.h"
#define kTopValue ((UInt32)1 << 24)
#define kTopValue (1 << 24)
#define READ_BYTE(p) IByteIn_Read((p)->Stream)
@ -37,9 +37,9 @@ BoolInt Ppmd7z_RangeDec_Init(CPpmd7_RangeDec *p)
#define R (&p->rc.dec)
Z7_FORCE_INLINE
// Z7_NO_INLINE
static void Ppmd7z_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
MY_FORCE_INLINE
// MY_NO_INLINE
static void RangeDec_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
{
@ -48,18 +48,18 @@ static void Ppmd7z_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
RC_NORM_LOCAL(R)
}
#define RC_Decode(start, size) Ppmd7z_RD_Decode(p, start, size);
#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R)
#define RC_GetThreshold(total) (R->Code / (R->Range /= (total)))
#define RC_Decode(start, size) RangeDec_Decode(p, start, size);
#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R)
#define RC_GetThreshold(total) (R->Code / (R->Range /= (total)))
#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref))
// typedef CPpmd7_Context * CTX_PTR;
typedef CPpmd7_Context * CTX_PTR;
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
void Ppmd7_UpdateModel(CPpmd7 *p);
#define MASK(sym) ((Byte *)charMask)[sym]
// Z7_FORCE_INLINE
#define MASK(sym) ((unsigned char *)charMask)[sym]
// MY_FORCE_INLINE
// static
int Ppmd7z_DecodeSymbol(CPpmd7 *p)
{
@ -70,7 +70,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext);
unsigned i;
UInt32 count, hiCnt;
const UInt32 summFreq = p->MinContext->Union2.SummFreq;
UInt32 summFreq = p->MinContext->Union2.SummFreq;
@ -81,7 +81,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
if ((Int32)(count -= s->Freq) < 0)
{
Byte sym;
RC_DecodeFinal(0, s->Freq)
RC_DecodeFinal(0, s->Freq);
p->FoundState = s;
sym = s->Symbol;
Ppmd7_Update1_0(p);
@ -96,7 +96,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
if ((Int32)(count -= (++s)->Freq) < 0)
{
Byte sym;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq);
p->FoundState = s;
sym = s->Symbol;
Ppmd7_Update1(p);
@ -109,10 +109,10 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
return PPMD7_SYM_ERROR;
hiCnt -= count;
RC_Decode(hiCnt, summFreq - hiCnt)
RC_Decode(hiCnt, summFreq - hiCnt);
p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol);
PPMD_SetAllBitsIn256Bytes(charMask)
PPMD_SetAllBitsIn256Bytes(charMask);
// i = p->MinContext->NumStats - 1;
// do { MASK((--s)->Symbol) = 0; } while (--i);
{
@ -120,8 +120,8 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
MASK(s->Symbol) = 0;
do
{
const unsigned sym0 = s2[0].Symbol;
const unsigned sym1 = s2[1].Symbol;
unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol;
s2 += 2;
MASK(sym0) = 0;
MASK(sym1) = 0;
@ -152,7 +152,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
// Ppmd7_UpdateBin(p);
{
unsigned freq = s->Freq;
CPpmd7_Context *c = CTX(SUCCESSOR(s));
CTX_PTR c = CTX(SUCCESSOR(s));
sym = s->Symbol;
p->FoundState = s;
p->PrevSuccess = 1;
@ -176,7 +176,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
R->Range -= size0;
RC_NORM_LOCAL(R)
PPMD_SetAllBitsIn256Bytes(charMask)
PPMD_SetAllBitsIn256Bytes(charMask);
MASK(Ppmd7Context_OneState(p->MinContext)->Symbol) = 0;
p->PrevSuccess = 0;
}
@ -209,17 +209,17 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
unsigned num2 = num / 2;
num &= 1;
hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num);
hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num);
s += num;
p->MinContext = mc;
do
{
const unsigned sym0 = s[0].Symbol;
const unsigned sym1 = s[1].Symbol;
unsigned sym0 = s[0].Symbol;
unsigned sym1 = s[1].Symbol;
s += 2;
hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0)));
hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1)));
hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0)));
hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1)));
}
while (--num2);
}
@ -238,20 +238,20 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
s = Ppmd7_GetStats(p, p->MinContext);
hiCnt = count;
// count -= s->Freq & (UInt32)(MASK(s->Symbol));
// count -= s->Freq & (unsigned)(MASK(s->Symbol));
// if ((Int32)count >= 0)
{
for (;;)
{
count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
// count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
}
count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
// count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
};
}
s--;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq);
// new (see->Summ) value can overflow over 16-bits in some rare cases
Ppmd_See_UPDATE(see)
Ppmd_See_Update(see);
p->FoundState = s;
sym = s->Symbol;
Ppmd7_Update2(p);
@ -261,7 +261,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
if (count >= freqSum)
return PPMD7_SYM_ERROR;
RC_Decode(hiCnt, freqSum - hiCnt)
RC_Decode(hiCnt, freqSum - hiCnt);
// We increase (see->Summ) by the sum of Freqs of all non_Masked symbols.
// new (see->Summ) value can overflow over 16-bits in some rare cases
@ -295,18 +295,3 @@ Byte *Ppmd7z_DecodeSymbols(CPpmd7 *p, Byte *buf, const Byte *lim)
return buf;
}
*/
#undef kTopValue
#undef READ_BYTE
#undef RC_NORM_BASE
#undef RC_NORM_1
#undef RC_NORM
#undef RC_NORM_LOCAL
#undef RC_NORM_REMOTE
#undef R
#undef RC_Decode
#undef RC_DecodeFinal
#undef RC_GetThreshold
#undef CTX
#undef SUCCESSOR
#undef MASK

View file

@ -1,5 +1,5 @@
/* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder
2023-09-07 : Igor Pavlov : Public domain
2021-04-13 : Igor Pavlov : Public domain
This code is based on:
PPMd var.H (2001): Dmitry Shkarin : Public domain */
@ -8,7 +8,7 @@ This code is based on:
#include "Ppmd7.h"
#define kTopValue ((UInt32)1 << 24)
#define kTopValue (1 << 24)
#define R (&p->rc.enc)
@ -20,8 +20,8 @@ void Ppmd7z_Init_RangeEnc(CPpmd7 *p)
R->CacheSize = 1;
}
Z7_NO_INLINE
static void Ppmd7z_RangeEnc_ShiftLow(CPpmd7 *p)
MY_NO_INLINE
static void RangeEnc_ShiftLow(CPpmd7 *p)
{
if ((UInt32)R->Low < (UInt32)0xFF000000 || (unsigned)(R->Low >> 32) != 0)
{
@ -38,53 +38,53 @@ static void Ppmd7z_RangeEnc_ShiftLow(CPpmd7 *p)
R->Low = (UInt32)((UInt32)R->Low << 8);
}
#define RC_NORM_BASE(p) if (R->Range < kTopValue) { R->Range <<= 8; Ppmd7z_RangeEnc_ShiftLow(p);
#define RC_NORM_1(p) RC_NORM_BASE(p) }
#define RC_NORM(p) RC_NORM_BASE(p) RC_NORM_BASE(p) }}
#define RC_NORM_BASE(p) if (R->Range < kTopValue) { R->Range <<= 8; RangeEnc_ShiftLow(p);
#define RC_NORM_1(p) RC_NORM_BASE(p) }
#define RC_NORM(p) RC_NORM_BASE(p) RC_NORM_BASE(p) }}
// we must use only one type of Normalization from two: LOCAL or REMOTE
#define RC_NORM_LOCAL(p) // RC_NORM(p)
#define RC_NORM_REMOTE(p) RC_NORM(p)
/*
#define Ppmd7z_RangeEnc_Encode(p, start, _size_) \
#define RangeEnc_Encode(p, start, _size_) \
{ UInt32 size = _size_; \
R->Low += start * R->Range; \
R->Range *= size; \
RC_NORM_LOCAL(p); }
*/
Z7_FORCE_INLINE
// Z7_NO_INLINE
static void Ppmd7z_RangeEnc_Encode(CPpmd7 *p, UInt32 start, UInt32 size)
MY_FORCE_INLINE
// MY_NO_INLINE
static void RangeEnc_Encode(CPpmd7 *p, UInt32 start, UInt32 size)
{
R->Low += start * R->Range;
R->Range *= size;
RC_NORM_LOCAL(p)
RC_NORM_LOCAL(p);
}
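As a worked example of the encode step (illustrative numbers only): the callers first divide R->Range by the total frequency T, so a symbol with cumulative start 5 and frequency 3 narrows the coding interval to its [5/T, 8/T) slice — Low moves up by 5 * (Range / T) and Range shrinks to 3 * (Range / T); RC_NORM later shifts out settled top bytes whenever Range falls below kTopValue (2^24).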
void Ppmd7z_Flush_RangeEnc(CPpmd7 *p)
{
unsigned i;
for (i = 0; i < 5; i++)
Ppmd7z_RangeEnc_ShiftLow(p);
RangeEnc_ShiftLow(p);
}
#define RC_Encode(start, size) Ppmd7z_RangeEnc_Encode(p, start, size);
#define RC_EncodeFinal(start, size) RC_Encode(start, size) RC_NORM_REMOTE(p)
#define RC_Encode(start, size) RangeEnc_Encode(p, start, size);
#define RC_EncodeFinal(start, size) RC_Encode(start, size); RC_NORM_REMOTE(p);
#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref))
#define SUFFIX(ctx) CTX((ctx)->Suffix)
// typedef CPpmd7_Context * CTX_PTR;
typedef CPpmd7_Context * CTX_PTR;
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
void Ppmd7_UpdateModel(CPpmd7 *p);
#define MASK(sym) ((Byte *)charMask)[sym]
#define MASK(sym) ((unsigned char *)charMask)[sym]
Z7_FORCE_INLINE
MY_FORCE_INLINE
static
void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
{
@ -104,7 +104,7 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
if (s->Symbol == symbol)
{
// R->Range /= p->MinContext->Union2.SummFreq;
RC_EncodeFinal(0, s->Freq)
RC_EncodeFinal(0, s->Freq);
p->FoundState = s;
Ppmd7_Update1_0(p);
return;
@ -117,7 +117,7 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
if ((++s)->Symbol == symbol)
{
// R->Range /= p->MinContext->Union2.SummFreq;
RC_EncodeFinal(sum, s->Freq)
RC_EncodeFinal(sum, s->Freq);
p->FoundState = s;
Ppmd7_Update1(p);
return;
@ -127,10 +127,10 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
while (--i);
// R->Range /= p->MinContext->Union2.SummFreq;
RC_Encode(sum, p->MinContext->Union2.SummFreq - sum)
RC_Encode(sum, p->MinContext->Union2.SummFreq - sum);
p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol);
PPMD_SetAllBitsIn256Bytes(charMask)
PPMD_SetAllBitsIn256Bytes(charMask);
// MASK(s->Symbol) = 0;
// i = p->MinContext->NumStats - 1;
// do { MASK((--s)->Symbol) = 0; } while (--i);
@ -139,8 +139,8 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
MASK(s->Symbol) = 0;
do
{
const unsigned sym0 = s2[0].Symbol;
const unsigned sym1 = s2[1].Symbol;
unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol;
s2 += 2;
MASK(sym0) = 0;
MASK(sym1) = 0;
@ -153,20 +153,20 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
UInt16 *prob = Ppmd7_GetBinSumm(p);
CPpmd_State *s = Ppmd7Context_OneState(p->MinContext);
UInt32 pr = *prob;
const UInt32 bound = (R->Range >> 14) * pr;
UInt32 bound = (R->Range >> 14) * pr;
pr = PPMD_UPDATE_PROB_1(pr);
if (s->Symbol == symbol)
{
*prob = (UInt16)(pr + (1 << PPMD_INT_BITS));
// RangeEnc_EncodeBit_0(p, bound);
R->Range = bound;
RC_NORM_1(p)
RC_NORM_1(p);
// p->FoundState = s;
// Ppmd7_UpdateBin(p);
{
const unsigned freq = s->Freq;
CPpmd7_Context *c = CTX(SUCCESSOR(s));
unsigned freq = s->Freq;
CTX_PTR c = CTX(SUCCESSOR(s));
p->FoundState = s;
p->PrevSuccess = 1;
p->RunLength++;
@ -187,7 +187,7 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
R->Range -= bound;
RC_NORM_LOCAL(p)
PPMD_SetAllBitsIn256Bytes(charMask)
PPMD_SetAllBitsIn256Bytes(charMask);
MASK(s->Symbol) = 0;
p->PrevSuccess = 0;
}
@ -248,14 +248,14 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
do
{
const unsigned cur = s->Symbol;
unsigned cur = s->Symbol;
if ((int)cur == symbol)
{
const UInt32 low = sum;
const UInt32 freq = s->Freq;
UInt32 low = sum;
UInt32 freq = s->Freq;
unsigned num2;
Ppmd_See_UPDATE(see)
Ppmd_See_Update(see);
p->FoundState = s;
sum += escFreq;
@ -265,20 +265,21 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
if (num2 != 0)
{
s += i;
do
for (;;)
{
const unsigned sym0 = s[0].Symbol;
const unsigned sym1 = s[1].Symbol;
unsigned sym0 = s[0].Symbol;
unsigned sym1 = s[1].Symbol;
s += 2;
sum += (s[-2].Freq & (unsigned)(MASK(sym0)));
sum += (s[-1].Freq & (unsigned)(MASK(sym1)));
if (--num2 == 0)
break;
}
while (--num2);
}
R->Range /= sum;
RC_EncodeFinal(low, freq)
RC_EncodeFinal(low, freq);
Ppmd7_Update2(p);
return;
}
@ -288,21 +289,21 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
while (--i);
{
const UInt32 total = sum + escFreq;
UInt32 total = sum + escFreq;
see->Summ = (UInt16)(see->Summ + total);
R->Range /= total;
RC_Encode(sum, escFreq)
RC_Encode(sum, escFreq);
}
{
const CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext);
CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext);
s--;
MASK(s->Symbol) = 0;
do
{
const unsigned sym0 = s2[0].Symbol;
const unsigned sym1 = s2[1].Symbol;
unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol;
s2 += 2;
MASK(sym0) = 0;
MASK(sym1) = 0;
@ -320,18 +321,3 @@ void Ppmd7z_EncodeSymbols(CPpmd7 *p, const Byte *buf, const Byte *lim)
Ppmd7z_EncodeSymbol(p, *buf);
}
}
#undef kTopValue
#undef WRITE_BYTE
#undef RC_NORM_BASE
#undef RC_NORM_1
#undef RC_NORM
#undef RC_NORM_LOCAL
#undef RC_NORM_REMOTE
#undef R
#undef RC_Encode
#undef RC_EncodeFinal
#undef SUFFIX
#undef CTX
#undef SUCCESSOR
#undef MASK
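The RC_Encode / RC_EncodeFinal macros renamed in the hunks above wrap a conventional LZMA-style range-encoder step: the caller first divides Range by the total (R->Range /= sum above), then the coder narrows the interval and renormalizes through RangeEnc_ShiftLow, which is visible at the top of this hunk. A minimal self-contained sketch of that step, with illustrative names and a plain memory buffer in place of 7-Zip's byte-out stream:

#include <stdint.h>
#include <stddef.h>

#define kTopValue ((uint32_t)1 << 24)

typedef struct
{
    uint64_t low;       /* pending low edge; its top byte leaks out via ShiftLow */
    uint32_t range;     /* current interval width */
    uint8_t cache;      /* last byte held back in case a carry arrives */
    size_t cacheSize;
    uint8_t *out;       /* illustrative: raw output cursor */
} RcEnc;

static void RcEnc_Init(RcEnc *p, uint8_t *buf)
{
    p->low = 0; p->range = 0xFFFFFFFF;
    p->cache = 0; p->cacheSize = 1; p->out = buf;
}

static void RcEnc_ShiftLow(RcEnc *p)
{
    if ((uint32_t)p->low < (uint32_t)0xFF000000 || (unsigned)(p->low >> 32) != 0)
    {
        uint8_t temp = p->cache;
        do { *p->out++ = (uint8_t)(temp + (uint8_t)(p->low >> 32)); temp = 0xFF; }
        while (--p->cacheSize);
        p->cache = (uint8_t)(p->low >> 24);
    }
    p->cacheSize++;
    p->low = (uint32_t)p->low << 8;
}

/* the caller has already done p->range /= total, as in the code above */
static void RcEnc_Encode(RcEnc *p, uint32_t start, uint32_t size)
{
    p->low += (uint64_t)start * p->range;   /* move low edge up by start slices */
    p->range *= size;                       /* keep size slices of the interval */
    while (p->range < kTopValue) { p->range <<= 8; RcEnc_ShiftLow(p); }
}

RC_EncodeFinal is the same step plus the remote normalization (RC_NORM_REMOTE), so the non-final call sites can skip renormalizing, per the macro pair above.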


@ -1,5 +1,5 @@
/* Ppmd7aDec.c -- PPMd7a (PPMdH) Decoder
2023-09-07 : Igor Pavlov : Public domain
2021-04-13 : Igor Pavlov : Public domain
This code is based on:
PPMd var.H (2001): Dmitry Shkarin : Public domain
Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
@ -8,8 +8,8 @@ This code is based on:
#include "Ppmd7.h"
#define kTop ((UInt32)1 << 24)
#define kBot ((UInt32)1 << 15)
#define kTop (1 << 24)
#define kBot (1 << 15)
#define READ_BYTE(p) IByteIn_Read((p)->Stream)
@ -37,9 +37,9 @@ BoolInt Ppmd7a_RangeDec_Init(CPpmd7_RangeDec *p)
#define R (&p->rc.dec)
Z7_FORCE_INLINE
// Z7_NO_INLINE
static void Ppmd7a_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
MY_FORCE_INLINE
// MY_NO_INLINE
static void RangeDec_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
{
start *= R->Range;
R->Low += start;
@ -48,9 +48,9 @@ static void Ppmd7a_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
RC_NORM_LOCAL(R)
}
#define RC_Decode(start, size) Ppmd7a_RD_Decode(p, start, size);
#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R)
#define RC_GetThreshold(total) (R->Code / (R->Range /= (total)))
#define RC_Decode(start, size) RangeDec_Decode(p, start, size);
#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R)
#define RC_GetThreshold(total) (R->Code / (R->Range /= (total)))
#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref))
@ -58,7 +58,7 @@ typedef CPpmd7_Context * CTX_PTR;
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
void Ppmd7_UpdateModel(CPpmd7 *p);
#define MASK(sym) ((Byte *)charMask)[sym]
#define MASK(sym) ((unsigned char *)charMask)[sym]
int Ppmd7a_DecodeSymbol(CPpmd7 *p)
@ -70,7 +70,7 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext);
unsigned i;
UInt32 count, hiCnt;
const UInt32 summFreq = p->MinContext->Union2.SummFreq;
UInt32 summFreq = p->MinContext->Union2.SummFreq;
if (summFreq > R->Range)
return PPMD7_SYM_ERROR;
@ -81,7 +81,7 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
if ((Int32)(count -= s->Freq) < 0)
{
Byte sym;
RC_DecodeFinal(0, s->Freq)
RC_DecodeFinal(0, s->Freq);
p->FoundState = s;
sym = s->Symbol;
Ppmd7_Update1_0(p);
@ -96,7 +96,7 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
if ((Int32)(count -= (++s)->Freq) < 0)
{
Byte sym;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq);
p->FoundState = s;
sym = s->Symbol;
Ppmd7_Update1(p);
@ -109,10 +109,10 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
return PPMD7_SYM_ERROR;
hiCnt -= count;
RC_Decode(hiCnt, summFreq - hiCnt)
RC_Decode(hiCnt, summFreq - hiCnt);
p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol);
PPMD_SetAllBitsIn256Bytes(charMask)
PPMD_SetAllBitsIn256Bytes(charMask);
// i = p->MinContext->NumStats - 1;
// do { MASK((--s)->Symbol) = 0; } while (--i);
{
@ -120,8 +120,8 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
MASK(s->Symbol) = 0;
do
{
const unsigned sym0 = s2[0].Symbol;
const unsigned sym1 = s2[1].Symbol;
unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol;
s2 += 2;
MASK(sym0) = 0;
MASK(sym1) = 0;
@ -176,7 +176,7 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
R->Range = (R->Range & ~((UInt32)PPMD_BIN_SCALE - 1)) - size0;
RC_NORM_LOCAL(R)
PPMD_SetAllBitsIn256Bytes(charMask)
PPMD_SetAllBitsIn256Bytes(charMask);
MASK(Ppmd7Context_OneState(p->MinContext)->Symbol) = 0;
p->PrevSuccess = 0;
}
@ -209,17 +209,17 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
unsigned num2 = num / 2;
num &= 1;
hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num);
hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num);
s += num;
p->MinContext = mc;
do
{
const unsigned sym0 = s[0].Symbol;
const unsigned sym1 = s[1].Symbol;
unsigned sym0 = s[0].Symbol;
unsigned sym1 = s[1].Symbol;
s += 2;
hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0)));
hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1)));
hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0)));
hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1)));
}
while (--num2);
}
@ -238,20 +238,20 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
s = Ppmd7_GetStats(p, p->MinContext);
hiCnt = count;
// count -= s->Freq & (UInt32)(MASK(s->Symbol));
// count -= s->Freq & (unsigned)(MASK(s->Symbol));
// if ((Int32)count >= 0)
{
for (;;)
{
count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
// count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
}
count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
// count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
};
}
s--;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq);
// new (see->Summ) value can overflow over 16-bits in some rare cases
Ppmd_See_UPDATE(see)
Ppmd_See_Update(see);
p->FoundState = s;
sym = s->Symbol;
Ppmd7_Update2(p);
@ -261,7 +261,7 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
if (count >= freqSum)
return PPMD7_SYM_ERROR;
RC_Decode(hiCnt, freqSum - hiCnt)
RC_Decode(hiCnt, freqSum - hiCnt);
// We increase (see->Summ) for sum of Freqs of all non_Masked symbols.
// new (see->Summ) value can overflow over 16-bits in some rare cases
@ -277,19 +277,3 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
while (s != s2);
}
}
#undef kTop
#undef kBot
#undef READ_BYTE
#undef RC_NORM_BASE
#undef RC_NORM_1
#undef RC_NORM
#undef RC_NORM_LOCAL
#undef RC_NORM_REMOTE
#undef R
#undef RC_Decode
#undef RC_DecodeFinal
#undef RC_GetThreshold
#undef CTX
#undef SUCCESSOR
#undef MASK
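On the decode side, RC_GetThreshold and Ppmd7a_RD_Decode above implement the carryless range coder credited to Dmitry Subbotin in the file header. A compact sketch of the same scheme with an illustrative input cursor; the normalization shown is the classic carryless form, not necessarily the exact expansion of RC_NORM_LOCAL / RC_NORM_REMOTE here:

#include <stdint.h>

#define kTop ((uint32_t)1 << 24)
#define kBot ((uint32_t)1 << 15)

typedef struct
{
    uint32_t range, code, low;
    const unsigned char *in;    /* illustrative input cursor */
} RcDec;

static uint32_t RcDec_GetThreshold(RcDec *p, uint32_t total)
{
    return p->code / (p->range /= total);   /* same shape as RC_GetThreshold */
}

/* classic Subbotin carryless renormalization (1999) */
static void RcDec_Normalize(RcDec *p)
{
    while ((p->low ^ (p->low + p->range)) < kTop ||
           (p->range < kBot && ((p->range = (0 - p->low) & (kBot - 1)), 1)))
    {
        p->code = (p->code << 8) | *p->in++;
        p->range <<= 8;
        p->low <<= 8;
    }
}

/* mirrors Ppmd7a_RD_Decode above: remove the decoded slice, then normalize */
static void RcDec_Decode(RcDec *p, uint32_t start, uint32_t size)
{
    start *= p->range;
    p->low += start;
    p->code -= start;
    p->range *= size;
    RcDec_Normalize(p);
}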

279 C/Ppmd8.c

@ -1,5 +1,5 @@
/* Ppmd8.c -- PPMdI codec
2023-09-07 : Igor Pavlov : Public domain
2021-04-13 : Igor Pavlov : Public domain
This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */
#include "Precomp.h"
@ -14,7 +14,7 @@ This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */
MY_ALIGN(16)
static const Byte PPMD8_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 };
MY_ALIGN(16)
static const UInt16 PPMD8_kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051};
static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051};
#define MAX_FREQ 124
#define UNIT_SIZE 12
@ -33,7 +33,7 @@ static const UInt16 PPMD8_kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64
#define ONE_STATE(ctx) Ppmd8Context_OneState(ctx)
#define SUFFIX(ctx) CTX((ctx)->Suffix)
typedef CPpmd8_Context * PPMD8_CTX_PTR;
typedef CPpmd8_Context * CTX_PTR;
struct CPpmd8_Node_;
@ -114,7 +114,7 @@ BoolInt Ppmd8_Alloc(CPpmd8 *p, UInt32 size, ISzAllocPtr alloc)
#define EMPTY_NODE 0xFFFFFFFF
static void Ppmd8_InsertNode(CPpmd8 *p, void *node, unsigned indx)
static void InsertNode(CPpmd8 *p, void *node, unsigned indx)
{
((CPpmd8_Node *)node)->Stamp = EMPTY_NODE;
((CPpmd8_Node *)node)->Next = (CPpmd8_Node_Ref)p->FreeList[indx];
@ -124,7 +124,7 @@ static void Ppmd8_InsertNode(CPpmd8 *p, void *node, unsigned indx)
}
static void *Ppmd8_RemoveNode(CPpmd8 *p, unsigned indx)
static void *RemoveNode(CPpmd8 *p, unsigned indx)
{
CPpmd8_Node *node = NODE((CPpmd8_Node_Ref)p->FreeList[indx]);
p->FreeList[indx] = node->Next;
@ -134,16 +134,16 @@ static void *Ppmd8_RemoveNode(CPpmd8 *p, unsigned indx)
}
static void Ppmd8_SplitBlock(CPpmd8 *p, void *ptr, unsigned oldIndx, unsigned newIndx)
static void SplitBlock(CPpmd8 *p, void *ptr, unsigned oldIndx, unsigned newIndx)
{
unsigned i, nu = I2U(oldIndx) - I2U(newIndx);
ptr = (Byte *)ptr + U2B(I2U(newIndx));
if (I2U(i = U2I(nu)) != nu)
{
unsigned k = I2U(--i);
Ppmd8_InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1);
InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1);
}
Ppmd8_InsertNode(p, ptr, i);
InsertNode(p, ptr, i);
}
@ -159,7 +159,7 @@ static void Ppmd8_SplitBlock(CPpmd8 *p, void *ptr, unsigned oldIndx, unsigned ne
static void Ppmd8_GlueFreeBlocks(CPpmd8 *p)
static void GlueFreeBlocks(CPpmd8 *p)
{
/*
we use first UInt32 field of 12-bytes UNITs as record type stamp
@ -239,27 +239,27 @@ static void Ppmd8_GlueFreeBlocks(CPpmd8 *p)
if (nu == 0)
continue;
for (; nu > 128; nu -= 128, node += 128)
Ppmd8_InsertNode(p, node, PPMD_NUM_INDEXES - 1);
InsertNode(p, node, PPMD_NUM_INDEXES - 1);
if (I2U(i = U2I(nu)) != nu)
{
unsigned k = I2U(--i);
Ppmd8_InsertNode(p, node + k, (unsigned)nu - k - 1);
InsertNode(p, node + k, (unsigned)nu - k - 1);
}
Ppmd8_InsertNode(p, node, i);
InsertNode(p, node, i);
}
}
Z7_NO_INLINE
static void *Ppmd8_AllocUnitsRare(CPpmd8 *p, unsigned indx)
MY_NO_INLINE
static void *AllocUnitsRare(CPpmd8 *p, unsigned indx)
{
unsigned i;
if (p->GlueCount == 0)
{
Ppmd8_GlueFreeBlocks(p);
GlueFreeBlocks(p);
if (p->FreeList[indx] != 0)
return Ppmd8_RemoveNode(p, indx);
return RemoveNode(p, indx);
}
i = indx;
@ -277,17 +277,17 @@ static void *Ppmd8_AllocUnitsRare(CPpmd8 *p, unsigned indx)
while (p->FreeList[i] == 0);
{
void *block = Ppmd8_RemoveNode(p, i);
Ppmd8_SplitBlock(p, block, i, indx);
void *block = RemoveNode(p, i);
SplitBlock(p, block, i, indx);
return block;
}
}
static void *Ppmd8_AllocUnits(CPpmd8 *p, unsigned indx)
static void *AllocUnits(CPpmd8 *p, unsigned indx)
{
if (p->FreeList[indx] != 0)
return Ppmd8_RemoveNode(p, indx);
return RemoveNode(p, indx);
{
UInt32 numBytes = U2B(I2U(indx));
Byte *lo = p->LoUnit;
@ -297,22 +297,13 @@ static void *Ppmd8_AllocUnits(CPpmd8 *p, unsigned indx)
return lo;
}
}
return Ppmd8_AllocUnitsRare(p, indx);
return AllocUnitsRare(p, indx);
}
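InsertNode / RemoveNode / SplitBlock above form a segregated free-list allocator over 12-byte units: FreeList[indx] heads an intrusive singly linked list of blocks that are I2U(indx) units long, and a too-large block is split with the remainder re-inserted. A stripped-down sketch of the list part; the real code links blocks by 32-bit references and stamps them, and the index count is an assumption (PPMD_NUM_INDEXES is 38 in 7-Zip's Ppmd.h):

typedef struct FreeNode { struct FreeNode *next; } FreeNode;

#define NUM_INDEXES 38   /* assumed value of PPMD_NUM_INDEXES */

static FreeNode *freeList[NUM_INDEXES];

static void insert_node(void *node, unsigned indx)
{
    ((FreeNode *)node)->next = freeList[indx];   /* link in at the head */
    freeList[indx] = (FreeNode *)node;
}

static void *remove_node(unsigned indx)
{
    FreeNode *node = freeList[indx];             /* caller checks non-empty */
    freeList[indx] = node->next;
    return node;
}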
#define MEM_12_CPY(dest, src, num) \
{ UInt32 *d = (UInt32 *)(dest); \
const UInt32 *z = (const UInt32 *)(src); \
unsigned n = (num); \
do { \
d[0] = z[0]; \
d[1] = z[1]; \
d[2] = z[2]; \
z += 3; \
d += 3; \
} while (--n); \
}
#define MyMem12Cpy(dest, src, num) \
{ UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \
do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); }
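Both spellings of the macro above copy whole 12-byte units (UNIT_SIZE is 12) as three 32-bit words per unit, relying on the pool's 4-byte alignment. The same operation as a function, for readability; a sketch, not the 7-Zip API:

#include <stdint.h>

static void copy_units12(void *dest, const void *src, unsigned numUnits)
{
    uint32_t *d = (uint32_t *)dest;
    const uint32_t *z = (const uint32_t *)src;
    do                      /* numUnits must be >= 1, as with the macro */
    {
        d[0] = z[0]; d[1] = z[1]; d[2] = z[2];   /* one 12-byte unit */
        d += 3; z += 3;
    }
    while (--numUnits);
}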
@ -324,26 +315,26 @@ static void *ShrinkUnits(CPpmd8 *p, void *oldPtr, unsigned oldNU, unsigned newNU
return oldPtr;
if (p->FreeList[i1] != 0)
{
void *ptr = Ppmd8_RemoveNode(p, i1);
MEM_12_CPY(ptr, oldPtr, newNU)
Ppmd8_InsertNode(p, oldPtr, i0);
void *ptr = RemoveNode(p, i1);
MyMem12Cpy(ptr, oldPtr, newNU);
InsertNode(p, oldPtr, i0);
return ptr;
}
Ppmd8_SplitBlock(p, oldPtr, i0, i1);
SplitBlock(p, oldPtr, i0, i1);
return oldPtr;
}
static void FreeUnits(CPpmd8 *p, void *ptr, unsigned nu)
{
Ppmd8_InsertNode(p, ptr, U2I(nu));
InsertNode(p, ptr, U2I(nu));
}
static void SpecialFreeUnit(CPpmd8 *p, void *ptr)
{
if ((Byte *)ptr != p->UnitsStart)
Ppmd8_InsertNode(p, ptr, 0);
InsertNode(p, ptr, 0);
else
{
#ifdef PPMD8_FREEZE_SUPPORT
@ -361,10 +352,10 @@ static void *MoveUnitsUp(CPpmd8 *p, void *oldPtr, unsigned nu)
void *ptr;
if ((Byte *)oldPtr > p->UnitsStart + (1 << 14) || REF(oldPtr) > p->FreeList[indx])
return oldPtr;
ptr = Ppmd8_RemoveNode(p, indx);
MEM_12_CPY(ptr, oldPtr, nu)
ptr = RemoveNode(p, indx);
MyMem12Cpy(ptr, oldPtr, nu);
if ((Byte *)oldPtr != p->UnitsStart)
Ppmd8_InsertNode(p, oldPtr, indx);
InsertNode(p, oldPtr, indx);
else
p->UnitsStart += U2B(I2U(indx));
return ptr;
@ -420,22 +411,22 @@ static void ExpandTextArea(CPpmd8 *p)
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
static void Ppmd8State_SetSuccessor(CPpmd_State *p, CPpmd_Void_Ref v)
static void SetSuccessor(CPpmd_State *p, CPpmd_Void_Ref v)
{
Ppmd_SET_SUCCESSOR(p, v)
Ppmd_SET_SUCCESSOR(p, v);
}
#define RESET_TEXT(offs) { p->Text = p->Base + p->AlignOffset + (offs); }
Z7_NO_INLINE
MY_NO_INLINE
static
void Ppmd8_RestartModel(CPpmd8 *p)
void RestartModel(CPpmd8 *p)
{
unsigned i, k, m;
memset(p->FreeList, 0, sizeof(p->FreeList));
memset(p->Stamps, 0, sizeof(p->Stamps));
RESET_TEXT(0)
RESET_TEXT(0);
p->HiUnit = p->Text + p->Size;
p->LoUnit = p->UnitsStart = p->HiUnit - p->Size / 8 / UNIT_SIZE * 7 * UNIT_SIZE;
p->GlueCount = 0;
@ -445,8 +436,8 @@ void Ppmd8_RestartModel(CPpmd8 *p)
p->PrevSuccess = 0;
{
CPpmd8_Context *mc = (PPMD8_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */
CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* Ppmd8_AllocUnits(p, PPMD_NUM_INDEXES - 1); */
CPpmd8_Context *mc = (CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */
CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* AllocUnits(p, PPMD_NUM_INDEXES - 1); */
p->LoUnit += U2B(256 / 2);
p->MaxContext = p->MinContext = mc;
@ -461,7 +452,7 @@ void Ppmd8_RestartModel(CPpmd8 *p)
{
s->Symbol = (Byte)i;
s->Freq = 1;
Ppmd8State_SetSuccessor(s, 0);
SetSuccessor(s, 0);
}
}
@ -484,7 +475,7 @@ void Ppmd8_RestartModel(CPpmd8 *p)
{
unsigned r;
UInt16 *dest = p->BinSumm[m] + k;
const UInt16 val = (UInt16)(PPMD_BIN_SCALE - PPMD8_kInitBinEsc[k] / (i + 1));
UInt16 val = (UInt16)(PPMD_BIN_SCALE - kInitBinEsc[k] / (i + 1));
for (r = 0; r < 64; r += 8)
dest[r] = val;
}
@ -516,7 +507,7 @@ void Ppmd8_Init(CPpmd8 *p, unsigned maxOrder, unsigned restoreMethod)
{
p->MaxOrder = maxOrder;
p->RestoreMethod = restoreMethod;
Ppmd8_RestartModel(p);
RestartModel(p);
}
@ -540,7 +531,7 @@ Refresh() is called when we remove some symbols (successors) in context.
It increases Escape_Freq for sum of all removed symbols.
*/
static void Refresh(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned oldNU, unsigned scale)
static void Refresh(CPpmd8 *p, CTX_PTR ctx, unsigned oldNU, unsigned scale)
{
unsigned i = ctx->NumStats, escFreq, sumFreq, flags;
CPpmd_State *s = (CPpmd_State *)ShrinkUnits(p, STATS(ctx), oldNU, (i + 2) >> 1);
@ -590,7 +581,7 @@ static void Refresh(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned oldNU, unsigned scale
}
static void SWAP_STATES(CPpmd_State *t1, CPpmd_State *t2)
static void SwapStates(CPpmd_State *t1, CPpmd_State *t2)
{
CPpmd_State tmp = *t1;
*t1 = *t2;
@ -606,7 +597,7 @@ CutOff() reduces contexts:
if the (Union4.Stats) is close to (UnitsStart), it moves it up.
*/
static CPpmd_Void_Ref CutOff(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned order)
static CPpmd_Void_Ref CutOff(CPpmd8 *p, CTX_PTR ctx, unsigned order)
{
int ns = ctx->NumStats;
unsigned nu;
@ -622,7 +613,7 @@ static CPpmd_Void_Ref CutOff(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned order)
successor = CutOff(p, CTX(successor), order + 1);
else
successor = 0;
Ppmd8State_SetSuccessor(s, successor);
SetSuccessor(s, successor);
if (successor || order <= 9) /* O_BOUND */
return REF(ctx);
}
@ -639,11 +630,11 @@ static CPpmd_Void_Ref CutOff(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned order)
if ((UInt32)((Byte *)stats - p->UnitsStart) <= (1 << 14)
&& (CPpmd_Void_Ref)ctx->Union4.Stats <= p->FreeList[indx])
{
void *ptr = Ppmd8_RemoveNode(p, indx);
void *ptr = RemoveNode(p, indx);
ctx->Union4.Stats = STATS_REF(ptr);
MEM_12_CPY(ptr, (const void *)stats, nu)
MyMem12Cpy(ptr, (const void *)stats, nu);
if ((Byte *)stats != p->UnitsStart)
Ppmd8_InsertNode(p, stats, indx);
InsertNode(p, stats, indx);
else
p->UnitsStart += U2B(I2U(indx));
stats = ptr;
@ -665,16 +656,16 @@ static CPpmd_Void_Ref CutOff(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned order)
}
else
{
SWAP_STATES(s, s2);
Ppmd8State_SetSuccessor(s2, 0);
SwapStates(s, s2);
SetSuccessor(s2, 0);
}
}
else
{
if (order < p->MaxOrder)
Ppmd8State_SetSuccessor(s, CutOff(p, CTX(successor), order + 1));
SetSuccessor(s, CutOff(p, CTX(successor), order + 1));
else
Ppmd8State_SetSuccessor(s, 0);
SetSuccessor(s, 0);
}
}
while (--s >= stats);
@ -720,7 +711,7 @@ RemoveBinContexts()
removes Bin Context without Successor, if suffix of that context is also binary.
*/
static CPpmd_Void_Ref RemoveBinContexts(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned order)
static CPpmd_Void_Ref RemoveBinContexts(CPpmd8 *p, CTX_PTR ctx, unsigned order)
{
if (!ctx->NumStats)
{
@ -730,7 +721,7 @@ static CPpmd_Void_Ref RemoveBinContexts(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned o
successor = RemoveBinContexts(p, CTX(successor), order + 1);
else
successor = 0;
Ppmd8State_SetSuccessor(s, successor);
SetSuccessor(s, successor);
/* Suffix context can be removed already, since different (high-order)
Successors may refer to same context. So we check Flags == 0xFF (Stamp == EMPTY_NODE) */
if (!successor && (!SUFFIX(ctx)->NumStats || SUFFIX(ctx)->Flags == 0xFF))
@ -746,9 +737,9 @@ static CPpmd_Void_Ref RemoveBinContexts(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned o
{
CPpmd_Void_Ref successor = SUCCESSOR(s);
if ((Byte *)Ppmd8_GetPtr(p, successor) >= p->UnitsStart && order < p->MaxOrder)
Ppmd8State_SetSuccessor(s, RemoveBinContexts(p, CTX(successor), order + 1));
SetSuccessor(s, RemoveBinContexts(p, CTX(successor), order + 1));
else
Ppmd8State_SetSuccessor(s, 0);
SetSuccessor(s, 0);
}
while (--s >= STATS(ctx));
}
@ -776,15 +767,15 @@ static UInt32 GetUsedMemory(const CPpmd8 *p)
#endif
static void RestoreModel(CPpmd8 *p, PPMD8_CTX_PTR ctxError
static void RestoreModel(CPpmd8 *p, CTX_PTR ctxError
#ifdef PPMD8_FREEZE_SUPPORT
, PPMD8_CTX_PTR fSuccessor
, CTX_PTR fSuccessor
#endif
)
{
PPMD8_CTX_PTR c;
CTX_PTR c;
CPpmd_State *s;
RESET_TEXT(0)
RESET_TEXT(0);
// we go here in cases of error of allocation for context (c1)
// Order(MinContext) < Order(ctxError) <= Order(MaxContext)
@ -840,7 +831,7 @@ static void RestoreModel(CPpmd8 *p, PPMD8_CTX_PTR ctxError
else
#endif
if (p->RestoreMethod == PPMD8_RESTORE_METHOD_RESTART || GetUsedMemory(p) < (p->Size >> 1))
Ppmd8_RestartModel(p);
RestartModel(p);
else
{
while (p->MaxContext->Suffix)
@ -859,8 +850,8 @@ static void RestoreModel(CPpmd8 *p, PPMD8_CTX_PTR ctxError
Z7_NO_INLINE
static PPMD8_CTX_PTR Ppmd8_CreateSuccessors(CPpmd8 *p, BoolInt skip, CPpmd_State *s1, PPMD8_CTX_PTR c)
MY_NO_INLINE
static CTX_PTR CreateSuccessors(CPpmd8 *p, BoolInt skip, CPpmd_State *s1, CTX_PTR c)
{
CPpmd_Byte_Ref upBranch = (CPpmd_Byte_Ref)SUCCESSOR(p->FoundState);
@ -936,15 +927,15 @@ static PPMD8_CTX_PTR Ppmd8_CreateSuccessors(CPpmd8 *p, BoolInt skip, CPpmd_State
do
{
PPMD8_CTX_PTR c1;
CTX_PTR c1;
/* = AllocContext(p); */
if (p->HiUnit != p->LoUnit)
c1 = (PPMD8_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE);
c1 = (CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE);
else if (p->FreeList[0] != 0)
c1 = (PPMD8_CTX_PTR)Ppmd8_RemoveNode(p, 0);
c1 = (CTX_PTR)RemoveNode(p, 0);
else
{
c1 = (PPMD8_CTX_PTR)Ppmd8_AllocUnitsRare(p, 0);
c1 = (CTX_PTR)AllocUnitsRare(p, 0);
if (!c1)
return NULL;
}
@ -952,9 +943,9 @@ static PPMD8_CTX_PTR Ppmd8_CreateSuccessors(CPpmd8 *p, BoolInt skip, CPpmd_State
c1->NumStats = 0;
c1->Union2.State2.Symbol = newSym;
c1->Union2.State2.Freq = newFreq;
Ppmd8State_SetSuccessor(ONE_STATE(c1), upBranch);
SetSuccessor(ONE_STATE(c1), upBranch);
c1->Suffix = REF(c);
Ppmd8State_SetSuccessor(ps[--numPs], REF(c1));
SetSuccessor(ps[--numPs], REF(c1));
c = c1;
}
while (numPs != 0);
@ -963,10 +954,10 @@ static PPMD8_CTX_PTR Ppmd8_CreateSuccessors(CPpmd8 *p, BoolInt skip, CPpmd_State
}
static PPMD8_CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, PPMD8_CTX_PTR c)
static CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, CTX_PTR c)
{
CPpmd_State *s = NULL;
PPMD8_CTX_PTR c1 = c;
CTX_PTR c1 = c;
CPpmd_Void_Ref upBranch = REF(p->Text);
#ifdef PPMD8_FREEZE_SUPPORT
@ -976,7 +967,7 @@ static PPMD8_CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, PPMD8_CTX_PTR c)
ps[numPs++] = p->FoundState;
#endif
Ppmd8State_SetSuccessor(p->FoundState, upBranch);
SetSuccessor(p->FoundState, upBranch);
p->OrderFall++;
for (;;)
@ -994,8 +985,8 @@ static PPMD8_CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, PPMD8_CTX_PTR c)
#ifdef PPMD8_FREEZE_SUPPORT
if (p->RestoreMethod > PPMD8_RESTORE_METHOD_FREEZE)
{
do { Ppmd8State_SetSuccessor(ps[--numPs], REF(c)); } while (numPs);
RESET_TEXT(1)
do { SetSuccessor(ps[--numPs], REF(c)); } while (numPs);
RESET_TEXT(1);
p->OrderFall = 1;
}
#endif
@ -1023,7 +1014,7 @@ static PPMD8_CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, PPMD8_CTX_PTR c)
#ifdef PPMD8_FREEZE_SUPPORT
ps[numPs++] = s;
#endif
Ppmd8State_SetSuccessor(s, upBranch);
SetSuccessor(s, upBranch);
p->OrderFall++;
}
@ -1031,8 +1022,8 @@ static PPMD8_CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, PPMD8_CTX_PTR c)
if (p->RestoreMethod > PPMD8_RESTORE_METHOD_FREEZE)
{
c = CTX(SUCCESSOR(s));
do { Ppmd8State_SetSuccessor(ps[--numPs], REF(c)); } while (numPs);
RESET_TEXT(1)
do { SetSuccessor(ps[--numPs], REF(c)); } while (numPs);
RESET_TEXT(1);
p->OrderFall = 1;
return c;
}
@ -1040,15 +1031,15 @@ static PPMD8_CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, PPMD8_CTX_PTR c)
#endif
if (SUCCESSOR(s) <= upBranch)
{
PPMD8_CTX_PTR successor;
CTX_PTR successor;
CPpmd_State *s2 = p->FoundState;
p->FoundState = s;
successor = Ppmd8_CreateSuccessors(p, False, NULL, c);
successor = CreateSuccessors(p, False, NULL, c);
if (!successor)
Ppmd8State_SetSuccessor(s, 0);
SetSuccessor(s, 0);
else
Ppmd8State_SetSuccessor(s, REF(successor));
SetSuccessor(s, REF(successor));
p->FoundState = s2;
}
@ -1056,7 +1047,7 @@ static PPMD8_CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, PPMD8_CTX_PTR c)
CPpmd_Void_Ref successor = SUCCESSOR(s);
if (p->OrderFall == 1 && c1 == p->MaxContext)
{
Ppmd8State_SetSuccessor(p->FoundState, successor);
SetSuccessor(p->FoundState, successor);
p->Text--;
}
if (successor == 0)
@ -1068,11 +1059,11 @@ static PPMD8_CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, PPMD8_CTX_PTR c)
void Ppmd8_UpdateModel(CPpmd8 *p);
Z7_NO_INLINE
MY_NO_INLINE
void Ppmd8_UpdateModel(CPpmd8 *p)
{
CPpmd_Void_Ref maxSuccessor, minSuccessor = SUCCESSOR(p->FoundState);
PPMD8_CTX_PTR c;
CTX_PTR c;
unsigned s0, ns, fFreq = p->FoundState->Freq;
Byte flag, fSymbol = p->FoundState->Symbol;
{
@ -1105,7 +1096,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
if (s[0].Freq >= s[-1].Freq)
{
SWAP_STATES(&s[0], &s[-1]);
SwapStates(&s[0], &s[-1]);
s--;
}
}
@ -1121,14 +1112,14 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
c = p->MaxContext;
if (p->OrderFall == 0 && minSuccessor)
{
PPMD8_CTX_PTR cs = Ppmd8_CreateSuccessors(p, True, s, p->MinContext);
CTX_PTR cs = CreateSuccessors(p, True, s, p->MinContext);
if (!cs)
{
Ppmd8State_SetSuccessor(p->FoundState, 0);
SetSuccessor(p->FoundState, 0);
RESTORE_MODEL(c, CTX(minSuccessor));
return;
}
Ppmd8State_SetSuccessor(p->FoundState, REF(cs));
SetSuccessor(p->FoundState, REF(cs));
p->MinContext = p->MaxContext = cs;
return;
}
@ -1150,7 +1141,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
if (!minSuccessor)
{
PPMD8_CTX_PTR cs = ReduceOrder(p, s, p->MinContext);
CTX_PTR cs = ReduceOrder(p, s, p->MinContext);
if (!cs)
{
RESTORE_MODEL(c, NULL);
@ -1160,7 +1151,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
}
else if ((Byte *)Ppmd8_GetPtr(p, minSuccessor) < p->UnitsStart)
{
PPMD8_CTX_PTR cs = Ppmd8_CreateSuccessors(p, False, s, p->MinContext);
CTX_PTR cs = CreateSuccessors(p, False, s, p->MinContext);
if (!cs)
{
RESTORE_MODEL(c, NULL);
@ -1178,7 +1169,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
else if (p->RestoreMethod > PPMD8_RESTORE_METHOD_FREEZE)
{
maxSuccessor = minSuccessor;
RESET_TEXT(0)
RESET_TEXT(0);
p->OrderFall = 0;
}
#endif
@ -1224,11 +1215,11 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
if ((ns1 & 1) != 0)
{
/* Expand for one UNIT */
const unsigned oldNU = (ns1 + 1) >> 1;
const unsigned i = U2I(oldNU);
unsigned oldNU = (ns1 + 1) >> 1;
unsigned i = U2I(oldNU);
if (i != U2I((size_t)oldNU + 1))
{
void *ptr = Ppmd8_AllocUnits(p, i + 1);
void *ptr = AllocUnits(p, i + 1);
void *oldPtr;
if (!ptr)
{
@ -1236,15 +1227,15 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
return;
}
oldPtr = STATS(c);
MEM_12_CPY(ptr, oldPtr, oldNU)
Ppmd8_InsertNode(p, oldPtr, i);
MyMem12Cpy(ptr, oldPtr, oldNU);
InsertNode(p, oldPtr, i);
c->Union4.Stats = STATS_REF(ptr);
}
}
sum = c->Union2.SummFreq;
/* max increase of Escape_Freq is 1 here.
an average increase is 1/3 per symbol */
sum += (UInt32)(unsigned)(3 * ns1 + 1 < ns);
sum += (3 * ns1 + 1 < ns);
/* original PPMdH uses 16-bit variable for (sum) here.
But (sum < ???). Do we need to truncate (sum) to 16-bit */
// sum = (UInt16)sum;
@ -1252,7 +1243,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
else
{
CPpmd_State *s = (CPpmd_State*)Ppmd8_AllocUnits(p, 0);
CPpmd_State *s = (CPpmd_State*)AllocUnits(p, 0);
if (!s)
{
RESTORE_MODEL(c, CTX(minSuccessor));
@ -1264,7 +1255,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
s->Symbol = c->Union2.State2.Symbol;
s->Successor_0 = c->Union4.State4.Successor_0;
s->Successor_1 = c->Union4.State4.Successor_1;
// Ppmd8State_SetSuccessor(s, c->Union4.Stats); // call it only for debug purposes to check the order of
// SetSuccessor(s, c->Union4.Stats); // call it only for debug purposes to check the order of
// (Successor_0 and Successor_1) in LE/BE.
c->Union4.Stats = REF(s);
if (freq < MAX_FREQ / 4 - 1)
@ -1274,7 +1265,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
s->Freq = (Byte)freq;
sum = (UInt32)(freq + p->InitEsc + (ns > 2)); // Ppmd8 (> 2)
sum = freq + p->InitEsc + (ns > 2); // Ppmd8 (> 2)
}
}
@ -1284,7 +1275,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
UInt32 sf = (UInt32)s0 + sum;
s->Symbol = fSymbol;
c->NumStats = (Byte)(ns1 + 1);
Ppmd8State_SetSuccessor(s, maxSuccessor);
SetSuccessor(s, maxSuccessor);
c->Flags |= flag;
if (cf < 6 * sf)
{
@ -1308,8 +1299,8 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
Z7_NO_INLINE
static void Ppmd8_Rescale(CPpmd8 *p)
MY_NO_INLINE
static void Rescale(CPpmd8 *p)
{
unsigned i, adder, sumFreq, escFreq;
CPpmd_State *stats = STATS(p->MinContext);
@ -1398,7 +1389,7 @@ static void Ppmd8_Rescale(CPpmd8 *p)
*s = *stats;
s->Freq = (Byte)freq;
p->FoundState = s;
Ppmd8_InsertNode(p, stats, U2I(n0));
InsertNode(p, stats, U2I(n0));
return;
}
@ -1446,10 +1437,10 @@ CPpmd_See *Ppmd8_MakeEscFreq(CPpmd8 *p, unsigned numMasked1, UInt32 *escFreq)
{
// if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ
const unsigned summ = (UInt16)see->Summ; // & 0xFFFF
const unsigned r = (summ >> see->Shift);
unsigned summ = (UInt16)see->Summ; // & 0xFFFF
unsigned r = (summ >> see->Shift);
see->Summ = (UInt16)(summ - r);
*escFreq = (UInt32)(r + (r == 0));
*escFreq = r + (r == 0);
}
}
else
@ -1461,9 +1452,9 @@ CPpmd_See *Ppmd8_MakeEscFreq(CPpmd8 *p, unsigned numMasked1, UInt32 *escFreq)
}
static void Ppmd8_NextContext(CPpmd8 *p)
static void NextContext(CPpmd8 *p)
{
PPMD8_CTX_PTR c = CTX(SUCCESSOR(p->FoundState));
CTX_PTR c = CTX(SUCCESSOR(p->FoundState));
if (p->OrderFall == 0 && (const Byte *)c >= p->UnitsStart)
p->MaxContext = p->MinContext = c;
else
@ -1480,12 +1471,12 @@ void Ppmd8_Update1(CPpmd8 *p)
s->Freq = (Byte)freq;
if (freq > s[-1].Freq)
{
SWAP_STATES(s, &s[-1]);
SwapStates(s, &s[-1]);
p->FoundState = --s;
if (freq > MAX_FREQ)
Ppmd8_Rescale(p);
Rescale(p);
}
Ppmd8_NextContext(p);
NextContext(p);
}
@ -1494,15 +1485,15 @@ void Ppmd8_Update1_0(CPpmd8 *p)
CPpmd_State *s = p->FoundState;
CPpmd8_Context *mc = p->MinContext;
unsigned freq = s->Freq;
const unsigned summFreq = mc->Union2.SummFreq;
unsigned summFreq = mc->Union2.SummFreq;
p->PrevSuccess = (2 * freq >= summFreq); // Ppmd8 (>=)
p->RunLength += (Int32)p->PrevSuccess;
p->RunLength += (int)p->PrevSuccess;
mc->Union2.SummFreq = (UInt16)(summFreq + 4);
freq += 4;
s->Freq = (Byte)freq;
if (freq > MAX_FREQ)
Ppmd8_Rescale(p);
Ppmd8_NextContext(p);
Rescale(p);
NextContext(p);
}
@ -1513,7 +1504,7 @@ void Ppmd8_UpdateBin(CPpmd8 *p)
p->FoundState->Freq = (Byte)(freq + (freq < 196)); // Ppmd8 (196)
p->PrevSuccess = 1;
p->RunLength++;
Ppmd8_NextContext(p);
NextContext(p);
}
*/
@ -1526,7 +1517,7 @@ void Ppmd8_Update2(CPpmd8 *p)
p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4);
s->Freq = (Byte)freq;
if (freq > MAX_FREQ)
Ppmd8_Rescale(p);
Rescale(p);
Ppmd8_UpdateModel(p);
}
@ -1535,7 +1526,7 @@ void Ppmd8_Update2(CPpmd8 *p)
GlueCount, and Glue method
BinSum
See / EscFreq
Ppmd8_CreateSuccessors updates more suffix contexts
CreateSuccessors updates more suffix contexts
Ppmd8_UpdateModel consts.
PrevSuccess Update
@ -1544,31 +1535,3 @@ Flags:
(1 << 3) - there is symbol in Stats with (sym >= 0x40) in
(1 << 4) - main symbol of context is (sym >= 0x40)
*/
#undef RESET_TEXT
#undef FLAG_RESCALED
#undef FLAG_PREV_HIGH
#undef HiBits_Prepare
#undef HiBits_Convert_3
#undef HiBits_Convert_4
#undef PPMD8_HiBitsFlag_3
#undef PPMD8_HiBitsFlag_4
#undef RESTORE_MODEL
#undef MAX_FREQ
#undef UNIT_SIZE
#undef U2B
#undef U2I
#undef I2U
#undef REF
#undef STATS_REF
#undef CTX
#undef STATS
#undef ONE_STATE
#undef SUFFIX
#undef NODE
#undef EMPTY_NODE
#undef MEM_12_CPY
#undef SUCCESSOR
#undef SWAP_STATES


@ -1,11 +1,11 @@
/* Ppmd8.h -- Ppmd8 (PPMdI) compression codec
2023-04-02 : Igor Pavlov : Public domain
2021-04-13 : Igor Pavlov : Public domain
This code is based on:
PPMd var.I (2002): Dmitry Shkarin : Public domain
Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
#ifndef ZIP7_INC_PPMD8_H
#define ZIP7_INC_PPMD8_H
#ifndef __PPMD8_H
#define __PPMD8_H
#include "Ppmd.h"
@ -87,8 +87,8 @@ typedef struct
UInt32 Low;
union
{
IByteInPtr In;
IByteOutPtr Out;
IByteIn *In;
IByteOut *Out;
} Stream;
Byte Indx2Units[PPMD_NUM_INDEXES + 2]; // +2 for alignment


@ -1,5 +1,5 @@
/* Ppmd8Dec.c -- Ppmd8 (PPMdI) Decoder
2023-09-07 : Igor Pavlov : Public domain
2021-04-13 : Igor Pavlov : Public domain
This code is based on:
PPMd var.I (2002): Dmitry Shkarin : Public domain
Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
@ -8,8 +8,8 @@ This code is based on:
#include "Ppmd8.h"
#define kTop ((UInt32)1 << 24)
#define kBot ((UInt32)1 << 15)
#define kTop (1 << 24)
#define kBot (1 << 15)
#define READ_BYTE(p) IByteIn_Read((p)->Stream.In)
@ -37,9 +37,9 @@ BoolInt Ppmd8_Init_RangeDec(CPpmd8 *p)
#define R p
Z7_FORCE_INLINE
// Z7_NO_INLINE
static void Ppmd8_RD_Decode(CPpmd8 *p, UInt32 start, UInt32 size)
MY_FORCE_INLINE
// MY_NO_INLINE
static void RangeDec_Decode(CPpmd8 *p, UInt32 start, UInt32 size)
{
start *= R->Range;
R->Low += start;
@ -48,17 +48,17 @@ static void Ppmd8_RD_Decode(CPpmd8 *p, UInt32 start, UInt32 size)
RC_NORM_LOCAL(R)
}
#define RC_Decode(start, size) Ppmd8_RD_Decode(p, start, size);
#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R)
#define RC_GetThreshold(total) (R->Code / (R->Range /= (total)))
#define RC_Decode(start, size) RangeDec_Decode(p, start, size);
#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R)
#define RC_GetThreshold(total) (R->Code / (R->Range /= (total)))
#define CTX(ref) ((CPpmd8_Context *)Ppmd8_GetContext(p, ref))
// typedef CPpmd8_Context * CTX_PTR;
typedef CPpmd8_Context * CTX_PTR;
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
void Ppmd8_UpdateModel(CPpmd8 *p);
#define MASK(sym) ((Byte *)charMask)[sym]
#define MASK(sym) ((unsigned char *)charMask)[sym]
int Ppmd8_DecodeSymbol(CPpmd8 *p)
@ -81,7 +81,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
if ((Int32)(count -= s->Freq) < 0)
{
Byte sym;
RC_DecodeFinal(0, s->Freq)
RC_DecodeFinal(0, s->Freq);
p->FoundState = s;
sym = s->Symbol;
Ppmd8_Update1_0(p);
@ -96,7 +96,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
if ((Int32)(count -= (++s)->Freq) < 0)
{
Byte sym;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq);
p->FoundState = s;
sym = s->Symbol;
Ppmd8_Update1(p);
@ -109,10 +109,10 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
return PPMD8_SYM_ERROR;
hiCnt -= count;
RC_Decode(hiCnt, summFreq - hiCnt)
RC_Decode(hiCnt, summFreq - hiCnt);
PPMD_SetAllBitsIn256Bytes(charMask)
PPMD_SetAllBitsIn256Bytes(charMask);
// i = p->MinContext->NumStats - 1;
// do { MASK((--s)->Symbol) = 0; } while (--i);
{
@ -120,8 +120,8 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
MASK(s->Symbol) = 0;
do
{
const unsigned sym0 = s2[0].Symbol;
const unsigned sym1 = s2[1].Symbol;
unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol;
s2 += 2;
MASK(sym0) = 0;
MASK(sym1) = 0;
@ -152,7 +152,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
// Ppmd8_UpdateBin(p);
{
unsigned freq = s->Freq;
CPpmd8_Context *c = CTX(SUCCESSOR(s));
CTX_PTR c = CTX(SUCCESSOR(s));
sym = s->Symbol;
p->FoundState = s;
p->PrevSuccess = 1;
@ -176,7 +176,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
R->Range = (R->Range & ~((UInt32)PPMD_BIN_SCALE - 1)) - size0;
RC_NORM_LOCAL(R)
PPMD_SetAllBitsIn256Bytes(charMask)
PPMD_SetAllBitsIn256Bytes(charMask);
MASK(Ppmd8Context_OneState(p->MinContext)->Symbol) = 0;
p->PrevSuccess = 0;
}
@ -209,17 +209,17 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
unsigned num2 = num / 2;
num &= 1;
hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num);
hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num);
s += num;
p->MinContext = mc;
do
{
const unsigned sym0 = s[0].Symbol;
const unsigned sym1 = s[1].Symbol;
unsigned sym0 = s[0].Symbol;
unsigned sym1 = s[1].Symbol;
s += 2;
hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0)));
hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1)));
hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0)));
hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1)));
}
while (--num2);
}
@ -227,7 +227,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
see = Ppmd8_MakeEscFreq(p, numMasked, &freqSum);
freqSum += hiCnt;
freqSum2 = freqSum;
PPMD8_CORRECT_SUM_RANGE(R, freqSum2)
PPMD8_CORRECT_SUM_RANGE(R, freqSum2);
count = RC_GetThreshold(freqSum2);
@ -235,7 +235,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
if (count < hiCnt)
{
Byte sym;
// Ppmd_See_UPDATE(see) // new (see->Summ) value can overflow over 16-bits in some rare cases
// Ppmd_See_Update(see); // new (see->Summ) value can overflow over 16-bits in some rare cases
s = Ppmd8_GetStats(p, p->MinContext);
hiCnt = count;
@ -243,15 +243,15 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
{
for (;;)
{
count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
// count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
// count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
}
}
s--;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq);
// new (see->Summ) value can overflow over 16-bits in some rare cases
Ppmd_See_UPDATE(see)
Ppmd_See_Update(see);
p->FoundState = s;
sym = s->Symbol;
Ppmd8_Update2(p);
@ -261,7 +261,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
if (count >= freqSum2)
return PPMD8_SYM_ERROR;
RC_Decode(hiCnt, freqSum2 - hiCnt)
RC_Decode(hiCnt, freqSum2 - hiCnt);
// We increase (see->Summ) for sum of Freqs of all non_Masked symbols.
// new (see->Summ) value can overflow over 16-bits in some rare cases
@ -277,19 +277,3 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
while (s != s2);
}
}
#undef kTop
#undef kBot
#undef READ_BYTE
#undef RC_NORM_BASE
#undef RC_NORM_1
#undef RC_NORM
#undef RC_NORM_LOCAL
#undef RC_NORM_REMOTE
#undef R
#undef RC_Decode
#undef RC_DecodeFinal
#undef RC_GetThreshold
#undef CTX
#undef SUCCESSOR
#undef MASK


@ -1,5 +1,5 @@
/* Ppmd8Enc.c -- Ppmd8 (PPMdI) Encoder
2023-09-07 : Igor Pavlov : Public domain
2021-04-13 : Igor Pavlov : Public domain
This code is based on:
PPMd var.I (2002): Dmitry Shkarin : Public domain
Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
@ -8,8 +8,8 @@ This code is based on:
#include "Ppmd8.h"
#define kTop ((UInt32)1 << 24)
#define kBot ((UInt32)1 << 15)
#define kTop (1 << 24)
#define kBot (1 << 15)
#define WRITE_BYTE(p) IByteOut_Write(p->Stream.Out, (Byte)(p->Low >> 24))
@ -54,13 +54,13 @@ void Ppmd8_Flush_RangeEnc(CPpmd8 *p)
Z7_FORCE_INLINE
// Z7_NO_INLINE
static void Ppmd8_RangeEnc_Encode(CPpmd8 *p, UInt32 start, UInt32 size, UInt32 total)
MY_FORCE_INLINE
// MY_NO_INLINE
static void RangeEnc_Encode(CPpmd8 *p, UInt32 start, UInt32 size, UInt32 total)
{
R->Low += start * (R->Range /= total);
R->Range *= size;
RC_NORM_LOCAL(R)
RC_NORM_LOCAL(R);
}
@ -72,19 +72,19 @@ static void Ppmd8_RangeEnc_Encode(CPpmd8 *p, UInt32 start, UInt32 size, UInt32 t
#define RC_Encode(start, size, total) Ppmd8_RangeEnc_Encode(p, start, size, total);
#define RC_EncodeFinal(start, size, total) RC_Encode(start, size, total) RC_NORM_REMOTE(p)
#define RC_Encode(start, size, total) RangeEnc_Encode(p, start, size, total);
#define RC_EncodeFinal(start, size, total) RC_Encode(start, size, total); RC_NORM_REMOTE(p);
#define CTX(ref) ((CPpmd8_Context *)Ppmd8_GetContext(p, ref))
// typedef CPpmd8_Context * CTX_PTR;
typedef CPpmd8_Context * CTX_PTR;
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
void Ppmd8_UpdateModel(CPpmd8 *p);
#define MASK(sym) ((Byte *)charMask)[sym]
#define MASK(sym) ((unsigned char *)charMask)[sym]
// Z7_FORCE_INLINE
// MY_FORCE_INLINE
// static
void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
{
@ -104,7 +104,7 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
if (s->Symbol == symbol)
{
RC_EncodeFinal(0, s->Freq, summFreq)
RC_EncodeFinal(0, s->Freq, summFreq);
p->FoundState = s;
Ppmd8_Update1_0(p);
return;
@ -117,7 +117,7 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
if ((++s)->Symbol == symbol)
{
RC_EncodeFinal(sum, s->Freq, summFreq)
RC_EncodeFinal(sum, s->Freq, summFreq);
p->FoundState = s;
Ppmd8_Update1(p);
return;
@ -127,10 +127,10 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
while (--i);
RC_Encode(sum, summFreq - sum, summFreq)
RC_Encode(sum, summFreq - sum, summFreq);
PPMD_SetAllBitsIn256Bytes(charMask)
PPMD_SetAllBitsIn256Bytes(charMask);
// MASK(s->Symbol) = 0;
// i = p->MinContext->NumStats;
// do { MASK((--s)->Symbol) = 0; } while (--i);
@ -139,8 +139,8 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
MASK(s->Symbol) = 0;
do
{
const unsigned sym0 = s2[0].Symbol;
const unsigned sym1 = s2[1].Symbol;
unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol;
s2 += 2;
MASK(sym0) = 0;
MASK(sym1) = 0;
@ -153,20 +153,20 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
UInt16 *prob = Ppmd8_GetBinSumm(p);
CPpmd_State *s = Ppmd8Context_OneState(p->MinContext);
UInt32 pr = *prob;
const UInt32 bound = (R->Range >> 14) * pr;
UInt32 bound = (R->Range >> 14) * pr;
pr = PPMD_UPDATE_PROB_1(pr);
if (s->Symbol == symbol)
{
*prob = (UInt16)(pr + (1 << PPMD_INT_BITS));
// RangeEnc_EncodeBit_0(p, bound);
R->Range = bound;
RC_NORM(R)
RC_NORM(R);
// p->FoundState = s;
// Ppmd8_UpdateBin(p);
{
const unsigned freq = s->Freq;
CPpmd8_Context *c = CTX(SUCCESSOR(s));
unsigned freq = s->Freq;
CTX_PTR c = CTX(SUCCESSOR(s));
p->FoundState = s;
p->PrevSuccess = 1;
p->RunLength++;
@ -187,7 +187,7 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
R->Range = (R->Range & ~((UInt32)PPMD_BIN_SCALE - 1)) - bound;
RC_NORM_LOCAL(R)
PPMD_SetAllBitsIn256Bytes(charMask)
PPMD_SetAllBitsIn256Bytes(charMask);
MASK(s->Symbol) = 0;
p->PrevSuccess = 0;
}
@ -248,14 +248,14 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
do
{
const unsigned cur = s->Symbol;
unsigned cur = s->Symbol;
if ((int)cur == symbol)
{
const UInt32 low = sum;
const UInt32 freq = s->Freq;
UInt32 low = sum;
UInt32 freq = s->Freq;
unsigned num2;
Ppmd_See_UPDATE(see)
Ppmd_See_Update(see);
p->FoundState = s;
sum += escFreq;
@ -265,20 +265,21 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
if (num2 != 0)
{
s += i;
do
for (;;)
{
const unsigned sym0 = s[0].Symbol;
const unsigned sym1 = s[1].Symbol;
unsigned sym0 = s[0].Symbol;
unsigned sym1 = s[1].Symbol;
s += 2;
sum += (s[-2].Freq & (unsigned)(MASK(sym0)));
sum += (s[-1].Freq & (unsigned)(MASK(sym1)));
if (--num2 == 0)
break;
}
while (--num2);
}
PPMD8_CORRECT_SUM_RANGE(p, sum)
PPMD8_CORRECT_SUM_RANGE(p, sum);
RC_EncodeFinal(low, freq, sum)
RC_EncodeFinal(low, freq, sum);
Ppmd8_Update2(p);
return;
}
@ -290,19 +291,19 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
{
UInt32 total = sum + escFreq;
see->Summ = (UInt16)(see->Summ + total);
PPMD8_CORRECT_SUM_RANGE(p, total)
PPMD8_CORRECT_SUM_RANGE(p, total);
RC_Encode(sum, total - sum, total)
RC_Encode(sum, total - sum, total);
}
{
const CPpmd_State *s2 = Ppmd8_GetStats(p, p->MinContext);
CPpmd_State *s2 = Ppmd8_GetStats(p, p->MinContext);
s--;
MASK(s->Symbol) = 0;
do
{
const unsigned sym0 = s2[0].Symbol;
const unsigned sym1 = s2[1].Symbol;
unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol;
s2 += 2;
MASK(sym0) = 0;
MASK(sym1) = 0;
@ -311,27 +312,3 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
}
}
}
#undef kTop
#undef kBot
#undef WRITE_BYTE
#undef RC_NORM_BASE
#undef RC_NORM_1
#undef RC_NORM
#undef RC_NORM_LOCAL
#undef RC_NORM_REMOTE
#undef R
#undef RC_Encode
#undef RC_EncodeFinal
#undef CTX
#undef SUCCESSOR
#undef MASK
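The charMask bookkeeping that keeps reappearing in these hunks is a branch-free exclusion table: one byte per symbol, reset to 0xFF by PPMD_SetAllBitsIn256Bytes and zeroed via MASK(sym) once a symbol has been tried. Because frequencies fit in a byte (MAX_FREQ is 124), Freq & mask yields either the full frequency or zero, with no branch. A small sketch of the idea with illustrative arrays:

#include <string.h>

static unsigned masked_sum(const unsigned char *freq, const unsigned char *sym,
                           unsigned n, const unsigned char charMask[256])
{
    unsigned sum = 0;
    unsigned i;
    for (i = 0; i < n; i++)
        sum += freq[i] & charMask[sym[i]];   /* 0xFF keeps, 0x00 drops */
    return sum;
}

/* setup, as PPMD_SetAllBitsIn256Bytes + MASK(sym) = 0 do above */
static void mask_init(unsigned char charMask[256], unsigned triedSym)
{
    memset(charMask, 0xFF, 256);
    charMask[triedSym] = 0;
}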


@ -1,127 +1,10 @@
/* Precomp.h -- precompilation file
2024-01-25 : Igor Pavlov : Public domain */
/* Precomp.h -- StdAfx
2013-11-12 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_PRECOMP_H
#define ZIP7_INC_PRECOMP_H
/*
this file must be included before another *.h files and before <windows.h>.
this file is included from the following files:
C\*.c
C\Util\*\Precomp.h <- C\Util\*\*.c
CPP\Common\Common.h <- *\StdAfx.h <- *\*.cpp
this file can set the following macros:
Z7_LARGE_PAGES 1
Z7_LONG_PATH 1
Z7_WIN32_WINNT_MIN 0x0500 (or higher) : we require at least win2000+ for 7-Zip
_WIN32_WINNT 0x0500 (or higher)
WINVER _WIN32_WINNT
UNICODE 1
_UNICODE 1
*/
#ifndef __7Z_PRECOMP_H
#define __7Z_PRECOMP_H
#include "Compiler.h"
#ifdef _MSC_VER
// #pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty
#if _MSC_VER >= 1912
// #pragma warning(disable : 5039) // pointer or reference to potentially throwing function passed to 'extern "C"' function under - EHc.Undefined behavior may occur if this function throws an exception.
#endif
#endif
/*
// for debug:
#define UNICODE 1
#define _UNICODE 1
#define _WIN32_WINNT 0x0500 // win2000
#ifndef WINVER
#define WINVER _WIN32_WINNT
#endif
*/
#ifdef _WIN32
/*
this "Precomp.h" file must be included before <windows.h>,
if we want to define _WIN32_WINNT before <windows.h>.
*/
#ifndef Z7_LARGE_PAGES
#ifndef Z7_NO_LARGE_PAGES
#define Z7_LARGE_PAGES 1
#endif
#endif
#ifndef Z7_LONG_PATH
#ifndef Z7_NO_LONG_PATH
#define Z7_LONG_PATH 1
#endif
#endif
#ifndef Z7_DEVICE_FILE
#ifndef Z7_NO_DEVICE_FILE
// #define Z7_DEVICE_FILE 1
#endif
#endif
// we don't change macros if included after <windows.h>
#ifndef _WINDOWS_
#ifndef Z7_WIN32_WINNT_MIN
#if defined(_M_ARM64) || defined(__aarch64__)
// #define Z7_WIN32_WINNT_MIN 0x0a00 // win10
#define Z7_WIN32_WINNT_MIN 0x0600 // vista
#elif defined(_M_ARM) && defined(_M_ARMT) && defined(_M_ARM_NT)
// #define Z7_WIN32_WINNT_MIN 0x0602 // win8
#define Z7_WIN32_WINNT_MIN 0x0600 // vista
#elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(_M_IA64)
#define Z7_WIN32_WINNT_MIN 0x0503 // win2003
// #elif defined(_M_IX86) || defined(__i386__)
// #define Z7_WIN32_WINNT_MIN 0x0500 // win2000
#else // x86 and another(old) systems
#define Z7_WIN32_WINNT_MIN 0x0500 // win2000
// #define Z7_WIN32_WINNT_MIN 0x0502 // win2003 // for debug
#endif
#endif // Z7_WIN32_WINNT_MIN
#ifndef Z7_DO_NOT_DEFINE_WIN32_WINNT
#ifdef _WIN32_WINNT
// #error Stop_Compiling_Bad_WIN32_WINNT
#else
#ifndef Z7_NO_DEFINE_WIN32_WINNT
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define _WIN32_WINNT Z7_WIN32_WINNT_MIN
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // _WIN32_WINNT
#ifndef WINVER
#define WINVER _WIN32_WINNT
#endif
#endif // Z7_DO_NOT_DEFINE_WIN32_WINNT
#ifndef _MBCS
#ifndef Z7_NO_UNICODE
// UNICODE and _UNICODE are used by <windows.h> and by 7-zip code.
#ifndef UNICODE
#define UNICODE 1
#endif
#ifndef _UNICODE
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define _UNICODE 1
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // Z7_NO_UNICODE
#endif // _MBCS
#endif // _WINDOWS_
// #include "7zWindows.h"
#endif // _WIN32
/* #include "7zTypes.h" */
#endif
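The comments at the top of the new Precomp.h spell out an ordering contract: it must be seen before <windows.h> so that _WIN32_WINNT, WINVER, and UNICODE are already decided. In a translation unit that contract looks like this (file set illustrative):

/* typical prologue implied by the header's comments */
#include "Precomp.h"     /* first: chooses _WIN32_WINNT, UNICODE, ... */
#ifdef _WIN32
#include <windows.h>     /* now sees the macros chosen above */
#endif
#include "7zTypes.h"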


@ -1,14 +1,14 @@
/* RotateDefs.h -- Rotate functions
2023-06-18 : Igor Pavlov : Public domain */
2015-03-25 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_ROTATE_DEFS_H
#define ZIP7_INC_ROTATE_DEFS_H
#ifndef __ROTATE_DEFS_H
#define __ROTATE_DEFS_H
#ifdef _MSC_VER
#include <stdlib.h>
/* don't use _rotl with old MINGW. It can insert slow call to function. */
/* don't use _rotl with MINGW. It can insert slow call to function. */
/* #if (_MSC_VER >= 1200) */
#pragma intrinsic(_rotl)
@ -18,32 +18,12 @@
#define rotlFixed(x, n) _rotl((x), (n))
#define rotrFixed(x, n) _rotr((x), (n))
#if (_MSC_VER >= 1300)
#define Z7_ROTL64(x, n) _rotl64((x), (n))
#define Z7_ROTR64(x, n) _rotr64((x), (n))
#else
#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#endif
#else
/* new compilers can translate these macros to fast commands. */
#if defined(__clang__) && (__clang_major__ >= 4) \
|| defined(__GNUC__) && (__GNUC__ >= 5)
/* GCC 4.9.0 and clang 3.5 can recognize more correct version: */
#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (-(n) & 31)))
#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (-(n) & 31)))
#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (-(n) & 63)))
#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (-(n) & 63)))
#else
/* for old GCC / clang: */
#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#endif
#endif
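The substantive change in the non-MSVC branch above is the shift amount: (-(n) & 31) instead of (32 - (n)), so a rotate by 0 never produces an undefined shift by 32 bits, and GCC >= 5 / clang >= 4 still pattern-match the expression to a single rotate instruction, per the comments. As a checked function (a sketch, valid for 0 <= n < 32):

#include <stdint.h>
#include <assert.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
    /* for n == 0 both shifts are by 0 bits, so no UB; compiles to one rol */
    return (x << n) | (x >> ((0 - n) & 31));
}

int main(void)
{
    assert(rotl32(0x80000001u, 1) == 0x00000003u);
    assert(rotl32(0xDEADBEEFu, 0) == 0xDEADBEEFu);  /* the n == 0 edge case */
    return 0;
}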

316 C/Sha1.c

@ -1,60 +1,64 @@
/* Sha1.c -- SHA-1 Hash
: Igor Pavlov : Public domain
2021-07-13 : Igor Pavlov : Public domain
This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */
#include "Precomp.h"
#include <string.h>
#include "Sha1.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#include "RotateDefs.h"
#include "Sha1.h"
#if defined(_MSC_VER) && (_MSC_VER < 1900)
// #define USE_MY_MM
#endif
#ifdef MY_CPU_X86_OR_AMD64
#if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \
|| defined(_MSC_VER) && (_MSC_VER >= 1200)
#define Z7_COMPILER_SHA1_SUPPORTED
#ifdef _MSC_VER
#if _MSC_VER >= 1200
#define _SHA_SUPPORTED
#endif
#elif defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define _SHA_SUPPORTED
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 8) // fix that check
#define _SHA_SUPPORTED
#endif
#elif defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1800) // fix that check
#define _SHA_SUPPORTED
#endif
#endif
#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \
&& (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037))
#if defined(__ARM_FEATURE_SHA2) \
|| defined(__ARM_FEATURE_CRYPTO)
#define Z7_COMPILER_SHA1_SUPPORTED
#else
#if defined(MY_CPU_ARM64) \
|| defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
|| defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(__GNUC__) && (__GNUC__ >= 6) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
|| !defined(Z7_CLANG_VERSION) \
|| defined(__ARM_NEON) && \
(Z7_CLANG_VERSION < 170000 || \
Z7_CLANG_VERSION > 170001)
#define Z7_COMPILER_SHA1_SUPPORTED
#elif defined(MY_CPU_ARM_OR_ARM64)
#ifdef _MSC_VER
#if _MSC_VER >= 1910 && _MSC_VER >= 1929 && _MSC_FULL_VER >= 192930037
#define _SHA_SUPPORTED
#endif
#elif defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define _SHA_SUPPORTED
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define _SHA_SUPPORTED
#endif
#endif
#endif
void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks);
#ifdef Z7_COMPILER_SHA1_SUPPORTED
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
#ifdef _SHA_SUPPORTED
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks;
static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW;
static SHA1_FUNC_UPDATE_BLOCKS g_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks;
static SHA1_FUNC_UPDATE_BLOCKS g_FUNC_UPDATE_BLOCKS_HW;
#define SHA1_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
#define UPDATE_BLOCKS(p) p->func_UpdateBlocks
#else
#define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks
#define UPDATE_BLOCKS(p) Sha1_UpdateBlocks
#endif
@ -62,16 +66,16 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
{
SHA1_FUNC_UPDATE_BLOCKS func = Sha1_UpdateBlocks;
#ifdef Z7_COMPILER_SHA1_SUPPORTED
#ifdef _SHA_SUPPORTED
if (algo != SHA1_ALGO_SW)
{
if (algo == SHA1_ALGO_DEFAULT)
func = g_SHA1_FUNC_UPDATE_BLOCKS;
func = g_FUNC_UPDATE_BLOCKS;
else
{
if (algo != SHA1_ALGO_HW)
return False;
func = g_SHA1_FUNC_UPDATE_BLOCKS_HW;
func = g_FUNC_UPDATE_BLOCKS_HW;
if (!func)
return False;
}
@ -81,28 +85,27 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
return False;
#endif
p->v.vars.func_UpdateBlocks = func;
p->func_UpdateBlocks = func;
return True;
}
/* define it for speed optimization */
// #define Z7_SHA1_UNROLL
// #define _SHA1_UNROLL
// allowed unroll steps: (1, 2, 4, 5, 20)
#undef Z7_SHA1_BIG_W
#ifdef Z7_SHA1_UNROLL
#ifdef _SHA1_UNROLL
#define STEP_PRE 20
#define STEP_MAIN 20
#else
#define Z7_SHA1_BIG_W
#define _SHA1_BIG_W
#define STEP_PRE 5
#define STEP_MAIN 5
#endif
#ifdef Z7_SHA1_BIG_W
#ifdef _SHA1_BIG_W
#define kNumW 80
#define w(i) W[i]
#else
@ -147,11 +150,11 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
*/
#define M5(i, fx, wx0, wx1) \
T5 ( a,b,c,d,e, fx, wx0((i) ) ) \
T5 ( e,a,b,c,d, fx, wx1((i)+1) ) \
T5 ( d,e,a,b,c, fx, wx1((i)+2) ) \
T5 ( c,d,e,a,b, fx, wx1((i)+3) ) \
T5 ( b,c,d,e,a, fx, wx1((i)+4) ) \
T5 ( a,b,c,d,e, fx, wx0((i) ) ); \
T5 ( e,a,b,c,d, fx, wx1((i)+1) ); \
T5 ( d,e,a,b,c, fx, wx1((i)+2) ); \
T5 ( c,d,e,a,b, fx, wx1((i)+3) ); \
T5 ( b,c,d,e,a, fx, wx1((i)+4) ); \
#define R5(i, fx, wx) \
M5 ( i, fx, wx, wx) \
@ -160,17 +163,17 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
#if STEP_PRE > 5
#define R20_START \
R5 ( 0, f0, w0) \
R5 ( 5, f0, w0) \
R5 ( 10, f0, w0) \
M5 ( 15, f0, w0, w1) \
R5 ( 0, f0, w0); \
R5 ( 5, f0, w0); \
R5 ( 10, f0, w0); \
M5 ( 15, f0, w0, w1); \
#elif STEP_PRE == 5
#define R20_START \
{ size_t i; for (i = 0; i < 15; i += STEP_PRE) \
{ R5(i, f0, w0) } } \
M5 ( 15, f0, w0, w1) \
{ R5(i, f0, w0); } } \
M5 ( 15, f0, w0, w1); \
#else
@ -184,8 +187,8 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
#define R20_START \
{ size_t i; for (i = 0; i < 16; i += STEP_PRE) \
{ R_PRE(i, f0, w0) } } \
R4 ( 16, f0, w1) \
{ R_PRE(i, f0, w0); } } \
R4 ( 16, f0, w1); \
#endif
@ -194,10 +197,10 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
#if STEP_MAIN > 5
#define R20(ii, fx) \
R5 ( (ii) , fx, w1) \
R5 ( (ii) + 5 , fx, w1) \
R5 ( (ii) + 10, fx, w1) \
R5 ( (ii) + 15, fx, w1) \
R5 ( (ii) , fx, w1); \
R5 ( (ii) + 5 , fx, w1); \
R5 ( (ii) + 10, fx, w1); \
R5 ( (ii) + 15, fx, w1); \
#else
@ -213,7 +216,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
#define R20(ii, fx) \
{ size_t i; for (i = (ii); i < (ii) + 20; i += STEP_MAIN) \
{ R_MAIN(i, fx, w1) } } \
{ R_MAIN(i, fx, w1); } } \
#endif
@ -221,7 +224,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
void Sha1_InitState(CSha1 *p)
{
p->v.vars.count = 0;
p->count = 0;
p->state[0] = 0x67452301;
p->state[1] = 0xEFCDAB89;
p->state[2] = 0x98BADCFE;
@ -231,9 +234,9 @@ void Sha1_InitState(CSha1 *p)
void Sha1_Init(CSha1 *p)
{
p->v.vars.func_UpdateBlocks =
#ifdef Z7_COMPILER_SHA1_SUPPORTED
g_SHA1_FUNC_UPDATE_BLOCKS;
p->func_UpdateBlocks =
#ifdef _SHA_SUPPORTED
g_FUNC_UPDATE_BLOCKS;
#else
NULL;
#endif
@ -241,12 +244,12 @@ void Sha1_Init(CSha1 *p)
}
Z7_NO_INLINE
void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks)
MY_NO_INLINE
void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks)
{
UInt32 a, b, c, d, e;
UInt32 W[kNumW];
// if (numBlocks != 0x1264378347) return;
if (numBlocks == 0)
return;
@ -263,9 +266,9 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num
#endif
R20_START
R20(20, f1)
R20(40, f2)
R20(60, f3)
R20(20, f1);
R20(40, f2);
R20(60, f3);
a += state[0];
b += state[1];
@ -279,27 +282,32 @@ void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t num
state[3] = d;
state[4] = e;
data += SHA1_BLOCK_SIZE;
data += 64;
}
while (--numBlocks);
}
#define Sha1_UpdateBlock(p) SHA1_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
#define Sha1_UpdateBlock(p) UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
{
if (size == 0)
return;
{
const unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1);
const unsigned num = SHA1_BLOCK_SIZE - pos;
p->v.vars.count += size;
unsigned pos = (unsigned)p->count & 0x3F;
unsigned num;
p->count += size;
num = 64 - pos;
if (num > size)
{
memcpy(p->buffer + pos, data, size);
return;
}
if (pos != 0)
{
size -= num;
@ -309,10 +317,9 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
}
}
{
const size_t numBlocks = size >> 6;
// if (numBlocks)
SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= SHA1_BLOCK_SIZE - 1;
size_t numBlocks = size >> 6;
UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= 0x3F;
if (size == 0)
return;
data += (numBlocks << 6);
@ -323,40 +330,64 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
void Sha1_Final(CSha1 *p, Byte *digest)
{
unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1);
unsigned pos = (unsigned)p->count & 0x3F;
p->buffer[pos++] = 0x80;
if (pos > (SHA1_BLOCK_SIZE - 4 * 2))
if (pos > (64 - 8))
{
while (pos != SHA1_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, SHA1_BLOCK_SIZE - pos);
while (pos != 64) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, 64 - pos);
Sha1_UpdateBlock(p);
pos = 0;
}
memset(&p->buffer[pos], 0, (SHA1_BLOCK_SIZE - 4 * 2) - pos);
/*
if (pos & 3)
{
const UInt64 numBits = p->v.vars.count << 3;
SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
p->buffer[pos] = 0;
p->buffer[pos + 1] = 0;
p->buffer[pos + 2] = 0;
pos += 3;
pos &= ~3;
}
{
for (; pos < 64 - 8; pos += 4)
*(UInt32 *)(&p->buffer[pos]) = 0;
}
*/
memset(&p->buffer[pos], 0, (64 - 8) - pos);
{
UInt64 numBits = (p->count << 3);
SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32));
SetBe32(p->buffer + 64 - 4, (UInt32)(numBits));
}
Sha1_UpdateBlock(p);
SetBe32(digest, p->state[0])
SetBe32(digest + 4, p->state[1])
SetBe32(digest + 8, p->state[2])
SetBe32(digest + 12, p->state[3])
SetBe32(digest + 16, p->state[4])
SetBe32(digest, p->state[0]);
SetBe32(digest + 4, p->state[1]);
SetBe32(digest + 8, p->state[2]);
SetBe32(digest + 12, p->state[3]);
SetBe32(digest + 16, p->state[4]);
Sha1_InitState(p);
}
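For concreteness, here is what the padding path above leaves in the block for a 3-byte message (pos starts at 3, so the overflow branch is not taken):

/* worked example of Sha1_Final's padding for a 3-byte message:
     buffer[0..2]   = message bytes
     buffer[3]      = 0x80             (pos was 3; the appended "1" bit)
     buffer[4..55]  = 0x00             (memset up to 64 - 8)
     buffer[56..63] = 00 00 00 00 00 00 00 18
                                       (count << 3 = 24 bits, big-endian)
   one final Sha1_UpdateBlock(p) then produces the digest */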
void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size)
{
const UInt64 numBits = (p->v.vars.count + size) << 3;
SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32))
SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits))
const UInt64 numBits = (p->count + size) << 3;
SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32));
SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits));
// SetBe32((UInt32 *)(block + size), 0x80000000);
SetUi32((UInt32 *)(void *)(block + size), 0x80)
SetUi32((UInt32 *)(void *)(block + size), 0x80);
size += 4;
while (size != (SHA1_NUM_BLOCK_WORDS - 2) * 4)
{
@ -376,66 +407,67 @@ void Sha1_GetBlockDigest(const CSha1 *p, const Byte *data, Byte *destDigest)
st[3] = p->state[3];
st[4] = p->state[4];
SHA1_UPDATE_BLOCKS(p)(st, data, 1);
UPDATE_BLOCKS(p)(st, data, 1);
SetBe32(destDigest + 0 , st[0])
SetBe32(destDigest + 1 * 4, st[1])
SetBe32(destDigest + 2 * 4, st[2])
SetBe32(destDigest + 3 * 4, st[3])
SetBe32(destDigest + 4 * 4, st[4])
SetBe32(destDigest + 0 , st[0]);
SetBe32(destDigest + 1 * 4, st[1]);
SetBe32(destDigest + 2 * 4, st[2]);
SetBe32(destDigest + 3 * 4, st[3]);
SetBe32(destDigest + 4 * 4, st[4]);
}
void Sha1Prepare(void)
void Sha1Prepare()
{
#ifdef Z7_COMPILER_SHA1_SUPPORTED
#ifdef _SHA_SUPPORTED
SHA1_FUNC_UPDATE_BLOCKS f, f_hw;
f = Sha1_UpdateBlocks;
f_hw = NULL;
#ifdef MY_CPU_X86_OR_AMD64
#ifdef MY_CPU_X86_OR_AMD64
#ifndef USE_MY_MM
if (CPU_IsSupported_SHA()
&& CPU_IsSupported_SSSE3()
// && CPU_IsSupported_SSE41()
)
#else
#endif
#else
if (CPU_IsSupported_SHA1())
#endif
#endif
{
// printf("\n========== HW SHA1 ======== \n");
#if 1 && defined(MY_CPU_ARM_OR_ARM64) && defined(Z7_MSC_VER_ORIGINAL) && (_MSC_FULL_VER < 192930037)
#if defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER)
/* there was a bug in the MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037).
It generated incorrect SHA-1 code. */
#pragma message("== SHA1 code can work incorrectly with this compiler")
#error Stop_Compiling_MSC_Compiler_BUG_SHA1
#endif
It generated incorrect SHA-1 code.
21.03 : we test sha1-hardware code at runtime initialization */
#pragma message("== SHA1 code: MSC compiler : failure-check code was inserted")
UInt32 state[5] = { 0, 1, 2, 3, 4 };
Byte data[64];
unsigned i;
for (i = 0; i < sizeof(data); i += 2)
{
data[i ] = (Byte)(i);
data[i + 1] = (Byte)(i + 1);
}
Sha1_UpdateBlocks_HW(state, data, sizeof(data) / 64);
if ( state[0] != 0x9acd7297
|| state[1] != 0x4624d898
|| state[2] != 0x0bf079f0
|| state[3] != 0x031e61b3
|| state[4] != 0x8323fe20)
{
// printf("\n========== SHA-1 hardware version failure ======== \n");
}
else
#endif
{
f = f_hw = Sha1_UpdateBlocks_HW;
}
}
g_SHA1_FUNC_UPDATE_BLOCKS = f;
g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw;
#endif
g_FUNC_UPDATE_BLOCKS = f;
g_FUNC_UPDATE_BLOCKS_HW = f_hw;
#endif
}
#undef kNumW
#undef w
#undef w0
#undef w1
#undef f0
#undef f1
#undef f2
#undef f3
#undef T1
#undef T5
#undef M5
#undef R1
#undef R2
#undef R4
#undef R5
#undef R20_START
#undef R_PRE
#undef R_MAIN
#undef STEP_PRE
#undef STEP_MAIN
#undef Z7_SHA1_BIG_W
#undef Z7_SHA1_UNROLL
#undef Z7_COMPILER_SHA1_SUPPORTED

View file

@ -1,8 +1,8 @@
/* Sha1.h -- SHA-1 Hash
: Igor Pavlov : Public domain */
2021-02-08 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_SHA1_H
#define ZIP7_INC_SHA1_H
#ifndef __7Z_SHA1_H
#define __7Z_SHA1_H
#include "7zTypes.h"
@ -14,10 +14,7 @@ EXTERN_C_BEGIN
#define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4)
#define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4)
typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks);
typedef void (MY_FAST_CALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks);
/*
if (the system supports different SHA1 code implementations)
@ -35,18 +32,11 @@ typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte
typedef struct
{
union
{
struct
{
SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
UInt64 count;
} vars;
UInt64 _pad_64bit[4];
void *_pad_align_ptr[2];
} v;
SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
UInt64 count;
UInt64 __pad_2[2];
UInt32 state[SHA1_NUM_DIGEST_WORDS];
UInt32 _pad_3[3];
UInt32 __pad_3[3];
Byte buffer[SHA1_BLOCK_SIZE];
} CSha1;
@ -72,7 +62,7 @@ void Sha1_Final(CSha1 *p, Byte *digest);
void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size);
void Sha1_GetBlockDigest(const CSha1 *p, const Byte *data, Byte *destDigest);
// void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks);
// void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks);
/*
call Sha1Prepare() once at program start.
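A minimal one-shot usage sketch following that comment (my illustration; Sha1_Init and the "abc" test vector are assumed from the rest of the header, not shown in this hunk):
#include <stdio.h>
#include "Sha1.h"
int main(void)
{
  Byte digest[SHA1_DIGEST_SIZE];
  CSha1 p;
  unsigned i;
  Sha1Prepare();            /* once at program start: selects the SW or HW blocks function */
  Sha1_Init(&p);
  Sha1_Update(&p, (const Byte *)"abc", 3);
  Sha1_Final(&p, digest);   /* Final also re-inits the state for reuse */
  for (i = 0; i < SHA1_DIGEST_SIZE; i++)
    printf("%02x", digest[i]);
  printf("\n");             /* expected: a9993e364706816aba3e25717850c26c9cd0d89d */
  return 0;
}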

View file

@ -1,53 +1,71 @@
/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions
: Igor Pavlov : Public domain */
2021-04-01 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Compiler.h"
#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif
#include "CpuArch.h"
// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
#if defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define USE_HW_SHA
#elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
#define USE_HW_SHA
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
#if !defined(__SHA__) || !defined(__SSSE3__)
#ifndef __SHA__
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
#if defined(_MSC_VER)
// SSSE3: for clang-cl:
#include <tmmintrin.h>
#define __SHA__
#endif
#endif
#endif
#elif defined(_MSC_VER)
#if (_MSC_VER >= 1900)
#pragma clang diagnostic ignored "-Wvector-conversion"
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 8) // fix that check
#define USE_HW_SHA
#ifndef __SHA__
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
// #pragma GCC target("sha,ssse3")
#endif
#endif
#elif defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1800) // fix that check
#define USE_HW_SHA
#endif
#elif defined(_MSC_VER)
#ifdef USE_MY_MM
#define USE_VER_MIN 1300
#else
#define Z7_USE_HW_SHA_STUB
#define USE_VER_MIN 1910
#endif
#if _MSC_VER >= USE_VER_MIN
#define USE_HW_SHA
#endif
#endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
// #define Z7_USE_HW_SHA_STUB // for debug
#endif
#ifdef USE_HW_SHA
// #pragma message("Sha1 HW")
// #include <wmmintrin.h>
// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
#include <immintrin.h>
#if defined (__clang__) && defined(_MSC_VER)
#if !defined(__SHA__)
#include <shaintrin.h>
#endif
#else
#include <emmintrin.h>
#if defined(_MSC_VER) && (_MSC_VER >= 1600)
// #include <intrin.h>
#endif
#ifdef USE_MY_MM
#include "My_mm.h"
#endif
#endif
@ -69,71 +87,86 @@ SHA:
_mm_sha1*
*/
#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src);
#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask);
#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask);
#ifdef __clang__
#define SHA1_RNDS4_RET_TYPE_CAST (__m128i)
#else
#define SHA1_RNDS4_RET_TYPE_CAST
#endif
#define SHA1_RND4(abcd, e0, f) abcd = SHA1_RNDS4_RET_TYPE_CAST _mm_sha1rnds4_epu32(abcd, e0, f);
#define SHA1_NEXTE(e, m) e = _mm_sha1nexte_epu32(e, m);
#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src);
#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src);
#define SHA1_RND4(abcd, e0, f) abcd = _mm_sha1rnds4_epu32(abcd, e0, f);
#define SHA1_NEXTE(e, m) e = _mm_sha1nexte_epu32(e, m);
#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src);
#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src);
#define LOAD_SHUFFLE(m, k) \
m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
SHUFFLE_EPI8(m, mask) \
SHUFFLE_EPI8(m, mask); \
#define SM1(m0, m1, m2, m3) \
SHA1_MSG1(m0, m1); \
#define SM2(m0, m1, m2, m3) \
XOR_SI128(m3, m1); \
SHA1_MSG2(m3, m2); \
#define SM3(m0, m1, m2, m3) \
XOR_SI128(m3, m1); \
SM1(m0, m1, m2, m3) \
SHA1_MSG2(m3, m2); \
#define NNN(m0, m1, m2, m3)
#define SM1(m0, m1, m2, m3) \
SHA1_MSG1(m0, m1) \
#define SM2(m0, m1, m2, m3) \
XOR_SI128(m3, m1) \
SHA1_MSG2(m3, m2) \
#define SM3(m0, m1, m2, m3) \
XOR_SI128(m3, m1) \
SM1(m0, m1, m2, m3) \
SHA1_MSG2(m3, m2) \
#define R4(k, m0, m1, m2, m3, e0, e1, OP) \
#define R4(k, e0, e1, m0, m1, m2, m3, OP) \
e1 = abcd; \
SHA1_RND4(abcd, e0, (k) / 5) \
SHA1_NEXTE(e1, m1) \
OP(m0, m1, m2, m3) \
SHA1_RND4(abcd, e0, (k) / 5); \
SHA1_NEXTE(e1, m1); \
OP(m0, m1, m2, m3); \
#define R16(k, mx, OP0, OP1, OP2, OP3) \
R4 ( (k)*4+0, m0,m1,m2,m3, e0,e1, OP0 ) \
R4 ( (k)*4+1, m1,m2,m3,m0, e1,e0, OP1 ) \
R4 ( (k)*4+2, m2,m3,m0,m1, e0,e1, OP2 ) \
R4 ( (k)*4+3, m3,mx,m1,m2, e1,e0, OP3 ) \
R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \
R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \
R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \
R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \
#define PREPARE_STATE \
SHUFFLE_EPI32 (abcd, 0x1B) \
SHUFFLE_EPI32 (e0, 0x1B) \
SHUFFLE_EPI32 (abcd, 0x1B); \
SHUFFLE_EPI32 (e0, 0x1B); \
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{
const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
__m128i abcd, e0;
if (numBlocks == 0)
return;
@ -157,15 +190,15 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
LOAD_SHUFFLE (m2, 2)
LOAD_SHUFFLE (m3, 3)
ADD_EPI32(e0, m0)
ADD_EPI32(e0, m0);
R16 ( 0, m0, SM1, SM3, SM3, SM3 )
R16 ( 1, m0, SM3, SM3, SM3, SM3 )
R16 ( 2, m0, SM3, SM3, SM3, SM3 )
R16 ( 3, m0, SM3, SM3, SM3, SM3 )
R16 ( 4, e2, SM2, NNN, NNN, NNN )
R16 ( 0, m0, SM1, SM3, SM3, SM3 );
R16 ( 1, m0, SM3, SM3, SM3, SM3 );
R16 ( 2, m0, SM3, SM3, SM3, SM3 );
R16 ( 3, m0, SM3, SM3, SM3, SM3 );
R16 ( 4, e2, SM2, NNN, NNN, NNN );
ADD_EPI32(abcd, abcd_save)
ADD_EPI32(abcd, abcd_save);
data += 64;
}
@ -174,155 +207,78 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
PREPARE_STATE
_mm_storeu_si128((__m128i *) (void *) state, abcd);
*(state + 4) = (UInt32)_mm_cvtsi128_si32(e0);
*(state+4) = (UInt32)_mm_cvtsi128_si32(e0);
}
#endif // USE_HW_SHA
#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \
&& (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037))
#if defined(__ARM_FEATURE_SHA2) \
|| defined(__ARM_FEATURE_CRYPTO)
#define USE_HW_SHA
#else
#if defined(MY_CPU_ARM64) \
|| defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
|| defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(__GNUC__) && (__GNUC__ >= 6) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
|| !defined(Z7_CLANG_VERSION) \
|| defined(__ARM_NEON) && \
(Z7_CLANG_VERSION < 170000 || \
Z7_CLANG_VERSION > 170001)
#elif defined(MY_CPU_ARM_OR_ARM64)
#if defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define USE_HW_SHA
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define USE_HW_SHA
#endif
#elif defined(_MSC_VER)
#if _MSC_VER >= 1910
#define USE_HW_SHA
#endif
#endif
#ifdef USE_HW_SHA
// #pragma message("=== Sha1 HW === ")
// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_SHA2
#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA2) && \
!defined(__ARM_FEATURE_CRYPTO)
#ifdef MY_CPU_ARM64
#if defined(__clang__)
#define ATTRIB_SHA __attribute__((__target__("crypto")))
#else
#define ATTRIB_SHA __attribute__((__target__("+crypto")))
#endif
#else
#if defined(__clang__) && (__clang_major__ >= 1)
#define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
#else
#define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
#endif
#endif
#else
// _MSC_VER
// for arm32
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif
#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
!defined(__ARM_FEATURE_CRYPTO)
// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
#define __ARM_FEATURE_CRYPTO 1
// #else
#define __ARM_FEATURE_SHA2 1
// #endif
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang
#if defined(__clang__)
#if defined(__ARM_ARCH) && __ARM_ARCH < 8
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
// #pragma message("#define __ARM_ARCH 8")
#undef __ARM_ARCH
#define __ARM_ARCH 8
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang
#include <arm_neon.h>
#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
defined(__ARM_FEATURE_CRYPTO) && \
defined(__ARM_FEATURE_SHA2)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_CRYPTO
#undef __ARM_FEATURE_SHA2
#undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif
#endif // Z7_MSC_VER_ORIGINAL
typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC
// the bug in clang 3.8.1:
// __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1);
#if defined(__clang__) && (__clang_major__ <= 9)
#pragma GCC diagnostic ignored "-Wvector-conversion"
#endif
#ifdef MY_CPU_BE
#define MY_rev32_for_LE(x) x
#define MY_rev32_for_LE(x)
#else
#define MY_rev32_for_LE(x) vrev32q_u8(x)
#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif
#define LOAD_128_32(_p) vld1q_u32(_p)
#define LOAD_128_8(_p) vld1q_u8 (_p)
#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
#define LOAD_128(_p) (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)
#define LOAD_SHUFFLE(m, k) \
m = vreinterpretq_u32_u8( \
MY_rev32_for_LE( \
LOAD_128_8(data + (k) * 16))); \
m = LOAD_128((data + (k) * 16)); \
MY_rev32_for_LE(m); \
#define N0(dest, src2, src3)
#define N1(dest, src)
#define U0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3);
#define U1(dest, src) dest = vsha1su1q_u32(dest, src);
#define C(e) abcd = vsha1cq_u32(abcd, e, t)
#define P(e) abcd = vsha1pq_u32(abcd, e, t)
#define M(e) abcd = vsha1mq_u32(abcd, e, t)
#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3);
#define SU1(dest, src) dest = vsha1su1q_u32(dest, src);
#define C(e) abcd = vsha1cq_u32(abcd, e, t);
#define P(e) abcd = vsha1pq_u32(abcd, e, t);
#define M(e) abcd = vsha1mq_u32(abcd, e, t);
#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
#define T(m, c) t = vaddq_u32(m, c)
#define R16(d0,d1,d2,d3, f0,z0, f1,z1, f2,z2, f3,z3, w0,w1,w2,w3) \
T(m0, d0); f0(m3, m0, m1) z0(m2, m1) H(e1); w0(e0); \
T(m1, d1); f1(m0, m1, m2) z1(m3, m2) H(e0); w1(e1); \
T(m2, d2); f2(m1, m2, m3) z2(m0, m3) H(e1); w2(e0); \
T(m3, d3); f3(m2, m3, m0) z3(m1, m0) H(e0); w3(e1); \
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
v128 abcd;
v128 c0, c1, c2, c3;
@ -336,7 +292,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
c2 = vdupq_n_u32(0x8f1bbcdc);
c3 = vdupq_n_u32(0xca62c1d6);
abcd = LOAD_128_32(&state[0]);
abcd = LOAD_128(&state[0]);
e0 = state[4];
do
@ -354,11 +310,26 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
LOAD_SHUFFLE (m2, 2)
LOAD_SHUFFLE (m3, 3)
R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C )
R16 ( c0,c1,c1,c1, U0,U1, U0,U1, U0,U1, U0,U1, C,P,P,P )
R16 ( c1,c1,c2,c2, U0,U1, U0,U1, U0,U1, U0,U1, P,P,M,M )
R16 ( c2,c2,c2,c3, U0,U1, U0,U1, U0,U1, U0,U1, M,M,M,P )
R16 ( c3,c3,c3,c3, U0,U1, N0,U1, N0,N1, N0,N1, P,P,P,P )
T(m0, c0); H(e1); C(e0);
T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1);
T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0);
T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1);
T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0);
T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0);
T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1);
T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0);
T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1);
T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
T(m1, c3); SU1(m3, m2); H(e0); P(e1);
T(m2, c3); H(e1); P(e0);
T(m3, c3); H(e0); P(e1);
abcd = vaddq_u32(abcd, abcd_save);
e0 += e0_save;
@ -367,7 +338,7 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
}
while (--numBlocks);
STORE_128_32(&state[0], abcd);
STORE_128(&state[0], abcd);
state[4] = e0;
}
@ -375,16 +346,19 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
#endif // MY_CPU_ARM_OR_ARM64
#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
#ifndef USE_HW_SHA
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// #include "Sha1.h"
// #if defined(_MSC_VER)
void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks);
#pragma message("Sha1 HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha1_UpdateBlocks (UInt32 state[5], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{
Sha1_UpdateBlocks(state, data, numBlocks);
/*
@ -395,30 +369,5 @@ void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
return;
*/
}
#endif
#undef U0
#undef U1
#undef N0
#undef N1
#undef C
#undef P
#undef M
#undef H
#undef T
#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef SM3
#undef NNN
#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB
#endif

View file

@ -1,60 +1,64 @@
/* Sha256.c -- SHA-256 Hash
: Igor Pavlov : Public domain
2021-04-01 : Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "Precomp.h"
#include <string.h>
#include "Sha256.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#include "RotateDefs.h"
#include "Sha256.h"
#if defined(_MSC_VER) && (_MSC_VER < 1900)
// #define USE_MY_MM
#endif
#ifdef MY_CPU_X86_OR_AMD64
#if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \
|| defined(_MSC_VER) && (_MSC_VER >= 1200)
#define Z7_COMPILER_SHA256_SUPPORTED
#ifdef _MSC_VER
#if _MSC_VER >= 1200
#define _SHA_SUPPORTED
#endif
#elif defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define _SHA_SUPPORTED
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 8) // fix that check
#define _SHA_SUPPORTED
#endif
#elif defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1800) // fix that check
#define _SHA_SUPPORTED
#endif
#endif
#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
#if defined(__ARM_FEATURE_SHA2) \
|| defined(__ARM_FEATURE_CRYPTO)
#define Z7_COMPILER_SHA256_SUPPORTED
#else
#if defined(MY_CPU_ARM64) \
|| defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
|| defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(__GNUC__) && (__GNUC__ >= 6) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
|| !defined(Z7_CLANG_VERSION) \
|| defined(__ARM_NEON) && \
(Z7_CLANG_VERSION < 170000 || \
Z7_CLANG_VERSION > 170001)
#define Z7_COMPILER_SHA256_SUPPORTED
#elif defined(MY_CPU_ARM_OR_ARM64)
#ifdef _MSC_VER
#if _MSC_VER >= 1910
#define _SHA_SUPPORTED
#endif
#elif defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define _SHA_SUPPORTED
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define _SHA_SUPPORTED
#endif
#endif
#endif
void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef Z7_COMPILER_SHA256_SUPPORTED
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef _SHA_SUPPORTED
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks;
static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW;
static SHA256_FUNC_UPDATE_BLOCKS g_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks;
static SHA256_FUNC_UPDATE_BLOCKS g_FUNC_UPDATE_BLOCKS_HW;
#define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
#define UPDATE_BLOCKS(p) p->func_UpdateBlocks
#else
#define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks
#define UPDATE_BLOCKS(p) Sha256_UpdateBlocks
#endif
@ -62,16 +66,16 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
{
SHA256_FUNC_UPDATE_BLOCKS func = Sha256_UpdateBlocks;
#ifdef Z7_COMPILER_SHA256_SUPPORTED
#ifdef _SHA_SUPPORTED
if (algo != SHA256_ALGO_SW)
{
if (algo == SHA256_ALGO_DEFAULT)
func = g_SHA256_FUNC_UPDATE_BLOCKS;
func = g_FUNC_UPDATE_BLOCKS;
else
{
if (algo != SHA256_ALGO_HW)
return False;
func = g_SHA256_FUNC_UPDATE_BLOCKS_HW;
func = g_FUNC_UPDATE_BLOCKS_HW;
if (!func)
return False;
}
@ -81,25 +85,24 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
return False;
#endif
p->v.vars.func_UpdateBlocks = func;
p->func_UpdateBlocks = func;
return True;
}
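A hedged usage sketch of this selection API, using the SHA256_ALGO_* values seen above (Hash_PickBest is a hypothetical wrapper; Sha256_Init already installs the default function, so the point here is forcing the hardware path with a software fallback):
#include "Sha256.h"
void Hash_PickBest(CSha256 *p)
{
  Sha256_Init(p);
  if (!Sha256_SetFunction(p, SHA256_ALGO_HW))  /* returns False if no HW function was detected */
    Sha256_SetFunction(p, SHA256_ALGO_SW);     /* the SW request always succeeds */
}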
/* define it for speed optimization */
#ifdef Z7_SFX
#ifdef _SFX
#define STEP_PRE 1
#define STEP_MAIN 1
#else
#define STEP_PRE 2
#define STEP_MAIN 4
// #define Z7_SHA256_UNROLL
// #define _SHA256_UNROLL
#endif
#undef Z7_SHA256_BIG_W
#if STEP_MAIN != 16
#define Z7_SHA256_BIG_W
#define _SHA256_BIG_W
#endif
@ -107,7 +110,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
void Sha256_InitState(CSha256 *p)
{
p->v.vars.count = 0;
p->count = 0;
p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372;
@ -118,28 +121,21 @@ void Sha256_InitState(CSha256 *p)
p->state[7] = 0x5be0cd19;
}
void Sha256_Init(CSha256 *p)
{
p->v.vars.func_UpdateBlocks =
#ifdef Z7_COMPILER_SHA256_SUPPORTED
g_SHA256_FUNC_UPDATE_BLOCKS;
p->func_UpdateBlocks =
#ifdef _SHA_SUPPORTED
g_FUNC_UPDATE_BLOCKS;
#else
NULL;
#endif
Sha256_InitState(p);
}
#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22))
#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25))
#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22))
#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25))
#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3))
#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10))
#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10))
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
@ -149,7 +145,7 @@ void Sha256_Init(CSha256 *p)
#define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15))
#ifdef Z7_SHA256_BIG_W
#ifdef _SHA256_BIG_W
// we use +i instead of +(i) to change the order, to avoid a CLANG signed/unsigned warning.
#define w(j, i) W[(size_t)(j) + i]
#define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i))
@ -180,7 +176,7 @@ void Sha256_Init(CSha256 *p)
#define R1_PRE(i) T1( W_PRE, i)
#define R1_MAIN(i) T1( W_MAIN, i)
#if (!defined(Z7_SHA256_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4)
#if (!defined(_SHA256_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4)
#define R2_MAIN(i) \
R1_MAIN(i) \
R1_MAIN(i + 1) \
@ -189,7 +185,7 @@ void Sha256_Init(CSha256 *p)
#if defined(Z7_SHA256_UNROLL) && STEP_MAIN >= 8
#if defined(_SHA256_UNROLL) && STEP_MAIN >= 8
#define T4( a,b,c,d,e,f,g,h, wx, i) \
h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
@ -227,10 +223,14 @@ void Sha256_Init(CSha256 *p)
#endif
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
extern
MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64];
MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = {
// static
extern MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@ -249,29 +249,27 @@ MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
#define K SHA256_K_ARRAY
Z7_NO_INLINE
void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks)
MY_NO_INLINE
void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks)
{
UInt32 W
#ifdef Z7_SHA256_BIG_W
#ifdef _SHA256_BIG_W
[64];
#else
#else
[16];
#endif
unsigned j;
UInt32 a,b,c,d,e,f,g,h;
#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
UInt32 tmp;
#endif
if (numBlocks == 0) return;
#endif
unsigned j;
UInt32 a,b,c,d,e,f,g,h;
#if !defined(_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
UInt32 tmp;
#endif
a = state[0];
b = state[1];
c = state[2];
@ -281,7 +279,7 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
g = state[6];
h = state[7];
do
while (numBlocks)
{
for (j = 0; j < 16; j += STEP_PRE)
@ -299,12 +297,12 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
#else
R1_PRE(0)
R1_PRE(0);
#if STEP_PRE >= 2
R1_PRE(1)
R1_PRE(1);
#if STEP_PRE >= 4
R1_PRE(2)
R1_PRE(3)
R1_PRE(2);
R1_PRE(3);
#endif
#endif
@ -313,32 +311,32 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
for (j = 16; j < 64; j += STEP_MAIN)
{
#if defined(Z7_SHA256_UNROLL) && STEP_MAIN >= 8
#if defined(_SHA256_UNROLL) && STEP_MAIN >= 8
#if STEP_MAIN < 8
R4_MAIN(0)
R4_MAIN(0);
#else
R8_MAIN(0)
R8_MAIN(0);
#if STEP_MAIN == 16
R8_MAIN(8)
R8_MAIN(8);
#endif
#endif
#else
R1_MAIN(0)
R1_MAIN(0);
#if STEP_MAIN >= 2
R1_MAIN(1)
R1_MAIN(1);
#if STEP_MAIN >= 4
R2_MAIN(2)
R2_MAIN(2);
#if STEP_MAIN >= 8
R2_MAIN(4)
R2_MAIN(6)
R2_MAIN(4);
R2_MAIN(6);
#if STEP_MAIN >= 16
R2_MAIN(8)
R2_MAIN(10)
R2_MAIN(12)
R2_MAIN(14)
R2_MAIN(8);
R2_MAIN(10);
R2_MAIN(12);
R2_MAIN(14);
#endif
#endif
#endif
@ -355,27 +353,40 @@ void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t n
g += state[6]; state[6] = g;
h += state[7]; state[7] = h;
data += SHA256_BLOCK_SIZE;
data += 64;
numBlocks--;
}
while (--numBlocks);
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
}
#undef S0
#undef S1
#undef s0
#undef s1
#undef K
#define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
#define Sha256_UpdateBlock(p) UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
{
if (size == 0)
return;
{
const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
const unsigned num = SHA256_BLOCK_SIZE - pos;
p->v.vars.count += size;
unsigned pos = (unsigned)p->count & 0x3F;
unsigned num;
p->count += size;
num = 64 - pos;
if (num > size)
{
memcpy(p->buffer + pos, data, size);
return;
}
if (pos != 0)
{
size -= num;
@ -385,10 +396,9 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
}
}
{
const size_t numBlocks = size >> 6;
// if (numBlocks)
SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= SHA256_BLOCK_SIZE - 1;
size_t numBlocks = size >> 6;
UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= 0x3F;
if (size == 0)
return;
data += (numBlocks << 6);
@ -399,94 +409,78 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
void Sha256_Final(CSha256 *p, Byte *digest)
{
unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
unsigned pos = (unsigned)p->count & 0x3F;
unsigned i;
p->buffer[pos++] = 0x80;
if (pos > (SHA256_BLOCK_SIZE - 4 * 2))
if (pos > (64 - 8))
{
while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos);
while (pos != 64) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, 64 - pos);
Sha256_UpdateBlock(p);
pos = 0;
}
memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos);
/*
if (pos & 3)
{
const UInt64 numBits = p->v.vars.count << 3;
SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
p->buffer[pos] = 0;
p->buffer[pos + 1] = 0;
p->buffer[pos + 2] = 0;
pos += 3;
pos &= ~3;
}
{
for (; pos < 64 - 8; pos += 4)
*(UInt32 *)(&p->buffer[pos]) = 0;
}
*/
memset(&p->buffer[pos], 0, (64 - 8) - pos);
{
UInt64 numBits = (p->count << 3);
SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32));
SetBe32(p->buffer + 64 - 4, (UInt32)(numBits));
}
Sha256_UpdateBlock(p);
#if 1 && defined(MY_CPU_BE)
memcpy(digest, p->state, SHA256_DIGEST_SIZE);
#else
for (i = 0; i < 8; i += 2)
{
unsigned i;
for (i = 0; i < 8; i += 2)
{
const UInt32 v0 = p->state[i];
const UInt32 v1 = p->state[(size_t)i + 1];
SetBe32(digest , v0)
SetBe32(digest + 4, v1)
digest += 4 * 2;
}
UInt32 v0 = p->state[i];
UInt32 v1 = p->state[(size_t)i + 1];
SetBe32(digest , v0);
SetBe32(digest + 4, v1);
digest += 8;
}
#endif
Sha256_InitState(p);
}
void Sha256Prepare(void)
void Sha256Prepare()
{
#ifdef Z7_COMPILER_SHA256_SUPPORTED
#ifdef _SHA_SUPPORTED
SHA256_FUNC_UPDATE_BLOCKS f, f_hw;
f = Sha256_UpdateBlocks;
f_hw = NULL;
#ifdef MY_CPU_X86_OR_AMD64
#ifdef MY_CPU_X86_OR_AMD64
#ifndef USE_MY_MM
if (CPU_IsSupported_SHA()
&& CPU_IsSupported_SSSE3()
// && CPU_IsSupported_SSE41()
)
#else
#endif
#else
if (CPU_IsSupported_SHA2())
#endif
#endif
{
// printf("\n========== HW SHA256 ======== \n");
f = f_hw = Sha256_UpdateBlocks_HW;
}
g_SHA256_FUNC_UPDATE_BLOCKS = f;
g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw;
#endif
g_FUNC_UPDATE_BLOCKS = f;
g_FUNC_UPDATE_BLOCKS_HW = f_hw;
#endif
}
#undef U64C
#undef K
#undef S0
#undef S1
#undef s0
#undef s1
#undef Ch
#undef Maj
#undef W_MAIN
#undef W_PRE
#undef w
#undef blk2_main
#undef blk2
#undef T1
#undef T4
#undef T8
#undef R1_PRE
#undef R1_MAIN
#undef R2_MAIN
#undef R4
#undef R4_PRE
#undef R4_MAIN
#undef R8
#undef R8_PRE
#undef R8_MAIN
#undef STEP_PRE
#undef STEP_MAIN
#undef Z7_SHA256_BIG_W
#undef Z7_SHA256_UNROLL
#undef Z7_COMPILER_SHA256_SUPPORTED

View file

@ -1,8 +1,8 @@
/* Sha256.h -- SHA-256 Hash
: Igor Pavlov : Public domain */
2021-01-01 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_SHA256_H
#define ZIP7_INC_SHA256_H
#ifndef __7Z_SHA256_H
#define __7Z_SHA256_H
#include "7zTypes.h"
@ -14,10 +14,7 @@ EXTERN_C_BEGIN
#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4)
#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4)
typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks);
typedef void (MY_FAST_CALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks);
/*
if (the system supports different SHA256 code implementations)
@ -35,16 +32,9 @@ typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byt
typedef struct
{
union
{
struct
{
SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
UInt64 count;
} vars;
UInt64 _pad_64bit[4];
void *_pad_align_ptr[2];
} v;
SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
UInt64 count;
UInt64 __pad_2[2];
UInt32 state[SHA256_NUM_DIGEST_WORDS];
Byte buffer[SHA256_BLOCK_SIZE];
@ -72,7 +62,7 @@ void Sha256_Final(CSha256 *p, Byte *digest);
// void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);
// void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);
/*
call Sha256Prepare() once at program start.
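A minimal one-shot usage sketch following that comment (my illustration; the split Update calls show that data can be fed in pieces):
#include <stdio.h>
#include "Sha256.h"
int main(void)
{
  Byte digest[SHA256_DIGEST_SIZE];
  CSha256 p;
  unsigned i;
  Sha256Prepare();                              /* once, before any hashing */
  Sha256_Init(&p);
  Sha256_Update(&p, (const Byte *)"ab", 2);     /* Update can be called incrementally */
  Sha256_Update(&p, (const Byte *)"c", 1);
  Sha256_Final(&p, digest);
  for (i = 0; i < SHA256_DIGEST_SIZE; i++)
    printf("%02x", digest[i]);
  printf("\n");  /* expected: ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad */
  return 0;
}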

View file

@ -1,53 +1,71 @@
/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
: Igor Pavlov : Public domain */
2021-04-01 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Compiler.h"
#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif
#include "CpuArch.h"
// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
#if defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define USE_HW_SHA
#elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
#define USE_HW_SHA
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
#if !defined(__SHA__) || !defined(__SSSE3__)
#ifndef __SHA__
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
#if defined(_MSC_VER)
// SSSE3: for clang-cl:
#include <tmmintrin.h>
#define __SHA__
#endif
#endif
#endif
#elif defined(_MSC_VER)
#if (_MSC_VER >= 1900)
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 8) // fix that check
#define USE_HW_SHA
#ifndef __SHA__
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
// #pragma GCC target("sha,ssse3")
#endif
#endif
#elif defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1800) // fix that check
#define USE_HW_SHA
#endif
#elif defined(_MSC_VER)
#ifdef USE_MY_MM
#define USE_VER_MIN 1300
#else
#define Z7_USE_HW_SHA_STUB
#define USE_VER_MIN 1910
#endif
#if _MSC_VER >= USE_VER_MIN
#define USE_HW_SHA
#endif
#endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
// #define Z7_USE_HW_SHA_STUB // for debug
#endif
#ifdef USE_HW_SHA
// #pragma message("Sha256 HW")
// #include <wmmintrin.h>
// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
#include <immintrin.h>
#if defined (__clang__) && defined(_MSC_VER)
#if !defined(__SHA__)
#include <shaintrin.h>
#endif
#else
#include <emmintrin.h>
#if defined(_MSC_VER) && (_MSC_VER >= 1600)
// #include <intrin.h>
#endif
#ifdef USE_MY_MM
#include "My_mm.h"
#endif
#endif
@ -76,44 +94,60 @@ SHA:
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
#define K SHA256_K_ARRAY
#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
#define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
#define LOAD_SHUFFLE(m, k) \
m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
m = _mm_shuffle_epi8(m, mask); \
#define NNN(m0, m1, m2, m3)
#define SM1(g0, g1, g2, g3) \
SHA256_MSG1(g3, g0); \
#define SM1(m1, m2, m3, m0) \
SHA256_MSG1(m0, m1); \
#define SM2(g0, g1, g2, g3) \
tmp = _mm_alignr_epi8(g1, g0, 4); \
ADD_EPI32(g2, tmp); \
SHA25G_MSG2(g2, g1); \
// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)
#define NNN(g0, g1, g2, g3)
#define SM2(m2, m3, m0, m1) \
ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \
SHA256_MSG2(m0, m3); \
#define RND2(t0, t1) \
t0 = _mm_sha256rnds2_epu32(t0, t1, msg);
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \
#define RND2_0(m, k) \
msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
RND2(state0, state1); \
msg = _mm_shuffle_epi32(msg, 0x0E); \
OP0(m0, m1, m2, m3) \
#define RND2_1 \
RND2(state1, state0); \
OP1(m0, m1, m2, m3) \
// We use a scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2
#define R4(k, g0, g1, g2, g3, OP0, OP1) \
RND2_0(g0, k); \
OP0(g0, g1, g2, g3); \
RND2_1; \
OP1(g0, g1, g2, g3); \
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \
R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \
R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \
R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \
R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \
#define PREPARE_STATE \
tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
@ -123,16 +157,15 @@ const UInt32 SHA256_K_ARRAY[64];
state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
__m128i tmp, state0, state1;
__m128i tmp;
__m128i state0, state1;
if (numBlocks == 0)
return;
@ -159,13 +192,13 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );
ADD_EPI32(state0, state0_save)
ADD_EPI32(state1, state1_save)
ADD_EPI32(state0, state0_save);
ADD_EPI32(state1, state1_save);
data += 64;
}
@ -179,28 +212,19 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#endif // USE_HW_SHA
#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
#if defined(__ARM_FEATURE_SHA2) \
|| defined(__ARM_FEATURE_CRYPTO)
#define USE_HW_SHA
#else
#if defined(MY_CPU_ARM64) \
|| defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
|| defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(__GNUC__) && (__GNUC__ >= 6) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
|| !defined(Z7_CLANG_VERSION) \
|| defined(__ARM_NEON) && \
(Z7_CLANG_VERSION < 170000 || \
Z7_CLANG_VERSION > 170001)
#elif defined(MY_CPU_ARM_OR_ARM64)
#if defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define USE_HW_SHA
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define USE_HW_SHA
#endif
#elif defined(_MSC_VER)
#if _MSC_VER >= 1910
#define USE_HW_SHA
#endif
#endif
@ -208,144 +232,63 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
// #pragma message("=== Sha256 HW === ")
#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA2) && \
!defined(__ARM_FEATURE_CRYPTO)
#ifdef MY_CPU_ARM64
#if defined(__clang__)
#define ATTRIB_SHA __attribute__((__target__("crypto")))
#else
#define ATTRIB_SHA __attribute__((__target__("+crypto")))
#endif
#else
#if defined(__clang__) && (__clang_major__ >= 1)
#define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
#else
#define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
#endif
#endif
#else
// _MSC_VER
// for arm32
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif
#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#if defined(_MSC_VER) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
!defined(__ARM_FEATURE_CRYPTO)
// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
#define __ARM_FEATURE_CRYPTO 1
// #else
#define __ARM_FEATURE_SHA2 1
// #endif
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang
#if defined(__clang__)
#if defined(__ARM_ARCH) && __ARM_ARCH < 8
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
// #pragma message("#define __ARM_ARCH 8")
#undef __ARM_ARCH
#define __ARM_ARCH 8
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang
#include <arm_neon.h>
#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
defined(__ARM_FEATURE_CRYPTO) && \
defined(__ARM_FEATURE_SHA2)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_CRYPTO
#undef __ARM_FEATURE_SHA2
#undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif
#endif // Z7_MSC_VER_ORIGINAL
typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC
#ifdef MY_CPU_BE
#define MY_rev32_for_LE(x) x
#define MY_rev32_for_LE(x)
#else
#define MY_rev32_for_LE(x) vrev32q_u8(x)
#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif
#if 1 // 0 for debug
// for arm32: for some reason it works slower than direct code
/*
for arm32 it generates:
MSVC-2022, GCC-9:
vld1.32 {d18,d19}, [r10]
vst1.32 {d4,d5}, [r3]
vld1.8 {d20-d21}, [r4]
there is no align hint (like [r10:128]), so the instruction allows unaligned access
*/
#define LOAD_128_32(_p) vld1q_u32(_p)
#define LOAD_128_8(_p) vld1q_u8 (_p)
#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
#else
/*
for arm32:
MSVC-2022:
vldm r10,{d18,d19}
vstm r3,{d4,d5}
does it require strict alignment?
GCC-9:
vld1.64 {d30-d31}, [r0:64]
vldr d28, [r0, #16]
vldr d29, [r0, #24]
vst1.64 {d30-d31}, [r0:64]
vstr d28, [r0, #16]
vstr d29, [r0, #24]
there is an align hint [r0:64], so does it require 64-bit alignment?
*/
#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p))
#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p))
#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v)
#endif
#define LOAD_128(_p) (*(const v128 *)(const void *)(_p))
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v)
#define LOAD_SHUFFLE(m, k) \
m = vreinterpretq_u32_u8( \
MY_rev32_for_LE( \
LOAD_128_8(data + (k) * 16))); \
m = LOAD_128((data + (k) * 16)); \
MY_rev32_for_LE(m); \
// K array must be aligned to at least 16 bytes.
extern
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64];
#define K SHA256_K_ARRAY
#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src);
#define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
#define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0)
#define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1)
#define NNN(m0, m1, m2, m3)
#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0)
#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1)
#define NNN(g0, g1, g2, g3)
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \
#define R4(k, g0, g1, g2, g3, OP0, OP1) \
msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
tmp = state0; \
state0 = vsha256hq_u32( state0, state1, msg ); \
state1 = vsha256h2q_u32( state1, tmp, msg ); \
OP0(m0, m1, m2, m3); \
OP1(m0, m1, m2, m3); \
OP0(g0, g1, g2, g3); \
OP1(g0, g1, g2, g3); \
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
@ -355,19 +298,19 @@ const UInt32 SHA256_K_ARRAY[64];
R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA
ATTRIB_SHA
#endif
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
v128 state0, state1;
if (numBlocks == 0)
return;
state0 = LOAD_128_32(&state[0]);
state1 = LOAD_128_32(&state[4]);
state0 = LOAD_128(&state[0]);
state1 = LOAD_128(&state[4]);
do
{
@ -383,10 +326,10 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
LOAD_SHUFFLE (m2, 2)
LOAD_SHUFFLE (m3, 3)
R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 );
R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 );
R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN );
state0 = vaddq_u32(state0, state0_save);
state1 = vaddq_u32(state1, state1_save);
@ -395,8 +338,8 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
}
while (--numBlocks);
STORE_128_32(&state[0], state0);
STORE_128_32(&state[4], state1);
STORE_128(&state[0], state0);
STORE_128(&state[4], state1);
}
#endif // USE_HW_SHA
@ -404,19 +347,18 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
#endif // MY_CPU_ARM_OR_ARM64
#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
#ifndef USE_HW_SHA
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// We can compile this file with another C compiler,
// or we can compile the asm version.
// So we can generate real code instead of this stub function.
// #include "Sha256.h"
// #if defined(_MSC_VER)
void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);
#pragma message("Sha256 HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha256_UpdateBlocks (UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{
Sha256_UpdateBlocks(state, data, numBlocks);
/*
@ -427,25 +369,5 @@ void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_
return;
*/
}
#endif
#undef K
#undef RND2
#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB

C/Sha3.c
View file

@ -1,359 +0,0 @@
/* Sha3.c -- SHA-3 Hash
: Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "Precomp.h"
#include <string.h>
#include "Sha3.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#define U64C(x) UINT64_CONST(x)
static
MY_ALIGN(64)
const UInt64 SHA3_K_ARRAY[24] =
{
U64C(0x0000000000000001), U64C(0x0000000000008082),
U64C(0x800000000000808a), U64C(0x8000000080008000),
U64C(0x000000000000808b), U64C(0x0000000080000001),
U64C(0x8000000080008081), U64C(0x8000000000008009),
U64C(0x000000000000008a), U64C(0x0000000000000088),
U64C(0x0000000080008009), U64C(0x000000008000000a),
U64C(0x000000008000808b), U64C(0x800000000000008b),
U64C(0x8000000000008089), U64C(0x8000000000008003),
U64C(0x8000000000008002), U64C(0x8000000000000080),
U64C(0x000000000000800a), U64C(0x800000008000000a),
U64C(0x8000000080008081), U64C(0x8000000000008080),
U64C(0x0000000080000001), U64C(0x8000000080008008)
};
void Sha3_Init(CSha3 *p)
{
p->count = 0;
memset(p->state, 0, sizeof(p->state));
}
#define GET_state(i, a) UInt64 a = state[i];
#define SET_state(i, a) state[i] = a;
#define LS_5(M, i, a0,a1,a2,a3,a4) \
M ((i) * 5 , a0) \
M ((i) * 5 + 1, a1) \
M ((i) * 5 + 2, a2) \
M ((i) * 5 + 3, a3) \
M ((i) * 5 + 4, a4) \
#define LS_25(M) \
LS_5 (M, 0, a50, a51, a52, a53, a54) \
LS_5 (M, 1, a60, a61, a62, a63, a64) \
LS_5 (M, 2, a70, a71, a72, a73, a74) \
LS_5 (M, 3, a80, a81, a82, a83, a84) \
LS_5 (M, 4, a90, a91, a92, a93, a94) \
#define XOR_1(i, a0) \
a0 ^= GetUi64(data + (i) * 8); \
#define XOR_4(i, a0,a1,a2,a3) \
XOR_1 ((i) , a0); \
XOR_1 ((i) + 1, a1); \
XOR_1 ((i) + 2, a2); \
XOR_1 ((i) + 3, a3); \
#define D(d,b1,b2) \
d = b1 ^ Z7_ROTL64(b2, 1);
#define D5 \
D (d0, c4, c1) \
D (d1, c0, c2) \
D (d2, c1, c3) \
D (d3, c2, c4) \
D (d4, c3, c0) \
#define C0(c,a,d) \
c = a ^ d; \
#define C(c,a,d,k) \
c = a ^ d; \
c = Z7_ROTL64(c, k); \
#define E4(e1,e2,e3,e4) \
e1 = c1 ^ (~c2 & c3); \
e2 = c2 ^ (~c3 & c4); \
e3 = c3 ^ (~c4 & c0); \
e4 = c4 ^ (~c0 & c1); \
#define CK( v0,w0, \
v1,w1,k1, \
v2,w2,k2, \
v3,w3,k3, \
v4,w4,k4, e0,e1,e2,e3,e4, keccak_c) \
C0(c0,v0,w0) \
C (c1,v1,w1,k1) \
C (c2,v2,w2,k2) \
C (c3,v3,w3,k3) \
C (c4,v4,w4,k4) \
e0 = c0 ^ (~c1 & c2) ^ keccak_c; \
E4(e1,e2,e3,e4) \
#define CE( v0,w0,k0, \
v1,w1,k1, \
v2,w2,k2, \
v3,w3,k3, \
v4,w4,k4, e0,e1,e2,e3,e4) \
C (c0,v0,w0,k0) \
C (c1,v1,w1,k1) \
C (c2,v2,w2,k2) \
C (c3,v3,w3,k3) \
C (c4,v4,w4,k4) \
e0 = c0 ^ (~c1 & c2); \
E4(e1,e2,e3,e4) \
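For orientation: the column parities c0..c4 together with the D/D5 macros are Keccak's theta step; the rotation amount k in C(c,a,d,k) applies the rho step; and the e = c ^ (~c & c) pattern in E4 is chi, with CK additionally xoring in the iota round constant. A generic, unoptimized sketch of theta alone, reusing the UInt64 and Z7_ROTL64 names from the includes above and assuming the state is indexed as A[x + 5*y]:
static void KeccakTheta(UInt64 A[25])
{
  UInt64 C[5], D[5];
  unsigned x, y;
  for (x = 0; x < 5; x++)
    C[x] = A[x] ^ A[x + 5] ^ A[x + 10] ^ A[x + 15] ^ A[x + 20];
  for (x = 0; x < 5; x++)
    D[x] = C[(x + 4) % 5] ^ Z7_ROTL64(C[(x + 1) % 5], 1);  /* matches D(d, b1, b2) above */
  for (x = 0; x < 5; x++)
    for (y = 0; y < 25; y += 5)
      A[x + y] ^= D[x];
}
The macros above unroll exactly this, holding all 25 lanes in locals and fusing theta with rho, chi, and iota across two rounds per loop iteration.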
// numBlocks != 0
static
Z7_NO_INLINE
void Z7_FASTCALL Sha3_UpdateBlocks(UInt64 state[SHA3_NUM_STATE_WORDS],
const Byte *data, size_t numBlocks, size_t blockSize)
{
LS_25 (GET_state)
do
{
unsigned round;
XOR_4 ( 0, a50, a51, a52, a53)
XOR_4 ( 4, a54, a60, a61, a62)
XOR_1 ( 8, a63)
if (blockSize > 8 * 9) { XOR_4 ( 9, a64, a70, a71, a72) // sha3-384
if (blockSize > 8 * 13) { XOR_4 (13, a73, a74, a80, a81) // sha3-256
if (blockSize > 8 * 17) { XOR_1 (17, a82) // sha3-224
if (blockSize > 8 * 18) { XOR_1 (18, a83) // shake128
XOR_1 (19, a84)
XOR_1 (20, a90) }}}}
data += blockSize;
for (round = 0; round < 24; round += 2)
{
UInt64 c0, c1, c2, c3, c4;
UInt64 d0, d1, d2, d3, d4;
UInt64 e50, e51, e52, e53, e54;
UInt64 e60, e61, e62, e63, e64;
UInt64 e70, e71, e72, e73, e74;
UInt64 e80, e81, e82, e83, e84;
UInt64 e90, e91, e92, e93, e94;
c0 = a50^a60^a70^a80^a90;
c1 = a51^a61^a71^a81^a91;
c2 = a52^a62^a72^a82^a92;
c3 = a53^a63^a73^a83^a93;
c4 = a54^a64^a74^a84^a94;
D5
CK( a50, d0,
a61, d1, 44,
a72, d2, 43,
a83, d3, 21,
a94, d4, 14, e50, e51, e52, e53, e54, SHA3_K_ARRAY[round])
CE( a53, d3, 28,
a64, d4, 20,
a70, d0, 3,
a81, d1, 45,
a92, d2, 61, e60, e61, e62, e63, e64)
CE( a51, d1, 1,
a62, d2, 6,
a73, d3, 25,
a84, d4, 8,
a90, d0, 18, e70, e71, e72, e73, e74)
CE( a54, d4, 27,
a60, d0, 36,
a71, d1, 10,
a82, d2, 15,
a93, d3, 56, e80, e81, e82, e83, e84)
CE( a52, d2, 62,
a63, d3, 55,
a74, d4, 39,
a80, d0, 41,
a91, d1, 2, e90, e91, e92, e93, e94)
// ---------- ROUND + 1 ----------
c0 = e50^e60^e70^e80^e90;
c1 = e51^e61^e71^e81^e91;
c2 = e52^e62^e72^e82^e92;
c3 = e53^e63^e73^e83^e93;
c4 = e54^e64^e74^e84^e94;
D5
CK( e50, d0,
e61, d1, 44,
e72, d2, 43,
e83, d3, 21,
e94, d4, 14, a50, a51, a52, a53, a54, SHA3_K_ARRAY[(size_t)round + 1])
CE( e53, d3, 28,
e64, d4, 20,
e70, d0, 3,
e81, d1, 45,
e92, d2, 61, a60, a61, a62, a63, a64)
CE( e51, d1, 1,
e62, d2, 6,
e73, d3, 25,
e84, d4, 8,
e90, d0, 18, a70, a71, a72, a73, a74)
CE (e54, d4, 27,
e60, d0, 36,
e71, d1, 10,
e82, d2, 15,
e93, d3, 56, a80, a81, a82, a83, a84)
CE (e52, d2, 62,
e63, d3, 55,
e74, d4, 39,
e80, d0, 41,
e91, d1, 2, a90, a91, a92, a93, a94)
}
}
while (--numBlocks);
LS_25 (SET_state)
}
#define Sha3_UpdateBlock(p) \
Sha3_UpdateBlocks(p->state, p->buffer, 1, p->blockSize)
void Sha3_Update(CSha3 *p, const Byte *data, size_t size)
{
/*
for (;;)
{
if (size == 0)
return;
unsigned cur = p->blockSize - p->count;
if (cur > size)
cur = (unsigned)size;
size -= cur;
unsigned pos = p->count;
p->count = pos + cur;
while (pos & 7)
{
if (cur == 0)
return;
Byte *pb = &(((Byte *)p->state)[pos]);
*pb = (Byte)(*pb ^ *data++);
cur--;
pos++;
}
if (cur >= 8)
{
do
{
*(UInt64 *)(void *)&(((Byte *)p->state)[pos]) ^= GetUi64(data);
data += 8;
pos += 8;
cur -= 8;
}
while (cur >= 8);
}
if (pos != p->blockSize)
{
if (cur)
{
Byte *pb = &(((Byte *)p->state)[pos]);
do
{
*pb = (Byte)(*pb ^ *data++);
pb++;
}
while (--cur);
}
return;
}
Sha3_UpdateBlock(p->state);
p->count = 0;
}
*/
if (size == 0)
return;
{
const unsigned pos = p->count;
const unsigned num = p->blockSize - pos;
if (num > size)
{
p->count = pos + (unsigned)size;
memcpy(p->buffer + pos, data, size);
return;
}
if (pos != 0)
{
size -= num;
memcpy(p->buffer + pos, data, num);
data += num;
Sha3_UpdateBlock(p);
}
}
if (size >= p->blockSize)
{
const size_t numBlocks = size / p->blockSize;
const Byte *dataOld = data;
data += numBlocks * p->blockSize;
size = (size_t)(dataOld + size - data);
Sha3_UpdateBlocks(p->state, dataOld, numBlocks, p->blockSize);
}
p->count = (unsigned)size;
if (size)
memcpy(p->buffer, data, size);
}
// we support only (digestSize % 4 == 0) cases
void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake)
{
memset(p->buffer + p->count, 0, p->blockSize - p->count);
// we write bit markers from low to high in the current byte:
// - if sha-3 : 2 bits : 0,1
// - if shake : 4 bits : 1111
// then we write a 1 bit to the same byte.
// And we write a 1 bit to the highest bit of the last byte of the block.
p->buffer[p->count] = (Byte)(shake ? 0x1f : 0x06);
// we need the xor operation (^= 0x80) here because the 0x80 bit must go
// into the same byte as (0x1f : 0x06) when (p->count == p->blockSize - 1) !!!
p->buffer[p->blockSize - 1] ^= 0x80;
/*
((Byte *)p->state)[p->count] ^= (Byte)(shake ? 0x1f : 0x06);
((Byte *)p->state)[p->blockSize - 1] ^= 0x80;
*/
Sha3_UpdateBlock(p);
#if 1 && defined(MY_CPU_LE)
memcpy(digest, p->state, digestSize);
#else
{
const unsigned numWords = digestSize >> 3;
unsigned i;
for (i = 0; i < numWords; i++)
{
const UInt64 v = p->state[i];
SetUi64(digest, v)
digest += 8;
}
if (digestSize & 4) // for SHA3-224
{
const UInt32 v = (UInt32)p->state[numWords];
SetUi32(digest, v)
}
}
#endif
Sha3_Init(p);
}
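Worked example of those marker bytes (my illustration): for SHA-3 the final block ends ... data, 0x06, 0x00 ..., 0x80; for SHAKE it ends ... data, 0x1f, 0x00 ..., 0x80. In the edge case count == blockSize - 1 the two markers share one byte, giving 0x06 ^ 0x80 = 0x86 or 0x1f ^ 0x80 = 0x9f, which is why the second write is an xor rather than a plain store.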
#undef GET_state
#undef SET_state
#undef LS_5
#undef LS_25
#undef XOR_1
#undef XOR_4
#undef D
#undef D5
#undef C0
#undef C
#undef E4
#undef CK
#undef CE

View file

@ -1,36 +0,0 @@
/* Sha3.h -- SHA-3 Hash
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_SHA3_H
#define ZIP7_INC_SHA3_H
#include "7zTypes.h"
EXTERN_C_BEGIN
#define SHA3_NUM_STATE_WORDS 25
#define SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(digestSize) \
(SHA3_NUM_STATE_WORDS * 8 - (digestSize) * 2)
typedef struct
{
UInt32 count; // < blockSize
UInt32 blockSize; // <= SHA3_NUM_STATE_WORDS * 8
UInt64 _pad1[3];
// we want 32-byte alignment here
UInt64 state[SHA3_NUM_STATE_WORDS];
UInt64 _pad2[3];
// we want 64-byte alignment here
Byte buffer[SHA3_NUM_STATE_WORDS * 8]; // last bytes will be unused with predefined blockSize values
} CSha3;
#define Sha3_SET_blockSize(p, blockSize) { (p)->blockSize = (blockSize); }
void Sha3_Init(CSha3 *p);
void Sha3_Update(CSha3 *p, const Byte *data, size_t size);
void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake);
EXTERN_C_END
#endif
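A minimal SHA3-256 usage sketch for this header (my illustration; the caller sets the sponge rate before hashing, and for digestSize = 32 the macro gives 25*8 - 2*32 = 136 bytes):
#include <stdio.h>
#include "Sha3.h"
int main(void)
{
  Byte digest[32];
  CSha3 p;
  unsigned i;
  Sha3_SET_blockSize(&p, SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(32))  /* rate = 136 bytes */
  Sha3_Init(&p);
  Sha3_Update(&p, (const Byte *)"abc", 3);
  Sha3_Final(&p, digest, 32, 0);  /* shake = 0: SHA-3 domain bits */
  for (i = 0; i < 32; i++)
    printf("%02x", digest[i]);
  printf("\n");  /* expected: 3a985da74fe225b2045c172d6bd390bd855f086e3e9d525b46bfe24511431532 */
  return 0;
}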

View file

@ -1,711 +0,0 @@
/* Sha512.c -- SHA-512 Hash
: Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "Precomp.h"
#include <string.h>
#include "Sha512.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#ifdef MY_CPU_X86_OR_AMD64
#if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) \
|| defined(_MSC_VER) && (_MSC_VER >= 1940)
#define Z7_COMPILER_SHA512_SUPPORTED
#endif
#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
#if defined(__ARM_FEATURE_SHA512)
#define Z7_COMPILER_SHA512_SUPPORTED
#else
#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
|| defined(__GNUC__) && (__GNUC__ >= 9) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
#define Z7_COMPILER_SHA512_SUPPORTED
#endif
#endif
#endif
void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef Z7_COMPILER_SHA512_SUPPORTED
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS = Sha512_UpdateBlocks;
static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS_HW;
#define SHA512_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
#else
#define SHA512_UPDATE_BLOCKS(p) Sha512_UpdateBlocks
#endif
BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo)
{
SHA512_FUNC_UPDATE_BLOCKS func = Sha512_UpdateBlocks;
#ifdef Z7_COMPILER_SHA512_SUPPORTED
if (algo != SHA512_ALGO_SW)
{
if (algo == SHA512_ALGO_DEFAULT)
func = g_SHA512_FUNC_UPDATE_BLOCKS;
else
{
if (algo != SHA512_ALGO_HW)
return False;
func = g_SHA512_FUNC_UPDATE_BLOCKS_HW;
if (!func)
return False;
}
}
#else
if (algo > 1)
return False;
#endif
p->v.vars.func_UpdateBlocks = func;
return True;
}
/* define it for speed optimization */
#if 0 // 1 for size optimization
#define STEP_PRE 1
#define STEP_MAIN 1
#else
#define STEP_PRE 2
#define STEP_MAIN 4
// #define Z7_SHA512_UNROLL
#endif
#undef Z7_SHA512_BIG_W
#if STEP_MAIN != 16
#define Z7_SHA512_BIG_W
#endif
#define U64C(x) UINT64_CONST(x)
static MY_ALIGN(64) const UInt64 SHA512_INIT_ARRAYS[4][8] = {
{ U64C(0x8c3d37c819544da2), U64C(0x73e1996689dcd4d6), U64C(0x1dfab7ae32ff9c82), U64C(0x679dd514582f9fcf),
U64C(0x0f6d2b697bd44da8), U64C(0x77e36f7304c48942), U64C(0x3f9d85a86a1d36c8), U64C(0x1112e6ad91d692a1)
},
{ U64C(0x22312194fc2bf72c), U64C(0x9f555fa3c84c64c2), U64C(0x2393b86b6f53b151), U64C(0x963877195940eabd),
U64C(0x96283ee2a88effe3), U64C(0xbe5e1e2553863992), U64C(0x2b0199fc2c85b8aa), U64C(0x0eb72ddc81c52ca2)
},
{ U64C(0xcbbb9d5dc1059ed8), U64C(0x629a292a367cd507), U64C(0x9159015a3070dd17), U64C(0x152fecd8f70e5939),
U64C(0x67332667ffc00b31), U64C(0x8eb44a8768581511), U64C(0xdb0c2e0d64f98fa7), U64C(0x47b5481dbefa4fa4)
},
{ U64C(0x6a09e667f3bcc908), U64C(0xbb67ae8584caa73b), U64C(0x3c6ef372fe94f82b), U64C(0xa54ff53a5f1d36f1),
U64C(0x510e527fade682d1), U64C(0x9b05688c2b3e6c1f), U64C(0x1f83d9abfb41bd6b), U64C(0x5be0cd19137e2179)
}};
void Sha512_InitState(CSha512 *p, unsigned digestSize)
{
p->v.vars.count = 0;
memcpy(p->state, SHA512_INIT_ARRAYS[(size_t)(digestSize >> 4) - 1], sizeof(p->state));
}
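/* Worked index arithmetic for the memcpy above (not in the source):
     digestSize 28 (SHA-512/224): 28 >> 4 = 1 -> SHA512_INIT_ARRAYS[0]
     digestSize 32 (SHA-512/256): 32 >> 4 = 2 -> SHA512_INIT_ARRAYS[1]
     digestSize 48 (SHA-384)    : 48 >> 4 = 3 -> SHA512_INIT_ARRAYS[2]
     digestSize 64 (SHA-512)    : 64 >> 4 = 4 -> SHA512_INIT_ARRAYS[3]
   so the four supported digest sizes map exactly onto the four IV rows. */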
void Sha512_Init(CSha512 *p, unsigned digestSize)
{
p->v.vars.func_UpdateBlocks =
#ifdef Z7_COMPILER_SHA512_SUPPORTED
g_SHA512_FUNC_UPDATE_BLOCKS;
#else
NULL;
#endif
Sha512_InitState(p, digestSize);
}
#define S0(x) (Z7_ROTR64(x,28) ^ Z7_ROTR64(x,34) ^ Z7_ROTR64(x,39))
#define S1(x) (Z7_ROTR64(x,14) ^ Z7_ROTR64(x,18) ^ Z7_ROTR64(x,41))
#define s0(x) (Z7_ROTR64(x, 1) ^ Z7_ROTR64(x, 8) ^ (x >> 7))
#define s1(x) (Z7_ROTR64(x,19) ^ Z7_ROTR64(x,61) ^ (x >> 6))
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
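/* For reference (standard boolean identities, not from this file), these are
   the operation-saving forms of the FIPS 180-4 functions:
     Ch(x,y,z)  = (x & y) ^ (~x & z)          = z ^ (x & (y ^ z))
     Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z) = (x & y) | (z & (x | y))
   each rewrite saves one operation per call. */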
#define W_PRE(i) (W[(i) + (size_t)(j)] = GetBe64(data + ((size_t)(j) + i) * 8))
#define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15))
#ifdef Z7_SHA512_BIG_W
// we use +i instead of +(i) to change the operand order and avoid a CLANG signed/unsigned warning.
#define w(j, i) W[(size_t)(j) + i]
#define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i))
#else
#if STEP_MAIN == 16
#define w(j, i) W[(i) & 15]
#else
#define w(j, i) W[((size_t)(j) + (i)) & 15]
#endif
#define blk2(j, i) (w(j, i) += blk2_main(j, i))
#endif
#define W_MAIN(i) blk2(j, i)
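/* For reference (standard FIPS 180-4 math, not from this file): blk2 computes
   the SHA-512 message schedule, in LaTeX notation
     W_t = \sigma_1(W_{t-2}) + W_{t-7} + \sigma_0(W_{t-15}) + W_{t-16}  (mod 2^64)
   for 16 <= t < 80, where \sigma_0 / \sigma_1 are s0 / s1 above.
   With Z7_SHA512_BIG_W all 80 words are kept in W[]; otherwise w() folds the
   index into a rolling 16-word window. */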
#define T1(wx, i) \
tmp = h + S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
h = g; \
g = f; \
f = e; \
e = d + tmp; \
tmp += S0(a) + Maj(a, b, c); \
d = c; \
c = b; \
b = a; \
a = tmp;
#define R1_PRE(i) T1( W_PRE, i)
#define R1_MAIN(i) T1( W_MAIN, i)
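/* For reference (standard FIPS 180-4 math, not from this file): T1 is one
   SHA-512 round with the register rotation written out, in LaTeX notation
     T_1 = h + \Sigma_1(e) + Ch(e,f,g) + K_t + W_t
     T_2 = \Sigma_0(a) + Maj(a,b,c)
     (a,b,c,d,e,f,g,h) \leftarrow (T_1 + T_2, a, b, c, d + T_1, e, f, g).
   T4 and T8 below perform the same rounds, but rotate the registers by
   permuting macro arguments instead of moving values. */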
#if (!defined(Z7_SHA512_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4)
#define R2_MAIN(i) \
R1_MAIN(i) \
R1_MAIN(i + 1)
#endif
#if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8
#define T4( a,b,c,d,e,f,g,h, wx, i) \
h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
tmp = h; \
h += d; \
d = tmp + S0(a) + Maj(a, b, c);
#define R4( wx, i) \
T4 ( a,b,c,d,e,f,g,h, wx, (i )); \
T4 ( d,a,b,c,h,e,f,g, wx, (i+1)); \
T4 ( c,d,a,b,g,h,e,f, wx, (i+2)); \
T4 ( b,c,d,a,f,g,h,e, wx, (i+3));
#define R4_PRE(i) R4( W_PRE, i)
#define R4_MAIN(i) R4( W_MAIN, i)
#define T8( a,b,c,d,e,f,g,h, wx, i) \
h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
d += h; \
h += S0(a) + Maj(a, b, c);
#define R8( wx, i) \
T8 ( a,b,c,d,e,f,g,h, wx, i ); \
T8 ( h,a,b,c,d,e,f,g, wx, i+1); \
T8 ( g,h,a,b,c,d,e,f, wx, i+2); \
T8 ( f,g,h,a,b,c,d,e, wx, i+3); \
T8 ( e,f,g,h,a,b,c,d, wx, i+4); \
T8 ( d,e,f,g,h,a,b,c, wx, i+5); \
T8 ( c,d,e,f,g,h,a,b, wx, i+6); \
T8 ( b,c,d,e,f,g,h,a, wx, i+7);
#define R8_PRE(i) R8( W_PRE, i)
#define R8_MAIN(i) R8( W_MAIN, i)
#endif
extern
MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80];
MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80] = {
U64C(0x428a2f98d728ae22), U64C(0x7137449123ef65cd), U64C(0xb5c0fbcfec4d3b2f), U64C(0xe9b5dba58189dbbc),
U64C(0x3956c25bf348b538), U64C(0x59f111f1b605d019), U64C(0x923f82a4af194f9b), U64C(0xab1c5ed5da6d8118),
U64C(0xd807aa98a3030242), U64C(0x12835b0145706fbe), U64C(0x243185be4ee4b28c), U64C(0x550c7dc3d5ffb4e2),
U64C(0x72be5d74f27b896f), U64C(0x80deb1fe3b1696b1), U64C(0x9bdc06a725c71235), U64C(0xc19bf174cf692694),
U64C(0xe49b69c19ef14ad2), U64C(0xefbe4786384f25e3), U64C(0x0fc19dc68b8cd5b5), U64C(0x240ca1cc77ac9c65),
U64C(0x2de92c6f592b0275), U64C(0x4a7484aa6ea6e483), U64C(0x5cb0a9dcbd41fbd4), U64C(0x76f988da831153b5),
U64C(0x983e5152ee66dfab), U64C(0xa831c66d2db43210), U64C(0xb00327c898fb213f), U64C(0xbf597fc7beef0ee4),
U64C(0xc6e00bf33da88fc2), U64C(0xd5a79147930aa725), U64C(0x06ca6351e003826f), U64C(0x142929670a0e6e70),
U64C(0x27b70a8546d22ffc), U64C(0x2e1b21385c26c926), U64C(0x4d2c6dfc5ac42aed), U64C(0x53380d139d95b3df),
U64C(0x650a73548baf63de), U64C(0x766a0abb3c77b2a8), U64C(0x81c2c92e47edaee6), U64C(0x92722c851482353b),
U64C(0xa2bfe8a14cf10364), U64C(0xa81a664bbc423001), U64C(0xc24b8b70d0f89791), U64C(0xc76c51a30654be30),
U64C(0xd192e819d6ef5218), U64C(0xd69906245565a910), U64C(0xf40e35855771202a), U64C(0x106aa07032bbd1b8),
U64C(0x19a4c116b8d2d0c8), U64C(0x1e376c085141ab53), U64C(0x2748774cdf8eeb99), U64C(0x34b0bcb5e19b48a8),
U64C(0x391c0cb3c5c95a63), U64C(0x4ed8aa4ae3418acb), U64C(0x5b9cca4f7763e373), U64C(0x682e6ff3d6b2b8a3),
U64C(0x748f82ee5defb2fc), U64C(0x78a5636f43172f60), U64C(0x84c87814a1f0ab72), U64C(0x8cc702081a6439ec),
U64C(0x90befffa23631e28), U64C(0xa4506cebde82bde9), U64C(0xbef9a3f7b2c67915), U64C(0xc67178f2e372532b),
U64C(0xca273eceea26619c), U64C(0xd186b8c721c0c207), U64C(0xeada7dd6cde0eb1e), U64C(0xf57d4f7fee6ed178),
U64C(0x06f067aa72176fba), U64C(0x0a637dc5a2c898a6), U64C(0x113f9804bef90dae), U64C(0x1b710b35131c471b),
U64C(0x28db77f523047d84), U64C(0x32caab7b40c72493), U64C(0x3c9ebe0a15c9bebc), U64C(0x431d67c49c100d4c),
U64C(0x4cc5d4becb3e42b6), U64C(0x597f299cfc657e2a), U64C(0x5fcb6fab3ad6faec), U64C(0x6c44198c4a475817)
};
#define K SHA512_K_ARRAY
Z7_NO_INLINE
void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks)
{
UInt64 W
#ifdef Z7_SHA512_BIG_W
[80];
#else
[16];
#endif
unsigned j;
UInt64 a,b,c,d,e,f,g,h;
#if !defined(Z7_SHA512_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
UInt64 tmp;
#endif
if (numBlocks == 0) return;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
do
{
for (j = 0; j < 16; j += STEP_PRE)
{
#if STEP_PRE > 4
#if STEP_PRE < 8
R4_PRE(0);
#else
R8_PRE(0);
#if STEP_PRE == 16
R8_PRE(8);
#endif
#endif
#else
R1_PRE(0)
#if STEP_PRE >= 2
R1_PRE(1)
#if STEP_PRE >= 4
R1_PRE(2)
R1_PRE(3)
#endif
#endif
#endif
}
for (j = 16; j < 80; j += STEP_MAIN)
{
#if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8
#if STEP_MAIN < 8
R4_MAIN(0)
#else
R8_MAIN(0)
#if STEP_MAIN == 16
R8_MAIN(8)
#endif
#endif
#else
R1_MAIN(0)
#if STEP_MAIN >= 2
R1_MAIN(1)
#if STEP_MAIN >= 4
R2_MAIN(2)
#if STEP_MAIN >= 8
R2_MAIN(4)
R2_MAIN(6)
#if STEP_MAIN >= 16
R2_MAIN(8)
R2_MAIN(10)
R2_MAIN(12)
R2_MAIN(14)
#endif
#endif
#endif
#endif
#endif
}
a += state[0]; state[0] = a;
b += state[1]; state[1] = b;
c += state[2]; state[2] = c;
d += state[3]; state[3] = d;
e += state[4]; state[4] = e;
f += state[5]; state[5] = f;
g += state[6]; state[6] = g;
h += state[7]; state[7] = h;
data += SHA512_BLOCK_SIZE;
}
while (--numBlocks);
}
#define Sha512_UpdateBlock(p) SHA512_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
void Sha512_Update(CSha512 *p, const Byte *data, size_t size)
{
if (size == 0)
return;
{
const unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1);
const unsigned num = SHA512_BLOCK_SIZE - pos;
p->v.vars.count += size;
if (num > size)
{
memcpy(p->buffer + pos, data, size);
return;
}
if (pos != 0)
{
size -= num;
memcpy(p->buffer + pos, data, num);
data += num;
Sha512_UpdateBlock(p);
}
}
{
const size_t numBlocks = size >> 7;
// if (numBlocks)
SHA512_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= SHA512_BLOCK_SIZE - 1;
if (size == 0)
return;
data += (numBlocks << 7);
memcpy(p->buffer, data, size);
}
}
void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize)
{
unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1);
p->buffer[pos++] = 0x80;
if (pos > (SHA512_BLOCK_SIZE - 8 * 2))
{
while (pos != SHA512_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, SHA512_BLOCK_SIZE - pos);
Sha512_UpdateBlock(p);
pos = 0;
}
memset(&p->buffer[pos], 0, (SHA512_BLOCK_SIZE - 8 * 2) - pos);
{
const UInt64 numBits = p->v.vars.count << 3;
SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 2, 0) // = (p->v.vars.count >> (64 - 3)); (high 64-bits)
SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 1, numBits)
}
Sha512_UpdateBlock(p);
#if 1 && defined(MY_CPU_BE)
memcpy(digest, p->state, digestSize);
#else
{
const unsigned numWords = digestSize >> 3;
unsigned i;
for (i = 0; i < numWords; i++)
{
const UInt64 v = p->state[i];
SetBe64(digest, v)
digest += 8;
}
if (digestSize & 4) // digestSize == SHA512_224_DIGEST_SIZE
{
const UInt32 v = (UInt32)((p->state[numWords]) >> 32);
SetBe32(digest, v)
}
}
#endif
Sha512_InitState(p, digestSize);
}
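/* Putting Init / Update / Final together: a minimal one-shot sketch
   (hypothetical caller, not from the source; it assumes Sha512Prepare()
   was called once at program start, as Sha512.h recommends). */
static void Hash_Sha512_Example(const Byte *data, size_t size,
    Byte digest[SHA512_DIGEST_SIZE])
{
  CSha512 ctx;
  Sha512_Init(&ctx, SHA512_DIGEST_SIZE); // 64-byte digest
  Sha512_Update(&ctx, data, size);       // may be called repeatedly to stream
  Sha512_Final(&ctx, digest, SHA512_DIGEST_SIZE);
}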
// #define Z7_SHA512_PROBE_DEBUG // for debug
#if defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED)
#if defined(Z7_SHA512_PROBE_DEBUG) \
|| defined(_WIN32) && defined(MY_CPU_ARM64)
#ifndef Z7_SHA512_USE_PROBE
#define Z7_SHA512_USE_PROBE
#endif
#endif
#ifdef Z7_SHA512_USE_PROBE
#ifdef Z7_SHA512_PROBE_DEBUG
#include <stdio.h>
#define PRF(x) x
#else
#define PRF(x)
#endif
#if 0 || !defined(_MSC_VER) // 1 || : for debug LONGJMP mode
// MINGW doesn't support __try. So we use signal() / longjmp().
// Note: signal() / longjmp() is probably not thread-safe,
// so we must call Sha512Prepare() from the main thread at program start.
#ifndef Z7_SHA512_USE_LONGJMP
#define Z7_SHA512_USE_LONGJMP
#endif
#endif
#ifdef Z7_SHA512_USE_LONGJMP
#include <signal.h>
#include <setjmp.h>
static jmp_buf g_Sha512_jmp_buf;
// static int g_Sha512_Unsupported;
#if defined(__GNUC__) && (__GNUC__ >= 8) \
|| defined(__clang__) && (__clang_major__ >= 3)
__attribute__((noreturn))
#endif
static void Z7_CDECL Sha512_signal_Handler(int v)
{
PRF(printf("======== Sha512_signal_Handler = %x\n", (unsigned)v);)
// g_Sha512_Unsupported = 1;
longjmp(g_Sha512_jmp_buf, 1);
}
#endif // Z7_SHA512_USE_LONGJMP
#if defined(_WIN32)
#include "7zWindows.h"
#endif
#if defined(MY_CPU_ARM64)
// #define Z7_SHA512_USE_SIMPLIFIED_PROBE // for debug
#endif
#ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE
#include <arm_neon.h>
#if defined(__clang__)
__attribute__((__target__("sha3")))
#elif !defined(_MSC_VER)
__attribute__((__target__("arch=armv8.2-a+sha3")))
#endif
#endif
static BoolInt CPU_IsSupported_SHA512_Probe(void)
{
PRF(printf("\n== CPU_IsSupported_SHA512_Probe\n");)
#if defined(_WIN32) && defined(MY_CPU_ARM64)
// there is still no SHA512 flag for IsProcessorFeaturePresent().
if (!CPU_IsSupported_CRYPTO())
return False;
PRF(printf("==== Registry check\n");)
{
// we can't read the ID_AA64ISAR0_EL1 register from an application,
// but that register is mapped to the "CP 4030" registry value.
HKEY key = NULL;
LONG res = RegOpenKeyEx(HKEY_LOCAL_MACHINE,
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
0, KEY_READ, &key);
if (res != ERROR_SUCCESS)
return False;
{
DWORD type = 0;
DWORD count = sizeof(UInt64);
UInt64 val = 0;
res = RegQueryValueEx(key, TEXT("CP 4030"), NULL,
&type, (LPBYTE)&val, &count);
RegCloseKey(key);
if (res != ERROR_SUCCESS
|| type != REG_QWORD
|| count != sizeof(UInt64)
|| ((unsigned)(val >> 12) & 0xf) != 2)
return False;
// we parse SHA2 field of ID_AA64ISAR0_EL1 register:
// 0 : No SHA2 instructions implemented
// 1 : SHA256 implemented
// 2 : SHA256 and SHA512 implemented
}
}
#endif // defined(_WIN32) && defined(MY_CPU_ARM64)
#if 1 // 0 for debug to disable SHA512 PROBE code
/*
----- SHA512 PROBE -----
We suppose that reading the "CP 4030" registry value is enough.
But we use additional SHA512 PROBE code, because here
we can catch an exception, while we don't catch exceptions
when we call the Sha512 functions from the main code.
NOTE: the arm64 PROBE code doesn't work if we call it via Wine on linux-arm64:
the program just stops.
The x64 version of the PROBE code also doesn't work if we run it via the Intel SDE
emulator without SHA512 support (the -skl switch).
The program stops, and we get this message from SDE:
TID 0 SDE-ERROR: Executed instruction not valid for specified chip (SKYLAKE): vsha512msg1
But we still want to catch that exception instead of letting the process stop.
Does this PROBE code work in native Windows-arm64 (with/without sha512 hw instructions)?
Are there any ways to fix the problems with arm64-wine and x64-SDE cases?
*/
PRF(printf("==== CPU_IsSupported_SHA512 PROBE\n");)
{
BoolInt isSupported = False;
#ifdef Z7_SHA512_USE_LONGJMP
void (Z7_CDECL *signal_prev)(int);
/*
if (g_Sha512_Unsupported)
{
PRF(printf("==== g_Sha512_Unsupported\n");)
return False;
}
*/
printf("====== signal(SIGILL)\n");
signal_prev = signal(SIGILL, Sha512_signal_Handler);
if (signal_prev == SIG_ERR)
{
PRF(printf("====== signal fail\n");)
return False;
}
// PRF(printf("==== signal_prev = %p\n", (void *)signal_prev);)
// docs: Before the specified function is executed,
// the value of func is set to SIG_DFL.
// So we can exit if (setjmp(g_Sha512_jmp_buf) != 0).
PRF(printf("====== setjmp\n");)
if (!setjmp(g_Sha512_jmp_buf))
#else // Z7_SHA512_USE_LONGJMP
#ifdef _MSC_VER
#ifdef __clang_major__
#pragma GCC diagnostic ignored "-Wlanguage-extension-token"
#endif
__try
#endif
#endif // Z7_SHA512_USE_LONGJMP
{
#if defined(Z7_COMPILER_SHA512_SUPPORTED)
#ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE
// simplified sha512 check for arm64:
const uint64x2_t a = vdupq_n_u64(1);
const uint64x2_t b = vsha512hq_u64(a, a, a);
PRF(printf("======== vsha512hq_u64 probe\n");)
if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002)
#else
MY_ALIGN(16)
UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS];
memset(temp, 0x5a, sizeof(temp));
PRF(printf("======== Sha512_UpdateBlocks_HW\n");)
Sha512_UpdateBlocks_HW(temp,
(const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1);
// PRF(printf("======== t = %x\n", (UInt32)temp[0]);)
if ((UInt32)temp[0] == 0xa33cfdf7)
#endif
{
PRF(printf("======== PROBE SHA512: SHA512 is supported\n");)
isSupported = True;
}
#else // Z7_COMPILER_SHA512_SUPPORTED
// for debug : we generate a bad instruction or raise an exception.
// __except() doesn't catch raise() calls.
#ifdef Z7_SHA512_USE_LONGJMP
PRF(printf("====== raise(SIGILL)\n");)
raise(SIGILL);
#else
#if defined(_MSC_VER) && defined(MY_CPU_X86)
__asm ud2
#endif
#endif // Z7_SHA512_USE_LONGJMP
#endif // Z7_COMPILER_SHA512_SUPPORTED
}
#ifdef Z7_SHA512_USE_LONGJMP
PRF(printf("====== restore signal SIGILL\n");)
signal(SIGILL, signal_prev);
#elif _MSC_VER
__except (EXCEPTION_EXECUTE_HANDLER)
{
PRF(printf("==== CPU_IsSupported_SHA512 __except(EXCEPTION_EXECUTE_HANDLER)\n");)
}
#endif
PRF(printf("== return (sha512 supported) = %d\n", isSupported);)
return isSupported;
}
#else
// without SHA512 PROBE code
return True;
#endif
}
#endif // Z7_SHA512_USE_PROBE
#endif // defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED)
void Sha512Prepare(void)
{
#ifdef Z7_COMPILER_SHA512_SUPPORTED
SHA512_FUNC_UPDATE_BLOCKS f, f_hw;
f = Sha512_UpdateBlocks;
f_hw = NULL;
#ifdef Z7_SHA512_USE_PROBE
if (CPU_IsSupported_SHA512_Probe())
#elif defined(MY_CPU_X86_OR_AMD64)
if (CPU_IsSupported_SHA512() && CPU_IsSupported_AVX2())
#else
if (CPU_IsSupported_SHA512())
#endif
{
// printf("\n========== HW SHA512 ======== \n");
f = f_hw = Sha512_UpdateBlocks_HW;
}
g_SHA512_FUNC_UPDATE_BLOCKS = f;
g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw;
#elif defined(Z7_SHA512_PROBE_DEBUG)
CPU_IsSupported_SHA512_Probe(); // for debug
#endif
}
#undef K
#undef S0
#undef S1
#undef s0
#undef s1
#undef Ch
#undef Maj
#undef W_MAIN
#undef W_PRE
#undef w
#undef blk2_main
#undef blk2
#undef T1
#undef T4
#undef T8
#undef R1_PRE
#undef R1_MAIN
#undef R2_MAIN
#undef R4
#undef R4_PRE
#undef R4_MAIN
#undef R8
#undef R8_PRE
#undef R8_MAIN
#undef STEP_PRE
#undef STEP_MAIN
#undef Z7_SHA512_BIG_W
#undef Z7_SHA512_UNROLL
#undef Z7_COMPILER_SHA512_SUPPORTED

C/Sha512.h

@ -1,86 +0,0 @@
/* Sha512.h -- SHA-512 Hash
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_SHA512_H
#define ZIP7_INC_SHA512_H
#include "7zTypes.h"
EXTERN_C_BEGIN
#define SHA512_NUM_BLOCK_WORDS 16
#define SHA512_NUM_DIGEST_WORDS 8
#define SHA512_BLOCK_SIZE (SHA512_NUM_BLOCK_WORDS * 8)
#define SHA512_DIGEST_SIZE (SHA512_NUM_DIGEST_WORDS * 8)
#define SHA512_224_DIGEST_SIZE (224 / 8)
#define SHA512_256_DIGEST_SIZE (256 / 8)
#define SHA512_384_DIGEST_SIZE (384 / 8)
typedef void (Z7_FASTCALL *SHA512_FUNC_UPDATE_BLOCKS)(UInt64 state[8], const Byte *data, size_t numBlocks);
/*
if (the system supports different SHA512 code implementations)
{
(CSha512::func_UpdateBlocks) will be used
(CSha512::func_UpdateBlocks) can be set by
Sha512_Init() - to default (fastest)
Sha512_SetFunction() - to any algo
}
else
{
(CSha512::func_UpdateBlocks) is ignored.
}
*/
typedef struct
{
union
{
struct
{
SHA512_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
UInt64 count;
} vars;
UInt64 _pad_64bit[8];
void *_pad_align_ptr[2];
} v;
UInt64 state[SHA512_NUM_DIGEST_WORDS];
Byte buffer[SHA512_BLOCK_SIZE];
} CSha512;
#define SHA512_ALGO_DEFAULT 0
#define SHA512_ALGO_SW 1
#define SHA512_ALGO_HW 2
/*
Sha512_SetFunction()
return:
0 - (algo) value is not supported, and func_UpdateBlocks was not changed
1 - func_UpdateBlocks was set according to the (algo) value.
*/
BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo);
// we support only these (digestSize) values: 224/8, 256/8, 384/8, 512/8
void Sha512_InitState(CSha512 *p, unsigned digestSize);
void Sha512_Init(CSha512 *p, unsigned digestSize);
void Sha512_Update(CSha512 *p, const Byte *data, size_t size);
void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize);
// void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks);
/*
call Sha512Prepare() once at program start.
It prepares all supported implementations, and detects the fastest implementation.
*/
void Sha512Prepare(void);
EXTERN_C_END
#endif

C/Sha512Opt.c

@ -1,395 +0,0 @@
/* Sha512Opt.c -- SHA-512 optimized code for SHA-512 hardware instructions
: Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"
// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) // fix it
#define USE_HW_SHA
#elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000)
#define USE_HW_SHA
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
#if !defined(__SHA512__) || !defined(__AVX2__)
#define ATTRIB_SHA512 __attribute__((__target__("sha512,avx2")))
#endif
#endif
#elif defined(Z7_MSC_VER_ORIGINAL)
#if (_MSC_VER >= 1940)
#define USE_HW_SHA
#else
// #define Z7_USE_HW_SHA_STUB
#endif
#endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
// #define Z7_USE_HW_SHA_STUB // for debug
#endif
#ifdef USE_HW_SHA
// #pragma message("Sha512 HW")
#include <immintrin.h>
#if defined (__clang__) && defined(_MSC_VER)
#if !defined(__AVX__)
#include <avxintrin.h>
#endif
#if !defined(__AVX2__)
#include <avx2intrin.h>
#endif
#if !defined(__SHA512__)
#include <sha512intrin.h>
#endif
#else
#endif
/*
SHA512 uses:
AVX:
_mm256_loadu_si256 (vmovdqu)
_mm256_storeu_si256
_mm256_set_epi32 (unused)
AVX2:
_mm256_add_epi64 : vpaddq
_mm256_shuffle_epi8 : vpshufb
_mm256_shuffle_epi32 : pshufd
_mm256_blend_epi32 : vpblendd
_mm256_permute4x64_epi64 : vpermq : 3c
_mm256_permute2x128_si256: vperm2i128 : 3c
_mm256_extracti128_si256 : vextracti128 : 3c
SHA512:
_mm256_sha512*
*/
// K array must be aligned for 32-bytes at least.
// The compiler can look at the align attribute and select
// vmovdqu - for code without align attribute
// vmovdqa - for code with align attribute
extern
MY_ALIGN(64)
const UInt64 SHA512_K_ARRAY[80];
#define K SHA512_K_ARRAY
#define ADD_EPI64(dest, src) dest = _mm256_add_epi64(dest, src);
#define SHA512_MSG1(dest, src) dest = _mm256_sha512msg1_epi64(dest, _mm256_extracti128_si256(src, 0));
#define SHA512_MSG2(dest, src) dest = _mm256_sha512msg2_epi64(dest, src);
#define LOAD_SHUFFLE(m, k) \
m = _mm256_loadu_si256((const __m256i *)(const void *)(data + (k) * 32)); \
m = _mm256_shuffle_epi8(m, mask);
#define NNN(m0, m1, m2, m3)
#define SM1(m1, m2, m3, m0) \
SHA512_MSG1(m0, m1);
#define SM2(m2, m3, m0, m1) \
ADD_EPI64(m0, _mm256_permute4x64_epi64(_mm256_blend_epi32(m2, m3, 3), 0x39)); \
SHA512_MSG2(m0, m3);
#define RND2(t0, t1, lane) \
t0 = _mm256_sha512rnds2_epi64(t0, t1, _mm256_extracti128_si256(msg, lane));
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = _mm256_add_epi64(m0, *(const __m256i *) (const void *) &K[(k) * 4]); \
RND2(state0, state1, 0); OP0(m0, m1, m2, m3) \
RND2(state1, state0, 1); OP1(m0, m1, m2, m3)
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 )
#define PREPARE_STATE \
state0 = _mm256_shuffle_epi32(state0, 0x4e); /* cdab */ \
state1 = _mm256_shuffle_epi32(state1, 0x4e); /* ghef */ \
tmp = state0; \
state0 = _mm256_permute2x128_si256(state0, state1, 0x13); /* cdgh */ \
state1 = _mm256_permute2x128_si256(tmp, state1, 2); /* abef */
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA512
ATTRIB_SHA512
#endif
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
const __m256i mask = _mm256_set_epi32(
0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607,
0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607);
__m256i tmp, state0, state1;
if (numBlocks == 0)
return;
state0 = _mm256_loadu_si256((const __m256i *) (const void *) &state[0]);
state1 = _mm256_loadu_si256((const __m256i *) (const void *) &state[4]);
PREPARE_STATE
do
{
__m256i state0_save, state1_save;
__m256i m0, m1, m2, m3;
__m256i msg;
// #define msg tmp
state0_save = state0;
state1_save = state1;
LOAD_SHUFFLE (m0, 0)
LOAD_SHUFFLE (m1, 1)
LOAD_SHUFFLE (m2, 2)
LOAD_SHUFFLE (m3, 3)
R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 3, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 4, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
ADD_EPI64(state0, state0_save)
ADD_EPI64(state1, state1_save)
data += 128;
}
while (--numBlocks);
PREPARE_STATE
_mm256_storeu_si256((__m256i *) (void *) &state[0], state0);
_mm256_storeu_si256((__m256i *) (void *) &state[4], state1);
}
#endif // USE_HW_SHA
// gcc 8.5 also supports sha512, but we also need support in the assembler that is called by gcc
#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
#if defined(__ARM_FEATURE_SHA512)
#define USE_HW_SHA
#else
#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
|| defined(__GNUC__) && (__GNUC__ >= 9) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
#define USE_HW_SHA
#endif
#endif
#ifdef USE_HW_SHA
// #pragma message("=== Sha512 HW === ")
#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA512)
// #pragma message("=== we define SHA3 ATTRIB_SHA512 === ")
#if defined(__clang__)
#define ATTRIB_SHA512 __attribute__((__target__("sha3"))) // "armv8.2-a,sha3"
#else
#define ATTRIB_SHA512 __attribute__((__target__("arch=armv8.2-a+sha3")))
#endif
#endif
#endif
#if defined(Z7_MSC_VER_ORIGINAL)
#include <arm64_neon.h>
#else
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA512)
// #pragma message("=== we set __ARM_FEATURE_SHA512 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_SHA512_WAS_SET 1
#define __ARM_FEATURE_SHA512 1
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang
#include <arm_neon.h>
#if defined(Z7_ARM_FEATURE_SHA512_WAS_SET) && \
defined(__ARM_FEATURE_SHA512)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_SHA512
#undef Z7_ARM_FEATURE_SHA512_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif
#endif // Z7_MSC_VER_ORIGINAL
typedef uint64x2_t v128_64;
// typedef __n128 v128_64; // MSVC
#ifdef MY_CPU_BE
#define MY_rev64_for_LE(x) x
#else
#define MY_rev64_for_LE(x) vrev64q_u8(x)
#endif
#define LOAD_128_64(_p) vld1q_u64(_p)
#define LOAD_128_8(_p) vld1q_u8 (_p)
#define STORE_128_64(_p, _v) vst1q_u64(_p, _v)
#define LOAD_SHUFFLE(m, k) \
m = vreinterpretq_u64_u8( \
MY_rev64_for_LE( \
LOAD_128_8(data + (k) * 16)));
// K array must be aligned for 16-bytes at least.
extern
MY_ALIGN(64)
const UInt64 SHA512_K_ARRAY[80];
#define K SHA512_K_ARRAY
#define NN(m0, m1, m4, m5, m7)
#define SM(m0, m1, m4, m5, m7) \
m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));
#define R2(k, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP) \
OP(m0, m1, m4, m5, m7) \
t = vaddq_u64(m0, vld1q_u64(k)); \
t = vaddq_u64(vextq_u64(t, t, 1), a3); \
t = vsha512hq_u64(t, vextq_u64(a2, a3, 1), vextq_u64(a1, a2, 1)); \
a3 = vsha512h2q_u64(t, a1, a0); \
a1 = vaddq_u64(a1, t);
#define R8(k, m0,m1,m2,m3,m4,m5,m6,m7, OP) \
R2 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP ) \
R2 ( (k)+1*2, m1,m2,m3,m4,m5,m6,m7,m0, a3,a0,a1,a2, OP ) \
R2 ( (k)+2*2, m2,m3,m4,m5,m6,m7,m0,m1, a2,a3,a0,a1, OP ) \
R2 ( (k)+3*2, m3,m4,m5,m6,m7,m0,m1,m2, a1,a2,a3,a0, OP )
#define R16(k, OP) \
R8 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, OP ) \
R8 ( (k)+4*2, m4,m5,m6,m7,m0,m1,m2,m3, OP )
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA512
ATTRIB_SHA512
#endif
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
v128_64 a0, a1, a2, a3;
if (numBlocks == 0)
return;
a0 = LOAD_128_64(&state[0]);
a1 = LOAD_128_64(&state[2]);
a2 = LOAD_128_64(&state[4]);
a3 = LOAD_128_64(&state[6]);
do
{
v128_64 a0_save, a1_save, a2_save, a3_save;
v128_64 m0, m1, m2, m3, m4, m5, m6, m7;
v128_64 t;
unsigned i;
const UInt64 *k_ptr;
LOAD_SHUFFLE (m0, 0)
LOAD_SHUFFLE (m1, 1)
LOAD_SHUFFLE (m2, 2)
LOAD_SHUFFLE (m3, 3)
LOAD_SHUFFLE (m4, 4)
LOAD_SHUFFLE (m5, 5)
LOAD_SHUFFLE (m6, 6)
LOAD_SHUFFLE (m7, 7)
a0_save = a0;
a1_save = a1;
a2_save = a2;
a3_save = a3;
R16 ( K, NN )
k_ptr = K + 16;
for (i = 0; i < 4; i++)
{
R16 ( k_ptr, SM )
k_ptr += 16;
}
a0 = vaddq_u64(a0, a0_save);
a1 = vaddq_u64(a1, a1_save);
a2 = vaddq_u64(a2, a2_save);
a3 = vaddq_u64(a3, a3_save);
data += 128;
}
while (--numBlocks);
STORE_128_64(&state[0], a0);
STORE_128_64(&state[2], a1);
STORE_128_64(&state[4], a2);
STORE_128_64(&state[6], a3);
}
#endif // USE_HW_SHA
#endif // MY_CPU_ARM_OR_ARM64
#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// We can compile this file with another C compiler,
// or we can compile asm version.
// So we can generate real code instead of this stub function.
// #include "Sha512.h"
// #if defined(_MSC_VER)
#pragma message("Sha512 HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha512_UpdateBlocks (UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
Sha512_UpdateBlocks(state, data, numBlocks);
/*
UNUSED_VAR(state);
UNUSED_VAR(data);
UNUSED_VAR(numBlocks);
exit(1);
return;
*/
}
#endif
#undef K
#undef RND2
#undef MY_rev64_for_LE
#undef NN
#undef NNN
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef SM
#undef R2
#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA512
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB

C/Sort.c

@ -1,268 +1,141 @@
/* Sort.c -- Sort functions
: Igor Pavlov : Public domain */
2014-04-05 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Sort.h"
#include "CpuArch.h"
#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
|| (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
)
// the code with prefetch is slow for small arrays on x86.
// So we disable prefetch for x86.
#ifndef MY_CPU_X86
// #pragma message("Z7_PREFETCH : __builtin_prefetch")
#define Z7_PREFETCH(a) __builtin_prefetch((a))
#endif
#define HeapSortDown(p, k, size, temp) \
{ for (;;) { \
size_t s = (k << 1); \
if (s > size) break; \
if (s < size && p[s + 1] > p[s]) s++; \
if (temp >= p[s]) break; \
p[k] = p[s]; k = s; \
} p[k] = temp; }
#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)
#include "7zWindows.h"
// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
// For example, clang-cl can generate "prefetcht2" instruction for
// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
// But we want to generate "prefetcht0" instruction.
// So for CLANG/GCC we must use __builtin_prefetch() in code branch above
// instead of PreFetchCacheLine() / _mm_prefetch().
// New msvc-x86 compiler generates "prefetcht0" instruction for PreFetchCacheLine() call.
// But old x86 cpus don't support "prefetcht0".
// So we will use PreFetchCacheLine() only if we are sure that
// the generated instruction is supported by all cpus of that isa.
#if defined(MY_CPU_AMD64) \
|| defined(MY_CPU_ARM64) \
|| defined(MY_CPU_IA64)
// we need additional parentheses around (a) in the PreFetchCacheLine call, because
// the PreFetchCacheLine macro doesn't parenthesize its arguments:
// #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l)
// #pragma message("Z7_PREFETCH : PreFetchCacheLine")
#define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
#endif
#endif // _WIN32
#define PREFETCH_NO(p,k,s,size)
#ifndef Z7_PREFETCH
#define SORT_PREFETCH(p,k,s,size)
#else
// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes
#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch)
// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch)
#if PREFETCH_LEVEL == 0
#define SORT_PREFETCH(p,k,s,size)
#else // PREFETCH_LEVEL != 0
/*
if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
we prefetch one value per cache line.
Use it if array is aligned for cache line size (64 bytes)
or if array is small (less than L1 cache size).
if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
we prefetch all cache lines that can be required.
it can be faster for big unaligned arrays.
*/
#define USE_PREFETCH_FOR_ALIGNED_ARRAY
// s == k * 2
#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
// x86 supports (lea r1*8+offset)
#define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
#else
#define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
#endif
#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
#define PREFETCH_ADD_OFFSET 0
#else
// last offset that can be required in a PREFETCH_LEVEL step:
#define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
#define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2
#endif
#if PREFETCH_LEVEL <= 3
#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
Z7_PREFETCH((p + s2)); \
}}
#endif
#else // PREFETCH_LEVEL > 3
#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2 - 16)); \
Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
if (s2 <= size) { \
Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
Z7_PREFETCH((p + s2)); \
}}
#endif
#endif // PREFETCH_LEVEL > 3
#endif // PREFETCH_LEVEL != 0
#endif // Z7_PREFETCH
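/* Worked example of the offset arithmetic above (my reading, not from the
   source), for the default PREFETCH_LEVEL == 3 and s == k * 2:
     PREFETCH_OFFSET(k,s) = (k << 4) = 16*k
   Nodes 16k .. 16k+15 are the 16 descendants four levels below node k:
   64 bytes of UInt32, i.e. one cache line when the array is line-aligned
   (the aligned branch issues a single Z7_PREFETCH), or at most two lines
   otherwise (the unaligned branch covers both ends of the window, using
   PREFETCH_RANGE = (2 << 3) - 1 = 15). So the sift-down loop prefetches
   data four levels before the comparisons reach it. */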
#if defined(MY_CPU_ARM64) \
/* || defined(MY_CPU_AMD64) */ \
/* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
// we want to use cmov, if cmov is very fast:
// - this cmov version is slower for clang-x64.
// - this cmov version is faster for gcc-arm64 for some fast arm64 cpus.
#define Z7_FAST_CMOV_SUPPORTED
#endif
#ifdef Z7_FAST_CMOV_SUPPORTED
// we want to use cmov here, if cmov is fast: new arm64 cpus.
// we want the compiler to use conditional move for this branch
#define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
#else
// use this branch, if cpu doesn't support fast conditional move.
// it uses slow array access reading:
#define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
#endif
#define HeapSortDown(p, k, size, temp, macro_prefetch) \
{ \
for (;;) { \
UInt32 n0, n1; \
size_t s = k * 2; \
if (s >= size) { \
if (s == size) { \
n0 = p[s]; \
p[k] = n0; \
if (temp < n0) k = s; \
} \
break; \
} \
n0 = p[k * 2]; \
n1 = p[k * 2 + 1]; \
s += n0 < n1; \
GET_MAX_VAL(n0, n1, p[s]) \
if (temp >= n0) break; \
macro_prefetch(p, k, s, size) \
p[k] = n0; \
k = s; \
} \
p[k] = temp; \
}
/*
stage-1 : O(n) :
we generate an intermediate, partially sorted binary tree:
p[0] : an additional item for better alignment of the tree structure in memory.
p[1]
p[2] p[3]
p[4] p[5] p[6] p[7]
...
p[x] >= p[x * 2]
p[x] >= p[x * 2 + 1]
stage-2 : O(n * log2(n)) :
we move the largest item p[0] from the head of the tree to the end of the array
and insert the last item into the sorted binary tree.
*/
// (p) must be aligned for cache line size (64-bytes) for best performance
void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
void HeapSort(UInt32 *p, size_t size)
{
if (size < 2)
if (size <= 1)
return;
if (size == 2)
p--;
{
const UInt32 a0 = p[0];
const UInt32 a1 = p[1];
const unsigned k = a1 < a0;
p[k] = a0;
p[k ^ 1] = a1;
return;
}
{
// stage-1 : O(n)
// we transform array to partially sorted binary tree.
size_t i = --size / 2;
// (size) now is the index of the last item in tree,
// if (i)
{
do
{
const UInt32 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp, PREFETCH_NO)
}
while (--i);
}
{
const UInt32 temp = p[0];
const UInt32 a1 = p[1];
if (temp < a1)
{
size_t k = 1;
p[0] = a1;
HeapSortDown(p, k, size, temp, PREFETCH_NO)
}
}
}
if (size < 3)
{
// size == 2
const UInt32 a0 = p[0];
p[0] = p[2];
p[2] = a0;
return;
}
if (size != 3)
{
// stage-2 : O(size) * log2(size):
// we move largest item p[0] from head to the end of array,
// and insert last item to sorted binary tree.
size_t i = size / 2;
do
{
const UInt32 temp = p[size];
size_t k = p[2] < p[3] ? 3 : 2;
p[size--] = p[0];
p[0] = p[1];
p[1] = p[k];
HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
UInt32 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp)
}
while (size != 3);
while (--i != 0);
}
/*
do
{
size_t k = 1;
UInt32 temp = p[size];
p[size--] = p[1];
HeapSortDown(p, k, size, temp)
}
while (size > 1);
*/
while (size > 3)
{
UInt32 temp = p[size];
size_t k = (p[3] > p[2]) ? 3 : 2;
p[size--] = p[1];
p[1] = p[k];
HeapSortDown(p, k, size, temp)
}
{
const UInt32 a2 = p[2];
const UInt32 a3 = p[3];
const size_t k = a2 < a3;
p[2] = p[1];
p[3] = p[0];
p[k] = a3;
p[k ^ 1] = a2;
UInt32 temp = p[size];
p[size] = p[1];
if (size > 2 && p[2] < temp)
{
p[1] = p[2];
p[2] = temp;
}
else
p[1] = temp;
}
}
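/* Trivial driver for HeapSort above (hypothetical, not in the source):
   the function takes the element count and sorts ascending, in place. */
#include <stdio.h>
int main(void)
{
  UInt32 a[7] = { 5, 1, 4, 1, 5, 9, 2 };
  HeapSort(a, 7);
  for (unsigned i = 0; i < 7; i++)
    printf("%u ", (unsigned)a[i]); // prints: 1 1 2 4 5 5 9
  printf("\n");
  return 0;
}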
void HeapSort64(UInt64 *p, size_t size)
{
if (size <= 1)
return;
p--;
{
size_t i = size / 2;
do
{
UInt64 temp = p[i];
size_t k = i;
HeapSortDown(p, k, size, temp)
}
while (--i != 0);
}
/*
do
{
size_t k = 1;
UInt64 temp = p[size];
p[size--] = p[1];
HeapSortDown(p, k, size, temp)
}
while (size > 1);
*/
while (size > 3)
{
UInt64 temp = p[size];
size_t k = (p[3] > p[2]) ? 3 : 2;
p[size--] = p[1];
p[1] = p[k];
HeapSortDown(p, k, size, temp)
}
{
UInt64 temp = p[size];
p[size] = p[1];
if (size > 2 && p[2] < temp)
{
p[1] = p[2];
p[2] = temp;
}
else
p[1] = temp;
}
}
/*
#define HeapSortRefDown(p, vals, n, size, temp) \
{ size_t k = n; UInt32 val = vals[temp]; for (;;) { \
size_t s = (k << 1); \
if (s > size) break; \
if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \
if (val >= vals[p[s]]) break; \
p[k] = p[s]; k = s; \
} p[k] = temp; }
void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
{
if (size <= 1)
return;
p--;
{
size_t i = size / 2;
do
{
UInt32 temp = p[i];
HeapSortRefDown(p, vals, i, size, temp);
}
while (--i != 0);
}
do
{
UInt32 temp = p[size];
p[size--] = p[1];
HeapSortRefDown(p, vals, 1, size, temp);
}
while (size > 1);
}
*/

Some files were not shown because too many files have changed in this diff