Compare commits


10 commits
21.07...main

Author SHA1 Message Date
Igor Pavlov 5e96a82794 25.01 2025-08-03 16:14:59 +05:00
Igor Pavlov 395149956d 25.00 2025-07-05 19:27:33 +05:00
Igor Pavlov e5431fa6f5 24.09 2024-11-30 15:27:15 +05:00
Igor Pavlov e008ce3976 24.08 2024-08-12 16:50:32 +05:00
Igor Pavlov a7a1d4a241 24.07 2024-06-19 15:32:41 +05:00
Igor Pavlov 89a73b9012 24.06 2024-05-27 12:41:10 +05:00
Igor Pavlov fc662341e6 24.05 2024-05-15 23:55:04 +05:00
Igor Pavlov 5b39dc76f1 23.01 2023-12-17 14:59:19 +05:00
Igor Pavlov 93be7d4abf 22.01 2023-12-17 14:35:38 +05:00
Igor Pavlov a3e1d22737 22.00 2023-12-17 13:35:20 +05:00
1049 changed files with 100560 additions and 40248 deletions


@@ -1,7 +1,12 @@
 ; 7zAsm.asm -- ASM macros
-; 2021-12-25 : Igor Pavlov : Public domain
+; 2023-12-08 : Igor Pavlov : Public domain
+
+; UASM can require these changes
+; OPTION FRAMEPRESERVEFLAGS:ON
+; OPTION PROLOGUE:NONE
+; OPTION EPILOGUE:NONE
 ifdef @wordsize
 ; @wordsize is defined only in JWASM and ASMC and is not defined in MASM
 ; @wordsize eq 8 for 64-bit x64
@@ -116,10 +121,29 @@ endif
 x2_H equ DH
 x3_H equ BH
+; r0_L equ AL
+; r1_L equ CL
+; r2_L equ DL
+; r3_L equ BL
+; r0_H equ AH
+; r1_H equ CH
+; r2_H equ DH
+; r3_H equ BH
 ifdef x64
 x5_L equ BPL
 x6_L equ SIL
 x7_L equ DIL
+x8_L equ r8b
+x9_L equ r9b
+x10_L equ r10b
+x11_L equ r11b
+x12_L equ r12b
+x13_L equ r13b
+x14_L equ r14b
+x15_L equ r15b
 r0 equ RAX
 r1 equ RCX
@@ -148,6 +172,22 @@ else
 r7 equ x7
 endif
+x0_R equ r0
+x1_R equ r1
+x2_R equ r2
+x3_R equ r3
+x4_R equ r4
+x5_R equ r5
+x6_R equ r6
+x7_R equ r7
+x8_R equ r8
+x9_R equ r9
+x10_R equ r10
+x11_R equ r11
+x12_R equ r12
+x13_R equ r13
+x14_R equ r14
+x15_R equ r15
 ifdef x64
 ifdef ABI_LINUX
@@ -195,6 +235,14 @@ REG_ABI_PARAM_0 equ REG_PARAM_0
 REG_ABI_PARAM_1_x equ REG_PARAM_1_x
 REG_ABI_PARAM_1 equ REG_PARAM_1
+MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
+MY_PUSH_4_REGS
+endm
+MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
+MY_POP_4_REGS
+endm
 else
 ; x64
@@ -256,12 +304,25 @@ endm
 endif ; IS_LINUX
-MY_PUSH_PRESERVED_ABI_REGS macro
+MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
 if (IS_LINUX gt 0)
 MY_PUSH_2_REGS
 else
 MY_PUSH_4_REGS
 endif
+endm
+MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11 macro
+if (IS_LINUX gt 0)
+MY_POP_2_REGS
+else
+MY_POP_4_REGS
+endif
+endm
+MY_PUSH_PRESERVED_ABI_REGS macro
+MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
 push r12
 push r13
 push r14
@@ -274,11 +335,7 @@ MY_POP_PRESERVED_ABI_REGS macro
 pop r14
 pop r13
 pop r12
-if (IS_LINUX gt 0)
-MY_POP_2_REGS
-else
-MY_POP_4_REGS
-endif
+MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
 endm
 endif ; x64


@@ -1,180 +1,258 @@
 ; 7zCrcOpt.asm -- CRC32 calculation : optimized version
-; 2021-02-07 : Igor Pavlov : Public domain
+; 2023-12-08 : Igor Pavlov : Public domain
 include 7zAsm.asm
 MY_ASM_START
-rD equ r2
-rN equ r7
-rT equ r5
-ifdef x64
-num_VAR equ r8
-table_VAR equ r9
-else
-if (IS_CDECL gt 0)
-crc_OFFS equ (REG_SIZE * 5)
-data_OFFS equ (REG_SIZE + crc_OFFS)
-size_OFFS equ (REG_SIZE + data_OFFS)
-else
-size_OFFS equ (REG_SIZE * 5)
-endif
-table_OFFS equ (REG_SIZE + size_OFFS)
-num_VAR equ [r4 + size_OFFS]
-table_VAR equ [r4 + table_OFFS]
+NUM_WORDS equ 3
+UNROLL_CNT equ 2
+if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
+.err <NUM_WORDS_IS_INCORRECT>
+endif
+if (UNROLL_CNT lt 1)
+.err <UNROLL_CNT_IS_INCORRECT>
 endif
-SRCDAT equ rD + rN * 1 + 4 *
+rD equ r2
+rD_x equ x2
+rN equ r7
+rT equ r5
+ifndef x64
+if (IS_CDECL gt 0)
+crc_OFFS equ (REG_SIZE * 5)
+data_OFFS equ (REG_SIZE + crc_OFFS)
+size_OFFS equ (REG_SIZE + data_OFFS)
+else
+size_OFFS equ (REG_SIZE * 5)
+endif
+table_OFFS equ (REG_SIZE + size_OFFS)
+endif
+; rN + rD is same speed as rD, but we reduce one instruction in loop
+SRCDAT_1 equ rN + rD * 1 + 1 *
+SRCDAT_4 equ rN + rD * 1 + 4 *
 CRC macro op:req, dest:req, src:req, t:req
-op dest, DWORD PTR [rT + src * 4 + 0400h * t]
+op dest, dword ptr [rT + @CatStr(src, _R) * 4 + 0400h * (t)]
 endm
 CRC_XOR macro dest:req, src:req, t:req
 CRC xor, dest, src, t
 endm
 CRC_MOV macro dest:req, src:req, t:req
 CRC mov, dest, src, t
 endm
+MOVZXLO macro dest:req, src:req
+movzx dest, @CatStr(src, _L)
+endm
+MOVZXHI macro dest:req, src:req
+movzx dest, @CatStr(src, _H)
+endm
+; movzx x0, x0_L - is slow in some cpus (ivb), if same register for src and dest
+; movzx x3, x0_L sometimes is 0 cycles latency (not always)
+; movzx x3, x0_L sometimes is 0.5 cycles latency
+; movzx x3, x0_H is 2 cycles latency in some cpus
 CRC1b macro
-movzx x6, BYTE PTR [rD]
-inc rD
-movzx x3, x0_L
-xor x6, x3
-shr x0, 8
-CRC xor, x0, r6, 0
+movzx x6, byte ptr [rD]
+MOVZXLO x3, x0
+inc rD
+shr x0, 8
+xor x6, x3
+CRC_XOR x0, x6, 0
 dec rN
 endm
-MY_PROLOG macro crc_end:req
+LOAD_1 macro dest:req, t:req, iter:req, index:req
+movzx dest, byte ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
+endm
+LOAD_2 macro dest:req, t:req, iter:req, index:req
+movzx dest, word ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + index)]
+endm
+CRC_QUAD macro nn, t:req, iter:req
+ifdef x64
+; paired memory loads give 1-3% speed gain, but it uses more registers
+LOAD_2 x3, t, iter, 0
+LOAD_2 x9, t, iter, 2
+MOVZXLO x6, x3
+shr x3, 8
+CRC_XOR nn, x6, t * 4 + 3
+MOVZXLO x6, x9
+shr x9, 8
+CRC_XOR nn, x3, t * 4 + 2
+CRC_XOR nn, x6, t * 4 + 1
+CRC_XOR nn, x9, t * 4 + 0
+elseif 0
+LOAD_2 x3, t, iter, 0
+MOVZXLO x6, x3
+shr x3, 8
+CRC_XOR nn, x6, t * 4 + 3
+CRC_XOR nn, x3, t * 4 + 2
+LOAD_2 x3, t, iter, 2
+MOVZXLO x6, x3
+shr x3, 8
+CRC_XOR nn, x6, t * 4 + 1
+CRC_XOR nn, x3, t * 4 + 0
+elseif 0
+LOAD_1 x3, t, iter, 0
+LOAD_1 x6, t, iter, 1
+CRC_XOR nn, x3, t * 4 + 3
+CRC_XOR nn, x6, t * 4 + 2
+LOAD_1 x3, t, iter, 2
+LOAD_1 x6, t, iter, 3
+CRC_XOR nn, x3, t * 4 + 1
+CRC_XOR nn, x6, t * 4 + 0
+else
+; 32-bit load is better if there is only one read port (core2)
+; but that code can be slower if there are 2 read ports (snb)
+mov x3, dword ptr [SRCDAT_1 (4 * (NUM_WORDS - 1 - t + iter * NUM_WORDS) + 0)]
+MOVZXLO x6, x3
+CRC_XOR nn, x6, t * 4 + 3
+MOVZXHI x6, x3
+shr x3, 16
+CRC_XOR nn, x6, t * 4 + 2
+MOVZXLO x6, x3
+shr x3, 8
+CRC_XOR nn, x6, t * 4 + 1
+CRC_XOR nn, x3, t * 4 + 0
+endif
+endm
+LAST equ (4 * (NUM_WORDS - 1))
+CRC_ITER macro qq, nn, iter
+mov nn, [SRCDAT_4 (NUM_WORDS * (1 + iter))]
+i = 0
+rept NUM_WORDS - 1
+CRC_QUAD nn, i, iter
+i = i + 1
+endm
+MOVZXLO x6, qq
+mov x3, qq
+shr x3, 24
+CRC_XOR nn, x6, LAST + 3
+CRC_XOR nn, x3, LAST + 0
+ror qq, 16
+MOVZXLO x6, qq
+shr qq, 24
+CRC_XOR nn, x6, LAST + 1
+if ((UNROLL_CNT and 1) eq 1) and (iter eq (UNROLL_CNT - 1))
+CRC_MOV qq, qq, LAST + 2
+xor qq, nn
+else
+CRC_XOR nn, qq, LAST + 2
+endif
+endm
+; + 4 for prefetching next 4-bytes after current iteration
+NUM_BYTES_LIMIT equ (NUM_WORDS * 4 * UNROLL_CNT + 4)
+ALIGN_MASK equ 3
+; MY_PROC @CatStr(CrcUpdateT, 12), 4
+MY_PROC @CatStr(CrcUpdateT, %(NUM_WORDS * 4)), 4
+MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
 ifdef x64
+mov x0, REG_ABI_PARAM_0_x ; x0 = x1(win) / x7(linux)
+mov rT, REG_ABI_PARAM_3 ; r5 = r9(win) / x1(linux)
+mov rN, REG_ABI_PARAM_2 ; r7 = r8(win) / r2(linux)
+; mov rD, REG_ABI_PARAM_1 ; r2 = r2(win)
 if (IS_LINUX gt 0)
-MY_PUSH_2_REGS
-mov x0, REG_ABI_PARAM_0_x ; x0 = x7
-mov rT, REG_ABI_PARAM_3 ; r5 = r1
-mov rN, REG_ABI_PARAM_2 ; r7 = r2
 mov rD, REG_ABI_PARAM_1 ; r2 = r6
-else
-MY_PUSH_4_REGS
-mov x0, REG_ABI_PARAM_0_x ; x0 = x1
-mov rT, REG_ABI_PARAM_3 ; r5 = r9
-mov rN, REG_ABI_PARAM_2 ; r7 = r8
-; mov rD, REG_ABI_PARAM_1 ; r2 = r2
 endif
 else
-MY_PUSH_4_REGS
 if (IS_CDECL gt 0)
 mov x0, [r4 + crc_OFFS]
 mov rD, [r4 + data_OFFS]
 else
 mov x0, REG_ABI_PARAM_0_x
 endif
-mov rN, num_VAR
-mov rT, table_VAR
+mov rN, [r4 + size_OFFS]
+mov rT, [r4 + table_OFFS]
 endif
-test rN, rN
-jz crc_end
+cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
+jb crc_end
 @@:
-test rD, 7
+test rD_x, ALIGN_MASK ; test rD, ALIGN_MASK
 jz @F
 CRC1b
-jnz @B
+jmp @B
 @@:
-cmp rN, 16
-jb crc_end
-add rN, rD
-mov num_VAR, rN
-sub rN, 8
-and rN, NOT 7
-sub rD, rN
-xor x0, [SRCDAT 0]
+xor x0, dword ptr [rD]
+lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
+sub rD, rN
+align 16
+@@:
+unr_index = 0
+while unr_index lt UNROLL_CNT
+if (unr_index and 1) eq 0
+CRC_ITER x0, x1, unr_index
+else
+CRC_ITER x1, x0, unr_index
+endif
+unr_index = unr_index + 1
 endm
-MY_EPILOG macro crc_end:req
-xor x0, [SRCDAT 0]
-mov rD, rN
-mov rN, num_VAR
-sub rN, rD
-crc_end:
-test rN, rN
-jz @F
-CRC1b
-jmp crc_end
-@@:
-if (IS_X64 gt 0) and (IS_LINUX gt 0)
-MY_POP_2_REGS
-else
-MY_POP_4_REGS
-endif
-endm
-MY_PROC CrcUpdateT8, 4
-MY_PROLOG crc_end_8
-mov x1, [SRCDAT 1]
-align 16
-main_loop_8:
-mov x6, [SRCDAT 2]
-movzx x3, x1_L
-CRC_XOR x6, r3, 3
-movzx x3, x1_H
-CRC_XOR x6, r3, 2
-shr x1, 16
-movzx x3, x1_L
-movzx x1, x1_H
-CRC_XOR x6, r3, 1
-movzx x3, x0_L
-CRC_XOR x6, r1, 0
-mov x1, [SRCDAT 3]
-CRC_XOR x6, r3, 7
-movzx x3, x0_H
-shr x0, 16
-CRC_XOR x6, r3, 6
-movzx x3, x0_L
-CRC_XOR x6, r3, 5
-movzx x3, x0_H
-CRC_MOV x0, r3, 4
-xor x0, x6
-add rD, 8
-jnz main_loop_8
-MY_EPILOG crc_end_8
-MY_ENDP
-MY_PROC CrcUpdateT4, 4
-MY_PROLOG crc_end_4
-align 16
-main_loop_4:
-movzx x1, x0_L
-movzx x3, x0_H
-shr x0, 16
-movzx x6, x0_H
-and x0, 0FFh
-CRC_MOV x1, r1, 3
-xor x1, [SRCDAT 1]
-CRC_XOR x1, r3, 2
-CRC_XOR x1, r6, 0
-CRC_XOR x1, r0, 1
-movzx x0, x1_L
-movzx x3, x1_H
-shr x1, 16
-movzx x6, x1_H
-and x1, 0FFh
-CRC_MOV x0, r0, 3
-xor x0, [SRCDAT 2]
-CRC_XOR x0, r3, 2
-CRC_XOR x0, r6, 0
-CRC_XOR x0, r1, 1
-add rD, 8
-jnz main_loop_4
-MY_EPILOG crc_end_4
+add rD, NUM_WORDS * 4 * UNROLL_CNT
+jnc @B
+if 0
+; byte verson
+add rD, rN
+xor x0, dword ptr [rD]
+add rN, NUM_BYTES_LIMIT - 1
+else
+; 4-byte version
+add rN, 4 * NUM_WORDS * UNROLL_CNT
+sub rD, 4 * NUM_WORDS * UNROLL_CNT
+@@:
+MOVZXLO x3, x0
+MOVZXHI x1, x0
+shr x0, 16
+MOVZXLO x6, x0
+shr x0, 8
+CRC_MOV x0, x0, 0
+CRC_XOR x0, x3, 3
+CRC_XOR x0, x1, 2
+CRC_XOR x0, x6, 1
+add rD, 4
+if (NUM_WORDS * UNROLL_CNT) ne 1
+jc @F
+xor x0, [SRCDAT_4 0]
+jmp @B
+@@:
+endif
+add rD, rN
+add rN, 4 - 1
+endif
+sub rN, rD
+crc_end:
+test rN, rN
+jz func_end
+@@:
+CRC1b
+jnz @B
+func_end:
+MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
 MY_ENDP
 end
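The diff above folds the old fixed CrcUpdateT4/CrcUpdateT8 pair into one parameterized CrcUpdateT<4*NUM_WORDS> procedure (slicing-by-12 with the default NUM_WORDS equ 3). As a minimal sketch of the underlying technique, not part of the commit: the classic slicing-by-4 step below, in C, corresponds to one CRC_QUAD round. The only assumption is a table laid out as consecutive 256-entry sub-tables, which matches the 0400h (256 * 4 bytes) stride in the assembly.

#include <stddef.h>
#include <stdint.h>

/* one 256-entry sub-table per input byte position; table[0] is the
   plain byte-at-a-time table, which also serves the CRC1b tail loop */
static uint32_t crc32_slice4(uint32_t crc, const uint8_t *p, size_t n,
                             const uint32_t table[][256])
{
    for (; n >= 4; n -= 4, p += 4)
    {
        crc ^= (uint32_t)p[0] | ((uint32_t)p[1] << 8)
             | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
        crc = table[3][crc & 0xFF]
            ^ table[2][(crc >> 8) & 0xFF]
            ^ table[1][(crc >> 16) & 0xFF]
            ^ table[0][crc >> 24];
    }
    for (; n != 0; n--)                 /* unaligned head/tail bytes */
        crc = table[0][(crc ^ *p++) & 0xFF] ^ (crc >> 8);
    return crc;
}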


@@ -1,5 +1,5 @@
 ; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function
-; 2021-07-21: Igor Pavlov : Public domain
+; 2024-06-18: Igor Pavlov : Public domain
 ;
 ifndef x64
@@ -11,10 +11,31 @@ include 7zAsm.asm
 MY_ASM_START
-_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'
+ifndef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
+if (IS_LINUX gt 0)
+Z7_LZ_FIND_OPT_ASM_USE_SEGMENT equ 1
+else
+Z7_LZ_FIND_OPT_ASM_USE_SEGMENT equ 1
+endif
+endif
+ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
+_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE'
 MY_ALIGN macro num:req
 align num
+; align 16
+endm
+else
+MY_ALIGN macro num:req
+; We expect that ".text" is aligned for 16-bytes.
+; So we don't need large alignment inside our function.
+align 16
+endm
+endif
+MY_ALIGN_16 macro
+MY_ALIGN 16
 endm
 MY_ALIGN_32 macro
@@ -136,7 +157,11 @@ COPY_VAR_64 macro dest_var, src_var
 endm
+ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
 ; MY_ALIGN_64
+else
+MY_ALIGN_16
+endif
 MY_PROC GetMatchesSpecN_2, 13
 MY_PUSH_PRESERVED_ABI_REGS
 mov r0, RSP
@@ -508,6 +533,8 @@ fin:
 MY_POP_PRESERVED_ABI_REGS
 MY_ENDP
+ifdef Z7_LZ_FIND_OPT_ASM_USE_SEGMENT
 _TEXT$LZFINDOPT ENDS
+endif
 end


@@ -1,5 +1,5 @@
 ; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
-; 2021-02-23: Igor Pavlov : Public domain
+; 2024-06-18: Igor Pavlov : Public domain
 ;
 ; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
 ; function for check at link time.
@@ -17,11 +17,41 @@ include 7zAsm.asm
 MY_ASM_START
-_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
+; if Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT is defined, we use additional SEGMENT with 64-byte alignment.
+; if Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT is not defined, we use default SEGMENT (where default 16-byte alignment of segment is expected).
+; The performance is almost identical in our tests.
+; But the performance can depend from position of lzmadec code inside instruction cache
+; or micro-op cache line (depending from low address bits in 32-byte/64-byte cache lines).
+; And 64-byte alignment provides a more consistent speed regardless
+; of the code's position in the executable.
+; But also it's possible that code without Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT can be
+; slightly faster than 64-bytes aligned code in some cases, if offset of lzmadec
+; code in 64-byte block after compilation provides better speed by some reason.
+; Note that Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT adds an extra section to the ELF file.
+; If you don't want to get that extra section, do not define Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT.
+ifndef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
+if (IS_LINUX gt 0)
+Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT equ 1
+else
+Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT equ 1
+endif
+endif
+ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
+_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
 MY_ALIGN macro num:req
 align num
+; align 16
 endm
+else
+MY_ALIGN macro num:req
+; We expect that ".text" is aligned for 16-bytes.
+; So we don't need large alignment inside out function.
+align 16
+endm
+endif
 MY_ALIGN_16 macro
 MY_ALIGN 16
@@ -610,7 +640,11 @@ PARAM_lzma equ REG_ABI_PARAM_0
 PARAM_limit equ REG_ABI_PARAM_1
 PARAM_bufLimit equ REG_ABI_PARAM_2
+ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
 ; MY_ALIGN_64
+else
+MY_ALIGN_16
+endif
 MY_PROC LzmaDec_DecodeReal_3, 3
 MY_PUSH_PRESERVED_ABI_REGS
@@ -1298,6 +1332,8 @@ fin:
 MY_POP_PRESERVED_ABI_REGS
 MY_ENDP
+ifdef Z7_LZMA_DEC_OPT_ASM_USE_SEGMENT
 _TEXT$LZMADECOPT ENDS
+endif
 end


@@ -1,5 +1,5 @@
 ; Sha1Opt.asm -- SHA-1 optimized code for SHA-1 x86 hardware instructions
-; 2021-03-10 : Igor Pavlov : Public domain
+; 2024-06-16 : Igor Pavlov : Public domain
 include 7zAsm.asm
@@ -20,7 +20,7 @@ MY_ASM_START
-CONST SEGMENT
+CONST SEGMENT READONLY
 align 16
 Reverse_Endian_Mask db 15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0


@@ -1,5 +1,5 @@
 ; Sha256Opt.asm -- SHA-256 optimized code for SHA-256 x86 hardware instructions
-; 2021-03-10 : Igor Pavlov : Public domain
+; 2024-06-16 : Igor Pavlov : Public domain
 include 7zAsm.asm
@@ -20,7 +20,7 @@ endif
 EXTRN K_CONST:xmmword
 @
-CONST SEGMENT
+CONST SEGMENT READONLY
 align 16
 Reverse_Endian_Mask db 3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12
@@ -55,13 +55,19 @@ ifndef x64
 .xmm
 endif
+; jwasm-based assemblers for linux and linker from new versions of binutils
+; can generate incorrect code for load [ARRAY + offset] instructions.
+; 22.00: we load K_CONST offset to (rTable) register to avoid jwasm+binutils problem
+rTable equ r0
+; rTable equ K_CONST
 ifdef x64
 rNum equ REG_ABI_PARAM_2
 if (IS_LINUX eq 0)
 LOCAL_SIZE equ (16 * 2)
 endif
 else
-rNum equ r0
+rNum equ r3
 LOCAL_SIZE equ (16 * 1)
 endif
@@ -103,15 +109,18 @@ MY_PROLOG macro
 movdqa [r4 + 16], xmm9
 endif
 else ; x86
-if (IS_CDECL gt 0)
-mov rState, [r4 + REG_SIZE * 1]
-mov rData, [r4 + REG_SIZE * 2]
-mov rNum, [r4 + REG_SIZE * 3]
-else ; fastcall
-mov rNum, [r4 + REG_SIZE * 1]
-endif
+push r3
 push r5
 mov r5, r4
+NUM_PUSH_REGS equ 2
+PARAM_OFFSET equ (REG_SIZE * (1 + NUM_PUSH_REGS))
+if (IS_CDECL gt 0)
+mov rState, [r4 + PARAM_OFFSET]
+mov rData, [r4 + PARAM_OFFSET + REG_SIZE * 1]
+mov rNum, [r4 + PARAM_OFFSET + REG_SIZE * 2]
+else ; fastcall
+mov rNum, [r4 + PARAM_OFFSET]
+endif
 and r4, -16
 sub r4, LOCAL_SIZE
 endif
@@ -129,6 +138,7 @@ MY_EPILOG macro
 else ; x86
 mov r4, r5
 pop r5
+pop r3
 endif
 MY_ENDP
 endm
@@ -171,7 +181,7 @@ pre2 equ 2
 RND4 macro k
-movdqa msg, xmmword ptr [K_CONST + (k) * 16]
+movdqa msg, xmmword ptr [rTable + (k) * 16]
 paddd msg, @CatStr(xmm, %(w_regs + ((k + 0) mod 4)))
 MY_sha256rnds2 state0_N, state1_N
 pshufd msg, msg, 0eH
@@ -210,6 +220,8 @@ endm
 MY_PROC Sha256_UpdateBlocks_HW, 3
 MY_PROLOG
+lea rTable, [K_CONST]
 cmp rNum, 0
 je end_c

Asm/x86/Sort.asm Normal file

@@ -0,0 +1,860 @@
; SortTest.asm -- ASM version of HeapSort() function
; Igor Pavlov : Public domain
include ../../../../Asm/x86/7zAsm.asm
MY_ASM_START
ifndef Z7_SORT_ASM_USE_SEGMENT
if (IS_LINUX gt 0)
; Z7_SORT_ASM_USE_SEGMENT equ 1
else
; Z7_SORT_ASM_USE_SEGMENT equ 1
endif
endif
ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT SEGMENT ALIGN(64) 'CODE'
MY_ALIGN macro num:req
align num
endm
else
MY_ALIGN macro num:req
; We expect that ".text" is aligned for 16-bytes.
; So we don't need large alignment inside our function.
align 16
endm
endif
MY_ALIGN_16 macro
MY_ALIGN 16
endm
MY_ALIGN_32 macro
MY_ALIGN 32
endm
MY_ALIGN_64 macro
MY_ALIGN 64
endm
ifdef x64
NUM_PREFETCH_LEVELS equ 3 ; to prefetch 1x 64-bytes line (is good for most cases)
; NUM_PREFETCH_LEVELS equ 4 ; to prefetch 2x 64-bytes lines (better for big arrays)
acc equ x0
k equ r0
k_x equ x0
p equ r1
s equ r2
s_x equ x2
a0 equ x3
t0 equ a0
a3 equ x5
qq equ a3
a1 equ x6
t1 equ a1
t1_r equ r6
a2 equ x7
t2 equ a2
i equ r8
e0 equ x8
e1 equ x9
num_last equ r10
num_last_x equ x10
next4_lim equ r11
pref_lim equ r12
SORT_2_WITH_TEMP_REG macro b0, b1, temp_reg
mov temp_reg, b0
cmp b0, b1
cmovae b0, b1 ; min
cmovae b1, temp_reg ; max
endm
SORT macro b0, b1
SORT_2_WITH_TEMP_REG b0, b1, acc
endm
LOAD macro dest:req, index:req
mov dest, [p + 4 * index]
endm
STORE macro reg:req, index:req
mov [p + 4 * index], reg
endm
if (NUM_PREFETCH_LEVELS gt 3)
num_prefetches equ (1 SHL (NUM_PREFETCH_LEVELS - 3))
else
num_prefetches equ 1
endif
PREFETCH_OP macro offs
cur_offset = 7 * 4 ; it's average offset in 64-bytes cache line.
; cur_offset = 0 ; we can use zero offset, if we are sure that array is aligned for 64-bytes.
rept num_prefetches
if 1
prefetcht0 byte ptr [p + offs + cur_offset]
else
mov pref_x, dword ptr [p + offs + cur_offset]
endif
cur_offset = cur_offset + 64
endm
endm
PREFETCH_MY macro
if 1
if 1
shl k, NUM_PREFETCH_LEVELS + 3
else
; we delay prefetch instruction to improve main loads
shl k, NUM_PREFETCH_LEVELS
shl k, 3
; shl k, 0
endif
PREFETCH_OP k
elseif 1
shl k, 3
PREFETCH_OP k * (1 SHL NUM_PREFETCH_LEVELS) ; change it
endif
endm
STEP_1 macro exit_label, prefetch_macro
use_cmov_1 equ 1 ; set 1 for cmov, but it's slower in some cases
; set 0 for LOAD after adc s, 0
cmp t0, t1
if use_cmov_1
cmovb t0, t1
; STORE t0, k
endif
adc s, 0
if use_cmov_1 eq 0
LOAD t0, s
endif
cmp qq, t0
jae exit_label
if 1 ; use_cmov_1 eq 0
STORE t0, k
endif
prefetch_macro
mov t0, [p + s * 8]
mov t1, [p + s * 8 + 4]
mov k, s
add s, s ; slower for some cpus
; lea s, dword ptr [s + s] ; slower for some cpus
; shl s, 1 ; faster for some cpus
; lea s, dword ptr [s * 2] ; faster for some cpus
rept 0 ; 1000 for debug : 0 for normal
; number of calls in generate_stage : ~0.6 of number of items
shl k, 0
endm
endm
STEP_2 macro exit_label, prefetch_macro
use_cmov_2 equ 0 ; set 1 for cmov, but it's slower in some cases
; set 0 for LOAD after adc s, 0
cmp t0, t1
if use_cmov_2
mov t2, t0
cmovb t2, t1
; STORE t2, k
endif
mov t0, [p + s * 8]
mov t1, [p + s * 8 + 4]
cmovb t0, [p + s * 8 + 8]
cmovb t1, [p + s * 8 + 12]
adc s, 0
if use_cmov_2 eq 0
LOAD t2, s
endif
cmp qq, t2
jae exit_label
if 1 ; use_cmov_2 eq 0
STORE t2, k
endif
prefetch_macro
mov k, s
; add s, s
; lea s, [s + s]
shl s, 1
; lea s, [s * 2]
endm
MOVE_SMALLEST_UP macro STEP, use_prefetch, num_unrolls
LOCAL exit_1, exit_2, leaves, opt_loop, last_nodes
; s == k * 2
; t0 == (p)[s]
; t1 == (p)[s + 1]
cmp k, next4_lim
jae leaves
rept num_unrolls
STEP exit_2
cmp k, next4_lim
jae leaves
endm
if use_prefetch
prefetch_macro equ PREFETCH_MY
pref_lim_2 equ pref_lim
; lea pref_lim, dword ptr [num_last + 1]
; shr pref_lim, NUM_PREFETCH_LEVELS + 1
cmp k, pref_lim_2
jae last_nodes
else
prefetch_macro equ
pref_lim_2 equ next4_lim
endif
MY_ALIGN_16
opt_loop:
STEP exit_2, prefetch_macro
cmp k, pref_lim_2
jb opt_loop
last_nodes:
; k >= pref_lim_2
; 2 cases are possible:
; case-1: num_after_prefetch_levels == 0 && next4_lim = pref_lim_2
; case-2: num_after_prefetch_levels == NUM_PREFETCH_LEVELS - 1 &&
; next4_lim = pref_lim_2 / (NUM_PREFETCH_LEVELS - 1)
if use_prefetch
yyy = NUM_PREFETCH_LEVELS - 1
while yyy
yyy = yyy - 1
STEP exit_2
if yyy
cmp k, next4_lim
jae leaves
endif
endm
endif
leaves:
; k >= next4_lim == (num_last + 1) / 4 must be provided by previous code.
; we have 2 nodes in (s) level : always
; we can have some nodes in (s * 2) level : low probability case
; we have no nodes in (s * 4) level
; s == k * 2
; t0 == (p)[s]
; t1 == (p)[s + 1]
cmp t0, t1
cmovb t0, t1
adc s, 0
STORE t0, k
; t0 == (p)[s]
; s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1]
; we have 3 possible cases here:
; s * 2 > num_last : (s) node has no childs
; s * 2 == num_last : (s) node has 1 leaf child that is last item of array
; s * 2 < num_last : (s) node has 2 leaf childs. We provide (s * 4 > num_last)
; we check for (s * 2 > num_last) before "cmp qq, t0" check, because
; we will replace conditional jump with cmov instruction later.
lea t1_r, dword ptr [s + s]
cmp t1_r, num_last
ja exit_1 ; if (s * 2 > num_last), we have no childs : it's high probability branch
; it's low probability branch
; s * 2 <= num_last
cmp qq, t0
jae exit_2
; qq < t0, so we go to next level
; we check 1 or 2 childs in next level
mov t0, [p + s * 8]
mov k, s
mov s, t1_r
cmp t1_r, num_last
je @F ; (s == num_last) means that we have single child in tree
; (s < num_last) : so we must read both childs and select max of them.
mov t1, [p + k * 8 + 4]
cmp t0, t1
cmovb t0, t1
adc s, 0
@@:
STORE t0, k
exit_1:
; t0 == (p)[s], s / 2 == k : (s) is index of max item from (p)[k * 2], (p)[k * 2 + 1]
cmp qq, t0
cmovb k, s
exit_2:
STORE qq, k
endm
ifdef Z7_SORT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
MY_ALIGN_16
endif
MY_PROC HeapSort, 2
if (IS_LINUX gt 0)
mov p, REG_ABI_PARAM_0 ; r1 <- r7 : linux
endif
mov num_last, REG_ABI_PARAM_1 ; r10 <- r6 : linux
; r10 <- r2 : win64
cmp num_last, 2
jb end_1
; MY_PUSH_PRESERVED_ABI_REGS
MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
push r12
cmp num_last, 4
ja sort_5
LOAD a0, 0
LOAD a1, 1
SORT a0, a1
cmp num_last, 3
jb end_2
LOAD a2, 2
je sort_3
LOAD a3, 3
SORT a2, a3
SORT a1, a3
STORE a3, 3
sort_3:
SORT a0, a2
SORT a1, a2
STORE a2, 2
jmp end_2
sort_5:
; (num_last > 4) is required here
; if (num_last >= 6) : we will use optimized loop for leaf nodes loop_down_1
mov next4_lim, num_last
shr next4_lim, 2
dec num_last
mov k, num_last
shr k, 1
mov i, num_last
shr i, 2
test num_last, 1
jnz size_even
; ODD number of items. So we compare parent with single child
LOAD t1, num_last
LOAD t0, k
SORT_2_WITH_TEMP_REG t1, t0, t2
STORE t1, num_last
STORE t0, k
dec k
size_even:
cmp k, i
jbe loop_down ; jump for num_last == 4 case
if 0 ; 1 for debug
mov r15, k
mov r14d, 1 ; 100
loop_benchmark:
endif
; optimized loop for leaf nodes:
mov t0, [p + k * 8]
mov t1, [p + k * 8 + 4]
MY_ALIGN_16
loop_down_1:
; we compare parent with max of childs:
; lea s, dword ptr [2 * k]
mov s, k
cmp t0, t1
cmovb t0, t1
adc s, s
LOAD t2, k
STORE t0, k
cmp t2, t0
cmovae s, k
dec k
; we preload next items before STORE operation for calculated address
mov t0, [p + k * 8]
mov t1, [p + k * 8 + 4]
STORE t2, s
cmp k, i
jne loop_down_1
if 0 ; 1 for debug
mov k, r15
dec r14d
jnz loop_benchmark
; jmp end_debug
endif
MY_ALIGN_16
loop_down:
mov t0, [p + i * 8]
mov t1, [p + i * 8 + 4]
LOAD qq, i
mov k, i
lea s, dword ptr [i + i]
; jmp end_debug
DOWN_use_prefetch equ 0
DOWN_num_unrolls equ 0
MOVE_SMALLEST_UP STEP_1, DOWN_use_prefetch, DOWN_num_unrolls
sub i, 1
jnb loop_down
; jmp end_debug
LOAD e0, 0
LOAD e1, 1
LEVEL_3_LIMIT equ 8 ; 8 is default, but 7 also can work
cmp num_last, LEVEL_3_LIMIT + 1
jb main_loop_sort_5
MY_ALIGN_16
main_loop_sort:
; num_last > LEVEL_3_LIMIT
; p[size--] = p[0];
LOAD qq, num_last
STORE e0, num_last
mov e0, e1
mov next4_lim, num_last
shr next4_lim, 2
mov pref_lim, num_last
shr pref_lim, NUM_PREFETCH_LEVELS + 1
dec num_last
if 0 ; 1 for debug
; that optional optimization can improve the performance, if there are identical items in array
; 3 times improvement : if all items in array are identical
; 20% improvement : if items are different for 1 bit only
; 1-10% improvement : if items are different for (2+) bits
; no gain : if items are different
cmp qq, e1
jae next_iter_main
endif
LOAD e1, 2
LOAD t0, 3
mov k_x, 2
cmp e1, t0
cmovb e1, t0
mov t0, [p + 4 * (4 + 0)]
mov t1, [p + 4 * (4 + 1)]
cmovb t0, [p + 4 * (4 + 2)]
cmovb t1, [p + 4 * (4 + 3)]
adc k_x, 0
; (qq <= e1), because the tree is correctly sorted
; also here we could check (qq >= e1) or (qq == e1) for faster exit
lea s, dword ptr [k + k]
MAIN_use_prefetch equ 1
MAIN_num_unrolls equ 0
MOVE_SMALLEST_UP STEP_2, MAIN_use_prefetch, MAIN_num_unrolls
next_iter_main:
cmp num_last, LEVEL_3_LIMIT
jne main_loop_sort
; num_last == LEVEL_3_LIMIT
main_loop_sort_5:
; 4 <= num_last <= LEVEL_3_LIMIT
; p[size--] = p[0];
LOAD qq, num_last
STORE e0, num_last
mov e0, e1
dec num_last_x
LOAD e1, 2
LOAD t0, 3
mov k_x, 2
cmp e1, t0
cmovb e1, t0
adc k_x, 0
lea s_x, dword ptr [k * 2]
cmp s_x, num_last_x
ja exit_2
mov t0, [p + k * 8]
je exit_1
; s < num_last
mov t1, [p + k * 8 + 4]
cmp t0, t1
cmovb t0, t1
adc s_x, 0
exit_1:
STORE t0, k
cmp qq, t0
cmovb k_x, s_x
exit_2:
STORE qq, k
cmp num_last_x, 3
jne main_loop_sort_5
; num_last == 3 (real_size == 4)
LOAD a0, 2
LOAD a1, 3
STORE e1, 2
STORE e0, 3
SORT a0, a1
end_2:
STORE a0, 0
STORE a1, 1
; end_debug:
; MY_POP_PRESERVED_ABI_REGS
pop r12
MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP
else
; ------------ x86 32-bit ------------
ifdef x64
IS_CDECL = 0
endif
acc equ x0
k equ r0
k_x equ acc
p equ r1
num_last equ r2
num_last_x equ x2
a0 equ x3
t0 equ a0
a3 equ x5
i equ r5
e0 equ a3
a1 equ x6
qq equ a1
a2 equ x7
s equ r7
s_x equ a2
SORT macro b0, b1
cmp b1, b0
jae @F
if 1
xchg b0, b1
else
mov acc, b0
mov b0, b1 ; min
mov b1, acc ; max
endif
@@:
endm
LOAD macro dest:req, index:req
mov dest, [p + 4 * index]
endm
STORE macro reg:req, index:req
mov [p + 4 * index], reg
endm
STEP_1 macro exit_label
mov t0, [p + k * 8]
cmp t0, [p + k * 8 + 4]
adc s, 0
LOAD t0, s
STORE t0, k ; we lookahed stooring for most expected branch
cmp qq, t0
jae exit_label
; STORE t0, k ; use if
mov k, s
add s, s
; lea s, dword ptr [s + s]
; shl s, 1
; lea s, dword ptr [s * 2]
endm
STEP_BRANCH macro exit_label
mov t0, [p + k * 8]
cmp t0, [p + k * 8 + 4]
jae @F
inc s
mov t0, [p + k * 8 + 4]
@@:
cmp qq, t0
jae exit_label
STORE t0, k
mov k, s
add s, s
endm
MOVE_SMALLEST_UP macro STEP, num_unrolls, exit_2
LOCAL leaves, opt_loop, single
; s == k * 2
rept num_unrolls
cmp s, num_last
jae leaves
STEP_1 exit_2
endm
cmp s, num_last
jb opt_loop
leaves:
; (s >= num_last)
jne exit_2
single:
; (s == num_last)
mov t0, [p + k * 8]
cmp qq, t0
jae exit_2
STORE t0, k
mov k, s
jmp exit_2
MY_ALIGN_16
opt_loop:
STEP exit_2
cmp s, num_last
jb opt_loop
je single
exit_2:
STORE qq, k
endm
ifdef Z7_SORT_ASM_USE_SEGMENT
; MY_ALIGN_64
else
MY_ALIGN_16
endif
MY_PROC HeapSort, 2
ifdef x64
if (IS_LINUX gt 0)
mov num_last, REG_ABI_PARAM_1 ; r2 <- r6 : linux
mov p, REG_ABI_PARAM_0 ; r1 <- r7 : linux
endif
elseif (IS_CDECL gt 0)
mov num_last, [r4 + REG_SIZE * 2]
mov p, [r4 + REG_SIZE * 1]
endif
cmp num_last, 2
jb end_1
MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
cmp num_last, 4
ja sort_5
LOAD a0, 0
LOAD a1, 1
SORT a0, a1
cmp num_last, 3
jb end_2
LOAD a2, 2
je sort_3
LOAD a3, 3
SORT a2, a3
SORT a1, a3
STORE a3, 3
sort_3:
SORT a0, a2
SORT a1, a2
STORE a2, 2
jmp end_2
sort_5:
; num_last > 4
lea i, dword ptr [num_last - 2]
dec num_last
test i, 1
jz loop_down
; single child
mov t0, [p + num_last * 4]
mov qq, [p + num_last * 2]
dec i
cmp qq, t0
jae loop_down
mov [p + num_last * 2], t0
mov [p + num_last * 4], qq
MY_ALIGN_16
loop_down:
mov t0, [p + i * 4]
cmp t0, [p + i * 4 + 4]
mov k, i
mov qq, [p + i * 2]
adc k, 0
LOAD t0, k
cmp qq, t0
jae down_next
mov [p + i * 2], t0
lea s, dword ptr [k + k]
DOWN_num_unrolls equ 0
MOVE_SMALLEST_UP STEP_1, DOWN_num_unrolls, down_exit_label
down_next:
sub i, 2
jnb loop_down
; jmp end_debug
LOAD e0, 0
MY_ALIGN_16
main_loop_sort:
; num_last > 3
mov t0, [p + 2 * 4]
cmp t0, [p + 3 * 4]
LOAD qq, num_last
STORE e0, num_last
LOAD e0, 1
mov s_x, 2
mov k_x, 1
adc s, 0
LOAD t0, s
dec num_last
cmp qq, t0
jae main_exit_label
STORE t0, 1
mov k, s
add s, s
if 1
; for branch data prefetch mode :
; it's faster for large arrays : larger than (1 << 13) items.
MAIN_num_unrolls equ 10
STEP_LOOP equ STEP_BRANCH
else
MAIN_num_unrolls equ 0
STEP_LOOP equ STEP_1
endif
MOVE_SMALLEST_UP STEP_LOOP, MAIN_num_unrolls, main_exit_label
; jmp end_debug
cmp num_last, 3
jne main_loop_sort
; num_last == 3 (real_size == 4)
LOAD a0, 2
LOAD a1, 3
LOAD a2, 1
STORE e0, 3 ; e0 is alias for a3
STORE a2, 2
SORT a0, a1
end_2:
STORE a0, 0
STORE a1, 1
; end_debug:
MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
end_1:
MY_ENDP
endif
ifdef Z7_SORT_ASM_USE_SEGMENT
_TEXT$Z7_SORT ENDS
endif
if 0
LEA_IS_D8 (R64) [R2 * 4 + 16]
Lat : TP
2 : 1 : adl-e
2 : 3 p056 adl-p
1 : 2 : p15 hsw-rocket
1 : 2 : p01 snb-ivb
1 : 1 : p1 conroe-wsm
1 : 4 : zen3,zen4
2 : 4 : zen1,zen2
LEA_B_IS (R64) [R2 + R3 * 4]
Lat : TP
1 : 1 : adl-e
2 : 3 p056 adl-p
1 : 2 : p15 hsw-rocket
1 : 2 : p01 snb-ivb
1 : 1 : p1 nhm-wsm
1 : 1 : p0 conroe-wsm
1 : 4 : zen3,zen4
2 :2,4 : zen1,zen2
LEA_B_IS_D8 (R64) [R2 + R3 * 4 + 16]
Lat : TP
2 : 1 : adl-e
2 : 3 p056 adl-p
1 : 2 : p15 ice-rocket
3 : 1 : p1/p15 hsw-rocket
3 : 1 : p01 snb-ivb
1 : 1 : p1 nhm-wsm
1 : 1 : p0 conroe-wsm
2,1 : 2 : zen3,zen4
2 : 2 : zen1,zen2
CMOVB (R64, R64)
Lat : TP
1,2 : 2 : adl-e
1 : 2 p06 adl-p
1 : 2 : p06 bwd-rocket
1,2 : 2 : p0156+p06 hsw
1,2 :1.5 : p015+p05 snb-ivb
1,2 : 1 : p015+p05 nhm
1 : 1 : 2*p015 conroe
1 : 2 : zen3,zen4
1 : 4 : zen1,zen2
ADC (R64, 0)
Lat : TP
1,2 : 2 : adl-e
1 : 2 p06 adl-p
1 : 2 : p06 bwd-rocket
1 :1.5 : p0156+p06 hsw
1 :1.5 : p015+p05 snb-ivb
2 : 1 : 2*p015 conroe-wstm
1 : 2 : zen1,zen2,zen3,zen4
PREFETCHNTA : fetch data into non-temporal cache close to the processor, minimizing cache pollution.
L1 : Pentium3
L2 : NetBurst
L1, not L2: Core duo, Core 2, Atom processors
L1, not L2, may fetch into L3 with fast replacement: Nehalem, Westmere, Sandy Bridge, ...
NEHALEM: Fills L1/L3, L1 LRU is not updated
L3 with fast replacement: Xeon Processors based on Nehalem, Westmere, Sandy Bridge, ...
PREFETCHT0 : fetch data into all cache levels.
PREFETCHT1 : fetch data into L2 and L3
endif
end
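MOVE_SMALLEST_UP above is an unrolled, prefetching variant of heapsort's sift-down step. As a minimal sketch, not part of the commit: the plain C shape of that step, under the same 1-based indexing the assembly comments use (children of node k live at 2k and 2k + 1); the function name is illustrative.

static void sift_down(unsigned *p, size_t k, size_t num_last, unsigned qq)
{
    /* p is 1-based here: p[1] is the root, p[num_last] the last item */
    for (;;)
    {
        size_t s = k * 2;
        if (s > num_last)
            break;                    /* node k has no children */
        if (s < num_last && p[s] < p[s + 1])
            s++;                      /* s = index of the larger child */
        if (qq >= p[s])
            break;                    /* heap property holds, stop */
        p[k] = p[s];                  /* pull the larger child up */
        k = s;
    }
    p[k] = qq;                        /* drop the sifted value in place */
}

The assembly earns its speed on top of this shape by handling two levels per iteration (STEP_2), prefetching the cache lines the next levels will touch (PREFETCH_MY), and replacing branches with cmovb/adc arithmetic.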


@@ -1,113 +1,231 @@
 ; XzCrc64Opt.asm -- CRC64 calculation : optimized version
-; 2021-02-06 : Igor Pavlov : Public domain
+; 2023-12-08 : Igor Pavlov : Public domain
 include 7zAsm.asm
 MY_ASM_START
+NUM_WORDS equ 3
+if (NUM_WORDS lt 1) or (NUM_WORDS gt 64)
+.err <num_words_IS_INCORRECT>
+endif
+NUM_SKIP_BYTES equ ((NUM_WORDS - 2) * 4)
+MOVZXLO macro dest:req, src:req
+movzx dest, @CatStr(src, _L)
+endm
+MOVZXHI macro dest:req, src:req
+movzx dest, @CatStr(src, _H)
+endm
 ifdef x64
-rD equ r9
+rD equ r11
 rN equ r10
-rT equ r5
-num_VAR equ r8
-SRCDAT4 equ dword ptr [rD + rN * 1]
+rT equ r9
+CRC_OP macro op:req, dest:req, src:req, t:req
+op dest, QWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t)]
+endm
 CRC_XOR macro dest:req, src:req, t:req
-xor dest, QWORD PTR [rT + src * 8 + 0800h * t]
+CRC_OP xor, dest, src, t
+endm
+CRC_MOV macro dest:req, src:req, t:req
+CRC_OP mov, dest, src, t
 endm
 CRC1b macro
 movzx x6, BYTE PTR [rD]
 inc rD
-movzx x3, x0_L
+MOVZXLO x3, x0
 xor x6, x3
 shr r0, 8
-CRC_XOR r0, r6, 0
+CRC_XOR r0, x6, 0
 dec rN
 endm
-MY_PROLOG macro crc_end:req
-ifdef ABI_LINUX
-MY_PUSH_2_REGS
-else
-MY_PUSH_4_REGS
-endif
-mov r0, REG_ABI_PARAM_0
-mov rN, REG_ABI_PARAM_2
-mov rT, REG_ABI_PARAM_3
-mov rD, REG_ABI_PARAM_1
-test rN, rN
-jz crc_end
-@@:
-test rD, 3
-jz @F
-CRC1b
-jnz @B
-@@:
-cmp rN, 8
-jb crc_end
-add rN, rD
-mov num_VAR, rN
-sub rN, 4
-and rN, NOT 3
-sub rD, rN
-mov x1, SRCDAT4
-xor r0, r1
-add rN, 4
+; ALIGN_MASK is 3 or 7 bytes alignment:
+ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4)
+if NUM_WORDS eq 1
+src_rN_offset equ 4
+; + 4 for prefetching next 4-bytes after current iteration
+NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 4)
+SRCDAT4 equ DWORD PTR [rN + rD * 1]
+XOR_NEXT macro
+mov x1, [rD]
+xor r0, r1
 endm
-MY_EPILOG macro crc_end:req
-sub rN, 4
-mov x1, SRCDAT4
-xor r0, r1
-mov rD, rN
-mov rN, num_VAR
-sub rN, rD
-crc_end:
-test rN, rN
-jz @F
-CRC1b
-jmp crc_end
-@@:
-ifdef ABI_LINUX
-MY_POP_2_REGS
-else
-MY_POP_4_REGS
-endif
+else ; NUM_WORDS > 1
+src_rN_offset equ 8
+; + 8 for prefetching next 8-bytes after current iteration
+NUM_BYTES_LIMIT equ (NUM_WORDS * 4 + 8)
+XOR_NEXT macro
+xor r0, QWORD PTR [rD] ; 64-bit read, can be unaligned
 endm
-MY_PROC XzCrc64UpdateT4, 4
-MY_PROLOG crc_end_4
-align 16
-main_loop_4:
-mov x1, SRCDAT4
-movzx x2, x0_L
-movzx x3, x0_H
-shr r0, 16
-movzx x6, x0_L
-movzx x7, x0_H
-shr r0, 16
-CRC_XOR r1, r2, 3
-CRC_XOR r0, r3, 2
-CRC_XOR r1, r6, 1
-CRC_XOR r0, r7, 0
-xor r0, r1
-add rD, 4
-jnz main_loop_4
-MY_EPILOG crc_end_4
+endif
+; 32-bit or 64-bit
+LOAD_SRC_MULT4 macro dest:req, word_index:req
+mov dest, [rN + rD * 1 + 4 * (word_index) - src_rN_offset];
+endm
+MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 4
+MY_PUSH_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
+mov r0, REG_ABI_PARAM_0 ; r0 <- r1 / r7
+mov rD, REG_ABI_PARAM_1 ; r11 <- r2 / r6
+mov rN, REG_ABI_PARAM_2 ; r10 <- r8 / r2
+if (IS_LINUX gt 0)
+mov rT, REG_ABI_PARAM_3 ; r9 <- r9 / r1
+endif
+cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
+jb crc_end
+@@:
+test rD, ALIGN_MASK
+jz @F
+CRC1b
+jmp @B
+@@:
+XOR_NEXT
+lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
+sub rD, rN
+add rN, src_rN_offset
+align 16
+@@:
+if NUM_WORDS eq 1
+mov x1, x0
+shr x1, 8
+MOVZXLO x3, x1
+MOVZXLO x2, x0
+shr x1, 8
+shr r0, 32
+xor x0, SRCDAT4
+CRC_XOR r0, x2, 3
+CRC_XOR r0, x3, 2
+MOVZXLO x2, x1
+shr x1, 8
+CRC_XOR r0, x2, 1
+CRC_XOR r0, x1, 0
+else ; NUM_WORDS > 1
+if NUM_WORDS ne 2
+k = 2
+while k lt NUM_WORDS
+LOAD_SRC_MULT4 x1, k
+crc_op1 textequ <xor>
+if k eq 2
+if (NUM_WORDS and 1)
+LOAD_SRC_MULT4 x7, NUM_WORDS ; aligned 32-bit
+LOAD_SRC_MULT4 x6, NUM_WORDS + 1 ; aligned 32-bit
+shl r6, 32
+else
+LOAD_SRC_MULT4 r6, NUM_WORDS ; aligned 64-bit
+crc_op1 textequ <mov>
+endif
+endif
+table = 4 * (NUM_WORDS - 1 - k)
+MOVZXLO x3, x1
+CRC_OP crc_op1, r7, x3, 3 + table
+MOVZXHI x3, x1
+shr x1, 16
+CRC_XOR r6, x3, 2 + table
+MOVZXLO x3, x1
+shr x1, 8
+CRC_XOR r7, x3, 1 + table
+CRC_XOR r6, x1, 0 + table
+k = k + 1
+endm
+crc_op2 textequ <xor>
+else ; NUM_WORDS == 2
+LOAD_SRC_MULT4 r6, NUM_WORDS ; aligned 64-bit
+crc_op2 textequ <mov>
+endif ; NUM_WORDS == 2
+MOVZXHI x3, x0
+MOVZXLO x2, x0
+mov r1, r0
+shr r1, 32
+shr x0, 16
+CRC_XOR r6, x2, NUM_SKIP_BYTES + 7
+CRC_OP crc_op2, r7, x3, NUM_SKIP_BYTES + 6
+MOVZXLO x2, x0
+MOVZXHI x5, x1
+MOVZXLO x3, x1
+shr x0, 8
+shr x1, 16
+CRC_XOR r7, x2, NUM_SKIP_BYTES + 5
+CRC_XOR r6, x3, NUM_SKIP_BYTES + 3
+CRC_XOR r7, x0, NUM_SKIP_BYTES + 4
+CRC_XOR r6, x5, NUM_SKIP_BYTES + 2
+MOVZXLO x2, x1
+shr x1, 8
+CRC_XOR r7, x2, NUM_SKIP_BYTES + 1
+CRC_MOV r0, x1, NUM_SKIP_BYTES + 0
+xor r0, r6
+xor r0, r7
+endif ; NUM_WORDS > 1
+add rD, NUM_WORDS * 4
+jnc @B
+sub rN, src_rN_offset
+add rD, rN
+XOR_NEXT
+add rN, NUM_BYTES_LIMIT - 1
+sub rN, rD
+crc_end:
+test rN, rN
+jz func_end
+@@:
+CRC1b
+jnz @B
+func_end:
+MY_POP_PRESERVED_ABI_REGS_UP_TO_INCLUDING_R11
 MY_ENDP
 else
+; ==================================================================
 ; x86 (32-bit)
-rD equ r1
-rN equ r7
+rD equ r7
+rN equ r1
 rT equ r5
+xA equ x6
+xA_R equ r6
+ifdef x64
+num_VAR equ r8
+else
 crc_OFFS equ (REG_SIZE * 5)
 if (IS_CDECL gt 0) or (IS_LINUX gt 0)
@@ -133,107 +251,273 @@ else
 table_VAR equ [r4 + table_OFFS]
 num_VAR equ table_VAR
 endif
+endif ; x64
-SRCDAT4 equ dword ptr [rD + rN * 1]
+SRCDAT4 equ DWORD PTR [rN + rD * 1]
+CRC_1 macro op:req, dest:req, src:req, t:req, word_index:req
+op dest, DWORD PTR [rT + @CatStr(src, _R) * 8 + 0800h * (t) + (word_index) * 4]
+endm
 CRC macro op0:req, op1:req, dest0:req, dest1:req, src:req, t:req
-op0 dest0, DWORD PTR [rT + src * 8 + 0800h * t]
-op1 dest1, DWORD PTR [rT + src * 8 + 0800h * t + 4]
+CRC_1 op0, dest0, src, t, 0
+CRC_1 op1, dest1, src, t, 1
 endm
 CRC_XOR macro dest0:req, dest1:req, src:req, t:req
 CRC xor, xor, dest0, dest1, src, t
 endm
 CRC1b macro
-movzx x6, BYTE PTR [rD]
+movzx xA, BYTE PTR [rD]
 inc rD
-movzx x3, x0_L
-xor x6, x3
-shrd r0, r2, 8
-shr r2, 8
-CRC_XOR r0, r2, r6, 0
+MOVZXLO x3, x0
+xor xA, x3
+shrd x0, x2, 8
+shr x2, 8
+CRC_XOR x0, x2, xA, 0
 dec rN
 endm
-MY_PROLOG macro crc_end:req
-MY_PUSH_4_REGS
-if (IS_CDECL gt 0) or (IS_LINUX gt 0)
-proc_numParams = proc_numParams + 2 ; for ABI_LINUX
-mov rN, [r4 + size_OFFS]
-mov rD, [r4 + data_OFFS]
-else
-mov rN, r2
-endif
-mov x0, [r4 + crc_OFFS]
-mov x2, [r4 + crc_OFFS + 4]
-mov rT, table_VAR
-test rN, rN
-jz crc_end
-@@:
-test rD, 3
-jz @F
-CRC1b
-jnz @B
-@@:
-cmp rN, 8
-jb crc_end
-add rN, rD
-mov num_VAR, rN
-sub rN, 4
-and rN, NOT 3
-sub rD, rN
-xor r0, SRCDAT4
-add rN, 4
+MY_PROLOG_BASE macro
+MY_PUSH_4_REGS
+ifdef x64
+mov r0, REG_ABI_PARAM_0 ; r0 <- r1 / r7
+mov rT, REG_ABI_PARAM_3 ; r5 <- r9 / r1
+mov rN, REG_ABI_PARAM_2 ; r1 <- r8 / r2
+mov rD, REG_ABI_PARAM_1 ; r7 <- r2 / r6
+mov r2, r0
+shr r2, 32
+mov x0, x0
+else
+if (IS_CDECL gt 0) or (IS_LINUX gt 0)
+proc_numParams = proc_numParams + 2 ; for ABI_LINUX
+mov rN, [r4 + size_OFFS]
+mov rD, [r4 + data_OFFS]
+else
+mov rD, REG_ABI_PARAM_0 ; r7 <- r1 : (data)
+mov rN, REG_ABI_PARAM_1 ; r1 <- r2 : (size)
+endif
+mov x0, [r4 + crc_OFFS]
+mov x2, [r4 + crc_OFFS + 4]
+mov rT, table_VAR
+endif
 endm
-MY_EPILOG macro crc_end:req
-sub rN, 4
-xor r0, SRCDAT4
-mov rD, rN
-mov rN, num_VAR
-sub rN, rD
-crc_end:
-test rN, rN
-jz @F
-CRC1b
-jmp crc_end
-@@:
-MY_POP_4_REGS
+MY_EPILOG_BASE macro crc_end:req, func_end:req
+crc_end:
+test rN, rN
+jz func_end
+@@:
+CRC1b
+jnz @B
+func_end:
+ifdef x64
+shl r2, 32
+xor r0, r2
+endif
+MY_POP_4_REGS
 endm
-MY_PROC XzCrc64UpdateT4, 5
-MY_PROLOG crc_end_4
-movzx x6, x0_L
-align 16
-main_loop_4:
-mov r3, SRCDAT4
-xor r3, r2
-CRC xor, mov, r3, r2, r6, 3
-movzx x6, x0_H
-shr r0, 16
-CRC_XOR r3, r2, r6, 2
-movzx x6, x0_L
-movzx x0, x0_H
-CRC_XOR r3, r2, r6, 1
-CRC_XOR r3, r2, r0, 0
-movzx x6, x3_L
-mov r0, r3
-add rD, 4
-jnz main_loop_4
-MY_EPILOG crc_end_4
+; ALIGN_MASK is 3 or 7 bytes alignment:
+ALIGN_MASK equ (7 - (NUM_WORDS and 1) * 4)
+if (NUM_WORDS eq 1)
+NUM_BYTES_LIMIT_T4 equ (NUM_WORDS * 4 + 4)
+MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
+MY_PROLOG_BASE
+cmp rN, NUM_BYTES_LIMIT_T4 + ALIGN_MASK
+jb crc_end_4
+@@:
+test rD, ALIGN_MASK
+jz @F
+CRC1b
+jmp @B
+@@:
+xor x0, [rD]
+lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT_T4 - 1)]
+sub rD, rN
+add rN, 4
+MOVZXLO xA, x0
+align 16
+@@:
+mov x3, SRCDAT4
+xor x3, x2
+shr x0, 8
+CRC xor, mov, x3, x2, xA, 3
+MOVZXLO xA, x0
+shr x0, 8
+; MOVZXHI xA, x0
+; shr x0, 16
+CRC_XOR x3, x2, xA, 2
+MOVZXLO xA, x0
+shr x0, 8
+CRC_XOR x3, x2, xA, 1
+CRC_XOR x3, x2, x0, 0
+MOVZXLO xA, x3
+mov x0, x3
+add rD, 4
+jnc @B
+sub rN, 4
+add rD, rN
+xor x0, [rD]
+add rN, NUM_BYTES_LIMIT_T4 - 1
+sub rN, rD
+MY_EPILOG_BASE crc_end_4, func_end_4
 MY_ENDP
-endif ; ! x64
+else ; NUM_WORDS > 1
+SHR_X macro x, imm
+shr x, imm
+endm
+ITER_1 macro v0, v1, a, off
+MOVZXLO xA, a
+SHR_X a, 8
+CRC_XOR v0, v1, xA, off
+endm
+ITER_4 macro v0, v1, a, off
+if 0 eq 0
+ITER_1 v0, v1, a, off + 3
+ITER_1 v0, v1, a, off + 2
+ITER_1 v0, v1, a, off + 1
+CRC_XOR v0, v1, a, off
+elseif 0 eq 0
+MOVZXLO xA, a
+CRC_XOR v0, v1, xA, off + 3
+mov xA, a
+ror a, 16 ; 32-bit ror
+shr xA, 24
+CRC_XOR v0, v1, xA, off
+MOVZXLO xA, a
+SHR_X a, 24
+CRC_XOR v0, v1, xA, off + 1
+CRC_XOR v0, v1, a, off + 2
+else
+; MOVZXHI provides smaller code, but MOVZX_HI_BYTE is not fast instruction
+MOVZXLO xA, a
+CRC_XOR v0, v1, xA, off + 3
+MOVZXHI xA, a
+SHR_X a, 16
+CRC_XOR v0, v1, xA, off + 2
+MOVZXLO xA, a
+SHR_X a, 8
+CRC_XOR v0, v1, xA, off + 1
+CRC_XOR v0, v1, a, off
+endif
+endm
+ITER_1_PAIR macro v0, v1, a0, a1, off
+ITER_1 v0, v1, a0, off + 4
+ITER_1 v0, v1, a1, off
+endm
+src_rD_offset equ 8
+STEP_SIZE equ (NUM_WORDS * 4)
+ITER_12_NEXT macro op, index, v0, v1
+op v0, DWORD PTR [rD + (index + 1) * STEP_SIZE - src_rD_offset]
+op v1, DWORD PTR [rD + (index + 1) * STEP_SIZE + 4 - src_rD_offset]
+endm
+ITER_12 macro index, a0, a1, v0, v1
+if NUM_SKIP_BYTES eq 0
+ITER_12_NEXT mov, index, v0, v1
+else
+k = 0
+while k lt NUM_SKIP_BYTES
+movzx xA, BYTE PTR [rD + (index) * STEP_SIZE + k + 8 - src_rD_offset]
+if k eq 0
+CRC mov, mov, v0, v1, xA, NUM_SKIP_BYTES - 1 - k
+else
+CRC_XOR v0, v1, xA, NUM_SKIP_BYTES - 1 - k
+endif
+k = k + 1
+endm
+ITER_12_NEXT xor, index, v0, v1
+endif
+if 0 eq 0
+ITER_4 v0, v1, a0, NUM_SKIP_BYTES + 4
+ITER_4 v0, v1, a1, NUM_SKIP_BYTES
+else ; interleave version is faster/slower for different processors
+ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 3
+ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 2
+ITER_1_PAIR v0, v1, a0, a1, NUM_SKIP_BYTES + 1
+CRC_XOR v0, v1, a0, NUM_SKIP_BYTES + 4
+CRC_XOR v0, v1, a1, NUM_SKIP_BYTES
+endif
+endm
+; we use (UNROLL_CNT > 1) to reduce read ports pressure (num_VAR reads)
+UNROLL_CNT equ (2 * 1)
+NUM_BYTES_LIMIT equ (STEP_SIZE * UNROLL_CNT + 8)
+MY_PROC @CatStr(XzCrc64UpdateT, %(NUM_WORDS * 4)), 5
+MY_PROLOG_BASE
+cmp rN, NUM_BYTES_LIMIT + ALIGN_MASK
+jb crc_end_12
+@@:
+test rD, ALIGN_MASK
+jz @F
+CRC1b
+jmp @B
+@@:
+xor x0, [rD]
+xor x2, [rD + 4]
+add rD, src_rD_offset
+lea rN, [rD + rN * 1 - (NUM_BYTES_LIMIT - 1)]
+mov num_VAR, rN
+align 16
+@@:
+i = 0
+rept UNROLL_CNT
+if (i and 1) eq 0
+ITER_12 i, x0, x2, x1, x3
+else
+ITER_12 i, x1, x3, x0, x2
+endif
+i = i + 1
+endm
+if (UNROLL_CNT and 1)
+mov x0, x1
+mov x2, x3
+endif
+add rD, STEP_SIZE * UNROLL_CNT
+cmp rD, num_VAR
+jb @B
+mov rN, num_VAR
+add rN, NUM_BYTES_LIMIT - 1
+sub rN, rD
+sub rD, src_rD_offset
+xor x0, [rD]
+xor x2, [rD + 4]
+MY_EPILOG_BASE crc_end_12, func_end_12
+MY_ENDP
+endif ; (NUM_WORDS > 1)
+endif ; ! x64
 end
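The structure mirrors 7zCrcOpt.asm above, with 64-bit table entries (hence the 0800h = 256 * 8 byte sub-table stride). As a minimal sketch, not part of the commit: the byte-at-a-time C form of the CRC-64 update these routines accelerate, assuming the reflected ECMA-182 polynomial that XZ's CRC-64 uses.

#include <stdint.h>

#define CRC64_POLY UINT64_C(0xC96C5795D7870F42) /* reflected ECMA-182 */

static uint64_t crc64_table[256];

static void crc64_generate_table(void)
{
    for (uint32_t i = 0; i < 256; i++)
    {
        uint64_t r = i;
        for (int j = 0; j < 8; j++)        /* one step per bit */
            r = (r >> 1) ^ (CRC64_POLY & (0 - (r & 1)));
        crc64_table[i] = r;
    }
}

/* the C form of the CRC1b macro's per-byte step */
static uint64_t crc64_update_byte(uint64_t crc, uint8_t b)
{
    return crc64_table[(crc ^ b) & 0xFF] ^ (crc >> 8);
}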

C/7z.h

@@ -1,8 +1,8 @@
 /* 7z.h -- 7z interface
-2018-07-02 : Igor Pavlov : Public domain */
+2023-04-02 : Igor Pavlov : Public domain */
-#ifndef __7Z_H
-#define __7Z_H
+#ifndef ZIP7_INC_7Z_H
+#define ZIP7_INC_7Z_H
 #include "7zTypes.h"
@@ -98,7 +98,7 @@ typedef struct
 UInt64 SzAr_GetFolderUnpackSize(const CSzAr *p, UInt32 folderIndex);
 SRes SzAr_DecodeFolder(const CSzAr *p, UInt32 folderIndex,
-ILookInStream *stream, UInt64 startPos,
+ILookInStreamPtr stream, UInt64 startPos,
 Byte *outBuffer, size_t outSize,
 ISzAllocPtr allocMain);
@@ -174,7 +174,7 @@ UInt16 *SzArEx_GetFullNameUtf16_Back(const CSzArEx *p, size_t fileIndex, UInt16
 SRes SzArEx_Extract(
 const CSzArEx *db,
-ILookInStream *inStream,
+ILookInStreamPtr inStream,
 UInt32 fileIndex, /* index of file */
 UInt32 *blockIndex, /* index of solid block */
 Byte **outBuffer, /* pointer to pointer to output buffer (allocated with allocMain) */
@@ -196,7 +196,7 @@ SZ_ERROR_INPUT_EOF
 SZ_ERROR_FAIL
 */
-SRes SzArEx_Open(CSzArEx *p, ILookInStream *inStream,
+SRes SzArEx_Open(CSzArEx *p, ILookInStreamPtr inStream,
 ISzAllocPtr allocMain, ISzAllocPtr allocTemp);
 EXTERN_C_END


@@ -1,5 +1,5 @@
-/* 7zAlloc.c -- Allocation functions
-2017-04-03 : Igor Pavlov : Public domain */
+/* 7zAlloc.c -- Allocation functions for 7z processing
+2023-03-04 : Igor Pavlov : Public domain */
 #include "Precomp.h"
@@ -7,74 +7,83 @@
 #include "7zAlloc.h"
-/* #define _SZ_ALLOC_DEBUG */
-/* use _SZ_ALLOC_DEBUG to debug alloc/free operations */
-#ifdef _SZ_ALLOC_DEBUG
+/* #define SZ_ALLOC_DEBUG */
+/* use SZ_ALLOC_DEBUG to debug alloc/free operations */
+#ifdef SZ_ALLOC_DEBUG
+/*
 #ifdef _WIN32
-#include <windows.h>
+#include "7zWindows.h"
 #endif
+*/
 #include <stdio.h>
-int g_allocCount = 0;
-int g_allocCountTemp = 0;
+static int g_allocCount = 0;
+static int g_allocCountTemp = 0;
+static void Print_Alloc(const char *s, size_t size, int *counter)
+{
+const unsigned size2 = (unsigned)size;
+fprintf(stderr, "\n%s count = %10d : %10u bytes; ", s, *counter, size2);
+(*counter)++;
+}
+static void Print_Free(const char *s, int *counter)
+{
+(*counter)--;
+fprintf(stderr, "\n%s count = %10d", s, *counter);
+}
 #endif
 void *SzAlloc(ISzAllocPtr p, size_t size)
 {
-UNUSED_VAR(p);
+UNUSED_VAR(p)
 if (size == 0)
 return 0;
-#ifdef _SZ_ALLOC_DEBUG
-fprintf(stderr, "\nAlloc %10u bytes; count = %10d", (unsigned)size, g_allocCount);
-g_allocCount++;
+#ifdef SZ_ALLOC_DEBUG
+Print_Alloc("Alloc", size, &g_allocCount);
 #endif
 return malloc(size);
 }
 void SzFree(ISzAllocPtr p, void *address)
 {
-UNUSED_VAR(p);
-#ifdef _SZ_ALLOC_DEBUG
-if (address != 0)
-{
-g_allocCount--;
-fprintf(stderr, "\nFree; count = %10d", g_allocCount);
-}
+UNUSED_VAR(p)
+#ifdef SZ_ALLOC_DEBUG
+if (address)
+Print_Free("Free ", &g_allocCount);
 #endif
 free(address);
 }
 void *SzAllocTemp(ISzAllocPtr p, size_t size)
 {
-UNUSED_VAR(p);
+UNUSED_VAR(p)
 if (size == 0)
 return 0;
-#ifdef _SZ_ALLOC_DEBUG
-fprintf(stderr, "\nAlloc_temp %10u bytes; count = %10d", (unsigned)size, g_allocCountTemp);
-g_allocCountTemp++;
+#ifdef SZ_ALLOC_DEBUG
+Print_Alloc("Alloc_temp", size, &g_allocCountTemp);
+/*
 #ifdef _WIN32
 return HeapAlloc(GetProcessHeap(), 0, size);
 #endif
+*/
 #endif
 return malloc(size);
 }
 void SzFreeTemp(ISzAllocPtr p, void *address)
 {
-UNUSED_VAR(p);
-#ifdef _SZ_ALLOC_DEBUG
-if (address != 0)
-{
-g_allocCountTemp--;
-fprintf(stderr, "\nFree_temp; count = %10d", g_allocCountTemp);
-}
+UNUSED_VAR(p)
+#ifdef SZ_ALLOC_DEBUG
+if (address)
+Print_Free("Free_temp ", &g_allocCountTemp);
+/*
 #ifdef _WIN32
 HeapFree(GetProcessHeap(), 0, address);
 return;
 #endif
+*/
 #endif
 free(address);
 }
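For context: these are the malloc-backed allocators the 7z extraction code uses. A minimal usage sketch, assuming the ISzAlloc interface from 7zTypes.h (an Alloc/Free function-pointer pair); the SzArEx_Open call in the comment is illustrative of how the SDK's extraction example wires them up.

#include "7zAlloc.h"

/* persistent allocations (archive database) and temporary allocations
   (decoder scratch) are tracked by separate counters in debug mode */
static const ISzAlloc g_Alloc     = { SzAlloc, SzFree };
static const ISzAlloc g_AllocTemp = { SzAllocTemp, SzFreeTemp };

/* e.g.: SzArEx_Open(&db, &lookStream.vt, &g_Alloc, &g_AllocTemp); */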


@@ -1,8 +1,8 @@
 /* 7zAlloc.h -- Allocation functions
-2017-04-03 : Igor Pavlov : Public domain */
+2023-03-04 : Igor Pavlov : Public domain */
-#ifndef __7Z_ALLOC_H
-#define __7Z_ALLOC_H
+#ifndef ZIP7_INC_7Z_ALLOC_H
+#define ZIP7_INC_7Z_ALLOC_H
 #include "7zTypes.h"

File diff suppressed because it is too large.


@@ -1,8 +1,8 @@
 /* 7zBuf.h -- Byte Buffer
-2017-04-03 : Igor Pavlov : Public domain */
+2023-03-04 : Igor Pavlov : Public domain */
-#ifndef __7Z_BUF_H
-#define __7Z_BUF_H
+#ifndef ZIP7_INC_7Z_BUF_H
+#define ZIP7_INC_7Z_BUF_H
 #include "7zTypes.h"

C/7zCrc.c

@ -1,182 +1,218 @@
/* 7zCrc.c -- CRC32 init /* 7zCrc.c -- CRC32 calculation and init
2021-04-01 : Igor Pavlov : Public domain */ 2024-03-01 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include "7zCrc.h" #include "7zCrc.h"
#include "CpuArch.h" #include "CpuArch.h"
#define kCrcPoly 0xEDB88320 // for debug:
// #define __ARM_FEATURE_CRC32 1
#ifdef MY_CPU_LE #ifdef __ARM_FEATURE_CRC32
#define CRC_NUM_TABLES 8 // #pragma message("__ARM_FEATURE_CRC32")
#define Z7_CRC_HW_FORCE
#endif
// #define Z7_CRC_DEBUG_BE
#ifdef Z7_CRC_DEBUG_BE
#undef MY_CPU_LE
#define MY_CPU_BE
#endif
#ifdef Z7_CRC_HW_FORCE
#define Z7_CRC_NUM_TABLES_USE 1
#else #else
#define CRC_NUM_TABLES 9 #ifdef Z7_CRC_NUM_TABLES
#define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES
#define CRC_UINT32_SWAP(v) ((v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24)) #else
#define Z7_CRC_NUM_TABLES_USE 12
UInt32 MY_FAST_CALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table); #endif
UInt32 MY_FAST_CALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table);
#endif #endif
#ifndef MY_CPU_BE #if Z7_CRC_NUM_TABLES_USE < 1
UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); #error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table);
#endif #endif
typedef UInt32 (MY_FAST_CALL *CRC_FUNC)(UInt32 v, const void *data, size_t size, const UInt32 *table); #if defined(MY_CPU_LE) || (Z7_CRC_NUM_TABLES_USE == 1)
#define Z7_CRC_NUM_TABLES_TOTAL Z7_CRC_NUM_TABLES_USE
#else
#define Z7_CRC_NUM_TABLES_TOTAL (Z7_CRC_NUM_TABLES_USE + 1)
#endif
extern #ifndef Z7_CRC_HW_FORCE
CRC_FUNC g_CrcUpdateT4;
CRC_FUNC g_CrcUpdateT4;
extern
CRC_FUNC g_CrcUpdateT8;
CRC_FUNC g_CrcUpdateT8;
extern
CRC_FUNC g_CrcUpdateT0_32;
CRC_FUNC g_CrcUpdateT0_32;
extern
CRC_FUNC g_CrcUpdateT0_64;
CRC_FUNC g_CrcUpdateT0_64;
extern
CRC_FUNC g_CrcUpdate;
CRC_FUNC g_CrcUpdate;
UInt32 g_CrcTable[256 * CRC_NUM_TABLES]; #if Z7_CRC_NUM_TABLES_USE == 1 \
|| (!defined(MY_CPU_LE) && !defined(MY_CPU_BE))
UInt32 MY_FAST_CALL CrcUpdate(UInt32 v, const void *data, size_t size) #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
{ #define Z7_CRC_UPDATE_T1_FUNC_NAME CrcUpdateGT1
return g_CrcUpdate(v, data, size, g_CrcTable); static UInt32 Z7_FASTCALL Z7_CRC_UPDATE_T1_FUNC_NAME(UInt32 v, const void *data, size_t size)
}
UInt32 MY_FAST_CALL CrcCalc(const void *data, size_t size)
{
return g_CrcUpdate(CRC_INIT_VAL, data, size, g_CrcTable) ^ CRC_INIT_VAL;
}
#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
UInt32 MY_FAST_CALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table);
UInt32 MY_FAST_CALL CrcUpdateT1(UInt32 v, const void *data, size_t size, const UInt32 *table)
{ {
const UInt32 *table = g_CrcTable;
const Byte *p = (const Byte *)data; const Byte *p = (const Byte *)data;
const Byte *pEnd = p + size; const Byte *lim = p + size;
for (; p != pEnd; p++) for (; p != lim; p++)
v = CRC_UPDATE_BYTE_2(v, *p); v = CRC_UPDATE_BYTE_2(v, *p);
return v; return v;
} }
#endif
#if Z7_CRC_NUM_TABLES_USE != 1
#ifndef MY_CPU_BE
#define FUNC_NAME_LE_2(s) CrcUpdateT ## s
#define FUNC_NAME_LE_1(s) FUNC_NAME_LE_2(s)
#define FUNC_NAME_LE FUNC_NAME_LE_1(Z7_CRC_NUM_TABLES_USE)
UInt32 Z7_FASTCALL FUNC_NAME_LE (UInt32 v, const void *data, size_t size, const UInt32 *table);
#endif
#ifndef MY_CPU_LE
#define FUNC_NAME_BE_2(s) CrcUpdateT1_BeT ## s
#define FUNC_NAME_BE_1(s) FUNC_NAME_BE_2(s)
#define FUNC_NAME_BE FUNC_NAME_BE_1(Z7_CRC_NUM_TABLES_USE)
UInt32 Z7_FASTCALL FUNC_NAME_BE (UInt32 v, const void *data, size_t size, const UInt32 *table);
#endif
#endif
#endif // Z7_CRC_HW_FORCE
/* ---------- hardware CRC ---------- */ /* ---------- hardware CRC ---------- */
#ifdef MY_CPU_LE #ifdef MY_CPU_LE
#if defined(MY_CPU_ARM_OR_ARM64) #if defined(MY_CPU_ARM_OR_ARM64)
// #pragma message("ARM*") // #pragma message("ARM*")
#if defined(_MSC_VER) #if (defined(__clang__) && (__clang_major__ >= 3)) \
#if defined(MY_CPU_ARM64) || defined(__GNUC__) && (__GNUC__ >= 6) && defined(MY_CPU_ARM64) \
#if (_MSC_VER >= 1910) || defined(__GNUC__) && (__GNUC__ >= 8)
#define USE_ARM64_CRC
#endif
#endif
#elif (defined(__clang__) && (__clang_major__ >= 3)) \
|| (defined(__GNUC__) && (__GNUC__ > 4))
#if !defined(__ARM_FEATURE_CRC32) #if !defined(__ARM_FEATURE_CRC32)
// #pragma message("!defined(__ARM_FEATURE_CRC32)")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define __ARM_FEATURE_CRC32 1 #define __ARM_FEATURE_CRC32 1
#if (!defined(__clang__) || (__clang_major__ > 3)) // fix these numbers Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc"))) #define Z7_ARM_FEATURE_CRC32_WAS_SET
#if defined(__clang__)
#if defined(MY_CPU_ARM64)
#define ATTRIB_CRC __attribute__((__target__("crc")))
#else
#define ATTRIB_CRC __attribute__((__target__("armv8-a,crc")))
#endif #endif
#else
#if defined(MY_CPU_ARM64)
#if !defined(Z7_GCC_VERSION) || (Z7_GCC_VERSION >= 60000)
#define ATTRIB_CRC __attribute__((__target__("+crc")))
#endif
#else
#if !defined(Z7_GCC_VERSION) || (__GNUC__ >= 8)
#if defined(__ARM_FP) && __GNUC__ >= 8
// for -mfloat-abi=hard: similar to <arm_acle.h>
#define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc+simd")))
#else
#define ATTRIB_CRC __attribute__((__target__("arch=armv8-a+crc")))
#endif
#endif
#endif
#endif
#endif #endif
#if defined(__ARM_FEATURE_CRC32) #if defined(__ARM_FEATURE_CRC32)
#define USE_ARM64_CRC // #pragma message("<arm_acle.h>")
/*
arm_acle.h (GGC):
before Nov 17, 2017:
#ifdef __ARM_FEATURE_CRC32
Nov 17, 2017: gcc10.0 (gcc 9.2.0) checked"
#if __ARM_ARCH >= 8
#pragma GCC target ("arch=armv8-a+crc")
Aug 22, 2019: GCC 8.4?, 9.2.1, 10.1:
#ifdef __ARM_FEATURE_CRC32
#ifdef __ARM_FP
#pragma GCC target ("arch=armv8-a+crc+simd")
#else
#pragma GCC target ("arch=armv8-a+crc")
#endif
*/
#if defined(__ARM_ARCH) && __ARM_ARCH < 8
#if defined(Z7_GCC_VERSION) && (__GNUC__ == 8) && (Z7_GCC_VERSION < 80400) \
|| defined(Z7_GCC_VERSION) && (__GNUC__ == 9) && (Z7_GCC_VERSION < 90201) \
|| defined(Z7_GCC_VERSION) && (__GNUC__ == 10) && (Z7_GCC_VERSION < 100100)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
// #pragma message("#define __ARM_ARCH 8")
#undef __ARM_ARCH
#define __ARM_ARCH 8
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif
#define Z7_CRC_HW_USE
#include <arm_acle.h> #include <arm_acle.h>
#endif #endif
#elif defined(_MSC_VER)
#if defined(MY_CPU_ARM64)
#if (_MSC_VER >= 1910)
#ifdef __clang__
// #define Z7_CRC_HW_USE
// #include <arm_acle.h>
#else
#define Z7_CRC_HW_USE
#include <intrin.h>
#endif
#endif
#endif
#endif #endif
#else #else // non-ARM*
// no hardware CRC // #define Z7_CRC_HW_USE // for debug : we can test HW-branch of code
#ifdef Z7_CRC_HW_USE
// #define USE_CRC_EMU #include "7zCrcEmu.h"
#ifdef USE_CRC_EMU
#pragma message("ARM64 CRC emulation")
MY_FORCE_INLINE
UInt32 __crc32b(UInt32 v, UInt32 data)
{
const UInt32 *table = g_CrcTable;
v = CRC_UPDATE_BYTE_2(v, (Byte)data);
return v;
}
MY_FORCE_INLINE
UInt32 __crc32w(UInt32 v, UInt32 data)
{
const UInt32 *table = g_CrcTable;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
return v;
}
MY_FORCE_INLINE
UInt32 __crc32d(UInt32 v, UInt64 data)
{
const UInt32 *table = g_CrcTable;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
v = CRC_UPDATE_BYTE_2(v, (Byte)data); data >>= 8;
return v;
}
#endif // USE_CRC_EMU
#endif // defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
#if defined(USE_ARM64_CRC) || defined(USE_CRC_EMU)
#define T0_32_UNROLL_BYTES (4 * 4)
#define T0_64_UNROLL_BYTES (4 * 8)
#ifndef ATTRIB_CRC
#define ATTRIB_CRC
#endif #endif
#endif // non-ARM*
#if defined(Z7_CRC_HW_USE)
// #pragma message("USE ARM HW CRC") // #pragma message("USE ARM HW CRC")
ATTRIB_CRC #ifdef MY_CPU_64BIT
UInt32 MY_FAST_CALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table); #define CRC_HW_WORD_TYPE UInt64
ATTRIB_CRC #define CRC_HW_WORD_FUNC __crc32d
UInt32 MY_FAST_CALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, const UInt32 *table) #else
#define CRC_HW_WORD_TYPE UInt32
#define CRC_HW_WORD_FUNC __crc32w
#endif
#define CRC_HW_UNROLL_BYTES (sizeof(CRC_HW_WORD_TYPE) * 4)
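A quick worked value for the constant above: the unrolled loop consumes four machine words per iteration, so CRC_HW_UNROLL_BYTES is sizeof(UInt64) * 4 = 32 bytes on a 64-bit build (__crc32d) and sizeof(UInt32) * 4 = 16 bytes on a 32-bit build (__crc32w).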
#ifdef ATTRIB_CRC
ATTRIB_CRC
#endif
Z7_NO_INLINE
#ifdef Z7_CRC_HW_FORCE
UInt32 Z7_FASTCALL CrcUpdate
#else
static UInt32 Z7_FASTCALL CrcUpdate_HW
#endif
(UInt32 v, const void *data, size_t size)
{ {
const Byte *p = (const Byte *)data; const Byte *p = (const Byte *)data;
UNUSED_VAR(table); for (; size != 0 && ((unsigned)(ptrdiff_t)p & (CRC_HW_UNROLL_BYTES - 1)) != 0; size--)
for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_32_UNROLL_BYTES - 1)) != 0; size--)
v = __crc32b(v, *p++); v = __crc32b(v, *p++);
if (size >= CRC_HW_UNROLL_BYTES)
if (size >= T0_32_UNROLL_BYTES)
{ {
const Byte *lim = p + size; const Byte *lim = p + size;
size &= (T0_32_UNROLL_BYTES - 1); size &= CRC_HW_UNROLL_BYTES - 1;
lim -= size; lim -= size;
do do
{ {
v = __crc32w(v, *(const UInt32 *)(const void *)(p)); v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p));
v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4; v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE)));
v = __crc32w(v, *(const UInt32 *)(const void *)(p)); p += 2 * sizeof(CRC_HW_WORD_TYPE);
v = __crc32w(v, *(const UInt32 *)(const void *)(p + 4)); p += 2 * 4; v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p));
v = CRC_HW_WORD_FUNC(v, *(const CRC_HW_WORD_TYPE *)(const void *)(p + sizeof(CRC_HW_WORD_TYPE)));
p += 2 * sizeof(CRC_HW_WORD_TYPE);
} }
while (p != lim); while (p != lim);
} }
@ -187,136 +223,198 @@ UInt32 MY_FAST_CALL CrcUpdateT0_32(UInt32 v, const void *data, size_t size, cons
return v; return v;
} }
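The function above follows a standard three-phase pattern: byte steps with __crc32b until the pointer is aligned, an unrolled word loop, then the tail bytewise. A minimal standalone sketch of the same pattern for AArch64 (assumes a compiler targeting armv8-a+crc; the function name and the fixed 8-byte step are illustrative simplifications, not from the source):

  #include <stddef.h>
  #include <stdint.h>
  #include <arm_acle.h>

  static uint32_t crc32_hw_sketch(uint32_t v, const void *data, size_t size)
  {
    const unsigned char *p = (const unsigned char *)data;
    /* phase 1: byte steps until the pointer is 8-byte aligned */
    for (; size != 0 && ((uintptr_t)p & 7) != 0; size--)
      v = __crc32b(v, *p++);
    /* phase 2: one 64-bit CRC step per machine word */
    for (; size >= 8; size -= 8, p += 8)
      v = __crc32d(v, *(const uint64_t *)(const void *)p);
    /* phase 3: remaining tail bytes */
    for (; size != 0; size--)
      v = __crc32b(v, *p++);
    return v;
  }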
ATTRIB_CRC #ifdef Z7_ARM_FEATURE_CRC32_WAS_SET
UInt32 MY_FAST_CALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table); Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
ATTRIB_CRC #undef __ARM_FEATURE_CRC32
UInt32 MY_FAST_CALL CrcUpdateT0_64(UInt32 v, const void *data, size_t size, const UInt32 *table) Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
{ #undef Z7_ARM_FEATURE_CRC32_WAS_SET
const Byte *p = (const Byte *)data; #endif
UNUSED_VAR(table);
for (; size != 0 && ((unsigned)(ptrdiff_t)p & (T0_64_UNROLL_BYTES - 1)) != 0; size--)
v = __crc32b(v, *p++);
if (size >= T0_64_UNROLL_BYTES)
{
const Byte *lim = p + size;
size &= (T0_64_UNROLL_BYTES - 1);
lim -= size;
do
{
v = __crc32d(v, *(const UInt64 *)(const void *)(p));
v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8;
v = __crc32d(v, *(const UInt64 *)(const void *)(p));
v = __crc32d(v, *(const UInt64 *)(const void *)(p + 8)); p += 2 * 8;
}
while (p != lim);
}
for (; size != 0; size--)
v = __crc32b(v, *p++);
return v;
}
#endif // defined(USE_ARM64_CRC) || defined(USE_CRC_EMU)
#endif // defined(Z7_CRC_HW_USE)
#endif // MY_CPU_LE #endif // MY_CPU_LE
#ifndef Z7_CRC_HW_FORCE
void MY_FAST_CALL CrcGenerateTable() #if defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME)
/*
typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_WITH_TABLE_FUNC)
(UInt32 v, const void *data, size_t size, const UInt32 *table);
Z7_CRC_UPDATE_WITH_TABLE_FUNC g_CrcUpdate;
*/
static unsigned g_Crc_Algo;
#if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE))
static unsigned g_Crc_Be;
#endif
#endif // defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME)
Z7_NO_INLINE
#ifdef Z7_CRC_HW_USE
static UInt32 Z7_FASTCALL CrcUpdate_Base
#else
UInt32 Z7_FASTCALL CrcUpdate
#endif
(UInt32 crc, const void *data, size_t size)
{
#if Z7_CRC_NUM_TABLES_USE == 1
return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size);
#else // Z7_CRC_NUM_TABLES_USE != 1
#ifdef Z7_CRC_UPDATE_T1_FUNC_NAME
if (g_Crc_Algo == 1)
return Z7_CRC_UPDATE_T1_FUNC_NAME(crc, data, size);
#endif
#ifdef MY_CPU_LE
return FUNC_NAME_LE(crc, data, size, g_CrcTable);
#elif defined(MY_CPU_BE)
return FUNC_NAME_BE(crc, data, size, g_CrcTable);
#else
if (g_Crc_Be)
return FUNC_NAME_BE(crc, data, size, g_CrcTable);
else
return FUNC_NAME_LE(crc, data, size, g_CrcTable);
#endif
#endif // Z7_CRC_NUM_TABLES_USE != 1
}
#ifdef Z7_CRC_HW_USE
Z7_NO_INLINE
UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size)
{
if (g_Crc_Algo == 0)
return CrcUpdate_HW(crc, data, size);
return CrcUpdate_Base(crc, data, size);
}
#endif
#endif // !defined(Z7_CRC_HW_FORCE)
UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size)
{
return CrcUpdate(CRC_INIT_VAL, data, size) ^ CRC_INIT_VAL;
}
MY_ALIGN(64)
UInt32 g_CrcTable[256 * Z7_CRC_NUM_TABLES_TOTAL];
void Z7_FASTCALL CrcGenerateTable(void)
{ {
UInt32 i; UInt32 i;
for (i = 0; i < 256; i++) for (i = 0; i < 256; i++)
{ {
#if defined(Z7_CRC_HW_FORCE)
g_CrcTable[i] = __crc32b(i, 0);
#else
#define kCrcPoly 0xEDB88320
UInt32 r = i; UInt32 r = i;
unsigned j; unsigned j;
for (j = 0; j < 8; j++) for (j = 0; j < 8; j++)
r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1))); r = (r >> 1) ^ (kCrcPoly & ((UInt32)0 - (r & 1)));
g_CrcTable[i] = r; g_CrcTable[i] = r;
#endif
} }
for (i = 256; i < 256 * CRC_NUM_TABLES; i++) for (i = 256; i < 256 * Z7_CRC_NUM_TABLES_USE; i++)
{ {
UInt32 r = g_CrcTable[(size_t)i - 256]; const UInt32 r = g_CrcTable[(size_t)i - 256];
g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8); g_CrcTable[i] = g_CrcTable[r & 0xFF] ^ (r >> 8);
} }
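The second loop builds the slicing tables: entry t * 256 + b holds the CRC state for byte b followed by t zero bytes, which is what lets the word-at-a-time functions resolve several bytes with one lookup each. The same construction, self-contained and cut down to two tables (names here are illustrative):

  #include <stdint.h>

  #define POLY 0xEDB88320u  /* reflected CRC-32 polynomial, as above */

  static uint32_t tab[256 * 2];

  static void build_tables(void)
  {
    uint32_t i;
    for (i = 0; i < 256; i++)
    {
      uint32_t r = i;
      unsigned j;
      for (j = 0; j < 8; j++)
        r = (r >> 1) ^ (POLY & (0u - (r & 1)));
      tab[i] = r;  /* table 0: one byte step */
    }
    for (i = 256; i < 256 * 2; i++)
    {
      const uint32_t r = tab[i - 256];
      tab[i] = tab[r & 0xFF] ^ (r >> 8);  /* table 1: byte plus one zero byte */
    }
  }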
#if CRC_NUM_TABLES < 4 #if !defined(Z7_CRC_HW_FORCE) && \
(defined(Z7_CRC_HW_USE) || defined(Z7_CRC_UPDATE_T1_FUNC_NAME) || defined(MY_CPU_BE))
g_CrcUpdate = CrcUpdateT1; #if Z7_CRC_NUM_TABLES_USE <= 1
g_Crc_Algo = 1;
#else // Z7_CRC_NUM_TABLES_USE <= 1
#else #if defined(MY_CPU_LE)
g_Crc_Algo = Z7_CRC_NUM_TABLES_USE;
#ifdef MY_CPU_LE #else // !defined(MY_CPU_LE)
g_CrcUpdateT4 = CrcUpdateT4;
g_CrcUpdate = CrcUpdateT4;
#if CRC_NUM_TABLES >= 8
g_CrcUpdateT8 = CrcUpdateT8;
#ifdef MY_CPU_X86_OR_AMD64
if (!CPU_Is_InOrder())
#endif
g_CrcUpdate = CrcUpdateT8;
#endif
#else
{ {
#ifndef MY_CPU_BE #ifndef MY_CPU_BE
UInt32 k = 0x01020304; UInt32 k = 0x01020304;
const Byte *p = (const Byte *)&k; const Byte *p = (const Byte *)&k;
if (p[0] == 4 && p[1] == 3) if (p[0] == 4 && p[1] == 3)
{ g_Crc_Algo = Z7_CRC_NUM_TABLES_USE;
g_CrcUpdateT4 = CrcUpdateT4;
g_CrcUpdate = CrcUpdateT4;
#if CRC_NUM_TABLES >= 8
g_CrcUpdateT8 = CrcUpdateT8;
g_CrcUpdate = CrcUpdateT8;
#endif
}
else if (p[0] != 1 || p[1] != 2) else if (p[0] != 1 || p[1] != 2)
g_CrcUpdate = CrcUpdateT1; g_Crc_Algo = 1;
else else
#endif #endif // MY_CPU_BE
{ {
for (i = 256 * CRC_NUM_TABLES - 1; i >= 256; i--) for (i = 256 * Z7_CRC_NUM_TABLES_TOTAL - 1; i >= 256; i--)
{ {
UInt32 x = g_CrcTable[(size_t)i - 256]; const UInt32 x = g_CrcTable[(size_t)i - 256];
g_CrcTable[i] = CRC_UINT32_SWAP(x); g_CrcTable[i] = Z7_BSWAP32(x);
} }
g_CrcUpdateT4 = CrcUpdateT1_BeT4; #if defined(Z7_CRC_UPDATE_T1_FUNC_NAME)
g_CrcUpdate = CrcUpdateT1_BeT4; g_Crc_Algo = Z7_CRC_NUM_TABLES_USE;
#if CRC_NUM_TABLES >= 8 #endif
g_CrcUpdateT8 = CrcUpdateT1_BeT8; #if (!defined(MY_CPU_LE) && !defined(MY_CPU_BE))
g_CrcUpdate = CrcUpdateT1_BeT8; g_Crc_Be = 1;
#endif #endif
} }
} }
#endif #endif // !defined(MY_CPU_LE)
#endif
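When neither MY_CPU_LE nor MY_CPU_BE is fixed at compile time, the block above probes byte order at runtime: it stores 0x01020304 and inspects the first bytes in memory. The same probe in isolation (a throwaway sketch):

  #include <stdio.h>

  int main(void)
  {
    unsigned k = 0x01020304;
    const unsigned char *p = (const unsigned char *)&k;
    if (p[0] == 4 && p[1] == 3)
      puts("little-endian: use the tables as built");
    else if (p[0] == 1 && p[1] == 2)
      puts("big-endian: byte-swap the tables once, then use the BE path");
    else
      puts("unusual byte order: fall back to the bytewise (T1) routine");
    return 0;
  }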
#ifdef MY_CPU_LE #ifdef MY_CPU_LE
#ifdef USE_ARM64_CRC #ifdef Z7_CRC_HW_USE
if (CPU_IsSupported_CRC32()) if (CPU_IsSupported_CRC32())
{ g_Crc_Algo = 0;
g_CrcUpdateT0_32 = CrcUpdateT0_32; #endif // Z7_CRC_HW_USE
g_CrcUpdateT0_64 = CrcUpdateT0_64; #endif // MY_CPU_LE
g_CrcUpdate =
#if defined(MY_CPU_ARM)
CrcUpdateT0_32;
#else
CrcUpdateT0_64;
#endif
}
#endif
#ifdef USE_CRC_EMU #endif // Z7_CRC_NUM_TABLES_USE <= 1
g_CrcUpdateT0_32 = CrcUpdateT0_32; #endif // g_Crc_Algo was declared
g_CrcUpdateT0_64 = CrcUpdateT0_64;
g_CrcUpdate = CrcUpdateT0_64;
#endif
#endif
} }
Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo)
{
if (algo == 0)
return &CrcUpdate;
#if defined(Z7_CRC_HW_USE)
if (algo == sizeof(CRC_HW_WORD_TYPE) * 8)
{
#ifdef Z7_CRC_HW_FORCE
return &CrcUpdate;
#else
if (g_Crc_Algo == 0)
return &CrcUpdate_HW;
#endif
}
#endif
#ifndef Z7_CRC_HW_FORCE
if (algo == Z7_CRC_NUM_TABLES_USE)
return
#ifdef Z7_CRC_HW_USE
&CrcUpdate_Base;
#else
&CrcUpdate;
#endif
#endif
return NULL;
}
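A hedged usage sketch for the selector above (buf and bufSize are placeholder names; assumes 7zCrc.h is included): algo 0 asks for the default dispatcher, sizeof(word) * 8 (32 or 64) for the hardware path, and Z7_CRC_NUM_TABLES_USE for the table-driven path; NULL means that path is not available in this build.

  CrcGenerateTable();  /* must be called once before any CRC function */
  {
    const Z7_CRC_UPDATE_FUNC f = z7_GetFunc_CrcUpdate(0);
    if (f)
    {
      const UInt32 crc = f(CRC_INIT_VAL, buf, bufSize) ^ CRC_INIT_VAL;
      /* crc now equals CrcCalc(buf, bufSize) */
    }
  }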
#undef kCrcPoly
#undef Z7_CRC_NUM_TABLES_USE
#undef Z7_CRC_NUM_TABLES_TOTAL
#undef CRC_UPDATE_BYTE_2
#undef FUNC_NAME_LE_2
#undef FUNC_NAME_LE_1
#undef FUNC_NAME_LE
#undef FUNC_NAME_BE_2
#undef FUNC_NAME_BE_1
#undef FUNC_NAME_BE
#undef CRC_HW_UNROLL_BYTES
#undef CRC_HW_WORD_FUNC
#undef CRC_HW_WORD_TYPE

C/7zCrc.h

@ -1,8 +1,8 @@
/* 7zCrc.h -- CRC32 calculation /* 7zCrc.h -- CRC32 calculation
2013-01-18 : Igor Pavlov : Public domain */ 2024-01-22 : Igor Pavlov : Public domain */
#ifndef __7Z_CRC_H #ifndef ZIP7_INC_7Z_CRC_H
#define __7Z_CRC_H #define ZIP7_INC_7Z_CRC_H
#include "7zTypes.h" #include "7zTypes.h"
@ -11,14 +11,17 @@ EXTERN_C_BEGIN
extern UInt32 g_CrcTable[]; extern UInt32 g_CrcTable[];
/* Call CrcGenerateTable one time before other CRC functions */ /* Call CrcGenerateTable one time before other CRC functions */
void MY_FAST_CALL CrcGenerateTable(void); void Z7_FASTCALL CrcGenerateTable(void);
#define CRC_INIT_VAL 0xFFFFFFFF #define CRC_INIT_VAL 0xFFFFFFFF
#define CRC_GET_DIGEST(crc) ((crc) ^ CRC_INIT_VAL) #define CRC_GET_DIGEST(crc) ((crc) ^ CRC_INIT_VAL)
#define CRC_UPDATE_BYTE(crc, b) (g_CrcTable[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) #define CRC_UPDATE_BYTE(crc, b) (g_CrcTable[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
UInt32 MY_FAST_CALL CrcUpdate(UInt32 crc, const void *data, size_t size); UInt32 Z7_FASTCALL CrcUpdate(UInt32 crc, const void *data, size_t size);
UInt32 MY_FAST_CALL CrcCalc(const void *data, size_t size); UInt32 Z7_FASTCALL CrcCalc(const void *data, size_t size);
typedef UInt32 (Z7_FASTCALL *Z7_CRC_UPDATE_FUNC)(UInt32 v, const void *data, size_t size);
Z7_CRC_UPDATE_FUNC z7_GetFunc_CrcUpdate(unsigned algo);
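For completeness, the macros above also support a streaming, byte-at-a-time CRC without calling CrcUpdate at all (a sketch; buf and size are placeholder names):

  UInt32 crc = CRC_INIT_VAL;
  size_t i;
  for (i = 0; i < size; i++)
    crc = CRC_UPDATE_BYTE(crc, buf[i]);
  crc = CRC_GET_DIGEST(crc);  /* same result as CrcCalc(buf, size) */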
EXTERN_C_END EXTERN_C_END

C/7zCrcOpt.c

@ -1,117 +1,199 @@
/* 7zCrcOpt.c -- CRC32 calculation /* 7zCrcOpt.c -- CRC32 calculation (optimized functions)
2021-02-09 : Igor Pavlov : Public domain */ 2023-12-07 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include "CpuArch.h" #include "CpuArch.h"
#if !defined(Z7_CRC_NUM_TABLES) || Z7_CRC_NUM_TABLES > 1
// for debug only : define Z7_CRC_DEBUG_BE to test the big-endian code on a little-endian CPU
// #define Z7_CRC_DEBUG_BE
#ifdef Z7_CRC_DEBUG_BE
#undef MY_CPU_LE
#define MY_CPU_BE
#endif
// the value Z7_CRC_NUM_TABLES_USE must be defined to the same value as in 7zCrc.c
#ifdef Z7_CRC_NUM_TABLES
#define Z7_CRC_NUM_TABLES_USE Z7_CRC_NUM_TABLES
#else
#define Z7_CRC_NUM_TABLES_USE 12
#endif
#if Z7_CRC_NUM_TABLES_USE % 4 || \
Z7_CRC_NUM_TABLES_USE < 4 * 1 || \
Z7_CRC_NUM_TABLES_USE > 4 * 6
#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
#endif
#ifndef MY_CPU_BE #ifndef MY_CPU_BE
#define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8)) #define CRC_UPDATE_BYTE_2(crc, b) (table[((crc) ^ (b)) & 0xFF] ^ ((crc) >> 8))
UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table); #define Q(n, d) \
UInt32 MY_FAST_CALL CrcUpdateT4(UInt32 v, const void *data, size_t size, const UInt32 *table) ( (table + ((n) * 4 + 3) * 0x100)[(Byte)(d)] \
^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 1 * 8) & 0xFF] \
^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 2 * 8) & 0xFF] \
^ (table + ((n) * 4 + 0) * 0x100)[((d) >> 3 * 8)] )
#define R(a) *((const UInt32 *)(const void *)p + (a))
#define CRC_FUNC_PRE_LE2(step) \
UInt32 Z7_FASTCALL CrcUpdateT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table)
#define CRC_FUNC_PRE_LE(step) \
CRC_FUNC_PRE_LE2(step); \
CRC_FUNC_PRE_LE2(step)
CRC_FUNC_PRE_LE(Z7_CRC_NUM_TABLES_USE)
{ {
const Byte *p = (const Byte *)data; const Byte *p = (const Byte *)data;
for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) const Byte *lim;
for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++)
v = CRC_UPDATE_BYTE_2(v, *p); v = CRC_UPDATE_BYTE_2(v, *p);
for (; size >= 4; size -= 4, p += 4) lim = p + size;
if (size >= Z7_CRC_NUM_TABLES_USE)
{ {
v ^= *(const UInt32 *)(const void *)p; lim -= Z7_CRC_NUM_TABLES_USE;
v = do
(table + 0x300)[((v ) & 0xFF)] {
^ (table + 0x200)[((v >> 8) & 0xFF)] v ^= R(0);
^ (table + 0x100)[((v >> 16) & 0xFF)] {
^ (table + 0x000)[((v >> 24))]; #if Z7_CRC_NUM_TABLES_USE == 1 * 4
v = Q(0, v);
#else
#define U2(r, op) \
{ d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); }
UInt32 d, x;
U2(1, =)
#if Z7_CRC_NUM_TABLES_USE >= 3 * 4
#define U(r) U2(r, ^=)
U(2)
#if Z7_CRC_NUM_TABLES_USE >= 4 * 4
U(3)
#if Z7_CRC_NUM_TABLES_USE >= 5 * 4
U(4)
#if Z7_CRC_NUM_TABLES_USE >= 6 * 4
U(5)
#if Z7_CRC_NUM_TABLES_USE >= 7 * 4
#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
#endif
#endif
#endif
#endif
#endif
#undef U
#undef U2
v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v);
#endif
}
p += Z7_CRC_NUM_TABLES_USE;
}
while (p <= lim);
lim += Z7_CRC_NUM_TABLES_USE;
} }
for (; size > 0; size--, p++) for (; p < lim; p++)
v = CRC_UPDATE_BYTE_2(v, *p); v = CRC_UPDATE_BYTE_2(v, *p);
return v; return v;
} }
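The Q/R machinery above is classic slicing-by-N: each iteration XORs the running CRC into the first 32-bit word of the block, then resolves every byte of the N-byte block with one table lookup. Written out for the simplest case, slicing-by-4 with the table layout built by CrcGenerateTable (a sketch; names are illustrative):

  #include <stddef.h>
  #include <stdint.h>

  static uint32_t crc_slice4(uint32_t v, const uint8_t *p, size_t n,
      const uint32_t *t)  /* t: 4 tables of 256 entries, as built above */
  {
    for (; n >= 4; n -= 4, p += 4)
    {
      /* little-endian load, written out so it works on any host */
      v ^= (uint32_t)p[0] | ((uint32_t)p[1] << 8)
         | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
      v = t[3 * 256 + ( v        & 0xFF)]   /* low byte: 3 zero-byte steps left */
        ^ t[2 * 256 + ((v >>  8) & 0xFF)]
        ^ t[1 * 256 + ((v >> 16) & 0xFF)]
        ^ t[0 * 256 + ( v >> 24        )];  /* high byte: processed last */
    }
    for (; n != 0; n--, p++)
      v = t[(v ^ *p) & 0xFF] ^ (v >> 8);  /* bytewise tail */
    return v;
  }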
UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table); #undef CRC_UPDATE_BYTE_2
UInt32 MY_FAST_CALL CrcUpdateT8(UInt32 v, const void *data, size_t size, const UInt32 *table) #undef R
{ #undef Q
const Byte *p = (const Byte *)data; #undef CRC_FUNC_PRE_LE
for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++) #undef CRC_FUNC_PRE_LE2
v = CRC_UPDATE_BYTE_2(v, *p);
for (; size >= 8; size -= 8, p += 8)
{
UInt32 d;
v ^= *(const UInt32 *)(const void *)p;
v =
(table + 0x700)[((v ) & 0xFF)]
^ (table + 0x600)[((v >> 8) & 0xFF)]
^ (table + 0x500)[((v >> 16) & 0xFF)]
^ (table + 0x400)[((v >> 24))];
d = *((const UInt32 *)(const void *)p + 1);
v ^=
(table + 0x300)[((d ) & 0xFF)]
^ (table + 0x200)[((d >> 8) & 0xFF)]
^ (table + 0x100)[((d >> 16) & 0xFF)]
^ (table + 0x000)[((d >> 24))];
}
for (; size > 0; size--, p++)
v = CRC_UPDATE_BYTE_2(v, *p);
return v;
}
#endif #endif
#ifndef MY_CPU_LE #ifndef MY_CPU_LE
#define CRC_UINT32_SWAP(v) ((v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24)) #define CRC_UPDATE_BYTE_2_BE(crc, b) (table[((crc) >> 24) ^ (b)] ^ ((crc) << 8))
#define CRC_UPDATE_BYTE_2_BE(crc, b) (table[(((crc) >> 24) ^ (b))] ^ ((crc) << 8)) #define Q(n, d) \
( (table + ((n) * 4 + 0) * 0x100)[((d)) & 0xFF] \
^ (table + ((n) * 4 + 1) * 0x100)[((d) >> 1 * 8) & 0xFF] \
^ (table + ((n) * 4 + 2) * 0x100)[((d) >> 2 * 8) & 0xFF] \
^ (table + ((n) * 4 + 3) * 0x100)[((d) >> 3 * 8)] )
UInt32 MY_FAST_CALL CrcUpdateT1_BeT4(UInt32 v, const void *data, size_t size, const UInt32 *table) #ifdef Z7_CRC_DEBUG_BE
#define R(a) GetBe32a((const UInt32 *)(const void *)p + (a))
#else
#define R(a) *((const UInt32 *)(const void *)p + (a))
#endif
#define CRC_FUNC_PRE_BE2(step) \
UInt32 Z7_FASTCALL CrcUpdateT1_BeT ## step (UInt32 v, const void *data, size_t size, const UInt32 *table)
#define CRC_FUNC_PRE_BE(step) \
CRC_FUNC_PRE_BE2(step); \
CRC_FUNC_PRE_BE2(step)
CRC_FUNC_PRE_BE(Z7_CRC_NUM_TABLES_USE)
{ {
const Byte *p = (const Byte *)data; const Byte *p = (const Byte *)data;
const Byte *lim;
table += 0x100; table += 0x100;
v = CRC_UINT32_SWAP(v); v = Z7_BSWAP32(v);
for (; size > 0 && ((unsigned)(ptrdiff_t)p & 3) != 0; size--, p++) for (; size && ((unsigned)(ptrdiff_t)p & (7 - (Z7_CRC_NUM_TABLES_USE & 4))) != 0; size--, p++)
v = CRC_UPDATE_BYTE_2_BE(v, *p); v = CRC_UPDATE_BYTE_2_BE(v, *p);
for (; size >= 4; size -= 4, p += 4) lim = p + size;
if (size >= Z7_CRC_NUM_TABLES_USE)
{ {
v ^= *(const UInt32 *)(const void *)p; lim -= Z7_CRC_NUM_TABLES_USE;
v = do
(table + 0x000)[((v ) & 0xFF)] {
^ (table + 0x100)[((v >> 8) & 0xFF)] v ^= R(0);
^ (table + 0x200)[((v >> 16) & 0xFF)] {
^ (table + 0x300)[((v >> 24))]; #if Z7_CRC_NUM_TABLES_USE == 1 * 4
v = Q(0, v);
#else
#define U2(r, op) \
{ d = R(r); x op Q(Z7_CRC_NUM_TABLES_USE / 4 - 1 - (r), d); }
UInt32 d, x;
U2(1, =)
#if Z7_CRC_NUM_TABLES_USE >= 3 * 4
#define U(r) U2(r, ^=)
U(2)
#if Z7_CRC_NUM_TABLES_USE >= 4 * 4
U(3)
#if Z7_CRC_NUM_TABLES_USE >= 5 * 4
U(4)
#if Z7_CRC_NUM_TABLES_USE >= 6 * 4
U(5)
#if Z7_CRC_NUM_TABLES_USE >= 7 * 4
#error Stop_Compiling_Bad_Z7_CRC_NUM_TABLES
#endif
#endif
#endif
#endif
#endif
#undef U
#undef U2
v = x ^ Q(Z7_CRC_NUM_TABLES_USE / 4 - 1, v);
#endif
}
p += Z7_CRC_NUM_TABLES_USE;
}
while (p <= lim);
lim += Z7_CRC_NUM_TABLES_USE;
} }
for (; size > 0; size--, p++) for (; p < lim; p++)
v = CRC_UPDATE_BYTE_2_BE(v, *p); v = CRC_UPDATE_BYTE_2_BE(v, *p);
return CRC_UINT32_SWAP(v); return Z7_BSWAP32(v);
} }
UInt32 MY_FAST_CALL CrcUpdateT1_BeT8(UInt32 v, const void *data, size_t size, const UInt32 *table) #undef CRC_UPDATE_BYTE_2_BE
{ #undef R
const Byte *p = (const Byte *)data; #undef Q
table += 0x100; #undef CRC_FUNC_PRE_BE
v = CRC_UINT32_SWAP(v); #undef CRC_FUNC_PRE_BE2
for (; size > 0 && ((unsigned)(ptrdiff_t)p & 7) != 0; size--, p++)
v = CRC_UPDATE_BYTE_2_BE(v, *p);
for (; size >= 8; size -= 8, p += 8)
{
UInt32 d;
v ^= *(const UInt32 *)(const void *)p;
v =
(table + 0x400)[((v ) & 0xFF)]
^ (table + 0x500)[((v >> 8) & 0xFF)]
^ (table + 0x600)[((v >> 16) & 0xFF)]
^ (table + 0x700)[((v >> 24))];
d = *((const UInt32 *)(const void *)p + 1);
v ^=
(table + 0x000)[((d ) & 0xFF)]
^ (table + 0x100)[((d >> 8) & 0xFF)]
^ (table + 0x200)[((d >> 16) & 0xFF)]
^ (table + 0x300)[((d >> 24))];
}
for (; size > 0; size--, p++)
v = CRC_UPDATE_BYTE_2_BE(v, *p);
return CRC_UINT32_SWAP(v);
}
#endif #endif
#undef Z7_CRC_NUM_TABLES_USE
#endif

C/7zDec.c

@ -1,11 +1,11 @@
/* 7zDec.c -- Decoding from 7z folder /* 7zDec.c -- Decoding from 7z folder
2021-02-09 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include <string.h> #include <string.h>
/* #define _7ZIP_PPMD_SUPPPORT */ /* #define Z7_PPMD_SUPPORT */
#include "7z.h" #include "7z.h"
#include "7zCrc.h" #include "7zCrc.h"
@ -16,27 +16,50 @@
#include "Delta.h" #include "Delta.h"
#include "LzmaDec.h" #include "LzmaDec.h"
#include "Lzma2Dec.h" #include "Lzma2Dec.h"
#ifdef _7ZIP_PPMD_SUPPPORT #ifdef Z7_PPMD_SUPPORT
#include "Ppmd7.h" #include "Ppmd7.h"
#endif #endif
#define k_Copy 0 #define k_Copy 0
#ifndef _7Z_NO_METHOD_LZMA2 #ifndef Z7_NO_METHOD_LZMA2
#define k_LZMA2 0x21 #define k_LZMA2 0x21
#endif #endif
#define k_LZMA 0x30101 #define k_LZMA 0x30101
#define k_BCJ2 0x303011B #define k_BCJ2 0x303011B
#ifndef _7Z_NO_METHODS_FILTERS
#if !defined(Z7_NO_METHODS_FILTERS)
#define Z7_USE_BRANCH_FILTER
#endif
#if !defined(Z7_NO_METHODS_FILTERS) || \
defined(Z7_USE_NATIVE_BRANCH_FILTER) && defined(MY_CPU_ARM64)
#define Z7_USE_FILTER_ARM64
#ifndef Z7_USE_BRANCH_FILTER
#define Z7_USE_BRANCH_FILTER
#endif
#define k_ARM64 0xa
#endif
#if !defined(Z7_NO_METHODS_FILTERS) || \
defined(Z7_USE_NATIVE_BRANCH_FILTER) && defined(MY_CPU_ARMT)
#define Z7_USE_FILTER_ARMT
#ifndef Z7_USE_BRANCH_FILTER
#define Z7_USE_BRANCH_FILTER
#endif
#define k_ARMT 0x3030701
#endif
#ifndef Z7_NO_METHODS_FILTERS
#define k_Delta 3 #define k_Delta 3
#define k_RISCV 0xb
#define k_BCJ 0x3030103 #define k_BCJ 0x3030103
#define k_PPC 0x3030205 #define k_PPC 0x3030205
#define k_IA64 0x3030401 #define k_IA64 0x3030401
#define k_ARM 0x3030501 #define k_ARM 0x3030501
#define k_ARMT 0x3030701
#define k_SPARC 0x3030805 #define k_SPARC 0x3030805
#endif #endif
#ifdef _7ZIP_PPMD_SUPPPORT #ifdef Z7_PPMD_SUPPORT
#define k_PPMD 0x30401 #define k_PPMD 0x30401
@ -49,12 +72,12 @@ typedef struct
UInt64 processed; UInt64 processed;
BoolInt extra; BoolInt extra;
SRes res; SRes res;
const ILookInStream *inStream; ILookInStreamPtr inStream;
} CByteInToLook; } CByteInToLook;
static Byte ReadByte(const IByteIn *pp) static Byte ReadByte(IByteInPtr pp)
{ {
CByteInToLook *p = CONTAINER_FROM_VTBL(pp, CByteInToLook, vt); Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CByteInToLook)
if (p->cur != p->end) if (p->cur != p->end)
return *p->cur++; return *p->cur++;
if (p->res == SZ_OK) if (p->res == SZ_OK)
@ -67,13 +90,13 @@ static Byte ReadByte(const IByteIn *pp)
p->cur = p->begin; p->cur = p->begin;
p->end = p->begin + size; p->end = p->begin + size;
if (size != 0) if (size != 0)
return *p->cur++;; return *p->cur++;
} }
p->extra = True; p->extra = True;
return 0; return 0;
} }
static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, const ILookInStream *inStream, static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream,
Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain) Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain)
{ {
CPpmd7 ppmd; CPpmd7 ppmd;
@ -138,14 +161,14 @@ static SRes SzDecodePpmd(const Byte *props, unsigned propsSize, UInt64 inSize, c
#endif #endif
static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStream *inStream, static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream,
Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain) Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain)
{ {
CLzmaDec state; CLzmaDec state;
SRes res = SZ_OK; SRes res = SZ_OK;
LzmaDec_Construct(&state); LzmaDec_CONSTRUCT(&state)
RINOK(LzmaDec_AllocateProbs(&state, props, propsSize, allocMain)); RINOK(LzmaDec_AllocateProbs(&state, props, propsSize, allocMain))
state.dic = outBuffer; state.dic = outBuffer;
state.dicBufSize = outSize; state.dicBufSize = outSize;
LzmaDec_Init(&state); LzmaDec_Init(&state);
@ -196,18 +219,18 @@ static SRes SzDecodeLzma(const Byte *props, unsigned propsSize, UInt64 inSize, I
} }
#ifndef _7Z_NO_METHOD_LZMA2 #ifndef Z7_NO_METHOD_LZMA2
static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStream *inStream, static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize, ILookInStreamPtr inStream,
Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain) Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain)
{ {
CLzma2Dec state; CLzma2Dec state;
SRes res = SZ_OK; SRes res = SZ_OK;
Lzma2Dec_Construct(&state); Lzma2Dec_CONSTRUCT(&state)
if (propsSize != 1) if (propsSize != 1)
return SZ_ERROR_DATA; return SZ_ERROR_DATA;
RINOK(Lzma2Dec_AllocateProbs(&state, props[0], allocMain)); RINOK(Lzma2Dec_AllocateProbs(&state, props[0], allocMain))
state.decoder.dic = outBuffer; state.decoder.dic = outBuffer;
state.decoder.dicBufSize = outSize; state.decoder.dicBufSize = outSize;
Lzma2Dec_Init(&state); Lzma2Dec_Init(&state);
@ -257,7 +280,7 @@ static SRes SzDecodeLzma2(const Byte *props, unsigned propsSize, UInt64 inSize,
#endif #endif
static SRes SzDecodeCopy(UInt64 inSize, ILookInStream *inStream, Byte *outBuffer) static SRes SzDecodeCopy(UInt64 inSize, ILookInStreamPtr inStream, Byte *outBuffer)
{ {
while (inSize > 0) while (inSize > 0)
{ {
@ -265,13 +288,13 @@ static SRes SzDecodeCopy(UInt64 inSize, ILookInStream *inStream, Byte *outBuffer
size_t curSize = (1 << 18); size_t curSize = (1 << 18);
if (curSize > inSize) if (curSize > inSize)
curSize = (size_t)inSize; curSize = (size_t)inSize;
RINOK(ILookInStream_Look(inStream, &inBuf, &curSize)); RINOK(ILookInStream_Look(inStream, &inBuf, &curSize))
if (curSize == 0) if (curSize == 0)
return SZ_ERROR_INPUT_EOF; return SZ_ERROR_INPUT_EOF;
memcpy(outBuffer, inBuf, curSize); memcpy(outBuffer, inBuf, curSize);
outBuffer += curSize; outBuffer += curSize;
inSize -= curSize; inSize -= curSize;
RINOK(ILookInStream_Skip(inStream, curSize)); RINOK(ILookInStream_Skip(inStream, curSize))
} }
return SZ_OK; return SZ_OK;
} }
@ -282,15 +305,16 @@ static BoolInt IS_MAIN_METHOD(UInt32 m)
{ {
case k_Copy: case k_Copy:
case k_LZMA: case k_LZMA:
#ifndef _7Z_NO_METHOD_LZMA2 #ifndef Z7_NO_METHOD_LZMA2
case k_LZMA2: case k_LZMA2:
#endif #endif
#ifdef _7ZIP_PPMD_SUPPPORT #ifdef Z7_PPMD_SUPPORT
case k_PPMD: case k_PPMD:
#endif #endif
return True; return True;
default:
return False;
} }
return False;
} }
static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c) static BoolInt IS_SUPPORTED_CODER(const CSzCoderInfo *c)
@ -317,7 +341,7 @@ static SRes CheckSupportedFolder(const CSzFolder *f)
} }
#ifndef _7Z_NO_METHODS_FILTERS #if defined(Z7_USE_BRANCH_FILTER)
if (f->NumCoders == 2) if (f->NumCoders == 2)
{ {
@ -333,13 +357,21 @@ static SRes CheckSupportedFolder(const CSzFolder *f)
return SZ_ERROR_UNSUPPORTED; return SZ_ERROR_UNSUPPORTED;
switch ((UInt32)c->MethodID) switch ((UInt32)c->MethodID)
{ {
#if !defined(Z7_NO_METHODS_FILTERS)
case k_Delta: case k_Delta:
case k_BCJ: case k_BCJ:
case k_PPC: case k_PPC:
case k_IA64: case k_IA64:
case k_SPARC: case k_SPARC:
case k_ARM: case k_ARM:
case k_RISCV:
#endif
#ifdef Z7_USE_FILTER_ARM64
case k_ARM64:
#endif
#ifdef Z7_USE_FILTER_ARMT
case k_ARMT: case k_ARMT:
#endif
break; break;
default: default:
return SZ_ERROR_UNSUPPORTED; return SZ_ERROR_UNSUPPORTED;
@ -372,15 +404,16 @@ static SRes CheckSupportedFolder(const CSzFolder *f)
return SZ_ERROR_UNSUPPORTED; return SZ_ERROR_UNSUPPORTED;
} }
#ifndef _7Z_NO_METHODS_FILTERS
#define CASE_BRA_CONV(isa) case k_ ## isa: isa ## _Convert(outBuffer, outSize, 0, 0); break;
#endif
static SRes SzFolder_Decode2(const CSzFolder *folder, static SRes SzFolder_Decode2(const CSzFolder *folder,
const Byte *propsData, const Byte *propsData,
const UInt64 *unpackSizes, const UInt64 *unpackSizes,
const UInt64 *packPositions, const UInt64 *packPositions,
ILookInStream *inStream, UInt64 startPos, ILookInStreamPtr inStream, UInt64 startPos,
Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain, Byte *outBuffer, SizeT outSize, ISzAllocPtr allocMain,
Byte *tempBuf[]) Byte *tempBuf[])
{ {
@ -389,7 +422,7 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
SizeT tempSize3 = 0; SizeT tempSize3 = 0;
Byte *tempBuf3 = 0; Byte *tempBuf3 = 0;
RINOK(CheckSupportedFolder(folder)); RINOK(CheckSupportedFolder(folder))
for (ci = 0; ci < folder->NumCoders; ci++) for (ci = 0; ci < folder->NumCoders; ci++)
{ {
@ -404,8 +437,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
SizeT outSizeCur = outSize; SizeT outSizeCur = outSize;
if (folder->NumCoders == 4) if (folder->NumCoders == 4)
{ {
UInt32 indices[] = { 3, 2, 0 }; const UInt32 indices[] = { 3, 2, 0 };
UInt64 unpackSize = unpackSizes[ci]; const UInt64 unpackSize = unpackSizes[ci];
si = indices[ci]; si = indices[ci];
if (ci < 2) if (ci < 2)
{ {
@ -431,37 +464,37 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
} }
offset = packPositions[si]; offset = packPositions[si];
inSize = packPositions[(size_t)si + 1] - offset; inSize = packPositions[(size_t)si + 1] - offset;
RINOK(LookInStream_SeekTo(inStream, startPos + offset)); RINOK(LookInStream_SeekTo(inStream, startPos + offset))
if (coder->MethodID == k_Copy) if (coder->MethodID == k_Copy)
{ {
if (inSize != outSizeCur) /* check it */ if (inSize != outSizeCur) /* check it */
return SZ_ERROR_DATA; return SZ_ERROR_DATA;
RINOK(SzDecodeCopy(inSize, inStream, outBufCur)); RINOK(SzDecodeCopy(inSize, inStream, outBufCur))
} }
else if (coder->MethodID == k_LZMA) else if (coder->MethodID == k_LZMA)
{ {
RINOK(SzDecodeLzma(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)); RINOK(SzDecodeLzma(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain))
} }
#ifndef _7Z_NO_METHOD_LZMA2 #ifndef Z7_NO_METHOD_LZMA2
else if (coder->MethodID == k_LZMA2) else if (coder->MethodID == k_LZMA2)
{ {
RINOK(SzDecodeLzma2(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)); RINOK(SzDecodeLzma2(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain))
} }
#endif #endif
#ifdef _7ZIP_PPMD_SUPPPORT #ifdef Z7_PPMD_SUPPORT
else if (coder->MethodID == k_PPMD) else if (coder->MethodID == k_PPMD)
{ {
RINOK(SzDecodePpmd(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain)); RINOK(SzDecodePpmd(propsData + coder->PropsOffset, coder->PropsSize, inSize, inStream, outBufCur, outSizeCur, allocMain))
} }
#endif #endif
else else
return SZ_ERROR_UNSUPPORTED; return SZ_ERROR_UNSUPPORTED;
} }
else if (coder->MethodID == k_BCJ2) else if (coder->MethodID == k_BCJ2)
{ {
UInt64 offset = packPositions[1]; const UInt64 offset = packPositions[1];
UInt64 s3Size = packPositions[2] - offset; const UInt64 s3Size = packPositions[2] - offset;
if (ci != 3) if (ci != 3)
return SZ_ERROR_UNSUPPORTED; return SZ_ERROR_UNSUPPORTED;
@ -473,8 +506,8 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
if (!tempBuf[2] && tempSizes[2] != 0) if (!tempBuf[2] && tempSizes[2] != 0)
return SZ_ERROR_MEM; return SZ_ERROR_MEM;
RINOK(LookInStream_SeekTo(inStream, startPos + offset)); RINOK(LookInStream_SeekTo(inStream, startPos + offset))
RINOK(SzDecodeCopy(s3Size, inStream, tempBuf[2])); RINOK(SzDecodeCopy(s3Size, inStream, tempBuf[2]))
if ((tempSizes[0] & 3) != 0 || if ((tempSizes[0] & 3) != 0 ||
(tempSizes[1] & 3) != 0 || (tempSizes[1] & 3) != 0 ||
@ -493,26 +526,22 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
p.destLim = outBuffer + outSize; p.destLim = outBuffer + outSize;
Bcj2Dec_Init(&p); Bcj2Dec_Init(&p);
RINOK(Bcj2Dec_Decode(&p)); RINOK(Bcj2Dec_Decode(&p))
{ {
unsigned i; unsigned i;
for (i = 0; i < 4; i++) for (i = 0; i < 4; i++)
if (p.bufs[i] != p.lims[i]) if (p.bufs[i] != p.lims[i])
return SZ_ERROR_DATA; return SZ_ERROR_DATA;
if (p.dest != p.destLim || !Bcj2Dec_IsMaybeFinished(&p))
if (!Bcj2Dec_IsFinished(&p))
return SZ_ERROR_DATA;
if (p.dest != p.destLim
|| p.state != BCJ2_STREAM_MAIN)
return SZ_ERROR_DATA; return SZ_ERROR_DATA;
} }
} }
} }
#ifndef _7Z_NO_METHODS_FILTERS #if defined(Z7_USE_BRANCH_FILTER)
else if (ci == 1) else if (ci == 1)
{ {
#if !defined(Z7_NO_METHODS_FILTERS)
if (coder->MethodID == k_Delta) if (coder->MethodID == k_Delta)
{ {
if (coder->PropsSize != 1) if (coder->PropsSize != 1)
@ -522,31 +551,75 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
Delta_Init(state); Delta_Init(state);
Delta_Decode(state, (unsigned)(propsData[coder->PropsOffset]) + 1, outBuffer, outSize); Delta_Decode(state, (unsigned)(propsData[coder->PropsOffset]) + 1, outBuffer, outSize);
} }
continue;
} }
else #endif
#ifdef Z7_USE_FILTER_ARM64
if (coder->MethodID == k_ARM64)
{
UInt32 pc = 0;
if (coder->PropsSize == 4)
{
pc = GetUi32(propsData + coder->PropsOffset);
if (pc & 3)
return SZ_ERROR_UNSUPPORTED;
}
else if (coder->PropsSize != 0)
return SZ_ERROR_UNSUPPORTED;
z7_BranchConv_ARM64_Dec(outBuffer, outSize, pc);
continue;
}
#endif
#if !defined(Z7_NO_METHODS_FILTERS)
if (coder->MethodID == k_RISCV)
{
UInt32 pc = 0;
if (coder->PropsSize == 4)
{
pc = GetUi32(propsData + coder->PropsOffset);
if (pc & 1)
return SZ_ERROR_UNSUPPORTED;
}
else if (coder->PropsSize != 0)
return SZ_ERROR_UNSUPPORTED;
z7_BranchConv_RISCV_Dec(outBuffer, outSize, pc);
continue;
}
#endif
#if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT)
{ {
if (coder->PropsSize != 0) if (coder->PropsSize != 0)
return SZ_ERROR_UNSUPPORTED; return SZ_ERROR_UNSUPPORTED;
#define CASE_BRA_CONV(isa) case k_ ## isa: Z7_BRANCH_CONV_DEC(isa)(outBuffer, outSize, 0); break; // pc = 0;
switch (coder->MethodID) switch (coder->MethodID)
{ {
#if !defined(Z7_NO_METHODS_FILTERS)
case k_BCJ: case k_BCJ:
{ {
UInt32 state; UInt32 state = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
x86_Convert_Init(state); z7_BranchConvSt_X86_Dec(outBuffer, outSize, 0, &state); // pc = 0
x86_Convert(outBuffer, outSize, 0, &state, 0);
break; break;
} }
CASE_BRA_CONV(PPC) case k_PPC: Z7_BRANCH_CONV_DEC_2(BranchConv_PPC)(outBuffer, outSize, 0); break; // pc = 0;
// CASE_BRA_CONV(PPC)
CASE_BRA_CONV(IA64) CASE_BRA_CONV(IA64)
CASE_BRA_CONV(SPARC) CASE_BRA_CONV(SPARC)
CASE_BRA_CONV(ARM) CASE_BRA_CONV(ARM)
#endif
#if !defined(Z7_NO_METHODS_FILTERS) || defined(Z7_USE_FILTER_ARMT)
CASE_BRA_CONV(ARMT) CASE_BRA_CONV(ARMT)
#endif
default: default:
return SZ_ERROR_UNSUPPORTED; return SZ_ERROR_UNSUPPORTED;
} }
continue;
} }
} #endif
#endif } // (c == 1)
#endif // Z7_USE_BRANCH_FILTER
else else
return SZ_ERROR_UNSUPPORTED; return SZ_ERROR_UNSUPPORTED;
} }
@ -556,7 +629,7 @@ static SRes SzFolder_Decode2(const CSzFolder *folder,
SRes SzAr_DecodeFolder(const CSzAr *p, UInt32 folderIndex, SRes SzAr_DecodeFolder(const CSzAr *p, UInt32 folderIndex,
ILookInStream *inStream, UInt64 startPos, ILookInStreamPtr inStream, UInt64 startPos,
Byte *outBuffer, size_t outSize, Byte *outBuffer, size_t outSize,
ISzAllocPtr allocMain) ISzAllocPtr allocMain)
{ {

C/7zFile.c

@ -1,5 +1,5 @@
/* 7zFile.c -- File IO /* 7zFile.c -- File IO
2021-04-29 : Igor Pavlov : Public domain */ 2023-04-02 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -268,7 +268,7 @@ WRes File_Write(CSzFile *p, const void *data, size_t *size)
return errno; return errno;
if (processed == 0) if (processed == 0)
break; break;
data = (void *)((Byte *)data + (size_t)processed); data = (const void *)((const Byte *)data + (size_t)processed);
originalSize -= (size_t)processed; originalSize -= (size_t)processed;
*size += (size_t)processed; *size += (size_t)processed;
} }
@ -287,7 +287,8 @@ WRes File_Seek(CSzFile *p, Int64 *pos, ESzSeek origin)
DWORD moveMethod; DWORD moveMethod;
UInt32 low = (UInt32)*pos; UInt32 low = (UInt32)*pos;
LONG high = (LONG)((UInt64)*pos >> 16 >> 16); /* for case when UInt64 is 32-bit only */ LONG high = (LONG)((UInt64)*pos >> 16 >> 16); /* for case when UInt64 is 32-bit only */
switch (origin) // (int) to eliminate clang warning
switch ((int)origin)
{ {
case SZ_SEEK_SET: moveMethod = FILE_BEGIN; break; case SZ_SEEK_SET: moveMethod = FILE_BEGIN; break;
case SZ_SEEK_CUR: moveMethod = FILE_CURRENT; break; case SZ_SEEK_CUR: moveMethod = FILE_CURRENT; break;
@ -308,7 +309,7 @@ WRes File_Seek(CSzFile *p, Int64 *pos, ESzSeek origin)
int moveMethod; // = origin; int moveMethod; // = origin;
switch (origin) switch ((int)origin)
{ {
case SZ_SEEK_SET: moveMethod = SEEK_SET; break; case SZ_SEEK_SET: moveMethod = SEEK_SET; break;
case SZ_SEEK_CUR: moveMethod = SEEK_CUR; break; case SZ_SEEK_CUR: moveMethod = SEEK_CUR; break;
@ -387,10 +388,10 @@ WRes File_GetLength(CSzFile *p, UInt64 *length)
/* ---------- FileSeqInStream ---------- */ /* ---------- FileSeqInStream ---------- */
static SRes FileSeqInStream_Read(const ISeqInStream *pp, void *buf, size_t *size) static SRes FileSeqInStream_Read(ISeqInStreamPtr pp, void *buf, size_t *size)
{ {
CFileSeqInStream *p = CONTAINER_FROM_VTBL(pp, CFileSeqInStream, vt); Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileSeqInStream)
WRes wres = File_Read(&p->file, buf, size); const WRes wres = File_Read(&p->file, buf, size);
p->wres = wres; p->wres = wres;
return (wres == 0) ? SZ_OK : SZ_ERROR_READ; return (wres == 0) ? SZ_OK : SZ_ERROR_READ;
} }
@ -403,18 +404,18 @@ void FileSeqInStream_CreateVTable(CFileSeqInStream *p)
/* ---------- FileInStream ---------- */ /* ---------- FileInStream ---------- */
static SRes FileInStream_Read(const ISeekInStream *pp, void *buf, size_t *size) static SRes FileInStream_Read(ISeekInStreamPtr pp, void *buf, size_t *size)
{ {
CFileInStream *p = CONTAINER_FROM_VTBL(pp, CFileInStream, vt); Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileInStream)
WRes wres = File_Read(&p->file, buf, size); const WRes wres = File_Read(&p->file, buf, size);
p->wres = wres; p->wres = wres;
return (wres == 0) ? SZ_OK : SZ_ERROR_READ; return (wres == 0) ? SZ_OK : SZ_ERROR_READ;
} }
static SRes FileInStream_Seek(const ISeekInStream *pp, Int64 *pos, ESzSeek origin) static SRes FileInStream_Seek(ISeekInStreamPtr pp, Int64 *pos, ESzSeek origin)
{ {
CFileInStream *p = CONTAINER_FROM_VTBL(pp, CFileInStream, vt); Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileInStream)
WRes wres = File_Seek(&p->file, pos, origin); const WRes wres = File_Seek(&p->file, pos, origin);
p->wres = wres; p->wres = wres;
return (wres == 0) ? SZ_OK : SZ_ERROR_READ; return (wres == 0) ? SZ_OK : SZ_ERROR_READ;
} }
@ -428,10 +429,10 @@ void FileInStream_CreateVTable(CFileInStream *p)
/* ---------- FileOutStream ---------- */ /* ---------- FileOutStream ---------- */
static size_t FileOutStream_Write(const ISeqOutStream *pp, const void *data, size_t size) static size_t FileOutStream_Write(ISeqOutStreamPtr pp, const void *data, size_t size)
{ {
CFileOutStream *p = CONTAINER_FROM_VTBL(pp, CFileOutStream, vt); Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CFileOutStream)
WRes wres = File_Write(&p->file, data, &size); const WRes wres = File_Write(&p->file, data, &size);
p->wres = wres; p->wres = wres;
return size; return size;
} }

C/7zFile.h

@ -1,8 +1,8 @@
/* 7zFile.h -- File IO /* 7zFile.h -- File IO
2021-02-15 : Igor Pavlov : Public domain */ 2023-03-05 : Igor Pavlov : Public domain */
#ifndef __7Z_FILE_H #ifndef ZIP7_INC_FILE_H
#define __7Z_FILE_H #define ZIP7_INC_FILE_H
#ifdef _WIN32 #ifdef _WIN32
#define USE_WINDOWS_FILE #define USE_WINDOWS_FILE
@ -10,7 +10,8 @@
#endif #endif
#ifdef USE_WINDOWS_FILE #ifdef USE_WINDOWS_FILE
#include <windows.h> #include "7zWindows.h"
#else #else
// note: USE_FOPEN mode is limited to 32-bit file size // note: USE_FOPEN mode is limited to 32-bit file size
// #define USE_FOPEN // #define USE_FOPEN

C/7zStream.c

@ -1,5 +1,5 @@
/* 7zStream.c -- 7z Stream functions /* 7zStream.c -- 7z Stream functions
2021-02-09 : Igor Pavlov : Public domain */ 2023-04-02 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -7,12 +7,33 @@
#include "7zTypes.h" #include "7zTypes.h"
SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes errorType)
SRes SeqInStream_ReadMax(ISeqInStreamPtr stream, void *buf, size_t *processedSize)
{
size_t size = *processedSize;
*processedSize = 0;
while (size != 0)
{
size_t cur = size;
const SRes res = ISeqInStream_Read(stream, buf, &cur);
*processedSize += cur;
buf = (void *)((Byte *)buf + cur);
size -= cur;
if (res != SZ_OK)
return res;
if (cur == 0)
return SZ_OK;
}
return SZ_OK;
}
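Unlike the old SeqInStream_Read, SeqInStream_ReadMax treats a short read as success rather than SZ_ERROR_INPUT_EOF: *processedSize carries the buffer capacity in and the byte count actually read out. A usage sketch (assumes 7zTypes.h; stream is a placeholder):

  Byte buf[1 << 10];
  size_t got = sizeof(buf);
  const SRes res = SeqInStream_ReadMax(stream, buf, &got);
  /* res == SZ_OK with got < sizeof(buf) simply means the stream ended early */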
/*
SRes SeqInStream_Read2(ISeqInStreamPtr stream, void *buf, size_t size, SRes errorType)
{ {
while (size != 0) while (size != 0)
{ {
size_t processed = size; size_t processed = size;
RINOK(ISeqInStream_Read(stream, buf, &processed)); RINOK(ISeqInStream_Read(stream, buf, &processed))
if (processed == 0) if (processed == 0)
return errorType; return errorType;
buf = (void *)((Byte *)buf + processed); buf = (void *)((Byte *)buf + processed);
@ -21,42 +42,44 @@ SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes
return SZ_OK; return SZ_OK;
} }
SRes SeqInStream_Read(const ISeqInStream *stream, void *buf, size_t size) SRes SeqInStream_Read(ISeqInStreamPtr stream, void *buf, size_t size)
{ {
return SeqInStream_Read2(stream, buf, size, SZ_ERROR_INPUT_EOF); return SeqInStream_Read2(stream, buf, size, SZ_ERROR_INPUT_EOF);
} }
*/
SRes SeqInStream_ReadByte(const ISeqInStream *stream, Byte *buf)
SRes SeqInStream_ReadByte(ISeqInStreamPtr stream, Byte *buf)
{ {
size_t processed = 1; size_t processed = 1;
RINOK(ISeqInStream_Read(stream, buf, &processed)); RINOK(ISeqInStream_Read(stream, buf, &processed))
return (processed == 1) ? SZ_OK : SZ_ERROR_INPUT_EOF; return (processed == 1) ? SZ_OK : SZ_ERROR_INPUT_EOF;
} }
SRes LookInStream_SeekTo(const ILookInStream *stream, UInt64 offset) SRes LookInStream_SeekTo(ILookInStreamPtr stream, UInt64 offset)
{ {
Int64 t = (Int64)offset; Int64 t = (Int64)offset;
return ILookInStream_Seek(stream, &t, SZ_SEEK_SET); return ILookInStream_Seek(stream, &t, SZ_SEEK_SET);
} }
SRes LookInStream_LookRead(const ILookInStream *stream, void *buf, size_t *size) SRes LookInStream_LookRead(ILookInStreamPtr stream, void *buf, size_t *size)
{ {
const void *lookBuf; const void *lookBuf;
if (*size == 0) if (*size == 0)
return SZ_OK; return SZ_OK;
RINOK(ILookInStream_Look(stream, &lookBuf, size)); RINOK(ILookInStream_Look(stream, &lookBuf, size))
memcpy(buf, lookBuf, *size); memcpy(buf, lookBuf, *size);
return ILookInStream_Skip(stream, *size); return ILookInStream_Skip(stream, *size);
} }
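LookInStream_LookRead is the Look/Skip protocol in miniature: Look exposes a window into the stream's internal buffer (possibly shorter than requested), the caller consumes some prefix of it, and Skip advances by exactly the bytes used. A fragment of a parser driving the protocol directly (a sketch inside a function returning SRes; bytesUsed is a placeholder, and RINOK follows the new no-trailing-semicolon convention):

  const void *in;
  size_t avail = 16;  /* ask for up to 16 bytes */
  RINOK(ILookInStream_Look(stream, &in, &avail))
  if (avail == 0)
    return SZ_ERROR_INPUT_EOF;  /* end of stream */
  /* ... parse some prefix (bytesUsed <= avail) of the bytes at in ... */
  RINOK(ILookInStream_Skip(stream, bytesUsed))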
SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRes errorType) SRes LookInStream_Read2(ILookInStreamPtr stream, void *buf, size_t size, SRes errorType)
{ {
while (size != 0) while (size != 0)
{ {
size_t processed = size; size_t processed = size;
RINOK(ILookInStream_Read(stream, buf, &processed)); RINOK(ILookInStream_Read(stream, buf, &processed))
if (processed == 0) if (processed == 0)
return errorType; return errorType;
buf = (void *)((Byte *)buf + processed); buf = (void *)((Byte *)buf + processed);
@ -65,16 +88,16 @@ SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRe
return SZ_OK; return SZ_OK;
} }
SRes LookInStream_Read(const ILookInStream *stream, void *buf, size_t size) SRes LookInStream_Read(ILookInStreamPtr stream, void *buf, size_t size)
{ {
return LookInStream_Read2(stream, buf, size, SZ_ERROR_INPUT_EOF); return LookInStream_Read2(stream, buf, size, SZ_ERROR_INPUT_EOF);
} }
#define GET_LookToRead2 CLookToRead2 *p = CONTAINER_FROM_VTBL(pp, CLookToRead2, vt); #define GET_LookToRead2 Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CLookToRead2)
static SRes LookToRead2_Look_Lookahead(const ILookInStream *pp, const void **buf, size_t *size) static SRes LookToRead2_Look_Lookahead(ILookInStreamPtr pp, const void **buf, size_t *size)
{ {
SRes res = SZ_OK; SRes res = SZ_OK;
GET_LookToRead2 GET_LookToRead2
@ -93,7 +116,7 @@ static SRes LookToRead2_Look_Lookahead(const ILookInStream *pp, const void **buf
return res; return res;
} }
static SRes LookToRead2_Look_Exact(const ILookInStream *pp, const void **buf, size_t *size) static SRes LookToRead2_Look_Exact(ILookInStreamPtr pp, const void **buf, size_t *size)
{ {
SRes res = SZ_OK; SRes res = SZ_OK;
GET_LookToRead2 GET_LookToRead2
@ -113,14 +136,14 @@ static SRes LookToRead2_Look_Exact(const ILookInStream *pp, const void **buf, si
return res; return res;
} }
static SRes LookToRead2_Skip(const ILookInStream *pp, size_t offset) static SRes LookToRead2_Skip(ILookInStreamPtr pp, size_t offset)
{ {
GET_LookToRead2 GET_LookToRead2
p->pos += offset; p->pos += offset;
return SZ_OK; return SZ_OK;
} }
static SRes LookToRead2_Read(const ILookInStream *pp, void *buf, size_t *size) static SRes LookToRead2_Read(ILookInStreamPtr pp, void *buf, size_t *size)
{ {
GET_LookToRead2 GET_LookToRead2
size_t rem = p->size - p->pos; size_t rem = p->size - p->pos;
@ -134,7 +157,7 @@ static SRes LookToRead2_Read(const ILookInStream *pp, void *buf, size_t *size)
return SZ_OK; return SZ_OK;
} }
static SRes LookToRead2_Seek(const ILookInStream *pp, Int64 *pos, ESzSeek origin) static SRes LookToRead2_Seek(ILookInStreamPtr pp, Int64 *pos, ESzSeek origin)
{ {
GET_LookToRead2 GET_LookToRead2
p->pos = p->size = 0; p->pos = p->size = 0;
@ -153,9 +176,9 @@ void LookToRead2_CreateVTable(CLookToRead2 *p, int lookahead)
static SRes SecToLook_Read(const ISeqInStream *pp, void *buf, size_t *size) static SRes SecToLook_Read(ISeqInStreamPtr pp, void *buf, size_t *size)
{ {
CSecToLook *p = CONTAINER_FROM_VTBL(pp, CSecToLook, vt); Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSecToLook)
return LookInStream_LookRead(p->realStream, buf, size); return LookInStream_LookRead(p->realStream, buf, size);
} }
@ -164,9 +187,9 @@ void SecToLook_CreateVTable(CSecToLook *p)
p->vt.Read = SecToLook_Read; p->vt.Read = SecToLook_Read;
} }
static SRes SecToRead_Read(const ISeqInStream *pp, void *buf, size_t *size) static SRes SecToRead_Read(ISeqInStreamPtr pp, void *buf, size_t *size)
{ {
CSecToRead *p = CONTAINER_FROM_VTBL(pp, CSecToRead, vt); Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CSecToRead)
return ILookInStream_Read(p->realStream, buf, size); return ILookInStream_Read(p->realStream, buf, size);
} }

C/7zTypes.h

@ -1,8 +1,8 @@
/* 7zTypes.h -- Basic types /* 7zTypes.h -- Basic types
2021-12-25 : Igor Pavlov : Public domain */ 2024-01-24 : Igor Pavlov : Public domain */
#ifndef __7Z_TYPES_H #ifndef ZIP7_7Z_TYPES_H
#define __7Z_TYPES_H #define ZIP7_7Z_TYPES_H
#ifdef _WIN32 #ifdef _WIN32
/* #include <windows.h> */ /* #include <windows.h> */
@ -52,6 +52,11 @@ typedef int SRes;
#define MY_ALIGN(n) #define MY_ALIGN(n)
#endif #endif
#else #else
/*
// C11/C++11:
#include <stdalign.h>
#define MY_ALIGN(n) alignas(n)
*/
#define MY_ALIGN(n) __attribute__ ((aligned(n))) #define MY_ALIGN(n) __attribute__ ((aligned(n)))
#endif #endif
@ -62,7 +67,7 @@ typedef int SRes;
typedef unsigned WRes; typedef unsigned WRes;
#define MY_SRes_HRESULT_FROM_WRes(x) HRESULT_FROM_WIN32(x) #define MY_SRes_HRESULT_FROM_WRes(x) HRESULT_FROM_WIN32(x)
// #define MY_HRES_ERROR__INTERNAL_ERROR MY_SRes_HRESULT_FROM_WRes(ERROR_INTERNAL_ERROR) // #define MY_HRES_ERROR_INTERNAL_ERROR MY_SRes_HRESULT_FROM_WRes(ERROR_INTERNAL_ERROR)
#else // _WIN32 #else // _WIN32
@ -70,13 +75,13 @@ typedef unsigned WRes;
typedef int WRes; typedef int WRes;
// (FACILITY_ERRNO = 0x800) is 7zip's FACILITY constant to represent (errno) errors in HRESULT // (FACILITY_ERRNO = 0x800) is 7zip's FACILITY constant to represent (errno) errors in HRESULT
#define MY__FACILITY_ERRNO 0x800 #define MY_FACILITY_ERRNO 0x800
#define MY__FACILITY_WIN32 7 #define MY_FACILITY_WIN32 7
#define MY__FACILITY__WRes MY__FACILITY_ERRNO #define MY_FACILITY_WRes MY_FACILITY_ERRNO
#define MY_HRESULT_FROM_errno_CONST_ERROR(x) ((HRESULT)( \ #define MY_HRESULT_FROM_errno_CONST_ERROR(x) ((HRESULT)( \
( (HRESULT)(x) & 0x0000FFFF) \ ( (HRESULT)(x) & 0x0000FFFF) \
| (MY__FACILITY__WRes << 16) \ | (MY_FACILITY_WRes << 16) \
| (HRESULT)0x80000000 )) | (HRESULT)0x80000000 ))
#define MY_SRes_HRESULT_FROM_WRes(x) \ #define MY_SRes_HRESULT_FROM_WRes(x) \
@ -120,23 +125,19 @@ typedef int WRes;
#define ERROR_INVALID_REPARSE_DATA ((HRESULT)0x80071128L) #define ERROR_INVALID_REPARSE_DATA ((HRESULT)0x80071128L)
#define ERROR_REPARSE_TAG_INVALID ((HRESULT)0x80071129L) #define ERROR_REPARSE_TAG_INVALID ((HRESULT)0x80071129L)
// if (MY__FACILITY__WRes != FACILITY_WIN32), // if (MY_FACILITY_WRes != FACILITY_WIN32),
// we use FACILITY_WIN32 for COM errors: // we use FACILITY_WIN32 for COM errors:
#define E_OUTOFMEMORY ((HRESULT)0x8007000EL) #define E_OUTOFMEMORY ((HRESULT)0x8007000EL)
#define E_INVALIDARG ((HRESULT)0x80070057L) #define E_INVALIDARG ((HRESULT)0x80070057L)
#define MY__E_ERROR_NEGATIVE_SEEK ((HRESULT)0x80070083L) #define MY_E_ERROR_NEGATIVE_SEEK ((HRESULT)0x80070083L)
/* /*
// we can use FACILITY_ERRNO for some COM errors, that have errno equivalents: // we can use FACILITY_ERRNO for some COM errors, that have errno equivalents:
#define E_OUTOFMEMORY MY_HRESULT_FROM_errno_CONST_ERROR(ENOMEM) #define E_OUTOFMEMORY MY_HRESULT_FROM_errno_CONST_ERROR(ENOMEM)
#define E_INVALIDARG MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL) #define E_INVALIDARG MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL)
#define MY__E_ERROR_NEGATIVE_SEEK MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL) #define MY_E_ERROR_NEGATIVE_SEEK MY_HRESULT_FROM_errno_CONST_ERROR(EINVAL)
*/ */
// gcc / clang : (sizeof(long) == sizeof(void*)) in 32/64 bits
typedef long INT_PTR;
typedef unsigned long UINT_PTR;
#define TEXT(quote) quote #define TEXT(quote) quote
#define FILE_ATTRIBUTE_READONLY 0x0001 #define FILE_ATTRIBUTE_READONLY 0x0001
@ -160,18 +161,18 @@ typedef unsigned long UINT_PTR;
#ifndef RINOK #ifndef RINOK
#define RINOK(x) { int __result__ = (x); if (__result__ != 0) return __result__; } #define RINOK(x) { const int _result_ = (x); if (_result_ != 0) return _result_; }
#endif #endif
#ifndef RINOK_WRes #ifndef RINOK_WRes
#define RINOK_WRes(x) { WRes __result__ = (x); if (__result__ != 0) return __result__; } #define RINOK_WRes(x) { const WRes _result_ = (x); if (_result_ != 0) return _result_; }
#endif #endif
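The rename from __result__ to _result_ moves these macros out of the reserved double-underscore identifier space; behavior is unchanged. As a reminder of the idiom, RINOK turns any non-zero result into an early return (a sketch; note that the new code style omits the semicolon after RINOK(...)):

  static SRes ReadTwoBytes(ISeqInStreamPtr s, Byte *b0, Byte *b1)
  {
    RINOK(SeqInStream_ReadByte(s, b0))  /* propagates the error code on failure */
    RINOK(SeqInStream_ReadByte(s, b1))
    return SZ_OK;
  }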
typedef unsigned char Byte; typedef unsigned char Byte;
typedef short Int16; typedef short Int16;
typedef unsigned short UInt16; typedef unsigned short UInt16;
#ifdef _LZMA_UINT32_IS_ULONG #ifdef Z7_DECL_Int32_AS_long
typedef long Int32; typedef long Int32;
typedef unsigned long UInt32; typedef unsigned long UInt32;
#else #else
@ -210,37 +211,51 @@ typedef size_t SIZE_T;
#endif // _WIN32 #endif // _WIN32
#define MY_HRES_ERROR__INTERNAL_ERROR ((HRESULT)0x8007054FL) #define MY_HRES_ERROR_INTERNAL_ERROR ((HRESULT)0x8007054FL)
#ifdef _SZ_NO_INT_64 #ifdef Z7_DECL_Int64_AS_long
/* define _SZ_NO_INT_64, if your compiler doesn't support 64-bit integers.
NOTES: Some code will work incorrectly in that case! */
typedef long Int64; typedef long Int64;
typedef unsigned long UInt64; typedef unsigned long UInt64;
#else #else
#if defined(_MSC_VER) || defined(__BORLANDC__) #if (defined(_MSC_VER) || defined(__BORLANDC__)) && !defined(__clang__)
typedef __int64 Int64; typedef __int64 Int64;
typedef unsigned __int64 UInt64; typedef unsigned __int64 UInt64;
#define UINT64_CONST(n) n #else
#if defined(__clang__) || defined(__GNUC__)
#include <stdint.h>
typedef int64_t Int64;
typedef uint64_t UInt64;
#else #else
typedef long long int Int64; typedef long long int Int64;
typedef unsigned long long int UInt64; typedef unsigned long long int UInt64;
#define UINT64_CONST(n) n ## ULL // #define UINT64_CONST(n) n ## ULL
#endif
#endif #endif
#endif #endif
#ifdef _LZMA_NO_SYSTEM_SIZE_T #define UINT64_CONST(n) n
typedef UInt32 SizeT;
#ifdef Z7_DECL_SizeT_AS_unsigned_int
typedef unsigned int SizeT;
#else #else
typedef size_t SizeT; typedef size_t SizeT;
#endif #endif
/*
#if (defined(_MSC_VER) && _MSC_VER <= 1200)
typedef size_t MY_uintptr_t;
#else
#include <stdint.h>
typedef uintptr_t MY_uintptr_t;
#endif
*/
typedef int BoolInt; typedef int BoolInt;
/* typedef BoolInt Bool; */ /* typedef BoolInt Bool; */
#define True 1 #define True 1
@ -248,23 +263,23 @@ typedef int BoolInt;
#ifdef _WIN32 #ifdef _WIN32
#define MY_STD_CALL __stdcall #define Z7_STDCALL __stdcall
#else #else
#define MY_STD_CALL #define Z7_STDCALL
#endif #endif
#ifdef _MSC_VER #ifdef _MSC_VER
#if _MSC_VER >= 1300 #if _MSC_VER >= 1300
#define MY_NO_INLINE __declspec(noinline) #define Z7_NO_INLINE __declspec(noinline)
#else #else
#define MY_NO_INLINE #define Z7_NO_INLINE
#endif #endif
#define MY_FORCE_INLINE __forceinline #define Z7_FORCE_INLINE __forceinline
#define MY_CDECL __cdecl #define Z7_CDECL __cdecl
#define MY_FAST_CALL __fastcall #define Z7_FASTCALL __fastcall
#else // _MSC_VER #else // _MSC_VER
@ -272,27 +287,25 @@ typedef int BoolInt;
|| (defined(__clang__) && (__clang_major__ >= 4)) \ || (defined(__clang__) && (__clang_major__ >= 4)) \
|| defined(__INTEL_COMPILER) \ || defined(__INTEL_COMPILER) \
|| defined(__xlC__) || defined(__xlC__)
#define MY_NO_INLINE __attribute__((noinline)) #define Z7_NO_INLINE __attribute__((noinline))
// #define MY_FORCE_INLINE __attribute__((always_inline)) inline #define Z7_FORCE_INLINE __attribute__((always_inline)) inline
#else #else
#define MY_NO_INLINE #define Z7_NO_INLINE
#define Z7_FORCE_INLINE
#endif #endif
#define MY_FORCE_INLINE #define Z7_CDECL
#define MY_CDECL
#if defined(_M_IX86) \ #if defined(_M_IX86) \
|| defined(__i386__) || defined(__i386__)
// #define MY_FAST_CALL __attribute__((fastcall)) // #define Z7_FASTCALL __attribute__((fastcall))
// #define MY_FAST_CALL __attribute__((cdecl)) // #define Z7_FASTCALL __attribute__((cdecl))
#define MY_FAST_CALL #define Z7_FASTCALL
#elif defined(MY_CPU_AMD64) #elif defined(MY_CPU_AMD64)
// #define MY_FAST_CALL __attribute__((ms_abi)) // #define Z7_FASTCALL __attribute__((ms_abi))
#define MY_FAST_CALL #define Z7_FASTCALL
#else #else
#define MY_FAST_CALL #define Z7_FASTCALL
#endif #endif
#endif // _MSC_VER #endif // _MSC_VER
@ -300,41 +313,49 @@ typedef int BoolInt;
/* The following interfaces use first parameter as pointer to structure */ /* The following interfaces use first parameter as pointer to structure */
typedef struct IByteIn IByteIn; // #define Z7_C_IFACE_CONST_QUAL
struct IByteIn #define Z7_C_IFACE_CONST_QUAL const
#define Z7_C_IFACE_DECL(a) \
struct a ## _; \
typedef Z7_C_IFACE_CONST_QUAL struct a ## _ * a ## Ptr; \
typedef struct a ## _ a; \
struct a ## _
Z7_C_IFACE_DECL (IByteIn)
{ {
Byte (*Read)(const IByteIn *p); /* reads one byte, returns 0 in case of EOF or error */ Byte (*Read)(IByteInPtr p); /* reads one byte, returns 0 in case of EOF or error */
}; };
#define IByteIn_Read(p) (p)->Read(p) #define IByteIn_Read(p) (p)->Read(p)
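Z7_C_IFACE_DECL packs four declarations into one: an incomplete struct tag, a const-qualified pointer typedef, the plain typedef, and the struct definition itself. Expanded by hand for IByteIn it is roughly:

  struct IByteIn_;
  typedef const struct IByteIn_ *IByteInPtr;  /* Z7_C_IFACE_CONST_QUAL = const */
  typedef struct IByteIn_ IByteIn;
  struct IByteIn_
  {
    Byte (*Read)(IByteInPtr p);
  };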
typedef struct IByteOut IByteOut; Z7_C_IFACE_DECL (IByteOut)
struct IByteOut
{ {
void (*Write)(const IByteOut *p, Byte b); void (*Write)(IByteOutPtr p, Byte b);
}; };
#define IByteOut_Write(p, b) (p)->Write(p, b) #define IByteOut_Write(p, b) (p)->Write(p, b)
typedef struct ISeqInStream ISeqInStream; Z7_C_IFACE_DECL (ISeqInStream)
struct ISeqInStream
{ {
SRes (*Read)(const ISeqInStream *p, void *buf, size_t *size); SRes (*Read)(ISeqInStreamPtr p, void *buf, size_t *size);
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. /* if (input(*size) != 0 && output(*size) == 0) means end_of_stream.
(output(*size) < input(*size)) is allowed */ (output(*size) < input(*size)) is allowed */
}; };
#define ISeqInStream_Read(p, buf, size) (p)->Read(p, buf, size) #define ISeqInStream_Read(p, buf, size) (p)->Read(p, buf, size)
/* try to read as much as avail in stream and limited by (*processedSize) */
SRes SeqInStream_ReadMax(ISeqInStreamPtr stream, void *buf, size_t *processedSize);
/* it can return SZ_ERROR_INPUT_EOF */ /* it can return SZ_ERROR_INPUT_EOF */
SRes SeqInStream_Read(const ISeqInStream *stream, void *buf, size_t size); // SRes SeqInStream_Read(ISeqInStreamPtr stream, void *buf, size_t size);
SRes SeqInStream_Read2(const ISeqInStream *stream, void *buf, size_t size, SRes errorType); // SRes SeqInStream_Read2(ISeqInStreamPtr stream, void *buf, size_t size, SRes errorType);
SRes SeqInStream_ReadByte(const ISeqInStream *stream, Byte *buf); SRes SeqInStream_ReadByte(ISeqInStreamPtr stream, Byte *buf);
typedef struct ISeqOutStream ISeqOutStream; Z7_C_IFACE_DECL (ISeqOutStream)
struct ISeqOutStream
{ {
size_t (*Write)(const ISeqOutStream *p, const void *buf, size_t size); size_t (*Write)(ISeqOutStreamPtr p, const void *buf, size_t size);
/* Returns: result - the number of actually written bytes. /* Returns: result - the number of actually written bytes.
(result < size) means error */ (result < size) means error */
}; };
@ -348,29 +369,26 @@ typedef enum
} ESzSeek; } ESzSeek;
typedef struct ISeekInStream ISeekInStream; Z7_C_IFACE_DECL (ISeekInStream)
struct ISeekInStream
{ {
SRes (*Read)(const ISeekInStream *p, void *buf, size_t *size); /* same as ISeqInStream::Read */ SRes (*Read)(ISeekInStreamPtr p, void *buf, size_t *size); /* same as ISeqInStream::Read */
SRes (*Seek)(const ISeekInStream *p, Int64 *pos, ESzSeek origin); SRes (*Seek)(ISeekInStreamPtr p, Int64 *pos, ESzSeek origin);
}; };
#define ISeekInStream_Read(p, buf, size) (p)->Read(p, buf, size) #define ISeekInStream_Read(p, buf, size) (p)->Read(p, buf, size)
#define ISeekInStream_Seek(p, pos, origin) (p)->Seek(p, pos, origin) #define ISeekInStream_Seek(p, pos, origin) (p)->Seek(p, pos, origin)
typedef struct ILookInStream ILookInStream; Z7_C_IFACE_DECL (ILookInStream)
struct ILookInStream
{ {
SRes (*Look)(const ILookInStream *p, const void **buf, size_t *size); SRes (*Look)(ILookInStreamPtr p, const void **buf, size_t *size);
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. /* if (input(*size) != 0 && output(*size) == 0) means end_of_stream.
(output(*size) > input(*size)) is not allowed (output(*size) > input(*size)) is not allowed
(output(*size) < input(*size)) is allowed */ (output(*size) < input(*size)) is allowed */
SRes (*Skip)(const ILookInStream *p, size_t offset); SRes (*Skip)(ILookInStreamPtr p, size_t offset);
/* offset must be <= output(*size) of Look */ /* offset must be <= output(*size) of Look */
SRes (*Read)(ILookInStreamPtr p, void *buf, size_t *size);
SRes (*Read)(const ILookInStream *p, void *buf, size_t *size);
/* reads directly (without buffer). It's same as ISeqInStream::Read */ /* reads directly (without buffer). It's same as ISeqInStream::Read */
SRes (*Seek)(const ILookInStream *p, Int64 *pos, ESzSeek origin); SRes (*Seek)(ILookInStreamPtr p, Int64 *pos, ESzSeek origin);
}; };
#define ILookInStream_Look(p, buf, size) (p)->Look(p, buf, size) #define ILookInStream_Look(p, buf, size) (p)->Look(p, buf, size)
@ -379,19 +397,18 @@ struct ILookInStream
#define ILookInStream_Seek(p, pos, origin) (p)->Seek(p, pos, origin) #define ILookInStream_Seek(p, pos, origin) (p)->Seek(p, pos, origin)
SRes LookInStream_LookRead(const ILookInStream *stream, void *buf, size_t *size); SRes LookInStream_LookRead(ILookInStreamPtr stream, void *buf, size_t *size);
SRes LookInStream_SeekTo(const ILookInStream *stream, UInt64 offset); SRes LookInStream_SeekTo(ILookInStreamPtr stream, UInt64 offset);
/* reads via ILookInStream::Read */ /* reads via ILookInStream::Read */
SRes LookInStream_Read2(const ILookInStream *stream, void *buf, size_t size, SRes errorType); SRes LookInStream_Read2(ILookInStreamPtr stream, void *buf, size_t size, SRes errorType);
SRes LookInStream_Read(const ILookInStream *stream, void *buf, size_t size); SRes LookInStream_Read(ILookInStreamPtr stream, void *buf, size_t size);
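A usage sketch for the Look/Skip contract (SkipSignature is an illustrative name): peek at buffered bytes without consuming them, then skip no more than Look reported back.

static SRes SkipSignature(ILookInStreamPtr stream)
{
  const void *buf;
  size_t size = 6;  /* request up to 6 bytes */
  const SRes res = ILookInStream_Look(stream, &buf, &size);
  if (res != SZ_OK)
    return res;
  if (size < 6)  /* simplification: a short Look is treated here as end of stream */
    return SZ_ERROR_INPUT_EOF;
  /* inspect ((const Byte *)buf)[0 .. 5] without consuming */
  return ILookInStream_Skip(stream, 6);  /* offset <= output(*size) of Look */
}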
typedef struct typedef struct
{ {
ILookInStream vt; ILookInStream vt;
const ISeekInStream *realStream; ISeekInStreamPtr realStream;
size_t pos; size_t pos;
size_t size; /* it's data size */ size_t size; /* it's data size */
@ -403,13 +420,13 @@ typedef struct
void LookToRead2_CreateVTable(CLookToRead2 *p, int lookahead); void LookToRead2_CreateVTable(CLookToRead2 *p, int lookahead);
#define LookToRead2_Init(p) { (p)->pos = (p)->size = 0; } #define LookToRead2_INIT(p) { (p)->pos = (p)->size = 0; }
typedef struct typedef struct
{ {
ISeqInStream vt; ISeqInStream vt;
const ILookInStream *realStream; ILookInStreamPtr realStream;
} CSecToLook; } CSecToLook;
void SecToLook_CreateVTable(CSecToLook *p); void SecToLook_CreateVTable(CSecToLook *p);
@ -419,20 +436,19 @@ void SecToLook_CreateVTable(CSecToLook *p);
typedef struct typedef struct
{ {
ISeqInStream vt; ISeqInStream vt;
const ILookInStream *realStream; ILookInStreamPtr realStream;
} CSecToRead; } CSecToRead;
void SecToRead_CreateVTable(CSecToRead *p); void SecToRead_CreateVTable(CSecToRead *p);
typedef struct ICompressProgress ICompressProgress; Z7_C_IFACE_DECL (ICompressProgress)
struct ICompressProgress
{ {
SRes (*Progress)(const ICompressProgress *p, UInt64 inSize, UInt64 outSize); SRes (*Progress)(ICompressProgressPtr p, UInt64 inSize, UInt64 outSize);
/* Returns: result. (result != SZ_OK) means break. /* Returns: result. (result != SZ_OK) means break.
Value (UInt64)(Int64)-1 for size means unknown value. */ Value (UInt64)(Int64)-1 for size means unknown value. */
}; };
#define ICompressProgress_Progress(p, inSize, outSize) (p)->Progress(p, inSize, outSize) #define ICompressProgress_Progress(p, inSize, outSize) (p)->Progress(p, inSize, outSize)
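An implementation sketch (illustrative names): a progress callback that prints the totals and cancels by returning a non-SZ_OK result; (UInt64)(Int64)-1 marks an unknown size, per the comment above.

#include <stdio.h>

typedef struct
{
  ICompressProgress vt;
  int canceled;
} CPrintProgress;

static SRes PrintProgress_Progress(ICompressProgressPtr pp, UInt64 inSize, UInt64 outSize)
{
  const CPrintProgress *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CPrintProgress, vt);
  if (inSize != (UInt64)(Int64)-1)
    printf("in: %llu out: %llu\r", (unsigned long long)inSize, (unsigned long long)outSize);
  return p->canceled ? SZ_ERROR_PROGRESS : SZ_OK;  /* non-SZ_OK means break */
}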
@ -470,13 +486,13 @@ struct ISzAlloc
#ifndef MY_container_of #ifndef Z7_container_of
/* /*
#define MY_container_of(ptr, type, m) container_of(ptr, type, m) #define Z7_container_of(ptr, type, m) container_of(ptr, type, m)
#define MY_container_of(ptr, type, m) CONTAINING_RECORD(ptr, type, m) #define Z7_container_of(ptr, type, m) CONTAINING_RECORD(ptr, type, m)
#define MY_container_of(ptr, type, m) ((type *)((char *)(ptr) - offsetof(type, m))) #define Z7_container_of(ptr, type, m) ((type *)((char *)(ptr) - offsetof(type, m)))
#define MY_container_of(ptr, type, m) (&((type *)0)->m == (ptr), ((type *)(((char *)(ptr)) - MY_offsetof(type, m)))) #define Z7_container_of(ptr, type, m) (&((type *)0)->m == (ptr), ((type *)(((char *)(ptr)) - MY_offsetof(type, m))))
*/ */
/* /*
@ -485,24 +501,64 @@ struct ISzAlloc
GCC 4.8.1 : classes with non-public variable members" GCC 4.8.1 : classes with non-public variable members"
*/ */
#define MY_container_of(ptr, type, m) ((type *)(void *)((char *)(void *)(1 ? (ptr) : &((type *)0)->m) - MY_offsetof(type, m))) #define Z7_container_of(ptr, type, m) \
((type *)(void *)((char *)(void *) \
(1 ? (ptr) : &((type *)NULL)->m) - MY_offsetof(type, m)))
#define Z7_container_of_CONST(ptr, type, m) \
((const type *)(const void *)((const char *)(const void *) \
(1 ? (ptr) : &((type *)NULL)->m) - MY_offsetof(type, m)))
/*
#define Z7_container_of_NON_CONST_FROM_CONST(ptr, type, m) \
((type *)(void *)(const void *)((const char *)(const void *) \
(1 ? (ptr) : &((type *)NULL)->m) - MY_offsetof(type, m)))
*/
#endif #endif
#define CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) ((type *)(void *)(ptr)) #define Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) ((type *)(void *)(ptr))
// #define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m)
#define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_container_of(ptr, type, m)
// #define Z7_CONTAINER_FROM_VTBL(ptr, type, m) Z7_container_of_NON_CONST_FROM_CONST(ptr, type, m)
#define Z7_CONTAINER_FROM_VTBL_CONST(ptr, type, m) Z7_container_of_CONST(ptr, type, m)
#define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m)
/* /*
#define CONTAINER_FROM_VTBL(ptr, type, m) CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) #define Z7_CONTAINER_FROM_VTBL_CLS(ptr, type, m) Z7_CONTAINER_FROM_VTBL(ptr, type, m)
*/ */
#define CONTAINER_FROM_VTBL(ptr, type, m) MY_container_of(ptr, type, m) #if defined (__clang__) || defined(__GNUC__)
#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wcast-qual\"")
#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL \
_Pragma("GCC diagnostic pop")
#else
#define Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL
#define Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL
#endif
#define CONTAINER_FROM_VTBL_CLS(ptr, type, m) CONTAINER_FROM_VTBL_SIMPLE(ptr, type, m) #define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(ptr, type, m, p) \
/* Z7_DIAGNOSTIC_IGNORE_BEGIN_CAST_QUAL \
#define CONTAINER_FROM_VTBL_CLS(ptr, type, m) CONTAINER_FROM_VTBL(ptr, type, m) type *p = Z7_CONTAINER_FROM_VTBL(ptr, type, m); \
*/ Z7_DIAGNOSTIC_IGNORE_END_CAST_QUAL
#define Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(type) \
Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR(pp, type, vt, p)
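Usage sketch: inside a callback that receives the const interface pointer pp, the macro declares a non-const container pointer p with the cast-qual warning suppressed around the cast. CMyOutStream and its processed field are illustrative.

typedef struct
{
  ISeqOutStream vt;
  size_t processed;
} CMyOutStream;

static size_t MyOutStream_Write(ISeqOutStreamPtr pp, const void *buf, size_t size)
{
  Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CMyOutStream)
  p->processed += size;  /* mutable access despite the const pp */
  (void)buf;
  return size;           /* pretend every byte was consumed */
}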
#define MY_memset_0_ARRAY(a) memset((a), 0, sizeof(a)) // #define ZIP7_DECLARE_HANDLE(name) typedef void *name;
#define Z7_DECLARE_HANDLE(name) struct name##_dummy{int unused;}; typedef struct name##_dummy *name;
#define Z7_memset_0_ARRAY(a) memset((a), 0, sizeof(a))
#ifndef Z7_ARRAY_SIZE
#define Z7_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#endif
#ifdef _WIN32 #ifdef _WIN32
@ -520,6 +576,22 @@ struct ISzAlloc
#endif #endif
#define k_PropVar_TimePrec_0 0
#define k_PropVar_TimePrec_Unix 1
#define k_PropVar_TimePrec_DOS 2
#define k_PropVar_TimePrec_HighPrec 3
#define k_PropVar_TimePrec_Base 16
#define k_PropVar_TimePrec_100ns (k_PropVar_TimePrec_Base + 7)
#define k_PropVar_TimePrec_1ns (k_PropVar_TimePrec_Base + 9)
EXTERN_C_END EXTERN_C_END
#endif #endif
/*
#ifndef Z7_ST
#ifdef _7ZIP_ST
#define Z7_ST
#endif
#endif
*/

C/7zVersion.h

@ -1,7 +1,7 @@
#define MY_VER_MAJOR 21 #define MY_VER_MAJOR 25
#define MY_VER_MINOR 07 #define MY_VER_MINOR 1
#define MY_VER_BUILD 0 #define MY_VER_BUILD 0
#define MY_VERSION_NUMBERS "21.07" #define MY_VERSION_NUMBERS "25.01"
#define MY_VERSION MY_VERSION_NUMBERS #define MY_VERSION MY_VERSION_NUMBERS
#ifdef MY_CPU_NAME #ifdef MY_CPU_NAME
@ -10,12 +10,12 @@
#define MY_VERSION_CPU MY_VERSION #define MY_VERSION_CPU MY_VERSION
#endif #endif
#define MY_DATE "2021-12-26" #define MY_DATE "2025-08-03"
#undef MY_COPYRIGHT #undef MY_COPYRIGHT
#undef MY_VERSION_COPYRIGHT_DATE #undef MY_VERSION_COPYRIGHT_DATE
#define MY_AUTHOR_NAME "Igor Pavlov" #define MY_AUTHOR_NAME "Igor Pavlov"
#define MY_COPYRIGHT_PD "Igor Pavlov : Public domain" #define MY_COPYRIGHT_PD "Igor Pavlov : Public domain"
#define MY_COPYRIGHT_CR "Copyright (c) 1999-2021 Igor Pavlov" #define MY_COPYRIGHT_CR "Copyright (c) 1999-2025 Igor Pavlov"
#ifdef USE_COPYRIGHT_CR #ifdef USE_COPYRIGHT_CR
#define MY_COPYRIGHT MY_COPYRIGHT_CR #define MY_COPYRIGHT MY_COPYRIGHT_CR

C/7zWindows.h (new file, 101 lines)

@ -0,0 +1,101 @@
/* 7zWindows.h -- StdAfx
2023-04-02 : Igor Pavlov : Public domain */
#ifndef ZIP7_INC_7Z_WINDOWS_H
#define ZIP7_INC_7Z_WINDOWS_H
#ifdef _WIN32
#if defined(__clang__)
# pragma clang diagnostic push
#endif
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable : 4668) // '_WIN32_WINNT' is not defined as a preprocessor macro, replacing with '0' for '#if/#elif'
#if _MSC_VER == 1900
// for old kit10 versions
// #pragma warning(disable : 4255) // winuser.h(13979): warning C4255: 'GetThreadDpiAwarenessContext':
#endif
// win10 Windows Kit:
#endif // _MSC_VER
#if defined(_MSC_VER) && _MSC_VER <= 1200 && !defined(_WIN64)
// for msvc6 without sdk2003
#define RPC_NO_WINDOWS_H
#endif
#if defined(__MINGW32__) || defined(__MINGW64__)
// #if defined(__GNUC__) && !defined(__clang__)
#include <windows.h>
#else
#include <Windows.h>
#endif
// #include <basetsd.h>
// #include <wtypes.h>
// but if precompiled with clang-cl then we need
// #include <windows.h>
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
#if defined(_MSC_VER) && _MSC_VER <= 1200 && !defined(_WIN64)
#ifndef _W64
typedef long LONG_PTR, *PLONG_PTR;
typedef unsigned long ULONG_PTR, *PULONG_PTR;
typedef ULONG_PTR DWORD_PTR, *PDWORD_PTR;
#define Z7_OLD_WIN_SDK
#endif // _W64
#endif // _MSC_VER == 1200
#ifdef Z7_OLD_WIN_SDK
#ifndef INVALID_FILE_ATTRIBUTES
#define INVALID_FILE_ATTRIBUTES ((DWORD)-1)
#endif
#ifndef INVALID_SET_FILE_POINTER
#define INVALID_SET_FILE_POINTER ((DWORD)-1)
#endif
#ifndef FILE_SPECIAL_ACCESS
#define FILE_SPECIAL_ACCESS (FILE_ANY_ACCESS)
#endif
// ShlObj.h:
// #define BIF_NEWDIALOGSTYLE 0x0040
#pragma warning(disable : 4201)
// #pragma warning(disable : 4115)
#undef VARIANT_TRUE
#define VARIANT_TRUE ((VARIANT_BOOL)-1)
#endif
#endif // Z7_OLD_WIN_SDK
#ifdef UNDER_CE
#undef VARIANT_TRUE
#define VARIANT_TRUE ((VARIANT_BOOL)-1)
#endif
#if defined(_MSC_VER)
#if _MSC_VER >= 1400 && _MSC_VER <= 1600
// BaseTsd.h(148) : 'HandleToULong' : unreferenced inline function has been removed
// string.h
// #pragma warning(disable : 4514)
#endif
#endif
/* #include "7zTypes.h" */
#endif


@ -4,24 +4,57 @@ MY_ARCH_2 = $(MY_ARCH)
MY_ASM = jwasm MY_ASM = jwasm
MY_ASM = asmc MY_ASM = asmc
ifndef RC
#RC=windres.exe --target=pe-x86-64
#RC=windres.exe -F pe-i386
RC=windres.exe
endif
PROGPATH = $(O)/$(PROG) PROGPATH = $(O)/$(PROG)
PROGPATH_STATIC = $(O)/$(PROG)s PROGPATH_STATIC = $(O)/$(PROG)s
ifneq ($(CC), xlc)
CFLAGS_WARN_WALL = -Wall -Werror -Wextra
endif
# for object file # for object file
CFLAGS_BASE_LIST = -c CFLAGS_BASE_LIST = -c
# for ASM file # for ASM file
# CFLAGS_BASE_LIST = -S # CFLAGS_BASE_LIST = -S
CFLAGS_BASE = $(MY_ARCH_2) -O2 $(CFLAGS_BASE_LIST) -Wall -Werror -Wextra $(CFLAGS_WARN) \
FLAGS_FLTO = -flto
FLAGS_FLTO =
CFLAGS_BASE = $(MY_ARCH_2) -O2 $(CFLAGS_BASE_LIST) $(CFLAGS_WARN_WALL) $(CFLAGS_WARN) \
-DNDEBUG -D_REENTRANT -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -DNDEBUG -D_REENTRANT -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE
LDFLAGS_STATIC = -DNDEBUG
# -static
ifdef SystemDrive ifdef SystemDrive
IS_MINGW = 1 IS_MINGW = 1
else
ifdef SYSTEMDRIVE
# ifdef OS
IS_MINGW = 1
endif endif
endif
ifdef IS_MINGW
LDFLAGS_STATIC_2 = -static
else
ifndef DEF_FILE
ifndef IS_NOT_STANDALONE
ifndef MY_DYNAMIC_LINK
ifneq ($(CC), clang)
LDFLAGS_STATIC_2 =
# -static
# -static-libstdc++ -static-libgcc
endif
endif
endif
endif
endif
LDFLAGS_STATIC = -DNDEBUG $(LDFLAGS_STATIC_2)
ifdef DEF_FILE ifdef DEF_FILE
@ -62,22 +95,29 @@ endif
ifdef IS_MINGW ifdef IS_MINGW
ifdef MSYSTEM
RM = rm -f
MY_MKDIR=mkdir -p
DEL_OBJ_EXE = -$(RM) $(PROGPATH) $(PROGPATH_STATIC) $(OBJS)
else
RM = del RM = del
MY_MKDIR=mkdir MY_MKDIR=mkdir
LIB2 = -loleaut32 -luuid -ladvapi32 -lUser32 DEL_OBJ_EXE = -$(RM) $(O)\*.o $(O)\$(PROG).exe $(O)\$(PROG).dll
endif
CXXFLAGS_EXTRA = -DUNICODE -D_UNICODE LIB2 = -lOle32 -loleaut32 -luuid -ladvapi32 -lUser32 -lShell32
CFLAGS_EXTRA = -DUNICODE -D_UNICODE
# -Wno-delete-non-virtual-dtor # -Wno-delete-non-virtual-dtor
DEL_OBJ_EXE = -$(RM) $(O)\*.o $(O)\$(PROG).exe $(O)\$(PROG).dll
else else
RM = rm -f RM = rm -f
MY_MKDIR=mkdir -p MY_MKDIR=mkdir -p
# CFLAGS_BASE := $(CFLAGS_BASE) -D_7ZIP_ST # CFLAGS_BASE := $(CFLAGS_BASE) -DZ7_ST
# CXXFLAGS_EXTRA = -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE # CFLAGS_EXTRA = -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE
# LOCAL_LIBS=-lpthread # LOCAL_LIBS=-lpthread
# LOCAL_LIBS_DLL=$(LOCAL_LIBS) -ldl # LOCAL_LIBS_DLL=$(LOCAL_LIBS) -ldl
@ -88,10 +128,6 @@ DEL_OBJ_EXE = -$(RM) $(PROGPATH) $(PROGPATH_STATIC) $(OBJS)
endif endif
CFLAGS = $(LOCAL_FLAGS) $(CFLAGS_BASE2) $(CFLAGS_BASE) $(CC_SHARED) -o $@
ifdef IS_X64 ifdef IS_X64
AFLAGS_ABI = -elf64 -DABI_LINUX AFLAGS_ABI = -elf64 -DABI_LINUX
else else
@ -102,12 +138,9 @@ AFLAGS_ABI = -elf -DABI_LINUX -DABI_CDECL
endif endif
AFLAGS = $(AFLAGS_ABI) -Fo$(O)/ AFLAGS = $(AFLAGS_ABI) -Fo$(O)/
C_WARN_FLAGS =
CXX_WARN_FLAGS = CFLAGS = $(LOCAL_FLAGS) $(CFLAGS_BASE2) $(CFLAGS_BASE) $(CFLAGS_EXTRA) $(C_WARN_FLAGS) $(FLAGS_FLTO) $(CC_SHARED) -o $@
#-Wno-invalid-offsetof
#-Wno-reorder
CXXFLAGS = $(LOCAL_FLAGS) $(CXXFLAGS_BASE2) $(CFLAGS_BASE) $(CXXFLAGS_EXTRA) $(CC_SHARED) -o $@ $(CXX_WARN_FLAGS)
STATIC_TARGET= STATIC_TARGET=
ifdef COMPL_STATIC ifdef COMPL_STATIC
@ -120,18 +153,27 @@ all: $(O) $(PROGPATH) $(STATIC_TARGET)
$(O): $(O):
$(MY_MKDIR) $(O) $(MY_MKDIR) $(O)
LFLAGS_ALL = -s $(MY_ARCH_2) $(LDFLAGS) $(LD_arch) $(OBJS) $(MY_LIBS) $(LIB2) ifneq ($(CC), $(CROSS_COMPILE)clang)
LFLAGS_STRIP = -s
endif
LFLAGS_ALL = $(LFLAGS_STRIP) $(MY_ARCH_2) $(LDFLAGS) $(FLAGS_FLTO) $(LD_arch) $(OBJS) $(MY_LIBS) $(LIB2)
$(PROGPATH): $(OBJS) $(PROGPATH): $(OBJS)
$(CXX) -o $(PROGPATH) $(LFLAGS_ALL) $(CC) -o $(PROGPATH) $(LFLAGS_ALL)
$(PROGPATH_STATIC): $(OBJS) $(PROGPATH_STATIC): $(OBJS)
$(CXX) -static -o $(PROGPATH_STATIC) $(LFLAGS_ALL) $(CC) -static -o $(PROGPATH_STATIC) $(LFLAGS_ALL)
ifndef NO_DEFAULT_RES ifndef NO_DEFAULT_RES
# old mingw without -FO
# windres.exe $(RFLAGS) resource.rc $O/resource.o
$O/resource.o: resource.rc $O/resource.o: resource.rc
windres.exe $(RFLAGS) resource.rc $O/resource.o $(RC) $(RFLAGS) resource.rc $(O)/resource.o
endif endif
# windres.exe $(RFLAGS) resource.rc $(O)\resource.o
# windres.exe $(RFLAGS) resource.rc -FO $(O)/resource.o
# $(RC) $(RFLAGS) resource.rc -FO $(O)/resource.o
@ -229,10 +271,18 @@ $O/Sha256.o: ../../../C/Sha256.c
$(CC) $(CFLAGS) $< $(CC) $(CFLAGS) $<
$O/Sort.o: ../../../C/Sort.c $O/Sort.o: ../../../C/Sort.c
$(CC) $(CFLAGS) $< $(CC) $(CFLAGS) $<
$O/SwapBytes.o: ../../../C/SwapBytes.c
$(CC) $(CFLAGS) $<
$O/Xz.o: ../../../C/Xz.c $O/Xz.o: ../../../C/Xz.c
$(CC) $(CFLAGS) $< $(CC) $(CFLAGS) $<
$O/XzCrc64.o: ../../../C/XzCrc64.c $O/XzCrc64.o: ../../../C/XzCrc64.c
$(CC) $(CFLAGS) $< $(CC) $(CFLAGS) $<
$O/XzDec.o: ../../../C/XzDec.c
$(CC) $(CFLAGS) $<
$O/XzEnc.o: ../../../C/XzEnc.c
$(CC) $(CFLAGS) $<
$O/XzIn.o: ../../../C/XzIn.c
$(CC) $(CFLAGS) $<
ifdef USE_ASM ifdef USE_ASM
@ -279,11 +329,11 @@ endif
ifdef IS_ARM64 ifdef IS_ARM64
$O/LzmaDecOpt.o: ../../../Asm/arm64/LzmaDecOpt.S ../../../Asm/arm64/7zAsm.S $O/LzmaDecOpt.o: ../../../Asm/arm64/LzmaDecOpt.S ../../../Asm/arm64/7zAsm.S
$(CC) $(CFLAGS) $< $(CC) $(CFLAGS) $(ASM_FLAGS) $<
endif endif
$O/LzmaDec.o: ../../LzmaDec.c $O/LzmaDec.o: ../../LzmaDec.c
$(CC) $(CFLAGS) -D_LZMA_DEC_OPT $< $(CC) $(CFLAGS) -DZ7_LZMA_DEC_OPT $<
else else
@ -294,19 +344,16 @@ endif
$O/XzDec.o: ../../../C/XzDec.c
$(CC) $(CFLAGS) $<
$O/XzEnc.o: ../../../C/XzEnc.c
$(CC) $(CFLAGS) $<
$O/XzIn.o: ../../../C/XzIn.c
$(CC) $(CFLAGS) $<
$O/7zMain.o: ../../../C/Util/7z/7zMain.c $O/7zMain.o: ../../../C/Util/7z/7zMain.c
$(CC) $(CFLAGS) $< $(CC) $(CFLAGS) $<
$O/7zipInstall.o: ../../../C/Util/7zipInstall/7zipInstall.c
$(CC) $(CFLAGS) $<
$O/7zipUninstall.o: ../../../C/Util/7zipUninstall/7zipUninstall.c
$(CC) $(CFLAGS) $<
$O/LzmaUtil.o: ../../../C/Util/Lzma/LzmaUtil.c $O/LzmaUtil.o: ../../../C/Util/Lzma/LzmaUtil.c
$(CC) $(CFLAGS) $< $(CC) $(CFLAGS) $<
$O/XzUtil.o: ../../../C/Util/Xz/XzUtil.c
$(CC) $(CFLAGS) $<
clean: clean:

C/Aes.c (162 lines changed)

@ -1,5 +1,5 @@
/* Aes.c -- AES encryption / decryption /* Aes.c -- AES encryption / decryption
2021-05-13 : Igor Pavlov : Public domain */ 2024-03-01 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -7,13 +7,15 @@
#include "Aes.h" #include "Aes.h"
AES_CODE_FUNC g_AesCbc_Decode; AES_CODE_FUNC g_AesCbc_Decode;
#ifndef _SFX #ifndef Z7_SFX
AES_CODE_FUNC g_AesCbc_Encode; AES_CODE_FUNC g_AesCbc_Encode;
AES_CODE_FUNC g_AesCtr_Code; AES_CODE_FUNC g_AesCtr_Code;
UInt32 g_Aes_SupportedFunctions_Flags; UInt32 g_Aes_SupportedFunctions_Flags;
#endif #endif
MY_ALIGN(64)
static UInt32 T[256 * 4]; static UInt32 T[256 * 4];
MY_ALIGN(64)
static const Byte Sbox[256] = { static const Byte Sbox[256] = {
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
@ -33,7 +35,9 @@ static const Byte Sbox[256] = {
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16}; 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};
MY_ALIGN(64)
static UInt32 D[256 * 4]; static UInt32 D[256 * 4];
MY_ALIGN(64)
static Byte InvS[256]; static Byte InvS[256];
#define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF) #define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF)
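xtime is multiplication by x (that is, by 2) in the AES field GF(2^8): shift left, then reduce by the field polynomial 0x11B (written as 0x1B once the dropped high bit is accounted for) when bit 7 was set. A standalone check against the worked example in FIPS-197:

#include <assert.h>

#define xtime(x) ((((x) << 1) ^ (((x) & 0x80) != 0 ? 0x1B : 0)) & 0xFF)

int main(void)
{
  assert(xtime(0x57) == 0xAE);  /* {57}*{02} = {ae} */
  assert(xtime(0xAE) == 0x47);  /* 0x15C reduced: 0x5C ^ 0x1B = 0x47 */
  assert(xtime(0x80) == 0x1B);  /* overflow case: pure reduction */
  return 0;
}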
@ -51,32 +55,62 @@ static Byte InvS[256];
#define DD(x) (D + (x << 8)) #define DD(x) (D + (x << 8))
// #define _SHOW_AES_STATUS // #define Z7_SHOW_AES_STATUS
#ifdef MY_CPU_X86_OR_AMD64 #ifdef MY_CPU_X86_OR_AMD64
#define USE_HW_AES
#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) #if defined(__INTEL_COMPILER)
#if defined(__clang__) #if (__INTEL_COMPILER >= 1110)
#if (__clang_major__ >= 8) // fix that check
#define USE_HW_AES
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define USE_HW_AES #define USE_HW_AES
#if (__INTEL_COMPILER >= 1900)
#define USE_HW_VAES
#endif
#endif #endif
#elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400)
#define USE_HW_AES
#if defined(__clang__) && (__clang_major__ >= 8) \
|| defined(__GNUC__) && (__GNUC__ >= 8)
#define USE_HW_VAES
#endif
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
#if _MSC_VER >= 1910 #define USE_HW_AES
#define USE_HW_VAES
#endif
#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
#if defined(__ARM_FEATURE_AES) \
|| defined(__ARM_FEATURE_CRYPTO)
#define USE_HW_AES
#else
#if defined(MY_CPU_ARM64) \
|| defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
|| defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(__GNUC__) && (__GNUC__ >= 6) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
|| !defined(Z7_CLANG_VERSION) \
|| defined(__ARM_NEON) && \
(Z7_CLANG_VERSION < 170000 || \
Z7_CLANG_VERSION > 170001)
#define USE_HW_AES #define USE_HW_AES
#endif #endif
#endif
#endif
#endif #endif
#endif #endif
#ifdef USE_HW_AES #ifdef USE_HW_AES
#ifdef _SHOW_AES_STATUS // #pragma message("=== Aes.c USE_HW_AES === ")
#ifdef Z7_SHOW_AES_STATUS
#include <stdio.h> #include <stdio.h>
#define _PRF(x) x #define PRF(x) x
#else #else
#define _PRF(x) #define PRF(x)
#endif #endif
#endif #endif
@ -90,23 +124,23 @@ void AesGenTables(void)
for (i = 0; i < 256; i++) for (i = 0; i < 256; i++)
{ {
{ {
UInt32 a1 = Sbox[i]; const UInt32 a1 = Sbox[i];
UInt32 a2 = xtime(a1); const UInt32 a2 = xtime(a1);
UInt32 a3 = a2 ^ a1; const UInt32 a3 = a2 ^ a1;
TT(0)[i] = Ui32(a2, a1, a1, a3); TT(0)[i] = Ui32(a2, a1, a1, a3);
TT(1)[i] = Ui32(a3, a2, a1, a1); TT(1)[i] = Ui32(a3, a2, a1, a1);
TT(2)[i] = Ui32(a1, a3, a2, a1); TT(2)[i] = Ui32(a1, a3, a2, a1);
TT(3)[i] = Ui32(a1, a1, a3, a2); TT(3)[i] = Ui32(a1, a1, a3, a2);
} }
{ {
UInt32 a1 = InvS[i]; const UInt32 a1 = InvS[i];
UInt32 a2 = xtime(a1); const UInt32 a2 = xtime(a1);
UInt32 a4 = xtime(a2); const UInt32 a4 = xtime(a2);
UInt32 a8 = xtime(a4); const UInt32 a8 = xtime(a4);
UInt32 a9 = a8 ^ a1; const UInt32 a9 = a8 ^ a1;
UInt32 aB = a8 ^ a2 ^ a1; const UInt32 aB = a8 ^ a2 ^ a1;
UInt32 aD = a8 ^ a4 ^ a1; const UInt32 aD = a8 ^ a4 ^ a1;
UInt32 aE = a8 ^ a4 ^ a2; const UInt32 aE = a8 ^ a4 ^ a2;
DD(0)[i] = Ui32(aE, a9, aD, aB); DD(0)[i] = Ui32(aE, a9, aD, aB);
DD(1)[i] = Ui32(aB, aE, a9, aD); DD(1)[i] = Ui32(aB, aE, a9, aD);
DD(2)[i] = Ui32(aD, aB, aE, a9); DD(2)[i] = Ui32(aD, aB, aE, a9);
@ -116,7 +150,7 @@ void AesGenTables(void)
{ {
AES_CODE_FUNC d = AesCbc_Decode; AES_CODE_FUNC d = AesCbc_Decode;
#ifndef _SFX #ifndef Z7_SFX
AES_CODE_FUNC e = AesCbc_Encode; AES_CODE_FUNC e = AesCbc_Encode;
AES_CODE_FUNC c = AesCtr_Code; AES_CODE_FUNC c = AesCtr_Code;
UInt32 flags = 0; UInt32 flags = 0;
@ -126,31 +160,33 @@ void AesGenTables(void)
if (CPU_IsSupported_AES()) if (CPU_IsSupported_AES())
{ {
// #pragma message ("AES HW") // #pragma message ("AES HW")
_PRF(printf("\n===AES HW\n")); PRF(printf("\n===AES HW\n"));
d = AesCbc_Decode_HW; d = AesCbc_Decode_HW;
#ifndef _SFX #ifndef Z7_SFX
e = AesCbc_Encode_HW; e = AesCbc_Encode_HW;
c = AesCtr_Code_HW; c = AesCtr_Code_HW;
flags = k_Aes_SupportedFunctions_HW; flags = k_Aes_SupportedFunctions_HW;
#endif #endif
#ifdef MY_CPU_X86_OR_AMD64 #ifdef MY_CPU_X86_OR_AMD64
#ifdef USE_HW_VAES
if (CPU_IsSupported_VAES_AVX2()) if (CPU_IsSupported_VAES_AVX2())
{ {
_PRF(printf("\n===vaes avx2\n")); PRF(printf("\n===vaes avx2\n"));
d = AesCbc_Decode_HW_256; d = AesCbc_Decode_HW_256;
#ifndef _SFX #ifndef Z7_SFX
c = AesCtr_Code_HW_256; c = AesCtr_Code_HW_256;
flags |= k_Aes_SupportedFunctions_HW_256; flags |= k_Aes_SupportedFunctions_HW_256;
#endif #endif
} }
#endif #endif
#endif
} }
#endif #endif
g_AesCbc_Decode = d; g_AesCbc_Decode = d;
#ifndef _SFX #ifndef Z7_SFX
g_AesCbc_Encode = e; g_AesCbc_Encode = e;
g_AesCtr_Code = c; g_AesCtr_Code = c;
g_Aes_SupportedFunctions_Flags = flags; g_Aes_SupportedFunctions_Flags = flags;
@ -194,7 +230,7 @@ void AesGenTables(void)
#define FD(i, x) InvS[gb(x, m[(i - x) & 3])] #define FD(i, x) InvS[gb(x, m[(i - x) & 3])]
#define FD4(i) dest[i] = Ui32(FD(i, 0), FD(i, 1), FD(i, 2), FD(i, 3)) ^ w[i]; #define FD4(i) dest[i] = Ui32(FD(i, 0), FD(i, 1), FD(i, 2), FD(i, 3)) ^ w[i];
void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize) void Z7_FASTCALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize)
{ {
unsigned i, m; unsigned i, m;
const UInt32 *wLim; const UInt32 *wLim;
@ -230,7 +266,7 @@ void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *w, const Byte *key, unsigned keySize)
while (++w != wLim); while (++w != wLim);
} }
void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize) void Z7_FASTCALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize)
{ {
unsigned i, num; unsigned i, num;
Aes_SetKey_Enc(w, key, keySize); Aes_SetKey_Enc(w, key, keySize);
@ -251,7 +287,7 @@ void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *w, const Byte *key, unsigned keySize)
src and dest are pointers to 4 UInt32 words. src and dest are pointers to 4 UInt32 words.
src and dest can point to same block */ src and dest can point to same block */
// MY_FORCE_INLINE // Z7_FORCE_INLINE
static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src) static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src)
{ {
UInt32 s[4]; UInt32 s[4];
@ -265,17 +301,20 @@ static void Aes_Encode(const UInt32 *w, UInt32 *dest, const UInt32 *src)
w += 4; w += 4;
for (;;) for (;;)
{ {
HT16(m, s, 0); HT16(m, s, 0)
if (--numRounds2 == 0) if (--numRounds2 == 0)
break; break;
HT16(s, m, 4); HT16(s, m, 4)
w += 8; w += 8;
} }
w += 4; w += 4;
FT4(0); FT4(1); FT4(2); FT4(3); FT4(0)
FT4(1)
FT4(2)
FT4(3)
} }
MY_FORCE_INLINE Z7_FORCE_INLINE
static void Aes_Decode(const UInt32 *w, UInt32 *dest, const UInt32 *src) static void Aes_Decode(const UInt32 *w, UInt32 *dest, const UInt32 *src)
{ {
UInt32 s[4]; UInt32 s[4];
@ -289,12 +328,15 @@ static void Aes_Decode(const UInt32 *w, UInt32 *dest, const UInt32 *src)
for (;;) for (;;)
{ {
w -= 8; w -= 8;
HD16(m, s, 4); HD16(m, s, 4)
if (--numRounds2 == 0) if (--numRounds2 == 0)
break; break;
HD16(s, m, 0); HD16(s, m, 0)
} }
FD4(0); FD4(1); FD4(2); FD4(3); FD4(0)
FD4(1)
FD4(2)
FD4(3)
} }
void AesCbc_Init(UInt32 *p, const Byte *iv) void AesCbc_Init(UInt32 *p, const Byte *iv)
@ -304,7 +346,7 @@ void AesCbc_Init(UInt32 *p, const Byte *iv)
p[i] = GetUi32(iv + i * 4); p[i] = GetUi32(iv + i * 4);
} }
void MY_FAST_CALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks) void Z7_FASTCALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks)
{ {
for (; numBlocks != 0; numBlocks--, data += AES_BLOCK_SIZE) for (; numBlocks != 0; numBlocks--, data += AES_BLOCK_SIZE)
{ {
@ -315,14 +357,14 @@ void MY_FAST_CALL AesCbc_Encode(UInt32 *p, Byte *data, size_t numBlocks)
Aes_Encode(p + 4, p, p); Aes_Encode(p + 4, p, p);
SetUi32(data, p[0]); SetUi32(data, p[0])
SetUi32(data + 4, p[1]); SetUi32(data + 4, p[1])
SetUi32(data + 8, p[2]); SetUi32(data + 8, p[2])
SetUi32(data + 12, p[3]); SetUi32(data + 12, p[3])
} }
} }
void MY_FAST_CALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks) void Z7_FASTCALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks)
{ {
UInt32 in[4], out[4]; UInt32 in[4], out[4];
for (; numBlocks != 0; numBlocks--, data += AES_BLOCK_SIZE) for (; numBlocks != 0; numBlocks--, data += AES_BLOCK_SIZE)
@ -334,10 +376,10 @@ void MY_FAST_CALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks)
Aes_Decode(p + 4, out, in); Aes_Decode(p + 4, out, in);
SetUi32(data, p[0] ^ out[0]); SetUi32(data, p[0] ^ out[0])
SetUi32(data + 4, p[1] ^ out[1]); SetUi32(data + 4, p[1] ^ out[1])
SetUi32(data + 8, p[2] ^ out[2]); SetUi32(data + 8, p[2] ^ out[2])
SetUi32(data + 12, p[3] ^ out[3]); SetUi32(data + 12, p[3] ^ out[3])
p[0] = in[0]; p[0] = in[0];
p[1] = in[1]; p[1] = in[1];
@ -346,7 +388,7 @@ void MY_FAST_CALL AesCbc_Decode(UInt32 *p, Byte *data, size_t numBlocks)
} }
} }
void MY_FAST_CALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks) void Z7_FASTCALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks)
{ {
for (; numBlocks != 0; numBlocks--) for (; numBlocks != 0; numBlocks--)
{ {
@ -360,7 +402,7 @@ void MY_FAST_CALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks)
for (i = 0; i < 4; i++, data += 4) for (i = 0; i < 4; i++, data += 4)
{ {
UInt32 t = temp[i]; const UInt32 t = temp[i];
#ifdef MY_CPU_LE_UNALIGN #ifdef MY_CPU_LE_UNALIGN
*((UInt32 *)(void *)data) ^= t; *((UInt32 *)(void *)data) ^= t;
@ -373,3 +415,15 @@ void MY_FAST_CALL AesCtr_Code(UInt32 *p, Byte *data, size_t numBlocks)
} }
} }
} }
#undef xtime
#undef Ui32
#undef gb0
#undef gb1
#undef gb2
#undef gb3
#undef gb
#undef TT
#undef DD
#undef USE_HW_AES
#undef PRF

C/Aes.h (36 lines changed)

@ -1,8 +1,8 @@
/* Aes.h -- AES encryption / decryption /* Aes.h -- AES encryption / decryption
2018-04-28 : Igor Pavlov : Public domain */ 2023-04-02 : Igor Pavlov : Public domain */
#ifndef __AES_H #ifndef ZIP7_INC_AES_H
#define __AES_H #define ZIP7_INC_AES_H
#include "7zTypes.h" #include "7zTypes.h"
@ -20,19 +20,19 @@ void AesGenTables(void);
/* aes - 16-byte aligned pointer to keyMode+roundKeys sequence */ /* aes - 16-byte aligned pointer to keyMode+roundKeys sequence */
/* keySize = 16 or 24 or 32 (bytes) */ /* keySize = 16 or 24 or 32 (bytes) */
typedef void (MY_FAST_CALL *AES_SET_KEY_FUNC)(UInt32 *aes, const Byte *key, unsigned keySize); typedef void (Z7_FASTCALL *AES_SET_KEY_FUNC)(UInt32 *aes, const Byte *key, unsigned keySize);
void MY_FAST_CALL Aes_SetKey_Enc(UInt32 *aes, const Byte *key, unsigned keySize); void Z7_FASTCALL Aes_SetKey_Enc(UInt32 *aes, const Byte *key, unsigned keySize);
void MY_FAST_CALL Aes_SetKey_Dec(UInt32 *aes, const Byte *key, unsigned keySize); void Z7_FASTCALL Aes_SetKey_Dec(UInt32 *aes, const Byte *key, unsigned keySize);
/* ivAes - 16-byte aligned pointer to iv+keyMode+roundKeys sequence: UInt32[AES_NUM_IVMRK_WORDS] */ /* ivAes - 16-byte aligned pointer to iv+keyMode+roundKeys sequence: UInt32[AES_NUM_IVMRK_WORDS] */
void AesCbc_Init(UInt32 *ivAes, const Byte *iv); /* iv size is AES_BLOCK_SIZE */ void AesCbc_Init(UInt32 *ivAes, const Byte *iv); /* iv size is AES_BLOCK_SIZE */
/* data - 16-byte aligned pointer to data */ /* data - 16-byte aligned pointer to data */
/* numBlocks - the number of 16-byte blocks in data array */ /* numBlocks - the number of 16-byte blocks in data array */
typedef void (MY_FAST_CALL *AES_CODE_FUNC)(UInt32 *ivAes, Byte *data, size_t numBlocks); typedef void (Z7_FASTCALL *AES_CODE_FUNC)(UInt32 *ivAes, Byte *data, size_t numBlocks);
extern AES_CODE_FUNC g_AesCbc_Decode; extern AES_CODE_FUNC g_AesCbc_Decode;
#ifndef _SFX #ifndef Z7_SFX
extern AES_CODE_FUNC g_AesCbc_Encode; extern AES_CODE_FUNC g_AesCbc_Encode;
extern AES_CODE_FUNC g_AesCtr_Code; extern AES_CODE_FUNC g_AesCtr_Code;
#define k_Aes_SupportedFunctions_HW (1 << 2) #define k_Aes_SupportedFunctions_HW (1 << 2)
@ -41,19 +41,19 @@ extern UInt32 g_Aes_SupportedFunctions_Flags;
#endif #endif
#define DECLARE__AES_CODE_FUNC(funcName) \ #define Z7_DECLARE_AES_CODE_FUNC(funcName) \
void MY_FAST_CALL funcName(UInt32 *ivAes, Byte *data, size_t numBlocks); void Z7_FASTCALL funcName(UInt32 *ivAes, Byte *data, size_t numBlocks);
DECLARE__AES_CODE_FUNC (AesCbc_Encode) Z7_DECLARE_AES_CODE_FUNC (AesCbc_Encode)
DECLARE__AES_CODE_FUNC (AesCbc_Decode) Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode)
DECLARE__AES_CODE_FUNC (AesCtr_Code) Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code)
DECLARE__AES_CODE_FUNC (AesCbc_Encode_HW) Z7_DECLARE_AES_CODE_FUNC (AesCbc_Encode_HW)
DECLARE__AES_CODE_FUNC (AesCbc_Decode_HW) Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode_HW)
DECLARE__AES_CODE_FUNC (AesCtr_Code_HW) Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code_HW)
DECLARE__AES_CODE_FUNC (AesCbc_Decode_HW_256) Z7_DECLARE_AES_CODE_FUNC (AesCbc_Decode_HW_256)
DECLARE__AES_CODE_FUNC (AesCtr_Code_HW_256) Z7_DECLARE_AES_CODE_FUNC (AesCtr_Code_HW_256)
EXTERN_C_END EXTERN_C_END

(A file's diff is suppressed here because it is too large.)

C/Alloc.c (328 lines changed)

@ -1,38 +1,53 @@
/* Alloc.c -- Memory allocation functions /* Alloc.c -- Memory allocation functions
2021-07-13 : Igor Pavlov : Public domain */ 2024-02-18 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include <stdio.h>
#ifdef _WIN32 #ifdef _WIN32
#include <Windows.h> #include "7zWindows.h"
#endif #endif
#include <stdlib.h> #include <stdlib.h>
#include "Alloc.h" #include "Alloc.h"
/* #define _SZ_ALLOC_DEBUG */ #if defined(Z7_LARGE_PAGES) && defined(_WIN32) && \
(!defined(Z7_WIN32_WINNT_MIN) || Z7_WIN32_WINNT_MIN < 0x0502) // < Win2003 (xp-64)
#define Z7_USE_DYN_GetLargePageMinimum
#endif
/* use _SZ_ALLOC_DEBUG to debug alloc/free operations */ // for debug:
#ifdef _SZ_ALLOC_DEBUG #if 0
#if defined(__CHERI__) && defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
// #pragma message("=== Z7_ALLOC_NO_OFFSET_ALLOCATOR === ")
#define Z7_ALLOC_NO_OFFSET_ALLOCATOR
#endif
#endif
// #define SZ_ALLOC_DEBUG
/* #define SZ_ALLOC_DEBUG */
/* use SZ_ALLOC_DEBUG to debug alloc/free operations */
#ifdef SZ_ALLOC_DEBUG
#include <string.h>
#include <stdio.h> #include <stdio.h>
int g_allocCount = 0; static int g_allocCount = 0;
int g_allocCountMid = 0; #ifdef _WIN32
int g_allocCountBig = 0; static int g_allocCountMid = 0;
static int g_allocCountBig = 0;
#endif
#define CONVERT_INT_TO_STR(charType, tempSize) \ #define CONVERT_INT_TO_STR(charType, tempSize) \
unsigned char temp[tempSize]; unsigned i = 0; \ char temp[tempSize]; unsigned i = 0; \
while (val >= 10) { temp[i++] = (unsigned char)('0' + (unsigned)(val % 10)); val /= 10; } \ while (val >= 10) { temp[i++] = (char)('0' + (unsigned)(val % 10)); val /= 10; } \
*s++ = (charType)('0' + (unsigned)val); \ *s++ = (charType)('0' + (unsigned)val); \
while (i != 0) { i--; *s++ = temp[i]; } \ while (i != 0) { i--; *s++ = temp[i]; } \
*s = 0; *s = 0;
static void ConvertUInt64ToString(UInt64 val, char *s) static void ConvertUInt64ToString(UInt64 val, char *s)
{ {
CONVERT_INT_TO_STR(char, 24); CONVERT_INT_TO_STR(char, 24)
} }
#define GET_HEX_CHAR(t) ((char)(((t < 10) ? ('0' + t) : ('A' + (t - 10))))) #define GET_HEX_CHAR(t) ((char)(((t < 10) ? ('0' + t) : ('A' + (t - 10)))))
@ -77,7 +92,7 @@ static void PrintAligned(const char *s, size_t align)
Print(s); Print(s);
} }
static void PrintLn() static void PrintLn(void)
{ {
Print("\n"); Print("\n");
} }
@ -89,10 +104,10 @@ static void PrintHex(UInt64 v, size_t align)
PrintAligned(s, align); PrintAligned(s, align);
} }
static void PrintDec(UInt64 v, size_t align) static void PrintDec(int v, size_t align)
{ {
char s[32]; char s[32];
ConvertUInt64ToString(v, s); ConvertUInt64ToString((unsigned)v, s);
PrintAligned(s, align); PrintAligned(s, align);
} }
@ -102,12 +117,19 @@ static void PrintAddr(void *p)
} }
#define PRINT_ALLOC(name, cnt, size, ptr) \ #define PRINT_REALLOC(name, cnt, size, ptr) { \
Print(name " "); \
if (!ptr) PrintDec(cnt++, 10); \
PrintHex(size, 10); \
PrintAddr(ptr); \
PrintLn(); }
#define PRINT_ALLOC(name, cnt, size, ptr) { \
Print(name " "); \ Print(name " "); \
PrintDec(cnt++, 10); \ PrintDec(cnt++, 10); \
PrintHex(size, 10); \ PrintHex(size, 10); \
PrintAddr(ptr); \ PrintAddr(ptr); \
PrintLn(); PrintLn(); }
#define PRINT_FREE(name, cnt, ptr) if (ptr) { \ #define PRINT_FREE(name, cnt, ptr) if (ptr) { \
Print(name " "); \ Print(name " "); \
@ -117,26 +139,45 @@ static void PrintAddr(void *p)
#else #else
#ifdef _WIN32
#define PRINT_ALLOC(name, cnt, size, ptr) #define PRINT_ALLOC(name, cnt, size, ptr)
#endif
#define PRINT_FREE(name, cnt, ptr) #define PRINT_FREE(name, cnt, ptr)
#define Print(s) #define Print(s)
#define PrintLn() #define PrintLn()
#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
#define PrintHex(v, align) #define PrintHex(v, align)
#endif
#define PrintAddr(p) #define PrintAddr(p)
#endif #endif
/*
by specification:
malloc(0) : returns NULL or a unique pointer value that can later be successfully passed to free()
realloc(NULL, size) : the call is equivalent to malloc(size)
realloc(non_NULL, 0) : the call is equivalent to free(ptr)
in main compilers:
malloc(0) : returns non_NULL
realloc(NULL, 0) : returns non_NULL
realloc(non_NULL, 0) : returns NULL
*/
void *MyAlloc(size_t size) void *MyAlloc(size_t size)
{ {
if (size == 0) if (size == 0)
return NULL; return NULL;
PRINT_ALLOC("Alloc ", g_allocCount, size, NULL); // PRINT_ALLOC("Alloc ", g_allocCount, size, NULL)
#ifdef _SZ_ALLOC_DEBUG #ifdef SZ_ALLOC_DEBUG
{ {
void *p = malloc(size); void *p = malloc(size);
// PRINT_ALLOC("Alloc ", g_allocCount, size, p); if (p)
{
PRINT_ALLOC("Alloc ", g_allocCount, size, p)
}
return p; return p;
} }
#else #else
@ -146,71 +187,107 @@ void *MyAlloc(size_t size)
void MyFree(void *address) void MyFree(void *address)
{ {
PRINT_FREE("Free ", g_allocCount, address); PRINT_FREE("Free ", g_allocCount, address)
free(address); free(address);
} }
void *MyRealloc(void *address, size_t size)
{
if (size == 0)
{
MyFree(address);
return NULL;
}
// PRINT_REALLOC("Realloc ", g_allocCount, size, address)
#ifdef SZ_ALLOC_DEBUG
{
void *p = realloc(address, size);
if (p)
{
PRINT_REALLOC("Realloc ", g_allocCount, size, address)
}
return p;
}
#else
return realloc(address, size);
#endif
}
#ifdef _WIN32 #ifdef _WIN32
void *MidAlloc(size_t size) void *MidAlloc(size_t size)
{ {
if (size == 0) if (size == 0)
return NULL; return NULL;
#ifdef SZ_ALLOC_DEBUG
PRINT_ALLOC("Alloc-Mid", g_allocCountMid, size, NULL); {
void *p = VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE);
if (p)
{
PRINT_ALLOC("Alloc-Mid", g_allocCountMid, size, p)
}
return p;
}
#else
return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE);
#endif
} }
void MidFree(void *address) void MidFree(void *address)
{ {
PRINT_FREE("Free-Mid", g_allocCountMid, address); PRINT_FREE("Free-Mid", g_allocCountMid, address)
if (!address) if (!address)
return; return;
VirtualFree(address, 0, MEM_RELEASE); VirtualFree(address, 0, MEM_RELEASE);
} }
#ifdef _7ZIP_LARGE_PAGES #ifdef Z7_LARGE_PAGES
#ifdef MEM_LARGE_PAGES #ifdef MEM_LARGE_PAGES
#define MY__MEM_LARGE_PAGES MEM_LARGE_PAGES #define MY_MEM_LARGE_PAGES MEM_LARGE_PAGES
#else #else
#define MY__MEM_LARGE_PAGES 0x20000000 #define MY_MEM_LARGE_PAGES 0x20000000
#endif #endif
extern extern
SIZE_T g_LargePageSize; SIZE_T g_LargePageSize;
SIZE_T g_LargePageSize = 0; SIZE_T g_LargePageSize = 0;
typedef SIZE_T (WINAPI *GetLargePageMinimumP)(VOID); typedef SIZE_T (WINAPI *Func_GetLargePageMinimum)(VOID);
#endif // _7ZIP_LARGE_PAGES void SetLargePageSize(void)
void SetLargePageSize()
{ {
#ifdef _7ZIP_LARGE_PAGES
SIZE_T size; SIZE_T size;
GetLargePageMinimumP largePageMinimum = (GetLargePageMinimumP) #ifdef Z7_USE_DYN_GetLargePageMinimum
GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "GetLargePageMinimum"); Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
if (!largePageMinimum)
const
Func_GetLargePageMinimum fn =
(Func_GetLargePageMinimum) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")),
"GetLargePageMinimum");
if (!fn)
return; return;
size = largePageMinimum(); size = fn();
#else
size = GetLargePageMinimum();
#endif
if (size == 0 || (size & (size - 1)) != 0) if (size == 0 || (size & (size - 1)) != 0)
return; return;
g_LargePageSize = size; g_LargePageSize = size;
#endif
} }
#endif // Z7_LARGE_PAGES
void *BigAlloc(size_t size) void *BigAlloc(size_t size)
{ {
if (size == 0) if (size == 0)
return NULL; return NULL;
PRINT_ALLOC("Alloc-Big", g_allocCountBig, size, NULL); PRINT_ALLOC("Alloc-Big", g_allocCountBig, size, NULL)
#ifdef _7ZIP_LARGE_PAGES #ifdef Z7_LARGE_PAGES
{ {
SIZE_T ps = g_LargePageSize; SIZE_T ps = g_LargePageSize;
if (ps != 0 && ps <= (1 << 30) && size > (ps / 2)) if (ps != 0 && ps <= (1 << 30) && size > (ps / 2))
@ -220,56 +297,43 @@ void *BigAlloc(size_t size)
size2 = (size + ps) & ~ps; size2 = (size + ps) & ~ps;
if (size2 >= size) if (size2 >= size)
{ {
void *res = VirtualAlloc(NULL, size2, MEM_COMMIT | MY__MEM_LARGE_PAGES, PAGE_READWRITE); void *p = VirtualAlloc(NULL, size2, MEM_COMMIT | MY_MEM_LARGE_PAGES, PAGE_READWRITE);
if (res) if (p)
return res; {
PRINT_ALLOC("Alloc-BM ", g_allocCountMid, size2, p)
return p;
}
} }
} }
} }
#endif #endif
return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); return MidAlloc(size);
} }
void BigFree(void *address) void BigFree(void *address)
{ {
PRINT_FREE("Free-Big", g_allocCountBig, address); PRINT_FREE("Free-Big", g_allocCountBig, address)
MidFree(address);
if (!address)
return;
VirtualFree(address, 0, MEM_RELEASE);
} }
#endif #endif // _WIN32
static void *SzAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MyAlloc(size); } static void *SzAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return MyAlloc(size); }
static void SzFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MyFree(address); } static void SzFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) MyFree(address); }
const ISzAlloc g_Alloc = { SzAlloc, SzFree }; const ISzAlloc g_Alloc = { SzAlloc, SzFree };
#ifdef _WIN32 #ifdef _WIN32
static void *SzMidAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return MidAlloc(size); } static void *SzMidAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return MidAlloc(size); }
static void SzMidFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); MidFree(address); } static void SzMidFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) MidFree(address); }
static void *SzBigAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p); return BigAlloc(size); } static void *SzBigAlloc(ISzAllocPtr p, size_t size) { UNUSED_VAR(p) return BigAlloc(size); }
static void SzBigFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p); BigFree(address); } static void SzBigFree(ISzAllocPtr p, void *address) { UNUSED_VAR(p) BigFree(address); }
const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree }; const ISzAlloc g_MidAlloc = { SzMidAlloc, SzMidFree };
const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree }; const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree };
#endif #endif
/* #ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
uintptr_t : <stdint.h> C99 (optional)
: unsupported in VS6
*/
#ifdef _WIN32
typedef UINT_PTR UIntPtr;
#else
/*
typedef uintptr_t UIntPtr;
*/
typedef ptrdiff_t UIntPtr;
#endif
#define ADJUST_ALLOC_SIZE 0 #define ADJUST_ALLOC_SIZE 0
/* /*
@ -280,14 +344,36 @@ const ISzAlloc g_BigAlloc = { SzBigAlloc, SzBigFree };
MyAlloc() can return address that is NOT multiple of sizeof(void *). MyAlloc() can return address that is NOT multiple of sizeof(void *).
*/ */
/* /*
#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((char *)(p) - ((size_t)(UIntPtr)(p) & ((align) - 1)))) uintptr_t : <stdint.h> C99 (optional)
: unsupported in VS6
*/ */
#define MY_ALIGN_PTR_DOWN(p, align) ((void *)((((UIntPtr)(p)) & ~((UIntPtr)(align) - 1)))) typedef
#ifdef _WIN32
UINT_PTR
#elif 1
uintptr_t
#else
ptrdiff_t
#endif
MY_uintptr_t;
#if 0 \
|| (defined(__CHERI__) \
|| defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ > 8))
// for 128-bit pointers (cheri):
#define MY_ALIGN_PTR_DOWN(p, align) \
((void *)((char *)(p) - ((size_t)(MY_uintptr_t)(p) & ((align) - 1))))
#else
#define MY_ALIGN_PTR_DOWN(p, align) \
((void *)((((MY_uintptr_t)(p)) & ~((MY_uintptr_t)(align) - 1))))
#endif
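Both branches round a pointer down to a power-of-two boundary; the subtraction form is kept for CHERI-style capability pointers, where only the address offset may be manipulated arithmetically. A quick standalone check of the mask form (ALIGN_DOWN is an illustrative stand-in):

#include <assert.h>
#include <stdint.h>

#define ALIGN_DOWN(p, align) \
  ((void *)(((uintptr_t)(p)) & ~((uintptr_t)(align) - 1)))

int main(void)
{
  char buf[64];
  char *p = buf + 37;
  char *a = (char *)ALIGN_DOWN(p, 16);
  assert(((uintptr_t)a & 15) == 0);  /* 16-byte aligned */
  assert(a <= p && p - a < 16);      /* within one step below p */
  return 0;
}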
#if !defined(_WIN32) && defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L) #endif
#if !defined(_WIN32) \
&& (defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR) \
|| defined(_POSIX_C_SOURCE) && (_POSIX_C_SOURCE >= 200112L))
#define USE_posix_memalign #define USE_posix_memalign
#endif #endif
@ -327,14 +413,13 @@ static int posix_memalign(void **ptr, size_t align, size_t size)
#define ALLOC_ALIGN_SIZE ((size_t)1 << 7) #define ALLOC_ALIGN_SIZE ((size_t)1 << 7)
static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size) void *z7_AlignedAlloc(size_t size)
{ {
#ifndef USE_posix_memalign #ifndef USE_posix_memalign
void *p; void *p;
void *pAligned; void *pAligned;
size_t newSize; size_t newSize;
UNUSED_VAR(pp);
/* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned /* also we can allocate additional dummy ALLOC_ALIGN_SIZE bytes after aligned
block to prevent cache line sharing with other allocated blocks block to prevent cache line sharing with other allocated blocks
@ -359,10 +444,9 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size)
return pAligned; return pAligned;
#else #else
void *p; void *p;
UNUSED_VAR(pp);
if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size)) if (posix_memalign(&p, ALLOC_ALIGN_SIZE, size))
return NULL; return NULL;
@ -371,19 +455,37 @@ static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size)
return p; return p;
#endif #endif
}
void z7_AlignedFree(void *address)
{
#ifndef USE_posix_memalign
if (address)
MyFree(((void **)address)[-1]);
#else
free(address);
#endif
}
static void *SzAlignedAlloc(ISzAllocPtr pp, size_t size)
{
UNUSED_VAR(pp)
return z7_AlignedAlloc(size);
} }
static void SzAlignedFree(ISzAllocPtr pp, void *address) static void SzAlignedFree(ISzAllocPtr pp, void *address)
{ {
UNUSED_VAR(pp); UNUSED_VAR(pp)
#ifndef USE_posix_memalign #ifndef USE_posix_memalign
if (address) if (address)
MyFree(((void **)address)[-1]); MyFree(((void **)address)[-1]);
#else #else
free(address); free(address);
#endif #endif
} }
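The #ifndef USE_posix_memalign path above is the classic over-allocate-and-stash pattern: allocate extra, round up to the boundary, and keep the original pointer in the word just below the aligned block so the free side can recover it. A standalone sketch of the same idea (not the exact code above; assumes align is a power of two and at least sizeof(void *)):

#include <stdint.h>
#include <stdlib.h>

static void *AlignedAlloc_Stash(size_t size, size_t align)
{
  void *base = malloc(size + align - 1 + sizeof(void *));
  if (!base)
    return NULL;
  {
    const uintptr_t a = ((uintptr_t)base + sizeof(void *) + (align - 1))
        & ~(uintptr_t)(align - 1);
    ((void **)a)[-1] = base;  /* stash the malloc() result below the block */
    return (void *)a;
  }
}

static void AlignedFree_Stash(void *p)
{
  if (p)
    free(((void **)p)[-1]);  /* mirrors SzAlignedFree above */
}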
@ -391,17 +493,45 @@ const ISzAlloc g_AlignedAlloc = { SzAlignedAlloc, SzAlignedFree };
#define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *))
/* we align ptr to support cases where CAlignOffsetAlloc::offset is not a multiple of sizeof(void *) */ /* we align ptr to support cases where CAlignOffsetAlloc::offset is not a multiple of sizeof(void *) */
#define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1] #ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
/* #if 1
#define REAL_BLOCK_PTR_VAR(p) ((void **)(p))[-1] #define MY_ALIGN_PTR_DOWN_1(p) MY_ALIGN_PTR_DOWN(p, sizeof(void *))
*/ #define REAL_BLOCK_PTR_VAR(p) ((void **)MY_ALIGN_PTR_DOWN_1(p))[-1]
#else
// we can use this simplified code,
// if (CAlignOffsetAlloc::offset == (k * sizeof(void *))
#define REAL_BLOCK_PTR_VAR(p) (((void **)(p))[-1])
#endif
#endif
#if 0
#ifndef Z7_ALLOC_NO_OFFSET_ALLOCATOR
#include <stdio.h>
static void PrintPtr(const char *s, const void *p)
{
const Byte *p2 = (const Byte *)&p;
unsigned i;
printf("%s %p ", s, p);
for (i = sizeof(p); i != 0;)
{
i--;
printf("%02x", p2[i]);
}
printf("\n");
}
#endif
#endif
static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size) static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size)
{ {
CAlignOffsetAlloc *p = CONTAINER_FROM_VTBL(pp, CAlignOffsetAlloc, vt); #if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR)
UNUSED_VAR(pp)
return z7_AlignedAlloc(size);
#else
const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt);
void *adr; void *adr;
void *pAligned; void *pAligned;
size_t newSize; size_t newSize;
@ -429,6 +559,12 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size)
pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr + pAligned = (char *)MY_ALIGN_PTR_DOWN((char *)adr +
alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset; alignSize - p->offset + extra + ADJUST_ALLOC_SIZE, alignSize) + p->offset;
#if 0
printf("\nalignSize = %6x, offset=%6x, size=%8x \n", (unsigned)alignSize, (unsigned)p->offset, (unsigned)size);
PrintPtr("base", adr);
PrintPtr("alig", pAligned);
#endif
PrintLn(); PrintLn();
Print("- Aligned: "); Print("- Aligned: ");
Print(" size="); PrintHex(size, 8); Print(" size="); PrintHex(size, 8);
@ -440,19 +576,25 @@ static void *AlignOffsetAlloc_Alloc(ISzAllocPtr pp, size_t size)
REAL_BLOCK_PTR_VAR(pAligned) = adr; REAL_BLOCK_PTR_VAR(pAligned) = adr;
return pAligned; return pAligned;
#endif
} }
static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address) static void AlignOffsetAlloc_Free(ISzAllocPtr pp, void *address)
{ {
#if defined(Z7_ALLOC_NO_OFFSET_ALLOCATOR)
UNUSED_VAR(pp)
z7_AlignedFree(address);
#else
if (address) if (address)
{ {
CAlignOffsetAlloc *p = CONTAINER_FROM_VTBL(pp, CAlignOffsetAlloc, vt); const CAlignOffsetAlloc *p = Z7_CONTAINER_FROM_VTBL_CONST(pp, CAlignOffsetAlloc, vt);
PrintLn(); PrintLn();
Print("- Aligned Free: "); Print("- Aligned Free: ");
PrintLn(); PrintLn();
ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address)); ISzAlloc_Free(p->baseAlloc, REAL_BLOCK_PTR_VAR(address));
} }
#endif
} }

View file

@ -1,31 +1,49 @@
/* Alloc.h -- Memory allocation functions /* Alloc.h -- Memory allocation functions
2021-07-13 : Igor Pavlov : Public domain */ 2024-01-22 : Igor Pavlov : Public domain */
#ifndef __COMMON_ALLOC_H #ifndef ZIP7_INC_ALLOC_H
#define __COMMON_ALLOC_H #define ZIP7_INC_ALLOC_H
#include "7zTypes.h" #include "7zTypes.h"
EXTERN_C_BEGIN EXTERN_C_BEGIN
/*
MyFree(NULL) : is allowed, as free(NULL)
MyAlloc(0) : returns NULL : but malloc(0) is allowed to return NULL or non_NULL
MyRealloc(NULL, 0) : returns NULL : but realloc(NULL, 0) is allowed to return NULL or non_NULL
MyRealloc() is similar to realloc() for the following cases:
MyRealloc(non_NULL, 0) : returns NULL and always calls MyFree(ptr)
MyRealloc(NULL, non_ZERO) : returns NULL, if allocation failed
MyRealloc(non_NULL, non_ZERO) : returns NULL, if reallocation failed
*/
void *MyAlloc(size_t size); void *MyAlloc(size_t size);
void MyFree(void *address); void MyFree(void *address);
void *MyRealloc(void *address, size_t size);
void *z7_AlignedAlloc(size_t size);
void z7_AlignedFree(void *p);
#ifdef _WIN32 #ifdef _WIN32
#ifdef Z7_LARGE_PAGES
void SetLargePageSize(void); void SetLargePageSize(void);
#endif
void *MidAlloc(size_t size); void *MidAlloc(size_t size);
void MidFree(void *address); void MidFree(void *address);
void *BigAlloc(size_t size); void *BigAlloc(size_t size);
void BigFree(void *address); void BigFree(void *address);
/* #define Z7_BIG_ALLOC_IS_ZERO_FILLED */
#else #else
#define MidAlloc(size) MyAlloc(size) #define MidAlloc(size) z7_AlignedAlloc(size)
#define MidFree(address) MyFree(address) #define MidFree(address) z7_AlignedFree(address)
#define BigAlloc(size) MyAlloc(size) #define BigAlloc(size) z7_AlignedAlloc(size)
#define BigFree(address) MyFree(address) #define BigFree(address) z7_AlignedFree(address)
#endif #endif

C/Asm_c.mak (new file, 12 lines)

@ -0,0 +1,12 @@
!IFDEF ASM_OBJS
!IF "$(PLATFORM)" == "arm64"
$(ASM_OBJS): ../../../Asm/arm64/$(*B).S
$(COMPL_ASM_CLANG)
!ELSEIF "$(PLATFORM)" == "arm"
$(ASM_OBJS): ../../../Asm/arm/$(*B).asm
$(COMPL_ASM)
!ELSEIF "$(PLATFORM)" != "ia64" && "$(PLATFORM)" != "mips"
$(ASM_OBJS): ../../../Asm/x86/$(*B).asm
$(COMPL_ASM)
!ENDIF
!ENDIF

C/Bcj2.c (323 lines changed)

@ -1,29 +1,24 @@
/* Bcj2.c -- BCJ2 Decoder (Converter for x86 code) /* Bcj2.c -- BCJ2 Decoder (Converter for x86 code)
2021-02-09 : Igor Pavlov : Public domain */ 2023-03-01 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include "Bcj2.h" #include "Bcj2.h"
#include "CpuArch.h" #include "CpuArch.h"
#define CProb UInt16
#define kTopValue ((UInt32)1 << 24) #define kTopValue ((UInt32)1 << 24)
#define kNumModelBits 11 #define kNumBitModelTotalBits 11
#define kBitModelTotal (1 << kNumModelBits) #define kBitModelTotal (1 << kNumBitModelTotalBits)
#define kNumMoveBits 5 #define kNumMoveBits 5
#define _IF_BIT_0 ttt = *prob; bound = (p->range >> kNumModelBits) * ttt; if (p->code < bound) // UInt32 bcj2_stats[256 + 2][2];
#define _UPDATE_0 p->range = bound; *prob = (CProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
#define _UPDATE_1 p->range -= bound; p->code -= bound; *prob = (CProb)(ttt - (ttt >> kNumMoveBits));
void Bcj2Dec_Init(CBcj2Dec *p) void Bcj2Dec_Init(CBcj2Dec *p)
{ {
unsigned i; unsigned i;
p->state = BCJ2_STREAM_RC; // BCJ2_DEC_STATE_OK;
p->state = BCJ2_DEC_STATE_OK;
p->ip = 0; p->ip = 0;
p->temp[3] = 0; p->temp = 0;
p->range = 0; p->range = 0;
p->code = 0; p->code = 0;
for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++) for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++)
@ -32,217 +27,248 @@ void Bcj2Dec_Init(CBcj2Dec *p)
SRes Bcj2Dec_Decode(CBcj2Dec *p) SRes Bcj2Dec_Decode(CBcj2Dec *p)
{ {
UInt32 v = p->temp;
// const Byte *src;
if (p->range <= 5) if (p->range <= 5)
{ {
p->state = BCJ2_DEC_STATE_OK; UInt32 code = p->code;
p->state = BCJ2_DEC_STATE_ERROR; /* for case if we return SZ_ERROR_DATA; */
for (; p->range != 5; p->range++) for (; p->range != 5; p->range++)
{ {
if (p->range == 1 && p->code != 0) if (p->range == 1 && code != 0)
return SZ_ERROR_DATA; return SZ_ERROR_DATA;
if (p->bufs[BCJ2_STREAM_RC] == p->lims[BCJ2_STREAM_RC]) if (p->bufs[BCJ2_STREAM_RC] == p->lims[BCJ2_STREAM_RC])
{ {
p->state = BCJ2_STREAM_RC; p->state = BCJ2_STREAM_RC;
return SZ_OK; return SZ_OK;
} }
code = (code << 8) | *(p->bufs[BCJ2_STREAM_RC])++;
p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++; p->code = code;
} }
if (code == 0xffffffff)
if (p->code == 0xFFFFFFFF)
return SZ_ERROR_DATA; return SZ_ERROR_DATA;
p->range = 0xffffffff;
p->range = 0xFFFFFFFF;
} }
else if (p->state >= BCJ2_DEC_STATE_ORIG_0) // else
{ {
while (p->state <= BCJ2_DEC_STATE_ORIG_3) unsigned state = p->state;
// we check BCJ2_IS_32BIT_STREAM() here instead of check in the main loop
if (BCJ2_IS_32BIT_STREAM(state))
{
const Byte *cur = p->bufs[state];
if (cur == p->lims[state])
return SZ_OK;
p->bufs[state] = cur + 4;
{
const UInt32 ip = p->ip + 4;
v = GetBe32a(cur) - ip;
p->ip = ip;
}
state = BCJ2_DEC_STATE_ORIG_0;
}
if ((unsigned)(state - BCJ2_DEC_STATE_ORIG_0) < 4)
{ {
Byte *dest = p->dest; Byte *dest = p->dest;
if (dest == p->destLim) for (;;)
return SZ_OK;
*dest = p->temp[(size_t)p->state - BCJ2_DEC_STATE_ORIG_0];
p->state++;
p->dest = dest + 1;
}
}
/*
if (BCJ2_IS_32BIT_STREAM(p->state))
{
const Byte *cur = p->bufs[p->state];
if (cur == p->lims[p->state])
return SZ_OK;
p->bufs[p->state] = cur + 4;
{
UInt32 val;
Byte *dest;
SizeT rem;
p->ip += 4;
val = GetBe32(cur) - p->ip;
dest = p->dest;
rem = p->destLim - dest;
if (rem < 4)
{ {
SizeT i; if (dest == p->destLim)
SetUi32(p->temp, val); {
for (i = 0; i < rem; i++) p->state = state;
dest[i] = p->temp[i]; p->temp = v;
p->dest = dest + rem; return SZ_OK;
p->state = BCJ2_DEC_STATE_ORIG_0 + (unsigned)rem; }
return SZ_OK; *dest++ = (Byte)v;
p->dest = dest;
if (++state == BCJ2_DEC_STATE_ORIG_3 + 1)
break;
v >>= 8;
} }
SetUi32(dest, val);
p->temp[3] = (Byte)(val >> 24);
p->dest = dest + 4;
p->state = BCJ2_DEC_STATE_OK;
} }
} }
*/
// src = p->bufs[BCJ2_STREAM_MAIN];
for (;;) for (;;)
{ {
/*
if (BCJ2_IS_32BIT_STREAM(p->state)) if (BCJ2_IS_32BIT_STREAM(p->state))
p->state = BCJ2_DEC_STATE_OK; p->state = BCJ2_DEC_STATE_OK;
else else
*/
{ {
if (p->range < kTopValue) if (p->range < kTopValue)
{ {
if (p->bufs[BCJ2_STREAM_RC] == p->lims[BCJ2_STREAM_RC]) if (p->bufs[BCJ2_STREAM_RC] == p->lims[BCJ2_STREAM_RC])
{ {
p->state = BCJ2_STREAM_RC; p->state = BCJ2_STREAM_RC;
p->temp = v;
return SZ_OK; return SZ_OK;
} }
p->range <<= 8; p->range <<= 8;
p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++; p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++;
} }
{ {
const Byte *src = p->bufs[BCJ2_STREAM_MAIN]; const Byte *src = p->bufs[BCJ2_STREAM_MAIN];
const Byte *srcLim; const Byte *srcLim;
Byte *dest; Byte *dest = p->dest;
SizeT num = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - src);
if (num == 0)
{ {
p->state = BCJ2_STREAM_MAIN; const SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - src);
return SZ_OK; SizeT num = (SizeT)(p->destLim - dest);
if (num >= rem)
num = rem;
#define NUM_ITERS 4
#if (NUM_ITERS & (NUM_ITERS - 1)) == 0
num &= ~((SizeT)NUM_ITERS - 1); // if (NUM_ITERS == (1 << x))
#else
num -= num % NUM_ITERS; // if (NUM_ITERS != (1 << x))
#endif
srcLim = src + num;
} }
dest = p->dest; #define NUM_SHIFT_BITS 24
if (num > (SizeT)(p->destLim - dest)) #define ONE_ITER(indx) { \
{ const unsigned b = src[indx]; \
num = (SizeT)(p->destLim - dest); *dest++ = (Byte)b; \
if (num == 0) v = (v << NUM_SHIFT_BITS) | b; \
{ if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
      if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
          ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
      /* ++dest */; /* v = b; */ }

      if (src != srcLim)
      for (;;)
      {
        /* The dependency chain of 2-cycle for (v) calculation is not big problem here.
           But we can remove dependency chain with v = b in the end of loop. */
        ONE_ITER(0)
        #if (NUM_ITERS > 1)
          ONE_ITER(1)
        #if (NUM_ITERS > 2)
          ONE_ITER(2)
        #if (NUM_ITERS > 3)
          ONE_ITER(3)
        #if (NUM_ITERS > 4)
          ONE_ITER(4)
        #if (NUM_ITERS > 5)
          ONE_ITER(5)
        #if (NUM_ITERS > 6)
          ONE_ITER(6)
        #if (NUM_ITERS > 7)
          ONE_ITER(7)
        #endif
        #endif
        #endif
        #endif
        #endif
        #endif
        #endif
        src += NUM_ITERS;
        if (src == srcLim)
          break;
      }

      if (src == srcLim)
      #if (NUM_ITERS > 1)
        for (;;)
      #endif
      {
        #if (NUM_ITERS > 1)
        if (src == p->lims[BCJ2_STREAM_MAIN] || dest == p->destLim)
        #endif
        {
          const SizeT num = (SizeT)(src - p->bufs[BCJ2_STREAM_MAIN]);
          p->bufs[BCJ2_STREAM_MAIN] = src;
          p->dest = dest;
          p->ip += (UInt32)num;
          /* state BCJ2_STREAM_MAIN has more priority than BCJ2_STATE_ORIG */
          p->state =
            src == p->lims[BCJ2_STREAM_MAIN] ?
              (unsigned)BCJ2_STREAM_MAIN :
              (unsigned)BCJ2_DEC_STATE_ORIG;
          p->temp = v;
          return SZ_OK;
        }
        #if (NUM_ITERS > 1)
        ONE_ITER(0)
        src++;
        #endif
      }

      {
        const SizeT num = (SizeT)(dest - p->dest);
        p->dest = dest; // p->dest += num;
        p->bufs[BCJ2_STREAM_MAIN] += num; // = src;
        p->ip += (UInt32)num;
      }
      {
        UInt32 bound, ttt;
        CBcj2Prob *prob; // unsigned index;
        /*
        prob = p->probs + (unsigned)((Byte)v == 0xe8 ?
            2 + (Byte)(v >> 8) :
            ((v >> 5) & 1)); // ((Byte)v < 0xe8 ? 0 : 1));
        */
        {
          const unsigned c = ((v + 0x17) >> 6) & 1;
          prob = p->probs + (unsigned)
              (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
          // (Byte)
          // 8x->0     : e9->1     : xxe8->xx+2
          // 8x->0x100 : e9->0x101 : xxe8->xx
          // (((0x100 - (e & ~v)) & (0x100 | (v >> 8))) + (e & v));
          // (((0x101 + (~e | v)) & (0x100 | (v >> 8))) + (e & v));
        }
        ttt = *prob;
        bound = (p->range >> kNumBitModelTotalBits) * ttt;
        if (p->code < bound)
        {
          // bcj2_stats[prob - p->probs][0]++;
          p->range = bound;
          *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
          continue;
        }
        {
          // bcj2_stats[prob - p->probs][1]++;
          p->range -= bound;
          p->code -= bound;
          *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
        }
      }
    }

    {
      /* (v == 0xe8 ? 0 : 1) uses setcc instruction with additional zero register usage in x64 MSVC. */
      // const unsigned cj = ((Byte)v == 0xe8) ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP;
      const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
      const Byte *cur = p->bufs[cj];
      Byte *dest;
      SizeT rem;
      if (cur == p->lims[cj])
      {
        p->state = cj;
        break;
      }
      v = GetBe32a(cur);
      p->bufs[cj] = cur + 4;
      {
        const UInt32 ip = p->ip + 4;
        v -= ip;
        p->ip = ip;
      }
      dest = p->dest;
      rem = (SizeT)(p->destLim - dest);
      if (rem < 4)
      {
        if ((unsigned)rem > 0) { dest[0] = (Byte)v; v >>= 8;
        if ((unsigned)rem > 1) { dest[1] = (Byte)v; v >>= 8;
        if ((unsigned)rem > 2) { dest[2] = (Byte)v; v >>= 8; }}}
        p->temp = v;
        p->dest = dest + rem;
        p->state = BCJ2_DEC_STATE_ORIG_0 + (unsigned)rem;
        break;
      }
      SetUi32(dest, v)
      v >>= 24;
      p->dest = dest + 4;
    }
  }
@ -252,6 +278,13 @@ SRes Bcj2Dec_Decode(CBcj2Dec *p)
      p->range <<= 8;
      p->code = (p->code << 8) | *(p->bufs[BCJ2_STREAM_RC])++;
    }
  return SZ_OK;
}
#undef NUM_ITERS
#undef ONE_ITER
#undef NUM_SHIFT_BITS
#undef kTopValue
#undef kNumBitModelTotalBits
#undef kBitModelTotal
#undef kNumMoveBits
C/Bcj2.h  (268 changed lines)
@ -1,8 +1,8 @@
/* Bcj2.h -- BCJ2 converter for x86 code (Branch CALL/JUMP variant2)
2023-03-02 : Igor Pavlov : Public domain */

#ifndef ZIP7_INC_BCJ2_H
#define ZIP7_INC_BCJ2_H

#include "7zTypes.h"
@ -26,37 +26,68 @@ enum
  BCJ2_DEC_STATE_ORIG_3,
  BCJ2_DEC_STATE_ORIG,
  BCJ2_DEC_STATE_ERROR /* after detected data error */
};
enum
{
  BCJ2_ENC_STATE_ORIG = BCJ2_NUM_STREAMS,
  BCJ2_ENC_STATE_FINISHED /* it's state after fully encoded stream */
};
/* #define BCJ2_IS_32BIT_STREAM(s) ((s) == BCJ2_STREAM_CALL || (s) == BCJ2_STREAM_JUMP) */
#define BCJ2_IS_32BIT_STREAM(s) ((unsigned)((unsigned)(s) - (unsigned)BCJ2_STREAM_CALL) < 2)
/*
CBcj2Dec / CBcj2Enc
bufs sizes:
  BUF_SIZE(n) = lims[n] - bufs[n]
bufs sizes for BCJ2_STREAM_CALL and BCJ2_STREAM_JUMP must be a multiple of 4:
    (BUF_SIZE(BCJ2_STREAM_CALL) & 3) == 0
    (BUF_SIZE(BCJ2_STREAM_JUMP) & 3) == 0
*/
// typedef UInt32 CBcj2Prob;
typedef UInt16 CBcj2Prob;
/*
BCJ2 encoder / decoder internal requirements:
- If last bytes of stream contain marker (e8/e9/0f8x), then
there is also encoded symbol (0 : no conversion) in RC stream.
- One case of overlapped instructions is supported,
if last byte of converted instruction is (0f) and next byte is (8x):
marker [xx xx xx 0f] 8x
then the pair (0f 8x) is treated as marker.
*/
/* ---------- BCJ2 Decoder ---------- */
/*
CBcj2Dec:
(dest) is allowed to overlap with bufs[BCJ2_STREAM_MAIN], with the following conditions:
  bufs[BCJ2_STREAM_MAIN] >= dest &&
  bufs[BCJ2_STREAM_MAIN] - dest >=
        BUF_SIZE(BCJ2_STREAM_CALL) +
        BUF_SIZE(BCJ2_STREAM_JUMP)
  reserve = bufs[BCJ2_STREAM_MAIN] - dest -
      ( BUF_SIZE(BCJ2_STREAM_CALL) +
        BUF_SIZE(BCJ2_STREAM_JUMP) )
  and additional conditions:
  if (it's first call of Bcj2Dec_Decode() after Bcj2Dec_Init())
  {
    (reserve != 1) : if (ver < v23.00)
  }
  else // if there are more than one calls of Bcj2Dec_Decode() after Bcj2Dec_Init()
  {
    (reserve >= 6) : if (ver < v23.00)
    (reserve >= 4) : if (ver >= v23.00)
    We need that (reserve) because after first call of Bcj2Dec_Decode(),
    CBcj2Dec::temp can contain up to 4 bytes for writing to (dest).
  }
  (reserve == 0) is allowed, if we decode full stream via single call of Bcj2Dec_Decode().
  (reserve == 0) also is allowed in case of multi-call, if we use fixed buffers,
  and (reserve) is calculated from full (final) sizes of all streams before first call.
*/
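The reserve rules above are easy to get wrong in caller code; here is a minimal hedged sketch of the arithmetic (the function and its names are illustrative, not from the source):

    /* check the in-place overlap condition for dest and bufs[BCJ2_STREAM_MAIN];
       offsets are positions of dest and of the MAIN stream inside one buffer */
    static int bcj2_overlap_ok(size_t destOffset, size_t mainOffset,
                               size_t callSize, size_t jumpSize, size_t reserveNeeded)
    {
      if (mainOffset < destOffset)
        return 0; /* bufs[BCJ2_STREAM_MAIN] >= dest is required */
      /* reserve = (bufs[MAIN] - dest) - (BUF_SIZE(CALL) + BUF_SIZE(JUMP)) */
      return mainOffset - destOffset >= callSize + jumpSize + reserveNeeded;
    }

Per the comment above, reserveNeeded would be 0 for single-call decoding and 4 for multi-call decoding with ver >= v23.00.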
typedef struct
@ -68,22 +99,66 @@ typedef struct
  unsigned state; /* BCJ2_STREAM_MAIN has more priority than BCJ2_STATE_ORIG */
  UInt32 ip;      /* property of starting base for decoding */
  UInt32 temp;    /* Byte temp[4]; */
  UInt32 range;
  UInt32 code;
  CBcj2Prob probs[2 + 256];
} CBcj2Dec;
/* Note:
Bcj2Dec_Init() sets (CBcj2Dec::ip = 0)
if (ip != 0) property is required, the caller must set CBcj2Dec::ip after Bcj2Dec_Init()
*/
void Bcj2Dec_Init(CBcj2Dec *p);
/* Bcj2Dec_Decode():
returns:
  SZ_OK
  SZ_ERROR_DATA : if data in 5 starting bytes of BCJ2_STREAM_RC stream are not correct
*/
SRes Bcj2Dec_Decode(CBcj2Dec *p);
/* To check that decoding was finished you can compare
   sizes of processed streams with sizes known from other sources.
You must do at least one mandatory check from the two following options:
- the check for size of processed output (ORIG) stream.
- the check for size of processed input (MAIN) stream.
additional optional checks:
- the checks for processed sizes of all input streams (MAIN, CALL, JUMP, RC)
- the checks Bcj2Dec_IsMaybeFinished*()
also before actual decoding you can check that the
following condition is met for stream sizes:
( size(ORIG) == size(MAIN) + size(CALL) + size(JUMP) )
*/
/* (state == BCJ2_STREAM_MAIN) means that decoder is ready for
additional input data in BCJ2_STREAM_MAIN stream.
Note that (state == BCJ2_STREAM_MAIN) is allowed for non-finished decoding.
*/
#define Bcj2Dec_IsMaybeFinished_state_MAIN(_p_) ((_p_)->state == BCJ2_STREAM_MAIN)
/* if the stream decoding was finished correctly, then range decoder
part of CBcj2Dec also was finished, and then (CBcj2Dec::code == 0).
Note that (CBcj2Dec::code == 0) is allowed for non-finished decoding.
*/
#define Bcj2Dec_IsMaybeFinished_code(_p_) ((_p_)->code == 0)
/* use Bcj2Dec_IsMaybeFinished() only as additional check
after at least one mandatory check from the two following options:
- the check for size of processed output (ORIG) stream.
- the check for size of processed input (MAIN) stream.
*/
#define Bcj2Dec_IsMaybeFinished(_p_) ( \
Bcj2Dec_IsMaybeFinished_state_MAIN(_p_) && \
Bcj2Dec_IsMaybeFinished_code(_p_))
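
A hedged sketch of the recommended check order (the size counters are the caller's, not part of the API):

    /* mandatory size check first, then the optional state/code check */
    static int bcj2_decode_finished(const CBcj2Dec *p,
                                    UInt64 processedOrig, UInt64 expectedOrigSize)
    {
      return processedOrig == expectedOrigSize
          && Bcj2Dec_IsMaybeFinished(p);
    }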
/* ---------- BCJ2 Encoder ---------- */
typedef enum
{
  BCJ2_ENC_FINISH_MODE_CONTINUE,
@ -91,6 +166,91 @@ typedef enum
  BCJ2_ENC_FINISH_MODE_END_STREAM
} EBcj2Enc_FinishMode;
/*
BCJ2_ENC_FINISH_MODE_CONTINUE:
process non finished encoding.
    It notifies the encoder that further calls
    can provide more input data (src) than provided by the current call.
In that case the CBcj2Enc encoder still can move (src) pointer
up to (srcLim), but CBcj2Enc encoder can store some of the last
processed bytes (up to 4 bytes) from src to internal CBcj2Enc::temp[] buffer.
at return:
(CBcj2Enc::src will point to position that includes
processed data and data copied to (temp[]) buffer)
That data from (temp[]) buffer will be used in further calls.
BCJ2_ENC_FINISH_MODE_END_BLOCK:
finish encoding of current block (ended at srcLim) without RC flushing.
at return: if (CBcj2Enc::state == BCJ2_ENC_STATE_ORIG) &&
CBcj2Enc::src == CBcj2Enc::srcLim)
: it shows that block encoding was finished. And the encoder is
ready for new (src) data or for stream finish operation.
finished block means
{
CBcj2Enc has completed block encoding up to (srcLim).
      (1 + 4 bytes) or (2 + 4 bytes) CALL/JUMP byte sequences will
      not cross block boundary at (srcLim).
temporary CBcj2Enc buffer for (ORIG) src data is empty.
3 output uncompressed streams (MAIN, CALL, JUMP) were flushed.
RC stream was not flushed. And RC stream will cross block boundary.
}
Note: some possible implementation of BCJ2 encoder could
    write branch marker (e8/e9/0f8x) in one call of Bcj2Enc_Encode(),
and it could calculate symbol for RC in another call of Bcj2Enc_Encode().
BCJ2 encoder uses ip/fileIp/fileSize/relatLimit values to calculate RC symbol.
And these CBcj2Enc variables can have different values in different Bcj2Enc_Encode() calls.
So caller must finish each block with BCJ2_ENC_FINISH_MODE_END_BLOCK
to ensure that RC symbol is calculated and written in proper block.
BCJ2_ENC_FINISH_MODE_END_STREAM
finish encoding of stream (ended at srcLim) fully including RC flushing.
at return: if (CBcj2Enc::state == BCJ2_ENC_STATE_FINISHED)
: it shows that stream encoding was finished fully,
and all output streams were flushed fully.
also Bcj2Enc_IsFinished() can be called.
*/
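
A hedged sketch of the block-by-block calling pattern implied above (output buffer management is omitted, and the loop scaffolding is illustrative, not the library's own driver):

    static void bcj2_encode_block(CBcj2Enc *p, int isLastBlock)
    {
      p->finishMode = isLastBlock
          ? BCJ2_ENC_FINISH_MODE_END_STREAM  /* also flushes the RC stream */
          : BCJ2_ENC_FINISH_MODE_END_BLOCK;  /* ends block, RC stream stays open */
      for (;;)
      {
        Bcj2Enc_Encode(p);
        if (p->state < BCJ2_NUM_STREAMS)
        {
          /* grow or flush p->bufs[p->state] here, then retry */
          continue;
        }
        break; /* BCJ2_ENC_STATE_ORIG or BCJ2_ENC_STATE_FINISHED */
      }
    }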
/*
32-bit relative offset in JUMP/CALL commands is
- (mod 4 GiB) for 32-bit x86 code
- signed Int32 for 64-bit x86-64 code
BCJ2 encoder also does internal relative to absolute address conversions.
And there are 2 possible ways to do it:
before v23: we used 32-bit variables and (mod 4 GiB) conversion
since v23: we use 64-bit variables and (signed Int32 offset) conversion.
The absolute address condition for conversion in v23:
((UInt64)((Int64)ip64 - (Int64)fileIp64 + 5 + (Int32)offset) < (UInt64)fileSize64)
  note that if (fileSize64 > 2 GiB), there is a difference between the
  old (mod 4 GiB) way (v22) and the new (signed Int32 offset) way (v23).
  And the new (v23) way is more suitable to encode 64-bit x86-64 code for (fileSize64 > 2 GiB) cases.
*/
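
For clarity, the v23 condition quoted above as a standalone predicate (a sketch; the real encoder keeps these values in CBcj2Enc fields):

    static int bcj2_convert_allowed_v23(UInt64 ip64, UInt64 fileIp64,
                                        UInt64 fileSize64, Int32 offset)
    {
      return (UInt64)((Int64)ip64 - (Int64)fileIp64 + 5 + offset) < fileSize64;
    }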
/*
// for old (v22) way for conversion:
typedef UInt32 CBcj2Enc_ip_unsigned;
typedef Int32 CBcj2Enc_ip_signed;
#define BCJ2_ENC_FileSize_MAX ((UInt32)1 << 31)
*/
typedef UInt64 CBcj2Enc_ip_unsigned;
typedef Int64 CBcj2Enc_ip_signed;
/* maximum size of file that can be used for conversion condition */
#define BCJ2_ENC_FileSize_MAX ((CBcj2Enc_ip_unsigned)0 - 2)
/* default value of fileSize64_minus1 variable that means
that absolute address limitation will not be used */
#define BCJ2_ENC_FileSizeField_UNLIMITED ((CBcj2Enc_ip_unsigned)0 - 1)
/* calculate value that later can be set to CBcj2Enc::fileSize64_minus1 */
#define BCJ2_ENC_GET_FileSizeField_VAL_FROM_FileSize(fileSize) \
((CBcj2Enc_ip_unsigned)(fileSize) - 1)
/* set CBcj2Enc::fileSize64_minus1 variable from size of file */
#define Bcj2Enc_SET_FileSize(p, fileSize) \
(p)->fileSize64_minus1 = BCJ2_ENC_GET_FileSizeField_VAL_FROM_FileSize(fileSize);
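
A hedged usage sketch for the file-size macros (exeSize is an illustrative caller value):

    CBcj2Enc enc;
    Bcj2Enc_Init(&enc);
    Bcj2Enc_SET_FileSize(&enc, exeSize)  /* the macro body already ends with ';' */
    /* leaving fileSize64_minus1 at BCJ2_ENC_FileSizeField_UNLIMITED disables the limit */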
typedef struct
{
  Byte *bufs[BCJ2_NUM_STREAMS];
@ -101,45 +261,71 @@ typedef struct
  unsigned state;
  EBcj2Enc_FinishMode finishMode;
  Byte context;
  Byte flushRem;
  Byte isFlushState;
  Byte cache;
  UInt32 range;
  UInt64 low;
  UInt64 cacheSize;
  // UInt32 context; // for marker version, it can include marker flag.
  /* (ip64) and (fileIp64) correspond to virtual source stream position
     that doesn't include data in temp[] */
  CBcj2Enc_ip_unsigned ip64;              /* current (ip) position */
  CBcj2Enc_ip_unsigned fileIp64;          /* start (ip) position of current file */
  CBcj2Enc_ip_unsigned fileSize64_minus1; /* size of current file (for conversion limitation) */
  UInt32 relatLimit;  /* (relatLimit <= ((UInt32)1 << 31)) : 0 means disable_conversion */
  // UInt32 relatExcludeBits;
  UInt32 tempTarget;
  unsigned tempPos;   /* the number of bytes that were copied to temp[] buffer
                         (tempPos <= 4) outside of Bcj2Enc_Encode() */
  // Byte temp[4]; // for marker version
  Byte temp[8];
  CBcj2Prob probs[2 + 256];
} CBcj2Enc;
void Bcj2Enc_Init(CBcj2Enc *p);
/*
Bcj2Enc_Encode(): at exit:
  p->state < BCJ2_NUM_STREAMS         : we need more buffer space for output stream
                                        (bufs[p->state] == lims[p->state])
  p->state == BCJ2_ENC_STATE_ORIG     : we need more data in input src stream
                                        (src == srcLim)
  p->state == BCJ2_ENC_STATE_FINISHED : after fully encoded stream
*/
void Bcj2Enc_Encode(CBcj2Enc *p);
/* The Bcj2Enc encoder can look ahead for up to 4 bytes of source stream.
   CBcj2Enc::tempPos : is the number of bytes that were copied from input stream to temp[] buffer.
   (CBcj2Enc::src) after Bcj2Enc_Encode() is starting position after
   fully processed data and after data copied to temp buffer.
   So if the caller needs to get real number of fully processed input
   bytes (without look-ahead data in temp buffer),
   the caller must subtract (CBcj2Enc::tempPos) value from processed size
   value that is calculated based on current (CBcj2Enc::src):
     cur_processed_pos = Calc_Big_Processed_Pos(enc.src) -
         Bcj2Enc_Get_AvailInputSize_in_Temp(&enc);
*/
/* get the size of input data that was stored in temp[] buffer: */
#define Bcj2Enc_Get_AvailInputSize_in_Temp(p) ((p)->tempPos)
#define Bcj2Enc_IsFinished(p) ((p)->flushRem == 0)
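
A hedged example of the bookkeeping described above (srcBuf is the illustrative start of the caller's input buffer):

    static size_t bcj2_fully_processed(const CBcj2Enc *p, const Byte *srcBuf)
    {
      /* bytes consumed from srcBuf, minus look-ahead bytes parked in temp[] */
      return (size_t)(p->src - srcBuf) - Bcj2Enc_Get_AvailInputSize_in_Temp(p);
    }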
/* Note : the decoder supports overlapping of marker (0f 80).
   But we can eliminate such overlapping cases by setting
   the limit for relative offset conversion as
     CBcj2Enc::relatLimit <= (0x0f << 24) == (240 MiB)
*/
/* default value for CBcj2Enc::relatLimit */
#define BCJ2_ENC_RELAT_LIMIT_DEFAULT ((UInt32)0x0f << 24)
#define BCJ2_ENC_RELAT_LIMIT_MAX ((UInt32)1 << 31)
// #define BCJ2_RELAT_EXCLUDE_NUM_BITS 5
EXTERN_C_END
C/Bcj2Enc.c
@ -1,60 +1,62 @@
/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)
2023-04-02 : Igor Pavlov : Public domain */

#include "Precomp.h"

/* #define SHOW_STAT */
#ifdef SHOW_STAT
#include <stdio.h>
#define PRF2(s) printf("%s ip=%8x tempPos=%d src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src));
#else
#define PRF2(s)
#endif

#include <string.h>

#include "Bcj2.h"
#include "CpuArch.h"

#define kTopValue ((UInt32)1 << 24)
#define kNumBitModelTotalBits 11
#define kBitModelTotal (1 << kNumBitModelTotalBits)
#define kNumMoveBits 5
void Bcj2Enc_Init(CBcj2Enc *p)
{
  unsigned i;
  p->state = BCJ2_ENC_STATE_ORIG;
  p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
  p->context = 0;
  p->flushRem = 5;
  p->isFlushState = 0;
  p->cache = 0;
  p->range = 0xffffffff;
  p->low = 0;
  p->cacheSize = 1;
  p->ip64 = 0;
  p->fileIp64 = 0;
  p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;
  p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;
  // p->relatExcludeBits = 0;
  p->tempPos = 0;
  for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++)
    p->probs[i] = kBitModelTotal >> 1;
}
// Z7_NO_INLINE
Z7_FORCE_INLINE
static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
{
  const UInt32 low = (UInt32)p->low;
  const unsigned high = (unsigned)
    #if defined(Z7_MSC_VER_ORIGINAL) \
        && defined(MY_CPU_X86) \
        && defined(MY_CPU_LE) \
        && !defined(MY_CPU_64BIT)
      // we try to rid of __aullshr() call in MSVS-x86
      (((const UInt32 *)&p->low)[1]); // [1] : for little-endian only
    #else
      (p->low >> 32);
    #endif
  if (low < (UInt32)0xff000000 || high != 0)
  {
    Byte *buf = p->bufs[BCJ2_STREAM_RC];
    do

@ -65,247 +67,440 @@ static BoolInt MY_FAST_CALL RangeEnc_ShiftLow(CBcj2Enc *p)
        p->bufs[BCJ2_STREAM_RC] = buf;
        return True;
      }
      *buf++ = (Byte)(p->cache + high);
      p->cache = 0xff;
    }
    while (--p->cacheSize);
    p->bufs[BCJ2_STREAM_RC] = buf;
    p->cache = (Byte)(low >> 24);
  }
  p->cacheSize++;
  p->low = low << 8;
  return False;
}
/*
We can use 2 alternative versions of code:
1) non-marker version:
Byte CBcj2Enc::context
Byte temp[8];
Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.
Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction
with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.
2) marker version:
UInt32 CBcj2Enc::context
Byte CBcj2Enc::temp[4];
    MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains a found marker.
it's allowed that
one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,
and another call of Bcj2Enc_Encode_2() does offset conversion.
So different values of (fileIp) and (fileSize) are possible
in these different Bcj2Enc_Encode_2() calls.
Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.
So we use non-marker version.
*/
/*
Corner cases with overlap in multi-block.
before v23: there was one corner case, where converted instruction
could start in one sub-stream and finish in next sub-stream.
If multi-block (solid) encoding is used,
and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.
and (0f) is last byte of previous sub-stream
and (8x) is first byte of current sub-stream
then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.
  The BCJ2 encoder can convert the 32-bit offset for that (0f 8x) pair,
if that offset meets limit requirements.
If encoder allows 32-bit offset conversion for such overlap case,
then the data in 3 uncompressed BCJ2 streams for some sub-stream
can depend from data of previous sub-stream.
  That corner case is not a big problem, and it's a rare case.
Since v23.00 we do additional check to prevent conversions in such overlap cases.
*/
/*
Bcj2Enc_Encode_2() output variables at exit:
{
if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))
{
it means that encoder needs more input data.
if (p->srcLim == p->src) at exit, then
{
(p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
all input data were read and processed, and we are ready for
new input data.
}
else
{
(p->srcLim != p->src)
(p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
      The encoder has found an e8/e9/0f_8x marker,
      and p->src points to the last byte of that marker.
      Bcj2Enc_Encode_2() needs more input data to get in total
      5 bytes (the last byte of the marker and the 32-bit branch offset)
      as a continuous array starting from p->src.
      The (p->srcLim - p->src < 5) requirement is met after exit.
      So the non-processed residue from p->src to p->srcLim is always less than 5 bytes.
}
}
}
*/
Z7_NO_INLINE
static void Bcj2Enc_Encode_2(CBcj2Enc *p)
{
  if (!p->isFlushState)
  {
    const Byte *src;
    UInt32 v;
    {
      const unsigned state = p->state;
      if (BCJ2_IS_32BIT_STREAM(state))
      {
        Byte *cur = p->bufs[state];
        if (cur == p->lims[state])
          return;
        SetBe32a(cur, p->tempTarget)
        p->bufs[state] = cur + 4;
      }
    }
    p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit
    src = p->src;
    v = p->context;

    // #define WRITE_CONTEXT  p->context = v; // for marker version
    #define WRITE_CONTEXT  p->context = (Byte)v;
    #define WRITE_CONTEXT_AND_SRC  p->src = src;  WRITE_CONTEXT

    for (;;)
    {
      // const Byte *src;
      // UInt32 v;
      CBcj2Enc_ip_unsigned ip;
      if (p->range < kTopValue)
      {
        // to reduce register pressure and code size: we save and restore local variables.
        WRITE_CONTEXT_AND_SRC
        if (Bcj2_RangeEnc_ShiftLow(p))
          return;
        p->range <<= 8;
        src = p->src;
        v = p->context;
      }
      // src = p->src;
      // #define MARKER_FLAG ((UInt32)1 << 17)
      // if ((v & MARKER_FLAG) == 0) // for marker version
      {
        const Byte *srcLim;
        Byte *dest = p->bufs[BCJ2_STREAM_MAIN];
        {
          const SizeT remSrc = (SizeT)(p->srcLim - src);
          SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
          if (rem >= remSrc)
            rem = remSrc;
          srcLim = src + rem;
        }
        /* p->context contains context of previous byte:
           bits [0 : 7]  : src[-1], if (src) was changed in this call
           bits [8 : 31] : are undefined for non-marker version */
        // v = p->context;
        #define NUM_SHIFT_BITS 24
        #define CONV_FLAG ((UInt32)1 << 16)
        #define ONE_ITER { \
          b = src[0]; \
          *dest++ = (Byte)b; \
          v = (v << NUM_SHIFT_BITS) | b; \
          if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
          if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
              ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
          src++; if (src == srcLim) { break; } }

        if (src != srcLim)
        for (;;)
        {
          /* clang can generate ineffective code with setne instead of two jcc instructions.
             we can use 2 iterations and external (unsigned b) to avoid that ineffective code generation. */
          unsigned b;
          ONE_ITER
          ONE_ITER
        }

        ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);
        p->bufs[BCJ2_STREAM_MAIN] = dest;
        p->ip64 = ip;

        if (src == srcLim)
        {
          WRITE_CONTEXT_AND_SRC
          if (src != p->srcLim)
          {
            p->state = BCJ2_STREAM_MAIN;
            return;
          }
          /* (p->src == p->srcLim)
             (p->state == BCJ2_ENC_STATE_ORIG) */
          if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
            return;
          /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM) */
          // (p->flushRem == 5);
          p->isFlushState = 1;
          break;
        }
        src++; // p->src = src;
      }
      // ip = p->ip; // for marker version
      /* marker was found */
      /* (v) contains marker that was found:
           bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]
                        : value of src[-2] : xx/xx/0f
           bits [0 : 7] : value of src[-1] : e8/e9/8x
      */
      {
        {
          #if NUM_SHIFT_BITS != 24
            v &= ~(UInt32)CONV_FLAG;
          #endif
          // UInt32 relat = 0;
          if ((SizeT)(p->srcLim - src) >= 4)
          {
            /*
            if (relat != 0 || (Byte)v != 0xe8)
              BoolInt isBigOffset = True;
            */
            const UInt32 relat = GetUi32(src);
            /*
            #define EXCLUDE_FLAG ((UInt32)1 << 4)
            #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)
            if (p->relatExcludeBits != 0)
            {
              const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);
              isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);
            }
            // isBigOffset = False; // for debug
            */
            ip -= p->fileIp64;
            // Use the following if check, if (ip) is 64-bit:
            if (ip > (((v + 0x20) >> 5) & 1)) // 23.00 : we eliminate multi-block overlap for (0f 80) and (e8/e9)
            if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)
            if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)
              v |= CONV_FLAG;
          }
          else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
          {
            // (p->srcLim - src < 4)
            // /*
            // for non-marker version
            p->ip64--; // p->ip = ip - 1;
            p->bufs[BCJ2_STREAM_MAIN]--;
            src--;
            v >>= NUM_SHIFT_BITS;
            // (0 < p->srcLim - p->src <= 4)
            // */
            // v |= MARKER_FLAG; // for marker version
            /* (p->state == BCJ2_ENC_STATE_ORIG) */
            WRITE_CONTEXT_AND_SRC
            return;
          }
        }
        {
          const unsigned c = ((v + 0x17) >> 6) & 1;
          CBcj2Prob *prob = p->probs + (unsigned)
              (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
          /*
              ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :
                ((Byte)v < 0xe8 ? 0 : 1)); // ((v >> 5) & 1));
          */
          const unsigned ttt = *prob;
          const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;
          if ((v & CONV_FLAG) == 0)
          {
            // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);
            // v = (Byte)v; // for marker version
            p->range = bound;
            *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
            // WRITE_CONTEXT_AND_SRC
            continue;
          }
          p->low += bound;
          p->range -= bound;
          *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
        }
        // p->context = src[3];
        {
          // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);
          const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
          ip = p->ip64;
          v = GetUi32(src); // relat
          ip += 4;
          p->ip64 = ip;
          src += 4;
          // p->src = src;
          {
            const UInt32 absol = (UInt32)ip + v;
            Byte *cur = p->bufs[cj];
            v >>= 24;
            // WRITE_CONTEXT
            if (cur == p->lims[cj])
            {
              p->state = cj;
              p->tempTarget = absol;
              WRITE_CONTEXT_AND_SRC
              return;
            }
            SetBe32a(cur, absol)
            p->bufs[cj] = cur + 4;
          }
        }
      }
    } // end of loop
  }

  for (; p->flushRem != 0; p->flushRem--)
    if (Bcj2_RangeEnc_ShiftLow(p))
      return;
  p->state = BCJ2_ENC_STATE_FINISHED;
}
/*
BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer.
So base function Bcj2Enc_Encode_2()
in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with
(p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim)
Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer.
so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),
then (p->src == p->srcLim).
And the caller's code is simpler with Bcj2Enc_Encode().
*/
Z7_NO_INLINE
void Bcj2Enc_Encode(CBcj2Enc *p)
{
  PRF2("\n----")
  if (p->tempPos != 0)
  {
    /* extra: number of bytes that were copied from (src) to (temp) buffer in this call */
    unsigned extra = 0;
    /* We will touch only minimal required number of bytes in input (src) stream.
       So we will add input bytes from (src) stream to temp[] with step of 1 byte.
       We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call
         in first loop iteration because
         - previous call of Bcj2Enc_Encode() could use another (finishMode),
         - previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG).
       the case with full temp[] buffer (p->tempPos == 4) is possible here.
    */
    for (;;)
    {
      // (0 < p->tempPos <= 5) // in non-marker version
      /* p->src : the current src data position including extra bytes
         that were copied to temp[] buffer in this call */
      const Byte *src = p->src;
      const Byte *srcLim = p->srcLim;
      const EBcj2Enc_FinishMode finishMode = p->finishMode;
      if (src != srcLim)
      {
        /* if there are some src data after the data copied to temp[],
           then we use MODE_CONTINUE for temp data */
        p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
      }
      p->src = p->temp;
      p->srcLim = p->temp + p->tempPos;
      PRF2(" ")
      Bcj2Enc_Encode_2(p);
      {
        const unsigned num = (unsigned)(p->src - p->temp);
        const unsigned tempPos = p->tempPos - num;
        unsigned i;
        p->tempPos = tempPos;
        for (i = 0; i < tempPos; i++)
          p->temp[i] = p->temp[(SizeT)i + num];
        // tempPos : number of bytes in temp buffer
        p->src = src;
        p->srcLim = srcLim;
        p->finishMode = finishMode;
        if (p->state != BCJ2_ENC_STATE_ORIG)
        {
          // (p->tempPos <= 4) // in non-marker version
          /* if (the reason of exit from Bcj2Enc_Encode_2()
               is not BCJ2_ENC_STATE_ORIG),
             then we exit from Bcj2Enc_Encode() with same reason */
          // optional code begin : we rollback (src) and tempPos, if it's possible:
          if (extra >= tempPos)
            extra = tempPos;
          p->src = src - extra;
          p->tempPos = tempPos - extra;
          // optional code end : rollback of (src) and tempPos
          return;
        }
        /* (p->tempPos <= 4)
           (p->state == BCJ2_ENC_STATE_ORIG)
           so encoder needs more data than in temp[] */
        if (src == srcLim)
          return; // src buffer has no more input data.
        /* (src != srcLim)
           so we can provide more input data from src for Bcj2Enc_Encode_2() */
        if (extra >= tempPos)
        {
          /* (extra >= tempPos) means that temp buffer contains
             only data from src buffer of this call.
             So now we can encode without temp buffer */
          p->src = src - tempPos; // rollback (src)
          p->tempPos = 0;
          break;
        }
        // we append one additional extra byte from (src) to temp[] buffer:
        p->temp[tempPos] = *src;
        p->tempPos = tempPos + 1;
        // (0 < p->tempPos <= 5) // in non-marker version
        p->src = src + 1;
        extra++;
      }
    }
  }

  PRF2("++++")
  // (p->tempPos == 0)
  Bcj2Enc_Encode_2(p);
  PRF2("====")

  if (p->state == BCJ2_ENC_STATE_ORIG)
  {
    const Byte *src = p->src;
    const Byte *srcLim = p->srcLim;
    const unsigned rem = (unsigned)(srcLim - src);
    /* (rem <= 4) here.
       if (p->src != p->srcLim), then
         - we copy non-processed bytes from (p->src) to temp[] buffer,
         - we set p->src equal to p->srcLim.
    */
    if (rem)
    {
      unsigned i = 0;
      p->src = srcLim;
      p->tempPos = rem;
      // (0 < p->tempPos <= 4)
      do
        p->temp[i] = src[i];
      while (++i != rem);
    }
    // (p->tempPos <= 4)
    // (p->src == p->srcLim)
  }
}
#undef PRF2
#undef CONV_FLAG
#undef MARKER_FLAG
#undef WRITE_CONTEXT
#undef WRITE_CONTEXT_AND_SRC
#undef ONE_ITER
#undef NUM_SHIFT_BITS
#undef kTopValue
#undef kNumBitModelTotalBits
#undef kBitModelTotal
#undef kNumMoveBits
C/Blake2.h
@ -1,47 +1,104 @@
/* Blake2.h -- BLAKE2sp Hash
2024-01-17 : Igor Pavlov : Public domain */

#ifndef ZIP7_INC_BLAKE2_H
#define ZIP7_INC_BLAKE2_H

#include "7zTypes.h"

#if 0
#include "Compiler.h"
#include "CpuArch.h"
#if defined(MY_CPU_X86_OR_AMD64)
#if defined(__SSE2__) \
    || defined(_MSC_VER) && _MSC_VER > 1200 \
    || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
    || defined(__clang__) \
    || defined(__INTEL_COMPILER)
#include <emmintrin.h> // SSE2
#endif
#if defined(__AVX2__) \
    || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
    || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
    || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \
    || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
    || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
#include <immintrin.h>
#if defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#endif
#endif // avx2
#endif // MY_CPU_X86_OR_AMD64
#endif // 0

EXTERN_C_BEGIN

#define Z7_BLAKE2S_BLOCK_SIZE 64
#define Z7_BLAKE2S_DIGEST_SIZE 32
#define Z7_BLAKE2SP_PARALLEL_DEGREE 8
#define Z7_BLAKE2SP_NUM_STRUCT_WORDS 16

#if 1 || defined(Z7_BLAKE2SP_USE_FUNCTIONS)
typedef void (Z7_FASTCALL *Z7_BLAKE2SP_FUNC_COMPRESS)(UInt32 *states, const Byte *data, const Byte *end);
typedef void (Z7_FASTCALL *Z7_BLAKE2SP_FUNC_INIT)(UInt32 *states);
#endif

// it's required that CBlake2sp is aligned for 32-bytes,
// because the code can use unaligned access with sse and avx256.
// but 64-bytes alignment can be better.
MY_ALIGN(64)
typedef struct
{
  union
  {
#if 0
#if defined(MY_CPU_X86_OR_AMD64)
#if defined(__SSE2__) \
    || defined(_MSC_VER) && _MSC_VER > 1200 \
    || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 30300) \
    || defined(__clang__) \
    || defined(__INTEL_COMPILER)
    __m128i _pad_align_128bit[4];
#endif // sse2
#if defined(__AVX2__) \
    || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
    || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
    || defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \
    || defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
    || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
    __m256i _pad_align_256bit[2];
#endif // avx2
#endif // x86
#endif // 0
    void * _pad_align_ptr[8];
    UInt32 _pad_align_32bit[16];
    struct
    {
      unsigned cycPos;
      unsigned _pad_unused;
#if 1 || defined(Z7_BLAKE2SP_USE_FUNCTIONS)
      Z7_BLAKE2SP_FUNC_COMPRESS func_Compress_Fast;
      Z7_BLAKE2SP_FUNC_COMPRESS func_Compress_Single;
      Z7_BLAKE2SP_FUNC_INIT func_Init;
      Z7_BLAKE2SP_FUNC_INIT func_Final;
#endif
    } header;
  } u;
  // MY_ALIGN(64)
  UInt32 states[Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2SP_NUM_STRUCT_WORDS];
  // MY_ALIGN(64)
  UInt32 buf32[Z7_BLAKE2SP_PARALLEL_DEGREE * Z7_BLAKE2SP_NUM_STRUCT_WORDS * 2];
} CBlake2sp;

BoolInt Blake2sp_SetFunction(CBlake2sp *p, unsigned algo);
void Blake2sp_Init(CBlake2sp *p);
void Blake2sp_InitState(CBlake2sp *p);
void Blake2sp_Update(CBlake2sp *p, const Byte *data, size_t size);
void Blake2sp_Final(CBlake2sp *p, Byte *digest);
void z7_Black2sp_Prepare(void);

EXTERN_C_END
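
A hedged one-shot usage sketch of the API above (not from the source; a real caller may also probe Blake2sp_SetFunction for a SIMD variant first):

    static void blake2sp_hash(const Byte *data, size_t size,
                              Byte digest[Z7_BLAKE2S_DIGEST_SIZE])
    {
      CBlake2sp ctx; /* MY_ALIGN(64) on the type provides the required alignment */
      Blake2sp_Init(&ctx);
      Blake2sp_Update(&ctx, data, size); /* may be called repeatedly for streaming */
      Blake2sp_Final(&ctx, digest);
    }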
(File diff suppressed because it is too large.)

C/Bra.c  (819 changed lines)
@ -1,230 +1,709 @@
/* Bra.c -- Branch converters for RISC code
2024-01-20 : Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Bra.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#if defined(MY_CPU_SIZEOF_POINTER) \
&& ( MY_CPU_SIZEOF_POINTER == 4 \
|| MY_CPU_SIZEOF_POINTER == 8)
#define BR_CONV_USE_OPT_PC_PTR
#endif
#ifdef BR_CONV_USE_OPT_PC_PTR
#define BR_PC_INIT pc -= (UInt32)(SizeT)p;
#define BR_PC_GET (pc + (UInt32)(SizeT)p)
#else
#define BR_PC_INIT pc += (UInt32)size;
#define BR_PC_GET (pc - (UInt32)(SizeT)(lim - p))
// #define BR_PC_INIT
// #define BR_PC_GET (pc + (UInt32)(SizeT)(p - data))
#endif
#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c;
// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c;
#define Z7_BRANCH_CONV(name) z7_ ## name
#define Z7_BRANCH_FUNC_MAIN(name) \
static \
Z7_FORCE_INLINE \
Z7_ATTRIB_NO_VECTOR \
Byte *Z7_BRANCH_CONV(name)(Byte *p, SizeT size, UInt32 pc, int encoding)
#define Z7_BRANCH_FUNC_IMP(name, m, encoding) \
Z7_NO_INLINE \
Z7_ATTRIB_NO_VECTOR \
Byte *m(name)(Byte *data, SizeT size, UInt32 pc) \
{ return Z7_BRANCH_CONV(name)(data, size, pc, encoding); } \
#ifdef Z7_EXTRACT_ONLY
#define Z7_BRANCH_FUNCS_IMP(name) \
Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0)
#else
#define Z7_BRANCH_FUNCS_IMP(name) \
Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_DEC_2, 0) \
Z7_BRANCH_FUNC_IMP(name, Z7_BRANCH_CONV_ENC_2, 1)
#endif
#if defined(__clang__)
#define BR_EXTERNAL_FOR
#define BR_NEXT_ITERATION continue;
#else
#define BR_EXTERNAL_FOR for (;;)
#define BR_NEXT_ITERATION break;
#endif
#if defined(__clang__) && (__clang_major__ >= 8) \
|| defined(__GNUC__) && (__GNUC__ >= 1000) \
// GCC is not good for __builtin_expect() here
/* || defined(_MSC_VER) && (_MSC_VER >= 1920) */
// #define Z7_unlikely [[unlikely]]
// #define Z7_LIKELY(x) (__builtin_expect((x), 1))
#define Z7_UNLIKELY(x) (__builtin_expect((x), 0))
// #define Z7_likely [[likely]]
#else
// #define Z7_LIKELY(x) (x)
#define Z7_UNLIKELY(x) (x)
// #define Z7_likely
#endif
Z7_BRANCH_FUNC_MAIN(BranchConv_ARM64)
{
  // Byte *p = data;
  const Byte *lim;
  const UInt32 flag = (UInt32)1 << (24 - 4);
  const UInt32 mask = ((UInt32)1 << 24) - (flag << 1);
  size &= ~(SizeT)3;
  // if (size == 0) return p;
  lim = p + size;
  BR_PC_INIT
  pc -= 4;  // because (p) will point to next instruction

  BR_EXTERNAL_FOR
  {
    // Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
    for (;;)
    {
      UInt32 v;
      if Z7_UNLIKELY(p == lim)
        return p;
      v = GetUi32a(p);
      p += 4;
      if Z7_UNLIKELY(((v - 0x94000000) & 0xfc000000) == 0)
      {
        UInt32 c = BR_PC_GET >> 2;
        BR_CONVERT_VAL(v, c)
        v &= 0x03ffffff;
        v |= 0x94000000;
        SetUi32a(p - 4, v)
        BR_NEXT_ITERATION
      }
      // v = rotlFixed(v, 8); v += (flag << 8) - 0x90; if Z7_UNLIKELY((v & ((mask << 8) + 0x9f)) == 0)
      v -= 0x90000000; if Z7_UNLIKELY((v & 0x9f000000) == 0)
      {
        UInt32 z, c;
        // v = rotrFixed(v, 8);
        v += flag; if Z7_UNLIKELY(v & mask) continue;
        z = (v & 0xffffffe0) | (v >> 26);
        c = (BR_PC_GET >> (12 - 3)) & ~(UInt32)7;
        BR_CONVERT_VAL(z, c)
        v &= 0x1f;
        v |= 0x90000000;
        v |= z << 26;
        v |= 0x00ffffe0 & ((z & (((flag << 1) - 1))) - flag);
        SetUi32a(p - 4, v)
      }
    }
  }
}
Z7_BRANCH_FUNCS_IMP(BranchConv_ARM64)
Z7_BRANCH_FUNC_MAIN(BranchConv_ARM)
{
  // Byte *p = data;
  const Byte *lim;
  size &= ~(SizeT)3;
  lim = p + size;
  BR_PC_INIT
  /* in ARM: branch offset is relative to the +2 instructions from current instruction.
     (p) will point to next instruction */
  pc += 8 - 4;

  for (;;)
  {
    for (;;)
    {
      if Z7_UNLIKELY(p >= lim) { return p; }  p += 4;  if Z7_UNLIKELY(p[-1] == 0xeb) break;
      if Z7_UNLIKELY(p >= lim) { return p; }  p += 4;  if Z7_UNLIKELY(p[-1] == 0xeb) break;
    }
    {
      UInt32 v = GetUi32a(p - 4);
      UInt32 c = BR_PC_GET >> 2;
      BR_CONVERT_VAL(v, c)
      v &= 0x00ffffff;
      v |= 0xeb000000;
      SetUi32a(p - 4, v)
    }
  }
}
Z7_BRANCH_FUNCS_IMP(BranchConv_ARM)
Z7_BRANCH_FUNC_MAIN(BranchConv_PPC)
{
  // Byte *p = data;
  const Byte *lim;
  size &= ~(SizeT)3;
  lim = p + size;
  BR_PC_INIT
  pc -= 4;  // because (p) will point to next instruction

  for (;;)
  {
    UInt32 v;
    for (;;)
    {
      if Z7_UNLIKELY(p == lim)
        return p;
      // v = GetBe32a(p);
      v = *(UInt32 *)(void *)p;
      p += 4;
      // if ((v & 0xfc000003) == 0x48000001) break;
      // if ((p[-4] & 0xFC) == 0x48 && (p[-1] & 3) == 1) break;
      if Z7_UNLIKELY(
          ((v - Z7_CONV_BE_TO_NATIVE_CONST32(0x48000001))
              & Z7_CONV_BE_TO_NATIVE_CONST32(0xfc000003)) == 0) break;
    }
    {
      v = Z7_CONV_NATIVE_TO_BE_32(v);
      {
        UInt32 c = BR_PC_GET;
        BR_CONVERT_VAL(v, c)
      }
      v &= 0x03ffffff;
      v |= 0x48000000;
      SetBe32a(p - 4, v)
    }
  }
}
Z7_BRANCH_FUNCS_IMP(BranchConv_PPC)
#ifdef Z7_CPU_FAST_ROTATE_SUPPORTED
  #define BR_SPARC_USE_ROTATE
#endif

Z7_BRANCH_FUNC_MAIN(BranchConv_SPARC)
{
  // Byte *p = data;
  const Byte *lim;
  const UInt32 flag = (UInt32)1 << 22;
  size &= ~(SizeT)3;
  lim = p + size;
  BR_PC_INIT
  pc -= 4;  // because (p) will point to next instruction

  for (;;)
  {
    UInt32 v;
    for (;;)
    {
      if Z7_UNLIKELY(p == lim)
        return p;
      /* // the code without GetBe32a():
      { const UInt32 v = GetUi16a(p) & 0xc0ff; p += 4; if (v == 0x40 || v == 0xc07f) break; }
      */
      v = GetBe32a(p);
      p += 4;
      #ifdef BR_SPARC_USE_ROTATE
        v = rotlFixed(v, 2);
        v += (flag << 2) - 1;
        if Z7_UNLIKELY((v & (3 - (flag << 3))) == 0)
      #else
        v += (UInt32)5 << 29;
        v ^= (UInt32)7 << 29;
        v += flag;
        if Z7_UNLIKELY((v & (0 - (flag << 1))) == 0)
      #endif
        break;
    }
    {
      // UInt32 v = GetBe32a(p - 4);
      #ifndef BR_SPARC_USE_ROTATE
        v <<= 2;
      #endif
      {
        UInt32 c = BR_PC_GET;
        BR_CONVERT_VAL(v, c)
      }
      v &= (flag << 3) - 1;
      #ifdef BR_SPARC_USE_ROTATE
        v -= (flag << 2) - 1;
        v = rotrFixed(v, 2);
      #else
        v -= (flag << 2);
        v >>= 2;
        v |= (UInt32)1 << 30;
      #endif
      SetBe32a(p - 4, v)
    }
  }
}
Z7_BRANCH_FUNCS_IMP(BranchConv_SPARC)
Z7_BRANCH_FUNC_MAIN(BranchConv_ARMT)
{
// Byte *p = data;
Byte *lim;
size &= ~(SizeT)1;
// if (size == 0) return p;
if (size <= 2) return p;
size -= 2;
lim = p + size;
BR_PC_INIT
/* in ARM: branch offset is relative to the +2 instructions from current instruction.
(p) will point to the +2 instructions from current instruction */
// pc += 4 - 4;
// if (encoding) pc -= 0xf800 << 1; else pc += 0xf800 << 1;
// #define ARMT_TAIL_PROC { goto armt_tail; }
#define ARMT_TAIL_PROC { return p; }
do
{
/* in MSVC 32-bit x86 compilers:
UInt32 version : it loads value from memory with movzx
Byte version : it loads value to 8-bit register (AL/CL)
movzx version is slightly faster in some cpus
*/
unsigned b1;
// Byte / unsigned
b1 = p[1];
// optimized version to reduce one (p >= lim) check:
// unsigned a1 = p[1]; b1 = p[3]; p += 2; if Z7_LIKELY((b1 & (a1 ^ 8)) < 0xf8)
for (;;)
{
unsigned b3; // Byte / UInt32
/* (Byte)(b3) normalization can use low byte computations in MSVC.
It gives smaller code, and no loss of speed in some compilers/cpus.
         But new MSVC 32-bit x86 compilers use a slower load
         from memory to a low byte register in that case.
So we try to use full 32-bit computations for faster code.
*/
// if (p >= lim) { ARMT_TAIL_PROC } b3 = b1 + 8; b1 = p[3]; p += 2; if ((b3 & b1) >= 0xf8) break;
if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC } b3 = p[3]; p += 2; if Z7_UNLIKELY((b3 & (b1 ^ 8)) >= 0xf8) break;
if Z7_UNLIKELY(p >= lim) { ARMT_TAIL_PROC } b1 = p[3]; p += 2; if Z7_UNLIKELY((b1 & (b3 ^ 8)) >= 0xf8) break;
}
{
      /* we can adjust pc for (0xf800) to get rid of the (& 0x7FF) operation.
         But gcc/clang for arm64 can use bfi instruction for full code here */
UInt32 v =
((UInt32)GetUi16a(p - 2) << 11) |
((UInt32)GetUi16a(p) & 0x7FF);
/*
UInt32 v =
((UInt32)p[1 - 2] << 19)
+ (((UInt32)p[1] & 0x7) << 8)
+ (((UInt32)p[-2] << 11))
+ (p[0]);
*/
p += 2;
{
UInt32 c = BR_PC_GET >> 1;
BR_CONVERT_VAL(v, c)
}
SetUi16a(p - 4, (UInt16)(((v >> 11) & 0x7ff) | 0xf000))
SetUi16a(p - 2, (UInt16)(v | 0xf800))
/*
p[-4] = (Byte)(v >> 11);
p[-3] = (Byte)(0xf0 | ((v >> 19) & 0x7));
p[-2] = (Byte)v;
p[-1] = (Byte)(0xf8 | (v >> 8));
*/
}
}
while (p < lim);
return p;
// armt_tail:
// if ((Byte)((lim[1] & 0xf8)) != 0xf0) { lim += 2; } return lim;
// return (Byte *)(lim + ((Byte)((lim[1] ^ 0xf0) & 0xf8) == 0 ? 0 : 2));
// return (Byte *)(lim + (((lim[1] ^ ~0xfu) & ~7u) == 0 ? 0 : 2));
// return (Byte *)(lim + 2 - (((((unsigned)lim[1] ^ 8) + 8) >> 7) & 2));
}
Z7_BRANCH_FUNCS_IMP(BranchConv_ARMT)
// #define BR_IA64_NO_INLINE

Z7_BRANCH_FUNC_MAIN(BranchConv_IA64)
{
  // Byte *p = data;
  const Byte *lim;
  size &= ~(SizeT)15;
  lim = p + size;
  pc -= 1 << 4;
  pc >>= 4 - 1;
  // pc -= 1 << 1;

  for (;;)
  {
    unsigned m;
    for (;;)
    {
      if Z7_UNLIKELY(p == lim)
        return p;
      m = (unsigned)((UInt32)0x334b0000 >> (*p & 0x1e));
      p += 16;
      pc += 1 << 1;
      if (m &= 3)
        break;
    }
    {
      p += (ptrdiff_t)m * 5 - 20; // negative value is expected here.
      do
      {
        const UInt32 t =
          #if defined(MY_CPU_X86_OR_AMD64)
            // we use 32-bit load here to reduce code size on x86:
            GetUi32(p);
          #else
            GetUi16(p);
          #endif
        UInt32 z = GetUi32(p + 1) >> m;
        p += 5;
        if (((t >> m) & (0x70 << 1)) == 0
            && ((z - (0x5000000 << 1)) & (0xf000000 << 1)) == 0)
        {
          UInt32 v = (UInt32)((0x8fffff << 1) | 1) & z;
          z ^= v;
          #ifdef BR_IA64_NO_INLINE
          v |= (v & ((UInt32)1 << (23 + 1))) >> 3;
          {
            UInt32 c = pc;
            BR_CONVERT_VAL(v, c)
          }
          v &= (0x1fffff << 1) | 1;
          #else
          {
            if (encoding)
            {
              // pc &= ~(0xc00000 << 1); // we just need to clear at least 2 bits
              pc &= (0x1fffff << 1) | 1;
              v += pc;
            }
            else
            {
              // pc |= 0xc00000 << 1; // we need to set at least 2 bits
              pc |= ~(UInt32)((0x1fffff << 1) | 1);
              v -= pc;
            }
          }
          v &= ~(UInt32)(0x600000 << 1);
          #endif
          v += (0x700000 << 1);
          v &= (0x8fffff << 1) | 1;
          z |= v;
          z <<= m;
          SetUi32(p + 1 - 5, z)
        }
        m++;
      }
      while (m &= 3); // while (m < 4);
    }
  }
}
Z7_BRANCH_FUNCS_IMP(BranchConv_IA64)
#define BR_CONVERT_VAL_ENC(v) v += BR_PC_GET;
#define BR_CONVERT_VAL_DEC(v) v -= BR_PC_GET;
#if 1 && defined(MY_CPU_LE_UNALIGN)
#define RISCV_USE_UNALIGNED_LOAD
#endif
#ifdef RISCV_USE_UNALIGNED_LOAD
#define RISCV_GET_UI32(p) GetUi32(p)
#define RISCV_SET_UI32(p, v) { SetUi32(p, v) }
#else
#define RISCV_GET_UI32(p) \
((UInt32)GetUi16a(p) + \
((UInt32)GetUi16a((p) + 2) << 16))
#define RISCV_SET_UI32(p, v) { \
SetUi16a(p, (UInt16)(v)) \
SetUi16a((p) + 2, (UInt16)(v >> 16)) }
#endif
#if 1 && defined(MY_CPU_LE)
#define RISCV_USE_16BIT_LOAD
#endif
#ifdef RISCV_USE_16BIT_LOAD
#define RISCV_LOAD_VAL(p) GetUi16a(p)
#else
#define RISCV_LOAD_VAL(p) (*(p))
#endif
#define RISCV_INSTR_SIZE 2
#define RISCV_STEP_1 (4 + RISCV_INSTR_SIZE)
#define RISCV_STEP_2 4
#define RISCV_REG_VAL (2 << 7)
#define RISCV_CMD_VAL 3
#if 1
// for code size optimization:
#define RISCV_DELTA_7F 0x7f
#else
#define RISCV_DELTA_7F 0
#endif
#define RISCV_CHECK_1(v, b) \
(((((b) - RISCV_CMD_VAL) ^ ((v) << 8)) & (0xf8000 + RISCV_CMD_VAL)) == 0)
#if 1
#define RISCV_CHECK_2(v, r) \
((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL | 8)) \
<< 18) \
< ((r) & 0x1d))
#else
// this branch gives larger code, because
// compilers generate larger code for big constants.
#define RISCV_CHECK_2(v, r) \
((((v) - ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \
& ((RISCV_CMD_VAL << 12) | RISCV_REG_VAL)) \
< ((r) & 0x1d))
#endif
#define RISCV_SCAN_LOOP \
Byte *lim; \
size &= ~(SizeT)(RISCV_INSTR_SIZE - 1); \
if (size <= 6) return p; \
size -= 6; \
lim = p + size; \
BR_PC_INIT \
for (;;) \
{ \
UInt32 a, v; \
/* Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE */ \
for (;;) \
{ \
if Z7_UNLIKELY(p >= lim) { return p; } \
a = (RISCV_LOAD_VAL(p) ^ 0x10u) + 1; \
if ((a & 0x77) == 0) break; \
a = (RISCV_LOAD_VAL(p + RISCV_INSTR_SIZE) ^ 0x10u) + 1; \
p += RISCV_INSTR_SIZE * 2; \
if ((a & 0x77) == 0) \
{ \
p -= RISCV_INSTR_SIZE; \
if Z7_UNLIKELY(p >= lim) { return p; } \
break; \
} \
}
// (xx6f ^ 10) + 1 = xx7f + 1 = xx80 : JAL
// (xxef ^ 10) + 1 = xxff + 1 = xx00 + 100 : JAL
// (xx17 ^ 10) + 1 = xx07 + 1 = xx08 : AUIPC
// (xx97 ^ 10) + 1 = xx87 + 1 = xx88 : AUIPC
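
A hedged self-test (not in the source) confirming the opcode arithmetic listed above: (b ^ 0x10) + 1 masked with 0x77 is zero exactly for the JAL (0x6f/0xef) and AUIPC (0x17/0x97) opcode bytes:

    #include <assert.h>
    static void riscv_opcode_trick_selftest(void)
    {
      unsigned b;
      for (b = 0; b < 0x100; b++)
      {
        const int hit = ((((b ^ 0x10u) + 1) & 0x77) == 0);
        const int jal_or_auipc = ((b & 0x7f) == 0x6f) || ((b & 0x7f) == 0x17);
        assert(hit == jal_or_auipc);
      }
    }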
Byte * Z7_BRANCH_CONV_ENC(RISCV)(Byte *p, SizeT size, UInt32 pc)
{
RISCV_SCAN_LOOP
v = a;
a = RISCV_GET_UI32(p);
#ifndef RISCV_USE_16BIT_LOAD
v += (UInt32)p[1] << 8;
#endif
if ((v & 8) == 0) // JAL
{
if ((v - (0x100 /* - RISCV_DELTA_7F */)) & 0xd80)
{
p += RISCV_INSTR_SIZE;
continue;
}
{
v = ((a & 1u << 31) >> 11)
| ((a & 0x3ff << 21) >> 20)
| ((a & 1 << 20) >> 9)
| (a & 0xff << 12);
BR_CONVERT_VAL_ENC(v)
// ((v & 1) == 0)
// v: bits [1 : 20] contain offset bits
#if 0 && defined(RISCV_USE_UNALIGNED_LOAD)
a &= 0xfff;
a |= ((UInt32)(v << 23))
| ((UInt32)(v << 7) & ((UInt32)0xff << 16))
| ((UInt32)(v >> 5) & ((UInt32)0xf0 << 8));
RISCV_SET_UI32(p, a)
#else // aligned
#if 0
SetUi16a(p, (UInt16)(((v >> 5) & 0xf000) | (a & 0xfff)))
#else
p[1] = (Byte)(((v >> 13) & 0xf0) | ((a >> 8) & 0xf));
#endif
#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
v <<= 15;
v = Z7_BSWAP32(v);
SetUi16a(p + 2, (UInt16)v)
#else
p[2] = (Byte)(v >> 9);
p[3] = (Byte)(v >> 1);
#endif
#endif // aligned
}
p += 4;
continue;
} // JAL
{
// AUIPC
if (v & 0xe80) // (not x0) and (not x2)
{
const UInt32 b = RISCV_GET_UI32(p + 4);
if (RISCV_CHECK_1(v, b))
{
{
const UInt32 temp = (b << 12) | (0x17 + RISCV_REG_VAL);
RISCV_SET_UI32(p, temp)
}
a &= 0xfffff000;
{
#if 1
const int t = -1 >> 1;
if (t != -1)
a += (b >> 20) - ((b >> 19) & 0x1000); // arithmetic right shift emulation
else
#endif
a += (UInt32)((Int32)b >> 20); // arithmetic right shift (sign-extension).
}
BR_CONVERT_VAL_ENC(a)
#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
a = Z7_BSWAP32(a);
RISCV_SET_UI32(p + 4, a)
#else
SetBe32(p + 4, a)
#endif
p += 8;
}
else
p += RISCV_STEP_1;
}
else
{
UInt32 r = a >> 27;
if (RISCV_CHECK_2(v, r))
{
v = RISCV_GET_UI32(p + 4);
r = (r << 7) + 0x17 + (v & 0xfffff000);
a = (a >> 12) | (v << 20);
RISCV_SET_UI32(p, r)
RISCV_SET_UI32(p + 4, a)
p += 8;
}
else
p += RISCV_STEP_2;
}
}
} // for
}
Byte * Z7_BRANCH_CONV_DEC(RISCV)(Byte *p, SizeT size, UInt32 pc)
{
RISCV_SCAN_LOOP
#ifdef RISCV_USE_16BIT_LOAD
if ((a & 8) == 0)
{
#else
v = a;
a += (UInt32)p[1] << 8;
if ((v & 8) == 0)
{
#endif
// JAL
a -= 0x100 - RISCV_DELTA_7F;
if (a & 0xd80)
{
p += RISCV_INSTR_SIZE;
continue;
}
{
const UInt32 a_old = (a + (0xef - RISCV_DELTA_7F)) & 0xfff;
#if 0 // unaligned
a = GetUi32(p);
v = (UInt32)(a >> 23) & ((UInt32)0xff << 1)
| (UInt32)(a >> 7) & ((UInt32)0xff << 9)
#elif 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
v = GetUi16a(p + 2);
v = Z7_BSWAP32(v) >> 15
#else
v = (UInt32)p[3] << 1
| (UInt32)p[2] << 9
#endif
| (UInt32)((a & 0xf000) << 5);
BR_CONVERT_VAL_DEC(v)
a = a_old
| (v << 11 & 1u << 31)
| (v << 20 & 0x3ff << 21)
| (v << 9 & 1 << 20)
| (v & 0xff << 12);
RISCV_SET_UI32(p, a)
}
p += 4;
continue;
} // JAL
{
// AUIPC
v = a;
#if 1 && defined(RISCV_USE_UNALIGNED_LOAD)
a = GetUi32(p);
#else
a |= (UInt32)GetUi16a(p + 2) << 16;
#endif
if ((v & 0xe80) == 0) // x0/x2
{
const UInt32 r = a >> 27;
if (RISCV_CHECK_2(v, r))
{
UInt32 b;
#if 1 && defined(Z7_CPU_FAST_BSWAP_SUPPORTED) && defined(MY_CPU_LE)
b = RISCV_GET_UI32(p + 4);
b = Z7_BSWAP32(b);
#else
b = GetBe32(p + 4);
#endif
v = a >> 12;
BR_CONVERT_VAL_DEC(b)
a = (r << 7) + 0x17;
a += (b + 0x800) & 0xfffff000;
v |= b << 20;
RISCV_SET_UI32(p, a)
RISCV_SET_UI32(p + 4, v)
p += 8;
}
else
p += RISCV_STEP_2;
}
else
{
const UInt32 b = RISCV_GET_UI32(p + 4);
if (!RISCV_CHECK_1(v, b))
p += RISCV_STEP_1;
else
{
v = (a & 0xfffff000) | (b >> 20);
a = (b << 12) | (0x17 + RISCV_REG_VAL);
RISCV_SET_UI32(p, a)
RISCV_SET_UI32(p + 4, v)
p += 8;
}
}
}
} // for
}
C/Bra.h  (117 changed lines)
@ -1,64 +1,105 @@
/* Bra.h -- Branch converters for executables
2024-01-20 : Igor Pavlov : Public domain */

#ifndef ZIP7_INC_BRA_H
#define ZIP7_INC_BRA_H
#include "7zTypes.h" #include "7zTypes.h"
EXTERN_C_BEGIN EXTERN_C_BEGIN
/* #define PPC BAD_PPC_11 // for debug */
#define Z7_BRANCH_CONV_DEC_2(name) z7_ ## name ## _Dec
#define Z7_BRANCH_CONV_ENC_2(name) z7_ ## name ## _Enc
#define Z7_BRANCH_CONV_DEC(name) Z7_BRANCH_CONV_DEC_2(BranchConv_ ## name)
#define Z7_BRANCH_CONV_ENC(name) Z7_BRANCH_CONV_ENC_2(BranchConv_ ## name)
#define Z7_BRANCH_CONV_ST_DEC(name) z7_BranchConvSt_ ## name ## _Dec
#define Z7_BRANCH_CONV_ST_ENC(name) z7_BranchConvSt_ ## name ## _Enc
#define Z7_BRANCH_CONV_DECL(name) Byte * name(Byte *data, SizeT size, UInt32 pc)
#define Z7_BRANCH_CONV_ST_DECL(name) Byte * name(Byte *data, SizeT size, UInt32 pc, UInt32 *state)
typedef Z7_BRANCH_CONV_DECL( (*z7_Func_BranchConv));
typedef Z7_BRANCH_CONV_ST_DECL((*z7_Func_BranchConvSt));
#define Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL 0
Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_DEC(X86));
Z7_BRANCH_CONV_ST_DECL (Z7_BRANCH_CONV_ST_ENC(X86));
#define Z7_BRANCH_FUNCS_DECL(name) \
Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_DEC_2(name)); \
Z7_BRANCH_CONV_DECL (Z7_BRANCH_CONV_ENC_2(name));
Z7_BRANCH_FUNCS_DECL (BranchConv_ARM64)
Z7_BRANCH_FUNCS_DECL (BranchConv_ARM)
Z7_BRANCH_FUNCS_DECL (BranchConv_ARMT)
Z7_BRANCH_FUNCS_DECL (BranchConv_PPC)
Z7_BRANCH_FUNCS_DECL (BranchConv_SPARC)
Z7_BRANCH_FUNCS_DECL (BranchConv_IA64)
Z7_BRANCH_FUNCS_DECL (BranchConv_RISCV)
/*
These functions convert relative addresses to absolute addresses
in CALL instructions to increase the compression ratio.

  In:
    data     - data buffer
    size     - size of data
    ip       - current virtual Instruction Pointer (IP) value
    state    - state variable for x86 converter
    encoding - 0 (for decoding), 1 (for encoding)

  Out:
    state    - state variable for x86 converter

  Returns:
    The number of processed bytes. If you call these functions with multiple calls,
    you must start next call with first byte after block of processed bytes.

  Type   Endian  Alignment  LookAhead

  x86    little      1          4
  ARMT   little      2          2
  ARM    little      4          0
  PPC    big         4          0
  SPARC  big         4          0
  IA64   little     16          0

  size must be >= Alignment + LookAhead, if it's not last block.
  If (size < Alignment + LookAhead), converter returns 0.

  Example:

    UInt32 ip = 0;
    for ()
    {
      ; size must be >= Alignment + LookAhead, if it's not last block
      SizeT processed = Convert(data, size, ip, 1);
      data += processed;
      size -= processed;
      ip += processed;
    }
*/
/*
These functions convert data that contain CPU instructions.
Each such function converts relative addresses to absolute addresses in some
branch instructions: CALL (in all converters) and JUMP (X86 converter only).
Such conversion allows to increase compression ratio, if we compress that data.

There are 2 types of converters:
  Byte * Conv_RISC (Byte *data, SizeT size, UInt32 pc);
  Byte * ConvSt_X86(Byte *data, SizeT size, UInt32 pc, UInt32 *state);
Each Converter supports 2 versions: one for encoding
and one for decoding (_Enc/_Dec postfixes in function name).

In params:
  data  : data buffer
  size  : size of data
  pc    : current virtual Program Counter (Instruction Pointer) value
In/Out param:
  state : pointer to state variable (for X86 converter only)

Return:
  The pointer to position in (data) buffer after last byte that was processed.
  If the caller calls converter again, it must call it starting with that position.
  But the caller is allowed to move data in buffer. So pointer to
  current processed position also will be changed for next call.
  Also the caller must increase internal (pc) value for next call.

Each converter has some characteristics: Endian, Alignment, LookAhead.
  Type   Endian  Alignment  LookAhead

  X86    little      1          4
  ARMT   little      2          2
  RISCV  little      2          6
  ARM    little      4          0
  ARM64  little      4          0
  PPC    big         4          0
  SPARC  big         4          0
  IA64   little     16          0

  (data) must be aligned for (Alignment).
  processed size can be calculated as:
    SizeT processed = Conv(data, size, pc) - data;
  if (processed == 0)
    it means that converter needs more data for processing.
  If (size < Alignment + LookAhead)
    then (processed == 0) is allowed.

Example code for conversion in loop:
  UInt32 pc = 0;
  size = 0;
  for (;;)
  {
    size += Load_more_input_data(data + size);
    SizeT processed = Conv(data, size, pc) - data;
    if (processed == 0 && no_more_input_data_after_size)
      break; // we stop convert loop
    data += processed;
    size -= processed;
    pc += processed;
  }
*/
#define x86_Convert_Init(state) { state = 0; }
SizeT x86_Convert(Byte *data, SizeT size, UInt32 ip, UInt32 *state, int encoding);
SizeT ARM_Convert(Byte *data, SizeT size, UInt32 ip, int encoding);
SizeT ARMT_Convert(Byte *data, SizeT size, UInt32 ip, int encoding);
SizeT PPC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding);
SizeT SPARC_Convert(Byte *data, SizeT size, UInt32 ip, int encoding);
SizeT IA64_Convert(Byte *data, SizeT size, UInt32 ip, int encoding);
EXTERN_C_END EXTERN_C_END
#endif #endif
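
A hedged usage sketch for the stateful X86 (BCJ) converter declared above; per the macros, Z7_BRANCH_CONV_ST_ENC(X86) expands to z7_BranchConvSt_X86_Enc, and the state must start at Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL and be carried across calls on the same stream:

#include <stdio.h>
#include "Bra.h"

/* single-shot encode of a whole buffer; for streaming, loop as in the
   comment above and keep (state) and (pc) across calls */
static void bcj_x86_encode(Byte *data, SizeT size)
{
  UInt32 state = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
  const Byte *end = z7_BranchConvSt_X86_Enc(data, size, 0 /* pc */, &state);
  printf("processed %u bytes\n", (unsigned)(end - data));
}
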

C/Bra86.c
@ -1,82 +1,187 @@
/* Bra86.c -- Converter for x86 code (BCJ) /* Bra86.c -- Branch converter for X86 code (BCJ)
2021-02-09 : Igor Pavlov : Public domain */ 2023-04-02 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include "Bra.h" #include "Bra.h"
#include "CpuArch.h"
#define Test86MSByte(b) ((((b) + 1) & 0xFE) == 0)
SizeT x86_Convert(Byte *data, SizeT size, UInt32 ip, UInt32 *state, int encoding) #if defined(MY_CPU_SIZEOF_POINTER) \
&& ( MY_CPU_SIZEOF_POINTER == 4 \
|| MY_CPU_SIZEOF_POINTER == 8)
#define BR_CONV_USE_OPT_PC_PTR
#endif
#ifdef BR_CONV_USE_OPT_PC_PTR
#define BR_PC_INIT pc -= (UInt32)(SizeT)p; // (MY_uintptr_t)
#define BR_PC_GET (pc + (UInt32)(SizeT)p)
#else
#define BR_PC_INIT pc += (UInt32)size;
#define BR_PC_GET (pc - (UInt32)(SizeT)(lim - p))
// #define BR_PC_INIT
// #define BR_PC_GET (pc + (UInt32)(SizeT)(p - data))
#endif
#define BR_CONVERT_VAL(v, c) if (encoding) v += c; else v -= c;
// #define BR_CONVERT_VAL(v, c) if (!encoding) c = (UInt32)0 - c; v += c;
#define Z7_BRANCH_CONV_ST(name) z7_BranchConvSt_ ## name
#define BR86_NEED_CONV_FOR_MS_BYTE(b) ((((b) + 1) & 0xfe) == 0)
#ifdef MY_CPU_LE_UNALIGN
#define BR86_PREPARE_BCJ_SCAN const UInt32 v = GetUi32(p) ^ 0xe8e8e8e8;
#define BR86_IS_BCJ_BYTE(n) ((v & ((UInt32)0xfe << (n) * 8)) == 0)
#else
#define BR86_PREPARE_BCJ_SCAN
// bad for MSVC X86 (partial write to byte reg):
#define BR86_IS_BCJ_BYTE(n) ((p[n - 4] & 0xfe) == 0xe8)
// bad for old MSVC (partial write to byte reg):
// #define BR86_IS_BCJ_BYTE(n) (((*p ^ 0xe8) & 0xfe) == 0)
#endif
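
The unaligned-load path above scans four bytes per iteration: XORing the 32-bit little-endian word with 0xe8e8e8e8 zeroes (down to bit 0) every lane holding 0xE8 (CALL) or 0xE9 (JMP), so one masked test per lane finds candidate opcode bytes. A self-contained sketch of the same test (names are illustrative, not from the source):

#include <stdio.h>

typedef unsigned int UInt32;

static int is_bcj_byte(const unsigned char *p, unsigned n) /* n in 0..3 */
{
  const UInt32 v = ((UInt32)p[0] | ((UInt32)p[1] << 8) |
                    ((UInt32)p[2] << 16) | ((UInt32)p[3] << 24)) ^ 0xe8e8e8e8;
  return (v & ((UInt32)0xfe << (n * 8))) == 0; /* lane n is 0xE8 or 0xE9 */
}

int main(void)
{
  const unsigned char buf[4] = { 0x90, 0xe8, 0xe9, 0x05 };
  printf("%d %d %d %d\n", is_bcj_byte(buf, 0), is_bcj_byte(buf, 1),
                          is_bcj_byte(buf, 2), is_bcj_byte(buf, 3));
  /* prints: 0 1 1 0 */
  return 0;
}
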
static
Z7_FORCE_INLINE
Z7_ATTRIB_NO_VECTOR
Byte *Z7_BRANCH_CONV_ST(X86)(Byte *p, SizeT size, UInt32 pc, UInt32 *state, int encoding)
{ {
SizeT pos = 0;
UInt32 mask = *state & 7;
if (size < 5) if (size < 5)
return 0; return p;
size -= 4; {
ip += 5; // Byte *p = data;
const Byte *lim = p + size - 4;
unsigned mask = (unsigned)*state; // & 7;
#ifdef BR_CONV_USE_OPT_PC_PTR
/* if BR_CONV_USE_OPT_PC_PTR is defined: we need to adjust (pc) for (+4),
because call/jump offset is relative to the next instruction.
if BR_CONV_USE_OPT_PC_PTR is not defined : we don't need to adjust (pc) for (+4),
because BR_PC_GET uses (pc - (lim - p)), and lim was adjusted for (-4) before.
*/
pc += 4;
#endif
BR_PC_INIT
goto start;
for (;;) for (;; mask |= 4)
{ {
Byte *p = data + pos; // cont: mask |= 4;
const Byte *limit = data + size; start:
for (; p < limit; p++) if (p >= lim)
if ((*p & 0xFE) == 0xE8) goto fin;
break;
{ {
SizeT d = (SizeT)(p - data) - pos; BR86_PREPARE_BCJ_SCAN
pos = (SizeT)(p - data); p += 4;
if (p >= limit) if (BR86_IS_BCJ_BYTE(0)) { goto m0; } mask >>= 1;
{ if (BR86_IS_BCJ_BYTE(1)) { goto m1; } mask >>= 1;
*state = (d > 2 ? 0 : mask >> (unsigned)d); if (BR86_IS_BCJ_BYTE(2)) { goto m2; } mask = 0;
return pos; if (BR86_IS_BCJ_BYTE(3)) { goto a3; }
}
if (d > 2)
mask = 0;
else
{
mask >>= (unsigned)d;
if (mask != 0 && (mask > 4 || mask == 3 || Test86MSByte(p[(size_t)(mask >> 1) + 1])))
{
mask = (mask >> 1) | 4;
pos++;
continue;
}
}
} }
goto main_loop;
if (Test86MSByte(p[4])) m0: p--;
m1: p--;
m2: p--;
if (mask == 0)
goto a3;
if (p > lim)
goto fin_p;
// if (((0x17u >> mask) & 1) == 0)
if (mask > 4 || mask == 3)
{ {
UInt32 v = ((UInt32)p[4] << 24) | ((UInt32)p[3] << 16) | ((UInt32)p[2] << 8) | ((UInt32)p[1]); mask >>= 1;
UInt32 cur = ip + (UInt32)pos; continue; // goto cont;
pos += 5; }
if (encoding) mask >>= 1;
v += cur; if (BR86_NEED_CONV_FOR_MS_BYTE(p[mask]))
else continue; // goto cont;
v -= cur; // if (!BR86_NEED_CONV_FOR_MS_BYTE(p[3])) continue; // goto cont;
if (mask != 0) {
UInt32 v = GetUi32(p);
UInt32 c;
v += (1 << 24); if (v & 0xfe000000) continue; // goto cont;
c = BR_PC_GET;
BR_CONVERT_VAL(v, c)
{ {
unsigned sh = (mask & 6) << 2; mask <<= 3;
if (Test86MSByte((Byte)(v >> sh))) if (BR86_NEED_CONV_FOR_MS_BYTE(v >> mask))
{ {
v ^= (((UInt32)0x100 << sh) - 1); v ^= (((UInt32)0x100 << mask) - 1);
if (encoding) #ifdef MY_CPU_X86
v += cur; // for X86 : we can recalculate (c) to reduce register pressure
else c = BR_PC_GET;
v -= cur; #endif
BR_CONVERT_VAL(v, c)
} }
mask = 0; mask = 0;
} }
p[1] = (Byte)v; // v = (v & ((1 << 24) - 1)) - (v & (1 << 24));
p[2] = (Byte)(v >> 8); v &= (1 << 25) - 1; v -= (1 << 24);
p[3] = (Byte)(v >> 16); SetUi32(p, v)
p[4] = (Byte)(0 - ((v >> 24) & 1)); p += 4;
goto main_loop;
} }
else
main_loop:
if (p >= lim)
goto fin;
for (;;)
{ {
mask = (mask >> 1) | 4; BR86_PREPARE_BCJ_SCAN
pos++; p += 4;
if (BR86_IS_BCJ_BYTE(0)) { goto a0; }
if (BR86_IS_BCJ_BYTE(1)) { goto a1; }
if (BR86_IS_BCJ_BYTE(2)) { goto a2; }
if (BR86_IS_BCJ_BYTE(3)) { goto a3; }
if (p >= lim)
goto fin;
}
a0: p--;
a1: p--;
a2: p--;
a3:
if (p > lim)
goto fin_p;
// if (!BR86_NEED_CONV_FOR_MS_BYTE(p[3])) continue; // goto cont;
{
UInt32 v = GetUi32(p);
UInt32 c;
v += (1 << 24); if (v & 0xfe000000) continue; // goto cont;
c = BR_PC_GET;
BR_CONVERT_VAL(v, c)
// v = (v & ((1 << 24) - 1)) - (v & (1 << 24));
v &= (1 << 25) - 1; v -= (1 << 24);
SetUi32(p, v)
p += 4;
goto main_loop;
} }
} }
fin_p:
p--;
fin:
// the following processing for tail is optional and can be commented
/*
lim += 4;
for (; p < lim; p++, mask >>= 1)
if ((*p & 0xfe) == 0xe8)
break;
*/
*state = (UInt32)mask;
return p;
}
} }
#define Z7_BRANCH_CONV_ST_FUNC_IMP(name, m, encoding) \
Z7_NO_INLINE \
Z7_ATTRIB_NO_VECTOR \
Byte *m(name)(Byte *data, SizeT size, UInt32 pc, UInt32 *state) \
{ return Z7_BRANCH_CONV_ST(name)(data, size, pc, state, encoding); }
Z7_BRANCH_CONV_ST_FUNC_IMP(X86, Z7_BRANCH_CONV_ST_DEC, 0)
#ifndef Z7_EXTRACT_ONLY
Z7_BRANCH_CONV_ST_FUNC_IMP(X86, Z7_BRANCH_CONV_ST_ENC, 1)
#endif
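
Z7_BRANCH_CONV_ST_FUNC_IMP stamps out two outlined wrappers around the force-inlined core, passing `encoding` as a compile-time constant so BR_CONVERT_VAL folds to a plain add in the encoder and a plain subtract in the decoder. A generic sketch of the pattern:

static inline int core(int v, int delta, int encoding)
{
  if (encoding) v += delta; else v -= delta; /* same shape as BR_CONVERT_VAL */
  return v;
}
int enc(int v, int d) { return core(v, d, 1); } /* specializes to v + d */
int dec(int v, int d) { return core(v, d, 0); } /* specializes to v - d */
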

C/BraIA64.c
@ -1,53 +1,14 @@
/* BraIA64.c -- Converter for IA-64 code /* BraIA64.c -- Converter for IA-64 code
2017-01-26 : Igor Pavlov : Public domain */ 2023-02-20 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include "CpuArch.h" // the code was moved to Bra.c
#include "Bra.h"
SizeT IA64_Convert(Byte *data, SizeT size, UInt32 ip, int encoding) #ifdef _MSC_VER
{ #pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty
SizeT i; #endif
if (size < 16)
return 0;
size -= 16;
i = 0;
do
{
unsigned m = ((UInt32)0x334B0000 >> (data[i] & 0x1E)) & 3;
if (m)
{
m++;
do
{
Byte *p = data + (i + (size_t)m * 5 - 8);
if (((p[3] >> m) & 15) == 5
&& (((p[-1] | ((UInt32)p[0] << 8)) >> m) & 0x70) == 0)
{
unsigned raw = GetUi32(p);
unsigned v = raw >> m;
v = (v & 0xFFFFF) | ((v & (1 << 23)) >> 3);
v <<= 4; #if defined(__clang__)
if (encoding) #pragma GCC diagnostic ignored "-Wempty-translation-unit"
v += ip + (UInt32)i; #endif
else
v -= ip + (UInt32)i;
v >>= 4;
v &= 0x1FFFFF;
v += 0x700000;
v &= 0x8FFFFF;
raw &= ~((UInt32)0x8FFFFF << m);
raw |= (v << m);
SetUi32(p, raw);
}
}
while (++m <= 4);
}
i += 16;
}
while (i <= size);
return i;
}

C/BwtSort.c
@ -1,5 +1,5 @@
/* BwtSort.c -- BWT block sorting /* BwtSort.c -- BWT block sorting
2021-04-01 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -7,8 +7,44 @@
#include "Sort.h" #include "Sort.h"
/* #define BLOCK_SORT_USE_HEAP_SORT */ /* #define BLOCK_SORT_USE_HEAP_SORT */
// #define BLOCK_SORT_USE_HEAP_SORT
#ifdef BLOCK_SORT_USE_HEAP_SORT
#define HeapSortRefDown(p, vals, n, size, temp) \
{ size_t k = n; UInt32 val = vals[temp]; for (;;) { \
size_t s = k << 1; \
if (s > size) break; \
if (s < size && vals[p[s + 1]] > vals[p[s]]) s++; \
if (val >= vals[p[s]]) break; \
p[k] = p[s]; k = s; \
} p[k] = temp; }
void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size)
{
if (size <= 1)
return;
p--;
{
size_t i = size / 2;
do
{
UInt32 temp = p[i];
HeapSortRefDown(p, vals, i, size, temp);
}
while (--i != 0);
}
do
{
UInt32 temp = p[size];
p[size--] = p[1];
HeapSortRefDown(p, vals, 1, size, temp);
}
while (size > 1);
}
#endif // BLOCK_SORT_USE_HEAP_SORT
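
A hedged usage sketch for HeapSortRef above (it is only compiled when BLOCK_SORT_USE_HEAP_SORT is defined): it sorts the index array p so that the keys vals[p[0]] <= vals[p[1]] <= ... come out ascending:

#include <stdio.h>
#include "7zTypes.h"

void HeapSortRef(UInt32 *p, UInt32 *vals, size_t size);

int demo_heap_sort_ref(void)
{
  UInt32 vals[3] = { 30, 10, 20 };
  UInt32 p[3]    = { 0, 1, 2 };
  HeapSortRef(p, vals, 3);
  /* prints "1 2 0" : indices ordered by ascending vals[...] */
  printf("%u %u %u\n", (unsigned)p[0], (unsigned)p[1], (unsigned)p[2]);
  return 0;
}
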
#define NO_INLINE MY_FAST_CALL
/* Don't change it !!! */ /* Don't change it !!! */
#define kNumHashBytes 2 #define kNumHashBytes 2
@ -29,26 +65,27 @@
#else #else
#define kNumBitsMax 20 #define kNumBitsMax 20
#define kIndexMask ((1 << kNumBitsMax) - 1) #define kIndexMask (((UInt32)1 << kNumBitsMax) - 1)
#define kNumExtraBits (32 - kNumBitsMax) #define kNumExtraBits (32 - kNumBitsMax)
#define kNumExtra0Bits (kNumExtraBits - 2) #define kNumExtra0Bits (kNumExtraBits - 2)
#define kNumExtra0Mask ((1 << kNumExtra0Bits) - 1) #define kNumExtra0Mask ((1 << kNumExtra0Bits) - 1)
#define SetFinishedGroupSize(p, size) \ #define SetFinishedGroupSize(p, size) \
{ *(p) |= ((((size) - 1) & kNumExtra0Mask) << kNumBitsMax); \ { *(p) |= ((((UInt32)(size) - 1) & kNumExtra0Mask) << kNumBitsMax); \
if ((size) > (1 << kNumExtra0Bits)) { \ if ((size) > (1 << kNumExtra0Bits)) { \
*(p) |= 0x40000000; *((p) + 1) |= ((((size) - 1)>> kNumExtra0Bits) << kNumBitsMax); } } \ *(p) |= 0x40000000; \
*((p) + 1) |= (((UInt32)(size) - 1) >> kNumExtra0Bits) << kNumBitsMax; } } \
static void SetGroupSize(UInt32 *p, UInt32 size) static void SetGroupSize(UInt32 *p, size_t size)
{ {
if (--size == 0) if (--size == 0)
return; return;
*p |= 0x80000000 | ((size & kNumExtra0Mask) << kNumBitsMax); *p |= 0x80000000 | (((UInt32)size & kNumExtra0Mask) << kNumBitsMax);
if (size >= (1 << kNumExtra0Bits)) if (size >= (1 << kNumExtra0Bits))
{ {
*p |= 0x40000000; *p |= 0x40000000;
p[1] |= ((size >> kNumExtra0Bits) << kNumBitsMax); p[1] |= (((UInt32)size >> kNumExtra0Bits) << kNumBitsMax);
} }
} }
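
A worked example of the packed encoding that SetGroupSize writes, assuming the non-external-flags constants shown above (kNumBitsMax = 20, kNumExtra0Bits = 10, UInt32 from 7zTypes.h): a group of size 5 at index word w stores (5 - 1) & 0x3ff in bits 20..29 of w and sets bit 31; sizes needing more than 10 bits spill their high bits into the next word, flagged by bit 30. SortGroup decodes it back as:

UInt32 decode_group_size(const UInt32 *w)
{
  UInt32 size = (w[0] & ~(UInt32)0xC0000000) >> 20; /* low 10 bits of (size-1) */
  if (w[0] & 0x40000000)
    size += (w[1] >> 20) << 10;                     /* spilled high bits */
  return size + 1;
}
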
@ -60,10 +97,15 @@ SortGroup - is recursive Range-Sort function with HeapSort optimization for smal
returns: 1 - if there are groups, 0 - no more groups returns: 1 - if there are groups, 0 - no more groups
*/ */
static UInt32 NO_INLINE SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt32 groupOffset, UInt32 groupSize, int NumRefBits, UInt32 *Indices static
#ifndef BLOCK_SORT_USE_HEAP_SORT unsigned
, UInt32 left, UInt32 range Z7_FASTCALL
#endif SortGroup(size_t BlockSize, size_t NumSortedBytes,
size_t groupOffset, size_t groupSize,
unsigned NumRefBits, UInt32 *Indices
#ifndef BLOCK_SORT_USE_HEAP_SORT
, size_t left, size_t range
#endif
) )
{ {
UInt32 *ind2 = Indices + groupOffset; UInt32 *ind2 = Indices + groupOffset;
@ -72,96 +114,99 @@ static UInt32 NO_INLINE SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt3
{ {
/* /*
#ifndef BLOCK_SORT_EXTERNAL_FLAGS #ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetFinishedGroupSize(ind2, 1); SetFinishedGroupSize(ind2, 1)
#endif #endif
*/ */
return 0; return 0;
} }
Groups = Indices + BlockSize + BS_TEMP_SIZE; Groups = Indices + BlockSize + BS_TEMP_SIZE;
if (groupSize <= ((UInt32)1 << NumRefBits) if (groupSize <= ((size_t)1 << NumRefBits)
#ifndef BLOCK_SORT_USE_HEAP_SORT #ifndef BLOCK_SORT_USE_HEAP_SORT
&& groupSize <= range && groupSize <= range
#endif #endif
) )
{ {
UInt32 *temp = Indices + BlockSize; UInt32 *temp = Indices + BlockSize;
UInt32 j; size_t j, group;
UInt32 mask, thereAreGroups, group, cg; UInt32 mask, cg;
unsigned thereAreGroups;
{ {
UInt32 gPrev; UInt32 gPrev;
UInt32 gRes = 0; UInt32 gRes = 0;
{ {
UInt32 sp = ind2[0] + NumSortedBytes; size_t sp = ind2[0] + NumSortedBytes;
if (sp >= BlockSize) sp -= BlockSize; if (sp >= BlockSize)
sp -= BlockSize;
gPrev = Groups[sp]; gPrev = Groups[sp];
temp[0] = (gPrev << NumRefBits); temp[0] = gPrev << NumRefBits;
} }
for (j = 1; j < groupSize; j++) for (j = 1; j < groupSize; j++)
{ {
UInt32 sp = ind2[j] + NumSortedBytes; size_t sp = ind2[j] + NumSortedBytes;
UInt32 g; UInt32 g;
if (sp >= BlockSize) sp -= BlockSize; if (sp >= BlockSize)
sp -= BlockSize;
g = Groups[sp]; g = Groups[sp];
temp[j] = (g << NumRefBits) | j; temp[j] = (g << NumRefBits) | (UInt32)j;
gRes |= (gPrev ^ g); gRes |= (gPrev ^ g);
} }
if (gRes == 0) if (gRes == 0)
{ {
#ifndef BLOCK_SORT_EXTERNAL_FLAGS #ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetGroupSize(ind2, groupSize); SetGroupSize(ind2, groupSize);
#endif #endif
return 1; return 1;
} }
} }
HeapSort(temp, groupSize); HeapSort(temp, groupSize);
mask = (((UInt32)1 << NumRefBits) - 1); mask = ((UInt32)1 << NumRefBits) - 1;
thereAreGroups = 0; thereAreGroups = 0;
group = groupOffset; group = groupOffset;
cg = (temp[0] >> NumRefBits); cg = temp[0] >> NumRefBits;
temp[0] = ind2[temp[0] & mask]; temp[0] = ind2[temp[0] & mask];
{ {
#ifdef BLOCK_SORT_EXTERNAL_FLAGS #ifdef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 *Flags = Groups + BlockSize; UInt32 *Flags = Groups + BlockSize;
#else #else
UInt32 prevGroupStart = 0; size_t prevGroupStart = 0;
#endif #endif
for (j = 1; j < groupSize; j++) for (j = 1; j < groupSize; j++)
{ {
UInt32 val = temp[j]; const UInt32 val = temp[j];
UInt32 cgCur = (val >> NumRefBits); const UInt32 cgCur = val >> NumRefBits;
if (cgCur != cg) if (cgCur != cg)
{ {
cg = cgCur; cg = cgCur;
group = groupOffset + j; group = groupOffset + j;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS #ifdef BLOCK_SORT_EXTERNAL_FLAGS
{ {
UInt32 t = group - 1; const size_t t = group - 1;
Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
} }
#else #else
SetGroupSize(temp + prevGroupStart, j - prevGroupStart); SetGroupSize(temp + prevGroupStart, j - prevGroupStart);
prevGroupStart = j; prevGroupStart = j;
#endif #endif
} }
else else
thereAreGroups = 1; thereAreGroups = 1;
{ {
UInt32 ind = ind2[val & mask]; const UInt32 ind = ind2[val & mask];
temp[j] = ind; temp[j] = ind;
Groups[ind] = group; Groups[ind] = (UInt32)group;
} }
} }
#ifndef BLOCK_SORT_EXTERNAL_FLAGS #ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetGroupSize(temp + prevGroupStart, j - prevGroupStart); SetGroupSize(temp + prevGroupStart, j - prevGroupStart);
#endif #endif
} }
for (j = 0; j < groupSize; j++) for (j = 0; j < groupSize; j++)
@ -171,37 +216,42 @@ static UInt32 NO_INLINE SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt3
/* Check that all strings are in one group (cannot sort) */ /* Check that all strings are in one group (cannot sort) */
{ {
UInt32 group, j; UInt32 group;
UInt32 sp = ind2[0] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; size_t j;
size_t sp = ind2[0] + NumSortedBytes;
if (sp >= BlockSize)
sp -= BlockSize;
group = Groups[sp]; group = Groups[sp];
for (j = 1; j < groupSize; j++) for (j = 1; j < groupSize; j++)
{ {
sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; sp = ind2[j] + NumSortedBytes;
if (sp >= BlockSize)
sp -= BlockSize;
if (Groups[sp] != group) if (Groups[sp] != group)
break; break;
} }
if (j == groupSize) if (j == groupSize)
{ {
#ifndef BLOCK_SORT_EXTERNAL_FLAGS #ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetGroupSize(ind2, groupSize); SetGroupSize(ind2, groupSize);
#endif #endif
return 1; return 1;
} }
} }
#ifndef BLOCK_SORT_USE_HEAP_SORT #ifndef BLOCK_SORT_USE_HEAP_SORT
{ {
/* ---------- Range Sort ---------- */ /* ---------- Range Sort ---------- */
UInt32 i; size_t i;
UInt32 mid; size_t mid;
for (;;) for (;;)
{ {
UInt32 j; size_t j;
if (range <= 1) if (range <= 1)
{ {
#ifndef BLOCK_SORT_EXTERNAL_FLAGS #ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetGroupSize(ind2, groupSize); SetGroupSize(ind2, groupSize);
#endif #endif
return 1; return 1;
} }
mid = left + ((range + 1) >> 1); mid = left + ((range + 1) >> 1);
@ -209,7 +259,7 @@ static UInt32 NO_INLINE SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt3
i = 0; i = 0;
do do
{ {
UInt32 sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; size_t sp = ind2[i] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize;
if (Groups[sp] >= mid) if (Groups[sp] >= mid)
{ {
for (j--; j > i; j--) for (j--; j > i; j--)
@ -237,51 +287,53 @@ static UInt32 NO_INLINE SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt3
break; break;
} }
#ifdef BLOCK_SORT_EXTERNAL_FLAGS #ifdef BLOCK_SORT_EXTERNAL_FLAGS
{ {
UInt32 t = (groupOffset + i - 1); const size_t t = groupOffset + i - 1;
UInt32 *Flags = Groups + BlockSize; UInt32 *Flags = Groups + BlockSize;
Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
} }
#endif #endif
{ {
UInt32 j; size_t j;
for (j = i; j < groupSize; j++) for (j = i; j < groupSize; j++)
Groups[ind2[j]] = groupOffset + i; Groups[ind2[j]] = (UInt32)(groupOffset + i);
} }
{ {
UInt32 res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left); unsigned res = SortGroup(BlockSize, NumSortedBytes, groupOffset, i, NumRefBits, Indices, left, mid - left);
return res | SortGroup(BlockSize, NumSortedBytes, groupOffset + i, groupSize - i, NumRefBits, Indices, mid, range - (mid - left)); return res | SortGroup(BlockSize, NumSortedBytes, groupOffset + i, groupSize - i, NumRefBits, Indices, mid, range - (mid - left));
} }
} }
#else #else // BLOCK_SORT_USE_HEAP_SORT
/* ---------- Heap Sort ---------- */ /* ---------- Heap Sort ---------- */
{ {
UInt32 j; size_t j;
for (j = 0; j < groupSize; j++) for (j = 0; j < groupSize; j++)
{ {
UInt32 sp = ind2[j] + NumSortedBytes; if (sp >= BlockSize) sp -= BlockSize; size_t sp = ind2[j] + NumSortedBytes;
ind2[j] = sp; if (sp >= BlockSize)
sp -= BlockSize;
ind2[j] = (UInt32)sp;
} }
HeapSortRef(ind2, Groups, groupSize); HeapSortRef(ind2, Groups, groupSize);
/* Write Flags */ /* Write Flags */
{ {
UInt32 sp = ind2[0]; size_t sp = ind2[0];
UInt32 group = Groups[sp]; UInt32 group = Groups[sp];
#ifdef BLOCK_SORT_EXTERNAL_FLAGS #ifdef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 *Flags = Groups + BlockSize; UInt32 *Flags = Groups + BlockSize;
#else #else
UInt32 prevGroupStart = 0; size_t prevGroupStart = 0;
#endif #endif
for (j = 1; j < groupSize; j++) for (j = 1; j < groupSize; j++)
{ {
@ -289,149 +341,210 @@ static UInt32 NO_INLINE SortGroup(UInt32 BlockSize, UInt32 NumSortedBytes, UInt3
if (Groups[sp] != group) if (Groups[sp] != group)
{ {
group = Groups[sp]; group = Groups[sp];
#ifdef BLOCK_SORT_EXTERNAL_FLAGS #ifdef BLOCK_SORT_EXTERNAL_FLAGS
{ {
UInt32 t = groupOffset + j - 1; const size_t t = groupOffset + j - 1;
Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
} }
#else #else
SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart); SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart);
prevGroupStart = j; prevGroupStart = j;
#endif #endif
} }
} }
#ifndef BLOCK_SORT_EXTERNAL_FLAGS #ifndef BLOCK_SORT_EXTERNAL_FLAGS
SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart); SetGroupSize(ind2 + prevGroupStart, j - prevGroupStart);
#endif #endif
} }
{ {
/* Write new Groups values and Check that there are groups */ /* Write new Groups values and Check that there are groups */
UInt32 thereAreGroups = 0; unsigned thereAreGroups = 0;
for (j = 0; j < groupSize; j++) for (j = 0; j < groupSize; j++)
{ {
UInt32 group = groupOffset + j; size_t group = groupOffset + j;
#ifndef BLOCK_SORT_EXTERNAL_FLAGS #ifndef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 subGroupSize = ((ind2[j] & ~0xC0000000) >> kNumBitsMax); UInt32 subGroupSize = ((ind2[j] & ~0xC0000000) >> kNumBitsMax);
if ((ind2[j] & 0x40000000) != 0) if (ind2[j] & 0x40000000)
subGroupSize += ((ind2[(size_t)j + 1] >> kNumBitsMax) << kNumExtra0Bits); subGroupSize += ((ind2[(size_t)j + 1] >> kNumBitsMax) << kNumExtra0Bits);
subGroupSize++; subGroupSize++;
for (;;) for (;;)
{ {
UInt32 original = ind2[j]; const UInt32 original = ind2[j];
UInt32 sp = original & kIndexMask; size_t sp = original & kIndexMask;
if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes; if (sp < NumSortedBytes)
ind2[j] = sp | (original & ~kIndexMask); sp += BlockSize;
Groups[sp] = group; sp -= NumSortedBytes;
ind2[j] = (UInt32)sp | (original & ~kIndexMask);
Groups[sp] = (UInt32)group;
if (--subGroupSize == 0) if (--subGroupSize == 0)
break; break;
j++; j++;
thereAreGroups = 1; thereAreGroups = 1;
} }
#else #else
UInt32 *Flags = Groups + BlockSize; UInt32 *Flags = Groups + BlockSize;
for (;;) for (;;)
{ {
UInt32 sp = ind2[j]; if (sp < NumSortedBytes) sp += BlockSize; sp -= NumSortedBytes; size_t sp = ind2[j];
ind2[j] = sp; if (sp < NumSortedBytes)
Groups[sp] = group; sp += BlockSize;
sp -= NumSortedBytes;
ind2[j] = (UInt32)sp;
Groups[sp] = (UInt32)group;
if ((Flags[(groupOffset + j) >> kNumFlagsBits] & (1 << ((groupOffset + j) & kFlagsMask))) == 0) if ((Flags[(groupOffset + j) >> kNumFlagsBits] & (1 << ((groupOffset + j) & kFlagsMask))) == 0)
break; break;
j++; j++;
thereAreGroups = 1; thereAreGroups = 1;
} }
#endif #endif
} }
return thereAreGroups; return thereAreGroups;
} }
} }
#endif #endif // BLOCK_SORT_USE_HEAP_SORT
} }
/* conditions: blockSize > 0 */ /* conditions: blockSize > 0 */
UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize) UInt32 BlockSort(UInt32 *Indices, const Byte *data, size_t blockSize)
{ {
UInt32 *counters = Indices + blockSize; UInt32 *counters = Indices + blockSize;
UInt32 i; size_t i;
UInt32 *Groups; UInt32 *Groups;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS #ifdef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 *Flags; UInt32 *Flags;
#endif #endif
/* Radix-Sort for 2 bytes */ /* Radix-Sort for 2 bytes */
// { UInt32 yyy; for (yyy = 0; yyy < 100; yyy++) {
for (i = 0; i < kNumHashValues; i++) for (i = 0; i < kNumHashValues; i++)
counters[i] = 0; counters[i] = 0;
for (i = 0; i < blockSize - 1; i++) {
counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++; const Byte *data2 = data;
counters[((UInt32)data[i] << 8) | data[0]]++; size_t a = data[(size_t)blockSize - 1];
const Byte *data_lim = data + blockSize;
if (blockSize >= 4)
{
data_lim -= 3;
do
{
size_t b;
b = data2[0]; counters[(a << 8) | b]++;
a = data2[1]; counters[(b << 8) | a]++;
b = data2[2]; counters[(a << 8) | b]++;
a = data2[3]; counters[(b << 8) | a]++;
data2 += 4;
}
while (data2 < data_lim);
data_lim += 3;
}
while (data2 != data_lim)
{
size_t b = *data2++;
counters[(a << 8) | b]++;
a = b;
}
}
// }}
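
The unrolled loop above builds a circular 2-byte histogram: every adjacent pair (data[i], data[i+1 mod blockSize]) is counted exactly once, starting from the wrap-around pair (data[blockSize-1], data[0]). A plain reference version computing the same counts (illustrative, not from the source):

/* reference: same counts as the unrolled loop in BlockSort */
static void count_pairs_ref(const unsigned char *data, size_t n,
                            unsigned counters[0x10000])
{
  size_t i;
  for (i = 0; i < 0x10000; i++)
    counters[i] = 0;
  for (i = 0; i < n; i++)
  {
    const unsigned b0 = data[i];
    const unsigned b1 = data[(i + 1 == n) ? 0 : i + 1];
    counters[(b0 << 8) | b1]++;
  }
}
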
Groups = counters + BS_TEMP_SIZE; Groups = counters + BS_TEMP_SIZE;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS #ifdef BLOCK_SORT_EXTERNAL_FLAGS
Flags = Groups + blockSize; Flags = Groups + blockSize;
{ {
UInt32 numWords = (blockSize + kFlagsMask) >> kNumFlagsBits; const size_t numWords = (blockSize + kFlagsMask) >> kNumFlagsBits;
for (i = 0; i < numWords; i++) for (i = 0; i < numWords; i++)
Flags[i] = kAllFlags; Flags[i] = kAllFlags;
} }
#endif #endif
{ {
UInt32 sum = 0; UInt32 sum = 0;
for (i = 0; i < kNumHashValues; i++) for (i = 0; i < kNumHashValues; i++)
{ {
UInt32 groupSize = counters[i]; const UInt32 groupSize = counters[i];
if (groupSize > 0) counters[i] = sum;
sum += groupSize;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS
if (groupSize)
{ {
#ifdef BLOCK_SORT_EXTERNAL_FLAGS const UInt32 t = sum - 1;
UInt32 t = sum + groupSize - 1; Flags[t >> kNumFlagsBits] &= ~((UInt32)1 << (t & kFlagsMask));
Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask));
#endif
sum += groupSize;
} }
counters[i] = sum - groupSize; #endif
}
}
for (i = 0; i < blockSize - 1; i++)
Groups[i] = counters[((unsigned)data[i] << 8) | data[(size_t)i + 1]];
Groups[i] = counters[((unsigned)data[i] << 8) | data[0]];
{
#define SET_Indices(a, b, i) \
{ UInt32 c; \
a = (a << 8) | (b); \
c = counters[a]; \
Indices[c] = (UInt32)i++; \
counters[a] = c + 1; \
} }
for (i = 0; i < blockSize - 1; i++) size_t a = data[0];
Groups[i] = counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]; const Byte *data_ptr = data + 1;
Groups[i] = counters[((UInt32)data[i] << 8) | data[0]]; i = 0;
if (blockSize >= 3)
for (i = 0; i < blockSize - 1; i++)
Indices[counters[((UInt32)data[i] << 8) | data[(size_t)i + 1]]++] = i;
Indices[counters[((UInt32)data[i] << 8) | data[0]]++] = i;
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
{ {
blockSize -= 2;
do
{
size_t b;
b = data_ptr[0]; SET_Indices(a, b, i)
a = data_ptr[1]; SET_Indices(b, a, i)
data_ptr += 2;
}
while (i < blockSize);
blockSize += 2;
}
if (i < blockSize - 1)
{
SET_Indices(a, data[(size_t)i + 1], i)
a = (Byte)a;
}
SET_Indices(a, data[0], i)
}
#ifndef BLOCK_SORT_EXTERNAL_FLAGS
{
UInt32 prev = 0; UInt32 prev = 0;
for (i = 0; i < kNumHashValues; i++) for (i = 0; i < kNumHashValues; i++)
{ {
UInt32 prevGroupSize = counters[i] - prev; const UInt32 prevGroupSize = counters[i] - prev;
if (prevGroupSize == 0) if (prevGroupSize == 0)
continue; continue;
SetGroupSize(Indices + prev, prevGroupSize); SetGroupSize(Indices + prev, prevGroupSize);
prev = counters[i]; prev = counters[i];
} }
}
#endif
} }
#endif
{ {
int NumRefBits; unsigned NumRefBits;
UInt32 NumSortedBytes; size_t NumSortedBytes;
for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++); for (NumRefBits = 0; ((blockSize - 1) >> NumRefBits) != 0; NumRefBits++)
{}
NumRefBits = 32 - NumRefBits; NumRefBits = 32 - NumRefBits;
if (NumRefBits > kNumRefBitsMax) if (NumRefBits > kNumRefBitsMax)
NumRefBits = kNumRefBitsMax; NumRefBits = kNumRefBitsMax;
for (NumSortedBytes = kNumHashBytes; ; NumSortedBytes <<= 1) for (NumSortedBytes = kNumHashBytes; ; NumSortedBytes <<= 1)
{ {
#ifndef BLOCK_SORT_EXTERNAL_FLAGS #ifndef BLOCK_SORT_EXTERNAL_FLAGS
UInt32 finishedGroupSize = 0; size_t finishedGroupSize = 0;
#endif #endif
UInt32 newLimit = 0; size_t newLimit = 0;
for (i = 0; i < blockSize;) for (i = 0; i < blockSize;)
{ {
UInt32 groupSize; size_t groupSize;
#ifdef BLOCK_SORT_EXTERNAL_FLAGS #ifdef BLOCK_SORT_EXTERNAL_FLAGS
if ((Flags[i >> kNumFlagsBits] & (1 << (i & kFlagsMask))) == 0) if ((Flags[i >> kNumFlagsBits] & (1 << (i & kFlagsMask))) == 0)
{ {
@ -440,56 +553,56 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize)
} }
for (groupSize = 1; for (groupSize = 1;
(Flags[(i + groupSize) >> kNumFlagsBits] & (1 << ((i + groupSize) & kFlagsMask))) != 0; (Flags[(i + groupSize) >> kNumFlagsBits] & (1 << ((i + groupSize) & kFlagsMask))) != 0;
groupSize++); groupSize++)
{}
groupSize++; groupSize++;
#else #else
groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax); groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax;
{ {
BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0); const BoolInt finishedGroup = ((Indices[i] & 0x80000000) == 0);
if ((Indices[i] & 0x40000000) != 0) if (Indices[i] & 0x40000000)
{
groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
Indices[(size_t)i + 1] &= kIndexMask;
}
Indices[i] &= kIndexMask;
groupSize++;
if (finishedGroup || groupSize == 1)
{
Indices[i - finishedGroupSize] &= kIndexMask;
if (finishedGroupSize > 1)
Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask;
{ {
UInt32 newGroupSize = groupSize + finishedGroupSize; groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits);
SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize); Indices[(size_t)i + 1] &= kIndexMask;
finishedGroupSize = newGroupSize;
} }
i += groupSize; Indices[i] &= kIndexMask;
continue; groupSize++;
} if (finishedGroup || groupSize == 1)
finishedGroupSize = 0; {
Indices[i - finishedGroupSize] &= kIndexMask;
if (finishedGroupSize > 1)
Indices[(size_t)(i - finishedGroupSize) + 1] &= kIndexMask;
{
const size_t newGroupSize = groupSize + finishedGroupSize;
SetFinishedGroupSize(Indices + i - finishedGroupSize, newGroupSize)
finishedGroupSize = newGroupSize;
}
i += groupSize;
continue;
}
finishedGroupSize = 0;
} }
#endif #endif
if (NumSortedBytes >= blockSize) if (NumSortedBytes >= blockSize)
{ {
UInt32 j; size_t j;
for (j = 0; j < groupSize; j++) for (j = 0; j < groupSize; j++)
{ {
UInt32 t = (i + j); size_t t = i + j;
/* Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); */ /* Flags[t >> kNumFlagsBits] &= ~(1 << (t & kFlagsMask)); */
Groups[Indices[t]] = t; Groups[Indices[t]] = (UInt32)t;
} }
} }
else else
if (SortGroup(blockSize, NumSortedBytes, i, groupSize, NumRefBits, Indices if (SortGroup(blockSize, NumSortedBytes, i, groupSize, NumRefBits, Indices
#ifndef BLOCK_SORT_USE_HEAP_SORT #ifndef BLOCK_SORT_USE_HEAP_SORT
, 0, blockSize , 0, blockSize
#endif #endif
) != 0) ))
newLimit = i + groupSize; newLimit = i + groupSize;
i += groupSize; i += groupSize;
} }
@ -497,19 +610,19 @@ UInt32 BlockSort(UInt32 *Indices, const Byte *data, UInt32 blockSize)
break; break;
} }
} }
#ifndef BLOCK_SORT_EXTERNAL_FLAGS #ifndef BLOCK_SORT_EXTERNAL_FLAGS
for (i = 0; i < blockSize;) for (i = 0; i < blockSize;)
{ {
UInt32 groupSize = ((Indices[i] & ~0xC0000000) >> kNumBitsMax); size_t groupSize = (Indices[i] & ~0xC0000000) >> kNumBitsMax;
if ((Indices[i] & 0x40000000) != 0) if (Indices[i] & 0x40000000)
{ {
groupSize += ((Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits); groupSize += (Indices[(size_t)i + 1] >> kNumBitsMax) << kNumExtra0Bits;
Indices[(size_t)i + 1] &= kIndexMask; Indices[(size_t)i + 1] &= kIndexMask;
} }
Indices[i] &= kIndexMask; Indices[i] &= kIndexMask;
groupSize++; groupSize++;
i += groupSize; i += groupSize;
} }
#endif #endif
return Groups[0]; return Groups[0];
} }

C/BwtSort.h
@ -1,8 +1,8 @@
/* BwtSort.h -- BWT block sorting /* BwtSort.h -- BWT block sorting
2013-01-18 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#ifndef __BWT_SORT_H #ifndef ZIP7_INC_BWT_SORT_H
#define __BWT_SORT_H #define ZIP7_INC_BWT_SORT_H
#include "7zTypes.h" #include "7zTypes.h"
@ -10,16 +10,17 @@ EXTERN_C_BEGIN
/* use BLOCK_SORT_EXTERNAL_FLAGS if blockSize can be > 1M */ /* use BLOCK_SORT_EXTERNAL_FLAGS if blockSize can be > 1M */
/* #define BLOCK_SORT_EXTERNAL_FLAGS */ /* #define BLOCK_SORT_EXTERNAL_FLAGS */
// #define BLOCK_SORT_EXTERNAL_FLAGS
#ifdef BLOCK_SORT_EXTERNAL_FLAGS #ifdef BLOCK_SORT_EXTERNAL_FLAGS
#define BLOCK_SORT_EXTERNAL_SIZE(blockSize) ((((blockSize) + 31) >> 5)) #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) (((blockSize) + 31) >> 5)
#else #else
#define BLOCK_SORT_EXTERNAL_SIZE(blockSize) 0 #define BLOCK_SORT_EXTERNAL_SIZE(blockSize) 0
#endif #endif
#define BLOCK_SORT_BUF_SIZE(blockSize) ((blockSize) * 2 + BLOCK_SORT_EXTERNAL_SIZE(blockSize) + (1 << 16)) #define BLOCK_SORT_BUF_SIZE(blockSize) ((blockSize) * 2 + BLOCK_SORT_EXTERNAL_SIZE(blockSize) + (1 << 16))
UInt32 BlockSort(UInt32 *indices, const Byte *data, UInt32 blockSize); UInt32 BlockSort(UInt32 *indices, const Byte *data, size_t blockSize);
EXTERN_C_END EXTERN_C_END

C/Compiler.h
@ -1,12 +1,105 @@
/* Compiler.h /* Compiler.h : Compiler specific defines and pragmas
2021-01-05 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#ifndef __7Z_COMPILER_H #ifndef ZIP7_INC_COMPILER_H
#define __7Z_COMPILER_H #define ZIP7_INC_COMPILER_H
#ifdef __clang__ #if defined(__clang__)
#pragma clang diagnostic ignored "-Wunused-private-field" # define Z7_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
#endif
#if defined(__clang__) && defined(__apple_build_version__)
# define Z7_APPLE_CLANG_VERSION Z7_CLANG_VERSION
#elif defined(__clang__)
# define Z7_LLVM_CLANG_VERSION Z7_CLANG_VERSION
#elif defined(__GNUC__)
# define Z7_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#endif
#ifdef _MSC_VER
#if !defined(__clang__) && !defined(__GNUC__)
#define Z7_MSC_VER_ORIGINAL _MSC_VER
#endif
#endif
#if defined(__MINGW32__) || defined(__MINGW64__)
#define Z7_MINGW
#endif
#if defined(__LCC__) && (defined(__MCST__) || defined(__e2k__))
#define Z7_MCST_LCC
#define Z7_MCST_LCC_VERSION (__LCC__ * 100 + __LCC_MINOR__)
#endif
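
A hedged example of how the version macros above are meant to be consumed: gating a feature on genuine MSVC versus clang (both may define _MSC_VER, but only real MSVC gets Z7_MSC_VER_ORIGINAL):

#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920)
  /* genuine MSVC 2019+ only: clang-cl does not define Z7_MSC_VER_ORIGINAL */
#elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 90000)
  /* non-Apple clang 9+ (encoding: major*10000 + minor*100 + patch) */
#endif
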
/*
#if defined(__AVX2__) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 40600) \
|| defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30100) \
|| defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1800) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1400)
#define Z7_COMPILER_AVX2_SUPPORTED
#endif #endif
#endif
*/
// #pragma GCC diagnostic ignored "-Wunknown-pragmas"
#ifdef __clang__
// padding size of '' with 4 bytes to alignment boundary
#pragma GCC diagnostic ignored "-Wpadded"
#if defined(Z7_LLVM_CLANG_VERSION) && (__clang_major__ == 13) \
&& defined(__FreeBSD__)
// freebsd:
#pragma GCC diagnostic ignored "-Wexcess-padding"
#endif
#if __clang_major__ >= 16
#pragma GCC diagnostic ignored "-Wunsafe-buffer-usage"
#endif
#if __clang_major__ == 13
#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
// cheri
#pragma GCC diagnostic ignored "-Wcapability-to-integer-cast"
#endif
#endif
#if __clang_major__ == 13
// for <arm_neon.h>
#pragma GCC diagnostic ignored "-Wreserved-identifier"
#endif
#endif // __clang__
#if defined(_WIN32) && defined(__clang__) && __clang_major__ >= 16
// #pragma GCC diagnostic ignored "-Wcast-function-type-strict"
#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION \
_Pragma("GCC diagnostic ignored \"-Wcast-function-type-strict\"")
#else
#define Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
#endif
typedef void (*Z7_void_Function)(void);
#if defined(__clang__) || defined(__GNUC__)
#define Z7_CAST_FUNC_C (Z7_void_Function)
#elif defined(_MSC_VER) && _MSC_VER > 1920
#define Z7_CAST_FUNC_C (void *)
// #pragma warning(disable : 4191) // 'type cast': unsafe conversion from 'FARPROC' to 'void (__cdecl *)()'
#else
#define Z7_CAST_FUNC_C
#endif
/*
#if (defined(__GNUC__) && (__GNUC__ >= 8)) || defined(__clang__)
// #pragma GCC diagnostic ignored "-Wcast-function-type"
#endif
*/
#ifdef __GNUC__
#if defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40000) && (Z7_GCC_VERSION < 70000)
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#endif
#endif
#ifdef _MSC_VER #ifdef _MSC_VER
@ -17,24 +110,134 @@
#pragma warning(disable : 4214) // nonstandard extension used : bit field types other than int #pragma warning(disable : 4214) // nonstandard extension used : bit field types other than int
#endif #endif
#if _MSC_VER >= 1300 #if defined(_MSC_VER) && _MSC_VER >= 1800
#pragma warning(disable : 4996) // This function or variable may be unsafe #pragma warning(disable : 4464) // relative include path contains '..'
#else #endif
#pragma warning(disable : 4511) // copy constructor could not be generated
#pragma warning(disable : 4512) // assignment operator could not be generated
#pragma warning(disable : 4514) // unreferenced inline function has been removed
#pragma warning(disable : 4702) // unreachable code
#pragma warning(disable : 4710) // not inlined
#pragma warning(disable : 4714) // function marked as __forceinline not inlined
#pragma warning(disable : 4786) // identifier was truncated to '255' characters in the debug information
#endif
#ifdef __clang__ // == 1200 : -O1 : for __forceinline
#pragma clang diagnostic ignored "-Wdeprecated-declarations" // >= 1900 : -O1 : for printf
#pragma clang diagnostic ignored "-Wmicrosoft-exception-spec" #pragma warning(disable : 4710) // function not inlined
// #pragma clang diagnostic ignored "-Wreserved-id-macro"
#endif
#if _MSC_VER < 1900
// winnt.h: 'Int64ShllMod32'
#pragma warning(disable : 4514) // unreferenced inline function has been removed
#endif
#if _MSC_VER < 1300
// #pragma warning(disable : 4702) // unreachable code
// Bra.c : -O1:
#pragma warning(disable : 4714) // function marked as __forceinline not inlined
#endif
/*
#if _MSC_VER > 1400 && _MSC_VER <= 1900
// strcat: This function or variable may be unsafe
// sysinfoapi.h: kit10: GetVersion was declared deprecated
#pragma warning(disable : 4996)
#endif
*/
#if _MSC_VER > 1200
// -Wall warnings
#pragma warning(disable : 4711) // function selected for automatic inline expansion
#pragma warning(disable : 4820) // '2' bytes padding added after data member
#if _MSC_VER >= 1400 && _MSC_VER < 1920
// 1400: string.h: _DBG_MEMCPY_INLINE_
// 1600 - 191x : smmintrin.h __cplusplus'
// is not defined as a preprocessor macro, replacing with '0' for '#if/#elif'
#pragma warning(disable : 4668)
// 1400 - 1600 : WinDef.h : 'FARPROC' :
// 1900 - 191x : immintrin.h: _readfsbase_u32
// no function prototype given : converting '()' to '(void)'
#pragma warning(disable : 4255)
#endif
#if _MSC_VER >= 1914
// Compiler will insert Spectre mitigation for memory load if /Qspectre switch specified
#pragma warning(disable : 5045)
#endif
#endif // _MSC_VER > 1200
#endif // _MSC_VER
#if defined(__clang__) && (__clang_major__ >= 4)
#define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
_Pragma("clang loop unroll(disable)") \
_Pragma("clang loop vectorize(disable)")
#define Z7_ATTRIB_NO_VECTORIZE
#elif defined(__GNUC__) && (__GNUC__ >= 5) \
&& (!defined(Z7_MCST_LCC_VERSION) || (Z7_MCST_LCC_VERSION >= 12610))
#define Z7_ATTRIB_NO_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
// __attribute__((optimize("no-unroll-loops")));
#define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
#elif defined(_MSC_VER) && (_MSC_VER >= 1920)
#define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
_Pragma("loop( no_vector )")
#define Z7_ATTRIB_NO_VECTORIZE
#else
#define Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
#define Z7_ATTRIB_NO_VECTORIZE
#endif
#if defined(Z7_MSC_VER_ORIGINAL) && (Z7_MSC_VER_ORIGINAL >= 1920)
#define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE _Pragma("optimize ( \"s\", on )")
#define Z7_PRAGMA_OPTIMIZE_DEFAULT _Pragma("optimize ( \"\", on )")
#else
#define Z7_PRAGMA_OPTIMIZE_FOR_CODE_SIZE
#define Z7_PRAGMA_OPTIMIZE_DEFAULT
#endif
#if defined(MY_CPU_X86_OR_AMD64) && ( \
defined(__clang__) && (__clang_major__ >= 4) \
|| defined(__GNUC__) && (__GNUC__ >= 5))
#define Z7_ATTRIB_NO_SSE __attribute__((__target__("no-sse")))
#else
#define Z7_ATTRIB_NO_SSE
#endif
#define Z7_ATTRIB_NO_VECTOR \
Z7_ATTRIB_NO_VECTORIZE \
Z7_ATTRIB_NO_SSE
#if defined(__clang__) && (__clang_major__ >= 8) \
|| defined(__GNUC__) && (__GNUC__ >= 1000) \
/* || defined(_MSC_VER) && (_MSC_VER >= 1920) */
// GCC is not good for __builtin_expect()
#define Z7_LIKELY(x) (__builtin_expect((x), 1))
#define Z7_UNLIKELY(x) (__builtin_expect((x), 0))
// #define Z7_unlikely [[unlikely]]
// #define Z7_likely [[likely]]
#else
#define Z7_LIKELY(x) (x)
#define Z7_UNLIKELY(x) (x)
// #define Z7_likely
#endif
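
A hedged usage sketch for the branch-hint macros above (they compile away to the plain expression when the builtin is unavailable):

int parse_len(unsigned v)
{
  if (Z7_UNLIKELY(v == 0))
    return -1;            /* cold error path, hinted unlikely */
  return (int)(v & 0xff); /* hot path */
}
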
#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30600))
#if (Z7_CLANG_VERSION < 130000)
#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wreserved-id-macro\"")
#else
#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER \
_Pragma("GCC diagnostic push") \
_Pragma("GCC diagnostic ignored \"-Wreserved-macro-identifier\"")
#endif
#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER \
_Pragma("GCC diagnostic pop")
#else
#define Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif #endif
#define UNUSED_VAR(x) (void)x; #define UNUSED_VAR(x) (void)x;

(File diff suppressed because it is too large)

C/CpuArch.h
@ -1,8 +1,8 @@
/* CpuArch.h -- CPU specific code /* CpuArch.h -- CPU specific code
2021-07-13 : Igor Pavlov : Public domain */ Igor Pavlov : Public domain */
#ifndef __CPU_ARCH_H #ifndef ZIP7_INC_CPU_ARCH_H
#define __CPU_ARCH_H #define ZIP7_INC_CPU_ARCH_H
#include "7zTypes.h" #include "7zTypes.h"
@ -20,6 +20,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8) MY_CPU_64BIT doesn't mean that (sizeof(void *) == 8)
*/ */
#if !defined(_M_ARM64EC)
#if defined(_M_X64) \ #if defined(_M_X64) \
|| defined(_M_AMD64) \ || defined(_M_AMD64) \
|| defined(__x86_64__) \ || defined(__x86_64__) \
@ -35,6 +36,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#endif #endif
#define MY_CPU_64BIT #define MY_CPU_64BIT
#endif #endif
#endif
#if defined(_M_IX86) \ #if defined(_M_IX86) \
@ -45,13 +47,34 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#define MY_CPU_SIZEOF_POINTER 4 #define MY_CPU_SIZEOF_POINTER 4
#endif #endif
#if defined(__SSE2__) \
|| defined(MY_CPU_AMD64) \
|| defined(_M_IX86_FP) && (_M_IX86_FP >= 2)
#define MY_CPU_SSE2
#endif
#if defined(_M_ARM64) \ #if defined(_M_ARM64) \
|| defined(_M_ARM64EC) \
|| defined(__AARCH64EL__) \ || defined(__AARCH64EL__) \
|| defined(__AARCH64EB__) \ || defined(__AARCH64EB__) \
|| defined(__aarch64__) || defined(__aarch64__)
#define MY_CPU_ARM64 #define MY_CPU_ARM64
#define MY_CPU_NAME "arm64" #if defined(__ILP32__) \
|| defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
#define MY_CPU_NAME "arm64-32"
#define MY_CPU_SIZEOF_POINTER 4
#elif defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 16)
#define MY_CPU_NAME "arm64-128"
#define MY_CPU_SIZEOF_POINTER 16
#else
#if defined(_M_ARM64EC)
#define MY_CPU_NAME "arm64ec"
#else
#define MY_CPU_NAME "arm64"
#endif
#define MY_CPU_SIZEOF_POINTER 8
#endif
#define MY_CPU_64BIT #define MY_CPU_64BIT
#endif #endif
@ -68,8 +91,10 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#define MY_CPU_ARM #define MY_CPU_ARM
#if defined(__thumb__) || defined(__THUMBEL__) || defined(_M_ARMT) #if defined(__thumb__) || defined(__THUMBEL__) || defined(_M_ARMT)
#define MY_CPU_ARMT
#define MY_CPU_NAME "armt" #define MY_CPU_NAME "armt"
#else #else
#define MY_CPU_ARM32
#define MY_CPU_NAME "arm" #define MY_CPU_NAME "arm"
#endif #endif
/* #define MY_CPU_32BIT */ /* #define MY_CPU_32BIT */
@ -103,6 +128,8 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
|| defined(__PPC__) \ || defined(__PPC__) \
|| defined(_POWER) || defined(_POWER)
#define MY_CPU_PPC_OR_PPC64
#if defined(__ppc64__) \ #if defined(__ppc64__) \
|| defined(__powerpc64__) \ || defined(__powerpc64__) \
|| defined(_LP64) \ || defined(_LP64) \
@ -123,12 +150,76 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#endif #endif
#if defined(__sparc64__) #if defined(__sparc__) \
#define MY_CPU_NAME "sparc64" || defined(__sparc)
#define MY_CPU_SPARC
#if defined(__LP64__) \
|| defined(_LP64) \
|| defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
#define MY_CPU_NAME "sparcv9"
#define MY_CPU_SIZEOF_POINTER 8
#define MY_CPU_64BIT
#elif defined(__sparc_v9__) \
|| defined(__sparcv9)
#define MY_CPU_64BIT
#if defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
#define MY_CPU_NAME "sparcv9-32"
#else
#define MY_CPU_NAME "sparcv9m"
#endif
#elif defined(__sparc_v8__) \
|| defined(__sparcv8)
#define MY_CPU_NAME "sparcv8"
#define MY_CPU_SIZEOF_POINTER 4
#else
#define MY_CPU_NAME "sparc"
#endif
#endif
#if defined(__riscv) \
|| defined(__riscv__)
#define MY_CPU_RISCV
#if __riscv_xlen == 32
#define MY_CPU_NAME "riscv32"
#elif __riscv_xlen == 64
#define MY_CPU_NAME "riscv64"
#else
#define MY_CPU_NAME "riscv"
#endif
#endif
#if defined(__loongarch__)
#define MY_CPU_LOONGARCH
#if defined(__loongarch64) || defined(__loongarch_grlen) && (__loongarch_grlen == 64)
#define MY_CPU_64BIT
#endif
#if defined(__loongarch64)
#define MY_CPU_NAME "loongarch64"
#define MY_CPU_LOONGARCH64
#else
#define MY_CPU_NAME "loongarch"
#endif
#endif
// #undef MY_CPU_NAME
// #undef MY_CPU_SIZEOF_POINTER
// #define __e2k__
// #define __SIZEOF_POINTER__ 4
#if defined(__e2k__)
#define MY_CPU_E2K
#if defined(__ILP32__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 4)
#define MY_CPU_NAME "e2k-32"
#define MY_CPU_SIZEOF_POINTER 4
#else
#define MY_CPU_NAME "e2k"
#if defined(__LP64__) || defined(__SIZEOF_POINTER__) && (__SIZEOF_POINTER__ == 8)
#define MY_CPU_SIZEOF_POINTER 8
#endif
#endif
#define MY_CPU_64BIT #define MY_CPU_64BIT
#elif defined(__sparc__)
#define MY_CPU_NAME "sparc"
/* #define MY_CPU_32BIT */
#endif #endif
@ -162,6 +253,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
|| defined(MY_CPU_ARM_LE) \ || defined(MY_CPU_ARM_LE) \
|| defined(MY_CPU_ARM64_LE) \ || defined(MY_CPU_ARM64_LE) \
|| defined(MY_CPU_IA64_LE) \ || defined(MY_CPU_IA64_LE) \
|| defined(_LITTLE_ENDIAN) \
|| defined(__LITTLE_ENDIAN__) \ || defined(__LITTLE_ENDIAN__) \
|| defined(__ARMEL__) \ || defined(__ARMEL__) \
|| defined(__THUMBEL__) \ || defined(__THUMBEL__) \
@ -194,6 +286,9 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#error Stop_Compiling_Bad_Endian #error Stop_Compiling_Bad_Endian
#endif #endif
#if !defined(MY_CPU_LE) && !defined(MY_CPU_BE)
#error Stop_Compiling_CPU_ENDIAN_must_be_detected_at_compile_time
#endif
#if defined(MY_CPU_32BIT) && defined(MY_CPU_64BIT) #if defined(MY_CPU_32BIT) && defined(MY_CPU_64BIT)
#error Stop_Compiling_Bad_32_64_BIT #error Stop_Compiling_Bad_32_64_BIT
@ -235,6 +330,7 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#ifndef MY_CPU_NAME #ifndef MY_CPU_NAME
// #define MY_CPU_IS_UNKNOWN
#ifdef MY_CPU_LE #ifdef MY_CPU_LE
#define MY_CPU_NAME "LE" #define MY_CPU_NAME "LE"
#elif defined(MY_CPU_BE) #elif defined(MY_CPU_BE)
@ -250,15 +346,121 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#ifdef __has_builtin
#define Z7_has_builtin(x) __has_builtin(x)
#else
#define Z7_has_builtin(x) 0
#endif
#define Z7_BSWAP32_CONST(v) \
( (((UInt32)(v) << 24) ) \
| (((UInt32)(v) << 8) & (UInt32)0xff0000) \
| (((UInt32)(v) >> 8) & (UInt32)0xff00 ) \
| (((UInt32)(v) >> 24) ))
#if defined(_MSC_VER) && (_MSC_VER >= 1300)
#include <stdlib.h>
/* Note: these macros will use bswap instruction (486), that is unsupported in 386 cpu */
#pragma intrinsic(_byteswap_ushort)
#pragma intrinsic(_byteswap_ulong)
#pragma intrinsic(_byteswap_uint64)
#define Z7_BSWAP16(v) _byteswap_ushort(v)
#define Z7_BSWAP32(v) _byteswap_ulong (v)
#define Z7_BSWAP64(v) _byteswap_uint64(v)
#define Z7_CPU_FAST_BSWAP_SUPPORTED
/* GCC can generate slow code that calls function for __builtin_bswap32() for:
- GCC for RISCV, if Zbb/XTHeadBb extension is not used.
- GCC for SPARC.
The code generated by CLANG for SPARC is also not the fastest.
So we don't define Z7_CPU_FAST_BSWAP_SUPPORTED in some cases.
*/
#elif (!defined(MY_CPU_RISCV) || defined (__riscv_zbb) || defined(__riscv_xtheadbb)) \
&& !defined(MY_CPU_SPARC) \
&& ( \
(defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \
|| (defined(__clang__) && Z7_has_builtin(__builtin_bswap16)) \
)
#define Z7_BSWAP16(v) __builtin_bswap16(v)
#define Z7_BSWAP32(v) __builtin_bswap32(v)
#define Z7_BSWAP64(v) __builtin_bswap64(v)
#define Z7_CPU_FAST_BSWAP_SUPPORTED
#else
#define Z7_BSWAP16(v) ((UInt16) \
( ((UInt32)(v) << 8) \
| ((UInt32)(v) >> 8) \
))
#define Z7_BSWAP32(v) Z7_BSWAP32_CONST(v)
#define Z7_BSWAP64(v) \
( ( ( (UInt64)(v) ) << 8 * 7 ) \
| ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 1) ) << 8 * 5 ) \
| ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 2) ) << 8 * 3 ) \
| ( ( (UInt64)(v) & ((UInt32)0xff << 8 * 3) ) << 8 * 1 ) \
| ( ( (UInt64)(v) >> 8 * 1 ) & ((UInt32)0xff << 8 * 3) ) \
| ( ( (UInt64)(v) >> 8 * 3 ) & ((UInt32)0xff << 8 * 2) ) \
| ( ( (UInt64)(v) >> 8 * 5 ) & ((UInt32)0xff << 8 * 1) ) \
| ( ( (UInt64)(v) >> 8 * 7 ) ) \
)
#endif
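
A quick sanity check for the byte-swap macros above (Z7_BSWAP32_CONST is pure shift/mask arithmetic, so it constant-folds at compile time):

#include <stdio.h>
int bswap_demo(void)
{
  unsigned x = Z7_BSWAP32_CONST(0x12345678);
  printf("%08x\n", x); /* prints 78563412 */
  return x == 0x78563412u;
}
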
#ifdef MY_CPU_LE #ifdef MY_CPU_LE
#if defined(MY_CPU_X86_OR_AMD64) \ #if defined(MY_CPU_X86_OR_AMD64) \
|| defined(MY_CPU_ARM64) || defined(MY_CPU_ARM64) \
|| defined(MY_CPU_RISCV) && defined(__riscv_misaligned_fast) \
|| defined(MY_CPU_E2K) && defined(__iset__) && (__iset__ >= 6)
#define MY_CPU_LE_UNALIGN #define MY_CPU_LE_UNALIGN
#define MY_CPU_LE_UNALIGN_64 #define MY_CPU_LE_UNALIGN_64
#elif defined(__ARM_FEATURE_UNALIGNED) #elif defined(__ARM_FEATURE_UNALIGNED)
/* gcc9 for 32-bit arm can use LDRD instruction that requires 32-bit alignment. /* === ALIGNMENT on 32-bit arm and LDRD/STRD/LDM/STM instructions.
So we can't use unaligned 64-bit operations. */ Description of problems:
#define MY_CPU_LE_UNALIGN problem-1 : 32-bit ARM architecture:
multi-access (pair of 32-bit accesses) instructions (LDRD/STRD/LDM/STM)
require 32-bit (WORD) alignment (by 32-bit ARM architecture).
So there is "Alignment fault exception", if data is not aligned for 32-bit.
problem-2 : 32-bit kernels and arm64 kernels:
32-bit linux kernels provide fixup for these "paired" instruction "Alignment fault exception".
So unaligned paired-access instructions work via exception handler in kernel in 32-bit linux.
But some arm64 kernels do not handle these faults in 32-bit programs.
So we have unhandled exception for such instructions.
Probably some new arm64 kernels have fixed it, and unaligned
paired-access instructions work in new kernels?
problem-3 : compiler for 32-bit arm:
Compilers use LDRD/STRD/LDM/STM for UInt64 accesses
and for another cases where two 32-bit accesses are fused
to one multi-access instruction.
So UInt64 variables must be aligned for 32-bit, and each
32-bit access must be aligned for 32-bit, if we want to
avoid "Alignment fault" exception (handled or unhandled).
problem-4 : performance:
Even if unaligned access is handled by kernel, it will be slow.
So if we allow unaligned access, we can get fast unaligned
single-access, and slow unaligned paired-access.
We don't allow unaligned access on 32-bit arm, because compiler
generates paired-access instructions that require 32-bit alignment,
and some arm64 kernels have no handler for these instructions.
Also unaligned paired-access instructions will be slow, if kernel handles them.
*/
// it must be disabled:
// #define MY_CPU_LE_UNALIGN
#endif #endif
#endif #endif
@ -269,13 +471,11 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#define GetUi32(p) (*(const UInt32 *)(const void *)(p)) #define GetUi32(p) (*(const UInt32 *)(const void *)(p))
#ifdef MY_CPU_LE_UNALIGN_64 #ifdef MY_CPU_LE_UNALIGN_64
#define GetUi64(p) (*(const UInt64 *)(const void *)(p)) #define GetUi64(p) (*(const UInt64 *)(const void *)(p))
#define SetUi64(p, v) { *(UInt64 *)(void *)(p) = (v); }
#endif #endif
#define SetUi16(p, v) { *(UInt16 *)(void *)(p) = (v); } #define SetUi16(p, v) { *(UInt16 *)(void *)(p) = (v); }
#define SetUi32(p, v) { *(UInt32 *)(void *)(p) = (v); } #define SetUi32(p, v) { *(UInt32 *)(void *)(p) = (v); }
#ifdef MY_CPU_LE_UNALIGN_64
#define SetUi64(p, v) { *(UInt64 *)(void *)(p) = (v); }
#endif
#else #else
@ -302,50 +502,33 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#endif #endif
#ifndef MY_CPU_LE_UNALIGN_64 #ifndef GetUi64
#define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32)) #define GetUi64(p) (GetUi32(p) | ((UInt64)GetUi32(((const Byte *)(p)) + 4) << 32))
#endif
#ifndef SetUi64
#define SetUi64(p, v) { Byte *_ppp2_ = (Byte *)(p); UInt64 _vvv2_ = (v); \ #define SetUi64(p, v) { Byte *_ppp2_ = (Byte *)(p); UInt64 _vvv2_ = (v); \
SetUi32(_ppp2_ , (UInt32)_vvv2_); \ SetUi32(_ppp2_ , (UInt32)_vvv2_) \
SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)); } SetUi32(_ppp2_ + 4, (UInt32)(_vvv2_ >> 32)) }
#endif #endif
#if defined(MY_CPU_LE_UNALIGN) && defined(Z7_CPU_FAST_BSWAP_SUPPORTED)
#if 0
#ifdef __has_builtin // Z7_BSWAP16 can be slow for x86-msvc
#define MY__has_builtin(x) __has_builtin(x) #define GetBe16_to32(p) (Z7_BSWAP16 (*(const UInt16 *)(const void *)(p)))
#else #else
#define MY__has_builtin(x) 0 #define GetBe16_to32(p) (Z7_BSWAP32 (*(const UInt16 *)(const void *)(p)) >> 16)
#endif #endif
#if defined(MY_CPU_LE_UNALIGN) && /* defined(_WIN64) && */ defined(_MSC_VER) && (_MSC_VER >= 1300) #define GetBe32(p) Z7_BSWAP32 (*(const UInt32 *)(const void *)(p))
#define SetBe32(p, v) { (*(UInt32 *)(void *)(p)) = Z7_BSWAP32(v); }
/* Note: we use bswap instruction, that is unsupported in 386 cpu */ #if defined(MY_CPU_LE_UNALIGN_64)
#define GetBe64(p) Z7_BSWAP64 (*(const UInt64 *)(const void *)(p))
#include <stdlib.h> #define SetBe64(p, v) { (*(UInt64 *)(void *)(p)) = Z7_BSWAP64(v); }
#endif
#pragma intrinsic(_byteswap_ushort)
#pragma intrinsic(_byteswap_ulong)
#pragma intrinsic(_byteswap_uint64)
/* #define GetBe16(p) _byteswap_ushort(*(const UInt16 *)(const Byte *)(p)) */
#define GetBe32(p) _byteswap_ulong (*(const UInt32 *)(const void *)(p))
#define GetBe64(p) _byteswap_uint64(*(const UInt64 *)(const void *)(p))
#define SetBe32(p, v) (*(UInt32 *)(void *)(p)) = _byteswap_ulong(v)
#elif defined(MY_CPU_LE_UNALIGN) && ( \
(defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \
|| (defined(__clang__) && MY__has_builtin(__builtin_bswap16)) )
/* #define GetBe16(p) __builtin_bswap16(*(const UInt16 *)(const void *)(p)) */
#define GetBe32(p) __builtin_bswap32(*(const UInt32 *)(const void *)(p))
#define GetBe64(p) __builtin_bswap64(*(const UInt64 *)(const void *)(p))
#define SetBe32(p, v) (*(UInt32 *)(void *)(p)) = __builtin_bswap32(v)
#else #else
@ -355,8 +538,6 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
((UInt32)((const Byte *)(p))[2] << 8) | \ ((UInt32)((const Byte *)(p))[2] << 8) | \
((const Byte *)(p))[3] ) ((const Byte *)(p))[3] )
#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4))
#define SetBe32(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \ #define SetBe32(p, v) { Byte *_ppp_ = (Byte *)(p); UInt32 _vvv_ = (v); \
_ppp_[0] = (Byte)(_vvv_ >> 24); \ _ppp_[0] = (Byte)(_vvv_ >> 24); \
_ppp_[1] = (Byte)(_vvv_ >> 16); \ _ppp_[1] = (Byte)(_vvv_ >> 16); \
@ -365,53 +546,115 @@ MY_CPU_64BIT means that processor can work with 64-bit registers.
#endif #endif
#ifndef GetBe64
#define GetBe64(p) (((UInt64)GetBe32(p) << 32) | GetBe32(((const Byte *)(p)) + 4))
#endif
#ifndef SetBe64
#define SetBe64(p, v) { Byte *_ppp_ = (Byte *)(p); UInt64 _vvv_ = (v); \
_ppp_[0] = (Byte)(_vvv_ >> 56); \
_ppp_[1] = (Byte)(_vvv_ >> 48); \
_ppp_[2] = (Byte)(_vvv_ >> 40); \
_ppp_[3] = (Byte)(_vvv_ >> 32); \
_ppp_[4] = (Byte)(_vvv_ >> 24); \
_ppp_[5] = (Byte)(_vvv_ >> 16); \
_ppp_[6] = (Byte)(_vvv_ >> 8); \
_ppp_[7] = (Byte)_vvv_; }
#endif
#ifndef GetBe16 #ifndef GetBe16
#ifdef GetBe16_to32
#define GetBe16(p) ( (UInt16) GetBe16_to32(p))
#else
#define GetBe16(p) ( (UInt16) ( \ #define GetBe16(p) ( (UInt16) ( \
((UInt16)((const Byte *)(p))[0] << 8) | \ ((UInt16)((const Byte *)(p))[0] << 8) | \
((const Byte *)(p))[1] )) ((const Byte *)(p))[1] ))
#endif
#endif #endif
#if defined(MY_CPU_BE)
#define Z7_CONV_BE_TO_NATIVE_CONST32(v) (v)
#define Z7_CONV_LE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
#define Z7_CONV_NATIVE_TO_BE_32(v) (v)
// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b1) | ((b0) << 8))
#elif defined(MY_CPU_LE)
#define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)
#define Z7_CONV_LE_TO_NATIVE_CONST32(v) (v)
#define Z7_CONV_NATIVE_TO_BE_32(v) Z7_BSWAP32(v)
// #define Z7_GET_NATIVE16_FROM_2_BYTES(b0, b1) ((b0) | ((b1) << 8))
#else
#error Stop_Compiling_Unknown_Endian_CONV
#endif
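A hedged, stand-alone sketch of what these conversion constants are for: comparing a native 32-bit load against a big-endian magic number, with the byte swap folded into a compile-time constant (little-endian case shown; the PNG signature is purely illustrative, and Z7_BSWAP32_CONST is re-derived here rather than taken from the header):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define Z7_BSWAP32_CONST(v) \
  ( (((uint32_t)(v) >> 24) & 0xff)    \
  | (((uint32_t)(v) >>  8) & 0xff00)  \
  | (((uint32_t)(v) & 0xff00) << 8)   \
  | ((uint32_t)(v) << 24) )
#define Z7_CONV_BE_TO_NATIVE_CONST32(v) Z7_BSWAP32_CONST(v)  /* LE branch */

int main(void)
{
  const unsigned char sig[4] = { 0x89, 0x50, 0x4E, 0x47 };  /* "\x89PNG" */
  uint32_t v;
  memcpy(&v, sig, 4);                 /* native little-endian load */
  printf("%s\n", v == Z7_CONV_BE_TO_NATIVE_CONST32(0x89504E47u)
      ? "match" : "no match");        /* prints: match */
  return 0;
}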
#if defined(MY_CPU_BE)
#define GetBe64a(p) (*(const UInt64 *)(const void *)(p))
#define GetBe32a(p) (*(const UInt32 *)(const void *)(p))
#define GetBe16a(p) (*(const UInt16 *)(const void *)(p))
#define SetBe32a(p, v) { *(UInt32 *)(void *)(p) = (v); }
#define SetBe16a(p, v) { *(UInt16 *)(void *)(p) = (v); }
#define GetUi64a(p) GetUi64(p)
#define GetUi32a(p) GetUi32(p)
#define GetUi16a(p) GetUi16(p)
#define SetUi32a(p, v) SetUi32(p, v)
#define SetUi16a(p, v) SetUi16(p, v)
#elif defined(MY_CPU_LE)
#define GetUi64a(p) (*(const UInt64 *)(const void *)(p))
#define GetUi32a(p) (*(const UInt32 *)(const void *)(p))
#define GetUi16a(p) (*(const UInt16 *)(const void *)(p))
#define SetUi32a(p, v) { *(UInt32 *)(void *)(p) = (v); }
#define SetUi16a(p, v) { *(UInt16 *)(void *)(p) = (v); }
#define GetBe64a(p) GetBe64(p)
#define GetBe32a(p) GetBe32(p)
#define GetBe16a(p) GetBe16(p)
#define SetBe32a(p, v) SetBe32(p, v)
#define SetBe16a(p, v) SetBe16(p, v)
#else
#error Stop_Compiling_Unknown_Endian_CPU_a
#endif
#ifndef GetBe16_to32
#define GetBe16_to32(p) GetBe16(p)
#endif
#if defined(MY_CPU_X86_OR_AMD64) \
|| defined(MY_CPU_ARM_OR_ARM64) \
|| defined(MY_CPU_PPC_OR_PPC64)
#define Z7_CPU_FAST_ROTATE_SUPPORTED
#endif
#ifdef MY_CPU_X86_OR_AMD64 #ifdef MY_CPU_X86_OR_AMD64
typedef struct void Z7_FASTCALL z7_x86_cpuid(UInt32 a[4], UInt32 function);
{ UInt32 Z7_FASTCALL z7_x86_cpuid_GetMaxFunc(void);
UInt32 maxFunc; #if defined(MY_CPU_AMD64)
UInt32 vendor[3]; #define Z7_IF_X86_CPUID_SUPPORTED
UInt32 ver; #else
UInt32 b; #define Z7_IF_X86_CPUID_SUPPORTED if (z7_x86_cpuid_GetMaxFunc())
UInt32 c; #endif
UInt32 d;
} Cx86cpuid;
enum
{
CPU_FIRM_INTEL,
CPU_FIRM_AMD,
CPU_FIRM_VIA
};
void MyCPUID(UInt32 function, UInt32 *a, UInt32 *b, UInt32 *c, UInt32 *d);
BoolInt x86cpuid_CheckAndRead(Cx86cpuid *p);
int x86cpuid_GetFirm(const Cx86cpuid *p);
#define x86cpuid_GetFamily(ver) (((ver >> 16) & 0xFF0) | ((ver >> 8) & 0xF))
#define x86cpuid_GetModel(ver) (((ver >> 12) & 0xF0) | ((ver >> 4) & 0xF))
#define x86cpuid_GetStepping(ver) (ver & 0xF)
BoolInt CPU_Is_InOrder(void);
BoolInt CPU_IsSupported_AES(void); BoolInt CPU_IsSupported_AES(void);
BoolInt CPU_IsSupported_AVX(void);
BoolInt CPU_IsSupported_AVX2(void); BoolInt CPU_IsSupported_AVX2(void);
BoolInt CPU_IsSupported_AVX512F_AVX512VL(void);
BoolInt CPU_IsSupported_VAES_AVX2(void); BoolInt CPU_IsSupported_VAES_AVX2(void);
BoolInt CPU_IsSupported_CMOV(void);
BoolInt CPU_IsSupported_SSE(void);
BoolInt CPU_IsSupported_SSE2(void);
BoolInt CPU_IsSupported_SSSE3(void); BoolInt CPU_IsSupported_SSSE3(void);
BoolInt CPU_IsSupported_SSE41(void); BoolInt CPU_IsSupported_SSE41(void);
BoolInt CPU_IsSupported_SHA(void); BoolInt CPU_IsSupported_SHA(void);
BoolInt CPU_IsSupported_SHA512(void);
BoolInt CPU_IsSupported_PageGB(void); BoolInt CPU_IsSupported_PageGB(void);
#elif defined(MY_CPU_ARM_OR_ARM64) #elif defined(MY_CPU_ARM_OR_ARM64)
@ -429,12 +672,13 @@ BoolInt CPU_IsSupported_SHA1(void);
BoolInt CPU_IsSupported_SHA2(void); BoolInt CPU_IsSupported_SHA2(void);
BoolInt CPU_IsSupported_AES(void); BoolInt CPU_IsSupported_AES(void);
#endif #endif
BoolInt CPU_IsSupported_SHA512(void);
#endif #endif
#if defined(__APPLE__) #if defined(__APPLE__)
int My_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize); int z7_sysctlbyname_Get(const char *name, void *buf, size_t *bufSize);
int My_sysctlbyname_Get_UInt32(const char *name, UInt32 *val); int z7_sysctlbyname_Get_UInt32(const char *name, UInt32 *val);
#endif #endif
EXTERN_C_END EXTERN_C_END
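A hedged usage sketch for the cpuid helpers declared above (requires linking CpuArch.c; the assumption that z7_x86_cpuid() stores EAX, EBX, ECX, EDX into a[0..3] follows the usual register order but should be verified against CpuArch.c):

#include <stdio.h>
#include <string.h>
#include "CpuArch.h"

static void PrintVendor(void)
{
  Z7_IF_X86_CPUID_SUPPORTED
  {
    UInt32 a[4];
    char vendor[13];
    z7_x86_cpuid(a, 0);            /* function 0: max func + vendor string */
    memcpy(vendor,     &a[1], 4);  /* EBX */
    memcpy(vendor + 4, &a[3], 4);  /* EDX */
    memcpy(vendor + 8, &a[2], 4);  /* ECX */
    vendor[12] = 0;
    printf("cpu vendor: %s\n", vendor);
  }
}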
View file
@ -1,8 +1,8 @@
/* Delta.h -- Delta converter /* Delta.h -- Delta converter
2013-01-18 : Igor Pavlov : Public domain */ 2023-03-03 : Igor Pavlov : Public domain */
#ifndef __DELTA_H #ifndef ZIP7_INC_DELTA_H
#define __DELTA_H #define ZIP7_INC_DELTA_H
#include "7zTypes.h" #include "7zTypes.h"

View file

@ -1,110 +1,99 @@
/* DllSecur.c -- DLL loading security /* DllSecur.c -- DLL loading security
2021-12-25 : Igor Pavlov : Public domain */ 2023-12-03 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#ifdef _WIN32 #ifdef _WIN32
#include <Windows.h> #include "7zWindows.h"
#include "DllSecur.h" #include "DllSecur.h"
#ifndef UNDER_CE #ifndef UNDER_CE
Z7_DIAGNOSTIC_IGNORE_CAST_FUNCTION
typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags); typedef BOOL (WINAPI *Func_SetDefaultDllDirectories)(DWORD DirectoryFlags);
#define MY_LOAD_LIBRARY_SEARCH_USER_DIRS 0x400 #define MY_LOAD_LIBRARY_SEARCH_USER_DIRS 0x400
#define MY_LOAD_LIBRARY_SEARCH_SYSTEM32 0x800 #define MY_LOAD_LIBRARY_SEARCH_SYSTEM32 0x800
#define DELIM "\0"
static const char * const g_Dlls = static const char * const g_Dlls =
"userenv"
DELIM "setupapi"
DELIM "apphelp"
DELIM "propsys"
DELIM "dwmapi"
DELIM "cryptbase"
DELIM "oleacc"
DELIM "clbcatq"
DELIM "version"
#ifndef _CONSOLE #ifndef _CONSOLE
"UXTHEME\0" DELIM "uxtheme"
#endif #endif
"USERENV\0" DELIM;
"SETUPAPI\0"
"APPHELP\0"
"PROPSYS\0"
"DWMAPI\0"
"CRYPTBASE\0"
"OLEACC\0"
"CLBCATQ\0"
"VERSION\0"
;
#endif #endif
// #define MY_CAST_FUNC (void(*)()) #ifdef __clang__
#define MY_CAST_FUNC #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#endif
#if defined (_MSC_VER) && _MSC_VER >= 1900
// sysinfoapi.h: kit10: GetVersion was declared deprecated
#pragma warning(disable : 4996)
#endif
void My_SetDefaultDllDirectories() #define IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN \
if ((UInt16)GetVersion() != 6) { \
const \
Func_SetDefaultDllDirectories setDllDirs = \
(Func_SetDefaultDllDirectories) Z7_CAST_FUNC_C GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), \
"SetDefaultDllDirectories"); \
if (setDllDirs) if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS)) return; }
void My_SetDefaultDllDirectories(void)
{ {
#ifndef UNDER_CE #ifndef UNDER_CE
IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN
OSVERSIONINFO vi;
vi.dwOSVersionInfoSize = sizeof(vi);
if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0)
{
Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories)
MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories");
if (setDllDirs)
if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS))
return;
}
#endif #endif
} }
void LoadSecurityDlls() void LoadSecurityDlls(void)
{ {
#ifndef UNDER_CE #ifndef UNDER_CE
// at Vista (ver 6.0) : CoCreateInstance(CLSID_ShellLink, ...) doesn't work after SetDefaultDllDirectories() : Check it ???
wchar_t buf[MAX_PATH + 100]; IF_NON_VISTA_SET_DLL_DIRS_AND_RETURN
{
// at Vista (ver 6.0) : CoCreateInstance(CLSID_ShellLink, ...) doesn't work after SetDefaultDllDirectories() : Check it ???
OSVERSIONINFO vi;
vi.dwOSVersionInfoSize = sizeof(vi);
if (!GetVersionEx(&vi) || vi.dwMajorVersion != 6 || vi.dwMinorVersion != 0)
{
Func_SetDefaultDllDirectories setDllDirs = (Func_SetDefaultDllDirectories)
MY_CAST_FUNC GetProcAddress(GetModuleHandle(TEXT("kernel32.dll")), "SetDefaultDllDirectories");
if (setDllDirs)
if (setDllDirs(MY_LOAD_LIBRARY_SEARCH_SYSTEM32 | MY_LOAD_LIBRARY_SEARCH_USER_DIRS))
return;
}
}
{
unsigned len = GetSystemDirectoryW(buf, MAX_PATH + 2);
if (len == 0 || len > MAX_PATH)
return;
}
{ {
wchar_t buf[MAX_PATH + 100];
const char *dll; const char *dll;
unsigned pos = (unsigned)lstrlenW(buf); unsigned pos = GetSystemDirectoryW(buf, MAX_PATH + 2);
if (pos == 0 || pos > MAX_PATH)
return;
if (buf[pos - 1] != '\\') if (buf[pos - 1] != '\\')
buf[pos++] = '\\'; buf[pos++] = '\\';
for (dll = g_Dlls; *dll != 0;)
for (dll = g_Dlls; dll[0] != 0;)
{ {
unsigned k = 0; wchar_t *dest = &buf[pos];
for (;;) for (;;)
{ {
char c = *dll++; const char c = *dll++;
buf[pos + k] = (Byte)c;
k++;
if (c == 0) if (c == 0)
break; break;
*dest++ = (Byte)c;
} }
dest[0] = '.';
lstrcatW(buf, L".dll"); dest[1] = 'd';
dest[2] = 'l';
dest[3] = 'l';
dest[4] = 0;
// lstrcatW(buf, L".dll");
LoadLibraryExW(buf, NULL, LOAD_WITH_ALTERED_SEARCH_PATH); LoadLibraryExW(buf, NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
} }
} }
#endif #endif
} }
#endif #endif // _WIN32
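The g_Dlls table above relies on a double-NUL string-table layout; a minimal stand-alone sketch of the same walk (the names here are illustrative, taken from the table above):

#include <stdio.h>

int main(void)
{
  static const char * const table = "userenv\0setupapi\0apphelp\0";
  const char *dll;
  for (dll = table; dll[0] != 0;)
  {
    printf("preload: %s.dll\n", dll);
    while (*dll++ != 0) {}  /* advance past this name's terminating NUL */
  }
  return 0;
}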
View file
@ -1,8 +1,8 @@
/* DllSecur.h -- DLL loading for security /* DllSecur.h -- DLL loading for security
2018-02-19 : Igor Pavlov : Public domain */ 2023-03-03 : Igor Pavlov : Public domain */
#ifndef __DLL_SECUR_H #ifndef ZIP7_INC_DLL_SECUR_H
#define __DLL_SECUR_H #define ZIP7_INC_DLL_SECUR_H
#include "7zTypes.h" #include "7zTypes.h"

View file
/* HuffEnc.c -- functions for Huffman encoding /* HuffEnc.c -- functions for Huffman encoding
2021-02-09 : Igor Pavlov : Public domain */ Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include <string.h>
#include "HuffEnc.h" #include "HuffEnc.h"
#include "Sort.h" #include "Sort.h"
#include "CpuArch.h"
#define kMaxLen 16 #define kMaxLen Z7_HUFFMAN_LEN_MAX
#define NUM_BITS 10 #define NUM_BITS 10
#define MASK (((unsigned)1 << NUM_BITS) - 1) #define MASK ((1u << NUM_BITS) - 1)
#define FREQ_MASK (~(UInt32)MASK)
#define NUM_COUNTERS (48 * 2)
#define NUM_COUNTERS 64 #if 1 && (defined(MY_CPU_LE) || defined(MY_CPU_BE))
#if defined(MY_CPU_LE)
#define HI_HALF_OFFSET 1
#else
#define HI_HALF_OFFSET 0
#endif
#define LOAD_PARENT(p) ((unsigned)*((const UInt16 *)(p) + HI_HALF_OFFSET))
#define STORE_PARENT(p, fb, val) *((UInt16 *)(p) + HI_HALF_OFFSET) = (UInt16)(val);
#define STORE_PARENT_DIRECT(p, fb, hi) STORE_PARENT(p, fb, hi)
#define UPDATE_E(eHi) eHi++;
#else
#define LOAD_PARENT(p) ((unsigned)(*(p) >> NUM_BITS))
#define STORE_PARENT_DIRECT(p, fb, hi) *(p) = ((fb) & MASK) | (hi); // set parent field
#define STORE_PARENT(p, fb, val) STORE_PARENT_DIRECT(p, fb, ((UInt32)(val) << NUM_BITS))
#define UPDATE_E(eHi) eHi += 1 << NUM_BITS;
#endif
#define HUFFMAN_SPEED_OPT void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, unsigned numSymbols, unsigned maxLen)
void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymbols, UInt32 maxLen)
{ {
UInt32 num = 0; #if NUM_COUNTERS > 2
/* if (maxLen > 10) maxLen = 10; */ unsigned counters[NUM_COUNTERS];
#endif
#if 1 && NUM_COUNTERS > (kMaxLen + 4) * 2
#define lenCounters (counters)
#define codes (counters + kMaxLen + 4)
#else
unsigned lenCounters[kMaxLen + 1];
UInt32 codes[kMaxLen + 1];
#endif
unsigned num;
{ {
UInt32 i; unsigned i;
// UInt32 sum = 0;
#ifdef HUFFMAN_SPEED_OPT #if NUM_COUNTERS > 2
#define CTR_ITEM_FOR_FREQ(freq) \
counters[(freq) >= NUM_COUNTERS - 1 ? NUM_COUNTERS - 1 : (unsigned)(freq)]
UInt32 counters[NUM_COUNTERS];
for (i = 0; i < NUM_COUNTERS; i++) for (i = 0; i < NUM_COUNTERS; i++)
counters[i] = 0; counters[i] = 0;
for (i = 0; i < numSymbols; i++) memset(lens, 0, numSymbols);
{ {
UInt32 freq = freqs[i]; const UInt32 *fp = freqs + numSymbols;
counters[(freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1]++; #define NUM_UNROLLS 1
#if NUM_UNROLLS > 1 // use 1 if an odd (numSymbols) is possible
if (numSymbols & 1)
{
UInt32 f;
f = *--fp; CTR_ITEM_FOR_FREQ(f)++;
// sum += f;
}
#endif
do
{
UInt32 f;
fp -= NUM_UNROLLS;
f = fp[0]; CTR_ITEM_FOR_FREQ(f)++;
// sum += f;
#if NUM_UNROLLS > 1
f = fp[1]; CTR_ITEM_FOR_FREQ(f)++;
// sum += f;
#endif
}
while (fp != freqs);
} }
#if 0
printf("\nsum=%8u numSymbols =%3u ctrs:", sum, numSymbols);
{
unsigned k = 0;
for (k = 0; k < NUM_COUNTERS; k++)
printf(" %u", counters[k]);
}
#endif
for (i = 1; i < NUM_COUNTERS; i++) num = counters[1];
counters[1] = 0;
for (i = 2; i != NUM_COUNTERS; i += 2)
{ {
UInt32 temp = counters[i]; unsigned c;
counters[i] = num; c = (counters )[i]; (counters )[i] = num; num += c;
num += temp; c = (counters + 1)[i]; (counters + 1)[i] = num; num += c;
} }
counters[0] = num; // we want to write (freq==0) symbols to the end of (p) array
for (i = 0; i < numSymbols; i++)
{ {
UInt32 freq = freqs[i]; i = 0;
if (freq == 0) do
lens[i] = 0; {
else const UInt32 f = freqs[i];
p[counters[((freq < NUM_COUNTERS - 1) ? freq : NUM_COUNTERS - 1)]++] = i | (freq << NUM_BITS); #if 0
if (f == 0) lens[i] = 0; else
#endif
p[CTR_ITEM_FOR_FREQ(f)++] = i | (f << NUM_BITS);
}
while (++i != numSymbols);
} }
counters[0] = 0;
HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]); HeapSort(p + counters[NUM_COUNTERS - 2], counters[NUM_COUNTERS - 1] - counters[NUM_COUNTERS - 2]);
#else #else // NUM_COUNTERS <= 2
num = 0;
for (i = 0; i < numSymbols; i++) for (i = 0; i < numSymbols; i++)
{ {
UInt32 freq = freqs[i]; const UInt32 freq = freqs[i];
if (freq == 0) if (freq == 0)
lens[i] = 0; lens[i] = 0;
else else
@ -62,17 +127,27 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb
} }
HeapSort(p, num); HeapSort(p, num);
#endif #endif
} }
if (num < 2) if (num <= 2)
{ {
unsigned minCode = 0; unsigned minCode = 0;
unsigned maxCode = 1; unsigned maxCode = 1;
if (num == 1) if (num)
{ {
maxCode = (unsigned)p[0] & MASK; maxCode = (unsigned)p[(size_t)num - 1] & MASK;
if (maxCode == 0) if (num == 2)
{
minCode = (unsigned)p[0] & MASK;
if (minCode > maxCode)
{
const unsigned temp = minCode;
minCode = maxCode;
maxCode = temp;
}
}
else if (maxCode == 0)
maxCode++; maxCode++;
} }
p[minCode] = 0; p[minCode] = 0;
@ -80,69 +155,206 @@ void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 numSymb
lens[minCode] = lens[maxCode] = 1; lens[minCode] = lens[maxCode] = 1;
return; return;
} }
{ {
UInt32 b, e, i; unsigned i;
for (i = 0; i <= kMaxLen; i++)
i = b = e = 0; lenCounters[i] = 0;
do lenCounters[1] = 2; // by default root node has 2 child leaves at level 1.
}
// if (num != 2)
{
// num > 2
// the binary tree will contain (num - 1) internal nodes.
// p[num - 2] will be root node of binary tree.
UInt32 *b;
UInt32 *n;
// first node will have two leaf children: p[0] and p[1]:
// p[0] += p[1] & FREQ_MASK; // set frequency sum of child leafs
// if (pi == n) exit(0);
// if (pi != n)
{ {
UInt32 n, m, freq; UInt32 fb = (p[1] & FREQ_MASK) + p[0];
n = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++; UInt32 f = p[2] & FREQ_MASK;
freq = (p[n] & ~MASK); const UInt32 *pi = p + 2;
p[n] = (p[n] & MASK) | (e << NUM_BITS); UInt32 *e = p;
m = (i != num && (b == e || (p[i] >> NUM_BITS) <= (p[b] >> NUM_BITS))) ? i++ : b++; UInt32 eHi = 0;
freq += (p[m] & ~MASK); n = p + num;
p[m] = (p[m] & MASK) | (e << NUM_BITS); b = p;
p[e] = (p[e] & MASK) | freq; // p[0] = fb;
e++; for (;;)
{
// (b <= e)
UInt32 sum;
e++;
UPDATE_E(eHi)
// (b < e)
// p range : high bits
// [0, b) : parent : processed nodes that have parent and childs
// [b, e) : FREQ : non-processed nodes that have no parent but have childs
// [e, pi) : FREQ : processed leaves for which parent node was created
// [pi, n) : FREQ : non-processed leaves for which parent node was not created
// first child
// note : (*b < f) is same result as ((*b & FREQ_MASK) < f)
if (fb < f)
{
// node freq is smaller
sum = fb & FREQ_MASK;
STORE_PARENT_DIRECT (b, fb, eHi)
b++;
fb = *b;
if (b == e)
{
if (++pi == n)
break;
sum += f;
fb &= MASK;
fb |= sum;
*e = fb;
f = *pi & FREQ_MASK;
continue;
}
}
else if (++pi == n)
{
STORE_PARENT_DIRECT (b, fb, eHi)
b++;
break;
}
else
{
sum = f;
f = *pi & FREQ_MASK;
}
// (b < e)
// second child
if (fb < f)
{
sum += fb;
sum &= FREQ_MASK;
STORE_PARENT_DIRECT (b, fb, eHi)
b++;
*e = (*e & MASK) | sum; // set frequency sum
// (b <= e) is possible here
fb = *b;
}
else if (++pi == n)
break;
else
{
sum += f;
f = *pi & FREQ_MASK;
*e = (*e & MASK) | sum; // set frequency sum
}
}
} }
while (num - e > 1);
// printf("\nnum-e=%3u, numSymbols=%3u, num=%3u, b=%3u", n - e, numSymbols, n - p, b - p);
{ {
UInt32 lenCounters[kMaxLen + 1]; n -= 2;
for (i = 0; i <= kMaxLen; i++) *n &= MASK; // root node : we clear high bits (zero bits mean level == 0)
lenCounters[i] = 0; if (n != b)
p[--e] &= MASK;
lenCounters[1] = 2;
while (e > 0)
{ {
UInt32 len = (p[p[--e] >> NUM_BITS] >> NUM_BITS) + 1; // We go here, if we have some number of non-created nodes up to root.
p[e] = (p[e] & MASK) | (len << NUM_BITS); // We process them in simplified code:
// position of parent for each pair of nodes is known.
// n[-2], n[-1] : current pair of child nodes
// (p1) : parent node for current pair.
UInt32 *p1 = n;
do
{
const unsigned len = LOAD_PARENT(p1) + 1;
p1--;
(lenCounters )[len] -= 2; // we remove 2 leaves from level (len)
(lenCounters + 1)[len] += 2 * 2; // we add 4 leaves at level (len + 1)
n -= 2;
STORE_PARENT (n , n[0], len)
STORE_PARENT (n + 1, n[1], len)
}
while (n != b);
}
}
if (b != p)
{
// we detect the level of each node (relative to root),
// and update lenCounters[].
// We process only intermediate nodes and we don't process leaves.
do
{
// if (ii < b) : parent_bits_of (p[ii]) == index of parent node : ii < (p[ii])
// if (ii >= b) : parent_bits_of (p[ii]) == level of this (ii) node in tree
unsigned len;
b--;
len = (unsigned)LOAD_PARENT(p + LOAD_PARENT(b)) + 1;
STORE_PARENT (b, *b, len)
if (len >= maxLen) if (len >= maxLen)
for (len = maxLen - 1; lenCounters[len] == 0; len--); {
lenCounters[len]--; // We are not allowed to create a node at level (maxLen) or higher,
lenCounters[(size_t)len + 1] += 2; // because all leaves must be placed at level (maxLen) or lower.
// We find the nearest allowed leaf and place the current node at that leaf's level:
for (len = maxLen - 1; lenCounters[len] == 0; len--) {}
}
lenCounters[len]--; // we remove 1 leaf from level (len)
(lenCounters + 1)[len] += 2; // we add 2 leaves at level (len + 1)
} }
while (b != p);
}
}
{
{
unsigned len = maxLen;
const UInt32 *p2 = p;
do
{ {
UInt32 len; unsigned k = lenCounters[len];
i = 0; if (k)
for (len = maxLen; len != 0; len--) do
{ lens[(unsigned)*p2++ & MASK] = (Byte)len;
UInt32 k; while (--k);
for (k = lenCounters[len]; k != 0; k--)
lens[p[i++] & MASK] = (Byte)len;
}
} }
while (--len);
}
codes[0] = 0; // we don't want garbage values to be written to p[] array.
// codes[1] = 0;
{
UInt32 code = 0;
unsigned len;
for (len = 0; len < kMaxLen; len++)
(codes + 1)[len] = code = (code + lenCounters[len]) << 1;
}
/* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */
{
const Byte * const limit = lens + numSymbols;
do
{ {
UInt32 nextCodes[kMaxLen + 1]; unsigned len;
{ UInt32 c;
UInt32 code = 0; len = lens[0]; c = codes[len]; p[0] = c; codes[len] = c + 1;
UInt32 len; // len = lens[1]; c = codes[len]; p[1] = c; codes[len] = c + 1;
for (len = 1; len <= kMaxLen; len++) p += 1;
nextCodes[len] = code = (code + lenCounters[(size_t)len - 1]) << 1; lens += 1;
}
/* if (code + lenCounters[kMaxLen] - 1 != (1 << kMaxLen) - 1) throw 1; */
{
UInt32 k;
for (k = 0; k < numSymbols; k++)
p[k] = nextCodes[lens[k]]++;
}
} }
while (lens != limit);
} }
} }
} }
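To make the final passes above easier to follow, here is a hedged stand-alone sketch of the same canonical-code assignment in isolation: the first code of each length is (code + lenCounters[len]) << 1, and symbols of equal length take consecutive codes (a simplification of HuffEnc.c's last two loops, not a drop-in replacement):

#include <stdio.h>

#define kMaxLen 16
typedef unsigned char Byte;
typedef unsigned int UInt32;

static void AssignCodes(const Byte *lens, UInt32 *p, unsigned numSymbols)
{
  unsigned lenCounters[kMaxLen + 1] = { 0 };
  UInt32 codes[kMaxLen + 1];
  UInt32 code = 0;
  unsigned i, len;
  for (i = 0; i < numSymbols; i++)
    lenCounters[lens[i]]++;
  codes[0] = 0;
  lenCounters[0] = 0;  /* symbols with freq 0 have len 0 and get no code */
  for (len = 0; len < kMaxLen; len++)
    (codes + 1)[len] = code = (code + lenCounters[len]) << 1;
  for (i = 0; i < numSymbols; i++)
    p[i] = codes[lens[i]]++;
}

int main(void)
{
  const Byte lens[5] = { 2, 2, 2, 3, 3 };  /* expect 00,01,10,110,111 */
  UInt32 p[5];
  unsigned i;
  AssignCodes(lens, p, 5);
  for (i = 0; i < 5; i++)
    printf("sym %u: len %u code %X\n", i, lens[i], p[i]);
  return 0;
}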
#undef kMaxLen
#undef NUM_BITS
#undef MASK
#undef FREQ_MASK
#undef NUM_COUNTERS
#undef CTR_ITEM_FOR_FREQ
#undef LOAD_PARENT
#undef STORE_PARENT
#undef STORE_PARENT_DIRECT
#undef UPDATE_E
#undef HI_HALF_OFFSET
#undef NUM_UNROLLS
#undef lenCounters
#undef codes
View file
@ -1,21 +1,21 @@
/* HuffEnc.h -- Huffman encoding /* HuffEnc.h -- Huffman encoding
2013-01-18 : Igor Pavlov : Public domain */ Igor Pavlov : Public domain */
#ifndef __HUFF_ENC_H #ifndef ZIP7_INC_HUFF_ENC_H
#define __HUFF_ENC_H #define ZIP7_INC_HUFF_ENC_H
#include "7zTypes.h" #include "7zTypes.h"
EXTERN_C_BEGIN EXTERN_C_BEGIN
#define Z7_HUFFMAN_LEN_MAX 16
/* /*
Conditions: Conditions:
num <= 1024 = 2 ^ NUM_BITS 2 <= num <= 1024 = 2 ^ NUM_BITS
Sum(freqs) < 4M = 2 ^ (32 - NUM_BITS) Sum(freqs) < 4M = 2 ^ (32 - NUM_BITS)
maxLen <= 16 = kMaxLen 1 <= maxLen <= 16 = Z7_HUFFMAN_LEN_MAX
Num_Items(p) >= HUFFMAN_TEMP_SIZE(num) Num_Items(p) >= HUFFMAN_TEMP_SIZE(num)
*/ */
void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 num, UInt32 maxLen); void Huffman_Generate(const UInt32 *freqs, UInt32 *p, Byte *lens, UInt32 num, UInt32 maxLen);
EXTERN_C_END EXTERN_C_END
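A hedged call-site sketch under the conditions listed above (HUFFMAN_TEMP_SIZE is taken on trust from the comment; per HuffEnc.c's final pass, p[i] receives the canonical code and lens[i] the code length, 0 for zero-frequency symbols):

#include "HuffEnc.h"

void HuffExample(void)
{
  enum { NUM = 8 };
  UInt32 freqs[NUM] = { 10, 1, 1, 2, 4, 8, 16, 0 };  /* sum 42 < 4M */
  UInt32 p[HUFFMAN_TEMP_SIZE(NUM)];
  Byte lens[NUM];
  Huffman_Generate(freqs, p, lens, NUM, 15);  /* maxLen 15 <= 16 */
}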
File diff suppressed because it is too large

View file
@ -1,8 +1,8 @@
/* LzFind.h -- Match finder for LZ algorithms /* LzFind.h -- Match finder for LZ algorithms
2021-07-13 : Igor Pavlov : Public domain */ 2024-01-22 : Igor Pavlov : Public domain */
#ifndef __LZ_FIND_H #ifndef ZIP7_INC_LZ_FIND_H
#define __LZ_FIND_H #define ZIP7_INC_LZ_FIND_H
#include "7zTypes.h" #include "7zTypes.h"
@ -10,9 +10,9 @@ EXTERN_C_BEGIN
typedef UInt32 CLzRef; typedef UInt32 CLzRef;
typedef struct _CMatchFinder typedef struct
{ {
Byte *buffer; const Byte *buffer;
UInt32 pos; UInt32 pos;
UInt32 posLimit; UInt32 posLimit;
UInt32 streamPos; /* wrap over Zero is allowed (streamPos < pos). Use (UInt32)(streamPos - pos) */ UInt32 streamPos; /* wrap over Zero is allowed (streamPos < pos). Use (UInt32)(streamPos - pos) */
@ -32,8 +32,8 @@ typedef struct _CMatchFinder
UInt32 hashMask; UInt32 hashMask;
UInt32 cutValue; UInt32 cutValue;
Byte *bufferBase; Byte *bufBase;
ISeqInStream *stream; ISeqInStreamPtr stream;
UInt32 blockSize; UInt32 blockSize;
UInt32 keepSizeBefore; UInt32 keepSizeBefore;
@ -43,7 +43,9 @@ typedef struct _CMatchFinder
size_t directInputRem; size_t directInputRem;
UInt32 historySize; UInt32 historySize;
UInt32 fixedHashSize; UInt32 fixedHashSize;
UInt32 hashSizeSum; Byte numHashBytes_Min;
Byte numHashOutBits;
Byte _pad2_[2];
SRes result; SRes result;
UInt32 crc[256]; UInt32 crc[256];
size_t numRefs; size_t numRefs;
@ -69,24 +71,45 @@ void MatchFinder_ReadIfRequired(CMatchFinder *p);
void MatchFinder_Construct(CMatchFinder *p); void MatchFinder_Construct(CMatchFinder *p);
/* Conditions: /* (directInput = 0) is default value.
historySize <= 3 GB It's required to provide correct (directInput) value
keepAddBufferBefore + matchMaxLen + keepAddBufferAfter < 511MB before calling MatchFinder_Create().
You can set (directInput) by any of the following calls:
- MatchFinder_SET_DIRECT_INPUT_BUF()
- MatchFinder_SET_STREAM()
- MatchFinder_SET_STREAM_MODE()
*/ */
#define MatchFinder_SET_DIRECT_INPUT_BUF(p, _src_, _srcLen_) { \
(p)->stream = NULL; \
(p)->directInput = 1; \
(p)->buffer = (_src_); \
(p)->directInputRem = (_srcLen_); }
/*
#define MatchFinder_SET_STREAM_MODE(p) { \
(p)->directInput = 0; }
*/
#define MatchFinder_SET_STREAM(p, _stream_) { \
(p)->stream = _stream_; \
(p)->directInput = 0; }
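A hedged setup sketch for direct-input mode, following the required order stated above (parameter values are illustrative; 273 is the LZMA maximum match length):

#include "LzFind.h"

static int SetupDirect(CMatchFinder *mf, const Byte *src, size_t srcLen,
    ISzAllocPtr alloc)
{
  MatchFinder_Construct(mf);
  MatchFinder_SET_DIRECT_INPUT_BUF(mf, src, srcLen)
  return MatchFinder_Create(mf,
      1 << 22,  /* historySize */
      0,        /* keepAddBufferBefore */
      273,      /* matchMaxLen */
      0,        /* keepAddBufferAfter */
      alloc);
}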
int MatchFinder_Create(CMatchFinder *p, UInt32 historySize, int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter, UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
ISzAllocPtr alloc); ISzAllocPtr alloc);
void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc); void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc);
void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems); void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems);
// void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue);
/* /*
#define Inline_MatchFinder_InitPos(p, val) \ #define MatchFinder_INIT_POS(p, val) \
(p)->pos = (val); \ (p)->pos = (val); \
(p)->streamPos = (val); (p)->streamPos = (val);
*/ */
#define Inline_MatchFinder_ReduceOffsets(p, subValue) \ // void MatchFinder_ReduceOffsets(CMatchFinder *p, UInt32 subValue);
#define MatchFinder_REDUCE_OFFSETS(p, subValue) \
(p)->pos -= (subValue); \ (p)->pos -= (subValue); \
(p)->streamPos -= (subValue); (p)->streamPos -= (subValue);
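A small numeric sketch of why this normalization is safe: subtracting the same subValue from pos and from every stored position leaves all match distances (pos - matchPos) unchanged, which is all the match finder consumes (the values below are illustrative):

#include <stdio.h>
typedef unsigned int UInt32;

int main(void)
{
  UInt32 pos = 0xFFFF0000, matchPos = 0xFFFE0000;
  const UInt32 subValue = 0xFFF00000;
  printf("dist before: %u\n", pos - matchPos);  /* 65536 */
  pos -= subValue;
  matchPos -= subValue;  /* the MatchFinder_REDUCE_OFFSETS idea */
  printf("dist after:  %u\n", pos - matchPos);  /* 65536 */
  return 0;
}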
@ -107,7 +130,7 @@ typedef const Byte * (*Mf_GetPointerToCurrentPos_Func)(void *object);
typedef UInt32 * (*Mf_GetMatches_Func)(void *object, UInt32 *distances); typedef UInt32 * (*Mf_GetMatches_Func)(void *object, UInt32 *distances);
typedef void (*Mf_Skip_Func)(void *object, UInt32); typedef void (*Mf_Skip_Func)(void *object, UInt32);
typedef struct _IMatchFinder typedef struct
{ {
Mf_Init_Func Init; Mf_Init_Func Init;
Mf_GetNumAvailableBytes_Func GetNumAvailableBytes; Mf_GetNumAvailableBytes_Func GetNumAvailableBytes;
@ -121,7 +144,8 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable);
void MatchFinder_Init_LowHash(CMatchFinder *p); void MatchFinder_Init_LowHash(CMatchFinder *p);
void MatchFinder_Init_HighHash(CMatchFinder *p); void MatchFinder_Init_HighHash(CMatchFinder *p);
void MatchFinder_Init_4(CMatchFinder *p); void MatchFinder_Init_4(CMatchFinder *p);
void MatchFinder_Init(CMatchFinder *p); // void MatchFinder_Init(CMatchFinder *p);
void MatchFinder_Init(void *p);
UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances);
UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances); UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances);
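A hedged sketch of driving a finder through this vtable (assumes the finder was already created and its input attached, per the setup macros earlier in this header):

#include "LzFind.h"

static void SkipAll(CMatchFinder *mf)
{
  IMatchFinder2 vt;
  MatchFinder_CreateVTable(mf, &vt);
  vt.Init(mf);
  while (vt.GetNumAvailableBytes(mf) != 0)
    vt.Skip(mf, 1);  /* advance one byte at a time, updating hash chains */
}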
View file
@ -1,5 +1,5 @@
/* LzFindMt.c -- multithreaded Match finder for LZ algorithms /* LzFindMt.c -- multithreaded Match finder for LZ algorithms
2021-12-21 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -69,7 +69,7 @@ extern UInt64 g_NumIters_Bytes;
UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); } h3 = (temp ^ ((UInt32)cur[2] << 8)) & (kHash3Size - 1); }
#define __MT_HASH4_CALC { \ #define MT_HASH4_CALC { \
UInt32 temp = p->crc[cur[0]] ^ cur[1]; \ UInt32 temp = p->crc[cur[0]] ^ cur[1]; \
h2 = temp & (kHash2Size - 1); \ h2 = temp & (kHash2Size - 1); \
temp ^= ((UInt32)cur[2] << 8); \ temp ^= ((UInt32)cur[2] << 8); \
@ -79,14 +79,16 @@ extern UInt64 g_NumIters_Bytes;
*/ */
MY_NO_INLINE Z7_NO_INLINE
static void MtSync_Construct(CMtSync *p) static void MtSync_Construct(CMtSync *p)
{ {
p->affinityGroup = -1;
p->affinityInGroup = 0;
p->affinity = 0; p->affinity = 0;
p->wasCreated = False; p->wasCreated = False;
p->csWasInitialized = False; p->csWasInitialized = False;
p->csWasEntered = False; p->csWasEntered = False;
Thread_Construct(&p->thread); Thread_CONSTRUCT(&p->thread)
Event_Construct(&p->canStart); Event_Construct(&p->canStart);
Event_Construct(&p->wasStopped); Event_Construct(&p->wasStopped);
Semaphore_Construct(&p->freeSemaphore); Semaphore_Construct(&p->freeSemaphore);
@ -94,7 +96,7 @@ static void MtSync_Construct(CMtSync *p)
} }
#define DEBUG_BUFFER_LOCK // define it to debug lock state // #define DEBUG_BUFFER_LOCK // define it to debug lock state
#ifdef DEBUG_BUFFER_LOCK #ifdef DEBUG_BUFFER_LOCK
#include <stdlib.h> #include <stdlib.h>
@ -116,7 +118,7 @@ static void MtSync_Construct(CMtSync *p)
(p)->csWasEntered = False; } (p)->csWasEntered = False; }
MY_NO_INLINE Z7_NO_INLINE
static UInt32 MtSync_GetNextBlock(CMtSync *p) static UInt32 MtSync_GetNextBlock(CMtSync *p)
{ {
UInt32 numBlocks = 0; UInt32 numBlocks = 0;
@ -140,14 +142,14 @@ static UInt32 MtSync_GetNextBlock(CMtSync *p)
// buffer is UNLOCKED here // buffer is UNLOCKED here
Semaphore_Wait(&p->filledSemaphore); Semaphore_Wait(&p->filledSemaphore);
LOCK_BUFFER(p); LOCK_BUFFER(p)
return numBlocks; return numBlocks;
} }
/* if Writing (Processing) thread was started, we must call MtSync_StopWriting() */ /* if Writing (Processing) thread was started, we must call MtSync_StopWriting() */
MY_NO_INLINE Z7_NO_INLINE
static void MtSync_StopWriting(CMtSync *p) static void MtSync_StopWriting(CMtSync *p)
{ {
if (!Thread_WasCreated(&p->thread) || p->needStart) if (!Thread_WasCreated(&p->thread) || p->needStart)
@ -185,7 +187,7 @@ static void MtSync_StopWriting(CMtSync *p)
} }
MY_NO_INLINE Z7_NO_INLINE
static void MtSync_Destruct(CMtSync *p) static void MtSync_Destruct(CMtSync *p)
{ {
PRF(printf("\nMtSync_Destruct %p\n", p)); PRF(printf("\nMtSync_Destruct %p\n", p));
@ -220,11 +222,11 @@ static void MtSync_Destruct(CMtSync *p)
// #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } // #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; }
// we want to get real system error codes here instead of SZ_ERROR_THREAD // we want to get real system error codes here instead of SZ_ERROR_THREAD
#define RINOK_THREAD(x) RINOK(x) #define RINOK_THREAD(x) RINOK_WRes(x)
// call it before each new file (when new starting is required): // call it before each new file (when new starting is required):
MY_NO_INLINE Z7_NO_INLINE
static SRes MtSync_Init(CMtSync *p, UInt32 numBlocks) static SRes MtSync_Init(CMtSync *p, UInt32 numBlocks)
{ {
WRes wres; WRes wres;
@ -245,12 +247,12 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *
if (p->wasCreated) if (p->wasCreated)
return SZ_OK; return SZ_OK;
RINOK_THREAD(CriticalSection_Init(&p->cs)); RINOK_THREAD(CriticalSection_Init(&p->cs))
p->csWasInitialized = True; p->csWasInitialized = True;
p->csWasEntered = False; p->csWasEntered = False;
RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart)); RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->canStart))
RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped)); RINOK_THREAD(AutoResetEvent_CreateNotSignaled(&p->wasStopped))
p->needStart = True; p->needStart = True;
p->exit = True; /* p->exit is unused before (canStart) Event. p->exit = True; /* p->exit is unused before (canStart) Event.
@ -259,18 +261,24 @@ static WRes MtSync_Create_WRes(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *
// return ERROR_TOO_MANY_POSTS; // for debug // return ERROR_TOO_MANY_POSTS; // for debug
// return EINVAL; // for debug // return EINVAL; // for debug
#ifdef _WIN32
if (p->affinityGroup >= 0)
wres = Thread_Create_With_Group(&p->thread, startAddress, obj,
(unsigned)(UInt32)p->affinityGroup, (CAffinityMask)p->affinityInGroup);
else
#endif
if (p->affinity != 0) if (p->affinity != 0)
wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity); wres = Thread_Create_With_Affinity(&p->thread, startAddress, obj, (CAffinityMask)p->affinity);
else else
wres = Thread_Create(&p->thread, startAddress, obj); wres = Thread_Create(&p->thread, startAddress, obj);
RINOK_THREAD(wres); RINOK_THREAD(wres)
p->wasCreated = True; p->wasCreated = True;
return SZ_OK; return SZ_OK;
} }
MY_NO_INLINE Z7_NO_INLINE
static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj) static SRes MtSync_Create(CMtSync *p, THREAD_FUNC_TYPE startAddress, void *obj)
{ {
const WRes wres = MtSync_Create_WRes(p, startAddress, obj); const WRes wres = MtSync_Create_WRes(p, startAddress, obj);
@ -519,7 +527,7 @@ static void HashThreadFunc(CMatchFinderMt *mt)
if (mf->pos > (UInt32)kMtMaxValForNormalize - num) if (mf->pos > (UInt32)kMtMaxValForNormalize - num)
{ {
const UInt32 subValue = (mf->pos - mf->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1); const UInt32 subValue = (mf->pos - mf->historySize - 1); // & ~(UInt32)(kNormalizeAlign - 1);
Inline_MatchFinder_ReduceOffsets(mf, subValue); MatchFinder_REDUCE_OFFSETS(mf, subValue)
MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1); MatchFinder_Normalize3(subValue, mf->hash + mf->fixedHashSize, (size_t)mf->hashMask + 1);
} }
@ -560,7 +568,7 @@ static void HashThreadFunc(CMatchFinderMt *mt)
*/ */
UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,
size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,
UInt32 *posRes); UInt32 *posRes);
@ -749,7 +757,7 @@ static void BtFillBlock(CMatchFinderMt *p, UInt32 globalBlockIndex)
} }
MY_NO_INLINE Z7_NO_INLINE
static void BtThreadFunc(CMatchFinderMt *mt) static void BtThreadFunc(CMatchFinderMt *mt)
{ {
CMtSync *p = &mt->btSync; CMtSync *p = &mt->btSync;
@ -864,21 +872,22 @@ SRes MatchFinderMt_Create(CMatchFinderMt *p, UInt32 historySize, UInt32 keepAddB
if (!MatchFinder_Create(mf, historySize, keepAddBufferBefore, matchMaxLen, keepAddBufferAfter, alloc)) if (!MatchFinder_Create(mf, historySize, keepAddBufferBefore, matchMaxLen, keepAddBufferAfter, alloc))
return SZ_ERROR_MEM; return SZ_ERROR_MEM;
RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p)); RINOK(MtSync_Create(&p->hashSync, HashThreadFunc2, p))
RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p)); RINOK(MtSync_Create(&p->btSync, BtThreadFunc2, p))
return SZ_OK; return SZ_OK;
} }
SRes MatchFinderMt_InitMt(CMatchFinderMt *p) SRes MatchFinderMt_InitMt(CMatchFinderMt *p)
{ {
RINOK(MtSync_Init(&p->hashSync, kMtHashNumBlocks)); RINOK(MtSync_Init(&p->hashSync, kMtHashNumBlocks))
return MtSync_Init(&p->btSync, kMtBtNumBlocks); return MtSync_Init(&p->btSync, kMtBtNumBlocks);
} }
static void MatchFinderMt_Init(CMatchFinderMt *p) static void MatchFinderMt_Init(void *_p)
{ {
CMatchFinderMt *p = (CMatchFinderMt *)_p;
CMatchFinder *mf = MF(p); CMatchFinder *mf = MF(p);
p->btBufPos = p->btBufPos =
@ -941,7 +950,7 @@ void MatchFinderMt_ReleaseStream(CMatchFinderMt *p)
} }
MY_NO_INLINE Z7_NO_INLINE
static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p) static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p)
{ {
if (p->failure_LZ_BT) if (p->failure_LZ_BT)
@ -981,8 +990,9 @@ static UInt32 MatchFinderMt_GetNextBlock_Bt(CMatchFinderMt *p)
static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p) static const Byte * MatchFinderMt_GetPointerToCurrentPos(void *_p)
{ {
CMatchFinderMt *p = (CMatchFinderMt *)_p;
return p->pointerToCurPos; return p->pointerToCurPos;
} }
@ -990,8 +1000,9 @@ static const Byte * MatchFinderMt_GetPointerToCurrentPos(CMatchFinderMt *p)
#define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p); #define GET_NEXT_BLOCK_IF_REQUIRED if (p->btBufPos == p->btBufPosLimit) MatchFinderMt_GetNextBlock_Bt(p);
static UInt32 MatchFinderMt_GetNumAvailableBytes(CMatchFinderMt *p) static UInt32 MatchFinderMt_GetNumAvailableBytes(void *_p)
{ {
CMatchFinderMt *p = (CMatchFinderMt *)_p;
if (p->btBufPos != p->btBufPosLimit) if (p->btBufPos != p->btBufPosLimit)
return p->btNumAvailBytes; return p->btNumAvailBytes;
return MatchFinderMt_GetNextBlock_Bt(p); return MatchFinderMt_GetNextBlock_Bt(p);
@ -1163,7 +1174,7 @@ UInt32* MatchFinderMt_GetMatches_Bt4(CMatchFinderMt *p, UInt32 *d)
*/ */
static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d) static UInt32 * MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
{ {
UInt32 h2, h3, /* h4, */ c2, c3 /* , c4 */; UInt32 h2, h3, /* h4, */ c2, c3 /* , c4 */;
UInt32 *hash = p->hash; UInt32 *hash = p->hash;
@ -1179,9 +1190,8 @@ static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
(hash + kFix3HashSize)[h3] = m; (hash + kFix3HashSize)[h3] = m;
// (hash + kFix4HashSize)[h4] = m; // (hash + kFix4HashSize)[h4] = m;
#define _USE_H2 // #define BT5_USE_H2
// #ifdef BT5_USE_H2
#ifdef _USE_H2
if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0]) if (c2 >= matchMinPos && cur[(ptrdiff_t)c2 - (ptrdiff_t)m] == cur[0])
{ {
d[1] = m - c2 - 1; d[1] = m - c2 - 1;
@ -1198,7 +1208,7 @@ static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
d[0] = 3; d[0] = 3;
d += 2; d += 2;
#ifdef _USE_H4 #ifdef BT5_USE_H4
if (c4 >= matchMinPos) if (c4 >= matchMinPos)
if ( if (
cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] && cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] &&
@ -1214,7 +1224,7 @@ static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
d[0] = 2; d[0] = 2;
d += 2; d += 2;
} }
#endif // #endif
if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0]) if (c3 >= matchMinPos && cur[(ptrdiff_t)c3 - (ptrdiff_t)m] == cur[0])
{ {
@ -1228,7 +1238,7 @@ static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
d += 2; d += 2;
} }
#ifdef _USE_H4 #ifdef BT5_USE_H4
if (c4 >= matchMinPos) if (c4 >= matchMinPos)
if ( if (
cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] && cur[(ptrdiff_t)c4 - (ptrdiff_t)m] == cur[0] &&
@ -1244,8 +1254,9 @@ static UInt32 *MixMatches4(CMatchFinderMt *p, UInt32 matchMinPos, UInt32 *d)
} }
static UInt32* MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d) static UInt32 * MatchFinderMt2_GetMatches(void *_p, UInt32 *d)
{ {
CMatchFinderMt *p = (CMatchFinderMt *)_p;
const UInt32 *bt = p->btBufPos; const UInt32 *bt = p->btBufPos;
const UInt32 len = *bt++; const UInt32 len = *bt++;
const UInt32 *btLim = bt + len; const UInt32 *btLim = bt + len;
@ -1268,8 +1279,9 @@ static UInt32* MatchFinderMt2_GetMatches(CMatchFinderMt *p, UInt32 *d)
static UInt32* MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d) static UInt32 * MatchFinderMt_GetMatches(void *_p, UInt32 *d)
{ {
CMatchFinderMt *p = (CMatchFinderMt *)_p;
const UInt32 *bt = p->btBufPos; const UInt32 *bt = p->btBufPos;
UInt32 len = *bt++; UInt32 len = *bt++;
const UInt32 avail = p->btNumAvailBytes - 1; const UInt32 avail = p->btNumAvailBytes - 1;
@ -1316,14 +1328,16 @@ static UInt32* MatchFinderMt_GetMatches(CMatchFinderMt *p, UInt32 *d)
#define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash; #define SKIP_HEADER_MT(n) SKIP_HEADER2_MT if (p->btNumAvailBytes-- >= (n)) { const Byte *cur = p->pointerToCurPos; UInt32 *hash = p->hash;
#define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0); #define SKIP_FOOTER_MT } INCREASE_LZ_POS p->btBufPos += (size_t)*p->btBufPos + 1; } while (--num != 0);
static void MatchFinderMt0_Skip(CMatchFinderMt *p, UInt32 num) static void MatchFinderMt0_Skip(void *_p, UInt32 num)
{ {
CMatchFinderMt *p = (CMatchFinderMt *)_p;
SKIP_HEADER2_MT { p->btNumAvailBytes--; SKIP_HEADER2_MT { p->btNumAvailBytes--;
SKIP_FOOTER_MT SKIP_FOOTER_MT
} }
static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num) static void MatchFinderMt2_Skip(void *_p, UInt32 num)
{ {
CMatchFinderMt *p = (CMatchFinderMt *)_p;
SKIP_HEADER_MT(2) SKIP_HEADER_MT(2)
UInt32 h2; UInt32 h2;
MT_HASH2_CALC MT_HASH2_CALC
@ -1331,8 +1345,9 @@ static void MatchFinderMt2_Skip(CMatchFinderMt *p, UInt32 num)
SKIP_FOOTER_MT SKIP_FOOTER_MT
} }
static void MatchFinderMt3_Skip(CMatchFinderMt *p, UInt32 num) static void MatchFinderMt3_Skip(void *_p, UInt32 num)
{ {
CMatchFinderMt *p = (CMatchFinderMt *)_p;
SKIP_HEADER_MT(3) SKIP_HEADER_MT(3)
UInt32 h2, h3; UInt32 h2, h3;
MT_HASH3_CALC MT_HASH3_CALC
@ -1362,39 +1377,46 @@ static void MatchFinderMt4_Skip(CMatchFinderMt *p, UInt32 num)
void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable) void MatchFinderMt_CreateVTable(CMatchFinderMt *p, IMatchFinder2 *vTable)
{ {
vTable->Init = (Mf_Init_Func)MatchFinderMt_Init; vTable->Init = MatchFinderMt_Init;
vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinderMt_GetNumAvailableBytes; vTable->GetNumAvailableBytes = MatchFinderMt_GetNumAvailableBytes;
vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinderMt_GetPointerToCurrentPos; vTable->GetPointerToCurrentPos = MatchFinderMt_GetPointerToCurrentPos;
vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches; vTable->GetMatches = MatchFinderMt_GetMatches;
switch (MF(p)->numHashBytes) switch (MF(p)->numHashBytes)
{ {
case 2: case 2:
p->GetHeadsFunc = GetHeads2; p->GetHeadsFunc = GetHeads2;
p->MixMatchesFunc = (Mf_Mix_Matches)NULL; p->MixMatchesFunc = NULL;
vTable->Skip = (Mf_Skip_Func)MatchFinderMt0_Skip; vTable->Skip = MatchFinderMt0_Skip;
vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt2_GetMatches; vTable->GetMatches = MatchFinderMt2_GetMatches;
break; break;
case 3: case 3:
p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3; p->GetHeadsFunc = MF(p)->bigHash ? GetHeads3b : GetHeads3;
p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches2; p->MixMatchesFunc = MixMatches2;
vTable->Skip = (Mf_Skip_Func)MatchFinderMt2_Skip; vTable->Skip = MatchFinderMt2_Skip;
break; break;
case 4: case 4:
p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4; p->GetHeadsFunc = MF(p)->bigHash ? GetHeads4b : GetHeads4;
// it's a fast inline version of GetMatches() // it's a fast inline version of GetMatches()
// vTable->GetMatches = (Mf_GetMatches_Func)MatchFinderMt_GetMatches_Bt4; // vTable->GetMatches = MatchFinderMt_GetMatches_Bt4;
p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches3; p->MixMatchesFunc = MixMatches3;
vTable->Skip = (Mf_Skip_Func)MatchFinderMt3_Skip; vTable->Skip = MatchFinderMt3_Skip;
break; break;
default: default:
p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5; p->GetHeadsFunc = MF(p)->bigHash ? GetHeads5b : GetHeads5;
p->MixMatchesFunc = (Mf_Mix_Matches)MixMatches4; p->MixMatchesFunc = MixMatches4;
vTable->Skip = vTable->Skip =
(Mf_Skip_Func)MatchFinderMt3_Skip; MatchFinderMt3_Skip;
// (Mf_Skip_Func)MatchFinderMt4_Skip; // MatchFinderMt4_Skip;
break; break;
} }
} }
#undef RINOK_THREAD
#undef PRF
#undef MF
#undef GetUi24hi_from32
#undef LOCK_BUFFER
#undef UNLOCK_BUFFER
View file
@ -1,19 +1,21 @@
/* LzFindMt.h -- multithreaded Match finder for LZ algorithms /* LzFindMt.h -- multithreaded Match finder for LZ algorithms
2021-07-12 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#ifndef __LZ_FIND_MT_H #ifndef ZIP7_INC_LZ_FIND_MT_H
#define __LZ_FIND_MT_H #define ZIP7_INC_LZ_FIND_MT_H
#include "LzFind.h" #include "LzFind.h"
#include "Threads.h" #include "Threads.h"
EXTERN_C_BEGIN EXTERN_C_BEGIN
typedef struct _CMtSync typedef struct
{ {
UInt32 numProcessedBlocks; UInt32 numProcessedBlocks;
CThread thread; Int32 affinityGroup;
UInt64 affinityInGroup;
UInt64 affinity; UInt64 affinity;
CThread thread;
BoolInt wasCreated; BoolInt wasCreated;
BoolInt needStart; BoolInt needStart;
@ -31,7 +33,10 @@ typedef struct _CMtSync
// UInt32 numBlocks_Sent; // UInt32 numBlocks_Sent;
} CMtSync; } CMtSync;
typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distances);
struct CMatchFinderMt_;
typedef UInt32 * (*Mf_Mix_Matches)(struct CMatchFinderMt_ *p, UInt32 matchMinPos, UInt32 *distances);
/* kMtCacheLineDummy must be >= size_of_CPU_cache_line */ /* kMtCacheLineDummy must be >= size_of_CPU_cache_line */
#define kMtCacheLineDummy 128 #define kMtCacheLineDummy 128
@ -39,7 +44,7 @@ typedef UInt32 * (*Mf_Mix_Matches)(void *p, UInt32 matchMinPos, UInt32 *distance
typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos, typedef void (*Mf_GetHeads)(const Byte *buffer, UInt32 pos,
UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc); UInt32 *hash, UInt32 hashMask, UInt32 *heads, UInt32 numHeads, const UInt32 *crc);
typedef struct _CMatchFinderMt typedef struct CMatchFinderMt_
{ {
/* LZ */ /* LZ */
const Byte *pointerToCurPos; const Byte *pointerToCurPos;
View file
@ -1,5 +1,5 @@
/* LzFindOpt.c -- multithreaded Match finder for LZ algorithms /* LzFindOpt.c -- multithreaded Match finder for LZ algorithms
2021-07-13 : Igor Pavlov : Public domain */ 2023-04-02 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -41,8 +41,8 @@ UInt64 g_NumIters_Bytes;
// #define CYC_TO_POS_OFFSET 1 // for debug // #define CYC_TO_POS_OFFSET 1 // for debug
/* /*
MY_NO_INLINE Z7_NO_INLINE
UInt32 * MY_FAST_CALL GetMatchesSpecN_1(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, UInt32 * Z7_FASTCALL GetMatchesSpecN_1(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 *posRes) UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 *posRes)
{ {
do do
@ -214,13 +214,13 @@ else
to eliminate "movsx" BUG in old MSVC x64 compiler. to eliminate "movsx" BUG in old MSVC x64 compiler.
*/ */
UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,
size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,
UInt32 *posRes); UInt32 *posRes);
MY_NO_INLINE Z7_NO_INLINE
UInt32 * MY_FAST_CALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son, UInt32 * Z7_FASTCALL GetMatchesSpecN_2(const Byte *lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 _cutValue, UInt32 *d, size_t _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,
size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,
UInt32 *posRes) UInt32 *posRes)
@ -404,7 +404,7 @@ else
/* /*
typedef UInt32 uint32plus; // size_t typedef UInt32 uint32plus; // size_t
UInt32 * MY_FAST_CALL GetMatchesSpecN_3(uint32plus lenLimit, size_t pos, const Byte *cur, CLzRef *son, UInt32 * Z7_FASTCALL GetMatchesSpecN_3(uint32plus lenLimit, size_t pos, const Byte *cur, CLzRef *son,
UInt32 _cutValue, UInt32 *d, uint32plus _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size, UInt32 _cutValue, UInt32 *d, uint32plus _maxLen, const UInt32 *hash, const UInt32 *limit, const UInt32 *size,
size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, size_t _cyclicBufferPos, UInt32 _cyclicBufferSize,
UInt32 *posRes) UInt32 *posRes)
View file
@ -1,8 +1,8 @@
/* LzHash.h -- HASH functions for LZ algorithms /* LzHash.h -- HASH constants for LZ algorithms
2019-10-30 : Igor Pavlov : Public domain */ 2023-03-05 : Igor Pavlov : Public domain */
#ifndef __LZ_HASH_H #ifndef ZIP7_INC_LZ_HASH_H
#define __LZ_HASH_H #define ZIP7_INC_LZ_HASH_H
/* /*
(kHash2Size >= (1 << 8)) : Required (kHash2Size >= (1 << 8)) : Required

View file
/* Lzma2Dec.c -- LZMA2 Decoder /* Lzma2Dec.c -- LZMA2 Decoder
2021-02-09 : Igor Pavlov : Public domain */ 2024-03-01 : Igor Pavlov : Public domain */
/* #define SHOW_DEBUG_INFO */ /* #define SHOW_DEBUG_INFO */
@ -71,14 +71,14 @@ static SRes Lzma2Dec_GetOldProps(Byte prop, Byte *props)
SRes Lzma2Dec_AllocateProbs(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc) SRes Lzma2Dec_AllocateProbs(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc)
{ {
Byte props[LZMA_PROPS_SIZE]; Byte props[LZMA_PROPS_SIZE];
RINOK(Lzma2Dec_GetOldProps(prop, props)); RINOK(Lzma2Dec_GetOldProps(prop, props))
return LzmaDec_AllocateProbs(&p->decoder, props, LZMA_PROPS_SIZE, alloc); return LzmaDec_AllocateProbs(&p->decoder, props, LZMA_PROPS_SIZE, alloc);
} }
SRes Lzma2Dec_Allocate(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc) SRes Lzma2Dec_Allocate(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc)
{ {
Byte props[LZMA_PROPS_SIZE]; Byte props[LZMA_PROPS_SIZE];
RINOK(Lzma2Dec_GetOldProps(prop, props)); RINOK(Lzma2Dec_GetOldProps(prop, props))
return LzmaDec_Allocate(&p->decoder, props, LZMA_PROPS_SIZE, alloc); return LzmaDec_Allocate(&p->decoder, props, LZMA_PROPS_SIZE, alloc);
} }
@ -157,8 +157,10 @@ static unsigned Lzma2Dec_UpdateState(CLzma2Dec *p, Byte b)
p->decoder.prop.lp = (Byte)lp; p->decoder.prop.lp = (Byte)lp;
return LZMA2_STATE_DATA; return LZMA2_STATE_DATA;
} }
default:
return LZMA2_STATE_ERROR;
} }
return LZMA2_STATE_ERROR;
} }
static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size) static void LzmaDec_UpdateWithUncompressed(CLzmaDec *p, const Byte *src, SizeT size)
@ -474,8 +476,8 @@ SRes Lzma2Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen,
SizeT outSize = *destLen, inSize = *srcLen; SizeT outSize = *destLen, inSize = *srcLen;
*destLen = *srcLen = 0; *destLen = *srcLen = 0;
*status = LZMA_STATUS_NOT_SPECIFIED; *status = LZMA_STATUS_NOT_SPECIFIED;
Lzma2Dec_Construct(&p); Lzma2Dec_CONSTRUCT(&p)
RINOK(Lzma2Dec_AllocateProbs(&p, prop, alloc)); RINOK(Lzma2Dec_AllocateProbs(&p, prop, alloc))
p.decoder.dic = dest; p.decoder.dic = dest;
p.decoder.dicBufSize = outSize; p.decoder.dicBufSize = outSize;
Lzma2Dec_Init(&p); Lzma2Dec_Init(&p);
@ -487,3 +489,5 @@ SRes Lzma2Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen,
Lzma2Dec_FreeProbs(&p, alloc); Lzma2Dec_FreeProbs(&p, alloc);
return res; return res;
} }
#undef PRF

View file
/* Lzma2Dec.h -- LZMA2 Decoder /* Lzma2Dec.h -- LZMA2 Decoder
2018-02-19 : Igor Pavlov : Public domain */ 2023-03-03 : Igor Pavlov : Public domain */
#ifndef __LZMA2_DEC_H #ifndef ZIP7_INC_LZMA2_DEC_H
#define __LZMA2_DEC_H #define ZIP7_INC_LZMA2_DEC_H
#include "LzmaDec.h" #include "LzmaDec.h"
@ -22,9 +22,10 @@ typedef struct
CLzmaDec decoder; CLzmaDec decoder;
} CLzma2Dec; } CLzma2Dec;
#define Lzma2Dec_Construct(p) LzmaDec_Construct(&(p)->decoder) #define Lzma2Dec_CONSTRUCT(p) LzmaDec_CONSTRUCT(&(p)->decoder)
#define Lzma2Dec_FreeProbs(p, alloc) LzmaDec_FreeProbs(&(p)->decoder, alloc) #define Lzma2Dec_Construct(p) Lzma2Dec_CONSTRUCT(p)
#define Lzma2Dec_Free(p, alloc) LzmaDec_Free(&(p)->decoder, alloc) #define Lzma2Dec_FreeProbs(p, alloc) LzmaDec_FreeProbs(&(p)->decoder, alloc)
#define Lzma2Dec_Free(p, alloc) LzmaDec_Free(&(p)->decoder, alloc)
SRes Lzma2Dec_AllocateProbs(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc); SRes Lzma2Dec_AllocateProbs(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc);
SRes Lzma2Dec_Allocate(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc); SRes Lzma2Dec_Allocate(CLzma2Dec *p, Byte prop, ISzAllocPtr alloc);
@ -90,7 +91,7 @@ Lzma2Dec_GetUnpackExtra() returns the value that shows
at current input position. at current input position.
*/ */
#define Lzma2Dec_GetUnpackExtra(p) ((p)->isExtraMode ? (p)->unpackSize : 0); #define Lzma2Dec_GetUnpackExtra(p) ((p)->isExtraMode ? (p)->unpackSize : 0)
/* ---------- One Call Interface ---------- */ /* ---------- One Call Interface ---------- */

View file
/* Lzma2DecMt.c -- LZMA2 Decoder Multi-thread /* Lzma2DecMt.c -- LZMA2 Decoder Multi-thread
2021-04-01 : Igor Pavlov : Public domain */ 2023-04-13 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
// #define SHOW_DEBUG_INFO // #define SHOW_DEBUG_INFO
// #define Z7_ST
// #define _7ZIP_ST
#ifdef SHOW_DEBUG_INFO #ifdef SHOW_DEBUG_INFO
#include <stdio.h> #include <stdio.h>
#endif #endif
#ifndef _7ZIP_ST
#ifdef SHOW_DEBUG_INFO
#define PRF(x) x
#else
#define PRF(x)
#endif
#define PRF_STR(s) PRF(printf("\n" s "\n"))
#define PRF_STR_INT_2(s, d1, d2) PRF(printf("\n" s " %d %d\n", (unsigned)d1, (unsigned)d2))
#endif
#include "Alloc.h" #include "Alloc.h"
#include "Lzma2Dec.h" #include "Lzma2Dec.h"
#include "Lzma2DecMt.h" #include "Lzma2DecMt.h"
#ifndef _7ZIP_ST #ifndef Z7_ST
#include "MtDec.h" #include "MtDec.h"
#define LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT (1 << 28) #define LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT (1 << 28)
#endif #endif
#ifndef Z7_ST
#ifdef SHOW_DEBUG_INFO
#define PRF(x) x
#else
#define PRF(x)
#endif
#define PRF_STR(s) PRF(printf("\n" s "\n");)
#define PRF_STR_INT_2(s, d1, d2) PRF(printf("\n" s " %d %d\n", (unsigned)d1, (unsigned)d2);)
#endif
void Lzma2DecMtProps_Init(CLzma2DecMtProps *p) void Lzma2DecMtProps_Init(CLzma2DecMtProps *p)
{ {
p->inBufSize_ST = 1 << 20; p->inBufSize_ST = 1 << 20;
p->outStep_ST = 1 << 20; p->outStep_ST = 1 << 20;
#ifndef _7ZIP_ST #ifndef Z7_ST
p->numThreads = 1; p->numThreads = 1;
p->inBufSize_MT = 1 << 18; p->inBufSize_MT = 1 << 18;
p->outBlockMax = LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT; p->outBlockMax = LZMA2DECMT_OUT_BLOCK_MAX_DEFAULT;
@ -48,7 +48,7 @@ void Lzma2DecMtProps_Init(CLzma2DecMtProps *p)
#ifndef _7ZIP_ST #ifndef Z7_ST
/* ---------- CLzma2DecMtThread ---------- */ /* ---------- CLzma2DecMtThread ---------- */
@ -81,7 +81,7 @@ typedef struct
/* ---------- CLzma2DecMt ---------- */ /* ---------- CLzma2DecMt ---------- */
typedef struct struct CLzma2DecMt
{ {
// ISzAllocPtr alloc; // ISzAllocPtr alloc;
ISzAllocPtr allocMid; ISzAllocPtr allocMid;
@ -90,9 +90,9 @@ typedef struct
CLzma2DecMtProps props; CLzma2DecMtProps props;
Byte prop; Byte prop;
ISeqInStream *inStream; ISeqInStreamPtr inStream;
ISeqOutStream *outStream; ISeqOutStreamPtr outStream;
ICompressProgress *progress; ICompressProgressPtr progress;
BoolInt finishMode; BoolInt finishMode;
BoolInt outSize_Defined; BoolInt outSize_Defined;
@ -111,14 +111,13 @@ typedef struct
size_t inPos; size_t inPos;
size_t inLim; size_t inLim;
#ifndef _7ZIP_ST #ifndef Z7_ST
UInt64 outProcessed_Parse; UInt64 outProcessed_Parse;
BoolInt mtc_WasConstructed; BoolInt mtc_WasConstructed;
CMtDec mtc; CMtDec mtc;
CLzma2DecMtThread coders[MTDEC__THREADS_MAX]; CLzma2DecMtThread coders[MTDEC_THREADS_MAX];
#endif #endif
};
} CLzma2DecMt;
@ -142,11 +141,11 @@ CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid)
// Lzma2DecMtProps_Init(&p->props); // Lzma2DecMtProps_Init(&p->props);
#ifndef _7ZIP_ST #ifndef Z7_ST
p->mtc_WasConstructed = False; p->mtc_WasConstructed = False;
{ {
unsigned i; unsigned i;
for (i = 0; i < MTDEC__THREADS_MAX; i++) for (i = 0; i < MTDEC_THREADS_MAX; i++)
{ {
CLzma2DecMtThread *t = &p->coders[i]; CLzma2DecMtThread *t = &p->coders[i];
t->dec_created = False; t->dec_created = False;
@ -156,16 +155,16 @@ CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid)
} }
#endif #endif
return p; return (CLzma2DecMtHandle)(void *)p;
} }
#ifndef _7ZIP_ST #ifndef Z7_ST
static void Lzma2DecMt_FreeOutBufs(CLzma2DecMt *p) static void Lzma2DecMt_FreeOutBufs(CLzma2DecMt *p)
{ {
unsigned i; unsigned i;
for (i = 0; i < MTDEC__THREADS_MAX; i++) for (i = 0; i < MTDEC_THREADS_MAX; i++)
{ {
CLzma2DecMtThread *t = &p->coders[i]; CLzma2DecMtThread *t = &p->coders[i];
if (t->outBuf) if (t->outBuf)
@ -196,13 +195,15 @@ static void Lzma2DecMt_FreeSt(CLzma2DecMt *p)
} }
void Lzma2DecMt_Destroy(CLzma2DecMtHandle pp) // #define GET_CLzma2DecMt_p CLzma2DecMt *p = (CLzma2DecMt *)(void *)pp;
void Lzma2DecMt_Destroy(CLzma2DecMtHandle p)
{ {
CLzma2DecMt *p = (CLzma2DecMt *)pp; // GET_CLzma2DecMt_p
Lzma2DecMt_FreeSt(p); Lzma2DecMt_FreeSt(p);
#ifndef _7ZIP_ST #ifndef Z7_ST
if (p->mtc_WasConstructed) if (p->mtc_WasConstructed)
{ {
@ -211,7 +212,7 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle pp)
} }
{ {
unsigned i; unsigned i;
for (i = 0; i < MTDEC__THREADS_MAX; i++) for (i = 0; i < MTDEC_THREADS_MAX; i++)
{ {
CLzma2DecMtThread *t = &p->coders[i]; CLzma2DecMtThread *t = &p->coders[i];
if (t->dec_created) if (t->dec_created)
@ -226,19 +227,19 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle pp)
#endif #endif
ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, pp); ISzAlloc_Free(p->alignOffsetAlloc.baseAlloc, p);
} }
#ifndef _7ZIP_ST #ifndef Z7_ST
static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCallbackInfo *cc) static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCallbackInfo *cc)
{ {
CLzma2DecMt *me = (CLzma2DecMt *)obj; CLzma2DecMt *me = (CLzma2DecMt *)obj;
CLzma2DecMtThread *t = &me->coders[coderIndex]; CLzma2DecMtThread *t = &me->coders[coderIndex];
PRF_STR_INT_2("Parse", coderIndex, cc->srcSize); PRF_STR_INT_2("Parse", coderIndex, cc->srcSize)
cc->state = MTDEC_PARSE_CONTINUE; cc->state = MTDEC_PARSE_CONTINUE;
@ -246,7 +247,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa
{ {
if (!t->dec_created) if (!t->dec_created)
{ {
Lzma2Dec_Construct(&t->dec); Lzma2Dec_CONSTRUCT(&t->dec)
t->dec_created = True; t->dec_created = True;
AlignOffsetAlloc_CreateVTable(&t->alloc); AlignOffsetAlloc_CreateVTable(&t->alloc);
{ {
@ -297,7 +298,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa
// that must be finished at position <= outBlockMax. // that must be finished at position <= outBlockMax.
{ {
const SizeT srcOrig = cc->srcSize; const size_t srcOrig = cc->srcSize;
SizeT srcSize_Point = 0; SizeT srcSize_Point = 0;
SizeT dicPos_Point = 0; SizeT dicPos_Point = 0;
@ -306,10 +307,10 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa
for (;;) for (;;)
{ {
SizeT srcCur = srcOrig - cc->srcSize; SizeT srcCur = (SizeT)(srcOrig - cc->srcSize);
status = Lzma2Dec_Parse(&t->dec, status = Lzma2Dec_Parse(&t->dec,
limit - t->dec.decoder.dicPos, (SizeT)limit - t->dec.decoder.dicPos,
cc->src + cc->srcSize, &srcCur, cc->src + cc->srcSize, &srcCur,
checkFinishBlock); checkFinishBlock);
@ -333,7 +334,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa
if (t->dec.decoder.dicPos >= (1 << 14)) if (t->dec.decoder.dicPos >= (1 << 14))
break; break;
dicPos_Point = t->dec.decoder.dicPos; dicPos_Point = t->dec.decoder.dicPos;
srcSize_Point = cc->srcSize; srcSize_Point = (SizeT)cc->srcSize;
continue; continue;
} }
@ -391,7 +392,7 @@ static void Lzma2DecMt_MtCallback_Parse(void *obj, unsigned coderIndex, CMtDecCa
if (unpackRem != 0) if (unpackRem != 0)
{ {
/* we also reserve space for max possible number of output bytes of current LZMA chunk */ /* we also reserve space for max possible number of output bytes of current LZMA chunk */
SizeT rem = limit - dicPos; size_t rem = limit - dicPos;
if (rem > unpackRem) if (rem > unpackRem)
rem = unpackRem; rem = unpackRem;
dicPos += rem; dicPos += rem;
@ -444,7 +445,7 @@ static SRes Lzma2DecMt_MtCallback_PreCode(void *pp, unsigned coderIndex)
} }
t->dec.decoder.dic = dest; t->dec.decoder.dic = dest;
t->dec.decoder.dicBufSize = t->outPreSize; t->dec.decoder.dicBufSize = (SizeT)t->outPreSize;
t->needInit = True; t->needInit = True;
@ -462,7 +463,7 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex,
UNUSED_VAR(srcFinished) UNUSED_VAR(srcFinished)
PRF_STR_INT_2("Code", coderIndex, srcSize); PRF_STR_INT_2("Code", coderIndex, srcSize)
*inCodePos = t->inCodeSize; *inCodePos = t->inCodeSize;
*outCodePos = 0; *outCodePos = 0;
@ -476,13 +477,13 @@ static SRes Lzma2DecMt_MtCallback_Code(void *pp, unsigned coderIndex,
{ {
ELzmaStatus status; ELzmaStatus status;
size_t srcProcessed = srcSize; SizeT srcProcessed = (SizeT)srcSize;
BoolInt blockWasFinished = BoolInt blockWasFinished =
((int)t->parseStatus == LZMA_STATUS_FINISHED_WITH_MARK ((int)t->parseStatus == LZMA_STATUS_FINISHED_WITH_MARK
|| t->parseStatus == LZMA2_PARSE_STATUS_NEW_BLOCK); || t->parseStatus == LZMA2_PARSE_STATUS_NEW_BLOCK);
SRes res = Lzma2Dec_DecodeToDic(&t->dec, SRes res = Lzma2Dec_DecodeToDic(&t->dec,
t->outPreSize, (SizeT)t->outPreSize,
src, &srcProcessed, src, &srcProcessed,
blockWasFinished ? LZMA_FINISH_END : LZMA_FINISH_ANY, blockWasFinished ? LZMA_FINISH_END : LZMA_FINISH_ANY,
&status); &status);
@ -540,7 +541,7 @@ static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex,
UNUSED_VAR(srcSize) UNUSED_VAR(srcSize)
UNUSED_VAR(isCross) UNUSED_VAR(isCross)
PRF_STR_INT_2("Write", coderIndex, srcSize); PRF_STR_INT_2("Write", coderIndex, srcSize)
*needContinue = False; *needContinue = False;
*canRecode = True; *canRecode = True;
@ -588,7 +589,7 @@ static SRes Lzma2DecMt_MtCallback_Write(void *pp, unsigned coderIndex,
*needContinue = needContinue2; *needContinue = needContinue2;
return SZ_OK; return SZ_OK;
} }
RINOK(MtProgress_ProgressAdd(&me->mtc.mtProgress, 0, 0)); RINOK(MtProgress_ProgressAdd(&me->mtc.mtProgress, 0, 0))
} }
} }
@ -611,11 +612,11 @@ static SRes Lzma2Dec_Prepare_ST(CLzma2DecMt *p)
{ {
if (!p->dec_created) if (!p->dec_created)
{ {
Lzma2Dec_Construct(&p->dec); Lzma2Dec_CONSTRUCT(&p->dec)
p->dec_created = True; p->dec_created = True;
} }
RINOK(Lzma2Dec_Allocate(&p->dec, p->prop, &p->alignOffsetAlloc.vt)); RINOK(Lzma2Dec_Allocate(&p->dec, p->prop, &p->alignOffsetAlloc.vt))
if (!p->inBuf || p->inBufSize != p->props.inBufSize_ST) if (!p->inBuf || p->inBufSize != p->props.inBufSize_ST)
{ {
@ -634,7 +635,7 @@ static SRes Lzma2Dec_Prepare_ST(CLzma2DecMt *p)
static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
#ifndef _7ZIP_ST #ifndef Z7_ST
, BoolInt tMode , BoolInt tMode
#endif #endif
) )
@ -646,7 +647,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
CLzma2Dec *dec; CLzma2Dec *dec;
#ifndef _7ZIP_ST #ifndef Z7_ST
if (tMode) if (tMode)
{ {
Lzma2DecMt_FreeOutBufs(p); Lzma2DecMt_FreeOutBufs(p);
@ -654,7 +655,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
} }
#endif #endif
RINOK(Lzma2Dec_Prepare_ST(p)); RINOK(Lzma2Dec_Prepare_ST(p))
dec = &p->dec; dec = &p->dec;
@ -681,7 +682,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
if (inPos == inLim) if (inPos == inLim)
{ {
#ifndef _7ZIP_ST #ifndef Z7_ST
if (tMode) if (tMode)
{ {
inData = MtDec_Read(&p->mtc, &inLim); inData = MtDec_Read(&p->mtc, &inLim);
@ -710,7 +711,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
{ {
SizeT next = dec->decoder.dicBufSize; SizeT next = dec->decoder.dicBufSize;
if (next - wrPos > p->props.outStep_ST) if (next - wrPos > p->props.outStep_ST)
next = wrPos + p->props.outStep_ST; next = wrPos + (SizeT)p->props.outStep_ST;
size = next - dicPos; size = next - dicPos;
} }
@ -726,7 +727,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
} }
} }
inProcessed = inLim - inPos; inProcessed = (SizeT)(inLim - inPos);
res = Lzma2Dec_DecodeToDic(dec, dicPos + size, inData + inPos, &inProcessed, finishMode, &status); res = Lzma2Dec_DecodeToDic(dec, dicPos + size, inData + inPos, &inProcessed, finishMode, &status);
@ -755,7 +756,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
dec->decoder.dicPos = 0; dec->decoder.dicPos = 0;
wrPos = dec->decoder.dicPos; wrPos = dec->decoder.dicPos;
RINOK(res2); RINOK(res2)
if (needStop) if (needStop)
{ {
@ -788,7 +789,7 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
UInt64 outDelta = p->outProcessed - outPrev; UInt64 outDelta = p->outProcessed - outPrev;
if (inDelta >= (1 << 22) || outDelta >= (1 << 22)) if (inDelta >= (1 << 22) || outDelta >= (1 << 22))
{ {
RINOK(ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed)); RINOK(ICompressProgress_Progress(p->progress, p->inProcessed, p->outProcessed))
inPrev = p->inProcessed; inPrev = p->inProcessed;
outPrev = p->outProcessed; outPrev = p->outProcessed;
} }
@ -798,20 +799,20 @@ static SRes Lzma2Dec_Decode_ST(CLzma2DecMt *p
SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp, SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
Byte prop, Byte prop,
const CLzma2DecMtProps *props, const CLzma2DecMtProps *props,
ISeqOutStream *outStream, const UInt64 *outDataSize, int finishMode, ISeqOutStreamPtr outStream, const UInt64 *outDataSize, int finishMode,
// Byte *outBuf, size_t *outBufSize, // Byte *outBuf, size_t *outBufSize,
ISeqInStream *inStream, ISeqInStreamPtr inStream,
// const Byte *inData, size_t inDataSize, // const Byte *inData, size_t inDataSize,
UInt64 *inProcessed, UInt64 *inProcessed,
// UInt64 *outProcessed, // UInt64 *outProcessed,
int *isMT, int *isMT,
ICompressProgress *progress) ICompressProgressPtr progress)
{ {
CLzma2DecMt *p = (CLzma2DecMt *)pp; // GET_CLzma2DecMt_p
#ifndef _7ZIP_ST #ifndef Z7_ST
BoolInt tMode; BoolInt tMode;
#endif #endif
@ -845,7 +846,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp,
*isMT = False; *isMT = False;
#ifndef _7ZIP_ST #ifndef Z7_ST
tMode = False; tMode = False;
@ -939,7 +940,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp,
p->readWasFinished = p->mtc.readWasFinished; p->readWasFinished = p->mtc.readWasFinished;
p->inProcessed = p->mtc.inProcessed; p->inProcessed = p->mtc.inProcessed;
PRF_STR("----- decoding ST -----"); PRF_STR("----- decoding ST -----")
} }
} }
@ -950,7 +951,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp,
{ {
SRes res = Lzma2Dec_Decode_ST(p SRes res = Lzma2Dec_Decode_ST(p
#ifndef _7ZIP_ST #ifndef Z7_ST
, tMode , tMode
#endif #endif
); );
@ -967,7 +968,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp,
res = p->readRes; res = p->readRes;
/* /*
#ifndef _7ZIP_ST #ifndef Z7_ST
if (res == SZ_OK && tMode && p->mtc.parseRes != SZ_OK) if (res == SZ_OK && tMode && p->mtc.parseRes != SZ_OK)
res = p->mtc.parseRes; res = p->mtc.parseRes;
#endif #endif
@ -980,13 +981,13 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle pp,
/* ---------- Read from CLzma2DecMtHandle Interface ---------- */ /* ---------- Read from CLzma2DecMtHandle Interface ---------- */
SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp, SRes Lzma2DecMt_Init(CLzma2DecMtHandle p,
Byte prop, Byte prop,
const CLzma2DecMtProps *props, const CLzma2DecMtProps *props,
const UInt64 *outDataSize, int finishMode, const UInt64 *outDataSize, int finishMode,
ISeqInStream *inStream) ISeqInStreamPtr inStream)
{ {
CLzma2DecMt *p = (CLzma2DecMt *)pp; // GET_CLzma2DecMt_p
if (prop > 40) if (prop > 40)
return SZ_ERROR_UNSUPPORTED; return SZ_ERROR_UNSUPPORTED;
@ -1015,11 +1016,11 @@ SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp,
} }
SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp, SRes Lzma2DecMt_Read(CLzma2DecMtHandle p,
Byte *data, size_t *outSize, Byte *data, size_t *outSize,
UInt64 *inStreamProcessed) UInt64 *inStreamProcessed)
{ {
CLzma2DecMt *p = (CLzma2DecMt *)pp; // GET_CLzma2DecMt_p
ELzmaFinishMode finishMode; ELzmaFinishMode finishMode;
SRes readRes; SRes readRes;
size_t size = *outSize; size_t size = *outSize;
@ -1055,8 +1056,8 @@ SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp,
readRes = ISeqInStream_Read(p->inStream, p->inBuf, &p->inLim); readRes = ISeqInStream_Read(p->inStream, p->inBuf, &p->inLim);
} }
inCur = p->inLim - p->inPos; inCur = (SizeT)(p->inLim - p->inPos);
outCur = size; outCur = (SizeT)size;
res = Lzma2Dec_DecodeToBuf(&p->dec, data, &outCur, res = Lzma2Dec_DecodeToBuf(&p->dec, data, &outCur,
p->inBuf + p->inPos, &inCur, finishMode, &status); p->inBuf + p->inPos, &inCur, finishMode, &status);
@ -1088,3 +1089,7 @@ SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp,
return readRes; return readRes;
} }
} }
#undef PRF
#undef PRF_STR
#undef PRF_STR_INT_2
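
A convention shift that recurs through this whole diff: status-check macros lose their trailing semicolons (RINOK(...) instead of RINOK(...);), and file-local tracing macros (PRF, PRF_STR, PRF_STR_INT_2) are now #undef'd at the end of the translation unit so they cannot collide with other files' definitions. The semicolon change is safe when the macro expands to a complete braced statement. A minimal sketch of the idea, assuming the Z7-era RINOK keeps roughly its historical 7zTypes.h shape:

    #include <stdio.h>

    /* Assumed shape of RINOK (the real definition lives in 7zTypes.h).
       The expansion is a complete braced statement, so call sites need
       no trailing semicolon. */
    #define RINOK(x) { const int res_ = (x); if (res_ != 0) return res_; }

    static int Step(int v) { return v; }

    static int Run(void)
    {
      RINOK(Step(0))   /* no semicolon, matching the new call sites above */
      RINOK(Step(7))   /* a nonzero result returns early here */
      return 0;
    }

    int main(void) { printf("Run -> %d\n", Run()); return 0; }

Dropping the semicolon also removes the stray empty statement that a braced macro plus ";" leaves behind, which breaks if/else chaining around such calls and trips pedantic compiler warnings.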

View file

@ -1,8 +1,8 @@
/* Lzma2DecMt.h -- LZMA2 Decoder Multi-thread /* Lzma2DecMt.h -- LZMA2 Decoder Multi-thread
2018-02-17 : Igor Pavlov : Public domain */ 2023-04-13 : Igor Pavlov : Public domain */
#ifndef __LZMA2_DEC_MT_H #ifndef ZIP7_INC_LZMA2_DEC_MT_H
#define __LZMA2_DEC_MT_H #define ZIP7_INC_LZMA2_DEC_MT_H
#include "7zTypes.h" #include "7zTypes.h"
@ -13,7 +13,7 @@ typedef struct
size_t inBufSize_ST; size_t inBufSize_ST;
size_t outStep_ST; size_t outStep_ST;
#ifndef _7ZIP_ST #ifndef Z7_ST
unsigned numThreads; unsigned numThreads;
size_t inBufSize_MT; size_t inBufSize_MT;
size_t outBlockMax; size_t outBlockMax;
@ -38,7 +38,9 @@ SRes:
SZ_ERROR_THREAD - error in multithreading functions (only for Mt version) SZ_ERROR_THREAD - error in multithreading functions (only for Mt version)
*/ */
typedef void * CLzma2DecMtHandle; typedef struct CLzma2DecMt CLzma2DecMt;
typedef CLzma2DecMt * CLzma2DecMtHandle;
// Z7_DECLARE_HANDLE(CLzma2DecMtHandle)
CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid); CLzma2DecMtHandle Lzma2DecMt_Create(ISzAllocPtr alloc, ISzAllocPtr allocMid);
void Lzma2DecMt_Destroy(CLzma2DecMtHandle p); void Lzma2DecMt_Destroy(CLzma2DecMtHandle p);
@ -46,11 +48,11 @@ void Lzma2DecMt_Destroy(CLzma2DecMtHandle p);
SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p, SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
Byte prop, Byte prop,
const CLzma2DecMtProps *props, const CLzma2DecMtProps *props,
ISeqOutStream *outStream, ISeqOutStreamPtr outStream,
const UInt64 *outDataSize, // NULL means undefined const UInt64 *outDataSize, // NULL means undefined
int finishMode, // 0 - partial unpacking is allowed, 1 - if lzma2 stream must be finished int finishMode, // 0 - partial unpacking is allowed, 1 - if lzma2 stream must be finished
// Byte *outBuf, size_t *outBufSize, // Byte *outBuf, size_t *outBufSize,
ISeqInStream *inStream, ISeqInStreamPtr inStream,
// const Byte *inData, size_t inDataSize, // const Byte *inData, size_t inDataSize,
// out variables: // out variables:
@ -58,7 +60,7 @@ SRes Lzma2DecMt_Decode(CLzma2DecMtHandle p,
int *isMT, /* out: (*isMT == 0), if single thread decoding was used */ int *isMT, /* out: (*isMT == 0), if single thread decoding was used */
// UInt64 *outProcessed, // UInt64 *outProcessed,
ICompressProgress *progress); ICompressProgressPtr progress);
/* ---------- Read from CLzma2DecMtHandle Interface ---------- */ /* ---------- Read from CLzma2DecMtHandle Interface ---------- */
@ -67,7 +69,7 @@ SRes Lzma2DecMt_Init(CLzma2DecMtHandle pp,
Byte prop, Byte prop,
const CLzma2DecMtProps *props, const CLzma2DecMtProps *props,
const UInt64 *outDataSize, int finishMode, const UInt64 *outDataSize, int finishMode,
ISeqInStream *inStream); ISeqInStreamPtr inStream);
SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp, SRes Lzma2DecMt_Read(CLzma2DecMtHandle pp,
Byte *data, size_t *outSize, Byte *data, size_t *outSize,
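
The public handle also changes type in this header: a bare "typedef void * CLzma2DecMtHandle" becomes a pointer to a forward-declared struct. Callers still cannot see the object's fields, but the compiler can now reject a mismatched handle (for example an encoder handle passed to Lzma2DecMt_Destroy), which void * accepted silently. A sketch of the pattern, with a hypothetical CFoo standing in for CLzma2DecMt:

    #include <stdlib.h>

    /* Header side: the type is declared, its body is not exposed. */
    typedef struct CFoo CFoo;            /* hypothetical stand-in type */
    typedef CFoo *CFooHandle;
    CFooHandle Foo_Create(void);
    void Foo_Destroy(CFooHandle p);

    /* Implementation side: the body stays private to one .c file. */
    struct CFoo { int state; };
    CFooHandle Foo_Create(void) { return (CFooHandle)calloc(1, sizeof(CFoo)); }
    void Foo_Destroy(CFooHandle p) { free(p); }

The commented-out Z7_DECLARE_HANDLE line suggests a macro form of the same pattern was considered; the explicit two-typedef spelling is what shipped.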

View file

@ -1,18 +1,18 @@
/* Lzma2Enc.c -- LZMA2 Encoder /* Lzma2Enc.c -- LZMA2 Encoder
2021-02-09 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include <string.h> #include <string.h>
/* #define _7ZIP_ST */ /* #define Z7_ST */
#include "Lzma2Enc.h" #include "Lzma2Enc.h"
#ifndef _7ZIP_ST #ifndef Z7_ST
#include "MtCoder.h" #include "MtCoder.h"
#else #else
#define MTCODER__THREADS_MAX 1 #define MTCODER_THREADS_MAX 1
#endif #endif
#define LZMA2_CONTROL_LZMA (1 << 7) #define LZMA2_CONTROL_LZMA (1 << 7)
@ -40,7 +40,7 @@
typedef struct typedef struct
{ {
ISeqInStream vt; ISeqInStream vt;
ISeqInStream *realStream; ISeqInStreamPtr realStream;
UInt64 limit; UInt64 limit;
UInt64 processed; UInt64 processed;
int finished; int finished;
@ -53,15 +53,15 @@ static void LimitedSeqInStream_Init(CLimitedSeqInStream *p)
p->finished = 0; p->finished = 0;
} }
static SRes LimitedSeqInStream_Read(const ISeqInStream *pp, void *data, size_t *size) static SRes LimitedSeqInStream_Read(ISeqInStreamPtr pp, void *data, size_t *size)
{ {
CLimitedSeqInStream *p = CONTAINER_FROM_VTBL(pp, CLimitedSeqInStream, vt); Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CLimitedSeqInStream)
size_t size2 = *size; size_t size2 = *size;
SRes res = SZ_OK; SRes res = SZ_OK;
if (p->limit != (UInt64)(Int64)-1) if (p->limit != (UInt64)(Int64)-1)
{ {
UInt64 rem = p->limit - p->processed; const UInt64 rem = p->limit - p->processed;
if (size2 > rem) if (size2 > rem)
size2 = (size_t)rem; size2 = (size_t)rem;
} }
@ -95,8 +95,8 @@ static SRes Lzma2EncInt_InitStream(CLzma2EncInt *p, const CLzma2EncProps *props)
{ {
SizeT propsSize = LZMA_PROPS_SIZE; SizeT propsSize = LZMA_PROPS_SIZE;
Byte propsEncoded[LZMA_PROPS_SIZE]; Byte propsEncoded[LZMA_PROPS_SIZE];
RINOK(LzmaEnc_SetProps(p->enc, &props->lzmaProps)); RINOK(LzmaEnc_SetProps(p->enc, &props->lzmaProps))
RINOK(LzmaEnc_WriteProperties(p->enc, propsEncoded, &propsSize)); RINOK(LzmaEnc_WriteProperties(p->enc, propsEncoded, &propsSize))
p->propsByte = propsEncoded[0]; p->propsByte = propsEncoded[0];
p->propsAreSet = True; p->propsAreSet = True;
} }
@ -111,23 +111,23 @@ static void Lzma2EncInt_InitBlock(CLzma2EncInt *p)
} }
SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, ISeqInStream *inStream, UInt32 keepWindowSize, SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle p, ISeqInStreamPtr inStream, UInt32 keepWindowSize,
ISzAllocPtr alloc, ISzAllocPtr allocBig); ISzAllocPtr alloc, ISzAllocPtr allocBig);
SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const Byte *src, SizeT srcLen, SRes LzmaEnc_MemPrepare(CLzmaEncHandle p, const Byte *src, SizeT srcLen,
UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig); UInt32 keepWindowSize, ISzAllocPtr alloc, ISzAllocPtr allocBig);
SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, BoolInt reInit, SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle p, BoolInt reInit,
Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize); Byte *dest, size_t *destLen, UInt32 desiredPackSize, UInt32 *unpackSize);
const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle pp); const Byte *LzmaEnc_GetCurBuf(CLzmaEncHandle p);
void LzmaEnc_Finish(CLzmaEncHandle pp); void LzmaEnc_Finish(CLzmaEncHandle p);
void LzmaEnc_SaveState(CLzmaEncHandle pp); void LzmaEnc_SaveState(CLzmaEncHandle p);
void LzmaEnc_RestoreState(CLzmaEncHandle pp); void LzmaEnc_RestoreState(CLzmaEncHandle p);
/* /*
UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp); UInt32 LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle p);
*/ */
static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf, static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf,
size_t *packSizeRes, ISeqOutStream *outStream) size_t *packSizeRes, ISeqOutStreamPtr outStream)
{ {
size_t packSizeLimit = *packSizeRes; size_t packSizeLimit = *packSizeRes;
size_t packSize = packSizeLimit; size_t packSize = packSizeLimit;
@ -167,7 +167,7 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf,
while (unpackSize > 0) while (unpackSize > 0)
{ {
UInt32 u = (unpackSize < LZMA2_COPY_CHUNK_SIZE) ? unpackSize : LZMA2_COPY_CHUNK_SIZE; const UInt32 u = (unpackSize < LZMA2_COPY_CHUNK_SIZE) ? unpackSize : LZMA2_COPY_CHUNK_SIZE;
if (packSizeLimit - destPos < u + 3) if (packSizeLimit - destPos < u + 3)
return SZ_ERROR_OUTPUT_EOF; return SZ_ERROR_OUTPUT_EOF;
outBuf[destPos++] = (Byte)(p->srcPos == 0 ? LZMA2_CONTROL_COPY_RESET_DIC : LZMA2_CONTROL_COPY_NO_RESET); outBuf[destPos++] = (Byte)(p->srcPos == 0 ? LZMA2_CONTROL_COPY_RESET_DIC : LZMA2_CONTROL_COPY_NO_RESET);
@ -196,9 +196,9 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf,
{ {
size_t destPos = 0; size_t destPos = 0;
UInt32 u = unpackSize - 1; const UInt32 u = unpackSize - 1;
UInt32 pm = (UInt32)(packSize - 1); const UInt32 pm = (UInt32)(packSize - 1);
unsigned mode = (p->srcPos == 0) ? 3 : (p->needInitState ? (p->needInitProp ? 2 : 1) : 0); const unsigned mode = (p->srcPos == 0) ? 3 : (p->needInitState ? (p->needInitProp ? 2 : 1) : 0);
PRF(printf(" ")); PRF(printf(" "));
@ -231,10 +231,11 @@ static SRes Lzma2EncInt_EncodeSubblock(CLzma2EncInt *p, Byte *outBuf,
void Lzma2EncProps_Init(CLzma2EncProps *p) void Lzma2EncProps_Init(CLzma2EncProps *p)
{ {
LzmaEncProps_Init(&p->lzmaProps); LzmaEncProps_Init(&p->lzmaProps);
p->blockSize = LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO; p->blockSize = LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO;
p->numBlockThreads_Reduced = -1; p->numBlockThreads_Reduced = -1;
p->numBlockThreads_Max = -1; p->numBlockThreads_Max = -1;
p->numTotalThreads = -1; p->numTotalThreads = -1;
p->numThreadGroups = 0;
} }
void Lzma2EncProps_Normalize(CLzma2EncProps *p) void Lzma2EncProps_Normalize(CLzma2EncProps *p)
@ -251,8 +252,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p)
t2 = p->numBlockThreads_Max; t2 = p->numBlockThreads_Max;
t3 = p->numTotalThreads; t3 = p->numTotalThreads;
if (t2 > MTCODER__THREADS_MAX) if (t2 > MTCODER_THREADS_MAX)
t2 = MTCODER__THREADS_MAX; t2 = MTCODER_THREADS_MAX;
if (t3 <= 0) if (t3 <= 0)
{ {
@ -268,8 +269,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p)
t1 = 1; t1 = 1;
t2 = t3; t2 = t3;
} }
if (t2 > MTCODER__THREADS_MAX) if (t2 > MTCODER_THREADS_MAX)
t2 = MTCODER__THREADS_MAX; t2 = MTCODER_THREADS_MAX;
} }
else if (t1 <= 0) else if (t1 <= 0)
{ {
@ -286,8 +287,8 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p)
fileSize = p->lzmaProps.reduceSize; fileSize = p->lzmaProps.reduceSize;
if ( p->blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID if ( p->blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID
&& p->blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO && p->blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO
&& (p->blockSize < fileSize || fileSize == (UInt64)(Int64)-1)) && (p->blockSize < fileSize || fileSize == (UInt64)(Int64)-1))
p->lzmaProps.reduceSize = p->blockSize; p->lzmaProps.reduceSize = p->blockSize;
@ -297,19 +298,19 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p)
t1 = p->lzmaProps.numThreads; t1 = p->lzmaProps.numThreads;
if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID) if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID)
{ {
t2r = t2 = 1; t2r = t2 = 1;
t3 = t1; t3 = t1;
} }
else if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO && t2 <= 1) else if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO && t2 <= 1)
{ {
/* if there is no block multi-threading, we use SOLID block */ /* if there is no block multi-threading, we use SOLID block */
p->blockSize = LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID; p->blockSize = LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID;
} }
else else
{ {
if (p->blockSize == LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO) if (p->blockSize == LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO)
{ {
const UInt32 kMinSize = (UInt32)1 << 20; const UInt32 kMinSize = (UInt32)1 << 20;
const UInt32 kMaxSize = (UInt32)1 << 28; const UInt32 kMaxSize = (UInt32)1 << 28;
@ -344,7 +345,7 @@ void Lzma2EncProps_Normalize(CLzma2EncProps *p)
} }
static SRes Progress(ICompressProgress *p, UInt64 inSize, UInt64 outSize) static SRes Progress(ICompressProgressPtr p, UInt64 inSize, UInt64 outSize)
{ {
return (p && ICompressProgress_Progress(p, inSize, outSize) != SZ_OK) ? SZ_ERROR_PROGRESS : SZ_OK; return (p && ICompressProgress_Progress(p, inSize, outSize) != SZ_OK) ? SZ_ERROR_PROGRESS : SZ_OK;
} }
@ -352,7 +353,7 @@ static SRes Progress(ICompressProgress *p, UInt64 inSize, UInt64 outSize)
/* ---------- Lzma2 ---------- */ /* ---------- Lzma2 ---------- */
typedef struct struct CLzma2Enc
{ {
Byte propEncoded; Byte propEncoded;
CLzma2EncProps props; CLzma2EncProps props;
@ -363,23 +364,22 @@ typedef struct
ISzAllocPtr alloc; ISzAllocPtr alloc;
ISzAllocPtr allocBig; ISzAllocPtr allocBig;
CLzma2EncInt coders[MTCODER__THREADS_MAX]; CLzma2EncInt coders[MTCODER_THREADS_MAX];
#ifndef _7ZIP_ST #ifndef Z7_ST
ISeqOutStream *outStream; ISeqOutStreamPtr outStream;
Byte *outBuf; Byte *outBuf;
size_t outBuf_Rem; /* remainder in outBuf */ size_t outBuf_Rem; /* remainder in outBuf */
size_t outBufSize; /* size of allocated outBufs[i] */ size_t outBufSize; /* size of allocated outBufs[i] */
size_t outBufsDataSizes[MTCODER__BLOCKS_MAX]; size_t outBufsDataSizes[MTCODER_BLOCKS_MAX];
BoolInt mtCoder_WasConstructed; BoolInt mtCoder_WasConstructed;
CMtCoder mtCoder; CMtCoder mtCoder;
Byte *outBufs[MTCODER__BLOCKS_MAX]; Byte *outBufs[MTCODER_BLOCKS_MAX];
#endif #endif
};
} CLzma2Enc;
@ -396,30 +396,30 @@ CLzma2EncHandle Lzma2Enc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig)
p->allocBig = allocBig; p->allocBig = allocBig;
{ {
unsigned i; unsigned i;
for (i = 0; i < MTCODER__THREADS_MAX; i++) for (i = 0; i < MTCODER_THREADS_MAX; i++)
p->coders[i].enc = NULL; p->coders[i].enc = NULL;
} }
#ifndef _7ZIP_ST #ifndef Z7_ST
p->mtCoder_WasConstructed = False; p->mtCoder_WasConstructed = False;
{ {
unsigned i; unsigned i;
for (i = 0; i < MTCODER__BLOCKS_MAX; i++) for (i = 0; i < MTCODER_BLOCKS_MAX; i++)
p->outBufs[i] = NULL; p->outBufs[i] = NULL;
p->outBufSize = 0; p->outBufSize = 0;
} }
#endif #endif
return p; return (CLzma2EncHandle)p;
} }
#ifndef _7ZIP_ST #ifndef Z7_ST
static void Lzma2Enc_FreeOutBufs(CLzma2Enc *p) static void Lzma2Enc_FreeOutBufs(CLzma2Enc *p)
{ {
unsigned i; unsigned i;
for (i = 0; i < MTCODER__BLOCKS_MAX; i++) for (i = 0; i < MTCODER_BLOCKS_MAX; i++)
if (p->outBufs[i]) if (p->outBufs[i])
{ {
ISzAlloc_Free(p->alloc, p->outBufs[i]); ISzAlloc_Free(p->alloc, p->outBufs[i]);
@ -430,12 +430,13 @@ static void Lzma2Enc_FreeOutBufs(CLzma2Enc *p)
#endif #endif
// #define GET_CLzma2Enc_p CLzma2Enc *p = (CLzma2Enc *)(void *)p;
void Lzma2Enc_Destroy(CLzma2EncHandle pp) void Lzma2Enc_Destroy(CLzma2EncHandle p)
{ {
CLzma2Enc *p = (CLzma2Enc *)pp; // GET_CLzma2Enc_p
unsigned i; unsigned i;
for (i = 0; i < MTCODER__THREADS_MAX; i++) for (i = 0; i < MTCODER_THREADS_MAX; i++)
{ {
CLzma2EncInt *t = &p->coders[i]; CLzma2EncInt *t = &p->coders[i];
if (t->enc) if (t->enc)
@ -446,7 +447,7 @@ void Lzma2Enc_Destroy(CLzma2EncHandle pp)
} }
#ifndef _7ZIP_ST #ifndef Z7_ST
if (p->mtCoder_WasConstructed) if (p->mtCoder_WasConstructed)
{ {
MtCoder_Destruct(&p->mtCoder); MtCoder_Destruct(&p->mtCoder);
@ -458,13 +459,13 @@ void Lzma2Enc_Destroy(CLzma2EncHandle pp)
ISzAlloc_Free(p->alloc, p->tempBufLzma); ISzAlloc_Free(p->alloc, p->tempBufLzma);
p->tempBufLzma = NULL; p->tempBufLzma = NULL;
ISzAlloc_Free(p->alloc, pp); ISzAlloc_Free(p->alloc, p);
} }
SRes Lzma2Enc_SetProps(CLzma2EncHandle pp, const CLzma2EncProps *props) SRes Lzma2Enc_SetProps(CLzma2EncHandle p, const CLzma2EncProps *props)
{ {
CLzma2Enc *p = (CLzma2Enc *)pp; // GET_CLzma2Enc_p
CLzmaEncProps lzmaProps = props->lzmaProps; CLzmaEncProps lzmaProps = props->lzmaProps;
LzmaEncProps_Normalize(&lzmaProps); LzmaEncProps_Normalize(&lzmaProps);
if (lzmaProps.lc + lzmaProps.lp > LZMA2_LCLP_MAX) if (lzmaProps.lc + lzmaProps.lp > LZMA2_LCLP_MAX)
@ -475,16 +476,16 @@ SRes Lzma2Enc_SetProps(CLzma2EncHandle pp, const CLzma2EncProps *props)
} }
void Lzma2Enc_SetDataSize(CLzmaEncHandle pp, UInt64 expectedDataSiize) void Lzma2Enc_SetDataSize(CLzma2EncHandle p, UInt64 expectedDataSiize)
{ {
CLzma2Enc *p = (CLzma2Enc *)pp; // GET_CLzma2Enc_p
p->expectedDataSize = expectedDataSiize; p->expectedDataSize = expectedDataSiize;
} }
Byte Lzma2Enc_WriteProperties(CLzma2EncHandle pp) Byte Lzma2Enc_WriteProperties(CLzma2EncHandle p)
{ {
CLzma2Enc *p = (CLzma2Enc *)pp; // GET_CLzma2Enc_p
unsigned i; unsigned i;
UInt32 dicSize = LzmaEncProps_GetDictSize(&p->props.lzmaProps); UInt32 dicSize = LzmaEncProps_GetDictSize(&p->props.lzmaProps);
for (i = 0; i < 40; i++) for (i = 0; i < 40; i++)
@ -497,12 +498,12 @@ Byte Lzma2Enc_WriteProperties(CLzma2EncHandle pp)
static SRes Lzma2Enc_EncodeMt1( static SRes Lzma2Enc_EncodeMt1(
CLzma2Enc *me, CLzma2Enc *me,
CLzma2EncInt *p, CLzma2EncInt *p,
ISeqOutStream *outStream, ISeqOutStreamPtr outStream,
Byte *outBuf, size_t *outBufSize, Byte *outBuf, size_t *outBufSize,
ISeqInStream *inStream, ISeqInStreamPtr inStream,
const Byte *inData, size_t inDataSize, const Byte *inData, size_t inDataSize,
int finished, int finished,
ICompressProgress *progress) ICompressProgressPtr progress)
{ {
UInt64 unpackTotal = 0; UInt64 unpackTotal = 0;
UInt64 packTotal = 0; UInt64 packTotal = 0;
@ -540,12 +541,12 @@ static SRes Lzma2Enc_EncodeMt1(
} }
} }
RINOK(Lzma2EncInt_InitStream(p, &me->props)); RINOK(Lzma2EncInt_InitStream(p, &me->props))
for (;;) for (;;)
{ {
SRes res = SZ_OK; SRes res = SZ_OK;
size_t inSizeCur = 0; SizeT inSizeCur = 0;
Lzma2EncInt_InitBlock(p); Lzma2EncInt_InitBlock(p);
@ -559,7 +560,7 @@ static SRes Lzma2Enc_EncodeMt1(
if (me->expectedDataSize != (UInt64)(Int64)-1 if (me->expectedDataSize != (UInt64)(Int64)-1
&& me->expectedDataSize >= unpackTotal) && me->expectedDataSize >= unpackTotal)
expected = me->expectedDataSize - unpackTotal; expected = me->expectedDataSize - unpackTotal;
if (me->props.blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID if (me->props.blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID
&& expected > me->props.blockSize) && expected > me->props.blockSize)
expected = (size_t)me->props.blockSize; expected = (size_t)me->props.blockSize;
@ -569,14 +570,14 @@ static SRes Lzma2Enc_EncodeMt1(
&limitedInStream.vt, &limitedInStream.vt,
LZMA2_KEEP_WINDOW_SIZE, LZMA2_KEEP_WINDOW_SIZE,
me->alloc, me->alloc,
me->allocBig)); me->allocBig))
} }
else else
{ {
inSizeCur = inDataSize - (size_t)unpackTotal; inSizeCur = (SizeT)(inDataSize - (size_t)unpackTotal);
if (me->props.blockSize != LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID if (me->props.blockSize != LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID
&& inSizeCur > me->props.blockSize) && inSizeCur > me->props.blockSize)
inSizeCur = (size_t)me->props.blockSize; inSizeCur = (SizeT)(size_t)me->props.blockSize;
// LzmaEnc_SetDataSize(p->enc, inSizeCur); // LzmaEnc_SetDataSize(p->enc, inSizeCur);
@ -584,7 +585,7 @@ static SRes Lzma2Enc_EncodeMt1(
inData + (size_t)unpackTotal, inSizeCur, inData + (size_t)unpackTotal, inSizeCur,
LZMA2_KEEP_WINDOW_SIZE, LZMA2_KEEP_WINDOW_SIZE,
me->alloc, me->alloc,
me->allocBig)); me->allocBig))
} }
for (;;) for (;;)
@ -621,7 +622,7 @@ static SRes Lzma2Enc_EncodeMt1(
unpackTotal += p->srcPos; unpackTotal += p->srcPos;
RINOK(res); RINOK(res)
if (p->srcPos != (inStream ? limitedInStream.processed : inSizeCur)) if (p->srcPos != (inStream ? limitedInStream.processed : inSizeCur))
return SZ_ERROR_FAIL; return SZ_ERROR_FAIL;
@ -652,12 +653,12 @@ static SRes Lzma2Enc_EncodeMt1(
#ifndef _7ZIP_ST #ifndef Z7_ST
static SRes Lzma2Enc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned outBufIndex, static SRes Lzma2Enc_MtCallback_Code(void *p, unsigned coderIndex, unsigned outBufIndex,
const Byte *src, size_t srcSize, int finished) const Byte *src, size_t srcSize, int finished)
{ {
CLzma2Enc *me = (CLzma2Enc *)pp; CLzma2Enc *me = (CLzma2Enc *)p;
size_t destSize = me->outBufSize; size_t destSize = me->outBufSize;
SRes res; SRes res;
CMtProgressThunk progressThunk; CMtProgressThunk progressThunk;
@ -692,9 +693,9 @@ static SRes Lzma2Enc_MtCallback_Code(void *pp, unsigned coderIndex, unsigned out
} }
static SRes Lzma2Enc_MtCallback_Write(void *pp, unsigned outBufIndex) static SRes Lzma2Enc_MtCallback_Write(void *p, unsigned outBufIndex)
{ {
CLzma2Enc *me = (CLzma2Enc *)pp; CLzma2Enc *me = (CLzma2Enc *)p;
size_t size = me->outBufsDataSizes[outBufIndex]; size_t size = me->outBufsDataSizes[outBufIndex];
const Byte *data = me->outBufs[outBufIndex]; const Byte *data = me->outBufs[outBufIndex];
@ -713,14 +714,14 @@ static SRes Lzma2Enc_MtCallback_Write(void *pp, unsigned outBufIndex)
SRes Lzma2Enc_Encode2(CLzma2EncHandle pp, SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
ISeqOutStream *outStream, ISeqOutStreamPtr outStream,
Byte *outBuf, size_t *outBufSize, Byte *outBuf, size_t *outBufSize,
ISeqInStream *inStream, ISeqInStreamPtr inStream,
const Byte *inData, size_t inDataSize, const Byte *inData, size_t inDataSize,
ICompressProgress *progress) ICompressProgressPtr progress)
{ {
CLzma2Enc *p = (CLzma2Enc *)pp; // GET_CLzma2Enc_p
if (inStream && inData) if (inStream && inData)
return SZ_ERROR_PARAM; return SZ_ERROR_PARAM;
@ -730,11 +731,11 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp,
{ {
unsigned i; unsigned i;
for (i = 0; i < MTCODER__THREADS_MAX; i++) for (i = 0; i < MTCODER_THREADS_MAX; i++)
p->coders[i].propsAreSet = False; p->coders[i].propsAreSet = False;
} }
#ifndef _7ZIP_ST #ifndef Z7_ST
if (p->props.numBlockThreads_Reduced > 1) if (p->props.numBlockThreads_Reduced > 1)
{ {
@ -772,7 +773,7 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp,
return SZ_ERROR_PARAM; /* SZ_ERROR_MEM */ return SZ_ERROR_PARAM; /* SZ_ERROR_MEM */
{ {
size_t destBlockSize = p->mtCoder.blockSize + (p->mtCoder.blockSize >> 10) + 16; const size_t destBlockSize = p->mtCoder.blockSize + (p->mtCoder.blockSize >> 10) + 16;
if (destBlockSize < p->mtCoder.blockSize) if (destBlockSize < p->mtCoder.blockSize)
return SZ_ERROR_PARAM; return SZ_ERROR_PARAM;
if (p->outBufSize != destBlockSize) if (p->outBufSize != destBlockSize)
@ -781,10 +782,11 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp,
} }
p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max; p->mtCoder.numThreadsMax = (unsigned)p->props.numBlockThreads_Max;
p->mtCoder.numThreadGroups = p->props.numThreadGroups;
p->mtCoder.expectedDataSize = p->expectedDataSize; p->mtCoder.expectedDataSize = p->expectedDataSize;
{ {
SRes res = MtCoder_Code(&p->mtCoder); const SRes res = MtCoder_Code(&p->mtCoder);
if (!outStream) if (!outStream)
*outBufSize = (size_t)(p->outBuf - outBuf); *outBufSize = (size_t)(p->outBuf - outBuf);
return res; return res;
@ -801,3 +803,5 @@ SRes Lzma2Enc_Encode2(CLzma2EncHandle pp,
True, /* finished */ True, /* finished */
progress); progress);
} }
#undef PRF
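
Lzma2Enc_Encode2 keeps its either/or contract: input comes from inStream or inData (both set is rejected with SZ_ERROR_PARAM near the top of the function), and output goes to outStream or, when that is NULL, to outBuf/outBufSize. A hedged memory-to-memory sketch against the signature shown in this diff; g_Alloc and g_BigAlloc are the stock allocators from Alloc.h:

    #include "Lzma2Enc.h"
    #include "Alloc.h"   /* g_Alloc, g_BigAlloc */

    /* Compress src into dest in one call; *destLen is capacity in, size out.
       The single LZMA2 property byte is returned through propByte. */
    static SRes EncodeMem(Byte *dest, size_t *destLen,
                          const Byte *src, size_t srcLen, Byte *propByte)
    {
      SRes res;
      CLzma2EncProps props;
      CLzma2EncHandle enc = Lzma2Enc_Create(&g_Alloc, &g_BigAlloc);
      if (!enc)
        return SZ_ERROR_MEM;
      Lzma2EncProps_Init(&props);
      res = Lzma2Enc_SetProps(enc, &props);
      if (res == SZ_OK)
      {
        *propByte = Lzma2Enc_WriteProperties(enc);
        res = Lzma2Enc_Encode2(enc,
            NULL,                /* outStream: unused, writing to memory */
            dest, destLen,       /* outBuf mode */
            NULL,                /* inStream: unused */
            src, srcLen,
            NULL);               /* progress */
      }
      Lzma2Enc_Destroy(enc);
      return res;
    }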

View file

@ -1,15 +1,15 @@
/* Lzma2Enc.h -- LZMA2 Encoder /* Lzma2Enc.h -- LZMA2 Encoder
2017-07-27 : Igor Pavlov : Public domain */ 2023-04-13 : Igor Pavlov : Public domain */
#ifndef __LZMA2_ENC_H #ifndef ZIP7_INC_LZMA2_ENC_H
#define __LZMA2_ENC_H #define ZIP7_INC_LZMA2_ENC_H
#include "LzmaEnc.h" #include "LzmaEnc.h"
EXTERN_C_BEGIN EXTERN_C_BEGIN
#define LZMA2_ENC_PROPS__BLOCK_SIZE__AUTO 0 #define LZMA2_ENC_PROPS_BLOCK_SIZE_AUTO 0
#define LZMA2_ENC_PROPS__BLOCK_SIZE__SOLID ((UInt64)(Int64)-1) #define LZMA2_ENC_PROPS_BLOCK_SIZE_SOLID ((UInt64)(Int64)-1)
typedef struct typedef struct
{ {
@ -18,6 +18,7 @@ typedef struct
int numBlockThreads_Reduced; int numBlockThreads_Reduced;
int numBlockThreads_Max; int numBlockThreads_Max;
int numTotalThreads; int numTotalThreads;
unsigned numThreadGroups; // 0 : no groups
} CLzma2EncProps; } CLzma2EncProps;
void Lzma2EncProps_Init(CLzma2EncProps *p); void Lzma2EncProps_Init(CLzma2EncProps *p);
@ -36,7 +37,9 @@ SRes:
SZ_ERROR_THREAD - error in multithreading functions (only for Mt version) SZ_ERROR_THREAD - error in multithreading functions (only for Mt version)
*/ */
typedef void * CLzma2EncHandle; typedef struct CLzma2Enc CLzma2Enc;
typedef CLzma2Enc * CLzma2EncHandle;
// Z7_DECLARE_HANDLE(CLzma2EncHandle)
CLzma2EncHandle Lzma2Enc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig); CLzma2EncHandle Lzma2Enc_Create(ISzAllocPtr alloc, ISzAllocPtr allocBig);
void Lzma2Enc_Destroy(CLzma2EncHandle p); void Lzma2Enc_Destroy(CLzma2EncHandle p);
@ -44,11 +47,11 @@ SRes Lzma2Enc_SetProps(CLzma2EncHandle p, const CLzma2EncProps *props);
void Lzma2Enc_SetDataSize(CLzma2EncHandle p, UInt64 expectedDataSiize); void Lzma2Enc_SetDataSize(CLzma2EncHandle p, UInt64 expectedDataSiize);
Byte Lzma2Enc_WriteProperties(CLzma2EncHandle p); Byte Lzma2Enc_WriteProperties(CLzma2EncHandle p);
SRes Lzma2Enc_Encode2(CLzma2EncHandle p, SRes Lzma2Enc_Encode2(CLzma2EncHandle p,
ISeqOutStream *outStream, ISeqOutStreamPtr outStream,
Byte *outBuf, size_t *outBufSize, Byte *outBuf, size_t *outBufSize,
ISeqInStream *inStream, ISeqInStreamPtr inStream,
const Byte *inData, size_t inDataSize, const Byte *inData, size_t inDataSize,
ICompressProgress *progress); ICompressProgressPtr progress);
EXTERN_C_END EXTERN_C_END
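
New in CLzma2EncProps: numThreadGroups (header comment: "0 : no groups"), which Lzma2Enc_Encode2 now forwards to CMtCoder alongside numThreadsMax. Lzma2EncProps_Init zeroes it, so existing callers keep the old behavior. A short setup sketch; the nonzero semantics (spreading block threads across OS processor groups) are an assumption beyond what this header states:

    #include "Lzma2Enc.h"

    static SRes SetupLzma2(CLzma2EncHandle enc)
    {
      CLzma2EncProps props;
      Lzma2EncProps_Init(&props);    /* numThreadGroups starts at 0 */
      props.lzmaProps.level = 9;
      props.numTotalThreads = 8;
      /* 0 keeps the old ungrouped scheduling; a nonzero count is assumed
         to distribute block threads over that many processor groups. */
      props.numThreadGroups = 0;
      return Lzma2Enc_SetProps(enc, &props);
    }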

View file

@ -1,8 +1,8 @@
/* Lzma86.h -- LZMA + x86 (BCJ) Filter /* Lzma86.h -- LZMA + x86 (BCJ) Filter
2013-01-18 : Igor Pavlov : Public domain */ 2023-03-03 : Igor Pavlov : Public domain */
#ifndef __LZMA86_H #ifndef ZIP7_INC_LZMA86_H
#define __LZMA86_H #define ZIP7_INC_LZMA86_H
#include "7zTypes.h" #include "7zTypes.h"

View file

@ -1,5 +1,5 @@
/* Lzma86Dec.c -- LZMA + x86 (BCJ) Filter Decoder /* Lzma86Dec.c -- LZMA + x86 (BCJ) Filter Decoder
2016-05-16 : Igor Pavlov : Public domain */ 2023-03-03 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -46,9 +46,8 @@ SRes Lzma86_Decode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen)
return res; return res;
if (useFilter == 1) if (useFilter == 1)
{ {
UInt32 x86State; UInt32 x86State = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
x86_Convert_Init(x86State); z7_BranchConvSt_X86_Dec(dest, *destLen, 0, &x86State);
x86_Convert(dest, *destLen, 0, &x86State, 0);
} }
return SZ_OK; return SZ_OK;
} }
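
The x86 BCJ filter call changes shape here and in the encoder below: the x86_Convert_Init macro plus a direction flag on x86_Convert are replaced by a state-init constant and separate _Dec/_Enc converters. A sketch of the new calling convention, assuming Bra.h declares what these call sites use:

    #include "Bra.h"   /* Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL, z7_BranchConvSt_X86_* */

    /* In-place BCJ decode of a buffer whose first byte sits at ip = 0. */
    static void X86FilterDecode(Byte *data, SizeT size)
    {
      UInt32 state = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
      z7_BranchConvSt_X86_Dec(data, size, 0, &state);
      /* the encode side is symmetric: z7_BranchConvSt_X86_Enc(...) */
    }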

View file

@ -1,5 +1,5 @@
/* Lzma86Enc.c -- LZMA + x86 (BCJ) Filter Encoder /* Lzma86Enc.c -- LZMA + x86 (BCJ) Filter Encoder
2018-07-04 : Igor Pavlov : Public domain */ 2023-03-03 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -46,9 +46,8 @@ int Lzma86_Encode(Byte *dest, size_t *destLen, const Byte *src, size_t srcLen,
memcpy(filteredStream, src, srcLen); memcpy(filteredStream, src, srcLen);
} }
{ {
UInt32 x86State; UInt32 x86State = Z7_BRANCH_CONV_ST_X86_STATE_INIT_VAL;
x86_Convert_Init(x86State); z7_BranchConvSt_X86_Enc(filteredStream, srcLen, 0, &x86State);
x86_Convert(filteredStream, srcLen, 0, &x86State, 1);
} }
} }

View file

@ -1,5 +1,5 @@
/* LzmaDec.c -- LZMA Decoder /* LzmaDec.c -- LZMA Decoder
2021-04-01 : Igor Pavlov : Public domain */ 2023-04-07 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -8,15 +8,15 @@
/* #include "CpuArch.h" */ /* #include "CpuArch.h" */
#include "LzmaDec.h" #include "LzmaDec.h"
#define kNumTopBits 24 // #define kNumTopBits 24
#define kTopValue ((UInt32)1 << kNumTopBits) #define kTopValue ((UInt32)1 << 24)
#define kNumBitModelTotalBits 11 #define kNumBitModelTotalBits 11
#define kBitModelTotal (1 << kNumBitModelTotalBits) #define kBitModelTotal (1 << kNumBitModelTotalBits)
#define RC_INIT_SIZE 5 #define RC_INIT_SIZE 5
#ifndef _LZMA_DEC_OPT #ifndef Z7_LZMA_DEC_OPT
#define kNumMoveBits 5 #define kNumMoveBits 5
#define NORMALIZE if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); } #define NORMALIZE if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); }
@ -25,14 +25,14 @@
#define UPDATE_0(p) range = bound; *(p) = (CLzmaProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); #define UPDATE_0(p) range = bound; *(p) = (CLzmaProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
#define UPDATE_1(p) range -= bound; code -= bound; *(p) = (CLzmaProb)(ttt - (ttt >> kNumMoveBits)); #define UPDATE_1(p) range -= bound; code -= bound; *(p) = (CLzmaProb)(ttt - (ttt >> kNumMoveBits));
#define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \ #define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \
{ UPDATE_0(p); i = (i + i); A0; } else \ { UPDATE_0(p) i = (i + i); A0; } else \
{ UPDATE_1(p); i = (i + i) + 1; A1; } { UPDATE_1(p) i = (i + i) + 1; A1; }
#define TREE_GET_BIT(probs, i) { GET_BIT2(probs + i, i, ;, ;); } #define TREE_GET_BIT(probs, i) { GET_BIT2(probs + i, i, ;, ;); }
#define REV_BIT(p, i, A0, A1) IF_BIT_0(p + i) \ #define REV_BIT(p, i, A0, A1) IF_BIT_0(p + i) \
{ UPDATE_0(p + i); A0; } else \ { UPDATE_0(p + i) A0; } else \
{ UPDATE_1(p + i); A1; } { UPDATE_1(p + i) A1; }
#define REV_BIT_VAR( p, i, m) REV_BIT(p, i, i += m; m += m, m += m; i += m; ) #define REV_BIT_VAR( p, i, m) REV_BIT(p, i, i += m; m += m, m += m; i += m; )
#define REV_BIT_CONST(p, i, m) REV_BIT(p, i, i += m; , i += m * 2; ) #define REV_BIT_CONST(p, i, m) REV_BIT(p, i, i += m; , i += m * 2; )
#define REV_BIT_LAST( p, i, m) REV_BIT(p, i, i -= m , ; ) #define REV_BIT_LAST( p, i, m) REV_BIT(p, i, i -= m , ; )
@ -40,19 +40,19 @@
#define TREE_DECODE(probs, limit, i) \ #define TREE_DECODE(probs, limit, i) \
{ i = 1; do { TREE_GET_BIT(probs, i); } while (i < limit); i -= limit; } { i = 1; do { TREE_GET_BIT(probs, i); } while (i < limit); i -= limit; }
/* #define _LZMA_SIZE_OPT */ /* #define Z7_LZMA_SIZE_OPT */
#ifdef _LZMA_SIZE_OPT #ifdef Z7_LZMA_SIZE_OPT
#define TREE_6_DECODE(probs, i) TREE_DECODE(probs, (1 << 6), i) #define TREE_6_DECODE(probs, i) TREE_DECODE(probs, (1 << 6), i)
#else #else
#define TREE_6_DECODE(probs, i) \ #define TREE_6_DECODE(probs, i) \
{ i = 1; \ { i = 1; \
TREE_GET_BIT(probs, i); \ TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i); \ TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i); \ TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i); \ TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i); \ TREE_GET_BIT(probs, i) \
TREE_GET_BIT(probs, i); \ TREE_GET_BIT(probs, i) \
i -= 0x40; } i -= 0x40; }
#endif #endif
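
TREE_DECODE and the unrolled TREE_6_DECODE above walk a bit-tree: probs is a flat binary tree indexed from 1, each decoded bit steps to a child, and the leaf index minus (1 << numBits) is the symbol. A plain-C restatement, where DecodeBit is hypothetical and stands in for the IF_BIT_0 / UPDATE_0 / UPDATE_1 range-coder macro triple:

    /* Equivalent of TREE_DECODE(probs, 1 << numBits, symbol), written as
       a function. DecodeBit() is hypothetical: one adaptive-bit decode
       on probs[i], as the macros above perform inline. */
    static unsigned BitTreeDecode(CLzmaProb *probs, unsigned numBits,
                                  unsigned (*DecodeBit)(CLzmaProb *prob))
    {
      unsigned i = 1;
      do
        i = (i << 1) + DecodeBit(probs + i);  /* step to left/right child */
      while (i < (1u << numBits));
      return i - (1u << numBits);             /* leaf index -> symbol */
    }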
@ -64,25 +64,25 @@
probLit = prob + (offs + bit + symbol); \ probLit = prob + (offs + bit + symbol); \
GET_BIT2(probLit, symbol, offs ^= bit; , ;) GET_BIT2(probLit, symbol, offs ^= bit; , ;)
#endif // _LZMA_DEC_OPT #endif // Z7_LZMA_DEC_OPT
#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_INPUT_EOF; range <<= 8; code = (code << 8) | (*buf++); } #define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_INPUT_EOF; range <<= 8; code = (code << 8) | (*buf++); }
#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound) #define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK bound = (range >> kNumBitModelTotalBits) * (UInt32)ttt; if (code < bound)
#define UPDATE_0_CHECK range = bound; #define UPDATE_0_CHECK range = bound;
#define UPDATE_1_CHECK range -= bound; code -= bound; #define UPDATE_1_CHECK range -= bound; code -= bound;
#define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \ #define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \
{ UPDATE_0_CHECK; i = (i + i); A0; } else \ { UPDATE_0_CHECK i = (i + i); A0; } else \
{ UPDATE_1_CHECK; i = (i + i) + 1; A1; } { UPDATE_1_CHECK i = (i + i) + 1; A1; }
#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;) #define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;)
#define TREE_DECODE_CHECK(probs, limit, i) \ #define TREE_DECODE_CHECK(probs, limit, i) \
{ i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; } { i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; }
#define REV_BIT_CHECK(p, i, m) IF_BIT_0_CHECK(p + i) \ #define REV_BIT_CHECK(p, i, m) IF_BIT_0_CHECK(p + i) \
{ UPDATE_0_CHECK; i += m; m += m; } else \ { UPDATE_0_CHECK i += m; m += m; } else \
{ UPDATE_1_CHECK; m += m; i += m; } { UPDATE_1_CHECK m += m; i += m; }
#define kNumPosBitsMax 4 #define kNumPosBitsMax 4
@ -224,14 +224,14 @@ Out:
*/ */
#ifdef _LZMA_DEC_OPT #ifdef Z7_LZMA_DEC_OPT
int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit); int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit);
#else #else
static static
int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit) int Z7_FASTCALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
{ {
CLzmaProb *probs = GET_PROBS; CLzmaProb *probs = GET_PROBS;
unsigned state = (unsigned)p->state; unsigned state = (unsigned)p->state;
@ -263,7 +263,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
IF_BIT_0(prob) IF_BIT_0(prob)
{ {
unsigned symbol; unsigned symbol;
UPDATE_0(prob); UPDATE_0(prob)
prob = probs + Literal; prob = probs + Literal;
if (processedPos != 0 || checkDicSize != 0) if (processedPos != 0 || checkDicSize != 0)
prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc); prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
@ -273,7 +273,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
{ {
state -= (state < 4) ? state : 3; state -= (state < 4) ? state : 3;
symbol = 1; symbol = 1;
#ifdef _LZMA_SIZE_OPT #ifdef Z7_LZMA_SIZE_OPT
do { NORMAL_LITER_DEC } while (symbol < 0x100); do { NORMAL_LITER_DEC } while (symbol < 0x100);
#else #else
NORMAL_LITER_DEC NORMAL_LITER_DEC
@ -292,7 +292,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
unsigned offs = 0x100; unsigned offs = 0x100;
state -= (state < 10) ? 3 : 6; state -= (state < 10) ? 3 : 6;
symbol = 1; symbol = 1;
#ifdef _LZMA_SIZE_OPT #ifdef Z7_LZMA_SIZE_OPT
do do
{ {
unsigned bit; unsigned bit;
@ -321,25 +321,25 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
} }
{ {
UPDATE_1(prob); UPDATE_1(prob)
prob = probs + IsRep + state; prob = probs + IsRep + state;
IF_BIT_0(prob) IF_BIT_0(prob)
{ {
UPDATE_0(prob); UPDATE_0(prob)
state += kNumStates; state += kNumStates;
prob = probs + LenCoder; prob = probs + LenCoder;
} }
else else
{ {
UPDATE_1(prob); UPDATE_1(prob)
prob = probs + IsRepG0 + state; prob = probs + IsRepG0 + state;
IF_BIT_0(prob) IF_BIT_0(prob)
{ {
UPDATE_0(prob); UPDATE_0(prob)
prob = probs + IsRep0Long + COMBINED_PS_STATE; prob = probs + IsRep0Long + COMBINED_PS_STATE;
IF_BIT_0(prob) IF_BIT_0(prob)
{ {
UPDATE_0(prob); UPDATE_0(prob)
// that case was checked before with kBadRepCode // that case was checked before with kBadRepCode
// if (checkDicSize == 0 && processedPos == 0) { len = kMatchSpecLen_Error_Data + 1; break; } // if (checkDicSize == 0 && processedPos == 0) { len = kMatchSpecLen_Error_Data + 1; break; }
@ -353,30 +353,30 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
state = state < kNumLitStates ? 9 : 11; state = state < kNumLitStates ? 9 : 11;
continue; continue;
} }
UPDATE_1(prob); UPDATE_1(prob)
} }
else else
{ {
UInt32 distance; UInt32 distance;
UPDATE_1(prob); UPDATE_1(prob)
prob = probs + IsRepG1 + state; prob = probs + IsRepG1 + state;
IF_BIT_0(prob) IF_BIT_0(prob)
{ {
UPDATE_0(prob); UPDATE_0(prob)
distance = rep1; distance = rep1;
} }
else else
{ {
UPDATE_1(prob); UPDATE_1(prob)
prob = probs + IsRepG2 + state; prob = probs + IsRepG2 + state;
IF_BIT_0(prob) IF_BIT_0(prob)
{ {
UPDATE_0(prob); UPDATE_0(prob)
distance = rep2; distance = rep2;
} }
else else
{ {
UPDATE_1(prob); UPDATE_1(prob)
distance = rep3; distance = rep3;
rep3 = rep2; rep3 = rep2;
} }
@ -389,37 +389,37 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
prob = probs + RepLenCoder; prob = probs + RepLenCoder;
} }
#ifdef _LZMA_SIZE_OPT #ifdef Z7_LZMA_SIZE_OPT
{ {
unsigned lim, offset; unsigned lim, offset;
CLzmaProb *probLen = prob + LenChoice; CLzmaProb *probLen = prob + LenChoice;
IF_BIT_0(probLen) IF_BIT_0(probLen)
{ {
UPDATE_0(probLen); UPDATE_0(probLen)
probLen = prob + LenLow + GET_LEN_STATE; probLen = prob + LenLow + GET_LEN_STATE;
offset = 0; offset = 0;
lim = (1 << kLenNumLowBits); lim = (1 << kLenNumLowBits);
} }
else else
{ {
UPDATE_1(probLen); UPDATE_1(probLen)
probLen = prob + LenChoice2; probLen = prob + LenChoice2;
IF_BIT_0(probLen) IF_BIT_0(probLen)
{ {
UPDATE_0(probLen); UPDATE_0(probLen)
probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits); probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits);
offset = kLenNumLowSymbols; offset = kLenNumLowSymbols;
lim = (1 << kLenNumLowBits); lim = (1 << kLenNumLowBits);
} }
else else
{ {
UPDATE_1(probLen); UPDATE_1(probLen)
probLen = prob + LenHigh; probLen = prob + LenHigh;
offset = kLenNumLowSymbols * 2; offset = kLenNumLowSymbols * 2;
lim = (1 << kLenNumHighBits); lim = (1 << kLenNumHighBits);
} }
} }
TREE_DECODE(probLen, lim, len); TREE_DECODE(probLen, lim, len)
len += offset; len += offset;
} }
#else #else
@ -427,32 +427,32 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
CLzmaProb *probLen = prob + LenChoice; CLzmaProb *probLen = prob + LenChoice;
IF_BIT_0(probLen) IF_BIT_0(probLen)
{ {
UPDATE_0(probLen); UPDATE_0(probLen)
probLen = prob + LenLow + GET_LEN_STATE; probLen = prob + LenLow + GET_LEN_STATE;
len = 1; len = 1;
TREE_GET_BIT(probLen, len); TREE_GET_BIT(probLen, len)
TREE_GET_BIT(probLen, len); TREE_GET_BIT(probLen, len)
TREE_GET_BIT(probLen, len); TREE_GET_BIT(probLen, len)
len -= 8; len -= 8;
} }
else else
{ {
UPDATE_1(probLen); UPDATE_1(probLen)
probLen = prob + LenChoice2; probLen = prob + LenChoice2;
IF_BIT_0(probLen) IF_BIT_0(probLen)
{ {
UPDATE_0(probLen); UPDATE_0(probLen)
probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits); probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits);
len = 1; len = 1;
TREE_GET_BIT(probLen, len); TREE_GET_BIT(probLen, len)
TREE_GET_BIT(probLen, len); TREE_GET_BIT(probLen, len)
TREE_GET_BIT(probLen, len); TREE_GET_BIT(probLen, len)
} }
else else
{ {
UPDATE_1(probLen); UPDATE_1(probLen)
probLen = prob + LenHigh; probLen = prob + LenHigh;
TREE_DECODE(probLen, (1 << kLenNumHighBits), len); TREE_DECODE(probLen, (1 << kLenNumHighBits), len)
len += kLenNumLowSymbols * 2; len += kLenNumLowSymbols * 2;
} }
} }
@ -464,7 +464,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
UInt32 distance; UInt32 distance;
prob = probs + PosSlot + prob = probs + PosSlot +
((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits); ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
TREE_6_DECODE(prob, distance); TREE_6_DECODE(prob, distance)
if (distance >= kStartPosModelIndex) if (distance >= kStartPosModelIndex)
{ {
unsigned posSlot = (unsigned)distance; unsigned posSlot = (unsigned)distance;
@ -479,7 +479,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
distance++; distance++;
do do
{ {
REV_BIT_VAR(prob, distance, m); REV_BIT_VAR(prob, distance, m)
} }
while (--numDirectBits); while (--numDirectBits);
distance -= m; distance -= m;
@ -514,10 +514,10 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
distance <<= kNumAlignBits; distance <<= kNumAlignBits;
{ {
unsigned i = 1; unsigned i = 1;
REV_BIT_CONST(prob, i, 1); REV_BIT_CONST(prob, i, 1)
REV_BIT_CONST(prob, i, 2); REV_BIT_CONST(prob, i, 2)
REV_BIT_CONST(prob, i, 4); REV_BIT_CONST(prob, i, 4)
REV_BIT_LAST (prob, i, 8); REV_BIT_LAST (prob, i, 8)
distance |= i; distance |= i;
} }
if (distance == (UInt32)0xFFFFFFFF) if (distance == (UInt32)0xFFFFFFFF)
@ -592,7 +592,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
} }
while (dicPos < limit && buf < bufLimit); while (dicPos < limit && buf < bufLimit);
NORMALIZE; NORMALIZE
p->buf = buf; p->buf = buf;
p->range = range; p->range = range;
@ -613,7 +613,7 @@ int MY_FAST_CALL LZMA_DECODE_REAL(CLzmaDec *p, SizeT limit, const Byte *bufLimit
static void MY_FAST_CALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit) static void Z7_FASTCALL LzmaDec_WriteRem(CLzmaDec *p, SizeT limit)
{ {
unsigned len = (unsigned)p->remainLen; unsigned len = (unsigned)p->remainLen;
if (len == 0 /* || len >= kMatchSpecLenStart */) if (len == 0 /* || len >= kMatchSpecLenStart */)
@ -683,7 +683,7 @@ and we support the following state of (p->checkDicSize):
(p->checkDicSize == p->prop.dicSize) (p->checkDicSize == p->prop.dicSize)
*/ */
static int MY_FAST_CALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit) static int Z7_FASTCALL LzmaDec_DecodeReal2(CLzmaDec *p, SizeT limit, const Byte *bufLimit)
{ {
if (p->checkDicSize == 0) if (p->checkDicSize == 0)
{ {
@ -767,54 +767,54 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byt
else else
{ {
unsigned len; unsigned len;
UPDATE_1_CHECK; UPDATE_1_CHECK
prob = probs + IsRep + state; prob = probs + IsRep + state;
IF_BIT_0_CHECK(prob) IF_BIT_0_CHECK(prob)
{ {
UPDATE_0_CHECK; UPDATE_0_CHECK
state = 0; state = 0;
prob = probs + LenCoder; prob = probs + LenCoder;
res = DUMMY_MATCH; res = DUMMY_MATCH;
} }
else else
{ {
UPDATE_1_CHECK; UPDATE_1_CHECK
res = DUMMY_REP; res = DUMMY_REP;
prob = probs + IsRepG0 + state; prob = probs + IsRepG0 + state;
IF_BIT_0_CHECK(prob) IF_BIT_0_CHECK(prob)
{ {
UPDATE_0_CHECK; UPDATE_0_CHECK
prob = probs + IsRep0Long + COMBINED_PS_STATE; prob = probs + IsRep0Long + COMBINED_PS_STATE;
IF_BIT_0_CHECK(prob) IF_BIT_0_CHECK(prob)
{ {
UPDATE_0_CHECK; UPDATE_0_CHECK
break; break;
} }
else else
{ {
UPDATE_1_CHECK; UPDATE_1_CHECK
} }
} }
else else
{ {
UPDATE_1_CHECK; UPDATE_1_CHECK
prob = probs + IsRepG1 + state; prob = probs + IsRepG1 + state;
IF_BIT_0_CHECK(prob) IF_BIT_0_CHECK(prob)
{ {
UPDATE_0_CHECK; UPDATE_0_CHECK
} }
else else
{ {
UPDATE_1_CHECK; UPDATE_1_CHECK
prob = probs + IsRepG2 + state; prob = probs + IsRepG2 + state;
IF_BIT_0_CHECK(prob) IF_BIT_0_CHECK(prob)
{ {
UPDATE_0_CHECK; UPDATE_0_CHECK
} }
else else
{ {
UPDATE_1_CHECK; UPDATE_1_CHECK
} }
} }
} }
@ -826,31 +826,31 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byt
const CLzmaProb *probLen = prob + LenChoice; const CLzmaProb *probLen = prob + LenChoice;
IF_BIT_0_CHECK(probLen) IF_BIT_0_CHECK(probLen)
{ {
UPDATE_0_CHECK; UPDATE_0_CHECK
probLen = prob + LenLow + GET_LEN_STATE; probLen = prob + LenLow + GET_LEN_STATE;
offset = 0; offset = 0;
limit = 1 << kLenNumLowBits; limit = 1 << kLenNumLowBits;
} }
else else
{ {
UPDATE_1_CHECK; UPDATE_1_CHECK
probLen = prob + LenChoice2; probLen = prob + LenChoice2;
IF_BIT_0_CHECK(probLen) IF_BIT_0_CHECK(probLen)
{ {
UPDATE_0_CHECK; UPDATE_0_CHECK
probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits); probLen = prob + LenLow + GET_LEN_STATE + (1 << kLenNumLowBits);
offset = kLenNumLowSymbols; offset = kLenNumLowSymbols;
limit = 1 << kLenNumLowBits; limit = 1 << kLenNumLowBits;
} }
else else
{ {
UPDATE_1_CHECK; UPDATE_1_CHECK
probLen = prob + LenHigh; probLen = prob + LenHigh;
offset = kLenNumLowSymbols * 2; offset = kLenNumLowSymbols * 2;
limit = 1 << kLenNumHighBits; limit = 1 << kLenNumHighBits;
} }
} }
TREE_DECODE_CHECK(probLen, limit, len); TREE_DECODE_CHECK(probLen, limit, len)
len += offset; len += offset;
} }
@ -860,7 +860,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byt
prob = probs + PosSlot + prob = probs + PosSlot +
((len < kNumLenToPosStates - 1 ? len : kNumLenToPosStates - 1) << ((len < kNumLenToPosStates - 1 ? len : kNumLenToPosStates - 1) <<
kNumPosSlotBits); kNumPosSlotBits);
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot)
if (posSlot >= kStartPosModelIndex) if (posSlot >= kStartPosModelIndex)
{ {
unsigned numDirectBits = ((posSlot >> 1) - 1); unsigned numDirectBits = ((posSlot >> 1) - 1);
@ -888,7 +888,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byt
unsigned m = 1; unsigned m = 1;
do do
{ {
REV_BIT_CHECK(prob, i, m); REV_BIT_CHECK(prob, i, m)
} }
while (--numDirectBits); while (--numDirectBits);
} }
@ -897,7 +897,7 @@ static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const Byte *buf, const Byt
} }
break; break;
} }
NORMALIZE_CHECK; NORMALIZE_CHECK
*bufOut = buf; *bufOut = buf;
return res; return res;
@ -943,7 +943,7 @@ When the decoder lookahead, and the lookahead symbol is not end_marker, we have
*/ */
#define RETURN__NOT_FINISHED__FOR_FINISH \ #define RETURN_NOT_FINISHED_FOR_FINISH \
*status = LZMA_STATUS_NOT_FINISHED; \ *status = LZMA_STATUS_NOT_FINISHED; \
return SZ_ERROR_DATA; // for strict mode return SZ_ERROR_DATA; // for strict mode
// return SZ_OK; // for relaxed mode // return SZ_OK; // for relaxed mode
@ -1029,7 +1029,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr
} }
if (p->remainLen != 0) if (p->remainLen != 0)
{ {
RETURN__NOT_FINISHED__FOR_FINISH; RETURN_NOT_FINISHED_FOR_FINISH
} }
checkEndMarkNow = 1; checkEndMarkNow = 1;
} }
@ -1072,7 +1072,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr
for (i = 0; i < (unsigned)dummyProcessed; i++) for (i = 0; i < (unsigned)dummyProcessed; i++)
p->tempBuf[i] = src[i]; p->tempBuf[i] = src[i];
// p->remainLen = kMatchSpecLen_Error_Data; // p->remainLen = kMatchSpecLen_Error_Data;
RETURN__NOT_FINISHED__FOR_FINISH; RETURN_NOT_FINISHED_FOR_FINISH
} }
bufLimit = src; bufLimit = src;
@ -1150,7 +1150,7 @@ SRes LzmaDec_DecodeToDic(CLzmaDec *p, SizeT dicLimit, const Byte *src, SizeT *sr
(*srcLen) += (unsigned)dummyProcessed - p->tempBufSize; (*srcLen) += (unsigned)dummyProcessed - p->tempBufSize;
p->tempBufSize = (unsigned)dummyProcessed; p->tempBufSize = (unsigned)dummyProcessed;
// p->remainLen = kMatchSpecLen_Error_Data; // p->remainLen = kMatchSpecLen_Error_Data;
RETURN__NOT_FINISHED__FOR_FINISH; RETURN_NOT_FINISHED_FOR_FINISH
} }
} }
@ -1299,8 +1299,8 @@ static SRes LzmaDec_AllocateProbs2(CLzmaDec *p, const CLzmaProps *propNew, ISzAl
SRes LzmaDec_AllocateProbs(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAllocPtr alloc) SRes LzmaDec_AllocateProbs(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAllocPtr alloc)
{ {
CLzmaProps propNew; CLzmaProps propNew;
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); RINOK(LzmaProps_Decode(&propNew, props, propsSize))
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc))
p->prop = propNew; p->prop = propNew;
return SZ_OK; return SZ_OK;
} }
@ -1309,14 +1309,14 @@ SRes LzmaDec_Allocate(CLzmaDec *p, const Byte *props, unsigned propsSize, ISzAll
{ {
CLzmaProps propNew; CLzmaProps propNew;
SizeT dicBufSize; SizeT dicBufSize;
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); RINOK(LzmaProps_Decode(&propNew, props, propsSize))
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc))
{ {
UInt32 dictSize = propNew.dicSize; UInt32 dictSize = propNew.dicSize;
SizeT mask = ((UInt32)1 << 12) - 1; SizeT mask = ((UInt32)1 << 12) - 1;
if (dictSize >= ((UInt32)1 << 30)) mask = ((UInt32)1 << 22) - 1; if (dictSize >= ((UInt32)1 << 30)) mask = ((UInt32)1 << 22) - 1;
else if (dictSize >= ((UInt32)1 << 22)) mask = ((UInt32)1 << 20) - 1;; else if (dictSize >= ((UInt32)1 << 22)) mask = ((UInt32)1 << 20) - 1;
dicBufSize = ((SizeT)dictSize + mask) & ~mask; dicBufSize = ((SizeT)dictSize + mask) & ~mask;
if (dicBufSize < dictSize) if (dicBufSize < dictSize)
dicBufSize = dictSize; dicBufSize = dictSize;
@ -1348,8 +1348,8 @@ SRes LzmaDecode(Byte *dest, SizeT *destLen, const Byte *src, SizeT *srcLen,
*status = LZMA_STATUS_NOT_SPECIFIED; *status = LZMA_STATUS_NOT_SPECIFIED;
if (inSize < RC_INIT_SIZE) if (inSize < RC_INIT_SIZE)
return SZ_ERROR_INPUT_EOF; return SZ_ERROR_INPUT_EOF;
LzmaDec_Construct(&p); LzmaDec_CONSTRUCT(&p)
RINOK(LzmaDec_AllocateProbs(&p, propData, propSize, alloc)); RINOK(LzmaDec_AllocateProbs(&p, propData, propSize, alloc))
p.dic = dest; p.dic = dest;
p.dicBufSize = outSize; p.dicBufSize = outSize;
LzmaDec_Init(&p); LzmaDec_Init(&p);

View file

@ -1,19 +1,19 @@
/* LzmaDec.h -- LZMA Decoder /* LzmaDec.h -- LZMA Decoder
2020-03-19 : Igor Pavlov : Public domain */ 2023-04-02 : Igor Pavlov : Public domain */
#ifndef __LZMA_DEC_H #ifndef ZIP7_INC_LZMA_DEC_H
#define __LZMA_DEC_H #define ZIP7_INC_LZMA_DEC_H
#include "7zTypes.h" #include "7zTypes.h"
EXTERN_C_BEGIN EXTERN_C_BEGIN
/* #define _LZMA_PROB32 */ /* #define Z7_LZMA_PROB32 */
/* _LZMA_PROB32 can increase the speed on some CPUs, /* Z7_LZMA_PROB32 can increase the speed on some CPUs,
but memory usage for CLzmaDec::probs will be doubled in that case */ but memory usage for CLzmaDec::probs will be doubled in that case */
typedef typedef
#ifdef _LZMA_PROB32 #ifdef Z7_LZMA_PROB32
UInt32 UInt32
#else #else
UInt16 UInt16
@ -25,7 +25,7 @@ typedef
#define LZMA_PROPS_SIZE 5 #define LZMA_PROPS_SIZE 5
typedef struct _CLzmaProps typedef struct
{ {
Byte lc; Byte lc;
Byte lp; Byte lp;
@ -73,7 +73,8 @@ typedef struct
Byte tempBuf[LZMA_REQUIRED_INPUT_MAX]; Byte tempBuf[LZMA_REQUIRED_INPUT_MAX];
} CLzmaDec; } CLzmaDec;
#define LzmaDec_Construct(p) { (p)->dic = NULL; (p)->probs = NULL; } #define LzmaDec_CONSTRUCT(p) { (p)->dic = NULL; (p)->probs = NULL; }
#define LzmaDec_Construct(p) LzmaDec_CONSTRUCT(p)
void LzmaDec_Init(CLzmaDec *p); void LzmaDec_Init(CLzmaDec *p);
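
This header also backs the one-call decoder used by LzmaUncompress below. A minimal sketch against the public LzmaDecode API (untested; g_Alloc is the malloc-based allocator from Alloc.h):

#include "LzmaDec.h"
#include "Alloc.h"

static SRes DecodeBlock(Byte *dest, SizeT *destLen,
                        const Byte *src, SizeT *srcLen,
                        const Byte props[LZMA_PROPS_SIZE])
{
    ELzmaStatus status;
    return LzmaDecode(dest, destLen, src, srcLen,
                      props, LZMA_PROPS_SIZE,   /* 5-byte properties header */
                      LZMA_FINISH_ANY, &status, &g_Alloc);
}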

File diff suppressed because it is too large

View file

@ -1,8 +1,8 @@
/* LzmaEnc.h -- LZMA Encoder /* LzmaEnc.h -- LZMA Encoder
2019-10-30 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#ifndef __LZMA_ENC_H #ifndef ZIP7_INC_LZMA_ENC_H
#define __LZMA_ENC_H #define ZIP7_INC_LZMA_ENC_H
#include "7zTypes.h" #include "7zTypes.h"
@ -10,7 +10,7 @@ EXTERN_C_BEGIN
#define LZMA_PROPS_SIZE 5 #define LZMA_PROPS_SIZE 5
typedef struct _CLzmaEncProps typedef struct
{ {
int level; /* 0 <= level <= 9 */ int level; /* 0 <= level <= 9 */
UInt32 dictSize; /* (1 << 12) <= dictSize <= (1 << 27) for 32-bit version UInt32 dictSize; /* (1 << 12) <= dictSize <= (1 << 27) for 32-bit version
@ -23,14 +23,19 @@ typedef struct _CLzmaEncProps
int fb; /* 5 <= fb <= 273, default = 32 */ int fb; /* 5 <= fb <= 273, default = 32 */
int btMode; /* 0 - hashChain Mode, 1 - binTree mode - normal, default = 1 */ int btMode; /* 0 - hashChain Mode, 1 - binTree mode - normal, default = 1 */
int numHashBytes; /* 2, 3 or 4, default = 4 */ int numHashBytes; /* 2, 3 or 4, default = 4 */
unsigned numHashOutBits; /* default = ? */
UInt32 mc; /* 1 <= mc <= (1 << 30), default = 32 */ UInt32 mc; /* 1 <= mc <= (1 << 30), default = 32 */
unsigned writeEndMark; /* 0 - do not write EOPM, 1 - write EOPM, default = 0 */ unsigned writeEndMark; /* 0 - do not write EOPM, 1 - write EOPM, default = 0 */
int numThreads; /* 1 or 2, default = 2 */ int numThreads; /* 1 or 2, default = 2 */
// int _pad;
Int32 affinityGroup;
UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1. UInt64 reduceSize; /* estimated size of data that will be compressed. default = (UInt64)(Int64)-1.
Encoder uses this value to reduce dictionary size */ Encoder uses this value to reduce dictionary size */
UInt64 affinity; UInt64 affinity;
UInt64 affinityInGroup;
} CLzmaEncProps; } CLzmaEncProps;
void LzmaEncProps_Init(CLzmaEncProps *p); void LzmaEncProps_Init(CLzmaEncProps *p);
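
The new numHashOutBits, affinityGroup, and affinityInGroup fields line up with the processor-group threading added to MtCoder further below (numThreadGroups, Thread_Create_With_Group). A hypothetical caller pinning the encoder threads might look like this; the field semantics here are an assumption, not documented in this diff:

CLzmaEncProps props;
LzmaEncProps_Init(&props);
props.numThreads = 2;          /* per the comment above: 1 or 2, default = 2 */
props.affinityGroup = 0;       /* assumed: Windows processor-group index */
props.affinityInGroup = 0xF;   /* assumed: CPU mask within that group */
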
@ -51,7 +56,9 @@ SRes:
SZ_ERROR_THREAD - error in multithreading functions (only for Mt version) SZ_ERROR_THREAD - error in multithreading functions (only for Mt version)
*/ */
typedef void * CLzmaEncHandle; typedef struct CLzmaEnc CLzmaEnc;
typedef CLzmaEnc * CLzmaEncHandle;
// Z7_DECLARE_HANDLE(CLzmaEncHandle)
CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc); CLzmaEncHandle LzmaEnc_Create(ISzAllocPtr alloc);
void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAllocPtr alloc, ISzAllocPtr allocBig); void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAllocPtr alloc, ISzAllocPtr allocBig);
@ -61,17 +68,17 @@ void LzmaEnc_SetDataSize(CLzmaEncHandle p, UInt64 expectedDataSiize);
SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, Byte *properties, SizeT *size); SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, Byte *properties, SizeT *size);
unsigned LzmaEnc_IsWriteEndMark(CLzmaEncHandle p); unsigned LzmaEnc_IsWriteEndMark(CLzmaEncHandle p);
SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStream *outStream, ISeqInStream *inStream, SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStreamPtr outStream, ISeqInStreamPtr inStream,
ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
SRes LzmaEnc_MemEncode(CLzmaEncHandle p, Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, SRes LzmaEnc_MemEncode(CLzmaEncHandle p, Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen,
int writeEndMark, ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); int writeEndMark, ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
/* ---------- One Call Interface ---------- */ /* ---------- One Call Interface ---------- */
SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen, SRes LzmaEncode(Byte *dest, SizeT *destLen, const Byte *src, SizeT srcLen,
const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark, const CLzmaEncProps *props, Byte *propsEncoded, SizeT *propsSize, int writeEndMark,
ICompressProgress *progress, ISzAllocPtr alloc, ISzAllocPtr allocBig); ICompressProgressPtr progress, ISzAllocPtr alloc, ISzAllocPtr allocBig);
EXTERN_C_END EXTERN_C_END
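
For orientation, the one-call interface declared above can be driven as in this minimal sketch (untested; g_Alloc is the malloc-based ISzAlloc from Alloc.h, and a NULL progress pointer is assumed to be accepted, as in the LzmaLib wrapper):

#include "LzmaEnc.h"
#include "Alloc.h"

static SRes CompressBlock(Byte *dest, SizeT *destLen,
                          const Byte *src, SizeT srcLen,
                          Byte propsEncoded[LZMA_PROPS_SIZE])
{
    CLzmaEncProps props;
    SizeT propsSize = LZMA_PROPS_SIZE;
    LzmaEncProps_Init(&props);          /* defaults: level 5, 16 MiB dictionary */
    return LzmaEncode(dest, destLen, src, srcLen,
                      &props, propsEncoded, &propsSize,
                      0,                 /* writeEndMark: no end-of-payload marker */
                      NULL,              /* progress callback is optional */
                      &g_Alloc, &g_Alloc);
}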

View file

@ -1,12 +1,14 @@
/* LzmaLib.c -- LZMA library wrapper /* LzmaLib.c -- LZMA library wrapper
2015-06-13 : Igor Pavlov : Public domain */ 2023-04-02 : Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Alloc.h" #include "Alloc.h"
#include "LzmaDec.h" #include "LzmaDec.h"
#include "LzmaEnc.h" #include "LzmaEnc.h"
#include "LzmaLib.h" #include "LzmaLib.h"
MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen, Z7_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen,
unsigned char *outProps, size_t *outPropsSize, unsigned char *outProps, size_t *outPropsSize,
int level, /* 0 <= level <= 9, default = 5 */ int level, /* 0 <= level <= 9, default = 5 */
unsigned dictSize, /* use (1 << N) or (3 << N). 4 KB < dictSize <= 128 MB */ unsigned dictSize, /* use (1 << N) or (3 << N). 4 KB < dictSize <= 128 MB */
@ -32,7 +34,7 @@ MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char
} }
MY_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t *srcLen, Z7_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t *srcLen,
const unsigned char *props, size_t propsSize) const unsigned char *props, size_t propsSize)
{ {
ELzmaStatus status; ELzmaStatus status;

View file

@ -1,14 +1,14 @@
/* LzmaLib.h -- LZMA library interface /* LzmaLib.h -- LZMA library interface
2021-04-03 : Igor Pavlov : Public domain */ 2023-04-02 : Igor Pavlov : Public domain */
#ifndef __LZMA_LIB_H #ifndef ZIP7_INC_LZMA_LIB_H
#define __LZMA_LIB_H #define ZIP7_INC_LZMA_LIB_H
#include "7zTypes.h" #include "7zTypes.h"
EXTERN_C_BEGIN EXTERN_C_BEGIN
#define MY_STDAPI int MY_STD_CALL #define Z7_STDAPI int Z7_STDCALL
#define LZMA_PROPS_SIZE 5 #define LZMA_PROPS_SIZE 5
@ -100,7 +100,7 @@ Returns:
SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version) SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version)
*/ */
MY_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen, Z7_STDAPI LzmaCompress(unsigned char *dest, size_t *destLen, const unsigned char *src, size_t srcLen,
unsigned char *outProps, size_t *outPropsSize, /* *outPropsSize must be = 5 */ unsigned char *outProps, size_t *outPropsSize, /* *outPropsSize must be = 5 */
int level, /* 0 <= level <= 9, default = 5 */ int level, /* 0 <= level <= 9, default = 5 */
unsigned dictSize, /* default = (1 << 24) */ unsigned dictSize, /* default = (1 << 24) */
@ -130,7 +130,7 @@ Returns:
SZ_ERROR_INPUT_EOF - it needs more bytes in input buffer (src) SZ_ERROR_INPUT_EOF - it needs more bytes in input buffer (src)
*/ */
MY_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, SizeT *srcLen, Z7_STDAPI LzmaUncompress(unsigned char *dest, size_t *destLen, const unsigned char *src, SizeT *srcLen,
const unsigned char *props, size_t propsSize); const unsigned char *props, size_t propsSize);
EXTERN_C_END EXTERN_C_END
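
A matching decompression call, as a sketch against the declaration above (untested):

#include "LzmaLib.h"

/* dest/destLen are sized by the caller; srcLen is updated to the bytes consumed. */
static int UncompressBlock(unsigned char *dest, size_t *destLen,
                           const unsigned char *src, SizeT *srcLen,
                           const unsigned char props[LZMA_PROPS_SIZE])
{
    /* props is the 5-byte header that LzmaCompress wrote into outProps */
    return LzmaUncompress(dest, destLen, src, srcLen, props, LZMA_PROPS_SIZE);
}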

C/Md5.c Normal file (206 lines)
View file

@ -0,0 +1,206 @@
/* Md5.c -- MD5 Hash
: Igor Pavlov : Public domain
This code is based on Colin Plumb's public domain md5.c code */
#include "Precomp.h"
#include <string.h>
#include "Md5.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#define MD5_UPDATE_BLOCKS(p) Md5_UpdateBlocks
Z7_NO_INLINE
void Md5_Init(CMd5 *p)
{
p->count = 0;
p->state[0] = 0x67452301;
p->state[1] = 0xefcdab89;
p->state[2] = 0x98badcfe;
p->state[3] = 0x10325476;
}
#if 0 && !defined(MY_CPU_LE_UNALIGN)
// optional optimization for Big-endian processors or processors without unaligned access:
// it is intended to reduce the number of complex LE32 memory reading from 64 to 16.
// But some compilers (sparc, armt) are better without this optimization.
#define Z7_MD5_USE_DATA32_ARRAY
#endif
#define LOAD_DATA(i) GetUi32((const UInt32 *)(const void *)data + (i))
#ifdef Z7_MD5_USE_DATA32_ARRAY
#define D(i) data32[i]
#else
#define D(i) LOAD_DATA(i)
#endif
#define F1(x, y, z) (z ^ (x & (y ^ z)))
#define F2(x, y, z) F1(z, x, y)
#define F3(x, y, z) (x ^ y ^ z)
#define F4(x, y, z) (y ^ (x | ~z))
#define R1(i, f, start, step, w, x, y, z, s, k) \
w += D((start + step * (i)) % 16) + k; \
w += f(x, y, z); \
w = rotlFixed(w, s) + x; \
#define R4(i4, f, start, step, s0,s1,s2,s3, k0,k1,k2,k3) \
R1 (i4*4+0, f, start, step, a,b,c,d, s0, k0) \
R1 (i4*4+1, f, start, step, d,a,b,c, s1, k1) \
R1 (i4*4+2, f, start, step, c,d,a,b, s2, k2) \
R1 (i4*4+3, f, start, step, b,c,d,a, s3, k3) \
#define R16(f, start, step, s0,s1,s2,s3, k00,k01,k02,k03, k10,k11,k12,k13, k20,k21,k22,k23, k30,k31,k32,k33) \
R4 (0, f, start, step, s0,s1,s2,s3, k00,k01,k02,k03) \
R4 (1, f, start, step, s0,s1,s2,s3, k10,k11,k12,k13) \
R4 (2, f, start, step, s0,s1,s2,s3, k20,k21,k22,k23) \
R4 (3, f, start, step, s0,s1,s2,s3, k30,k31,k32,k33) \
static
Z7_NO_INLINE
void Z7_FASTCALL Md5_UpdateBlocks(UInt32 state[4], const Byte *data, size_t numBlocks)
{
UInt32 a, b, c, d;
// if (numBlocks == 0) return;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
do
{
#ifdef Z7_MD5_USE_DATA32_ARRAY
UInt32 data32[MD5_NUM_BLOCK_WORDS];
{
#define LOAD_data32_x4(i) { \
data32[i ] = LOAD_DATA(i ); \
data32[i + 1] = LOAD_DATA(i + 1); \
data32[i + 2] = LOAD_DATA(i + 2); \
data32[i + 3] = LOAD_DATA(i + 3); }
#if 1
LOAD_data32_x4 (0 * 4)
LOAD_data32_x4 (1 * 4)
LOAD_data32_x4 (2 * 4)
LOAD_data32_x4 (3 * 4)
#else
unsigned i;
for (i = 0; i < MD5_NUM_BLOCK_WORDS; i += 4)
{
LOAD_data32_x4(i)
}
#endif
}
#endif
R16 (F1, 0, 1, 7,12,17,22, 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee,
0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821)
R16 (F2, 1, 5, 5, 9,14,20, 0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa,
0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed,
0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a)
R16 (F3, 5, 3, 4,11,16,23, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05,
0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665)
R16 (F4, 0, 7, 6,10,15,21, 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039,
0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391)
a += state[0];
b += state[1];
c += state[2];
d += state[3];
state[0] = a;
state[1] = b;
state[2] = c;
state[3] = d;
data += MD5_BLOCK_SIZE;
}
while (--numBlocks);
}
#define Md5_UpdateBlock(p) MD5_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
void Md5_Update(CMd5 *p, const Byte *data, size_t size)
{
if (size == 0)
return;
{
const unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1);
const unsigned num = MD5_BLOCK_SIZE - pos;
p->count += size;
if (num > size)
{
memcpy(p->buffer + pos, data, size);
return;
}
if (pos != 0)
{
size -= num;
memcpy(p->buffer + pos, data, num);
data += num;
Md5_UpdateBlock(p);
}
}
{
const size_t numBlocks = size >> 6;
if (numBlocks)
MD5_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= MD5_BLOCK_SIZE - 1;
if (size == 0)
return;
data += (numBlocks << 6);
memcpy(p->buffer, data, size);
}
}
void Md5_Final(CMd5 *p, Byte *digest)
{
unsigned pos = (unsigned)p->count & (MD5_BLOCK_SIZE - 1);
p->buffer[pos++] = 0x80;
if (pos > (MD5_BLOCK_SIZE - 4 * 2))
{
while (pos != MD5_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, MD5_BLOCK_SIZE - pos);
Md5_UpdateBlock(p);
pos = 0;
}
memset(&p->buffer[pos], 0, (MD5_BLOCK_SIZE - 4 * 2) - pos);
{
const UInt64 numBits = p->count << 3;
#if defined(MY_CPU_LE_UNALIGN)
SetUi64 (p->buffer + MD5_BLOCK_SIZE - 4 * 2, numBits)
#else
SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 2, (UInt32)(numBits))
SetUi32a(p->buffer + MD5_BLOCK_SIZE - 4 * 1, (UInt32)(numBits >> 32))
#endif
}
Md5_UpdateBlock(p);
SetUi32(digest, p->state[0])
SetUi32(digest + 4, p->state[1])
SetUi32(digest + 8, p->state[2])
SetUi32(digest + 12, p->state[3])
Md5_Init(p);
}
#undef R1
#undef R4
#undef R16
#undef D
#undef LOAD_DATA
#undef LOAD_data32_x4
#undef F1
#undef F2
#undef F3
#undef F4
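
Each R1 expansion in the file above is one MD5 step. For the first round, with (w,x,y,z) bound to (a,b,c,d), it unrolls to the following (an illustrative restatement; i, k, s, start, step are the macro arguments):

a += D((start + step * i) % 16) + k;   /* message word plus round constant */
a += F1(b, c, d);                      /* nonlinear mix of the other three words */
a = rotlFixed(a, s) + b;               /* rotate left by the round shift, add b */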

C/Md5.h Normal file (34 lines)
View file

@ -0,0 +1,34 @@
/* Md5.h -- MD5 Hash
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_MD5_H
#define ZIP7_INC_MD5_H
#include "7zTypes.h"
EXTERN_C_BEGIN
#define MD5_NUM_BLOCK_WORDS 16
#define MD5_NUM_DIGEST_WORDS 4
#define MD5_BLOCK_SIZE (MD5_NUM_BLOCK_WORDS * 4)
#define MD5_DIGEST_SIZE (MD5_NUM_DIGEST_WORDS * 4)
typedef struct
{
UInt64 count;
UInt64 _pad_1;
// we want 16-bytes alignment here
UInt32 state[MD5_NUM_DIGEST_WORDS];
UInt64 _pad_2[4];
// we want 64-bytes alignment here
Byte buffer[MD5_BLOCK_SIZE];
} CMd5;
void Md5_Init(CMd5 *p);
void Md5_Update(CMd5 *p, const Byte *data, size_t size);
void Md5_Final(CMd5 *p, Byte *digest);
EXTERN_C_END
#endif
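
A minimal usage sketch for this streaming interface (illustrative and untested; Byte comes from 7zTypes.h):

#include <stdio.h>
#include "Md5.h"

static void PrintMd5(const Byte *data, size_t size)
{
    CMd5 md5;
    Byte digest[MD5_DIGEST_SIZE];
    unsigned i;
    Md5_Init(&md5);
    Md5_Update(&md5, data, size);   /* may be called repeatedly with chunks */
    Md5_Final(&md5, digest);        /* writes 16 bytes and re-inits the context */
    for (i = 0; i < MD5_DIGEST_SIZE; i++)
        printf("%02x", digest[i]);
    printf("\n");
}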

View file

@ -1,28 +1,28 @@
/* MtCoder.c -- Multi-thread Coder /* MtCoder.c -- Multi-thread Coder
2021-12-21 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include "MtCoder.h" #include "MtCoder.h"
#ifndef _7ZIP_ST #ifndef Z7_ST
static SRes MtProgressThunk_Progress(const ICompressProgress *pp, UInt64 inSize, UInt64 outSize) static SRes MtProgressThunk_Progress(ICompressProgressPtr pp, UInt64 inSize, UInt64 outSize)
{ {
CMtProgressThunk *thunk = CONTAINER_FROM_VTBL(pp, CMtProgressThunk, vt); Z7_CONTAINER_FROM_VTBL_TO_DECL_VAR_pp_vt_p(CMtProgressThunk)
UInt64 inSize2 = 0; UInt64 inSize2 = 0;
UInt64 outSize2 = 0; UInt64 outSize2 = 0;
if (inSize != (UInt64)(Int64)-1) if (inSize != (UInt64)(Int64)-1)
{ {
inSize2 = inSize - thunk->inSize; inSize2 = inSize - p->inSize;
thunk->inSize = inSize; p->inSize = inSize;
} }
if (outSize != (UInt64)(Int64)-1) if (outSize != (UInt64)(Int64)-1)
{ {
outSize2 = outSize - thunk->outSize; outSize2 = outSize - p->outSize;
thunk->outSize = outSize; p->outSize = outSize;
} }
return MtProgress_ProgressAdd(thunk->mtProgress, inSize2, outSize2); return MtProgress_ProgressAdd(p->mtProgress, inSize2, outSize2);
} }
@ -36,25 +36,31 @@ void MtProgressThunk_CreateVTable(CMtProgressThunk *p)
#define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; } #define RINOK_THREAD(x) { if ((x) != 0) return SZ_ERROR_THREAD; }
static WRes ArEvent_OptCreate_And_Reset(CEvent *p)
{
if (Event_IsCreated(p))
return Event_Reset(p);
return AutoResetEvent_CreateNotSignaled(p);
}
static THREAD_FUNC_DECL ThreadFunc(void *pp); static THREAD_FUNC_DECL ThreadFunc(void *pp);
static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t) static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t
#ifdef _WIN32
, CMtCoder * const mtc
#endif
)
{ {
WRes wres = ArEvent_OptCreate_And_Reset(&t->startEvent); WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->startEvent);
// printf("\n====== MtCoderThread_CreateAndStart : \n");
if (wres == 0) if (wres == 0)
{ {
t->stop = False; t->stop = False;
if (!Thread_WasCreated(&t->thread)) if (!Thread_WasCreated(&t->thread))
wres = Thread_Create(&t->thread, ThreadFunc, t); {
#ifdef _WIN32
if (mtc->numThreadGroups)
wres = Thread_Create_With_Group(&t->thread, ThreadFunc, t,
ThreadNextGroup_GetNext(&mtc->nextGroup), // group
0); // affinityMask
else
#endif
wres = Thread_Create(&t->thread, ThreadFunc, t);
}
if (wres == 0) if (wres == 0)
wres = Event_Set(&t->startEvent); wres = Event_Set(&t->startEvent);
} }
@ -64,6 +70,7 @@ static SRes MtCoderThread_CreateAndStart(CMtCoderThread *t)
} }
Z7_FORCE_INLINE
static void MtCoderThread_Destruct(CMtCoderThread *t) static void MtCoderThread_Destruct(CMtCoderThread *t)
{ {
if (Thread_WasCreated(&t->thread)) if (Thread_WasCreated(&t->thread))
@ -84,24 +91,6 @@ static void MtCoderThread_Destruct(CMtCoderThread *t)
static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize)
{
size_t size = *processedSize;
*processedSize = 0;
while (size != 0)
{
size_t cur = size;
SRes res = ISeqInStream_Read(stream, data, &cur);
*processedSize += cur;
data += cur;
size -= cur;
RINOK(res);
if (cur == 0)
return SZ_OK;
}
return SZ_OK;
}
/* /*
ThreadFunc2() returns: ThreadFunc2() returns:
@ -111,7 +100,7 @@ static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize)
static SRes ThreadFunc2(CMtCoderThread *t) static SRes ThreadFunc2(CMtCoderThread *t)
{ {
CMtCoder *mtc = t->mtCoder; CMtCoder * const mtc = t->mtCoder;
for (;;) for (;;)
{ {
@ -152,7 +141,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
} }
if (res == SZ_OK) if (res == SZ_OK)
{ {
res = FullRead(mtc->inStream, t->inBuf, &size); res = SeqInStream_ReadMax(mtc->inStream, t->inBuf, &size);
readProcessed = mtc->readProcessed + size; readProcessed = mtc->readProcessed + size;
mtc->readProcessed = readProcessed; mtc->readProcessed = readProcessed;
} }
@ -211,7 +200,11 @@ static SRes ThreadFunc2(CMtCoderThread *t)
if (mtc->numStartedThreads < mtc->numStartedThreadsLimit if (mtc->numStartedThreads < mtc->numStartedThreadsLimit
&& mtc->expectedDataSize != readProcessed) && mtc->expectedDataSize != readProcessed)
{ {
res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]); res = MtCoderThread_CreateAndStart(&mtc->threads[mtc->numStartedThreads]
#ifdef _WIN32
, mtc
#endif
);
if (res == SZ_OK) if (res == SZ_OK)
mtc->numStartedThreads++; mtc->numStartedThreads++;
else else
@ -247,13 +240,13 @@ static SRes ThreadFunc2(CMtCoderThread *t)
} }
{ {
CMtCoderBlock *block = &mtc->blocks[bi]; CMtCoderBlock * const block = &mtc->blocks[bi];
block->res = res; block->res = res;
block->bufIndex = bufIndex; block->bufIndex = bufIndex;
block->finished = finished; block->finished = finished;
} }
#ifdef MTCODER__USE_WRITE_THREAD #ifdef MTCODER_USE_WRITE_THREAD
RINOK_THREAD(Event_Set(&mtc->writeEvents[bi])) RINOK_THREAD(Event_Set(&mtc->writeEvents[bi]))
#else #else
{ {
@ -337,7 +330,7 @@ static SRes ThreadFunc2(CMtCoderThread *t)
static THREAD_FUNC_DECL ThreadFunc(void *pp) static THREAD_FUNC_DECL ThreadFunc(void *pp)
{ {
CMtCoderThread *t = (CMtCoderThread *)pp; CMtCoderThread * const t = (CMtCoderThread *)pp;
for (;;) for (;;)
{ {
if (Event_Wait(&t->startEvent) != 0) if (Event_Wait(&t->startEvent) != 0)
@ -345,16 +338,16 @@ static THREAD_FUNC_DECL ThreadFunc(void *pp)
if (t->stop) if (t->stop)
return 0; return 0;
{ {
SRes res = ThreadFunc2(t); const SRes res = ThreadFunc2(t);
CMtCoder *mtc = t->mtCoder; CMtCoder *mtc = t->mtCoder;
if (res != SZ_OK) if (res != SZ_OK)
{ {
MtProgress_SetError(&mtc->mtProgress, res); MtProgress_SetError(&mtc->mtProgress, res);
} }
#ifndef MTCODER__USE_WRITE_THREAD #ifndef MTCODER_USE_WRITE_THREAD
{ {
unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads); const unsigned numFinished = (unsigned)InterlockedIncrement(&mtc->numFinishedThreads);
if (numFinished == mtc->numStartedThreads) if (numFinished == mtc->numStartedThreads)
if (Event_Set(&mtc->finishedEvent) != 0) if (Event_Set(&mtc->finishedEvent) != 0)
return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD; return (THREAD_FUNC_RET_TYPE)SZ_ERROR_THREAD;
@ -372,6 +365,7 @@ void MtCoder_Construct(CMtCoder *p)
p->blockSize = 0; p->blockSize = 0;
p->numThreadsMax = 0; p->numThreadsMax = 0;
p->numThreadGroups = 0;
p->expectedDataSize = (UInt64)(Int64)-1; p->expectedDataSize = (UInt64)(Int64)-1;
p->inStream = NULL; p->inStream = NULL;
@ -389,7 +383,7 @@ void MtCoder_Construct(CMtCoder *p)
Event_Construct(&p->readEvent); Event_Construct(&p->readEvent);
Semaphore_Construct(&p->blocksSemaphore); Semaphore_Construct(&p->blocksSemaphore);
for (i = 0; i < MTCODER__THREADS_MAX; i++) for (i = 0; i < MTCODER_THREADS_MAX; i++)
{ {
CMtCoderThread *t = &p->threads[i]; CMtCoderThread *t = &p->threads[i];
t->mtCoder = p; t->mtCoder = p;
@ -397,11 +391,11 @@ void MtCoder_Construct(CMtCoder *p)
t->inBuf = NULL; t->inBuf = NULL;
t->stop = False; t->stop = False;
Event_Construct(&t->startEvent); Event_Construct(&t->startEvent);
Thread_Construct(&t->thread); Thread_CONSTRUCT(&t->thread)
} }
#ifdef MTCODER__USE_WRITE_THREAD #ifdef MTCODER_USE_WRITE_THREAD
for (i = 0; i < MTCODER__BLOCKS_MAX; i++) for (i = 0; i < MTCODER_BLOCKS_MAX; i++)
Event_Construct(&p->writeEvents[i]); Event_Construct(&p->writeEvents[i]);
#else #else
Event_Construct(&p->finishedEvent); Event_Construct(&p->finishedEvent);
@ -424,14 +418,14 @@ static void MtCoder_Free(CMtCoder *p)
Event_Set(&p->readEvent); Event_Set(&p->readEvent);
*/ */
for (i = 0; i < MTCODER__THREADS_MAX; i++) for (i = 0; i < MTCODER_THREADS_MAX; i++)
MtCoderThread_Destruct(&p->threads[i]); MtCoderThread_Destruct(&p->threads[i]);
Event_Close(&p->readEvent); Event_Close(&p->readEvent);
Semaphore_Close(&p->blocksSemaphore); Semaphore_Close(&p->blocksSemaphore);
#ifdef MTCODER__USE_WRITE_THREAD #ifdef MTCODER_USE_WRITE_THREAD
for (i = 0; i < MTCODER__BLOCKS_MAX; i++) for (i = 0; i < MTCODER_BLOCKS_MAX; i++)
Event_Close(&p->writeEvents[i]); Event_Close(&p->writeEvents[i]);
#else #else
Event_Close(&p->finishedEvent); Event_Close(&p->finishedEvent);
@ -455,20 +449,22 @@ SRes MtCoder_Code(CMtCoder *p)
unsigned i; unsigned i;
SRes res = SZ_OK; SRes res = SZ_OK;
if (numThreads > MTCODER__THREADS_MAX) // printf("\n====== MtCoder_Code : \n");
numThreads = MTCODER__THREADS_MAX;
numBlocksMax = MTCODER__GET_NUM_BLOCKS_FROM_THREADS(numThreads); if (numThreads > MTCODER_THREADS_MAX)
numThreads = MTCODER_THREADS_MAX;
numBlocksMax = MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads);
if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++; if (p->blockSize < ((UInt32)1 << 26)) numBlocksMax++;
if (p->blockSize < ((UInt32)1 << 24)) numBlocksMax++; if (p->blockSize < ((UInt32)1 << 24)) numBlocksMax++;
if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++; if (p->blockSize < ((UInt32)1 << 22)) numBlocksMax++;
if (numBlocksMax > MTCODER__BLOCKS_MAX) if (numBlocksMax > MTCODER_BLOCKS_MAX)
numBlocksMax = MTCODER__BLOCKS_MAX; numBlocksMax = MTCODER_BLOCKS_MAX;
if (p->blockSize != p->allocatedBufsSize) if (p->blockSize != p->allocatedBufsSize)
{ {
for (i = 0; i < MTCODER__THREADS_MAX; i++) for (i = 0; i < MTCODER_THREADS_MAX; i++)
{ {
CMtCoderThread *t = &p->threads[i]; CMtCoderThread *t = &p->threads[i];
if (t->inBuf) if (t->inBuf)
@ -484,23 +480,23 @@ SRes MtCoder_Code(CMtCoder *p)
MtProgress_Init(&p->mtProgress, p->progress); MtProgress_Init(&p->mtProgress, p->progress);
#ifdef MTCODER__USE_WRITE_THREAD #ifdef MTCODER_USE_WRITE_THREAD
for (i = 0; i < numBlocksMax; i++) for (i = 0; i < numBlocksMax; i++)
{ {
RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->writeEvents[i])); RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->writeEvents[i]))
} }
#else #else
RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->finishedEvent)); RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->finishedEvent))
#endif #endif
{ {
RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->readEvent)); RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->readEvent))
RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, numBlocksMax, numBlocksMax)); RINOK_THREAD(Semaphore_OptCreateInit(&p->blocksSemaphore, (UInt32)numBlocksMax, (UInt32)numBlocksMax))
} }
for (i = 0; i < MTCODER__BLOCKS_MAX - 1; i++) for (i = 0; i < MTCODER_BLOCKS_MAX - 1; i++)
p->freeBlockList[i] = i + 1; p->freeBlockList[i] = i + 1;
p->freeBlockList[MTCODER__BLOCKS_MAX - 1] = (unsigned)(int)-1; p->freeBlockList[MTCODER_BLOCKS_MAX - 1] = (unsigned)(int)-1;
p->freeBlockHead = 0; p->freeBlockHead = 0;
p->readProcessed = 0; p->readProcessed = 0;
@ -508,26 +504,37 @@ SRes MtCoder_Code(CMtCoder *p)
p->numBlocksMax = numBlocksMax; p->numBlocksMax = numBlocksMax;
p->stopReading = False; p->stopReading = False;
#ifndef MTCODER__USE_WRITE_THREAD #ifndef MTCODER_USE_WRITE_THREAD
p->writeIndex = 0; p->writeIndex = 0;
p->writeRes = SZ_OK; p->writeRes = SZ_OK;
for (i = 0; i < MTCODER__BLOCKS_MAX; i++) for (i = 0; i < MTCODER_BLOCKS_MAX; i++)
p->ReadyBlocks[i] = False; p->ReadyBlocks[i] = False;
p->numFinishedThreads = 0; p->numFinishedThreads = 0;
#endif #endif
p->numStartedThreadsLimit = numThreads; p->numStartedThreadsLimit = numThreads;
p->numStartedThreads = 0; p->numStartedThreads = 0;
ThreadNextGroup_Init(&p->nextGroup, p->numThreadGroups, 0); // startGroup
// for (i = 0; i < numThreads; i++) // for (i = 0; i < numThreads; i++)
{ {
// here we create new thread for first block.
// And each new thread will create another new thread after block reading
// until numStartedThreadsLimit is reached.
CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++]; CMtCoderThread *nextThread = &p->threads[p->numStartedThreads++];
RINOK(MtCoderThread_CreateAndStart(nextThread)); {
const SRes res2 = MtCoderThread_CreateAndStart(nextThread
#ifdef _WIN32
, p
#endif
);
RINOK(res2)
}
} }
RINOK_THREAD(Event_Set(&p->readEvent)) RINOK_THREAD(Event_Set(&p->readEvent))
#ifdef MTCODER__USE_WRITE_THREAD #ifdef MTCODER_USE_WRITE_THREAD
{ {
unsigned bi = 0; unsigned bi = 0;
@ -539,9 +546,9 @@ SRes MtCoder_Code(CMtCoder *p)
RINOK_THREAD(Event_Wait(&p->writeEvents[bi])) RINOK_THREAD(Event_Wait(&p->writeEvents[bi]))
{ {
const CMtCoderBlock *block = &p->blocks[bi]; const CMtCoderBlock * const block = &p->blocks[bi];
unsigned bufIndex = block->bufIndex; const unsigned bufIndex = block->bufIndex;
BoolInt finished = block->finished; const BoolInt finished = block->finished;
if (res == SZ_OK && block->res != SZ_OK) if (res == SZ_OK && block->res != SZ_OK)
res = block->res; res = block->res;
@ -571,7 +578,7 @@ SRes MtCoder_Code(CMtCoder *p)
} }
#else #else
{ {
WRes wres = Event_Wait(&p->finishedEvent); const WRes wres = Event_Wait(&p->finishedEvent);
res = MY_SRes_HRESULT_FROM_WRes(wres); res = MY_SRes_HRESULT_FROM_WRes(wres);
} }
#endif #endif
@ -582,7 +589,7 @@ SRes MtCoder_Code(CMtCoder *p)
if (res == SZ_OK) if (res == SZ_OK)
res = p->mtProgress.res; res = p->mtProgress.res;
#ifndef MTCODER__USE_WRITE_THREAD #ifndef MTCODER_USE_WRITE_THREAD
if (res == SZ_OK) if (res == SZ_OK)
res = p->writeRes; res = p->writeRes;
#endif #endif
@ -593,3 +600,5 @@ SRes MtCoder_Code(CMtCoder *p)
} }
#endif #endif
#undef RINOK_THREAD
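
Both this file and MtDec.c below drop their private ArEvent_OptCreate_And_Reset helpers in favor of a shared AutoResetEvent_OptCreate_And_Reset, presumably promoted into the Threads API (its new home is outside this diff). The removed lines show the behavior being preserved:

/* create the auto-reset event on first use, otherwise just reset it */
static WRes ArEvent_OptCreate_And_Reset(CEvent *p)
{
    if (Event_IsCreated(p))
        return Event_Reset(p);
    return AutoResetEvent_CreateNotSignaled(p);
}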

View file

@ -1,30 +1,30 @@
/* MtCoder.h -- Multi-thread Coder /* MtCoder.h -- Multi-thread Coder
2018-07-04 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#ifndef __MT_CODER_H #ifndef ZIP7_INC_MT_CODER_H
#define __MT_CODER_H #define ZIP7_INC_MT_CODER_H
#include "MtDec.h" #include "MtDec.h"
EXTERN_C_BEGIN EXTERN_C_BEGIN
/* /*
if ( defined MTCODER__USE_WRITE_THREAD) : main thread writes all data blocks to output stream if ( defined MTCODER_USE_WRITE_THREAD) : main thread writes all data blocks to output stream
if (not defined MTCODER__USE_WRITE_THREAD) : any coder thread can write data blocks to output stream if (not defined MTCODER_USE_WRITE_THREAD) : any coder thread can write data blocks to output stream
*/ */
/* #define MTCODER__USE_WRITE_THREAD */ /* #define MTCODER_USE_WRITE_THREAD */
#ifndef _7ZIP_ST #ifndef Z7_ST
#define MTCODER__GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1) #define MTCODER_GET_NUM_BLOCKS_FROM_THREADS(numThreads) ((numThreads) + (numThreads) / 8 + 1)
#define MTCODER__THREADS_MAX 64 #define MTCODER_THREADS_MAX 256
#define MTCODER__BLOCKS_MAX (MTCODER__GET_NUM_BLOCKS_FROM_THREADS(MTCODER__THREADS_MAX) + 3) #define MTCODER_BLOCKS_MAX (MTCODER_GET_NUM_BLOCKS_FROM_THREADS(MTCODER_THREADS_MAX) + 3)
#else #else
#define MTCODER__THREADS_MAX 1 #define MTCODER_THREADS_MAX 1
#define MTCODER__BLOCKS_MAX 1 #define MTCODER_BLOCKS_MAX 1
#endif #endif
#ifndef _7ZIP_ST #ifndef Z7_ST
typedef struct typedef struct
@ -37,15 +37,15 @@ typedef struct
void MtProgressThunk_CreateVTable(CMtProgressThunk *p); void MtProgressThunk_CreateVTable(CMtProgressThunk *p);
#define MtProgressThunk_Init(p) { (p)->inSize = 0; (p)->outSize = 0; } #define MtProgressThunk_INIT(p) { (p)->inSize = 0; (p)->outSize = 0; }
struct _CMtCoder; struct CMtCoder_;
typedef struct typedef struct
{ {
struct _CMtCoder *mtCoder; struct CMtCoder_ *mtCoder;
unsigned index; unsigned index;
int stop; int stop;
Byte *inBuf; Byte *inBuf;
@ -71,19 +71,20 @@ typedef struct
} CMtCoderBlock; } CMtCoderBlock;
typedef struct _CMtCoder typedef struct CMtCoder_
{ {
/* input variables */ /* input variables */
size_t blockSize; /* size of input block */ size_t blockSize; /* size of input block */
unsigned numThreadsMax; unsigned numThreadsMax;
unsigned numThreadGroups;
UInt64 expectedDataSize; UInt64 expectedDataSize;
ISeqInStream *inStream; ISeqInStreamPtr inStream;
const Byte *inData; const Byte *inData;
size_t inDataSize; size_t inDataSize;
ICompressProgress *progress; ICompressProgressPtr progress;
ISzAllocPtr allocBig; ISzAllocPtr allocBig;
IMtCoderCallback2 *mtCallback; IMtCoderCallback2 *mtCallback;
@ -100,13 +101,13 @@ typedef struct _CMtCoder
BoolInt stopReading; BoolInt stopReading;
SRes readRes; SRes readRes;
#ifdef MTCODER__USE_WRITE_THREAD #ifdef MTCODER_USE_WRITE_THREAD
CAutoResetEvent writeEvents[MTCODER__BLOCKS_MAX]; CAutoResetEvent writeEvents[MTCODER_BLOCKS_MAX];
#else #else
CAutoResetEvent finishedEvent; CAutoResetEvent finishedEvent;
SRes writeRes; SRes writeRes;
unsigned writeIndex; unsigned writeIndex;
Byte ReadyBlocks[MTCODER__BLOCKS_MAX]; Byte ReadyBlocks[MTCODER_BLOCKS_MAX];
LONG numFinishedThreads; LONG numFinishedThreads;
#endif #endif
@ -120,11 +121,13 @@ typedef struct _CMtCoder
CCriticalSection cs; CCriticalSection cs;
unsigned freeBlockHead; unsigned freeBlockHead;
unsigned freeBlockList[MTCODER__BLOCKS_MAX]; unsigned freeBlockList[MTCODER_BLOCKS_MAX];
CMtProgress mtProgress; CMtProgress mtProgress;
CMtCoderBlock blocks[MTCODER__BLOCKS_MAX]; CMtCoderBlock blocks[MTCODER_BLOCKS_MAX];
CMtCoderThread threads[MTCODER__THREADS_MAX]; CMtCoderThread threads[MTCODER_THREADS_MAX];
CThreadNextGroup nextGroup;
} CMtCoder; } CMtCoder;
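
With the thread ceiling raised from 64 to 256, the pool macros above size the block pool as follows (worked numbers, not part of the diff):

/* MTCODER_GET_NUM_BLOCKS_FROM_THREADS(256) = 256 + 256/8 + 1 = 289
   MTCODER_BLOCKS_MAX                       = 289 + 3         = 292
   (the old 64-thread limit gave 64 + 8 + 1 + 3 = 76 blocks)   */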

C/MtDec.c (117 lines changed)
View file

@ -1,5 +1,5 @@
/* MtDec.c -- Multi-thread Decoder /* MtDec.c -- Multi-thread Decoder
2021-12-21 : Igor Pavlov : Public domain */ 2024-02-20 : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -14,7 +14,7 @@
#include "MtDec.h" #include "MtDec.h"
#ifndef _7ZIP_ST #ifndef Z7_ST
#ifdef SHOW_DEBUG_INFO #ifdef SHOW_DEBUG_INFO
#define PRF(x) x #define PRF(x) x
@ -24,7 +24,7 @@
#define PRF_STR_INT(s, d) PRF(printf("\n" s " %d\n", (unsigned)d)) #define PRF_STR_INT(s, d) PRF(printf("\n" s " %d\n", (unsigned)d))
void MtProgress_Init(CMtProgress *p, ICompressProgress *progress) void MtProgress_Init(CMtProgress *p, ICompressProgressPtr progress)
{ {
p->progress = progress; p->progress = progress;
p->res = SZ_OK; p->res = SZ_OK;
@ -81,36 +81,28 @@ void MtProgress_SetError(CMtProgress *p, SRes res)
#define RINOK_THREAD(x) RINOK_WRes(x) #define RINOK_THREAD(x) RINOK_WRes(x)
static WRes ArEvent_OptCreate_And_Reset(CEvent *p) struct CMtDecBufLink_
{ {
if (Event_IsCreated(p)) struct CMtDecBufLink_ *next;
return Event_Reset(p);
return AutoResetEvent_CreateNotSignaled(p);
}
struct __CMtDecBufLink
{
struct __CMtDecBufLink *next;
void *pad[3]; void *pad[3];
}; };
typedef struct __CMtDecBufLink CMtDecBufLink; typedef struct CMtDecBufLink_ CMtDecBufLink;
#define MTDEC__LINK_DATA_OFFSET sizeof(CMtDecBufLink) #define MTDEC__LINK_DATA_OFFSET sizeof(CMtDecBufLink)
#define MTDEC__DATA_PTR_FROM_LINK(link) ((Byte *)(link) + MTDEC__LINK_DATA_OFFSET) #define MTDEC__DATA_PTR_FROM_LINK(link) ((Byte *)(link) + MTDEC__LINK_DATA_OFFSET)
static THREAD_FUNC_DECL ThreadFunc(void *pp); static THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp);
static WRes MtDecThread_CreateEvents(CMtDecThread *t) static WRes MtDecThread_CreateEvents(CMtDecThread *t)
{ {
WRes wres = ArEvent_OptCreate_And_Reset(&t->canWrite); WRes wres = AutoResetEvent_OptCreate_And_Reset(&t->canWrite);
if (wres == 0) if (wres == 0)
{ {
wres = ArEvent_OptCreate_And_Reset(&t->canRead); wres = AutoResetEvent_OptCreate_And_Reset(&t->canRead);
if (wres == 0) if (wres == 0)
return SZ_OK; return SZ_OK;
} }
@ -126,7 +118,7 @@ static SRes MtDecThread_CreateAndStart(CMtDecThread *t)
{ {
if (Thread_WasCreated(&t->thread)) if (Thread_WasCreated(&t->thread))
return SZ_OK; return SZ_OK;
wres = Thread_Create(&t->thread, ThreadFunc, t); wres = Thread_Create(&t->thread, MtDec_ThreadFunc, t);
if (wres == 0) if (wres == 0)
return SZ_OK; return SZ_OK;
} }
@ -167,7 +159,7 @@ static void MtDecThread_CloseThread(CMtDecThread *t)
static void MtDec_CloseThreads(CMtDec *p) static void MtDec_CloseThreads(CMtDec *p)
{ {
unsigned i; unsigned i;
for (i = 0; i < MTDEC__THREADS_MAX; i++) for (i = 0; i < MTDEC_THREADS_MAX; i++)
MtDecThread_CloseThread(&p->threads[i]); MtDecThread_CloseThread(&p->threads[i]);
} }
@ -179,25 +171,6 @@ static void MtDecThread_Destruct(CMtDecThread *t)
static SRes FullRead(ISeqInStream *stream, Byte *data, size_t *processedSize)
{
size_t size = *processedSize;
*processedSize = 0;
while (size != 0)
{
size_t cur = size;
SRes res = ISeqInStream_Read(stream, data, &cur);
*processedSize += cur;
data += cur;
size -= cur;
RINOK(res);
if (cur == 0)
return SZ_OK;
}
return SZ_OK;
}
static SRes MtDec_GetError_Spec(CMtDec *p, UInt64 interruptIndex, BoolInt *wasInterrupted) static SRes MtDec_GetError_Spec(CMtDec *p, UInt64 interruptIndex, BoolInt *wasInterrupted)
{ {
SRes res; SRes res;
@ -253,7 +226,7 @@ Byte *MtDec_GetCrossBuff(CMtDec *p)
/* /*
ThreadFunc2() returns: MtDec_ThreadFunc2() returns:
0 - in all normal cases (even for stream error or memory allocation error) 0 - in all normal cases (even for stream error or memory allocation error)
(!= 0) - WRes error return by system threading function (!= 0) - WRes error return by system threading function
*/ */
@ -261,11 +234,11 @@ Byte *MtDec_GetCrossBuff(CMtDec *p)
// #define MTDEC_ProgessStep (1 << 22) // #define MTDEC_ProgessStep (1 << 22)
#define MTDEC_ProgessStep (1 << 0) #define MTDEC_ProgessStep (1 << 0)
static WRes ThreadFunc2(CMtDecThread *t) static WRes MtDec_ThreadFunc2(CMtDecThread *t)
{ {
CMtDec *p = t->mtDec; CMtDec *p = t->mtDec;
PRF_STR_INT("ThreadFunc2", t->index); PRF_STR_INT("MtDec_ThreadFunc2", t->index)
// SetThreadAffinityMask(GetCurrentThread(), 1 << t->index); // SetThreadAffinityMask(GetCurrentThread(), 1 << t->index);
@ -295,13 +268,13 @@ static WRes ThreadFunc2(CMtDecThread *t)
// CMtDecCallbackInfo parse; // CMtDecCallbackInfo parse;
CMtDecThread *nextThread; CMtDecThread *nextThread;
PRF_STR_INT("=============== Event_Wait(&t->canRead)", t->index); PRF_STR_INT("=============== Event_Wait(&t->canRead)", t->index)
RINOK_THREAD(Event_Wait(&t->canRead)); RINOK_THREAD(Event_Wait(&t->canRead))
if (p->exitThread) if (p->exitThread)
return 0; return 0;
PRF_STR_INT("after Event_Wait(&t->canRead)", t->index); PRF_STR_INT("after Event_Wait(&t->canRead)", t->index)
// if (t->index == 3) return 19; // for test // if (t->index == 3) return 19; // for test
@ -373,7 +346,7 @@ static WRes ThreadFunc2(CMtDecThread *t)
{ {
size = p->inBufSize; size = p->inBufSize;
res = FullRead(p->inStream, data, &size); res = SeqInStream_ReadMax(p->inStream, data, &size);
// size = 10; // test // size = 10; // test
@ -615,7 +588,7 @@ static WRes ThreadFunc2(CMtDecThread *t)
// if ( !finish ) we must call Event_Set(&nextThread->canWrite) in any case // if ( !finish ) we must call Event_Set(&nextThread->canWrite) in any case
// if ( finish ) we switch to single-thread mode and there are 2 ways at the end of current iteration (current block): // if ( finish ) we switch to single-thread mode and there are 2 ways at the end of current iteration (current block):
// - if (needContinue) after Write(&needContinue), we restore decoding with new iteration // - if (needContinue) after Write(&needContinue), we restore decoding with new iteration
// - otherwise we stop decoding and exit from ThreadFunc2() // - otherwise we stop decoding and exit from MtDec_ThreadFunc2()
// Don't change (finish) variable in the further code // Don't change (finish) variable in the further code
@ -688,7 +661,7 @@ static WRes ThreadFunc2(CMtDecThread *t)
// ---------- WRITE ---------- // ---------- WRITE ----------
RINOK_THREAD(Event_Wait(&t->canWrite)); RINOK_THREAD(Event_Wait(&t->canWrite))
{ {
BoolInt isErrorMode = False; BoolInt isErrorMode = False;
@ -801,14 +774,14 @@ static WRes ThreadFunc2(CMtDecThread *t)
if (!finish) if (!finish)
{ {
RINOK_THREAD(Event_Set(&nextThread->canWrite)); RINOK_THREAD(Event_Set(&nextThread->canWrite))
} }
else else
{ {
if (needContinue) if (needContinue)
{ {
// we restore decoding with new iteration // we restore decoding with new iteration
RINOK_THREAD(Event_Set(&p->threads[0].canWrite)); RINOK_THREAD(Event_Set(&p->threads[0].canWrite))
} }
else else
{ {
@ -817,7 +790,7 @@ static WRes ThreadFunc2(CMtDecThread *t)
return SZ_OK; return SZ_OK;
p->exitThread = True; p->exitThread = True;
} }
RINOK_THREAD(Event_Set(&p->threads[0].canRead)); RINOK_THREAD(Event_Set(&p->threads[0].canRead))
} }
} }
} }
@ -836,7 +809,17 @@ static WRes ThreadFunc2(CMtDecThread *t)
#endif #endif
static THREAD_FUNC_DECL ThreadFunc1(void *pp) typedef
#ifdef _WIN32
UINT_PTR
#elif 1
uintptr_t
#else
ptrdiff_t
#endif
MY_uintptr_t;
static THREAD_FUNC_DECL MtDec_ThreadFunc1(void *pp)
{ {
WRes res; WRes res;
@ -845,10 +828,10 @@ static THREAD_FUNC_DECL ThreadFunc1(void *pp)
// fprintf(stdout, "\n%d = %p\n", t->index, &t); // fprintf(stdout, "\n%d = %p\n", t->index, &t);
res = ThreadFunc2(t); res = MtDec_ThreadFunc2(t);
p = t->mtDec; p = t->mtDec;
if (res == 0) if (res == 0)
return (THREAD_FUNC_RET_TYPE)(UINT_PTR)p->exitThreadWRes; return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)p->exitThreadWRes;
{ {
// it's unexpected situation for some threading function error // it's unexpected situation for some threading function error
if (p->exitThreadWRes == 0) if (p->exitThreadWRes == 0)
@ -859,17 +842,17 @@ static THREAD_FUNC_DECL ThreadFunc1(void *pp)
Event_Set(&p->threads[0].canWrite); Event_Set(&p->threads[0].canWrite);
MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res)); MtProgress_SetError(&p->mtProgress, MY_SRes_HRESULT_FROM_WRes(res));
} }
return (THREAD_FUNC_RET_TYPE)(UINT_PTR)res; return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)res;
} }
static MY_NO_INLINE THREAD_FUNC_DECL ThreadFunc(void *pp) static Z7_NO_INLINE THREAD_FUNC_DECL MtDec_ThreadFunc(void *pp)
{ {
#ifdef USE_ALLOCA #ifdef USE_ALLOCA
CMtDecThread *t = (CMtDecThread *)pp; CMtDecThread *t = (CMtDecThread *)pp;
// fprintf(stderr, "\n%d = %p - before", t->index, &t); // fprintf(stderr, "\n%d = %p - before", t->index, &t);
t->allocaPtr = alloca(t->index * 128); t->allocaPtr = alloca(t->index * 128);
#endif #endif
return ThreadFunc1(pp); return MtDec_ThreadFunc1(pp);
} }
@ -883,7 +866,7 @@ int MtDec_PrepareRead(CMtDec *p)
{ {
unsigned i; unsigned i;
for (i = 0; i < MTDEC__THREADS_MAX; i++) for (i = 0; i < MTDEC_THREADS_MAX; i++)
if (i > p->numStartedThreads if (i > p->numStartedThreads
|| p->numFilledThreads <= || p->numFilledThreads <=
(i >= p->filledThreadStart ? (i >= p->filledThreadStart ?
@ -987,7 +970,7 @@ void MtDec_Construct(CMtDec *p)
p->allocatedBufsSize = 0; p->allocatedBufsSize = 0;
for (i = 0; i < MTDEC__THREADS_MAX; i++) for (i = 0; i < MTDEC_THREADS_MAX; i++)
{ {
CMtDecThread *t = &p->threads[i]; CMtDecThread *t = &p->threads[i];
t->mtDec = p; t->mtDec = p;
@ -995,7 +978,7 @@ void MtDec_Construct(CMtDec *p)
t->inBuf = NULL; t->inBuf = NULL;
Event_Construct(&t->canRead); Event_Construct(&t->canRead);
Event_Construct(&t->canWrite); Event_Construct(&t->canWrite);
Thread_Construct(&t->thread); Thread_CONSTRUCT(&t->thread)
} }
// Event_Construct(&p->finishedEvent); // Event_Construct(&p->finishedEvent);
@ -1010,7 +993,7 @@ static void MtDec_Free(CMtDec *p)
p->exitThread = True; p->exitThread = True;
for (i = 0; i < MTDEC__THREADS_MAX; i++) for (i = 0; i < MTDEC_THREADS_MAX; i++)
MtDecThread_Destruct(&p->threads[i]); MtDecThread_Destruct(&p->threads[i]);
// Event_Close(&p->finishedEvent); // Event_Close(&p->finishedEvent);
@ -1061,15 +1044,15 @@ SRes MtDec_Code(CMtDec *p)
{ {
unsigned numThreads = p->numThreadsMax; unsigned numThreads = p->numThreadsMax;
if (numThreads > MTDEC__THREADS_MAX) if (numThreads > MTDEC_THREADS_MAX)
numThreads = MTDEC__THREADS_MAX; numThreads = MTDEC_THREADS_MAX;
p->numStartedThreads_Limit = numThreads; p->numStartedThreads_Limit = numThreads;
p->numStartedThreads = 0; p->numStartedThreads = 0;
} }
if (p->inBufSize != p->allocatedBufsSize) if (p->inBufSize != p->allocatedBufsSize)
{ {
for (i = 0; i < MTDEC__THREADS_MAX; i++) for (i = 0; i < MTDEC_THREADS_MAX; i++)
{ {
CMtDecThread *t = &p->threads[i]; CMtDecThread *t = &p->threads[i];
if (t->inBuf) if (t->inBuf)
@ -1086,7 +1069,7 @@ SRes MtDec_Code(CMtDec *p)
MtProgress_Init(&p->mtProgress, p->progress); MtProgress_Init(&p->mtProgress, p->progress);
// RINOK_THREAD(ArEvent_OptCreate_And_Reset(&p->finishedEvent)); // RINOK_THREAD(AutoResetEvent_OptCreate_And_Reset(&p->finishedEvent))
p->exitThread = False; p->exitThread = False;
p->exitThreadWRes = 0; p->exitThreadWRes = 0;
@ -1098,8 +1081,8 @@ SRes MtDec_Code(CMtDec *p)
wres = MtDecThread_CreateEvents(nextThread); wres = MtDecThread_CreateEvents(nextThread);
if (wres == 0) { wres = Event_Set(&nextThread->canWrite); if (wres == 0) { wres = Event_Set(&nextThread->canWrite);
if (wres == 0) { wres = Event_Set(&nextThread->canRead); if (wres == 0) { wres = Event_Set(&nextThread->canRead);
if (wres == 0) { THREAD_FUNC_RET_TYPE res = ThreadFunc(nextThread); if (wres == 0) { THREAD_FUNC_RET_TYPE res = MtDec_ThreadFunc(nextThread);
wres = (WRes)(UINT_PTR)res; wres = (WRes)(MY_uintptr_t)res;
if (wres != 0) if (wres != 0)
{ {
p->needContinue = False; p->needContinue = False;
@ -1137,3 +1120,5 @@ SRes MtDec_Code(CMtDec *p)
} }
#endif #endif
#undef PRF
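
The MY_uintptr_t shim introduced above lets an integer WRes round-trip through the pointer-sized thread return type on both Windows (UINT_PTR) and POSIX (uintptr_t). The pattern in isolation (illustrative; threadRet is a hypothetical name):

return (THREAD_FUNC_RET_TYPE)(MY_uintptr_t)res;   /* thread side: pack status   */
wres = (WRes)(MY_uintptr_t)threadRet;             /* joiner side: unpack status */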

View file

@ -1,46 +1,46 @@
/* MtDec.h -- Multi-thread Decoder /* MtDec.h -- Multi-thread Decoder
2020-03-05 : Igor Pavlov : Public domain */ 2023-04-02 : Igor Pavlov : Public domain */
#ifndef __MT_DEC_H #ifndef ZIP7_INC_MT_DEC_H
#define __MT_DEC_H #define ZIP7_INC_MT_DEC_H
#include "7zTypes.h" #include "7zTypes.h"
#ifndef _7ZIP_ST #ifndef Z7_ST
#include "Threads.h" #include "Threads.h"
#endif #endif
EXTERN_C_BEGIN EXTERN_C_BEGIN
#ifndef _7ZIP_ST #ifndef Z7_ST
#ifndef _7ZIP_ST #ifndef Z7_ST
#define MTDEC__THREADS_MAX 32 #define MTDEC_THREADS_MAX 32
#else #else
#define MTDEC__THREADS_MAX 1 #define MTDEC_THREADS_MAX 1
#endif #endif
typedef struct typedef struct
{ {
ICompressProgress *progress; ICompressProgressPtr progress;
SRes res; SRes res;
UInt64 totalInSize; UInt64 totalInSize;
UInt64 totalOutSize; UInt64 totalOutSize;
CCriticalSection cs; CCriticalSection cs;
} CMtProgress; } CMtProgress;
void MtProgress_Init(CMtProgress *p, ICompressProgress *progress); void MtProgress_Init(CMtProgress *p, ICompressProgressPtr progress);
SRes MtProgress_Progress_ST(CMtProgress *p); SRes MtProgress_Progress_ST(CMtProgress *p);
SRes MtProgress_ProgressAdd(CMtProgress *p, UInt64 inSize, UInt64 outSize); SRes MtProgress_ProgressAdd(CMtProgress *p, UInt64 inSize, UInt64 outSize);
SRes MtProgress_GetError(CMtProgress *p); SRes MtProgress_GetError(CMtProgress *p);
void MtProgress_SetError(CMtProgress *p, SRes res); void MtProgress_SetError(CMtProgress *p, SRes res);
struct _CMtDec; struct CMtDec_;
typedef struct typedef struct
{ {
struct _CMtDec *mtDec; struct CMtDec_ *mtDec;
unsigned index; unsigned index;
void *inBuf; void *inBuf;
@ -117,7 +117,7 @@ typedef struct
typedef struct _CMtDec typedef struct CMtDec_
{ {
/* input variables */ /* input variables */
@ -126,11 +126,11 @@ typedef struct _CMtDec
// size_t inBlockMax; // size_t inBlockMax;
unsigned numThreadsMax_2; unsigned numThreadsMax_2;
ISeqInStream *inStream; ISeqInStreamPtr inStream;
// const Byte *inData; // const Byte *inData;
// size_t inDataSize; // size_t inDataSize;
ICompressProgress *progress; ICompressProgressPtr progress;
ISzAllocPtr alloc; ISzAllocPtr alloc;
IMtDecCallback2 *mtCallback; IMtDecCallback2 *mtCallback;
@ -171,11 +171,11 @@ typedef struct _CMtDec
unsigned filledThreadStart; unsigned filledThreadStart;
unsigned numFilledThreads; unsigned numFilledThreads;
#ifndef _7ZIP_ST #ifndef Z7_ST
BoolInt needInterrupt; BoolInt needInterrupt;
UInt64 interruptIndex; UInt64 interruptIndex;
CMtProgress mtProgress; CMtProgress mtProgress;
CMtDecThread threads[MTDEC__THREADS_MAX]; CMtDecThread threads[MTDEC_THREADS_MAX];
#endif #endif
} CMtDec; } CMtDec;

View file

@ -1,9 +1,9 @@
/* Ppmd.h -- PPMD codec common code /* Ppmd.h -- PPMD codec common code
2021-04-13 : Igor Pavlov : Public domain 2023-03-05 : Igor Pavlov : Public domain
This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */
#ifndef __PPMD_H #ifndef ZIP7_INC_PPMD_H
#define __PPMD_H #define ZIP7_INC_PPMD_H
#include "CpuArch.h" #include "CpuArch.h"
@ -48,8 +48,10 @@ typedef struct
Byte Count; /* Count to next change of Shift */ Byte Count; /* Count to next change of Shift */
} CPpmd_See; } CPpmd_See;
#define Ppmd_See_Update(p) if ((p)->Shift < PPMD_PERIOD_BITS && --(p)->Count == 0) \ #define Ppmd_See_UPDATE(p) \
{ (p)->Summ = (UInt16)((p)->Summ << 1); (p)->Count = (Byte)(3 << (p)->Shift++); } { if ((p)->Shift < PPMD_PERIOD_BITS && --(p)->Count == 0) \
{ (p)->Summ = (UInt16)((p)->Summ << 1); \
(p)->Count = (Byte)(3 << (p)->Shift++); }}
typedef struct typedef struct
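
The See-update macro above is one of many statement-like macros renamed to upper case in this changeset (LzmaDec_CONSTRUCT, Thread_CONSTRUCT, SWAP_STATES); the added outer braces turn its conditional into a single compound statement. Expanded at a call site it reads (illustrative):

Ppmd_See_UPDATE(see)   /* expands to: */
{ if (see->Shift < PPMD_PERIOD_BITS && --see->Count == 0)
  { see->Summ = (UInt16)(see->Summ << 1);
    see->Count = (Byte)(3 << see->Shift++); }}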

C/Ppmd7.c (217 lines changed)
View file

@ -1,5 +1,5 @@
/* Ppmd7.c -- PPMdH codec /* Ppmd7.c -- PPMdH codec
2021-04-13 : Igor Pavlov : Public domain 2023-09-07 : Igor Pavlov : Public domain
This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */ This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -14,7 +14,7 @@ This code is based on PPMd var.H (2001): Dmitry Shkarin : Public domain */
MY_ALIGN(16) MY_ALIGN(16)
static const Byte PPMD7_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 }; static const Byte PPMD7_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 };
MY_ALIGN(16) MY_ALIGN(16)
static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051}; static const UInt16 PPMD7_kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051};
#define MAX_FREQ 124 #define MAX_FREQ 124
#define UNIT_SIZE 12 #define UNIT_SIZE 12
@ -33,7 +33,7 @@ static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x
#define ONE_STATE(ctx) Ppmd7Context_OneState(ctx) #define ONE_STATE(ctx) Ppmd7Context_OneState(ctx)
#define SUFFIX(ctx) CTX((ctx)->Suffix) #define SUFFIX(ctx) CTX((ctx)->Suffix)
typedef CPpmd7_Context * CTX_PTR; typedef CPpmd7_Context * PPMD7_CTX_PTR;
struct CPpmd7_Node_; struct CPpmd7_Node_;
@ -107,14 +107,14 @@ BoolInt Ppmd7_Alloc(CPpmd7 *p, UInt32 size, ISzAllocPtr alloc)
// ---------- Internal Memory Allocator ---------- // ---------- Internal Memory Allocator ----------
/* We can use CPpmd7_Node in list of free units (as in Ppmd8) /* We can use CPpmd7_Node in list of free units (as in Ppmd8)
But we still need one additional list walk pass in GlueFreeBlocks(). But we still need one additional list walk pass in Ppmd7_GlueFreeBlocks().
So we use simple CPpmd_Void_Ref instead of CPpmd7_Node in InsertNode() / RemoveNode() So we use simple CPpmd_Void_Ref instead of CPpmd7_Node in Ppmd7_InsertNode() / Ppmd7_RemoveNode()
*/ */
#define EMPTY_NODE 0 #define EMPTY_NODE 0
static void InsertNode(CPpmd7 *p, void *node, unsigned indx) static void Ppmd7_InsertNode(CPpmd7 *p, void *node, unsigned indx)
{ {
*((CPpmd_Void_Ref *)node) = p->FreeList[indx]; *((CPpmd_Void_Ref *)node) = p->FreeList[indx];
// ((CPpmd7_Node *)node)->Next = (CPpmd7_Node_Ref)p->FreeList[indx]; // ((CPpmd7_Node *)node)->Next = (CPpmd7_Node_Ref)p->FreeList[indx];
@ -124,7 +124,7 @@ static void InsertNode(CPpmd7 *p, void *node, unsigned indx)
} }
static void *RemoveNode(CPpmd7 *p, unsigned indx) static void *Ppmd7_RemoveNode(CPpmd7 *p, unsigned indx)
{ {
CPpmd_Void_Ref *node = (CPpmd_Void_Ref *)Ppmd7_GetPtr(p, p->FreeList[indx]); CPpmd_Void_Ref *node = (CPpmd_Void_Ref *)Ppmd7_GetPtr(p, p->FreeList[indx]);
p->FreeList[indx] = *node; p->FreeList[indx] = *node;
@ -134,32 +134,32 @@ static void *RemoveNode(CPpmd7 *p, unsigned indx)
} }
static void SplitBlock(CPpmd7 *p, void *ptr, unsigned oldIndx, unsigned newIndx) static void Ppmd7_SplitBlock(CPpmd7 *p, void *ptr, unsigned oldIndx, unsigned newIndx)
{ {
unsigned i, nu = I2U(oldIndx) - I2U(newIndx); unsigned i, nu = I2U(oldIndx) - I2U(newIndx);
ptr = (Byte *)ptr + U2B(I2U(newIndx)); ptr = (Byte *)ptr + U2B(I2U(newIndx));
if (I2U(i = U2I(nu)) != nu) if (I2U(i = U2I(nu)) != nu)
{ {
unsigned k = I2U(--i); unsigned k = I2U(--i);
InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1); Ppmd7_InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1);
} }
InsertNode(p, ptr, i); Ppmd7_InsertNode(p, ptr, i);
} }
/* we use CPpmd7_Node_Union union to solve XLC -O2 strict pointer aliasing problem */ /* we use CPpmd7_Node_Union union to solve XLC -O2 strict pointer aliasing problem */
typedef union _CPpmd7_Node_Union typedef union
{ {
CPpmd7_Node Node; CPpmd7_Node Node;
CPpmd7_Node_Ref NextRef; CPpmd7_Node_Ref NextRef;
} CPpmd7_Node_Union; } CPpmd7_Node_Union;
/* Original PPmdH (Ppmd7) code uses doubly linked list in GlueFreeBlocks() /* Original PPmdH (Ppmd7) code uses doubly linked list in Ppmd7_GlueFreeBlocks()
we use single linked list similar to Ppmd8 code */ we use single linked list similar to Ppmd8 code */
static void GlueFreeBlocks(CPpmd7 *p) static void Ppmd7_GlueFreeBlocks(CPpmd7 *p)
{ {
/* /*
we use first UInt16 field of 12-bytes UNITs as record type stamp we use first UInt16 field of 12-bytes UNITs as record type stamp
@ -239,27 +239,27 @@ static void GlueFreeBlocks(CPpmd7 *p)
if (nu == 0) if (nu == 0)
continue; continue;
for (; nu > 128; nu -= 128, node += 128) for (; nu > 128; nu -= 128, node += 128)
InsertNode(p, node, PPMD_NUM_INDEXES - 1); Ppmd7_InsertNode(p, node, PPMD_NUM_INDEXES - 1);
if (I2U(i = U2I(nu)) != nu) if (I2U(i = U2I(nu)) != nu)
{ {
unsigned k = I2U(--i); unsigned k = I2U(--i);
InsertNode(p, node + k, (unsigned)nu - k - 1); Ppmd7_InsertNode(p, node + k, (unsigned)nu - k - 1);
} }
InsertNode(p, node, i); Ppmd7_InsertNode(p, node, i);
} }
} }
MY_NO_INLINE Z7_NO_INLINE
static void *AllocUnitsRare(CPpmd7 *p, unsigned indx) static void *Ppmd7_AllocUnitsRare(CPpmd7 *p, unsigned indx)
{ {
unsigned i; unsigned i;
if (p->GlueCount == 0) if (p->GlueCount == 0)
{ {
GlueFreeBlocks(p); Ppmd7_GlueFreeBlocks(p);
if (p->FreeList[indx] != 0) if (p->FreeList[indx] != 0)
return RemoveNode(p, indx); return Ppmd7_RemoveNode(p, indx);
} }
i = indx; i = indx;
@ -277,17 +277,17 @@ static void *AllocUnitsRare(CPpmd7 *p, unsigned indx)
while (p->FreeList[i] == 0); while (p->FreeList[i] == 0);
{ {
void *block = RemoveNode(p, i); void *block = Ppmd7_RemoveNode(p, i);
SplitBlock(p, block, i, indx); Ppmd7_SplitBlock(p, block, i, indx);
return block; return block;
} }
} }
static void *AllocUnits(CPpmd7 *p, unsigned indx) static void *Ppmd7_AllocUnits(CPpmd7 *p, unsigned indx)
{ {
if (p->FreeList[indx] != 0) if (p->FreeList[indx] != 0)
return RemoveNode(p, indx); return Ppmd7_RemoveNode(p, indx);
{ {
UInt32 numBytes = U2B(I2U(indx)); UInt32 numBytes = U2B(I2U(indx));
Byte *lo = p->LoUnit; Byte *lo = p->LoUnit;
@ -297,13 +297,22 @@ static void *AllocUnits(CPpmd7 *p, unsigned indx)
return lo; return lo;
} }
} }
return AllocUnitsRare(p, indx); return Ppmd7_AllocUnitsRare(p, indx);
} }
#define MyMem12Cpy(dest, src, num) \ #define MEM_12_CPY(dest, src, num) \
{ UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \ { UInt32 *d = (UInt32 *)(dest); \
do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); } const UInt32 *z = (const UInt32 *)(src); \
unsigned n = (num); \
do { \
d[0] = z[0]; \
d[1] = z[1]; \
d[2] = z[2]; \
z += 3; \
d += 3; \
} while (--n); \
}
/* /*
@ -315,12 +324,12 @@ static void *ShrinkUnits(CPpmd7 *p, void *oldPtr, unsigned oldNU, unsigned newNU
return oldPtr; return oldPtr;
if (p->FreeList[i1] != 0) if (p->FreeList[i1] != 0)
{ {
void *ptr = RemoveNode(p, i1); void *ptr = Ppmd7_RemoveNode(p, i1);
MyMem12Cpy(ptr, oldPtr, newNU); MEM_12_CPY(ptr, oldPtr, newNU)
InsertNode(p, oldPtr, i0); Ppmd7_InsertNode(p, oldPtr, i0);
return ptr; return ptr;
} }
SplitBlock(p, oldPtr, i0, i1); Ppmd7_SplitBlock(p, oldPtr, i0, i1);
return oldPtr; return oldPtr;
} }
*/ */
@ -329,14 +338,14 @@ static void *ShrinkUnits(CPpmd7 *p, void *oldPtr, unsigned oldNU, unsigned newNU
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
static void SetSuccessor(CPpmd_State *p, CPpmd_Void_Ref v) static void SetSuccessor(CPpmd_State *p, CPpmd_Void_Ref v)
{ {
Ppmd_SET_SUCCESSOR(p, v); Ppmd_SET_SUCCESSOR(p, v)
} }
MY_NO_INLINE Z7_NO_INLINE
static static
void RestartModel(CPpmd7 *p) void Ppmd7_RestartModel(CPpmd7 *p)
{ {
unsigned i, k; unsigned i, k;
@ -352,8 +361,8 @@ void RestartModel(CPpmd7 *p)
p->PrevSuccess = 0; p->PrevSuccess = 0;
{ {
CPpmd7_Context *mc = (CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */ CPpmd7_Context *mc = (PPMD7_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */
CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* AllocUnits(p, PPMD_NUM_INDEXES - 1); */ CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* Ppmd7_AllocUnits(p, PPMD_NUM_INDEXES - 1); */
p->LoUnit += U2B(256 / 2); p->LoUnit += U2B(256 / 2);
p->MaxContext = p->MinContext = mc; p->MaxContext = p->MinContext = mc;
@ -391,7 +400,7 @@ void RestartModel(CPpmd7 *p)
{ {
unsigned m; unsigned m;
UInt16 *dest = p->BinSumm[i] + k; UInt16 *dest = p->BinSumm[i] + k;
UInt16 val = (UInt16)(PPMD_BIN_SCALE - kInitBinEsc[k] / (i + 2)); const UInt16 val = (UInt16)(PPMD_BIN_SCALE - PPMD7_kInitBinEsc[k] / (i + 2));
for (m = 0; m < 64; m += 8) for (m = 0; m < 64; m += 8)
dest[m] = val; dest[m] = val;
} }
@ -423,13 +432,13 @@ void Ppmd7_Init(CPpmd7 *p, unsigned maxOrder)
{ {
p->MaxOrder = maxOrder; p->MaxOrder = maxOrder;
RestartModel(p); Ppmd7_RestartModel(p);
} }
/* /*
CreateSuccessors() Ppmd7_CreateSuccessors()
It's called when (FoundState->Successor) is RAW-Successor, It's called when (FoundState->Successor) is RAW-Successor,
that is the link to position in Raw text. that is the link to position in Raw text.
So we create Context records and write the links to So we create Context records and write the links to
@ -445,10 +454,10 @@ void Ppmd7_Init(CPpmd7 *p, unsigned maxOrder)
also it can return pointer to real context of same order, also it can return pointer to real context of same order,
*/ */
MY_NO_INLINE Z7_NO_INLINE
static CTX_PTR CreateSuccessors(CPpmd7 *p) static PPMD7_CTX_PTR Ppmd7_CreateSuccessors(CPpmd7 *p)
{ {
CTX_PTR c = p->MinContext; PPMD7_CTX_PTR c = p->MinContext;
CPpmd_Byte_Ref upBranch = (CPpmd_Byte_Ref)SUCCESSOR(p->FoundState); CPpmd_Byte_Ref upBranch = (CPpmd_Byte_Ref)SUCCESSOR(p->FoundState);
Byte newSym, newFreq; Byte newSym, newFreq;
unsigned numPs = 0; unsigned numPs = 0;
@ -522,15 +531,15 @@ static CTX_PTR CreateSuccessors(CPpmd7 *p)
do do
{ {
CTX_PTR c1; PPMD7_CTX_PTR c1;
/* = AllocContext(p); */ /* = AllocContext(p); */
if (p->HiUnit != p->LoUnit) if (p->HiUnit != p->LoUnit)
c1 = (CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); c1 = (PPMD7_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE);
else if (p->FreeList[0] != 0) else if (p->FreeList[0] != 0)
c1 = (CTX_PTR)RemoveNode(p, 0); c1 = (PPMD7_CTX_PTR)Ppmd7_RemoveNode(p, 0);
else else
{ {
c1 = (CTX_PTR)AllocUnitsRare(p, 0); c1 = (PPMD7_CTX_PTR)Ppmd7_AllocUnitsRare(p, 0);
if (!c1) if (!c1)
return NULL; return NULL;
} }
@ -550,16 +559,16 @@ static CTX_PTR CreateSuccessors(CPpmd7 *p)
#define SwapStates(s) \ #define SWAP_STATES(s) \
{ CPpmd_State tmp = s[0]; s[0] = s[-1]; s[-1] = tmp; } { CPpmd_State tmp = s[0]; s[0] = s[-1]; s[-1] = tmp; }
void Ppmd7_UpdateModel(CPpmd7 *p); void Ppmd7_UpdateModel(CPpmd7 *p);
MY_NO_INLINE Z7_NO_INLINE
void Ppmd7_UpdateModel(CPpmd7 *p) void Ppmd7_UpdateModel(CPpmd7 *p)
{ {
CPpmd_Void_Ref maxSuccessor, minSuccessor; CPpmd_Void_Ref maxSuccessor, minSuccessor;
CTX_PTR c, mc; PPMD7_CTX_PTR c, mc;
unsigned s0, ns; unsigned s0, ns;
@ -592,7 +601,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
if (s[0].Freq >= s[-1].Freq) if (s[0].Freq >= s[-1].Freq)
{ {
SwapStates(s); SWAP_STATES(s)
s--; s--;
} }
} }
@ -610,10 +619,10 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
{ {
/* MAX ORDER context */ /* MAX ORDER context */
/* (FoundState->Successor) is RAW-Successor. */ /* (FoundState->Successor) is RAW-Successor. */
p->MaxContext = p->MinContext = CreateSuccessors(p); p->MaxContext = p->MinContext = Ppmd7_CreateSuccessors(p);
if (!p->MinContext) if (!p->MinContext)
{ {
RestartModel(p); Ppmd7_RestartModel(p);
return; return;
} }
SetSuccessor(p->FoundState, REF(p->MinContext)); SetSuccessor(p->FoundState, REF(p->MinContext));
@ -629,7 +638,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
p->Text = text; p->Text = text;
if (text >= p->UnitsStart) if (text >= p->UnitsStart)
{ {
RestartModel(p); Ppmd7_RestartModel(p);
return; return;
} }
maxSuccessor = REF(text); maxSuccessor = REF(text);
@ -645,10 +654,10 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
if (minSuccessor <= maxSuccessor) if (minSuccessor <= maxSuccessor)
{ {
// minSuccessor is RAW-Successor. So we will create real contexts records: // minSuccessor is RAW-Successor. So we will create real contexts records:
CTX_PTR cs = CreateSuccessors(p); PPMD7_CTX_PTR cs = Ppmd7_CreateSuccessors(p);
if (!cs) if (!cs)
{ {
RestartModel(p); Ppmd7_RestartModel(p);
return; return;
} }
minSuccessor = REF(cs); minSuccessor = REF(cs);
@ -711,27 +720,27 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
if ((ns1 & 1) == 0) if ((ns1 & 1) == 0)
{ {
/* Expand for one UNIT */ /* Expand for one UNIT */
unsigned oldNU = ns1 >> 1; const unsigned oldNU = ns1 >> 1;
unsigned i = U2I(oldNU); const unsigned i = U2I(oldNU);
if (i != U2I((size_t)oldNU + 1)) if (i != U2I((size_t)oldNU + 1))
{ {
void *ptr = AllocUnits(p, i + 1); void *ptr = Ppmd7_AllocUnits(p, i + 1);
void *oldPtr; void *oldPtr;
if (!ptr) if (!ptr)
{ {
RestartModel(p); Ppmd7_RestartModel(p);
return; return;
} }
oldPtr = STATS(c); oldPtr = STATS(c);
MyMem12Cpy(ptr, oldPtr, oldNU); MEM_12_CPY(ptr, oldPtr, oldNU)
InsertNode(p, oldPtr, i); Ppmd7_InsertNode(p, oldPtr, i);
c->Union4.Stats = STATS_REF(ptr); c->Union4.Stats = STATS_REF(ptr);
} }
} }
sum = c->Union2.SummFreq; sum = c->Union2.SummFreq;
/* max increase of Escape_Freq is 3 here. /* max increase of Escape_Freq is 3 here.
total increase of Union2.SummFreq for all symbols is less than 256 here */ total increase of Union2.SummFreq for all symbols is less than 256 here */
sum += (UInt32)(2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1)); sum += (UInt32)(unsigned)((2 * ns1 < ns) + 2 * ((unsigned)(4 * ns1 <= ns) & (sum <= 8 * ns1)));
/* original PPMdH uses 16-bit variable for (sum) here. /* original PPMdH uses 16-bit variable for (sum) here.
But (sum < 0x9000). So we don't truncate (sum) to 16-bit */ But (sum < 0x9000). So we don't truncate (sum) to 16-bit */
// sum = (UInt16)sum; // sum = (UInt16)sum;
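An added back-of-the-envelope check of that claim, using the bounds stated in the notes at the end of this file:

  Sum(Stats[].Freq) <= 256 * MAX_FREQ = 256 * 124 = 31744 = 0x7C00
  sum <= 256 * (124 + 7) = 33536 = 0x8300 < 0x9000   (worst-case per-symbol Escape_Freq reserve of 7)

so keeping (sum) in a 32-bit variable without 16-bit truncation is safe.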
@ -739,10 +748,10 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
else else
{ {
// instead of One-symbol context we create 2-symbol context // instead of One-symbol context we create 2-symbol context
CPpmd_State *s = (CPpmd_State*)AllocUnits(p, 0); CPpmd_State *s = (CPpmd_State*)Ppmd7_AllocUnits(p, 0);
if (!s) if (!s)
{ {
RestartModel(p); Ppmd7_RestartModel(p);
return; return;
} }
{ {
@ -761,7 +770,7 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
// (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context // (max(s->freq) == 120), when we convert from 1-symbol into 2-symbol context
s->Freq = (Byte)freq; s->Freq = (Byte)freq;
// max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here // max(InitEsc = PPMD7_kExpEscape[*]) is 25. So the max(escapeFreq) is 26 here
sum = freq + p->InitEsc + (ns > 3); sum = (UInt32)(freq + p->InitEsc + (ns > 3));
} }
} }
@ -795,8 +804,8 @@ void Ppmd7_UpdateModel(CPpmd7 *p)
MY_NO_INLINE Z7_NO_INLINE
static void Rescale(CPpmd7 *p) static void Ppmd7_Rescale(CPpmd7 *p)
{ {
unsigned i, adder, sumFreq, escFreq; unsigned i, adder, sumFreq, escFreq;
CPpmd_State *stats = STATS(p->MinContext); CPpmd_State *stats = STATS(p->MinContext);
@ -885,7 +894,7 @@ static void Rescale(CPpmd7 *p)
*s = *stats; *s = *stats;
s->Freq = (Byte)freq; // (freq <= 260 / 4) s->Freq = (Byte)freq; // (freq <= 260 / 4)
p->FoundState = s; p->FoundState = s;
InsertNode(p, stats, U2I(n0)); Ppmd7_InsertNode(p, stats, U2I(n0));
return; return;
} }
@ -899,13 +908,13 @@ static void Rescale(CPpmd7 *p)
{ {
if (p->FreeList[i1] != 0) if (p->FreeList[i1] != 0)
{ {
void *ptr = RemoveNode(p, i1); void *ptr = Ppmd7_RemoveNode(p, i1);
p->MinContext->Union4.Stats = STATS_REF(ptr); p->MinContext->Union4.Stats = STATS_REF(ptr);
MyMem12Cpy(ptr, (const void *)stats, n1); MEM_12_CPY(ptr, (const void *)stats, n1)
InsertNode(p, stats, i0); Ppmd7_InsertNode(p, stats, i0);
} }
else else
SplitBlock(p, stats, i0, i1); Ppmd7_SplitBlock(p, stats, i0, i1);
} }
} }
} }
@ -933,10 +942,10 @@ CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq)
p->HiBitsFlag; p->HiBitsFlag;
{ {
// if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ
unsigned summ = (UInt16)see->Summ; // & 0xFFFF const unsigned summ = (UInt16)see->Summ; // & 0xFFFF
unsigned r = (summ >> see->Shift); const unsigned r = (summ >> see->Shift);
see->Summ = (UInt16)(summ - r); see->Summ = (UInt16)(summ - r);
*escFreq = r + (r == 0); *escFreq = (UInt32)(r + (r == 0));
} }
} }
else else
@ -948,9 +957,9 @@ CPpmd_See *Ppmd7_MakeEscFreq(CPpmd7 *p, unsigned numMasked, UInt32 *escFreq)
} }
static void NextContext(CPpmd7 *p) static void Ppmd7_NextContext(CPpmd7 *p)
{ {
CTX_PTR c = CTX(SUCCESSOR(p->FoundState)); PPMD7_CTX_PTR c = CTX(SUCCESSOR(p->FoundState));
if (p->OrderFall == 0 && (const Byte *)c > p->Text) if (p->OrderFall == 0 && (const Byte *)c > p->Text)
p->MaxContext = p->MinContext = c; p->MaxContext = p->MinContext = c;
else else
@ -967,12 +976,12 @@ void Ppmd7_Update1(CPpmd7 *p)
s->Freq = (Byte)freq; s->Freq = (Byte)freq;
if (freq > s[-1].Freq) if (freq > s[-1].Freq)
{ {
SwapStates(s); SWAP_STATES(s)
p->FoundState = --s; p->FoundState = --s;
if (freq > MAX_FREQ) if (freq > MAX_FREQ)
Rescale(p); Ppmd7_Rescale(p);
} }
NextContext(p); Ppmd7_NextContext(p);
} }
@ -981,15 +990,15 @@ void Ppmd7_Update1_0(CPpmd7 *p)
CPpmd_State *s = p->FoundState; CPpmd_State *s = p->FoundState;
CPpmd7_Context *mc = p->MinContext; CPpmd7_Context *mc = p->MinContext;
unsigned freq = s->Freq; unsigned freq = s->Freq;
unsigned summFreq = mc->Union2.SummFreq; const unsigned summFreq = mc->Union2.SummFreq;
p->PrevSuccess = (2 * freq > summFreq); p->PrevSuccess = (2 * freq > summFreq);
p->RunLength += (int)p->PrevSuccess; p->RunLength += (Int32)p->PrevSuccess;
mc->Union2.SummFreq = (UInt16)(summFreq + 4); mc->Union2.SummFreq = (UInt16)(summFreq + 4);
freq += 4; freq += 4;
s->Freq = (Byte)freq; s->Freq = (Byte)freq;
if (freq > MAX_FREQ) if (freq > MAX_FREQ)
Rescale(p); Ppmd7_Rescale(p);
NextContext(p); Ppmd7_NextContext(p);
} }
@ -1000,7 +1009,7 @@ void Ppmd7_UpdateBin(CPpmd7 *p)
p->FoundState->Freq = (Byte)(freq + (freq < 128)); p->FoundState->Freq = (Byte)(freq + (freq < 128));
p->PrevSuccess = 1; p->PrevSuccess = 1;
p->RunLength++; p->RunLength++;
NextContext(p); Ppmd7_NextContext(p);
} }
*/ */
@ -1013,7 +1022,7 @@ void Ppmd7_Update2(CPpmd7 *p)
p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4); p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4);
s->Freq = (Byte)freq; s->Freq = (Byte)freq;
if (freq > MAX_FREQ) if (freq > MAX_FREQ)
Rescale(p); Ppmd7_Rescale(p);
Ppmd7_UpdateModel(p); Ppmd7_UpdateModel(p);
} }
@ -1042,8 +1051,8 @@ Last UNIT of array at offset (Size - 12) is root order-0 CPpmd7_Context record.
The code can free UNITs memory blocks that were allocated to store CPpmd_State vectors. The code can free UNITs memory blocks that were allocated to store CPpmd_State vectors.
The code doesn't free UNITs allocated for CPpmd7_Context records. The code doesn't free UNITs allocated for CPpmd7_Context records.
The code calls RestartModel(), when there is no free memory for allocation. The code calls Ppmd7_RestartModel(), when there is no free memory for allocation.
And RestartModel() changes the state to original start state, with full free block. And Ppmd7_RestartModel() changes the state to original start state, with full free block.
The code allocates UNITs with the following order: The code allocates UNITs with the following order:
@ -1051,14 +1060,14 @@ The code allocates UNITs with the following order:
Allocation of 1 UNIT for Context record Allocation of 1 UNIT for Context record
- from free space (HiUnit) down to (LoUnit) - from free space (HiUnit) down to (LoUnit)
- from FreeList[0] - from FreeList[0]
- AllocUnitsRare() - Ppmd7_AllocUnitsRare()
AllocUnits() for CPpmd_State vectors: Ppmd7_AllocUnits() for CPpmd_State vectors:
- from FreeList[i] - from FreeList[i]
- from free space (LoUnit) up to (HiUnit) - from free space (LoUnit) up to (HiUnit)
- AllocUnitsRare() - Ppmd7_AllocUnitsRare()
AllocUnitsRare() Ppmd7_AllocUnitsRare()
- if (GlueCount == 0) - if (GlueCount == 0)
{ Glue lists, GlueCount = 255, allocate from FreeList[i] } { Glue lists, GlueCount = 255, allocate from FreeList[i] }
- loop for all higher sized FreeList[...] lists - loop for all higher sized FreeList[...] lists
@ -1093,8 +1102,8 @@ The PPMd code tries to fulfill the condition:
We have (Sum(Stats[].Freq) <= 256 * 124), because of (MAX_FREQ = 124) We have (Sum(Stats[].Freq) <= 256 * 124), because of (MAX_FREQ = 124)
So (4 = 128 - 124) is average reserve for Escape_Freq for each symbol. So (4 = 128 - 124) is average reserve for Escape_Freq for each symbol.
If (CPpmd_State::Freq) is not aligned for 4, the reserve can be 5, 6 or 7. If (CPpmd_State::Freq) is not aligned for 4, the reserve can be 5, 6 or 7.
SummFreq and Escape_Freq can be changed in Rescale() and *Update*() functions. SummFreq and Escape_Freq can be changed in Ppmd7_Rescale() and *Update*() functions.
Rescale() can remove symbols only from max-order contexts. So Escape_Freq can increase after multiple calls of Rescale() for Ppmd7_Rescale() can remove symbols only from max-order contexts. So Escape_Freq can increase after multiple calls of Ppmd7_Rescale() for
max-order context. max-order context.
When the PPMd code still breaks (Total <= RC::Range) condition in range coder, When the PPMd code still breaks (Total <= RC::Range) condition in range coder,
@ -1102,3 +1111,21 @@ we have two ways to resolve that problem:
1) we can report error, if we want to keep compatibility with original PPMd code that has no fix for such cases. 1) we can report error, if we want to keep compatibility with original PPMd code that has no fix for such cases.
2) we can reduce (Total) value to (RC::Range) by reducing (Escape_Freq) part of (Total) value. 2) we can reduce (Total) value to (RC::Range) by reducing (Escape_Freq) part of (Total) value.
*/ */
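An added sketch condensing that allocation order (simplified helpers mirroring the file's static functions, not a verbatim excerpt):

/* 1 UNIT for a Context record: high area first, then FreeList[0],
   then the rare path that glues free blocks / splits larger ones */
static void *AllocContext_sketch(CPpmd7 *p)
{
  if (p->HiUnit != p->LoUnit)
    return (p->HiUnit -= UNIT_SIZE);
  if (p->FreeList[0] != 0)
    return Ppmd7_RemoveNode(p, 0);
  return Ppmd7_AllocUnitsRare(p, 0);
}

/* UNITs for CPpmd_State vectors: exact free list first, then the
   low area growing upward, then the same rare path */
static void *AllocUnits_sketch(CPpmd7 *p, unsigned indx)
{
  if (p->FreeList[indx] != 0)
    return Ppmd7_RemoveNode(p, indx);
  {
    const UInt32 numBytes = U2B(I2U(indx));
    if (numBytes <= (UInt32)(p->HiUnit - p->LoUnit))
    {
      Byte *lo = p->LoUnit;
      p->LoUnit = lo + numBytes;
      return lo;
    }
  }
  return Ppmd7_AllocUnitsRare(p, indx);
}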
#undef MAX_FREQ
#undef UNIT_SIZE
#undef U2B
#undef U2I
#undef I2U
#undef I2U_UInt16
#undef REF
#undef STATS_REF
#undef CTX
#undef STATS
#undef ONE_STATE
#undef SUFFIX
#undef NODE
#undef EMPTY_NODE
#undef MEM_12_CPY
#undef SUCCESSOR
#undef SWAP_STATES


@ -1,11 +1,11 @@
/* Ppmd7.h -- Ppmd7 (PPMdH) compression codec /* Ppmd7.h -- Ppmd7 (PPMdH) compression codec
2021-04-13 : Igor Pavlov : Public domain 2023-04-02 : Igor Pavlov : Public domain
This code is based on: This code is based on:
PPMd var.H (2001): Dmitry Shkarin : Public domain */ PPMd var.H (2001): Dmitry Shkarin : Public domain */
#ifndef __PPMD7_H #ifndef ZIP7_INC_PPMD7_H
#define __PPMD7_H #define ZIP7_INC_PPMD7_H
#include "Ppmd.h" #include "Ppmd.h"
@ -55,7 +55,7 @@ typedef struct
UInt32 Range; UInt32 Range;
UInt32 Code; UInt32 Code;
UInt32 Low; UInt32 Low;
IByteIn *Stream; IByteInPtr Stream;
} CPpmd7_RangeDec; } CPpmd7_RangeDec;
@ -66,7 +66,7 @@ typedef struct
// Byte _dummy_[3]; // Byte _dummy_[3];
UInt64 Low; UInt64 Low;
UInt64 CacheSize; UInt64 CacheSize;
IByteOut *Stream; IByteOutPtr Stream;
} CPpmd7z_RangeEnc; } CPpmd7z_RangeEnc;


@ -1,5 +1,5 @@
/* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder /* Ppmd7Dec.c -- Ppmd7z (PPMdH with 7z Range Coder) Decoder
2021-04-13 : Igor Pavlov : Public domain 2023-09-07 : Igor Pavlov : Public domain
This code is based on: This code is based on:
PPMd var.H (2001): Dmitry Shkarin : Public domain */ PPMd var.H (2001): Dmitry Shkarin : Public domain */
@ -8,7 +8,7 @@ This code is based on:
#include "Ppmd7.h" #include "Ppmd7.h"
#define kTopValue (1 << 24) #define kTopValue ((UInt32)1 << 24)
#define READ_BYTE(p) IByteIn_Read((p)->Stream) #define READ_BYTE(p) IByteIn_Read((p)->Stream)
@ -37,9 +37,9 @@ BoolInt Ppmd7z_RangeDec_Init(CPpmd7_RangeDec *p)
#define R (&p->rc.dec) #define R (&p->rc.dec)
MY_FORCE_INLINE Z7_FORCE_INLINE
// MY_NO_INLINE // Z7_NO_INLINE
static void RangeDec_Decode(CPpmd7 *p, UInt32 start, UInt32 size) static void Ppmd7z_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
{ {
@ -48,18 +48,18 @@ static void RangeDec_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
RC_NORM_LOCAL(R) RC_NORM_LOCAL(R)
} }
#define RC_Decode(start, size) RangeDec_Decode(p, start, size); #define RC_Decode(start, size) Ppmd7z_RD_Decode(p, start, size);
#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R) #define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R)
#define RC_GetThreshold(total) (R->Code / (R->Range /= (total))) #define RC_GetThreshold(total) (R->Code / (R->Range /= (total)))
#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref)) #define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref))
typedef CPpmd7_Context * CTX_PTR; // typedef CPpmd7_Context * CTX_PTR;
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
void Ppmd7_UpdateModel(CPpmd7 *p); void Ppmd7_UpdateModel(CPpmd7 *p);
#define MASK(sym) ((unsigned char *)charMask)[sym] #define MASK(sym) ((Byte *)charMask)[sym]
// MY_FORCE_INLINE // Z7_FORCE_INLINE
// static // static
int Ppmd7z_DecodeSymbol(CPpmd7 *p) int Ppmd7z_DecodeSymbol(CPpmd7 *p)
{ {
@ -70,7 +70,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext); CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext);
unsigned i; unsigned i;
UInt32 count, hiCnt; UInt32 count, hiCnt;
UInt32 summFreq = p->MinContext->Union2.SummFreq; const UInt32 summFreq = p->MinContext->Union2.SummFreq;
@ -81,7 +81,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
if ((Int32)(count -= s->Freq) < 0) if ((Int32)(count -= s->Freq) < 0)
{ {
Byte sym; Byte sym;
RC_DecodeFinal(0, s->Freq); RC_DecodeFinal(0, s->Freq)
p->FoundState = s; p->FoundState = s;
sym = s->Symbol; sym = s->Symbol;
Ppmd7_Update1_0(p); Ppmd7_Update1_0(p);
@ -96,7 +96,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
if ((Int32)(count -= (++s)->Freq) < 0) if ((Int32)(count -= (++s)->Freq) < 0)
{ {
Byte sym; Byte sym;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq); RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
p->FoundState = s; p->FoundState = s;
sym = s->Symbol; sym = s->Symbol;
Ppmd7_Update1(p); Ppmd7_Update1(p);
@ -109,10 +109,10 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
return PPMD7_SYM_ERROR; return PPMD7_SYM_ERROR;
hiCnt -= count; hiCnt -= count;
RC_Decode(hiCnt, summFreq - hiCnt); RC_Decode(hiCnt, summFreq - hiCnt)
p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol); p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol);
PPMD_SetAllBitsIn256Bytes(charMask); PPMD_SetAllBitsIn256Bytes(charMask)
// i = p->MinContext->NumStats - 1; // i = p->MinContext->NumStats - 1;
// do { MASK((--s)->Symbol) = 0; } while (--i); // do { MASK((--s)->Symbol) = 0; } while (--i);
{ {
@ -120,8 +120,8 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
MASK(s->Symbol) = 0; MASK(s->Symbol) = 0;
do do
{ {
unsigned sym0 = s2[0].Symbol; const unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol; const unsigned sym1 = s2[1].Symbol;
s2 += 2; s2 += 2;
MASK(sym0) = 0; MASK(sym0) = 0;
MASK(sym1) = 0; MASK(sym1) = 0;
@ -152,7 +152,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
// Ppmd7_UpdateBin(p); // Ppmd7_UpdateBin(p);
{ {
unsigned freq = s->Freq; unsigned freq = s->Freq;
CTX_PTR c = CTX(SUCCESSOR(s)); CPpmd7_Context *c = CTX(SUCCESSOR(s));
sym = s->Symbol; sym = s->Symbol;
p->FoundState = s; p->FoundState = s;
p->PrevSuccess = 1; p->PrevSuccess = 1;
@ -176,7 +176,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
R->Range -= size0; R->Range -= size0;
RC_NORM_LOCAL(R) RC_NORM_LOCAL(R)
PPMD_SetAllBitsIn256Bytes(charMask); PPMD_SetAllBitsIn256Bytes(charMask)
MASK(Ppmd7Context_OneState(p->MinContext)->Symbol) = 0; MASK(Ppmd7Context_OneState(p->MinContext)->Symbol) = 0;
p->PrevSuccess = 0; p->PrevSuccess = 0;
} }
@ -209,17 +209,17 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
unsigned num2 = num / 2; unsigned num2 = num / 2;
num &= 1; num &= 1;
hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num);
s += num; s += num;
p->MinContext = mc; p->MinContext = mc;
do do
{ {
unsigned sym0 = s[0].Symbol; const unsigned sym0 = s[0].Symbol;
unsigned sym1 = s[1].Symbol; const unsigned sym1 = s[1].Symbol;
s += 2; s += 2;
hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0)));
hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1)));
} }
while (--num2); while (--num2);
} }
@ -238,20 +238,20 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
s = Ppmd7_GetStats(p, p->MinContext); s = Ppmd7_GetStats(p, p->MinContext);
hiCnt = count; hiCnt = count;
// count -= s->Freq & (unsigned)(MASK(s->Symbol)); // count -= s->Freq & (UInt32)(MASK(s->Symbol));
// if ((Int32)count >= 0) // if ((Int32)count >= 0)
{ {
for (;;) for (;;)
{ {
count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
// count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
}; }
} }
s--; s--;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq); RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
// new (see->Summ) value can overflow over 16-bits in some rare cases // new (see->Summ) value can overflow over 16-bits in some rare cases
Ppmd_See_Update(see); Ppmd_See_UPDATE(see)
p->FoundState = s; p->FoundState = s;
sym = s->Symbol; sym = s->Symbol;
Ppmd7_Update2(p); Ppmd7_Update2(p);
@ -261,7 +261,7 @@ int Ppmd7z_DecodeSymbol(CPpmd7 *p)
if (count >= freqSum) if (count >= freqSum)
return PPMD7_SYM_ERROR; return PPMD7_SYM_ERROR;
RC_Decode(hiCnt, freqSum - hiCnt); RC_Decode(hiCnt, freqSum - hiCnt)
// We increase (see->Summ) for sum of Freqs of all non_Masked symbols. // We increase (see->Summ) for sum of Freqs of all non_Masked symbols.
// new (see->Summ) value can overflow over 16-bits in some rare cases // new (see->Summ) value can overflow over 16-bits in some rare cases
@ -295,3 +295,18 @@ Byte *Ppmd7z_DecodeSymbols(CPpmd7 *p, Byte *buf, const Byte *lim)
return buf; return buf;
} }
*/ */
#undef kTopValue
#undef READ_BYTE
#undef RC_NORM_BASE
#undef RC_NORM_1
#undef RC_NORM
#undef RC_NORM_LOCAL
#undef RC_NORM_REMOTE
#undef R
#undef RC_Decode
#undef RC_DecodeFinal
#undef RC_GetThreshold
#undef CTX
#undef SUCCESSOR
#undef MASK


@ -1,5 +1,5 @@
/* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder /* Ppmd7Enc.c -- Ppmd7z (PPMdH with 7z Range Coder) Encoder
2021-04-13 : Igor Pavlov : Public domain 2023-09-07 : Igor Pavlov : Public domain
This code is based on: This code is based on:
PPMd var.H (2001): Dmitry Shkarin : Public domain */ PPMd var.H (2001): Dmitry Shkarin : Public domain */
@ -8,7 +8,7 @@ This code is based on:
#include "Ppmd7.h" #include "Ppmd7.h"
#define kTopValue (1 << 24) #define kTopValue ((UInt32)1 << 24)
#define R (&p->rc.enc) #define R (&p->rc.enc)
@ -20,8 +20,8 @@ void Ppmd7z_Init_RangeEnc(CPpmd7 *p)
R->CacheSize = 1; R->CacheSize = 1;
} }
MY_NO_INLINE Z7_NO_INLINE
static void RangeEnc_ShiftLow(CPpmd7 *p) static void Ppmd7z_RangeEnc_ShiftLow(CPpmd7 *p)
{ {
if ((UInt32)R->Low < (UInt32)0xFF000000 || (unsigned)(R->Low >> 32) != 0) if ((UInt32)R->Low < (UInt32)0xFF000000 || (unsigned)(R->Low >> 32) != 0)
{ {
@ -38,53 +38,53 @@ static void RangeEnc_ShiftLow(CPpmd7 *p)
R->Low = (UInt32)((UInt32)R->Low << 8); R->Low = (UInt32)((UInt32)R->Low << 8);
} }
#define RC_NORM_BASE(p) if (R->Range < kTopValue) { R->Range <<= 8; RangeEnc_ShiftLow(p); #define RC_NORM_BASE(p) if (R->Range < kTopValue) { R->Range <<= 8; Ppmd7z_RangeEnc_ShiftLow(p);
#define RC_NORM_1(p) RC_NORM_BASE(p) } #define RC_NORM_1(p) RC_NORM_BASE(p) }
#define RC_NORM(p) RC_NORM_BASE(p) RC_NORM_BASE(p) }} #define RC_NORM(p) RC_NORM_BASE(p) RC_NORM_BASE(p) }}
// we must use only one type of Normalization from two: LOCAL or REMOTE // we must use only one type of Normalization from two: LOCAL or REMOTE
#define RC_NORM_LOCAL(p) // RC_NORM(p) #define RC_NORM_LOCAL(p) // RC_NORM(p)
#define RC_NORM_REMOTE(p) RC_NORM(p) #define RC_NORM_REMOTE(p) RC_NORM(p)
/* /*
#define RangeEnc_Encode(p, start, _size_) \ #define Ppmd7z_RangeEnc_Encode(p, start, _size_) \
{ UInt32 size = _size_; \ { UInt32 size = _size_; \
R->Low += start * R->Range; \ R->Low += start * R->Range; \
R->Range *= size; \ R->Range *= size; \
RC_NORM_LOCAL(p); } RC_NORM_LOCAL(p); }
*/ */
MY_FORCE_INLINE Z7_FORCE_INLINE
// MY_NO_INLINE // Z7_NO_INLINE
static void RangeEnc_Encode(CPpmd7 *p, UInt32 start, UInt32 size) static void Ppmd7z_RangeEnc_Encode(CPpmd7 *p, UInt32 start, UInt32 size)
{ {
R->Low += start * R->Range; R->Low += start * R->Range;
R->Range *= size; R->Range *= size;
RC_NORM_LOCAL(p); RC_NORM_LOCAL(p)
} }
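An added note on the LOCAL/REMOTE remark above: the interval update must be normalized exactly once, either inside the encode routine (local) or at the caller after the final encode (remote), never both. A self-contained model of the carry-aware step, with hypothetical stand-alone names (the real coder emits bytes through Ppmd7z_RangeEnc_ShiftLow() and detects a pending carry via (Low >> 32) != 0):

#include <stdint.h>

#define K_TOP ((uint32_t)1 << 24)   /* stand-in for kTopValue */

typedef struct { uint64_t low; uint32_t range; } RangeEncModel;

/* encode the cumulative interval [start, start + size); the caller has
   already divided range by the total frequency, so start * range fits.
   Normalization keeps range >= 2^24; byte output and carry folding,
   done by ShiftLow() in the real coder, are elided here. */
static void RangeEncModel_Encode(RangeEncModel *rc, uint32_t start, uint32_t size)
{
  rc->low += (uint64_t)start * rc->range;
  rc->range *= size;
  while (rc->range < K_TOP)
  {
    rc->range <<= 8;
    rc->low = (uint32_t)((uint32_t)rc->low << 8);  /* ShiftLow() would emit a byte first */
  }
}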
void Ppmd7z_Flush_RangeEnc(CPpmd7 *p) void Ppmd7z_Flush_RangeEnc(CPpmd7 *p)
{ {
unsigned i; unsigned i;
for (i = 0; i < 5; i++) for (i = 0; i < 5; i++)
RangeEnc_ShiftLow(p); Ppmd7z_RangeEnc_ShiftLow(p);
} }
#define RC_Encode(start, size) RangeEnc_Encode(p, start, size); #define RC_Encode(start, size) Ppmd7z_RangeEnc_Encode(p, start, size);
#define RC_EncodeFinal(start, size) RC_Encode(start, size); RC_NORM_REMOTE(p); #define RC_EncodeFinal(start, size) RC_Encode(start, size) RC_NORM_REMOTE(p)
#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref)) #define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref))
#define SUFFIX(ctx) CTX((ctx)->Suffix) #define SUFFIX(ctx) CTX((ctx)->Suffix)
typedef CPpmd7_Context * CTX_PTR; // typedef CPpmd7_Context * CTX_PTR;
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
void Ppmd7_UpdateModel(CPpmd7 *p); void Ppmd7_UpdateModel(CPpmd7 *p);
#define MASK(sym) ((unsigned char *)charMask)[sym] #define MASK(sym) ((Byte *)charMask)[sym]
MY_FORCE_INLINE Z7_FORCE_INLINE
static static
void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol) void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
{ {
@ -104,7 +104,7 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
if (s->Symbol == symbol) if (s->Symbol == symbol)
{ {
// R->Range /= p->MinContext->Union2.SummFreq; // R->Range /= p->MinContext->Union2.SummFreq;
RC_EncodeFinal(0, s->Freq); RC_EncodeFinal(0, s->Freq)
p->FoundState = s; p->FoundState = s;
Ppmd7_Update1_0(p); Ppmd7_Update1_0(p);
return; return;
@ -117,7 +117,7 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
if ((++s)->Symbol == symbol) if ((++s)->Symbol == symbol)
{ {
// R->Range /= p->MinContext->Union2.SummFreq; // R->Range /= p->MinContext->Union2.SummFreq;
RC_EncodeFinal(sum, s->Freq); RC_EncodeFinal(sum, s->Freq)
p->FoundState = s; p->FoundState = s;
Ppmd7_Update1(p); Ppmd7_Update1(p);
return; return;
@ -127,10 +127,10 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
while (--i); while (--i);
// R->Range /= p->MinContext->Union2.SummFreq; // R->Range /= p->MinContext->Union2.SummFreq;
RC_Encode(sum, p->MinContext->Union2.SummFreq - sum); RC_Encode(sum, p->MinContext->Union2.SummFreq - sum)
p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol); p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol);
PPMD_SetAllBitsIn256Bytes(charMask); PPMD_SetAllBitsIn256Bytes(charMask)
// MASK(s->Symbol) = 0; // MASK(s->Symbol) = 0;
// i = p->MinContext->NumStats - 1; // i = p->MinContext->NumStats - 1;
// do { MASK((--s)->Symbol) = 0; } while (--i); // do { MASK((--s)->Symbol) = 0; } while (--i);
@ -139,8 +139,8 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
MASK(s->Symbol) = 0; MASK(s->Symbol) = 0;
do do
{ {
unsigned sym0 = s2[0].Symbol; const unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol; const unsigned sym1 = s2[1].Symbol;
s2 += 2; s2 += 2;
MASK(sym0) = 0; MASK(sym0) = 0;
MASK(sym1) = 0; MASK(sym1) = 0;
@ -153,20 +153,20 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
UInt16 *prob = Ppmd7_GetBinSumm(p); UInt16 *prob = Ppmd7_GetBinSumm(p);
CPpmd_State *s = Ppmd7Context_OneState(p->MinContext); CPpmd_State *s = Ppmd7Context_OneState(p->MinContext);
UInt32 pr = *prob; UInt32 pr = *prob;
UInt32 bound = (R->Range >> 14) * pr; const UInt32 bound = (R->Range >> 14) * pr;
pr = PPMD_UPDATE_PROB_1(pr); pr = PPMD_UPDATE_PROB_1(pr);
if (s->Symbol == symbol) if (s->Symbol == symbol)
{ {
*prob = (UInt16)(pr + (1 << PPMD_INT_BITS)); *prob = (UInt16)(pr + (1 << PPMD_INT_BITS));
// RangeEnc_EncodeBit_0(p, bound); // RangeEnc_EncodeBit_0(p, bound);
R->Range = bound; R->Range = bound;
RC_NORM_1(p); RC_NORM_1(p)
// p->FoundState = s; // p->FoundState = s;
// Ppmd7_UpdateBin(p); // Ppmd7_UpdateBin(p);
{ {
unsigned freq = s->Freq; const unsigned freq = s->Freq;
CTX_PTR c = CTX(SUCCESSOR(s)); CPpmd7_Context *c = CTX(SUCCESSOR(s));
p->FoundState = s; p->FoundState = s;
p->PrevSuccess = 1; p->PrevSuccess = 1;
p->RunLength++; p->RunLength++;
@ -187,7 +187,7 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
R->Range -= bound; R->Range -= bound;
RC_NORM_LOCAL(p) RC_NORM_LOCAL(p)
PPMD_SetAllBitsIn256Bytes(charMask); PPMD_SetAllBitsIn256Bytes(charMask)
MASK(s->Symbol) = 0; MASK(s->Symbol) = 0;
p->PrevSuccess = 0; p->PrevSuccess = 0;
} }
@ -248,14 +248,14 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
do do
{ {
unsigned cur = s->Symbol; const unsigned cur = s->Symbol;
if ((int)cur == symbol) if ((int)cur == symbol)
{ {
UInt32 low = sum; const UInt32 low = sum;
UInt32 freq = s->Freq; const UInt32 freq = s->Freq;
unsigned num2; unsigned num2;
Ppmd_See_Update(see); Ppmd_See_UPDATE(see)
p->FoundState = s; p->FoundState = s;
sum += escFreq; sum += escFreq;
@ -265,21 +265,20 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
if (num2 != 0) if (num2 != 0)
{ {
s += i; s += i;
for (;;) do
{ {
unsigned sym0 = s[0].Symbol; const unsigned sym0 = s[0].Symbol;
unsigned sym1 = s[1].Symbol; const unsigned sym1 = s[1].Symbol;
s += 2; s += 2;
sum += (s[-2].Freq & (unsigned)(MASK(sym0))); sum += (s[-2].Freq & (unsigned)(MASK(sym0)));
sum += (s[-1].Freq & (unsigned)(MASK(sym1))); sum += (s[-1].Freq & (unsigned)(MASK(sym1)));
if (--num2 == 0)
break;
} }
while (--num2);
} }
R->Range /= sum; R->Range /= sum;
RC_EncodeFinal(low, freq); RC_EncodeFinal(low, freq)
Ppmd7_Update2(p); Ppmd7_Update2(p);
return; return;
} }
@ -289,21 +288,21 @@ void Ppmd7z_EncodeSymbol(CPpmd7 *p, int symbol)
while (--i); while (--i);
{ {
UInt32 total = sum + escFreq; const UInt32 total = sum + escFreq;
see->Summ = (UInt16)(see->Summ + total); see->Summ = (UInt16)(see->Summ + total);
R->Range /= total; R->Range /= total;
RC_Encode(sum, escFreq); RC_Encode(sum, escFreq)
} }
{ {
CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext); const CPpmd_State *s2 = Ppmd7_GetStats(p, p->MinContext);
s--; s--;
MASK(s->Symbol) = 0; MASK(s->Symbol) = 0;
do do
{ {
unsigned sym0 = s2[0].Symbol; const unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol; const unsigned sym1 = s2[1].Symbol;
s2 += 2; s2 += 2;
MASK(sym0) = 0; MASK(sym0) = 0;
MASK(sym1) = 0; MASK(sym1) = 0;
@ -321,3 +320,18 @@ void Ppmd7z_EncodeSymbols(CPpmd7 *p, const Byte *buf, const Byte *lim)
Ppmd7z_EncodeSymbol(p, *buf); Ppmd7z_EncodeSymbol(p, *buf);
} }
} }
#undef kTopValue
#undef WRITE_BYTE
#undef RC_NORM_BASE
#undef RC_NORM_1
#undef RC_NORM
#undef RC_NORM_LOCAL
#undef RC_NORM_REMOTE
#undef R
#undef RC_Encode
#undef RC_EncodeFinal
#undef SUFFIX
#undef CTX
#undef SUCCESSOR
#undef MASK


@ -1,5 +1,5 @@
/* Ppmd7aDec.c -- PPMd7a (PPMdH) Decoder /* Ppmd7aDec.c -- PPMd7a (PPMdH) Decoder
2021-04-13 : Igor Pavlov : Public domain 2023-09-07 : Igor Pavlov : Public domain
This code is based on: This code is based on:
PPMd var.H (2001): Dmitry Shkarin : Public domain PPMd var.H (2001): Dmitry Shkarin : Public domain
Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
@ -8,8 +8,8 @@ This code is based on:
#include "Ppmd7.h" #include "Ppmd7.h"
#define kTop (1 << 24) #define kTop ((UInt32)1 << 24)
#define kBot (1 << 15) #define kBot ((UInt32)1 << 15)
#define READ_BYTE(p) IByteIn_Read((p)->Stream) #define READ_BYTE(p) IByteIn_Read((p)->Stream)
@ -37,9 +37,9 @@ BoolInt Ppmd7a_RangeDec_Init(CPpmd7_RangeDec *p)
#define R (&p->rc.dec) #define R (&p->rc.dec)
MY_FORCE_INLINE Z7_FORCE_INLINE
// MY_NO_INLINE // Z7_NO_INLINE
static void RangeDec_Decode(CPpmd7 *p, UInt32 start, UInt32 size) static void Ppmd7a_RD_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
{ {
start *= R->Range; start *= R->Range;
R->Low += start; R->Low += start;
@ -48,9 +48,9 @@ static void RangeDec_Decode(CPpmd7 *p, UInt32 start, UInt32 size)
RC_NORM_LOCAL(R) RC_NORM_LOCAL(R)
} }
#define RC_Decode(start, size) RangeDec_Decode(p, start, size); #define RC_Decode(start, size) Ppmd7a_RD_Decode(p, start, size);
#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R) #define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R)
#define RC_GetThreshold(total) (R->Code / (R->Range /= (total))) #define RC_GetThreshold(total) (R->Code / (R->Range /= (total)))
#define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref)) #define CTX(ref) ((CPpmd7_Context *)Ppmd7_GetContext(p, ref))
@ -58,7 +58,7 @@ typedef CPpmd7_Context * CTX_PTR;
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
void Ppmd7_UpdateModel(CPpmd7 *p); void Ppmd7_UpdateModel(CPpmd7 *p);
#define MASK(sym) ((unsigned char *)charMask)[sym] #define MASK(sym) ((Byte *)charMask)[sym]
int Ppmd7a_DecodeSymbol(CPpmd7 *p) int Ppmd7a_DecodeSymbol(CPpmd7 *p)
@ -70,7 +70,7 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext); CPpmd_State *s = Ppmd7_GetStats(p, p->MinContext);
unsigned i; unsigned i;
UInt32 count, hiCnt; UInt32 count, hiCnt;
UInt32 summFreq = p->MinContext->Union2.SummFreq; const UInt32 summFreq = p->MinContext->Union2.SummFreq;
if (summFreq > R->Range) if (summFreq > R->Range)
return PPMD7_SYM_ERROR; return PPMD7_SYM_ERROR;
@ -81,7 +81,7 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
if ((Int32)(count -= s->Freq) < 0) if ((Int32)(count -= s->Freq) < 0)
{ {
Byte sym; Byte sym;
RC_DecodeFinal(0, s->Freq); RC_DecodeFinal(0, s->Freq)
p->FoundState = s; p->FoundState = s;
sym = s->Symbol; sym = s->Symbol;
Ppmd7_Update1_0(p); Ppmd7_Update1_0(p);
@ -96,7 +96,7 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
if ((Int32)(count -= (++s)->Freq) < 0) if ((Int32)(count -= (++s)->Freq) < 0)
{ {
Byte sym; Byte sym;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq); RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
p->FoundState = s; p->FoundState = s;
sym = s->Symbol; sym = s->Symbol;
Ppmd7_Update1(p); Ppmd7_Update1(p);
@ -109,10 +109,10 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
return PPMD7_SYM_ERROR; return PPMD7_SYM_ERROR;
hiCnt -= count; hiCnt -= count;
RC_Decode(hiCnt, summFreq - hiCnt); RC_Decode(hiCnt, summFreq - hiCnt)
p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol); p->HiBitsFlag = PPMD7_HiBitsFlag_3(p->FoundState->Symbol);
PPMD_SetAllBitsIn256Bytes(charMask); PPMD_SetAllBitsIn256Bytes(charMask)
// i = p->MinContext->NumStats - 1; // i = p->MinContext->NumStats - 1;
// do { MASK((--s)->Symbol) = 0; } while (--i); // do { MASK((--s)->Symbol) = 0; } while (--i);
{ {
@ -120,8 +120,8 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
MASK(s->Symbol) = 0; MASK(s->Symbol) = 0;
do do
{ {
unsigned sym0 = s2[0].Symbol; const unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol; const unsigned sym1 = s2[1].Symbol;
s2 += 2; s2 += 2;
MASK(sym0) = 0; MASK(sym0) = 0;
MASK(sym1) = 0; MASK(sym1) = 0;
@ -176,7 +176,7 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
R->Range = (R->Range & ~((UInt32)PPMD_BIN_SCALE - 1)) - size0; R->Range = (R->Range & ~((UInt32)PPMD_BIN_SCALE - 1)) - size0;
RC_NORM_LOCAL(R) RC_NORM_LOCAL(R)
PPMD_SetAllBitsIn256Bytes(charMask); PPMD_SetAllBitsIn256Bytes(charMask)
MASK(Ppmd7Context_OneState(p->MinContext)->Symbol) = 0; MASK(Ppmd7Context_OneState(p->MinContext)->Symbol) = 0;
p->PrevSuccess = 0; p->PrevSuccess = 0;
} }
@ -209,17 +209,17 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
unsigned num2 = num / 2; unsigned num2 = num / 2;
num &= 1; num &= 1;
hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num);
s += num; s += num;
p->MinContext = mc; p->MinContext = mc;
do do
{ {
unsigned sym0 = s[0].Symbol; const unsigned sym0 = s[0].Symbol;
unsigned sym1 = s[1].Symbol; const unsigned sym1 = s[1].Symbol;
s += 2; s += 2;
hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0)));
hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1)));
} }
while (--num2); while (--num2);
} }
@ -238,20 +238,20 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
s = Ppmd7_GetStats(p, p->MinContext); s = Ppmd7_GetStats(p, p->MinContext);
hiCnt = count; hiCnt = count;
// count -= s->Freq & (unsigned)(MASK(s->Symbol)); // count -= s->Freq & (UInt32)(MASK(s->Symbol));
// if ((Int32)count >= 0) // if ((Int32)count >= 0)
{ {
for (;;) for (;;)
{ {
count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
// count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
}; }
} }
s--; s--;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq); RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
// new (see->Summ) value can overflow over 16-bits in some rare cases // new (see->Summ) value can overflow over 16-bits in some rare cases
Ppmd_See_Update(see); Ppmd_See_UPDATE(see)
p->FoundState = s; p->FoundState = s;
sym = s->Symbol; sym = s->Symbol;
Ppmd7_Update2(p); Ppmd7_Update2(p);
@ -261,7 +261,7 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
if (count >= freqSum) if (count >= freqSum)
return PPMD7_SYM_ERROR; return PPMD7_SYM_ERROR;
RC_Decode(hiCnt, freqSum - hiCnt); RC_Decode(hiCnt, freqSum - hiCnt)
// We increase (see->Summ) for sum of Freqs of all non_Masked symbols. // We increase (see->Summ) for sum of Freqs of all non_Masked symbols.
// new (see->Summ) value can overflow over 16-bits in some rare cases // new (see->Summ) value can overflow over 16-bits in some rare cases
@ -277,3 +277,19 @@ int Ppmd7a_DecodeSymbol(CPpmd7 *p)
while (s != s2); while (s != s2);
} }
} }
#undef kTop
#undef kBot
#undef READ_BYTE
#undef RC_NORM_BASE
#undef RC_NORM_1
#undef RC_NORM
#undef RC_NORM_LOCAL
#undef RC_NORM_REMOTE
#undef R
#undef RC_Decode
#undef RC_DecodeFinal
#undef RC_GetThreshold
#undef CTX
#undef SUCCESSOR
#undef MASK

279 C/Ppmd8.c

@ -1,5 +1,5 @@
/* Ppmd8.c -- PPMdI codec /* Ppmd8.c -- PPMdI codec
2021-04-13 : Igor Pavlov : Public domain 2023-09-07 : Igor Pavlov : Public domain
This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */ This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */
#include "Precomp.h" #include "Precomp.h"
@ -14,7 +14,7 @@ This code is based on PPMd var.I (2002): Dmitry Shkarin : Public domain */
MY_ALIGN(16) MY_ALIGN(16)
static const Byte PPMD8_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 }; static const Byte PPMD8_kExpEscape[16] = { 25, 14, 9, 7, 5, 5, 4, 4, 4, 3, 3, 3, 2, 2, 2, 2 };
MY_ALIGN(16) MY_ALIGN(16)
static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051}; static const UInt16 PPMD8_kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x5ABC, 0x6632, 0x6051};
#define MAX_FREQ 124 #define MAX_FREQ 124
#define UNIT_SIZE 12 #define UNIT_SIZE 12
@ -33,7 +33,7 @@ static const UInt16 kInitBinEsc[] = { 0x3CDD, 0x1F3F, 0x59BF, 0x48F3, 0x64A1, 0x
#define ONE_STATE(ctx) Ppmd8Context_OneState(ctx) #define ONE_STATE(ctx) Ppmd8Context_OneState(ctx)
#define SUFFIX(ctx) CTX((ctx)->Suffix) #define SUFFIX(ctx) CTX((ctx)->Suffix)
typedef CPpmd8_Context * CTX_PTR; typedef CPpmd8_Context * PPMD8_CTX_PTR;
struct CPpmd8_Node_; struct CPpmd8_Node_;
@ -114,7 +114,7 @@ BoolInt Ppmd8_Alloc(CPpmd8 *p, UInt32 size, ISzAllocPtr alloc)
#define EMPTY_NODE 0xFFFFFFFF #define EMPTY_NODE 0xFFFFFFFF
static void InsertNode(CPpmd8 *p, void *node, unsigned indx) static void Ppmd8_InsertNode(CPpmd8 *p, void *node, unsigned indx)
{ {
((CPpmd8_Node *)node)->Stamp = EMPTY_NODE; ((CPpmd8_Node *)node)->Stamp = EMPTY_NODE;
((CPpmd8_Node *)node)->Next = (CPpmd8_Node_Ref)p->FreeList[indx]; ((CPpmd8_Node *)node)->Next = (CPpmd8_Node_Ref)p->FreeList[indx];
@ -124,7 +124,7 @@ static void InsertNode(CPpmd8 *p, void *node, unsigned indx)
} }
static void *RemoveNode(CPpmd8 *p, unsigned indx) static void *Ppmd8_RemoveNode(CPpmd8 *p, unsigned indx)
{ {
CPpmd8_Node *node = NODE((CPpmd8_Node_Ref)p->FreeList[indx]); CPpmd8_Node *node = NODE((CPpmd8_Node_Ref)p->FreeList[indx]);
p->FreeList[indx] = node->Next; p->FreeList[indx] = node->Next;
@ -134,16 +134,16 @@ static void *RemoveNode(CPpmd8 *p, unsigned indx)
} }
static void SplitBlock(CPpmd8 *p, void *ptr, unsigned oldIndx, unsigned newIndx) static void Ppmd8_SplitBlock(CPpmd8 *p, void *ptr, unsigned oldIndx, unsigned newIndx)
{ {
unsigned i, nu = I2U(oldIndx) - I2U(newIndx); unsigned i, nu = I2U(oldIndx) - I2U(newIndx);
ptr = (Byte *)ptr + U2B(I2U(newIndx)); ptr = (Byte *)ptr + U2B(I2U(newIndx));
if (I2U(i = U2I(nu)) != nu) if (I2U(i = U2I(nu)) != nu)
{ {
unsigned k = I2U(--i); unsigned k = I2U(--i);
InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1); Ppmd8_InsertNode(p, ((Byte *)ptr) + U2B(k), nu - k - 1);
} }
InsertNode(p, ptr, i); Ppmd8_InsertNode(p, ptr, i);
} }
@ -159,7 +159,7 @@ static void SplitBlock(CPpmd8 *p, void *ptr, unsigned oldIndx, unsigned newIndx)
static void GlueFreeBlocks(CPpmd8 *p) static void Ppmd8_GlueFreeBlocks(CPpmd8 *p)
{ {
/* /*
we use first UInt32 field of 12-bytes UNITs as record type stamp we use first UInt32 field of 12-bytes UNITs as record type stamp
@ -239,27 +239,27 @@ static void GlueFreeBlocks(CPpmd8 *p)
if (nu == 0) if (nu == 0)
continue; continue;
for (; nu > 128; nu -= 128, node += 128) for (; nu > 128; nu -= 128, node += 128)
InsertNode(p, node, PPMD_NUM_INDEXES - 1); Ppmd8_InsertNode(p, node, PPMD_NUM_INDEXES - 1);
if (I2U(i = U2I(nu)) != nu) if (I2U(i = U2I(nu)) != nu)
{ {
unsigned k = I2U(--i); unsigned k = I2U(--i);
InsertNode(p, node + k, (unsigned)nu - k - 1); Ppmd8_InsertNode(p, node + k, (unsigned)nu - k - 1);
} }
InsertNode(p, node, i); Ppmd8_InsertNode(p, node, i);
} }
} }
MY_NO_INLINE Z7_NO_INLINE
static void *AllocUnitsRare(CPpmd8 *p, unsigned indx) static void *Ppmd8_AllocUnitsRare(CPpmd8 *p, unsigned indx)
{ {
unsigned i; unsigned i;
if (p->GlueCount == 0) if (p->GlueCount == 0)
{ {
GlueFreeBlocks(p); Ppmd8_GlueFreeBlocks(p);
if (p->FreeList[indx] != 0) if (p->FreeList[indx] != 0)
return RemoveNode(p, indx); return Ppmd8_RemoveNode(p, indx);
} }
i = indx; i = indx;
@ -277,17 +277,17 @@ static void *AllocUnitsRare(CPpmd8 *p, unsigned indx)
while (p->FreeList[i] == 0); while (p->FreeList[i] == 0);
{ {
void *block = RemoveNode(p, i); void *block = Ppmd8_RemoveNode(p, i);
SplitBlock(p, block, i, indx); Ppmd8_SplitBlock(p, block, i, indx);
return block; return block;
} }
} }
static void *AllocUnits(CPpmd8 *p, unsigned indx) static void *Ppmd8_AllocUnits(CPpmd8 *p, unsigned indx)
{ {
if (p->FreeList[indx] != 0) if (p->FreeList[indx] != 0)
return RemoveNode(p, indx); return Ppmd8_RemoveNode(p, indx);
{ {
UInt32 numBytes = U2B(I2U(indx)); UInt32 numBytes = U2B(I2U(indx));
Byte *lo = p->LoUnit; Byte *lo = p->LoUnit;
@ -297,13 +297,22 @@ static void *AllocUnits(CPpmd8 *p, unsigned indx)
return lo; return lo;
} }
} }
return AllocUnitsRare(p, indx); return Ppmd8_AllocUnitsRare(p, indx);
} }
#define MyMem12Cpy(dest, src, num) \ #define MEM_12_CPY(dest, src, num) \
{ UInt32 *d = (UInt32 *)dest; const UInt32 *z = (const UInt32 *)src; UInt32 n = num; \ { UInt32 *d = (UInt32 *)(dest); \
do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); } const UInt32 *z = (const UInt32 *)(src); \
unsigned n = (num); \
do { \
d[0] = z[0]; \
d[1] = z[1]; \
d[2] = z[2]; \
z += 3; \
d += 3; \
} while (--n); \
}
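An added standalone check of the reworked macro (the UInt32 typedef below stands in for the 7zTypes.h one; callers in this file invoke the macro without a trailing semicolon since its body is a bare block, and num must be >= 1 because of the do/while):

#include <stdio.h>
#include <stdint.h>

typedef uint32_t UInt32;   /* stand-in for the 7zTypes.h typedef */

#define MEM_12_CPY(dest, src, num) \
  { UInt32 *d = (UInt32 *)(dest); \
    const UInt32 *z = (const UInt32 *)(src); \
    unsigned n = (num); \
    do { d[0] = z[0]; d[1] = z[1]; d[2] = z[2]; z += 3; d += 3; } while (--n); }

int main(void)
{
  UInt32 src[6] = { 1, 2, 3, 4, 5, 6 };  /* two 12-byte UNITs */
  UInt32 dst[6] = { 0, 0, 0, 0, 0, 0 };
  MEM_12_CPY(dst, src, 2)                /* copies 2 * 3 UInt32 words */
  printf("%u %u\n", (unsigned)dst[0], (unsigned)dst[5]);  /* prints: 1 6 */
  return 0;
}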
@ -315,26 +324,26 @@ static void *ShrinkUnits(CPpmd8 *p, void *oldPtr, unsigned oldNU, unsigned newNU
return oldPtr; return oldPtr;
if (p->FreeList[i1] != 0) if (p->FreeList[i1] != 0)
{ {
void *ptr = RemoveNode(p, i1); void *ptr = Ppmd8_RemoveNode(p, i1);
MyMem12Cpy(ptr, oldPtr, newNU); MEM_12_CPY(ptr, oldPtr, newNU)
InsertNode(p, oldPtr, i0); Ppmd8_InsertNode(p, oldPtr, i0);
return ptr; return ptr;
} }
SplitBlock(p, oldPtr, i0, i1); Ppmd8_SplitBlock(p, oldPtr, i0, i1);
return oldPtr; return oldPtr;
} }
static void FreeUnits(CPpmd8 *p, void *ptr, unsigned nu) static void FreeUnits(CPpmd8 *p, void *ptr, unsigned nu)
{ {
InsertNode(p, ptr, U2I(nu)); Ppmd8_InsertNode(p, ptr, U2I(nu));
} }
static void SpecialFreeUnit(CPpmd8 *p, void *ptr) static void SpecialFreeUnit(CPpmd8 *p, void *ptr)
{ {
if ((Byte *)ptr != p->UnitsStart) if ((Byte *)ptr != p->UnitsStart)
InsertNode(p, ptr, 0); Ppmd8_InsertNode(p, ptr, 0);
else else
{ {
#ifdef PPMD8_FREEZE_SUPPORT #ifdef PPMD8_FREEZE_SUPPORT
@ -352,10 +361,10 @@ static void *MoveUnitsUp(CPpmd8 *p, void *oldPtr, unsigned nu)
void *ptr; void *ptr;
if ((Byte *)oldPtr > p->UnitsStart + (1 << 14) || REF(oldPtr) > p->FreeList[indx]) if ((Byte *)oldPtr > p->UnitsStart + (1 << 14) || REF(oldPtr) > p->FreeList[indx])
return oldPtr; return oldPtr;
ptr = RemoveNode(p, indx); ptr = Ppmd8_RemoveNode(p, indx);
MyMem12Cpy(ptr, oldPtr, nu); MEM_12_CPY(ptr, oldPtr, nu)
if ((Byte *)oldPtr != p->UnitsStart) if ((Byte *)oldPtr != p->UnitsStart)
InsertNode(p, oldPtr, indx); Ppmd8_InsertNode(p, oldPtr, indx);
else else
p->UnitsStart += U2B(I2U(indx)); p->UnitsStart += U2B(I2U(indx));
return ptr; return ptr;
@ -411,22 +420,22 @@ static void ExpandTextArea(CPpmd8 *p)
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
static void SetSuccessor(CPpmd_State *p, CPpmd_Void_Ref v) static void Ppmd8State_SetSuccessor(CPpmd_State *p, CPpmd_Void_Ref v)
{ {
Ppmd_SET_SUCCESSOR(p, v); Ppmd_SET_SUCCESSOR(p, v)
} }
#define RESET_TEXT(offs) { p->Text = p->Base + p->AlignOffset + (offs); } #define RESET_TEXT(offs) { p->Text = p->Base + p->AlignOffset + (offs); }
MY_NO_INLINE Z7_NO_INLINE
static static
void RestartModel(CPpmd8 *p) void Ppmd8_RestartModel(CPpmd8 *p)
{ {
unsigned i, k, m; unsigned i, k, m;
memset(p->FreeList, 0, sizeof(p->FreeList)); memset(p->FreeList, 0, sizeof(p->FreeList));
memset(p->Stamps, 0, sizeof(p->Stamps)); memset(p->Stamps, 0, sizeof(p->Stamps));
RESET_TEXT(0); RESET_TEXT(0)
p->HiUnit = p->Text + p->Size; p->HiUnit = p->Text + p->Size;
p->LoUnit = p->UnitsStart = p->HiUnit - p->Size / 8 / UNIT_SIZE * 7 * UNIT_SIZE; p->LoUnit = p->UnitsStart = p->HiUnit - p->Size / 8 / UNIT_SIZE * 7 * UNIT_SIZE;
p->GlueCount = 0; p->GlueCount = 0;
@ -436,8 +445,8 @@ void RestartModel(CPpmd8 *p)
p->PrevSuccess = 0; p->PrevSuccess = 0;
{ {
CPpmd8_Context *mc = (CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */ CPpmd8_Context *mc = (PPMD8_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); /* AllocContext(p); */
CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* AllocUnits(p, PPMD_NUM_INDEXES - 1); */ CPpmd_State *s = (CPpmd_State *)p->LoUnit; /* Ppmd8_AllocUnits(p, PPMD_NUM_INDEXES - 1); */
p->LoUnit += U2B(256 / 2); p->LoUnit += U2B(256 / 2);
p->MaxContext = p->MinContext = mc; p->MaxContext = p->MinContext = mc;
@ -452,7 +461,7 @@ void RestartModel(CPpmd8 *p)
{ {
s->Symbol = (Byte)i; s->Symbol = (Byte)i;
s->Freq = 1; s->Freq = 1;
SetSuccessor(s, 0); Ppmd8State_SetSuccessor(s, 0);
} }
} }
@ -475,7 +484,7 @@ void RestartModel(CPpmd8 *p)
{ {
unsigned r; unsigned r;
UInt16 *dest = p->BinSumm[m] + k; UInt16 *dest = p->BinSumm[m] + k;
UInt16 val = (UInt16)(PPMD_BIN_SCALE - kInitBinEsc[k] / (i + 1)); const UInt16 val = (UInt16)(PPMD_BIN_SCALE - PPMD8_kInitBinEsc[k] / (i + 1));
for (r = 0; r < 64; r += 8) for (r = 0; r < 64; r += 8)
dest[r] = val; dest[r] = val;
} }
@ -507,7 +516,7 @@ void Ppmd8_Init(CPpmd8 *p, unsigned maxOrder, unsigned restoreMethod)
{ {
p->MaxOrder = maxOrder; p->MaxOrder = maxOrder;
p->RestoreMethod = restoreMethod; p->RestoreMethod = restoreMethod;
RestartModel(p); Ppmd8_RestartModel(p);
} }
@ -531,7 +540,7 @@ Refresh() is called when we remove some symbols (successors) in context.
It increases Escape_Freq for sum of all removed symbols. It increases Escape_Freq for sum of all removed symbols.
*/ */
static void Refresh(CPpmd8 *p, CTX_PTR ctx, unsigned oldNU, unsigned scale) static void Refresh(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned oldNU, unsigned scale)
{ {
unsigned i = ctx->NumStats, escFreq, sumFreq, flags; unsigned i = ctx->NumStats, escFreq, sumFreq, flags;
CPpmd_State *s = (CPpmd_State *)ShrinkUnits(p, STATS(ctx), oldNU, (i + 2) >> 1); CPpmd_State *s = (CPpmd_State *)ShrinkUnits(p, STATS(ctx), oldNU, (i + 2) >> 1);
@ -581,7 +590,7 @@ static void Refresh(CPpmd8 *p, CTX_PTR ctx, unsigned oldNU, unsigned scale)
} }
static void SwapStates(CPpmd_State *t1, CPpmd_State *t2) static void SWAP_STATES(CPpmd_State *t1, CPpmd_State *t2)
{ {
CPpmd_State tmp = *t1; CPpmd_State tmp = *t1;
*t1 = *t2; *t1 = *t2;
@ -597,7 +606,7 @@ CutOff() reduces contexts:
if the (Union4.Stats) is close to (UnitsStart), it moves it up. if the (Union4.Stats) is close to (UnitsStart), it moves it up.
*/ */
static CPpmd_Void_Ref CutOff(CPpmd8 *p, CTX_PTR ctx, unsigned order) static CPpmd_Void_Ref CutOff(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned order)
{ {
int ns = ctx->NumStats; int ns = ctx->NumStats;
unsigned nu; unsigned nu;
@ -613,7 +622,7 @@ static CPpmd_Void_Ref CutOff(CPpmd8 *p, CTX_PTR ctx, unsigned order)
successor = CutOff(p, CTX(successor), order + 1); successor = CutOff(p, CTX(successor), order + 1);
else else
successor = 0; successor = 0;
SetSuccessor(s, successor); Ppmd8State_SetSuccessor(s, successor);
if (successor || order <= 9) /* O_BOUND */ if (successor || order <= 9) /* O_BOUND */
return REF(ctx); return REF(ctx);
} }
@ -630,11 +639,11 @@ static CPpmd_Void_Ref CutOff(CPpmd8 *p, CTX_PTR ctx, unsigned order)
if ((UInt32)((Byte *)stats - p->UnitsStart) <= (1 << 14) if ((UInt32)((Byte *)stats - p->UnitsStart) <= (1 << 14)
&& (CPpmd_Void_Ref)ctx->Union4.Stats <= p->FreeList[indx]) && (CPpmd_Void_Ref)ctx->Union4.Stats <= p->FreeList[indx])
{ {
void *ptr = RemoveNode(p, indx); void *ptr = Ppmd8_RemoveNode(p, indx);
ctx->Union4.Stats = STATS_REF(ptr); ctx->Union4.Stats = STATS_REF(ptr);
MyMem12Cpy(ptr, (const void *)stats, nu); MEM_12_CPY(ptr, (const void *)stats, nu)
if ((Byte *)stats != p->UnitsStart) if ((Byte *)stats != p->UnitsStart)
InsertNode(p, stats, indx); Ppmd8_InsertNode(p, stats, indx);
else else
p->UnitsStart += U2B(I2U(indx)); p->UnitsStart += U2B(I2U(indx));
stats = ptr; stats = ptr;
@ -656,16 +665,16 @@ static CPpmd_Void_Ref CutOff(CPpmd8 *p, CTX_PTR ctx, unsigned order)
} }
else else
{ {
SwapStates(s, s2); SWAP_STATES(s, s2);
SetSuccessor(s2, 0); Ppmd8State_SetSuccessor(s2, 0);
} }
} }
else else
{ {
if (order < p->MaxOrder) if (order < p->MaxOrder)
SetSuccessor(s, CutOff(p, CTX(successor), order + 1)); Ppmd8State_SetSuccessor(s, CutOff(p, CTX(successor), order + 1));
else else
SetSuccessor(s, 0); Ppmd8State_SetSuccessor(s, 0);
} }
} }
while (--s >= stats); while (--s >= stats);
@ -711,7 +720,7 @@ RemoveBinContexts()
removes Bin Context without Successor, if suffix of that context is also binary. removes Bin Context without Successor, if suffix of that context is also binary.
*/ */
static CPpmd_Void_Ref RemoveBinContexts(CPpmd8 *p, CTX_PTR ctx, unsigned order) static CPpmd_Void_Ref RemoveBinContexts(CPpmd8 *p, PPMD8_CTX_PTR ctx, unsigned order)
{ {
if (!ctx->NumStats) if (!ctx->NumStats)
{ {
@ -721,7 +730,7 @@ static CPpmd_Void_Ref RemoveBinContexts(CPpmd8 *p, CTX_PTR ctx, unsigned order)
successor = RemoveBinContexts(p, CTX(successor), order + 1); successor = RemoveBinContexts(p, CTX(successor), order + 1);
else else
successor = 0; successor = 0;
SetSuccessor(s, successor); Ppmd8State_SetSuccessor(s, successor);
/* Suffix context can be removed already, since different (high-order) /* Suffix context can be removed already, since different (high-order)
Successors may refer to same context. So we check Flags == 0xFF (Stamp == EMPTY_NODE) */ Successors may refer to same context. So we check Flags == 0xFF (Stamp == EMPTY_NODE) */
if (!successor && (!SUFFIX(ctx)->NumStats || SUFFIX(ctx)->Flags == 0xFF)) if (!successor && (!SUFFIX(ctx)->NumStats || SUFFIX(ctx)->Flags == 0xFF))
@ -737,9 +746,9 @@ static CPpmd_Void_Ref RemoveBinContexts(CPpmd8 *p, CTX_PTR ctx, unsigned order)
{ {
CPpmd_Void_Ref successor = SUCCESSOR(s); CPpmd_Void_Ref successor = SUCCESSOR(s);
if ((Byte *)Ppmd8_GetPtr(p, successor) >= p->UnitsStart && order < p->MaxOrder) if ((Byte *)Ppmd8_GetPtr(p, successor) >= p->UnitsStart && order < p->MaxOrder)
SetSuccessor(s, RemoveBinContexts(p, CTX(successor), order + 1)); Ppmd8State_SetSuccessor(s, RemoveBinContexts(p, CTX(successor), order + 1));
else else
SetSuccessor(s, 0); Ppmd8State_SetSuccessor(s, 0);
} }
while (--s >= STATS(ctx)); while (--s >= STATS(ctx));
} }
@ -767,15 +776,15 @@ static UInt32 GetUsedMemory(const CPpmd8 *p)
#endif #endif
static void RestoreModel(CPpmd8 *p, CTX_PTR ctxError static void RestoreModel(CPpmd8 *p, PPMD8_CTX_PTR ctxError
#ifdef PPMD8_FREEZE_SUPPORT #ifdef PPMD8_FREEZE_SUPPORT
, CTX_PTR fSuccessor , PPMD8_CTX_PTR fSuccessor
#endif #endif
) )
{ {
CTX_PTR c; PPMD8_CTX_PTR c;
CPpmd_State *s; CPpmd_State *s;
RESET_TEXT(0); RESET_TEXT(0)
// we go here in cases of error of allocation for context (c1) // we go here in cases of error of allocation for context (c1)
// Order(MinContext) < Order(ctxError) <= Order(MaxContext) // Order(MinContext) < Order(ctxError) <= Order(MaxContext)
@ -831,7 +840,7 @@ static void RestoreModel(CPpmd8 *p, CTX_PTR ctxError
else else
#endif #endif
if (p->RestoreMethod == PPMD8_RESTORE_METHOD_RESTART || GetUsedMemory(p) < (p->Size >> 1)) if (p->RestoreMethod == PPMD8_RESTORE_METHOD_RESTART || GetUsedMemory(p) < (p->Size >> 1))
RestartModel(p); Ppmd8_RestartModel(p);
else else
{ {
while (p->MaxContext->Suffix) while (p->MaxContext->Suffix)
@ -850,8 +859,8 @@ static void RestoreModel(CPpmd8 *p, CTX_PTR ctxError
MY_NO_INLINE Z7_NO_INLINE
static CTX_PTR CreateSuccessors(CPpmd8 *p, BoolInt skip, CPpmd_State *s1, CTX_PTR c) static PPMD8_CTX_PTR Ppmd8_CreateSuccessors(CPpmd8 *p, BoolInt skip, CPpmd_State *s1, PPMD8_CTX_PTR c)
{ {
CPpmd_Byte_Ref upBranch = (CPpmd_Byte_Ref)SUCCESSOR(p->FoundState); CPpmd_Byte_Ref upBranch = (CPpmd_Byte_Ref)SUCCESSOR(p->FoundState);
@ -927,15 +936,15 @@ static CTX_PTR CreateSuccessors(CPpmd8 *p, BoolInt skip, CPpmd_State *s1, CTX_PT
do do
{ {
CTX_PTR c1; PPMD8_CTX_PTR c1;
/* = AllocContext(p); */ /* = AllocContext(p); */
if (p->HiUnit != p->LoUnit) if (p->HiUnit != p->LoUnit)
c1 = (CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE); c1 = (PPMD8_CTX_PTR)(void *)(p->HiUnit -= UNIT_SIZE);
else if (p->FreeList[0] != 0) else if (p->FreeList[0] != 0)
c1 = (CTX_PTR)RemoveNode(p, 0); c1 = (PPMD8_CTX_PTR)Ppmd8_RemoveNode(p, 0);
else else
{ {
c1 = (CTX_PTR)AllocUnitsRare(p, 0); c1 = (PPMD8_CTX_PTR)Ppmd8_AllocUnitsRare(p, 0);
if (!c1) if (!c1)
return NULL; return NULL;
} }
@ -943,9 +952,9 @@ static CTX_PTR CreateSuccessors(CPpmd8 *p, BoolInt skip, CPpmd_State *s1, CTX_PT
c1->NumStats = 0; c1->NumStats = 0;
c1->Union2.State2.Symbol = newSym; c1->Union2.State2.Symbol = newSym;
c1->Union2.State2.Freq = newFreq; c1->Union2.State2.Freq = newFreq;
SetSuccessor(ONE_STATE(c1), upBranch); Ppmd8State_SetSuccessor(ONE_STATE(c1), upBranch);
c1->Suffix = REF(c); c1->Suffix = REF(c);
SetSuccessor(ps[--numPs], REF(c1)); Ppmd8State_SetSuccessor(ps[--numPs], REF(c1));
c = c1; c = c1;
} }
while (numPs != 0); while (numPs != 0);
@ -954,10 +963,10 @@ static CTX_PTR CreateSuccessors(CPpmd8 *p, BoolInt skip, CPpmd_State *s1, CTX_PT
} }
static CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, CTX_PTR c) static PPMD8_CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, PPMD8_CTX_PTR c)
{ {
CPpmd_State *s = NULL; CPpmd_State *s = NULL;
CTX_PTR c1 = c; PPMD8_CTX_PTR c1 = c;
CPpmd_Void_Ref upBranch = REF(p->Text); CPpmd_Void_Ref upBranch = REF(p->Text);
#ifdef PPMD8_FREEZE_SUPPORT #ifdef PPMD8_FREEZE_SUPPORT
@ -967,7 +976,7 @@ static CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, CTX_PTR c)
ps[numPs++] = p->FoundState; ps[numPs++] = p->FoundState;
#endif #endif
SetSuccessor(p->FoundState, upBranch); Ppmd8State_SetSuccessor(p->FoundState, upBranch);
p->OrderFall++; p->OrderFall++;
for (;;) for (;;)
@ -985,8 +994,8 @@ static CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, CTX_PTR c)
#ifdef PPMD8_FREEZE_SUPPORT #ifdef PPMD8_FREEZE_SUPPORT
if (p->RestoreMethod > PPMD8_RESTORE_METHOD_FREEZE) if (p->RestoreMethod > PPMD8_RESTORE_METHOD_FREEZE)
{ {
do { SetSuccessor(ps[--numPs], REF(c)); } while (numPs); do { Ppmd8State_SetSuccessor(ps[--numPs], REF(c)); } while (numPs);
RESET_TEXT(1); RESET_TEXT(1)
p->OrderFall = 1; p->OrderFall = 1;
} }
#endif #endif
@ -1014,7 +1023,7 @@ static CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, CTX_PTR c)
#ifdef PPMD8_FREEZE_SUPPORT #ifdef PPMD8_FREEZE_SUPPORT
ps[numPs++] = s; ps[numPs++] = s;
#endif #endif
SetSuccessor(s, upBranch); Ppmd8State_SetSuccessor(s, upBranch);
p->OrderFall++; p->OrderFall++;
} }
@ -1022,8 +1031,8 @@ static CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, CTX_PTR c)
if (p->RestoreMethod > PPMD8_RESTORE_METHOD_FREEZE) if (p->RestoreMethod > PPMD8_RESTORE_METHOD_FREEZE)
{ {
c = CTX(SUCCESSOR(s)); c = CTX(SUCCESSOR(s));
do { SetSuccessor(ps[--numPs], REF(c)); } while (numPs); do { Ppmd8State_SetSuccessor(ps[--numPs], REF(c)); } while (numPs);
RESET_TEXT(1); RESET_TEXT(1)
p->OrderFall = 1; p->OrderFall = 1;
return c; return c;
} }
@ -1031,15 +1040,15 @@ static CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, CTX_PTR c)
#endif #endif
if (SUCCESSOR(s) <= upBranch) if (SUCCESSOR(s) <= upBranch)
{ {
CTX_PTR successor; PPMD8_CTX_PTR successor;
CPpmd_State *s2 = p->FoundState; CPpmd_State *s2 = p->FoundState;
p->FoundState = s; p->FoundState = s;
successor = CreateSuccessors(p, False, NULL, c); successor = Ppmd8_CreateSuccessors(p, False, NULL, c);
if (!successor) if (!successor)
SetSuccessor(s, 0); Ppmd8State_SetSuccessor(s, 0);
else else
SetSuccessor(s, REF(successor)); Ppmd8State_SetSuccessor(s, REF(successor));
p->FoundState = s2; p->FoundState = s2;
} }
@ -1047,7 +1056,7 @@ static CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, CTX_PTR c)
CPpmd_Void_Ref successor = SUCCESSOR(s); CPpmd_Void_Ref successor = SUCCESSOR(s);
if (p->OrderFall == 1 && c1 == p->MaxContext) if (p->OrderFall == 1 && c1 == p->MaxContext)
{ {
SetSuccessor(p->FoundState, successor); Ppmd8State_SetSuccessor(p->FoundState, successor);
p->Text--; p->Text--;
} }
if (successor == 0) if (successor == 0)
@ -1059,11 +1068,11 @@ static CTX_PTR ReduceOrder(CPpmd8 *p, CPpmd_State *s1, CTX_PTR c)
void Ppmd8_UpdateModel(CPpmd8 *p); void Ppmd8_UpdateModel(CPpmd8 *p);
MY_NO_INLINE Z7_NO_INLINE
void Ppmd8_UpdateModel(CPpmd8 *p) void Ppmd8_UpdateModel(CPpmd8 *p)
{ {
CPpmd_Void_Ref maxSuccessor, minSuccessor = SUCCESSOR(p->FoundState); CPpmd_Void_Ref maxSuccessor, minSuccessor = SUCCESSOR(p->FoundState);
CTX_PTR c; PPMD8_CTX_PTR c;
unsigned s0, ns, fFreq = p->FoundState->Freq; unsigned s0, ns, fFreq = p->FoundState->Freq;
Byte flag, fSymbol = p->FoundState->Symbol; Byte flag, fSymbol = p->FoundState->Symbol;
{ {
@ -1096,7 +1105,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
if (s[0].Freq >= s[-1].Freq) if (s[0].Freq >= s[-1].Freq)
{ {
SwapStates(&s[0], &s[-1]); SWAP_STATES(&s[0], &s[-1]);
s--; s--;
} }
} }
@ -1112,14 +1121,14 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
c = p->MaxContext; c = p->MaxContext;
if (p->OrderFall == 0 && minSuccessor) if (p->OrderFall == 0 && minSuccessor)
{ {
CTX_PTR cs = CreateSuccessors(p, True, s, p->MinContext); PPMD8_CTX_PTR cs = Ppmd8_CreateSuccessors(p, True, s, p->MinContext);
if (!cs) if (!cs)
{ {
SetSuccessor(p->FoundState, 0); Ppmd8State_SetSuccessor(p->FoundState, 0);
RESTORE_MODEL(c, CTX(minSuccessor)); RESTORE_MODEL(c, CTX(minSuccessor));
return; return;
} }
SetSuccessor(p->FoundState, REF(cs)); Ppmd8State_SetSuccessor(p->FoundState, REF(cs));
p->MinContext = p->MaxContext = cs; p->MinContext = p->MaxContext = cs;
return; return;
} }
@ -1141,7 +1150,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
if (!minSuccessor) if (!minSuccessor)
{ {
CTX_PTR cs = ReduceOrder(p, s, p->MinContext); PPMD8_CTX_PTR cs = ReduceOrder(p, s, p->MinContext);
if (!cs) if (!cs)
{ {
RESTORE_MODEL(c, NULL); RESTORE_MODEL(c, NULL);
@ -1151,7 +1160,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
} }
else if ((Byte *)Ppmd8_GetPtr(p, minSuccessor) < p->UnitsStart) else if ((Byte *)Ppmd8_GetPtr(p, minSuccessor) < p->UnitsStart)
{ {
CTX_PTR cs = CreateSuccessors(p, False, s, p->MinContext); PPMD8_CTX_PTR cs = Ppmd8_CreateSuccessors(p, False, s, p->MinContext);
if (!cs) if (!cs)
{ {
RESTORE_MODEL(c, NULL); RESTORE_MODEL(c, NULL);
@ -1169,7 +1178,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
else if (p->RestoreMethod > PPMD8_RESTORE_METHOD_FREEZE) else if (p->RestoreMethod > PPMD8_RESTORE_METHOD_FREEZE)
{ {
maxSuccessor = minSuccessor; maxSuccessor = minSuccessor;
RESET_TEXT(0); RESET_TEXT(0)
p->OrderFall = 0; p->OrderFall = 0;
} }
#endif #endif
@ -1215,11 +1224,11 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
if ((ns1 & 1) != 0) if ((ns1 & 1) != 0)
{ {
/* Expand for one UNIT */ /* Expand for one UNIT */
unsigned oldNU = (ns1 + 1) >> 1; const unsigned oldNU = (ns1 + 1) >> 1;
unsigned i = U2I(oldNU); const unsigned i = U2I(oldNU);
if (i != U2I((size_t)oldNU + 1)) if (i != U2I((size_t)oldNU + 1))
{ {
void *ptr = AllocUnits(p, i + 1); void *ptr = Ppmd8_AllocUnits(p, i + 1);
void *oldPtr; void *oldPtr;
if (!ptr) if (!ptr)
{ {
@ -1227,15 +1236,15 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
return; return;
} }
oldPtr = STATS(c); oldPtr = STATS(c);
MyMem12Cpy(ptr, oldPtr, oldNU); MEM_12_CPY(ptr, oldPtr, oldNU)
InsertNode(p, oldPtr, i); Ppmd8_InsertNode(p, oldPtr, i);
c->Union4.Stats = STATS_REF(ptr); c->Union4.Stats = STATS_REF(ptr);
} }
} }
sum = c->Union2.SummFreq; sum = c->Union2.SummFreq;
/* max increase of Escape_Freq is 1 here. /* max increase of Escape_Freq is 1 here.
an average increase is 1/3 per symbol */ an average increase is 1/3 per symbol */
sum += (3 * ns1 + 1 < ns); sum += (UInt32)(unsigned)(3 * ns1 + 1 < ns);
/* original PPMdH uses 16-bit variable for (sum) here. /* original PPMdH uses 16-bit variable for (sum) here.
But (sum < ???). Do we need to truncate (sum) to 16-bit */ But (sum < ???). Do we need to truncate (sum) to 16-bit */
// sum = (UInt16)sum; // sum = (UInt16)sum;
@ -1243,7 +1252,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
else else
{ {
CPpmd_State *s = (CPpmd_State*)AllocUnits(p, 0); CPpmd_State *s = (CPpmd_State*)Ppmd8_AllocUnits(p, 0);
if (!s) if (!s)
{ {
RESTORE_MODEL(c, CTX(minSuccessor)); RESTORE_MODEL(c, CTX(minSuccessor));
@ -1255,7 +1264,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
s->Symbol = c->Union2.State2.Symbol; s->Symbol = c->Union2.State2.Symbol;
s->Successor_0 = c->Union4.State4.Successor_0; s->Successor_0 = c->Union4.State4.Successor_0;
s->Successor_1 = c->Union4.State4.Successor_1; s->Successor_1 = c->Union4.State4.Successor_1;
// SetSuccessor(s, c->Union4.Stats); // call it only for debug purposes to check the order of // Ppmd8State_SetSuccessor(s, c->Union4.Stats); // call it only for debug purposes to check the order of
// (Successor_0 and Successor_1) in LE/BE. // (Successor_0 and Successor_1) in LE/BE.
c->Union4.Stats = REF(s); c->Union4.Stats = REF(s);
if (freq < MAX_FREQ / 4 - 1) if (freq < MAX_FREQ / 4 - 1)
@ -1265,7 +1274,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
s->Freq = (Byte)freq; s->Freq = (Byte)freq;
sum = freq + p->InitEsc + (ns > 2); // Ppmd8 (> 2) sum = (UInt32)(freq + p->InitEsc + (ns > 2)); // Ppmd8 (> 2)
} }
} }
@ -1275,7 +1284,7 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
UInt32 sf = (UInt32)s0 + sum; UInt32 sf = (UInt32)s0 + sum;
s->Symbol = fSymbol; s->Symbol = fSymbol;
c->NumStats = (Byte)(ns1 + 1); c->NumStats = (Byte)(ns1 + 1);
SetSuccessor(s, maxSuccessor); Ppmd8State_SetSuccessor(s, maxSuccessor);
c->Flags |= flag; c->Flags |= flag;
if (cf < 6 * sf) if (cf < 6 * sf)
{ {
@ -1299,8 +1308,8 @@ void Ppmd8_UpdateModel(CPpmd8 *p)
MY_NO_INLINE Z7_NO_INLINE
static void Rescale(CPpmd8 *p) static void Ppmd8_Rescale(CPpmd8 *p)
{ {
unsigned i, adder, sumFreq, escFreq; unsigned i, adder, sumFreq, escFreq;
CPpmd_State *stats = STATS(p->MinContext); CPpmd_State *stats = STATS(p->MinContext);
@ -1389,7 +1398,7 @@ static void Rescale(CPpmd8 *p)
*s = *stats; *s = *stats;
s->Freq = (Byte)freq; s->Freq = (Byte)freq;
p->FoundState = s; p->FoundState = s;
InsertNode(p, stats, U2I(n0)); Ppmd8_InsertNode(p, stats, U2I(n0));
return; return;
} }
@ -1437,10 +1446,10 @@ CPpmd_See *Ppmd8_MakeEscFreq(CPpmd8 *p, unsigned numMasked1, UInt32 *escFreq)
{ {
// if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ // if (see->Summ) field is larger than 16-bit, we need only low 16 bits of Summ
unsigned summ = (UInt16)see->Summ; // & 0xFFFF const unsigned summ = (UInt16)see->Summ; // & 0xFFFF
unsigned r = (summ >> see->Shift); const unsigned r = (summ >> see->Shift);
see->Summ = (UInt16)(summ - r); see->Summ = (UInt16)(summ - r);
*escFreq = r + (r == 0); *escFreq = (UInt32)(r + (r == 0));
} }
} }
else else
@ -1452,9 +1461,9 @@ CPpmd_See *Ppmd8_MakeEscFreq(CPpmd8 *p, unsigned numMasked1, UInt32 *escFreq)
} }
static void NextContext(CPpmd8 *p) static void Ppmd8_NextContext(CPpmd8 *p)
{ {
CTX_PTR c = CTX(SUCCESSOR(p->FoundState)); PPMD8_CTX_PTR c = CTX(SUCCESSOR(p->FoundState));
if (p->OrderFall == 0 && (const Byte *)c >= p->UnitsStart) if (p->OrderFall == 0 && (const Byte *)c >= p->UnitsStart)
p->MaxContext = p->MinContext = c; p->MaxContext = p->MinContext = c;
else else
@ -1471,12 +1480,12 @@ void Ppmd8_Update1(CPpmd8 *p)
s->Freq = (Byte)freq; s->Freq = (Byte)freq;
if (freq > s[-1].Freq) if (freq > s[-1].Freq)
{ {
SwapStates(s, &s[-1]); SWAP_STATES(s, &s[-1]);
p->FoundState = --s; p->FoundState = --s;
if (freq > MAX_FREQ) if (freq > MAX_FREQ)
Rescale(p); Ppmd8_Rescale(p);
} }
NextContext(p); Ppmd8_NextContext(p);
} }
@ -1485,15 +1494,15 @@ void Ppmd8_Update1_0(CPpmd8 *p)
CPpmd_State *s = p->FoundState; CPpmd_State *s = p->FoundState;
CPpmd8_Context *mc = p->MinContext; CPpmd8_Context *mc = p->MinContext;
unsigned freq = s->Freq; unsigned freq = s->Freq;
unsigned summFreq = mc->Union2.SummFreq; const unsigned summFreq = mc->Union2.SummFreq;
p->PrevSuccess = (2 * freq >= summFreq); // Ppmd8 (>=) p->PrevSuccess = (2 * freq >= summFreq); // Ppmd8 (>=)
p->RunLength += (int)p->PrevSuccess; p->RunLength += (Int32)p->PrevSuccess;
mc->Union2.SummFreq = (UInt16)(summFreq + 4); mc->Union2.SummFreq = (UInt16)(summFreq + 4);
freq += 4; freq += 4;
s->Freq = (Byte)freq; s->Freq = (Byte)freq;
if (freq > MAX_FREQ) if (freq > MAX_FREQ)
Rescale(p); Ppmd8_Rescale(p);
NextContext(p); Ppmd8_NextContext(p);
} }
@ -1504,7 +1513,7 @@ void Ppmd8_UpdateBin(CPpmd8 *p)
p->FoundState->Freq = (Byte)(freq + (freq < 196)); // Ppmd8 (196) p->FoundState->Freq = (Byte)(freq + (freq < 196)); // Ppmd8 (196)
p->PrevSuccess = 1; p->PrevSuccess = 1;
p->RunLength++; p->RunLength++;
NextContext(p); Ppmd8_NextContext(p);
} }
*/ */
@ -1517,7 +1526,7 @@ void Ppmd8_Update2(CPpmd8 *p)
p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4); p->MinContext->Union2.SummFreq = (UInt16)(p->MinContext->Union2.SummFreq + 4);
s->Freq = (Byte)freq; s->Freq = (Byte)freq;
if (freq > MAX_FREQ) if (freq > MAX_FREQ)
Rescale(p); Ppmd8_Rescale(p);
Ppmd8_UpdateModel(p); Ppmd8_UpdateModel(p);
} }
@ -1526,7 +1535,7 @@ void Ppmd8_Update2(CPpmd8 *p)
GlueCount, and Glue method GlueCount, and Glue method
BinSum BinSum
See / EscFreq See / EscFreq
CreateSuccessors updates more suffix contexts Ppmd8_CreateSuccessors updates more suffix contexts
Ppmd8_UpdateModel consts. Ppmd8_UpdateModel consts.
PrevSuccess Update PrevSuccess Update
@ -1535,3 +1544,31 @@ Flags:
(1 << 3) - there is symbol in Stats with (sym >= 0x40) in (1 << 3) - there is symbol in Stats with (sym >= 0x40) in
(1 << 4) - main symbol of context is (sym >= 0x40) (1 << 4) - main symbol of context is (sym >= 0x40)
*/ */
#undef RESET_TEXT
#undef FLAG_RESCALED
#undef FLAG_PREV_HIGH
#undef HiBits_Prepare
#undef HiBits_Convert_3
#undef HiBits_Convert_4
#undef PPMD8_HiBitsFlag_3
#undef PPMD8_HiBitsFlag_4
#undef RESTORE_MODEL
#undef MAX_FREQ
#undef UNIT_SIZE
#undef U2B
#undef U2I
#undef I2U
#undef REF
#undef STATS_REF
#undef CTX
#undef STATS
#undef ONE_STATE
#undef SUFFIX
#undef NODE
#undef EMPTY_NODE
#undef MEM_12_CPY
#undef SUCCESSOR
#undef SWAP_STATES
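The renames above replace the bare SetSuccessor with Ppmd8State_SetSuccessor, and the debug comment near the Stats conversion shows why the accessor exists: a 32-bit successor reference is stored as two 16-bit fields (Successor_0, Successor_1), so the field order must be handled explicitly on LE/BE targets. A minimal sketch of that split-field scheme, using illustrative names rather than the SDK's exact definitions:

#include <stdint.h>

typedef struct { uint16_t Successor_0, Successor_1; } MiniState;

/* the 32-bit reference is reassembled from / split into the two halves */
static uint32_t MiniGetSuccessor(const MiniState *s)
{
  return (uint32_t)s->Successor_0 | ((uint32_t)s->Successor_1 << 16);
}

static void MiniSetSuccessor(MiniState *s, uint32_t v)
{
  s->Successor_0 = (uint16_t)v;          /* low 16 bits  */
  s->Successor_1 = (uint16_t)(v >> 16);  /* high 16 bits */
}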

View file

@ -1,11 +1,11 @@
/* Ppmd8.h -- Ppmd8 (PPMdI) compression codec /* Ppmd8.h -- Ppmd8 (PPMdI) compression codec
2021-04-13 : Igor Pavlov : Public domain 2023-04-02 : Igor Pavlov : Public domain
This code is based on: This code is based on:
PPMd var.I (2002): Dmitry Shkarin : Public domain PPMd var.I (2002): Dmitry Shkarin : Public domain
Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
#ifndef __PPMD8_H #ifndef ZIP7_INC_PPMD8_H
#define __PPMD8_H #define ZIP7_INC_PPMD8_H
#include "Ppmd.h" #include "Ppmd.h"
@ -87,8 +87,8 @@ typedef struct
UInt32 Low; UInt32 Low;
union union
{ {
IByteIn *In; IByteInPtr In;
IByteOut *Out; IByteOutPtr Out;
} Stream; } Stream;
Byte Indx2Units[PPMD_NUM_INDEXES + 2]; // +2 for alignment Byte Indx2Units[PPMD_NUM_INDEXES + 2]; // +2 for alignment

View file

@ -1,5 +1,5 @@
/* Ppmd8Dec.c -- Ppmd8 (PPMdI) Decoder /* Ppmd8Dec.c -- Ppmd8 (PPMdI) Decoder
2021-04-13 : Igor Pavlov : Public domain 2023-09-07 : Igor Pavlov : Public domain
This code is based on: This code is based on:
PPMd var.I (2002): Dmitry Shkarin : Public domain PPMd var.I (2002): Dmitry Shkarin : Public domain
Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
@ -8,8 +8,8 @@ This code is based on:
#include "Ppmd8.h" #include "Ppmd8.h"
#define kTop (1 << 24) #define kTop ((UInt32)1 << 24)
#define kBot (1 << 15) #define kBot ((UInt32)1 << 15)
#define READ_BYTE(p) IByteIn_Read((p)->Stream.In) #define READ_BYTE(p) IByteIn_Read((p)->Stream.In)
@ -37,9 +37,9 @@ BoolInt Ppmd8_Init_RangeDec(CPpmd8 *p)
#define R p #define R p
MY_FORCE_INLINE Z7_FORCE_INLINE
// MY_NO_INLINE // Z7_NO_INLINE
static void RangeDec_Decode(CPpmd8 *p, UInt32 start, UInt32 size) static void Ppmd8_RD_Decode(CPpmd8 *p, UInt32 start, UInt32 size)
{ {
start *= R->Range; start *= R->Range;
R->Low += start; R->Low += start;
@ -48,17 +48,17 @@ static void RangeDec_Decode(CPpmd8 *p, UInt32 start, UInt32 size)
RC_NORM_LOCAL(R) RC_NORM_LOCAL(R)
} }
#define RC_Decode(start, size) RangeDec_Decode(p, start, size); #define RC_Decode(start, size) Ppmd8_RD_Decode(p, start, size);
#define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R) #define RC_DecodeFinal(start, size) RC_Decode(start, size) RC_NORM_REMOTE(R)
#define RC_GetThreshold(total) (R->Code / (R->Range /= (total))) #define RC_GetThreshold(total) (R->Code / (R->Range /= (total)))
#define CTX(ref) ((CPpmd8_Context *)Ppmd8_GetContext(p, ref)) #define CTX(ref) ((CPpmd8_Context *)Ppmd8_GetContext(p, ref))
typedef CPpmd8_Context * CTX_PTR; // typedef CPpmd8_Context * CTX_PTR;
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
void Ppmd8_UpdateModel(CPpmd8 *p); void Ppmd8_UpdateModel(CPpmd8 *p);
#define MASK(sym) ((unsigned char *)charMask)[sym] #define MASK(sym) ((Byte *)charMask)[sym]
int Ppmd8_DecodeSymbol(CPpmd8 *p) int Ppmd8_DecodeSymbol(CPpmd8 *p)
@ -81,7 +81,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
if ((Int32)(count -= s->Freq) < 0) if ((Int32)(count -= s->Freq) < 0)
{ {
Byte sym; Byte sym;
RC_DecodeFinal(0, s->Freq); RC_DecodeFinal(0, s->Freq)
p->FoundState = s; p->FoundState = s;
sym = s->Symbol; sym = s->Symbol;
Ppmd8_Update1_0(p); Ppmd8_Update1_0(p);
@ -96,7 +96,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
if ((Int32)(count -= (++s)->Freq) < 0) if ((Int32)(count -= (++s)->Freq) < 0)
{ {
Byte sym; Byte sym;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq); RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
p->FoundState = s; p->FoundState = s;
sym = s->Symbol; sym = s->Symbol;
Ppmd8_Update1(p); Ppmd8_Update1(p);
@ -109,10 +109,10 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
return PPMD8_SYM_ERROR; return PPMD8_SYM_ERROR;
hiCnt -= count; hiCnt -= count;
RC_Decode(hiCnt, summFreq - hiCnt); RC_Decode(hiCnt, summFreq - hiCnt)
PPMD_SetAllBitsIn256Bytes(charMask); PPMD_SetAllBitsIn256Bytes(charMask)
// i = p->MinContext->NumStats - 1; // i = p->MinContext->NumStats - 1;
// do { MASK((--s)->Symbol) = 0; } while (--i); // do { MASK((--s)->Symbol) = 0; } while (--i);
{ {
@ -120,8 +120,8 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
MASK(s->Symbol) = 0; MASK(s->Symbol) = 0;
do do
{ {
unsigned sym0 = s2[0].Symbol; const unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol; const unsigned sym1 = s2[1].Symbol;
s2 += 2; s2 += 2;
MASK(sym0) = 0; MASK(sym0) = 0;
MASK(sym1) = 0; MASK(sym1) = 0;
@ -152,7 +152,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
// Ppmd8_UpdateBin(p); // Ppmd8_UpdateBin(p);
{ {
unsigned freq = s->Freq; unsigned freq = s->Freq;
CTX_PTR c = CTX(SUCCESSOR(s)); CPpmd8_Context *c = CTX(SUCCESSOR(s));
sym = s->Symbol; sym = s->Symbol;
p->FoundState = s; p->FoundState = s;
p->PrevSuccess = 1; p->PrevSuccess = 1;
@ -176,7 +176,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
R->Range = (R->Range & ~((UInt32)PPMD_BIN_SCALE - 1)) - size0; R->Range = (R->Range & ~((UInt32)PPMD_BIN_SCALE - 1)) - size0;
RC_NORM_LOCAL(R) RC_NORM_LOCAL(R)
PPMD_SetAllBitsIn256Bytes(charMask); PPMD_SetAllBitsIn256Bytes(charMask)
MASK(Ppmd8Context_OneState(p->MinContext)->Symbol) = 0; MASK(Ppmd8Context_OneState(p->MinContext)->Symbol) = 0;
p->PrevSuccess = 0; p->PrevSuccess = 0;
} }
@ -209,17 +209,17 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
unsigned num2 = num / 2; unsigned num2 = num / 2;
num &= 1; num &= 1;
hiCnt = (s->Freq & (unsigned)(MASK(s->Symbol))) & (0 - (UInt32)num); hiCnt = (s->Freq & (UInt32)(MASK(s->Symbol))) & (0 - (UInt32)num);
s += num; s += num;
p->MinContext = mc; p->MinContext = mc;
do do
{ {
unsigned sym0 = s[0].Symbol; const unsigned sym0 = s[0].Symbol;
unsigned sym1 = s[1].Symbol; const unsigned sym1 = s[1].Symbol;
s += 2; s += 2;
hiCnt += (s[-2].Freq & (unsigned)(MASK(sym0))); hiCnt += (s[-2].Freq & (UInt32)(MASK(sym0)));
hiCnt += (s[-1].Freq & (unsigned)(MASK(sym1))); hiCnt += (s[-1].Freq & (UInt32)(MASK(sym1)));
} }
while (--num2); while (--num2);
} }
@ -227,7 +227,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
see = Ppmd8_MakeEscFreq(p, numMasked, &freqSum); see = Ppmd8_MakeEscFreq(p, numMasked, &freqSum);
freqSum += hiCnt; freqSum += hiCnt;
freqSum2 = freqSum; freqSum2 = freqSum;
PPMD8_CORRECT_SUM_RANGE(R, freqSum2); PPMD8_CORRECT_SUM_RANGE(R, freqSum2)
count = RC_GetThreshold(freqSum2); count = RC_GetThreshold(freqSum2);
@ -235,7 +235,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
if (count < hiCnt) if (count < hiCnt)
{ {
Byte sym; Byte sym;
// Ppmd_See_Update(see); // new (see->Summ) value can overflow over 16-bits in some rare cases // Ppmd_See_UPDATE(see) // new (see->Summ) value can overflow over 16-bits in some rare cases
s = Ppmd8_GetStats(p, p->MinContext); s = Ppmd8_GetStats(p, p->MinContext);
hiCnt = count; hiCnt = count;
@ -243,15 +243,15 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
{ {
for (;;) for (;;)
{ {
count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
// count -= s->Freq & (unsigned)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break; // count -= s->Freq & (UInt32)(MASK((s)->Symbol)); s++; if ((Int32)count < 0) break;
} }
} }
s--; s--;
RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq); RC_DecodeFinal((hiCnt - count) - s->Freq, s->Freq)
// new (see->Summ) value can overflow over 16-bits in some rare cases // new (see->Summ) value can overflow over 16-bits in some rare cases
Ppmd_See_Update(see); Ppmd_See_UPDATE(see)
p->FoundState = s; p->FoundState = s;
sym = s->Symbol; sym = s->Symbol;
Ppmd8_Update2(p); Ppmd8_Update2(p);
@ -261,7 +261,7 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
if (count >= freqSum2) if (count >= freqSum2)
return PPMD8_SYM_ERROR; return PPMD8_SYM_ERROR;
RC_Decode(hiCnt, freqSum2 - hiCnt); RC_Decode(hiCnt, freqSum2 - hiCnt)
// We increase (see->Summ) for sum of Freqs of all non_Masked symbols. // We increase (see->Summ) for sum of Freqs of all non_Masked symbols.
// new (see->Summ) value can overflow over 16-bits in some rare cases // new (see->Summ) value can overflow over 16-bits in some rare cases
@ -277,3 +277,19 @@ int Ppmd8_DecodeSymbol(CPpmd8 *p)
while (s != s2); while (s != s2);
} }
} }
#undef kTop
#undef kBot
#undef READ_BYTE
#undef RC_NORM_BASE
#undef RC_NORM_1
#undef RC_NORM
#undef RC_NORM_LOCAL
#undef RC_NORM_REMOTE
#undef R
#undef RC_Decode
#undef RC_DecodeFinal
#undef RC_GetThreshold
#undef CTX
#undef SUCCESSOR
#undef MASK
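The RC_* macros renamed above wrap the decoder side of the range coder. A simplified sketch of the two steps they perform; this is illustrative only, since the real Ppmd8 coder also tracks R->Low and uses Dmitry Subbotin's carryless normalization (the kTop/kBot checks) rather than this plain byte refill:

#include <stdint.h>

typedef struct { uint32_t Range, Code; const unsigned char *in; } MiniRc;

/* RC_GetThreshold: divide the range by the total frequency and find
   the cumulative-frequency slot the current code falls into */
static uint32_t MiniGetThreshold(MiniRc *r, uint32_t total)
{
  return r->Code / (r->Range /= total);
}

/* RC_Decode: narrow the interval to the symbol [start, start + size),
   then renormalize by refilling low bytes while Range < (1 << 24) */
static void MiniDecode(MiniRc *r, uint32_t start, uint32_t size)
{
  r->Code  -= start * r->Range;
  r->Range *= size;
  while (r->Range < ((uint32_t)1 << 24))
  {
    r->Code = (r->Code << 8) | *r->in++;
    r->Range <<= 8;
  }
}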

View file

@ -1,5 +1,5 @@
/* Ppmd8Enc.c -- Ppmd8 (PPMdI) Encoder /* Ppmd8Enc.c -- Ppmd8 (PPMdI) Encoder
2021-04-13 : Igor Pavlov : Public domain 2023-09-07 : Igor Pavlov : Public domain
This code is based on: This code is based on:
PPMd var.I (2002): Dmitry Shkarin : Public domain PPMd var.I (2002): Dmitry Shkarin : Public domain
Carryless rangecoder (1999): Dmitry Subbotin : Public domain */ Carryless rangecoder (1999): Dmitry Subbotin : Public domain */
@ -8,8 +8,8 @@ This code is based on:
#include "Ppmd8.h" #include "Ppmd8.h"
#define kTop (1 << 24) #define kTop ((UInt32)1 << 24)
#define kBot (1 << 15) #define kBot ((UInt32)1 << 15)
#define WRITE_BYTE(p) IByteOut_Write(p->Stream.Out, (Byte)(p->Low >> 24)) #define WRITE_BYTE(p) IByteOut_Write(p->Stream.Out, (Byte)(p->Low >> 24))
@ -54,13 +54,13 @@ void Ppmd8_Flush_RangeEnc(CPpmd8 *p)
MY_FORCE_INLINE Z7_FORCE_INLINE
// MY_NO_INLINE // Z7_NO_INLINE
static void RangeEnc_Encode(CPpmd8 *p, UInt32 start, UInt32 size, UInt32 total) static void Ppmd8_RangeEnc_Encode(CPpmd8 *p, UInt32 start, UInt32 size, UInt32 total)
{ {
R->Low += start * (R->Range /= total); R->Low += start * (R->Range /= total);
R->Range *= size; R->Range *= size;
RC_NORM_LOCAL(R); RC_NORM_LOCAL(R)
} }
@ -72,19 +72,19 @@ static void RangeEnc_Encode(CPpmd8 *p, UInt32 start, UInt32 size, UInt32 total)
#define RC_Encode(start, size, total) RangeEnc_Encode(p, start, size, total); #define RC_Encode(start, size, total) Ppmd8_RangeEnc_Encode(p, start, size, total);
#define RC_EncodeFinal(start, size, total) RC_Encode(start, size, total); RC_NORM_REMOTE(p); #define RC_EncodeFinal(start, size, total) RC_Encode(start, size, total) RC_NORM_REMOTE(p)
#define CTX(ref) ((CPpmd8_Context *)Ppmd8_GetContext(p, ref)) #define CTX(ref) ((CPpmd8_Context *)Ppmd8_GetContext(p, ref))
typedef CPpmd8_Context * CTX_PTR; // typedef CPpmd8_Context * CTX_PTR;
#define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p) #define SUCCESSOR(p) Ppmd_GET_SUCCESSOR(p)
void Ppmd8_UpdateModel(CPpmd8 *p); void Ppmd8_UpdateModel(CPpmd8 *p);
#define MASK(sym) ((unsigned char *)charMask)[sym] #define MASK(sym) ((Byte *)charMask)[sym]
// MY_FORCE_INLINE // Z7_FORCE_INLINE
// static // static
void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol) void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
{ {
@ -104,7 +104,7 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
if (s->Symbol == symbol) if (s->Symbol == symbol)
{ {
RC_EncodeFinal(0, s->Freq, summFreq); RC_EncodeFinal(0, s->Freq, summFreq)
p->FoundState = s; p->FoundState = s;
Ppmd8_Update1_0(p); Ppmd8_Update1_0(p);
return; return;
@ -117,7 +117,7 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
if ((++s)->Symbol == symbol) if ((++s)->Symbol == symbol)
{ {
RC_EncodeFinal(sum, s->Freq, summFreq); RC_EncodeFinal(sum, s->Freq, summFreq)
p->FoundState = s; p->FoundState = s;
Ppmd8_Update1(p); Ppmd8_Update1(p);
return; return;
@ -127,10 +127,10 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
while (--i); while (--i);
RC_Encode(sum, summFreq - sum, summFreq); RC_Encode(sum, summFreq - sum, summFreq)
PPMD_SetAllBitsIn256Bytes(charMask); PPMD_SetAllBitsIn256Bytes(charMask)
// MASK(s->Symbol) = 0; // MASK(s->Symbol) = 0;
// i = p->MinContext->NumStats; // i = p->MinContext->NumStats;
// do { MASK((--s)->Symbol) = 0; } while (--i); // do { MASK((--s)->Symbol) = 0; } while (--i);
@ -139,8 +139,8 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
MASK(s->Symbol) = 0; MASK(s->Symbol) = 0;
do do
{ {
unsigned sym0 = s2[0].Symbol; const unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol; const unsigned sym1 = s2[1].Symbol;
s2 += 2; s2 += 2;
MASK(sym0) = 0; MASK(sym0) = 0;
MASK(sym1) = 0; MASK(sym1) = 0;
@ -153,20 +153,20 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
UInt16 *prob = Ppmd8_GetBinSumm(p); UInt16 *prob = Ppmd8_GetBinSumm(p);
CPpmd_State *s = Ppmd8Context_OneState(p->MinContext); CPpmd_State *s = Ppmd8Context_OneState(p->MinContext);
UInt32 pr = *prob; UInt32 pr = *prob;
UInt32 bound = (R->Range >> 14) * pr; const UInt32 bound = (R->Range >> 14) * pr;
pr = PPMD_UPDATE_PROB_1(pr); pr = PPMD_UPDATE_PROB_1(pr);
if (s->Symbol == symbol) if (s->Symbol == symbol)
{ {
*prob = (UInt16)(pr + (1 << PPMD_INT_BITS)); *prob = (UInt16)(pr + (1 << PPMD_INT_BITS));
// RangeEnc_EncodeBit_0(p, bound); // RangeEnc_EncodeBit_0(p, bound);
R->Range = bound; R->Range = bound;
RC_NORM(R); RC_NORM(R)
// p->FoundState = s; // p->FoundState = s;
// Ppmd8_UpdateBin(p); // Ppmd8_UpdateBin(p);
{ {
unsigned freq = s->Freq; const unsigned freq = s->Freq;
CTX_PTR c = CTX(SUCCESSOR(s)); CPpmd8_Context *c = CTX(SUCCESSOR(s));
p->FoundState = s; p->FoundState = s;
p->PrevSuccess = 1; p->PrevSuccess = 1;
p->RunLength++; p->RunLength++;
@ -187,7 +187,7 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
R->Range = (R->Range & ~((UInt32)PPMD_BIN_SCALE - 1)) - bound; R->Range = (R->Range & ~((UInt32)PPMD_BIN_SCALE - 1)) - bound;
RC_NORM_LOCAL(R) RC_NORM_LOCAL(R)
PPMD_SetAllBitsIn256Bytes(charMask); PPMD_SetAllBitsIn256Bytes(charMask)
MASK(s->Symbol) = 0; MASK(s->Symbol) = 0;
p->PrevSuccess = 0; p->PrevSuccess = 0;
} }
@ -248,14 +248,14 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
do do
{ {
unsigned cur = s->Symbol; const unsigned cur = s->Symbol;
if ((int)cur == symbol) if ((int)cur == symbol)
{ {
UInt32 low = sum; const UInt32 low = sum;
UInt32 freq = s->Freq; const UInt32 freq = s->Freq;
unsigned num2; unsigned num2;
Ppmd_See_Update(see); Ppmd_See_UPDATE(see)
p->FoundState = s; p->FoundState = s;
sum += escFreq; sum += escFreq;
@ -265,21 +265,20 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
if (num2 != 0) if (num2 != 0)
{ {
s += i; s += i;
for (;;) do
{ {
unsigned sym0 = s[0].Symbol; const unsigned sym0 = s[0].Symbol;
unsigned sym1 = s[1].Symbol; const unsigned sym1 = s[1].Symbol;
s += 2; s += 2;
sum += (s[-2].Freq & (unsigned)(MASK(sym0))); sum += (s[-2].Freq & (unsigned)(MASK(sym0)));
sum += (s[-1].Freq & (unsigned)(MASK(sym1))); sum += (s[-1].Freq & (unsigned)(MASK(sym1)));
if (--num2 == 0)
break;
} }
while (--num2);
} }
PPMD8_CORRECT_SUM_RANGE(p, sum); PPMD8_CORRECT_SUM_RANGE(p, sum)
RC_EncodeFinal(low, freq, sum); RC_EncodeFinal(low, freq, sum)
Ppmd8_Update2(p); Ppmd8_Update2(p);
return; return;
} }
@ -291,19 +290,19 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
{ {
UInt32 total = sum + escFreq; UInt32 total = sum + escFreq;
see->Summ = (UInt16)(see->Summ + total); see->Summ = (UInt16)(see->Summ + total);
PPMD8_CORRECT_SUM_RANGE(p, total); PPMD8_CORRECT_SUM_RANGE(p, total)
RC_Encode(sum, total - sum, total); RC_Encode(sum, total - sum, total)
} }
{ {
CPpmd_State *s2 = Ppmd8_GetStats(p, p->MinContext); const CPpmd_State *s2 = Ppmd8_GetStats(p, p->MinContext);
s--; s--;
MASK(s->Symbol) = 0; MASK(s->Symbol) = 0;
do do
{ {
unsigned sym0 = s2[0].Symbol; const unsigned sym0 = s2[0].Symbol;
unsigned sym1 = s2[1].Symbol; const unsigned sym1 = s2[1].Symbol;
s2 += 2; s2 += 2;
MASK(sym0) = 0; MASK(sym0) = 0;
MASK(sym1) = 0; MASK(sym1) = 0;
@ -312,3 +311,27 @@ void Ppmd8_EncodeSymbol(CPpmd8 *p, int symbol)
} }
} }
} }
#undef kTop
#undef kBot
#undef WRITE_BYTE
#undef RC_NORM_BASE
#undef RC_NORM_1
#undef RC_NORM
#undef RC_NORM_LOCAL
#undef RC_NORM_REMOTE
#undef R
#undef RC_Encode
#undef RC_EncodeFinal
#undef CTX
#undef SUCCESSOR
#undef MASK
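Both the Ppmd8 decoder and encoder above rely on the charMask trick behind the MASK macro: a 256-byte table holding 0xFF for symbols not yet tried and 0x00 for masked ones, so Freq & MASK(sym) contributes Freq only for unmasked symbols, branch-free. A small sketch of the summation that the unrolled two-at-a-time loops optimize (illustrative mini-state layout):

#include <string.h>

typedef unsigned char Byte;
typedef struct { Byte Symbol, Freq; } MiniState;

static unsigned SumUnmaskedFreqs(const MiniState *s, unsigned num,
    const Byte *mask)
{
  unsigned i, sum = 0;
  for (i = 0; i < num; i++)
    sum += s[i].Freq & mask[s[i].Symbol];  /* 0xFF keeps Freq, 0x00 drops it */
  return sum;
}

/* usage: Byte mask[256];
          memset(mask, 0xFF, sizeof(mask));  // like PPMD_SetAllBitsIn256Bytes
          mask[triedSymbol] = 0;             // exclude it on the next pass */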

View file

@ -1,10 +1,127 @@
/* Precomp.h -- StdAfx /* Precomp.h -- precompilation file
2013-11-12 : Igor Pavlov : Public domain */ 2024-01-25 : Igor Pavlov : Public domain */
#ifndef __7Z_PRECOMP_H #ifndef ZIP7_INC_PRECOMP_H
#define __7Z_PRECOMP_H #define ZIP7_INC_PRECOMP_H
/*
this file must be included before another *.h files and before <windows.h>.
this file is included from the following files:
C\*.c
C\Util\*\Precomp.h <- C\Util\*\*.c
CPP\Common\Common.h <- *\StdAfx.h <- *\*.cpp
this file can set the following macros:
Z7_LARGE_PAGES 1
Z7_LONG_PATH 1
Z7_WIN32_WINNT_MIN 0x0500 (or higher) : we require at least win2000+ for 7-Zip
_WIN32_WINNT 0x0500 (or higher)
WINVER _WIN32_WINNT
UNICODE 1
_UNICODE 1
*/
#include "Compiler.h" #include "Compiler.h"
/* #include "7zTypes.h" */
#ifdef _MSC_VER
// #pragma warning(disable : 4206) // nonstandard extension used : translation unit is empty
#if _MSC_VER >= 1912
// #pragma warning(disable : 5039) // pointer or reference to potentially throwing function passed to 'extern "C"' function under - EHc.Undefined behavior may occur if this function throws an exception.
#endif
#endif
/*
// for debug:
#define UNICODE 1
#define _UNICODE 1
#define _WIN32_WINNT 0x0500 // win2000
#ifndef WINVER
#define WINVER _WIN32_WINNT
#endif
*/
#ifdef _WIN32
/*
this "Precomp.h" file must be included before <windows.h>,
if we want to define _WIN32_WINNT before <windows.h>.
*/
#ifndef Z7_LARGE_PAGES
#ifndef Z7_NO_LARGE_PAGES
#define Z7_LARGE_PAGES 1
#endif
#endif
#ifndef Z7_LONG_PATH
#ifndef Z7_NO_LONG_PATH
#define Z7_LONG_PATH 1
#endif
#endif
#ifndef Z7_DEVICE_FILE
#ifndef Z7_NO_DEVICE_FILE
// #define Z7_DEVICE_FILE 1
#endif
#endif
// we don't change macros if included after <windows.h>
#ifndef _WINDOWS_
#ifndef Z7_WIN32_WINNT_MIN
#if defined(_M_ARM64) || defined(__aarch64__)
// #define Z7_WIN32_WINNT_MIN 0x0a00 // win10
#define Z7_WIN32_WINNT_MIN 0x0600 // vista
#elif defined(_M_ARM) && defined(_M_ARMT) && defined(_M_ARM_NT)
// #define Z7_WIN32_WINNT_MIN 0x0602 // win8
#define Z7_WIN32_WINNT_MIN 0x0600 // vista
#elif defined(_M_X64) || defined(_M_AMD64) || defined(__x86_64__) || defined(_M_IA64)
#define Z7_WIN32_WINNT_MIN 0x0503 // win2003
// #elif defined(_M_IX86) || defined(__i386__)
// #define Z7_WIN32_WINNT_MIN 0x0500 // win2000
#else // x86 and another(old) systems
#define Z7_WIN32_WINNT_MIN 0x0500 // win2000
// #define Z7_WIN32_WINNT_MIN 0x0502 // win2003 // for debug
#endif
#endif // Z7_WIN32_WINNT_MIN
#ifndef Z7_DO_NOT_DEFINE_WIN32_WINNT
#ifdef _WIN32_WINNT
// #error Stop_Compiling_Bad_WIN32_WINNT
#else
#ifndef Z7_NO_DEFINE_WIN32_WINNT
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define _WIN32_WINNT Z7_WIN32_WINNT_MIN
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // _WIN32_WINNT
#ifndef WINVER
#define WINVER _WIN32_WINNT
#endif
#endif // Z7_DO_NOT_DEFINE_WIN32_WINNT
#ifndef _MBCS
#ifndef Z7_NO_UNICODE
// UNICODE and _UNICODE are used by <windows.h> and by 7-zip code.
#ifndef UNICODE
#define UNICODE 1
#endif
#ifndef _UNICODE
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define _UNICODE 1
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // Z7_NO_UNICODE
#endif // _MBCS
#endif // _WINDOWS_
// #include "7zWindows.h"
#endif // _WIN32
#endif #endif
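A sketch of the include order the comments above require, since the whole point of this header is to define _WIN32_WINNT, WINVER, UNICODE and friends before <windows.h> ever sees them (the file name is illustrative):

/* some7zSource.c */
#include "Precomp.h"   /* first: may set _WIN32_WINNT, WINVER, UNICODE, ... */
#ifdef _WIN32
#include <windows.h>   /* only afterwards, so those macros take effect */
#endif
#include "7zTypes.h"   /* remaining headers follow */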

View file

@ -1,14 +1,14 @@
/* RotateDefs.h -- Rotate functions /* RotateDefs.h -- Rotate functions
2015-03-25 : Igor Pavlov : Public domain */ 2023-06-18 : Igor Pavlov : Public domain */
#ifndef __ROTATE_DEFS_H #ifndef ZIP7_INC_ROTATE_DEFS_H
#define __ROTATE_DEFS_H #define ZIP7_INC_ROTATE_DEFS_H
#ifdef _MSC_VER #ifdef _MSC_VER
#include <stdlib.h> #include <stdlib.h>
/* don't use _rotl with MINGW. It can insert slow call to function. */ /* don't use _rotl with old MINGW. It can insert slow call to function. */
/* #if (_MSC_VER >= 1200) */ /* #if (_MSC_VER >= 1200) */
#pragma intrinsic(_rotl) #pragma intrinsic(_rotl)
@ -18,12 +18,32 @@
#define rotlFixed(x, n) _rotl((x), (n)) #define rotlFixed(x, n) _rotl((x), (n))
#define rotrFixed(x, n) _rotr((x), (n)) #define rotrFixed(x, n) _rotr((x), (n))
#if (_MSC_VER >= 1300)
#define Z7_ROTL64(x, n) _rotl64((x), (n))
#define Z7_ROTR64(x, n) _rotr64((x), (n))
#else
#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#endif
#else #else
/* new compilers can translate these macros to fast commands. */ /* new compilers can translate these macros to fast commands. */
#if defined(__clang__) && (__clang_major__ >= 4) \
|| defined(__GNUC__) && (__GNUC__ >= 5)
/* GCC 4.9.0 and clang 3.5 can recognize more correct version: */
#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (-(n) & 31)))
#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (-(n) & 31)))
#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (-(n) & 63)))
#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (-(n) & 63)))
#else
/* for old GCC / clang: */
#define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) #define rotlFixed(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #define rotrFixed(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define Z7_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
#define Z7_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
#endif
#endif #endif
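The reason the masked rotate form added above is called "more correct": with the classic 32 - (n) form, rotlFixed(x, 0) shifts by 32, which is undefined behavior in C, while (-(n) & 31) yields a shift by 0 instead; per the guard above, GCC >= 5 and clang >= 4 still recognize either pattern as a single rotate instruction. A sketch of the safe form as a function:

#include <stdint.h>

/* defined for every n in 0..31, and compiled to a single rol on x86
   by modern GCC/clang */
static uint32_t rotl32(uint32_t x, unsigned n)
{
  return (x << (n & 31)) | (x >> (-n & 31));
}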

C/Sha1.c (316 changed lines)
View file

@ -1,64 +1,60 @@
/* Sha1.c -- SHA-1 Hash /* Sha1.c -- SHA-1 Hash
2021-07-13 : Igor Pavlov : Public domain : Igor Pavlov : Public domain
This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */ This code is based on public domain code of Steve Reid from Wei Dai's Crypto++ library. */
#include "Precomp.h" #include "Precomp.h"
#include <string.h> #include <string.h>
#include "CpuArch.h"
#include "RotateDefs.h"
#include "Sha1.h" #include "Sha1.h"
#include "RotateDefs.h"
#if defined(_MSC_VER) && (_MSC_VER < 1900) #include "CpuArch.h"
// #define USE_MY_MM
#endif
#ifdef MY_CPU_X86_OR_AMD64 #ifdef MY_CPU_X86_OR_AMD64
#ifdef _MSC_VER #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
#if _MSC_VER >= 1200 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
#define _SHA_SUPPORTED || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
#endif || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \
#elif defined(__clang__) || defined(_MSC_VER) && (_MSC_VER >= 1200)
#if (__clang_major__ >= 8) // fix that check #define Z7_COMPILER_SHA1_SUPPORTED
#define _SHA_SUPPORTED
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 8) // fix that check
#define _SHA_SUPPORTED
#endif
#elif defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1800) // fix that check
#define _SHA_SUPPORTED
#endif
#endif #endif
#elif defined(MY_CPU_ARM_OR_ARM64) #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \
#ifdef _MSC_VER && (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037))
#if _MSC_VER >= 1910 && _MSC_VER >= 1929 && _MSC_FULL_VER >= 192930037 #if defined(__ARM_FEATURE_SHA2) \
#define _SHA_SUPPORTED || defined(__ARM_FEATURE_CRYPTO)
#define Z7_COMPILER_SHA1_SUPPORTED
#else
#if defined(MY_CPU_ARM64) \
|| defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
|| defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(__GNUC__) && (__GNUC__ >= 6) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
|| !defined(Z7_CLANG_VERSION) \
|| defined(__ARM_NEON) && \
(Z7_CLANG_VERSION < 170000 || \
Z7_CLANG_VERSION > 170001)
#define Z7_COMPILER_SHA1_SUPPORTED
#endif #endif
#elif defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define _SHA_SUPPORTED
#endif #endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define _SHA_SUPPORTED
#endif #endif
#endif #endif
#endif #endif
void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks);
#ifdef _SHA_SUPPORTED #ifdef Z7_COMPILER_SHA1_SUPPORTED
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
static SHA1_FUNC_UPDATE_BLOCKS g_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks; static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS = Sha1_UpdateBlocks;
static SHA1_FUNC_UPDATE_BLOCKS g_FUNC_UPDATE_BLOCKS_HW; static SHA1_FUNC_UPDATE_BLOCKS g_SHA1_FUNC_UPDATE_BLOCKS_HW;
#define UPDATE_BLOCKS(p) p->func_UpdateBlocks #define SHA1_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
#else #else
#define UPDATE_BLOCKS(p) Sha1_UpdateBlocks #define SHA1_UPDATE_BLOCKS(p) Sha1_UpdateBlocks
#endif #endif
@ -66,16 +62,16 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
{ {
SHA1_FUNC_UPDATE_BLOCKS func = Sha1_UpdateBlocks; SHA1_FUNC_UPDATE_BLOCKS func = Sha1_UpdateBlocks;
#ifdef _SHA_SUPPORTED #ifdef Z7_COMPILER_SHA1_SUPPORTED
if (algo != SHA1_ALGO_SW) if (algo != SHA1_ALGO_SW)
{ {
if (algo == SHA1_ALGO_DEFAULT) if (algo == SHA1_ALGO_DEFAULT)
func = g_FUNC_UPDATE_BLOCKS; func = g_SHA1_FUNC_UPDATE_BLOCKS;
else else
{ {
if (algo != SHA1_ALGO_HW) if (algo != SHA1_ALGO_HW)
return False; return False;
func = g_FUNC_UPDATE_BLOCKS_HW; func = g_SHA1_FUNC_UPDATE_BLOCKS_HW;
if (!func) if (!func)
return False; return False;
} }
@ -85,27 +81,28 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
return False; return False;
#endif #endif
p->func_UpdateBlocks = func; p->v.vars.func_UpdateBlocks = func;
return True; return True;
} }
/* define it for speed optimization */ /* define it for speed optimization */
// #define _SHA1_UNROLL // #define Z7_SHA1_UNROLL
// allowed unroll steps: (1, 2, 4, 5, 20) // allowed unroll steps: (1, 2, 4, 5, 20)
#ifdef _SHA1_UNROLL #undef Z7_SHA1_BIG_W
#ifdef Z7_SHA1_UNROLL
#define STEP_PRE 20 #define STEP_PRE 20
#define STEP_MAIN 20 #define STEP_MAIN 20
#else #else
#define _SHA1_BIG_W #define Z7_SHA1_BIG_W
#define STEP_PRE 5 #define STEP_PRE 5
#define STEP_MAIN 5 #define STEP_MAIN 5
#endif #endif
#ifdef _SHA1_BIG_W #ifdef Z7_SHA1_BIG_W
#define kNumW 80 #define kNumW 80
#define w(i) W[i] #define w(i) W[i]
#else #else
@ -150,11 +147,11 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
*/ */
#define M5(i, fx, wx0, wx1) \ #define M5(i, fx, wx0, wx1) \
T5 ( a,b,c,d,e, fx, wx0((i) ) ); \ T5 ( a,b,c,d,e, fx, wx0((i) ) ) \
T5 ( e,a,b,c,d, fx, wx1((i)+1) ); \ T5 ( e,a,b,c,d, fx, wx1((i)+1) ) \
T5 ( d,e,a,b,c, fx, wx1((i)+2) ); \ T5 ( d,e,a,b,c, fx, wx1((i)+2) ) \
T5 ( c,d,e,a,b, fx, wx1((i)+3) ); \ T5 ( c,d,e,a,b, fx, wx1((i)+3) ) \
T5 ( b,c,d,e,a, fx, wx1((i)+4) ); \ T5 ( b,c,d,e,a, fx, wx1((i)+4) ) \
#define R5(i, fx, wx) \ #define R5(i, fx, wx) \
M5 ( i, fx, wx, wx) \ M5 ( i, fx, wx, wx) \
@ -163,17 +160,17 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
#if STEP_PRE > 5 #if STEP_PRE > 5
#define R20_START \ #define R20_START \
R5 ( 0, f0, w0); \ R5 ( 0, f0, w0) \
R5 ( 5, f0, w0); \ R5 ( 5, f0, w0) \
R5 ( 10, f0, w0); \ R5 ( 10, f0, w0) \
M5 ( 15, f0, w0, w1); \ M5 ( 15, f0, w0, w1) \
#elif STEP_PRE == 5 #elif STEP_PRE == 5
#define R20_START \ #define R20_START \
{ size_t i; for (i = 0; i < 15; i += STEP_PRE) \ { size_t i; for (i = 0; i < 15; i += STEP_PRE) \
{ R5(i, f0, w0); } } \ { R5(i, f0, w0) } } \
M5 ( 15, f0, w0, w1); \ M5 ( 15, f0, w0, w1) \
#else #else
@ -187,8 +184,8 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
#define R20_START \ #define R20_START \
{ size_t i; for (i = 0; i < 16; i += STEP_PRE) \ { size_t i; for (i = 0; i < 16; i += STEP_PRE) \
{ R_PRE(i, f0, w0); } } \ { R_PRE(i, f0, w0) } } \
R4 ( 16, f0, w1); \ R4 ( 16, f0, w1) \
#endif #endif
@ -197,10 +194,10 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
#if STEP_MAIN > 5 #if STEP_MAIN > 5
#define R20(ii, fx) \ #define R20(ii, fx) \
R5 ( (ii) , fx, w1); \ R5 ( (ii) , fx, w1) \
R5 ( (ii) + 5 , fx, w1); \ R5 ( (ii) + 5 , fx, w1) \
R5 ( (ii) + 10, fx, w1); \ R5 ( (ii) + 10, fx, w1) \
R5 ( (ii) + 15, fx, w1); \ R5 ( (ii) + 15, fx, w1) \
#else #else
@ -216,7 +213,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
#define R20(ii, fx) \ #define R20(ii, fx) \
{ size_t i; for (i = (ii); i < (ii) + 20; i += STEP_MAIN) \ { size_t i; for (i = (ii); i < (ii) + 20; i += STEP_MAIN) \
{ R_MAIN(i, fx, w1); } } \ { R_MAIN(i, fx, w1) } } \
#endif #endif
@ -224,7 +221,7 @@ BoolInt Sha1_SetFunction(CSha1 *p, unsigned algo)
void Sha1_InitState(CSha1 *p) void Sha1_InitState(CSha1 *p)
{ {
p->count = 0; p->v.vars.count = 0;
p->state[0] = 0x67452301; p->state[0] = 0x67452301;
p->state[1] = 0xEFCDAB89; p->state[1] = 0xEFCDAB89;
p->state[2] = 0x98BADCFE; p->state[2] = 0x98BADCFE;
@ -234,9 +231,9 @@ void Sha1_InitState(CSha1 *p)
void Sha1_Init(CSha1 *p) void Sha1_Init(CSha1 *p)
{ {
p->func_UpdateBlocks = p->v.vars.func_UpdateBlocks =
#ifdef _SHA_SUPPORTED #ifdef Z7_COMPILER_SHA1_SUPPORTED
g_FUNC_UPDATE_BLOCKS; g_SHA1_FUNC_UPDATE_BLOCKS;
#else #else
NULL; NULL;
#endif #endif
@ -244,12 +241,12 @@ void Sha1_Init(CSha1 *p)
} }
MY_NO_INLINE Z7_NO_INLINE
void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks) void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks)
{ {
UInt32 a, b, c, d, e; UInt32 a, b, c, d, e;
UInt32 W[kNumW]; UInt32 W[kNumW];
// if (numBlocks != 0x1264378347) return;
if (numBlocks == 0) if (numBlocks == 0)
return; return;
@ -266,9 +263,9 @@ void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t nu
#endif #endif
R20_START R20_START
R20(20, f1); R20(20, f1)
R20(40, f2); R20(40, f2)
R20(60, f3); R20(60, f3)
a += state[0]; a += state[0];
b += state[1]; b += state[1];
@ -282,32 +279,27 @@ void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t nu
state[3] = d; state[3] = d;
state[4] = e; state[4] = e;
data += 64; data += SHA1_BLOCK_SIZE;
} }
while (--numBlocks); while (--numBlocks);
} }
#define Sha1_UpdateBlock(p) UPDATE_BLOCKS(p)(p->state, p->buffer, 1) #define Sha1_UpdateBlock(p) SHA1_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
void Sha1_Update(CSha1 *p, const Byte *data, size_t size) void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
{ {
if (size == 0) if (size == 0)
return; return;
{ {
unsigned pos = (unsigned)p->count & 0x3F; const unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1);
unsigned num; const unsigned num = SHA1_BLOCK_SIZE - pos;
p->v.vars.count += size;
p->count += size;
num = 64 - pos;
if (num > size) if (num > size)
{ {
memcpy(p->buffer + pos, data, size); memcpy(p->buffer + pos, data, size);
return; return;
} }
if (pos != 0) if (pos != 0)
{ {
size -= num; size -= num;
@ -317,9 +309,10 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
} }
} }
{ {
size_t numBlocks = size >> 6; const size_t numBlocks = size >> 6;
UPDATE_BLOCKS(p)(p->state, data, numBlocks); // if (numBlocks)
size &= 0x3F; SHA1_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= SHA1_BLOCK_SIZE - 1;
if (size == 0) if (size == 0)
return; return;
data += (numBlocks << 6); data += (numBlocks << 6);
@ -330,52 +323,28 @@ void Sha1_Update(CSha1 *p, const Byte *data, size_t size)
void Sha1_Final(CSha1 *p, Byte *digest) void Sha1_Final(CSha1 *p, Byte *digest)
{ {
unsigned pos = (unsigned)p->count & 0x3F; unsigned pos = (unsigned)p->v.vars.count & (SHA1_BLOCK_SIZE - 1);
p->buffer[pos++] = 0x80; p->buffer[pos++] = 0x80;
if (pos > (SHA1_BLOCK_SIZE - 4 * 2))
if (pos > (64 - 8))
{ {
while (pos != 64) { p->buffer[pos++] = 0; } while (pos != SHA1_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, 64 - pos); // memset(&p->buf.buffer[pos], 0, SHA1_BLOCK_SIZE - pos);
Sha1_UpdateBlock(p); Sha1_UpdateBlock(p);
pos = 0; pos = 0;
} }
memset(&p->buffer[pos], 0, (SHA1_BLOCK_SIZE - 4 * 2) - pos);
/*
if (pos & 3)
{ {
p->buffer[pos] = 0; const UInt64 numBits = p->v.vars.count << 3;
p->buffer[pos + 1] = 0; SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
p->buffer[pos + 2] = 0; SetBe32(p->buffer + SHA1_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
pos += 3;
pos &= ~3;
} }
{
for (; pos < 64 - 8; pos += 4)
*(UInt32 *)(&p->buffer[pos]) = 0;
}
*/
memset(&p->buffer[pos], 0, (64 - 8) - pos);
{
UInt64 numBits = (p->count << 3);
SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32));
SetBe32(p->buffer + 64 - 4, (UInt32)(numBits));
}
Sha1_UpdateBlock(p); Sha1_UpdateBlock(p);
SetBe32(digest, p->state[0]); SetBe32(digest, p->state[0])
SetBe32(digest + 4, p->state[1]); SetBe32(digest + 4, p->state[1])
SetBe32(digest + 8, p->state[2]); SetBe32(digest + 8, p->state[2])
SetBe32(digest + 12, p->state[3]); SetBe32(digest + 12, p->state[3])
SetBe32(digest + 16, p->state[4]); SetBe32(digest + 16, p->state[4])
Sha1_InitState(p); Sha1_InitState(p);
} }
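A worked example of the padding written just above: for the 3-byte message "abc", Sha1_Final produces a single final block 61 62 63 80 00 ... 00, with the 64-bit big-endian bit count in the last 8 bytes (3 * 8 = 0x18, so the block ends ... 00 00 00 18); the two SetBe32 calls store exactly those words at offsets SHA1_BLOCK_SIZE - 8 and SHA1_BLOCK_SIZE - 4.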
@ -383,11 +352,11 @@ void Sha1_Final(CSha1 *p, Byte *digest)
void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size) void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size)
{ {
const UInt64 numBits = (p->count + size) << 3; const UInt64 numBits = (p->v.vars.count + size) << 3;
SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32)); SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 2], (UInt32)(numBits >> 32))
SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits)); SetBe32(&((UInt32 *)(void *)block)[SHA1_NUM_BLOCK_WORDS - 1], (UInt32)(numBits))
// SetBe32((UInt32 *)(block + size), 0x80000000); // SetBe32((UInt32 *)(block + size), 0x80000000);
SetUi32((UInt32 *)(void *)(block + size), 0x80); SetUi32((UInt32 *)(void *)(block + size), 0x80)
size += 4; size += 4;
while (size != (SHA1_NUM_BLOCK_WORDS - 2) * 4) while (size != (SHA1_NUM_BLOCK_WORDS - 2) * 4)
{ {
@ -407,67 +376,66 @@ void Sha1_GetBlockDigest(const CSha1 *p, const Byte *data, Byte *destDigest)
st[3] = p->state[3]; st[3] = p->state[3];
st[4] = p->state[4]; st[4] = p->state[4];
UPDATE_BLOCKS(p)(st, data, 1); SHA1_UPDATE_BLOCKS(p)(st, data, 1);
SetBe32(destDigest + 0 , st[0]); SetBe32(destDigest + 0 , st[0])
SetBe32(destDigest + 1 * 4, st[1]); SetBe32(destDigest + 1 * 4, st[1])
SetBe32(destDigest + 2 * 4, st[2]); SetBe32(destDigest + 2 * 4, st[2])
SetBe32(destDigest + 3 * 4, st[3]); SetBe32(destDigest + 3 * 4, st[3])
SetBe32(destDigest + 4 * 4, st[4]); SetBe32(destDigest + 4 * 4, st[4])
} }
void Sha1Prepare() void Sha1Prepare(void)
{ {
#ifdef _SHA_SUPPORTED #ifdef Z7_COMPILER_SHA1_SUPPORTED
SHA1_FUNC_UPDATE_BLOCKS f, f_hw; SHA1_FUNC_UPDATE_BLOCKS f, f_hw;
f = Sha1_UpdateBlocks; f = Sha1_UpdateBlocks;
f_hw = NULL; f_hw = NULL;
#ifdef MY_CPU_X86_OR_AMD64 #ifdef MY_CPU_X86_OR_AMD64
#ifndef USE_MY_MM
if (CPU_IsSupported_SHA() if (CPU_IsSupported_SHA()
&& CPU_IsSupported_SSSE3() && CPU_IsSupported_SSSE3()
// && CPU_IsSupported_SSE41()
) )
#endif #else
#else
if (CPU_IsSupported_SHA1()) if (CPU_IsSupported_SHA1())
#endif #endif
{ {
// printf("\n========== HW SHA1 ======== \n"); // printf("\n========== HW SHA1 ======== \n");
#if defined(MY_CPU_ARM_OR_ARM64) && defined(_MSC_VER) #if 1 && defined(MY_CPU_ARM_OR_ARM64) && defined(Z7_MSC_VER_ORIGINAL) && (_MSC_FULL_VER < 192930037)
/* there was bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037). /* there was bug in MSVC compiler for ARM64 -O2 before version VS2019 16.10 (19.29.30037).
It generated incorrect SHA-1 code. It generated incorrect SHA-1 code. */
21.03 : we test sha1-hardware code at runtime initialization */ #pragma message("== SHA1 code can work incorrectly with this compiler")
#error Stop_Compiling_MSC_Compiler_BUG_SHA1
#pragma message("== SHA1 code: MSC compiler : failure-check code was inserted") #endif
UInt32 state[5] = { 0, 1, 2, 3, 4 } ;
Byte data[64];
unsigned i;
for (i = 0; i < sizeof(data); i += 2)
{
data[i ] = (Byte)(i);
data[i + 1] = (Byte)(i + 1);
}
Sha1_UpdateBlocks_HW(state, data, sizeof(data) / 64);
if ( state[0] != 0x9acd7297
|| state[1] != 0x4624d898
|| state[2] != 0x0bf079f0
|| state[3] != 0x031e61b3
|| state[4] != 0x8323fe20)
{
// printf("\n========== SHA-1 hardware version failure ======== \n");
}
else
#endif
{ {
f = f_hw = Sha1_UpdateBlocks_HW; f = f_hw = Sha1_UpdateBlocks_HW;
} }
} }
g_FUNC_UPDATE_BLOCKS = f; g_SHA1_FUNC_UPDATE_BLOCKS = f;
g_FUNC_UPDATE_BLOCKS_HW = f_hw; g_SHA1_FUNC_UPDATE_BLOCKS_HW = f_hw;
#endif #endif
} }
#undef kNumW
#undef w
#undef w0
#undef w1
#undef f0
#undef f1
#undef f2
#undef f3
#undef T1
#undef T5
#undef M5
#undef R1
#undef R2
#undef R4
#undef R5
#undef R20_START
#undef R_PRE
#undef R_MAIN
#undef STEP_PRE
#undef STEP_MAIN
#undef Z7_SHA1_BIG_W
#undef Z7_SHA1_UNROLL
#undef Z7_COMPILER_SHA1_SUPPORTED
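The pattern behind g_SHA1_FUNC_UPDATE_BLOCKS above is a one-time function-pointer dispatch: the portable C routine is the default, and Sha1Prepare() swaps in the hardware routine once the CPU checks pass. In miniature (CpuSupportsSha1 and MiniPrepare are hypothetical stand-ins for the CPU_IsSupported_* calls and Sha1Prepare):

#include <stddef.h>

typedef unsigned int UInt32;
typedef unsigned char Byte;
typedef void (*UpdateBlocksFunc)(UInt32 state[5], const Byte *data, size_t n);

void Sha1_UpdateBlocks   (UInt32 state[5], const Byte *data, size_t n); /* portable C */
void Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t n); /* SHA-NI / NEON */
int  CpuSupportsSha1(void);                                             /* hypothetical */

static UpdateBlocksFunc g_func = Sha1_UpdateBlocks;  /* safe default */

void MiniPrepare(void)   /* call once at startup, like Sha1Prepare() */
{
  if (CpuSupportsSha1())
    g_func = Sha1_UpdateBlocks_HW;
}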

View file

@ -1,8 +1,8 @@
/* Sha1.h -- SHA-1 Hash /* Sha1.h -- SHA-1 Hash
2021-02-08 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#ifndef __7Z_SHA1_H #ifndef ZIP7_INC_SHA1_H
#define __7Z_SHA1_H #define ZIP7_INC_SHA1_H
#include "7zTypes.h" #include "7zTypes.h"
@ -14,7 +14,10 @@ EXTERN_C_BEGIN
#define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4) #define SHA1_BLOCK_SIZE (SHA1_NUM_BLOCK_WORDS * 4)
#define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4) #define SHA1_DIGEST_SIZE (SHA1_NUM_DIGEST_WORDS * 4)
typedef void (MY_FAST_CALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks);
typedef void (Z7_FASTCALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte *data, size_t numBlocks);
/* /*
if (the system supports different SHA1 code implementations) if (the system supports different SHA1 code implementations)
@ -32,11 +35,18 @@ typedef void (MY_FAST_CALL *SHA1_FUNC_UPDATE_BLOCKS)(UInt32 state[5], const Byte
typedef struct typedef struct
{ {
SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks; union
UInt64 count; {
UInt64 __pad_2[2]; struct
{
SHA1_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
UInt64 count;
} vars;
UInt64 _pad_64bit[4];
void *_pad_align_ptr[2];
} v;
UInt32 state[SHA1_NUM_DIGEST_WORDS]; UInt32 state[SHA1_NUM_DIGEST_WORDS];
UInt32 __pad_3[3]; UInt32 _pad_3[3];
Byte buffer[SHA1_BLOCK_SIZE]; Byte buffer[SHA1_BLOCK_SIZE];
} CSha1; } CSha1;
@ -62,7 +72,7 @@ void Sha1_Final(CSha1 *p, Byte *digest);
void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size); void Sha1_PrepareBlock(const CSha1 *p, Byte *block, unsigned size);
void Sha1_GetBlockDigest(const CSha1 *p, const Byte *data, Byte *destDigest); void Sha1_GetBlockDigest(const CSha1 *p, const Byte *data, Byte *destDigest);
// void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks); // void Z7_FASTCALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks);
/* /*
call Sha1Prepare() once at program start. call Sha1Prepare() once at program start.
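A typical call sequence for the API excerpted above (data and size stand in for the caller's buffer):

CSha1 sha;
Byte digest[SHA1_DIGEST_SIZE];

Sha1Prepare();                  /* once at program start: picks SW or HW code */
Sha1_Init(&sha);
Sha1_Update(&sha, data, size);  /* may be called repeatedly for streaming */
Sha1_Final(&sha, digest);       /* writes 20 bytes; also re-inits the state */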

View file

@ -1,71 +1,53 @@
/* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions /* Sha1Opt.c -- SHA-1 optimized code for SHA-1 hardware instructions
2021-04-01 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include "Compiler.h"
#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif
#include "CpuArch.h" #include "CpuArch.h"
// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64 #ifdef MY_CPU_X86_OR_AMD64
#if defined(__clang__) #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
#if (__clang_major__ >= 8) // fix that check
#define USE_HW_SHA #define USE_HW_SHA
#ifndef __SHA__ #elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
#define USE_HW_SHA
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
#if !defined(__SHA__) || !defined(__SSSE3__)
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
#if defined(_MSC_VER)
// SSSE3: for clang-cl:
#include <tmmintrin.h>
#define __SHA__
#endif
#endif #endif
#pragma clang diagnostic ignored "-Wvector-conversion"
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 8) // fix that check
#define USE_HW_SHA
#ifndef __SHA__
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
// #pragma GCC target("sha,ssse3")
#endif #endif
#endif
#elif defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1800) // fix that check
#define USE_HW_SHA
#endif
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
#ifdef USE_MY_MM #if (_MSC_VER >= 1900)
#define USE_VER_MIN 1300
#else
#define USE_VER_MIN 1910
#endif
#if _MSC_VER >= USE_VER_MIN
#define USE_HW_SHA #define USE_HW_SHA
#else
#define Z7_USE_HW_SHA_STUB
#endif #endif
#endif #endif
// #endif // MY_CPU_X86_OR_AMD64 // #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
// #define Z7_USE_HW_SHA_STUB // for debug
#endif
#ifdef USE_HW_SHA #ifdef USE_HW_SHA
// #pragma message("Sha1 HW") // #pragma message("Sha1 HW")
// #include <wmmintrin.h>
#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h> #include <immintrin.h>
#if defined (__clang__) && defined(_MSC_VER)
#if !defined(__SHA__)
#include <shaintrin.h>
#endif
#else #else
#include <emmintrin.h>
#if defined(_MSC_VER) && (_MSC_VER >= 1600)
// #include <intrin.h>
#endif
#ifdef USE_MY_MM
#include "My_mm.h"
#endif
#endif #endif
@ -87,84 +69,69 @@ SHA:
_mm_sha1* _mm_sha1*
*/ */
#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src); #define XOR_SI128(dest, src) dest = _mm_xor_si128(dest, src);
#define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask); #define SHUFFLE_EPI8(dest, mask) dest = _mm_shuffle_epi8(dest, mask);
#define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask); #define SHUFFLE_EPI32(dest, mask) dest = _mm_shuffle_epi32(dest, mask);
#ifdef __clang__
#define SHA1_RND4(abcd, e0, f) abcd = _mm_sha1rnds4_epu32(abcd, e0, f); #define SHA1_RNDS4_RET_TYPE_CAST (__m128i)
#define SHA1_NEXTE(e, m) e = _mm_sha1nexte_epu32(e, m); #else
#define SHA1_RNDS4_RET_TYPE_CAST
#endif
#define SHA1_RND4(abcd, e0, f) abcd = SHA1_RNDS4_RET_TYPE_CAST _mm_sha1rnds4_epu32(abcd, e0, f);
#define SHA1_NEXTE(e, m) e = _mm_sha1nexte_epu32(e, m);
#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src); #define SHA1_MSG1(dest, src) dest = _mm_sha1msg1_epu32(dest, src);
#define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src); #define SHA1_MSG2(dest, src) dest = _mm_sha1msg2_epu32(dest, src);
#define LOAD_SHUFFLE(m, k) \ #define LOAD_SHUFFLE(m, k) \
m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
SHUFFLE_EPI8(m, mask); \ SHUFFLE_EPI8(m, mask) \
#define SM1(m0, m1, m2, m3) \
SHA1_MSG1(m0, m1); \
#define SM2(m0, m1, m2, m3) \
XOR_SI128(m3, m1); \
SHA1_MSG2(m3, m2); \
#define SM3(m0, m1, m2, m3) \
XOR_SI128(m3, m1); \
SM1(m0, m1, m2, m3) \
SHA1_MSG2(m3, m2); \
#define NNN(m0, m1, m2, m3) #define NNN(m0, m1, m2, m3)
#define SM1(m0, m1, m2, m3) \
SHA1_MSG1(m0, m1) \
#define SM2(m0, m1, m2, m3) \
XOR_SI128(m3, m1) \
SHA1_MSG2(m3, m2) \
#define SM3(m0, m1, m2, m3) \
XOR_SI128(m3, m1) \
SM1(m0, m1, m2, m3) \
SHA1_MSG2(m3, m2) \
#define R4(k, m0, m1, m2, m3, e0, e1, OP) \
#define R4(k, e0, e1, m0, m1, m2, m3, OP) \
e1 = abcd; \ e1 = abcd; \
SHA1_RND4(abcd, e0, (k) / 5); \ SHA1_RND4(abcd, e0, (k) / 5) \
SHA1_NEXTE(e1, m1); \ SHA1_NEXTE(e1, m1) \
OP(m0, m1, m2, m3); \ OP(m0, m1, m2, m3) \
#define R16(k, mx, OP0, OP1, OP2, OP3) \ #define R16(k, mx, OP0, OP1, OP2, OP3) \
R4 ( (k)*4+0, e0,e1, m0,m1,m2,m3, OP0 ) \ R4 ( (k)*4+0, m0,m1,m2,m3, e0,e1, OP0 ) \
R4 ( (k)*4+1, e1,e0, m1,m2,m3,m0, OP1 ) \ R4 ( (k)*4+1, m1,m2,m3,m0, e1,e0, OP1 ) \
R4 ( (k)*4+2, e0,e1, m2,m3,m0,m1, OP2 ) \ R4 ( (k)*4+2, m2,m3,m0,m1, e0,e1, OP2 ) \
R4 ( (k)*4+3, e1,e0, m3,mx,m1,m2, OP3 ) \ R4 ( (k)*4+3, m3,mx,m1,m2, e1,e0, OP3 ) \
#define PREPARE_STATE \ #define PREPARE_STATE \
SHUFFLE_EPI32 (abcd, 0x1B); \ SHUFFLE_EPI32 (abcd, 0x1B) \
SHUFFLE_EPI32 (e0, 0x1B); \ SHUFFLE_EPI32 (e0, 0x1B) \
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA #ifdef ATTRIB_SHA
ATTRIB_SHA ATTRIB_SHA
#endif #endif
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks) void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{ {
const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); const __m128i mask = _mm_set_epi32(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
__m128i abcd, e0; __m128i abcd, e0;
if (numBlocks == 0) if (numBlocks == 0)
@ -190,15 +157,15 @@ void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
LOAD_SHUFFLE (m2, 2) LOAD_SHUFFLE (m2, 2)
LOAD_SHUFFLE (m3, 3) LOAD_SHUFFLE (m3, 3)
ADD_EPI32(e0, m0); ADD_EPI32(e0, m0)
R16 ( 0, m0, SM1, SM3, SM3, SM3 ); R16 ( 0, m0, SM1, SM3, SM3, SM3 )
R16 ( 1, m0, SM3, SM3, SM3, SM3 ); R16 ( 1, m0, SM3, SM3, SM3, SM3 )
R16 ( 2, m0, SM3, SM3, SM3, SM3 ); R16 ( 2, m0, SM3, SM3, SM3, SM3 )
R16 ( 3, m0, SM3, SM3, SM3, SM3 ); R16 ( 3, m0, SM3, SM3, SM3, SM3 )
R16 ( 4, e2, SM2, NNN, NNN, NNN ); R16 ( 4, e2, SM2, NNN, NNN, NNN )
ADD_EPI32(abcd, abcd_save); ADD_EPI32(abcd, abcd_save)
data += 64; data += 64;
} }
@ -207,78 +174,155 @@ void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
PREPARE_STATE PREPARE_STATE
_mm_storeu_si128((__m128i *) (void *) state, abcd); _mm_storeu_si128((__m128i *) (void *) state, abcd);
*(state+4) = (UInt32)_mm_cvtsi128_si32(e0); *(state + 4) = (UInt32)_mm_cvtsi128_si32(e0);
} }
#endif // USE_HW_SHA #endif // USE_HW_SHA
#elif defined(MY_CPU_ARM_OR_ARM64) #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE) \
&& (!defined(Z7_MSC_VER_ORIGINAL) || (_MSC_VER >= 1929) && (_MSC_FULL_VER >= 192930037))
#if defined(__clang__) #if defined(__ARM_FEATURE_SHA2) \
#if (__clang_major__ >= 8) // fix that check || defined(__ARM_FEATURE_CRYPTO)
#define USE_HW_SHA
#else
#if defined(MY_CPU_ARM64) \
|| defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
|| defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(__GNUC__) && (__GNUC__ >= 6) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
|| !defined(Z7_CLANG_VERSION) \
|| defined(__ARM_NEON) && \
(Z7_CLANG_VERSION < 170000 || \
Z7_CLANG_VERSION > 170001)
#define USE_HW_SHA #define USE_HW_SHA
#endif #endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define USE_HW_SHA
#endif #endif
#elif defined(_MSC_VER)
#if _MSC_VER >= 1910
#define USE_HW_SHA
#endif #endif
#endif #endif
#ifdef USE_HW_SHA #ifdef USE_HW_SHA
// #pragma message("=== Sha1 HW === ") // #pragma message("=== Sha1 HW === ")
// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_SHA2
#if defined(__clang__) || defined(__GNUC__) #if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA2) && \
!defined(__ARM_FEATURE_CRYPTO)
#ifdef MY_CPU_ARM64 #ifdef MY_CPU_ARM64
#if defined(__clang__)
#define ATTRIB_SHA __attribute__((__target__("crypto")))
#else
#define ATTRIB_SHA __attribute__((__target__("+crypto"))) #define ATTRIB_SHA __attribute__((__target__("+crypto")))
#endif
#else #else
#if defined(__clang__) && (__clang_major__ >= 1)
#define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
#else
#define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
#endif #endif
#endif
#else #else
// _MSC_VER // _MSC_VER
// for arm32 // for arm32
#define _ARM_USE_NEW_NEON_INTRINSICS #define _ARM_USE_NEW_NEON_INTRINSICS
#endif #endif
#if defined(_MSC_VER) && defined(MY_CPU_ARM64) #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h> #include <arm64_neon.h>
#else #else
#include <arm_neon.h>
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
!defined(__ARM_FEATURE_CRYPTO)
// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
#define __ARM_FEATURE_CRYPTO 1
// #else
#define __ARM_FEATURE_SHA2 1
// #endif
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif #endif
#endif // clang
#if defined(__clang__)
#if defined(__ARM_ARCH) && __ARM_ARCH < 8
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
// #pragma message("#define __ARM_ARCH 8")
#undef __ARM_ARCH
#define __ARM_ARCH 8
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang
#include <arm_neon.h>
#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
defined(__ARM_FEATURE_CRYPTO) && \
defined(__ARM_FEATURE_SHA2)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_CRYPTO
#undef __ARM_FEATURE_SHA2
#undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif
#endif // Z7_MSC_VER_ORIGINAL
typedef uint32x4_t v128; typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC // typedef __n128 v128; // MSVC
// the bug in clang 3.8.1:
#ifdef MY_CPU_BE // __builtin_neon_vgetq_lane_i32((int8x16_t)__s0, __p1);
#define MY_rev32_for_LE(x) #if defined(__clang__) && (__clang_major__ <= 9)
#else #pragma GCC diagnostic ignored "-Wvector-conversion"
#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x)))
#endif #endif
#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) #ifdef MY_CPU_BE
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) #define MY_rev32_for_LE(x) x
#else
#define MY_rev32_for_LE(x) vrev32q_u8(x)
#endif
#define LOAD_128_32(_p) vld1q_u32(_p)
#define LOAD_128_8(_p) vld1q_u8 (_p)
#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
#define LOAD_SHUFFLE(m, k) \ #define LOAD_SHUFFLE(m, k) \
m = LOAD_128((data + (k) * 16)); \ m = vreinterpretq_u32_u8( \
MY_rev32_for_LE(m); \ MY_rev32_for_LE( \
LOAD_128_8(data + (k) * 16))); \
#define SU0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3); #define N0(dest, src2, src3)
#define SU1(dest, src) dest = vsha1su1q_u32(dest, src); #define N1(dest, src)
#define C(e) abcd = vsha1cq_u32(abcd, e, t); #define U0(dest, src2, src3) dest = vsha1su0q_u32(dest, src2, src3);
#define P(e) abcd = vsha1pq_u32(abcd, e, t); #define U1(dest, src) dest = vsha1su1q_u32(dest, src);
#define M(e) abcd = vsha1mq_u32(abcd, e, t); #define C(e) abcd = vsha1cq_u32(abcd, e, t)
#define P(e) abcd = vsha1pq_u32(abcd, e, t)
#define M(e) abcd = vsha1mq_u32(abcd, e, t)
#define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0)) #define H(e) e = vsha1h_u32(vgetq_lane_u32(abcd, 0))
#define T(m, c) t = vaddq_u32(m, c) #define T(m, c) t = vaddq_u32(m, c)
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); #define R16(d0,d1,d2,d3, f0,z0, f1,z1, f2,z2, f3,z3, w0,w1,w2,w3) \
T(m0, d0); f0(m3, m0, m1) z0(m2, m1) H(e1); w0(e0); \
T(m1, d1); f1(m0, m1, m2) z1(m3, m2) H(e0); w1(e1); \
T(m2, d2); f2(m1, m2, m3) z2(m0, m3) H(e1); w2(e0); \
T(m3, d3); f3(m2, m3, m0) z3(m1, m0) H(e0); w3(e1); \
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA #ifdef ATTRIB_SHA
ATTRIB_SHA ATTRIB_SHA
#endif #endif
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{ {
v128 abcd; v128 abcd;
v128 c0, c1, c2, c3; v128 c0, c1, c2, c3;
@ -292,7 +336,7 @@ void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
c2 = vdupq_n_u32(0x8f1bbcdc); c2 = vdupq_n_u32(0x8f1bbcdc);
c3 = vdupq_n_u32(0xca62c1d6); c3 = vdupq_n_u32(0xca62c1d6);
abcd = LOAD_128(&state[0]); abcd = LOAD_128_32(&state[0]);
e0 = state[4]; e0 = state[4];
do do
@ -310,26 +354,11 @@ void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
LOAD_SHUFFLE (m2, 2) LOAD_SHUFFLE (m2, 2)
LOAD_SHUFFLE (m3, 3) LOAD_SHUFFLE (m3, 3)
T(m0, c0); H(e1); C(e0); R16 ( c0,c0,c0,c0, N0,N1, U0,N1, U0,U1, U0,U1, C,C,C,C )
T(m1, c0); SU0(m0, m1, m2); H(e0); C(e1); R16 ( c0,c1,c1,c1, U0,U1, U0,U1, U0,U1, U0,U1, C,P,P,P )
T(m2, c0); SU0(m1, m2, m3); SU1(m0, m3); H(e1); C(e0); R16 ( c1,c1,c2,c2, U0,U1, U0,U1, U0,U1, U0,U1, P,P,M,M )
T(m3, c0); SU0(m2, m3, m0); SU1(m1, m0); H(e0); C(e1); R16 ( c2,c2,c2,c3, U0,U1, U0,U1, U0,U1, U0,U1, M,M,M,P )
T(m0, c0); SU0(m3, m0, m1); SU1(m2, m1); H(e1); C(e0); R16 ( c3,c3,c3,c3, U0,U1, N0,U1, N0,N1, N0,N1, P,P,P,P )
T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
T(m2, c1); SU0(m1, m2, m3); SU1(m0, m3); H(e1); P(e0);
T(m3, c1); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
T(m0, c1); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
T(m1, c1); SU0(m0, m1, m2); SU1(m3, m2); H(e0); P(e1);
T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
T(m3, c2); SU0(m2, m3, m0); SU1(m1, m0); H(e0); M(e1);
T(m0, c2); SU0(m3, m0, m1); SU1(m2, m1); H(e1); M(e0);
T(m1, c2); SU0(m0, m1, m2); SU1(m3, m2); H(e0); M(e1);
T(m2, c2); SU0(m1, m2, m3); SU1(m0, m3); H(e1); M(e0);
T(m3, c3); SU0(m2, m3, m0); SU1(m1, m0); H(e0); P(e1);
T(m0, c3); SU0(m3, m0, m1); SU1(m2, m1); H(e1); P(e0);
T(m1, c3); SU1(m3, m2); H(e0); P(e1);
T(m2, c3); H(e1); P(e0);
T(m3, c3); H(e0); P(e1);
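The five R16 calls above fold the removed 20-line sequences into table-driven form. For orientation, here is the round plan they encode (each R16 covers 16 rounds while the round function changes every 20, hence the mixed second and fourth calls; c0/c1 are the standard SHA-1 constants defined above this hunk):

/* rounds  0-19 : Ch     -> C (vsha1cq_u32), c0 = 0x5a827999
   rounds 20-39 : Parity -> P (vsha1pq_u32), c1 = 0x6ed9eba1
   rounds 40-59 : Maj    -> M (vsha1mq_u32), c2 = 0x8f1bbcdc
   rounds 60-79 : Parity -> P (vsha1pq_u32), c3 = 0xca62c1d6 */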
abcd = vaddq_u32(abcd, abcd_save); abcd = vaddq_u32(abcd, abcd_save);
e0 += e0_save; e0 += e0_save;
@ -338,7 +367,7 @@ void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
} }
while (--numBlocks); while (--numBlocks);
STORE_128(&state[0], abcd); STORE_128_32(&state[0], abcd);
state[4] = e0; state[4] = e0;
} }
@ -346,19 +375,16 @@ void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t
#endif // MY_CPU_ARM_OR_ARM64 #endif // MY_CPU_ARM_OR_ARM64
#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
#ifndef USE_HW_SHA
// #error Stop_Compiling_UNSUPPORTED_SHA // #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h> // #include <stdlib.h>
// #include "Sha1.h" // #include "Sha1.h"
void MY_FAST_CALL Sha1_UpdateBlocks(UInt32 state[5], const Byte *data, size_t numBlocks); // #if defined(_MSC_VER)
#pragma message("Sha1 HW-SW stub was used") #pragma message("Sha1 HW-SW stub was used")
// #endif
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha1_UpdateBlocks (UInt32 state[5], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks) void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t numBlocks)
{ {
Sha1_UpdateBlocks(state, data, numBlocks); Sha1_UpdateBlocks(state, data, numBlocks);
/* /*
@ -369,5 +395,30 @@ void MY_FAST_CALL Sha1_UpdateBlocks_HW(UInt32 state[5], const Byte *data, size_t
return; return;
*/ */
} }
#endif #endif
#undef U0
#undef U1
#undef N0
#undef N1
#undef C
#undef P
#undef M
#undef H
#undef T
#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef SM3
#undef NNN
#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB
@ -1,64 +1,60 @@
/* Sha256.c -- SHA-256 Hash /* Sha256.c -- SHA-256 Hash
2021-04-01 : Igor Pavlov : Public domain : Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */ This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "Precomp.h" #include "Precomp.h"
#include <string.h> #include <string.h>
#include "CpuArch.h"
#include "RotateDefs.h"
#include "Sha256.h" #include "Sha256.h"
#include "RotateDefs.h"
#if defined(_MSC_VER) && (_MSC_VER < 1900) #include "CpuArch.h"
// #define USE_MY_MM
#endif
#ifdef MY_CPU_X86_OR_AMD64 #ifdef MY_CPU_X86_OR_AMD64
#ifdef _MSC_VER #if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
#if _MSC_VER >= 1200 || defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
#define _SHA_SUPPORTED || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900) \
#endif || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) \
#elif defined(__clang__) || defined(_MSC_VER) && (_MSC_VER >= 1200)
#if (__clang_major__ >= 8) // fix that check #define Z7_COMPILER_SHA256_SUPPORTED
#define _SHA_SUPPORTED
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 8) // fix that check
#define _SHA_SUPPORTED
#endif
#elif defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1800) // fix that check
#define _SHA_SUPPORTED
#endif
#endif #endif
#elif defined(MY_CPU_ARM_OR_ARM64) #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
#ifdef _MSC_VER
#if _MSC_VER >= 1910 #if defined(__ARM_FEATURE_SHA2) \
#define _SHA_SUPPORTED || defined(__ARM_FEATURE_CRYPTO)
#define Z7_COMPILER_SHA256_SUPPORTED
#else
#if defined(MY_CPU_ARM64) \
|| defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
|| defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(__GNUC__) && (__GNUC__ >= 6) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
|| !defined(Z7_CLANG_VERSION) \
|| defined(__ARM_NEON) && \
(Z7_CLANG_VERSION < 170000 || \
Z7_CLANG_VERSION > 170001)
#define Z7_COMPILER_SHA256_SUPPORTED
#endif #endif
#elif defined(__clang__)
#if (__clang_major__ >= 8) // fix that check
#define _SHA_SUPPORTED
#endif #endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define _SHA_SUPPORTED
#endif #endif
#endif #endif
#endif #endif
void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef _SHA_SUPPORTED #ifdef Z7_COMPILER_SHA256_SUPPORTED
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
static SHA256_FUNC_UPDATE_BLOCKS g_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks; static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS = Sha256_UpdateBlocks;
static SHA256_FUNC_UPDATE_BLOCKS g_FUNC_UPDATE_BLOCKS_HW; static SHA256_FUNC_UPDATE_BLOCKS g_SHA256_FUNC_UPDATE_BLOCKS_HW;
#define UPDATE_BLOCKS(p) p->func_UpdateBlocks #define SHA256_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
#else #else
#define UPDATE_BLOCKS(p) Sha256_UpdateBlocks #define SHA256_UPDATE_BLOCKS(p) Sha256_UpdateBlocks
#endif #endif
@ -66,16 +62,16 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
{ {
SHA256_FUNC_UPDATE_BLOCKS func = Sha256_UpdateBlocks; SHA256_FUNC_UPDATE_BLOCKS func = Sha256_UpdateBlocks;
#ifdef _SHA_SUPPORTED #ifdef Z7_COMPILER_SHA256_SUPPORTED
if (algo != SHA256_ALGO_SW) if (algo != SHA256_ALGO_SW)
{ {
if (algo == SHA256_ALGO_DEFAULT) if (algo == SHA256_ALGO_DEFAULT)
func = g_FUNC_UPDATE_BLOCKS; func = g_SHA256_FUNC_UPDATE_BLOCKS;
else else
{ {
if (algo != SHA256_ALGO_HW) if (algo != SHA256_ALGO_HW)
return False; return False;
func = g_FUNC_UPDATE_BLOCKS_HW; func = g_SHA256_FUNC_UPDATE_BLOCKS_HW;
if (!func) if (!func)
return False; return False;
} }
@ -85,24 +81,25 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
return False; return False;
#endif #endif
p->func_UpdateBlocks = func; p->v.vars.func_UpdateBlocks = func;
return True; return True;
} }
/* define it for speed optimization */ /* define it for speed optimization */
#ifdef _SFX #ifdef Z7_SFX
#define STEP_PRE 1 #define STEP_PRE 1
#define STEP_MAIN 1 #define STEP_MAIN 1
#else #else
#define STEP_PRE 2 #define STEP_PRE 2
#define STEP_MAIN 4 #define STEP_MAIN 4
// #define _SHA256_UNROLL // #define Z7_SHA256_UNROLL
#endif #endif
#undef Z7_SHA256_BIG_W
#if STEP_MAIN != 16 #if STEP_MAIN != 16
#define _SHA256_BIG_W #define Z7_SHA256_BIG_W
#endif #endif
@ -110,7 +107,7 @@ BoolInt Sha256_SetFunction(CSha256 *p, unsigned algo)
void Sha256_InitState(CSha256 *p) void Sha256_InitState(CSha256 *p)
{ {
p->count = 0; p->v.vars.count = 0;
p->state[0] = 0x6a09e667; p->state[0] = 0x6a09e667;
p->state[1] = 0xbb67ae85; p->state[1] = 0xbb67ae85;
p->state[2] = 0x3c6ef372; p->state[2] = 0x3c6ef372;
@ -121,21 +118,28 @@ void Sha256_InitState(CSha256 *p)
p->state[7] = 0x5be0cd19; p->state[7] = 0x5be0cd19;
} }
void Sha256_Init(CSha256 *p) void Sha256_Init(CSha256 *p)
{ {
p->func_UpdateBlocks = p->v.vars.func_UpdateBlocks =
#ifdef _SHA_SUPPORTED #ifdef Z7_COMPILER_SHA256_SUPPORTED
g_FUNC_UPDATE_BLOCKS; g_SHA256_FUNC_UPDATE_BLOCKS;
#else #else
NULL; NULL;
#endif #endif
Sha256_InitState(p); Sha256_InitState(p);
} }
#define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x, 22)) #define S0(x) (rotrFixed(x, 2) ^ rotrFixed(x,13) ^ rotrFixed(x,22))
#define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x, 25)) #define S1(x) (rotrFixed(x, 6) ^ rotrFixed(x,11) ^ rotrFixed(x,25))
#define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3)) #define s0(x) (rotrFixed(x, 7) ^ rotrFixed(x,18) ^ (x >> 3))
#define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >> 10)) #define s1(x) (rotrFixed(x,17) ^ rotrFixed(x,19) ^ (x >>10))
#define Ch(x,y,z) (z^(x&(y^z))) #define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y))) #define Maj(x,y,z) ((x&y)|(z&(x|y)))
@ -145,7 +149,7 @@ void Sha256_Init(CSha256 *p)
#define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15)) #define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15))
#ifdef _SHA256_BIG_W #ifdef Z7_SHA256_BIG_W
// we use +i instead of +(i) to change the order to solve CLANG compiler warning for signed/unsigned. // we use +i instead of +(i) to change the order to solve CLANG compiler warning for signed/unsigned.
#define w(j, i) W[(size_t)(j) + i] #define w(j, i) W[(size_t)(j) + i]
#define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i)) #define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i))
@ -176,7 +180,7 @@ void Sha256_Init(CSha256 *p)
#define R1_PRE(i) T1( W_PRE, i) #define R1_PRE(i) T1( W_PRE, i)
#define R1_MAIN(i) T1( W_MAIN, i) #define R1_MAIN(i) T1( W_MAIN, i)
#if (!defined(_SHA256_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4) #if (!defined(Z7_SHA256_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4)
#define R2_MAIN(i) \ #define R2_MAIN(i) \
R1_MAIN(i) \ R1_MAIN(i) \
R1_MAIN(i + 1) \ R1_MAIN(i + 1) \
@ -185,7 +189,7 @@ void Sha256_Init(CSha256 *p)
#if defined(_SHA256_UNROLL) && STEP_MAIN >= 8 #if defined(Z7_SHA256_UNROLL) && STEP_MAIN >= 8
#define T4( a,b,c,d,e,f,g,h, wx, i) \ #define T4( a,b,c,d,e,f,g,h, wx, i) \
h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \ h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
@ -223,14 +227,10 @@ void Sha256_Init(CSha256 *p)
#endif #endif
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
// static extern
extern MY_ALIGN(64) MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64];
const UInt32 SHA256_K_ARRAY[64]; MY_ALIGN(64) const UInt32 SHA256_K_ARRAY[64] = {
MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
@ -249,26 +249,28 @@ const UInt32 SHA256_K_ARRAY[64] = {
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
}; };
#define K SHA256_K_ARRAY #define K SHA256_K_ARRAY
Z7_NO_INLINE
MY_NO_INLINE void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks)
void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks)
{ {
UInt32 W UInt32 W
#ifdef _SHA256_BIG_W #ifdef Z7_SHA256_BIG_W
[64]; [64];
#else #else
[16]; [16];
#endif #endif
unsigned j; unsigned j;
UInt32 a,b,c,d,e,f,g,h; UInt32 a,b,c,d,e,f,g,h;
#if !defined(Z7_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
#if !defined(_SHA256_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
UInt32 tmp; UInt32 tmp;
#endif #endif
if (numBlocks == 0) return;
a = state[0]; a = state[0];
b = state[1]; b = state[1];
@ -279,7 +281,7 @@ void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t
g = state[6]; g = state[6];
h = state[7]; h = state[7];
while (numBlocks) do
{ {
for (j = 0; j < 16; j += STEP_PRE) for (j = 0; j < 16; j += STEP_PRE)
@ -297,12 +299,12 @@ void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t
#else #else
R1_PRE(0); R1_PRE(0)
#if STEP_PRE >= 2 #if STEP_PRE >= 2
R1_PRE(1); R1_PRE(1)
#if STEP_PRE >= 4 #if STEP_PRE >= 4
R1_PRE(2); R1_PRE(2)
R1_PRE(3); R1_PRE(3)
#endif #endif
#endif #endif
@ -311,32 +313,32 @@ void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t
for (j = 16; j < 64; j += STEP_MAIN) for (j = 16; j < 64; j += STEP_MAIN)
{ {
#if defined(_SHA256_UNROLL) && STEP_MAIN >= 8 #if defined(Z7_SHA256_UNROLL) && STEP_MAIN >= 8
#if STEP_MAIN < 8 #if STEP_MAIN < 8
R4_MAIN(0); R4_MAIN(0)
#else #else
R8_MAIN(0); R8_MAIN(0)
#if STEP_MAIN == 16 #if STEP_MAIN == 16
R8_MAIN(8); R8_MAIN(8)
#endif #endif
#endif #endif
#else #else
R1_MAIN(0); R1_MAIN(0)
#if STEP_MAIN >= 2 #if STEP_MAIN >= 2
R1_MAIN(1); R1_MAIN(1)
#if STEP_MAIN >= 4 #if STEP_MAIN >= 4
R2_MAIN(2); R2_MAIN(2)
#if STEP_MAIN >= 8 #if STEP_MAIN >= 8
R2_MAIN(4); R2_MAIN(4)
R2_MAIN(6); R2_MAIN(6)
#if STEP_MAIN >= 16 #if STEP_MAIN >= 16
R2_MAIN(8); R2_MAIN(8)
R2_MAIN(10); R2_MAIN(10)
R2_MAIN(12); R2_MAIN(12)
R2_MAIN(14); R2_MAIN(14)
#endif #endif
#endif #endif
#endif #endif
@ -353,40 +355,27 @@ void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t
g += state[6]; state[6] = g; g += state[6]; state[6] = g;
h += state[7]; state[7] = h; h += state[7]; state[7] = h;
data += 64; data += SHA256_BLOCK_SIZE;
numBlocks--;
} }
while (--numBlocks);
/* Wipe variables */
/* memset(W, 0, sizeof(W)); */
} }
#undef S0
#undef S1
#undef s0
#undef s1
#undef K
#define Sha256_UpdateBlock(p) UPDATE_BLOCKS(p)(p->state, p->buffer, 1) #define Sha256_UpdateBlock(p) SHA256_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
void Sha256_Update(CSha256 *p, const Byte *data, size_t size) void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
{ {
if (size == 0) if (size == 0)
return; return;
{ {
unsigned pos = (unsigned)p->count & 0x3F; const unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
unsigned num; const unsigned num = SHA256_BLOCK_SIZE - pos;
p->v.vars.count += size;
p->count += size;
num = 64 - pos;
if (num > size) if (num > size)
{ {
memcpy(p->buffer + pos, data, size); memcpy(p->buffer + pos, data, size);
return; return;
} }
if (pos != 0) if (pos != 0)
{ {
size -= num; size -= num;
@ -396,9 +385,10 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
} }
} }
{ {
size_t numBlocks = size >> 6; const size_t numBlocks = size >> 6;
UPDATE_BLOCKS(p)(p->state, data, numBlocks); // if (numBlocks)
size &= 0x3F; SHA256_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= SHA256_BLOCK_SIZE - 1;
if (size == 0) if (size == 0)
return; return;
data += (numBlocks << 6); data += (numBlocks << 6);
@ -409,78 +399,94 @@ void Sha256_Update(CSha256 *p, const Byte *data, size_t size)
void Sha256_Final(CSha256 *p, Byte *digest) void Sha256_Final(CSha256 *p, Byte *digest)
{ {
unsigned pos = (unsigned)p->count & 0x3F; unsigned pos = (unsigned)p->v.vars.count & (SHA256_BLOCK_SIZE - 1);
unsigned i;
p->buffer[pos++] = 0x80; p->buffer[pos++] = 0x80;
if (pos > (SHA256_BLOCK_SIZE - 4 * 2))
if (pos > (64 - 8))
{ {
while (pos != 64) { p->buffer[pos++] = 0; } while (pos != SHA256_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, 64 - pos); // memset(&p->buf.buffer[pos], 0, SHA256_BLOCK_SIZE - pos);
Sha256_UpdateBlock(p); Sha256_UpdateBlock(p);
pos = 0; pos = 0;
} }
memset(&p->buffer[pos], 0, (SHA256_BLOCK_SIZE - 4 * 2) - pos);
/*
if (pos & 3)
{ {
p->buffer[pos] = 0; const UInt64 numBits = p->v.vars.count << 3;
p->buffer[pos + 1] = 0; SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 2, (UInt32)(numBits >> 32))
p->buffer[pos + 2] = 0; SetBe32(p->buffer + SHA256_BLOCK_SIZE - 4 * 1, (UInt32)(numBits))
pos += 3;
pos &= ~3;
} }
{
for (; pos < 64 - 8; pos += 4)
*(UInt32 *)(&p->buffer[pos]) = 0;
}
*/
memset(&p->buffer[pos], 0, (64 - 8) - pos);
{
UInt64 numBits = (p->count << 3);
SetBe32(p->buffer + 64 - 8, (UInt32)(numBits >> 32));
SetBe32(p->buffer + 64 - 4, (UInt32)(numBits));
}
Sha256_UpdateBlock(p); Sha256_UpdateBlock(p);
#if 1 && defined(MY_CPU_BE)
for (i = 0; i < 8; i += 2) memcpy(digest, p->state, SHA256_DIGEST_SIZE);
#else
{ {
UInt32 v0 = p->state[i]; unsigned i;
UInt32 v1 = p->state[(size_t)i + 1]; for (i = 0; i < 8; i += 2)
SetBe32(digest , v0); {
SetBe32(digest + 4, v1); const UInt32 v0 = p->state[i];
digest += 8; const UInt32 v1 = p->state[(size_t)i + 1];
SetBe32(digest , v0)
SetBe32(digest + 4, v1)
digest += 4 * 2;
}
} }
#endif
Sha256_InitState(p); Sha256_InitState(p);
} }
void Sha256Prepare() void Sha256Prepare(void)
{ {
#ifdef _SHA_SUPPORTED #ifdef Z7_COMPILER_SHA256_SUPPORTED
SHA256_FUNC_UPDATE_BLOCKS f, f_hw; SHA256_FUNC_UPDATE_BLOCKS f, f_hw;
f = Sha256_UpdateBlocks; f = Sha256_UpdateBlocks;
f_hw = NULL; f_hw = NULL;
#ifdef MY_CPU_X86_OR_AMD64 #ifdef MY_CPU_X86_OR_AMD64
#ifndef USE_MY_MM
if (CPU_IsSupported_SHA() if (CPU_IsSupported_SHA()
&& CPU_IsSupported_SSSE3() && CPU_IsSupported_SSSE3()
// && CPU_IsSupported_SSE41()
) )
#endif #else
#else
if (CPU_IsSupported_SHA2()) if (CPU_IsSupported_SHA2())
#endif #endif
{ {
// printf("\n========== HW SHA256 ======== \n"); // printf("\n========== HW SHA256 ======== \n");
f = f_hw = Sha256_UpdateBlocks_HW; f = f_hw = Sha256_UpdateBlocks_HW;
} }
g_FUNC_UPDATE_BLOCKS = f; g_SHA256_FUNC_UPDATE_BLOCKS = f;
g_FUNC_UPDATE_BLOCKS_HW = f_hw; g_SHA256_FUNC_UPDATE_BLOCKS_HW = f_hw;
#endif #endif
} }
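A minimal usage sketch of the API touched here, using only functions visible in this diff (Sha256Prepare() is the one-time dispatcher setup; per the header comment it should run once at program start):

#include "Sha256.h"

static void Sha256_OfBuf(const Byte *data, size_t size,
    Byte digest[SHA256_DIGEST_SIZE])
{
  CSha256 ctx;
  Sha256Prepare();                 // once at program start, in real use
  Sha256_Init(&ctx);               // picks g_SHA256_FUNC_UPDATE_BLOCKS (HW if detected)
  Sha256_Update(&ctx, data, size);
  Sha256_Final(&ctx, digest);      // writes big-endian words, then re-inits the state
}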
#undef U64C
#undef K
#undef S0
#undef S1
#undef s0
#undef s1
#undef Ch
#undef Maj
#undef W_MAIN
#undef W_PRE
#undef w
#undef blk2_main
#undef blk2
#undef T1
#undef T4
#undef T8
#undef R1_PRE
#undef R1_MAIN
#undef R2_MAIN
#undef R4
#undef R4_PRE
#undef R4_MAIN
#undef R8
#undef R8_PRE
#undef R8_MAIN
#undef STEP_PRE
#undef STEP_MAIN
#undef Z7_SHA256_BIG_W
#undef Z7_SHA256_UNROLL
#undef Z7_COMPILER_SHA256_SUPPORTED
@ -1,8 +1,8 @@
/* Sha256.h -- SHA-256 Hash /* Sha256.h -- SHA-256 Hash
2021-01-01 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#ifndef __7Z_SHA256_H #ifndef ZIP7_INC_SHA256_H
#define __7Z_SHA256_H #define ZIP7_INC_SHA256_H
#include "7zTypes.h" #include "7zTypes.h"
@ -14,7 +14,10 @@ EXTERN_C_BEGIN
#define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4) #define SHA256_BLOCK_SIZE (SHA256_NUM_BLOCK_WORDS * 4)
#define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4) #define SHA256_DIGEST_SIZE (SHA256_NUM_DIGEST_WORDS * 4)
typedef void (MY_FAST_CALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks);
typedef void (Z7_FASTCALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const Byte *data, size_t numBlocks);
/* /*
if (the system supports different SHA256 code implementations) if (the system supports different SHA256 code implementations)
@ -32,9 +35,16 @@ typedef void (MY_FAST_CALL *SHA256_FUNC_UPDATE_BLOCKS)(UInt32 state[8], const By
typedef struct typedef struct
{ {
SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks; union
UInt64 count; {
UInt64 __pad_2[2]; struct
{
SHA256_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
UInt64 count;
} vars;
UInt64 _pad_64bit[4];
void *_pad_align_ptr[2];
} v;
UInt32 state[SHA256_NUM_DIGEST_WORDS]; UInt32 state[SHA256_NUM_DIGEST_WORDS];
Byte buffer[SHA256_BLOCK_SIZE]; Byte buffer[SHA256_BLOCK_SIZE];
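A note on this union (my reading of the intent, not upstream commentary): each member is at most 32 bytes and _pad_64bit[4] is exactly 32, so the header occupies a fixed 32 bytes on common 32- and 64-bit ABIs, keeping state and buffer at stable offsets. A C89-style compile-time check, as a sketch:

#include <stddef.h>
#include "Sha256.h"
typedef char Sha256_header_is_32_bytes
    [offsetof(CSha256, state) == 32 ? 1 : -1];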
@ -62,7 +72,7 @@ void Sha256_Final(CSha256 *p, Byte *digest);
// void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); // void Z7_FASTCALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks);
/* /*
call Sha256Prepare() once at program start. call Sha256Prepare() once at program start.
@ -1,71 +1,53 @@
/* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions /* Sha256Opt.c -- SHA-256 optimized code for SHA-256 hardware instructions
2021-04-01 : Igor Pavlov : Public domain */ : Igor Pavlov : Public domain */
#include "Precomp.h" #include "Precomp.h"
#include "Compiler.h"
#if defined(_MSC_VER)
#if (_MSC_VER < 1900) && (_MSC_VER >= 1200)
// #define USE_MY_MM
#endif
#endif
#include "CpuArch.h" #include "CpuArch.h"
// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64 #ifdef MY_CPU_X86_OR_AMD64
#if defined(__clang__) #if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1600) // fix that check
#if (__clang_major__ >= 8) // fix that check
#define USE_HW_SHA #define USE_HW_SHA
#ifndef __SHA__ #elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 30800) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 50100) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40900)
#define USE_HW_SHA
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
#if !defined(__SHA__) || !defined(__SSSE3__)
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3"))) #define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
#if defined(_MSC_VER)
// SSSE3: for clang-cl:
#include <tmmintrin.h>
#define __SHA__
#endif
#endif #endif
#endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 8) // fix that check
#define USE_HW_SHA
#ifndef __SHA__
#define ATTRIB_SHA __attribute__((__target__("sha,ssse3")))
// #pragma GCC target("sha,ssse3")
#endif #endif
#endif
#elif defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1800) // fix that check
#define USE_HW_SHA
#endif
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
#ifdef USE_MY_MM #if (_MSC_VER >= 1900)
#define USE_VER_MIN 1300
#else
#define USE_VER_MIN 1910
#endif
#if _MSC_VER >= USE_VER_MIN
#define USE_HW_SHA #define USE_HW_SHA
#else
#define Z7_USE_HW_SHA_STUB
#endif #endif
#endif #endif
// #endif // MY_CPU_X86_OR_AMD64 // #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
// #define Z7_USE_HW_SHA_STUB // for debug
#endif
#ifdef USE_HW_SHA #ifdef USE_HW_SHA
// #pragma message("Sha256 HW") // #pragma message("Sha256 HW")
// #include <wmmintrin.h>
#if !defined(_MSC_VER) || (_MSC_VER >= 1900)
// sse/sse2/ssse3:
#include <tmmintrin.h>
// sha*:
#include <immintrin.h> #include <immintrin.h>
#if defined (__clang__) && defined(_MSC_VER)
#if !defined(__SHA__)
#include <shaintrin.h>
#endif
#else #else
#include <emmintrin.h>
#if defined(_MSC_VER) && (_MSC_VER >= 1600)
// #include <intrin.h>
#endif
#ifdef USE_MY_MM
#include "My_mm.h"
#endif
#endif #endif
@ -94,60 +76,44 @@ SHA:
extern extern
MY_ALIGN(64) MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64]; const UInt32 SHA256_K_ARRAY[64];
#define K SHA256_K_ARRAY #define K SHA256_K_ARRAY
#define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src); #define ADD_EPI32(dest, src) dest = _mm_add_epi32(dest, src);
#define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src); #define SHA256_MSG1(dest, src) dest = _mm_sha256msg1_epu32(dest, src);
#define SHA25G_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src); #define SHA256_MSG2(dest, src) dest = _mm_sha256msg2_epu32(dest, src);
#define LOAD_SHUFFLE(m, k) \ #define LOAD_SHUFFLE(m, k) \
m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \ m = _mm_loadu_si128((const __m128i *)(const void *)(data + (k) * 16)); \
m = _mm_shuffle_epi8(m, mask); \ m = _mm_shuffle_epi8(m, mask); \
#define SM1(g0, g1, g2, g3) \ #define NNN(m0, m1, m2, m3)
SHA256_MSG1(g3, g0); \
#define SM2(g0, g1, g2, g3) \ #define SM1(m1, m2, m3, m0) \
tmp = _mm_alignr_epi8(g1, g0, 4); \ SHA256_MSG1(m0, m1); \
ADD_EPI32(g2, tmp); \
SHA25G_MSG2(g2, g1); \
// #define LS0(k, g0, g1, g2, g3) LOAD_SHUFFLE(g0, k)
// #define LS1(k, g0, g1, g2, g3) LOAD_SHUFFLE(g1, k+1)
#define NNN(g0, g1, g2, g3)
#define SM2(m2, m3, m0, m1) \
ADD_EPI32(m0, _mm_alignr_epi8(m3, m2, 4)) \
SHA256_MSG2(m0, m3); \
#define RND2(t0, t1) \ #define RND2(t0, t1) \
t0 = _mm_sha256rnds2_epu32(t0, t1, msg); t0 = _mm_sha256rnds2_epu32(t0, t1, msg);
#define RND2_0(m, k) \
msg = _mm_add_epi32(m, *(const __m128i *) (const void *) &K[(k) * 4]); \
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[(k) * 4]); \
RND2(state0, state1); \ RND2(state0, state1); \
msg = _mm_shuffle_epi32(msg, 0x0E); \ msg = _mm_shuffle_epi32(msg, 0x0E); \
OP0(m0, m1, m2, m3) \
#define RND2_1 \
RND2(state1, state0); \ RND2(state1, state0); \
OP1(m0, m1, m2, m3) \
// We use a scheme with 3 rounds ahead for SHA256_MSG1 / 2 rounds ahead for SHA256_MSG2
#define R4(k, g0, g1, g2, g3, OP0, OP1) \
RND2_0(g0, k); \
OP0(g0, g1, g2, g3); \
RND2_1; \
OP1(g0, g1, g2, g3); \
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
R4 ( (k)*4+0, m0, m1, m2, m3, OP0, OP1 ) \ R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
R4 ( (k)*4+1, m1, m2, m3, m0, OP2, OP3 ) \ R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
R4 ( (k)*4+2, m2, m3, m0, m1, OP4, OP5 ) \ R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \ R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \
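To make the data flow concrete, R4(0, m0,m1,m2,m3, NNN, NNN) expands (pure macro substitution; the empty NNN ops drop out) to:

msg = _mm_add_epi32(m0, *(const __m128i *) (const void *) &K[0]);  // W[0..3] + K[0..3]
state0 = _mm_sha256rnds2_epu32(state0, state1, msg);  // 2 rounds from lanes 0,1
msg = _mm_shuffle_epi32(msg, 0x0E);                   // move lanes 2,3 down to 0,1
state1 = _mm_sha256rnds2_epu32(state1, state0, msg);  // 2 rounds from lanes 2,3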
#define PREPARE_STATE \ #define PREPARE_STATE \
tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \ tmp = _mm_shuffle_epi32(state0, 0x1B); /* abcd */ \
@ -157,15 +123,16 @@ const UInt32 SHA256_K_ARRAY[64];
state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \ state1 = _mm_unpackhi_epi64(state1, tmp); /* abef */ \
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA #ifdef ATTRIB_SHA
ATTRIB_SHA ATTRIB_SHA
#endif #endif
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{ {
const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203); const __m128i mask = _mm_set_epi32(0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203);
__m128i tmp;
__m128i state0, state1;
__m128i tmp, state0, state1;
if (numBlocks == 0) if (numBlocks == 0)
return; return;
@ -192,13 +159,13 @@ void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size
R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ); R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ); R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
ADD_EPI32(state0, state0_save); ADD_EPI32(state0, state0_save)
ADD_EPI32(state1, state1_save); ADD_EPI32(state1, state1_save)
data += 64; data += 64;
} }
@ -212,19 +179,28 @@ void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size
#endif // USE_HW_SHA #endif // USE_HW_SHA
#elif defined(MY_CPU_ARM_OR_ARM64) #elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)
#if defined(__clang__) #if defined(__ARM_FEATURE_SHA2) \
#if (__clang_major__ >= 8) // fix that check || defined(__ARM_FEATURE_CRYPTO)
#define USE_HW_SHA
#else
#if defined(MY_CPU_ARM64) \
|| defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
|| defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
( defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
|| defined(__GNUC__) && (__GNUC__ >= 6) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
|| !defined(Z7_CLANG_VERSION) \
|| defined(__ARM_NEON) && \
(Z7_CLANG_VERSION < 170000 || \
Z7_CLANG_VERSION > 170001)
#define USE_HW_SHA #define USE_HW_SHA
#endif #endif
#elif defined(__GNUC__)
#if (__GNUC__ >= 6) // fix that check
#define USE_HW_SHA
#endif #endif
#elif defined(_MSC_VER)
#if _MSC_VER >= 1910
#define USE_HW_SHA
#endif #endif
#endif #endif
@ -232,63 +208,144 @@ void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size
// #pragma message("=== Sha256 HW === ") // #pragma message("=== Sha256 HW === ")
#if defined(__clang__) || defined(__GNUC__) #if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA2) && \
!defined(__ARM_FEATURE_CRYPTO)
#ifdef MY_CPU_ARM64 #ifdef MY_CPU_ARM64
#if defined(__clang__)
#define ATTRIB_SHA __attribute__((__target__("crypto")))
#else
#define ATTRIB_SHA __attribute__((__target__("+crypto"))) #define ATTRIB_SHA __attribute__((__target__("+crypto")))
#endif
#else #else
#if defined(__clang__) && (__clang_major__ >= 1)
#define ATTRIB_SHA __attribute__((__target__("armv8-a,sha2")))
#else
#define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8"))) #define ATTRIB_SHA __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
#endif #endif
#endif
#else #else
// _MSC_VER // _MSC_VER
// for arm32 // for arm32
#define _ARM_USE_NEW_NEON_INTRINSICS #define _ARM_USE_NEW_NEON_INTRINSICS
#endif #endif
#if defined(_MSC_VER) && defined(MY_CPU_ARM64) #if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h> #include <arm64_neon.h>
#else #else
#include <arm_neon.h>
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA2) && \
!defined(__ARM_FEATURE_CRYPTO)
// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
#define __ARM_FEATURE_CRYPTO 1
// #else
#define __ARM_FEATURE_SHA2 1
// #endif
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif #endif
#endif // clang
#if defined(__clang__)
#if defined(__ARM_ARCH) && __ARM_ARCH < 8
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
// #pragma message("#define __ARM_ARCH 8")
#undef __ARM_ARCH
#define __ARM_ARCH 8
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang
#include <arm_neon.h>
#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
defined(__ARM_FEATURE_CRYPTO) && \
defined(__ARM_FEATURE_SHA2)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_CRYPTO
#undef __ARM_FEATURE_SHA2
#undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif
#endif // Z7_MSC_VER_ORIGINAL
typedef uint32x4_t v128; typedef uint32x4_t v128;
// typedef __n128 v128; // MSVC // typedef __n128 v128; // MSVC
#ifdef MY_CPU_BE #ifdef MY_CPU_BE
#define MY_rev32_for_LE(x) #define MY_rev32_for_LE(x) x
#else #else
#define MY_rev32_for_LE(x) x = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x))) #define MY_rev32_for_LE(x) vrev32q_u8(x)
#endif #endif
#define LOAD_128(_p) (*(const v128 *)(const void *)(_p)) #if 1 // 0 for debug
#define STORE_128(_p, _v) *(v128 *)(void *)(_p) = (_v) // for arm32: it works slower by some reason than direct code
/*
for arm32 it generates:
MSVC-2022, GCC-9:
vld1.32 {d18,d19}, [r10]
vst1.32 {d4,d5}, [r3]
vld1.8 {d20-d21}, [r4]
there is no align hint (like [r10:128]). So instruction allows unaligned access
*/
#define LOAD_128_32(_p) vld1q_u32(_p)
#define LOAD_128_8(_p) vld1q_u8 (_p)
#define STORE_128_32(_p, _v) vst1q_u32(_p, _v)
#else
/*
for arm32:
MSVC-2022:
vldm r10,{d18,d19}
vstm r3,{d4,d5}
does it require strict alignment?
GCC-9:
vld1.64 {d30-d31}, [r0:64]
vldr d28, [r0, #16]
vldr d29, [r0, #24]
vst1.64 {d30-d31}, [r0:64]
vstr d28, [r0, #16]
vstr d29, [r0, #24]
there is an align hint [r0:64], so it seems to require 64-bit alignment.
*/
#define LOAD_128_32(_p) (*(const v128 *)(const void *)(_p))
#define LOAD_128_8(_p) vreinterpretq_u8_u32(*(const v128 *)(const void *)(_p))
#define STORE_128_32(_p, _v) *(v128 *)(void *)(_p) = (_v)
#endif
#define LOAD_SHUFFLE(m, k) \ #define LOAD_SHUFFLE(m, k) \
m = LOAD_128((data + (k) * 16)); \ m = vreinterpretq_u32_u8( \
MY_rev32_for_LE(m); \ MY_rev32_for_LE( \
LOAD_128_8(data + (k) * 16))); \
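Per 32-bit lane, LOAD_SHUFFLE therefore yields the big-endian interpretation of the input bytes (vrev32q_u8 reverses the bytes within each lane on little-endian CPUs). A scalar sketch of what one lane receives:

static UInt32 LoadBe32_lane(const Byte *p)
{
  return ((UInt32)p[0] << 24) | ((UInt32)p[1] << 16)
       | ((UInt32)p[2] <<  8) |  (UInt32)p[3];
}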
// K array must be aligned for 16-bytes at least. // K array must be aligned for 16-bytes at least.
extern extern
MY_ALIGN(64) MY_ALIGN(64)
const UInt32 SHA256_K_ARRAY[64]; const UInt32 SHA256_K_ARRAY[64];
#define K SHA256_K_ARRAY #define K SHA256_K_ARRAY
#define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src); #define SHA256_SU0(dest, src) dest = vsha256su0q_u32(dest, src);
#define SHA25G_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3); #define SHA256_SU1(dest, src2, src3) dest = vsha256su1q_u32(dest, src2, src3);
#define SM1(g0, g1, g2, g3) SHA256_SU0(g3, g0) #define SM1(m0, m1, m2, m3) SHA256_SU0(m3, m0)
#define SM2(g0, g1, g2, g3) SHA25G_SU1(g2, g0, g1) #define SM2(m0, m1, m2, m3) SHA256_SU1(m2, m0, m1)
#define NNN(g0, g1, g2, g3) #define NNN(m0, m1, m2, m3)
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
#define R4(k, g0, g1, g2, g3, OP0, OP1) \ msg = vaddq_u32(m0, *(const v128 *) (const void *) &K[(k) * 4]); \
msg = vaddq_u32(g0, *(const v128 *) (const void *) &K[(k) * 4]); \
tmp = state0; \ tmp = state0; \
state0 = vsha256hq_u32( state0, state1, msg ); \ state0 = vsha256hq_u32( state0, state1, msg ); \
state1 = vsha256h2q_u32( state1, tmp, msg ); \ state1 = vsha256h2q_u32( state1, tmp, msg ); \
OP0(g0, g1, g2, g3); \ OP0(m0, m1, m2, m3); \
OP1(g0, g1, g2, g3); \ OP1(m0, m1, m2, m3); \
#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \ #define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
@ -298,19 +355,19 @@ const UInt32 SHA256_K_ARRAY[64];
R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \ R4 ( (k)*4+3, m3, m0, m1, m2, OP6, OP7 ) \
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA #ifdef ATTRIB_SHA
ATTRIB_SHA ATTRIB_SHA
#endif #endif
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{ {
v128 state0, state1; v128 state0, state1;
if (numBlocks == 0) if (numBlocks == 0)
return; return;
state0 = LOAD_128(&state[0]); state0 = LOAD_128_32(&state[0]);
state1 = LOAD_128(&state[4]); state1 = LOAD_128_32(&state[4]);
do do
{ {
@ -326,10 +383,10 @@ void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size
LOAD_SHUFFLE (m2, 2) LOAD_SHUFFLE (m2, 2)
LOAD_SHUFFLE (m3, 3) LOAD_SHUFFLE (m3, 3)
R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 ); R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 ); R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN ); R16 ( 3, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
state0 = vaddq_u32(state0, state0_save); state0 = vaddq_u32(state0, state0_save);
state1 = vaddq_u32(state1, state1_save); state1 = vaddq_u32(state1, state1_save);
@ -338,8 +395,8 @@ void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size
} }
while (--numBlocks); while (--numBlocks);
STORE_128(&state[0], state0); STORE_128_32(&state[0], state0);
STORE_128(&state[4], state1); STORE_128_32(&state[4], state1);
} }
#endif // USE_HW_SHA #endif // USE_HW_SHA
@ -347,18 +404,19 @@ void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size
#endif // MY_CPU_ARM_OR_ARM64 #endif // MY_CPU_ARM_OR_ARM64
#ifndef USE_HW_SHA #if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA // #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h> // #include <stdlib.h>
// We can compile this file with another C compiler,
// or we can compile the asm version.
// So we can generate real code instead of this stub function.
// #include "Sha256.h" // #include "Sha256.h"
void MY_FAST_CALL Sha256_UpdateBlocks(UInt32 state[8], const Byte *data, size_t numBlocks); // #if defined(_MSC_VER)
#pragma message("Sha256 HW-SW stub was used") #pragma message("Sha256 HW-SW stub was used")
// #endif
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks); void Z7_FASTCALL Sha256_UpdateBlocks (UInt32 state[8], const Byte *data, size_t numBlocks);
void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks) void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size_t numBlocks)
{ {
Sha256_UpdateBlocks(state, data, numBlocks); Sha256_UpdateBlocks(state, data, numBlocks);
/* /*
@ -369,5 +427,25 @@ void MY_FAST_CALL Sha256_UpdateBlocks_HW(UInt32 state[8], const Byte *data, size
return; return;
*/ */
} }
#endif #endif
#undef K
#undef RND2
#undef MY_rev32_for_LE
#undef NNN
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB
C/Sha3.c Normal file
@ -0,0 +1,359 @@
/* Sha3.c -- SHA-3 Hash
: Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "Precomp.h"
#include <string.h>
#include "Sha3.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#define U64C(x) UINT64_CONST(x)
static
MY_ALIGN(64)
const UInt64 SHA3_K_ARRAY[24] =
{
U64C(0x0000000000000001), U64C(0x0000000000008082),
U64C(0x800000000000808a), U64C(0x8000000080008000),
U64C(0x000000000000808b), U64C(0x0000000080000001),
U64C(0x8000000080008081), U64C(0x8000000000008009),
U64C(0x000000000000008a), U64C(0x0000000000000088),
U64C(0x0000000080008009), U64C(0x000000008000000a),
U64C(0x000000008000808b), U64C(0x800000000000008b),
U64C(0x8000000000008089), U64C(0x8000000000008003),
U64C(0x8000000000008002), U64C(0x8000000000000080),
U64C(0x000000000000800a), U64C(0x800000008000000a),
U64C(0x8000000080008081), U64C(0x8000000000008080),
U64C(0x0000000080000001), U64C(0x8000000080008008)
};
void Sha3_Init(CSha3 *p)
{
p->count = 0;
memset(p->state, 0, sizeof(p->state));
}
#define GET_state(i, a) UInt64 a = state[i];
#define SET_state(i, a) state[i] = a;
#define LS_5(M, i, a0,a1,a2,a3,a4) \
M ((i) * 5 , a0) \
M ((i) * 5 + 1, a1) \
M ((i) * 5 + 2, a2) \
M ((i) * 5 + 3, a3) \
M ((i) * 5 + 4, a4) \
#define LS_25(M) \
LS_5 (M, 0, a50, a51, a52, a53, a54) \
LS_5 (M, 1, a60, a61, a62, a63, a64) \
LS_5 (M, 2, a70, a71, a72, a73, a74) \
LS_5 (M, 3, a80, a81, a82, a83, a84) \
LS_5 (M, 4, a90, a91, a92, a93, a94) \
#define XOR_1(i, a0) \
a0 ^= GetUi64(data + (i) * 8); \
#define XOR_4(i, a0,a1,a2,a3) \
XOR_1 ((i) , a0); \
XOR_1 ((i) + 1, a1); \
XOR_1 ((i) + 2, a2); \
XOR_1 ((i) + 3, a3); \
#define D(d,b1,b2) \
d = b1 ^ Z7_ROTL64(b2, 1);
#define D5 \
D (d0, c4, c1) \
D (d1, c0, c2) \
D (d2, c1, c3) \
D (d3, c2, c4) \
D (d4, c3, c0) \
#define C0(c,a,d) \
c = a ^ d; \
#define C(c,a,d,k) \
c = a ^ d; \
c = Z7_ROTL64(c, k); \
#define E4(e1,e2,e3,e4) \
e1 = c1 ^ (~c2 & c3); \
e2 = c2 ^ (~c3 & c4); \
e3 = c3 ^ (~c4 & c0); \
e4 = c4 ^ (~c0 & c1); \
#define CK( v0,w0, \
v1,w1,k1, \
v2,w2,k2, \
v3,w3,k3, \
v4,w4,k4, e0,e1,e2,e3,e4, keccak_c) \
C0(c0,v0,w0) \
C (c1,v1,w1,k1) \
C (c2,v2,w2,k2) \
C (c3,v3,w3,k3) \
C (c4,v4,w4,k4) \
e0 = c0 ^ (~c1 & c2) ^ keccak_c; \
E4(e1,e2,e3,e4) \
#define CE( v0,w0,k0, \
v1,w1,k1, \
v2,w2,k2, \
v3,w3,k3, \
v4,w4,k4, e0,e1,e2,e3,e4) \
C (c0,v0,w0,k0) \
C (c1,v1,w1,k1) \
C (c2,v2,w2,k2) \
C (c3,v3,w3,k3) \
C (c4,v4,w4,k4) \
e0 = c0 ^ (~c1 & c2); \
E4(e1,e2,e3,e4) \
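The CK/CE pair is a register-allocated, two-rounds-unrolled form of the Keccak-f[1600] round. As a readability aid only (a sketch, not what this file compiles), the same round with plain loops; the rotation table is the standard rho offsets, which match the constants wired into the CK/CE calls below:

#define ROTL64_SK(x, n) ((n) == 0 ? (x) : (((x) << (n)) | ((x) >> (64 - (n)))))

static const unsigned k_rho[5][5] = {   // r[x][y]
  {  0, 36,  3, 41, 18 },
  {  1, 44, 10, 45,  2 },
  { 62,  6, 43, 15, 61 },
  { 28, 55, 25, 21, 56 },
  { 27, 20, 39,  8, 14 },
};

static void KeccakRound_sketch(UInt64 a[5][5], UInt64 rc)   // a[x][y]
{
  UInt64 c[5], d[5], b[5][5];
  unsigned x, y;
  for (x = 0; x < 5; x++)   // theta: column parities
    c[x] = a[x][0] ^ a[x][1] ^ a[x][2] ^ a[x][3] ^ a[x][4];
  for (x = 0; x < 5; x++)
    d[x] = c[(x + 4) % 5] ^ ROTL64_SK(c[(x + 1) % 5], 1);
  for (x = 0; x < 5; x++)
    for (y = 0; y < 5; y++)
      a[x][y] ^= d[x];
  for (x = 0; x < 5; x++)   // rho (rotate) + pi (permute lanes)
    for (y = 0; y < 5; y++)
      b[y][(2 * x + 3 * y) % 5] = ROTL64_SK(a[x][y], k_rho[x][y]);
  for (x = 0; x < 5; x++)   // chi: nonlinear row mix
    for (y = 0; y < 5; y++)
      a[x][y] = b[x][y] ^ (~b[(x + 1) % 5][y] & b[(x + 2) % 5][y]);
  a[0][0] ^= rc;            // iota: rc = SHA3_K_ARRAY[round]
}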
// numBlocks != 0
static
Z7_NO_INLINE
void Z7_FASTCALL Sha3_UpdateBlocks(UInt64 state[SHA3_NUM_STATE_WORDS],
const Byte *data, size_t numBlocks, size_t blockSize)
{
LS_25 (GET_state)
do
{
unsigned round;
XOR_4 ( 0, a50, a51, a52, a53)
XOR_4 ( 4, a54, a60, a61, a62)
XOR_1 ( 8, a63)
if (blockSize > 8 * 9) { XOR_4 ( 9, a64, a70, a71, a72) // sha3-384
if (blockSize > 8 * 13) { XOR_4 (13, a73, a74, a80, a81) // sha3-256
if (blockSize > 8 * 17) { XOR_1 (17, a82) // sha3-224
if (blockSize > 8 * 18) { XOR_1 (18, a83) // shake128
XOR_1 (19, a84)
XOR_1 (20, a90) }}}}
data += blockSize;
for (round = 0; round < 24; round += 2)
{
UInt64 c0, c1, c2, c3, c4;
UInt64 d0, d1, d2, d3, d4;
UInt64 e50, e51, e52, e53, e54;
UInt64 e60, e61, e62, e63, e64;
UInt64 e70, e71, e72, e73, e74;
UInt64 e80, e81, e82, e83, e84;
UInt64 e90, e91, e92, e93, e94;
c0 = a50^a60^a70^a80^a90;
c1 = a51^a61^a71^a81^a91;
c2 = a52^a62^a72^a82^a92;
c3 = a53^a63^a73^a83^a93;
c4 = a54^a64^a74^a84^a94;
D5
CK( a50, d0,
a61, d1, 44,
a72, d2, 43,
a83, d3, 21,
a94, d4, 14, e50, e51, e52, e53, e54, SHA3_K_ARRAY[round])
CE( a53, d3, 28,
a64, d4, 20,
a70, d0, 3,
a81, d1, 45,
a92, d2, 61, e60, e61, e62, e63, e64)
CE( a51, d1, 1,
a62, d2, 6,
a73, d3, 25,
a84, d4, 8,
a90, d0, 18, e70, e71, e72, e73, e74)
CE( a54, d4, 27,
a60, d0, 36,
a71, d1, 10,
a82, d2, 15,
a93, d3, 56, e80, e81, e82, e83, e84)
CE( a52, d2, 62,
a63, d3, 55,
a74, d4, 39,
a80, d0, 41,
a91, d1, 2, e90, e91, e92, e93, e94)
// ---------- ROUND + 1 ----------
c0 = e50^e60^e70^e80^e90;
c1 = e51^e61^e71^e81^e91;
c2 = e52^e62^e72^e82^e92;
c3 = e53^e63^e73^e83^e93;
c4 = e54^e64^e74^e84^e94;
D5
CK( e50, d0,
e61, d1, 44,
e72, d2, 43,
e83, d3, 21,
e94, d4, 14, a50, a51, a52, a53, a54, SHA3_K_ARRAY[(size_t)round + 1])
CE( e53, d3, 28,
e64, d4, 20,
e70, d0, 3,
e81, d1, 45,
e92, d2, 61, a60, a61, a62, a63, a64)
CE( e51, d1, 1,
e62, d2, 6,
e73, d3, 25,
e84, d4, 8,
e90, d0, 18, a70, a71, a72, a73, a74)
CE (e54, d4, 27,
e60, d0, 36,
e71, d1, 10,
e82, d2, 15,
e93, d3, 56, a80, a81, a82, a83, a84)
CE (e52, d2, 62,
e63, d3, 55,
e74, d4, 39,
e80, d0, 41,
e91, d1, 2, a90, a91, a92, a93, a94)
}
}
while (--numBlocks);
LS_25 (SET_state)
}
#define Sha3_UpdateBlock(p) \
Sha3_UpdateBlocks(p->state, p->buffer, 1, p->blockSize)
void Sha3_Update(CSha3 *p, const Byte *data, size_t size)
{
/*
for (;;)
{
if (size == 0)
return;
unsigned cur = p->blockSize - p->count;
if (cur > size)
cur = (unsigned)size;
size -= cur;
unsigned pos = p->count;
p->count = pos + cur;
while (pos & 7)
{
if (cur == 0)
return;
Byte *pb = &(((Byte *)p->state)[pos]);
*pb = (Byte)(*pb ^ *data++);
cur--;
pos++;
}
if (cur >= 8)
{
do
{
*(UInt64 *)(void *)&(((Byte *)p->state)[pos]) ^= GetUi64(data);
data += 8;
pos += 8;
cur -= 8;
}
while (cur >= 8);
}
if (pos != p->blockSize)
{
if (cur)
{
Byte *pb = &(((Byte *)p->state)[pos]);
do
{
*pb = (Byte)(*pb ^ *data++);
pb++;
}
while (--cur);
}
return;
}
Sha3_UpdateBlock(p->state);
p->count = 0;
}
*/
if (size == 0)
return;
{
const unsigned pos = p->count;
const unsigned num = p->blockSize - pos;
if (num > size)
{
p->count = pos + (unsigned)size;
memcpy(p->buffer + pos, data, size);
return;
}
if (pos != 0)
{
size -= num;
memcpy(p->buffer + pos, data, num);
data += num;
Sha3_UpdateBlock(p);
}
}
if (size >= p->blockSize)
{
const size_t numBlocks = size / p->blockSize;
const Byte *dataOld = data;
data += numBlocks * p->blockSize;
size = (size_t)(dataOld + size - data);
Sha3_UpdateBlocks(p->state, dataOld, numBlocks, p->blockSize);
}
p->count = (unsigned)size;
if (size)
memcpy(p->buffer, data, size);
}
// we support only (digestSize % 4 == 0) cases
void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake)
{
memset(p->buffer + p->count, 0, p->blockSize - p->count);
// we write the marker bits from low to high within the current byte:
// - for sha-3 : 2 bits : 0,1
// - for shake : 4 bits : 1111
// then we write a 1 bit (the start of pad10*1) to the same byte.
// And we write a 1 bit to the highest bit of the last byte of the block.
p->buffer[p->count] = (Byte)(shake ? 0x1f : 0x06);
// we need the xor operation (^= 0x80) here because the 0x80 bit must land
// in the same byte as the (0x1f : 0x06) marker when (p->count == p->blockSize - 1) !!!
p->buffer[p->blockSize - 1] ^= 0x80;
/*
((Byte *)p->state)[p->count] ^= (Byte)(shake ? 0x1f : 0x06);
((Byte *)p->state)[p->blockSize - 1] ^= 0x80;
*/
Sha3_UpdateBlock(p);
#if 1 && defined(MY_CPU_LE)
memcpy(digest, p->state, digestSize);
#else
{
const unsigned numWords = digestSize >> 3;
unsigned i;
for (i = 0; i < numWords; i++)
{
const UInt64 v = p->state[i];
SetUi64(digest, v)
digest += 8;
}
if (digestSize & 4) // for SHA3-224
{
const UInt32 v = (UInt32)p->state[numWords];
SetUi32(digest, v)
}
}
#endif
Sha3_Init(p);
}
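A quick worked check of the padding above for SHA3-256 (blockSize = 136):

/* count == 0   : buffer[0] = 0x06, zeros, buffer[135] ^= 0x80
   count == 135 : buffer[135] = 0x06, then buffer[135] ^= 0x80 -> 0x86;
                  the domain bits and the final pad bit share one byte,
                  which is why ^= (not =) is used for the 0x80. */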
#undef GET_state
#undef SET_state
#undef LS_5
#undef LS_25
#undef XOR_1
#undef XOR_4
#undef D
#undef D5
#undef C0
#undef C
#undef E4
#undef CK
#undef CE
C/Sha3.h Normal file
@ -0,0 +1,36 @@
/* Sha3.h -- SHA-3 Hash
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_SHA3_H
#define ZIP7_INC_SHA3_H
#include "7zTypes.h"
EXTERN_C_BEGIN
#define SHA3_NUM_STATE_WORDS 25
#define SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(digestSize) \
(SHA3_NUM_STATE_WORDS * 8 - (digestSize) * 2)
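Worked numbers for this macro (state = 25 words * 8 bytes = 200; rate = 200 - 2 * digestSize):

/* SHA3-224: 200 - 2*28 = 144    SHA3-256: 200 - 2*32 = 136
   SHA3-384: 200 - 2*48 = 104    SHA3-512: 200 - 2*64 =  72
   SHAKE128: 200 - 2*16 = 168 -- these match the XOR thresholds
   in Sha3_UpdateBlocks above. */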
typedef struct
{
UInt32 count; // < blockSize
UInt32 blockSize; // <= SHA3_NUM_STATE_WORDS * 8
UInt64 _pad1[3];
// we want 32-byte alignment here
UInt64 state[SHA3_NUM_STATE_WORDS];
UInt64 _pad2[3];
// we want 64-byte alignment here
Byte buffer[SHA3_NUM_STATE_WORDS * 8]; // last bytes will be unused with predefined blockSize values
} CSha3;
#define Sha3_SET_blockSize(p, blockSize) { (p)->blockSize = (blockSize); }
void Sha3_Init(CSha3 *p);
void Sha3_Update(CSha3 *p, const Byte *data, size_t size);
void Sha3_Final(CSha3 *p, Byte *digest, unsigned digestSize, unsigned shake);
EXTERN_C_END
#endif
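A minimal usage sketch for this header, using only the declarations above (digestSize is in bytes and must satisfy digestSize % 4 == 0; shake = 0 selects SHA-3 padding):

#include "Sha3.h"

static void Sha3_256_OfBuf(const Byte *data, size_t size, Byte digest[32])
{
  CSha3 ctx;
  Sha3_SET_blockSize(&ctx, SHA3_BLOCK_SIZE_FROM_DIGEST_SIZE(32))   // rate 136
  Sha3_Init(&ctx);
  Sha3_Update(&ctx, data, size);
  Sha3_Final(&ctx, digest, 32, 0);   // also re-inits via Sha3_Init()
}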
C/Sha512.c Normal file
@ -0,0 +1,711 @@
/* Sha512.c -- SHA-512 Hash
: Igor Pavlov : Public domain
This code is based on public domain code from Wei Dai's Crypto++ library. */
#include "Precomp.h"
#include <string.h>
#include "Sha512.h"
#include "RotateDefs.h"
#include "CpuArch.h"
#ifdef MY_CPU_X86_OR_AMD64
#if defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) \
|| defined(_MSC_VER) && (_MSC_VER >= 1940)
#define Z7_COMPILER_SHA512_SUPPORTED
#endif
#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
#if defined(__ARM_FEATURE_SHA512)
#define Z7_COMPILER_SHA512_SUPPORTED
#else
#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
|| defined(__GNUC__) && (__GNUC__ >= 9) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
#define Z7_COMPILER_SHA512_SUPPORTED
#endif
#endif
#endif
void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef Z7_COMPILER_SHA512_SUPPORTED
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS = Sha512_UpdateBlocks;
static SHA512_FUNC_UPDATE_BLOCKS g_SHA512_FUNC_UPDATE_BLOCKS_HW;
#define SHA512_UPDATE_BLOCKS(p) p->v.vars.func_UpdateBlocks
#else
#define SHA512_UPDATE_BLOCKS(p) Sha512_UpdateBlocks
#endif
BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo)
{
SHA512_FUNC_UPDATE_BLOCKS func = Sha512_UpdateBlocks;
#ifdef Z7_COMPILER_SHA512_SUPPORTED
if (algo != SHA512_ALGO_SW)
{
if (algo == SHA512_ALGO_DEFAULT)
func = g_SHA512_FUNC_UPDATE_BLOCKS;
else
{
if (algo != SHA512_ALGO_HW)
return False;
func = g_SHA512_FUNC_UPDATE_BLOCKS_HW;
if (!func)
return False;
}
}
#else
if (algo > 1)
return False;
#endif
p->v.vars.func_UpdateBlocks = func;
return True;
}
/* define it for speed optimization */
#if 0 // 1 for size optimization
#define STEP_PRE 1
#define STEP_MAIN 1
#else
#define STEP_PRE 2
#define STEP_MAIN 4
// #define Z7_SHA512_UNROLL
#endif
#undef Z7_SHA512_BIG_W
#if STEP_MAIN != 16
#define Z7_SHA512_BIG_W
#endif
#define U64C(x) UINT64_CONST(x)
static MY_ALIGN(64) const UInt64 SHA512_INIT_ARRAYS[4][8] = {
{ U64C(0x8c3d37c819544da2), U64C(0x73e1996689dcd4d6), U64C(0x1dfab7ae32ff9c82), U64C(0x679dd514582f9fcf),
U64C(0x0f6d2b697bd44da8), U64C(0x77e36f7304c48942), U64C(0x3f9d85a86a1d36c8), U64C(0x1112e6ad91d692a1)
},
{ U64C(0x22312194fc2bf72c), U64C(0x9f555fa3c84c64c2), U64C(0x2393b86b6f53b151), U64C(0x963877195940eabd),
U64C(0x96283ee2a88effe3), U64C(0xbe5e1e2553863992), U64C(0x2b0199fc2c85b8aa), U64C(0x0eb72ddc81c52ca2)
},
{ U64C(0xcbbb9d5dc1059ed8), U64C(0x629a292a367cd507), U64C(0x9159015a3070dd17), U64C(0x152fecd8f70e5939),
U64C(0x67332667ffc00b31), U64C(0x8eb44a8768581511), U64C(0xdb0c2e0d64f98fa7), U64C(0x47b5481dbefa4fa4)
},
{ U64C(0x6a09e667f3bcc908), U64C(0xbb67ae8584caa73b), U64C(0x3c6ef372fe94f82b), U64C(0xa54ff53a5f1d36f1),
U64C(0x510e527fade682d1), U64C(0x9b05688c2b3e6c1f), U64C(0x1f83d9abfb41bd6b), U64C(0x5be0cd19137e2179)
}};
void Sha512_InitState(CSha512 *p, unsigned digestSize)
{
p->v.vars.count = 0;
memcpy(p->state, SHA512_INIT_ARRAYS[(size_t)(digestSize >> 4) - 1], sizeof(p->state));
}
void Sha512_Init(CSha512 *p, unsigned digestSize)
{
p->v.vars.func_UpdateBlocks =
#ifdef Z7_COMPILER_SHA512_SUPPORTED
g_SHA512_FUNC_UPDATE_BLOCKS;
#else
NULL;
#endif
Sha512_InitState(p, digestSize);
}
#define S0(x) (Z7_ROTR64(x,28) ^ Z7_ROTR64(x,34) ^ Z7_ROTR64(x,39))
#define S1(x) (Z7_ROTR64(x,14) ^ Z7_ROTR64(x,18) ^ Z7_ROTR64(x,41))
#define s0(x) (Z7_ROTR64(x, 1) ^ Z7_ROTR64(x, 8) ^ (x >> 7))
#define s1(x) (Z7_ROTR64(x,19) ^ Z7_ROTR64(x,61) ^ (x >> 6))
#define Ch(x,y,z) (z^(x&(y^z)))
#define Maj(x,y,z) ((x&y)|(z&(x|y)))
#define W_PRE(i) (W[(i) + (size_t)(j)] = GetBe64(data + ((size_t)(j) + i) * 8))
#define blk2_main(j, i) s1(w(j, (i)-2)) + w(j, (i)-7) + s0(w(j, (i)-15))
#ifdef Z7_SHA512_BIG_W
// we use +i instead of +(i) to change the order to solve CLANG compiler warning for signed/unsigned.
#define w(j, i) W[(size_t)(j) + i]
#define blk2(j, i) (w(j, i) = w(j, (i)-16) + blk2_main(j, i))
#else
#if STEP_MAIN == 16
#define w(j, i) W[(i) & 15]
#else
#define w(j, i) W[((size_t)(j) + (i)) & 15]
#endif
#define blk2(j, i) (w(j, i) += blk2_main(j, i))
#endif
#define W_MAIN(i) blk2(j, i)
#define T1(wx, i) \
tmp = h + S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
h = g; \
g = f; \
f = e; \
e = d + tmp; \
tmp += S0(a) + Maj(a, b, c); \
d = c; \
c = b; \
b = a; \
a = tmp; \
#define R1_PRE(i) T1( W_PRE, i)
#define R1_MAIN(i) T1( W_MAIN, i)
#if (!defined(Z7_SHA512_UNROLL) || STEP_MAIN < 8) && (STEP_MAIN >= 4)
#define R2_MAIN(i) \
R1_MAIN(i) \
R1_MAIN(i + 1) \
#endif
#if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8
#define T4( a,b,c,d,e,f,g,h, wx, i) \
h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
tmp = h; \
h += d; \
d = tmp + S0(a) + Maj(a, b, c); \
#define R4( wx, i) \
T4 ( a,b,c,d,e,f,g,h, wx, (i )); \
T4 ( d,a,b,c,h,e,f,g, wx, (i+1)); \
T4 ( c,d,a,b,g,h,e,f, wx, (i+2)); \
T4 ( b,c,d,a,f,g,h,e, wx, (i+3)); \
#define R4_PRE(i) R4( W_PRE, i)
#define R4_MAIN(i) R4( W_MAIN, i)
#define T8( a,b,c,d,e,f,g,h, wx, i) \
h += S1(e) + Ch(e,f,g) + K[(i)+(size_t)(j)] + wx(i); \
d += h; \
h += S0(a) + Maj(a, b, c); \
#define R8( wx, i) \
T8 ( a,b,c,d,e,f,g,h, wx, i ); \
T8 ( h,a,b,c,d,e,f,g, wx, i+1); \
T8 ( g,h,a,b,c,d,e,f, wx, i+2); \
T8 ( f,g,h,a,b,c,d,e, wx, i+3); \
T8 ( e,f,g,h,a,b,c,d, wx, i+4); \
T8 ( d,e,f,g,h,a,b,c, wx, i+5); \
T8 ( c,d,e,f,g,h,a,b, wx, i+6); \
T8 ( b,c,d,e,f,g,h,a, wx, i+7); \
#define R8_PRE(i) R8( W_PRE, i)
#define R8_MAIN(i) R8( W_MAIN, i)
#endif
extern
MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80];
MY_ALIGN(64) const UInt64 SHA512_K_ARRAY[80] = {
U64C(0x428a2f98d728ae22), U64C(0x7137449123ef65cd), U64C(0xb5c0fbcfec4d3b2f), U64C(0xe9b5dba58189dbbc),
U64C(0x3956c25bf348b538), U64C(0x59f111f1b605d019), U64C(0x923f82a4af194f9b), U64C(0xab1c5ed5da6d8118),
U64C(0xd807aa98a3030242), U64C(0x12835b0145706fbe), U64C(0x243185be4ee4b28c), U64C(0x550c7dc3d5ffb4e2),
U64C(0x72be5d74f27b896f), U64C(0x80deb1fe3b1696b1), U64C(0x9bdc06a725c71235), U64C(0xc19bf174cf692694),
U64C(0xe49b69c19ef14ad2), U64C(0xefbe4786384f25e3), U64C(0x0fc19dc68b8cd5b5), U64C(0x240ca1cc77ac9c65),
U64C(0x2de92c6f592b0275), U64C(0x4a7484aa6ea6e483), U64C(0x5cb0a9dcbd41fbd4), U64C(0x76f988da831153b5),
U64C(0x983e5152ee66dfab), U64C(0xa831c66d2db43210), U64C(0xb00327c898fb213f), U64C(0xbf597fc7beef0ee4),
U64C(0xc6e00bf33da88fc2), U64C(0xd5a79147930aa725), U64C(0x06ca6351e003826f), U64C(0x142929670a0e6e70),
U64C(0x27b70a8546d22ffc), U64C(0x2e1b21385c26c926), U64C(0x4d2c6dfc5ac42aed), U64C(0x53380d139d95b3df),
U64C(0x650a73548baf63de), U64C(0x766a0abb3c77b2a8), U64C(0x81c2c92e47edaee6), U64C(0x92722c851482353b),
U64C(0xa2bfe8a14cf10364), U64C(0xa81a664bbc423001), U64C(0xc24b8b70d0f89791), U64C(0xc76c51a30654be30),
U64C(0xd192e819d6ef5218), U64C(0xd69906245565a910), U64C(0xf40e35855771202a), U64C(0x106aa07032bbd1b8),
U64C(0x19a4c116b8d2d0c8), U64C(0x1e376c085141ab53), U64C(0x2748774cdf8eeb99), U64C(0x34b0bcb5e19b48a8),
U64C(0x391c0cb3c5c95a63), U64C(0x4ed8aa4ae3418acb), U64C(0x5b9cca4f7763e373), U64C(0x682e6ff3d6b2b8a3),
U64C(0x748f82ee5defb2fc), U64C(0x78a5636f43172f60), U64C(0x84c87814a1f0ab72), U64C(0x8cc702081a6439ec),
U64C(0x90befffa23631e28), U64C(0xa4506cebde82bde9), U64C(0xbef9a3f7b2c67915), U64C(0xc67178f2e372532b),
U64C(0xca273eceea26619c), U64C(0xd186b8c721c0c207), U64C(0xeada7dd6cde0eb1e), U64C(0xf57d4f7fee6ed178),
U64C(0x06f067aa72176fba), U64C(0x0a637dc5a2c898a6), U64C(0x113f9804bef90dae), U64C(0x1b710b35131c471b),
U64C(0x28db77f523047d84), U64C(0x32caab7b40c72493), U64C(0x3c9ebe0a15c9bebc), U64C(0x431d67c49c100d4c),
U64C(0x4cc5d4becb3e42b6), U64C(0x597f299cfc657e2a), U64C(0x5fcb6fab3ad6faec), U64C(0x6c44198c4a475817)
};
#define K SHA512_K_ARRAY
Z7_NO_INLINE
void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks)
{
UInt64 W
#ifdef Z7_SHA512_BIG_W
[80];
#else
[16];
#endif
unsigned j;
UInt64 a,b,c,d,e,f,g,h;
#if !defined(Z7_SHA512_UNROLL) || (STEP_MAIN <= 4) || (STEP_PRE <= 4)
UInt64 tmp;
#endif
if (numBlocks == 0) return;
a = state[0];
b = state[1];
c = state[2];
d = state[3];
e = state[4];
f = state[5];
g = state[6];
h = state[7];
do
{
for (j = 0; j < 16; j += STEP_PRE)
{
#if STEP_PRE > 4
#if STEP_PRE < 8
R4_PRE(0);
#else
R8_PRE(0);
#if STEP_PRE == 16
R8_PRE(8);
#endif
#endif
#else
R1_PRE(0)
#if STEP_PRE >= 2
R1_PRE(1)
#if STEP_PRE >= 4
R1_PRE(2)
R1_PRE(3)
#endif
#endif
#endif
}
for (j = 16; j < 80; j += STEP_MAIN)
{
#if defined(Z7_SHA512_UNROLL) && STEP_MAIN >= 8
#if STEP_MAIN < 8
R4_MAIN(0)
#else
R8_MAIN(0)
#if STEP_MAIN == 16
R8_MAIN(8)
#endif
#endif
#else
R1_MAIN(0)
#if STEP_MAIN >= 2
R1_MAIN(1)
#if STEP_MAIN >= 4
R2_MAIN(2)
#if STEP_MAIN >= 8
R2_MAIN(4)
R2_MAIN(6)
#if STEP_MAIN >= 16
R2_MAIN(8)
R2_MAIN(10)
R2_MAIN(12)
R2_MAIN(14)
#endif
#endif
#endif
#endif
#endif
}
a += state[0]; state[0] = a;
b += state[1]; state[1] = b;
c += state[2]; state[2] = c;
d += state[3]; state[3] = d;
e += state[4]; state[4] = e;
f += state[5]; state[5] = f;
g += state[6]; state[6] = g;
h += state[7]; state[7] = h;
data += SHA512_BLOCK_SIZE;
}
while (--numBlocks);
}
#define Sha512_UpdateBlock(p) SHA512_UPDATE_BLOCKS(p)(p->state, p->buffer, 1)
void Sha512_Update(CSha512 *p, const Byte *data, size_t size)
{
if (size == 0)
return;
{
const unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1);
const unsigned num = SHA512_BLOCK_SIZE - pos;
p->v.vars.count += size;
if (num > size)
{
memcpy(p->buffer + pos, data, size);
return;
}
if (pos != 0)
{
size -= num;
memcpy(p->buffer + pos, data, num);
data += num;
Sha512_UpdateBlock(p);
}
}
{
const size_t numBlocks = size >> 7;
// if (numBlocks)
SHA512_UPDATE_BLOCKS(p)(p->state, data, numBlocks);
size &= SHA512_BLOCK_SIZE - 1;
if (size == 0)
return;
data += (numBlocks << 7);
memcpy(p->buffer, data, size);
}
}
void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize)
{
unsigned pos = (unsigned)p->v.vars.count & (SHA512_BLOCK_SIZE - 1);
p->buffer[pos++] = 0x80;
if (pos > (SHA512_BLOCK_SIZE - 8 * 2))
{
while (pos != SHA512_BLOCK_SIZE) { p->buffer[pos++] = 0; }
// memset(&p->buf.buffer[pos], 0, SHA512_BLOCK_SIZE - pos);
Sha512_UpdateBlock(p);
pos = 0;
}
memset(&p->buffer[pos], 0, (SHA512_BLOCK_SIZE - 8 * 2) - pos);
{
const UInt64 numBits = p->v.vars.count << 3;
SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 2, 0) // = (p->v.vars.count >> (64 - 3)); (high 64-bits)
SetBe64(p->buffer + SHA512_BLOCK_SIZE - 8 * 1, numBits)
}
Sha512_UpdateBlock(p);
#if 1 && defined(MY_CPU_BE)
memcpy(digest, p->state, digestSize);
#else
{
const unsigned numWords = digestSize >> 3;
unsigned i;
for (i = 0; i < numWords; i++)
{
const UInt64 v = p->state[i];
SetBe64(digest, v)
digest += 8;
}
if (digestSize & 4) // digestSize == SHA512_224_DIGEST_SIZE
{
const UInt32 v = (UInt32)((p->state[numWords]) >> 32);
SetBe32(digest, v)
}
}
#endif
Sha512_InitState(p, digestSize);
}
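The tail handling above implements the FIPS 180-4 padding rule: append a single 0x80 byte, zero-fill up to the last 16 bytes of the final block, and store the message length in bits as a 128-bit big-endian value (the high 64 bits stay zero for any practical input size). A minimal standalone check of that length encoding, not part of the 7-Zip sources, using an illustrative 3-byte message:

/* check the SHA-512 length padding: 0x80 marker, zero fill,
   then the bit count as big-endian in the final 16 bytes */
#include <stdio.h>
#include <string.h>

int main(void)
{
  unsigned char block[128];
  const unsigned long long msgBytes = 3;            /* example message size */
  const unsigned long long numBits = msgBytes << 3; /* 24 = 0x18 */
  unsigned i;
  memset(block, 0, sizeof(block));
  block[msgBytes] = 0x80;
  /* bytes 112..119 (high 64 bits of the 128-bit length) remain zero */
  for (i = 0; i < 8; i++)
    block[120 + i] = (unsigned char)(numBits >> (56 - 8 * i));
  printf("%02x %02x\n", (unsigned)block[3], (unsigned)block[127]); /* prints: 80 18 */
  return 0;
}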
// #define Z7_SHA512_PROBE_DEBUG // for debug
#if defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED)
#if defined(Z7_SHA512_PROBE_DEBUG) \
|| defined(_WIN32) && defined(MY_CPU_ARM64)
#ifndef Z7_SHA512_USE_PROBE
#define Z7_SHA512_USE_PROBE
#endif
#endif
#ifdef Z7_SHA512_USE_PROBE
#ifdef Z7_SHA512_PROBE_DEBUG
#include <stdio.h>
#define PRF(x) x
#else
#define PRF(x)
#endif
#if 0 || !defined(_MSC_VER) // 1 || : for debug LONGJMP mode
// MINGW doesn't support __try. So we use signal() / longjmp().
// Note: signal() / longjmp() is probably not thread-safe.
// So we must call Sha512Prepare() from the main thread at program start.
#ifndef Z7_SHA512_USE_LONGJMP
#define Z7_SHA512_USE_LONGJMP
#endif
#endif
#ifdef Z7_SHA512_USE_LONGJMP
#include <signal.h>
#include <setjmp.h>
static jmp_buf g_Sha512_jmp_buf;
// static int g_Sha512_Unsupported;
#if defined(__GNUC__) && (__GNUC__ >= 8) \
|| defined(__clang__) && (__clang_major__ >= 3)
__attribute__((noreturn))
#endif
static void Z7_CDECL Sha512_signal_Handler(int v)
{
PRF(printf("======== Sha512_signal_Handler = %x\n", (unsigned)v);)
// g_Sha512_Unsupported = 1;
longjmp(g_Sha512_jmp_buf, 1);
}
#endif // Z7_SHA512_USE_LONGJMP
#if defined(_WIN32)
#include "7zWindows.h"
#endif
#if defined(MY_CPU_ARM64)
// #define Z7_SHA512_USE_SIMPLIFIED_PROBE // for debug
#endif
#ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE
#include <arm_neon.h>
#if defined(__clang__)
__attribute__((__target__("sha3")))
#elif !defined(_MSC_VER)
__attribute__((__target__("arch=armv8.2-a+sha3")))
#endif
#endif
static BoolInt CPU_IsSupported_SHA512_Probe(void)
{
PRF(printf("\n== CPU_IsSupported_SHA512_Probe\n");)
#if defined(_WIN32) && defined(MY_CPU_ARM64)
// we still have no SHA512 flag for IsProcessorFeaturePresent().
if (!CPU_IsSupported_CRYPTO())
return False;
PRF(printf("==== Registry check\n");)
{
// we can't read the ID_AA64ISAR0_EL1 register from an application,
// but that register is mapped to the "CP 4030" registry value.
HKEY key = NULL;
LONG res = RegOpenKeyEx(HKEY_LOCAL_MACHINE,
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
0, KEY_READ, &key);
if (res != ERROR_SUCCESS)
return False;
{
DWORD type = 0;
DWORD count = sizeof(UInt64);
UInt64 val = 0;
res = RegQueryValueEx(key, TEXT("CP 4030"), NULL,
&type, (LPBYTE)&val, &count);
RegCloseKey(key);
if (res != ERROR_SUCCESS
|| type != REG_QWORD
|| count != sizeof(UInt64)
|| ((unsigned)(val >> 12) & 0xf) != 2)
return False;
// we parse SHA2 field of ID_AA64ISAR0_EL1 register:
// 0 : No SHA2 instructions implemented
// 1 : SHA256 implemented
// 2 : SHA256 and SHA512 implemented
}
}
#endif // defined(_WIN32) && defined(MY_CPU_ARM64)
#if 1 // 0 for debug to disable SHA512 PROBE code
/*
----- SHA512 PROBE -----
We suppose that reading the "CP 4030" registry value is enough.
But we use additional SHA512 PROBE code, because
we can catch an exception here, while we don't catch exceptions
when Sha512 functions are called from the main code.
NOTE: the arm64 PROBE code doesn't work if we call it via Wine in linux-arm64.
The program just stops.
The x64 version of the PROBE code also doesn't work if we run it via the Intel SDE
emulator without SHA512 support (-skl switch).
The program stops, and we get this message from SDE:
TID 0 SDE-ERROR: Executed instruction not valid for specified chip (SKYLAKE): vsha512msg1
But we still want to catch that exception instead of having the process stop.
Does this PROBE code work in native Windows-arm64 (with/without sha512 hw instructions)?
Are there any ways to fix the problems with arm64-wine and x64-SDE cases?
*/
PRF(printf("==== CPU_IsSupported_SHA512 PROBE\n");)
{
BoolInt isSupported = False;
#ifdef Z7_SHA512_USE_LONGJMP
void (Z7_CDECL *signal_prev)(int);
/*
if (g_Sha512_Unsupported)
{
PRF(printf("==== g_Sha512_Unsupported\n");)
return False;
}
*/
printf("====== signal(SIGILL)\n");
signal_prev = signal(SIGILL, Sha512_signal_Handler);
if (signal_prev == SIG_ERR)
{
PRF(printf("====== signal fail\n");)
return False;
}
// PRF(printf("==== signal_prev = %p\n", (void *)signal_prev);)
// docs: Before the specified function is executed,
// the value of func is set to SIG_DFL.
// So we can exit if (setjmp(g_Sha512_jmp_buf) != 0).
PRF(printf("====== setjmp\n");)
if (!setjmp(g_Sha512_jmp_buf))
#else // Z7_SHA512_USE_LONGJMP
#ifdef _MSC_VER
#ifdef __clang_major__
#pragma GCC diagnostic ignored "-Wlanguage-extension-token"
#endif
__try
#endif
#endif // Z7_SHA512_USE_LONGJMP
{
#if defined(Z7_COMPILER_SHA512_SUPPORTED)
#ifdef Z7_SHA512_USE_SIMPLIFIED_PROBE
// simplified sha512 check for arm64:
const uint64x2_t a = vdupq_n_u64(1);
const uint64x2_t b = vsha512hq_u64(a, a, a);
PRF(printf("======== vsha512hq_u64 probe\n");)
if ((UInt32)vgetq_lane_u64(b, 0) == 0x11800002)
#else
MY_ALIGN(16)
UInt64 temp[SHA512_NUM_DIGEST_WORDS + SHA512_NUM_BLOCK_WORDS];
memset(temp, 0x5a, sizeof(temp));
PRF(printf("======== Sha512_UpdateBlocks_HW\n");)
Sha512_UpdateBlocks_HW(temp,
(const Byte *)(const void *)(temp + SHA512_NUM_DIGEST_WORDS), 1);
// PRF(printf("======== t = %x\n", (UInt32)temp[0]);)
if ((UInt32)temp[0] == 0xa33cfdf7)
#endif
{
PRF(printf("======== PROBE SHA512: SHA512 is supported\n");)
isSupported = True;
}
#else // Z7_COMPILER_SHA512_SUPPORTED
// for debug : we generate a bad instruction or raise an exception.
// __except() doesn't catch raise() calls.
#ifdef Z7_SHA512_USE_LONGJMP
PRF(printf("====== raise(SIGILL)\n");)
raise(SIGILL);
#else
#if defined(_MSC_VER) && defined(MY_CPU_X86)
__asm ud2
#endif
#endif // Z7_SHA512_USE_LONGJMP
#endif // Z7_COMPILER_SHA512_SUPPORTED
}
#ifdef Z7_SHA512_USE_LONGJMP
PRF(printf("====== restore signal SIGILL\n");)
signal(SIGILL, signal_prev);
#elif _MSC_VER
__except (EXCEPTION_EXECUTE_HANDLER)
{
PRF(printf("==== CPU_IsSupported_SHA512 __except(EXCEPTION_EXECUTE_HANDLER)\n");)
}
#endif
PRF(printf("== return (sha512 supported) = %d\n", isSupported);)
return isSupported;
}
#else
// without SHA512 PROBE code
return True;
#endif
}
#endif // Z7_SHA512_USE_PROBE
#endif // defined(Z7_SHA512_PROBE_DEBUG) || defined(Z7_COMPILER_SHA512_SUPPORTED)
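For reference, the probe above boils down to a small pattern: install a SIGILL handler, setjmp, execute one candidate instruction, and longjmp out of the handler if the cpu faults. A minimal sketch under the same assumptions stated in the comments (single-threaded, run once at program start; probe_insn is a hypothetical callback, not a 7-Zip function):

#include <setjmp.h>
#include <signal.h>

static jmp_buf g_probe_jmp;

static void probe_handler(int v)
{
  (void)v;
  longjmp(g_probe_jmp, 1);  /* unwind out of the faulting instruction */
}

static int insn_is_supported(void (*probe_insn)(void))
{
  int ok = 0;
  void (*prev)(int) = signal(SIGILL, probe_handler);
  if (prev == SIG_ERR)
    return 0;
  if (!setjmp(g_probe_jmp))
  {
    probe_insn();  /* raises SIGILL if the instruction is unsupported */
    ok = 1;        /* reached only when no fault occurred */
  }
  signal(SIGILL, prev);  /* restore the previous handler */
  return ok;
}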
void Sha512Prepare(void)
{
#ifdef Z7_COMPILER_SHA512_SUPPORTED
SHA512_FUNC_UPDATE_BLOCKS f, f_hw;
f = Sha512_UpdateBlocks;
f_hw = NULL;
#ifdef Z7_SHA512_USE_PROBE
if (CPU_IsSupported_SHA512_Probe())
#elif defined(MY_CPU_X86_OR_AMD64)
if (CPU_IsSupported_SHA512() && CPU_IsSupported_AVX2())
#else
if (CPU_IsSupported_SHA512())
#endif
{
// printf("\n========== HW SHA512 ======== \n");
f = f_hw = Sha512_UpdateBlocks_HW;
}
g_SHA512_FUNC_UPDATE_BLOCKS = f;
g_SHA512_FUNC_UPDATE_BLOCKS_HW = f_hw;
#elif defined(Z7_SHA512_PROBE_DEBUG)
CPU_IsSupported_SHA512_Probe(); // for debug
#endif
}
#undef K
#undef S0
#undef S1
#undef s0
#undef s1
#undef Ch
#undef Maj
#undef W_MAIN
#undef W_PRE
#undef w
#undef blk2_main
#undef blk2
#undef T1
#undef T4
#undef T8
#undef R1_PRE
#undef R1_MAIN
#undef R2_MAIN
#undef R4
#undef R4_PRE
#undef R4_MAIN
#undef R8
#undef R8_PRE
#undef R8_MAIN
#undef STEP_PRE
#undef STEP_MAIN
#undef Z7_SHA512_BIG_W
#undef Z7_SHA512_UNROLL
#undef Z7_COMPILER_SHA512_SUPPORTED
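For reference, the S0/S1/s0/s1/Ch/Maj macros consumed by the round macros above compute the standard FIPS 180-4 SHA-512 functions. A plain, unoptimized rendering of them (rotr64, Ch_f, and Maj_f are local helper names, not 7-Zip identifiers):

#include <stdint.h>

static uint64_t rotr64(uint64_t x, unsigned n) { return (x >> n) | (x << (64 - n)); }

/* "big" sigma functions, applied to the working variables a and e */
static uint64_t Sigma0(uint64_t x) { return rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39); }
static uint64_t Sigma1(uint64_t x) { return rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41); }

/* "small" sigma functions, used by the message schedule */
static uint64_t sigma0(uint64_t x) { return rotr64(x, 1) ^ rotr64(x, 8) ^ (x >> 7); }
static uint64_t sigma1(uint64_t x) { return rotr64(x, 19) ^ rotr64(x, 61) ^ (x >> 6); }

/* choice and majority */
static uint64_t Ch_f (uint64_t e, uint64_t f, uint64_t g) { return (e & f) ^ (~e & g); }
static uint64_t Maj_f(uint64_t a, uint64_t b, uint64_t c) { return (a & b) ^ (a & c) ^ (b & c); }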

86
C/Sha512.h Normal file
View file

@ -0,0 +1,86 @@
/* Sha512.h -- SHA-512 Hash
: Igor Pavlov : Public domain */
#ifndef ZIP7_INC_SHA512_H
#define ZIP7_INC_SHA512_H
#include "7zTypes.h"
EXTERN_C_BEGIN
#define SHA512_NUM_BLOCK_WORDS 16
#define SHA512_NUM_DIGEST_WORDS 8
#define SHA512_BLOCK_SIZE (SHA512_NUM_BLOCK_WORDS * 8)
#define SHA512_DIGEST_SIZE (SHA512_NUM_DIGEST_WORDS * 8)
#define SHA512_224_DIGEST_SIZE (224 / 8)
#define SHA512_256_DIGEST_SIZE (256 / 8)
#define SHA512_384_DIGEST_SIZE (384 / 8)
typedef void (Z7_FASTCALL *SHA512_FUNC_UPDATE_BLOCKS)(UInt64 state[8], const Byte *data, size_t numBlocks);
/*
if (the system supports different SHA512 code implementations)
{
(CSha512::func_UpdateBlocks) will be used
(CSha512::func_UpdateBlocks) can be set by
Sha512_Init() - to default (fastest)
Sha512_SetFunction() - to any algo
}
else
{
(CSha512::func_UpdateBlocks) is ignored.
}
*/
typedef struct
{
union
{
struct
{
SHA512_FUNC_UPDATE_BLOCKS func_UpdateBlocks;
UInt64 count;
} vars;
UInt64 _pad_64bit[8];
void *_pad_align_ptr[2];
} v;
UInt64 state[SHA512_NUM_DIGEST_WORDS];
Byte buffer[SHA512_BLOCK_SIZE];
} CSha512;
#define SHA512_ALGO_DEFAULT 0
#define SHA512_ALGO_SW 1
#define SHA512_ALGO_HW 2
/*
Sha512_SetFunction()
return:
0 - (algo) value is not supported, and func_UpdateBlocks was not changed
1 - func_UpdateBlocks was set according (algo) value.
*/
BoolInt Sha512_SetFunction(CSha512 *p, unsigned algo);
// we support only these (digestSize) values: 224/8, 256/8, 384/8, 512/8
void Sha512_InitState(CSha512 *p, unsigned digestSize);
void Sha512_Init(CSha512 *p, unsigned digestSize);
void Sha512_Update(CSha512 *p, const Byte *data, size_t size);
void Sha512_Final(CSha512 *p, Byte *digest, unsigned digestSize);
// void Z7_FASTCALL Sha512_UpdateBlocks(UInt64 state[8], const Byte *data, size_t numBlocks);
/*
call Sha512Prepare() once at program start.
It prepares all supported implementations, and detects the fastest implementation.
*/
void Sha512Prepare(void);
EXTERN_C_END
#endif
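A minimal usage sketch of this API, following the header's own comments (Sha512Prepare() once at program start, then Init/Update/Final); the test program itself is illustrative and not part of the 7-Zip sources:

#include <stdio.h>
#include "Sha512.h"

int main(void)
{
  CSha512 ctx;
  Byte digest[SHA512_DIGEST_SIZE];
  const Byte data[3] = { 'a', 'b', 'c' };
  unsigned i;
  Sha512Prepare();  /* detect and select the fastest implementation */
  Sha512_Init(&ctx, SHA512_DIGEST_SIZE);
  Sha512_Update(&ctx, data, sizeof(data));  /* can be called repeatedly */
  Sha512_Final(&ctx, digest, SHA512_DIGEST_SIZE);
  for (i = 0; i < SHA512_DIGEST_SIZE; i++)
    printf("%02x", (unsigned)digest[i]);
  printf("\n");
  return 0;
}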

395
C/Sha512Opt.c Normal file
View file

@ -0,0 +1,395 @@
/* Sha512Opt.c -- SHA-512 optimized code for SHA-512 hardware instructions
: Igor Pavlov : Public domain */
#include "Precomp.h"
#include "Compiler.h"
#include "CpuArch.h"
// #define Z7_USE_HW_SHA_STUB // for debug
#ifdef MY_CPU_X86_OR_AMD64
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 2400) && (__INTEL_COMPILER <= 9900) // fix it
#define USE_HW_SHA
#elif defined(Z7_LLVM_CLANG_VERSION) && (Z7_LLVM_CLANG_VERSION >= 170001) \
|| defined(Z7_APPLE_CLANG_VERSION) && (Z7_APPLE_CLANG_VERSION >= 170001) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 140000)
#define USE_HW_SHA
#if !defined(__INTEL_COMPILER)
// icc defines __GNUC__, but icc doesn't support __attribute__(__target__)
#if !defined(__SHA512__) || !defined(__AVX2__)
#define ATTRIB_SHA512 __attribute__((__target__("sha512,avx2")))
#endif
#endif
#elif defined(Z7_MSC_VER_ORIGINAL)
#if (_MSC_VER >= 1940)
#define USE_HW_SHA
#else
// #define Z7_USE_HW_SHA_STUB
#endif
#endif
// #endif // MY_CPU_X86_OR_AMD64
#ifndef USE_HW_SHA
// #define Z7_USE_HW_SHA_STUB // for debug
#endif
#ifdef USE_HW_SHA
// #pragma message("Sha512 HW")
#include <immintrin.h>
#if defined (__clang__) && defined(_MSC_VER)
#if !defined(__AVX__)
#include <avxintrin.h>
#endif
#if !defined(__AVX2__)
#include <avx2intrin.h>
#endif
#if !defined(__SHA512__)
#include <sha512intrin.h>
#endif
#else
#endif
/*
SHA512 uses:
AVX:
_mm256_loadu_si256 (vmovdqu)
_mm256_storeu_si256
_mm256_set_epi32 (unused)
AVX2:
_mm256_add_epi64 : vpaddq
_mm256_shuffle_epi8 : vpshufb
_mm256_shuffle_epi32 : pshufd
_mm256_blend_epi32 : vpblendd
_mm256_permute4x64_epi64 : vpermq : 3c
_mm256_permute2x128_si256: vperm2i128 : 3c
_mm256_extracti128_si256 : vextracti128 : 3c
SHA512:
_mm256_sha512*
*/
// The K array must be aligned for 32 bytes at least.
// The compiler can check the align attribute and select:
// vmovdqu - for code without the align attribute
// vmovdqa - for code with the align attribute
extern
MY_ALIGN(64)
const UInt64 SHA512_K_ARRAY[80];
#define K SHA512_K_ARRAY
#define ADD_EPI64(dest, src) dest = _mm256_add_epi64(dest, src);
#define SHA512_MSG1(dest, src) dest = _mm256_sha512msg1_epi64(dest, _mm256_extracti128_si256(src, 0));
#define SHA512_MSG2(dest, src) dest = _mm256_sha512msg2_epi64(dest, src);
#define LOAD_SHUFFLE(m, k) \
m = _mm256_loadu_si256((const __m256i *)(const void *)(data + (k) * 32)); \
m = _mm256_shuffle_epi8(m, mask); \

#define NNN(m0, m1, m2, m3)
#define SM1(m1, m2, m3, m0) \
SHA512_MSG1(m0, m1); \

#define SM2(m2, m3, m0, m1) \
ADD_EPI64(m0, _mm256_permute4x64_epi64(_mm256_blend_epi32(m2, m3, 3), 0x39)); \
SHA512_MSG2(m0, m3); \

#define RND2(t0, t1, lane) \
t0 = _mm256_sha512rnds2_epi64(t0, t1, _mm256_extracti128_si256(msg, lane));
#define R4(k, m0, m1, m2, m3, OP0, OP1) \
msg = _mm256_add_epi64(m0, *(const __m256i *) (const void *) &K[(k) * 4]); \
RND2(state0, state1, 0); OP0(m0, m1, m2, m3) \
RND2(state1, state0, 1); OP1(m0, m1, m2, m3) \

#define R16(k, OP0, OP1, OP2, OP3, OP4, OP5, OP6, OP7) \
R4 ( (k)*4+0, m0,m1,m2,m3, OP0, OP1 ) \
R4 ( (k)*4+1, m1,m2,m3,m0, OP2, OP3 ) \
R4 ( (k)*4+2, m2,m3,m0,m1, OP4, OP5 ) \
R4 ( (k)*4+3, m3,m0,m1,m2, OP6, OP7 ) \

#define PREPARE_STATE \
state0 = _mm256_shuffle_epi32(state0, 0x4e); /* cdab */ \
state1 = _mm256_shuffle_epi32(state1, 0x4e); /* ghef */ \
tmp = state0; \
state0 = _mm256_permute2x128_si256(state0, state1, 0x13); /* cdgh */ \
state1 = _mm256_permute2x128_si256(tmp, state1, 2); /* abef */ \

void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA512
ATTRIB_SHA512
#endif
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
const __m256i mask = _mm256_set_epi32(
0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607,
0x08090a0b,0x0c0d0e0f, 0x00010203,0x04050607);
__m256i tmp, state0, state1;
if (numBlocks == 0)
return;
state0 = _mm256_loadu_si256((const __m256i *) (const void *) &state[0]);
state1 = _mm256_loadu_si256((const __m256i *) (const void *) &state[4]);
PREPARE_STATE
do
{
__m256i state0_save, state1_save;
__m256i m0, m1, m2, m3;
__m256i msg;
// #define msg tmp
state0_save = state0;
state1_save = state1;
LOAD_SHUFFLE (m0, 0)
LOAD_SHUFFLE (m1, 1)
LOAD_SHUFFLE (m2, 2)
LOAD_SHUFFLE (m3, 3)
R16 ( 0, NNN, NNN, SM1, NNN, SM1, SM2, SM1, SM2 )
R16 ( 1, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 2, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 3, SM1, SM2, SM1, SM2, SM1, SM2, SM1, SM2 )
R16 ( 4, SM1, SM2, NNN, SM2, NNN, NNN, NNN, NNN )
ADD_EPI64(state0, state0_save)
ADD_EPI64(state1, state1_save)
data += 128;
}
while (--numBlocks);
PREPARE_STATE
_mm256_storeu_si256((__m256i *) (void *) &state[0], state0);
_mm256_storeu_si256((__m256i *) (void *) &state[4], state1);
}
#endif // USE_HW_SHA
// gcc 8.5 also supports sha512, but we also need support in the assembler that is called by gcc
#elif defined(MY_CPU_ARM64) && defined(MY_CPU_LE)
#if defined(__ARM_FEATURE_SHA512)
#define USE_HW_SHA
#else
#if (defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 130000) \
|| defined(__GNUC__) && (__GNUC__ >= 9) \
) \
|| defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1940) // fix it
#define USE_HW_SHA
#endif
#endif
#ifdef USE_HW_SHA
// #pragma message("=== Sha512 HW === ")
#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_SHA512)
// #pragma message("=== we define SHA3 ATTRIB_SHA512 === ")
#if defined(__clang__)
#define ATTRIB_SHA512 __attribute__((__target__("sha3"))) // "armv8.2-a,sha3"
#else
#define ATTRIB_SHA512 __attribute__((__target__("arch=armv8.2-a+sha3")))
#endif
#endif
#endif
#if defined(Z7_MSC_VER_ORIGINAL)
#include <arm64_neon.h>
#else
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_SHA512)
// #pragma message("=== we set __ARM_FEATURE_SHA512 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_SHA512_WAS_SET 1
#define __ARM_FEATURE_SHA512 1
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang
#include <arm_neon.h>
#if defined(Z7_ARM_FEATURE_SHA512_WAS_SET) && \
defined(__ARM_FEATURE_SHA512)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_SHA512
#undef Z7_ARM_FEATURE_SHA512_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif
#endif // Z7_MSC_VER_ORIGINAL
typedef uint64x2_t v128_64;
// typedef __n128 v128_64; // MSVC
#ifdef MY_CPU_BE
#define MY_rev64_for_LE(x) x
#else
#define MY_rev64_for_LE(x) vrev64q_u8(x)
#endif
#define LOAD_128_64(_p) vld1q_u64(_p)
#define LOAD_128_8(_p) vld1q_u8 (_p)
#define STORE_128_64(_p, _v) vst1q_u64(_p, _v)
#define LOAD_SHUFFLE(m, k) \
m = vreinterpretq_u64_u8( \
MY_rev64_for_LE( \
LOAD_128_8(data + (k) * 16))); \

// The K array must be aligned for 16 bytes at least.
extern
MY_ALIGN(64)
const UInt64 SHA512_K_ARRAY[80];
#define K SHA512_K_ARRAY
#define NN(m0, m1, m4, m5, m7)
#define SM(m0, m1, m4, m5, m7) \
m0 = vsha512su1q_u64(vsha512su0q_u64(m0, m1), m7, vextq_u64(m4, m5, 1));
#define R2(k, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP) \
OP(m0, m1, m4, m5, m7) \
t = vaddq_u64(m0, vld1q_u64(k)); \
t = vaddq_u64(vextq_u64(t, t, 1), a3); \
t = vsha512hq_u64(t, vextq_u64(a2, a3, 1), vextq_u64(a1, a2, 1)); \
a3 = vsha512h2q_u64(t, a1, a0); \
a1 = vaddq_u64(a1, t); \

#define R8(k, m0,m1,m2,m3,m4,m5,m6,m7, OP) \
R2 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, a0,a1,a2,a3, OP ) \
R2 ( (k)+1*2, m1,m2,m3,m4,m5,m6,m7,m0, a3,a0,a1,a2, OP ) \
R2 ( (k)+2*2, m2,m3,m4,m5,m6,m7,m0,m1, a2,a3,a0,a1, OP ) \
R2 ( (k)+3*2, m3,m4,m5,m6,m7,m0,m1,m2, a1,a2,a3,a0, OP ) \

#define R16(k, OP) \
R8 ( (k)+0*2, m0,m1,m2,m3,m4,m5,m6,m7, OP ) \
R8 ( (k)+4*2, m4,m5,m6,m7,m0,m1,m2,m3, OP ) \

void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
#ifdef ATTRIB_SHA512
ATTRIB_SHA512
#endif
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
v128_64 a0, a1, a2, a3;
if (numBlocks == 0)
return;
a0 = LOAD_128_64(&state[0]);
a1 = LOAD_128_64(&state[2]);
a2 = LOAD_128_64(&state[4]);
a3 = LOAD_128_64(&state[6]);
do
{
v128_64 a0_save, a1_save, a2_save, a3_save;
v128_64 m0, m1, m2, m3, m4, m5, m6, m7;
v128_64 t;
unsigned i;
const UInt64 *k_ptr;
LOAD_SHUFFLE (m0, 0)
LOAD_SHUFFLE (m1, 1)
LOAD_SHUFFLE (m2, 2)
LOAD_SHUFFLE (m3, 3)
LOAD_SHUFFLE (m4, 4)
LOAD_SHUFFLE (m5, 5)
LOAD_SHUFFLE (m6, 6)
LOAD_SHUFFLE (m7, 7)
a0_save = a0;
a1_save = a1;
a2_save = a2;
a3_save = a3;
R16 ( K, NN )
k_ptr = K + 16;
for (i = 0; i < 4; i++)
{
R16 ( k_ptr, SM )
k_ptr += 16;
}
a0 = vaddq_u64(a0, a0_save);
a1 = vaddq_u64(a1, a1_save);
a2 = vaddq_u64(a2, a2_save);
a3 = vaddq_u64(a3, a3_save);
data += 128;
}
while (--numBlocks);
STORE_128_64(&state[0], a0);
STORE_128_64(&state[2], a1);
STORE_128_64(&state[4], a2);
STORE_128_64(&state[6], a3);
}
#endif // USE_HW_SHA
#endif // MY_CPU_ARM_OR_ARM64
#if !defined(USE_HW_SHA) && defined(Z7_USE_HW_SHA_STUB)
// #error Stop_Compiling_UNSUPPORTED_SHA
// #include <stdlib.h>
// We can compile this file with another C compiler,
// or we can compile asm version.
// So we can generate real code instead of this stub function.
// #include "Sha512.h"
// #if defined(_MSC_VER)
#pragma message("Sha512 HW-SW stub was used")
// #endif
void Z7_FASTCALL Sha512_UpdateBlocks (UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks);
void Z7_FASTCALL Sha512_UpdateBlocks_HW(UInt64 state[8], const Byte *data, size_t numBlocks)
{
Sha512_UpdateBlocks(state, data, numBlocks);
/*
UNUSED_VAR(state);
UNUSED_VAR(data);
UNUSED_VAR(numBlocks);
exit(1);
return;
*/
}
#endif
#undef K
#undef RND2
#undef MY_rev64_for_LE
#undef NN
#undef NNN
#undef LOAD_128
#undef STORE_128
#undef LOAD_SHUFFLE
#undef SM1
#undef SM2
#undef SM
#undef R2
#undef R4
#undef R16
#undef PREPARE_STATE
#undef USE_HW_SHA
#undef ATTRIB_SHA512
#undef USE_VER_MIN
#undef Z7_USE_HW_SHA_STUB
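The ATTRIB_SHA512 attribute defined above is what allows a single translation unit to contain SHA-512/AVX2 intrinsics while the rest of the build keeps baseline compiler flags. A minimal sketch of that per-function target pattern for GCC 14+ / clang 17+ (versions per the checks above; the function name is illustrative):

#include <immintrin.h>

/* only this function is compiled as if -msha512 -mavx2 were given */
__attribute__((__target__("sha512,avx2")))
__m256i use_sha512_msg2(__m256i w, __m256i src)
{
  return _mm256_sha512msg2_epi64(w, src);  /* same intrinsic as SHA512_MSG2 above */
}

The caller must still verify cpu support at run time (as CPU_IsSupported_SHA512_Probe() does above) before reaching such a function.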

367
C/Sort.c
View file

@ -1,141 +1,268 @@
/* Sort.c -- Sort functions
: Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Sort.h"
#include "CpuArch.h"

#if ( (defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \
   || (defined(__clang__) && Z7_has_builtin(__builtin_prefetch)) \
   )
// the code with prefetch is slow for small arrays on x86.
// So we disable prefetch for x86.
#ifndef MY_CPU_X86
// #pragma message("Z7_PREFETCH : __builtin_prefetch")
#define Z7_PREFETCH(a) __builtin_prefetch((a))
#endif

#elif defined(_WIN32) // || defined(_MSC_VER) && (_MSC_VER >= 1200)

#include "7zWindows.h"

// NOTE: CLANG/GCC/MSVC can define different values for _MM_HINT_T0 / PF_TEMPORAL_LEVEL_1.
// For example, clang-cl can generate a "prefetcht2" instruction for a
// PreFetchCacheLine(PF_TEMPORAL_LEVEL_1) call.
// But we want to generate a "prefetcht0" instruction.
// So for CLANG/GCC we must use __builtin_prefetch() in the code branch above
// instead of PreFetchCacheLine() / _mm_prefetch().

// The new msvc-x86 compiler generates a "prefetcht0" instruction for a PreFetchCacheLine() call.
// But old x86 cpus don't support "prefetcht0".
// So we will use PreFetchCacheLine() only if we are sure that the
// generated instruction is supported by all cpus of that isa.
#if defined(MY_CPU_AMD64) \
 || defined(MY_CPU_ARM64) \
 || defined(MY_CPU_IA64)
// we need to use additional braces for (a) in the PreFetchCacheLine call, because
// the PreFetchCacheLine macro doesn't use braces:
// #define PreFetchCacheLine(l, a) _mm_prefetch((CHAR CONST *) a, l)
// #pragma message("Z7_PREFETCH : PreFetchCacheLine")
#define Z7_PREFETCH(a) PreFetchCacheLine(PF_TEMPORAL_LEVEL_1, (a))
#endif

#endif // _WIN32


#define PREFETCH_NO(p,k,s,size)

#ifndef Z7_PREFETCH

#define SORT_PREFETCH(p,k,s,size)

#else

// #define PREFETCH_LEVEL 2 // use it if cache line is 32-bytes
#define PREFETCH_LEVEL 3 // it is fast for most cases (64-bytes cache line prefetch)
// #define PREFETCH_LEVEL 4 // it can be faster for big array (128-bytes prefetch)

#if PREFETCH_LEVEL == 0

#define SORT_PREFETCH(p,k,s,size)

#else // PREFETCH_LEVEL != 0

/*
if defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
  we prefetch one value per cache line.
  Use it if the array is aligned for cache line size (64 bytes)
  or if the array is small (less than L1 cache size).
if !defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
  we prefetch all cache lines that can be required.
  it can be faster for big unaligned arrays.
*/
#define USE_PREFETCH_FOR_ALIGNED_ARRAY

// s == k * 2
#if 0 && PREFETCH_LEVEL <= 3 && defined(MY_CPU_X86_OR_AMD64)
// x86 supports (lea r1*8+offset)
#define PREFETCH_OFFSET(k,s) ((s) << PREFETCH_LEVEL)
#else
#define PREFETCH_OFFSET(k,s) ((k) << (PREFETCH_LEVEL + 1))
#endif

#if 1 && PREFETCH_LEVEL <= 3 && defined(USE_PREFETCH_FOR_ALIGNED_ARRAY)
#define PREFETCH_ADD_OFFSET 0
#else
// last offset that can be required in a PREFETCH_LEVEL step:
#define PREFETCH_RANGE ((2 << PREFETCH_LEVEL) - 1)
#define PREFETCH_ADD_OFFSET PREFETCH_RANGE / 2
#endif

#if PREFETCH_LEVEL <= 3

#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_ADD_OFFSET; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
    Z7_PREFETCH((p + s2)); \
}}
#endif

#else // PREFETCH_LEVEL > 3

#ifdef USE_PREFETCH_FOR_ALIGNED_ARRAY
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE - 16 / 2; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - 16)); \
    Z7_PREFETCH((p + s2)); \
}}
#else /* for unaligned array */
#define SORT_PREFETCH(p,k,s,size) \
{ const size_t s2 = PREFETCH_OFFSET(k,s) + PREFETCH_RANGE; \
  if (s2 <= size) { \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE)); \
    Z7_PREFETCH((p + s2 - PREFETCH_RANGE / 2)); \
    Z7_PREFETCH((p + s2)); \
}}
#endif

#endif // PREFETCH_LEVEL > 3
#endif // PREFETCH_LEVEL != 0
#endif // Z7_PREFETCH


#if defined(MY_CPU_ARM64) \
    /* || defined(MY_CPU_AMD64) */ \
    /* || defined(MY_CPU_ARM) && !defined(_MSC_VER) */
// we want to use cmov, if cmov is very fast:
// - this cmov version is slower for clang-x64.
// - this cmov version is faster for gcc-arm64 for some fast arm64 cpus.
#define Z7_FAST_CMOV_SUPPORTED
#endif

#ifdef Z7_FAST_CMOV_SUPPORTED
// we want to use cmov here, if cmov is fast: new arm64 cpus.
// we want the compiler to use a conditional move for this branch:
#define GET_MAX_VAL(n0, n1, max_val_slow) if (n0 < n1) n0 = n1;
#else
// use this branch if the cpu doesn't support fast conditional move.
// it uses a slow array access read:
#define GET_MAX_VAL(n0, n1, max_val_slow) n0 = max_val_slow;
#endif

#define HeapSortDown(p, k, size, temp, macro_prefetch) \
{ \
  for (;;) { \
    UInt32 n0, n1; \
    size_t s = k * 2; \
    if (s >= size) { \
      if (s == size) { \
        n0 = p[s]; \
        p[k] = n0; \
        if (temp < n0) k = s; \
      } \
      break; \
    } \
    n0 = p[k * 2]; \
    n1 = p[k * 2 + 1]; \
    s += n0 < n1; \
    GET_MAX_VAL(n0, n1, p[s]) \
    if (temp >= n0) break; \
    macro_prefetch(p, k, s, size) \
    p[k] = n0; \
    k = s; \
  } \
  p[k] = temp; \
}

/*
  stage-1 : O(n) :
    we generate an intermediate partially sorted binary tree:
    p[0] : it's an additional item for better alignment of the tree structure in memory.
    p[1]
    p[2] p[3]
    p[4] p[5] p[6] p[7]
    ...
    p[x] >= p[x * 2]
    p[x] >= p[x * 2 + 1]
  stage-2 : O(n)*log2(N):
    we move the largest item p[0] from the head of the tree to the end of the array
    and insert the last item into the sorted binary tree.
*/
// (p) must be aligned for cache line size (64-bytes) for best performance

void Z7_FASTCALL HeapSort(UInt32 *p, size_t size)
{
  if (size < 2)
    return;
  if (size == 2)
  {
    const UInt32 a0 = p[0];
    const UInt32 a1 = p[1];
    const unsigned k = a1 < a0;
    p[k] = a0;
    p[k ^ 1] = a1;
    return;
  }
  {
    // stage-1 : O(n)
    // we transform the array to a partially sorted binary tree.
    size_t i = --size / 2;
    // (size) now is the index of the last item in the tree.
    // if (i)
    {
      do
      {
        const UInt32 temp = p[i];
        size_t k = i;
        HeapSortDown(p, k, size, temp, PREFETCH_NO)
      }
      while (--i);
    }
    {
      const UInt32 temp = p[0];
      const UInt32 a1 = p[1];
      if (temp < a1)
      {
        size_t k = 1;
        p[0] = a1;
        HeapSortDown(p, k, size, temp, PREFETCH_NO)
      }
    }
  }
  if (size < 3)
  {
    // size == 2
    const UInt32 a0 = p[0];
    p[0] = p[2];
    p[2] = a0;
    return;
  }
  if (size != 3)
  {
    // stage-2 : O(size) * log2(size):
    // we move the largest item p[0] from the head to the end of the array,
    // and insert the last item into the sorted binary tree.
    do
    {
      const UInt32 temp = p[size];
      size_t k = p[2] < p[3] ? 3 : 2;
      p[size--] = p[0];
      p[0] = p[1];
      p[1] = p[k];
      HeapSortDown(p, k, size, temp, SORT_PREFETCH) // PREFETCH_NO
    }
    while (size != 3);
  }
  {
    const UInt32 a2 = p[2];
    const UInt32 a3 = p[3];
    const size_t k = a2 < a3;
    p[2] = p[1];
    p[3] = p[0];
    p[k] = a3;
    p[k ^ 1] = a2;
  }
}
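A minimal usage sketch for the HeapSort() above (illustrative test program, not part of the 7-Zip sources):

#include <stdio.h>
#include "Sort.h"

int main(void)
{
  UInt32 a[7] = { 5, 1, 4, 1, 5, 9, 2 };
  size_t i;
  HeapSort(a, 7);  /* in-place, ascending */
  for (i = 0; i < 7; i++)
    printf("%u ", (unsigned)a[i]);  /* prints: 1 1 2 4 5 5 9 */
  printf("\n");
  return 0;
}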

Some files were not shown because too many files have changed in this diff.