From 3345a239b7f5353a1c1296d6a5d6b90729d4b669 Mon Sep 17 00:00:00 2001 From: Con Kolivas Date: Tue, 16 Feb 2021 09:41:14 +1100 Subject: [PATCH] Use lz4 for compressibility testing only, which won't break existing archives, but speeds up testing slightly. This makes liblz4 a required library. --- README.md | 21 +++++++++--------- configure.ac | 2 ++ lrzip_private.h | 4 ++-- main.c | 8 +++---- man/lrz.1.pod | 4 ++-- man/lrzip.1 | 14 ++++++------ stream.c | 58 +++++++++++++++++++++++-------------------------- 7 files changed, 55 insertions(+), 56 deletions(-) diff --git a/README.md b/README.md index 503c471..e0f87a1 100644 --- a/README.md +++ b/README.md @@ -49,8 +49,9 @@ A quick guide on building and installing. - libz-dev - libbz2-dev - liblzo2-dev + - liblz4-dev - coreutils - - nasm on x86, not needed on x64 + - Optional nasm - git if you want a repo-fresh copy - an OS with the usual *nix headers and libraries @@ -247,17 +248,17 @@ lzma compression can't currently be tracked when handing over 100+MB chunks over to the lzma library. Therefore you'll see progress percentage until each chunk is handed over to the lzma library. -> Q: What's this "lzo testing for incompressible data" message? +> Q: What's this "lz4 testing for incompressible data" message? -> A: Other compression is much slower, and lzo is the fastest. To help speed up -the process, lzo compression is performed on the data first to test that the +> A: Other compression is much slower, and lz4 is the fastest. To help speed up +the process, lz4 compression is performed on the data first to test that the data is at all compressible. If a small block of data is not compressible, it tests progressively larger blocks until it has tested all the data (if it fails to compress at all). If no compressible data is found, then the subsequent compression is not even attempted. This can save a lot of time during the compression phase when there is incompressible data. 
Theoretically it may be possible that data is compressible by the other backend (zpaq, lzma etc) and -not at all by lzo, but in practice such data achieves only minuscule amounts of +not at all by lz4, but in practice such data achieves only minuscule amounts of compression which are not worth pursuing. Most of the time it is clear one way or the other that data is compressible or not. If you wish to disable this test and force it to try compressing it anyway, use -T. @@ -357,14 +358,14 @@ cpu process scheduler how to prioritise workloads, and if your application is the _only_ thing running it will be no faster at nice -20 nor will it be any slower at +19. -> Q: What is the LZO Testing option, -T? +> Q: What is the LZ4 Testing option, -T? -> A: LZO testing is normally performed for the slower back-end compression of -LZMA and ZPAQ. The reasoning is that if it is completely incompressible by LZO +> A: LZ4 testing is normally performed for the slower back-end compression of +LZMA and ZPAQ. The reasoning is that if it is completely incompressible by LZ4 then it will also be incompressible by them. Thus if a block fails to be -compressed by the very fast LZO, lrzip will not attempt to compress that block +compressed by the very fast LZ4, lrzip will not attempt to compress that block with the slower compressor, thereby saving time. If this option is enabled, it -will bypass the LZO testing and attempt to compress each block regardless. +will bypass the LZ4 testing and attempt to compress each block regardless. > Q: Compression and decompression progress on large archives slows down and speeds up. There's also a jump in the percentage at the end? 
diff --git a/configure.ac b/configure.ac index 45d2870..f394b0b 100644 --- a/configure.ac +++ b/configure.ac @@ -117,6 +117,8 @@ AC_CHECK_LIB(bz2, BZ2_bzBuffToBuffCompress, , AC_MSG_ERROR([Could not find bz2 library - please install libbz2-dev])) AC_CHECK_LIB(lzo2, lzo1x_1_compress, , AC_MSG_ERROR([Could not find lzo2 library - please install liblzo2-dev])) +AC_CHECK_LIB(lz4, LZ4_compress_default, , + AC_MSG_ERROR([Could not find lz4 library - please install liblz4-dev])) AC_CHECK_FUNCS(mmap strerror) AC_CHECK_FUNCS(getopt_long) diff --git a/lrzip_private.h b/lrzip_private.h index e9613cb..d6ae1f4 100644 --- a/lrzip_private.h +++ b/lrzip_private.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2006-2016,2018 Con Kolivas + Copyright (C) 2006-2016,2018,2021 Con Kolivas Copyright (C) 2011 Peter Hyman Copyright (C) 1998-2003 Andrew Tridgell @@ -308,7 +308,7 @@ typedef sem_t cksem_t; #define HAS_MD5 (control->flags & FLAG_MD5) #define CHECK_FILE (control->flags & FLAG_CHECK) #define KEEP_BROKEN (control->flags & FLAG_KEEP_BROKEN) -#define LZO_TEST (control->flags & FLAG_THRESHOLD) +#define LZ4_TEST (control->flags & FLAG_THRESHOLD) #define TMP_OUTBUF (control->flags & FLAG_TMP_OUTBUF) #define TMP_INBUF (control->flags & FLAG_TMP_INBUF) #define ENCRYPT (control->flags & FLAG_ENCRYPT) diff --git a/main.c b/main.c index d460835..89bd396 100644 --- a/main.c +++ b/main.c @@ -121,7 +121,7 @@ static void usage(bool compat) print_output(" -p, --threads value Set processor count to override number of threads\n"); print_output(" -m, --maxram size Set maximum available ram in hundreds of MB\n"); print_output(" overrides detected amount of available ram\n"); - print_output(" -T, --threshold Disable LZO compressibility testing\n"); + print_output(" -T, --threshold Disable LZ4 compressibility testing\n"); print_output(" -U, --unlimited Use unlimited window size beyond ramsize (potentially much slower)\n"); print_output(" -w, --window size maximum compression window in hundreds of MB\n"); 
print_output(" default chosen by heuristic dependent on ram and chosen compression\n"); @@ -185,15 +185,15 @@ static void show_summary(void) if (!DECOMPRESS && !TEST_ONLY) { print_verbose("Compression mode is: "); if (LZMA_COMPRESS) - print_verbose("LZMA. LZO Compressibility testing %s\n", (LZO_TEST? "enabled" : "disabled")); + print_verbose("LZMA. LZ4 Compressibility testing %s\n", (LZ4_TEST? "enabled" : "disabled")); else if (LZO_COMPRESS) print_verbose("LZO\n"); else if (BZIP2_COMPRESS) - print_verbose("BZIP2. LZO Compressibility testing %s\n", (LZO_TEST? "enabled" : "disabled")); + print_verbose("BZIP2. LZ4 Compressibility testing %s\n", (LZ4_TEST? "enabled" : "disabled")); else if (ZLIB_COMPRESS) print_verbose("GZIP\n"); else if (ZPAQ_COMPRESS) - print_verbose("ZPAQ. LZO Compressibility testing %s\n", (LZO_TEST? "enabled" : "disabled")); + print_verbose("ZPAQ. LZ4 Compressibility testing %s\n", (LZ4_TEST? "enabled" : "disabled")); else if (NO_COMPRESS) print_verbose("RZIP pre-processing only\n"); if (control->window) diff --git a/man/lrz.1.pod b/man/lrz.1.pod index ccc64e2..54a93b4 100644 --- a/man/lrz.1.pod +++ b/man/lrz.1.pod @@ -2,7 +2,7 @@ # Copyright # -# Copyright (C) 2016 Con Kolivas +# Copyright (C) 2021 Con Kolivas # # License # @@ -296,7 +296,7 @@ Overrides detected amount of available ram. =item B<-T> -Disable LZO compressibility testing. +Disable LZ4 compressibility testing. 
=item B<--unlimited> diff --git a/man/lrzip.1 b/man/lrzip.1 index 5891650..4aa5040 100644 --- a/man/lrzip.1 +++ b/man/lrzip.1 @@ -1,4 +1,4 @@ -.TH "lrzip" "1" "June 2016" "" "" +.TH "lrzip" "1" "February 2021" "" "" .SH "NAME" lrzip \- a large-file compression program .SH "SYNOPSIS" @@ -66,7 +66,7 @@ Low level options: \-p, \-\-threads value Set processor count to override number of threads \-m, \-\-maxram size Set maximum available ram in hundreds of MB overrides detected amount of available ram - \-T, \-\-threshold Disable LZO compressibility testing + \-T, \-\-threshold Disable LZ4 compressibility testing \-U, \-\-unlimited Use unlimited window size beyond ramsize (potentially much slower) \-w, \-\-window size maximum compression window in hundreds of MB default chosen by heuristic dependent on ram and chosen compression @@ -240,13 +240,13 @@ decrease the load on your machine, or to improve compression. Setting it to 1 will maximise compression but will not attempt to use more than one CPU. .IP .IP "\fB-T\fP" -Disables the LZO compressibility threshold testing when a slower compression -back-end is used. LZO testing is normally performed for the slower back-end +Disables the LZ4 compressibility threshold testing when a slower compression +back-end is used. LZ4 testing is normally performed for the slower back-end compression of LZMA and ZPAQ. The reasoning is that if it is completely -incompressible by LZO then it will also be incompressible by them. Thus if a -block fails to be compressed by the very fast LZO, lrzip will not attempt to +incompressible by LZ4 then it will also be incompressible by them. Thus if a +block fails to be compressed by the very fast LZ4, lrzip will not attempt to compress that block with the slower compressor, thereby saving time. If this -option is enabled, it will bypass the LZO testing and attempt to compress each +option is enabled, it will bypass the LZ4 testing and attempt to compress each block regardless. 
.IP .IP "\fB-U \fP" diff --git a/stream.c b/stream.c index bb62a36..d0c5190 100644 --- a/stream.c +++ b/stream.c @@ -42,6 +42,7 @@ #include <zlib.h> #include <lzo/lzoconf.h> #include <lzo/lzo1x.h> +#include <lz4.h> #ifdef HAVE_ERRNO_H # include <errno.h> #endif @@ -143,7 +144,7 @@ bool join_pthread(rzip_control *control, pthread_t th, void **thread_return) /* just to keep things clean, declare function here * but move body to the end since it's a work function */ -static int lzo_compresses(rzip_control *control, uchar *s_buf, i64 s_len); +static int lz4_compresses(rzip_control *control, uchar *s_buf, i64 s_len); /* ***** COMPRESSION FUNCTIONS ***** @@ -160,7 +161,7 @@ static int zpaq_compress_buf(rzip_control *control, struct compress_thread *cthr i64 c_len, c_size; uchar *c_buf; - if (!lzo_compresses(control, cthread->s_buf, cthread->s_len)) + if (!lz4_compresses(control, cthread->s_buf, cthread->s_len)) return 0; c_size = round_up_page(control, cthread->s_len + 10000); @@ -195,7 +196,7 @@ static int bzip2_compress_buf(rzip_control *control, struct compress_thread *cth int bzip2_ret; uchar *c_buf; - if (!lzo_compresses(control, cthread->s_buf, cthread->s_len)) + if (!lz4_compresses(control, cthread->s_buf, cthread->s_len)) return 0; c_buf = malloc(dlen); @@ -291,7 +292,7 @@ static int lzma_compress_buf(rzip_control *control, struct compress_thread *cthr uchar *c_buf; size_t dlen; - if (!lzo_compresses(control, cthread->s_buf, cthread->s_len)) + if (!lz4_compresses(control, cthread->s_buf, cthread->s_len)) return 0; /* only 7 levels with lzma, scale them */ @@ -1875,47 +1876,43 @@ int close_stream_in(rzip_control *control, void *ss) return 0; } -/* As others are slow and lzo very fast, it is worth doing a quick lzo pass - to see if there is any compression at all with lzo first. It is unlikely - that others will be able to compress if lzo is unable to drop a single byte - so do not compress any block that is incompressible by lzo. 
*/ -static int lzo_compresses(rzip_control *control, uchar *s_buf, i64 s_len) +/* As others are slow and lz4 very fast, it is worth doing a quick lz4 pass + to see if there is any compression at all with lz4 first. It is unlikely + that others will be able to compress if lz4 is unable to drop a single byte + so do not compress any block that is incompressible by lz4. */ +static int lz4_compresses(rzip_control *control, uchar *s_buf, i64 s_len) { - lzo_bytep wrkmem = NULL; - lzo_uint in_len, test_len = s_len, save_len = s_len; - lzo_uint dlen; - uchar *c_buf = NULL, *test_buf = s_buf; + int in_len, test_len = s_len, save_len = s_len; + int dlen; + char *c_buf = NULL, *test_buf = (char *)s_buf; /* set minimum buffer test size based on the length of the test stream */ - unsigned long buftest_size = (test_len > 5 * STREAM_BUFSIZE ? STREAM_BUFSIZE : STREAM_BUFSIZE / 4096); + int buftest_size = (test_len > 5 * STREAM_BUFSIZE ? STREAM_BUFSIZE : STREAM_BUFSIZE / 4096); int ret = 0; int workcounter = 0; /* count # of passes */ - lzo_uint best_dlen = UINT_MAX; /* save best compression estimate */ + int best_dlen = INT_MAX; /* save best compression estimate */ - if (!LZO_TEST) + if (!LZ4_TEST) return 1; - wrkmem = (lzo_bytep) malloc(LZO1X_1_MEM_COMPRESS); - if (unlikely(wrkmem == NULL)) - fatal_return(("Unable to allocate wrkmem in lzo_compresses\n"), 0); in_len = MIN(test_len, buftest_size); dlen = STREAM_BUFSIZE + STREAM_BUFSIZE / 16 + 64 + 3; c_buf = malloc(dlen); - if (unlikely(!c_buf)) { - dealloc(wrkmem); - fatal_return(("Unable to allocate c_buf in lzo_compresses\n"), 0); - } + if (unlikely(!c_buf)) + fatal_return(("Unable to allocate c_buf in lz4_compresses\n"), 0); /* Test progressively larger blocks at a time and as soon as anything compressible is found, jump out as a success */ while (test_len > 0) { + int lz4_ret; + workcounter++; - lzo1x_1_compress(test_buf, in_len, (uchar *)c_buf, &dlen, wrkmem); - - if (dlen < best_dlen) - best_dlen = dlen; /* save best value 
*/ - - if (dlen < in_len) { + lz4_ret = LZ4_compress_default((const char *)test_buf, c_buf, test_len, dlen); + if (!lz4_ret) // Bigger than dlen, no point going further + break; + if (lz4_ret < best_dlen) + best_dlen = lz4_ret; + if (lz4_ret < test_len) { ret = 1; break; } @@ -1928,11 +1925,10 @@ static int lzo_compresses(rzip_control *control, uchar *s_buf, i64 s_len) in_len = MIN(test_len, buftest_size); } } - print_maxverbose("lzo testing %s for chunk %ld. Compressed size = %5.2F%% of chunk, %d Passes\n", + print_maxverbose("lz4 testing %s for chunk %ld. Compressed size = %5.2F%% of chunk, %d Passes\n", (ret == 0? "FAILED" : "OK"), save_len, 100 * ((double) best_dlen / (double) in_len), workcounter); - dealloc(wrkmem); dealloc(c_buf); return ret;