From 1637598c3fdba31ffc3ee9226bb64ac02c0687aa Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Sat, 13 Nov 2010 21:37:17 +1100
Subject: [PATCH] Bump version number up to 0.530. Update all documentation.
 Minor fixes by Jari Aalto for build and docs.

---
 ChangeLog             | 22 +++++++++++++++
 Makefile.in           | 12 ++++++--
 README                | 26 ++++++++++--------
 TODO                  |  2 +-
 WHATS-NEW             | 15 ++++++++++
 configure             | 22 +++++++--------
 configure.ac          |  2 +-
 doc/README.benchmarks | 64 +++++++++++++++++++++----------------------
 main.c                |  3 +-
 man/lrzip.1           |  4 +--
 rzip.h                |  2 +-
 11 files changed, 110 insertions(+), 64 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 6481db2..bbdcd25 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,26 @@
 lrzip ChangeLog
+NOVEMBER 2010, version 0.530 Con Kolivas
+* Massive rewrite of backend compression phase. Now the stream is split up
+into as many chunks as there are CPUs, of at least 10MB in size, that are
+still mallocable. Once the stream has reached a chunk of this size, its buffer
+is handed to a new backend compression thread which works while the rzip stream
+continues processing. This has the effect of parallelising workloads almost
+linearly up to the number of CPUs on the slower compression backends. ZPAQ,
+in particular, is effectively 4x faster on quad core now. Decompression is
+unchanged.
+* Added the -p option to allow the number of processors to be specified to
+override the detected number.
+* Changed the default level back to 7 as 9 wasn't offering significantly more
+compression but was adding time.
+* Increased the size of all the buffers to other backends now as well, since
+each block adds overhead with its header.
+* Numerous alterations to screen output to cope with new threaded compression
+phase.
+* Deprecated the -P option since not setting the file permissions only
+generates a warning now, not a failure.
+* Updated docs and benchmarks.
+
+
 NOVEMBER 2010, version 0.520 Con Kolivas
 * Distros don't like 3 point version numbering so just repackaged as 0.520.
 
diff --git a/Makefile.in b/Makefile.in
index beffbd1..aef8a66 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -6,7 +6,8 @@
 prefix=@prefix@
 exec_prefix=@exec_prefix@
 datarootdir=@datarootdir@
-ASM_OBJ=@ASM_OBJ@
+#ASM_OBJ=@ASM_OBJ@
+ASM_OBJ=7zCrc.o
 PACKAGE_TARNAME=@PACKAGE_TARNAME@
 INSTALL_BIN=$(exec_prefix)/bin
 INSTALL_MAN1=@mandir@/man1
@@ -24,7 +25,12 @@ LZMA_CFLAGS=-I@top_srcdir@/lzma/C -DCOMPRESS_MF_MT -D_REENTRANT
 INSTALLCMD=@INSTALL@
 LN_S=@LN_S@
 RM=rm -f
-ASM=@ASM@
+
+ifneq ($(NO_ASSEMBLER),)
+	ASM=@ASM@
+else
+	ASM=7zCrc.o
+endif
 
 VPATH=@srcdir@
 srcdir=@srcdir@
@@ -34,7 +40,7 @@ SHELL=/bin/sh
 .SUFFIXES: .c .o
 
 OBJS= main.o rzip.o runzip.o stream.o util.o \
-  @ASM_OBJ@ \
+  7zCrc.o \
   zpipe.o \
   Threads.o \
   LzFind.o \
diff --git a/README b/README
index 2ea30c2..bb91cf1 100644
--- a/README
+++ b/README
@@ -106,6 +106,10 @@ compression, and even use a compression window larger than you have ram.
 Expect serious swapping to occur if your file is larger than your ram and for
 it to take many times longer.
 
+Q. I want the absolute fastest decent compression I can possibly get.
+A. Try the command line options -Ml. This will use the maximum possible
+memory, lzo backend compression, and level 7 compression (1 isn't much faster).
+
 Q. How much slower is the unlimited mode?
 A. It depends on 2 things. First, just how much larger than your ram the file
 is, as the bigger the difference, the slower it will be. The second is how much
@@ -161,12 +165,11 @@ lrzip source code. Libraries with functions similar to compress() and
 decompress() functions of zlib would make the process most painless. Please
 tell me if you have such a library so I can include it :)
 
-Q. What's this "Progress percentage pausing during lzma compression" message?
+Q. What's this "Starting lzma back end compression thread..." message?
 A. While I'm a big fan of progress percentage being visible, unfortunately
 lzma compression can't currently be tracked when handing over 100+MB chunks
 over to the lzma library. Therefore you'll see progress percentage until
-each chunk is handed over to the lzma library. lzo, bzip2 or no compression
-doesn't have this problem and shows progress continuously.
+each chunk is handed over to the lzma library.
 
 Q. What's this "lzo testing for incompressible data" message?
 A. Other compression is much slower, and lzo is the fastest. To help speed up
@@ -189,10 +192,6 @@ option with lzma and are never anywhere near as large as the compression
 requirements. However if you're on 64bit and you use a compression window
 greater than 2GB, it might not be possible to decompress it on 32bit machines.
 
-Q. I've changed the compression level with -L in combination with -l or -z and
-the file size doesn't vary?
-A. That's right, -l and -z only has one compression level.
-
 Q. Why are you including bzip2 compression?
 A. To maintain a similar compression format to the original rzip (although the
 other modes are more useful).
@@ -211,11 +210,14 @@ the only compression format that can do any significant compression of
 multimedia.
 
 Q. Is this multithreaded?
-A. As of version 0.21, the answer is yes for lzma compression only thanks to a
-multithreaded lzma library. However I have not found the gains to scale well
-with number of cpus, but there are definite performance gains with more cpus.
-It is important to note that the mulithreading actually decreases the
-compression somewhat. It's a tradeoff either way.
+A. As of version 0.530, it is HEAVILY multithreaded in the back end
+compression phase, and will continue to process the rzip pre-processing phase
+in parallel, so when using one of the more CPU intensive backend compressions
+like lzma or zpaq, SMP machines will show massive speed improvements. Lrzip
+will detect the number of CPUs to use, but it can be overridden with the -p
+option if slightly better compression is desired more than speed.
+Decompression at the moment is not multithreaded, though it is already much
+faster than the compression phase (except for zpaq); enhancements are planned.
 
 Q. This uses heaps of memory, can I make it use less?
 A. Well you can by setting -w to the lowest value (1) but the huge use of
diff --git a/TODO b/TODO
index 3e0e702..e64cb5c 100644
--- a/TODO
+++ b/TODO
@@ -17,7 +17,7 @@ Get the ASM working on 64bit.
 
 Clean up the config system since it's a mystery to me.
 
-Increased multi-threading.
+Multi-threading on decompression.
 
 Make stdout work without a temporary file.
 
diff --git a/WHATS-NEW b/WHATS-NEW
index 9456dc6..20b409f 100644
--- a/WHATS-NEW
+++ b/WHATS-NEW
@@ -1,3 +1,18 @@
+lrzip-0.530
+
+MASSIVE MULTITHREADING on the compression phase. Lrzip will now use as many
+threads as you have CPU cores for the back end compression, and even continue
+doing the rzip preprocessing stage as long as it can while the other threads
+continue. This makes the slower compression algorithms (lzma and zpaq) much
+faster on multicore machines, to the point of making zpaq compression almost
+as fast as single threaded lzma compression.
+-p option added to allow you to specify number of processors to override the
+built-in test, or if you wish to disable threading.
+-P option to not set permissions has now been removed since failing to set
+permissions is only a warning now and not a failure.
+Further improvements to the progress output.
+Updated benchmarks and docs.
+
 lrzip-0.520
 
 Just changed version numbering back to 2 point.
diff --git a/configure b/configure
index 4fd0b9a..f3d2450 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.67 for lrzip 0.520.
+# Generated by GNU Autoconf 2.67 for lrzip 0.530.
 #
 # Report bugs to <kernel@kolivas.org>.
 #
@@ -551,9 +551,9 @@ MAKEFLAGS=
 
 # Identity of this package.
 PACKAGE_NAME='lrzip'
-PACKAGE_TARNAME='lrzip-0.520'
-PACKAGE_VERSION='0.520'
-PACKAGE_STRING='lrzip 0.520'
+PACKAGE_TARNAME='lrzip-0.530'
+PACKAGE_VERSION='0.530'
+PACKAGE_STRING='lrzip 0.530'
 PACKAGE_BUGREPORT='kernel@kolivas.org'
 PACKAGE_URL=''
 
@@ -1221,7 +1221,7 @@ if test "$ac_init_help" = "long"; then
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures lrzip 0.520 to adapt to many kinds of systems.
+\`configure' configures lrzip 0.530 to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1269,7 +1269,7 @@ Fine tuning of the installation directories:
   --infodir=DIR           info documentation [DATAROOTDIR/info]
   --localedir=DIR         locale-dependent data [DATAROOTDIR/locale]
   --mandir=DIR            man documentation [DATAROOTDIR/man]
-  --docdir=DIR            documentation root [DATAROOTDIR/doc/lrzip-0.520]
+  --docdir=DIR            documentation root [DATAROOTDIR/doc/lrzip-0.530]
   --htmldir=DIR           html documentation [DOCDIR]
   --dvidir=DIR            dvi documentation [DOCDIR]
   --pdfdir=DIR            pdf documentation [DOCDIR]
@@ -1286,7 +1286,7 @@ fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of lrzip 0.520:";;
+     short | recursive ) echo "Configuration of lrzip 0.530:";;
    esac
   cat <<\_ACEOF
 
@@ -1375,7 +1375,7 @@ fi
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-lrzip configure 0.520
+lrzip configure 0.530
 generated by GNU Autoconf 2.67
 
 Copyright (C) 2010 Free Software Foundation, Inc.
@@ -2014,7 +2014,7 @@ cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by lrzip $as_me 0.520, which was
+It was created by lrzip $as_me 0.530, which was
 generated by GNU Autoconf 2.67.  Invocation command line was
 
   $ $0 $@
@@ -5324,7 +5324,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by lrzip $as_me 0.520, which was
+This file was extended by lrzip $as_me 0.530, which was
 generated by GNU Autoconf 2.67.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -5386,7 +5386,7 @@ _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-lrzip config.status 0.520
+lrzip config.status 0.530
 configured by $0, generated by GNU Autoconf 2.67,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index 4fe3948..f642202 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,5 +1,5 @@
 dnl Process this file with autoconf to produce a configure script.
-AC_INIT([lrzip],[0.520],[kernel@kolivas.org],[lrzip-0.520])
+AC_INIT([lrzip],[0.530],[kernel@kolivas.org],[lrzip-0.530])
 AC_CONFIG_HEADER(config.h)
 # see what our system is!
 AC_CANONICAL_HOST
diff --git a/doc/README.benchmarks b/doc/README.benchmarks
index b94c15e..7acaf9d 100644
--- a/doc/README.benchmarks
+++ b/doc/README.benchmarks
@@ -13,28 +13,26 @@ backend.
 linux-2.6.31.tar
 
 These are benchmarks performed on a 3GHz quad core Intel Core2 with 8GB ram
-using lrzip v0.42.
+using lrzip v0.530.
 
 Compression	Size		Percentage	Compress	Decompress
 None		365711360	100
-7z		53315279	14.6		2m4s		0m5.4s
-lrzip		52372722	14.3		2m48s		0m8.3s
-lrzip -z	43455498	11.9		10m11s		10m14s
-lrzip -l	112151676	30.7		0m14s		0m5.1s
-lrzip -g	73476127	20.1		0m29s		0m5.6s
-lrzip -b	60851152	16.6		0m43s		0m12.2s
-bzip2		62416571	17.1		0m44s		0m9.8s
-gzip		80563601	22.0		0m14s		0m2.8s
+7z		53315279	14.6		1m58s		0m5.6s
+lrzip		52724172	14.4		1m33s		0m15.6s
+lrzip -z	43223954	11.8		3m42s		10m14s
+lrzip -l	110893724	30.3		0m21s		0m13.4s
+lrzip -g	72746424	19.9		0m25s		0m13.8s
+lrzip -b	60774043	16.6		0m29s		0m19.8s
+bzip2		62416571	17.1		0m44s		0m10.5s
+gzip		80563601	22.0		0m14s		0m3.0s
 
 
 These results are interesting to note the compression of lrzip by default is
-only slightly better than lzma, but at some cost in time at the compress and
-decompress end of the spectrum. Clearly zpaq compression is much better than any
-other compression algorithm by far, but the speed cost on both compression and
-decompression is extreme. At this size compression, lzo is interesting because
-it's faster than simply copying the file but only offers modest compression.
-What lrzip offers at this end of the spectrum is extreme compression if
-desired.
+only slightly better than lzma, but it's significantly faster thanks to its
+heavily multithreaded nature. Decompression is slower but I'm working on that.
+Zpaq offers by far the best compression but at the cost of extra time. However
+with the heavily threaded nature of lrzip, it's not a lot longer given how
+much better its compression is.
 
 
 Let's take six kernel trees one version apart as a tarball, linux-2.6.31 to
@@ -96,7 +94,7 @@ system and some basic working software on it. The default options on the
 
 10GB Virtual image:
 
-These benchmarks were done on the quad core with version 0.5.1
+These benchmarks were done on the quad core with version 0.530
 
 Compression	Size		Percentage	Compress Time	Decompress Time
 None		10737418240	100.0
@@ -104,24 +102,26 @@ gzip		2772899756	 25.8		05m47s		2m46s
 bzip2		2704781700	 25.2		16m15s		6m19s
 xz		2272322208	 21.2		50m58s		3m52s
 7z		2242897134	 20.9		26m36s		5m41s
-lrzip		1354237684	 12.6		29m13s		6m55s
-lrzip -M	1079528708	 10.1		23m44s		4m05s
-lrzip -l	1793312108	 16.7		05m13s		3m12s
-lrzip -lM	1413268368	 13.2		04m18s		2m54s
-lrzip -z	1299844906	 12.1		04h32m14s	04h33m
-lrzip -zM	1066902006	  9.9		04h07m14s	04h08m
+lrzip		1299228155	 12.1		16m12s		4m32s
+lrzip -M	1079682231	 10.1		12m03s		4m05s
+lrzip -l	1754694010	 16.3		05m30s		3m12s
+lrzip -lM	1414958844	 13.2		05m15s		2m57s
+lrzip -zM	1066902006	  9.9		71m20s		04h08m
 
 
 At this end of the spectrum things really start to heat up. The compression
 advantage is massive, with the lzo backend even giving much better results than
-7z, and over a ridiculously short time. The default lzma backend is slightly
-slower than 7z, but provides a lot more compression. What appears to be a big
-disappointment is actually zpaq here which takes more than 8 times longer than
-lzma for a measly .2% improvement. The reason is that most of the advantage here
-is achieved by the rzip first stage since there's a lot of redundant space over
-huge distances on a virtual image. The -M option which works the memory
-subsystem rather hard making noticeable impact on the rest of the machine also
-does further wonders for the compression and times.
+7z, and over a ridiculously short time. The improvements in version 0.530 in
+scalability with multiple CPUs have a huge impact on compression time here,
+with zpaq almost being as fast on quad core as xz is, yet producing a file
+less than half the size. Note that decompression was not multithreaded on
+v0.530, hence why zpaq decompression was so slow.
+What appears to be a big disappointment is actually zpaq here which takes more
+than 6 times longer than lzma for a measly .2% improvement. The reason is that
+most of the advantage here is achieved by the rzip first stage since there's a
+lot of redundant space over huge distances on a virtual image. The -M option
+which works the memory subsystem rather hard making noticeable impact on the
+rest of the machine also does further wonders for the compression and times.
 
 This should help govern what compression you choose. Small files are nicely
 compressed with zpaq. Intermediate files are nicely compressed with lzma.
@@ -131,4 +131,4 @@ Or, to make things easier, just use the default settings all the time and be
 happy as lzma gives good results. :D
 
 Con Kolivas
-Tue, 7th Nov 2010
+Sat, 13th Nov 2010
diff --git a/main.c b/main.c
index 346252e..f022a53 100644
--- a/main.c
+++ b/main.c
@@ -708,8 +708,9 @@ int main(int argc, char *argv[])
 			print_verbose("Threading is %s. Number of CPUs detected: %d\n", control.threads > 1? "ENABLED" : "DISABLED",
 				control.threads);
 		print_verbose("Detected %lld bytes ram\n", control.ramsize);
+		print_verbose("Comrpession level %d\n", control.compression_level);
 		print_verbose("Nice Value: %d\n", control.nice_val);
-		print_progress("Show Progress\n");
+		print_verbose("Show Progress\n");
 		print_maxverbose("Max ");
 		print_verbose("Verbose\n");
 		if (FORCE_REPLACE)
diff --git a/man/lrzip.1 b/man/lrzip.1
index 5e4dbb9..9190653 100644
--- a/man/lrzip.1
+++ b/man/lrzip.1
@@ -103,8 +103,8 @@ does not fit into the available ram, lrzip will use a moving second buffer as a
 possible compression in the first rzip stage which can improve the compression
 of ultra large files when they're bigger than the available ram. However it runs
 progressively slower the larger the difference between ram and the file size so
-it is worth trying the -M option first to see if the whole file can be accessed
-in one pass, and then if not, it should be used together with the -M option (if
+it is worth trying the \-M option first to see if the whole file can be accessed
+in one pass, and then if not, it should be used together with the \-M option (if
 at all).
 .IP
 .IP "\fB-T 0\&.\&.10\fP"
diff --git a/rzip.h b/rzip.h
index eed59f2..6fbabb3 100644
--- a/rzip.h
+++ b/rzip.h
@@ -19,7 +19,7 @@
 
 #define LRZIP_MAJOR_VERSION 0
 #define LRZIP_MINOR_VERSION 5
-#define LRZIP_MINOR_SUBVERSION 20
+#define LRZIP_MINOR_SUBVERSION 30
 
 #define NUM_STREAMS 2