Bump version number to 0.542.

Choose sane defaults for memory usage since Linux ludicrously overcommits.
Use sliding mmap for any compression windows greater than 2/3 ram.
Consolidate and simplify testing of allocatable ram.
Minor tweaks to output.
Round up the size of the high buffer in sliding mmap to one page.
Squeeze a little more out of 32 bit compression windows.
This commit is contained in:
Con Kolivas 2010-11-20 01:23:08 +11:00
parent 25e053ed49
commit 75e675e6dd
6 changed files with 116 additions and 91 deletions

View file

@ -1,4 +1,18 @@
lrzip ChangeLog
NOVEMBER 2010, version 0.542 Con Kolivas
* Choose sane defaults for memory usage since Linux ludicrously overcommits.
* Use sliding mmap for any compression windows greater than 2/3 ram.
* Consolidate and simplify testing of allocatable ram.
* Minor tweaks to output.
* Round up the size of the high buffer in sliding mmap to one page.
* Squeeze a little more out of 32 bit compression windows.
NOVEMBER 2010, version 0.541 Con Kolivas
* Fix wrong number of passes reported.
* Re-fix the off-by-one that wasn't off-by-one.
* Limit lzma compression windows to 300MB as per reports of failures with larger
windows.
NOVEMBER 2010, version 0.540 Con Kolivas
* Massive rewrite of backend decompression phase, implementing multithreading.
This is done by taking each stream of data on read in into separate buffers for
@ -7,9 +21,11 @@ into runzip once it is requests more of the stream. Provided there are enough
chunks in the originally compressed data, this provides a massive speedup
potentially proportional to the number of CPUs. The slower the backend
compression, the better the speed up (i.e. zpaq is the best sped up).
* Fix the output of zpaq compress and decompress from trampling on itself and racing and consuming a lot of CPU time printing to the console.
* Fix the output of zpaq compress and decompress from trampling on itself and
racing and consuming a lot of CPU time printing to the console.
* When limiting cwindow to 6 on 32 bits, ensure that control.window is also set.
* When testing for the maximum size of testmalloc, the multiple used was out by one, so increase it.
* When testing for the maximum size of testmalloc, the multiple used was out by
one, so increase it.
* Minor output tweaks.
* Build warning fixes.
* Updated benchmarks.

View file

@ -1,3 +1,14 @@
lrzip-0.542
Lrzip will now try to select sane defaults for memory usage in cases where the
virtual memory heavily overcommits (eg. Linux) as this seriously slows down
compression.
For compression windows larger than 2/3 ram, lrzip will now use a sliding mmap
buffer for better performance.
The progress output is more informative in max verbose mode, and will no longer
do more passes than it estimates.
32 bit machines should be able to use slightly larger windows.
lrzip-0.540
MASSIVE MULTITHREADING on the decompression phase. Provided there are enough

View file

@ -1,5 +1,5 @@
dnl Process this file with autoconf to produce a configure script.
AC_INIT([lrzip],[0.541],[kernel@kolivas.org],[lrzip-0.541])
AC_INIT([lrzip],[0.542],[kernel@kolivas.org],[lrzip-0.542])
AC_CONFIG_HEADER(config.h)
# see what our system is!
AC_CANONICAL_HOST

134
rzip.c
View file

@ -119,7 +119,7 @@ static void remap_low_sb(void)
top = 1;
}
round_to_page(&new_offset);
print_maxverbose("Sliding main buffer \n");
print_maxverbose("Sliding main buffer to offset %lld\n", new_offset);
if (unlikely(munmap(sb.buf_low, sb.size_low)))
fatal("Failed to munmap in remap_low_sb\n");
sb.offset_low = new_offset;
@ -673,8 +673,11 @@ static void mmap_stdin(uchar *buf, struct rzip_state *st)
static void init_sliding_mmap(struct rzip_state *st, int fd_in, i64 offset)
{
/* Initialise the high buffer */
if (UNLIMITED) {
if (!STDIN) {
sb.high_length = 65536;
/* Round up to the next biggest page size */
if (sb.high_length % control.page_size)
sb.high_length += control.page_size - (sb.high_length % control.page_size);
sb.buf_high = (uchar *)mmap(NULL, sb.high_length, PROT_READ, MAP_SHARED, fd_in, offset);
if (unlikely(sb.buf_high == MAP_FAILED))
fatal("Unable to mmap buf_high in init_sliding_mmap\n");
@ -699,13 +702,13 @@ static void rzip_chunk(struct rzip_state *st, int fd_in, int fd_out, i64 offset,
if (unlikely(!st->ss))
fatal("Failed to open streams in rzip_chunk\n");
print_verbose("Performing rzip pre-processing phase\n");
print_verbose("Beginning rzip pre-processing phase\n");
hash_search(st, pct_base, pct_multiple);
/* unmap buffer before closing and reallocating streams */
if (unlikely(munmap(sb.buf_low, sb.size_low)))
fatal("Failed to munmap in rzip_chunk\n");
if (UNLIMITED) {
if (!STDIN) {
if (unlikely(munmap(sb.buf_high, sb.size_high)))
fatal("Failed to munmap in rzip_chunk\n");
}
@ -753,17 +756,34 @@ void rzip_fd(int fd_in, int fd_out)
} else
control.st_size = 0;
/* Optimal use of ram involves no more than 2/3 of it, so if we
* expressly request more with -M or -U, use a sliding mmap */
control.max_mmap = control.ramsize / 3 * 2;
if (MAXRAM)
control.max_chunk = control.ramsize;
else
control.max_chunk = control.max_mmap;
/* On 32 bits we can have a big window with sliding mmap, but can
* not enable much per mmap/malloc */
if (BITS32)
control.max_mmap = MIN(control.max_mmap, two_gig / 3);
round_to_page(&control.max_chunk);
round_to_page(&control.max_mmap);
if (UNLIMITED)
control.max_chunk = control.st_size;
if (control.window)
chunk_window = control.window * CHUNK_MULTIPLE;
else {
if (STDIN)
chunk_window = control.ramsize;
else
chunk_window = len;
}
if (chunk_window < len)
chunk_window -= chunk_window % control.page_size;
st->chunk_size = chunk_window;
else
chunk_window = control.max_chunk;
if (!STDIN)
st->chunk_size = MIN(chunk_window, len);
else
st->chunk_size = chunk_window;
if (st->chunk_size < len)
round_to_page(&st->chunk_size);
st->level = &levels[control.compression_level];
st->fd_in = fd_in;
@ -783,69 +803,53 @@ void rzip_fd(int fd_in, int fd_out)
i64 offset = s.st_size - len;
int bits = 8;
/* Flushing the dirty data will decrease our chances of
* running out of memory when we allocate ram again on the
* next chunk. It will also prevent thrashing on-disk due to
* concurrent reads and writes if we're on the same device. */
if (last_chunk)
print_verbose("Flushing data to disk.\n");
fsync(fd_out);
if (st->chunk_size > len && !STDIN)
st->chunk_size = len;
st->mmap_size = st->chunk_size;
if (BITS32 && st->mmap_size > two_gig) {
print_verbose("Limiting to 2GB due to 32 bit limitations\n");
st->mmap_size = two_gig;
st->chunk_size = control.max_chunk;
st->mmap_size = control.max_mmap;
if (!STDIN) {
st->chunk_size = MIN(st->chunk_size, len);
st->mmap_size = MIN(st->mmap_size, len);
}
retry:
/* Mmapping anonymously first will tell us how much ram we can use in
* advance and zeroes it which has a defragmenting effect on ram
* before the real read in. */
sb.buf_low = mmap(NULL, st->mmap_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
/* Better to shrink the window to the largest size that works than fail */
if (sb.buf_low == MAP_FAILED) {
st->mmap_size = st->mmap_size / 10 * 9;
st->mmap_size -= st->mmap_size % control.page_size;
if (unlikely(!st->mmap_size))
fatal("Unable to mmap any ram\n");
goto retry;
}
/* NOTE the buf is saved here for STDIN mode */
if (!STDIN) {
if (unlikely(munmap(sb.buf_low, st->mmap_size)))
fatal("Failed to munmap\n");
}
if (!MAXRAM) {
print_maxverbose("Succeeded in allocating %lld sized mmap\n", st->mmap_size);
if (!UNLIMITED)
st->chunk_size = st->mmap_size;
} else
st->mmap_size = st->chunk_size;
if (!STDIN) {
/* The buf is saved here for !STDIN mode */
sb.buf_low = (uchar *)mmap(sb.buf_low, st->mmap_size, PROT_READ, MAP_SHARED, fd_in, offset);
if (STDIN) {
/* NOTE the buf is saved here for STDIN mode */
sb.buf_low = mmap(NULL, st->mmap_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
/* Better to shrink the window to the largest size that works than fail */
if (sb.buf_low == MAP_FAILED) {
if (unlikely(!MAXRAM))
fatal("Failed to remap ram\n");
st->mmap_size = st->mmap_size / 10 * 9;
st->mmap_size -= st->mmap_size % control.page_size;
round_to_page(&st->mmap_size);
if (unlikely(!st->mmap_size))
fatal("Unable to mmap any ram\n");
goto retry;
}
} else
mmap_stdin(sb.buf_low, st);
} else {
/* NOTE the buf is saved here for !STDIN mode */
if (st->mmap_size < st->chunk_size)
print_maxverbose("Enabling sliding mmap mode and using mmap of %lld bytes with window of %lld bytes\n", st->mmap_size, st->chunk_size);
if (MAXRAM)
print_maxverbose("Succeeded in allocating %lld sized mmap\n", st->mmap_size);
/* The buf is saved here for !STDIN mode */
sb.buf_low = (uchar *)mmap(sb.buf_low, st->mmap_size, PROT_READ, MAP_SHARED, fd_in, offset);
if (sb.buf_low == MAP_FAILED) {
st->mmap_size = st->mmap_size / 10 * 9;
round_to_page(&st->mmap_size);
if (unlikely(!st->mmap_size))
fatal("Unable to mmap any ram\n");
goto retry;
}
}
print_maxverbose("Succeeded in testing %lld sized mmap for rzip pre-processing\n", st->mmap_size);
if (st->mmap_size < st->chunk_size)
print_verbose("Compression window is larger than ram allocated, will proceed with unlimited mode possibly much slower\n");
if (st->chunk_size > control.ramsize)
print_verbose("Compression window is larger than ram, will proceed with unlimited mode possibly much slower\n");
if (!passes && !STDIN) {
passes = s.st_size / st->chunk_size + !!(s.st_size % st->chunk_size);
if (passes == 1)
print_verbose("Will take 1 pass\n");
else
print_verbose("Will take %d passes\n", passes);
}
sb.orig_offset = offset;
print_maxverbose("Chunk size: %lld\n", st->chunk_size);
@ -871,8 +875,6 @@ retry:
gettimeofday(&current, NULL);
/* this will count only when size > window */
if (last.tv_sec > 0) {
if (!passes)
passes = s.st_size / st->chunk_size;
elapsed_time = current.tv_sec - start.tv_sec;
finish_time = elapsed_time / (pct_base / 100.0);
elapsed_hours = (unsigned int)(elapsed_time) / 3600;

9
rzip.h
View file

@ -19,7 +19,7 @@
#define LRZIP_MAJOR_VERSION 0
#define LRZIP_MINOR_VERSION 5
#define LRZIP_MINOR_SUBVERSION 41
#define LRZIP_MINOR_SUBVERSION 42
#define NUM_STREAMS 2
@ -121,7 +121,6 @@ extern int errno;
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
typedef unsigned long long u64;
typedef long long int i64;
typedef uint16_t u16;
typedef uint32_t u32;
@ -231,9 +230,11 @@ struct rzip_control {
int compression_level;
unsigned char lzma_properties[5]; // lzma properties, encoded
double threshold;
unsigned long long window;
i64 window;
unsigned long flags;
unsigned long long ramsize;
i64 ramsize;
i64 max_chunk;
i64 max_mmap;
int threads;
int nice_val; // added for consistency
int major_version;

View file

@ -644,6 +644,7 @@ static int seekto(struct stream_info *sinfo, i64 pos)
}
static pthread_t *threads;
extern const i64 two_gig;
/* open a set of output streams, compressing with the given
compression level and algorithm */
@ -651,7 +652,6 @@ void *open_stream_out(int f, int n, i64 limit)
{
struct stream_info *sinfo;
uchar *testmalloc;
unsigned cwindow;
int i;
sinfo = malloc(sizeof(*sinfo));
@ -689,21 +689,9 @@ void *open_stream_out(int f, int n, i64 limit)
sinfo->cur_pos = 0;
sinfo->fd = f;
if (BITS32) {
/* Largest window we can safely support on 32bit is 2GB */
if (!control.window || control.window > 20)
control.window = 20;
/* Largest window supported by lzma is 300MB */
if (LZMA_COMPRESS && control.window > 3)
control.window = 3;
}
cwindow = control.window;
/* No point making the stream larger than the amount of data */
if (cwindow)
sinfo->bufsize = MIN(STREAM_BUFSIZE * 10 * cwindow, limit);
else
sinfo->bufsize = limit;
/* Serious limits imposed on 32 bit capabilities */
if (BITS32)
limit = MIN(limit, two_gig / 3);
sinfo->initial_pos = lseek(f, 0, SEEK_CUR);
@ -717,12 +705,19 @@ void *open_stream_out(int f, int n, i64 limit)
* ram. We need enough for the 2 streams and for the compression
* backend at most, being conservative. */
retest_malloc:
testmalloc = malloc(sinfo->bufsize * (n + 1));
testmalloc = malloc(limit * (n + 1));
if (!testmalloc) {
sinfo->bufsize = sinfo->bufsize / 10 * 9;
limit = limit / 10 * 9;
goto retest_malloc;
}
free(testmalloc);
print_maxverbose("Succeeded in testing %lld sized malloc for back end compression\n", limit * (n + 1));
/* Largest window supported by lzma is 300MB */
if (LZMA_COMPRESS)
limit = MIN(limit, 3 * STREAM_BUFSIZE * 10);
sinfo->bufsize = limit;
/* Make the bufsize no smaller than STREAM_BUFSIZE. Round up the
* bufsize to fit X threads into it */