diff --git a/ChangeLog b/ChangeLog index 44c2be5..1b39279 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,4 +1,18 @@ lrzip ChangeLog +NOVEMBER 2010, version 0.542 Con Kolivas +* Choose sane defaults for memory usage since linux ludicrously overcommits. +* Use sliding mmap for any compression windows greater than 2/3 ram. +* Consolidate and simplify testing of allocatable ram. +* Minor tweaks to output. +* Round up the size of the high buffer in sliding mmap to one page. +* Squeeze a little more out of 32 bit compression windows. + +NOVEMBER 2010, version 0.541 Con Kolivas +* Fix wrong number of passes reported. +* Re-fix the off-by-one that wasn't off-by-one. +* Limit lzma compression windows to 300MB as per reports of failures with larger +windows. + NOVEMBER 2010, version 0.540 Con Kolivas * Massive rewrite of backend decompression phse, implementing multithreading. This is done by taking each stream of data on read in into separate buffers for @@ -7,9 +21,11 @@ into runzip once it is requests more of the stream. Provided there are enough chunks in the originally compressed data, this provides a massive speedup potentially proportional to the number of CPUs. The slower the backend compression, the better the speed up (i.e. zpaq is the best sped up). -* Fix the output of zpaq compress and decompress from trampling on itself and racing and consuming a lot of CPU time printing to the console. +* Fix the output of zpaq compress and decompress from trampling on itself and +racing and consuming a lot of CPU time printing to the console. * When limiting cwindow to 6 on 32 bits, ensure that control.window is also set. -* When testing for the maximum size of testmalloc, the multiple used was out by one, so increase it. +* When testing for the maximum size of testmalloc, the multiple used was out by +one, so increase it. * Minor output tweaks. * Build warning fixes. * Updated benchmarks. 
diff --git a/WHATS-NEW b/WHATS-NEW index f82195b..cc9d807 100644 --- a/WHATS-NEW +++ b/WHATS-NEW @@ -1,3 +1,14 @@ +lrzip-0.542 + +Lrzip will now try to select sane defaults for memory usage in cases where the +virtual memory heavily overcommits (eg. Linux) as this seriously slows down +compression. +For compression windows larger than 2/3 ram, lrzip will now use a sliding mmap +buffer for better performance. +The progress output is more informative in max verbose mode, and will no longer +do more passes than it estimates. +32 bit machines should be able to use slightly larger windows. + lrzip-0.540 MASSIVE MULTITHREADING on the decompression phase. Provided there are enough diff --git a/configure.ac b/configure.ac index 41f25ef..774ee8d 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ dnl Process this file with autoconf to produce a configure script. -AC_INIT([lrzip],[0.541],[kernel@kolivas.org],[lrzip-0.541]) +AC_INIT([lrzip],[0.542],[kernel@kolivas.org],[lrzip-0.542]) AC_CONFIG_HEADER(config.h) # see what our system is! 
AC_CANONICAL_HOST diff --git a/rzip.c b/rzip.c index e6855fc..310a8e0 100644 --- a/rzip.c +++ b/rzip.c @@ -119,7 +119,7 @@ static void remap_low_sb(void) top = 1; } round_to_page(&new_offset); - print_maxverbose("Sliding main buffer \n"); + print_maxverbose("Sliding main buffer to offset %lld\n", new_offset); if (unlikely(munmap(sb.buf_low, sb.size_low))) fatal("Failed to munmap in remap_low_sb\n"); sb.offset_low = new_offset; @@ -673,8 +673,11 @@ static void mmap_stdin(uchar *buf, struct rzip_state *st) static void init_sliding_mmap(struct rzip_state *st, int fd_in, i64 offset) { /* Initialise the high buffer */ - if (UNLIMITED) { + if (!STDIN) { sb.high_length = 65536; + /* Round up to the next biggest page size */ + if (sb.high_length % control.page_size) + sb.high_length += control.page_size - (sb.high_length % control.page_size); sb.buf_high = (uchar *)mmap(NULL, sb.high_length, PROT_READ, MAP_SHARED, fd_in, offset); if (unlikely(sb.buf_high == MAP_FAILED)) fatal("Unable to mmap buf_high in init_sliding_mmap\n"); @@ -699,13 +702,13 @@ static void rzip_chunk(struct rzip_state *st, int fd_in, int fd_out, i64 offset, if (unlikely(!st->ss)) fatal("Failed to open streams in rzip_chunk\n"); - print_verbose("Performing rzip pre-processing phase\n"); + print_verbose("Beginning rzip pre-processing phase\n"); hash_search(st, pct_base, pct_multiple); /* unmap buffer before closing and reallocating streams */ if (unlikely(munmap(sb.buf_low, sb.size_low))) fatal("Failed to munmap in rzip_chunk\n"); - if (UNLIMITED) { + if (!STDIN) { if (unlikely(munmap(sb.buf_high, sb.size_high))) fatal("Failed to munmap in rzip_chunk\n"); } @@ -753,17 +756,34 @@ void rzip_fd(int fd_in, int fd_out) } else control.st_size = 0; + /* Optimal use of ram involves no more than 2/3 of it, so if we + * expressly request more with -M or -U, use a sliding mmap */ + control.max_mmap = control.ramsize / 3 * 2; + if (MAXRAM) + control.max_chunk = control.ramsize; + else + control.max_chunk = 
control.max_mmap; + + /* On 32 bits we can have a big window with sliding mmap, but can + * not enable much per mmap/malloc */ + if (BITS32) + control.max_mmap = MIN(control.max_mmap, two_gig / 3); + round_to_page(&control.max_chunk); + round_to_page(&control.max_mmap); + if (UNLIMITED) + control.max_chunk = control.st_size; + if (control.window) chunk_window = control.window * CHUNK_MULTIPLE; - else { - if (STDIN) - chunk_window = control.ramsize; - else - chunk_window = len; - } - if (chunk_window < len) - chunk_window -= chunk_window % control.page_size; - st->chunk_size = chunk_window; + else + chunk_window = control.max_chunk; + + if (!STDIN) + st->chunk_size = MIN(chunk_window, len); + else + st->chunk_size = chunk_window; + if (st->chunk_size < len) + round_to_page(&st->chunk_size); st->level = &levels[control.compression_level]; st->fd_in = fd_in; @@ -783,69 +803,53 @@ void rzip_fd(int fd_in, int fd_out) i64 offset = s.st_size - len; int bits = 8; - /* Flushing the dirty data will decrease our chances of - * running out of memory when we allocate ram again on the - * next chunk. It will also prevent thrashing on-disk due to - * concurrent reads and writes if we're on the same device. */ - if (last_chunk) - print_verbose("Flushing data to disk.\n"); - fsync(fd_out); - - if (st->chunk_size > len && !STDIN) - st->chunk_size = len; - st->mmap_size = st->chunk_size; - if (BITS32 && st->mmap_size > two_gig) { - print_verbose("Limiting to 2GB due to 32 bit limitations\n"); - st->mmap_size = two_gig; + st->chunk_size = control.max_chunk; + st->mmap_size = control.max_mmap; + if (!STDIN) { + st->chunk_size = MIN(st->chunk_size, len); + st->mmap_size = MIN(st->mmap_size, len); } retry: - /* Mmapping anonymously first will tell us how much ram we can use in - * advance and zeroes it which has a defragmenting effect on ram - * before the real read in. 
*/ - sb.buf_low = mmap(NULL, st->mmap_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - /* Better to shrink the window to the largest size that works than fail */ - if (sb.buf_low == MAP_FAILED) { - st->mmap_size = st->mmap_size / 10 * 9; - st->mmap_size -= st->mmap_size % control.page_size; - if (unlikely(!st->mmap_size)) - fatal("Unable to mmap any ram\n"); - goto retry; - } - - /* NOTE the buf is saved here for STDIN mode */ - if (!STDIN) { - if (unlikely(munmap(sb.buf_low, st->mmap_size))) - fatal("Failed to munmap\n"); - } - - if (!MAXRAM) { - print_maxverbose("Succeeded in allocating %lld sized mmap\n", st->mmap_size); - if (!UNLIMITED) - st->chunk_size = st->mmap_size; - } else - st->mmap_size = st->chunk_size; - - if (!STDIN) { - /* The buf is saved here for !STDIN mode */ - sb.buf_low = (uchar *)mmap(sb.buf_low, st->mmap_size, PROT_READ, MAP_SHARED, fd_in, offset); + if (STDIN) { + /* NOTE the buf is saved here for STDIN mode */ + sb.buf_low = mmap(NULL, st->mmap_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + /* Better to shrink the window to the largest size that works than fail */ if (sb.buf_low == MAP_FAILED) { - if (unlikely(!MAXRAM)) - fatal("Failed to remap ram\n"); st->mmap_size = st->mmap_size / 10 * 9; - st->mmap_size -= st->mmap_size % control.page_size; + round_to_page(&st->mmap_size); if (unlikely(!st->mmap_size)) fatal("Unable to mmap any ram\n"); goto retry; } - } else mmap_stdin(sb.buf_low, st); + } else { + /* NOTE the buf is saved here for !STDIN mode */ + if (st->mmap_size < st->chunk_size) + print_maxverbose("Enabling sliding mmap mode and using mmap of %lld bytes with window of %lld bytes\n", st->mmap_size, st->chunk_size); - if (MAXRAM) - print_maxverbose("Succeeded in allocating %lld sized mmap\n", st->mmap_size); + /* The buf is saved here for !STDIN mode */ + sb.buf_low = (uchar *)mmap(sb.buf_low, st->mmap_size, PROT_READ, MAP_SHARED, fd_in, offset); + if (sb.buf_low == MAP_FAILED) { + 
st->mmap_size = st->mmap_size / 10 * 9; + round_to_page(&st->mmap_size); + if (unlikely(!st->mmap_size)) + fatal("Unable to mmap any ram\n"); + goto retry; + } + } + print_maxverbose("Succeeded in testing %lld sized mmap for rzip pre-processing\n", st->mmap_size); - if (st->mmap_size < st->chunk_size) - print_verbose("Compression window is larger than ram allocated, will proceed with unlimited mode possibly much slower\n"); + if (st->chunk_size > control.ramsize) + print_verbose("Compression window is larger than ram, will proceed with unlimited mode possibly much slower\n"); + + if (!passes && !STDIN) { + passes = s.st_size / st->chunk_size + !!(s.st_size % st->chunk_size); + if (passes == 1) + print_verbose("Will take 1 pass\n"); + else + print_verbose("Will take %d passes\n", passes); + } sb.orig_offset = offset; print_maxverbose("Chunk size: %lld\n", st->chunk_size); @@ -871,8 +875,6 @@ retry: gettimeofday(¤t, NULL); /* this will count only when size > window */ if (last.tv_sec > 0) { - if (!passes) - passes = s.st_size / st->chunk_size; elapsed_time = current.tv_sec - start.tv_sec; finish_time = elapsed_time / (pct_base / 100.0); elapsed_hours = (unsigned int)(elapsed_time) / 3600; diff --git a/rzip.h b/rzip.h index d648109..6f2c269 100644 --- a/rzip.h +++ b/rzip.h @@ -19,7 +19,7 @@ #define LRZIP_MAJOR_VERSION 0 #define LRZIP_MINOR_VERSION 5 -#define LRZIP_MINOR_SUBVERSION 41 +#define LRZIP_MINOR_SUBVERSION 42 #define NUM_STREAMS 2 @@ -121,7 +121,6 @@ extern int errno; #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) -typedef unsigned long long u64; typedef long long int i64; typedef uint16_t u16; typedef uint32_t u32; @@ -231,9 +230,11 @@ struct rzip_control { int compression_level; unsigned char lzma_properties[5]; // lzma properties, encoded double threshold; - unsigned long long window; + i64 window; unsigned long flags; - unsigned long long ramsize; + i64 ramsize; + i64 max_chunk; + i64 max_mmap; int threads; int 
nice_val; // added for consistency int major_version; diff --git a/stream.c b/stream.c index 6ee0fed..0f678c1 100644 --- a/stream.c +++ b/stream.c @@ -644,6 +644,7 @@ static int seekto(struct stream_info *sinfo, i64 pos) } static pthread_t *threads; +extern const i64 two_gig; /* open a set of output streams, compressing with the given compression level and algorithm */ @@ -651,7 +652,6 @@ void *open_stream_out(int f, int n, i64 limit) { struct stream_info *sinfo; uchar *testmalloc; - unsigned cwindow; int i; sinfo = malloc(sizeof(*sinfo)); @@ -689,21 +689,9 @@ void *open_stream_out(int f, int n, i64 limit) sinfo->cur_pos = 0; sinfo->fd = f; - if (BITS32) { - /* Largest window we can safely support on 32bit is 2GB */ - if (!control.window || control.window > 20) - control.window = 20; - /* Largest window supported by lzma is 300MB */ - if (LZMA_COMPRESS && control.window > 3) - control.window = 3; - } - cwindow = control.window; - - /* No point making the stream larger than the amount of data */ - if (cwindow) - sinfo->bufsize = MIN(STREAM_BUFSIZE * 10 * cwindow, limit); - else - sinfo->bufsize = limit; + /* Serious limits imposed on 32 bit capabilities */ + if (BITS32) + limit = MIN(limit, two_gig / 3); sinfo->initial_pos = lseek(f, 0, SEEK_CUR); @@ -717,12 +705,19 @@ void *open_stream_out(int f, int n, i64 limit) * ram. We need enough for the 2 streams and for the compression * backend at most, being conservative. */ retest_malloc: - testmalloc = malloc(sinfo->bufsize * (n + 1)); + testmalloc = malloc(limit * (n + 1)); if (!testmalloc) { - sinfo->bufsize = sinfo->bufsize / 10 * 9; + limit = limit / 10 * 9; goto retest_malloc; } free(testmalloc); + print_maxverbose("Succeeded in testing %lld sized malloc for back end compression\n", limit * (n + 1)); + + /* Largest window supported by lzma is 300MB */ + if (LZMA_COMPRESS) + limit = MIN(limit, 3 * STREAM_BUFSIZE * 10); + + sinfo->bufsize = limit; /* Make the bufsize no smaller than STREAM_BUFSIZE. 
Round up the * bufsize to fit X threads into it */