diff --git a/main.c b/main.c
index 3b68893..2460acb 100644
--- a/main.c
+++ b/main.c
@@ -810,6 +810,16 @@ int main(int argc, char *argv[])
 	if (BITS32)
 		control.ramsize = MAX(control.ramsize - 900000000ll, 900000000ll);
 
+	/* Work out the compression overhead per compression thread */
+	if (LZMA_COMPRESS) {
+		int level = control.compression_level * 7 / 9 ? : 1;
+		i64 dictsize = (level <= 5 ? (1 << (level * 2 + 14)) :
+			       (level == 6 ? (1 << 25) : (1 << 26)));
+
+		control.overhead = (dictsize * 23 / 2) + (4 * 1024 * 1024);
+	} else if (ZPAQ_COMPRESS)
+		control.overhead = 112 * 1024 * 1024;
+
 	/* OK, if verbosity set, print summary of options selected */
 	if (!INFO) {
 		if (!TEST_ONLY)
diff --git a/rzip.h b/rzip.h
index 349d6e7..4e5ca1c 100644
--- a/rzip.h
+++ b/rzip.h
@@ -266,6 +266,7 @@ struct rzip_control {
 	FILE *msgout; //stream for output messages
 	const char *suffix;
 	int compression_level;
+	i64 overhead; // compressor overhead
 	unsigned char lzma_properties[5]; // lzma properties, encoded
 	double threshold;
 	i64 window;
diff --git a/stream.c b/stream.c
index d9ee835..dc36d66 100644
--- a/stream.c
+++ b/stream.c
@@ -720,9 +720,12 @@ void prepare_streamout_threads(void)
 
 	/* As we serialise the generation of threads during the rzip
 	 * pre-processing stage, it's faster to have one more thread available
-	 * to keep all CPUs busy. */
+	 * to keep all CPUs busy. There is no point splitting up the chunks
+	 * into multiple threads if there will be no compression back end. */
 	if (control.threads > 1)
 		++control.threads;
+	if (NO_COMPRESS)
+		control.threads = 1;
 	threads = calloc(sizeof(pthread_t), control.threads);
 	if (unlikely(!threads))
 		fatal("Unable to calloc threads in prepare_streamout_threads\n");
@@ -780,20 +783,25 @@ void *open_stream_out(int f, int n, i64 limit, char cbytes)
 	}
 
 	/* Find the largest we can make the window based on ability to malloc
-	 * ram. We need enough for the 2 streams and for the compression
-	 * backend at most, being conservative. We don't need any for the
-	 * backend compression if we won't be doing any.
-	 */
-	testbufs = n;
-	if (!NO_COMPRESS)
-		testbufs++;
+	 * ram. We need 2 buffers for each compression thread and the overhead
+	 * of each compression back end. No 2nd buf is required when there is
+	 * no back end compression. We limit the total regardless to 1/3 ram
+	 * for when the OS lies due to heavy overcommit. */
+	if (NO_COMPRESS)
+		testbufs = 1;
+	else
+		testbufs = 2;
 
 	/* Serious limits imposed on 32 bit capabilities */
 	if (BITS32)
-		limit = MIN(limit, two_gig / testbufs);
+		limit = MIN(limit, (two_gig / testbufs) -
+			(control.overhead * control.threads));
 
+	testsize = (limit * testbufs) + (control.overhead * control.threads);
+	if (testsize > control.ramsize / 3)
+		limit = (control.ramsize / 3 - (control.overhead * control.threads)) / testbufs;
 retest_malloc:
-	testsize = limit * testbufs;
+	testsize = (limit * testbufs) + (control.overhead * control.threads);
 	testmalloc = malloc(testsize);
 	if (!testmalloc) {
 		limit = limit / 10 * 9;
@@ -802,23 +810,25 @@ retest_malloc:
 	free(testmalloc);
 	print_maxverbose("Succeeded in testing %lld sized malloc for back end compression\n", testsize);
 
+	sinfo->max_bufsize = limit / control.threads;
+
 	/* We start with slightly smaller buffers to start loading CPUs as soon
 	 * as possible and make them exponentially larger approaching the
 	 * tested maximum size. We ensure the buffers are of a minimum size,
 	 * though, as compression efficency drops off dramatically with tiny
 	 * buffers. */
 	if (control.threads > 1) {
-		sinfo->max_bufsize = limit / control.threads;
 		sinfo->bufsize = sinfo->max_bufsize * 63 / 100;
 		round_to_page(&sinfo->bufsize);
 		sinfo->bufsize = MAX(sinfo->bufsize, STREAM_BUFSIZE);
-	}
+	} else
+		sinfo->bufsize = sinfo->max_bufsize;
 
 	if (control.threads > 1)
 		print_maxverbose("Using up to %d threads to compress up to %lld bytes each.\n",
 			control.threads, sinfo->max_bufsize);
 	else
-		print_maxverbose("Using 1 thread to compress up to %lld bytes\n",
+		print_maxverbose("Using only 1 thread to compress up to %lld bytes\n",
 			sinfo->bufsize);
 
 	for (i = 0; i < n; i++) {
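
For reference, below is a small standalone sketch (not part of the patch) that reproduces the per-thread LZMA overhead estimate from the main.c hunk and the 1/3-of-ram budget check from open_stream_out(), so the arithmetic can be sanity-checked in isolation. The lzma_overhead() helper and the example ram size, chunk limit and thread count are made up for illustration only; the sketch keeps the GCC "?:" extension that the patch itself uses, so build it with gcc or clang.

#include <stdio.h>
#include <stdint.h>

typedef int64_t i64;

static i64 lzma_overhead(int compression_level)
{
	/* Map lrzip's 1-9 compression level onto lzma's 1-7 range */
	int level = compression_level * 7 / 9 ? : 1;
	i64 dictsize = (level <= 5 ? (1 << (level * 2 + 14)) :
			(level == 6 ? (1 << 25) : (1 << 26)));

	/* Dictionary times 11.5 plus a fixed 4MB cost per thread */
	return (dictsize * 23 / 2) + (4 * 1024 * 1024);
}

int main(void)
{
	i64 ramsize = 16ll * 1024 * 1024 * 1024;	/* pretend 16GB of ram */
	i64 limit = 2ll * 1024 * 1024 * 1024;		/* pretend rzip chunk limit */
	int threads = 4, testbufs = 2;			/* 2 buffers when compressing */
	i64 overhead = lzma_overhead(9);

	i64 testsize = (limit * testbufs) + (overhead * threads);

	printf("overhead per thread: %lld, testsize: %lld\n",
	       (long long)overhead, (long long)testsize);

	/* Clamp the buffer limit to 1/3 of ram, as open_stream_out() now does */
	if (testsize > ramsize / 3)
		limit = (ramsize / 3 - (overhead * threads)) / testbufs;
	printf("buffer limit after 1/3-ram clamp: %lld\n", (long long)limit);

	return 0;
}

At -9 this gives a 1 << 26 (64MB) dictionary and roughly 740MB of estimated overhead per compression thread, which is why the ramsize / 3 clamp can shrink the per-thread buffer limit substantially on smaller machines.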