diff --git a/main.c b/main.c
index 3b68893..2460acb 100644
--- a/main.c
+++ b/main.c
@@ -810,6 +810,16 @@ int main(int argc, char *argv[])
 	if (BITS32)
 		control.ramsize = MAX(control.ramsize - 900000000ll, 900000000ll);
 
+	/* Work out the compression overhead per compression thread */
+	if (LZMA_COMPRESS) {
+		int level = control.compression_level * 7 / 9 ? : 1;
+		i64 dictsize = (level <= 5 ? (1 << (level * 2 + 14)) :
+			       (level == 6 ? (1 << 25) : (1 << 26)));
+
+		control.overhead = (dictsize * 23 / 2) + (4 * 1024 * 1024);
+	} else if (ZPAQ_COMPRESS)
+		control.overhead = 112 * 1024 * 1024;
+
 	/* OK, if verbosity set, print summary of options selected */
 	if (!INFO) {
 		if (!TEST_ONLY)
diff --git a/rzip.h b/rzip.h
index 349d6e7..4e5ca1c 100644
--- a/rzip.h
+++ b/rzip.h
@@ -266,6 +266,7 @@ struct rzip_control {
 	FILE *msgout; //stream for output messages
 	const char *suffix;
 	int compression_level;
+	i64 overhead; // compressor overhead
 	unsigned char lzma_properties[5]; // lzma properties, encoded
 	double threshold;
 	i64 window;
diff --git a/stream.c b/stream.c
index d9ee835..dc36d66 100644
--- a/stream.c
+++ b/stream.c
@@ -720,9 +720,12 @@ void prepare_streamout_threads(void)
 
 	/* As we serialise the generation of threads during the rzip
 	 * pre-processing stage, it's faster to have one more thread available
-	 * to keep all CPUs busy. */
+	 * to keep all CPUs busy. There is no point splitting up the chunks
+	 * into multiple threads if there will be no compression back end. */
 	if (control.threads > 1)
 		++control.threads;
+	if (NO_COMPRESS)
+		control.threads = 1;
 	threads = calloc(sizeof(pthread_t), control.threads);
 	if (unlikely(!threads))
 		fatal("Unable to calloc threads in prepare_streamout_threads\n");
@@ -780,20 +783,25 @@ void *open_stream_out(int f, int n, i64 limit, char cbytes)
 	}
 
 	/* Find the largest we can make the window based on ability to malloc
-	 * ram. We need enough for the 2 streams and for the compression
-	 * backend at most, being conservative. We don't need any for the
-	 * backend compression if we won't be doing any.
-	 */
-	testbufs = n;
-	if (!NO_COMPRESS)
-		testbufs++;
+	 * ram. We need 2 buffers for each compression thread and the overhead
+	 * of each compression back end. No 2nd buf is required when there is
+	 * no back end compression. We limit the total regardless to 1/3 ram
+	 * for when the OS lies due to heavy overcommit. */
+	if (NO_COMPRESS)
+		testbufs = 1;
+	else
+		testbufs = 2;
 
 	/* Serious limits imposed on 32 bit capabilities */
 	if (BITS32)
-		limit = MIN(limit, two_gig / testbufs);
+		limit = MIN(limit, (two_gig / testbufs) -
+			(control.overhead * control.threads));
 
+	testsize = (limit * testbufs) + (control.overhead * control.threads);
+	if (testsize > control.ramsize / 3)
+		limit = (control.ramsize / 3 - (control.overhead * control.threads)) / testbufs;
 retest_malloc:
-	testsize = limit * testbufs;
+	testsize = (limit * testbufs) + (control.overhead * control.threads);
 	testmalloc = malloc(testsize);
 	if (!testmalloc) {
 		limit = limit / 10 * 9;
@@ -802,23 +810,25 @@ retest_malloc:
 	free(testmalloc);
 	print_maxverbose("Succeeded in testing %lld sized malloc for back end compression\n", testsize);
 
+	sinfo->max_bufsize = limit / control.threads;
+
 	/* We start with slightly smaller buffers to start loading CPUs as soon
 	 * as possible and make them exponentially larger approaching the
 	 * tested maximum size. We ensure the buffers are of a minimum size,
 	 * though, as compression efficency drops off dramatically with tiny
 	 * buffers. */
 	if (control.threads > 1) {
-		sinfo->max_bufsize = limit / control.threads;
 		sinfo->bufsize = sinfo->max_bufsize * 63 / 100;
 		round_to_page(&sinfo->bufsize);
 		sinfo->bufsize = MAX(sinfo->bufsize, STREAM_BUFSIZE);
-	}
+	} else
+		sinfo->bufsize = sinfo->max_bufsize;
 
 	if (control.threads > 1)
 		print_maxverbose("Using up to %d threads to compress up to %lld bytes each.\n",
 			control.threads, sinfo->max_bufsize);
 	else
-		print_maxverbose("Using 1 thread to compress up to %lld bytes\n",
+		print_maxverbose("Using only 1 thread to compress up to %lld bytes\n",
 			sinfo->bufsize);
 
 	for (i = 0; i < n; i++) {
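
For reference, below is a small standalone sketch (not part of the patch) that reproduces the per-thread LZMA overhead estimate from the main.c hunk and the 1/3-of-ram budget check from open_stream_out(), so the arithmetic can be sanity-checked in isolation. The lzma_overhead() helper and the example ram size, chunk limit and thread count are made up for illustration only; the sketch keeps the GCC "?:" extension that the patch itself uses, so build it with gcc or clang.

#include <stdio.h>
#include <stdint.h>

typedef int64_t i64;

static i64 lzma_overhead(int compression_level)
{
	/* Map lrzip's 1-9 compression level onto lzma's 1-7 range */
	int level = compression_level * 7 / 9 ? : 1;
	i64 dictsize = (level <= 5 ? (1 << (level * 2 + 14)) :
			(level == 6 ? (1 << 25) : (1 << 26)));

	/* Dictionary times 11.5 plus a fixed 4MB cost per thread */
	return (dictsize * 23 / 2) + (4 * 1024 * 1024);
}

int main(void)
{
	i64 ramsize = 16ll * 1024 * 1024 * 1024;	/* pretend 16GB of ram */
	i64 limit = 2ll * 1024 * 1024 * 1024;		/* pretend rzip chunk limit */
	int threads = 4, testbufs = 2;			/* 2 buffers when compressing */
	i64 overhead = lzma_overhead(9);

	i64 testsize = (limit * testbufs) + (overhead * threads);

	printf("overhead per thread: %lld, testsize: %lld\n",
	       (long long)overhead, (long long)testsize);

	/* Clamp the buffer limit to 1/3 of ram, as open_stream_out() now does */
	if (testsize > ramsize / 3)
		limit = (ramsize / 3 - (overhead * threads)) / testbufs;
	printf("buffer limit after 1/3-ram clamp: %lld\n", (long long)limit);

	return 0;
}

At -9 this gives a 1 << 26 (64MB) dictionary and roughly 740MB of estimated overhead per compression thread, which is why the ramsize / 3 clamp can shrink the per-thread buffer limit substantially on smaller machines.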