/*
   Copyright (C) 2006-2011 Con Kolivas
   Copyright (C) 1998 Andrew Tridgell

   Modified to use flat hash, memory limit and variable hash culling
   by Rusty Russell copyright (C) 2003.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

/* rzip compression algorithm */

#include "rzip.h"

#define CHUNK_MULTIPLE (100 * 1024 * 1024)
#define CKSUM_CHUNK (1024 * 1024)
#define GREAT_MATCH 1024
#define MINIMUM_MATCH 31

/* Hash table works as follows. We start by throwing tags at every
 * offset into the table. As it fills, we start eliminating tags
 * which don't have lower bits set to one (ie. first we eliminate all
 * even tags, then all tags divisible by four, etc.). This ensures
 * that on average, all parts of the file are covered by the hash, if
 * sparsely. */
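/* For example: once the mask reaches 0b1 only odd tags survive culling,
 * at 0b11 only one tag in four on average, and so on, roughly halving
 * the table population at each step while still sampling offsets from
 * the whole file. */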
typedef i64 tag;

/* All zero means empty. We might miss the first chunk this way. */
struct hash_entry {
	i64 offset;
	tag t;
};

/* Levels control hashtable size and bzip2 level. */
static struct level {
	unsigned long mb_used;
	unsigned initial_freq;
	unsigned max_chain_len;
} levels[10] = {
	{ 1, 4, 1 },
	{ 2, 4, 2 },
	{ 4, 4, 2 },
	{ 8, 4, 2 },
	{ 16, 4, 3 },
	{ 32, 4, 4 },
	{ 32, 2, 6 },
	{ 64, 1, 16 }, /* More MB makes sense, but need bigger test files */
	{ 64, 1, 32 },
	{ 64, 1, 128 },
};

struct rzip_state {
	void *ss;
	struct level *level;
	tag hash_index[256];
	struct hash_entry *hash_table;
	i64 hash_bits;
	i64 hash_count;
	i64 hash_limit;
	tag minimum_tag_mask;
	i64 tag_clean_ptr;
	i64 last_match;
	i64 chunk_size;
	i64 mmap_size;
	char chunk_bytes;
	uint32_t cksum;
	int fd_in, fd_out;
	int stdin_eof;
	struct {
		i64 inserts;
		i64 literals;
		i64 literal_bytes;
		i64 matches;
		i64 match_bytes;
		i64 tag_hits;
		i64 tag_misses;
	} stats;
};

struct sliding_buffer {
	uchar *buf_low;		/* The low window buffer */
	uchar *buf_high;	/* "" high "" */
	i64 orig_offset;	/* Where the original buffer started */
	i64 offset_low;		/* Current offset of the low buffer */
	i64 offset_high;	/* "" high buffer "" */
	i64 offset_search;	/* Where the search is up to */
	i64 orig_size;		/* How big the full buffer would be */
	i64 size_low;		/* How big the low buffer is */
	i64 size_high;		/* "" high "" */
	i64 high_length;	/* How big the high buffer should be */
	int fd;			/* The fd of the mmap */
} sb;	/* Sliding buffer */

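/* Slide the low mmap window forward so it starts at the current search
 * offset (rounded to a page boundary), shrinking it if it would
 * otherwise run past the end of the chunk. */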
static void remap_low_sb(void)
{
	i64 new_offset;

	new_offset = sb.offset_search;
	round_to_page(&new_offset);
	print_maxverbose("Sliding main buffer to offset %lld\n", new_offset);
	if (unlikely(munmap(sb.buf_low, sb.size_low)))
		fatal("Failed to munmap in remap_low_sb\n");
	if (new_offset + sb.size_low > sb.orig_size)
		sb.size_low = sb.orig_size - new_offset;
	sb.offset_low = new_offset;
	sb.buf_low = (uchar *)mmap(sb.buf_low, sb.size_low, PROT_READ, MAP_SHARED, sb.fd, sb.orig_offset + sb.offset_low);
	if (unlikely(sb.buf_low == MAP_FAILED))
		fatal("Failed to re mmap in remap_low_sb\n");
}

static inline void remap_high_sb(i64 p)
{
	if (unlikely(munmap(sb.buf_high, sb.size_high)))
		fatal("Failed to munmap in remap_high_sb\n");
	sb.size_high = sb.high_length; /* In case we shrunk it when we hit the end of the file */
	sb.offset_high = p;
	/* Make sure the offset is page aligned relative to the start of the file */
	sb.offset_high -= (sb.offset_high + sb.orig_offset) % control.page_size;
	if (unlikely(sb.offset_high + sb.size_high > sb.orig_size))
		sb.size_high = sb.orig_size - sb.offset_high;
	sb.buf_high = (uchar *)mmap(sb.buf_high, sb.size_high, PROT_READ, MAP_SHARED, sb.fd, sb.orig_offset + sb.offset_high);
	if (unlikely(sb.buf_high == MAP_FAILED))
		fatal("Failed to re mmap in remap_high_sb\n");
}

/* We use a "sliding mmap" to effectively read more than we can fit into the
 * compression window. This is done by using a maximally sized lower mmap at
 * the beginning of the block which slides up once the hash search moves beyond
 * it, and a 64k mmap block that slides up and down as required for any
 * offsets outside the range of the lower one. This is much slower than a
 * straight mmap of the whole chunk but makes it possible to have unlimited
 * sized compression windows. */
static uchar *get_sb(i64 p)
{
	i64 low_end = sb.offset_low + sb.size_low;

	if (unlikely(sb.offset_search > low_end))
		remap_low_sb();
	if (p >= sb.offset_low && p < low_end)
		return (sb.buf_low + p - sb.offset_low);
	if (p >= sb.offset_high && p < (sb.offset_high + sb.size_high))
		return (sb.buf_high + (p - sb.offset_high));
	/* p is not within the low or high buffer range */
	remap_high_sb(p);
	return (sb.buf_high + (p - sb.offset_high));
}

/* All put_u8/u32/vchars go to stream 0 */
static inline void put_u8(void *ss, uchar b)
{
	if (unlikely(write_stream(ss, 0, &b, 1)))
		fatal("Failed to put_u8\n");
}

static inline void put_u32(void *ss, uint32_t s)
{
	if (unlikely(write_stream(ss, 0, (uchar *)&s, 4)))
		fatal("Failed to put_u32\n");
}
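
/* put_vchars below writes the value least significant byte first: e.g.
 * a 3 byte offset 0x012345 is stored as 0x45, 0x23, 0x01, so the reader
 * can rebuild it by shifting each byte back into place. */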
/* Put a variable number of bytes depending on how big the chunk is */
static inline void put_vchars(void *ss, i64 s, int length)
{
	int bytes;

	for (bytes = 0; bytes < length; bytes++) {
		int bits = bytes * 8;
		uchar sb = (s >> bits) & (i64)0xFF;

		put_u8(ss, sb);
	}
}

static void put_header(void *ss, uchar head, i64 len)
{
	put_u8(ss, head);
	put_vchars(ss, len, 2);
}
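
/* Matches longer than 0xFFFF bytes are split into multiple match
 * records below, since put_header only stores a two byte length. */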
static void put_match(struct rzip_state *st, i64 p, i64 offset, i64 len)
{
	do {
		i64 ofs;
		i64 n = len;

		if (n > 0xFFFF)
			n = 0xFFFF;

		ofs = (p - offset);
		put_header(st->ss, 1, n);
		put_vchars(st->ss, ofs, st->chunk_bytes);

		st->stats.matches++;
		st->stats.match_bytes += n;
		len -= n;
		p += n;
		offset += n;
	} while (len);
}

/* write some data to a stream mmap encoded. Return -1 on failure */
int write_sbstream(void *ss, int stream, i64 p, i64 len)
{
	struct stream_info *sinfo = ss;

	while (len) {
		i64 n, i;

		n = MIN(sinfo->bufsize - sinfo->s[stream].buflen, len);

		for (i = 0; i < n; i++) {
			memcpy(sinfo->s[stream].buf + sinfo->s[stream].buflen + i,
			       get_sb(p + i), 1);
		}
		sinfo->s[stream].buflen += n;
		p += n;
		len -= n;

		if (sinfo->s[stream].buflen == sinfo->bufsize)
			flush_buffer(sinfo, stream);
	}
	return 0;
}

static void put_literal(struct rzip_state *st, i64 last, i64 p)
{
	do {
		i64 len = p - last;

		if (len > 0xFFFF)
			len = 0xFFFF;

		st->stats.literals++;
		st->stats.literal_bytes += len;

		put_header(st->ss, 0, len);

		if (unlikely(len && write_sbstream(st->ss, 1, last, len)))
			fatal("Failed to write_stream in put_literal\n");
		last += len;
	} while (p > last);
}

/* Could give a false positive on offset 0. Who cares. */
static int empty_hash(struct rzip_state *st, i64 h)
{
	return !st->hash_table[h].offset && !st->hash_table[h].t;
}

static i64 primary_hash(struct rzip_state *st, tag t)
{
	return t & ((1 << st->hash_bits) - 1);
}

static inline tag increase_mask(tag tag_mask)
{
	/* Get more precise. */
	return (tag_mask << 1) | 1;
}
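
/* Returns 1 if tag t does not satisfy the next, stricter mask and is
 * therefore due to be culled in the current cleaning sweep. */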
static int minimum_bitness(struct rzip_state *st, tag t)
{
	tag better_than_min = increase_mask(st->minimum_tag_mask);

	if ((t & better_than_min) != better_than_min)
		return 1;
	return 0;
}

/* Is a going to be cleaned before b? ie. does a have fewer low bits
 * set than b? */
static int lesser_bitness(tag a, tag b)
{
	tag mask;

	for (mask = 0; mask != (tag)-1; mask = ((mask << 1) | 1)) {
		if ((a & b & mask) != mask)
			break;
	}
	return ((a & mask) < (b & mask));
}

/* If a hash bucket is taken, we spill into the next bucket(s). Secondary
   hashing works better in theory, but modern caches make this 20% faster. */
static void insert_hash(struct rzip_state *st, tag t, i64 offset)
{
	i64 h, victim_h = 0, round = 0;
	/* If we need to kill one, this will be it. */
	static i64 victim_round = 0;

	h = primary_hash(st, t);
	while (!empty_hash(st, h)) {
		/* If this is due for cleaning anyway, just replace it:
		   rehashing might move it behind tag_clean_ptr. */
		if (minimum_bitness(st, st->hash_table[h].t)) {
			st->hash_count--;
			break;
		}
		/* If we are better than the current occupant, we can't
		   jump over it: it will be cleaned before us, and
		   no one would then find us in the hash table. Rehash
		   it, then take its place. */
		if (lesser_bitness(st->hash_table[h].t, t)) {
			insert_hash(st, st->hash_table[h].t,
				    st->hash_table[h].offset);
			break;
		}

		/* If we have lots of identical patterns, we end up
		   with lots of the same hash number. Discard at random. */
		if (st->hash_table[h].t == t) {
			if (round == victim_round)
				victim_h = h;
			if (++round == st->level->max_chain_len) {
				h = victim_h;
				st->hash_count--;
				victim_round++;
				if (victim_round == st->level->max_chain_len)
					victim_round = 0;
				break;
			}
		}

		h++;
		h &= ((1 << st->hash_bits) - 1);
	}

	st->hash_table[h].t = t;
	st->hash_table[h].offset = offset;
}

/* Eliminate one hash entry with the minimum number of lower bits set.
   Returns the tag requirement for any new entries. */
static tag clean_one_from_hash(struct rzip_state *st)
{
	tag better_than_min;

again:
	better_than_min = increase_mask(st->minimum_tag_mask);
	if (!st->tag_clean_ptr)
		print_maxverbose("Starting sweep for mask %u\n", (unsigned int)st->minimum_tag_mask);

	for (; st->tag_clean_ptr < (1U << st->hash_bits); st->tag_clean_ptr++) {
		if (empty_hash(st, st->tag_clean_ptr))
			continue;
		if ((st->hash_table[st->tag_clean_ptr].t & better_than_min)
		    != better_than_min) {
			st->hash_table[st->tag_clean_ptr].offset = 0;
			st->hash_table[st->tag_clean_ptr].t = 0;
			st->hash_count--;
			return better_than_min;
		}
	}

	/* We hit the end: everything in the hash satisfies the better mask. */
	st->minimum_tag_mask = better_than_min;
	st->tag_clean_ptr = 0;
	goto again;
}
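
/* The tag is a rolling XOR over the MINIMUM_MATCH bytes ending at the
 * current offset: next_tag xors out the hash_index entry for the byte
 * leaving the window and xors in the entry for the byte entering it,
 * giving the same value full_tag computes from scratch. */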
static inline tag next_tag(struct rzip_state *st, i64 p, tag t)
{
	t ^= st->hash_index[*get_sb(p - 1)];
	t ^= st->hash_index[*get_sb(p + MINIMUM_MATCH - 1)];
	return t;
}

static inline tag full_tag(struct rzip_state *st, i64 p)
{
	tag ret = 0;
	int i;

	for (i = 0; i < MINIMUM_MATCH; i++)
		ret ^= st->hash_index[*get_sb(p + i)];
	return ret;
}
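
/* Length of the match between the data at p0 and the earlier offset op,
 * extended forwards up to end and then backwards no further than the
 * last match already emitted; the backward extension is returned in
 * *rev. Returns 0 for anything shorter than MINIMUM_MATCH. */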
static inline i64 match_len(struct rzip_state *st, i64 p0, i64 op, i64 end,
			    i64 *rev)
{
	i64 p = p0;
	i64 len = 0;

	if (op >= p0)
		return 0;

	while ((*get_sb(p) == *get_sb(op)) && (p < end)) {
		p++;
		op++;
	}
	len = p - p0;

	p = p0;
	op -= len;

	end = 0;
	if (end < st->last_match)
		end = st->last_match;

	while (p > end && op > 0 && *get_sb(op - 1) == *get_sb(p - 1)) {
		op--;
		p--;
	}

	(*rev) = p0 - p;
	len += p0 - p;

	if (len < MINIMUM_MATCH)
		return 0;

	return len;
}

static i64 find_best_match(struct rzip_state *st, tag t, i64 p, i64 end,
			   i64 *offset, i64 *reverse)
{
	i64 length = 0;
	i64 h, best_h;
	i64 rev;

	rev = 0;
	*reverse = 0;

	/* Could optimise: if lesser goodness, can stop search. But
	 * chains are usually short anyway. */
	h = primary_hash(st, t);
	while (!empty_hash(st, h)) {
		i64 mlen;

		if (t == st->hash_table[h].t) {
			mlen = match_len(st, p, st->hash_table[h].offset, end,
					 &rev);

			if (mlen)
				st->stats.tag_hits++;
			else
				st->stats.tag_misses++;

			if (mlen >= length) {
				length = mlen;
				(*offset) = st->hash_table[h].offset - rev;
				(*reverse) = rev;
				best_h = h;
			}
		}

		h++;
		h &= ((1 << st->hash_bits) - 1);
	}

	return length;
}

static void show_distrib(struct rzip_state *st)
{
	i64 primary = 0;
	i64 total = 0;
	i64 i;

	for (i = 0; i < (1U << st->hash_bits); i++) {
		if (empty_hash(st, i))
			continue;
		total++;
		if (primary_hash(st, st->hash_table[i].t) == i)
			primary++;
	}

	if (total != st->hash_count)
		print_err("WARNING: hash_count says total %lld\n", st->hash_count);

	print_output("%lld total hashes -- %lld in primary bucket (%-2.3f%%)\n", total, primary,
		     primary * 100.0 / total);
}
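
/* The main rzip pre-processing pass: roll the tag across the chunk one
 * byte at a time, look each qualifying tag up in the hash table, and
 * emit literal and match records as matches are confirmed, CRC and MD5
 * checksumming the data as it goes. */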
static void hash_search(struct rzip_state *st, double pct_base, double pct_multiple)
{
	i64 cksum_limit = 0, p, end;
	int lastpct = 0, last_chunkpct = 0;
	tag t = 0;
	struct {
		i64 p;
		i64 ofs;
		i64 len;
	} current;

	tag tag_mask = (1 << st->level->initial_freq) - 1;

	if (st->hash_table)
		memset(st->hash_table, 0, sizeof(st->hash_table[0]) * (1 << st->hash_bits));
	else {
		i64 hashsize = st->level->mb_used *
			       (1024 * 1024 / sizeof(st->hash_table[0]));

		for (st->hash_bits = 0; (1U << st->hash_bits) < hashsize; st->hash_bits++);

		print_maxverbose("hashsize = %lld. bits = %lld. %luMB\n",
				 hashsize, st->hash_bits, st->level->mb_used);

		/* 66% full at max. */
		st->hash_limit = (1 << st->hash_bits) / 3 * 2;
		st->hash_table = calloc(sizeof(st->hash_table[0]), (1 << st->hash_bits));
	}

	if (unlikely(!st->hash_table))
		fatal("Failed to allocate hash table in hash_search\n");

	st->minimum_tag_mask = tag_mask;
	st->tag_clean_ptr = 0;
	st->cksum = 0;
	st->hash_count = 0;

	p = 0;
	end = st->chunk_size - MINIMUM_MATCH;
	st->last_match = p;
	current.len = 0;
	current.p = p;
	current.ofs = 0;

	t = full_tag(st, p);

	while (p < end) {
		i64 reverse, mlen, offset = 0;

		p++;
		sb.offset_search = p;
		t = next_tag(st, p, t);

		/* Don't look for a match if there are no tags with
		   this number of bits in the hash table. */
		if ((t & st->minimum_tag_mask) != st->minimum_tag_mask)
			continue;

		mlen = find_best_match(st, t, p, end, &offset, &reverse);

		/* Only insert occasionally into the hash. */
		if ((t & tag_mask) == tag_mask) {
			st->stats.inserts++;
			st->hash_count++;
			insert_hash(st, t, p);
			if (st->hash_count > st->hash_limit)
				tag_mask = clean_one_from_hash(st);
		}

		if (mlen > current.len) {
			current.p = p - reverse;
			current.len = mlen;
			current.ofs = offset;
		}

		if ((current.len >= GREAT_MATCH || p >= current.p + MINIMUM_MATCH)
		    && current.len >= MINIMUM_MATCH) {
			if (st->last_match < current.p)
				put_literal(st, st->last_match, current.p);
			put_match(st, current.p, current.ofs, current.len);
			st->last_match = current.p + current.len;
			current.p = p = st->last_match;
			current.len = 0;
			t = full_tag(st, p);
		}

		if (unlikely(p % 128 == 0)) {
			int pct, chunk_pct;

			pct = pct_base + (pct_multiple * (100.0 * p) /
			      st->chunk_size);
			chunk_pct = p / (end / 100);
			if (pct != lastpct || chunk_pct != last_chunkpct) {
				if (!STDIN)
					print_progress("Total: %2d%%  ", pct);
				print_progress("Chunk: %2d%%\r", chunk_pct);
				lastpct = pct;
				last_chunkpct = chunk_pct;
			}
		}

		if (p > (i64)cksum_limit) {
			i64 i, n = MIN(st->chunk_size - p, control.page_size);
			uchar *ckbuf = malloc(n);

			if (unlikely(!ckbuf))
				fatal("Failed to malloc ckbuf in hash_search\n");
			for (i = 0; i < n; i++)
				memcpy(ckbuf + i, get_sb(cksum_limit + i), 1);
			st->cksum = CrcUpdate(st->cksum, ckbuf, n);
			md5_process_bytes(ckbuf, n, &control.ctx);
			cksum_limit += n;
			free(ckbuf);
		}
	}

	if (MAX_VERBOSE)
		show_distrib(st);

	if (st->last_match < st->chunk_size)
		put_literal(st, st->last_match, st->chunk_size);

	if (st->chunk_size > cksum_limit) {
		i64 i, n = st->chunk_size - cksum_limit;
		uchar *ckbuf = malloc(n);

		if (unlikely(!ckbuf))
			fatal("Failed to malloc ckbuf in hash_search\n");
		for (i = 0; i < n; i++)
			memcpy(ckbuf + i, get_sb(cksum_limit + i), 1);
		st->cksum = CrcUpdate(st->cksum, ckbuf, n);
		md5_process_bytes(ckbuf, n, &control.ctx);
		cksum_limit += n;
		free(ckbuf);
	}

	put_literal(st, 0, 0);
	put_u32(st->ss, st->cksum);
}

static void init_hash_indexes(struct rzip_state *st)
{
	int i;

	for (i = 0; i < 256; i++)
		st->hash_index[i] = ((random() << 16) ^ random());
}

extern const i64 one_g;

/* mremap substitute, presumably for platforms where mremap is
 * unavailable (an assumption; the substitution would happen in rzip.h
 * or the build system). Unlike a real mremap the old contents are not
 * preserved: the old region is unmapped and a fresh anonymous one
 * mapped in its place. */
static inline void *fake_mremap(void *old_address, size_t old_size, size_t new_size, int flags)
{
	flags = 0;	/* unused; assigned to silence compiler warnings */
	munmap(old_address, old_size);
	return mmap(old_address, new_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
}

/* stdin is not file backed so we have to emulate the mmap by mapping
 * anonymous ram and reading stdin into it. It means the maximum ram
 * we can use will be less, but we will already have determined this in
 * rzip_chunk */
static void mmap_stdin(uchar *buf, struct rzip_state *st)
{
	i64 len = st->chunk_size;
	uchar *offset_buf = buf;
	ssize_t ret;
	i64 total;

	total = 0;
	while (len > 0) {
		if (len > one_g)
			ret = one_g;
		else
			ret = len;
		ret = read(0, offset_buf, (size_t)ret);
		if (unlikely(ret < 0))
			fatal("Failed to read in mmap_stdin\n");
		total += ret;
		if (ret == 0) {
			/* Should be EOF */
			print_maxverbose("Shrinking chunk to %lld\n", total);
			buf = (uchar *)mremap(buf, st->chunk_size, total, 0);
			if (unlikely(buf == MAP_FAILED))
				fatal("Failed to remap to smaller buf in mmap_stdin\n");
			st->mmap_size = st->chunk_size = total;
			st->stdin_eof = 1;
			break;
		}
		offset_buf += ret;
		len -= ret;
	}
	control.st_size += total;
}

static void init_sliding_mmap(struct rzip_state *st, int fd_in, i64 offset)
{
	/* Initialise the high buffer */
	if (!STDIN) {
		sb.high_length = 65536;
		/* Round up to the next biggest page size */
		if (sb.high_length % control.page_size)
			sb.high_length += control.page_size - (sb.high_length % control.page_size);
		sb.buf_high = (uchar *)mmap(NULL, sb.high_length, PROT_READ, MAP_SHARED, fd_in, offset);
		if (unlikely(sb.buf_high == MAP_FAILED))
			fatal("Unable to mmap buf_high in init_sliding_mmap\n");
		sb.size_high = sb.high_length;
		sb.offset_high = 0;
	}
	sb.offset_low = 0;
	sb.offset_search = 0;
	sb.size_low = st->mmap_size;
	sb.orig_size = st->chunk_size;
	sb.fd = fd_in;
}

/* compress a chunk of an open file. Assumes that the file is able to
   be mmap'd and is seekable */
static void rzip_chunk(struct rzip_state *st, int fd_in, int fd_out, i64 offset,
		       double pct_base, double pct_multiple)
{
	init_sliding_mmap(st, fd_in, offset);

	st->ss = open_stream_out(fd_out, NUM_STREAMS, st->chunk_size, st->chunk_bytes);
	if (unlikely(!st->ss))
		fatal("Failed to open streams in rzip_chunk\n");

	print_verbose("Beginning rzip pre-processing phase\n");
	hash_search(st, pct_base, pct_multiple);

	/* unmap buffer before closing and reallocating streams */
	if (unlikely(munmap(sb.buf_low, sb.size_low)))
		fatal("Failed to munmap in rzip_chunk\n");
	if (!STDIN) {
		if (unlikely(munmap(sb.buf_high, sb.size_high)))
			fatal("Failed to munmap in rzip_chunk\n");
	}

	if (unlikely(close_stream_out(st->ss)))
		fatal("Failed to flush/close streams in rzip_chunk\n");
}

/* Needs to be less than 31 bits and page aligned on 32 bits */
const i64 two_gig = (1ull << 31) - 4096;

/* compress a whole file a chunk at a time */
void rzip_fd(int fd_in, int fd_out)
{
	/* add timers for ETA estimates
	 * Base it off the file size and number of iterations required
	 * depending on compression window size
	 * Track elapsed time and estimated time to go
	 * If file size < compression window, can't do
	 */
	struct timeval current, start, last;
	struct stat s, s2;
	struct rzip_state *st;
	i64 len = 0, last_chunk = 0;
	int pass = 0, passes, j;
	unsigned int eta_hours, eta_minutes, eta_seconds, elapsed_hours,
		     elapsed_minutes, elapsed_seconds;
	double finish_time, elapsed_time, chunkmbs;
	char md5_resblock[MD5_DIGEST_SIZE];

	md5_init_ctx(&control.ctx);

	st = calloc(sizeof(*st), 1);
	if (unlikely(!st))
		fatal("Failed to allocate control state in rzip_fd\n");

	if (LZO_COMPRESS) {
		if (unlikely(lzo_init() != LZO_E_OK))
			fatal("lzo_init() failed\n");
	}

	if (unlikely(fstat(fd_in, &s)))
		fatal("Failed to stat fd_in in rzip_fd - %s\n", strerror(errno));

	if (!STDIN) {
		len = control.st_size = s.st_size;
		print_verbose("File size: %lld\n", len);
	} else
		control.st_size = 0;

	/* Optimal use of ram involves using no more than 2/3 of it, so we
	 * allocate 1/3 of it to the main buffer and use a sliding mmap
	 * buffer to work on 2/3 ram size, leaving enough ram for the
	 * compression backends */
	control.max_mmap = control.ramsize / 3;

	/* On 32 bits we can have a big window with sliding mmap, but can
	 * not enable much per mmap/malloc */
	if (BITS32)
		control.max_mmap = MIN(control.max_mmap, two_gig);
	round_to_page(&control.max_mmap);

	/* Set the maximum chunk size to 2/3 of ram if not unlimited or
	 * specified by a control window. When it's smaller than the file
	 * size, round it to page size for efficiency. */
	if (UNLIMITED)
		control.max_chunk = control.st_size;
	else if (control.window)
		control.max_chunk = MIN(control.max_chunk, control.window * CHUNK_MULTIPLE);
	else
		control.max_chunk = control.ramsize / 3 * 2;
	control.max_mmap = MIN(control.max_mmap, control.max_chunk);
	if (control.max_mmap < control.st_size)
		round_to_page(&control.max_chunk);

	if (!STDIN)
		st->chunk_size = MIN(control.max_chunk, len);
	else
		st->chunk_size = control.max_mmap;
	if (st->chunk_size < len)
		round_to_page(&st->chunk_size);
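
	/* As an illustrative example (not from the original source): with
	 * 3GB of ram and no window override, max_mmap is ~1GB and
	 * max_chunk ~2GB, so any file bigger than ~1GB is worked on via
	 * the sliding mmap rather than a single flat mapping. */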

	st->level = &levels[control.compression_level];
	st->fd_in = fd_in;
	st->fd_out = fd_out;
	st->stdin_eof = 0;

	init_hash_indexes(st);

	passes = 0;

	/* set timers and chunk counter */
	last.tv_sec = last.tv_usec = 0;
	gettimeofday(&start, NULL);

	prepare_streamout_threads();

	while (len > 0 || (STDIN && !st->stdin_eof)) {
		double pct_base, pct_multiple;
		i64 offset = s.st_size - len;
		int bits = 8;

		st->chunk_size = control.max_chunk;
		st->mmap_size = control.max_mmap;
		if (!STDIN) {
			st->chunk_size = MIN(st->chunk_size, len);
			st->mmap_size = MIN(st->mmap_size, len);
		}

retry:
		if (STDIN) {
			/* NOTE the buf is saved here for STDIN mode */
			sb.buf_low = mmap(NULL, st->mmap_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
			/* Better to shrink the window to the largest size that works than to fail */
			if (sb.buf_low == MAP_FAILED) {
				if (unlikely(errno != ENOMEM))
					fatal("Failed to mmap %s\n", control.infile);
				st->mmap_size = st->mmap_size / 10 * 9;
				round_to_page(&st->mmap_size);
				if (unlikely(!st->mmap_size))
					fatal("Unable to mmap any ram\n");
				goto retry;
			}
			st->chunk_size = st->mmap_size;
			mmap_stdin(sb.buf_low, st);
		} else {
			/* NOTE The buf is saved here for !STDIN mode */
			sb.buf_low = (uchar *)mmap(sb.buf_low, st->mmap_size, PROT_READ, MAP_SHARED, fd_in, offset);
			if (sb.buf_low == MAP_FAILED) {
				if (unlikely(errno != ENOMEM))
					fatal("Failed to mmap %s\n", control.infile);
				st->mmap_size = st->mmap_size / 10 * 9;
				round_to_page(&st->mmap_size);
				if (unlikely(!st->mmap_size))
					fatal("Unable to mmap any ram\n");
				goto retry;
			}
			if (st->mmap_size < st->chunk_size)
				print_maxverbose("Enabling sliding mmap mode and using mmap of %lld bytes with window of %lld bytes\n", st->mmap_size, st->chunk_size);
		}
		print_maxverbose("Succeeded in testing %lld sized mmap for rzip pre-processing\n", st->mmap_size);

		if (st->chunk_size > control.ramsize)
			print_verbose("Compression window is larger than ram, will proceed with unlimited mode, possibly much slower\n");

		if (!passes && !STDIN) {
			passes = s.st_size / st->chunk_size + !!(s.st_size % st->chunk_size);
			if (passes == 1)
				print_verbose("Will take 1 pass\n");
			else
				print_verbose("Will take %d passes\n", passes);
		}

		sb.orig_offset = offset;
		print_maxverbose("Chunk size: %lld\n", st->chunk_size);

		/* Determine the chunk byte width to write to the file.
		 * This allows archives of different chunk sizes to have
		 * optimal byte width entries. When working with stdin we
		 * won't know in advance how big it is, so it will always be
		 * rounded up to the window size. */
		while (st->chunk_size >> bits > 0)
			bits++;
		st->chunk_bytes = bits / 8;
		if (bits % 8)
			st->chunk_bytes++;
		/* e.g. a 200MB chunk needs 28 bits and so 4 byte match offsets */
		print_maxverbose("Byte width: %d\n", st->chunk_bytes);

		pct_base = (100.0 * (s.st_size - len)) / s.st_size;
		pct_multiple = ((double)st->chunk_size) / s.st_size;
		pass++;

		gettimeofday(&current, NULL);
		/* this will count only when size > window */
		if (last.tv_sec > 0) {
			elapsed_time = current.tv_sec - start.tv_sec;
			finish_time = elapsed_time / (pct_base / 100.0);
			elapsed_hours = (unsigned int)(elapsed_time) / 3600;
			elapsed_minutes = (unsigned int)(elapsed_time - elapsed_hours * 3600) / 60;
			elapsed_seconds = (unsigned int)elapsed_time - elapsed_hours * 3600 - elapsed_minutes * 60;
			eta_hours = (unsigned int)(finish_time - elapsed_time) / 3600;
			eta_minutes = (unsigned int)((finish_time - elapsed_time) - eta_hours * 3600) / 60;
			eta_seconds = (unsigned int)(finish_time - elapsed_time) - eta_hours * 3600 - eta_minutes * 60;
			chunkmbs = (last_chunk / 1024 / 1024) / (double)(current.tv_sec - last.tv_sec);
			if (!STDIN)
				print_verbose("\nPass %d / %d -- Elapsed Time: %02d:%02d:%02d. ETA: %02d:%02d:%02d. Compress Speed: %3.3fMB/s.\n",
					      pass, passes, elapsed_hours, elapsed_minutes, elapsed_seconds,
					      eta_hours, eta_minutes, eta_seconds, chunkmbs);
			else
				print_verbose("\nPass %d -- Elapsed Time: %02d:%02d:%02d. Compress Speed: %3.3fMB/s.\n",
					      pass, elapsed_hours, elapsed_minutes, elapsed_seconds, chunkmbs);
		}
		last.tv_sec = current.tv_sec;
		last.tv_usec = current.tv_usec;
		rzip_chunk(st, fd_in, fd_out, offset, pct_base, pct_multiple);
		/* st->chunk_size may be shrunk in rzip_chunk */
		last_chunk = st->chunk_size;
		len -= st->chunk_size;
	}

	close_streamout_threads();

	md5_finish_ctx(&control.ctx, md5_resblock);
	if (HASH_CHECK || MAX_VERBOSE) {
		print_output("MD5: ");
		for (j = 0; j < MD5_DIGEST_SIZE; j++)
			print_output("%02x", md5_resblock[j] & 0xFF);
		print_output("\n");
	}
	if (unlikely(write(control.fd_out, md5_resblock, MD5_DIGEST_SIZE) != MD5_DIGEST_SIZE))
		fatal("Failed to write md5 in rzip_fd\n");

	gettimeofday(&current, NULL);
	if (STDIN)
		s.st_size = control.st_size;
	chunkmbs = (s.st_size / 1024 / 1024) / ((double)(current.tv_sec - start.tv_sec) ? : 1);

	fstat(fd_out, &s2);

	print_maxverbose("matches=%u match_bytes=%u\n",
			 (unsigned int)st->stats.matches, (unsigned int)st->stats.match_bytes);
	print_maxverbose("literals=%u literal_bytes=%u\n",
			 (unsigned int)st->stats.literals, (unsigned int)st->stats.literal_bytes);
	print_maxverbose("true_tag_positives=%u false_tag_positives=%u\n",
			 (unsigned int)st->stats.tag_hits, (unsigned int)st->stats.tag_misses);
	print_maxverbose("inserts=%u match %.3f\n",
			 (unsigned int)st->stats.inserts,
			 (1.0 + st->stats.match_bytes) / st->stats.literal_bytes);

	if (!STDIN)
		print_progress("%s - ", control.infile);
	print_progress("Compression Ratio: %.3f. Average Compression Speed: %6.3fMB/s.\n",
		       1.0 * s.st_size / s2.st_size, chunkmbs);

	free(st);
}
|