Import libzpaq files.

This commit is contained in:
Con Kolivas 2012-03-15 22:37:46 +11:00
parent 88e2f80736
commit 2f3f01958d
3 changed files with 4359 additions and 0 deletions

737
libzpaq501/libzpaq.3.pod Normal file
View file

@ -0,0 +1,737 @@
# Documentation for libzpaq
#
# Copyright (C) 2012, Dell Inc. Written by Matt Mahoney.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so without restriction.
# This Software is provided "as is" without warranty.
#
# To create man page: pod2man libzpaq.3.pod > libzpaq.3
# To create HTML documentation: pod2html libzpaq.3.pod > libzpaq.html
=pod
=head1 NAME
libzpaq - ZPAQ compression API
=head1 SYNOPSIS
#include "libzpaq.h"
namespace libzpaq {
extern void error(const char* msg);
class Reader {
public:
virtual int get() = 0;
virtual int read(char* buf, int n); // optional
virtual ~Reader() {}
};
class Writer {
public:
virtual void put(int c) = 0;
virtual void write(const char* buf, int n); // optional
virtual ~Writer() {}
};
class SHA1 {
public:
SHA1();
void put(int c);
double size() const;
uint64_t usize() const;
const char* result();
};
class Compressor {
public:
Compressor();
void setOutput(Writer* out);
void writeTag();
void startBlock(int level);
void startBlock(const char* hcomp);
void startSegment(const char* filename = 0,
const char* comment = 0);
void setInput(Reader* i);
void postProcess(const char* pcomp = 0, int length = 0);
bool compress(int n = -1);
void endSegment(const char* sha1string = 0);
void endBlock();
};
class Decompresser {
public:
Decompresser();
void setInput(Reader* in);
bool findBlock(double* memptr = 0);
void hcomp(Writer* out);
bool findFilename(Writer* = 0);
void readComment(Writer* = 0);
void setOutput(Writer* out);
void setSHA1(SHA1* sha1ptr);
bool decompress(int n = -1);
bool pcomp(Writer* out);
void readSegmentEnd(char* sha1string = 0);
};
void compress(Reader* in, Writer* out, int level);
void decompress(Reader* in, Writer* out);
}
=head1 DESCRIPTION
I<libzpaq> is a C++ API for compressing or decompressing
files or objects in memory conforming to the ZPAQ level 1 and 2 standards
(see I<availability>). This document describes version 5.00
of the software. The software may be used without
restriction under a modified MIT license.
ZPAQ provides a high level of data compression in a streaming
(single pass) self-describing format that supports single or multiple
named objects (such as archives) with optional integrity checking.
The library provides 3 default compression levels but supports
custom algorithms. The performance of the default levels is
shown in the table below for the 14 file Calgary corpus as
a tar file. Compression and decompression times are in seconds
on a 2 GHz T3200 running on one of two cores. Memory required
to compress or decompress is in MB. Some popular formats
are shown for comparison.
Program Format Size Time (C, D) Memory
----------- ------ --------- ----------- ------
Uncompressed .tar 3,152,896
compress .tar.Z 1,319,521 1.6 0.2 .1 MB
gzip -9 .tar.gz 1,022,810 0.7 0.1 .1 MB
bzip2 -9 .tar.bz2 860,097 0.6 0.4 5 MB
7zip .tar.7z 824,573 1.5 0.1 195 MB
zpaq 1 (fast) .tar.zpaq 806,959 2 2 38 MB
zpaq 2 (mid) .tar.zpaq 699,191 8 8 112 MB
zpaq 3 (max) .tar.zpaq 644,190 20 20 246 MB
A ZPAQ stream consists of one or more blocks, possibly mixed with
other data, that can be decompressed independently in any order.
Each block consists of one or more segments that must be decompressed
in order from the beginning of the block. Each block header contains
a description of the decompression algorithm. Each segment consists
of an optional filename string, an optional comment string,
self delimiting compressed data, and an optional SHA-1 checksum.
If ZPAQ blocks are mixed with other data, they must be
preceded by an identifying 13 byte tag which does not otherwise
appear in that data.
ZPAQ compression is based on the PAQ context mixing model.
An array of components predict the probability of the next bit
of input, either independently or depending on the predictions
of earlier components. The final prediction is arithmetic coded.
Each component inputs a context computed from earlier input
by a program written in ZPAQL byte code which runs on a virtual
machine. Both the component array description and the ZPAQL
code are encoded in a string called HCOMP in each block header.
Data can also be stored uncompressed.
A block may optionally specify a post-processor, a program
(also in ZPAQL) which takes the decoded data as input and
outputs the decompressed output. This program, if present,
is encoded as a string called PCOMP which is compressed
in the first segment prior to the compressed data. The first
decoded byte from the first segment is a flag indicating
whether a PCOMP string is present. The user is responsible
for correctly pre-processing the data so that post-processing
restores the original data.
=head2 API Organization
The I<libzpaq> API consists of 2 files.
=over
=item libzpaq.h
Header file to include in your application.
=item libzpaq.cpp
Source code file to link to your application.
=back
An application would have the line C<#include "libzpaq.h"> and
link to libzpaq.cpp.
The API provides two classes, C<Compressor> and C<Decompresser>
which write or read respectively each of the syntactic elements
of a ZPAQ stream. The two functions C<compress()> and
C<decompress()> provide simple interfaces for the most common
uses. In either case, the user must create classes derived
from the abstract base classes C<Reader> and C<Writer> and
define methods C<get()> and C<put()> which the code
will use to read and write bytes. The user must also define
a callback error handler.
By default, libzpaq(3) uses just-in-time (JIT) acceleration
by translating ZPAQL code to x86-32 or x86-64 internally
and executing it. This feature can be disabled by compiling
with -DNOJIT. If enabled, it requires an x86 processor
capable of executing SSE2 instructions. SSE2 is supported
by most Intel processors since 2001 and AMD since 2003.
Run time checks (assertions) can be enabled with -DDEBUG
for debugging purposes.
All of the API code is contained in the namespace C<libzpaq>.
=head2 Callback Functions
The following three functions must be defined by the user.
=over
=item C<extern void libzpaq::error(const char* msg);>
This function must be defined by the user to handle errors
from libzpaq. The library will call the function with
an English language message passed to C<msg>. Errors may
result from bad input during decompression, out of memory,
or illegal arguments or calling sequences to libzpaq
functions. Errors should be considered unrecoverable.
=item C<int libzpaq::Reader::get() = 0;>
The user must create a class derived from Reader with an
implementation for C<get()> that reads one byte of input
and returns its value in the range 0...255, or returns
EOF (-1) at end of input. Objects of the derived type
would then be passed to functions that require a C<Reader>.
=item C<void libzpaq::Writer::put(int c) = 0;>
The user must create a class derived from Writer with
an implementation of C<put()> which is expected to take
a byte value C<c> in the range 0...255 and write it to
output. Objects of the derived type
would then be passed to functions that require a C<Writer>.
=back
The following two functions are optional. Defining them
can improve performance slightly.
=over
=item C<virtual int read(char* buf, int n);>
If defined, this function should input up to C<n> bytes into
the array C<buf> and return the number actually read, in
the range 0..n. A return value of 0 indicates end of input.
If C<read()> is not defined, then the default implementation
will call C<get()> n times.
=item C<virtual void write(const char* buf, int n);>
If defined, this function should output the elements C<buf[0]>
through C<buf[n-1]> in order. If not defined, then the default
implementation will call C<put()> n times.
=back
=head2 Simple Compression
In the remainder of this document, all classes and
functions are assumed to be in namespace C<libzpaq>.
=over
=item C<void compress(Reader* in, Writer* out, int mode);>
C<compress()> compresses from C<in> to C<out> until C<get()>
returns EOF. It writes a single segment in a single block
with empty filename, comment, and checksum fields. C<mode>
must be 1, 2, or 3, to select models I<fast>, I<mid>, or
I<max> respectively. Higher modes compress smaller but
take longer to compress and subsequently decompress.
=item C<void decompress(Reader* in, Writer* out);>
C<decompress()> decompresses any valid ZPAQ stream from
C<in> to C<out> until C<get()> returns EOF. Any
non-ZPAQ data in the input is ignored. Any ZPAQ blocks
following non-ZPAQ must be preceded by a marker tag
to be recognized. Each block is decoded according to the
instructions in the block header. The contents of the
filename, comment, and checksum fields are ignored.
Data with bad checksums will be decoded anyway. If there
is more than one segment, then all of the output
data will be concatenated.
=back
=head2 class SHA1
The SHA1 class is used to compute SHA-1 checksums for compression
and verify them for decompression. It is believed to be
computationally infeasible to find two different strings
with the same hash value. Its member functions
are as follows:
=over
=item C<SHA1();>
The constructor creates a new SHA1 object representing the
hash of an empty string.
=item C<void put(int c);>
Appends one byte c (0...255) to the string whose hash is represented.
=item C<double size() const;>
Returns the length (so far) of the string whose hash is represented.
The largest possible value returned is
2^61 - 1 = 2305843009213693951.0, but values larger than 2^53 =
9007199254740992.0
will not be exact on systems using IEEE 64 bit floating point
representation of type C<double>. The initial value is 0.0.
=item C<uint64_t usize() const;>
Returns the length (so far) as a 64 bit unsigned integer.
=item C<const char* result();>
Computes the 20 byte SHA-1 hash and resets the string back
to a size of 0.0. The returned pointer points to an array
inside the SHA1 object whose
contents remain unchanged until the next call to C<result()>.
=back
=head2 class Compressor
The C<Compressor> class has member functions to write
each of the syntactic elements of a ZPAQ stream and to specify
their values. It will compress using either built-in or
user supplied models.
=over
=item C<Compressor();>
The constructor creates a Compressor object. No input source,
output destination, or compression model is specified.
=item C<void setOutput(Writer* out);>
Specifies a destination for output. Must be specified before
calling any function that writes data.
=item C<void writeTag();>
Writes a 13 byte marker tag which can be used to identify
the start of a block following non-ZPAQ data.
=item C<void startBlock(int level);>
Writes a block header and specifies a compression model.
If linked with F<libzpaqo.cpp>, then C<level> must be 1, 2, or 3
to specify I<fast>, I<mid>, or I<max> respectively. Higher numbers
compress smaller but more slowly. These models are compatible
with both the ZPAQ level 1 and 2 standards.
=item C<void startBlock(const char* hcomp);>
Writes a block header and specifies the HCOMP portion of the
compression model. The first two bytes of the string should
encode the length of the rest of the string as a 16 bit unsigned
number with the least significant byte first. The meaning of the
rest of the string is defined in the ZPAQ level 2 standard.
If the number of components (C<hcomp[8]>) is 0, then the block
is saved in ZPAQ level 2 format, which cannot be read by
older ZPAQ level 1 decoders. Otherwise the block is saved in
ZPAQ level 1 format, which is compatible with all decoders.
=item C<void startSegment(const char* filename = 0, const char* comment = 0);>
Writes a segment header. C<filename> and
C<comment> are NUL terminated strings. If specified, then their
values are stored. Normally, C<filename> would be a file name
when compressing to an archive or omitted otherwise. If a file
is split among segments, then by convention only the first segment
is named. C<comment> is normally the uncompressed size as a decimal
number which is displayed when listing the contents of an archive.
Omitting it does not affect decompression.
=item C<void postProcess(const char* pcomp = 0, int length = 0);>
Specifies the optional PCOMP string used for post-processing.
It must be called from within the first segment
of each block prior to compressing any data, but not from within
any other segment.
If C<pcomp> is 0 or no argument is passed, then the decompresser
will not post-process the data. The effect is to compress a
0 byte to indicate to the decompresser that no PCOMP string
is present.
If C<pcomp> is not 0, then I<length> bytes of the string I<pcomp>
are passed. If I<length> is 0 or omitted, then
the first two bytes must encode
the length of the rest of the string as a 16 bit unsigned number
with the least significant byte first. The format of the remainder
of the string is described in the ZPAQ level 2 standard.
The effect is to compress a 1 byte
to indicate the presence of PCOMP, followed by the two length
bytes and the string as passed. For example, either
C<postProcess("\x02\x00\x05\x08")> or C<postProcess("\x05\x08", 2)>
would compress the 5 bytes 1, 2, 0, 5, 8.
The user is responsible for pre-processing the input
prior to compression so that PCOMP restores the original data.
=item C<void setInput(Reader* in);>
Specifies the input source for compression. It must be set
prior to the first call to C<compress()>.
=item C<bool compress(int n = -1);>
Compress n bytes of data, or until EOF is input, whichever comes
first. If n < 0 or omitted, then compress until EOF.
Returns true if there is more input available, or false if EOF
was read.
=item C<void endSegment(const char* sha1string = 0);>
Stop compressing and write the end of a segment. If
C<sha1string> is specified, it should be a 20 byte string
as returned by C<SHA1::result()> on the input data for
this segment I<before> pre-processing.
=item C<void endBlock();>
Finish writing the current block.
=back
In order to create a valid ZPAQ stream, the components must
be written in the following order:
for each block do {
if any non-ZPAQ data then {
write non-ZPAQ data
writeTag()
}
startBlock()
for each segment do {
startSegment()
if first segment in block then {
postProcess()
}
while (compress(n)) ;
endSegment()
}
endBlock()
}
=head2 class Decompresser
The class Decompresser has member functions to read each of the
syntactic elements of a ZPAQ stream.
=over
=item C<Decompresser()>
The constructor creates a Decompresser object. No input source or
output destination is specified.
=item C<void setInput(Reader* in);>
Specifies where the ZPAQ stream will be read from. Must be called
before any function that reads the stream.
=item C<bool findBlock(double* memptr = 0);>
Scan the input to find the start of the next block. If a block
does not start immediately, then the block must be preceded by
a marker tag (written with C<Compressor::writeTag()>) or it will
not be found. If C<memptr> is not 0, then write the approximate
memory requirement (in bytes) to decompress to C<*memptr>. The
memory will be allocated by the first call to C<decompress()>.
It returns true if a block is found, or false if it reads to EOF
without finding a block.
=item C<void hcomp(Writer* out);>
Write the HCOMP string of the current block to C<out>.
It will be in a format suitable
for passing to C<Compressor::startBlock()>. The first 2 bytes will
encode the length of the rest of the string as a 16 bit unsigned
integer with the least significant byte first. The format of the
remainder of the string is described in the ZPAQ level 1
specification.
=item C<bool findFilename(Writer* out = 0);>
Find the start of the next segment. If another segment is found
within the current block then return true. If the end of the block
is found first, then return false. If a segment is found, the
filename field is not empty, and C<out>
is not 0, then write the filename (without a terminating NUL byte)
to C<out>.
=item C<void readComment(Writer* out = 0);>
Read or skip past the comment field following the filename field
in the segment header. If C<out> is not 0 and the comment field is
not empty, then write the comment
(without a terminating NUL byte) to C<out>.
=item C<void setOutput(Writer* out);>
Specify the destination for decompression. It must be set before
any data can be decompressed.
=item C<void setSHA1(SHA1* sha1ptr);>
Specify the address of a SHA1 object for computing the checksum
of the decompressed data (after post-processing). As each byte C<c>
is output, it is also passed to C<sha1ptr-E<gt>put(c)>. In order to
compute the correct checksum, the SHA1 object should be in its
initial state, either newly created, or by calling C<SHA1::result()>,
before the first call to C<decompress()>. When the end of the segment
is reached, the value returned by C<sha1ptr-E<gt>result()> should match
the stored checksum, if any.
=item C<bool decompress(int n = -1);>
Decode n bytes or until the end of segment, whichever comes
first. Return false if the end of segment is reached first. If
n < 0 or not specified, then decompress to the end of segment
and return false. C<n> is the number of bytes prior to post-processing.
If the data is post-processed, then the size of the output may
be different.
=item C<bool pcomp(Writer* out);>
Write the PCOMP string, if any, for the current block to C<out>.
If there is no PCOMP string (no post-processor) then return false.
Otherwise write the string to C<out> in a format suitable for
passing to C<Compressor::postProcess()> and return true. If written,
then the first 2 bytes will encode the length of the rest of the
string as a 16 bit unsigned integer with the least significant
byte first. The format of the rest of the string is described in
the ZPAQ level 1 standard.
C<pcomp()> is only valid after the first call to C<decompress()>
in the current block. To read the PCOMP string without decompressing any
data, call C<decompress(0)> first. It is not necessary to
call C<setOutput()> in this case.
=item C<void readSegmentEnd(char* sha1string = 0);>
Skip any compressed data in the current segment that has not yet
been decompressed and advance to the end of the segment.
Then if C<sha1string> is not 0 then write into
the 21 byte array that it points to. If a checksum is present,
then write a 1 into C<sha1string[0]> and write the stored checksum
in C<sha1string[1...20]>. Otherwise write a 0 in C<sha1string[0]>.
Note that it is not permitted to call decompress() if any compressed
data has been skipped in any earlier segments in the same block.
=back
A valid sequence of calls is as follows:
while (findBlock()) {
while (findFilename()) {
readComment();
if first segment in block then { (optional)
decompress(0)
pcomp()
}
while (decompress(n)) ; (optional)
readSegmentEnd();
}
}
=head1 EXAMPLES
The following program F<listzpaq.cpp>
lists the contents of a ZPAQ archive
read from standard input.
#include <stdio.h>
#include <stdlib.h>
#include "libzpaq.h"
// Implement Reader and Writer interfaces for file I/O
class File: public libzpaq::Reader, public libzpaq::Writer {
FILE* f;
public:
File(FILE* f_): f(f_) {}
int get() {return getc(f);}
void put(int c) {putc(c, f);}
int read(char* buf, int n) {return fread(buf, 1, n, f);}
void write(const char* buf, int n) {fwrite(buf, 1, n, f);}
};
// Implement error handler
namespace libzpaq {
void error(const char* msg) {
fprintf(stderr, "Error: %s\n", msg);
exit(1);
}
}
// List the contents of an archive. For each block, show
// the memory required to decompress. For each segment,
// show the filename and comment.
void list(FILE* input, FILE* output) {
libzpaq::Decompresser d;
File in(input), out(output);
double memory;
d.setInput(&in);
for (int block=1; d.findBlock(&memory); ++block) {
printf("Block %d needs %1.0f MB\n", block, memory/1e6);
while (d.findFilename(&out)) { // print filename
printf("\t");
d.readComment(&out); // print comment
printf("\n");
d.readSegmentEnd(); // skip compressed data
}
}
}
int main() {
list(stdin, stdout);
return 0;
}
The program could be compiled as follows:
g++ listzpaq.cpp libzpaq.cpp
The following code compresses a list of files into one block
written to stdout. Each file is compressed to a separate
segment. For each segment, the filename, comment, and SHA-1
checksum are stored. The comment, as conventional, is the
file size as a decimal string.
// Compress one file to one segment
void compress_file(libzpaq::Compressor& c,
const char* filename,
bool first_segment) {
// Open input file
FILE* f;
f=fopen(filename, "rb");
if (!f) return;
// Compute SHA-1 checksum and file size
libzpaq::SHA1 sha1;
int ch;
while ((ch=getc(f))!=EOF)
sha1.put(ch);
// Write file size as a comment.
// The size can have at most 19 digits.
char comment[20];
sprintf(comment, "%1.0f", sha1.size());
// Compress segment
rewind(f);
File in(f);
c.startSegment(filename, comment);
if (first_segment)
c.postProcess();
c.setInput(&in);
c.compress();
c.endSegment(sha1.result());
// Close input file
fclose(f);
}
// Compress a list of argc files in argv[0...argc-1] into one
// ZPAQ block to stdout at level 2.
void compress_list(int argc, char** argv) {
libzpaq::Compressor c;
File out(stdout);
c.setOutput(&out);
c.startBlock(2);
for (int i=0; i<argc; ++i)
compress_file(c, argv[i], i==0);
c.endBlock();
}
The following function decompresses from stdin to stdout.
Filenames and comments are ignored, but checksums are verified
if present.
void decompress() {
libzpaq::Decompresser d;
File in(stdin), out(stdout);
d.setInput(&in);
while (d.findBlock()) {
while (d.findFilename()) {
d.readComment();
libzpaq::SHA1 sha1;
d.setSHA1(&sha1);
d.setOutput(&out);
d.decompress();
char sha1string[21];
d.readSegmentEnd(sha1string);
const char* sha1result = sha1.result();
if (sha1string[0]==1
&& memcmp(sha1string+1, sha1result, 20))
libzpaq::error("checksum verify error");
}
}
}
C<Compressor::compress()> and C<Decompresser::decompress()> can
be passed an argument n to display progress every n bytes,
for example:
for (int i=1; d.decompress(1000000); ++i)
fprintf(stderr, "Decompressed %d MB\n", i);
To compress or decompress to and from objects in memory, derive
appropriate classes from C<Reader> and C<Writer>. For example, it is
possible to compress or decompress to a C<std::string> using
the following class.
struct String: public libzpaq::Writer {
std::string s;
void put(int c) {s+=char(c);}
};
This class is also useful for reading the filename and comment
fields during decompression as follows:
String filename, comment;
while (d.findFilename(&filename)) {
d.readComment(&comment);
// ...
=head1 AVAILABILITY
I<libzpaq>, I<zpaq>, and the ZPAQ level 1 and 2 specifications are
available from L<http://mattmahoney.net/zpaq/>.
=head1 SEE ALSO
C<zpaq(1)>
C<sha1(1SSL)>
=cut

3181
libzpaq501/libzpaq.cpp Normal file

File diff suppressed because it is too large Load diff

441
libzpaq501/libzpaq.h Normal file
View file

@ -0,0 +1,441 @@
/* libzpaq.h - LIBZPAQ Version 5.00.
Copyright (C) 2011, Dell Inc. Written by Matt Mahoney.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so without restriction.
This Software is provided "as is" without warranty.
LIBZPAQ is a C++ library for compression and decompression of data
conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/
By default, LIBZPAQ uses JIT (just in time) acceleration. This only
works on x86-32 and x86-64 processors that support the SSE2 instruction
set. To disable JIT, compile with -DNOJIT. To enable run time checks,
compile with -DDEBUG. Both options will decrease speed.
The decompression code, when compiled with -DDEBUG and -DNOJIT,
comprises the reference decoder for the ZPAQ level 2 standard.
*/
#ifndef LIBZPAQ_H
#define LIBZPAQ_H
#ifndef DEBUG
#define NDEBUG 1
#endif
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
namespace libzpaq {
// 1, 2, 4, 8 byte unsigned integers
// Short aliases for the fixed-width types from <stdint.h>; the rest of this
// header uses these names for hash state, tables and machine registers.
typedef uint8_t U8;
typedef uint16_t U16;
typedef uint32_t U32;
typedef uint64_t U64;
// Standard library prototypes redirected to libzpaq.cpp
// These are declared inside namespace libzpaq, so unqualified calloc()/free()
// calls made by code in this namespace (e.g. Array<T>::resize) pick up these
// declarations.
void* calloc(size_t, size_t);
void free(void*);
// Callback for error handling
// Must be defined by the application. It receives an English message in msg;
// the library documentation states that errors should be treated as
// unrecoverable.
extern void error(const char* msg);
// Virtual base classes for input and output
// get() and put() must be overridden to read or write 1 byte.
// read() and write() may be overridden to read or write n bytes more
// efficiently than calling get() or put() n times.
// Abstract input source. Applications derive from Reader, implement get(),
// and pass the derived object wherever the library needs to read bytes.
class Reader {
public:
virtual int get() = 0; // should return 0..255, or -1 at EOF
// Optional bulk read. Per the library documentation an override should store
// up to n bytes in buf and return the count actually read (0 means end of
// input); the default implementation is described as calling get() n times.
virtual int read(char* buf, int n); // read to buf[n], return no. read
virtual ~Reader() {}
};
// Abstract output sink. Applications derive from Writer, implement put(),
// and pass the derived object wherever the library needs to write bytes.
class Writer {
public:
virtual void put(int c) = 0; // should output low 8 bits of c
// Optional bulk write. Per the library documentation an override should
// output buf[0] through buf[n-1] in order; the default implementation is
// described as calling put() n times.
virtual void write(const char* buf, int n); // write buf[n]
virtual ~Writer() {}
};
// Read 16 bit little-endian number
// NOTE(review): the definition is not in this header; presumably p must point
// to at least 2 readable bytes, low byte first -- confirm in libzpaq.cpp.
int toU16(const char* p);
// Array<T>: a zero-filled buffer whose element 0 lies on a 64 byte boundary.
// No constructors are called for the elements, and the object can be neither
// copied nor assigned.
//   Array<T> a(n, ex=0);  - allocate n<<ex zeroed elements
//   a[i]                  - element i
//   a(i)                  - element i mod size(); size() must be a power of 2
//   a.size()              - number of elements
template <typename T>
class Array {
  T *data;     // address of element 0, on a 64 byte boundary
  size_t n;    // number of elements visible to the user
  int offset;  // bytes from the start of the real allocation up to data

  // Declared private to forbid assignment and copying.
  void operator=(const Array&);
  Array(const Array&);

public:
  // Start empty, then size to sz<<ex zeroed elements.
  Array(size_t sz=0, int ex=0): data(0), n(0), offset(0) {
    resize(sz, ex);
  }

  // Drop the current contents and hold sz<<ex zeroed elements instead.
  void resize(size_t sz, int ex=0);

  // Release the storage.
  ~Array() {
    resize(0);
  }

  // Number of elements.
  size_t size() const {
    return n;
  }

  // Number of elements, narrowed to int.
  int isize() const {
    return static_cast<int>(n);
  }

  // Element i; the bounds are verified only when assertions are enabled.
  T& operator[](size_t i) {
    assert(n>0 && i<n);
    return data[i];
  }

  // Element at i wrapped to the array size (requires a power of 2 size,
  // which is verified only when assertions are enabled).
  T& operator()(size_t i) {
    const size_t mask=n-1;
    assert(n>0 && (n&mask)==0);
    return data[i&mask];
  }
};
// Change size to sz<<ex elements of 0.
//
// Any previous contents are discarded (resize never preserves data).
// resize(0) simply releases the storage.  On failure error() is called with
// "Array too big" or "Out of memory".
//
// The object is put into the empty state (data=0, n=0, offset=0) before any
// check that can fail, and n is published only after the allocation has
// succeeded.  This matters when the user's error() does not return (for
// example because it throws): the destructor calls resize(0), which must
// not free a stale pointer.
template<typename T>
void Array<T>::resize(size_t sz, int ex) {
  assert(size_t(-1)>0);  // the overflow tests below rely on size_t being unsigned

  // Scale the request by 2^ex, refusing to wrap around.
  while (ex>0) {
    if (sz>sz*2) {
      error("Array too big");
      return;  // only reached if error() returns; leave the array untouched
    }
    sz*=2, --ex;
  }

  // Release the old block and become empty.
  if (n>0) {
    assert(offset>0 && offset<=64);
    assert((char*)data-offset);
    free((char*)data-offset);
  }
  data=0;
  n=0;
  offset=0;
  if (sz==0) return;

  // 128 spare bytes leave room to move the start up to a 64 byte boundary.
  const size_t nb=128+sz*sizeof(T);  // test for overflow
  if (nb<=128 || (nb-128)/sizeof(T)!=sz) {
    error("Array too big");
    return;  // only reached if error() returns; stay empty
  }
  char* const raw=(char*)calloc(nb, 1);
  if (!raw) {
    error("Out of memory");
    return;  // only reached if error() returns; stay empty
  }

  // Distance to the next 64 byte boundary, in 1..64.  The address is
  // inspected through uintptr_t rather than by subtracting a null pointer,
  // which is undefined behaviour.
  offset=64-(int)((uintptr_t)raw&63);
  assert(offset>0 && offset<=64);
  data=(T*)(raw+offset);
  n=sz;
}
//////////////////////////// SHA1 ////////////////////////////
// For computing SHA-1 checksums
class SHA1 {
public:
void put(int c) { // hash 1 byte
U32& r=w[len0>>5&15];
r=(r<<8)|(c&255);
if (!(len0+=8)) ++len1;
if ((len0&511)==0) process();
}
double size() const {return len0/8+len1*536870912.0;} // size in bytes
uint64_t usize() const {return len0/8+(U64(len1)<<29);} // size in bytes
const char* result(); // get hash and reset
SHA1() {init();}
private:
void init(); // reset, but don't clear hbuf
U32 len0, len1; // length in bits (low, high)
U32 h[5]; // hash state
U32 w[80]; // input buffer
char hbuf[20]; // result
void process(); // hash 1 block
};
//////////////////////////// ZPAQL ///////////////////////////
// Symbolic constants, instruction size, and names
// CompType enumerates the component kinds, numbered from NONE=0 upward in
// the order listed.
typedef enum {NONE,CONS,CM,ICM,MATCH,AVG,MIX2,MIX,ISSE,SSE} CompType;
// Defined elsewhere. NOTE(review): presumably indexed by component type and
// giving that component's size in the header -- confirm in libzpaq.cpp.
extern const int compsize[256];
// ZPAQL virtual machine: holds a block header and runs its program, either
// COMP+HCOMP or PCOMP.
class ZPAQL {
public:
  ZPAQL();
  ~ZPAQL();
  void clear();                       // free memory, erase program, reset machine state
  void inith();                       // set up to run as HCOMP
  void initp();                       // set up to run as PCOMP
  double memory();                    // memory requirement in bytes
  void run(U32 input);                // execute with input
  int read(Reader* in2);              // read header
  bool write(Writer* out2, bool pp);  // write PCOMP header if pp, else HCOMP header
  int step(U32 input, int mode);      // trace execution (defined externally)

  Writer* output;  // destination for the OUT instruction, or 0 to suppress
  SHA1* sha1;      // checksum computer

  // Element i of h, with the index wrapped to the array size.
  U32 H(int i) {
    return h(i);
  }

  void flush();    // write outbuf[0..bufptr-1] to output and sha1

  // Output byte c (0..255), or -1 at EOS.  The byte is buffered; the buffer
  // is flushed when it becomes full, and immediately when c is negative.
  void outc(int c) {
    if (c<0) {
      flush();
      return;
    }
    outbuf[bufptr]=c;
    ++bufptr;
    if (bufptr==outbuf.isize()) flush();
  }

  // ZPAQ1 block header
  Array<U8> header;   // hsize[2] hh hm ph pm n COMP (guard) HCOMP (guard)
  int cend;           // COMP in header[7...cend-1]
  int hbegin, hend;   // HCOMP/PCOMP in header[hbegin...hend-1]

private:
  // Machine state for executing HCOMP
  Array<U8> m;         // memory array M
  Array<U32> h;        // hash array H
  Array<U32> r;        // 256 element register array
  Array<char> outbuf;  // output buffer
  int bufptr;          // number of bytes in outbuf
  U32 a, b, c, d;      // machine registers
  int f;               // condition flag
  int pc;              // program counter
  int rcode_size;      // length of rcode
  U8* rcode;           // JIT code for run()

  // Support code
  int assemble();                   // put JIT code in rcode
  void init(int hbits, int mbits);  // initialize H and M sizes
  int execute();                    // execute 1 instruction, return 0 after HALT, else 1
  void run0(U32 input);             // default implementation of run()

  // a = a / x, where division by zero yields 0.
  void div(U32 x) {
    a = x ? a/x : 0;
  }

  // a = a mod x, where a zero modulus yields 0.
  void mod(U32 x) {
    a = x ? a%x : 0;
  }

  // Exchange a with x using XOR; the 8 bit overload exchanges x with the
  // low 8 bits of a only.
  void swap(U32& x) {
    a^=x;
    x^=a;
    a^=x;
  }
  void swap(U8& x) {
    a^=x;
    x^=a;
    a^=x;
  }

  void err();  // exit with run time error
};
///////////////////////// Component //////////////////////////
// A Component is a context model, indirect context model, match model,
// fixed weight mixer, adaptive 2 input mixer without or with current
// partial byte as context, adaptive m input mixer (without or with),
// or SSE (without or with).
// One struct serves every component kind; which of the fields below are
// meaningful presumably depends on the kind (see the per-field notes).
struct Component {
size_t limit; // max count for cm
size_t cxt; // saved context
size_t a, b, c; // multi-purpose variables
Array<U32> cm; // cm[cxt] -> p in bits 31..10, n in 9..0; MATCH index
Array<U8> ht; // ICM/ISSE hash table[0..size1][0..15] and MATCH buf
Array<U16> a16; // MIX weights
void init(); // initialize to all 0
// Construction delegates to init().
Component() {init();}
};
////////////////////////// StateTable ////////////////////////
// Next state table generator
class StateTable {
enum {N=64}; // sizes of b, t
int num_states(int n0, int n1); // compute t[n0][n1][1]
void discount(int& n0); // set new value of n0 after 1 or n1 after 0
void next_state(int& n0, int& n1, int y); // new (n0,n1) after bit y
public:
U8 ns[1024]; // state*4 -> next state if 0, if 1, n0, n1
int next(int state, int y) { // next state for bit y
assert(state>=0 && state<256);
assert(y>=0 && y<4);
return ns[state*4+y];
}
int cminit(int state) { // initial probability of 1 * 2^23
assert(state>=0 && state<256);
return ((ns[state*4+3]*2+1)<<22)/(ns[state*4+2]+ns[state*4+3]+1);
}
StateTable();
};
///////////////////////// Predictor //////////////////////////
// A predictor guesses the next bit
class Predictor {
public:
Predictor(ZPAQL&);
~Predictor();
void init(); // build model
int predict(); // probability that next bit is a 1 (0..4095)
void update(int y); // train on bit y (0..1)
int stat(int); // Defined externally
bool isModeled() { // n>0 components?
assert(z.header.isize()>6);
return z.header[6]!=0;
}
private:
// Predictor state
int c8; // last 0...7 bits.
int hmap4; // c8 split into nibbles
int p[256]; // predictions
U32 h[256]; // unrolled copy of z.h
ZPAQL& z; // VM to compute context hashes, includes H, n
Component comp[256]; // the model, includes P
// Modeling support functions
int predict0(); // default
void update0(int y); // default
int dt2k[256]; // division table for match: dt2k[i] = 2^12/i
int dt[1024]; // division table for cm: dt[i] = 2^16/(i+1.5)
U16 squasht[4096]; // squash() lookup table
short stretcht[32768];// stretch() lookup table
StateTable st; // next, cminit functions
U8* pcode; // JIT code for predict() and update()
int pcode_size; // length of pcode
// reduce prediction error in cr.cm
void train(Component& cr, int y) {
assert(y==0 || y==1);
U32& pn=cr.cm(cr.cxt);
U32 count=pn&0x3ff;
int error=y*32767-(cr.cm(cr.cxt)>>17);
pn+=(error*dt[count]&-1024)+(count<cr.limit);
}
// x -> floor(32768/(1+exp(-x/64)))
int squash(int x) {
assert(x>=-2048 && x<=2047);
return squasht[x+2048];
}
// x -> round(64*log((x+0.5)/(32767.5-x))), approx inverse of squash
int stretch(int x) {
assert(x>=0 && x<=32767);
return stretcht[x];
}
// bound x to a 12 bit signed int
int clamp2k(int x) {
if (x<-2048) return -2048;
else if (x>2047) return 2047;
else return x;
}
// bound x to a 20 bit signed int
int clamp512k(int x) {
if (x<-(1<<19)) return -(1<<19);
else if (x>=(1<<19)) return (1<<19)-1;
else return x;
}
// Get cxt in ht, creating a new row if needed
size_t find(Array<U8>& ht, int sizebits, U32 cxt);
// Put JIT code in pcode
int assemble_p();
};
//////////////////////////// Decoder /////////////////////////
// Decoder decompresses using an arithmetic code
class Decoder {
public:
  Reader* in;            // source of compressed input (set via Decompresser::setInput)
  Decoder(ZPAQL& z);
  int decompress();      // return a byte or EOF
  int skip();            // skip to the end of the segment, return next byte
  void init();           // initialize at start of block

  // Forward a statistics query to the predictor.
  int stat(int x) { return pr.stat(x); }

private:
  U32 low;               // range, lower bound
  U32 high;              // range, upper bound
  U32 curr;              // last 4 bytes of archive
  Predictor pr;          // to get p
  enum { BUFSIZE = 1<<16 };

  // Input buffer of BUFSIZE bytes for unmodeled data. In that mode
  // buf[low..high-1] is the input and curr is what remains in the sub-block.
  Array<char> buf;

  int decode(int p);     // return decoded bit (0..1) with prob. p (0..65535)
  void loadbuf();        // read unmodeled data into buf to EOS
};
/////////////////////////// PostProcessor ////////////////////
class PostProcessor {
  int state;   // input parse state: 0=INIT, 1=PASS, 2..4=loading, 5=POST
  int hsize;   // header size
  int ph;      // size of H in z
  int pm;      // size of M in z

public:
  ZPAQL z;     // holds PCOMP

  // Start in the INIT state with all sizes zeroed.
  PostProcessor()
    : state(0), hsize(0), ph(0), pm(0) {}

  void init(int h, int m);   // ph, pm sizes of H and M
  int write(int c);          // input a byte, return state

  // Current parse state (see the state member for the values).
  int getState() const { return state; }

  // Both setters just hand the pointer to the embedded ZPAQL object.
  void setOutput(Writer* out) { z.output = out; }
  void setSHA1(SHA1* sha1ptr) { z.sha1 = sha1ptr; }
};
//////////////////////// Decompresser ////////////////////////
// For decompression and listing archive contents
class Decompresser {
public:
  // The decoder is bound to this object's own ZPAQL instance; parsing
  // starts out expecting a block, on the first segment.
  Decompresser()
    : z(), dec(z), pp(), state(BLOCK), decode_state(FIRSTSEG) {}

  // Input side
  void setInput(Reader* in) { dec.in = in; }
  bool findBlock(double* memptr = 0);
  void hcomp(Writer* out2) { z.write(out2, false); }   // write the HCOMP side of z
  bool findFilename(Writer* = 0);
  void readComment(Writer* = 0);

  // Output side: both are forwarded to the post-processor.
  void setOutput(Writer* out) { pp.setOutput(out); }
  void setSHA1(SHA1* sha1ptr) { pp.setSHA1(sha1ptr); }

  bool decompress(int n = -1);  // n bytes, -1=all, return true until done
  bool pcomp(Writer* out2) { return pp.z.write(out2, true); }  // write the PCOMP side
  void readSegmentEnd(char* sha1string = 0);

  // Forward a statistics query to the decoder.
  int stat(int x) { return dec.stat(x); }

private:
  ZPAQL z;
  Decoder dec;
  PostProcessor pp;
  enum {BLOCK, FILENAME, COMMENT, DATA, SEGEND} state;  // expected next
  enum {FIRSTSEG, SEG, SKIP} decode_state;              // which segment in block?
};
/////////////////////////// decompress() /////////////////////
// Free-function counterpart to the Decompresser class; defined out of line.
// NOTE(review): presumably decompresses everything read from `in` and writes
// the result to `out` -- confirm against the definition.
void decompress(Reader* in, Writer* out);
//////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////
// Code following this point is not a part of the ZPAQ level 2 standard.
//////////////////////////// Encoder /////////////////////////
// Encoder compresses using an arithmetic code
class Encoder {
public:
  // Starts with no output attached and the range set to [1, 0xFFFFFFFF].
  // The size argument is accepted but not used by this constructor.
  Encoder(ZPAQL& z, int size=0)
    : out(0), low(1), high(0xFFFFFFFF), pr(z) {}

  void init();
  void compress(int c);        // c is 0..255 or EOF

  // Forward a statistics query to the predictor.
  int stat(int x) { return pr.stat(x); }

  Writer* out;                 // destination

private:
  U32 low;                     // range, lower bound
  U32 high;                    // range, upper bound
  Predictor pr;                // to get p
  Array<char> buf;             // unmodeled input
  void encode(int y, int p);   // encode bit y (0..1) with prob. p (0..65535)
};
//////////////////////// Compressor //////////////////////////
class Compressor {
public:
  // The encoder is bound to this object's own ZPAQL instance; there is no
  // input yet and the state machine starts at INIT.
  Compressor()
    : enc(z), in(0), state(INIT) {}

  void setOutput(Writer* out) { enc.out = out; }
  void writeTag();

  // Two ways to open a block: a built-in level or a caller-supplied HCOMP.
  void startBlock(int level);             // level=1,2,3
  void startBlock(const char* hcomp);

  void startSegment(const char* filename = 0, const char* comment = 0);
  void setInput(Reader* i) { in = i; }
  void postProcess(const char* pcomp = 0, int len = 0);
  bool compress(int n = -1);              // n bytes, -1=all, return true until done
  void endSegment(const char* sha1string = 0);
  void endBlock();

  // Forward a statistics query to the encoder.
  int stat(int x) { return enc.stat(x); }

private:
  ZPAQL z;
  Encoder enc;
  Reader* in;
  enum {INIT, BLOCK1, SEG1, BLOCK2, SEG2} state;
};
/////////////////////////// compress() ///////////////////////
// Free-function counterpart to the Compressor class; defined out of line.
// NOTE(review): presumably compresses everything read from `in` to `out`,
// with `level` taking the same values as Compressor::startBlock(int)
// (1, 2 or 3) -- confirm against the definition.
void compress(Reader* in, Writer* out, int level);
} // namespace libzpaq
#endif // LIBZPAQ_H