diff --git a/libzpaq501/libzpaq.3.pod b/libzpaq501/libzpaq.3.pod new file mode 100644 index 0000000..5726755 --- /dev/null +++ b/libzpaq501/libzpaq.3.pod @@ -0,0 +1,737 @@ +# Documentation for libzpaq +# +# Copyright (C) 2012, Dell Inc. Written by Matt Mahoney. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so without restriction. +# This Software is provided "as is" without warranty. +# +# To create man page: pod2man libzpaq.3.pod > libzpaq.3 +# To create HTML documentation: pod2html libzpaq.3.pod > libzpaq.html + +=pod + +=head1 NAME + +libzpaq - ZPAQ compression API + +=head1 SYNOPSIS + + #include "libzpaq.h" + + namespace libzpaq { + + extern void error(const char* msg); + + class Reader { + public: + virtual int get() = 0; + virtual int read(char* buf, int n); // optional + virtual ~Reader() {} + }; + + class Writer { + public: + virtual void put(int c) = 0; + virtual void write(const char* buf, int n); // optional + virtual ~Writer() {} + }; + + class SHA1 { + public: + SHA1(); + void put(int c); + double size() const; + uint64_t usize() const; + const char* result(); + }; + + class Compressor { + public: + Compressor(); + void setOutput(Writer* out); + void writeTag(); + void startBlock(int level); + void startBlock(const char* hcomp); + void startSegment(const char* filename = 0, + const char* comment = 0); + void setInput(Reader* i); + void postProcess(const char* pcomp = 0, int length = 0); + bool compress(int n = -1); + void endSegment(const char* sha1string = 0); + void endBlock(); + }; + + class Decompresser { + public: + Decompresser(); + void setInput(Reader* in); + bool findBlock(double* 
memptr = 0); + void hcomp(Writer* out); + bool findFilename(Writer* = 0); + void readComment(Writer* = 0); + void setOutput(Writer* out); + void setSHA1(SHA1* sha1ptr); + bool decompress(int n = -1); + bool pcomp(Writer* out); + void readSegmentEnd(char* sha1string = 0); + }; + + void compress(Reader* in, Writer* out, int level); + + void decompress(Reader* in, Writer* out); + } + +=head1 DESCRIPTION + +I is a C++ API for compressing or decompressing +files or objects in memory conforming to the ZPAQ level 1 and 2 standards +(see I). This document describes version 5.00 +of the software. The software may be used without +restriction under a modified MIT license. + +ZPAQ provides a high level of data compression in a streaming +(single pass) self-describing format that supports single or multiple +named objects (such as archives) with optional integrity checking. + +The library provides 3 default compression levels but supports +custom algorithms. The performance of the default levels is +shown in the table below for the 14 file Calgary corpus as +a tar file. Compression and decompression times are in seconds +on a 2 GHz T3200 running on one of two cores. Memory required +to compress or decompress is in MB. Some popular formats +are shown for comparison. + + Program Format Size Time (C, D) Memory + ----------- ------ --------- ----------- ------ + Uncompressed .tar 3,152,896 + compress .tar.Z 1,319,521 1.6 0.2 .1 MB + gzip -9 .tar.gz 1,022,810 0.7 0.1 .1 MB + bzip2 -9 .tar.bz2 860,097 0.6 0.4 5 MB + 7zip .tar.7z 824,573 1.5 0.1 195 MB + zpaq 1 (fast) .tar.zpaq 806,959 2 2 38 MB + zpaq 2 (mid) .tar.zpaq 699,191 8 8 112 MB + zpaq 3 (max) .tar.zpaq 644,190 20 20 246 MB + +A ZPAQ stream consists of one or more blocks, possibly mixed with +other data, that can be decompressed independently in any order. +Each block consists of one or more segments that must be decompressed +in order from the beginning of the block. 
Each block header contains +a description of the decompression algorithm. Each segment consists +of an optional filename string, an optional comment string, +self delimiting compressed data, and an optional SHA-1 checksum. +If ZPAQ blocks are mixed with other data, they must be +preceded by an identifying 13 byte tag which does not otherwise +appear in that data. + +ZPAQ compression is based on the PAQ context mixing model. +An array of components predict the probability of the next bit +of input, either independently or depending on the predictions +of earlier components. The final prediction is arithmetic coded. +Each component inputs a context computed from earlier input +by a program written in ZPAQL byte code which runs on a virtual +machine. Both the component array description and the ZPAQL +code are encoded in a string called HCOMP in each block header. +Data can also be stored uncompressed. + +A block may optionally specify a post-processor, a program +(also in ZPAQL) which takes the decoded data as input and +outputs the decompressed output. This program, if present, +is encoded as a string called PCOMP which is compressed +in the first segment prior to the compressed data. The first +decoded byte from the first segment is a flag indicating +whether a PCOMP string is present. The user is responsible +for correctly pre-processing the data so that post-processing +restores the original data. + +=head2 API Organization + +The I API consists of 2 files. + +=over + +=item libzpaq.h + +Header file to include in your application. + +=item libzpaq.cpp + +Source code file to link to your application. + +=back + +An application would have the line C<#include "libzpaq.h"> and +link to libzpaq.cpp. +The API provides two classes, C and C +which write or read respectively each of the syntactic elements +of a ZPAQ stream. The two functions C and +C provide simple interfaces for the most common +uses. 
In either case, the user must create classes derived +from the abstract base classes C and C and +define methods C and C which the code +will use to read and write bytes. The user must also define +a callback error handler. + +By default, libzpaq(3) uses just-in-time (JIT) acceleration +by translating ZPAQL code to x86-32 or x86-64 internally +and executing it. This feature can be disabled by compiling +with -DNOJIT. If enabled, it requires an x86 processor +capable of executing SSE2 instructions. SSE2 is supported +by most Intel processors since 2001 and AMD since 2003. + +Run time checks (assertions) can be enabled with -DDEBUG +for debugging purposes. + +All of the API code is contained in the namespace C. + +=head2 Callback Functions + +The following three functions must be defined by the user. + +=over + +=item C + +This function must be defined by the user to handle errors +from libzpaq. The library will call the function with +an English language message passed to C. Errors may +result from bad input during decompression, out of memory, +or illegal arguments or calling sequences to libzpaq +functions. Errors should be considered unrecoverable. + +=item C + +The user must create a class derived from Reader with an +implementation for C that reads one byte of input +and returns its value in the range 0...255, or returns +EOF (-1) at end of input. Objects of the derived type +would then be passed to functions that require a C. + +=item C + +The user must create a class derived from Writer with +an implementation of C which is expected to take +a byte value C in the range 0...255 and write it to +output. Objects of the derived type +would then be passed to functions that require a C. + +=back + +The following two functions are optional. Defining them +can improve performance slightly. + +=over + +=item C + +If defined, this function should input up to C bytes into +the array C and return the number actually read, in +the range 0..n. 
A return value of 0 indicates end of input. +If C is not defined, then the default implementation +will call C n times. + +=item C + +If defined, this function should output the elements C +through C in order. If not defined, then the default +implementation will call C n times. + +=back + +=head2 Simple Compression + +In the remainder of this document, all classes and +functions are assumed to be in namespace C. + +=over + +=item C + +C compresses from C to C until C +returns EOF. It writes a single segment in a single block +with empty filename, comment, and checksum fields. C +must be 1, 2, or 3, to select models I, I, or +I respectively. Higher modes compress smaller but +take longer to compress and subsequently decompress. + +=item C + +C decompresses any valid ZPAQ stream from +C to C until C returns EOF. Any +non-ZPAQ data in the input is ignored. Any ZPAQ blocks +following non-ZPAQ must be preceded by a marker tag +to be recognized. Each block is decoded according to the +instructions in the block header. The contents of the +filename, comment, and checksum fields are ignored. +Data with bad checksums will be decoded anyway. If there +is more than one segment, then all of the output +data will be concatenated. + +=back + +=head2 class SHA1 + +The SHA1 class is used to compute SHA-1 checksums for compression +and verify them for decompression. It is believed to be +computationally infeasible to find two different strings +with the same hash value. Its member functions +are as follows: + +=over + +=item C + +The constructor creates a new SHA1 object representing the +hash of an empty string. + +=item C + +Appends one byte c (0...255) to the string whose hash is represented. + +=item C + +Returns the length (so far) of the string whose hash is represented. 
+The largest possible value returned is +2^61 - 1 = 2305843009213693951.0, but values larger than 2^53 = +9007199254740992.0 +will not be exact on systems using IEEE 64 bit floating point +representation of type C. The initial value is 0.0. + +=item C + +Returns the length (so far) as a 64 bit unsigned integer. + +=item C + +Computes the 20 byte SHA-1 hash and resets the string back +to a size of 0.0. The returned pointer points to an array +inside the SHA1 object whose +contents remain unchanged until the next call to C. + +=back + +=head2 class Compressor + +The C class has member functions to write +each of the syntactic elements of a ZPAQ stream and to specify +their values. It will compress using either built-in or +user supplied models. + +=over + +=item C + +The constructor creates a Compressor object. No input source, +output destination, or compression model is specified. + +=item C + +Specifies a destination for output. Must be specified before +calling any function that writes data. + +=item C + +Writes a 13 byte marker tag which can be used to identify +the start of a block following non-ZPAQ data. + +=item C + +Writes a block header and specifies a compression model. +If linked with F, then C must be 1, 2, or 3 +to specify I, I, or I respectively. Higher numbers +compress smaller but more slowly. These models are compatible +with both the ZPAQ level 1 and 2 standards. + +=item C + +Writes a block header and specifies the HCOMP portion of the +compression model. The first two bytes of the string should +encode the length of the rest of the string as a 16 bit unsigned +number with the least significant byte first. The meaning of the +rest of the string is defined in the ZPAQ level 2 standard. +If the number of components (C) is 0, then the block +is saved in ZPAQ level 2 format, which cannot be read by +older ZPAQ level 1 decoders. Otherwise the block is saved in +ZPAQ level 1 format, which is compatible with all decoders. 
+ +=item C + +Writes a segment header. C and +C are NUL terminated strings. If specified, then their +values are stored. Normally, C would be a file name +when compressing to an archive or omitted otherwise. If a file +is split among segments, then by convention only the first segment +is named. C is normally the uncompressed size as a decimal +number which is displayed when listing the contents of an archive. +Omitting it does not affect decompression. + +=item C + +Specifies the optional PCOMP string used for post-processing. +It must be called from within the first segment +of each block prior to compressing any data, but not from within +any other segment. +If C is 0 or no argument is passed, then the decompresser +will not post-process the data. The effect is to compress a +0 byte to indicate to the decompresser that no PCOMP string +is present. + +If C is not 0, then I bytes of the string I +are passed. If I is 0 or omitted, then +the first two bytes must encode +the length of the rest of the string as a 16 bit unsigned number +with the least significant byte first. The format of the remainder +of the string is described in the ZPAQ level 2 standard. +The effect is to compress a 1 byte +to indicate the presence of PCOMP, followed by the two length +bytes and the string as passed. For example, either +C or C +would compress the 5 bytes 1, 2, 0, 5, 8. +The user is responsible for pre-processing the input +prior to compression so that PCOMP restores the original data. + +=item C + +Specifies the input source for compression. It must be set +prior to the first call to C. + +=item C + +Compress n bytes of data, or until EOF is input, whichever comes +first. If n < 0 or omitted, then compress until EOF. +Returns true if there is more input available, or false if EOF +was read. + +=item C + +Stop compressing and write the end of a segment. If +C is specified, it should be a 20 byte string +as returned by C on the input data for +this segment I pre-processing. 
+ +=item C + +Finish writing the current block. + +=back + +In order to create a valid ZPAQ stream, the components must +be written in the following order: + + for each block do { + if any non-ZPAQ data then { + write non-ZPAQ data + writeTag() + } + startBlock() + for each segment do { + startSegment() + if first segment in block then { + postProcess() + } + while (compress(n)) ; + endSegment() + } + endBlock() + } + +=head2 class Decompresser + +The class Decompresser has member functions to read each of the +syntactic elements of a ZPAQ stream. + +=over + +=item C + +The constructor creates a Decompresser object. No input source or +output destination is specified. + +=item C + +Specifies where the ZPAQ stream will be read from. Must be called +before any function that reads the stream. + +=item C + +Scan the input to find the start of the next block. If a block +does not start immediately, then the block must be preceded by +a marker tag (written with C) or it will +not be found. If C is not 0, then write the approximate +memory requirement (in bytes) to decompress to C<*memptr>). The +memory will be allocated by the first call to C. +It returns true if a block is found, or false if it reads to EOF +without finding a block. + +=item C + +Write the HCOMP string of the current block to C. +It will be in a format suitable +for passing to C. The first 2 bytes will +encode the length of the rest of the string as a 16 bit unsigned +integer with the least significant byte first. The format of the +remainder of the string is described in the ZPAQ level 1 +specification. + +=item C + +Find the start of the next segment. If another segment is found +within the current block then return true. If the end of the block +is found first, then return false. If a segment is found, the +filename field is not empty, and C +is not 0, then write the filename (without a terminating NUL byte) +to C. 
 +=item C + +Read or skip past the comment field following the filename field +in the segment header. If C is not 0 and the comment field is +not empty, then write the comment +(without a terminating NUL byte) to C. + +=item C + +Specify the destination for decompression. It must be set before +any data can be decompressed. + +=item C + +Specify the address of a SHA1 object for computing the checksum +of the decompressed data (after post-processing). As each byte C +is output, it is also passed to Cput(c)>. In order to +compute the correct checksum, the SHA1 object should be in its +initial state, either newly created, or by calling C, +before the first call to C. When the end of the segment +is reached, the value returned by Cresult()> should match +the stored checksum, if any. + +=item C + +Decode n bytes or until the end of segment, whichever comes +first. Return false if the end of segment is reached first. If +n < 0 or not specified, then decompress to the end of segment +and return false. C is the number of bytes prior to post-processing. +If the data is post-processed, then the size of the output may +be different. + +=item C + +Write the PCOMP string, if any, for the current block to C. +If there is no PCOMP string (no post-processor) then return false. +Otherwise write the string to C in a format suitable for +passing to C and return true. If written, +then the first 2 bytes will encode the length of the rest of the +string as a 16 bit unsigned integer with the least significant +byte first. The format of the rest of the string is described in +the ZPAQ level 1 standard. + +C is only valid after the first call to C +in the current block. To read the PCOMP string without decompressing any +data, then call C first. It is not necessary to +call C in this case. + +=item C + +Skip any compressed data in the current segment that has not yet +been decompressed and advance to the end of the segment. 
+Then if C is not 0 then write into +the 21 byte array that it points to. If a checksum is present, +then write a 1 into C and write the stored checksum +in C. Otherwise write a 0 in C. + +Note that it is not permitted to call decompress() if any compressed +data has been skipped in any earlier segments in the same block. + +=back + +A valid sequence of calls is as follows: + + while (findBlock()) { + while (findFilename()) { + readComment(); + if first segment in block then { (optional) + decompress(0) + pcomp() + } + while (decompress(n)) ; (optional) + readSegmentEnd(); + } + } + +=head1 EXAMPLES + +The following program F +lists the contents of a ZPAQ archive +read from standard input. + + #include + #include + #include "libzpaq.h" + + // Implement Reader and Writer interfaces for file I/O + class File: public libzpaq::Reader, public libzpaq::Writer { + FILE* f; + public: + File(FILE* f_): f(f_) {} + int get() {return getc(f);} + void put(int c) {putc(c, f);} + int read(char* buf, int n) {return fread(buf, 1, n, f);} + void write(const char* buf, int n) {fwrite(buf, 1, n, f);} + }; + + // Implement error handler + namespace libzpaq { + void error(const char* msg) { + fprintf(stderr, "Error: %s\n", msg); + exit(1); + } + } + + // List the contents of an archive. For each block, show + // the memory required to decompress. For each segment, + // show the filename and comment. 
+ void list(FILE* input, FILE* output) { + libzpaq::Decompresser d; + File in(input), out(output); + double memory; + d.setInput(&in); + for (int block=1; d.findBlock(&memory); ++block) { + printf("Block %d needs %1.0f MB\n", block, memory/1e6); + while (d.findFilename(&out)) { // print filename + printf("\t"); + d.readComment(&out); // print comment + printf("\n"); + d.readSegmentEnd(); // skip compressed data + } + } + } + + int main() { + list(stdin, stdout); + return 0; + } + +The program could be compiled as follows: + + g++ listzpaq.cpp libzpaq.cpp + +The following code compresses a list of files into one block +written to stdout. Each file is compressed to a separate +segment. For each segment, the filename, comment, and SHA-1 +checksum are stored. The comment, as conventional, is the +file size as a decimal string. + + // Compress one file to one segment + void compress_file(libzpaq::Compressor& c, + const char* filename, + bool first_segment) { + + // Open input file + FILE* f; + f=fopen(filename, "rb"); + if (!f) return; + + // Compute SHA-1 checksum and file size + libzpaq::SHA1 sha1; + int ch; + while ((ch=getc(f))!=EOF) + sha1.put(ch); + + // Write file size as a comment. + // The size can have at most 19 digits. + char comment[20]; + sprintf(comment, "%1.0f", sha1.size()); + + // Compress segment + rewind(f); + File in(f); + c.startSegment(filename, comment); + if (first_segment) + c.postProcess(); + c.setInput(&in); + c.compress(); + c.endSegment(sha1.result()); + + // Close input file + fclose(f); + } + + // Compress a list of argc files in argv[0...argc-1] into one + // ZPAQ block to stdout at level 2. 
+ void compress_list(int argc, char** argv) { + libzpaq::Compressor c; + File out(stdout); + c.setOutput(&out); + c.startBlock(2); + for (int i=0; i and C can +be passed an argument n to display progress every n bytes, +for example: + + for (int i=1; d.decompress(1000000); ++i) + fprintf(stderr, "Decompressed %d MB\n", i); + +To compress or decompress to and from objects in memory, derive +appropriate classes from C and C. For example, it is +possible to compress or decompress to a C using +the following class. + + struct String: public libzpaq::Writer { + std::string s; + void put(int c) {s+=char(c);} + }; + +This class is also useful for reading the filename and comment +fields during decompression as follows: + + String filename, comment; + while (d.findFilename(&filename)) { + d.readComment(&comment); + // ... + +=head1 AVAILABILITY + +I, I, and the ZPAQ level 1 and 2 specifications are +available from L. + +=head1 SEE ALSO + +C +C + +=cut + + diff --git a/libzpaq501/libzpaq.cpp b/libzpaq501/libzpaq.cpp new file mode 100644 index 0000000..f0c35d5 --- /dev/null +++ b/libzpaq501/libzpaq.cpp @@ -0,0 +1,3181 @@ +/* libzpaq.cpp - Part of LIBZPAQ Version 5.01 + + Copyright (C) 2011, Dell Inc. Written by Matt Mahoney. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so without restriction. + This Software is provided "as is" without warranty. + +LIBZPAQ is a C++ library for compression and decompression of data +conforming to the ZPAQ level 2 standard. 
See http://mattmahoney.net/zpaq/ +*/ + +#include "libzpaq.h" +#include +#include +#include + +#ifndef NOJIT +#ifdef unix +#include +#else +#include +#endif +#endif + +namespace libzpaq { + +// Standard library redirections +void* calloc(size_t a, size_t b) {return ::calloc(a, b);} +void free(void* p) {::free(p);} +int memcmp(const void* d, const void* s, size_t n) { + return ::memcmp(d, s, n);} +void* memset(void* d, int c, size_t n) {return ::memset(d, c, n);} +double log(double x) {return ::log(x);} +double exp(double x) {return ::exp(x);} +double pow(double x, double y) {return ::pow(x, y);} + +// Read 16 bit little-endian number +int toU16(const char* p) { + return (p[0]&255)+256*(p[1]&255); +} + +// Default read() and write() +int Reader::read(char* buf, int n) { + int i=0, c; + while (i=0) + buf[i++]=c; + return i; +} + +void Writer::write(const char* buf, int n) { + for (int i=0; i 0 bytes of executable memory and update +// p to point to it and newsize = n. Free any previously +// allocated memory first. If newsize is 0 then free only. +// Call error in case of failure. If NOJIT, ignore newsize +// and set p=0, n=0 without allocating memory. 
+void allocx(U8* &p, int &n, int newsize) { +#ifdef NOJIT + p=0; + n=0; +#else + if (p || n) { + if (p) +#ifdef unix + munmap(p, n); +#else // Windows + VirtualFree(p, 0, MEM_RELEASE); +#endif + p=0; + n=0; + } + if (newsize>0) { +#ifdef unix + p=(U8*)mmap(0, newsize, PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_PRIVATE|MAP_ANON, -1, 0); + if ((void*)p==MAP_FAILED) p=0; +#else + p=(U8*)VirtualAlloc(0, newsize, MEM_RESERVE|MEM_COMMIT, + PAGE_EXECUTE_READWRITE); +#endif + if (p) + n=newsize; + else { + n=0; + error("allocx failed"); + } + } +#endif +} + +//////////////////////////// SHA1 //////////////////////////// + +// SHA1 code, see http://en.wikipedia.org/wiki/SHA-1 + +// Start a new hash +void SHA1::init() { + len0=len1=0; + h[0]=0x67452301; + h[1]=0xEFCDAB89; + h[2]=0x98BADCFE; + h[3]=0x10325476; + h[4]=0xC3D2E1F0; +} + +// Return old result and start a new hash +const char* SHA1::result() { + + // pad and append length + const U32 s1=len1, s0=len0; + put(0x80); + while ((len0&511)!=448) + put(0); + put(s1>>24); + put(s1>>16); + put(s1>>8); + put(s1); + put(s0>>24); + put(s0>>16); + put(s0>>8); + put(s0); + + // copy h to hbuf + for (int i=0; i<5; ++i) { + hbuf[4*i]=h[i]>>24; + hbuf[4*i+1]=h[i]>>16; + hbuf[4*i+2]=h[i]>>8; + hbuf[4*i+3]=h[i]; + } + + // return hash prior to clearing state + init(); + return hbuf; +} + +// Hash 1 block of 64 bytes +void SHA1::process() { + for (int i=16; i<80; ++i) { + w[i]=w[i-3]^w[i-8]^w[i-14]^w[i-16]; + w[i]=w[i]<<1|w[i]>>31; + } + U32 a=h[0]; + U32 b=h[1]; + U32 c=h[2]; + U32 d=h[3]; + U32 e=h[4]; + const U32 k1=0x5A827999, k2=0x6ED9EBA1, k3=0x8F1BBCDC, k4=0xCA62C1D6; +#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+((b&c)|(~b&d))+k1+w[i]; b=b<<30|b>>2; +#define f5(i) f1(a,b,c,d,e,i) f1(e,a,b,c,d,i+1) f1(d,e,a,b,c,i+2) \ + f1(c,d,e,a,b,i+3) f1(b,c,d,e,a,i+4) + f5(0) f5(5) f5(10) f5(15) +#undef f1 +#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k2+w[i]; b=b<<30|b>>2; + f5(20) f5(25) f5(30) f5(35) +#undef f1 +#define f1(a,b,c,d,e,i) 
e+=(a<<5|a>>27)+((b&c)|(b&d)|(c&d))+k3+w[i]; b=b<<30|b>>2; + f5(40) f5(45) f5(50) f5(55) +#undef f1 +#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k4+w[i]; b=b<<30|b>>2; + f5(60) f5(65) f5(70) f5(75) +#undef f1 +#undef f5 + h[0]+=a; + h[1]+=b; + h[2]+=c; + h[3]+=d; + h[4]+=e; +} + +//////////////////////////// Component /////////////////////// + +// A Component is a context model, indirect context model, match model, +// fixed weight mixer, adaptive 2 input mixer without or with current +// partial byte as context, adaptive m input mixer (without or with), +// or SSE (without or with). + +const int compsize[256]={0,2,3,2,3,4,6,6,3,5}; + +void Component::init() { + limit=cxt=a=b=c=0; + cm.resize(0); + ht.resize(0); + a16.resize(0); +} + +////////////////////////// StateTable ////////////////////////// + +// How many states with count of n0 zeros, n1 ones (0...2) +int StateTable::num_states(int n0, int n1) { + const int B=6; + const int bound[B]={20,48,15,8,6,5}; // n0 -> max n1, n1 -> max n0 + if (n0=B || n0>bound[n1]) return 0; + return 1+(n1>0 && n0+n1<=17); +} + +// New value of count n0 if 1 is observed (and vice versa) +void StateTable::discount(int& n0) { + n0=(n0>=1)+(n0>=2)+(n0>=3)+(n0>=4)+(n0>=5)+(n0>=7)+(n0>=8); +} + +// compute next n0,n1 (0 to N) given input y (0 or 1) +void StateTable::next_state(int& n0, int& n1, int y) { + if (n0 20,0 + // 48,1,0 -> 48,1 + // 15,2,0 -> 8,1 + // 8,3,0 -> 6,2 + // 8,3,1 -> 5,3 + // 6,4,0 -> 5,3 + // 5,5,0 -> 5,4 + // 5,5,1 -> 4,5 + while (!num_states(n0, n1)) { + if (n1<2) --n0; + else { + n0=(n0*(n1-1)+(n1/2))/n1; + --n1; + } + } + } +} + +// Initialize next state table ns[state*4] -> next if 0, next if 1, n0, n1 +StateTable::StateTable() { + + // Assign states by increasing priority + const int N=50; + U8 t[N][N][2]={{{0}}}; // (n0,n1,y) -> state number + int state=0; + for (int i=0; i=0 && n<=2); + if (n) { + t[n0][n1][0]=state; + t[n0][n1][1]=state+n-1; + state+=n; + } + } + } + + // Generate next state table + 
memset(ns, 0, sizeof(ns)); + for (int n0=0; n0=0 && s<256); + int s0=n0, s1=n1; + next_state(s0, s1, 0); + assert(s0>=0 && s0=0 && s1=0 && s0=0 && s1=7); + assert(hbegin>=cend); + assert(hend>=hbegin); + assert(out2); + if (!pp) { // if not a postprocessor then write COMP + for (int i=0; iput(header[i]); + } + else { // write PCOMP size only + out2->put((hend-hbegin)&255); + out2->put((hend-hbegin)>>8); + } + for (int i=hbegin; iput(header[i]); + return true; +} + +// Read header from in2 +int ZPAQL::read(Reader* in2) { + + // Get header size and allocate + int hsize=in2->get(); + hsize+=in2->get()*256; + header.resize(hsize+300); + cend=hbegin=hend=0; + header[cend++]=hsize&255; + header[cend++]=hsize>>8; + while (cend<7) header[cend++]=in2->get(); // hh hm ph pm n + + // Read COMP + int n=header[cend-1]; + for (int i=0; iget(); // component type + if (type==-1) error("unexpected end of file"); + header[cend++]=type; // component type + int size=compsize[type]; + if (size<1) error("Invalid component type"); + if (cend+size>header.isize()-8) error("COMP list too big"); + for (int j=1; jget(); + } + if ((header[cend++]=in2->get())!=0) error("missing COMP END"); + + // Insert a guard gap and read HCOMP + hbegin=hend=cend+128; + while (hendget(); + if (op==-1) error("unexpected end of file"); + header[hend++]=op; + } + if ((header[hend++]=in2->get())!=0) error("missing HCOMP END"); + assert(cend>=7 && cendhbegin && hend6); + assert(output==0); + assert(sha1==0); + init(header[2], header[3]); // hh, hm +} + +// Initialize machine state as PCOMP +void ZPAQL::initp() { + assert(header.isize()>6); + init(header[4], header[5]); // ph, pm +} + +// Flush pending output +void ZPAQL::flush() { + if (output) output->write(&outbuf[0], bufptr); + if (sha1) for (int i=0; iput(U8(outbuf[i])); + bufptr=0; +} + +// Return memory requirement in bytes +double ZPAQL::memory() { + double mem=pow(2.0,header[2]+2)+pow(2.0,header[3]) // hh hm + +pow(2.0,header[4]+2)+pow(2.0,header[5]) // ph 
pm + +header.size(); + int cp=7; // start of comp list + for (int i=0; i0); + assert(cend>=7); + assert(hbegin>=cend+128); + assert(hend>=hbegin); + assert(hend0); + h.resize(1, hbits); + m.resize(1, mbits); + r.resize(256); + a=b=c=d=pc=f=0; +} + +// Run program on input by interpreting header +void ZPAQL::run0(U32 input) { + assert(cend>6); + assert(hbegin>=cend+128); + assert(hend>=hbegin); + assert(hend0); + assert(h.size()>0); + assert(header[0]+256*header[1]==cend+hend-hbegin-2); + pc=hbegin; + a=input; + while (execute()) ; +} + +// Execute one instruction, return 0 after HALT else 1 +int ZPAQL::execute() { + switch(header[pc++]) { + case 0: err(); break; // ERROR + case 1: ++a; break; // A++ + case 2: --a; break; // A-- + case 3: a = ~a; break; // A! + case 4: a = 0; break; // A=0 + case 7: a = r[header[pc++]]; break; // A=R N + case 8: swap(b); break; // B<>A + case 9: ++b; break; // B++ + case 10: --b; break; // B-- + case 11: b = ~b; break; // B! + case 12: b = 0; break; // B=0 + case 15: b = r[header[pc++]]; break; // B=R N + case 16: swap(c); break; // C<>A + case 17: ++c; break; // C++ + case 18: --c; break; // C-- + case 19: c = ~c; break; // C! + case 20: c = 0; break; // C=0 + case 23: c = r[header[pc++]]; break; // C=R N + case 24: swap(d); break; // D<>A + case 25: ++d; break; // D++ + case 26: --d; break; // D-- + case 27: d = ~d; break; // D! + case 28: d = 0; break; // D=0 + case 31: d = r[header[pc++]]; break; // D=R N + case 32: swap(m(b)); break; // *B<>A + case 33: ++m(b); break; // *B++ + case 34: --m(b); break; // *B-- + case 35: m(b) = ~m(b); break; // *B! + case 36: m(b) = 0; break; // *B=0 + case 39: if (f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JT N + case 40: swap(m(c)); break; // *C<>A + case 41: ++m(c); break; // *C++ + case 42: --m(c); break; // *C-- + case 43: m(c) = ~m(c); break; // *C! 
+ case 44: m(c) = 0; break; // *C=0 + case 47: if (!f) pc+=((header[pc]+128)&255)-127; else ++pc; break; // JF N + case 48: swap(h(d)); break; // *D<>A + case 49: ++h(d); break; // *D++ + case 50: --h(d); break; // *D-- + case 51: h(d) = ~h(d); break; // *D! + case 52: h(d) = 0; break; // *D=0 + case 55: r[header[pc++]] = a; break; // R=A N + case 56: return 0 ; // HALT + case 57: outc(a&255); break; // OUT + case 59: a = (a+m(b)+512)*773; break; // HASH + case 60: h(d) = (h(d)+a+512)*773; break; // HASHD + case 63: pc+=((header[pc]+128)&255)-127; break; // JMP N + case 64: a = a; break; // A=A + case 65: a = b; break; // A=B + case 66: a = c; break; // A=C + case 67: a = d; break; // A=D + case 68: a = m(b); break; // A=*B + case 69: a = m(c); break; // A=*C + case 70: a = h(d); break; // A=*D + case 71: a = header[pc++]; break; // A= N + case 72: b = a; break; // B=A + case 73: b = b; break; // B=B + case 74: b = c; break; // B=C + case 75: b = d; break; // B=D + case 76: b = m(b); break; // B=*B + case 77: b = m(c); break; // B=*C + case 78: b = h(d); break; // B=*D + case 79: b = header[pc++]; break; // B= N + case 80: c = a; break; // C=A + case 81: c = b; break; // C=B + case 82: c = c; break; // C=C + case 83: c = d; break; // C=D + case 84: c = m(b); break; // C=*B + case 85: c = m(c); break; // C=*C + case 86: c = h(d); break; // C=*D + case 87: c = header[pc++]; break; // C= N + case 88: d = a; break; // D=A + case 89: d = b; break; // D=B + case 90: d = c; break; // D=C + case 91: d = d; break; // D=D + case 92: d = m(b); break; // D=*B + case 93: d = m(c); break; // D=*C + case 94: d = h(d); break; // D=*D + case 95: d = header[pc++]; break; // D= N + case 96: m(b) = a; break; // *B=A + case 97: m(b) = b; break; // *B=B + case 98: m(b) = c; break; // *B=C + case 99: m(b) = d; break; // *B=D + case 100: m(b) = m(b); break; // *B=*B + case 101: m(b) = m(c); break; // *B=*C + case 102: m(b) = h(d); break; // *B=*D + case 103: m(b) = header[pc++]; break; // 
*B= N + case 104: m(c) = a; break; // *C=A + case 105: m(c) = b; break; // *C=B + case 106: m(c) = c; break; // *C=C + case 107: m(c) = d; break; // *C=D + case 108: m(c) = m(b); break; // *C=*B + case 109: m(c) = m(c); break; // *C=*C + case 110: m(c) = h(d); break; // *C=*D + case 111: m(c) = header[pc++]; break; // *C= N + case 112: h(d) = a; break; // *D=A + case 113: h(d) = b; break; // *D=B + case 114: h(d) = c; break; // *D=C + case 115: h(d) = d; break; // *D=D + case 116: h(d) = m(b); break; // *D=*B + case 117: h(d) = m(c); break; // *D=*C + case 118: h(d) = h(d); break; // *D=*D + case 119: h(d) = header[pc++]; break; // *D= N + case 128: a += a; break; // A+=A + case 129: a += b; break; // A+=B + case 130: a += c; break; // A+=C + case 131: a += d; break; // A+=D + case 132: a += m(b); break; // A+=*B + case 133: a += m(c); break; // A+=*C + case 134: a += h(d); break; // A+=*D + case 135: a += header[pc++]; break; // A+= N + case 136: a -= a; break; // A-=A + case 137: a -= b; break; // A-=B + case 138: a -= c; break; // A-=C + case 139: a -= d; break; // A-=D + case 140: a -= m(b); break; // A-=*B + case 141: a -= m(c); break; // A-=*C + case 142: a -= h(d); break; // A-=*D + case 143: a -= header[pc++]; break; // A-= N + case 144: a *= a; break; // A*=A + case 145: a *= b; break; // A*=B + case 146: a *= c; break; // A*=C + case 147: a *= d; break; // A*=D + case 148: a *= m(b); break; // A*=*B + case 149: a *= m(c); break; // A*=*C + case 150: a *= h(d); break; // A*=*D + case 151: a *= header[pc++]; break; // A*= N + case 152: div(a); break; // A/=A + case 153: div(b); break; // A/=B + case 154: div(c); break; // A/=C + case 155: div(d); break; // A/=D + case 156: div(m(b)); break; // A/=*B + case 157: div(m(c)); break; // A/=*C + case 158: div(h(d)); break; // A/=*D + case 159: div(header[pc++]); break; // A/= N + case 160: mod(a); break; // A%=A + case 161: mod(b); break; // A%=B + case 162: mod(c); break; // A%=C + case 163: mod(d); break; // 
A%=D + case 164: mod(m(b)); break; // A%=*B + case 165: mod(m(c)); break; // A%=*C + case 166: mod(h(d)); break; // A%=*D + case 167: mod(header[pc++]); break; // A%= N + case 168: a &= a; break; // A&=A + case 169: a &= b; break; // A&=B + case 170: a &= c; break; // A&=C + case 171: a &= d; break; // A&=D + case 172: a &= m(b); break; // A&=*B + case 173: a &= m(c); break; // A&=*C + case 174: a &= h(d); break; // A&=*D + case 175: a &= header[pc++]; break; // A&= N + case 176: a &= ~ a; break; // A&~A + case 177: a &= ~ b; break; // A&~B + case 178: a &= ~ c; break; // A&~C + case 179: a &= ~ d; break; // A&~D + case 180: a &= ~ m(b); break; // A&~*B + case 181: a &= ~ m(c); break; // A&~*C + case 182: a &= ~ h(d); break; // A&~*D + case 183: a &= ~ header[pc++]; break; // A&~ N + case 184: a |= a; break; // A|=A + case 185: a |= b; break; // A|=B + case 186: a |= c; break; // A|=C + case 187: a |= d; break; // A|=D + case 188: a |= m(b); break; // A|=*B + case 189: a |= m(c); break; // A|=*C + case 190: a |= h(d); break; // A|=*D + case 191: a |= header[pc++]; break; // A|= N + case 192: a ^= a; break; // A^=A + case 193: a ^= b; break; // A^=B + case 194: a ^= c; break; // A^=C + case 195: a ^= d; break; // A^=D + case 196: a ^= m(b); break; // A^=*B + case 197: a ^= m(c); break; // A^=*C + case 198: a ^= h(d); break; // A^=*D + case 199: a ^= header[pc++]; break; // A^= N + case 200: a <<= (a&31); break; // A<<=A + case 201: a <<= (b&31); break; // A<<=B + case 202: a <<= (c&31); break; // A<<=C + case 203: a <<= (d&31); break; // A<<=D + case 204: a <<= (m(b)&31); break; // A<<=*B + case 205: a <<= (m(c)&31); break; // A<<=*C + case 206: a <<= (h(d)&31); break; // A<<=*D + case 207: a <<= (header[pc++]&31); break; // A<<= N + case 208: a >>= (a&31); break; // A>>=A + case 209: a >>= (b&31); break; // A>>=B + case 210: a >>= (c&31); break; // A>>=C + case 211: a >>= (d&31); break; // A>>=D + case 212: a >>= (m(b)&31); break; // A>>=*B + case 213: a >>= 
(m(c)&31); break; // A>>=*C + case 214: a >>= (h(d)&31); break; // A>>=*D + case 215: a >>= (header[pc++]&31); break; // A>>= N + case 216: f = (a == a); break; // A==A + case 217: f = (a == b); break; // A==B + case 218: f = (a == c); break; // A==C + case 219: f = (a == d); break; // A==D + case 220: f = (a == U32(m(b))); break; // A==*B + case 221: f = (a == U32(m(c))); break; // A==*C + case 222: f = (a == h(d)); break; // A==*D + case 223: f = (a == U32(header[pc++])); break; // A== N + case 224: f = (a < a); break; // A a); break; // A>A + case 233: f = (a > b); break; // A>B + case 234: f = (a > c); break; // A>C + case 235: f = (a > d); break; // A>D + case 236: f = (a > U32(m(b))); break; // A>*B + case 237: f = (a > U32(m(c))); break; // A>*C + case 238: f = (a > h(d)); break; // A>*D + case 239: f = (a > U32(header[pc++])); break; // A> N + case 255: if((pc=hbegin+header[pc]+256*header[pc+1])>=hend)err();break;//LJ + default: err(); + } + return 1; +} + +// Print illegal instruction error message and exit +void ZPAQL::err() { + error("ZPAQL execution error"); +} + +///////////////////////// Predictor ///////////////////////// + +// Initailize model-independent tables +Predictor::Predictor(ZPAQL& zr): + c8(1), hmap4(1), z(zr) { + assert(sizeof(U8)==1); + assert(sizeof(U16)==2); + assert(sizeof(U32)==4); + assert(sizeof(U64)==8); + assert(sizeof(short)==2); + assert(sizeof(int)==4); + + // Initialize tables + dt2k[0]=0; + for (int i=1; i<256; ++i) + dt2k[i]=2048/i; + for (int i=0; i<1024; ++i) + dt[i]=(1<<17)/(i*2+3)*2; + for (int i=0; i<32768; ++i) + stretcht[i]=int(log((i+0.5)/(32767.5-i))*64+0.5+100000)-100000; + for (int i=0; i<4096; ++i) + squasht[i]=int(32768.0/(1+exp((i-2048)*(-1.0/64)))); + + // Verify floating point math for squash() and stretch() + U32 sqsum=0, stsum=0; + for (int i=32767; i>=0; --i) + stsum=stsum*3+stretch(i); + for (int i=4095; i>=0; --i) + sqsum=sqsum*3+squash(i-2048); + assert(stsum==3887533746u); + 
assert(sqsum==2278286169u); + + pcode=0; + pcode_size=0; +} + +Predictor::~Predictor() { + allocx(pcode, pcode_size, 0); // free executable memory +} + +// Initialize the predictor with a new model in z +void Predictor::init() { + + // Clear old JIT code if any + allocx(pcode, pcode_size, 0); + + // Initialize context hash function + z.inith(); + + // Initialize predictions + for (int i=0; i<256; ++i) h[i]=p[i]=0; + + // Initialize components + for (int i=0; i<256; ++i) // clear old model + comp[i].init(); + int n=z.header[6]; // hsize[0..1] hh hm ph pm n (comp)[n] END 0[128] (hcomp) END + const U8* cp=&z.header[7]; // start of component list + for (int i=0; i&z.header[0] && cp<&z.header[z.header.isize()-8]); + Component& cr=comp[i]; + switch(cp[0]) { + case CONS: // c + p[i]=(cp[1]-128)*4; + break; + case CM: // sizebits limit + if (cp[1]>32) error("max size for CM is 32"); + cr.cm.resize(1, cp[1]); // packed CM (22 bits) + CMCOUNT (10 bits) + cr.limit=cp[2]*4; + for (size_t j=0; j26) error("max size for ICM is 26"); + cr.limit=1023; + cr.cm.resize(256); + cr.ht.resize(64, cp[1]); + for (size_t j=0; j32 || cp[2]>32) error("max size for MATCH is 32 32"); + cr.cm.resize(1, cp[1]); // index + cr.ht.resize(1, cp[2]); // buf + cr.ht(0)=1; + break; + case AVG: // j k wt + if (cp[1]>=i) error("AVG j >= i"); + if (cp[2]>=i) error("AVG k >= i"); + break; + case MIX2: // sizebits j k rate mask + if (cp[1]>32) error("max size for MIX2 is 32"); + if (cp[3]>=i) error("MIX2 k >= i"); + if (cp[2]>=i) error("MIX2 j >= i"); + cr.c=(size_t(1)<32) error("max size for MIX is 32"); + if (cp[2]>=i) error("MIX j >= i"); + if (cp[3]<1 || cp[3]>i-cp[2]) error("MIX m not in 1..i-j"); + int m=cp[3]; // number of inputs + assert(m>=1); + cr.c=(size_t(1)<32) error("max size for ISSE is 32"); + if (cp[2]>=i) error("ISSE j >= i"); + cr.ht.resize(64, cp[1]); + cr.cm.resize(512); + for (int j=0; j<256; ++j) { + cr.cm[j*2]=1<<15; + cr.cm[j*2+1]=clamp512k(stretch(st.cminit(j)>>8)<<10); + } + break; 
+ case SSE: // sizebits j start limit + if (cp[1]>32) error("max size for SSE is 32"); + if (cp[2]>=i) error("SSE j >= i"); + if (cp[3]>cp[4]*4) error("SSE start > limit*4"); + cr.cm.resize(32, cp[1]); + cr.limit=cp[4]*4; + for (size_t j=0; j0); + cp+=compsize[*cp]; + assert(cp>=&z.header[7] && cp<&z.header[z.cend]); + } +} + +// Return next bit prediction using interpreted COMP code +int Predictor::predict0() { + assert(c8>=1 && c8<=255); + + // Predict next bit + int n=z.header[6]; + assert(n>0 && n<=255); + const U8* cp=&z.header[7]; + assert(cp[-1]==n); + for (int i=0; i&z.header[0] && cp<&z.header[z.header.isize()-8]); + Component& cr=comp[i]; + switch(cp[0]) { + case CONS: // c + break; + case CM: // sizebits limit + cr.cxt=h[i]^hmap4; + p[i]=stretch(cr.cm(cr.cxt)>>17); + break; + case ICM: // sizebits + assert((hmap4&15)>0); + if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); + cr.cxt=cr.ht[cr.c+(hmap4&15)]; + p[i]=stretch(cr.cm(cr.cxt)>>8); + break; + case MATCH: // sizebits bufbits: a=len, b=offset, c=bit, cxt=bitpos, + // ht=buf, limit=pos + assert(cr.cm.size()==(size_t(1)<>(7-cr.cxt))&1; // predicted bit + p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767); + } + break; + case AVG: // j k wt + p[i]=(p[cp[1]]*cp[3]+p[cp[2]]*(256-cp[3]))>>8; + break; + case MIX2: { // sizebits j k rate mask + // c=size cm=wt[size] cxt=input + cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1)); + assert(cr.cxt=0 && w<65536); + p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16; + assert(p[i]>=-2048 && p[i]<2048); + } + break; + case MIX: { // sizebits j m rate mask + // c=size cm=wt[size][m] cxt=index of wt in cm + int m=cp[3]; + assert(m>=1 && m<=i); + cr.cxt=h[i]+(c8&cp[5]); + cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights + assert(cr.cxt<=cr.cm.size()-m); + int* wt=(int*)&cr.cm[cr.cxt]; + p[i]=0; + for (int j=0; j>8)*p[cp[2]+j]; + p[i]=clamp2k(p[i]>>8); + } + break; + case ISSE: { // sizebits j -- c=hi, cxt=bh + assert((hmap4&15)>0); + if (c8==1 || (c8&0xf0)==16) + 
cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); + cr.cxt=cr.ht[cr.c+(hmap4&15)]; // bit history + int *wt=(int*)&cr.cm[cr.cxt*2]; + p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16); + } + break; + case SSE: { // sizebits j start limit + cr.cxt=(h[i]+c8)*32; + int pq=p[cp[2]]+992; + if (pq<0) pq=0; + if (pq>1983) pq=1983; + int wt=pq&63; + pq>>=6; + assert(pq>=0 && pq<=30); + cr.cxt+=pq; + p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt)+(cr.cm(cr.cxt+1)>>10)*wt)>>13); + cr.cxt+=wt>>5; + } + break; + default: + error("component predict not implemented"); + } + cp+=compsize[cp[0]]; + assert(cp<&z.header[z.cend]); + assert(p[i]>=-2048 && p[i]<2048); + } + assert(cp[0]==NONE); + return squash(p[n-1]); +} + +// Update model with decoded bit y (0...1) +void Predictor::update0(int y) { + assert(y==0 || y==1); + assert(c8>=1 && c8<=255); + assert(hmap4>=1 && hmap4<=511); + + // Update components + const U8* cp=&z.header[7]; + int n=z.header[6]; + assert(n>=1 && n<=255); + assert(cp[-1]==n); + for (int i=0; i>8))>>2; + } + break; + case MATCH: // sizebits bufbits: + // a=len, b=offset, c=bit, cm=index, cxt=bitpos + // ht=buf, limit=pos + { + assert(cr.a<=255); + assert(cr.c==0 || cr.c==1); + assert(cr.cxt<8); + assert(cr.cm.size()==(size_t(1)<>5; + int w=cr.a16[cr.cxt]; + w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13; + if (w<0) w=0; + if (w>65535) w=65535; + cr.a16[cr.cxt]=w; + } + break; + case MIX: { // sizebits j m rate mask + // cm=wt[size][m], cxt=input + int m=cp[3]; + assert(m>0 && m<=i); + assert(cr.cm.size()==m*cr.c); + assert(cr.cxt+m<=cr.cm.size()); + int err=(y*32767-squash(p[i]))*cp[4]>>4; + int* wt=(int*)&cr.cm[cr.cxt]; + for (int j=0; j>13)); + } + break; + case ISSE: { // sizebits j -- c=hi, cxt=bh + assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]); + int err=y*32767-squash(p[i]); + int *wt=(int*)&cr.cm[cr.cxt*2]; + wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13)); + wt[1]=clamp512k(wt[1]+((err+16)>>5)); + cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y); + } + break; + case SSE: // sizebits j 
start limit + train(cr, y); + break; + default: + assert(0); + } + cp+=compsize[cp[0]]; + assert(cp>=&z.header[7] && cp<&z.header[z.cend] + && cp<&z.header[z.header.isize()-8]); + } + assert(cp[0]==NONE); + + // Save bit y in c8, hmap4 + c8+=c8+y; + if (c8>=256) { + z.run(c8-256); + hmap4=1; + c8=1; + for (int i=0; i=16 && c8<32) + hmap4=(hmap4&0xf)<<5|y<<4|1; + else + hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf); +} + +// Find cxt row in hash table ht. ht has rows of 16 indexed by the +// low sizebits of cxt with element 0 having the next higher 8 bits for +// collision detection. If not found after 3 adjacent tries, replace the +// row with lowest element 1 as priority. Return index of row. +size_t Predictor::find(Array& ht, int sizebits, U32 cxt) { + assert(ht.size()==size_t(16)<>sizebits&255; + size_t h0=(cxt*16)&(ht.size()-16); + if (ht[h0]==chk) return h0; + size_t h1=h0^16; + if (ht[h1]==chk) return h1; + size_t h2=h0^32; + if (ht[h2]==chk) return h2; + if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1]) + return memset(&ht[h0], 0, 16), ht[h0]=chk, h0; + else if (ht[h1+1]get(); + if (c<0) error("unexpected end of input"); + curr=curr<<8|c; + } + } + U32 n=buf.size(); + if (n>curr) n=curr; + high=in->read(&buf[0], n); + curr-=high; + low=0; +} + +// Return next bit of decoded input, which has 16 bit probability p of being 1 +int Decoder::decode(int p) { + assert(p>=0 && p<65536); + assert(high>low && low>0); + if (currhigh) error("archive corrupted"); + assert(curr>=low && curr<=high); + U32 mid=low+U32(((high-low)*U64(U32(p)))>>16); // split range + assert(high>mid && mid>=low); + int y=curr<=mid; + if (y) high=mid; else low=mid+1; // pick half + while ((high^low)<0x1000000) { // shift out identical leading bytes + high=high<<8|255; + low=low<<8; + low+=(low==0); + int c=in->get(); + if (c<0) error("unexpected end of file"); + curr=curr<<8|c; + } + return y; +} + +// Decompress 1 byte or -1 at end of input +int Decoder::decompress() { + if (pr.isModeled()) { // n>0 
components? + if (curr==0) { // segment initialization + for (int i=0; i<4; ++i) + curr=curr<<8|in->get(); + } + if (decode(0)) { + if (curr!=0) error("decoding end of stream"); + return -1; + } + else { + int c=1; + while (c<256) { // get 8 bits + int p=pr.predict()*2+1; + c+=c+decode(p); + pr.update(c&1); + } + return c-256; + } + } + else { + if (low==high) loadbuf(); + if (low==high) return -1; + return buf[low++]&255; + } +} + +// Find end of compressed data and return next byte +int Decoder::skip() { + int c=-1; + if (pr.isModeled()) { + while (curr==0) // at start? + curr=in->get(); + while (curr && (c=in->get())>=0) // find 4 zeros + curr=curr<<8|c; + while ((c=in->get())==0) ; // might be more than 4 + return c; + } + else { + if (curr==0) // at start? + for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c; + while (curr>0) { + U32 n=BUFSIZE; + if (n>curr) n=curr; + U32 n1=in->read(&buf[0], n); + curr-=n1; + if (n1!=n) return -1; + if (curr==0) + for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c; + } + if (c>=0) c=in->get(); + return c; + } +} + +////////////////////// PostProcessor ////////////////////// + +// Copy ph, pm from block header +void PostProcessor::init(int h, int m) { + state=hsize=0; + ph=h; + pm=m; + z.clear(); +} + +// (PASS=0 | PROG=1 psize[0..1] pcomp[0..psize-1]) data... 
EOB=-1 +// Return state: 1=PASS, 2..4=loading PROG, 5=PROG loaded +int PostProcessor::write(int c) { + assert(c>=-1 && c<=255); + switch (state) { + case 0: // initial state + if (c<0) error("Unexpected EOS"); + state=c+1; // 1=PASS, 2=PROG + if (state>2) error("unknown post processing type"); + if (state==1) z.clear(); + break; + case 1: // PASS + z.outc(c); + break; + case 2: // PROG + if (c<0) error("Unexpected EOS"); + hsize=c; // low byte of size + state=3; + break; + case 3: // PROG psize[0] + if (c<0) error("Unexpected EOS"); + hsize+=c*256; // high byte of psize + z.header.resize(hsize+300); + z.cend=8; + z.hbegin=z.hend=z.cend+128; + z.header[4]=ph; + z.header[5]=pm; + state=4; + break; + case 4: // PROG psize[0..1] pcomp[0...] + if (c<0) error("Unexpected EOS"); + assert(z.hend>8; + z.initp(); + state=5; + } + break; + case 5: // PROG ... data + z.run(c); + if (c<0) z.flush(); + break; + } + return state; +} + +/////////////////////// Decompresser ///////////////////// + +// Find the start of a block and return true if found. Set memptr +// to memory used. +bool Decompresser::findBlock(double* memptr) { + assert(state==BLOCK); + + // Find start of block + U32 h1=0x3D49B113, h2=0x29EB7F93, h3=0x2614BE13, h4=0x3828EB13; + // Rolling hashes initialized to hash of first 13 bytes + int c; + while ((c=dec.in->get())!=-1) { + h1=h1*12+c; + h2=h2*20+c; + h3=h3*28+c; + h4=h4*44+c; + if (h1==0xB16B88F1 && h2==0xFF5376F1 && h3==0x72AC5BF1 && h4==0x2F909AF1) + break; // hash of 16 byte string + } + if (c==-1) return false; + + // Read header + if ((c=dec.in->get())!=1 && c!=2) error("unsupported ZPAQ level"); + if (dec.in->get()!=1) error("unsupported ZPAQL type"); + z.read(dec.in); + if (c==1 && z.header.isize()>6 && z.header[6]==0) + error("ZPAQ level 1 requires at least 1 component"); + if (memptr) *memptr=z.memory(); + state=FILENAME; + decode_state=FIRSTSEG; + return true; +} + +// Read the start of a segment (1) or end of block code (255). 
+// If a segment is found, write the filename and return true, else false. +bool Decompresser::findFilename(Writer* filename) { + assert(state==FILENAME); + int c=dec.in->get(); + if (c==1) { // segment found + while (true) { + c=dec.in->get(); + if (c==-1) error("unexpected EOF"); + if (c==0) { + state=COMMENT; + return true; + } + if (filename) filename->put(c); + } + } + else if (c==255) { // end of block found + state=BLOCK; + return false; + } + else + error("missing segment or end of block"); + return false; +} + +// Read the comment from the segment header +void Decompresser::readComment(Writer* comment) { + assert(state==COMMENT); + state=DATA; + while (true) { + int c=dec.in->get(); + if (c==-1) error("unexpected EOF"); + if (c==0) break; + if (comment) comment->put(c); + } + if (dec.in->get()!=0) error("missing reserved byte"); +} + +// Decompress n bytes, or all if n < 0. Return false if done +bool Decompresser::decompress(int n) { + assert(state==DATA); + assert(decode_state!=SKIP); + + // Initialize models to start decompressing block + if (decode_state==FIRSTSEG) { + dec.init(); + assert(z.header.size()>5); + pp.init(z.header[4], z.header[5]); + decode_state=SEG; + } + + // Decompress and load PCOMP into postprocessor + while ((pp.getState()&3)!=1) + pp.write(dec.decompress()); + + // Decompress n bytes, or all if n < 0 + while (n) { + int c=dec.decompress(); + pp.write(c); + if (c==-1) { + state=SEGEND; + return false; + } + if (n>0) --n; + } + return true; +} + +// Read end of block. If a SHA1 checksum is present, write 1 and the +// 20 byte checksum into sha1string, else write 0 in first byte. +// If sha1string is 0 then discard it. 
+void Decompresser::readSegmentEnd(char* sha1string) { + assert(state==DATA || state==SEGEND); + + // Skip remaining data if any and get next byte + int c=0; + if (state==DATA) { + c=dec.skip(); + decode_state=SKIP; + } + else if (state==SEGEND) + c=dec.in->get(); + state=FILENAME; + + // Read checksum + if (c==254) { + if (sha1string) sha1string[0]=0; // no checksum + } + else if (c==253) { + if (sha1string) sha1string[0]=1; + for (int i=1; i<=20; ++i) { + c=dec.in->get(); + if (sha1string) sha1string[i]=c; + } + } + else + error("missing end of segment marker"); +} + +/////////////////////////// decompress() ///////////////////// + +void decompress(Reader* in, Writer* out) { + Decompresser d; + d.setInput(in); + d.setOutput(out); + while (d.findBlock()) { // don't calculate memory + while (d.findFilename()) { // discard filename + d.readComment(); // discard comment + d.decompress(); // to end of segment + d.readSegmentEnd(); // discard sha1string + } + } +} + +////////////////////// Encoder //////////////////// + +// Initialize for start of block +void Encoder::init() { + low=1; + high=0xFFFFFFFF; + pr.init(); + if (!pr.isModeled()) low=0, buf.resize(1<<16); +} + +// compress bit y having probability p/64K +void Encoder::encode(int y, int p) { + assert(out); + assert(p>=0 && p<65536); + assert(y==0 || y==1); + assert(high>low && low>0); + U32 mid=low+U32(((high-low)*U64(U32(p)))>>16); // split range + assert(high>mid && mid>=low); + if (y) high=mid; else low=mid+1; // pick half + while ((high^low)<0x1000000) { // write identical leading bytes + out->put(high>>24); // same as low>>24 + high=high<<8|255; + low=low<<8; + low+=(low==0); // so we don't code 4 0 bytes in a row + } +} + +// compress byte c (0..255 or -1=EOS) +void Encoder::compress(int c) { + assert(out); + if (pr.isModeled()) { + if (c==-1) + encode(1, 0); + else { + assert(c>=0 && c<=255); + encode(0, 0); + for (int i=7; i>=0; --i) { + int p=pr.predict()*2+1; + assert(p>0 && p<65536); + int y=c>>i&1; 
+ encode(y, p); + pr.update(y); + } + } + } + else { + if (c<0 || low==buf.size()) { + out->put((low>>24)&255); + out->put((low>>16)&255); + out->put((low>>8)&255); + out->put(low&255); + out->write(&buf[0], low); + low=0; + } + if (c>=0) buf[low++]=c; + } +} + +///////////////////// Compressor ////////////////////// + +// Write 13 byte start tag +// "\x37\x6B\x53\x74\xA0\x31\x83\xD3\x8C\xB2\x28\xB0\xD3" +void Compressor::writeTag() { + assert(state==INIT); + enc.out->put(0x37); + enc.out->put(0x6b); + enc.out->put(0x53); + enc.out->put(0x74); + enc.out->put(0xa0); + enc.out->put(0x31); + enc.out->put(0x83); + enc.out->put(0xd3); + enc.out->put(0x8c); + enc.out->put(0xb2); + enc.out->put(0x28); + enc.out->put(0xb0); + enc.out->put(0xd3); +} + +void Compressor::startBlock(int level) { + + // Model 1 - min.cfg + static const char models[]={ + 26,0,1,2,0,0,2,3,16,8,19,0,0,96,4,28, + 59,10,59,112,25,10,59,10,59,112,56,0, + + // Model 2 - mid.cfg + 69,0,3,3,0,0,8,3,5,8,13,0,8,17,1,8, + 18,2,8,18,3,8,19,4,4,22,24,7,16,0,7,24, + -1,0,17,104,74,4,95,1,59,112,10,25,59,112,10,25, + 59,112,10,25,59,112,10,25,59,112,10,25,59,10,59,112, + 25,69,-49,8,112,56,0, + + // Model 3 - max.cfg + -60,0,5,9,0,0,22,1,-96,3,5,8,13,1,8,16, + 2,8,18,3,8,19,4,8,19,5,8,20,6,4,22,24, + 3,17,8,19,9,3,13,3,13,3,13,3,14,7,16,0, + 15,24,-1,7,8,0,16,10,-1,6,0,15,16,24,0,9, + 8,17,32,-1,6,8,17,18,16,-1,9,16,19,32,-1,6, + 0,19,20,16,0,0,17,104,74,4,95,2,59,112,10,25, + 59,112,10,25,59,112,10,25,59,112,10,25,59,112,10,25, + 59,10,59,112,10,25,59,112,10,25,69,-73,32,-17,64,47, + 14,-25,91,47,10,25,60,26,48,-122,-105,20,112,63,9,70, + -33,0,39,3,25,112,26,52,25,25,74,10,4,59,112,25, + 10,4,59,112,25,10,4,59,112,25,65,-113,-44,72,4,59, + 112,8,-113,-40,8,68,-81,60,60,25,69,-49,9,112,25,25, + 25,25,25,112,56,0, + + 0,0}; // 0,0 = end of list + + if (level<1) error("compression level must be at least 1"); + const char* p=models; + int i; + for (i=1; iput('z'); + enc.out->put('P'); + enc.out->put('Q'); + 
enc.out->put(1+(len>6 && hcomp[6]==0)); // level 1 or 2 + enc.out->put(1); + for (int i=0; iput(hcomp[i]); + MemoryReader m(hcomp); + z.read(&m); + state=BLOCK1; +} + +// Write a segment header +void Compressor::startSegment(const char* filename, const char* comment) { + assert(state==BLOCK1 || state==BLOCK2); + enc.out->put(1); + while (filename && *filename) + enc.out->put(*filename++); + enc.out->put(0); + while (comment && *comment) + enc.out->put(*comment++); + enc.out->put(0); + enc.out->put(0); + if (state==BLOCK1) state=SEG1; + if (state==BLOCK2) state=SEG2; +} + +// Initialize encoding and write pcomp to first segment +// If len is 0 then length is encoded in pcomp[0..1] +void Compressor::postProcess(const char* pcomp, int len) { + assert(state==SEG1); + enc.init(); + if (pcomp) { + enc.compress(1); + if (len<=0) { + len=toU16(pcomp); + pcomp+=2; + } + enc.compress(len&255); + enc.compress((len>>8)&255); + for (int i=0; iget())>=0) { + enc.compress(ch); + if (n>0) --n; + } + return ch>=0; +} + +// End segment, write sha1string if present +void Compressor::endSegment(const char* sha1string) { + assert(state==SEG2); + enc.compress(-1); + enc.out->put(0); + enc.out->put(0); + enc.out->put(0); + enc.out->put(0); + if (sha1string) { + enc.out->put(253); + for (int i=0; i<20; ++i) + enc.out->put(sha1string[i]); + } + else + enc.out->put(254); + state=BLOCK2; +} + +// End block +void Compressor::endBlock() { + assert(state==BLOCK2); + enc.out->put(255); + state=INIT; +} + +/////////////////////////// compress() /////////////////////// + +void compress(Reader* in, Writer* out, int level) { + assert(level>=1); + Compressor c; + c.setInput(in); + c.setOutput(out); + c.startBlock(level); + c.startSegment(); + c.postProcess(); + c.compress(); + c.endSegment(); + c.endBlock(); +} + +//////////////////////// ZPAQL::assemble() //////////////////// + +#ifndef NOJIT +/* +assemble(); + +Assembles the ZPAQL code in hcomp[0..hlen-1] and stores x86-32 or x86-64 +code in 
rcode[0..rcode_size-1]. Execution begins at rcode[0]. It will not +write beyond the end of rcode, but in any case it returns the number of +bytes that would have been written. It returns 0 in case of error. + +The assembled code implements run() and returns 1 if successful or +0 if the ZPAQL code executes an invalid instruction or jumps out of +bounds. + +A ZPAQL virtual machine has the following state. All values are +unsigned and initially 0: + + a, b, c, d: 32 bit registers (pointed to by their respective parameters) + f: 1 bit flag register (pointed to) + r[0..255]: 32 bit registers + m[0..msize-1]: 8 bit registers, where msize is a power of 2 + h[0..hsize-1]: 32 bit registers, where hsize is a power of 2 + out: pointer to a Writer + sha1: pointer to a SHA1 + +Generally a ZPAQL machine is used to compute contexts which are +placed in h. A second machine might post-process, and write its +output to out and sha1. In either case, a machine is called with +its input in a, representing a single byte (0..255) or +(for a postprocessor) EOF (0xffffffff). Execution returns after a +ZPAQL halt instruction. + +ZPAQL instructions are 1 byte unless the last 3 bits are 1. +In this case, a second operand byte follows. Opcode 255 is +the only 3 byte instruction. They are organized: + + 00dddxxx = unary opcode xxx on destination ddd (ddd < 111) + 00111xxx = special instruction xxx + 01dddsss = assignment: ddd = sss (ddd < 111) + 1xxxxsss = operation xxxx from sss to a + +The meanings of sss and ddd are as follows: + + 000 = a (accumulator) + 001 = b + 010 = c + 011 = d + 100 = *b (means m[b mod msize]) + 101 = *c (means m[c mod msize]) + 110 = *d (means h[d mod hsize]) + 111 = n (constant 0..255 in second byte of instruction) + +For example, 01001110 assigns *d to b. 
The other instructions xxx +are as follows: + +Group 00dddxxx where ddd < 111 and xxx is: + 000 = ddd<>a, swap with a (except 00000000 is an error, and swap + with *b or *c leaves the high bits of a unchanged) + 001 = ddd++, increment + 010 = ddd--, decrement + 011 = ddd!, not (invert all bits) + 100 = ddd=0, clear (set all bits of ddd to 0) + 101 = not used (error) + 110 = not used + 111 = ddd=r n, assign from r[n] to ddd, n=0..255 in next opcode byte +Except: + 00100111 = jt n, jump if f is true (n = -128..127, relative to next opcode) + 00101111 = jf n, jump if f is false (n = -128..127) + 00110111 = r=a n, assign r[n] = a (n = 0..255) + +Group 00111xxx where xxx is: + 000 = halt (return) + 001 = output a + 010 = not used + 011 = hash: a = (a + *b + 512) * 773 + 100 = hashd: *d = (*d + a + 512) * 773 + 101 = not used + 110 = not used + 111 = unconditional jump (n = -128 to 127, relative to next opcode) + +Group 1xxxxsss where xxxx is: + 0000 = a += sss (add, subtract, multiply, divide sss to a) + 0001 = a -= sss + 0010 = a *= sss + 0011 = a /= sss (unsigned, except set a = 0 if sss is 0) + 0100 = a %= sss (remainder, except set a = 0 if sss is 0) + 0101 = a &= sss (bitwise AND) + 0110 = a &= ~sss (bitwise AND with complement of sss) + 0111 = a |= sss (bitwise OR) + 1000 = a ^= sss (bitwise XOR) + 1001 = a <<= (sss % 32) (left shift by low 5 bits of sss) + 1010 = a >>= (sss % 32) (unsigned, zero bits shifted in) + 1011 = a == sss (compare, set f = true if equal or false otherwise) + 1100 = a < sss (unsigned compare, result in f) + 1101 = a > sss (unsigned compare) + 1110 = not used + 1111 = not used except 11111111 is a 3 byte jump to the absolute address + in the next 2 bytes in little-endian (LSB first) order. + +assemble() translates ZPAQL to 32 bit x86 code to be executed by run(). 
+Registers are mapped as follows: + + eax = source sss from *b, *c, *d or sometimes n + ecx = pointer to destination *b, *c, *d, or spare + edx = a + ebx = f (1 for true, 0 for false) + esp = stack pointer + ebp = d + esi = b + edi = c + +run() saves non-volatile registers (ebp, esi, edi, ebx) on the stack, +loads a, b, c, d, f, and executes the translated instructions. +A halt instruction saves a, b, c, d, f, pops the saved registers +and returns. Invalid instructions or jumps outside of the range +of the ZPAQL code call libzpaq::error(). + +In 64 bit mode, the following additional registers are used: + + r12 = h + r14 = r + r15 = m + +*/ + +// Called by out +static void flush1(ZPAQL* z) { + z->flush(); +} + +// return true if op is an undefined ZPAQL instruction +static bool iserr(int op) { + return op==0 || (op>=120 && op<=127) || (op>=240 && op<=254) + || op==58 || (op<64 && (op%8==5 || op%8==6)); +} + +// Write k bytes of x to rcode[o++] MSB first +static void put(U8* rcode, int n, int& o, U32 x, int k) { + while (k-->0) { + if (o>(k*8))&255; + ++o; + } +} + +// Write 4 bytes of x to rcode[o++] LSB first +static void put4lsb(U8* rcode, int n, int& o, U32 x) { + for (int k=0; k<4; ++k) { + if (o>(k*8))&255; + ++o; + } +} + +// Write a 1-4 byte x86 opcode without or with an 4 byte operand +// to rcode[o...] 
+#define put1(x) put(rcode, rcode_size, o, (x), 1) +#define put2(x) put(rcode, rcode_size, o, (x), 2) +#define put3(x) put(rcode, rcode_size, o, (x), 3) +#define put4(x) put(rcode, rcode_size, o, (x), 4) +#define put5(x,y) put4(x), put1(y) +#define put6(x,y) put4(x), put2(y) +#define put4r(x) put4lsb(rcode, rcode_size, o, x) +#define puta(x) t=U32(size_t(x)), put4r(t) +#define put1a(x,y) put1(x), puta(y) +#define put2a(x,y) put2(x), puta(y) +#define put3a(x,y) put3(x), puta(y) +#define put4a(x,y) put4(x), puta(y) +#define put5a(x,y,z) put4(x), put1(y), puta(z) +#define put2l(x,y) put2(x), t=U32(size_t(y)), put4r(t), \ + t=U32(size_t(y)>>(S*4)), put4r(t) + +// Assemble ZPAQL in in the HCOMP section of header to rcode, +// but do not write beyond rcode_size. Return the number of +// bytes output or that would have been output. +// Execution starts at rcode[0] and returns 1 if successful or 0 +// in case of a ZPAQL execution error. +int ZPAQL::assemble() { + + // x86? (not foolproof) + const int S=sizeof(char*); // 4 = x86, 8 = x86-64 + U32 t=0x12345678; + if (*(char*)&t!=0x78 || (S!=4 && S!=8)) + error("JIT supported only for x86-32 and x86-64"); + + const U8* hcomp=&header[hbegin]; + const int hlen=hend-hbegin+1; + const int msize=m.size(); + const int hsize=h.size(); + const int regcode[8]={2,6,7,5}; // a,b,c,d.. -> edx,esi,edi,ebp,eax.. 
+ Array it(hlen); // hcomp -> rcode locations + int done=0; // number of instructions assembled (0..hlen) + int o=5; // rcode output index, reserve space for jmp + + // Code for the halt instruction (restore registers and return) + const int halt=o; + if (S==8) { + put2l(0x48b9, &a); // mov rcx, a + put2(0x8911); // mov [rcx], edx + put2l(0x48b9, &b); // mov rcx, b + put2(0x8931); // mov [rcx], esi + put2l(0x48b9, &c); // mov rcx, c + put2(0x8939); // mov [rcx], edi + put2l(0x48b9, &d); // mov rcx, d + put2(0x8929); // mov [rcx], ebp + put2l(0x48b9, &f); // mov rcx, f + put2(0x8919); // mov [rcx], ebx + put4(0x4883c438); // add rsp, 56 + put2(0x415f); // pop r15 + put2(0x415e); // pop r14 + put2(0x415d); // pop r13 + put2(0x415c); // pop r12 + } + else { + put2a(0x8915, &a); // mov [a], edx + put2a(0x8935, &b); // mov [b], esi + put2a(0x893d, &c); // mov [c], edi + put2a(0x892d, &d); // mov [d], ebp + put2a(0x891d, &f); // mov [f], ebx + put3(0x83c43c); // add esp, 60 + } + put1(0x5d); // pop ebp + put1(0x5b); // pop ebx + put1(0x5f); // pop edi + put1(0x5e); // pop esi + put1(0xc3); // ret + + // Code for the out instruction. + // Store a=edx at outbuf[bufptr++]. If full, call flush1(). 
+ const int outlabel=o; + if (S==8) { + put2l(0x48b8, &outbuf[0]);// mov rax, outbuf.p + put2l(0x49ba, &bufptr); // mov r10, &bufptr + put3(0x418b0a); // mov ecx, [r10] + put3(0x891408); // mov [rax+rcx], edx + put2(0xffc1); // inc ecx + put3(0x41890a); // mov [r10], ecx + put2a(0x81f9, outbuf.size()); // cmp ecx, outbuf.size() + put2(0x7401); // jz L1 + put1(0xc3); // ret + put4(0x4883ec30); // L1: sub rsp, 48 ; call flush1(this) + put4(0x48893c24); // mov [rsp], rdi + put5(0x48897424,8); // mov [rsp+8], rsi + put5(0x48895424,16); // mov [rsp+16], rdx + put5(0x48894c24,24); // mov [rsp+24], rcx +#ifdef unix + put2l(0x48bf, this); // mov rdi, this +#else // Windows + put2l(0x48b9, this); // mov rcx, this +#endif + put2l(0x49bb, &flush1); // mov r11, &flush1 + put3(0x41ffd3); // call r11 + put5(0x488b4c24,24); // mov rcx, [rsp+24] + put5(0x488b5424,16); // mov rdx, [rsp+16] + put5(0x488b7424,8); // mov rsi, [rsp+8] + put4(0x488b3c24); // mov rdi, [rsp] + put4(0x4883c430); // add rsp, 48 + put1(0xc3); // ret + } + else { + put1a(0xb8, &outbuf[0]); // mov eax, outbuf.p + put2a(0x8b0d, &bufptr); // mov ecx, [bufptr] + put3(0x891408); // mov [eax+ecx], edx + put2(0xffc1); // inc ecx + put2a(0x890d, &bufptr); // mov [bufptr], ecx + put2a(0x81f9, outbuf.size()); // cmp ecx, outbuf.size() + put2(0x7401); // jz L1 + put1(0xc3); // ret + put3(0x83ec08); // L1: sub esp, 8 + put4(0x89542404); // mov [esp+4], edx + put3a(0xc70424, this); // mov [esp], this + put1a(0xb8, &flush1); // mov eax, &flush1 + put2(0xffd0); // call eax + put4(0x8b542404); // mov edx, [esp+4] + put3(0x83c408); // add esp, 8 + put1(0xc3); // ret + } + + // Set it[i]=1 for each ZPAQL instruction reachable from the previous + // instruction + 2 if reachable by a jump (or 3 if both). 
+ it[0]=2; + assert(hlen>0 && hcomp[hlen-1]==0); // ends with error + do { + done=0; + const int NONE=0x80000000; + for (int i=0; i>24);// jt,jf,jmp + if (op==63) next1=NONE; // jmp + if ((next2<0 || next2>=hlen) && next2!=NONE) next2=hlen-1; // error + if (next1!=NONE && !(it[next1]&1)) it[next1]|=1, ++done; + if (next2!=NONE && !(it[next2]&2)) it[next2]|=2, ++done; + } + } + } while (done>0); + + // Set it[i] bits 2-3 to 4, 8, or 12 if a comparison + // (<, >, == respectively) does not need to save the result in f, + // or if a conditional jump (jt, jf) does not need to read f. + // This is true if a comparison is followed directly by a jt/jf, + // the jt/jf is not a jump target, the byte before is not a jump + // target (for a 2 byte comparison), and for the comparison instruction + // if both paths after the jt/jf lead to another comparison or error + // before another jt/jf. At most hlen steps are traced because after + // that it must be an infinite loop. + for (int i=0; i=216 && op1<240 && (op2==39 || op2==47) + && it[i2]==1 && (i2==i+1 || it[i+1]==0)) { + int code=(op1-208)/8*4; // 4,8,12 is ==,<,> + it[i2]+=code; // OK to test CF, ZF instead of f + for (int j=0; j<2 && code; ++j) { // trace each path from i2 + int k=i2+2; // branch not taken + if (j==1) k=i2+2+(hcomp[i2+1]<<24>>24); // branch taken + for (int l=0; l=hlen) break; // out of bounds, pass + const int op=hcomp[k]; + if (op==39 || op==47) code=0; // jt,jf, fail + else if (op>=216 && op<240) break; // ==,<,>, pass + else if (iserr(op)) break; // error, pass + else if (op==255) k=hcomp[k+1]+256*hcomp[k+2]; // lj + else if (op==63) k=k+2+(hcomp[k+1]<<24>>24); // jmp + else if (op==56) k=0; // halt + else k=k+1+(op%8==7); // ordinary instruction + } + } + it[i]+=code; // if > 0 then OK to not save flags in f (bl) + } + } + + // Start of run(): Save x86 and load ZPAQL registers + const int start=o; + assert(start>=16); + put1(0x56); // push esi/rsi + put1(0x57); // push edi/rdi + put1(0x53); // push 
ebx/rbx + put1(0x55); // push ebp/rbp + if (S==8) { + put2(0x4154); // push r12 + put2(0x4155); // push r13 + put2(0x4156); // push r14 + put2(0x4157); // push r15 + put4(0x4883ec38); // sub rsp, 56 + put2l(0x48b8, &a); // mov rax, a + put2(0x8b10); // mov edx, [rax] + put2l(0x48b8, &b); // mov rax, b + put2(0x8b30); // mov esi, [rax] + put2l(0x48b8, &c); // mov rax, c + put2(0x8b38); // mov edi, [rax] + put2l(0x48b8, &d); // mov rax, d + put2(0x8b28); // mov ebp, [rax] + put2l(0x48b8, &f); // mov rax, f + put2(0x8b18); // mov ebx, [rax] + put2l(0x49bc, &h[0]); // mov r12, h + put2l(0x49bd, &outbuf[0]); // mov r13, outbuf.p + put2l(0x49be, &r[0]); // mov r14, r + put2l(0x49bf, &m[0]); // mov r15, m + } + else { + put3(0x83ec3c); // sub esp, 60 + put2a(0x8b15, &a); // mov edx, [a] + put2a(0x8b35, &b); // mov esi, [b] + put2a(0x8b3d, &c); // mov edi, [c] + put2a(0x8b2d, &d); // mov ebp, [d] + put2a(0x8b1d, &f); // mov ebx, [f] + } + + // Assemble in multiple passes until every byte of hcomp has a translation + for (int istart=0; istarti); + assert(i>=0 && i=16) { + if (i>istart) { + int a=code-o; + if (a>-120 && a<120) + put2(0xeb00+((a-2)&255)); // jmp short o + else + put1a(0xe9, a-5); // jmp near o + } + break; + } + + // Else assemble the instruction at hcode[i] to rcode[o] + else { + assert(i>=0 && i0 && it[i]<16); + assert(o>=16); + it[i]=o; + ++done; + const int op=hcomp[i]; + const int arg=hcomp[i+1]+((op==255)?256*hcomp[i+2]:0); + const int ddd=op/8%8; + const int sss=op%8; + + // error instruction: return 0 + if (iserr(op)) { + put2(0x31c0); // xor eax, eax + put1a(0xe9, halt-o-4); // jmp near halt + continue; + } + + // Load source *b, *c, *d, or hash (*b) into eax except: + // {a,b,c,d}=*d, a{+,-,*,&,|,^,=,==,>,>}=*d: load address to eax + // {a,b,c,d}={*b,*c}: load source into ddd + if (op==59 || (op>=64 && op<240 && op%8>=4 && op%8<7)) { + put2(0x89c0+8*regcode[sss-3+(op==59)]); // mov eax, {esi,edi,ebp} + const int sz=(sss==6?hsize:msize)-1; + if 
(sz>=128) put1a(0x25, sz); // and eax, dword msize-1 + else put3(0x83e000+sz); // and eax, byte msize-1 + const int move=(op>=64 && op<112); // = or else ddd is eax + if (sss<6) { // ddd={a,b,c,d,*b,*c} + if (S==8) put5(0x410fb604+8*move*regcode[ddd],0x07); + // movzx ddd, byte [r15+rax] + else put3a(0x0fb680+8*move*regcode[ddd], &m[0]); + // movzx ddd, byte [m+eax] + } + else if ((0x06587000>>(op/8))&1) {// {*b,*c,*d,a/,a%,a&~,a<<,a>>}=*d + if (S==8) put4(0x418b0484); // mov eax, [r12+rax*4] + else put3a(0x8b0485, &h[0]); // mov eax, [h+eax*4] + } + } + + // Load destination address *b, *c, *d or hashd (*d) into ecx + if ((op>=32 && op<56 && op%8<5) || (op>=96 && op<120) || op==60) { + put2(0x89c1+8*regcode[op/8%8-3-(op==60)]);// mov ecx,{esi,edi,ebp} + const int sz=(ddd==6||op==60?hsize:msize)-1; + if (sz>=128) put2a(0x81e1, sz); // and ecx, dword sz + else put3(0x83e100+sz); // and ecx, byte sz + if (op/8%8==6 || op==60) { // *d + if (S==8) put4(0x498d0c8c); // lea rcx, [r12+rcx*4] + else put3a(0x8d0c8d, &h[0]); // lea ecx, [ecx*4+h] + } + else { // *b, *c + if (S==8) put4(0x498d0c0f); // lea rcx, [r15+rcx] + else put2a(0x8d89, &m[0]); // lea ecx, [ecx+m] + } + } + + // Translate by opcode + switch((op/8)&31) { + case 0: // ddd = a + case 1: // ddd = b + case 2: // ddd = c + case 3: // ddd = d + switch(sss) { + case 0: // ddd<>a (swap) + put2(0x87d0+regcode[ddd]); // xchg edx, ddd + break; + case 1: // ddd++ + put2(0xffc0+regcode[ddd]); // inc ddd + break; + case 2: // ddd-- + put2(0xffc8+regcode[ddd]); // dec ddd + break; + case 3: // ddd! 
+ put2(0xf7d0+regcode[ddd]); // not ddd + break; + case 4: // ddd=0 + put2(0x31c0+9*regcode[ddd]); // xor ddd,ddd + break; + case 7: // ddd=r n + if (S==8) + put3a(0x418b86+8*regcode[ddd], arg*4); // mov ddd, [r14+n*4] + else + put2a(0x8b05+8*regcode[ddd], (&r[arg]));//mov ddd, [r+n] + break; + } + break; + case 4: // ddd = *b + case 5: // ddd = *c + switch(sss) { + case 0: // ddd<>a (swap) + put2(0x8611); // xchg dl, [ecx] + break; + case 1: // ddd++ + put2(0xfe01); // inc byte [ecx] + break; + case 2: // ddd-- + put2(0xfe09); // dec byte [ecx] + break; + case 3: // ddd! + put2(0xf611); // not byte [ecx] + break; + case 4: // ddd=0 + put2(0x31c0); // xor eax, eax + put2(0x8801); // mov [ecx], al + break; + case 7: // jt, jf + { + assert(code>=0 && code<16); + const int jtab[2][4]={{5,4,2,7},{4,5,3,6}}; + // jnz,je,jb,ja, jz,jne,jae,jbe + if (code<4) put2(0x84db); // test bl, bl + if (arg>=128 && arg-257-i>=0 && o-it[arg-257-i]<120) + put2(0x7000+256*jtab[op==47][code/4]); // jx short 0 + else + put2a(0x0f80+jtab[op==47][code/4], 0); // jx near 0 + break; + } + } + break; + case 6: // ddd = *d + switch(sss) { + case 0: // ddd<>a (swap) + put2(0x8711); // xchg edx, [ecx] + break; + case 1: // ddd++ + put2(0xff01); // inc dword [ecx] + break; + case 2: // ddd-- + put2(0xff09); // dec dword [ecx] + break; + case 3: // ddd! 
+ put2(0xf711); // not dword [ecx] + break; + case 4: // ddd=0 + put2(0x31c0); // xor eax, eax + put2(0x8901); // mov [ecx], eax + break; + case 7: // r=a n + if (S==8) + put3a(0x418996, arg*4); // mov [r14+n*4], edx + else + put2a(0x8915, &r[arg]); // mov [r+n], edx + break; + } + break; + case 7: // special + switch(op) { + case 56: // halt + put1a(0xb8, 1); // mov eax, 1 + put1a(0xe9, halt-o-4); // jmp near halt + break; + case 57: // out + put1a(0xe8, outlabel-o-4);// call outlabel + break; + case 59: // hash: a = (a + *b + 512) * 773 + put3a(0x8d8410, 512); // lea eax, [eax+edx+512] + put2a(0x69d0, 773); // imul edx, eax, 773 + break; + case 60: // hashd: *d = (*d + a + 512) * 773 + put2(0x8b01); // mov eax, [ecx] + put3a(0x8d8410, 512); // lea eax, [eax+edx+512] + put2a(0x69c0, 773); // imul eax, eax, 773 + put2(0x8901); // mov [ecx], eax + break; + case 63: // jmp + put1a(0xe9, 0); // jmp near 0 (fill in target later) + break; + } + break; + case 8: // a= + case 9: // b= + case 10: // c= + case 11: // d= + if (sss==7) // n + put1a(0xb8+regcode[ddd], arg); // mov ddd, n + else if (sss==6) { // *d + if (S==8) + put4(0x418b0484+(regcode[ddd]<<11)); // mov ddd, [r12+rax*4] + else + put3a(0x8b0485+(regcode[ddd]<<11),&h[0]);// mov ddd, [h+eax*4] + } + else if (sss<4) // a, b, c, d + put2(0x89c0+regcode[ddd]+8*regcode[sss]);// mov ddd,sss + break; + case 12: // *b= + case 13: // *c= + if (sss==7) put3(0xc60100+arg); // mov byte [ecx], n + else if (sss==0) put2(0x8811); // mov byte [ecx], dl + else { + if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss + put2(0x8801); // mov byte [ecx], al + } + break; + case 14: // *d= + if (sss<7) put2(0x8901+8*regcode[sss]); // mov [ecx], sss + else put2a(0xc701, arg); // mov dword [ecx], n + break; + case 15: break; // not used + case 16: // a+= + if (sss==6) { + if (S==8) put4(0x41031484); // add edx, [r12+rax*4] + else put3a(0x031485, &h[0]); // add edx, [h+eax*4] + } + else if (sss<7) put2(0x01c2+8*regcode[sss]);// add 
edx, sss + else if (arg>=128) put2a(0x81c2, arg); // add edx, n (imm8 form sign-extends, so 128..255 need imm32) + else put3(0x83c200+arg); // add edx, byte n + break; + case 17: // a-= + if (sss==6) { + if (S==8) put4(0x412b1484); // sub edx, [r12+rax*4] + else put3a(0x2b1485, &h[0]); // sub edx, [h+eax*4] + } + else if (sss<7) put2(0x29c2+8*regcode[sss]);// sub edx, sss + else if (arg>=128) put2a(0x81ea, arg); // sub edx, n + else put3(0x83ea00+arg); // sub edx, byte n + break; + case 18: // a*= + if (sss==6) { + if (S==8) put5(0x410faf14,0x84); // imul edx, [r12+rax*4] + else put4a(0x0faf1485, &h[0]); // imul edx, [h+eax*4] + } + else if (sss<7) put3(0x0fafd0+regcode[sss]);// imul edx, sss + else if (arg>=128) put2a(0x69d2, arg); // imul edx, n + else put3(0x6bd200+arg); // imul edx, byte n + break; + case 19: // a/= + case 20: // a%= + if (sss<7) put2(0x89c1+8*regcode[sss]); // mov ecx, sss + else put1a(0xb9, arg); // mov ecx, n + put2(0x85c9); // test ecx, ecx + put3(0x0f44d1); // cmovz edx, ecx + put2(0x7408-2*(op/8==20)); // jz (over rest) + put2(0x89d0); // mov eax, edx + put2(0x31d2); // xor edx, edx + put2(0xf7f1); // div ecx + if (op/8==19) put2(0x89c2); // mov edx, eax + break; + case 21: // a&= + if (sss==6) { + if (S==8) put4(0x41231484); // and edx, [r12+rax*4] + else put3a(0x231485, &h[0]); // and edx, [h+eax*4] + } + else if (sss<7) put2(0x21c2+8*regcode[sss]);// and edx, sss + else if (arg>=128) put2a(0x81e2, arg); // and edx, n + else put3(0x83e200+arg); // and edx, byte n + break; + case 22: // a&~ + if (sss==7) { + if (arg<128) put3(0x83e200+(~arg&255));// and edx, byte ~n + else put2a(0x81e2, ~arg); // and edx, ~n + } + else { + if (sss<4) put2(0x89c0+8*regcode[sss]);// mov eax, sss + put2(0xf7d0); // not eax + put2(0x21c2); // and edx, eax + } + break; + case 23: // a|= + if (sss==6) { + if (S==8) put4(0x410b1484); // or edx, [r12+rax*4] + else put3a(0x0b1485, &h[0]); // or edx, [h+eax*4] + } + else if (sss<7) put2(0x09c2+8*regcode[sss]);// or edx, sss + else if (arg>=128) put2a(0x81ca, 
arg); // or edx, n + else put3(0x83ca00+arg); // or edx, byte n + break; + case 24: // a^= + if (sss==6) { + if (S==8) put4(0x41331484); // xor edx, [r12+rax*4] + else put3a(0x331485, &h[0]); // xor edx, [h+eax*4] + } + else if (sss<7) put2(0x31c2+8*regcode[sss]);// xor edx, sss + else if (arg>=128) put2a(0x81f2, arg); // xor edx, n + else put3(0x83f200+arg); // xor edx, byte n + break; + case 25: // a<<= + case 26: // a>>= + if (sss==7) // sss = n + put3(0xc1e200+8*256*(op/8==26)+arg); // shl/shr n + else { + put2(0x89c1+8*regcode[sss]); // mov ecx, sss + put2(0xd3e2+8*(op/8==26)); // shl/shr edx, cl + } + break; + case 27: // a== + case 28: // a< + case 29: // a> + if (sss==6) { + if (S==8) put4(0x413b1484); // cmp edx, [r12+rax*4] + else put3a(0x3b1485, &h[0]); // cmp edx, [h+eax*4] + } + else if (sss==7) // sss = n + put2a(0x81fa, arg); // cmp edx, dword n + else + put2(0x39c2+8*regcode[sss]); // cmp edx, sss + if (code<4) { + if (op/8==27) put3(0x0f94c3); // setz bl + if (op/8==28) put3(0x0f92c3); // setc bl + if (op/8==29) put3(0x0f97c3); // seta bl + } + break; + case 30: // not used + case 31: // 255 = lj + if (op==255) put1a(0xe9, 0); // jmp near + break; + } + } + } + } + + // Finish first pass + const int rsize=o; + if (o>rcode_size) return rsize; + + // Fill in jump addresses (second pass) + for (int i=0; i=128) target-=256; + target+=i+2; + } + if (target<0 || target>=hlen) target=hlen-1; // runtime ZPAQL error + o=it[i]; + assert(o>=16 && o skip test + assert(o>=16 && o=0x72 && op<0x78) || op==0xeb) { // jx, jmp short + --target; + if (target<-128 || target>127) + error("Cannot code x86 short jump"); + assert(o=0x82 && op<0x88) || op==0xe9) // jx, jmp near + { + target-=4; + puta(target); + } + else assert(false); // not a x86 jump + } + } + + // Jump to start + o=0; + put1a(0xe9, start-5); // jmp near start + return rsize; +} + +//////////////////////// Predictor::assemble_p() ///////////////////// + +// Assemble the ZPAQL code in the HCOMP section 
of z.header to pcomp and +// return the number of bytes of x86 or x86-64 code written, or that would +// be written if pcomp were large enough. The code for predict() begins +// at pr.pcomp[0] and update() at pr.pcomp[5], both as jmp instructions. + +// The assembled code is equivalent to int predict(Predictor*) +// and void update(Predictor*, int y); The Preditor address is placed in +// edi/rdi. The update bit y is placed in ebp/rbp. + +int Predictor::assemble_p() { + Predictor& pr=*this; + U8* rcode=pr.pcode; // x86 output array + int rcode_size=pcode_size; // output size + int o=0; // output index in pcode + const int S=sizeof(char*); // 4 or 8 + U8* hcomp=&pr.z.header[0]; // The code to translate +#define off(x) ((char*)&(pr.x)-(char*)&pr) +#define offc(x) ((char*)&(pr.comp[i].x)-(char*)&pr) + + // test for little-endian (probably x86) + U32 t=0x12345678; + if (*(char*)&t!=0x78 || (S!=4 && S!=8)) + error("JIT supported only for x86-32 and x86-64"); + + // Initialize for predict(). Put predictor address in edi/rdi + put1a(0xe9, 5); // jmp predict + put1a(0, 0x90909000); // reserve space for jmp update + put1(0x53); // push ebx/rbx + put1(0x55); // push ebp/rbp + put1(0x56); // push esi/rsi + put1(0x57); // push edi/rdi + if (S==4) + put4(0x8b7c2414); // mov edi,[esp+0x14] ; pr + else { +#ifndef unix + put3(0x4889cf); // mov rdi, rcx (1st arg in Win64) +#endif + } + + // Code predict() for each component + const int n=hcomp[6]; // number of components + U8* cp=hcomp+7; + for (int i=0; i=pr.z.cend) error("comp too big"); + if (cp[0]<1 || cp[0]>9) error("invalid component"); + assert(compsize[cp[0]]>0 && compsize[cp[0]]<8); + switch (cp[0]) { + + case CONS: // c + break; + + case CM: // sizebits limit + // Component& cr=comp[i]; + // cr.cxt=h[i]^hmap4; + // p[i]=stretch(cr.cm(cr.cxt)>>17); + + put2a(0x8b87, off(h[i])); // mov eax, [edi+&h[i]] + put2a(0x3387, off(hmap4)); // xor eax, [edi+&hmap4] + put1a(0x25, (1<rsi) + put2a(0x8bb7, offc(cm)); // mov esi, 
[edi+&cm] + put3(0x8b0486); // mov eax, [esi+eax*4] + put3(0xc1e811); // shr eax, 17 + put4a(0x0fbf8447, off(stretcht)); // movsx eax,word[edi+eax*2+..] + put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax + break; + + case ISSE: // sizebits j -- c=hi, cxt=bh + // assert((hmap4&15)>0); + // if (c8==1 || (c8&0xf0)==16) + // cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); + // cr.cxt=cr.ht[cr.c+(hmap4&15)]; // bit history + // int *wt=(int*)&cr.cm[cr.cxt*2]; + // p[i]=clamp2k((wt[0]*p[cp[2]]+wt[1]*64)>>16); + + case ICM: // sizebits + // assert((hmap4&15)>0); + // if (c8==1 || (c8&0xf0)==16) cr.c=find(cr.ht, cp[1]+2, h[i]+16*c8); + // cr.cxt=cr.ht[cr.c+(hmap4&15)]; + // p[i]=stretch(cr.cm(cr.cxt)>>8); + // + // Find cxt row in hash table ht. ht has rows of 16 indexed by the low + // sizebits of cxt with element 0 having the next higher 8 bits for + // collision detection. If not found after 3 adjacent tries, replace + // row with lowest element 1 as priority. Return index of row. + // + // size_t Predictor::find(Array& ht, int sizebits, U32 cxt) { + // assert(ht.size()==size_t(16)<>sizebits&255; + // size_t h0=(cxt*16)&(ht.size()-16); + // if (ht[h0]==chk) return h0; + // size_t h1=h0^16; + // if (ht[h1]==chk) return h1; + // size_t h2=h0^32; + // if (ht[h2]==chk) return h2; + // if (ht[h0+1]<=ht[h1+1] && ht[h0+1]<=ht[h2+1]) + // return memset(&ht[h0], 0, 16), ht[h0]=chk, h0; + // else if (ht[h1+1]>(7-cr.cxt))&1; // predicted bit + // p[i]=stretch(dt2k[cr.a]*(cr.c*-2+1)&32767); + // } + + if (S==8) put1(0x48); // rex.w + put2a(0x8bb7, offc(ht)); // mov esi, [edi+&ht] + + // If match length (a) is 0 then p[i]=0 + put2a(0x8b87, offc(a)); // mov eax, [edi+&a] + put2(0x85c0); // test eax, eax + put2(0x7449); // jz L2 ; p[i]=0 + + // Else put predicted bit in c + put1a(0xb9, 7); // mov ecx, 7 + put2a(0x2b8f, offc(cxt)); // sub ecx, [edi+&cxt] + put2a(0x8b87, offc(limit)); // mov eax, [edi+&limit] + put2a(0x2b87, offc(b)); // sub eax, [edi+&b] + put1a(0x25, (1<>8; + + put2a(0x8b87, 
off(p[cp[1]])); // mov eax, [edi+&p[j]] + put2a(0x2b87, off(p[cp[2]])); // sub eax, [edi+&p[k]] + put2a(0x69c0, cp[3]); // imul eax, wt + put3(0xc1f808); // sar eax, 8 + put2a(0x0387, off(p[cp[2]])); // add eax, [edi+&p[k]] + put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax + break; + + case MIX2: // sizebits j k rate mask + // c=size cm=wt[size] cxt=input + // cr.cxt=((h[i]+(c8&cp[5]))&(cr.c-1)); + // assert(cr.cxt=0 && w<65536); + // p[i]=(w*p[cp[2]]+(65536-w)*p[cp[3]])>>16; + // assert(p[i]>=-2048 && p[i]<2048); + + put2(0x8b07); // mov eax, [edi] ; c8 + put1a(0x25, cp[5]); // and eax, mask + put2a(0x0387, off(h[i])); // add eax, [edi+&h[i]] + put1a(0x25, (1<=1 && m<=i); + // cr.cxt=h[i]+(c8&cp[5]); + // cr.cxt=(cr.cxt&(cr.c-1))*m; // pointer to row of weights + // assert(cr.cxt<=cr.cm.size()-m); + // int* wt=(int*)&cr.cm[cr.cxt]; + // p[i]=0; + // for (int j=0; j>8)*p[cp[2]+j]; + // p[i]=clamp2k(p[i]>>8); + + put2(0x8b07); // mov eax, [edi] ; c8 + put1a(0x25, cp[5]); // and eax, mask + put2a(0x0387, off(h[i])); // add eax, [edi+&h[i]] + put1a(0x25, (1<3) put4a(0xf30f6f96, k*4+16);//movdqu xmm2, [esi+k*4+16] + put5(0x660f72e1,0x08); // psrad xmm1, 8 + if (tail>3) put5(0x660f72e2,0x08); // psrad xmm2, 8 + put4(0x660f6bca); // packssdw xmm1, xmm2 + put4a(0xf30f6f9f, off(p[cp[2]+k])); // movdqu xmm3, [edi+&p[j+k]] + if (tail>3) + put4a(0xf30f6fa7,off(p[cp[2]+k+4]));//movdqu xmm4, [edi+&p[j+k+4]] + put4(0x660f6bdc); // packssdw, xmm3, xmm4 + if (tail>0 && tail<8) { // last loop, mask extra weights + put4(0x660f76ed); // pcmpeqd xmm5, xmm5 ; -1 + put5(0x660f73dd, 16-tail*2); // psrldq xmm5, 16-tail*2 + put4(0x660fdbcd); // pand xmm1, xmm5 + } + if (k==0) { // first loop, initialize sum in xmm0 + put4(0xf30f6fc1); // movdqu xmm0, xmm1 + put4(0x660ff5c3); // pmaddwd xmm0, xmm3 + } + else { // accumulate sum in xmm0 + put4(0xf30f6fd1); // movdqu xmm2, xmm1 + put4(0x660ff5d3); // pmaddwd xmm2, xmm3 + put4(0x660ffec2); // paddd, xmm0, xmm2 + } + } + + // Add up the 4 
elements of xmm0 = p[i] in the first element + put4(0xf30f6fc8); // movdqu xmm1, xmm0 + put5(0x660f73d9,0x08); // psrldq xmm1, 8 + put4(0x660ffec1); // paddd xmm0, xmm1 + put4(0xf30f6fc8); // movdqu xmm1, xmm0 + put5(0x660f73d9,0x04); // psrldq xmm1, 4 + put4(0x660ffec1); // paddd xmm0, xmm1 + put4(0x660f7ec0); // movd eax, xmm0 ; p[i] + put3(0xc1f808); // sar eax, 8 + put1a(0xb9, 2047); // mov ecx, 2047 ; clamp2k + put2(0x39c8); // cmp eax, ecx + put3(0x0f4fc1); // cmovg eax, ecx + put2(0xf7d1); // not ecx ; -2048 + put2(0x39c8); // cmp eax, ecx + put3(0x0f4cc1); // cmovl eax, ecx + put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax + break; + + case SSE: // sizebits j start limit + // cr.cxt=(h[i]+c8)*32; + // int pq=p[cp[2]]+992; + // if (pq<0) pq=0; + // if (pq>1983) pq=1983; + // int wt=pq&63; + // pq>>=6; + // assert(pq>=0 && pq<=30); + // cr.cxt+=pq; + // p[i]=stretch(((cr.cm(cr.cxt)>>10)*(64-wt) // p0 + // +(cr.cm(cr.cxt+1)>>10)*wt)>>13); // p1 + // // p = p0*(64-wt)+p1*wt = (p1-p0)*wt + p0*64 + // cr.cxt+=wt>>5; + + put2a(0x8b8f, off(h[i])); // mov ecx, [edi+&h[i]] + put2(0x030f); // add ecx, [edi] ; c0 + put2a(0x81e1, (1<>5 + put2a(0x898f, offc(cxt)); // mov [edi+cxt], ecx ; cxt saved + put3(0xc1e80a); // shr eax, 10 ; p0 = cm[cxt]>>10 + put3(0xc1eb0a); // shr ebx, 10 ; p1 = cm[cxt+1]>>10 + put2(0x29c3); // sub ebx, eax, ; p1-p0 + put3(0x0fafda); // imul ebx, edx ; (p1-p0)*wt + put3(0xc1e006); // shr eax, 6 + put2(0x01d8); // add eax, ebx ; p in 0..2^28-1 + put3(0xc1e80d); // shr eax, 13 ; p in 0..32767 + put4a(0x0fbf8447, off(stretcht)); // movsx eax, word [edi+eax*2+...] + put2a(0x8987, off(p[i])); // mov [edi+&p[i]], eax + break; + + default: + error("invalid ZPAQ component"); + } + } + + // return squash(p[n-1]) + put2a(0x8b87, off(p[n-1])); // mov eax, [edi+...] + put1a(0x05, 0x800); // add eax, 2048 + put4a(0x0fbf8447, off(squasht[0])); // movsx eax, word [edi+eax*2+...] 
+ put1(0x5f); // pop edi + put1(0x5e); // pop esi + put1(0x5d); // pop ebp + put1(0x5b); // pop ebx + put1(0xc3); // ret + + // Initialize for update() Put predictor address in edi/rdi + // and bit y=0..1 in ebp + int save_o=o; + o=5; + put1a(0xe9, save_o-10); // jmp update + o=save_o; + put1(0x53); // push ebx/rbx + put1(0x55); // push ebp/rbp + put1(0x56); // push esi/rsi + put1(0x57); // push edi/rdi + if (S==4) { + put4(0x8b7c2414); // mov edi,[esp+0x14] ; (1st arg = pr) + put4(0x8b6c2418); // mov ebp,[esp+0x18] ; (2nd arg = y) + } + else { +#ifdef unix // (1st arg already in rdi) + put3(0x4889f5); // mov rbp, rsi (2nd arg in Linux-64) +#else + put3(0x4889cf); // mov rdi, rcx (1st arg in Win64) + put3(0x4889d5); // mov rbp, rdx (2nd arg) +#endif + } + + // Code update() for each component + cp=hcomp+7; + for (int i=0; i=1 && cp[0]<=9); + assert(compsize[cp[0]]>0 && compsize[cp[0]]<8); + switch (cp[0]) { + + case CONS: // c + break; + + case SSE: // sizebits j start limit + case CM: // sizebits limit + // train(cr, y); + // + // reduce prediction error in cr.cm + // void train(Component& cr, int y) { + // assert(y==0 || y==1); + // U32& pn=cr.cm(cr.cxt); + // U32 count=pn&0x3ff; + // int error=y*32767-(cr.cm(cr.cxt)>>17); + // pn+=(error*dt[count]&-1024)+(countrsi) + put2a(0x8bb7, offc(cm)); // mov esi,[edi+cm] ; cm + put2a(0x8b87, offc(cxt)); // mov eax,[edi+cxt] ; cxt + put1a(0x25, pr.comp[i].cm.size()-1); // and eax, size-1 + if (S==8) put1(0x48); // rex.w + put3(0x8d3486); // lea esi,[esi+eax*4] ; &cm[cxt] + put2(0x8b06); // mov eax,[esi] ; cm[cxt] + put2(0x89c2); // mov edx, eax ; cm[cxt] + put3(0xc1e811); // shr eax, 17 ; cm[cxt]>>17 + put2(0x89e9); // mov ecx, ebp ; y + put3(0xc1e10f); // shl ecx, 15 ; y*32768 + put2(0x29e9); // sub ecx, ebp ; y*32767 + put2(0x29c1); // sub ecx, eax ; error + put2a(0x81e2, 0x3ff); // and edx, 1023 ; count + put3a(0x8b8497, off(dt)); // mov eax,[edi+edx*4+dt] ; dt[count] + put3(0x0fafc8); // imul ecx, eax ; error*dt[count] 
+ put2a(0x81e1, 0xfffffc00); // and ecx, -1024 + put2a(0x81fa, cp[2+2*(cp[0]==SSE)]*4); // cmp edx, limit*4 + put2(0x110e); // adc [esi], ecx ; pn+=... + break; + + case ICM: // sizebits: cxt=bh, ht[c][0..15]=bh row + // cr.ht[cr.c+(hmap4&15)]=st.next(cr.ht[cr.c+(hmap4&15)], y); + // U32& pn=cr.cm(cr.cxt); + // pn+=int(y*32767-(pn>>8))>>2; + + case ISSE: // sizebits j -- c=hi, cxt=bh + // assert(cr.cxt==cr.ht[cr.c+(hmap4&15)]); + // int err=y*32767-squash(p[i]); + // int *wt=(int*)&cr.cm[cr.cxt*2]; + // wt[0]=clamp512k(wt[0]+((err*p[cp[2]]+(1<<12))>>13)); + // wt[1]=clamp512k(wt[1]+((err+16)>>5)); + // cr.ht[cr.c+(hmap4&15)]=st.next(cr.cxt, y); + + // update bit history bh to next(bh,y=ebp) in ht[c+(hmap4&15)] + put3(0x8b4700+off(hmap4)); // mov eax, [edi+&hmap4] + put3(0x83e00f); // and eax, 15 + put2a(0x0387, offc(c)); // add eax [edi+&c] ; cxt + if (S==8) put1(0x48); // rex.w + put2a(0x8bb7, offc(ht)); // mov esi, [edi+&ht] + put4(0x0fb61406); // movzx edx, byte [esi+eax] ; bh + put4(0x8d5c9500); // lea ebx, [ebp+edx*4] ; index to st + put4a(0x0fb69c1f, off(st)); // movzx ebx,byte[edi+ebx+st]; next bh + put3(0x881c06); // mov [esi+eax], bl ; save next bh + if (S==8) put1(0x48); // rex.w + put2a(0x8bb7, offc(cm)); // mov esi, [edi+&cm] + + // ICM: update cm[cxt=edx=bit history] to reduce prediction error + // esi = &cm + if (cp[0]==ICM) { + if (S==8) put1(0x48); // rex.w + put3(0x8d3496); // lea esi, [esi+edx*4] ; &cm[bh] + put2(0x8b06); // mov eax, [esi] ; pn + put3(0xc1e808); // shr eax, 8 ; pn>>8 + put2(0x89e9); // mov ecx, ebp ; y + put3(0xc1e10f); // shl ecx, 15 + put2(0x29e9); // sub ecx, ebp ; y*32767 + put2(0x29c1); // sub ecx, eax + put3(0xc1f902); // sar ecx, 2 + put2(0x010e); // add [esi], ecx + } + + // ISSE: update weights. edx=cxt=bit history (0..255), esi=cm[512] + else { + put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] + put1a(0x05, 2048); // add eax, 2048 + put4a(0x0fb78447, off(squasht)); // movzx eax, word [edi+eax*2+..] 
+ put2(0x89e9); // mov ecx, ebp ; y + put3(0xc1e10f); // shl ecx, 15 + put2(0x29e9); // sub ecx, ebp ; y*32767 + put2(0x29c1); // sub ecx, eax ; err + put2a(0x8b87, off(p[cp[2]]));// mov eax, [edi+&p[j]] + put3(0x0fafc1); // imul eax, ecx + put1a(0x05, (1<<12)); // add eax, 4096 + put3(0xc1f80d); // sar eax, 13 + put3(0x0304d6); // add eax, [esi+edx*8] ; wt[0] + put1a(0xbb, (1<<19)-1); // mov ebx, 524287 + put2(0x39d8); // cmp eax, ebx + put3(0x0f4fc3); // cmovg eax, ebx + put2(0xf7d3); // not ebx ; -524288 + put2(0x39d8); // cmp eax, ebx + put3(0x0f4cc3); // cmovl eax, ebx + put3(0x8904d6); // mov [esi+edx*8], eax + put3(0x83c110); // add ecx, 16 ; err + put3(0xc1f905); // sar ecx, 5 + put4(0x034cd604); // add ecx, [esi+edx*8+4] ; wt[1] + put1a(0xb8, (1<<19)-1); // mov eax, 524287 + put2(0x39c1); // cmp ecx, eax + put3(0x0f4fc8); // cmovg ecx, eax + put2(0xf7d0); // not eax ; -524288 + put2(0x39c1); // cmp ecx, eax + put3(0x0f4cc8); // cmovl ecx, eax + put4(0x894cd604); // mov [esi+edx*8+4], ecx + } + break; + + case MATCH: // sizebits bufbits: + // a=len, b=offset, c=bit, cm=index, cxt=bitpos + // ht=buf, limit=pos + // assert(cr.a<=255); + // assert(cr.c==0 || cr.c==1); + // assert(cr.cxt<8); + // assert(cr.cm.size()==(size_t(1)<>5; + // int w=cr.a16[cr.cxt]; + // w+=(err*(p[cp[2]]-p[cp[3]])+(1<<12))>>13; + // if (w<0) w=0; + // if (w>65535) w=65535; + // cr.a16[cr.cxt]=w; + + // set ecx=err + put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] + put1a(0x05, 2048); // add eax, 2048 + put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht] + put2(0x89e9); // mov ecx, ebp ; y + put3(0xc1e10f); // shl ecx, 15 + put2(0x29e9); // sub ecx, ebp ; y*32767 + put2(0x29c1); // sub ecx, eax + put2a(0x69c9, cp[4]); // imul ecx, rate + put3(0xc1f905); // sar ecx, 5 ; err + + // Update w + put2a(0x8b87, offc(cxt)); // mov eax, [edi+&cxt] + if (S==8) put1(0x48); // rex.w + put2a(0x8bb7, offc(a16)); // mov esi, [edi+&a16] + if (S==8) put1(0x48); // rex.w + 
put3(0x8d3446); // lea esi, [esi+eax*2] ; &w + put2a(0x8b87, off(p[cp[2]])); // mov eax, [edi+&p[j]] + put2a(0x2b87, off(p[cp[3]])); // sub eax, [edi+&p[k]] ; p[j]-p[k] + put3(0x0fafc1); // imul eax, ecx ; * err + put1a(0x05, 1<<12); // add eax, 4096 + put3(0xc1f80d); // sar eax, 13 + put3(0x0fb716); // movzx edx, word [esi] ; w + put2(0x01d0); // add eax, edx + put1a(0xba, 0xffff); // mov edx, 65535 + put2(0x39d0); // cmp eax, edx + put3(0x0f4fc2); // cmovg eax, edx + put2(0x31d2); // xor edx, edx + put2(0x39d0); // cmp eax, edx + put3(0x0f4cc2); // cmovl eax, edx + put3(0x668906); // mov word [esi], ax + break; + + case MIX: // sizebits j m rate mask + // cm=wt[size][m], cxt=input + // int m=cp[3]; + // assert(m>0 && m<=i); + // assert(cr.cm.size()==m*cr.c); + // assert(cr.cxt+m<=cr.cm.size()); + // int err=(y*32767-squash(p[i]))*cp[4]>>4; + // int* wt=(int*)&cr.cm[cr.cxt]; + // for (int j=0; j>13)); + + // set ecx=err + put2a(0x8b87, off(p[i])); // mov eax, [edi+&p[i]] + put1a(0x05, 2048); // add eax, 2048 + put4a(0x0fb78447, off(squasht));//movzx eax, word [edi+eax*2+&squasht] + put2(0x89e9); // mov ecx, ebp ; y + put3(0xc1e10f); // shl ecx, 15 + put2(0x29e9); // sub ecx, ebp ; y*32767 + put2(0x29c1); // sub ecx, eax + put2a(0x69c9, cp[4]); // imul ecx, rate + put3(0xc1f904); // sar ecx, 4 ; err + + // set esi=wt + put2a(0x8b87, offc(cxt)); // mov eax, [edi+&cxt] ; cxt + if (S==8) put1(0x48); // rex.w + put2a(0x8bb7, offc(cm)); // mov esi, [edi+&cm] + if (S==8) put1(0x48); // rex.w + put3(0x8d3486); // lea esi, [esi+eax*4] ; wt + + for (int k=0; k=256) { + z.run(c8-256); + hmap4=1; + c8=1; + for (int i=0; i=16 && c8<32) + hmap4=(hmap4&0xf)<<5|y<<4|1; + else + hmap4=(hmap4&0x1f0)|(((hmap4&0xf)*2+y)&0xf); +#endif +} + +// Execute the ZPAQL code with input byte or -1 for EOF. +// Use JIT code at rcode if available, or else create it. 
+void ZPAQL::run(U32 input) { +#ifdef NOJIT + run0(input); // JIT compiled out (-DNOJIT): hand the input to run0() instead +#else + if (!rcode) { // no JIT code yet: build it on first use + int n=assemble(); // size assemble() reports for the generated code + allocx(rcode, rcode_size, n); // NOTE(review): presumably allocates the n-byte code buffer - confirm against allocx() + if (!rcode || n<10 || rcode_size<10 || n!=assemble()) // second assemble() call must report the same size + error("run JIT failed"); + } + a=input; + if (!((int(*)())(&rcode[0]))()) // call rcode[0] as int(); a zero return is reported as a bad opcode + libzpaq::error("Bad ZPAQL opcode"); +#endif +} + +} // end namespace libzpaq diff --git a/libzpaq501/libzpaq.h b/libzpaq501/libzpaq.h new file mode 100644 index 0000000..e7879b4 --- /dev/null +++ b/libzpaq501/libzpaq.h @@ -0,0 +1,441 @@ +/* libzpaq.h - LIBZPAQ Version 5.00. + + Copyright (C) 2011, Dell Inc. Written by Matt Mahoney. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so without restriction. + This Software is provided "as is" without warranty. + +LIBZPAQ is a C++ library for compression and decompression of data +conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/ + +By default, LIBZPAQ uses JIT (just in time) acceleration. This only +works on x86-32 and x86-64 processors that support the SSE2 instruction +set. To disable JIT, compile with -DNOJIT. To enable run time checks, +compile with -DDEBUG. Both options will decrease speed. + +The decompression code, when compiled with -DDEBUG and -DNOJIT, +comprises the reference decoder for the ZPAQ level 2 standard. 
+*/ + +#ifndef LIBZPAQ_H +#define LIBZPAQ_H + +#ifndef DEBUG +#define NDEBUG 1 +#endif +#include +#include +#include + +namespace libzpaq { + +// 1, 2, 4, 8 byte unsigned integers +typedef uint8_t U8; +typedef uint16_t U16; +typedef uint32_t U32; +typedef uint64_t U64; + +// Standard library prototypes redirected to libzpaq.cpp +void* calloc(size_t, size_t); +void free(void*); + +// Callback for error handling +extern void error(const char* msg); + +// Virtual base classes for input and output +// get() and put() must be overridden to read or write 1 byte. +// read() and write() may be overridden to read or write n bytes more +// efficiently than calling get() or put() n times. +class Reader { +public: + virtual int get() = 0; // should return 0..255, or -1 at EOF + virtual int read(char* buf, int n); // read to buf[n], return no. read + virtual ~Reader() {} +}; + +class Writer { +public: + virtual void put(int c) = 0; // should output low 8 bits of c + virtual void write(const char* buf, int n); // write buf[n] + virtual ~Writer() {} +}; + +// Read 16 bit little-endian number +int toU16(const char* p); + +// An Array of T is cleared and aligned on a 64 byte address +// with no constructors called. No copy or assignment. 
+// Array a(n, ex=0); - creates n< +class Array { + T *data; // user location of [0] on a 64 byte boundary + size_t n; // user size + int offset; // distance back in bytes to start of actual allocation + void operator=(const Array&); // no assignment + Array(const Array&); // no copy +public: + Array(size_t sz=0, int ex=0): data(0), n(0), offset(0) { + resize(sz, ex);} // [0..sz-1] = 0 + void resize(size_t sz, int ex=0); // change size, erase content to zeros + ~Array() {resize(0);} // free memory + size_t size() const {return n;} // get size + int isize() const {return int(n);} // get size as an int + T& operator[](size_t i) {assert(n>0 && i0 && (n&(n-1))==0); return data[i&(n-1)];} +}; + +// Change size to sz< +void Array::resize(size_t sz, int ex) { + assert(size_t(-1)>0); // unsigned type? + while (ex>0) { + if (sz>sz*2) error("Array too big"); + sz*=2, --ex; + } + if (n>0) { + assert(offset>0 && offset<=64); + assert((char*)data-offset); + free((char*)data-offset); + } + n=0; + if (sz==0) return; + n=sz; + const size_t nb=128+n*sizeof(T); // test for overflow + if (nb<=128 || (nb-128)/sizeof(T)!=n) error("Array too big"); + data=(T*)calloc(nb, 1); + if (!data) error("Out of memory"); + offset=64-(((char*)data-(char*)0)&63); + assert(offset>0 && offset<=64); + data=(T*)((char*)data+offset); +} + +//////////////////////////// SHA1 //////////////////////////// + +// For computing SHA-1 checksums +class SHA1 { +public: + void put(int c) { // hash 1 byte + U32& r=w[len0>>5&15]; + r=(r<<8)|(c&255); + if (!(len0+=8)) ++len1; + if ((len0&511)==0) process(); + } + double size() const {return len0/8+len1*536870912.0;} // size in bytes + uint64_t usize() const {return len0/8+(U64(len1)<<29);} // size in bytes + const char* result(); // get hash and reset + SHA1() {init();} +private: + void init(); // reset, but don't clear hbuf + U32 len0, len1; // length in bits (low, high) + U32 h[5]; // hash state + U32 w[80]; // input buffer + char hbuf[20]; // result + void 
process(); // hash 1 block +}; + +//////////////////////////// ZPAQL /////////////////////////// + +// Symbolic constants, instruction size, and names +typedef enum {NONE,CONS,CM,ICM,MATCH,AVG,MIX2,MIX,ISSE,SSE} CompType; +extern const int compsize[256]; + +// A ZPAQL machine COMP+HCOMP or PCOMP. +class ZPAQL { +public: + ZPAQL(); + ~ZPAQL(); + void clear(); // Free memory, erase program, reset machine state + void inith(); // Initialize as HCOMP to run + void initp(); // Initialize as PCOMP to run + double memory(); // Return memory requirement in bytes + void run(U32 input); // Execute with input + int read(Reader* in2); // Read header + bool write(Writer* out2, bool pp); // If pp write PCOMP else HCOMP header + int step(U32 input, int mode); // Trace execution (defined externally) + + Writer* output; // Destination for OUT instruction, or 0 to suppress + SHA1* sha1; // Points to checksum computer + U32 H(int i) {return h(i);} // get element of h + + void flush(); // write outbuf[0..bufptr-1] to output and sha1 + void outc(int c) { // output byte c (0..255) or -1 at EOS + if (c<0 || (outbuf[bufptr]=c, ++bufptr==outbuf.isize())) flush(); + } + + // ZPAQ1 block header + Array header; // hsize[2] hh hm ph pm n COMP (guard) HCOMP (guard) + int cend; // COMP in header[7...cend-1] + int hbegin, hend; // HCOMP/PCOMP in header[hbegin...hend-1] + +private: + // Machine state for executing HCOMP + Array m; // memory array M for HCOMP + Array h; // hash array H for HCOMP + Array r; // 256 element register array + Array outbuf; // output buffer + int bufptr; // number of bytes in outbuf + U32 a, b, c, d; // machine registers + int f; // condition flag + int pc; // program counter + int rcode_size; // length of rcode + U8* rcode; // JIT code for run() + + // Support code + int assemble(); // put JIT code in rcode + void init(int hbits, int mbits); // initialize H and M sizes + int execute(); // execute 1 instruction, return 0 after HALT, else 1 + void run0(U32 input); // 
default run() when select==0 + void div(U32 x) {if (x) a/=x; else a=0;} + void mod(U32 x) {if (x) a%=x; else a=0;} + void swap(U32& x) {a^=x; x^=a; a^=x;} + void swap(U8& x) {a^=x; x^=a; a^=x;} + void err(); // exit with run time error +}; + +///////////////////////// Component ////////////////////////// + +// A Component is a context model, indirect context model, match model, +// fixed weight mixer, adaptive 2 input mixer without or with current +// partial byte as context, adaptive m input mixer (without or with), +// or SSE (without or with). + +struct Component { + size_t limit; // max count for cm + size_t cxt; // saved context + size_t a, b, c; // multi-purpose variables + Array cm; // cm[cxt] -> p in bits 31..10, n in 9..0; MATCH index + Array ht; // ICM/ISSE hash table[0..size1][0..15] and MATCH buf + Array a16; // MIX weights + void init(); // initialize to all 0 + Component() {init();} +}; + +////////////////////////// StateTable //////////////////////// + +// Next state table generator +class StateTable { + enum {N=64}; // sizes of b, t + int num_states(int n0, int n1); // compute t[n0][n1][1] + void discount(int& n0); // set new value of n0 after 1 or n1 after 0 + void next_state(int& n0, int& n1, int y); // new (n0,n1) after bit y +public: + U8 ns[1024]; // state*4 -> next state if 0, if 1, n0, n1 + int next(int state, int y) { // next state for bit y + assert(state>=0 && state<256); + assert(y>=0 && y<4); + return ns[state*4+y]; + } + int cminit(int state) { // initial probability of 1 * 2^23 + assert(state>=0 && state<256); + return ((ns[state*4+3]*2+1)<<22)/(ns[state*4+2]+ns[state*4+3]+1); + } + StateTable(); +}; + +///////////////////////// Predictor ////////////////////////// + +// A predictor guesses the next bit +class Predictor { +public: + Predictor(ZPAQL&); + ~Predictor(); + void init(); // build model + int predict(); // probability that next bit is a 1 (0..4095) + void update(int y); // train on bit y (0..1) + int stat(int); // Defined 
externally + bool isModeled() { // n>0 components? + assert(z.header.isize()>6); + return z.header[6]!=0; + } +private: + + // Predictor state + int c8; // last 0...7 bits. + int hmap4; // c8 split into nibbles + int p[256]; // predictions + U32 h[256]; // unrolled copy of z.h + ZPAQL& z; // VM to compute context hashes, includes H, n + Component comp[256]; // the model, includes P + + // Modeling support functions + int predict0(); // default + void update0(int y); // default + int dt2k[256]; // division table for match: dt2k[i] = 2^12/i + int dt[1024]; // division table for cm: dt[i] = 2^16/(i+1.5) + U16 squasht[4096]; // squash() lookup table + short stretcht[32768];// stretch() lookup table + StateTable st; // next, cminit functions + U8* pcode; // JIT code for predict() and update() + int pcode_size; // length of pcode + + // reduce prediction error in cr.cm + void train(Component& cr, int y) { + assert(y==0 || y==1); + U32& pn=cr.cm(cr.cxt); + U32 count=pn&0x3ff; + int error=y*32767-(cr.cm(cr.cxt)>>17); + pn+=(error*dt[count]&-1024)+(count floor(32768/(1+exp(-x/64))) + int squash(int x) { + assert(x>=-2048 && x<=2047); + return squasht[x+2048]; + } + + // x -> round(64*log((x+0.5)/(32767.5-x))), approx inverse of squash + int stretch(int x) { + assert(x>=0 && x<=32767); + return stretcht[x]; + } + + // bound x to a 12 bit signed int + int clamp2k(int x) { + if (x<-2048) return -2048; + else if (x>2047) return 2047; + else return x; + } + + // bound x to a 20 bit signed int + int clamp512k(int x) { + if (x<-(1<<19)) return -(1<<19); + else if (x>=(1<<19)) return (1<<19)-1; + else return x; + } + + // Get cxt in ht, creating a new row if needed + size_t find(Array& ht, int sizebits, U32 cxt); + + // Put JIT code in pcode + int assemble_p(); +}; + +//////////////////////////// Decoder ///////////////////////// + +// Decoder decompresses using an arithmetic code +class Decoder { +public: + Reader* in; // destination + Decoder(ZPAQL& z); + int decompress(); // 
return a byte or EOF + int skip(); // skip to the end of the segment, return next byte + void init(); // initialize at start of block + int stat(int x) {return pr.stat(x);} +private: + U32 low, high; // range + U32 curr; // last 4 bytes of archive + Predictor pr; // to get p + enum {BUFSIZE=1<<16}; + Array buf; // input buffer of size BUFSIZE bytes + // of unmodeled data. buf[low..high-1] is input with curr + // remaining in sub-block. + int decode(int p); // return decoded bit (0..1) with prob. p (0..65535) + void loadbuf(); // read unmodeled data into buf to EOS +}; + +/////////////////////////// PostProcessor //////////////////// + +class PostProcessor { + int state; // input parse state: 0=INIT, 1=PASS, 2..4=loading, 5=POST + int hsize; // header size + int ph, pm; // sizes of H and M in z +public: + ZPAQL z; // holds PCOMP + PostProcessor(): state(0), hsize(0), ph(0), pm(0) {} + void init(int h, int m); // ph, pm sizes of H and M + int write(int c); // Input a byte, return state + int getState() const {return state;} + void setOutput(Writer* out) {z.output=out;} + void setSHA1(SHA1* sha1ptr) {z.sha1=sha1ptr;} +}; + +//////////////////////// Decompresser //////////////////////// + +// For decompression and listing archive contents +class Decompresser { +public: + Decompresser(): z(), dec(z), pp(), state(BLOCK), decode_state(FIRSTSEG) {} + void setInput(Reader* in) {dec.in=in;} + bool findBlock(double* memptr = 0); + void hcomp(Writer* out2) {z.write(out2, false);} + bool findFilename(Writer* = 0); + void readComment(Writer* = 0); + void setOutput(Writer* out) {pp.setOutput(out);} + void setSHA1(SHA1* sha1ptr) {pp.setSHA1(sha1ptr);} + bool decompress(int n = -1); // n bytes, -1=all, return true until done + bool pcomp(Writer* out2) {return pp.z.write(out2, true);} + void readSegmentEnd(char* sha1string = 0); + int stat(int x) {return dec.stat(x);} +private: + ZPAQL z; + Decoder dec; + PostProcessor pp; + enum {BLOCK, FILENAME, COMMENT, DATA, SEGEND} state; // 
expected next + enum {FIRSTSEG, SEG, SKIP} decode_state; // which segment in block? +}; + +/////////////////////////// decompress() ///////////////////// + +void decompress(Reader* in, Writer* out); + +////////////////////////////////////////////////////////////// +////////////////////////////////////////////////////////////// + +// Code following this point is not a part of the ZPAQ level 2 standard. + +//////////////////////////// Encoder ///////////////////////// + +// Encoder compresses using an arithmetic code +class Encoder { +public: + Encoder(ZPAQL& z, int size=0): + out(0), low(1), high(0xFFFFFFFF), pr(z) {} + void init(); + void compress(int c); // c is 0..255 or EOF + int stat(int x) {return pr.stat(x);} + Writer* out; // destination +private: + U32 low, high; // range + Predictor pr; // to get p + Array buf; // unmodeled input + void encode(int y, int p); // encode bit y (0..1) with prob. p (0..65535) +}; + +//////////////////////// Compressor ////////////////////////// + +class Compressor { +public: + Compressor(): enc(z), in(0), state(INIT) {} + void setOutput(Writer* out) {enc.out=out;} + void writeTag(); + void startBlock(int level); // level=1,2,3 + void startBlock(const char* hcomp); + void startSegment(const char* filename = 0, const char* comment = 0); + void setInput(Reader* i) {in=i;} + void postProcess(const char* pcomp = 0, int len = 0); + bool compress(int n = -1); // n bytes, -1=all, return true until done + void endSegment(const char* sha1string = 0); + void endBlock(); + int stat(int x) {return enc.stat(x);} +private: + ZPAQL z; + Encoder enc; + Reader* in; + enum {INIT, BLOCK1, SEG1, BLOCK2, SEG2} state; +}; + +/////////////////////////// compress() /////////////////////// + +void compress(Reader* in, Writer* out, int level); + +} // namespace libzpaq + +#endif // LIBZPAQ_H