From c1ab298c965c78c2c0a8864a980b895667b0c8d7 Mon Sep 17 00:00:00 2001
From: Peter Hyman <pete@peterhyman.com>
Date: Sun, 5 Apr 2020 15:09:58 -0500
Subject: [PATCH] zpaq 7.15 update

---
 configure.ac          |    2 +-
 libzpaq/libzpaq.3.pod |  737 ------
 libzpaq/libzpaq.cpp   | 5239 ++++++++++++++++++++++++++++++++++++++---
 libzpaq/libzpaq.h     | 1213 +++++++++-
 stream.c              |    3 +-
 5 files changed, 6054 insertions(+), 1140 deletions(-)
 delete mode 100644 libzpaq/libzpaq.3.pod
diff --git a/configure.ac b/configure.ac
index 29e8d35..c3a96c8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -124,7 +124,7 @@ AC_CHECK_FUNCS(getopt_long)
 AX_PTHREAD
 LIBS="$PTHREAD_LIBS $LIBS"
 CFLAGS="$CFLAGS $PTHREAD_CFLAGS"
-CXXFLAGS="$CXXFLAGS $PTHREAD_CXXFLAGS"
+CXXFLAGS="$CXXFLAGS $PTHREAD_CFLAGS"
 
 # final checks for assembler
 # ASM is back for x86_64 by using newer CRC code from p7zip-16.02
diff --git a/libzpaq/libzpaq.3.pod b/libzpaq/libzpaq.3.pod
deleted file mode 100644
index 3ea1b95..0000000
--- a/libzpaq/libzpaq.3.pod
+++ /dev/null
@@ -1,737 +0,0 @@
-#  Documentation for libzpaq
-#
-#  Copyright (C) 2012, Dell Inc. Written by Matt Mahoney.
-#
-#  Permission is hereby granted, free of charge, to any person obtaining a copy
-#  of this software and associated documentation files (the "Software"), to deal
-#  in the Software without restriction, including without limitation the rights
-#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#  copies of the Software, and to permit persons to whom the Software is
-#  furnished to do so without restriction.
-#  This Software is provided "as is" without warranty.
-#
-#  To create man page: pod2man libzpaq.3.pod > libzpaq.3
-#  To create HTML documentation: pod2html libzpaq.3.pod > libzpaq.html
-
-=pod
-
-=head1 NAME
-
-libzpaq - ZPAQ compression API
-
-=head1 SYNOPSIS
-
-    #include "libzpaq.h"
-
-    namespace libzpaq {
-
-    extern void error(const char* msg);
-
-    class Reader {
-    public:
-        virtual int get() = 0;
-        virtual int read(char* buf, int n); // optional
-        virtual ~Reader() {}
-    };
-
-    class Writer {
-    public:
-        virtual void put(int c) = 0;
-        virtual void write(const char* buf, int n); // optional
-        virtual ~Writer() {}
-    };
-
-    class SHA1 {
-    public:
-        SHA1();
-        void put(int c);
-        double size() const;
-        uint64_t usize() const
-        const char* result();
-    };
-
-    class Compressor {
-    public:
-        Compressor();
-        void setOutput(Writer* out);
-        void writeTag();
-        void startBlock(int level);
-        void startBlock(const char* hcomp);
-        void startSegment(const char* filename = 0,
-                          const char* comment = 0);
-        void setInput(Reader* i);
-        void postProcess(const char* pcomp = 0, int length = 0);
-        bool compress(int n = -1);
-        void endSegment(const char* sha1string = 0);
-        void endBlock();
-    };
-
-    class Decompresser {
-    public:
-        Decompresser();
-        void setInput(Reader* in);
-        bool findBlock(double* memptr = 0);
-        void hcomp(Writer* out);
-        bool findFilename(Writer* = 0);
-        void readComment(Writer* = 0);
-        void setOutput(Writer* out);
-        void setSHA1(SHA1* sha1ptr);
-        bool decompress(int n = -1);
-        bool pcomp(Writer* out);
-        void readSegmentEnd(char* sha1string = 0);
-    };
-
-    void compress(Reader* in, Writer* out, int level);
-
-    void decompress(Reader* in, Writer* out);
-    }
-
-=head1 DESCRIPTION
-
-I<libzpaq> is a C++ API for compressing or decompressing
-files or objects in memory comforming to the ZPAQ level 1 and 2 standards
-(see I<availability>). This document describes version 5.00
-of the software. The software may be used without
-restriction under a modified MIT license.
-
-ZPAQ provides a high level of data compression in a streaming
-(single pass) self-describing format that supports single or multiple
-named objects (such as archives) with optional integrity checking.
-
-The library provides 3 default compression levels but supports
-custom algorithms. The performance of the default levels is
-shown in the table below for the 14 file Calgary corpus as
-a tar file. Compression and decompression times are in seconds
-on a 2 GHz T3200 running on one of two cores. Memory required
-to compress or decompress is in MB. Some popular formats
-are shown for comparison.
-
-    Program       Format       Size     Time (C, D)   Memory
-    -----------   ------     ---------  -----------   ------
-    Uncompresed   .tar       3,152,896
-    compress      .tar.Z     1,319,521    1.6   0.2    .1 MB
-    gzip -9       .tar.gz    1,022,810    0.7   0.1    .1 MB
-    bzip2 -9      .tar.bz2     860,097    0.6   0.4     5 MB
-    7zip          .tar.7z      824,573    1.5   0.1   195 MB
-    zpaq 1 (fast) .tar.zpaq    806,959    2     2      38 MB
-    zpaq 2 (mid)  .tar.zpaq    699,191    8     8     112 MB
-    zpaq 3 (max)  .tar.zpaq    644,190   20    20     246 MB
-
-A ZPAQ stream consists of one or more blocks, possibly mixed with
-other data, that can be decompressed independently in any order.
-Each block consists of one or more segments that must be decompressed
-in order from the beginning of the block. Each block header contains
-a description of the decompression algorithm. Each segment consists
-of an optional filename string, an optional comment string,
-self delimiting compressed data, and an optional SHA-1 checksum.
-If ZPAQ blocks are mixed with other data, they must be
-preceded by an identifying 13 byte tag which does not otherwise
-appear in that data.
-
-ZPAQ compression is based on the PAQ context mixing model.
-An array of components predict the probability of the next bit
-of input, either independently or depending on the predictions
-of earlier components. The final prediction is arithmetic coded.
-Each component inputs a context computed from earlier input
-by a program written in ZPAQL byte code which runs on a virtual
-machine. Both the component array description and the ZPAQL
-code are encoded in a string called HCOMP in each block header.
-Data can also be stored uncompressed.
-
-A block may optionally specify a post-processor, a program
-(also in ZPAQL) which takes the decoded data as input and
-outputs the decompressed output. This program, if present,
-is encoded as a string called PCOMP which is compressed
-in the first segment prior to the compressed data. The first
-decoded byte from the first segment is a flag indicating
-whether a PCOMP string is present. The user is responsible
-for correctly pre-processing the data so that post-processing
-restores the original data.
-
-=head2 API Organization
-
-The I<libzpaq> API consists of 2 files.
-
-=over
-
-=item libzpaq.h
-
-Header file to include in your application.
-
-=item libzpaq.cpp
-
-Source code file to link to your application.
-
-=back
-
-An application would have the line C<#include "libzpaq.h"> and
-link to libzpaq.cpp.
-The API provides two classes, C<Compressor> and C<Decompresser>
-which write or read respectively each of the syntactic elements
-of a ZPAQ stream. The two functions C<compress()> and
-C<decompress()> provide simple interfaces for the most common
-uses. In either case, the user must create classes derived
-from the abstract base classes C<Reader> and C<Writer> and
-define methods C<get()> and C<put()> which the code
-will use to read and write bytes. The user must also define
-a callback error handler.
-
-By default, libzpaq(3) uses just-in-time (JIT) acceleration
-by translating ZPAQL code to x86-32 or x86-64 internally
-and executing it. This feature can be disabled by compiling
-with -DNOJIT. If enabled, it requires an x86 processor
-capable of executing SSE2 instructions. SSE2 is supported
-by most Intel processors since 2001 and AMD since 2003.
-
-Run time checks (assertions) can be enabled with -DDEBUG
-for debugging purposes.
-
-All of the API code is contained in the namespace C<libzpaq>.
-
-=head2 Callback Functions
-
-The following three functions must be defined by the user.
-
-=over
-
-=item C<extern void libzpaq::error(const char* msg);>
-
-This function must be defined by the user to handle errors
-from libzpaq. The library will call the function with
-an English language message passed to C<msg>. Errors may
-result from bad input during decompression, out of memory,
-or illegal arguments or calling sequences to libzpaq
-functions. Errors should be considered unrecoverable.
-
-=item C<int libzpaq::Reader::get() = 0;>
-
-The user must create a class derived from Reader with an
-implementation for C<get()> that reads one byte of input
-and returns its value in the range 0...255, or returns
-EOF (-1) at end of input. Objects of the derived type
-would then be passed to functions that require a C<Reader>.
-
-=item C<void libzpaq::Writer::put(int c) = 0;>
-
-The user must create a class derived from Writer with
-an implemenation of C<put()> which is expected to take
-a byte value C<c> in the range 0...255 and write it to
-output. Objects of the derived type
-would then be passed to functions that require a C<Writer>.
-
-=back
-
-The following two functions are optional. Defining them
-can improve performance slightly.
-
-=over
-
-=item C<virtual int read(char* buf, int n);>
-
-If defined, this function should input up to C<n> bytes into
-the array C<buf> and return the number actually read, in
-the range 0..n. A return value of 0 indicates end of input.
-If C<read()> is not defined, then the default implementation
-will call C<get()> n times.
-
-=item C<virtual void write(const char* buf, int n);>
-
-If defined, this function should output the elements C<buf[0]>
-through C<buf[n-1]> in order. If not defined, then the default
-implementation will call C<put()> n times.
-
-=back
-
-=head2 Simple Compression
-
-In the remainder of this document, all classes and
-functions are assumed to be in namespace C<libzpaq>.
-
-=over
-
-=item C<void compress(Reader* in, Writer* out, int mode);>
-
-C<compress()> compresses from C<in> to C<out> until C<get()>
-returns EOF. It writes a single segment in a single block
-with empty filename, comment, and checksum fields. C<mode>
-must be 1, 2, or 3, to select models I<fast>, I<mid>, or
-I<max> respectively. Higher modes compress smaller but
-take longer to compress and subsequently decompress.
-
-=item C<void decompress(Reader* in, Writer* out);>
-
-C<decompress()> decompresses any valid ZPAQ stream from
-C<in> to C<out> until C<get()> returns EOF. Any
-non-ZPAQ data in the input is ignored. Any ZPAQ blocks
-following non-ZPAQ must be preceded by a marker tag
-to be recognized. Each block is decoded according to the
-instructions in the block header. The contents of the
-filename, comment, and checksum fields are ignored.
-Data with bad checksums will be decoded anyway. If there
-is more than one segment, then all of the output
-data will be concatenated.
-
-=back
-
-=head2 class SHA1
-
-The SHA1 class is used to compute SHA-1 checksums for compression
-and verify them for decompression. It is believed to be
-computationally infeasible to find two different strings
-with the same hash value. Its member functions
-are as follows:
-
-=over
-
-=item C<SHA1();>
-
-The constructor creates a new SHA1 object representing the
-hash of an empty string.
-
-=item C<void put(int c);>
-
-Appends one byte c (0...255) to the string whose hash is represented.
-
-=item C<double size() const;>
-
-Returns the length (so far) of the string whose hash is represented.
-The largest possible value returned is
-2^61 - 1 = 2305843009213693951.0, but values larger than 2^53 =
-9007199254740992.0
-will not be exact on systems using IEEE 64 bit floating point
-representation of type C<double>. The initial value is 0.0.
-
-=item C<int64_t usize() const;>
-
-Returns the length (so far) as a 64 bit unsigned integer.
-
-=item C<const char* result();>
-
-Computes the 20 byte SHA-1 hash and resets the string back
-to a size of 0.0. The returned pointer points to an array
-inside the SHA1 object whose
-contents remain unchanged until the next call to C<result()>.
-
-=back
-
-=head2 class Compressor
-
-The C<Compressor> class has member functions to write
-each of the syntactic elements of a ZPAQ stream and to specify
-their values. It will compress using either built-in or
-user supplied models.
-
-=over
-
-=item C<Compressor();>
-
-The constructor creates a Compression object. No input source,
-output destination, or compression model is specified.
-
-=item C<void setOutput(Writer* out);>
-
-Specifies a destination for output. Must be specified before
-calling any function that writes data.
-
-=item C<void writeTag();>
-
-Writes a 13 byte marker tag which can be used to identify
-the start of a block following non-ZPAQ data.
-
-=item C<void startBlock(int level);>
-
-Writes a block header and specifies a compression model.
-If linked with F<libzpaqo.cpp>, then C<level> must be 1, 2, or 3
-to specify I<fast>, I<mid>, or I<max> respectively. Higher numbers
-compress smaller but more slowly. These models are compatible
-with both the ZPAQ level 1 and 2 standards.
-
-=item C<void startBlock(const char* hcomp);>
-
-Writes a block header and specifies the HCOMP portion of the
-compression model. The first two bytes of the string should
-encode the length of the rest of the string as a 16 bit unsigned
-number with the least significant bit first. The meaning of the
-rest of the string is defined in the ZPAQ level 2 standard.
-If the number of components (C<hcomp[8]>) is 0, then the block
-is saved in ZPAQ level 2 format, which cannot be read by
-older ZPAQ level 1 decoders. Otherwise the block is saved in
-ZPAQ level 1 format, which is compatible with all decoders.
-
-=item C<void startSegment(const char* filename = 0, const char* comment = 0);>
-
-Writes a segment header. C<filename> and
-C<comment> are NUL terminated strings. If specified, then their
-values are stored. Normally, C<filename> would be a file name
-when compressing to an archive or omitted otherwise. If a file
-is split among segments, then by convention only the first segment
-is named. C<comment> is normally the uncompressed size as a decimal
-number which is displayed when listing the contents of an archive.
-Omitting it does not affect decompression.
-
-=item C<void postProcess(const char* pcomp = 0, int length = 0);>
-
-Specifies the optional PCOMP string used for post-processing.
-It must be called from within the first segment
-of each block prior to compressing any data, but not from within
-any other segment.
-If C<pcomp> is 0 or no argument is passed, then the decompresser
-will not post-process the data. The effect is to compress a
-0 byte to indicate to the decompresser that no PCOMP string
-is present.
-
-If C<pcomp> is not 0, then I<length> bytes of the string I<pcomp>
-are passed. If I<length> is 0 or omitted, then
-the first two bytes must encode
-the length of the rest of the string as a 16 bit unsigned number
-with the least significant byte first. The format of the remainder
-of the string is described in the ZPAQ level 2 standard.
-The effect is to compress a 1 byte
-to indicate the presence of PCOMP, followed by the two length
-bytes and the string as passed. For example, either
-C<pcomp("\x02\x00\x05\x08")> or C<pcomp("\x05\x08", 2)>
-would compress the 5 bytes 1, 2, 0, 5, 8.
-The user is responsible for pre-processing the input
-prior to compression so that PCOMP restores the original data.
-
-=item C<void setInput(Reader* in);>
-
-Specifies the input source for compression. It must be set
-prior to the first call to C<compress()>.
-
-=item C<bool compress(int n = -1);>
-
-Compress n bytes of data, or until EOF is input, whichever comes
-first. If n < 0 or omitted, then compress until EOF.
-Returns true if there is more input available, or false if EOF
-was read.
-
-=item C<void endSegment(const char* sha1string = 0);>
-
-Stop compressing and write the end of a segment. If
-C<sha1string> is specified, it should be a 20 byte string
-as returned by C<SHA1::result()> on the input data for
-this segment I<before> pre-processing.
-
-=item C<void endBlock();>
-
-Finish writing the current block.
-
-=back
-
-In order to create a valid ZPAQ stream, the components must
-be written in the following order:
-
-    for each block do {
-        if any non-ZPAQ data then {
-            write non-ZPAQ data
-            writeTag()
-        }
-        startBlock()
-        for each segment do {
-            startSegment()
-            if first segment in block then {
-                postProcess()
-            }
-            while (compress(n)) ;
-            endSegment()
-        }
-        endBlock()
-    }
-
-=head2 class Decompresser
-
-The class Decompresser has member functions to read each of the
-syntactic elements of a ZPAQ stream.
-
-=over
-
-=item C<Decompresser()>
-
-The constructor creates a Decompresser object. No input source or
-output destination is specified.
-
-=item C<void setInput(Reader* in);>
-
-Specifies where the ZPAQ stream will be read from. Must be called
-before any function that reads the stream.
-
-=item C<bool findBlock(double* memptr = 0);>
-
-Scan the input to find the start of the next block. If a block
-does not start immediately, then the block must be preceded by
-a marker tag (written with C<Compressor::writeTag()>) or it will
-not be found. If C<memptr> is not 0, then write the approximate
-memory requirement (in bytes) to decompress to C<*memptr>). The
-memory will be allocated by the first call to C<decompress()>.
-It returns true if a block is found, or false if it reads to EOF
-without finding a block.
-
-=item C<void hcomp(Writer* out);>
-
-Write the HCOMP string of the current block to C<out>.
-It will be in a format suitable
-for passing to C<Compressor::startBlock()>. The first 2 bytes will
-encode the length of the rest of the string as a 16 bit unsigned
-integer with the least significant byte first. The format of the
-remainder of the string is described in the ZPAQ level 1
-specification.
-
-=item C<bool findFilename(Writer* out = 0);>
-
-Find the start of the next segment. If another segment is found
-within the current block then return true. If the end of the block
-is found first, then return false. If a segment is found, the
-filename field is not empty, and C<out>
-is not 0, then write the filename (without a terminating NUL byte)
-to C<out>.
-
-=item C<void readComment(Writer* out = 0);>
-
-Read or skip past the comment field following the filename field
-in the segment header. If C<out> is not 0 and the comment field is
-not empty, then write the comment
-(without a terminating NUL byte) to C<out>.
-
-=item C<void setOutput(Writer* out);>
-
-Specify the destination for decompression. It must be set before
-any data can be decompressed.
-
-=item C<void setSHA1(SHA1* sha1ptr);>
-
-Specify the address of a SHA1 object for computing the checksum
-of the decompressed data (after post-processing). As each byte C<c>
-is output, it is also passed to C<sha1ptr-E<gt>put(c)>. In order to
-compute the correct checksum, the SHA1 object should be in its
-initial state, either newly created, or by calling C<SHA1::result()>,
-before the first call to C<decompress()>. When the end of the segment
-is reached, the value returned by C<sha1ptr-E<gt>result()> should match
-the stored checksum, if any.
-
-=item C<bool decompress(int n = -1);>
-
-Decode n bytes or until the end of segment, whichever comes
-first. Return false if the end of segment is reached first. If
-n < 0 or not specified, then decompress to the end of segment
-and return false. C<n> is the number of bytes prior to post-processing.
-If the data is post-processed, then the size of the output may
-be different.
-
-=item C<bool pcomp(Writer* out);>
-
-Write the PCOMP string, if any, for the current block to C<out>.
-If there is no PCOMP string (no post-processor) then return false.
-Otherwise write the string to C<out> in a format suitable for
-passing to C<Compressor::postProcess()> and return true. If written,
-then the first 2 bytes will encode the length of the rest of the
-string as a 16 bit unsigned integer with the least significant
-bit first. The format of the rest of the string is descibed in
-the ZPAQ level 1 standard.
-
-C<pcomp()> is only valid after the first call to C<decompress()>
-in the current block. To read the PCOMP string without decompressing any
-data, then call C<decompress(0)> first. It is not necessary to
-call C<setOutput()> in this case.
-
-=item C<void readSegmentEnd(char* sha1string = 0);>
-
-Skip any compressed data in the current segment that has not yet
-been decompressed and advance to the end of the segment.
-Then if C<sha1string> is not 0 then write into
-the 21 byte array that it points to. If a checksum is present,
-then write a 1 into C<sha1string[0]> and write the stored checksum
-in C<sha1string[1...20]>. Otherwise write a 0 in C<sha1string[0]>.
-
-Note that it is not permitted to call decompress() if any compressed
-data has been skipped in any earlier segments in the same block.
-
-=back
-
-A valid sequence of calls is as follows:
-
-    while (findBlock()) {
-        while (findFilename()) {
-            readComment();
-            if first segment in block then { (optional)
-                decompress(0)
-                pcomp()
-            }
-            while (decompress(n)) ; (optional)
-            readSegmentEnd();
-        }
-    }
-
-=head1 EXAMPLES
-
-The following program F<listzpaq.cpp>
-lists the contents of a ZPAQ archive
-read from standard input.
-
-    #include <stdio.h>
-    #include <stdlib.h>
-    #include "libzpaq.h"
-
-    // Implement Reader and Writer interfaces for file I/O
-    class File: public libzpaq::Reader, public libzpaq::Writer {
-        FILE* f;
-    public:
-        File(FILE* f_): f(f_) {}
-        int get() {return getc(f);}
-        void put(int c) {putc(c, f);}
-        int read(char* buf, int n) {return fread(buf, 1, n, f);}
-        void write(const char* buf, int n) {fwrite(buf, 1, n, f);}
-    };
-
-    // Implement error handler
-    namespace libzpaq {
-        void error(const char* msg) {
-            fprintf(stderr, "Error: %s\n", msg);
-            exit(1);
-        }
-    }
-
-    // List the contents of an archive. For each block, show
-    // the memory required to decompress. For each segment,
-    // show the filename and comment.
-    void list(FILE* input, FILE* output) {
-        libzpaq::Decompresser d;
-        File in(input), out(output);
-        double memory;
-        d.setInput(&in);
-        for (int block=1; d.findBlock(&memory); ++block) {
-            printf("Block %d needs %1.0f MB\n", block, memory/1e6);
-            while (d.findFilename(&out)) {  // print filename
-                printf("\t");
-                d.readComment(&out);  // print comment
-                printf("\n");
-                d.readSegmentEnd();  // skip compressed data
-            }
-        }
-    }
-
-    int main() {
-        list(stdin, stdout);
-        return 0;
-    }
-
-The program could be compiled as follows:
-
-    g++ listzpaq.cpp libzpaq.cpp
-
-The following code compresses a list of files into one block
-written to stdout. Each file is compressed to a separate
-segment. For each segment, the filename, comment, and SHA-1
-checksum are stored. The comment, as conventional, is the
-file size as a decimal string.
-
-    // Compress one file to one segment
-    void compress_file(libzpaq::Compressor& c,
-                       const char* filename,
-                       bool first_segment) {
-
-        // Open input file
-        FILE* f;
-        f=fopen(filename, "rb");
-        if (!f) return;
-
-        // Compute SHA-1 checksum and file size
-        libzpaq::SHA1 sha1;
-        int ch;
-        while ((ch=getc(f))!=EOF)
-            sha1.put(ch);
-
-        // Write file size as a comment.
-        // The size can have at most 19 digits.
-        char comment[20];
-        sprintf(comment, "%1.0f", sha1.size());
-
-        // Compress segment
-        rewind(f);
-        File in(f);
-        c.startSegment(filename, comment);
-        if (first_segment)
-            c.postProcess();
-        c.setInput(&in);
-        c.compress();
-        c.endSegment(sha1.result());
-
-        // Close input file
-        fclose(f);
-    }
-
-    // Compress a list of argc files in argv[0...argc-1] into one
-    // ZPAQ block to stdout at level 2.
-    void compress_list(int argc, char** argv) {
-        libzpaq::Compressor c;
-        File out(stdout);
-        c.setOutput(&out);
-        c.startBlock(2);
-        for (int i=0; i<argc; ++i)
-            compress_file(c, argv[i], i==0);
-        c.endBlock();
-    }
-
-The following function decompresses from stdin to stdout.
-Filenames and comments are ignored, but checksums are verified
-if present.
-
-    void decompress() {
-        libzpaq::Decompresser d;
-        File in(stdin), out(stdout);
-        d.setInput(&in);
-        while (d.findBlock()) {
-            while (d.findFilename()) {
-                d.readComment();
-                libzpaq::SHA1 sha1;
-                d.setSHA1(&sha1);
-                d.setOutput(&out);
-                d.decompress();
-                char sha1string[21];
-                d.readSegmentEnd(sha1string);
-                const char* sha1result = sha1.result();
-                if (sha1string[0]==1
-                         && memcmp(sha1string+1, sha1result, 20))
-                    libzpaq::error("checksum verify error");
-            }
-        }
-    }
-
-C<Compressor::compress()> and C<Decompresser::decompress()> can
-be passed an argument n to display progress every n bytes,
-for example:
-
-    for (int i=1; d.decompress(1000000); ++i)
-        fprintf(stderr, "Decompressed %d MB\n", i);
-
-To compress or decompress to and from objects in memory, derive
-appropriate classes from C<Reader> and C<Writer>. For example, it is
-possible to compress or decompress to a C<std::string> using
-the following class.
-
-    struct String: public libzpaq::Writer {
-        std::string s;
-        void put(int c) {s+=char(c);}
-    };
-
-This class is also useful for reading the filename and comment
-fields during decompression as follows:
-
-    String filename, comment;
-    while (d.findFilename(&filename)) {
-        d.readComment(&comment);
-        // ...
-
-=head1 AVAILABILITY
-
-I<libzpaq>, I<zpaq>, and the ZPAQ level 1 and 2 specifications are
-available from L<http://mattmahoney.net/zpaq/>.
-
-=head1 SEE ALSO
-
-C<zpaq(1)>
-C<sha1(1SSL)>
-
-=cut
-
-
diff --git a/libzpaq/libzpaq.cpp b/libzpaq/libzpaq.cpp
index 633c6f1..d82bacb 100644
--- a/libzpaq/libzpaq.cpp
+++ b/libzpaq/libzpaq.cpp
@@ -1,31 +1,41 @@
-/* libzpaq.cpp - Part of LIBZPAQ Version 5.01
+/* libzpaq.cpp - LIBZPAQ Version 7.15 implementation - Aug. 17, 2016.
 
-  Copyright (C) 2011, Dell Inc. Written by Matt Mahoney.
+  libdivsufsort.c for divsufsort 2.00, included within, is
+  (C) 2003-2008 Yuta Mori, all rights reserved.
+  It is released under the MIT license as described in the comments
+  at the beginning of that section.
 
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so without restriction.
-  This Software is provided "as is" without warranty.
+  Some of the code for AES is from libtomcrypt 1.17 by Tom St. Denis
+  and is public domain.
+
+  The Salsa20/8 code for Scrypt is by D. Bernstein and is public domain.
+
+  All of the remaining software is provided as-is, with no warranty.
+  I, Matt Mahoney, release this software into
+  the public domain. This applies worldwide.
+  In some countries this may not be legally possible; if so:
+  I grant anyone the right to use this software for any purpose,
+  without any conditions, unless such conditions are required by law.
 
 LIBZPAQ is a C++ library for compression and decompression of data
 conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/
+See libzpaq.h for additional documentation.
 */
 
 #include "libzpaq.h"
-#include <stdio.h>
-#include <stdlib.h>
 #include <string.h>
+#include <string>
+#include <vector>
+#include <stdio.h>
 #include <math.h>
 
+#ifdef unix
 #ifndef NOJIT
-#ifndef _WIN32
 #include <sys/mman.h>
+#endif
 #else
 #include <windows.h>
-#endif
+#include <wincrypt.h>
 #endif
 
 namespace libzpaq {
@@ -58,10 +68,6 @@ void Writer::write(const char* buf, int n) {
     put(U8(buf[i]));
 }
 
-void error(const char* msg) {
-  fprintf(stderr, "zpipe error: %s\n", msg);
-  exit(1);
-}
 ///////////////////////// allocx //////////////////////
 
 // Allocate newsize > 0 bytes of executable memory and update
@@ -76,7 +82,7 @@ void allocx(U8* &p, int &n, int newsize) {
 #else
   if (p || n) {
     if (p)
-#ifndef _WIN32
+#ifdef unix
       munmap(p, n);
 #else // Windows
       VirtualFree(p, 0, MEM_RELEASE);
@@ -85,7 +91,7 @@ void allocx(U8* &p, int &n, int newsize) {
     n=0;
   }
   if (newsize>0) {
-#ifndef _WIN32
+#ifdef unix
     p=(U8*)mmap(0, newsize, PROT_READ|PROT_WRITE|PROT_EXEC,
                 MAP_PRIVATE|MAP_ANON, -1, 0);
     if ((void*)p==MAP_FAILED) p=0;
@@ -109,30 +115,31 @@ void allocx(U8* &p, int &n, int newsize) {
 
 // Start a new hash
 void SHA1::init() {
-  len0=len1=0;
+  len=0;
   h[0]=0x67452301;
   h[1]=0xEFCDAB89;
   h[2]=0x98BADCFE;
   h[3]=0x10325476;
   h[4]=0xC3D2E1F0;
+  memset(w, 0, sizeof(w));
 }
 
 // Return old result and start a new hash
 const char* SHA1::result() {
 
   // pad and append length
-  const U32 s1=len1, s0=len0;
+  const U64 s=len;
   put(0x80);
-  while ((len0&511)!=448)
+  while ((len&511)!=448)
     put(0);
-  put(s1>>24);
-  put(s1>>16);
-  put(s1>>8);
-  put(s1);
-  put(s0>>24);
-  put(s0>>16);
-  put(s0>>8);
-  put(s0);
+  put(s>>56);
+  put(s>>48);
+  put(s>>40);
+  put(s>>32);
+  put(s>>24);
+  put(s>>16);
+  put(s>>8);
+  put(s);
 
   // copy h to hbuf
   for (int i=0; i<5; ++i) {
@@ -147,38 +154,565 @@ const char* SHA1::result() {
   return hbuf;
 }
 
+// Hash buf[0..n-1]
+void SHA1::write(const char* buf, int64_t n) {
+  const unsigned char* p=(const unsigned char*) buf;
+  for (; n>0 && (U32(len)&511)!=0; --n) put(*p++);
+  for (; n>=64; n-=64) {
+    for (int i=0; i<16; ++i)
+      w[i]=p[0]<<24|p[1]<<16|p[2]<<8|p[3], p+=4;
+    len+=512;
+    process();
+  }
+  for (; n>0; --n) put(*p++);
+}
+
 // Hash 1 block of 64 bytes
 void SHA1::process() {
-  for (int i=16; i<80; ++i) {
-    w[i]=w[i-3]^w[i-8]^w[i-14]^w[i-16];
-    w[i]=w[i]<<1|w[i]>>31;
+  U32 a=h[0], b=h[1], c=h[2], d=h[3], e=h[4];
+  static const U32 k[4]={0x5A827999, 0x6ED9EBA1, 0x8F1BBCDC, 0xCA62C1D6};
+  #define f(a,b,c,d,e,i) \
+    if (i>=16) \
+      w[(i)&15]^=w[(i-3)&15]^w[(i-8)&15]^w[(i-14)&15], \
+      w[(i)&15]=w[(i)&15]<<1|w[(i)&15]>>31; \
+    e+=(a<<5|a>>27)+k[(i)/20]+w[(i)&15] \
+      +((i)%40>=20 ? b^c^d : i>=40 ? (b&c)|(d&(b|c)) : d^(b&(c^d))); \
+    b=b<<30|b>>2;
+  #define r(i) f(a,b,c,d,e,i) f(e,a,b,c,d,i+1) f(d,e,a,b,c,i+2) \
+               f(c,d,e,a,b,i+3) f(b,c,d,e,a,i+4)
+  r(0)  r(5)  r(10) r(15) r(20) r(25) r(30) r(35)
+  r(40) r(45) r(50) r(55) r(60) r(65) r(70) r(75)
+  #undef f
+  #undef r
+  h[0]+=a; h[1]+=b; h[2]+=c; h[3]+=d; h[4]+=e;
+}
+
+//////////////////////////// SHA256 //////////////////////////
+
+void SHA256::init() {
+  len0=len1=0;
+  s[0]=0x6a09e667;
+  s[1]=0xbb67ae85;
+  s[2]=0x3c6ef372;
+  s[3]=0xa54ff53a;
+  s[4]=0x510e527f;
+  s[5]=0x9b05688c;
+  s[6]=0x1f83d9ab;
+  s[7]=0x5be0cd19;
+  memset(w, 0, sizeof(w));
+}
+
+void SHA256::process() {
+
+  #define ror(a,b) ((a)>>(b)|(a<<(32-(b))))
+
+  #define m(i) \
+     w[(i)&15]+=w[(i-7)&15] \
+       +(ror(w[(i-15)&15],7)^ror(w[(i-15)&15],18)^(w[(i-15)&15]>>3)) \
+       +(ror(w[(i-2)&15],17)^ror(w[(i-2)&15],19)^(w[(i-2)&15]>>10))
+
+  #define r(a,b,c,d,e,f,g,h,i) { \
+    unsigned t1=ror(e,14)^e; \
+    t1=ror(t1,5)^e; \
+    h+=ror(t1,6)+((e&f)^(~e&g))+k[i]+w[(i)&15]; } \
+    d+=h; \
+    {unsigned t1=ror(a,9)^a; \
+    t1=ror(t1,11)^a; \
+    h+=ror(t1,2)+((a&b)^(c&(a^b))); }
+
+  #define mr(a,b,c,d,e,f,g,h,i) m(i); r(a,b,c,d,e,f,g,h,i);
+
+  #define r8(i) \
+    r(a,b,c,d,e,f,g,h,i);   \
+    r(h,a,b,c,d,e,f,g,i+1); \
+    r(g,h,a,b,c,d,e,f,i+2); \
+    r(f,g,h,a,b,c,d,e,i+3); \
+    r(e,f,g,h,a,b,c,d,i+4); \
+    r(d,e,f,g,h,a,b,c,i+5); \
+    r(c,d,e,f,g,h,a,b,i+6); \
+    r(b,c,d,e,f,g,h,a,i+7);
+
+  #define mr8(i) \
+    mr(a,b,c,d,e,f,g,h,i);   \
+    mr(h,a,b,c,d,e,f,g,i+1); \
+    mr(g,h,a,b,c,d,e,f,i+2); \
+    mr(f,g,h,a,b,c,d,e,i+3); \
+    mr(e,f,g,h,a,b,c,d,i+4); \
+    mr(d,e,f,g,h,a,b,c,i+5); \
+    mr(c,d,e,f,g,h,a,b,i+6); \
+    mr(b,c,d,e,f,g,h,a,i+7);
+
+  static const unsigned k[64]={
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};
+
+  unsigned a=s[0];
+  unsigned b=s[1];
+  unsigned c=s[2];
+  unsigned d=s[3];
+  unsigned e=s[4];
+  unsigned f=s[5];
+  unsigned g=s[6];
+  unsigned h=s[7];
+
+  r8(0);
+  r8(8);
+  mr8(16);
+  mr8(24);
+  mr8(32);
+  mr8(40);
+  mr8(48);
+  mr8(56);
+
+  s[0]+=a;
+  s[1]+=b;
+  s[2]+=c;
+  s[3]+=d;
+  s[4]+=e;
+  s[5]+=f;
+  s[6]+=g;
+  s[7]+=h;
+
+  #undef mr8
+  #undef r8
+  #undef mr
+  #undef r
+  #undef m
+  #undef ror
+};
+
+// Return old result and start a new hash
+const char* SHA256::result() {
+
+  // pad and append length
+  const unsigned s1=len1, s0=len0;
+  put(0x80);
+  while ((len0&511)!=448) put(0);
+  put(s1>>24);
+  put(s1>>16);
+  put(s1>>8);
+  put(s1);
+  put(s0>>24);
+  put(s0>>16);
+  put(s0>>8);
+  put(s0);
+
+  // copy s to hbuf
+  for (int i=0; i<8; ++i) {
+    hbuf[4*i]=s[i]>>24;
+    hbuf[4*i+1]=s[i]>>16;
+    hbuf[4*i+2]=s[i]>>8;
+    hbuf[4*i+3]=s[i];
   }
-  U32 a=h[0];
-  U32 b=h[1];
-  U32 c=h[2];
-  U32 d=h[3];
-  U32 e=h[4];
-  const U32 k1=0x5A827999, k2=0x6ED9EBA1, k3=0x8F1BBCDC, k4=0xCA62C1D6;
-#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+((b&c)|(~b&d))+k1+w[i]; b=b<<30|b>>2;
-#define f5(i) f1(a,b,c,d,e,i) f1(e,a,b,c,d,i+1) f1(d,e,a,b,c,i+2) \
-              f1(c,d,e,a,b,i+3) f1(b,c,d,e,a,i+4)
-  f5(0) f5(5) f5(10) f5(15)
-#undef f1
-#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k2+w[i]; b=b<<30|b>>2;
-  f5(20) f5(25) f5(30) f5(35)
-#undef f1
-#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+((b&c)|(b&d)|(c&d))+k3+w[i]; b=b<<30|b>>2;
-  f5(40) f5(45) f5(50) f5(55)
-#undef f1
-#define f1(a,b,c,d,e,i) e+=(a<<5|a>>27)+(b^c^d)+k4+w[i]; b=b<<30|b>>2;
-  f5(60) f5(65) f5(70) f5(75)
-#undef f1
-#undef f5
-  h[0]+=a;
-  h[1]+=b;
-  h[2]+=c;
-  h[3]+=d;
-  h[4]+=e;
+
+  // return hash prior to clearing state
+  init();
+  return hbuf;
+}
+
+//////////////////////////// AES /////////////////////////////
+
+// Some AES code is derived from libtomcrypt 1.17 (public domain).
+
+#define Te4_0 0x000000FF & Te4
+#define Te4_1 0x0000FF00 & Te4
+#define Te4_2 0x00FF0000 & Te4
+#define Te4_3 0xFF000000 & Te4
+
+// Extract byte n of x
+static inline unsigned byte(unsigned x, unsigned n) {return (x>>(8*n))&255;}
+
+// x = y[0..3] MSB first
+static inline void LOAD32H(U32& x, const char* y) {
+  const unsigned char* u=(const unsigned char*)y;
+  x=u[0]<<24|u[1]<<16|u[2]<<8|u[3];
+}
+
+// y[0..3] = x MSB first
+static inline void STORE32H(U32& x, unsigned char* y) {
+  y[0]=x>>24;
+  y[1]=x>>16;
+  y[2]=x>>8;
+  y[3]=x;
+}
+
+#define setup_mix(temp) \
+  ((Te4_3[byte(temp, 2)]) ^ (Te4_2[byte(temp, 1)]) ^ \
+   (Te4_1[byte(temp, 0)]) ^ (Te4_0[byte(temp, 3)]))
+
+// Initialize encryption tables and round key. keylen is 16, 24, or 32.
+AES_CTR::AES_CTR(const char* key, int keylen, const char* iv) {
+  assert(key  != NULL);
+  assert(keylen==16 || keylen==24 || keylen==32);
+
+  // Initialize IV (default 0)
+  iv0=iv1=0;
+  if (iv) {
+    LOAD32H(iv0, iv);
+    LOAD32H(iv1, iv+4);
+  }
+
+  // Initialize encryption tables
+  for (int i=0; i<256; ++i) {
+    unsigned s1=
+    "\x63\x7c\x77\x7b\xf2\x6b\x6f\xc5\x30\x01\x67\x2b\xfe\xd7\xab\x76"
+    "\xca\x82\xc9\x7d\xfa\x59\x47\xf0\xad\xd4\xa2\xaf\x9c\xa4\x72\xc0"
+    "\xb7\xfd\x93\x26\x36\x3f\xf7\xcc\x34\xa5\xe5\xf1\x71\xd8\x31\x15"
+    "\x04\xc7\x23\xc3\x18\x96\x05\x9a\x07\x12\x80\xe2\xeb\x27\xb2\x75"
+    "\x09\x83\x2c\x1a\x1b\x6e\x5a\xa0\x52\x3b\xd6\xb3\x29\xe3\x2f\x84"
+    "\x53\xd1\x00\xed\x20\xfc\xb1\x5b\x6a\xcb\xbe\x39\x4a\x4c\x58\xcf"
+    "\xd0\xef\xaa\xfb\x43\x4d\x33\x85\x45\xf9\x02\x7f\x50\x3c\x9f\xa8"
+    "\x51\xa3\x40\x8f\x92\x9d\x38\xf5\xbc\xb6\xda\x21\x10\xff\xf3\xd2"
+    "\xcd\x0c\x13\xec\x5f\x97\x44\x17\xc4\xa7\x7e\x3d\x64\x5d\x19\x73"
+    "\x60\x81\x4f\xdc\x22\x2a\x90\x88\x46\xee\xb8\x14\xde\x5e\x0b\xdb"
+    "\xe0\x32\x3a\x0a\x49\x06\x24\x5c\xc2\xd3\xac\x62\x91\x95\xe4\x79"
+    "\xe7\xc8\x37\x6d\x8d\xd5\x4e\xa9\x6c\x56\xf4\xea\x65\x7a\xae\x08"
+    "\xba\x78\x25\x2e\x1c\xa6\xb4\xc6\xe8\xdd\x74\x1f\x4b\xbd\x8b\x8a"
+    "\x70\x3e\xb5\x66\x48\x03\xf6\x0e\x61\x35\x57\xb9\x86\xc1\x1d\x9e"
+    "\xe1\xf8\x98\x11\x69\xd9\x8e\x94\x9b\x1e\x87\xe9\xce\x55\x28\xdf"
+    "\x8c\xa1\x89\x0d\xbf\xe6\x42\x68\x41\x99\x2d\x0f\xb0\x54\xbb\x16"
+    [i]&255;
+    unsigned s2=s1<<1;
+    if (s2>=0x100) s2^=0x11b;
+    unsigned s3=s1^s2;
+    Te0[i]=s2<<24|s1<<16|s1<<8|s3;
+    Te1[i]=s3<<24|s2<<16|s1<<8|s1;
+    Te2[i]=s1<<24|s3<<16|s2<<8|s1;
+    Te3[i]=s1<<24|s1<<16|s3<<8|s2;
+    Te4[i]=s1<<24|s1<<16|s1<<8|s1;
+  }
+
+  // setup the forward key
+  Nr = 10 + ((keylen/8)-2)*2;  // 10, 12, or 14 rounds
+  int i = 0;
+  U32* rk = &ek[0];
+  U32 temp;
+  static const U32 rcon[10] = {
+    0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL,
+    0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL,
+    0x1B000000UL, 0x36000000UL};  // round constants
+
+  LOAD32H(rk[0], key   );
+  LOAD32H(rk[1], key +  4);
+  LOAD32H(rk[2], key +  8);
+  LOAD32H(rk[3], key + 12);
+  if (keylen == 16) {
+    for (;;) {
+      temp  = rk[3];
+      rk[4] = rk[0] ^ setup_mix(temp) ^ rcon[i];
+      rk[5] = rk[1] ^ rk[4];
+      rk[6] = rk[2] ^ rk[5];
+      rk[7] = rk[3] ^ rk[6];
+      if (++i == 10) {
+         break;
+      }
+      rk += 4;
+    }
+  }
+  else if (keylen == 24) {
+    LOAD32H(rk[4], key + 16);
+    LOAD32H(rk[5], key + 20);
+    for (;;) {
+      temp = rk[5];
+      rk[ 6] = rk[ 0] ^ setup_mix(temp) ^ rcon[i];
+      rk[ 7] = rk[ 1] ^ rk[ 6];
+      rk[ 8] = rk[ 2] ^ rk[ 7];
+      rk[ 9] = rk[ 3] ^ rk[ 8];
+      if (++i == 8) {
+        break;
+      }
+      rk[10] = rk[ 4] ^ rk[ 9];
+      rk[11] = rk[ 5] ^ rk[10];
+      rk += 6;
+    }
+  }
+  else if (keylen == 32) {
+    LOAD32H(rk[4], key + 16);
+    LOAD32H(rk[5], key + 20);
+    LOAD32H(rk[6], key + 24);
+    LOAD32H(rk[7], key + 28);
+    for (;;) {
+      temp = rk[7];
+      rk[ 8] = rk[ 0] ^ setup_mix(temp) ^ rcon[i];
+      rk[ 9] = rk[ 1] ^ rk[ 8];
+      rk[10] = rk[ 2] ^ rk[ 9];
+      rk[11] = rk[ 3] ^ rk[10];
+      if (++i == 7) {
+        break;
+      }
+      temp = rk[11];
+      rk[12] = rk[ 4] ^ setup_mix(temp<<24|temp>>8);
+      rk[13] = rk[ 5] ^ rk[12];
+      rk[14] = rk[ 6] ^ rk[13];
+      rk[15] = rk[ 7] ^ rk[14];
+      rk += 8;
+    }
+  }
+}
+
+// Encrypt to ct[16]
+void AES_CTR::encrypt(U32 s0, U32 s1, U32 s2, U32 s3, unsigned char* ct) {
+  int r = Nr >> 1;
+  U32 *rk = &ek[0];
+  U32 t0=0, t1=0, t2=0, t3=0;
+  s0 ^= rk[0];
+  s1 ^= rk[1];
+  s2 ^= rk[2];
+  s3 ^= rk[3];
+  for (;;) {
+    t0 =
+      Te0[byte(s0, 3)] ^
+      Te1[byte(s1, 2)] ^
+      Te2[byte(s2, 1)] ^
+      Te3[byte(s3, 0)] ^
+      rk[4];
+    t1 =
+      Te0[byte(s1, 3)] ^
+      Te1[byte(s2, 2)] ^
+      Te2[byte(s3, 1)] ^
+      Te3[byte(s0, 0)] ^
+      rk[5];
+    t2 =
+      Te0[byte(s2, 3)] ^
+      Te1[byte(s3, 2)] ^
+      Te2[byte(s0, 1)] ^
+      Te3[byte(s1, 0)] ^
+      rk[6];
+    t3 =
+      Te0[byte(s3, 3)] ^
+      Te1[byte(s0, 2)] ^
+      Te2[byte(s1, 1)] ^
+      Te3[byte(s2, 0)] ^
+      rk[7];
+
+    rk += 8;
+    if (--r == 0) {
+      break;
+    }
+
+    s0 =
+      Te0[byte(t0, 3)] ^
+      Te1[byte(t1, 2)] ^
+      Te2[byte(t2, 1)] ^
+      Te3[byte(t3, 0)] ^
+      rk[0];
+    s1 =
+      Te0[byte(t1, 3)] ^
+      Te1[byte(t2, 2)] ^
+      Te2[byte(t3, 1)] ^
+      Te3[byte(t0, 0)] ^
+      rk[1];
+    s2 =
+      Te0[byte(t2, 3)] ^
+      Te1[byte(t3, 2)] ^
+      Te2[byte(t0, 1)] ^
+      Te3[byte(t1, 0)] ^
+      rk[2];
+    s3 =
+      Te0[byte(t3, 3)] ^
+      Te1[byte(t0, 2)] ^
+      Te2[byte(t1, 1)] ^
+      Te3[byte(t2, 0)] ^
+      rk[3];
+  }
+
+  // apply last round and map cipher state to byte array block:
+  s0 =
+    (Te4_3[byte(t0, 3)]) ^
+    (Te4_2[byte(t1, 2)]) ^
+    (Te4_1[byte(t2, 1)]) ^
+    (Te4_0[byte(t3, 0)]) ^
+    rk[0];
+  STORE32H(s0, ct);
+  s1 =
+    (Te4_3[byte(t1, 3)]) ^
+    (Te4_2[byte(t2, 2)]) ^
+    (Te4_1[byte(t3, 1)]) ^
+    (Te4_0[byte(t0, 0)]) ^
+    rk[1];
+  STORE32H(s1, ct+4);
+  s2 =
+    (Te4_3[byte(t2, 3)]) ^
+    (Te4_2[byte(t3, 2)]) ^
+    (Te4_1[byte(t0, 1)]) ^
+    (Te4_0[byte(t1, 0)]) ^
+    rk[2];
+  STORE32H(s2, ct+8);
+  s3 =
+    (Te4_3[byte(t3, 3)]) ^
+    (Te4_2[byte(t0, 2)]) ^
+    (Te4_1[byte(t1, 1)]) ^
+    (Te4_0[byte(t2, 0)]) ^ 
+    rk[3];
+  STORE32H(s3, ct+12);
+}
+
+// Encrypt or decrypt slice buf[0..n-1] at offset by XOR with AES(i) where
+// i is the 128 bit big-endian distance from the start in 16 byte blocks.
+void AES_CTR::encrypt(char* buf, int n, U64 offset) {
+  for (U64 i=offset/16; i<=(offset+n)/16; ++i) {
+    unsigned char ct[16];
+    encrypt(iv0, iv1, i>>32, i, ct);
+    for (int j=0; j<16; ++j) {
+      const int k=i*16-offset+j;
+      if (k>=0 && k<n)
+        buf[k]^=ct[j];
+    }
+  }
+}
+
+#undef setup_mix
+#undef Te4_3
+#undef Te4_2
+#undef Te4_1
+#undef Te4_0
+
+//////////////////////////// stretchKey //////////////////////
+
+// PBKDF2(pw[0..pwlen], salt[0..saltlen], c) to buf[0..dkLen-1]
+// using HMAC-SHA256, for the special case of c = 1 iterations
+// output size dkLen a multiple of 32, and pwLen <= 64.
+static void pbkdf2(const char* pw, int pwLen, const char* salt, int saltLen,
+                   int c, char* buf, int dkLen) {
+  assert(c==1);
+  assert(dkLen%32==0);
+  assert(pwLen<=64);
+
+  libzpaq::SHA256 sha256;
+  char b[32];
+  for (int i=1; i*32<=dkLen; ++i) {
+    for (int j=0; j<pwLen; ++j) sha256.put(pw[j]^0x36);
+    for (int j=pwLen; j<64; ++j) sha256.put(0x36);
+    for (int j=0; j<saltLen; ++j) sha256.put(salt[j]);
+    for (int j=24; j>=0; j-=8) sha256.put(i>>j);
+    memcpy(b, sha256.result(), 32);
+    for (int j=0; j<pwLen; ++j) sha256.put(pw[j]^0x5c);
+    for (int j=pwLen; j<64; ++j) sha256.put(0x5c);
+    for (int j=0; j<32; ++j) sha256.put(b[j]);
+    memcpy(buf+i*32-32, sha256.result(), 32);
+  }
+}
+
+// Hash b[0..15] using 8 rounds of salsa20
+// Modified from http://cr.yp.to/salsa20.html (public domain) to 8 rounds
+static void salsa8(U32* b) {
+  unsigned x[16]={0};
+  memcpy(x, b, 64);
+  for (int i=0; i<4; ++i) {
+    #define R(a,b) (((a)<<(b))+((a)>>(32-b)))
+    x[ 4] ^= R(x[ 0]+x[12], 7);  x[ 8] ^= R(x[ 4]+x[ 0], 9);
+    x[12] ^= R(x[ 8]+x[ 4],13);  x[ 0] ^= R(x[12]+x[ 8],18);
+    x[ 9] ^= R(x[ 5]+x[ 1], 7);  x[13] ^= R(x[ 9]+x[ 5], 9);
+    x[ 1] ^= R(x[13]+x[ 9],13);  x[ 5] ^= R(x[ 1]+x[13],18);
+    x[14] ^= R(x[10]+x[ 6], 7);  x[ 2] ^= R(x[14]+x[10], 9);
+    x[ 6] ^= R(x[ 2]+x[14],13);  x[10] ^= R(x[ 6]+x[ 2],18);
+    x[ 3] ^= R(x[15]+x[11], 7);  x[ 7] ^= R(x[ 3]+x[15], 9);
+    x[11] ^= R(x[ 7]+x[ 3],13);  x[15] ^= R(x[11]+x[ 7],18);
+    x[ 1] ^= R(x[ 0]+x[ 3], 7);  x[ 2] ^= R(x[ 1]+x[ 0], 9);
+    x[ 3] ^= R(x[ 2]+x[ 1],13);  x[ 0] ^= R(x[ 3]+x[ 2],18);
+    x[ 6] ^= R(x[ 5]+x[ 4], 7);  x[ 7] ^= R(x[ 6]+x[ 5], 9);
+    x[ 4] ^= R(x[ 7]+x[ 6],13);  x[ 5] ^= R(x[ 4]+x[ 7],18);
+    x[11] ^= R(x[10]+x[ 9], 7);  x[ 8] ^= R(x[11]+x[10], 9);
+    x[ 9] ^= R(x[ 8]+x[11],13);  x[10] ^= R(x[ 9]+x[ 8],18);
+    x[12] ^= R(x[15]+x[14], 7);  x[13] ^= R(x[12]+x[15], 9);
+    x[14] ^= R(x[13]+x[12],13);  x[15] ^= R(x[14]+x[13],18);
+    #undef R
+  }
+  for (int i=0; i<16; ++i) b[i]+=x[i];
+}
+
+// BlockMix_{Salsa20/8, r} on b[0..128*r-1]
+static void blockmix(U32* b, int r) {
+  assert(r<=8);
+  U32 x[16];
+  U32 y[256];
+  memcpy(x, b+32*r-16, 64);
+  for (int i=0; i<2*r; ++i) {
+    for (int j=0; j<16; ++j) x[j]^=b[i*16+j];
+    salsa8(x);
+    memcpy(&y[i*16], x, 64);
+  }
+  for (int i=0; i<r; ++i) memcpy(b+i*16, &y[i*32], 64);
+  for (int i=0; i<r; ++i) memcpy(b+(i+r)*16, &y[i*32+16], 64);
+}
+
+// Mix b[0..128*r-1]. Uses 128*r*n bytes of memory and O(r*n) time
+static void smix(char* b, int r, int n) {
+  libzpaq::Array<U32> x(32*r), v(32*r*n);
+  for (int i=0; i<r*128; ++i) x[i/4]+=(b[i]&255)<<i%4*8;
+  for (int i=0; i<n; ++i) {
+    memcpy(&v[i*r*32], &x[0], r*128);
+    blockmix(&x[0], r);
+  }
+  for (int i=0; i<n; ++i) {
+    U32 j=x[(2*r-1)*16]&(n-1);
+    for (int k=0; k<r*32; ++k) x[k]^=v[j*r*32+k];
+    blockmix(&x[0], r);
+  }
+  for (int i=0; i<r*128; ++i) b[i]=x[i/4]>>(i%4*8);
+}
+
+// Strengthen password pw[0..pwlen-1] and salt[0..saltlen-1]
+// to produce key buf[0..buflen-1]. Uses O(n*r*p) time and 128*r*n bytes
+// of memory. n must be a power of 2 and r <= 8.
+void scrypt(const char* pw, int pwlen,
+            const char* salt, int saltlen,
+            int n, int r, int p, char* buf, int buflen) {
+  assert(r<=8);
+  assert(n>0 && (n&(n-1))==0);  // power of 2?
+  libzpaq::Array<char> b(p*r*128);
+  pbkdf2(pw, pwlen, salt, saltlen, 1, &b[0], p*r*128);
+  for (int i=0; i<p; ++i) smix(&b[i*r*128], r, n);
+  pbkdf2(pw, pwlen, &b[0], p*r*128, 1, buf, buflen);
+}
+
+// Stretch key in[0..31], assumed to be SHA256(password), with
+// NUL terminate salt to produce new key out[0..31]
+void stretchKey(char* out, const char* in, const char* salt) {
+  scrypt(in, 32, salt, 32, 1<<14, 8, 1, out, 32);
+}
+
+//////////////////////////// random //////////////////////////
+
+// Put n cryptographic random bytes in buf[0..n-1].
+// The first byte will not be 'z' or '7' (start of a ZPAQ archive).
+// For a pure random number, discard the first byte.
+// In VC++, must link to advapi32.lib.
+
+void random(char* buf, int n) {
+#ifdef unix
+  FILE* in=fopen("/dev/urandom", "rb");
+  if (in && int(fread(buf, 1, n, in))==n)
+    fclose(in);
+  else {
+    error("key generation failed");
+  }
+#else
+  HCRYPTPROV h;
+  if (CryptAcquireContext(&h, NULL, NULL, PROV_RSA_FULL,
+      CRYPT_VERIFYCONTEXT) && CryptGenRandom(h, n, (BYTE*)buf))
+    CryptReleaseContext(h, 0);
+  else {
+    fprintf(stderr, "CryptGenRandom: error %d\n", int(GetLastError()));
+    error("key generation failed");
+  }
+#endif
+  if (n>=1 && (buf[0]=='z' || buf[0]=='7'))
+    buf[0]^=0x80;
 }
 
 //////////////////////////// Component ///////////////////////
@@ -197,93 +731,143 @@ void Component::init() {
   a16.resize(0);
 }
 
-////////////////////////// StateTable //////////////////////////
+////////////////////////// StateTable ////////////////////////
 
-// How many states with count of n0 zeros, n1 ones (0...2)
-int StateTable::num_states(int n0, int n1) {
-  const int B=6;
-  const int bound[B]={20,48,15,8,6,5}; // n0 -> max n1, n1 -> max n0
-  if (n0<n1) return num_states(n1, n0);
-  if (n0<0 || n1<0 || n1>=B || n0>bound[n1]) return 0;
-  return 1+(n1>0 && n0+n1<=17);
-}
-
-// New value of count n0 if 1 is observed (and vice versa)
-void StateTable::discount(int& n0) {
-  n0=(n0>=1)+(n0>=2)+(n0>=3)+(n0>=4)+(n0>=5)+(n0>=7)+(n0>=8);
-}
-
-// compute next n0,n1 (0 to N) given input y (0 or 1)
-void StateTable::next_state(int& n0, int& n1, int y) {
-  if (n0<n1)
-    next_state(n1, n0, 1-y);
-  else {
-    if (y) {
-      ++n1;
-      discount(n0);
-    }
-    else {
-      ++n0;
-      discount(n1);
-    }
-    // 20,0,0 -> 20,0
-    // 48,1,0 -> 48,1
-    // 15,2,0 -> 8,1
-    //  8,3,0 -> 6,2
-    //  8,3,1 -> 5,3
-    //  6,4,0 -> 5,3
-    //  5,5,0 -> 5,4
-    //  5,5,1 -> 4,5
-    while (!num_states(n0, n1)) {
-      if (n1<2) --n0;
-      else {
-        n0=(n0*(n1-1)+(n1/2))/n1;
-        --n1;
-      }
-    }
-  }
-}
+// sns[i*4] -> next state if 0, next state if 1, n0, n1
+static const U8 sns[1024]={
+     1,     2,     0,     0,     3,     5,     1,     0,
+     4,     6,     0,     1,     7,     9,     2,     0,
+     8,    11,     1,     1,     8,    11,     1,     1,
+    10,    12,     0,     2,    13,    15,     3,     0,
+    14,    17,     2,     1,    14,    17,     2,     1,
+    16,    19,     1,     2,    16,    19,     1,     2,
+    18,    20,     0,     3,    21,    23,     4,     0,
+    22,    25,     3,     1,    22,    25,     3,     1,
+    24,    27,     2,     2,    24,    27,     2,     2,
+    26,    29,     1,     3,    26,    29,     1,     3,
+    28,    30,     0,     4,    31,    33,     5,     0,
+    32,    35,     4,     1,    32,    35,     4,     1,
+    34,    37,     3,     2,    34,    37,     3,     2,
+    36,    39,     2,     3,    36,    39,     2,     3,
+    38,    41,     1,     4,    38,    41,     1,     4,
+    40,    42,     0,     5,    43,    33,     6,     0,
+    44,    47,     5,     1,    44,    47,     5,     1,
+    46,    49,     4,     2,    46,    49,     4,     2,
+    48,    51,     3,     3,    48,    51,     3,     3,
+    50,    53,     2,     4,    50,    53,     2,     4,
+    52,    55,     1,     5,    52,    55,     1,     5,
+    40,    56,     0,     6,    57,    45,     7,     0,
+    58,    47,     6,     1,    58,    47,     6,     1,
+    60,    63,     5,     2,    60,    63,     5,     2,
+    62,    65,     4,     3,    62,    65,     4,     3,
+    64,    67,     3,     4,    64,    67,     3,     4,
+    66,    69,     2,     5,    66,    69,     2,     5,
+    52,    71,     1,     6,    52,    71,     1,     6,
+    54,    72,     0,     7,    73,    59,     8,     0,
+    74,    61,     7,     1,    74,    61,     7,     1,
+    76,    63,     6,     2,    76,    63,     6,     2,
+    78,    81,     5,     3,    78,    81,     5,     3,
+    80,    83,     4,     4,    80,    83,     4,     4,
+    82,    85,     3,     5,    82,    85,     3,     5,
+    66,    87,     2,     6,    66,    87,     2,     6,
+    68,    89,     1,     7,    68,    89,     1,     7,
+    70,    90,     0,     8,    91,    59,     9,     0,
+    92,    77,     8,     1,    92,    77,     8,     1,
+    94,    79,     7,     2,    94,    79,     7,     2,
+    96,    81,     6,     3,    96,    81,     6,     3,
+    98,   101,     5,     4,    98,   101,     5,     4,
+   100,   103,     4,     5,   100,   103,     4,     5,
+    82,   105,     3,     6,    82,   105,     3,     6,
+    84,   107,     2,     7,    84,   107,     2,     7,
+    86,   109,     1,     8,    86,   109,     1,     8,
+    70,   110,     0,     9,   111,    59,    10,     0,
+   112,    77,     9,     1,   112,    77,     9,     1,
+   114,    97,     8,     2,   114,    97,     8,     2,
+   116,    99,     7,     3,   116,    99,     7,     3,
+    62,   101,     6,     4,    62,   101,     6,     4,
+    80,    83,     5,     5,    80,    83,     5,     5,
+   100,    67,     4,     6,   100,    67,     4,     6,
+   102,   119,     3,     7,   102,   119,     3,     7,
+   104,   121,     2,     8,   104,   121,     2,     8,
+    86,   123,     1,     9,    86,   123,     1,     9,
+    70,   124,     0,    10,   125,    59,    11,     0,
+   126,    77,    10,     1,   126,    77,    10,     1,
+   128,    97,     9,     2,   128,    97,     9,     2,
+    60,    63,     8,     3,    60,    63,     8,     3,
+    66,    69,     3,     8,    66,    69,     3,     8,
+   104,   131,     2,     9,   104,   131,     2,     9,
+    86,   133,     1,    10,    86,   133,     1,    10,
+    70,   134,     0,    11,   135,    59,    12,     0,
+   136,    77,    11,     1,   136,    77,    11,     1,
+   138,    97,    10,     2,   138,    97,    10,     2,
+   104,   141,     2,    10,   104,   141,     2,    10,
+    86,   143,     1,    11,    86,   143,     1,    11,
+    70,   144,     0,    12,   145,    59,    13,     0,
+   146,    77,    12,     1,   146,    77,    12,     1,
+   148,    97,    11,     2,   148,    97,    11,     2,
+   104,   151,     2,    11,   104,   151,     2,    11,
+    86,   153,     1,    12,    86,   153,     1,    12,
+    70,   154,     0,    13,   155,    59,    14,     0,
+   156,    77,    13,     1,   156,    77,    13,     1,
+   158,    97,    12,     2,   158,    97,    12,     2,
+   104,   161,     2,    12,   104,   161,     2,    12,
+    86,   163,     1,    13,    86,   163,     1,    13,
+    70,   164,     0,    14,   165,    59,    15,     0,
+   166,    77,    14,     1,   166,    77,    14,     1,
+   168,    97,    13,     2,   168,    97,    13,     2,
+   104,   171,     2,    13,   104,   171,     2,    13,
+    86,   173,     1,    14,    86,   173,     1,    14,
+    70,   174,     0,    15,   175,    59,    16,     0,
+   176,    77,    15,     1,   176,    77,    15,     1,
+   178,    97,    14,     2,   178,    97,    14,     2,
+   104,   181,     2,    14,   104,   181,     2,    14,
+    86,   183,     1,    15,    86,   183,     1,    15,
+    70,   184,     0,    16,   185,    59,    17,     0,
+   186,    77,    16,     1,   186,    77,    16,     1,
+    74,    97,    15,     2,    74,    97,    15,     2,
+   104,    89,     2,    15,   104,    89,     2,    15,
+    86,   187,     1,    16,    86,   187,     1,    16,
+    70,   188,     0,    17,   189,    59,    18,     0,
+   190,    77,    17,     1,    86,   191,     1,    17,
+    70,   192,     0,    18,   193,    59,    19,     0,
+   194,    77,    18,     1,    86,   195,     1,    18,
+    70,   196,     0,    19,   193,    59,    20,     0,
+   197,    77,    19,     1,    86,   198,     1,    19,
+    70,   196,     0,    20,   199,    77,    20,     1,
+    86,   200,     1,    20,   201,    77,    21,     1,
+    86,   202,     1,    21,   203,    77,    22,     1,
+    86,   204,     1,    22,   205,    77,    23,     1,
+    86,   206,     1,    23,   207,    77,    24,     1,
+    86,   208,     1,    24,   209,    77,    25,     1,
+    86,   210,     1,    25,   211,    77,    26,     1,
+    86,   212,     1,    26,   213,    77,    27,     1,
+    86,   214,     1,    27,   215,    77,    28,     1,
+    86,   216,     1,    28,   217,    77,    29,     1,
+    86,   218,     1,    29,   219,    77,    30,     1,
+    86,   220,     1,    30,   221,    77,    31,     1,
+    86,   222,     1,    31,   223,    77,    32,     1,
+    86,   224,     1,    32,   225,    77,    33,     1,
+    86,   226,     1,    33,   227,    77,    34,     1,
+    86,   228,     1,    34,   229,    77,    35,     1,
+    86,   230,     1,    35,   231,    77,    36,     1,
+    86,   232,     1,    36,   233,    77,    37,     1,
+    86,   234,     1,    37,   235,    77,    38,     1,
+    86,   236,     1,    38,   237,    77,    39,     1,
+    86,   238,     1,    39,   239,    77,    40,     1,
+    86,   240,     1,    40,   241,    77,    41,     1,
+    86,   242,     1,    41,   243,    77,    42,     1,
+    86,   244,     1,    42,   245,    77,    43,     1,
+    86,   246,     1,    43,   247,    77,    44,     1,
+    86,   248,     1,    44,   249,    77,    45,     1,
+    86,   250,     1,    45,   251,    77,    46,     1,
+    86,   252,     1,    46,   253,    77,    47,     1,
+    86,   254,     1,    47,   253,    77,    48,     1,
+    86,   254,     1,    48,     0,     0,     0,     0
+};
 
 // Initialize next state table ns[state*4] -> next if 0, next if 1, n0, n1
 StateTable::StateTable() {
-
-  // Assign states by increasing priority
-  const int N=50;
-  U8 t[N][N][2]={{{0}}}; // (n0,n1,y) -> state number
-  int state=0;
-  for (int i=0; i<N; ++i) {
-    for (int n1=0; n1<=i; ++n1) {
-      int n0=i-n1;
-      int n=num_states(n0, n1);
-      assert(n>=0 && n<=2);
-      if (n) {
-        t[n0][n1][0]=state;
-        t[n0][n1][1]=state+n-1;
-        state+=n;
-      }
-    }
-  }
-       
-  // Generate next state table
-  memset(ns, 0, sizeof(ns));
-  for (int n0=0; n0<N; ++n0) {
-    for (int n1=0; n1<N; ++n1) {
-      for (int y=0; y<num_states(n0, n1); ++y) {
-        int s=t[n0][n1][y];
-        assert(s>=0 && s<256);
-        int s0=n0, s1=n1;
-        next_state(s0, s1, 0);
-        assert(s0>=0 && s0<N && s1>=0 && s1<N);
-        ns[s*4+0]=t[s0][s1][0];
-        s0=n0, s1=n1;
-        next_state(s0, s1, 1);
-        assert(s0>=0 && s0<N && s1>=0 && s1<N);
-        ns[s*4+1]=t[s0][s1][1];
-        ns[s*4+2]=n0;
-        ns[s*4+3]=n1;
-      }
-    }
-  }
+  memcpy(ns, sns, sizeof(ns));
 }
 
 /////////////////////////// ZPAQL //////////////////////////
@@ -326,11 +910,11 @@ int ZPAQL::read(Reader* in2) {
   int n=header[cend-1];
   for (int i=0; i<n; ++i) {
     int type=in2->get();  // component type
-    if (type==-1) error("unexpected end of file");
+    if (type<0 || type>255) error("unexpected end of file");
     header[cend++]=type;  // component type
     int size=compsize[type];
     if (size<1) error("Invalid component type");
-    if (cend+size>header.isize()-8) error("COMP list too big");
+    if (cend+size>hsize) error("COMP overflows header");
     for (int j=1; j<size; ++j)
       header[cend++]=in2->get();
   }
@@ -338,6 +922,7 @@ int ZPAQL::read(Reader* in2) {
 
   // Insert a guard gap and read HCOMP
   hbegin=hend=cend+128;
+  if (hend>hsize+129) error("missing HCOMP");
   while (hend<hsize+129) {
     assert(hend<header.isize()-8);
     int op=in2->get();
@@ -397,23 +982,30 @@ void ZPAQL::initp() {
 // Flush pending output
 void ZPAQL::flush() {
   if (output) output->write(&outbuf[0], bufptr);
-  if (sha1) for (int i=0; i<bufptr; ++i) sha1->put(U8(outbuf[i]));
+  if (sha1) sha1->write(&outbuf[0], bufptr);
   bufptr=0;
 }
 
+// pow(2, x)
+static double pow2(int x) {
+  double r=1;
+  for (; x>0; x--) r+=r;
+  return r;
+}
+
 // Return memory requirement in bytes
 double ZPAQL::memory() {
-  double mem=pow(2.0,header[2]+2)+pow(2.0,header[3])  // hh hm
-            +pow(2.0,header[4]+2)+pow(2.0,header[5])  // ph pm
+  double mem=pow2(header[2]+2)+pow2(header[3])  // hh hm
+            +pow2(header[4]+2)+pow2(header[5])  // ph pm
             +header.size();
   int cp=7;  // start of comp list
   for (int i=0; i<header[6]; ++i) {  // n
     assert(cp<cend);
-    double size=pow(2.0, header[cp+1]); // sizebits
+    double size=pow2(header[cp+1]); // sizebits
     switch(header[cp]) {
       case CM: mem+=4*size; break;
       case ICM: mem+=64*size+1024; break;
-      case MATCH: mem+=4*size+pow(2.0, header[cp+2]); break; // bufbits
+      case MATCH: mem+=4*size+pow2(header[cp+2]); break; // bufbits
       case MIX2: mem+=2*size; break;
       case MIX: mem+=4*size*header[cp+3]; break; // m
       case ISSE: mem+=64*size+2048; break;
@@ -434,6 +1026,8 @@ void ZPAQL::init(int hbits, int mbits) {
   assert(header[0]+256*header[1]==cend-2+hend-hbegin);
   assert(bufptr==0);
   assert(outbuf.isize()>0);
+  if (hbits>32) error("H too big");
+  if (mbits>32) error("M too big");
   h.resize(1, hbits);
   m.resize(1, mbits);
   r.resize(256);
@@ -540,7 +1134,7 @@ int ZPAQL::execute() {
     case 97: m(b) = b; break; // *B=B
     case 98: m(b) = c; break; // *B=C
     case 99: m(b) = d; break; // *B=D
-    case 100: m(b) = m(b); break; // *B=*B
+    case 100: break; // *B=*B
     case 101: m(b) = m(c); break; // *B=*C
     case 102: m(b) = h(d); break; // *B=*D
     case 103: m(b) = header[pc++]; break; // *B= N
@@ -549,7 +1143,7 @@ int ZPAQL::execute() {
     case 106: m(c) = c; break; // *C=C
     case 107: m(c) = d; break; // *C=D
     case 108: m(c) = m(b); break; // *C=*B
-    case 109: m(c) = m(c); break; // *C=*C
+    case 109: break; // *C=*C
     case 110: m(c) = h(d); break; // *C=*D
     case 111: m(c) = header[pc++]; break; // *C= N
     case 112: h(d) = a; break; // *D=A
@@ -558,7 +1152,7 @@ int ZPAQL::execute() {
     case 115: h(d) = d; break; // *D=D
     case 116: h(d) = m(b); break; // *D=*B
     case 117: h(d) = m(c); break; // *D=*C
-    case 118: h(d) = h(d); break; // *D=*D
+    case 118: break; // *D=*D
     case 119: h(d) = header[pc++]; break; // *D= N
     case 128: a += a; break; // A+=A
     case 129: a += b; break; // A+=B
@@ -648,7 +1242,7 @@ int ZPAQL::execute() {
     case 213: a >>= (m(c)&31); break; // A>>=*C
     case 214: a >>= (h(d)&31); break; // A>>=*D
     case 215: a >>= (header[pc++]&31); break; // A>>= N
-    case 216: f = (true); break; // A==A
+    case 216: f = 1; break; // A==A
     case 217: f = (a == b); break; // A==B
     case 218: f = (a == c); break; // A==C
     case 219: f = (a == d); break; // A==D
@@ -656,7 +1250,7 @@ int ZPAQL::execute() {
     case 221: f = (a == U32(m(c))); break; // A==*C
     case 222: f = (a == h(d)); break; // A==*D
     case 223: f = (a == U32(header[pc++])); break; // A== N
-    case 224: f = (false); break; // A<A
+    case 224: f = 0; break; // A<A
     case 225: f = (a < b); break; // A<B
     case 226: f = (a < c); break; // A<C
     case 227: f = (a < d); break; // A<D
@@ -664,7 +1258,7 @@ int ZPAQL::execute() {
     case 229: f = (a < U32(m(c))); break; // A<*C
     case 230: f = (a < h(d)); break; // A<*D
     case 231: f = (a < U32(header[pc++])); break; // A< N
-    case 232: f = (false); break; // A>A
+    case 232: f = 0; break; // A>A
     case 233: f = (a > b); break; // A>B
     case 234: f = (a > c); break; // A>C
     case 235: f = (a > d); break; // A>D
@@ -685,7 +1279,440 @@ void ZPAQL::err() {
 
 ///////////////////////// Predictor /////////////////////////
 
-// Initailize model-independent tables
+// sdt2k[i]=2048/i;
+static const int sdt2k[256]={
+     0,  2048,  1024,   682,   512,   409,   341,   292,
+   256,   227,   204,   186,   170,   157,   146,   136,
+   128,   120,   113,   107,   102,    97,    93,    89,
+    85,    81,    78,    75,    73,    70,    68,    66,
+    64,    62,    60,    58,    56,    55,    53,    52,
+    51,    49,    48,    47,    46,    45,    44,    43,
+    42,    41,    40,    40,    39,    38,    37,    37,
+    36,    35,    35,    34,    34,    33,    33,    32,
+    32,    31,    31,    30,    30,    29,    29,    28,
+    28,    28,    27,    27,    26,    26,    26,    25,
+    25,    25,    24,    24,    24,    24,    23,    23,
+    23,    23,    22,    22,    22,    22,    21,    21,
+    21,    21,    20,    20,    20,    20,    20,    19,
+    19,    19,    19,    19,    18,    18,    18,    18,
+    18,    18,    17,    17,    17,    17,    17,    17,
+    17,    16,    16,    16,    16,    16,    16,    16,
+    16,    15,    15,    15,    15,    15,    15,    15,
+    15,    14,    14,    14,    14,    14,    14,    14,
+    14,    14,    14,    13,    13,    13,    13,    13,
+    13,    13,    13,    13,    13,    13,    12,    12,
+    12,    12,    12,    12,    12,    12,    12,    12,
+    12,    12,    12,    11,    11,    11,    11,    11,
+    11,    11,    11,    11,    11,    11,    11,    11,
+    11,    11,    11,    10,    10,    10,    10,    10,
+    10,    10,    10,    10,    10,    10,    10,    10,
+    10,    10,    10,    10,    10,     9,     9,     9,
+     9,     9,     9,     9,     9,     9,     9,     9,
+     9,     9,     9,     9,     9,     9,     9,     9,
+     9,     9,     9,     9,     8,     8,     8,     8,
+     8,     8,     8,     8,     8,     8,     8,     8,
+     8,     8,     8,     8,     8,     8,     8,     8,
+     8,     8,     8,     8,     8,     8,     8,     8
+};
+
+// sdt[i]=(1<<17)/(i*2+3)*2;
+static const int sdt[1024]={
+ 87380, 52428, 37448, 29126, 23830, 20164, 17476, 15420,
+ 13796, 12482, 11396, 10484,  9708,  9038,  8456,  7942,
+  7488,  7084,  6720,  6392,  6096,  5824,  5576,  5348,
+  5140,  4946,  4766,  4598,  4442,  4296,  4160,  4032,
+  3912,  3798,  3692,  3590,  3494,  3404,  3318,  3236,
+  3158,  3084,  3012,  2944,  2880,  2818,  2758,  2702,
+  2646,  2594,  2544,  2496,  2448,  2404,  2360,  2318,
+  2278,  2240,  2202,  2166,  2130,  2096,  2064,  2032,
+  2000,  1970,  1940,  1912,  1884,  1858,  1832,  1806,
+  1782,  1758,  1736,  1712,  1690,  1668,  1648,  1628,
+  1608,  1588,  1568,  1550,  1532,  1514,  1496,  1480,
+  1464,  1448,  1432,  1416,  1400,  1386,  1372,  1358,
+  1344,  1330,  1316,  1304,  1290,  1278,  1266,  1254,
+  1242,  1230,  1218,  1208,  1196,  1186,  1174,  1164,
+  1154,  1144,  1134,  1124,  1114,  1106,  1096,  1086,
+  1078,  1068,  1060,  1052,  1044,  1036,  1028,  1020,
+  1012,  1004,   996,   988,   980,   974,   966,   960,
+   952,   946,   938,   932,   926,   918,   912,   906,
+   900,   894,   888,   882,   876,   870,   864,   858,
+   852,   848,   842,   836,   832,   826,   820,   816,
+   810,   806,   800,   796,   790,   786,   782,   776,
+   772,   768,   764,   758,   754,   750,   746,   742,
+   738,   734,   730,   726,   722,   718,   714,   710,
+   706,   702,   698,   694,   690,   688,   684,   680,
+   676,   672,   670,   666,   662,   660,   656,   652,
+   650,   646,   644,   640,   636,   634,   630,   628,
+   624,   622,   618,   616,   612,   610,   608,   604,
+   602,   598,   596,   594,   590,   588,   586,   582,
+   580,   578,   576,   572,   570,   568,   566,   562,
+   560,   558,   556,   554,   550,   548,   546,   544,
+   542,   540,   538,   536,   532,   530,   528,   526,
+   524,   522,   520,   518,   516,   514,   512,   510,
+   508,   506,   504,   502,   500,   498,   496,   494,
+   492,   490,   488,   488,   486,   484,   482,   480,
+   478,   476,   474,   474,   472,   470,   468,   466,
+   464,   462,   462,   460,   458,   456,   454,   454,
+   452,   450,   448,   448,   446,   444,   442,   442,
+   440,   438,   436,   436,   434,   432,   430,   430,
+   428,   426,   426,   424,   422,   422,   420,   418,
+   418,   416,   414,   414,   412,   410,   410,   408,
+   406,   406,   404,   402,   402,   400,   400,   398,
+   396,   396,   394,   394,   392,   390,   390,   388,
+   388,   386,   386,   384,   382,   382,   380,   380,
+   378,   378,   376,   376,   374,   372,   372,   370,
+   370,   368,   368,   366,   366,   364,   364,   362,
+   362,   360,   360,   358,   358,   356,   356,   354,
+   354,   352,   352,   350,   350,   348,   348,   348,
+   346,   346,   344,   344,   342,   342,   340,   340,
+   340,   338,   338,   336,   336,   334,   334,   332,
+   332,   332,   330,   330,   328,   328,   328,   326,
+   326,   324,   324,   324,   322,   322,   320,   320,
+   320,   318,   318,   316,   316,   316,   314,   314,
+   312,   312,   312,   310,   310,   310,   308,   308,
+   308,   306,   306,   304,   304,   304,   302,   302,
+   302,   300,   300,   300,   298,   298,   298,   296,
+   296,   296,   294,   294,   294,   292,   292,   292,
+   290,   290,   290,   288,   288,   288,   286,   286,
+   286,   284,   284,   284,   284,   282,   282,   282,
+   280,   280,   280,   278,   278,   278,   276,   276,
+   276,   276,   274,   274,   274,   272,   272,   272,
+   272,   270,   270,   270,   268,   268,   268,   268,
+   266,   266,   266,   266,   264,   264,   264,   262,
+   262,   262,   262,   260,   260,   260,   260,   258,
+   258,   258,   258,   256,   256,   256,   256,   254,
+   254,   254,   254,   252,   252,   252,   252,   250,
+   250,   250,   250,   248,   248,   248,   248,   248,
+   246,   246,   246,   246,   244,   244,   244,   244,
+   242,   242,   242,   242,   242,   240,   240,   240,
+   240,   238,   238,   238,   238,   238,   236,   236,
+   236,   236,   234,   234,   234,   234,   234,   232,
+   232,   232,   232,   232,   230,   230,   230,   230,
+   230,   228,   228,   228,   228,   228,   226,   226,
+   226,   226,   226,   224,   224,   224,   224,   224,
+   222,   222,   222,   222,   222,   220,   220,   220,
+   220,   220,   220,   218,   218,   218,   218,   218,
+   216,   216,   216,   216,   216,   216,   214,   214,
+   214,   214,   214,   212,   212,   212,   212,   212,
+   212,   210,   210,   210,   210,   210,   210,   208,
+   208,   208,   208,   208,   208,   206,   206,   206,
+   206,   206,   206,   204,   204,   204,   204,   204,
+   204,   204,   202,   202,   202,   202,   202,   202,
+   200,   200,   200,   200,   200,   200,   198,   198,
+   198,   198,   198,   198,   198,   196,   196,   196,
+   196,   196,   196,   196,   194,   194,   194,   194,
+   194,   194,   194,   192,   192,   192,   192,   192,
+   192,   192,   190,   190,   190,   190,   190,   190,
+   190,   188,   188,   188,   188,   188,   188,   188,
+   186,   186,   186,   186,   186,   186,   186,   186,
+   184,   184,   184,   184,   184,   184,   184,   182,
+   182,   182,   182,   182,   182,   182,   182,   180,
+   180,   180,   180,   180,   180,   180,   180,   178,
+   178,   178,   178,   178,   178,   178,   178,   176,
+   176,   176,   176,   176,   176,   176,   176,   176,
+   174,   174,   174,   174,   174,   174,   174,   174,
+   172,   172,   172,   172,   172,   172,   172,   172,
+   172,   170,   170,   170,   170,   170,   170,   170,
+   170,   170,   168,   168,   168,   168,   168,   168,
+   168,   168,   168,   166,   166,   166,   166,   166,
+   166,   166,   166,   166,   166,   164,   164,   164,
+   164,   164,   164,   164,   164,   164,   162,   162,
+   162,   162,   162,   162,   162,   162,   162,   162,
+   160,   160,   160,   160,   160,   160,   160,   160,
+   160,   160,   158,   158,   158,   158,   158,   158,
+   158,   158,   158,   158,   158,   156,   156,   156,
+   156,   156,   156,   156,   156,   156,   156,   154,
+   154,   154,   154,   154,   154,   154,   154,   154,
+   154,   154,   152,   152,   152,   152,   152,   152,
+   152,   152,   152,   152,   152,   150,   150,   150,
+   150,   150,   150,   150,   150,   150,   150,   150,
+   150,   148,   148,   148,   148,   148,   148,   148,
+   148,   148,   148,   148,   148,   146,   146,   146,
+   146,   146,   146,   146,   146,   146,   146,   146,
+   146,   144,   144,   144,   144,   144,   144,   144,
+   144,   144,   144,   144,   144,   142,   142,   142,
+   142,   142,   142,   142,   142,   142,   142,   142,
+   142,   142,   140,   140,   140,   140,   140,   140,
+   140,   140,   140,   140,   140,   140,   140,   138,
+   138,   138,   138,   138,   138,   138,   138,   138,
+   138,   138,   138,   138,   138,   136,   136,   136,
+   136,   136,   136,   136,   136,   136,   136,   136,
+   136,   136,   136,   134,   134,   134,   134,   134,
+   134,   134,   134,   134,   134,   134,   134,   134,
+   134,   132,   132,   132,   132,   132,   132,   132,
+   132,   132,   132,   132,   132,   132,   132,   132,
+   130,   130,   130,   130,   130,   130,   130,   130,
+   130,   130,   130,   130,   130,   130,   130,   128,
+   128,   128,   128,   128,   128,   128,   128,   128,
+   128,   128,   128,   128,   128,   128,   128,   126
+};
+
+// ssquasht[i]=int(32768.0/(1+exp((i-2048)*(-1.0/64))));
+// Middle 1344 of 4096 entries only.
+static const U16 ssquasht[1344]={
+     0,     0,     0,     0,     0,     0,     0,     1,
+     1,     1,     1,     1,     1,     1,     1,     1,
+     1,     1,     1,     1,     1,     1,     1,     1,
+     1,     1,     1,     1,     1,     1,     1,     1,
+     1,     1,     1,     1,     1,     1,     1,     1,
+     1,     1,     1,     1,     1,     1,     1,     1,
+     1,     1,     1,     2,     2,     2,     2,     2,
+     2,     2,     2,     2,     2,     2,     2,     2,
+     2,     2,     2,     2,     2,     2,     2,     2,
+     2,     2,     2,     2,     2,     3,     3,     3,
+     3,     3,     3,     3,     3,     3,     3,     3,
+     3,     3,     3,     3,     3,     3,     3,     3,
+     4,     4,     4,     4,     4,     4,     4,     4,
+     4,     4,     4,     4,     4,     4,     5,     5,
+     5,     5,     5,     5,     5,     5,     5,     5,
+     5,     5,     6,     6,     6,     6,     6,     6,
+     6,     6,     6,     6,     7,     7,     7,     7,
+     7,     7,     7,     7,     8,     8,     8,     8,
+     8,     8,     8,     8,     9,     9,     9,     9,
+     9,     9,    10,    10,    10,    10,    10,    10,
+    10,    11,    11,    11,    11,    11,    12,    12,
+    12,    12,    12,    13,    13,    13,    13,    13,
+    14,    14,    14,    14,    15,    15,    15,    15,
+    15,    16,    16,    16,    17,    17,    17,    17,
+    18,    18,    18,    18,    19,    19,    19,    20,
+    20,    20,    21,    21,    21,    22,    22,    22,
+    23,    23,    23,    24,    24,    25,    25,    25,
+    26,    26,    27,    27,    28,    28,    28,    29,
+    29,    30,    30,    31,    31,    32,    32,    33,
+    33,    34,    34,    35,    36,    36,    37,    37,
+    38,    38,    39,    40,    40,    41,    42,    42,
+    43,    44,    44,    45,    46,    46,    47,    48,
+    49,    49,    50,    51,    52,    53,    54,    54,
+    55,    56,    57,    58,    59,    60,    61,    62,
+    63,    64,    65,    66,    67,    68,    69,    70,
+    71,    72,    73,    74,    76,    77,    78,    79,
+    81,    82,    83,    84,    86,    87,    88,    90,
+    91,    93,    94,    96,    97,    99,   100,   102,
+   103,   105,   107,   108,   110,   112,   114,   115,
+   117,   119,   121,   123,   125,   127,   129,   131,
+   133,   135,   137,   139,   141,   144,   146,   148,
+   151,   153,   155,   158,   160,   163,   165,   168,
+   171,   173,   176,   179,   182,   184,   187,   190,
+   193,   196,   199,   202,   206,   209,   212,   215,
+   219,   222,   226,   229,   233,   237,   240,   244,
+   248,   252,   256,   260,   264,   268,   272,   276,
+   281,   285,   289,   294,   299,   303,   308,   313,
+   318,   323,   328,   333,   338,   343,   349,   354,
+   360,   365,   371,   377,   382,   388,   394,   401,
+   407,   413,   420,   426,   433,   440,   446,   453,
+   460,   467,   475,   482,   490,   497,   505,   513,
+   521,   529,   537,   545,   554,   562,   571,   580,
+   589,   598,   607,   617,   626,   636,   646,   656,
+   666,   676,   686,   697,   708,   719,   730,   741,
+   752,   764,   776,   788,   800,   812,   825,   837,
+   850,   863,   876,   890,   903,   917,   931,   946,
+   960,   975,   990,  1005,  1020,  1036,  1051,  1067,
+  1084,  1100,  1117,  1134,  1151,  1169,  1186,  1204,
+  1223,  1241,  1260,  1279,  1298,  1318,  1338,  1358,
+  1379,  1399,  1421,  1442,  1464,  1486,  1508,  1531,
+  1554,  1577,  1600,  1624,  1649,  1673,  1698,  1724,
+  1749,  1775,  1802,  1829,  1856,  1883,  1911,  1940,
+  1968,  1998,  2027,  2057,  2087,  2118,  2149,  2181,
+  2213,  2245,  2278,  2312,  2345,  2380,  2414,  2450,
+  2485,  2521,  2558,  2595,  2633,  2671,  2709,  2748,
+  2788,  2828,  2869,  2910,  2952,  2994,  3037,  3080,
+  3124,  3168,  3213,  3259,  3305,  3352,  3399,  3447,
+  3496,  3545,  3594,  3645,  3696,  3747,  3799,  3852,
+  3906,  3960,  4014,  4070,  4126,  4182,  4240,  4298,
+  4356,  4416,  4476,  4537,  4598,  4660,  4723,  4786,
+  4851,  4916,  4981,  5048,  5115,  5183,  5251,  5320,
+  5390,  5461,  5533,  5605,  5678,  5752,  5826,  5901,
+  5977,  6054,  6131,  6210,  6289,  6369,  6449,  6530,
+  6613,  6695,  6779,  6863,  6949,  7035,  7121,  7209,
+  7297,  7386,  7476,  7566,  7658,  7750,  7842,  7936,
+  8030,  8126,  8221,  8318,  8415,  8513,  8612,  8712,
+  8812,  8913,  9015,  9117,  9221,  9324,  9429,  9534,
+  9640,  9747,  9854,  9962, 10071, 10180, 10290, 10401,
+ 10512, 10624, 10737, 10850, 10963, 11078, 11192, 11308,
+ 11424, 11540, 11658, 11775, 11893, 12012, 12131, 12251,
+ 12371, 12491, 12612, 12734, 12856, 12978, 13101, 13224,
+ 13347, 13471, 13595, 13719, 13844, 13969, 14095, 14220,
+ 14346, 14472, 14599, 14725, 14852, 14979, 15106, 15233,
+ 15361, 15488, 15616, 15744, 15872, 16000, 16128, 16256,
+ 16384, 16511, 16639, 16767, 16895, 17023, 17151, 17279,
+ 17406, 17534, 17661, 17788, 17915, 18042, 18168, 18295,
+ 18421, 18547, 18672, 18798, 18923, 19048, 19172, 19296,
+ 19420, 19543, 19666, 19789, 19911, 20033, 20155, 20276,
+ 20396, 20516, 20636, 20755, 20874, 20992, 21109, 21227,
+ 21343, 21459, 21575, 21689, 21804, 21917, 22030, 22143,
+ 22255, 22366, 22477, 22587, 22696, 22805, 22913, 23020,
+ 23127, 23233, 23338, 23443, 23546, 23650, 23752, 23854,
+ 23955, 24055, 24155, 24254, 24352, 24449, 24546, 24641,
+ 24737, 24831, 24925, 25017, 25109, 25201, 25291, 25381,
+ 25470, 25558, 25646, 25732, 25818, 25904, 25988, 26072,
+ 26154, 26237, 26318, 26398, 26478, 26557, 26636, 26713,
+ 26790, 26866, 26941, 27015, 27089, 27162, 27234, 27306,
+ 27377, 27447, 27516, 27584, 27652, 27719, 27786, 27851,
+ 27916, 27981, 28044, 28107, 28169, 28230, 28291, 28351,
+ 28411, 28469, 28527, 28585, 28641, 28697, 28753, 28807,
+ 28861, 28915, 28968, 29020, 29071, 29122, 29173, 29222,
+ 29271, 29320, 29368, 29415, 29462, 29508, 29554, 29599,
+ 29643, 29687, 29730, 29773, 29815, 29857, 29898, 29939,
+ 29979, 30019, 30058, 30096, 30134, 30172, 30209, 30246,
+ 30282, 30317, 30353, 30387, 30422, 30455, 30489, 30522,
+ 30554, 30586, 30618, 30649, 30680, 30710, 30740, 30769,
+ 30799, 30827, 30856, 30884, 30911, 30938, 30965, 30992,
+ 31018, 31043, 31069, 31094, 31118, 31143, 31167, 31190,
+ 31213, 31236, 31259, 31281, 31303, 31325, 31346, 31368,
+ 31388, 31409, 31429, 31449, 31469, 31488, 31507, 31526,
+ 31544, 31563, 31581, 31598, 31616, 31633, 31650, 31667,
+ 31683, 31700, 31716, 31731, 31747, 31762, 31777, 31792,
+ 31807, 31821, 31836, 31850, 31864, 31877, 31891, 31904,
+ 31917, 31930, 31942, 31955, 31967, 31979, 31991, 32003,
+ 32015, 32026, 32037, 32048, 32059, 32070, 32081, 32091,
+ 32101, 32111, 32121, 32131, 32141, 32150, 32160, 32169,
+ 32178, 32187, 32196, 32205, 32213, 32222, 32230, 32238,
+ 32246, 32254, 32262, 32270, 32277, 32285, 32292, 32300,
+ 32307, 32314, 32321, 32327, 32334, 32341, 32347, 32354,
+ 32360, 32366, 32373, 32379, 32385, 32390, 32396, 32402,
+ 32407, 32413, 32418, 32424, 32429, 32434, 32439, 32444,
+ 32449, 32454, 32459, 32464, 32468, 32473, 32478, 32482,
+ 32486, 32491, 32495, 32499, 32503, 32507, 32511, 32515,
+ 32519, 32523, 32527, 32530, 32534, 32538, 32541, 32545,
+ 32548, 32552, 32555, 32558, 32561, 32565, 32568, 32571,
+ 32574, 32577, 32580, 32583, 32585, 32588, 32591, 32594,
+ 32596, 32599, 32602, 32604, 32607, 32609, 32612, 32614,
+ 32616, 32619, 32621, 32623, 32626, 32628, 32630, 32632,
+ 32634, 32636, 32638, 32640, 32642, 32644, 32646, 32648,
+ 32650, 32652, 32653, 32655, 32657, 32659, 32660, 32662,
+ 32664, 32665, 32667, 32668, 32670, 32671, 32673, 32674,
+ 32676, 32677, 32679, 32680, 32681, 32683, 32684, 32685,
+ 32686, 32688, 32689, 32690, 32691, 32693, 32694, 32695,
+ 32696, 32697, 32698, 32699, 32700, 32701, 32702, 32703,
+ 32704, 32705, 32706, 32707, 32708, 32709, 32710, 32711,
+ 32712, 32713, 32713, 32714, 32715, 32716, 32717, 32718,
+ 32718, 32719, 32720, 32721, 32721, 32722, 32723, 32723,
+ 32724, 32725, 32725, 32726, 32727, 32727, 32728, 32729,
+ 32729, 32730, 32730, 32731, 32731, 32732, 32733, 32733,
+ 32734, 32734, 32735, 32735, 32736, 32736, 32737, 32737,
+ 32738, 32738, 32739, 32739, 32739, 32740, 32740, 32741,
+ 32741, 32742, 32742, 32742, 32743, 32743, 32744, 32744,
+ 32744, 32745, 32745, 32745, 32746, 32746, 32746, 32747,
+ 32747, 32747, 32748, 32748, 32748, 32749, 32749, 32749,
+ 32749, 32750, 32750, 32750, 32750, 32751, 32751, 32751,
+ 32752, 32752, 32752, 32752, 32752, 32753, 32753, 32753,
+ 32753, 32754, 32754, 32754, 32754, 32754, 32755, 32755,
+ 32755, 32755, 32755, 32756, 32756, 32756, 32756, 32756,
+ 32757, 32757, 32757, 32757, 32757, 32757, 32757, 32758,
+ 32758, 32758, 32758, 32758, 32758, 32759, 32759, 32759,
+ 32759, 32759, 32759, 32759, 32759, 32760, 32760, 32760,
+ 32760, 32760, 32760, 32760, 32760, 32761, 32761, 32761,
+ 32761, 32761, 32761, 32761, 32761, 32761, 32761, 32762,
+ 32762, 32762, 32762, 32762, 32762, 32762, 32762, 32762,
+ 32762, 32762, 32762, 32763, 32763, 32763, 32763, 32763,
+ 32763, 32763, 32763, 32763, 32763, 32763, 32763, 32763,
+ 32763, 32764, 32764, 32764, 32764, 32764, 32764, 32764,
+ 32764, 32764, 32764, 32764, 32764, 32764, 32764, 32764,
+ 32764, 32764, 32764, 32764, 32765, 32765, 32765, 32765,
+ 32765, 32765, 32765, 32765, 32765, 32765, 32765, 32765,
+ 32765, 32765, 32765, 32765, 32765, 32765, 32765, 32765,
+ 32765, 32765, 32765, 32765, 32765, 32765, 32766, 32766,
+ 32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766,
+ 32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766,
+ 32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766,
+ 32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766,
+ 32766, 32766, 32766, 32766, 32766, 32766, 32766, 32766,
+ 32766, 32766, 32767, 32767, 32767, 32767, 32767, 32767
+};
+
+// stdt[i]=count of -i or i in botton or top of stretcht[]
+static const U8 stdt[712]={
+    64,   128,   128,   128,   128,   128,   127,   128,
+   127,   128,   127,   127,   127,   127,   126,   126,
+   126,   126,   126,   125,   125,   124,   125,   124,
+   123,   123,   123,   123,   122,   122,   121,   121,
+   120,   120,   119,   119,   118,   118,   118,   116,
+   117,   115,   116,   114,   114,   113,   113,   112,
+   112,   111,   110,   110,   109,   108,   108,   107,
+   106,   106,   105,   104,   104,   102,   103,   101,
+   101,   100,    99,    98,    98,    97,    96,    96,
+    94,    94,    94,    92,    92,    91,    90,    89,
+    89,    88,    87,    86,    86,    84,    84,    84,
+    82,    82,    81,    80,    79,    79,    78,    77,
+    76,    76,    75,    74,    73,    73,    72,    71,
+    70,    70,    69,    68,    67,    67,    66,    65,
+    65,    64,    63,    62,    62,    61,    61,    59,
+    59,    59,    57,    58,    56,    56,    55,    54,
+    54,    53,    52,    52,    51,    51,    50,    49,
+    49,    48,    48,    47,    47,    45,    46,    44,
+    45,    43,    43,    43,    42,    41,    41,    40,
+    40,    40,    39,    38,    38,    37,    37,    36,
+    36,    36,    35,    34,    34,    34,    33,    32,
+    33,    32,    31,    31,    30,    31,    29,    30,
+    28,    29,    28,    28,    27,    27,    27,    26,
+    26,    25,    26,    24,    25,    24,    24,    23,
+    23,    23,    23,    22,    22,    21,    22,    21,
+    20,    21,    20,    19,    20,    19,    19,    19,
+    18,    18,    18,    18,    17,    17,    17,    17,
+    16,    16,    16,    16,    15,    15,    15,    15,
+    15,    14,    14,    14,    14,    13,    14,    13,
+    13,    13,    12,    13,    12,    12,    12,    11,
+    12,    11,    11,    11,    11,    11,    10,    11,
+    10,    10,    10,    10,     9,    10,     9,     9,
+     9,     9,     9,     8,     9,     8,     9,     8,
+     8,     8,     7,     8,     8,     7,     7,     8,
+     7,     7,     7,     6,     7,     7,     6,     6,
+     7,     6,     6,     6,     6,     6,     6,     5,
+     6,     5,     6,     5,     5,     5,     5,     5,
+     5,     5,     5,     5,     4,     5,     4,     5,
+     4,     4,     5,     4,     4,     4,     4,     4,
+     4,     3,     4,     4,     3,     4,     4,     3,
+     3,     4,     3,     3,     3,     4,     3,     3,
+     3,     3,     3,     3,     2,     3,     3,     3,
+     2,     3,     2,     3,     3,     2,     2,     3,
+     2,     2,     3,     2,     2,     2,     2,     3,
+     2,     2,     2,     2,     2,     2,     1,     2,
+     2,     2,     2,     1,     2,     2,     2,     1,
+     2,     1,     2,     2,     1,     2,     1,     2,
+     1,     1,     2,     1,     1,     2,     1,     1,
+     2,     1,     1,     1,     1,     2,     1,     1,
+     1,     1,     1,     1,     1,     1,     1,     1,
+     1,     1,     1,     1,     1,     1,     1,     1,
+     1,     1,     0,     1,     1,     1,     1,     0,
+     1,     1,     1,     0,     1,     1,     1,     0,
+     1,     1,     0,     1,     1,     0,     1,     0,
+     1,     1,     0,     1,     0,     1,     0,     1,
+     0,     1,     0,     1,     0,     1,     0,     1,
+     0,     1,     0,     1,     0,     1,     0,     0,
+     1,     0,     1,     0,     0,     1,     0,     1,
+     0,     0,     1,     0,     0,     1,     0,     0,
+     1,     0,     0,     1,     0,     0,     0,     1,
+     0,     0,     1,     0,     0,     0,     1,     0,
+     0,     0,     1,     0,     0,     0,     1,     0,
+     0,     0,     0,     1,     0,     0,     0,     0,
+     1,     0,     0,     0,     0,     1,     0,     0,
+     0,     0,     0,     1,     0,     0,     0,     0,
+     0,     1,     0,     0,     0,     0,     0,     0,
+     1,     0,     0,     0,     0,     0,     0,     0,
+     1,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     1,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     1,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     1,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     1,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     1,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     1,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     0,     0,
+     0,     0,     0,     0,     0,     0,     1,     0
+};
+
 Predictor::Predictor(ZPAQL& zr):
     c8(1), hmap4(1), z(zr) {
   assert(sizeof(U8)==1);
@@ -694,29 +1721,9 @@ Predictor::Predictor(ZPAQL& zr):
   assert(sizeof(U64)==8);
   assert(sizeof(short)==2);
   assert(sizeof(int)==4);
-
-  // Initialize tables
-  dt2k[0]=0;
-  for (int i=1; i<256; ++i)
-    dt2k[i]=2048/i;
-  for (int i=0; i<1024; ++i)
-    dt[i]=(1<<17)/(i*2+3)*2;
-  for (int i=0; i<32768; ++i)
-    stretcht[i]=int(log((i+0.5)/(32767.5-i))*64+0.5+100000)-100000;
-  for (int i=0; i<4096; ++i)
-    squasht[i]=int(32768.0/(1+exp((i-2048)*(-1.0/64))));
-
-  // Verify floating point math for squash() and stretch()
-  U32 sqsum=0, stsum=0;
-  for (int i=32767; i>=0; --i)
-    stsum=stsum*3+stretch(i);
-  for (int i=4095; i>=0; --i)
-    sqsum=sqsum*3+squash(i-2048);
-  assert(stsum==3887533746u);
-  assert(sqsum==2278286169u);
-
   pcode=0;
   pcode_size=0;
+  initTables=false;
 }
 
 Predictor::~Predictor() {
@@ -732,6 +1739,39 @@ void Predictor::init() {
   // Initialize context hash function
   z.inith();
 
+  // Initialize model independent tables
+  if (!initTables && isModeled()) {
+    initTables=true;
+    memcpy(dt2k, sdt2k, sizeof(dt2k));
+    memcpy(dt, sdt, sizeof(dt));
+
+    // ssquasht[i]=int(32768.0/(1+exp((i-2048)*(-1.0/64))));
+    // Copy middle 1344 of 4096 entries.
+    memset(squasht, 0, 1376*2);
+    memcpy(squasht+1376, ssquasht, 1344*2);
+    for (int i=2720; i<4096; ++i) squasht[i]=32767;
+
+    // sstretcht[i]=int(log((i+0.5)/(32767.5-i))*64+0.5+100000)-100000;
+    int k=16384;
+    for (int i=0; i<712; ++i)
+      for (int j=stdt[i]; j>0; --j)
+        stretcht[k++]=i;
+    assert(k==32768);
+    for (int i=0; i<16384; ++i)
+      stretcht[i]=-stretcht[32767-i];
+
+#ifndef NDEBUG
+    // Verify floating point math for squash() and stretch()
+    U32 sqsum=0, stsum=0;
+    for (int i=32767; i>=0; --i)
+      stsum=stsum*3+stretch(i);
+    for (int i=4095; i>=0; --i)
+      sqsum=sqsum*3+squash(i-2048);
+    assert(stsum==3887533746u);
+    assert(sqsum==2278286169u);
+#endif
+  }
+
   // Initialize predictions
   for (int i=0; i<256; ++i) h[i]=p[i]=0;
 
@@ -801,7 +1841,7 @@ void Predictor::init() {
         cr.cm.resize(512);
         for (int j=0; j<256; ++j) {
           cr.cm[j*2]=1<<15;
-          cr.cm[j*2+1]=clamp512k(stretch(st.cminit(j)>>8)<<10);
+          cr.cm[j*2+1]=clamp512k(stretch(st.cminit(j)>>8)*1024);
         }
         break;
       case SSE: // sizebits j start limit
@@ -823,6 +1863,7 @@ void Predictor::init() {
 
 // Return next bit prediction using interpreted COMP code
 int Predictor::predict0() {
+  assert(initTables);
   assert(c8>=1 && c8<=255);
 
   // Predict next bit
@@ -922,6 +1963,7 @@ int Predictor::predict0() {
 
 // Update model with decoded bit y (0...1)
 void Predictor::update0(int y) {
+  assert(initTables);
   assert(y==0 || y==1);
   assert(c8>=1 && c8<=255);
   assert(hmap4>=1 && hmap4<=511);
@@ -1039,6 +2081,7 @@ void Predictor::update0(int y) {
 // collision detection. If not found after 3 adjacent tries, replace the
 // row with lowest element 1 as priority. Return index of row.
 size_t Predictor::find(Array<U8>& ht, int sizebits, U32 cxt) {
+  assert(initTables);
   assert(ht.size()==size_t(16)<<sizebits);
   int chk=cxt>>sizebits&255;
   size_t h0=(cxt*16)&(ht.size()-16);
@@ -1058,7 +2101,8 @@ size_t Predictor::find(Array<U8>& ht, int sizebits, U32 cxt) {
 /////////////////////// Decoder ///////////////////////
 
 Decoder::Decoder(ZPAQL& z):
-    in(0), low(1), high(0xFFFFFFFF), curr(0), pr(z), buf(BUFSIZE) {
+    in(0), low(1), high(0xFFFFFFFF), curr(0), rpos(0), wpos(0),
+    pr(z), buf(BUFSIZE) {
 }
 
 void Decoder::init() {
@@ -1067,35 +2111,18 @@ void Decoder::init() {
   else low=high=curr=0;
 }
 
-// Read un-modeled input into buf[low=0..high-1]
-// with curr remaining in subblock to read.
-void Decoder::loadbuf() {
-  assert(!pr.isModeled());
-  assert(low==high);
-  if (curr==0) {
-    for (int i=0; i<4; ++i) {
-      int c=in->get();
-      if (c<0) error("unexpected end of input");
-      curr=curr<<8|c;
-    }
-  }
-  U32 n=buf.size();
-  if (n>curr) n=curr;
-  high=in->read(&buf[0], n);
-  curr-=high;
-  low=0;
-}
-
 // Return next bit of decoded input, which has 16 bit probability p of being 1
 int Decoder::decode(int p) {
+  assert(pr.isModeled());
   assert(p>=0 && p<65536);
   assert(high>low && low>0);
   if (curr<low || curr>high) error("archive corrupted");
   assert(curr>=low && curr<=high);
   U32 mid=low+U32(((high-low)*U64(U32(p)))>>16);  // split range
   assert(high>mid && mid>=low);
-  int y=curr<=mid;
-  if (y) high=mid; else low=mid+1; // pick half
+  int y;
+  if (curr<=mid) y=1, high=mid;  // pick half
+  else y=0, low=mid+1;
   while ((high^low)<0x1000000) { // shift out identical leading bytes
     high=high<<8|255;
     low=low<<8;
@@ -1129,9 +2156,12 @@ int Decoder::decompress() {
     }
   }
   else {
-    if (low==high) loadbuf();
-    if (low==high) return -1;
-    return buf[low++]&255;
+    if (curr==0) {
+      for (int i=0; i<4; ++i) curr=curr<<8|get();
+      if (curr==0) return -1;
+    }
+    --curr;
+    return in->get();
   }
 }
 
@@ -1150,13 +2180,11 @@ int Decoder::skip() {
     if (curr==0)  // at start?
       for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c;
     while (curr>0) {
-      U32 n=BUFSIZE;
-      if (n>curr) n=curr;
-      U32 n1=in->read(&buf[0], n);
-      curr-=n1;
-      if (n1!=n) return -1;
-      if (curr==0)
-        for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c;
+      while (curr>0) {
+        --curr;
+        if (in->get()<0) return error("skipped to EOF"), -1;
+      }
+      for (int i=0; i<4 && (c=in->get())>=0; ++i) curr=curr<<8|c;
     }
     if (c>=0) c=in->get();
     return c;
@@ -1195,6 +2223,7 @@ int PostProcessor::write(int c) {
     case 3:  // PROG psize[0]
       if (c<0) error("Unexpected EOS");
       hsize+=c*256;  // high byte of psize
+      if (hsize<1) error("Empty PCOMP");
       z.header.resize(hsize+300);
       z.cend=8;
       z.hbegin=z.hend=z.cend+128;
@@ -1296,6 +2325,7 @@ void Decompresser::readComment(Writer* comment) {
 // Decompress n bytes, or all if n < 0. Return false if done
 bool Decompresser::decompress(int n) {
   assert(state==DATA);
+  if (decode_state==SKIP) error("decompression after skipped segment");
   assert(decode_state!=SKIP);
 
   // Initialize models to start decompressing block
@@ -1354,7 +2384,7 @@ void Decompresser::readSegmentEnd(char* sha1string) {
     error("missing end of segment marker");
 }
 
-/////////////////////////// decompress() /////////////////////
+/////////////////////////// decompress() //////////////////////
 
 void decompress(Reader* in, Writer* out) {
   Decompresser d;
@@ -1369,7 +2399,7 @@ void decompress(Reader* in, Writer* out) {
   }
 }
 
-////////////////////// Encoder ////////////////////
+/////////////////////////// Encoder ///////////////////////////
 
 // Initialize for start of block
 void Encoder::init() {
@@ -1415,7 +2445,7 @@ void Encoder::compress(int c) {
     }
   }
   else {
-    if (c<0 || low==buf.size()) {
+    if (low && (c<0 || low==buf.size())) {
       out->put((low>>24)&255);
       out->put((low>>16)&255);
       out->put((low>>8)&255);
@@ -1427,6 +2457,329 @@ void Encoder::compress(int c) {
   }
 }
 
+//////////////////////////// Compiler /////////////////////////
+
+// Component names
+const char* compname[256]=
+  {"","const","cm","icm","match","avg","mix2","mix","isse","sse",0};
+
+// Opcodes
+const char* opcodelist[272]={
+"error","a++",  "a--",  "a!",   "a=0",  "",     "",     "a=r",
+"b<>a", "b++",  "b--",  "b!",   "b=0",  "",     "",     "b=r",
+"c<>a", "c++",  "c--",  "c!",   "c=0",  "",     "",     "c=r",
+"d<>a", "d++",  "d--",  "d!",   "d=0",  "",     "",     "d=r",
+"*b<>a","*b++", "*b--", "*b!",  "*b=0", "",     "",     "jt",
+"*c<>a","*c++", "*c--", "*c!",  "*c=0", "",     "",     "jf",
+"*d<>a","*d++", "*d--", "*d!",  "*d=0", "",     "",     "r=a",
+"halt", "out",  "",     "hash", "hashd","",     "",     "jmp",
+"a=a",  "a=b",  "a=c",  "a=d",  "a=*b", "a=*c", "a=*d", "a=",
+"b=a",  "b=b",  "b=c",  "b=d",  "b=*b", "b=*c", "b=*d", "b=",
+"c=a",  "c=b",  "c=c",  "c=d",  "c=*b", "c=*c", "c=*d", "c=",
+"d=a",  "d=b",  "d=c",  "d=d",  "d=*b", "d=*c", "d=*d", "d=",
+"*b=a", "*b=b", "*b=c", "*b=d", "*b=*b","*b=*c","*b=*d","*b=",
+"*c=a", "*c=b", "*c=c", "*c=d", "*c=*b","*c=*c","*c=*d","*c=",
+"*d=a", "*d=b", "*d=c", "*d=d", "*d=*b","*d=*c","*d=*d","*d=",
+"",     "",     "",     "",     "",     "",     "",     "",
+"a+=a", "a+=b", "a+=c", "a+=d", "a+=*b","a+=*c","a+=*d","a+=",
+"a-=a", "a-=b", "a-=c", "a-=d", "a-=*b","a-=*c","a-=*d","a-=",
+"a*=a", "a*=b", "a*=c", "a*=d", "a*=*b","a*=*c","a*=*d","a*=",
+"a/=a", "a/=b", "a/=c", "a/=d", "a/=*b","a/=*c","a/=*d","a/=",
+"a%=a", "a%=b", "a%=c", "a%=d", "a%=*b","a%=*c","a%=*d","a%=",
+"a&=a", "a&=b", "a&=c", "a&=d", "a&=*b","a&=*c","a&=*d","a&=",
+"a&~a", "a&~b", "a&~c", "a&~d", "a&~*b","a&~*c","a&~*d","a&~",
+"a|=a", "a|=b", "a|=c", "a|=d", "a|=*b","a|=*c","a|=*d","a|=",
+"a^=a", "a^=b", "a^=c", "a^=d", "a^=*b","a^=*c","a^=*d","a^=",
+"a<<=a","a<<=b","a<<=c","a<<=d","a<<=*b","a<<=*c","a<<=*d","a<<=",
+"a>>=a","a>>=b","a>>=c","a>>=d","a>>=*b","a>>=*c","a>>=*d","a>>=",
+"a==a", "a==b", "a==c", "a==d", "a==*b","a==*c","a==*d","a==",
+"a<a",  "a<b",  "a<c",  "a<d",  "a<*b", "a<*c", "a<*d", "a<",
+"a>a",  "a>b",  "a>c",  "a>d",  "a>*b", "a>*c", "a>*d", "a>",
+"",     "",     "",     "",     "",     "",     "",     "",
+"",     "",     "",     "",     "",     "",     "",     "lj",
+"post", "pcomp","end",  "if",   "ifnot","else", "endif","do",
+"while","until","forever","ifl","ifnotl","elsel",";",    0};
+
+// Advance in to start of next token. Tokens are delimited by white
+// space. Comments inclosed in ((nested) parenthsis) are skipped.
+void Compiler::next() {
+  assert(in);
+  for (; *in; ++in) {
+    if (*in=='\n') ++line;
+    if (*in=='(') state+=1+(state<0);
+    else if (state>0 && *in==')') --state;
+    else if (state<0 && *in<=' ') state=0;
+    else if (state==0 && *in>' ') {state=-1; break;}
+  }
+  if (!*in) error("unexpected end of config");
+}
+
+// convert to lower case
+int tolower(int c) {return (c>='A' && c<='Z') ? c+'a'-'A' : c;}
+
+// return true if in==word up to white space or '(', case insensitive
+bool Compiler::matchToken(const char* word) {
+  const char* a=in;
+  for (; (*a>' ' && *a!='(' && *word); ++a, ++word)
+    if (tolower(*a)!=tolower(*word)) return false;
+  return !*word && (*a<=' ' || *a=='(');
+}
+
+// Print error message and exit
+void Compiler::syntaxError(const char* msg, const char* expected) {
+  Array<char> sbuf(128);  // error message to report
+  char* s=&sbuf[0];
+  strcat(s, "Config line ");
+  for (int i=strlen(s), r=1000000; r; r/=10)  // append line number
+    if (line/r) s[i++]='0'+line/r%10;
+  strcat(s, " at ");
+  for (int i=strlen(s); i<40 && *in>' '; ++i)  // append token found
+    s[i]=*in++;
+  strcat(s, ": ");
+  strncat(s, msg, 40);  // append message
+  if (expected) {
+    strcat(s, ", expected: ");
+    strncat(s, expected, 20);  // append expected token if any
+  }
+  error(s);
+}
+
+// Read a token, which must be in the NULL terminated list or else
+// exit with an error. If found, return its index.
+int Compiler::rtoken(const char* list[]) {
+  assert(in);
+  assert(list);
+  next();
+  for (int i=0; list[i]; ++i)
+    if (matchToken(list[i]))
+      return i;
+  syntaxError("unexpected");
+  assert(0);
+  return -1; // not reached
+}
+
+// Read a token which must be the specified value s
+void Compiler::rtoken(const char* s) {
+  assert(s);
+  next();
+  if (!matchToken(s)) syntaxError("expected", s);
+}
+
+// Read a number in (low...high) or exit with an error
+// For numbers like $N+M, return arg[N-1]+M
+int Compiler::rtoken(int low, int high) {
+  next();
+  int r=0;
+  if (in[0]=='$' && in[1]>='1' && in[1]<='9') {
+    if (in[2]=='+') r=atoi(in+3);
+    if (args) r+=args[in[1]-'1'];
+  }
+  else if (in[0]=='-' || (in[0]>='0' && in[0]<='9')) r=atoi(in);
+  else syntaxError("expected a number");
+  if (r<low) syntaxError("number too low");
+  if (r>high) syntaxError("number too high");
+  return r;
+}
+
+// Compile HCOMP or PCOMP code. Exit on error. Return
+// code for end token (POST, PCOMP, END)
+int Compiler::compile_comp(ZPAQL& z) {
+  int op=0;
+  const int comp_begin=z.hend;
+  while (true) {
+    op=rtoken(opcodelist);
+    if (op==POST || op==PCOMP || op==END) break;
+    int operand=-1; // 0...255 if 2 bytes
+    int operand2=-1;  // 0...255 if 3 bytes
+    if (op==IF) {
+      op=JF;
+      operand=0; // set later
+      if_stack.push(z.hend+1); // save jump target location
+    }
+    else if (op==IFNOT) {
+      op=JT;
+      operand=0;
+      if_stack.push(z.hend+1); // save jump target location
+    }
+    else if (op==IFL || op==IFNOTL) {  // long if
+      if (op==IFL) z.header[z.hend++]=(JT);
+      if (op==IFNOTL) z.header[z.hend++]=(JF);
+      z.header[z.hend++]=(3);
+      op=LJ;
+      operand=operand2=0;
+      if_stack.push(z.hend+1);
+    }
+    else if (op==ELSE || op==ELSEL) {
+      if (op==ELSE) op=JMP, operand=0;
+      if (op==ELSEL) op=LJ, operand=operand2=0;
+      int a=if_stack.pop();  // conditional jump target location
+      assert(a>comp_begin && a<int(z.hend));
+      if (z.header[a-1]!=LJ) {  // IF, IFNOT
+        assert(z.header[a-1]==JT || z.header[a-1]==JF || z.header[a-1]==JMP);
+        int j=z.hend-a+1+(op==LJ); // offset at IF
+        assert(j>=0);
+        if (j>127) syntaxError("IF too big, try IFL, IFNOTL");
+        z.header[a]=j;
+      }
+      else {  // IFL, IFNOTL
+        int j=z.hend-comp_begin+2+(op==LJ);
+        assert(j>=0);
+        z.header[a]=j&255;
+        z.header[a+1]=(j>>8)&255;
+      }
+      if_stack.push(z.hend+1);  // save JMP target location
+    }
+    else if (op==ENDIF) {
+      int a=if_stack.pop();  // jump target address
+      assert(a>comp_begin && a<int(z.hend));
+      int j=z.hend-a-1;  // jump offset
+      assert(j>=0);
+      if (z.header[a-1]!=LJ) {
+        assert(z.header[a-1]==JT || z.header[a-1]==JF || z.header[a-1]==JMP);
+        if (j>127) syntaxError("IF too big, try IFL, IFNOTL, ELSEL\n");
+        z.header[a]=j;
+      }
+      else {
+        assert(a+1<int(z.hend));
+        j=z.hend-comp_begin;
+        z.header[a]=j&255;
+        z.header[a+1]=(j>>8)&255;
+      }
+    }
+    else if (op==DO) {
+      do_stack.push(z.hend);
+    }
+    else if (op==WHILE || op==UNTIL || op==FOREVER) {
+      int a=do_stack.pop();
+      assert(a>=comp_begin && a<int(z.hend));
+      int j=a-z.hend-2;
+      assert(j<=-2);
+      if (j>=-127) {  // backward short jump
+        if (op==WHILE) op=JT;
+        if (op==UNTIL) op=JF;
+        if (op==FOREVER) op=JMP;
+        operand=j&255;
+      }
+      else {  // backward long jump
+        j=a-comp_begin;
+        assert(j>=0 && j<int(z.hend)-comp_begin);
+        if (op==WHILE) {
+          z.header[z.hend++]=(JF);
+          z.header[z.hend++]=(3);
+        }
+        if (op==UNTIL) {
+          z.header[z.hend++]=(JT);
+          z.header[z.hend++]=(3);
+        }
+        op=LJ;
+        operand=j&255;
+        operand2=j>>8;
+      }
+    }
+    else if ((op&7)==7) { // 2 byte operand, read N
+      if (op==LJ) {
+        operand=rtoken(0, 65535);
+        operand2=operand>>8;
+        operand&=255;
+      }
+      else if (op==JT || op==JF || op==JMP) {
+        operand=rtoken(-128, 127);
+        operand&=255;
+      }
+      else
+        operand=rtoken(0, 255);
+    }
+    if (op>=0 && op<=255)
+      z.header[z.hend++]=(op);
+    if (operand>=0)
+      z.header[z.hend++]=(operand);
+    if (operand2>=0)
+      z.header[z.hend++]=(operand2);
+    if (z.hend>=z.header.isize()-130 || z.hend-z.hbegin+z.cend-2>65535)
+      syntaxError("program too big");
+  }
+  z.header[z.hend++]=(0); // END
+  return op;
+}
+
+// Compile a configuration file. Store COMP/HCOMP section in hcomp.
+// If there is a PCOMP section, store it in pcomp and store the PCOMP
+// command in pcomp_cmd. Replace "$1..$9+n" with args[0..8]+n
+
+Compiler::Compiler(const char* in_, int* args_, ZPAQL& hz_, ZPAQL& pz_,
+                   Writer* out2_): in(in_), args(args_), hz(hz_), pz(pz_),
+                   out2(out2_), if_stack(1000), do_stack(1000) {
+  line=1;
+  state=0;
+  hz.clear();
+  pz.clear();
+  hz.header.resize(68000); 
+
+  // Compile the COMP section of header
+  rtoken("comp");
+  hz.header[2]=rtoken(0, 255);  // hh
+  hz.header[3]=rtoken(0, 255);  // hm
+  hz.header[4]=rtoken(0, 255);  // ph
+  hz.header[5]=rtoken(0, 255);  // pm
+  const int n=hz.header[6]=rtoken(0, 255);  // n
+  hz.cend=7;
+  for (int i=0; i<n; ++i) {
+    rtoken(i, i);
+    CompType type=CompType(rtoken(compname));
+    hz.header[hz.cend++]=type;
+    int clen=libzpaq::compsize[type&255];
+    if (clen<1 || clen>10) syntaxError("invalid component");
+    for (int j=1; j<clen; ++j)
+      hz.header[hz.cend++]=rtoken(0, 255);  // component arguments
+  }
+  hz.header[hz.cend++];  // end
+  hz.hbegin=hz.hend=hz.cend+128;
+
+  // Compile HCOMP
+  rtoken("hcomp");
+  int op=compile_comp(hz);
+
+  // Compute header size
+  int hsize=hz.cend-2+hz.hend-hz.hbegin;
+  hz.header[0]=hsize&255;
+  hz.header[1]=hsize>>8;
+
+  // Compile POST 0 END
+  if (op==POST) {
+    rtoken(0, 0);
+    rtoken("end");
+  }
+
+  // Compile PCOMP pcomp_cmd ; program... END
+  else if (op==PCOMP) {
+    pz.header.resize(68000);
+    pz.header[4]=hz.header[4];  // ph
+    pz.header[5]=hz.header[5];  // pm
+    pz.cend=8;
+    pz.hbegin=pz.hend=pz.cend+128;
+
+    // get pcomp_cmd ending with ";" (case sensitive)
+    next();
+    while (*in && *in!=';') {
+      if (out2)
+        out2->put(*in);
+      ++in;
+    }
+    if (*in) ++in;
+
+    // Compile PCOMP
+    op=compile_comp(pz);
+    int len=pz.cend-2+pz.hend-pz.hbegin;  // insert header size
+    assert(len>=0);
+    pz.header[0]=len&255;
+    pz.header[1]=len>>8;
+    if (op!=END)
+      syntaxError("expected END");
+  }
+  else if (op!=END)
+    syntaxError("expected END or POST 0 END or PCOMP cmd ; ... END");
+}
+
 ///////////////////// Compressor //////////////////////
 
 // Write 13 byte start tag
@@ -1496,20 +2849,32 @@ public:
   int get() {return *p++&255;}
 };
 
-// Write a block header
 void Compressor::startBlock(const char* hcomp) {
   assert(state==INIT);
-  assert(hcomp);
-  int len=toU16(hcomp)+2;
+  MemoryReader m(hcomp);
+  z.read(&m);
+  pz.sha1=&sha1;
+  assert(z.header.isize()>6);
   enc.out->put('z');
   enc.out->put('P');
   enc.out->put('Q');
-  enc.out->put(1+(len>6 && hcomp[6]==0));  // level 1 or 2
+  enc.out->put(1+(z.header[6]==0));  // level 1 or 2
   enc.out->put(1);
-  for (int i=0; i<len; ++i)  // write compression model hcomp
-    enc.out->put(hcomp[i]);
-  MemoryReader m(hcomp);
-  z.read(&m);
+  z.write(enc.out, false);
+  state=BLOCK1;
+}
+
+void Compressor::startBlock(const char* config, int* args, Writer* pcomp_cmd) {
+  assert(state==INIT);
+  Compiler(config, args, z, pz, pcomp_cmd);
+  pz.sha1=&sha1;
+  assert(z.header.isize()>6);
+  enc.out->put('z');
+  enc.out->put('P');
+  enc.out->put('Q');
+  enc.out->put(1+(z.header[6]==0));  // level 1 or 2
+  enc.out->put(1);
+  z.write(enc.out, false);
   state=BLOCK1;
 }
 
@@ -1530,40 +2895,82 @@ void Compressor::startSegment(const char* filename, const char* comment) {
 
 // Initialize encoding and write pcomp to first segment
 // If len is 0 then length is encoded in pcomp[0..1]
+// if pcomp is 0 then get pcomp from pz.header
 void Compressor::postProcess(const char* pcomp, int len) {
+  if (state==SEG2) return;
   assert(state==SEG1);
   enc.init();
-  if (pcomp) {
-    enc.compress(1);
-    if (len<=0) {
-      len=toU16(pcomp);
-      pcomp+=2;
+  if (!pcomp) {
+    len=pz.hend-pz.hbegin;
+    if (len>0) {
+      assert(pz.header.isize()>pz.hend);
+      assert(pz.hbegin>=0);
+      pcomp=(const char*)&pz.header[pz.hbegin];
     }
+    assert(len>=0);
+  }
+  else if (len==0) {
+    len=toU16(pcomp);
+    pcomp+=2;
+  }
+  if (len>0) {
+    enc.compress(1);
     enc.compress(len&255);
     enc.compress((len>>8)&255);
     for (int i=0; i<len; ++i)
       enc.compress(pcomp[i]&255);
+    if (verify)
+      pz.initp();
   }
   else
     enc.compress(0);
   state=SEG2;
 }
 
-// Compress n bytes, or to EOF if n <= 0
+// Compress n bytes, or to EOF if n < 0
 bool Compressor::compress(int n) {
+  if (state==SEG1)
+    postProcess();
   assert(state==SEG2);
-  int ch=0;
-  while (n && (ch=in->get())>=0) {
-    enc.compress(ch);
-    if (n>0) --n;
+
+  const int BUFSIZE=1<<14;
+  char buf[BUFSIZE];  // input buffer
+  while (n) {
+    int nbuf=BUFSIZE;  // bytes read into buf
+    if (n>=0 && n<nbuf) nbuf=n;
+    int nr=in->read(buf, nbuf);
+    if (nr<0 || nr>BUFSIZE || nr>nbuf) error("invalid read size");
+    if (nr<=0) return false;
+    if (n>=0) n-=nr;
+    for (int i=0; i<nr; ++i) {
+      int ch=U8(buf[i]);
+/* TODO
+ * Need to show progress
+      // ver 7.15 uses read instead of get
+      // need to show progresss in this way
+      if (!(i % 128)) 
+	      show_progress(i);
+*/
+      enc.compress(ch);
+      if (verify) {
+        if (pz.hend) pz.run(ch);
+        else sha1.put(ch);
+      }
+    }
   }
-  return ch>=0;
+  return true;
 }
 
 // End segment, write sha1string if present
 void Compressor::endSegment(const char* sha1string) {
+  if (state==SEG1)
+    postProcess();
   assert(state==SEG2);
   enc.compress(-1);
+  if (verify && pz.hend) {
+    pz.run(-1);
+    pz.flush();
+  }
   enc.out->put(0);
   enc.out->put(0);
   enc.out->put(0);
@@ -1578,6 +2985,35 @@ void Compressor::endSegment(const char* sha1string) {
   state=BLOCK2;
 }
 
+// End segment, write checksum and size is verify is true
+char* Compressor::endSegmentChecksum(int64_t* size, bool dosha1) {
+  if (state==SEG1)
+    postProcess();
+  assert(state==SEG2);
+  enc.compress(-1);
+  if (verify && pz.hend) {
+    pz.run(-1);
+    pz.flush();
+  }
+  enc.out->put(0);
+  enc.out->put(0);
+  enc.out->put(0);
+  enc.out->put(0);
+  if (verify) {
+    if (size) *size=sha1.usize();
+    memcpy(sha1result, sha1.result(), 20);
+  }
+  if (verify && dosha1) {
+    enc.out->put(253);
+    for (int i=0; i<20; ++i)
+      enc.out->put(sha1result[i]);
+  }
+  else
+    enc.out->put(254);
+  state=BLOCK2;
+  return verify ? sha1result : 0;
+}
+
 // End block
 void Compressor::endBlock() {
   assert(state==BLOCK2);
@@ -1587,17 +3023,29 @@ void Compressor::endBlock() {
 
 /////////////////////////// compress() ///////////////////////
 
-void compress(Reader* in, Writer* out, int level) {
-  assert(level>=1);
-  Compressor c;
-  c.setInput(in);
-  c.setOutput(out);
-  c.startBlock(level);
-  c.startSegment();
-  c.postProcess();
-  c.compress();
-  c.endSegment();
-  c.endBlock();
+void compress(Reader* in, Writer* out, const char* method,
+              const char* filename, const char* comment, bool dosha1) {
+
+  // Get block size
+  int bs=4;
+  if (method && method[0] && method[1]>='0' && method[1]<='9') {
+    bs=method[1]-'0';
+    if (method[2]>='0' && method[2]<='9') bs=bs*10+method[2]-'0';
+    if (bs>11) bs=11;
+  }
+  bs=(0x100000<<bs)-4096;
+
+  // Compress in blocks
+  StringBuffer sb(bs);
+  sb.write(0, bs);
+  int n=0;
+  while (in && (n=in->read((char*)sb.data(), bs))>0) {
+    sb.resize(n);
+    compressBlock(&sb, out, method, filename, comment, dosha1);
+    filename=0;
+    comment=0;
+    sb.resize(0);
+  }
 }
 
 //////////////////////// ZPAQL::assemble() ////////////////////
@@ -1611,9 +3059,9 @@ code in rcode[0..rcode_size-1]. Execution begins at rcode[0]. It will not
 write beyond the end of rcode, but in any case it returns the number of
 bytes that would have been written. It returns 0 in case of error.
 
-The assembled code implements run() and returns 1 if successful or
-0 if the ZPAQL code executes an invalid instruction or jumps out of
-bounds.
+The assembled code implements int run() and returns 0 if successful,
+1 if the ZPAQL code executes an invalid instruction or jumps out of
+bounds, or 2 if OUT throws bad_alloc, or 3 for other OUT exceptions.
 
 A ZPAQL virtual machine has the following state. All values are
 unsigned and initially 0:
@@ -1640,7 +3088,7 @@ the only 3 byte instruction. They are organized:
   00dddxxx = unary opcode xxx on destination ddd (ddd < 111)
   00111xxx = special instruction xxx
   01dddsss = assignment: ddd = sss (ddd < 111)
-  1xxxxsss = operation sxxx from sss to a
+  1xxxxsss = operation xxxx from sss to a
 
 The meaning of sss and ddd are as follows:
 
@@ -1727,8 +3175,17 @@ In 64 bit mode, the following additional registers are used:
 */
 
 // Called by out
-static void flush1(ZPAQL* z) {
-  z->flush();
+static int flush1(ZPAQL* z) {
+  try {
+    z->flush();
+    return 0;
+  }
+  catch(std::bad_alloc& x) {
+    return 2;
+  }
+  catch(...) {
+    return 3;
+  }
 }
 
 // return true if op is an undefined ZPAQL instruction
@@ -1737,6 +3194,19 @@ static bool iserr(int op) {
     || op==58 || (op<64 && (op%8==5 || op%8==6));
 }
 
+// Return length of ZPAQL instruction at hcomp[0]. Assume 0 padding at end.
+// A run of identical ++ or -- is counted as 1 instruction.
+static int oplen(const U8* hcomp) {
+  if (*hcomp==255) return 3;
+  if (*hcomp%8==7) return 2;
+  if (*hcomp<51 && (*hcomp%8-1)/2==0) {  // ++ or -- opcode
+    int i;
+    for (i=1; i<127 && hcomp[i]==hcomp[0]; ++i);
+    return i;
+  }
+  return 1;
+}
+
 // Write k bytes of x to rcode[o++] MSB first
 static void put(U8* rcode, int n, int& o, U32 x, int k) {
   while (k-->0) {
@@ -1785,10 +3255,10 @@ int ZPAQL::assemble() {
     error("JIT supported only for x86-32 and x86-64");
 
   const U8* hcomp=&header[hbegin];
-  const int hlen=hend-hbegin+1;
+  const int hlen=hend-hbegin+2;
   const int msize=m.size();
   const int hsize=h.size();
-  const int regcode[8]={2,6,7,5}; // a,b,c,d.. -> edx,esi,edi,ebp,eax..
+  static const int regcode[8]={2,6,7,5}; // a,b,c,d.. -> edx,esi,edi,ebp,eax..
   Array<int> it(hlen);            // hcomp -> rcode locations
   int done=0;  // number of instructions assembled (0..hlen)
   int o=5;  // rcode output index, reserve space for jmp
@@ -1806,7 +3276,7 @@ int ZPAQL::assemble() {
     put2(0x8929);             // mov [rcx], ebp
     put2l(0x48b9, &f);        // mov rcx, f
     put2(0x8919);             // mov [rcx], ebx
-    put4(0x4883c438);         // add rsp, 56
+    put4(0x4883c408);         // add rsp, 8
     put2(0x415f);             // pop r15
     put2(0x415e);             // pop r14
     put2(0x415d);             // pop r13
@@ -1818,12 +3288,12 @@ int ZPAQL::assemble() {
     put2a(0x893d, &c);        // mov [c], edi
     put2a(0x892d, &d);        // mov [d], ebp
     put2a(0x891d, &f);        // mov [f], ebx
-    put3(0x83c43c);           // add esp, 60
+    put3(0x83c40c);           // add esp, 12
   }
-  put1(0x5d);                 // pop ebp
   put1(0x5b);                 // pop ebx
   put1(0x5f);                 // pop edi
   put1(0x5e);                 // pop esi
+  put1(0x5d);                 // pop ebp
   put1(0xc3);                 // ret
 
   // Code for the out instruction.
@@ -1832,50 +3302,54 @@ int ZPAQL::assemble() {
   if (S==8) {
     put2l(0x48b8, &outbuf[0]);// mov rax, outbuf.p
     put2l(0x49ba, &bufptr);   // mov r10, &bufptr
-    put3(0x418b0a);           // mov ecx, [r10]
-    put3(0x891408);           // mov [rax+rcx], edx
-    put2(0xffc1);             // inc ecx
+    put3(0x418b0a);           // mov rcx, [r10]
+    put3(0x881408);           // mov [rax+rcx], dl
+    put2(0xffc1);             // inc rcx
     put3(0x41890a);           // mov [r10], ecx
-    put2a(0x81f9, outbuf.size());  // cmp ecx, outbuf.size()
-    put2(0x7401);             // jz L1
+    put2a(0x81f9, outbuf.size());  // cmp rcx, outbuf.size()
+    put2(0x7403);             // jz L1
+    put2(0x31c0);             // xor eax, eax
     put1(0xc3);               // ret
-    put4(0x4883ec30);         // L1: sub esp, 48  ; call flush1(this)
-    put4(0x48893c24);         // mov [rsp], rdi
-    put5(0x48897424,8);       // mov [rsp+8], rsi
-    put5(0x48895424,16);      // mov [rsp+16], rdx
-    put5(0x48894c24,24);      // mov [rsp+24], rcx
-#ifndef _WIN32
+
+    put1(0x55);               // L1: push rbp ; call flush1(this)
+    put1(0x57);               // push rdi
+    put1(0x56);               // push rsi
+    put1(0x52);               // push rdx
+    put1(0x51);               // push rcx
+    put3(0x4889e5);           // mov rbp, rsp
+    put4(0x4883c570);         // add rbp, 112
+#if defined(unix) && !defined(__CYGWIN__)
     put2l(0x48bf, this);      // mov rdi, this
 #else  // Windows
     put2l(0x48b9, this);      // mov rcx, this
 #endif
     put2l(0x49bb, &flush1);   // mov r11, &flush1
     put3(0x41ffd3);           // call r11
-    put5(0x488b4c24,24);      // mov rcx, [rsp+24]
-    put5(0x488b5424,16);      // mov rdx, [rsp+16]
-    put5(0x488b7424,8);       // mov rsi, [rsp+8]
-    put4(0x488b3c24);         // mov rdi, [rsp]
-    put4(0x4883c430);         // add esp, 48
-    put1(0xc3);               // ret
+    put1(0x59);               // pop rcx
+    put1(0x5a);               // pop rdx
+    put1(0x5e);               // pop rsi
+    put1(0x5f);               // pop rdi
+    put1(0x5d);               // pop rbp
   }
   else {
     put1a(0xb8, &outbuf[0]);  // mov eax, outbuf.p
     put2a(0x8b0d, &bufptr);   // mov ecx, [bufptr]
-    put3(0x891408);           // mov [eax+ecx], edx
+    put3(0x881408);           // mov [eax+ecx], dl
     put2(0xffc1);             // inc ecx
     put2a(0x890d, &bufptr);   // mov [bufptr], ecx
     put2a(0x81f9, outbuf.size());  // cmp ecx, outbuf.size()
-    put2(0x7401);             // jz L1
+    put2(0x7403);             // jz L1
+    put2(0x31c0);             // xor eax, eax
     put1(0xc3);               // ret
-    put3(0x83ec08);           // L1: sub esp, 8
+    put3(0x83ec0c);           // L1: sub esp, 12
     put4(0x89542404);         // mov [esp+4], edx
     put3a(0xc70424, this);    // mov [esp], this
     put1a(0xb8, &flush1);     // mov eax, &flush1
     put2(0xffd0);             // call eax
     put4(0x8b542404);         // mov edx, [esp+4]
-    put3(0x83c408);           // add esp, 8
-    put1(0xc3);               // ret
+    put3(0x83c40c);           // add esp, 12
   }
+  put1(0xc3);               // ret
 
   // Set it[i]=1 for each ZPAQL instruction reachable from the previous
   // instruction + 2 if reachable by a jump (or 3 if both).
@@ -1887,21 +3361,21 @@ int ZPAQL::assemble() {
     for (int i=0; i<hlen; ++i) {
       int op=hcomp[i];
       if (it[i]) {
-        int next1=i+1+(op%8==7), next2=NONE; // next and jump targets
+        int next1=i+oplen(hcomp+i), next2=NONE; // next and jump targets
         if (iserr(op)) next1=NONE;  // error
         if (op==56) next1=NONE, next2=0;  // halt
         if (op==255) next1=NONE, next2=hcomp[i+1]+256*hcomp[i+2]; // lj
         if (op==39||op==47||op==63)next2=i+2+(hcomp[i+1]<<24>>24);// jt,jf,jmp
         if (op==63) next1=NONE;  // jmp
         if ((next2<0 || next2>=hlen) && next2!=NONE) next2=hlen-1; // error
-        if (next1!=NONE && !(it[next1]&1)) it[next1]|=1, ++done;
-        if (next2!=NONE && !(it[next2]&2)) it[next2]|=2, ++done;
+        if (next1>=0 && next1<hlen && !(it[next1]&1)) it[next1]|=1, ++done;
+        if (next2>=0 && next2<hlen && !(it[next2]&2)) it[next2]|=2, ++done;
       }
     }
   } while (done>0);
 
   // Set it[i] bits 2-3 to 4, 8, or 12 if a comparison
-  //  (<, >, == respectively) does not need to save the result in f,
+  //  (==, <, > respectively) does not need to save the result in f,
   // or if a conditional jump (jt, jf) does not need to read f.
   // This is true if a comparison is followed directly by a jt/jf,
   // the jt/jf is not a jump target, the byte before is not a jump
@@ -1939,16 +3413,16 @@ int ZPAQL::assemble() {
   // Start of run(): Save x86 and load ZPAQL registers
   const int start=o;
   assert(start>=16);
+  put1(0x55);          // push ebp/rbp
   put1(0x56);          // push esi/rsi
   put1(0x57);          // push edi/rdi
   put1(0x53);          // push ebx/rbx
-  put1(0x55);          // push ebp/rbp
   if (S==8) {
     put2(0x4154);      // push r12
     put2(0x4155);      // push r13
     put2(0x4156);      // push r14
     put2(0x4157);      // push r15
-    put4(0x4883ec38);  // sub rsp, 56
+    put4(0x4883ec08);  // sub rsp, 8
     put2l(0x48b8, &a); // mov rax, a
     put2(0x8b10);      // mov edx, [rax]
     put2l(0x48b8, &b); // mov rax, b
@@ -1965,7 +3439,7 @@ int ZPAQL::assemble() {
     put2l(0x49bf, &m[0]);   // mov r15, m
   }
   else {
-    put3(0x83ec3c);    // sub esp, 60
+    put3(0x83ec0c);    // sub esp, 12
     put2a(0x8b15, &a); // mov edx, [a]
     put2a(0x8b35, &b); // mov esi, [b]
     put2a(0x8b3d, &c); // mov edi, [c]
@@ -1975,8 +3449,10 @@ int ZPAQL::assemble() {
 
   // Assemble in multiple passes until every byte of hcomp has a translation
   for (int istart=0; istart<hlen; ++istart) {
-    for (int i=istart; i<hlen&&it[i]; i=i+1+(hcomp[i]%8==7)+(hcomp[i]==255)) {
+    int inc=0;
+    for (int i=istart; i<hlen && it[i]; i+=inc) {
       const int code=it[i];
+      inc=oplen(hcomp+i);
 
       // If already assembled, then assemble a jump to it
       U32 t;
@@ -1993,7 +3469,7 @@ int ZPAQL::assemble() {
         break;
       }
 
-      // Else assemble the instruction at hcode[i] to rcode[o]
+      // Else assemble the instruction at hcomp[i] to rcode[o]
       else {
         assert(i>=0 && i<it.isize());
         assert(it[i]>0 && it[i]<16);
@@ -2005,9 +3481,9 @@ int ZPAQL::assemble() {
         const int ddd=op/8%8;
         const int sss=op%8;
 
-        // error instruction: return 0
+        // error instruction: return 1
         if (iserr(op)) {
-          put2(0x31c0);           // xor eax, eax
+          put1a(0xb8, 1);         // mov eax, 1
           put1a(0xe9, halt-o-4);  // jmp near halt
           continue;
         }
@@ -2060,10 +3536,10 @@ int ZPAQL::assemble() {
                 put2(0x87d0+regcode[ddd]);   // xchg edx, ddd
                 break;
               case 1:  // ddd++
-                put2(0xffc0+regcode[ddd]);   // inc ddd
+                put3(0x83c000+256*regcode[ddd]+inc); // add ddd, inc
                 break;
               case 2:  // ddd--
-                put2(0xffc8+regcode[ddd]);   // dec ddd
+                put3(0x83e800+256*regcode[ddd]+inc); // sub ddd, inc
                 break;
               case 3:  // ddd!
                 put2(0xf7d0+regcode[ddd]);   // not ddd
@@ -2086,10 +3562,10 @@ int ZPAQL::assemble() {
                 put2(0x8611);                // xchg dl, [ecx]
                 break;
               case 1:  // ddd++
-                put2(0xfe01);                // inc byte [ecx]
+                put3(0x800100+inc);          // add byte [ecx], inc
                 break;
               case 2:  // ddd--
-                put2(0xfe09);                // dec byte [ecx]
+                put3(0x802900+inc);          // sub byte [ecx], inc
                 break;
               case 3:  // ddd!
                 put2(0xf611);                // not byte [ecx]
@@ -2101,7 +3577,7 @@ int ZPAQL::assemble() {
               case 7:  // jt, jf
               {
                 assert(code>=0 && code<16);
-                const int jtab[2][4]={{5,4,2,7},{4,5,3,6}};
+                static const unsigned char jtab[2][4]={{5,4,2,7},{4,5,3,6}};
                                // jnz,je,jb,ja, jz,jne,jae,jbe
                 if (code<4) put2(0x84db);    // test bl, bl
                 if (arg>=128 && arg-257-i>=0 && o-it[arg-257-i]<120)
@@ -2118,10 +3594,10 @@ int ZPAQL::assemble() {
                 put2(0x8711);             // xchg edx, [ecx]
                 break;
               case 1:  // ddd++
-                put2(0xff01);             // inc dword [ecx]
+                put3(0x830100+inc);       // add dword [ecx], inc
                 break;
               case 2:  // ddd--
-                put2(0xff09);             // dec dword [ecx]
+                put3(0x832900+inc);       // sub dword [ecx], inc
                 break;
               case 3:  // ddd!
                 put2(0xf711);             // not dword [ecx]
@@ -2141,11 +3617,14 @@ int ZPAQL::assemble() {
           case 7:  // special
             switch(op) {
               case 56: // halt
-                put1a(0xb8, 1);           // mov eax, 1
+                put2(0x31c0);             // xor eax, eax  ; return 0
                 put1a(0xe9, halt-o-4);    // jmp near halt
                 break;
               case 57:  // out
                 put1a(0xe8, outlabel-o-4);// call outlabel
+                put3(0x83f800);           // cmp eax, 0  ; returned error code
+                put2(0x7405);             // je L1:
+                put1a(0xe9, halt-o-4);    // jmp near halt ; L1:
                 break;
               case 59:  // hash: a = (a + *b + 512) * 773
                 put3a(0x8d8410, 512);     // lea edx, [eax+edx+512]
@@ -2197,7 +3676,7 @@ int ZPAQL::assemble() {
               else put3a(0x031485, &h[0]);           // add edx, [h+eax*4]
             }
             else if (sss<7) put2(0x01c2+8*regcode[sss]);// add edx, sss
-            else if (arg>128) put2a(0x81c2, arg);    // add edx, n
+            else if (arg>=128) put2a(0x81c2, arg);   // add edx, n
             else put3(0x83c200+arg);                 // add edx, byte n
             break;
           case 17:  // a-=
@@ -2385,7 +3864,7 @@ int Predictor::assemble_p() {
   if (S==4)
     put4(0x8b7c2414);         // mov edi,[esp+0x14] ; pr
   else {
-#ifdef _WIN32
+#if !defined(unix) || defined(__CYGWIN__)
     put3(0x4889cf);           // mov rdi, rcx (1st arg in Win64)
 #endif
   }
@@ -2673,9 +4152,8 @@ int Predictor::assemble_p() {
             put4(0x660ff5c3);                  // pmaddwd xmm0, xmm3
           }
           else {  // accumulate sum in xmm0
-            put4(0xf30f6fd1);                  // movdqu xmm2, xmm1
-            put4(0x660ff5d3);                  // pmaddwd xmm2, xmm3
-            put4(0x660ffec2);                  // paddd, xmm0, xmm2
+            put4(0x660ff5cb);                  // pmaddwd xmm1, xmm3
+            put4(0x660ffec1);                  // paddd xmm0, xmm1
           }
         }
 
@@ -2688,13 +4166,13 @@ int Predictor::assemble_p() {
         put4(0x660ffec1);                      // paddd xmm0, xmm1
         put4(0x660f7ec0);                      // movd eax, xmm0 ; p[i]
         put3(0xc1f808);                        // sar eax, 8
-        put1a(0xb9, 2047);                     // mov ecx, 2047 ; clamp2k
-        put2(0x39c8);                          // cmp eax, ecx
-        put3(0x0f4fc1);                        // cmovg eax, ecx
-        put2(0xf7d1);                          // not ecx ; -2048
-        put2(0x39c8);                          // cmp eax, ecx
-        put3(0x0f4cc1);                        // cmovl eax, ecx
-        put2a(0x8987, off(p[i]));              // mov [edi+&p[i]], eax
+        put1a(0x3d, 2047);                     // cmp eax, 2047
+        put2(0x7e05);                          // jle L1
+        put1a(0xb8, 2047);                     // mov eax, 2047
+        put1a(0x3d, -2048);                    // L1: cmp eax, -2048
+        put2(0x7d05);                          // jge, L2
+        put1a(0xb8, -2048);                    // mov eax, -2048
+        put2a(0x8987, off(p[i]));              // L2: mov [edi+&p[i]], eax
         break;
 
       case SSE:  // sizebits j start limit
@@ -2775,7 +4253,7 @@ int Predictor::assemble_p() {
     put4(0x8b6c2418);          // mov ebp,[esp+0x18] ; (2nd arg = y)
   }
   else {
-#ifndef _WIN32
+#if defined(unix) && !defined(__CYGWIN__)  // (1st arg already in rdi)
     put3(0x4889f5);            // mov rbp, rsi (2nd arg in Linux-64)
 #else
     put3(0x4889cf);            // mov rdi, rcx (1st arg in Win64)
@@ -2882,23 +4360,23 @@ int Predictor::assemble_p() {
           put1a(0x05, (1<<12));        // add eax, 4096
           put3(0xc1f80d);              // sar eax, 13
           put3(0x0304d6);              // add eax, [esi+edx*8] ; wt[0]
-          put1a(0xbb, (1<<19)-1);      // mov ebx, 524287
-          put2(0x39d8);                // cmp eax, ebx
-          put3(0x0f4fc3);              // cmovg eax, ebx
-          put2(0xf7d3);                // not ebx ; -524288
-          put2(0x39d8);                // cmp eax, ebx
-          put3(0x0f4cc3);              // cmovl eax, ebx
-          put3(0x8904d6);              // mov [esi+edx*8], eax
+          put1a(0x3d, (1<<19)-1);      // cmp eax, (1<<19)-1
+          put2(0x7e05);                // jle L1
+          put1a(0xb8, (1<<19)-1);      // mov eax, (1<<19)-1
+          put1a(0x3d, 0xfff80000);     // cmp eax, -1<<19
+          put2(0x7d05);                // jge L2
+          put1a(0xb8, 0xfff80000);     // mov eax, -1<<19
+          put3(0x8904d6);              // L2: mov [esi+edx*8], eax
           put3(0x83c110);              // add ecx, 16 ; err
           put3(0xc1f905);              // sar ecx, 5
           put4(0x034cd604);            // add ecx, [esi+edx*8+4] ; wt[1]
-          put1a(0xb8, (1<<19)-1);      // mov eax, 524287
-          put2(0x39c1);                // cmp ecx, eax
-          put3(0x0f4fc8);              // cmovg ecx, eax
-          put2(0xf7d0);                // not eax ; -524288
-          put2(0x39c1);                // cmp ecx, eax
-          put3(0x0f4cc8);              // cmovl ecx, eax
-          put4(0x894cd604);            // mov [esi+edx*8+4], ecx
+          put2a(0x81f9, (1<<19)-1);    // cmp ecx, (1<<19)-1
+          put2(0x7e05);                // jle L3
+          put1a(0xb9, (1<<19)-1);      // mov ecx, (1<<19)-1
+          put2a(0x81f9, 0xfff80000);   // cmp ecx, -1<<19
+          put2(0x7d05);                // jge L4
+          put1a(0xb9, 0xfff80000);     // mov ecx, -1<<19
+          put4(0x894cd604);            // L4: mov [esi+edx*8+4], ecx
         }
         break;
 
@@ -3093,13 +4571,13 @@ int Predictor::assemble_p() {
           put1a(0x05, 1<<12);          // add eax, 1<<12
           put3(0xc1f80d);              // sar eax, 13
           put2(0x0306);                // add eax, [esi]
-          put1a(0xba, (1<<19)-1);      // mov edx, (1<<19)-1
-          put2(0x39d0);                // cmp eax, edx
-          put3(0x0f4fc2);              // cmovg eax, edx
-          put2(0xf7d2);                // not edx
-          put2(0x39d0);                // cmp eax, edx
-          put3(0x0f4cc2);              // cmovl eax, edx
-          put2(0x8906);                // mov [esi], eax
+          put1a(0x3d, (1<<19)-1);      // cmp eax, (1<<19)-1
+          put2(0x7e05);                // jge L1
+          put1a(0xb8, (1<<19)-1);      // mov eax, (1<<19)-1
+          put1a(0x3d, 0xfff80000);     // cmp eax, -1<<19
+          put2(0x7d05);                // jle L2
+          put1a(0xb8, 0xfff80000);     // mov eax, -1<<19
+          put2(0x8906);                // L2: mov [esi], eax
           if (k<cp[3]-1) {
             if (S==8) put1(0x48);      // rex.w
             put3(0x83c604);            // add esi, 4
@@ -3131,13 +4609,17 @@ int Predictor::predict() {
   return predict0();
 #else
   if (!pcode) {
+    allocx(pcode, pcode_size, (z.cend*100+4096)&-4096);
     int n=assemble_p();
-    allocx(pcode, pcode_size, n);
-    if (!pcode || n!=assemble_p() || n<10 || pcode_size<10)
-      error("predictor JIT failed");
+    if (n>pcode_size) {
+      allocx(pcode, pcode_size, n);
+      n=assemble_p();
+    }
+    if (!pcode || n<15 || pcode_size<15)
+      error("run JIT failed");
   }
   assert(pcode && pcode[0]);
-  return ((int(*)(Predictor*))&pcode[0])(this);
+  return ((int(*)(Predictor*))&pcode[10])(this);
 #endif
 }
 
@@ -3172,15 +4654,3098 @@ void ZPAQL::run(U32 input) {
   run0(input);
 #else
   if (!rcode) {
+    allocx(rcode, rcode_size, (hend*10+4096)&-4096);
     int n=assemble();
-    allocx(rcode, rcode_size, n);
-    if (!rcode || n<10 || rcode_size<10 || n!=assemble())
+    if (n>rcode_size) {
+      allocx(rcode, rcode_size, n);
+      n=assemble();
+    }
+    if (!rcode || n<10 || rcode_size<10)
       error("run JIT failed");
   }
   a=input;
-  if (!((int(*)())(&rcode[0]))())
-    libzpaq::error("Bad ZPAQL opcode");
+  const U32 rc=((int(*)())(&rcode[0]))();
+  if (rc==0) return;
+  else if (rc==1) libzpaq::error("Bad ZPAQL opcode");
+  else if (rc==2) libzpaq::error("Out of memory");
+  else if (rc==3) libzpaq::error("Write error");
+  else libzpaq::error("ZPAQL execution error");
 #endif
 }
 
+////////////////////////// divsufsort ///////////////////////////////
+
+/*
+ * divsufsort.c for libdivsufsort-lite
+ * Copyright (c) 2003-2008 Yuta Mori All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*- Constants -*/
+#define INLINE __inline
+#if defined(ALPHABET_SIZE) && (ALPHABET_SIZE < 1)
+# undef ALPHABET_SIZE
+#endif
+#if !defined(ALPHABET_SIZE)
+# define ALPHABET_SIZE (256)
+#endif
+#define BUCKET_A_SIZE (ALPHABET_SIZE)
+#define BUCKET_B_SIZE (ALPHABET_SIZE * ALPHABET_SIZE)
+#if defined(SS_INSERTIONSORT_THRESHOLD)
+# if SS_INSERTIONSORT_THRESHOLD < 1
+#  undef SS_INSERTIONSORT_THRESHOLD
+#  define SS_INSERTIONSORT_THRESHOLD (1)
+# endif
+#else
+# define SS_INSERTIONSORT_THRESHOLD (8)
+#endif
+#if defined(SS_BLOCKSIZE)
+# if SS_BLOCKSIZE < 0
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (0)
+# elif 32768 <= SS_BLOCKSIZE
+#  undef SS_BLOCKSIZE
+#  define SS_BLOCKSIZE (32767)
+# endif
+#else
+# define SS_BLOCKSIZE (1024)
+#endif
+/* minstacksize = log(SS_BLOCKSIZE) / log(3) * 2 */
+#if SS_BLOCKSIZE == 0
+# define SS_MISORT_STACKSIZE (96)
+#elif SS_BLOCKSIZE <= 4096
+# define SS_MISORT_STACKSIZE (16)
+#else
+# define SS_MISORT_STACKSIZE (24)
+#endif
+#define SS_SMERGE_STACKSIZE (32)
+#define TR_INSERTIONSORT_THRESHOLD (8)
+#define TR_STACKSIZE (64)
+
+
+/*- Macros -*/
+#ifndef SWAP
+# define SWAP(_a, _b) do { t = (_a); (_a) = (_b); (_b) = t; } while(0)
+#endif /* SWAP */
+#ifndef MIN
+# define MIN(_a, _b) (((_a) < (_b)) ? (_a) : (_b))
+#endif /* MIN */
+#ifndef MAX
+# define MAX(_a, _b) (((_a) > (_b)) ? (_a) : (_b))
+#endif /* MAX */
+#define STACK_PUSH(_a, _b, _c, _d)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize++].d = (_d);\
+  } while(0)
+#define STACK_PUSH5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(ssize < STACK_SIZE);\
+    stack[ssize].a = (_a), stack[ssize].b = (_b),\
+    stack[ssize].c = (_c), stack[ssize].d = (_d), stack[ssize++].e = (_e);\
+  } while(0)
+#define STACK_POP(_a, _b, _c, _d)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d;\
+  } while(0)
+#define STACK_POP5(_a, _b, _c, _d, _e)\
+  do {\
+    assert(0 <= ssize);\
+    if(ssize == 0) { return; }\
+    (_a) = stack[--ssize].a, (_b) = stack[ssize].b,\
+    (_c) = stack[ssize].c, (_d) = stack[ssize].d, (_e) = stack[ssize].e;\
+  } while(0)
+#define BUCKET_A(_c0) bucket_A[(_c0)]
+#if ALPHABET_SIZE == 256
+#define BUCKET_B(_c0, _c1) (bucket_B[((_c1) << 8) | (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[((_c0) << 8) | (_c1)])
+#else
+#define BUCKET_B(_c0, _c1) (bucket_B[(_c1) * ALPHABET_SIZE + (_c0)])
+#define BUCKET_BSTAR(_c0, _c1) (bucket_B[(_c0) * ALPHABET_SIZE + (_c1)])
+#endif
+
+
+/*- Private Functions -*/
+
+static const int lg_table[256]= {
+ -1,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+};
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+static INLINE
+int
+ss_ilg(int n) {
+#if SS_BLOCKSIZE == 0
+  return (n & 0xffff0000) ?
+          ((n & 0xff000000) ?
+            24 + lg_table[(n >> 24) & 0xff] :
+            16 + lg_table[(n >> 16) & 0xff]) :
+          ((n & 0x0000ff00) ?
+             8 + lg_table[(n >>  8) & 0xff] :
+             0 + lg_table[(n >>  0) & 0xff]);
+#elif SS_BLOCKSIZE < 256
+  return lg_table[n];
+#else
+  return (n & 0xff00) ?
+          8 + lg_table[(n >> 8) & 0xff] :
+          0 + lg_table[(n >> 0) & 0xff];
+#endif
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+#if SS_BLOCKSIZE != 0
+
+static const int sqq_table[256] = {
+  0,  16,  22,  27,  32,  35,  39,  42,  45,  48,  50,  53,  55,  57,  59,  61,
+ 64,  65,  67,  69,  71,  73,  75,  76,  78,  80,  81,  83,  84,  86,  87,  89,
+ 90,  91,  93,  94,  96,  97,  98,  99, 101, 102, 103, 104, 106, 107, 108, 109,
+110, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+128, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+143, 144, 144, 145, 146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155,
+156, 157, 158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
+169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178, 179, 180,
+181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188, 189, 189, 190, 191,
+192, 192, 193, 193, 194, 195, 195, 196, 197, 197, 198, 199, 199, 200, 201, 201,
+202, 203, 203, 204, 204, 205, 206, 206, 207, 208, 208, 209, 209, 210, 211, 211,
+212, 212, 213, 214, 214, 215, 215, 216, 217, 217, 218, 218, 219, 219, 220, 221,
+221, 222, 222, 223, 224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230,
+230, 231, 231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
+239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247,
+247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253, 253, 254, 254, 255
+};
+
+static INLINE
+int
+ss_isqrt(int x) {
+  int y, e;
+
+  if(x >= (SS_BLOCKSIZE * SS_BLOCKSIZE)) { return SS_BLOCKSIZE; }
+  e = (x & 0xffff0000) ?
+        ((x & 0xff000000) ?
+          24 + lg_table[(x >> 24) & 0xff] :
+          16 + lg_table[(x >> 16) & 0xff]) :
+        ((x & 0x0000ff00) ?
+           8 + lg_table[(x >>  8) & 0xff] :
+           0 + lg_table[(x >>  0) & 0xff]);
+
+  if(e >= 16) {
+    y = sqq_table[x >> ((e - 6) - (e & 1))] << ((e >> 1) - 7);
+    if(e >= 24) { y = (y + 1 + x / y) >> 1; }
+    y = (y + 1 + x / y) >> 1;
+  } else if(e >= 8) {
+    y = (sqq_table[x >> ((e - 6) - (e & 1))] >> (7 - (e >> 1))) + 1;
+  } else {
+    return sqq_table[x] >> 4;
+  }
+
+  return (x < (y * y)) ? y - 1 : y;
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Compares two suffixes. */
+static INLINE
+int
+ss_compare(const unsigned char *T,
+           const int *p1, const int *p2,
+           int depth) {
+  const unsigned char *U1, *U2, *U1n, *U2n;
+
+  for(U1 = T + depth + *p1,
+      U2 = T + depth + *p2,
+      U1n = T + *(p1 + 1) + 2,
+      U2n = T + *(p2 + 1) + 2;
+      (U1 < U1n) && (U2 < U2n) && (*U1 == *U2);
+      ++U1, ++U2) {
+  }
+
+  return U1 < U1n ?
+        (U2 < U2n ? *U1 - *U2 : 1) :
+        (U2 < U2n ? -1 : 0);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1)
+
+/* Insertionsort for small size groups */
+static
+void
+ss_insertionsort(const unsigned char *T, const int *PA,
+                 int *first, int *last, int depth) {
+  int *i, *j;
+  int t;
+  int r;
+
+  for(i = last - 2; first <= i; --i) {
+    for(t = *i, j = i + 1; 0 < (r = ss_compare(T, PA + t, PA + *j, depth));) {
+      do { *(j - 1) = *j; } while((++j < last) && (*j < 0));
+      if(last <= j) { break; }
+    }
+    if(r == 0) { *j = ~*j; }
+    *(j - 1) = t;
+  }
+}
+
+#endif /* (SS_BLOCKSIZE != 1) && (SS_INSERTIONSORT_THRESHOLD != 1) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE)
+
+static INLINE
+void
+ss_fixdown(const unsigned char *Td, const int *PA,
+           int *SA, int i, int size) {
+  int j, k;
+  int v;
+  int c, d, e;
+
+  for(v = SA[i], c = Td[PA[v]]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+    d = Td[PA[SA[k = j++]]];
+    if(d < (e = Td[PA[SA[j]]])) { k = j; d = e; }
+    if(d <= c) { break; }
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort. */
+static
+void
+ss_heapsort(const unsigned char *Td, const int *PA, int *SA, int size) {
+  int i, m;
+  int t;
+
+  m = size;
+  if((size % 2) == 0) {
+    m--;
+    if(Td[PA[SA[m / 2]]] < Td[PA[SA[m]]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  for(i = m / 2 - 1; 0 <= i; --i) { ss_fixdown(Td, PA, SA, i, m); }
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); ss_fixdown(Td, PA, SA, 0, m); }
+  for(i = m - 1; 0 < i; --i) {
+    t = SA[0], SA[0] = SA[i];
+    ss_fixdown(Td, PA, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements. */
+static INLINE
+int *
+ss_median3(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3) {
+  int *t;
+  if(Td[PA[*v1]] > Td[PA[*v2]]) { SWAP(v1, v2); }
+  if(Td[PA[*v2]] > Td[PA[*v3]]) {
+    if(Td[PA[*v1]] > Td[PA[*v3]]) { return v1; }
+    else { return v3; }
+  }
+  return v2;
+}
+
+/* Returns the median of five elements. */
+static INLINE
+int *
+ss_median5(const unsigned char *Td, const int *PA,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t;
+  if(Td[PA[*v2]] > Td[PA[*v3]]) { SWAP(v2, v3); }
+  if(Td[PA[*v4]] > Td[PA[*v5]]) { SWAP(v4, v5); }
+  if(Td[PA[*v2]] > Td[PA[*v4]]) { SWAP(v2, v4); SWAP(v3, v5); }
+  if(Td[PA[*v1]] > Td[PA[*v3]]) { SWAP(v1, v3); }
+  if(Td[PA[*v1]] > Td[PA[*v4]]) { SWAP(v1, v4); SWAP(v3, v5); }
+  if(Td[PA[*v3]] > Td[PA[*v4]]) { return v4; }
+  return v3;
+}
+
+/* Returns the pivot element. */
+static INLINE
+int *
+ss_pivot(const unsigned char *Td, const int *PA, int *first, int *last) {
+  int *middle;
+  int t;
+
+  t = last - first;
+  middle = first + t / 2;
+
+  if(t <= 512) {
+    if(t <= 32) {
+      return ss_median3(Td, PA, first, middle, last - 1);
+    } else {
+      t >>= 2;
+      return ss_median5(Td, PA, first, first + t, middle, last - 1 - t, last - 1);
+    }
+  }
+  t >>= 3;
+  first  = ss_median3(Td, PA, first, first + t, first + (t << 1));
+  middle = ss_median3(Td, PA, middle - t, middle, middle + t);
+  last   = ss_median3(Td, PA, last - 1 - (t << 1), last - 1 - t, last - 1);
+  return ss_median3(Td, PA, first, middle, last);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Binary partition for substrings. */
+static INLINE
+int *
+ss_partition(const int *PA,
+                    int *first, int *last, int depth) {
+  int *a, *b;
+  int t;
+  for(a = first - 1, b = last;;) {
+    for(; (++a < b) && ((PA[*a] + depth) >= (PA[*a + 1] + 1));) { *a = ~*a; }
+    for(; (a < --b) && ((PA[*b] + depth) <  (PA[*b + 1] + 1));) { }
+    if(b <= a) { break; }
+    t = ~*b;
+    *b = *a;
+    *a = t;
+  }
+  if(first < a) { *first = ~*first; }
+  return a;
+}
+
+/* Multikey introsort for medium size groups. */
+static
+void
+ss_mintrosort(const unsigned char *T, const int *PA,
+              int *first, int *last,
+              int depth) {
+#define STACK_SIZE SS_MISORT_STACKSIZE
+  struct { int *a, *b, c; int d; } stack[STACK_SIZE];
+  const unsigned char *Td;
+  int *a, *b, *c, *d, *e, *f;
+  int s, t;
+  int ssize;
+  int limit;
+  int v, x = 0;
+
+  for(ssize = 0, limit = ss_ilg(last - first);;) {
+
+    if((last - first) <= SS_INSERTIONSORT_THRESHOLD) {
+#if 1 < SS_INSERTIONSORT_THRESHOLD
+      if(1 < (last - first)) { ss_insertionsort(T, PA, first, last, depth); }
+#endif
+      STACK_POP(first, last, depth, limit);
+      continue;
+    }
+
+    Td = T + depth;
+    if(limit-- == 0) { ss_heapsort(Td, PA, first, last - first); }
+    if(limit < 0) {
+      for(a = first + 1, v = Td[PA[*first]]; a < last; ++a) {
+        if((x = Td[PA[*a]]) != v) {
+          if(1 < (a - first)) { break; }
+          v = x;
+          first = a;
+        }
+      }
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, a, depth);
+      }
+      if((a - first) <= (last - a)) {
+        if(1 < (a - first)) {
+          STACK_PUSH(a, last, depth, -1);
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        } else {
+          first = a, limit = -1;
+        }
+      } else {
+        if(1 < (last - a)) {
+          STACK_PUSH(first, a, depth + 1, ss_ilg(a - first));
+          first = a, limit = -1;
+        } else {
+          last = a, depth += 1, limit = ss_ilg(a - first);
+        }
+      }
+      continue;
+    }
+
+    /* choose pivot */
+    a = ss_pivot(Td, PA, first, last);
+    v = Td[PA[*a]];
+    SWAP(*first, *a);
+
+    /* partition */
+    for(b = first; (++b < last) && ((x = Td[PA[*b]]) == v);) { }
+    if(((a = b) < last) && (x < v)) {
+      for(; (++b < last) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+    }
+    for(c = last; (b < --c) && ((x = Td[PA[*c]]) == v);) { }
+    if((b < (d = c)) && (x > v)) {
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+    for(; b < c;) {
+      SWAP(*b, *c);
+      for(; (++b < c) && ((x = Td[PA[*b]]) <= v);) {
+        if(x == v) { SWAP(*b, *a); ++a; }
+      }
+      for(; (b < --c) && ((x = Td[PA[*c]]) >= v);) {
+        if(x == v) { SWAP(*c, *d); --d; }
+      }
+    }
+
+    if(a <= d) {
+      c = b - 1;
+
+      if((s = a - first) > (t = b - a)) { s = t; }
+      for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+      if((s = d - c) > (t = last - d - 1)) { s = t; }
+      for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+
+      a = first + (b - a), c = last - (d - c);
+      b = (v <= Td[PA[*a] - 1]) ? a : ss_partition(PA, a, c, depth);
+
+      if((a - first) <= (last - c)) {
+        if((last - c) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(c, last, depth, limit);
+          last = a;
+        } else if((a - first) <= (c - b)) {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          last = a;
+        } else {
+          STACK_PUSH(c, last, depth, limit);
+          STACK_PUSH(first, a, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      } else {
+        if((a - first) <= (c - b)) {
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          STACK_PUSH(first, a, depth, limit);
+          first = c;
+        } else if((last - c) <= (c - b)) {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(b, c, depth + 1, ss_ilg(c - b));
+          first = c;
+        } else {
+          STACK_PUSH(first, a, depth, limit);
+          STACK_PUSH(c, last, depth, limit);
+          first = b, last = c, depth += 1, limit = ss_ilg(c - b);
+        }
+      }
+    } else {
+      limit += 1;
+      if(Td[PA[*first] - 1] < v) {
+        first = ss_partition(PA, first, last, depth);
+        limit = ss_ilg(last - first);
+      }
+      depth += 1;
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* (SS_BLOCKSIZE == 0) || (SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE) */
+
+
+/*---------------------------------------------------------------------------*/
+
+#if SS_BLOCKSIZE != 0
+
+static INLINE
+void
+ss_blockswap(int *a, int *b, int n) {
+  int t;
+  for(; 0 < n; --n, ++a, ++b) {
+    t = *a, *a = *b, *b = t;
+  }
+}
+
+static INLINE
+void
+ss_rotate(int *first, int *middle, int *last) {
+  int *a, *b, t;
+  int l, r;
+  l = middle - first, r = last - middle;
+  for(; (0 < l) && (0 < r);) {
+    if(l == r) { ss_blockswap(first, middle, l); break; }
+    if(l < r) {
+      a = last - 1, b = middle - 1;
+      t = *a;
+      do {
+        *a-- = *b, *b-- = *a;
+        if(b < first) {
+          *a = t;
+          last = a;
+          if((r -= l + 1) <= l) { break; }
+          a -= 1, b = middle - 1;
+          t = *a;
+        }
+      } while(1);
+    } else {
+      a = first, b = middle;
+      t = *a;
+      do {
+        *a++ = *b, *b++ = *a;
+        if(last <= b) {
+          *a = t;
+          first = a + 1;
+          if((l -= r + 1) <= r) { break; }
+          a += 1, b = middle;
+          t = *a;
+        }
+      } while(1);
+    }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static
+void
+ss_inplacemerge(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int depth) {
+  const int *p;
+  int *a, *b;
+  int len, half;
+  int q, r;
+  int x;
+
+  for(;;) {
+    if(*(last - 1) < 0) { x = 1; p = PA + ~*(last - 1); }
+    else                { x = 0; p = PA +  *(last - 1); }
+    for(a = first, len = middle - first, half = len >> 1, r = -1;
+        0 < len;
+        len = half, half >>= 1) {
+      b = a + half;
+      q = ss_compare(T, PA + ((0 <= *b) ? *b : ~*b), p, depth);
+      if(q < 0) {
+        a = b + 1;
+        half -= (len & 1) ^ 1;
+      } else {
+        r = q;
+      }
+    }
+    if(a < middle) {
+      if(r == 0) { *a = ~*a; }
+      ss_rotate(a, middle, last);
+      last -= middle - a;
+      middle = a;
+      if(first == middle) { break; }
+    }
+    --last;
+    if(x != 0) { while(*--last < 0) { } }
+    if(middle == last) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Merge-forward with internal buffer. */
+static
+void
+ss_mergeforward(const unsigned char *T, const int *PA,
+                int *first, int *middle, int *last,
+                int *buf, int depth) {
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+
+  bufend = buf + (middle - first) - 1;
+  ss_blockswap(buf, first, middle - first);
+
+  for(t = *(a = first), b = buf, c = middle;;) {
+    r = ss_compare(T, PA + *b, PA + *c, depth);
+    if(r < 0) {
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+    } else if(r > 0) {
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    } else {
+      *c = ~*c;
+      do {
+        *a++ = *b;
+        if(bufend <= b) { *bufend = t; return; }
+        *b++ = *a;
+      } while(*b < 0);
+
+      do {
+        *a++ = *c, *c++ = *a;
+        if(last <= c) {
+          while(b < bufend) { *a++ = *b, *b++ = *a; }
+          *a = *b, *b = t;
+          return;
+        }
+      } while(*c < 0);
+    }
+  }
+}
+
+/* Merge-backward with internal buffer. */
+static
+void
+ss_mergebackward(const unsigned char *T, const int *PA,
+                 int *first, int *middle, int *last,
+                 int *buf, int depth) {
+  const int *p1, *p2;
+  int *a, *b, *c, *bufend;
+  int t;
+  int r;
+  int x;
+
+  bufend = buf + (last - middle) - 1;
+  ss_blockswap(buf, middle, last - middle);
+
+  x = 0;
+  if(*bufend < 0)       { p1 = PA + ~*bufend; x |= 1; }
+  else                  { p1 = PA +  *bufend; }
+  if(*(middle - 1) < 0) { p2 = PA + ~*(middle - 1); x |= 2; }
+  else                  { p2 = PA +  *(middle - 1); }
+  for(t = *(a = last - 1), b = bufend, c = middle - 1;;) {
+    r = ss_compare(T, p1, p2, depth);
+    if(0 < r) {
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = *b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+    } else if(r < 0) {
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    } else {
+      if(x & 1) { do { *a-- = *b, *b-- = *a; } while(*b < 0); x ^= 1; }
+      *a-- = ~*b;
+      if(b <= buf) { *buf = t; break; }
+      *b-- = *a;
+      if(x & 2) { do { *a-- = *c, *c-- = *a; } while(*c < 0); x ^= 2; }
+      *a-- = *c, *c-- = *a;
+      if(c < first) {
+        while(buf < b) { *a-- = *b, *b-- = *a; }
+        *a = *b, *b = t;
+        break;
+      }
+      if(*b < 0) { p1 = PA + ~*b; x |= 1; }
+      else       { p1 = PA +  *b; }
+      if(*c < 0) { p2 = PA + ~*c; x |= 2; }
+      else       { p2 = PA +  *c; }
+    }
+  }
+}
+
+/* D&C based merge. */
+static
+void
+ss_swapmerge(const unsigned char *T, const int *PA,
+             int *first, int *middle, int *last,
+             int *buf, int bufsize, int depth) {
+#define STACK_SIZE SS_SMERGE_STACKSIZE
+#define GETIDX(a) ((0 <= (a)) ? (a) : (~(a)))
+#define MERGE_CHECK(a, b, c)\
+  do {\
+    if(((c) & 1) ||\
+       (((c) & 2) && (ss_compare(T, PA + GETIDX(*((a) - 1)), PA + *(a), depth) == 0))) {\
+      *(a) = ~*(a);\
+    }\
+    if(((c) & 4) && ((ss_compare(T, PA + GETIDX(*((b) - 1)), PA + *(b), depth) == 0))) {\
+      *(b) = ~*(b);\
+    }\
+  } while(0)
+  struct { int *a, *b, *c; int d; } stack[STACK_SIZE];
+  int *l, *r, *lm, *rm;
+  int m, len, half;
+  int ssize;
+  int check, next;
+
+  for(check = 0, ssize = 0;;) {
+    if((last - middle) <= bufsize) {
+      if((first < middle) && (middle < last)) {
+        ss_mergebackward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    if((middle - first) <= bufsize) {
+      if(first < middle) {
+        ss_mergeforward(T, PA, first, middle, last, buf, depth);
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+      continue;
+    }
+
+    for(m = 0, len = MIN(middle - first, last - middle), half = len >> 1;
+        0 < len;
+        len = half, half >>= 1) {
+      if(ss_compare(T, PA + GETIDX(*(middle + m + half)),
+                       PA + GETIDX(*(middle - m - half - 1)), depth) < 0) {
+        m += half + 1;
+        half -= (len & 1) ^ 1;
+      }
+    }
+
+    if(0 < m) {
+      lm = middle - m, rm = middle + m;
+      ss_blockswap(lm, middle, m);
+      l = r = middle, next = 0;
+      if(rm < last) {
+        if(*rm < 0) {
+          *rm = ~*rm;
+          if(first < lm) { for(; *--l < 0;) { } next |= 4; }
+          next |= 1;
+        } else if(first < lm) {
+          for(; *r < 0; ++r) { }
+          next |= 2;
+        }
+      }
+
+      if((l - first) <= (last - r)) {
+        STACK_PUSH(r, rm, last, (next & 3) | (check & 4));
+        middle = lm, last = l, check = (check & 3) | (next & 4);
+      } else {
+        if((next & 2) && (r == middle)) { next ^= 6; }
+        STACK_PUSH(first, lm, l, (check & 3) | (next & 4));
+        first = r, middle = rm, check = (next & 3) | (check & 4);
+      }
+    } else {
+      if(ss_compare(T, PA + GETIDX(*(middle - 1)), PA + *middle, depth) == 0) {
+        *middle = ~*middle;
+      }
+      MERGE_CHECK(first, last, check);
+      STACK_POP(first, middle, last, check);
+    }
+  }
+#undef STACK_SIZE
+}
+
+#endif /* SS_BLOCKSIZE != 0 */
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Substring sort */
+static
+void
+sssort(const unsigned char *T, const int *PA,
+       int *first, int *last,
+       int *buf, int bufsize,
+       int depth, int n, int lastsuffix) {
+  int *a;
+#if SS_BLOCKSIZE != 0
+  int *b, *middle, *curbuf;
+  int j, k, curbufsize, limit;
+#endif
+  int i;
+
+  if(lastsuffix != 0) { ++first; }
+
+#if SS_BLOCKSIZE == 0
+  ss_mintrosort(T, PA, first, last, depth);
+#else
+  if((bufsize < SS_BLOCKSIZE) &&
+      (bufsize < (last - first)) &&
+      (bufsize < (limit = ss_isqrt(last - first)))) {
+    if(SS_BLOCKSIZE < limit) { limit = SS_BLOCKSIZE; }
+    buf = middle = last - limit, bufsize = limit;
+  } else {
+    middle = last, limit = 0;
+  }
+  for(a = first, i = 0; SS_BLOCKSIZE < (middle - a); a += SS_BLOCKSIZE, ++i) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, a, a + SS_BLOCKSIZE, depth);
+#endif
+    curbufsize = last - (a + SS_BLOCKSIZE);
+    curbuf = a + SS_BLOCKSIZE;
+    if(curbufsize <= bufsize) { curbufsize = bufsize, curbuf = buf; }
+    for(b = a, k = SS_BLOCKSIZE, j = i; j & 1; b -= k, k <<= 1, j >>= 1) {
+      ss_swapmerge(T, PA, b - k, b, b + k, curbuf, curbufsize, depth);
+    }
+  }
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+  ss_mintrosort(T, PA, a, middle, depth);
+#elif 1 < SS_BLOCKSIZE
+  ss_insertionsort(T, PA, a, middle, depth);
+#endif
+  for(k = SS_BLOCKSIZE; i != 0; k <<= 1, i >>= 1) {
+    if(i & 1) {
+      ss_swapmerge(T, PA, a - k, a, middle, buf, bufsize, depth);
+      a -= k;
+    }
+  }
+  if(limit != 0) {
+#if SS_INSERTIONSORT_THRESHOLD < SS_BLOCKSIZE
+    ss_mintrosort(T, PA, middle, last, depth);
+#elif 1 < SS_BLOCKSIZE
+    ss_insertionsort(T, PA, middle, last, depth);
+#endif
+    ss_inplacemerge(T, PA, first, middle, last, depth);
+  }
+#endif
+
+  if(lastsuffix != 0) {
+    /* Insert last type B* suffix. */
+    int PAi[2]; PAi[0] = PA[*(first - 1)], PAi[1] = n - 2;
+    for(a = first, i = *(first - 1);
+        (a < last) && ((*a < 0) || (0 < ss_compare(T, &(PAi[0]), PA + *a, depth)));
+        ++a) {
+      *(a - 1) = *a;
+    }
+    *(a - 1) = i;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+int
+tr_ilg(int n) {
+  return (n & 0xffff0000) ?
+          ((n & 0xff000000) ?
+            24 + lg_table[(n >> 24) & 0xff] :
+            16 + lg_table[(n >> 16) & 0xff]) :
+          ((n & 0x0000ff00) ?
+             8 + lg_table[(n >>  8) & 0xff] :
+             0 + lg_table[(n >>  0) & 0xff]);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Simple insertionsort for small size groups. */
+static
+void
+tr_insertionsort(const int *ISAd, int *first, int *last) {
+  int *a, *b;
+  int t, r;
+
+  for(a = first + 1; a < last; ++a) {
+    for(t = *a, b = a - 1; 0 > (r = ISAd[t] - ISAd[*b]);) {
+      do { *(b + 1) = *b; } while((first <= --b) && (*b < 0));
+      if(b < first) { break; }
+    }
+    if(r == 0) { *b = ~*b; }
+    *(b + 1) = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+void
+tr_fixdown(const int *ISAd, int *SA, int i, int size) {
+  int j, k;
+  int v;
+  int c, d, e;
+
+  for(v = SA[i], c = ISAd[v]; (j = 2 * i + 1) < size; SA[i] = SA[k], i = k) {
+    d = ISAd[SA[k = j++]];
+    if(d < (e = ISAd[SA[j]])) { k = j; d = e; }
+    if(d <= c) { break; }
+  }
+  SA[i] = v;
+}
+
+/* Simple top-down heapsort. */
+static
+void
+tr_heapsort(const int *ISAd, int *SA, int size) {
+  int i, m;
+  int t;
+
+  m = size;
+  if((size % 2) == 0) {
+    m--;
+    if(ISAd[SA[m / 2]] < ISAd[SA[m]]) { SWAP(SA[m], SA[m / 2]); }
+  }
+
+  for(i = m / 2 - 1; 0 <= i; --i) { tr_fixdown(ISAd, SA, i, m); }
+  if((size % 2) == 0) { SWAP(SA[0], SA[m]); tr_fixdown(ISAd, SA, 0, m); }
+  for(i = m - 1; 0 < i; --i) {
+    t = SA[0], SA[0] = SA[i];
+    tr_fixdown(ISAd, SA, 0, i);
+    SA[i] = t;
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Returns the median of three elements. */
+static INLINE
+int *
+tr_median3(const int *ISAd, int *v1, int *v2, int *v3) {
+  int *t;
+  if(ISAd[*v1] > ISAd[*v2]) { SWAP(v1, v2); }
+  if(ISAd[*v2] > ISAd[*v3]) {
+    if(ISAd[*v1] > ISAd[*v3]) { return v1; }
+    else { return v3; }
+  }
+  return v2;
+}
+
+/* Returns the median of five elements. */
+static INLINE
+int *
+tr_median5(const int *ISAd,
+           int *v1, int *v2, int *v3, int *v4, int *v5) {
+  int *t;
+  if(ISAd[*v2] > ISAd[*v3]) { SWAP(v2, v3); }
+  if(ISAd[*v4] > ISAd[*v5]) { SWAP(v4, v5); }
+  if(ISAd[*v2] > ISAd[*v4]) { SWAP(v2, v4); SWAP(v3, v5); }
+  if(ISAd[*v1] > ISAd[*v3]) { SWAP(v1, v3); }
+  if(ISAd[*v1] > ISAd[*v4]) { SWAP(v1, v4); SWAP(v3, v5); }
+  if(ISAd[*v3] > ISAd[*v4]) { return v4; }
+  return v3;
+}
+
+/* Returns the pivot element. */
+static INLINE
+int *
+tr_pivot(const int *ISAd, int *first, int *last) {
+  int *middle;
+  int t;
+
+  t = last - first;
+  middle = first + t / 2;
+
+  if(t <= 512) {
+    if(t <= 32) {
+      return tr_median3(ISAd, first, middle, last - 1);
+    } else {
+      t >>= 2;
+      return tr_median5(ISAd, first, first + t, middle, last - 1 - t, last - 1);
+    }
+  }
+  t >>= 3;
+  first  = tr_median3(ISAd, first, first + t, first + (t << 1));
+  middle = tr_median3(ISAd, middle - t, middle, middle + t);
+  last   = tr_median3(ISAd, last - 1 - (t << 1), last - 1 - t, last - 1);
+  return tr_median3(ISAd, first, middle, last);
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+typedef struct _trbudget_t trbudget_t;
+struct _trbudget_t {
+  int chance;
+  int remain;
+  int incval;
+  int count;
+};
+
+static INLINE
+void
+trbudget_init(trbudget_t *budget, int chance, int incval) {
+  budget->chance = chance;
+  budget->remain = budget->incval = incval;
+}
+
+static INLINE
+int
+trbudget_check(trbudget_t *budget, int size) {
+  if(size <= budget->remain) { budget->remain -= size; return 1; }
+  if(budget->chance == 0) { budget->count += size; return 0; }
+  budget->remain += budget->incval - size;
+  budget->chance -= 1;
+  return 1;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+static INLINE
+void
+tr_partition(const int *ISAd,
+             int *first, int *middle, int *last,
+             int **pa, int **pb, int v) {
+  int *a, *b, *c, *d, *e, *f;
+  int t, s;
+  int x = 0;
+
+  for(b = middle - 1; (++b < last) && ((x = ISAd[*b]) == v);) { }
+  if(((a = b) < last) && (x < v)) {
+    for(; (++b < last) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; }
+    }
+  }
+  for(c = last; (b < --c) && ((x = ISAd[*c]) == v);) { }
+  if((b < (d = c)) && (x > v)) {
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; }
+    }
+  }
+  for(; b < c;) {
+    SWAP(*b, *c);
+    for(; (++b < c) && ((x = ISAd[*b]) <= v);) {
+      if(x == v) { SWAP(*b, *a); ++a; }
+    }
+    for(; (b < --c) && ((x = ISAd[*c]) >= v);) {
+      if(x == v) { SWAP(*c, *d); --d; }
+    }
+  }
+
+  if(a <= d) {
+    c = b - 1;
+    if((s = a - first) > (t = b - a)) { s = t; }
+    for(e = first, f = b - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    if((s = d - c) > (t = last - d - 1)) { s = t; }
+    for(e = b, f = last - s; 0 < s; --s, ++e, ++f) { SWAP(*e, *f); }
+    first += (b - a), last -= (d - c);
+  }
+  *pa = first, *pb = last;
+}
+
+static
+void
+tr_copy(int *ISA, const int *SA,
+        int *first, int *a, int *b, int *last,
+        int depth) {
+  /* sort suffixes of middle partition
+     by using sorted order of suffixes of left and right partition. */
+  int *c, *d, *e;
+  int s, v;
+
+  v = b - SA - 1;
+  for(c = first, d = a - 1; c <= d; ++c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      ISA[s] = d - SA;
+    }
+  }
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      ISA[s] = d - SA;
+    }
+  }
+}
+
+static
+void
+tr_partialcopy(int *ISA, const int *SA,
+               int *first, int *a, int *b, int *last,
+               int depth) {
+  int *c, *d, *e;
+  int s, v;
+  int rank, lastrank, newrank = -1;
+
+  v = b - SA - 1;
+  lastrank = -1;
+  for(c = first, d = a - 1; c <= d; ++c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *++d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+      ISA[s] = newrank;
+    }
+  }
+
+  lastrank = -1;
+  for(e = d; first <= e; --e) {
+    rank = ISA[*e];
+    if(lastrank != rank) { lastrank = rank; newrank = e - SA; }
+    if(newrank != rank) { ISA[*e] = newrank; }
+  }
+
+  lastrank = -1;
+  for(c = last - 1, e = d + 1, d = b; e < d; --c) {
+    if((0 <= (s = *c - depth)) && (ISA[s] == v)) {
+      *--d = s;
+      rank = ISA[s + depth];
+      if(lastrank != rank) { lastrank = rank; newrank = d - SA; }
+      ISA[s] = newrank;
+    }
+  }
+}
+
+static
+void
+tr_introsort(int *ISA, const int *ISAd,
+             int *SA, int *first, int *last,
+             trbudget_t *budget) {
+#define STACK_SIZE TR_STACKSIZE
+  struct { const int *a; int *b, *c; int d, e; }stack[STACK_SIZE];
+  int *a, *b, *c;
+  int t;
+  int v, x = 0;
+  int incr = ISAd - ISA;
+  int limit, next;
+  int ssize, trlink = -1;
+
+  for(ssize = 0, limit = tr_ilg(last - first);;) {
+
+    if(limit < 0) {
+      if(limit == -1) {
+        /* tandem repeat partition */
+        tr_partition(ISAd - incr, first, first, last, &a, &b, last - SA - 1);
+
+        /* update ranks */
+        if(a < last) {
+          for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+        }
+        if(b < last) {
+          for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; }
+        }
+
+        /* push */
+        if(1 < (b - a)) {
+          STACK_PUSH5(NULL, a, b, 0, 0);
+          STACK_PUSH5(ISAd - incr, first, last, -2, trlink);
+          trlink = ssize - 2;
+        }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, tr_ilg(last - b), trlink);
+            last = a, limit = tr_ilg(a - first);
+          } else if(1 < (last - b)) {
+            first = b, limit = tr_ilg(last - b);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, tr_ilg(a - first), trlink);
+            first = b, limit = tr_ilg(last - b);
+          } else if(1 < (a - first)) {
+            last = a, limit = tr_ilg(a - first);
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      } else if(limit == -2) {
+        /* tandem repeat copy */
+        a = stack[--ssize].b, b = stack[ssize].c;
+        if(stack[ssize].d == 0) {
+          tr_copy(ISA, SA, first, a, b, last, ISAd - ISA);
+        } else {
+          if(0 <= trlink) { stack[trlink].d = -1; }
+          tr_partialcopy(ISA, SA, first, a, b, last, ISAd - ISA);
+        }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      } else {
+        /* sorted partition */
+        if(0 <= *first) {
+          a = first;
+          do { ISA[*a] = a - SA; } while((++a < last) && (0 <= *a));
+          first = a;
+        }
+        if(first < last) {
+          a = first; do { *a = ~*a; } while(*++a < 0);
+          next = (ISA[*a] != ISAd[*a]) ? tr_ilg(a - first + 1) : -1;
+          if(++a < last) { for(b = first, v = a - SA - 1; b < a; ++b) { ISA[*b] = v; } }
+
+          /* push */
+          if(trbudget_check(budget, a - first)) {
+            if((a - first) <= (last - a)) {
+              STACK_PUSH5(ISAd, a, last, -3, trlink);
+              ISAd += incr, last = a, limit = next;
+            } else {
+              if(1 < (last - a)) {
+                STACK_PUSH5(ISAd + incr, first, a, next, trlink);
+                first = a, limit = -3;
+              } else {
+                ISAd += incr, last = a, limit = next;
+              }
+            }
+          } else {
+            if(0 <= trlink) { stack[trlink].d = -1; }
+            if(1 < (last - a)) {
+              first = a, limit = -3;
+            } else {
+              STACK_POP5(ISAd, first, last, limit, trlink);
+            }
+          }
+        } else {
+          STACK_POP5(ISAd, first, last, limit, trlink);
+        }
+      }
+      continue;
+    }
+
+    if((last - first) <= TR_INSERTIONSORT_THRESHOLD) {
+      tr_insertionsort(ISAd, first, last);
+      limit = -3;
+      continue;
+    }
+
+    if(limit-- == 0) {
+      tr_heapsort(ISAd, first, last - first);
+      for(a = last - 1; first < a; a = b) {
+        for(x = ISAd[*a], b = a - 1; (first <= b) && (ISAd[*b] == x); --b) { *b = ~*b; }
+      }
+      limit = -3;
+      continue;
+    }
+
+    /* choose pivot */
+    a = tr_pivot(ISAd, first, last);
+    SWAP(*first, *a);
+    v = ISAd[*first];
+
+    /* partition */
+    tr_partition(ISAd, first, first + 1, last, &a, &b, v);
+    if((last - first) != (b - a)) {
+      next = (ISA[*a] != v) ? tr_ilg(b - a) : -1;
+
+      /* update ranks */
+      for(c = first, v = a - SA - 1; c < a; ++c) { ISA[*c] = v; }
+      if(b < last) { for(c = a, v = b - SA - 1; c < b; ++c) { ISA[*c] = v; } }
+
+      /* push */
+      if((1 < (b - a)) && (trbudget_check(budget, b - a))) {
+        if((a - first) <= (last - b)) {
+          if((last - b) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              last = a;
+            } else if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((a - first) <= (b - a)) {
+            if(1 < (a - first)) {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              STACK_PUSH5(ISAd, b, last, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        } else {
+          if((a - first) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              first = b;
+            } else if(1 < (a - first)) {
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              last = a;
+            } else {
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else if((last - b) <= (b - a)) {
+            if(1 < (last - b)) {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              STACK_PUSH5(ISAd + incr, a, b, next, trlink);
+              first = b;
+            } else {
+              STACK_PUSH5(ISAd, first, a, limit, trlink);
+              ISAd += incr, first = a, last = b, limit = next;
+            }
+          } else {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            ISAd += incr, first = a, last = b, limit = next;
+          }
+        }
+      } else {
+        if((1 < (b - a)) && (0 <= trlink)) { stack[trlink].d = -1; }
+        if((a - first) <= (last - b)) {
+          if(1 < (a - first)) {
+            STACK_PUSH5(ISAd, b, last, limit, trlink);
+            last = a;
+          } else if(1 < (last - b)) {
+            first = b;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        } else {
+          if(1 < (last - b)) {
+            STACK_PUSH5(ISAd, first, a, limit, trlink);
+            first = b;
+          } else if(1 < (a - first)) {
+            last = a;
+          } else {
+            STACK_POP5(ISAd, first, last, limit, trlink);
+          }
+        }
+      }
+    } else {
+      if(trbudget_check(budget, last - first)) {
+        limit = tr_ilg(last - first), ISAd += incr;
+      } else {
+        if(0 <= trlink) { stack[trlink].d = -1; }
+        STACK_POP5(ISAd, first, last, limit, trlink);
+      }
+    }
+  }
+#undef STACK_SIZE
+}
+
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Tandem repeat sort */
+static
+void
+trsort(int *ISA, int *SA, int n, int depth) {
+  int *ISAd;
+  int *first, *last;
+  trbudget_t budget;
+  int t, skip, unsorted;
+
+  trbudget_init(&budget, tr_ilg(n) * 2 / 3, n);
+/*  trbudget_init(&budget, tr_ilg(n) * 3 / 4, n); */
+  for(ISAd = ISA + depth; -n < *SA; ISAd += ISAd - ISA) {
+    first = SA;
+    skip = 0;
+    unsorted = 0;
+    do {
+      if((t = *first) < 0) { first -= t; skip += t; }
+      else {
+        if(skip != 0) { *(first + skip) = skip; skip = 0; }
+        last = SA + ISA[t] + 1;
+        if(1 < (last - first)) {
+          budget.count = 0;
+          tr_introsort(ISA, ISAd, SA, first, last, &budget);
+          if(budget.count != 0) { unsorted += budget.count; }
+          else { skip = first - last; }
+        } else if((last - first) == 1) {
+          skip = -1;
+        }
+        first = last;
+      }
+    } while(first < (SA + n));
+    if(skip != 0) { *(first + skip) = skip; }
+    if(unsorted == 0) { break; }
+  }
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/* Sorts suffixes of type B*. */
+static
+int
+sort_typeBstar(const unsigned char *T, int *SA,
+               int *bucket_A, int *bucket_B,
+               int n) {
+  int *PAb, *ISAb, *buf;
+#ifdef _OPENMP
+  int *curbuf;
+  int l;
+#endif
+  int i, j, k, t, m, bufsize;
+  int c0, c1;
+#ifdef _OPENMP
+  int d0, d1;
+  int tmp;
+#endif
+
+  /* Initialize bucket arrays. */
+  for(i = 0; i < BUCKET_A_SIZE; ++i) { bucket_A[i] = 0; }
+  for(i = 0; i < BUCKET_B_SIZE; ++i) { bucket_B[i] = 0; }
+
+  /* Count the number of occurrences of the first one or two characters of each
+     type A, B and B* suffix. Moreover, store the beginning position of all
+     type B* suffixes into the array SA. */
+  for(i = n - 1, m = n, c0 = T[n - 1]; 0 <= i;) {
+    /* type A suffix. */
+    do { ++BUCKET_A(c1 = c0); } while((0 <= --i) && ((c0 = T[i]) >= c1));
+    if(0 <= i) {
+      /* type B* suffix. */
+      ++BUCKET_BSTAR(c0, c1);
+      SA[--m] = i;
+      /* type B suffix. */
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) {
+        ++BUCKET_B(c0, c1);
+      }
+    }
+  }
+  m = n - m;
+/*
+note:
+  A type B* suffix is lexicographically smaller than a type B suffix that
+  begins with the same first two characters.
+*/
+
+  /* Calculate the index of start/end point of each bucket. */
+  for(c0 = 0, i = 0, j = 0; c0 < ALPHABET_SIZE; ++c0) {
+    t = i + BUCKET_A(c0);
+    BUCKET_A(c0) = i + j; /* start point */
+    i = t + BUCKET_B(c0, c0);
+    for(c1 = c0 + 1; c1 < ALPHABET_SIZE; ++c1) {
+      j += BUCKET_BSTAR(c0, c1);
+      BUCKET_BSTAR(c0, c1) = j; /* end point */
+      i += BUCKET_B(c0, c1);
+    }
+  }
+
+  if(0 < m) {
+    /* Sort the type B* suffixes by their first two characters. */
+    PAb = SA + n - m; ISAb = SA + m;
+    for(i = m - 2; 0 <= i; --i) {
+      t = PAb[i], c0 = T[t], c1 = T[t + 1];
+      SA[--BUCKET_BSTAR(c0, c1)] = i;
+    }
+    t = PAb[m - 1], c0 = T[t], c1 = T[t + 1];
+    SA[--BUCKET_BSTAR(c0, c1)] = m - 1;
+
+    /* Sort the type B* substrings using sssort. */
+#ifdef _OPENMP
+    tmp = omp_get_max_threads();
+    buf = SA + m, bufsize = (n - (2 * m)) / tmp;
+    c0 = ALPHABET_SIZE - 2, c1 = ALPHABET_SIZE - 1, j = m;
+#pragma omp parallel default(shared) private(curbuf, k, l, d0, d1, tmp)
+    {
+      tmp = omp_get_thread_num();
+      curbuf = buf + tmp * bufsize;
+      k = 0;
+      for(;;) {
+        #pragma omp critical(sssort_lock)
+        {
+          if(0 < (l = j)) {
+            d0 = c0, d1 = c1;
+            do {
+              k = BUCKET_BSTAR(d0, d1);
+              if(--d1 <= d0) {
+                d1 = ALPHABET_SIZE - 1;
+                if(--d0 < 0) { break; }
+              }
+            } while(((l - k) <= 1) && (0 < (l = k)));
+            c0 = d0, c1 = d1, j = k;
+          }
+        }
+        if(l == 0) { break; }
+        sssort(T, PAb, SA + k, SA + l,
+               curbuf, bufsize, 2, n, *(SA + k) == (m - 1));
+      }
+    }
+#else
+    buf = SA + m, bufsize = n - (2 * m);
+    for(c0 = ALPHABET_SIZE - 2, j = m; 0 < j; --c0) {
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; j = i, --c1) {
+        i = BUCKET_BSTAR(c0, c1);
+        if(1 < (j - i)) {
+          sssort(T, PAb, SA + i, SA + j,
+                 buf, bufsize, 2, n, *(SA + i) == (m - 1));
+        }
+      }
+    }
+#endif
+
+    /* Compute ranks of type B* substrings. */
+    for(i = m - 1; 0 <= i; --i) {
+      if(0 <= SA[i]) {
+        j = i;
+        do { ISAb[SA[i]] = i; } while((0 <= --i) && (0 <= SA[i]));
+        SA[i + 1] = i - j;
+        if(i <= 0) { break; }
+      }
+      j = i;
+      do { ISAb[SA[i] = ~SA[i]] = j; } while(SA[--i] < 0);
+      ISAb[SA[i]] = j;
+    }
+
+    /* Construct the inverse suffix array of type B* suffixes using trsort. */
+    trsort(ISAb, SA, m, 1);
+
+    /* Set the sorted order of tyoe B* suffixes. */
+    for(i = n - 1, j = m, c0 = T[n - 1]; 0 <= i;) {
+      for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) >= c1); --i, c1 = c0) { }
+      if(0 <= i) {
+        t = i;
+        for(--i, c1 = c0; (0 <= i) && ((c0 = T[i]) <= c1); --i, c1 = c0) { }
+        SA[ISAb[--j]] = ((t == 0) || (1 < (t - i))) ? t : ~t;
+      }
+    }
+
+    /* Calculate the index of start/end point of each bucket. */
+    BUCKET_B(ALPHABET_SIZE - 1, ALPHABET_SIZE - 1) = n; /* end point */
+    for(c0 = ALPHABET_SIZE - 2, k = m - 1; 0 <= c0; --c0) {
+      i = BUCKET_A(c0 + 1) - 1;
+      for(c1 = ALPHABET_SIZE - 1; c0 < c1; --c1) {
+        t = i - BUCKET_B(c0, c1);
+        BUCKET_B(c0, c1) = i; /* end point */
+
+        /* Move all type B* suffixes to the correct position. */
+        for(i = t, j = BUCKET_BSTAR(c0, c1);
+            j <= k;
+            --i, --k) { SA[i] = SA[k]; }
+      }
+      BUCKET_BSTAR(c0, c0 + 1) = i - BUCKET_B(c0, c0) + 1; /* start point */
+      BUCKET_B(c0, c0) = i; /* end point */
+    }
+  }
+
+  return m;
+}
+
+/* Constructs the suffix array by using the sorted order of type B* suffixes. */
+static
+void
+construct_SA(const unsigned char *T, int *SA,
+             int *bucket_A, int *bucket_B,
+             int n, int m) {
+  int *i, *j, *k;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          *j = ~s;
+          c0 = T[--s];
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else {
+          assert(((s == 0) && (T[s] == c1)) || (s < 0));
+          *j = ~s;
+        }
+      }
+    }
+  }
+
+  /* Construct the suffix array by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~(n - 1) : (n - 1);
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      if((s == 0) || (T[s - 1] < c0)) { s = ~s; }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else {
+      assert(s < 0);
+      *i = ~s;
+    }
+  }
+}
+
+/* Constructs the burrows-wheeler transformed string directly
+   by using the sorted order of type B* suffixes. */
+static
+int
+construct_BWT(const unsigned char *T, int *SA,
+              int *bucket_A, int *bucket_B,
+              int n, int m) {
+  int *i, *j, *k, *orig;
+  int s;
+  int c0, c1, c2;
+
+  if(0 < m) {
+    /* Construct the sorted order of type B suffixes by using
+       the sorted order of type B* suffixes. */
+    for(c1 = ALPHABET_SIZE - 2; 0 <= c1; --c1) {
+      /* Scan the suffix array from right to left. */
+      for(i = SA + BUCKET_BSTAR(c1, c1 + 1),
+          j = SA + BUCKET_A(c1 + 1) - 1, k = NULL, c2 = -1;
+          i <= j;
+          --j) {
+        if(0 < (s = *j)) {
+          assert(T[s] == c1);
+          assert(((s + 1) < n) && (T[s] <= T[s + 1]));
+          assert(T[s - 1] <= T[s]);
+          c0 = T[--s];
+          *j = ~((int)c0);
+          if((0 < s) && (T[s - 1] > c0)) { s = ~s; }
+          if(c0 != c2) {
+            if(0 <= c2) { BUCKET_B(c2, c1) = k - SA; }
+            k = SA + BUCKET_B(c2 = c0, c1);
+          }
+          assert(k < j);
+          *k-- = s;
+        } else if(s != 0) {
+          *j = ~s;
+#ifndef NDEBUG
+        } else {
+          assert(T[s] == c1);
+#endif
+        }
+      }
+    }
+  }
+
+  /* Construct the BWTed string by using
+     the sorted order of type B suffixes. */
+  k = SA + BUCKET_A(c2 = T[n - 1]);
+  *k++ = (T[n - 2] < c2) ? ~((int)T[n - 2]) : (n - 1);
+  /* Scan the suffix array from left to right. */
+  for(i = SA, j = SA + n, orig = SA; i < j; ++i) {
+    if(0 < (s = *i)) {
+      assert(T[s - 1] >= T[s]);
+      c0 = T[--s];
+      *i = c0;
+      if((0 < s) && (T[s - 1] < c0)) { s = ~((int)T[s - 1]); }
+      if(c0 != c2) {
+        BUCKET_A(c2) = k - SA;
+        k = SA + BUCKET_A(c2 = c0);
+      }
+      assert(i < k);
+      *k++ = s;
+    } else if(s != 0) {
+      *i = ~s;
+    } else {
+      orig = i;
+    }
+  }
+
+  return orig - SA;
+}
+
+
+/*---------------------------------------------------------------------------*/
+
+/*- Function -*/
+
+int
+divsufsort(const unsigned char *T, int *SA, int n) {
+  int *bucket_A, *bucket_B;
+  int m;
+  int err = 0;
+
+  /* Check arguments. */
+  if((T == NULL) || (SA == NULL) || (n < 0)) { return -1; }
+  else if(n == 0) { return 0; }
+  else if(n == 1) { SA[0] = 0; return 0; }
+  else if(n == 2) { m = (T[0] < T[1]); SA[m ^ 1] = 0, SA[m] = 1; return 0; }
+
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  /* Suffixsort. */
+  if((bucket_A != NULL) && (bucket_B != NULL)) {
+    m = sort_typeBstar(T, SA, bucket_A, bucket_B, n);
+    construct_SA(T, SA, bucket_A, bucket_B, n, m);
+  } else {
+    err = -2;
+  }
+
+  free(bucket_B);
+  free(bucket_A);
+
+  return err;
+}
+
+int
+divbwt(const unsigned char *T, unsigned char *U, int *A, int n) {
+  int *B;
+  int *bucket_A, *bucket_B;
+  int m, pidx, i;
+
+  /* Check arguments. */
+  if((T == NULL) || (U == NULL) || (n < 0)) { return -1; }
+  else if(n <= 1) { if(n == 1) { U[0] = T[0]; } return n; }
+
+  if((B = A) == NULL) { B = (int *)malloc((size_t)(n + 1) * sizeof(int)); }
+  bucket_A = (int *)malloc(BUCKET_A_SIZE * sizeof(int));
+  bucket_B = (int *)malloc(BUCKET_B_SIZE * sizeof(int));
+
+  /* Burrows-Wheeler Transform. */
+  if((B != NULL) && (bucket_A != NULL) && (bucket_B != NULL)) {
+    m = sort_typeBstar(T, B, bucket_A, bucket_B, n);
+    pidx = construct_BWT(T, B, bucket_A, bucket_B, n, m);
+
+    /* Copy to output string. */
+    U[0] = T[n - 1];
+    for(i = 0; i < pidx; ++i) { U[i + 1] = (unsigned char)B[i]; }
+    for(i += 1; i < n; ++i) { U[i] = (unsigned char)B[i]; }
+    pidx += 1;
+  } else {
+    pidx = -2;
+  }
+
+  free(bucket_B);
+  free(bucket_A);
+  if(A == NULL) { free(B); }
+
+  return pidx;
+}
+
+// End divsufsort.c
+
+/////////////////////////////// add ///////////////////////////////////
+
+// Convert non-negative decimal number x to string of at least n digits
+std::string itos(int64_t x, int n=1) {
+  assert(x>=0);
+  assert(n>=0);
+  std::string r;
+  for (; x || n>0; x/=10, --n) r=std::string(1, '0'+x%10)+r;
+  return r;
+}
+
+// E8E9 transform of buf[0..n-1] to improve compression of .exe and .dll.
+// Patterns (E8|E9 xx xx xx 00|FF) at offset i replace the 3 middle
+// bytes with x+i mod 2^24, LSB first, reading backward.
+void e8e9(unsigned char* buf, int n) {
+  for (int i=n-5; i>=0; --i) {
+    if (((buf[i]&254)==0xe8) && ((buf[i+4]+1)&254)==0) {
+      unsigned a=(buf[i+1]|buf[i+2]<<8|buf[i+3]<<16)+i;
+      buf[i+1]=a;
+      buf[i+2]=a>>8;
+      buf[i+3]=a>>16;
+    }
+  }
+}
+
+// Encode inbuf to buf using LZ77. args are as follows:
+// args[0] is log2 buffer size in MB.
+// args[1] is level (1=var. length, 2=byte aligned lz77, 3=bwt) + 4 if E8E9.
+// args[2] is the lz77 minimum match length and context order.
+// args[3] is the lz77 higher context order to search first, or else 0.
+// args[4] is the log2 hash bucket size (number of searches).
+// args[5] is the log2 hash table size. If 21+args[0] then use a suffix array.
+// args[6] is the secondary context look ahead
+// sap is pointer to external suffix array of inbuf or 0. If supplied and
+//   args[0]=5..7 then it is assumed that E8E9 was already applied to
+//   both the input and sap and the input buffer is not modified.
+
+class LZBuffer: public libzpaq::Reader {
+  libzpaq::Array<unsigned> ht;// hash table, confirm in low bits, or SA+ISA
+  const unsigned char* in;    // input pointer
+  const int checkbits;        // hash confirmation size or lg(ISA size)
+  const int level;            // 1=var length LZ77, 2=byte aligned LZ77, 3=BWT
+  const unsigned htsize;      // size of hash table
+  const unsigned n;           // input length
+  unsigned i;                 // current location in in (0 <= i < n)
+  const unsigned minMatch;    // minimum match length
+  const unsigned minMatch2;   // second context order or 0 if not used
+  const unsigned maxMatch;    // longest match length allowed
+  const unsigned maxLiteral;  // longest literal length allowed
+  const unsigned lookahead;   // second context look ahead
+  unsigned h1, h2;            // low, high order context hashes of in[i..]
+  const unsigned bucket;      // number of matches to search per hash - 1
+  const unsigned shift1, shift2;  // how far to shift h1, h2 per hash
+  const int minMatchBoth;     // max(minMatch, minMatch2)
+  const unsigned rb;          // number of level 1 r bits in match code
+  unsigned bits;              // pending output bits (level 1)
+  unsigned nbits;             // number of bits in bits
+  unsigned rpos, wpos;        // read, write pointers
+  unsigned idx;               // BWT index
+  const unsigned* sa;         // suffix array for BWT or LZ77-SA
+  unsigned* isa;              // inverse suffix array for LZ77-SA
+  enum {BUFSIZE=1<<14};       // output buffer size
+  unsigned char buf[BUFSIZE]; // output buffer
+
+  void write_literal(unsigned i, unsigned& lit);
+  void write_match(unsigned len, unsigned off);
+  void fill();  // encode to buf
+
+  // write k bits of x
+  void putb(unsigned x, int k) {
+    x&=(1<<k)-1;
+    bits|=x<<nbits;
+    nbits+=k;
+    while (nbits>7) {
+      assert(wpos<BUFSIZE);
+      buf[wpos++]=bits, bits>>=8, nbits-=8;
+    }
+  }
+
+  // write last byte
+  void flush() {
+    assert(wpos<BUFSIZE);
+    if (nbits>0) buf[wpos++]=bits;
+    bits=nbits=0;
+  }
+
+  // write 1 byte
+  void put(int c) {
+    assert(wpos<BUFSIZE);
+    buf[wpos++]=c;
+  }
+
+public:
+  LZBuffer(StringBuffer& inbuf, int args[], const unsigned* sap=0);
+
+  // return 1 byte of compressed output (overrides Reader)
+  int get() {
+    int c=-1;
+    if (rpos==wpos) fill();
+    if (rpos<wpos) c=buf[rpos++];
+    if (rpos==wpos) rpos=wpos=0;
+    return c;
+  }
+
+  // Read up to p[0..n-1] and return bytes read.
+  int read(char* p, int n);
+};
+
+// LZ/BWT preprocessor for levels 1..3 compression and e8e9 filter.
+// Level 1 uses variable length LZ77 codes like in the lazy compressor:
+//
+//   00,n,L[n] = n literal bytes
+//   mm,mmm,n,ll,r,q (mm > 00) = match 4*n+ll at offset (q<<rb)+r-1
+//
+// where q is written in 8mm+mmm-8 (0..23) bits with an implied leading 1 bit
+// and n is written using interleaved Elias Gamma coding, i.e. the leading
+// 1 bit is implied, remaining bits are preceded by a 1 and terminated by
+// a 0. e.g. abc is written 1,b,1,c,0. Codes are packed LSB first and
+// padded with leading 0 bits in the last byte. r is a number with rb bits,
+// where rb = log2(blocksize) - 24.
+//
+// Level 2 is byte oriented LZ77 with minimum match length m = $4 = args[3]
+// with m in 1..64. Lengths and offsets are MSB first:
+// 00xxxxxx   x+1 (1..64) literals follow
+// yyxxxxxx   y+1 (2..4) offset bytes follow, match length x+m (m..m+63)
+//
+// Level 3 is BWT with the end of string byte coded as 255 and the
+// last 4 bytes giving its position LSB first.
+
+// floor(log2(x)) + 1 = number of bits excluding leading zeros (0..32)
+int lg(unsigned x) {
+  unsigned r=0;
+  if (x>=65536) r=16, x>>=16;
+  if (x>=256) r+=8, x>>=8;
+  if (x>=16) r+=4, x>>=4;
+  assert(x>=0 && x<16);
+  return
+    "\x00\x01\x02\x02\x03\x03\x03\x03\x04\x04\x04\x04\x04\x04\x04\x04"[x]+r;
+}
+
+// return number of 1 bits in x
+int nbits(unsigned x) {
+  int r;
+  for (r=0; x; x>>=1) r+=x&1;
+  return r;
+}
+
+// Read n bytes of compressed output into p and return number of
+// bytes read in 0..n. 0 signals EOF (overrides Reader).
+int LZBuffer::read(char* p, int n) {
+  if (rpos==wpos) fill();
+  int nr=n;
+  if (nr>int(wpos-rpos)) nr=wpos-rpos;
+  if (nr) memcpy(p, buf+rpos, nr);
+  rpos+=nr;
+  assert(rpos<=wpos);
+  if (rpos==wpos) rpos=wpos=0;
+  return nr;
+}
+
+LZBuffer::LZBuffer(StringBuffer& inbuf, int args[], const unsigned* sap):
+    ht((args[1]&3)==3 ? (inbuf.size()+1)*!sap      // for BWT suffix array
+        : args[5]-args[0]<21 ? 1u<<args[5]         // for LZ77 hash table
+        : (inbuf.size()*!sap)+(1u<<17<<args[0])),  // for LZ77 SA and ISA
+    in(inbuf.data()),
+    checkbits(args[5]-args[0]<21 ? 12-args[0] : 17+args[0]),
+    level(args[1]&3),
+    htsize(ht.size()),
+    n(inbuf.size()),
+    i(0),
+    minMatch(args[2]),
+    minMatch2(args[3]),
+    maxMatch(BUFSIZE*3),
+    maxLiteral(BUFSIZE/4),
+    lookahead(args[6]),
+    h1(0), h2(0),
+    bucket((1<<args[4])-1), 
+    shift1(minMatch>0 ? (args[5]-1)/minMatch+1 : 1),
+    shift2(minMatch2>0 ? (args[5]-1)/minMatch2+1 : 0),
+    minMatchBoth(MAX(minMatch, minMatch2+lookahead)+4),
+    rb(args[0]>4 ? args[0]-4 : 0),
+    bits(0), nbits(0), rpos(0), wpos(0),
+    idx(0), sa(0), isa(0) {
+  assert(args[0]>=0);
+  assert(n<=(1u<<20<<args[0]));
+  assert(args[1]>=1 && args[1]<=7 && args[1]!=4);
+  assert(level>=1 && level<=3);
+  if ((minMatch<4 && level==1) || (minMatch<1 && level==2))
+    error("match length $3 too small");
+
+  // e8e9 transform
+  if (args[1]>4 && !sap) e8e9(inbuf.data(), n);
+
+  // build suffix array if not supplied
+  if (args[5]-args[0]>=21 || level==3) {  // LZ77-SA or BWT
+    if (sap)
+      sa=sap;
+    else {
+      assert(ht.size()>=n);
+      assert(ht.size()>0);
+      sa=&ht[0];
+      if (n>0) divsufsort((const unsigned char*)in, (int*)sa, n);
+    }
+    if (level<3) {
+      assert(ht.size()>=(n*(sap==0))+(1u<<17<<args[0]));
+      isa=&ht[n*(sap==0)];
+    }
+  }
+}
+
+// Encode from in to buf until end of input or buf is not empty
+void LZBuffer::fill() {
+
+  // BWT
+  if (level==3) {
+    assert(in || n==0);
+    assert(sa);
+    for (; wpos<BUFSIZE && i<n+5; ++i) {
+      if (i==0) put(n>0 ? in[n-1] : 255);
+      else if (i>n) put(idx&255), idx>>=8;
+      else if (sa[i-1]==0) idx=i, put(255);
+      else put(in[sa[i-1]-1]);
+    }
+    return;
+  }
+
+  // LZ77: scan the input
+  unsigned lit=0;  // number of output literals pending
+  const unsigned mask=(1<<checkbits)-1;
+  while (i<n && wpos*2<BUFSIZE) {
+
+    // Search for longest match, or pick closest in case of tie
+    unsigned blen=minMatch-1;  // best match length
+    unsigned bp=0;  // pointer to best match
+    unsigned blit=0;  // literals before best match
+    int bscore=0;  // best cost
+
+    // Look up contexts in suffix array
+    if (isa) {
+      if (sa[isa[i&mask]]!=i) // rebuild ISA
+        for (unsigned j=0; j<n; ++j)
+          if ((sa[j]&~mask)==(i&~mask))
+            isa[sa[j]&mask]=j;
+      for (unsigned h=0; h<=lookahead; ++h) {
+        unsigned q=isa[(h+i)&mask];  // location of h+i in SA
+        assert(q<n);
+        if (sa[q]!=h+i) continue;
+        for (int j=-1; j<=1; j+=2) {  // search backward and forward
+          for (unsigned k=1; k<=bucket; ++k) {
+            unsigned p;  // match to be tested
+            if (q+j*k<n && (p=sa[q+j*k]-h)<i) {
+              assert(p<n);
+              unsigned l, l1;  // length of match, leading literals
+              for (l=h; i+l<n && l<maxMatch && in[p+l]==in[i+l]; ++l);
+              for (l1=h; l1>0 && in[p+l1-1]==in[i+l1-1]; --l1);
+              int score=int(l-l1)*8-lg(i-p)-4*(lit==0 && l1>0)-11;
+              for (unsigned a=0; a<h; ++a) score=score*5/8;
+              if (score>bscore) blen=l, bp=p, blit=l1, bscore=score;
+              if (l<blen || l<minMatch || l>255) break;
+            }
+          }
+        }
+        if (bscore<=0 || blen<minMatch) break;
+      }
+    }
+
+    // Look up contexts in a hash table.
+    // Try the longest context orders first. If a match is found, then
+    // skip the lower order as a speed optimization.
+    else if (level==1 || minMatch<=64) {
+      if (minMatch2>0) {
+        for (unsigned k=0; k<=bucket; ++k) {
+          unsigned p=ht[h2^k];
+          if (p && (p&mask)==(in[i+3]&mask)) {
+            p>>=checkbits;
+            if (p<i && i+blen<=n && in[p+blen-1]==in[i+blen-1]) {
+              unsigned l;  // match length from lookahead
+              for (l=lookahead; i+l<n && l<maxMatch && in[p+l]==in[i+l]; ++l);
+              if (l>=minMatch2+lookahead) {
+                int l1;  // length back from lookahead
+                for (l1=lookahead; l1>0 && in[p+l1-1]==in[i+l1-1]; --l1);
+                assert(l1>=0 && l1<=int(lookahead));
+                int score=int(l-l1)*8-lg(i-p)-8*(lit==0 && l1>0)-11;
+                if (score>bscore) blen=l, bp=p, blit=l1, bscore=score;
+              }
+            }
+          }
+          if (blen>=128) break;
+        }
+      }
+
+      // Search the lower order context
+      if (!minMatch2 || blen<minMatch2) {
+        for (unsigned k=0; k<=bucket; ++k) {
+          unsigned p=ht[h1^k];
+          if (p && i+3<n && (p&mask)==(in[i+3]&mask)) {
+            p>>=checkbits;
+            if (p<i && i+blen<=n && in[p+blen-1]==in[i+blen-1]) {
+              unsigned l;
+              for (l=0; i+l<n && l<maxMatch && in[p+l]==in[i+l]; ++l);
+              int score=l*8-lg(i-p)-2*(lit>0)-11;
+              if (score>bscore) blen=l, bp=p, blit=0, bscore=score;
+            }
+          }
+          if (blen>=128) break;
+        }
+      }
+    }
+
+    // If match is long enough, then output any pending literals first,
+    // and then the match. blen is the length of the match.
+    assert(i>=bp);
+    const unsigned off=i-bp;  // offset
+    if (off>0 && bscore>0
+        && blen-blit>=minMatch+(level==2)*((off>=(1<<16))+(off>=(1<<24)))) {
+      lit+=blit;
+      write_literal(i+blit, lit);
+      write_match(blen-blit, off);
+    }
+
+    // Otherwise add to literal length
+    else {
+      blen=1;
+      ++lit;
+    }
+
+    // Update index, advance blen bytes
+    if (isa)
+      i+=blen;
+    else {
+      while (blen--) {
+        if (i+minMatchBoth<n) {
+          unsigned ih=((i*1234547)>>19)&bucket;
+          const unsigned p=(i<<checkbits)|(in[i+3]&mask);
+          assert(ih<=bucket);
+          if (minMatch2) {
+            ht[h2^ih]=p;
+            h2=(((h2*9)<<shift2)
+                +(in[i+minMatch2+lookahead]+1)*23456789u)&(htsize-1);
+          }
+          ht[h1^ih]=p;
+          h1=(((h1*5)<<shift1)+(in[i+minMatch]+1)*123456791u)&(htsize-1);
+        }
+        ++i;
+      }
+    }
+
+    // Write long literals to keep buf from filling up
+    if (lit>=maxLiteral)
+      write_literal(i, lit);
+  }
+
+  // Write pending literals at end of input
+  assert(i<=n);
+  if (i==n) {
+    write_literal(n, lit);
+    flush();
+  }
+}
+
+// Write literal sequence in[i-lit..i-1], set lit=0
+void LZBuffer::write_literal(unsigned i, unsigned& lit) {
+  assert(lit>=0);
+  assert(i>=0 && i<=n);
+  assert(i>=lit);
+  if (level==1) {
+    if (lit<1) return;
+    int ll=lg(lit);
+    assert(ll>=1 && ll<=24);
+    putb(0, 2);
+    --ll;
+    while (--ll>=0) {
+      putb(1, 1);
+      putb((lit>>ll)&1, 1);
+    }
+    putb(0, 1);
+    while (lit) putb(in[i-lit--], 8);
+  }
+  else {
+    assert(level==2);
+    while (lit>0) {
+      unsigned lit1=lit;
+      if (lit1>64) lit1=64;
+      put(lit1-1);
+      for (unsigned j=i-lit; j<i-lit+lit1; ++j) put(in[j]);
+      lit-=lit1;
+    }
+  }
+}
+
+// Write match sequence of given length and offset
+void LZBuffer::write_match(unsigned len, unsigned off) {
+
+  // mm,mmm,n,ll,r,q[mmmmm-8] = match n*4+ll, offset ((q-1)<<rb)+r+1
+  if (level==1) {
+    assert(len>=minMatch && len<=maxMatch);
+    assert(off>0);
+    assert(len>=4);
+    assert(rb>=0 && rb<=8);
+    int ll=lg(len)-1;
+    assert(ll>=2);
+    off+=(1<<rb)-1;
+    int lo=lg(off)-1-rb;
+    assert(lo>=0 && lo<=23);
+    putb((lo+8)>>3, 2);// mm
+    putb(lo&7, 3);     // mmm
+    while (--ll>=2) {  // n
+      putb(1, 1);
+      putb((len>>ll)&1, 1);
+    }
+    putb(0, 1);
+    putb(len&3, 2);    // ll
+    putb(off, rb);     // r
+    putb(off>>rb, lo); // q
+  }
+
+  // x[2]:len[6] off[x-1] 
+  else {
+    assert(level==2);
+    assert(minMatch>=1 && minMatch<=64);
+    --off;
+    while (len>0) {  // Split long matches to len1=minMatch..minMatch+63
+      const unsigned len1=len>minMatch*2+63 ? minMatch+63 :
+          len>minMatch+63 ? len-minMatch : len;
+      assert(wpos<BUFSIZE-5);
+      assert(len1>=minMatch && len1<minMatch+64);
+      if (off<(1<<16)) {
+        put(64+len1-minMatch);
+        put(off>>8);
+        put(off);
+      }
+      else if (off<(1<<24)) {
+        put(128+len1-minMatch);
+        put(off>>16);
+        put(off>>8);
+        put(off);
+      }
+      else {
+        put(192+len1-minMatch);
+        put(off>>24);
+        put(off>>16);
+        put(off>>8);
+        put(off);
+      }
+      len-=len1;
+    }
+  }
+}
+
+// Generate a config file from the method argument with syntax:
+// {0|x|s|i}[N1[,N2]...][{ciamtswf<cfg>}[N1[,N2]]...]...
+std::string makeConfig(const char* method, int args[]) {
+  assert(method);
+  const char type=method[0];
+  assert(type=='x' || type=='s' || type=='0' || type=='i');
+
+  // Read "{x|s|i|0}N1,N2...N9" into args[0..8] ($1..$9)
+  args[0]=0;  // log block size in MiB
+  args[1]=0;  // 0=none, 1=var-LZ77, 2=byte-LZ77, 3=BWT, 4..7 adds E8E9
+  args[2]=0;  // lz77 minimum match length
+  args[3]=0;  // secondary context length
+  args[4]=0;  // log searches
+  args[5]=0;  // lz77 hash table size or SA if args[0]+21
+  args[6]=0;  // secondary context look ahead
+  args[7]=0;  // not used
+  args[8]=0;  // not used
+  if (isdigit(*++method)) args[0]=0;
+  for (int i=0; i<9 && (isdigit(*method) || *method==',' || *method=='.');) {
+    if (isdigit(*method))
+      args[i]=args[i]*10+*method-'0';
+    else if (++i<9)
+      args[i]=0;
+    ++method;
+  }
+
+  // "0..." = No compression
+  if (type=='0')
+    return "comp 0 0 0 0 0 hcomp end\n";
+
+  // Generate the postprocessor
+  std::string hdr, pcomp;
+  const int level=args[1]&3;
+  const bool doe8=args[1]>=4 && args[1]<=7;
+
+  // LZ77+Huffman, with or without E8E9
+  if (level==1) {
+    const int rb=args[0]>4 ? args[0]-4 : 0;
+    hdr="comp 9 16 0 $1+20 ";
+    pcomp=
+    "pcomp lazy2 3 ;\n"
+    " (r1 = state\n"
+    "  r2 = len - match or literal length\n"
+    "  r3 = m - number of offset bits expected\n"
+    "  r4 = ptr to buf\n"
+    "  r5 = r - low bits of offset\n"
+    "  c = bits - input buffer\n"
+    "  d = n - number of bits in c)\n"
+    "\n"
+    "  a> 255 if\n";
+    if (doe8)
+      pcomp+=
+      "    b=0 d=r 4 do (for b=0..d-1, d = end of buf)\n"
+      "      a=b a==d ifnot\n"
+      "        a+= 4 a<d if\n"
+      "          a=*b a&= 254 a== 232 if (e8 or e9?)\n"
+      "            c=b b++ b++ b++ b++ a=*b a++ a&= 254 a== 0 if (00 or ff)\n"
+      "              b-- a=*b\n"
+      "              b-- a<<= 8 a+=*b\n"
+      "              b-- a<<= 8 a+=*b\n"
+      "              a-=b a++\n"
+      "              *b=a a>>= 8 b++\n"
+      "              *b=a a>>= 8 b++\n"
+      "              *b=a b++\n"
+      "            endif\n"
+      "            b=c\n"
+      "          endif\n"
+      "        endif\n"
+      "        a=*b out b++\n"
+      "      forever\n"
+      "    endif\n"
+      "\n";
+    pcomp+=
+    "    (reset state)\n"
+    "    a=0 b=0 c=0 d=0 r=a 1 r=a 2 r=a 3 r=a 4\n"
+    "    halt\n"
+    "  endif\n"
+    "\n"
+    "  a<<=d a+=c c=a               (bits+=a<<n)\n"
+    "  a= 8 a+=d d=a                (n+=8)\n"
+    "\n"
+    "  (if state==0 (expect new code))\n"
+    "  a=r 1 a== 0 if (match code mm,mmm)\n"
+    "    a= 1 r=a 2                 (len=1)\n"
+    "    a=c a&= 3 a> 0 if          (if (bits&3))\n"
+    "      a-- a<<= 3 r=a 3           (m=((bits&3)-1)*8)\n"
+    "      a=c a>>= 2 c=a             (bits>>=2)\n"
+    "      b=r 3 a&= 7 a+=b r=a 3     (m+=bits&7)\n"
+    "      a=c a>>= 3 c=a             (bits>>=3)\n"
+    "      a=d a-= 5 d=a              (n-=5)\n"
+    "      a= 1 r=a 1                 (state=1)\n"
+    "    else (literal, discard 00)\n"
+    "      a=c a>>= 2 c=a             (bits>>=2)\n"
+    "      d-- d--                    (n-=2)\n"
+    "      a= 3 r=a 1                 (state=3)\n"
+    "    endif\n"
+    "  endif\n"
+    "\n"
+    "  (while state==1 && n>=3 (expect match length n*4+ll -> r2))\n"
+    "  do a=r 1 a== 1 if a=d a> 2 if\n"
+    "    a=c a&= 1 a== 1 if         (if bits&1)\n"
+    "      a=c a>>= 1 c=a             (bits>>=1)\n"
+    "      b=r 2 a=c a&= 1 a+=b a+=b r=a 2 (len+=len+(bits&1))\n"
+    "      a=c a>>= 1 c=a             (bits>>=1)\n"
+    "      d-- d--                    (n-=2)\n"
+    "    else\n"
+    "      a=c a>>= 1 c=a             (bits>>=1)\n"
+    "      a=r 2 a<<= 2 b=a           (len<<=2)\n"
+    "      a=c a&= 3 a+=b r=a 2       (len+=bits&3)\n"
+    "      a=c a>>= 2 c=a             (bits>>=2)\n"
+    "      d-- d-- d--                (n-=3)\n";
+    if (rb)
+      pcomp+="      a= 5 r=a 1                 (state=5)\n";
+    else
+      pcomp+="      a= 2 r=a 1                 (state=2)\n";
+    pcomp+=
+    "    endif\n"
+    "  forever endif endif\n"
+    "\n";
+    if (rb) pcomp+=  // save r in r5
+      "  (if state==5 && n>=8) (expect low bits of offset to put in r5)\n"
+      "  a=r 1 a== 5 if a=d a> "+itos(rb-1)+" if\n"
+      "    a=c a&= "+itos((1<<rb)-1)+" r=a 5            (save r in r5)\n"
+      "    a=c a>>= "+itos(rb)+" c=a\n"
+      "    a=d a-= "+itos(rb)+ " d=a\n"
+      "    a= 2 r=a 1                   (go to state 2)\n"
+      "  endif endif\n"
+      "\n";
+    pcomp+=
+    "  (if state==2 && n>=m) (expect m offset bits)\n"
+    "  a=r 1 a== 2 if a=r 3 a>d ifnot\n"
+    "    a=c r=a 6 a=d r=a 7          (save c=bits, d=n in r6,r7)\n"
+    "    b=r 3 a= 1 a<<=b d=a         (d=1<<m)\n"
+    "    a-- a&=c a+=d                (d=offset=bits&((1<<m)-1)|(1<<m))\n";
+    if (rb)
+      pcomp+=  // insert r into low bits of d
+      "    a<<= "+itos(rb)+" d=r 5 a+=d a-= "+itos((1<<rb)-1)+"\n";
+    pcomp+=
+    "    d=a b=r 4 a=b a-=d c=a       (c=p=(b=ptr)-offset)\n"
+    "\n"
+    "    (while len-- (copy and output match d bytes from *c to *b))\n"
+    "    d=r 2 do a=d a> 0 if d--\n"
+    "      a=*c *b=a c++ b++          (buf[ptr++]-buf[p++])\n";
+    if (!doe8) pcomp+=" out\n";
+    pcomp+=
+    "    forever endif\n"
+    "    a=b r=a 4\n"
+    "\n"
+    "    a=r 6 b=r 3 a>>=b c=a        (bits>>=m)\n"
+    "    a=r 7 a-=b d=a               (n-=m)\n"
+    "    a=0 r=a 1                    (state=0)\n"
+    "  endif endif\n"
+    "\n"
+    "  (while state==3 && n>=2 (expect literal length))\n"
+    "  do a=r 1 a== 3 if a=d a> 1 if\n"
+    "    a=c a&= 1 a== 1 if         (if bits&1)\n"
+    "      a=c a>>= 1 c=a              (bits>>=1)\n"
+    "      b=r 2 a&= 1 a+=b a+=b r=a 2 (len+=len+(bits&1))\n"
+    "      a=c a>>= 1 c=a              (bits>>=1)\n"
+    "      d-- d--                     (n-=2)\n"
+    "    else\n"
+    "      a=c a>>= 1 c=a              (bits>>=1)\n"
+    "      d--                         (--n)\n"
+    "      a= 4 r=a 1                  (state=4)\n"
+    "    endif\n"
+    "  forever endif endif\n"
+    "\n"
+    "  (if state==4 && n>=8 (expect len literals))\n"
+    "  a=r 1 a== 4 if a=d a> 7 if\n"
+    "    b=r 4 a=c *b=a\n";
+    if (!doe8) pcomp+=" out\n";
+    pcomp+=
+    "    b++ a=b r=a 4                 (buf[ptr++]=bits)\n"
+    "    a=c a>>= 8 c=a                (bits>>=8)\n"
+    "    a=d a-= 8 d=a                 (n-=8)\n"
+    "    a=r 2 a-- r=a 2 a== 0 if      (if --len<1)\n"
+    "      a=0 r=a 1                     (state=0)\n"
+    "    endif\n"
+    "  endif endif\n"
+    "  halt\n"
+    "end\n";
+  }
+
+  // Byte aligned LZ77, with or without E8E9
+  else if (level==2) {
+    hdr="comp 9 16 0 $1+20 ";
+    pcomp=
+    "pcomp lzpre c ;\n"
+    "  (Decode LZ77: d=state, M=output buffer, b=size)\n"
+    "  a> 255 if (at EOF decode e8e9 and output)\n";
+    if (doe8)
+      pcomp+=
+      "    d=b b=0 do (for b=0..d-1, d = end of buf)\n"
+      "      a=b a==d ifnot\n"
+      "        a+= 4 a<d if\n"
+      "          a=*b a&= 254 a== 232 if (e8 or e9?)\n"
+      "            c=b b++ b++ b++ b++ a=*b a++ a&= 254 a== 0 if (00 or ff)\n"
+      "              b-- a=*b\n"
+      "              b-- a<<= 8 a+=*b\n"
+      "              b-- a<<= 8 a+=*b\n"
+      "              a-=b a++\n"
+      "              *b=a a>>= 8 b++\n"
+      "              *b=a a>>= 8 b++\n"
+      "              *b=a b++\n"
+      "            endif\n"
+      "            b=c\n"
+      "          endif\n"
+      "        endif\n"
+      "        a=*b out b++\n"
+      "      forever\n"
+      "    endif\n";
+    pcomp+=
+    "    b=0 c=0 d=0 a=0 r=a 1 r=a 2 (reset state)\n"
+    "  halt\n"
+    "  endif\n"
+    "\n"
+    "  (in state d==0, expect a new code)\n"
+    "  (put length in r1 and inital part of offset in r2)\n"
+    "  c=a a=d a== 0 if\n"
+    "    a=c a>>= 6 a++ d=a\n"
+    "    a== 1 if (literal?)\n"
+    "      a+=c r=a 1 a=0 r=a 2\n"
+    "    else (3 to 5 byte match)\n"
+    "      d++ a=c a&= 63 a+= $3 r=a 1 a=0 r=a 2\n"
+    "    endif\n"
+    "  else\n"
+    "    a== 1 if (writing literal)\n"
+    "      a=c *b=a b++\n";
+    if (!doe8) pcomp+=" out\n";
+    pcomp+=
+    "      a=r 1 a-- a== 0 if d=0 endif r=a 1 (if (--len==0) state=0)\n"
+    "    else\n"
+    "      a> 2 if (reading offset)\n"
+    "        a=r 2 a<<= 8 a|=c r=a 2 d-- (off=off<<8|c, --state)\n"
+    "      else (state==2, write match)\n"
+    "        a=r 2 a<<= 8 a|=c c=a a=b a-=c a-- c=a (c=i-off-1)\n"
+    "        d=r 1 (d=len)\n"
+    "        do (copy and output d=len bytes)\n"
+    "          a=*c *b=a c++ b++\n";
+    if (!doe8) pcomp+=" out\n";
+    pcomp+=
+    "        d-- a=d a> 0 while\n"
+    "        (d=state=0. off, len don\'t matter)\n"
+    "      endif\n"
+    "    endif\n"
+    "  endif\n"
+    "  halt\n"
+    "end\n";
+  }
+
+  // BWT with or without E8E9
+  else if (level==3) {  // IBWT
+    hdr="comp 9 16 $1+20 $1+20 ";  // 2^$1 = block size in MB
+    pcomp=
+    "pcomp bwtrle c ;\n"
+    "\n"
+    "  (read BWT, index into M, size in b)\n"
+    "  a> 255 ifnot\n"
+    "    *b=a b++\n"
+    "\n"
+    "  (inverse BWT)\n"
+    "  elsel\n"
+    "\n"
+    "    (index in last 4 bytes, put in c and R1)\n"
+    "    b-- a=*b\n"
+    "    b-- a<<= 8 a+=*b\n"
+    "    b-- a<<= 8 a+=*b\n"
+    "    b-- a<<= 8 a+=*b c=a r=a 1\n"
+    "\n"
+    "    (save size in R2)\n"
+    "    a=b r=a 2\n"
+    "\n"
+    "    (count bytes in H[~1..~255, ~0])\n"
+    "    do\n"
+    "      a=b a> 0 if\n"
+    "        b-- a=*b a++ a&= 255 d=a d! *d++\n"
+    "      forever\n"
+    "    endif\n"
+    "\n"
+    "    (cumulative counts: H[~i=0..255] = count of bytes before i)\n"
+    "    d=0 d! *d= 1 a=0\n"
+    "    do\n"
+    "      a+=*d *d=a d--\n"
+    "    d<>a a! a> 255 a! d<>a until\n"
+    "\n"
+    "    (build first part of linked list in H[0..idx-1])\n"
+    "    b=0 do\n"
+    "      a=c a>b if\n"
+    "        d=*b d! *d++ d=*d d-- *d=b\n"
+    "      b++ forever\n"
+    "    endif\n"
+    "\n"
+    "    (rest of list in H[idx+1..n-1])\n"
+    "    b=c b++ c=r 2 do\n"
+    "      a=c a>b if\n"
+    "        d=*b d! *d++ d=*d d-- *d=b\n"
+    "      b++ forever\n"
+    "    endif\n"
+    "\n";
+    if (args[0]<=4) {  // faster IBWT list traversal limited to 16 MB blocks
+      pcomp+=
+      "    (copy M to low 8 bits of H to reduce cache misses in next loop)\n"
+      "    b=0 do\n"
+      "      a=c a>b if\n"
+      "        d=b a=*d a<<= 8 a+=*b *d=a\n"
+      "      b++ forever\n"
+      "    endif\n"
+      "\n"
+      "    (traverse list and output or copy to M)\n"
+      "    d=r 1 b=0 do\n"
+      "      a=d a== 0 ifnot\n"
+      "        a=*d a>>= 8 d=a\n";
+      if (doe8) pcomp+=" *b=*d b++\n";
+      else      pcomp+=" a=*d out\n";
+      pcomp+=
+      "      forever\n"
+      "    endif\n"
+      "\n";
+      if (doe8)  // IBWT+E8E9
+        pcomp+=
+        "    (e8e9 transform to out)\n"
+        "    d=b b=0 do (for b=0..d-1, d = end of buf)\n"
+        "      a=b a==d ifnot\n"
+        "        a+= 4 a<d if\n"
+        "          a=*b a&= 254 a== 232 if\n"
+        "            c=b b++ b++ b++ b++ a=*b a++ a&= 254 a== 0 if\n"
+        "              b-- a=*b\n"
+        "              b-- a<<= 8 a+=*b\n"
+        "              b-- a<<= 8 a+=*b\n"
+        "              a-=b a++\n"
+        "              *b=a a>>= 8 b++\n"
+        "              *b=a a>>= 8 b++\n"
+        "              *b=a b++\n"
+        "            endif\n"
+        "            b=c\n"
+        "          endif\n"
+        "        endif\n"
+        "        a=*b out b++\n"
+        "      forever\n"
+        "    endif\n";
+      pcomp+=
+      "  endif\n"
+      "  halt\n"
+      "end\n";
+    }
+    else {  // slower IBWT list traversal for all sized blocks
+      if (doe8) {  // E8E9 after IBWT
+        pcomp+=
+        "    (R2 = output size without EOS)\n"
+        "    a=r 2 a-- r=a 2\n"
+        "\n"
+        "    (traverse list (d = IBWT pointer) and output inverse e8e9)\n"
+        "    (C = offset = 0..R2-1)\n"
+        "    (R4 = last 4 bytes shifted in from MSB end)\n"
+        "    (R5 = temp pending output byte)\n"
+        "    c=0 d=r 1 do\n"
+        "      a=d a== 0 ifnot\n"
+        "        d=*d\n"
+        "\n"
+        "        (store byte in R4 and shift out to R5)\n"
+        "        b=d a=*b a<<= 24 b=a\n"
+        "        a=r 4 r=a 5 a>>= 8 a|=b r=a 4\n"
+        "\n"
+        "        (if E8|E9 xx xx xx 00|FF in R4:R5 then subtract c from x)\n"
+        "        a=c a> 3 if\n"
+        "          a=r 5 a&= 254 a== 232 if\n"
+        "            a=r 4 a>>= 24 b=a a++ a&= 254 a< 2 if\n"
+        "              a=r 4 a-=c a+= 4 a<<= 8 a>>= 8 \n"
+        "              b<>a a<<= 24 a+=b r=a 4\n"
+        "            endif\n"
+        "          endif\n"
+        "        endif\n"
+        "\n"
+        "        (output buffered byte)\n"
+        "        a=c a> 3 if a=r 5 out endif c++\n"
+        "\n"
+        "      forever\n"
+        "    endif\n"
+        "\n"
+        "    (output up to 4 pending bytes in R4)\n"
+        "    b=r 4\n"
+        "    a=c a> 3 a=b if out endif a>>= 8 b=a\n"
+        "    a=c a> 2 a=b if out endif a>>= 8 b=a\n"
+        "    a=c a> 1 a=b if out endif a>>= 8 b=a\n"
+        "    a=c a> 0 a=b if out endif\n"
+        "\n"
+        "  endif\n"
+        "  halt\n"
+        "end\n";
+      }
+      else {
+        pcomp+=
+        "    (traverse list and output)\n"
+        "    d=r 1 do\n"
+        "      a=d a== 0 ifnot\n"
+        "        d=*d\n"
+        "        b=d a=*b out\n"
+        "      forever\n"
+        "    endif\n"
+        "  endif\n"
+        "  halt\n"
+        "end\n";
+      }
+    }
+  }
+
+  // E8E9 or no preprocessing
+  else if (level==0) {
+    hdr="comp 9 16 0 0 ";
+    if (doe8) { // E8E9?
+      pcomp=
+      "pcomp e8e9 d ;\n"
+      "  a> 255 if\n"
+      "    a=c a> 4 if\n"
+      "      c= 4\n"
+      "    else\n"
+      "      a! a+= 5 a<<= 3 d=a a=b a>>=d b=a\n"
+      "    endif\n"
+      "    do a=c a> 0 if\n"
+      "      a=b out a>>= 8 b=a c--\n"
+      "    forever endif\n"
+      "  else\n"
+      "    *b=b a<<= 24 d=a a=b a>>= 8 a+=d b=a c++\n"
+      "    a=c a> 4 if\n"
+      "      a=*b out\n"
+      "      a&= 254 a== 232 if\n"
+      "        a=b a>>= 24 a++ a&= 254 a== 0 if\n"
+      "          a=b a>>= 24 a<<= 24 d=a\n"
+      "          a=b a-=c a+= 5\n"
+      "          a<<= 8 a>>= 8 a|=d b=a\n"
+      "        endif\n"
+      "      endif\n"
+      "    endif\n"
+      "  endif\n"
+      "  halt\n"
+      "end\n";
+    }
+    else
+      pcomp="end\n";
+  }
+  else
+    error("Unsupported method");
+  
+  // Build context model (comp, hcomp) assuming:
+  // H[0..254] = contexts
+  // H[255..511] = location of last byte i-255
+  // M = last 64K bytes, filling backward
+  // C = pointer to most recent byte
+  // R1 = level 2 lz77 1+bytes expected until next code, 0=init
+  // R2 = level 2 lz77 first byte of code
+  int ncomp=0;  // number of components
+  const int membits=args[0]+20;
+  int sb=5;  // bits in last context
+  std::string comp;
+  std::string hcomp="hcomp\n"
+    "c-- *c=a a+= 255 d=a *d=c\n";
+  if (level==2) {  // put level 2 lz77 parse state in R1, R2
+    hcomp+=
+    "  (decode lz77 into M. Codes:\n"
+    "  00xxxxxx = literal length xxxxxx+1\n"
+    "  xx......, xx > 0 = match with xx offset bytes to follow)\n"
+    "\n"
+    "  a=r 1 a== 0 if (init)\n"
+    "    a= "+itos(111+57*doe8)+" (skip post code)\n"
+    "  else a== 1 if  (new code?)\n"
+    "    a=*c r=a 2  (save code in R2)\n"
+    "    a> 63 if a>>= 6 a++ a++  (match)\n"
+    "    else a++ a++ endif  (literal)\n"
+    "  else (read rest of code)\n"
+    "    a--\n"
+    "  endif endif\n"
+    "  r=a 1  (R1 = 1+expected bytes to next code)\n";
+  }
+
+  // Generate the context model
+  while (*method && ncomp<254) {
+
+    // parse command C[N1[,N2]...] into v = {C, N1, N2...}
+    std::vector<int> v;
+    v.push_back(*method++);
+    if (isdigit(*method)) {
+      v.push_back(*method++-'0');
+      while (isdigit(*method) || *method==',' || *method=='.') {
+        if (isdigit(*method))
+          v.back()=v.back()*10+*method++-'0';
+        else {
+          v.push_back(0);
+          ++method;
+        }
+      }
+    }
+
+    // c: context model
+    // N1%1000: 0=ICM 1..256=CM limit N1-1
+    // N1/1000: number of times to halve memory
+    // N2: 1..255=offset mod N2. 1000..1255=distance to N2-1000
+    // N3...: 0..255=byte mask + 256=lz77 state. 1000+=run of N3-1000 zeros.
+    if (v[0]=='c') {
+      while (v.size()<3) v.push_back(0);
+      comp+=itos(ncomp)+" ";
+      sb=11;  // count context bits
+      if (v[2]<256) sb+=lg(v[2]);
+      else sb+=6;
+      for (unsigned i=3; i<v.size(); ++i)
+        if (v[i]<512) sb+=nbits(v[i])*3/4;
+      if (sb>membits) sb=membits;
+      if (v[1]%1000==0) comp+="icm "+itos(sb-6-v[1]/1000)+"\n";
+      else comp+="cm "+itos(sb-2-v[1]/1000)+" "+itos(v[1]%1000-1)+"\n";
+
+      // special contexts
+      hcomp+="d= "+itos(ncomp)+" *d=0\n";
+      if (v[2]>1 && v[2]<=255) {  // periodic context
+        if (lg(v[2])!=lg(v[2]-1))
+          hcomp+="a=c a&= "+itos(v[2]-1)+" hashd\n";
+        else
+          hcomp+="a=c a%= "+itos(v[2])+" hashd\n";
+      }
+      else if (v[2]>=1000 && v[2]<=1255)  // distance context
+        hcomp+="a= 255 a+= "+itos(v[2]-1000)+
+               " d=a a=*d a-=c a> 255 if a= 255 endif d= "+
+               itos(ncomp)+" hashd\n";
+
+      // Masked context
+      for (unsigned i=3; i<v.size(); ++i) {
+        if (i==3) hcomp+="b=c ";
+        if (v[i]==255)
+          hcomp+="a=*b hashd\n";  // ordinary byte
+        else if (v[i]>0 && v[i]<255)
+          hcomp+="a=*b a&= "+itos(v[i])+" hashd\n";  // masked byte
+        else if (v[i]>=256 && v[i]<512) { // lz77 state or masked literal byte
+          hcomp+=
+          "a=r 1 a> 1 if\n"  // expect literal or offset
+          "  a=r 2 a< 64 if\n"  // expect literal
+          "    a=*b ";
+          if (v[i]<511) hcomp+="a&= "+itos(v[i]-256);
+          hcomp+=" hashd\n"
+          "  else\n"  // expect match offset byte
+          "    a>>= 6 hashd a=r 1 hashd\n"
+          "  endif\n"
+          "else\n"  // expect new code
+          "  a= 255 hashd a=r 2 hashd\n"
+          "endif\n";
+        }
+        else if (v[i]>=1256)  // skip v[i]-1000 bytes
+          hcomp+="a= "+itos(((v[i]-1000)>>8)&255)+" a<<= 8 a+= "
+               +itos((v[i]-1000)&255)+
+          " a+=b b=a\n";
+        else if (v[i]>1000)
+          hcomp+="a= "+itos(v[i]-1000)+" a+=b b=a\n";
+        if (v[i]<512 && i<v.size()-1)
+          hcomp+="b++ ";
+      }
+      ++ncomp;
+    }
+
+    // m,8,24: MIX, size, rate
+    // t,8,24: MIX2, size, rate
+    // s,8,32,255: SSE, size, start, limit
+    if (strchr("mts", v[0]) && ncomp>int(v[0]=='t')) {
+      if (v.size()<=1) v.push_back(8);
+      if (v.size()<=2) v.push_back(24+8*(v[0]=='s'));
+      if (v[0]=='s' && v.size()<=3) v.push_back(255);
+      comp+=itos(ncomp);
+      sb=5+v[1]*3/4;
+      if (v[0]=='m')
+        comp+=" mix "+itos(v[1])+" 0 "+itos(ncomp)+" "+itos(v[2])+" 255\n";
+      else if (v[0]=='t')
+        comp+=" mix2 "+itos(v[1])+" "+itos(ncomp-1)+" "+itos(ncomp-2)
+            +" "+itos(v[2])+" 255\n";
+      else // s
+        comp+=" sse "+itos(v[1])+" "+itos(ncomp-1)+" "+itos(v[2])+" "
+            +itos(v[3])+"\n";
+      if (v[1]>8) {
+        hcomp+="d= "+itos(ncomp)+" *d=0 b=c a=0\n";
+        for (; v[1]>=16; v[1]-=8) {
+          hcomp+="a<<= 8 a+=*b";
+          if (v[1]>16) hcomp+=" b++";
+          hcomp+="\n";
+        }
+        if (v[1]>8)
+          hcomp+="a<<= 8 a+=*b a>>= "+itos(16-v[1])+"\n";
+        hcomp+="a<<= 8 *d=a\n";
+      }
+      ++ncomp;
+    }
+
+    // i: ISSE chain with order increasing by N1,N2...
+    if (v[0]=='i' && ncomp>0) {
+      assert(sb>=5);
+      hcomp+="d= "+itos(ncomp-1)+" b=c a=*d d++\n";
+      for (unsigned i=1; i<v.size() && ncomp<254; ++i) {
+        for (int j=0; j<v[i]%10; ++j) {
+          hcomp+="hash ";
+          if (i<v.size()-1 || j<v[i]%10-1) hcomp+="b++ ";
+          sb+=6;
+        }
+        hcomp+="*d=a";
+        if (i<v.size()-1) hcomp+=" d++";
+        hcomp+="\n";
+        if (sb>membits) sb=membits;
+        comp+=itos(ncomp)+" isse "+itos(sb-6-v[i]/10)+" "+itos(ncomp-1)+"\n";
+        ++ncomp;
+      }
+    }
+
+    // a24,0,0: MATCH. N1=hash multiplier. N2,N3=halve buf, table.
+    if (v[0]=='a') {
+      if (v.size()<=1) v.push_back(24);
+      while (v.size()<4) v.push_back(0);
+      comp+=itos(ncomp)+" match "+itos(membits-v[3]-2)+" "
+          +itos(membits-v[2])+"\n";
+      hcomp+="d= "+itos(ncomp)+" a=*d a*= "+itos(v[1])
+           +" a+=*c a++ *d=a\n";
+      sb=5+(membits-v[2])*3/4;
+      ++ncomp;
+    }
+
+    // w1,65,26,223,20,0: ICM-ISSE chain of length N1 with word contexts,
+    // where a word is a sequence of c such that c&N4 is in N2..N2+N3-1.
+    // Word is hashed by: hash := hash*N5+c+1
+    // Decrease memory by 2^-N6.
+    if (v[0]=='w') {
+      if (v.size()<=1) v.push_back(1);
+      if (v.size()<=2) v.push_back(65);
+      if (v.size()<=3) v.push_back(26);
+      if (v.size()<=4) v.push_back(223);
+      if (v.size()<=5) v.push_back(20);
+      if (v.size()<=6) v.push_back(0);
+      comp+=itos(ncomp)+" icm "+itos(membits-6-v[6])+"\n";
+      for (int i=1; i<v[1]; ++i)
+        comp+=itos(ncomp+i)+" isse "+itos(membits-6-v[6])+" "
+            +itos(ncomp+i-1)+"\n";
+      hcomp+="a=*c a&= "+itos(v[4])+" a-= "+itos(v[2])+" a&= 255 a< "
+           +itos(v[3])+" if\n";
+      for (int i=0; i<v[1]; ++i) {
+        if (i==0) hcomp+="  d= "+itos(ncomp);
+        else hcomp+="  d++";
+        hcomp+=" a=*d a*= "+itos(v[5])+" a+=*c a++ *d=a\n";
+      }
+      hcomp+="else\n";
+      for (int i=v[1]-1; i>0; --i)
+        hcomp+="  d= "+itos(ncomp+i-1)+" a=*d d++ *d=a\n";
+      hcomp+="  d= "+itos(ncomp)+" *d=0\n"
+           "endif\n";
+      ncomp+=v[1]-1;
+      sb=membits-v[6];
+      ++ncomp;
+    }
+  }
+  return hdr+itos(ncomp)+"\n"+comp+hcomp+"halt\n"+pcomp;
+}
+
+// Compress from in to out in 1 segment in 1 block using the algorithm
+// descried in method. If method begins with a digit then choose
+// a method depending on type. Save filename and comment
+// in the segment header. If comment is 0 then the default is the input size
+// as a decimal string, plus " jDC\x01" for a journaling method (method[0]
+// is not 's'). Write the generated method to methodOut if not 0.
+void compressBlock(StringBuffer* in, Writer* out, const char* method_,
+                   const char* filename, const char* comment, bool dosha1) {
+  assert(in);
+  assert(out);
+  assert(method_);
+  assert(method_[0]);
+  std::string method=method_;
+  const unsigned n=in->size();  // input size
+  const int arg0=MAX(lg(n+4095)-20, 0);  // block size
+  assert((1u<<(arg0+20))>=n+4096);
+
+  // Get type from method "LB,R,t" where L is level 0..5, B is block
+  // size 0..11, R is redundancy 0..255, t = 0..3 = binary, text, exe, both.
+  unsigned type=0;
+  if (isdigit(method[0])) {
+    int commas=0, arg[4]={0};
+    for (int i=1; i<int(method.size()) && commas<4; ++i) {
+      if (method[i]==',' || method[i]=='.') ++commas;
+      else if (isdigit(method[i])) arg[commas]=arg[commas]*10+method[i]-'0';
+    }
+    if (commas==0) type=512;
+    else type=arg[1]*4+arg[2];
+  }
+
+  // Get hash of input
+  libzpaq::SHA1 sha1;
+  const char* sha1ptr=0;
+#ifdef DEBUG
+  if (true) {
+#else
+  if (dosha1) {
+#endif
+    sha1.write(in->c_str(), n);
+    sha1ptr=sha1.result();
+  }
+
+  // Expand default methods
+  if (isdigit(method[0])) {
+    const int level=method[0]-'0';
+    assert(level>=0 && level<=9);
+
+    // build models
+    const int doe8=(type&2)*2;
+    method="x"+itos(arg0);
+    std::string htsz=","+itos(19+arg0+(arg0<=6));  // lz77 hash table size
+    std::string sasz=","+itos(21+arg0);            // lz77 suffix array size
+
+    // store uncompressed
+    if (level==0)
+      method="0"+itos(arg0)+",0";
+
+    // LZ77, no model. Store if hard to compress
+    else if (level==1) {
+      if (type<40) method+=",0";
+      else {
+        method+=","+itos(1+doe8)+",";
+        if      (type<80)  method+="4,0,1,15";
+        else if (type<128) method+="4,0,2,16";
+        else if (type<256) method+="4,0,2"+htsz;
+        else if (type<960) method+="5,0,3"+htsz;
+        else               method+="6,0,3"+htsz;
+      }
+    }
+
+    // LZ77 with longer search
+    else if (level==2) {
+      if (type<32) method+=",0";
+      else {
+        method+=","+itos(1+doe8)+",";
+        if (type<64) method+="4,0,3"+htsz;
+        else method+="4,0,7"+sasz+",1";
+      }
+    }
+
+    // LZ77 with CM depending on redundancy
+    else if (level==3) {
+      if (type<20)  // store if not compressible
+        method+=",0";
+      else if (type<48)  // fast LZ77 if barely compressible
+        method+=","+itos(1+doe8)+",4,0,3"+htsz;
+      else if (type>=640 || (type&1))  // BWT if text or highly compressible
+        method+=","+itos(3+doe8)+"ci1";
+      else  // LZ77 with O0-1 compression of up to 12 literals
+        method+=","+itos(2+doe8)+",12,0,7"+sasz+",1c0,0,511i2";
+    }
+
+    // LZ77+CM, fast CM, or BWT depending on type
+    else if (level==4) {
+      if (type<12)
+        method+=",0";
+      else if (type<24)
+        method+=","+itos(1+doe8)+",4,0,3"+htsz;
+      else if (type<48)
+        method+=","+itos(2+doe8)+",5,0,7"+sasz+"1c0,0,511";
+      else if (type<900) {
+        method+=","+itos(doe8)+"ci1,1,1,1,2a";
+        if (type&1) method+="w";
+        method+="m";
+      }
+      else
+        method+=","+itos(3+doe8)+"ci1";
+    }
+
+    // Slow CM with lots of models
+    else {  // 5..9
+
+      // Model text files
+      method+=","+itos(doe8);
+      if (type&1) method+="w2c0,1010,255i1";
+      else method+="w1i1";
+      method+="c256ci1,1,1,1,1,1,2a";
+
+      // Analyze the data
+      const int NR=1<<12;
+      int pt[256]={0};  // position of last occurrence
+      int r[NR]={0};    // count repetition gaps of length r
+      const unsigned char* p=in->data();
+      if (level>0) {
+        for (unsigned i=0; i<n; ++i) {
+          const int k=i-pt[p[i]];
+          if (k>0 && k<NR) ++r[k];
+          pt[p[i]]=i;
+        }
+      }
+
+      // Add periodic models
+      int n1=n-r[1]-r[2]-r[3];
+      for (int i=0; i<2; ++i) {
+        int period=0;
+        double score=0;
+        int t=0;
+        for (int j=5; j<NR && t<n1; ++j) {
+          const double s=r[j]/(256.0+n1-t);
+          if (s>score) score=s, period=j;
+          t+=r[j];
+        }
+        if (period>4 && score>0.1) {
+          method+="c0,0,"+itos(999+period)+",255i1";
+          if (period<=255)
+            method+="c0,"+itos(period)+"i1";
+          n1-=r[period];
+          r[period]=0;
+        }
+        else
+          break;
+      }
+      method+="c0,2,0,255i1c0,3,0,0,255i1c0,4,0,0,0,255i1mm16ts19t0";
+    }
+  }
+
+  // Compress
+  std::string config;
+  int args[9]={0};
+  config=makeConfig(method.c_str(), args);
+  assert(n<=(0x100000u<<args[0])-4096);
+  libzpaq::Compressor co;
+  co.setOutput(out);
+#ifdef DEBUG
+  co.setVerify(true);
+#endif
+  StringBuffer pcomp_cmd;
+  co.writeTag();
+  co.startBlock(config.c_str(), args, &pcomp_cmd);
+  std::string cs=itos(n);
+  if (comment) cs=cs+" "+comment;
+  co.startSegment(filename, cs.c_str());
+  if (args[1]>=1 && args[1]<=7 && args[1]!=4) {  // LZ77 or BWT
+    LZBuffer lz(*in, args);
+    co.setInput(&lz);
+    co.compress();
+  }
+  else {  // compress with e8e9 or no preprocessing
+    if (args[1]>=4 && args[1]<=7)
+      e8e9(in->data(), in->size());
+    co.setInput(in);
+    co.compress();
+  }
+#ifdef DEBUG  // verify pre-post processing are inverses
+  int64_t outsize;
+  const char* sha1result=co.endSegmentChecksum(&outsize, dosha1);
+  assert(sha1result);
+  assert(sha1ptr);
+  if (memcmp(sha1result, sha1ptr, 20)!=0)
+    error("Pre/post-processor test failed");
+#else
+  co.endSegment(sha1ptr);
+#endif
+  co.endBlock();
+}
+
 }  // end namespace libzpaq
diff --git a/libzpaq/libzpaq.h b/libzpaq/libzpaq.h
index cbe211d..ff8cad8 100644
--- a/libzpaq/libzpaq.h
+++ b/libzpaq/libzpaq.h
@@ -1,27 +1,834 @@
-/* libzpaq.h - LIBZPAQ Version 5.00.
+/* libzpaq.h - LIBZPAQ Version 7.12 header - Apr. 19, 2016.
 
-  Copyright (C) 2011, Dell Inc. Written by Matt Mahoney.
+  This software is provided as-is, with no warranty.
+  I, Matt Mahoney, release this software into
+  the public domain.   This applies worldwide.
+  In some countries this may not be legally possible; if so:
+  I grant anyone the right to use this software for any purpose,
+  without any conditions, unless such conditions are required by law.
 
-  Permission is hereby granted, free of charge, to any person obtaining a copy
-  of this software and associated documentation files (the "Software"), to deal
-  in the Software without restriction, including without limitation the rights
-  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-  copies of the Software, and to permit persons to whom the Software is
-  furnished to do so without restriction.
-  This Software is provided "as is" without warranty.
+LIBZPAQ is a C++ library providing data compression and decompression
+services using the ZPAQ level 2 format as described in
+http://mattmahoney.net/zpaq/
 
-LIBZPAQ is a C++ library for compression and decompression of data
-conforming to the ZPAQ level 2 standard. See http://mattmahoney.net/zpaq/
+An application wishing to use these services should #include "libzpaq.h"
+and link to libzpaq.cpp (and advapi32.lib in Windows/VC++).
+libzpaq recognizes the following options:
 
-By default, LIBZPAQ uses JIT (just in time) acceleration. This only
-works on x86-32 and x86-64 processors that support the SSE2 instruction
-set. To disable JIT, compile with -DNOJIT. To enable run time checks,
-compile with -DDEBUG. Both options will decrease speed.
+  -DDEBUG   Turn on assertion checks (slower).
+  -DNOJIT   Don't assume x86-32 or x86-64 with SSE2 (slower).
+  -Dunix    Without -DNOJIT, assume Unix (Linux, Mac) rather than Windows.
 
-The decompression code, when compiled with -DDEBUG and -DNOJIT,
-comprises the reference decoder for the ZPAQ level 2 standard.
+The application must provide an error handling function and derived
+implementations of two abstract classes, Reader and Writer,
+specifying the input and output byte streams. For example, to compress
+from stdin to stdout (assuming binary I/O as in Linux):
+
+  #include "libzpaq.h"
+  #include <stdio.h>
+  #include <stdlib.h>
+
+  void libzpaq::error(const char* msg) {  // print message and exit
+    fprintf(stderr, "Oops: %s\n", msg);
+    exit(1);
+  }
+
+  class In: public libzpaq::Reader {
+  public:
+    int get() {return getchar();}  // returns byte 0..255 or -1 at EOF
+  } in;
+
+  class Out: public libzpaq::Writer {
+  public:
+    void put(int c) {putchar(c);}  // writes 1 byte 0..255
+  } out;
+
+  int main() {
+    libzpaq::compress(&in, &out, "1");  // "0".."5" = faster..better
+  }
+
+Or to decompress:
+
+    libzpaq::decompress(&in, &out);
+
+The function error() will be called with an English language message
+in case of an unrecoverable error such as badly formatted compressed
+input data or running out of memory. error() should not return.
+In a multi-threaded application where ZPAQ blocks are being decompressed
+in separate threads, error() should exit the thread, but other threads
+may continue. Blocks are independent and libzpaq is thread safe.
+
+Reader and Writer provide default implementations of read() and write()
+for block I/O. You may override these with your own versions, which
+might be faster. The default is to call get() or put() the appropriate
+number of times. For example:
+
+  // Read n bytes into buf[0..n-1] or to EOF, whichever is first.
+  // Return the number of bytes actually read.
+  int In::read(char* buf, int n) {return fread(buf, 1, n, stdin);}
+
+  // Write buf[0..n-1]
+  void Out::write(char* buf, int n) {fwrite(buf, 1, n, stdout);}
+
+By default, compress() divides the input into blocks with one segment
+each. The segment filename field is empty. The comment field of each
+block is the uncompressed size as a decimal string. The checksum
+is saved. To override:
+
+  compress(&in, &out, "1", "filename", "comment", false);
+
+If the filename is not NULL then it is saved in the first block only.
+If the comment is not NULL then a space and the comment are appended
+to the decimal size in the first block only. The comment would normally
+be the date and attributes like "20141231235959 w32", or "jDC\x01" for
+a journaling archive as described in the ZPAQ specification.
+
+The method string has the general form of a concatenation of single
+character commands each possibly followed by a list of decimal
+numeric arguments separated by commas or periods:
+
+  {012345xciawmst}[N1[{.,}N2]...]...
+
+For example "1" or "14,128,0" or "x6.3ci1m".
+
+Only the first command can be a digit 0..5. If it is, then it selects
+a compression level and the other commands are ignored. Otherwise,
+if it is "x" then the arguments and remaining commands describe
+the compression method. Any other letter as the first command is
+interpreted the same as "x". 
+
+Higher compression levels are slower but compress better. "1" is
+good for most purposes. "0" does not compress. "2" compresses slower
+but decompression is just as fast as 1. "3", "4", and "5" also
+decompress slower. The numeric arguments are as follows:
+
+  N1: 0..11 = block size of at most 2^N1 MiB - 4096 bytes (default 4).
+  N2: 0..255 = estimated ease of compression (default 128).
+  N3: 0..3 = data type. 1 = text, 2 = exe, 3 = both (default 0).
+
+For example, "14" or "54" divide the input in 16 MB blocks which
+are compressed independently. N2 and N3 are hints to the compressor
+based on analysis of the input data. N2 is 0 if the data is random
+or 255 if the data is easily compressed (for example, all zero bytes).
+Most compression methods will simply store random data with no
+compression. The default is "14,128,0".
+
+If the first command is "x" then the string describes the exact
+compression method. The arguments to "x" describe the pre/post
+processing (LZ77, BWT, E8E9), and remaining commands describe the
+context model, if any, of the transformed data. The arguments to "x" are:
+
+  N1: 0..11 = block size as before.
+  N2: 0..7: 0=none, 1=packed LZ77, 2=LZ77, 3=BWT, 4..7 = 0..3 + E8E9.
+  N3: 4..63: LZ77 min match.
+  N4: LZ77 secondary match to try first or 0 to skip.
+  N5: LZ77 log search depth.
+  N6: LZ77 log hash table size, or N1+21 to use a suffix array.
+  N7: LZ77 lookahead.
+
+N2 selects the basic transform applied before context modeling.
+N2 = 0 does not transform the input. N2 = 1 selects LZ77 encoding
+of literals strings and matches using bit-packed codes. It is normally
+not used with a context model. N2 = 2 selects byte aligned LZ77, which
+compresses worse by itself but better than 1 when a context model is
+used. It uses single bytes to encode either a literal of length 1..64
+or a match of length N3..N3+63 with a 2, 3, or 4 byte offset.
+
+N2 = 3 selects a Burrows-Wheeler transform, in which the input is
+sorted by right-context. This does not compress by itself but makes
+the data more compressible using a low order, adaptive context model.
+BWT requires 4 times the block size in additional memory for both
+compression and decompression.
+
+N2 = 4..7 are the same as 0..3 except that a E8E9 transform is first applied
+to improve the compression of x86 code usually found .exe and .dll files.
+It scans the input block backward for 5 byte strings of the form
+{E8|E9 xx xx xx 00|FF} and adds the offset from the start of the
+block to the middle 3 bytes interpreted as a little-endian (LSB first)
+number (mod 2^24). E8 and E9 are the CALL and JMP instructions, followed
+by a 32 bit relative offset.
+
+N3..N7 apply only to LZ77. For either type, it searches for matches
+by hashing the next N4 bytes, and then the next N3 bytes, and looking
+up each of the hashes at 2^N5 locations in a table with 2^N6 entries.
+Of those, it picks the longest match, or closest in case of a tie.
+If no match is at least N3, then a literal is encoded instead. If N5
+is 0 then only one hash is computed, which is faster but does not
+compress as well. Typical good values for fast compression are
+"x4.1.5.0.3.22" which means 16 MiB blocks, packed LZ77, mininum match
+length 5, no secondary match, search depth 2^3 = 8, and 2^22 = 4M
+hash table (using 16 MiB memory).
+
+The hash table requires 4 x 2^N6 bytes of memory. If N6 = N1+21, then
+matches are found using a suffix array and inverse suffix array using
+2.25 x 2^N6 bytes (4.5 x block size). This finds better matches but
+takes longer to compute the suffix array (SA). The matches are found by
+searching forward and backward in the SA 2^N5 in each direction up
+to the first earlier match, and picking the longer of the two.
+Good values are "x4.1.4.0.8.25". The secondary match N4 has no effect.
+
+N7 is the lookahead. It looks for matches of length at least N4+N7
+when using a hash table or N3+N7 for a SA, but allows the first N7
+bytes not to match and be coded as literals if this results in
+a significantly longer match. Values higher than 1 are rarely effective.
+The default is 0.
+
+All subsequent commands after "x" describe a context model. A model
+consists of a set of components that output a bit prediction, taking
+a context and possibly earlier predictions as input. The final prediction
+is arithmetic coded. The component types are:
+
+  c = CM or ICM (context model or indirect context model).
+  i = ISSE chain (indirect secondary symbol estimator).
+  a = MATCH.
+  w = word model (ICM-ISSE chain with whole word contexts).
+  m = MIX.
+  s = SSE (secondary symbol estimator).
+  t = MIX2 (2 input MIX).
+
+For example, "x4.3ci1" describes a BWT followed by an order 0 CM
+and order 1 ISSE, which is used for level 3 text compression. The
+parameters to "c" (default all 0) are as follows:
+
+  N1: 0 = ICM, 1..256 CM with faster..slower adaptation, +1000 halves memory.
+  N2: 1..255 = offset mod N2, 1000..1255 = offset to last N2-1000 byte.
+  N3: 0..255 = order 0 context mask, 256..511 mixes LZ77 parse state.
+  N4...: 0..255 order 1... context masks. 1000... skips N4-1000 bytes.
+
+Most components use no more memory than the block size, depending on
+the number of context bits, but it is possible to select less memory
+and lose compression.
+
+A CM inputs a context hash and outputs a prediction from a table.
+The table entry is then updated by adjusting in the direction of the
+actual bit. The adjustment is 1/count, where the maximum count is 4 x N1.
+Larger values are best for stationary data. Smaller values adapt faster
+to changing data.
+
+If N1 is 0 then c selects an ICM. An ICM maps a context to a bit history
+(8 bit state), and then to slow adapting prediction. It is generally
+better than a CM on most nonstationary data.
+
+The context for a CM or ICM is a hash of all selected contexts: a
+cyclic counter (N2 = 1..255), the distance from the last occurrence
+of some byte value (N2 = 1000..1255), and the masked history of the
+last 64K bytes ANDED with N3, N4... For example, "c0.0.255.255.255" is
+an order 3 ICM. "C0.1010.255" is an order 1 context hashed together
+with the column number in a text file (distance to the last linefeed,
+ASCII 10). "c256.0.255.1511.255" is a stationary grayscale 512 byte
+wide image model using the two previous neighboring pixels as context.
+"c0.0.511.255" is an order 1 model for LZ77, which helps compress
+literal strings. The LZ77 state context applies only to byte aligned
+LZ77 (type 2 or 6).
+
+The parameters to "i" (ISSE chain) are the initial context length and
+subsequent increments for a chain connected to an existing earlier component.
+For example, "ci1.1.2" specifies an ICM (order 0) followed by a chain
+of 3 ISSE with orders 1, 2, and 4. An ISSE maps a context to a bit
+history like an ISSE, but uses the history to select a pair of weights
+to mix the input prediction with a constant 1, thus performing the
+mapping q' := w1 x q + w2 in the logistic domain (q = log p/(1-p)).
+The mixer is then updated by adjusting the weights to improve the
+prediction. High order ISSE chains (like "x4.0ci1.1.1.1.2") and BWT
+followed by a low order chain (like "x4.3ci1") both provide
+excellent general purpose compression.
+
+A MATCH ("a") keeps a rotating history buffer and a hash table to look
+up the previous occurrence of the current context hash and predicts
+whatever bit came next. The parameters are:
+
+  N1 = hash multiplier, default 24.
+  N2 = halve buffer size, default 0 = same size as input block.
+  N3 = halve hash table size, default 0 = block size / 4.
+
+For example, "x4.0m24.1.1" selects a 16 MiB block size, 8 MiB match
+buffer size, and 2M hash table size (using 8 MiB at 4 bytes per entry).
+The hash is computed as hash := hash x N1 + next_byte + 1 (mod hash table
+size). Thus, N1 = 12 selects a higher order context, and N1 = 48 selects a
+lower order.
+
+A word model ('w") is an ICM-ISSE chain of length N1 (orders 0..N1-1)
+in which the contexts are whole words. A word is defined as the set
+of characters in the range N2..N2+N3-1 after ANDing with N4. The context
+is hashed using multiplier N5. Memory is halved by N6. The default is
+"w1.65.26.223.20.0" which is a chain of length 1 (ICM only), where words
+are in range 65 ('A') to 65+26-1 ('Z') after ANDing with 223 (which
+converts to upper case). The hash multiplier is 20, which has the
+effect of shifting the high 2 bits out of the hash. The memory usage
+of each component is the same as the block size.
+
+A MIX ("m") performs the weighted average of all previous component
+predictions. The weights are then adjusted to improve the prediction
+by favoring the most accurate components. N1 selects the number of
+context bits (not hashed) to select a set of weights. N2 is the
+learning rate (around 16..32 works well). The default is "m8.24"
+which selects the previously modeled bits of the current byte as
+context. When N1 is not a multiple of 8, it selects the most significant
+bits of the oldest byte.
+
+A SSE ("s") adjusts the previous prediction like an ISSE, but uses
+a direct lookup table of the quantized and interpolated input prediction
+and a direct (not hashed) N1-bit context. The adjustment is 1/count where
+the count is allowed to range from N2 to 4 x N3. The default
+is "s8.32.255".
+
+A MIX2 ("t") is a MIX but mixing only the last 2 components. The
+default is "t8.24" where the meaning is the same as "m".
+
+For example, a good model for text is "x6.0ci1.1.1.1.2aw2mm16tst"
+which selects 2^6 = 64 MiB blocks, no preprocessing, an order 0 ICM,
+an ISSE chain with orders 1, 2, 3, 4, 6, a MATCH, an order 0-1 word
+ICM-ISSE chain, two mixers with 0 and 1 byte contexts, whose outputs are
+mixed by a MIX2. The MIX2 output is adjusted by a SSE, and finally
+the SSE input and outputs are mixed again for the final bit prediction.
+
+
+COMPRESSBLOCK
+
+CompressBlock() takes the same arguments as compress() except that
+the input is a StringBuffer instead of a Reader. The output is always
+a single block, regardless of the N1 (block size) argument in the method.
+
+  void compressBlock(StringBuffer* in, Writer* out, const char* method,
+                     const char* filename=0, const char* comment=0,
+                     bool compute_sha1=false);
+
+A StringBuffer is both a Reader and a Writer, but also allows random
+memory access. It provides convenient and efficient storage when the
+input size is unknown.
+
+  class StringBuffer: public libzpaq::Reader, public libzpaq::Writer {
+  public:
+    StringBuffer(size_t n=0);     // initial allocation after first use
+    ~StringBuffer();
+    int get();                    // read 1 byte or EOF from memory
+    int read(char* buf, int n);   // read n bytes
+    void put(int c);              // write 1 byte to memory
+    void write(const char* buf, int n);  // write n bytes
+    const char* c_str() const;    // read-only access to written data
+    unsigned char* data();        // read-write access
+    size_t size() const;          // number of bytes written
+    size_t remaining() const;     // number of bytes to read until EOF
+    void setLimit(size_t n);      // set maximum write size
+    void reset();                 // discard contents and free memory
+    void resize(size_t n);        // truncate to n bytes
+    void swap(StringBuffer& s);   // exchange contents efficiently
+  };
+
+The constructor sets the inital allocation size after the first
+write to n or 128, whichever is larger. Initially, no memory is allocated.
+The allocated size is always n x (2^k - 1), for example
+128 x (1, 3, 7, 15, 31...).
+
+put() and write() append 1 or n bytes, allocating memory as needed.
+buf can be NULL and the StringBuffer will be enlarged by n.
+get() and read() read 1 or up to n bytes. get() returns EOF if you
+attempt to read past the end of written data. read() returns less
+than n if it reaches EOF first, or 0 at EOF.
+
+size() is the number of bytes written, which does not change when
+data is read. remaining() is the number of bytes left to read
+before EOF.
+
+c_str() provides read-only access to the data. It is not NUL terminated.
+data() provides read-write access. Either may return NULL if size()
+is 0. write(), put(), reset(), swap(), and the destructor may
+invalidate saved pointers.
+
+setLimit() sets a maximum size. It will call error() if you try to
+write past it. The default is -1 or no limit.
+
+reset() sets the size to 0 and frees memory. resize() sets the size
+to n by moving the write pointer, but does not allocate or free memory.
+Moving the pointer forward does not overwrite the previous contents
+in between. The write pointer can be moved past the end of allocated
+memory, and the next put() or write() will allocate as needed. If the
+write pointer is moved back before the read pointer, then remaining()
+is set to 0.
+
+swap() swaps 2 StringBuffers efficiently, but does not change their
+initial allocations.
+
+
+DECOMPRESSER
+
+decompress() will decompress any valid ZPAQ stream, which may contain
+multiple blocks with multiple segments each. It will ignore filenames,
+comments, and checksums. You need the Decompresser class if you want to
+do something other than decompress all of the data serially to a single
+file. To decompress individual blocks and segments and retrieve the
+filenames, comments, data, and hashes of each segment (in exactly this
+order):
+
+  libzpaq::Decompresser d;               // to decompress
+  libzpaq::SHA1 sha1;                    // to verify output hashes
+  double memory;                         // bytes required to decompress
+  Out filename, comment;
+  char sha1out[21];
+  d.setInput(&in);
+  while (d.findBlock(&memory)) {         // default is NULL
+    while (d.findFilename(&filename)) {  // default is NULL
+      d.readComment(&comment);           // default is NULL
+      d.setOutput(&out);                 // if omitted or NULL, discard output
+      d.setSHA1(&sha1);                  // optional
+      while (d.decompress(1000));        // bytes to decode, default is all
+      d.readSegmentEnd(sha1out);         // {0} or {1,hash[20]}
+      if (sha1out[0]==1 && memcmp(sha1.result(), sha1out+1, 20))
+        error("checksum error");
+    }
+  }
+
+findBlock() scans the input for the next ZPAQ block and returns true
+if found. It optionally sets memory to the approximate number of bytes
+that it will allocate at the first call to decompress().
+
+findFilename() finds the next segment and returns false if there are
+no more in the current block. It optionally writes the saved filename.
+
+readComment() optionally writes the comment. It must be called
+after reading the filename and before decompressing.
+
+setSHA1() specifies an SHA1 object for computing a hash of the segment.
+It may be omitted if you do not want to compute a hash.
+
+decompress() decodes the requested number of bytes, postprocesses them,
+and writes them to out. For the 3 built in compression levels, this
+is the same as the number of bytes output, but it may be different if
+postprocessing was used. It returns true until there is no more data
+to decompress in the current segment. The default (-1) is to decompress the
+whole segment.
+
+readSegmentEnd() skips any remaining data not yet decompressed in the
+segment and writes 21 bytes, either a 0 if no hash was saved, 
+or a 1 followed by the 20 byte saved hash. If any data is skipped,
+then all data in the remaining segments in the current block must
+also be skipped.
+
+
+SHA1
+
+The SHA1 object computes SHA-1 cryptographic hashes. It is safe to
+assume that two inputs with the same hash are identical. For example:
+
+  libzpaq::SHA1 sha1;
+  int ch;
+  while ((ch=getchar())!=EOF)
+    sha1.put(ch);
+  printf("Size is %1.0f or %1.0f bytes\n", sha1.size(), double(sha1.usize()));
+
+size() returns the number of bytes read as a double, and usize() as a
+64 bit integer. result() returns a pointer to the 20 byte hash and
+resets the size to 0. The hash (not just the pointer) should be copied
+before the next call to result() if you want to save it. You can also
+call sha1.write(buffer, n) to hash n bytes of char* buffer.
+
+
+COMPRESSOR
+
+A Compressor object allows greater control over the compressed data.
+In particular you can specify the compression algorithm in ZPAQL to
+specify methods not possible using compress() or compressBlock(). You
+can create blocks with multiple segments specifying different files,
+or compress streams of unlimited size to a single block when the
+input size is not known.
+
+  libzpaq::Compressor c;
+  for (int i=0; i<num_blocks; ++i) {
+    c.setOutput(&out);              // if omitted or NULL, discard output
+    c.writeTag();                   // optional locator tag
+    c.startBlock(2);                // compression level 1, 2, or 3
+    for (int j=0; j<num_segments; ++j) {
+      c.startSegment("filename", "comment");  // default NULL = empty
+      c.setInput(&in);
+      while (c.compress(1000));     // bytes to compress, default -1 = all
+      c.endSegment(sha1.result());  // default NULL = don't save checksum
+    }
+    c.endBlock();
+  }
+
+Input and output can be set anywhere before the first input and output
+operation, respectively. Output may be changed any time.
+
+writeTag() outputs a 13 byte string that allows decompress() to scan
+for blocks that don't occur immediately, such as searching from the
+start of a self extracting archive.
+
+startBlock() specifies either a compression level (1, 2, 3), or a ZPAQL
+program, described below. It does not work with the fast method type
+arguments to compress() or compressBlock(). Levels 1, 2, and 3 correspond
+approximately to "3", "4", and "5". Any preprocessing must be done
+by the application before input to the Compressor.
+
+StartSegment() starts a segment. An empty or NULL filename continues
+the previous file. The comment normally contains the uncompressed size
+of the segment as a decimal string, but it is allowed to omit it.
+
+compress() will read the requested number of bytes or until in.get()
+returns EOF (-1), whichever comes first, and return true if there is
+more data to decompress. If the argument is omitted or -1, then it will
+read to EOF and return false.
+
+endSegment() writes a provided SHA-1 cryptographic hash checksum of the
+input segment before any preprocessing. It may be omitted.
+
+
+ZPAQL
+
+ZPAQ supports arbitrary compression algorithms in addition to the
+built in levels. For example, method "x4.0c0.0.255.255i4" compression could
+alternatively be specified using the ZPAQL language description of the
+compression algorithm:
+
+  int args[9]={0}
+  c.startBlock(
+    "(min.cfg - equivalent to level 1) "
+    "comp 1 2 0 0 2 (log array sizes hh,hm,ph,pm and number of components n) "
+    "  0 icm 16    (order 2 indirect context model using 4 MB memory) "
+    "  1 isse 19 0 (order 4 indirect secondary symbol estimator, 32 MB) "
+    "hcomp (context computation, input is last modeled byte in A) "
+    "  *b=a a=0 (save in rotating buffer M pointed to by B) "
+    "  d=0 hash b-- hash *d=a (put order 2 context hash in H[0] pointed by D)"
+    "  d++ b-- hash b-- hash *d=a (put order 4 context in H[1]) "
+    "  halt "
+    "end " (no pre/post processing) ",
+    args,     // Arguments $1 through $9 to ZPAQL code (unused, can be NULL)
+    &out);    // Writer* to write pcomp command (default is NULL)
+
+The first argument is a description of the compression algorithm in
+the ZPAQL language. It is compiled into byte code and saved in the
+archive block header so that the decompressor knows how read the data.
+A ZPAQL program accepts up to 9 numeric arguments, which should be
+passed in array.
+
+A decompression algorithm has two optional parts, a context mixing
+model and a postprocessor. The context model is identical for both
+the compressor and decompressor, so is used in both instances. The
+postprocessor, if present, is generally different, which presents
+the possibility that the user supplied code may not restore the
+original data exactly. It is assumed that the input has already been
+preprocessed by the application but that the hash supplied to endSegment()
+is of the original input before preprocessing. The following functions
+allow you to test the postprocesser during compression:
+
+  c.setVerify(true); // before c.compress(), may run slower
+  c.getSize();       // return 64 bit size of postprocessed output
+  c.getChecksum();   // after c.readSegmentEnd(), return hash, reset size to 0
+
+This example has no postprocessor, but if it did, then setVerify(true)
+would cause compress() to run the preprocessed input through the
+postprocessor and into a SHA1 in parallel with compression. Then,
+getChecksum() would return the hash which could be compared with
+the hash of the input computed by the application. Also,
+
+  int64_t size;
+  c.endSegmentChecksum(&size, true);
+
+instead of c.endSegment() will automatically add the computed checksum
+if setVerify is true and return the checksum, whether or not there
+is a postprocessor. If &size is not NULL then the segment size is written
+to size. If the second argument is false then the computed checksum is
+not saved to output. Default is true. If setVerify is false, then no
+checksum is saved and the function returns 0 with size not written.
+
+A context model consists of two parts, an array COMP of n components,
+and some code HCOMP that computes contexts for the components.
+The model compresses one bit at a time (MSB to LSB order) by computing
+a probability that the next bit will be a 1, and then arithmetic
+coding that bit. Better predictions compress smaller.
+
+If n is 0 then the data is uncompressed. Otherwise, there is an array
+of n = 1..255 components each taking a context and possibly the
+predictions of previous components as input and outputting a new
+prediction. The output of the last prediction is used to encode the
+bit. After encoding, the state of each component is updated to
+reduce the prediction error when the same context occurs again.
+Components are as follows. Most arguments have range 0...255.
+
+  CONST c          predict a 1 (c > 128) or 0 (c < 128).
+  CM s t           context model with 2^s contexts, learning rate 1/4t.
+  ICM s            indirect context model with 2^(s+6) contexts.
+  MATCH s b        match model with 2^s context hashes and 2^b history.
+  AVG j k wt       average components j and k with weight wt/256 for j.
+  MIX2 s j k r x   average j and k with 2^s contexts, rate r, mask x.
+  MIX  s j m r x   average j..j+m-1 with 2^s contexts, rate r, mask x.
+  ISSE s j         adjust prediction j using 2^(s+6) indirect contexts.
+  SSE s j t1 t2    adjust j using 2^s direct contexts, rate 1/t1..1/4t2.
+
+A CONST predicts a 1 with probability 1/(1+exp((128-c)/16)), i.e
+numbers near 0 or 255 are the most confident.
+  
+A CM maps a context to a prediction and a count. It is updated by
+adjusting the prediction to reduce the error by 1/count and incrementing
+the count up to 4t.
+
+A ICM maps a s+10 bit context hash to a bit history (8 bit state)
+representing a bounded count of zeros and ones previously seen in the
+context and which bit was last. The bit history is mapped to a
+prediction, which is updated by reducing the error by 1/1024.
+The initial prediction is estimated from the counts represented by each
+bit history.
+
+A MATCH looks up a context hash and predicts whatever bit came next
+following the previous occurrence in the history buffer. The strength
+of the prediction depends on the match length.
+
+AVG, MIX2, and MIX perform weighted averaging of predictions in the
+logistic domain (log(p/(1-p))). AVG uses a fixed weight. MIX2 and MIX
+adjust the weights (selected by context) to reduce prediction error
+by a rate that increases with r. The mask is AND-ed with the current
+partially coded byte to compute that context. Normally it is 255.
+A MIX takes a contiguous range of m components as input.
+
+ISSE adjusts a prediction using a bit history (as with an ICM) to
+select a pair of weights for a 2 input MIX. It mixes the input
+prediction with a constant 1 in the logistic domain.
+
+SSE adjusts a logistic prediction by quantizing it to 32 levels and
+selecting a new prediction from a table indexed by context, interpolating
+between the nearest two steps. The nearest prediction error is
+reduced by 1/count where count increments from t1 to 4*t2.
+
+Contexts are computed and stored in an array H of 32 bit unsigned
+integers by the HCOMP program written in ZPAQL. The program is called
+after encoding a whole byte. To form a complete context, these values
+are combined with the previous 0 to 7 bits of the current parital byte.
+The method depends on the component type as follows:
+
+  CM: H[i]    XOR hmap4(c).
+  ICM, ISSE:  hash table lookup of (H[i]*16+c) on nibble boundaries.
+  MIX2, MIX:  H[i] + (c AND x).
+  SSE:        H[i] + c.
+
+where c is the previous bits with a leading 1 bit (1, 1x, 1xx, ...,
+1xxxxxxx where x is a previously coded bit). hmap4(c) maps c
+to a 9 bit value to reduce cache misses. The first nibble is
+mapped as before and the second nibble with 1xxxx in the high
+5 bits. For example, after 6 bits, where c = 1xxxxxx,
+hmap4(c) = 1xxxx01xx with the bits in the same order.
+
+There are two ZPAQL virtual machines, HCOMP to compute contexts
+and PCOMP to post-process the decoded output. Each has the
+following state:
+
+  PC: 16 bit program counter.
+  A, B, C, D, R0...R255: 32 bit unsigned registers.
+  F: 1 bit condition register.
+  H: array of 2^h 32 bit unsigned values (output for HCOMP).
+  M: array of 2^m 8 bit unsigned values.
+
+All values are initialized to 0 at the beginning of a block
+and retain their values between calls. There are two machines.
+HCOMP is called after coding each byte with the value of that
+byte in A. PCOMP, if present, is called once for each decoded
+byte with that byte in A, and once more at the end of each
+segment with 2^32 - 1 in A.
+
+Normally, A is an accumulator. It is the destination of all
+binary operations except assignment. The low m bits of B and
+C index M. The low h bits of D indexes H. We write *B, *C, *D
+to refer to the elements they point to. The instruction set
+is as follows, where X is A, B, C, D, *B, *C, *D except as
+indicated. X may also be a constant 0...255, written with
+a leading space if it appears on the right side of an operator,
+e.g. "*B= 255". Instructions taking a numeric argument are 2 bytes,
+otherwise 1. Arithmetic is modulo 2^32.
+
+  X<>A    Swap X with A (X cannot be A).
+  X++     Add 1.
+  X--     Subtract 1.
+  X!      Complement bits of X.
+  X=0     Clear X (1 byte instruction).
+  X=X     Assignment to left hand side.
+  A+=X    Add to A
+  A-=X    Subtract from A
+  A*=X    Multipy
+  A/=X    Divide. If X is 0 then A=0.
+  A%=X    Mod. If X is 0 then A=0.
+  A&=X    Clear bits of A that are 0 in X.
+  A&~X    Clear bits of A that are 1 in X.
+  A|=X    Set bits of A that are 1 in X.
+  A^=X    Complement bits of A that are set in X.
+  A<<=X   Shift A left by (X mod 32) bits.
+  A>>=X   Shift right (zero fill) A by (X mod 32) bits.
+  A==X    Set F=1 if equal else F=0.
+  A<X     Set F=1 if less else F=0.
+  A>X     Set F=1 if greater else F=0.
+  X=R N   Set A,B,C,D to RN (R0...R255).
+  R=A N   Set R0...R255 to A.
+  JMP N   Jump N=-128...127 bytes from next instruction.
+  JT N    Jump N=-128...127 if F is 1.
+  JF N    Jump N=-128...127 if F is 0.
+  LJ N    Long jump to location 0...65535 (only 3 byte instruction).
+  OUT     Output A (PCOMP only).
+  HASH    A=(A+*B+512)*773.
+  HASHD   *D=(*D+A+512)*773.
+  HALT    Return at end of program.
+  ERROR   Fail if executed.
+
+Rather than using jump instructions, the following constructs are
+allowed and translated appropriately.
+
+  IF ... ENDIF              Execute if F is 1.
+  IFNOT ... ENDIF           Execute if F is 0.
+  IF ... ELSE ... ENDIF     Execute first part if F is 1 else second part.
+  IFNOT ... ELSE ... ENDIF  Execute first part if F is 0 else second part.
+  DO ... WHILE              Loop while F is 1.
+  DO ... UNTIL              Loop while F is 0.
+  DO ... FOREVER            Loop unconditionally.
+
+Forward jumps (IF, IFNOT, ELSE) will not compile if beyond 127
+instructions. In that case, use the long form (IFL, IFNOTL, ELSEL).
+DO loops automatically use long jumps if needed. IF and DO loops
+may intersect. For example, DO ... IF ... FOREVER ENDIF is equivalent
+to a while-loop.
+
+A config argument without a postprocessor has the following syntax:
+
+  COMP hh hm ph pm n
+    i COMP args...
+  HCOMP
+    zpaql...
+  END (or POST 0 END for backward compatibility)
+
+With a postprocessor:
+
+  COMP hh hm ph pm n
+    i COMP args...
+  HCOMP
+    zpaql...
+  PCOMP command args... ;
+    zpaql...
+  END
+
+In HCOMP, H and M have sizes 2^hh and 2^hm respectively. In PCOMP,
+H and M have sizes 2^ph and 2^pm respectively. There are n components,
+which must be numbered i = 0 to n-1. If a postprocessor is used, then
+"command args..." is written to the Writer* passed as the 4'th argument,
+but otherwise ignored. A typical use in a development environment might
+be to call an external program that will be passed two additional
+arguments on the command line, the input and output file names
+respectively.
+
+You can pass up to 9 signed numeric arguments in args[]. In any
+place that a number "N" is allowed, you can write "$M" or "$M+N"
+(like "$1" or $9+25") and value args[M-1]+N will be substituted.
+
+ZPAQL allows (nested) comments in parenthesis. It is not case sensitive.
+If there are input errors, then error() will report the error. If the
+string contains newlines, it will report the line number of the error.
+
+ZPAQL is compiled internally into a byte code, and then to native x86
+32 or 64 bit code (unless compiled with -DNOJIT, in which case the
+byte code is interpreted). You can also specify the algorithm directly
+in byte code, although this is less convenient because it requires two
+steps:
+
+  c.startBlock(hcomp);      // COMP and HCOMP at start of block
+  c.postProcess(pcomp, 0);  // PCOMP right before compress() in first segment
+
+This is necessary because the COMP and HCOMP sections are stored in
+the block header, but the PCOMP section is compressed in the first
+segment after the filename and comment but before any data.
+
+To retrive compiled byte code in suitable format after startBlock():
+
+  c.hcomp(&out);      // writes COMP and HCOMP sections
+  c.pcomp(&out);      // writes PCOMP section if any
+
+Or during decompression:
+
+  d.hcomp(&out);      // valid after findBlock()
+  d.pcomp(&out);      // valid after decompress(0) in first segment
+
+Both versions of pcomp() write nothing and return false if there is no
+PCOMP section. The output of hcomp() and pcomp() may be passed to the
+input of startBlock() and postProcess(). These are strings in which the
+first 2 bytes encode the length of the rest of the string, least
+significant byte first. Alternatively, postProcess() allows the length to
+be omitted and passed separately as the second argument. In the case
+of decompression, the HCOMP and PCOMP strings are read from the archive.
+The preprocessor command (from "PCOMP cmd ;") is not saved in the compressed
+data.
+
+
+ARRAY
+
+The libzpaq::Array template class is convenient for creating arrays aligned
+on 64 byte addresses. It calls error("Out of memory") if needed.
+It is used as follows:
+
+  libzpaq::Array<T> a(n);  // array a[0]..a[n-1] of type T, zeroed
+  a.resize(n);             // change size and zero contents
+  a[i]                     // i'th element
+  a(i)                     // a[i%n], valid only if n is a power of 2
+  a.size()                 // n (as a size_t)
+  a.isize()                // n (as a signed int)
+
+T should be a simple type without constructors or destructors. Arrays
+cannot be copied or assigned. You can also specify the size:
+
+  Array<T> a(n, e);  // n << e
+  a.resize(n, e);    // n << e
+
+which is equivalent to n << e except that it calls error("Array too big")
+rather than overflow if n << e would require more than 32 bits. If
+compiled with -DDEBUG, then bounds are checked at run time.
+
+
+ENCRYPTION
+
+There is a class libzpaq::SHA256 with put(), result(), size(), and usize()
+as in SHA1. result() returns a 32 byte SHA-256 hash. It is used by scrypt.
+
+The libzpaq::AES_CTR class allows encryption in CTR mode with 128, 192,
+or 256 bit keys. The public members are:
+
+class AES_CTR {
+public:
+  AES_CTR(const char* key, int keylen, char* iv=0);
+  void encrypt(U32 s0, U32 s1, U32 s2, U32 s3, unsigned char* ct);
+  void encrypt(char* buf, int n, U64 offset);
+};
+
+The constructor initializes with a 16, 24, or 32 byte key. The length
+is given by keylen. iv can be an 8 byte string or NULL. If not NULL
+then iv0, iv1 are initialized with iv[0..7] in big-endian order, else 0.
+
+encrypt(s0, s1, s2, s3, ct) encrypts a plaintext block divided into
+4 32-bit words MSB first. The first byte of plaintext is the high 8
+bits of s0. The output is to ct[16].
+
+encrypt(buf, n, offset) encrypts or decrypts an n byte slice of a string
+starting at offset. The i'th 16 byte block is encrypted by XOR with
+the result (in ct) of encrypt(iv0, iv1, i>>32, i&0xffffffff, ct) starting
+with i = 0. For example:
+
+  AES_CTR a("a 128 bit key!!!", 16);
+  char buf[500];             // some data 
+  a.encrypt(buf, 100, 0);    // encrypt first 100 bytes
+  a.encrypt(buf, 400, 100);  // encrypt next 400 bytes
+  a.encrypt(buf, 500, 0);    // decrypt in one step
+
+libzpaq::stretchKey(char* out, const char* in, const char* salt);
+
+Generate a 32 byte key out[0..31] from key[0..31] and salt[0..31]
+using scrypt(key, salt, N=16384, r=8, p=1). key[0..31] should be
+the SHA-256 hash of the password. With these parameters, the function
+uses 0.1 to 0.3 seconds and 16 MiB memory.
+Scrypt is defined in http://www.tarsnap.com/scrypt/scrypt.pdf
+
+void random(char* buf, int n);
+
+Puts n cryptographic random bytes in buf[0..n-1], where the first
+byte is never '7' or 'z' (start of a ZPAQ archive). For a pure
+random string, discard the first byte.
+
+Other classes and functions defined here are for internal use.
+Use at your own risk.
 */
 
+//////////////////////////////////////////////////////////////
+
 #ifndef LIBZPAQ_H
 #define LIBZPAQ_H
 
@@ -29,9 +836,10 @@ comprises the reference decoder for the ZPAQ level 2 standard.
 #define NDEBUG 1
 #endif
 #include <assert.h>
-#include <stddef.h>
 #include <stdint.h>
+#include <stdlib.h>
 #include <string.h>
+#include <algorithm>
 
 namespace libzpaq {
 
@@ -41,9 +849,10 @@ typedef uint16_t U16;
 typedef uint32_t U32;
 typedef uint64_t U64;
 
-// Standard library prototypes redirected to libzpaq.cpp
-void* calloc(size_t, size_t);
-void free(void*);
+// Tables for parsing ZPAQL source code
+extern const char* compname[256];    // list of ZPAQL component types
+extern const int compsize[256];      // number of bytes to encode a component
+extern const char* opcodelist[272];  // list of ZPAQL instructions
 
 // Callback for error handling
 extern void error(const char* msg);
@@ -56,6 +865,10 @@ class Reader {
 public:
   virtual int get() = 0;  // should return 0..255, or -1 at EOF
   virtual int read(char* buf, int n); // read to buf[n], return no. read
+/* TODO
+ * need to show progress
+  virtual void show_progress(int bytes_in); // lrzip addition to show progress
+*/
   virtual ~Reader() {}
 };
 
@@ -104,15 +917,16 @@ void Array<T>::resize(size_t sz, int ex) {
   if (n>0) {
     assert(offset>0 && offset<=64);
     assert((char*)data-offset);
-    free((char*)data-offset);
+    ::free((char*)data-offset);
   }
   n=0;
+  offset=0;
   if (sz==0) return;
   n=sz;
   const size_t nb=128+n*sizeof(T);  // test for overflow
-  if (nb<=128 || (nb-128)/sizeof(T)!=n) error("Array too big");
-  data=(T*)calloc(nb, 1);
-  if (!data) error("Out of memory");
+  if (nb<=128 || (nb-128)/sizeof(T)!=n) n=0, error("Array too big");
+  data=(T*)::calloc(nb, 1);
+  if (!data) n=0, error("Out of memory");
   offset=64-(((char*)data-(char*)0)&63);
   assert(offset>0 && offset<=64);
   data=(T*)((char*)data+offset);
@@ -124,29 +938,92 @@ void Array<T>::resize(size_t sz, int ex) {
 class SHA1 {
 public:
   void put(int c) {  // hash 1 byte
-    U32& r=w[len0>>5&15];
+    U32& r=w[U32(len)>>5&15];
+    r=(r<<8)|(c&255);
+    len+=8;
+    if ((U32(len)&511)==0) process();
+  }
+  void write(const char* buf, int64_t n); // hash buf[0..n-1]
+  double size() const {return len/8;}     // size in bytes
+  uint64_t usize() const {return len/8;}  // size in bytes
+  const char* result();  // get hash and reset
+  SHA1() {init();}
+private:
+  void init();      // reset, but don't clear hbuf
+  U64 len;          // length in bits
+  U32 h[5];         // hash state
+  U32 w[16];        // input buffer
+  char hbuf[20];    // result
+  void process();   // hash 1 block
+};
+
+//////////////////////////// SHA256 //////////////////////////
+
+// For computing SHA-256 checksums
+// http://en.wikipedia.org/wiki/SHA-2
+class SHA256 {
+public:
+  void put(int c) {  // hash 1 byte
+    unsigned& r=w[len0>>5&15];
     r=(r<<8)|(c&255);
     if (!(len0+=8)) ++len1;
     if ((len0&511)==0) process();
   }
   double size() const {return len0/8+len1*536870912.0;} // size in bytes
-  uint64_t usize() const {return len0/8+(U64(len1)<<29);} // size in bytes
+  uint64_t usize() const {return len0/8+(U64(len1)<<29);} //size in bytes
   const char* result();  // get hash and reset
-  SHA1() {init();}
+  SHA256() {init();}
 private:
-  void init();      // reset, but don't clear hbuf
-  U32 len0, len1;   // length in bits (low, high)
-  U32 h[5];         // hash state
-  U32 w[80];        // input buffer
-  char hbuf[20];    // result
-  void process();   // hash 1 block
+  void init();           // reset, but don't clear hbuf
+  unsigned len0, len1;   // length in bits (low, high)
+  unsigned s[8];         // hash state
+  unsigned w[16];        // input buffer
+  char hbuf[32];         // result
+  void process();        // hash 1 block
 };
 
+//////////////////////////// AES /////////////////////////////
+
+// For encrypting with AES in CTR mode.
+// The i'th 16 byte block is encrypted by XOR with AES(i)
+// (i is big endian or MSB first, starting with 0).
+class AES_CTR {
+  U32 Te0[256], Te1[256], Te2[256], Te3[256], Te4[256]; // encryption tables
+  U32 ek[60];  // round key
+  int Nr;  // number of rounds (10, 12, 14 for AES 128, 192, 256)
+  U32 iv0, iv1;  // first 8 bytes in CTR mode
+public:
+  AES_CTR(const char* key, int keylen, const char* iv=0);
+    // Schedule: keylen is 16, 24, or 32, iv is 8 bytes or NULL
+  void encrypt(U32 s0, U32 s1, U32 s2, U32 s3, unsigned char* ct);
+  void encrypt(char* buf, int n, U64 offset);  // encrypt n bytes of buf
+};
+
+//////////////////////////// stretchKey //////////////////////
+
+// Strengthen password pw[0..pwlen-1] and salt[0..saltlen-1]
+// to produce key buf[0..buflen-1]. Uses O(n*r*p) time and 128*r*n bytes
+// of memory. n must be a power of 2 and r <= 8.
+void scrypt(const char* pw, int pwlen,
+            const char* salt, int saltlen,
+            int n, int r, int p, char* buf, int buflen);
+
+// Generate a strong key out[0..31] key[0..31] and salt[0..31].
+// Calls scrypt(key, 32, salt, 32, 16384, 8, 1, out, 32);
+void stretchKey(char* out, const char* key, const char* salt);
+
+//////////////////////////// random //////////////////////////
+
+// Fill buf[0..n-1] with n cryptographic random bytes. The first
+// byte is never '7' or 'z'.
+void random(char* buf, int n);
+
 //////////////////////////// ZPAQL ///////////////////////////
 
 // Symbolic constants, instruction size, and names
 typedef enum {NONE,CONS,CM,ICM,MATCH,AVG,MIX2,MIX,ISSE,SSE} CompType;
 extern const int compsize[256];
+class Decoder;  // forward
 
 // A ZPAQL machine COMP+HCOMP or PCOMP.
 class ZPAQL {
@@ -167,8 +1044,8 @@ public:
   U32 H(int i) {return h(i);}  // get element of h
 
   void flush();           // write outbuf[0..bufptr-1] to output and sha1
-  void outc(int c) {      // output byte c (0..255) or -1 at EOS
-    if (c<0 || (outbuf[bufptr]=c, ++bufptr==outbuf.isize())) flush();
+  void outc(int ch) {     // output byte ch (0..255) or -1 at EOS
+    if (ch<0 || (outbuf[bufptr]=ch, ++bufptr==outbuf.isize())) flush();
   }
 
   // ZPAQ1 block header
@@ -192,8 +1069,8 @@ private:
   // Support code
   int assemble();  // put JIT code in rcode
   void init(int hbits, int mbits);  // initialize H and M sizes
-  int execute();  // execute 1 instruction, return 0 after HALT, else 1
-  void run0(U32 input);  // default run() when select==0
+  int execute();  // interpret 1 instruction, return 0 after HALT, else 1
+  void run0(U32 input);  // default run() if not JIT
   void div(U32 x) {if (x) a/=x; else a=0;}
   void mod(U32 x) {if (x) a%=x; else a=0;}
   void swap(U32& x) {a^=x; x^=a; a^=x;}
@@ -221,12 +1098,8 @@ struct Component {
 
 ////////////////////////// StateTable ////////////////////////
 
-// Next state table generator
+// Next state table
 class StateTable {
-  enum {N=64}; // sizes of b, t
-  int num_states(int n0, int n1);  // compute t[n0][n1][1]
-  void discount(int& n0);  // set new value of n0 after 1 or n1 after 0
-  void next_state(int& n0, int& n1, int y);  // new (n0,n1) after bit y
 public:
   U8 ns[1024]; // state*4 -> next state if 0, if 1, n0, n1
   int next(int state, int y) {  // next state for bit y
@@ -265,6 +1138,7 @@ private:
   U32 h[256];           // unrolled copy of z.h
   ZPAQL& z;             // VM to compute context hashes, includes H, n
   Component comp[256];  // the model, includes P
+  bool initTables;      // are tables initialized?
 
   // Modeling support functions
   int predict0();       // default
@@ -288,12 +1162,14 @@ private:
 
   // x -> floor(32768/(1+exp(-x/64)))
   int squash(int x) {
+    assert(initTables);
     assert(x>=-2048 && x<=2047);
     return squasht[x+2048];
   }
 
   // x -> round(64*log((x+0.5)/(32767.5-x))), approx inverse of squash
   int stretch(int x) {
+    assert(initTables);
     assert(x>=0 && x<=32767);
     return stretcht[x];
   }
@@ -322,7 +1198,7 @@ private:
 //////////////////////////// Decoder /////////////////////////
 
 // Decoder decompresses using an arithmetic code
-class Decoder {
+class Decoder: public Reader {
 public:
   Reader* in;        // destination
   Decoder(ZPAQL& z);
@@ -330,16 +1206,23 @@ public:
   int skip();        // skip to the end of the segment, return next byte
   void init();       // initialize at start of block
   int stat(int x) {return pr.stat(x);}
+  int get() {        // return 1 byte of buffered input or EOF
+    if (rpos==wpos) {
+      rpos=0;
+      wpos=in ? in->read(&buf[0], BUFSIZE) : 0;
+      assert(wpos<=BUFSIZE);
+    }
+    return rpos<wpos ? U8(buf[rpos++]) : -1;
+  }
+  int buffered() {return wpos-rpos;}  // how far read ahead?
 private:
   U32 low, high;     // range
-  U32 curr;          // last 4 bytes of archive
+  U32 curr;          // last 4 bytes of archive or remaining bytes in subblock
+  U32 rpos, wpos;    // read, write position in buf
   Predictor pr;      // to get p
   enum {BUFSIZE=1<<16};
   Array<char> buf;   // input buffer of size BUFSIZE bytes
-    // of unmodeled data. buf[low..high-1] is input with curr
-    // remaining in sub-block.
   int decode(int p); // return decoded bit (0..1) with prob. p (0..65535)
-  void loadbuf();    // read unmodeled data into buf to EOS
 };
 
 /////////////////////////// PostProcessor ////////////////////
@@ -375,6 +1258,7 @@ public:
   bool pcomp(Writer* out2) {return pp.z.write(out2, true);}
   void readSegmentEnd(char* sha1string = 0);
   int stat(int x) {return dec.stat(x);}
+  int buffered() {return dec.buffered();}
 private:
   ZPAQL z;
   Decoder dec;
@@ -387,17 +1271,12 @@ private:
 
 void decompress(Reader* in, Writer* out);
 
-//////////////////////////////////////////////////////////////
-//////////////////////////////////////////////////////////////
-
-// Code following this point is not a part of the ZPAQ level 2 standard.
-
 //////////////////////////// Encoder /////////////////////////
 
 // Encoder compresses using an arithmetic code
 class Encoder {
 public:
-  Encoder(ZPAQL& z):
+  Encoder(ZPAQL& z, int size=0):
     out(0), low(1), high(0xFFFFFFFF), pr(z) {}
   void init();
   void compress(int c);  // c is 0..255 or EOF
@@ -410,37 +1289,231 @@ private:
   void encode(int y, int p); // encode bit y (0..1) with prob. p (0..65535)
 };
 
+//////////////////////////// Compiler ////////////////////////
+
+// Input ZPAQL source code with args and store the compiled code
+// in hz and pz and write pcomp_cmd to out2.
+
+class Compiler {
+public:
+  Compiler(const char* in, int* args, ZPAQL& hz, ZPAQL& pz, Writer* out2);
+private:
+  const char* in;  // ZPAQL source code
+  int* args;       // Array of up to 9 args, default NULL = all 0
+  ZPAQL& hz;       // Output of COMP and HCOMP sections
+  ZPAQL& pz;       // Output of PCOMP section
+  Writer* out2;    // Output ... of "PCOMP ... ;"
+  int line;        // Input line number for reporting errors
+  int state;       // parse state: 0=space -1=word >0 (nest level)
+
+  // Symbolic constants
+  typedef enum {NONE,CONS,CM,ICM,MATCH,AVG,MIX2,MIX,ISSE,SSE,
+    JT=39,JF=47,JMP=63,LJ=255,
+    POST=256,PCOMP,END,IF,IFNOT,ELSE,ENDIF,DO,
+    WHILE,UNTIL,FOREVER,IFL,IFNOTL,ELSEL,SEMICOLON} CompType;
+
+  void syntaxError(const char* msg, const char* expected=0); // error()
+  void next();                     // advance in to next token
+  bool matchToken(const char* tok);// in==token?
+  int rtoken(int low, int high);   // return token which must be in range
+  int rtoken(const char* list[]);  // return token by position in list
+  void rtoken(const char* s);      // return token which must be s
+  int compile_comp(ZPAQL& z);      // compile either HCOMP or PCOMP
+
+  // Stack of n elements
+  class Stack {
+    libzpaq::Array<U16> s;
+    size_t top;
+  public:
+    Stack(int n): s(n), top(0) {}
+    void push(const U16& x) {
+      if (top>=s.size()) error("IF or DO nested too deep");
+      s[top++]=x;
+    }
+    U16 pop() {
+      if (top<=0) error("unmatched IF or DO");
+      return s[--top];
+    }
+  };
+
+  Stack if_stack, do_stack;
+};
+
 //////////////////////// Compressor //////////////////////////
 
 class Compressor {
 public:
-  Compressor(): enc(z), in(0), state(INIT) {}
+  Compressor(): enc(z), in(0), state(INIT), verify(false) {}
   void setOutput(Writer* out) {enc.out=out;}
   void writeTag();
   void startBlock(int level);  // level=1,2,3
-  void startBlock(const char* hcomp);
+  void startBlock(const char* hcomp);     // ZPAQL byte code
+  void startBlock(const char* config,     // ZPAQL source code
+                  int* args,              // NULL or int[9] arguments
+                  Writer* pcomp_cmd = 0); // retrieve preprocessor command
+  void setVerify(bool v) {verify = v;}    // check postprocessing?
+  void hcomp(Writer* out2) {z.write(out2, false);}
+  bool pcomp(Writer* out2) {return pz.write(out2, true);}
   void startSegment(const char* filename = 0, const char* comment = 0);
   void setInput(Reader* i) {in=i;}
-  void postProcess(const char* pcomp = 0, int len = 0);
+  void postProcess(const char* pcomp = 0, int len = 0);  // byte code
   bool compress(int n = -1);  // n bytes, -1=all, return true until done
   void endSegment(const char* sha1string = 0);
+  char* endSegmentChecksum(int64_t* size = 0, bool dosha1=true);
+  int64_t getSize() {return sha1.usize();}
+  const char* getChecksum() {return sha1.result();}
   void endBlock();
   int stat(int x) {return enc.stat(x);}
 private:
-  ZPAQL z;
-  Encoder enc;
-  Reader* in;
+  ZPAQL z, pz;  // model and test postprocessor
+  Encoder enc;  // arithmetic encoder containing predictor
+  SHA1 sha1;    // to test pz output
+  Reader* in;   // input source
+  char sha1result[20];  // sha1 output
   enum {INIT, BLOCK1, SEG1, BLOCK2, SEG2} state;
+  bool verify;  // if true then test by postprocessing
+};
+
+/////////////////////////// StringBuffer /////////////////////
+
+// For (de)compressing to/from a string. Writing appends bytes
+// which can be later read.
+class StringBuffer: public libzpaq::Reader, public libzpaq::Writer {
+  unsigned char* p;  // allocated memory, not NUL terminated, may be NULL
+  size_t al;         // number of bytes allocated, 0 iff p is NULL
+  size_t wpos;       // index of next byte to write, wpos <= al
+  size_t rpos;       // index of next byte to read, rpos < wpos or return EOF.
+  size_t limit;      // max size, default = -1
+  const size_t init; // initial size on first use after reset
+
+  // Increase capacity to a without changing size
+  void reserve(size_t a) {
+    assert(!al==!p);
+    if (a<=al) return;
+    unsigned char* q=0;
+    if (a>0) q=(unsigned char*)(p ? realloc(p, a) : malloc(a));
+    if (a>0 && !q) error("Out of memory");
+    p=q;
+    al=a;
+  }
+
+  // Enlarge al to make room to write at least n bytes.
+  void lengthen(size_t n) {
+    assert(wpos<=al);
+    if (wpos+n>limit || wpos+n<wpos) error("StringBuffer overflow");
+    if (wpos+n<=al) return;
+    size_t a=al;
+    while (wpos+n>=a) a=a*2+init;
+    reserve(a);
+  }
+
+  // No assignment or copy
+  void operator=(const StringBuffer&);
+  StringBuffer(const StringBuffer&);
+
+public:
+
+  // Direct access to data
+  unsigned char* data() {assert(p || wpos==0); return p;}
+
+  // Allocate no memory initially
+  StringBuffer(size_t n=0):
+      p(0), al(0), wpos(0), rpos(0), limit(size_t(-1)), init(n>128?n:128) {}
+
+  // Set output limit
+  void setLimit(size_t n) {limit=n;}
+
+  // Free memory
+  ~StringBuffer() {if (p) free(p);}
+
+  // Return number of bytes written.
+  size_t size() const {return wpos;}
+
+  // Return number of bytes left to read
+  size_t remaining() const {return wpos-rpos;}
+
+  // Reset size to 0 and free memory.
+  void reset() {
+    if (p) free(p);
+    p=0;
+    al=rpos=wpos=0;
+  }
+
+  // Write a single byte.
+  void put(int c) {  // write 1 byte
+    lengthen(1);
+    assert(p);
+    assert(wpos<al);
+    p[wpos++]=c;
+    assert(wpos<=al);
+  }
+
+  // Write buf[0..n-1]. If buf is NULL then advance write pointer only.
+  void write(const char* buf, int n) {
+    if (n<1) return;
+    lengthen(n);
+    assert(p);
+    assert(wpos+n<=al);
+    if (buf) memcpy(p+wpos, buf, n);
+    wpos+=n;
+  }
+
+  // Read a single byte. Return EOF (-1) at end.
+  int get() {
+    assert(rpos<=wpos);
+    assert(rpos==wpos || p);
+    return rpos<wpos ? p[rpos++] : -1;
+  }
+
+  // Read up to n bytes into buf[0..] or fewer if EOF is first.
+  // Return the number of bytes actually read.
+  // If buf is NULL then advance read pointer without reading.
+  int read(char* buf, int n) {
+    assert(rpos<=wpos);
+    assert(wpos<=al);
+    assert(!al==!p);
+    if (rpos+n>wpos) n=wpos-rpos;
+    if (n>0 && buf) memcpy(buf, p+rpos, n);
+    rpos+=n;
+    return n;
+  }
+
+  // Return the entire string as a read-only array.
+  const char* c_str() const {return (const char*)p;}
+
+  // Truncate the string to size i.
+  void resize(size_t i) {
+    wpos=i;
+    if (rpos>wpos) rpos=wpos;
+  }
+
+  // Swap efficiently (init is not swapped)
+  void swap(StringBuffer& s) {
+    std::swap(p, s.p);
+    std::swap(al, s.al);
+    std::swap(wpos, s.wpos);
+    std::swap(rpos, s.rpos);
+    std::swap(limit, s.limit);
+  }
 };
 
 /////////////////////////// compress() ///////////////////////
 
-void compress(Reader* in, Writer* out, int level);
+// Compress in to out in multiple blocks. Default method is "14,128,0"
+// Default filename is "". Comment is appended to input size.
+// dosha1 means save the SHA-1 checksum.
+void compress(Reader* in, Writer* out, const char* method,
+     const char* filename=0, const char* comment=0, bool dosha1=true);
+
+// Same as compress() but output is 1 block, ignoring block size parameter.
+void compressBlock(StringBuffer* in, Writer* out, const char* method,
+     const char* filename=0, const char* comment=0, bool dosha1=true);
 
 }  // namespace libzpaq
 
 /////////////////////////// lrzip functions //////////////////
 
+
 #include <stdio.h>
 #ifndef uchar
 #define uchar unsigned char
@@ -463,8 +1536,8 @@ struct bufRead: public libzpaq::Reader {
 	bufRead(uchar *buf_, i64 *n_, i64 total_len_, int *last_pct_, bool progress_, long thread_, FILE *msgout_):
 		s_buf(buf_), s_len(n_), total_len(total_len_), last_pct(last_pct_), progress(progress_), thread(thread_), msgout(msgout_) {}
 
-	int get() {
-		if (progress && !(*s_len % 128)) {
+	void show_progress (int bytes_in) {
+		if (progress) {
 			int pct = (total_len > 0) ?
 				(total_len - *s_len) * 100 / total_len : 100;
 
@@ -480,9 +1553,13 @@ struct bufRead: public libzpaq::Reader {
 				*last_pct = pct;
 			}
 		}
+	}
 
+	int get() {
 		if (likely(*s_len > 0)) {
 			(*s_len)--;
+			if (!(*s_len % 128))
+				show_progress(*s_len);
 			return ((int)(uchar)*s_buf++);
 		}
 		return -1;
@@ -520,11 +1597,14 @@ extern "C" void zpaq_compress(uchar *c_buf, i64 *c_len, uchar *s_buf, i64 s_len,
 {
 	i64 total_len = s_len;
 	int last_pct = 100;
+	char method[]="041280";		// defaults for buffer size, compressibility, data type
 
 	bufRead bufR(s_buf, &s_len, total_len, &last_pct, progress, thread, msgout);
 	bufWrite bufW(c_buf, c_len);
 
-	compress (&bufR, &bufW, level);
+	method[0]=(char)level+'0';	// set level 1-5
+
+	compress (&bufR, &bufW, (const char *) &method,  NULL, NULL, true);
 }
 
 extern "C" void zpaq_decompress(uchar *s_buf, i64 *d_len, uchar *c_buf, i64 c_len,
@@ -539,4 +1619,9 @@ extern "C" void zpaq_decompress(uchar *s_buf, i64 *d_len, uchar *c_buf, i64 c_le
 	decompress(&bufR, &bufW);
 }
 
+void libzpaq::error(const char* msg) {  // print message and exit
+    fprintf(stderr, "ZPAQ Error: %s\n", msg);
+    exit(1);
+}
+
 #endif  // LIBZPAQ_H
diff --git a/stream.c b/stream.c
index 407c550..492381d 100644
--- a/stream.c
+++ b/stream.c
@@ -181,7 +181,8 @@ static int zpaq_compress_buf(rzip_control *control, struct compress_thread *cthr
 
 	c_len = 0;
 
-	zpaq_compress(c_buf, &c_len, cthread->s_buf, cthread->s_len, control->compression_level / 4 + 1,
+	/* Compression level can be 1 to 5, zpaq version 7.15 */
+	zpaq_compress(c_buf, &c_len, cthread->s_buf, cthread->s_len, control->compression_level / 2 + 1,
 		      control->msgout, SHOW_PROGRESS ? true: false, thread);
 
 	if (unlikely(c_len >= cthread->c_len)) {