/* Version-control identification string.  "$Id$" is a keyword placeholder,
   presumably expanded by the revision-control system on checkout; nothing in
   the visible part of this file reads the variable. */
static char rcsid[] = "$Id$";
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif


/* Modified from simdbitmapdecode.c by Daniel Lemire */

#include "genomebits_decode.h"

#include "simd.h"

#if 0
#ifdef HAVE_SSE2
#include <emmintrin.h>
#endif
#ifdef HAVE_AVX2
#include <immintrin.h>
#endif
#endif


/* ALIGNED(x): compiler-specific annotation asking for x-byte alignment of the
   declaration it is attached to.  It is deliberately left undefined when
   neither MSVC nor a GNU-compatible compiler is detected. */
#if defined(_MSC_VER)
#define ALIGNED(x) __declspec(align(x))
#elif defined(__GNUC__)
#define ALIGNED(x) __attribute__ ((aligned(x)))
#endif


#ifdef HAVE_SSE2
/* decode_length[b] is the number of 1 bits in the byte value b, i.e. how many
   leading entries of the matching decode-table row are meaningful.

   The 256 entries are generated two bits at a time: each helper takes the
   count contributed by the higher bits and expands to the four counts for
   the next lower pair of bits (00, 01, 10, 11). */
#define DECODE_LENGTH_2BITS(n) (n), (n) + 1, (n) + 1, (n) + 2
#define DECODE_LENGTH_4BITS(n) \
  DECODE_LENGTH_2BITS(n), DECODE_LENGTH_2BITS((n) + 1), \
  DECODE_LENGTH_2BITS((n) + 1), DECODE_LENGTH_2BITS((n) + 2)
#define DECODE_LENGTH_6BITS(n) \
  DECODE_LENGTH_4BITS(n), DECODE_LENGTH_4BITS((n) + 1), \
  DECODE_LENGTH_4BITS((n) + 1), DECODE_LENGTH_4BITS((n) + 2)

static const int decode_length[256] = {
  DECODE_LENGTH_6BITS(0),  /* 0x00 .. 0x3F */
  DECODE_LENGTH_6BITS(1),  /* 0x40 .. 0x7F */
  DECODE_LENGTH_6BITS(1),  /* 0x80 .. 0xBF */
  DECODE_LENGTH_6BITS(2)   /* 0xC0 .. 0xFF */
};

#undef DECODE_LENGTH_6BITS
#undef DECODE_LENGTH_4BITS
#undef DECODE_LENGTH_2BITS
#endif

#ifdef HAVE_SSE2
/* Row b lists, in ascending order, the 1-based positions (counted from the
   least significant bit) of the set bits of the byte value b, padded with
   zeros to eight entries.  For example, row 0x05 (00000101) is
   { 1, 3, 0, 0, 0, 0, 0, 0 } and row 0xFF is { 1, 2, 3, 4, 5, 6, 7, 8 }.

   The rows are generated by the preprocessor rather than written out.
   TRAILING_ROWS_k receives a partial row that already accounts for the bit
   positions above k and emits every row for both settings of position k:
   first with the bit clear (one more padding zero appended), then with the
   bit set (k prepended).  Because the "clear" half always comes first, the
   rows appear in increasing byte order. */
#define TRAILING_ROWS_1(...) { __VA_ARGS__, 0 }, { 1, __VA_ARGS__ },
#define TRAILING_ROWS_2(...) TRAILING_ROWS_1(__VA_ARGS__, 0) TRAILING_ROWS_1(2, __VA_ARGS__)
#define TRAILING_ROWS_3(...) TRAILING_ROWS_2(__VA_ARGS__, 0) TRAILING_ROWS_2(3, __VA_ARGS__)
#define TRAILING_ROWS_4(...) TRAILING_ROWS_3(__VA_ARGS__, 0) TRAILING_ROWS_3(4, __VA_ARGS__)
#define TRAILING_ROWS_5(...) TRAILING_ROWS_4(__VA_ARGS__, 0) TRAILING_ROWS_4(5, __VA_ARGS__)
#define TRAILING_ROWS_6(...) TRAILING_ROWS_5(__VA_ARGS__, 0) TRAILING_ROWS_5(6, __VA_ARGS__)
#define TRAILING_ROWS_7(...) TRAILING_ROWS_6(__VA_ARGS__, 0) TRAILING_ROWS_6(7, __VA_ARGS__)

static const int decode_trailing_table[256][8] ALIGNED(32) = {
  TRAILING_ROWS_7(0)  /* 0x00 .. 0x7F: position 8 clear */
  TRAILING_ROWS_7(8)  /* 0x80 .. 0xFF: position 8 set */
};

#undef TRAILING_ROWS_7
#undef TRAILING_ROWS_6
#undef TRAILING_ROWS_5
#undef TRAILING_ROWS_4
#undef TRAILING_ROWS_3
#undef TRAILING_ROWS_2
#undef TRAILING_ROWS_1
#endif

#ifdef HAVE_SSE2
/* Row b lists, in ascending order, the 1-based positions (counted from the
   most significant bit) of the set bits of the byte value b, padded with
   zeros to eight entries.  For example, row 0x81 (10000001) is
   { 1, 8, 0, 0, 0, 0, 0, 0 } and row 0x03 (00000011) is
   { 7, 8, 0, 0, 0, 0, 0, 0 }.

   The rows are generated by the preprocessor rather than written out.
   LEADING_ROWS_v receives the positions chosen so far (preceded by one
   placeholder so the list is never empty) and emits every row for both
   settings of the bit whose position is v: first with the bit clear (list
   unchanged), then with the bit set (v appended).  LEADING_ROW drops the
   placeholder and zero-pads the list to exactly eight entries.  Because the
   "clear" half always comes first, the rows appear in increasing byte
   order. */
#define LEADING_ROW_PICK(skip, a, b, c, d, e, f, g, h, ...) { a, b, c, d, e, f, g, h },
#define LEADING_ROW(...) LEADING_ROW_PICK(__VA_ARGS__, 0, 0, 0, 0, 0, 0, 0, 0, 0)
#define LEADING_ROWS_8(...) LEADING_ROW(__VA_ARGS__) LEADING_ROW(__VA_ARGS__, 8)
#define LEADING_ROWS_7(...) LEADING_ROWS_8(__VA_ARGS__) LEADING_ROWS_8(__VA_ARGS__, 7)
#define LEADING_ROWS_6(...) LEADING_ROWS_7(__VA_ARGS__) LEADING_ROWS_7(__VA_ARGS__, 6)
#define LEADING_ROWS_5(...) LEADING_ROWS_6(__VA_ARGS__) LEADING_ROWS_6(__VA_ARGS__, 5)
#define LEADING_ROWS_4(...) LEADING_ROWS_5(__VA_ARGS__) LEADING_ROWS_5(__VA_ARGS__, 4)
#define LEADING_ROWS_3(...) LEADING_ROWS_4(__VA_ARGS__) LEADING_ROWS_4(__VA_ARGS__, 3)
#define LEADING_ROWS_2(...) LEADING_ROWS_3(__VA_ARGS__) LEADING_ROWS_3(__VA_ARGS__, 2)

static const int decode_leading_table[256][8] ALIGNED(32) = {
  LEADING_ROWS_2(0)     /* 0x00 .. 0x7F: most significant bit clear */
  LEADING_ROWS_2(0, 1)  /* 0x80 .. 0xFF: most significant bit set (position 1) */
};

#undef LEADING_ROWS_2
#undef LEADING_ROWS_3
#undef LEADING_ROWS_4
#undef LEADING_ROWS_5
#undef LEADING_ROWS_6
#undef LEADING_ROWS_7
#undef LEADING_ROWS_8
#undef LEADING_ROW
#undef LEADING_ROW_PICK
#endif


#ifdef HAVE_SSE2
/* Expands the set bits of a 64-bit word into a list of positions, scanning
   from the least significant byte upward.  A set bit k (0 = least significant)
   is written as offset + k, so the positions come out in ascending order.

     out             destination for the positions.  Every step stores a whole
                     vector (eight ints, or one or two groups of four without
                     AVX2) at the current write position, even when fewer
                     entries are meaningful, so the caller must provide that
                     much slack beyond the valid entries; nothing here checks
                     it.
     nmismatches     running count on entry
     word            bit mask to decode
     offset          position assigned to bit 0
     max_mismatches  decoding stops after the first byte that pushes the
                     running count above this value

   Returns the updated running count, which may exceed max_mismatches. */
int
Genomebits_decode_trailing_64 (int *out, int nmismatches, uint64_t word, int offset,
                               int max_mismatches) {
#ifdef HAVE_AVX2
  const __m256i byte_step = _mm256_set1_epi32(8); /* Bits per byte */
  __m256i base, positions;
#else
  const __m128i byte_step = _mm_set1_epi32(8); /* Bits per byte */
  __m128i base, positions;
#endif
  const int *row;
  int nset, bytes_left;

  if (word == 0) {
    return nmismatches;
  }

  /* The table entries are 1-based, hence the bias of -1 */
#ifdef HAVE_AVX2
  base = _mm256_set1_epi32(offset - 1);
#else
  base = _mm_set1_epi32(offset - 1);
#endif

  for (bytes_left = 8; bytes_left > 0; bytes_left--) {
    row = decode_trailing_table[(unsigned char) word];
    nset = decode_length[(unsigned char) word];

#ifdef HAVE_AVX2
    positions = _mm256_add_epi32(base, _mm256_load_si256((const __m256i *) row));
    _mm256_storeu_si256((__m256i *) out, positions);
#else
    positions = _mm_add_epi32(base, _mm_load_si128((const __m128i *) row));
    _mm_storeu_si128((__m128i *) out, positions);
    if (nset > 4) {
      /* More than four set bits: the upper half of the row is needed too */
      positions = _mm_add_epi32(base, _mm_load_si128((const __m128i *) (row + 4)));
      _mm_storeu_si128((__m128i *) (out + 4), positions);
    }
#endif

    /* Only the first nset entries just stored are kept */
    out += nset;
    nmismatches += nset;
    if (nmismatches > max_mismatches) {
      break;
    }

    /* Move on to the next more significant byte */
#ifdef HAVE_AVX2
    base = _mm256_add_epi32(base, byte_step);
#else
    base = _mm_add_epi32(base, byte_step);
#endif
    word >>= 8;
  }

  return nmismatches;
}


/* Expands the set bits of a 32-bit word into a list of positions, scanning
   from the least significant byte upward.  A set bit k (0 = least significant)
   is written as offset + k, so the positions come out in ascending order.

     out             destination for the positions.  Every step stores a whole
                     vector (eight ints, or one or two groups of four without
                     AVX2) at the current write position, even when fewer
                     entries are meaningful, so the caller must provide that
                     much slack beyond the valid entries; nothing here checks
                     it.
     nmismatches     running count on entry
     word            bit mask to decode
     offset          position assigned to bit 0
     max_mismatches  decoding stops after the first byte that pushes the
                     running count above this value

   Returns the updated running count, which may exceed max_mismatches. */
int
Genomebits_decode_trailing_32 (int *out, int nmismatches, uint32_t word, int offset,
                               int max_mismatches) {
#ifdef HAVE_AVX2
  const __m256i byte_step = _mm256_set1_epi32(8); /* Bits per byte */
  __m256i base, positions;
#else
  const __m128i byte_step = _mm_set1_epi32(8); /* Bits per byte */
  __m128i base, positions;
#endif
  const int *row;
  int nset, bytes_left;

  if (word == 0) {
    return nmismatches;
  }

  /* The table entries are 1-based, hence the bias of -1 */
#ifdef HAVE_AVX2
  base = _mm256_set1_epi32(offset - 1);
#else
  base = _mm_set1_epi32(offset - 1);
#endif

  for (bytes_left = 4; bytes_left > 0; bytes_left--) {
    row = decode_trailing_table[(unsigned char) word];
    nset = decode_length[(unsigned char) word];

#ifdef HAVE_AVX2
    positions = _mm256_add_epi32(base, _mm256_load_si256((const __m256i *) row));
    _mm256_storeu_si256((__m256i *) out, positions);
#else
    positions = _mm_add_epi32(base, _mm_load_si128((const __m128i *) row));
    _mm_storeu_si128((__m128i *) out, positions);
    if (nset > 4) {
      /* More than four set bits: the upper half of the row is needed too */
      positions = _mm_add_epi32(base, _mm_load_si128((const __m128i *) (row + 4)));
      _mm_storeu_si128((__m128i *) (out + 4), positions);
    }
#endif

    /* Only the first nset entries just stored are kept */
    out += nset;
    nmismatches += nset;
    if (nmismatches > max_mismatches) {
      break;
    }

    /* Move on to the next more significant byte */
#ifdef HAVE_AVX2
    base = _mm256_add_epi32(base, byte_step);
#else
    base = _mm_add_epi32(base, byte_step);
#endif
    word >>= 8;
  }

  return nmismatches;
}


/* Expands the set bits of a 64-bit word into a list of positions, scanning
   from the most significant byte downward.  The most significant bit maps to
   offset and each less significant bit to one position lower (bit k, with
   0 = least significant, is written as offset - (63 - k)), so the positions
   come out in descending order.

     out             destination for the positions.  Every step stores a whole
                     vector (eight ints, or one or two groups of four without
                     AVX2) at the current write position, even when fewer
                     entries are meaningful, so the caller must provide that
                     much slack beyond the valid entries; nothing here checks
                     it.
     nmismatches     running count on entry
     word            bit mask to decode
     offset          position assigned to the most significant bit
     max_mismatches  decoding stops after the first byte that pushes the
                     running count above this value

   Returns the updated running count, which may exceed max_mismatches. */
int
Genomebits_decode_leading_64 (int *out, int nmismatches, uint64_t word, int offset,
                              int max_mismatches) {
#ifdef HAVE_AVX2
  const __m256i byte_step = _mm256_set1_epi32(8); /* Bits per byte */
  __m256i base, positions;
#else
  const __m128i byte_step = _mm_set1_epi32(8); /* Bits per byte */
  __m128i base, positions;
#endif
  const int *row;
  unsigned char bits;
  int nset, shift;

  if (word == 0) {
    return nmismatches;
  }

  /* The table entries are 1-based and get subtracted, hence the bias of +1 */
#ifdef HAVE_AVX2
  base = _mm256_set1_epi32(offset + 1);
#else
  base = _mm_set1_epi32(offset + 1);
#endif

  /* shift selects the byte: 56 is the most significant, 0 the least */
  for (shift = 56; shift >= 0; shift -= 8) {
    bits = (unsigned char) (word >> shift);
    row = decode_leading_table[bits];
    nset = decode_length[bits];

#ifdef HAVE_AVX2
    positions = _mm256_sub_epi32(base, _mm256_load_si256((const __m256i *) row));
    _mm256_storeu_si256((__m256i *) out, positions);
#else
    positions = _mm_sub_epi32(base, _mm_load_si128((const __m128i *) row));
    _mm_storeu_si128((__m128i *) out, positions);
    if (nset > 4) {
      /* More than four set bits: the upper half of the row is needed too */
      positions = _mm_sub_epi32(base, _mm_load_si128((const __m128i *) (row + 4)));
      _mm_storeu_si128((__m128i *) (out + 4), positions);
    }
#endif

    /* Only the first nset entries just stored are kept */
    out += nset;
    nmismatches += nset;
    if (nmismatches > max_mismatches) {
      break;
    }

    /* Move on to the next less significant byte */
#ifdef HAVE_AVX2
    base = _mm256_sub_epi32(base, byte_step);
#else
    base = _mm_sub_epi32(base, byte_step);
#endif
  }

  return nmismatches;
}


/* Expands the set bits of a 32-bit word into a list of positions, scanning
   from the most significant byte downward.  The most significant bit maps to
   offset and each less significant bit to one position lower (bit k, with
   0 = least significant, is written as offset - (31 - k)), so the positions
   come out in descending order.

     out             destination for the positions.  Every step stores a whole
                     vector (eight ints, or one or two groups of four without
                     AVX2) at the current write position, even when fewer
                     entries are meaningful, so the caller must provide that
                     much slack beyond the valid entries; nothing here checks
                     it.
     nmismatches     running count on entry
     word            bit mask to decode
     offset          position assigned to the most significant bit
     max_mismatches  decoding stops after the first byte that pushes the
                     running count above this value

   Returns the updated running count, which may exceed max_mismatches. */
int
Genomebits_decode_leading_32 (int *out, int nmismatches, uint32_t word, int offset,
                              int max_mismatches) {
#ifdef HAVE_AVX2
  const __m256i byte_step = _mm256_set1_epi32(8); /* Bits per byte */
  __m256i base, positions;
#else
  const __m128i byte_step = _mm_set1_epi32(8); /* Bits per byte */
  __m128i base, positions;
#endif
  const int *row;
  unsigned char bits;
  int nset, shift;

  if (word == 0) {
    return nmismatches;
  }

  /* The table entries are 1-based and get subtracted, hence the bias of +1 */
#ifdef HAVE_AVX2
  base = _mm256_set1_epi32(offset + 1);
#else
  base = _mm_set1_epi32(offset + 1);
#endif

  /* shift selects the byte: 24 is the most significant, 0 the least */
  for (shift = 24; shift >= 0; shift -= 8) {
    bits = (unsigned char) (word >> shift);
    row = decode_leading_table[bits];
    nset = decode_length[bits];

#ifdef HAVE_AVX2
    positions = _mm256_sub_epi32(base, _mm256_load_si256((const __m256i *) row));
    _mm256_storeu_si256((__m256i *) out, positions);
#else
    positions = _mm_sub_epi32(base, _mm_load_si128((const __m128i *) row));
    _mm_storeu_si128((__m128i *) out, positions);
    if (nset > 4) {
      /* More than four set bits: the upper half of the row is needed too */
      positions = _mm_sub_epi32(base, _mm_load_si128((const __m128i *) (row + 4)));
      _mm_storeu_si128((__m128i *) (out + 4), positions);
    }
#endif

    /* Only the first nset entries just stored are kept */
    out += nset;
    nmismatches += nset;
    if (nmismatches > max_mismatches) {
      break;
    }

    /* Move on to the next less significant byte */
#ifdef HAVE_AVX2
    base = _mm256_sub_epi32(base, byte_step);
#else
    base = _mm_sub_epi32(base, byte_step);
#endif
  }

  return nmismatches;
}
#endif
