/*
 * Branch/Call/Jump (BCJ) filter decoders
 *
 * Authors: Lasse Collin <lasse.collin@tukaani.org>
 *          Igor Pavlov <http://7-zip.org/>
 *
 * This file has been put into the public domain.
 * You can do whatever you want with this file.
 */

#include "xz_private.h"

/*
 * The rest of the file is inside this ifdef. It makes things a little more
 * convenient when building without support for any BCJ filters.
 */
#ifdef XZ_DEC_BCJ

/*
 * State of one BCJ decoder. It is allocated by xz_dec_bcj_create() and
 * prepared for decoding by xz_dec_bcj_reset().
 */
struct xz_dec_bcj
{
    /*
     * Type of the BCJ filter being used. xz_dec_bcj_reset() stores the
     * Filter ID it was given here, so the values below double as the
     * accepted Filter IDs.
     */
    enum
    {
        BCJ_X86 = 4,      /* x86 or x86-64 */
        BCJ_POWERPC = 5,  /* Big endian only */
        BCJ_IA64 = 6,     /* Big or little endian */
        BCJ_ARM = 7,      /* Little endian only */
        BCJ_ARMTHUMB = 8, /* Little endian only */
        BCJ_SPARC = 9     /* Big or little endian */
    } type;

    /*
     * Return value of the next filter in the chain. We need to preserve
     * this information across calls, because we must not call the next
     * filter anymore once it has returned XZ_STREAM_END.
     */
    enum xz_ret ret;

    /*
     * True if we are operating in single-call mode. This is the only
     * field set by xz_dec_bcj_create().
     */
    bool single_call;

    /*
     * Absolute position relative to the beginning of the uncompressed
     * data (in a single .xz Block). We care only about the lowest 32
     * bits so this doesn't need to be uint64_t even with big files.
     * bcj_apply() advances this by the amount of data that got filtered.
     */
    uint32_t pos;

    /*
     * x86 filter state: the value of prev_mask that bcj_x86() saves at
     * the end of one call and reads back at the start of the next one.
     */
    uint32_t x86_prev_mask;

    /*
     * Temporary space to hold the variables from struct xz_buf while
     * xz_dec_bcj_run() makes the xz_buf output point to temp.buf.
     */
    uint8_t *out;
    size_t out_pos;
    size_t out_size;

    struct
    {
        /* Amount of already filtered data in the beginning of buf */
        size_t filtered;

        /* Total amount of data currently stored in buf */
        size_t size;

        /*
         * Buffer to hold a mix of filtered and unfiltered data. This
         * needs to be big enough to hold Alignment + 2 * Look-ahead:
         *
         * Type         Alignment   Look-ahead
         * x86              1           4
         * PowerPC          4           0
         * IA-64           16           0
         * ARM              4           0
         * ARM-Thumb        2           2
         * SPARC            4           0
         */
        uint8_t buf[16];
    } temp;
};

#ifdef XZ_DEC_X86
/*
 * Check the most significant byte of a memory address in an x86
 * instruction. Returns 1 when the byte is 0x00 or 0xFF and 0 for every
 * other value.
 */
static inline int bcj_x86_test_msbyte(uint8_t b)
{
    switch (b)
    {
    case 0x00:
    case 0xFF:
        return 1;

    default:
        return 0;
    }
}

/*
 * x86 BCJ decoder.
 *
 * Scans buf for bytes 0xE8 and 0xE9 (presumably the x86 CALL/JMP rel32
 * opcodes - confirm against the instruction set reference). When the most
 * significant byte of the 32-bit little endian operand that follows is
 * 0x00 or 0xFF, the operand is rewritten by subtracting
 * s->pos + i + 5 from it, where i is the offset of the opcode byte in buf.
 *
 * The last four bytes of buf are only read as look-ahead, so nothing is
 * processed when size <= 4. prev_mask is loaded from and saved back to
 * s->x86_prev_mask so that it survives between calls.
 *
 * Returns the number of bytes processed from the beginning of buf.
 */
static size_t bcj_x86(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    /*
     * Indexed by prev_mask (0-7). false means that the opcode byte at the
     * current position is skipped without testing any operand byte.
     */
    static const bool mask_to_allowed_status[8] = {true, true,  true,  false,
                                                   true, false, false, false};

    /*
     * Indexed by prev_mask (0-7). Selects which operand byte gets tested
     * with bcj_x86_test_msbyte(): used as a byte offset when deciding
     * whether to skip, and multiplied by 8 as a bit offset in the
     * conversion loop.
     */
    static const uint8_t mask_to_bit_num[8] = {0, 1, 2, 2, 3, 3, 3, 3};

    size_t i;

    /*
     * Offset of the most recent 0xE8/0xE9 byte; it temporarily holds the
     * distance from that byte to the current one. The initial (size_t)-1
     * makes i - prev_pos evaluate to i + 1 before the first match.
     */
    size_t prev_pos = (size_t) - 1;

    /*
     * History bits. A one is shifted in whenever a 0xE8/0xE9 byte is seen
     * but not converted; the bits are shifted by the distance between such
     * bytes and dropped when the bytes are more than 3 apart.
     */
    uint32_t prev_mask = s->x86_prev_mask;
    uint32_t src;
    uint32_t dest;
    uint32_t j;
    uint8_t b;

    if (size <= 4)
        return 0;

    /* Keep four bytes of look-ahead for the operand after buf[i]. */
    size -= 4;
    for (i = 0; i < size; ++i)
    {
        if ((buf[i] & 0xFE) != 0xE8)
            continue;

        /* From here on prev_pos is the distance to the previous match. */
        prev_pos = i - prev_pos;
        if (prev_pos > 3)
        {
            prev_mask = 0;
        }
        else
        {
            prev_mask = (prev_mask << (prev_pos - 1)) & 7;
            if (prev_mask != 0)
            {
                b = buf[i + 4 - mask_to_bit_num[prev_mask]];
                if (!mask_to_allowed_status[prev_mask] || bcj_x86_test_msbyte(b))
                {
                    /* Skip this byte but remember it in the history. */
                    prev_pos = i;
                    prev_mask = (prev_mask << 1) | 1;
                    continue;
                }
            }
        }

        prev_pos = i;

        if (bcj_x86_test_msbyte(buf[i + 4]))
        {
            src = get_unaligned_le32(buf + i + 1);
            while (true)
            {
                dest = src - (s->pos + (uint32_t)i + 5);
                if (prev_mask == 0)
                    break;

                /*
                 * With a non-zero prev_mask, test the byte of dest that
                 * prev_mask selects. While it is 0x00 or 0xFF, flip the
                 * low 32 - j bits of dest and subtract again.
                 */
                j = mask_to_bit_num[prev_mask] * 8;
                b = (uint8_t)(dest >> (24 - j));
                if (!bcj_x86_test_msbyte(b))
                    break;

                src = dest ^ (((uint32_t)1 << (32 - j)) - 1);
            }

            /*
             * Keep the low 25 bits and sign-extend from bit 24, which
             * makes the most significant byte 0x00 or 0xFF again.
             */
            dest &= 0x01FFFFFF;
            dest |= (uint32_t)0 - (dest & 0x01000000);
            put_unaligned_le32(dest, buf + i + 1);

            /* Skip the four operand bytes that were just written. */
            i += 4;
        }
        else
        {
            prev_mask = (prev_mask << 1) | 1;
        }
    }

    /* Save prev_mask adjusted by the distance from the last match to i. */
    prev_pos = i - prev_pos;
    s->x86_prev_mask = prev_pos > 3 ? 0 : prev_mask << (prev_pos - 1);
    return i;
}
#endif

#ifdef XZ_DEC_POWERPC
/*
 * PowerPC BCJ decoder.
 *
 * Walks buf in 4-byte big endian words. In every word that matches
 * (word & 0xFC000003) == 0x48000001, the field selected by 0x03FFFFFC is
 * replaced with itself minus (s->pos + offset of the word in buf).
 *
 * Returns the number of bytes processed, i.e. size rounded down to a
 * multiple of four.
 */
static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    size_t offset = 0;

    while (offset + 4 <= size)
    {
        uint8_t *const word = buf + offset;
        const uint32_t insn = get_unaligned_be32(word);

        if ((insn & 0xFC000003) == 0x48000001)
        {
            /* Only the bits under 0x03FFFFFC take part in the result. */
            const uint32_t target = (insn & 0x03FFFFFC) - (s->pos + (uint32_t)offset);

            put_unaligned_be32(0x48000001 | (target & 0x03FFFFFC), word);
        }

        offset += 4;
    }

    return offset;
}
#endif

#ifdef XZ_DEC_IA64
/*
 * IA-64 BCJ decoder.
 *
 * Walks buf in 128-bit (16-byte) instruction words. Each word holds three
 * 41-bit instruction slots that start at bit offsets 5, 46 and 87. The
 * low five bits of the first byte pick, via a lookup table, which slots
 * are examined. In an examined slot whose bits 37-40 equal 5 and whose
 * bits 9-11 are zero, the 21-bit address (20 bits at bit 13 plus bit 36),
 * scaled by 16, gets (s->pos + offset of the word in buf) subtracted.
 *
 * Returns the number of bytes processed, i.e. size rounded down to a
 * multiple of 16.
 */
static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    /* Bit n of an entry is set when instruction slot n is to be examined. */
    static const uint8_t slot_masks[32] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                           4, 4, 6, 6, 0, 0, 7, 7, 4, 4, 0, 0, 4, 4, 0, 0};

    /* Offset of the current 128-bit instruction word in buf */
    size_t word;

    for (word = 0; word + 16 <= size; word += 16)
    {
        const uint32_t wanted = slot_masks[buf[word] & 0x1F];
        uint32_t slot;

        for (slot = 0; slot < 3; ++slot)
        {
            /* Bitwise offset of this slot, split into byte and bit parts */
            const uint32_t first_bit = 5 + 41 * slot;
            const uint32_t shift = first_bit & 7;
            uint8_t *const field = buf + word + (first_bit >> 3);

            /* The six bytes containing the slot, and the slot aligned to bit 0 */
            uint64_t raw = 0;
            uint64_t insn;

            uint32_t addr;
            size_t k;

            if (((wanted >> slot) & 1) == 0)
                continue;

            for (k = 0; k < 6; ++k)
                raw |= (uint64_t)field[k] << (8 * k);

            insn = raw >> shift;

            if (((insn >> 37) & 0x0F) != 0x05 || ((insn >> 9) & 0x07) != 0)
                continue;

            /* Gather the address, convert it, and drop the scaling again. */
            addr = (uint32_t)((insn >> 13) & 0x0FFFFF);
            addr |= ((uint32_t)(insn >> 36) & 1) << 20;
            addr = ((addr << 4) - (s->pos + (uint32_t)word)) >> 4;

            /* Put the converted address back into the instruction. */
            insn &= ~((uint64_t)0x8FFFFF << 13);
            insn |= (uint64_t)(addr & 0x0FFFFF) << 13;
            insn |= (uint64_t)(addr & 0x100000) << (36 - 20);

            /* Keep the bits below the slot and replace everything above. */
            raw &= (1 << shift) - 1;
            raw |= insn << shift;

            for (k = 0; k < 6; ++k)
                field[k] = (uint8_t)(raw >> (8 * k));
        }
    }

    return word;
}
#endif

#ifdef XZ_DEC_ARM
/*
 * ARM BCJ decoder.
 *
 * Walks buf in 4-byte units. When the fourth byte of a unit is 0xEB, the
 * first three bytes are read as a 24-bit little endian value, scaled by
 * four, reduced by (s->pos + offset of the unit in buf + 8), scaled back
 * and stored in place.
 *
 * Returns the number of bytes processed, i.e. size rounded down to a
 * multiple of four.
 */
static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    size_t off;

    for (off = 0; off + 4 <= size; off += 4)
    {
        uint8_t *const insn = buf + off;
        uint32_t target;

        if (insn[3] != 0xEB)
            continue;

        target = (uint32_t)insn[0] | ((uint32_t)insn[1] << 8) | ((uint32_t)insn[2] << 16);
        target = ((target << 2) - (s->pos + (uint32_t)off + 8)) >> 2;

        insn[0] = (uint8_t)target;
        insn[1] = (uint8_t)(target >> 8);
        insn[2] = (uint8_t)(target >> 16);
    }

    return off;
}
#endif

#ifdef XZ_DEC_ARMTHUMB
/*
 * ARM-Thumb BCJ decoder.
 *
 * Looks at every 2-byte aligned offset that still has four bytes left.
 * A match needs (byte 1 & 0xF8) == 0xF0 and (byte 3 & 0xF8) == 0xF8. The
 * 22-bit value spread over the four bytes is scaled by two, reduced by
 * (s->pos + offset in buf + 4), scaled back and stored in place. After a
 * match the scan continues four bytes later, otherwise two bytes later.
 *
 * Returns the offset where the scan stopped, which is the number of
 * bytes processed.
 */
static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    size_t off = 0;

    while (off + 4 <= size)
    {
        uint8_t *const p = buf + off;
        uint32_t target;

        if ((p[1] & 0xF8) != 0xF0 || (p[3] & 0xF8) != 0xF8)
        {
            off += 2;
            continue;
        }

        target = (((uint32_t)p[1] & 0x07) << 19) | ((uint32_t)p[0] << 11) |
                 (((uint32_t)p[3] & 0x07) << 8) | (uint32_t)p[2];
        target = ((target << 1) - (s->pos + (uint32_t)off + 4)) >> 1;

        p[0] = (uint8_t)(target >> 11);
        p[1] = (uint8_t)(0xF0 | ((target >> 19) & 0x07));
        p[2] = (uint8_t)target;
        p[3] = (uint8_t)(0xF8 | ((target >> 8) & 0x07));

        /* Both halfwords of the matched pair have been consumed. */
        off += 4;
    }

    return off;
}
#endif

#ifdef XZ_DEC_SPARC
/*
 * SPARC BCJ decoder.
 *
 * Walks buf in 4-byte big endian words. A word is converted when its top
 * ten bits are 0x100 or 0x1FF: the word is scaled by four, reduced by
 * (s->pos + offset of the word in buf), scaled back, and then rebuilt
 * from its low 22 bits with bit 22 extended into the bits above and
 * 0x40000000 set.
 *
 * Returns the number of bytes processed, i.e. size rounded down to a
 * multiple of four.
 */
static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size)
{
    size_t off;

    for (off = 0; off + 4 <= size; off += 4)
    {
        uint8_t *const word = buf + off;
        uint32_t insn = get_unaligned_be32(word);
        const uint32_t top = insn >> 22;

        if (top != 0x100 && top != 0x1FF)
            continue;

        insn = ((insn << 2) - (s->pos + (uint32_t)off)) >> 2;
        insn = ((uint32_t)0x40000000 - (insn & 0x400000)) | 0x40000000 | (insn & 0x3FFFFF);

        put_unaligned_be32(insn, word);
    }

    return off;
}
#endif

/*
 * Run the BCJ filter selected by s->type on buf[*pos] ... buf[size - 1].
 * Both *pos and s->pos are advanced by the amount of data that the
 * filter processed.
 *
 * NOTE: A switch statement is used instead of function pointers. Function
 * pointers could be problematic in the kernel boot code, which must avoid
 * pointers to static data (at least on x86).
 */
static void bcj_apply(struct xz_dec_bcj *s, uint8_t *buf, size_t *pos, size_t size)
{
    uint8_t *const start = buf + *pos;
    const size_t avail = size - *pos;

    /* Stays zero if s->type matches none of the compiled-in filters. */
    size_t done = 0;

    switch (s->type)
    {
#ifdef XZ_DEC_X86
    case BCJ_X86:
        done = bcj_x86(s, start, avail);
        break;
#endif
#ifdef XZ_DEC_POWERPC
    case BCJ_POWERPC:
        done = bcj_powerpc(s, start, avail);
        break;
#endif
#ifdef XZ_DEC_IA64
    case BCJ_IA64:
        done = bcj_ia64(s, start, avail);
        break;
#endif
#ifdef XZ_DEC_ARM
    case BCJ_ARM:
        done = bcj_arm(s, start, avail);
        break;
#endif
#ifdef XZ_DEC_ARMTHUMB
    case BCJ_ARMTHUMB:
        done = bcj_armthumb(s, start, avail);
        break;
#endif
#ifdef XZ_DEC_SPARC
    case BCJ_SPARC:
        done = bcj_sparc(s, start, avail);
        break;
#endif
    default:
        /* Never reached but silence compiler warnings. */
        break;
    }

    *pos += done;
    s->pos += done;
}

/*
 * Copy as much of the already filtered data from the beginning of temp
 * as fits into the free space of the output buffer. Whatever stays behind
 * (a mixture of possibly filtered and unfiltered data) is moved to the
 * beginning of temp, and temp.filtered and temp.size shrink by the amount
 * that was copied.
 */
static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b)
{
    const size_t room = b->out_size - b->out_pos;
    const size_t pending = s->temp.filtered;
    const size_t n = pending < room ? pending : room;

    memcpy(b->out + b->out_pos, s->temp.buf, n);
    b->out_pos += n;

    s->temp.filtered -= n;
    s->temp.size -= n;

    /* Source and destination overlap inside temp.buf, hence memmove(). */
    memmove(s->temp.buf, s->temp.buf + n, s->temp.size);
}

/*
 * The BCJ filter functions are primitive in sense that they process the
 * data in chunks of 1-16 bytes. To hide this issue, this function does
 * some buffering.
 *
 * s:      BCJ decoder state; expected to have been set up with
 *         xz_dec_bcj_reset()
 * lzma2:  State of the next filter in the chain. Its output is the data
 *         that gets BCJ filtered here.
 * b:      Buffers of the caller. Filtered data is written to b->out.
 *
 * Returns XZ_OK while filtered data is still waiting in temp or more
 * data is needed, XZ_STREAM_END once the next filter has finished and
 * everything has been written out, and otherwise the return value of
 * xz_dec_lzma2_run() as is.
 */
XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, struct xz_dec_lzma2 *lzma2,
                                     struct xz_buf *b)
{
    /* Output position from which bcj_apply() starts filtering */
    size_t out_start;

    /*
     * Flush pending already filtered data to the output buffer. Return
     * immediately if we couldn't flush everything, or if the next
     * filter in the chain had already returned XZ_STREAM_END.
     */
    if (s->temp.filtered > 0)
    {
        bcj_flush(s, b);
        if (s->temp.filtered > 0)
            return XZ_OK;

        if (s->ret == XZ_STREAM_END)
            return XZ_STREAM_END;
    }

    /*
     * If we have more output space than what is currently pending in
     * temp, copy the unfiltered data from temp to the output buffer
     * and try to fill the output buffer by decoding more data from the
     * next filter in the chain. Apply the BCJ filter on the new data
     * in the output buffer. If everything cannot be filtered, copy it
     * to temp and rewind the output buffer position accordingly.
     *
     * This needs to be always run when temp.size == 0 to handle a special
     * case where the output buffer is full and the next filter has no
     * more output coming but hasn't returned XZ_STREAM_END yet.
     */
    if (s->temp.size < b->out_size - b->out_pos || s->temp.size == 0)
    {
        out_start = b->out_pos;
        memcpy(b->out + b->out_pos, s->temp.buf, s->temp.size);
        b->out_pos += s->temp.size;

        /*
         * Errors from the next filter are returned as is. In single-call
         * mode XZ_OK is returned as is too, without applying the BCJ
         * filter.
         */
        s->ret = xz_dec_lzma2_run(lzma2, b);
        if (s->ret != XZ_STREAM_END && (s->ret != XZ_OK || s->single_call))
            return s->ret;

        /* out_start is advanced past the data that got filtered. */
        bcj_apply(s, b->out, &out_start, b->out_pos);

        /*
         * As an exception, if the next filter returned XZ_STREAM_END,
         * we can do that too, since the last few bytes that remain
         * unfiltered are meant to remain unfiltered.
         */
        if (s->ret == XZ_STREAM_END)
            return XZ_STREAM_END;

        /* Move the unfiltered tail from the output buffer to temp. */
        s->temp.size = b->out_pos - out_start;
        b->out_pos -= s->temp.size;
        memcpy(s->temp.buf, b->out + b->out_pos, s->temp.size);

        /*
         * If there wasn't enough input to the next filter to fill
         * the output buffer with unfiltered data, there's no point
         * to try decoding more data to temp.
         */
        if (b->out_pos + s->temp.size < b->out_size)
            return XZ_OK;
    }

    /*
     * We have unfiltered data in temp. If the output buffer isn't full
     * yet, try to fill the temp buffer by decoding more data from the
     * next filter. Apply the BCJ filter on temp. Then we hopefully can
     * fill the actual output buffer by copying filtered data from temp.
     * A mix of filtered and unfiltered data may be left in temp; it will
     * be taken care on the next call to this function.
     */
    if (b->out_pos < b->out_size)
    {
        /* Make b->out{,_pos,_size} temporarily point to s->temp. */
        s->out = b->out;
        s->out_pos = b->out_pos;
        s->out_size = b->out_size;
        b->out = s->temp.buf;
        b->out_pos = s->temp.size;
        b->out_size = sizeof(s->temp.buf);

        s->ret = xz_dec_lzma2_run(lzma2, b);

        /* Restore the caller's output buffer before doing anything else. */
        s->temp.size = b->out_pos;
        b->out = s->out;
        b->out_pos = s->out_pos;
        b->out_size = s->out_size;

        if (s->ret != XZ_OK && s->ret != XZ_STREAM_END)
            return s->ret;

        bcj_apply(s, s->temp.buf, &s->temp.filtered, s->temp.size);

        /*
         * If the next filter returned XZ_STREAM_END, we mark that
         * everything is filtered, since the last unfiltered bytes
         * of the stream are meant to be left as is.
         */
        if (s->ret == XZ_STREAM_END)
            s->temp.filtered = s->temp.size;

        /*
         * Filtered data that doesn't fit into the output buffer is left
         * in temp and flushed in the beginning of the next call.
         */
        bcj_flush(s, b);
        if (s->temp.filtered > 0)
            return XZ_OK;
    }

    return s->ret;
}

/*
 * Allocate a BCJ decoder state with kmalloc() and record whether it is
 * going to be used in single-call mode. No other field is initialized
 * here; xz_dec_bcj_reset() sets up the decoding state.
 *
 * Returns the new state, or NULL if kmalloc() returned NULL.
 */
XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call)
{
    struct xz_dec_bcj *s;

    s = kmalloc(sizeof(*s), GFP_KERNEL);
    if (s == NULL)
        return NULL;

    s->single_call = single_call;
    return s;
}

/*
 * Tell whether id is the Filter ID of a BCJ filter for which support has
 * been compiled in.
 */
static bool bcj_id_is_supported(uint8_t id)
{
    switch (id)
    {
#ifdef XZ_DEC_X86
    case BCJ_X86:
#endif
#ifdef XZ_DEC_POWERPC
    case BCJ_POWERPC:
#endif
#ifdef XZ_DEC_IA64
    case BCJ_IA64:
#endif
#ifdef XZ_DEC_ARM
    case BCJ_ARM:
#endif
#ifdef XZ_DEC_ARMTHUMB
    case BCJ_ARMTHUMB:
#endif
#ifdef XZ_DEC_SPARC
    case BCJ_SPARC:
#endif
        return true;

    default:
        return false;
    }
}

/*
 * Prepare the decoder for filtering new data with the BCJ filter whose
 * Filter ID is id.
 *
 * Returns XZ_OPTIONS_ERROR, leaving *s untouched, if the Filter ID is
 * unsupported. Otherwise the filter type is stored, the position, the
 * x86 filter state and the temp buffer bookkeeping are cleared, and
 * XZ_OK is returned.
 */
XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id)
{
    /* Unsupported Filter ID */
    if (!bcj_id_is_supported(id))
        return XZ_OPTIONS_ERROR;

    s->type = id;
    s->ret = XZ_OK;

    /* Start from the beginning of the uncompressed data. */
    s->pos = 0;
    s->x86_prev_mask = 0;

    /* Nothing is buffered yet. */
    s->temp.size = 0;
    s->temp.filtered = 0;

    return XZ_OK;
}

#endif