jdk/src/solaris/native/sun/awt/medialib/mlib_v_ImageLookUpSIU8S32Func.c
author bae
Fri, 15 Oct 2010 10:42:39 +0400
changeset 6814 c6e347fb5b20
parent 5506 202f599c92aa
permissions -rw-r--r--
6725821: Compiler warnings in medialib code Reviewed-by: igor, prr

/*
 * Copyright (c) 1998, 2003, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */



#include "vis_proto.h"
#include "mlib_image.h"
#include "mlib_v_ImageLookUpFunc.h"

/***************************************************************/
static void mlib_v_ImageLookUpSI_U8_S32_2_SrcOff0_D1(const mlib_u8  *src,
                                                     mlib_s32       *dst,
                                                     mlib_s32       xsize,
                                                     const mlib_d64 *table);

static void mlib_v_ImageLookUpSI_U8_S32_2_DstNonAl_D1(const mlib_u8  *src,
                                                      mlib_s32       *dst,
                                                      mlib_s32       xsize,
                                                      const mlib_d64 *table);

static void mlib_v_ImageLookUpSI_U8_S32_2_SMALL(const mlib_u8  *src,
                                                mlib_s32       *dst,
                                                mlib_s32       xsize,
                                                const mlib_s32 **table);

static void mlib_v_ImageLookUpSI_U8_S32_3_SrcOff0_D1(const mlib_u8  *src,
                                                     mlib_s32       *dst,
                                                     mlib_s32       xsize,
                                                     const mlib_d64 *table);

static void mlib_v_ImageLookUpSI_U8_S32_3_DstNonAl_D1(const mlib_u8  *src,
                                                      mlib_s32       *dst,
                                                      mlib_s32       xsize,
                                                      const mlib_d64 *table);

static void mlib_v_ImageLookUpSI_U8_S32_3_SMALL(const mlib_u8  *src,
                                                mlib_s32       *dst,
                                                mlib_s32       xsize,
                                                const mlib_s32 **table);

static void mlib_v_ImageLookUpSI_U8_S32_4_SrcOff0_D1(const mlib_u8  *src,
                                                     mlib_s32       *dst,
                                                     mlib_s32       xsize,
                                                     const mlib_d64 *table);

static void mlib_v_ImageLookUpSI_U8_S32_4_DstNonAl_D1(const mlib_u8  *src,
                                                      mlib_s32       *dst,
                                                      mlib_s32       xsize,
                                                      const mlib_d64 *table);

static void mlib_v_ImageLookUpSI_U8_S32_4_SMALL(const mlib_u8  *src,
                                                mlib_s32       *dst,
                                                mlib_s32       xsize,
                                                const mlib_s32 **table);

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_2_SrcOff0_D1(const mlib_u8  *src,
                                              mlib_s32       *dst,
                                              mlib_s32       xsize,
                                              const mlib_d64 *table)
{
  mlib_u32 *sa;          /* aligned pointer to source data */
  mlib_u8  *sp;          /* pointer to source data */
  mlib_u32 s0;           /* source data */
  mlib_d64 *dp;          /* aligned pointer to destination */
  mlib_d64 acc0, acc1;   /* destination data */
  mlib_d64 acc2, acc3;   /* destination data */
  mlib_s32 i;            /* loop variable */
  mlib_u32 s00, s01, s02, s03;

  sa   = (mlib_u32*)src;
  dp   = (mlib_d64 *) dst;

  i = 0;

  if (xsize >= 4) {

    s0 = *sa++;
    s00 = (s0 >> 21) & 0x7F8;
    s01 = (s0 >> 13) & 0x7F8;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp += 4) {
      s02 = (s0 >> 5) & 0x7F8;
      s03 = (s0 << 3) & 0x7F8;
      acc0 = *(mlib_d64*)((mlib_u8*)table + s00);
      acc1 = *(mlib_d64*)((mlib_u8*)table + s01);
      acc2 = *(mlib_d64*)((mlib_u8*)table + s02);
      acc3 = *(mlib_d64*)((mlib_u8*)table + s03);
      s0 = *sa++;
      s00 = (s0 >> 21) & 0x7F8;
      s01 = (s0 >> 13) & 0x7F8;
      dp[0] = acc0;
      dp[1] = acc1;
      dp[2] = acc2;
      dp[3] = acc3;
    }

    s02 = (s0 >> 5) & 0x7F8;
    s03 = (s0 << 3) & 0x7F8;
    acc0 = *(mlib_d64*)((mlib_u8*)table + s00);
    acc1 = *(mlib_d64*)((mlib_u8*)table + s01);
    acc2 = *(mlib_d64*)((mlib_u8*)table + s02);
    acc3 = *(mlib_d64*)((mlib_u8*)table + s03);
    dp[0] = acc0;
    dp[1] = acc1;
    dp[2] = acc2;
    dp[3] = acc3;
    dp += 4;
    i += 4;
  }

  sp = (mlib_u8*)sa;

  if ( i <= xsize - 2) {
    *dp++ = table[sp[0]];
    *dp++ = table[sp[1]];
    i+=2; sp += 2;
  }

  if ( i < xsize) *dp++ = table[sp[0]];
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_2_DstNonAl_D1(const mlib_u8  *src,
                                               mlib_s32       *dst,
                                               mlib_s32       xsize,
                                               const mlib_d64 *table)
{
  mlib_u32 *sa;              /* aligned pointer to source data */
  mlib_u8  *sp;              /* pointer to source data */
  mlib_u32 s0;               /* source data */
  mlib_s32 *dl;              /* pointer to start of destination */
  mlib_d64 *dp;              /* aligned pointer to destination */
  mlib_d64 acc0, acc1;       /* destination data */
  mlib_d64 acc2, acc3, acc4; /* destination data */
  mlib_s32 i;                /* loop variable */
  mlib_u32 s00, s01, s02, s03;

  sa = (mlib_u32*)src;
  dl = dst;
  dp   = (mlib_d64 *) ((mlib_addr) dl & (~7)) + 1;
  vis_alignaddr(dp, 4);

  s0 = *sa++;
  s00 = (s0 >> 21) & 0x7F8;
  acc0 = *(mlib_d64*)((mlib_u8*)table + s00);
  *(mlib_f32*)dl = vis_read_hi(acc0);
  xsize--;
  sp = (mlib_u8*)sa - 3;

  if (xsize >= 3) {
    s01 = (s0 >> 13) & 0x7F8;
    s02 = (s0 >> 5) & 0x7F8;
    s03 = (s0 << 3) & 0x7F8;
    acc1 = *(mlib_d64*)((mlib_u8*)table + s01);
    acc2 = *(mlib_d64*)((mlib_u8*)table + s02);
    acc3 = *(mlib_d64*)((mlib_u8*)table + s03);
    dp[0] = vis_faligndata(acc0, acc1);
    dp[1] = vis_faligndata(acc1, acc2);
    dp[2] = vis_faligndata(acc2, acc3);
    acc0 = acc3; dp += 3; xsize -= 3;
    sp = (mlib_u8*)sa;
  }

  i = 0;

  if (xsize >= 4) {

    s0 = *sa++;
    s00 = (s0 >> 21) & 0x7F8;
    s01 = (s0 >> 13) & 0x7F8;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp += 4) {
      s02 = (s0 >> 5) & 0x7F8;
      s03 = (s0 << 3) & 0x7F8;
      acc1 = *(mlib_d64*)((mlib_u8*)table + s00);
      acc2 = *(mlib_d64*)((mlib_u8*)table + s01);
      acc3 = *(mlib_d64*)((mlib_u8*)table + s02);
      acc4 = *(mlib_d64*)((mlib_u8*)table + s03);
      s0 = *sa++;
      s00 = (s0 >> 21) & 0x7F8;
      s01 = (s0 >> 13) & 0x7F8;
      dp[0] = vis_faligndata(acc0, acc1);
      dp[1] = vis_faligndata(acc1, acc2);
      dp[2] = vis_faligndata(acc2, acc3);
      dp[3] = vis_faligndata(acc3, acc4);
      acc0 = acc4;
    }

    s02 = (s0 >> 5) & 0x7F8;
    s03 = (s0 << 3) & 0x7F8;
    acc1 = *(mlib_d64*)((mlib_u8*)table + s00);
    acc2 = *(mlib_d64*)((mlib_u8*)table + s01);
    acc3 = *(mlib_d64*)((mlib_u8*)table + s02);
    acc4 = *(mlib_d64*)((mlib_u8*)table + s03);
    dp[0] = vis_faligndata(acc0, acc1);
    dp[1] = vis_faligndata(acc1, acc2);
    dp[2] = vis_faligndata(acc2, acc3);
    dp[3] = vis_faligndata(acc3, acc4);
    acc0 = acc4;
    dp += 4;
    i += 4;
    sp = (mlib_u8*)sa;
  }

  if ( i <= xsize - 2) {
    acc1 = table[sp[0]];
    acc2 = table[sp[1]];
    *dp++ = vis_faligndata(acc0, acc1);
    *dp++ = vis_faligndata(acc1, acc2);
    i+=2; sp += 2;
    acc0 = acc2;
  }

  if ( i < xsize) {
    acc1 = table[sp[0]];
    *dp++ = vis_faligndata(acc0, acc1);
    acc0 = acc1;
  }

  *(mlib_f32*) dp = vis_read_lo(acc0);
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_2_SMALL(const mlib_u8  *src,
                                         mlib_s32       *dst,
                                         mlib_s32       xsize,
                                         const mlib_s32 **table)
{
  mlib_u32 *sa;          /* aligned pointer to source data */
  mlib_u8  *sp;          /* pointer to source data */
  mlib_u32 s0;           /* source data */
  mlib_f32 *dp;          /* aligned pointer to destination */
  mlib_f32 acc0, acc1;   /* destination data */
  mlib_f32 acc2, acc3;   /* destination data */
  mlib_f32 acc4, acc5;   /* destination data */
  mlib_f32 acc6, acc7;   /* destination data */
  mlib_f32 *table0 = (mlib_f32*)table[0];
  mlib_f32 *table1 = (mlib_f32*)table[1];
  mlib_s32 i;            /* loop variable */
  mlib_u32 s00, s01, s02, s03;

  sa   = (mlib_u32*)src;
  dp   = (mlib_f32*)dst;

  i = 0;

  if (xsize >= 4) {

    s0 = *sa++;
    s00 = (s0 >> 22) & 0x3FC;
    s01 = (s0 >> 14) & 0x3FC;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp += 8) {
      s02 = (s0 >> 6) & 0x3FC;
      s03 = (s0 << 2) & 0x3FC;
      acc0 = *(mlib_f32*)((mlib_u8*)table0 + s00);
      acc1 = *(mlib_f32*)((mlib_u8*)table1 + s00);
      acc2 = *(mlib_f32*)((mlib_u8*)table0 + s01);
      acc3 = *(mlib_f32*)((mlib_u8*)table1 + s01);
      acc4 = *(mlib_f32*)((mlib_u8*)table0 + s02);
      acc5 = *(mlib_f32*)((mlib_u8*)table1 + s02);
      acc6 = *(mlib_f32*)((mlib_u8*)table0 + s03);
      acc7 = *(mlib_f32*)((mlib_u8*)table1 + s03);
      s0 = *sa++;
      s00 = (s0 >> 22) & 0x3FC;
      s01 = (s0 >> 14) & 0x3FC;
      dp[0] = acc0;
      dp[1] = acc1;
      dp[2] = acc2;
      dp[3] = acc3;
      dp[4] = acc4;
      dp[5] = acc5;
      dp[6] = acc6;
      dp[7] = acc7;
    }

    s02 = (s0 >> 6) & 0x3FC;
    s03 = (s0 << 2) & 0x3FC;
    acc0 = *(mlib_f32*)((mlib_u8*)table0 + s00);
    acc1 = *(mlib_f32*)((mlib_u8*)table1 + s00);
    acc2 = *(mlib_f32*)((mlib_u8*)table0 + s01);
    acc3 = *(mlib_f32*)((mlib_u8*)table1 + s01);
    acc4 = *(mlib_f32*)((mlib_u8*)table0 + s02);
    acc5 = *(mlib_f32*)((mlib_u8*)table1 + s02);
    acc6 = *(mlib_f32*)((mlib_u8*)table0 + s03);
    acc7 = *(mlib_f32*)((mlib_u8*)table1 + s03);
    dp[0] = acc0;
    dp[1] = acc1;
    dp[2] = acc2;
    dp[3] = acc3;
    dp[4] = acc4;
    dp[5] = acc5;
    dp[6] = acc6;
    dp[7] = acc7;
    dp += 8;
    i += 4;
  }

  sp = (mlib_u8*)sa;

  if ( i < xsize ) {
    *dp++ = table0[sp[0]];
    *dp++ = table1[sp[0]];
    i++; sp++;
  }

  if ( i < xsize ) {
    *dp++ = table0[sp[0]];
    *dp++ = table1[sp[0]];
    i++; sp++;
  }

  if ( i < xsize ) {
    *dp++ = table0[sp[0]];
    *dp++ = table1[sp[0]];
  }
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_2(const mlib_u8  *src,
                                   mlib_s32       slb,
                                   mlib_s32       *dst,
                                   mlib_s32       dlb,
                                   mlib_s32       xsize,
                                   mlib_s32       ysize,
                                   const mlib_s32 **table)
{
  if ((xsize * ysize) < 600) {
    mlib_u8  *sl;
    mlib_s32 *dl;
    mlib_s32 j, i;
    const mlib_s32 *tab0 = table[0];
    const mlib_s32 *tab1 = table[1];

    sl = (void *)src;
    dl = dst;

    /* row loop */
    for (j = 0; j < ysize; j ++) {
      mlib_u8  *sp = sl;
      mlib_s32 *dp = dl;
      mlib_s32 off, size = xsize;

      off = (mlib_s32)((4 - ((mlib_addr)sp & 3)) & 3);

      off = (off < size) ? off : size;

      for (i = 0; i < off; i++) {
        *dp++ = tab0[sp[0]];
        *dp++ = tab1[sp[0]];
        size--; sp++;
      }

      if (size > 0) {
        mlib_v_ImageLookUpSI_U8_S32_2_SMALL(sp, (mlib_s32*)dp, size, table);
      }

      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
      dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
    }

  } else {
    mlib_u8  *sl;
    mlib_s32 *dl;
    mlib_d64 dtab[256];
    mlib_u32 *tab;
    mlib_u32 *tab0 = (mlib_u32*)table[0];
    mlib_u32 *tab1 = (mlib_u32*)table[1];
    mlib_s32 i, j;
    mlib_u32 s0, s1;

    tab = (mlib_u32*)dtab;
    s0 = tab0[0];
    s1 = tab1[0];
    for (i = 0; i < 255; i++) {
      tab[2*i] = s0;
      tab[2*i+1] = s1;
      s0 = tab0[i+1];
      s1 = tab1[i+1];
    }

    tab[510] = s0;
    tab[511] = s1;

    sl = (void *)src;
    dl = dst;

    /* row loop */
    for (j = 0; j < ysize; j ++) {
      mlib_u8  *sp = sl;
      mlib_u32 *dp = (mlib_u32*)dl;
      mlib_s32 off, size = xsize;

      off = (mlib_s32)((4 - ((mlib_addr)sp & 3)) & 3);

      off = (off < size) ? off : size;

#pragma pipeloop(0)
      for (i = 0; i < off; i++) {
        dp[0] = tab0[sp[0]];
        dp[1] = tab1[sp[0]];
        dp += 2; sp++;
      }

      size -= off;

      if (size > 0) {
        if (((mlib_addr)dp & 7) == 0) {
          mlib_v_ImageLookUpSI_U8_S32_2_SrcOff0_D1(sp, (mlib_s32*)dp, size, dtab);
        } else {
          mlib_v_ImageLookUpSI_U8_S32_2_DstNonAl_D1(sp, (mlib_s32*)dp, size, dtab);
        }
      }

      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
      dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
    }
  }
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_3_SrcOff0_D1(const mlib_u8  *src,
                                              mlib_s32       *dst,
                                              mlib_s32       xsize,
                                              const mlib_d64 *table)
{
  mlib_u8  *sp;              /* pointer to source data */
  mlib_u32 *sa;              /* aligned pointer to source data */
  mlib_u32 s0;               /* source data */
  mlib_s32 *dl;              /* pointer to start of destination */
  mlib_d64 *dp;              /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3;   /* destination data */
  mlib_d64 t4, t5, t6, t7;   /* destination data */
  mlib_s32 i;                /* loop variable */
  mlib_s32 *ptr;
  mlib_u32 s00, s01, s02, s03;

  dl  = dst;
  sp  = (void *)src;
  dp  = (mlib_d64 *) dl;
  sa  = (mlib_u32*)sp;

  vis_alignaddr((void *) 0, 4);

  i = 0;

  if (xsize >= 4) {

    s0 = *sa++;
    s00 = (s0 >> 20) & 0xFF0;
    s01 = (s0 >> 12) & 0xFF0;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp+=6) {
      s02 = (s0 >> 4) & 0xFF0;
      s03 = (s0 << 4) & 0xFF0;
      t0 = *(mlib_d64*)((mlib_u8*)table + s00);
      t1 = *(mlib_d64*)((mlib_u8*)table + s00 + 8);
      t2 = *(mlib_d64*)((mlib_u8*)table + s01);
      t3 = *(mlib_d64*)((mlib_u8*)table + s01 + 8);
      t4 = *(mlib_d64*)((mlib_u8*)table + s02);
      t5 = *(mlib_d64*)((mlib_u8*)table + s02 + 8);
      t6 = *(mlib_d64*)((mlib_u8*)table + s03);
      t7 = *(mlib_d64*)((mlib_u8*)table + s03 + 8);
      t1 = vis_faligndata(t1, t1);
      t1 = vis_faligndata(t1, t2);
      t2 = vis_faligndata(t2, t3);
      t5 = vis_faligndata(t5, t5);
      t5 = vis_faligndata(t5, t6);
      t6 = vis_faligndata(t6, t7);
      s0 = *sa++;
      s00 = (s0 >> 20) & 0xFF0;
      s01 = (s0 >> 12) & 0xFF0;
      dp[0] = t0;
      dp[1] = t1;
      dp[2] = t2;
      dp[3] = t4;
      dp[4] = t5;
      dp[5] = t6;
    }

    s02 = (s0 >> 4) & 0xFF0;
    s03 = (s0 << 4) & 0xFF0;
    t0 = *(mlib_d64*)((mlib_u8*)table + s00);
    t1 = *(mlib_d64*)((mlib_u8*)table + s00 + 8);
    t2 = *(mlib_d64*)((mlib_u8*)table + s01);
    t3 = *(mlib_d64*)((mlib_u8*)table + s01 + 8);
    t4 = *(mlib_d64*)((mlib_u8*)table + s02);
    t5 = *(mlib_d64*)((mlib_u8*)table + s02 + 8);
    t6 = *(mlib_d64*)((mlib_u8*)table + s03);
    t7 = *(mlib_d64*)((mlib_u8*)table + s03 + 8);
    t1 = vis_faligndata(t1, t1);
    t1 = vis_faligndata(t1, t2);
    t2 = vis_faligndata(t2, t3);
    t5 = vis_faligndata(t5, t5);
    t5 = vis_faligndata(t5, t6);
    t6 = vis_faligndata(t6, t7);
    dp[0] = t0;
    dp[1] = t1;
    dp[2] = t2;
    dp[3] = t4;
    dp[4] = t5;
    dp[5] = t6;
    i += 4; dp += 6;
  }

  dl = (mlib_s32*)dp;
  sp = (mlib_u8*)sa;

#pragma pipeloop(0)
  for (; i < xsize; i++) {
    ptr = (mlib_s32*)(table + (sp[0] << 1));
    dl[0] = ptr[0];
    dl[1] = ptr[1];
    dl[2] = ptr[2];
    dl += 3; sp ++;
  }
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_3_DstNonAl_D1(const mlib_u8  *src,
                                               mlib_s32       *dst,
                                               mlib_s32       xsize,
                                               const mlib_d64 *table)
{
  mlib_u8  *sp;              /* pointer to source data */
  mlib_u32 *sa;              /* aligned pointer to source data */
  mlib_u32 s0;               /* source data */
  mlib_s32 *dl;              /* pointer to start of destination */
  mlib_d64 *dp;              /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3;   /* destination data */
  mlib_d64 t4, t5, t6, t7;   /* destination data */
  mlib_s32 i;                /* loop variable */
  mlib_s32 *ptr;
  mlib_u32 s00, s01, s02, s03;

  dl  = dst;
  sp  = (void *)src;
  dp   = (mlib_d64 *) ((mlib_addr) dl & (~7));
  sa  = (mlib_u32*)sp;

  vis_alignaddr((void *) 0, 4);

  i = 0;

  if (xsize >= 4) {

    s0 = *sa++;
    s00 = (s0 >> 20) & 0xFF0;
    s01 = (s0 >> 12) & 0xFF0;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp+=6) {
      s02 = (s0 >> 4) & 0xFF0;
      s03 = (s0 << 4) & 0xFF0;
      t0 = *(mlib_d64*)((mlib_u8*)table + s00);
      t1 = *(mlib_d64*)((mlib_u8*)table + s00 + 8);
      t2 = *(mlib_d64*)((mlib_u8*)table + s01);
      t3 = *(mlib_d64*)((mlib_u8*)table + s01 + 8);
      t4 = *(mlib_d64*)((mlib_u8*)table + s02);
      t5 = *(mlib_d64*)((mlib_u8*)table + s02 + 8);
      t6 = *(mlib_d64*)((mlib_u8*)table + s03);
      t7 = *(mlib_d64*)((mlib_u8*)table + s03 + 8);
      t1 = vis_faligndata(t0, t1);
      t3 = vis_faligndata(t3, t3);
      t3 = vis_faligndata(t3, t4);
      t4 = vis_faligndata(t4, t5);
      s0 = *sa++;
      s00 = (s0 >> 20) & 0xFF0;
      s01 = (s0 >> 12) & 0xFF0;
      *(mlib_f32*)((mlib_f32*)dp + 1) = vis_read_hi(t0);
      dp[1] = t1;
      dp[2] = t2;
      dp[3] = t3;
      dp[4] = t4;
      dp[5] = t6;
      *(mlib_f32*)((mlib_f32*)dp + 12) = vis_read_hi(t7);
    }

    s02 = (s0 >> 4) & 0xFF0;
    s03 = (s0 << 4) & 0xFF0;
    t0 = *(mlib_d64*)((mlib_u8*)table + s00);
    t1 = *(mlib_d64*)((mlib_u8*)table + s00 + 8);
    t2 = *(mlib_d64*)((mlib_u8*)table + s01);
    t3 = *(mlib_d64*)((mlib_u8*)table + s01 + 8);
    t4 = *(mlib_d64*)((mlib_u8*)table + s02);
    t5 = *(mlib_d64*)((mlib_u8*)table + s02 + 8);
    t6 = *(mlib_d64*)((mlib_u8*)table + s03);
    t7 = *(mlib_d64*)((mlib_u8*)table + s03 + 8);
    t1 = vis_faligndata(t0, t1);
    t3 = vis_faligndata(t3, t3);
    t3 = vis_faligndata(t3, t4);
    t4 = vis_faligndata(t4, t5);
    *(mlib_f32*)((mlib_f32*)dp + 1) = vis_read_hi(t0);
    dp[1] = t1;
    dp[2] = t2;
    dp[3] = t3;
    dp[4] = t4;
    dp[5] = t6;
    *(mlib_f32*)((mlib_f32*)dp + 12) = vis_read_hi(t7);
    i += 4; dp += 6;
  }

  dl = (mlib_s32*)dp + 1;
  sp = (mlib_u8*)sa;

#pragma pipeloop(0)
  for (; i < xsize; i++) {
    ptr = (mlib_s32*)(table + (sp[0] << 1));
    dl[0] = ptr[0];
    dl[1] = ptr[1];
    dl[2] = ptr[2];
    dl += 3; sp ++;
  }
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_3_SMALL(const mlib_u8  *src,
                                         mlib_s32       *dst,
                                         mlib_s32       xsize,
                                         const mlib_s32 **table)
{
  mlib_u32 *sa;          /* aligned pointer to source data */
  mlib_u8  *sp;          /* pointer to source data */
  mlib_u32 s0;           /* source data */
  mlib_f32 *dp;          /* aligned pointer to destination */
  mlib_f32 acc0, acc1;   /* destination data */
  mlib_f32 acc2, acc3;   /* destination data */
  mlib_f32 acc4, acc5;   /* destination data */
  mlib_f32 acc6, acc7;   /* destination data */
  mlib_f32 acc8, acc9;   /* destination data */
  mlib_f32 acc10, acc11; /* destination data */
  mlib_f32 *table0 = (mlib_f32*)table[0];
  mlib_f32 *table1 = (mlib_f32*)table[1];
  mlib_f32 *table2 = (mlib_f32*)table[2];
  mlib_s32 i;            /* loop variable */
  mlib_u32 s00, s01, s02, s03;

  sa   = (mlib_u32*)src;
  dp   = (mlib_f32*)dst;

  i = 0;

  if (xsize >= 4) {

    s0 = *sa++;
    s00 = (s0 >> 22) & 0x3FC;
    s01 = (s0 >> 14) & 0x3FC;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp += 12) {
      s02 = (s0 >> 6) & 0x3FC;
      s03 = (s0 << 2) & 0x3FC;
      acc0 = *(mlib_f32*)((mlib_u8*)table0 + s00);
      acc1 = *(mlib_f32*)((mlib_u8*)table1 + s00);
      acc2 = *(mlib_f32*)((mlib_u8*)table2 + s00);
      acc3 = *(mlib_f32*)((mlib_u8*)table0 + s01);
      acc4 = *(mlib_f32*)((mlib_u8*)table1 + s01);
      acc5 = *(mlib_f32*)((mlib_u8*)table2 + s01);
      acc6 = *(mlib_f32*)((mlib_u8*)table0 + s02);
      acc7 = *(mlib_f32*)((mlib_u8*)table1 + s02);
      acc8 = *(mlib_f32*)((mlib_u8*)table2 + s02);
      acc9 = *(mlib_f32*)((mlib_u8*)table0 + s03);
      acc10 = *(mlib_f32*)((mlib_u8*)table1 + s03);
      acc11 = *(mlib_f32*)((mlib_u8*)table2 + s03);
      s0 = *sa++;
      s00 = (s0 >> 22) & 0x3FC;
      s01 = (s0 >> 14) & 0x3FC;
      dp[0] = acc0;
      dp[1] = acc1;
      dp[2] = acc2;
      dp[3] = acc3;
      dp[4] = acc4;
      dp[5] = acc5;
      dp[6] = acc6;
      dp[7] = acc7;
      dp[8] = acc8;
      dp[9] = acc9;
      dp[10] = acc10;
      dp[11] = acc11;
    }

    s02 = (s0 >> 6) & 0x3FC;
    s03 = (s0 << 2) & 0x3FC;
    acc0 = *(mlib_f32*)((mlib_u8*)table0 + s00);
    acc1 = *(mlib_f32*)((mlib_u8*)table1 + s00);
    acc2 = *(mlib_f32*)((mlib_u8*)table2 + s00);
    acc3 = *(mlib_f32*)((mlib_u8*)table0 + s01);
    acc4 = *(mlib_f32*)((mlib_u8*)table1 + s01);
    acc5 = *(mlib_f32*)((mlib_u8*)table2 + s01);
    acc6 = *(mlib_f32*)((mlib_u8*)table0 + s02);
    acc7 = *(mlib_f32*)((mlib_u8*)table1 + s02);
    acc8 = *(mlib_f32*)((mlib_u8*)table2 + s02);
    acc9 = *(mlib_f32*)((mlib_u8*)table0 + s03);
    acc10 = *(mlib_f32*)((mlib_u8*)table1 + s03);
    acc11 = *(mlib_f32*)((mlib_u8*)table2 + s03);
    dp[0] = acc0;
    dp[1] = acc1;
    dp[2] = acc2;
    dp[3] = acc3;
    dp[4] = acc4;
    dp[5] = acc5;
    dp[6] = acc6;
    dp[7] = acc7;
    dp[8] = acc8;
    dp[9] = acc9;
    dp[10] = acc10;
    dp[11] = acc11;
    dp += 12;
    i += 4;
  }

  sp = (mlib_u8*)sa;

  if ( i < xsize ) {
    *dp++ = table0[sp[0]];
    *dp++ = table1[sp[0]];
    *dp++ = table2[sp[0]];
    i++; sp++;
  }

  if ( i < xsize ) {
    *dp++ = table0[sp[0]];
    *dp++ = table1[sp[0]];
    *dp++ = table2[sp[0]];
    i++; sp++;
  }

  if ( i < xsize ) {
    *dp++ = table0[sp[0]];
    *dp++ = table1[sp[0]];
    *dp++ = table2[sp[0]];
  }
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_3(const mlib_u8  *src,
                                   mlib_s32       slb,
                                   mlib_s32       *dst,
                                   mlib_s32       dlb,
                                   mlib_s32       xsize,
                                   mlib_s32       ysize,
                                   const mlib_s32 **table)
{
  if ((xsize * ysize) < 600) {
    mlib_u8  *sl;
    mlib_s32 *dl;
    mlib_s32 j, i;
    const mlib_s32 *tab0 = table[0];
    const mlib_s32 *tab1 = table[1];
    const mlib_s32 *tab2 = table[2];

    sl = (void *)src;
    dl = dst;

    /* row loop */
    for (j = 0; j < ysize; j ++) {
      mlib_u8  *sp = sl;
      mlib_s32 *dp = dl;
      mlib_s32 off, size = xsize;

      off = (mlib_s32)((4 - ((mlib_addr)sp & 3)) & 3);

      off = (off < size) ? off : size;

      for (i = 0; i < off; i++) {
        *dp++ = tab0[sp[0]];
        *dp++ = tab1[sp[0]];
        *dp++ = tab2[sp[0]];
        size--; sp++;
      }

      if (size > 0) {
        mlib_v_ImageLookUpSI_U8_S32_3_SMALL(sp, (mlib_s32*)dp, size, table);
      }

      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
      dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
    }

  } else {
    mlib_u8  *sl;
    mlib_s32 *dl;
    mlib_d64 dtab[512];
    mlib_u32 *tab;
    mlib_u32 *tab0 = (mlib_u32*)table[0];
    mlib_u32 *tab1 = (mlib_u32*)table[1];
    mlib_u32 *tab2 = (mlib_u32*)table[2];
    mlib_s32 i, j;
    mlib_u32 s0, s1, s2;

    tab = (mlib_u32*)dtab;
    s0 = tab0[0];
    s1 = tab1[0];
    s2 = tab2[0];
    for (i = 0; i < 255; i++) {
      tab[4*i] = s0;
      tab[4*i+1] = s1;
      tab[4*i+2] = s2;
      s0 = tab0[i+1];
      s1 = tab1[i+1];
      s2 = tab2[i+1];
    }

    tab[1020] = s0;
    tab[1021] = s1;
    tab[1022] = s2;

    sl = (void *)src;
    dl = dst;

    /* row loop */
    for (j = 0; j < ysize; j ++) {
      mlib_u8  *sp = sl;
      mlib_u32 *dp = (mlib_u32*)dl;
      mlib_s32 off, size = xsize;

      off = (mlib_s32)((4 - ((mlib_addr)sp & 3)) & 3);

      off = (off < size) ? off : size;

#pragma pipeloop(0)
      for (i = 0; i < off; i++) {
        dp[0] = tab0[sp[0]];
        dp[1] = tab1[sp[0]];
        dp[2] = tab2[sp[0]];
        dp += 3; sp++;
      }

      size -= off;

      if (size > 0) {
        if (((mlib_addr)dp & 7) == 0) {
          mlib_v_ImageLookUpSI_U8_S32_3_SrcOff0_D1(sp, (mlib_s32*)dp, size, dtab);
        } else {
          mlib_v_ImageLookUpSI_U8_S32_3_DstNonAl_D1(sp, (mlib_s32*)dp, size, dtab);
        }
      }

      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
      dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
    }
  }
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_4_SrcOff0_D1(const mlib_u8  *src,
                                              mlib_s32       *dst,
                                              mlib_s32       xsize,
                                              const mlib_d64 *table)
{
  mlib_u32 *sa;            /* aligned pointer to source data */
  mlib_u8  *sp;            /* pointer to source data */
  mlib_u32 s0;             /* source data */
  mlib_d64 *dp;            /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3; /* destination data */
  mlib_d64 t4, t5, t6, t7; /* destination data */
  mlib_s32 i;              /* loop variable */
  mlib_u32 s00, s01, s02, s03;

  sa   = (mlib_u32*)src;
  dp   = (mlib_d64 *) dst;

  i = 0;

  if (xsize >= 4) {

    s0 = *sa++;
    s00 = (s0 >> 20) & 0xFF0;
    s01 = (s0 >> 12) & 0xFF0;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp+=8) {
      s02 = (s0 >> 4) & 0xFF0;
      s03 = (s0 << 4) & 0xFF0;
      t0 = *(mlib_d64*)((mlib_u8*)table + s00);
      t1 = *(mlib_d64*)((mlib_u8*)table + s00 + 8);
      t2 = *(mlib_d64*)((mlib_u8*)table + s01);
      t3 = *(mlib_d64*)((mlib_u8*)table + s01 + 8);
      t4 = *(mlib_d64*)((mlib_u8*)table + s02);
      t5 = *(mlib_d64*)((mlib_u8*)table + s02 + 8);
      t6 = *(mlib_d64*)((mlib_u8*)table + s03);
      t7 = *(mlib_d64*)((mlib_u8*)table + s03 + 8);
      s0 = *sa++;
      s00 = (s0 >> 20) & 0xFF0;
      s01 = (s0 >> 12) & 0xFF0;
      dp[0] = t0;
      dp[1] = t1;
      dp[2] = t2;
      dp[3] = t3;
      dp[4] = t4;
      dp[5] = t5;
      dp[6] = t6;
      dp[7] = t7;
    }

    s02 = (s0 >> 4) & 0xFF0;
    s03 = (s0 << 4) & 0xFF0;
    t0 = *(mlib_d64*)((mlib_u8*)table + s00);
    t1 = *(mlib_d64*)((mlib_u8*)table + s00 + 8);
    t2 = *(mlib_d64*)((mlib_u8*)table + s01);
    t3 = *(mlib_d64*)((mlib_u8*)table + s01 + 8);
    t4 = *(mlib_d64*)((mlib_u8*)table + s02);
    t5 = *(mlib_d64*)((mlib_u8*)table + s02 + 8);
    t6 = *(mlib_d64*)((mlib_u8*)table + s03);
    t7 = *(mlib_d64*)((mlib_u8*)table + s03 + 8);
    dp[0] = t0;
    dp[1] = t1;
    dp[2] = t2;
    dp[3] = t3;
    dp[4] = t4;
    dp[5] = t5;
    dp[6] = t6;
    dp[7] = t7;
    dp += 8;
    i += 4;
  }

  sp = (mlib_u8*)sa;

  if ( i < xsize ) {
    *dp++ = table[2*sp[0]];
    *dp++ = table[2*sp[0] + 1];
    i++; sp++;
  }

  if ( i < xsize ) {
    *dp++ = table[2*sp[0]];
    *dp++ = table[2*sp[0] + 1];
    i++; sp++;
  }

  if ( i < xsize ) {
    *dp++ = table[2*sp[0]];
    *dp++ = table[2*sp[0] + 1];
  }
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_4_DstNonAl_D1(const mlib_u8  *src,
                                               mlib_s32       *dst,
                                               mlib_s32       xsize,
                                               const mlib_d64 *table)
{
  mlib_u32 *sa;                /* aligned pointer to source data */
  mlib_u8  *sp;                /* pointer to source data */
  mlib_u32 s0;                 /* source data */
  mlib_s32 *dl;                /* pointer to start of destination */
  mlib_d64 *dp;                /* aligned pointer to destination */
  mlib_d64 t0, t1, t2, t3;     /* destination data */
  mlib_d64 t4, t5, t6, t7, t8; /* destination data */
  mlib_s32 i;                  /* loop variable */
  mlib_u32 s00, s01, s02, s03;

  sa = (mlib_u32*)src;
  dl = dst;
  dp   = (mlib_d64 *) ((mlib_addr) dl & (~7)) + 1;
  vis_alignaddr(dp, 4);
  s0 = *sa++;
  s00 = (s0 >> 20) & 0xFF0;
  t0 = *(mlib_d64*)((mlib_u8*)table + s00);
  t1 = *(mlib_d64*)((mlib_u8*)table + s00 + 8);
  *(mlib_f32*)dl = vis_read_hi(t0);
  dp[0] = vis_faligndata(t0, t1);
  t0 = t1;
  xsize--; dp++;
  sp = (mlib_u8*)sa - 3;

  if (xsize >= 3) {
    s01 = (s0 >> 12) & 0xFF0;
    s02 = (s0 >> 4) & 0xFF0;
    s03 = (s0 << 4) & 0xFF0;
    t1 = *(mlib_d64*)((mlib_u8*)table + s01);
    t2 = *(mlib_d64*)((mlib_u8*)table + s01 + 8);
    t3 = *(mlib_d64*)((mlib_u8*)table + s02);
    t4 = *(mlib_d64*)((mlib_u8*)table + s02 + 8);
    t5 = *(mlib_d64*)((mlib_u8*)table + s03);
    t6 = *(mlib_d64*)((mlib_u8*)table + s03 + 8);
    dp[0] = vis_faligndata(t0, t1);
    dp[1] = vis_faligndata(t1, t2);
    dp[2] = vis_faligndata(t2, t3);
    dp[3] = vis_faligndata(t3, t4);
    dp[4] = vis_faligndata(t4, t5);
    dp[5] = vis_faligndata(t5, t6);
    t0 = t6; dp += 6; xsize -= 3;
    sp = (mlib_u8*)sa;
  }

  i = 0;

  if (xsize >= 4) {

    s0 = *sa++;
    s00 = (s0 >> 20) & 0xFF0;
    s01 = (s0 >> 12) & 0xFF0;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp += 8) {
      s02 = (s0 >> 4) & 0xFF0;
      s03 = (s0 << 4) & 0xFF0;
      t1 = *(mlib_d64*)((mlib_u8*)table + s00);
      t2 = *(mlib_d64*)((mlib_u8*)table + s00 + 8);
      t3 = *(mlib_d64*)((mlib_u8*)table + s01);
      t4 = *(mlib_d64*)((mlib_u8*)table + s01 + 8);
      t5 = *(mlib_d64*)((mlib_u8*)table + s02);
      t6 = *(mlib_d64*)((mlib_u8*)table + s02 + 8);
      t7 = *(mlib_d64*)((mlib_u8*)table + s03);
      t8 = *(mlib_d64*)((mlib_u8*)table + s03 + 8);
      s0 = *sa++;
      s00 = (s0 >> 20) & 0xFF0;
      s01 = (s0 >> 12) & 0xFF0;
      dp[0] = vis_faligndata(t0, t1);
      dp[1] = vis_faligndata(t1, t2);
      dp[2] = vis_faligndata(t2, t3);
      dp[3] = vis_faligndata(t3, t4);
      dp[4] = vis_faligndata(t4, t5);
      dp[5] = vis_faligndata(t5, t6);
      dp[6] = vis_faligndata(t6, t7);
      dp[7] = vis_faligndata(t7, t8);
      t0 = t8;
    }

    s02 = (s0 >> 4) & 0xFF0;
    s03 = (s0 << 4) & 0xFF0;
    t1 = *(mlib_d64*)((mlib_u8*)table + s00);
    t2 = *(mlib_d64*)((mlib_u8*)table + s00 + 8);
    t3 = *(mlib_d64*)((mlib_u8*)table + s01);
    t4 = *(mlib_d64*)((mlib_u8*)table + s01 + 8);
    t5 = *(mlib_d64*)((mlib_u8*)table + s02);
    t6 = *(mlib_d64*)((mlib_u8*)table + s02 + 8);
    t7 = *(mlib_d64*)((mlib_u8*)table + s03);
    t8 = *(mlib_d64*)((mlib_u8*)table + s03 + 8);
    dp[0] = vis_faligndata(t0, t1);
    dp[1] = vis_faligndata(t1, t2);
    dp[2] = vis_faligndata(t2, t3);
    dp[3] = vis_faligndata(t3, t4);
    dp[4] = vis_faligndata(t4, t5);
    dp[5] = vis_faligndata(t5, t6);
    dp[6] = vis_faligndata(t6, t7);
    dp[7] = vis_faligndata(t7, t8);
    t0 = t8;
    dp += 8;
    i += 4;
    sp = (mlib_u8*)sa;
  }

  if ( i < xsize ) {
    t1 = table[2*sp[0]];
    t2 = table[2*sp[0] + 1];
    *dp++ = vis_faligndata(t0, t1);
    *dp++ = vis_faligndata(t1, t2);
    i++; sp++;
    t0 = t2;
  }

  if ( i < xsize ) {
    t1 = table[2*sp[0]];
    t2 = table[2*sp[0] + 1];
    *dp++ = vis_faligndata(t0, t1);
    *dp++ = vis_faligndata(t1, t2);
    i++; sp++;
    t0 = t2;
  }

  if ( i < xsize ) {
    t1 = table[2*sp[0]];
    t2 = table[2*sp[0] + 1];
    *dp++ = vis_faligndata(t0, t1);
    *dp++ = vis_faligndata(t1, t2);
    t0 = t2;
  }

  *(mlib_f32*) dp = vis_read_lo(t0);
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_4_SMALL(const mlib_u8  *src,
                                         mlib_s32       *dst,
                                         mlib_s32       xsize,
                                         const mlib_s32 **table)
{
  mlib_u32 *sa;          /* aligned pointer to source data */
  mlib_u8  *sp;          /* pointer to source data */
  mlib_u32 s0;           /* source data */
  mlib_f32 *dp;          /* aligned pointer to destination */
  mlib_f32 acc0, acc1;   /* destination data */
  mlib_f32 acc2, acc3;   /* destination data */
  mlib_f32 acc4, acc5;   /* destination data */
  mlib_f32 acc6, acc7;   /* destination data */
  mlib_f32 acc8, acc9;   /* destination data */
  mlib_f32 acc10, acc11; /* destination data */
  mlib_f32 acc12, acc13; /* destination data */
  mlib_f32 acc14, acc15; /* destination data */
  mlib_f32 *table0 = (mlib_f32*)table[0];
  mlib_f32 *table1 = (mlib_f32*)table[1];
  mlib_f32 *table2 = (mlib_f32*)table[2];
  mlib_f32 *table3 = (mlib_f32*)table[3];
  mlib_s32 i;            /* loop variable */
  mlib_u32 s00, s01, s02, s03;

  sa   = (mlib_u32*)src;
  dp   = (mlib_f32*)dst;

  i = 0;

  if (xsize >= 4) {

    s0 = *sa++;
    s00 = (s0 >> 22) & 0x3FC;
    s01 = (s0 >> 14) & 0x3FC;

#pragma pipeloop(0)
    for(i = 0; i <= xsize - 8; i+=4, dp += 16) {
      s02 = (s0 >> 6) & 0x3FC;
      s03 = (s0 << 2) & 0x3FC;
      acc0 = *(mlib_f32*)((mlib_u8*)table0 + s00);
      acc1 = *(mlib_f32*)((mlib_u8*)table1 + s00);
      acc2 = *(mlib_f32*)((mlib_u8*)table2 + s00);
      acc3 = *(mlib_f32*)((mlib_u8*)table3 + s00);
      acc4 = *(mlib_f32*)((mlib_u8*)table0 + s01);
      acc5 = *(mlib_f32*)((mlib_u8*)table1 + s01);
      acc6 = *(mlib_f32*)((mlib_u8*)table2 + s01);
      acc7 = *(mlib_f32*)((mlib_u8*)table3 + s01);
      acc8 = *(mlib_f32*)((mlib_u8*)table0 + s02);
      acc9 = *(mlib_f32*)((mlib_u8*)table1 + s02);
      acc10 = *(mlib_f32*)((mlib_u8*)table2 + s02);
      acc11 = *(mlib_f32*)((mlib_u8*)table3 + s02);
      acc12 = *(mlib_f32*)((mlib_u8*)table0 + s03);
      acc13 = *(mlib_f32*)((mlib_u8*)table1 + s03);
      acc14 = *(mlib_f32*)((mlib_u8*)table2 + s03);
      acc15 = *(mlib_f32*)((mlib_u8*)table3 + s03);
      s0 = *sa++;
      s00 = (s0 >> 22) & 0x3FC;
      s01 = (s0 >> 14) & 0x3FC;
      dp[0] = acc0;
      dp[1] = acc1;
      dp[2] = acc2;
      dp[3] = acc3;
      dp[4] = acc4;
      dp[5] = acc5;
      dp[6] = acc6;
      dp[7] = acc7;
      dp[8] = acc8;
      dp[9] = acc9;
      dp[10] = acc10;
      dp[11] = acc11;
      dp[12] = acc12;
      dp[13] = acc13;
      dp[14] = acc14;
      dp[15] = acc15;
    }

    s02 = (s0 >> 6) & 0x3FC;
    s03 = (s0 << 2) & 0x3FC;
    acc0 = *(mlib_f32*)((mlib_u8*)table0 + s00);
    acc1 = *(mlib_f32*)((mlib_u8*)table1 + s00);
    acc2 = *(mlib_f32*)((mlib_u8*)table2 + s00);
    acc3 = *(mlib_f32*)((mlib_u8*)table3 + s00);
    acc4 = *(mlib_f32*)((mlib_u8*)table0 + s01);
    acc5 = *(mlib_f32*)((mlib_u8*)table1 + s01);
    acc6 = *(mlib_f32*)((mlib_u8*)table2 + s01);
    acc7 = *(mlib_f32*)((mlib_u8*)table3 + s01);
    acc8 = *(mlib_f32*)((mlib_u8*)table0 + s02);
    acc9 = *(mlib_f32*)((mlib_u8*)table1 + s02);
    acc10 = *(mlib_f32*)((mlib_u8*)table2 + s02);
    acc11 = *(mlib_f32*)((mlib_u8*)table3 + s02);
    acc12 = *(mlib_f32*)((mlib_u8*)table0 + s03);
    acc13 = *(mlib_f32*)((mlib_u8*)table1 + s03);
    acc14 = *(mlib_f32*)((mlib_u8*)table2 + s03);
    acc15 = *(mlib_f32*)((mlib_u8*)table3 + s03);
    dp[0] = acc0;
    dp[1] = acc1;
    dp[2] = acc2;
    dp[3] = acc3;
    dp[4] = acc4;
    dp[5] = acc5;
    dp[6] = acc6;
    dp[7] = acc7;
    dp[8] = acc8;
    dp[9] = acc9;
    dp[10] = acc10;
    dp[11] = acc11;
    dp[12] = acc12;
    dp[13] = acc13;
    dp[14] = acc14;
    dp[15] = acc15;
    dp += 16;
    i += 4;
  }

  sp = (mlib_u8*)sa;

  if ( i < xsize ) {
    *dp++ = table0[sp[0]];
    *dp++ = table1[sp[0]];
    *dp++ = table2[sp[0]];
    *dp++ = table3[sp[0]];
    i++; sp++;
  }

  if ( i < xsize ) {
    *dp++ = table0[sp[0]];
    *dp++ = table1[sp[0]];
    *dp++ = table2[sp[0]];
    *dp++ = table3[sp[0]];
    i++; sp++;
  }

  if ( i < xsize ) {
    *dp++ = table0[sp[0]];
    *dp++ = table1[sp[0]];
    *dp++ = table2[sp[0]];
    *dp++ = table3[sp[0]];
  }
}

/***************************************************************/
void mlib_v_ImageLookUpSI_U8_S32_4(const mlib_u8  *src,
                                   mlib_s32       slb,
                                   mlib_s32       *dst,
                                   mlib_s32       dlb,
                                   mlib_s32       xsize,
                                   mlib_s32       ysize,
                                   const mlib_s32 **table)
{
  if ((xsize * ysize) < 600) {
    mlib_u8  *sl;
    mlib_s32 *dl;
    mlib_s32 j, i;
    const mlib_s32 *tab0 = table[0];
    const mlib_s32 *tab1 = table[1];
    const mlib_s32 *tab2 = table[2];
    const mlib_s32 *tab3 = table[3];

    sl = (void *)src;
    dl = dst;

    /* row loop */
    for (j = 0; j < ysize; j ++) {
      mlib_u8  *sp = sl;
      mlib_s32 *dp = dl;
      mlib_s32 off, size = xsize;

      off = (mlib_s32)((4 - ((mlib_addr)sp & 3)) & 3);

      off = (off < size) ? off : size;

      for (i = 0; i < off; i++) {
        *dp++ = tab0[sp[0]];
        *dp++ = tab1[sp[0]];
        *dp++ = tab2[sp[0]];
        *dp++ = tab3[sp[0]];
        size--; sp++;
      }

      if (size > 0) {
        mlib_v_ImageLookUpSI_U8_S32_4_SMALL(sp, (mlib_s32*)dp, size, table);
      }

      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
      dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
    }

  } else {
    mlib_u8  *sl;
    mlib_s32 *dl;
    mlib_d64 dtab[512];
    mlib_u32 *tab;
    mlib_u32 *tab0 = (mlib_u32*)table[0];
    mlib_u32 *tab1 = (mlib_u32*)table[1];
    mlib_u32 *tab2 = (mlib_u32*)table[2];
    mlib_u32 *tab3 = (mlib_u32*)table[3];
    mlib_s32 i, j;
    mlib_u32 s0, s1, s2, s3;

    tab = (mlib_u32*)dtab;
    s0 = tab0[0];
    s1 = tab1[0];
    s2 = tab2[0];
    s3 = tab3[0];
    for (i = 0; i < 255; i++) {
      tab[4*i] = s0;
      tab[4*i+1] = s1;
      tab[4*i+2] = s2;
      tab[4*i+3] = s3;
      s0 = tab0[i+1];
      s1 = tab1[i+1];
      s2 = tab2[i+1];
      s3 = tab3[i+1];
    }

    tab[1020] = s0;
    tab[1021] = s1;
    tab[1022] = s2;
    tab[1023] = s3;

    sl = (void *)src;
    dl = dst;

    /* row loop */
    for (j = 0; j < ysize; j ++) {
      mlib_u8  *sp = sl;
      mlib_u32 *dp = (mlib_u32*)dl;
      mlib_s32 off, size = xsize;

      off = (mlib_s32)((4 - ((mlib_addr)sp & 3)) & 3);

      off = (off < size) ? off : size;

#pragma pipeloop(0)
      for (i = 0; i < off; i++) {
        dp[0] = tab0[sp[0]];
        dp[1] = tab1[sp[0]];
        dp[2] = tab2[sp[0]];
        dp[3] = tab3[sp[0]];
        dp += 4; sp++;
      }

      size -= off;

      if (size > 0) {
        if (((mlib_addr)dp & 7) == 0) {
          mlib_v_ImageLookUpSI_U8_S32_4_SrcOff0_D1(sp, (mlib_s32*)dp, size, dtab);
        } else {
          mlib_v_ImageLookUpSI_U8_S32_4_DstNonAl_D1(sp, (mlib_s32*)dp, size, dtab);
        }
      }

      sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
      dl = (mlib_s32 *) ((mlib_u8 *) dl + dlb);
    }
  }
}

/***************************************************************/