jdk/src/solaris/native/sun/awt/medialib/mlib_v_ImageLookUpSIU8U8Func.c
changeset 2 90ce3da70b43
child 5506 202f599c92aa
equal deleted inserted replaced
0:fd16c54261b3 2:90ce3da70b43
       
     1 /*
       
     2  * Copyright 1998-2003 Sun Microsystems, Inc.  All Rights Reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Sun designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Sun in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
       
    22  * CA 95054 USA or visit www.sun.com if you need additional information or
       
    23  * have any questions.
       
    24  */
       
    25 
       
    26 
       
    27 
       
    28 #include "vis_proto.h"
       
    29 #include "mlib_image.h"
       
    30 #include "mlib_v_ImageLookUpFunc.h"
       
    31 
       
    32 /***************************************************************/
       
    /*
     * Forward declarations of the file-local row kernels.
     *
     * Every kernel takes (src, dst, xsize, table).  The variants whose names
     * end in _SMALL take `const mlib_u8 **table`; the others take a single
     * table pointer typed mlib_u16 (_2_), mlib_d64 (_3_) or mlib_f32 (_4_).
     * The _2_/_3_/_4_ infix presumably is the destination channel count and
     * the SrcOffN / DstOffN suffix the pointer misalignment each variant is
     * written for -- TODO confirm against the dispatching caller.
     *
     * Because these declarations are `static`, the later definitions that
     * omit the keyword still have internal linkage.
     */
    33 static void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff0_D1(const mlib_u8  *src,
       
    34                                                     mlib_u8        *dst,
       
    35                                                     mlib_s32       xsize,
       
    36                                                     const mlib_u16 *table);
       
    37 
       
    38 static void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff1_D1(const mlib_u8  *src,
       
    39                                                     mlib_u8        *dst,
       
    40                                                     mlib_s32       xsize,
       
    41                                                     const mlib_u16 *table);
       
    42 
       
    43 static void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff2_D1(const mlib_u8  *src,
       
    44                                                     mlib_u8        *dst,
       
    45                                                     mlib_s32       xsize,
       
    46                                                     const mlib_u16 *table);
       
    47 
       
    48 static void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff3_D1(const mlib_u8  *src,
       
    49                                                     mlib_u8        *dst,
       
    50                                                     mlib_s32       xsize,
       
    51                                                     const mlib_u16 *table);
       
    52 
       
    53 static void mlib_v_ImageLookUpSI_U8_U8_2_DstNonAl_D1(const mlib_u8  *src,
       
    54                                                      mlib_u8        *dst,
       
    55                                                      mlib_s32       xsize,
       
    56                                                      const mlib_u16 *table);
       
    57 
       /* _SMALL variants: table is an array of mlib_u8 table pointers */
    58 static void mlib_v_ImageLookUpSI_U8_U8_2_DstA8D1_SMALL(const mlib_u8 *src,
       
    59                                                        mlib_u8       *dst,
       
    60                                                        mlib_s32      xsize,
       
    61                                                        const mlib_u8 **table);
       
    62 
       
    63 static void mlib_v_ImageLookUpSI_U8_U8_2_D1_SMALL(const mlib_u8 *src,
       
    64                                                   mlib_u8       *dst,
       
    65                                                   mlib_s32      xsize,
       
    66                                                   const mlib_u8 **table);
       
    67 
       /* _3_ variants: table entries are typed mlib_d64 */
    68 static void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff0_D1(const mlib_u8  *src,
       
    69                                                     mlib_u8        *dst,
       
    70                                                     mlib_s32       xsize,
       
    71                                                     const mlib_d64 *table);
       
    72 
       
    73 static void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff1_D1(const mlib_u8  *src,
       
    74                                                     mlib_u8        *dst,
       
    75                                                     mlib_s32       xsize,
       
    76                                                     const mlib_d64 *table);
       
    77 
       
    78 static void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff2_D1(const mlib_u8  *src,
       
    79                                                     mlib_u8        *dst,
       
    80                                                     mlib_s32       xsize,
       
    81                                                     const mlib_d64 *table);
       
    82 
       
    83 static void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff3_D1(const mlib_u8  *src,
       
    84                                                     mlib_u8        *dst,
       
    85                                                     mlib_s32       xsize,
       
    86                                                     const mlib_d64 *table);
       
    87 
       
    88 static void mlib_v_ImageLookUpSI_U8_U8_3_D1_SMALL(const mlib_u8 *src,
       
    89                                                   mlib_u8       *dst,
       
    90                                                   mlib_s32      xsize,
       
    91                                                   const mlib_u8 **table);
       
    92 
       /* _4_ variants: table entries are typed mlib_f32 */
    93 static void mlib_v_ImageLookUpSI_U8_U8_4_SrcOff0_D1(const mlib_u8  *src,
       
    94                                                     mlib_u8        *dst,
       
    95                                                     mlib_s32       xsize,
       
    96                                                     const mlib_f32 *table);
       
    97 
       
    98 static void mlib_v_ImageLookUpSI_U8_U8_4_DstNonAl_D1(const mlib_u8  *src,
       
    99                                                      mlib_u8        *dst,
       
   100                                                      mlib_s32       xsize,
       
   101                                                      const mlib_f32 *table);
       
   102 
       
   103 static void mlib_v_ImageLookUpSI_U8_U8_4_DstOff0_D1_SMALL(const mlib_u8 *src,
       
   104                                                           mlib_u8       *dst,
       
   105                                                           mlib_s32      xsize,
       
   106                                                           const mlib_u8 **table);
       
   107 
       
   108 static void mlib_v_ImageLookUpSI_U8_U8_4_DstOff1_D1_SMALL(const mlib_u8 *src,
       
   109                                                           mlib_u8       *dst,
       
   110                                                           mlib_s32      xsize,
       
   111                                                           const mlib_u8 **table);
       
   112 
       
   113 static void mlib_v_ImageLookUpSI_U8_U8_4_DstOff2_D1_SMALL(const mlib_u8 *src,
       
   114                                                           mlib_u8       *dst,
       
   115                                                           mlib_s32      xsize,
       
   116                                                           const mlib_u8 **table);
       
   117 
       
   118 static void mlib_v_ImageLookUpSI_U8_U8_4_DstOff3_D1_SMALL(const mlib_u8 *src,
       
   119                                                           mlib_u8       *dst,
       
   120                                                           mlib_s32      xsize,
       
   121                                                           const mlib_u8 **table);
       
   122 
       
   123 /***************************************************************/
       
   /*
    * Thin wrappers around the VIS indexed loads: cast the base pointer X to
    * `void *` and pass the offset Y through unchanged.  The kernels below
    * call VIS_LD_U16_I with a byte offset (2 * index) into a mlib_u16 table.
    * NOTE(review): vis_ld_u8_i / vis_ld_u16_i come from vis_proto.h; they are
    * presumably loads of 8 / 16 bits at address X + Y into the low end of a
    * 64-bit register -- confirm against the VIS documentation.
    */
   124 #define VIS_LD_U8_I(X, Y)       vis_ld_u8_i((void *)(X), (Y))
       
   125 #define VIS_LD_U16_I(X, Y)      vis_ld_u16_i((void *)(X), (Y))
       
   126 
       
   127 /***************************************************************/
       
   /*
    * Looks up one row of U8 source pixels in a table of 16-bit entries and
    * writes the entries to dst, four per 64-bit store.
    *
    *   src   - source bytes; read as whole mlib_u32 words starting at src,
    *           so src is presumably 4-byte aligned -- TODO confirm with caller
    *   dst   - destination; written through a mlib_d64 pointer starting at
    *           dst, so dst is presumably 8-byte aligned -- TODO confirm
    *   xsize - number of source pixels (one 16-bit entry is produced per pixel)
    *   table - 16-bit entries, addressed by byte offset 2 * pixel value
    *
    * NOTE(review): `acc` is never initialised before its first use as the
    * second operand of vis_faligndata().  The full 64-bit stores happen only
    * after four pushes and the tail uses a masked store, so the stale lanes
    * are presumably never written -- confirm against the VIS semantics.
    */
   128 void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff0_D1(const mlib_u8  *src,
       
   129                                              mlib_u8        *dst,
       
   130                                              mlib_s32       xsize,
       
   131                                              const mlib_u16 *table)
       
   132 {
       
   133   mlib_u32 *sa;          /* aligned pointer to source data */
       
   134   mlib_u8  *sp;          /* pointer to source data */
       
   135   mlib_u32 s0;           /* source data */
       
   136   mlib_u16 *dl;          /* pointer to start of destination */
       
   137   mlib_u16 *dend;        /* pointer to end of destination */
       
   138   mlib_d64 *dp;          /* aligned pointer to destination */
       
   139   mlib_d64 t0, t1, t2;   /* destination data */
       
   140   mlib_d64 t3, acc;      /* destination data */
       
   141   mlib_s32 emask;        /* edge mask */
       
   142   mlib_s32 i, num;       /* loop variable */
       
   143 
       
   144   sa   = (mlib_u32*)src;
       
   145   dl   = (mlib_u16*)dst;
       
   146   dp   = (mlib_d64 *) dl;
         /* dend addresses the last 16-bit output element of the row */
   147   dend = dl + xsize - 1;
       
   148 
         /* align offset 6: each vis_faligndata(t, acc) below is expected to drop
            the low 2 bytes of acc and bring in the 16 bits held in t -- see VIS docs */
   149   vis_alignaddr((void *) 0, 6);
       
   150 
       
   151   if (xsize >= 4) {
       
   152 
         /* one word (four pixels) is always loaded ahead of its use */
   153     s0 = sa[0];
       
   154     sa ++;
       
   155 
       
   156 #pragma pipeloop(0)
         /* runs while at least 8 pixels remain, so the look-ahead load stays
            inside the row; four pixels are converted per pass */
   157     for(i = 0; i <= xsize - 8; i+=4, sa++) {
           /* table byte offsets (2 * byte value) for bits 0-7, 8-15, 16-23, 24-31 of s0 */
   158       t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   159       t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
       
   160       t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
       
   161       t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
           /* pushed lowest byte first, so the entry for the top byte of s0 is pushed last */
   162       acc = vis_faligndata(t3, acc);
       
   163       acc = vis_faligndata(t2, acc);
       
   164       acc = vis_faligndata(t1, acc);
       
   165       acc = vis_faligndata(t0, acc);
       
   166       s0 = sa[0];
       
   167       *dp++ = acc;
       
   168     }
       
   169 
         /* epilogue: convert the word that was loaded ahead but not yet used */
   170     t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   171     t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
       
   172     t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
       
   173     t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
       
   174     acc = vis_faligndata(t3, acc);
       
   175     acc = vis_faligndata(t2, acc);
       
   176     acc = vis_faligndata(t1, acc);
       
   177     acc = vis_faligndata(t0, acc);
       
   178     *dp++ = acc;
       
   179   }
       
   180 
         /* sa now addresses the first pixel that has not been converted */
   181   sp = (mlib_u8*)sa;
       
   182 
         /* tail: fewer than four outputs are left when dp has not passed dend */
   183   if ((mlib_addr) dp <= (mlib_addr) dend) {
       
   184 
         /* num = remaining outputs - 1; move sp to the last remaining pixel */
   185     num = (mlib_u16*) dend - (mlib_u16*) dp;
       
   186     sp  += num;
       
   187     num ++;
       
   188 #pragma pipeloop(0)
         /* walk the source backwards so the first remaining pixel is pushed last */
   189     for (i = 0; i < num; i ++) {
       
   190       s0 = (mlib_s32) *sp;
       
   191       sp --;
       
   192 
       
   193       t0  = VIS_LD_U16_I(table, 2*s0);
       
   194       acc = vis_faligndata(t0, acc);
       
   195     }
       
   196 
         /* partial store: presumably only the 16-bit lanes from dp through dend are written */
   197     emask = vis_edge16(dp, dend);
       
   198     vis_pst_16(acc, dp, emask);
       
   199   }
       
   200 }
       
   201 
       
   202 /***************************************************************/
       
   /*
    * Same row lookup as the SrcOff0 kernel, for a source that starts one byte
    * past the word it is read from: words are fetched starting at src - 1.
    *
    *   src   - source bytes; src - 1 is read as mlib_u32 words, so src - 1 is
    *           presumably 4-byte aligned -- TODO confirm with caller
    *   dst   - destination; written through a mlib_d64 pointer starting at
    *           dst, so dst is presumably 8-byte aligned -- TODO confirm
    *   xsize - number of source pixels (one 16-bit entry is produced per pixel)
    *   table - 16-bit entries, addressed by byte offset 2 * pixel value
    *
    * Each group of four pixels is the low three bytes of one word (s0)
    * followed by the top byte of the next word (s1).
    *
    * NOTE(review): `acc` is used uninitialised as the second operand of the
    * first vis_faligndata(); presumably harmless because stale lanes are
    * shifted out or masked -- confirm against the VIS semantics.
    */
   203 void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff1_D1(const mlib_u8  *src,
       
   204                                              mlib_u8        *dst,
       
   205                                              mlib_s32       xsize,
       
   206                                              const mlib_u16 *table)
       
   207 {
       
   208   mlib_u32 *sa;          /* aligned pointer to source data */
       
   209   mlib_u8  *sp;          /* pointer to source data */
       
   210   mlib_u32 s0, s1;       /* source data */
       
   211   mlib_u16 *dl;          /* pointer to start of destination */
       
   212   mlib_u16 *dend;        /* pointer to end of destination */
       
   213   mlib_d64 *dp;          /* aligned pointer to destination */
       
   214   mlib_d64 t0, t1, t2;   /* destination data */
       
   215   mlib_d64 t3, acc;      /* destination data */
       
   216   mlib_s32 emask;        /* edge mask */
       
   217   mlib_s32 i, num;       /* loop variable */
       
   218 
       
   219   sa   = (mlib_u32*)(src-1);
       
   220   dl   = (mlib_u16*)dst;
       
   221   dp   = (mlib_d64 *) dl;
         /* dend addresses the last 16-bit output element of the row */
   222   dend = dl + xsize - 1;
       
   223 
         /* align offset 6: each vis_faligndata(t, acc) below is expected to drop
            the low 2 bytes of acc and bring in the 16 bits held in t -- see VIS docs */
   224   vis_alignaddr((void *) 0, 6);
       
   225 
         /* first word is loaded unconditionally; it starts one byte before src */
   226   s0 = *sa++;
       
   227 
       
   228   if (xsize >= 4) {
       
   229 
         /* s1 is the look-ahead word; its top byte completes the current group */
   230     s1 = sa[0];
       
   231     sa ++;
       
   232 
       
   233 #pragma pipeloop(0)
         /* four pixels per pass while at least 8 remain */
   234     for(i = 0; i <= xsize - 8; i+=4, sa++) {
           /* offsets: bits 24-31 of s1, then bits 0-7, 8-15, 16-23 of s0 */
   235       t3 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
       
   236       t2 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   237       t1 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
       
   238       t0 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
       
   239       acc = vis_faligndata(t3, acc);
       
   240       acc = vis_faligndata(t2, acc);
       
   241       acc = vis_faligndata(t1, acc);
       
   242       acc = vis_faligndata(t0, acc);
           /* slide the two-word window forward by one word */
   243       s0 = s1;
       
   244       s1 = sa[0];
       
   245       *dp++ = acc;
       
   246     }
       
   247 
         /* epilogue: convert the group already held in s0 / s1 */
   248     t3 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
       
   249     t2 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   250     t1 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
       
   251     t0 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
       
   252     acc = vis_faligndata(t3, acc);
       
   253     acc = vis_faligndata(t2, acc);
       
   254     acc = vis_faligndata(t1, acc);
       
   255     acc = vis_faligndata(t0, acc);
       
   256     *dp++ = acc;
       
   257   }
       
   258 
         /* sa is 3 bytes past the first unconverted pixel (it started at src - 1
            and is one word ahead), hence the correction below */
   259   sp = (mlib_u8*)sa;
       
   260   sp -= 3;
       
   261 
         /* tail: fewer than four outputs are left when dp has not passed dend */
   262   if ((mlib_addr) dp <= (mlib_addr) dend) {
       
   263 
         /* num = remaining outputs - 1; move sp to the last remaining pixel */
   264     num = (mlib_u16*) dend - (mlib_u16*) dp;
       
   265     sp  += num;
       
   266     num ++;
       
   267 #pragma pipeloop(0)
         /* walk the source backwards so the first remaining pixel is pushed last */
   268     for (i = 0; i < num; i ++) {
       
   269       s0 = (mlib_s32) *sp;
       
   270       sp --;
       
   271 
       
   272       t0  = VIS_LD_U16_I(table, 2*s0);
       
   273       acc = vis_faligndata(t0, acc);
       
   274     }
       
   275 
         /* partial store: presumably only the 16-bit lanes from dp through dend are written */
   276     emask = vis_edge16(dp, dend);
       
   277     vis_pst_16(acc, dp, emask);
       
   278   }
       
   279 }
       
   280 
       
   281 /***************************************************************/
       
   /*
    * Same row lookup as the SrcOff0 kernel, for a source that starts two
    * bytes past the word it is read from: words are fetched from src - 2.
    *
    *   src   - source bytes; src - 2 is read as mlib_u32 words, so src - 2 is
    *           presumably 4-byte aligned -- TODO confirm with caller
    *   dst   - destination; written through a mlib_d64 pointer starting at
    *           dst, so dst is presumably 8-byte aligned -- TODO confirm
    *   xsize - number of source pixels (one 16-bit entry is produced per pixel)
    *   table - 16-bit entries, addressed by byte offset 2 * pixel value
    *
    * Each group of four pixels is the low two bytes of one word (s0)
    * followed by the top two bytes of the next word (s1).
    *
    * NOTE(review): `acc` is used uninitialised as the second operand of the
    * first vis_faligndata(); presumably harmless because stale lanes are
    * shifted out or masked -- confirm against the VIS semantics.
    */
   282 void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff2_D1(const mlib_u8  *src,
       
   283                                              mlib_u8        *dst,
       
   284                                              mlib_s32       xsize,
       
   285                                              const mlib_u16 *table)
       
   286 {
       
   287   mlib_u32 *sa;          /* pointer to source data */
       
   288   mlib_u8  *sp;          /* pointer to source data */
       
   289   mlib_u32 s0, s1;       /* source data */
       
   290   mlib_u16 *dl;          /* pointer to start of destination */
       
   291   mlib_u16 *dend;        /* pointer to end of destination */
       
   292   mlib_d64 *dp;          /* aligned pointer to destination */
       
   293   mlib_d64 t0, t1, t2;   /* destination data */
       
   294   mlib_d64 t3, acc;      /* destination data */
       
   295   mlib_s32 emask;        /* edge mask */
       
   296   mlib_s32 i, num;       /* loop variable */
       
   297 
       
   298   sa   = (mlib_u32*)(src-2);
       
   299   dl   = (mlib_u16*)dst;
       
   300   dp   = (mlib_d64 *) dl;
         /* dend addresses the last 16-bit output element of the row */
   301   dend = dl + xsize - 1;
       
   302 
         /* align offset 6: each vis_faligndata(t, acc) below is expected to drop
            the low 2 bytes of acc and bring in the 16 bits held in t -- see VIS docs */
   303   vis_alignaddr((void *) 0, 6);
       
   304 
         /* first word is loaded unconditionally; it starts two bytes before src */
   305   s0 = *sa++;
       
   306 
       
   307   if (xsize >= 4) {
       
   308 
         /* s1 is the look-ahead word; its top two bytes complete the current group */
   309     s1 = sa[0];
       
   310     sa ++;
       
   311 
       
   312 #pragma pipeloop(0)
         /* four pixels per pass while at least 8 remain */
   313     for(i = 0; i <= xsize - 8; i+=4, sa++) {
           /* offsets: bits 16-23 and 24-31 of s1, then bits 0-7 and 8-15 of s0 */
   314       t3 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
       
   315       t2 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
       
   316       t1 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   317       t0 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
       
   318       acc = vis_faligndata(t3, acc);
       
   319       acc = vis_faligndata(t2, acc);
       
   320       acc = vis_faligndata(t1, acc);
       
   321       acc = vis_faligndata(t0, acc);
           /* slide the two-word window forward by one word */
   322       s0 = s1;
       
   323       s1 = sa[0];
       
   324       *dp++ = acc;
       
   325     }
       
   326 
         /* epilogue: convert the group already held in s0 / s1 */
   327     t3 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
       
   328     t2 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
       
   329     t1 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   330     t0 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
       
   331     acc = vis_faligndata(t3, acc);
       
   332     acc = vis_faligndata(t2, acc);
       
   333     acc = vis_faligndata(t1, acc);
       
   334     acc = vis_faligndata(t0, acc);
       
   335     *dp++ = acc;
       
   336   }
       
   337 
         /* sa is 2 bytes past the first unconverted pixel (it started at src - 2
            and is one word ahead), hence the correction below */
   338   sp = (mlib_u8*)sa;
       
   339   sp -= 2;
       
   340 
         /* tail: fewer than four outputs are left when dp has not passed dend */
   341   if ((mlib_addr) dp <= (mlib_addr) dend) {
       
   342 
         /* num = remaining outputs - 1; move sp to the last remaining pixel */
   343     num = (mlib_u16*) dend - (mlib_u16*) dp;
       
   344     sp  += num;
       
   345     num ++;
       
   346 #pragma pipeloop(0)
         /* walk the source backwards so the first remaining pixel is pushed last */
   347     for (i = 0; i < num; i ++) {
       
   348       s0 = (mlib_s32) *sp;
       
   349       sp --;
       
   350 
       
   351       t0  = VIS_LD_U16_I(table, 2*s0);
       
   352       acc = vis_faligndata(t0, acc);
       
   353     }
       
   354 
         /* partial store: presumably only the 16-bit lanes from dp through dend are written */
   355     emask = vis_edge16(dp, dend);
       
   356     vis_pst_16(acc, dp, emask);
       
   357   }
       
   358 }
       
   359 
       
   360 /***************************************************************/
       
   /*
    * Same row lookup as the SrcOff0 kernel, for a source that starts three
    * bytes past the word it is read from: words are fetched from src - 3.
    *
    *   src   - source bytes; src - 3 is read as mlib_u32 words, so src - 3 is
    *           presumably 4-byte aligned -- TODO confirm with caller
    *   dst   - destination; written through a mlib_d64 pointer starting at
    *           dst, so dst is presumably 8-byte aligned -- TODO confirm
    *   xsize - number of source pixels (one 16-bit entry is produced per pixel)
    *   table - 16-bit entries, addressed by byte offset 2 * pixel value
    *
    * Each group of four pixels is the lowest byte of one word (s0) followed
    * by the top three bytes of the next word (s1).
    *
    * NOTE(review): `acc` is used uninitialised as the second operand of the
    * first vis_faligndata(); presumably harmless because stale lanes are
    * shifted out or masked -- confirm against the VIS semantics.
    */
   361 void mlib_v_ImageLookUpSI_U8_U8_2_SrcOff3_D1(const mlib_u8  *src,
       
   362                                              mlib_u8        *dst,
       
   363                                              mlib_s32       xsize,
       
   364                                              const mlib_u16 *table)
       
   365 {
       
   366   mlib_u32 *sa;          /* aligned pointer to source data */
       
   367   mlib_u8  *sp;          /* pointer to source data */
       
   368   mlib_u32 s0, s1;       /* source data */
       
   369   mlib_u16 *dl;          /* pointer to start of destination */
       
   370   mlib_u16 *dend;        /* pointer to end of destination */
       
   371   mlib_d64 *dp;          /* aligned pointer to destination */
       
   372   mlib_d64 t0, t1, t2;   /* destination data */
       
   373   mlib_d64 t3, acc;      /* destination data */
       
   374   mlib_s32 emask;        /* edge mask */
       
   375   mlib_s32 i, num;       /* loop variable */
       
   376 
       
   377   sa   = (mlib_u32*)(src-3);
       
   378   dl   = (mlib_u16*)dst;
       
   379   dp   = (mlib_d64 *) dl;
         /* dend addresses the last 16-bit output element of the row */
   380   dend = dl + xsize - 1;
       
   381 
         /* align offset 6: each vis_faligndata(t, acc) below is expected to drop
            the low 2 bytes of acc and bring in the 16 bits held in t -- see VIS docs */
   382   vis_alignaddr((void *) 0, 6);
       
   383 
         /* first word is loaded unconditionally; it starts three bytes before src */
   384   s0 = *sa++;
       
   385 
       
   386   if (xsize >= 4) {
       
   387 
         /* s1 is the look-ahead word; its top three bytes complete the current group */
   388     s1 = sa[0];
       
   389     sa ++;
       
   390 
       
   391 #pragma pipeloop(0)
         /* four pixels per pass while at least 8 remain */
   392     for(i = 0; i <= xsize - 8; i+=4, sa++) {
           /* offsets: bits 8-15, 16-23 and 24-31 of s1, then bits 0-7 of s0 */
   393       t3 = VIS_LD_U16_I(table, (s1 >> 7) & 0x1FE);
       
   394       t2 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
       
   395       t1 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
       
   396       t0 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   397       acc = vis_faligndata(t3, acc);
       
   398       acc = vis_faligndata(t2, acc);
       
   399       acc = vis_faligndata(t1, acc);
       
   400       acc = vis_faligndata(t0, acc);
           /* slide the two-word window forward by one word */
   401       s0 = s1;
       
   402       s1 = sa[0];
       
   403       *dp++ = acc;
       
   404     }
       
   405 
         /* epilogue: convert the group already held in s0 / s1 */
   406     t3 = VIS_LD_U16_I(table, (s1 >> 7) & 0x1FE);
       
   407     t2 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
       
   408     t1 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
       
   409     t0 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   410     acc = vis_faligndata(t3, acc);
       
   411     acc = vis_faligndata(t2, acc);
       
   412     acc = vis_faligndata(t1, acc);
       
   413     acc = vis_faligndata(t0, acc);
       
   414     *dp++ = acc;
       
   415   }
       
   416 
         /* sa is 1 byte past the first unconverted pixel (it started at src - 3
            and is one word ahead), hence the correction below */
   417   sp = (mlib_u8*)sa;
       
   418   sp -= 1;
       
   419 
         /* tail: fewer than four outputs are left when dp has not passed dend */
   420   if ((mlib_addr) dp <= (mlib_addr) dend) {
       
   421 
         /* num = remaining outputs - 1; move sp to the last remaining pixel */
   422     num = (mlib_u16*) dend - (mlib_u16*) dp;
       
   423     sp  += num;
       
   424     num ++;
       
   425 #pragma pipeloop(0)
         /* walk the source backwards so the first remaining pixel is pushed last */
   426     for (i = 0; i < num; i ++) {
       
   427       s0 = (mlib_s32) *sp;
       
   428       sp --;
       
   429 
       
   430       t0  = VIS_LD_U16_I(table, 2*s0);
       
   431       acc = vis_faligndata(t0, acc);
       
   432     }
       
   433 
         /* partial store: presumably only the 16-bit lanes from dp through dend are written */
   434     emask = vis_edge16(dp, dend);
       
   435     vis_pst_16(acc, dp, emask);
       
   436   }
       
   437 }
       
   438 
       
   439 /***************************************************************/
       
   /*
    * Row lookup into a table of 16-bit entries for a destination that is not
    * 8-byte aligned: stores go to the 8-byte boundary at or below dst and each
    * stored doubleword is assembled from two consecutive accumulators.
    *
    *   src   - source bytes; read as mlib_u32 words from src + 4 onwards, so
    *           src is presumably 4-byte aligned -- TODO confirm with caller
    *   dst   - destination bytes, any alignment; 2 * xsize bytes are covered
    *           (dend = dst + 2 * xsize - 1)
    *   xsize - number of source pixels; decremented by 4 after the prologue
    *   table - 16-bit entries, addressed by byte offset 2 * pixel value
    *
    * The align offset is switched between two values throughout:
    *   6   - while packing 16-bit table entries into an accumulator;
    *   off - (dp - dst, i.e. zero or negative) while merging two neighbouring
    *         accumulators into one destination-aligned doubleword.
    * NOTE(review): this relies on vis_alignaddr() using only the low three
    * bits of base + offset and on vis_faligndata() extracting 8 bytes from
    * the 16-byte concatenation of its operands -- confirm in the VIS docs.
    *
    * NOTE(review): acc0..acc4 are used uninitialised as the second operand of
    * the first vis_faligndata(); lanes that were never pushed are presumably
    * excluded by the edge masks -- confirm.
    */
   440 void mlib_v_ImageLookUpSI_U8_U8_2_DstNonAl_D1(const mlib_u8  *src,
       
   441                                               mlib_u8        *dst,
       
   442                                               mlib_s32       xsize,
       
   443                                               const mlib_u16 *table)
       
   444 {
       
   445   mlib_u32 *sa;             /* aligned pointer to source data */
       
   446   mlib_u8  *sp;             /* pointer to source data */
       
   447   mlib_u32 s0, s1, s2, s3;  /* source data */
       
   448   mlib_u8  *dl;             /* pointer to start of destination */
       
   449   mlib_u8  *dend;           /* pointer to end of destination */
       
   450   mlib_d64 *dp;             /* aligned pointer to destination */
       
   451   mlib_d64 t0, t1, t2;      /* destination data */
       
   452   mlib_d64 t3, t4, t5;      /* destination data */
       
   453   mlib_d64 t6, t7, acc0;    /* destination data */
       
   454   mlib_d64 acc1, acc2;      /* destination data */
       
   455   mlib_d64 acc3, acc4;      /* destination data */
       
   456   mlib_s32 emask;           /* edge mask */
       
   457   mlib_s32 i, num;          /* loop variable */
       
   458   mlib_s32 off;             /* offset */
       
   459 
       
   460   sa   = (mlib_u32*)src;
       
   461   dl   = dst;
       
   462   sp   = (void *)src;
         /* dend is the last destination byte; dp is dst rounded down to 8 bytes */
   463   dend = dl + 2*xsize - 1;
       
   464   dp   = (mlib_d64 *) ((mlib_addr) dl & (~7));
       
   465   off  = (mlib_addr) dp - (mlib_addr) dl;
       
   466 
         /* prologue: up to the first four pixels are handled byte by byte */
   467   emask = vis_edge8(dl, dend);
       
   468   num = (xsize < 4) ? xsize : 4;
       
   469 
       
   470   sp += (num-1);
       
   471 
       
   472   vis_alignaddr(dp, 6);
       
         /* source is walked backwards so the first pixel is pushed last */
   474   for (i = 0; i < num; i ++) {
       
   475     s0 = (mlib_s32) *sp;
       
   476     sp --;
       
   477 
       
   478     t0  = VIS_LD_U16_I(table, 2*s0);
       
   479     acc0 = vis_faligndata(t0, acc0);
       
   480   }
       
   481 
         /* rotate acc0 to the destination alignment and store only the bytes
            selected by the edge mask computed from dst and dend */
   482   vis_alignaddr(dp, off);
       
   483   vis_pst_8(vis_faligndata(acc0, acc0), dp++, emask);
       
   484 
         /* skip the source word covered by the prologue */
   485   sa++;
       
   486 
         /* xsize now counts pixels after the first four (negative if there were fewer) */
   487   xsize -= 4;
       
   488 
       
   489   i = 0;
       
   490 
         /* bulk path: 16 pixels (four source words, four stores) per step */
   491   if (xsize >= 16) {
       
   492 
       
   493     s0 = sa[0];
       
   494     s1 = sa[1];
       
   495     s2 = sa[2];
       
   496     s3 = sa[3];
       
   497     sa += 4;
       
   498 
       
   499 #pragma pipeloop(0)
         /* runs while at least 32 pixels remain, so the four look-ahead words stay inside the row */
   500     for(i = 0; i <= xsize - 32; i+=16, sa+=4) {
       
   501       vis_alignaddr(dp, 6);
           /* pack the four entries for s0 into acc1 */
   502       t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   503       t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
       
   504       t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
       
   505       t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
       
   506       acc1 = vis_faligndata(t3, acc1);
       
   507       acc1 = vis_faligndata(t2, acc1);
       
   508       acc1 = vis_faligndata(t1, acc1);
       
   509       acc1 = vis_faligndata(t0, acc1);
           /* pack the four entries for s1 into acc2 */
   510       t7 = VIS_LD_U16_I(table, (s1 << 1) & 0x1FE);
       
   511       t6 = VIS_LD_U16_I(table, (s1 >> 7) & 0x1FE);
       
   512       t5 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
       
   513       t4 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
       
   514       acc2 = vis_faligndata(t7, acc2);
       
   515       acc2 = vis_faligndata(t6, acc2);
       
   516       acc2 = vis_faligndata(t5, acc2);
       
   517       acc2 = vis_faligndata(t4, acc2);
           /* pack the four entries for s2 into acc3 */
   518       t3 = VIS_LD_U16_I(table, (s2 << 1) & 0x1FE);
       
   519       t2 = VIS_LD_U16_I(table, (s2 >> 7) & 0x1FE);
       
   520       t1 = VIS_LD_U16_I(table, (s2 >> 15) & 0x1FE);
       
   521       t0 = VIS_LD_U16_I(table, (s2 >> 23) & 0x1FE);
       
   522       acc3 = vis_faligndata(t3, acc3);
       
   523       acc3 = vis_faligndata(t2, acc3);
       
   524       acc3 = vis_faligndata(t1, acc3);
       
   525       acc3 = vis_faligndata(t0, acc3);
           /* pack the four entries for s3 into acc4 */
   526       t7 = VIS_LD_U16_I(table, (s3 << 1) & 0x1FE);
       
   527       t6 = VIS_LD_U16_I(table, (s3 >> 7) & 0x1FE);
       
   528       t5 = VIS_LD_U16_I(table, (s3 >> 15) & 0x1FE);
       
   529       t4 = VIS_LD_U16_I(table, (s3 >> 23) & 0x1FE);
       
   530       acc4 = vis_faligndata(t7, acc4);
       
   531       acc4 = vis_faligndata(t6, acc4);
       
   532       acc4 = vis_faligndata(t5, acc4);
       
   533       acc4 = vis_faligndata(t4, acc4);
           /* switch to the destination offset before merging accumulators */
   534       vis_alignaddr(dp, off);
       
   535       s0 = sa[0];
       
   536       s1 = sa[1];
       
   537       s2 = sa[2];
       
   538       s3 = sa[3];
       
   539       *dp++ = vis_faligndata(acc0, acc1);
       
   540       *dp++ = vis_faligndata(acc1, acc2);
       
   541       *dp++ = vis_faligndata(acc2, acc3);
       
   542       *dp++ = vis_faligndata(acc3, acc4);
           /* the last accumulator still holds bytes owed to the next store */
   543       acc0 = acc4;
       
   544     }
       
   545 
         /* epilogue of the bulk loop: convert the four words already loaded */
   546     vis_alignaddr(dp, 6);
       
   547     t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   548     t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
       
   549     t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
       
   550     t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
       
   551     acc1 = vis_faligndata(t3, acc1);
       
   552     acc1 = vis_faligndata(t2, acc1);
       
   553     acc1 = vis_faligndata(t1, acc1);
       
   554     acc1 = vis_faligndata(t0, acc1);
       
   555     t7 = VIS_LD_U16_I(table, (s1 << 1) & 0x1FE);
       
   556     t6 = VIS_LD_U16_I(table, (s1 >> 7) & 0x1FE);
       
   557     t5 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
       
   558     t4 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
       
   559     acc2 = vis_faligndata(t7, acc2);
       
   560     acc2 = vis_faligndata(t6, acc2);
       
   561     acc2 = vis_faligndata(t5, acc2);
       
   562     acc2 = vis_faligndata(t4, acc2);
       
   563     t3 = VIS_LD_U16_I(table, (s2 << 1) & 0x1FE);
       
   564     t2 = VIS_LD_U16_I(table, (s2 >> 7) & 0x1FE);
       
   565     t1 = VIS_LD_U16_I(table, (s2 >> 15) & 0x1FE);
       
   566     t0 = VIS_LD_U16_I(table, (s2 >> 23) & 0x1FE);
       
   567     acc3 = vis_faligndata(t3, acc3);
       
   568     acc3 = vis_faligndata(t2, acc3);
       
   569     acc3 = vis_faligndata(t1, acc3);
       
   570     acc3 = vis_faligndata(t0, acc3);
       
   571     t7 = VIS_LD_U16_I(table, (s3 << 1) & 0x1FE);
       
   572     t6 = VIS_LD_U16_I(table, (s3 >> 7) & 0x1FE);
       
   573     t5 = VIS_LD_U16_I(table, (s3 >> 15) & 0x1FE);
       
   574     t4 = VIS_LD_U16_I(table, (s3 >> 23) & 0x1FE);
       
   575     acc4 = vis_faligndata(t7, acc4);
       
   576     acc4 = vis_faligndata(t6, acc4);
       
   577     acc4 = vis_faligndata(t5, acc4);
       
   578     acc4 = vis_faligndata(t4, acc4);
       
   579     vis_alignaddr(dp, off);
       
   580     *dp++ = vis_faligndata(acc0, acc1);
       
   581     *dp++ = vis_faligndata(acc1, acc2);
       
   582     *dp++ = vis_faligndata(acc2, acc3);
       
   583     *dp++ = vis_faligndata(acc3, acc4);
       
   584     acc0 = acc4; i+=16;
       
   585   }
       
   586 
         /* at least 8 pixels left: two source words, two stores */
   587   if (i <= xsize - 8) {
       
   588     s0 = sa[0];
       
   589     s1 = sa[1];
       
   590     vis_alignaddr(dp, 6);
       
   591     t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   592     t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
       
   593     t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
       
   594     t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
       
   595     acc1 = vis_faligndata(t3, acc1);
       
   596     acc1 = vis_faligndata(t2, acc1);
       
   597     acc1 = vis_faligndata(t1, acc1);
       
   598     acc1 = vis_faligndata(t0, acc1);
       
   599     t7 = VIS_LD_U16_I(table, (s1 << 1) & 0x1FE);
       
   600     t6 = VIS_LD_U16_I(table, (s1 >> 7) & 0x1FE);
       
   601     t5 = VIS_LD_U16_I(table, (s1 >> 15) & 0x1FE);
       
   602     t4 = VIS_LD_U16_I(table, (s1 >> 23) & 0x1FE);
       
   603     acc2 = vis_faligndata(t7, acc2);
       
   604     acc2 = vis_faligndata(t6, acc2);
       
   605     acc2 = vis_faligndata(t5, acc2);
       
   606     acc2 = vis_faligndata(t4, acc2);
       
   607     vis_alignaddr(dp, off);
       
   608     *dp++ = vis_faligndata(acc0, acc1);
       
   609     *dp++ = vis_faligndata(acc1, acc2);
       
   610     acc0 = acc2; i += 8; sa += 2;
       
   611   }
       
   612 
         /* at least 4 pixels left: one source word, one store */
   613   if (i <= xsize - 4) {
       
   614     s0 = *sa++;
       
   615     vis_alignaddr(dp, 6);
       
   616     t3 = VIS_LD_U16_I(table, (s0 << 1) & 0x1FE);
       
   617     t2 = VIS_LD_U16_I(table, (s0 >> 7) & 0x1FE);
       
   618     t1 = VIS_LD_U16_I(table, (s0 >> 15) & 0x1FE);
       
   619     t0 = VIS_LD_U16_I(table, (s0 >> 23) & 0x1FE);
       
   620     acc1 = vis_faligndata(t3, acc1);
       
   621     acc1 = vis_faligndata(t2, acc1);
       
   622     acc1 = vis_faligndata(t1, acc1);
       
   623     acc1 = vis_faligndata(t0, acc1);
       
   624     vis_alignaddr(dp, off);
       
   625     *dp++ = vis_faligndata(acc0, acc1);
       
   626     acc0 = acc1;
       
   627   }
       
   628 
         /* sa addresses the first source byte not yet consumed by a word step */
   629   sp = (mlib_u8*)sa;
       
   630 
         /* tail: destination bytes remain at or after dp */
   631   if ((mlib_addr) dp <= (mlib_addr) dend) {
       
         /* bytes left from dp to dend, less the -off bytes acc0 still holds,
            halved: the number of table entries that must still be fetched
            (can be <= 0, in which case the loop below does not run) */
   633     num = (((mlib_u8*) dend - (mlib_u8*) dp) + off + 1) >> 1;
       
   634     sp  += (num - 1);
       
   635     vis_alignaddr(dp, 6);
       
   636 #pragma pipeloop(0)
         /* source walked backwards so the first remaining pixel is pushed last */
   637     for (i = 0; i < num; i ++) {
       
   638       s0 = (mlib_s32) *sp;
       
   639       sp --;
       
   640 
       
   641       t0  = VIS_LD_U16_I(table, 2*s0);
       
   642       acc1 = vis_faligndata(t0, acc1);
       
   643     }
       
   644 
         /* merge carried-over bytes (acc0) with the new entries (acc1), masked store */
   645     vis_alignaddr(dp, off);
       
   646     emask = vis_edge8(dp, dend);
       
   647     vis_pst_8(vis_faligndata(acc0, acc1), dp++, emask);
       
   648   }
       
   649 
         /* acc1 may still hold bytes that spill into one more doubleword */
   650   if ((mlib_addr) dp <= (mlib_addr) dend) {
       
   651     emask = vis_edge8(dp, dend);
       
   652     vis_pst_8(vis_faligndata(acc1, acc1), dp++, emask);
       
   653   }
       
   654 }
       
   655 
       
   656 /***************************************************************/
       
   657 void mlib_v_ImageLookUpSI_U8_U8_2_DstA8D1_SMALL(const mlib_u8 *src,
       
   658                                                 mlib_u8       *dst,
       
   659                                                 mlib_s32      xsize,
       
   660                                                 const mlib_u8 **table)
       
   661 {
       
         /*
          * One-row lookup used by the small-image path: every source byte s
          * is expanded to the byte pair (table[0][s], table[1][s]).
          * xsize is the number of source pixels.
          * dst is written through an mlib_d64 pointer; the dispatcher in this
          * file only calls this kernel after advancing dst to an 8-byte
          * boundary.
          * NOTE(review): acc is used as a vis_faligndata operand before it is
          * ever assigned; presumably every byte that reaches memory has been
          * replaced by then -- confirm against the VIS semantics.
          */
   662   mlib_u8  *sp;              /* pointer to source data */
       
   663   mlib_u32 s0, s1, s2, s3;   /* source data */
       
   664   mlib_u16 *dl;              /* pointer to start of destination */
       
   665   mlib_u16 *dend;            /* pointer to end of destination */
       
   666   mlib_d64 *dp;              /* aligned pointer to destination */
       
   667   mlib_d64 t0, t1, t2;       /* destination data */
       
   668   mlib_d64 t3, t4, t5;       /* destination data */
       
   669   mlib_d64 t6, t7, acc;      /* destination data */
       
   670   mlib_s32 emask;            /* edge mask */
       
   671   mlib_s32 i, num;           /* loop variable */
       
   672   const mlib_u8  *tab0 = table[0];
       
   673   const mlib_u8  *tab1 = table[1];
       
   674 
       
   675   sp   = (void *)src;
       
   676   dl   = (mlib_u16*)dst;
       
   677   dp   = (mlib_d64 *) dl;
       
         /* dend addresses the last 16-bit destination pixel of the row. */
   678   dend = dl + xsize - 1;
       
   679 
       
         /* Align offset 7: each vis_faligndata(t, acc) below presumably pushes
          * the single byte loaded into t onto the front of acc -- confirm
          * against the VIS instruction set manual. */
   680   vis_alignaddr((void *) 0, 7);
       
   681 
       
   682   if (xsize >= 4) {
       
   683 
       
           /* Preload the first group of four source pixels. */
   684     s0 = sp[0];
       
   685     s1 = sp[1];
       
   686     s2 = sp[2];
       
   687     s3 = sp[3];
       
   688     sp += 4;
       
   689 
       
           /* Steady state: the four pixels held in s0..s3 become one 8-byte
            * store while the next four source bytes are fetched.  Table bytes
            * are pushed in reverse order (t7 first, t0 last). */
   690 #pragma pipeloop(0)
       
   691     for(i = 0; i <= xsize - 8; i+=4, sp+=4) {
       
   692       t7 = VIS_LD_U8_I(tab1, s3);
       
   693       t6 = VIS_LD_U8_I(tab0, s3);
       
   694       t5 = VIS_LD_U8_I(tab1, s2);
       
   695       t4 = VIS_LD_U8_I(tab0, s2);
       
   696       t3 = VIS_LD_U8_I(tab1, s1);
       
   697       t2 = VIS_LD_U8_I(tab0, s1);
       
   698       t1 = VIS_LD_U8_I(tab1, s0);
       
   699       t0 = VIS_LD_U8_I(tab0, s0);
       
   700       acc = vis_faligndata(t7, acc);
       
   701       acc = vis_faligndata(t6, acc);
       
   702       acc = vis_faligndata(t5, acc);
       
   703       acc = vis_faligndata(t4, acc);
       
   704       acc = vis_faligndata(t3, acc);
       
   705       acc = vis_faligndata(t2, acc);
       
   706       acc = vis_faligndata(t1, acc);
       
   707       acc = vis_faligndata(t0, acc);
       
   708       s0 = sp[0];
       
   709       s1 = sp[1];
       
   710       s2 = sp[2];
       
   711       s3 = sp[3];
       
   712       *dp++ = acc;
       
   713     }
       
   714 
       
           /* Flush the group of four pixels that was loaded but not yet
            * stored. */
   715     t7 = VIS_LD_U8_I(tab1, s3);
       
   716     t6 = VIS_LD_U8_I(tab0, s3);
       
   717     t5 = VIS_LD_U8_I(tab1, s2);
       
   718     t4 = VIS_LD_U8_I(tab0, s2);
       
   719     t3 = VIS_LD_U8_I(tab1, s1);
       
   720     t2 = VIS_LD_U8_I(tab0, s1);
       
   721     t1 = VIS_LD_U8_I(tab1, s0);
       
   722     t0 = VIS_LD_U8_I(tab0, s0);
       
   723     acc = vis_faligndata(t7, acc);
       
   724     acc = vis_faligndata(t6, acc);
       
   725     acc = vis_faligndata(t5, acc);
       
   726     acc = vis_faligndata(t4, acc);
       
   727     acc = vis_faligndata(t3, acc);
       
   728     acc = vis_faligndata(t2, acc);
       
   729     acc = vis_faligndata(t1, acc);
       
   730     acc = vis_faligndata(t0, acc);
       
   731     *dp++ = acc;
       
   732   }
       
   733 
       
         /* Leftover pixels (dp has not passed the last destination pixel):
          * sp is moved to the last source pixel and the source is walked
          * backwards, then a partial store is issued under an edge mask
          * computed from dp and dend. */
   734   if ((mlib_addr) dp <= (mlib_addr) dend) {
       
   735 
       
           /* dend - dp counts 16-bit pixels, so after num++ the loop count
            * equals the number of pixels still to be written. */
   736     num = (mlib_u16*) dend - (mlib_u16*) dp;
       
   737     sp  += num;
       
   738     num ++;
       
   739 #pragma pipeloop(0)
       
   740     for (i = 0; i < num; i ++) {
       
   741       s0 = (mlib_s32) *sp;
       
   742       sp --;
       
   743 
       
   744       t0  = VIS_LD_U8_I(tab1, s0);
       
   745       acc = vis_faligndata(t0, acc);
       
   746 
       
   747       t0  = VIS_LD_U8_I(tab0, s0);
       
   748       acc = vis_faligndata(t0, acc);
       
   749     }
       
   750 
       
   751     emask = vis_edge16(dp, dend);
       
   752     vis_pst_16(acc, dp, emask);
       
   753   }
       
   754 }
       
   755 
       
   756 /***************************************************************/
       
   757 void mlib_v_ImageLookUpSI_U8_U8_2_D1_SMALL(const mlib_u8 *src,
       
   758                                            mlib_u8       *dst,
       
   759                                            mlib_s32      xsize,
       
   760                                            const mlib_u8 **table)
       
   761 {
       
         /*
          * One-row lookup used by the small-image path when dst is odd:
          * every source byte s is expanded to (table[0][s], table[1][s]).
          * The first destination byte is written with a scalar store, after
          * which dl is reinterpreted as an mlib_d64 pointer; the dispatcher
          * in this file calls this kernel with (dst & 7) == 7, so dst + 1 is
          * 8-byte aligned.  From then on every 8-byte group starts with the
          * second channel of the pixel carried over in s0.
          * xsize is the number of source pixels and must be at least 1
          * (src[0] is read unconditionally).
          * NOTE(review): acc is used as a vis_faligndata operand before it is
          * ever assigned; presumably every byte that reaches memory has been
          * replaced by then -- confirm against the VIS semantics.
          */
   762   mlib_u8  *sp;                /* pointer to source data */
       
   763   mlib_u32 s0, s1, s2, s3, s4; /* source data */
       
   764   mlib_u8  *dl;                /* pointer to start of destination */
       
   765   mlib_u8  *dend;              /* pointer to end of destination */
       
   766   mlib_d64 *dp;                /* aligned pointer to destination */
       
   767   mlib_d64 t0, t1, t2;         /* destination data */
       
   768   mlib_d64 t3, t4, t5;         /* destination data */
       
   769   mlib_d64 t6, t7, acc;        /* destination data */
       
   770   mlib_s32 emask;              /* edge mask */
       
   771   mlib_s32 i, num;             /* loop variable */
       
   772   const mlib_u8  *tab0 = table[0];
       
   773   const mlib_u8  *tab1 = table[1];
       
   774 
       
   775   sp   = (void *)src;
       
   776   dl   = dst;
       
   777 
       
         /* dend addresses the last destination byte of the row. */
   778   dend = dl + 2 * xsize - 1;
       
   779 
       
         /* Align offset 7: each vis_faligndata(t, acc) below presumably pushes
          * the single byte loaded into t onto the front of acc -- confirm
          * against the VIS instruction set manual. */
   780   vis_alignaddr((void *) 0, 7);
       
   781 
       
         /* Emit channel 0 of the first pixel; its channel 1 is emitted later
          * as the first byte of the next 8-byte group (s0 is kept). */
   782   s0 = *sp++;
       
   783   *dl++ = tab0[s0];
       
   784   dp   = (mlib_d64 *) dl;
       
   785   xsize--;
       
   786 
       
   787   if (xsize >= 4) {
       
   788 
       
           /* Preload the next group of four source pixels. */
   789     s1 = sp[0];
       
   790     s2 = sp[1];
       
   791     s3 = sp[2];
       
   792     s4 = sp[3];
       
   793     sp += 4;
       
   794 
       
           /* Steady state: one 8-byte store per four source pixels.  The group
            * is built from tab1[s0], both channels of s1..s3 and tab0[s4];
            * s4 then becomes the carried pixel s0 of the next group. */
   795 #pragma pipeloop(0)
       
   796     for(i = 0; i <= xsize - 8; i+=4, sp+=4) {
       
   797       t7 = VIS_LD_U8_I(tab0, s4);
       
   798       t6 = VIS_LD_U8_I(tab1, s3);
       
   799       t5 = VIS_LD_U8_I(tab0, s3);
       
   800       t4 = VIS_LD_U8_I(tab1, s2);
       
   801       t3 = VIS_LD_U8_I(tab0, s2);
       
   802       t2 = VIS_LD_U8_I(tab1, s1);
       
   803       t1 = VIS_LD_U8_I(tab0, s1);
       
   804       t0 = VIS_LD_U8_I(tab1, s0);
       
   805       acc = vis_faligndata(t7, acc);
       
   806       acc = vis_faligndata(t6, acc);
       
   807       acc = vis_faligndata(t5, acc);
       
   808       acc = vis_faligndata(t4, acc);
       
   809       acc = vis_faligndata(t3, acc);
       
   810       acc = vis_faligndata(t2, acc);
       
   811       acc = vis_faligndata(t1, acc);
       
   812       acc = vis_faligndata(t0, acc);
       
   813       s0 = s4;
       
   814       s1 = sp[0];
       
   815       s2 = sp[1];
       
   816       s3 = sp[2];
       
   817       s4 = sp[3];
       
   818       *dp++ = acc;
       
   819     }
       
   820 
       
           /* Flush the group of four pixels that was loaded but not yet
            * stored. */
   821     t7 = VIS_LD_U8_I(tab0, s4);
       
   822     t6 = VIS_LD_U8_I(tab1, s3);
       
   823     t5 = VIS_LD_U8_I(tab0, s3);
       
   824     t4 = VIS_LD_U8_I(tab1, s2);
       
   825     t3 = VIS_LD_U8_I(tab0, s2);
       
   826     t2 = VIS_LD_U8_I(tab1, s1);
       
   827     t1 = VIS_LD_U8_I(tab0, s1);
       
   828     t0 = VIS_LD_U8_I(tab1, s0);
       
   829     acc = vis_faligndata(t7, acc);
       
   830     acc = vis_faligndata(t6, acc);
       
   831     acc = vis_faligndata(t5, acc);
       
   832     acc = vis_faligndata(t4, acc);
       
   833     acc = vis_faligndata(t3, acc);
       
   834     acc = vis_faligndata(t2, acc);
       
   835     acc = vis_faligndata(t1, acc);
       
   836     acc = vis_faligndata(t0, acc);
       
   837     s0 = s4;
       
   838     *dp++ = acc;
       
   839   }
       
   840 
       
         /* Tail.  (dend - dp) is twice the number of source pixels not yet
          * read, so num is exactly that count.  sp is moved to the last
          * unread pixel and the source is walked backwards over num pixels.
          * The previous code incremented num instead of stepping sp back,
          * which made the loop start at src[xsize], one byte past the row;
          * that extra pair never reached memory (it is shifted out or masked
          * off), so dropping it does not change the stored bytes. */
   841   num = ((mlib_u8*) dend - (mlib_u8*) dp) >> 1;
       
   842   sp  += num;
       
   843   sp --;
       
   844 
       
   845 #pragma pipeloop(0)
       
   846   for (i = 0; i < num; i ++) {
       
   847     s1 = (mlib_s32) *sp;
       
   848     sp --;
       
   849 
       
   850     t0  = VIS_LD_U8_I(tab1, s1);
       
   851     acc = vis_faligndata(t0, acc);
       
   852 
       
   853     t0  = VIS_LD_U8_I(tab0, s1);
       
   854     acc = vis_faligndata(t0, acc);
       
   855   }
       
   856 
       
         /* Channel 1 of the carried pixel goes in front of the tail bytes;
          * the partial store is limited to dp..dend by the edge mask. */
   857   t0  = VIS_LD_U8_I(tab1, s0);
       
   858   acc = vis_faligndata(t0, acc);
       
   859   emask = vis_edge8(dp, dend);
       
   860   vis_pst_8(acc, dp, emask);
       
   861 }
       
   862 
       
   863 /***************************************************************/
       
   864 void mlib_v_ImageLookUpSI_U8_U8_2(const mlib_u8 *src,
       
   865                                   mlib_s32      slb,
       
   866                                   mlib_u8       *dst,
       
   867                                   mlib_s32      dlb,
       
   868                                   mlib_s32      xsize,
       
   869                                   mlib_s32      ysize,
       
   870                                   const mlib_u8 **table)
       
   871 {
       
         /*
          * Single-input lookup from a U8 image to a 2-channel U8 image:
          * each source byte s produces (table[0][s], table[1][s]).
          * Processes ysize rows of xsize pixels; slb and dlb are added to the
          * source and destination row pointers after every row.
          * Images with fewer than 650 pixels use the *_SMALL kernels, which
          * index table[] directly; larger images first merge the two tables
          * into one 256-entry 16-bit table and use the *_D1 kernels.
          */
   872   if ((xsize * ysize) < 650) {
       
   873     mlib_u8  *sl;
       
   874     mlib_u8  *dl;
       
   875     mlib_s32 i, j;
       
   876 
       
   877     sl = (void *)src;
       
   878     dl = dst;
       
   879 
       
   880     /* row loop */
       
   881     for (j = 0; j < ysize; j ++) {
       
   882       mlib_u8 *sp = sl;
       
   883       mlib_u8 *dp = dl;
       
   884       mlib_s32 off, s0, size = xsize;
       
   885 
       
             /* Number of leading pixels (2 bytes each) to write with scalar
              * code, capped at the row length.  For an even dp this brings dp
              * to an 8-byte boundary; for an odd dp it leaves (dp & 7) == 7. */
   886       off = ((8 - ((mlib_addr)dp & 7)) & 7) >> 1;
       
   887       off = (off < size) ? off : size;
       
   888 
       
   889       for (i = 0; i < off; i++) {
       
   890         s0 = *sp++;
       
   891         *dp++ = table[0][s0];
       
   892         *dp++ = table[1][s0];
       
   893         size--;
       
   894       }
       
   895 
       
   896       if (size > 0) {
       
   897 
       
               /* Even dp: aligned kernel.  Odd dp: kernel that writes one
                * leading byte itself. */
   898         if (((mlib_addr)dp & 1) == 0) {
       
   899           mlib_v_ImageLookUpSI_U8_U8_2_DstA8D1_SMALL(sp, dp, size, table);
       
   900         } else {
       
   901           mlib_v_ImageLookUpSI_U8_U8_2_D1_SMALL(sp, dp, size, table);
       
   902         }
       
   903       }
       
   904 
       
   905       sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
       
   906       dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
       
   907     }
       
   908 
       
   909   } else {
       
   910     mlib_u8  *sl;
       
   911     mlib_u8  *dl;
       
   912     mlib_u16 tab[256];
       
   913     const mlib_u8  *tab0 = table[0];
       
   914     const mlib_u8  *tab1 = table[1];
       
   915     mlib_s32 i, j, s0, s1, s2;
       
   916 
       
           /* Build the merged table: tab[i] = (tab0[i] << 8) + tab1[i], i.e.
            * channel 0 in the high byte and channel 1 in the low byte.  The
            * loop loads entry i while storing entry i-1; entry 255 is stored
            * after the loop. */
   917     s0 = tab0[0];
       
   918     s1 = tab1[0];
       
   919     for (i = 1; i < 256; i++) {
       
   920       s2 = (s0 << 8) + s1;
       
   921       s0 = tab0[i];
       
   922       s1 = tab1[i];
       
   923       tab[i-1] = (mlib_u16)s2;
       
   924     }
       
   925 
       
   926     s2 = (s0 << 8) + s1;
       
   927     tab[255] = (mlib_u16)s2;
       
   928 
       
   929     sl = (void *)src;
       
   930     dl = dst;
       
   931 
       
   932     /* row loop */
       
   933     for (j = 0; j < ysize; j ++) {
       
   934       mlib_u8 *sp = sl;
       
   935       mlib_u8 *dp = dl;
       
             /* This s0 shadows the function-scope s0 used for the table merge
              * above. */
   936       mlib_s32 off, s0, size = xsize;
       
   937 
       
   938       if (((mlib_addr)dp & 1) == 0) {
       
   939 
       
               /* Even destination: store whole 16-bit entries until dp is on
                * an 8-byte boundary (or the row is exhausted). */
   940         off = ((8 - ((mlib_addr)dp & 7)) & 7) >> 1;
       
   941         off = (off < size) ? off : size;
       
   942 
       
   943         for (i = 0; i < off; i++) {
       
   944           *(mlib_u16*)dp = tab[(*sp)];
       
   945           dp += 2;
       
   946           size--; sp++;
       
   947         }
       
   948 
       
   949         if (size > 0) {
       
   950 
       
                 /* Pick the kernel that matches the source address modulo 4. */
   951           off = (mlib_addr)sp & 3;
       
   952 
       
   953           if (off == 0) {
       
   954             mlib_v_ImageLookUpSI_U8_U8_2_SrcOff0_D1(sp, dp, size, tab);
       
   955           } else if (off == 1) {
       
   956             mlib_v_ImageLookUpSI_U8_U8_2_SrcOff1_D1(sp, dp, size, tab);
       
   957           } else if (off == 2) {
       
   958             mlib_v_ImageLookUpSI_U8_U8_2_SrcOff2_D1(sp, dp, size, tab);
       
   959           } else {
       
   960             mlib_v_ImageLookUpSI_U8_U8_2_SrcOff3_D1(sp, dp, size, tab);
       
   961           }
       
   962         }
       
   963 
       
   964       } else {
       
   965 
       
               /* Odd destination: write pixels one byte at a time until the
                * source pointer is 4-byte aligned (or the row is exhausted),
                * then hand the rest to the non-aligned-destination kernel. */
   966         off = ((4 - ((mlib_addr)sp & 3)) & 3);
       
   967         off = (off < size) ? off : size;
       
   968 
       
   969         for (i = 0; i < off; i++) {
       
   970           s0 = tab[(*sp)];
       
   971           *dp++ = (s0 >> 8);
       
   972           *dp++ = (s0 & 0xFF);
       
   973           size--; sp++;
       
   974         }
       
   975 
       
   976         if (size > 0) {
       
   977           mlib_v_ImageLookUpSI_U8_U8_2_DstNonAl_D1(sp, dp, size, tab);
       
   978         }
       
   979       }
       
   980 
       
   981       sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
       
   982       dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
       
   983     }
       
   984   }
       
   985 }
       
   986 
       
   987 /***************************************************************/
       
   988 void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff0_D1(const mlib_u8  *src,
       
   989                                              mlib_u8        *dst,
       
   990                                              mlib_s32       xsize,
       
   991                                              const mlib_d64 *table)
       
   992 {
       
         /*
          * One-row lookup, U8 source -> 3 destination bytes per pixel.
          * table holds one mlib_d64 per source value; the scalar tail loop
          * copies the first three bytes of an entry for each pixel.
          * src is read through an mlib_u32 pointer and dst is written through
          * an mlib_f32 pointer, so both are presumably required to be 4-byte
          * aligned -- confirm against the caller.
          * Four pixels are taken from each source word, most significant
          * byte first.
          */
   993   mlib_u8  *sp;            /* pointer to source data */
       
   994   mlib_u32 *sa;            /* aligned pointer to source data */
       
   995   mlib_u32 s0;             /* source data */
       
   996   mlib_u8  *dl;            /* pointer to start of destination */
       
   997   mlib_f32 *dp;            /* aligned pointer to destination */
       
   998   mlib_d64 t0, t1, t2, t3; /* destination data */
       
   999   mlib_d64 acc0, acc1;     /* destination data */
       
  1000   mlib_s32 i;              /* loop variable */
       
  1001   mlib_u8  *ptr;
       
  1002 
       
  1003   dl   =  dst;
       
  1004   dp   = (mlib_f32 *) dl;
       
  1005   sp = (void *)src;
       
  1006   sa = (mlib_u32*)sp;
       
  1007 
       
         /* Align offset 3, used by every vis_faligndata below; presumably each
          * step appends the three leading bytes of its second operand --
          * confirm against the VIS instruction set manual. */
  1008   vis_alignaddr((void *) 0, 3);
       
  1009 
       
  1010   i = 0;
       
  1011 
       
  1012   if (xsize >= 4) {
       
  1013 
       
  1014     s0 = *sa++;
       
  1015 
       
           /* Steady state: one source word (4 pixels) -> three 32-bit stores
            * (12 bytes); the next source word is loaded inside the loop.
            * Each index expression is 8 * (one byte of the word), i.e. the
            * byte offset of that pixel's mlib_d64 table entry. */
  1016 #pragma pipeloop(0)
       
  1017     for(i = 0; i <= xsize - 8; i+=4, dp+=3) {
       
  1018       t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 21) & 0x7F8 ));
       
  1019       t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 13) & 0x7F8 ));
       
  1020       t2 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
       
  1021       t3 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
       
  1022       acc0 = vis_faligndata(t0, t0);
       
  1023       acc0 = vis_faligndata(acc0, t1);
       
  1024       acc1 = vis_faligndata(acc0, acc0);
       
  1025       acc0 = vis_faligndata(acc0, t2);
       
  1026       acc1 = vis_faligndata(acc1, acc0);
       
  1027       acc0 = vis_faligndata(acc0, t3);
       
  1028       s0 = *sa++;
       
  1029       dp[0] = vis_read_lo(acc1);
       
  1030       dp[1] = vis_read_hi(acc0);
       
  1031       dp[2] = vis_read_lo(acc0);
       
  1032     }
       
  1033 
       
           /* Flush the source word that was loaded but not yet converted. */
  1034     t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 21) & 0x7F8 ));
       
  1035     t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 13) & 0x7F8 ));
       
  1036     t2 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
       
  1037     t3 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
       
  1038     acc0 = vis_faligndata(t0, t0);
       
  1039     acc0 = vis_faligndata(acc0, t1);
       
  1040     acc1 = vis_faligndata(acc0, acc0);
       
  1041     acc0 = vis_faligndata(acc0, t2);
       
  1042     acc1 = vis_faligndata(acc1, acc0);
       
  1043     acc0 = vis_faligndata(acc0, t3);
       
  1044     dp[0] = vis_read_lo(acc1);
       
  1045     dp[1] = vis_read_hi(acc0);
       
  1046     dp[2] = vis_read_lo(acc0);
       
  1047     dp += 3;
       
  1048     i += 4;
       
  1049   }
       
  1050 
       
  1051   dl = (mlib_u8*)dp;
       
  1052 
       
         /* Scalar tail: the remaining pixels (all of them when xsize < 4) are
          * copied three bytes at a time from their table entries. */
  1053 #pragma pipeloop(0)
       
  1054   for (; i < xsize; i++) {
       
  1055     ptr = (mlib_u8*)(table + src[i]);
       
  1056     dl[0] = ptr[0];
       
  1057     dl[1] = ptr[1];
       
  1058     dl[2] = ptr[2];
       
  1059     dl += 3;
       
  1060   }
       
  1061 }
       
  1062 
       
  1063 /***************************************************************/
       
  1064 void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff1_D1(const mlib_u8  *src,
       
  1065                                              mlib_u8        *dst,
       
  1066                                              mlib_s32       xsize,
       
  1067                                              const mlib_d64 *table)
       
  1068 {
       
         /*
          * One-row lookup, U8 source -> 3 destination bytes per pixel, for a
          * source pointer that is one byte past a 32-bit word boundary
          * (presumably guaranteed by the caller -- confirm).
          * table holds one mlib_d64 per source value; the scalar tail loop
          * copies the first three bytes of an entry for each pixel.
          * dst is written through an mlib_f32 pointer, so it is presumably
          * 4-byte aligned.
          * Each group of four pixels is taken from the three low-order bytes
          * of s0 and the top byte of s1.
          * NOTE(review): the source is read as whole words starting at
          * src - 1, so the first load covers one byte before src and later
          * loads may cover up to three bytes past the last pixel.
          */
  1069   mlib_u8  *sp;            /* pointer to source data */
       
  1070   mlib_u32 *sa;            /* aligned pointer to source data */
       
  1071   mlib_u32 s0, s1;         /* source data */
       
  1072   mlib_u8  *dl;            /* pointer to start of destination */
       
  1073   mlib_f32 *dp;            /* aligned pointer to destination */
       
  1074   mlib_d64 t0, t1, t2, t3; /* destination data */
       
  1075   mlib_d64 acc0, acc1;     /* destination data */
       
  1076   mlib_s32 i;              /* loop variable */
       
  1077   mlib_u8  *ptr;
       
  1078 
       
  1079   dl   =  dst;
       
  1080   dp   = (mlib_f32 *) dl;
       
  1081   sp = (void *)src;
       
  1082   sa = (mlib_u32*)(sp - 1);
       
  1083 
       
         /* Align offset 3, used by every vis_faligndata below; presumably each
          * step appends the three leading bytes of its second operand --
          * confirm against the VIS instruction set manual. */
  1084   vis_alignaddr((void *) 0, 3);
       
  1085 
       
  1086   i = 0;
       
         /* Loaded unconditionally; the value is only used when xsize >= 4. */
  1087   s0 = *sa++;
       
  1088 
       
  1089   if (xsize >= 4) {
       
  1090 
       
  1091     s1 = *sa++;
       
  1092 
       
           /* Steady state: 4 pixels -> three 32-bit stores (12 bytes); s1
            * becomes s0 and the next source word is loaded.  Each index
            * expression is 8 * (one byte of the word), i.e. the byte offset of
            * that pixel's mlib_d64 table entry. */
  1093 #pragma pipeloop(0)
       
  1094     for(i = 0; i <= xsize - 8; i+=4, dp+=3) {
       
  1095       t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 13) & 0x7F8 ));
       
  1096       t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
       
  1097       t2 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
       
  1098       t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
       
  1099       acc0 = vis_faligndata(t0, t0);
       
  1100       acc0 = vis_faligndata(acc0, t1);
       
  1101       acc1 = vis_faligndata(acc0, acc0);
       
  1102       acc0 = vis_faligndata(acc0, t2);
       
  1103       acc1 = vis_faligndata(acc1, acc0);
       
  1104       acc0 = vis_faligndata(acc0, t3);
       
  1105       s0 = s1;
       
  1106       s1 = *sa++;
       
  1107       dp[0] = vis_read_lo(acc1);
       
  1108       dp[1] = vis_read_hi(acc0);
       
  1109       dp[2] = vis_read_lo(acc0);
       
  1110     }
       
  1111 
       
           /* Flush the four pixels that were loaded but not yet converted. */
  1112     t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 13) & 0x7F8 ));
       
  1113     t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
       
  1114     t2 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
       
  1115     t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
       
  1116     acc0 = vis_faligndata(t0, t0);
       
  1117     acc0 = vis_faligndata(acc0, t1);
       
  1118     acc1 = vis_faligndata(acc0, acc0);
       
  1119     acc0 = vis_faligndata(acc0, t2);
       
  1120     acc1 = vis_faligndata(acc1, acc0);
       
  1121     acc0 = vis_faligndata(acc0, t3);
       
  1122     dp[0] = vis_read_lo(acc1);
       
  1123     dp[1] = vis_read_hi(acc0);
       
  1124     dp[2] = vis_read_lo(acc0);
       
  1125     dp += 3;
       
  1126     i += 4;
       
  1127   }
       
  1128 
       
  1129   dl = (mlib_u8*)dp;
       
  1130 
       
         /* Scalar tail: the remaining pixels (all of them when xsize < 4) are
          * copied three bytes at a time from their table entries. */
  1131 #pragma pipeloop(0)
       
  1132   for (; i < xsize; i++) {
       
  1133     ptr = (mlib_u8*)(table + src[i]);
       
  1134     dl[0] = ptr[0];
       
  1135     dl[1] = ptr[1];
       
  1136     dl[2] = ptr[2];
       
  1137     dl += 3;
       
  1138   }
       
  1139 }
       
  1140 
       
  1141 /***************************************************************/
       
  1142 void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff2_D1(const mlib_u8  *src,
       
  1143                                              mlib_u8        *dst,
       
  1144                                              mlib_s32       xsize,
       
  1145                                              const mlib_d64 *table)
       
  1146 {
       
         /*
          * One-row lookup, U8 source -> 3 destination bytes per pixel, for a
          * source pointer that is two bytes past a 32-bit word boundary
          * (presumably guaranteed by the caller -- confirm).
          * table holds one mlib_d64 per source value; the scalar tail loop
          * copies the first three bytes of an entry for each pixel.
          * dst is written through an mlib_f32 pointer, so it is presumably
          * 4-byte aligned.
          * Each group of four pixels is taken from the two low-order bytes of
          * s0 and the two high-order bytes of s1.
          * NOTE(review): the source is read as whole words starting at
          * src - 2, so the first load covers two bytes before src and later
          * loads may cover bytes past the last pixel.
          */
  1147   mlib_u8  *sp;            /* pointer to source data */
       
  1148   mlib_u32 *sa;            /* aligned pointer to source data */
       
  1149   mlib_u32 s0, s1;         /* source data */
       
  1150   mlib_u8  *dl;            /* pointer to start of destination */
       
  1151   mlib_f32 *dp;            /* aligned pointer to destination */
       
  1152   mlib_d64 t0, t1, t2, t3; /* destination data */
       
  1153   mlib_d64 acc0, acc1;     /* destination data */
       
  1154   mlib_s32 i;              /* loop variable */
       
  1155   mlib_u8  *ptr;
       
  1156 
       
  1157   dl   =  dst;
       
  1158   dp   = (mlib_f32 *) dl;
       
  1159   sp = (void *)src;
       
  1160   sa = (mlib_u32*)(sp - 2);
       
  1161 
       
         /* Align offset 3, used by every vis_faligndata below; presumably each
          * step appends the three leading bytes of its second operand --
          * confirm against the VIS instruction set manual. */
  1162   vis_alignaddr((void *) 0, 3);
       
  1163 
       
  1164   i = 0;
       
         /* Loaded unconditionally; the value is only used when xsize >= 4. */
  1165   s0 = *sa++;
       
  1166 
       
  1167   if (xsize >= 4) {
       
  1168 
       
  1169     s1 = *sa++;
       
  1170 
       
           /* Steady state: 4 pixels -> three 32-bit stores (12 bytes); s1
            * becomes s0 and the next source word is loaded.  Each index
            * expression is 8 * (one byte of the word), i.e. the byte offset of
            * that pixel's mlib_d64 table entry. */
  1171 #pragma pipeloop(0)
       
  1172     for(i = 0; i <= xsize - 8; i+=4, dp+=3) {
       
  1173       t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
       
  1174       t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
       
  1175       t2 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
       
  1176       t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 13) & 0x7F8 ));
       
  1177       acc0 = vis_faligndata(t0, t0);
       
  1178       acc0 = vis_faligndata(acc0, t1);
       
  1179       acc1 = vis_faligndata(acc0, acc0);
       
  1180       acc0 = vis_faligndata(acc0, t2);
       
  1181       acc1 = vis_faligndata(acc1, acc0);
       
  1182       acc0 = vis_faligndata(acc0, t3);
       
  1183       s0 = s1;
       
  1184       s1 = *sa++;
       
  1185       dp[0] = vis_read_lo(acc1);
       
  1186       dp[1] = vis_read_hi(acc0);
       
  1187       dp[2] = vis_read_lo(acc0);
       
  1188     }
       
  1189 
       
           /* Flush the four pixels that were loaded but not yet converted. */
  1190     t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 >> 5) & 0x7F8 ));
       
  1191     t1 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
       
  1192     t2 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
       
  1193     t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 13) & 0x7F8 ));
       
  1194     acc0 = vis_faligndata(t0, t0);
       
  1195     acc0 = vis_faligndata(acc0, t1);
       
  1196     acc1 = vis_faligndata(acc0, acc0);
       
  1197     acc0 = vis_faligndata(acc0, t2);
       
  1198     acc1 = vis_faligndata(acc1, acc0);
       
  1199     acc0 = vis_faligndata(acc0, t3);
       
  1200     dp[0] = vis_read_lo(acc1);
       
  1201     dp[1] = vis_read_hi(acc0);
       
  1202     dp[2] = vis_read_lo(acc0);
       
  1203     dp += 3;
       
  1204     i += 4;
       
  1205   }
       
  1206 
       
  1207   dl = (mlib_u8*)dp;
       
  1208 
       
         /* Scalar tail: the remaining pixels (all of them when xsize < 4) are
          * copied three bytes at a time from their table entries. */
  1209 #pragma pipeloop(0)
       
  1210   for (; i < xsize; i++) {
       
  1211     ptr = (mlib_u8*)(table + src[i]);
       
  1212     dl[0] = ptr[0];
       
  1213     dl[1] = ptr[1];
       
  1214     dl[2] = ptr[2];
       
  1215     dl += 3;
       
  1216   }
       
  1217 }
       
  1218 
       
  1219 /***************************************************************/
       
  1220 void mlib_v_ImageLookUpSI_U8_U8_3_SrcOff3_D1(const mlib_u8  *src,
       
  1221                                              mlib_u8        *dst,
       
  1222                                              mlib_s32       xsize,
       
  1223                                              const mlib_d64 *table)
       
  1224 {
       
         /*
          * One-row lookup, U8 source -> 3 destination bytes per pixel, for a
          * source pointer that is three bytes past a 32-bit word boundary
          * (presumably guaranteed by the caller -- confirm).
          * table holds one mlib_d64 per source value; the scalar tail loop
          * copies the first three bytes of an entry for each pixel.
          * dst is written through an mlib_f32 pointer, so it is presumably
          * 4-byte aligned.
          * Each group of four pixels is taken from the lowest byte of s0 and
          * the three high-order bytes of s1.
          * NOTE(review): the source is read as whole words starting at
          * src - 3, so the first load covers three bytes before src and later
          * loads may cover bytes past the last pixel.
          */
  1225   mlib_u8  *sp;            /* pointer to source data */
       
  1226   mlib_u32 *sa;            /* aligned pointer to source data */
       
  1227   mlib_u32 s0, s1;         /* source data */
       
  1228   mlib_u8  *dl;            /* pointer to start of destination */
       
  1229   mlib_f32 *dp;            /* aligned pointer to destination */
       
  1230   mlib_d64 t0, t1, t2, t3; /* destination data */
       
  1231   mlib_d64 acc0, acc1;     /* destination data */
       
  1232   mlib_s32 i;              /* loop variable */
       
  1233   mlib_u8  *ptr;
       
  1234 
       
  1235   dl   =  dst;
       
  1236   dp   = (mlib_f32 *) dl;
       
  1237   sp = (void *)src;
       
  1238   sa = (mlib_u32*)(sp - 3);
       
  1239 
       
         /* Align offset 3, used by every vis_faligndata below; presumably each
          * step appends the three leading bytes of its second operand --
          * confirm against the VIS instruction set manual. */
  1240   vis_alignaddr((void *) 0, 3);
       
  1241 
       
  1242   i = 0;
       
         /* Loaded unconditionally; the value is only used when xsize >= 4. */
  1243   s0 = *sa++;
       
  1244 
       
  1245   if (xsize >= 4) {
       
  1246 
       
  1247     s1 = *sa++;
       
  1248 
       
           /* Steady state: 4 pixels -> three 32-bit stores (12 bytes); s1
            * becomes s0 and the next source word is loaded.  Each index
            * expression is 8 * (one byte of the word), i.e. the byte offset of
            * that pixel's mlib_d64 table entry. */
  1249 #pragma pipeloop(0)
       
  1250     for(i = 0; i <= xsize - 8; i+=4, dp+=3) {
       
  1251       t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
       
  1252       t1 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
       
  1253       t2 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 13) & 0x7F8 ));
       
  1254       t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 5) & 0x7F8 ));
       
  1255       acc0 = vis_faligndata(t0, t0);
       
  1256       acc0 = vis_faligndata(acc0, t1);
       
  1257       acc1 = vis_faligndata(acc0, acc0);
       
  1258       acc0 = vis_faligndata(acc0, t2);
       
  1259       acc1 = vis_faligndata(acc1, acc0);
       
  1260       acc0 = vis_faligndata(acc0, t3);
       
  1261       s0 = s1;
       
  1262       s1 = *sa++;
       
  1263       dp[0] = vis_read_lo(acc1);
       
  1264       dp[1] = vis_read_hi(acc0);
       
  1265       dp[2] = vis_read_lo(acc0);
       
  1266     }
       
  1267 
       
           /* Flush the four pixels that were loaded but not yet converted. */
  1268     t0 = *(mlib_d64*)((mlib_u8*)table + ((s0 << 3) & 0x7F8 ));
       
  1269     t1 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 21) & 0x7F8 ));
       
  1270     t2 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 13) & 0x7F8 ));
       
  1271     t3 = *(mlib_d64*)((mlib_u8*)table + ((s1 >> 5) & 0x7F8 ));
       
  1272     acc0 = vis_faligndata(t0, t0);
       
  1273     acc0 = vis_faligndata(acc0, t1);
       
  1274     acc1 = vis_faligndata(acc0, acc0);
       
  1275     acc0 = vis_faligndata(acc0, t2);
       
  1276     acc1 = vis_faligndata(acc1, acc0);
       
  1277     acc0 = vis_faligndata(acc0, t3);
       
  1278     dp[0] = vis_read_lo(acc1);
       
  1279     dp[1] = vis_read_hi(acc0);
       
  1280     dp[2] = vis_read_lo(acc0);
       
  1281     dp += 3;
       
  1282     i += 4;
       
  1283   }
       
  1284 
       
  1285   dl = (mlib_u8*)dp;
       
  1286 
       
         /* Scalar tail: the remaining pixels (all of them when xsize < 4) are
          * copied three bytes at a time from their table entries. */
  1287 #pragma pipeloop(0)
       
  1288   for (; i < xsize; i++) {
       
  1289     ptr = (mlib_u8*)(table + src[i]);
       
  1290     dl[0] = ptr[0];
       
  1291     dl[1] = ptr[1];
       
  1292     dl[2] = ptr[2];
       
  1293     dl += 3;
       
  1294   }
       
  1295 }
       
  1296 
       
  1297 /***************************************************************/
       
  1298 void mlib_v_ImageLookUpSI_U8_U8_3_D1_SMALL(const mlib_u8 *src,
       
  1299                                            mlib_u8       *dst,
       
  1300                                            mlib_s32      xsize,
       
  1301                                            const mlib_u8 **table)
       
  1302 {
       
  1303   mlib_u8  *sp;              /* pointer to source data */
       
  1304   mlib_u8  *dl;              /* pointer to start of destination */
       
  1305   mlib_d64 *dp;              /* aligned pointer to destination */
       
  1306   mlib_d64 t0, t1, t2;       /* destination data */
       
  1307   mlib_d64 t3, t4, t5;       /* destination data */
       
  1308   mlib_d64 t6, t7;           /* destination data */
       
  1309   mlib_d64 acc0, acc1, acc2; /* destination data */
       
  1310   mlib_s32 i;                /* loop variable */
       
  1311   const mlib_u8  *tab0 = table[0];
       
  1312   const mlib_u8  *tab1 = table[1];
       
  1313   const mlib_u8  *tab2 = table[2];
       
  1314   mlib_u32 s00, s01, s02, s03;
       
  1315   mlib_u32 s10, s11, s12, s13;
       
  1316 
       
  1317   sp   = (void *)src;
       
  1318   dl   = dst;
       
  1319   dp   = (mlib_d64 *) dl;
       
  1320 
       
  1321   vis_alignaddr((void *) 0, 7);
       
  1322 
       
  1323   i = 0;
       
  1324 
       
  1325   if (xsize >= 8) {
       
  1326 
       
  1327     s00 = sp[0];
       
  1328     s01 = sp[1];
       
  1329     s02 = sp[2];
       
  1330     s03 = sp[3];
       
  1331     s10 = sp[4];
       
  1332     s11 = sp[5];
       
  1333     s12 = sp[6];
       
  1334     s13 = sp[7];
       
  1335     sp += 8;
       
  1336 
       
  1337 #pragma pipeloop(0)
       
  1338     for(i = 0; i <= xsize - 16; i+=8, sp+=8) {
       
  1339       t7 = VIS_LD_U8_I(tab1, s02);
       
  1340       t6 = VIS_LD_U8_I(tab0, s02);
       
  1341       t5 = VIS_LD_U8_I(tab2, s01);
       
  1342       t4 = VIS_LD_U8_I(tab1, s01);
       
  1343       t3 = VIS_LD_U8_I(tab0, s01);
       
  1344       t2 = VIS_LD_U8_I(tab2, s00);
       
  1345       t1 = VIS_LD_U8_I(tab1, s00);
       
  1346       t0 = VIS_LD_U8_I(tab0, s00);
       
  1347       acc0 = vis_faligndata(t7, acc0);
       
  1348       acc0 = vis_faligndata(t6, acc0);
       
  1349       acc0 = vis_faligndata(t5, acc0);
       
  1350       acc0 = vis_faligndata(t4, acc0);
       
  1351       acc0 = vis_faligndata(t3, acc0);
       
  1352       acc0 = vis_faligndata(t2, acc0);
       
  1353       acc0 = vis_faligndata(t1, acc0);
       
  1354       acc0 = vis_faligndata(t0, acc0);
       
  1355       t7 = VIS_LD_U8_I(tab0, s11);
       
  1356       t6 = VIS_LD_U8_I(tab2, s10);
       
  1357       t5 = VIS_LD_U8_I(tab1, s10);
       
  1358       t4 = VIS_LD_U8_I(tab0, s10);
       
  1359       t3 = VIS_LD_U8_I(tab2, s03);
       
  1360       t2 = VIS_LD_U8_I(tab1, s03);
       
  1361       t1 = VIS_LD_U8_I(tab0, s03);
       
  1362       t0 = VIS_LD_U8_I(tab2, s02);
       
  1363       acc1 = vis_faligndata(t7, acc1);
       
  1364       acc1 = vis_faligndata(t6, acc1);
       
  1365       acc1 = vis_faligndata(t5, acc1);
       
  1366       acc1 = vis_faligndata(t4, acc1);
       
  1367       acc1 = vis_faligndata(t3, acc1);
       
  1368       acc1 = vis_faligndata(t2, acc1);
       
  1369       acc1 = vis_faligndata(t1, acc1);
       
  1370       acc1 = vis_faligndata(t0, acc1);
       
  1371       t7 = VIS_LD_U8_I(tab2, s13);
       
  1372       t6 = VIS_LD_U8_I(tab1, s13);
       
  1373       t5 = VIS_LD_U8_I(tab0, s13);
       
  1374       t4 = VIS_LD_U8_I(tab2, s12);
       
  1375       t3 = VIS_LD_U8_I(tab1, s12);
       
  1376       t2 = VIS_LD_U8_I(tab0, s12);
       
  1377       t1 = VIS_LD_U8_I(tab2, s11);
       
  1378       t0 = VIS_LD_U8_I(tab1, s11);
       
  1379       acc2 = vis_faligndata(t7, acc2);
       
  1380       acc2 = vis_faligndata(t6, acc2);
       
  1381       acc2 = vis_faligndata(t5, acc2);
       
  1382       acc2 = vis_faligndata(t4, acc2);
       
  1383       acc2 = vis_faligndata(t3, acc2);
       
  1384       acc2 = vis_faligndata(t2, acc2);
       
  1385       acc2 = vis_faligndata(t1, acc2);
       
  1386       acc2 = vis_faligndata(t0, acc2);
       
  1387       s00 = sp[0];
       
  1388       s01 = sp[1];
       
  1389       s02 = sp[2];
       
  1390       s03 = sp[3];
       
  1391       s10 = sp[4];
       
  1392       s11 = sp[5];
       
  1393       s12 = sp[6];
       
  1394       s13 = sp[7];
       
  1395       *dp++ = acc0;
       
  1396       *dp++ = acc1;
       
  1397       *dp++ = acc2;
       
  1398     }
       
  1399 
       
  1400     t7 = VIS_LD_U8_I(tab1, s02);
       
  1401     t6 = VIS_LD_U8_I(tab0, s02);
       
  1402     t5 = VIS_LD_U8_I(tab2, s01);
       
  1403     t4 = VIS_LD_U8_I(tab1, s01);
       
  1404     t3 = VIS_LD_U8_I(tab0, s01);
       
  1405     t2 = VIS_LD_U8_I(tab2, s00);
       
  1406     t1 = VIS_LD_U8_I(tab1, s00);
       
  1407     t0 = VIS_LD_U8_I(tab0, s00);
       
  1408     acc0 = vis_faligndata(t7, acc0);
       
  1409     acc0 = vis_faligndata(t6, acc0);
       
  1410     acc0 = vis_faligndata(t5, acc0);
       
  1411     acc0 = vis_faligndata(t4, acc0);
       
  1412     acc0 = vis_faligndata(t3, acc0);
       
  1413     acc0 = vis_faligndata(t2, acc0);
       
  1414     acc0 = vis_faligndata(t1, acc0);
       
  1415     acc0 = vis_faligndata(t0, acc0);
       
  1416     t7 = VIS_LD_U8_I(tab0, s11);
       
  1417     t6 = VIS_LD_U8_I(tab2, s10);
       
  1418     t5 = VIS_LD_U8_I(tab1, s10);
       
  1419     t4 = VIS_LD_U8_I(tab0, s10);
       
  1420     t3 = VIS_LD_U8_I(tab2, s03);
       
  1421     t2 = VIS_LD_U8_I(tab1, s03);
       
  1422     t1 = VIS_LD_U8_I(tab0, s03);
       
  1423     t0 = VIS_LD_U8_I(tab2, s02);
       
  1424     acc1 = vis_faligndata(t7, acc1);
       
  1425     acc1 = vis_faligndata(t6, acc1);
       
  1426     acc1 = vis_faligndata(t5, acc1);
       
  1427     acc1 = vis_faligndata(t4, acc1);
       
  1428     acc1 = vis_faligndata(t3, acc1);
       
  1429     acc1 = vis_faligndata(t2, acc1);
       
  1430     acc1 = vis_faligndata(t1, acc1);
       
  1431     acc1 = vis_faligndata(t0, acc1);
       
  1432     t7 = VIS_LD_U8_I(tab2, s13);
       
  1433     t6 = VIS_LD_U8_I(tab1, s13);
       
  1434     t5 = VIS_LD_U8_I(tab0, s13);
       
  1435     t4 = VIS_LD_U8_I(tab2, s12);
       
  1436     t3 = VIS_LD_U8_I(tab1, s12);
       
  1437     t2 = VIS_LD_U8_I(tab0, s12);
       
  1438     t1 = VIS_LD_U8_I(tab2, s11);
       
  1439     t0 = VIS_LD_U8_I(tab1, s11);
       
  1440     acc2 = vis_faligndata(t7, acc2);
       
  1441     acc2 = vis_faligndata(t6, acc2);
       
  1442     acc2 = vis_faligndata(t5, acc2);
       
  1443     acc2 = vis_faligndata(t4, acc2);
       
  1444     acc2 = vis_faligndata(t3, acc2);
       
  1445     acc2 = vis_faligndata(t2, acc2);
       
  1446     acc2 = vis_faligndata(t1, acc2);
       
  1447     acc2 = vis_faligndata(t0, acc2);
       
  1448     *dp++ = acc0;
       
  1449     *dp++ = acc1;
       
  1450     *dp++ = acc2;
       
  1451     i += 8;
       
  1452   }
       
  1453 
       
  1454   dl = (mlib_u8*)dp;
       
  1455 
       
  1456 #pragma pipeloop(0)
       
  1457   for (; i < xsize; i++) {
       
  1458     s00 = sp[0];
       
  1459     dl[0] = tab0[s00];
       
  1460     dl[1] = tab1[s00];
       
  1461     dl[2] = tab2[s00];
       
  1462     dl += 3; sp ++;
       
  1463   }
       
  1464 }
       
  1465 
       
  1466 /***************************************************************/
       
  1467 void mlib_v_ImageLookUpSI_U8_U8_3(const mlib_u8 *src,
       
  1468                                   mlib_s32      slb,
       
  1469                                   mlib_u8       *dst,
       
  1470                                   mlib_s32      dlb,
       
  1471                                   mlib_s32      xsize,
       
  1472                                   mlib_s32      ysize,
       
  1473                                   const mlib_u8 **table)
       
  1474 {
       
  1475   if ((xsize * ysize) < 650) {
       
  1476     mlib_u8  *sl;
       
  1477     mlib_u8  *dl;
       
  1478     mlib_s32 i, j;
       
  1479     const mlib_u8  *tab0 = table[0];
       
  1480     const mlib_u8  *tab1 = table[1];
       
  1481     const mlib_u8  *tab2 = table[2];
       
  1482 
       
  1483     sl = (void *)src;
       
  1484     dl = dst;
       
  1485 
       
  1486     /* row loop */
       
  1487     for (j = 0; j < ysize; j ++) {
       
  1488       mlib_u8 *sp = sl;
       
  1489       mlib_u8 *dp = dl;
       
  1490       mlib_s32 off, s0, size = xsize;
       
  1491 
       
  1492       off = (mlib_addr)dp & 7;
       
  1493       off = (off * 5) & 7;
       
  1494       off = (off < size) ? off : size;
       
  1495 
       
  1496       for (i = 0; i < off; i++) {
       
  1497         s0 = *sp++;
       
  1498         *dp++ = tab0[s0];
       
  1499         *dp++ = tab1[s0];
       
  1500         *dp++ = tab2[s0];
       
  1501         size--;
       
  1502       }
       
  1503 
       
  1504       if (size > 0) {
       
  1505         mlib_v_ImageLookUpSI_U8_U8_3_D1_SMALL(sp, dp, size, table);
       
  1506       }
       
  1507 
       
  1508       sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
       
  1509       dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
       
  1510     }
       
  1511 
       
  1512   } else {
       
  1513     mlib_u8  *sl;
       
  1514     mlib_u8  *dl;
       
  1515     mlib_u32 tab[512];
       
  1516     const mlib_u8  *tab0 = table[0];
       
  1517     const mlib_u8  *tab1 = table[1];
       
  1518     const mlib_u8  *tab2 = table[2];
       
  1519     mlib_s32 i, j;
       
  1520     mlib_u32 s0, s1, s2, s3;
       
  1521 
       
  1522     s0 = tab0[0];
       
  1523     s1 = tab1[0];
       
  1524     s2 = tab2[0];
       
  1525     for (i = 1; i < 256; i++) {
       
  1526       s3 = (s0 << 24) + (s1 << 16) + (s2 << 8);
       
  1527       s0 = tab0[i];
       
  1528       s1 = tab1[i];
       
  1529       s2 = tab2[i];
       
  1530       tab[2*i-2] = s3;
       
  1531     }
       
  1532 
       
  1533     s3 = (s0 << 24) + (s1 << 16) + (s2 << 8);
       
  1534     tab[510] = s3;
       
  1535 
       
  1536     sl = (void *)src;
       
  1537     dl = dst;
       
  1538 
       
  1539     /* row loop */
       
  1540     for (j = 0; j < ysize; j ++) {
       
  1541       mlib_u8 *sp = sl;
       
  1542       mlib_u8 *dp = dl;
       
  1543       mlib_s32 off, size = xsize;
       
  1544       mlib_u8  *ptr;
       
  1545 
       
  1546       off = ((mlib_addr)dp & 3);
       
  1547       off = (off < size) ? off : size;
       
  1548 
       
  1549 #pragma pipeloop(0)
       
  1550       for (i = 0; i < off; i++) {
       
  1551         ptr = (mlib_u8*)(tab + 2*sp[i]);
       
  1552         dp[0] = ptr[0];
       
  1553         dp[1] = ptr[1];
       
  1554         dp[2] = ptr[2];
       
  1555         dp += 3;
       
  1556       }
       
  1557 
       
  1558       size -= off;
       
  1559       sp += off;
       
  1560 
       
  1561       if (size > 0) {
       
  1562         off = (mlib_addr)sp & 3;
       
  1563 
       
  1564         if (off == 0) {
       
  1565           mlib_v_ImageLookUpSI_U8_U8_3_SrcOff0_D1(sp, dp, size, (mlib_d64*)tab);
       
  1566         } else if (off == 1) {
       
  1567           mlib_v_ImageLookUpSI_U8_U8_3_SrcOff1_D1(sp, dp, size, (mlib_d64*)tab);
       
  1568         } else if (off == 2) {
       
  1569           mlib_v_ImageLookUpSI_U8_U8_3_SrcOff2_D1(sp, dp, size, (mlib_d64*)tab);
       
  1570         } else if (off == 3) {
       
  1571           mlib_v_ImageLookUpSI_U8_U8_3_SrcOff3_D1(sp, dp, size, (mlib_d64*)tab);
       
  1572         }
       
  1573       }
       
  1574 
       
  1575       sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
       
  1576       dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
       
  1577     }
       
  1578   }
       
  1579 }
       
  1580 
       
  1581 /***************************************************************/
       
  1582 void mlib_v_ImageLookUpSI_U8_U8_4_SrcOff0_D1(const mlib_u8  *src,
       
  1583                                              mlib_u8        *dst,
       
  1584                                              mlib_s32       xsize,
       
  1585                                              const mlib_f32 *table)
       
  1586 {
       
  1587   mlib_u32 *sa;          /* aligned pointer to source data */
       
  1588   mlib_u8  *sp;          /* pointer to source data */
       
  1589   mlib_u32 s0;           /* source data */
       
  1590   mlib_f32 *dp;          /* aligned pointer to destination */
       
  1591   mlib_f32 acc0, acc1;   /* destination data */
       
  1592   mlib_f32 acc2, acc3;   /* destination data */
       
  1593   mlib_s32 i;            /* loop variable */
       
  1594   mlib_u32 s00, s01, s02, s03;
       
  1595 
       
  1596   sa   = (mlib_u32*)src;
       
  1597   dp   = (mlib_f32 *) dst;
       
  1598 
       
  1599   i = 0;
       
  1600 
       
  1601   if (xsize >= 4) {
       
  1602 
       
  1603     s0 = *sa++;
       
  1604     s00 = (s0 >> 22) & 0x3FC;
       
  1605     s01 = (s0 >> 14) & 0x3FC;
       
  1606 
       
  1607 #pragma pipeloop(0)
       
  1608     for(i = 0; i <= xsize - 8; i+=4, dp += 4) {
       
  1609       s02 = (s0 >> 6) & 0x3FC;
       
  1610       s03 = (s0 << 2) & 0x3FC;
       
  1611       acc0 = *(mlib_f32*)((mlib_u8*)table + s00);
       
  1612       acc1 = *(mlib_f32*)((mlib_u8*)table + s01);
       
  1613       acc2 = *(mlib_f32*)((mlib_u8*)table + s02);
       
  1614       acc3 = *(mlib_f32*)((mlib_u8*)table + s03);
       
  1615       s0 = *sa++;
       
  1616       s00 = (s0 >> 22) & 0x3FC;
       
  1617       s01 = (s0 >> 14) & 0x3FC;
       
  1618       dp[0] = acc0;
       
  1619       dp[1] = acc1;
       
  1620       dp[2] = acc2;
       
  1621       dp[3] = acc3;
       
  1622     }
       
  1623 
       
  1624     s02 = (s0 >> 6) & 0x3FC;
       
  1625     s03 = (s0 << 2) & 0x3FC;
       
  1626     acc0 = *(mlib_f32*)((mlib_u8*)table + s00);
       
  1627     acc1 = *(mlib_f32*)((mlib_u8*)table + s01);
       
  1628     acc2 = *(mlib_f32*)((mlib_u8*)table + s02);
       
  1629     acc3 = *(mlib_f32*)((mlib_u8*)table + s03);
       
  1630     dp[0] = acc0;
       
  1631     dp[1] = acc1;
       
  1632     dp[2] = acc2;
       
  1633     dp[3] = acc3;
       
  1634     dp += 4;
       
  1635     i += 4;
       
  1636   }
       
  1637 
       
  1638   sp = (mlib_u8*)sa;
       
  1639 
       
  1640   if ( i <= xsize - 2) {
       
  1641     *dp++ = table[sp[0]];
       
  1642     *dp++ = table[sp[1]];
       
  1643     i+=2; sp += 2;
       
  1644   }
       
  1645 
       
  1646   if ( i < xsize) *dp = table[sp[0]];
       
  1647 }
       
  1648 
       
  1649 /***************************************************************/
       
  1650 void mlib_v_ImageLookUpSI_U8_U8_4_DstNonAl_D1(const mlib_u8  *src,
       
  1651                                               mlib_u8        *dst,
       
  1652                                               mlib_s32       xsize,
       
  1653                                               const mlib_f32 *table)
       
  1654 {
       
  1655   mlib_u32 *sa;              /* aligned pointer to source data */
       
  1656   mlib_u8  *sp;              /* pointer to source data */
       
  1657   mlib_u32 s0;               /* source data */
       
  1658   mlib_u8  *dl;              /* pointer to start of destination */
       
  1659   mlib_d64 *dp;              /* aligned pointer to destination */
       
  1660   mlib_d64 acc0, acc1, acc2; /* destination data */
       
  1661   mlib_s32 i;                /* loop variable */
       
  1662   mlib_u8  *dend;            /* pointer to end of destination */
       
  1663   mlib_s32 emask;            /* edge mask */
       
  1664   mlib_s32 off;
       
  1665   mlib_u32 s00, s01, s02, s03;
       
  1666 
       
  1667   sa   = (mlib_u32*)src;
       
  1668   sp = (void *)src;
       
  1669   dl = dst;
       
  1670   dend = dl + (xsize << 2) - 1;
       
  1671   dp   = (mlib_d64 *) ((mlib_addr) dl & (~7));
       
  1672   off  = (mlib_addr) dp - (mlib_addr) dl;
       
  1673   vis_alignaddr(dp, off);
       
  1674 
       
  1675   emask = vis_edge8(dl, dend);
       
  1676   acc0 = vis_freg_pair(table[sp[0]], table[sp[1]]);
       
  1677   vis_pst_8(vis_faligndata(acc0, acc0), dp++, emask);
       
  1678   sp += 2;
       
  1679 
       
  1680   xsize -= 2;
       
  1681 
       
  1682   if (xsize >= 2) {
       
  1683     acc1 = vis_freg_pair(table[sp[0]], table[sp[1]]);
       
  1684     *dp++ = vis_faligndata(acc0, acc1);
       
  1685     acc0 = acc1;
       
  1686     sp += 2; xsize -= 2;
       
  1687   }
       
  1688 
       
  1689   sa++;
       
  1690 
       
  1691   i = 0;
       
  1692 
       
  1693   if (xsize >= 4) {
       
  1694 
       
  1695     s0 = *sa++;
       
  1696     s00 = (s0 >> 22) & 0x3FC;
       
  1697     s01 = (s0 >> 14) & 0x3FC;
       
  1698 
       
  1699 #pragma pipeloop(0)
       
  1700     for(i = 0; i <= xsize - 8; i+=4, dp += 2) {
       
  1701       s02 = (s0 >> 6) & 0x3FC;
       
  1702       s03 = (s0 << 2) & 0x3FC;
       
  1703       acc1 = vis_freg_pair(*(mlib_f32*)((mlib_u8*)table + s00),
       
  1704                            *(mlib_f32*)((mlib_u8*)table + s01));
       
  1705       acc2 = vis_freg_pair(*(mlib_f32*)((mlib_u8*)table + s02),
       
  1706                            *(mlib_f32*)((mlib_u8*)table + s03));
       
  1707       s0 = *sa++;
       
  1708       s00 = (s0 >> 22) & 0x3FC;
       
  1709       s01 = (s0 >> 14) & 0x3FC;
       
  1710       dp[0] = vis_faligndata(acc0, acc1);
       
  1711       dp[1] = vis_faligndata(acc1, acc2);
       
  1712       acc0 = acc2;
       
  1713     }
       
  1714 
       
  1715     s02 = (s0 >> 6) & 0x3FC;
       
  1716     s03 = (s0 << 2) & 0x3FC;
       
  1717     acc1 = vis_freg_pair(*(mlib_f32*)((mlib_u8*)table + s00),
       
  1718                          *(mlib_f32*)((mlib_u8*)table + s01));
       
  1719     acc2 = vis_freg_pair(*(mlib_f32*)((mlib_u8*)table + s02),
       
  1720                          *(mlib_f32*)((mlib_u8*)table + s03));
       
  1721     dp[0] = vis_faligndata(acc0, acc1);
       
  1722     dp[1] = vis_faligndata(acc1, acc2);
       
  1723     acc0 = acc2;
       
  1724     sp = (mlib_u8*)sa;
       
  1725     dp += 2;
       
  1726     i += 4;
       
  1727   }
       
  1728 
       
  1729   if ( i <= xsize - 2) {
       
  1730     acc1 = vis_freg_pair(table[sp[0]], table[sp[1]]);
       
  1731     *dp++ = vis_faligndata(acc0, acc1);
       
  1732     acc0 = acc1;
       
  1733     i+=2; sp += 2;
       
  1734   }
       
  1735 
       
  1736   if ((mlib_addr) dp <= (mlib_addr) dend) {
       
  1737     emask = vis_edge8(dp, dend);
       
  1738     acc1 = vis_freg_pair(table[sp[0]], table[sp[1]]);
       
  1739     vis_pst_8(vis_faligndata(acc0, acc1), dp++, emask);
       
  1740   }
       
  1741 
       
  1742   if ((mlib_addr) dp <= (mlib_addr) dend) {
       
  1743     emask = vis_edge8(dp, dend);
       
  1744     vis_pst_8(vis_faligndata(acc1, acc1), dp++, emask);
       
  1745   }
       
  1746 }
       
  1747 
       
  1748 /***************************************************************/
       
  1749 void mlib_v_ImageLookUpSI_U8_U8_4_DstOff0_D1_SMALL(const mlib_u8 *src,
       
  1750                                                    mlib_u8       *dst,
       
  1751                                                    mlib_s32      xsize,
       
  1752                                                    const mlib_u8 **table)
       
  1753 {
         /*
          * Looks up one row of xsize U8 source pixels into 4-channel U8 output:
          * source byte s becomes tab0[s], tab1[s], tab2[s], tab3[s].  The output
          * is assembled two pixels at a time in acc and stored 8 bytes at once.
          *
          *   src    source row, one byte per pixel; xsize bytes are read
          *   dst    destination row; 4 * xsize bytes are written.  It is stored
          *          through as mlib_d64 *, so presumably 8-byte aligned (the
          *          "DstOff0" of the name) - TODO confirm against the caller
          *   xsize  number of pixels
          *   table  four byte-indexed tables, one per output channel
          *
          * VIS_LD_U8_I is defined outside this view; presumably it loads the
          * single byte tab[s] into a 64-bit register.
          *
          * NOTE(review): acc is read by the first vis_faligndata() before it has
          * ever been assigned.  This looks deliberate - eight one-byte shifts
          * replace every byte of acc before it is stored, and the odd tail only
          * stores the half that its four shifts have filled - but it is formally
          * a read of an uninitialised variable.
          */
       
  1754   mlib_u8  *sp;              /* pointer to source data */
       
  1755   mlib_u32 s0, s1;           /* source data */
       
  1756   mlib_u8 *dl;               /* pointer to start of destination */
       
  1757   mlib_d64 *dp;              /* aligned pointer to destination */
       
  1758   mlib_d64 t0, t1, t2;       /* destination data */
       
  1759   mlib_d64 t3, t4, t5;       /* destination data */
       
  1760   mlib_d64 t6, t7, acc;      /* destination data */
       
  1761   mlib_s32 i;                /* loop variable */
       
  1762   const mlib_u8  *tab0 = table[0];
       
  1763   const mlib_u8  *tab1 = table[1];
       
  1764   const mlib_u8  *tab2 = table[2];
       
  1765   const mlib_u8  *tab3 = table[3];
       
  1766 
       
  1767   sp   = (void *)src;
       
  1768   dl   = dst;
       
  1769   dp   = (mlib_d64 *) dl;
       
  1770 
         /*
          * Alignment offset 7: each vis_faligndata(t, acc) below presumably
          * yields the last byte of t followed by the first seven bytes of acc,
          * i.e. pushes one looked-up byte in front of acc (VIS semantics, not
          * visible here - confirm in the VIS manual).  Bytes are therefore
          * pushed in reverse output order, t7 first and t0 last.
          */
       
  1771   vis_alignaddr((void *) 0, 7);
       
  1772 
       
  1773   if (xsize >= 2) {
       
  1774 
         /* prime the pipeline with the first pixel pair */
       
  1775     s0 = sp[0];
       
  1776     s1 = sp[1];
       
  1777     sp += 2;
       
  1778 
         /* runs only while a further pair exists: that pair is fetched at the
            bottom of the body, before the current result is stored */
       
  1779 #pragma pipeloop(0)
       
  1780     for(i = 0; i <= xsize - 4; i+=2, sp+=2) {
       
  1781       t7 = VIS_LD_U8_I(tab3, s1);
       
  1782       t6 = VIS_LD_U8_I(tab2, s1);
       
  1783       t5 = VIS_LD_U8_I(tab1, s1);
       
  1784       t4 = VIS_LD_U8_I(tab0, s1);
       
  1785       t3 = VIS_LD_U8_I(tab3, s0);
       
  1786       t2 = VIS_LD_U8_I(tab2, s0);
       
  1787       t1 = VIS_LD_U8_I(tab1, s0);
       
  1788       t0 = VIS_LD_U8_I(tab0, s0);
       
  1789       acc = vis_faligndata(t7, acc);
       
  1790       acc = vis_faligndata(t6, acc);
       
  1791       acc = vis_faligndata(t5, acc);
       
  1792       acc = vis_faligndata(t4, acc);
       
  1793       acc = vis_faligndata(t3, acc);
       
  1794       acc = vis_faligndata(t2, acc);
       
  1795       acc = vis_faligndata(t1, acc);
       
  1796       acc = vis_faligndata(t0, acc);
       
  1797       s0 = sp[0];
       
  1798       s1 = sp[1];
       
  1799       *dp++ = acc;
       
  1800     }
       
  1801 
         /* same body once more for the last fetched pair; no further fetch */
       
  1802     t7 = VIS_LD_U8_I(tab3, s1);
       
  1803     t6 = VIS_LD_U8_I(tab2, s1);
       
  1804     t5 = VIS_LD_U8_I(tab1, s1);
       
  1805     t4 = VIS_LD_U8_I(tab0, s1);
       
  1806     t3 = VIS_LD_U8_I(tab3, s0);
       
  1807     t2 = VIS_LD_U8_I(tab2, s0);
       
  1808     t1 = VIS_LD_U8_I(tab1, s0);
       
  1809     t0 = VIS_LD_U8_I(tab0, s0);
       
  1810     acc = vis_faligndata(t7, acc);
       
  1811     acc = vis_faligndata(t6, acc);
       
  1812     acc = vis_faligndata(t5, acc);
       
  1813     acc = vis_faligndata(t4, acc);
       
  1814     acc = vis_faligndata(t3, acc);
       
  1815     acc = vis_faligndata(t2, acc);
       
  1816     acc = vis_faligndata(t1, acc);
       
  1817     acc = vis_faligndata(t0, acc);
       
  1818     *dp++ = acc;
       
  1819   }
       
  1820 
         /* odd pixel count: push the four bytes of the last pixel and store
            only the upper half of acc as one mlib_f32 */
       
  1821   if ((xsize & 1) != 0) {
       
  1822     s0 = sp[0];
       
  1823     t7 = VIS_LD_U8_I(tab3, s0);
       
  1824     t6 = VIS_LD_U8_I(tab2, s0);
       
  1825     t5 = VIS_LD_U8_I(tab1, s0);
       
  1826     t4 = VIS_LD_U8_I(tab0, s0);
       
  1827     acc = vis_faligndata(t7, acc);
       
  1828     acc = vis_faligndata(t6, acc);
       
  1829     acc = vis_faligndata(t5, acc);
       
  1830     acc = vis_faligndata(t4, acc);
       
  1831     *(mlib_f32*)dp = vis_read_hi(acc);
       
  1832   }
       
  1833 }
       
  1834 
       
  1835 /***************************************************************/
       
  1836 void mlib_v_ImageLookUpSI_U8_U8_4_DstOff1_D1_SMALL(const mlib_u8 *src,
       
  1837                                                    mlib_u8       *dst,
       
  1838                                                    mlib_s32      xsize,
       
  1839                                                    const mlib_u8 **table)
       
  1840 {
         /*
          * 4-channel U8 look-up of one row whose output stream starts with
          * tab1[src[0]]: the first pixel contributes only its last three bytes,
          * every following pixel all four (tab0..tab3).  Presumably the caller
          * has already written tab0[src[0]] just before dst - TODO confirm.
          *
          *   src    source row, one byte per pixel; xsize + 1 bytes are read
          *   dst    destination; 4 * xsize + 3 bytes are written.  It is stored
          *          through as mlib_d64 *, so presumably 8-byte aligned - TODO
          *          confirm against the caller
          *   xsize  number of source pixels that follow src[0]
          *   table  four byte-indexed tables, one per output channel
          *
          * VIS_LD_U8_I is defined outside this view; presumably it loads the
          * single byte tab[s] into a 64-bit register.
          *
          * NOTE(review): acc is read by the first vis_faligndata() before it has
          * ever been assigned.  This looks deliberate - the one-byte shifts
          * replace every byte that is later stored - but it is formally a read
          * of an uninitialised variable.
          */
       
  1841   mlib_u8  *sp;              /* pointer to source data */
       
  1842   mlib_u32 s0, s1, s2;       /* source data */
       
  1843   mlib_u8  *dl;              /* pointer to start of destination */
       
  1844   mlib_d64 *dp;              /* aligned pointer to destination */
       
  1845   mlib_d64 t0, t1, t2;       /* destination data */
       
  1846   mlib_d64 t3, t4, t5;       /* destination data */
       
  1847   mlib_d64 t6, t7, acc;      /* destination data */
       
  1848   mlib_s32 i;                /* loop variable */
       
  1849   const mlib_u8  *tab0 = table[0];
       
  1850   const mlib_u8  *tab1 = table[1];
       
  1851   const mlib_u8  *tab2 = table[2];
       
  1852   const mlib_u8  *tab3 = table[3];
       
  1853 
       
  1854   sp   = (void *)src;
       
  1855   dl   = dst;
       
  1856   dp   = (mlib_d64 *) dl;
       
  1857 
         /*
          * Alignment offset 7: each vis_faligndata(t, acc) below presumably
          * pushes the byte held in t in front of acc (VIS semantics, not
          * visible here - confirm in the VIS manual), so bytes are pushed in
          * reverse output order, t7 first and t0 last.
          */
       
  1858   vis_alignaddr((void *) 0, 7);
       
  1859 
         /* s0 is the pixel whose remaining three bytes open the next store */
       
  1860   s0 = *sp++;
       
  1861 
       
  1862   if (xsize >= 2) {
       
  1863 
       
  1864     s1 = sp[0];
       
  1865     s2 = sp[1];
       
  1866     sp += 2;
       
  1867 
         /* one 8-byte store = tab1..3[s0], tab0..3[s1], tab0[s2]; s2 then
            becomes the partially emitted pixel s0 of the next store */
       
  1868 #pragma pipeloop(0)
       
  1869     for(i = 0; i <= xsize - 4; i+=2, sp+=2) {
       
  1870       t7 = VIS_LD_U8_I(tab0, s2);
       
  1871       t6 = VIS_LD_U8_I(tab3, s1);
       
  1872       t5 = VIS_LD_U8_I(tab2, s1);
       
  1873       t4 = VIS_LD_U8_I(tab1, s1);
       
  1874       t3 = VIS_LD_U8_I(tab0, s1);
       
  1875       t2 = VIS_LD_U8_I(tab3, s0);
       
  1876       t1 = VIS_LD_U8_I(tab2, s0);
       
  1877       t0 = VIS_LD_U8_I(tab1, s0);
       
  1878       acc = vis_faligndata(t7, acc);
       
  1879       acc = vis_faligndata(t6, acc);
       
  1880       acc = vis_faligndata(t5, acc);
       
  1881       acc = vis_faligndata(t4, acc);
       
  1882       acc = vis_faligndata(t3, acc);
       
  1883       acc = vis_faligndata(t2, acc);
       
  1884       acc = vis_faligndata(t1, acc);
       
  1885       acc = vis_faligndata(t0, acc);
       
  1886       s0 = s2;
       
  1887       s1 = sp[0];
       
  1888       s2 = sp[1];
       
  1889       *dp++ = acc;
       
  1890     }
       
  1891 
         /* same body once more for the last fetched pair; no further fetch */
       
  1892     t7 = VIS_LD_U8_I(tab0, s2);
       
  1893     t6 = VIS_LD_U8_I(tab3, s1);
       
  1894     t5 = VIS_LD_U8_I(tab2, s1);
       
  1895     t4 = VIS_LD_U8_I(tab1, s1);
       
  1896     t3 = VIS_LD_U8_I(tab0, s1);
       
  1897     t2 = VIS_LD_U8_I(tab3, s0);
       
  1898     t1 = VIS_LD_U8_I(tab2, s0);
       
  1899     t0 = VIS_LD_U8_I(tab1, s0);
       
  1900     acc = vis_faligndata(t7, acc);
       
  1901     acc = vis_faligndata(t6, acc);
       
  1902     acc = vis_faligndata(t5, acc);
       
  1903     acc = vis_faligndata(t4, acc);
       
  1904     acc = vis_faligndata(t3, acc);
       
  1905     acc = vis_faligndata(t2, acc);
       
  1906     acc = vis_faligndata(t1, acc);
       
  1907     acc = vis_faligndata(t0, acc);
       
  1908     s0 = s2;
       
  1909     *dp++ = acc;
       
  1910   }
       
  1911 
       
  1912   dl = (mlib_u8*)dp;
       
  1913 
         /* odd xsize: one more 4-byte store, tab1..3[s0] followed by tab0 of
            the last source pixel, which then becomes s0 */
       
  1914   if ((xsize & 1) != 0) {
       
  1915     s1 = sp[0];
       
  1916     t7 = VIS_LD_U8_I(tab0, s1);
       
  1917     t6 = VIS_LD_U8_I(tab3, s0);
       
  1918     t5 = VIS_LD_U8_I(tab2, s0);
       
  1919     t4 = VIS_LD_U8_I(tab1, s0);
       
  1920     acc = vis_faligndata(t7, acc);
       
  1921     acc = vis_faligndata(t6, acc);
       
  1922     acc = vis_faligndata(t5, acc);
       
  1923     acc = vis_faligndata(t4, acc);
       
  1924     *(mlib_f32*)dl = vis_read_hi(acc);
       
  1925     dl += 4;
       
  1926     s0 = s1;
       
  1927   }
       
  1928 
         /* the last pixel still owes its final three bytes */
       
  1929   dl[0] = tab1[s0];
       
  1930   dl[1] = tab2[s0];
       
  1931   dl[2] = tab3[s0];
       
  1932 }
       
  1933 
       
  1934 /***************************************************************/
       
  1935 void mlib_v_ImageLookUpSI_U8_U8_4_DstOff2_D1_SMALL(const mlib_u8 *src,
       
  1936                                                    mlib_u8       *dst,
       
  1937                                                    mlib_s32      xsize,
       
  1938                                                    const mlib_u8 **table)
       
  1939 {
         /*
          * 4-channel U8 look-up of one row whose output stream starts with
          * tab2[src[0]]: the first pixel contributes only its last two bytes,
          * every following pixel all four (tab0..tab3).  Presumably the caller
          * has already written tab0[src[0]] and tab1[src[0]] just before dst -
          * TODO confirm.
          *
          *   src    source row, one byte per pixel; xsize + 1 bytes are read
          *   dst    destination; 4 * xsize + 2 bytes are written.  It is stored
          *          through as mlib_d64 *, so presumably 8-byte aligned - TODO
          *          confirm against the caller
          *   xsize  number of source pixels that follow src[0]
          *   table  four byte-indexed tables, one per output channel
          *
          * VIS_LD_U8_I is defined outside this view; presumably it loads the
          * single byte tab[s] into a 64-bit register.
          *
          * NOTE(review): acc is read by the first vis_faligndata() before it has
          * ever been assigned.  This looks deliberate - the one-byte shifts
          * replace every byte that is later stored - but it is formally a read
          * of an uninitialised variable.
          */
       
  1940   mlib_u8  *sp;              /* pointer to source data */
       
  1941   mlib_u32 s0, s1, s2;       /* source data */
       
  1942   mlib_u8  *dl;              /* pointer to start of destination */
       
  1943   mlib_d64 *dp;              /* aligned pointer to destination */
       
  1944   mlib_d64 t0, t1, t2;       /* destination data */
       
  1945   mlib_d64 t3, t4, t5;       /* destination data */
       
  1946   mlib_d64 t6, t7, acc;      /* destination data */
       
  1947   mlib_s32 i;                /* loop variable */
       
  1948   const mlib_u8  *tab0 = table[0];
       
  1949   const mlib_u8  *tab1 = table[1];
       
  1950   const mlib_u8  *tab2 = table[2];
       
  1951   const mlib_u8  *tab3 = table[3];
       
  1952 
       
  1953   sp   = (void *)src;
       
  1954   dl   = dst;
       
  1955   dp   = (mlib_d64 *) dl;
       
  1956 
         /*
          * Alignment offset 7: each vis_faligndata(t, acc) below presumably
          * pushes the byte held in t in front of acc (VIS semantics, not
          * visible here - confirm in the VIS manual), so bytes are pushed in
          * reverse output order, t7 first and t0 last.
          */
       
  1957   vis_alignaddr((void *) 0, 7);
       
  1958 
         /* s0 is the pixel whose remaining two bytes open the next store */
       
  1959   s0 = *sp++;
       
  1960 
       
  1961   if (xsize >= 2) {
       
  1962 
       
  1963     s1 = sp[0];
       
  1964     s2 = sp[1];
       
  1965     sp += 2;
       
  1966 
         /* one 8-byte store = tab2..3[s0], tab0..3[s1], tab0..1[s2]; s2 then
            becomes the partially emitted pixel s0 of the next store */
       
  1967 #pragma pipeloop(0)
       
  1968     for(i = 0; i <= xsize - 4; i+=2, sp+=2) {
       
  1969       t7 = VIS_LD_U8_I(tab1, s2);
       
  1970       t6 = VIS_LD_U8_I(tab0, s2);
       
  1971       t5 = VIS_LD_U8_I(tab3, s1);
       
  1972       t4 = VIS_LD_U8_I(tab2, s1);
       
  1973       t3 = VIS_LD_U8_I(tab1, s1);
       
  1974       t2 = VIS_LD_U8_I(tab0, s1);
       
  1975       t1 = VIS_LD_U8_I(tab3, s0);
       
  1976       t0 = VIS_LD_U8_I(tab2, s0);
       
  1977       acc = vis_faligndata(t7, acc);
       
  1978       acc = vis_faligndata(t6, acc);
       
  1979       acc = vis_faligndata(t5, acc);
       
  1980       acc = vis_faligndata(t4, acc);
       
  1981       acc = vis_faligndata(t3, acc);
       
  1982       acc = vis_faligndata(t2, acc);
       
  1983       acc = vis_faligndata(t1, acc);
       
  1984       acc = vis_faligndata(t0, acc);
       
  1985       s0 = s2;
       
  1986       s1 = sp[0];
       
  1987       s2 = sp[1];
       
  1988       *dp++ = acc;
       
  1989     }
       
  1990 
         /* same body once more for the last fetched pair; no further fetch */
       
  1991     t7 = VIS_LD_U8_I(tab1, s2);
       
  1992     t6 = VIS_LD_U8_I(tab0, s2);
       
  1993     t5 = VIS_LD_U8_I(tab3, s1);
       
  1994     t4 = VIS_LD_U8_I(tab2, s1);
       
  1995     t3 = VIS_LD_U8_I(tab1, s1);
       
  1996     t2 = VIS_LD_U8_I(tab0, s1);
       
  1997     t1 = VIS_LD_U8_I(tab3, s0);
       
  1998     t0 = VIS_LD_U8_I(tab2, s0);
       
  1999     acc = vis_faligndata(t7, acc);
       
  2000     acc = vis_faligndata(t6, acc);
       
  2001     acc = vis_faligndata(t5, acc);
       
  2002     acc = vis_faligndata(t4, acc);
       
  2003     acc = vis_faligndata(t3, acc);
       
  2004     acc = vis_faligndata(t2, acc);
       
  2005     acc = vis_faligndata(t1, acc);
       
  2006     acc = vis_faligndata(t0, acc);
       
  2007     s0 = s2;
       
  2008     *dp++ = acc;
       
  2009   }
       
  2010 
       
  2011   dl = (mlib_u8*)dp;
       
  2012 
         /* odd xsize: one more 4-byte store, tab2..3[s0] followed by tab0..1
            of the last source pixel, which then becomes s0 */
       
  2013   if ((xsize & 1) != 0) {
       
  2014     s1 = sp[0];
       
  2015     t7 = VIS_LD_U8_I(tab1, s1);
       
  2016     t6 = VIS_LD_U8_I(tab0, s1);
       
  2017     t5 = VIS_LD_U8_I(tab3, s0);
       
  2018     t4 = VIS_LD_U8_I(tab2, s0);
       
  2019     acc = vis_faligndata(t7, acc);
       
  2020     acc = vis_faligndata(t6, acc);
       
  2021     acc = vis_faligndata(t5, acc);
       
  2022     acc = vis_faligndata(t4, acc);
       
  2023     *(mlib_f32*)dl = vis_read_hi(acc);
       
  2024     dl += 4;
       
  2025     s0 = s1;
       
  2026   }
       
  2027 
         /* the last pixel still owes its final two bytes */
       
  2028   dl[0] = tab2[s0];
       
  2029   dl[1] = tab3[s0];
       
  2030 }
       
  2031 
       
  2032 /***************************************************************/
       
  2033 void mlib_v_ImageLookUpSI_U8_U8_4_DstOff3_D1_SMALL(const mlib_u8 *src,
       
  2034                                                    mlib_u8       *dst,
       
  2035                                                    mlib_s32      xsize,
       
  2036                                                    const mlib_u8 **table)
       
  2037 {
       
  2038   mlib_u8  *sp;              /* pointer to source data */
       
  2039   mlib_u32 s0, s1, s2;       /* source data */
       
  2040   mlib_u8 *dl;               /* pointer to start of destination */
       
  2041   mlib_d64 *dp;              /* aligned pointer to destination */
       
  2042   mlib_d64 t0, t1, t2;       /* destination data */
       
  2043   mlib_d64 t3, t4, t5;       /* destination data */
       
  2044   mlib_d64 t6, t7, acc;      /* destination data */
       
  2045   mlib_s32 i;                /* loop variable */
       
  2046   const mlib_u8  *tab0 = table[0];
       
  2047   const mlib_u8  *tab1 = table[1];
       
  2048   const mlib_u8  *tab2 = table[2];
       
  2049   const mlib_u8  *tab3 = table[3];
       
  2050 
       
  2051   sp   = (void *)src;
       
  2052   dl   = dst;
       
  2053   dp   = (mlib_d64 *) dl;
       
  2054 
       
  2055   vis_alignaddr((void *) 0, 7);
       
  2056 
       
  2057   s0 = *sp++;
       
  2058 
       
  2059   if (xsize >= 2) {
       
  2060 
       
  2061     s1 = sp[0];
       
  2062     s2 = sp[1];
       
  2063     sp += 2;
       
  2064 
       
  2065 #pragma pipeloop(0)
       
  2066     for(i = 0; i <= xsize - 4; i+=2, sp+=2) {
       
  2067       t7 = VIS_LD_U8_I(tab2, s2);
       
  2068       t6 = VIS_LD_U8_I(tab1, s2);
       
  2069       t5 = VIS_LD_U8_I(tab0, s2);
       
  2070       t4 = VIS_LD_U8_I(tab3, s1);
       
  2071       t3 = VIS_LD_U8_I(tab2, s1);
       
  2072       t2 = VIS_LD_U8_I(tab1, s1);
       
  2073       t1 = VIS_LD_U8_I(tab0, s1);
       
  2074       t0 = VIS_LD_U8_I(tab3, s0);
       
  2075       acc = vis_faligndata(t7, acc);
       
  2076       acc = vis_faligndata(t6, acc);
       
  2077       acc = vis_faligndata(t5, acc);
       
  2078       acc = vis_faligndata(t4, acc);
       
  2079       acc = vis_faligndata(t3, acc);
       
  2080       acc = vis_faligndata(t2, acc);
       
  2081       acc = vis_faligndata(t1, acc);
       
  2082       acc = vis_faligndata(t0, acc);
       
  2083       s0 = s2;
       
  2084       s1 = sp[0];
       
  2085       s2 = sp[1];
       
  2086       *dp++ = acc;
       
  2087     }
       
  2088 
       
  2089     t7 = VIS_LD_U8_I(tab2, s2);
       
  2090     t6 = VIS_LD_U8_I(tab1, s2);
       
  2091     t5 = VIS_LD_U8_I(tab0, s2);
       
  2092     t4 = VIS_LD_U8_I(tab3, s1);
       
  2093     t3 = VIS_LD_U8_I(tab2, s1);
       
  2094     t2 = VIS_LD_U8_I(tab1, s1);
       
  2095     t1 = VIS_LD_U8_I(tab0, s1);
       
  2096     t0 = VIS_LD_U8_I(tab3, s0);
       
  2097     acc = vis_faligndata(t7, acc);
       
  2098     acc = vis_faligndata(t6, acc);
       
  2099     acc = vis_faligndata(t5, acc);
       
  2100     acc = vis_faligndata(t4, acc);
       
  2101     acc = vis_faligndata(t3, acc);
       
  2102     acc = vis_faligndata(t2, acc);
       
  2103     acc = vis_faligndata(t1, acc);
       
  2104     acc = vis_faligndata(t0, acc);
       
  2105     s0 = s2;
       
  2106     *dp++ = acc;
       
  2107   }
       
  2108 
       
  2109   dl = (mlib_u8*)dp;
       
  2110 
       
  2111   if ((xsize & 1) != 0) {
       
  2112     s1 = sp[0];
       
  2113     t7 = VIS_LD_U8_I(tab2, s1);
       
  2114     t6 = VIS_LD_U8_I(tab1, s1);
       
  2115     t5 = VIS_LD_U8_I(tab0, s1);
       
  2116     t4 = VIS_LD_U8_I(tab3, s0);
       
  2117     acc = vis_faligndata(t7, acc);
       
  2118     acc = vis_faligndata(t6, acc);
       
  2119     acc = vis_faligndata(t5, acc);
       
  2120     acc = vis_faligndata(t4, acc);
       
  2121     *(mlib_f32*)dl = vis_read_hi(acc);
       
  2122     dl += 4;
       
  2123     s0 = s1;
       
  2124   }
       
  2125 
       
  2126   dl[0] = tab3[s0];
       
  2127 }
       
  2128 
       
  2129 /***************************************************************/
       
  2130 void mlib_v_ImageLookUpSI_U8_U8_4(const mlib_u8 *src,
       
  2131                                   mlib_s32      slb,
       
  2132                                   mlib_u8       *dst,
       
  2133                                   mlib_s32      dlb,
       
  2134                                   mlib_s32      xsize,
       
  2135                                   mlib_s32      ysize,
       
  2136                                   const mlib_u8 **table)
       
  2137 {
       
  2138   if ((xsize * ysize) < 500) {
       
  2139     mlib_u8  *sl;
       
  2140     mlib_u8  *dl;
       
  2141     mlib_s32 j;
       
  2142     const mlib_u8  *tab0 = table[0];
       
  2143     const mlib_u8  *tab1 = table[1];
       
  2144     const mlib_u8  *tab2 = table[2];
       
  2145     const mlib_u8  *tab3 = table[3];
       
  2146 
       
  2147     sl = (void *)src;
       
  2148     dl = dst;
       
  2149 
       
  2150     /* row loop */
       
  2151     for (j = 0; j < ysize; j ++) {
       
  2152       mlib_u8 *sp = sl;
       
  2153       mlib_u8 *dp = dl;
       
  2154       mlib_s32 off, s0, size = xsize;
       
  2155 
       
  2156       off =  (8 - ((mlib_addr)dp & 7)) & 7;
       
  2157 
       
  2158       if ((off >= 4) && (size > 0)) {
       
  2159         s0 = *sp++;
       
  2160         *dp++ = tab0[s0];
       
  2161         *dp++ = tab1[s0];
       
  2162         *dp++ = tab2[s0];
       
  2163         *dp++ = tab3[s0];
       
  2164         size--;
       
  2165       }
       
  2166 
       
  2167       if (size > 0) {
       
  2168         off =  (4 - ((mlib_addr)dp & 3)) & 3;
       
  2169 
       
  2170         if (off == 0) {
       
  2171           mlib_v_ImageLookUpSI_U8_U8_4_DstOff0_D1_SMALL(sp, dp, size, table);
       
  2172         } else if (off == 1) {
       
  2173           s0 = *sp;
       
  2174           *dp++ = tab0[s0];
       
  2175           size--;
       
  2176           mlib_v_ImageLookUpSI_U8_U8_4_DstOff1_D1_SMALL(sp, dp, size, table);
       
  2177         } else if (off == 2) {
       
  2178           s0 = *sp;
       
  2179           *dp++ = tab0[s0];
       
  2180           *dp++ = tab1[s0];
       
  2181           size--;
       
  2182           mlib_v_ImageLookUpSI_U8_U8_4_DstOff2_D1_SMALL(sp, dp, size, table);
       
  2183         } else if (off == 3) {
       
  2184           s0 = *sp;
       
  2185           *dp++ = tab0[s0];
       
  2186           *dp++ = tab1[s0];
       
  2187           *dp++ = tab2[s0];
       
  2188           size--;
       
  2189           mlib_v_ImageLookUpSI_U8_U8_4_DstOff3_D1_SMALL(sp, dp, size, table);
       
  2190         }
       
  2191       }
       
  2192 
       
  2193       sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
       
  2194       dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
       
  2195     }
       
  2196 
       
  2197   } else {
       
  2198     mlib_u8  *sl;
       
  2199     mlib_u8  *dl;
       
  2200     mlib_u32 tab[256];
       
  2201     const mlib_u8  *tab0 = table[0];
       
  2202     const mlib_u8  *tab1 = table[1];
       
  2203     const mlib_u8  *tab2 = table[2];
       
  2204     const mlib_u8  *tab3 = table[3];
       
  2205     mlib_s32 i, j;
       
  2206     mlib_u32 s0, s1, s2, s3, s4;
       
  2207 
       
  2208     s0 = tab0[0];
       
  2209     s1 = tab1[0];
       
  2210     s2 = tab2[0];
       
  2211     s3 = tab3[0];
       
  2212     for (i = 1; i < 256; i++) {
       
  2213       s4 = (s0 << 24) + (s1 << 16) + (s2 << 8) + s3;
       
  2214       s0 = tab0[i];
       
  2215       s1 = tab1[i];
       
  2216       s2 = tab2[i];
       
  2217       s3 = tab3[i];
       
  2218       tab[i-1] = s4;
       
  2219     }
       
  2220 
       
  2221     s4 = (s0 << 24) + (s1 << 16) + (s2 << 8) + s3;
       
  2222     tab[255] = s4;
       
  2223 
       
  2224     sl = (void *)src;
       
  2225     dl = dst;
       
  2226 
       
  2227     /* row loop */
       
  2228     for (j = 0; j < ysize; j ++) {
       
  2229       mlib_u8 *sp = sl;
       
  2230       mlib_u8 *dp = dl;
       
  2231       mlib_s32 off, size = xsize;
       
  2232 
       
  2233       if (((mlib_addr)dp & 3) == 0) {
       
  2234         off = (4 - (mlib_addr)sp & 3) & 3;
       
  2235 
       
  2236         off = (off < size) ? off : size;
       
  2237 
       
  2238 #pragma pipeloop(0)
       
  2239         for (i = 0; i < off; i++) {
       
  2240           *(mlib_u32*)dp = tab[(*sp)];
       
  2241           dp += 4; sp++;
       
  2242         }
       
  2243 
       
  2244         size -= off;
       
  2245 
       
  2246         if (size > 0) {
       
  2247           mlib_v_ImageLookUpSI_U8_U8_4_SrcOff0_D1(sp, dp, size, (mlib_f32*)tab);
       
  2248         }
       
  2249 
       
  2250       } else {
       
  2251 
       
  2252         off = ((4 - ((mlib_addr)sp & 3)) & 3);
       
  2253         off = (off < size) ? off : size;
       
  2254 
       
  2255         for (i = 0; i < off; i++) {
       
  2256           s0 = tab[(*sp)];
       
  2257           *dp++ = (s0 >> 24);
       
  2258           *dp++ = (s0 >> 16);
       
  2259           *dp++ = (s0 >> 8);
       
  2260           *dp++ = s0;
       
  2261           size--; sp++;
       
  2262         }
       
  2263 
       
  2264         if (size > 0) {
       
  2265           mlib_v_ImageLookUpSI_U8_U8_4_DstNonAl_D1(sp, dp, size, (mlib_f32*)tab);
       
  2266         }
       
  2267       }
       
  2268 
       
  2269       sl = (mlib_u8 *) ((mlib_u8 *) sl + slb);
       
  2270       dl = (mlib_u8 *) ((mlib_u8 *) dl + dlb);
       
  2271     }
       
  2272   }
       
  2273 }
       
  2274 
       
  2275 /***************************************************************/