jdk/src/java.desktop/unix/native/libmlib_image/mlib_v_ImageAffine_BL_S16.c
changeset 25859 3317bb8137f4
parent 5506 202f599c92aa
child 38415 acea5f7d354b
equal deleted inserted replaced
25858:836adbf7a2cd 25859:3317bb8137f4
       
     1 /*
       
     2  * Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Oracle designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Oracle in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    22  * or visit www.oracle.com if you need additional information or have any
       
    23  * questions.
       
    24  */
       
    25 
       
    26 
       
    27 
       
    28 /*
       
    29  *      The functions step along the lines from xLeft to xRight and apply
       
    30  *      the bilinear filtering.
       
    31  *
       
    32  */
       
    33 
       
    34 #include "vis_proto.h"
       
    35 #include "mlib_image.h"
       
    36 #include "mlib_ImageColormap.h"
       
    37 #include "mlib_ImageCopy.h"
       
    38 #include "mlib_ImageAffine.h"
       
    39 #include "mlib_v_ImageFilters.h"
       
    40 #include "mlib_v_ImageChannelExtract.h"
       
    41 
       
    42 /*#define MLIB_VIS2*/
       
    43 
       
    44 /***************************************************************/
       
    /* Sample type for this file. It is not referenced directly in the visible
     * code; presumably it is consumed by macros from the included headers. */
    45 #define DTYPE mlib_s16
       
    46 
       
       /* Builds a function name by token pasting, e.g. FUN_NAME(1ch) expands to
        * mlib_ImageAffine_s16_1ch_bl. */
    47 #define FUN_NAME(CHAN) mlib_ImageAffine_s16_##CHAN##_bl
       
    48 
       
    49 /***************************************************************/
       
       /* Forward declarations of the fallbacks that the 2ch and 4ch entry points
        * call when their pointer/stride alignment test fails. */
    50 static mlib_status FUN_NAME(2ch_na)(mlib_affine_param *param);
       
    51 static mlib_status FUN_NAME(4ch_na)(mlib_affine_param *param);
       
    52 
       
    53 /***************************************************************/
       
    /*
     * Expands a 4-bit mask (the array index) into a 64-bit mask with one
     * 16-bit lane per bit: a set bit becomes 0xFFFF, a clear bit 0x0000.
     * Bit 3 of the index maps to the most significant lane, bit 0 to the
     * least significant lane.  The 1ch code reinterprets entries as mlib_d64
     * and uses them to merge new samples into a partially covered 8-byte word.
     */
    54 const mlib_u64 mlib_dmask_arr[] = {
       
    55   0x0000000000000000, 0x000000000000FFFF, 0x00000000FFFF0000, 0x00000000FFFFFFFF,
       
    56   0x0000FFFF00000000, 0x0000FFFF0000FFFF, 0x0000FFFFFFFF0000, 0x0000FFFFFFFFFFFF,
       
    57   0xFFFF000000000000, 0xFFFF00000000FFFF, 0xFFFF0000FFFF0000, 0xFFFF0000FFFFFFFF,
       
    58   0xFFFFFFFF00000000, 0xFFFFFFFF0000FFFF, 0xFFFFFFFFFFFF0000, 0xFFFFFFFFFFFFFFFF
       
    59 };
       
    60 
       
    61 /***************************************************************/
       
    /* Expands to nothing in this file, so every XOR_8000(...) statement in
     * BL_SUM / BL_SUM_3CH is an empty statement. */
    62 #define XOR_8000(x)
       
    63 
       
    64 /***************************************************************/
       
       /* Writes the byte-shuffle mask only when MLIB_VIS2 is defined (it is
        * commented out near the top of this file); otherwise expands to nothing. */
    65 #ifdef MLIB_VIS2
       
    66 #define MLIB_WRITE_BMASK(bmask) vis_write_bmask(bmask, 0)
       
    67 #else
       
    68 #define MLIB_WRITE_BMASK(bmask)
       
    69 #endif
       
    70 
       
    71 /***************************************************************/
       
    /*
     * Local-variable prologue shared by every function in this file.
     * It extends DECLAREVAR0() (defined elsewhere) and requires a variable
     * named `param` in scope.  It adds:
     *   warp_tbl, srcYStride  - copied from *param;
     *   dl, i, size           - destination row pointer, loop index and count;
     *   mask_7fff             - built from 0x7FFF7FFF (presumably 0x7FFF in
     *                           every 16-bit lane; confirm vis_to_double_dup);
     *   dx64/dy64, deltax/deltay, delta1_x/delta1_y - packed weights and steps;
     *   s0..s3, d0..d3, dd    - source samples, temporaries and the result.
     * The expansion ends without a semicolon; the caller supplies it.
     */
    72 #undef  DECLAREVAR
       
    73 #define DECLAREVAR()                                            \
       
    74   DECLAREVAR0();                                                \
       
    75   mlib_s32  *warp_tbl   = param -> warp_tbl;                    \
       
    76   mlib_s32  srcYStride = param -> srcYStride;                   \
       
    77   mlib_u8   *dl;                                                \
       
    78   mlib_s32  i, size;                                            \
       
    79   /*mlib_d64  mask_8000 = vis_to_double_dup(0x80008000);*/      \
       
    80   mlib_d64  mask_7fff = vis_to_double_dup(0x7FFF7FFF);          \
       
    81   mlib_d64  dx64, dy64, deltax, deltay, delta1_x, delta1_y;     \
       
    82   mlib_d64  s0, s1, s2, s3;                                     \
       
    83   mlib_d64  d0, d1, d2, d3, dd
       
    84 
       
    85 /***************************************************************/
       
    86 
       
    87 /* arguments (x, y) are swapped to prevent overflow */
       
       /* Adds the two partial products vis_fmul8sux16(y, x) and
        * vis_fmul8ulx16(y, x) lane-wise.  NOTE(review): presumably the usual VIS
        * idiom for a 16x16 multiply that keeps the upper 16 bits of each lane
        * product; confirm against the VIS manual.  Both arguments are expanded
        * twice, so pass only side-effect-free expressions. */
    88 #define FMUL_16x16(x, y)                        \
       
    89   vis_fpadd16(vis_fmul8sux16(y, x),             \
       
    90               vis_fmul8ulx16(y, x))
       
    91 
       
    92 /***************************************************************/
       
    /* Number of mlib_d64 entries in the on-stack scratch row used by the
     * 2ch_na and 3ch functions; longer rows use mlib_malloc instead. */
    93 #define BUF_SIZE  512
       
    94 
       
    95 /***************************************************************/
       
    /*
     * Packs four integers into one mlib_d64 via vis_to_double(hi32, lo32).
     * Each argument is masked with 0xFFFE and placed so that its 16-bit lane
     * holds (x >> 1) & 0x7FFF, i.e. bits 1..15 of the argument as a 15-bit
     * value.  x0/x1 form the first 32-bit word, x2/x3 the second.
     */
    96 #define DOUBLE_4U16(x0, x1, x2, x3)                                 \
       
    97   vis_to_double(((((x0) & 0xFFFE) << 15) | (((x1) & 0xFFFE) >> 1)), \
       
    98                 ((((x2) & 0xFFFE) << 15) | (((x3) & 0xFFFE) >> 1)))
       
    99 
       
   100 /***************************************************************/
       
   /*
    * Bilinear blend on packed 16-bit lanes.  Operates on the locals declared
    * by DECLAREVAR():
    *   in : s0 (top-left), s1 (top-right), s2 (bottom-left), s3 (bottom-right)
    *        as loaded by the LOAD_* macros; deltax/deltay (15-bit weights);
    *        dx64/dy64 (per-iteration weight steps).
    *   out: dd (blended result); deltax/deltay advanced and masked to 15 bits.
    * Sequence:
    *   delta1 = 0x7FFF - delta                       (complementary weights)
    *   d0 = 2*(s0*delta1_x + s1*deltax) * delta1_y   (top row)
    *   d2 = 2*(s2*delta1_x + s3*deltax) * deltay     (bottom row)
    *   dd = 2*(d0 + d2)
    * NOTE(review): the doublings presumably compensate for the 15-bit weight
    * scale used with FMUL_16x16; confirm.  XOR_8000 is empty in this file.
    * The expansion ends without a semicolon; the caller supplies it.
    */
   101 #define BL_SUM()                                                \
       
   102   XOR_8000(s0);                                                 \
       
   103   XOR_8000(s1);                                                 \
       
   104   XOR_8000(s2);                                                 \
       
   105   XOR_8000(s3);                                                 \
       
   106                                                                 \
       
   107   delta1_x = vis_fpsub16(mask_7fff, deltax);                    \
       
   108   delta1_y = vis_fpsub16(mask_7fff, deltay);                    \
       
   109                                                                 \
       
   110   d0 = FMUL_16x16(s0, delta1_x);                                \
       
   111   d1 = FMUL_16x16(s1, deltax);                                  \
       
   112   d0 = vis_fpadd16(d0, d1);                                     \
       
   113   d0 = vis_fpadd16(d0, d0);                                     \
       
   114   d0 = FMUL_16x16(d0, delta1_y);                                \
       
   115                                                                 \
       
   116   d2 = FMUL_16x16(s2, delta1_x);                                \
       
   117   d3 = FMUL_16x16(s3, deltax);                                  \
       
   118   d2 = vis_fpadd16(d2, d3);                                     \
       
   119   d2 = vis_fpadd16(d2, d2);                                     \
       
   120   d2 = FMUL_16x16(d2, deltay);                                  \
       
   121                                                                 \
       
   122   dd = vis_fpadd16(d0, d2);                                     \
       
   123   dd = vis_fpadd16(dd, dd);                                     \
       
   124   XOR_8000(dd);                                                 \
       
   125                                                                 \
       
   126   deltax = vis_fpadd16(deltax, dx64);                           \
       
   127   deltay = vis_fpadd16(deltay, dy64);                           \
       
   128   deltax = vis_fand(deltax, mask_7fff);                         \
       
   129   deltay = vis_fand(deltay, mask_7fff)
       
   130 
       
   131 /***************************************************************/
       
   /*
    * Bilinear blend variant used by the 3-channel function.  Unlike BL_SUM it
    * blends vertically first and horizontally last:
    *   d0 = 2*(s0*delta1_y + s2*deltay) * delta1_x   (left column)
    *   d1 = 2*(s1*delta1_y + s3*deltay) * deltax     (right column)
    * d0 is then passed through vis_faligndata(d0, d0) with the alignment
    * offset set to 2 before it is added to d1; dd = 2*(d0 + d1).
    * NOTE(review): the 2-byte realignment presumably lines the left pixel's
    * lanes up with the right pixel's lanes, because the 3ch loader reads from
    * 2 bytes before the pixel; confirm.
    * Side effect: calls vis_alignaddr, so the alignment state set by the
    * caller is overwritten; the 3ch loop re-sets it on every iteration.
    * Also advances deltax/deltay by dx64/dy64 and masks them to 15 bits.
    */
   132 #define BL_SUM_3CH()                                            \
       
   133   XOR_8000(s0);                                                 \
       
   134   XOR_8000(s1);                                                 \
       
   135   XOR_8000(s2);                                                 \
       
   136   XOR_8000(s3);                                                 \
       
   137                                                                 \
       
   138   delta1_x = vis_fpsub16(mask_7fff, deltax);                    \
       
   139   delta1_y = vis_fpsub16(mask_7fff, deltay);                    \
       
   140                                                                 \
       
   141   d0 = FMUL_16x16(s0, delta1_y);                                \
       
   142   d2 = FMUL_16x16(s2, deltay);                                  \
       
   143   d0 = vis_fpadd16(d0, d2);                                     \
       
   144   d0 = vis_fpadd16(d0, d0);                                     \
       
   145   d0 = FMUL_16x16(d0, delta1_x);                                \
       
   146                                                                 \
       
   147   d1 = FMUL_16x16(s1, delta1_y);                                \
       
   148   d3 = FMUL_16x16(s3, deltay);                                  \
       
   149   d1 = vis_fpadd16(d1, d3);                                     \
       
   150   d1 = vis_fpadd16(d1, d1);                                     \
       
   151   d1 = FMUL_16x16(d1, deltax);                                  \
       
   152                                                                 \
       
   153   vis_alignaddr((void*)0, 2);                                   \
       
   154   d0 = vis_faligndata(d0, d0);                                  \
       
   155   dd = vis_fpadd16(d0, d1);                                     \
       
   156   dd = vis_fpadd16(dd, dd);                                     \
       
   157   XOR_8000(dd);                                                 \
       
   158                                                                 \
       
   159   deltax = vis_fpadd16(deltax, dx64);                           \
       
   160   deltay = vis_fpadd16(deltay, dy64);                           \
       
   161   deltax = vis_fand(deltax, mask_7fff);                         \
       
   162   deltay = vis_fand(deltay, mask_7fff)
       
   163 
       
   164 /***************************************************************/
       
   /* Loads through vis_ld_u16 from `sp + ind`.  Every caller in this file
    * passes an mlib_u8 pointer, so `ind` is a byte offset.  The arguments are
    * not parenthesized in the expansion. */
   165 #define LD_U16(sp, ind) vis_ld_u16(sp + ind)
       
   166 
       
   167 /***************************************************************/
       
   /*
    * Gathers the 2x2 neighbourhoods of four single-channel pixels (byte
    * pointers sp0..sp3) into s0..s3:
    *   s0 <- sample at the pointer          s1 <- sample 2 bytes to the right
    *   s2 <- sample srcYStride bytes below  s3 <- below and 2 bytes right
    * Non-VIS2 variant: each register is built with four vis_faligndata steps
    * that insert the samples of sp3, sp2, sp1 and sp0 in that order.  It
    * relies on the alignment offset the caller sets with
    * vis_alignaddr((void*)0, 6).  NOTE(review): the resulting lane order is
    * presumably sp0, sp1, sp2, sp3 from most to least significant; confirm.
    * VIS2 variant: uses vis_bshuffle and temporaries t0..t3 that the caller
    * must declare; the shuffle mask is written by the caller.
    */
   168 #ifndef MLIB_VIS2
       
   169 
       
   170 #define LOAD_1CH()                                              \
       
   171   s0 = vis_faligndata(LD_U16(sp3, 0), mask_7fff);               \
       
   172   s1 = vis_faligndata(LD_U16(sp3, 2), mask_7fff);               \
       
   173   s2 = vis_faligndata(LD_U16(sp3, srcYStride), mask_7fff);      \
       
   174   s3 = vis_faligndata(LD_U16(sp3, srcYStride + 2), mask_7fff);  \
       
   175                                                                 \
       
   176   s0 = vis_faligndata(LD_U16(sp2, 0), s0);                      \
       
   177   s1 = vis_faligndata(LD_U16(sp2, 2), s1);                      \
       
   178   s2 = vis_faligndata(LD_U16(sp2, srcYStride), s2);             \
       
   179   s3 = vis_faligndata(LD_U16(sp2, srcYStride + 2), s3);         \
       
   180                                                                 \
       
   181   s0 = vis_faligndata(LD_U16(sp1, 0), s0);                      \
       
   182   s1 = vis_faligndata(LD_U16(sp1, 2), s1);                      \
       
   183   s2 = vis_faligndata(LD_U16(sp1, srcYStride), s2);             \
       
   184   s3 = vis_faligndata(LD_U16(sp1, srcYStride + 2), s3);         \
       
   185                                                                 \
       
   186   s0 = vis_faligndata(LD_U16(sp0, 0), s0);                      \
       
   187   s1 = vis_faligndata(LD_U16(sp0, 2), s1);                      \
       
   188   s2 = vis_faligndata(LD_U16(sp0, srcYStride), s2);             \
       
   189   s3 = vis_faligndata(LD_U16(sp0, srcYStride + 2), s3)
       
   190 
       
   191 #else
       
   192 
       
   193 #define LOAD_1CH()                                                             \
       
   194   s0 = vis_bshuffle(LD_U16(sp0, 0), LD_U16(sp2, 0));                           \
       
   195   s1 = vis_bshuffle(LD_U16(sp0, 2), LD_U16(sp2, 2));                           \
       
   196   s2 = vis_bshuffle(LD_U16(sp0, srcYStride), LD_U16(sp2, srcYStride));         \
       
   197   s3 = vis_bshuffle(LD_U16(sp0, srcYStride + 2), LD_U16(sp2, srcYStride + 2)); \
       
   198                                                                                \
       
   199   t0 = vis_bshuffle(LD_U16(sp1, 0), LD_U16(sp3, 0));                           \
       
   200   t1 = vis_bshuffle(LD_U16(sp1, 2), LD_U16(sp3, 2));                           \
       
   201   t2 = vis_bshuffle(LD_U16(sp1, srcYStride), LD_U16(sp3, srcYStride));         \
       
   202   t3 = vis_bshuffle(LD_U16(sp1, srcYStride + 2), LD_U16(sp3, srcYStride + 2)); \
       
   203                                                                                \
       
   204   s0 = vis_bshuffle(s0, t0);                                                   \
       
   205   s1 = vis_bshuffle(s1, t1);                                                   \
       
   206   s2 = vis_bshuffle(s2, t2);                                                   \
       
   207   s3 = vis_bshuffle(s3, t3)
       
   208 
       
   209 #endif
       
   210 
       
   211 /***************************************************************/
       
   /*
    * 1-channel version.  Sets `sp` to the source sample for the current
    * (X, Y): the row pointer is read from lineAddr at byte offset PTR_SHIFT(Y)
    * and advanced by 2 bytes per integer X step (X >> MLIB_SHIFT).  X and Y
    * are then stepped by dX and dY.  The macro expands to three statements
    * and is not wrapped in do/while, so use it only as a full statement
    * inside braces.
    */
   212 #define GET_POINTER(sp)                                                       \
       
   213   sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 2*(X >> MLIB_SHIFT); \
       
   214   X += dX;                                                                    \
       
   215   Y += dY
       
   216 
       
   217 /***************************************************************/
       
   /*
    * 1-channel version.  When a warp table is present, reloads dX and dY for
    * row j from warp_tbl[2*j] and warp_tbl[2*j + 1], then rebuilds dx64/dy64
    * from the low 16 bits of (d << 1), replicated into both halves of the
    * 32-bit argument.  The << 1 matches four pixels per packed iteration,
    * with weights stored as (X >> 1) by DOUBLE_4U16.
    * NOTE(review): presumably invoked from NEW_LINE(), which is defined
    * elsewhere; confirm.
    */
   218 #undef  PREPARE_DELTAS
       
   219 #define PREPARE_DELTAS                                                             \
       
   220   if (warp_tbl != NULL) {                                                          \
       
   221     dX = warp_tbl[2*j    ];                                                        \
       
   222     dY = warp_tbl[2*j + 1];                                                        \
       
   223     dx64 = vis_to_double_dup((((dX << 1) & 0xFFFF) << 16) | ((dX << 1) & 0xFFFF)); \
       
   224     dy64 = vis_to_double_dup((((dY << 1) & 0xFFFF) << 16) | ((dY << 1) & 0xFFFF)); \
       
   225   }
       
   226 
       
   227 /***************************************************************/
       
   /*
    * mlib_ImageAffine_s16_1ch_bl: bilinear affine resampling for one-channel
    * 16-bit images.
    *
    * param - affine state; its fields are unpacked by DECLAREVAR().
    * Returns MLIB_SUCCESS.
    *
    * For every row j in [yStart, yFinish], NEW_LINE(1) (defined elsewhere) is
    * expected to set dl, size, X and Y.  Each row is written in 8-byte words
    * of four samples:
    *   1. a masked head store when dl is not 8-byte aligned,
    *   2. full aligned stores,
    *   3. a masked tail store for the remaining 1..3 samples.
    * Masked stores merge the new lanes with the existing destination word so
    * samples outside the row are preserved.
    */
   228 mlib_status FUN_NAME(1ch)(mlib_affine_param *param)
       
   229 {
       
   230   DECLAREVAR();
       
   231   mlib_s32 off;
       
   232   mlib_s32 x0, x1, x2, x3, y0, y1, y2, y3;
       
   233 #ifdef MLIB_VIS2
       
   234   mlib_d64 t0, t1, t2, t3;
       
   235   vis_write_bmask(0x45CD67EF, 0);
       
   236 #else
       
         /* LOAD_1CH() depends on this alignment offset. */
   237   vis_alignaddr((void*)0, 6);
       
   238 #endif
       
   239 
       
         /* Weight step for one packed iteration (four pixels). */
   240   dx64 = vis_to_double_dup((((dX << 1) & 0xFFFF) << 16) | ((dX << 1) & 0xFFFF));
       
   241   dy64 = vis_to_double_dup((((dY << 1) & 0xFFFF) << 16) | ((dY << 1) & 0xFFFF));
       
   242 
       
   243   for (j = yStart; j <= yFinish; j++) {
       
   244     mlib_u8  *sp0, *sp1, *sp2, *sp3;
       
   245     mlib_d64 *dp, dmask;
       
   246 
       
   247     NEW_LINE(1);
       
   248 
       
           /* dp = 8-byte aligned word containing dl; off = leading samples
            * in that word that lie before dl (0..3). */
   249     off = (mlib_s32)dl & 7;
       
   250     dp = (mlib_d64*)(dl - off);
       
   251     off >>= 1;
       
   252 
       
           /* Back the coordinates up by `off` pixels so that lane k of the
            * packed weights corresponds to sample k of the aligned word. */
   253     x0 = X - off*dX; y0 = Y - off*dY;
       
   254     x1 = x0 + dX;    y1 = y0 + dY;
       
   255     x2 = x1 + dX;    y2 = y1 + dY;
       
   256     x3 = x2 + dX;    y3 = y2 + dY;
       
   257 
       
   258     deltax = DOUBLE_4U16(x0, x1, x2, x3);
       
   259     deltay = DOUBLE_4U16(y0, y1, y2, y3);
       
   260 
       
   261     if (off) {
       
             /* Head: only the last (4 - off) lanes of the first word belong
              * to this row, and possibly fewer if the row is shorter. */
   262       mlib_s32 emask = vis_edge16((void*)(2*off), (void*)(2*(off + size - 1)));
       
   263 
       
   264       off = 4 - off;
       
   265       GET_POINTER(sp3);
       
   266       sp0 = sp1 = sp2 = sp3;
       
   267 
       
   268       if (off > 1 && size > 1) {
       
   269         GET_POINTER(sp3);
       
   270       }
       
   271 
       
   272       if (off > 2) {
       
   273         sp2 = sp3;
       
   274 
       
   275         if (size > 2) {
       
   276           GET_POINTER(sp3);
       
   277         }
       
   278       }
       
   279 
       
   280       LOAD_1CH();
       
   281       BL_SUM();
       
   282 
       
   283       dmask = ((mlib_d64*)mlib_dmask_arr)[emask];
       
             /* Store first, advance afterwards: incrementing dp in the same
              * expression that also reads dp[0] is unsequenced, which is
              * undefined behaviour in C. */
   284       dp[0] = vis_for (vis_fand(dmask, dd), vis_fandnot(dmask, dp[0]));
       
             dp++;
       
   285 
       
   286       size -= off;
       
   287 
       
   288       if (size < 0) size = 0;
       
   289     }
       
   290 
       
           /* Body: four samples per aligned 8-byte store. */
   291 #pragma pipeloop(0)
       
   292     for (i = 0; i < size/4; i++) {
       
   293       GET_POINTER(sp0);
       
   294       GET_POINTER(sp1);
       
   295       GET_POINTER(sp2);
       
   296       GET_POINTER(sp3);
       
   297 
       
   298       LOAD_1CH();
       
   299       BL_SUM();
       
   300 
       
   301       dp[i] = dd;
       
   302     }
       
   303 
       
           /* Tail: 1..3 samples left; i still indexes the next word. */
   304     off = size & 3;
       
   305 
       
   306     if (off) {
       
   307       GET_POINTER(sp0);
       
   308       sp1 = sp2 = sp3 = sp0;
       
   309 
       
   310       if (off > 1) {
       
   311         GET_POINTER(sp1);
       
   312       }
       
   313 
       
   314       if (off > 2) {
       
   315         GET_POINTER(sp2);
       
   316       }
       
   317 
       
   318       LOAD_1CH();
       
   319       BL_SUM();
       
   320 
       
             /* (0xF0 >> off) & 0x0F selects the top `off` lanes. */
   321       dmask = ((mlib_d64*)mlib_dmask_arr)[(0xF0 >> off) & 0x0F];
       
   322       dp[i] = vis_for (vis_fand(dmask, dd), vis_fandnot(dmask, dp[i]));
       
   323     }
       
   324   }
       
   325 
       
   326   return MLIB_SUCCESS;
       
   327 }
       
   328 
       
   329 /***************************************************************/
       
   /*
    * Aligned 2-channel version.  The row pointer is treated as mlib_f32*, so
    * adding (X >> MLIB_SHIFT) advances by one 4-byte element per integer X
    * step (two 16-bit channels per pixel).  X and Y are then stepped by dX
    * and dY.  Expands to three statements; use only as a full statement.
    */
   330 #undef  GET_POINTER
       
   331 #define GET_POINTER(sp)                                                      \
       
   332   sp = *(mlib_f32**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + (X >> MLIB_SHIFT); \
       
   333   X += dX;                                                                   \
       
   334   Y += dY
       
   335 
       
   336 /***************************************************************/
       
   /*
    * Pairs the 2x2 neighbourhoods of two 2-channel pixels (mlib_f32 pointers
    * sp0, sp1) into s0..s3 with vis_freg_pair: s0 = the pixels themselves,
    * s1 = right neighbours, s2 = pixels one row below, s3 = below-right.
    * srcYStride is used as an element index here; the 2ch caller has already
    * divided it by 4.
    */
   337 #define LOAD_2CH()                                              \
       
   338   s0 = vis_freg_pair(sp0[0], sp1[0]);                           \
       
   339   s1 = vis_freg_pair(sp0[1], sp1[1]);                           \
       
   340   s2 = vis_freg_pair(sp0[srcYStride], sp1[srcYStride]);         \
       
   341   s3 = vis_freg_pair(sp0[srcYStride + 1], sp1[srcYStride + 1])
       
   342 
       
   343 /***************************************************************/
       
   /*
    * 2-channel version.  When a warp table is present, reloads dX and dY for
    * row j and rebuilds dx64/dy64 from the low 16 bits of d itself (no
    * shift).  That matches two pixels per packed iteration, with weights
    * stored as (X >> 1) by DOUBLE_4U16.
    * NOTE(review): presumably invoked from NEW_LINE(), which is defined
    * elsewhere; confirm.
    */
   344 #undef  PREPARE_DELTAS
       
   345 #define PREPARE_DELTAS                                               \
       
   346   if (warp_tbl != NULL) {                                            \
       
   347     dX = warp_tbl[2*j    ];                                          \
       
   348     dY = warp_tbl[2*j + 1];                                          \
       
   349     dx64 = vis_to_double_dup(((dX & 0xFFFF) << 16) | (dX & 0xFFFF)); \
       
   350     dy64 = vis_to_double_dup(((dY & 0xFFFF) << 16) | (dY & 0xFFFF)); \
       
   351   }
       
   352 
       
   353 /***************************************************************/
       
   /*
    * mlib_ImageAffine_s16_2ch_bl: bilinear affine resampling for two-channel
    * 16-bit images; fast path for 4-byte aligned data.
    *
    * param - affine state; its fields are unpacked by DECLAREVAR().
    * Returns MLIB_SUCCESS, or the result of the _2ch_na fallback.
    *
    * Two pixels (four 16-bit lanes) are produced per 8-byte store.  A row
    * whose destination starts in the second half of an aligned word gets a
    * one-pixel head store, and an odd remainder gets a one-pixel tail store.
    */
   354 mlib_status FUN_NAME(2ch)(mlib_affine_param *param)
       
   355 {
       
   356   DECLAREVAR();
       
   357   mlib_s32 off;
       
   358   mlib_s32 x0, x1, y0, y1;
       
   359 
       
         /* Any of the first source row pointer, destination base or strides not
          * a multiple of 4: use the byte-wise variant.
          * NOTE(review): the pointers are cast to mlib_s32; only the low two
          * bits are tested here. */
   360   if (((mlib_s32)lineAddr[0] | (mlib_s32)dstData | srcYStride | dstYStride) & 3) {
       
   361     return FUN_NAME(2ch_na)(param);
       
   362   }
       
   363 
       
         /* From here on srcYStride counts mlib_f32 elements (see LOAD_2CH). */
   364   srcYStride >>= 2;
       
   365 
       
   366   dx64 = vis_to_double_dup(((dX & 0xFFFF) << 16) | (dX & 0xFFFF));
       
   367   dy64 = vis_to_double_dup(((dY & 0xFFFF) << 16) | (dY & 0xFFFF));
       
   368 
       
   369   for (j = yStart; j <= yFinish; j++) {
       
   370     mlib_f32 *sp0, *sp1;
       
   371     mlib_d64 *dp;
       
   372 
       
   373     NEW_LINE(2);
       
   374 
       
           /* dp = 8-byte aligned word containing dl; off is non-zero when dl
            * is not 8-byte aligned. */
   375     off = (mlib_s32)dl & 7;
       
   376     dp = (mlib_d64*)(dl - off);
       
   377 
       
           /* With a head pixel, the first real pixel sits in the second lane
            * pair, so the first pair's coordinates are backed up one step. */
   378     if (off) {
       
   379       x0 = X - dX; y0 = Y - dY;
       
   380       x1 = X;      y1 = Y;
       
   381     } else {
       
   382       x0 = X;      y0 = Y;
       
   383       x1 = X + dX; y1 = Y + dY;
       
   384     }
       
   385 
       
   386     deltax = DOUBLE_4U16(x0, x0, x1, x1);
       
   387     deltay = DOUBLE_4U16(y0, y0, y1, y1);
       
   388 
       
   389     if (off) {
       
             /* Head: compute one pixel and store only the low 32 bits. */
   390       GET_POINTER(sp1);
       
   391       sp0 = sp1;
       
   392       LOAD_2CH();
       
   393 
       
   394       BL_SUM();
       
   395 
       
   396       ((mlib_f32*)dp)[1] = vis_read_lo(dd);
       
   397       dp++;
       
   398       size--;
       
   399     }
       
   400 
       
           /* Body: two pixels per aligned 8-byte store. */
   401 #pragma pipeloop(0)
       
   402     for (i = 0; i < size/2; i++) {
       
   403       GET_POINTER(sp0);
       
   404       GET_POINTER(sp1);
       
   405       LOAD_2CH();
       
   406 
       
   407       BL_SUM();
       
   408 
       
   409       *dp++ = dd;
       
   410     }
       
   411 
       
           /* Tail: one pixel left; store only the high 32 bits. */
   412     if (size & 1) {
       
   413       GET_POINTER(sp0);
       
   414       sp1 = sp0;
       
   415       LOAD_2CH();
       
   416 
       
   417       BL_SUM();
       
   418 
       
   419       ((mlib_f32*)dp)[0] = vis_read_hi(dd);
       
   420     }
       
   421   }
       
   422 
       
   423   return MLIB_SUCCESS;
       
   424 }
       
   425 
       
   426 /***************************************************************/
       
   /*
    * Non-aligned 2-channel version.  The row pointer is treated as a byte
    * pointer and advanced by 4 bytes per integer X step (two 16-bit channels
    * per pixel).  X and Y are then stepped by dX and dY.  Expands to three
    * statements; use only as a full statement.
    */
   427 #undef  GET_POINTER
       
   428 #define GET_POINTER(sp)                                                       \
       
   429   sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 4*(X >> MLIB_SHIFT); \
       
   430   X += dX;                                                                    \
       
   431   Y += dY
       
   432 
       
   433 /***************************************************************/
       
   /*
    * Gathers the 2x2 neighbourhoods of two 2-channel pixels (byte pointers
    * sp0, sp1) into s0..s3, one 16-bit load at a time so that no source
    * alignment is required.  Byte offsets 0 and 2 are the two channels of
    * the pixel, 4 and 6 those of its right neighbour; adding srcYStride
    * (in bytes here) selects the row below.
    * Non-VIS2 variant: inserts sp1+2, sp1+0, sp0+2, sp0+0 in that order with
    * vis_faligndata, relying on the alignment offset the caller sets with
    * vis_alignaddr((void*)0, 6).  NOTE(review): the resulting lane order is
    * presumably sp0.ch0, sp0.ch1, sp1.ch0, sp1.ch1; confirm.
    * VIS2 variant: uses vis_bshuffle and temporaries t0..t3 that the caller
    * must declare.
    */
   434 #ifndef MLIB_VIS2
       
   435 
       
   436 #define LOAD_2CH_NA()                                           \
       
   437   s0 = vis_faligndata(LD_U16(sp1, 2), mask_7fff);               \
       
   438   s1 = vis_faligndata(LD_U16(sp1, 6), mask_7fff);               \
       
   439   s2 = vis_faligndata(LD_U16(sp1, srcYStride + 2), mask_7fff);  \
       
   440   s3 = vis_faligndata(LD_U16(sp1, srcYStride + 6), mask_7fff);  \
       
   441                                                                 \
       
   442   s0 = vis_faligndata(LD_U16(sp1, 0), s0);                      \
       
   443   s1 = vis_faligndata(LD_U16(sp1, 4), s1);                      \
       
   444   s2 = vis_faligndata(LD_U16(sp1, srcYStride), s2);             \
       
   445   s3 = vis_faligndata(LD_U16(sp1, srcYStride + 4), s3);         \
       
   446                                                                 \
       
   447   s0 = vis_faligndata(LD_U16(sp0, 2), s0);                      \
       
   448   s1 = vis_faligndata(LD_U16(sp0, 6), s1);                      \
       
   449   s2 = vis_faligndata(LD_U16(sp0, srcYStride + 2), s2);         \
       
   450   s3 = vis_faligndata(LD_U16(sp0, srcYStride + 6), s3);         \
       
   451                                                                 \
       
   452   s0 = vis_faligndata(LD_U16(sp0, 0), s0);                      \
       
   453   s1 = vis_faligndata(LD_U16(sp0, 4), s1);                      \
       
   454   s2 = vis_faligndata(LD_U16(sp0, srcYStride), s2);             \
       
   455   s3 = vis_faligndata(LD_U16(sp0, srcYStride + 4), s3)
       
   456 
       
   457 #else
       
   458 
       
   459 #define LOAD_2CH_NA()                                                          \
       
   460   s0 = vis_bshuffle(LD_U16(sp0, 0), LD_U16(sp1, 0));                           \
       
   461   s1 = vis_bshuffle(LD_U16(sp0, 4), LD_U16(sp1, 4));                           \
       
   462   s2 = vis_bshuffle(LD_U16(sp0, srcYStride), LD_U16(sp1, srcYStride));         \
       
   463   s3 = vis_bshuffle(LD_U16(sp0, srcYStride + 4), LD_U16(sp1, srcYStride + 4)); \
       
   464                                                                                \
       
   465   t0 = vis_bshuffle(LD_U16(sp0, 2), LD_U16(sp1, 2));                           \
       
   466   t1 = vis_bshuffle(LD_U16(sp0, 6), LD_U16(sp1, 6));                           \
       
   467   t2 = vis_bshuffle(LD_U16(sp0, srcYStride + 2), LD_U16(sp1, srcYStride + 2)); \
       
   468   t3 = vis_bshuffle(LD_U16(sp0, srcYStride + 6), LD_U16(sp1, srcYStride + 6)); \
       
   469                                                                                \
       
   470   s0 = vis_bshuffle(s0, t0);                                                   \
       
   471   s1 = vis_bshuffle(s1, t1);                                                   \
       
   472   s2 = vis_bshuffle(s2, t2);                                                   \
       
   473   s3 = vis_bshuffle(s3, t3)
       
   474 
       
   475 #endif
       
   476 
       
   477 /***************************************************************/
       
   /*
    * mlib_ImageAffine_s16_2ch_na_bl: bilinear affine resampling for
    * two-channel 16-bit images without alignment requirements.
    *
    * param - affine state; its fields are unpacked by DECLAREVAR().
    * Returns MLIB_SUCCESS, or MLIB_FAILURE if the scratch row cannot be
    * allocated.
    *
    * Each row is computed into an 8-byte aligned scratch buffer (two pixels
    * per mlib_d64) and then copied to the destination with
    * mlib_ImageCopy_na.  The definition has no `static` keyword but follows
    * the earlier `static` forward declaration of the same name.
    */
   478 mlib_status FUN_NAME(2ch_na)(mlib_affine_param *param)
       
   479 {
       
   480   DECLAREVAR();
       
   481   mlib_s32 max_xsize = param -> max_xsize, bsize;
       
   482   mlib_s32 x0, x1, y0, y1;
       
   483   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
       
   484 #ifdef MLIB_VIS2
       
   485   mlib_d64 t0, t1, t2, t3;
       
   486 #endif
       
   487 
       
         /* One mlib_d64 holds two pixels; round up for an odd row length. */
   488   bsize = (max_xsize + 1)/2;
       
   489 
       
         /* Use the stack buffer when it is large enough, else the heap. */
   490   if (bsize > BUF_SIZE) {
       
   491     pbuff = mlib_malloc(bsize*sizeof(mlib_d64));
       
   492 
       
   493     if (pbuff == NULL) return MLIB_FAILURE;
       
   494   }
       
   495 
       
   496   MLIB_WRITE_BMASK(0x45CD67EF);
       
   497 
       
   498   dx64 = vis_to_double_dup(((dX & 0xFFFF) << 16) | (dX & 0xFFFF));
       
   499   dy64 = vis_to_double_dup(((dY & 0xFFFF) << 16) | (dY & 0xFFFF));
       
   500 
       
   501   for (j = yStart; j <= yFinish; j++) {
       
   502     mlib_u8 *sp0, *sp1;
       
   503 
       
           /* LOAD_2CH_NA() depends on this alignment offset.  It is re-set on
            * every row; NOTE(review): presumably because mlib_ImageCopy_na
            * below may change it; confirm. */
   504 #ifndef MLIB_VIS2
       
   505     vis_alignaddr((void*)0, 6);
       
   506 #endif
       
   507 
       
   508     NEW_LINE(2);
       
   509 
       
   510     x0 = X;      y0 = Y;
       
   511     x1 = X + dX; y1 = Y + dY;
       
   512 
       
   513     deltax = DOUBLE_4U16(x0, x0, x1, x1);
       
   514     deltay = DOUBLE_4U16(y0, y0, y1, y1);
       
   515 
       
           /* Body: two pixels per scratch entry. */
   516 #pragma pipeloop(0)
       
   517     for (i = 0; i < size/2; i++) {
       
   518       GET_POINTER(sp0);
       
   519       GET_POINTER(sp1);
       
   520       LOAD_2CH_NA();
       
   521 
       
   522       BL_SUM();
       
   523 
       
   524       pbuff[i] = dd;
       
   525     }
       
   526 
       
           /* Odd remainder: duplicate the last pixel into both lane pairs;
            * only the first 4*size bytes are copied out below. */
   527     if (size & 1) {
       
   528       GET_POINTER(sp0);
       
   529       sp1 = sp0;
       
   530       LOAD_2CH_NA();
       
   531 
       
   532       BL_SUM();
       
   533 
       
   534       pbuff[i] = dd;
       
   535     }
       
   536 
       
           /* 4 bytes per two-channel 16-bit pixel. */
   537     mlib_ImageCopy_na((mlib_u8*)pbuff, dl, 4*size);
       
   538   }
       
   539 
       
   540   if (pbuff != buff) {
       
   541     mlib_free(pbuff);
       
   542   }
       
   543 
       
   544   return MLIB_SUCCESS;
       
   545 }
       
   546 
       
   547 /***************************************************************/
       
   /*
    * 3/4-channel version.  When a warp table is present, reloads dX and dY
    * for row j, rounds each toward zero to an even value, and rebuilds
    * dx64/dy64 from the low 16 bits of (d >> 1).  That matches one pixel per
    * packed iteration, with weights stored as (X >> 1) by DOUBLE_4U16.
    * NOTE(review): presumably invoked from NEW_LINE(), which is defined
    * elsewhere; confirm.
    */
   548 #undef  PREPARE_DELTAS
       
   549 #define PREPARE_DELTAS                                                             \
       
   550   if (warp_tbl != NULL) {                                                          \
       
   551     dX = warp_tbl[2*j    ];                                                        \
       
   552     dY = warp_tbl[2*j + 1];                                                        \
       
   553     dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */                       \
       
   554     dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */                       \
       
   555     dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF)); \
       
   556     dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF)); \
       
   557   }
       
   558 
       
   559 /***************************************************************/
       
   /*
    * mlib_ImageAffine_s16_3ch_bl: bilinear affine resampling for
    * three-channel 16-bit images.
    *
    * param - affine state; its fields are unpacked by DECLAREVAR().
    * Returns MLIB_SUCCESS, or MLIB_FAILURE if the scratch row cannot be
    * allocated.
    *
    * One pixel is computed per iteration into a scratch row with one
    * mlib_d64 (four 16-bit lanes) per pixel.  The row is then handed to
    * mlib_v_ImageChannelExtract_S16_43L_D1 together with dl and size;
    * NOTE(review): presumably that routine keeps three of the four lanes of
    * each entry when writing the destination; confirm.
    */
   560 mlib_status FUN_NAME(3ch)(mlib_affine_param *param)
       
   561 {
       
   562   DECLAREVAR();
       
   563   mlib_s32 max_xsize = param -> max_xsize;
       
   564   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
       
   565 
       
         /* Use the stack buffer when it is large enough, else the heap. */
   566   if (max_xsize > BUF_SIZE) {
       
   567     pbuff = mlib_malloc(max_xsize*sizeof(mlib_d64));
       
   568 
       
   569     if (pbuff == NULL) return MLIB_FAILURE;
       
   570   }
       
   571 
       
   572   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
       
   573   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
       
   574   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
       
   575   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
       
   576 
       
   577   for (j = yStart; j <= yFinish; j++) {
       
   578     mlib_u8  *sp;
       
   579     mlib_d64 *sp0, *sp1;
       
   580 
       
   581     NEW_LINE(3);
       
   582 
       
           /* All four lanes share the same weights: one pixel per iteration. */
   583     deltax = DOUBLE_4U16(X, X, X, X);
       
   584     deltay = DOUBLE_4U16(Y, Y, Y, Y);
       
   585 
       
   586 #pragma pipeloop(0)
       
   587     for (i = 0; i < size; i++) {
       
             /* 6 bytes per pixel; start 2 bytes before the pixel so that it
              * occupies the last three lanes of the first 8-byte load. */
   588       sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 6*(X >> MLIB_SHIFT) - 2;
       
   589 
       
             /* Unaligned 16-byte read of the current row: s0 = first 8
              * bytes, s1 = next 8 bytes. */
   590       vis_alignaddr(sp, 0);
       
   591       sp0 = AL_ADDR(sp, 0);
       
   592       s0 = vis_faligndata(sp0[0], sp0[1]);
       
   593       s1 = vis_faligndata(sp0[1], sp0[2]);
       
   594 
       
             /* Same for the row srcYStride bytes further on. */
   595       vis_alignaddr(sp, srcYStride);
       
   596       sp1 = AL_ADDR(sp, srcYStride);
       
   597       s2 = vis_faligndata(sp1[0], sp1[1]);
       
   598       s3 = vis_faligndata(sp1[1], sp1[2]);
       
   599 
       
             /* Overwrites the alignment offset; it is re-set above on the
              * next iteration. */
   600       BL_SUM_3CH();
       
   601 
       
   602       pbuff[i] = dd;
       
   603       X += dX;
       
   604       Y += dY;
       
   605     }
       
   606 
       
   607     mlib_v_ImageChannelExtract_S16_43L_D1((void *)pbuff, (void *)dl, size);
       
   608   }
       
   609 
       
   610   if (pbuff != buff) {
       
   611     mlib_free(pbuff);
       
   612   }
       
   613 
       
   614   return MLIB_SUCCESS;
       
   615 }
       
   616 
       
   617 /***************************************************************/
       
       /*
        * 4-channel variant of the bilinear affine row kernel, aligned path.
        *
        * If lineAddr[0], dstData, srcYStride or dstYStride has any of its low
        * three bits set, the work is delegated to FUN_NAME(4ch_na).  Otherwise
        * the source is read directly as mlib_d64 elements (sp[0], sp[1] and the
        * same two columns srcYStride elements further on), combined with
        * BL_SUM(), and the result `dd` is stored straight into the destination
        * row `dl` as mlib_d64, with no intermediate line buffer.
        *
        * Returns MLIB_SUCCESS, or whatever FUN_NAME(4ch_na) returns when the
        * fallback is taken.
        *
        * NOTE(review): dX, dY, X, Y, i, j, size, dl, dstData, dstYStride,
        * s0..s3, dd and the other working variables are not declared here;
        * presumably DECLAREVAR() and NEW_LINE() provide them -- confirm.
        */
   618 mlib_status FUN_NAME(4ch)(mlib_affine_param *param)
       
   619 {
       
   620   DECLAREVAR();
       
   621 
       
         /*
          * Alignment test on the low three bits of addresses and strides.
          * NOTE(review): the (mlib_s32) casts of pointer values typically drop
          * the upper bits where pointers are wider than 32 bits.  Only bits 0..2
          * are examined, so the outcome should not depend on the dropped bits,
          * but compilers may warn about the cast.
          */
   622   if (((mlib_s32)lineAddr[0] | (mlib_s32)dstData | srcYStride | dstYStride) & 7) {
       
   623     return FUN_NAME(4ch_na)(param);
       
   624   }
       
   625 
       
         /*
          * Divide the stride by 8 so it can be used as an index through the
          * mlib_d64 pointer sp below (presumably srcYStride arrives in bytes).
          */
   626   srcYStride >>= 3;
       
   627 
       
         /*
          * Adjust the steps: a negative step has 1 added (assuming >> on a
          * negative mlib_s32 is an arithmetic shift), then bit 0 is cleared.
          */
   628   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
       
   629   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
       
         /*
          * Put the low 16 bits of (step >> 1) into both halves of a 32-bit word.
          * vis_to_double_dup() presumably replicates that word across the whole
          * mlib_d64 -- TODO confirm.
          */
   630   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
       
   631   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
       
   632 
       
   633   for (j = yStart; j <= yFinish; j++) {
       
   634     mlib_d64 *sp;
       
   635 
       
           /* Per-row setup for a 4-channel destination; see the NEW_LINE macro. */
   636     NEW_LINE(4);
       
   637 
       
           /* Four copies of the row's starting X and Y, packed by DOUBLE_4U16. */
   638     deltax = DOUBLE_4U16(X, X, X, X);
       
   639     deltay = DOUBLE_4U16(Y, Y, Y, Y);
       
   640 
       
           /* Compiler-specific loop pragma; presumably a pipelining hint. */
   641 #pragma pipeloop(0)
       
   642     for (i = 0; i < size; i++) {
       
             /*
              * The row pointer is read from the lineAddr table at byte offset
              * PTR_SHIFT(Y), then advanced by one mlib_d64 per integer X step.
              */
   643       sp = *(mlib_d64**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + (X >> MLIB_SHIFT);
       
             /* Two adjacent columns on this row and on the row one stride on. */
   644       s0 = sp[0];
       
   645       s1 = sp[1];
       
   646       s2 = sp[srcYStride];
       
   647       s3 = sp[srcYStride + 1];
       
   648 
       
             /* Leaves the filtered result in dd (macro defined elsewhere). */
   649       BL_SUM();
       
   650 
       
             /* Direct 8-byte store; relies on the alignment check above. */
   651       ((mlib_d64*)dl)[i] = dd;
       
   652       X += dX;
       
   653       Y += dY;
       
   654     }
       
   655   }
       
   656 
       
   657   return MLIB_SUCCESS;
       
   658 }
       
   659 
       
   660 /***************************************************************/
       
       /*
        * 4-channel variant of the bilinear affine row kernel for data that did
        * not pass the alignment test in FUN_NAME(4ch).
        *
        * For every row j in [yStart, yFinish] the inner loop runs `size` times.
        * Each iteration assembles two mlib_d64 values from the source address
        * selected by (X, Y) and two more from srcYStride bytes further on, using
        * the vis_alignaddr()/vis_faligndata() sequence, passes them through
        * BL_SUM() and stores the resulting `dd` in a temporary line buffer.
        * The buffer is then copied to `dl` with mlib_ImageCopy_na(), 8 bytes per
        * result.
        *
        * The line buffer is a stack array of BUF_SIZE entries; a heap block is
        * used instead only when param->max_xsize exceeds BUF_SIZE, and it is
        * released before returning.
        *
        * Returns MLIB_FAILURE if the heap allocation fails, MLIB_SUCCESS
        * otherwise.
        *
        * NOTE(review): dX, dY, X, Y, i, j, size, dl, s0..s3, dd and the other
        * working variables are not declared here; presumably DECLAREVAR() and
        * NEW_LINE() provide them -- confirm.
        */
   661 mlib_status FUN_NAME(4ch_na)(mlib_affine_param *param)
       
   662 {
       
   663   DECLAREVAR();
       
   664   mlib_s32 max_xsize = param -> max_xsize;
       
   665   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
       
   666 
       
         /* Fall back to the heap only when the widest row does not fit in buff. */
   667   if (max_xsize > BUF_SIZE) {
       
   668     pbuff = mlib_malloc(max_xsize*sizeof(mlib_d64));
       
   669 
       
   670     if (pbuff == NULL) return MLIB_FAILURE;
       
   671   }
       
   672 
       
         /*
          * Adjust the steps: a negative step has 1 added (assuming >> on a
          * negative mlib_s32 is an arithmetic shift), then bit 0 is cleared.
          */
   673   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
       
   674   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
       
         /*
          * Put the low 16 bits of (step >> 1) into both halves of a 32-bit word.
          * vis_to_double_dup() presumably replicates that word across the whole
          * mlib_d64 -- TODO confirm.
          */
   675   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
       
   676   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
       
   677 
       
   678   for (j = yStart; j <= yFinish; j++) {
       
   679     mlib_u8  *sp;
       
   680     mlib_d64 *sp0, *sp1;
       
   681 
       
           /* Per-row setup for a 4-channel destination; see the NEW_LINE macro. */
   682     NEW_LINE(4);
       
   683 
       
           /* Four copies of the row's starting X and Y, packed by DOUBLE_4U16. */
   684     deltax = DOUBLE_4U16(X, X, X, X);
       
   685     deltay = DOUBLE_4U16(Y, Y, Y, Y);
       
   686 
       
           /* Compiler-specific loop pragma; presumably a pipelining hint. */
   687 #pragma pipeloop(0)
       
   688     for (i = 0; i < size; i++) {
       
             /*
              * The row pointer is read from the lineAddr table at byte offset
              * PTR_SHIFT(Y), then advanced by 8 bytes per integer X step
              * (presumably 4 channels * 2 bytes -- confirm).
              */
   689       sp = *(mlib_u8**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + 8*(X >> MLIB_SHIFT);
       
   690 
       
             /*
              * Upper source row.  vis_alignaddr() is issued before the
              * vis_faligndata() pair that combines sp0[0..2]; keep this order.
              */
   691       vis_alignaddr(sp, 0);
       
   692       sp0 = AL_ADDR(sp, 0);
       
   693       s0 = vis_faligndata(sp0[0], sp0[1]);
       
   694       s1 = vis_faligndata(sp0[1], sp0[2]);
       
   695 
       
             /* Lower source row: same sequence at sp + srcYStride. */
   696       vis_alignaddr(sp, srcYStride);
       
   697       sp1 = AL_ADDR(sp, srcYStride);
       
   698       s2 = vis_faligndata(sp1[0], sp1[1]);
       
   699       s3 = vis_faligndata(sp1[1], sp1[2]);
       
   700 
       
             /* Leaves the filtered result in dd (macro defined elsewhere). */
   701       BL_SUM();
       
   702 
       
   703       pbuff[i] = dd;
       
   704       X += dX;
       
   705       Y += dY;
       
   706     }
       
   707 
       
           /* Copy the buffered row to dl: 8 bytes for each of the size results. */
   708     mlib_ImageCopy_na((mlib_u8*)pbuff, dl, 8*size);
       
   709   }
       
   710 
       
         /* Only a heap-allocated line buffer needs releasing. */
   711   if (pbuff != buff) {
       
   712     mlib_free(pbuff);
       
   713   }
       
   714 
       
   715   return MLIB_SUCCESS;
       
   716 }
       
   717 
       
   718 /***************************************************************/
       
       /* Colour lookup: indexes plut, a local of each function that uses it. */
   719 #define LUT(x)  plut[x]
       
   720 
       
       /*
        * Indexed-image variant of the bilinear affine row kernel; source pixels
        * are of type DTYPE as defined at this point in the file.
        *
        * For every row j in [yStart, yFinish] the inner loop runs `size` times.
        * Each iteration reads four neighbouring source indices (two adjacent
        * columns on the row selected by Y and on the row one stride further),
        * maps each through LUT() to an mlib_d64 table entry, combines them with
        * BL_SUM() and stores the resulting `dd` in a temporary line buffer.
        * The buffered row is then passed, with `dl` and the colormap, to
        * mlib_ImageColorTrue2IndexLine_S16_S16_3_in_4() when the colormap has
        * three channels, or to mlib_ImageColorTrue2IndexLine_S16_S16_4()
        * otherwise.
        *
        * param    - affine parameters; only max_xsize is read directly here.
        * colormap - queried for channel count, LUT offset and the normal table,
        *            and forwarded to the True2Index routines.
        *
        * Returns MLIB_FAILURE if the heap allocation of the line buffer fails,
        * MLIB_SUCCESS otherwise.
        *
        * NOTE(review): dX, dY, X, Y, i, j, size, dl, s0..s3, dd and the other
        * working variables are not declared here; presumably DECLAREVAR() and
        * NEW_LINE() provide them -- confirm.
        */
   721 mlib_status FUN_NAME(s16_i)(mlib_affine_param *param,
       
   722                             const void        *colormap)
       
   723 {
       
   724   DECLAREVAR();
       
   725   mlib_s32 nchan   = mlib_ImageGetLutChannels(colormap);
       
   726   mlib_s32 lut_off = mlib_ImageGetLutOffset(colormap);
       
         /* Bias the table pointer by lut_off so that LUT(v) is table[v - lut_off]. */
   727   mlib_d64 *plut = (mlib_d64*)mlib_ImageGetLutNormalTable(colormap) - lut_off;
       
   728   mlib_s32 max_xsize = param -> max_xsize;
       
   729   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
       
   730 
       
         /*
          * Re-express the stride in DTYPE elements, since sp below is a DTYPE
          * pointer (presumably srcYStride arrives in bytes).
          * NOTE(review): sizeof yields an unsigned type, so a negative stride
          * would be converted to unsigned before the division -- confirm that
          * strides are never negative here.
          */
   731   srcYStride /= sizeof(DTYPE);
       
   732 
       
         /* Fall back to the heap only when the widest row does not fit in buff. */
   733   if (max_xsize > BUF_SIZE) {
       
   734     pbuff = mlib_malloc(max_xsize*sizeof(mlib_d64));
       
   735 
       
   736     if (pbuff == NULL) return MLIB_FAILURE;
       
   737   }
       
   738 
       
         /*
          * Adjust the steps: a negative step has 1 added (assuming >> on a
          * negative mlib_s32 is an arithmetic shift), then bit 0 is cleared.
          */
   739   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
       
   740   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
       
         /*
          * Put the low 16 bits of (step >> 1) into both halves of a 32-bit word.
          * vis_to_double_dup() presumably replicates that word across the whole
          * mlib_d64 -- TODO confirm.
          */
   741   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
       
   742   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
       
   743 
       
   744   for (j = yStart; j <= yFinish; j++) {
       
   745     DTYPE *sp;
       
   746 
       
           /* Per-row setup for a 1-channel destination; see the NEW_LINE macro. */
   747     NEW_LINE(1);
       
   748 
       
           /* Four copies of the row's starting X and Y, packed by DOUBLE_4U16. */
   749     deltax = DOUBLE_4U16(X, X, X, X);
       
   750     deltay = DOUBLE_4U16(Y, Y, Y, Y);
       
   751 
       
           /* Compiler-specific loop pragma; presumably a pipelining hint. */
   752 #pragma pipeloop(0)
       
   753     for (i = 0; i < size; i++) {
       
             /*
              * The row pointer is read from the lineAddr table at byte offset
              * PTR_SHIFT(Y), then advanced by one DTYPE per integer X step.
              */
   754       sp = *(DTYPE**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + (X >> MLIB_SHIFT);
       
             /* Expand the four neighbouring indices through the lookup table. */
   755       s0 = LUT(sp[0]);
       
   756       s1 = LUT(sp[1]);
       
   757       s2 = LUT(sp[srcYStride]);
       
   758       s3 = LUT(sp[srcYStride + 1]);
       
   759 
       
             /* Leaves the filtered result in dd (macro defined elsewhere). */
   760       BL_SUM();
       
   761 
       
   762       pbuff[i] = dd;
       
   763       X += dX;
       
   764       Y += dY;
       
   765     }
       
   766 
       
           /* Hand the buffered row to the routine matching the LUT channel count. */
   767     if (nchan == 3) {
       
   768       mlib_ImageColorTrue2IndexLine_S16_S16_3_in_4((void*)pbuff, (void*)dl, size, colormap);
       
   769     } else {
       
   770       mlib_ImageColorTrue2IndexLine_S16_S16_4((void*)pbuff, (void*)dl, size, colormap);
       
   771     }
       
   772   }
       
   773 
       
         /* Only a heap-allocated line buffer needs releasing. */
   774   if (pbuff != buff) {
       
   775     mlib_free(pbuff);
       
   776   }
       
   777 
       
   778   return MLIB_SUCCESS;
       
   779 }
       
   780 
       
   781 /***************************************************************/
       
       /* From here on the source index type is mlib_u8. */
   782 #undef  DTYPE
       
   783 #define DTYPE mlib_u8
       
   784 
       
       /*
        * Indexed-image variant of the bilinear affine row kernel for mlib_u8
        * source indices.
        *
        * For every row j in [yStart, yFinish] the inner loop runs `size` times.
        * Each iteration reads four neighbouring source indices (two adjacent
        * columns on the row selected by Y and on the row srcYStride elements
        * further), maps each through LUT() to an mlib_d64 table entry, combines
        * them with BL_SUM() and stores the resulting `dd` in a temporary line
        * buffer.  The buffered row is then passed, with `dl` and the colormap,
        * to mlib_ImageColorTrue2IndexLine_S16_U8_3_in_4() when the colormap has
        * three channels, or to mlib_ImageColorTrue2IndexLine_S16_U8_4()
        * otherwise.
        *
        * Unlike the s16_i variant, srcYStride is used as-is to index the
        * mlib_u8 pointer, with no division by the element size.
        *
        * param    - affine parameters; only max_xsize is read directly here.
        * colormap - queried for channel count, LUT offset and the normal table,
        *            and forwarded to the True2Index routines.
        *
        * Returns MLIB_FAILURE if the heap allocation of the line buffer fails,
        * MLIB_SUCCESS otherwise.
        *
        * NOTE(review): dX, dY, X, Y, i, j, size, dl, s0..s3, dd and the other
        * working variables are not declared here; presumably DECLAREVAR() and
        * NEW_LINE() provide them -- confirm.
        */
   785 mlib_status FUN_NAME(u8_i)(mlib_affine_param *param,
       
   786                            const void        *colormap)
       
   787 {
       
   788   DECLAREVAR();
       
   789   mlib_s32 nchan   = mlib_ImageGetLutChannels(colormap);
       
   790   mlib_s32 lut_off = mlib_ImageGetLutOffset(colormap);
       
         /* Bias the table pointer by lut_off so that LUT(v) is table[v - lut_off]. */
   791   mlib_d64 *plut = (mlib_d64*)mlib_ImageGetLutNormalTable(colormap) - lut_off;
       
   792   mlib_s32 max_xsize = param -> max_xsize;
       
   793   mlib_d64 buff[BUF_SIZE], *pbuff = buff;
       
   794 
       
         /* Fall back to the heap only when the widest row does not fit in buff. */
   795   if (max_xsize > BUF_SIZE) {
       
   796     pbuff = mlib_malloc(max_xsize*sizeof(mlib_d64));
       
   797 
       
   798     if (pbuff == NULL) return MLIB_FAILURE;
       
   799   }
       
   800 
       
         /*
          * Adjust the steps: a negative step has 1 added (assuming >> on a
          * negative mlib_s32 is an arithmetic shift), then bit 0 is cleared.
          */
   801   dX = (dX - (dX >> 31)) &~ 1; /* rounding towards ZERO */
       
   802   dY = (dY - (dY >> 31)) &~ 1; /* rounding towards ZERO */
       
         /*
          * Put the low 16 bits of (step >> 1) into both halves of a 32-bit word.
          * vis_to_double_dup() presumably replicates that word across the whole
          * mlib_d64 -- TODO confirm.
          */
   803   dx64 = vis_to_double_dup((((dX >> 1) & 0xFFFF) << 16) | ((dX >> 1) & 0xFFFF));
       
   804   dy64 = vis_to_double_dup((((dY >> 1) & 0xFFFF) << 16) | ((dY >> 1) & 0xFFFF));
       
   805 
       
   806   for (j = yStart; j <= yFinish; j++) {
       
   807     DTYPE *sp;
       
   808 
       
           /* Per-row setup for a 1-channel destination; see the NEW_LINE macro. */
   809     NEW_LINE(1);
       
   810 
       
           /* Four copies of the row's starting X and Y, packed by DOUBLE_4U16. */
   811     deltax = DOUBLE_4U16(X, X, X, X);
       
   812     deltay = DOUBLE_4U16(Y, Y, Y, Y);
       
   813 
       
           /* Compiler-specific loop pragma; presumably a pipelining hint. */
   814 #pragma pipeloop(0)
       
   815     for (i = 0; i < size; i++) {
       
             /*
              * The row pointer is read from the lineAddr table at byte offset
              * PTR_SHIFT(Y), then advanced by one DTYPE per integer X step.
              */
   816       sp = *(DTYPE**)((mlib_u8*)lineAddr + PTR_SHIFT(Y)) + (X >> MLIB_SHIFT);
       
             /* Expand the four neighbouring indices through the lookup table. */
   817       s0 = LUT(sp[0]);
       
   818       s1 = LUT(sp[1]);
       
   819       s2 = LUT(sp[srcYStride]);
       
   820       s3 = LUT(sp[srcYStride + 1]);
       
   821 
       
             /* Leaves the filtered result in dd (macro defined elsewhere). */
   822       BL_SUM();
       
   823 
       
   824       pbuff[i] = dd;
       
   825       X += dX;
       
   826       Y += dY;
       
   827     }
       
   828 
       
           /* Hand the buffered row to the routine matching the LUT channel count. */
   829     if (nchan == 3) {
       
   830       mlib_ImageColorTrue2IndexLine_S16_U8_3_in_4((void*)pbuff, (void*)dl, size, colormap);
       
   831     } else {
       
   832       mlib_ImageColorTrue2IndexLine_S16_U8_4((void*)pbuff, (void*)dl, size, colormap);
       
   833     }
       
   834   }
       
   835 
       
         /* Only a heap-allocated line buffer needs releasing. */
   836   if (pbuff != buff) {
       
   837     mlib_free(pbuff);
       
   838   }
       
   839 
       
   840   return MLIB_SUCCESS;
       
   841 }
       
   842 
       
   843 /***************************************************************/