jdk/src/share/native/sun/awt/medialib/mlib_ImageConv_16ext.c
changeset 2 90ce3da70b43
child 5506 202f599c92aa
equal deleted inserted replaced
0:fd16c54261b3 2:90ce3da70b43
       
     1 /*
       
     2  * Copyright 2003 Sun Microsystems, Inc.  All Rights Reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Sun designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Sun in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
       
    22  * CA 95054 USA or visit www.sun.com if you need additional information or
       
    23  * have any questions.
       
    24  */
       
    25 
       
    26 
       
    27 /*
       
    28  * FUNCTION
       
    29  *   Internal functions for mlib_ImageConv* on U8/S16/U16 type and
       
    30  *   MLIB_EDGE_SRC_EXTEND mask
       
    31  */
       
    32 
       
    33 #include "mlib_image.h"
       
    34 #include "mlib_ImageConv.h"
       
    35 #include "mlib_c_ImageConv.h"
       
    36 
       
    37 /*
       
    38  * This define switches between functions of different data types
       
    39  */
       
    40 
       
       /* Pixel-type selector for the #if chain below: 1 = mlib_u8, 2 = mlib_s16, 3 = mlib_u16. */
    41 #define IMG_TYPE 2
       
    42 
       
    43 /***************************************************************/
       /* mlib_u8 definitions (inactive while IMG_TYPE is 2). */
    44 #if IMG_TYPE == 1
       
    45 
       
    46 #define DTYPE             mlib_u8
       
    47 #define CONV_FUNC(KERN)   mlib_c_conv##KERN##ext_u8(PARAM)
       
    48 #define CONV_FUNC_MxN     mlib_c_convMxNext_u8(PARAM_MxN)
       
    49 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u8(PARAM)
       
    50 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u8(PARAM_MxN)
       
    51 #define DSCALE            (1 << 24)
       
    52 #define FROM_S32(x)       (((x) >> 24) ^ 128)
       
    53 #define S64TOS32(x)       (x)
       
    54 #define SAT_OFF           -(1u << 31)
       
    55 
       /* mlib_s16 definitions (the active branch, because IMG_TYPE is 2). */
    56 #elif IMG_TYPE == 2
       
    57 
       /* DTYPE: element type of the source and destination pixel data. */
    58 #define DTYPE             mlib_s16
       /* CONV_FUNC(KERN): function name mlib_conv<KERN>ext_s16 followed by the PARAM parameter list. */
    59 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_s16(PARAM)
       
    60 #define CONV_FUNC_MxN     mlib_convMxNext_s16(PARAM_MxN)
       /* CONV_FUNC_I / CONV_FUNC_MxN_I: names of the integer-arithmetic variants. */
    61 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_s16(PARAM)
       
    62 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_s16(PARAM_MxN)
       /* DSCALE: starting value of scalef in LOAD_KERNEL3 / LOAD_KERNEL, before it is divided by 2^scalef_expon. */
    63 #define DSCALE            65536.0
       /* FROM_S32(x): narrows a D2I result to the stored pixel value by dropping the low 16 bits. */
    64 #define FROM_S32(x)       ((x) >> 16)
       /* S64TOS32(x): keeps the low 32 bits; LOAD_BUFF uses it so a sign-extended negative pixel cannot spill into the upper word. */
    65 #define S64TOS32(x)       ((x) & 0xffffffff)
       /* SAT_OFF: text pasted after the argument inside D2I; empty here, so no offset is applied. */
    66 #define SAT_OFF
       
    67 
       /* mlib_u16 definitions (inactive while IMG_TYPE is 2). */
    68 #elif IMG_TYPE == 3
       
    69 
       
    70 #define DTYPE             mlib_u16
       
    71 #define CONV_FUNC(KERN)   mlib_conv##KERN##ext_u16(PARAM)
       
    72 #define CONV_FUNC_MxN     mlib_convMxNext_u16(PARAM_MxN)
       
    73 #define CONV_FUNC_I(KERN) mlib_i_conv##KERN##ext_u16(PARAM)
       
    74 #define CONV_FUNC_MxN_I   mlib_i_convMxNext_u16(PARAM_MxN)
       
    75 #define DSCALE            65536.0
       
    76 #define FROM_S32(x)       (((x) >> 16) ^ 0x8000)
       
    77 #define S64TOS32(x)       (x)
       
    78 #define SAT_OFF           -(1u << 31)
       
    79 
       
    80 #endif /* IMG_TYPE == 1 */
       
    81 
       
    82 /***************************************************************/
       
       /* KSIZE1: kernel side length minus one; KSIZE itself is (re)defined in front of each function. */
    83 #define KSIZE1 (KSIZE - 1)
       
    84 
       
    85 /***************************************************************/
       /* PARAM: parameter list of the fixed-size kernel functions; pasted in by CONV_FUNC(KERN) and CONV_FUNC_I(KERN). */
    86 #define PARAM                                                   \
       
    87   mlib_image       *dst,                                        \
       
    88   const mlib_image *src,                                        \
       
    89   mlib_s32         dx_l,                                        \
       
    90   mlib_s32         dx_r,                                        \
       
    91   mlib_s32         dy_t,                                        \
       
    92   mlib_s32         dy_b,                                        \
       
    93   const mlib_s32   *kern,                                       \
       
    94   mlib_s32         scalef_expon,                                \
       
    95   mlib_s32         cmask
       
    96 
       
    97 /***************************************************************/
       /* PARAM_MxN: parameter list of the MxN functions (CONV_FUNC_MxN, CONV_FUNC_MxN_I); adds the kernel dimensions m and n. */
    98 #define PARAM_MxN                                               \
       
    99   mlib_image       *dst,                                        \
       
   100   const mlib_image *src,                                        \
       
   101   const mlib_s32   *kernel,                                     \
       
   102   mlib_s32         m,                                           \
       
   103   mlib_s32         n,                                           \
       
   104   mlib_s32         dx_l,                                        \
       
   105   mlib_s32         dx_r,                                        \
       
   106   mlib_s32         dy_t,                                        \
       
   107   mlib_s32         dy_b,                                        \
       
   108   mlib_s32         scale,                                       \
       
   109   mlib_s32         cmask
       
   110 
       
   111 /***************************************************************/
       
   112 #define FTYPE mlib_d64
       
   113 
       
   114 #ifndef MLIB_USE_FTOI_CLAMPING
       
   115 
       
   116 #define CLAMP_S32(x)                                            \
       
   117   (((x) <= MLIB_S32_MIN) ? MLIB_S32_MIN : (((x) >= MLIB_S32_MAX) ? MLIB_S32_MAX : (mlib_s32)(x)))
       
   118 
       
   119 #else
       
   120 
       
   121 #define CLAMP_S32(x) ((mlib_s32)(x))
       
   122 
       
   123 #endif /* MLIB_USE_FTOI_CLAMPING */
       
   124 
       
   125 /***************************************************************/
       
   126 #define D2I(x) CLAMP_S32((x) SAT_OFF)
       
   127 
       
   128 /***************************************************************/
       
   129 #ifdef _LITTLE_ENDIAN
       
   130 
       
   131 #define STORE2(res0, res1)                                      \
       
   132   dp[0    ] = res1;                                             \
       
   133   dp[chan1] = res0
       
   134 
       
   135 #else
       
   136 
       
   137 #define STORE2(res0, res1)                                      \
       
   138   dp[0    ] = res0;                                             \
       
   139   dp[chan1] = res1
       
   140 
       
   141 #endif /* _LITTLE_ENDIAN */
       
   142 
       
   143 /***************************************************************/
       
   144 #ifdef _NO_LONGLONG
       
   145 
       
   146 #define LOAD_BUFF(buff)                                         \
       
   147   buff[i    ] = sp[0];                                          \
       
   148   buff[i + 1] = sp[chan1]
       
   149 
       
   150 #else /* _NO_LONGLONG */
       
   151 
       
   152 #ifdef _LITTLE_ENDIAN
       
   153 
       
   154 #define LOAD_BUFF(buff)                                         \
       
   155   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[chan1]) << 32) | S64TOS32((mlib_s64)sp[0])
       
   156 
       
   157 #else /* _LITTLE_ENDIAN */
       
   158 
       
   159 #define LOAD_BUFF(buff)                                         \
       
   160   *(mlib_s64*)(buff + i) = (((mlib_s64)sp[0]) << 32) | S64TOS32((mlib_s64)sp[chan1])
       
   161 
       
   162 #endif /* _LITTLE_ENDIAN */
       
   163 #endif /* _NO_LONGLONG */
       
   164 
       
   165 /***************************************************************/
       
       /* 2^24 as a float constant; not referenced in the part of the file reviewed here. */
   166 #define MLIB_D2_24 16777216.0f
       
   167 
       
   168 /***************************************************************/
       /* Overlay of one mlib_d64 with two mlib_s32 fields; the convolution loops use it to move a pair of 32-bit values with one 64-bit access. */
   169 typedef union {
       
   170   mlib_d64 d64;
       
   171   struct {
       
   172     mlib_s32 i0;
       
   173     mlib_s32 i1;
       
   174   } i32s;
       
   175 } d64_2x32;
       
   176 
       
   177 /***************************************************************/
       /* Line length the on-stack buffers are sized for; wider lines make the functions allocate with mlib_malloc. */
   178 #define BUFF_LINE 256
       
   179 
       
   180 /***************************************************************/
       
       /*
        * DEF_VARS(type): declares the locals shared by the FTYPE convolution
        * functions: source and destination base / row / pixel pointers of the
        * given element type, pbuff (initialised to the caller's on-stack array
        * `buff`, which must already be declared), the mlib_s32 views buffi and
        * buffo, image size and strides, channel counts and loop counters.
        */
   181 #define DEF_VARS(type)                                          \
       
   182   type     *adr_src, *sl, *sp, *sl1;                            \
       
   183   type     *adr_dst, *dl, *dp;                                  \
       
   184   FTYPE    *pbuff = buff;                                       \
       
   185   mlib_s32 *buffi, *buffo;                                      \
       
   186   mlib_s32 wid, hgt, sll, dll;                                  \
       
   187   mlib_s32 nchannel, chan1, chan2;                              \
       
   188   mlib_s32 i, j, c, swid
       
   189 
       
   190 /***************************************************************/
       
   191 #define LOAD_KERNEL3()                                                   \
       
   192   FTYPE    scalef = DSCALE;                                              \
       
   193   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8;                           \
       
   194   FTYPE    p00, p01, p02, p03,                                           \
       
   195            p10, p11, p12, p13,                                           \
       
   196            p20, p21, p22, p23;                                           \
       
   197                                                                          \
       
   198   while (scalef_expon > 30) {                                            \
       
   199     scalef /= (1 << 30);                                                 \
       
   200     scalef_expon -= 30;                                                  \
       
   201   }                                                                      \
       
   202                                                                          \
       
   203   scalef /= (1 << scalef_expon);                                         \
       
   204                                                                          \
       
   205   /* keep kernel in regs */                                              \
       
   206   k0 = scalef * kern[0];  k1 = scalef * kern[1];  k2 = scalef * kern[2]; \
       
   207   k3 = scalef * kern[3];  k4 = scalef * kern[4];  k5 = scalef * kern[5]; \
       
   208   k6 = scalef * kern[6];  k7 = scalef * kern[7];  k8 = scalef * kern[8]
       
   209 
       
   210 /***************************************************************/
       
   211 #define LOAD_KERNEL(SIZE)                                       \
       
   212   FTYPE    scalef = DSCALE;                                     \
       
   213                                                                 \
       
   214   while (scalef_expon > 30) {                                   \
       
   215     scalef /= (1 << 30);                                        \
       
   216     scalef_expon -= 30;                                         \
       
   217   }                                                             \
       
   218                                                                 \
       
   219   scalef /= (1 << scalef_expon);                                \
       
   220                                                                 \
       
   221   for (j = 0; j < SIZE; j++) k[j] = scalef * kern[j]
       
   222 
       
   223 /***************************************************************/
       
   224 #define GET_SRC_DST_PARAMETERS(type)                            \
       
   225   hgt = mlib_ImageGetHeight(src);                               \
       
   226   wid = mlib_ImageGetWidth(src);                                \
       
   227   nchannel = mlib_ImageGetChannels(src);                        \
       
   228   sll = mlib_ImageGetStride(src) / sizeof(type);                \
       
   229   dll = mlib_ImageGetStride(dst) / sizeof(type);                \
       
   230   adr_src = (type *)mlib_ImageGetData(src);                     \
       
   231   adr_dst = (type *)mlib_ImageGetData(dst)
       
   232 
       
   233 /***************************************************************/
       
   234 #ifndef __sparc
       
   235 #if IMG_TYPE == 1
       
   236 
       
   237 /*
       
   238  * Test for the presence of any "1" bit in bits
       
   239    8 to 31 of val. If present, then val is either
       
   240    negative or >255. If over/underflows of 8 bits
       
   241    are uncommon, then this technique can be a win,
       
   242    since only a single test, rather than two, is
       
   243    necessary to determine if clamping is needed.
       
   244    On the other hand, if over/underflows are common,
       
   245    it adds an extra test.
       
   246 */
       
   247 #define CLAMP_STORE(dst, val)                                   \
       
   248   if (val & 0xffffff00) {                                       \
       
   249     if (val < MLIB_U8_MIN)                                      \
       
   250       dst = MLIB_U8_MIN;                                        \
       
   251     else                                                        \
       
   252       dst = MLIB_U8_MAX;                                        \
       
   253   } else {                                                      \
       
   254     dst = (mlib_u8)val;                                         \
       
   255   }
       
   256 
       
   257 #elif IMG_TYPE == 2
       
   258 
       
   259 #define CLAMP_STORE(dst, val)                                   \
       
   260   if (val >= MLIB_S16_MAX)                                      \
       
   261     dst = MLIB_S16_MAX;                                         \
       
   262   else if (val <= MLIB_S16_MIN)                                 \
       
   263     dst = MLIB_S16_MIN;                                         \
       
   264   else                                                          \
       
   265     dst = (mlib_s16)val
       
   266 
       
   267 #elif IMG_TYPE == 3
       
   268 
       
   269 #define CLAMP_STORE(dst, val)                                   \
       
   270   if (val >= MLIB_U16_MAX)                                      \
       
   271     dst = MLIB_U16_MAX;                                         \
       
   272   else if (val <= MLIB_U16_MIN)                                 \
       
   273     dst = MLIB_U16_MIN;                                         \
       
   274   else                                                          \
       
   275     dst = (mlib_u16)val
       
   276 
       
   277 #endif /* IMG_TYPE == 1 */
       
   278 #endif /* __sparc */
       
   279 
       
   280 /***************************************************************/
       
       /* Kernel side length for the 3x3 functions that follow; KSIZE1 is derived from it. */
   281 #define KSIZE  3
       
   282 
       /*
        * 3x3 convolution on DTYPE pixels using FTYPE arithmetic (the name expands
        * to mlib_conv3x3ext_s16 while IMG_TYPE is 2).
        *
        *   dst, src      destination and source images; width, height and channel
        *                 count are read from src, data pointer and stride from both
        *   dx_l, dx_r    number of line-buffer columns filled by repeating the
        *                 first / last pixel read from a source row
        *   dy_t, dy_b    decide where the source row pointer stops advancing, so
        *                 that the same row is used again at the top / bottom
        *   kern          9 integer coefficients
        *   scalef_expon  scale exponent applied to the kernel by LOAD_KERNEL3
        *   cmask         channel mask; bit (nchannel - 1 - c) enables channel c
        *
        * Returns MLIB_FAILURE when the line buffer cannot be allocated and
        * MLIB_SUCCESS otherwise.
        */
   283 mlib_status CONV_FUNC(3x3)
       
   284 {
         /* On-stack storage for (KSIZE + 2) lines of BUFF_LINE entries; buff0..buff3 are the rotating line buffers. */
   285   FTYPE    buff[(KSIZE + 2)*BUFF_LINE], *buff0, *buff1, *buff2, *buff3, *buffT;
       
   286   DEF_VARS(DTYPE);
       
   287   DTYPE *sl2;
       
   288 #ifndef __sparc
       
   289   mlib_s32 d0, d1;
       
   290 #endif /* __sparc */
         /* Declares scalef, k0..k8 and the p?? temporaries and computes the scaled kernel. */
   291   LOAD_KERNEL3();
       
   292   GET_SRC_DST_PARAMETERS(DTYPE);
       
   293 
         /* Buffered line width: image width plus KSIZE - 1 edge columns. */
   294   swid = wid + KSIZE1;
       
   295 
         /* Lines wider than BUFF_LINE use a heap buffer, which is released before returning. */
   296   if (swid > BUFF_LINE) {
       
   297     pbuff = mlib_malloc((KSIZE + 2)*sizeof(FTYPE   )*swid);
       
   298 
       
   299     if (pbuff == NULL) return MLIB_FAILURE;
       
   300   }
       
   301 
         /* Four FTYPE lines of swid entries; the fifth segment is viewed as two mlib_s32 arrays (buffo, buffi). */
   302   buff0 = pbuff;
       
   303   buff1 = buff0 + swid;
       
   304   buff2 = buff1 + swid;
       
   305   buff3 = buff2 + swid;
       
   306   buffo = (mlib_s32*)(buff3 + swid);
       
   307   buffi = buffo + (swid &~ 1);
       
   308 
         /* From here on swid counts only the columns actually read from a source row. */
   309   swid -= (dx_l + dx_r);
       
   310 
       
   311   chan1 = nchannel;
       
   312   chan2 = chan1 + chan1;
       
   313 
         /* Process every channel whose bit is set in cmask. */
   314   for (c = 0; c < nchannel; c++) {
       
   315     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
       
   316 
       
   317     sl = adr_src + c;
       
   318     dl = adr_dst + c;
       
   319 
           /* Choose the source rows for the first three buffered lines; a row is repeated when its test fails. */
   320     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
       
   321     else sl1 = sl;
       
   322 
       
   323     if ((hgt - dy_b) > 0) sl2 = sl1 + sll;
       
   324     else sl2 = sl1;
       
   325 
           /* Left edge: repeat the first pixel of each row dx_l times. */
   326     for (i = 0; i < dx_l; i++) {
       
   327       buff0[i] = (FTYPE)sl[0];
       
   328       buff1[i] = (FTYPE)sl1[0];
       
   329       buff2[i] = (FTYPE)sl2[0];
       
   330     }
       
   331 
           /* Copy swid pixels of each row into the line buffers, converting to FTYPE. */
   332 #ifdef __SUNPRO_C
       
   333 #pragma pipeloop(0)
       
   334 #endif /* __SUNPRO_C */
       
   335     for (i = 0; i < swid; i++) {
       
   336       buff0[i + dx_l] = (FTYPE)sl[i*chan1];
       
   337       buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
       
   338       buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
       
   339     }
       
   340 
           /* Right edge: repeat the last copied pixel dx_r times. */
   341     for (i = 0; i < dx_r; i++) {
       
   342       buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
       
   343       buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
       
   344       buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
       
   345     }
       
   346 
           /* sl becomes the source row that is loaded into buff3 while the first output row is computed. */
   347     if ((hgt - dy_b) > 1) sl = sl2 + sll;
       
   348     else sl = sl2;
       
   349 
           /* One destination row per iteration. */
   350     for (j = 0; j < hgt; j++) {
       
   351       FTYPE    s0, s1;
       
   352 
             /* Prime the partial sums s0 / s1 with the first two buffered columns. */
   353       p02 = buff0[0];
       
   354       p12 = buff1[0];
       
   355       p22 = buff2[0];
       
   356 
       
   357       p03 = buff0[1];
       
   358       p13 = buff1[1];
       
   359       p23 = buff2[1];
       
   360 
       
   361       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
       
   362       s1 = p03 * k0 + p13 * k3 + p23 * k6;
       
   363 
       
   364       sp = sl;
       
   365       dp = dl;
       
   366 
             /* Main loop: two output pixels per iteration. */
   367 #ifdef __SUNPRO_C
       
   368 #pragma pipeloop(0)
       
   369 #endif /* __SUNPRO_C */
       
   370       for (i = 0; i <= (wid - 2); i += 2) {
       
   371 #ifdef __sparc
       
   372 #ifdef _NO_LONGLONG
       
   373         mlib_s32 o64_1, o64_2;
       
   374 #else /* _NO_LONGLONG */
       
   375         mlib_s64 o64;
       
   376 #endif /* _NO_LONGLONG */
       
   377 #endif /* __sparc */
       
   378         d64_2x32 dd;
       
   379 
       
   380         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
       
   381         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3];
       
   382 
               /* Fetch two pixels of the row at sp into buffi, then convert them into buff3. */
   383         LOAD_BUFF(buffi);
       
   384 
       
   385         dd.d64 = *(FTYPE   *)(buffi + i);
       
   386         buff3[i + dx_l    ] = (FTYPE)dd.i32s.i0;
       
   387         buff3[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
       
   388 
       
   389 #ifndef __sparc
       
   390 
               /* Complete both sums with the new columns, convert with D2I and narrow with FROM_S32. */
   391         d0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
       
   392         d1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
       
   393 
               /* Start the partial sums for the next pixel pair. */
   394         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
       
   395         s1 = p03 * k0 + p13 * k3 + p23 * k6;
       
   396 
       
   397         dp[0    ] = FROM_S32(d0);
       
   398         dp[chan1] = FROM_S32(d1);
       
   399 
       
   400 #else /* __sparc */
       
   401 
               /* Same sums, but the two results are written to buffo as one FTYPE and read back as integers for STORE2. */
   402         dd.i32s.i0 = D2I(s0 + p02 * k2 + p12 * k5 + p22 * k8);
       
   403         dd.i32s.i1 = D2I(s1 + p02 * k1 + p03 * k2 + p12 * k4 + p13 * k5 + p22 * k7 + p23 * k8);
       
   404         *(FTYPE   *)(buffo + i) = dd.d64;
       
   405 
       
   406         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
       
   407         s1 = p03 * k0 + p13 * k3 + p23 * k6;
       
   408 
       
   409 #ifdef _NO_LONGLONG
       
   410 
       
   411         o64_1 = buffo[i];
       
   412         o64_2 = buffo[i+1];
       
   413 #if IMG_TYPE != 1
       
   414         STORE2(FROM_S32(o64_1), FROM_S32(o64_2));
       
   415 #else
       
   416         STORE2(o64_1 >> 24, o64_2 >> 24);
       
   417 #endif /* IMG_TYPE != 1 */
       
   418 
       
   419 #else /* _NO_LONGLONG */
       
   420 
       
   421         o64 = *(mlib_s64*)(buffo + i);
       
   422 #if IMG_TYPE != 1
       
   423         STORE2(FROM_S32(o64 >> 32), FROM_S32(o64));
       
   424 #else
       
   425         STORE2(o64 >> 56, o64 >> 24);
       
   426 #endif /* IMG_TYPE != 1 */
       
   427 #endif /* _NO_LONGLONG */
       
   428 #endif /* __sparc */
       
   429 
       
   430         sp += chan2;
       
   431         dp += chan2;
       
   432       }
       
   433 
             /* Remaining output pixel when wid is odd: full nine-term sum. */
   434       for (; i < wid; i++) {
       
   435         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];
       
   436         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1];
       
   437         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2];
       
   438 
       
   439         buffi[i] = (mlib_s32)sp[0];
       
   440         buff3[i + dx_l] = (FTYPE)buffi[i];
       
   441 
       
   442 #ifndef __sparc
       
   443 
       
   444         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
       
   445                  p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
       
   446 
       
   447         dp[0] = FROM_S32(d0);
       
   448 
       
   449 #else  /* __sparc */
       
   450 
       
   451         buffo[i] = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p10 * k3 + p11 * k4 +
       
   452                        p12 * k5 + p20 * k6 + p21 * k7 + p22 * k8);
       
   453 #if IMG_TYPE != 1
       
   454         dp[0] = FROM_S32(buffo[i]);
       
   455 #else
       
   456         dp[0] = buffo[i] >> 24;
       
   457 #endif /* IMG_TYPE != 1 */
       
   458 #endif /* __sparc */
       
   459 
       
   460         sp += chan1;
       
   461         dp += chan1;
       
   462       }
       
   463 
             /* Load the rest of the row at sp into buff3 (no output is produced here). */
   464       for (; i < swid; i++) {
       
   465         buffi[i] = (mlib_s32)sp[0];
       
   466         buff3[i + dx_l] = (FTYPE)buffi[i];
       
   467         sp += chan1;
       
   468       }
       
   469 
             /* Fill the dx_l leading and dx_r trailing entries of the freshly loaded line from its first / last pixel. */
   470       for (i = 0; i < dx_l; i++) buff3[i] = buff3[dx_l];
       
   471       for (i = 0; i < dx_r; i++) buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
       
   472 
             /* Advance the source row only while j < hgt - dy_b - 2; otherwise the same row is loaded again. */
   473       if (j < hgt - dy_b - 2) sl += sll;
       
   474       dl += dll;
       
   475 
             /* Rotate the line buffers: the oldest line becomes the next load target. */
   476       buffT = buff0;
       
   477       buff0 = buff1;
       
   478       buff1 = buff2;
       
   479       buff2 = buff3;
       
   480       buff3 = buffT;
       
   481     }
       
   482   }
       
   483 
         /* sparc build with IMG_TYPE == 1 only: post-process dst through mlib_ImageXor80 / mlib_ImageXor80_aa (defined elsewhere). */
   484 #ifdef __sparc
       
   485 #if IMG_TYPE == 1
       
   486   {
       
   487     mlib_s32 amask = (1 << nchannel) - 1;
       
   488 
       
   489     if ((cmask & amask) != amask) {
       
   490       mlib_ImageXor80(adr_dst, wid, hgt, dll, nchannel, cmask);
       
   491     } else {
       
   492       mlib_ImageXor80_aa(adr_dst, wid*nchannel, hgt, dll);
       
   493     }
       
   494   }
       
   495 
       
   496 #endif /* IMG_TYPE == 1 */
       
   497 #endif /* __sparc */
       
   498 
         /* Release the heap buffer if one was allocated above. */
   499   if (pbuff != buff) mlib_free(pbuff);
       
   500 
       
   501   return MLIB_SUCCESS;
       
   502 }
       
   503 
       
   504 /***************************************************************/
       
   505 #ifndef __sparc /* for x86, using integer multiplies is faster */
       
   506 
       
       /*
        * Integer-arithmetic 3x3 convolution (the name expands to
        * mlib_i_conv3x3ext_s16 while IMG_TYPE is 2); built only when __sparc is
        * not defined.
        *
        * Each kernel coefficient is shifted right by shift1 up front, and every
        * sum is shifted right by shift2 = scalef_expon - shift1 before
        * CLAMP_STORE saturates it into the destination.  Pixels are read
        * directly from the source rows; no line buffer is allocated.
        * Parameters are those of PARAM.  Always returns MLIB_SUCCESS.
        */
   507 mlib_status CONV_FUNC_I(3x3)
       
   508 {
       
   509   DTYPE    *adr_src, *sl, *sp0, *sp1, *sp2, *sp_1, *sp_2;
       
   510   DTYPE    *adr_dst, *dl, *dp;
       
   511   mlib_s32 wid, hgt, sll, dll;
       
   512   mlib_s32 nchannel, chan1, chan2, delta_chan;
       
   513   mlib_s32 i, j, c;
       
   514   mlib_s32 shift1, shift2;
       
   515   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8;
       
   516   mlib_s32 p02, p03,
       
   517            p12, p13,
       
   518            p22, p23;
       
   519 
         /* shift1: number of low bits dropped from each kernel coefficient (16, or 8 in the IMG_TYPE == 1 build). */
   520 #if IMG_TYPE != 1
       
   521   shift1 = 16;
       
   522 #else
       
   523   shift1 = 8;
       
   524 #endif /* IMG_TYPE != 1 */
       
   525 
         /* shift2: remaining right shift applied to each accumulated sum. */
   526   shift2 = scalef_expon - shift1;
       
   527 
       
   528   /* keep kernel in regs */
       
   529   k0 = kern[0] >> shift1;  k1 = kern[1] >> shift1;  k2 = kern[2] >> shift1;
       
   530   k3 = kern[3] >> shift1;  k4 = kern[4] >> shift1;  k5 = kern[5] >> shift1;
       
   531   k6 = kern[6] >> shift1;  k7 = kern[7] >> shift1;  k8 = kern[8] >> shift1;
       
   532 
       
   533   GET_SRC_DST_PARAMETERS(DTYPE);
       
   534 
       
   535   chan1 = nchannel;
       
   536   chan2 = chan1 + chan1;
         /* delta_chan: offset from the first to the second kernel column; stays 0 (column repeated) when the dx_l test fails. */
   537   delta_chan = 0;
       
   538 
       
   539   if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan = chan1;
       
   540 
         /* Process every channel whose bit is set in cmask. */
   541   for (c = 0; c < chan1; c++) {
       
   542     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
       
   543 
       
   544     sl = adr_src + c;
       
   545     dl = adr_dst + c;
       
   546 
           /* sp_1, sp_2 and sl become the three source rows of the first output row; a row is repeated when its test fails. */
   547     sp_1 = sl;
       
   548 
       
   549     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll;
       
   550     sp_2 = sl;
       
   551 
       
   552     if ((hgt - dy_b) > 0) sl += sll;
       
   553 
           /* One destination row per iteration; the three row pointers slide down by one row each time. */
   554     for (j = 0; j < hgt; j++) {
       
   555       mlib_s32 s0, s1;
       
   556       mlib_s32 pix0, pix1;
       
   557 
       
   558       dp  = dl;
       
   559       sp0 = sp_1;
       
   560       sp_1 = sp_2;
       
   561       sp_2 = sl;
       
   562 
       
   563       sp1 = sp_1;
       
   564       sp2 = sp_2;
       
   565 
             /* Prime the partial sums s0 / s1 with the first two kernel columns. */
   566       p02 = sp0[0];
       
   567       p12 = sp1[0];
       
   568       p22 = sp2[0];
       
   569 
       
   570       p03 = sp0[delta_chan];
       
   571       p13 = sp1[delta_chan];
       
   572       p23 = sp2[delta_chan];
       
   573 
       
   574       s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
       
   575       s1 = p03 * k0 + p13 * k3 + p23 * k6;
       
   576 
             /* Move the row pointers to the third kernel column. */
   577       sp0 += (chan1 + delta_chan);
       
   578       sp1 += (chan1 + delta_chan);
       
   579       sp2 += (chan1 + delta_chan);
       
   580 
             /* Main loop: two output pixels per iteration, for the first wid - dx_r outputs. */
   581 #ifdef __SUNPRO_C
       
   582 #pragma pipeloop(0)
       
   583 #endif /* __SUNPRO_C */
       
   584       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
       
   585         p02 = sp0[0];     p12 = sp1[0];     p22 = sp2[0];
       
   586         p03 = sp0[chan1]; p13 = sp1[chan1]; p23 = sp2[chan1];
       
   587 
       
   588         pix0 = (s0 + p02 * k2 + p12 * k5 + p22 * k8) >> shift2;
       
   589         pix1 = (s1 + p02 * k1 + p03 * k2 + p12 * k4 +
       
   590                 p13 * k5 + p22 * k7 + p23 * k8) >> shift2;
       
   591 
       
   592         CLAMP_STORE(dp[0],     pix0);
       
   593         CLAMP_STORE(dp[chan1], pix1);
       
   594 
               /* Start the partial sums for the next pixel pair. */
   595         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
       
   596         s1 = p03 * k0 + p13 * k3 + p23 * k6;
       
   597 
       
   598         sp0 += chan2;
       
   599         sp1 += chan2;
       
   600         sp2 += chan2;
       
   601         dp += chan2;
       
   602       }
       
   603 
             /* The last column read becomes the previous column for the single-pixel loops below. */
   604       p02 = p03; p12 = p13; p22 = p23;
       
   605 
             /* Single-pixel loop up to output wid - dx_r, reading one new source column each time. */
   606       for (; i < wid - dx_r; i++) {
       
   607         p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0];
       
   608         pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2;
       
   609         CLAMP_STORE(dp[0], pix0);
       
   610         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
       
   611         p02 = p03; p12 = p13; p22 = p23;
       
   612         sp0 += chan1;
       
   613         sp1 += chan1;
       
   614         sp2 += chan1;
       
   615         dp += chan1;
       
   616       }
       
   617 
             /* Step back one pixel; the remaining outputs re-read that same pixel because the pointers no longer advance. */
   618       sp0 -= chan1;
       
   619       sp1 -= chan1;
       
   620       sp2 -= chan1;
       
   621 
       
   622       for (; i < wid; i++) {
       
   623         p03 = sp0[0]; p13 = sp1[0]; p23 = sp2[0];
       
   624         pix0 = (s0 + p03 * k2 + p13 * k5 + p23 * k8) >> shift2;
       
   625         CLAMP_STORE(dp[0], pix0);
       
   626         s0 = p02 * k0 + p03 * k1 + p12 * k3 + p13 * k4 + p22 * k6 + p23 * k7;
       
   627         p02 = p03; p12 = p13; p22 = p23;
       
   628         dp += chan1;
       
   629       }
       
   630 
             /* Advance the bottom source row only while j < hgt - dy_b - 1; otherwise it is used again. */
   631       if (j < hgt - dy_b - 1) sl += sll;
       
   632       dl += dll;
       
   633     }
       
   634   }
       
   635 
       
   636   return MLIB_SUCCESS;
       
   637 }
       
   638 
       
   639 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
       
   640 
       
   641 /***************************************************************/
       
   642 #undef  KSIZE
       
   643 #define KSIZE 4
       
   644 
       
   645 mlib_status CONV_FUNC(4x4)
       
   646 {
       
         /*
          * 4x4 convolution for the "source extend" edge mode: source rows are
          * copied into FTYPE row buffers that are padded with dx_l replicated
          * pixels on the left and dx_r on the right, and the kernel is applied
          * to those buffers.
          *
          * The parameter list comes from the CONV_FUNC macro and most locals
          * (sl, sl1, sp, dl, dp, pbuff, buffo, buffi, wid, hgt, sll, dll, swid,
          * chan1, chan2, i, j, c, ...) presumably from DEF_VARS /
          * GET_SRC_DST_PARAMETERS; those macros are not defined in this part
          * of the file, so confirm the details there.
          *
          * Returns MLIB_FAILURE when the scratch allocation fails and
          * MLIB_SUCCESS otherwise.
          */
   647   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
       
   648   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buffd, *buffT;
       
   649   FTYPE    k[KSIZE*KSIZE];
       
   650   mlib_s32 d0, d1;
       
   651   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7;
       
   652   FTYPE    p00, p01, p02, p03, p04,
       
   653            p10, p11, p12, p13, p14,
       
   654            p20, p21, p22, p23,
       
   655            p30, p31, p32, p33;
       
   656   DEF_VARS(DTYPE);
       
   657   DTYPE *sl2, *sl3;
       
   658   LOAD_KERNEL(KSIZE*KSIZE);
       
   659   GET_SRC_DST_PARAMETERS(DTYPE);
       
   660 
       
         /* Padded row width: wid plus KSIZE1 (defined elsewhere, presumably KSIZE - 1). */
   661   swid = wid + KSIZE1;
       
   662 
       
         /*
          * Rows wider than BUFF_LINE do not fit the on-stack buff, so a heap
          * block with the same (KSIZE + 3)-row layout is allocated instead.
          * pbuff is presumably initialised to buff by DEF_VARS (the test before
          * mlib_free at the end relies on that) - confirm.
          */
   663   if (swid > BUFF_LINE) {
       
   664     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE   )*swid);
       
   665 
       
   666     if (pbuff == NULL) return MLIB_FAILURE;
       
   667   }
       
   668 
       
         /*
          * Carve the scratch area into five row buffers of swid elements, the
          * partial-sum row buffd, and two mlib_s32 arrays (buffo, buffi) placed
          * after buffd.
          */
   669   buff0 = pbuff;
       
   670   buff1 = buff0 + swid;
       
   671   buff2 = buff1 + swid;
       
   672   buff3 = buff2 + swid;
       
   673   buff4 = buff3 + swid;
       
   674   buffd = buff4 + swid;
       
   675   buffo = (mlib_s32*)(buffd + swid);
       
   676   buffi = buffo + (swid &~ 1);
       
   677 
       
         /* From here on swid counts only the pixels actually read from a source row. */
   678   swid -= (dx_l + dx_r);
       
   679 
       
         /* Element strides for stepping one and two pixels within the same channel. */
   680   chan1 = nchannel;
       
   681   chan2 = chan1 + chan1;
       
   682 
       
   683   for (c = 0; c < nchannel; c++) {
       
           /* Process channel c only if bit (nchannel - 1 - c) of cmask is set. */
   684     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
       
   685 
       
   686     sl = adr_src + c;
       
   687     dl = adr_dst + c;
       
   688 
       
           /*
            * Choose the source rows for the first four buffered rows.  Where a
            * condition on dy_t / dy_b fails the pointer is not advanced, so the
            * previous row is used again.
            */
   689     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
       
   690     else sl1 = sl;
       
   691 
       
   692     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
       
   693     else sl2 = sl1;
       
   694 
       
   695     if ((hgt - dy_b) > 0) sl3 = sl2 + sll;
       
   696     else sl3 = sl2;
       
   697 
       
           /* Left padding: repeat the first pixel of each row dx_l times. */
   698     for (i = 0; i < dx_l; i++) {
       
   699       buff0[i] = (FTYPE)sl[0];
       
   700       buff1[i] = (FTYPE)sl1[0];
       
   701       buff2[i] = (FTYPE)sl2[0];
       
   702       buff3[i] = (FTYPE)sl3[0];
       
   703     }
       
   704 
       
           /* Copy the swid source pixels of this channel, converted to FTYPE. */
   705 #ifdef __SUNPRO_C
       
   706 #pragma pipeloop(0)
       
   707 #endif /* __SUNPRO_C */
       
   708     for (i = 0; i < swid; i++) {
       
   709       buff0[i + dx_l] = (FTYPE)sl[i*chan1];
       
   710       buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
       
   711       buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
       
   712       buff3[i + dx_l] = (FTYPE)sl3[i*chan1];
       
   713     }
       
   714 
       
           /* Right padding: repeat the last copied pixel dx_r times. */
   715     for (i = 0; i < dx_r; i++) {
       
   716       buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
       
   717       buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
       
   718       buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
       
   719       buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
       
   720     }
       
   721 
       
           /* sl becomes the source row that is loaded into buff4 during the first output row. */
   722     if ((hgt - dy_b) > 1) sl = sl3 + sll;
       
   723     else sl = sl3;
       
   724 
       
   725     for (j = 0; j < hgt; j++) {
       
             /* dd is used to read two mlib_s32 values (i32s.i0, i32s.i1) from buffi with one FTYPE-sized load. */
   726       d64_2x32 dd;
       
   727 
       
   728       /*
       
   729        *  First loop on two first lines of kernel
       
              *  (k[0..7] applied to buff0/buff1, two output pixels per iteration,
              *  partial sums stored in buffd).  The same loop also fetches the next
              *  source row: LOAD_BUFF is defined elsewhere and presumably stores
              *  sp[0] and sp[chan1] into buffi[i] and buffi[i + 1]; those values
              *  are then converted into buff4.
   730        */
       
   731       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3];
       
   732       k4 = k[4]; k5 = k[5]; k6 = k[6]; k7 = k[7];
       
   733 
       
   734       sp = sl;
       
   735       dp = dl;
       
   736 
       
             /* Prime the sliding window; the loop shifts these into p00/p01/p10/p11. */
   737       p02 = buff0[0];
       
   738       p12 = buff1[0];
       
   739       p03 = buff0[1];
       
   740       p13 = buff1[1];
       
   741       p04 = buff0[2];
       
   742 
       
   743 #ifdef __SUNPRO_C
       
   744 #pragma pipeloop(0)
       
   745 #endif /* __SUNPRO_C */
       
   746       for (i = 0; i <= (wid - 2); i += 2) {
       
   747         p00 = p02; p10 = p12;
       
   748         p01 = p03; p11 = p13;
       
   749         p02 = p04; p12 = buff1[i + 2];
       
   750         p03 = buff0[i + 3]; p13 = buff1[i + 3];
       
   751         p04 = buff0[i + 4]; p14 = buff1[i + 4];
       
   752 
       
   753         LOAD_BUFF(buffi);
       
   754 
       
   755         dd.d64 = *(FTYPE   *)(buffi + i);
       
   756         buff4[i + dx_l    ] = (FTYPE)dd.i32s.i0;
       
   757         buff4[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
       
   758 
       
   759         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
       
   760                         p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7);
       
   761         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
       
   762                         p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7);
       
   763 
       
   764         sp += chan2;
       
   765       }
       
   766 
       
   767       /*
       
   768        *  Second loop on two last lines of kernel
       
              *  (k[8..15] applied to buff2/buff3).  The partial sum from buffd is
              *  added, the total goes through D2I and is stored through FROM_S32
              *  (both macros are defined elsewhere).
   769        */
       
   770       k0 = k[ 8]; k1 = k[ 9]; k2 = k[10]; k3 = k[11];
       
   771       k4 = k[12]; k5 = k[13]; k6 = k[14]; k7 = k[15];
       
   772 
       
   773       p02 = buff2[0];
       
   774       p12 = buff3[0];
       
   775       p03 = buff2[1];
       
   776       p13 = buff3[1];
       
   777       p04 = buff2[2];
       
   778 
       
   779 #ifdef __SUNPRO_C
       
   780 #pragma pipeloop(0)
       
   781 #endif /* __SUNPRO_C */
       
   782       for (i = 0; i <= (wid - 2); i += 2) {
       
   783         p00 = p02; p10 = p12;
       
   784         p01 = p03; p11 = p13;
       
   785         p02 = p04; p12 = buff3[i + 2];
       
   786         p03 = buff2[i + 3]; p13 = buff3[i + 3];
       
   787         p04 = buff2[i + 4]; p14 = buff3[i + 4];
       
   788 
       
   789         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 +
       
   790                  p10 * k4 + p11 * k5 + p12 * k6 + p13 * k7 + buffd[i]);
       
   791         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 +
       
   792                  p11 * k4 + p12 * k5 + p13 * k6 + p14 * k7 + buffd[i + 1]);
       
   793 
       
   794         dp[0    ] = FROM_S32(d0);
       
   795         dp[chan1] = FROM_S32(d1);
       
   796 
       
   797         dp += chan2;
       
   798       }
       
   799 
       
             /*
              * The paired loops stop at the last even index, so this loop handles
              * the one remaining output pixel when wid is odd.  It evaluates the
              * full 4x4 sum in a single expression and also loads the matching
              * pixel of the next source row into buff4.
              */
   800       /* last pixels */
       
   801       for (; i < wid; i++) {
       
   802         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
       
   803         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
       
   804         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
       
   805         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
       
   806 
       
   807         buff4[i + dx_l] = (FTYPE)sp[0];
       
   808 
       
   809         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] +
       
   810                        p10 * k[4] + p11 * k[5] + p12 * k[6] + p13 * k[7] +
       
   811                        p20 * k[ 8] + p21 * k[ 9] + p22 * k[10] + p23 * k[11] +
       
   812                        p30 * k[12] + p31 * k[13] + p32 * k[14] + p33 * k[15]);
       
   813 
       
   814         dp[0] = FROM_S32(buffo[i]);
       
   815 
       
   816         sp += chan1;
       
   817         dp += chan1;
       
   818       }
       
   819 
       
             /* Load the source pixels of the next row that the loops above did not reach. */
   820       for (; i < swid; i++) {
       
   821         buff4[i + dx_l] = (FTYPE)sp[0];
       
   822         sp += chan1;
       
   823       }
       
   824 
       
             /* Pad the freshly loaded row on both sides by repeating its edge pixels. */
   825       for (i = 0; i < dx_l; i++) buff4[i] = buff4[dx_l];
       
   826       for (i = 0; i < dx_r; i++) buff4[swid + dx_l + i] = buff4[swid + dx_l - 1];
       
   827 
       
   828       /* next line */
       
   829 
       
             /* The source row stops advancing near the bottom, so the same row is loaded again. */
   830       if (j < hgt - dy_b - 2) sl += sll;
       
   831       dl += dll;
       
   832 
       
             /* Rotate the row buffers: the oldest row becomes the next load target. */
   833       buffT = buff0;
       
   834       buff0 = buff1;
       
   835       buff1 = buff2;
       
   836       buff2 = buff3;
       
   837       buff3 = buff4;
       
   838       buff4 = buffT;
       
   839     }
       
   840   }
       
   841 
       
         /* Release the heap block if one was allocated above. */
   842   if (pbuff != buff) mlib_free(pbuff);
       
   843 
       
   844   return MLIB_SUCCESS;
       
   845 }
       
   846 
       
   847 /***************************************************************/
       
   848 #undef  KSIZE
       
   849 #define KSIZE 5
       
   850 
       
   851 mlib_status CONV_FUNC(5x5)
       
   852 {
       
         /*
          * 5x5 convolution for the "source extend" edge mode.  Source rows are
          * kept in six rotating FTYPE row buffers (five in use, one being
          * filled) that are padded with dx_l replicated pixels on the left and
          * dx_r on the right.  Each output row is produced in three passes
          * over the kernel rows (0-1, 2-3, 4), accumulating in buffd.
          *
          * The parameter list comes from the CONV_FUNC macro and most locals
          * presumably from DEF_VARS / GET_SRC_DST_PARAMETERS; those macros are
          * not defined in this part of the file, so confirm the details there.
          *
          * Returns MLIB_FAILURE when the scratch allocation fails and
          * MLIB_SUCCESS otherwise.
          */
   853   FTYPE    buff[(KSIZE + 3)*BUFF_LINE];
       
   854   FTYPE    *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffd, *buffT;
       
   855   FTYPE    k[KSIZE*KSIZE];
       
   856   mlib_s32 d0, d1;
       
   857   FTYPE    k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
       
   858   FTYPE    p00, p01, p02, p03, p04, p05,
       
   859            p10, p11, p12, p13, p14, p15,
       
   860            p20, p21, p22, p23, p24,
       
   861            p30, p31, p32, p33, p34,
       
   862            p40, p41, p42, p43, p44;
       
   863   DEF_VARS(DTYPE);
       
   864   DTYPE *sl2, *sl3, *sl4;
       
   865   LOAD_KERNEL(KSIZE*KSIZE);
       
   866   GET_SRC_DST_PARAMETERS(DTYPE);
       
   867 
       
         /* Padded row width: wid plus KSIZE1 (defined elsewhere, presumably KSIZE - 1). */
   868   swid = wid + KSIZE1;
       
   869 
       
         /*
          * Rows wider than BUFF_LINE do not fit the on-stack buff, so a heap
          * block with the same (KSIZE + 3)-row layout is allocated instead.
          * pbuff is presumably initialised to buff by DEF_VARS - confirm.
          */
   870   if (swid > BUFF_LINE) {
       
   871     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE   )*swid);
       
   872 
       
   873     if (pbuff == NULL) return MLIB_FAILURE;
       
   874   }
       
   875 
       
         /*
          * Carve the scratch area into six row buffers of swid elements, the
          * partial-sum row buffd, and two mlib_s32 arrays (buffo, buffi) placed
          * after buffd.
          */
   876   buff0 = pbuff;
       
   877   buff1 = buff0 + swid;
       
   878   buff2 = buff1 + swid;
       
   879   buff3 = buff2 + swid;
       
   880   buff4 = buff3 + swid;
       
   881   buff5 = buff4 + swid;
       
   882   buffd = buff5 + swid;
       
   883   buffo = (mlib_s32*)(buffd + swid);
       
   884   buffi = buffo + (swid &~ 1);
       
   885 
       
         /* From here on swid counts only the pixels actually read from a source row. */
   886   swid -= (dx_l + dx_r);
       
   887 
       
         /* Element strides for stepping one and two pixels within the same channel. */
   888   chan1 = nchannel;
       
   889   chan2 = chan1 + chan1;
       
   890 
       
   891   for (c = 0; c < nchannel; c++) {
       
           /* Process channel c only if bit (nchannel - 1 - c) of cmask is set. */
   892     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
       
   893 
       
   894     sl = adr_src + c;
       
   895     dl = adr_dst + c;
       
   896 
       
           /*
            * Choose the source rows for the first five buffered rows.  Where a
            * condition on dy_t / dy_b fails the pointer is not advanced, so the
            * previous row is used again.
            */
   897     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
       
   898     else sl1 = sl;
       
   899 
       
   900     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
       
   901     else sl2 = sl1;
       
   902 
       
   903     if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll;
       
   904     else sl3 = sl2;
       
   905 
       
   906     if ((hgt - dy_b) > 0) sl4 = sl3 + sll;
       
   907     else sl4 = sl3;
       
   908 
       
           /* Left padding: repeat the first pixel of each row dx_l times. */
   909     for (i = 0; i < dx_l; i++) {
       
   910       buff0[i] = (FTYPE)sl[0];
       
   911       buff1[i] = (FTYPE)sl1[0];
       
   912       buff2[i] = (FTYPE)sl2[0];
       
   913       buff3[i] = (FTYPE)sl3[0];
       
   914       buff4[i] = (FTYPE)sl4[0];
       
   915     }
       
   916 
       
           /* Copy the swid source pixels of this channel, converted to FTYPE. */
   917 #ifdef __SUNPRO_C
       
   918 #pragma pipeloop(0)
       
   919 #endif /* __SUNPRO_C */
       
   920     for (i = 0; i < swid; i++) {
       
   921       buff0[i + dx_l] = (FTYPE)sl[i*chan1];
       
   922       buff1[i + dx_l] = (FTYPE)sl1[i*chan1];
       
   923       buff2[i + dx_l] = (FTYPE)sl2[i*chan1];
       
   924       buff3[i + dx_l] = (FTYPE)sl3[i*chan1];
       
   925       buff4[i + dx_l] = (FTYPE)sl4[i*chan1];
       
   926     }
       
   927 
       
           /* Right padding: repeat the last copied pixel dx_r times. */
   928     for (i = 0; i < dx_r; i++) {
       
   929       buff0[swid + dx_l + i] = buff0[swid + dx_l - 1];
       
   930       buff1[swid + dx_l + i] = buff1[swid + dx_l - 1];
       
   931       buff2[swid + dx_l + i] = buff2[swid + dx_l - 1];
       
   932       buff3[swid + dx_l + i] = buff3[swid + dx_l - 1];
       
   933       buff4[swid + dx_l + i] = buff4[swid + dx_l - 1];
       
   934     }
       
   935 
       
           /* sl becomes the source row that is loaded into buff5 during the first output row. */
   936     if ((hgt - dy_b) > 1) sl = sl4 + sll;
       
   937     else sl = sl4;
       
   938 
       
   939     for (j = 0; j < hgt; j++) {
       
             /* dd is used to read two mlib_s32 values (i32s.i0, i32s.i1) from buffi with one FTYPE-sized load. */
   940       d64_2x32 dd;
       
   941 
       
   942       /*
       
   943        *  First loop
       
              *  Kernel rows 0 and 1 (k[0..9]) applied to buff0/buff1, two output
              *  pixels per iteration; the partial sums are stored in buffd.
              *  LOAD_BUFF is defined elsewhere and presumably stores sp[0] and
              *  sp[chan1] of the next source row into buffi[i] and buffi[i + 1].
   944        */
       
   945       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
       
   946       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
       
   947 
       
   948       sp = sl;
       
   949       dp = dl;
       
   950 
       
             /* Prime the sliding window; the loop shifts these into p00..p02 / p10..p12. */
   951       p02 = buff0[0];
       
   952       p12 = buff1[0];
       
   953       p03 = buff0[1];
       
   954       p13 = buff1[1];
       
   955       p04 = buff0[2];
       
   956       p14 = buff1[2];
       
   957 
       
   958 #ifdef __SUNPRO_C
       
   959 #pragma pipeloop(0)
       
   960 #endif /* __SUNPRO_C */
       
   961       for (i = 0; i <= (wid - 2); i += 2) {
       
   962         p00 = p02; p10 = p12;
       
   963         p01 = p03; p11 = p13;
       
   964         p02 = p04; p12 = p14;
       
   965 
       
   966         LOAD_BUFF(buffi);
       
   967 
       
   968         p03 = buff0[i + 3]; p13 = buff1[i + 3];
       
   969         p04 = buff0[i + 4]; p14 = buff1[i + 4];
       
   970         p05 = buff0[i + 5]; p15 = buff1[i + 5];
       
   971 
       
   972         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
       
   973                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
       
   974         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
       
   975                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
       
   976 
       
   977         sp += chan2;
       
   978       }
       
   979 
       
   980       /*
       
   981        *  Second loop
       
              *  Kernel rows 2 and 3 (k[10..19]) applied to buff2/buff3 and added to
              *  buffd.  The values placed in buffi by the first loop are converted
              *  to FTYPE here and stored into buff5, the row being filled.
   982        */
       
   983       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
       
   984       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
       
   985 
       
   986       p02 = buff2[0];
       
   987       p12 = buff3[0];
       
   988       p03 = buff2[1];
       
   989       p13 = buff3[1];
       
   990 
       
   991 #ifdef __SUNPRO_C
       
   992 #pragma pipeloop(0)
       
   993 #endif /* __SUNPRO_C */
       
   994       for (i = 0; i <= (wid - 2); i += 2) {
       
   995         p00 = p02; p10 = p12;
       
   996         p01 = p03; p11 = p13;
       
   997 
       
   998         p02 = buff2[i + 2]; p12 = buff3[i + 2];
       
   999         p03 = buff2[i + 3]; p13 = buff3[i + 3];
       
  1000         p04 = buff2[i + 4]; p14 = buff3[i + 4];
       
  1001         p05 = buff2[i + 5]; p15 = buff3[i + 5];
       
  1002 
       
  1003         dd.d64 = *(FTYPE   *)(buffi + i);
       
  1004         buff5[i + dx_l    ] = (FTYPE)dd.i32s.i0;
       
  1005         buff5[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
       
  1006 
       
  1007         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
       
  1008                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
       
  1009         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
       
  1010                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
       
  1011       }
       
  1012 
       
  1013       /*
       
  1014        *  Third loop
       
              *  Kernel row 4 (k[20..24]) applied to buff4.  The sum accumulated in
              *  buffd is added, the total goes through D2I and is stored through
              *  FROM_S32 (both macros are defined elsewhere).
  1015        */
       
  1016       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
       
  1017 
       
  1018       p02 = buff4[0];
       
  1019       p03 = buff4[1];
       
  1020       p04 = buff4[2];
       
  1021       p05 = buff4[3];
       
  1022 
       
  1023 #ifdef __SUNPRO_C
       
  1024 #pragma pipeloop(0)
       
  1025 #endif /* __SUNPRO_C */
       
  1026       for (i = 0; i <= (wid - 2); i += 2) {
       
  1027         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
       
  1028 
       
  1029         p04 = buff4[i + 4]; p05 = buff4[i + 5];
       
  1030 
       
  1031         d0 = D2I(p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 + buffd[i]);
       
  1032         d1 = D2I(p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 + buffd[i + 1]);
       
  1033 
       
  1034         dp[0    ] = FROM_S32(d0);
       
  1035         dp[chan1] = FROM_S32(d1);
       
  1036 
       
  1037         dp += chan2;
       
  1038       }
       
  1039 
       
             /*
              * The paired loops stop at the last even index, so this loop handles
              * the one remaining output pixel when wid is odd.  It evaluates the
              * full 5x5 sum in a single expression and also loads the matching
              * pixel of the next source row into buff5.
              */
  1040       /* last pixels */
       
  1041       for (; i < wid; i++) {
       
  1042         p00 = buff0[i];     p10 = buff1[i];     p20 = buff2[i];     p30 = buff3[i];
       
  1043         p01 = buff0[i + 1]; p11 = buff1[i + 1]; p21 = buff2[i + 1]; p31 = buff3[i + 1];
       
  1044         p02 = buff0[i + 2]; p12 = buff1[i + 2]; p22 = buff2[i + 2]; p32 = buff3[i + 2];
       
  1045         p03 = buff0[i + 3]; p13 = buff1[i + 3]; p23 = buff2[i + 3]; p33 = buff3[i + 3];
       
  1046         p04 = buff0[i + 4]; p14 = buff1[i + 4]; p24 = buff2[i + 4]; p34 = buff3[i + 4];
       
  1047 
       
  1048         p40 = buff4[i];     p41 = buff4[i + 1]; p42 = buff4[i + 2];
       
  1049         p43 = buff4[i + 3]; p44 = buff4[i + 4];
       
  1050 
       
  1051         buff5[i + dx_l] = (FTYPE)sp[0];
       
  1052 
       
  1053         buffo[i] = D2I(p00 * k[0] + p01 * k[1] + p02 * k[2] + p03 * k[3] + p04 * k[4] +
       
  1054                        p10 * k[5] + p11 * k[6] + p12 * k[7] + p13 * k[8] + p14 * k[9] +
       
  1055                        p20 * k[10] + p21 * k[11] + p22 * k[12] + p23 * k[13] + p24 * k[14] +
       
  1056                        p30 * k[15] + p31 * k[16] + p32 * k[17] + p33 * k[18] + p34 * k[19] +
       
  1057                        p40 * k[20] + p41 * k[21] + p42 * k[22] + p43 * k[23] + p44 * k[24]);
       
  1058 
       
  1059         dp[0] = FROM_S32(buffo[i]);
       
  1060 
       
  1061         sp += chan1;
       
  1062         dp += chan1;
       
  1063       }
       
  1064 
       
             /* Load the source pixels of the next row that the loops above did not reach. */
  1065       for (; i < swid; i++) {
       
  1066         buff5[i + dx_l] = (FTYPE)sp[0];
       
  1067         sp += chan1;
       
  1068       }
       
  1069 
       
             /* Pad the freshly loaded row on both sides by repeating its edge pixels. */
  1070       for (i = 0; i < dx_l; i++) buff5[i] = buff5[dx_l];
       
  1071       for (i = 0; i < dx_r; i++) buff5[swid + dx_l + i] = buff5[swid + dx_l - 1];
       
  1072 
       
  1073       /* next line */
       
  1074 
       
             /* The source row stops advancing near the bottom, so the same row is loaded again. */
  1075       if (j < hgt - dy_b - 2) sl += sll;
       
  1076       dl += dll;
       
  1077 
       
             /* Rotate the row buffers: the oldest row becomes the next load target. */
  1078       buffT = buff0;
       
  1079       buff0 = buff1;
       
  1080       buff1 = buff2;
       
  1081       buff2 = buff3;
       
  1082       buff3 = buff4;
       
  1083       buff4 = buff5;
       
  1084       buff5 = buffT;
       
  1085     }
       
  1086   }
       
  1087 
       
         /* Release the heap block if one was allocated above. */
  1088   if (pbuff != buff) mlib_free(pbuff);
       
  1089 
       
  1090   return MLIB_SUCCESS;
       
  1091 }
       
  1092 
       
  1093 /***************************************************************/
       
  1094 #ifndef __sparc /* for x86, using integer multiplies is faster */
       
  1095 
       
  1096 mlib_status CONV_FUNC_I(5x5)
       
  1097 {
       
         /*
          * Integer-arithmetic 5x5 convolution for the "source extend" edge mode;
          * this function is compiled only when __sparc is not defined.
          *
          * Kernel coefficients are pre-shifted right by shift1 and each output
          * sum is shifted right by shift2 = scalef_expon - shift1 before being
          * written through CLAMP_STORE (defined elsewhere).  Source pixels are
          * read directly from the image; edge replication is obtained by not
          * advancing the read pointers rather than by padded row buffers.
          *
          * kern, scalef_expon, dx_l, dx_r, dy_t, dy_b and cmask are not declared
          * here and presumably come from the CONV_FUNC_I parameter list - confirm.
          *
          * Returns MLIB_FAILURE when the scratch allocation fails and
          * MLIB_SUCCESS otherwise.
          */
  1098   mlib_s32 buff[BUFF_LINE];
       
  1099   mlib_s32 *buffd;
       
  1100   mlib_s32 k[KSIZE*KSIZE];
       
  1101   mlib_s32 shift1, shift2;
       
  1102   mlib_s32 k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
       
  1103   mlib_s32 p00, p01, p02, p03, p04, p05,
       
  1104            p10, p11, p12, p13, p14, p15;
       
  1105   DTYPE    *adr_src, *sl, *sp0, *sp1, *sp2, *sp3, *sp4;
       
  1106   DTYPE    *sp_1, *sp_2, *sp_3, *sp_4;
       
  1107   DTYPE    *adr_dst, *dl, *dp;
       
  1108   mlib_s32 *pbuff = buff;
       
  1109   mlib_s32 wid, hgt, sll, dll;
       
  1110   mlib_s32 nchannel, chan1, chan2, chan4;
       
  1111   mlib_s32 delta_chan1, delta_chan2, delta_chan3;
       
  1112   mlib_s32 i, j, c;
       
  1113 
       
         /* Coefficient pre-shift: 8 bits when IMG_TYPE == 1, 16 bits otherwise. */
  1114 #if IMG_TYPE != 1
       
  1115   shift1 = 16;
       
  1116 #else
       
  1117   shift1 = 8;
       
  1118 #endif /* IMG_TYPE != 1 */
       
  1119 
       
         /* Remaining part of the scale exponent, applied to every accumulated sum. */
  1120   shift2 = scalef_expon - shift1;
       
  1121 
       
  1122   for (j = 0; j < KSIZE*KSIZE; j++) k[j] = kern[j] >> shift1;
       
  1123 
       
  1124   GET_SRC_DST_PARAMETERS(DTYPE);
       
  1125 
       
         /* buffd holds one partial sum per output pixel; use the heap when wid exceeds BUFF_LINE. */
  1126   if (wid > BUFF_LINE) {
       
  1127     pbuff = mlib_malloc(sizeof(mlib_s32)*wid);
       
  1128 
       
  1129     if (pbuff == NULL) return MLIB_FAILURE;
       
  1130   }
       
  1131 
       
  1132   buffd = pbuff;
       
  1133 
       
         /* Element strides for stepping one and two pixels within the same channel. */
  1134   chan1 = nchannel;
       
  1135   chan2 = chan1 + chan1;
       
  1136 
       
         /*
          * Offsets of the 2nd, 3rd and 4th window samples from the row start.
          * An offset does not grow where its condition on dx_l / dx_r fails, so
          * the previous source pixel is read again for that window position.
          */
  1137   if ((1 > dx_l) && (1 < wid + KSIZE1 - dx_r)) delta_chan1 = chan1;
       
  1138   else delta_chan1 = 0;
       
  1139 
       
  1140   if ((2 > dx_l) && (2 < wid + KSIZE1 - dx_r)) delta_chan2 = delta_chan1 + chan1;
       
  1141   else delta_chan2 = delta_chan1;
       
  1142 
       
  1143   if ((3 > dx_l) && (3 < wid + KSIZE1 - dx_r)) delta_chan3 = delta_chan2 + chan1;
       
  1144   else delta_chan3 = delta_chan2;
       
  1145 
       
         /* Offset of the first pixel read inside the loops, one pixel past the last primed sample. */
  1146   chan4 = chan1 + delta_chan3;
       
  1147 
       
  1148   for (c = 0; c < chan1; c++) {
       
           /* Process channel c only if bit (chan1 - 1 - c) of cmask is set. */
  1149     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
       
  1150 
       
  1151     sl = adr_src + c;
       
  1152     dl = adr_dst + c;
       
  1153 
       
           /*
            * Set up the first rows of the window.  sl is advanced only where the
            * conditions on dy_t / dy_b hold, so consecutive pointers may refer to
            * the same source row.
            */
  1154     sp_1 = sl;
       
  1155 
       
  1156     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl += sll;
       
  1157     sp_2 = sl;
       
  1158 
       
  1159     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl += sll;
       
  1160     sp_3 = sl;
       
  1161 
       
  1162     if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl += sll;
       
  1163     sp_4 = sl;
       
  1164 
       
  1165     if ((hgt - dy_b) > 0) sl += sll;
       
  1166 
       
  1167     for (j = 0; j < hgt; j++) {
       
  1168       mlib_s32 pix0, pix1;
       
  1169 
       
             /*
              * Slide the five-row window down by one: the former sp_1 becomes the
              * top kernel row (sp0) and sl becomes the bottom one (sp4).
              */
  1170       dp  = dl;
       
  1171       sp0 = sp_1;
       
  1172       sp_1 = sp_2;
       
  1173       sp_2 = sp_3;
       
  1174       sp_3 = sp_4;
       
  1175       sp_4 = sl;
       
  1176 
       
  1177       sp1 = sp_1;
       
  1178       sp2 = sp_2;
       
  1179       sp3 = sp_3;
       
  1180       sp4 = sp_4;
       
  1181 
       
  1182       /*
       
  1183        *  First loop
       
              *  Kernel rows 0 and 1 (k[0..9]) applied to rows sp0/sp1; the sums
              *  are stored in buffd.
  1184        */
       
  1185 
       
  1186       k0 = k[0]; k1 = k[1]; k2 = k[2]; k3 = k[3]; k4 = k[4];
       
  1187       k5 = k[5]; k6 = k[6]; k7 = k[7]; k8 = k[8]; k9 = k[9];
       
  1188 
       
             /* Prime the window with the first four samples of each row. */
  1189       p02 = sp0[0];           p12 = sp1[0];
       
  1190       p03 = sp0[delta_chan1]; p13 = sp1[delta_chan1];
       
  1191       p04 = sp0[delta_chan2]; p14 = sp1[delta_chan2];
       
  1192       p05 = sp0[delta_chan3]; p15 = sp1[delta_chan3];
       
  1193 
       
  1194       sp0 += chan4;
       
  1195       sp1 += chan4;
       
  1196 
       
             /* Two output pixels per iteration while new source pixels are available. */
  1197 #ifdef __SUNPRO_C
       
  1198 #pragma pipeloop(0)
       
  1199 #endif /* __SUNPRO_C */
       
  1200       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
       
  1201         p00 = p02; p10 = p12;
       
  1202         p01 = p03; p11 = p13;
       
  1203         p02 = p04; p12 = p14;
       
  1204         p03 = p05; p13 = p15;
       
  1205 
       
  1206         p04 = sp0[0];     p14 = sp1[0];
       
  1207         p05 = sp0[chan1]; p15 = sp1[chan1];
       
  1208 
       
  1209         buffd[i    ] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
       
  1210                         p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
       
  1211         buffd[i + 1] = (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
       
  1212                         p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
       
  1213 
       
  1214         sp0 += chan2;
       
  1215         sp1 += chan2;
       
  1216       }
       
  1217 
       
             /* Shift the window by one so the one-pixel-per-iteration loops below can continue from it. */
  1218       p01 = p02; p02 = p03; p03 = p04; p04 = p05;
       
  1219       p11 = p12; p12 = p13; p13 = p14; p14 = p15;
       
  1220 
       
  1221       for (; i < wid - dx_r; i++) {
       
  1222         p00 = p01; p10 = p11;
       
  1223         p01 = p02; p11 = p12;
       
  1224         p02 = p03; p12 = p13;
       
  1225         p03 = p04; p13 = p14;
       
  1226 
       
  1227         p04 = sp0[0];     p14 = sp1[0];
       
  1228 
       
  1229         buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
       
  1230                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
       
  1231 
       
  1232         sp0 += chan1;
       
  1233         sp1 += chan1;
       
  1234       }
       
  1235 
       
             /*
              * Step back to the last pixel read.  The loop below does not advance
              * the pointers, so that pixel is read again for each of the remaining
              * outputs up to wid.
              */
  1236       sp0 -= chan1;
       
  1237       sp1 -= chan1;
       
  1238 
       
  1239       for (; i < wid; i++) {
       
  1240         p00 = p01; p10 = p11;
       
  1241         p01 = p02; p11 = p12;
       
  1242         p02 = p03; p12 = p13;
       
  1243         p03 = p04; p13 = p14;
       
  1244 
       
  1245         p04 = sp0[0];     p14 = sp1[0];
       
  1246 
       
  1247         buffd[i] = (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
       
  1248                     p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
       
  1249       }
       
  1250 
       
  1251       /*
       
  1252        *  Second loop
       
              *  Kernel rows 2 and 3 (k[10..19]) applied to rows sp2/sp3 and added
              *  to buffd, using the same three-stage loop structure as above.
  1253        */
       
  1254 
       
  1255       k0 = k[10]; k1 = k[11]; k2 = k[12]; k3 = k[13]; k4 = k[14];
       
  1256       k5 = k[15]; k6 = k[16]; k7 = k[17]; k8 = k[18]; k9 = k[19];
       
  1257 
       
  1258       p02 = sp2[0];           p12 = sp3[0];
       
  1259       p03 = sp2[delta_chan1]; p13 = sp3[delta_chan1];
       
  1260       p04 = sp2[delta_chan2]; p14 = sp3[delta_chan2];
       
  1261       p05 = sp2[delta_chan3]; p15 = sp3[delta_chan3];
       
  1262 
       
  1263       sp2 += chan4;
       
  1264       sp3 += chan4;
       
  1265 
       
  1266 #ifdef __SUNPRO_C
       
  1267 #pragma pipeloop(0)
       
  1268 #endif /* __SUNPRO_C */
       
  1269       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
       
  1270         p00 = p02; p10 = p12;
       
  1271         p01 = p03; p11 = p13;
       
  1272         p02 = p04; p12 = p14;
       
  1273         p03 = p05; p13 = p15;
       
  1274 
       
  1275         p04 = sp2[0];     p14 = sp3[0];
       
  1276         p05 = sp2[chan1]; p15 = sp3[chan1];
       
  1277 
       
  1278         buffd[i    ] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
       
  1279                          p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
       
  1280         buffd[i + 1] += (p01 * k0 + p02 * k1 + p03 * k2 + p04 * k3 + p05 * k4 +
       
  1281                          p11 * k5 + p12 * k6 + p13 * k7 + p14 * k8 + p15 * k9);
       
  1282 
       
  1283         sp2 += chan2;
       
  1284         sp3 += chan2;
       
  1285       }
       
  1286 
       
  1287       p01 = p02; p02 = p03; p03 = p04; p04 = p05;
       
  1288       p11 = p12; p12 = p13; p13 = p14; p14 = p15;
       
  1289 
       
  1290       for (; i < wid - dx_r; i++) {
       
  1291         p00 = p01; p10 = p11;
       
  1292         p01 = p02; p11 = p12;
       
  1293         p02 = p03; p12 = p13;
       
  1294         p03 = p04; p13 = p14;
       
  1295 
       
  1296         p04 = sp2[0];     p14 = sp3[0];
       
  1297 
       
  1298         buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
       
  1299                      p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
       
  1300 
       
  1301         sp2 += chan1;
       
  1302         sp3 += chan1;
       
  1303       }
       
  1304 
       
             /* Step back to the last pixel read; it is re-read for the remaining outputs. */
  1305       sp2 -= chan1;
       
  1306       sp3 -= chan1;
       
  1307 
       
  1308       for (; i < wid; i++) {
       
  1309         p00 = p01; p10 = p11;
       
  1310         p01 = p02; p11 = p12;
       
  1311         p02 = p03; p12 = p13;
       
  1312         p03 = p04; p13 = p14;
       
  1313 
       
  1314         p04 = sp2[0];     p14 = sp3[0];
       
  1315 
       
  1316         buffd[i] += (p00 * k0 + p01 * k1 + p02 * k2 + p03 * k3 + p04 * k4 +
       
  1317                      p10 * k5 + p11 * k6 + p12 * k7 + p13 * k8 + p14 * k9);
       
  1318       }
       
  1319 
       
  1320       /*
       
  1321        *  Third loop
       
              *  Kernel row 4 (k[20..24]) applied to row sp4.  The sum from buffd is
              *  added, the total is shifted right by shift2 and written to the
              *  destination through CLAMP_STORE.
  1322        */
       
  1323 
       
  1324       k0 = k[20]; k1 = k[21]; k2 = k[22]; k3 = k[23]; k4 = k[24];
       
  1325 
       
  1326       p02 = sp4[0];
       
  1327       p03 = sp4[delta_chan1];
       
  1328       p04 = sp4[delta_chan2];
       
  1329       p05 = sp4[delta_chan3];
       
  1330 
       
  1331       sp4 += chan4;
       
  1332 
       
  1333 #ifdef __SUNPRO_C
       
  1334 #pragma pipeloop(0)
       
  1335 #endif /* __SUNPRO_C */
       
  1336       for (i = 0; i <= (wid - dx_r - 2); i += 2) {
       
  1337         p00 = p02; p01 = p03; p02 = p04; p03 = p05;
       
  1338 
       
  1339         p04 = sp4[0]; p05 = sp4[chan1];
       
  1340 
       
  1341         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
       
  1342                 p03 * k3 + p04 * k4) >> shift2;
       
  1343         pix1 = (buffd[i + 1] + p01 * k0 + p02 * k1 + p03 * k2 +
       
  1344                 p04 * k3 + p05 * k4) >> shift2;
       
  1345 
       
  1346         CLAMP_STORE(dp[0],     pix0);
       
  1347         CLAMP_STORE(dp[chan1], pix1);
       
  1348 
       
  1349         dp  += chan2;
       
  1350         sp4 += chan2;
       
  1351       }
       
  1352 
       
  1353       p01 = p02; p02 = p03; p03 = p04; p04 = p05;
       
  1354 
       
  1355       for (; i < wid - dx_r; i++) {
       
  1356         p00 = p01; p01 = p02; p02 = p03; p03 = p04;
       
  1357 
       
  1358         p04 = sp4[0];
       
  1359 
       
  1360         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
       
  1361                 p03 * k3 + p04 * k4) >> shift2;
       
  1362         CLAMP_STORE(dp[0],     pix0);
       
  1363 
       
  1364         dp  += chan1;
       
  1365         sp4 += chan1;
       
  1366       }
       
  1367 
       
             /* Step back to the last pixel read; it is re-read for the remaining outputs. */
  1368       sp4 -= chan1;
       
  1369 
       
  1370       for (; i < wid; i++) {
       
  1371         p00 = p01; p01 = p02; p02 = p03; p03 = p04;
       
  1372 
       
  1373         p04 = sp4[0];
       
  1374 
       
  1375         pix0 = (buffd[i    ] + p00 * k0 + p01 * k1 + p02 * k2 +
       
  1376                 p03 * k3 + p04 * k4) >> shift2;
       
  1377         CLAMP_STORE(dp[0],     pix0);
       
  1378 
       
  1379         dp  += chan1;
       
  1380       }
       
  1381 
       
  1382       /* next line */
       
  1383 
       
             /* The source row stops advancing near the bottom, so the same row is used again. */
  1384       if (j < hgt - dy_b - 1) sl += sll;
       
  1385       dl += dll;
       
  1386     }
       
  1387   }
       
  1388 
       
         /* Release the heap block if one was allocated above. */
  1389   if (pbuff != buff) mlib_free(pbuff);
       
  1390 
       
  1391   return MLIB_SUCCESS;
       
  1392 }
       
  1393 
       
  1394 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
       
  1395 
       
  1396 /***************************************************************/
       
  1397 #if IMG_TYPE == 1
       
  1398 
       
  1399 #undef  KSIZE
       
  1400 #define KSIZE 7
       
  1401 
       
  1402 mlib_status CONV_FUNC(7x7)
       
  1403 {
       
  1404   FTYPE    buff[(KSIZE + 3)*BUFF_LINE], *buffs[2*(KSIZE + 1)], *buffd;
       
  1405   FTYPE    k[KSIZE*KSIZE];
       
  1406   mlib_s32 l, m, buff_ind;
       
  1407   mlib_s32 d0, d1;
       
  1408   FTYPE    k0, k1, k2, k3, k4, k5, k6;
       
  1409   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
       
  1410   DTYPE *sl2, *sl3, *sl4, *sl5, *sl6;
       
  1411   DEF_VARS(DTYPE);
       
  1412   LOAD_KERNEL(KSIZE*KSIZE);
       
  1413   GET_SRC_DST_PARAMETERS(DTYPE);
       
  1414 
       
  1415   swid = wid + KSIZE1;
       
  1416 
       
  1417   if (wid > BUFF_LINE) {
       
  1418     pbuff = mlib_malloc((KSIZE + 3)*sizeof(FTYPE   )*wid);
       
  1419 
       
  1420     if (pbuff == NULL) return MLIB_FAILURE;
       
  1421   }
       
  1422 
       
  1423   for (l = 0; l < KSIZE + 1; l++) buffs[l] = pbuff + l*swid;
       
  1424   for (l = 0; l < KSIZE + 1; l++) buffs[l + (KSIZE + 1)] = buffs[l];
       
  1425   buffd = buffs[KSIZE] + swid;
       
  1426   buffo = (mlib_s32*)(buffd + swid);
       
  1427   buffi = buffo + (swid &~ 1);
       
  1428 
       
  1429   swid -= (dx_l + dx_r);
       
  1430 
       
  1431   chan1 = nchannel;
       
  1432   chan2 = chan1 + chan1;
       
  1433 
       
  1434   for (c = 0; c < nchannel; c++) {
       
  1435     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
       
  1436 
       
  1437     sl = adr_src + c;
       
  1438     dl = adr_dst + c;
       
  1439 
       
  1440     if ((1 > dy_t) && (1 < hgt + KSIZE1 - dy_b)) sl1 = sl + sll;
       
  1441     else sl1 = sl;
       
  1442 
       
  1443     if ((2 > dy_t) && (2 < hgt + KSIZE1 - dy_b)) sl2 = sl1 + sll;
       
  1444     else sl2 = sl1;
       
  1445 
       
  1446     if ((3 > dy_t) && (3 < hgt + KSIZE1 - dy_b)) sl3 = sl2 + sll;
       
  1447     else sl3 = sl2;
       
  1448 
       
  1449     if ((4 > dy_t) && (4 < hgt + KSIZE1 - dy_b)) sl4 = sl3 + sll;
       
  1450     else sl4 = sl3;
       
  1451 
       
  1452     if ((5 > dy_t) && (5 < hgt + KSIZE1 - dy_b)) sl5 = sl4 + sll;
       
  1453     else sl5 = sl4;
       
  1454 
       
  1455     if ((hgt - dy_b) > 0) sl6 = sl5 + sll;
       
  1456     else sl6 = sl5;
       
  1457 
       
  1458     for (i = 0; i < dx_l; i++) {
       
  1459       buffs[0][i] = (FTYPE)sl[0];
       
  1460       buffs[1][i] = (FTYPE)sl1[0];
       
  1461       buffs[2][i] = (FTYPE)sl2[0];
       
  1462       buffs[3][i] = (FTYPE)sl3[0];
       
  1463       buffs[4][i] = (FTYPE)sl4[0];
       
  1464       buffs[5][i] = (FTYPE)sl5[0];
       
  1465       buffs[6][i] = (FTYPE)sl6[0];
       
  1466     }
       
  1467 
       
  1468 #ifdef __SUNPRO_C
       
  1469 #pragma pipeloop(0)
       
  1470 #endif /* __SUNPRO_C */
       
  1471     for (i = 0; i < swid; i++) {
       
  1472       buffs[0][i + dx_l] = (FTYPE)sl[i*chan1];
       
  1473       buffs[1][i + dx_l] = (FTYPE)sl1[i*chan1];
       
  1474       buffs[2][i + dx_l] = (FTYPE)sl2[i*chan1];
       
  1475       buffs[3][i + dx_l] = (FTYPE)sl3[i*chan1];
       
  1476       buffs[4][i + dx_l] = (FTYPE)sl4[i*chan1];
       
  1477       buffs[5][i + dx_l] = (FTYPE)sl5[i*chan1];
       
  1478       buffs[6][i + dx_l] = (FTYPE)sl6[i*chan1];
       
  1479     }
       
  1480 
       
  1481     for (i = 0; i < dx_r; i++) {
       
  1482       buffs[0][swid + dx_l + i] = buffs[0][swid + dx_l - 1];
       
  1483       buffs[1][swid + dx_l + i] = buffs[1][swid + dx_l - 1];
       
  1484       buffs[2][swid + dx_l + i] = buffs[2][swid + dx_l - 1];
       
  1485       buffs[3][swid + dx_l + i] = buffs[3][swid + dx_l - 1];
       
  1486       buffs[4][swid + dx_l + i] = buffs[4][swid + dx_l - 1];
       
  1487       buffs[5][swid + dx_l + i] = buffs[5][swid + dx_l - 1];
       
  1488       buffs[6][swid + dx_l + i] = buffs[6][swid + dx_l - 1];
       
  1489     }
       
  1490 
       
  1491     buff_ind = 0;
       
  1492 
       
  1493 #ifdef __SUNPRO_C
       
  1494 #pragma pipeloop(0)
       
  1495 #endif /* __SUNPRO_C */
       
  1496     for (i = 0; i < wid; i++) buffd[i] = 0.0;
       
  1497 
       
  1498     if ((hgt - dy_b) > 1) sl = sl6 + sll;
       
  1499     else sl = sl6;
       
  1500 
       
  1501     for (j = 0; j < hgt; j++) {
       
  1502       FTYPE    **buffc = buffs + buff_ind;
       
  1503       FTYPE    *buffn = buffc[KSIZE];
       
  1504       FTYPE    *pk = k;
       
  1505 
       
  1506       for (l = 0; l < KSIZE; l++) {
       
  1507         FTYPE    *buff = buffc[l];
       
  1508         d64_2x32 dd;
       
  1509 
       
  1510         sp = sl;
       
  1511         dp = dl;
       
  1512 
       
  1513         p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  1514         p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
       
  1515 
       
  1516         k0 = *pk++; k1 = *pk++; k2 = *pk++; k3 = *pk++;
       
  1517         k4 = *pk++; k5 = *pk++; k6 = *pk++;
       
  1518 
       
  1519         if (l < (KSIZE - 1)) {
       
  1520 #ifdef __SUNPRO_C
       
  1521 #pragma pipeloop(0)
       
  1522 #endif /* __SUNPRO_C */
       
  1523           for (i = 0; i <= (wid - 2); i += 2) {
       
  1524             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
       
  1525 
       
  1526             p6 = buff[i + 6]; p7 = buff[i + 7];
       
  1527 
       
  1528             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
       
  1529             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
       
  1530           }
       
  1531 
       
  1532         } else {
       
  1533 #ifdef __SUNPRO_C
       
  1534 #pragma pipeloop(0)
       
  1535 #endif /* __SUNPRO_C */
       
  1536           for (i = 0; i <= (wid - 2); i += 2) {
       
  1537             p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
       
  1538 
       
  1539             p6 = buff[i + 6]; p7 = buff[i + 7];
       
  1540 
       
  1541             LOAD_BUFF(buffi);
       
  1542 
       
  1543             dd.d64 = *(FTYPE   *)(buffi + i);
       
  1544             buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
       
  1545             buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
       
  1546 
       
  1547             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
       
  1548             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
       
  1549 
       
  1550             dp[0    ] = FROM_S32(d0);
       
  1551             dp[chan1] = FROM_S32(d1);
       
  1552 
       
  1553             buffd[i    ] = 0.0;
       
  1554             buffd[i + 1] = 0.0;
       
  1555 
       
  1556             sp += chan2;
       
  1557             dp += chan2;
       
  1558           }
       
  1559         }
       
  1560       }
       
  1561 
       
  1562       /* last pixels */
       
  1563       for (; i < wid; i++) {
       
  1564         FTYPE    *pk = k, s = 0;
       
  1565         mlib_s32 d0;
       
  1566 
       
  1567         for (l = 0; l < KSIZE; l++) {
       
  1568           FTYPE    *buff = buffc[l] + i;
       
  1569 
       
  1570           for (m = 0; m < KSIZE; m++) s += buff[m] * (*pk++);
       
  1571         }
       
  1572 
       
  1573         d0 = D2I(s);
       
  1574         dp[0] = FROM_S32(d0);
       
  1575 
       
  1576         buffn[i + dx_l] = (FTYPE)sp[0];
       
  1577 
       
  1578         sp += chan1;
       
  1579         dp += chan1;
       
  1580       }
       
  1581 
       
  1582       for (; i < swid; i++) {
       
  1583         buffn[i + dx_l] = (FTYPE)sp[0];
       
  1584         sp += chan1;
       
  1585       }
       
  1586 
       
  1587       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
       
  1588       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
       
  1589 
       
  1590       /* next line */
       
  1591 
       
  1592       if (j < hgt - dy_b - 2) sl += sll;
       
  1593       dl += dll;
       
  1594 
       
  1595       buff_ind++;
       
  1596 
       
  1597       if (buff_ind >= KSIZE + 1) buff_ind = 0;
       
  1598     }
       
  1599   }
       
  1600 
       
  1601   if (pbuff != buff) mlib_free(pbuff);
       
  1602 
       
  1603   return MLIB_SUCCESS;
       
  1604 }
       
  1605 
       
  1606 #endif /* IMG_TYPE == 1 */
       
  1607 
       
  1608 /***************************************************************/
       
  1609 #define MAX_KER   7
       
  1610 #define MAX_N    15
       
  1611 #define BUFF_SIZE   1600
       
  1612 #define CACHE_SIZE  (64*1024)
       
  1613 
       
  1614 static mlib_status mlib_ImageConv1xN_ext(mlib_image       *dst,
       
  1615                                          const mlib_image *src,
       
  1616                                          const mlib_d64   *k,
       
  1617                                          mlib_s32         n,
       
  1618                                          mlib_s32         dy_t,
       
  1619                                          mlib_s32         dy_b,
       
  1620                                          mlib_s32         cmask)
       
  1621 {
       
  1622   DTYPE    *adr_src, *sl;
       
  1623   DTYPE    *adr_dst, *dl, *dp;
       
  1624   FTYPE    buff[BUFF_SIZE];
       
  1625   FTYPE    *buffd;
       
  1626   FTYPE    *pbuff = buff;
       
  1627   const FTYPE    *pk;
       
  1628   FTYPE    k0, k1, k2, k3;
       
  1629   FTYPE    p0, p1, p2, p3, p4;
       
  1630   FTYPE    *sbuff;
       
  1631   mlib_s32 l, k_off, off, bsize;
       
  1632   mlib_s32 max_hsize, smax_hsize, shgt, hsize, kh;
       
  1633   mlib_s32 d0, d1, ii;
       
  1634   mlib_s32 wid, hgt, sll, dll;
       
  1635   mlib_s32 nchannel;
       
  1636   mlib_s32 i, j, c;
       
  1637   GET_SRC_DST_PARAMETERS(DTYPE);
       
  1638 
       
  1639   max_hsize = ((CACHE_SIZE/sizeof(DTYPE))/sll) - (n - 1);
       
  1640 
       
  1641   if (max_hsize < 1) max_hsize = 1;
       
  1642   if (max_hsize > hgt) max_hsize = hgt;
       
  1643 
       
  1644   shgt = hgt + (n - 1);
       
  1645   smax_hsize = max_hsize + (n - 1);
       
  1646 
       
  1647   bsize = 2 * (smax_hsize + 1);
       
  1648 
       
  1649   if (bsize > BUFF_SIZE) {
       
  1650     pbuff = mlib_malloc(sizeof(FTYPE)*bsize);
       
  1651 
       
  1652     if (pbuff == NULL) return MLIB_FAILURE;
       
  1653   }
       
  1654 
       
  1655   sbuff = pbuff;
       
  1656   buffd = sbuff + smax_hsize;
       
  1657 
       
  1658   shgt -= (dy_t + dy_b);
       
  1659   k_off = 0;
       
  1660 
       
  1661   for (l = 0; l < hgt; l += hsize) {
       
  1662     hsize = hgt - l;
       
  1663 
       
  1664     if (hsize > max_hsize) hsize = max_hsize;
       
  1665 
       
  1666     smax_hsize = hsize + (n - 1);
       
  1667 
       
  1668     for (c = 0; c < nchannel; c++) {
       
  1669       if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
       
  1670 
       
  1671       sl = adr_src + c;
       
  1672       dl = adr_dst + c;
       
  1673 
       
  1674 #ifdef __SUNPRO_C
       
  1675 #pragma pipeloop(0)
       
  1676 #endif /* __SUNPRO_C */
       
  1677       for (i = 0; i < hsize; i++) buffd[i] = 0.0;
       
  1678 
       
  1679       for (j = 0; j < wid; j++) {
       
  1680         FTYPE    *buff = sbuff;
       
  1681 
       
  1682         for (i = k_off, ii = 0; (i < dy_t) && (ii < smax_hsize); i++, ii++) {
       
  1683           sbuff[i - k_off] = (FTYPE)sl[0];
       
  1684         }
       
  1685 
       
  1686 #ifdef __SUNPRO_C
       
  1687 #pragma pipeloop(0)
       
  1688 #endif /* __SUNPRO_C */
       
  1689         for (; (i < shgt + dy_t) && (ii < smax_hsize); i++, ii++) {
       
  1690           sbuff[i - k_off] = (FTYPE)sl[(i - dy_t)*sll];
       
  1691         }
       
  1692 
       
  1693         for (; (i < shgt + dy_t + dy_b) && (ii < smax_hsize); i++, ii++) {
       
  1694           sbuff[i - k_off] = (FTYPE)sl[(shgt - 1)*sll];
       
  1695         }
       
  1696 
       
  1697         pk = k;
       
  1698 
       
  1699         for (off = 0; off < (n - 4); off += 4) {
       
  1700 
       
  1701           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  1702           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
       
  1703 
       
  1704 #ifdef __SUNPRO_C
       
  1705 #pragma pipeloop(0)
       
  1706 #endif /* __SUNPRO_C */
       
  1707           for (i = 0; i < hsize; i += 2) {
       
  1708             p0 = p2; p1 = p3; p2 = p4;
       
  1709 
       
  1710             p3 = buff[i + 3]; p4 = buff[i + 4];
       
  1711 
       
  1712             buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
       
  1713             buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
       
  1714           }
       
  1715 
       
  1716           pk += 4;
       
  1717           buff += 4;
       
  1718         }
       
  1719 
       
  1720         dp = dl;
       
  1721         kh = n - off;
       
  1722 
       
  1723         if (kh == 4) {
       
  1724           p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  1725           k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
       
  1726 
       
  1727 #ifdef __SUNPRO_C
       
  1728 #pragma pipeloop(0)
       
  1729 #endif /* __SUNPRO_C */
       
  1730           for (i = 0; i <= (hsize - 2); i += 2) {
       
  1731             p0 = p2; p1 = p3; p2 = p4;
       
  1732 
       
  1733             p3 = buff[i + 3]; p4 = buff[i + 4];
       
  1734 
       
  1735             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
       
  1736             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
       
  1737 
       
  1738             dp[0  ] = FROM_S32(d0);
       
  1739             dp[dll] = FROM_S32(d1);
       
  1740 
       
  1741             buffd[i    ] = 0.0;
       
  1742             buffd[i + 1] = 0.0;
       
  1743 
       
  1744             dp += 2*dll;
       
  1745           }
       
  1746 
       
  1747           if (i < hsize) {
       
  1748             p0 = p2; p1 = p3; p2 = p4;
       
  1749             p3 = buff[i + 3];
       
  1750             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i]);
       
  1751             dp[0] = FROM_S32(d0);
       
  1752             buffd[i] = 0.0;
       
  1753           }
       
  1754 
       
  1755         } else if (kh == 3) {
       
  1756 
       
  1757           p2 = buff[0]; p3 = buff[1];
       
  1758           k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
       
  1759 
       
  1760 #ifdef __SUNPRO_C
       
  1761 #pragma pipeloop(0)
       
  1762 #endif /* __SUNPRO_C */
       
  1763           for (i = 0; i <= (hsize - 2); i += 2) {
       
  1764             p0 = p2; p1 = p3;
       
  1765 
       
  1766             p2 = buff[i + 2]; p3 = buff[i + 3];
       
  1767 
       
  1768             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
       
  1769             d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
       
  1770 
       
  1771             dp[0  ] = FROM_S32(d0);
       
  1772             dp[dll] = FROM_S32(d1);
       
  1773 
       
  1774             buffd[i    ] = 0.0;
       
  1775             buffd[i + 1] = 0.0;
       
  1776 
       
  1777             dp += 2*dll;
       
  1778           }
       
  1779 
       
  1780           if (i < hsize) {
       
  1781             p0 = p2; p1 = p3;
       
  1782             p2 = buff[i + 2];
       
  1783             d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i]);
       
  1784             dp[0] = FROM_S32(d0);
       
  1785 
       
  1786             buffd[i] = 0.0;
       
  1787           }
       
  1788 
       
  1789         } else if (kh == 2) {
       
  1790 
       
  1791           p2 = buff[0];
       
  1792           k0 = pk[0]; k1 = pk[1];
       
  1793 
       
  1794 #ifdef __SUNPRO_C
       
  1795 #pragma pipeloop(0)
       
  1796 #endif /* __SUNPRO_C */
       
  1797           for (i = 0; i <= (hsize - 2); i += 2) {
       
  1798             p0 = p2;
       
  1799 
       
  1800             p1 = buff[i + 1]; p2 = buff[i + 2];
       
  1801 
       
  1802             d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
       
  1803             d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
       
  1804 
       
  1805             dp[0  ] = FROM_S32(d0);
       
  1806             dp[dll] = FROM_S32(d1);
       
  1807 
       
  1808             buffd[i    ] = 0.0;
       
  1809             buffd[i + 1] = 0.0;
       
  1810 
       
  1811             dp += 2*dll;
       
  1812           }
       
  1813 
       
  1814           if (i < hsize) {
       
  1815             p0 = p2;
       
  1816             p1 = buff[i + 1];
       
  1817             d0 = D2I(p0*k0 + p1*k1 + buffd[i]);
       
  1818             dp[0] = FROM_S32(d0);
       
  1819 
       
  1820             buffd[i] = 0.0;
       
  1821           }
       
  1822 
       
  1823         } else /* kh == 1 */{
       
  1824 
       
  1825           k0 = pk[0];
       
  1826 
       
  1827 #ifdef __SUNPRO_C
       
  1828 #pragma pipeloop(0)
       
  1829 #endif /* __SUNPRO_C */
       
  1830           for (i = 0; i <= (hsize - 2); i += 2) {
       
  1831             p0 = buff[i]; p1 = buff[i + 1];
       
  1832 
       
  1833             d0 = D2I(p0*k0 + buffd[i    ]);
       
  1834             d1 = D2I(p1*k0 + buffd[i + 1]);
       
  1835 
       
  1836             dp[0  ] = FROM_S32(d0);
       
  1837             dp[dll] = FROM_S32(d1);
       
  1838 
       
  1839             buffd[i    ] = 0.0;
       
  1840             buffd[i + 1] = 0.0;
       
  1841 
       
  1842             dp += 2*dll;
       
  1843           }
       
  1844 
       
  1845           if (i < hsize) {
       
  1846             p0 = buff[i];
       
  1847             d0 = D2I(p0*k0 + buffd[i]);
       
  1848             dp[0] = FROM_S32(d0);
       
  1849 
       
  1850             buffd[i] = 0.0;
       
  1851           }
       
  1852         }
       
  1853 
       
  1854         /* next line */
       
  1855         sl += nchannel;
       
  1856         dl += nchannel;
       
  1857       }
       
  1858     }
       
  1859 
       
  1860     k_off += max_hsize;
       
  1861     adr_dst += max_hsize*dll;
       
  1862   }
       
  1863 
       
  1864   if (pbuff != buff) mlib_free(pbuff);
       
  1865 
       
  1866   return MLIB_SUCCESS;
       
  1867 }
       
  1868 
       
  1869 /***************************************************************/
       
  1870 mlib_status CONV_FUNC_MxN
       
  1871 {
       
  1872   DTYPE    *adr_src, *sl, *sp;
       
  1873   DTYPE    *adr_dst, *dl, *dp;
       
  1874   FTYPE    buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
       
  1875   FTYPE    **buffs = buffs_arr, *buffd;
       
  1876   FTYPE    akernel[256], *k = akernel, fscale = DSCALE;
       
  1877   FTYPE    *pbuff = buff;
       
  1878   FTYPE    k0, k1, k2, k3, k4, k5, k6;
       
  1879   FTYPE    p0, p1, p2, p3, p4, p5, p6, p7;
       
  1880   mlib_s32 *buffi;
       
  1881   mlib_s32 mn, l, off, kw, bsize, buff_ind;
       
  1882   mlib_s32 d0, d1;
       
  1883   mlib_s32 wid, hgt, sll, dll;
       
  1884   mlib_s32 nchannel, chan1, chan2;
       
  1885   mlib_s32 i, j, c, swid;
       
  1886   d64_2x32 dd;
       
  1887   GET_SRC_DST_PARAMETERS(DTYPE);
       
  1888 
       
  1889   if (scale > 30) {
       
  1890     fscale *= 1.0/(1 << 30);
       
  1891     scale -= 30;
       
  1892   }
       
  1893 
       
  1894   fscale /= (1 << scale);
       
  1895 
       
  1896   mn = m*n;
       
  1897 
       
  1898   if (mn > 256) {
       
  1899     k = mlib_malloc(mn*sizeof(mlib_d64));
       
  1900 
       
  1901     if (k == NULL) return MLIB_FAILURE;
       
  1902   }
       
  1903 
       
  1904   for (i = 0; i < mn; i++) {
       
  1905     k[i] = kernel[i]*fscale;
       
  1906   }
       
  1907 
       
  1908   if (m == 1) return mlib_ImageConv1xN_ext(dst, src, k, n, dy_t, dy_b, cmask);
       
  1909 
       
  1910   swid = wid + (m - 1);
       
  1911 
       
  1912   bsize = (n + 3)*swid;
       
  1913 
       
  1914   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
       
  1915     pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));
       
  1916 
       
  1917     if (pbuff == NULL) return MLIB_FAILURE;
       
  1918     buffs = (FTYPE   **)(pbuff + bsize);
       
  1919   }
       
  1920 
       
  1921   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
       
  1922   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
       
  1923   buffd = buffs[n] + swid;
       
  1924   buffi = (mlib_s32*)(buffd + swid);
       
  1925 
       
  1926   chan1 = nchannel;
       
  1927   chan2 = chan1 + chan1;
       
  1928 
       
  1929   swid -= (dx_l + dx_r);
       
  1930 
       
  1931   for (c = 0; c < nchannel; c++) {
       
  1932     if (!(cmask & (1 << (chan1 - 1 - c)))) continue;
       
  1933 
       
  1934     sl = adr_src + c;
       
  1935     dl = adr_dst + c;
       
  1936 
       
  1937     for (l = 0; l < n; l++) {
       
  1938       FTYPE    *buff = buffs[l];
       
  1939 
       
  1940       for (i = 0; i < dx_l; i++) {
       
  1941         buff[i] = (FTYPE)sl[0];
       
  1942       }
       
  1943 
       
  1944 #ifdef __SUNPRO_C
       
  1945 #pragma pipeloop(0)
       
  1946 #endif /* __SUNPRO_C */
       
  1947       for (i = 0; i < swid; i++) {
       
  1948         buff[i + dx_l] = (FTYPE)sl[i*chan1];
       
  1949       }
       
  1950 
       
  1951       for (i = 0; i < dx_r; i++) {
       
  1952         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
       
  1953       }
       
  1954 
       
  1955       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
       
  1956     }
       
  1957 
       
  1958     buff_ind = 0;
       
  1959 
       
  1960 #ifdef __SUNPRO_C
       
  1961 #pragma pipeloop(0)
       
  1962 #endif /* __SUNPRO_C */
       
  1963     for (i = 0; i < wid; i++) buffd[i] = 0.0;
       
  1964 
       
  1965     for (j = 0; j < hgt; j++) {
       
  1966       FTYPE    **buffc = buffs + buff_ind;
       
  1967       FTYPE    *buffn = buffc[n];
       
  1968       FTYPE    *pk = k;
       
  1969 
       
  1970       for (l = 0; l < n; l++) {
       
  1971         FTYPE    *buff_l = buffc[l];
       
  1972 
       
  1973         for (off = 0; off < m;) {
       
  1974           FTYPE    *buff = buff_l + off;
       
  1975 
       
  1976           kw = m - off;
       
  1977 
       
  1978           if (kw > 2*MAX_KER) kw = MAX_KER; else
       
  1979             if (kw > MAX_KER) kw = kw/2;
       
  1980           off += kw;
       
  1981 
       
  1982           sp = sl;
       
  1983           dp = dl;
       
  1984 
       
  1985           if (kw == 7) {
       
  1986 
       
  1987             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  1988             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
       
  1989 
       
  1990             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
       
  1991             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
       
  1992 
       
  1993             if (l < (n - 1) || off < m) {
       
  1994 #ifdef __SUNPRO_C
       
  1995 #pragma pipeloop(0)
       
  1996 #endif /* __SUNPRO_C */
       
  1997               for (i = 0; i <= (wid - 2); i += 2) {
       
  1998                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
       
  1999 
       
  2000                 p6 = buff[i + 6]; p7 = buff[i + 7];
       
  2001 
       
  2002                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
       
  2003                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
       
  2004               }
       
  2005 
       
  2006             } else {
       
  2007 #ifdef __SUNPRO_C
       
  2008 #pragma pipeloop(0)
       
  2009 #endif /* __SUNPRO_C */
       
  2010               for (i = 0; i <= (wid - 2); i += 2) {
       
  2011                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
       
  2012 
       
  2013                 p6 = buff[i + 6]; p7 = buff[i + 7];
       
  2014 
       
  2015                 LOAD_BUFF(buffi);
       
  2016 
       
  2017                 dd.d64 = *(FTYPE   *)(buffi + i);
       
  2018                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
       
  2019                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
       
  2020 
       
  2021                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
       
  2022                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
       
  2023 
       
  2024                 dp[0    ] = FROM_S32(d0);
       
  2025                 dp[chan1] = FROM_S32(d1);
       
  2026 
       
  2027                 buffd[i    ] = 0.0;
       
  2028                 buffd[i + 1] = 0.0;
       
  2029 
       
  2030                 sp += chan2;
       
  2031                 dp += chan2;
       
  2032               }
       
  2033             }
       
  2034 
       
  2035           } else if (kw == 6) {
       
  2036 
       
  2037             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  2038             p5 = buff[3]; p6 = buff[4];
       
  2039 
       
  2040             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
       
  2041             k4 = pk[4]; k5 = pk[5];
       
  2042 
       
  2043             if (l < (n - 1) || off < m) {
       
  2044 #ifdef __SUNPRO_C
       
  2045 #pragma pipeloop(0)
       
  2046 #endif /* __SUNPRO_C */
       
  2047               for (i = 0; i <= (wid - 2); i += 2) {
       
  2048                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
       
  2049 
       
  2050                 p5 = buff[i + 5]; p6 = buff[i + 6];
       
  2051 
       
  2052                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
       
  2053                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
       
  2054               }
       
  2055 
       
  2056             } else {
       
  2057 #ifdef __SUNPRO_C
       
  2058 #pragma pipeloop(0)
       
  2059 #endif /* __SUNPRO_C */
       
  2060               for (i = 0; i <= (wid - 2); i += 2) {
       
  2061                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
       
  2062 
       
  2063                 p5 = buff[i + 5]; p6 = buff[i + 6];
       
  2064 
       
  2065                 LOAD_BUFF(buffi);
       
  2066 
       
  2067                 dd.d64 = *(FTYPE   *)(buffi + i);
       
  2068                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
       
  2069                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
       
  2070 
       
  2071                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
       
  2072                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
       
  2073 
       
  2074                 dp[0    ] = FROM_S32(d0);
       
  2075                 dp[chan1] = FROM_S32(d1);
       
  2076 
       
  2077                 buffd[i    ] = 0.0;
       
  2078                 buffd[i + 1] = 0.0;
       
  2079 
       
  2080                 sp += chan2;
       
  2081                 dp += chan2;
       
  2082               }
       
  2083             }
       
  2084 
       
  2085           } else if (kw == 5) {
       
  2086 
       
  2087             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  2088             p5 = buff[3];
       
  2089 
       
  2090             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
       
  2091             k4 = pk[4];
       
  2092 
       
  2093             if (l < (n - 1) || off < m) {
       
  2094 #ifdef __SUNPRO_C
       
  2095 #pragma pipeloop(0)
       
  2096 #endif /* __SUNPRO_C */
       
  2097               for (i = 0; i <= (wid - 2); i += 2) {
       
  2098                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
       
  2099 
       
  2100                 p4 = buff[i + 4]; p5 = buff[i + 5];
       
  2101 
       
  2102                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
       
  2103                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
       
  2104               }
       
  2105 
       
  2106             } else {
       
  2107 #ifdef __SUNPRO_C
       
  2108 #pragma pipeloop(0)
       
  2109 #endif /* __SUNPRO_C */
       
  2110               for (i = 0; i <= (wid - 2); i += 2) {
       
  2111                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
       
  2112 
       
  2113                 p4 = buff[i + 4]; p5 = buff[i + 5];
       
  2114 
       
  2115                 LOAD_BUFF(buffi);
       
  2116 
       
  2117                 dd.d64 = *(FTYPE   *)(buffi + i);
       
  2118                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
       
  2119                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
       
  2120 
       
  2121                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
       
  2122                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
       
  2123 
       
  2124                 dp[0    ] = FROM_S32(d0);
       
  2125                 dp[chan1] = FROM_S32(d1);
       
  2126 
       
  2127                 buffd[i    ] = 0.0;
       
  2128                 buffd[i + 1] = 0.0;
       
  2129 
       
  2130                 sp += chan2;
       
  2131                 dp += chan2;
       
  2132               }
       
  2133             }
       
  2134 
       
  2135           } else if (kw == 4) {
       
  2136 
       
  2137             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  2138 
       
  2139             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
       
  2140 
       
  2141             if (l < (n - 1) || off < m) {
       
  2142 #ifdef __SUNPRO_C
       
  2143 #pragma pipeloop(0)
       
  2144 #endif /* __SUNPRO_C */
       
  2145               for (i = 0; i <= (wid - 2); i += 2) {
       
  2146                 p0 = p2; p1 = p3; p2 = p4;
       
  2147 
       
  2148                 p3 = buff[i + 3]; p4 = buff[i + 4];
       
  2149 
       
  2150                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
       
  2151                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
       
  2152               }
       
  2153 
       
  2154             } else {
       
  2155 #ifdef __SUNPRO_C
       
  2156 #pragma pipeloop(0)
       
  2157 #endif /* __SUNPRO_C */
       
  2158               for (i = 0; i <= (wid - 2); i += 2) {
       
  2159                 p0 = p2; p1 = p3; p2 = p4;
       
  2160 
       
  2161                 p3 = buff[i + 3]; p4 = buff[i + 4];
       
  2162 
       
  2163                 LOAD_BUFF(buffi);
       
  2164 
       
  2165                 dd.d64 = *(FTYPE   *)(buffi + i);
       
  2166                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
       
  2167                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
       
  2168 
       
  2169                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
       
  2170                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
       
  2171 
       
  2172                 dp[0    ] = FROM_S32(d0);
       
  2173                 dp[chan1] = FROM_S32(d1);
       
  2174 
       
  2175                 buffd[i    ] = 0.0;
       
  2176                 buffd[i + 1] = 0.0;
       
  2177 
       
  2178                 sp += chan2;
       
  2179                 dp += chan2;
       
  2180               }
       
  2181             }
       
  2182 
       
  2183           } else if (kw == 3) {
       
  2184 
       
  2185             p2 = buff[0]; p3 = buff[1];
       
  2186             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
       
  2187 
       
  2188             if (l < (n - 1) || off < m) {
       
  2189 #ifdef __SUNPRO_C
       
  2190 #pragma pipeloop(0)
       
  2191 #endif /* __SUNPRO_C */
       
  2192               for (i = 0; i <= (wid - 2); i += 2) {
       
  2193                 p0 = p2; p1 = p3;
       
  2194 
       
  2195                 p2 = buff[i + 2]; p3 = buff[i + 3];
       
  2196 
       
  2197                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
       
  2198                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
       
  2199               }
       
  2200 
       
  2201             } else {
       
  2202 #ifdef __SUNPRO_C
       
  2203 #pragma pipeloop(0)
       
  2204 #endif /* __SUNPRO_C */
       
  2205               for (i = 0; i <= (wid - 2); i += 2) {
       
  2206                 p0 = p2; p1 = p3;
       
  2207 
       
  2208                 p2 = buff[i + 2]; p3 = buff[i + 3];
       
  2209 
       
  2210                 LOAD_BUFF(buffi);
       
  2211 
       
  2212                 dd.d64 = *(FTYPE   *)(buffi + i);
       
  2213                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
       
  2214                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
       
  2215 
       
  2216                 d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
       
  2217                 d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
       
  2218 
       
  2219                 dp[0    ] = FROM_S32(d0);
       
  2220                 dp[chan1] = FROM_S32(d1);
       
  2221 
       
  2222                 buffd[i    ] = 0.0;
       
  2223                 buffd[i + 1] = 0.0;
       
  2224 
       
  2225                 sp += chan2;
       
  2226                 dp += chan2;
       
  2227               }
       
  2228             }
       
  2229 
       
  2230           } else /* if (kw == 2) */ {
       
  2231 
       
  2232             p2 = buff[0];
       
  2233             k0 = pk[0]; k1 = pk[1];
       
  2234 
       
  2235             if (l < (n - 1) || off < m) {
       
  2236 #ifdef __SUNPRO_C
       
  2237 #pragma pipeloop(0)
       
  2238 #endif /* __SUNPRO_C */
       
  2239               for (i = 0; i <= (wid - 2); i += 2) {
       
  2240                 p0 = p2;
       
  2241 
       
  2242                 p1 = buff[i + 1]; p2 = buff[i + 2];
       
  2243 
       
  2244                 buffd[i    ] += p0*k0 + p1*k1;
       
  2245                 buffd[i + 1] += p1*k0 + p2*k1;
       
  2246               }
       
  2247 
       
  2248             } else {
       
  2249 #ifdef __SUNPRO_C
       
  2250 #pragma pipeloop(0)
       
  2251 #endif /* __SUNPRO_C */
       
  2252               for (i = 0; i <= (wid - 2); i += 2) {
       
  2253                 p0 = p2;
       
  2254 
       
  2255                 p1 = buff[i + 1]; p2 = buff[i + 2];
       
  2256 
       
  2257                 LOAD_BUFF(buffi);
       
  2258 
       
  2259                 dd.d64 = *(FTYPE   *)(buffi + i);
       
  2260                 buffn[i + dx_l    ] = (FTYPE)dd.i32s.i0;
       
  2261                 buffn[i + dx_l + 1] = (FTYPE)dd.i32s.i1;
       
  2262 
       
  2263                 d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
       
  2264                 d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);
       
  2265 
       
  2266                 dp[0    ] = FROM_S32(d0);
       
  2267                 dp[chan1] = FROM_S32(d1);
       
  2268 
       
  2269                 buffd[i    ] = 0.0;
       
  2270                 buffd[i + 1] = 0.0;
       
  2271 
       
  2272                 sp += chan2;
       
  2273                 dp += chan2;
       
  2274               }
       
  2275             }
       
  2276           }
       
  2277 
       
  2278           pk += kw;
       
  2279         }
       
  2280       }
       
  2281 
       
  2282       /* last pixels */
       
  2283       for (; i < wid; i++) {
       
  2284         FTYPE    *pk = k, s = 0;
       
  2285         mlib_s32 x, d0;
       
  2286 
       
  2287         for (l = 0; l < n; l++) {
       
  2288           FTYPE    *buff = buffc[l] + i;
       
  2289 
       
  2290           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
       
  2291         }
       
  2292 
       
  2293         d0 = D2I(s);
       
  2294         dp[0] = FROM_S32(d0);
       
  2295 
       
  2296         buffn[i + dx_l] = (FTYPE)sp[0];
       
  2297 
       
  2298         sp += chan1;
       
  2299         dp += chan1;
       
  2300       }
       
  2301 
       
  2302       for (; i < swid; i++) {
       
  2303         buffn[i + dx_l] = (FTYPE)sp[0];
       
  2304         sp += chan1;
       
  2305       }
       
  2306 
       
  2307       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
       
  2308       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
       
  2309 
       
  2310       /* next line */
       
  2311 
       
  2312       if (j < hgt - dy_b - 2) sl += sll;
       
  2313       dl += dll;
       
  2314 
       
  2315       buff_ind++;
       
  2316 
       
  2317       if (buff_ind >= n + 1) buff_ind = 0;
       
  2318     }
       
  2319   }
       
  2320 
       
  2321   if (pbuff != buff) mlib_free(pbuff);
       
  2322 
       
  2323   return MLIB_SUCCESS;
       
  2324 }
       
  2325 
       
  2326 /***************************************************************/
       
  2327 #ifndef __sparc /* for x86, using integer multiplies is faster */
       
  2328 
       
  /*
   * STORE_RES(res, x): shift the accumulated sum x right by shift2 and hand
   * the result to CLAMP_STORE, which writes it to res.
   *
   * - shift2 is not a macro argument; it is picked up from the scope where
   *   the macro is expanded.
   * - x is modified in place (x >>= shift2), so it must be a modifiable lvalue.
   * - The expansion is two statements and is not wrapped in do { } while (0),
   *   so it must only be used as a complete statement inside a braced block.
   * - CLAMP_STORE is defined outside this section; presumably it saturates
   *   the value to the destination pixel range -- TODO confirm.
   */
  2329 #define STORE_RES(res, x)                                       \
       
  2330   x >>= shift2;                                                 \
       
  2331   CLAMP_STORE(res, x)
       
  2332 
       
  2333 mlib_status CONV_FUNC_MxN_I
       /*
        * Integer-arithmetic M x N convolution with source-extended edges.
        *
        * The function name and parameter list come from the CONV_FUNC_MxN_I
        * macro, which is defined outside this section.  The body reads the
        * following names that are not declared locally, so they are presumably
        * parameters: kernel, m, n, dx_l, dx_r, dy_t, dy_b, scale, cmask
        * -- TODO confirm against the macro definition.
        *
        * Outline:
        *   1. Kernel coefficients are copied into k[] pre-shifted right by
        *      shift1; each finished sum is shifted right by shift2
        *      (= scale - shift1) and stored through STORE_RES.
        *   2. n + 1 row buffers of mlib_s32 hold the source rows currently
        *      needed (n rows for the kernel plus one row being filled for the
        *      next output line); buffd holds per-pixel partial sums.
        *   3. Each kernel row is processed in chunks of kw coefficients using
        *      hand-unrolled loops that produce two output pixels per iteration.
        *
        * Returns MLIB_FAILURE if a temporary buffer cannot be allocated,
        * MLIB_SUCCESS otherwise.
        */
       
  2334 {
       
  2335   DTYPE    *adr_src, *sl, *sp;
       
  2336   DTYPE    *adr_dst, *dl, *dp;
       
  2337   mlib_s32 buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
       
  2338   mlib_s32 *pbuff = buff;
       
  2339   mlib_s32 **buffs = buffs_arr, *buffd;
       
  2340   mlib_s32 l, off, kw, bsize, buff_ind;
       
  2341   mlib_s32 d0, d1, shift1, shift2;
       
  2342   mlib_s32 k0, k1, k2, k3, k4, k5, k6;
       
  2343   mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
       
  2344   mlib_s32 wid, hgt, sll, dll;
       
  2345   mlib_s32 nchannel, chan1;
       
  2346   mlib_s32 i, j, c, swid;
       
  2347   mlib_s32 chan2;
       
  2348   mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;
       
         /* Defined elsewhere; presumably fills wid, hgt, sll, dll, nchannel,
          * adr_src and adr_dst from the image arguments -- TODO confirm. */
  2349   GET_SRC_DST_PARAMETERS(DTYPE);
       
  2350 
       
         /* shift1: bits dropped from every kernel coefficient up front
          * (8 when IMG_TYPE == 1, otherwise 16).
          * shift2: remaining right shift applied to each finished sum. */
  2351 #if IMG_TYPE != 1
       
  2352   shift1 = 16;
       
  2353 #else
       
  2354   shift1 = 8;
       
  2355 #endif /* IMG_TYPE != 1 */
       
  2356   shift2 = scale - shift1;
       
  2357 
       
         /* chan1: element stride between horizontally adjacent pixels of one
          * channel; chan2: stride for the two-pixels-per-iteration loops. */
  2358   chan1 = nchannel;
       
  2359   chan2 = chan1 + chan1;
       
  2360 
       
         /* Row-buffer length: output width plus the m - 1 extra columns the
          * kernel needs. */
  2361   swid = wid + (m - 1);
       
  2362 
       
         /* n + 1 row buffers plus one accumulator row (buffd) of swid each. */
  2363   bsize = (n + 2)*swid;
       
  2364 
       
         /* Use the stack buffers when they are big enough; otherwise make one
          * heap block holding the mlib_s32 data followed by the 2*(n + 1) entry
          * row-pointer table. */
  2365   if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
       
  2366     pbuff = mlib_malloc(sizeof(mlib_s32)*bsize + sizeof(mlib_s32 *)*2*(n + 1));
       
  2367 
       
  2368     if (pbuff == NULL) return MLIB_FAILURE;
       
           /* NOTE(review): pbuff + bsize is only guaranteed to be aligned for
            * mlib_s32.  If pointers are wider than mlib_s32 and bsize is odd,
            * this pointer table may be misaligned -- confirm for the supported
            * targets. */
  2369     buffs = (mlib_s32 **)(pbuff + bsize);
       
  2370   }
       
  2371 
       
         /* buffs[0..n] point at the n + 1 row buffers.  The same pointers are
          * repeated in buffs[n+1..2n+1] so that (buffs + buff_ind)[0..n] is
          * always a valid window of n + 1 rows without wrap-around arithmetic. */
  2372   for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*swid;
       
  2373   for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
       
         /* The accumulator row sits directly after the last row buffer. */
  2374   buffd = buffs[n] + swid;
       
  2375 
       
         /* Kernels larger than MAX_N*MAX_N coefficients need a heap copy. */
  2376   if (m*n > MAX_N*MAX_N) {
       
  2377     k = mlib_malloc(sizeof(mlib_s32)*(m*n));
       
  2378 
       
  2379     if (k == NULL) {
       
  2380       if (pbuff != buff) mlib_free(pbuff);
       
  2381       return MLIB_FAILURE;
       
  2382     }
       
  2383   }
       
  2384 
       
         /* Local kernel copy with the low shift1 bits discarded. */
  2385   for (i = 0; i < m*n; i++) {
       
  2386     k[i] = kernel[i] >> shift1;
       
  2387   }
       
  2388 
       
         /* From here on swid counts only the pixels actually read from the
          * source; the dx_l left and dx_r right columns are filled by
          * replication. */
  2389   swid -= (dx_l + dx_r);
       
  2390 
       
         /* Process each channel selected in cmask; channel c corresponds to bit
          * (nchannel - 1 - c). */
  2391   for (c = 0; c < nchannel; c++) {
       
  2392     if (!(cmask & (1 << (nchannel - 1 - c)))) continue;
       
  2393 
       
  2394     sl = adr_src + c;
       
  2395     dl = adr_dst + c;
       
  2396 
       
           /* Preload the first n row buffers. */
  2397     for (l = 0; l < n; l++) {
       
  2398       mlib_s32  *buff = buffs[l];
       
  2399 
       
             /* Left edge: repeat the first source pixel dx_l times. */
  2400       for (i = 0; i < dx_l; i++) {
       
  2401         buff[i] = (mlib_s32)sl[0];
       
  2402       }
       
  2403 
       
  2404 #ifdef __SUNPRO_C
       
  2405 #pragma pipeloop(0)
       
  2406 #endif /* __SUNPRO_C */
       
  2407       for (i = 0; i < swid; i++) {
       
  2408         buff[i + dx_l] = (mlib_s32)sl[i*chan1];
       
  2409       }
       
  2410 
       
             /* Right edge: repeat the last loaded pixel dx_r times. */
  2411       for (i = 0; i < dx_r; i++) {
       
  2412         buff[swid + dx_l + i] = buff[swid + dx_l - 1];
       
  2413       }
       
  2414 
       
             /* Advance to the next source row only for l in
              * [dy_t, hgt + n - dy_b - 2); otherwise the same source row is
              * loaded again, which replicates rows at the top/bottom edge. */
  2415       if ((l >= dy_t) && (l < hgt + n - dy_b - 2)) sl += sll;
       
  2416     }
       
  2417 
       
  2418     buff_ind = 0;
       
  2419 
       
  2420 #ifdef __SUNPRO_C
       
  2421 #pragma pipeloop(0)
       
  2422 #endif /* __SUNPRO_C */
       
           /* Partial sums start at zero; they are reset again as each output
            * pixel is stored. */
  2423     for (i = 0; i < wid; i++) buffd[i] = 0;
       
  2424 
       
           /* One iteration per output row. */
  2425     for (j = 0; j < hgt; j++) {
       
             /* buffc[0..n-1]: rows used for this output row;
              * buffn = buffc[n]: row buffer filled with the next source row. */
  2426       mlib_s32 **buffc = buffs + buff_ind;
       
  2427       mlib_s32 *buffn = buffc[n];
       
  2428       mlib_s32 *pk = k;
       
  2429 
       
  2430       for (l = 0; l < n; l++) {
       
  2431         mlib_s32  *buff_l = buffc[l];
       
  2432 
       
               /* Walk kernel row l in chunks of kw coefficients; off is the
                * column offset of the current chunk and is advanced inside
                * the loop body. */
  2433         for (off = 0; off < m;) {
       
  2434           mlib_s32 *buff = buff_l + off;
       
  2435 
       
  2436           sp = sl;
       
  2437           dp = dl;
       
  2438 
       
  2439           kw = m - off;
       
  2440 
       
                 /* Chunk size: MAX_KER if more than 2*MAX_KER coefficients
                  * remain, half of the remainder if more than MAX_KER remain,
                  * otherwise the whole remainder.  The branches below handle
                  * kw from 7 down to 1, so MAX_KER is presumably 7
                  * -- TODO confirm. */
  2441           if (kw > 2*MAX_KER) kw = MAX_KER; else
       
  2442             if (kw > MAX_KER) kw = kw/2;
       
  2443           off += kw;
       
  2444 
       
                 /*
                  * Every kw branch has the same two-loop shape:
                  *  - not the last chunk of the last kernel row
                  *    (l < n - 1 || off < m): add this chunk's products to
                  *    buffd[];
                  *  - last chunk of the last kernel row: add the products to
                  *    buffd[], store the shifted result to the destination via
                  *    STORE_RES, clear buffd[], and copy the next source row
                  *    into buffn[] in the same pass.
                  * p0..p7 form a sliding window over the row buffer: values
                  * loaded in one iteration are shifted down and reused in the
                  * next, so each iteration loads only two new values.
                  */
  2445           if (kw == 7) {
       
  2446 
       
  2447             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  2448             p5 = buff[3]; p6 = buff[4]; p7 = buff[5];
       
  2449 
       
  2450             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
       
  2451             k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
       
  2452 
       
  2453             if (l < (n - 1) || off < m) {
       
  2454 #ifdef __SUNPRO_C
       
  2455 #pragma pipeloop(0)
       
  2456 #endif /* __SUNPRO_C */
       
  2457               for (i = 0; i <= (wid - 2); i += 2) {
       
  2458                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
       
  2459 
       
  2460                 p6 = buff[i + 6]; p7 = buff[i + 7];
       
  2461 
       
  2462                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
       
  2463                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
       
  2464               }
       
  2465 
       
  2466             } else {
       
  2467 #ifdef __SUNPRO_C
       
  2468 #pragma pipeloop(0)
       
  2469 #endif /* __SUNPRO_C */
       
  2470               for (i = 0; i <= (wid - 2); i += 2) {
       
  2471                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
       
  2472 
       
  2473                 p6 = buff[i + 6]; p7 = buff[i + 7];
       
  2474 
       
  2475                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
       
  2476                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
       
  2477 
       
  2478                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
       
  2479                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);
       
  2480 
       
  2481                 STORE_RES(dp[0    ], d0);
       
  2482                 STORE_RES(dp[chan1], d1);
       
  2483 
       
  2484                 buffd[i    ] = 0;
       
  2485                 buffd[i + 1] = 0;
       
  2486 
       
  2487                 sp += chan2;
       
  2488                 dp += chan2;
       
  2489               }
       
  2490             }
       
  2491 
       
  2492           } else if (kw == 6) {
       
  2493 
       
  2494             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  2495             p5 = buff[3]; p6 = buff[4];
       
  2496 
       
  2497             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
       
  2498             k4 = pk[4]; k5 = pk[5];
       
  2499 
       
  2500             if (l < (n - 1) || off < m) {
       
  2501 #ifdef __SUNPRO_C
       
  2502 #pragma pipeloop(0)
       
  2503 #endif /* __SUNPRO_C */
       
  2504               for (i = 0; i <= (wid - 2); i += 2) {
       
  2505                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
       
  2506 
       
  2507                 p5 = buff[i + 5]; p6 = buff[i + 6];
       
  2508 
       
  2509                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
       
  2510                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
       
  2511               }
       
  2512 
       
  2513             } else {
       
  2514 #ifdef __SUNPRO_C
       
  2515 #pragma pipeloop(0)
       
  2516 #endif /* __SUNPRO_C */
       
  2517               for (i = 0; i <= (wid - 2); i += 2) {
       
  2518                 p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
       
  2519 
       
  2520                 p5 = buff[i + 5]; p6 = buff[i + 6];
       
  2521 
       
  2522                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
       
  2523                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
       
  2524 
       
  2525                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
       
  2526                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);
       
  2527 
       
  2528                 STORE_RES(dp[0    ], d0);
       
  2529                 STORE_RES(dp[chan1], d1);
       
  2530 
       
  2531                 buffd[i    ] = 0;
       
  2532                 buffd[i + 1] = 0;
       
  2533 
       
  2534                 sp += chan2;
       
  2535                 dp += chan2;
       
  2536               }
       
  2537             }
       
  2538 
       
  2539           } else if (kw == 5) {
       
  2540 
       
  2541             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  2542             p5 = buff[3];
       
  2543 
       
  2544             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
       
  2545             k4 = pk[4];
       
  2546 
       
  2547             if (l < (n - 1) || off < m) {
       
  2548 #ifdef __SUNPRO_C
       
  2549 #pragma pipeloop(0)
       
  2550 #endif /* __SUNPRO_C */
       
  2551               for (i = 0; i <= (wid - 2); i += 2) {
       
  2552                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
       
  2553 
       
  2554                 p4 = buff[i + 4]; p5 = buff[i + 5];
       
  2555 
       
  2556                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
       
  2557                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
       
  2558               }
       
  2559 
       
  2560             } else {
       
  2561 #ifdef __SUNPRO_C
       
  2562 #pragma pipeloop(0)
       
  2563 #endif /* __SUNPRO_C */
       
  2564               for (i = 0; i <= (wid - 2); i += 2) {
       
  2565                 p0 = p2; p1 = p3; p2 = p4; p3 = p5;
       
  2566 
       
  2567                 p4 = buff[i + 4]; p5 = buff[i + 5];
       
  2568 
       
  2569                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
       
  2570                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
       
  2571 
       
  2572                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
       
  2573                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);
       
  2574 
       
  2575                 STORE_RES(dp[0    ], d0);
       
  2576                 STORE_RES(dp[chan1], d1);
       
  2577 
       
  2578                 buffd[i    ] = 0;
       
  2579                 buffd[i + 1] = 0;
       
  2580 
       
  2581                 sp += chan2;
       
  2582                 dp += chan2;
       
  2583               }
       
  2584             }
       
  2585 
       
  2586           } else if (kw == 4) {
       
  2587 
       
  2588             p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
       
  2589 
       
  2590             k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
       
  2591 
       
  2592             if (l < (n - 1) || off < m) {
       
  2593 #ifdef __SUNPRO_C
       
  2594 #pragma pipeloop(0)
       
  2595 #endif /* __SUNPRO_C */
       
  2596               for (i = 0; i <= (wid - 2); i += 2) {
       
  2597                 p0 = p2; p1 = p3; p2 = p4;
       
  2598 
       
  2599                 p3 = buff[i + 3]; p4 = buff[i + 4];
       
  2600 
       
  2601                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
       
  2602                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
       
  2603               }
       
  2604 
       
  2605             } else {
       
  2606 #ifdef __SUNPRO_C
       
  2607 #pragma pipeloop(0)
       
  2608 #endif /* __SUNPRO_C */
       
  2609               for (i = 0; i <= (wid - 2); i += 2) {
       
  2610                 p0 = p2; p1 = p3; p2 = p4;
       
  2611 
       
  2612                 p3 = buff[i + 3]; p4 = buff[i + 4];
       
  2613 
       
  2614                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
       
  2615                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
       
  2616 
       
  2617                 d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
       
  2618                 d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);
       
  2619 
       
  2620                 STORE_RES(dp[0    ], d0);
       
  2621                 STORE_RES(dp[chan1], d1);
       
  2622 
       
  2623                 buffd[i    ] = 0;
       
  2624                 buffd[i + 1] = 0;
       
  2625 
       
  2626                 sp += chan2;
       
  2627                 dp += chan2;
       
  2628               }
       
  2629             }
       
  2630 
       
  2631           } else if (kw == 3) {
       
  2632 
       
  2633             p2 = buff[0]; p3 = buff[1];
       
  2634             k0 = pk[0]; k1 = pk[1]; k2 = pk[2];
       
  2635 
       
  2636             if (l < (n - 1) || off < m) {
       
  2637 #ifdef __SUNPRO_C
       
  2638 #pragma pipeloop(0)
       
  2639 #endif /* __SUNPRO_C */
       
  2640               for (i = 0; i <= (wid - 2); i += 2) {
       
  2641                 p0 = p2; p1 = p3;
       
  2642 
       
  2643                 p2 = buff[i + 2]; p3 = buff[i + 3];
       
  2644 
       
  2645                 buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
       
  2646                 buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
       
  2647               }
       
  2648 
       
  2649             } else {
       
  2650 #ifdef __SUNPRO_C
       
  2651 #pragma pipeloop(0)
       
  2652 #endif /* __SUNPRO_C */
       
  2653               for (i = 0; i <= (wid - 2); i += 2) {
       
  2654                 p0 = p2; p1 = p3;
       
  2655 
       
  2656                 p2 = buff[i + 2]; p3 = buff[i + 3];
       
  2657 
       
  2658                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
       
  2659                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
       
  2660 
       
  2661                 d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
       
  2662                 d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);
       
  2663 
       
  2664                 STORE_RES(dp[0    ], d0);
       
  2665                 STORE_RES(dp[chan1], d1);
       
  2666 
       
  2667                 buffd[i    ] = 0;
       
  2668                 buffd[i + 1] = 0;
       
  2669 
       
  2670                 sp += chan2;
       
  2671                 dp += chan2;
       
  2672               }
       
  2673             }
       
  2674 
       
  2675           } else if (kw == 2) {
       
  2676 
       
  2677             p2 = buff[0];
       
  2678             k0 = pk[0]; k1 = pk[1];
       
  2679 
       
  2680             if (l < (n - 1) || off < m) {
       
  2681 #ifdef __SUNPRO_C
       
  2682 #pragma pipeloop(0)
       
  2683 #endif /* __SUNPRO_C */
       
  2684               for (i = 0; i <= (wid - 2); i += 2) {
       
  2685                 p0 = p2;
       
  2686 
       
  2687                 p1 = buff[i + 1]; p2 = buff[i + 2];
       
  2688 
       
  2689                 buffd[i    ] += p0*k0 + p1*k1;
       
  2690                 buffd[i + 1] += p1*k0 + p2*k1;
       
  2691               }
       
  2692 
       
  2693             } else {
       
  2694 #ifdef __SUNPRO_C
       
  2695 #pragma pipeloop(0)
       
  2696 #endif /* __SUNPRO_C */
       
  2697               for (i = 0; i <= (wid - 2); i += 2) {
       
  2698                 p0 = p2;
       
  2699 
       
  2700                 p1 = buff[i + 1]; p2 = buff[i + 2];
       
  2701 
       
  2702                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
       
  2703                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
       
  2704 
       
  2705                 d0 = (p0*k0 + p1*k1 + buffd[i    ]);
       
  2706                 d1 = (p1*k0 + p2*k1 + buffd[i + 1]);
       
  2707 
       
  2708                 STORE_RES(dp[0    ], d0);
       
  2709                 STORE_RES(dp[chan1], d1);
       
  2710 
       
  2711                 buffd[i    ] = 0;
       
  2712                 buffd[i + 1] = 0;
       
  2713 
       
  2714                 sp += chan2;
       
  2715                 dp += chan2;
       
  2716               }
       
  2717             }
       
  2718 
       
  2719           } else /* kw == 1 */{
       
  2720 
       
  2721             k0 = pk[0];
       
  2722 
       
  2723             if (l < (n - 1) || off < m) {
       
  2724 #ifdef __SUNPRO_C
       
  2725 #pragma pipeloop(0)
       
  2726 #endif /* __SUNPRO_C */
       
  2727               for (i = 0; i <= (wid - 2); i += 2) {
       
  2728                 p0 = buff[i]; p1 = buff[i + 1];
       
  2729 
       
  2730                 buffd[i    ] += p0*k0;
       
  2731                 buffd[i + 1] += p1*k0;
       
  2732               }
       
  2733 
       
  2734             } else {
       
  2735 #ifdef __SUNPRO_C
       
  2736 #pragma pipeloop(0)
       
  2737 #endif /* __SUNPRO_C */
       
  2738               for (i = 0; i <= (wid - 2); i += 2) {
       
  2739                 p0 = buff[i]; p1 = buff[i + 1];
       
  2740 
       
  2741                 buffn[i + dx_l    ] = (mlib_s32)sp[0];
       
  2742                 buffn[i + dx_l + 1] = (mlib_s32)sp[chan1];
       
  2743 
       
  2744                 d0 = (p0*k0 + buffd[i    ]);
       
  2745                 d1 = (p1*k0 + buffd[i + 1]);
       
  2746 
       
  2747                 STORE_RES(dp[0    ], d0);
       
  2748                 STORE_RES(dp[chan1], d1);
       
  2749 
       
  2750                 buffd[i    ] = 0;
       
  2751                 buffd[i + 1] = 0;
       
  2752 
       
  2753                 sp += chan2;
       
  2754                 dp += chan2;
       
  2755               }
       
  2756             }
       
  2757           }
       
  2758 
       
                 /* Step to the coefficients of the next chunk. */
  2759           pk += kw;
       
  2760         }
       
  2761       }
       
  2762 
       
             /* i, sp and dp are deliberately carried over from the final
              * unrolled loop above: i is the first pixel the pairwise loops did
              * not cover (only present when wid is odd).  Its sum is computed
              * directly over the whole m x n kernel instead of through buffd. */
  2763       /* last pixels */
       
  2764       for (; i < wid; i++) {
       
  2765         mlib_s32 *pk = k, x, s = 0;
       
  2766 
       
  2767         for (l = 0; l < n; l++) {
       
  2768           mlib_s32 *buff = buffc[l] + i;
       
  2769 
       
  2770           for (x = 0; x < m; x++) s += buff[x] * (*pk++);
       
  2771         }
       
  2772 
       
  2773         STORE_RES(dp[0], s);
       
  2774 
       
  2775         buffn[i + dx_l] = (mlib_s32)sp[0];
       
  2776 
       
  2777         sp += chan1;
       
  2778         dp += chan1;
       
  2779       }
       
  2780 
       
             /* Finish copying the next source row: the pixels beyond the
              * output width that the kernel still needs. */
  2781       for (; i < swid; i++) {
       
  2782         buffn[i + dx_l] = (mlib_s32)sp[0];
       
  2783         sp += chan1;
       
  2784       }
       
  2785 
       
             /* Replicate the first and last loaded pixels into the left and
              * right edge columns of the new row. */
  2786       for (i = 0; i < dx_l; i++) buffn[i] = buffn[dx_l];
       
  2787       for (i = 0; i < dx_r; i++) buffn[swid + dx_l + i] = buffn[swid + dx_l - 1];
       
  2788 
       
  2789       /* next line */
       
  2790 
       
             /* The source pointer stops advancing once j reaches
              * hgt - dy_b - 2, so the same source row is reloaded for the
              * remaining output rows. */
  2791       if (j < hgt - dy_b - 2) sl += sll;
       
  2792       dl += dll;
       
  2793 
       
             /* Rotate the window of row buffers by one, wrapping after n + 1. */
  2794       buff_ind++;
       
  2795 
       
  2796       if (buff_ind >= n + 1) buff_ind = 0;
       
  2797     }
       
  2798   }
       
  2799 
       
         /* Release the heap buffers if the stack ones were not used. */
  2800   if (pbuff != buff) mlib_free(pbuff);
       
  2801   if (k != k_locl) mlib_free(k);
       
  2802 
       
  2803   return MLIB_SUCCESS;
       
  2804 }
       
  2805 
       
  2806 #endif /* __sparc ( for x86, using integer multiplies is faster ) */
       
  2807 
       
  2808 /***************************************************************/