--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/jdk/src/solaris/native/sun/awt/medialib/mlib_v_ImageAffine_BC.c Sat Dec 01 00:00:00 2007 +0000
@@ -0,0 +1,1204 @@
+/*
+ * Copyright 1998-2003 Sun Microsystems, Inc. All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Sun designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Sun in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ */
+
+
+
+/*
+ * The functions step along the lines from xLeft to xRight and apply
+ * the bicubic filtering.
+ *
+ */
+
+#include "vis_proto.h"
+#include "mlib_ImageAffine.h"
+#include "mlib_v_ImageFilters.h"
+
+/*#define MLIB_VIS2*/
+
+/***************************************************************/
+#define DTYPE mlib_u8
+
+#define FILTER_BITS 8
+
+/***************************************************************/
+#ifdef MLIB_VIS2
+#define MLIB_WRITE_BMASK(bmask) vis_write_bmask(bmask, 0)
+#else
+#define MLIB_WRITE_BMASK(bmask)
+#endif /* MLIB_VIS2 */
+
+/***************************************************************/
+#define sPtr srcPixelPtr
+
+/***************************************************************/
+#define NEXT_PIXEL_1BC_U8() \
+ xSrc = (X>>MLIB_SHIFT)-1; \
+ ySrc = (Y>>MLIB_SHIFT)-1; \
+ sPtr = (mlib_u8 *)lineAddr[ySrc] + xSrc
+
+/***************************************************************/
+#ifndef MLIB_VIS2
+
+#define ALIGN_ADDR(da, dp) \
+ da = vis_alignaddr(dp, 0)
+
+#else
+
+#define ALIGN_ADDR(da, dp) \
+ vis_alignaddr(dp, 0); \
+ da = (mlib_d64*)(((mlib_addr)(dp)) &~ 7)
+
+#endif /* MLIB_VIS2 */
+
+/***************************************************************/
+#define LOAD_BC_U8_1CH_1PIXEL(mlib_filters_u8) \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ row00 = vis_faligndata(data0, data1); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ row10 = vis_faligndata(data0, data1); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ row20 = vis_faligndata(data0, data1); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ row30 = vis_faligndata(data0, data1); \
+ filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK; \
+ yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
+ filterposx = (X >> FILTER_SHIFT) & FILTER_MASK; \
+ xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx)); \
+ X += dX; \
+ Y += dY
+
+/***************************************************************/
+#ifndef MLIB_VIS2
+
+#define SUM_4x16(v1, v3) \
+ vis_alignaddr((void*)2, 0); \
+ v0 = vis_faligndata(v3, v3); \
+ v2 = vis_fpadd16(v3, v0); \
+ v1 = vis_write_lo(v1, vis_fpadd16s(vis_read_hi(v2), vis_read_lo(v2)))
+
+#else
+
+#define SUM_4x16(v1, v3) \
+ v2 = vis_freg_pair(vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3)), \
+ vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3))); \
+ v3 = vis_bshuffle(v2, v2); \
+ v1 = vis_write_lo(v1, vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3)))
+
+#endif /* MLIB_VIS2 */
+
+/***************************************************************/
+#define RESULT_1BC_U8_1PIXEL(ind) \
+ v0 = vis_fmul8x16au(vis_read_hi(row0##ind), vis_read_hi(yFilter)); \
+ v1 = vis_fmul8x16al(vis_read_hi(row1##ind), vis_read_hi(yFilter)); \
+ sum = vis_fpadd16(v0, v1); \
+ v2 = vis_fmul8x16au(vis_read_hi(row2##ind), vis_read_lo(yFilter)); \
+ sum = vis_fpadd16(sum, v2); \
+ v3 = vis_fmul8x16al(vis_read_hi(row3##ind), vis_read_lo(yFilter)); \
+ sum = vis_fpadd16(sum, v3); \
+ v0 = vis_fmul8sux16(sum, xFilter); \
+ v1 = vis_fmul8ulx16(sum, xFilter); \
+ v3 = vis_fpadd16(v1, v0); \
+ SUM_4x16(v1, v3); \
+ res = vis_write_lo(res, vis_fpack16(v1))
+
+/***************************************************************/
+#define BC_U8_1CH(index, ind1, ind2, mlib_filters_u8) \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ v0 = vis_fmul8x16au(vis_read_hi(row0##ind1), vis_read_hi(yFilter)); \
+ filterposy = (Y >> FILTER_SHIFT); \
+ data1 = dpSrc[1]; \
+ v1 = vis_fmul8x16al(vis_read_hi(row1##ind1), vis_read_hi(yFilter)); \
+ row0##ind2 = vis_faligndata(data0, data1); \
+ filterposx = (X >> FILTER_SHIFT); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ sum = vis_fpadd16(v0, v1); \
+ data0 = dpSrc[0]; \
+ v2 = vis_fmul8x16au(vis_read_hi(row2##ind1), vis_read_lo(yFilter)); \
+ X += dX; \
+ data1 = dpSrc[1]; \
+ row1##ind2 = vis_faligndata(data0, data1); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ Y += dY; \
+ sum = vis_fpadd16(sum, v2); \
+ xSrc = (X>>MLIB_SHIFT)-1; \
+ v3 = vis_fmul8x16al(vis_read_hi(row3##ind1), vis_read_lo(yFilter)); \
+ data0 = dpSrc[0]; \
+ ySrc = (Y>>MLIB_SHIFT)-1; \
+ sum = vis_fpadd16(sum, v3); \
+ data1 = dpSrc[1]; \
+ filterposy &= FILTER_MASK; \
+ v0 = vis_fmul8sux16(sum, xFilter); \
+ row2##ind2 = vis_faligndata(data0, data1); \
+ sPtr += srcYStride; \
+ v1 = vis_fmul8ulx16(sum, xFilter); \
+ filterposx &= FILTER_MASK; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ d##index = vis_fpadd16(v0, v1); \
+ data1 = dpSrc[1]; \
+ row3##ind2 = vis_faligndata(data0, data1); \
+ yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
+ xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx)); \
+ sPtr = (mlib_u8 *)lineAddr[ySrc] + xSrc
+
+/***************************************************************/
+#ifndef MLIB_VIS2
+
+#define FADD_1BC_U8() \
+ p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0)); \
+ p1 = vis_fpadd16s(vis_read_hi(d1), vis_read_lo(d1)); \
+ p2 = vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2)); \
+ p3 = vis_fpadd16s(vis_read_hi(d3), vis_read_lo(d3)); \
+ m02 = vis_fpmerge(p0, p2); \
+ m13 = vis_fpmerge(p1, p3); \
+ m0213 = vis_fpmerge(vis_read_hi(m02), vis_read_hi(m13)); \
+ e0 = vis_fpmerge(vis_read_hi(m0213), vis_read_lo(m0213)); \
+ m0213 = vis_fpmerge(vis_read_lo(m02), vis_read_lo(m13)); \
+ e1 = vis_fpmerge(vis_read_hi(m0213), vis_read_lo(m0213)); \
+ res = vis_fpadd16(e0, e1)
+
+#else
+
+#define FADD_1BC_U8() \
+ v0 = vis_freg_pair(vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0)), \
+ vis_fpadd16s(vis_read_hi(d1), vis_read_lo(d1))); \
+ v1 = vis_freg_pair(vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2)), \
+ vis_fpadd16s(vis_read_hi(d3), vis_read_lo(d3))); \
+ v2 = vis_bshuffle(v0, v0); \
+ v3 = vis_bshuffle(v1, v1); \
+ res = vis_freg_pair(vis_fpadd16s(vis_read_hi(v2), vis_read_lo(v2)), \
+ vis_fpadd16s(vis_read_hi(v3), vis_read_lo(v3)))
+
+#endif /* MLIB_VIS2 */
+
+/***************************************************************/
+mlib_status mlib_ImageAffine_u8_1ch_bc (mlib_affine_param *param)
+{
+ DECLAREVAR_BC();
+ mlib_s32 filterposx, filterposy;
+ mlib_d64 data0, data1;
+ mlib_d64 sum;
+ mlib_d64 row00, row10, row20, row30;
+ mlib_d64 row01, row11, row21, row31;
+ mlib_d64 xFilter, yFilter;
+ mlib_d64 v0, v1, v2, v3;
+ mlib_d64 d0, d1, d2, d3;
+#ifndef MLIB_VIS2
+ mlib_f32 p0, p1, p2, p3;
+ mlib_d64 e0, e1;
+ mlib_d64 m02, m13, m0213;
+#endif /* MLIB_VIS2 */
+ mlib_d64 *dpSrc;
+ mlib_s32 align, cols, i;
+ mlib_d64 res;
+ const mlib_s16 *mlib_filters_table;
+
+ if (filter == MLIB_BICUBIC) {
+ mlib_filters_table = mlib_filters_u8_bc;
+ } else {
+ mlib_filters_table = mlib_filters_u8_bc2;
+ }
+
+ for (j = yStart; j <= yFinish; j++) {
+
+ vis_write_gsr(3 << 3);
+ MLIB_WRITE_BMASK(0x0145ABEF);
+
+ CLIP(1);
+
+ cols = xRight - xLeft + 1;
+ align = (4 - ((mlib_addr)dstPixelPtr) & 3) & 3;
+ align = (cols < align)? cols : align;
+
+ for (i = 0; i < align; i++) {
+ NEXT_PIXEL_1BC_U8();
+ LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
+ RESULT_1BC_U8_1PIXEL(0);
+ vis_st_u8(res, dstPixelPtr++);
+ }
+
+ if (i <= cols - 10) {
+
+ NEXT_PIXEL_1BC_U8();
+ LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
+
+ NEXT_PIXEL_1BC_U8();
+
+ BC_U8_1CH(0, 0, 1, mlib_filters_table);
+ BC_U8_1CH(1, 1, 0, mlib_filters_table);
+ BC_U8_1CH(2, 0, 1, mlib_filters_table);
+ BC_U8_1CH(3, 1, 0, mlib_filters_table);
+
+ FADD_1BC_U8();
+
+ BC_U8_1CH(0, 0, 1, mlib_filters_table);
+ BC_U8_1CH(1, 1, 0, mlib_filters_table);
+ BC_U8_1CH(2, 0, 1, mlib_filters_table);
+ BC_U8_1CH(3, 1, 0, mlib_filters_table);
+
+#pragma pipeloop(0)
+ for (; i <= cols - 14; i+=4) {
+ *(mlib_f32*)dstPixelPtr = vis_fpack16(res);
+ FADD_1BC_U8();
+ BC_U8_1CH(0, 0, 1, mlib_filters_table);
+ BC_U8_1CH(1, 1, 0, mlib_filters_table);
+ BC_U8_1CH(2, 0, 1, mlib_filters_table);
+ BC_U8_1CH(3, 1, 0, mlib_filters_table);
+ dstPixelPtr += 4;
+ }
+
+ *(mlib_f32*)dstPixelPtr = vis_fpack16(res);
+ dstPixelPtr += 4;
+ FADD_1BC_U8();
+ *(mlib_f32*)dstPixelPtr = vis_fpack16(res);
+ dstPixelPtr += 4;
+
+ RESULT_1BC_U8_1PIXEL(0);
+ vis_st_u8(res, dstPixelPtr++);
+
+ LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
+ RESULT_1BC_U8_1PIXEL(0);
+ vis_st_u8(res, dstPixelPtr++);
+ i += 10;
+ }
+
+ for (; i < cols; i++) {
+ NEXT_PIXEL_1BC_U8();
+ LOAD_BC_U8_1CH_1PIXEL(mlib_filters_table);
+ RESULT_1BC_U8_1PIXEL(0);
+ vis_st_u8(res, dstPixelPtr++);
+ }
+ }
+
+ return MLIB_SUCCESS;
+}
+
+/***************************************************************/
+#define FADD_2BC_U8() \
+ d0 = vis_fpadd16(d00, d10); \
+ d1 = vis_fpadd16(d01, d11); \
+ d2 = vis_fpadd16(d02, d12); \
+ d3 = vis_fpadd16(d03, d13); \
+ p0 = vis_fpadd16s(vis_read_hi(d0), vis_read_lo(d0)); \
+ p1 = vis_fpadd16s(vis_read_hi(d1), vis_read_lo(d1)); \
+ p2 = vis_fpadd16s(vis_read_hi(d2), vis_read_lo(d2)); \
+ p3 = vis_fpadd16s(vis_read_hi(d3), vis_read_lo(d3)); \
+ e0 = vis_freg_pair(p0, p1); \
+ e1 = vis_freg_pair(p2, p3); \
+ res = vis_fpack16_pair(e0, e1)
+
+/***************************************************************/
+#define LOAD_BC_U8_2CH_1PIXEL(mlib_filters_u8) \
+ filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK; \
+ yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
+ filterposx = (X >> FILTER_SHIFT) & FILTER_MASK; \
+ xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx)); \
+ X += dX; \
+ Y += dY; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ row0 = vis_faligndata(data0, data1); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ row1 = vis_faligndata(data0, data1); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ row2 = vis_faligndata(data0, data1); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ row3 = vis_faligndata(data0, data1)
+
+/***************************************************************/
+#define NEXT_PIXEL_2BC_U8() \
+ xSrc = (X>>MLIB_SHIFT)-1; \
+ ySrc = (Y>>MLIB_SHIFT)-1; \
+ sPtr = (mlib_u8 *)lineAddr[ySrc] + (xSrc<<1)
+
+/***************************************************************/
+#define RESULT_2BC_U8_1PIXEL() \
+ v00 = vis_fmul8x16au(vis_read_hi(row0), vis_read_hi(yFilter)); \
+ dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter)); \
+ v01 = vis_fmul8x16au(vis_read_lo(row0), vis_read_hi(yFilter)); \
+ dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr)); \
+ v10 = vis_fmul8x16al(vis_read_hi(row1), vis_read_hi(yFilter)); \
+ dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr)); \
+ v11 = vis_fmul8x16al(vis_read_lo(row1), vis_read_hi(yFilter)); \
+ dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr)); \
+ v20 = vis_fmul8x16au(vis_read_hi(row2), vis_read_lo(yFilter)); \
+ xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1)); \
+ v21 = vis_fmul8x16au(vis_read_lo(row2), vis_read_lo(yFilter)); \
+ xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1)); \
+ v30 = vis_fmul8x16al(vis_read_hi(row3), vis_read_lo(yFilter)); \
+ sum0 = vis_fpadd16(v00, v10); \
+ v31 = vis_fmul8x16al(vis_read_lo(row3), vis_read_lo(yFilter)); \
+ sum1 = vis_fpadd16(v01, v11); \
+ sum0 = vis_fpadd16(sum0, v20); \
+ sum1 = vis_fpadd16(sum1, v21); \
+ sum0 = vis_fpadd16(sum0, v30); \
+ sum1 = vis_fpadd16(sum1, v31); \
+ v00 = vis_fmul8sux16(sum0, xFilter0); \
+ v01 = vis_fmul8sux16(sum1, xFilter1); \
+ v10 = vis_fmul8ulx16(sum0, xFilter0); \
+ sum0 = vis_fpadd16(v00, v10); \
+ v11 = vis_fmul8ulx16(sum1, xFilter1); \
+ sum1 = vis_fpadd16(v01, v11); \
+ d0 = vis_fpadd16(sum0, sum1); \
+ v00 = vis_write_lo(v00, vis_fpadd16s(vis_read_hi(d0), \
+ vis_read_lo(d0))); \
+ res = vis_write_lo(res, vis_fpack16(v00))
+
+/***************************************************************/
+#define BC_U8_2CH(index, mlib_filters_u8) \
+ v00 = vis_fmul8x16au(vis_read_hi(row0), vis_read_hi(yFilter)); \
+ dr = vis_fpmerge(vis_read_hi(xFilter), vis_read_lo(xFilter)); \
+ v01 = vis_fmul8x16au(vis_read_lo(row0), vis_read_hi(yFilter)); \
+ dr = vis_fpmerge(vis_read_hi(dr), vis_read_lo(dr)); \
+ v10 = vis_fmul8x16al(vis_read_hi(row1), vis_read_hi(yFilter)); \
+ dr1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr)); \
+ v11 = vis_fmul8x16al(vis_read_lo(row1), vis_read_hi(yFilter)); \
+ dr = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr)); \
+ v20 = vis_fmul8x16au(vis_read_hi(row2), vis_read_lo(yFilter)); \
+ xFilter0 = vis_fpmerge(vis_read_hi(dr), vis_read_hi(dr1)); \
+ v21 = vis_fmul8x16au(vis_read_lo(row2), vis_read_lo(yFilter)); \
+ xFilter1 = vis_fpmerge(vis_read_lo(dr), vis_read_lo(dr1)); \
+ v30 = vis_fmul8x16al(vis_read_hi(row3), vis_read_lo(yFilter)); \
+ v31 = vis_fmul8x16al(vis_read_lo(row3), vis_read_lo(yFilter)); \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ sum0 = vis_fpadd16(v00, v10); \
+ filterposy = (Y >> FILTER_SHIFT); \
+ data1 = dpSrc[1]; \
+ row0 = vis_faligndata(data0, data1); \
+ filterposx = (X >> FILTER_SHIFT); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ sum1 = vis_fpadd16(v01, v11); \
+ X += dX; \
+ data1 = dpSrc[1]; \
+ sum0 = vis_fpadd16(sum0, v20); \
+ row1 = vis_faligndata(data0, data1); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ Y += dY; \
+ sum1 = vis_fpadd16(sum1, v21); \
+ xSrc = (X>>MLIB_SHIFT)-1; \
+ data0 = dpSrc[0]; \
+ ySrc = (Y>>MLIB_SHIFT)-1; \
+ sum0 = vis_fpadd16(sum0, v30); \
+ data1 = dpSrc[1]; \
+ filterposy &= FILTER_MASK; \
+ sum1 = vis_fpadd16(sum1, v31); \
+ v00 = vis_fmul8sux16(sum0, xFilter0); \
+ row2 = vis_faligndata(data0, data1); \
+ v01 = vis_fmul8sux16(sum1, xFilter1); \
+ sPtr += srcYStride; \
+ v10 = vis_fmul8ulx16(sum0, xFilter0); \
+ filterposx &= FILTER_MASK; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ v11= vis_fmul8ulx16(sum1, xFilter1); \
+ data0 = dpSrc[0]; \
+ d0##index = vis_fpadd16(v00, v10); \
+ data1 = dpSrc[1]; \
+ row3 = vis_faligndata(data0, data1); \
+ yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
+ d1##index = vis_fpadd16(v01, v11); \
+ xFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposx)); \
+ sPtr = (mlib_u8 *)lineAddr[ySrc] + (xSrc<<1)
+
+/***************************************************************/
+mlib_status mlib_ImageAffine_u8_2ch_bc (mlib_affine_param *param)
+{
+ DECLAREVAR_BC();
+ DTYPE *dstLineEnd;
+ mlib_s32 filterposx, filterposy;
+ mlib_d64 data0, data1;
+ mlib_d64 sum0, sum1;
+ mlib_d64 row0, row1, row2, row3;
+ mlib_f32 p0, p1, p2, p3;
+ mlib_d64 xFilter;
+ mlib_d64 xFilter0, xFilter1, yFilter;
+ mlib_d64 v00, v10, v20, v30;
+ mlib_d64 v01, v11, v21, v31;
+ mlib_d64 d0, d1, d2, d3;
+ mlib_d64 d00, d01, d02, d03;
+ mlib_d64 d10, d11, d12, d13;
+ mlib_d64 e0, e1;
+ mlib_d64 *dpSrc;
+ mlib_s32 cols, i, mask, off;
+ mlib_d64 dr, dr1;
+ mlib_d64 res, *dp;
+ const mlib_s16 *mlib_filters_table;
+
+ if (filter == MLIB_BICUBIC) {
+ mlib_filters_table = mlib_filters_u8_bc;
+ } else {
+ mlib_filters_table = mlib_filters_u8_bc2;
+ }
+
+ for (j = yStart; j <= yFinish; j++) {
+
+ vis_write_gsr(3 << 3);
+
+ CLIP(2);
+ dstLineEnd = (DTYPE*)dstData + 2 * xRight;
+
+ cols = xRight - xLeft + 1;
+ dp = vis_alignaddr(dstPixelPtr, 0);
+ off = dstPixelPtr - (mlib_u8*)dp;
+ dstLineEnd += 1;
+ mask = vis_edge8(dstPixelPtr, dstLineEnd);
+ i = 0;
+
+ if (i <= cols - 10) {
+
+ NEXT_PIXEL_2BC_U8();
+ LOAD_BC_U8_2CH_1PIXEL(mlib_filters_table);
+
+ NEXT_PIXEL_2BC_U8();
+
+ BC_U8_2CH(0, mlib_filters_table);
+ BC_U8_2CH(1, mlib_filters_table);
+ BC_U8_2CH(2, mlib_filters_table);
+ BC_U8_2CH(3, mlib_filters_table);
+
+ FADD_2BC_U8();
+
+ BC_U8_2CH(0, mlib_filters_table);
+ BC_U8_2CH(1, mlib_filters_table);
+ BC_U8_2CH(2, mlib_filters_table);
+ BC_U8_2CH(3, mlib_filters_table);
+
+#pragma pipeloop(0)
+ for (; i <= cols-14; i+=4) {
+ vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
+ res = vis_faligndata(res, res);
+ vis_pst_8(res, dp++, mask);
+ vis_pst_8(res, dp, ~mask);
+ FADD_2BC_U8();
+ BC_U8_2CH(0, mlib_filters_table);
+ BC_U8_2CH(1, mlib_filters_table);
+ BC_U8_2CH(2, mlib_filters_table);
+ BC_U8_2CH(3, mlib_filters_table);
+ }
+
+ vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
+ res = vis_faligndata(res, res);
+ vis_pst_8(res, dp++, mask);
+ vis_pst_8(res, dp, ~mask);
+
+ FADD_2BC_U8();
+ vis_alignaddr((void *)(8 - (mlib_addr)dstPixelPtr), 0);
+ res = vis_faligndata(res, res);
+ vis_pst_8(res, dp++, mask);
+ vis_pst_8(res, dp, ~mask);
+
+ dstPixelPtr = (mlib_u8*)dp + off;
+
+ RESULT_2BC_U8_1PIXEL();
+ vis_alignaddr((void *)7, 0);
+ vis_st_u8(res, dstPixelPtr+1);
+ res = vis_faligndata(res, res);
+ vis_st_u8(res, dstPixelPtr);
+ dstPixelPtr += 2;
+
+ LOAD_BC_U8_2CH_1PIXEL(mlib_filters_table);
+ RESULT_2BC_U8_1PIXEL();
+ vis_alignaddr((void *)7, 0);
+ vis_st_u8(res, dstPixelPtr+1);
+ res = vis_faligndata(res, res);
+ vis_st_u8(res, dstPixelPtr);
+ dstPixelPtr += 2;
+ i += 10;
+ }
+
+ for (; i < cols; i++) {
+ NEXT_PIXEL_2BC_U8();
+ LOAD_BC_U8_2CH_1PIXEL(mlib_filters_table);
+ RESULT_2BC_U8_1PIXEL();
+ vis_alignaddr((void *)7, 0);
+ vis_st_u8(res, dstPixelPtr+1);
+ res = vis_faligndata(res, res);
+ vis_st_u8(res, dstPixelPtr);
+ dstPixelPtr += 2;
+ }
+ }
+
+ return MLIB_SUCCESS;
+}
+
+/***************************************************************/
+#ifndef MLIB_VIS2
+
+#define FADD_3BC_U8() \
+ vis_alignaddr((void*)6, 0); \
+ d3 = vis_faligndata(d0, d1); \
+ vis_alignaddr((void*)2, 0); \
+ d4 = vis_faligndata(d1, d2); \
+ d0 = vis_fpadd16(d0, d3); \
+ d2 = vis_fpadd16(d2, d4); \
+ d1 = vis_faligndata(d2, d2); \
+ d0 = vis_fpadd16(d0, d1); \
+ f0.f = vis_fpack16(d0)
+
+#else
+
+#define FADD_3BC_U8() \
+ vis_alignaddr((void*)4, 0); \
+ d3 = vis_bshuffle(d0, d1); \
+ d1 = vis_faligndata(d1, d2); \
+ d2 = vis_faligndata(d2, d2); \
+ d4 = vis_bshuffle(d1, d2); \
+ d0 = vis_fpadd16(d0, d3); \
+ d1 = vis_fpadd16(d1, d4); \
+ d0 = vis_fpadd16(d0, d1); \
+ f0.f = vis_fpack16(d0)
+
+#endif /* MLIB_VIS2 */
+
+/***************************************************************/
+#define LOAD_BC_U8_3CH_1PIXEL(mlib_filters_u8, mlib_filters_u8_3) \
+ filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK; \
+ yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
+ filterposx = (X >> FILTER_SHIFT) & FILTER_MASK; \
+ xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_3+3*filterposx)); \
+ xFilter0 = xPtr[0]; \
+ xFilter1 = xPtr[1]; \
+ xFilter2 = xPtr[2]; \
+ X += dX; \
+ Y += dY; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ data2 = dpSrc[2]; \
+ row00 = vis_faligndata(data0, data1); \
+ row01 = vis_faligndata(data1, data2); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ data2 = dpSrc[2]; \
+ row10 = vis_faligndata(data0, data1); \
+ row11 = vis_faligndata(data1, data2); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ data2 = dpSrc[2]; \
+ row20 = vis_faligndata(data0, data1); \
+ row21 = vis_faligndata(data1, data2); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ data2 = dpSrc[2]; \
+ row30 = vis_faligndata(data0, data1); \
+ row31 = vis_faligndata(data1, data2)
+
+/***************************************************************/
+#define STORE_BC_U8_3CH_1PIXEL() \
+ dstPixelPtr[0] = f0.t[0]; \
+ dstPixelPtr[1] = f0.t[1]; \
+ dstPixelPtr[2] = f0.t[2]; \
+ dstPixelPtr += 3
+
+/***************************************************************/
+#define NEXT_PIXEL_3BC_U8() \
+ xSrc = (X>>MLIB_SHIFT)-1; \
+ ySrc = (Y>>MLIB_SHIFT)-1; \
+ sPtr = (mlib_u8 *)lineAddr[ySrc] + (3*xSrc)
+
+/***************************************************************/
+#define RESULT_3BC_U8_1PIXEL() \
+ v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); \
+ v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter)); \
+ v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter)); \
+ v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); \
+ v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter)); \
+ v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter)); \
+ v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); \
+ sum0 = vis_fpadd16(v00, v10); \
+ v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter)); \
+ sum1 = vis_fpadd16(v01, v11); \
+ v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter)); \
+ sum2 = vis_fpadd16(v02, v12); \
+ v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter)); \
+ sum0 = vis_fpadd16(sum0, v20); \
+ v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter)); \
+ sum1 = vis_fpadd16(sum1, v21); \
+ v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter)); \
+ sum2 = vis_fpadd16(sum2, v22); \
+ sum0 = vis_fpadd16(sum0, v30); \
+ sum1 = vis_fpadd16(sum1, v31); \
+ v00 = vis_fmul8sux16(sum0, xFilter0); \
+ sum2 = vis_fpadd16(sum2, v32); \
+ v01 = vis_fmul8ulx16(sum0, xFilter0); \
+ v10 = vis_fmul8sux16(sum1, xFilter1); \
+ d0 = vis_fpadd16(v00, v01); \
+ v11 = vis_fmul8ulx16(sum1, xFilter1); \
+ v20 = vis_fmul8sux16(sum2, xFilter2); \
+ d1 = vis_fpadd16(v10, v11); \
+ v21 = vis_fmul8ulx16(sum2, xFilter2); \
+ d2 = vis_fpadd16(v20, v21); \
+ FADD_3BC_U8();
+
+/***************************************************************/
+#define BC_U8_3CH(mlib_filters_u8, mlib_filters_u8_3) \
+ v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); \
+ v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter)); \
+ v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter)); \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ filterposy = (Y >> FILTER_SHIFT); \
+ v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); \
+ data1 = dpSrc[1]; \
+ v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter)); \
+ sum0 = vis_fpadd16(v00, v10); \
+ data2 = dpSrc[2]; \
+ row00 = vis_faligndata(data0, data1); \
+ v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter)); \
+ row01 = vis_faligndata(data1, data2); \
+ filterposx = (X >> FILTER_SHIFT); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); \
+ sum1 = vis_fpadd16(v01, v11); \
+ data0 = dpSrc[0]; \
+ X += dX; \
+ data1 = dpSrc[1]; \
+ v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter)); \
+ sum2 = vis_fpadd16(v02, v12); \
+ data2 = dpSrc[2]; \
+ row10 = vis_faligndata(data0, data1); \
+ v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter)); \
+ row11 = vis_faligndata(data1, data2); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ Y += dY; \
+ xSrc = (X>>MLIB_SHIFT)-1; \
+ v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter)); \
+ sum0 = vis_fpadd16(sum0, v20); \
+ data0 = dpSrc[0]; \
+ ySrc = (Y>>MLIB_SHIFT)-1; \
+ data1 = dpSrc[1]; \
+ v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter)); \
+ sum1 = vis_fpadd16(sum1, v21); \
+ data2 = dpSrc[2]; \
+ filterposy &= FILTER_MASK; \
+ row20 = vis_faligndata(data0, data1); \
+ v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter)); \
+ row21 = vis_faligndata(data1, data2); \
+ sPtr += srcYStride; \
+ filterposx &= FILTER_MASK; \
+ sum2 = vis_fpadd16(sum2, v22); \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ sum0 = vis_fpadd16(sum0, v30); \
+ data0 = dpSrc[0]; \
+ sum1 = vis_fpadd16(sum1, v31); \
+ v00 = vis_fmul8sux16(sum0, xFilter0); \
+ data1 = dpSrc[1]; \
+ sum2 = vis_fpadd16(sum2, v32); \
+ v01 = vis_fmul8ulx16(sum0, xFilter0); \
+ data2 = dpSrc[2]; \
+ row30 = vis_faligndata(data0, data1); \
+ v10 = vis_fmul8sux16(sum1, xFilter1); \
+ d0 = vis_fpadd16(v00, v01); \
+ row31 = vis_faligndata(data1, data2); \
+ yFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposy)); \
+ v11 = vis_fmul8ulx16(sum1, xFilter1); \
+ xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_3+3*filterposx)); \
+ xFilter0 = xPtr[0]; \
+ v20 = vis_fmul8sux16(sum2, xFilter2); \
+ d1 = vis_fpadd16(v10, v11); \
+ xFilter1 = xPtr[1]; \
+ v21 = vis_fmul8ulx16(sum2, xFilter2); \
+ xFilter2 = xPtr[2]; \
+ sPtr = (mlib_u8 *)lineAddr[ySrc] + (3*xSrc); \
+ d2 = vis_fpadd16(v20, v21)
+
+/***************************************************************/
+mlib_status mlib_ImageAffine_u8_3ch_bc (mlib_affine_param *param)
+{
+ DECLAREVAR_BC();
+ mlib_s32 filterposx, filterposy;
+ mlib_d64 data0, data1, data2;
+ mlib_d64 sum0, sum1, sum2;
+ mlib_d64 row00, row10, row20, row30;
+ mlib_d64 row01, row11, row21, row31;
+ mlib_d64 xFilter0, xFilter1, xFilter2, yFilter;
+ mlib_d64 v00, v10, v20, v30;
+ mlib_d64 v01, v11, v21, v31;
+ mlib_d64 v02, v12, v22, v32;
+ mlib_d64 d0, d1, d2, d3, d4;
+ mlib_d64 *dpSrc;
+ mlib_s32 cols, i;
+ mlib_d64 *xPtr;
+ union {
+ mlib_u8 t[4];
+ mlib_f32 f;
+ } f0;
+ const mlib_s16 *mlib_filters_table ;
+ const mlib_s16 *mlib_filters_table_3;
+
+ if (filter == MLIB_BICUBIC) {
+ mlib_filters_table = mlib_filters_u8_bc;
+ mlib_filters_table_3 = mlib_filters_u8_bc_3;
+ } else {
+ mlib_filters_table = mlib_filters_u8_bc2;
+ mlib_filters_table_3 = mlib_filters_u8_bc2_3;
+ }
+
+ vis_write_gsr(3 << 3);
+ MLIB_WRITE_BMASK(0x6789ABCD);
+
+ for (j = yStart; j <= yFinish; j ++) {
+
+ CLIP(3);
+
+ cols = xRight - xLeft + 1;
+ i = 0;
+
+ if (i <= cols - 4) {
+
+ NEXT_PIXEL_3BC_U8();
+ LOAD_BC_U8_3CH_1PIXEL(mlib_filters_table, mlib_filters_table_3);
+
+ NEXT_PIXEL_3BC_U8();
+
+ BC_U8_3CH(mlib_filters_table, mlib_filters_table_3);
+ FADD_3BC_U8();
+
+ BC_U8_3CH(mlib_filters_table, mlib_filters_table_3);
+
+#pragma pipeloop(0)
+ for (; i < cols-4; i++) {
+ STORE_BC_U8_3CH_1PIXEL();
+
+ FADD_3BC_U8();
+ BC_U8_3CH(mlib_filters_table, mlib_filters_table_3);
+ }
+
+ STORE_BC_U8_3CH_1PIXEL();
+
+ FADD_3BC_U8();
+ STORE_BC_U8_3CH_1PIXEL();
+
+ RESULT_3BC_U8_1PIXEL();
+ STORE_BC_U8_3CH_1PIXEL();
+
+ LOAD_BC_U8_3CH_1PIXEL(mlib_filters_table, mlib_filters_table_3);
+ RESULT_3BC_U8_1PIXEL();
+ STORE_BC_U8_3CH_1PIXEL();
+ i += 4;
+ }
+
+ for (; i < cols; i++) {
+ NEXT_PIXEL_3BC_U8();
+ LOAD_BC_U8_3CH_1PIXEL(mlib_filters_table, mlib_filters_table_3);
+ RESULT_3BC_U8_1PIXEL();
+ STORE_BC_U8_3CH_1PIXEL();
+ }
+ }
+
+ return MLIB_SUCCESS;
+}
+
+/***************************************************************/
+#define FADD_4BC_U8() \
+ d0 = vis_fpadd16(d00, d10); \
+ d1 = vis_fpadd16(d20, d30); \
+ d0 = vis_fpadd16(d0, d1); \
+ d2 = vis_fpadd16(d01, d11); \
+ d3 = vis_fpadd16(d21, d31); \
+ d2 = vis_fpadd16(d2, d3); \
+ res = vis_fpack16_pair(d0, d2)
+
+/***************************************************************/
+#define LOAD_BC_U8_4CH_1PIXEL(mlib_filters_u8, mlib_filters_u8_4) \
+ filterposy = (Y >> FILTER_SHIFT) & FILTER_MASK; \
+ yFilter = *((mlib_d64 *) ((mlib_u8 *)mlib_filters_u8 + filterposy)); \
+ filterposx = (X >> FILTER_SHIFT) & FILTER_MASK; \
+ xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_4+4*filterposx)); \
+ xFilter0 = xPtr[0]; \
+ xFilter1 = xPtr[1]; \
+ xFilter2 = xPtr[2]; \
+ xFilter3 = xPtr[3]; \
+ X += dX; \
+ Y += dY; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ data2 = dpSrc[2]; \
+ row00 = vis_faligndata(data0, data1); \
+ row01 = vis_faligndata(data1, data2); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ data2 = dpSrc[2]; \
+ row10 = vis_faligndata(data0, data1); \
+ row11 = vis_faligndata(data1, data2); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ data2 = dpSrc[2]; \
+ row20 = vis_faligndata(data0, data1); \
+ row21 = vis_faligndata(data1, data2); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ data1 = dpSrc[1]; \
+ data2 = dpSrc[2]; \
+ row30 = vis_faligndata(data0, data1); \
+ row31 = vis_faligndata(data1, data2)
+
+/***************************************************************/
+#define NEXT_PIXEL_4BC_U8() \
+ xSrc = (X>>MLIB_SHIFT)-1; \
+ ySrc = (Y>>MLIB_SHIFT)-1; \
+ sPtr = (mlib_u8 *)lineAddr[ySrc] + (4*xSrc)
+
+/***************************************************************/
+#define RESULT_4BC_U8_1PIXEL(ind) \
+ v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); \
+ v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter)); \
+ v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter)); \
+ v03 = vis_fmul8x16au(vis_read_lo(row01), vis_read_hi(yFilter)); \
+ v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); \
+ v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter)); \
+ sum0 = vis_fpadd16(v00, v10); \
+ v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter)); \
+ sum1 = vis_fpadd16(v01, v11); \
+ v13 = vis_fmul8x16al(vis_read_lo(row11), vis_read_hi(yFilter)); \
+ sum2 = vis_fpadd16(v02, v12); \
+ v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); \
+ sum3 = vis_fpadd16(v03, v13); \
+ v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter)); \
+ sum0 = vis_fpadd16(sum0, v20); \
+ v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter)); \
+ sum1 = vis_fpadd16(sum1, v21); \
+ v23 = vis_fmul8x16au(vis_read_lo(row21), vis_read_lo(yFilter)); \
+ sum2 = vis_fpadd16(sum2, v22); \
+ v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter)); \
+ sum3 = vis_fpadd16(sum3, v23); \
+ v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter)); \
+ sum0 = vis_fpadd16(sum0, v30); \
+ v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter)); \
+ sum1 = vis_fpadd16(sum1, v31); \
+ v33 = vis_fmul8x16al(vis_read_lo(row31), vis_read_lo(yFilter)); \
+ sum2 = vis_fpadd16(sum2, v32); \
+ v00 = vis_fmul8sux16(sum0, xFilter0); \
+ sum3 = vis_fpadd16(sum3, v33); \
+ v01 = vis_fmul8ulx16(sum0, xFilter0); \
+ v10 = vis_fmul8sux16(sum1, xFilter1); \
+ d0##ind = vis_fpadd16(v00, v01); \
+ v11 = vis_fmul8ulx16(sum1, xFilter1); \
+ v20 = vis_fmul8sux16(sum2, xFilter2); \
+ d1##ind = vis_fpadd16(v10, v11); \
+ v21 = vis_fmul8ulx16(sum2, xFilter2); \
+ v30 = vis_fmul8sux16(sum3, xFilter3); \
+ d2##ind = vis_fpadd16(v20, v21); \
+ v31 = vis_fmul8ulx16(sum3, xFilter3); \
+ d3##ind = vis_fpadd16(v30, v31)
+
+/***************************************************************/
+#define BC_U8_4CH(ind, mlib_filters_u8, mlib_filters_u8_4) \
+ v00 = vis_fmul8x16au(vis_read_hi(row00), vis_read_hi(yFilter)); \
+ v01 = vis_fmul8x16au(vis_read_lo(row00), vis_read_hi(yFilter)); \
+ v02 = vis_fmul8x16au(vis_read_hi(row01), vis_read_hi(yFilter)); \
+ v03 = vis_fmul8x16au(vis_read_lo(row01), vis_read_hi(yFilter)); \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ data0 = dpSrc[0]; \
+ filterposy = (Y >> FILTER_SHIFT); \
+ v10 = vis_fmul8x16al(vis_read_hi(row10), vis_read_hi(yFilter)); \
+ data1 = dpSrc[1]; \
+ v11 = vis_fmul8x16al(vis_read_lo(row10), vis_read_hi(yFilter)); \
+ sum0 = vis_fpadd16(v00, v10); \
+ data2 = dpSrc[2]; \
+ row00 = vis_faligndata(data0, data1); \
+ v12 = vis_fmul8x16al(vis_read_hi(row11), vis_read_hi(yFilter)); \
+ row01 = vis_faligndata(data1, data2); \
+ filterposx = (X >> FILTER_SHIFT); \
+ v13 = vis_fmul8x16al(vis_read_lo(row11), vis_read_hi(yFilter)); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ v20 = vis_fmul8x16au(vis_read_hi(row20), vis_read_lo(yFilter)); \
+ sum1 = vis_fpadd16(v01, v11); \
+ data0 = dpSrc[0]; \
+ X += dX; \
+ data1 = dpSrc[1]; \
+ v21 = vis_fmul8x16au(vis_read_lo(row20), vis_read_lo(yFilter)); \
+ sum2 = vis_fpadd16(v02, v12); \
+ data2 = dpSrc[2]; \
+ row10 = vis_faligndata(data0, data1); \
+ v22 = vis_fmul8x16au(vis_read_hi(row21), vis_read_lo(yFilter)); \
+ row11 = vis_faligndata(data1, data2); \
+ sPtr += srcYStride; \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ v23 = vis_fmul8x16au(vis_read_lo(row21), vis_read_lo(yFilter)); \
+ sum3 = vis_fpadd16(v03, v13); \
+ Y += dY; \
+ xSrc = (X>>MLIB_SHIFT)-1; \
+ v30 = vis_fmul8x16al(vis_read_hi(row30), vis_read_lo(yFilter)); \
+ sum0 = vis_fpadd16(sum0, v20); \
+ data0 = dpSrc[0]; \
+ ySrc = (Y>>MLIB_SHIFT)-1; \
+ data1 = dpSrc[1]; \
+ v31 = vis_fmul8x16al(vis_read_lo(row30), vis_read_lo(yFilter)); \
+ sum1 = vis_fpadd16(sum1, v21); \
+ data2 = dpSrc[2]; \
+ filterposy &= FILTER_MASK; \
+ row20 = vis_faligndata(data0, data1); \
+ v32 = vis_fmul8x16al(vis_read_hi(row31), vis_read_lo(yFilter)); \
+ row21 = vis_faligndata(data1, data2); \
+ sPtr += srcYStride; \
+ filterposx &= FILTER_MASK; \
+ v33 = vis_fmul8x16al(vis_read_lo(row31), vis_read_lo(yFilter)); \
+ sum2 = vis_fpadd16(sum2, v22); \
+ ALIGN_ADDR(dpSrc, sPtr); \
+ sum3 = vis_fpadd16(sum3, v23); \
+ sum0 = vis_fpadd16(sum0, v30); \
+ data0 = dpSrc[0]; \
+ sum1 = vis_fpadd16(sum1, v31); \
+ v00 = vis_fmul8sux16(sum0, xFilter0); \
+ data1 = dpSrc[1]; \
+ sum2 = vis_fpadd16(sum2, v32); \
+ v01 = vis_fmul8ulx16(sum0, xFilter0); \
+ sum3 = vis_fpadd16(sum3, v33); \
+ data2 = dpSrc[2]; \
+ row30 = vis_faligndata(data0, data1); \
+ v10 = vis_fmul8sux16(sum1, xFilter1); \
+ d0##ind = vis_fpadd16(v00, v01); \
+ row31 = vis_faligndata(data1, data2); \
+ yFilter = *((mlib_d64 *)((mlib_u8 *)mlib_filters_u8 + filterposy)); \
+ v11 = vis_fmul8ulx16(sum1, xFilter1); \
+ xPtr=((mlib_d64 *)((mlib_u8 *)mlib_filters_u8_4+4*filterposx)); \
+ xFilter0 = xPtr[0]; \
+ v20 = vis_fmul8sux16(sum2, xFilter2); \
+ d1##ind = vis_fpadd16(v10, v11); \
+ xFilter1 = xPtr[1]; \
+ v21 = vis_fmul8ulx16(sum2, xFilter2); \
+ xFilter2 = xPtr[2]; \
+ v30 = vis_fmul8sux16(sum3, xFilter3); \
+ d2##ind = vis_fpadd16(v20, v21); \
+ v31 = vis_fmul8ulx16(sum3, xFilter3); \
+ xFilter3 = xPtr[3]; \
+ sPtr = (mlib_u8 *)lineAddr[ySrc] + (4*xSrc); \
+ d3##ind = vis_fpadd16(v30, v31)
+
+/***************************************************************/
+mlib_status mlib_ImageAffine_u8_4ch_bc (mlib_affine_param *param)
+{
+ DECLAREVAR_BC();
+ DTYPE *dstLineEnd;
+ mlib_s32 filterposx, filterposy;
+ mlib_d64 data0, data1, data2;
+ mlib_d64 sum0, sum1, sum2, sum3;
+ mlib_d64 row00, row10, row20, row30;
+ mlib_d64 row01, row11, row21, row31;
+ mlib_d64 xFilter0, xFilter1, xFilter2, xFilter3, yFilter;
+ mlib_d64 v00, v10, v20, v30;
+ mlib_d64 v01, v11, v21, v31;
+ mlib_d64 v02, v12, v22, v32;
+ mlib_d64 v03, v13, v23, v33;
+ mlib_d64 d0, d1, d2, d3;
+ mlib_d64 d00, d10, d20, d30;
+ mlib_d64 d01, d11, d21, d31;
+ mlib_d64 *dpSrc;
+ mlib_s32 cols, i;
+ mlib_d64 res, *dp, *xPtr;
+ mlib_s32 mask, emask, gsrd;
+ const mlib_s16 *mlib_filters_table ;
+ const mlib_s16 *mlib_filters_table_4;
+
+ if (filter == MLIB_BICUBIC) {
+ mlib_filters_table = mlib_filters_u8_bc;
+ mlib_filters_table_4 = mlib_filters_u8_bc_4;
+ } else {
+ mlib_filters_table = mlib_filters_u8_bc2;
+ mlib_filters_table_4 = mlib_filters_u8_bc2_4;
+ }
+
+ for (j = yStart; j <= yFinish; j++) {
+
+ vis_write_gsr(3 << 3);
+
+ CLIP(4);
+ dstLineEnd = (DTYPE*)dstData + 4 * xRight;
+ dstLineEnd += 3;
+ dp = (mlib_d64*)vis_alignaddr(dstPixelPtr, 0);
+ mask = vis_edge8(dstPixelPtr, dstLineEnd);
+ gsrd = ((8 - (mlib_addr)dstPixelPtr) & 7);
+
+ cols = xRight - xLeft + 1;
+ i = 0;
+
+ if (i <= cols - 6) {
+
+ NEXT_PIXEL_4BC_U8();
+ LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
+
+ NEXT_PIXEL_4BC_U8();
+
+ BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
+ BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
+ FADD_4BC_U8();
+
+ BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
+ BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
+
+#pragma pipeloop(0)
+ for (; i <= cols-8; i+=2) {
+ vis_alignaddr((void *)gsrd, 0);
+ res = vis_faligndata(res, res);
+
+ vis_pst_8(res, dp++, mask);
+ vis_pst_8(res, dp, ~mask);
+
+ FADD_4BC_U8();
+ BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
+ BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
+ }
+
+ vis_alignaddr((void *)gsrd, 0);
+ res = vis_faligndata(res, res);
+
+ vis_pst_8(res, dp++, mask);
+ vis_pst_8(res, dp, ~mask);
+
+ FADD_4BC_U8();
+ vis_alignaddr((void *)gsrd, 0);
+ res = vis_faligndata(res, res);
+
+ vis_pst_8(res, dp++, mask);
+ vis_pst_8(res, dp, ~mask);
+
+ RESULT_4BC_U8_1PIXEL(0);
+ LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
+ RESULT_4BC_U8_1PIXEL(1);
+ FADD_4BC_U8();
+
+ vis_alignaddr((void *)gsrd, 0);
+ res = vis_faligndata(res, res);
+
+ vis_pst_8(res, dp++, mask);
+ vis_pst_8(res, dp, ~mask);
+ i += 6;
+ }
+
+ if (i <= cols-4) {
+ NEXT_PIXEL_4BC_U8();
+ LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
+
+ NEXT_PIXEL_4BC_U8();
+
+ BC_U8_4CH(0, mlib_filters_table, mlib_filters_table_4);
+ BC_U8_4CH(1, mlib_filters_table, mlib_filters_table_4);
+ FADD_4BC_U8();
+ vis_alignaddr((void *)gsrd, 0);
+ res = vis_faligndata(res, res);
+
+ vis_pst_8(res, dp++, mask);
+ vis_pst_8(res, dp, ~mask);
+
+ RESULT_4BC_U8_1PIXEL(0);
+ LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
+ RESULT_4BC_U8_1PIXEL(1);
+ FADD_4BC_U8();
+
+ vis_alignaddr((void *)gsrd, 0);
+ res = vis_faligndata(res, res);
+
+ vis_pst_8(res, dp++, mask);
+ vis_pst_8(res, dp, ~mask);
+ i += 4;
+ }
+
+ if (i <= cols-2) {
+ NEXT_PIXEL_4BC_U8();
+ LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
+ RESULT_4BC_U8_1PIXEL(0);
+
+ NEXT_PIXEL_4BC_U8();
+ LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
+ RESULT_4BC_U8_1PIXEL(1);
+ FADD_4BC_U8();
+
+ vis_alignaddr((void *)gsrd, 0);
+ res = vis_faligndata(res, res);
+
+ vis_pst_8(res, dp++, mask);
+ vis_pst_8(res, dp, ~mask);
+ i += 2;
+ }
+
+ if (i < cols) {
+ NEXT_PIXEL_4BC_U8();
+ LOAD_BC_U8_4CH_1PIXEL(mlib_filters_table, mlib_filters_table_4);
+ RESULT_4BC_U8_1PIXEL(0);
+
+ d0 = vis_fpadd16(d00, d10);
+ d1 = vis_fpadd16(d20, d30);
+ d0 = vis_fpadd16(d0, d1);
+ res = vis_fpack16_pair(d0, d0);
+ vis_alignaddr((void *)gsrd, 0);
+ res = vis_faligndata(res, res);
+
+ emask = vis_edge8(dp, dstLineEnd);
+ vis_pst_8(res, dp++, emask & mask);
+
+ if ((mlib_u8*)dp <= (mlib_u8*)dstLineEnd) {
+ mask = vis_edge8(dp, dstLineEnd);
+ vis_pst_8(res, dp, mask);
+ }
+ }
+ }
+
+ return MLIB_SUCCESS;
+}
+
+/***************************************************************/