5646 prfm(Address(src, SoftwarePrefetchHintDistance)); |
5646 prfm(Address(src, SoftwarePrefetchHintDistance)); |
5647 orr(v4, T16B, Vtmp1, Vtmp2); |
5647 orr(v4, T16B, Vtmp1, Vtmp2); |
5648 orr(v5, T16B, Vtmp3, Vtmp4); |
5648 orr(v5, T16B, Vtmp3, Vtmp4); |
5649 uzp1(Vtmp1, T16B, Vtmp1, Vtmp2); |
5649 uzp1(Vtmp1, T16B, Vtmp1, Vtmp2); |
5650 uzp1(Vtmp3, T16B, Vtmp3, Vtmp4); |
5650 uzp1(Vtmp3, T16B, Vtmp3, Vtmp4); |
5651 stpq(Vtmp1, Vtmp3, dst); |
|
5652 uzp2(v5, T16B, v4, v5); // high bytes |
5651 uzp2(v5, T16B, v4, v5); // high bytes |
5653 umov(tmp2, v5, D, 1); |
5652 umov(tmp2, v5, D, 1); |
5654 fmovd(tmp1, v5); |
5653 fmovd(tmp1, v5); |
5655 orr(tmp1, tmp1, tmp2); |
5654 orr(tmp1, tmp1, tmp2); |
5656 cbnz(tmp1, LOOP_8); |
5655 cbnz(tmp1, LOOP_8); |
|
5656 stpq(Vtmp1, Vtmp3, dst); |
5657 sub(len, len, 32); |
5657 sub(len, len, 32); |
5658 add(dst, dst, 32); |
5658 add(dst, dst, 32); |
5659 add(src, src, 64); |
5659 add(src, src, 64); |
5660 subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16); |
5660 subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16); |
5661 br(GE, NEXT_32_PRFM); |
5661 br(GE, NEXT_32_PRFM); |
5669 ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); |
5669 ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); |
5670 } |
5670 } |
5671 prfm(Address(src, SoftwarePrefetchHintDistance)); |
5671 prfm(Address(src, SoftwarePrefetchHintDistance)); |
5672 uzp1(v4, T16B, Vtmp1, Vtmp2); |
5672 uzp1(v4, T16B, Vtmp1, Vtmp2); |
5673 uzp1(v5, T16B, Vtmp3, Vtmp4); |
5673 uzp1(v5, T16B, Vtmp3, Vtmp4); |
5674 stpq(v4, v5, dst); |
|
5675 orr(Vtmp1, T16B, Vtmp1, Vtmp2); |
5674 orr(Vtmp1, T16B, Vtmp1, Vtmp2); |
5676 orr(Vtmp3, T16B, Vtmp3, Vtmp4); |
5675 orr(Vtmp3, T16B, Vtmp3, Vtmp4); |
5677 uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes |
5676 uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes |
5678 umov(tmp2, Vtmp1, D, 1); |
5677 umov(tmp2, Vtmp1, D, 1); |
5679 fmovd(tmp1, Vtmp1); |
5678 fmovd(tmp1, Vtmp1); |
5680 orr(tmp1, tmp1, tmp2); |
5679 orr(tmp1, tmp1, tmp2); |
5681 cbnz(tmp1, LOOP_8); |
5680 cbnz(tmp1, LOOP_8); |
|
5681 stpq(v4, v5, dst); |
5682 sub(len, len, 32); |
5682 sub(len, len, 32); |
5683 add(dst, dst, 32); |
5683 add(dst, dst, 32); |
5684 add(src, src, 64); |
5684 add(src, src, 64); |
5685 cmp(len, (u1)32); |
5685 cmp(len, (u1)32); |
5686 br(GE, NEXT_32); |
5686 br(GE, NEXT_32); |
5691 br(LT, LOOP_1); |
5691 br(LT, LOOP_1); |
5692 BIND(NEXT_8); |
5692 BIND(NEXT_8); |
5693 ld1(Vtmp1, T8H, src); |
5693 ld1(Vtmp1, T8H, src); |
5694 uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes |
5694 uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes |
5695 uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes |
5695 uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes |
5696 strd(Vtmp2, dst); |
|
5697 fmovd(tmp1, Vtmp3); |
5696 fmovd(tmp1, Vtmp3); |
5698 cbnz(tmp1, NEXT_1); |
5697 cbnz(tmp1, NEXT_1); |
|
5698 strd(Vtmp2, dst); |
5699 |
5699 |
5700 sub(len, len, 8); |
5700 sub(len, len, 8); |
5701 add(dst, dst, 8); |
5701 add(dst, dst, 8); |
5702 add(src, src, 16); |
5702 add(src, src, 16); |
5703 cmp(len, (u1)8); |
5703 cmp(len, (u1)8); |
5706 BIND(LOOP_1); |
5706 BIND(LOOP_1); |
5707 #endif |
5707 #endif |
5708 cbz(len, DONE); |
5708 cbz(len, DONE); |
5709 BIND(NEXT_1); |
5709 BIND(NEXT_1); |
5710 ldrh(tmp1, Address(post(src, 2))); |
5710 ldrh(tmp1, Address(post(src, 2))); |
5711 strb(tmp1, Address(post(dst, 1))); |
|
5712 tst(tmp1, 0xff00); |
5711 tst(tmp1, 0xff00); |
5713 br(NE, SET_RESULT); |
5712 br(NE, SET_RESULT); |
|
5713 strb(tmp1, Address(post(dst, 1))); |
5714 subs(len, len, 1); |
5714 subs(len, len, 1); |
5715 br(GT, NEXT_1); |
5715 br(GT, NEXT_1); |
5716 |
5716 |
5717 BIND(SET_RESULT); |
5717 BIND(SET_RESULT); |
5718 sub(result, result, len); // Return index where we stopped |
5718 sub(result, result, len); // Return index where we stopped |