774 __ bind(L); |
775 __ bind(L); |
775 } |
776 } |
776 #endif |
777 #endif |
777 |
778 |
778 // Fill 8 registers |
779 // Fill 8 registers |
779 __ ldp(t0, t1, Address(s, 2 * unit)); |
780 if (UseSIMDForMemoryOps) { |
780 __ ldp(t2, t3, Address(s, 4 * unit)); |
781 __ ldpq(v0, v1, Address(s, 4 * unit)); |
781 __ ldp(t4, t5, Address(s, 6 * unit)); |
782 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); |
782 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); |
783 } else { |
|
784 __ ldp(t0, t1, Address(s, 2 * unit)); |
|
785 __ ldp(t2, t3, Address(s, 4 * unit)); |
|
786 __ ldp(t4, t5, Address(s, 6 * unit)); |
|
787 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); |
|
788 } |
783 |
789 |
784 __ subs(count, count, 16); |
790 __ subs(count, count, 16); |
785 __ br(Assembler::LO, drain); |
791 __ br(Assembler::LO, drain); |
786 |
792 |
787 int prefetch = PrefetchCopyIntervalInBytes; |
793 int prefetch = PrefetchCopyIntervalInBytes; |
795 __ bind(again); |
801 __ bind(again); |
796 |
802 |
797 if (PrefetchCopyIntervalInBytes > 0) |
803 if (PrefetchCopyIntervalInBytes > 0) |
798 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); |
804 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); |
799 |
805 |
800 __ stp(t0, t1, Address(d, 2 * unit)); |
806 if (UseSIMDForMemoryOps) { |
801 __ ldp(t0, t1, Address(s, 2 * unit)); |
807 __ stpq(v0, v1, Address(d, 4 * unit)); |
802 __ stp(t2, t3, Address(d, 4 * unit)); |
808 __ ldpq(v0, v1, Address(s, 4 * unit)); |
803 __ ldp(t2, t3, Address(s, 4 * unit)); |
809 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); |
804 __ stp(t4, t5, Address(d, 6 * unit)); |
810 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); |
805 __ ldp(t4, t5, Address(s, 6 * unit)); |
811 } else { |
806 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); |
812 __ stp(t0, t1, Address(d, 2 * unit)); |
807 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); |
813 __ ldp(t0, t1, Address(s, 2 * unit)); |
|
814 __ stp(t2, t3, Address(d, 4 * unit)); |
|
815 __ ldp(t2, t3, Address(s, 4 * unit)); |
|
816 __ stp(t4, t5, Address(d, 6 * unit)); |
|
817 __ ldp(t4, t5, Address(s, 6 * unit)); |
|
818 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); |
|
819 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); |
|
820 } |
808 |
821 |
809 __ subs(count, count, 8); |
822 __ subs(count, count, 8); |
810 __ br(Assembler::HS, again); |
823 __ br(Assembler::HS, again); |
811 |
824 |
812 // Drain |
825 // Drain |
813 __ bind(drain); |
826 __ bind(drain); |
814 __ stp(t0, t1, Address(d, 2 * unit)); |
827 if (UseSIMDForMemoryOps) { |
815 __ stp(t2, t3, Address(d, 4 * unit)); |
828 __ stpq(v0, v1, Address(d, 4 * unit)); |
816 __ stp(t4, t5, Address(d, 6 * unit)); |
829 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); |
817 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); |
830 } else { |
818 |
831 __ stp(t0, t1, Address(d, 2 * unit)); |
819 if (direction == copy_forwards) { |
832 __ stp(t2, t3, Address(d, 4 * unit)); |
820 __ add(s, s, 2 * wordSize); |
833 __ stp(t4, t5, Address(d, 6 * unit)); |
821 __ add(d, d, 2 * wordSize); |
834 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); |
822 } |
835 } |
823 |
836 |
824 { |
837 { |
825 Label L1, L2; |
838 Label L1, L2; |
826 __ tbz(count, exact_log2(4), L1); |
839 __ tbz(count, exact_log2(4), L1); |
827 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); |
840 if (UseSIMDForMemoryOps) { |
828 __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); |
841 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit))); |
829 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); |
842 __ stpq(v0, v1, Address(__ pre(d, 4 * unit))); |
830 __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); |
843 } else { |
|
844 __ ldp(t0, t1, Address(s, 2 * unit)); |
|
845 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); |
|
846 __ stp(t0, t1, Address(d, 2 * unit)); |
|
847 __ stp(t2, t3, Address(__ pre(d, 4 * unit))); |
|
848 } |
831 __ bind(L1); |
849 __ bind(L1); |
|
850 |
|
851 if (direction == copy_forwards) { |
|
852 __ add(s, s, 2 * wordSize); |
|
853 __ add(d, d, 2 * wordSize); |
|
854 } |
832 |
855 |
833 __ tbz(count, 1, L2); |
856 __ tbz(count, 1, L2); |
834 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); |
857 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); |
835 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); |
858 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); |
836 __ bind(L2); |
859 __ bind(L2); |
912 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; |
935 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; |
913 const Register send = r17, dend = r18; |
936 const Register send = r17, dend = r18; |
914 |
937 |
915 if (PrefetchCopyIntervalInBytes > 0) |
938 if (PrefetchCopyIntervalInBytes > 0) |
916 __ prfm(Address(s, 0), PLDL1KEEP); |
939 __ prfm(Address(s, 0), PLDL1KEEP); |
917 |
940 __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity); |
918 __ cmp(count, 80/granularity); |
|
919 __ br(Assembler::HI, copy_big); |
941 __ br(Assembler::HI, copy_big); |
920 |
942 |
921 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); |
943 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); |
922 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); |
944 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); |
923 |
945 |
929 |
951 |
930 __ cmp(count, 32/granularity); |
952 __ cmp(count, 32/granularity); |
931 __ br(Assembler::LS, copy32); |
953 __ br(Assembler::LS, copy32); |
932 |
954 |
933 // 33..64 bytes |
955 // 33..64 bytes |
934 __ ldp(t0, t1, Address(s, 0)); |
956 if (UseSIMDForMemoryOps) { |
935 __ ldp(t2, t3, Address(s, 16)); |
957 __ ldpq(v0, v1, Address(s, 0)); |
936 __ ldp(t4, t5, Address(send, -32)); |
958 __ ldpq(v2, v3, Address(send, -32)); |
937 __ ldp(t6, t7, Address(send, -16)); |
959 __ stpq(v0, v1, Address(d, 0)); |
938 |
960 __ stpq(v2, v3, Address(dend, -32)); |
939 __ stp(t0, t1, Address(d, 0)); |
961 } else { |
940 __ stp(t2, t3, Address(d, 16)); |
962 __ ldp(t0, t1, Address(s, 0)); |
941 __ stp(t4, t5, Address(dend, -32)); |
963 __ ldp(t2, t3, Address(s, 16)); |
942 __ stp(t6, t7, Address(dend, -16)); |
964 __ ldp(t4, t5, Address(send, -32)); |
|
965 __ ldp(t6, t7, Address(send, -16)); |
|
966 |
|
967 __ stp(t0, t1, Address(d, 0)); |
|
968 __ stp(t2, t3, Address(d, 16)); |
|
969 __ stp(t4, t5, Address(dend, -32)); |
|
970 __ stp(t6, t7, Address(dend, -16)); |
|
971 } |
943 __ b(finish); |
972 __ b(finish); |
944 |
973 |
945 // 17..32 bytes |
974 // 17..32 bytes |
946 __ bind(copy32); |
975 __ bind(copy32); |
947 __ ldp(t0, t1, Address(s, 0)); |
976 __ ldp(t0, t1, Address(s, 0)); |
948 __ ldp(t2, t3, Address(send, -16)); |
977 __ ldp(t2, t3, Address(send, -16)); |
949 __ stp(t0, t1, Address(d, 0)); |
978 __ stp(t0, t1, Address(d, 0)); |
950 __ stp(t2, t3, Address(dend, -16)); |
979 __ stp(t2, t3, Address(dend, -16)); |
951 __ b(finish); |
980 __ b(finish); |
952 |
981 |
953 // 65..80 bytes |
982 // 65..80/96 bytes |
|
983 // (96 bytes if SIMD because we do 32 bytes per instruction) |
954 __ bind(copy80); |
984 __ bind(copy80); |
955 __ ldp(t0, t1, Address(s, 0)); |
985 if (UseSIMDForMemoryOps) { |
956 __ ldp(t2, t3, Address(s, 16)); |
986 __ ldpq(v0, v1, Address(s, 0)); |
957 __ ldp(t4, t5, Address(s, 32)); |
987 __ ldpq(v2, v3, Address(s, 32)); |
958 __ ldp(t6, t7, Address(s, 48)); |
988 __ ldpq(v4, v5, Address(send, -32)); |
959 __ ldp(t8, t9, Address(send, -16)); |
989 __ stpq(v0, v1, Address(d, 0)); |
960 |
990 __ stpq(v2, v3, Address(d, 32)); |
961 __ stp(t0, t1, Address(d, 0)); |
991 __ stpq(v4, v5, Address(dend, -32)); |
962 __ stp(t2, t3, Address(d, 16)); |
992 } else { |
963 __ stp(t4, t5, Address(d, 32)); |
993 __ ldp(t0, t1, Address(s, 0)); |
964 __ stp(t6, t7, Address(d, 48)); |
994 __ ldp(t2, t3, Address(s, 16)); |
965 __ stp(t8, t9, Address(dend, -16)); |
995 __ ldp(t4, t5, Address(s, 32)); |
|
996 __ ldp(t6, t7, Address(s, 48)); |
|
997 __ ldp(t8, t9, Address(send, -16)); |
|
998 |
|
999 __ stp(t0, t1, Address(d, 0)); |
|
1000 __ stp(t2, t3, Address(d, 16)); |
|
1001 __ stp(t4, t5, Address(d, 32)); |
|
1002 __ stp(t6, t7, Address(d, 48)); |
|
1003 __ stp(t8, t9, Address(dend, -16)); |
|
1004 } |
966 __ b(finish); |
1005 __ b(finish); |
967 |
1006 |
968 // 0..16 bytes |
1007 // 0..16 bytes |
969 __ bind(copy16); |
1008 __ bind(copy16); |
970 __ cmp(count, 8/granularity); |
1009 __ cmp(count, 8/granularity); |