--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Fri Oct 17 15:18:02 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Tue Oct 21 11:21:45 2008 -0700
@@ -791,6 +791,69 @@
}
}
+
+ // Copy qwords forward through XMM registers, 64 bytes per main-loop pass
+ //
+ // Inputs:
+ // from - source array address (advanced past the copied qwords)
+ // to_from - destination array address - from
+ // qword_count - 8-bytes element count, expected >= 0 (clobbered)
+ //
+ void xmm_copy_forward(Register from, Register to_from, Register qword_count) {
+ assert( UseSSE >= 2, "supported cpu only" );
+ Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
+ // Copy 64-byte chunks; enter at the count check so short inputs skip the loop body
+ __ jmpb(L_copy_64_bytes);
+ __ align(16);
+ __ BIND(L_copy_64_bytes_loop);
+
+ if(UseUnalignedLoadStores) { // 4 x 16-byte movdqu load/store pairs
+ __ movdqu(xmm0, Address(from, 0));
+ __ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
+ __ movdqu(xmm1, Address(from, 16));
+ __ movdqu(Address(from, to_from, Address::times_1, 16), xmm1);
+ __ movdqu(xmm2, Address(from, 32));
+ __ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
+ __ movdqu(xmm3, Address(from, 48));
+ __ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
+
+ } else { // 8 x 8-byte movq load/store pairs
+ __ movq(xmm0, Address(from, 0));
+ __ movq(Address(from, to_from, Address::times_1, 0), xmm0);
+ __ movq(xmm1, Address(from, 8));
+ __ movq(Address(from, to_from, Address::times_1, 8), xmm1);
+ __ movq(xmm2, Address(from, 16));
+ __ movq(Address(from, to_from, Address::times_1, 16), xmm2);
+ __ movq(xmm3, Address(from, 24));
+ __ movq(Address(from, to_from, Address::times_1, 24), xmm3);
+ __ movq(xmm4, Address(from, 32));
+ __ movq(Address(from, to_from, Address::times_1, 32), xmm4);
+ __ movq(xmm5, Address(from, 40));
+ __ movq(Address(from, to_from, Address::times_1, 40), xmm5);
+ __ movq(xmm6, Address(from, 48));
+ __ movq(Address(from, to_from, Address::times_1, 48), xmm6);
+ __ movq(xmm7, Address(from, 56));
+ __ movq(Address(from, to_from, Address::times_1, 56), xmm7);
+ }
+
+ __ addl(from, 64); // stores address from + to_from, so this advances both sides
+ __ BIND(L_copy_64_bytes);
+ __ subl(qword_count, 8); // loop again only while at least 8 qwords were left
+ __ jcc(Assembler::greaterEqual, L_copy_64_bytes_loop);
+ __ addl(qword_count, 8); // undo the final subtraction to get the leftover qwords
+ __ jccb(Assembler::zero, L_exit);
+ //
+ // fewer than 8 qwords remain, copy them one at a time
+ //
+ __ BIND(L_copy_8_bytes);
+ __ movq(xmm0, Address(from, 0));
+ __ movq(Address(from, to_from, Address::times_1), xmm0);
+ __ addl(from, 8);
+ __ decrement(qword_count);
+ __ jcc(Assembler::greater, L_copy_8_bytes); // repeat while leftover qwords remain
+ __ BIND(L_exit);
+ }
+
// Copy 64 bytes chunks
//
// Inputs:
@@ -799,6 +862,7 @@
// qword_count - 8-bytes element count, negative
//
void mmx_copy_forward(Register from, Register to_from, Register qword_count) {
+ assert( VM_Version::supports_mmx(), "supported cpu only" );
Label L_copy_64_bytes_loop, L_copy_64_bytes, L_copy_8_bytes, L_exit;
// Copy 64-byte chunks
__ jmpb(L_copy_64_bytes);
@@ -876,7 +940,7 @@
__ subptr(to, from); // to --> to_from
__ cmpl(count, 2<<shift); // Short arrays (< 8 bytes) copy by element
__ jcc(Assembler::below, L_copy_4_bytes); // use unsigned cmp
- if (!aligned && (t == T_BYTE || t == T_SHORT)) {
+ if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
// align source address at 4 bytes address boundary
if (t == T_BYTE) {
// One byte misalignment happens only for byte arrays
@@ -906,20 +970,26 @@
__ mov(count, rax); // restore 'count'
__ jmpb(L_copy_2_bytes); // all dwords were copied
} else {
- // align to 8 bytes, we know we are 4 byte aligned to start
- __ testptr(from, 4);
- __ jccb(Assembler::zero, L_copy_64_bytes);
- __ movl(rax, Address(from, 0));
- __ movl(Address(from, to_from, Address::times_1, 0), rax);
- __ addptr(from, 4);
- __ subl(count, 1<<shift);
+ if (!UseUnalignedLoadStores) {
+ // align to 8 bytes, we know we are 4 byte aligned to start
+ __ testptr(from, 4);
+ __ jccb(Assembler::zero, L_copy_64_bytes);
+ __ movl(rax, Address(from, 0));
+ __ movl(Address(from, to_from, Address::times_1, 0), rax);
+ __ addptr(from, 4);
+ __ subl(count, 1<<shift);
+ }
__ BIND(L_copy_64_bytes);
__ mov(rax, count);
__ shrl(rax, shift+1); // 8 bytes chunk count
//
// Copy 8-byte chunks through MMX registers, 8 per iteration of the loop
//
- mmx_copy_forward(from, to_from, rax);
+ if (UseXMMForArrayCopy) {
+ xmm_copy_forward(from, to_from, rax);
+ } else {
+ mmx_copy_forward(from, to_from, rax);
+ }
}
// copy tailing dword
__ BIND(L_copy_4_bytes);
@@ -1069,13 +1139,20 @@
__ align(16);
// Move 8 bytes
__ BIND(L_copy_8_bytes_loop);
- __ movq(mmx0, Address(from, count, sf, 0));
- __ movq(Address(to, count, sf, 0), mmx0);
+ if (UseXMMForArrayCopy) {
+ __ movq(xmm0, Address(from, count, sf, 0));
+ __ movq(Address(to, count, sf, 0), xmm0);
+ } else {
+ __ movq(mmx0, Address(from, count, sf, 0));
+ __ movq(Address(to, count, sf, 0), mmx0);
+ }
__ BIND(L_copy_8_bytes);
__ subl(count, 2<<shift);
__ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
__ addl(count, 2<<shift);
- __ emms();
+ if (!UseXMMForArrayCopy) {
+ __ emms();
+ }
}
__ BIND(L_copy_4_bytes);
// copy prefix qword
@@ -1143,7 +1220,11 @@
__ subptr(to, from); // to --> to_from
if (VM_Version::supports_mmx()) {
- mmx_copy_forward(from, to_from, count);
+ if (UseXMMForArrayCopy) {
+ xmm_copy_forward(from, to_from, count);
+ } else {
+ mmx_copy_forward(from, to_from, count);
+ }
} else {
__ jmpb(L_copy_8_bytes);
__ align(16);
@@ -1196,8 +1277,13 @@
__ align(16);
__ BIND(L_copy_8_bytes_loop);
if (VM_Version::supports_mmx()) {
- __ movq(mmx0, Address(from, count, Address::times_8));
- __ movq(Address(to, count, Address::times_8), mmx0);
+ if (UseXMMForArrayCopy) {
+ __ movq(xmm0, Address(from, count, Address::times_8));
+ __ movq(Address(to, count, Address::times_8), xmm0);
+ } else {
+ __ movq(mmx0, Address(from, count, Address::times_8));
+ __ movq(Address(to, count, Address::times_8), mmx0);
+ }
} else {
__ fild_d(Address(from, count, Address::times_8));
__ fistp_d(Address(to, count, Address::times_8));
@@ -1206,7 +1292,7 @@
__ decrement(count);
__ jcc(Assembler::greaterEqual, L_copy_8_bytes_loop);
- if (VM_Version::supports_mmx()) {
+ if (VM_Version::supports_mmx() && !UseXMMForArrayCopy) {
__ emms();
}
inc_copy_counter_np(T_LONG);