--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Thu Jan 03 15:09:55 2013 -0800
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Thu Jan 03 16:30:47 2013 -0800
@@ -1286,23 +1286,54 @@
// end_to - destination array end address
// qword_count - 64-bits element count, negative
// to - scratch
- // L_copy_32_bytes - entry label
+ // L_copy_bytes - entry label
// L_copy_8_bytes - exit label
//
- void copy_32_bytes_forward(Register end_from, Register end_to,
+ void copy_bytes_forward(Register end_from, Register end_to,
Register qword_count, Register to,
- Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
+ Label& L_copy_bytes, Label& L_copy_8_bytes) {
DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop;
__ align(OptoLoopAlignment);
- __ BIND(L_loop);
- if(UseUnalignedLoadStores) {
- __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
- __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
- __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
- __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
-
+ if (UseUnalignedLoadStores) {
+ Label L_end;
+ // Copy 64 bytes per iteration
+ __ BIND(L_loop);
+ if (UseAVX >= 2) {
+ __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+ __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
+ __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
+ __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
+ } else {
+ __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
+ __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
+ __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
+ __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
+ }
+ __ BIND(L_copy_bytes);
+ __ addptr(qword_count, 8);
+ __ jcc(Assembler::lessEqual, L_loop);
+ __ subptr(qword_count, 4); // sub(8) and add(4)
+ __ jccb(Assembler::greater, L_end);
+ // Copy trailing 32 bytes
+ if (UseAVX >= 2) {
+ __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+ __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+ } else {
+ __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
+ __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
+ __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
+ }
+ __ addptr(qword_count, 4);
+ __ BIND(L_end);
} else {
+ // Copy 32 bytes per iteration
+ __ BIND(L_loop);
__ movq(to, Address(end_from, qword_count, Address::times_8, -24));
__ movq(Address(end_to, qword_count, Address::times_8, -24), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, -16));
@@ -1311,15 +1342,15 @@
__ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
__ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
+
+ __ BIND(L_copy_bytes);
+ __ addptr(qword_count, 4);
+ __ jcc(Assembler::lessEqual, L_loop);
}
- __ BIND(L_copy_32_bytes);
- __ addptr(qword_count, 4);
- __ jcc(Assembler::lessEqual, L_loop);
__ subptr(qword_count, 4);
__ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
}
-
// Copy big chunks backward
//
// Inputs:
@@ -1327,23 +1358,55 @@
// dest - destination array address
// qword_count - 64-bits element count
// to - scratch
- // L_copy_32_bytes - entry label
+ // L_copy_bytes - entry label
// L_copy_8_bytes - exit label
//
- void copy_32_bytes_backward(Register from, Register dest,
+ void copy_bytes_backward(Register from, Register dest,
Register qword_count, Register to,
- Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
+ Label& L_copy_bytes, Label& L_copy_8_bytes) {
DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop;
__ align(OptoLoopAlignment);
- __ BIND(L_loop);
- if(UseUnalignedLoadStores) {
- __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
- __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
- __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
- __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
-
+ if (UseUnalignedLoadStores) {
+ Label L_end;
+ // Copy 64 bytes per iteration
+ __ BIND(L_loop);
+ if (UseAVX >= 2) {
+ __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
+ __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
+ __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
+ __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
+ } else {
+ __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
+ __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
+ __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
+ __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
+ }
+ __ BIND(L_copy_bytes);
+ __ subptr(qword_count, 8);
+ __ jcc(Assembler::greaterEqual, L_loop);
+
+ __ addptr(qword_count, 4); // add(8) and sub(4)
+ __ jccb(Assembler::less, L_end);
+ // Copy trailing 32 bytes
+ if (UseAVX >= 2) {
+ __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
+ __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
+ } else {
+ __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
+ __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
+ __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
+ }
+ __ subptr(qword_count, 4);
+ __ BIND(L_end);
} else {
+ // Copy 32 bytes per iteration
+ __ BIND(L_loop);
__ movq(to, Address(from, qword_count, Address::times_8, 24));
__ movq(Address(dest, qword_count, Address::times_8, 24), to);
__ movq(to, Address(from, qword_count, Address::times_8, 16));
@@ -1352,10 +1415,11 @@
__ movq(Address(dest, qword_count, Address::times_8, 8), to);
__ movq(to, Address(from, qword_count, Address::times_8, 0));
__ movq(Address(dest, qword_count, Address::times_8, 0), to);
+
+ __ BIND(L_copy_bytes);
+ __ subptr(qword_count, 4);
+ __ jcc(Assembler::greaterEqual, L_loop);
}
- __ BIND(L_copy_32_bytes);
- __ subptr(qword_count, 4);
- __ jcc(Assembler::greaterEqual, L_loop);
__ addptr(qword_count, 4);
__ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
}
@@ -1385,7 +1449,7 @@
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
- Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
+ Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
Label L_copy_byte, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
@@ -1417,7 +1481,7 @@
__ lea(end_from, Address(from, qword_count, Address::times_8, -8));
__ lea(end_to, Address(to, qword_count, Address::times_8, -8));
__ negptr(qword_count); // make the count negative
- __ jmp(L_copy_32_bytes);
+ __ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@@ -1460,8 +1524,8 @@
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
- // Copy in 32-bytes chunks
- copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+ // Copy in multi-byte chunks
+ copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
__ jmp(L_copy_4_bytes);
return start;
@@ -1488,7 +1552,7 @@
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
- Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
+ Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
@@ -1531,10 +1595,10 @@
// Check for and copy trailing dword
__ BIND(L_copy_4_bytes);
__ testl(byte_count, 4);
- __ jcc(Assembler::zero, L_copy_32_bytes);
+ __ jcc(Assembler::zero, L_copy_bytes);
__ movl(rax, Address(from, qword_count, Address::times_8));
__ movl(Address(to, qword_count, Address::times_8), rax);
- __ jmp(L_copy_32_bytes);
+ __ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@@ -1549,8 +1613,8 @@
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
- // Copy in 32-bytes chunks
- copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+ // Copy in multi-byte chunks
+ copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
restore_arg_regs();
inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
@@ -1585,7 +1649,7 @@
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
- Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
+ Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
@@ -1616,7 +1680,7 @@
__ lea(end_from, Address(from, qword_count, Address::times_8, -8));
__ lea(end_to, Address(to, qword_count, Address::times_8, -8));
__ negptr(qword_count);
- __ jmp(L_copy_32_bytes);
+ __ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@@ -1652,8 +1716,8 @@
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
- // Copy in 32-bytes chunks
- copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+ // Copy in multi-byte chunks
+ copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
__ jmp(L_copy_4_bytes);
return start;
@@ -1700,7 +1764,7 @@
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
- Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes;
+ Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
@@ -1735,10 +1799,10 @@
// Check for and copy trailing dword
__ BIND(L_copy_4_bytes);
__ testl(word_count, 2);
- __ jcc(Assembler::zero, L_copy_32_bytes);
+ __ jcc(Assembler::zero, L_copy_bytes);
__ movl(rax, Address(from, qword_count, Address::times_8));
__ movl(Address(to, qword_count, Address::times_8), rax);
- __ jmp(L_copy_32_bytes);
+ __ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@@ -1753,8 +1817,8 @@
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
- // Copy in 32-bytes chunks
- copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+ // Copy in multi-byte chunks
+ copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
restore_arg_regs();
inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
@@ -1790,7 +1854,7 @@
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
- Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
+ Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
@@ -1826,7 +1890,7 @@
__ lea(end_from, Address(from, qword_count, Address::times_8, -8));
__ lea(end_to, Address(to, qword_count, Address::times_8, -8));
__ negptr(qword_count);
- __ jmp(L_copy_32_bytes);
+ __ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@@ -1853,8 +1917,8 @@
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
- // Copy 32-bytes chunks
- copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+ // Copy in multi-byte chunks
+ copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
__ jmp(L_copy_4_bytes);
return start;
@@ -1882,7 +1946,7 @@
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
- Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
+ Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
@@ -1916,10 +1980,10 @@
// Check for and copy trailing dword
__ testl(dword_count, 1);
- __ jcc(Assembler::zero, L_copy_32_bytes);
+ __ jcc(Assembler::zero, L_copy_bytes);
__ movl(rax, Address(from, dword_count, Address::times_4, -4));
__ movl(Address(to, dword_count, Address::times_4, -4), rax);
- __ jmp(L_copy_32_bytes);
+ __ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@@ -1937,8 +2001,8 @@
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
- // Copy in 32-bytes chunks
- copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+ // Copy in multi-byte chunks
+ copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
__ bind(L_exit);
if (is_oop) {
@@ -1976,7 +2040,7 @@
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
- Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
+ Label L_copy_bytes, L_copy_8_bytes, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register qword_count = rdx; // elements count
@@ -2008,7 +2072,7 @@
__ lea(end_from, Address(from, qword_count, Address::times_8, -8));
__ lea(end_to, Address(to, qword_count, Address::times_8, -8));
__ negptr(qword_count);
- __ jmp(L_copy_32_bytes);
+ __ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@@ -2027,8 +2091,8 @@
__ ret(0);
}
- // Copy 64-byte chunks
- copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+ // Copy in multi-byte chunks
+ copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
if (is_oop) {
__ BIND(L_exit);
@@ -2065,7 +2129,7 @@
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
- Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
+ Label L_copy_bytes, L_copy_8_bytes, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register qword_count = rdx; // elements count
@@ -2091,7 +2155,7 @@
gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
}
- __ jmp(L_copy_32_bytes);
+ __ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@@ -2110,8 +2174,8 @@
__ ret(0);
}
- // Copy in 32-bytes chunks
- copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
+ // Copy in multi-byte chunks
+ copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
if (is_oop) {
__ BIND(L_exit);