7033154: Improve C1 arraycopy performance
author roland
Sun, 03 Apr 2011 12:00:54 +0200
changeset 9102 4708a4aefb33
parent 9101 ff58f9a8e31c
child 9103 535a93f494f6
7033154: Improve C1 arraycopy performance. Summary: better static analysis; take advantage of array copy stubs. Reviewed-by: never
hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp
hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
hotspot/src/share/vm/c1/c1_GraphBuilder.cpp
hotspot/src/share/vm/c1/c1_Instruction.cpp
hotspot/src/share/vm/c1/c1_Instruction.hpp
hotspot/src/share/vm/c1/c1_LIR.hpp
hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
hotspot/src/share/vm/c1/c1_Optimizer.cpp
hotspot/src/share/vm/c1/c1_Runtime1.cpp
hotspot/src/share/vm/c1/c1_Runtime1.hpp
hotspot/src/share/vm/opto/library_call.cpp
hotspot/src/share/vm/runtime/stubRoutines.cpp
hotspot/src/share/vm/runtime/stubRoutines.hpp
--- a/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Sun Apr 03 12:00:54 2011 +0200
@@ -2065,20 +2065,36 @@
   // the known type isn't loaded since the code sanity checks
   // in debug mode and the type isn't required when we know the exact type
   // also check that the type is an array type.
-  // We also, for now, always call the stub if the barrier set requires a
-  // write_ref_pre barrier (which the stub does, but none of the optimized
-  // cases currently does).
-  if (op->expected_type() == NULL ||
-      Universe::heap()->barrier_set()->has_write_ref_pre_barrier()) {
+  if (op->expected_type() == NULL) {
     __ mov(src,     O0);
     __ mov(src_pos, O1);
     __ mov(dst,     O2);
     __ mov(dst_pos, O3);
     __ mov(length,  O4);
-    __ call_VM_leaf(tmp, CAST_FROM_FN_PTR(address, Runtime1::arraycopy));
-
-    __ br_zero(Assembler::less, false, Assembler::pn, O0, *stub->entry());
-    __ delayed()->nop();
+    address copyfunc_addr = StubRoutines::generic_arraycopy();
+
+    if (copyfunc_addr == NULL) { // Use C version if stub was not generated
+      __ call_VM_leaf(tmp, CAST_FROM_FN_PTR(address, Runtime1::arraycopy));
+    } else {
+#ifndef PRODUCT
+      if (PrintC1Statistics) {
+        address counter = (address)&Runtime1::_generic_arraycopystub_cnt;
+        __ inc_counter(counter, G1, G3);
+      }
+#endif
+      __ call_VM_leaf(tmp, copyfunc_addr);
+    }
+
+    if (copyfunc_addr != NULL) {
+      __ xor3(O0, -1, tmp);
+      __ sub(length, tmp, length);
+      __ add(src_pos, tmp, src_pos);
+      __ br_zero(Assembler::less, false, Assembler::pn, O0, *stub->entry());
+      __ delayed()->add(dst_pos, tmp, dst_pos);
+    } else {
+      __ br_zero(Assembler::less, false, Assembler::pn, O0, *stub->entry());
+      __ delayed()->nop();
+    }
     __ bind(*stub->continuation());
     return;
   }
@@ -2135,20 +2151,143 @@
     __ delayed()->nop();
   }
 
+#ifdef _LP64 // 64-bit only, like the LP64_ONLY(sra) lines this hunk hoists from further down
+  __ sra(dst_pos, 0, dst_pos); //higher 32bits must be null
+  __ sra(src_pos, 0, src_pos); //higher 32bits must be null
+#endif
+
+  int shift = shift_amount(basic_type);
+
   if (flags & LIR_OpArrayCopy::type_check) {
-    if (UseCompressedOops) {
-      // We don't need decode because we just need to compare
-      __ lduw(src, oopDesc::klass_offset_in_bytes(), tmp);
-      __ lduw(dst, oopDesc::klass_offset_in_bytes(), tmp2);
-      __ cmp(tmp, tmp2);
-      __ br(Assembler::notEqual, false, Assembler::pt, *stub->entry());
+    // We don't know the array types are compatible
+    if (basic_type != T_OBJECT) {
+      // Simple test for basic type arrays
+      if (UseCompressedOops) {
+        // We don't need decode because we just need to compare
+        __ lduw(src, oopDesc::klass_offset_in_bytes(), tmp);
+        __ lduw(dst, oopDesc::klass_offset_in_bytes(), tmp2);
+        __ cmp(tmp, tmp2);
+        __ br(Assembler::notEqual, false, Assembler::pt, *stub->entry());
+      } else {
+        __ ld_ptr(src, oopDesc::klass_offset_in_bytes(), tmp);
+        __ ld_ptr(dst, oopDesc::klass_offset_in_bytes(), tmp2);
+        __ cmp(tmp, tmp2);
+        __ brx(Assembler::notEqual, false, Assembler::pt, *stub->entry());
+      }
+      __ delayed()->nop();
     } else {
-      __ ld_ptr(src, oopDesc::klass_offset_in_bytes(), tmp);
-      __ ld_ptr(dst, oopDesc::klass_offset_in_bytes(), tmp2);
-      __ cmp(tmp, tmp2);
-      __ brx(Assembler::notEqual, false, Assembler::pt, *stub->entry());
+      // For object arrays, if src is a sub class of dst then we can
+      // safely do the copy.
+      address copyfunc_addr = StubRoutines::checkcast_arraycopy();
+
+      Label cont, slow;
+      assert_different_registers(tmp, tmp2, G3, G1);
+
+      __ load_klass(src, G3);
+      __ load_klass(dst, G1);
+
+      __ check_klass_subtype_fast_path(G3, G1, tmp, tmp2, &cont, copyfunc_addr == NULL ? stub->entry() : &slow, NULL);
+
+      __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
+      __ delayed()->nop();
+
+      __ cmp(G3, 0);
+      if (copyfunc_addr != NULL) { // use stub if available
+        // src is not a sub class of dst so we have to do a
+        // per-element check.
+        __ br(Assembler::notEqual, false, Assembler::pt, cont);
+        __ delayed()->nop();
+
+        __ bind(slow);
+
+        int mask = LIR_OpArrayCopy::src_objarray|LIR_OpArrayCopy::dst_objarray;
+        if ((flags & mask) != mask) {
+          // Check that at least both of them object arrays.
+          assert(flags & mask, "one of the two should be known to be an object array");
+
+          if (!(flags & LIR_OpArrayCopy::src_objarray)) {
+            __ load_klass(src, tmp);
+          } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) {
+            __ load_klass(dst, tmp);
+          }
+          int lh_offset = klassOopDesc::header_size() * HeapWordSize +
+            Klass::layout_helper_offset_in_bytes();
+
+          __ lduw(tmp, lh_offset, tmp2);
+
+          jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
+          __ set(objArray_lh, tmp);
+          __ cmp(tmp, tmp2);
+          __ br(Assembler::notEqual, false, Assembler::pt,  *stub->entry());
+          __ delayed()->nop();
+        }
+
+        Register src_ptr = O0;
+        Register dst_ptr = O1;
+        Register len     = O2;
+        Register chk_off = O3;
+        Register super_k = O4;
+
+        __ add(src, arrayOopDesc::base_offset_in_bytes(basic_type), src_ptr);
+        if (shift == 0) {
+          __ add(src_ptr, src_pos, src_ptr);
+        } else {
+          __ sll(src_pos, shift, tmp);
+          __ add(src_ptr, tmp, src_ptr);
+        }
+
+        __ add(dst, arrayOopDesc::base_offset_in_bytes(basic_type), dst_ptr);
+        if (shift == 0) {
+          __ add(dst_ptr, dst_pos, dst_ptr);
+        } else {
+          __ sll(dst_pos, shift, tmp);
+          __ add(dst_ptr, tmp, dst_ptr);
+        }
+        LP64_ONLY( __ sra(length, 0, length)); //higher 32bits must be null
+        __ mov(length, len);
+        __ load_klass(dst, tmp);
+
+        int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
+                         objArrayKlass::element_klass_offset_in_bytes());
+        __ ld_ptr(tmp, ek_offset, super_k);
+
+        int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
+                          Klass::super_check_offset_offset_in_bytes());
+        __ lduw(super_k, sco_offset, chk_off);
+
+        __ call_VM_leaf(tmp, copyfunc_addr);
+
+#ifndef PRODUCT
+        if (PrintC1Statistics) {
+          Label failed;
+          __ br_notnull(O0, false, Assembler::pn,  failed);
+          __ delayed()->nop();
+          __ inc_counter((address)&Runtime1::_arraycopy_checkcast_cnt, G1, G3);
+          __ bind(failed);
+        }
+#endif
+
+        __ br_null(O0, false, Assembler::pt,  *stub->continuation());
+        __ delayed()->xor3(O0, -1, tmp);
+
+#ifndef PRODUCT
+        if (PrintC1Statistics) {
+          __ inc_counter((address)&Runtime1::_arraycopy_checkcast_attempt_cnt, G1, G3);
+        }
+#endif
+
+        __ sub(length, tmp, length);
+        __ add(src_pos, tmp, src_pos);
+        __ br(Assembler::always, false, Assembler::pt, *stub->entry());
+        __ delayed()->add(dst_pos, tmp, dst_pos);
+
+        __ bind(cont);
+      } else {
+        __ br(Assembler::equal, false, Assembler::pn, *stub->entry());
+        __ delayed()->nop();
+        __ bind(cont);
+      }
     }
-    __ delayed()->nop();
   }
 
 #ifdef ASSERT
@@ -2207,14 +2346,18 @@
   }
 #endif
 
-  int shift = shift_amount(basic_type);
+#ifndef PRODUCT
+  if (PrintC1Statistics) {
+    address counter = Runtime1::arraycopy_count_address(basic_type);
+    __ inc_counter(counter, G1, G3);
+  }
+#endif
 
   Register src_ptr = O0;
   Register dst_ptr = O1;
   Register len     = O2;
 
   __ add(src, arrayOopDesc::base_offset_in_bytes(basic_type), src_ptr);
-  LP64_ONLY(__ sra(src_pos, 0, src_pos);) //higher 32bits must be null
   if (shift == 0) {
     __ add(src_ptr, src_pos, src_ptr);
   } else {
@@ -2223,7 +2366,6 @@
   }
 
   __ add(dst, arrayOopDesc::base_offset_in_bytes(basic_type), dst_ptr);
-  LP64_ONLY(__ sra(dst_pos, 0, dst_pos);) //higher 32bits must be null
   if (shift == 0) {
     __ add(dst_ptr, dst_pos, dst_ptr);
   } else {
@@ -2231,18 +2373,14 @@
     __ add(dst_ptr, tmp, dst_ptr);
   }
 
-  if (basic_type != T_OBJECT) {
-    if (shift == 0) {
-      __ mov(length, len);
-    } else {
-      __ sll(length, shift, len);
-    }
-    __ call_VM_leaf(tmp, CAST_FROM_FN_PTR(address, Runtime1::primitive_arraycopy));
-  } else {
-    // oop_arraycopy takes a length in number of elements, so don't scale it.
-    __ mov(length, len);
-    __ call_VM_leaf(tmp, CAST_FROM_FN_PTR(address, Runtime1::oop_arraycopy));
-  }
+  bool disjoint = (flags & LIR_OpArrayCopy::overlapping) == 0;
+  bool aligned = (flags & LIR_OpArrayCopy::unaligned) == 0;
+  const char *name;
+  address entry = StubRoutines::select_arraycopy_function(basic_type, aligned, disjoint, name, false);
+
+  // arraycopy stubs takes a length in number of elements, so don't scale it.
+  __ mov(length, len);
+  __ call_VM_leaf(tmp, entry);
 
   __ bind(*stub->continuation());
 }
--- a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Sun Apr 03 12:00:54 2011 +0200
@@ -3102,7 +3102,7 @@
   BasicType basic_type = default_type != NULL ? default_type->element_type()->basic_type() : T_ILLEGAL;
   if (basic_type == T_ARRAY) basic_type = T_OBJECT;
 
-  // if we don't know anything or it's an object array, just go through the generic arraycopy
+  // if we don't know anything, just go through the generic arraycopy
   if (default_type == NULL) {
     Label done;
     // save outgoing arguments on stack in case call to System.arraycopy is needed
@@ -3123,7 +3123,9 @@
     store_parameter(src, 4);
     NOT_LP64(assert(src == rcx && src_pos == rdx, "mismatch in calling convention");)
 
-    address entry = CAST_FROM_FN_PTR(address, Runtime1::arraycopy);
+    address C_entry = CAST_FROM_FN_PTR(address, Runtime1::arraycopy);
+
+    address copyfunc_addr = StubRoutines::generic_arraycopy();
 
     // pass arguments: may push as this is not a safepoint; SP must be fix at each safepoint
 #ifdef _LP64
@@ -3141,11 +3143,29 @@
     // Allocate abi space for args but be sure to keep stack aligned
     __ subptr(rsp, 6*wordSize);
     store_parameter(j_rarg4, 4);
-    __ call(RuntimeAddress(entry));
+    if (copyfunc_addr == NULL) { // Use C version if stub was not generated
+      __ call(RuntimeAddress(C_entry));
+    } else {
+#ifndef PRODUCT
+      if (PrintC1Statistics) {
+        __ incrementl(ExternalAddress((address)&Runtime1::_generic_arraycopystub_cnt));
+      }
+#endif
+      __ call(RuntimeAddress(copyfunc_addr));
+    }
     __ addptr(rsp, 6*wordSize);
 #else
     __ mov(c_rarg4, j_rarg4);
-    __ call(RuntimeAddress(entry));
+    if (copyfunc_addr == NULL) { // Use C version if stub was not generated
+      __ call(RuntimeAddress(C_entry));
+    } else {
+#ifndef PRODUCT
+      if (PrintC1Statistics) {
+        __ incrementl(ExternalAddress((address)&Runtime1::_generic_arraycopystub_cnt));
+      }
+#endif
+      __ call(RuntimeAddress(copyfunc_addr));
+    }
 #endif // _WIN64
 #else
     __ push(length);
@@ -3153,13 +3173,28 @@
     __ push(dst);
     __ push(src_pos);
     __ push(src);
-    __ call_VM_leaf(entry, 5); // removes pushed parameter from the stack
+
+    if (copyfunc_addr == NULL) { // Use C version if stub was not generated
+      __ call_VM_leaf(C_entry, 5); // removes pushed parameter from the stack
+    } else {
+#ifndef PRODUCT
+      if (PrintC1Statistics) {
+        __ incrementl(ExternalAddress((address)&Runtime1::_generic_arraycopystub_cnt));
+      }
+#endif
+      __ call_VM_leaf(copyfunc_addr, 5); // removes pushed parameter from the stack
+    }
 
 #endif // _LP64
 
     __ cmpl(rax, 0);
     __ jcc(Assembler::equal, *stub->continuation());
 
+    if (copyfunc_addr != NULL) {
+      __ mov(tmp, rax);
+      __ xorl(tmp, -1);
+    }
+
     // Reload values from the stack so they are where the stub
     // expects them.
     __ movptr   (dst,     Address(rsp, 0*BytesPerWord));
@@ -3167,6 +3202,12 @@
     __ movptr   (length,  Address(rsp, 2*BytesPerWord));
     __ movptr   (src_pos, Address(rsp, 3*BytesPerWord));
     __ movptr   (src,     Address(rsp, 4*BytesPerWord));
+
+    if (copyfunc_addr != NULL) {
+      __ subl(length, tmp);
+      __ addl(src_pos, tmp);
+      __ addl(dst_pos, tmp);
+    }
     __ jmp(*stub->entry());
 
     __ bind(*stub->continuation());
@@ -3226,10 +3267,6 @@
     __ testl(dst_pos, dst_pos);
     __ jcc(Assembler::less, *stub->entry());
   }
-  if (flags & LIR_OpArrayCopy::length_positive_check) {
-    __ testl(length, length);
-    __ jcc(Assembler::less, *stub->entry());
-  }
 
   if (flags & LIR_OpArrayCopy::src_range_check) {
     __ lea(tmp, Address(src_pos, length, Address::times_1, 0));
@@ -3242,15 +3279,190 @@
     __ jcc(Assembler::above, *stub->entry());
   }
 
+  if (flags & LIR_OpArrayCopy::length_positive_check) {
+    __ testl(length, length);
+    __ jcc(Assembler::less, *stub->entry());
+    __ jcc(Assembler::zero, *stub->continuation());
+  }
+
+#ifdef _LP64
+  __ movl2ptr(src_pos, src_pos); //higher 32bits must be null
+  __ movl2ptr(dst_pos, dst_pos); //higher 32bits must be null
+#endif
+
   if (flags & LIR_OpArrayCopy::type_check) {
-    if (UseCompressedOops) {
-      __ movl(tmp, src_klass_addr);
-      __ cmpl(tmp, dst_klass_addr);
+    // We don't know the array types are compatible
+    if (basic_type != T_OBJECT) {
+      // Simple test for basic type arrays
+      if (UseCompressedOops) {
+        __ movl(tmp, src_klass_addr);
+        __ cmpl(tmp, dst_klass_addr);
+      } else {
+        __ movptr(tmp, src_klass_addr);
+        __ cmpptr(tmp, dst_klass_addr);
+      }
+      __ jcc(Assembler::notEqual, *stub->entry());
     } else {
-      __ movptr(tmp, src_klass_addr);
-      __ cmpptr(tmp, dst_klass_addr);
+      // For object arrays, if src is a sub class of dst then we can
+      // safely do the copy.
+      Label cont, slow;
+
+      __ push(src);
+      __ push(dst);
+
+      __ load_klass(src, src);
+      __ load_klass(dst, dst);
+
+      __ check_klass_subtype_fast_path(src, dst, tmp, &cont, &slow, NULL);
+
+      __ push(src);
+      __ push(dst);
+      __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id)));
+      __ pop(dst);
+      __ pop(src);
+
+      __ cmpl(src, 0);
+      __ jcc(Assembler::notEqual, cont);
+
+      __ bind(slow);
+      __ pop(dst);
+      __ pop(src);
+
+      address copyfunc_addr = StubRoutines::checkcast_arraycopy();
+      if (copyfunc_addr != NULL) { // use stub if available
+        // src is not a sub class of dst so we have to do a
+        // per-element check.
+
+        int mask = LIR_OpArrayCopy::src_objarray|LIR_OpArrayCopy::dst_objarray;
+        if ((flags & mask) != mask) {
+          // Check that at least both of them object arrays.
+          assert(flags & mask, "one of the two should be known to be an object array");
+
+          if (!(flags & LIR_OpArrayCopy::src_objarray)) {
+            __ load_klass(tmp, src);
+          } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) {
+            __ load_klass(tmp, dst);
+          }
+          int lh_offset = klassOopDesc::header_size() * HeapWordSize +
+            Klass::layout_helper_offset_in_bytes();
+          Address klass_lh_addr(tmp, lh_offset);
+          jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
+          __ cmpl(klass_lh_addr, objArray_lh);
+          __ jcc(Assembler::notEqual, *stub->entry());
+        }
+
+#ifndef _LP64
+        // save caller save registers
+        store_parameter(rax, 2);
+        store_parameter(rcx, 1);
+        store_parameter(rdx, 0);
+
+        __ movptr(tmp, dst_klass_addr);
+        __ movptr(tmp, Address(tmp, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+        __ push(tmp);
+        __ movl(tmp, Address(tmp, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ push(tmp);
+        __ push(length);
+        __ lea(tmp, Address(dst, dst_pos, scale, arrayOopDesc::base_offset_in_bytes(basic_type)));
+        __ push(tmp);
+        __ lea(tmp, Address(src, src_pos, scale, arrayOopDesc::base_offset_in_bytes(basic_type)));
+        __ push(tmp);
+
+        __ call_VM_leaf(copyfunc_addr, 5);
+#else
+        __ movl2ptr(length, length); //higher 32bits must be null
+
+        // save caller save registers: copy them to callee save registers
+        __ mov(rbx, rdx);
+        __ mov(r13, r8);
+        __ mov(r14, r9);
+#ifndef _WIN64
+        store_parameter(rsi, 1);
+        store_parameter(rcx, 0);
+        // on WIN64 other incoming parameters are in rdi and rsi saved
+        // across the call
+#endif
+
+        __ lea(c_rarg0, Address(src, src_pos, scale, arrayOopDesc::base_offset_in_bytes(basic_type)));
+        assert_different_registers(c_rarg0, dst, dst_pos, length);
+        __ lea(c_rarg1, Address(dst, dst_pos, scale, arrayOopDesc::base_offset_in_bytes(basic_type)));
+        assert_different_registers(c_rarg1, dst, length);
+
+        __ mov(c_rarg2, length);
+        assert_different_registers(c_rarg2, dst);
+
+#ifdef _WIN64
+        // Allocate abi space for args but be sure to keep stack aligned
+        __ subptr(rsp, 6*wordSize);
+        __ load_klass(c_rarg3, dst);
+        __ movptr(c_rarg3, Address(c_rarg3, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+        store_parameter(c_rarg3, 4);
+        __ movl(c_rarg3, Address(c_rarg3, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ call(RuntimeAddress(copyfunc_addr));
+        __ addptr(rsp, 6*wordSize);
+#else
+        __ load_klass(c_rarg4, dst);
+        __ movptr(c_rarg4, Address(c_rarg4, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+        __ movl(c_rarg3, Address(c_rarg4, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ call(RuntimeAddress(copyfunc_addr));
+#endif
+
+#endif
+
+#ifndef PRODUCT
+        if (PrintC1Statistics) {
+          Label failed;
+          __ testl(rax, rax);
+          __ jcc(Assembler::notZero, failed);
+          __ incrementl(ExternalAddress((address)&Runtime1::_arraycopy_checkcast_cnt));
+          __ bind(failed);
+        }
+#endif
+
+        __ testl(rax, rax);
+        __ jcc(Assembler::zero, *stub->continuation());
+
+#ifndef PRODUCT
+        if (PrintC1Statistics) {
+          __ incrementl(ExternalAddress((address)&Runtime1::_arraycopy_checkcast_attempt_cnt));
+        }
+#endif
+
+        __ mov(tmp, rax);
+
+        __ xorl(tmp, -1);
+
+#ifndef _LP64
+        // restore caller save registers
+        assert_different_registers(tmp, rdx, rcx, rax); // result of stub will be lost
+        __ movptr(rdx, Address(rsp, 0*BytesPerWord));
+        __ movptr(rcx, Address(rsp, 1*BytesPerWord));
+        __ movptr(rax, Address(rsp, 2*BytesPerWord));
+#else
+        // restore caller save registers
+        __ mov(rdx, rbx);
+        __ mov(r8, r13);
+        __ mov(r9, r14);
+#ifndef _WIN64
+        assert_different_registers(tmp, rdx, r8, r9, rcx, rsi); // result of stub will be lost
+        __ movptr(rcx, Address(rsp, 0*BytesPerWord));
+        __ movptr(rsi, Address(rsp, 1*BytesPerWord));
+#else
+        assert_different_registers(tmp, rdx, r8, r9); // result of stub will be lost
+#endif
+#endif
+
+        __ subl(length, tmp);
+        __ addl(src_pos, tmp);
+        __ addl(dst_pos, tmp);
+      }
+
+      __ jmp(*stub->entry());
+
+      __ bind(cont);
+      __ pop(dst);
+      __ pop(src);
     }
-    __ jcc(Assembler::notEqual, *stub->entry());
   }
 
 #ifdef ASSERT
@@ -3291,16 +3503,16 @@
   }
 #endif
 
-  if (shift_amount > 0 && basic_type != T_OBJECT) {
-    __ shlptr(length, shift_amount);
+#ifndef PRODUCT
+  if (PrintC1Statistics) {
+    __ incrementl(ExternalAddress(Runtime1::arraycopy_count_address(basic_type)));
   }
+#endif
 
 #ifdef _LP64
   assert_different_registers(c_rarg0, dst, dst_pos, length);
-  __ movl2ptr(src_pos, src_pos); //higher 32bits must be null
   __ lea(c_rarg0, Address(src, src_pos, scale, arrayOopDesc::base_offset_in_bytes(basic_type)));
   assert_different_registers(c_rarg1, length);
-  __ movl2ptr(dst_pos, dst_pos); //higher 32bits must be null
   __ lea(c_rarg1, Address(dst, dst_pos, scale, arrayOopDesc::base_offset_in_bytes(basic_type)));
   __ mov(c_rarg2, length);
 
@@ -3311,11 +3523,12 @@
   store_parameter(tmp, 1);
   store_parameter(length, 2);
 #endif // _LP64
-  if (basic_type == T_OBJECT) {
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Runtime1::oop_arraycopy), 0);
-  } else {
-    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Runtime1::primitive_arraycopy), 0);
-  }
+
+  bool disjoint = (flags & LIR_OpArrayCopy::overlapping) == 0;
+  bool aligned = (flags & LIR_OpArrayCopy::unaligned) == 0;
+  const char *name;
+  address entry = StubRoutines::select_arraycopy_function(basic_type, aligned, disjoint, name, false);
+  __ call_VM_leaf(entry, 0);
 
   __ bind(*stub->continuation());
 }
--- a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp	Sun Apr 03 12:00:54 2011 +0200
@@ -2824,7 +2824,7 @@
   int idx = 0;
   if (!method()->is_static()) {
     // we should always see the receiver
-    state->store_local(idx, new Local(objectType, idx));
+    state->store_local(idx, new Local(method()->holder(), objectType, idx));
     idx = 1;
   }
 
@@ -2836,7 +2836,7 @@
     // don't allow T_ARRAY to propagate into locals types
     if (basic_type == T_ARRAY) basic_type = T_OBJECT;
     ValueType* vt = as_ValueType(basic_type);
-    state->store_local(idx, new Local(vt, idx));
+    state->store_local(idx, new Local(type, vt, idx));
     idx += type->size();
   }
 
--- a/hotspot/src/share/vm/c1/c1_Instruction.cpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/c1/c1_Instruction.cpp	Sun Apr 03 12:00:54 2011 +0200
@@ -135,6 +135,33 @@
 }
 
 
+// Returns the declared type when it is also the exact type: a primitive array,
+// a loaded final class, or an object array of either. Returns NULL otherwise.
+ciType* Local::exact_type() const {
+  ciType* declared = declared_type();
+
+  if (declared->is_type_array_klass()) {
+    return declared;  // for primitive arrays, the declared type is the exact type
+  }
+  if (declared->is_instance_klass()) {
+    ciInstanceKlass* klass = (ciInstanceKlass*)declared;
+    bool exact = klass->is_loaded() && klass->is_final() && !klass->is_interface();
+    return exact ? declared : NULL;
+  }
+  if (declared->is_obj_array_klass()) {
+    ciType* elem = ((ciObjArrayKlass*)declared)->base_element_type();
+    if (elem->is_instance_klass()) {
+      ciInstanceKlass* klass = elem->as_instance_klass();
+      return (klass->is_loaded() && klass->is_final()) ? declared : NULL;
+    }
+    if (elem->is_primitive_type()) {
+      return declared;
+    }
+  }
+  return NULL;
+}
+
+
 ciType* LoadIndexed::exact_type() const {
   ciType* array_type = array()->exact_type();
   if (array_type == NULL) {
@@ -189,16 +216,21 @@
   return ciTypeArrayKlass::make(elt_type());
 }
 
-
 ciType* NewObjectArray::exact_type() const {
   return ciObjArrayKlass::make(klass());
 }
 
+ciType* NewArray::declared_type() const {
+  return exact_type();  // the declared type is simply the exact type reported by this node
+}
 
 ciType* NewInstance::exact_type() const {
   return klass();
 }
 
+ciType* NewInstance::declared_type() const {
+  return exact_type();  // same as exact_type(), i.e. klass()
+}
 
 ciType* CheckCast::declared_type() const {
   return klass();
@@ -349,6 +381,11 @@
   if (state()        != NULL) state()->values_do(f);
 }
 
+ciType* Invoke::declared_type() const {
+  ciType* ret = _target->signature()->return_type();  // return type from the target's signature
+  assert(ret->basic_type() != T_VOID, "need return value of void method?");
+  return ret;
+}
 
 // Implementation of Contant
 intx Constant::hash() const {
--- a/hotspot/src/share/vm/c1/c1_Instruction.hpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/c1/c1_Instruction.hpp	Sun Apr 03 12:00:54 2011 +0200
@@ -621,16 +621,21 @@
 LEAF(Local, Instruction)
  private:
   int      _java_index;                          // the local index within the method to which the local belongs
+  ciType*  _declared_type;                       // declared type of the local, as passed to the constructor
  public:
   // creation
-  Local(ValueType* type, int index)
+  Local(ciType* declared, ValueType* type, int index)
     : Instruction(type)
     , _java_index(index)
+    , _declared_type(declared)
   {}
 
   // accessors
   int java_index() const                         { return _java_index; }
 
+  ciType* declared_type() const                  { return _declared_type; }
+  ciType* exact_type() const;
+
   // generic
   virtual void input_values_do(ValueVisitor* f)   { /* no values */ }
 };
@@ -1146,6 +1151,8 @@
   BasicTypeList* signature() const               { return _signature; }
   ciMethod* target() const                       { return _target; }
 
+  ciType* declared_type() const;
+
   // Returns false if target is not loaded
   bool target_is_final() const                   { return check_flag(TargetIsFinalFlag); }
   bool target_is_loaded() const                  { return check_flag(TargetIsLoadedFlag); }
@@ -1187,6 +1194,7 @@
   // generic
   virtual bool can_trap() const                  { return true; }
   ciType* exact_type() const;
+  ciType* declared_type() const;
 };
 
 
@@ -1208,6 +1216,8 @@
 
   virtual bool needs_exception_state() const     { return false; }
 
+  ciType* declared_type() const;
+
   // generic
   virtual bool can_trap() const                  { return true; }
   virtual void input_values_do(ValueVisitor* f)   { StateSplit::input_values_do(f); f->visit(&_length); }
@@ -1397,6 +1407,7 @@
   vmIntrinsics::ID _id;
   Values*          _args;
   Value            _recv;
+  int              _nonnull_state; // mask: bit i set means argument i still needs a null check
 
  public:
   // preserves_state can be set to true for Intrinsics
@@ -1417,6 +1428,7 @@
   , _id(id)
   , _args(args)
   , _recv(NULL)
+  , _nonnull_state(AllBits)
   {
     assert(args != NULL, "args must exist");
     ASSERT_VALUES
@@ -1442,6 +1454,23 @@
   Value receiver() const                         { assert(has_receiver(), "must have receiver"); return _recv; }
   bool preserves_state() const                   { return check_flag(PreservesStateFlag); }
 
+  bool arg_needs_null_check(int i) {
+    // arguments that do not fit in the mask are always reported as needing a check
+    const int mask_bits = (int)sizeof(_nonnull_state) * BitsPerByte;
+    const bool tracked = (i >= 0 && i < mask_bits);
+    return !tracked || is_set_nth_bit(_nonnull_state, i);
+  }
+
+  void set_arg_needs_null_check(int i, bool check) {
+    // indices that do not fit in the mask are silently ignored
+    if (i < 0 || i >= (int)sizeof(_nonnull_state) * BitsPerByte) {
+      return;
+    }
+
+    if (check) _nonnull_state |= nth_bit(i);
+    else       _nonnull_state &= ~(nth_bit(i));
+  }
+
   // generic
   virtual bool can_trap() const                  { return check_flag(CanTrapFlag); }
   virtual void input_values_do(ValueVisitor* f) {
--- a/hotspot/src/share/vm/c1/c1_LIR.hpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/c1/c1_LIR.hpp	Sun Apr 03 12:00:54 2011 +0200
@@ -1215,7 +1215,11 @@
     src_range_check        = 1 << 5,
     dst_range_check        = 1 << 6,
     type_check             = 1 << 7,
-    all_flags              = (1 << 8) - 1
+    overlapping            = 1 << 8,
+    unaligned              = 1 << 9,
+    src_objarray           = 1 << 10,
+    dst_objarray           = 1 << 11,
+    all_flags              = (1 << 12) - 1
   };
 
   LIR_OpArrayCopy(LIR_Opr src, LIR_Opr src_pos, LIR_Opr dst, LIR_Opr dst_pos, LIR_Opr length, LIR_Opr tmp,
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp	Sun Apr 03 12:00:54 2011 +0200
@@ -706,6 +706,38 @@
   }
 }
 
+static Value maxvalue(IfOp* ifop) {
+  // Returns the upper bound y of a min(x, y) idiom, or NULL if ifop is not one.
+  switch (ifop->cond()) {
+    case If::lss: // x <  y ? x : y
+    case If::leq: // x <= y ? x : y
+      if (ifop->x() == ifop->tval() &&
+          ifop->y() == ifop->fval()) return ifop->y();
+      return NULL;
+
+    case If::gtr: // x >  y ? y : x
+    case If::geq: // x >= y ? y : x
+      if (ifop->x() == ifop->fval() &&
+          ifop->y() == ifop->tval()) return ifop->y();
+      return NULL;
+    default:      // eql, neq, ...: not a min() idiom; also keeps every path returning
+      return NULL;
+  }
+}
+
+// Declared type shared by every operand of phi; NULL if one is unknown or they differ.
+static ciType* phi_declared_type(Phi* phi) {
+  ciType* common = phi->operand_at(0)->declared_type();
+
+  // a NULL result (unknown or mismatching operand) ends the scan early
+  for (int i = 1; common != NULL && i < phi->operand_count(); i++) {
+    if (phi->operand_at(i)->declared_type() != common) {
+      common = NULL;
+    }
+  }
+  return common;
+}
+
 void LIRGenerator::arraycopy_helper(Intrinsic* x, int* flagsp, ciArrayKlass** expected_typep) {
   Instruction* src     = x->argument_at(0);
   Instruction* src_pos = x->argument_at(1);
@@ -715,12 +747,20 @@
 
   // first try to identify the likely type of the arrays involved
   ciArrayKlass* expected_type = NULL;
-  bool is_exact = false;
+  bool is_exact = false, src_objarray = false, dst_objarray = false;
   {
     ciArrayKlass* src_exact_type    = as_array_klass(src->exact_type());
     ciArrayKlass* src_declared_type = as_array_klass(src->declared_type());
+    Phi* phi;
+    if (src_declared_type == NULL && (phi = src->as_Phi()) != NULL) {
+      src_declared_type = as_array_klass(phi_declared_type(phi));
+    }
     ciArrayKlass* dst_exact_type    = as_array_klass(dst->exact_type());
     ciArrayKlass* dst_declared_type = as_array_klass(dst->declared_type());
+    if (dst_declared_type == NULL && (phi = dst->as_Phi()) != NULL) {
+      dst_declared_type = as_array_klass(phi_declared_type(phi));
+    }
+
     if (src_exact_type != NULL && src_exact_type == dst_exact_type) {
       // the types exactly match so the type is fully known
       is_exact = true;
@@ -744,17 +784,60 @@
     if (expected_type == NULL) expected_type = dst_exact_type;
     if (expected_type == NULL) expected_type = src_declared_type;
     if (expected_type == NULL) expected_type = dst_declared_type;
+
+    src_objarray = (src_exact_type && src_exact_type->is_obj_array_klass()) || (src_declared_type && src_declared_type->is_obj_array_klass());
+    dst_objarray = (dst_exact_type && dst_exact_type->is_obj_array_klass()) || (dst_declared_type && dst_declared_type->is_obj_array_klass());
   }
 
   // if a probable array type has been identified, figure out if any
   // of the required checks for a fast case can be elided.
   int flags = LIR_OpArrayCopy::all_flags;
+
+  if (!src_objarray)
+    flags &= ~LIR_OpArrayCopy::src_objarray;
+  if (!dst_objarray)
+    flags &= ~LIR_OpArrayCopy::dst_objarray;
+
+  if (!x->arg_needs_null_check(0))
+    flags &= ~LIR_OpArrayCopy::src_null_check;
+  if (!x->arg_needs_null_check(2))
+    flags &= ~LIR_OpArrayCopy::dst_null_check;
+
+
   if (expected_type != NULL) {
-    // try to skip null checks
-    if (src->as_NewArray() != NULL)
+    Value length_limit = NULL;
+
+    IfOp* ifop = length->as_IfOp();
+    if (ifop != NULL) {
+      // look for expressions like min(v, a.length) which ends up as
+      //   x > y ? y : x  or  x >= y ? y : x
+      if ((ifop->cond() == If::gtr || ifop->cond() == If::geq) &&
+          ifop->x() == ifop->fval() &&
+          ifop->y() == ifop->tval()) {
+        length_limit = ifop->y();
+      }
+    }
+
+    // try to skip null checks and range checks
+    NewArray* src_array = src->as_NewArray();
+    if (src_array != NULL) {
       flags &= ~LIR_OpArrayCopy::src_null_check;
-    if (dst->as_NewArray() != NULL)
+      if (length_limit != NULL &&
+          src_array->length() == length_limit &&
+          is_constant_zero(src_pos)) {
+        flags &= ~LIR_OpArrayCopy::src_range_check;
+      }
+    }
+
+    NewArray* dst_array = dst->as_NewArray();
+    if (dst_array != NULL) {
       flags &= ~LIR_OpArrayCopy::dst_null_check;
+      if (length_limit != NULL &&
+          dst_array->length() == length_limit &&
+          is_constant_zero(dst_pos)) {
+        flags &= ~LIR_OpArrayCopy::dst_range_check;
+      }
+    }
 
     // check from incoming constant values
     if (positive_constant(src_pos))
@@ -788,6 +871,28 @@
     }
   }
 
+  IntConstant* src_int = src_pos->type()->as_IntConstant();
+  IntConstant* dst_int = dst_pos->type()->as_IntConstant();
+  if (src_int && dst_int) {
+    int s_offs = src_int->value();
+    int d_offs = dst_int->value();
+    if (src_int->value() >= dst_int->value()) {
+      flags &= ~LIR_OpArrayCopy::overlapping;
+    }
+    if (expected_type != NULL) {
+      BasicType t = expected_type->element_type()->basic_type();
+      int element_size = type2aelembytes(t);
+      if (((arrayOopDesc::base_offset_in_bytes(t) + s_offs * element_size) % HeapWordSize == 0) &&
+          ((arrayOopDesc::base_offset_in_bytes(t) + d_offs * element_size) % HeapWordSize == 0)) {
+        flags &= ~LIR_OpArrayCopy::unaligned;
+      }
+    }
+  } else if (src_pos == dst_pos || is_constant_zero(dst_pos)) {
+    // src and dest positions are the same, or dst is zero so assume
+    // nonoverlapping copy.
+    flags &= ~LIR_OpArrayCopy::overlapping;
+  }
+
   if (src == dst) {
     // moving within a single array so no type checks are needed
     if (flags & LIR_OpArrayCopy::type_check) {
--- a/hotspot/src/share/vm/c1/c1_Optimizer.cpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/c1/c1_Optimizer.cpp	Sun Apr 03 12:00:54 2011 +0200
@@ -644,7 +644,7 @@
 void NullCheckVisitor::do_InstanceOf     (InstanceOf*      x) {}
 void NullCheckVisitor::do_MonitorEnter   (MonitorEnter*    x) { nce()->handle_AccessMonitor(x); }
 void NullCheckVisitor::do_MonitorExit    (MonitorExit*     x) { nce()->handle_AccessMonitor(x); }
-void NullCheckVisitor::do_Intrinsic      (Intrinsic*       x) { nce()->clear_last_explicit_null_check(); }
+void NullCheckVisitor::do_Intrinsic      (Intrinsic*       x) { nce()->handle_Intrinsic(x);     }
 void NullCheckVisitor::do_BlockBegin     (BlockBegin*      x) {}
 void NullCheckVisitor::do_Goto           (Goto*            x) {}
 void NullCheckVisitor::do_If             (If*              x) {}
@@ -1023,6 +1023,12 @@
 
 void NullCheckEliminator::handle_Intrinsic(Intrinsic* x) {
   if (!x->has_receiver()) {
+    if (x->id() == vmIntrinsics::_arraycopy) {
+      for (int i = 0; i < x->number_of_arguments(); i++) {
+        x->set_arg_needs_null_check(i, !set_contains(x->argument_at(i)));
+      }
+    }
+
     // Be conservative
     clear_last_explicit_null_check();
     return;
--- a/hotspot/src/share/vm/c1/c1_Runtime1.cpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/c1/c1_Runtime1.cpp	Sun Apr 03 12:00:54 2011 +0200
@@ -103,7 +103,10 @@
 int Runtime1::_generic_arraycopy_cnt = 0;
 int Runtime1::_primitive_arraycopy_cnt = 0;
 int Runtime1::_oop_arraycopy_cnt = 0;
+int Runtime1::_generic_arraycopystub_cnt = 0;
 int Runtime1::_arraycopy_slowcase_cnt = 0;
+int Runtime1::_arraycopy_checkcast_cnt = 0;
+int Runtime1::_arraycopy_checkcast_attempt_cnt = 0;
 int Runtime1::_new_type_array_slowcase_cnt = 0;
 int Runtime1::_new_object_array_slowcase_cnt = 0;
 int Runtime1::_new_instance_slowcase_cnt = 0;
@@ -119,6 +122,32 @@
 int Runtime1::_throw_incompatible_class_change_error_count = 0;
 int Runtime1::_throw_array_store_exception_count = 0;
 int Runtime1::_throw_count = 0;
+
+static int _byte_arraycopy_cnt = 0;   // file-local counters whose addresses arraycopy_count_address() returns
+static int _short_arraycopy_cnt = 0;
+static int _int_arraycopy_cnt = 0;
+static int _long_arraycopy_cnt = 0;
+static int _oop_arraycopy_cnt = 0;    // hidden by Runtime1::_oop_arraycopy_cnt inside Runtime1 members; refer to it as ::_oop_arraycopy_cnt
+
+address Runtime1::arraycopy_count_address(BasicType type) {  // maps an element type to the address of its file-local counter
+  switch (type) {
+  case T_BOOLEAN:
+  case T_BYTE:   return (address)&_byte_arraycopy_cnt;
+  case T_CHAR:
+  case T_SHORT:  return (address)&_short_arraycopy_cnt;
+  case T_FLOAT:
+  case T_INT:    return (address)&_int_arraycopy_cnt;
+  case T_DOUBLE:
+  case T_LONG:   return (address)&_long_arraycopy_cnt;
+  case T_ARRAY:
+  case T_OBJECT: return (address)&::_oop_arraycopy_cnt;  // '::' selects the file-local counter, not the class member of the same name
+  default:
+    ShouldNotReachHere();
+    return NULL;
+  }
+}
+
+
 #endif
 
 // Simple helper to see if the caller of a runtime stub which
@@ -1229,9 +1258,17 @@
   tty->print_cr(" _handle_wrong_method_cnt:        %d", SharedRuntime::_wrong_method_ctr);
   tty->print_cr(" _ic_miss_cnt:                    %d", SharedRuntime::_ic_miss_ctr);
   tty->print_cr(" _generic_arraycopy_cnt:          %d", _generic_arraycopy_cnt);
+  tty->print_cr(" _generic_arraycopystub_cnt:      %d", _generic_arraycopystub_cnt);
+  tty->print_cr(" _byte_arraycopy_cnt:             %d", _byte_arraycopy_cnt);
+  tty->print_cr(" _short_arraycopy_cnt:            %d", _short_arraycopy_cnt);
+  tty->print_cr(" _int_arraycopy_cnt:              %d", _int_arraycopy_cnt);
+  tty->print_cr(" _long_arraycopy_cnt:             %d", _long_arraycopy_cnt);
   tty->print_cr(" _primitive_arraycopy_cnt:        %d", _primitive_arraycopy_cnt);
-  tty->print_cr(" _oop_arraycopy_cnt:              %d", _oop_arraycopy_cnt);
+  tty->print_cr(" _oop_arraycopy_cnt (C):          %d", Runtime1::_oop_arraycopy_cnt);
+  tty->print_cr(" _oop_arraycopy_cnt (stub):       %d", ::_oop_arraycopy_cnt);
   tty->print_cr(" _arraycopy_slowcase_cnt:         %d", _arraycopy_slowcase_cnt);
+  tty->print_cr(" _arraycopy_checkcast_cnt:        %d", _arraycopy_checkcast_cnt);
+  tty->print_cr(" _arraycopy_checkcast_attempt_cnt:%d", _arraycopy_checkcast_attempt_cnt);
 
   tty->print_cr(" _new_type_array_slowcase_cnt:    %d", _new_type_array_slowcase_cnt);
   tty->print_cr(" _new_object_array_slowcase_cnt:  %d", _new_object_array_slowcase_cnt);
--- a/hotspot/src/share/vm/c1/c1_Runtime1.hpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/c1/c1_Runtime1.hpp	Sun Apr 03 12:00:54 2011 +0200
@@ -94,7 +94,10 @@
   static int _generic_arraycopy_cnt;
   static int _primitive_arraycopy_cnt;
   static int _oop_arraycopy_cnt;
+  static int _generic_arraycopystub_cnt;
   static int _arraycopy_slowcase_cnt;
+  static int _arraycopy_checkcast_cnt;
+  static int _arraycopy_checkcast_attempt_cnt;
   static int _new_type_array_slowcase_cnt;
   static int _new_object_array_slowcase_cnt;
   static int _new_instance_slowcase_cnt;
@@ -174,7 +177,8 @@
   static void trace_block_entry(jint block_id);
 
 #ifndef PRODUCT
-  static address throw_count_address()       { return (address)&_throw_count;       }
+  static address throw_count_address()               { return (address)&_throw_count;             }
+  static address arraycopy_count_address(BasicType type);
 #endif
 
   // directly accessible leaf routine
--- a/hotspot/src/share/vm/opto/library_call.cpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/opto/library_call.cpp	Sun Apr 03 12:00:54 2011 +0200
@@ -4292,81 +4292,6 @@
   return true;
 }
 
-
-// constants for computing the copy function
-enum {
-  COPYFUNC_UNALIGNED = 0,
-  COPYFUNC_ALIGNED = 1,                 // src, dest aligned to HeapWordSize
-  COPYFUNC_CONJOINT = 0,
-  COPYFUNC_DISJOINT = 2                 // src != dest, or transfer can descend
-};
-
-// Note:  The condition "disjoint" applies also for overlapping copies
-// where an descending copy is permitted (i.e., dest_offset <= src_offset).
-static address
-select_arraycopy_function(BasicType t, bool aligned, bool disjoint, const char* &name, bool dest_uninitialized) {
-  int selector =
-    (aligned  ? COPYFUNC_ALIGNED  : COPYFUNC_UNALIGNED) +
-    (disjoint ? COPYFUNC_DISJOINT : COPYFUNC_CONJOINT);
-
-#define RETURN_STUB(xxx_arraycopy) { \
-  name = #xxx_arraycopy; \
-  return StubRoutines::xxx_arraycopy(); }
-
-#define RETURN_STUB_PARM(xxx_arraycopy, parm) {           \
-  name = #xxx_arraycopy; \
-  return StubRoutines::xxx_arraycopy(parm); }
-
-  switch (t) {
-  case T_BYTE:
-  case T_BOOLEAN:
-    switch (selector) {
-    case COPYFUNC_CONJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jbyte_arraycopy);
-    case COPYFUNC_CONJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jbyte_arraycopy);
-    case COPYFUNC_DISJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jbyte_disjoint_arraycopy);
-    case COPYFUNC_DISJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jbyte_disjoint_arraycopy);
-    }
-  case T_CHAR:
-  case T_SHORT:
-    switch (selector) {
-    case COPYFUNC_CONJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jshort_arraycopy);
-    case COPYFUNC_CONJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jshort_arraycopy);
-    case COPYFUNC_DISJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jshort_disjoint_arraycopy);
-    case COPYFUNC_DISJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jshort_disjoint_arraycopy);
-    }
-  case T_INT:
-  case T_FLOAT:
-    switch (selector) {
-    case COPYFUNC_CONJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jint_arraycopy);
-    case COPYFUNC_CONJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jint_arraycopy);
-    case COPYFUNC_DISJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jint_disjoint_arraycopy);
-    case COPYFUNC_DISJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jint_disjoint_arraycopy);
-    }
-  case T_DOUBLE:
-  case T_LONG:
-    switch (selector) {
-    case COPYFUNC_CONJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jlong_arraycopy);
-    case COPYFUNC_CONJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jlong_arraycopy);
-    case COPYFUNC_DISJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jlong_disjoint_arraycopy);
-    case COPYFUNC_DISJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jlong_disjoint_arraycopy);
-    }
-  case T_ARRAY:
-  case T_OBJECT:
-    switch (selector) {
-    case COPYFUNC_CONJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB_PARM(oop_arraycopy, dest_uninitialized);
-    case COPYFUNC_CONJOINT | COPYFUNC_ALIGNED:    RETURN_STUB_PARM(arrayof_oop_arraycopy, dest_uninitialized);
-    case COPYFUNC_DISJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB_PARM(oop_disjoint_arraycopy, dest_uninitialized);
-    case COPYFUNC_DISJOINT | COPYFUNC_ALIGNED:    RETURN_STUB_PARM(arrayof_oop_disjoint_arraycopy, dest_uninitialized);
-    }
-  default:
-    ShouldNotReachHere();
-    return NULL;
-  }
-
-#undef RETURN_STUB
-#undef RETURN_STUB_PARM
-}
-
 //------------------------------basictype2arraycopy----------------------------
 address LibraryCallKit::basictype2arraycopy(BasicType t,
                                             Node* src_offset,
@@ -4399,7 +4324,7 @@
     disjoint = true;
   }
 
-  return select_arraycopy_function(t, aligned, disjoint, name, dest_uninitialized);
+  return StubRoutines::select_arraycopy_function(t, aligned, disjoint, name, dest_uninitialized);
 }
 
 
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp	Sun Apr 03 12:00:54 2011 +0200
@@ -433,3 +433,77 @@
 
 #undef RETURN_STUB
 }
+
+// constants for computing the copy function
+enum {
+  COPYFUNC_UNALIGNED = 0,
+  COPYFUNC_ALIGNED = 1,                 // src, dest aligned to HeapWordSize
+  COPYFUNC_CONJOINT = 0,
+  COPYFUNC_DISJOINT = 2                 // src != dest, or transfer can descend
+};
+
+// Note:  The condition "disjoint" applies also for overlapping copies
+// where a descending copy is permitted (i.e., dest_offset <= src_offset).
+address
+StubRoutines::select_arraycopy_function(BasicType t, bool aligned, bool disjoint, const char* &name, bool dest_uninitialized) {  // returns the stub entry for (t, aligned, disjoint) and stores the stub's identifier in 'name'
+  int selector =  // value in 0..3: COPYFUNC_ALIGNED contributes 1, COPYFUNC_DISJOINT contributes 2
+    (aligned  ? COPYFUNC_ALIGNED  : COPYFUNC_UNALIGNED) +
+    (disjoint ? COPYFUNC_DISJOINT : COPYFUNC_CONJOINT);
+
+#define RETURN_STUB(xxx_arraycopy) { \
+  name = #xxx_arraycopy; \
+  return StubRoutines::xxx_arraycopy(); }
+
+#define RETURN_STUB_PARM(xxx_arraycopy, parm) {           \
+  name = #xxx_arraycopy; \
+  return StubRoutines::xxx_arraycopy(parm); }
+
+  switch (t) {  // each pair of case labels below shares one family of stubs
+  case T_BYTE:
+  case T_BOOLEAN:
+    switch (selector) {  // all four selector values return, so control never falls into the next outer case
+    case COPYFUNC_CONJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jbyte_arraycopy);
+    case COPYFUNC_CONJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jbyte_arraycopy);
+    case COPYFUNC_DISJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jbyte_disjoint_arraycopy);
+    case COPYFUNC_DISJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jbyte_disjoint_arraycopy);
+    }
+  case T_CHAR:
+  case T_SHORT:
+    switch (selector) {
+    case COPYFUNC_CONJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jshort_arraycopy);
+    case COPYFUNC_CONJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jshort_arraycopy);
+    case COPYFUNC_DISJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jshort_disjoint_arraycopy);
+    case COPYFUNC_DISJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jshort_disjoint_arraycopy);
+    }
+  case T_INT:
+  case T_FLOAT:
+    switch (selector) {
+    case COPYFUNC_CONJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jint_arraycopy);
+    case COPYFUNC_CONJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jint_arraycopy);
+    case COPYFUNC_DISJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jint_disjoint_arraycopy);
+    case COPYFUNC_DISJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jint_disjoint_arraycopy);
+    }
+  case T_DOUBLE:
+  case T_LONG:
+    switch (selector) {
+    case COPYFUNC_CONJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jlong_arraycopy);
+    case COPYFUNC_CONJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jlong_arraycopy);
+    case COPYFUNC_DISJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB(jlong_disjoint_arraycopy);
+    case COPYFUNC_DISJOINT | COPYFUNC_ALIGNED:    RETURN_STUB(arrayof_jlong_disjoint_arraycopy);
+    }
+  case T_ARRAY:
+  case T_OBJECT:
+    switch (selector) {  // only the oop stubs receive dest_uninitialized
+    case COPYFUNC_CONJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB_PARM(oop_arraycopy, dest_uninitialized);
+    case COPYFUNC_CONJOINT | COPYFUNC_ALIGNED:    RETURN_STUB_PARM(arrayof_oop_arraycopy, dest_uninitialized);
+    case COPYFUNC_DISJOINT | COPYFUNC_UNALIGNED:  RETURN_STUB_PARM(oop_disjoint_arraycopy, dest_uninitialized);
+    case COPYFUNC_DISJOINT | COPYFUNC_ALIGNED:    RETURN_STUB_PARM(arrayof_oop_disjoint_arraycopy, dest_uninitialized);
+    }
+  default:  // any BasicType not listed above
+    ShouldNotReachHere();
+    return NULL;
+  }
+
+#undef RETURN_STUB
+#undef RETURN_STUB_PARM
+}
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp	Sat Apr 02 10:54:15 2011 -0700
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp	Sun Apr 03 12:00:54 2011 +0200
@@ -282,6 +282,8 @@
   static address addr_fpu_subnormal_bias2()                { return (address)&_fpu_subnormal_bias2; }
 
 
+  static address select_arraycopy_function(BasicType t, bool aligned, bool disjoint, const char* &name, bool dest_uninitialized);
+
   static address jbyte_arraycopy()  { return _jbyte_arraycopy; }
   static address jshort_arraycopy() { return _jshort_arraycopy; }
   static address jint_arraycopy()   { return _jint_arraycopy; }