hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp
changeset 35085 839c8ba29724
parent 34148 6efbc7ffd767
child 35156 a06b3d7455d6
--- a/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Fri Dec 04 14:06:38 2015 +0100
+++ b/hotspot/src/cpu/ppc/vm/stubGenerator_ppc.cpp	Fri Dec 04 16:38:04 2015 +0100
@@ -48,6 +48,12 @@
 #define BLOCK_COMMENT(str) __ block_comment(str)
 #endif
 
+#if defined(ABI_ELFv2)
+#define STUB_ENTRY(name) StubRoutines::name()
+#else
+#define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name())->entry()
+#endif
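+
+// A minimal sketch of the ELFv1 descriptor this distinction is about
+// (field names simplified, not the exact VM declaration): on ELFv1, the
+// address stored in StubRoutines points at a function descriptor, not at code:
+//
+//   struct FunctionDescriptor {
+//     address _entry;   // address of the stub's first instruction
+//     address _toc;     // TOC base the callee expects
+//     address _env;     // environment pointer (unused here)
+//   };
+//
+// STUB_ENTRY therefore loads entry() to obtain a branchable address, while
+// ELFv2 entry points can be branched to directly.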
+
 class StubGenerator: public StubCodeGenerator {
  private:
 
@@ -259,8 +265,7 @@
       //
 
       // global toc register
-      __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
-
+      __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R11_scratch1);
      // Remember the senderSP so the interpreter can pop c2i arguments off the stack
       // when called via a c2i.
 
@@ -619,14 +624,17 @@
   //  Kills:
   //     nothing
   //
-  void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
+  void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1,
+                                       Register preserve1 = noreg, Register preserve2 = noreg) {
     BarrierSet* const bs = Universe::heap()->barrier_set();
     switch (bs->kind()) {
       case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
         if (!dest_uninitialized) {
-          const int spill_slots = 4 * wordSize;
-          const int frame_size  = frame::abi_reg_args_size + spill_slots;
+          int spill_slots = 3;
+          if (preserve1 != noreg) { spill_slots++; }
+          if (preserve2 != noreg) { spill_slots++; }
+          const int frame_size = align_size_up(frame::abi_reg_args_size + spill_slots * BytesPerWord, frame::alignment_in_bytes);
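+          // Worked example (the save-area size is an illustrative assumption,
+          // not a constant from this file): with both preserve registers in
+          // use there are 5 spill slots; assuming an ABI save area of 112
+          // bytes, the raw size is 112 + 5 * 8 = 152 bytes, which
+          // align_size_up rounds to 160 so R1_SP keeps the 16-byte alignment
+          // the ABI requires.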
           Label filtered;
 
           // Is marking active?
@@ -640,17 +648,23 @@
           __ beq(CCR0, filtered);
 
           __ save_LR_CR(R0);
-          __ push_frame_reg_args(spill_slots, R0);
-          __ std(from,  frame_size - 1 * wordSize, R1_SP);
-          __ std(to,    frame_size - 2 * wordSize, R1_SP);
-          __ std(count, frame_size - 3 * wordSize, R1_SP);
+          __ push_frame(frame_size, R0);
+          int slot_nr = 0;
+          __ std(from,  frame_size - (++slot_nr) * wordSize, R1_SP);
+          __ std(to,    frame_size - (++slot_nr) * wordSize, R1_SP);
+          __ std(count, frame_size - (++slot_nr) * wordSize, R1_SP);
+          if (preserve1 != noreg) { __ std(preserve1, frame_size - (++slot_nr) * wordSize, R1_SP); }
+          if (preserve2 != noreg) { __ std(preserve2, frame_size - (++slot_nr) * wordSize, R1_SP); }
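+          // Spill-slot layout sketch while the leaf call is active (slot 1
+          // sits just below the caller's SP):
+          //   [R1_SP + frame_size - 1*wordSize]  from
+          //   [R1_SP + frame_size - 2*wordSize]  to
+          //   [R1_SP + frame_size - 3*wordSize]  count
+          //   [R1_SP + frame_size - 4*wordSize]  preserve1 (if given)
+          //   [R1_SP + frame_size - 5*wordSize]  preserve2 (if given)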
 
           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
 
-          __ ld(from,  frame_size - 1 * wordSize, R1_SP);
-          __ ld(to,    frame_size - 2 * wordSize, R1_SP);
-          __ ld(count, frame_size - 3 * wordSize, R1_SP);
-          __ pop_frame();
+          slot_nr = 0;
+          __ ld(from,  frame_size - (++slot_nr) * wordSize, R1_SP);
+          __ ld(to,    frame_size - (++slot_nr) * wordSize, R1_SP);
+          __ ld(count, frame_size - (++slot_nr) * wordSize, R1_SP);
+          if (preserve1 != noreg) { __ ld(preserve1, frame_size - (++slot_nr) * wordSize, R1_SP); }
+          if (preserve2 != noreg) { __ ld(preserve2, frame_size - (++slot_nr) * wordSize, R1_SP); }
+          __ addi(R1_SP, R1_SP, frame_size); // pop_frame()
           __ restore_LR_CR(R0);
 
           __ bind(filtered);
@@ -674,27 +688,22 @@
   //
   //  The input registers and R0 are overwritten.
   //
-  void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) {
+  void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, Register preserve = noreg) {
     BarrierSet* const bs = Universe::heap()->barrier_set();
 
     switch (bs->kind()) {
       case BarrierSet::G1SATBCTLogging:
         {
-          if (branchToEnd) {
-            __ save_LR_CR(R0);
-            // We need this frame only to spill LR.
-            __ push_frame_reg_args(0, R0);
-            __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
-            __ pop_frame();
-            __ restore_LR_CR(R0);
-          } else {
-            // Tail call: fake call from stub caller by branching without linking.
-            address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
-            __ mr_if_needed(R3_ARG1, addr);
-            __ mr_if_needed(R4_ARG2, count);
-            __ load_const(R11, entry_point, R0);
-            __ call_c_and_return_to_caller(R11);
-          }
+          int spill_slots = (preserve != noreg) ? 1 : 0;
+          const int frame_size = align_size_up(frame::abi_reg_args_size + spill_slots * BytesPerWord, frame::alignment_in_bytes);
+
+          __ save_LR_CR(R0);
+          __ push_frame(frame_size, R0);
+          if (preserve != noreg) { __ std(preserve, frame_size - 1 * wordSize, R1_SP); }
+          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
+          if (preserve != noreg) { __ ld(preserve, frame_size - 1 * wordSize, R1_SP); }
+          __ addi(R1_SP, R1_SP, frame_size); // pop_frame();
+          __ restore_LR_CR(R0);
         }
         break;
       case BarrierSet::CardTableForRS:
@@ -729,12 +738,9 @@
           __ addi(addr, addr, 1);
           __ bdnz(Lstore_loop);
           __ bind(Lskip_loop);
-
-          if (!branchToEnd) __ blr();
         }
       break;
       case BarrierSet::ModRef:
-        if (!branchToEnd) __ blr();
         break;
       default:
         ShouldNotReachHere();
@@ -763,8 +769,10 @@
 
     // Procedure for large arrays (uses data cache block zero instruction).
     Label dwloop, fast, fastloop, restloop, lastdword, done;
-    int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
-    int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
+    int cl_size = VM_Version::L1_data_cache_line_size();
+    int cl_dwords = cl_size >> 3;
+    int cl_dwordaddr_bits = exact_log2(cl_dwords);
+    int min_dcbz = 2; // Needs to be positive. Apply dcbz only when clearing at least min_dcbz cache lines.
 
    // Clear up to 128-byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
     __ dcbtst(base_ptr_reg);                    // Indicate write access to first cache line ...
@@ -1081,7 +1089,6 @@
     Register tmp1 = R6_ARG4;
     Register tmp2 = R7_ARG5;
 
-    Label l_overlap;
 #ifdef ASSERT
     __ srdi_(tmp2, R5_ARG3, 31);
     __ asm_assert_eq("missing zero extend", 0xAFFE);
@@ -1091,19 +1098,11 @@
     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
     __ cmpld(CCR1, tmp1, tmp2);
-    __ crand(CCR0, Assembler::less, CCR1, Assembler::less);
-    __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
-
-    // need to copy forwards
-    if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
-      __ b(no_overlap_target);
-    } else {
-      __ load_const(tmp1, no_overlap_target, tmp2);
-      __ mtctr(tmp1);
-      __ bctr();
-    }
-
-    __ bind(l_overlap);
+    __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
+    // Overlap occurs if src is before dst and the distance is smaller than the size.
+    // Otherwise branch to the forward copy routine (which must be within the 32 kB conditional-branch range).
+    __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target);
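+    // C-level view of the crnand/bc pair above (a sketch, registers named
+    // for readability):
+    //   bool overlap = (src < dst) && ((size_t)(dst - src) < size_in_bytes);
+    //   if (!overlap) goto no_overlap_target;  // forward copy is safe
+    //   // fall through: regions overlap, copy backwards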
+
     // need to copy backwards
   }
 
@@ -1248,6 +1247,7 @@
     }
 
     __ bind(l_4);
+    __ li(R3_RET, 0); // return 0
     __ blr();
 
     return start;
@@ -1269,15 +1269,9 @@
     Register tmp2 = R7_ARG5;
     Register tmp3 = R8_ARG6;
 
-#if defined(ABI_ELFv2)
     address nooverlap_target = aligned ?
-      StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
-      StubRoutines::jbyte_disjoint_arraycopy();
-#else
-    address nooverlap_target = aligned ?
-      ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
-      ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
-#endif
+      STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy) :
+      STUB_ENTRY(jbyte_disjoint_arraycopy);
 
     array_overlap_test(nooverlap_target, 0);
     // Do reverse copy. We assume the case of actual overlap is rare enough
@@ -1292,6 +1286,7 @@
     __ lbzx(tmp1, R3_ARG1, R5_ARG3);
     __ bge(CCR0, l_1);
 
+    __ li(R3_RET, 0); // return 0
     __ blr();
 
     return start;
@@ -1474,6 +1469,7 @@
       __ bdnz(l_5);
     }
     __ bind(l_4);
+    __ li(R3_RET, 0); // return 0
     __ blr();
 
     return start;
@@ -1495,15 +1491,9 @@
     Register tmp2 = R7_ARG5;
     Register tmp3 = R8_ARG6;
 
-#if defined(ABI_ELFv2)
     address nooverlap_target = aligned ?
-        StubRoutines::arrayof_jshort_disjoint_arraycopy() :
-        StubRoutines::jshort_disjoint_arraycopy();
-#else
-    address nooverlap_target = aligned ?
-        ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
-        ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
-#endif
+      STUB_ENTRY(arrayof_jshort_disjoint_arraycopy) :
+      STUB_ENTRY(jshort_disjoint_arraycopy);
 
     array_overlap_test(nooverlap_target, 1);
 
@@ -1517,6 +1507,7 @@
     __ lhzx(tmp2, R3_ARG1, tmp1);
     __ bge(CCR0, l_1);
 
+    __ li(R3_RET, 0); // return 0
     __ blr();
 
     return start;
@@ -1620,6 +1611,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ function_entry();
     generate_disjoint_int_copy_core(aligned);
+    __ li(R3_RET, 0); // return 0
     __ blr();
     return start;
   }
@@ -1704,20 +1696,15 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ function_entry();
 
-#if defined(ABI_ELFv2)
     address nooverlap_target = aligned ?
-      StubRoutines::arrayof_jint_disjoint_arraycopy() :
-      StubRoutines::jint_disjoint_arraycopy();
-#else
-    address nooverlap_target = aligned ?
-      ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
-      ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
-#endif
+      STUB_ENTRY(arrayof_jint_disjoint_arraycopy) :
+      STUB_ENTRY(jint_disjoint_arraycopy);
 
     array_overlap_test(nooverlap_target, 2);
 
     generate_conjoint_int_copy_core(aligned);
 
+    __ li(R3_RET, 0); // return 0
     __ blr();
 
     return start;
@@ -1796,6 +1783,7 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ function_entry();
     generate_disjoint_long_copy_core(aligned);
+    __ li(R3_RET, 0); // return 0
     __ blr();
 
     return start;
@@ -1878,19 +1866,14 @@
     StubCodeMark mark(this, "StubRoutines", name);
     address start = __ function_entry();
 
-#if defined(ABI_ELFv2)
     address nooverlap_target = aligned ?
-      StubRoutines::arrayof_jlong_disjoint_arraycopy() :
-      StubRoutines::jlong_disjoint_arraycopy();
-#else
-    address nooverlap_target = aligned ?
-      ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
-      ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
-#endif
+      STUB_ENTRY(arrayof_jlong_disjoint_arraycopy) :
+      STUB_ENTRY(jlong_disjoint_arraycopy);
 
     array_overlap_test(nooverlap_target, 3);
     generate_conjoint_long_copy_core(aligned);
 
+    __ li(R3_RET, 0); // return 0
     __ blr();
 
     return start;
@@ -1910,15 +1893,9 @@
 
     address start = __ function_entry();
 
-#if defined(ABI_ELFv2)
     address nooverlap_target = aligned ?
-      StubRoutines::arrayof_oop_disjoint_arraycopy() :
-      StubRoutines::oop_disjoint_arraycopy();
-#else
-    address nooverlap_target = aligned ?
-      ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
-      ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
-#endif
+      STUB_ENTRY(arrayof_oop_disjoint_arraycopy) :
+      STUB_ENTRY(oop_disjoint_arraycopy);
 
     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
 
@@ -1934,7 +1911,9 @@
       generate_conjoint_long_copy_core(aligned);
     }
 
-    gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
+    gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
+    __ li(R3_RET, 0); // return 0
+    __ blr();
     return start;
   }
 
@@ -1964,11 +1943,460 @@
       generate_disjoint_long_copy_core(aligned);
     }
 
-    gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
+    gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
+    __ li(R3_RET, 0); // return 0
+    __ blr();
+
+    return start;
+  }
+
+
+  // Helper for generating a dynamic type check.
+  // Smashes only the given temp registers.
+  void generate_type_check(Register sub_klass,
+                           Register super_check_offset,
+                           Register super_klass,
+                           Register temp,
+                           Label& L_success) {
+    assert_different_registers(sub_klass, super_check_offset, super_klass);
+
+    BLOCK_COMMENT("type_check:");
+
+    Label L_miss;
+
+    __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, R0, &L_success, &L_miss, NULL,
+                                     super_check_offset);
+    __ check_klass_subtype_slow_path(sub_klass, super_klass, temp, R0, &L_success, NULL);
+
+    // Fall through on failure!
+    __ bind(L_miss);
+  }
+
+
+  //  Generate stub for checked oop copy.
+  //
+  // Arguments for generated stub:
+  //      from:  R3
+  //      to:    R4
+  //      count: R5 treated as signed
+  //      ckoff: R6 (super_check_offset)
+  //      ckval: R7 (super_klass)
+  //      ret:   R3 zero for success; (-1^K) where K is partial transfer count
+  //
+  address generate_checkcast_copy(const char *name, bool dest_uninitialized) {
+
+    const Register R3_from   = R3_ARG1;      // source array address
+    const Register R4_to     = R4_ARG2;      // destination array address
+    const Register R5_count  = R5_ARG3;      // elements count
+    const Register R6_ckoff  = R6_ARG4;      // super_check_offset
+    const Register R7_ckval  = R7_ARG5;      // super_klass
+
+    const Register R8_offset = R8_ARG6;      // loop var, with stride wordSize
+    const Register R9_remain = R9_ARG7;      // loop var, with stride -1
+    const Register R10_oop   = R10_ARG8;     // actual oop copied
+    const Register R11_klass = R11_scratch1; // oop._klass
+    const Register R12_tmp   = R12_scratch2;
+
+    const Register R2_minus1 = R2;
+
+    //__ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ function_entry();
+
+    // TODO: Assert that int is 64 bit sign extended and arrays are not conjoint.
+
+    gen_write_ref_array_pre_barrier(R3_from, R4_to, R5_count, dest_uninitialized, R12_tmp, /* preserve: */ R6_ckoff, R7_ckval);
+
+    //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
+
+    Label load_element, store_element, store_null, success, do_card_marks;
+    __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
+    __ li(R8_offset, 0);                   // Offset from start of arrays.
+    __ li(R2_minus1, -1);
+    __ bne(CCR0, load_element);
+
+    // Empty array: Nothing to do.
+    __ li(R3_RET, 0);           // Return 0 on (trivial) success.
+    __ blr();
+
+    // ======== begin loop ========
+    // (Entry is load_element.)
+    __ align(OptoLoopAlignment);
+    __ bind(store_element);
+    if (UseCompressedOops) {
+      __ encode_heap_oop_not_null(R10_oop);
+      __ bind(store_null);
+      __ stw(R10_oop, R8_offset, R4_to);
+    } else {
+      __ bind(store_null);
+      __ std(R10_oop, R8_offset, R4_to);
+    }
+
+    __ addi(R8_offset, R8_offset, heapOopSize);   // Step to next offset.
+    __ add_(R9_remain, R2_minus1, R9_remain);     // Decrement the count.
+    __ beq(CCR0, success);
+
+    // ======== loop entry is here ========
+    __ bind(load_element);
+    __ load_heap_oop(R10_oop, R8_offset, R3_from, &store_null);  // Load the oop.
+
+    __ load_klass(R11_klass, R10_oop); // Query the object klass.
+
+    generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp,
+                        // Branch to this on success:
+                        store_element);
+    // ======== end loop ========
+
+    // It was a real error; we must depend on the caller to finish the job.
+    // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
+    // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
+    // and report their number to the caller.
+    __ subf_(R5_count, R9_remain, R5_count);
+    __ nand(R3_RET, R5_count, R5_count);   // report (-1^K) to caller
+    __ bne(CCR0, do_card_marks);
+    __ blr();
+
+    __ bind(success);
+    __ li(R3_RET, 0);
+
+    __ bind(do_card_marks);
+    // Store check on R4_to[0..R5_count-1].
+    gen_write_ref_array_post_barrier(R4_to, R5_count, R12_tmp, /* preserve: */ R3_RET);
+    __ blr();
+    return start;
+  }
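+
+  // Decoding sketch for the (-1^K) convention above (caller side,
+  // hypothetical use): the stub returns ~K on a partial transfer, so
+  //
+  //   intptr_t ret = ...;            // value left in R3 by the stub
+  //   if (ret != 0) {
+  //     intptr_t copied = ~ret;      // == -1 - ret: elements already copied
+  //     // continue the copy / throw from element 'copied' onwards
+  //   }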
+
+
+  //  Generate 'unsafe' array copy stub.
+  //  Though just as safe as the other stubs, it takes an unscaled
+  //  size_t argument instead of an element count.
+  //
+  // Arguments for generated stub:
+  //      from:  R3
+  //      to:    R4
+  //      count: R5 byte count, treated as ssize_t, can be zero
+  //
+  // Examines the alignment of the operands and dispatches
+  // to a long, int, short, or byte copy loop.
+  //
+  address generate_unsafe_copy(const char* name,
+                               address byte_copy_entry,
+                               address short_copy_entry,
+                               address int_copy_entry,
+                               address long_copy_entry) {
+
+    const Register R3_from   = R3_ARG1;      // source array address
+    const Register R4_to     = R4_ARG2;      // destination array address
+    const Register R5_count  = R5_ARG3;      // byte count (as long on PPC64)
+
+    const Register R6_bits   = R6_ARG4;      // test copy of low bits
+    const Register R7_tmp    = R7_ARG5;
+
+    //__ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ function_entry();
+
+    // Bump this on entry, not on exit:
+    //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
+
+    Label short_copy, int_copy, long_copy;
+
+    __ orr(R6_bits, R3_from, R4_to);
+    __ orr(R6_bits, R6_bits, R5_count);
+    __ andi_(R0, R6_bits, (BytesPerLong-1));
+    __ beq(CCR0, long_copy);
+
+    __ andi_(R0, R6_bits, (BytesPerInt-1));
+    __ beq(CCR0, int_copy);
+
+    __ andi_(R0, R6_bits, (BytesPerShort-1));
+    __ beq(CCR0, short_copy);
+
+    // byte_copy:
+    __ b(byte_copy_entry);
+
+    __ bind(short_copy);
+    __ srwi(R5_count, R5_count, LogBytesPerShort);
+    __ b(short_copy_entry);
+
+    __ bind(int_copy);
+    __ srwi(R5_count, R5_count, LogBytesPerInt);
+    __ b(int_copy_entry);
+
+    __ bind(long_copy);
+    __ srwi(R5_count, R5_count, LogBytesPerLong);
+    __ b(long_copy_entry);
 
     return start;
   }
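+
+  // C-level view of the alignment dispatch above (a sketch; 'count' arrives
+  // as a byte count):
+  //   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)count;
+  //   if ((bits & (BytesPerLong  - 1)) == 0) goto long_copy;  // count >>= 3
+  //   if ((bits & (BytesPerInt   - 1)) == 0) goto int_copy;   // count >>= 2
+  //   if ((bits & (BytesPerShort - 1)) == 0) goto short_copy; // count >>= 1
+  //   goto byte_copy;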
 
+
+  // Perform range checks on the proposed arraycopy.
+  // Kills the two temps, but nothing else.
+  // The sign bits of src_pos and dst_pos must have been cleaned by the caller.
+  void arraycopy_range_checks(Register src,     // source array oop
+                              Register src_pos, // source position
+                              Register dst,     // destination array oop
+                              Register dst_pos, // destination position
+                              Register length,  // length of copy
+                              Register temp1, Register temp2,
+                              Label& L_failed) {
+    BLOCK_COMMENT("arraycopy_range_checks:");
+
+    const Register array_length = temp1;  // scratch
+    const Register end_pos      = temp2;  // scratch
+
+    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
+    __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
+    __ add(end_pos, src_pos, length);  // src_pos + length
+    __ cmpd(CCR0, end_pos, array_length);
+    __ bgt(CCR0, L_failed);
+
+    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
+    __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
+    __ add(end_pos, dst_pos, length);  // dst_pos + length
+    __ cmpd(CCR0, end_pos, array_length);
+    __ bgt(CCR0, L_failed);
+
+    BLOCK_COMMENT("arraycopy_range_checks done");
+  }
+
+
+  //
+  //  Generate generic array copy stubs
+  //
+  //  Input:
+  //    R3    -  src oop
+  //    R4    -  src_pos
+  //    R5    -  dst oop
+  //    R6    -  dst_pos
+  //    R7    -  element count
+  //
+  //  Output:
+  //    R3 ==  0  -  success
+  //    R3 == -1  -  need to call System.arraycopy
+  //
+  address generate_generic_copy(const char *name,
+                                address entry_jbyte_arraycopy,
+                                address entry_jshort_arraycopy,
+                                address entry_jint_arraycopy,
+                                address entry_oop_arraycopy,
+                                address entry_disjoint_oop_arraycopy,
+                                address entry_jlong_arraycopy,
+                                address entry_checkcast_arraycopy) {
+    Label L_failed, L_objArray;
+
+    // Input registers
+    const Register src       = R3_ARG1;  // source array oop
+    const Register src_pos   = R4_ARG2;  // source position
+    const Register dst       = R5_ARG3;  // destination array oop
+    const Register dst_pos   = R6_ARG4;  // destination position
+    const Register length    = R7_ARG5;  // elements count
+
+    // registers used as temp
+    const Register src_klass = R8_ARG6;  // source array klass
+    const Register dst_klass = R9_ARG7;  // destination array klass
+    const Register lh        = R10_ARG8; // layout helper
+    const Register temp      = R2;
+
+    //__ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", name);
+    address start = __ function_entry();
+
+    // Bump this on entry, not on exit:
+    //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
+
+    // In principle, the int arguments could be dirty.
+
+    //-----------------------------------------------------------------------
+    // Assembler stubs will be used for this call to arraycopy
+    // if the following conditions are met:
+    //
+    // (1) src and dst must not be null.
+    // (2) src_pos must not be negative.
+    // (3) dst_pos must not be negative.
+    // (4) length  must not be negative.
+    // (5) src klass and dst klass should be the same and not NULL.
+    // (6) src and dst should be arrays.
+    // (7) src_pos + length must not exceed length of src.
+    // (8) dst_pos + length must not exceed length of dst.
+    BLOCK_COMMENT("arraycopy initial argument checks");
+
+    __ cmpdi(CCR1, src, 0);      // if (src == NULL) return -1;
+    __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
+    __ cmpdi(CCR5, dst, 0);      // if (dst == NULL) return -1;
+    __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
+    __ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
+    __ cror(CCR5, Assembler::equal, CCR0, Assembler::less);
+    __ extsw_(length, length);   // if (length < 0) return -1;
+    __ cror(CCR1, Assembler::equal, CCR5, Assembler::equal);
+    __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
+    __ beq(CCR1, L_failed);
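+    // The CR gymnastics above fold all five checks into CCR1.eq; a C-level
+    // sketch of what is tested:
+    //   if (src == NULL || dst == NULL ||
+    //       src_pos < 0 || dst_pos < 0 || length < 0) goto L_failed;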
+
+    BLOCK_COMMENT("arraycopy argument klass checks");
+    __ load_klass(src_klass, src);
+    __ load_klass(dst_klass, dst);
+
+    // Load layout helper
+    //
+    //  |array_tag|     | header_size | element_type |     |log2_element_size|
+    // 32        30    24            16              8     2                 0
+    //
+    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
+    //
+
+    int lh_offset = in_bytes(Klass::layout_helper_offset());
+
+    // Load the 32-bit signed layout helper value.
+    __ lwz(lh, lh_offset, src_klass);
+
+    // Handle objArrays completely differently...
+    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
+    __ load_const_optimized(temp, objArray_lh, R0);
+    __ cmpw(CCR0, lh, temp);
+    __ beq(CCR0, L_objArray);
+
+    __ cmpd(CCR5, src_klass, dst_klass);          // if (src->klass() != dst->klass()) return -1;
+    __ cmpwi(CCR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
+
+    __ crnand(CCR5, Assembler::equal, CCR6, Assembler::less);
+    __ beq(CCR5, L_failed);
+
+    // At this point, it is known to be a typeArray (array_tag 0x3).
+#ifdef ASSERT
+    { Label L;
+      jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
+      __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
+      __ cmpw(CCR0, lh, temp);
+      __ bge(CCR0, L);
+      __ stop("must be a primitive array");
+      __ bind(L);
+    }
+#endif
+
+    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
+                           temp, dst_klass, L_failed);
+
+    // TypeArrayKlass
+    //
+    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
+    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
+    //
+
+    const Register offset = dst_klass;    // array offset
+    const Register elsize = src_klass;    // log2 element size
+
+    __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
+    __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
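+    // Extraction sketch (C-level view of the rldicl/andi pair above):
+    //   offset = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;  // header bytes
+    //   elsize = lh & Klass::_lh_log2_element_size_mask;                              // log2(element size)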
+    __ add(src, offset, src);       // src array offset
+    __ add(dst, offset, dst);       // dst array offset
+
+    // Next registers should be set before the jump to corresponding stub.
+    const Register from     = R3_ARG1;  // source array address
+    const Register to       = R4_ARG2;  // destination array address
+    const Register count    = R5_ARG3;  // elements count
+
+    // 'from', 'to', 'count' registers should be set in this order
+    // since they are the same as 'src', 'src_pos', 'dst'.
+
+    BLOCK_COMMENT("scale indexes to element size");
+    __ sld(src_pos, src_pos, elsize);
+    __ sld(dst_pos, dst_pos, elsize);
+    __ add(from, src_pos, src);  // src_addr
+    __ add(to, dst_pos, dst);    // dst_addr
+    __ mr(count, length);        // length
+
+    BLOCK_COMMENT("choose copy loop based on element size");
+    // Using conditional branches with range 32kB.
+    const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CCR0, Assembler::equal);
+    __ cmpwi(CCR0, elsize, 0);
+    __ bc(bo, bi, entry_jbyte_arraycopy);
+    __ cmpwi(CCR0, elsize, LogBytesPerShort);
+    __ bc(bo, bi, entry_jshort_arraycopy);
+    __ cmpwi(CCR0, elsize, LogBytesPerInt);
+    __ bc(bo, bi, entry_jint_arraycopy);
+#ifdef ASSERT
+    { Label L;
+      __ cmpwi(CCR0, elsize, LogBytesPerLong);
+      __ beq(CCR0, L);
+      __ stop("must be long copy, but elsize is wrong");
+      __ bind(L);
+    }
+#endif
+    __ b(entry_jlong_arraycopy);
+
+    // ObjArrayKlass
+  __ bind(L_objArray);
+    // live at this point:  src_klass, dst_klass, src[_pos], dst[_pos], length
+
+    Label L_disjoint_plain_copy, L_checkcast_copy;
+    //  test array classes for subtyping
+    __ cmpd(CCR0, src_klass, dst_klass);         // usual case is exact equality
+    __ bne(CCR0, L_checkcast_copy);
+
+    // Identically typed arrays can be copied without element-wise checks.
+    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
+                           temp, lh, L_failed);
+
+    __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
+    __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
+    __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
+    __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
+    __ add(from, src_pos, src);  // src_addr
+    __ add(to, dst_pos, dst);    // dst_addr
+    __ mr(count, length);        // length
+    __ b(entry_oop_arraycopy);
+
+  __ bind(L_checkcast_copy);
+    // live at this point:  src_klass, dst_klass
+    {
+      // Before looking at dst.length, make sure dst is also an objArray.
+      __ lwz(temp, lh_offset, dst_klass);
+      __ cmpw(CCR0, lh, temp);
+      __ bne(CCR0, L_failed);
+
+      // It is safe to examine both src.length and dst.length.
+      arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
+                             temp, lh, L_failed);
+
+      // Marshal the base address arguments now, freeing registers.
+      __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
+      __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
+      __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
+      __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
+      __ add(from, src_pos, src);  // src_addr
+      __ add(to, dst_pos, dst);    // dst_addr
+      __ mr(count, length);        // length
+
+      Register sco_temp = R6_ARG4;             // This register is free now.
+      assert_different_registers(from, to, count, sco_temp,
+                                 dst_klass, src_klass);
+
+      // Generate the type check.
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());
+      __ lwz(sco_temp, sco_offset, dst_klass);
+      generate_type_check(src_klass, sco_temp, dst_klass,
+                          temp, L_disjoint_plain_copy);
+
+      // Fetch destination element klass from the ObjArrayKlass header.
+      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
+
+      // The checkcast_copy loop needs two extra arguments:
+      __ ld(R7_ARG5, ek_offset, dst_klass);   // dest elem klass
+      __ lwz(R6_ARG4, sco_offset, R7_ARG5);   // sco of elem klass
+      __ b(entry_checkcast_arraycopy);
+    }
+
+    __ bind(L_disjoint_plain_copy);
+    __ b(entry_disjoint_oop_arraycopy);
+
+  __ bind(L_failed);
+    __ li(R3_RET, -1); // return -1
+    __ blr();
+    return start;
+  }
+
+
   void generate_arraycopy_stubs() {
     // Note: the disjoint stubs must be generated first, some of
     // the conjoint stubs use them.
@@ -2005,6 +2433,24 @@
     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", true);
 
+    // special/generic versions
+    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", false);
+    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", true);
+
+    StubRoutines::_unsafe_arraycopy  = generate_unsafe_copy("unsafe_arraycopy",
+                                                            STUB_ENTRY(jbyte_arraycopy),
+                                                            STUB_ENTRY(jshort_arraycopy),
+                                                            STUB_ENTRY(jint_arraycopy),
+                                                            STUB_ENTRY(jlong_arraycopy));
+    StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
+                                                             STUB_ENTRY(jbyte_arraycopy),
+                                                             STUB_ENTRY(jshort_arraycopy),
+                                                             STUB_ENTRY(jint_arraycopy),
+                                                             STUB_ENTRY(oop_arraycopy),
+                                                             STUB_ENTRY(oop_disjoint_arraycopy),
+                                                             STUB_ENTRY(jlong_arraycopy),
+                                                             STUB_ENTRY(checkcast_arraycopy));
+
     // fill routines
     StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
     StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");