src/hotspot/cpu/sparc/stubGenerator_sparc.cpp
changeset 47216 71c04702a3d5
parent 46462 f92a713126b1
child 47561 f59f0e51ef8a
/*
 * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_sparc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Note:  The register L7 is used as L7_thread_cache, and may not be used
//        any other way within this module.


static const Register& Lstub_temp = L2;

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(a,b,c)
#else
#define inc_counter_np(counter, t1, t2) \
  BLOCK_COMMENT("inc_counter " #counter); \
  __ inc_counter(&counter, t1, t2);
#endif

  //----------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C

  address generate_call_stub(address& return_pc) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // Incoming arguments:
    //
    // o0         : call wrapper address
    // o1         : result (address)
    // o2         : result type
    // o3         : method
    // o4         : (interpreter) entry point
    // o5         : parameters (address)
    // [sp + 0x5c]: parameter size (in words)
    // [sp + 0x60]: thread
    //
    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // |  param. size  |
    // +---------------+ <--- sp + 0x60
    // |    thread     |
    // +---------------+
    // |               |

    // note: if the link argument position changes, adjust
    //       the code in frame::entry_frame_call_wrapper()

    const Argument link           = Argument(0, false); // used only for GC
    const Argument result         = Argument(1, false);
    const Argument result_type    = Argument(2, false);
    const Argument method         = Argument(3, false);
    const Argument entry_point    = Argument(4, false);
    const Argument parameters     = Argument(5, false);
    const Argument parameter_size = Argument(6, false);
    const Argument thread         = Argument(7, false);

    // setup thread register
    __ ld_ptr(thread.as_address(), G2_thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    { const Register t = G3_scratch;
      Label L;
      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
      __ br_null_short(t, Assembler::pt, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // create activation frame & allocate space for parameters
    { const Register t = G3_scratch;
      __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
      __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
      __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
      __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
      __ neg(t);                                                // negate so it can be used with save
      __ save(SP, t, SP);                                       // setup new frame
    }

    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // |  empty slot   |      (only if parameter size is even)
    // +---------------+
    // |               |
    // .  parameters   .
    // |               |
    // +---------------+ <--- fp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- fp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- fp + 0x5c
    // |  param. size  |
    // +---------------+ <--- fp + 0x60
    // |    thread     |
    // +---------------+
    // |               |

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    { const Register src = parameters.as_in().as_register();
      const Register dst = Lentry_args;
      const Register tmp = G3_scratch;
      const Register cnt = G4_scratch;

      // test if any parameters & setup of Lentry_args
      Label exit;
      __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
      __ add( FP, STACK_BIAS, dst );
      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
      __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args

      // copy parameters if any
      Label loop;
      __ BIND(loop);
      // Store parameter value
      __ ld_ptr(src, 0, tmp);
      __ add(src, BytesPerWord, src);
      __ st_ptr(tmp, dst, 0);
      __ deccc(cnt);
      __ br(Assembler::greater, false, Assembler::pt, loop);
      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);

      // done
      __ BIND(exit);
    }

    // setup parameters, method & call Java function
#ifdef ASSERT
    // layout_activation_impl checks its notion of saved SP against
    // this register, so if this changes, update it as well.
    const Register saved_SP = Lscratch;
    __ mov(SP, saved_SP);                               // keep track of SP before call
#endif

    // setup parameters
    const Register t = G3_scratch;
    __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
    __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
    __ sub(FP, t, Gargs);                              // setup parameter pointer
    __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
    __ mov(SP, O5_savedSP);


    // do the call
    //
    // the following registers must be set up:
    //
    // G2_thread
    // G5_method
    // Gargs
    BLOCK_COMMENT("call Java function");
    __ jmpl(entry_point.as_in().as_register(), G0, O7);
    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method

    BLOCK_COMMENT("call_stub_return_address:");
    return_pc = __ pc();

    // The callee, if it wasn't interpreted, can return with SP changed, so
    // we can no longer assert on the change of SP.

    // store result depending on type
    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
    //  is treated as T_INT)
    { const Register addr = result     .as_in().as_register();
      const Register type = result_type.as_in().as_register();
      Label is_long, is_float, is_double, is_object, exit;
      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
      __ delayed()->nop();

      // store int result
      __ st(O0, addr, G0);

      __ BIND(exit);
      __ ret();
      __ delayed()->restore();

      __ BIND(is_object);
      __ ba(exit);
      __ delayed()->st_ptr(O0, addr, G0);

      __ BIND(is_float);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);

      __ BIND(is_double);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);

      __ BIND(is_long);
      __ ba(exit);
      __ delayed()->st_long(O0, addr, G0);      // store entire long
    }
    return start;
  }
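
  // The C++ side reaches this stub through StubRoutines::call_stub(), whose
  // CallStub typedef (see stubRoutines.hpp) matches the incoming-argument
  // layout documented above. Roughly, as an illustrative sketch of the
  // caller's side (mirroring JavaCalls::call_helper):
  //
  //   StubRoutines::call_stub()(link, result, result_type, method,
  //                             entry_point, parameters, size_of_parameters,
  //                             CHECK);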
       
  //----------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java code.
  // The exception is caught and transformed into a pending exception stored in
  // JavaThread that can be tested from within the VM.
  //
  // Oexception: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");

    address start = __ pc();
    // verify that thread corresponds
    __ verify_thread();

    const Register& temp_reg = Gtemp;
    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());

    // set pending exception
    __ verify_oop(Oexception);
    __ st_ptr(Oexception, pending_exception_addr);
    __ set((intptr_t)__FILE__, temp_reg);
    __ st_ptr(temp_reg, exception_file_offset_addr);
    __ set((intptr_t)__LINE__, temp_reg);
    __ st(temp_reg, exception_line_offset_addr);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");

    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
    __ jump_to(stub_ret, temp_reg);
    __ delayed()->nop();

    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Continuation point for runtime calls returning with a pending exception.
  // The pending exception check happened in the runtime or native call stub.
  // The pending exception in Thread is converted into a Java-level exception.
  //
  // Contract with Java-level exception handler: O0 = exception
  //                                             O1 = throwing pc

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward_exception");
    address start = __ pc();

    // Upon entry, O7 has the return address returning into Java
    // (interpreted or compiled) code; i.e. the return address
    // becomes the throwing pc.

    const Register& handler_reg = Gtemp;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());

#ifdef ASSERT
    // make sure that this code is only executed if there is a pending exception
    { Label L;
      __ ld_ptr(exception_addr, Gtemp);
      __ br_notnull_short(Gtemp, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into handler_reg
    __ get_thread();
    __ ld_ptr(exception_addr, Oexception);
    __ verify_oop(Oexception);
    __ save_frame(0);             // compensates for compiler weakness
    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
    __ mov(O0, handler_reg);
    __ restore();                 // compensates for compiler weakness

    __ ld_ptr(exception_addr, Oexception);
    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ br_notnull_short(Oexception, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    // jump to exception handler
    __ jmp(handler_reg, 0);
    // clear pending exception
    __ delayed()->st_ptr(G0, exception_addr);

    return start;
  }

  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   o0 = adr
    //   o1 = errValue
    //
    // result:
    //   o0  = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    __ align(CodeEntryAlignment);
    *entry = __ pc();

    __ mov(O0, G1);  // g1 = o0
    __ mov(O1, O0);  // o0 = o1
    // Load *adr into O0; this may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldsw(G1, 0, O0);  // o0 = [g1]
        break;
      case 8:
        // int64_t
        __ ldx(G1, 0, O0);   // o0 = [g1]
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    // By convention with the trap handler we ensure there is a non-CTI
    // instruction in the trap shadow.
    __ nop();
    __ retl();
    __ delayed()->nop();
  }
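
  // These stubs back StubRoutines::SafeFetch32/SafeFetchN. An illustrative
  // use (not from this file) is probing memory that may be unmapped, e.g.
  // during error reporting:
  //
  //   int v = SafeFetch32(addr, 0xBAD);  // yields 0xBAD if the load faults
  //
  // The trap handler recognizes a fault at *fault_pc and resumes execution at
  // *continuation_pc, with the errValue already sitting in O0.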
       
  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame. Only callee-saved registers are preserved
  // (through the normal register window / RegisterMap handling).
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller saved registers were assumed volatile in the compiler.

  // Note that we generate only this stub into a RuntimeStub, because it needs to be
  // properly traversed and ignored during GC, so we change the meaning of the "__"
  // macro within this method.
#undef __
#define __ masm->

  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {
#ifdef ASSERT
    int insts_size = VerifyThread ? 1 * K : 600;
#else
    int insts_size = VerifyThread ? 1 * K : 256;
#endif /* ASSERT */
    int locs_size  = 32;

    CodeBuffer      code(name, insts_size, locs_size);
    MacroAssembler* masm = new MacroAssembler(&code);

    __ verify_thread();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of thread-local storage
    __ assert_not_delayed();

    // Note that we always push a frame because on the SPARC
    // architecture, for all of our implicit exception kinds at call
    // sites, the implicit exception is taken before the callee frame
    // is pushed.
    __ save_frame(0);

    int frame_complete = __ offset();

    // Note that we always have a runtime stub frame on the top of stack by this point
    Register last_java_sp = SP;
    // 64-bit last_java_sp is biased!
    __ set_last_Java_frame(last_java_sp, G0);
    if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
    __ save_thread(noreg);
    if (arg1 != noreg) {
      assert(arg2 != O1, "clobbered");
      __ mov(arg1, O1);
    }
    if (arg2 != noreg) {
      __ mov(arg2, O2);
    }
    // do the call
    BLOCK_COMMENT("call runtime_entry");
    __ call(runtime_entry, relocInfo::runtime_call_type);
    if (!VerifyThread)
      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
    else
      __ delayed()->nop();             // (thread already passed)
    __ restore_thread(noreg);
    __ reset_last_Java_frame();

    // check for pending exceptions. use Gtemp as scratch register.
#ifdef ASSERT
    Label L;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());
    Register scratch_reg = Gtemp;
    __ ld_ptr(exception_addr, scratch_reg);
    __ br_notnull_short(scratch_reg, Assembler::pt, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    BLOCK_COMMENT("call forward_exception_entry");
    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
    // we use O7 linkage so that forward_exception_entry has the issuing PC
    __ delayed()->restore();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
    return stub->entry_point();
  }
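
  // Callers (the stub-setup routines further down in HotSpot's stub generation)
  // bind these throwers to well-known entries; an illustrative example:
  //
  //   StubRoutines::_throw_StackOverflowError_entry =
  //     generate_throw_exception("StackOverflowError throw_exception",
  //       CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));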
       
#undef __
#define __ _masm->


  // Generate a routine that sets all the registers so we
  // can tell if the stop routine prints them correctly.
  address generate_test_stop() {
    StubCodeMark mark(this, "StubRoutines", "test_stop");
    address start = __ pc();

    int i;

    __ save_frame(0);

    static jfloat zero = 0.0, one = 1.0;

    // put addr in L0, then load through L0 to F0
    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1

    // use add to put 2..18 in F2..F18
    for ( i = 2;  i <= 18;  ++i ) {
      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
    }

    // Now put double 2 in F16, double 18 in F18
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );

    // use add to put 20..32 in F20..F32
    for (i = 20; i < 32; i += 2) {
      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
    }

    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
    for ( i = 0; i < 8; ++i ) {
      if (i < 6) {
        __ set(     i, as_iRegister(i));
        __ set(16 + i, as_oRegister(i));
        __ set(24 + i, as_gRegister(i));
      }
      __ set( 8 + i, as_lRegister(i));
    }

    __ stop("testing stop");


    __ ret();
    __ delayed()->restore();

    return start;
  }


  address generate_stop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
    address start = __ pc();

    __ stop_subroutine();

    return start;
  }

  address generate_flush_callers_register_windows() {
    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
    address start = __ pc();

    __ flushw();
    __ retl(false);
    __ delayed()->add( FP, STACK_BIAS, O0 );
    // The returned value must be a stack pointer whose register save area
    // is flushed, and will stay flushed while the caller executes.

    return start;
  }

  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
  //
  // Arguments:
  //
  //      exchange_value: O0
  //      dest:           O1
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    if (UseCASForSwap) {
      // Use CAS instead of swap, just in case the MP hardware
      // prefers to work with just one kind of synch. instruction.
      Label retry;
      __ BIND(retry);
      __ mov(O0, O3);       // scratch copy of exchange value
      __ ld(O1, 0, O2);     // observe the previous value
      // try to replace O2 with O3
      __ cas(O1, O2, O3);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);

      __ retl(false);
      __ delayed()->mov(O2, O0);  // report previous value to caller
    } else {
      __ retl(false);
      __ delayed()->swap(O1, 0, O0);
    }

    return start;
  }
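
  // The CAS retry loop above is, in rough C terms (illustrative sketch only):
  //
  //   jint atomic_xchg(jint exchange_value, volatile jint* dest) {
  //     jint observed;
  //     do {
  //       observed = *dest;                          // ld
  //     } while (cas(dest, observed, exchange_value) != observed);
  //     return observed;                             // previous value
  //   }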
       
  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
  //
  // Arguments:
  //
  //      exchange_value: O0
  //      dest:           O1
  //      compare_value:  O2
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    // cmpxchg(dest, compare_value, exchange_value)
    __ cas(O1, O2, O0);
    __ retl(false);
    __ delayed()->nop();

    return start;
  }
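
  // Reminder on the cas/casx semantics relied on throughout this file
  // (illustrative): cas(addr, compare, value) atomically performs
  //
  //   tmp = *addr; if (tmp == compare) *addr = value; value = tmp;
  //
  // so the "value" register comes back holding whatever was previously in
  // memory, which is exactly the cmpxchg contract.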
       
  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
  //
  // Arguments:
  //
  //      exchange_value: O1:O0
  //      dest:           O2
  //      compare_value:  O4:O3
  //
  // Results:
  //
  //     O1:O0: the value previously stored in dest
  //
  // Overwrites: G1,G2,G3
  //
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    __ sllx(O0, 32, O0);
    __ srl(O1, 0, O1);
    __ or3(O0,O1,O0);      // O0 holds 64-bit value from exchange_value
    __ sllx(O3, 32, O3);
    __ srl(O4, 0, O4);
    __ or3(O3,O4,O3);     // O3 holds 64-bit value from compare_value
    __ casx(O2, O3, O0);
    __ srl(O0, 0, O1);    // unpacked return value in O1:O0
    __ retl(false);
    __ delayed()->srlx(O0, 32, O0);

    return start;
  }
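
  // The packing/unpacking above, in rough C terms (illustrative sketch only):
  //
  //   uint64_t exch = ((uint64_t)O0 << 32) | (uint32_t)O1; // sllx/srl/or3
  //   uint64_t cmp  = ((uint64_t)O3 << 32) | (uint32_t)O4;
  //   uint64_t prev = casx(dest, cmp, exch);  // previous memory value in O0
  //   O1 = (uint32_t)prev;                    // srl  O0,  0, O1
  //   O0 = (uint32_t)(prev >> 32);            // srlx O0, 32, O0 (delay slot)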
       
  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
  //
  // Arguments:
  //
  //      add_value: O0   (e.g., +1 or -1)
  //      dest:      O1
  //
  // Results:
  //
  //     O0: the new value stored in dest
  //
  // Overwrites: O3
  //
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();
    __ BIND(_atomic_add_stub);

    Label retry;
    __ BIND(retry);

    __ lduw(O1, 0, O2);
    __ add(O0, O2, O3);
    __ cas(O1, O2, O3);
    __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
    __ retl(false);
    __ delayed()->add(O0, O2, O0); // note that cas made O2==O3

    return start;
  }
  Label _atomic_add_stub;  // called from other stubs
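
  // In rough C terms the loop above implements (illustrative sketch only):
  //
  //   jint atomic_add(jint add_value, volatile jint* dest) {
  //     jint old, sum;
  //     do {
  //       old = *dest;              // lduw
  //       sum = old + add_value;    // add
  //     } while (cas(dest, old, sum) != old);
  //     return old + add_value;     // new value, recomputed in the delay slot
  //   }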
       
  // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
  // Arguments :
  //
  //      ret  : O0, returned
  //      icc/xcc: set as O0 (depending on wordSize)
  //      sub  : O1, argument, not changed
  //      super: O2, argument, not changed
  //      raddr: O7, blown by call
  address generate_partial_subtype_check() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
    address start = __ pc();
    Label miss;

    __ save_frame(0);
    Register Rret   = I0;
    Register Rsub   = I1;
    Register Rsuper = I2;

    Register L0_ary_len = L0;
    Register L1_ary_ptr = L1;
    Register L2_super   = L2;
    Register L3_index   = L3;

    __ check_klass_subtype_slow_path(Rsub, Rsuper,
                                     L0, L1, L2, L3,
                                     NULL, &miss);

    // Match falls through here.
    __ addcc(G0,0,Rret);        // set Z flags, Z result

    __ ret();                   // Result in Rret is zero; flags set to Z
    __ delayed()->restore();

    __ BIND(miss);
    __ addcc(G0,1,Rret);        // set NZ flags, NZ result

    __ ret();                   // Result in Rret is != 0; flags set to NZ
    __ delayed()->restore();

    return start;
  }


  // Called from MacroAssembler::verify_oop
  //
  address generate_verify_oop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");

    address start = __ pc();

    __ verify_oop_subroutine();

    return start;
  }


  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
  #if defined(ASSERT)
    __ signx(Rint, Rtmp);
    __ cmp(Rint, Rtmp);
    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
  #endif
  }

  //
  //  Generate overlap test for array copy stubs
  //
  //  Input:
  //    O0    -  array1
  //    O1    -  array2
  //    O2    -  element count
  //
  //  Kills temps:  O3, O4
  //
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
  }
  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
    const Register from       = O0;
    const Register to         = O1;
    const Register count      = O2;
    const Register to_from    = O3; // to - from
    const Register byte_count = O4; // count << log2_elem_size

      __ subcc(to, from, to_from);
      __ sll_ptr(count, log2_elem_size, byte_count);
      if (NOLp == NULL)
        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
      else
        __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
      __ delayed()->cmp(to_from, byte_count);
      if (NOLp == NULL)
        __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
      else
        __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
      __ delayed()->nop();
  }
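
  // The two unsigned branches above encode the no-overlap condition for a
  // forward copy (illustrative): the copy is safe when
  //
  //   to <= from  ||  (uintptr_t)(to - from) >= byte_count
  //
  // i.e. the destination begins at or before the source, or at or beyond the
  // end of the region being read.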
       
  //
  //  Generate pre-write barrier for array.
  //
  //  Input:
  //     addr     - register containing starting address
  //     count    - register containing element count
  //
  //  The input registers are overwritten.
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
        if (!dest_uninitialized) {
          __ save_frame(0);
          // Save the necessary global regs... will be used after.
          if (addr->is_global()) {
            __ mov(addr, L0);
          }
          if (count->is_global()) {
            __ mov(count, L1);
          }
          __ mov(addr->after_save(), O0);
          // Get the count into O1
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
          __ delayed()->mov(count->after_save(), O1);
          if (addr->is_global()) {
            __ mov(L0, addr);
          }
          if (count->is_global()) {
            __ mov(L1, count);
          }
          __ restore();
        }
        break;
      case BarrierSet::CardTableForRS:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
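
  // Background note: the G1 pre-barrier records the oops about to be
  // overwritten so the snapshot-at-the-beginning (SATB) marking invariant is
  // preserved; a destination that is statically known to be uninitialized has
  // no previous values to record, hence the dest_uninitialized short-circuit.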
       
  //
  //  Generate post-write barrier for array.
  //
  //  Input:
  //     addr     - register containing starting address
  //     count    - register containing element count
  //     tmp      - scratch register
  //
  //  The input registers are overwritten.
  //
  void gen_write_ref_array_post_barrier(Register addr, Register count,
                                        Register tmp) {
    BarrierSet* bs = Universe::heap()->barrier_set();

    switch (bs->kind()) {
      case BarrierSet::G1SATBCTLogging:
        {
          // Get some new fresh output registers.
          __ save_frame(0);
          __ mov(addr->after_save(), O0);
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
          __ delayed()->mov(count->after_save(), O1);
          __ restore();
        }
        break;
      case BarrierSet::CardTableForRS:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
          assert_different_registers(addr, count, tmp);

          Label L_loop;

          __ sll_ptr(count, LogBytesPerHeapOop, count);
          __ sub(count, BytesPerHeapOop, count);
          __ add(count, addr, count);
          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
          __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
          __ srl_ptr(count, CardTableModRefBS::card_shift, count);
          __ sub(count, addr, count);
          AddressLiteral rs(ct->byte_map_base);
          __ set(rs, tmp);
        __ BIND(L_loop);
          __ stb(G0, tmp, addr);
          __ subcc(count, 1, count);
          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
          __ delayed()->add(addr, 1, addr);
        }
        break;
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
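
  // The card-table case above is, in rough C terms (illustrative sketch only):
  //
  //   jbyte* base  = ct->byte_map_base;
  //   size_t first = (uintptr_t)addr >> card_shift;
  //   size_t last  = ((uintptr_t)addr + (count - 1) * BytesPerHeapOop) >> card_shift;
  //   for (size_t i = first; i <= last; i++)
  //     base[i] = 0;    // 0 is the dirty value, hence the stb(G0, ...) above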
       
  //
  // Generate main code for disjoint arraycopy
  //
  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
                                              Label& L_loop, bool use_prefetch, bool use_bis);

  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
                          int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
    Label L_copy;

    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
    assert(prefetch_dist < 4096, "invalid value");
    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count

    if (UseBlockCopy) {
      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;

      // 64 bytes tail + bytes copied in one loop iteration
      int tail_size = 64 + iter_size;
      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
      // Use BIS copy only for big arrays since it requires membar.
      __ set(block_copy_count, O4);
      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
      // This code is for disjoint source and destination:
      //   to <= from || to >= from+count
      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
      __ sub(from, to, O4);
      __ srax(O4, 4, O4); // divide by 16, since the following short branch has only 5 bits for the immediate.
      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);

      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
      // BIS should not be used to copy tail (64 bytes+iter_size)
      // to avoid zeroing of following values.
      __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0

      if (prefetch_count > 0) { // rounded up to one iteration count
        // Do prefetching only if copy size is bigger
        // than prefetch distance.
        __ set(prefetch_count, O4);
        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
        __ sub(count, O4, count);

        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
        __ set(prefetch_count, O4);
        __ add(count, O4, count);

      } // prefetch_count > 0

      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
      __ add(count, (tail_size>>log2_elem_size), count); // restore count

      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
      // BIS needs membar.
      __ membar(Assembler::StoreLoad);
      // Copy tail
      __ ba_short(L_copy);

      __ BIND(L_skip_block_copy);
    } // UseBlockCopy

    if (prefetch_count > 0) { // rounded up to one iteration count
      // Do prefetching only if copy size is bigger
      // than prefetch distance.
      __ set(prefetch_count, O4);
      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
      __ sub(count, O4, count);

      Label L_copy_prefetch;
      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
      __ set(prefetch_count, O4);
      __ add(count, O4, count);

    } // prefetch_count > 0

    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
  }



  //
  // Helper methods for copy_16_bytes_forward_with_shift()
  //
  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
                                Label& L_loop, bool use_prefetch, bool use_bis) {

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    if (use_prefetch) {
      if (ArraycopySrcPrefetchDistance > 0) {
        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
      }
      if (ArraycopyDstPrefetchDistance > 0) {
        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
      }
    }
    __ ldx(from, 0, O4);
    __ ldx(from, 8, G4);
    __ inc(to, 16);
    __ inc(from, 16);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ srlx(O4, right_shift, G3);
    __ bset(G3, O3);
    __ sllx(O4, left_shift,  O4);
    __ srlx(G4, right_shift, G3);
    __ bset(G3, O4);
    if (use_bis) {
      __ stxa(O3, to, -16);
      __ stxa(O4, to, -8);
    } else {
      __ stx(O3, to, -16);
      __ stx(O4, to, -8);
    }
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->sllx(G4, left_shift,  O3);
  }
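
  // Each iteration merges two aligned 8-byte loads into two aligned 8-byte
  // stores for a source misaligned by (left_shift / 8) bytes; in rough C
  // terms (illustrative sketch only, with the carry kept in O3):
  //
  //   out0  = carry | (w0 >> right_shift);   // carry = prev << left_shift
  //   out1  = (w0 << left_shift) | (w1 >> right_shift);
  //   carry = w1 << left_shift;              // computed in the delay slot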
       
  // Copy big chunks forward with shift
  //
  // Inputs:
  //   from      - source arrays
  //   to        - destination array aligned to 8-bytes
  //   count     - element count to copy; assumed >= the count equivalent to 16 bytes
  //   count_dec - elements count's decrement equivalent to 16 bytes
  //   L_copy_bytes - copy exit label
  //
  void copy_16_bytes_forward_with_shift(Register from, Register to,
                     Register count, int log2_elem_size, Label& L_copy_bytes) {
    Label L_aligned_copy, L_copy_last_bytes;
    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->nop();

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-bytes chunks and use one from previous iteration
    // to form 2 aligned 8-bytes chunks to store.
    //
    __ dec(count, count_dec);   // Pre-decrement 'count'
    __ andn(from, 7, from);     // Align address
    __ ldx(from, 0, O3);
    __ inc(from, 8);
    __ sllx(O3, left_shift,  O3);

    disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(from, 0, O4);
    __ inc(to, 8);
    __ inc(from, 8);
    __ srlx(O4, right_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, to, -8);

    __ BIND(L_copy_last_bytes);
    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->sub(from, right_shift, from);       // restore address

    __ BIND(L_aligned_copy);
  }

  // Copy big chunks backward with shift
  //
  // Inputs:
  //   end_from  - source arrays end address
  //   end_to    - destination array end address aligned to 8-bytes
  //   count     - element count to copy; assumed >= the count equivalent to 16 bytes
  1115   //   count_dec - elements count's decrement equivalent to 16 bytes
       
  1116   //   L_aligned_copy - aligned copy exit label
       
  1117   //   L_copy_bytes   - copy exit label
       
  1118   //
       
  1119   void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
       
  1120                      Register count, int count_dec,
       
  1121                      Label& L_aligned_copy, Label& L_copy_bytes) {
       
  1122     Label L_loop, L_copy_last_bytes;
       
  1123 
       
  1124     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
       
  1125       __ andcc(end_from, 7, G1); // misaligned bytes
       
  1126       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
       
  1127       __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
       
  1128 
       
  1129     const Register left_shift  = G1; // left  shift bit counter
       
  1130     const Register right_shift = G5; // right shift bit counter
       
  1131 
       
  1132       __ sll(G1, LogBitsPerByte, left_shift);
       
  1133       __ mov(64, right_shift);
       
  1134       __ sub(right_shift, left_shift, right_shift);
       
  1135 
       
  1136     //
       
  1137     // Load 2 aligned 8-bytes chunks and use one from previous iteration
       
  1138     // to form 2 aligned 8-bytes chunks to store.
       
  1139     //
       
  1140       __ andn(end_from, 7, end_from);     // Align address
       
  1141       __ ldx(end_from, 0, O3);
       
  1142       __ align(OptoLoopAlignment);
       
  1143     __ BIND(L_loop);
       
  1144       __ ldx(end_from, -8, O4);
       
  1145       __ deccc(count, count_dec); // Can we do next iteration after this one?
       
  1146       __ ldx(end_from, -16, G4);
       
  1147       __ dec(end_to, 16);
       
  1148       __ dec(end_from, 16);
       
  1149       __ srlx(O3, right_shift, O3);
       
  1150       __ sllx(O4, left_shift,  G3);
       
  1151       __ bset(G3, O3);
       
  1152       __ stx(O3, end_to, 8);
       
  1153       __ srlx(O4, right_shift, O4);
       
  1154       __ sllx(G4, left_shift,  G3);
       
  1155       __ bset(G3, O4);
       
  1156       __ stx(O4, end_to, 0);
       
  1157       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
       
  1158       __ delayed()->mov(G4, O3);
       
  1159 
       
  1160       __ inccc(count, count_dec>>1 ); // + 8 bytes
       
  1161       __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
       
  1162       __ delayed()->inc(count, count_dec>>1); // restore 'count'
       
  1163 
       
  1164       // copy 8 bytes, part of which is already loaded in O3
       
  1165       __ ldx(end_from, -8, O4);
       
  1166       __ dec(end_to, 8);
       
  1167       __ dec(end_from, 8);
       
  1168       __ srlx(O3, right_shift, O3);
       
  1169       __ sllx(O4, left_shift,  G3);
       
  1170       __ bset(O3, G3);
       
  1171       __ stx(G3, end_to, 0);
       
  1172 
       
  1173     __ BIND(L_copy_last_bytes);
       
  1174       __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
       
  1175       __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
       
  1176       __ delayed()->add(end_from, left_shift, end_from); // restore address
       
  1177   }
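
         // Illustrative sketch (not generated code): the backward variant walks
         // down from the array ends; 'prev' is the aligned 8-byte chunk at the
         // highest remaining source address:
         //
         //   end_from &= ~7;                    // align source end
         //   prev = load8(end_from);
         //   while ((count -= count_dec) >= 0) {
         //     a = load8(end_from - 8);  b = load8(end_from - 16);
         //     store8(end_to - 8,  (prev >> right_shift) | (a << left_shift));
         //     store8(end_to - 16, (a    >> right_shift) | (b << left_shift));
         //     prev = b;  end_from -= 16;  end_to -= 16;
         //   }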
       
  1178 
       
  1179   //
       
  1180   //  Generate stub for disjoint byte copy.  If "aligned" is true, the
       
  1181   //  "from" and "to" addresses are assumed to be heapword aligned.
       
  1182   //
       
  1183   // Arguments for generated stub:
       
  1184   //      from:  O0
       
  1185   //      to:    O1
       
  1186   //      count: O2 treated as signed
       
  1187   //
       
  1188   address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
       
  1189     __ align(CodeEntryAlignment);
       
  1190     StubCodeMark mark(this, "StubRoutines", name);
       
  1191     address start = __ pc();
       
  1192 
       
  1193     Label L_skip_alignment, L_align;
       
  1194     Label L_copy_byte, L_copy_byte_loop, L_exit;
       
  1195 
       
  1196     const Register from      = O0;   // source array address
       
  1197     const Register to        = O1;   // destination array address
       
  1198     const Register count     = O2;   // elements count
       
  1199     const Register offset    = O5;   // offset from start of arrays
       
  1200     // O3, O4, G3, G4 are used as temp registers
       
  1201 
       
  1202     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
       
  1203 
       
  1204     if (entry != NULL) {
       
  1205       *entry = __ pc();
       
  1206       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
       
  1207       BLOCK_COMMENT("Entry:");
       
  1208     }
       
  1209 
       
  1210     // for short arrays, just do single element copy
       
  1211     __ cmp(count, 23); // 16 + 7
       
  1212     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
       
  1213     __ delayed()->mov(G0, offset);
       
  1214 
       
  1215     if (aligned) {
       
  1216       // 'aligned' == true when it is known statically during compilation
       
  1217       // of this arraycopy call site that both 'from' and 'to' addresses
       
  1218       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
       
  1219       //
       
  1220       // Aligned arrays have 4-byte alignment in the 32-bit VM

  1221       // and 8-byte alignment in the 64-bit VM, so we do it only in the 32-bit VM.
       
  1222       //
       
  1223     } else {
       
  1224       // copy bytes to align 'to' on 8 byte boundary
       
  1225       __ andcc(to, 7, G1); // misaligned bytes
       
  1226       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
       
  1227       __ delayed()->neg(G1);
       
  1228       __ inc(G1, 8);       // bytes needed to reach the next 8-byte boundary
       
  1229       __ sub(count, G1, count);
       
  1230     __ BIND(L_align);
       
  1231       __ ldub(from, 0, O3);
       
  1232       __ deccc(G1);
       
  1233       __ inc(from);
       
  1234       __ stb(O3, to, 0);
       
  1235       __ br(Assembler::notZero, false, Assembler::pt, L_align);
       
  1236       __ delayed()->inc(to);
       
  1237     __ BIND(L_skip_alignment);
       
  1238     }
       
  1239     if (!aligned) {
       
  1240       // Copy with shift 16 bytes per iteration if arrays do not have
       
  1241       // the same alignment mod 8, otherwise fall through to the next
       
  1242       // code for aligned copy.
       
  1243       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
       
  1244       // Also jump over aligned copy after the copy with shift completed.
       
  1245 
       
  1246       copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
       
  1247     }
       
  1248 
       
  1249     // Both arrays are 8-byte aligned; copy 16 bytes at a time
       
  1250       __ and3(count, 7, G4); // Save count
       
  1251       __ srl(count, 3, count);
       
  1252      generate_disjoint_long_copy_core(aligned);
       
  1253       __ mov(G4, count);     // Restore count
       
  1254 
       
  1255     // copy trailing bytes
       
  1256     __ BIND(L_copy_byte);
       
  1257       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       
  1258       __ align(OptoLoopAlignment);
       
  1259     __ BIND(L_copy_byte_loop);
       
  1260       __ ldub(from, offset, O3);
       
  1261       __ deccc(count);
       
  1262       __ stb(O3, to, offset);
       
  1263       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
       
  1264       __ delayed()->inc(offset);
       
  1265 
       
  1266     __ BIND(L_exit);
       
  1267       // O3, O4 are used as temp registers
       
  1268       inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
       
  1269       __ retl();
       
  1270       __ delayed()->mov(G0, O0); // return 0
       
  1271     return start;
       
  1272   }
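
         // Illustrative control flow of the disjoint byte copy stub above
         // (pseudocode, not generated code; the shift helper itself falls
         // through to the aligned path when 'from' and 'to' agree mod 8):
         //
         //   if (count < 23) goto copy_byte;     // short arrays: byte loop only
         //   copy up to 7 bytes so 'to' becomes 8-byte aligned (unaligned entry);
         //   if ((from ^ to) & 7) copy 16 bytes per iteration with shifts;
         //   else                 copy 8-byte words (disjoint long copy core);
         //   copy_byte: copy the remaining tail bytes one at a time;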
       
  1273 
       
  1274   //
       
  1275   //  Generate stub for conjoint byte copy.  If "aligned" is true, the
       
  1276   //  "from" and "to" addresses are assumed to be heapword aligned.
       
  1277   //
       
  1278   // Arguments for generated stub:
       
  1279   //      from:  O0
       
  1280   //      to:    O1
       
  1281   //      count: O2 treated as signed
       
  1282   //
       
  1283   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
       
  1284                                       address *entry, const char *name) {
       
  1285     // Do reverse copy.
       
  1286 
       
  1287     __ align(CodeEntryAlignment);
       
  1288     StubCodeMark mark(this, "StubRoutines", name);
       
  1289     address start = __ pc();
       
  1290 
       
  1291     Label L_skip_alignment, L_align, L_aligned_copy;
       
  1292     Label L_copy_byte, L_copy_byte_loop, L_exit;
       
  1293 
       
  1294     const Register from      = O0;   // source array address
       
  1295     const Register to        = O1;   // destination array address
       
  1296     const Register count     = O2;   // elements count
       
  1297     const Register end_from  = from; // source array end address
       
  1298     const Register end_to    = to;   // destination array end address
       
  1299 
       
  1300     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
       
  1301 
       
  1302     if (entry != NULL) {
       
  1303       *entry = __ pc();
       
  1304       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
       
  1305       BLOCK_COMMENT("Entry:");
       
  1306     }
       
  1307 
       
  1308     array_overlap_test(nooverlap_target, 0);
       
  1309 
       
  1310     __ add(to, count, end_to);       // offset after last copied element
       
  1311 
       
  1312     // for short arrays, just do single element copy
       
  1313     __ cmp(count, 23); // 16 + 7
       
  1314     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
       
  1315     __ delayed()->add(from, count, end_from);
       
  1316 
       
  1317     {
       
  1318       // Align the ends of the arrays since they may be unaligned even

  1319       // when the arrays themselves are aligned.
       
  1320 
       
  1321       // copy bytes to align 'end_to' on 8 byte boundary
       
  1322       __ andcc(end_to, 7, G1); // misaligned bytes
       
  1323       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
       
  1324       __ delayed()->nop();
       
  1325       __ sub(count, G1, count);
       
  1326     __ BIND(L_align);
       
  1327       __ dec(end_from);
       
  1328       __ dec(end_to);
       
  1329       __ ldub(end_from, 0, O3);
       
  1330       __ deccc(G1);
       
  1331       __ brx(Assembler::notZero, false, Assembler::pt, L_align);
       
  1332       __ delayed()->stb(O3, end_to, 0);
       
  1333     __ BIND(L_skip_alignment);
       
  1334     }
       
  1335     if (aligned) {
       
  1336       // Both arrays are aligned to 8-bytes in 64-bits VM.
       
  1337       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
       
  1338       // in unaligned case.
       
  1339       __ dec(count, 16);
       
  1340     } else {
       
  1341       // Copy with shift 16 bytes per iteration if arrays do not have
       
  1342       // the same alignment mod 8, otherwise jump to the next
       
  1343       // code for aligned copy (subtracting 16 from 'count' before the jump).

  1344       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
       
  1345       // Also jump over aligned copy after the copy with shift completed.
       
  1346 
       
  1347       copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
       
  1348                                         L_aligned_copy, L_copy_byte);
       
  1349     }
       
  1350     // copy 16 elements (16 bytes) at a time
       
  1351       __ align(OptoLoopAlignment);
       
  1352     __ BIND(L_aligned_copy);
       
  1353       __ dec(end_from, 16);
       
  1354       __ ldx(end_from, 8, O3);
       
  1355       __ ldx(end_from, 0, O4);
       
  1356       __ dec(end_to, 16);
       
  1357       __ deccc(count, 16);
       
  1358       __ stx(O3, end_to, 8);
       
  1359       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
       
  1360       __ delayed()->stx(O4, end_to, 0);
       
  1361       __ inc(count, 16);
       
  1362 
       
  1363     // copy 1 element (1 byte) at a time
       
  1364     __ BIND(L_copy_byte);
       
  1365       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       
  1366       __ align(OptoLoopAlignment);
       
  1367     __ BIND(L_copy_byte_loop);
       
  1368       __ dec(end_from);
       
  1369       __ dec(end_to);
       
  1370       __ ldub(end_from, 0, O4);
       
  1371       __ deccc(count);
       
  1372       __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
       
  1373       __ delayed()->stb(O4, end_to, 0);
       
  1374 
       
  1375     __ BIND(L_exit);
       
  1376     // O3, O4 are used as temp registers
       
  1377     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
       
  1378     __ retl();
       
  1379     __ delayed()->mov(G0, O0); // return 0
       
  1380     return start;
       
  1381   }
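
         // The conjoint variant above runs the same stages in reverse from the
         // array ends, once array_overlap_test has sent non-overlapping calls
         // to the disjoint stub (illustrative pseudocode):
         //
         //   if (count < 23) goto copy_byte;     // short arrays: byte loop only
         //   copy bytes down until 'end_to' is 8-byte aligned;
         //   if alignments differ mod 8: backward copy with shifts;
         //   else:                       backward copy of 16-byte chunks;
         //   copy_byte: copy the remaining bytes backward one at a time;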
       
  1382 
       
  1383   //
       
  1384   //  Generate stub for disjoint short copy.  If "aligned" is true, the
       
  1385   //  "from" and "to" addresses are assumed to be heapword aligned.
       
  1386   //
       
  1387   // Arguments for generated stub:
       
  1388   //      from:  O0
       
  1389   //      to:    O1
       
  1390   //      count: O2 treated as signed
       
  1391   //
       
  1392   address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
       
  1393     __ align(CodeEntryAlignment);
       
  1394     StubCodeMark mark(this, "StubRoutines", name);
       
  1395     address start = __ pc();
       
  1396 
       
  1397     Label L_skip_alignment, L_skip_alignment2;
       
  1398     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
       
  1399 
       
  1400     const Register from      = O0;   // source array address
       
  1401     const Register to        = O1;   // destination array address
       
  1402     const Register count     = O2;   // elements count
       
  1403     const Register offset    = O5;   // offset from start of arrays
       
  1404     // O3, O4, G3, G4 are used as temp registers
       
  1405 
       
  1406     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
       
  1407 
       
  1408     if (entry != NULL) {
       
  1409       *entry = __ pc();
       
  1410       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
       
  1411       BLOCK_COMMENT("Entry:");
       
  1412     }
       
  1413 
       
  1414     // for short arrays, just do single element copy
       
  1415     __ cmp(count, 11); // 8 + 3  (22 bytes)
       
  1416     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
       
  1417     __ delayed()->mov(G0, offset);
       
  1418 
       
  1419     if (aligned) {
       
  1420       // 'aligned' == true when it is known statically during compilation
       
  1421       // of this arraycopy call site that both 'from' and 'to' addresses
       
  1422       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
       
  1423       //
       
  1424       // Aligned arrays have 4-byte alignment in the 32-bit VM

  1425       // and 8-byte alignment in the 64-bit VM.
       
  1426       //
       
  1427     } else {
       
  1428       // copy 1 element if necessary to align 'to' on a 4-byte boundary
       
  1429       __ andcc(to, 3, G0);
       
  1430       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
       
  1431       __ delayed()->lduh(from, 0, O3);
       
  1432       __ inc(from, 2);
       
  1433       __ inc(to, 2);
       
  1434       __ dec(count);
       
  1435       __ sth(O3, to, -2);
       
  1436     __ BIND(L_skip_alignment);
       
  1437 
       
  1438       // copy 2 elements to align 'to' on an 8 byte boundary
       
  1439       __ andcc(to, 7, G0);
       
  1440       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
       
  1441       __ delayed()->lduh(from, 0, O3);
       
  1442       __ dec(count, 2);
       
  1443       __ lduh(from, 2, O4);
       
  1444       __ inc(from, 4);
       
  1445       __ inc(to, 4);
       
  1446       __ sth(O3, to, -4);
       
  1447       __ sth(O4, to, -2);
       
  1448     __ BIND(L_skip_alignment2);
       
  1449     }
       
  1450     if (!aligned) {
       
  1451       // Copy with shift 16 bytes per iteration if arrays do not have
       
  1452       // the same alignment mod 8, otherwise fall through to the next
       
  1453       // code for aligned copy.
       
  1454       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
       
  1455       // Also jump over aligned copy after the copy with shift completed.
       
  1456 
       
  1457       copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
       
  1458     }
       
  1459 
       
  1460     // Both arrays are 8-byte aligned; copy 16 bytes at a time
       
  1461       __ and3(count, 3, G4); // Save
       
  1462       __ srl(count, 2, count);
       
  1463      generate_disjoint_long_copy_core(aligned);
       
  1464       __ mov(G4, count); // restore
       
  1465 
       
  1466     // copy 1 element at a time
       
  1467     __ BIND(L_copy_2_bytes);
       
  1468       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       
  1469       __ align(OptoLoopAlignment);
       
  1470     __ BIND(L_copy_2_bytes_loop);
       
  1471       __ lduh(from, offset, O3);
       
  1472       __ deccc(count);
       
  1473       __ sth(O3, to, offset);
       
  1474       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
       
  1475       __ delayed()->inc(offset, 2);
       
  1476 
       
  1477     __ BIND(L_exit);
       
  1478       // O3, O4 are used as temp registers
       
  1479       inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
       
  1480       __ retl();
       
  1481       __ delayed()->mov(G0, O0); // return 0
       
  1482     return start;
       
  1483   }
       
  1484 
       
  1485   //
       
  1486   //  Generate stub for disjoint short fill.  If "aligned" is true, the
       
  1487   //  "to" address is assumed to be heapword aligned.
       
  1488   //
       
  1489   // Arguments for generated stub:
       
  1490   //      to:    O0
       
  1491   //      value: O1
       
  1492   //      count: O2 treated as signed
       
  1493   //
       
  1494   address generate_fill(BasicType t, bool aligned, const char* name) {
       
  1495     __ align(CodeEntryAlignment);
       
  1496     StubCodeMark mark(this, "StubRoutines", name);
       
  1497     address start = __ pc();
       
  1498 
       
  1499     const Register to        = O0;   // destination array address
       
  1500     const Register value     = O1;   // fill value
       
  1501     const Register count     = O2;   // elements count
       
  1502     // O3 is used as a temp register
       
  1503 
       
  1504     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
       
  1505 
       
  1506     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
       
  1507     Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
       
  1508 
       
  1509     int shift = -1;
       
  1510     switch (t) {

  1511       case T_BYTE:

  1512         shift = 2;

  1513         break;

  1514       case T_SHORT:

  1515         shift = 1;

  1516         break;

  1517       case T_INT:

  1518         shift = 0;

  1519         break;

  1520       default: ShouldNotReachHere();

  1521     }
       
  1522 
       
  1523     BLOCK_COMMENT("Entry:");
       
  1524 
       
  1525     if (t == T_BYTE) {
       
  1526       // Zero-extend the byte and spread it into a 16-bit pattern
       
  1527       __ and3(value, 0xff, value);
       
  1528       __ sllx(value, 8, O3);
       
  1529       __ or3(value, O3, value);
       
  1530     }
       
  1531     if (t == T_SHORT) {
       
  1532       // Zero extend value
       
  1533       __ sllx(value, 48, value);
       
  1534       __ srlx(value, 48, value);
       
  1535     }
       
  1536     if (t == T_BYTE || t == T_SHORT) {
       
  1537       __ sllx(value, 16, O3);
       
  1538       __ or3(value, O3, value);
       
  1539     }
       
  1540 
       
  1541     __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
       
  1542     __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
       
  1543     __ delayed()->andcc(count, 1, G0);
       
  1544 
       
  1545     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
       
  1546       // align destination address at a 4-byte boundary
       
  1547       if (t == T_BYTE) {
       
  1548         // One byte misalignment happens only for byte arrays
       
  1549         __ andcc(to, 1, G0);
       
  1550         __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
       
  1551         __ delayed()->nop();
       
  1552         __ stb(value, to, 0);
       
  1553         __ inc(to, 1);
       
  1554         __ dec(count, 1);
       
  1555         __ BIND(L_skip_align1);
       
  1556       }
       
  1557       // Two bytes misalignment happens only for byte and short (char) arrays
       
  1558       __ andcc(to, 2, G0);
       
  1559       __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
       
  1560       __ delayed()->nop();
       
  1561       __ sth(value, to, 0);
       
  1562       __ inc(to, 2);
       
  1563       __ dec(count, 1 << (shift - 1));
       
  1564       __ BIND(L_skip_align2);
       
  1565     }
       
  1566     if (!aligned) {
       
  1567       // align to 8 bytes; we know we are 4-byte aligned to start
       
  1568       __ andcc(to, 7, G0);
       
  1569       __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
       
  1570       __ delayed()->nop();
       
  1571       __ stw(value, to, 0);
       
  1572       __ inc(to, 4);
       
  1573       __ dec(count, 1 << shift);
       
  1574       __ BIND(L_fill_32_bytes);
       
  1575     }
       
  1576 
       
  1577     if (t == T_INT) {
       
  1578       // Zero extend value
       
  1579       __ srl(value, 0, value);
       
  1580     }
       
  1581     if (t == T_BYTE || t == T_SHORT || t == T_INT) {
       
  1582       __ sllx(value, 32, O3);
       
  1583       __ or3(value, O3, value);
       
  1584     }
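
         // At this point 'value' holds the fill pattern replicated across all
         // 64 bits; e.g. for T_BYTE, 0x5A becomes 0x5A5A5A5A5A5A5A5A.  The
         // combined effect of the replication steps is (illustrative):
         //
         //   v &= 0xff;        // T_BYTE only
         //   v |= v << 8;      // T_BYTE only: 16-bit pattern
         //   v |= v << 16;     // T_BYTE and T_SHORT: 32-bit pattern
         //   v |= v << 32;     // all types: 64-bit pattern
         //
         //   (T_SHORT is first zero-extended to 16 bits, T_INT to 32 bits.)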
       
  1585 
       
  1586     Label L_check_fill_8_bytes;
       
  1587     // Fill 32-byte chunks
       
  1588     __ subcc(count, 8 << shift, count);
       
  1589     __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
       
  1590     __ delayed()->nop();
       
  1591 
       
  1592     Label L_fill_32_bytes_loop, L_fill_4_bytes;
       
  1593     __ align(16);
       
  1594     __ BIND(L_fill_32_bytes_loop);
       
  1595 
       
  1596     __ stx(value, to, 0);
       
  1597     __ stx(value, to, 8);
       
  1598     __ stx(value, to, 16);
       
  1599     __ stx(value, to, 24);
       
  1600 
       
  1601     __ subcc(count, 8 << shift, count);
       
  1602     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
       
  1603     __ delayed()->add(to, 32, to);
       
  1604 
       
  1605     __ BIND(L_check_fill_8_bytes);
       
  1606     __ addcc(count, 8 << shift, count);
       
  1607     __ brx(Assembler::zero, false, Assembler::pn, L_exit);
       
  1608     __ delayed()->subcc(count, 1 << (shift + 1), count);
       
  1609     __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
       
  1610     __ delayed()->andcc(count, 1<<shift, G0);
       
  1611 
       
  1612     //
       
  1613     // length is too short, just fill 8 bytes at a time
       
  1614     //
       
  1615     Label L_fill_8_bytes_loop;
       
  1616     __ BIND(L_fill_8_bytes_loop);
       
  1617     __ stx(value, to, 0);
       
  1618     __ subcc(count, 1 << (shift + 1), count);
       
  1619     __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
       
  1620     __ delayed()->add(to, 8, to);
       
  1621 
       
  1622     // fill trailing 4 bytes
       
  1623     __ andcc(count, 1<<shift, G0);  // in delay slot of branches
       
  1624     if (t == T_INT) {
       
  1625       __ BIND(L_fill_elements);
       
  1626     }
       
  1627     __ BIND(L_fill_4_bytes);
       
  1628     __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
       
  1629     if (t == T_BYTE || t == T_SHORT) {
       
  1630       __ delayed()->andcc(count, 1<<(shift-1), G0);
       
  1631     } else {
       
  1632       __ delayed()->nop();
       
  1633     }
       
  1634     __ stw(value, to, 0);
       
  1635     if (t == T_BYTE || t == T_SHORT) {
       
  1636       __ inc(to, 4);
       
  1637       // fill trailing 2 bytes
       
  1638       __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
       
  1639       __ BIND(L_fill_2_bytes);
       
  1640       __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
       
  1641       __ delayed()->andcc(count, 1, count);
       
  1642       __ sth(value, to, 0);
       
  1643       if (t == T_BYTE) {
       
  1644         __ inc(to, 2);
       
  1645         // fill trailing byte
       
  1646         __ andcc(count, 1, count);  // in delay slot of branches
       
  1647         __ BIND(L_fill_byte);
       
  1648         __ brx(Assembler::zero, false, Assembler::pt, L_exit);
       
  1649         __ delayed()->nop();
       
  1650         __ stb(value, to, 0);
       
  1651       } else {
       
  1652         __ BIND(L_fill_byte);
       
  1653       }
       
  1654     } else {
       
  1655       __ BIND(L_fill_2_bytes);
       
  1656     }
       
  1657     __ BIND(L_exit);
       
  1658     __ retl();
       
  1659     __ delayed()->nop();
       
  1660 
       
  1661     // Handle fills of less than 8 bytes.  Int is handled elsewhere.
       
  1662     if (t == T_BYTE) {
       
  1663       __ BIND(L_fill_elements);
       
  1664       Label L_fill_2, L_fill_4;
       
  1665       // in delay slot __ andcc(count, 1, G0);
       
  1666       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
       
  1667       __ delayed()->andcc(count, 2, G0);
       
  1668       __ stb(value, to, 0);
       
  1669       __ inc(to, 1);
       
  1670       __ BIND(L_fill_2);
       
  1671       __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
       
  1672       __ delayed()->andcc(count, 4, G0);
       
  1673       __ stb(value, to, 0);
       
  1674       __ stb(value, to, 1);
       
  1675       __ inc(to, 2);
       
  1676       __ BIND(L_fill_4);
       
  1677       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
       
  1678       __ delayed()->nop();
       
  1679       __ stb(value, to, 0);
       
  1680       __ stb(value, to, 1);
       
  1681       __ stb(value, to, 2);
       
  1682       __ retl();
       
  1683       __ delayed()->stb(value, to, 3);
       
  1684     }
       
  1685 
       
  1686     if (t == T_SHORT) {
       
  1687       Label L_fill_2;
       
  1688       __ BIND(L_fill_elements);
       
  1689       // in delay slot __ andcc(count, 1, G0);
       
  1690       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
       
  1691       __ delayed()->andcc(count, 2, G0);
       
  1692       __ sth(value, to, 0);
       
  1693       __ inc(to, 2);
       
  1694       __ BIND(L_fill_2);
       
  1695       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
       
  1696       __ delayed()->nop();
       
  1697       __ sth(value, to, 0);
       
  1698       __ retl();
       
  1699       __ delayed()->sth(value, to, 2);
       
  1700     }
       
  1701     return start;
       
  1702   }
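
         // Illustrative pseudocode for the main fill path above ('shift' is
         // chosen so that 8 << shift elements always equals 32 bytes):
         //
         //   while (count >= (8 << shift)) {       // 32 bytes per iteration
         //     store8(to) = v;       store8(to + 8) = v;
         //     store8(to + 16) = v;  store8(to + 24) = v;
         //     to += 32;  count -= 8 << shift;
         //   }
         //   then 8-, 4-, 2- and 1-byte tail stores as the remaining count requires.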
       
  1703 
       
  1704   //
       
  1705   //  Generate stub for conjoint short copy.  If "aligned" is true, the
       
  1706   //  "from" and "to" addresses are assumed to be heapword aligned.
       
  1707   //
       
  1708   // Arguments for generated stub:
       
  1709   //      from:  O0
       
  1710   //      to:    O1
       
  1711   //      count: O2 treated as signed
       
  1712   //
       
  1713   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
       
  1714                                        address *entry, const char *name) {
       
  1715     // Do reverse copy.
       
  1716 
       
  1717     __ align(CodeEntryAlignment);
       
  1718     StubCodeMark mark(this, "StubRoutines", name);
       
  1719     address start = __ pc();
       
  1720 
       
  1721     Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
       
  1722     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
       
  1723 
       
  1724     const Register from      = O0;   // source array address
       
  1725     const Register to        = O1;   // destination array address
       
  1726     const Register count     = O2;   // elements count
       
  1727     const Register end_from  = from; // source array end address
       
  1728     const Register end_to    = to;   // destination array end address
       
  1729 
       
  1730     const Register byte_count = O3;  // byte count to copy
       
  1731 
       
  1732     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
       
  1733 
       
  1734     if (entry != NULL) {
       
  1735       *entry = __ pc();
       
  1736       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
       
  1737       BLOCK_COMMENT("Entry:");
       
  1738     }
       
  1739 
       
  1740     array_overlap_test(nooverlap_target, 1);
       
  1741 
       
  1742     __ sllx(count, LogBytesPerShort, byte_count);
       
  1743     __ add(to, byte_count, end_to);  // offset after last copied element
       
  1744 
       
  1745     // for short arrays, just do single element copy
       
  1746     __ cmp(count, 11); // 8 + 3  (22 bytes)
       
  1747     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
       
  1748     __ delayed()->add(from, byte_count, end_from);
       
  1749 
       
  1750     {
       
  1751       // Align the ends of the arrays since they may be unaligned even

  1752       // when the arrays themselves are aligned.
       
  1753 
       
  1754       // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
       
  1755       __ andcc(end_to, 3, G0);
       
  1756       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
       
  1757       __ delayed()->lduh(end_from, -2, O3);
       
  1758       __ dec(end_from, 2);
       
  1759       __ dec(end_to, 2);
       
  1760       __ dec(count);
       
  1761       __ sth(O3, end_to, 0);
       
  1762     __ BIND(L_skip_alignment);
       
  1763 
       
  1764       // copy 2 elements to align 'end_to' on an 8 byte boundary
       
  1765       __ andcc(end_to, 7, G0);
       
  1766       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
       
  1767       __ delayed()->lduh(end_from, -2, O3);
       
  1768       __ dec(count, 2);
       
  1769       __ lduh(end_from, -4, O4);
       
  1770       __ dec(end_from, 4);
       
  1771       __ dec(end_to, 4);
       
  1772       __ sth(O3, end_to, 2);
       
  1773       __ sth(O4, end_to, 0);
       
  1774     __ BIND(L_skip_alignment2);
       
  1775     }
       
  1776     if (aligned) {
       
  1777       // Both arrays are aligned to 8-bytes in 64-bits VM.
       
  1778       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
       
  1779       // in unaligned case.
       
  1780       __ dec(count, 8);
       
  1781     } else {
       
  1782       // Copy with shift 16 bytes per iteration if arrays do not have
       
  1783       // the same alignment mod 8, otherwise jump to the next
       
  1784       // code for aligned copy (subtracting 8 from 'count' before the jump).

  1785       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
       
  1786       // Also jump over aligned copy after the copy with shift completed.
       
  1787 
       
  1788       copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
       
  1789                                         L_aligned_copy, L_copy_2_bytes);
       
  1790     }
       
  1791     // copy 8 elements (16 bytes) at a time
       
  1792       __ align(OptoLoopAlignment);
       
  1793     __ BIND(L_aligned_copy);
       
  1794       __ dec(end_from, 16);
       
  1795       __ ldx(end_from, 8, O3);
       
  1796       __ ldx(end_from, 0, O4);
       
  1797       __ dec(end_to, 16);
       
  1798       __ deccc(count, 8);
       
  1799       __ stx(O3, end_to, 8);
       
  1800       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
       
  1801       __ delayed()->stx(O4, end_to, 0);
       
  1802       __ inc(count, 8);
       
  1803 
       
  1804     // copy 1 element (2 bytes) at a time
       
  1805     __ BIND(L_copy_2_bytes);
       
  1806       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       
  1807     __ BIND(L_copy_2_bytes_loop);
       
  1808       __ dec(end_from, 2);
       
  1809       __ dec(end_to, 2);
       
  1810       __ lduh(end_from, 0, O4);
       
  1811       __ deccc(count);
       
  1812       __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
       
  1813       __ delayed()->sth(O4, end_to, 0);
       
  1814 
       
  1815     __ BIND(L_exit);
       
  1816     // O3, O4 are used as temp registers
       
  1817     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
       
  1818     __ retl();
       
  1819     __ delayed()->mov(G0, O0); // return 0
       
  1820     return start;
       
  1821   }
       
  1822 
       
  1823   //
       
  1824   // Helper method for generate_disjoint_int_copy_core()
       
  1825   //
       
  1826   void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
       
  1827                           Label& L_loop, bool use_prefetch, bool use_bis) {
       
  1828 
       
  1829     __ align(OptoLoopAlignment);
       
  1830     __ BIND(L_loop);
       
  1831     if (use_prefetch) {
       
  1832       if (ArraycopySrcPrefetchDistance > 0) {
       
  1833         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
       
  1834       }
       
  1835       if (ArraycopyDstPrefetchDistance > 0) {
       
  1836         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
       
  1837       }
       
  1838     }
       
  1839     __ ldx(from, 4, O4);
       
  1840     __ ldx(from, 12, G4);
       
  1841     __ inc(to, 16);
       
  1842     __ inc(from, 16);
       
  1843     __ deccc(count, 4); // Can we do next iteration after this one?
       
  1844 
       
  1845     __ srlx(O4, 32, G3);
       
  1846     __ bset(G3, O3);
       
  1847     __ sllx(O4, 32, O4);
       
  1848     __ srlx(G4, 32, G3);
       
  1849     __ bset(G3, O4);
       
  1850     if (use_bis) {
       
  1851       __ stxa(O3, to, -16);
       
  1852       __ stxa(O4, to, -8);
       
  1853     } else {
       
  1854       __ stx(O3, to, -16);
       
  1855       __ stx(O4, to, -8);
       
  1856     }
       
  1857     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
       
  1858     __ delayed()->sllx(G4, 32,  O3);
       
  1859 
       
  1860   }
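
         // Illustrative sketch (not generated code): with 'to' 8-byte aligned
         // and 'from' offset by 4 mod 8, the loop above merges halves of
         // aligned 8-byte loads using 32-bit shifts ('prev' is the previous
         // load; only its low 32 bits survive the shift.  With use_bis, stxa
         // issues block-initializing stores instead of plain stx):
         //
         //   a = load8(from + 4);  b = load8(from + 12);
         //   store8(to,     (prev << 32) | (a >> 32));
         //   store8(to + 8, (a    << 32) | (b >> 32));
         //   prev = b;  from += 16;  to += 16;  count -= 4;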
       
  1861 
       
  1862   //
       
  1863   //  Generate core code for disjoint int copy (and oop copy on 32-bit).
       
  1864   //  If "aligned" is true, the "from" and "to" addresses are assumed
       
  1865   //  to be heapword aligned.
       
  1866   //
       
  1867   // Arguments:
       
  1868   //      from:  O0
       
  1869   //      to:    O1
       
  1870   //      count: O2 treated as signed
       
  1871   //
       
  1872   void generate_disjoint_int_copy_core(bool aligned) {
       
  1873 
       
  1874     Label L_skip_alignment, L_aligned_copy;
       
  1875     Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
       
  1876 
       
  1877     const Register from      = O0;   // source array address
       
  1878     const Register to        = O1;   // destination array address
       
  1879     const Register count     = O2;   // elements count
       
  1880     const Register offset    = O5;   // offset from start of arrays
       
  1881     // O3, O4, G3, G4 are used as temp registers
       
  1882 
       
  1883     // 'aligned' == true when it is known statically during compilation
       
  1884     // of this arraycopy call site that both 'from' and 'to' addresses
       
  1885     // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
       
  1886     //
       
  1887     // Aligned arrays have 4-byte alignment in the 32-bit VM

  1888     // and 8-byte alignment in the 64-bit VM.
       
  1889     //
       
  1890     if (!aligned) {
       
  1891       // The next check could be put under 'ifndef' since the code in

  1892       // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
       
  1893 
       
  1894       // for short arrays, just do single element copy
       
  1895       __ cmp(count, 5); // 4 + 1 (20 bytes)
       
  1896       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
       
  1897       __ delayed()->mov(G0, offset);
       
  1898 
       
  1899       // copy 1 element to align 'to' on an 8 byte boundary
       
  1900       __ andcc(to, 7, G0);
       
  1901       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
       
  1902       __ delayed()->ld(from, 0, O3);
       
  1903       __ inc(from, 4);
       
  1904       __ inc(to, 4);
       
  1905       __ dec(count);
       
  1906       __ st(O3, to, -4);
       
  1907     __ BIND(L_skip_alignment);
       
  1908 
       
  1909     // if arrays have the same alignment mod 8, copy 4 elements at a time
       
  1910       __ andcc(from, 7, G0);
       
  1911       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
       
  1912       __ delayed()->ld(from, 0, O3);
       
  1913 
       
  1914     //
       
  1915     // Load two aligned 8-byte chunks and use one from the previous iteration

  1916     // to form two aligned 8-byte chunks to store.
       
  1917     //
       
  1918     // copy_16_bytes_forward_with_shift() is not used here because this

  1919     // code is more efficient.
       
  1920 
       
  1921     // copy with shift 4 elements (16 bytes) at a time
       
  1922       __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
       
  1923       __ sllx(O3, 32,  O3);
       
  1924 
       
  1925       disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop);
       
  1926 
       
  1927       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
       
  1928       __ delayed()->inc(count, 4); // restore 'count'
       
  1929 
       
  1930     __ BIND(L_aligned_copy);
       
  1931     } // !aligned
       
  1932 
       
  1933     // copy 4 elements (16 bytes) at a time
       
  1934       __ and3(count, 1, G4); // Save
       
  1935       __ srl(count, 1, count);
       
  1936      generate_disjoint_long_copy_core(aligned);
       
  1937       __ mov(G4, count);     // Restore
       
  1938 
       
  1939     // copy 1 element at a time
       
  1940     __ BIND(L_copy_4_bytes);
       
  1941       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       
  1942     __ BIND(L_copy_4_bytes_loop);
       
  1943       __ ld(from, offset, O3);
       
  1944       __ deccc(count);
       
  1945       __ st(O3, to, offset);
       
  1946       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
       
  1947       __ delayed()->inc(offset, 4);
       
  1948     __ BIND(L_exit);
       
  1949   }
       
  1950 
       
  1951   //
       
  1952   //  Generate stub for disjoint int copy.  If "aligned" is true, the
       
  1953   //  "from" and "to" addresses are assumed to be heapword aligned.
       
  1954   //
       
  1955   // Arguments for generated stub:
       
  1956   //      from:  O0
       
  1957   //      to:    O1
       
  1958   //      count: O2 treated as signed
       
  1959   //
       
  1960   address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
       
  1961     __ align(CodeEntryAlignment);
       
  1962     StubCodeMark mark(this, "StubRoutines", name);
       
  1963     address start = __ pc();
       
  1964 
       
  1965     const Register count = O2;
       
  1966     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
       
  1967 
       
  1968     if (entry != NULL) {
       
  1969       *entry = __ pc();
       
  1970       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
       
  1971       BLOCK_COMMENT("Entry:");
       
  1972     }
       
  1973 
       
  1974     generate_disjoint_int_copy_core(aligned);
       
  1975 
       
  1976     // O3, O4 are used as temp registers
       
  1977     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
       
  1978     __ retl();
       
  1979     __ delayed()->mov(G0, O0); // return 0
       
  1980     return start;
       
  1981   }
       
  1982 
       
  1983   //
       
  1984   //  Generate core code for conjoint int copy (and oop copy on 32-bit).
       
  1985   //  If "aligned" is true, the "from" and "to" addresses are assumed
       
  1986   //  to be heapword aligned.
       
  1987   //
       
  1988   // Arguments:
       
  1989   //      from:  O0
       
  1990   //      to:    O1
       
  1991   //      count: O2 treated as signed
       
  1992   //
       
  1993   void generate_conjoint_int_copy_core(bool aligned) {
       
  1994     // Do reverse copy.
       
  1995 
       
  1996     Label L_skip_alignment, L_aligned_copy;
       
  1997     Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
       
  1998 
       
  1999     const Register from      = O0;   // source array address
       
  2000     const Register to        = O1;   // destination array address
       
  2001     const Register count     = O2;   // elements count
       
  2002     const Register end_from  = from; // source array end address
       
  2003     const Register end_to    = to;   // destination array end address
       
  2004     // O3, O4, O5, G3 are used as temp registers
       
  2005 
       
  2006     const Register byte_count = O3;  // byte count to copy
       
  2007 
       
  2008       __ sllx(count, LogBytesPerInt, byte_count);
       
  2009       __ add(to, byte_count, end_to); // offset after last copied element
       
  2010 
       
  2011       __ cmp(count, 5); // for short arrays, just do single element copy
       
  2012       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
       
  2013       __ delayed()->add(from, byte_count, end_from);
       
  2014 
       
  2015     // copy 1 element to align 'end_to' on an 8-byte boundary
       
  2016       __ andcc(end_to, 7, G0);
       
  2017       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
       
  2018       __ delayed()->nop();
       
  2019       __ dec(count);
       
  2020       __ dec(end_from, 4);
       
  2021       __ dec(end_to,   4);
       
  2022       __ ld(end_from, 0, O4);
       
  2023       __ st(O4, end_to, 0);
       
  2024     __ BIND(L_skip_alignment);
       
  2025 
       
  2026     // Check if 'end_from' and 'end_to' have the same alignment.
       
  2027       __ andcc(end_from, 7, G0);
       
  2028       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
       
  2029       __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
       
  2030 
       
  2031     // copy with shift 4 elements (16 bytes) at a time
       
  2032     //
       
  2033     // Load two aligned 8-byte chunks and use one from the previous iteration

  2034     // to form two aligned 8-byte chunks to store.
       
  2035     //
       
  2036       __ ldx(end_from, -4, O3);
       
  2037       __ align(OptoLoopAlignment);
       
  2038     __ BIND(L_copy_16_bytes);
       
  2039       __ ldx(end_from, -12, O4);
       
  2040       __ deccc(count, 4);
       
  2041       __ ldx(end_from, -20, O5);
       
  2042       __ dec(end_to, 16);
       
  2043       __ dec(end_from, 16);
       
  2044       __ srlx(O3, 32, O3);
       
  2045       __ sllx(O4, 32, G3);
       
  2046       __ bset(G3, O3);
       
  2047       __ stx(O3, end_to, 8);
       
  2048       __ srlx(O4, 32, O4);
       
  2049       __ sllx(O5, 32, G3);
       
  2050       __ bset(O4, G3);
       
  2051       __ stx(G3, end_to, 0);
       
  2052       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
       
  2053       __ delayed()->mov(O5, O3);
       
  2054 
       
  2055       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
       
  2056       __ delayed()->inc(count, 4);
       
  2057 
       
  2058     // copy 4 elements (16 bytes) at a time
       
  2059       __ align(OptoLoopAlignment);
       
  2060     __ BIND(L_aligned_copy);
       
  2061       __ dec(end_from, 16);
       
  2062       __ ldx(end_from, 8, O3);
       
  2063       __ ldx(end_from, 0, O4);
       
  2064       __ dec(end_to, 16);
       
  2065       __ deccc(count, 4);
       
  2066       __ stx(O3, end_to, 8);
       
  2067       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
       
  2068       __ delayed()->stx(O4, end_to, 0);
       
  2069       __ inc(count, 4);
       
  2070 
       
  2071     // copy 1 element (4 bytes) at a time
       
  2072     __ BIND(L_copy_4_bytes);
       
  2073       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
       
  2074     __ BIND(L_copy_4_bytes_loop);
       
  2075       __ dec(end_from, 4);
       
  2076       __ dec(end_to, 4);
       
  2077       __ ld(end_from, 0, O4);
       
  2078       __ deccc(count);
       
  2079       __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
       
  2080       __ delayed()->st(O4, end_to, 0);
       
  2081     __ BIND(L_exit);
       
  2082   }
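
         // Illustrative pseudocode for the shifted backward loop above ('prev'
         // is the aligned 8-byte chunk at the highest remaining source address,
         // which sits at 'end_from - 4' on entry):
         //
         //   a = load8(end_from - 12);  b = load8(end_from - 20);
         //   store8(end_to - 8,  (prev >> 32) | (a << 32));
         //   store8(end_to - 16, (a    >> 32) | (b << 32));
         //   prev = b;  end_from -= 16;  end_to -= 16;  count -= 4;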
       
  2083 
       
  2084   //
       
  2085   //  Generate stub for conjoint int copy.  If "aligned" is true, the
       
  2086   //  "from" and "to" addresses are assumed to be heapword aligned.
       
  2087   //
       
  2088   // Arguments for generated stub:
       
  2089   //      from:  O0
       
  2090   //      to:    O1
       
  2091   //      count: O2 treated as signed
       
  2092   //
       
  2093   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
       
  2094                                      address *entry, const char *name) {
       
  2095     __ align(CodeEntryAlignment);
       
  2096     StubCodeMark mark(this, "StubRoutines", name);
       
  2097     address start = __ pc();
       
  2098 
       
  2099     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
       
  2100 
       
  2101     if (entry != NULL) {
       
  2102       *entry = __ pc();
       
  2103       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
       
  2104       BLOCK_COMMENT("Entry:");
       
  2105     }
       
  2106 
       
  2107     array_overlap_test(nooverlap_target, 2);
       
  2108 
       
  2109     generate_conjoint_int_copy_core(aligned);
       
  2110 
       
  2111     // O3, O4 are used as temp registers
       
  2112     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
       
  2113     __ retl();
       
  2114     __ delayed()->mov(G0, O0); // return 0
       
  2115     return start;
       
  2116   }
       
  2117 
       
  2118   //
       
  2119   // Helper method for generate_disjoint_long_copy_core()
       
  2120   //
       
  2121   void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
       
  2122                           Label& L_loop, bool use_prefetch, bool use_bis) {
       
  2123     __ align(OptoLoopAlignment);
       
  2124     __ BIND(L_loop);
       
  2125     for (int off = 0; off < 64; off += 16) {
       
  2126       if (use_prefetch && (off & 31) == 0) {
       
  2127         if (ArraycopySrcPrefetchDistance > 0) {
       
  2128           __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
       
  2129         }
       
  2130         if (ArraycopyDstPrefetchDistance > 0) {
       
  2131           __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
       
  2132         }
       
  2133       }
       
  2134       __ ldx(from,  off+0, O4);
       
  2135       __ ldx(from,  off+8, O5);
       
  2136       if (use_bis) {
       
  2137         __ stxa(O4, to,  off+0);
       
  2138         __ stxa(O5, to,  off+8);
       
  2139       } else {
       
  2140         __ stx(O4, to,  off+0);
       
  2141         __ stx(O5, to,  off+8);
       
  2142       }
       
  2143     }
       
  2144     __ deccc(count, 8);
       
  2145     __ inc(from, 64);
       
  2146     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
       
  2147     __ delayed()->inc(to, 64);
       
  2148   }
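
         // Illustrative per-iteration effect of the loop above (prefetches are
         // issued every 32 bytes when use_prefetch is set; use_bis selects
         // stxa block-initializing stores instead of plain stx):
         //
         //   for (off = 0; off < 64; off += 16) {
         //     store8(to + off,     load8(from + off));
         //     store8(to + off + 8, load8(from + off + 8));
         //   }
         //   count -= 8;  from += 64;  to += 64;     // 8 longs per iteration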
       
  2149 
       
  2150   //
       
  2151   //  Generate core code for disjoint long copy (and oop copy on 64-bit).
       
  2152   //  "aligned" is ignored, because we must make the stronger
       
  2153   //  assumption that both addresses are always 64-bit aligned.
       
  2154   //
       
  2155   // Arguments:
       
  2156   //      from:  O0
       
  2157   //      to:    O1
       
  2158   //      count: O2 treated as signed
       
  2159   //
       
  2160   // count -= 2;
       
  2161   // if ( count >= 0 ) { // >= 2 elements
       
  2162   //   if ( count >= 6) { // >= 8 elements
       
  2163   //     count -= 6; // original count - 8
       
  2164   //     do {
       
  2165   //       copy_8_elements;
       
  2166   //       count -= 8;
       
  2167   //     } while ( count >= 0 );
       
  2168   //     count += 6;
       
  2169   //   }
       
  2170   //   if ( count >= 0 ) { // >= 2 elements
       
  2171   //     do {
       
  2172   //       copy_2_elements;
       
  2173   //     } while ( (count=count-2) >= 0 );
       
  2174   //   }
       
  2175   // }
       
  2176   // count += 2;
       
  2177   // if ( count != 0 ) { // 1 element left
       
  2178   //   copy_1_element;
       
  2179   // }
       
  2180   //
       
  2181   void generate_disjoint_long_copy_core(bool aligned) {
       
  2182     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
       
  2183     const Register from    = O0;  // source array address
       
  2184     const Register to      = O1;  // destination array address
       
  2185     const Register count   = O2;  // elements count
       
  2186     const Register offset0 = O4;  // element offset
       
  2187     const Register offset8 = O5;  // next element offset
       
  2188 
       
  2189     __ deccc(count, 2);
       
  2190     __ mov(G0, offset0);   // offset from start of arrays (0)
       
  2191     __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
       
  2192     __ delayed()->add(offset0, 8, offset8);
       
  2193 
       
  2194     // Copy in 64-byte chunks
       
  2195 
       
  2196     const Register from64 = O3;  // source address
       
  2197     const Register to64   = G3;  // destination address
       
  2198     __ subcc(count, 6, O3);
       
  2199     __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
       
  2200     __ delayed()->mov(to,   to64);
       
  2201     // Now we can use O4(offset0), O5(offset8) as temps
       
  2202     __ mov(O3, count);
       
  2203     // count >= 0 (original count - 8)
       
  2204     __ mov(from, from64);
       
  2205 
       
  2206     disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop);
       
  2207 
       
  2208       // Restore O4(offset0), O5(offset8)
       
  2209       __ sub(from64, from, offset0);
       
  2210       __ inccc(count, 6); // restore count
       
  2211       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
       
  2212       __ delayed()->add(offset0, 8, offset8);
       
  2213 
       
  2214       // Copy in 16-byte chunks
       
  2215       __ align(OptoLoopAlignment);
       
  2216     __ BIND(L_copy_16_bytes);
       
  2217       __ ldx(from, offset0, O3);
       
  2218       __ ldx(from, offset8, G3);
       
  2219       __ deccc(count, 2);
       
  2220       __ stx(O3, to, offset0);
       
  2221       __ inc(offset0, 16);
       
  2222       __ stx(G3, to, offset8);
       
  2223       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
       
  2224       __ delayed()->inc(offset8, 16);
       
  2225 
       
  2226       // Copy last 8 bytes
       
  2227     __ BIND(L_copy_8_bytes);
       
  2228       __ inccc(count, 2);
       
  2229       __ brx(Assembler::zero, true, Assembler::pn, L_exit );
       
  2230       __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
       
  2231       __ ldx(from, offset0, O3);
       
  2232       __ stx(O3, to, offset0);
       
  2233     __ BIND(L_exit);
       
  2234   }
       
  2235 
       
  2236   //
       
  2237   //  Generate stub for disjoint long copy.
       
  2238   //  "aligned" is ignored, because we must make the stronger
       
  2239   //  assumption that both addresses are always 64-bit aligned.
       
  2240   //
       
  2241   // Arguments for generated stub:
       
  2242   //      from:  O0
       
  2243   //      to:    O1
       
  2244   //      count: O2 treated as signed
       
  2245   //
       
  2246   address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
       
  2247     __ align(CodeEntryAlignment);
       
  2248     StubCodeMark mark(this, "StubRoutines", name);
       
  2249     address start = __ pc();
       
  2250 
       
  2251     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
       
  2252 
       
  2253     if (entry != NULL) {
       
  2254       *entry = __ pc();
       
  2255       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
       
  2256       BLOCK_COMMENT("Entry:");
       
  2257     }
       
  2258 
       
  2259     generate_disjoint_long_copy_core(aligned);
       
  2260 
       
  2261     // O3, O4 are used as temp registers
       
  2262     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
       
  2263     __ retl();
       
  2264     __ delayed()->mov(G0, O0); // return 0
       
  2265     return start;
       
  2266   }
       
  2267 
       
  2268   //
       
  2269   //  Generate core code for conjoint long copy (and oop copy on 64-bit).
       
  2270   //  "aligned" is ignored, because we must make the stronger
       
  2271   //  assumption that both addresses are always 64-bit aligned.
       
  2272   //
       
  2273   // Arguments:
       
  2274   //      from:  O0
       
  2275   //      to:    O1
       
  2276   //      count: O2 treated as signed
       
  2277   //
       
  2278   void generate_conjoint_long_copy_core(bool aligned) {
       
  2279     // Do reverse copy.
       
  2280     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
       
  2281     const Register from    = O0;  // source array address
       
  2282     const Register to      = O1;  // destination array address
       
  2283     const Register count   = O2;  // elements count
       
  2284     const Register offset8 = O4;  // element offset
       
  2285     const Register offset0 = O5;  // previous element offset
       
  2286 
       
  2287       __ subcc(count, 1, count);
       
  2288       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
       
  2289       __ delayed()->sllx(count, LogBytesPerLong, offset8);
       
  2290       __ sub(offset8, 8, offset0);
       
  2291       __ align(OptoLoopAlignment);
       
  2292     __ BIND(L_copy_16_bytes);
       
  2293       __ ldx(from, offset8, O2);
       
  2294       __ ldx(from, offset0, O3);
       
  2295       __ stx(O2, to, offset8);
       
  2296       __ deccc(offset8, 16);      // use offset8 as counter
       
  2297       __ stx(O3, to, offset0);
       
  2298       __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
       
  2299       __ delayed()->dec(offset0, 16);
       
  2300 
       
  2301     __ BIND(L_copy_8_bytes);
       
  2302       __ brx(Assembler::negative, false, Assembler::pn, L_exit );
       
  2303       __ delayed()->nop();
       
  2304       __ ldx(from, 0, O3);
       
  2305       __ stx(O3, to, 0);
       
  2306     __ BIND(L_exit);
       
  2307   }
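
         // Illustrative pseudocode for the reverse long copy above:
         //
         //   off8 = (count - 1) * 8;              // offset of the last element
         //   off0 = off8 - 8;
         //   if (count >= 2) {
         //     do {
         //       store8(to + off8, load8(from + off8));
         //       store8(to + off0, load8(from + off0));
         //       off8 -= 16;  off0 -= 16;
         //     } while (off8 > 0);
         //   }
         //   if (an element remains) store8(to, load8(from));   // element 0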
       
  2308 
       
  2309   //  Generate stub for conjoint long copy.
       
  2310   //  "aligned" is ignored, because we must make the stronger
       
  2311   //  assumption that both addresses are always 64-bit aligned.
       
  2312   //
       
  2313   // Arguments for generated stub:
       
  2314   //      from:  O0
       
  2315   //      to:    O1
       
  2316   //      count: O2 treated as signed
       
  2317   //
       
  2318   address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
       
  2319                                       address *entry, const char *name) {
       
  2320     __ align(CodeEntryAlignment);
       
  2321     StubCodeMark mark(this, "StubRoutines", name);
       
  2322     address start = __ pc();
       
  2323 
       
  2324     assert(aligned, "Should always be aligned");
       
  2325 
       
  2326     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
       
  2327 
       
  2328     if (entry != NULL) {
       
  2329       *entry = __ pc();
       
  2330       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
       
  2331       BLOCK_COMMENT("Entry:");
       
  2332     }
       
  2333 
       
  2334     array_overlap_test(nooverlap_target, 3);
       
  2335 
       
  2336     generate_conjoint_long_copy_core(aligned);
       
  2337 
       
  2338     // O3, O4 are used as temp registers
       
  2339     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
       
  2340     __ retl();
       
  2341     __ delayed()->mov(G0, O0); // return 0
       
  2342     return start;
       
  2343   }
       
  2344 
       
  2345   //  Generate stub for disjoint oop copy.  If "aligned" is true, the
       
  2346   //  "from" and "to" addresses are assumed to be heapword aligned.
       
  2347   //
       
  2348   // Arguments for generated stub:
       
  2349   //      from:  O0
       
  2350   //      to:    O1
       
  2351   //      count: O2 treated as signed
       
  2352   //
       
  2353   address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
       
  2354                                      bool dest_uninitialized = false) {
       
  2355 
       
  2356     const Register from  = O0;  // source array address
       
  2357     const Register to    = O1;  // destination array address
       
  2358     const Register count = O2;  // elements count
       
  2359 
       
  2360     __ align(CodeEntryAlignment);
       
  2361     StubCodeMark mark(this, "StubRoutines", name);
       
  2362     address start = __ pc();
       
  2363 
       
  2364     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
       
  2365 
       
  2366     if (entry != NULL) {
       
  2367       *entry = __ pc();
       
  2368       // caller can pass a 64-bit byte count here
       
  2369       BLOCK_COMMENT("Entry:");
       
  2370     }
       
  2371 
       
  2372     // save arguments for barrier generation
       
  2373     __ mov(to, G1);
       
  2374     __ mov(count, G5);
       
  2375     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
       
  2376     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
       
  2377     if (UseCompressedOops) {
       
  2378       generate_disjoint_int_copy_core(aligned);
       
  2379     } else {
       
  2380       generate_disjoint_long_copy_core(aligned);
       
  2381     }
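    // Note: with compressed oops each element is a 4-byte narrowOop, so the
    // jint copy core moves the right amount of data; with uncompressed oops
    // each element is an 8-byte pointer, hence the jlong core.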
       
    // O0 is used as temp register
    gen_write_ref_array_post_barrier(G1, G5, O0);

    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }

  //  Generate stub for conjoint oop copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {

    const Register from  = O0;  // source array address
    const Register to    = O1;  // destination array address
    const Register count = O2;  // elements count

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, LogBytesPerHeapOop);

    // save arguments for barrier generation
    __ mov(to, G1);
    __ mov(count, G5);
    gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);

    if (UseCompressedOops) {
      generate_conjoint_int_copy_core(aligned);
    } else {
      generate_conjoint_long_copy_core(aligned);
    }

    // O0 is used as temp register
    gen_write_ref_array_post_barrier(G1, G5, O0);

    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }


  // Helper for generating a dynamic type check.
  // Smashes only the given temp registers.
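  // Illustrative fast path (a sketch, not the emitted code): succeed
  // immediately when
  //   *(sub_klass + super_check_offset) == super_klass,
  // i.e. the super klass appears in the sub klass at its advertised check
  // offset; otherwise fall into the slow path, which scans the sub klass's
  // secondary supers list.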
       
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Register temp,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass, temp);

    BLOCK_COMMENT("type_check:");

    Label L_miss, L_pop_to_miss;

    assert_clean_int(super_check_offset, temp);

    __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
                                     &L_success, &L_miss, NULL,
                                     super_check_offset);

    BLOCK_COMMENT("type_check_slow_path:");
    __ save_frame(0);
    __ check_klass_subtype_slow_path(sub_klass->after_save(),
                                     super_klass->after_save(),
                                     L0, L1, L2, L4,
                                     NULL, &L_pop_to_miss);
    __ ba(L_success);
    __ delayed()->restore();

    __ bind(L_pop_to_miss);
    __ restore();

    // Fall through on failure!
    __ BIND(L_miss);
  }


  //  Generate stub for checked oop copy.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //      ckoff: O3 (super_check_offset)
  //      ckval: O4 (super_klass)
  //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
  //
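  // For example (illustrative): if K == 3 elements were copied before a
  // failing element, the stub returns -1^3 == -4 in O0, and the caller
  // recovers K as ~O0.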
       
  address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {

    const Register O0_from   = O0;      // source array address
    const Register O1_to     = O1;      // destination array address
    const Register O2_count  = O2;      // elements count
    const Register O3_ckoff  = O3;      // super_check_offset
    const Register O4_ckval  = O4;      // super_klass

    const Register O5_offset = O5;      // loop var, with stride wordSize
    const Register G1_remain = G1;      // loop var, with stride -1
    const Register G3_oop    = G3;      // actual oop copied
    const Register G4_klass  = G4;      // oop._klass
    const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

#ifdef ASSERT
    // We sometimes save a frame (see generate_type_check below).
    // If this will cause trouble, let's fail now instead of later.
    __ save_frame(0);
    __ restore();
#endif

    assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      __ mov(O3, G1);           // spill: overlap test smashes O3
      __ mov(O4, G4);           // spill: overlap test smashes O4
      array_overlap_test(L, LogBytesPerHeapOop);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
      __ mov(G1, O3);
      __ mov(G4, O4);
    }
#endif //ASSERT

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from generic stub)
      BLOCK_COMMENT("Entry:");
    }
    gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);

    Label load_element, store_element, do_card_marks, fail, done;
    __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
    __ brx(Assembler::notZero, false, Assembler::pt, load_element);
    __ delayed()->mov(G0, O5_offset);   // offset from start of arrays

    // Empty array:  Nothing to do.
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->set(0, O0);           // return 0 on (trivial) success

    // ======== begin loop ========
    // (Loop is rotated; its entry is load_element.)
    // Loop variables:
    //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
    //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
    //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
    __ align(OptoLoopAlignment);

    __ BIND(store_element);
    __ deccc(G1_remain);                // decrement the count
    __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
    __ inc(O5_offset, heapOopSize);     // step to next offset
    __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
    __ delayed()->set(0, O0);           // return 0 on success

    // ======== loop entry is here ========
    __ BIND(load_element);
    __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
    __ br_null_short(G3_oop, Assembler::pt, store_element);

    __ load_klass(G3_oop, G4_klass); // query the object klass

    generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
                        // branch to this on success:
                        store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register G1 has number of *remaining* oops, O2 number of *total* oops.
    // Emit GC store barriers for the oops we have copied (O2 minus G1),
    // and report their number to the caller.
    __ BIND(fail);
    __ subcc(O2_count, G1_remain, O2_count);
    __ brx(Assembler::zero, false, Assembler::pt, done);
    __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller

    __ BIND(do_card_marks);
    gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]

    __ BIND(done);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->nop();             // return value in O0

    return start;
  }


  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
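  // Dispatch sketch (illustrative only), with bits = from | to | count:
  //   if      ((bits & 7) == 0)  long_copy  (count >>= 3);
  //   else if ((bits & 3) == 0)  int_copy   (count >>= 2);
  //   else if ((bits & 1) == 0)  short_copy (count >>= 1);
  //   else                       byte_copy  (count unchanged);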
       
  address generate_unsafe_copy(const char* name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {

    const Register O0_from   = O0;      // source array address
    const Register O1_to     = O1;      // destination array address
    const Register O2_count  = O2;      // elements count

    const Register G1_bits   = G1;      // test copy of low bits

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);

    __ or3(O0_from, O1_to, G1_bits);
    __ or3(O2_count,       G1_bits, G1_bits);

    __ btst(BytesPerLong-1, G1_bits);
    __ br(Assembler::zero, true, Assembler::pt,
          long_copy_entry, relocInfo::runtime_call_type);
    // scale the count on the way out:
    __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);

    __ btst(BytesPerInt-1, G1_bits);
    __ br(Assembler::zero, true, Assembler::pt,
          int_copy_entry, relocInfo::runtime_call_type);
    // scale the count on the way out:
    __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);

    __ btst(BytesPerShort-1, G1_bits);
    __ br(Assembler::zero, true, Assembler::pt,
          short_copy_entry, relocInfo::runtime_call_type);
    // scale the count on the way out:
    __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);

    __ br(Assembler::always, false, Assembler::pt,
          byte_copy_entry, relocInfo::runtime_call_type);
    __ delayed()->nop();

    return start;
  }


  // Perform range checks on the proposed arraycopy.
  // Kills the two temps, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (O0)
                              Register src_pos, // source position (O1)
                              Register dst,     // destination array oop (O2)
                              Register dst_pos, // destination position (O3)
                              Register length,  // length of copy (O4)
                              Register temp1, Register temp2,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;

    const Register array_length = temp1;  // scratch
    const Register end_pos      = temp2;  // scratch

    // Note:  This next instruction may be in the delay slot of a branch:
    __ add(length, src_pos, end_pos);  // src_pos + length
    __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
    __ cmp(end_pos, array_length);
    __ br(Assembler::greater, false, Assembler::pn, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
    __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
    __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
    __ cmp(end_pos, array_length);
    __ br(Assembler::greater, false, Assembler::pn, L_failed);

    // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
    // Move with sign extension can be used since they are positive.
    __ delayed()->signx(src_pos, src_pos);
    __ signx(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }


  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    O0    -  src oop
  //    O1    -  src_pos
  //    O2    -  dst oop
  //    O3    -  dst_pos
  //    O4    -  element count
  //
  //  Output:
  //    O0 ==  0  -  success
  //    O0 == -1  -  need to call System.arraycopy
  //
  address generate_generic_copy(const char *name,
                                address entry_jbyte_arraycopy,
                                address entry_jshort_arraycopy,
                                address entry_jint_arraycopy,
                                address entry_oop_arraycopy,
                                address entry_jlong_arraycopy,
                                address entry_checkcast_arraycopy) {
    Label L_failed, L_objArray;

    // Input registers
    const Register src      = O0;  // source array oop
    const Register src_pos  = O1;  // source position
    const Register dst      = O2;  // destination array oop
    const Register dst_pos  = O3;  // destination position
    const Register length   = O4;  // elements count

    // registers used as temp
    const Register G3_src_klass = G3; // source array klass
    const Register G4_dst_klass = G4; // destination array klass
    const Register G5_lh        = G5; // layout handler
    const Register O5_temp      = O5;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);

    // In principle, the int arguments could be dirty.
    //assert_clean_int(src_pos, G1);
    //assert_clean_int(dst_pos, G1);
    //assert_clean_int(length, G1);

    //-----------------------------------------------------------------------
    // Assembler stubs will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
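    //
    // These mirror the checks System.arraycopy itself would make; if any
    // of them fails, the stub returns -1 and the caller falls back to the
    // runtime path, which raises the appropriate exception.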
       
    BLOCK_COMMENT("arraycopy initial argument checks");

    //  if (src == NULL) return -1;
    __ br_null(src, false, Assembler::pn, L_failed);

    //  if (src_pos < 0) return -1;
    __ delayed()->tst(src_pos);
    __ br(Assembler::negative, false, Assembler::pn, L_failed);
    __ delayed()->nop();

    //  if (dst == NULL) return -1;
    __ br_null(dst, false, Assembler::pn, L_failed);

    //  if (dst_pos < 0) return -1;
    __ delayed()->tst(dst_pos);
    __ br(Assembler::negative, false, Assembler::pn, L_failed);

    //  if (length < 0) return -1;
    __ delayed()->tst(length);
    __ br(Assembler::negative, false, Assembler::pn, L_failed);

    BLOCK_COMMENT("arraycopy argument klass checks");
    //  get src->klass()
    if (UseCompressedClassPointers) {
      __ delayed()->nop(); // ??? not good
      __ load_klass(src, G3_src_klass);
    } else {
      __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
    }

#ifdef ASSERT
    //  assert(src->klass() != NULL);
    BLOCK_COMMENT("assert klasses not null");
    { Label L_a, L_b;
      __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
      __ bind(L_a);
      __ stop("broken null klass");
      __ bind(L_b);
      __ load_klass(dst, G4_dst_klass);
      __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
      __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
      BLOCK_COMMENT("assert done");
    }
#endif

    // Load layout helper
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //
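    // Example (illustrative; exact field values depend on the build): for a
    // jint typeArray, array_tag == 0x3 and log2_element_size == 2, with
    // header_size == arrayOopDesc::base_offset_in_bytes(T_INT); the packed
    // helper is negative because the tag occupies the sign bits.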
       
    int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Load 32-bits signed value. Use br() instruction with it to check icc.
    __ lduw(G3_src_klass, lh_offset, G5_lh);

    if (UseCompressedClassPointers) {
      __ load_klass(dst, G4_dst_klass);
    }
    // Handle objArrays completely differently...
    juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ set(objArray_lh, O5_temp);
    __ cmp(G5_lh,       O5_temp);
    __ br(Assembler::equal, false, Assembler::pt, L_objArray);
    if (UseCompressedClassPointers) {
      __ delayed()->nop();
    } else {
      __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
    }

    //  if (src->klass() != dst->klass()) return -1;
    __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);

    //  if (!src->is_Array()) return -1;
    __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
    __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    __ delayed()->nop();
    { Label L;
      jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ set(lh_prim_tag_in_place, O5_temp);
      __ cmp(G5_lh,                O5_temp);
      __ br(Assembler::greaterEqual, false, Assembler::pt, L);
      __ delayed()->nop();
      __ stop("must be a primitive array");
      __ bind(L);
    }
#else
    __ delayed();                               // match next insn to prev branch
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
                           O5_temp, G4_dst_klass, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register G4_offset = G4_dst_klass;    // array offset
    const Register G3_elsize = G3_src_klass;    // log2 element size

    __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
    __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
    __ add(src, G4_offset, src);       // src array offset
    __ add(dst, G4_offset, dst);       // dst array offset
    __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size

    // next registers should be set before the jump to corresponding stub
    const Register from     = O0;  // source array address
    const Register to       = O1;  // destination array address
    const Register count    = O2;  // elements count

    // 'from', 'to', 'count' registers should be set in this order
    // since they are the same as 'src', 'src_pos', 'dst'.

    BLOCK_COMMENT("scale indexes to element size");
    __ sll_ptr(src_pos, G3_elsize, src_pos);
    __ sll_ptr(dst_pos, G3_elsize, dst_pos);
    __ add(src, src_pos, from);       // src_addr
    __ add(dst, dst_pos, to);         // dst_addr

    BLOCK_COMMENT("choose copy loop based on element size");
    __ cmp(G3_elsize, 0);
    __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
    __ delayed()->signx(length, count); // length

    __ cmp(G3_elsize, LogBytesPerShort);
    __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
    __ delayed()->signx(length, count); // length

    __ cmp(G3_elsize, LogBytesPerInt);
    __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
    __ delayed()->signx(length, count); // length
#ifdef ASSERT
    { Label L;
      __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
    }
#endif
    __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
    __ delayed()->signx(length, count); // length

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length

    Label L_plain_copy, L_checkcast_copy;
    //  test array classes for subtyping
    __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
    __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
    __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
                           O5_temp, G5_lh, L_failed);

    __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
    __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
    __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
    __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
    __ add(src, src_pos, from);       // src_addr
    __ add(dst, dst_pos, to);         // dst_addr
  __ BIND(L_plain_copy);
    __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
    __ delayed()->signx(length, count); // length

  __ BIND(L_checkcast_copy);
    // live at this point:  G3_src_klass, G4_dst_klass
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
      __ cmp(G5_lh,                    O5_temp);
      __ br(Assembler::notEqual, false, Assembler::pn, L_failed);

      // It is safe to examine both src.length and dst.length.
      __ delayed();                             // match next insn to prev branch
      arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
                             O5_temp, G5_lh, L_failed);

      // Marshal the base address arguments now, freeing registers.
      __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
      __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
      __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
      __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
      __ add(src, src_pos, from);               // src_addr
      __ add(dst, dst_pos, to);                 // dst_addr
      __ signx(length, count);                  // length (reloaded)

      Register sco_temp = O3;                   // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 G4_dst_klass, G3_src_klass);

      // Generate the type check.
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ lduw(G4_dst_klass, sco_offset, sco_temp);
      generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
                          O5_temp, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());

      // the checkcast_copy loop needs two extra arguments:
      __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
      // lduw(O4, sco_offset, O3);              // sco of elem klass

      __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
      __ delayed()->lduw(O4, sco_offset, O3);
    }

  __ BIND(L_failed);
    __ retl();
    __ delayed()->sub(G0, 1, O0); // return -1
    return start;
  }

  //
  //  Generate stub for heap zeroing.
  //  "to" address is aligned to jlong (8 bytes).
  //
  // Arguments for generated stub:
  //      to:    O0
  //      count: O1 treated as signed (count of HeapWords)
  //             count could be 0
  //
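  // The stub converts the HeapWord count to a byte count and zeroes with
  // BIS (block-initializing store), which avoids reading the target cache
  // lines before they are overwritten.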
       
  address generate_zero_aligned_words(const char* name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    const Register to    = O0;   // destination address (memory to zero)
    const Register count = O1;   // HeapWords count
    const Register temp  = O2;   // scratch

    Label Ldone;
    __ sllx(count, LogHeapWordSize, count); // to bytes count
    // Use BIS for zeroing
    __ bis_zeroing(to, count, temp, Ldone);
    __ bind(Ldone);
    __ retl();
    __ delayed()->nop();
    return start;
  }

  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

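    // Each '*entry' out-argument records a secondary entry point inside the
    // generated stub (marked "Entry:" in the generators above); the unsafe
    // and generic stubs branch to these entries directly once alignment and
    // element size have been resolved.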
       
    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
                                                                                  "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
                                                                                  &entry_jbyte_arraycopy,
                                                                                  "jbyte_arraycopy");
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
                                                                                  "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
                                                                                  "arrayof_jbyte_arraycopy");

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
                                                                                    "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
                                                                                    &entry_jshort_arraycopy,
                                                                                    "jshort_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
                                                                                    "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
                                                                                    "arrayof_jshort_arraycopy");

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
                                                                                "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
                                                                                "arrayof_jint_arraycopy");
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
                                                                                "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
                                                                                &entry_jint_arraycopy,
                                                                                "jint_arraycopy");

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
                                                                                  "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
                                                                                  "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;


    //*** oops
    // Aligned versions
    StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
                                                                                      "arrayof_oop_disjoint_arraycopy");
    StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
                                                                                      "arrayof_oop_arraycopy");
    // Aligned versions without pre-barriers
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
                                                                                      "arrayof_oop_disjoint_arraycopy_uninit",
                                                                                      /*dest_uninitialized*/true);
    StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
                                                                                      "arrayof_oop_arraycopy_uninit",
                                                                                      /*dest_uninitialized*/true);
    if (UseCompressedOops) {
      // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy.
      StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
                                                                                    "oop_disjoint_arraycopy");
      StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
                                                                                    "oop_arraycopy");
      // Unaligned versions without pre-barriers
      StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
                                                                                    "oop_disjoint_arraycopy_uninit",
                                                                                    /*dest_uninitialized*/true);
      StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
                                                                                    "oop_arraycopy_uninit",
                                                                                    /*dest_uninitialized*/true);
    } else {
      // oop arraycopy is always aligned on 32bit and 64bit without compressed oops
      StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
      StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
      StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
      StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
    }

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
                                                              entry_jbyte_arraycopy,
                                                              entry_jshort_arraycopy,
                                                              entry_jint_arraycopy,
                                                              entry_jlong_arraycopy);
    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
                                                               entry_jbyte_arraycopy,
                                                               entry_jshort_arraycopy,
                                                               entry_jint_arraycopy,
                                                               entry_oop_arraycopy,
                                                               entry_jlong_arraycopy,
                                                               entry_checkcast_arraycopy);

    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");

    if (UseBlockZeroing) {
      StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
    }
  }

  address generate_aescrypt_encryptBlock() {
    // required since we read expanded key 'int' array starting first element without alignment considerations
    assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
           "the following code assumes that first element of an int array is aligned to 8 bytes");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
    Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
    address start = __ pc();
    Register from = O0; // source byte array
    Register to = O1;   // destination byte array
    Register key = O2;  // expanded key array
    const Register keylen = O4; // reg for storing expanded key array length

    // read expanded key length
    __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);

    // Method to address arbitrary alignment for load instructions:
    // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
    // If zero/aligned then continue with double FP load instructions
    // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
    // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
    // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
    // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
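    //
    // Illustrative example (not executed): if 'from' == 0x1003, then
    // alignaddr(from, G0, from) leaves from == 0x1000 and GSR.align == 3,
    // and faligndata then assembles the 8 input bytes that start at the
    // original, unaligned address 0x1003.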
       
    // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
    __ andcc(from, 7, G0);
    __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
    __ delayed()->alignaddr(from, G0, from);

    // aligned case: load input into F54-F56
    __ ldf(FloatRegisterImpl::D, from, 0, F54);
    __ ldf(FloatRegisterImpl::D, from, 8, F56);
    __ ba_short(L_load_expanded_key);

    __ BIND(L_load_misaligned_input);
    __ ldf(FloatRegisterImpl::D, from, 0, F54);
    __ ldf(FloatRegisterImpl::D, from, 8, F56);
    __ ldf(FloatRegisterImpl::D, from, 16, F58);
    __ faligndata(F54, F56, F54);
    __ faligndata(F56, F58, F56);

    __ BIND(L_load_expanded_key);
    // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed
    for ( int i = 0;  i <= 38; i += 2 ) {
      __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
    }

    // perform cipher transformation
    __ fxor(FloatRegisterImpl::D, F0, F54, F54);
    __ fxor(FloatRegisterImpl::D, F2, F56, F56);
    // rounds 1 through 8
    for ( int i = 4;  i <= 28; i += 8 ) {
      __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
      __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
      __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
      __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
    }
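    // Each pass of the loop above applies two full rounds: the 128-bit state
    // ping-pongs between F54/F56 and F58/F60 while consuming two 16-byte
    // round keys (four 8-byte key registers).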
       
  3181     __ aes_eround01(F36, F54, F56, F58); //round 9
       
  3182     __ aes_eround23(F38, F54, F56, F60);
       
  3183 
       
  3184     // 128-bit original key size
       
  3185     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
       
  3186 
       
  3187     for ( int i = 40;  i <= 50; i += 2 ) {
       
  3188       __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
       
  3189     }
       
  3190     __ aes_eround01(F40, F58, F60, F54); //round 10
       
  3191     __ aes_eround23(F42, F58, F60, F56);
       
  3192     __ aes_eround01(F44, F54, F56, F58); //round 11
       
  3193     __ aes_eround23(F46, F54, F56, F60);
       
  3194 
       
  3195     // 192-bit original key size
       
  3196     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
       
  3197 
       
  3198     __ ldf(FloatRegisterImpl::D, key, 208, F52);
       
  3199     __ aes_eround01(F48, F58, F60, F54); //round 12
       
  3200     __ aes_eround23(F50, F58, F60, F56);
       
  3201     __ ldf(FloatRegisterImpl::D, key, 216, F46);
       
  3202     __ ldf(FloatRegisterImpl::D, key, 224, F48);
       
  3203     __ ldf(FloatRegisterImpl::D, key, 232, F50);
       
  3204     __ aes_eround01(F52, F54, F56, F58); //round 13
       
  3205     __ aes_eround23(F46, F54, F56, F60);
       
  3206     __ ba_short(L_storeOutput);
       
  3207 
       
  3208     __ BIND(L_doLast128bit);
       
  3209     __ ldf(FloatRegisterImpl::D, key, 160, F48);
       
  3210     __ ldf(FloatRegisterImpl::D, key, 168, F50);
       
  3211 
       
  3212     __ BIND(L_storeOutput);
       
  3213     // perform last round of encryption common for all key sizes
       
  3214     __ aes_eround01_l(F48, F58, F60, F54); //last round
       
  3215     __ aes_eround23_l(F50, F58, F60, F56);
       
  3216 
       
  3217     // Method to address arbitrary alignment for store instructions:
       
  3218     // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
       
  3219     // If zero/aligned then continue with double FP store instructions
       
  3220     // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
       
  3221     // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
       
  3222     // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
       
  3223     // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
       
  3224     // Set GSR.align to (8-n) using alignaddr
       
  3225     // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
       
  3226     // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
       
  3227     // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
       
  3228     // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address
       
  3229     // We need to execute this process for both the 8-byte result values
       
  3230 
       
  3231     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
       
  3232     __ andcc(to, 7, O5);
       
  3233     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
       
  3234     __ delayed()->edge8n(to, G0, O3);
       
  3235 
       
  3236     // aligned case: store output into the destination array
       
  3237     __ stf(FloatRegisterImpl::D, F54, to, 0);
       
  3238     __ retl();
       
  3239     __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
       
  3240 
       
  3241     __ BIND(L_store_misaligned_output);
       
  3242     __ add(to, 8, O4);
       
  3243     __ mov(8, O2);
       
  3244     __ sub(O2, O5, O2);
       
  3245     __ alignaddr(O2, G0, O2);
       
  3246     __ faligndata(F54, F54, F54);
       
  3247     __ faligndata(F56, F56, F56);
       
  3248     __ and3(to, -8, to);
       
  3249     __ and3(O4, -8, O4);
       
  3250     __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
       
  3251     __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
       
  3252     __ add(to, 8, to);
       
  3253     __ add(O4, 8, O4);
       
  3254     __ orn(G0, O3, O3);
       
  3255     __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
       
  3256     __ retl();
       
  3257     __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
       
  3258 
       
  3259     return start;
       
  3260   }
       
  3261 
       
  3262   address generate_aescrypt_decryptBlock() {
       
  3263     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
       
  3264            "the following code assumes that first element of an int array is aligned to 8 bytes");
       
  3265     // required since we read original key 'byte' array as well in the decryption stubs
       
  3266     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
       
  3267            "the following code assumes that first element of a byte array is aligned to 8 bytes");
       
  3268     __ align(CodeEntryAlignment);
       
  3269     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
       
  3270     address start = __ pc();
       
  3271     Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
       
  3272     Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
       
  3273     Register from = O0; // source byte array
       
  3274     Register to = O1;   // destination byte array
       
  3275     Register key = O2;  // expanded key array
       
  3276     Register original_key = O3;  // original key array only required during decryption
       
  3277     const Register keylen = O4;  // reg for storing expanded key array length
       
  3278 
       
  3279     // read expanded key array length
       
  3280     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
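           // keylen counts the 32-bit words of the expanded key: 44, 52 or 60
           // for AES-128/192/256 respectively (4 * (rounds + 1))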
       
  3281 
       
  3282     // save 'from' since we may need to recheck alignment in case of 256-bit decryption
       
  3283     __ mov(from, G1);
       
  3284 
       
  3285     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
       
  3286     __ andcc(from, 7, G0);
       
  3287     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
       
  3288     __ delayed()->alignaddr(from, G0, from);
       
  3289 
       
  3290     // aligned case: load input into F52-F54
       
  3291     __ ldf(FloatRegisterImpl::D, from, 0, F52);
       
  3292     __ ldf(FloatRegisterImpl::D, from, 8, F54);
       
  3293     __ ba_short(L_load_original_key);
       
  3294 
       
  3295     __ BIND(L_load_misaligned_input);
       
  3296     __ ldf(FloatRegisterImpl::D, from, 0, F52);
       
  3297     __ ldf(FloatRegisterImpl::D, from, 8, F54);
       
  3298     __ ldf(FloatRegisterImpl::D, from, 16, F56);
       
  3299     __ faligndata(F52, F54, F52);
       
  3300     __ faligndata(F54, F56, F54);
       
  3301 
       
  3302     __ BIND(L_load_original_key);
       
  3303     // load original key from SunJCE expanded decryption key
       
  3304     // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
       
  3305     for ( int i = 0;  i <= 3; i++ ) {
       
  3306       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
       
  3307     }
       
  3308 
       
  3309     // 256-bit original key size
       
  3310     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
       
  3311 
       
  3312     // 192-bit original key size
       
  3313     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
       
  3314 
       
  3315     // 128-bit original key size
       
  3316     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
       
  3317     for ( int i = 0;  i <= 36; i += 4 ) {
       
  3318       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
       
  3319       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
       
  3320     }
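           // each iteration emits two more round-key doubles, so the full
           // 44-word AES-128 schedule now sits in F0..F42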
       
  3321 
       
  3322     // perform 128-bit key specific inverse cipher transformation
       
  3323     __ fxor(FloatRegisterImpl::D, F42, F54, F54);
       
  3324     __ fxor(FloatRegisterImpl::D, F40, F52, F52);
       
  3325     __ ba_short(L_common_transform);
       
  3326 
       
  3327     __ BIND(L_expand192bit);
       
  3328 
       
  3329     // start loading rest of the 192-bit key
       
  3330     __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
       
  3331     __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
       
  3332 
       
  3333     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
       
  3334     for ( int i = 0;  i <= 36; i += 6 ) {
       
  3335       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
       
  3336       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
       
  3337       __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
       
  3338     }
       
  3339     __ aes_kexpand1(F42, F46, 7, F48);
       
  3340     __ aes_kexpand2(F44, F48, F50);
       
  3341 
       
  3342     // perform 192-bit key specific inverse cipher transformation
       
  3343     __ fxor(FloatRegisterImpl::D, F50, F54, F54);
       
  3344     __ fxor(FloatRegisterImpl::D, F48, F52, F52);
       
  3345     __ aes_dround23(F46, F52, F54, F58);
       
  3346     __ aes_dround01(F44, F52, F54, F56);
       
  3347     __ aes_dround23(F42, F56, F58, F54);
       
  3348     __ aes_dround01(F40, F56, F58, F52);
       
  3349     __ ba_short(L_common_transform);
       
  3350 
       
  3351     __ BIND(L_expand256bit);
       
  3352 
       
  3353     // load rest of the 256-bit key
       
  3354     for ( int i = 4;  i <= 7; i++ ) {
       
  3355       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
       
  3356     }
       
  3357 
       
  3358     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
       
  3359     for ( int i = 0;  i <= 40; i += 8 ) {
       
  3360       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
       
  3361       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
       
  3362       __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
       
  3363       __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
       
  3364     }
       
  3365     __ aes_kexpand1(F48, F54, 6, F56);
       
  3366     __ aes_kexpand2(F50, F56, F58);
       
  3367 
       
  3368     for ( int i = 0;  i <= 6; i += 2 ) {
       
  3369       __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
       
  3370     }
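           // copy the last four schedule doubles (F58..F52) down into F0..F6;
           // reloading the input below clobbers F52..F56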
       
  3371 
       
  3372     // reload original 'from' address
       
  3373     __ mov(G1, from);
       
  3374 
       
  3375     // re-check 8-byte alignment
       
  3376     __ andcc(from, 7, G0);
       
  3377     __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
       
  3378     __ delayed()->alignaddr(from, G0, from);
       
  3379 
       
  3380     // aligned case: load input into F52-F54
       
  3381     __ ldf(FloatRegisterImpl::D, from, 0, F52);
       
  3382     __ ldf(FloatRegisterImpl::D, from, 8, F54);
       
  3383     __ ba_short(L_256bit_transform);
       
  3384 
       
  3385     __ BIND(L_reload_misaligned_input);
       
  3386     __ ldf(FloatRegisterImpl::D, from, 0, F52);
       
  3387     __ ldf(FloatRegisterImpl::D, from, 8, F54);
       
  3388     __ ldf(FloatRegisterImpl::D, from, 16, F56);
       
  3389     __ faligndata(F52, F54, F52);
       
  3390     __ faligndata(F54, F56, F54);
       
  3391 
       
  3392     // perform 256-bit key specific inverse cipher transformation
       
  3393     __ BIND(L_256bit_transform);
       
  3394     __ fxor(FloatRegisterImpl::D, F0, F54, F54);
       
  3395     __ fxor(FloatRegisterImpl::D, F2, F52, F52);
       
  3396     __ aes_dround23(F4, F52, F54, F58);
       
  3397     __ aes_dround01(F6, F52, F54, F56);
       
  3398     __ aes_dround23(F50, F56, F58, F54);
       
  3399     __ aes_dround01(F48, F56, F58, F52);
       
  3400     __ aes_dround23(F46, F52, F54, F58);
       
  3401     __ aes_dround01(F44, F52, F54, F56);
       
  3402     __ aes_dround23(F42, F56, F58, F54);
       
  3403     __ aes_dround01(F40, F56, F58, F52);
       
  3404 
       
  3405     for ( int i = 0;  i <= 7; i++ ) {
       
  3406       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
       
  3407     }
       
  3408 
       
  3409     // perform inverse cipher transformations common for all key sizes
       
  3410     __ BIND(L_common_transform);
       
  3411     for ( int i = 38;  i >= 6; i -= 8 ) {
       
  3412       __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
       
  3413       __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
       
  3414       if ( i != 6) {
       
  3415         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
       
  3416         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
       
  3417       } else {
       
  3418         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
       
  3419         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
       
  3420       }
       
  3421     }
       
  3422 
       
  3423     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
       
  3424     __ andcc(to, 7, O5);
       
  3425     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
       
  3426     __ delayed()->edge8n(to, G0, O3);
       
  3427 
       
  3428     // aligned case: store output into the destination array
       
  3429     __ stf(FloatRegisterImpl::D, F52, to, 0);
       
  3430     __ retl();
       
  3431     __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
       
  3432 
       
  3433     __ BIND(L_store_misaligned_output);
       
  3434     __ add(to, 8, O4);
       
  3435     __ mov(8, O2);
       
  3436     __ sub(O2, O5, O2);
       
  3437     __ alignaddr(O2, G0, O2);
       
  3438     __ faligndata(F52, F52, F52);
       
  3439     __ faligndata(F54, F54, F54);
       
  3440     __ and3(to, -8, to);
       
  3441     __ and3(O4, -8, O4);
       
  3442     __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
       
  3443     __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
       
  3444     __ add(to, 8, to);
       
  3445     __ add(O4, 8, O4);
       
  3446     __ orn(G0, O3, O3);
       
  3447     __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
       
  3448     __ retl();
       
  3449     __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
       
  3450 
       
  3451     return start;
       
  3452   }
       
  3453 
       
  3454   address generate_cipherBlockChaining_encryptAESCrypt() {
       
  3455     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
       
  3456            "the following code assumes that first element of an int array is aligned to 8 bytes");
       
  3457     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
       
  3458            "the following code assumes that first element of a byte array is aligned to 8 bytes");
       
  3459     __ align(CodeEntryAlignment);
       
  3460     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
       
  3461     Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
       
  3462     Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
       
  3463     Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
       
  3464     Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
       
  3465     address start = __ pc();
       
  3466     Register from = I0; // source byte array
       
  3467     Register to = I1;   // destination byte array
       
  3468     Register key = I2;  // expanded key array
       
  3469     Register rvec = I3; // init vector
       
  3470     const Register len_reg = I4; // cipher length
       
  3471     const Register keylen = I5;  // reg for storing expanded key array length
       
  3472 
       
  3473     __ save_frame(0);
       
  3474     // save cipher len to return in the end
       
  3475     __ mov(len_reg, L0);
       
  3476 
       
  3477     // read expanded key length
       
  3478     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
       
  3479 
       
  3480     // load initial vector, 8-byte alignment is guaranteed
       
  3481     __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
       
  3482     __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
       
  3483     // load key, 8-byte alignment is guaranteed
       
  3484     __ ldx(key,0,G1);
       
  3485     __ ldx(key,8,G5);
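           // G1:G5 now hold round key 0, used below to pre-whiten each
           // plaintext block before the encryption rounds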
       
  3486 
       
  3487     // start loading expanded key, 8-byte alignment is guaranteed
       
  3488     for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
       
  3489       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
       
  3490     }
       
  3491 
       
  3492     // 128-bit original key size
       
  3493     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128);
       
  3494 
       
  3495     for ( int i = 40, j = 176;  i <= 46; i += 2, j += 8 ) {
       
  3496       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
       
  3497     }
       
  3498 
       
  3499     // 192-bit original key size
       
  3500     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192);
       
  3501 
       
  3502     for ( int i = 48, j = 208;  i <= 54; i += 2, j += 8 ) {
       
  3503       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
       
  3504     }
       
  3505 
       
  3506     // 256-bit original key size
       
  3507     __ ba_short(L_cbcenc256);
       
  3508 
       
  3509     __ align(OptoLoopAlignment);
       
  3510     __ BIND(L_cbcenc128);
       
  3511     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
       
  3512     __ andcc(from, 7, G0);
       
  3513     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
       
  3514     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
       
  3515 
       
  3516     // aligned case: load input into G3 and G4
       
  3517     __ ldx(from,0,G3);
       
  3518     __ ldx(from,8,G4);
       
  3519     __ ba_short(L_128bit_transform);
       
  3520 
       
  3521     __ BIND(L_load_misaligned_input_128bit);
       
  3522     // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
       
  3523     __ alignaddr(from, G0, from);
       
  3524     __ ldf(FloatRegisterImpl::D, from, 0, F48);
       
  3525     __ ldf(FloatRegisterImpl::D, from, 8, F50);
       
  3526     __ ldf(FloatRegisterImpl::D, from, 16, F52);
       
  3527     __ faligndata(F48, F50, F48);
       
  3528     __ faligndata(F50, F52, F50);
       
  3529     __ movdtox(F48, G3);
       
  3530     __ movdtox(F50, G4);
       
  3531     __ mov(L1, from);
       
  3532 
       
  3533     __ BIND(L_128bit_transform);
       
  3534     __ xor3(G1,G3,G3);
       
  3535     __ xor3(G5,G4,G4);
       
  3536     __ movxtod(G3,F56);
       
  3537     __ movxtod(G4,F58);
       
  3538     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
       
  3539     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
       
  3540 
       
  3541     // TEN_EROUNDS
       
  3542     for ( int i = 0;  i <= 32; i += 8 ) {
       
  3543       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
       
  3544       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
       
  3545       if (i != 32 ) {
       
  3546         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
       
  3547         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
       
  3548       } else {
       
  3549         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
       
  3550         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
       
  3551       }
       
  3552     }
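           // the last iteration uses the *_l forms, which apply the AES final
           // round (no MixColumns)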
       
  3553 
       
  3554     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
       
  3555     __ andcc(to, 7, L1);
       
  3556     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
       
  3557     __ delayed()->edge8n(to, G0, L2);
       
  3558 
       
  3559     // aligned case: store output into the destination array
       
  3560     __ stf(FloatRegisterImpl::D, F60, to, 0);
       
  3561     __ stf(FloatRegisterImpl::D, F62, to, 8);
       
  3562     __ ba_short(L_check_loop_end_128bit);
       
  3563 
       
  3564     __ BIND(L_store_misaligned_output_128bit);
       
  3565     __ add(to, 8, L3);
       
  3566     __ mov(8, L4);
       
  3567     __ sub(L4, L1, L4);
       
  3568     __ alignaddr(L4, G0, L4);
       
  3569     // save cipher text before circular right shift
       
  3570     // as it needs to be stored as iv for next block (see code before next retl)
       
  3571     __ movdtox(F60, L6);
       
  3572     __ movdtox(F62, L7);
       
  3573     __ faligndata(F60, F60, F60);
       
  3574     __ faligndata(F62, F62, F62);
       
  3575     __ mov(to, L5);
       
  3576     __ and3(to, -8, to);
       
  3577     __ and3(L3, -8, L3);
       
  3578     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
       
  3579     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
       
  3580     __ add(to, 8, to);
       
  3581     __ add(L3, 8, L3);
       
  3582     __ orn(G0, L2, L2);
       
  3583     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
       
  3584     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
       
  3585     __ mov(L5, to);
       
  3586     __ movxtod(L6, F60);
       
  3587     __ movxtod(L7, F62);
       
  3588 
       
  3589     __ BIND(L_check_loop_end_128bit);
       
  3590     __ add(from, 16, from);
       
  3591     __ add(to, 16, to);
       
  3592     __ subcc(len_reg, 16, len_reg);
       
  3593     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
       
  3594     __ delayed()->nop();
       
  3595     // re-init initial vector for next block, 8-byte alignment is guaranteed
       
  3596     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
       
  3597     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
       
  3598     __ mov(L0, I0);
       
  3599     __ ret();
       
  3600     __ delayed()->restore();
       
  3601 
       
  3602     __ align(OptoLoopAlignment);
       
  3603     __ BIND(L_cbcenc192);
       
  3604     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
       
  3605     __ andcc(from, 7, G0);
       
  3606     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
       
  3607     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
       
  3608 
       
  3609     // aligned case: load input into G3 and G4
       
  3610     __ ldx(from,0,G3);
       
  3611     __ ldx(from,8,G4);
       
  3612     __ ba_short(L_192bit_transform);
       
  3613 
       
  3614     __ BIND(L_load_misaligned_input_192bit);
       
  3615     // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
       
  3616     __ alignaddr(from, G0, from);
       
  3617     __ ldf(FloatRegisterImpl::D, from, 0, F48);
       
  3618     __ ldf(FloatRegisterImpl::D, from, 8, F50);
       
  3619     __ ldf(FloatRegisterImpl::D, from, 16, F52);
       
  3620     __ faligndata(F48, F50, F48);
       
  3621     __ faligndata(F50, F52, F50);
       
  3622     __ movdtox(F48, G3);
       
  3623     __ movdtox(F50, G4);
       
  3624     __ mov(L1, from);
       
  3625 
       
  3626     __ BIND(L_192bit_transform);
       
  3627     __ xor3(G1,G3,G3);
       
  3628     __ xor3(G5,G4,G4);
       
  3629     __ movxtod(G3,F56);
       
  3630     __ movxtod(G4,F58);
       
  3631     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
       
  3632     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
       
  3633 
       
  3634     // TWELVE_EROUNDS
       
  3635     for ( int i = 0;  i <= 40; i += 8 ) {
       
  3636       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
       
  3637       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
       
  3638       if (i != 40 ) {
       
  3639         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
       
  3640         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
       
  3641       } else {
       
  3642         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
       
  3643         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
       
  3644       }
       
  3645     }
       
  3646 
       
  3647     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
       
  3648     __ andcc(to, 7, L1);
       
  3649     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
       
  3650     __ delayed()->edge8n(to, G0, L2);
       
  3651 
       
  3652     // aligned case: store output into the destination array
       
  3653     __ stf(FloatRegisterImpl::D, F60, to, 0);
       
  3654     __ stf(FloatRegisterImpl::D, F62, to, 8);
       
  3655     __ ba_short(L_check_loop_end_192bit);
       
  3656 
       
  3657     __ BIND(L_store_misaligned_output_192bit);
       
  3658     __ add(to, 8, L3);
       
  3659     __ mov(8, L4);
       
  3660     __ sub(L4, L1, L4);
       
  3661     __ alignaddr(L4, G0, L4);
       
  3662     __ movdtox(F60, L6);
       
  3663     __ movdtox(F62, L7);
       
  3664     __ faligndata(F60, F60, F60);
       
  3665     __ faligndata(F62, F62, F62);
       
  3666     __ mov(to, L5);
       
  3667     __ and3(to, -8, to);
       
  3668     __ and3(L3, -8, L3);
       
  3669     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
       
  3670     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
       
  3671     __ add(to, 8, to);
       
  3672     __ add(L3, 8, L3);
       
  3673     __ orn(G0, L2, L2);
       
  3674     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
       
  3675     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
       
  3676     __ mov(L5, to);
       
  3677     __ movxtod(L6, F60);
       
  3678     __ movxtod(L7, F62);
       
  3679 
       
  3680     __ BIND(L_check_loop_end_192bit);
       
  3681     __ add(from, 16, from);
       
  3682     __ subcc(len_reg, 16, len_reg);
       
  3683     __ add(to, 16, to);
       
  3684     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
       
  3685     __ delayed()->nop();
       
  3686     // re-init initial vector for next block, 8-byte alignment is guaranteed
       
  3687     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
       
  3688     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
       
  3689     __ mov(L0, I0);
       
  3690     __ ret();
       
  3691     __ delayed()->restore();
       
  3692 
       
  3693     __ align(OptoLoopAlignment);
       
  3694     __ BIND(L_cbcenc256);
       
  3695     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
       
  3696     __ andcc(from, 7, G0);
       
  3697     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
       
  3698     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
       
  3699 
       
  3700     // aligned case: load input into G3 and G4
       
  3701     __ ldx(from,0,G3);
       
  3702     __ ldx(from,8,G4);
       
  3703     __ ba_short(L_256bit_transform);
       
  3704 
       
  3705     __ BIND(L_load_misaligned_input_256bit);
       
  3706     // cannot clobber F48, F50 and F52. F56, F58 can be used though
       
  3707     __ alignaddr(from, G0, from);
       
  3708     __ movdtox(F60, L2); // save F60 before overwriting
       
  3709     __ ldf(FloatRegisterImpl::D, from, 0, F56);
       
  3710     __ ldf(FloatRegisterImpl::D, from, 8, F58);
       
  3711     __ ldf(FloatRegisterImpl::D, from, 16, F60);
       
  3712     __ faligndata(F56, F58, F56);
       
  3713     __ faligndata(F58, F60, F58);
       
  3714     __ movdtox(F56, G3);
       
  3715     __ movdtox(F58, G4);
       
  3716     __ mov(L1, from);
       
  3717     __ movxtod(L2, F60);
       
  3718 
       
  3719     __ BIND(L_256bit_transform);
       
  3720     __ xor3(G1,G3,G3);
       
  3721     __ xor3(G5,G4,G4);
       
  3722     __ movxtod(G3,F56);
       
  3723     __ movxtod(G4,F58);
       
  3724     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
       
  3725     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
       
  3726 
       
  3727     // FOURTEEN_EROUNDS
       
  3728     for ( int i = 0;  i <= 48; i += 8 ) {
       
  3729       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
       
  3730       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
       
  3731       if (i != 48 ) {
       
  3732         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
       
  3733         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
       
  3734       } else {
       
  3735         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
       
  3736         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
       
  3737       }
       
  3738     }
       
  3739 
       
  3740     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
       
  3741     __ andcc(to, 7, L1);
       
  3742     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
       
  3743     __ delayed()->edge8n(to, G0, L2);
       
  3744 
       
  3745     // aligned case: store output into the destination array
       
  3746     __ stf(FloatRegisterImpl::D, F60, to, 0);
       
  3747     __ stf(FloatRegisterImpl::D, F62, to, 8);
       
  3748     __ ba_short(L_check_loop_end_256bit);
       
  3749 
       
  3750     __ BIND(L_store_misaligned_output_256bit);
       
  3751     __ add(to, 8, L3);
       
  3752     __ mov(8, L4);
       
  3753     __ sub(L4, L1, L4);
       
  3754     __ alignaddr(L4, G0, L4);
       
  3755     __ movdtox(F60, L6);
       
  3756     __ movdtox(F62, L7);
       
  3757     __ faligndata(F60, F60, F60);
       
  3758     __ faligndata(F62, F62, F62);
       
  3759     __ mov(to, L5);
       
  3760     __ and3(to, -8, to);
       
  3761     __ and3(L3, -8, L3);
       
  3762     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
       
  3763     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
       
  3764     __ add(to, 8, to);
       
  3765     __ add(L3, 8, L3);
       
  3766     __ orn(G0, L2, L2);
       
  3767     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
       
  3768     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
       
  3769     __ mov(L5, to);
       
  3770     __ movxtod(L6, F60);
       
  3771     __ movxtod(L7, F62);
       
  3772 
       
  3773     __ BIND(L_check_loop_end_256bit);
       
  3774     __ add(from, 16, from);
       
  3775     __ subcc(len_reg, 16, len_reg);
       
  3776     __ add(to, 16, to);
       
  3777     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
       
  3778     __ delayed()->nop();
       
  3779     // re-init initial vector for next block, 8-byte alignment is guaranteed
       
  3780     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
       
  3781     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
       
  3782     __ mov(L0, I0);
       
  3783     __ ret();
       
  3784     __ delayed()->restore();
       
  3785 
       
  3786     return start;
       
  3787   }
       
  3788 
       
  3789   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
       
  3790     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
       
  3791            "the following code assumes that first element of an int array is aligned to 8 bytes");
       
  3792     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
       
  3793            "the following code assumes that first element of a byte array is aligned to 8 bytes");
       
  3794     __ align(CodeEntryAlignment);
       
  3795     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
       
  3796     Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
       
  3797     Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
       
  3798     Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
       
  3799     Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
       
  3800     Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
       
  3801     Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
       
  3802     Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
       
  3803     address start = __ pc();
       
  3804     Register from = I0; // source byte array
       
  3805     Register to = I1;   // destination byte array
       
  3806     Register key = I2;  // expanded key array
       
  3807     Register rvec = I3; // init vector
       
  3808     const Register len_reg = I4; // cipher length
       
  3809     const Register original_key = I5;  // original key array only required during decryption
       
  3810     const Register keylen = L6;  // reg for storing expanded key array length
       
  3811 
       
  3812     __ save_frame(0); // args are read from I* registers since we save the frame at the beginning
       
  3813     // save cipher len to return in the end
       
  3814     __ mov(len_reg, L7);
       
  3815 
       
  3816     // load original key from SunJCE expanded decryption key
       
  3817     // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
       
  3818     for ( int i = 0;  i <= 3; i++ ) {
       
  3819       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
       
  3820     }
       
  3821 
       
  3822     // load initial vector, 8-byte alignment is guaranteed
       
  3823     __ ldx(rvec,0,L0);
       
  3824     __ ldx(rvec,8,L1);
       
  3825 
       
  3826     // read expanded key array length
       
  3827     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
       
  3828 
       
  3829     // 256-bit original key size
       
  3830     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
       
  3831 
       
  3832     // 192-bit original key size
       
  3833     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
       
  3834 
       
  3835     // 128-bit original key size
       
  3836     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
       
  3837     for ( int i = 0;  i <= 36; i += 4 ) {
       
  3838       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
       
  3839       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
       
  3840     }
       
  3841 
       
  3842     // load expanded key[last-1] and key[last] elements
       
  3843     __ movdtox(F40,L2);
       
  3844     __ movdtox(F42,L3);
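           // L2:L3 hold the last round key; the inverse cipher XORs it into
           // the ciphertext first and then runs the rounds in reverse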
       
  3845 
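           // if the length is an odd multiple of 16 bytes, decrypt the first
           // block by itself before entering the two-blocks-per-iteration loop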
       
  3846     __ and3(len_reg, 16, L4);
       
  3847     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
       
  3848     __ nop();
       
  3849 
       
  3850     __ ba_short(L_dec_first_block_start);
       
  3851 
       
  3852     __ BIND(L_expand192bit);
       
  3853     // load rest of the 192-bit key
       
  3854     __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
       
  3855     __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
       
  3856 
       
  3857     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
       
  3858     for ( int i = 0;  i <= 36; i += 6 ) {
       
  3859       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
       
  3860       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
       
  3861       __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
       
  3862     }
       
  3863     __ aes_kexpand1(F42, F46, 7, F48);
       
  3864     __ aes_kexpand2(F44, F48, F50);
       
  3865 
       
  3866     // load expanded key[last-1] and key[last] elements
       
  3867     __ movdtox(F48,L2);
       
  3868     __ movdtox(F50,L3);
       
  3869 
       
  3870     __ and3(len_reg, 16, L4);
       
  3871     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
       
  3872     __ nop();
       
  3873 
       
  3874     __ ba_short(L_dec_first_block_start);
       
  3875 
       
  3876     __ BIND(L_expand256bit);
       
  3877     // load rest of the 256-bit key
       
  3878     for ( int i = 4;  i <= 7; i++ ) {
       
  3879       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
       
  3880     }
       
  3881 
       
  3882     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
       
  3883     for ( int i = 0;  i <= 40; i += 8 ) {
       
  3884       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
       
  3885       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
       
  3886       __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
       
  3887       __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
       
  3888     }
       
  3889     __ aes_kexpand1(F48, F54, 6, F56);
       
  3890     __ aes_kexpand2(F50, F56, F58);
       
  3891 
       
  3892     // load expanded key[last-1] and key[last] elements
       
  3893     __ movdtox(F56,L2);
       
  3894     __ movdtox(F58,L3);
       
  3895 
       
  3896     __ and3(len_reg, 16, L4);
       
  3897     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
       
  3898 
       
  3899     __ BIND(L_dec_first_block_start);
       
  3900     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
       
  3901     __ andcc(from, 7, G0);
       
  3902     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
       
  3903     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
       
  3904 
       
  3905     // aligned case: load input into L4 and L5
       
  3906     __ ldx(from,0,L4);
       
  3907     __ ldx(from,8,L5);
       
  3908     __ ba_short(L_transform_first_block);
       
  3909 
       
  3910     __ BIND(L_load_misaligned_input_first_block);
       
  3911     __ alignaddr(from, G0, from);
       
  3912     // F58, F60, F62 can be clobbered
       
  3913     __ ldf(FloatRegisterImpl::D, from, 0, F58);
       
  3914     __ ldf(FloatRegisterImpl::D, from, 8, F60);
       
  3915     __ ldf(FloatRegisterImpl::D, from, 16, F62);
       
  3916     __ faligndata(F58, F60, F58);
       
  3917     __ faligndata(F60, F62, F60);
       
  3918     __ movdtox(F58, L4);
       
  3919     __ movdtox(F60, L5);
       
  3920     __ mov(G1, from);
       
  3921 
       
  3922     __ BIND(L_transform_first_block);
       
  3923     __ xor3(L2,L4,G1);
       
  3924     __ movxtod(G1,F60);
       
  3925     __ xor3(L3,L5,G1);
       
  3926     __ movxtod(G1,F62);
       
  3927 
       
  3928     // 128-bit original key size
       
  3929     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
       
  3930 
       
  3931     // 192-bit original key size
       
  3932     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
       
  3933 
       
  3934     __ aes_dround23(F54, F60, F62, F58);
       
  3935     __ aes_dround01(F52, F60, F62, F56);
       
  3936     __ aes_dround23(F50, F56, F58, F62);
       
  3937     __ aes_dround01(F48, F56, F58, F60);
       
  3938 
       
  3939     __ BIND(L_dec_first_block192);
       
  3940     __ aes_dround23(F46, F60, F62, F58);
       
  3941     __ aes_dround01(F44, F60, F62, F56);
       
  3942     __ aes_dround23(F42, F56, F58, F62);
       
  3943     __ aes_dround01(F40, F56, F58, F60);
       
  3944 
       
  3945     __ BIND(L_dec_first_block128);
       
  3946     for ( int i = 38;  i >= 6; i -= 8 ) {
       
  3947       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
       
  3948       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
       
  3949       if ( i != 6) {
       
  3950         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
       
  3951         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
       
  3952       } else {
       
  3953         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
       
  3954         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
       
  3955       }
       
  3956     }
       
  3957 
       
  3958     __ movxtod(L0,F56);
       
  3959     __ movxtod(L1,F58);
       
  3960     __ mov(L4,L0);
       
  3961     __ mov(L5,L1);
       
  3962     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
       
  3963     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
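           // XORing with the previous ciphertext (the IV in L0:L1) completes
           // CBC decryption; the saved input block becomes the next IV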
       
  3964 
       
  3965     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
       
  3966     __ andcc(to, 7, G1);
       
  3967     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
       
  3968     __ delayed()->edge8n(to, G0, G2);
       
  3969 
       
  3970     // aligned case: store output into the destination array
       
  3971     __ stf(FloatRegisterImpl::D, F60, to, 0);
       
  3972     __ stf(FloatRegisterImpl::D, F62, to, 8);
       
  3973     __ ba_short(L_check_decrypt_end);
       
  3974 
       
  3975     __ BIND(L_store_misaligned_output_first_block);
       
  3976     __ add(to, 8, G3);
       
  3977     __ mov(8, G4);
       
  3978     __ sub(G4, G1, G4);
       
  3979     __ alignaddr(G4, G0, G4);
       
  3980     __ faligndata(F60, F60, F60);
       
  3981     __ faligndata(F62, F62, F62);
       
  3982     __ mov(to, G1);
       
  3983     __ and3(to, -8, to);
       
  3984     __ and3(G3, -8, G3);
       
  3985     __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
       
  3986     __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
       
  3987     __ add(to, 8, to);
       
  3988     __ add(G3, 8, G3);
       
  3989     __ orn(G0, G2, G2);
       
  3990     __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
       
  3991     __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
       
  3992     __ mov(G1, to);
       
  3993 
       
  3994     __ BIND(L_check_decrypt_end);
       
  3995     __ add(from, 16, from);
       
  3996     __ add(to, 16, to);
       
  3997     __ subcc(len_reg, 16, len_reg);
       
  3998     __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
       
  3999     __ delayed()->nop();
       
  4000 
       
  4001     // 256-bit original key size
       
  4002     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
       
  4003 
       
  4004     // 192-bit original key size
       
  4005     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
       
  4006 
       
  4007     __ align(OptoLoopAlignment);
       
  4008     __ BIND(L_dec_next2_blocks128);
       
  4009     __ nop();
       
  4010 
       
  4011     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
       
  4012     __ andcc(from, 7, G0);
       
  4013     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
       
  4014     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
       
  4015 
       
  4016     // aligned case: load input into G4, G5, L4 and L5
       
  4017     __ ldx(from,0,G4);
       
  4018     __ ldx(from,8,G5);
       
  4019     __ ldx(from,16,L4);
       
  4020     __ ldx(from,24,L5);
       
  4021     __ ba_short(L_transform_next2_blocks128);
       
  4022 
       
  4023     __ BIND(L_load_misaligned_next2_blocks128);
       
  4024     __ alignaddr(from, G0, from);
       
  4025     // F40, F42, F58, F60, F62 can be clobbered
       
  4026     __ ldf(FloatRegisterImpl::D, from, 0, F40);
       
  4027     __ ldf(FloatRegisterImpl::D, from, 8, F42);
       
  4028     __ ldf(FloatRegisterImpl::D, from, 16, F60);
       
  4029     __ ldf(FloatRegisterImpl::D, from, 24, F62);
       
  4030     __ ldf(FloatRegisterImpl::D, from, 32, F58);
       
  4031     __ faligndata(F40, F42, F40);
       
  4032     __ faligndata(F42, F60, F42);
       
  4033     __ faligndata(F60, F62, F60);
       
  4034     __ faligndata(F62, F58, F62);
       
  4035     __ movdtox(F40, G4);
       
  4036     __ movdtox(F42, G5);
       
  4037     __ movdtox(F60, L4);
       
  4038     __ movdtox(F62, L5);
       
  4039     __ mov(G1, from);
       
  4040 
       
  4041     __ BIND(L_transform_next2_blocks128);
       
  4042     // F40:F42 used for first 16-bytes
       
  4043     __ xor3(L2,G4,G1);
       
  4044     __ movxtod(G1,F40);
       
  4045     __ xor3(L3,G5,G1);
       
  4046     __ movxtod(G1,F42);
       
  4047 
       
  4048     // F60:F62 used for next 16-bytes
       
  4049     __ xor3(L2,L4,G1);
       
  4050     __ movxtod(G1,F60);
       
  4051     __ xor3(L3,L5,G1);
       
  4052     __ movxtod(G1,F62);
       
  4053 
       
  4054     for ( int i = 38;  i >= 6; i -= 8 ) {
       
  4055       __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
       
  4056       __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
       
  4057       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
       
  4058       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
       
  4059       if (i != 6 ) {
       
  4060         __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
       
  4061         __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
       
  4062         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
       
  4063         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
       
  4064       } else {
       
  4065         __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
       
  4066         __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
       
  4067         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
       
  4068         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
       
  4069       }
       
  4070     }
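           // the two blocks are interleaved through the round instructions so
           // that their latencies overlap (hence the _Parallel variant)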
       
  4071 
       
  4072     __ movxtod(L0,F46);
       
  4073     __ movxtod(L1,F44);
       
  4074     __ fxor(FloatRegisterImpl::D, F46, F40, F40);
       
  4075     __ fxor(FloatRegisterImpl::D, F44, F42, F42);
       
  4076 
       
  4077     __ movxtod(G4,F56);
       
  4078     __ movxtod(G5,F58);
       
  4079     __ mov(L4,L0);
       
  4080     __ mov(L5,L1);
       
  4081     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
       
  4082     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
       
  4083 
       
  4084     // For a misaligned store of the 32 bytes of result we can:

  4085     // circular right-shift all 4 FP registers so that the 'head' and 'tail'

  4086     // parts that must be stored starting at the misaligned address end up in one FP reg;

  4087     // the other 3 FP regs can then be stored with regular stores,

  4088     // and the edge + partial-store mechanism stores the 'head' and 'tail' parts
       
  4089 
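           // In the misaligned path below, F40 ends up as the wrap-around
           // register: its high lanes hold the first (8-n) output bytes
           // (stored by the first stpartialf at 'to') and its low lanes hold
           // the last n bytes (stored by the second stpartialf at 'to' + 32)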
       
  4090     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
       
  4091     __ andcc(to, 7, G1);
       
  4092     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
       
  4093     __ delayed()->edge8n(to, G0, G2);
       
  4094 
       
  4095     // aligned case: store output into the destination array
       
  4096     __ stf(FloatRegisterImpl::D, F40, to, 0);
       
  4097     __ stf(FloatRegisterImpl::D, F42, to, 8);
       
  4098     __ stf(FloatRegisterImpl::D, F60, to, 16);
       
  4099     __ stf(FloatRegisterImpl::D, F62, to, 24);
       
  4100     __ ba_short(L_check_decrypt_loop_end128);
       
  4101 
       
  4102     __ BIND(L_store_misaligned_output_next2_blocks128);
       
  4103     __ mov(8, G4);
       
  4104     __ sub(G4, G1, G4);
       
  4105     __ alignaddr(G4, G0, G4);
       
  4106     __ faligndata(F40, F42, F56); // F56 can be clobbered
       
  4107     __ faligndata(F42, F60, F42);
       
  4108     __ faligndata(F60, F62, F60);
       
  4109     __ faligndata(F62, F40, F40);
       
  4110     __ mov(to, G1);
       
  4111     __ and3(to, -8, to);
       
  4112     __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
       
  4113     __ stf(FloatRegisterImpl::D, F56, to, 8);
       
  4114     __ stf(FloatRegisterImpl::D, F42, to, 16);
       
  4115     __ stf(FloatRegisterImpl::D, F60, to, 24);
       
  4116     __ add(to, 32, to);
       
  4117     __ orn(G0, G2, G2);
       
  4118     __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
       
  4119     __ mov(G1, to);
       
  4120 
       
  4121     __ BIND(L_check_decrypt_loop_end128);
       
  4122     __ add(from, 32, from);
       
  4123     __ add(to, 32, to);
       
  4124     __ subcc(len_reg, 32, len_reg);
       
  4125     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
       
  4126     __ delayed()->nop();
       
  4127     __ ba_short(L_cbcdec_end);
       
  4128 
       
  4129     __ align(OptoLoopAlignment);
       
  4130     __ BIND(L_dec_next2_blocks192);
       
  4131     __ nop();
       
  4132 
       
  4133     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
       
  4134     __ andcc(from, 7, G0);
       
  4135     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
       
  4136     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
       
  4137 
       
  4138     // aligned case: load input into G4, G5, L4 and L5
       
  4139     __ ldx(from,0,G4);
       
  4140     __ ldx(from,8,G5);
       
  4141     __ ldx(from,16,L4);
       
  4142     __ ldx(from,24,L5);
       
  4143     __ ba_short(L_transform_next2_blocks192);
       
  4144 
       
  4145     __ BIND(L_load_misaligned_next2_blocks192);
       
  4146     __ alignaddr(from, G0, from);
       
  4147     // F48, F50, F52, F60, F62 can be clobbered
       
  4148     __ ldf(FloatRegisterImpl::D, from, 0, F48);
       
  4149     __ ldf(FloatRegisterImpl::D, from, 8, F50);
       
  4150     __ ldf(FloatRegisterImpl::D, from, 16, F60);
       
  4151     __ ldf(FloatRegisterImpl::D, from, 24, F62);
       
  4152     __ ldf(FloatRegisterImpl::D, from, 32, F52);
       
  4153     __ faligndata(F48, F50, F48);
       
  4154     __ faligndata(F50, F60, F50);
       
  4155     __ faligndata(F60, F62, F60);
       
  4156     __ faligndata(F62, F52, F62);
       
  4157     __ movdtox(F48, G4);
       
  4158     __ movdtox(F50, G5);
       
  4159     __ movdtox(F60, L4);
       
  4160     __ movdtox(F62, L5);
       
  4161     __ mov(G1, from);
       
  4162 
       
  4163     __ BIND(L_transform_next2_blocks192);
       
  4164     // F48:F50 used for first 16-bytes
       
  4165     __ xor3(L2,G4,G1);
       
  4166     __ movxtod(G1,F48);
       
  4167     __ xor3(L3,G5,G1);
       
  4168     __ movxtod(G1,F50);
       
  4169 
       
  4170     // F60:F62 used for next 16-bytes
       
  4171     __ xor3(L2,L4,G1);
       
  4172     __ movxtod(G1,F60);
       
  4173     __ xor3(L3,L5,G1);
       
  4174     __ movxtod(G1,F62);
       
  4175 
       
  4176     for ( int i = 46;  i >= 6; i -= 8 ) {
       
  4177       __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
       
  4178       __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
       
  4179       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
       
  4180       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
       
  4181       if (i != 6 ) {
       
  4182         __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
       
  4183         __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
       
  4184         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
       
  4185         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
       
  4186       } else {
       
  4187         __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
       
  4188         __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
       
  4189         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
       
  4190         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
       
  4191       }
       
  4192     }
       
  4193 
       
  4194     __ movxtod(L0,F54);
       
  4195     __ movxtod(L1,F52);
       
  4196     __ fxor(FloatRegisterImpl::D, F54, F48, F48);
       
  4197     __ fxor(FloatRegisterImpl::D, F52, F50, F50);
       
  4198 
       
  4199     __ movxtod(G4,F56);
       
  4200     __ movxtod(G5,F58);
       
  4201     __ mov(L4,L0);
       
  4202     __ mov(L5,L1);
       
  4203     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
       
  4204     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
       
  4205 
       
  4206     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
       
  4207     __ andcc(to, 7, G1);
       
  4208     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
       
  4209     __ delayed()->edge8n(to, G0, G2);
       
  4210 
       
  4211     // aligned case: store output into the destination array
       
  4212     __ stf(FloatRegisterImpl::D, F48, to, 0);
       
  4213     __ stf(FloatRegisterImpl::D, F50, to, 8);
       
  4214     __ stf(FloatRegisterImpl::D, F60, to, 16);
       
  4215     __ stf(FloatRegisterImpl::D, F62, to, 24);
       
  4216     __ ba_short(L_check_decrypt_loop_end192);
       
  4217 
       
  4218     __ BIND(L_store_misaligned_output_next2_blocks192);
       
  4219     __ mov(8, G4);
       
  4220     __ sub(G4, G1, G4);
       
  4221     __ alignaddr(G4, G0, G4);
       
  4222     __ faligndata(F48, F50, F56); // F56 can be clobbered
       
  4223     __ faligndata(F50, F60, F50);
       
  4224     __ faligndata(F60, F62, F60);
       
  4225     __ faligndata(F62, F48, F48);
       
  4226     __ mov(to, G1);
       
  4227     __ and3(to, -8, to);
       
  4228     __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
       
  4229     __ stf(FloatRegisterImpl::D, F56, to, 8);
       
  4230     __ stf(FloatRegisterImpl::D, F50, to, 16);
       
  4231     __ stf(FloatRegisterImpl::D, F60, to, 24);
       
  4232     __ add(to, 32, to);
       
  4233     __ orn(G0, G2, G2);
       
  4234     __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
       
  4235     __ mov(G1, to);
       
  4236 
       
  4237     __ BIND(L_check_decrypt_loop_end192);
       
  4238     __ add(from, 32, from);
       
  4239     __ add(to, 32, to);
       
  4240     __ subcc(len_reg, 32, len_reg);
       
  4241     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
       
  4242     __ delayed()->nop();
       
  4243     __ ba_short(L_cbcdec_end);
       
  4244 
       
  4245     __ align(OptoLoopAlignment);
       
  4246     __ BIND(L_dec_next2_blocks256);
       
  4247     __ nop();
       
  4248 
       
  4249     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
       
  4250     __ andcc(from, 7, G0);
       
  4251     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
       
  4252     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
       
  4253 
       
  4254     // aligned case: load input into G4, G5, L4 and L5
       
  4255     __ ldx(from,0,G4);
       
  4256     __ ldx(from,8,G5);
       
  4257     __ ldx(from,16,L4);
       
  4258     __ ldx(from,24,L5);
       
  4259     __ ba_short(L_transform_next2_blocks256);
       
  4260 
       
  4261     __ BIND(L_load_misaligned_next2_blocks256);
       
  4262     __ alignaddr(from, G0, from);
       
  4263     // F0, F2, F4, F60, F62 can be clobbered
       
  4264     __ ldf(FloatRegisterImpl::D, from, 0, F0);
       
  4265     __ ldf(FloatRegisterImpl::D, from, 8, F2);
       
  4266     __ ldf(FloatRegisterImpl::D, from, 16, F60);
       
  4267     __ ldf(FloatRegisterImpl::D, from, 24, F62);
       
  4268     __ ldf(FloatRegisterImpl::D, from, 32, F4);
       
  4269     __ faligndata(F0, F2, F0);
       
  4270     __ faligndata(F2, F60, F2);
       
  4271     __ faligndata(F60, F62, F60);
       
  4272     __ faligndata(F62, F4, F62);
       
  4273     __ movdtox(F0, G4);
       
  4274     __ movdtox(F2, G5);
       
  4275     __ movdtox(F60, L4);
       
  4276     __ movdtox(F62, L5);
       
  4277     __ mov(G1, from);
       
  4278 
       
  4279     __ BIND(L_transform_next2_blocks256);
       
  4280     // F0:F2 used for first 16-bytes
       
  4281     __ xor3(L2,G4,G1);
       
  4282     __ movxtod(G1,F0);
       
  4283     __ xor3(L3,G5,G1);
       
  4284     __ movxtod(G1,F2);
       
  4285 
       
  4286     // F60:F62 used for next 16-bytes
       
  4287     __ xor3(L2,L4,G1);
       
  4288     __ movxtod(G1,F60);
       
  4289     __ xor3(L3,L5,G1);
       
  4290     __ movxtod(G1,F62);
       
  4291 
       
  4292     __ aes_dround23(F54, F0, F2, F4);
       
  4293     __ aes_dround01(F52, F0, F2, F6);
       
  4294     __ aes_dround23(F54, F60, F62, F58);
       
  4295     __ aes_dround01(F52, F60, F62, F56);
       
  4296     __ aes_dround23(F50, F6, F4, F2);
       
  4297     __ aes_dround01(F48, F6, F4, F0);
       
  4298     __ aes_dround23(F50, F56, F58, F62);
       
  4299     __ aes_dround01(F48, F56, F58, F60);
       
  4300     // save F48:F54 in temp registers
       
  4301     __ movdtox(F54,G2);
       
  4302     __ movdtox(F52,G3);
       
  4303     __ movdtox(F50,G6);
       
  4304     __ movdtox(F48,G1);
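           // the 30-double 256-bit schedule plus the working state do not fit
           // in the FP file, so F48..F54 are parked in integer regs, reloaded
           // with the original key for the final rounds, and restored after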
       
  4305     for ( int i = 46;  i >= 14; i -= 8 ) {
       
  4306       __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
       
  4307       __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
       
  4308       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
       
  4309       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
       
  4310       __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
       
  4311       __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
       
  4312       __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
       
  4313       __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
       
  4314     }
       
  4315     // init F48:F54 with F0:F6 values (original key)
       
  4316     __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
       
  4317     __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
       
  4318     __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
       
  4319     __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
       
  4320     __ aes_dround23(F54, F0, F2, F4);
       
  4321     __ aes_dround01(F52, F0, F2, F6);
       
  4322     __ aes_dround23(F54, F60, F62, F58);
       
  4323     __ aes_dround01(F52, F60, F62, F56);
       
  4324     __ aes_dround23_l(F50, F6, F4, F2);
       
  4325     __ aes_dround01_l(F48, F6, F4, F0);
       
  4326     __ aes_dround23_l(F50, F56, F58, F62);
       
  4327     __ aes_dround01_l(F48, F56, F58, F60);
       
  4328     // re-init F48:F54 with their original values
       
  4329     __ movxtod(G2,F54);
       
  4330     __ movxtod(G3,F52);
       
  4331     __ movxtod(G6,F50);
       
  4332     __ movxtod(G1,F48);
       
  4333 
       
  4334     __ movxtod(L0,F6);
       
  4335     __ movxtod(L1,F4);
       
  4336     __ fxor(FloatRegisterImpl::D, F6, F0, F0);
       
  4337     __ fxor(FloatRegisterImpl::D, F4, F2, F2);
       
  4338 
       
  4339     __ movxtod(G4,F56);
       
  4340     __ movxtod(G5,F58);
       
  4341     __ mov(L4,L0);
       
  4342     __ mov(L5,L1);
       
  4343     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
       
  4344     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
       
  4345 
       
   4346     // the destination byte array may start at an arbitrary offset, so check whether 'to' is 8-byte aligned
       
  4347     __ andcc(to, 7, G1);
       
  4348     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
       
  4349     __ delayed()->edge8n(to, G0, G2);
       
  4350 
       
  4351     // aligned case: store output into the destination array
       
  4352     __ stf(FloatRegisterImpl::D, F0, to, 0);
       
  4353     __ stf(FloatRegisterImpl::D, F2, to, 8);
       
  4354     __ stf(FloatRegisterImpl::D, F60, to, 16);
       
  4355     __ stf(FloatRegisterImpl::D, F62, to, 24);
       
  4356     __ ba_short(L_check_decrypt_loop_end256);
       
  4357 
       
  4358     __ BIND(L_store_misaligned_output_next2_blocks256);
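            // Partial-store scheme (VIS): edge8n in the delay slot above produced
            // a byte mask in G2 from the low bits of 'to'. The doublewords are
            // rotated with faligndata so that stpartialf (ASI_PST8_PRIMARY) can
            // store just the leading bytes up to the first 8-byte boundary; full
            // doublewords go in the middle and the inverted mask (orn) covers the
            // trailing bytes.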
       
  4359     __ mov(8, G4);
       
  4360     __ sub(G4, G1, G4);
       
  4361     __ alignaddr(G4, G0, G4);
       
  4362     __ faligndata(F0, F2, F56); // F56 can be clobbered
       
  4363     __ faligndata(F2, F60, F2);
       
  4364     __ faligndata(F60, F62, F60);
       
  4365     __ faligndata(F62, F0, F0);
       
  4366     __ mov(to, G1);
       
  4367     __ and3(to, -8, to);
       
  4368     __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
       
  4369     __ stf(FloatRegisterImpl::D, F56, to, 8);
       
  4370     __ stf(FloatRegisterImpl::D, F2, to, 16);
       
  4371     __ stf(FloatRegisterImpl::D, F60, to, 24);
       
  4372     __ add(to, 32, to);
       
  4373     __ orn(G0, G2, G2);
       
  4374     __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
       
  4375     __ mov(G1, to);
       
  4376 
       
  4377     __ BIND(L_check_decrypt_loop_end256);
       
  4378     __ add(from, 32, from);
       
  4379     __ add(to, 32, to);
       
  4380     __ subcc(len_reg, 32, len_reg);
       
  4381     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
       
  4382     __ delayed()->nop();
       
  4383 
       
  4384     __ BIND(L_cbcdec_end);
       
   4385     // re-init initial vector for next block; 8-byte alignment is guaranteed
       
  4386     __ stx(L0, rvec, 0);
       
  4387     __ stx(L1, rvec, 8);
       
  4388     __ mov(L7, I0);
       
  4389     __ ret();
       
  4390     __ delayed()->restore();
       
  4391 
       
  4392     return start;
       
  4393   }
       
  4394 
       
  4395   address generate_sha1_implCompress(bool multi_block, const char *name) {
       
  4396     __ align(CodeEntryAlignment);
       
  4397     StubCodeMark mark(this, "StubRoutines", name);
       
  4398     address start = __ pc();
       
  4399 
       
  4400     Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop;
       
  4401     int i;
       
  4402 
       
  4403     Register buf   = O0; // byte[] source+offset
       
  4404     Register state = O1; // int[]  SHA.state
       
  4405     Register ofs   = O2; // int    offset
       
  4406     Register limit = O3; // int    limit
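            // With multi_block == false the stub compresses a single 64-byte
            // block and ignores ofs/limit; with multi_block == true it keeps
            // compressing while ofs <= limit and returns the updated offset
            // (the mov(ofs, O0) in each loop) to the Java caller.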
       
  4407 
       
  4408     // load state into F0-F4
       
  4409     for (i = 0; i < 5; i++) {
       
  4410       __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
       
  4411     }
       
  4412 
       
  4413     __ andcc(buf, 7, G0);
       
  4414     __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input);
       
  4415     __ delayed()->nop();
       
  4416 
       
  4417     __ BIND(L_sha1_loop);
       
  4418     // load buf into F8-F22
       
  4419     for (i = 0; i < 8; i++) {
       
  4420       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
       
  4421     }
       
  4422     __ sha1();
       
  4423     if (multi_block) {
       
  4424       __ add(ofs, 64, ofs);
       
  4425       __ add(buf, 64, buf);
       
  4426       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop);
       
  4427       __ mov(ofs, O0); // to be returned
       
  4428     }
       
  4429 
       
  4430     // store F0-F4 into state and return
       
  4431     for (i = 0; i < 4; i++) {
       
  4432       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
       
  4433     }
       
  4434     __ retl();
       
  4435     __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
       
  4436 
       
  4437     __ BIND(L_sha1_unaligned_input);
       
  4438     __ alignaddr(buf, G0, buf);
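            // alignaddr rounds 'buf' down to an 8-byte boundary and latches the
            // discarded low address bits in %gsr; each faligndata in the loop
            // below then extracts the misaligned 8 bytes spanning two adjacent
            // double registers, which is why 9 doublewords are loaded to yield
            // one 64-byte message block. SHA-256/512 use the same scheme.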
       
  4439 
       
  4440     __ BIND(L_sha1_unaligned_input_loop);
       
   4441     // load buf into F8-F24 (one extra doubleword for faligndata)
       
  4442     for (i = 0; i < 9; i++) {
       
  4443       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
       
  4444     }
       
  4445     for (i = 0; i < 8; i++) {
       
  4446       __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
       
  4447     }
       
  4448     __ sha1();
       
  4449     if (multi_block) {
       
  4450       __ add(ofs, 64, ofs);
       
  4451       __ add(buf, 64, buf);
       
  4452       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop);
       
  4453       __ mov(ofs, O0); // to be returned
       
  4454     }
       
  4455 
       
  4456     // store F0-F4 into state and return
       
  4457     for (i = 0; i < 4; i++) {
       
  4458       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
       
  4459     }
       
  4460     __ retl();
       
  4461     __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
       
  4462 
       
  4463     return start;
       
  4464   }
       
  4465 
       
  4466   address generate_sha256_implCompress(bool multi_block, const char *name) {
       
  4467     __ align(CodeEntryAlignment);
       
  4468     StubCodeMark mark(this, "StubRoutines", name);
       
  4469     address start = __ pc();
       
  4470 
       
  4471     Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop;
       
  4472     int i;
       
  4473 
       
  4474     Register buf   = O0; // byte[] source+offset
       
  4475     Register state = O1; // int[]  SHA2.state
       
  4476     Register ofs   = O2; // int    offset
       
  4477     Register limit = O3; // int    limit
       
  4478 
       
  4479     // load state into F0-F7
       
  4480     for (i = 0; i < 8; i++) {
       
  4481       __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
       
  4482     }
       
  4483 
       
  4484     __ andcc(buf, 7, G0);
       
  4485     __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input);
       
  4486     __ delayed()->nop();
       
  4487 
       
  4488     __ BIND(L_sha256_loop);
       
  4489     // load buf into F8-F22
       
  4490     for (i = 0; i < 8; i++) {
       
  4491       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
       
  4492     }
       
  4493     __ sha256();
       
  4494     if (multi_block) {
       
  4495       __ add(ofs, 64, ofs);
       
  4496       __ add(buf, 64, buf);
       
  4497       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop);
       
  4498       __ mov(ofs, O0); // to be returned
       
  4499     }
       
  4500 
       
  4501     // store F0-F7 into state and return
       
  4502     for (i = 0; i < 7; i++) {
       
  4503       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
       
  4504     }
       
  4505     __ retl();
       
  4506     __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
       
  4507 
       
  4508     __ BIND(L_sha256_unaligned_input);
       
  4509     __ alignaddr(buf, G0, buf);
       
  4510 
       
  4511     __ BIND(L_sha256_unaligned_input_loop);
       
   4512     // load buf into F8-F24 (one extra doubleword for faligndata)
       
  4513     for (i = 0; i < 9; i++) {
       
  4514       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
       
  4515     }
       
  4516     for (i = 0; i < 8; i++) {
       
  4517       __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
       
  4518     }
       
  4519     __ sha256();
       
  4520     if (multi_block) {
       
  4521       __ add(ofs, 64, ofs);
       
  4522       __ add(buf, 64, buf);
       
  4523       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop);
       
  4524       __ mov(ofs, O0); // to be returned
       
  4525     }
       
  4526 
       
  4527     // store F0-F7 into state and return
       
  4528     for (i = 0; i < 7; i++) {
       
  4529       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
       
  4530     }
       
  4531     __ retl();
       
  4532     __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
       
  4533 
       
  4534     return start;
       
  4535   }
       
  4536 
       
  4537   address generate_sha512_implCompress(bool multi_block, const char *name) {
       
  4538     __ align(CodeEntryAlignment);
       
  4539     StubCodeMark mark(this, "StubRoutines", name);
       
  4540     address start = __ pc();
       
  4541 
       
  4542     Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop;
       
  4543     int i;
       
  4544 
       
  4545     Register buf   = O0; // byte[] source+offset
       
  4546     Register state = O1; // long[] SHA5.state
       
  4547     Register ofs   = O2; // int    offset
       
  4548     Register limit = O3; // int    limit
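            // SHA-512 works on 128-byte message blocks and eight 64-bit state
            // words, hence the doubled load/store strides and the 128-byte
            // advance in the multi-block loop below.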
       
  4549 
       
  4550     // load state into F0-F14
       
  4551     for (i = 0; i < 8; i++) {
       
  4552       __ ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2));
       
  4553     }
       
  4554 
       
  4555     __ andcc(buf, 7, G0);
       
  4556     __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input);
       
  4557     __ delayed()->nop();
       
  4558 
       
  4559     __ BIND(L_sha512_loop);
       
  4560     // load buf into F16-F46
       
  4561     for (i = 0; i < 16; i++) {
       
  4562       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
       
  4563     }
       
  4564     __ sha512();
       
  4565     if (multi_block) {
       
  4566       __ add(ofs, 128, ofs);
       
  4567       __ add(buf, 128, buf);
       
  4568       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop);
       
  4569       __ mov(ofs, O0); // to be returned
       
  4570     }
       
  4571 
       
  4572     // store F0-F14 into state and return
       
  4573     for (i = 0; i < 7; i++) {
       
  4574       __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
       
  4575     }
       
  4576     __ retl();
       
  4577     __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
       
  4578 
       
  4579     __ BIND(L_sha512_unaligned_input);
       
  4580     __ alignaddr(buf, G0, buf);
       
  4581 
       
  4582     __ BIND(L_sha512_unaligned_input_loop);
       
   4583     // load buf into F16-F48 (one extra doubleword for faligndata)
       
  4584     for (i = 0; i < 17; i++) {
       
  4585       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
       
  4586     }
       
  4587     for (i = 0; i < 16; i++) {
       
  4588       __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16));
       
  4589     }
       
  4590     __ sha512();
       
  4591     if (multi_block) {
       
  4592       __ add(ofs, 128, ofs);
       
  4593       __ add(buf, 128, buf);
       
  4594       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop);
       
  4595       __ mov(ofs, O0); // to be returned
       
  4596     }
       
  4597 
       
  4598     // store F0-F14 into state and return
       
  4599     for (i = 0; i < 7; i++) {
       
  4600       __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
       
  4601     }
       
  4602     __ retl();
       
  4603     __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
       
  4604 
       
  4605     return start;
       
  4606   }
       
  4607 
       
  4608   /* Single and multi-block ghash operations */
       
  4609   address generate_ghash_processBlocks() {
       
  4610       __ align(CodeEntryAlignment);
       
  4611       Label L_ghash_loop, L_aligned, L_main;
       
  4612       StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
       
  4613       address start = __ pc();
       
  4614 
       
  4615       Register state = I0;
       
  4616       Register subkeyH = I1;
       
  4617       Register data = I2;
       
  4618       Register len = I3;
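              // Each loop iteration folds one 16-byte block D of 'data' into the
              // 128-bit state X as X = (X ^ D) * H over GF(2^128), where H is the
              // GCM hash subkey in 'subkeyH'; 'len' counts 16-byte blocks.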
       
  4619 
       
  4620       __ save_frame(0);
       
  4621 
       
  4622       __ ldx(state, 0, O0);
       
  4623       __ ldx(state, 8, O1);
       
  4624 
       
   4625       // Loop label for multi-block operations
       
  4626       __ BIND(L_ghash_loop);
       
  4627 
       
  4628       // Check if 'data' is unaligned
       
  4629       __ andcc(data, 7, G1);
       
  4630       __ br(Assembler::zero, false, Assembler::pt, L_aligned);
       
  4631       __ delayed()->nop();
       
  4632 
       
  4633       Register left_shift = L1;
       
  4634       Register right_shift = L2;
       
  4635       Register data_ptr = L3;
       
  4636 
       
  4637       // Get left and right shift values in bits
       
  4638       __ sll(G1, LogBitsPerByte, left_shift);
       
  4639       __ mov(64, right_shift);
       
  4640       __ sub(right_shift, left_shift, right_shift);
       
  4641 
       
  4642       // Align to read 'data'
       
  4643       __ sub(data, G1, data_ptr);
       
  4644 
       
  4645       // Load first 8 bytes of 'data'
       
  4646       __ ldx(data_ptr, 0, O4);
       
  4647       __ sllx(O4, left_shift, O4);
       
  4648       __ ldx(data_ptr, 8, O5);
       
  4649       __ srlx(O5, right_shift, G4);
       
  4650       __ bset(G4, O4);
       
  4651 
       
  4652       // Load second 8 bytes of 'data'
       
  4653       __ sllx(O5, left_shift, O5);
       
  4654       __ ldx(data_ptr, 16, G4);
       
  4655       __ srlx(G4, right_shift, G4);
       
  4656       __ ba(L_main);
       
  4657       __ delayed()->bset(G4, O5);
       
  4658 
       
  4659       // If 'data' is aligned, load normally
       
  4660       __ BIND(L_aligned);
       
  4661       __ ldx(data, 0, O4);
       
  4662       __ ldx(data, 8, O5);
       
  4663 
       
  4664       __ BIND(L_main);
       
  4665       __ ldx(subkeyH, 0, O2);
       
  4666       __ ldx(subkeyH, 8, O3);
       
  4667 
       
  4668       __ xor3(O0, O4, O0);
       
  4669       __ xor3(O1, O5, O1);
       
  4670 
       
  4671       __ xmulxhi(O0, O3, G3);
       
  4672       __ xmulx(O0, O2, O5);
       
  4673       __ xmulxhi(O1, O2, G4);
       
  4674       __ xmulxhi(O1, O3, G5);
       
  4675       __ xmulx(O0, O3, G1);
       
  4676       __ xmulx(O1, O3, G2);
       
  4677       __ xmulx(O1, O2, O3);
       
  4678       __ xmulxhi(O0, O2, O4);
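              // The eight xmulx/xmulxhi results above are the 64x64-bit carry-less
              // partial products of a schoolbook 128x128-bit multiply of (X ^ D)
              // by H; they are combined and reduced below.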
       
  4679 
       
  4680       __ mov(0xE1, O0);
       
  4681       __ sllx(O0, 56, O0);
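              // 0xE1 << 56 encodes the GCM field polynomial
              // x^128 + x^7 + x^2 + x + 1 in its bit-reflected representation and
              // drives the reduction of the 256-bit product back to 128 bits.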
       
  4682 
       
  4683       __ xor3(O5, G3, O5);
       
  4684       __ xor3(O5, G4, O5);
       
  4685       __ xor3(G5, G1, G1);
       
  4686       __ xor3(G1, O3, G1);
       
  4687       __ srlx(G2, 63, O1);
       
  4688       __ srlx(G1, 63, G3);
       
  4689       __ sllx(G2, 63, O3);
       
  4690       __ sllx(G2, 58, O2);
       
  4691       __ xor3(O3, O2, O2);
       
  4692 
       
  4693       __ sllx(G1, 1, G1);
       
  4694       __ or3(G1, O1, G1);
       
  4695 
       
  4696       __ xor3(G1, O2, G1);
       
  4697 
       
  4698       __ sllx(G2, 1, G2);
       
  4699 
       
  4700       __ xmulxhi(G1, O0, O1);
       
  4701       __ xmulx(G1, O0, O2);
       
  4702       __ xmulxhi(G2, O0, O3);
       
  4703       __ xmulx(G2, O0, G1);
       
  4704 
       
  4705       __ xor3(O4, O1, O4);
       
  4706       __ xor3(O5, O2, O5);
       
  4707       __ xor3(O5, O3, O5);
       
  4708 
       
  4709       __ sllx(O4, 1, O2);
       
  4710       __ srlx(O5, 63, O3);
       
  4711 
       
  4712       __ or3(O2, O3, O0);
       
  4713 
       
  4714       __ sllx(O5, 1, O1);
       
  4715       __ srlx(G1, 63, O2);
       
  4716       __ or3(O1, O2, O1);
       
  4717       __ xor3(O1, G3, O1);
       
  4718 
       
  4719       __ deccc(len);
       
  4720       __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
       
  4721       __ delayed()->add(data, 16, data);
       
  4722 
       
  4723       __ stx(O0, I0, 0);
       
  4724       __ stx(O1, I0, 8);
       
  4725 
       
  4726       __ ret();
       
  4727       __ delayed()->restore();
       
  4728 
       
  4729       return start;
       
  4730   }
       
  4731 
       
  4732   /**
       
  4733    *  Arguments:
       
  4734    *
       
  4735    * Inputs:
       
  4736    *   O0   - int   crc
       
  4737    *   O1   - byte* buf
       
  4738    *   O2   - int   len
       
  4739    *   O3   - int*  table
       
  4740    *
       
  4741    * Output:
       
  4742    *   O0   - int crc result
       
  4743    */
       
  4744   address generate_updateBytesCRC32C() {
       
  4745     assert(UseCRC32CIntrinsics, "need CRC32C instruction");
       
  4746 
       
  4747     __ align(CodeEntryAlignment);
       
  4748     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
       
  4749     address start = __ pc();
       
  4750 
       
  4751     const Register crc   = O0;  // crc
       
  4752     const Register buf   = O1;  // source java byte array address
       
  4753     const Register len   = O2;  // number of bytes
       
  4754     const Register table = O3;  // byteTable
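            // Thin wrapper: the actual computation is MacroAssembler::kernel_crc32c,
            // which relies on the hardware CRC32C instruction (Castagnoli
            // polynomial 0x1EDC6F41) guarded by the assert above.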
       
  4755 
       
  4756     __ kernel_crc32c(crc, buf, len, table);
       
  4757 
       
  4758     __ retl();
       
  4759     __ delayed()->nop();
       
  4760 
       
  4761     return start;
       
  4762   }
       
  4763 
       
  4764 #define ADLER32_NUM_TEMPS 16
       
  4765 
       
  4766   /**
       
  4767    *  Arguments:
       
  4768    *
       
  4769    * Inputs:
       
  4770    *   O0   - int   adler
       
  4771    *   O1   - byte* buff
       
  4772    *   O2   - int   len
       
  4773    *
       
  4774    * Output:
       
  4775    *   O0   - int adler result
       
  4776    */
       
  4777   address generate_updateBytesAdler32() {
       
  4778     __ align(CodeEntryAlignment);
       
  4779     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
       
  4780     address start = __ pc();
       
  4781 
       
  4782     Label L_cleanup_loop, L_cleanup_loop_check;
       
  4783     Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
       
  4784     Label L_nmax_check_done;
       
  4785 
       
  4786     // Aliases
       
  4787     Register s1     = O0;
       
  4788     Register s2     = O3;
       
  4789     Register buff   = O1;
       
  4790     Register len    = O2;
       
  4791     Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};
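            // The 16 temps form a rotating buffer: while the bytes in one half
            // are summed into s1/s2, the loads for the next iteration land in
            // the other half (hence the modulo indexing in the inner loop).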
       
  4792 
       
  4793     // Max number of bytes we can process before having to take the mod
       
  4794     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
       
  4795     unsigned long NMAX = 0x15B0;
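            // For reference (C sketch, not the generated code): the stub computes
            // the standard Adler-32 recurrence
            //   while (len--) { s1 += *buff++; s2 += s1; }
            //   s1 %= 65521; s2 %= 65521;   // 65521 == 0xFFF1, largest prime < 2^16
            //   return (s2 << 16) | s1;
            // except that s1/s2 are only reduced every NMAX bytes, the most that
            // can be accumulated before the sums could overflow.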
       
  4796 
       
  4797     // Zero-out the upper bits of len
       
  4798     __ clruwu(len);
       
  4799 
       
  4800     // Create the mask 0xFFFF
       
  4801     __ set64(0x00FFFF, O4, O5); // O5 is the temp register
       
  4802 
       
  4803     // s1 is initialized to the lower 16 bits of adler
       
  4804     // s2 is initialized to the upper 16 bits of adler
       
  4805     __ srlx(O0, 16, O5); // adler >> 16
       
  4806     __ and3(O0, O4, s1); // s1  = (adler & 0xFFFF)
       
  4807     __ and3(O5, O4, s2); // s2  = ((adler >> 16) & 0xFFFF)
       
  4808 
       
   4809     // The pipelined main loop needs at least 16 bytes per iteration
        
   4810     // It checks this itself, but for short inputs it is cheaper to branch straight to the cleanup loop
        
   4811     // Set up the constant for the cutoff check
       
  4812     __ mov(15, O4);
       
  4813 
       
   4814     // Check if we are above the cutoff; if not, go to the cleanup loop immediately
       
  4815     __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);
       
  4816 
       
  4817     // Free up some registers for our use
       
  4818     for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
       
  4819       __ movxtod(temp[i], as_FloatRegister(2*i));
       
  4820     }
       
  4821 
       
   4822     // Loop maintenance is done at the end of the loop, so skip there first
       
  4823     __ ba_short(L_main_loop_check);
       
  4824 
       
  4825     __ BIND(L_main_loop);
       
  4826 
       
  4827     // Prologue for inner loop
       
  4828     __ ldub(buff, 0, L0);
       
  4829     __ dec(O5);
       
  4830 
       
  4831     for (int i = 1; i < 8; i++) {
       
  4832       __ ldub(buff, i, temp[i]);
       
  4833     }
       
  4834 
       
  4835     __ inc(buff, 8);
       
  4836 
       
   4837     // Inner loop processes 16 elements at a time; it may never execute if the outer loop has
        
   4838     // only 16 elements to process
       
  4839     __ ba_short(L_inner_loop_check);
       
  4840 
       
  4841     __ BIND(L_inner_loop);
       
  4842 
       
  4843     for (int i = 0; i < 8; i++) {
       
  4844       __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
       
  4845       __ add(s1, temp[i], s1);
       
  4846       __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
       
  4847       __ add(s2, s1, s2);
       
  4848     }
       
  4849 
       
   4850     // The original temp 0-7 values have been consumed and fresh loads into temp 0-7 issued;
        
   4851     // temp 8-15 are now ready to be consumed
       
  4852     __ add(s1, I0, s1);
       
  4853     __ dec(O5);
       
  4854     __ add(s2, s1, s2);
       
  4855     __ add(s1, I1, s1);
       
  4856     __ inc(buff, 16);
       
  4857     __ add(s2, s1, s2);
       
  4858 
       
  4859     for (int i = 0; i < 6; i++) {
       
  4860       __ add(s1, temp[10+i], s1);
       
  4861       __ add(s2, s1, s2);
       
  4862     }
       
  4863 
       
  4864     __ BIND(L_inner_loop_check);
       
  4865     __ nop();
       
  4866     __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);
       
  4867 
       
  4868     // Epilogue
       
  4869     for (int i = 0; i < 4; i++) {
       
  4870       __ ldub(buff, (2*i), temp[8+(2*i)]);
       
  4871       __ add(s1, temp[i], s1);
       
  4872       __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
       
  4873       __ add(s2, s1, s2);
       
  4874     }
       
  4875 
       
  4876     __ add(s1, temp[4], s1);
       
  4877     __ inc(buff, 8);
       
  4878 
       
  4879     for (int i = 0; i < 11; i++) {
       
  4880       __ add(s2, s1, s2);
       
  4881       __ add(s1, temp[5+i], s1);
       
  4882     }
       
  4883 
       
  4884     __ add(s2, s1, s2);
       
  4885 
       
  4886     // Take the mod for s1 and s2
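            // SPARC v9 has no integer remainder instruction, so s % 65521 is
            // computed as s - (s / 65521) * 65521 via udivx/mulx/sub.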
       
  4887     __ set64(0xFFF1, L0, L1);
       
  4888     __ udivx(s1, L0, L1);
       
  4889     __ udivx(s2, L0, L2);
       
  4890     __ mulx(L0, L1, L1);
       
  4891     __ mulx(L0, L2, L2);
       
  4892     __ sub(s1, L1, s1);
       
  4893     __ sub(s2, L2, s2);
       
  4894 
       
  4895     // Make sure there is something left to process
       
  4896     __ BIND(L_main_loop_check);
       
  4897     __ set64(NMAX, L0, L1);
       
  4898     // k = len < NMAX ? len : NMAX
       
  4899     __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
       
   4900     __ andn(len, 0x0F, L0); // k = len rounded down to a multiple of 16
       
  4901     __ BIND(L_nmax_check_done);
       
  4902     __ mov(L0, O5);
       
  4903     __ sub(len, L0, len); // len -= k
       
  4904 
       
   4905     __ srlx(O5, 4, O5); // number of 16-byte chunks
       
  4906     __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);
       
  4907 
       
   4908     // Restore the saved registers, take the mod one last time, combine and return
       
  4910     for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
       
  4911       __ movdtox(as_FloatRegister(2*i), temp[i]);
       
  4912     }
       
  4913 
       
  4914     // There might be nothing left to process
       
  4915     __ ba_short(L_cleanup_loop_check);
       
  4916 
       
  4917     __ BIND(L_cleanup_loop);
       
   4918     __ ldub(buff, 0, O4); // load single byte from buffer
       
  4919     __ inc(buff); // buff++
       
  4920     __ add(s1, O4, s1); // s1 += *buff++;
       
  4921     __ dec(len); // len--
       
  4922     __ add(s1, s2, s2); // s2 += s1;
       
  4923     __ BIND(L_cleanup_loop_check);
       
  4924     __ nop();
       
  4925     __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);
       
  4926 
       
  4927     // Take the mod one last time
       
  4928     __ set64(0xFFF1, O1, O2);
       
  4929     __ udivx(s1, O1, O2);
       
  4930     __ udivx(s2, O1, O5);
       
  4931     __ mulx(O1, O2, O2);
       
  4932     __ mulx(O1, O5, O5);
       
  4933     __ sub(s1, O2, s1);
       
  4934     __ sub(s2, O5, s2);
       
  4935 
       
  4936     // Combine lower bits and higher bits
       
  4937     __ sllx(s2, 16, s2); // s2 = s2 << 16
       
  4938     __ or3(s1, s2, s1);  // adler = s2 | s1
       
  4939     // Final return value is in O0
       
  4940     __ retl();
       
  4941     __ delayed()->nop();
       
  4942 
       
  4943     return start;
       
  4944   }
       
  4945 
       
   4946   /**
       
  4947    *  Arguments:
       
  4948    *
       
  4949    * Inputs:
       
  4950    *   O0   - int   crc
       
  4951    *   O1   - byte* buf
       
  4952    *   O2   - int   len
       
  4953    *   O3   - int*  table
       
  4954    *
       
  4955    * Output:
       
  4956    *   O0   - int crc result
       
  4957    */
       
  4958   address generate_updateBytesCRC32() {
       
  4959     assert(UseCRC32Intrinsics, "need VIS3 instructions");
       
  4960 
       
  4961     __ align(CodeEntryAlignment);
       
  4962     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
       
  4963     address start = __ pc();
       
  4964 
       
  4965     const Register crc   = O0; // crc
       
  4966     const Register buf   = O1; // source java byte array address
       
  4967     const Register len   = O2; // length
       
  4968     const Register table = O3; // crc_table address (reuse register)
       
  4969 
       
  4970     __ kernel_crc32(crc, buf, len, table);
       
  4971 
       
  4972     __ retl();
       
  4973     __ delayed()->nop();
       
  4974 
       
  4975     return start;
       
  4976   }
       
  4977 
       
  4978   void generate_initial() {
       
   4979     // Generates the initial stubs and initializes their entry points
       
  4980 
       
  4981     //------------------------------------------------------------------------------------------------------------------------
       
  4982     // entry points that exist in all platforms
       
  4983     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
       
  4984     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
       
  4985     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
       
  4986 
       
  4987     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
       
  4988     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
       
  4989 
       
  4990     //------------------------------------------------------------------------------------------------------------------------
       
  4991     // entry points that are platform specific
       
  4992     StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
       
  4993 
       
  4994     StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
       
  4995     StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
       
  4996 
       
  4997     // Build this early so it's available for the interpreter.
       
  4998     StubRoutines::_throw_StackOverflowError_entry =
       
  4999             generate_throw_exception("StackOverflowError throw_exception",
       
  5000             CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
       
  5001     StubRoutines::_throw_delayed_StackOverflowError_entry =
       
  5002             generate_throw_exception("delayed StackOverflowError throw_exception",
       
  5003             CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
       
  5004 
       
  5005     if (UseCRC32Intrinsics) {
       
   5006       // set table address before generating stubs that use it
       
  5007       StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
       
  5008       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
       
  5009     }
       
  5010 
       
  5011     if (UseCRC32CIntrinsics) {
       
  5012       // set table address before stub generation which use it
       
  5013       StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
       
  5014       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
       
  5015     }
       
  5016   }
       
  5017 
       
  5018 
       
  5019   void generate_all() {
       
   5020     // Generates the remaining stubs and initializes the entry points
       
  5021 
       
  5022     // Generate partial_subtype_check first here since its code depends on
       
  5023     // UseZeroBaseCompressedOops which is defined after heap initialization.
       
  5024     StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
       
  5025     // These entry points require SharedInfo::stack0 to be set up in non-core builds
       
  5026     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
       
  5027     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
       
  5028     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
       
  5029 
       
  5030     // support for verify_oop (must happen after universe_init)
       
  5031     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
       
  5032 
       
  5033     // arraycopy stubs used by compilers
       
  5034     generate_arraycopy_stubs();
       
  5035 
       
  5036     // Don't initialize the platform math functions since sparc
       
  5037     // doesn't have intrinsics for these operations.
       
  5038 
       
  5039     // Safefetch stubs.
       
  5040     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
       
  5041                                                        &StubRoutines::_safefetch32_fault_pc,
       
  5042                                                        &StubRoutines::_safefetch32_continuation_pc);
       
  5043     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
       
  5044                                                        &StubRoutines::_safefetchN_fault_pc,
       
  5045                                                        &StubRoutines::_safefetchN_continuation_pc);
       
  5046 
       
  5047     // generate AES intrinsics code
       
  5048     if (UseAESIntrinsics) {
       
  5049       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       
  5050       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
       
  5051       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
       
  5052       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
       
  5053     }
       
  5054     // generate GHASH intrinsics code
       
  5055     if (UseGHASHIntrinsics) {
       
  5056       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
       
  5057     }
       
  5058 
       
  5059     // generate SHA1/SHA256/SHA512 intrinsics code
       
  5060     if (UseSHA1Intrinsics) {
       
  5061       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
       
  5062       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
       
  5063     }
       
  5064     if (UseSHA256Intrinsics) {
       
  5065       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
       
  5066       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
       
  5067     }
       
  5068     if (UseSHA512Intrinsics) {
       
  5069       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
       
  5070       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
       
  5071     }
       
  5072     // generate Adler32 intrinsics code
       
  5073     if (UseAdler32Intrinsics) {
       
  5074       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
       
  5075     }
       
  5076   }
       
  5077 
       
  5078 
       
  5079  public:
       
  5080   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
       
  5081     // replace the standard masm with a special one:
       
  5082     _masm = new MacroAssembler(code);
       
  5083 
       
  5084     _stub_count = !all ? 0x100 : 0x200;
       
  5085     if (all) {
       
  5086       generate_all();
       
  5087     } else {
       
  5088       generate_initial();
       
  5089     }
       
  5090 
       
  5091     // make sure this stub is available for all local calls
       
  5092     if (_atomic_add_stub.is_unbound()) {
       
  5093       // generate a second time, if necessary
       
  5094       (void) generate_atomic_add();
       
  5095     }
       
  5096   }
       
  5097 
       
  5098 
       
  5099  private:
       
  5100   int _stub_count;
       
  5101   void stub_prolog(StubCodeDesc* cdesc) {
       
  5102     # ifdef ASSERT
       
  5103       // put extra information in the stub code, to make it more readable
       
  5104       // Write the high part of the address
       
  5105       // [RGV] Check if there is a dependency on the size of this prolog
       
  5106       __ emit_data((intptr_t)cdesc >> 32,    relocInfo::none);
       
  5107       __ emit_data((intptr_t)cdesc,    relocInfo::none);
       
  5108       __ emit_data(++_stub_count, relocInfo::none);
       
  5109     # endif
       
  5110     align(true);
       
  5111   }
       
  5112 
       
  5113   void align(bool at_header = false) {
       
  5114     // %%%%% move this constant somewhere else
       
  5115     // UltraSPARC cache line size is 8 instructions:
       
  5116     const unsigned int icache_line_size = 32;
       
  5117     const unsigned int icache_half_line_size = 16;
       
  5118 
       
  5119     if (at_header) {
       
  5120       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
       
  5121         __ emit_data(0, relocInfo::none);
       
  5122       }
       
  5123     } else {
       
  5124       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
       
  5125         __ nop();
       
  5126       }
       
  5127     }
       
  5128   }
       
  5129 
       
  5130 }; // end class declaration
       
  5131 
       
  5132 void StubGenerator_generate(CodeBuffer* code, bool all) {
       
  5133   StubGenerator g(code, all);
       
  5134 }