hotspot/src/cpu/arm/vm/stubGenerator_arm.cpp
changeset 42664 29142a56c193
       
     1 /*
       
     2  * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.
       
     8  *
       
     9  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    12  * version 2 for more details (a copy is included in the LICENSE file that
       
    13  * accompanied this code).
       
    14  *
       
    15  * You should have received a copy of the GNU General Public License version
       
    16  * 2 along with this work; if not, write to the Free Software Foundation,
       
    17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    18  *
       
    19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    20  * or visit www.oracle.com if you need additional information or have any
       
    21  * questions.
       
    22  *
       
    23  */
       
    24 
       
    25 #include "precompiled.hpp"
       
    26 #include "asm/assembler.hpp"
       
    27 #include "assembler_arm.inline.hpp"
       
    28 #include "interpreter/interpreter.hpp"
       
    29 #include "nativeInst_arm.hpp"
       
    30 #include "oops/instanceOop.hpp"
       
    31 #include "oops/method.hpp"
       
    32 #include "oops/objArrayKlass.hpp"
       
    33 #include "oops/oop.inline.hpp"
       
    34 #include "prims/methodHandles.hpp"
       
    35 #include "runtime/frame.inline.hpp"
       
    36 #include "runtime/handles.inline.hpp"
       
    37 #include "runtime/sharedRuntime.hpp"
       
    38 #include "runtime/stubCodeGenerator.hpp"
       
    39 #include "runtime/stubRoutines.hpp"
       
    40 #ifdef COMPILER2
       
    41 #include "opto/runtime.hpp"
       
    42 #endif
       
    43 
       
    44 // Declaration and definition of StubGenerator (no .hpp file).
       
    45 // For a more detailed description of the stub routine structure
       
    46 // see the comment in stubRoutines.hpp
       
    47 
       
    48 #define __ _masm->
       
    49 
       
    50 #ifdef PRODUCT
       
    51 #define BLOCK_COMMENT(str) /* nothing */
       
    52 #else
       
    53 #define BLOCK_COMMENT(str) __ block_comment(str)
       
    54 #endif
       
    55 
       
    56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
       
    57 
       
    58 // -------------------------------------------------------------------------------------------------------------------------
       
    59 // Stub Code definitions
       
    60 
       
    61 // Platform dependent parameters for array copy stubs
       
    62 
       
     63 // Note: we have noticed huge changes in microbenchmark behavior
        
     64 // from platform to platform depending on the configuration.
       
    65 
       
     66 // Instead of adding a series of command line options (which
        
     67 // unfortunately would have to be defined in the shared code and could
        
     68 // not appear only in the ARM port), the tested results are hard-coded
        
     69 // here as a set of configurations, selected by specifying 'ArmCopyPlatform'.
       
    70 
       
    71 // Currently, this 'platform' is hardcoded to a value that is a good
       
    72 // enough trade-off.  However, one can easily modify this file to test
       
    73 // the hard-coded configurations or create new ones. If the gain is
       
    74 // significant, we could decide to either add command line options or
       
    75 // add code to automatically choose a configuration.
       
    76 
       
    77 // see comments below for the various configurations created
       
    78 #define DEFAULT_ARRAYCOPY_CONFIG 0
       
    79 #define TEGRA2_ARRAYCOPY_CONFIG 1
       
    80 #define IMX515_ARRAYCOPY_CONFIG 2
       
    81 
       
    82 // Hard coded choices (XXX: could be changed to a command line option)
       
    83 #define ArmCopyPlatform DEFAULT_ARRAYCOPY_CONFIG
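        // For example, to experiment with the i.MX515-tuned settings, the define
        // above could locally be changed to:
        //   #define ArmCopyPlatform IMX515_ARRAYCOPY_CONFIG
        // (illustrative only; the shipped default keeps the trade-off configuration)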
       
    84 
       
    85 #ifdef AARCH64
       
    86 #define ArmCopyCacheLineSize 64
       
    87 #else
       
    88 #define ArmCopyCacheLineSize 32 // not worth optimizing to 64 according to measured gains
       
    89 #endif // AARCH64
       
    90 
       
    91 // TODO-AARCH64: tune and revise AArch64 arraycopy optimizations
       
    92 
       
    93 // configuration for each kind of loop
       
    94 typedef struct {
       
     95   int pld_distance;       // prefetch distance (0 => no prefetch, <0 => prefetch before)
        
     96 #ifndef AARCH64
        
     97   bool split_ldm;         // if true, split each LDM into LDMs with fewer registers
        
     98   bool split_stm;         // if true, split each STM into STMs with fewer registers
       
    99 #endif // !AARCH64
       
   100 } arraycopy_loop_config;
       
   101 
       
   102 // configuration for all loops
       
   103 typedef struct {
       
   104   // const char *description;
       
   105   arraycopy_loop_config forward_aligned;
       
   106   arraycopy_loop_config backward_aligned;
       
   107   arraycopy_loop_config forward_shifted;
       
   108   arraycopy_loop_config backward_shifted;
       
   109 } arraycopy_platform_config;
       
   110 
       
   111 // configured platforms
       
   112 static arraycopy_platform_config arraycopy_configurations[] = {
       
   113   // configuration parameters for arraycopy loops
       
   114 #ifdef AARCH64
       
   115   {
       
   116     {-256 }, // forward aligned
       
   117     {-128 }, // backward aligned
       
   118     {-256 }, // forward shifted
       
   119     {-128 }  // backward shifted
       
   120   }
       
   121 #else
       
   122 
       
   123   // Configurations were chosen based on manual analysis of benchmark
       
   124   // results, minimizing overhead with respect to best results on the
       
   125   // different test cases.
       
   126 
       
   127   // Prefetch before is always favored since it avoids dirtying the
       
   128   // cache uselessly for small copies. Code for prefetch after has
       
   129   // been kept in case the difference is significant for some
       
   130   // platforms but we might consider dropping it.
       
   131 
       
   132   // distance, ldm, stm
       
   133   {
       
   134     // default: tradeoff tegra2/imx515/nv-tegra2,
       
   135     // Notes on benchmarking:
       
   136     // - not far from optimal configuration on nv-tegra2
       
   137     // - within 5% of optimal configuration except for backward aligned on IMX
       
    138     // - up to 40% from the optimal configuration for backward shifted and backward aligned for tegra2
       
   139     //   but still on par with the operating system copy
       
   140     {-256, true,  true  }, // forward aligned
       
   141     {-256, true,  true  }, // backward aligned
       
   142     {-256, false, false }, // forward shifted
       
   143     {-256, true,  true  } // backward shifted
       
   144   },
       
   145   {
       
   146     // configuration tuned on tegra2-4.
       
    147     // Warning: should not be used on nv-tegra2!
        
    148     // Notes:
        
    149     // - prefetch after gives 40% gain on backward copies on tegra2-4,
        
    150     //   resulting in better numbers than the operating system
        
    151     //   copy. However, this can lead to a 300% loss on nv-tegra and has
        
    152     //   more impact on the cache (fetches further than what is
       
   153     //   copied). Use this configuration with care, in case it improves
       
   154     //   reference benchmarks.
       
   155     {-256, true,  true  }, // forward aligned
       
   156     {96,   false, false }, // backward aligned
       
   157     {-256, false, false }, // forward shifted
       
   158     {96,   false, false } // backward shifted
       
   159   },
       
   160   {
       
   161     // configuration tuned on imx515
       
   162     // Notes:
       
    163     // - a smaller prefetch distance is sufficient to get good results and might be more stable
        
    164     // - refined backward aligned options are within 5% of the optimal configuration except for
        
    165     //   tests where the arrays fit in the cache
       
   166     {-160, false, false }, // forward aligned
       
   167     {-160, false, false }, // backward aligned
       
   168     {-160, false, false }, // forward shifted
       
   169     {-160, true,  true  } // backward shifted
       
   170   }
       
   171 #endif // AARCH64
       
   172 };
       
   173 
       
   174 class StubGenerator: public StubCodeGenerator {
       
   175 
       
   176 #ifdef PRODUCT
       
   177 #define inc_counter_np(a,b,c) ((void)0)
       
   178 #else
       
   179 #define inc_counter_np(counter, t1, t2) \
       
   180   BLOCK_COMMENT("inc_counter " #counter); \
       
   181   __ inc_counter(&counter, t1, t2);
       
   182 #endif
       
   183 
       
   184  private:
       
   185 
       
   186   address generate_call_stub(address& return_address) {
       
   187     StubCodeMark mark(this, "StubRoutines", "call_stub");
       
   188     address start = __ pc();
       
   189 
       
   190 #ifdef AARCH64
       
   191     const int saved_regs_size = 192;
       
   192 
       
   193     __ stp(FP, LR, Address(SP, -saved_regs_size, pre_indexed));
       
   194     __ mov(FP, SP);
       
   195 
       
   196     int sp_offset = 16;
       
   197     assert(frame::entry_frame_call_wrapper_offset * wordSize == sp_offset, "adjust this code");
       
   198     __ stp(R0,  ZR,  Address(SP, sp_offset)); sp_offset += 16;
       
   199 
       
   200     const int saved_result_and_result_type_offset = sp_offset;
       
   201     __ stp(R1,  R2,  Address(SP, sp_offset)); sp_offset += 16;
       
   202     __ stp(R19, R20, Address(SP, sp_offset)); sp_offset += 16;
       
   203     __ stp(R21, R22, Address(SP, sp_offset)); sp_offset += 16;
       
   204     __ stp(R23, R24, Address(SP, sp_offset)); sp_offset += 16;
       
   205     __ stp(R25, R26, Address(SP, sp_offset)); sp_offset += 16;
       
   206     __ stp(R27, R28, Address(SP, sp_offset)); sp_offset += 16;
       
   207 
       
   208     __ stp_d(V8,  V9,  Address(SP, sp_offset)); sp_offset += 16;
       
   209     __ stp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16;
       
   210     __ stp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16;
       
   211     __ stp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16;
       
   212     assert (sp_offset == saved_regs_size, "adjust this code");
       
   213 
       
   214     __ mov(Rmethod, R3);
       
   215     __ mov(Rthread, R7);
       
   216     __ reinit_heapbase();
       
   217 
       
   218     { // Pass parameters
       
   219       Label done_parameters, pass_parameters;
       
   220 
       
   221       __ mov(Rparams, SP);
       
   222       __ cbz_w(R6, done_parameters);
       
   223 
       
   224       __ sub(Rtemp, SP, R6, ex_uxtw, LogBytesPerWord);
       
   225       __ align_reg(SP, Rtemp, StackAlignmentInBytes);
       
   226       __ add(Rparams, SP, R6, ex_uxtw, LogBytesPerWord);
       
   227 
       
   228       __ bind(pass_parameters);
       
   229       __ subs_w(R6, R6, 1);
       
   230       __ ldr(Rtemp, Address(R5, wordSize, post_indexed));
       
   231       __ str(Rtemp, Address(Rparams, -wordSize, pre_indexed));
       
   232       __ b(pass_parameters, ne);
       
   233 
       
   234       __ bind(done_parameters);
       
   235 
       
   236 #ifdef ASSERT
       
   237       {
       
   238         Label L;
       
   239         __ cmp(SP, Rparams);
       
   240         __ b(L, eq);
       
   241         __ stop("SP does not match Rparams");
       
   242         __ bind(L);
       
   243       }
       
   244 #endif
       
   245     }
       
   246 
       
   247     __ mov(Rsender_sp, SP);
       
   248     __ blr(R4);
       
   249     return_address = __ pc();
       
   250 
       
   251     __ mov(SP, FP);
       
   252 
       
   253     __ ldp(R1, R2, Address(SP, saved_result_and_result_type_offset));
       
   254 
       
   255     { // Handle return value
       
   256       Label cont;
       
   257       __ str(R0, Address(R1));
       
   258 
       
   259       __ cmp_w(R2, T_DOUBLE);
       
   260       __ ccmp_w(R2, T_FLOAT, Assembler::flags_for_condition(eq), ne);
       
   261       __ b(cont, ne);
       
   262 
       
   263       __ str_d(V0, Address(R1));
       
   264       __ bind(cont);
       
   265     }
       
   266 
       
   267     sp_offset = saved_result_and_result_type_offset + 16;
       
   268     __ ldp(R19, R20, Address(SP, sp_offset)); sp_offset += 16;
       
   269     __ ldp(R21, R22, Address(SP, sp_offset)); sp_offset += 16;
       
   270     __ ldp(R23, R24, Address(SP, sp_offset)); sp_offset += 16;
       
   271     __ ldp(R25, R26, Address(SP, sp_offset)); sp_offset += 16;
       
   272     __ ldp(R27, R28, Address(SP, sp_offset)); sp_offset += 16;
       
   273 
       
   274     __ ldp_d(V8,  V9,  Address(SP, sp_offset)); sp_offset += 16;
       
   275     __ ldp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16;
       
   276     __ ldp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16;
       
   277     __ ldp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16;
       
   278     assert (sp_offset == saved_regs_size, "adjust this code");
       
   279 
       
   280     __ ldp(FP, LR, Address(SP, saved_regs_size, post_indexed));
       
   281     __ ret();
       
   282 
       
   283 #else // AARCH64
       
   284 
       
   285     assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code");
       
   286 
       
   287     __ mov(Rtemp, SP);
       
   288     __ push(RegisterSet(FP) | RegisterSet(LR));
       
   289 #ifndef __SOFTFP__
       
   290     __ fstmdbd(SP, FloatRegisterSet(D8, 8), writeback);
       
   291 #endif
       
   292     __ stmdb(SP, RegisterSet(R0, R2) | RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11, writeback);
       
   293     __ mov(Rmethod, R3);
       
   294     __ ldmia(Rtemp, RegisterSet(R1, R3) | Rthread); // stacked arguments
       
   295 
       
   296     // XXX: TODO
       
   297     // Would be better with respect to native tools if the following
       
   298     // setting of FP was changed to conform to the native ABI, with FP
       
   299     // pointing to the saved FP slot (and the corresponding modifications
       
   300     // for entry_frame_call_wrapper_offset and frame::real_fp).
       
   301     __ mov(FP, SP);
       
   302 
       
   303     {
       
   304       Label no_parameters, pass_parameters;
       
   305       __ cmp(R3, 0);
       
   306       __ b(no_parameters, eq);
       
   307 
       
   308       __ bind(pass_parameters);
       
   309       __ ldr(Rtemp, Address(R2, wordSize, post_indexed)); // Rtemp OK, unused and scratchable
       
   310       __ subs(R3, R3, 1);
       
   311       __ push(Rtemp);
       
   312       __ b(pass_parameters, ne);
       
   313       __ bind(no_parameters);
       
   314     }
       
   315 
       
   316     __ mov(Rsender_sp, SP);
       
   317     __ blx(R1);
       
   318     return_address = __ pc();
       
   319 
       
   320     __ add(SP, FP, wordSize); // Skip link to JavaCallWrapper
       
   321     __ pop(RegisterSet(R2, R3));
       
   322 #ifndef __ABI_HARD__
       
   323     __ cmp(R3, T_LONG);
       
   324     __ cmp(R3, T_DOUBLE, ne);
       
   325     __ str(R0, Address(R2));
       
   326     __ str(R1, Address(R2, wordSize), eq);
       
   327 #else
       
   328     Label cont, l_float, l_double;
       
   329 
       
   330     __ cmp(R3, T_DOUBLE);
       
   331     __ b(l_double, eq);
       
   332 
       
   333     __ cmp(R3, T_FLOAT);
       
   334     __ b(l_float, eq);
       
   335 
       
   336     __ cmp(R3, T_LONG);
       
   337     __ str(R0, Address(R2));
       
   338     __ str(R1, Address(R2, wordSize), eq);
       
   339     __ b(cont);
       
   340 
       
   341 
       
   342     __ bind(l_double);
       
   343     __ fstd(D0, Address(R2));
       
   344     __ b(cont);
       
   345 
       
   346     __ bind(l_float);
       
   347     __ fsts(S0, Address(R2));
       
   348 
       
   349     __ bind(cont);
       
   350 #endif
       
   351 
       
   352     __ pop(RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11);
       
   353 #ifndef __SOFTFP__
       
   354     __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback);
       
   355 #endif
       
   356     __ pop(RegisterSet(FP) | RegisterSet(PC));
       
   357 
       
   358 #endif // AARCH64
       
   359     return start;
       
   360   }
       
   361 
       
   362 
       
   363   // (in) Rexception_obj: exception oop
       
   364   address generate_catch_exception() {
       
   365     StubCodeMark mark(this, "StubRoutines", "catch_exception");
       
   366     address start = __ pc();
       
   367 
       
   368     __ str(Rexception_obj, Address(Rthread, Thread::pending_exception_offset()));
       
   369     __ b(StubRoutines::_call_stub_return_address);
       
   370 
       
   371     return start;
       
   372   }
       
   373 
       
   374 
       
   375   // (in) Rexception_pc: return address
       
   376   address generate_forward_exception() {
       
   377     StubCodeMark mark(this, "StubRoutines", "forward exception");
       
   378     address start = __ pc();
       
   379 
       
   380     __ mov(c_rarg0, Rthread);
       
   381     __ mov(c_rarg1, Rexception_pc);
       
   382     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
       
   383                          SharedRuntime::exception_handler_for_return_address),
       
   384                          c_rarg0, c_rarg1);
       
   385     __ ldr(Rexception_obj, Address(Rthread, Thread::pending_exception_offset()));
       
   386     const Register Rzero = __ zero_register(Rtemp); // Rtemp OK (cleared by above call)
       
   387     __ str(Rzero, Address(Rthread, Thread::pending_exception_offset()));
       
   388 
       
   389 #ifdef ASSERT
       
   390     // make sure exception is set
       
   391     { Label L;
       
   392       __ cbnz(Rexception_obj, L);
       
   393       __ stop("StubRoutines::forward exception: no pending exception (2)");
       
   394       __ bind(L);
       
   395     }
       
   396 #endif
       
   397 
       
    398     // Verify that there is really a valid exception in Rexception_obj.
       
   399     __ verify_oop(Rexception_obj);
       
   400 
       
   401     __ jump(R0); // handler is returned in R0 by runtime function
       
   402     return start;
       
   403   }
       
   404 
       
   405 
       
   406 #ifndef AARCH64
       
   407 
       
   408   // Integer division shared routine
       
   409   //   Input:
       
   410   //     R0  - dividend
       
   411   //     R2  - divisor
       
   412   //   Output:
       
   413   //     R0  - remainder
       
   414   //     R1  - quotient
       
   415   //   Destroys:
       
   416   //     R2
       
   417   //     LR
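          // Rough C sketch of the positive-arguments fast path generated below
          // (clz-based restoring division, unrolled 32x in the stub). The function
          // name and __builtin_clz are illustrative only; signs and division by
          // zero are handled separately via the slow path:
          //
          //   unsigned idiv_irem(unsigned dividend, unsigned divisor, unsigned* rem) {
          //     int shift = __builtin_clz(divisor) - __builtin_clz(dividend);
          //     unsigned quotient = 0;
          //     for (int i = shift; i >= 0; i--) {
          //       if (dividend >= (divisor << i)) {
          //         dividend -= (divisor << i);
          //         quotient |= (1u << i);
          //       }
          //     }
          //     *rem = dividend;      // R0 = remainder, R1 = quotient
          //     return quotient;
          //   }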
       
   418   address generate_idiv_irem() {
       
   419     Label positive_arguments, negative_or_zero, call_slow_path;
       
   420     Register dividend  = R0;
       
   421     Register divisor   = R2;
       
   422     Register remainder = R0;
       
   423     Register quotient  = R1;
       
   424     Register tmp       = LR;
       
   425     assert(dividend == remainder, "must be");
       
   426 
       
   427     address start = __ pc();
       
   428 
       
   429     // Check for special cases: divisor <= 0 or dividend < 0
       
   430     __ cmp(divisor, 0);
       
   431     __ orrs(quotient, dividend, divisor, ne);
       
   432     __ b(negative_or_zero, le);
       
   433 
       
   434     __ bind(positive_arguments);
       
   435     // Save return address on stack to free one extra register
       
   436     __ push(LR);
       
    437     // Approximate the maximum order of the quotient
       
   438     __ clz(tmp, dividend);
       
   439     __ clz(quotient, divisor);
       
   440     __ subs(tmp, quotient, tmp);
       
   441     __ mov(quotient, 0);
       
   442     // Jump to the appropriate place in the unrolled loop below
       
   443     __ ldr(PC, Address(PC, tmp, lsl, 2), pl);
       
   444     // If divisor is greater than dividend, return immediately
       
   445     __ pop(PC);
       
   446 
       
   447     // Offset table
       
   448     Label offset_table[32];
       
   449     int i;
       
   450     for (i = 0; i <= 31; i++) {
       
   451       __ emit_address(offset_table[i]);
       
   452     }
       
   453 
       
   454     // Unrolled loop of 32 division steps
       
   455     for (i = 31; i >= 0; i--) {
       
   456       __ bind(offset_table[i]);
       
   457       __ cmp(remainder, AsmOperand(divisor, lsl, i));
       
   458       __ sub(remainder, remainder, AsmOperand(divisor, lsl, i), hs);
       
   459       __ add(quotient, quotient, 1 << i, hs);
       
   460     }
       
   461     __ pop(PC);
       
   462 
       
   463     __ bind(negative_or_zero);
       
   464     // Find the combination of argument signs and jump to corresponding handler
       
   465     __ andr(quotient, dividend, 0x80000000, ne);
       
   466     __ orr(quotient, quotient, AsmOperand(divisor, lsr, 31), ne);
       
   467     __ add(PC, PC, AsmOperand(quotient, ror, 26), ne);
       
   468     __ str(LR, Address(Rthread, JavaThread::saved_exception_pc_offset()));
       
   469 
       
   470     // The leaf runtime function can destroy R0-R3 and R12 registers which are still alive
       
   471     RegisterSet saved_registers = RegisterSet(R3) | RegisterSet(R12);
       
   472 #if R9_IS_SCRATCHED
       
   473     // Safer to save R9 here since callers may have been written
       
   474     // assuming R9 survives. This is suboptimal but may not be worth
       
   475     // revisiting for this slow case.
       
   476 
       
   477     // save also R10 for alignment
       
   478     saved_registers = saved_registers | RegisterSet(R9, R10);
       
   479 #endif
       
   480     {
       
   481       // divisor == 0
       
   482       FixedSizeCodeBlock zero_divisor(_masm, 8, true);
       
   483       __ push(saved_registers);
       
   484       __ mov(R0, Rthread);
       
   485       __ mov(R1, LR);
       
   486       __ mov(R2, SharedRuntime::IMPLICIT_DIVIDE_BY_ZERO);
       
   487       __ b(call_slow_path);
       
   488     }
       
   489 
       
   490     {
       
   491       // divisor > 0 && dividend < 0
       
   492       FixedSizeCodeBlock positive_divisor_negative_dividend(_masm, 8, true);
       
   493       __ push(LR);
       
   494       __ rsb(dividend, dividend, 0);
       
   495       __ bl(positive_arguments);
       
   496       __ rsb(remainder, remainder, 0);
       
   497       __ rsb(quotient, quotient, 0);
       
   498       __ pop(PC);
       
   499     }
       
   500 
       
   501     {
       
   502       // divisor < 0 && dividend > 0
       
   503       FixedSizeCodeBlock negative_divisor_positive_dividend(_masm, 8, true);
       
   504       __ push(LR);
       
   505       __ rsb(divisor, divisor, 0);
       
   506       __ bl(positive_arguments);
       
   507       __ rsb(quotient, quotient, 0);
       
   508       __ pop(PC);
       
   509     }
       
   510 
       
   511     {
       
   512       // divisor < 0 && dividend < 0
       
   513       FixedSizeCodeBlock negative_divisor_negative_dividend(_masm, 8, true);
       
   514       __ push(LR);
       
   515       __ rsb(dividend, dividend, 0);
       
   516       __ rsb(divisor, divisor, 0);
       
   517       __ bl(positive_arguments);
       
   518       __ rsb(remainder, remainder, 0);
       
   519       __ pop(PC);
       
   520     }
       
   521 
       
   522     __ bind(call_slow_path);
       
   523     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::continuation_for_implicit_exception));
       
   524     __ pop(saved_registers);
       
   525     __ bx(R0);
       
   526 
       
   527     return start;
       
   528   }
       
   529 
       
   530 
       
   531  // As per atomic.hpp the Atomic read-modify-write operations must be logically implemented as:
       
   532  //  <fence>; <op>; <membar StoreLoad|StoreStore>
       
   533  // But for load-linked/store-conditional based systems a fence here simply means
       
   534  // no load/store can be reordered with respect to the initial load-linked, so we have:
       
   535  // <membar storeload|loadload> ; load-linked; <op>; store-conditional; <membar storeload|storestore>
       
   536  // There are no memory actions in <op> so nothing further is needed.
       
   537  //
       
   538  // So we define the following for convenience:
       
   539 #define MEMBAR_ATOMIC_OP_PRE \
       
   540     MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::LoadLoad)
       
   541 #define MEMBAR_ATOMIC_OP_POST \
       
   542     MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::StoreStore)
       
   543 
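          // Illustrative sketch of how these barriers are meant to bracket a
          // load-linked/store-conditional read-modify-write (ldrex/strex written
          // as pseudo-operations, not functions defined in this file):
          //
          //   membar(MEMBAR_ATOMIC_OP_PRE);               // StoreLoad|LoadLoad
          //   do {
          //     old_value = ldrex(dest);
          //     new_value = <op>(old_value);
          //   } while (strex(dest, new_value) != 0);      // retry until the exclusive store succeeds
          //   membar(MEMBAR_ATOMIC_OP_POST);              // StoreLoad|StoreStore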
       
   544   // Note: JDK 9 only supports ARMv7+ so we always have ldrexd available even though the
       
   545   // code below allows for it to be otherwise. The else clause indicates an ARMv5 system
       
   546   // for which we do not support MP and so membars are not necessary. This ARMv5 code will
       
   547   // be removed in the future.
       
   548 
       
   549   // Support for jint Atomic::add(jint add_value, volatile jint *dest)
       
   550   //
       
   551   // Arguments :
       
   552   //
       
   553   //      add_value:      R0
       
   554   //      dest:           R1
       
   555   //
       
   556   // Results:
       
   557   //
       
    558   //     R0: the new value stored in dest
       
   559   //
       
   560   // Overwrites:
       
   561   //
       
   562   //     R1, R2, R3
       
   563   //
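          // Minimal sketch of the intended semantics (illustrative C, not part of the
          // VM API; store_exclusive stands for the strex retry check). The ldrex/strex
          // path below implements this loop with MEMBAR_ATOMIC_OP_PRE/POST around it:
          //
          //   jint atomic_add(jint add_value, volatile jint* dest) {
          //     jint new_value;
          //     do {
          //       new_value = *dest + add_value;              // ldrex + add
          //     } while (!store_exclusive(dest, new_value));  // strex retry
          //     return new_value;                             // R0 = new value
          //   }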
       
   564   address generate_atomic_add() {
       
   565     address start;
       
   566 
       
   567     StubCodeMark mark(this, "StubRoutines", "atomic_add");
       
   568     Label retry;
       
   569     start = __ pc();
       
   570     Register addval    = R0;
       
   571     Register dest      = R1;
       
   572     Register prev      = R2;
       
   573     Register ok        = R2;
       
   574     Register newval    = R3;
       
   575 
       
   576     if (VM_Version::supports_ldrex()) {
       
   577       __ membar(MEMBAR_ATOMIC_OP_PRE, prev);
       
   578       __ bind(retry);
       
   579       __ ldrex(newval, Address(dest));
       
   580       __ add(newval, addval, newval);
       
   581       __ strex(ok, newval, Address(dest));
       
   582       __ cmp(ok, 0);
       
   583       __ b(retry, ne);
       
   584       __ mov (R0, newval);
       
   585       __ membar(MEMBAR_ATOMIC_OP_POST, prev);
       
   586     } else {
       
   587       __ bind(retry);
       
   588       __ ldr (prev, Address(dest));
       
   589       __ add(newval, addval, prev);
       
   590       __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/);
       
   591       __ b(retry, ne);
       
   592       __ mov (R0, newval);
       
   593     }
       
   594     __ bx(LR);
       
   595 
       
   596     return start;
       
   597   }
       
   598 
       
   599   // Support for jint Atomic::xchg(jint exchange_value, volatile jint *dest)
       
   600   //
       
   601   // Arguments :
       
   602   //
       
   603   //      exchange_value: R0
       
   604   //      dest:           R1
       
   605   //
       
   606   // Results:
       
   607   //
       
   608   //     R0: the value previously stored in dest
       
   609   //
       
   610   // Overwrites:
       
   611   //
       
   612   //     R1, R2, R3
       
   613   //
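          // Minimal sketch of the intended semantics (illustrative C, not part of the
          // VM API; store_exclusive stands for the strex retry check):
          //
          //   jint atomic_xchg(jint exchange_value, volatile jint* dest) {
          //     jint prev;
          //     do {
          //       prev = *dest;                                    // ldrex
          //     } while (!store_exclusive(dest, exchange_value));  // strex retry
          //     return prev;                                       // R0 = previous value
          //   }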
       
   614   address generate_atomic_xchg() {
       
   615     address start;
       
   616 
       
   617     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
       
   618     start = __ pc();
       
   619     Register newval    = R0;
       
   620     Register dest      = R1;
       
   621     Register prev      = R2;
       
   622 
       
   623     Label retry;
       
   624 
       
   625     if (VM_Version::supports_ldrex()) {
       
   626       Register ok=R3;
       
   627       __ membar(MEMBAR_ATOMIC_OP_PRE, prev);
       
   628       __ bind(retry);
       
   629       __ ldrex(prev, Address(dest));
       
   630       __ strex(ok, newval, Address(dest));
       
   631       __ cmp(ok, 0);
       
   632       __ b(retry, ne);
       
   633       __ mov (R0, prev);
       
   634       __ membar(MEMBAR_ATOMIC_OP_POST, prev);
       
   635     } else {
       
   636       __ bind(retry);
       
   637       __ ldr (prev, Address(dest));
       
   638       __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/);
       
   639       __ b(retry, ne);
       
   640       __ mov (R0, prev);
       
   641     }
       
   642     __ bx(LR);
       
   643 
       
   644     return start;
       
   645   }
       
   646 
       
   647   // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint *dest, jint compare_value)
       
   648   //
       
   649   // Arguments :
       
   650   //
       
   651   //      compare_value:  R0
       
   652   //      exchange_value: R1
       
   653   //      dest:           R2
       
   654   //
       
   655   // Results:
       
   656   //
       
   657   //     R0: the value previously stored in dest
       
   658   //
       
   659   // Overwrites:
       
   660   //
       
   661   //     R0, R1, R2, R3, Rtemp
       
   662   //
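          // Minimal sketch of the intended semantics (illustrative C, not part of the
          // VM API; the atomicity actually comes from the atomic_cas macro used below):
          //
          //   jint atomic_cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value) {
          //     jint prev = *dest;                 // observed atomically with the conditional store
          //     if (prev == compare_value) {
          //       *dest = exchange_value;
          //     }
          //     return prev;                       // R0 = previous value
          //   }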
       
   663   address generate_atomic_cmpxchg() {
       
   664     address start;
       
   665 
       
   666     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
       
   667     start = __ pc();
       
   668     Register cmp       = R0;
       
   669     Register newval    = R1;
       
   670     Register dest      = R2;
       
   671     Register temp1     = R3;
       
   672     Register temp2     = Rtemp; // Rtemp free (native ABI)
       
   673 
       
   674     __ membar(MEMBAR_ATOMIC_OP_PRE, temp1);
       
   675 
       
   676     // atomic_cas returns previous value in R0
       
   677     __ atomic_cas(temp1, temp2, cmp, newval, dest, 0);
       
   678 
       
   679     __ membar(MEMBAR_ATOMIC_OP_POST, temp1);
       
   680 
       
   681     __ bx(LR);
       
   682 
       
   683     return start;
       
   684   }
       
   685 
       
   686   // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
       
    687   // whose arguments are reordered by a wrapper into (jlong compare_value, jlong exchange_value, volatile jlong *dest)
       
   688   //
       
   689   // Arguments :
       
   690   //
       
   691   //      compare_value:  R1 (High), R0 (Low)
       
   692   //      exchange_value: R3 (High), R2 (Low)
       
   693   //      dest:           SP+0
       
   694   //
       
   695   // Results:
       
   696   //
       
   697   //     R0:R1: the value previously stored in dest
       
   698   //
       
   699   // Overwrites:
       
   700   //
       
   701   address generate_atomic_cmpxchg_long() {
       
   702     address start;
       
   703 
       
   704     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
       
   705     start = __ pc();
       
   706     Register cmp_lo      = R0;
       
   707     Register cmp_hi      = R1;
       
   708     Register newval_lo   = R2;
       
   709     Register newval_hi   = R3;
       
   710     Register addr        = Rtemp;  /* After load from stack */
       
   711     Register temp_lo     = R4;
       
   712     Register temp_hi     = R5;
       
   713     Register temp_result = R8;
       
   714     assert_different_registers(cmp_lo, newval_lo, temp_lo, addr, temp_result, R7);
       
   715     assert_different_registers(cmp_hi, newval_hi, temp_hi, addr, temp_result, R7);
       
   716 
       
   717     __ membar(MEMBAR_ATOMIC_OP_PRE, Rtemp); // Rtemp free (native ABI)
       
   718 
       
    719     // The stack is unaligned here; maintain double word alignment by pushing
        
    720     // an odd number of registers.
       
   721     __ push(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi));
       
   722     __ ldr(addr, Address(SP, 12));
       
   723 
       
   724     // atomic_cas64 returns previous value in temp_lo, temp_hi
       
   725     __ atomic_cas64(temp_lo, temp_hi, temp_result, cmp_lo, cmp_hi,
       
   726                     newval_lo, newval_hi, addr, 0);
       
   727     __ mov(R0, temp_lo);
       
   728     __ mov(R1, temp_hi);
       
   729 
       
   730     __ pop(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi));
       
   731 
       
   732     __ membar(MEMBAR_ATOMIC_OP_POST, Rtemp); // Rtemp free (native ABI)
       
   733     __ bx(LR);
       
   734 
       
   735     return start;
       
   736   }
       
   737 
       
   738   address generate_atomic_load_long() {
       
   739     address start;
       
   740 
       
   741     StubCodeMark mark(this, "StubRoutines", "atomic_load_long");
       
   742     start = __ pc();
       
   743     Register result_lo = R0;
       
   744     Register result_hi = R1;
       
   745     Register src       = R0;
       
   746 
       
   747     if (!os::is_MP()) {
       
   748       __ ldmia(src, RegisterSet(result_lo, result_hi));
       
   749       __ bx(LR);
       
   750     } else if (VM_Version::supports_ldrexd()) {
       
   751       __ ldrexd(result_lo, Address(src));
       
   752       __ clrex(); // FIXME: safe to remove?
       
   753       __ bx(LR);
       
   754     } else {
       
   755       __ stop("Atomic load(jlong) unsupported on this platform");
       
   756       __ bx(LR);
       
   757     }
       
   758 
       
   759     return start;
       
   760   }
       
   761 
       
   762   address generate_atomic_store_long() {
       
   763     address start;
       
   764 
       
   765     StubCodeMark mark(this, "StubRoutines", "atomic_store_long");
       
   766     start = __ pc();
       
   767     Register newval_lo = R0;
       
   768     Register newval_hi = R1;
       
   769     Register dest      = R2;
       
   770     Register scratch_lo    = R2;
       
   771     Register scratch_hi    = R3;  /* After load from stack */
       
   772     Register result    = R3;
       
   773 
       
   774     if (!os::is_MP()) {
       
   775       __ stmia(dest, RegisterSet(newval_lo, newval_hi));
       
   776       __ bx(LR);
       
   777     } else if (VM_Version::supports_ldrexd()) {
       
   778       __ mov(Rtemp, dest);  // get dest to Rtemp
       
   779       Label retry;
       
   780       __ bind(retry);
       
   781       __ ldrexd(scratch_lo, Address(Rtemp));
       
   782       __ strexd(result, R0, Address(Rtemp));
       
   783       __ rsbs(result, result, 1);
       
   784       __ b(retry, eq);
       
   785       __ bx(LR);
       
   786     } else {
       
   787       __ stop("Atomic store(jlong) unsupported on this platform");
       
   788       __ bx(LR);
       
   789     }
       
   790 
       
   791     return start;
       
   792   }
       
   793 
       
   794 
       
   795 #endif // AARCH64
       
   796 
       
   797 #ifdef COMPILER2
       
   798   // Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub, Klass super );
       
   799   // Arguments :
       
   800   //
       
   801   //      ret  : R0, returned
       
    802   //      flags: condition flags set according to R0 (eq on success, i.e. R0 == 0)
       
   803   //      sub  : R1, argument, not changed
       
   804   //      super: R2, argument, not changed
       
   805   //      raddr: LR, blown by call
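          // Sketch of the slow path generated below (illustrative pseudo-C over the
          // Klass fields it reads; the secondary supers array is scanned linearly):
          //
          //   for (each Klass* s in sub_klass->secondary_supers()) {
          //     if (s == super_klass) {
          //       sub_klass->secondary_super_cache() = super_klass;
          //       return 0;                        // subtype: R0 == 0, flags eq
          //     }
          //   }
          //   return 1;                            // not a subtype: R0 != 0, flags ne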
       
   806   address generate_partial_subtype_check() {
       
   807     __ align(CodeEntryAlignment);
       
   808     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
       
   809     address start = __ pc();
       
   810 
       
   811     // based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops)
       
   812 
       
   813     // R0 used as tmp_reg (in addition to return reg)
       
   814     Register sub_klass = R1;
       
   815     Register super_klass = R2;
       
   816     Register tmp_reg2 = R3;
       
   817     Register tmp_reg3 = R4;
       
   818 #define saved_set tmp_reg2, tmp_reg3
       
   819 
       
   820     Label L_loop, L_fail;
       
   821 
       
   822     int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
       
   823 
       
   824     // fast check should be redundant
       
   825 
       
   826     // slow check
       
   827     {
       
   828       __ raw_push(saved_set);
       
   829 
       
   830       // a couple of useful fields in sub_klass:
       
   831       int ss_offset = in_bytes(Klass::secondary_supers_offset());
       
   832 
       
   833       // Do a linear scan of the secondary super-klass chain.
       
   834       // This code is rarely used, so simplicity is a virtue here.
       
   835 
       
   836       inc_counter_np(SharedRuntime::_partial_subtype_ctr, tmp_reg2, tmp_reg3);
       
   837 
       
   838       Register scan_temp = tmp_reg2;
       
   839       Register count_temp = tmp_reg3;
       
   840 
       
   841       // We will consult the secondary-super array.
       
   842       __ ldr(scan_temp, Address(sub_klass, ss_offset));
       
   843 
       
   844       Register search_key = super_klass;
       
   845 
       
   846       // Load the array length.
       
   847       __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
       
   848       __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes());
       
   849 
       
   850       __ add(count_temp, count_temp, 1);
       
   851 
       
   852       // Top of search loop
       
   853       __ bind(L_loop);
       
   854       // Notes:
       
   855       //  scan_temp starts at the array elements
       
   856       //  count_temp is 1+size
       
   857       __ subs(count_temp, count_temp, 1);
       
   858       __ b(L_fail, eq); // not found in the array
       
   859 
       
   860       // Load next super to check
       
   861       // In the array of super classes elements are pointer sized.
       
   862       int element_size = wordSize;
       
   863       __ ldr(R0, Address(scan_temp, element_size, post_indexed));
       
   864 
       
   865       // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
       
   866       __ subs(R0, R0, search_key); // set R0 to 0 on success (and flags to eq)
       
   867 
       
   868       // A miss means we are NOT a subtype and need to keep looping
       
   869       __ b(L_loop, ne);
       
   870 
       
   871       // Falling out the bottom means we found a hit; we ARE a subtype
       
   872 
       
   873       // Success.  Cache the super we found and proceed in triumph.
       
   874       __ str(super_klass, Address(sub_klass, sc_offset));
       
   875 
       
   876       // Return success
       
   877       // R0 is already 0 and flags are already set to eq
       
   878       __ raw_pop(saved_set);
       
   879       __ ret();
       
   880 
       
   881       // Return failure
       
   882       __ bind(L_fail);
       
   883 #ifdef AARCH64
       
   884       // count_temp is 0, can't use ZR here
       
   885       __ adds(R0, count_temp, 1); // sets the flags
       
   886 #else
       
   887       __ movs(R0, 1); // sets the flags
       
   888 #endif
       
   889       __ raw_pop(saved_set);
       
   890       __ ret();
       
   891     }
       
   892     return start;
       
   893   }
       
   894 #undef saved_set
       
   895 #endif // COMPILER2
       
   896 
       
   897 
       
   898   //----------------------------------------------------------------------------------------------------
       
   899   // Non-destructive plausibility checks for oops
       
   900 
       
   901   address generate_verify_oop() {
       
   902     StubCodeMark mark(this, "StubRoutines", "verify_oop");
       
   903     address start = __ pc();
       
   904 
       
   905     // Incoming arguments:
       
   906     //
       
   907     // R0: error message (char* )
       
   908     // R1: address of register save area
       
   909     // R2: oop to verify
       
   910     //
       
   911     // All registers are saved before calling this stub. However, condition flags should be saved here.
       
   912 
       
   913     const Register oop   = R2;
       
   914     const Register klass = R3;
       
   915     const Register tmp1  = R6;
       
   916     const Register tmp2  = R8;
       
   917 
       
   918     const Register flags     = Rtmp_save0; // R4/R19
       
   919     const Register ret_addr  = Rtmp_save1; // R5/R20
       
   920     assert_different_registers(oop, klass, tmp1, tmp2, flags, ret_addr, R7);
       
   921 
       
   922     Label exit, error;
       
   923     InlinedAddress verify_oop_count((address) StubRoutines::verify_oop_count_addr());
       
   924 
       
   925 #ifdef AARCH64
       
   926     __ mrs(flags, Assembler::SysReg_NZCV);
       
   927 #else
       
   928     __ mrs(Assembler::CPSR, flags);
       
   929 #endif // AARCH64
       
   930 
       
   931     __ ldr_literal(tmp1, verify_oop_count);
       
   932     __ ldr_s32(tmp2, Address(tmp1));
       
   933     __ add(tmp2, tmp2, 1);
       
   934     __ str_32(tmp2, Address(tmp1));
       
   935 
       
   936     // make sure object is 'reasonable'
       
   937     __ cbz(oop, exit);                           // if obj is NULL it is ok
       
   938 
       
   939     // Check if the oop is in the right area of memory
       
   940     // Note: oop_mask and oop_bits must be updated if the code is saved/reused
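            // The next few instructions implement, roughly:
            //   if ((oop & Universe::verify_oop_mask()) != Universe::verify_oop_bits()) goto error;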
       
   941     const address oop_mask = (address) Universe::verify_oop_mask();
       
   942     const address oop_bits = (address) Universe::verify_oop_bits();
       
   943     __ mov_address(tmp1, oop_mask, symbolic_Relocation::oop_mask_reference);
       
   944     __ andr(tmp2, oop, tmp1);
       
   945     __ mov_address(tmp1, oop_bits, symbolic_Relocation::oop_bits_reference);
       
   946     __ cmp(tmp2, tmp1);
       
   947     __ b(error, ne);
       
   948 
       
   949     // make sure klass is 'reasonable'
       
   950     __ load_klass(klass, oop);                   // get klass
       
   951     __ cbz(klass, error);                        // if klass is NULL it is broken
       
   952 
       
   953     // return if everything seems ok
       
   954     __ bind(exit);
       
   955 
       
   956 #ifdef AARCH64
       
   957     __ msr(Assembler::SysReg_NZCV, flags);
       
   958 #else
       
   959     __ msr(Assembler::CPSR_f, flags);
       
   960 #endif // AARCH64
       
   961 
       
   962     __ ret();
       
   963 
       
   964     // handle errors
       
   965     __ bind(error);
       
   966 
       
   967     __ mov(ret_addr, LR);                      // save return address
       
   968 
       
   969     // R0: error message
       
   970     // R1: register save area
       
   971     __ call(CAST_FROM_FN_PTR(address, MacroAssembler::debug));
       
   972 
       
   973     __ mov(LR, ret_addr);
       
   974     __ b(exit);
       
   975 
       
   976     __ bind_literal(verify_oop_count);
       
   977 
       
   978     return start;
       
   979   }
       
   980 
       
   981   //----------------------------------------------------------------------------------------------------
       
   982   // Array copy stubs
       
   983 
       
   984   //
       
   985   //  Generate overlap test for array copy stubs
       
   986   //
       
   987   //  Input:
       
   988   //    R0    -  array1
       
   989   //    R1    -  array2
       
   990   //    R2    -  element count, 32-bit int
       
   991   //
       
   992   //  input registers are preserved
       
   993   //
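          //  Branches to the no-overlap target when a forward copy is safe; roughly
          //  (sketch only, mirroring the unsigned comparisons emitted below):
          //
          //    if (to < from || (size_t)(to - from) >= ((size_t)count << log2_elem_size))
          //      goto no_overlap;   // forward copy cannot clobber not-yet-read source data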
       
   994   void array_overlap_test(address no_overlap_target, int log2_elem_size, Register tmp1, Register tmp2) {
       
   995     assert(no_overlap_target != NULL, "must be generated");
       
   996     array_overlap_test(no_overlap_target, NULL, log2_elem_size, tmp1, tmp2);
       
   997   }
       
   998   void array_overlap_test(Label& L_no_overlap, int log2_elem_size, Register tmp1, Register tmp2) {
       
   999     array_overlap_test(NULL, &L_no_overlap, log2_elem_size, tmp1, tmp2);
       
  1000   }
       
  1001   void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size, Register tmp1, Register tmp2) {
       
  1002     const Register from       = R0;
       
  1003     const Register to         = R1;
       
  1004     const Register count      = R2;
       
  1005     const Register to_from    = tmp1; // to - from
       
  1006 #ifndef AARCH64
       
  1007     const Register byte_count = (log2_elem_size == 0) ? count : tmp2; // count << log2_elem_size
       
  1008 #endif // AARCH64
       
  1009     assert_different_registers(from, to, count, tmp1, tmp2);
       
  1010 
       
   1011     // The no_overlap version works if 'to' is lower (unsigned) than 'from',
        
   1012     // or if 'to' is at least (count*size) above 'from'.
       
  1013 
       
  1014     BLOCK_COMMENT("Array Overlap Test:");
       
  1015     __ subs(to_from, to, from);
       
  1016 #ifndef AARCH64
       
  1017     if (log2_elem_size != 0) {
       
  1018       __ mov(byte_count, AsmOperand(count, lsl, log2_elem_size));
       
  1019     }
       
  1020 #endif // !AARCH64
       
  1021     if (NOLp == NULL)
       
  1022       __ b(no_overlap_target,lo);
       
  1023     else
       
  1024       __ b((*NOLp), lo);
       
  1025 #ifdef AARCH64
       
  1026     __ subs(ZR, to_from, count, ex_sxtw, log2_elem_size);
       
  1027 #else
       
  1028     __ cmp(to_from, byte_count);
       
  1029 #endif // AARCH64
       
  1030     if (NOLp == NULL)
       
  1031       __ b(no_overlap_target, ge);
       
  1032     else
       
  1033       __ b((*NOLp), ge);
       
  1034   }
       
  1035 
       
  1036 #ifdef AARCH64
       
  1037   // TODO-AARCH64: revise usages of bulk_* methods (probably ldp`s and stp`s should interlace)
       
  1038 
       
  1039   // Loads [from, from + count*wordSize) into regs[0], regs[1], ..., regs[count-1]
       
  1040   // and increases 'from' by count*wordSize.
       
  1041   void bulk_load_forward(Register from, const Register regs[], int count) {
       
  1042     assert (count > 0 && count % 2 == 0, "count must be positive even number");
       
  1043     int bytes = count * wordSize;
       
  1044 
       
  1045     int offset = 0;
       
  1046     __ ldp(regs[0], regs[1], Address(from, bytes, post_indexed));
       
  1047     offset += 2*wordSize;
       
  1048 
       
  1049     for (int i = 2; i < count; i += 2) {
       
  1050       __ ldp(regs[i], regs[i+1], Address(from, -bytes + offset));
       
  1051       offset += 2*wordSize;
       
  1052     }
       
  1053 
       
  1054     assert (offset == bytes, "must be");
       
  1055   }
       
  1056 
       
  1057   // Stores regs[0], regs[1], ..., regs[count-1] to [to, to + count*wordSize)
       
  1058   // and increases 'to' by count*wordSize.
       
  1059   void bulk_store_forward(Register to, const Register regs[], int count) {
       
  1060     assert (count > 0 && count % 2 == 0, "count must be positive even number");
       
  1061     int bytes = count * wordSize;
       
  1062 
       
  1063     int offset = 0;
       
  1064     __ stp(regs[0], regs[1], Address(to, bytes, post_indexed));
       
  1065     offset += 2*wordSize;
       
  1066 
       
  1067     for (int i = 2; i < count; i += 2) {
       
  1068       __ stp(regs[i], regs[i+1], Address(to, -bytes + offset));
       
  1069       offset += 2*wordSize;
       
  1070     }
       
  1071 
       
  1072     assert (offset == bytes, "must be");
       
  1073   }
       
  1074 
       
  1075   // Loads [from - count*wordSize, from) into regs[0], regs[1], ..., regs[count-1]
       
  1076   // and decreases 'from' by count*wordSize.
       
  1077   // Note that the word with lowest address goes to regs[0].
       
  1078   void bulk_load_backward(Register from, const Register regs[], int count) {
       
  1079     assert (count > 0 && count % 2 == 0, "count must be positive even number");
       
  1080     int bytes = count * wordSize;
       
  1081 
       
  1082     int offset = 0;
       
  1083 
       
  1084     for (int i = count - 2; i > 0; i -= 2) {
       
  1085       offset += 2*wordSize;
       
  1086       __ ldp(regs[i], regs[i+1], Address(from, -offset));
       
  1087     }
       
  1088 
       
  1089     offset += 2*wordSize;
       
  1090     __ ldp(regs[0], regs[1], Address(from, -bytes, pre_indexed));
       
  1091 
       
  1092     assert (offset == bytes, "must be");
       
  1093   }
       
  1094 
       
  1095   // Stores regs[0], regs[1], ..., regs[count-1] into [to - count*wordSize, to)
       
  1096   // and decreases 'to' by count*wordSize.
       
  1097   // Note that regs[0] value goes into the memory with lowest address.
       
  1098   void bulk_store_backward(Register to, const Register regs[], int count) {
       
  1099     assert (count > 0 && count % 2 == 0, "count must be positive even number");
       
  1100     int bytes = count * wordSize;
       
  1101 
       
  1102     int offset = 0;
       
  1103 
       
  1104     for (int i = count - 2; i > 0; i -= 2) {
       
  1105       offset += 2*wordSize;
       
  1106       __ stp(regs[i], regs[i+1], Address(to, -offset));
       
  1107     }
       
  1108 
       
  1109     offset += 2*wordSize;
       
  1110     __ stp(regs[0], regs[1], Address(to, -bytes, pre_indexed));
       
  1111 
       
  1112     assert (offset == bytes, "must be");
       
  1113   }
       
  1114 #endif // AARCH64
       
  1115 
       
  1116   // TODO-AARCH64: rearrange in-loop prefetches:
       
  1117   //   probably we should choose between "prefetch-store before or after store", not "before or after load".
       
  1118   void prefetch(Register from, Register to, int offset, int to_delta = 0) {
       
  1119     __ prefetch_read(Address(from, offset));
       
  1120 #ifdef AARCH64
       
  1121   // Next line commented out to avoid significant loss of performance in memory copy - JDK-8078120
       
  1122   // __ prfm(pstl1keep, Address(to, offset + to_delta));
       
  1123 #endif // AARCH64
       
  1124   }
       
  1125 
       
  1126   // Generate the inner loop for forward aligned array copy
       
  1127   //
       
  1128   // Arguments
       
  1129   //      from:      src address, 64 bits  aligned
       
  1130   //      to:        dst address, wordSize aligned
       
  1131   //      count:     number of elements (32-bit int)
       
  1132   //      bytes_per_count: number of bytes for each unit of 'count'
       
  1133   //
       
  1134   // Return the minimum initial value for count
       
  1135   //
       
  1136   // Notes:
       
  1137   // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
       
  1138   // - 'to' aligned on wordSize
       
   1139   // - 'count' must be greater than or equal to the returned value
       
  1140   //
       
  1141   // Increases 'from' and 'to' by count*bytes_per_count.
       
  1142   //
       
  1143   // Scratches 'count', R3.
       
  1144   // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored).
       
  1145   //
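          // Sketch of the main copy loop generated below (pseudo-C, ignoring the
          // prefetch tuning and the tail handling that follows the loop):
          //
          //   count -= count_per_loop;             // pre-decrement, test at the bottom
          //   do {
          //     <load  8 words from 'from'>;       // LDM (32-bit ARM) / LDP pairs (AArch64)
          //     count -= count_per_loop;
          //     <store 8 words to 'to'>;           // STM / STP pairs
          //   } while (count >= 0);
          //   // then copy the remaining 0..bytes_per_loop-1 bytes in halving chunks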
       
  1146   int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count) {
       
  1147     assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
       
  1148 
       
  1149     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
       
  1150     arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned;
       
  1151     int pld_offset = config->pld_distance;
       
  1152     const int count_per_loop = bytes_per_loop / bytes_per_count;
       
  1153 
       
  1154 #ifndef AARCH64
       
  1155     bool split_read= config->split_ldm;
       
  1156     bool split_write= config->split_stm;
       
  1157 
       
  1158     // XXX optim: use VLDM/VSTM when available (Neon) with PLD
       
  1159     //  NEONCopyPLD
       
  1160     //      PLD [r1, #0xC0]
       
  1161     //      VLDM r1!,{d0-d7}
       
  1162     //      VSTM r0!,{d0-d7}
       
  1163     //      SUBS r2,r2,#0x40
       
  1164     //      BGE NEONCopyPLD
       
  1165 
       
  1166     __ push(RegisterSet(R4,R10));
       
  1167 #endif // !AARCH64
       
  1168 
       
  1169     const bool prefetch_before = pld_offset < 0;
       
  1170     const bool prefetch_after = pld_offset > 0;
       
  1171 
       
  1172     Label L_skip_pld;
       
  1173 
       
  1174     // predecrease to exit when there is less than count_per_loop
       
  1175     __ sub_32(count, count, count_per_loop);
       
  1176 
       
  1177     if (pld_offset != 0) {
       
  1178       pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
       
  1179 
       
  1180       prefetch(from, to, 0);
       
  1181 
       
  1182       if (prefetch_before) {
       
  1183         // If prefetch is done ahead, final PLDs that overflow the
       
  1184         // copied area can be easily avoided. 'count' is predecreased
       
  1185         // by the prefetch distance to optimize the inner loop and the
       
  1186         // outer loop skips the PLD.
       
  1187         __ subs_32(count, count, (bytes_per_loop+pld_offset)/bytes_per_count);
       
  1188 
       
  1189         // skip prefetch for small copies
       
  1190         __ b(L_skip_pld, lt);
       
  1191       }
       
  1192 
       
  1193       int offset = ArmCopyCacheLineSize;
       
  1194       while (offset <= pld_offset) {
       
  1195         prefetch(from, to, offset);
       
  1196         offset += ArmCopyCacheLineSize;
       
  1197       };
       
  1198     }
       
  1199 
       
  1200 #ifdef AARCH64
       
  1201     const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10};
       
  1202 #endif // AARCH64
       
  1203     {
       
  1204       // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes
       
  1205 
       
  1206       // 32-bit ARM note: we have tried implementing loop unrolling to skip one
       
  1207       // PLD with 64 bytes cache line but the gain was not significant.
       
  1208 
       
  1209       Label L_copy_loop;
       
  1210       __ align(OptoLoopAlignment);
       
  1211       __ BIND(L_copy_loop);
       
  1212 
       
  1213       if (prefetch_before) {
       
  1214         prefetch(from, to, bytes_per_loop + pld_offset);
       
  1215         __ BIND(L_skip_pld);
       
  1216       }
       
  1217 
       
  1218 #ifdef AARCH64
       
  1219       bulk_load_forward(from, data_regs, 8);
       
  1220 #else
       
  1221       if (split_read) {
       
  1222         // Split the register set in two sets so that there is less
       
  1223         // latency between LDM and STM (R3-R6 available while R7-R10
       
  1224         // still loading) and less register locking issue when iterating
       
  1225         // on the first LDM.
       
  1226         __ ldmia(from, RegisterSet(R3, R6), writeback);
       
  1227         __ ldmia(from, RegisterSet(R7, R10), writeback);
       
  1228       } else {
       
  1229         __ ldmia(from, RegisterSet(R3, R10), writeback);
       
  1230       }
       
  1231 #endif // AARCH64
       
  1232 
       
  1233       __ subs_32(count, count, count_per_loop);
       
  1234 
       
  1235       if (prefetch_after) {
       
  1236         prefetch(from, to, pld_offset, bytes_per_loop);
       
  1237       }
       
  1238 
       
  1239 #ifdef AARCH64
       
  1240       bulk_store_forward(to, data_regs, 8);
       
  1241 #else
       
  1242       if (split_write) {
       
  1243         __ stmia(to, RegisterSet(R3, R6), writeback);
       
  1244         __ stmia(to, RegisterSet(R7, R10), writeback);
       
  1245       } else {
       
  1246         __ stmia(to, RegisterSet(R3, R10), writeback);
       
  1247       }
       
  1248 #endif // AARCH64
       
  1249 
       
  1250       __ b(L_copy_loop, ge);
       
  1251 
       
  1252       if (prefetch_before) {
       
  1253         // the inner loop may end earlier, allowing us to skip the PLD for the last iterations
       
  1254         __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
       
  1255         __ b(L_skip_pld, ge);
       
  1256       }
       
  1257     }
       
  1258     BLOCK_COMMENT("Remaining bytes:");
       
  1259     // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
       
  1260 
       
  1261     // __ add(count, count, ...); // addition useless for the bit tests
       
  1262     assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");
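
           // Worked example (illustrative): for a 32-bit ARM byte copy with 27 bytes
           // left after the main loop, bits 16, 8, 2 and 1 of 'count' are set, so the
           // conditional 16-, 8-, 2- and 1-byte copies below execute while the 4-byte
           // one is skipped; only these low bits of 'count' matter here.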
       
  1263 
       
  1264 #ifdef AARCH64
       
  1265     assert (bytes_per_loop == 64, "adjust the code below");
       
  1266     assert (bytes_per_count <= 8, "adjust the code below");
       
  1267 
       
  1268     {
       
  1269       Label L;
       
  1270       __ tbz(count, exact_log2(32/bytes_per_count), L);
       
  1271 
       
  1272       bulk_load_forward(from, data_regs, 4);
       
  1273       bulk_store_forward(to, data_regs, 4);
       
  1274 
       
  1275       __ bind(L);
       
  1276     }
       
  1277 
       
  1278     {
       
  1279       Label L;
       
  1280       __ tbz(count, exact_log2(16/bytes_per_count), L);
       
  1281 
       
  1282       bulk_load_forward(from, data_regs, 2);
       
  1283       bulk_store_forward(to, data_regs, 2);
       
  1284 
       
  1285       __ bind(L);
       
  1286     }
       
  1287 
       
  1288     {
       
  1289       Label L;
       
  1290       __ tbz(count, exact_log2(8/bytes_per_count), L);
       
  1291 
       
  1292       __ ldr(R3, Address(from, 8, post_indexed));
       
  1293       __ str(R3, Address(to,   8, post_indexed));
       
  1294 
       
  1295       __ bind(L);
       
  1296     }
       
  1297 
       
  1298     if (bytes_per_count <= 4) {
       
  1299       Label L;
       
  1300       __ tbz(count, exact_log2(4/bytes_per_count), L);
       
  1301 
       
  1302       __ ldr_w(R3, Address(from, 4, post_indexed));
       
  1303       __ str_w(R3, Address(to,   4, post_indexed));
       
  1304 
       
  1305       __ bind(L);
       
  1306     }
       
  1307 
       
  1308     if (bytes_per_count <= 2) {
       
  1309       Label L;
       
  1310       __ tbz(count, exact_log2(2/bytes_per_count), L);
       
  1311 
       
  1312       __ ldrh(R3, Address(from, 2, post_indexed));
       
  1313       __ strh(R3, Address(to,   2, post_indexed));
       
  1314 
       
  1315       __ bind(L);
       
  1316     }
       
  1317 
       
  1318     if (bytes_per_count <= 1) {
       
  1319       Label L;
       
  1320       __ tbz(count, 0, L);
       
  1321 
       
  1322       __ ldrb(R3, Address(from, 1, post_indexed));
       
  1323       __ strb(R3, Address(to,   1, post_indexed));
       
  1324 
       
  1325       __ bind(L);
       
  1326     }
       
  1327 #else
       
  1328     __ tst(count, 16 / bytes_per_count);
       
  1329     __ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
       
  1330     __ stmia(to, RegisterSet(R3, R6), writeback, ne);
       
  1331 
       
  1332     __ tst(count, 8 / bytes_per_count);
       
  1333     __ ldmia(from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
       
  1334     __ stmia(to, RegisterSet(R3, R4), writeback, ne);
       
  1335 
       
  1336     if (bytes_per_count <= 4) {
       
  1337       __ tst(count, 4 / bytes_per_count);
       
  1338       __ ldr(R3, Address(from, 4, post_indexed), ne); // copy 4 bytes
       
  1339       __ str(R3, Address(to, 4, post_indexed), ne);
       
  1340     }
       
  1341 
       
  1342     if (bytes_per_count <= 2) {
       
  1343       __ tst(count, 2 / bytes_per_count);
       
  1344       __ ldrh(R3, Address(from, 2, post_indexed), ne); // copy 2 bytes
       
  1345       __ strh(R3, Address(to, 2, post_indexed), ne);
       
  1346     }
       
  1347 
       
  1348     if (bytes_per_count == 1) {
       
  1349       __ tst(count, 1);
       
  1350       __ ldrb(R3, Address(from, 1, post_indexed), ne);
       
  1351       __ strb(R3, Address(to, 1, post_indexed), ne);
       
  1352     }
       
  1353 
       
  1354     __ pop(RegisterSet(R4,R10));
       
  1355 #endif // AARCH64
       
  1356 
       
  1357     return count_per_loop;
       
  1358   }
       
  1359 
       
  1360 
       
  1361   // Generate the inner loop for backward aligned array copy
       
  1362   //
       
  1363   // Arguments
       
  1364   //      end_from:      src end address, 64 bits  aligned
       
  1365   //      end_to:        dst end address, wordSize aligned
       
  1366   //      count:         number of elements (32-bit int)
       
  1367   //      bytes_per_count: number of bytes for each unit of 'count'
       
  1368   //
       
  1369   // Return the minimum initial value for count
       
  1370   //
       
  1371   // Notes:
       
  1372   // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
       
  1373   // - 'end_to' aligned on wordSize
       
  1374   // - 'count' must be greater than or equal to the returned value
       
  1375   //
       
  1376   // Decreases 'end_from' and 'end_to' by count*bytes_per_count.
       
  1377   //
       
  1378   // Scratches 'count', R3.
       
  1379   // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored).
       
  1380   //
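         //
         // Illustrative C-level sketch (not the generated code; prefetch and tail
         // handling simplified) - the mirror image of the forward variant, with the
         // pointers pre-decremented (LDMDB/STMDB on 32-bit ARM) and 'copy_8_words'
         // a made-up helper for the 8-register load/store pair:
         //
         //   count -= count_per_loop;              // caller guarantees count >= count_per_loop
         //   do {
         //     end_from -= bytes_per_loop; end_to -= bytes_per_loop;
         //     copy_8_words(end_to, end_from);
         //     count -= count_per_loop;
         //   } while (count >= 0);
         //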
       
  1381   int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count) {
       
  1382     assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
       
  1383 
       
  1384     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
       
  1385     const int count_per_loop = bytes_per_loop / bytes_per_count;
       
  1386 
       
  1387     arraycopy_loop_config *config = &arraycopy_configurations[ArmCopyPlatform].backward_aligned;
       
  1388     int pld_offset = config->pld_distance;
       
  1389 
       
  1390 #ifndef AARCH64
       
  1391     bool split_read = config->split_ldm;

  1392     bool split_write = config->split_stm;
       
  1393 
       
  1394     // See the forward copy variant for additional comments.
       
  1395 
       
  1396     __ push(RegisterSet(R4,R10));
       
  1397 #endif // !AARCH64
       
  1398 
       
  1399     __ sub_32(count, count, count_per_loop);
       
  1400 
       
  1401     const bool prefetch_before = pld_offset < 0;
       
  1402     const bool prefetch_after = pld_offset > 0;
       
  1403 
       
  1404     Label L_skip_pld;
       
  1405 
       
  1406     if (pld_offset != 0) {
       
  1407       pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
       
  1408 
       
  1409       prefetch(end_from, end_to, -wordSize);
       
  1410 
       
  1411       if (prefetch_before) {
       
  1412         __ subs_32(count, count, (bytes_per_loop + pld_offset) / bytes_per_count);
       
  1413         __ b(L_skip_pld, lt);
       
  1414       }
       
  1415 
       
  1416       int offset = ArmCopyCacheLineSize;
       
  1417       while (offset <= pld_offset) {
       
  1418         prefetch(end_from, end_to, -(wordSize + offset));
       
  1419         offset += ArmCopyCacheLineSize;
       
  1420       };
       
  1421     }
       
  1422 
       
  1423 #ifdef AARCH64
       
  1424     const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10};
       
  1425 #endif // AARCH64
       
  1426     {
       
  1427       // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes
       
  1428 
       
  1429       // 32-bit ARM note: we have tried implementing loop unrolling to skip one
       
  1430       // PLD with a 64-byte cache line, but the gain was not significant.
       
  1431 
       
  1432       Label L_copy_loop;
       
  1433       __ align(OptoLoopAlignment);
       
  1434       __ BIND(L_copy_loop);
       
  1435 
       
  1436       if (prefetch_before) {
       
  1437         prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset));
       
  1438         __ BIND(L_skip_pld);
       
  1439       }
       
  1440 
       
  1441 #ifdef AARCH64
       
  1442       bulk_load_backward(end_from, data_regs, 8);
       
  1443 #else
       
  1444       if (split_read) {
       
  1445         __ ldmdb(end_from, RegisterSet(R7, R10), writeback);
       
  1446         __ ldmdb(end_from, RegisterSet(R3, R6), writeback);
       
  1447       } else {
       
  1448         __ ldmdb(end_from, RegisterSet(R3, R10), writeback);
       
  1449       }
       
  1450 #endif // AARCH64
       
  1451 
       
  1452       __ subs_32(count, count, count_per_loop);
       
  1453 
       
  1454       if (prefetch_after) {
       
  1455         prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
       
  1456       }
       
  1457 
       
  1458 #ifdef AARCH64
       
  1459       bulk_store_backward(end_to, data_regs, 8);
       
  1460 #else
       
  1461       if (split_write) {
       
  1462         __ stmdb(end_to, RegisterSet(R7, R10), writeback);
       
  1463         __ stmdb(end_to, RegisterSet(R3, R6), writeback);
       
  1464       } else {
       
  1465         __ stmdb(end_to, RegisterSet(R3, R10), writeback);
       
  1466       }
       
  1467 #endif // AARCH64
       
  1468 
       
  1469       __ b(L_copy_loop, ge);
       
  1470 
       
  1471       if (prefetch_before) {
       
  1472         __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
       
  1473         __ b(L_skip_pld, ge);
       
  1474       }
       
  1475     }
       
  1476     BLOCK_COMMENT("Remaining bytes:");
       
  1477     // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
       
  1478 
       
  1479     // __ add(count, count, ...); // addition useless for the bit tests
       
  1480     assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");
       
  1481 
       
  1482 #ifdef AARCH64
       
  1483     assert (bytes_per_loop == 64, "adjust the code below");
       
  1484     assert (bytes_per_count <= 8, "adjust the code below");
       
  1485 
       
  1486     {
       
  1487       Label L;
       
  1488       __ tbz(count, exact_log2(32/bytes_per_count), L);
       
  1489 
       
  1490       bulk_load_backward(end_from, data_regs, 4);
       
  1491       bulk_store_backward(end_to, data_regs, 4);
       
  1492 
       
  1493       __ bind(L);
       
  1494     }
       
  1495 
       
  1496     {
       
  1497       Label L;
       
  1498       __ tbz(count, exact_log2(16/bytes_per_count), L);
       
  1499 
       
  1500       bulk_load_backward(end_from, data_regs, 2);
       
  1501       bulk_store_backward(end_to, data_regs, 2);
       
  1502 
       
  1503       __ bind(L);
       
  1504     }
       
  1505 
       
  1506     {
       
  1507       Label L;
       
  1508       __ tbz(count, exact_log2(8/bytes_per_count), L);
       
  1509 
       
  1510       __ ldr(R3, Address(end_from, -8, pre_indexed));
       
  1511       __ str(R3, Address(end_to,   -8, pre_indexed));
       
  1512 
       
  1513       __ bind(L);
       
  1514     }
       
  1515 
       
  1516     if (bytes_per_count <= 4) {
       
  1517       Label L;
       
  1518       __ tbz(count, exact_log2(4/bytes_per_count), L);
       
  1519 
       
  1520       __ ldr_w(R3, Address(end_from, -4, pre_indexed));
       
  1521       __ str_w(R3, Address(end_to,   -4, pre_indexed));
       
  1522 
       
  1523       __ bind(L);
       
  1524     }
       
  1525 
       
  1526     if (bytes_per_count <= 2) {
       
  1527       Label L;
       
  1528       __ tbz(count, exact_log2(2/bytes_per_count), L);
       
  1529 
       
  1530       __ ldrh(R3, Address(end_from, -2, pre_indexed));
       
  1531       __ strh(R3, Address(end_to,   -2, pre_indexed));
       
  1532 
       
  1533       __ bind(L);
       
  1534     }
       
  1535 
       
  1536     if (bytes_per_count <= 1) {
       
  1537       Label L;
       
  1538       __ tbz(count, 0, L);
       
  1539 
       
  1540       __ ldrb(R3, Address(end_from, -1, pre_indexed));
       
  1541       __ strb(R3, Address(end_to,   -1, pre_indexed));
       
  1542 
       
  1543       __ bind(L);
       
  1544     }
       
  1545 #else
       
  1546     __ tst(count, 16 / bytes_per_count);
       
  1547     __ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
       
  1548     __ stmdb(end_to, RegisterSet(R3, R6), writeback, ne);
       
  1549 
       
  1550     __ tst(count, 8 / bytes_per_count);
       
  1551     __ ldmdb(end_from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
       
  1552     __ stmdb(end_to, RegisterSet(R3, R4), writeback, ne);
       
  1553 
       
  1554     if (bytes_per_count <= 4) {
       
  1555       __ tst(count, 4 / bytes_per_count);
       
  1556       __ ldr(R3, Address(end_from, -4, pre_indexed), ne); // copy 4 bytes
       
  1557       __ str(R3, Address(end_to, -4, pre_indexed), ne);
       
  1558     }
       
  1559 
       
  1560     if (bytes_per_count <= 2) {
       
  1561       __ tst(count, 2 / bytes_per_count);
       
  1562       __ ldrh(R3, Address(end_from, -2, pre_indexed), ne); // copy 2 bytes
       
  1563       __ strh(R3, Address(end_to, -2, pre_indexed), ne);
       
  1564     }
       
  1565 
       
  1566     if (bytes_per_count == 1) {
       
  1567       __ tst(count, 1);
       
  1568       __ ldrb(R3, Address(end_from, -1, pre_indexed), ne);
       
  1569       __ strb(R3, Address(end_to, -1, pre_indexed), ne);
       
  1570     }
       
  1571 
       
  1572     __ pop(RegisterSet(R4,R10));
       
  1573 #endif // AARCH64
       
  1574 
       
  1575     return count_per_loop;
       
  1576   }
       
  1577 
       
  1578 
       
  1579   // Generate the inner loop for shifted forward array copy (unaligned copy).
       
  1580   // It can be used when bytes_per_count < wordSize, i.e.
       
  1581   //  byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64.
       
  1582   //
       
  1583   // Arguments
       
  1584   //      from:      start src address, 64 bits aligned
       
  1585   //      to:        start dst address, (now) wordSize aligned
       
  1586   //      count:     number of elements (32-bit int)
       
  1587   //      bytes_per_count: number of bytes for each unit of 'count'
       
  1588   //      lsr_shift: shift applied to 'old' value to skip already written bytes
       
  1589   //      lsl_shift: shift applied to 'new' value to set the high bytes of the next write
       
  1590   //
       
  1591   // Return the minimum initial value for count
       
  1592   //
       
  1593   // Notes:
       
  1594   // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
       
  1595   // - 'to' aligned on wordSize
       
  1596   // - 'count' must be greater than or equal to the returned value
       
  1597   // - 'lsr_shift' + 'lsl_shift' = BitsPerWord
       
  1598   // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64
       
  1599   //
       
  1600   // Increases 'to' by count*bytes_per_count.
       
  1601   //
       
  1602   // Scratches 'from' and 'count', R3-R10, R12
       
  1603   //
       
  1604   // On entry:
       
  1605   // - R12 is preloaded with the first 'BitsPerWord' bits read just before 'from'
       
  1606   // - (R12 >> lsr_shift) is the part not yet written (just before 'to')
       
  1607   // --> (*to) = (R12 >> lsr_shift) | ((*from) << lsl_shift); ...
       
  1608   //
       
  1609   // This implementation may read more bytes than required.
       
  1610   // In fact, it always reads exactly the data of the copied region, with the upper bound aligned up to wordSize,

  1611   // so the excess read does not cross a word boundary and is thus harmless.
       
  1612   //
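         //
         // Minimal C sketch of the merge step, assuming a little-endian 32-bit word and
         // a source offset of 2 bytes within its word (so lsr_shift == 16 and
         // lsl_shift == 16); 'prev', 'src_word', 'dst_word' and 'words_left' are
         // made-up names, and the real loop processes 8 words per iteration:
         //
         //   uint32_t prev = *src_word++;          // preloaded in R12 before entry
         //   while (words_left) {
         //     uint32_t next = *src_word++;        // aligned load (LDM in the real code)
         //     *dst_word++ = (prev >> 16) | (next << 16);
         //     prev = next;
         //   }
         //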
       
  1613   int generate_forward_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) {
       
  1614     assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
       
  1615 
       
  1616     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter
       
  1617     const int count_per_loop = bytes_per_loop / bytes_per_count;
       
  1618 
       
  1619     arraycopy_loop_config *config = &arraycopy_configurations[ArmCopyPlatform].forward_shifted;
       
  1620     int pld_offset = config->pld_distance;
       
  1621 
       
  1622 #ifndef AARCH64
       
  1623     bool split_read = config->split_ldm;

  1624     bool split_write = config->split_stm;
       
  1625 #endif // !AARCH64
       
  1626 
       
  1627     const bool prefetch_before = pld_offset < 0;
       
  1628     const bool prefetch_after = pld_offset > 0;
       
  1629     Label L_skip_pld, L_last_read, L_done;
       
  1630     if (pld_offset != 0) {
       
  1631 
       
  1632       pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
       
  1633 
       
  1634       prefetch(from, to, 0);
       
  1635 
       
  1636       if (prefetch_before) {
       
  1637         __ cmp_32(count, count_per_loop);
       
  1638         __ b(L_last_read, lt);
       
  1639         // skip prefetch for small copies
       
  1640         // warning: count is predecreased by the prefetch distance to optimize the inner loop
       
  1641         __ subs_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop);
       
  1642         __ b(L_skip_pld, lt);
       
  1643       }
       
  1644 
       
  1645       int offset = ArmCopyCacheLineSize;
       
  1646       while (offset <= pld_offset) {
       
  1647         prefetch(from, to, offset);
       
  1648         offset += ArmCopyCacheLineSize;
       
  1649       };
       
  1650     }
       
  1651 
       
  1652     Label L_shifted_loop;
       
  1653 
       
  1654     __ align(OptoLoopAlignment);
       
  1655     __ BIND(L_shifted_loop);
       
  1656 
       
  1657     if (prefetch_before) {
       
  1658       // do it early if there might be register locking issues
       
  1659       prefetch(from, to, bytes_per_loop + pld_offset);
       
  1660       __ BIND(L_skip_pld);
       
  1661     } else {
       
  1662       __ cmp_32(count, count_per_loop);
       
  1663       __ b(L_last_read, lt);
       
  1664     }
       
  1665 
       
  1666 #ifdef AARCH64
       
  1667     const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12};
       
  1668     __ logical_shift_right(R3, R12, lsr_shift); // part of R12 not yet written
       
  1669     __ subs_32(count, count, count_per_loop);
       
  1670     bulk_load_forward(from, &data_regs[1], 8);
       
  1671 #else
       
  1672     // read 32 bytes
       
  1673     if (split_read) {
       
  1674       // if the write is not split, use fewer registers in the first set to reduce locking
       
  1675       RegisterSet set1 = split_write ? RegisterSet(R4, R7) : RegisterSet(R4, R5);
       
  1676       RegisterSet set2 = (split_write ? RegisterSet(R8, R10) : RegisterSet(R6, R10)) | R12;
       
  1677       __ ldmia(from, set1, writeback);
       
  1678       __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written
       
  1679       __ ldmia(from, set2, writeback);
       
  1680       __ subs(count, count, count_per_loop); // XXX: should it be before the 2nd LDM ? (latency vs locking)
       
  1681     } else {
       
  1682       __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written
       
  1683       __ ldmia(from, RegisterSet(R4, R10) | R12, writeback); // Note: small latency on R4
       
  1684       __ subs(count, count, count_per_loop);
       
  1685     }
       
  1686 #endif // AARCH64
       
  1687 
       
  1688     if (prefetch_after) {
       
  1689       // do it after the 1st ldm/ldp anyway  (no locking issues with early STM/STP)
       
  1690       prefetch(from, to, pld_offset, bytes_per_loop);
       
  1691     }
       
  1692 
       
  1693     // prepare (shift) the values in R3..R10
       
  1694     __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); // merged below low bytes of next val
       
  1695     __ logical_shift_right(R4, R4, lsr_shift); // unused part of next val
       
  1696     __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); // ...
       
  1697     __ logical_shift_right(R5, R5, lsr_shift);
       
  1698     __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift));
       
  1699     __ logical_shift_right(R6, R6, lsr_shift);
       
  1700     __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift));
       
  1701 #ifndef AARCH64
       
  1702     if (split_write) {
       
  1703       // write the first half as soon as possible to reduce stm locking
       
  1704       __ stmia(to, RegisterSet(R3, R6), writeback, prefetch_before ? gt : ge);
       
  1705     }
       
  1706 #endif // !AARCH64
       
  1707     __ logical_shift_right(R7, R7, lsr_shift);
       
  1708     __ orr(R7, R7, AsmOperand(R8, lsl, lsl_shift));
       
  1709     __ logical_shift_right(R8, R8, lsr_shift);
       
  1710     __ orr(R8, R8, AsmOperand(R9, lsl, lsl_shift));
       
  1711     __ logical_shift_right(R9, R9, lsr_shift);
       
  1712     __ orr(R9, R9, AsmOperand(R10, lsl, lsl_shift));
       
  1713     __ logical_shift_right(R10, R10, lsr_shift);
       
  1714     __ orr(R10, R10, AsmOperand(R12, lsl, lsl_shift));
       
  1715 
       
  1716 #ifdef AARCH64
       
  1717     bulk_store_forward(to, data_regs, 8);
       
  1718 #else
       
  1719     if (split_write) {
       
  1720       __ stmia(to, RegisterSet(R7, R10), writeback, prefetch_before ? gt : ge);
       
  1721     } else {
       
  1722       __ stmia(to, RegisterSet(R3, R10), writeback, prefetch_before ? gt : ge);
       
  1723     }
       
  1724 #endif // AARCH64
       
  1725     __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop)
       
  1726 
       
  1727     if (prefetch_before) {
       
  1728       // the first loop may end earlier, allowing us to skip the PLD at the end
       
  1729       __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
       
  1730 #ifndef AARCH64
       
  1731       __ stmia(to, RegisterSet(R3, R10), writeback); // stmia was skipped
       
  1732 #endif // !AARCH64
       
  1733       __ b(L_skip_pld, ge);
       
  1734       __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop);
       
  1735     }
       
  1736 
       
  1737     __ BIND(L_last_read);
       
  1738     __ b(L_done, eq);
       
  1739 
       
  1740 #ifdef AARCH64
       
  1741     assert(bytes_per_count < 8, "adjust the code below");
       
  1742 
       
  1743     __ logical_shift_right(R3, R12, lsr_shift);
       
  1744 
       
  1745     {
       
  1746       Label L;
       
  1747       __ tbz(count, exact_log2(32/bytes_per_count), L);
       
  1748       bulk_load_forward(from, &data_regs[1], 4);
       
  1749       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
       
  1750       __ logical_shift_right(R4, R4, lsr_shift);
       
  1751       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift));
       
  1752       __ logical_shift_right(R5, R5, lsr_shift);
       
  1753       __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift));
       
  1754       __ logical_shift_right(R6, R6, lsr_shift);
       
  1755       __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift));
       
  1756       bulk_store_forward(to, data_regs, 4);
       
  1757       __ logical_shift_right(R3, R7, lsr_shift);
       
  1758       __ bind(L);
       
  1759     }
       
  1760 
       
  1761     {
       
  1762       Label L;
       
  1763       __ tbz(count, exact_log2(16/bytes_per_count), L);
       
  1764       bulk_load_forward(from, &data_regs[1], 2);
       
  1765       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
       
  1766       __ logical_shift_right(R4, R4, lsr_shift);
       
  1767       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift));
       
  1768       bulk_store_forward(to, data_regs, 2);
       
  1769       __ logical_shift_right(R3, R5, lsr_shift);
       
  1770       __ bind(L);
       
  1771     }
       
  1772 
       
  1773     {
       
  1774       Label L;
       
  1775       __ tbz(count, exact_log2(8/bytes_per_count), L);
       
  1776       __ ldr(R4, Address(from, 8, post_indexed));
       
  1777       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
       
  1778       __ str(R3, Address(to, 8, post_indexed));
       
  1779       __ logical_shift_right(R3, R4, lsr_shift);
       
  1780       __ bind(L);
       
  1781     }
       
  1782 
       
  1783     const int have_bytes = lsl_shift/BitsPerByte; // number of already read bytes in R3
       
  1784 
       
  1785     // Fewer than wordSize bytes remain to be written.

  1786     // Do not check count if R3 already holds the maximal number of pending elements (one element less than a full word).
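           // Example (illustrative, AArch64 byte copy): lsr_shift == 56 and lsl_shift == 8
           // give have_bytes == 1, i.e. a single pending byte in R3; if more than one
           // element still remains, one extra doubleword is loaded and merged in here.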
       
  1787     if (have_bytes < wordSize - bytes_per_count) {
       
  1788       Label L;
       
  1789       __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact
       
  1790       __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store?
       
  1791       __ b(L, le);
       
  1792       __ ldr(R4, Address(from, 8, post_indexed));
       
  1793       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
       
  1794       __ bind(L);
       
  1795     }
       
  1796 
       
  1797     {
       
  1798       Label L;
       
  1799       __ tbz(count, exact_log2(4/bytes_per_count), L);
       
  1800       __ str_w(R3, Address(to, 4, post_indexed));
       
  1801       if (bytes_per_count < 4) {
       
  1802         __ logical_shift_right(R3, R3, 4*BitsPerByte);
       
  1803       }
       
  1804       __ bind(L);
       
  1805     }
       
  1806 
       
  1807     if (bytes_per_count <= 2) {
       
  1808       Label L;
       
  1809       __ tbz(count, exact_log2(2/bytes_per_count), L);
       
  1810       __ strh(R3, Address(to, 2, post_indexed));
       
  1811       if (bytes_per_count < 2) {
       
  1812         __ logical_shift_right(R3, R3, 2*BitsPerByte);
       
  1813       }
       
  1814       __ bind(L);
       
  1815     }
       
  1816 
       
  1817     if (bytes_per_count <= 1) {
       
  1818       Label L;
       
  1819       __ tbz(count, exact_log2(1/bytes_per_count), L);
       
  1820       __ strb(R3, Address(to, 1, post_indexed));
       
  1821       __ bind(L);
       
  1822     }
       
  1823 #else
       
  1824     switch (bytes_per_count) {
       
  1825     case 2:
       
  1826       __ mov(R3, AsmOperand(R12, lsr, lsr_shift));
       
  1827       __ tst(count, 8);
       
  1828       __ ldmia(from, RegisterSet(R4, R7), writeback, ne);
       
  1829       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
       
  1830       __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
       
  1831       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
       
  1832       __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne);
       
  1833       __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne);
       
  1834       __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne);
       
  1835       __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne);
       
  1836       __ stmia(to, RegisterSet(R3, R6), writeback, ne);
       
  1837       __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne);
       
  1838 
       
  1839       __ tst(count, 4);
       
  1840       __ ldmia(from, RegisterSet(R4, R5), writeback, ne);
       
  1841       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
       
  1842       __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
       
  1843       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
       
  1844       __ stmia(to, RegisterSet(R3, R4), writeback, ne);
       
  1845       __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne);
       
  1846 
       
  1847       __ tst(count, 2);
       
  1848       __ ldr(R4, Address(from, 4, post_indexed), ne);
       
  1849       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne);
       
  1850       __ str(R3, Address(to, 4, post_indexed), ne);
       
  1851       __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne);
       
  1852 
       
  1853       __ tst(count, 1);
       
  1854       __ strh(R3, Address(to, 2, post_indexed), ne); // one last short
       
  1855       break;
       
  1856 
       
  1857     case 1:
       
  1858       __ mov(R3, AsmOperand(R12, lsr, lsr_shift));
       
  1859       __ tst(count, 16);
       
  1860       __ ldmia(from, RegisterSet(R4, R7), writeback, ne);
       
  1861       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
       
  1862       __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
       
  1863       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
       
  1864       __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne);
       
  1865       __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne);
       
  1866       __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne);
       
  1867       __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne);
       
  1868       __ stmia(to, RegisterSet(R3, R6), writeback, ne);
       
  1869       __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne);
       
  1870 
       
  1871       __ tst(count, 8);
       
  1872       __ ldmia(from, RegisterSet(R4, R5), writeback, ne);
       
  1873       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
       
  1874       __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
       
  1875       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
       
  1876       __ stmia(to, RegisterSet(R3, R4), writeback, ne);
       
  1877       __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne);
       
  1878 
       
  1879       __ tst(count, 4);
       
  1880       __ ldr(R4, Address(from, 4, post_indexed), ne);
       
  1881       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne);
       
  1882       __ str(R3, Address(to, 4, post_indexed), ne);
       
  1883       __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne);
       
  1884 
       
  1885       __ andr(count, count, 3);
       
  1886       __ cmp(count, 2);
       
  1887 
       
  1888       // Note: R3 might contain enough bytes ready to write (3 needed at most),
       
  1889       // thus load on lsl_shift==24 is not needed (in fact forces reading
       
  1890       // beyond source buffer end boundary)
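              // Summarizing (illustrative):
              //   lsl_shift ==  8 -> 1 pending byte in R3:  load another word if >= 2 bytes remain (ge)
              //   lsl_shift == 16 -> 2 pending bytes in R3: load another word only if 3 bytes remain (gt)
              //   lsl_shift == 24 -> 3 pending bytes in R3: never load (at most 3 bytes remain)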
       
  1891       if (lsl_shift == 8) {
       
  1892         __ ldr(R4, Address(from, 4, post_indexed), ge);
       
  1893         __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ge);
       
  1894       } else if (lsl_shift == 16) {
       
  1895         __ ldr(R4, Address(from, 4, post_indexed), gt);
       
  1896         __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), gt);
       
  1897       }
       
  1898 
       
  1899       __ strh(R3, Address(to, 2, post_indexed), ge); // two last bytes
       
  1900       __ mov(R3, AsmOperand(R3, lsr, 16), gt);
       
  1901 
       
  1902       __ tst(count, 1);
       
  1903       __ strb(R3, Address(to, 1, post_indexed), ne); // one last byte
       
  1904       break;
       
  1905     }
       
  1906 #endif // AARCH64
       
  1907 
       
  1908     __ BIND(L_done);
       
  1909     return 0; // no minimum
       
  1910   }
       
  1911 
       
  1912   // Generate the inner loop for shifted backward array copy (unaligned copy).
       
  1913   // It can be used when bytes_per_count < wordSize, i.e.
       
  1914   //  byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64.
       
  1915   //
       
  1916   // Arguments
       
  1917   //      end_from:  end src address, 64 bits aligned
       
  1918   //      end_to:    end dst address, (now) wordSize aligned
       
  1919   //      count:     number of elements (32-bit int)
       
  1920   //      bytes_per_count: number of bytes for each unit of 'count'
       
  1921   //      lsl_shift: shift applied to 'old' value to skip already written bytes
       
  1922   //      lsr_shift: shift applied to 'new' value to set the low bytes of the next write
       
  1923   //
       
  1924   // Return the minimum initial value for count
       
  1925   //
       
  1926   // Notes:
       
  1927   // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
       
  1928   // - 'end_to' aligned on wordSize
       
  1929   // - 'count' must be greater than or equal to the returned value
       
  1930   // - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord'
       
  1931   // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64
       
  1932   //
       
  1933   // Decreases 'end_to' by count*bytes_per_count.
       
  1934   //
       
  1935   // Scratches 'end_from', 'count', R3-R10, R12
       
  1936   //
       
  1937   // On entry:
       
  1938   // - R3 is preloaded with the first 'BitsPerWord' bits read just after 'from'
       
  1939   // - (R3 << lsl_shift) is the part not yet written
       
  1940   // --> (*--to) = (R3 << lsl_shift) | ((*--from) >> lsr_shift); ...
       
  1941   //
       
  1942   // This implementation may read more bytes than required.
       
  1943   // In fact, it always reads exactly the data of the copied region, with the beginning aligned down to wordSize,

  1944   // so the excess read does not cross a word boundary and is thus harmless.
       
  1945   //
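         //
         // Minimal C sketch of the merge step (illustrative, little-endian 32-bit word;
         // 'prev', 'src_word', 'dst_word' and 'words_left' are made-up names), mirroring
         // the forward shifted variant but walking downwards:
         //
         //   uint32_t prev = *src_word;            // word straddling the source end, preloaded in R3
         //   while (words_left) {
         //     uint32_t next = *--src_word;
         //     *--dst_word = (prev << lsl_shift) | (next >> lsr_shift);
         //     prev = next;
         //   }
         //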
       
  1946   int generate_backward_shifted_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) {
       
  1947     assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
       
  1948 
       
  1949     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter
       
  1950     const int count_per_loop = bytes_per_loop / bytes_per_count;
       
  1951 
       
  1952     arraycopy_loop_config *config = &arraycopy_configurations[ArmCopyPlatform].backward_shifted;
       
  1953     int pld_offset = config->pld_distance;
       
  1954 
       
  1955 #ifndef AARCH64
       
  1956     bool split_read = config->split_ldm;

  1957     bool split_write = config->split_stm;
       
  1958 #endif // !AARCH64
       
  1959 
       
  1960 
       
  1961     const bool prefetch_before = pld_offset < 0;
       
  1962     const bool prefetch_after = pld_offset > 0;
       
  1963 
       
  1964     Label L_skip_pld, L_done, L_last_read;
       
  1965     if (pld_offset != 0) {
       
  1966 
       
  1967       pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
       
  1968 
       
  1969       prefetch(end_from, end_to, -wordSize);
       
  1970 
       
  1971       if (prefetch_before) {
       
  1972         __ cmp_32(count, count_per_loop);
       
  1973         __ b(L_last_read, lt);
       
  1974 
       
  1975         // skip prefetch for small copies
       
  1976         // warning: count is predecreased by the prefetch distance to optimize the inner loop
       
  1977         __ subs_32(count, count, ((bytes_per_loop + pld_offset)/bytes_per_count) + count_per_loop);
       
  1978         __ b(L_skip_pld, lt);
       
  1979       }
       
  1980 
       
  1981       int offset = ArmCopyCacheLineSize;
       
  1982       while (offset <= pld_offset) {
       
  1983         prefetch(end_from, end_to, -(wordSize + offset));
       
  1984         offset += ArmCopyCacheLineSize;
       
  1985       };
       
  1986     }
       
  1987 
       
  1988     Label L_shifted_loop;
       
  1989     __ align(OptoLoopAlignment);
       
  1990     __ BIND(L_shifted_loop);
       
  1991 
       
  1992     if (prefetch_before) {
       
  1993       // do the 1st ldm/ldp first anyway (no locking issues with early STM/STP)
       
  1994       prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset));
       
  1995       __ BIND(L_skip_pld);
       
  1996     } else {
       
  1997       __ cmp_32(count, count_per_loop);
       
  1998       __ b(L_last_read, lt);
       
  1999     }
       
  2000 
       
  2001 #ifdef AARCH64
       
  2002     __ logical_shift_left(R12, R3, lsl_shift);
       
  2003     const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12};
       
  2004     bulk_load_backward(end_from, data_regs, 8);
       
  2005 #else
       
  2006     if (split_read) {
       
  2007       __ ldmdb(end_from, RegisterSet(R7, R10), writeback);
       
  2008       __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
       
  2009       __ ldmdb(end_from, RegisterSet(R3, R6), writeback);
       
  2010     } else {
       
  2011       __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
       
  2012       __ ldmdb(end_from, RegisterSet(R3, R10), writeback);
       
  2013     }
       
  2014 #endif // AARCH64
       
  2015 
       
  2016     __ subs_32(count, count, count_per_loop);
       
  2017 
       
  2018     if (prefetch_after) { // do prefetch during ldm/ldp latency
       
  2019       prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
       
  2020     }
       
  2021 
       
  2022     // prepare the values in R4..R10,R12
       
  2023     __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); // merged above high  bytes of prev val
       
  2024     __ logical_shift_left(R10, R10, lsl_shift); // unused part of prev val
       
  2025     __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); // ...
       
  2026     __ logical_shift_left(R9, R9, lsl_shift);
       
  2027     __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift));
       
  2028     __ logical_shift_left(R8, R8, lsl_shift);
       
  2029     __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift));
       
  2030     __ logical_shift_left(R7, R7, lsl_shift);
       
  2031     __ orr(R7, R7, AsmOperand(R6, lsr, lsr_shift));
       
  2032     __ logical_shift_left(R6, R6, lsl_shift);
       
  2033     __ orr(R6, R6, AsmOperand(R5, lsr, lsr_shift));
       
  2034 #ifndef AARCH64
       
  2035     if (split_write) {
       
  2036       // store early to reduce locking issues
       
  2037       __ stmdb(end_to, RegisterSet(R6, R10) | R12, writeback, prefetch_before ? gt : ge);
       
  2038     }
       
  2039 #endif // !AARCH64
       
  2040     __ logical_shift_left(R5, R5, lsl_shift);
       
  2041     __ orr(R5, R5, AsmOperand(R4, lsr, lsr_shift));
       
  2042     __ logical_shift_left(R4, R4, lsl_shift);
       
  2043     __ orr(R4, R4, AsmOperand(R3, lsr, lsr_shift));
       
  2044 
       
  2045 #ifdef AARCH64
       
  2046     bulk_store_backward(end_to, &data_regs[1], 8);
       
  2047 #else
       
  2048     if (split_write) {
       
  2049       __ stmdb(end_to, RegisterSet(R4, R5), writeback, prefetch_before ? gt : ge);
       
  2050     } else {
       
  2051       __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback, prefetch_before ? gt : ge);
       
  2052     }
       
  2053 #endif // AARCH64
       
  2054 
       
  2055     __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop)
       
  2056 
       
  2057     if (prefetch_before) {
       
  2058       // the first loop may end earlier, allowing us to skip the PLD at the end
       
  2059       __ cmn_32(count, ((bytes_per_loop + pld_offset)/bytes_per_count));
       
  2060 #ifndef AARCH64
       
  2061       __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback); // stmdb was skipped
       
  2062 #endif // !AARCH64
       
  2063       __ b(L_skip_pld, ge);
       
  2064       __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop);
       
  2065     }
       
  2066 
       
  2067     __ BIND(L_last_read);
       
  2068     __ b(L_done, eq);
       
  2069 
       
  2070 #ifdef AARCH64
       
  2071     assert(bytes_per_count < 8, "adjust the code below");
       
  2072 
       
  2073     __ logical_shift_left(R12, R3, lsl_shift);
       
  2074 
       
  2075     {
       
  2076       Label L;
       
  2077       __ tbz(count, exact_log2(32/bytes_per_count), L);
       
  2078       bulk_load_backward(end_from, &data_regs[4], 4);
       
  2079 
       
  2080       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
       
  2081       __ logical_shift_left(R10, R10, lsl_shift);
       
  2082       __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift));
       
  2083       __ logical_shift_left(R9, R9, lsl_shift);
       
  2084       __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift));
       
  2085       __ logical_shift_left(R8, R8, lsl_shift);
       
  2086       __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift));
       
  2087 
       
  2088       bulk_store_backward(end_to, &data_regs[5], 4);
       
  2089       __ logical_shift_left(R12, R7, lsl_shift);
       
  2090       __ bind(L);
       
  2091     }
       
  2092 
       
  2093     {
       
  2094       Label L;
       
  2095       __ tbz(count, exact_log2(16/bytes_per_count), L);
       
  2096       bulk_load_backward(end_from, &data_regs[6], 2);
       
  2097 
       
  2098       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
       
  2099       __ logical_shift_left(R10, R10, lsl_shift);
       
  2100       __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift));
       
  2101 
       
  2102       bulk_store_backward(end_to, &data_regs[7], 2);
       
  2103       __ logical_shift_left(R12, R9, lsl_shift);
       
  2104       __ bind(L);
       
  2105     }
       
  2106 
       
  2107     {
       
  2108       Label L;
       
  2109       __ tbz(count, exact_log2(8/bytes_per_count), L);
       
  2110       __ ldr(R10, Address(end_from, -8, pre_indexed));
       
  2111       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
       
  2112       __ str(R12, Address(end_to, -8, pre_indexed));
       
  2113       __ logical_shift_left(R12, R10, lsl_shift);
       
  2114       __ bind(L);
       
  2115     }
       
  2116 
       
  2117     const int have_bytes = lsr_shift/BitsPerByte; // number of already read bytes in R12
       
  2118 
       
  2119     // Fewer than wordSize bytes remain to be written.

  2120     // Do not check count if R12 already holds the maximal number of pending elements (one element less than a full word).
       
  2121     if (have_bytes < wordSize - bytes_per_count) {
       
  2122       Label L;
       
  2123       __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact
       
  2124       __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store?
       
  2125       __ b(L, le);
       
  2126       __ ldr(R10, Address(end_from, -8, pre_indexed));
       
  2127       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
       
  2128       __ bind(L);
       
  2129     }
       
  2130 
       
  2131     assert (bytes_per_count <= 4, "must be");
       
  2132 
       
  2133     {
       
  2134       Label L;
       
  2135       __ tbz(count, exact_log2(4/bytes_per_count), L);
       
  2136       __ logical_shift_right(R9, R12, (wordSize-4)*BitsPerByte);
       
  2137       __ str_w(R9, Address(end_to, -4, pre_indexed)); // Write 4 MSB
       
  2138       if (bytes_per_count < 4) {
       
  2139         __ logical_shift_left(R12, R12, 4*BitsPerByte); // Promote remaining bytes to MSB
       
  2140       }
       
  2141       __ bind(L);
       
  2142     }
       
  2143 
       
  2144     if (bytes_per_count <= 2) {
       
  2145       Label L;
       
  2146       __ tbz(count, exact_log2(2/bytes_per_count), L);
       
  2147       __ logical_shift_right(R9, R12, (wordSize-2)*BitsPerByte);
       
  2148       __ strh(R9, Address(end_to, -2, pre_indexed)); // Write 2 MSB
       
  2149       if (bytes_per_count < 2) {
       
  2150         __ logical_shift_left(R12, R12, 2*BitsPerByte); // Promote remaining bytes to MSB
       
  2151       }
       
  2152       __ bind(L);
       
  2153     }
       
  2154 
       
  2155     if (bytes_per_count <= 1) {
       
  2156       Label L;
       
  2157       __ tbz(count, exact_log2(1/bytes_per_count), L);
       
  2158       __ logical_shift_right(R9, R12, (wordSize-1)*BitsPerByte);
       
  2159       __ strb(R9, Address(end_to, -1, pre_indexed)); // Write 1 MSB
       
  2160       __ bind(L);
       
  2161     }
       
  2162 #else
       
  2163     switch (bytes_per_count) {

  2164     case 2:
       
  2165       __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
       
  2166       __ tst(count, 8);
       
  2167       __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne);
       
  2168       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
       
  2169       __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
       
  2170       __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ...
       
  2171       __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne);
       
  2172       __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne);
       
  2173       __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne);
       
  2174       __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne);
       
  2175       __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne);
       
  2176       __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne);
       
  2177 
       
  2178       __ tst(count, 4);
       
  2179       __ ldmdb(end_from, RegisterSet(R9, R10), writeback, ne);
       
  2180       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
       
  2181       __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
       
  2182       __ orr(R10, R10, AsmOperand(R9, lsr,lsr_shift),ne); // ...
       
  2183       __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne);
       
  2184       __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne);
       
  2185 
       
  2186       __ tst(count, 2);
       
  2187       __ ldr(R10, Address(end_from, -4, pre_indexed), ne);
       
  2188       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
       
  2189       __ str(R12, Address(end_to, -4, pre_indexed), ne);
       
  2190       __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne);
       
  2191 
       
  2192       __ tst(count, 1);
       
  2193       __ mov(R12, AsmOperand(R12, lsr, lsr_shift),ne);
       
  2194       __ strh(R12, Address(end_to, -2, pre_indexed), ne); // one last short
       
  2195       break;
       
  2196 
       
  2197     case 1:
       
  2198       __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
       
  2199       __ tst(count, 16);
       
  2200       __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne);
       
  2201       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
       
  2202       __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
       
  2203       __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ...
       
  2204       __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne);
       
  2205       __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne);
       
  2206       __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne);
       
  2207       __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne);
       
  2208       __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne);
       
  2209       __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne);
       
  2210 
       
  2211       __ tst(count, 8);
       
  2212       __ ldmdb(end_from, RegisterSet(R9,R10), writeback, ne);
       
  2213       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
       
  2214       __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
       
  2215       __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ...
       
  2216       __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne);
       
  2217       __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne);
       
  2218 
       
  2219       __ tst(count, 4);
       
  2220       __ ldr(R10, Address(end_from, -4, pre_indexed), ne);
       
  2221       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
       
  2222       __ str(R12, Address(end_to, -4, pre_indexed), ne);
       
  2223       __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne);
       
  2224 
       
  2225       __ tst(count, 2);
       
  2226       if (lsr_shift != 24) {
       
  2227         // avoid useless reading R10 when we already have 3 bytes ready in R12
       
  2228         __ ldr(R10, Address(end_from, -4, pre_indexed), ne);
       
  2229         __ orr(R12, R12, AsmOperand(R10, lsr,lsr_shift), ne);
       
  2230       }
       
  2231 
       
  2232       // Note: R12 contains enough bytes ready to write (3 needed at most)
       
  2233       // write the 2 MSBs
       
  2234       __ mov(R9, AsmOperand(R12, lsr, 16), ne);
       
  2235       __ strh(R9, Address(end_to, -2, pre_indexed), ne);
       
  2236       // promote remaining to MSB
       
  2237       __ mov(R12, AsmOperand(R12, lsl, 16), ne);
       
  2238 
       
  2239       __ tst(count, 1);
       
  2240       // write the MSB of R12
       
  2241       __ mov(R12, AsmOperand(R12, lsr, 24), ne);
       
  2242       __ strb(R12, Address(end_to, -1, pre_indexed), ne);
       
  2243 
       
  2244       break;
       
  2245     }
       
  2246 #endif // AARCH64
       
  2247 
       
  2248     __ BIND(L_done);
       
  2249     return 0; // no minimum
       
  2250   }
       
  2251 
       
  2252   // This method is very useful for merging forward/backward implementations
       
  2253   Address get_addr_with_indexing(Register base, int delta, bool forward) {
       
  2254     if (forward) {
       
  2255       return Address(base, delta, post_indexed);
       
  2256     } else {
       
  2257       return Address(base, -delta, pre_indexed);
       
  2258     }
       
  2259   }
       
  2260 
       
  2261 #ifdef AARCH64
       
  2262   // Loads one 'size_in_bytes'-sized value from 'from' in given direction, i.e.
       
  2263   //   if forward:  loads value at from and increases from by size
       
  2264   //   if !forward: loads value at from-size_in_bytes and decreases from by size
       
  2265   void load_one(Register rd, Register from, int size_in_bytes, bool forward) {
       
  2266     assert_different_registers(from, rd);
       
  2267     Address addr = get_addr_with_indexing(from, size_in_bytes, forward);
       
  2268     __ load_sized_value(rd, addr, size_in_bytes, false);
       
  2269   }
       
  2270 
       
  2271   // Stores one 'size_in_bytes'-sized value to 'to' in given direction (see load_one)
       
  2272   void store_one(Register rd, Register to, int size_in_bytes, bool forward) {
       
  2273     assert_different_registers(to, rd);
       
  2274     Address addr = get_addr_with_indexing(to, size_in_bytes, forward);
       
  2275     __ store_sized_value(rd, addr, size_in_bytes);
       
  2276   }
       
  2277 #else
       
  2278   // load_one and store_one are the same as for AArch64 except for
       
  2279   //   *) Support for conditional execution
       
  2280   //   *) Second value register argument for 8-byte values
       
  2281 
       
  2282   void load_one(Register rd, Register from, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) {
       
  2283     assert_different_registers(from, rd, rd2);
       
  2284     if (size_in_bytes < 8) {
       
  2285       Address addr = get_addr_with_indexing(from, size_in_bytes, forward);
       
  2286       __ load_sized_value(rd, addr, size_in_bytes, false, cond);
       
  2287     } else {
       
  2288       assert (rd2 != noreg, "second value register must be specified");
       
  2289       assert (rd->encoding() < rd2->encoding(), "wrong value register set");
       
  2290 
       
  2291       if (forward) {
       
  2292         __ ldmia(from, RegisterSet(rd) | rd2, writeback, cond);
       
  2293       } else {
       
  2294         __ ldmdb(from, RegisterSet(rd) | rd2, writeback, cond);
       
  2295       }
       
  2296     }
       
  2297   }
       
  2298 
       
  2299   void store_one(Register rd, Register to, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) {
       
  2300     assert_different_registers(to, rd, rd2);
       
  2301     if (size_in_bytes < 8) {
       
  2302       Address addr = get_addr_with_indexing(to, size_in_bytes, forward);
       
  2303       __ store_sized_value(rd, addr, size_in_bytes, cond);
       
  2304     } else {
       
  2305       assert (rd2 != noreg, "second value register must be specified");
       
  2306       assert (rd->encoding() < rd2->encoding(), "wrong value register set");
       
  2307 
       
  2308       if (forward) {
       
  2309         __ stmia(to, RegisterSet(rd) | rd2, writeback, cond);
       
  2310       } else {
       
  2311         __ stmdb(to, RegisterSet(rd) | rd2, writeback, cond);
       
  2312       }
       
  2313     }
       
  2314   }
       
  2315 #endif // AARCH64
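
         // Hypothetical usage (schematic, 32-bit ARM flavour; the address register is
         // shown symbolically and the exact mnemonics depend on the assembler):
         //   load_one(R3, from, 4, /*forward*/ true)    ->  ldr   r3, [from], #4
         //   load_one(R3, from, 4, /*forward*/ false)   ->  ldr   r3, [from, #-4]!
         //   load_one(R3, from, 8, true, al, R4)        ->  ldmia from!, {r3, r4}
         //   store_one(R3, to, 2, true, ne)             ->  strh  r3, [to], #2   (only if 'ne' holds)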
       
  2316 
       
  2317   // Copies data from 'from' to 'to' in specified direction to align 'from' by 64 bits.
       
  2318   // (on 32-bit ARM 64-bit alignment is better for LDM).
       
  2319   //
       
  2320   // Arguments:
       
  2321   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
       
  2322   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
       
  2323   //     count:             32-bit int, maximum number of elements which can be copied
       
  2324   //     bytes_per_count:   size of an element
       
  2325   //     forward:           specifies copy direction
       
  2326   //
       
  2327   // Notes:
       
  2328   //   'from' and 'to' must be aligned by 'bytes_per_count'
       
  2329   //   'count' must not be less than the returned value
       
  2330   //   shifts 'from' and 'to' by the number of copied bytes in corresponding direction
       
  2331   //   decreases 'count' by the number of elements copied
       
  2332   //
       
  2333   // Returns the maximum number of elements that may be copied during this alignment step.
       
  2334   int align_src(Register from, Register to, Register count, Register tmp, int bytes_per_count, bool forward) {
       
  2335     assert_different_registers(from, to, count, tmp);
       
  2336 #ifdef AARCH64
       
  2337     // TODO-AARCH64: replace by simple loop?
       
  2338     Label Laligned_by_2, Laligned_by_4, Laligned_by_8;
       
  2339 
       
  2340     if (bytes_per_count == 1) {
       
  2341       __ tbz(from, 0, Laligned_by_2);
       
  2342       __ sub_32(count, count, 1);
       
  2343       load_one(tmp, from, 1, forward);
       
  2344       store_one(tmp, to, 1, forward);
       
  2345     }
       
  2346 
       
  2347     __ BIND(Laligned_by_2);
       
  2348 
       
  2349     if (bytes_per_count <= 2) {
       
  2350       __ tbz(from, 1, Laligned_by_4);
       
  2351       __ sub_32(count, count, 2/bytes_per_count);
       
  2352       load_one(tmp, from, 2, forward);
       
  2353       store_one(tmp, to, 2, forward);
       
  2354     }
       
  2355 
       
  2356     __ BIND(Laligned_by_4);
       
  2357 
       
  2358     if (bytes_per_count <= 4) {
       
  2359       __ tbz(from, 2, Laligned_by_8);
       
  2360       __ sub_32(count, count, 4/bytes_per_count);
       
  2361       load_one(tmp, from, 4, forward);
       
  2362       store_one(tmp, to, 4, forward);
       
  2363     }
       
  2364     __ BIND(Laligned_by_8);
       
  2365 #else // AARCH64
       
  2366     if (bytes_per_count < 8) {
       
  2367       Label L_align_src;
       
  2368       __ BIND(L_align_src);
       
  2369       __ tst(from, 7);
       
  2370       // ne => not aligned: copy one element and (if bytes_per_count < 4) loop
       
  2371       __ sub(count, count, 1, ne);
       
  2372       load_one(tmp, from, bytes_per_count, forward, ne);
       
  2373       store_one(tmp, to, bytes_per_count, forward, ne);
       
  2374       if (bytes_per_count < 4) {
       
  2375         __ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough
       
  2376       }
       
  2377     }
       
  2378 #endif // AARCH64
       
  2379     return 7/bytes_per_count;
       
  2380   }
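
  // Illustrative sketch only, assuming bytes_per_count == 1 and a forward copy:
  // the 32-bit ARM alignment loop above behaves roughly like this C fragment,
  // moving at most 7 bytes (the returned bound of 7/bytes_per_count elements)
  // until 'from' reaches an 8-byte boundary:
  //
  //   while (((uintptr_t)from & 7) != 0) {
  //     *to++ = *from++;
  //     count--;
  //   }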
       
  2381 
       
  2382   // Copies 'count' 'bytes_per_count'-sized elements in the specified direction.
       
  2383   //
       
  2384   // Arguments:
       
  2385   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
       
  2386   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
       
  2387   //     count:             32-bit int, number of elements to be copied
       
  2388   //     entry:             copy loop entry point
       
  2389   //     bytes_per_count:   size of an element
       
  2390   //     forward:           specifies copy direction
       
  2391   //
       
  2392   // Notes:
       
  2393   //     shifts 'from' and 'to'
       
  2394   void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry) {
       
  2395     assert_different_registers(from, to, count, tmp);
       
  2396 
       
  2397     __ align(OptoLoopAlignment);
       
  2398 #ifdef AARCH64
       
  2399     Label L_small_array_done, L_small_array_loop;
       
  2400     __ BIND(entry);
       
  2401     __ cbz_32(count, L_small_array_done);
       
  2402 
       
  2403     __ BIND(L_small_array_loop);
       
  2404     __ subs_32(count, count, 1);
       
  2405     load_one(tmp, from, bytes_per_count, forward);
       
  2406     store_one(tmp, to, bytes_per_count, forward);
       
  2407     __ b(L_small_array_loop, gt);
       
  2408 
       
  2409     __ BIND(L_small_array_done);
       
  2410 #else
       
  2411     Label L_small_loop;
       
  2412     __ BIND(L_small_loop);
       
  2413     store_one(tmp, to, bytes_per_count, forward, al, tmp2);
       
  2414     __ BIND(entry); // entry point
       
  2415     __ subs(count, count, 1);
       
  2416     load_one(tmp, from, bytes_per_count, forward, ge, tmp2);
       
  2417     __ b(L_small_loop, ge);
       
  2418 #endif // AARCH64
       
  2419   }
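
  // Illustrative sketch only: despite the rotated shape of the 32-bit ARM loop
  // above (store at the loop head, entry at the decrement), it is equivalent to
  // a plain element-by-element copy and tolerates count == 0 on entry:
  //
  //   while (count-- > 0) {
  //     tmp = load_one(from);   // advances 'from' in the copy direction
  //     store_one(to, tmp);     // advances 'to' in the copy direction
  //   }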
       
  2420 
       
  2421   // Aligns 'to' by reading one word from 'from' and writing part of it to 'to'.
       
  2422   //
       
  2423   // Arguments:
       
  2424   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
       
  2425   //     count:             32-bit int, number of elements allowed to be copied
       
  2426   //     to_remainder:      remainder of dividing 'to' by wordSize
       
  2427   //     bytes_per_count:   size of an element
       
  2428   //     forward:           specifies copy direction
       
  2429   //     Rval:              contains an already read but not yet written word;
       
  2430   //                        its LSBs (if forward) or MSBs (if !forward) are to be written to align 'to'.
       
  2431   //
       
  2432   // Notes:
       
  2433   //     'count' must not be less than the returned value
       
  2434   //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
       
  2435   //     shifts 'to' by the number of written bytes (so that it becomes the bound of memory to be written)
       
  2436   //     decreases 'count' by the number of elements written
       
  2437   //     Rval's MSBs or LSBs remain to be written further by generate_{forward,backward}_shifted_copy_loop
       
  2438   int align_dst(Register to, Register count, Register Rval, Register tmp,
       
  2439                                         int to_remainder, int bytes_per_count, bool forward) {
       
  2440     assert_different_registers(to, count, tmp, Rval);
       
  2441 
       
  2442     assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is not valid");
       
  2443     assert (to_remainder % bytes_per_count == 0, "to must be aligned by bytes_per_count");
       
  2444 
       
  2445     int bytes_to_write = forward ? (wordSize - to_remainder) : to_remainder;
       
  2446 
       
  2447     int offset = 0;
       
  2448 
       
  2449     for (int l = 0; l < LogBytesPerWord; ++l) {
       
  2450       int s = (1 << l);
       
  2451       if (bytes_to_write & s) {
       
  2452         int new_offset = offset + s*BitsPerByte;
       
  2453         if (forward) {
       
  2454           if (offset == 0) {
       
  2455             store_one(Rval, to, s, forward);
       
  2456           } else {
       
  2457             __ logical_shift_right(tmp, Rval, offset);
       
  2458             store_one(tmp, to, s, forward);
       
  2459           }
       
  2460         } else {
       
  2461           __ logical_shift_right(tmp, Rval, BitsPerWord - new_offset);
       
  2462           store_one(tmp, to, s, forward);
       
  2463         }
       
  2464 
       
  2465         offset = new_offset;
       
  2466       }
       
  2467     }
       
  2468 
       
  2469     assert (offset == bytes_to_write * BitsPerByte, "all bytes must be copied");
       
  2470 
       
  2471     __ sub_32(count, count, bytes_to_write/bytes_per_count);
       
  2472 
       
  2473     return bytes_to_write / bytes_per_count;
       
  2474   }
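
  // Illustrative sketch only, assuming a 32-bit word, little-endian layout, a
  // forward copy and to_remainder == 1 (so bytes_to_write == 3): the chunked
  // stores above peel the low bytes of Rval into 'to' like this:
  //
  //   uint32_t rval = /* word already read, not yet written */;
  //   *(uint8_t*) to = (uint8_t) rval;         to += 1;   // s == 1, offset 0
  //   *(uint16_t*)to = (uint16_t)(rval >> 8);  to += 2;   // s == 2, offset 8
  //   count -= 3 / bytes_per_count;
  //
  // The high byte of rval remains pending and is merged into the first word
  // written by the following shifted copy loop.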
       
  2475 
       
  2476   // Copies 'count' elements using the shifted copy loop
       
  2477   //
       
  2478   // Arguments:
       
  2479   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
       
  2480   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
       
  2481   //     count:             32-bit int, number of elements to be copied
       
  2482   //     to_remainder:      remainder of dividing 'to' by wordSize
       
  2483   //     bytes_per_count:   size of an element
       
  2484   //     forward:           specifies copy direction
       
  2485   //     Rval:              contains an already read but not yet written word
       
  2486   //
       
  2487   //
       
  2488   // Notes:
       
  2489   //     'count' must not be less than the returned value
       
  2490   //     'from' must be aligned by wordSize
       
  2491   //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
       
  2492   //     shifts 'to' by the number of copied bytes
       
  2493   //
       
  2494   // Scratches R3-R10, R12
       
  2495   int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval,
       
  2496                                                         int to_remainder, int bytes_per_count, bool forward) {
       
  2497 
       
  2498     assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid");
       
  2499 
       
  2500     const Register tmp  = forward ? R3 : R12; // TODO-AARCH64: on conjoint_short R4 was used for tmp
       
  2501     assert_different_registers(from, to, count, Rval, tmp);
       
  2502 
       
  2503     int required_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward);
       
  2504 
       
  2505     int lsr_shift = (wordSize - to_remainder) * BitsPerByte;
       
  2506     int lsl_shift = to_remainder * BitsPerByte;
       
  2507 
       
  2508     int min_copy;
       
  2509     if (forward) {
       
  2510       min_copy = generate_forward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
       
  2511     } else {
       
  2512       min_copy = generate_backward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
       
  2513     }
       
  2514 
       
  2515     return min_copy + required_to_align;
       
  2516   }
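
  // Illustrative sketch only (forward copy, little-endian, 32-bit wordSize):
  // with lsr_shift == (wordSize - to_remainder)*BitsPerByte and
  // lsl_shift == to_remainder*BitsPerByte, the shifted loop assembles every
  // aligned destination word from the leftover part of the previously read
  // source word and the low part of the next one:
  //
  //   uint32_t rval = *src++;        // word already held in Rval
  //   while (words_to_copy-- > 0) {  // simplified trip count
  //     uint32_t next = *src++;
  //     *dst++ = (rval >> lsr_shift) | (next << lsl_shift);
  //     rval = next;
  //   }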
       
  2517 
       
  2518   // Copies 'count' elements using the shifted copy loop
       
  2519   //
       
  2520   // Arguments:
       
  2521   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
       
  2522   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
       
  2523   //     count:             32-bit int, number of elements to be copied
       
  2524   //     bytes_per_count:   size of an element
       
  2525   //     forward:           specifies copy direction
       
  2526   //
       
  2527   // Notes:
       
  2528   //     'count' must not be less than the returned value
       
  2529   //     'from' must be aligned by wordSize
       
  2530   //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
       
  2531   //     shifts 'to' by the number of copied bytes
       
  2532   //
       
  2533   // Scratches 'from', 'count', R3 and R12.
       
  2534   // On AArch64 also scratches R4-R10; on 32-bit ARM saves and restores them around their use.
       
  2535   int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward) {
       
  2536 
       
  2537     const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect
       
  2538 
       
  2539     int min_copy = 0;
       
  2540 
       
  2541     // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
       
  2542     // then the remainder of 'to' divided by wordSize is one of the elements of {seq}.
       
  2543 
       
  2544 #ifdef AARCH64
       
  2545     // TODO-AARCH64: simplify, tune
       
  2546 
       
  2547     load_one(Rval, from, wordSize, forward);
       
  2548 
       
  2549     Label L_loop_finished;
       
  2550 
       
  2551     switch (bytes_per_count) {
       
  2552       case 4:
       
  2553         min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward);
       
  2554         break;
       
  2555       case 2:
       
  2556       {
       
  2557         Label L2, L4, L6;
       
  2558 
       
  2559         __ tbz(to, 1, L4);
       
  2560         __ tbz(to, 2, L2);
       
  2561 
       
  2562         __ BIND(L6);
       
  2563         int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward);
       
  2564         __ b(L_loop_finished);
       
  2565 
       
  2566         __ BIND(L2);
       
  2567         int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
       
  2568         __ b(L_loop_finished);
       
  2569 
       
  2570         __ BIND(L4);
       
  2571         int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward);
       
  2572 
       
  2573         min_copy = MAX2(MAX2(min_copy2, min_copy4), min_copy6);
       
  2574         break;
       
  2575       }
       
  2576       case 1:
       
  2577       {
       
  2578         Label L1, L2, L3, L4, L5, L6, L7;
       
  2579         Label L15, L26;
       
  2580         Label L246;
       
  2581 
       
  2582         __ tbz(to, 0, L246);
       
  2583         __ tbz(to, 1, L15);
       
  2584         __ tbz(to, 2, L3);
       
  2585 
       
  2586         __ BIND(L7);
       
  2587         int min_copy7 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 7, bytes_per_count, forward);
       
  2588         __ b(L_loop_finished);
       
  2589 
       
  2590         __ BIND(L246);
       
  2591         __ tbnz(to, 1, L26);
       
  2592 
       
  2593         __ BIND(L4);
       
  2594         int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward);
       
  2595         __ b(L_loop_finished);
       
  2596 
       
  2597         __ BIND(L15);
       
  2598         __ tbz(to, 2, L1);
       
  2599 
       
  2600         __ BIND(L5);
       
  2601         int min_copy5 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 5, bytes_per_count, forward);
       
  2602         __ b(L_loop_finished);
       
  2603 
       
  2604         __ BIND(L3);
       
  2605         int min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
       
  2606         __ b(L_loop_finished);
       
  2607 
       
  2608         __ BIND(L26);
       
  2609         __ tbz(to, 2, L2);
       
  2610 
       
  2611         __ BIND(L6);
       
  2612         int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward);
       
  2613         __ b(L_loop_finished);
       
  2614 
       
  2615         __ BIND(L1);
       
  2616         int min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
       
  2617         __ b(L_loop_finished);
       
  2618 
       
  2619         __ BIND(L2);
       
  2620         int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
       
  2621 
       
  2622 
       
  2623         min_copy = MAX2(min_copy1, min_copy2);
       
  2624         min_copy = MAX2(min_copy,  min_copy3);
       
  2625         min_copy = MAX2(min_copy,  min_copy4);
       
  2626         min_copy = MAX2(min_copy,  min_copy5);
       
  2627         min_copy = MAX2(min_copy,  min_copy6);
       
  2628         min_copy = MAX2(min_copy,  min_copy7);
       
  2629         break;
       
  2630       }
       
  2631       default:
       
  2632         ShouldNotReachHere();
       
  2633         break;
       
  2634     }
       
  2635     __ BIND(L_loop_finished);
       
  2636 
       
  2637 #else
       
  2638     __ push(RegisterSet(R4,R10));
       
  2639     load_one(Rval, from, wordSize, forward);
       
  2640 
       
  2641     switch (bytes_per_count) {
       
  2642       case 2:
       
  2643         min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
       
  2644         break;
       
  2645       case 1:
       
  2646       {
       
  2647         Label L1, L2, L3;
       
  2648         int min_copy1, min_copy2, min_copy3;
       
  2649 
       
  2650         Label L_loop_finished;
       
  2651 
       
  2652         if (forward) {
       
  2653             __ tbz(to, 0, L2);
       
  2654             __ tbz(to, 1, L1);
       
  2655 
       
  2656             __ BIND(L3);
       
  2657             min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
       
  2658             __ b(L_loop_finished);
       
  2659 
       
  2660             __ BIND(L1);
       
  2661             min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
       
  2662             __ b(L_loop_finished);
       
  2663 
       
  2664             __ BIND(L2);
       
  2665             min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
       
  2666         } else {
       
  2667             __ tbz(to, 0, L2);
       
  2668             __ tbnz(to, 1, L3);
       
  2669 
       
  2670             __ BIND(L1);
       
  2671             min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
       
  2672             __ b(L_loop_finished);
       
  2673 
       
  2674              __ BIND(L3);
       
  2675             min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
       
  2676             __ b(L_loop_finished);
       
  2677 
       
  2678            __ BIND(L2);
       
  2679             min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
       
  2680         }
       
  2681 
       
  2682         min_copy = MAX2(MAX2(min_copy1, min_copy2), min_copy3);
       
  2683 
       
  2684         __ BIND(L_loop_finished);
       
  2685 
       
  2686         break;
       
  2687       }
       
  2688       default:
       
  2689         ShouldNotReachHere();
       
  2690         break;
       
  2691     }
       
  2692 
       
  2693     __ pop(RegisterSet(R4,R10));
       
  2694 #endif // AARCH64
       
  2695 
       
  2696     return min_copy;
       
  2697   }
       
  2698 
       
  2699 #ifndef PRODUCT
       
  2700   int * get_arraycopy_counter(int bytes_per_count) {
       
  2701     switch (bytes_per_count) {
       
  2702       case 1:
       
  2703         return &SharedRuntime::_jbyte_array_copy_ctr;
       
  2704       case 2:
       
  2705         return &SharedRuntime::_jshort_array_copy_ctr;
       
  2706       case 4:
       
  2707         return &SharedRuntime::_jint_array_copy_ctr;
       
  2708       case 8:
       
  2709         return &SharedRuntime::_jlong_array_copy_ctr;
       
  2710       default:
       
  2711         ShouldNotReachHere();
       
  2712         return NULL;
       
  2713     }
       
  2714   }
       
  2715 #endif // !PRODUCT
       
  2716 
       
  2717   //
       
  2718   //  Generate stub for primitive array copy.  If "aligned" is true, the
       
  2719   //  "from" and "to" addresses are assumed to be heapword aligned.
       
  2720   //
       
  2721   //  If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and
       
  2722   //  "nooverlap_target" must be specified as the address to jump to if they do not overlap.
       
  2723   //
       
  2724   // Arguments for generated stub:
       
  2725   //      from:  R0
       
  2726   //      to:    R1
       
  2727   //      count: R2 treated as signed 32-bit int
       
  2728   //
       
  2729   address generate_primitive_copy(bool aligned, const char * name, bool status, int bytes_per_count, bool disjoint, address nooverlap_target = NULL) {
       
  2730     __ align(CodeEntryAlignment);
       
  2731     StubCodeMark mark(this, "StubRoutines", name);
       
  2732     address start = __ pc();
       
  2733 
       
  2734     const Register from  = R0;   // source array address
       
  2735     const Register to    = R1;   // destination array address
       
  2736     const Register count = R2;   // elements count
       
  2737     const Register tmp1  = R3;
       
  2738     const Register tmp2  = R12;
       
  2739 
       
  2740     if (!aligned)  {
       
  2741       BLOCK_COMMENT("Entry:");
       
  2742     }
       
  2743 
       
  2744     __ zap_high_non_significant_bits(R2);
       
  2745 
       
  2746     if (!disjoint) {
       
  2747       assert (nooverlap_target != NULL, "must be specified for conjoint case");
       
  2748       array_overlap_test(nooverlap_target, exact_log2(bytes_per_count), tmp1, tmp2);
       
  2749     }
       
  2750 
       
  2751     inc_counter_np(*get_arraycopy_counter(bytes_per_count), tmp1, tmp2);
       
  2752 
       
  2753     // Conjoint case: since execution reaches this point, the arrays overlap, so perform a backward copy
       
  2754     // Disjoint case: perform forward copy
       
  2755     bool forward = disjoint;
       
  2756 
       
  2757 
       
  2758     if (!forward) {
       
  2759       // Set 'from' and 'to' to upper bounds
       
  2760       int log_bytes_per_count = exact_log2(bytes_per_count);
       
  2761       __ add_ptr_scaled_int32(to,   to,   count, log_bytes_per_count);
       
  2762       __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count);
       
  2763     }
       
  2764 
       
  2765     // There are two main copy loop implementations:
       
  2766     //  *) The huge and complex one applicable only for large enough arrays
       
  2767     //  *) The small and simple one applicable for any array (but not efficient for large arrays).
       
  2768     // Currently the "small" implementation is used if and only if the "large" one cannot be used.
       
  2769     // XXX optim: tune the limit higher ?
       
  2770     // The lower applicability bound of the large implementation is determined by the
       
  2771     // src alignment step (at most 7 bytes) plus one iteration of the aligned copy loop (8 words).
       
  2772     const int small_copy_limit = (8*wordSize + 7) / bytes_per_count;
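
    // Worked example (illustrative arithmetic only): on 32-bit ARM (wordSize == 4)
    // a byte copy gets small_copy_limit = (8*4 + 7)/1 = 39 elements, while an int
    // copy gets (8*4 + 7)/4 = 9 elements; counts at or below the limit take the
    // small copy loop.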
       
  2773 
       
  2774     Label L_small_array;
       
  2775     __ cmp_32(count, small_copy_limit);
       
  2776     __ b(L_small_array, le); // TODO-AARCH64: le vs lt
       
  2777 
       
  2778     // Otherwise proceed with large implementation.
       
  2779 
       
  2780     bool from_is_aligned = (bytes_per_count >= 8);
       
  2781     if (aligned && forward && (HeapWordSize % 8 == 0)) {
       
  2782         // if 'from' is heapword aligned and HeapWordSize is divisible by 8,
       
  2783         //  then from is aligned by 8
       
  2784         from_is_aligned = true;
       
  2785     }
       
  2786 
       
  2787     int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward);
       
  2788     assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count");
       
  2789 
       
  2790     // now 'from' is aligned
       
  2791 
       
  2792     bool to_is_aligned = false;
       
  2793 
       
  2794     if (bytes_per_count >= wordSize) {
       
  2795       // 'to' is aligned by bytes_per_count, so it is aligned by wordSize
       
  2796       to_is_aligned = true;
       
  2797     } else {
       
  2798       if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) {
       
  2799         // Originally 'from' and 'to' were heapword aligned;
       
  2800         // (from - to) has not changed, so now that 'from' is 8-byte aligned, it is also heapword aligned,
       
  2801         //  so 'to' is also heapword aligned and thus aligned by wordSize.
       
  2802         to_is_aligned = true;
       
  2803       }
       
  2804     }
       
  2805 
       
  2806     Label L_unaligned_dst;
       
  2807 
       
  2808     if (!to_is_aligned) {
       
  2809       BLOCK_COMMENT("Check dst alignment:");
       
  2810       __ tst(to, wordSize - 1);
       
  2811       __ b(L_unaligned_dst, ne); // 'to' is not aligned
       
  2812     }
       
  2813 
       
  2814     // 'from' and 'to' are properly aligned
       
  2815 
       
  2816     int min_copy;
       
  2817     if (forward) {
       
  2818       min_copy = generate_forward_aligned_copy_loop (from, to, count, bytes_per_count);
       
  2819     } else {
       
  2820       min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count);
       
  2821     }
       
  2822     assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count");
       
  2823 
       
  2824     if (status) {
       
  2825       __ mov(R0, 0); // OK
       
  2826     }
       
  2827 
       
  2828     __ ret();
       
  2829 
       
  2830     {
       
  2831       copy_small_array(from, to, count, tmp1, tmp2, bytes_per_count, forward, L_small_array /* entry */);
       
  2832 
       
  2833       if (status) {
       
  2834         __ mov(R0, 0); // OK
       
  2835       }
       
  2836 
       
  2837       __ ret();
       
  2838     }
       
  2839 
       
  2840     if (! to_is_aligned) {
       
  2841       __ BIND(L_unaligned_dst);
       
  2842       int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward);
       
  2843       assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");
       
  2844 
       
  2845       if (status) {
       
  2846         __ mov(R0, 0); // OK
       
  2847       }
       
  2848 
       
  2849       __ ret();
       
  2850     }
       
  2851 
       
  2852     return start;
       
  2853   }
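
  // Illustrative sketch only of the control flow laid out by generate_primitive_copy:
  //
  //   if (count <= small_copy_limit) {
  //     copy element by element;                         // copy_small_array
  //   } else {
  //     copy up to 7 bytes so 'from' is 8-byte aligned;  // align_src
  //     if ('to' is now word aligned)
  //       run the aligned copy loop;
  //     else
  //       run the shifted (unaligned destination) copy loop;
  //   }
  //   return 0;                                          // only if 'status' is set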
       
  2854 
       
  2855 #if INCLUDE_ALL_GCS
       
  2856   //
       
  2857   //  Generate pre-write barrier for array.
       
  2858   //
       
  2859   //  Input:
       
  2860   //     addr     - register containing starting address
       
  2861   //     count    - register containing element count, 32-bit int
       
  2862   //     callee_saved_regs -
       
  2863   //                the call must preserve this number of registers: R0, R1, ..., R[callee_saved_regs-1]
       
  2864   //
       
  2865   //  callee_saved_regs must include addr and count
       
  2866   //  Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) except for callee_saved_regs.
       
  2867   void gen_write_ref_array_pre_barrier(Register addr, Register count, int callee_saved_regs) {
       
  2868     BarrierSet* bs = Universe::heap()->barrier_set();
       
  2869     if (bs->has_write_ref_pre_barrier()) {
       
  2870       assert(bs->has_write_ref_array_pre_opt(),
       
  2871              "Else unsupported barrier set.");
       
  2872 
       
  2873       assert( addr->encoding() < callee_saved_regs, "addr must be saved");
       
  2874       assert(count->encoding() < callee_saved_regs, "count must be saved");
       
  2875 
       
  2876       BLOCK_COMMENT("PreBarrier");
       
  2877 
       
  2878 #ifdef AARCH64
       
  2879       callee_saved_regs = round_to(callee_saved_regs, 2);
       
  2880       for (int i = 0; i < callee_saved_regs; i += 2) {
       
  2881         __ raw_push(as_Register(i), as_Register(i+1));
       
  2882       }
       
  2883 #else
       
  2884       RegisterSet saved_regs = RegisterSet(R0, as_Register(callee_saved_regs-1));
       
  2885       __ push(saved_regs | R9ifScratched);
       
  2886 #endif // AARCH64
       
  2887 
       
  2888       if (addr != R0) {
       
  2889         assert_different_registers(count, R0);
       
  2890         __ mov(R0, addr);
       
  2891       }
       
  2892 #ifdef AARCH64
       
  2893       __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_pre takes size_t
       
  2894 #else
       
  2895       if (count != R1) {
       
  2896         __ mov(R1, count);
       
  2897       }
       
  2898 #endif // AARCH64
       
  2899 
       
  2900       __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
       
  2901 
       
  2902 #ifdef AARCH64
       
  2903       for (int i = callee_saved_regs - 2; i >= 0; i -= 2) {
       
  2904         __ raw_pop(as_Register(i), as_Register(i+1));
       
  2905       }
       
  2906 #else
       
  2907       __ pop(saved_regs | R9ifScratched);
       
  2908 #endif // AARCH64
       
  2909     }
       
  2910   }
       
  2911 #endif // INCLUDE_ALL_GCS
       
  2912 
       
  2913   //
       
  2914   //  Generate post-write barrier for array.
       
  2915   //
       
  2916   //  Input:
       
  2917   //     addr     - register containing starting address (can be scratched)
       
  2918   //     count    - register containing element count, 32-bit int (can be scratched)
       
  2919   //     tmp      - scratch register
       
  2920   //
       
  2921   //  Note: LR can be scratched but might be equal to addr, count or tmp
       
  2922   //  Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
       
  2923   void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp) {
       
  2924     assert_different_registers(addr, count, tmp);
       
  2925     BarrierSet* bs = Universe::heap()->barrier_set();
       
  2926 
       
  2927     switch (bs->kind()) {
       
  2928     case BarrierSet::G1SATBCTLogging:
       
  2929       {
       
  2930         BLOCK_COMMENT("G1PostBarrier");
       
  2931         if (addr != R0) {
       
  2932           assert_different_registers(count, R0);
       
  2933           __ mov(R0, addr);
       
  2934         }
       
  2935 #ifdef AARCH64
       
  2936         __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_post takes size_t
       
  2937 #else
       
  2938         if (count != R1) {
       
  2939           __ mov(R1, count);
       
  2940         }
       
  2941 #if R9_IS_SCRATCHED
       
  2942         // Safer to save R9 here since callers may have been written
       
  2943         // assuming R9 survives. This is suboptimal but is not in
       
  2944         // general worth optimizing for the few platforms where R9
       
  2945         // is scratched. Note that the optimization might not be too
       
  2946         // difficult for this particular call site.
       
  2947         __ push(R9);
       
  2948 #endif
       
  2949 #endif // !AARCH64
       
  2950         __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
       
  2951 #ifndef AARCH64
       
  2952 #if R9_IS_SCRATCHED
       
  2953         __ pop(R9);
       
  2954 #endif
       
  2955 #endif // !AARCH64
       
  2956       }
       
  2957       break;
       
  2958     case BarrierSet::CardTableForRS:
       
  2959     case BarrierSet::CardTableExtension:
       
  2960       {
       
  2961         BLOCK_COMMENT("CardTablePostBarrier");
       
  2962         CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
       
  2963         assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
       
  2964 
       
  2965         Label L_cardtable_loop;
       
  2966 
       
  2967         __ add_ptr_scaled_int32(count, addr, count, LogBytesPerHeapOop);
       
  2968         __ sub(count, count, BytesPerHeapOop);                            // last addr
       
  2969 
       
  2970         __ logical_shift_right(addr, addr, CardTableModRefBS::card_shift);
       
  2971         __ logical_shift_right(count, count, CardTableModRefBS::card_shift);
       
  2972         __ sub(count, count, addr); // nb of cards
       
  2973 
       
  2974         // warning: Rthread has not been preserved
       
  2975         __ mov_address(tmp, (address) ct->byte_map_base, symbolic_Relocation::card_table_reference);
       
  2976         __ add(addr,tmp, addr);
       
  2977 
       
  2978         Register zero = __ zero_register(tmp);
       
  2979 
       
  2980         __ BIND(L_cardtable_loop);
       
  2981         __ strb(zero, Address(addr, 1, post_indexed));
       
  2982         __ subs(count, count, 1);
       
  2983         __ b(L_cardtable_loop, ge);
       
  2984       }
       
  2985       break;
       
  2986     case BarrierSet::ModRef:
       
  2987       break;
       
  2988     default:
       
  2989       ShouldNotReachHere();
       
  2990     }
       
  2991   }
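
  // Illustrative sketch only of the card table branch above ('zero' is the dirty
  // value stored by the generated loop); byte_map_base and card_shift come from
  // the active CardTableModRefBS:
  //
  //   jbyte*    base  = ct->byte_map_base;
  //   uintptr_t first = (uintptr_t)addr >> CardTableModRefBS::card_shift;
  //   uintptr_t last  = ((uintptr_t)addr + count*BytesPerHeapOop - BytesPerHeapOop)
  //                     >> CardTableModRefBS::card_shift;
  //   for (uintptr_t card = first; card <= last; card++) {
  //     base[card] = 0;
  //   }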
       
  2992 
       
  2993   // Generates pattern of code to be placed after raw data copying in generate_oop_copy
       
  2994   // Includes return from arraycopy stub.
       
  2995   //
       
  2996   // Arguments:
       
  2997   //     to:       destination pointer after copying.
       
  2998   //               if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region
       
  2999   //     count:    total number of copied elements, 32-bit int
       
  3000   //
       
  3001   // Blows all volatile (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) and 'to', 'count', 'tmp' registers.
       
  3002   void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward) {
       
  3003     assert_different_registers(to, count, tmp);
       
  3004 
       
  3005     if (forward) {
       
  3006       // 'to' is upper bound of the modified region
       
  3007       // restore initial dst:
       
  3008       __ sub_ptr_scaled_int32(to, to, count, LogBytesPerHeapOop);
       
  3009     }
       
  3010 
       
  3011     // 'to' is the beginning of the region
       
  3012 
       
  3013     gen_write_ref_array_post_barrier(to, count, tmp);
       
  3014 
       
  3015     if (status) {
       
  3016       __ mov(R0, 0); // OK
       
  3017     }
       
  3018 
       
  3019 #ifdef AARCH64
       
  3020     __ raw_pop(LR, ZR);
       
  3021     __ ret();
       
  3022 #else
       
  3023     __ pop(PC);
       
  3024 #endif // AARCH64
       
  3025   }
       
  3026 
       
  3027 
       
  3028   //  Generate stub for assign-compatible oop copy.  If "aligned" is true, the
       
  3029   //  "from" and "to" addresses are assumed to be heapword aligned.
       
  3030   //
       
  3031   //  If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and
       
  3032   //  "nooverlap_target" must be specified as the address to jump to if they do not overlap.
       
  3033   //
       
  3034   // Arguments for generated stub:
       
  3035   //      from:  R0
       
  3036   //      to:    R1
       
  3037   //      count: R2 treated as signed 32-bit int
       
  3038   //
       
  3039   address generate_oop_copy(bool aligned, const char * name, bool status, bool disjoint, address nooverlap_target = NULL) {
       
  3040     __ align(CodeEntryAlignment);
       
  3041     StubCodeMark mark(this, "StubRoutines", name);
       
  3042     address start = __ pc();
       
  3043 
       
  3044     Register from  = R0;
       
  3045     Register to    = R1;
       
  3046     Register count = R2;
       
  3047     Register tmp1  = R3;
       
  3048     Register tmp2  = R12;
       
  3049 
       
  3050 
       
  3051     if (!aligned) {
       
  3052       BLOCK_COMMENT("Entry:");
       
  3053     }
       
  3054 
       
  3055     __ zap_high_non_significant_bits(R2);
       
  3056 
       
  3057     if (!disjoint) {
       
  3058       assert (nooverlap_target != NULL, "must be specified for conjoint case");
       
  3059       array_overlap_test(nooverlap_target, LogBytesPerHeapOop, tmp1, tmp2);
       
  3060     }
       
  3061 
       
  3062     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, tmp1, tmp2);
       
  3063 
       
  3064     // Conjoint case: since execution reaches this point, the arrays overlap, so perform a backward copy
       
  3065     // Disjoint case: perform forward copy
       
  3066     bool forward = disjoint;
       
  3067 
       
  3068     const int bytes_per_count = BytesPerHeapOop;
       
  3069     const int log_bytes_per_count = LogBytesPerHeapOop;
       
  3070 
       
  3071     const Register saved_count = LR;
       
  3072     const int callee_saved_regs = 3; // R0-R2
       
  3073 
       
  3074     // LR is used later to save barrier args
       
  3075 #ifdef AARCH64
       
  3076     __ raw_push(LR, ZR);
       
  3077 #else
       
  3078     __ push(LR);
       
  3079 #endif // AARCH64
       
  3080 
       
  3081 #if INCLUDE_ALL_GCS
       
  3082     gen_write_ref_array_pre_barrier(to, count, callee_saved_regs);
       
  3083 #endif // INCLUDE_ALL_GCS
       
  3084 
       
  3085     // save arguments for barrier generation (after the pre barrier)
       
  3086     __ mov(saved_count, count);
       
  3087 
       
  3088     if (!forward) {
       
  3089       __ add_ptr_scaled_int32(to,   to,   count, log_bytes_per_count);
       
  3090       __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count);
       
  3091     }
       
  3092 
       
  3093     // for short arrays, just do single element copy
       
  3094     Label L_small_array;
       
  3095     const int small_copy_limit = (8*wordSize + 7)/bytes_per_count; // XXX optim: tune the limit higher ?
       
  3096     __ cmp_32(count, small_copy_limit);
       
  3097     __ b(L_small_array, le);
       
  3098 
       
  3099     bool from_is_aligned = (bytes_per_count >= 8);
       
  3100     if (aligned && forward && (HeapWordSize % 8 == 0)) {
       
  3101         // if 'from' is heapword aligned and HeapWordSize is divisible by 8,
       
  3102         //  then from is aligned by 8
       
  3103         from_is_aligned = true;
       
  3104     }
       
  3105 
       
  3106     int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward);
       
  3107     assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count");
       
  3108 
       
  3109     // now 'from' is aligned
       
  3110 
       
  3111     bool to_is_aligned = false;
       
  3112 
       
  3113     if (bytes_per_count >= wordSize) {
       
  3114       // 'to' is aligned by bytes_per_count, so it is aligned by wordSize
       
  3115       to_is_aligned = true;
       
  3116     } else {
       
  3117       if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) {
       
  3118         // Originally 'from' and 'to' were heapword aligned;
       
  3119         // (from - to) has not changed, so now that 'from' is 8-byte aligned, it is also heapword aligned,
       
  3120         //  so 'to' is also heapword aligned and thus aligned by wordSize.
       
  3121         to_is_aligned = true;
       
  3122       }
       
  3123     }
       
  3124 
       
  3125     Label L_unaligned_dst;
       
  3126 
       
  3127     if (!to_is_aligned) {
       
  3128       BLOCK_COMMENT("Check dst alignment:");
       
  3129       __ tst(to, wordSize - 1);
       
  3130       __ b(L_unaligned_dst, ne); // 'to' is not aligned
       
  3131     }
       
  3132 
       
  3133     int min_copy;
       
  3134     if (forward) {
       
  3135       min_copy = generate_forward_aligned_copy_loop(from, to, count, bytes_per_count);
       
  3136     } else {
       
  3137       min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count);
       
  3138     }
       
  3139     assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count");
       
  3140 
       
  3141     oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward);
       
  3142 
       
  3143     {
       
  3144       copy_small_array(from, to, count, tmp1, noreg, bytes_per_count, forward, L_small_array);
       
  3145 
       
  3146       oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward);
       
  3147     }
       
  3148 
       
  3149     if (!to_is_aligned) {
       
  3150       // !to_is_aligned <=> UseCompressedOops && AArch64
       
  3151       __ BIND(L_unaligned_dst);
       
  3152 #ifdef AARCH64
       
  3153       assert (UseCompressedOops, "unaligned oop array copy may be requested only with UseCompressedOops");
       
  3154 #else
       
  3155       ShouldNotReachHere();
       
  3156 #endif // AARCH64
       
  3157       int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward);
       
  3158       assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");
       
  3159 
       
  3160       oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward);
       
  3161     }
       
  3162 
       
  3163     return start;
       
  3164   }
       
  3165 
       
  3166   //  Generate 'unsafe' array copy stub
       
  3167   //  Though just as safe as the other stubs, it takes an unscaled
       
  3168   //  size_t argument instead of an element count.
       
  3169   //
       
  3170   // Arguments for generated stub:
       
  3171   //      from:  R0
       
  3172   //      to:    R1
       
  3173   //      count: R2 byte count, treated as ssize_t, can be zero
       
  3174   //
       
  3175   // Examines the alignment of the operands and dispatches
       
  3176   // to a long, int, short, or byte copy loop.
       
  3177   //
       
  3178   address generate_unsafe_copy(const char* name) {
       
  3179 
       
  3180     const Register R0_from   = R0;      // source array address
       
  3181     const Register R1_to     = R1;      // destination array address
       
  3182     const Register R2_count  = R2;      // elements count
       
  3183 
       
  3184     const Register R3_bits   = R3;      // test copy of low bits
       
  3185 
       
  3186     __ align(CodeEntryAlignment);
       
  3187     StubCodeMark mark(this, "StubRoutines", name);
       
  3188     address start = __ pc();
       
  3189 #ifdef AARCH64
       
  3190     __ NOT_IMPLEMENTED();
       
  3191     start = NULL;
       
  3192 #else
       
  3193     const Register tmp = Rtemp;
       
  3194 
       
  3195     // bump this on entry, not on exit:
       
  3196     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R3, tmp);
       
  3197 
       
  3198     __ orr(R3_bits, R0_from, R1_to);
       
  3199     __ orr(R3_bits, R2_count, R3_bits);
       
  3200 
       
  3201     __ tst(R3_bits, BytesPerLong-1);
       
  3202     __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerLong), eq);
       
  3203     __ jump(StubRoutines::_jlong_arraycopy, relocInfo::runtime_call_type, tmp, eq);
       
  3204 
       
  3205     __ tst(R3_bits, BytesPerInt-1);
       
  3206     __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerInt), eq);
       
  3207     __ jump(StubRoutines::_jint_arraycopy, relocInfo::runtime_call_type, tmp, eq);
       
  3208 
       
  3209     __ tst(R3_bits, BytesPerShort-1);
       
  3210     __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerShort), eq);
       
  3211     __ jump(StubRoutines::_jshort_arraycopy, relocInfo::runtime_call_type, tmp, eq);
       
  3212 
       
  3213     __ jump(StubRoutines::_jbyte_arraycopy, relocInfo::runtime_call_type, tmp);
       
  3214 #endif
       
  3215     return start;
       
  3216   }
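
  // Illustrative sketch only of the dispatch above: the stub picks the widest
  // element size dividing the source address, destination address and byte count
  // (the names below stand for the corresponding StubRoutines entries):
  //
  //   size_t bits = (uintptr_t)from | (uintptr_t)to | (size_t)byte_count;
  //   if      ((bits & (BytesPerLong  - 1)) == 0) jlong_arraycopy (from, to, byte_count >> LogBytesPerLong);
  //   else if ((bits & (BytesPerInt   - 1)) == 0) jint_arraycopy  (from, to, byte_count >> LogBytesPerInt);
  //   else if ((bits & (BytesPerShort - 1)) == 0) jshort_arraycopy(from, to, byte_count >> LogBytesPerShort);
  //   else                                        jbyte_arraycopy (from, to, byte_count);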
       
  3217 
       
  3218   // Helper for generating a dynamic type check.
       
  3219   // Smashes only the given temp registers.
       
  3220   void generate_type_check(Register sub_klass,
       
  3221                            Register super_check_offset,
       
  3222                            Register super_klass,
       
  3223                            Register tmp1,
       
  3224                            Register tmp2,
       
  3225                            Register tmp3,
       
  3226                            Label& L_success) {
       
  3227     assert_different_registers(sub_klass, super_check_offset, super_klass, tmp1, tmp2, tmp3);
       
  3228 
       
  3229     BLOCK_COMMENT("type_check:");
       
  3230 
       
  3231     // If the pointers are equal, we are done (e.g., String[] elements).
       
  3232 
       
  3233     __ cmp(super_klass, sub_klass);
       
  3234     __ b(L_success, eq); // fast success
       
  3235 
       
  3236 
       
  3237     Label L_loop, L_fail;
       
  3238 
       
  3239     int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
       
  3240 
       
  3241     // Check the supertype display:
       
  3242     __ ldr(tmp1, Address(sub_klass, super_check_offset));
       
  3243     __ cmp(tmp1, super_klass);
       
  3244     __ b(L_success, eq);
       
  3245 
       
  3246     __ cmp(super_check_offset, sc_offset);
       
  3247     __ b(L_fail, ne); // failure
       
  3248 
       
  3249     BLOCK_COMMENT("type_check_slow_path:");
       
  3250 
       
  3251     // a couple of useful fields in sub_klass:
       
  3252     int ss_offset = in_bytes(Klass::secondary_supers_offset());
       
  3253 
       
  3254     // Do a linear scan of the secondary super-klass chain.
       
  3255 
       
  3256 #ifndef PRODUCT
       
  3257     int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
       
  3258     __ inc_counter((address) pst_counter, tmp1, tmp2);
       
  3259 #endif
       
  3260 
       
  3261     Register scan_temp = tmp1;
       
  3262     Register count_temp = tmp2;
       
  3263 
       
  3264     // We will consult the secondary-super array.
       
  3265     __ ldr(scan_temp, Address(sub_klass, ss_offset));
       
  3266 
       
  3267     Register search_key = super_klass;
       
  3268 
       
  3269     // Load the array length.
       
  3270     __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
       
  3271     __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes());
       
  3272 
       
  3273     __ add(count_temp, count_temp, 1);
       
  3274 
       
  3275     // Top of search loop
       
  3276     __ bind(L_loop);
       
  3277     // Notes:
       
  3278     //  scan_temp starts at the array elements
       
  3279     //  count_temp is 1+size
       
  3280 
       
  3281     __ subs(count_temp, count_temp, 1);
       
  3282     __ b(L_fail, eq); // not found
       
  3283 
       
  3284     // Load next super to check
       
  3285     // In the array of super classes elements are pointer sized.
       
  3286     int element_size = wordSize;
       
  3287     __ ldr(tmp3, Address(scan_temp, element_size, post_indexed));
       
  3288 
       
  3289     // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
       
  3290     __ cmp(tmp3, search_key);
       
  3291 
       
  3292     // A miss means we are NOT a subtype and need to keep looping
       
  3293     __ b(L_loop, ne);
       
  3294 
       
  3295     // Falling out the bottom means we found a hit; we ARE a subtype
       
  3296 
       
  3297     // Success.  Cache the super we found and proceed in triumph.
       
  3298     __ str(super_klass, Address(sub_klass, sc_offset));
       
  3299 
       
  3300     // Jump to success
       
  3301     __ b(L_success);
       
  3302 
       
  3303     // Fall through on failure!
       
  3304     __ bind(L_fail);
       
  3305   }
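
  // Illustrative sketch only of the dynamic type check generated above; on a
  // slow-path hit the secondary super cache slot of sub_klass is updated:
  //
  //   if (sub_klass == super_klass) goto success;
  //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass) goto success;
  //   if (super_check_offset != sc_offset) goto fail;
  //   Array<Klass*>* ss = sub_klass->secondary_supers();
  //   for (int i = 0; i < ss->length(); i++) {
  //     if (ss->at(i) == super_klass) {
  //       *(Klass**)((address)sub_klass + sc_offset) = super_klass;  // cache the hit
  //       goto success;
  //     }
  //   }
  //   goto fail;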
       
  3306 
       
  3307   //  Generate stub for checked oop copy.
       
  3308   //
       
  3309   // Arguments for generated stub:
       
  3310   //      from:  R0
       
  3311   //      to:    R1
       
  3312   //      count: R2 treated as signed 32-bit int
       
  3313   //      ckoff: R3 (super_check_offset)
       
  3314   //      ckval: R4 (AArch64) / SP[0] (32-bit ARM) (super_klass)
       
  3315   //      ret:   R0 zero for success; (-1^K) where K is partial transfer count (32-bit)
       
  3316   //
       
  3317   address generate_checkcast_copy(const char * name) {
       
  3318     __ align(CodeEntryAlignment);
       
  3319     StubCodeMark mark(this, "StubRoutines", name);
       
  3320     address start = __ pc();
       
  3321 
       
  3322     const Register from  = R0;  // source array address
       
  3323     const Register to    = R1;  // destination array address
       
  3324     const Register count = R2;  // elements count
       
  3325 
       
  3326     const Register R3_ckoff  = R3;      // super_check_offset
       
  3327     const Register R4_ckval  = R4;      // super_klass
       
  3328 
       
  3329     const int callee_saved_regs = AARCH64_ONLY(5) NOT_AARCH64(4); // LR saved differently
       
  3330 
       
  3331     Label load_element, store_element, do_card_marks, fail;
       
  3332 
       
  3333     BLOCK_COMMENT("Entry:");
       
  3334 
       
  3335     __ zap_high_non_significant_bits(R2);
       
  3336 
       
  3337 #ifdef AARCH64
       
  3338     __ raw_push(LR, ZR);
       
  3339     __ raw_push(R19, R20);
       
  3340 #else
       
  3341     int pushed = 0;
       
  3342     __ push(LR);
       
  3343     pushed+=1;
       
  3344 #endif // AARCH64
       
  3345 
       
  3346 #if INCLUDE_ALL_GCS
       
  3347     gen_write_ref_array_pre_barrier(to, count, callee_saved_regs);
       
  3348 #endif // INCLUDE_ALL_GCS
       
  3349 
       
  3350 #ifndef AARCH64
       
  3351     const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11;
       
  3352     __ push(caller_saved_regs);
       
  3353     assert(caller_saved_regs.size() == 6, "check the count");
       
  3354     pushed+=6;
       
  3355 
       
  3356     __ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack
       
  3357 #endif // !AARCH64
       
  3358 
       
  3359     // Save arguments for barrier generation (after the pre barrier):
       
  3360     // - must be a caller saved register and not LR
       
  3361     // - ARM32: avoid R10 in case RThread is needed
       
  3362     const Register saved_count = AARCH64_ONLY(R19) NOT_AARCH64(altFP_7_11);
       
  3363 #ifdef AARCH64
       
  3364     __ mov_w(saved_count, count);
       
  3365     __ cbnz_w(count, load_element); // and test count
       
  3366 #else
       
  3367     __ movs(saved_count, count); // and test count
       
  3368     __ b(load_element,ne);
       
  3369 #endif // AARCH64
       
  3370 
       
  3371     // nothing to copy
       
  3372     __ mov(R0, 0);
       
  3373 
       
  3374 #ifdef AARCH64
       
  3375     __ raw_pop(R19, R20);
       
  3376     __ raw_pop(LR, ZR);
       
  3377     __ ret();
       
  3378 #else
       
  3379     __ pop(caller_saved_regs);
       
  3380     __ pop(PC);
       
  3381 #endif // AARCH64
       
  3382 
       
  3383     // ======== begin loop ========
       
  3384     // (Loop is rotated; its entry is load_element.)
       
  3385     __ align(OptoLoopAlignment);
       
  3386     __ BIND(store_element);
       
  3387     if (UseCompressedOops) {
       
  3388       __ store_heap_oop(R5, Address(to, BytesPerHeapOop, post_indexed));  // store the oop, changes flags
       
  3389       __ subs_32(count,count,1);
       
  3390     } else {
       
  3391       __ subs_32(count,count,1);
       
  3392       __ str(R5, Address(to, BytesPerHeapOop, post_indexed));             // store the oop
       
  3393     }
       
  3394     __ b(do_card_marks, eq); // count exhausted
       
  3395 
       
  3396     // ======== loop entry is here ========
       
  3397     __ BIND(load_element);
       
  3398     __ load_heap_oop(R5, Address(from, BytesPerHeapOop, post_indexed));  // load the oop
       
  3399     __ cbz(R5, store_element); // NULL
       
  3400 
       
  3401     __ load_klass(R6, R5);
       
  3402 
       
  3403     generate_type_check(R6, R3_ckoff, R4_ckval, /*tmps*/ R12, R8, R9,
       
  3404                         // branch to this on success:
       
  3405                         store_element);
       
  3406     // ======== end loop ========
       
  3407 
       
  3408     // It was a real error; we must depend on the caller to finish the job.
       
  3409     // Register count has number of *remaining* oops, saved_count number of *total* oops.
       
  3410     // Emit GC store barriers for the oops we have copied
       
  3411     // and report their number to the caller (0 or (-1^n))
       
  3412     __ BIND(fail);
       
  3413 
       
  3414     // Note: failure is marked by the fact that count differs from saved_count
       
  3415 
       
  3416     __ BIND(do_card_marks);
       
  3417 
       
  3418     Register copied = AARCH64_ONLY(R20) NOT_AARCH64(R4); // saved
       
  3419     Label L_not_copied;
       
  3420 
       
  3421     __ subs_32(copied, saved_count, count); // copied count (in saved reg)
       
  3422     __ b(L_not_copied, eq); // nothing was copied, skip post barrier
       
  3423     __ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value
       
  3424     __ mov(R12, copied); // count arg scratched by post barrier
       
  3425 
       
  3426     gen_write_ref_array_post_barrier(to, R12, R3);
       
  3427 
       
  3428     assert_different_registers(R3,R12,LR,copied,saved_count);
       
  3429     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R3, R12);
       
  3430 
       
  3431     __ BIND(L_not_copied);
       
  3432     __ cmp_32(copied, saved_count); // values preserved in saved registers
       
  3433 
       
  3434 #ifdef AARCH64
       
  3435     __ csinv(R0, ZR, copied, eq); // 0 if all copied else NOT(copied)
       
  3436     __ raw_pop(R19, R20);
       
  3437     __ raw_pop(LR, ZR);
       
  3438     __ ret();
       
  3439 #else
       
  3440     __ mov(R0, 0, eq); // 0 if all copied
       
  3441     __ mvn(R0, copied, ne); // else NOT(copied)
       
  3442     __ pop(caller_saved_regs);
       
  3443     __ pop(PC);
       
  3444 #endif // AARCH64
       
  3445 
       
  3446     return start;
       
  3447   }
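
  // Illustrative sketch only of the return-value convention implemented above
  // (checkcast_copy_stub is a stand-in name for the generated entry point):
  //
  //   int r = checkcast_copy_stub(from, to, count, ckoff, ckval);
  //   if (r == 0) {
  //     // every element was copied
  //   } else {
  //     int copied = ~r;   // elements copied before the failing type check
  //   }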
       
  3448 
       
  3449   // Perform range checks on the proposed arraycopy.
       
  3450   // Kills the two temps, but nothing else.
       
  3451   void arraycopy_range_checks(Register src,     // source array oop
       
  3452                               Register src_pos, // source position (32-bit int)
       
  3453                               Register dst,     // destination array oop
       
  3454                               Register dst_pos, // destination position (32-bit int)
       
  3455                               Register length,  // length of copy (32-bit int)
       
  3456                               Register temp1, Register temp2,
       
  3457                               Label& L_failed) {
       
  3458 
       
  3459     BLOCK_COMMENT("arraycopy_range_checks:");
       
  3460 
       
  3461     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
       
  3462 
       
  3463     const Register array_length = temp1;  // scratch
       
  3464     const Register end_pos      = temp2;  // scratch
       
  3465 
       
  3466     __ add_32(end_pos, length, src_pos);  // src_pos + length
       
  3467     __ ldr_s32(array_length, Address(src, arrayOopDesc::length_offset_in_bytes()));
       
  3468     __ cmp_32(end_pos, array_length);
       
  3469     __ b(L_failed, hi);
       
  3470 
       
  3471     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
       
  3472     __ add_32(end_pos, length, dst_pos); // dst_pos + length
       
  3473     __ ldr_s32(array_length, Address(dst, arrayOopDesc::length_offset_in_bytes()));
       
  3474     __ cmp_32(end_pos, array_length);
       
  3475     __ b(L_failed, hi);
       
  3476 
       
  3477     BLOCK_COMMENT("arraycopy_range_checks done");
       
  3478   }
       
  3479 
       
  3480   //
       
  3481   //  Generate generic array copy stubs
       
  3482   //
       
  3483   //  Input:
       
  3484   //    R0    -  src oop
       
  3485   //    R1    -  src_pos (32-bit int)
       
  3486   //    R2    -  dst oop
       
  3487   //    R3    -  dst_pos (32-bit int)
       
  3488   //    R4 (AArch64) / SP[0] (32-bit ARM) -  element count (32-bit int)
       
  3489   //
       
  3490   //  Output: (32-bit int)
       
  3491   //    R0 ==  0  -  success
       
  3492   //    R0 <   0  -  need to call System.arraycopy
       
  3493   //
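         //
         //  Illustrative caller-side handling of the result (a hedged sketch using a
         //  hypothetical helper name, not code emitted by this stub): a negative R0
         //  means the copy was not handled here and the caller must fall back, e.g.
         //
         //    int res = generic_arraycopy(src, src_pos, dst, dst_pos, length);
         //    if (res < 0) {
         //      // fall back to the slow path (System.arraycopy semantics)
         //    }
         //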
       
  3494   address generate_generic_copy(const char *name) {
       
  3495     Label L_failed, L_objArray;
       
  3496 
       
  3497     // Input registers
       
  3498     const Register src      = R0;  // source array oop
       
  3499     const Register src_pos  = R1;  // source position
       
  3500     const Register dst      = R2;  // destination array oop
       
  3501     const Register dst_pos  = R3;  // destination position
       
  3502 
       
  3503     // registers used as temp
       
  3504     const Register R5_src_klass = R5; // source array klass
       
  3505     const Register R6_dst_klass = R6; // destination array klass
       
  3506     const Register R_lh         = AARCH64_ONLY(R7) NOT_AARCH64(altFP_7_11); // layout handler
       
  3507     const Register R8_temp      = R8;
       
  3508 
       
  3509     __ align(CodeEntryAlignment);
       
  3510     StubCodeMark mark(this, "StubRoutines", name);
       
  3511     address start = __ pc();
       
  3512 
       
  3513     __ zap_high_non_significant_bits(R1);
       
  3514     __ zap_high_non_significant_bits(R3);
       
  3515     __ zap_high_non_significant_bits(R4);
       
  3516 
       
  3517 #ifndef AARCH64
       
  3518     int pushed = 0;
       
  3519     const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11;
       
  3520     __ push(saved_regs);
       
  3521     assert(saved_regs.size() == 6, "check the count");
       
  3522     pushed+=6;
       
  3523 #endif // !AARCH64
       
  3524 
       
  3525     // bump this on entry, not on exit:
       
  3526     inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12);
       
  3527 
       
  3528     const Register length   = R4;  // elements count
       
  3529 #ifndef AARCH64
       
  3530     __ ldr(length, Address(SP,4*pushed));
       
  3531 #endif // !AARCH64
       
  3532 
       
  3533 
       
  3534     //-----------------------------------------------------------------------
       
  3535     // Assembler stubs will be used for this call to arraycopy
       
  3536     // if the following conditions are met:
       
  3537     //
       
  3538     // (1) src and dst must not be null.
       
  3539     // (2) src_pos must not be negative.
       
  3540     // (3) dst_pos must not be negative.
       
  3541     // (4) length  must not be negative.
       
  3542     // (5) src klass and dst klass should be the same and not NULL.
       
  3543     // (6) src and dst should be arrays.
       
  3544     // (7) src_pos + length must not exceed length of src.
       
  3545     // (8) dst_pos + length must not exceed length of dst.
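           //
           // As a rough C sketch of checks (1)-(8) above (illustrative only; the real
           // checks are the instructions emitted below):
           //
           //   if (src == NULL || dst == NULL) return -1;                        // (1)
           //   if (src_pos < 0 || dst_pos < 0 || length < 0) return -1;          // (2)-(4)
           //   if (src->klass() != dst->klass() || !src->is_array()) return -1;  // (5)-(6)
           //   if (src_pos + length > src->length()) return -1;                  // (7)
           //   if (dst_pos + length > dst->length()) return -1;                  // (8)
           //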
       
  3546     BLOCK_COMMENT("arraycopy initial argument checks");
       
  3547 
       
  3548     //  if (src == NULL) return -1;
       
  3549     __ cbz(src, L_failed);
       
  3550 
       
  3551     //  if (src_pos < 0) return -1;
       
  3552     __ cmp_32(src_pos, 0);
       
  3553     __ b(L_failed, lt);
       
  3554 
       
  3555     //  if (dst == NULL) return -1;
       
  3556     __ cbz(dst, L_failed);
       
  3557 
       
  3558     //  if (dst_pos < 0) return -1;
       
  3559     __ cmp_32(dst_pos, 0);
       
  3560     __ b(L_failed, lt);
       
  3561 
       
  3562     //  if (length < 0) return -1;
       
  3563     __ cmp_32(length, 0);
       
  3564     __ b(L_failed, lt);
       
  3565 
       
  3566     BLOCK_COMMENT("arraycopy argument klass checks");
       
  3567     //  get src->klass()
       
  3568     __ load_klass(R5_src_klass, src);
       
  3569 
       
  3570     // Load layout helper
       
  3571     //
       
  3572     //  |array_tag|     | header_size | element_type |     |log2_element_size|
       
  3573     // 32        30    24            16              8     2                 0
       
  3574     //
       
  3575     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
       
  3576     //
       
  3577 
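           //
           // Illustrative decode of the value loaded into R_lh (a sketch mirroring the
           // Klass layout helper accessors; nothing here is emitted by the stub):
           //
           //   int tag      = ((juint)lh) >> Klass::_lh_array_tag_shift;   // 0x3 typeArray, 0x2 objArray
           //   int hdr_size = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
           //   int log2_esz = lh & Klass::_lh_log2_element_size_mask;
           //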
       
  3578     int lh_offset = in_bytes(Klass::layout_helper_offset());
       
  3579     __ ldr_u32(R_lh, Address(R5_src_klass, lh_offset));
       
  3580 
       
  3581     __ load_klass(R6_dst_klass, dst);
       
  3582 
       
  3583     // Handle objArrays completely differently...
       
  3584     juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
       
  3585     __ mov_slow(R8_temp, objArray_lh);
       
  3586     __ cmp_32(R_lh, R8_temp);
       
  3587     __ b(L_objArray,eq);
       
  3588 
       
  3589     //  if (src->klass() != dst->klass()) return -1;
       
  3590     __ cmp(R5_src_klass, R6_dst_klass);
       
  3591     __ b(L_failed, ne);
       
  3592 
       
  3593     //  if (!src->is_Array()) return -1;
       
  3594     __ cmp_32(R_lh, Klass::_lh_neutral_value); // < 0
       
  3595     __ b(L_failed, ge);
       
  3596 
       
  3597     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
       
  3598                            R8_temp, R6_dst_klass, L_failed);
       
  3599 
       
  3600     {
       
  3601       // TypeArrayKlass
       
  3602       //
       
  3603       // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
       
  3604       // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
       
  3605       //
       
  3606 
       
  3607       const Register R6_offset = R6_dst_klass;    // array offset
       
  3608       const Register R12_elsize = R12;            // log2 element size
       
  3609 
       
  3610       __ logical_shift_right(R6_offset, R_lh, Klass::_lh_header_size_shift);
       
  3611       __ andr(R6_offset, R6_offset, (unsigned int)Klass::_lh_header_size_mask); // array_offset
       
  3612       __ add(src, src, R6_offset);       // src array offset
       
  3613       __ add(dst, dst, R6_offset);       // dst array offset
       
  3614       __ andr(R12_elsize, R_lh, (unsigned int)Klass::_lh_log2_element_size_mask); // log2 element size
       
  3615 
       
  3616       // the following registers must be set before jumping to the corresponding stub
       
  3617       const Register from     = R0;  // source array address
       
  3618       const Register to       = R1;  // destination array address
       
  3619       const Register count    = R2;  // elements count
       
  3620 
       
  3621       // 'from', 'to', 'count' registers should be set in this order
       
  3622       // since they are the same as 'src', 'src_pos', 'dst'.
       
  3623 
       
  3624 #ifdef AARCH64
       
  3625 
       
  3626       BLOCK_COMMENT("choose copy loop based on element size and scale indexes");
       
  3627       Label Lbyte, Lshort, Lint, Llong;
       
  3628 
       
  3629       __ cbz(R12_elsize, Lbyte);
       
  3630 
       
  3631       assert (LogBytesPerShort < LogBytesPerInt && LogBytesPerInt < LogBytesPerLong, "must be");
       
  3632       __ cmp(R12_elsize, LogBytesPerInt);
       
  3633       __ b(Lint,  eq);
       
  3634       __ b(Llong, gt);
       
  3635 
       
  3636       __ BIND(Lshort);
       
  3637       __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerShort);
       
  3638       __ add_ptr_scaled_int32(to,   dst, dst_pos, LogBytesPerShort);
       
  3639       __ mov(count, length);
       
  3640       __ b(StubRoutines::_jshort_arraycopy);
       
  3641 
       
  3642       __ BIND(Lint);
       
  3643       __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerInt);
       
  3644       __ add_ptr_scaled_int32(to,   dst, dst_pos, LogBytesPerInt);
       
  3645       __ mov(count, length);
       
  3646       __ b(StubRoutines::_jint_arraycopy);
       
  3647 
       
  3648       __ BIND(Lbyte);
       
  3649       __ add_ptr_scaled_int32(from, src, src_pos, 0);
       
  3650       __ add_ptr_scaled_int32(to,   dst, dst_pos, 0);
       
  3651       __ mov(count, length);
       
  3652       __ b(StubRoutines::_jbyte_arraycopy);
       
  3653 
       
  3654       __ BIND(Llong);
       
  3655       __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerLong);
       
  3656       __ add_ptr_scaled_int32(to,   dst, dst_pos, LogBytesPerLong);
       
  3657       __ mov(count, length);
       
  3658       __ b(StubRoutines::_jlong_arraycopy);
       
  3659 
       
  3660 #else // AARCH64
       
  3661 
       
  3662       BLOCK_COMMENT("scale indexes to element size");
       
  3663       __ add(from, src, AsmOperand(src_pos, lsl, R12_elsize));       // src_addr
       
  3664       __ add(to, dst, AsmOperand(dst_pos, lsl, R12_elsize));         // dst_addr
       
  3665 
       
  3666       __ mov(count, length);  // length
       
  3667 
       
  3668       // XXX optim: avoid later push in arraycopy variants ?
       
  3669 
       
  3670       __ pop(saved_regs);
       
  3671 
       
  3672       BLOCK_COMMENT("choose copy loop based on element size");
       
  3673       __ cmp(R12_elsize, 0);
       
  3674       __ b(StubRoutines::_jbyte_arraycopy,eq);
       
  3675 
       
  3676       __ cmp(R12_elsize, LogBytesPerShort);
       
  3677       __ b(StubRoutines::_jshort_arraycopy,eq);
       
  3678 
       
  3679       __ cmp(R12_elsize, LogBytesPerInt);
       
  3680       __ b(StubRoutines::_jint_arraycopy,eq);
       
  3681 
       
  3682       __ b(StubRoutines::_jlong_arraycopy);
       
  3683 
       
  3684 #endif // AARCH64
       
  3685     }
       
  3686 
       
  3687     // ObjArrayKlass
       
  3688     __ BIND(L_objArray);
       
  3689     // live at this point:  R5_src_klass, R6_dst_klass, src[_pos], dst[_pos], length
       
  3690 
       
  3691     Label L_plain_copy, L_checkcast_copy;
       
  3692     //  test array classes for subtyping
       
  3693     __ cmp(R5_src_klass, R6_dst_klass);         // usual case is exact equality
       
  3694     __ b(L_checkcast_copy, ne);
       
  3695 
       
  3696     BLOCK_COMMENT("Identically typed arrays");
       
  3697     {
       
  3698       // Identically typed arrays can be copied without element-wise checks.
       
  3699       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
       
  3700                              R8_temp, R_lh, L_failed);
       
  3701 
       
  3702       // the following registers must be set before jumping to the corresponding stub
       
  3703       const Register from     = R0;  // source array address
       
  3704       const Register to       = R1;  // destination array address
       
  3705       const Register count    = R2;  // elements count
       
  3706 
       
  3707       __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
       
  3708       __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
       
  3709       __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop);         // src_addr
       
  3710       __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop);           // dst_addr
       
  3711       __ BIND(L_plain_copy);
       
  3712       __ mov(count, length);
       
  3713 
       
  3714 #ifndef AARCH64
       
  3715       __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ?
       
  3716 #endif // !AARCH64
       
  3717       __ b(StubRoutines::_oop_arraycopy);
       
  3718     }
       
  3719 
       
  3720     {
       
  3721       __ BIND(L_checkcast_copy);
       
  3722       // live at this point:  R5_src_klass, R6_dst_klass
       
  3723 
       
  3724       // Before looking at dst.length, make sure dst is also an objArray.
       
  3725       __ ldr_u32(R8_temp, Address(R6_dst_klass, lh_offset));
       
  3726       __ cmp_32(R_lh, R8_temp);
       
  3727       __ b(L_failed, ne);
       
  3728 
       
  3729       // It is safe to examine both src.length and dst.length.
       
  3730 
       
  3731       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
       
  3732                              R8_temp, R_lh, L_failed);
       
  3733 
       
  3734       // the following registers must be set before jumping to the corresponding stub
       
  3735       const Register from     = R0;  // source array address
       
  3736       const Register to       = R1;  // destination array address
       
  3737       const Register count    = R2;  // elements count
       
  3738 
       
  3739       // Marshal the base address arguments now, freeing registers.
       
  3740       __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
       
  3741       __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
       
  3742       __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop);         // src_addr
       
  3743       __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop);           // dst_addr
       
  3744 
       
  3745       __ mov(count, length); // length (reloaded)
       
  3746 
       
  3747       Register sco_temp = R3;                   // this register is free now
       
  3748       assert_different_registers(from, to, count, sco_temp,
       
  3749                                  R6_dst_klass, R5_src_klass);
       
  3750 
       
  3751       // Generate the type check.
       
  3752       int sco_offset = in_bytes(Klass::super_check_offset_offset());
       
  3753       __ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset));
       
  3754       generate_type_check(R5_src_klass, sco_temp, R6_dst_klass,
       
  3755                           R8_temp, R9,
       
  3756                           AARCH64_ONLY(R10) NOT_AARCH64(R12),
       
  3757                           L_plain_copy);
       
  3758 
       
  3759       // Fetch destination element klass from the ObjArrayKlass header.
       
  3760       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
       
  3761 
       
  3762       // the checkcast_copy loop needs two extra arguments:
       
  3763       const Register Rdst_elem_klass = AARCH64_ONLY(R4) NOT_AARCH64(R3);
       
  3764       __ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset));   // dest elem klass
       
  3765 #ifndef AARCH64
       
  3766       __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ?
       
  3767       __ str(Rdst_elem_klass, Address(SP,0));    // dest elem klass argument
       
  3768 #endif // !AARCH64
       
  3769       __ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset));  // sco of elem klass
       
  3770       __ b(StubRoutines::_checkcast_arraycopy);
       
  3771     }
       
  3772 
       
  3773     __ BIND(L_failed);
       
  3774 
       
  3775 #ifndef AARCH64
       
  3776     __ pop(saved_regs);
       
  3777 #endif // !AARCH64
       
  3778     __ mvn(R0, 0); // failure, with 0 copied
       
  3779     __ ret();
       
  3780 
       
  3781     return start;
       
  3782   }
       
  3783 
       
  3784   // Safefetch stubs.
       
  3785   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
       
  3786     // safefetch signatures:
       
  3787     //   int      SafeFetch32(int*      adr, int      errValue);
       
  3788     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
       
  3789     //
       
  3790     // arguments:
       
  3791     //   R0 = adr
       
  3792     //   R1 = errValue
       
  3793     //
       
  3794     // result:
       
  3795     //   R0  = *adr or errValue
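           //
           // Typical use on the VM side (a hedged sketch, not part of this stub):
           // probe a possibly unmapped address without risking a crash:
           //
           //   int v = SafeFetch32((int*) addr, 0xBAADBABE);
           //   if (v == 0xBAADBABE) { /* address was unreadable, or really held that value */ }
           //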
       
  3796 
       
  3797     StubCodeMark mark(this, "StubRoutines", name);
       
  3798 
       
  3799     // Entry point, pc or function descriptor.
       
  3800     *entry = __ pc();
       
  3801 
       
  3802     // Load *adr into R1; this load may fault.
       
  3803     *fault_pc = __ pc();
       
  3804 
       
  3805     switch (size) {
       
  3806       case 4: // int32_t
       
  3807         __ ldr_s32(R1, Address(R0));
       
  3808         break;
       
  3809 
       
  3810       case 8: // int64_t
       
  3811 #ifdef AARCH64
       
  3812         __ ldr(R1, Address(R0));
       
  3813 #else
       
  3814         Unimplemented();
       
  3815 #endif // AARCH64
       
  3816         break;
       
  3817 
       
  3818       default:
       
  3819         ShouldNotReachHere();
       
  3820     }
       
  3821 
       
  3822     // return errValue or *adr
       
  3823     *continuation_pc = __ pc();
       
  3824     __ mov(R0, R1);
       
  3825     __ ret();
       
  3826   }
       
  3827 
       
  3828   void generate_arraycopy_stubs() {
       
  3829 
       
  3830     // Note:  the disjoint stubs must be generated first, some of
       
  3831     //        the conjoint stubs use them.
       
  3832 
       
  3833     bool status = false; // non-failing C2 stubs need not return a status in R0
       
  3834 
       
  3835 #ifdef TEST_C2_GENERIC_ARRAYCOPY /* Internal development flag */
       
  3836     // With this flag, the C2 stubs are tested by generating calls to
       
  3837     // generic_arraycopy instead of Runtime1::arraycopy
       
  3838 
       
  3839     // Runtime1::arraycopy returns a status in R0 (0 if OK, else ~copied)
       
  3840     // and the result is tested to see whether the arraycopy stub should
       
  3841     // be called.
       
  3842 
       
  3843     // When we test arraycopy this way, we must generate extra code in the
       
  3844     // arraycopy methods callable from C2 generic_arraycopy to set the
       
  3845     // status to 0 for those that always succeed (calling the slow path stub might
       
  3846     // lead to errors since the copy has already been performed).
       
  3847 
       
  3848     status = true; // generate a status compatible with C1 calls
       
  3849 #endif
       
  3850 
       
  3851     // these always need a status since they may be called from generic_arraycopy
       
  3852     StubRoutines::_jbyte_disjoint_arraycopy  = generate_primitive_copy(false, "jbyte_disjoint_arraycopy",  true, 1, true);
       
  3853     StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(false, "jshort_disjoint_arraycopy", true, 2, true);
       
  3854     StubRoutines::_jint_disjoint_arraycopy   = generate_primitive_copy(false, "jint_disjoint_arraycopy",   true, 4, true);
       
  3855     StubRoutines::_jlong_disjoint_arraycopy  = generate_primitive_copy(false, "jlong_disjoint_arraycopy",  true, 8, true);
       
  3856     StubRoutines::_oop_disjoint_arraycopy    = generate_oop_copy      (false, "oop_disjoint_arraycopy",    true,    true);
       
  3857 
       
  3858     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_primitive_copy(true, "arrayof_jbyte_disjoint_arraycopy", status, 1, true);
       
  3859     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jshort_disjoint_arraycopy",status, 2, true);
       
  3860     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_primitive_copy(true, "arrayof_jint_disjoint_arraycopy",  status, 4, true);
       
  3861     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_primitive_copy(true, "arrayof_jlong_disjoint_arraycopy", status, 8, true);
       
  3862     StubRoutines::_arrayof_oop_disjoint_arraycopy    = generate_oop_copy      (true, "arrayof_oop_disjoint_arraycopy",   status,    true);
       
  3863 
       
  3864     // these always need a status since they may be called from generic_arraycopy
       
  3865     StubRoutines::_jbyte_arraycopy  = generate_primitive_copy(false, "jbyte_arraycopy",  true, 1, false, StubRoutines::_jbyte_disjoint_arraycopy);
       
  3866     StubRoutines::_jshort_arraycopy = generate_primitive_copy(false, "jshort_arraycopy", true, 2, false, StubRoutines::_jshort_disjoint_arraycopy);
       
  3867     StubRoutines::_jint_arraycopy   = generate_primitive_copy(false, "jint_arraycopy",   true, 4, false, StubRoutines::_jint_disjoint_arraycopy);
       
  3868     StubRoutines::_jlong_arraycopy  = generate_primitive_copy(false, "jlong_arraycopy",  true, 8, false, StubRoutines::_jlong_disjoint_arraycopy);
       
  3869     StubRoutines::_oop_arraycopy    = generate_oop_copy      (false, "oop_arraycopy",    true,    false, StubRoutines::_oop_disjoint_arraycopy);
       
  3870 
       
  3871     StubRoutines::_arrayof_jbyte_arraycopy    = generate_primitive_copy(true, "arrayof_jbyte_arraycopy",  status, 1, false, StubRoutines::_arrayof_jbyte_disjoint_arraycopy);
       
  3872     StubRoutines::_arrayof_jshort_arraycopy   = generate_primitive_copy(true, "arrayof_jshort_arraycopy", status, 2, false, StubRoutines::_arrayof_jshort_disjoint_arraycopy);
       
  3873 #ifdef _LP64
       
  3874     // since sizeof(jint) < sizeof(HeapWord), there's a different flavor:
       
  3875     StubRoutines::_arrayof_jint_arraycopy     = generate_primitive_copy(true, "arrayof_jint_arraycopy",   status, 4, false, StubRoutines::_arrayof_jint_disjoint_arraycopy);
       
  3876 #else
       
  3877     StubRoutines::_arrayof_jint_arraycopy     = StubRoutines::_jint_arraycopy;
       
  3878 #endif
       
  3879     if (BytesPerHeapOop < HeapWordSize) {
       
  3880       StubRoutines::_arrayof_oop_arraycopy    = generate_oop_copy      (true, "arrayof_oop_arraycopy",    status,    false, StubRoutines::_arrayof_oop_disjoint_arraycopy);
       
  3881     } else {
       
  3882       StubRoutines::_arrayof_oop_arraycopy    = StubRoutines::_oop_arraycopy;
       
  3883     }
       
  3884     StubRoutines::_arrayof_jlong_arraycopy    = StubRoutines::_jlong_arraycopy;
       
  3885 
       
  3886     StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy");
       
  3887     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy");
       
  3888     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy");
       
  3889 
       
  3890 
       
  3891   }
       
  3892 
       
  3893 #ifndef AARCH64
       
  3894 #define COMPILE_CRYPTO
       
  3895 #include "stubRoutinesCrypto_arm.cpp"
       
  3896 #else
       
  3897 
       
  3898 #ifdef COMPILER2
       
  3899   // Arguments:
       
  3900   //
       
  3901   // Inputs:
       
  3902   //   c_rarg0   - source byte array address
       
  3903   //   c_rarg1   - destination byte array address
       
  3904   //   c_rarg2   - K (key) in little endian int array
       
  3905   //
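         //
         //  Note (standard AES key schedule sizes): the key array length in ints is
         //  4*(rounds+1), i.e. 44/52/60 for AES-128/192/256, which is why the code
         //  below compares keylen against 44 and 52 to decide how many extra rounds
         //  to perform.
         //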
       
  3906   address generate_aescrypt_encryptBlock() {
       
  3907     __ align(CodeEntryAlignment);
       
  3908     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
       
  3909 
       
  3910     Label L_doLast;
       
  3911 
       
  3912     const Register from        = c_rarg0;  // source array address
       
  3913     const Register to          = c_rarg1;  // destination array address
       
  3914     const Register key         = c_rarg2;  // key array address
       
  3915     const Register keylen      = R8;
       
  3916 
       
  3917     address start = __ pc();
       
  3918     __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
       
  3919     __ mov(FP, SP);
       
  3920 
       
  3921     __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
       
  3922 
       
  3923     __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input
       
  3924 
       
  3925     __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  3926 
       
  3927     int quad = 1;
       
  3928     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  3929     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  3930     __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
       
  3931     __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
       
  3932     __ aese(V0, V1);
       
  3933     __ aesmc(V0, V0);
       
  3934     __ aese(V0, V2);
       
  3935     __ aesmc(V0, V0);
       
  3936     __ aese(V0, V3);
       
  3937     __ aesmc(V0, V0);
       
  3938     __ aese(V0, V4);
       
  3939     __ aesmc(V0, V0);
       
  3940 
       
  3941     __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  3942     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  3943     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  3944     __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
       
  3945     __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
       
  3946     __ aese(V0, V1);
       
  3947     __ aesmc(V0, V0);
       
  3948     __ aese(V0, V2);
       
  3949     __ aesmc(V0, V0);
       
  3950     __ aese(V0, V3);
       
  3951     __ aesmc(V0, V0);
       
  3952     __ aese(V0, V4);
       
  3953     __ aesmc(V0, V0);
       
  3954 
       
  3955     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  3956     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  3957     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  3958 
       
  3959     __ cmp_w(keylen, 44);
       
  3960     __ b(L_doLast, eq);
       
  3961 
       
  3962     __ aese(V0, V1);
       
  3963     __ aesmc(V0, V0);
       
  3964     __ aese(V0, V2);
       
  3965     __ aesmc(V0, V0);
       
  3966 
       
  3967     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  3968     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  3969     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  3970 
       
  3971     __ cmp_w(keylen, 52);
       
  3972     __ b(L_doLast, eq);
       
  3973 
       
  3974     __ aese(V0, V1);
       
  3975     __ aesmc(V0, V0);
       
  3976     __ aese(V0, V2);
       
  3977     __ aesmc(V0, V0);
       
  3978 
       
  3979     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  3980     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  3981     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  3982 
       
  3983     __ BIND(L_doLast);
       
  3984 
       
  3985     __ aese(V0, V1);
       
  3986     __ aesmc(V0, V0);
       
  3987     __ aese(V0, V2);
       
  3988 
       
  3989     __ vld1(V1, Address(key), MacroAssembler::VELEM_SIZE_8, 128);
       
  3990     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  3991     __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  3992 
       
  3993     __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128);
       
  3994 
       
  3995     __ mov(R0, 0);
       
  3996 
       
  3997     __ mov(SP, FP);
       
  3998     __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
       
  3999     __ ret(LR);
       
  4000 
       
  4001     return start;
       
  4002   }
       
  4003 
       
  4004   // Arguments:
       
  4005   //
       
  4006   // Inputs:
       
  4007   //   c_rarg0   - source byte array address
       
  4008   //   c_rarg1   - destination byte array address
       
  4009   //   c_rarg2   - K (key) in little endian int array
       
  4010   //
       
  4011   address generate_aescrypt_decryptBlock() {
       
  4012     assert(UseAES, "need AES instructions");
       
  4013     __ align(CodeEntryAlignment);
       
  4014     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
       
  4015     Label L_doLast;
       
  4016 
       
  4017     const Register from        = c_rarg0;  // source array address
       
  4018     const Register to          = c_rarg1;  // destination array address
       
  4019     const Register key         = c_rarg2;  // key array address
       
  4020     const Register keylen      = R8;
       
  4021 
       
  4022     address start = __ pc();
       
  4023     __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
       
  4024     __ mov(FP, SP);
       
  4025 
       
  4026     __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
       
  4027 
       
  4028     __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input
       
  4029 
       
  4030     __ vld1(V5, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4031 
       
  4032     int quad = 1;
       
  4033     __ rev32(V5, V5, MacroAssembler::VELEM_SIZE_8, quad);
       
  4034 
       
  4035     __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4036     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  4037     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  4038     __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
       
  4039     __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
       
  4040     __ aesd(V0, V1);
       
  4041     __ aesimc(V0, V0);
       
  4042     __ aesd(V0, V2);
       
  4043     __ aesimc(V0, V0);
       
  4044     __ aesd(V0, V3);
       
  4045     __ aesimc(V0, V0);
       
  4046     __ aesd(V0, V4);
       
  4047     __ aesimc(V0, V0);
       
  4048 
       
  4049     __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4050     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  4051     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  4052     __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
       
  4053     __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
       
  4054     __ aesd(V0, V1);
       
  4055     __ aesimc(V0, V0);
       
  4056     __ aesd(V0, V2);
       
  4057     __ aesimc(V0, V0);
       
  4058     __ aesd(V0, V3);
       
  4059     __ aesimc(V0, V0);
       
  4060     __ aesd(V0, V4);
       
  4061     __ aesimc(V0, V0);
       
  4062 
       
  4063     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4064     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  4065     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  4066 
       
  4067     __ cmp_w(keylen, 44);
       
  4068     __ b(L_doLast, eq);
       
  4069 
       
  4070     __ aesd(V0, V1);
       
  4071     __ aesimc(V0, V0);
       
  4072     __ aesd(V0, V2);
       
  4073     __ aesimc(V0, V0);
       
  4074 
       
  4075     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4076     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  4077     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  4078 
       
  4079     __ cmp_w(keylen, 52);
       
  4080     __ b(L_doLast, eq);
       
  4081 
       
  4082     __ aesd(V0, V1);
       
  4083     __ aesimc(V0, V0);
       
  4084     __ aesd(V0, V2);
       
  4085     __ aesimc(V0, V0);
       
  4086 
       
  4087     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4088     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  4089     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  4090 
       
  4091     __ BIND(L_doLast);
       
  4092 
       
  4093     __ aesd(V0, V1);
       
  4094     __ aesimc(V0, V0);
       
  4095     __ aesd(V0, V2);
       
  4096 
       
  4097     __ eor(V0, V0, V5, MacroAssembler::VELEM_SIZE_8, quad);
       
  4098 
       
  4099     __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128);
       
  4100 
       
  4101     __ mov(R0, 0);
       
  4102 
       
  4103     __ mov(SP, FP);
       
  4104     __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
       
  4105     __ ret(LR);
       
  4106 
       
  4107 
       
  4108     return start;
       
  4109   }
       
  4110 
       
  4111   // Arguments:
       
  4112   //
       
  4113   // Inputs:
       
  4114   //   c_rarg0   - source byte array address
       
  4115   //   c_rarg1   - destination byte array address
       
  4116   //   c_rarg2   - K (key) in little endian int array
       
  4117   //   c_rarg3   - r vector byte array address
       
  4118   //   c_rarg4   - input length
       
  4119   //
       
  4120   // Output:
       
  4121   //   R0        - input length
       
  4122   //
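         //
         //  The loop below implements standard CBC encryption, roughly (per 16-byte
         //  block, illustrative only):
         //
         //    C[i] = AES_encrypt(P[i] ^ C[i-1], key);   // C[-1] is the IV taken from rvec
         //    rvec = C[last];                           // stored back for a subsequent call
         //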
       
  4123   address generate_cipherBlockChaining_encryptAESCrypt() {
       
  4124     assert(UseAES, "need AES instructions");
       
  4125     __ align(CodeEntryAlignment);
       
  4126     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
       
  4127 
       
  4128     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
       
  4129 
       
  4130     const Register from        = c_rarg0;  // source array address
       
  4131     const Register to          = c_rarg1;  // destination array address
       
  4132     const Register key         = c_rarg2;  // key array address
       
  4133     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address

  4134                                            // and left holding the last encrypted block
       
  4135     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
       
  4136     const Register keylen      = R8;
       
  4137 
       
  4138     address start = __ pc();
       
  4139     __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
       
  4140     __ mov(FP, SP);
       
  4141 
       
  4142     __ mov(R9, len_reg);
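           // (R9 preserves the original length; it is moved back into R0 before returning)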
       
  4143     __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
       
  4144 
       
  4145     __ vld1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
       
  4146 
       
  4147     __ cmp_w(keylen, 52);
       
  4148     __ b(L_loadkeys_44, cc);
       
  4149     __ b(L_loadkeys_52, eq);
       
  4150 
       
  4151     __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4152 
       
  4153     int quad = 1;
       
  4154     __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad);
       
  4155     __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad);
       
  4156     __ BIND(L_loadkeys_52);
       
  4157     __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4158     __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad);
       
  4159     __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad);
       
  4160     __ BIND(L_loadkeys_44);
       
  4161     __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4162     __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad);
       
  4163     __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad);
       
  4164     __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad);
       
  4165     __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad);
       
  4166     __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4167     __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad);
       
  4168     __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad);
       
  4169     __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad);
       
  4170     __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad);
       
  4171     __ vld1(V29, V30, V31, Address(key), MacroAssembler::VELEM_SIZE_8, 128);
       
  4172     __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad);
       
  4173     __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad);
       
  4174     __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad);
       
  4175 
       
  4176     __ BIND(L_aes_loop);
       
  4177     __ vld1(V1, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4178     __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad);
       
  4179 
       
  4180     __ b(L_rounds_44, cc);
       
  4181     __ b(L_rounds_52, eq);
       
  4182 
       
  4183     __ aese(V0, V17);
       
  4184     __ aesmc(V0, V0);
       
  4185     __ aese(V0, V18);
       
  4186     __ aesmc(V0, V0);
       
  4187     __ BIND(L_rounds_52);
       
  4188     __ aese(V0, V19);
       
  4189     __ aesmc(V0, V0);
       
  4190     __ aese(V0, V20);
       
  4191     __ aesmc(V0, V0);
       
  4192     __ BIND(L_rounds_44);
       
  4193     __ aese(V0, V21);
       
  4194     __ aesmc(V0, V0);
       
  4195     __ aese(V0, V22);
       
  4196     __ aesmc(V0, V0);
       
  4197     __ aese(V0, V23);
       
  4198     __ aesmc(V0, V0);
       
  4199     __ aese(V0, V24);
       
  4200     __ aesmc(V0, V0);
       
  4201     __ aese(V0, V25);
       
  4202     __ aesmc(V0, V0);
       
  4203     __ aese(V0, V26);
       
  4204     __ aesmc(V0, V0);
       
  4205     __ aese(V0, V27);
       
  4206     __ aesmc(V0, V0);
       
  4207     __ aese(V0, V28);
       
  4208     __ aesmc(V0, V0);
       
  4209     __ aese(V0, V29);
       
  4210     __ aesmc(V0, V0);
       
  4211     __ aese(V0, V30);
       
  4212     __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad);
       
  4213 
       
  4214     __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4215     __ sub(len_reg, len_reg, 16);
       
  4216     __ cbnz(len_reg, L_aes_loop);
       
  4217 
       
  4218     __ vst1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
       
  4219 
       
  4220     __ mov(R0, R9);
       
  4221 
       
  4222     __ mov(SP, FP);
       
  4223     __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
       
  4224     __ ret(LR);
       
  4225 
       
  4226     return start;
       
  4227   }
       
  4228 
       
  4229   // Arguments:
       
  4230   //
       
  4231   // Inputs:
       
  4232   //   c_rarg0   - source byte array address
       
  4233   //   c_rarg1   - destination byte array address
       
  4234   //   c_rarg2   - K (key) in little endian int array
       
  4235   //   c_rarg3   - r vector byte array address
       
  4236   //   c_rarg4   - input length
       
  4237   //
       
  4238   // Output:
       
  4239   //   R0        - input length
       
  4240   //
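         //
         //  The loop below implements standard CBC decryption, roughly (per 16-byte
         //  block, illustrative only):
         //
         //    P[i] = AES_decrypt(C[i], key) ^ C[i-1];   // C[-1] is the IV taken from rvec
         //    rvec = C[last];                           // stored back for a subsequent call
         //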
       
  4241   address generate_cipherBlockChaining_decryptAESCrypt() {
       
  4242     assert(UseAES, "need AES instructions");
       
  4243     __ align(CodeEntryAlignment);
       
  4244     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
       
  4245 
       
  4246     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
       
  4247 
       
  4248     const Register from        = c_rarg0;  // source array address
       
  4249     const Register to          = c_rarg1;  // destination array address
       
  4250     const Register key         = c_rarg2;  // key array address
       
  4251     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address

  4252                                            // and left holding the last ciphertext block
       
  4253     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
       
  4254     const Register keylen      = R8;
       
  4255 
       
  4256     address start = __ pc();
       
  4257     __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
       
  4258     __ mov(FP, SP);
       
  4259 
       
  4260     __ mov(R9, len_reg);
       
  4261     __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
       
  4262 
       
  4263     __ vld1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
       
  4264 
       
  4265     __ vld1(V31, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4266 
       
  4267     int quad = 1;
       
  4268     __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad);
       
  4269 
       
  4270     __ cmp_w(keylen, 52);
       
  4271     __ b(L_loadkeys_44, cc);
       
  4272     __ b(L_loadkeys_52, eq);
       
  4273 
       
  4274     __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4275     __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad);
       
  4276     __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad);
       
  4277     __ BIND(L_loadkeys_52);
       
  4278     __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4279     __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad);
       
  4280     __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad);
       
  4281     __ BIND(L_loadkeys_44);
       
  4282     __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4283     __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad);
       
  4284     __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad);
       
  4285     __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad);
       
  4286     __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad);
       
  4287     __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4288     __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad);
       
  4289     __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad);
       
  4290     __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad);
       
  4291     __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad);
       
  4292     __ vld1(V29, V30, Address(key), MacroAssembler::VELEM_SIZE_8, 128);
       
  4293     __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad);
       
  4294     __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad);
       
  4295 
       
  4296     __ BIND(L_aes_loop);
       
  4297     __ vld1(V0, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4298     __ orr(V1, V0, V0, MacroAssembler::VELEM_SIZE_8, quad);
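           // (the orr above copies the ciphertext block into V1 so it can become the
           //  chaining value for the next iteration; see the copy into V2 below)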
       
  4299 
       
  4300     __ b(L_rounds_44, cc);
       
  4301     __ b(L_rounds_52, eq);
       
  4302 
       
  4303     __ aesd(V0, V17);
       
  4304     __ aesimc(V0, V0);
       
  4305     __ aesd(V0, V18);
       
  4306     __ aesimc(V0, V0);
       
  4307     __ BIND(L_rounds_52);
       
  4308     __ aesd(V0, V19);
       
  4309     __ aesimc(V0, V0);
       
  4310     __ aesd(V0, V20);
       
  4311     __ aesimc(V0, V0);
       
  4312     __ BIND(L_rounds_44);
       
  4313     __ aesd(V0, V21);
       
  4314     __ aesimc(V0, V0);
       
  4315     __ aesd(V0, V22);
       
  4316     __ aesimc(V0, V0);
       
  4317     __ aesd(V0, V23);
       
  4318     __ aesimc(V0, V0);
       
  4319     __ aesd(V0, V24);
       
  4320     __ aesimc(V0, V0);
       
  4321     __ aesd(V0, V25);
       
  4322     __ aesimc(V0, V0);
       
  4323     __ aesd(V0, V26);
       
  4324     __ aesimc(V0, V0);
       
  4325     __ aesd(V0, V27);
       
  4326     __ aesimc(V0, V0);
       
  4327     __ aesd(V0, V28);
       
  4328     __ aesimc(V0, V0);
       
  4329     __ aesd(V0, V29);
       
  4330     __ aesimc(V0, V0);
       
  4331     __ aesd(V0, V30);
       
  4332     __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad);
       
  4333     __ eor(V0, V0, V2, MacroAssembler::VELEM_SIZE_8, quad);
       
  4334 
       
  4335     __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
       
  4336     __ orr(V2, V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
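           // (V2 now holds this block's ciphertext, the chaining value for the next block)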
       
  4337 
       
  4338     __ sub(len_reg, len_reg, 16);
       
  4339     __ cbnz(len_reg, L_aes_loop);
       
  4340 
       
  4341     __ vst1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
       
  4342 
       
  4343     __ mov(R0, R9);
       
  4344 
       
  4345     __ mov(SP, FP);
       
  4346     __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
       
  4347     __ ret(LR);
       
  4348 
       
  4349     return start;
       
  4350   }
       
  4351 
       
  4352 #endif // COMPILER2
       
  4353 #endif // AARCH64
       
  4354 
       
  4355  private:
       
  4356 
       
  4357 #undef  __
       
  4358 #define __ masm->
       
  4359 
       
  4360   //------------------------------------------------------------------------------------------------------------------------
       
  4361   // Continuation point for throwing of implicit exceptions that are not handled in
       
  4362   // the current activation. Fabricates an exception oop and initiates normal
       
  4363   // exception dispatching in this frame.
       
  4364   address generate_throw_exception(const char* name, address runtime_entry) {
       
  4365     int insts_size = 128;
       
  4366     int locs_size  = 32;
       
  4367     CodeBuffer code(name, insts_size, locs_size);
       
  4368     OopMapSet* oop_maps;
       
  4369     int frame_size;
       
  4370     int frame_complete;
       
  4371 
       
  4372     oop_maps = new OopMapSet();
       
  4373     MacroAssembler* masm = new MacroAssembler(&code);
       
  4374 
       
  4375     address start = __ pc();
       
  4376 
       
  4377     frame_size = 2;
       
  4378     __ mov(Rexception_pc, LR);
       
  4379     __ raw_push(FP, LR);
       
  4380 
       
  4381     frame_complete = __ pc() - start;
       
  4382 
       
  4383     // Any extra arguments are already supposed to be in R1 and R2
       
  4384     __ mov(R0, Rthread);
       
  4385 
       
  4386     int pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp);
       
  4387     assert(((__ pc()) - start) == __ offset(), "warning: start differs from code_begin");
       
  4388     __ call(runtime_entry);
       
  4389     if (pc_offset == -1) {
       
  4390       pc_offset = __ offset();
       
  4391     }
       
  4392 
       
  4393     // Generate oop map
       
  4394     OopMap* map =  new OopMap(frame_size*VMRegImpl::slots_per_word, 0);
       
  4395     oop_maps->add_gc_map(pc_offset, map);
       
  4396     __ reset_last_Java_frame(Rtemp); // Rtemp free since scratched by far call
       
  4397 
       
  4398     __ raw_pop(FP, LR);
       
  4399     __ jump(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type, Rtemp);
       
  4400 
       
  4401     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete,
       
  4402                                                       frame_size, oop_maps, false);
       
  4403     return stub->entry_point();
       
  4404   }
       
  4405 
       
  4406   //---------------------------------------------------------------------------
       
  4407   // Initialization
       
  4408 
       
  4409   void generate_initial() {
       
  4410     // Generates all stubs and initializes the entry points
       
  4411 
       
  4412     //------------------------------------------------------------------------------------------------------------------------
       
  4413     // entry points that exist in all platforms
       
  4414     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
       
  4415     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
       
  4416     StubRoutines::_forward_exception_entry      = generate_forward_exception();
       
  4417 
       
  4418     StubRoutines::_call_stub_entry              =
       
  4419       generate_call_stub(StubRoutines::_call_stub_return_address);
       
  4420     // is referenced by megamorphic call
       
  4421     StubRoutines::_catch_exception_entry        = generate_catch_exception();
       
  4422 
       
  4423     // stub for throwing stack overflow error used both by interpreter and compiler
       
  4424     StubRoutines::_throw_StackOverflowError_entry  = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
       
  4425 
       
  4426 #ifndef AARCH64
       
  4427     // integer division used both by interpreter and compiler
       
  4428     StubRoutines::Arm::_idiv_irem_entry = generate_idiv_irem();
       
  4429 
       
  4430     StubRoutines::_atomic_add_entry = generate_atomic_add();
       
  4431     StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
       
  4432     StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
       
  4433     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
       
  4434     StubRoutines::_atomic_load_long_entry = generate_atomic_load_long();
       
  4435     StubRoutines::_atomic_store_long_entry = generate_atomic_store_long();
       
  4436 #endif // !AARCH64
       
  4437   }
       
  4438 
       
  4439   void generate_all() {
       
  4440     // Generates all stubs and initializes the entry points
       
  4441 
       
  4442 #ifdef COMPILER2
       
  4443     // Generate partial_subtype_check first here since its code depends on
       
  4444     // UseZeroBaseCompressedOops which is defined after heap initialization.
       
  4445     StubRoutines::Arm::_partial_subtype_check                = generate_partial_subtype_check();
       
  4446 #endif
       
  4447     // These entry points require SharedInfo::stack0 to be set up in non-core builds
       
  4448     // and need to be relocatable, so they each fabricate a RuntimeStub internally.
       
  4449     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
       
  4450     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
       
  4451     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
       
  4452 
       
  4453     //------------------------------------------------------------------------------------------------------------------------
       
  4454     // entry points that are platform specific
       
  4455 
       
  4456     // support for verify_oop (must happen after universe_init)
       
  4457     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
       
  4458 
       
  4459     // arraycopy stubs used by compilers
       
  4460     generate_arraycopy_stubs();
       
  4461 
       
  4462     // Safefetch stubs.
       
  4463     generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
       
  4464                                                    &StubRoutines::_safefetch32_fault_pc,
       
  4465                                                    &StubRoutines::_safefetch32_continuation_pc);
       
  4466 #ifdef AARCH64
       
  4467     generate_safefetch("SafeFetchN", wordSize, &StubRoutines::_safefetchN_entry,
       
  4468                                                &StubRoutines::_safefetchN_fault_pc,
       
  4469                                                &StubRoutines::_safefetchN_continuation_pc);
       
  4470 #ifdef COMPILER2
       
  4471     if (UseAESIntrinsics) {
       
  4472       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       
  4473       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
       
  4474       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
       
  4475       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
       
  4476     }
       
  4477 #endif
       
  4478 #else
       
  4479     assert (sizeof(int) == wordSize, "32-bit architecture");
       
  4480     StubRoutines::_safefetchN_entry           = StubRoutines::_safefetch32_entry;
       
  4481     StubRoutines::_safefetchN_fault_pc        = StubRoutines::_safefetch32_fault_pc;
       
  4482     StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
       
  4483 #endif // AARCH64
       
  4484 
       
  4485 #ifdef COMPILE_CRYPTO
       
  4486     // generate AES intrinsics code
       
  4487     if (UseAESIntrinsics) {
       
  4488       aes_init();
       
  4489       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
       
  4490       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
       
  4491       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
       
  4492       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
       
  4493     }
       
  4494 #endif // COMPILE_CRYPTO
       
  4495   }
       
  4496 
       
  4497 
       
  4498  public:
       
  4499   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
       
  4500     if (all) {
       
  4501       generate_all();
       
  4502     } else {
       
  4503       generate_initial();
       
  4504     }
       
  4505   }
       
  4506 }; // end class declaration
       
  4507 
       
  4508 void StubGenerator_generate(CodeBuffer* code, bool all) {
       
  4509   StubGenerator g(code, all);
       
  4510 }