diff -r 4ebc2e2fb97c -r 71c04702a3d5 src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp Tue Sep 12 19:03:39 2017 +0200 @@ -0,0 +1,4984 @@ +/* + * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "interpreter/interpreter.hpp" +#include "nativeInst_aarch64.hpp" +#include "oops/instanceOop.hpp" +#include "oops/method.hpp" +#include "oops/objArrayKlass.hpp" +#include "oops/oop.inline.hpp" +#include "prims/methodHandles.hpp" +#include "runtime/frame.inline.hpp" +#include "runtime/handles.inline.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubCodeGenerator.hpp" +#include "runtime/stubRoutines.hpp" +#include "runtime/thread.inline.hpp" +#include "utilities/align.hpp" +#ifdef COMPILER2 +#include "opto/runtime.hpp" +#endif + +#ifdef BUILTIN_SIM +#include "../../../../../../simulator/simulator.hpp" +#endif + +// Declaration and definition of StubGenerator (no .hpp file). +// For a more detailed description of the stub routine structure +// see the comment in stubRoutines.hpp + +#undef __ +#define __ _masm-> +#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8)) + +#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#else +#define BLOCK_COMMENT(str) __ block_comment(str) +#endif + +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") + +// Stub Code definitions + +class StubGenerator: public StubCodeGenerator { + private: + +#ifdef PRODUCT +#define inc_counter_np(counter) ((void)0) +#else + void inc_counter_np_(int& counter) { + __ lea(rscratch2, ExternalAddress((address)&counter)); + __ ldrw(rscratch1, Address(rscratch2)); + __ addw(rscratch1, rscratch1, 1); + __ strw(rscratch1, Address(rscratch2)); + } +#define inc_counter_np(counter) \ + BLOCK_COMMENT("inc_counter " #counter); \ + inc_counter_np_(counter); +#endif + + // Call stubs are used to call Java from C + // + // Arguments: + // c_rarg0: call wrapper address address + // c_rarg1: result address + // c_rarg2: result type BasicType + // c_rarg3: method Method* + // c_rarg4: (interpreter) entry point address + // c_rarg5: parameters intptr_t* + // c_rarg6: parameter size (in words) int + // c_rarg7: thread Thread* + // + // There is no return from the stub itself as any Java result + // is written to result + // + // we save r30 (lr) as the return PC at the base of the frame and + // link r29 (fp) below it as the frame pointer installing sp (r31) + // into fp. + // + // we save r0-r7, which accounts for all the c arguments. + // + // TODO: strictly do we need to save them all? they are treated as + // volatile by C so could we omit saving the ones we are going to + // place in global registers (thread? method?) or those we only use + // during setup of the Java call? + // + // we don't need to save r8 which C uses as an indirect result location + // return register. 
//
// we don't need to save r9-r15 which both C and Java treat as
// volatile
//
// we don't need to save r16-18 because Java does not use them
//
// we save r19-r28 which Java uses as scratch registers and C
// expects to be callee-save
//
// we save the bottom 64 bits of each value stored in v8-v15; it is
// the responsibility of the caller to preserve larger values.
//
// so the stub frame looks like this when we enter Java code
//
//     [ return_from_Java     ] <--- sp
//     [ argument word n      ]
//      ...
// -27 [ argument word 1      ]
// -26 [ saved v15            ] <--- sp_after_call
// -25 [ saved v14            ]
// -24 [ saved v13            ]
// -23 [ saved v12            ]
// -22 [ saved v11            ]
// -21 [ saved v10            ]
// -20 [ saved v9             ]
// -19 [ saved v8             ]
// -18 [ saved r28            ]
// -17 [ saved r27            ]
// -16 [ saved r26            ]
// -15 [ saved r25            ]
// -14 [ saved r24            ]
// -13 [ saved r23            ]
// -12 [ saved r22            ]
// -11 [ saved r21            ]
// -10 [ saved r20            ]
//  -9 [ saved r19            ]
//  -8 [ call wrapper    (r0) ]
//  -7 [ result          (r1) ]
//  -6 [ result type     (r2) ]
//  -5 [ method          (r3) ]
//  -4 [ entry point     (r4) ]
//  -3 [ parameters      (r5) ]
//  -2 [ parameter size  (r6) ]
//  -1 [ thread (r7)          ]
//   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
//   1 [ saved lr       (r30) ]

// Call stub stack layout word offsets from fp
//
// For the callee-saved register pairs only the lower-addressed slot is
// named: each pair is written/read with a single stp/ldp (stpd/ldpd),
// which also covers the slot one word above it (e.g. d15_off names the
// v15 slot and the same stpd fills the v14 slot at -25).  Slot -3
// (parameters) has no constant of its own; it is filled by the stp that
// is addressed through entry_point_off.
enum call_stub_layout {
  sp_after_call_off = -26,

  d15_off            = -26,
  d13_off            = -24,
  d11_off            = -22,
  d9_off             = -20,

  r28_off            = -18,
  r26_off            = -16,
  r24_off            = -14,
  r22_off            = -12,
  r20_off            = -10,
  call_wrapper_off   =  -8,
  result_off         =  -7,
  result_type_off    =  -6,
  method_off         =  -5,
  entry_point_off    =  -4,
  parameter_size_off =  -2,
  thread_off         =  -1,
  fp_f               =   0,
  retaddr_off        =   1,
};

// Emits the call stub and returns the address of its first instruction.
// As a side effect, return_address is set to the pc immediately following
// the call into Java (after the optional simulator notify); that is where
// the result is stored and the frame is torn down.
address generate_call_stub(address& return_address) {
  assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
         (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
         "adjust this code");

  StubCodeMark mark(this, "StubRoutines", "call_stub");
  address start = __ pc();

  // NOTE(review): sp_after_call is not referenced again in this function;
  // the code below uses sp_after_call_off directly.
  const Address sp_after_call(rfp, sp_after_call_off * wordSize);

  const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
  const Address result        (rfp, result_off         * wordSize);
  const Address result_type   (rfp, result_type_off    * wordSize);
  const Address method        (rfp, method_off         * wordSize);
  const Address entry_point   (rfp, entry_point_off    * wordSize);
  const Address parameter_size(rfp, parameter_size_off * wordSize);

  const Address thread        (rfp, thread_off         * wordSize);

  const Address d15_save      (rfp, d15_off * wordSize);
  const Address d13_save      (rfp, d13_off * wordSize);
  const Address d11_save      (rfp, d11_off * wordSize);
  const Address d9_save       (rfp, d9_off * wordSize);

  const Address r28_save      (rfp, r28_off * wordSize);
  const Address r26_save      (rfp, r26_off * wordSize);
  const Address r24_save      (rfp, r24_off * wordSize);
  const Address r22_save      (rfp, r22_off * wordSize);
  const Address r20_save      (rfp, r20_off * wordSize);

  // stub code

  // we need a C prolog to bootstrap the x86 caller into the sim
  __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

  // NOTE(review): aarch64_entry is not referenced again in this function.
  address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
  // Save sender's SP for stack traces.
  __ mov(rscratch1, sp);
  __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
  // set up frame and move sp to end of save area
  __ enter();
  __ sub(sp, rfp, -sp_after_call_off * wordSize);

  // save register parameters and Java scratch/global registers
  // n.b. we save thread even though it gets installed in
  // rthread because we want to sanity check rthread later
  __ str(c_rarg7,  thread);
  __ strw(c_rarg6, parameter_size);         // 32-bit store: parameter size is an int
  __ stp(c_rarg4, c_rarg5,  entry_point);   // also fills the parameters slot (-3)
  __ stp(c_rarg2, c_rarg3,  result_type);   // also fills the method slot (-5)
  __ stp(c_rarg0, c_rarg1,  call_wrapper);  // also fills the result slot (-7)

  __ stp(r20, r19,   r20_save);
  __ stp(r22, r21,   r22_save);
  __ stp(r24, r23,   r24_save);
  __ stp(r26, r25,   r26_save);
  __ stp(r28, r27,   r28_save);

  __ stpd(v9,  v8,   d9_save);
  __ stpd(v11, v10,  d11_save);
  __ stpd(v13, v12,  d13_save);
  __ stpd(v15, v14,  d15_save);

  // install Java thread in global register now we have saved
  // whatever value it held
  __ mov(rthread, c_rarg7);
  // And method
  __ mov(rmethod, c_rarg3);

  // set up the heapbase register
  __ reinit_heapbase();

#ifdef ASSERT
  // make sure we have no pending exceptions
  {
    Label L;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cmp(rscratch1, (unsigned)NULL_WORD);
    __ br(Assembler::EQ, L);
    __ stop("StubRoutines::call_stub: entered with pending exception");
    __ BIND(L);
  }
#endif
  // pass parameters if any
  __ mov(esp, sp);
  __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
  // round the new sp down to a multiple of 2 * wordSize
  __ andr(sp, rscratch1, -2 * wordSize);

  BLOCK_COMMENT("pass parameters if any");
  Label parameters_done;
  // parameter count is still in c_rarg6
  // and parameter pointer identifying param 1 is in c_rarg5
  __ cbzw(c_rarg6, parameters_done);

  // copy loop: load the next parameter word (post-incrementing c_rarg5),
  // push it, and repeat while the decremented count is still > 0
  address loop = __ pc();
  __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
  __ subsw(c_rarg6, c_rarg6, 1);
  __ push(rscratch1);
  __ br(Assembler::GT, loop);

  __ BIND(parameters_done);

  // call Java entry -- passing methodOop, and current sp
  //      rmethod: Method*
  //      r13: sender sp
  BLOCK_COMMENT("call Java function");
  __ mov(r13, sp);
  __ blr(c_rarg4);

  // tell the simulator we have returned to the stub

  // we do this here because the notify will already have been done
  // if we get to the next instruction via an exception
  //
  // n.b. adding this instruction here affects the calculation of
  // whether or not a routine returns to the call stub (used when
  // doing stack walks) since the normal test is to check the return
  // pc against the address saved below. so we may need to allow for
  // this extra instruction in the check.

  if (NotifySimulator) {
    __ notify(Assembler::method_reentry);
  }
  // save current address for use by exception handling code

  return_address = __ pc();

  // store result depending on type (everything that is not
  // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
  // n.b. this assumes Java returns an integral result in r0
  // and a floating result in j_farg0
  __ ldr(j_rarg2, result);
  Label is_long, is_float, is_double, exit;
  __ ldr(j_rarg1, result_type);
  // T_OBJECT shares the T_LONG path: both store all 64 bits of r0
  __ cmp(j_rarg1, T_OBJECT);
  __ br(Assembler::EQ, is_long);
  __ cmp(j_rarg1, T_LONG);
  __ br(Assembler::EQ, is_long);
  __ cmp(j_rarg1, T_FLOAT);
  __ br(Assembler::EQ, is_float);
  __ cmp(j_rarg1, T_DOUBLE);
  __ br(Assembler::EQ, is_double);

  // handle T_INT case
  __ strw(r0, Address(j_rarg2));

  __ BIND(exit);

  // pop parameters
  __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
  // verify that threads correspond
  {
    Label L, S;
    __ ldr(rscratch1, thread);
    __ cmp(rthread, rscratch1);
    __ br(Assembler::NE, S);
    __ get_thread(rscratch1);
    __ cmp(rthread, rscratch1);
    __ br(Assembler::EQ, L);
    __ BIND(S);
    __ stop("StubRoutines::call_stub: threads must correspond");
    __ BIND(L);
  }
#endif

  // restore callee-save registers
  __ ldpd(v15, v14,  d15_save);
  __ ldpd(v13, v12,  d13_save);
  __ ldpd(v11, v10,  d11_save);
  __ ldpd(v9,  v8,   d9_save);

  __ ldp(r28, r27,   r28_save);
  __ ldp(r26, r25,   r26_save);
  __ ldp(r24, r23,   r24_save);
  __ ldp(r22, r21,   r22_save);
  __ ldp(r20, r19,   r20_save);

  __ ldp(c_rarg0, c_rarg1,  call_wrapper);
  __ ldrw(c_rarg2, result_type);
  __ ldr(c_rarg3,  method);
  __ ldp(c_rarg4, c_rarg5,  entry_point);
  // NOTE(review): the parameter_size slot was written with a 32-bit strw
  // above but is reloaded here as the 64-bit first half of an ldp, so the
  // upper 32 bits of c_rarg6 are whatever the slot previously held.
  // Presumably harmless because c_rarg6 is an int argument register that
  // the C caller does not expect to be preserved -- confirm.
  __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
  // tell the simulator we are about to end Java execution
  if (NotifySimulator) {
    __ notify(Assembler::method_exit);
  }
#endif
  // leave frame and return to caller
  __ leave();
  __ ret(lr);

  // handle return types different from T_INT
  // (out-of-line stores; each one branches back to the common exit path)

  __ BIND(is_long);
  __ str(r0, Address(j_rarg2, 0));
  __ br(Assembler::AL, exit);

  __ BIND(is_float);
  __ strs(j_farg0, Address(j_rarg2, 0));
  __ br(Assembler::AL, exit);

  __ BIND(is_double);
  __ strd(j_farg0, Address(j_rarg2, 0));
  __ br(Assembler::AL, exit);

  return start;
}

// Return point for a Java call if there's an exception thrown in
// Java code.  The exception is caught and transformed into a
// pending exception stored in JavaThread that can be tested from
// within the VM.
//
// Note: Usually the parameters are removed by the callee. In case
// of an exception crossing an activation frame boundary, that is
// not the case if the callee is compiled code => need to setup the
// rsp.
//
// r0: exception oop

// NOTE: this is used as a target from the signal handler so it
// needs an x86 prolog which returns into the current simulator
// executing the generated catch_exception code. so the prolog
// needs to install rax in a sim register and adjust the sim's
// restart pc to enter the generated code at the start position
// then return from native to simulated execution.
+ + address generate_catch_exception() { + StubCodeMark mark(this, "StubRoutines", "catch_exception"); + address start = __ pc(); + + // same as in generate_call_stub(): + const Address sp_after_call(rfp, sp_after_call_off * wordSize); + const Address thread (rfp, thread_off * wordSize); + +#ifdef ASSERT + // verify that threads correspond + { + Label L, S; + __ ldr(rscratch1, thread); + __ cmp(rthread, rscratch1); + __ br(Assembler::NE, S); + __ get_thread(rscratch1); + __ cmp(rthread, rscratch1); + __ br(Assembler::EQ, L); + __ bind(S); + __ stop("StubRoutines::catch_exception: threads must correspond"); + __ bind(L); + } +#endif + + // set pending exception + __ verify_oop(r0); + + __ str(r0, Address(rthread, Thread::pending_exception_offset())); + __ mov(rscratch1, (address)__FILE__); + __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); + __ movw(rscratch1, (int)__LINE__); + __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); + + // complete return to VM + assert(StubRoutines::_call_stub_return_address != NULL, + "_call_stub_return_address must have been generated before"); + __ b(StubRoutines::_call_stub_return_address); + + return start; + } + + // Continuation point for runtime calls returning with a pending + // exception. The pending exception check happened in the runtime + // or native call stub. The pending exception in Thread is + // converted into a Java-level exception. + // + // Contract with Java-level exception handlers: + // r0: exception + // r3: throwing pc + // + // NOTE: At entry of this stub, exception-pc must be in LR !! 
+ + // NOTE: this is always used as a jump target within generated code + // so it just needs to be generated code wiht no x86 prolog + + address generate_forward_exception() { + StubCodeMark mark(this, "StubRoutines", "forward exception"); + address start = __ pc(); + + // Upon entry, LR points to the return address returning into + // Java (interpreted or compiled) code; i.e., the return address + // becomes the throwing pc. + // + // Arguments pushed before the runtime call are still on the stack + // but the exception handler will reset the stack pointer -> + // ignore them. A potential result in registers can be ignored as + // well. + +#ifdef ASSERT + // make sure this code is only executed if there is a pending exception + { + Label L; + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbnz(rscratch1, L); + __ stop("StubRoutines::forward exception: no pending exception (1)"); + __ bind(L); + } +#endif + + // compute exception handler into r19 + + // call the VM to find the handler address associated with the + // caller address. pass thread in r0 and caller pc (ret address) + // in r1. n.b. the caller pc is in lr, unlike x86 where it is on + // the stack. + __ mov(c_rarg1, lr); + // lr will be trashed by the VM call so we move it to R19 + // (callee-saved) because we also need to pass it to the handler + // returned by this call. + __ mov(r19, lr); + BLOCK_COMMENT("call exception_handler_for_return_address"); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, + SharedRuntime::exception_handler_for_return_address), + rthread, c_rarg1); + // we should not really care that lr is no longer the callee + // address. we saved the value the handler needs in r19 so we can + // just copy it to r3. however, the C2 handler will push its own + // frame and then calls into the VM and the VM code asserts that + // the PC for the frame above the handler belongs to a compiled + // Java method. So, we restore lr here to satisfy that assert. 
+ __ mov(lr, r19); + // setup r0 & r3 & clear pending exception + __ mov(r3, r19); + __ mov(r19, r0); + __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); + __ str(zr, Address(rthread, Thread::pending_exception_offset())); + +#ifdef ASSERT + // make sure exception is set + { + Label L; + __ cbnz(r0, L); + __ stop("StubRoutines::forward exception: no pending exception (2)"); + __ bind(L); + } +#endif + + // continue at exception handler + // r0: exception + // r3: throwing pc + // r19: exception handler + __ verify_oop(r0); + __ br(r19); + + return start; + } + + // Non-destructive plausibility checks for oops + // + // Arguments: + // r0: oop to verify + // rscratch1: error message + // + // Stack after saving c_rarg3: + // [tos + 0]: saved c_rarg3 + // [tos + 1]: saved c_rarg2 + // [tos + 2]: saved lr + // [tos + 3]: saved rscratch2 + // [tos + 4]: saved r0 + // [tos + 5]: saved rscratch1 + address generate_verify_oop() { + + StubCodeMark mark(this, "StubRoutines", "verify_oop"); + address start = __ pc(); + + Label exit, error; + + // save c_rarg2 and c_rarg3 + __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); + + // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); + __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); + __ ldr(c_rarg3, Address(c_rarg2)); + __ add(c_rarg3, c_rarg3, 1); + __ str(c_rarg3, Address(c_rarg2)); + + // object is in r0 + // make sure object is 'reasonable' + __ cbz(r0, exit); // if obj is NULL it is OK + + // Check if the oop is in the right area of memory + __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); + __ andr(c_rarg2, r0, c_rarg3); + __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); + + // Compare c_rarg2 and c_rarg3. We don't use a compare + // instruction here because the flags register is live. + __ eor(c_rarg2, c_rarg2, c_rarg3); + __ cbnz(c_rarg2, error); + + // make sure klass is 'reasonable', which is not zero. 
+ __ load_klass(r0, r0); // get klass + __ cbz(r0, error); // if klass is NULL it is broken + + // return if everything seems ok + __ bind(exit); + + __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); + __ ret(lr); + + // handle errors + __ bind(error); + __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); + + __ push(RegSet::range(r0, r29), sp); + // debug(char* msg, int64_t pc, int64_t regs[]) + __ mov(c_rarg0, rscratch1); // pass address of error message + __ mov(c_rarg1, lr); // pass return address + __ mov(c_rarg2, sp); // pass address of regs on stack +#ifndef PRODUCT + assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); +#endif + BLOCK_COMMENT("call MacroAssembler::debug"); + __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); + __ blrt(rscratch1, 3, 0, 1); + + return start; + } + + void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } + + // Generate code for an array write pre barrier + // + // addr - starting address + // count - element count + // tmp - scratch register + // saved_regs - registers to be saved before calling static_write_ref_array_pre + // + // Callers must specify which registers to preserve in saved_regs. + // Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs. + // + void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized, RegSet saved_regs) { + BarrierSet* bs = Universe::heap()->barrier_set(); + switch (bs->kind()) { + case BarrierSet::G1SATBCTLogging: + // With G1, don't generate the call if we statically know that the target in uninitialized + if (!dest_uninitialized) { + __ push(saved_regs, sp); + if (count == c_rarg0) { + if (addr == c_rarg1) { + // exactly backwards!! 
+ __ mov(rscratch1, c_rarg0); + __ mov(c_rarg0, c_rarg1); + __ mov(c_rarg1, rscratch1); + } else { + __ mov(c_rarg1, count); + __ mov(c_rarg0, addr); + } + } else { + __ mov(c_rarg0, addr); + __ mov(c_rarg1, count); + } + __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2); + __ pop(saved_regs, sp); + break; + case BarrierSet::CardTableForRS: + case BarrierSet::CardTableExtension: + case BarrierSet::ModRef: + break; + default: + ShouldNotReachHere(); + + } + } + } + + // + // Generate code for an array write post barrier + // + // Input: + // start - register containing starting address of destination array + // end - register containing ending address of destination array + // scratch - scratch register + // saved_regs - registers to be saved before calling static_write_ref_array_post + // + // The input registers are overwritten. + // The ending address is inclusive. + // Callers must specify which registers to preserve in saved_regs. + // Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs. 
+ void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch, RegSet saved_regs) { + assert_different_registers(start, end, scratch); + BarrierSet* bs = Universe::heap()->barrier_set(); + switch (bs->kind()) { + case BarrierSet::G1SATBCTLogging: + + { + __ push(saved_regs, sp); + // must compute element count unless barrier set interface is changed (other platforms supply count) + assert_different_registers(start, end, scratch); + __ lea(scratch, Address(end, BytesPerHeapOop)); + __ sub(scratch, scratch, start); // subtract start to get #bytes + __ lsr(scratch, scratch, LogBytesPerHeapOop); // convert to element count + __ mov(c_rarg0, start); + __ mov(c_rarg1, scratch); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2); + __ pop(saved_regs, sp); + } + break; + case BarrierSet::CardTableForRS: + case BarrierSet::CardTableExtension: + { + CardTableModRefBS* ct = (CardTableModRefBS*)bs; + assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); + + Label L_loop; + + __ lsr(start, start, CardTableModRefBS::card_shift); + __ lsr(end, end, CardTableModRefBS::card_shift); + __ sub(end, end, start); // number of bytes to copy + + const Register count = end; // 'end' register contains bytes count now + __ load_byte_map_base(scratch); + __ add(start, start, scratch); + if (UseConcMarkSweepGC) { + __ membar(__ StoreStore); + } + __ BIND(L_loop); + __ strb(zr, Address(start, count)); + __ subs(count, count, 1); + __ br(Assembler::GE, L_loop); + } + break; + default: + ShouldNotReachHere(); + + } + } + + // The inner part of zero_words(). This is the bulk operation, + // zeroing words in blocks, possibly using DC ZVA to do it. The + // caller is responsible for zeroing the last few words. + // + // Inputs: + // r10: the HeapWord-aligned base address of an array to zero. + // r11: the count in HeapWords, r11 > 0. + // + // Returns r10 and r11, adjusted for the caller to clear. 
// r10: the base address of the tail of words left to clear.
// r11: the number of words in the tail.
//      r11 < MacroAssembler::zero_words_block_size.

address generate_zero_blocks() {
  // NOTE(review): store_pair and loop_store_pair are declared but never
  // bound or branched to in this function.
  Label store_pair, loop_store_pair, done;
  Label base_aligned;

  Register base = r10, cnt = r11;

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "zero_blocks");
  address start = __ pc();

  if (UseBlockZeroing) {
    int zva_length = VM_Version::zva_length();

    // Ensure ZVA length can be divided by 16. This is required by
    // the subsequent operations.
    assert (zva_length % 16 == 0, "Unexpected ZVA Length");

    // If bit 3 of base is set, base is only 8-byte aligned: zero one
    // word first so that base becomes 16-byte aligned.
    __ tbz(base, 3, base_aligned);
    __ str(zr, Address(__ post(base, 8)));
    __ sub(cnt, cnt, 1);
    __ bind(base_aligned);

    // Ensure count >= zva_length * 2 so that it still deserves a zva after
    // alignment.
    Label small;
    int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
    // cnt is in words; low_limit >> 3 presumably converts a byte limit to
    // 8-byte words -- confirm the unit of BlockZeroingLowLimit.
    __ subs(rscratch1, cnt, low_limit >> 3);
    __ br(Assembler::LT, small);
    __ zero_dcache_blocks(base, cnt);
    __ bind(small);
  }

  {
    // Number of stp instructions we'll unroll
    const int unroll =
      MacroAssembler::zero_words_block_size / 2;
    // Clear the remaining blocks.
    // Each pass zeroes unroll * 2 words; cnt is decremented before the
    // block is written, so it is below zero when the loop is left.
    Label loop;
    __ subs(cnt, cnt, unroll * 2);
    __ br(Assembler::LT, done);
    __ bind(loop);
    for (int i = 0; i < unroll; i++)
      __ stp(zr, zr, __ post(base, 16));
    __ subs(cnt, cnt, unroll * 2);
    __ br(Assembler::GE, loop);
    __ bind(done);
    // undo the final over-subtraction so cnt is the number of words
    // still to be cleared by the caller
    __ add(cnt, cnt, unroll * 2);
  }

  __ ret(lr);

  return start;
}


// Direction of a bulk copy; the enumerator value is used as a sign
// multiplier (e.g. wordSize * direction) by the copy generators.
typedef enum {
  copy_forwards = 1,
  copy_backwards = -1
} copy_direction;

// Bulk copy of blocks of 8 words.
//
// count is a count of words.
//
// Precondition: count >= 8
//
// Postconditions:
//
// The least significant bit of count contains the remaining count
// of words to copy.  The rest of count is trash.
//
// s and d are adjusted to point to the remaining words to copy
//
void generate_copy_longs(Label &start, Register s, Register d, Register count,
                         copy_direction direction) {
  // unit: signed size in bytes of one word in the copy direction.
  int unit = wordSize * direction;
  // bias: for forwards copies s and d are moved back by this many bytes
  // so that the same positive multiples of unit address both directions.
  int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

  // NOTE(review): offset is never used in this function.
  int offset;
  const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
    t4 = r7, t5 = r10, t6 = r11, t7 = r12;
  const Register stride = r13;

  assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
  assert_different_registers(s, d, count, rscratch1);

  Label again, drain;
  const char *stub_name;
  if (direction == copy_forwards)
    stub_name = "forward_copy_longs";
  else
    stub_name = "backward_copy_longs";
  StubCodeMark mark(this, "StubRoutines", stub_name);
  __ align(CodeEntryAlignment);
  __ bind(start);

  Label unaligned_copy_long;
  if (AvoidUnalignedAccesses) {
    // destination not 16-byte aligned (bit 3 set): use the variant
    // generated at the end of this function
    __ tbnz(d, 3, unaligned_copy_long);
  }

  if (direction == copy_forwards) {
    __ sub(s, s, bias);
    __ sub(d, d, bias);
  }

#ifdef ASSERT
  // Make sure we are never given < 8 words
  {
    Label L;
    __ cmp(count, 8);
    __ br(Assembler::GE, L);
    __ stop("genrate_copy_longs called with < 8 words");
    __ bind(L);
  }
#endif

  // Fill 8 registers
  if (UseSIMDForMemoryOps) {
    __ ldpq(v0, v1, Address(s, 4 * unit));
    __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
  } else {
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
  }

  // 8 words are already in registers; enter the store/load loop only if
  // at least 8 more remain (count >= 16), otherwise go straight to drain.
  __ subs(count, count, 16);
  __ br(Assembler::LO, drain);

  int prefetch = PrefetchCopyIntervalInBytes;
  bool use_stride = false;
  if (direction == copy_backwards) {
     // backwards copies prefetch at a negative distance; a distance above
     // 256 is kept in a register rather than used as an immediate
     use_stride = prefetch > 256;
     prefetch = -prefetch;
     if (use_stride) __ mov(stride, prefetch);
  }

  __ bind(again);

  if (PrefetchCopyIntervalInBytes > 0)
    __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

  // store the 8 words loaded previously, interleaved with loading the next 8
  if (UseSIMDForMemoryOps) {
    __ stpq(v0, v1, Address(d, 4 * unit));
    __ ldpq(v0, v1, Address(s, 4 * unit));
    __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
  } else {
    __ stp(t0, t1, Address(d, 2 * unit));
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
  }

  __ subs(count, count, 8);
  __ br(Assembler::HS, again);

  // Drain
  __ bind(drain);
  if (UseSIMDForMemoryOps) {
    __ stpq(v0, v1, Address(d, 4 * unit));
    __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
  } else {
    __ stp(t0, t1, Address(d, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
  }

  // count is negative here, but only multiples of 8 were subtracted from
  // it, so its low three bits still equal the number of words left over.
  // Bit 2 selects a 4 word sub-block and bit 1 a 2 word sub-block.
  {
    Label L1, L2;
    __ tbz(count, exact_log2(4), L1);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
      __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
    }
    __ bind(L1);

    // undo the bias applied on entry for forwards copies
    if (direction == copy_forwards) {
      __ add(s, s, bias);
      __ add(d, d, bias);
    }

    __ tbz(count, 1, L2);
    __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
    __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
    __ bind(L2);
  }

  __ ret(lr);

  if (AvoidUnalignedAccesses) {
    Label drain, again;
    // Register order for storing. Order is different for backward copy.

    __ bind(unaligned_copy_long);

    // source address is even aligned, target odd aligned
    //
    // when forward copying word pairs we read long pairs at offsets
    // {0, 2, 4, 6} (in long words). when backwards copying we read
    // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
    // address by -2 in the forwards case so we can compute the
    // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
    // or -1.
    //
    // when forward copying we need to store 1 word, 3 pairs and
    // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
    // zero offset we adjust the destination by -1 which means we
    // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
    //
    // When backwards copying we need to store 1 word, 3 pairs and
    // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
    // offsets {1, 3, 5, 7, 8} * unit.

    if (direction == copy_forwards) {
      __ sub(s, s, 16);
      __ sub(d, d, 8);
    }

    // Fill 8 registers
    //
    // for forwards copy s was offset by -16 from the original input
    // value of s so the register contents are at these offsets
    // relative to the 64 byte block addressed by that original input
    // and so on for each successive 64 byte block when s is updated
    //
    // t0 at offset 0,  t1 at offset 8
    // t2 at offset 16, t3 at offset 24
    // t4 at offset 32, t5 at offset 40
    // t6 at offset 48, t7 at offset 56

    // for backwards copy s was not offset so the register contents
    // are at these offsets into the preceding 64 byte block
    // relative to that original input and so on for each successive
    // preceding 64 byte block when s is updated. this explains the
    // slightly counter-intuitive looking pattern of register usage
    // in the stp instructions for backwards copy.
    //
    // t0 at offset -16, t1 at offset -8
    // t2 at offset -32, t3 at offset -24
    // t4 at offset -48, t5 at offset -40
    // t6 at offset -64, t7 at offset -56

    __ ldp(t0, t1, Address(s, 2 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (direction == copy_forwards) {
     // allowing for the offset of -8 the store instructions place
     // registers into the target 64 byte block at the following
     // offsets
     //
     // t0 at offset 0
     // t1 at offset 8,  t2 at offset 16
     // t3 at offset 24, t4 at offset 32
     // t5 at offset 40, t6 at offset 48
     // t7 at offset 56

      __ str(t0, Address(d, 1 * unit));
      __ stp(t1, t2, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t3, t4, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t5, t6, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ str(t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    } else {
     // d was not offset when we started so the registers are
     // written into the 64 byte block preceding d with the following
     // offsets
     //
     // t1 at offset -8
     // t3 at offset -24, t0 at offset -16
     // t5 at offset -40, t2 at offset -32
     // t7 at offset -56, t4 at offset -48
     //                   t6 at offset -64
     //
     // note that this matches the offsets previously noted for the
     // loads

      __ str(t1, Address(d, 1 * unit));
      __ stp(t3, t0, Address(d, 3 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t5, t2, Address(d, 5 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t7, t4, Address(d, 7 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ str(t6, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    //
    // this uses the same pattern of offsets and register arguments
    // as above
    __ bind(drain);
    if (direction == copy_forwards) {
      __ str(t0, Address(d, 1 * unit));
      __ stp(t1, t2, Address(d, 2 * unit));
      __ stp(t3, t4, Address(d, 4 * unit));
      __ stp(t5, t6, Address(d, 6 * unit));
      __ str(t7, Address(__ pre(d, 8 * unit)));
    } else {
      __ str(t1, Address(d, 1 * unit));
      __ stp(t3, t0, Address(d, 3 * unit));
      __ stp(t5, t2, Address(d, 5 * unit));
      __ stp(t7, t4, Address(d, 7 * unit));
      __ str(t6, Address(__ pre(d, 8 * unit)));
    }
    // now we need to copy any remaining part block which may
    // include a 4 word subblock and/or a 2 word subblock.
    // bits 2 and 1 in the count are the tell-tale for whether we
    // have each such subblock
    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      // this is the same as above but copying only 4 longs hence
      // with only one intervening stp between the str instructions
      // but note that the offsets and registers still follow the
      // same pattern
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ str(t3, Address(__ pre(d, 4 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ str(t2, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      __ tbz(count, 1, L2);
      // this is the same as above but copying only 2 longs hence
      // there is no intervening stp between the str instructions
      // but note that the offset and register patterns are still
      // the same
      __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ str(t1, Address(__ pre(d, 2 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ str(t0, Address(__ pre(d, 2 * unit)));
      }
      __ bind(L2);

      // for forwards copy we need to re-adjust the offsets we
      // applied so that s and d follow the last words written

      if (direction == copy_forwards) {
        __ add(s, s, 16);
        __ add(d, d, 8);
      }

    }

    __ ret(lr);
  }
}

// Small copy: less than 16 bytes.
//
// NB: Ignores all of the bits of count which represent more than 15
// bytes, so a caller doesn't have to mask them.

void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
  bool is_backwards = step < 0;
  size_t granularity = uabs(step);
  int direction = is_backwards ? -1 : 1;
  int unit = wordSize * direction;

  Label Lpair, Lword, Lint, Lshort, Lbyte;

  assert(granularity
         && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

  const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

  // ??? I don't know if this bit-test-and-branch is the right thing
  // to do.  It does a lot of jumping, resulting in several
  // mispredicted branches.  It might make more sense to do this
  // with something like Duff's device with a single computed branch.
+ + __ tbz(count, 3 - exact_log2(granularity), Lword); + __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); + __ str(tmp, Address(__ adjust(d, unit, is_backwards))); + __ bind(Lword); + + if (granularity <= sizeof (jint)) { + __ tbz(count, 2 - exact_log2(granularity), Lint); + __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); + __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); + __ bind(Lint); + } + + if (granularity <= sizeof (jshort)) { + __ tbz(count, 1 - exact_log2(granularity), Lshort); + __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); + __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); + __ bind(Lshort); + } + + if (granularity <= sizeof (jbyte)) { + __ tbz(count, 0, Lbyte); + __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); + __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); + __ bind(Lbyte); + } + } + + Label copy_f, copy_b; + + // All-singing all-dancing memory copy. + // + // Copy count units of memory from s to d. The size of a unit is + // step, which can be positive or negative depending on the direction + // of copy. If is_aligned is false, we align the source address. + // + + void copy_memory(bool is_aligned, Register s, Register d, + Register count, Register tmp, int step) { + copy_direction direction = step < 0 ? copy_backwards : copy_forwards; + bool is_backwards = step < 0; + int granularity = uabs(step); + const Register t0 = r3, t1 = r4; + + // <= 96 bytes do inline. 
Direction doesn't matter because we always + // load all the data before writing anything + Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish; + const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; + const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; + const Register send = r17, dend = r18; + + if (PrefetchCopyIntervalInBytes > 0) + __ prfm(Address(s, 0), PLDL1KEEP); + __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity); + __ br(Assembler::HI, copy_big); + + __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); + __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); + + __ cmp(count, 16/granularity); + __ br(Assembler::LS, copy16); + + __ cmp(count, 64/granularity); + __ br(Assembler::HI, copy80); + + __ cmp(count, 32/granularity); + __ br(Assembler::LS, copy32); + + // 33..64 bytes + if (UseSIMDForMemoryOps) { + __ ldpq(v0, v1, Address(s, 0)); + __ ldpq(v2, v3, Address(send, -32)); + __ stpq(v0, v1, Address(d, 0)); + __ stpq(v2, v3, Address(dend, -32)); + } else { + __ ldp(t0, t1, Address(s, 0)); + __ ldp(t2, t3, Address(s, 16)); + __ ldp(t4, t5, Address(send, -32)); + __ ldp(t6, t7, Address(send, -16)); + + __ stp(t0, t1, Address(d, 0)); + __ stp(t2, t3, Address(d, 16)); + __ stp(t4, t5, Address(dend, -32)); + __ stp(t6, t7, Address(dend, -16)); + } + __ b(finish); + + // 17..32 bytes + __ bind(copy32); + __ ldp(t0, t1, Address(s, 0)); + __ ldp(t2, t3, Address(send, -16)); + __ stp(t0, t1, Address(d, 0)); + __ stp(t2, t3, Address(dend, -16)); + __ b(finish); + + // 65..80/96 bytes + // (96 bytes if SIMD because we do 32 byes per instruction) + __ bind(copy80); + if (UseSIMDForMemoryOps) { + __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0)); + __ ldpq(v4, v5, Address(send, -32)); + __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0)); + __ stpq(v4, v5, Address(dend, -32)); + } else { + __ ldp(t0, t1, Address(s, 0)); + __ ldp(t2, t3, Address(s, 16)); + __ ldp(t4, t5, Address(s, 32)); + __ ldp(t6, t7, Address(s, 
48)); + __ ldp(t8, t9, Address(send, -16)); + + __ stp(t0, t1, Address(d, 0)); + __ stp(t2, t3, Address(d, 16)); + __ stp(t4, t5, Address(d, 32)); + __ stp(t6, t7, Address(d, 48)); + __ stp(t8, t9, Address(dend, -16)); + } + __ b(finish); + + // 0..16 bytes + __ bind(copy16); + __ cmp(count, 8/granularity); + __ br(Assembler::LO, copy8); + + // 8..16 bytes + __ ldr(t0, Address(s, 0)); + __ ldr(t1, Address(send, -8)); + __ str(t0, Address(d, 0)); + __ str(t1, Address(dend, -8)); + __ b(finish); + + if (granularity < 8) { + // 4..7 bytes + __ bind(copy8); + __ tbz(count, 2 - exact_log2(granularity), copy4); + __ ldrw(t0, Address(s, 0)); + __ ldrw(t1, Address(send, -4)); + __ strw(t0, Address(d, 0)); + __ strw(t1, Address(dend, -4)); + __ b(finish); + if (granularity < 4) { + // 0..3 bytes + __ bind(copy4); + __ cbz(count, finish); // get rid of 0 case + if (granularity == 2) { + __ ldrh(t0, Address(s, 0)); + __ strh(t0, Address(d, 0)); + } else { // granularity == 1 + // Now 1..3 bytes. Handle the 1 and 2 byte case by copying + // the first and last byte. + // Handle the 3 byte case by loading and storing base + count/2 + // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) + // This does means in the 1 byte case we load/store the same + // byte 3 times. + __ lsr(count, count, 1); + __ ldrb(t0, Address(s, 0)); + __ ldrb(t1, Address(send, -1)); + __ ldrb(t2, Address(s, count)); + __ strb(t0, Address(d, 0)); + __ strb(t1, Address(dend, -1)); + __ strb(t2, Address(d, count)); + } + __ b(finish); + } + } + + __ bind(copy_big); + if (is_backwards) { + __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); + __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); + } + + // Now we've got the small case out of the way we can align the + // source address on a 2-word boundary. + + Label aligned; + + if (is_aligned) { + // We may have to adjust by 1 word to get s 2-word-aligned. 
+ __ tbz(s, exact_log2(wordSize), aligned); + __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); + __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); + __ sub(count, count, wordSize/granularity); + } else { + if (is_backwards) { + __ andr(rscratch2, s, 2 * wordSize - 1); + } else { + __ neg(rscratch2, s); + __ andr(rscratch2, rscratch2, 2 * wordSize - 1); + } + // rscratch2 is the byte adjustment needed to align s. + __ cbz(rscratch2, aligned); + int shift = exact_log2(granularity); + if (shift) __ lsr(rscratch2, rscratch2, shift); + __ sub(count, count, rscratch2); + +#if 0 + // ?? This code is only correct for a disjoint copy. It may or + // may not make sense to use it in that case. + + // Copy the first pair; s and d may not be aligned. + __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); + __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); + + // Align s and d, adjust count + if (is_backwards) { + __ sub(s, s, rscratch2); + __ sub(d, d, rscratch2); + } else { + __ add(s, s, rscratch2); + __ add(d, d, rscratch2); + } +#else + copy_memory_small(s, d, rscratch2, rscratch1, step); +#endif + } + + __ bind(aligned); + + // s is now 2-word-aligned. + + // We have a count of units and some trailing bytes. Adjust the + // count and do a bulk copy of words. + __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); + if (direction == copy_forwards) + __ bl(copy_f); + else + __ bl(copy_b); + + // And the tail. + copy_memory_small(s, d, count, tmp, step); + + if (granularity >= 8) __ bind(copy8); + if (granularity >= 4) __ bind(copy4); + __ bind(finish); + } + + + void clobber_registers() { +#ifdef ASSERT + __ mov(rscratch1, (uint64_t)0xdeadbeef); + __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); + for (Register r = r3; r <= r18; r++) + if (r != rscratch1) __ mov(r, rscratch1); +#endif + } + + // Scan over array at a for count oops, verifying each one. 
+ // Preserves a and count, clobbers rscratch1 and rscratch2. + void verify_oop_array (size_t size, Register a, Register count, Register temp) { + Label loop, end; + __ mov(rscratch1, a); + __ mov(rscratch2, zr); + __ bind(loop); + __ cmp(rscratch2, count); + __ br(Assembler::HS, end); + if (size == (size_t)wordSize) { + __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); + __ verify_oop(temp); + } else { + __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); + __ decode_heap_oop(temp); // calls verify_oop + } + __ add(rscratch2, rscratch2, size); + __ b(loop); + __ bind(end); + } + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary + // ignored + // is_oop - true => oop array, so generate store check code + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let + // the hardware handle it. The two dwords within qwords that span + // cache line boundaries will still be loaded and stored atomicly. + // + // Side Effects: + // disjoint_int_copy_entry is set to the no-overlap entry point + // used by generate_conjoint_int_oop_copy(). 
+ // + address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry, + const char *name, bool dest_uninitialized = false) { + Register s = c_rarg0, d = c_rarg1, count = c_rarg2; + RegSet saved_reg = RegSet::of(s, d, count); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + __ enter(); + + if (entry != NULL) { + *entry = __ pc(); + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) + BLOCK_COMMENT("Entry:"); + } + + if (is_oop) { + gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_reg); + // save regs before copy_memory + __ push(RegSet::of(d, count), sp); + } + copy_memory(aligned, s, d, count, rscratch1, size); + if (is_oop) { + __ pop(RegSet::of(d, count), sp); + if (VerifyOops) + verify_oop_array(size, d, count, r16); + __ sub(count, count, 1); // make an inclusive end pointer + __ lea(count, Address(d, count, Address::lsl(exact_log2(size)))); + gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet()); + } + __ leave(); + __ mov(r0, zr); // return 0 + __ ret(lr); +#ifdef BUILTIN_SIM + { + AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); + sim->notifyCompile(const_cast(name), start); + } +#endif + return start; + } + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary + // ignored + // is_oop - true => oop array, so generate store check code + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let + // the hardware handle it. The two dwords within qwords that span + // cache line boundaries will still be loaded and stored atomicly. 
+ // + address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, + address *entry, const char *name, + bool dest_uninitialized = false) { + Register s = c_rarg0, d = c_rarg1, count = c_rarg2; + RegSet saved_regs = RegSet::of(s, d, count); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + __ enter(); + + if (entry != NULL) { + *entry = __ pc(); + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) + BLOCK_COMMENT("Entry:"); + } + + // use fwd copy when (d-s) above_equal (count*size) + __ sub(rscratch1, d, s); + __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); + __ br(Assembler::HS, nooverlap_target); + + if (is_oop) { + gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_regs); + // save regs before copy_memory + __ push(RegSet::of(d, count), sp); + } + copy_memory(aligned, s, d, count, rscratch1, -size); + if (is_oop) { + __ pop(RegSet::of(d, count), sp); + if (VerifyOops) + verify_oop_array(size, d, count, r16); + __ sub(count, count, 1); // make an inclusive end pointer + __ lea(count, Address(d, count, Address::lsl(exact_log2(size)))); + gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet()); + } + __ leave(); + __ mov(r0, zr); // return 0 + __ ret(lr); +#ifdef BUILTIN_SIM + { + AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); + sim->notifyCompile(const_cast(name), start); + } +#endif + return start; +} + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary + // ignored + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, + // we let the hardware handle it. 
The one to eight bytes within words, + // dwords or qwords that span cache line boundaries will still be loaded + // and stored atomically. + // + // Side Effects: + // disjoint_byte_copy_entry is set to the no-overlap entry point // + // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, + // we let the hardware handle it. The one to eight bytes within words, + // dwords or qwords that span cache line boundaries will still be loaded + // and stored atomically. + // + // Side Effects: + // disjoint_byte_copy_entry is set to the no-overlap entry point + // used by generate_conjoint_byte_copy(). + // + address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { + const bool not_oop = false; + return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); + } + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary + // ignored + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, + // we let the hardware handle it. The one to eight bytes within words, + // dwords or qwords that span cache line boundaries will still be loaded + // and stored atomically. 
+ // + address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, + address* entry, const char *name) { + const bool not_oop = false; + return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); + } + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary + // ignored + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we + // let the hardware handle it. The two or four words within dwords + // or qwords that span cache line boundaries will still be loaded + // and stored atomically. + // + // Side Effects: + // disjoint_short_copy_entry is set to the no-overlap entry point + // used by generate_conjoint_short_copy(). + // + address generate_disjoint_short_copy(bool aligned, + address* entry, const char *name) { + const bool not_oop = false; + return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); + } + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary + // ignored + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we + // let the hardware handle it. The two or four words within dwords + // or qwords that span cache line boundaries will still be loaded + // and stored atomically. 
+ // + address generate_conjoint_short_copy(bool aligned, address nooverlap_target, + address *entry, const char *name) { + const bool not_oop = false; + return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); + + } + // Arguments: + // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary + // ignored + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let + // the hardware handle it. The two dwords within qwords that span + // cache line boundaries will still be loaded and stored atomicly. + // + // Side Effects: + // disjoint_int_copy_entry is set to the no-overlap entry point + // used by generate_conjoint_int_oop_copy(). + // + address generate_disjoint_int_copy(bool aligned, address *entry, + const char *name, bool dest_uninitialized = false) { + const bool not_oop = false; + return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); + } + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary + // ignored + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let + // the hardware handle it. The two dwords within qwords that span + // cache line boundaries will still be loaded and stored atomicly. 
+ // + address generate_conjoint_int_copy(bool aligned, address nooverlap_target, + address *entry, const char *name, + bool dest_uninitialized = false) { + const bool not_oop = false; + return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); + } + + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes + // ignored + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as size_t, can be zero + // + // Side Effects: + // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the + // no-overlap entry point used by generate_conjoint_long_oop_copy(). + // + address generate_disjoint_long_copy(bool aligned, address *entry, + const char *name, bool dest_uninitialized = false) { + const bool not_oop = false; + return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); + } + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes + // ignored + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as size_t, can be zero + // + address generate_conjoint_long_copy(bool aligned, + address nooverlap_target, address *entry, + const char *name, bool dest_uninitialized = false) { + const bool not_oop = false; + return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); + } + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes + // ignored + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as size_t, can be zero + // + // Side Effects: + // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the + // no-overlap entry 
//   point used by generate_conjoint_long_oop_copy().
//
// Thin wrapper: forwards to generate_disjoint_copy with is_oop == true.
// The element size is sizeof (jint) when UseCompressedOops is set and
// sizeof (jlong) otherwise.
//
address generate_disjoint_oop_copy(bool aligned, address *entry,
                                   const char *name, bool dest_uninitialized) {
  const bool is_oop = true;
  const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
  return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
}

// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as size_t, can be zero
//
// Thin wrapper: forwards to generate_conjoint_copy with is_oop == true.
// The element size is sizeof (jint) when UseCompressedOops is set and
// sizeof (jlong) otherwise.
//
address generate_conjoint_oop_copy(bool aligned,
                                   address nooverlap_target, address *entry,
                                   const char *name, bool dest_uninitialized) {
  const bool is_oop = true;
  const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
  return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                name, dest_uninitialized);
}


// Helper for generating a dynamic type check.
// Smashes rscratch1.
//
// Emits the fast-path subtype check followed by the slow-path check.
// Both are handed L_success as their success target; a miss ends up at
// L_miss, which is bound at the end of this helper, so the code emitted
// by the caller immediately afterwards is the failure path.
void generate_type_check(Register sub_klass,
                         Register super_check_offset,
                         Register super_klass,
                         Label& L_success) {
  assert_different_registers(sub_klass, super_check_offset, super_klass);

  BLOCK_COMMENT("type_check:");

  Label L_miss;

  __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                   super_check_offset);
  __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

  // Fall through on failure!
  __ BIND(L_miss);
}

//
//  Generate checkcasting array copy stub
//
//  Input:
//    c_rarg0   - source array address
//    c_rarg1   - destination array address
//    c_rarg2   - element count, treated as ssize_t, can be zero
//    c_rarg3   - size_t ckoff (super_check_offset)
//    c_rarg4   - oop ckval (super_klass)
//
//  Output:
//    r0 ==  0  -  success
//    r0 == -1^K - failure, where K is partial transfer count
//
//  Returns the address of the first instruction of the stub.  If 'entry'
//  is non-NULL it receives the address just after the frame setup (and
//  after the debug-only overlap assertion).
//
address generate_checkcast_copy(const char *name, address *entry,
                                bool dest_uninitialized = false) {

  Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

  // Input registers (after setup_arg_regs)
  const Register from = c_rarg0; // source array address
  const Register to = c_rarg1; // destination array address
  const Register count = c_rarg2; // element count
  const Register ckoff = c_rarg3; // super_check_offset
  const Register ckval = c_rarg4; // super_klass

  RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
  RegSet wb_post_saved_regs = RegSet::of(count);

  // Registers used as temps; r18-r21 are pushed before the loop and
  // popped again at L_done_pop.
  const Register count_save = r21; // original element count
  const Register start_to = r20; // destination array start address
  const Register copied_oop = r18; // actual oop copied
  const Register r19_klass = r19; // oop._klass

  //---------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the two arrays are subtypes of Object[] but the
  // destination array type is not equal to or a supertype
  // of the source type.  Each element must be separately
  // checked.

  assert_different_registers(from, to, count, ckoff, ckval, start_to,
                             copied_oop, r19_klass, count_save);

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
  // caller guarantees that the arrays really are different
  // otherwise, we would have to make conjoint checks
  { Label L;
    array_overlap_test(L, TIMES_OOP);
    __ stop("checkcast_copy within a single array");
    __ bind(L);
  }
#endif //ASSERT

  // Caller of this entry point must set up the argument registers.
  if (entry != NULL) {
    *entry = __ pc();
    BLOCK_COMMENT("Entry:");
  }

  // Empty array:  Nothing to do.
  // (Branches past the pop at L_done_pop: nothing has been pushed yet.)
  __ cbz(count, L_done);

  __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
  BLOCK_COMMENT("assert consistent ckoff/ckval");
  // The ckoff and ckval must be mutually consistent,
  // even though caller generates both.
  { Label L;
    int sco_offset = in_bytes(Klass::super_check_offset_offset());
    // start_to is free to use as a scratch register here; it is only
    // given its real value further down.
    __ ldrw(start_to, Address(ckval, sco_offset));
    __ cmpw(ckoff, start_to);
    __ br(Assembler::EQ, L);
    __ stop("super_check_offset inconsistent");
    __ bind(L);
  }
#endif //ASSERT

  gen_write_ref_array_pre_barrier(to, count, dest_uninitialized, wb_pre_saved_regs);

  // save the original count
  __ mov(count_save, count);

  // Copy from low to high addresses
  __ mov(start_to, to); // Save destination array start address
  __ b(L_load_element);

  // ======== begin loop ========
  // (Loop is rotated; its entry is L_load_element.)
  // Loop control:
  //   for (; count != 0; count--) {
  //     copied_oop = load_heap_oop(from++);
  //     ... generate_type_check ...;
  //     store_heap_oop(to++, copied_oop);
  //   }
  __ align(OptoLoopAlignment);

  __ BIND(L_store_element);
  __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop
  __ sub(count, count, 1);
  __ cbz(count, L_do_card_marks); // all elements copied: count == 0 becomes the result

  // ======== loop entry is here ========
  __ BIND(L_load_element);
  __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
  __ cbz(copied_oop, L_store_element); // a null element is stored without a type check

  __ load_klass(r19_klass, copied_oop);// query the object klass
  generate_type_check(r19_klass, ckoff, ckval, L_store_element);
  // ======== end loop ========

  // It was a real error; we must depend on the caller to finish the job.
  // Register count = remaining oops, count_save = total oops.
  // Emit GC store barriers for the oops we have copied and report
  // their number to the caller.

  // The EQ test below consumes the flags set by subs: when K == 0 nothing
  // was stored, so the card marking is skipped.
  __ subs(count, count_save, count); // K = partially copied oop count
  __ eon(count, count, zr); // report (-1^K) to caller
  __ br(Assembler::EQ, L_done_pop);

  __ BIND(L_do_card_marks);
  __ add(to, to, -heapOopSize); // make an inclusive end pointer
  gen_write_ref_array_post_barrier(start_to, to, rscratch1, wb_post_saved_regs);

  __ bind(L_done_pop);
  __ pop(RegSet::of(r18, r19, r20, r21), sp);
  inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

  __ bind(L_done);
  __ mov(r0, count); // 0 on success, -1^K on failure
  __ leave();
  __ ret(lr);

  return start;
}

// Perform range checks on the proposed arraycopy.
// Kills temp, but nothing else.
// Also, clean the sign bits of src_pos and dst_pos.
+ void arraycopy_range_checks(Register src, // source array oop (c_rarg0) + Register src_pos, // source position (c_rarg1) + Register dst, // destination array oo (c_rarg2) + Register dst_pos, // destination position (c_rarg3) + Register length, + Register temp, + Label& L_failed) { + BLOCK_COMMENT("arraycopy_range_checks:"); + + assert_different_registers(rscratch1, temp); + + // if (src_pos + length > arrayOop(src)->length()) FAIL; + __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); + __ addw(temp, length, src_pos); + __ cmpw(temp, rscratch1); + __ br(Assembler::HI, L_failed); + + // if (dst_pos + length > arrayOop(dst)->length()) FAIL; + __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); + __ addw(temp, length, dst_pos); + __ cmpw(temp, rscratch1); + __ br(Assembler::HI, L_failed); + + // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. + __ movw(src_pos, src_pos); + __ movw(dst_pos, dst_pos); + + BLOCK_COMMENT("arraycopy_range_checks done"); + } + + // These stubs get called from some dumb test routine. + // I'll write them properly when they're called from + // something that's actually doing something. + static void fake_arraycopy_stub(address src, address dst, int count) { + assert(count == 0, "huh?"); + } + + + // + // Generate 'unsafe' array copy stub + // Though just as safe as the other stubs, it takes an unscaled + // size_t argument instead of an element count. + // + // Input: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - byte count, treated as ssize_t, can be zero + // + // Examines the alignment of the operands and dispatches + // to a long, int, short, or byte copy loop. 
+ // + address generate_unsafe_copy(const char *name, + address byte_copy_entry, + address short_copy_entry, + address int_copy_entry, + address long_copy_entry) { + Label L_long_aligned, L_int_aligned, L_short_aligned; + Register s = c_rarg0, d = c_rarg1, count = c_rarg2; + + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + __ enter(); // required for proper stackwalking of RuntimeStub frame + + // bump this on entry, not on exit: + inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); + + __ orr(rscratch1, s, d); + __ orr(rscratch1, rscratch1, count); + + __ andr(rscratch1, rscratch1, BytesPerLong-1); + __ cbz(rscratch1, L_long_aligned); + __ andr(rscratch1, rscratch1, BytesPerInt-1); + __ cbz(rscratch1, L_int_aligned); + __ tbz(rscratch1, 0, L_short_aligned); + __ b(RuntimeAddress(byte_copy_entry)); + + __ BIND(L_short_aligned); + __ lsr(count, count, LogBytesPerShort); // size => short_count + __ b(RuntimeAddress(short_copy_entry)); + __ BIND(L_int_aligned); + __ lsr(count, count, LogBytesPerInt); // size => int_count + __ b(RuntimeAddress(int_copy_entry)); + __ BIND(L_long_aligned); + __ lsr(count, count, LogBytesPerLong); // size => long_count + __ b(RuntimeAddress(long_copy_entry)); + + return start; + } + + // + // Generate generic array copy stubs + // + // Input: + // c_rarg0 - src oop + // c_rarg1 - src_pos (32-bits) + // c_rarg2 - dst oop + // c_rarg3 - dst_pos (32-bits) + // c_rarg4 - element count (32-bits) + // + // Output: + // r0 == 0 - success + // r0 == -1^K - failure, where K is partial transfer count + // + address generate_generic_copy(const char *name, + address byte_copy_entry, address short_copy_entry, + address int_copy_entry, address oop_copy_entry, + address long_copy_entry, address checkcast_copy_entry) { + + Label L_failed, L_failed_0, L_objArray; + Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; + + // Input registers + const Register src = c_rarg0; // source 
array oop + const Register src_pos = c_rarg1; // source position + const Register dst = c_rarg2; // destination array oop + const Register dst_pos = c_rarg3; // destination position + const Register length = c_rarg4; + + StubCodeMark mark(this, "StubRoutines", name); + + __ align(CodeEntryAlignment); + address start = __ pc(); + + __ enter(); // required for proper stackwalking of RuntimeStub frame + + // bump this on entry, not on exit: + inc_counter_np(SharedRuntime::_generic_array_copy_ctr); + + //----------------------------------------------------------------------- + // Assembler stub will be used for this call to arraycopy + // if the following conditions are met: + // + // (1) src and dst must not be null. + // (2) src_pos must not be negative. + // (3) dst_pos must not be negative. + // (4) length must not be negative. + // (5) src klass and dst klass should be the same and not NULL. + // (6) src and dst should be arrays. + // (7) src_pos + length must not exceed length of src. + // (8) dst_pos + length must not exceed length of dst. + // + + // if (src == NULL) return -1; + __ cbz(src, L_failed); + + // if (src_pos < 0) return -1; + __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set + + // if (dst == NULL) return -1; + __ cbz(dst, L_failed); + + // if (dst_pos < 0) return -1; + __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set + + // registers used as temp + const Register scratch_length = r16; // elements count to copy + const Register scratch_src_klass = r17; // array klass + const Register lh = r18; // layout helper + + // if (length < 0) return -1; + __ movw(scratch_length, length); // length (elements count, 32-bits value) + __ tbnz(scratch_length, 31, L_failed); // i.e. 
sign bit set + + __ load_klass(scratch_src_klass, src); +#ifdef ASSERT + // assert(src->klass() != NULL); + { + BLOCK_COMMENT("assert klasses not null {"); + Label L1, L2; + __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL + __ bind(L1); + __ stop("broken null klass"); + __ bind(L2); + __ load_klass(rscratch1, dst); + __ cbz(rscratch1, L1); // this would be broken also + BLOCK_COMMENT("} assert klasses not null done"); + } +#endif + + // Load layout helper (32-bits) + // + // |array_tag| | header_size | element_type | |log2_element_size| + // 32 30 24 16 8 2 0 + // + // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 + // + + const int lh_offset = in_bytes(Klass::layout_helper_offset()); + + // Handle objArrays completely differently... + const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); + __ ldrw(lh, Address(scratch_src_klass, lh_offset)); + __ movw(rscratch1, objArray_lh); + __ eorw(rscratch2, lh, rscratch1); + __ cbzw(rscratch2, L_objArray); + + // if (src->klass() != dst->klass()) return -1; + __ load_klass(rscratch2, dst); + __ eor(rscratch2, rscratch2, scratch_src_klass); + __ cbnz(rscratch2, L_failed); + + // if (!src->is_Array()) return -1; + __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) + + // At this point, it is known to be a typeArray (array_tag 0x3). 
+#ifdef ASSERT + { + BLOCK_COMMENT("assert primitive array {"); + Label L; + __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); + __ cmpw(lh, rscratch2); + __ br(Assembler::GE, L); + __ stop("must be a primitive array"); + __ bind(L); + BLOCK_COMMENT("} assert primitive array done"); + } +#endif + + arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, + rscratch2, L_failed); + + // TypeArrayKlass + // + // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); + // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); + // + + const Register rscratch1_offset = rscratch1; // array offset + const Register r18_elsize = lh; // element size + + __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, + exact_log2(Klass::_lh_header_size_mask+1)); // array_offset + __ add(src, src, rscratch1_offset); // src array offset + __ add(dst, dst, rscratch1_offset); // dst array offset + BLOCK_COMMENT("choose copy loop based on element size"); + + // next registers should be set before the jump to corresponding stub + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register count = c_rarg2; // elements count + + // 'from', 'to', 'count' registers should be set in such order + // since they are the same as 'src', 'src_pos', 'dst'. + + assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); + + // The possible values of elsize are 0-3, i.e. exact_log2(element + // size in bytes). We do a simple bitwise binary search. 
+ __ BIND(L_copy_bytes); + __ tbnz(r18_elsize, 1, L_copy_ints); + __ tbnz(r18_elsize, 0, L_copy_shorts); + __ lea(from, Address(src, src_pos));// src_addr + __ lea(to, Address(dst, dst_pos));// dst_addr + __ movw(count, scratch_length); // length + __ b(RuntimeAddress(byte_copy_entry)); + + __ BIND(L_copy_shorts); + __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr + __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr + __ movw(count, scratch_length); // length + __ b(RuntimeAddress(short_copy_entry)); + + __ BIND(L_copy_ints); + __ tbnz(r18_elsize, 0, L_copy_longs); + __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr + __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr + __ movw(count, scratch_length); // length + __ b(RuntimeAddress(int_copy_entry)); + + __ BIND(L_copy_longs); +#ifdef ASSERT + { + BLOCK_COMMENT("assert long copy {"); + Label L; + __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize + __ cmpw(r18_elsize, LogBytesPerLong); + __ br(Assembler::EQ, L); + __ stop("must be long copy, but elsize is wrong"); + __ bind(L); + BLOCK_COMMENT("} assert long copy done"); + } +#endif + __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr + __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr + __ movw(count, scratch_length); // length + __ b(RuntimeAddress(long_copy_entry)); + + // ObjArrayKlass + __ BIND(L_objArray); + // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] + + Label L_plain_copy, L_checkcast_copy; + // test array classes for subtyping + __ load_klass(r18, dst); + __ cmp(scratch_src_klass, r18); // usual case is exact equality + __ br(Assembler::NE, L_checkcast_copy); + + // Identically typed arrays can be copied without element-wise checks. 
+ arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, + rscratch2, L_failed); + + __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); + __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); + __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); + __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); + __ movw(count, scratch_length); // length + __ BIND(L_plain_copy); + __ b(RuntimeAddress(oop_copy_entry)); + + __ BIND(L_checkcast_copy); + // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) + { + // Before looking at dst.length, make sure dst is also an objArray. + __ ldrw(rscratch1, Address(r18, lh_offset)); + __ movw(rscratch2, objArray_lh); + __ eorw(rscratch1, rscratch1, rscratch2); + __ cbnzw(rscratch1, L_failed); + + // It is safe to examine both src.length and dst.length. + arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, + r18, L_failed); + + const Register rscratch2_dst_klass = rscratch2; + __ load_klass(rscratch2_dst_klass, dst); // reload + + // Marshal the base address arguments now, freeing registers. + __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); + __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); + __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); + __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); + __ movw(count, length); // length (reloaded) + Register sco_temp = c_rarg3; // this register is free now + assert_different_registers(from, to, count, sco_temp, + rscratch2_dst_klass, scratch_src_klass); + // assert_clean_int(count, sco_temp); + + // Generate the type check. 
+ const int sco_offset = in_bytes(Klass::super_check_offset_offset()); + __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); + // assert_clean_int(sco_temp, r18); + generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); + + // Fetch destination element klass from the ObjArrayKlass header. + int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); + __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); + __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); + + // the checkcast_copy loop needs two extra arguments: + assert(c_rarg3 == sco_temp, "#3 already in place"); + // Set up arguments for checkcast_copy_entry. + __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass + __ b(RuntimeAddress(checkcast_copy_entry)); + } + + __ BIND(L_failed); + __ mov(r0, -1); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(lr); + + return start; + } + + // + // Generate stub for array fill. If "aligned" is true, the + // "to" address is assumed to be heapword aligned. 
+ // + // Arguments for generated stub: + // to: c_rarg0 + // value: c_rarg1 + // count: c_rarg2 treated as signed + // + address generate_fill(BasicType t, bool aligned, const char *name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + BLOCK_COMMENT("Entry:"); + + const Register to = c_rarg0; // source array address + const Register value = c_rarg1; // value + const Register count = c_rarg2; // elements count + + const Register bz_base = r10; // base for block_zero routine + const Register cnt_words = r11; // temp register + + __ enter(); + + Label L_fill_elements, L_exit1; + + int shift = -1; + switch (t) { + case T_BYTE: + shift = 0; + __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element + __ bfi(value, value, 8, 8); // 8 bit -> 16 bit + __ bfi(value, value, 16, 16); // 16 bit -> 32 bit + __ br(Assembler::LO, L_fill_elements); + break; + case T_SHORT: + shift = 1; + __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element + __ bfi(value, value, 16, 16); // 16 bit -> 32 bit + __ br(Assembler::LO, L_fill_elements); + break; + case T_INT: + shift = 2; + __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element + __ br(Assembler::LO, L_fill_elements); + break; + default: ShouldNotReachHere(); + } + + // Align source address at 8 bytes address boundary. + Label L_skip_align1, L_skip_align2, L_skip_align4; + if (!aligned) { + switch (t) { + case T_BYTE: + // One byte misalignment happens only for byte arrays. + __ tbz(to, 0, L_skip_align1); + __ strb(value, Address(__ post(to, 1))); + __ subw(count, count, 1); + __ bind(L_skip_align1); + // Fallthrough + case T_SHORT: + // Two bytes misalignment happens only for byte and short (char) arrays. 
+ __ tbz(to, 1, L_skip_align2); + __ strh(value, Address(__ post(to, 2))); + __ subw(count, count, 2 >> shift); + __ bind(L_skip_align2); + // Fallthrough + case T_INT: + // Align to 8 bytes, we know we are 4 byte aligned to start. + __ tbz(to, 2, L_skip_align4); + __ strw(value, Address(__ post(to, 4))); + __ subw(count, count, 4 >> shift); + __ bind(L_skip_align4); + break; + default: ShouldNotReachHere(); + } + } + + // + // Fill large chunks + // + __ lsrw(cnt_words, count, 3 - shift); // number of words + __ bfi(value, value, 32, 32); // 32 bit -> 64 bit + __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); + if (UseBlockZeroing) { + Label non_block_zeroing, rest; + // If the fill value is zero we can use the fast zero_words(). + __ cbnz(value, non_block_zeroing); + __ mov(bz_base, to); + __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); + __ zero_words(bz_base, cnt_words); + __ b(rest); + __ bind(non_block_zeroing); + __ fill_words(to, cnt_words, value); + __ bind(rest); + } else { + __ fill_words(to, cnt_words, value); + } + + // Remaining count is less than 8 bytes. Fill it by a single store. + // Note that the total length is no less than 8 bytes. + if (t == T_BYTE || t == T_SHORT) { + Label L_exit1; + __ cbzw(count, L_exit1); + __ add(to, to, count, Assembler::LSL, shift); // points to the end + __ str(value, Address(to, -8)); // overwrite some elements + __ bind(L_exit1); + __ leave(); + __ ret(lr); + } + + // Handle copies less than 8 bytes. 
+ Label L_fill_2, L_fill_4, L_exit2; + __ bind(L_fill_elements); + switch (t) { + case T_BYTE: + __ tbz(count, 0, L_fill_2); + __ strb(value, Address(__ post(to, 1))); + __ bind(L_fill_2); + __ tbz(count, 1, L_fill_4); + __ strh(value, Address(__ post(to, 2))); + __ bind(L_fill_4); + __ tbz(count, 2, L_exit2); + __ strw(value, Address(to)); + break; + case T_SHORT: + __ tbz(count, 0, L_fill_4); + __ strh(value, Address(__ post(to, 2))); + __ bind(L_fill_4); + __ tbz(count, 1, L_exit2); + __ strw(value, Address(to)); + break; + case T_INT: + __ cbzw(count, L_exit2); + __ strw(value, Address(to)); + break; + default: ShouldNotReachHere(); + } + __ bind(L_exit2); + __ leave(); + __ ret(lr); + return start; + } + + void generate_arraycopy_stubs() { + address entry; + address entry_jbyte_arraycopy; + address entry_jshort_arraycopy; + address entry_jint_arraycopy; + address entry_oop_arraycopy; + address entry_jlong_arraycopy; + address entry_checkcast_arraycopy; + + generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); + generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); + + StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); + + //*** jbyte + // Always need aligned and unaligned versions + StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, + "jbyte_disjoint_arraycopy"); + StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, + &entry_jbyte_arraycopy, + "jbyte_arraycopy"); + StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, + "arrayof_jbyte_disjoint_arraycopy"); + StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, + "arrayof_jbyte_arraycopy"); + + //*** jshort + // Always need aligned and unaligned versions + StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, + "jshort_disjoint_arraycopy"); + StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, + 
&entry_jshort_arraycopy, + "jshort_arraycopy"); + StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, + "arrayof_jshort_disjoint_arraycopy"); + StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, + "arrayof_jshort_arraycopy"); + + //*** jint + // Aligned versions + StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, + "arrayof_jint_disjoint_arraycopy"); + StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, + "arrayof_jint_arraycopy"); + // In 64 bit we need both aligned and unaligned versions of jint arraycopy. + // entry_jint_arraycopy always points to the unaligned version + StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, + "jint_disjoint_arraycopy"); + StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, + &entry_jint_arraycopy, + "jint_arraycopy"); + + //*** jlong + // It is always aligned + StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, + "arrayof_jlong_disjoint_arraycopy"); + StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, + "arrayof_jlong_arraycopy"); + StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; + StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; + + //*** oops + { + // With compressed oops we need unaligned versions; notice that + // we overwrite entry_oop_arraycopy. 
+ bool aligned = !UseCompressedOops; + + StubRoutines::_arrayof_oop_disjoint_arraycopy + = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", + /*dest_uninitialized*/false); + StubRoutines::_arrayof_oop_arraycopy + = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", + /*dest_uninitialized*/false); + // Aligned versions without pre-barriers + StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit + = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", + /*dest_uninitialized*/true); + StubRoutines::_arrayof_oop_arraycopy_uninit + = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", + /*dest_uninitialized*/true); + } + + StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; + StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; + StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; + StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; + + StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); + StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, + /*dest_uninitialized*/true); + + StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", + entry_jbyte_arraycopy, + entry_jshort_arraycopy, + entry_jint_arraycopy, + entry_jlong_arraycopy); + + StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", + entry_jbyte_arraycopy, + entry_jshort_arraycopy, + entry_jint_arraycopy, + entry_oop_arraycopy, + entry_jlong_arraycopy, + entry_checkcast_arraycopy); + + StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); + StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); + StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); + 
StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); + StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); + StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); + } + + void generate_math_stubs() { Unimplemented(); } + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + address generate_aescrypt_encryptBlock() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); + + Label L_doLast; + + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register keylen = rscratch1; + + address start = __ pc(); + __ enter(); + + __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + + __ ld1(v0, __ T16B, from); // get 16 bytes of input + + __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); + __ rev32(v1, __ T16B, v1); + __ rev32(v2, __ T16B, v2); + __ rev32(v3, __ T16B, v3); + __ rev32(v4, __ T16B, v4); + __ aese(v0, v1); + __ aesmc(v0, v0); + __ aese(v0, v2); + __ aesmc(v0, v0); + __ aese(v0, v3); + __ aesmc(v0, v0); + __ aese(v0, v4); + __ aesmc(v0, v0); + + __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); + __ rev32(v1, __ T16B, v1); + __ rev32(v2, __ T16B, v2); + __ rev32(v3, __ T16B, v3); + __ rev32(v4, __ T16B, v4); + __ aese(v0, v1); + __ aesmc(v0, v0); + __ aese(v0, v2); + __ aesmc(v0, v0); + __ aese(v0, v3); + __ aesmc(v0, v0); + __ aese(v0, v4); + __ aesmc(v0, v0); + + __ ld1(v1, v2, __ T16B, __ post(key, 32)); + __ rev32(v1, __ T16B, v1); + __ rev32(v2, __ T16B, v2); + + __ cmpw(keylen, 44); + __ br(Assembler::EQ, L_doLast); + + __ aese(v0, v1); + __ aesmc(v0, v0); + __ aese(v0, v2); + __ aesmc(v0, v0); + + __ 
ld1(v1, v2, __ T16B, __ post(key, 32)); + __ rev32(v1, __ T16B, v1); + __ rev32(v2, __ T16B, v2); + + __ cmpw(keylen, 52); + __ br(Assembler::EQ, L_doLast); + + __ aese(v0, v1); + __ aesmc(v0, v0); + __ aese(v0, v2); + __ aesmc(v0, v0); + + __ ld1(v1, v2, __ T16B, __ post(key, 32)); + __ rev32(v1, __ T16B, v1); + __ rev32(v2, __ T16B, v2); + + __ BIND(L_doLast); + + __ aese(v0, v1); + __ aesmc(v0, v0); + __ aese(v0, v2); + + __ ld1(v1, __ T16B, key); + __ rev32(v1, __ T16B, v1); + __ eor(v0, __ T16B, v0, v1); + + __ st1(v0, __ T16B, to); + + __ mov(r0, 0); + + __ leave(); + __ ret(lr); + + return start; + } + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + address generate_aescrypt_decryptBlock() { + assert(UseAES, "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); + Label L_doLast; + + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register keylen = rscratch1; + + address start = __ pc(); + __ enter(); // required for proper stackwalking of RuntimeStub frame + + __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + + __ ld1(v0, __ T16B, from); // get 16 bytes of input + + __ ld1(v5, __ T16B, __ post(key, 16)); + __ rev32(v5, __ T16B, v5); + + __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); + __ rev32(v1, __ T16B, v1); + __ rev32(v2, __ T16B, v2); + __ rev32(v3, __ T16B, v3); + __ rev32(v4, __ T16B, v4); + __ aesd(v0, v1); + __ aesimc(v0, v0); + __ aesd(v0, v2); + __ aesimc(v0, v0); + __ aesd(v0, v3); + __ aesimc(v0, v0); + __ aesd(v0, v4); + __ aesimc(v0, v0); + + __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); + __ rev32(v1, __ T16B, v1); + 
__ rev32(v2, __ T16B, v2); + __ rev32(v3, __ T16B, v3); + __ rev32(v4, __ T16B, v4); + __ aesd(v0, v1); + __ aesimc(v0, v0); + __ aesd(v0, v2); + __ aesimc(v0, v0); + __ aesd(v0, v3); + __ aesimc(v0, v0); + __ aesd(v0, v4); + __ aesimc(v0, v0); + + __ ld1(v1, v2, __ T16B, __ post(key, 32)); + __ rev32(v1, __ T16B, v1); + __ rev32(v2, __ T16B, v2); + + __ cmpw(keylen, 44); + __ br(Assembler::EQ, L_doLast); + + __ aesd(v0, v1); + __ aesimc(v0, v0); + __ aesd(v0, v2); + __ aesimc(v0, v0); + + __ ld1(v1, v2, __ T16B, __ post(key, 32)); + __ rev32(v1, __ T16B, v1); + __ rev32(v2, __ T16B, v2); + + __ cmpw(keylen, 52); + __ br(Assembler::EQ, L_doLast); + + __ aesd(v0, v1); + __ aesimc(v0, v0); + __ aesd(v0, v2); + __ aesimc(v0, v0); + + __ ld1(v1, v2, __ T16B, __ post(key, 32)); + __ rev32(v1, __ T16B, v1); + __ rev32(v2, __ T16B, v2); + + __ BIND(L_doLast); + + __ aesd(v0, v1); + __ aesimc(v0, v0); + __ aesd(v0, v2); + + __ eor(v0, __ T16B, v0, v5); + + __ st1(v0, __ T16B, to); + + __ mov(r0, 0); + + __ leave(); + __ ret(lr); + + return start; + } + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - r vector byte array address + // c_rarg4 - input length + // + // Output: + // x0 - input length + // + address generate_cipherBlockChaining_encryptAESCrypt() { + assert(UseAES, "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); + + Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; + + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register rvec = c_rarg3; // r byte array initialized from initvector array address + // and left with the results of the last encryption block + 
const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) + const Register keylen = rscratch1; + + address start = __ pc(); + + __ enter(); + + __ movw(rscratch2, len_reg); + + __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + + __ ld1(v0, __ T16B, rvec); + + __ cmpw(keylen, 52); + __ br(Assembler::CC, L_loadkeys_44); + __ br(Assembler::EQ, L_loadkeys_52); + + __ ld1(v17, v18, __ T16B, __ post(key, 32)); + __ rev32(v17, __ T16B, v17); + __ rev32(v18, __ T16B, v18); + __ BIND(L_loadkeys_52); + __ ld1(v19, v20, __ T16B, __ post(key, 32)); + __ rev32(v19, __ T16B, v19); + __ rev32(v20, __ T16B, v20); + __ BIND(L_loadkeys_44); + __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); + __ rev32(v21, __ T16B, v21); + __ rev32(v22, __ T16B, v22); + __ rev32(v23, __ T16B, v23); + __ rev32(v24, __ T16B, v24); + __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); + __ rev32(v25, __ T16B, v25); + __ rev32(v26, __ T16B, v26); + __ rev32(v27, __ T16B, v27); + __ rev32(v28, __ T16B, v28); + __ ld1(v29, v30, v31, __ T16B, key); + __ rev32(v29, __ T16B, v29); + __ rev32(v30, __ T16B, v30); + __ rev32(v31, __ T16B, v31); + + __ BIND(L_aes_loop); + __ ld1(v1, __ T16B, __ post(from, 16)); + __ eor(v0, __ T16B, v0, v1); + + __ br(Assembler::CC, L_rounds_44); + __ br(Assembler::EQ, L_rounds_52); + + __ aese(v0, v17); __ aesmc(v0, v0); + __ aese(v0, v18); __ aesmc(v0, v0); + __ BIND(L_rounds_52); + __ aese(v0, v19); __ aesmc(v0, v0); + __ aese(v0, v20); __ aesmc(v0, v0); + __ BIND(L_rounds_44); + __ aese(v0, v21); __ aesmc(v0, v0); + __ aese(v0, v22); __ aesmc(v0, v0); + __ aese(v0, v23); __ aesmc(v0, v0); + __ aese(v0, v24); __ aesmc(v0, v0); + __ aese(v0, v25); __ aesmc(v0, v0); + __ aese(v0, v26); __ aesmc(v0, v0); + __ aese(v0, v27); __ aesmc(v0, v0); + __ aese(v0, v28); __ aesmc(v0, v0); + __ aese(v0, v29); __ aesmc(v0, v0); + __ aese(v0, v30); + __ eor(v0, __ T16B, v0, v31); + + __ st1(v0, 
__ T16B, __ post(to, 16)); + + __ subw(len_reg, len_reg, 16); + __ cbnzw(len_reg, L_aes_loop); + + __ st1(v0, __ T16B, rvec); + + __ mov(r0, rscratch2); + + __ leave(); + __ ret(lr); + + return start; + } + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // c_rarg3 - r vector byte array address + // c_rarg4 - input length + // + // Output: + // r0 - input length + // + address generate_cipherBlockChaining_decryptAESCrypt() { + assert(UseAES, "need AES instructions and misaligned SSE support"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); + + Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; + + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register rvec = c_rarg3; // r byte array initialized from initvector array address + // and left with the results of the last encryption block + const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) + const Register keylen = rscratch1; + + address start = __ pc(); + + __ enter(); + + __ movw(rscratch2, len_reg); + + __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); + + __ ld1(v2, __ T16B, rvec); + + __ ld1(v31, __ T16B, __ post(key, 16)); + __ rev32(v31, __ T16B, v31); + + __ cmpw(keylen, 52); + __ br(Assembler::CC, L_loadkeys_44); + __ br(Assembler::EQ, L_loadkeys_52); + + __ ld1(v17, v18, __ T16B, __ post(key, 32)); + __ rev32(v17, __ T16B, v17); + __ rev32(v18, __ T16B, v18); + __ BIND(L_loadkeys_52); + __ ld1(v19, v20, __ T16B, __ post(key, 32)); + __ rev32(v19, __ T16B, v19); + __ rev32(v20, __ T16B, v20); + __ BIND(L_loadkeys_44); + __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); + __ 
rev32(v21, __ T16B, v21); + __ rev32(v22, __ T16B, v22); + __ rev32(v23, __ T16B, v23); + __ rev32(v24, __ T16B, v24); + __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); + __ rev32(v25, __ T16B, v25); + __ rev32(v26, __ T16B, v26); + __ rev32(v27, __ T16B, v27); + __ rev32(v28, __ T16B, v28); + __ ld1(v29, v30, __ T16B, key); + __ rev32(v29, __ T16B, v29); + __ rev32(v30, __ T16B, v30); + + __ BIND(L_aes_loop); + __ ld1(v0, __ T16B, __ post(from, 16)); + __ orr(v1, __ T16B, v0, v0); + + __ br(Assembler::CC, L_rounds_44); + __ br(Assembler::EQ, L_rounds_52); + + __ aesd(v0, v17); __ aesimc(v0, v0); + __ aesd(v0, v18); __ aesimc(v0, v0); + __ BIND(L_rounds_52); + __ aesd(v0, v19); __ aesimc(v0, v0); + __ aesd(v0, v20); __ aesimc(v0, v0); + __ BIND(L_rounds_44); + __ aesd(v0, v21); __ aesimc(v0, v0); + __ aesd(v0, v22); __ aesimc(v0, v0); + __ aesd(v0, v23); __ aesimc(v0, v0); + __ aesd(v0, v24); __ aesimc(v0, v0); + __ aesd(v0, v25); __ aesimc(v0, v0); + __ aesd(v0, v26); __ aesimc(v0, v0); + __ aesd(v0, v27); __ aesimc(v0, v0); + __ aesd(v0, v28); __ aesimc(v0, v0); + __ aesd(v0, v29); __ aesimc(v0, v0); + __ aesd(v0, v30); + __ eor(v0, __ T16B, v0, v31); + __ eor(v0, __ T16B, v0, v2); + + __ st1(v0, __ T16B, __ post(to, 16)); + __ orr(v2, __ T16B, v1, v1); + + __ subw(len_reg, len_reg, 16); + __ cbnzw(len_reg, L_aes_loop); + + __ st1(v2, __ T16B, rvec); + + __ mov(r0, rscratch2); + + __ leave(); + __ ret(lr); + + return start; + } + + // Arguments: + // + // Inputs: + // c_rarg0 - byte[] source+offset + // c_rarg1 - int[] SHA.state + // c_rarg2 - int offset + // c_rarg3 - int limit + // + address generate_sha1_implCompress(bool multi_block, const char *name) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + Register buf = c_rarg0; + Register state = c_rarg1; + Register ofs = c_rarg2; + Register limit = c_rarg3; + + Label keys; + Label sha1_loop; + + // load the keys into v0..v3 + __ 
adr(rscratch1, keys); + __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); + // load 5 words state into v6, v7 + __ ldrq(v6, Address(state, 0)); + __ ldrs(v7, Address(state, 16)); + + + __ BIND(sha1_loop); + // load 64 bytes of data into v16..v19 + __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); + __ rev32(v16, __ T16B, v16); + __ rev32(v17, __ T16B, v17); + __ rev32(v18, __ T16B, v18); + __ rev32(v19, __ T16B, v19); + + // do the sha1 + __ addv(v4, __ T4S, v16, v0); + __ orr(v20, __ T16B, v6, v6); + + FloatRegister d0 = v16; + FloatRegister d1 = v17; + FloatRegister d2 = v18; + FloatRegister d3 = v19; + + for (int round = 0; round < 20; round++) { + FloatRegister tmp1 = (round & 1) ? v4 : v5; + FloatRegister tmp2 = (round & 1) ? v21 : v22; + FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; + FloatRegister tmp4 = (round & 1) ? v5 : v4; + FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); + + if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); + if (round < 19) __ addv(tmp1, __ T4S, d1, key); + __ sha1h(tmp2, __ T4S, v20); + if (round < 5) + __ sha1c(v20, __ T4S, tmp3, tmp4); + else if (round < 10 || round >= 15) + __ sha1p(v20, __ T4S, tmp3, tmp4); + else + __ sha1m(v20, __ T4S, tmp3, tmp4); + if (round < 16) __ sha1su1(d0, __ T4S, d3); + + tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; + } + + __ addv(v7, __ T2S, v7, v21); + __ addv(v6, __ T4S, v6, v20); + + if (multi_block) { + __ add(ofs, ofs, 64); + __ cmp(ofs, limit); + __ br(Assembler::LE, sha1_loop); + __ mov(c_rarg0, ofs); // return ofs + } + + __ strq(v6, Address(state, 0)); + __ strs(v7, Address(state, 16)); + + __ ret(lr); + + __ bind(keys); + __ emit_int32(0x5a827999); + __ emit_int32(0x6ed9eba1); + __ emit_int32(0x8f1bbcdc); + __ emit_int32(0xca62c1d6); + + return start; + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - byte[] source+offset + // c_rarg1 - int[] SHA.state + // c_rarg2 - int offset + // c_rarg3 - int limit + 
// + address generate_sha256_implCompress(bool multi_block, const char *name) { + static const uint32_t round_consts[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, + }; + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + Register buf = c_rarg0; + Register state = c_rarg1; + Register ofs = c_rarg2; + Register limit = c_rarg3; + + Label sha1_loop; + + __ stpd(v8, v9, __ pre(sp, -32)); + __ stpd(v10, v11, Address(sp, 16)); + +// dga == v0 +// dgb == v1 +// dg0 == v2 +// dg1 == v3 +// dg2 == v4 +// t0 == v6 +// t1 == v7 + + // load 16 keys to v16..v31 + __ lea(rscratch1, ExternalAddress((address)round_consts)); + __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); + __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); + __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); + __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); + + // load 8 words (256 bits) state + __ ldpq(v0, v1, state); + + __ BIND(sha1_loop); + // load 64 bytes of data into v8..v11 + __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); + __ rev32(v8, __ T16B, v8); + __ rev32(v9, __ T16B, v9); + __ rev32(v10, __ T16B, v10); + __ rev32(v11, __ T16B, v11); + + __ addv(v6, __ T4S, v8, v16); + __ orr(v2, __ T16B, v0, v0); + __ orr(v3, __ T16B, v1, v1); + + FloatRegister d0 = v8; + FloatRegister d1 = v9; + FloatRegister d2 = v10; + FloatRegister d3 = v11; + + + for (int round = 0; round < 16; round++) { + FloatRegister tmp1 = (round & 1) ? v6 : v7; + FloatRegister tmp2 = (round & 1) ? v7 : v6; + FloatRegister tmp3 = (round & 1) ? v2 : v4; + FloatRegister tmp4 = (round & 1) ? v4 : v2; + + if (round < 12) __ sha256su0(d0, __ T4S, d1); + __ orr(v4, __ T16B, v2, v2); + if (round < 15) + __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); + __ sha256h(v2, __ T4S, v3, tmp2); + __ sha256h2(v3, __ T4S, v4, tmp2); + if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); + + tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; + } + + __ addv(v0, __ T4S, v0, v2); + __ addv(v1, __ T4S, v1, v3); + + if (multi_block) { + __ add(ofs, ofs, 64); + __ cmp(ofs, limit); + __ br(Assembler::LE, sha1_loop); + __ mov(c_rarg0, ofs); // return ofs + } + + __ ldpd(v10, v11, Address(sp, 16)); + __ ldpd(v8, v9, __ post(sp, 32)); + + __ stpq(v0, v1, state); + + __ ret(lr); + + return start; + } + +#ifndef BUILTIN_SIM + // Safefetch stubs. + void generate_safefetch(const char* name, int size, address* entry, + address* fault_pc, address* continuation_pc) { + // safefetch signatures: + // int SafeFetch32(int* adr, int errValue); + // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); + // + // arguments: + // c_rarg0 = adr + // c_rarg1 = errValue + // + // result: + // PPC_RET = *adr or errValue + + StubCodeMark mark(this, "StubRoutines", name); + + // Entry point, pc or function descriptor. + *entry = __ pc(); + + // Load *adr into c_rarg1, may fault. 
*fault_pc = __ pc();  // address of the load that may fault
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    // NOTE(review): presumably the fault handler resumes here with c_rarg1
    // still holding errValue - confirm against the signal handling code.
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }
#endif

  /**
   * Emits the updateBytesCRC32 stub: a frame-setup wrapper around
   * kernel_crc32. Returns the stub's entry address.
   *
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *        r0   - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc   = c_rarg0;  // crc
    const Register buf   = c_rarg1;  // source java byte array address
    const Register len   = c_rarg2;  // length
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3 = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len,
              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   * Emits the updateBytesCRC32C stub: a frame-setup wrapper around
   * kernel_crc32c. Returns the stub's entry address.
   *
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *   c_rarg3   - int* table
   *
   * Output:
   *       r0   - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");

    address start = __ pc();

    const Register crc   = c_rarg0;  // crc
    const Register buf   = c_rarg1;  // source java byte array address
    const Register len   = c_rarg2;  // length
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3 = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32c(crc, buf, len,
              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /***
   *  Emits the updateBytesAdler32 stub and returns its entry address.
   *
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int   len
   *
   * Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    // Note that adler and s1 share c_rarg0: s2 must be extracted from adler
    // before s1 overwrites it (see the ubfx/uxth pair below).
    Register adler  = c_rarg0;
    Register s1     = c_rarg0;
    Register s2     = c_rarg3;
    Register buff   = c_rarg1;
    Register len    = c_rarg2;
    Register nmax  = r4;
    Register base = r5;
    Register count = r6;
    Register temp0 = rscratch1;
    Register temp1 = rscratch2;
    Register temp2 = r7;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    unsigned long BASE = 0xfff1;
    unsigned long NMAX = 0x15B0;

    __ mov(base, BASE);
    __ mov(nmax, NMAX);

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
    __ uxth(s1, adler);          // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    __ cmp(len, 16);
    __ br(Assembler::HS, L_nmax);
    __ cbz(len, L_combine);

    // Fewer than 16 bytes: accumulate one byte at a time.
    __ bind(L_simple_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, s1, temp0);
    __ add(s2, s2, s1);
__ subs(len, len, 1);
    __ br(Assembler::HI, L_simple_by1_loop);

    // s1 = s1 % BASE
    // (a single conditional subtract of BASE)
    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    // Fold first: with BASE == 0xfff1, 2^16 == 15 (mod BASE), so
    // x == 15 * (x >> 16) + (x & 0xffff) (mod BASE). The lsl/sub pair
    // computes 15 * temp0 as (temp0 << 4) - temp0.
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(s2, temp1, s2, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ b(L_combine);

    // At least 16 bytes: process chunks of nmax bytes, 16 bytes per inner
    // iteration, reducing s1/s2 modulo BASE after each chunk.
    __ bind(L_nmax);
    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::LO, L_by16);

    __ bind(L_nmax_loop);

    __ ldp(temp0, temp1, Address(__ post(buff, 16)));

    // Bytes 0..7 (temp0), lowest byte first: s1 += byte; s2 += s1.
    __ add(s1, s1, temp0, ext::uxtb);
    __ ubfx(temp2, temp0, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp0, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    // Bytes 8..15 (temp1), same pattern.
    __ add(s1, s1, temp1, ext::uxtb);
    __ ubfx(temp2, temp1, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp1, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ subs(count, count, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // s1 = s1 % BASE
    // Two folds (x -> 15 * (x >> 16) + (x & 0xffff)) followed by one
    // conditional subtract of BASE.
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // Fewer than nmax bytes remain: 16-byte steps without intermediate
    // reduction.
    __ bind(L_by16);
    __ adds(len, len, count);
    __ br(Assembler::LO, L_by1);

    __ bind(L_by16_loop);

    __ ldp(temp0, temp1, Address(__ post(buff, 16)));

    __ add(s1, s1, temp0, ext::uxtb);
    __ ubfx(temp2, temp0, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp0, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ add(s1, s1, temp1, ext::uxtb);
    __ ubfx(temp2, temp1, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp1, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ subs(len, len, 16);
    __ br(Assembler::HS, L_by16_loop);

    // Remaining tail of fewer than 16 bytes, one byte at a time.
    __ bind(L_by1);
    __ adds(len, len, 15);
    __ br(Assembler::LO, L_do_mod);

    __ bind(L_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HS, L_by1_loop);

    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Combine lower bits and higher bits
    __ bind(L_combine);
    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)

    __ ret(lr);

    return start;
  }

  /**
   * Emits the multiplyToLen stub: a frame-setup wrapper around
   * multiply_to_len. Returns the stub's entry address.
   *
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   *    c_rarg5   - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x     = r0;
    const Register xlen  = r1;
    const Register y     = r2;
    const Register ylen  = r3;
    const Register z     = r4;
    const Register zlen  = r5;

    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen,
tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  // Emits a 128x128 -> 256-bit carry-less multiplication of a by b.
  //
  //   result_lo, result_hi - receive the low and high 128 bits of the product
  //   a                    - first operand (the subkey in the caller below)
  //   b                    - second operand (the state in the caller below)
  //   a1_xor_a0            - precomputed A1+A0 in d[0]
  //   tmp1..tmp4           - scratch vector registers, clobbered
  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
    // Karatsuba multiplication performs a 128*128 -> 256-bit
    // multiplication in three 128-bit multiplications and a few
    // additions.
    //
    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
    //
    // Inputs:
    //
    // A0 in a.d[0]     (subkey)
    // A1 in a.d[1]
    // (A1+A0) in a1_xor_a0.d[0]
    //
    // B0 in b.d[0]     (state)
    // B1 in b.d[1]

    __ ext(tmp1, __ T16B, b, b, 0x08);
    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
    __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
    __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)

    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
    __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
    __ eor(tmp2, __ T16B, tmp2, tmp4);
    __ eor(tmp2, __ T16B, tmp2, tmp3);

    // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
    __ ins(result_hi, __ D, tmp2, 0, 1);
    __ ins(result_lo, __ D, tmp2, 1, 0);
  }

  // Emits the reduction of the 256-bit value hi:lo modulo the GCM field
  // polynomial, leaving the 128-bit remainder in result.
  //
  //   result - receives the reduced value; also used as scratch (t0)
  //   lo, hi - low and high halves of the product; both are modified
  //   p      - vector holding the low-order bits of the field polynomial
  //   z      - vector the caller has zeroed
  //   t1     - scratch vector register, clobbered
  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
                    FloatRegister p, FloatRegister z, FloatRegister t1) {
    const FloatRegister t0 = result;

    // The GCM field polynomial f is z^128 + p(z), where p =
    // z^7+z^2+z+1.
    //
    //    z^128 === -p(z)  (mod (z^128 + p(z)))
    //
    // so, given that the product we're reducing is
    //    a == lo + hi * z^128
    // substituting,
    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
    //
    // we reduce by multiplying hi by p(z) and subtracting the result
    // from (i.e. XORing it with) lo.
Because p has no nonzero high + // bits we can do this with two 64-bit multiplications, lo*p and + // hi*p. + + __ pmull2(t0, __ T1Q, hi, p, __ T2D); + __ ext(t1, __ T16B, t0, z, 8); + __ eor(hi, __ T16B, hi, t1); + __ ext(t1, __ T16B, z, t0, 8); + __ eor(lo, __ T16B, lo, t1); + __ pmull(t0, __ T1Q, hi, p, __ T1D); + __ eor(result, __ T16B, lo, t0); + } + + address generate_has_negatives(address &has_negatives_long) { + StubCodeMark mark(this, "StubRoutines", "has_negatives"); + const int large_loop_size = 64; + const uint64_t UPPER_BIT_MASK=0x8080808080808080; + int dcache_line = VM_Version::dcache_line_size(); + + Register ary1 = r1, len = r2, result = r0; + + __ align(CodeEntryAlignment); + address entry = __ pc(); + + __ enter(); + + Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, + LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; + + __ cmp(len, 15); + __ br(Assembler::GT, LEN_OVER_15); + // The only case when execution falls into this code is when pointer is near + // the end of memory page and we have to avoid reading next page + __ add(ary1, ary1, len); + __ subs(len, len, 8); + __ br(Assembler::GT, LEN_OVER_8); + __ ldr(rscratch2, Address(ary1, -8)); + __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
+ __ lsrv(rscratch2, rscratch2, rscratch1); + __ tst(rscratch2, UPPER_BIT_MASK); + __ cset(result, Assembler::NE); + __ leave(); + __ ret(lr); + __ bind(LEN_OVER_8); + __ ldp(rscratch1, rscratch2, Address(ary1, -16)); + __ sub(len, len, 8); // no data dep., then sub can be executed while loading + __ tst(rscratch2, UPPER_BIT_MASK); + __ br(Assembler::NE, RET_TRUE_NO_POP); + __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes + __ lsrv(rscratch1, rscratch1, rscratch2); + __ tst(rscratch1, UPPER_BIT_MASK); + __ cset(result, Assembler::NE); + __ leave(); + __ ret(lr); + + Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; + const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; + + has_negatives_long = __ pc(); // 2nd entry point + + __ enter(); + + __ bind(LEN_OVER_15); + __ push(spilled_regs, sp); + __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment + __ cbz(rscratch2, ALIGNED); + __ ldp(tmp6, tmp1, Address(ary1)); + __ mov(tmp5, 16); + __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address + __ add(ary1, ary1, rscratch1); + __ sub(len, len, rscratch1); + __ orr(tmp6, tmp6, tmp1); + __ tst(tmp6, UPPER_BIT_MASK); + __ br(Assembler::NE, RET_TRUE); + + __ bind(ALIGNED); + __ cmp(len, large_loop_size); + __ br(Assembler::LT, CHECK_16); + // Perform 16-byte load as early return in pre-loop to handle situation + // when initially aligned large array has negative values at starting bytes, + // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is + // slower. Cases with negative bytes further ahead won't be affected that + // much. In fact, it'll be faster due to early loads, less instructions and + // less branches in LARGE_LOOP. 
+ __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); + __ sub(len, len, 16); + __ orr(tmp6, tmp6, tmp1); + __ tst(tmp6, UPPER_BIT_MASK); + __ br(Assembler::NE, RET_TRUE); + __ cmp(len, large_loop_size); + __ br(Assembler::LT, CHECK_16); + + if (SoftwarePrefetchHintDistance >= 0 + && SoftwarePrefetchHintDistance >= dcache_line) { + // initial prefetch + __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); + } + __ bind(LARGE_LOOP); + if (SoftwarePrefetchHintDistance >= 0) { + __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); + } + // Issue load instructions first, since it can save few CPU/MEM cycles, also + // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) + // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 + // instructions per cycle and have less branches, but this approach disables + // early return, thus, all 64 bytes are loaded and checked every time. + __ ldp(tmp2, tmp3, Address(ary1)); + __ ldp(tmp4, tmp5, Address(ary1, 16)); + __ ldp(rscratch1, rscratch2, Address(ary1, 32)); + __ ldp(tmp6, tmp1, Address(ary1, 48)); + __ add(ary1, ary1, large_loop_size); + __ sub(len, len, large_loop_size); + __ orr(tmp2, tmp2, tmp3); + __ orr(tmp4, tmp4, tmp5); + __ orr(rscratch1, rscratch1, rscratch2); + __ orr(tmp6, tmp6, tmp1); + __ orr(tmp2, tmp2, tmp4); + __ orr(rscratch1, rscratch1, tmp6); + __ orr(tmp2, tmp2, rscratch1); + __ tst(tmp2, UPPER_BIT_MASK); + __ br(Assembler::NE, RET_TRUE); + __ cmp(len, large_loop_size); + __ br(Assembler::GE, LARGE_LOOP); + + __ bind(CHECK_16); // small 16-byte load pre-loop + __ cmp(len, 16); + __ br(Assembler::LT, POST_LOOP16); + + __ bind(LOOP16); // small 16-byte load loop + __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); + __ sub(len, len, 16); + __ orr(tmp2, tmp2, tmp3); + __ tst(tmp2, UPPER_BIT_MASK); + __ br(Assembler::NE, RET_TRUE); + __ cmp(len, 16); + __ br(Assembler::GE, LOOP16); // 16-byte load loop end + + __ bind(POST_LOOP16); // 16-byte aligned, so we can 
read unconditionally + __ cmp(len, 8); + __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); + __ ldr(tmp3, Address(__ post(ary1, 8))); + __ sub(len, len, 8); + __ tst(tmp3, UPPER_BIT_MASK); + __ br(Assembler::NE, RET_TRUE); + + __ bind(POST_LOOP16_LOAD_TAIL); + __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 + __ ldr(tmp1, Address(ary1)); + __ mov(tmp2, 64); + __ sub(tmp4, tmp2, len, __ LSL, 3); + __ lslv(tmp1, tmp1, tmp4); + __ tst(tmp1, UPPER_BIT_MASK); + __ br(Assembler::NE, RET_TRUE); + // Fallthrough + + __ bind(RET_FALSE); + __ pop(spilled_regs, sp); + __ leave(); + __ mov(result, zr); + __ ret(lr); + + __ bind(RET_TRUE); + __ pop(spilled_regs, sp); + __ bind(RET_TRUE_NO_POP); + __ leave(); + __ mov(result, 1); + __ ret(lr); + + __ bind(DONE); + __ pop(spilled_regs, sp); + __ leave(); + __ ret(lr); + return entry; + } + /** + * Arguments: + * + * Input: + * c_rarg0 - current state address + * c_rarg1 - H key address + * c_rarg2 - data address + * c_rarg3 - number of blocks + * + * Output: + * Updated state at c_rarg0 + */ + address generate_ghash_processBlocks() { + // Bafflingly, GCM uses little-endian for the byte order, but + // big-endian for the bit order. For example, the polynomial 1 is + // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. + // + // So, we must either reverse the bytes in each word and do + // everything big-endian or reverse the bits in each byte and do + // it little-endian. On AArch64 it's more idiomatic to reverse + // the bits in each byte (we have an instruction, RBIT, to do + // that) and keep the data in little-endian bit order throught the + // calculation, bit-reversing the inputs and outputs. + + StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); + __ align(wordSize * 2); + address p = __ pc(); + __ emit_int64(0x87); // The low-order bits of the field + // polynomial (i.e. 
// p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    // The stub's entry point comes after the inline constant emitted above.
    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ldrq(v26, p);                    // field polynomial constant emitted above

    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      // The loop body runs before the counter is tested, so at least one
      // block is always processed.
      // NOTE(review): blocks == 0 on entry would wrap the counter - assumes
      // the caller guarantees blocks >= 1; confirm.
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                     /*temps*/v6, v20, v18, v21);
      // Reduce v7:v5 by the field polynomial
      ghash_reduce(v0, v5, v7, v26, vzr, v20);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v1, __ T16B, v0);
    __ rbit(v1, __ T16B, v1);

    __ st1(v1, __ T16B, state);
    __ ret(lr);

    return start;
  }

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.
// If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.

  // From here on "__" emits through the local MacroAssembler named masm
  // rather than through _masm.
#undef __
#define __ masm->

  // Builds a RuntimeStub that calls runtime_entry and then jumps to the
  // forward-exception entry.
  //
  //   name          - name given to the CodeBuffer and the RuntimeStub
  //   runtime_entry - address of the runtime function to call; it receives
  //                   rthread in c_rarg0
  //   arg1, arg2    - optional registers whose values are passed in c_rarg1
  //                   and c_rarg2 (noreg means "not passed")
  //
  // Returns the entry point of the newly created RuntimeStub.
  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      rfp_off = 0,
      rfp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size  = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save FP and LR before call

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // lr and fp are already in place
    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    // the_pc is also the offset at which the oop map is registered below.
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

    // Call runtime
    if (arg1 != noreg) {
      // arg1 is moved into c_rarg1 first, so arg2 must not live there.
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ mov(rscratch1, runtime_entry);
    __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub->entry_point();
  }

  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      Register reg = c_rarg0;
      Pa_base = reg;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen= ++reg;
      inv = ++reg;
      Pm_base = ++reg;

                          // Working registers:
      Ra =  ++reg;        // The current digit of a, b, n, and m.
      Rb =  ++reg;
      Rm =  ++reg;
      Rn =  ++reg;

      Pa = ++reg;         // Pointers to the current/next digit of a, b, n, and m.
      Pb = ++reg;
      Pm = ++reg;
      Pn = ++reg;

      t0 = ++reg;         // Three registers which form a
      t1 = ++reg;         // triple-precision accumulator.
+ t2 = ++reg; + + Ri = ++reg; // Inner and outer loop indexes. + Rj = ++reg; + + Rhi_ab = ++reg; // Product registers: low and high parts + Rlo_ab = ++reg; // of a*b and m*n. + Rhi_mn = ++reg; + Rlo_mn = ++reg; + + // r19 and up are callee-saved. + _toSave = RegSet::range(r19, reg) + Pm_base; + } + + private: + void save_regs() { + push(_toSave, sp); + } + + void restore_regs() { + pop(_toSave, sp); + } + + template + void unroll_2(Register count, T block) { + Label loop, end, odd; + tbnz(count, 0, odd); + cbz(count, end); + align(16); + bind(loop); + (this->*block)(); + bind(odd); + (this->*block)(); + subs(count, count, 2); + br(Assembler::GT, loop); + bind(end); + } + + template + void unroll_2(Register count, T block, Register d, Register s, Register tmp) { + Label loop, end, odd; + tbnz(count, 0, odd); + cbz(count, end); + align(16); + bind(loop); + (this->*block)(d, s, tmp); + bind(odd); + (this->*block)(d, s, tmp); + subs(count, count, 2); + br(Assembler::GT, loop); + bind(end); + } + + void pre1(RegisterOrConstant i) { + block_comment("pre1"); + // Pa = Pa_base; + // Pb = Pb_base + i; + // Pm = Pm_base; + // Pn = Pn_base + i; + // Ra = *Pa; + // Rb = *Pb; + // Rm = *Pm; + // Rn = *Pn; + ldr(Ra, Address(Pa_base)); + ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); + ldr(Rm, Address(Pm_base)); + ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); + lea(Pa, Address(Pa_base)); + lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); + lea(Pm, Address(Pm_base)); + lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); + + // Zero the m*n result. + mov(Rhi_mn, zr); + mov(Rlo_mn, zr); + } + + // The core multiply-accumulate step of a Montgomery + // multiplication. The idea is to schedule operations as a + // pipeline so that instructions with long latencies (loads and + // multiplies) have time to complete before their results are + // used. 
This most benefits in-order implementations of the + // architecture but out-of-order ones also benefit. + void step() { + block_comment("step"); + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + umulh(Rhi_ab, Ra, Rb); + mul(Rlo_ab, Ra, Rb); + ldr(Ra, pre(Pa, wordSize)); + ldr(Rb, pre(Pb, -wordSize)); + acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the + // previous iteration. + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + umulh(Rhi_mn, Rm, Rn); + mul(Rlo_mn, Rm, Rn); + ldr(Rm, pre(Pm, wordSize)); + ldr(Rn, pre(Pn, -wordSize)); + acc(Rhi_ab, Rlo_ab, t0, t1, t2); + } + + void post1() { + block_comment("post1"); + + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + umulh(Rhi_ab, Ra, Rb); + mul(Rlo_ab, Ra, Rb); + acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n + acc(Rhi_ab, Rlo_ab, t0, t1, t2); + + // *Pm = Rm = t0 * inv; + mul(Rm, t0, inv); + str(Rm, Address(Pm)); + + // MACC(Rm, Rn, t0, t1, t2); + // t0 = t1; t1 = t2; t2 = 0; + umulh(Rhi_mn, Rm, Rn); + +#ifndef PRODUCT + // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); + { + mul(Rlo_mn, Rm, Rn); + add(Rlo_mn, t0, Rlo_mn); + Label ok; + cbz(Rlo_mn, ok); { + stop("broken Montgomery multiply"); + } bind(ok); + } +#endif + // We have very carefully set things up so that + // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate + // the lower half of Rm * Rn because we know the result already: + // it must be -t0. t0 + (-t0) must generate a carry iff + // t0 != 0. So, rather than do a mul and an adds we just set + // the carry flag iff t0 is nonzero. 
+ // + // mul(Rlo_mn, Rm, Rn); + // adds(zr, t0, Rlo_mn); + subs(zr, t0, 1); // Set carry iff t0 is nonzero + adcs(t0, t1, Rhi_mn); + adc(t1, t2, zr); + mov(t2, zr); + } + + void pre2(RegisterOrConstant i, RegisterOrConstant len) { + block_comment("pre2"); + // Pa = Pa_base + i-len; + // Pb = Pb_base + len; + // Pm = Pm_base + i-len; + // Pn = Pn_base + len; + + if (i.is_register()) { + sub(Rj, i.as_register(), len); + } else { + mov(Rj, i.as_constant()); + sub(Rj, Rj, len); + } + // Rj == i-len + + lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); + lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); + lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); + lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); + + // Ra = *++Pa; + // Rb = *--Pb; + // Rm = *++Pm; + // Rn = *--Pn; + ldr(Ra, pre(Pa, wordSize)); + ldr(Rb, pre(Pb, -wordSize)); + ldr(Rm, pre(Pm, wordSize)); + ldr(Rn, pre(Pn, -wordSize)); + + mov(Rhi_mn, zr); + mov(Rlo_mn, zr); + } + + void post2(RegisterOrConstant i, RegisterOrConstant len) { + block_comment("post2"); + if (i.is_constant()) { + mov(Rj, i.as_constant()-len.as_constant()); + } else { + sub(Rj, i.as_register(), len); + } + + adds(t0, t0, Rlo_mn); // The pending m*n, low part + + // As soon as we know the least significant digit of our result, + // store it. + // Pm_base[i-len] = t0; + str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); + + // t0 = t1; t1 = t2; t2 = 0; + adcs(t0, t1, Rhi_mn); // The pending m*n, high part + adc(t1, t2, zr); + mov(t2, zr); + } + + // A carry in t0 after Montgomery multiplication means that we + // should subtract multiples of n from our result in m. We'll + // keep doing that until there is no carry. 
+ void normalize(RegisterOrConstant len) { + block_comment("normalize"); + // while (t0) + // t0 = sub(Pm_base, Pn_base, t0, len); + Label loop, post, again; + Register cnt = t1, i = t2; // Re-use registers; we're done with them now + cbz(t0, post); { + bind(again); { + mov(i, zr); + mov(cnt, len); + ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); + ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); + subs(zr, zr, zr); // set carry flag, i.e. no borrow + align(16); + bind(loop); { + sbcs(Rm, Rm, Rn); + str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); + add(i, i, 1); + ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); + ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); + sub(cnt, cnt, 1); + } cbnz(cnt, loop); + sbc(t0, t0, zr); + } cbnz(t0, again); + } bind(post); + } + + // Move memory at s to d, reversing words. + // Increments d to end of copied memory + // Destroys tmp1, tmp2 + // Preserves len + // Leaves s pointing to the address which was in d at start + void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { + assert(tmp1 < r19 && tmp2 < r19, "register corruption"); + + lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); + mov(tmp1, len); + unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); + sub(s, d, len, ext::uxtw, LogBytesPerWord); + } + // where + void reverse1(Register d, Register s, Register tmp) { + ldr(tmp, pre(s, -wordSize)); + ror(tmp, tmp, 32); + str(tmp, post(d, wordSize)); + } + + void step_squaring() { + // An extra ACC + step(); + acc(Rhi_ab, Rlo_ab, t0, t1, t2); + } + + void last_squaring(RegisterOrConstant i) { + Label dont; + // if ((i & 1) == 0) { + tbnz(i.as_register(), 0, dont); { + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + umulh(Rhi_ab, Ra, Rb); + mul(Rlo_ab, Ra, Rb); + acc(Rhi_ab, Rlo_ab, t0, t1, t2); + } bind(dont); + } + + void extra_step_squaring() { + acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending 
m*n + + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + umulh(Rhi_mn, Rm, Rn); + mul(Rlo_mn, Rm, Rn); + ldr(Rm, pre(Pm, wordSize)); + ldr(Rn, pre(Pn, -wordSize)); + } + + void post1_squaring() { + acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n + + // *Pm = Rm = t0 * inv; + mul(Rm, t0, inv); + str(Rm, Address(Pm)); + + // MACC(Rm, Rn, t0, t1, t2); + // t0 = t1; t1 = t2; t2 = 0; + umulh(Rhi_mn, Rm, Rn); + +#ifndef PRODUCT + // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); + { + mul(Rlo_mn, Rm, Rn); + add(Rlo_mn, t0, Rlo_mn); + Label ok; + cbz(Rlo_mn, ok); { + stop("broken Montgomery multiply"); + } bind(ok); + } +#endif + // We have very carefully set things up so that + // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate + // the lower half of Rm * Rn because we know the result already: + // it must be -t0. t0 + (-t0) must generate a carry iff + // t0 != 0. So, rather than do a mul and an adds we just set + // the carry flag iff t0 is nonzero. + // + // mul(Rlo_mn, Rm, Rn); + // adds(zr, t0, Rlo_mn); + subs(zr, t0, 1); // Set carry iff t0 is nonzero + adcs(t0, t1, Rhi_mn); + adc(t1, t2, zr); + mov(t2, zr); + } + + void acc(Register Rhi, Register Rlo, + Register t0, Register t1, Register t2) { + adds(t0, t0, Rlo); + adcs(t1, t1, Rhi); + adc(t2, t2, zr); + } + + public: + /** + * Fast Montgomery multiplication. The derivation of the + * algorithm is in A Cryptographic Library for the Motorola + * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
+ * + * Arguments: + * + * Inputs for multiplication: + * c_rarg0 - int array elements a + * c_rarg1 - int array elements b + * c_rarg2 - int array elements n (the modulus) + * c_rarg3 - int length + * c_rarg4 - int inv + * c_rarg5 - int array elements m (the result) + * + * Inputs for squaring: + * c_rarg0 - int array elements a + * c_rarg1 - int array elements n (the modulus) + * c_rarg2 - int length + * c_rarg3 - int inv + * c_rarg4 - int array elements m (the result) + * + */ + address generate_multiply() { + Label argh, nothing; + bind(argh); + stop("MontgomeryMultiply total_allocation must be <= 8192"); + + align(CodeEntryAlignment); + address entry = pc(); + + cbzw(Rlen, nothing); + + enter(); + + // Make room. + cmpw(Rlen, 512); + br(Assembler::HI, argh); + sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); + andr(sp, Ra, -2 * wordSize); + + lsrw(Rlen, Rlen, 1); // length in longwords = len/2 + + { + // Copy input args, reversing as we go. We use Ra as a + // temporary variable. + reverse(Ra, Pa_base, Rlen, t0, t1); + if (!_squaring) + reverse(Ra, Pb_base, Rlen, t0, t1); + reverse(Ra, Pn_base, Rlen, t0, t1); + } + + // Push all call-saved registers and also Pm_base which we'll need + // at the end. 
+ save_regs(); + +#ifndef PRODUCT + // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); + { + ldr(Rn, Address(Pn_base, 0)); + mul(Rlo_mn, Rn, inv); + cmp(Rlo_mn, -1); + Label ok; + br(EQ, ok); { + stop("broken inverse in Montgomery multiply"); + } bind(ok); + } +#endif + + mov(Pm_base, Ra); + + mov(t0, zr); + mov(t1, zr); + mov(t2, zr); + + block_comment("for (int i = 0; i < len; i++) {"); + mov(Ri, zr); { + Label loop, end; + cmpw(Ri, Rlen); + br(Assembler::GE, end); + + bind(loop); + pre1(Ri); + + block_comment(" for (j = i; j; j--) {"); { + movw(Rj, Ri); + unroll_2(Rj, &MontgomeryMultiplyGenerator::step); + } block_comment(" } // j"); + + post1(); + addw(Ri, Ri, 1); + cmpw(Ri, Rlen); + br(Assembler::LT, loop); + bind(end); + block_comment("} // i"); + } + + block_comment("for (int i = len; i < 2*len; i++) {"); + mov(Ri, Rlen); { + Label loop, end; + cmpw(Ri, Rlen, Assembler::LSL, 1); + br(Assembler::GE, end); + + bind(loop); + pre2(Ri, Rlen); + + block_comment(" for (j = len*2-i-1; j; j--) {"); { + lslw(Rj, Rlen, 1); + subw(Rj, Rj, Ri); + subw(Rj, Rj, 1); + unroll_2(Rj, &MontgomeryMultiplyGenerator::step); + } block_comment(" } // j"); + + post2(Ri, Rlen); + addw(Ri, Ri, 1); + cmpw(Ri, Rlen, Assembler::LSL, 1); + br(Assembler::LT, loop); + bind(end); + } + block_comment("} // i"); + + normalize(Rlen); + + mov(Ra, Pm_base); // Save Pm_base in Ra + restore_regs(); // Restore caller's Pm_base + + // Copy our result into caller's Pm_base + reverse(Pm_base, Ra, Rlen, t0, t1); + + leave(); + bind(nothing); + ret(lr); + + return entry; + } + // In C, approximately: + + // void + // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], + // unsigned long Pn_base[], unsigned long Pm_base[], + // unsigned long inv, int len) { + // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator + // unsigned long *Pa, *Pb, *Pn, *Pm; + // unsigned long Ra, Rb, Rn, Rm; + + // int i; + + // assert(inv * Pn_base[0] == -1UL, "broken 
inverse in Montgomery multiply"); + + // for (i = 0; i < len; i++) { + // int j; + + // Pa = Pa_base; + // Pb = Pb_base + i; + // Pm = Pm_base; + // Pn = Pn_base + i; + + // Ra = *Pa; + // Rb = *Pb; + // Rm = *Pm; + // Rn = *Pn; + + // int iters = i; + // for (j = 0; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + + // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); + // MACC(Ra, Rb, t0, t1, t2); + // *Pm = Rm = t0 * inv; + // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + + // assert(t0 == 0, "broken Montgomery multiply"); + + // t0 = t1; t1 = t2; t2 = 0; + // } + + // for (i = len; i < 2*len; i++) { + // int j; + + // Pa = Pa_base + i-len; + // Pb = Pb_base + len; + // Pm = Pm_base + i-len; + // Pn = Pn_base + len; + + // Ra = *++Pa; + // Rb = *--Pb; + // Rm = *++Pm; + // Rn = *--Pn; + + // int iters = len*2-i-1; + // for (j = i-len+1; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + + // Pm_base[i-len] = t0; + // t0 = t1; t1 = t2; t2 = 0; + // } + + // while (t0) + // t0 = sub(Pm_base, Pn_base, t0, len); + // } + + /** + * Fast Montgomery squaring. This uses asymptotically 25% fewer + * multiplies than Montgomery multiplication so it should be up to + * 25% faster. However, its loop control is more complex and it + * may actually run slower on some machines. 
+ * + * Arguments: + * + * Inputs: + * c_rarg0 - int array elements a + * c_rarg1 - int array elements n (the modulus) + * c_rarg2 - int length + * c_rarg3 - int inv + * c_rarg4 - int array elements m (the result) + * + */ + address generate_square() { + Label argh; + bind(argh); + stop("MontgomeryMultiply total_allocation must be <= 8192"); + + align(CodeEntryAlignment); + address entry = pc(); + + enter(); + + // Make room. + cmpw(Rlen, 512); + br(Assembler::HI, argh); + sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); + andr(sp, Ra, -2 * wordSize); + + lsrw(Rlen, Rlen, 1); // length in longwords = len/2 + + { + // Copy input args, reversing as we go. We use Ra as a + // temporary variable. + reverse(Ra, Pa_base, Rlen, t0, t1); + reverse(Ra, Pn_base, Rlen, t0, t1); + } + + // Push all call-saved registers and also Pm_base which we'll need + // at the end. + save_regs(); + + mov(Pm_base, Ra); + + mov(t0, zr); + mov(t1, zr); + mov(t2, zr); + + block_comment("for (int i = 0; i < len; i++) {"); + mov(Ri, zr); { + Label loop, end; + bind(loop); + cmp(Ri, Rlen); + br(Assembler::GE, end); + + pre1(Ri); + + block_comment("for (j = (i+1)/2; j; j--) {"); { + add(Rj, Ri, 1); + lsr(Rj, Rj, 1); + unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); + } block_comment(" } // j"); + + last_squaring(Ri); + + block_comment(" for (j = i/2; j; j--) {"); { + lsr(Rj, Ri, 1); + unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); + } block_comment(" } // j"); + + post1_squaring(); + add(Ri, Ri, 1); + cmp(Ri, Rlen); + br(Assembler::LT, loop); + + bind(end); + block_comment("} // i"); + } + + block_comment("for (int i = len; i < 2*len; i++) {"); + mov(Ri, Rlen); { + Label loop, end; + bind(loop); + cmp(Ri, Rlen, Assembler::LSL, 1); + br(Assembler::GE, end); + + pre2(Ri, Rlen); + + block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { + lsl(Rj, Rlen, 1); + sub(Rj, Rj, Ri); + sub(Rj, Rj, 1); + lsr(Rj, Rj, 1); + unroll_2(Rj, 
&MontgomeryMultiplyGenerator::step_squaring); + } block_comment(" } // j"); + + last_squaring(Ri); + + block_comment(" for (j = (2*len-i)/2; j; j--) {"); { + lsl(Rj, Rlen, 1); + sub(Rj, Rj, Ri); + lsr(Rj, Rj, 1); + unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); + } block_comment(" } // j"); + + post2(Ri, Rlen); + add(Ri, Ri, 1); + cmp(Ri, Rlen, Assembler::LSL, 1); + + br(Assembler::LT, loop); + bind(end); + block_comment("} // i"); + } + + normalize(Rlen); + + mov(Ra, Pm_base); // Save Pm_base in Ra + restore_regs(); // Restore caller's Pm_base + + // Copy our result into caller's Pm_base + reverse(Pm_base, Ra, Rlen, t0, t1); + + leave(); + ret(lr); + + return entry; + } + // In C, approximately: + + // void + // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], + // unsigned long Pm_base[], unsigned long inv, int len) { + // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator + // unsigned long *Pa, *Pb, *Pn, *Pm; + // unsigned long Ra, Rb, Rn, Rm; + + // int i; + + // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); + + // for (i = 0; i < len; i++) { + // int j; + + // Pa = Pa_base; + // Pb = Pa_base + i; + // Pm = Pm_base; + // Pn = Pn_base + i; + + // Ra = *Pa; + // Rb = *Pb; + // Rm = *Pm; + // Rn = *Pn; + + // int iters = (i+1)/2; + // for (j = 0; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); + // MACC2(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + // if ((i & 1) == 0) { + // assert(Ra == Pa_base[j], "must be"); + // MACC(Ra, Ra, t0, t1, t2); + // } + // iters = i/2; + // assert(iters == i-j, "must be"); + // for (; iters--; j++) { + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + + // *Pm = Rm = t0 * inv; + // 
assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + + // assert(t0 == 0, "broken Montgomery multiply"); + + // t0 = t1; t1 = t2; t2 = 0; + // } + + // for (i = len; i < 2*len; i++) { + // int start = i-len+1; + // int end = start + (len - start)/2; + // int j; + + // Pa = Pa_base + i-len; + // Pb = Pa_base + len; + // Pm = Pm_base + i-len; + // Pn = Pn_base + len; + + // Ra = *++Pa; + // Rb = *--Pb; + // Rm = *++Pm; + // Rn = *--Pn; + + // int iters = (2*len-i-1)/2; + // assert(iters == end-start, "must be"); + // for (j = start; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); + // MACC2(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + // if ((i & 1) == 0) { + // assert(Ra == Pa_base[j], "must be"); + // MACC(Ra, Ra, t0, t1, t2); + // } + // iters = (2*len-i)/2; + // assert(iters == len-j, "must be"); + // for (; iters--; j++) { + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + // Pm_base[i-len] = t0; + // t0 = t1; t1 = t2; t2 = 0; + // } + + // while (t0) + // t0 = sub(Pm_base, Pn_base, t0, len); + // } + }; + + + // Initialization + void generate_initial() { + // Generate initial stubs and initializes the entry points + + // entry points that exist in all platforms Note: This is code + // that could be shared among different platforms - however the + // benefit seems to be smaller than the disadvantage of having a + // much more complicated generator structure. See also comment in + // stubRoutines.hpp. 
+ + StubRoutines::_forward_exception_entry = generate_forward_exception(); + + StubRoutines::_call_stub_entry = + generate_call_stub(StubRoutines::_call_stub_return_address); + + // is referenced by megamorphic call + StubRoutines::_catch_exception_entry = generate_catch_exception(); + + // Build this early so it's available for the interpreter. + StubRoutines::_throw_StackOverflowError_entry = + generate_throw_exception("StackOverflowError throw_exception", + CAST_FROM_FN_PTR(address, + SharedRuntime::throw_StackOverflowError)); + StubRoutines::_throw_delayed_StackOverflowError_entry = + generate_throw_exception("delayed StackOverflowError throw_exception", + CAST_FROM_FN_PTR(address, + SharedRuntime::throw_delayed_StackOverflowError)); + if (UseCRC32Intrinsics) { + // set table address before stub generation which use it + StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; + StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); + } + } + + void generate_all() { + // support for verify_oop (must happen after universe_init) + StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); + StubRoutines::_throw_AbstractMethodError_entry = + generate_throw_exception("AbstractMethodError throw_exception", + CAST_FROM_FN_PTR(address, + SharedRuntime:: + throw_AbstractMethodError)); + + StubRoutines::_throw_IncompatibleClassChangeError_entry = + generate_throw_exception("IncompatibleClassChangeError throw_exception", + CAST_FROM_FN_PTR(address, + SharedRuntime:: + throw_IncompatibleClassChangeError)); + + StubRoutines::_throw_NullPointerException_at_call_entry = + generate_throw_exception("NullPointerException at call throw_exception", + CAST_FROM_FN_PTR(address, + SharedRuntime:: + throw_NullPointerException_at_call)); + + // arraycopy stubs used by compilers + generate_arraycopy_stubs(); + + // has negatives stub for large arrays. 
+ StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); + + if (UseMultiplyToLenIntrinsic) { + StubRoutines::_multiplyToLen = generate_multiplyToLen(); + } + + if (UseMontgomeryMultiplyIntrinsic) { + StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); + MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); + StubRoutines::_montgomeryMultiply = g.generate_multiply(); + } + + if (UseMontgomerySquareIntrinsic) { + StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); + MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); + // We use generate_multiply() rather than generate_square() + // because it's faster for the sizes of modulus we care about. + StubRoutines::_montgomerySquare = g.generate_multiply(); + } + +#ifndef BUILTIN_SIM + // generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); + } + + if (UseAESIntrinsics) { + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); + } + + if (UseSHA1Intrinsics) { + StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); + StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); + } + if (UseSHA256Intrinsics) { + StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); + StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); + } + + if (UseCRC32CIntrinsics) { + StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); + } + + // generate Adler32 intrinsics code + if (UseAdler32Intrinsics) { + 
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); + } + + // Safefetch stubs. + generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, + &StubRoutines::_safefetch32_fault_pc, + &StubRoutines::_safefetch32_continuation_pc); + generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, + &StubRoutines::_safefetchN_fault_pc, + &StubRoutines::_safefetchN_continuation_pc); +#endif + StubRoutines::aarch64::set_completed(); + } + + public: + StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { + if (all) { + generate_all(); + } else { + generate_initial(); + } + } +}; // end class declaration + +void StubGenerator_generate(CodeBuffer* code, bool all) { + StubGenerator g(code, all); +}