8206459: [s390] Prevent restoring incorrect bcp and locals in interpreter and avoid incorrect size of partialSubtypeCheckNode in C2
Reviewed-by: goetz
/*
* Copyright (c) 2016, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2017, SAP SE. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/
#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "registerSaver_s390.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "interpreter/interp_masm.hpp"
#include "nativeInst_s390.hpp"
#include "oops/instanceOop.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.
#ifdef PRODUCT
#define __ _masm->
#else
#define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
#endif
#define BLOCK_COMMENT(str) if (PrintAssembly) __ block_comment(str)
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// -----------------------------------------------------------------------
// Stub Code definitions
class StubGenerator: public StubCodeGenerator {
private:
//----------------------------------------------------------------------
// Call stubs are used to call Java from C.
//
// Arguments:
//
// R2 - call wrapper address : address
// R3 - result : intptr_t*
// R4 - result type : BasicType
// R5 - method : method
// R6 - frame mgr entry point : address
// [SP+160] - parameter block : intptr_t*
// [SP+172] - parameter count in words : int
// [SP+176] - thread : Thread*
//
address generate_call_stub(address& return_address) {
// Set up a new C frame, copy Java arguments, call frame manager
// or native_entry, and process result.
StubCodeMark mark(this, "StubRoutines", "call_stub");
address start = __ pc();
Register r_arg_call_wrapper_addr = Z_ARG1;
Register r_arg_result_addr = Z_ARG2;
Register r_arg_result_type = Z_ARG3;
Register r_arg_method = Z_ARG4;
Register r_arg_entry = Z_ARG5;
// offsets to fp
#define d_arg_thread 176
#define d_arg_argument_addr 160
#define d_arg_argument_count 168+4
Register r_entryframe_fp = Z_tmp_1;
Register r_top_of_arguments_addr = Z_ARG4;
Register r_new_arg_entry = Z_R14;
// macros for frame offsets
#define call_wrapper_address_offset \
_z_entry_frame_locals_neg(call_wrapper_address)
#define result_address_offset \
_z_entry_frame_locals_neg(result_address)
#define result_type_offset \
_z_entry_frame_locals_neg(result_type)
#define arguments_tos_address_offset \
_z_entry_frame_locals_neg(arguments_tos_address)
{
//
// STACK on entry to call_stub:
//
// F1 [C_FRAME]
// ...
//
Register r_argument_addr = Z_tmp_3;
Register r_argumentcopy_addr = Z_tmp_4;
Register r_argument_size_in_bytes = Z_ARG5;
Register r_frame_size = Z_R1;
Label arguments_copied;
// Save non-volatile registers to ABI of caller frame.
BLOCK_COMMENT("save registers, push frame {");
__ z_stmg(Z_R6, Z_R14, 16, Z_SP);
__ z_std(Z_F8, 96, Z_SP);
__ z_std(Z_F9, 104, Z_SP);
__ z_std(Z_F10, 112, Z_SP);
__ z_std(Z_F11, 120, Z_SP);
__ z_std(Z_F12, 128, Z_SP);
__ z_std(Z_F13, 136, Z_SP);
__ z_std(Z_F14, 144, Z_SP);
__ z_std(Z_F15, 152, Z_SP);
//
// Push ENTRY_FRAME including arguments:
//
// F0 [TOP_IJAVA_FRAME_ABI]
// [outgoing Java arguments]
// [ENTRY_FRAME_LOCALS]
// F1 [C_FRAME]
// ...
//
// Calculate new frame size and push frame.
#define abi_plus_locals_size \
(frame::z_top_ijava_frame_abi_size + frame::z_entry_frame_locals_size)
if (abi_plus_locals_size % BytesPerWord == 0) {
// Preload constant part of frame size.
__ load_const_optimized(r_frame_size, -abi_plus_locals_size/BytesPerWord);
// Keep copy of our frame pointer (caller's SP).
__ z_lgr(r_entryframe_fp, Z_SP);
// Add space required by arguments to frame size.
__ z_slgf(r_frame_size, d_arg_argument_count, Z_R0, Z_SP);
// Move Z_ARG5 early, it will be used as a local.
__ z_lgr(r_new_arg_entry, r_arg_entry);
// Convert frame size from words to bytes.
__ z_sllg(r_frame_size, r_frame_size, LogBytesPerWord);
__ push_frame(r_frame_size, r_entryframe_fp,
false/*don't copy SP*/, true /*frame size sign inverted*/);
} else {
guarantee(false, "frame sizes should be multiples of word size (BytesPerWord)");
}
BLOCK_COMMENT("} save, push");
// Load argument registers for call.
BLOCK_COMMENT("prepare/copy arguments {");
__ z_lgr(Z_method, r_arg_method);
__ z_lg(Z_thread, d_arg_thread, r_entryframe_fp);
// Calculate top_of_arguments_addr which will be tos (not prepushed) later.
// Wimply use SP + frame::top_ijava_frame_size.
__ add2reg(r_top_of_arguments_addr,
frame::z_top_ijava_frame_abi_size - BytesPerWord, Z_SP);
// Initialize call_stub locals (step 1).
if ((call_wrapper_address_offset + BytesPerWord == result_address_offset) &&
(result_address_offset + BytesPerWord == result_type_offset) &&
(result_type_offset + BytesPerWord == arguments_tos_address_offset)) {
__ z_stmg(r_arg_call_wrapper_addr, r_top_of_arguments_addr,
call_wrapper_address_offset, r_entryframe_fp);
} else {
__ z_stg(r_arg_call_wrapper_addr,
call_wrapper_address_offset, r_entryframe_fp);
__ z_stg(r_arg_result_addr,
result_address_offset, r_entryframe_fp);
__ z_stg(r_arg_result_type,
result_type_offset, r_entryframe_fp);
__ z_stg(r_top_of_arguments_addr,
arguments_tos_address_offset, r_entryframe_fp);
}
// Copy Java arguments.
// Any arguments to copy?
__ load_and_test_int2long(Z_R1, Address(r_entryframe_fp, d_arg_argument_count));
__ z_bre(arguments_copied);
// Prepare loop and copy arguments in reverse order.
{
// Calculate argument size in bytes.
__ z_sllg(r_argument_size_in_bytes, Z_R1, LogBytesPerWord);
// Get addr of first incoming Java argument.
__ z_lg(r_argument_addr, d_arg_argument_addr, r_entryframe_fp);
// Let r_argumentcopy_addr point to last outgoing Java argument.
__ add2reg(r_argumentcopy_addr, BytesPerWord, r_top_of_arguments_addr); // = Z_SP+160 effectively.
// Let r_argument_addr point to last incoming Java argument.
__ add2reg_with_index(r_argument_addr, -BytesPerWord,
r_argument_size_in_bytes, r_argument_addr);
// Now loop while Z_R1 > 0 and copy arguments.
{
Label next_argument;
__ bind(next_argument);
// Mem-mem move.
__ z_mvc(0, BytesPerWord-1, r_argumentcopy_addr, 0, r_argument_addr);
__ add2reg(r_argument_addr, -BytesPerWord);
__ add2reg(r_argumentcopy_addr, BytesPerWord);
__ z_brct(Z_R1, next_argument);
}
} // End of argument copy loop.
__ bind(arguments_copied);
}
BLOCK_COMMENT("} arguments");
BLOCK_COMMENT("call {");
{
// Call frame manager or native entry.
//
// Register state on entry to frame manager / native entry:
//
// Z_ARG1 = r_top_of_arguments_addr - intptr_t *sender tos (prepushed)
// Lesp = (SP) + copied_arguments_offset - 8
// Z_method - method
// Z_thread - JavaThread*
//
// Here, the usual SP is the initial_caller_sp.
__ z_lgr(Z_R10, Z_SP);
// Z_esp points to the slot below the last argument.
__ z_lgr(Z_esp, r_top_of_arguments_addr);
//
// Stack on entry to frame manager / native entry:
//
// F0 [TOP_IJAVA_FRAME_ABI]
// [outgoing Java arguments]
// [ENTRY_FRAME_LOCALS]
// F1 [C_FRAME]
// ...
//
// Do a light-weight C-call here, r_new_arg_entry holds the address
// of the interpreter entry point (frame manager or native entry)
// and save runtime-value of return_pc in return_address
// (call by reference argument).
return_address = __ call_stub(r_new_arg_entry);
}
BLOCK_COMMENT("} call");
{
BLOCK_COMMENT("restore registers {");
// Returned from frame manager or native entry.
// Now pop frame, process result, and return to caller.
//
// Stack on exit from frame manager / native entry:
//
// F0 [ABI]
// ...
// [ENTRY_FRAME_LOCALS]
// F1 [C_FRAME]
// ...
//
// Just pop the topmost frame ...
//
Label ret_is_object;
Label ret_is_long;
Label ret_is_float;
Label ret_is_double;
// Restore frame pointer.
__ z_lg(r_entryframe_fp, _z_abi(callers_sp), Z_SP);
// Pop frame. Done here to minimize stalls.
__ pop_frame();
// Reload some volatile registers which we've spilled before the call
// to frame manager / native entry.
// Access all locals via frame pointer, because we know nothing about
// the topmost frame's size.
__ z_lg(r_arg_result_addr, result_address_offset, r_entryframe_fp);
__ z_lg(r_arg_result_type, result_type_offset, r_entryframe_fp);
// Restore non-volatiles.
__ z_lmg(Z_R6, Z_R14, 16, Z_SP);
__ z_ld(Z_F8, 96, Z_SP);
__ z_ld(Z_F9, 104, Z_SP);
__ z_ld(Z_F10, 112, Z_SP);
__ z_ld(Z_F11, 120, Z_SP);
__ z_ld(Z_F12, 128, Z_SP);
__ z_ld(Z_F13, 136, Z_SP);
__ z_ld(Z_F14, 144, Z_SP);
__ z_ld(Z_F15, 152, Z_SP);
BLOCK_COMMENT("} restore");
//
// Stack on exit from call_stub:
//
// 0 [C_FRAME]
// ...
//
// No call_stub frames left.
//
// All non-volatiles have been restored at this point!!
//------------------------------------------------------------------------
// The following code makes some assumptions on the T_<type> enum values.
// The enum is defined in globalDefinitions.hpp.
// The validity of the assumptions is tested as far as possible.
// The assigned values should not be shuffled
// T_BOOLEAN==4 - lowest used enum value
// T_NARROWOOP==16 - largest used enum value
//------------------------------------------------------------------------
BLOCK_COMMENT("process result {");
Label firstHandler;
int handlerLen= 8;
#ifdef ASSERT
char assertMsg[] = "check BasicType definition in globalDefinitions.hpp";
__ z_chi(r_arg_result_type, T_BOOLEAN);
__ asm_assert_low(assertMsg, 0x0234);
__ z_chi(r_arg_result_type, T_NARROWOOP);
__ asm_assert_high(assertMsg, 0x0235);
#endif
__ add2reg(r_arg_result_type, -T_BOOLEAN); // Remove offset.
__ z_larl(Z_R1, firstHandler); // location of first handler
__ z_sllg(r_arg_result_type, r_arg_result_type, 3); // Each handler is 8 bytes long.
__ z_bc(MacroAssembler::bcondAlways, 0, r_arg_result_type, Z_R1);
__ align(handlerLen);
__ bind(firstHandler);
// T_BOOLEAN:
guarantee(T_BOOLEAN == 4, "check BasicType definition in globalDefinitions.hpp");
__ z_st(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_CHAR:
guarantee(T_CHAR == T_BOOLEAN+1, "check BasicType definition in globalDefinitions.hpp");
__ z_st(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_FLOAT:
guarantee(T_FLOAT == T_CHAR+1, "check BasicType definition in globalDefinitions.hpp");
__ z_ste(Z_FRET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_DOUBLE:
guarantee(T_DOUBLE == T_FLOAT+1, "check BasicType definition in globalDefinitions.hpp");
__ z_std(Z_FRET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_BYTE:
guarantee(T_BYTE == T_DOUBLE+1, "check BasicType definition in globalDefinitions.hpp");
__ z_st(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_SHORT:
guarantee(T_SHORT == T_BYTE+1, "check BasicType definition in globalDefinitions.hpp");
__ z_st(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_INT:
guarantee(T_INT == T_SHORT+1, "check BasicType definition in globalDefinitions.hpp");
__ z_st(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_LONG:
guarantee(T_LONG == T_INT+1, "check BasicType definition in globalDefinitions.hpp");
__ z_stg(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_OBJECT:
guarantee(T_OBJECT == T_LONG+1, "check BasicType definition in globalDefinitions.hpp");
__ z_stg(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_ARRAY:
guarantee(T_ARRAY == T_OBJECT+1, "check BasicType definition in globalDefinitions.hpp");
__ z_stg(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_VOID:
guarantee(T_VOID == T_ARRAY+1, "check BasicType definition in globalDefinitions.hpp");
__ z_stg(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_ADDRESS:
guarantee(T_ADDRESS == T_VOID+1, "check BasicType definition in globalDefinitions.hpp");
__ z_stg(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
// T_NARROWOOP:
guarantee(T_NARROWOOP == T_ADDRESS+1, "check BasicType definition in globalDefinitions.hpp");
__ z_st(Z_RET, 0, r_arg_result_addr);
__ z_br(Z_R14); // Return to caller.
__ align(handlerLen);
BLOCK_COMMENT("} process result");
}
return start;
}
// Return point for a Java call if there's an exception thrown in
// Java code. The exception is caught and transformed into a
// pending exception stored in JavaThread that can be tested from
// within the VM.
address generate_catch_exception() {
StubCodeMark mark(this, "StubRoutines", "catch_exception");
address start = __ pc();
//
// Registers alive
//
// Z_thread
// Z_ARG1 - address of pending exception
// Z_ARG2 - return address in call stub
//
const Register exception_file = Z_R0;
const Register exception_line = Z_R1;
__ load_const_optimized(exception_file, (void*)__FILE__);
__ load_const_optimized(exception_line, (void*)__LINE__);
__ z_stg(Z_ARG1, thread_(pending_exception));
// Store into `char *'.
__ z_stg(exception_file, thread_(exception_file));
// Store into `int'.
__ z_st(exception_line, thread_(exception_line));
// Complete return to VM.
assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
// Continue in call stub.
__ z_br(Z_ARG2);
return start;
}
// Continuation point for runtime calls returning with a pending
// exception. The pending exception check happened in the runtime
// or native call stub. The pending exception in Thread is
// converted into a Java-level exception.
//
// Read:
// Z_R14: pc the runtime library callee wants to return to.
// Since the exception occurred in the callee, the return pc
// from the point of view of Java is the exception pc.
//
// Invalidate:
// Volatile registers (except below).
//
// Update:
// Z_ARG1: exception
// (Z_R14 is unchanged and is live out).
//
address generate_forward_exception() {
StubCodeMark mark(this, "StubRoutines", "forward_exception");
address start = __ pc();
#define pending_exception_offset in_bytes(Thread::pending_exception_offset())
#ifdef ASSERT
// Get pending exception oop.
__ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
// Make sure that this code is only executed if there is a pending exception.
{
Label L;
__ z_ltgr(Z_ARG1, Z_ARG1);
__ z_brne(L);
__ stop("StubRoutines::forward exception: no pending exception (1)");
__ bind(L);
}
__ verify_oop(Z_ARG1, "StubRoutines::forward exception: not an oop");
#endif
__ z_lgr(Z_ARG2, Z_R14); // Copy exception pc into Z_ARG2.
__ save_return_pc();
__ push_frame_abi160(0);
// Find exception handler.
__ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address),
Z_thread,
Z_ARG2);
// Copy handler's address.
__ z_lgr(Z_R1, Z_RET);
__ pop_frame();
__ restore_return_pc();
// Set up the arguments for the exception handler:
// - Z_ARG1: exception oop
// - Z_ARG2: exception pc
// Load pending exception oop.
__ z_lg(Z_ARG1, pending_exception_offset, Z_thread);
// The exception pc is the return address in the caller,
// must load it into Z_ARG2
__ z_lgr(Z_ARG2, Z_R14);
#ifdef ASSERT
// Make sure exception is set.
{ Label L;
__ z_ltgr(Z_ARG1, Z_ARG1);
__ z_brne(L);
__ stop("StubRoutines::forward exception: no pending exception (2)");
__ bind(L);
}
#endif
// Clear the pending exception.
__ clear_mem(Address(Z_thread, pending_exception_offset), sizeof(void *));
// Jump to exception handler
__ z_br(Z_R1 /*handler address*/);
return start;
#undef pending_exception_offset
}
// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
// frame. Only callee-saved registers are preserved (through the
// normal RegisterMap handling). If the compiler
// needs all registers to be preserved between the fault point and
// the exception handler then it must assume responsibility for that
// in AbstractCompiler::continuation_for_implicit_null_exception or
// continuation_for_implicit_division_by_zero_exception. All other
// implicit exceptions (e.g., NullPointerException or
// AbstractMethodError on entry) are either at call sites or
// otherwise assume that stack unwinding will be initiated, so
// caller saved registers were assumed volatile in the compiler.
// Note that we generate only this stub into a RuntimeStub, because
// it needs to be properly traversed and ignored during GC, so we
// change the meaning of the "__" macro within this method.
// Note: the routine set_pc_not_at_call_for_caller in
// SharedRuntime.cpp requires that this code be generated into a
// RuntimeStub.
#undef __
#define __ masm->
address generate_throw_exception(const char* name, address runtime_entry,
bool restore_saved_exception_pc,
Register arg1 = noreg, Register arg2 = noreg) {
assert_different_registers(arg1, Z_R0_scratch); // would be destroyed by push_frame()
assert_different_registers(arg2, Z_R0_scratch); // would be destroyed by push_frame()
int insts_size = 256;
int locs_size = 0;
CodeBuffer code(name, insts_size, locs_size);
MacroAssembler* masm = new MacroAssembler(&code);
int framesize_in_bytes;
address start = __ pc();
__ save_return_pc();
framesize_in_bytes = __ push_frame_abi160(0);
address frame_complete_pc = __ pc();
if (restore_saved_exception_pc) {
__ unimplemented("StubGenerator::throw_exception", 74);
}
// Note that we always have a runtime stub frame on the top of stack at this point.
__ get_PC(Z_R1);
__ set_last_Java_frame(/*sp*/Z_SP, /*pc*/Z_R1);
// Do the call.
BLOCK_COMMENT("call runtime_entry");
__ call_VM_leaf(runtime_entry, Z_thread, arg1, arg2);
__ reset_last_Java_frame();
#ifdef ASSERT
// Make sure that this code is only executed if there is a pending exception.
{ Label L;
__ z_lg(Z_R0,
in_bytes(Thread::pending_exception_offset()),
Z_thread);
__ z_ltgr(Z_R0, Z_R0);
__ z_brne(L);
__ stop("StubRoutines::throw_exception: no pending exception");
__ bind(L);
}
#endif
__ pop_frame();
__ restore_return_pc();
__ load_const_optimized(Z_R1, StubRoutines::forward_exception_entry());
__ z_br(Z_R1);
RuntimeStub* stub =
RuntimeStub::new_runtime_stub(name, &code,
frame_complete_pc - start,
framesize_in_bytes/wordSize,
NULL /*oop_maps*/, false);
return stub->entry_point();
}
#undef __
#ifdef PRODUCT
#define __ _masm->
#else
#define __ (Verbose ? (_masm->block_comment(FILE_AND_LINE),_masm):_masm)->
#endif
// Support for uint StubRoutine::zarch::partial_subtype_check(Klass
// sub, Klass super);
//
// Arguments:
// ret : Z_RET, returned
// sub : Z_ARG2, argument, not changed
// super: Z_ARG3, argument, not changed
//
// raddr: Z_R14, blown by call
//
address generate_partial_subtype_check() {
StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
Label miss;
address start = __ pc();
const Register Rsubklass = Z_ARG2; // subklass
const Register Rsuperklass = Z_ARG3; // superklass
// No args, but tmp registers that are killed.
const Register Rlength = Z_ARG4; // cache array length
const Register Rarray_ptr = Z_ARG5; // Current value from cache array.
if (UseCompressedOops) {
assert(Universe::heap() != NULL, "java heap must be initialized to generate partial_subtype_check stub");
}
// Always take the slow path (see SPARC).
__ check_klass_subtype_slow_path(Rsubklass, Rsuperklass,
Rarray_ptr, Rlength, NULL, &miss);
// Match falls through here.
__ clear_reg(Z_RET); // Zero indicates a match. Set EQ flag in CC.
__ z_br(Z_R14);
__ BIND(miss);
__ load_const_optimized(Z_RET, 1); // One indicates a miss.
__ z_ltgr(Z_RET, Z_RET); // Set NE flag in CR.
__ z_br(Z_R14);
return start;
}
// Return address of code to be called from code generated by
// MacroAssembler::verify_oop.
//
// Don't generate, rather use C++ code.
address generate_verify_oop_subroutine() {
// Don't generate a StubCodeMark, because no code is generated!
// Generating the mark triggers notifying the oprofile jvmti agent
// about the dynamic code generation, but the stub without
// code (code_size == 0) confuses opjitconv
// StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
address start = 0;
return start;
}
// This is to test that the count register contains a positive int value.
// Required because C2 does not respect int to long conversion for stub calls.
void assert_positive_int(Register count) {
#ifdef ASSERT
__ z_srag(Z_R0, count, 31); // Just leave the sign (must be zero) in Z_R0.
__ asm_assert_eq("missing zero extend", 0xAFFE);
#endif
}
// Generate overlap test for array copy stubs.
// If no actual overlap is detected, control is transferred to the
// "normal" copy stub (entry address passed in disjoint_copy_target).
// Otherwise, execution continues with the code generated by the
// caller of array_overlap_test.
//
// Input:
// Z_ARG1 - from
// Z_ARG2 - to
// Z_ARG3 - element count
void array_overlap_test(address disjoint_copy_target, int log2_elem_size) {
__ MacroAssembler::compare_and_branch_optimized(Z_ARG2, Z_ARG1, Assembler::bcondNotHigh,
disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
Register index = Z_ARG3;
if (log2_elem_size > 0) {
__ z_sllg(Z_R1, Z_ARG3, log2_elem_size); // byte count
index = Z_R1;
}
__ add2reg_with_index(Z_R1, 0, index, Z_ARG1); // First byte after "from" range.
__ MacroAssembler::compare_and_branch_optimized(Z_R1, Z_ARG2, Assembler::bcondNotHigh,
disjoint_copy_target, /*len64=*/true, /*has_sign=*/false);
// Destructive overlap: let caller generate code for that.
}
// Generate stub for disjoint array copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: Z_ARG1
// to: Z_ARG2
// count: Z_ARG3 treated as signed
void generate_disjoint_copy(bool aligned, int element_size,
bool branchToEnd,
bool restoreArgs) {
// This is the zarch specific stub generator for general array copy tasks.
// It has the following prereqs and features:
//
// - No destructive overlap allowed (else unpredictable results).
// - Destructive overlap does not exist if the leftmost byte of the target
// does not coincide with any of the source bytes (except the leftmost).
//
// Register usage upon entry:
// Z_ARG1 == Z_R2 : address of source array
// Z_ARG2 == Z_R3 : address of target array
// Z_ARG3 == Z_R4 : length of operands (# of elements on entry)
//
// Register usage within the generator:
// - Z_R0 and Z_R1 are KILLed by the stub routine (target addr/len).
// Used as pair register operand in complex moves, scratch registers anyway.
// - Z_R5 is KILLed by the stub routine (source register pair addr/len) (even/odd reg).
// Same as R0/R1, but no scratch register.
// - Z_ARG1, Z_ARG2, Z_ARG3 are USEd but preserved by the stub routine,
// but they might get temporarily overwritten.
Register save_reg = Z_ARG4; // (= Z_R5), holds original target operand address for restore.
{
Register llen_reg = Z_R1; // Holds left operand len (odd reg).
Register laddr_reg = Z_R0; // Holds left operand addr (even reg), overlaps with data_reg.
Register rlen_reg = Z_R5; // Holds right operand len (odd reg), overlaps with save_reg.
Register raddr_reg = Z_R4; // Holds right operand addr (even reg), overlaps with len_reg.
Register data_reg = Z_R0; // Holds copied data chunk in alignment process and copy loop.
Register len_reg = Z_ARG3; // Holds operand len (#elements at entry, #bytes shortly after).
Register dst_reg = Z_ARG2; // Holds left (target) operand addr.
Register src_reg = Z_ARG1; // Holds right (source) operand addr.
Label doMVCLOOP, doMVCLOOPcount, doMVCLOOPiterate;
Label doMVCUnrolled;
NearLabel doMVC, doMVCgeneral, done;
Label MVC_template;
address pcMVCblock_b, pcMVCblock_e;
bool usedMVCLE = true;
bool usedMVCLOOP = true;
bool usedMVCUnrolled = false;
bool usedMVC = false;
bool usedMVCgeneral = false;
int stride;
Register stride_reg;
Register ix_reg;
assert((element_size<=256) && (256%element_size == 0), "element size must be <= 256, power of 2");
unsigned int log2_size = exact_log2(element_size);
switch (element_size) {
case 1: BLOCK_COMMENT("ARRAYCOPY DISJOINT byte {"); break;
case 2: BLOCK_COMMENT("ARRAYCOPY DISJOINT short {"); break;
case 4: BLOCK_COMMENT("ARRAYCOPY DISJOINT int {"); break;
case 8: BLOCK_COMMENT("ARRAYCOPY DISJOINT long {"); break;
default: BLOCK_COMMENT("ARRAYCOPY DISJOINT {"); break;
}
assert_positive_int(len_reg);
BLOCK_COMMENT("preparation {");
// No copying if len <= 0.
if (branchToEnd) {
__ compare64_and_branch(len_reg, (intptr_t) 0, Assembler::bcondNotHigh, done);
} else {
if (VM_Version::has_CompareBranch()) {
__ z_cgib(len_reg, 0, Assembler::bcondNotHigh, 0, Z_R14);
} else {
__ z_ltgr(len_reg, len_reg);
__ z_bcr(Assembler::bcondNotPositive, Z_R14);
}
}
// Prefetch just one cache line. Speculative opt for short arrays.
// Do not use Z_R1 in prefetch. Is undefined here.
if (VM_Version::has_Prefetch()) {
__ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
__ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
}
BLOCK_COMMENT("} preparation");
// Save args only if really needed.
// Keep len test local to branch. Is generated only once.
BLOCK_COMMENT("mode selection {");
// Special handling for arrays with only a few elements.
// Nothing fancy: just an executed MVC.
if (log2_size > 0) {
__ z_sllg(Z_R1, len_reg, log2_size); // Remember #bytes in Z_R1.
}
if (element_size != 8) {
__ z_cghi(len_reg, 256/element_size);
__ z_brnh(doMVC);
usedMVC = true;
}
if (element_size == 8) { // Long and oop arrays are always aligned.
__ z_cghi(len_reg, 256/element_size);
__ z_brnh(doMVCUnrolled);
usedMVCUnrolled = true;
}
// Prefetch another cache line. We, for sure, have more than one line to copy.
if (VM_Version::has_Prefetch()) {
__ z_pfd(0x01, 256, Z_R0, src_reg); // Fetch access.
__ z_pfd(0x02, 256, Z_R0, dst_reg); // Store access.
}
if (restoreArgs) {
// Remember entry value of ARG2 to restore all arguments later from that knowledge.
__ z_lgr(save_reg, dst_reg);
}
__ z_cghi(len_reg, 4096/element_size);
if (log2_size == 0) {
__ z_lgr(Z_R1, len_reg); // Init Z_R1 with #bytes
}
__ z_brnh(doMVCLOOP);
// Fall through to MVCLE case.
BLOCK_COMMENT("} mode selection");
// MVCLE: for long arrays
// DW aligned: Best performance for sizes > 4kBytes.
// unaligned: Least complex for sizes > 256 bytes.
if (usedMVCLE) {
BLOCK_COMMENT("mode MVCLE {");
// Setup registers for mvcle.
//__ z_lgr(llen_reg, len_reg);// r1 <- r4 #bytes already in Z_R1, aka llen_reg.
__ z_lgr(laddr_reg, dst_reg); // r0 <- r3
__ z_lgr(raddr_reg, src_reg); // r4 <- r2
__ z_lgr(rlen_reg, llen_reg); // r5 <- r1
__ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb0); // special: bypass cache
// __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0xb8); // special: Hold data in cache.
// __ MacroAssembler::move_long_ext(laddr_reg, raddr_reg, 0);
if (restoreArgs) {
// MVCLE updates the source (Z_R4,Z_R5) and target (Z_R0,Z_R1) register pairs.
// Dst_reg (Z_ARG2) and src_reg (Z_ARG1) are left untouched. No restore required.
// Len_reg (Z_ARG3) is destroyed and must be restored.
__ z_slgr(laddr_reg, dst_reg); // copied #bytes
if (log2_size > 0) {
__ z_srag(Z_ARG3, laddr_reg, log2_size); // Convert back to #elements.
} else {
__ z_lgr(Z_ARG3, laddr_reg);
}
}
if (branchToEnd) {
__ z_bru(done);
} else {
__ z_br(Z_R14);
}
BLOCK_COMMENT("} mode MVCLE");
}
// No fallthru possible here.
// MVCUnrolled: for short, aligned arrays.
if (usedMVCUnrolled) {
BLOCK_COMMENT("mode MVC unrolled {");
stride = 8;
// Generate unrolled MVC instructions.
for (int ii = 32; ii > 1; ii--) {
__ z_mvc(0, ii * stride-1, dst_reg, 0, src_reg); // ii*8 byte copy
if (branchToEnd) {
__ z_bru(done);
} else {
__ z_br(Z_R14);
}
}
pcMVCblock_b = __ pc();
__ z_mvc(0, 1 * stride-1, dst_reg, 0, src_reg); // 8 byte copy
if (branchToEnd) {
__ z_bru(done);
} else {
__ z_br(Z_R14);
}
pcMVCblock_e = __ pc();
Label MVC_ListEnd;
__ bind(MVC_ListEnd);
// This is an absolute fast path:
// - Array len in bytes must be not greater than 256.
// - Array len in bytes must be an integer mult of DW
// to save expensive handling of trailing bytes.
// - Argument restore is not done,
// i.e. previous code must not alter arguments (this code doesn't either).
__ bind(doMVCUnrolled);
// Avoid mul, prefer shift where possible.
// Combine shift right (for #DW) with shift left (for block size).
// Set CC for zero test below (asm_assert).
// Note: #bytes comes in Z_R1, #DW in len_reg.
unsigned int MVCblocksize = pcMVCblock_e - pcMVCblock_b;
unsigned int logMVCblocksize = 0xffffffffU; // Pacify compiler ("used uninitialized" warning).
if (log2_size > 0) { // Len was scaled into Z_R1.
switch (MVCblocksize) {
case 8: logMVCblocksize = 3;
__ z_ltgr(Z_R0, Z_R1); // #bytes is index
break; // reasonable size, use shift
case 16: logMVCblocksize = 4;
__ z_slag(Z_R0, Z_R1, logMVCblocksize-log2_size);
break; // reasonable size, use shift
default: logMVCblocksize = 0;
__ z_ltgr(Z_R0, len_reg); // #DW for mul
break; // all other sizes: use mul
}
} else {
guarantee(log2_size, "doMVCUnrolled: only for DW entities");
}
// This test (and branch) is redundant. Previous code makes sure that
// - element count > 0
// - element size == 8.
// Thus, len reg should never be zero here. We insert an asm_assert() here,
// just to double-check and to be on the safe side.
__ asm_assert(false, "zero len cannot occur", 99);
__ z_larl(Z_R1, MVC_ListEnd); // Get addr of last instr block.
// Avoid mul, prefer shift where possible.
if (logMVCblocksize == 0) {
__ z_mghi(Z_R0, MVCblocksize);
}
__ z_slgr(Z_R1, Z_R0);
__ z_br(Z_R1);
BLOCK_COMMENT("} mode MVC unrolled");
}
// No fallthru possible here.
// MVC execute template
// Must always generate. Usage may be switched on below.
// There is no suitable place after here to put the template.
__ bind(MVC_template);
__ z_mvc(0,0,dst_reg,0,src_reg); // Instr template, never exec directly!
// MVC Loop: for medium-sized arrays
// Only for DW aligned arrays (src and dst).
// #bytes to copy must be at least 256!!!
// Non-aligned cases handled separately.
stride = 256;
stride_reg = Z_R1; // Holds #bytes when control arrives here.
ix_reg = Z_ARG3; // Alias for len_reg.
if (usedMVCLOOP) {
BLOCK_COMMENT("mode MVC loop {");
__ bind(doMVCLOOP);
__ z_lcgr(ix_reg, Z_R1); // Ix runs from -(n-2)*stride to 1*stride (inclusive).
__ z_llill(stride_reg, stride);
__ add2reg(ix_reg, 2*stride); // Thus: increment ix by 2*stride.
__ bind(doMVCLOOPiterate);
__ z_mvc(0, stride-1, dst_reg, 0, src_reg);
__ add2reg(dst_reg, stride);
__ add2reg(src_reg, stride);
__ bind(doMVCLOOPcount);
__ z_brxlg(ix_reg, stride_reg, doMVCLOOPiterate);
// Don 't use add2reg() here, since we must set the condition code!
__ z_aghi(ix_reg, -2*stride); // Compensate incr from above: zero diff means "all copied".
if (restoreArgs) {
__ z_lcgr(Z_R1, ix_reg); // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
__ z_brnz(doMVCgeneral); // We're not done yet, ix_reg is not zero.
// ARG1, ARG2, and ARG3 were altered by the code above, so restore them building on save_reg.
__ z_slgr(dst_reg, save_reg); // copied #bytes
__ z_slgr(src_reg, dst_reg); // = ARG1 (now restored)
if (log2_size) {
__ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3.
} else {
__ z_lgr(Z_ARG3, dst_reg);
}
__ z_lgr(Z_ARG2, save_reg); // ARG2 now restored.
if (branchToEnd) {
__ z_bru(done);
} else {
__ z_br(Z_R14);
}
} else {
if (branchToEnd) {
__ z_brz(done); // CC set by aghi instr.
} else {
__ z_bcr(Assembler::bcondZero, Z_R14); // We're all done if zero.
}
__ z_lcgr(Z_R1, ix_reg); // Prepare ix_reg for copy loop, #bytes expected in Z_R1.
// __ z_bru(doMVCgeneral); // fallthru
}
usedMVCgeneral = true;
BLOCK_COMMENT("} mode MVC loop");
}
// Fallthru to doMVCgeneral
// MVCgeneral: for short, unaligned arrays, after other copy operations
// Somewhat expensive due to use of EX instruction, but simple.
if (usedMVCgeneral) {
BLOCK_COMMENT("mode MVC general {");
__ bind(doMVCgeneral);
__ add2reg(len_reg, -1, Z_R1); // Get #bytes-1 for EXECUTE.
if (VM_Version::has_ExecuteExtensions()) {
__ z_exrl(len_reg, MVC_template); // Execute MVC with variable length.
} else {
__ z_larl(Z_R1, MVC_template); // Get addr of instr template.
__ z_ex(len_reg, 0, Z_R0, Z_R1); // Execute MVC with variable length.
} // penalty: 9 ticks
if (restoreArgs) {
// ARG1, ARG2, and ARG3 were altered by code executed before, so restore them building on save_reg
__ z_slgr(dst_reg, save_reg); // Copied #bytes without the "doMVCgeneral" chunk
__ z_slgr(src_reg, dst_reg); // = ARG1 (now restored), was not advanced for "doMVCgeneral" chunk
__ add2reg_with_index(dst_reg, 1, len_reg, dst_reg); // Len of executed MVC was not accounted for, yet.
if (log2_size) {
__ z_srag(Z_ARG3, dst_reg, log2_size); // Convert back to #elements to restore ARG3
} else {
__ z_lgr(Z_ARG3, dst_reg);
}
__ z_lgr(Z_ARG2, save_reg); // ARG2 now restored.
}
if (usedMVC) {
if (branchToEnd) {
__ z_bru(done);
} else {
__ z_br(Z_R14);
}
} else {
if (!branchToEnd) __ z_br(Z_R14);
}
BLOCK_COMMENT("} mode MVC general");
}
// Fallthru possible if following block not generated.
// MVC: for short, unaligned arrays
// Somewhat expensive due to use of EX instruction, but simple. penalty: 9 ticks.
// Differs from doMVCgeneral in reconstruction of ARG2, ARG3, and ARG4.
if (usedMVC) {
BLOCK_COMMENT("mode MVC {");
__ bind(doMVC);
// get #bytes-1 for EXECUTE
if (log2_size) {
__ add2reg(Z_R1, -1); // Length was scaled into Z_R1.
} else {
__ add2reg(Z_R1, -1, len_reg); // Length was not scaled.
}
if (VM_Version::has_ExecuteExtensions()) {
__ z_exrl(Z_R1, MVC_template); // Execute MVC with variable length.
} else {
__ z_lgr(Z_R0, Z_R5); // Save ARG4, may be unnecessary.
__ z_larl(Z_R5, MVC_template); // Get addr of instr template.
__ z_ex(Z_R1, 0, Z_R0, Z_R5); // Execute MVC with variable length.
__ z_lgr(Z_R5, Z_R0); // Restore ARG4, may be unnecessary.
}
if (!branchToEnd) {
__ z_br(Z_R14);
}
BLOCK_COMMENT("} mode MVC");
}
__ bind(done);
switch (element_size) {
case 1: BLOCK_COMMENT("} ARRAYCOPY DISJOINT byte "); break;
case 2: BLOCK_COMMENT("} ARRAYCOPY DISJOINT short"); break;
case 4: BLOCK_COMMENT("} ARRAYCOPY DISJOINT int "); break;
case 8: BLOCK_COMMENT("} ARRAYCOPY DISJOINT long "); break;
default: BLOCK_COMMENT("} ARRAYCOPY DISJOINT "); break;
}
}
}
// Generate stub for conjoint array copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
//
// Arguments for generated stub:
// from: Z_ARG1
// to: Z_ARG2
// count: Z_ARG3 treated as signed
void generate_conjoint_copy(bool aligned, int element_size, bool branchToEnd) {
// This is the zarch specific stub generator for general array copy tasks.
// It has the following prereqs and features:
//
// - Destructive overlap exists and is handled by reverse copy.
// - Destructive overlap exists if the leftmost byte of the target
// does coincide with any of the source bytes (except the leftmost).
// - Z_R0 and Z_R1 are KILLed by the stub routine (data and stride)
// - Z_ARG1 and Z_ARG2 are USEd but preserved by the stub routine.
// - Z_ARG3 is USED but preserved by the stub routine.
// - Z_ARG4 is used as index register and is thus KILLed.
//
{
Register stride_reg = Z_R1; // Stride & compare value in loop (negative element_size).
Register data_reg = Z_R0; // Holds value of currently processed element.
Register ix_reg = Z_ARG4; // Holds byte index of currently processed element.
Register len_reg = Z_ARG3; // Holds length (in #elements) of arrays.
Register dst_reg = Z_ARG2; // Holds left operand addr.
Register src_reg = Z_ARG1; // Holds right operand addr.
assert(256%element_size == 0, "Element size must be power of 2.");
assert(element_size <= 8, "Can't handle more than DW units.");
switch (element_size) {
case 1: BLOCK_COMMENT("ARRAYCOPY CONJOINT byte {"); break;
case 2: BLOCK_COMMENT("ARRAYCOPY CONJOINT short {"); break;
case 4: BLOCK_COMMENT("ARRAYCOPY CONJOINT int {"); break;
case 8: BLOCK_COMMENT("ARRAYCOPY CONJOINT long {"); break;
default: BLOCK_COMMENT("ARRAYCOPY CONJOINT {"); break;
}
assert_positive_int(len_reg);
if (VM_Version::has_Prefetch()) {
__ z_pfd(0x01, 0, Z_R0, src_reg); // Fetch access.
__ z_pfd(0x02, 0, Z_R0, dst_reg); // Store access.
}
unsigned int log2_size = exact_log2(element_size);
if (log2_size) {
__ z_sllg(ix_reg, len_reg, log2_size);
} else {
__ z_lgr(ix_reg, len_reg);
}
// Optimize reverse copy loop.
// Main loop copies DW units which may be unaligned. Unaligned access adds some penalty ticks.
// Unaligned DW access (neither fetch nor store) is DW-atomic, but should be alignment-atomic.
// Preceding the main loop, some bytes are copied to obtain a DW-multiple remaining length.
Label countLoop1;
Label copyLoop1;
Label skipBY;
Label skipHW;
int stride = -8;
__ load_const_optimized(stride_reg, stride); // Prepare for DW copy loop.
if (element_size == 8) // Nothing to do here.
__ z_bru(countLoop1);
else { // Do not generate dead code.
__ z_tmll(ix_reg, 7); // Check the "odd" bits.
__ z_bre(countLoop1); // There are none, very good!
}
if (log2_size == 0) { // Handle leftover Byte.
__ z_tmll(ix_reg, 1);
__ z_bre(skipBY);
__ z_lb(data_reg, -1, ix_reg, src_reg);
__ z_stcy(data_reg, -1, ix_reg, dst_reg);
__ add2reg(ix_reg, -1); // Decrement delayed to avoid AGI.
__ bind(skipBY);
// fallthru
}
if (log2_size <= 1) { // Handle leftover HW.
__ z_tmll(ix_reg, 2);
__ z_bre(skipHW);
__ z_lhy(data_reg, -2, ix_reg, src_reg);
__ z_sthy(data_reg, -2, ix_reg, dst_reg);
__ add2reg(ix_reg, -2); // Decrement delayed to avoid AGI.
__ bind(skipHW);
__ z_tmll(ix_reg, 4);
__ z_bre(countLoop1);
// fallthru
}
if (log2_size <= 2) { // There are just 4 bytes (left) that need to be copied.
__ z_ly(data_reg, -4, ix_reg, src_reg);
__ z_sty(data_reg, -4, ix_reg, dst_reg);
__ add2reg(ix_reg, -4); // Decrement delayed to avoid AGI.
__ z_bru(countLoop1);
}
// Control can never get to here. Never! Never ever!
__ z_illtrap(0x99);
__ bind(copyLoop1);
__ z_lg(data_reg, 0, ix_reg, src_reg);
__ z_stg(data_reg, 0, ix_reg, dst_reg);
__ bind(countLoop1);
__ z_brxhg(ix_reg, stride_reg, copyLoop1);
if (!branchToEnd)
__ z_br(Z_R14);
switch (element_size) {
case 1: BLOCK_COMMENT("} ARRAYCOPY CONJOINT byte "); break;
case 2: BLOCK_COMMENT("} ARRAYCOPY CONJOINT short"); break;
case 4: BLOCK_COMMENT("} ARRAYCOPY CONJOINT int "); break;
case 8: BLOCK_COMMENT("} ARRAYCOPY CONJOINT long "); break;
default: BLOCK_COMMENT("} ARRAYCOPY CONJOINT "); break;
}
}
}
// Generate stub for disjoint byte copy. If "aligned" is true, the
// "from" and "to" addresses are assumed to be heapword aligned.
address generate_disjoint_byte_copy(bool aligned, const char * name) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for byte array copy.
// Refer to generate_disjoint_copy for a list of prereqs and features:
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
generate_disjoint_copy(aligned, 1, false, false);
return __ addr_at(start_off);
}
address generate_disjoint_short_copy(bool aligned, const char * name) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for short array copy.
// Refer to generate_disjoint_copy for a list of prereqs and features:
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
generate_disjoint_copy(aligned, 2, false, false);
return __ addr_at(start_off);
}
address generate_disjoint_int_copy(bool aligned, const char * name) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for int array copy.
// Refer to generate_disjoint_copy for a list of prereqs and features:
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
generate_disjoint_copy(aligned, 4, false, false);
return __ addr_at(start_off);
}
address generate_disjoint_long_copy(bool aligned, const char * name) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for long array copy.
// Refer to generate_disjoint_copy for a list of prereqs and features:
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
generate_disjoint_copy(aligned, 8, false, false);
return __ addr_at(start_off);
}
address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for oop array copy.
// Refer to generate_disjoint_copy for a list of prereqs and features.
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
unsigned int size = UseCompressedOops ? 4 : 8;
DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
if (aligned) {
decorators |= ARRAYCOPY_ALIGNED;
}
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
generate_disjoint_copy(aligned, size, true, true);
bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
return __ addr_at(start_off);
}
address generate_conjoint_byte_copy(bool aligned, const char * name) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for overlapping byte array copy.
// Refer to generate_conjoint_copy for a list of prereqs and features:
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
address nooverlap_target = aligned ? StubRoutines::arrayof_jbyte_disjoint_arraycopy()
: StubRoutines::jbyte_disjoint_arraycopy();
array_overlap_test(nooverlap_target, 0); // Branch away to nooverlap_target if disjoint.
generate_conjoint_copy(aligned, 1, false);
return __ addr_at(start_off);
}
address generate_conjoint_short_copy(bool aligned, const char * name) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for overlapping short array copy.
// Refer to generate_conjoint_copy for a list of prereqs and features:
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
address nooverlap_target = aligned ? StubRoutines::arrayof_jshort_disjoint_arraycopy()
: StubRoutines::jshort_disjoint_arraycopy();
array_overlap_test(nooverlap_target, 1); // Branch away to nooverlap_target if disjoint.
generate_conjoint_copy(aligned, 2, false);
return __ addr_at(start_off);
}
address generate_conjoint_int_copy(bool aligned, const char * name) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for overlapping int array copy.
// Refer to generate_conjoint_copy for a list of prereqs and features:
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
address nooverlap_target = aligned ? StubRoutines::arrayof_jint_disjoint_arraycopy()
: StubRoutines::jint_disjoint_arraycopy();
array_overlap_test(nooverlap_target, 2); // Branch away to nooverlap_target if disjoint.
generate_conjoint_copy(aligned, 4, false);
return __ addr_at(start_off);
}
address generate_conjoint_long_copy(bool aligned, const char * name) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for overlapping long array copy.
// Refer to generate_conjoint_copy for a list of prereqs and features:
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
address nooverlap_target = aligned ? StubRoutines::arrayof_jlong_disjoint_arraycopy()
: StubRoutines::jlong_disjoint_arraycopy();
array_overlap_test(nooverlap_target, 3); // Branch away to nooverlap_target if disjoint.
generate_conjoint_copy(aligned, 8, false);
return __ addr_at(start_off);
}
address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
StubCodeMark mark(this, "StubRoutines", name);
// This is the zarch specific stub generator for overlapping oop array copy.
// Refer to generate_conjoint_copy for a list of prereqs and features.
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
unsigned int size = UseCompressedOops ? 4 : 8;
unsigned int shift = UseCompressedOops ? 2 : 3;
address nooverlap_target = aligned ? StubRoutines::arrayof_oop_disjoint_arraycopy(dest_uninitialized)
: StubRoutines::oop_disjoint_arraycopy(dest_uninitialized);
// Branch to disjoint_copy (if applicable) before pre_barrier to avoid double pre_barrier.
array_overlap_test(nooverlap_target, shift); // Branch away to nooverlap_target if disjoint.
DecoratorSet decorators = IN_HEAP | IS_ARRAY;
if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
}
if (aligned) {
decorators |= ARRAYCOPY_ALIGNED;
}
BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
bs->arraycopy_prologue(_masm, decorators, T_OBJECT, Z_ARG1, Z_ARG2, Z_ARG3);
generate_conjoint_copy(aligned, size, true); // Must preserve ARG2, ARG3.
bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, Z_ARG2, Z_ARG3, true);
return __ addr_at(start_off);
}
void generate_arraycopy_stubs() {
// Note: the disjoint stubs must be generated first, some of
// the conjoint stubs use them.
StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy (false, "jbyte_disjoint_arraycopy");
StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy (false, "jint_disjoint_arraycopy");
StubRoutines::_jlong_disjoint_arraycopy = generate_disjoint_long_copy (false, "jlong_disjoint_arraycopy");
StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy (false, "oop_disjoint_arraycopy", false);
StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy (false, "oop_disjoint_arraycopy_uninit", true);
StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy (true, "arrayof_jbyte_disjoint_arraycopy");
StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy (true, "arrayof_jint_disjoint_arraycopy");
StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy (true, "arrayof_jlong_disjoint_arraycopy");
StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy (true, "arrayof_oop_disjoint_arraycopy", false);
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy (true, "arrayof_oop_disjoint_arraycopy_uninit", true);
StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy (false, "jbyte_arraycopy");
StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, "jshort_arraycopy");
StubRoutines::_jint_arraycopy = generate_conjoint_int_copy (false, "jint_arraycopy");
StubRoutines::_jlong_arraycopy = generate_conjoint_long_copy (false, "jlong_arraycopy");
StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy (false, "oop_arraycopy", false);
StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy (false, "oop_arraycopy_uninit", true);
StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy (true, "arrayof_jbyte_arraycopy");
StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy (true, "arrayof_jint_arraycopy");
StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy (true, "arrayof_jlong_arraycopy");
StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy (true, "arrayof_oop_arraycopy", false);
StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy (true, "arrayof_oop_arraycopy_uninit", true);
}
void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
// safefetch signatures:
// int SafeFetch32(int* adr, int errValue);
// intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
//
// arguments:
// Z_ARG1 = adr
// Z_ARG2 = errValue
//
// result:
// Z_RET = *adr or errValue
StubCodeMark mark(this, "StubRoutines", name);
// entry point
// Load *adr into Z_ARG2, may fault.
*entry = *fault_pc = __ pc();
switch (size) {
case 4:
// Sign extended int32_t.
__ z_lgf(Z_ARG2, 0, Z_ARG1);
break;
case 8:
// int64_t
__ z_lg(Z_ARG2, 0, Z_ARG1);
break;
default:
ShouldNotReachHere();
}
// Return errValue or *adr.
*continuation_pc = __ pc();
__ z_lgr(Z_RET, Z_ARG2);
__ z_br(Z_R14);
}
// Call interface for AES_encryptBlock, AES_decryptBlock stubs.
//
// Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
// Z_ARG2 - destination data block. Ptr to leftmost byte to be stored.
// For in-place encryption/decryption, ARG1 and ARG2 can point
// to the same piece of storage.
// Z_ARG3 - Crypto key address (expanded key). The first n bits of
// the expanded key constitute the original AES-<n> key (see below).
//
// Z_RET - return value. First unprocessed byte offset in src buffer.
//
// Some remarks:
// The crypto key, as passed from the caller to these encryption stubs,
// is a so-called expanded key. It is derived from the original key
// by the Rijndael key schedule, see http://en.wikipedia.org/wiki/Rijndael_key_schedule
// With the expanded key, the cipher/decipher task is decomposed in
// multiple, less complex steps, called rounds. Sun SPARC and Intel
// processors obviously implement support for those less complex steps.
// z/Architecture provides instructions for full cipher/decipher complexity.
// Therefore, we need the original, not the expanded key here.
// Luckily, the first n bits of an AES-<n> expanded key are formed
// by the original key itself. That takes us out of trouble. :-)
// The key length (in bytes) relation is as follows:
// original expanded rounds key bit keylen
// key bytes key bytes length in words
// 16 176 11 128 44
// 24 208 13 192 52
// 32 240 15 256 60
//
// The crypto instructions used in the AES* stubs have some specific register requirements.
// Z_R0 holds the crypto function code. Please refer to the KM/KMC instruction
// description in the "z/Architecture Principles of Operation" manual for details.
// Z_R1 holds the parameter block address. The parameter block contains the cryptographic key
// (KM instruction) and the chaining value (KMC instruction).
// dst must designate an even-numbered register, holding the address of the output message.
// src must designate an even/odd register pair, holding the address/length of the original message
// Helper function which generates code to
// - load the function code in register fCode (== Z_R0).
// - load the data block length (depends on cipher function) into register srclen if requested.
// - is_decipher switches between cipher/decipher function codes
// - set_len requests (if true) loading the data block length in register srclen
void generate_load_AES_fCode(Register keylen, Register fCode, Register srclen, bool is_decipher) {
BLOCK_COMMENT("Set fCode {"); {
Label fCode_set;
int mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
bool identical_dataBlk_len = (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk)
&& (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
// Expanded key length is 44/52/60 * 4 bytes for AES-128/AES-192/AES-256.
__ z_cghi(keylen, 52); // Check only once at the beginning. keylen and fCode may share the same register.
__ z_lghi(fCode, VM_Version::Cipher::_AES128 + mode);
if (!identical_dataBlk_len) {
__ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
}
__ z_brl(fCode_set); // keyLen < 52: AES128
__ z_lghi(fCode, VM_Version::Cipher::_AES192 + mode);
if (!identical_dataBlk_len) {
__ z_lghi(srclen, VM_Version::Cipher::_AES192_dataBlk);
}
__ z_bre(fCode_set); // keyLen == 52: AES192
__ z_lghi(fCode, VM_Version::Cipher::_AES256 + mode);
if (!identical_dataBlk_len) {
__ z_lghi(srclen, VM_Version::Cipher::_AES256_dataBlk);
}
// __ z_brh(fCode_set); // keyLen < 52: AES128 // fallthru
__ bind(fCode_set);
if (identical_dataBlk_len) {
__ z_lghi(srclen, VM_Version::Cipher::_AES128_dataBlk);
}
}
BLOCK_COMMENT("} Set fCode");
}
// Push a parameter block for the cipher/decipher instruction on the stack.
// Layout of the additional stack space allocated for AES_cipherBlockChaining:
//
// | |
// +--------+ <-- SP before expansion
// | |
// : : alignment loss, 0..(AES_parmBlk_align-8) bytes
// | |
// +--------+
// | |
// : : space for parameter block, size VM_Version::Cipher::_AES*_parmBlk_C
// | |
// +--------+ <-- parmBlk, octoword-aligned, start of parameter block
// | |
// : : additional stack space for spills etc., size AES_parmBlk_addspace, DW @ Z_SP not usable!!!
// | |
// +--------+ <-- Z_SP after expansion
void generate_push_Block(int dataBlk_len, int parmBlk_len, int crypto_fCode,
Register parmBlk, Register keylen, Register fCode, Register cv, Register key) {
const int AES_parmBlk_align = 32; // octoword alignment.
const int AES_parmBlk_addspace = 24; // Must be sufficiently large to hold all spilled registers
// (currently 2) PLUS 1 DW for the frame pointer.
const int cv_len = dataBlk_len;
const int key_len = parmBlk_len - cv_len;
// This len must be known at JIT compile time. Only then are we able to recalc the SP before resize.
// We buy this knowledge by wasting some (up to AES_parmBlk_align) bytes of stack space.
const int resize_len = cv_len + key_len + AES_parmBlk_align + AES_parmBlk_addspace;
// Use parmBlk as temp reg here to hold the frame pointer.
__ resize_frame(-resize_len, parmBlk, true);
// calculate parmBlk address from updated (resized) SP.
__ add2reg(parmBlk, resize_len - (cv_len + key_len), Z_SP);
__ z_nill(parmBlk, (~(AES_parmBlk_align-1)) & 0xffff); // Align parameter block.
// There is room for stuff in the range [parmBlk-AES_parmBlk_addspace+8, parmBlk).
__ z_stg(keylen, -8, parmBlk); // Spill keylen for later use.
// calculate (SP before resize) from updated SP.
__ add2reg(keylen, resize_len, Z_SP); // keylen holds prev SP for now.
__ z_stg(keylen, -16, parmBlk); // Spill prev SP for easy revert.
__ z_mvc(0, cv_len-1, parmBlk, 0, cv); // Copy cv.
__ z_mvc(cv_len, key_len-1, parmBlk, 0, key); // Copy key.
__ z_lghi(fCode, crypto_fCode);
}
// NOTE:
// Before returning, the stub has to copy the chaining value from
// the parmBlk, where it was updated by the crypto instruction, back
// to the chaining value array the address of which was passed in the cv argument.
// As all the available registers are used and modified by KMC, we need to save
// the key length across the KMC instruction. We do so by spilling it to the stack,
// just preceding the parmBlk (at (parmBlk - 8)).
void generate_push_parmBlk(Register keylen, Register fCode, Register parmBlk, Register key, Register cv, bool is_decipher) {
int mode = is_decipher ? VM_Version::CipherMode::decipher : VM_Version::CipherMode::cipher;
Label parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
BLOCK_COMMENT("push parmBlk {");
if (VM_Version::has_Crypto_AES() ) { __ z_cghi(keylen, 52); }
if (VM_Version::has_Crypto_AES128()) { __ z_brl(parmBlk_128); } // keyLen < 52: AES128
if (VM_Version::has_Crypto_AES192()) { __ z_bre(parmBlk_192); } // keyLen == 52: AES192
if (VM_Version::has_Crypto_AES256()) { __ z_brh(parmBlk_256); } // keyLen > 52: AES256
// Security net: requested AES function not available on this CPU.
// NOTE:
// As of now (March 2015), this safety net is not required. JCE policy files limit the
// cryptographic strength of the keys used to 128 bit. If we have AES hardware support
// at all, we have at least AES-128.
__ stop_static("AES key strength not supported by CPU. Use -XX:-UseAES as remedy.", 0);
if (VM_Version::has_Crypto_AES256()) {
__ bind(parmBlk_256);
generate_push_Block(VM_Version::Cipher::_AES256_dataBlk,
VM_Version::Cipher::_AES256_parmBlk_C,
VM_Version::Cipher::_AES256 + mode,
parmBlk, keylen, fCode, cv, key);
if (VM_Version::has_Crypto_AES128() || VM_Version::has_Crypto_AES192()) {
__ z_bru(parmBlk_set); // Fallthru otherwise.
}
}
if (VM_Version::has_Crypto_AES192()) {
__ bind(parmBlk_192);
generate_push_Block(VM_Version::Cipher::_AES192_dataBlk,
VM_Version::Cipher::_AES192_parmBlk_C,
VM_Version::Cipher::_AES192 + mode,
parmBlk, keylen, fCode, cv, key);
if (VM_Version::has_Crypto_AES128()) {
__ z_bru(parmBlk_set); // Fallthru otherwise.
}
}
if (VM_Version::has_Crypto_AES128()) {
__ bind(parmBlk_128);
generate_push_Block(VM_Version::Cipher::_AES128_dataBlk,
VM_Version::Cipher::_AES128_parmBlk_C,
VM_Version::Cipher::_AES128 + mode,
parmBlk, keylen, fCode, cv, key);
// Fallthru
}
__ bind(parmBlk_set);
BLOCK_COMMENT("} push parmBlk");
}
// Pop a parameter block from the stack. The chaining value portion of the parameter block
// is copied back to the cv array as it is needed for subsequent cipher steps.
// The keylen value as well as the original SP (before resizing) was pushed to the stack
// when pushing the parameter block.
void generate_pop_parmBlk(Register keylen, Register parmBlk, Register key, Register cv) {
BLOCK_COMMENT("pop parmBlk {");
bool identical_dataBlk_len = (VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES192_dataBlk) &&
(VM_Version::Cipher::_AES128_dataBlk == VM_Version::Cipher::_AES256_dataBlk);
if (identical_dataBlk_len) {
int cv_len = VM_Version::Cipher::_AES128_dataBlk;
__ z_mvc(0, cv_len-1, cv, 0, parmBlk); // Copy cv.
} else {
int cv_len;
Label parmBlk_128, parmBlk_192, parmBlk_256, parmBlk_set;
__ z_lg(keylen, -8, parmBlk); // restore keylen
__ z_cghi(keylen, 52);
if (VM_Version::has_Crypto_AES256()) __ z_brh(parmBlk_256); // keyLen > 52: AES256
if (VM_Version::has_Crypto_AES192()) __ z_bre(parmBlk_192); // keyLen == 52: AES192
// if (VM_Version::has_Crypto_AES128()) __ z_brl(parmBlk_128); // keyLen < 52: AES128 // fallthru
// Security net: there is no one here. If we would need it, we should have
// fallen into it already when pushing the parameter block.
if (VM_Version::has_Crypto_AES128()) {
__ bind(parmBlk_128);
cv_len = VM_Version::Cipher::_AES128_dataBlk;
__ z_mvc(0, cv_len-1, cv, 0, parmBlk); // Copy cv.
if (VM_Version::has_Crypto_AES192() || VM_Version::has_Crypto_AES256()) {
__ z_bru(parmBlk_set);
}
}
if (VM_Version::has_Crypto_AES192()) {
__ bind(parmBlk_192);
cv_len = VM_Version::Cipher::_AES192_dataBlk;
__ z_mvc(0, cv_len-1, cv, 0, parmBlk); // Copy cv.
if (VM_Version::has_Crypto_AES256()) {
__ z_bru(parmBlk_set);
}
}
if (VM_Version::has_Crypto_AES256()) {
__ bind(parmBlk_256);
cv_len = VM_Version::Cipher::_AES256_dataBlk;
__ z_mvc(0, cv_len-1, cv, 0, parmBlk); // Copy cv.
// __ z_bru(parmBlk_set); // fallthru
}
__ bind(parmBlk_set);
}
__ z_lg(Z_SP, -16, parmBlk); // Revert resize_frame_absolute. Z_SP saved by push_parmBlk.
BLOCK_COMMENT("} pop parmBlk");
}
// Compute AES encrypt/decrypt function.
void generate_AES_cipherBlock(bool is_decipher) {
// Incoming arguments.
Register from = Z_ARG1; // source byte array
Register to = Z_ARG2; // destination byte array
Register key = Z_ARG3; // expanded key array
const Register keylen = Z_R0; // Temporarily (until fCode is set) holds the expanded key array length.
// Register definitions as required by KM instruction.
const Register fCode = Z_R0; // crypto function code
const Register parmBlk = Z_R1; // parameter block address (points to crypto key)
const Register src = Z_ARG1; // Must be even reg (KM requirement).
const Register srclen = Z_ARG2; // Must be odd reg and pair with src. Overwrites destination address.
const Register dst = Z_ARG3; // Must be even reg (KM requirement). Overwrites expanded key address.
// Read key len of expanded key (in 4-byte words).
__ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Copy arguments to registers as required by crypto instruction.
__ z_lgr(parmBlk, key); // crypto key (in T_INT array).
__ lgr_if_needed(src, from); // Copy src address. Will not emit, src/from are identical.
__ z_lgr(dst, to); // Copy dst address, even register required.
// Construct function code into fCode(Z_R0), data block length into srclen(Z_ARG2).
generate_load_AES_fCode(keylen, fCode, srclen, is_decipher);
__ km(dst, src); // Cipher the message.
__ z_br(Z_R14);
}
// Compute AES encrypt function.
address generate_AES_encryptBlock(const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
generate_AES_cipherBlock(false);
return __ addr_at(start_off);
}
// Compute AES decrypt function.
address generate_AES_decryptBlock(const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
generate_AES_cipherBlock(true);
return __ addr_at(start_off);
}
// These stubs receive the addresses of the cryptographic key and of the chaining value as two separate
// arguments (registers "key" and "cv", respectively). The KMC instruction, on the other hand, requires
// chaining value and key to be, in this sequence, adjacent in storage. Thus, we need to allocate some
// thread-local working storage. Using heap memory incurs all the hassles of allocating/freeing.
// Stack space, on the contrary, is deallocated automatically when we return from the stub to the caller.
// *** WARNING ***
// Please note that we do not formally allocate stack space, nor do we
// update the stack pointer. Therefore, no function calls are allowed
// and nobody else must use the stack range where the parameter block
// is located.
// We align the parameter block to the next available octoword.
//
// Compute chained AES encrypt function.
void generate_AES_cipherBlockChaining(bool is_decipher) {
Register from = Z_ARG1; // source byte array (clear text)
Register to = Z_ARG2; // destination byte array (ciphered)
Register key = Z_ARG3; // expanded key array.
Register cv = Z_ARG4; // chaining value
const Register msglen = Z_ARG5; // Total length of the msg to be encrypted. Value must be returned
// in Z_RET upon completion of this stub. Is 32-bit integer.
const Register keylen = Z_R0; // Expanded key length, as read from key array. Temp only.
const Register fCode = Z_R0; // crypto function code
const Register parmBlk = Z_R1; // parameter block address (points to crypto key)
const Register src = Z_ARG1; // is Z_R2
const Register srclen = Z_ARG2; // Overwrites destination address.
const Register dst = Z_ARG3; // Overwrites key address.
// Read key len of expanded key (in 4-byte words).
__ z_lgf(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
// Construct parm block address in parmBlk (== Z_R1), copy cv and key to parm block.
// Construct function code in fCode (Z_R0).
generate_push_parmBlk(keylen, fCode, parmBlk, key, cv, is_decipher);
// Prepare other registers for instruction.
__ lgr_if_needed(src, from); // Copy src address. Will not emit, src/from are identical.
__ z_lgr(dst, to);
__ z_llgfr(srclen, msglen); // We pass the offsets as ints, not as longs as required.
__ kmc(dst, src); // Cipher the message.
generate_pop_parmBlk(keylen, parmBlk, key, cv);
__ z_llgfr(Z_RET, msglen); // We pass the offsets as ints, not as longs as required.
__ z_br(Z_R14);
}
// Compute chained AES encrypt function.
address generate_cipherBlockChaining_AES_encrypt(const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
generate_AES_cipherBlockChaining(false);
return __ addr_at(start_off);
}
// Compute chained AES encrypt function.
address generate_cipherBlockChaining_AES_decrypt(const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
generate_AES_cipherBlockChaining(true);
return __ addr_at(start_off);
}
// Call interface for all SHA* stubs.
//
// Z_ARG1 - source data block. Ptr to leftmost byte to be processed.
// Z_ARG2 - current SHA state. Ptr to state area. This area serves as
// parameter block as required by the crypto instruction.
// Z_ARG3 - current byte offset in source data block.
// Z_ARG4 - last byte offset in source data block.
// (Z_ARG4 - Z_ARG3) gives the #bytes remaining to be processed.
//
// Z_RET - return value. First unprocessed byte offset in src buffer.
//
// A few notes on the call interface:
// - All stubs, whether they are single-block or multi-block, are assumed to
// digest an integer multiple of the data block length of data. All data
// blocks are digested using the intermediate message digest (KIMD) instruction.
// Special end processing, as done by the KLMD instruction, seems to be
// emulated by the calling code.
//
// - Z_ARG1 addresses the first byte of source data. The offset (Z_ARG3) is
// already accounted for.
//
// - The current SHA state (the intermediate message digest value) is contained
// in an area addressed by Z_ARG2. The area size depends on the SHA variant
// and is accessible via the enum VM_Version::MsgDigest::_SHA<n>_parmBlk_I
//
// - The single-block stub is expected to digest exactly one data block, starting
// at the address passed in Z_ARG1.
//
// - The multi-block stub is expected to digest all data blocks which start in
// the offset interval [srcOff(Z_ARG3), srcLimit(Z_ARG4)). The exact difference
// (srcLimit-srcOff), rounded up to the next multiple of the data block length,
// gives the number of blocks to digest. It must be assumed that the calling code
// provides for a large enough source data buffer.
//
// Compute SHA-1 function.
address generate_SHA1_stub(bool multiBlock, const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
const Register srcBuff = Z_ARG1; // Points to first block to process (offset already added).
const Register SHAState = Z_ARG2; // Only on entry. Reused soon thereafter for kimd register pairs.
const Register srcOff = Z_ARG3; // int
const Register srcLimit = Z_ARG4; // Only passed in multiBlock case. int
const Register SHAState_local = Z_R1;
const Register SHAState_save = Z_ARG3;
const Register srcBufLen = Z_ARG2; // Destroys state address, must be copied before.
Label useKLMD, rtn;
__ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA1); // function code
__ z_lgr(SHAState_local, SHAState); // SHAState == parameter block
if (multiBlock) { // Process everything from offset to limit.
// The following description is valid if we get a raw (unpimped) source data buffer,
// spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailled above,
// the calling convention for these stubs is different. We leave the description in
// to inform the reader what must be happening hidden in the calling code.
//
// The data block to be processed can have arbitrary length, i.e. its length does not
// need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement
// two different paths. If the length is an integer multiple, we use KIMD, saving us
// to copy the SHA state back and forth. If the length is odd, we copy the SHA state
// to the stack, execute a KLMD instruction on it and copy the result back to the
// caller's SHA state location.
// Total #srcBuff blocks to process.
if (VM_Version::has_DistinctOpnds()) {
__ z_srk(srcBufLen, srcLimit, srcOff); // exact difference
__ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1); // round up
__ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
__ z_ark(srcLimit, srcOff, srcBufLen); // Srclimit temporarily holds return value.
__ z_llgfr(srcBufLen, srcBufLen); // Cast to 64-bit.
} else {
__ z_lgfr(srcBufLen, srcLimit); // Exact difference. srcLimit passed as int.
__ z_sgfr(srcBufLen, srcOff); // SrcOff passed as int, now properly casted to long.
__ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1); // round up
__ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA1_dataBlk-1)) & 0xffff);
__ z_lgr(srcLimit, srcOff); // SrcLimit temporarily holds return value.
__ z_agr(srcLimit, srcBufLen);
}
// Integral #blocks to digest?
// As a result of the calculations above, srcBufLen MUST be an integer
// multiple of _SHA1_dataBlk, or else we are in big trouble.
// We insert an asm_assert into the KLMD case to guard against that.
__ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA1_dataBlk-1);
__ z_brc(Assembler::bcondNotAllZero, useKLMD);
// Process all full blocks.
__ kimd(srcBuff);
__ z_lgr(Z_RET, srcLimit); // Offset of first unprocessed byte in buffer.
} else { // Process one data block only.
__ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA1_dataBlk); // #srcBuff bytes to process
__ kimd(srcBuff);
__ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA1_dataBlk, srcOff); // Offset of first unprocessed byte in buffer. No 32 to 64 bit extension needed.
}
__ bind(rtn);
__ z_br(Z_R14);
if (multiBlock) {
__ bind(useKLMD);
#if 1
// Security net: this stub is believed to be called for full-sized data blocks only
// NOTE: The following code is believed to be correct, but is is not tested.
__ stop_static("SHA128 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
#endif
}
return __ addr_at(start_off);
}
// Compute SHA-256 function.
address generate_SHA256_stub(bool multiBlock, const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
const Register srcBuff = Z_ARG1;
const Register SHAState = Z_ARG2; // Only on entry. Reused soon thereafter.
const Register SHAState_local = Z_R1;
const Register SHAState_save = Z_ARG3;
const Register srcOff = Z_ARG3;
const Register srcLimit = Z_ARG4;
const Register srcBufLen = Z_ARG2; // Destroys state address, must be copied before.
Label useKLMD, rtn;
__ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA256); // function code
__ z_lgr(SHAState_local, SHAState); // SHAState == parameter block
if (multiBlock) { // Process everything from offset to limit.
// The following description is valid if we get a raw (unpimped) source data buffer,
// spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailled above,
// the calling convention for these stubs is different. We leave the description in
// to inform the reader what must be happening hidden in the calling code.
//
// The data block to be processed can have arbitrary length, i.e. its length does not
// need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement
// two different paths. If the length is an integer multiple, we use KIMD, saving us
// to copy the SHA state back and forth. If the length is odd, we copy the SHA state
// to the stack, execute a KLMD instruction on it and copy the result back to the
// caller's SHA state location.
// total #srcBuff blocks to process
if (VM_Version::has_DistinctOpnds()) {
__ z_srk(srcBufLen, srcLimit, srcOff); // exact difference
__ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
__ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
__ z_ark(srcLimit, srcOff, srcBufLen); // Srclimit temporarily holds return value.
__ z_llgfr(srcBufLen, srcBufLen); // Cast to 64-bit.
} else {
__ z_lgfr(srcBufLen, srcLimit); // exact difference
__ z_sgfr(srcBufLen, srcOff);
__ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1); // round up
__ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA256_dataBlk-1)) & 0xffff);
__ z_lgr(srcLimit, srcOff); // Srclimit temporarily holds return value.
__ z_agr(srcLimit, srcBufLen);
}
// Integral #blocks to digest?
// As a result of the calculations above, srcBufLen MUST be an integer
// multiple of _SHA1_dataBlk, or else we are in big trouble.
// We insert an asm_assert into the KLMD case to guard against that.
__ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA256_dataBlk-1);
__ z_brc(Assembler::bcondNotAllZero, useKLMD);
// Process all full blocks.
__ kimd(srcBuff);
__ z_lgr(Z_RET, srcLimit); // Offset of first unprocessed byte in buffer.
} else { // Process one data block only.
__ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA256_dataBlk); // #srcBuff bytes to process
__ kimd(srcBuff);
__ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA256_dataBlk, srcOff); // Offset of first unprocessed byte in buffer.
}
__ bind(rtn);
__ z_br(Z_R14);
if (multiBlock) {
__ bind(useKLMD);
#if 1
// Security net: this stub is believed to be called for full-sized data blocks only.
// NOTE:
// The following code is believed to be correct, but is is not tested.
__ stop_static("SHA256 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
#endif
}
return __ addr_at(start_off);
}
// Compute SHA-512 function.
address generate_SHA512_stub(bool multiBlock, const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
const Register srcBuff = Z_ARG1;
const Register SHAState = Z_ARG2; // Only on entry. Reused soon thereafter.
const Register SHAState_local = Z_R1;
const Register SHAState_save = Z_ARG3;
const Register srcOff = Z_ARG3;
const Register srcLimit = Z_ARG4;
const Register srcBufLen = Z_ARG2; // Destroys state address, must be copied before.
Label useKLMD, rtn;
__ load_const_optimized(Z_R0, (int)VM_Version::MsgDigest::_SHA512); // function code
__ z_lgr(SHAState_local, SHAState); // SHAState == parameter block
if (multiBlock) { // Process everything from offset to limit.
// The following description is valid if we get a raw (unpimped) source data buffer,
// spanning the range between [srcOff(Z_ARG3), srcLimit(Z_ARG4)). As detailled above,
// the calling convention for these stubs is different. We leave the description in
// to inform the reader what must be happening hidden in the calling code.
//
// The data block to be processed can have arbitrary length, i.e. its length does not
// need to be an integer multiple of SHA<n>_datablk. Therefore, we need to implement
// two different paths. If the length is an integer multiple, we use KIMD, saving us
// to copy the SHA state back and forth. If the length is odd, we copy the SHA state
// to the stack, execute a KLMD instruction on it and copy the result back to the
// caller's SHA state location.
// total #srcBuff blocks to process
if (VM_Version::has_DistinctOpnds()) {
__ z_srk(srcBufLen, srcLimit, srcOff); // exact difference
__ z_ahi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
__ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
__ z_ark(srcLimit, srcOff, srcBufLen); // Srclimit temporarily holds return value.
__ z_llgfr(srcBufLen, srcBufLen); // Cast to 64-bit.
} else {
__ z_lgfr(srcBufLen, srcLimit); // exact difference
__ z_sgfr(srcBufLen, srcOff);
__ z_aghi(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1); // round up
__ z_nill(srcBufLen, (~(VM_Version::MsgDigest::_SHA512_dataBlk-1)) & 0xffff);
__ z_lgr(srcLimit, srcOff); // Srclimit temporarily holds return value.
__ z_agr(srcLimit, srcBufLen);
}
// integral #blocks to digest?
// As a result of the calculations above, srcBufLen MUST be an integer
// multiple of _SHA1_dataBlk, or else we are in big trouble.
// We insert an asm_assert into the KLMD case to guard against that.
__ z_tmll(srcBufLen, VM_Version::MsgDigest::_SHA512_dataBlk-1);
__ z_brc(Assembler::bcondNotAllZero, useKLMD);
// Process all full blocks.
__ kimd(srcBuff);
__ z_lgr(Z_RET, srcLimit); // Offset of first unprocessed byte in buffer.
} else { // Process one data block only.
__ load_const_optimized(srcBufLen, (int)VM_Version::MsgDigest::_SHA512_dataBlk); // #srcBuff bytes to process
__ kimd(srcBuff);
__ add2reg(Z_RET, (int)VM_Version::MsgDigest::_SHA512_dataBlk, srcOff); // Offset of first unprocessed byte in buffer.
}
__ bind(rtn);
__ z_br(Z_R14);
if (multiBlock) {
__ bind(useKLMD);
#if 1
// Security net: this stub is believed to be called for full-sized data blocks only
// NOTE:
// The following code is believed to be correct, but is is not tested.
__ stop_static("SHA512 stub can digest full data blocks only. Use -XX:-UseSHA as remedy.", 0);
#endif
}
return __ addr_at(start_off);
}
/**
* Arguments:
*
* Inputs:
* Z_ARG1 - int crc
* Z_ARG2 - byte* buf
* Z_ARG3 - int length (of buffer)
*
* Result:
* Z_RET - int crc result
**/
// Compute CRC function (generic, for all polynomials).
void generate_CRC_updateBytes(const char* name, Register table, bool invertCRC) {
// arguments to kernel_crc32:
Register crc = Z_ARG1; // Current checksum, preset by caller or result from previous call, int.
Register data = Z_ARG2; // source byte array
Register dataLen = Z_ARG3; // #bytes to process, int
// Register table = Z_ARG4; // crc table address. Preloaded and passed in by caller.
const Register t0 = Z_R10; // work reg for kernel* emitters
const Register t1 = Z_R11; // work reg for kernel* emitters
const Register t2 = Z_R12; // work reg for kernel* emitters
const Register t3 = Z_R13; // work reg for kernel* emitters
assert_different_registers(crc, data, dataLen, table);
// We pass these values as ints, not as longs as required by C calling convention.
// Crc used as int.
__ z_llgfr(dataLen, dataLen);
__ resize_frame(-(6*8), Z_R0, true); // Resize frame to provide add'l space to spill 5 registers.
__ z_stmg(Z_R10, Z_R13, 1*8, Z_SP); // Spill regs 10..11 to make them available as work registers.
__ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, invertCRC);
__ z_lmg(Z_R10, Z_R13, 1*8, Z_SP); // Spill regs 10..11 back from stack.
__ resize_frame(+(6*8), Z_R0, true); // Resize frame to provide add'l space to spill 5 registers.
__ z_llgfr(Z_RET, crc); // Updated crc is function result. No copying required, just zero upper 32 bits.
__ z_br(Z_R14); // Result already in Z_RET == Z_ARG1.
}
// Compute CRC32 function.
address generate_CRC32_updateBytes(const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
assert(UseCRC32Intrinsics, "should not generate this stub (%s) with CRC32 intrinsics disabled", name);
BLOCK_COMMENT("CRC32_updateBytes {");
Register table = Z_ARG4; // crc32 table address.
StubRoutines::zarch::generate_load_crc_table_addr(_masm, table);
generate_CRC_updateBytes(name, table, true);
BLOCK_COMMENT("} CRC32_updateBytes");
return __ addr_at(start_off);
}
// Compute CRC32C function.
address generate_CRC32C_updateBytes(const char* name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
unsigned int start_off = __ offset(); // Remember stub start address (is rtn value).
assert(UseCRC32CIntrinsics, "should not generate this stub (%s) with CRC32C intrinsics disabled", name);
BLOCK_COMMENT("CRC32C_updateBytes {");
Register table = Z_ARG4; // crc32c table address.
StubRoutines::zarch::generate_load_crc32c_table_addr(_masm, table);
generate_CRC_updateBytes(name, table, false);
BLOCK_COMMENT("} CRC32C_updateBytes");
return __ addr_at(start_off);
}
// Arguments:
// Z_ARG1 - x address
// Z_ARG2 - x length
// Z_ARG3 - y address
// Z_ARG4 - y length
// Z_ARG5 - z address
// 160[Z_SP] - z length
address generate_multiplyToLen() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
address start = __ pc();
const Register x = Z_ARG1;
const Register xlen = Z_ARG2;
const Register y = Z_ARG3;
const Register ylen = Z_ARG4;
const Register z = Z_ARG5;
// zlen is passed on the stack:
// Address zlen(Z_SP, _z_abi(remaining_cargs));
// Next registers will be saved on stack in multiply_to_len().
const Register tmp1 = Z_tmp_1;
const Register tmp2 = Z_tmp_2;
const Register tmp3 = Z_tmp_3;
const Register tmp4 = Z_tmp_4;
const Register tmp5 = Z_R9;
BLOCK_COMMENT("Entry:");
__ z_llgfr(xlen, xlen);
__ z_llgfr(ylen, ylen);
__ multiply_to_len(x, xlen, y, ylen, z, tmp1, tmp2, tmp3, tmp4, tmp5);
__ z_br(Z_R14); // Return to caller.
return start;
}
void generate_initial() {
// Generates all stubs and initializes the entry points.
// Entry points that exist in all platforms.
// Note: This is code that could be shared among different
// platforms - however the benefit seems to be smaller than the
// disadvantage of having a much more complicated generator
// structure. See also comment in stubRoutines.hpp.
StubRoutines::_forward_exception_entry = generate_forward_exception();
StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
StubRoutines::_catch_exception_entry = generate_catch_exception();
// Build this early so it's available for the interpreter.
StubRoutines::_throw_StackOverflowError_entry =
generate_throw_exception("StackOverflowError throw_exception",
CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
StubRoutines::_throw_delayed_StackOverflowError_entry =
generate_throw_exception("delayed StackOverflowError throw_exception",
CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
//----------------------------------------------------------------------
// Entry points that are platform specific.
if (UseCRC32Intrinsics) {
StubRoutines::_crc_table_adr = (address)StubRoutines::zarch::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
}
if (UseCRC32CIntrinsics) {
StubRoutines::_crc32c_table_addr = (address)StubRoutines::zarch::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_CRC32C_updateBytes("CRC32C_updateBytes");
}
// Comapct string intrinsics: Translate table for string inflate intrinsic. Used by trot instruction.
StubRoutines::zarch::_trot_table_addr = (address)StubRoutines::zarch::_trot_table;
}
void generate_all() {
// Generates all stubs and initializes the entry points.
StubRoutines::zarch::_partial_subtype_check = generate_partial_subtype_check();
// These entry points require SharedInfo::stack0 to be set up in non-core builds.
StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
// Support for verify_oop (must happen after universe_init).
StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();
// Arraycopy stubs used by compilers.
generate_arraycopy_stubs();
// safefetch stubs
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, &StubRoutines::_safefetch32_continuation_pc);
generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, &StubRoutines::_safefetchN_fault_pc, &StubRoutines::_safefetchN_continuation_pc);
// Generate AES intrinsics code.
if (UseAESIntrinsics) {
StubRoutines::_aescrypt_encryptBlock = generate_AES_encryptBlock("AES_encryptBlock");
StubRoutines::_aescrypt_decryptBlock = generate_AES_decryptBlock("AES_decryptBlock");
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_AES_encrypt("AES_encryptBlock_chaining");
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_AES_decrypt("AES_decryptBlock_chaining");
}
// Generate SHA1/SHA256/SHA512 intrinsics code.
if (UseSHA1Intrinsics) {
StubRoutines::_sha1_implCompress = generate_SHA1_stub(false, "SHA1_singleBlock");
StubRoutines::_sha1_implCompressMB = generate_SHA1_stub(true, "SHA1_multiBlock");
}
if (UseSHA256Intrinsics) {
StubRoutines::_sha256_implCompress = generate_SHA256_stub(false, "SHA256_singleBlock");
StubRoutines::_sha256_implCompressMB = generate_SHA256_stub(true, "SHA256_multiBlock");
}
if (UseSHA512Intrinsics) {
StubRoutines::_sha512_implCompress = generate_SHA512_stub(false, "SHA512_singleBlock");
StubRoutines::_sha512_implCompressMB = generate_SHA512_stub(true, "SHA512_multiBlock");
}
#ifdef COMPILER2
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
}
if (UseMontgomerySquareIntrinsic) {
StubRoutines::_montgomerySquare
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
}
#endif
}
public:
StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
// Replace the standard masm with a special one:
_masm = new MacroAssembler(code);
_stub_count = !all ? 0x100 : 0x200;
if (all) {
generate_all();
} else {
generate_initial();
}
}
private:
int _stub_count;
void stub_prolog(StubCodeDesc* cdesc) {
#ifdef ASSERT
// Put extra information in the stub code, to make it more readable.
// Write the high part of the address.
// [RGV] Check if there is a dependency on the size of this prolog.
__ emit_32((intptr_t)cdesc >> 32);
__ emit_32((intptr_t)cdesc);
__ emit_32(++_stub_count);
#endif
align(true);
}
void align(bool at_header = false) {
// z/Architecture cache line size is 256 bytes.
// There is no obvious benefit in aligning stub
// code to cache lines. Use CodeEntryAlignment instead.
const unsigned int icache_line_size = CodeEntryAlignment;
const unsigned int icache_half_line_size = MIN2<unsigned int>(32, CodeEntryAlignment);
if (at_header) {
while ((intptr_t)(__ pc()) % icache_line_size != 0) {
__ emit_16(0);
}
} else {
while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
__ z_nop();
}
}
}
};
void StubGenerator_generate(CodeBuffer* code, bool all) {
StubGenerator g(code, all);
}