8164888: Intrinsify fused mac operations on SPARC
Summary: Such speed, much wow
Reviewed-by: kvn
Contributed-by: phedlin@oracle.com
--- a/hotspot/src/cpu/sparc/vm/abstractInterpreter_sparc.cpp Tue Jun 27 15:46:16 2017 +0200
+++ b/hotspot/src/cpu/sparc/vm/abstractInterpreter_sparc.cpp Tue Jun 27 15:50:09 2017 +0200
@@ -52,8 +52,16 @@
return i;
}
+// The Math.fma (float/double) methods should never be compiled, since the
+// interpreter will prefer the compiled version to the intrinsic version.
bool AbstractInterpreter::can_be_compiled(methodHandle m) {
- // No special entry points that preclude compilation
+ switch (method_kind(m)) {
+ case Interpreter::java_lang_math_fmaD:
+ case Interpreter::java_lang_math_fmaF:
+ return false;
+ default:
+ break;
+ }
return true;
}
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp Tue Jun 27 15:46:16 2017 +0200
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp Tue Jun 27 15:50:09 2017 +0200
@@ -628,6 +628,9 @@
// CRC32C instruction supported only on certain processors
static void crc32c_only() { assert(VM_Version::has_crc32c(), "This instruction only works on SPARC with CRC32C"); }
+ // FMAf instructions supported only on certain processors
+ static void fmaf_only() { assert(VM_Version::has_fmaf(), "This instruction only works on SPARC with FMAf"); }
+
// instruction only in VIS1
static void vis1_only() { assert(VM_Version::has_vis1(), "This instruction only works on SPARC with VIS1"); }
@@ -923,6 +926,10 @@
inline void fsqrt(FloatRegisterImpl::Width w, FloatRegister s, FloatRegister d);
+ // fmaf instructions.
+
+ inline void fmadd(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d);
+
// pp 165
inline void flush(Register s1, Register s2);
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.inline.hpp Tue Jun 27 15:46:16 2017 +0200
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.inline.hpp Tue Jun 27 15:50:09 2017 +0200
@@ -355,6 +355,11 @@
emit_int32(op(arith_op) | fd(d, w) | op3(fpop1_op3) | opf(0x28 + w) | fs2(s, w));
}
+inline void Assembler::fmadd(FloatRegisterImpl::Width w, FloatRegister s1, FloatRegister s2, FloatRegister s3, FloatRegister d) {
+ fmaf_only();
+ emit_int32(op(arith_op) | fd(d, w) | op3(stpartialf_op3) | fs1(s1, w) | fs3(s3, w) | op5(w) | fs2(s2, w));
+}
+
inline void Assembler::flush(Register s1, Register s2) {
emit_int32(op(arith_op) | op3(flush_op3) | rs1(s1) | rs2(s2));
}
--- a/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp Tue Jun 27 15:46:16 2017 +0200
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp Tue Jun 27 15:50:09 2017 +0200
@@ -440,6 +440,31 @@
}
void LIR_Assembler::emit_op3(LIR_Op3* op) {
+ switch (op->code()) {
+ case lir_idiv:
+ case lir_irem: // Both idiv & irem are handled after the switch (below).
+ break;
+ case lir_fmaf:
+ __ fmadd(FloatRegisterImpl::S,
+ op->in_opr1()->as_float_reg(),
+ op->in_opr2()->as_float_reg(),
+ op->in_opr3()->as_float_reg(),
+ op->result_opr()->as_float_reg());
+ return;
+ case lir_fmad:
+ __ fmadd(FloatRegisterImpl::D,
+ op->in_opr1()->as_double_reg(),
+ op->in_opr2()->as_double_reg(),
+ op->in_opr3()->as_double_reg(),
+ op->result_opr()->as_double_reg());
+ return;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+
+ // Handle idiv & irem:
+
Register Rdividend = op->in_opr1()->as_register();
Register Rdivisor = noreg;
Register Rscratch = op->in_opr3()->as_register();
--- a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp Tue Jun 27 15:46:16 2017 +0200
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp Tue Jun 27 15:50:09 2017 +0200
@@ -953,7 +953,29 @@
}
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
- fatal("FMA intrinsic is not implemented on this platform");
+ assert(x->number_of_arguments() == 3, "wrong type");
+ assert(UseFMA, "Needs FMA instructions support.");
+
+ LIRItem a(x->argument_at(0), this);
+ LIRItem b(x->argument_at(1), this);
+ LIRItem c(x->argument_at(2), this);
+
+ a.load_item();
+ b.load_item();
+ c.load_item();
+
+ LIR_Opr ina = a.result();
+ LIR_Opr inb = b.result();
+ LIR_Opr inc = c.result();
+ LIR_Opr res = rlock_result(x);
+
+ switch (x->id()) {
+ case vmIntrinsics::_fmaF: __ fmaf(ina, inb, inc, res); break;
+ case vmIntrinsics::_fmaD: __ fmad(ina, inb, inc, res); break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
}
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
--- a/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp Tue Jun 27 15:46:16 2017 +0200
+++ b/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp Tue Jun 27 15:50:09 2017 +0200
@@ -33,7 +33,7 @@
define_pd_global(bool, BackgroundCompilation, true);
define_pd_global(bool, CICompileOSR, true);
-define_pd_global(bool, InlineIntrinsics, false);
+define_pd_global(bool, InlineIntrinsics, true);
define_pd_global(bool, PreferInterpreterNativeStubs, false);
define_pd_global(bool, ProfileTraps, true);
define_pd_global(bool, UseOnStackReplacement, true);
--- a/hotspot/src/cpu/sparc/vm/sparc.ad Tue Jun 27 15:46:16 2017 +0200
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad Tue Jun 27 15:50:09 2017 +0200
@@ -2627,6 +2627,33 @@
__ fsqrt(FloatRegisterImpl::D, Fsrc, Fdst);
%}
+
+
+enc_class fmadds (sflt_reg dst, sflt_reg a, sflt_reg b, sflt_reg c) %{
+ MacroAssembler _masm(&cbuf);
+
+ FloatRegister Frd = reg_to_SingleFloatRegister_object($dst$$reg);
+ FloatRegister Fra = reg_to_SingleFloatRegister_object($a$$reg);
+ FloatRegister Frb = reg_to_SingleFloatRegister_object($b$$reg);
+ FloatRegister Frc = reg_to_SingleFloatRegister_object($c$$reg);
+
+ __ fmadd(FloatRegisterImpl::S, Fra, Frb, Frc, Frd);
+%}
+
+enc_class fmaddd (dflt_reg dst, dflt_reg a, dflt_reg b, dflt_reg c) %{
+ MacroAssembler _masm(&cbuf);
+
+ FloatRegister Frd = reg_to_DoubleFloatRegister_object($dst$$reg);
+ FloatRegister Fra = reg_to_DoubleFloatRegister_object($a$$reg);
+ FloatRegister Frb = reg_to_DoubleFloatRegister_object($b$$reg);
+ FloatRegister Frc = reg_to_DoubleFloatRegister_object($c$$reg);
+
+ __ fmadd(FloatRegisterImpl::D, Fra, Frb, Frc, Frd);
+%}
+
+
+
+
enc_class fmovs (dflt_reg dst, dflt_reg src) %{
MacroAssembler _masm(&cbuf);
@@ -4540,6 +4567,26 @@
FDIV : C(17);
%}
+// Fused floating-point multiply-add float.
+pipe_class fmaF_regx4(regF dst, regF src1, regF src2, regF src3) %{
+ single_instruction;
+ dst : X(write);
+ src1 : E(read);
+ src2 : E(read);
+ src3 : E(read);
+ FM : R;
+%}
+
+// Fused floating-point multiply-add double.
+pipe_class fmaD_regx4(regD dst, regD src1, regD src2, regD src3) %{
+ single_instruction;
+ dst : X(write);
+ src1 : E(read);
+ src2 : E(read);
+ src3 : E(read);
+ FM : R;
+%}
+
// Floating Point Move/Negate/Abs Float
pipe_class faddF_reg(regF dst, regF src) %{
single_instruction;
@@ -7531,6 +7578,24 @@
ins_pipe(fdivD_reg_reg);
%}
+// Single precision fused floating-point multiply-add (d = a * b + c).
+instruct fmaF_regx4(regF dst, regF a, regF b, regF c) %{
+ predicate(UseFMA);
+ match(Set dst (FmaF c (Binary a b)));
+ format %{ "fmadds $a,$b,$c,$dst\t# $dst = $a * $b + $c" %}
+ ins_encode(fmadds(dst, a, b, c));
+ ins_pipe(fmaF_regx4);
+%}
+
+// Double precision fused floating-point multiply-add (d = a * b + c).
+instruct fmaD_regx4(regD dst, regD a, regD b, regD c) %{
+ predicate(UseFMA);
+ match(Set dst (FmaD c (Binary a b)));
+ format %{ "fmaddd $a,$b,$c,$dst\t# $dst = $a * $b + $c" %}
+ ins_encode(fmaddd(dst, a, b, c));
+ ins_pipe(fmaD_regx4);
+%}
+
//----------Logical Instructions-----------------------------------------------
// And Instructions
// Register And
--- a/hotspot/src/cpu/sparc/vm/templateInterpreterGenerator_sparc.cpp Tue Jun 27 15:46:16 2017 +0200
+++ b/hotspot/src/cpu/sparc/vm/templateInterpreterGenerator_sparc.cpp Tue Jun 27 15:50:09 2017 +0200
@@ -153,13 +153,12 @@
__ delayed()->srl( G4_scratch, 2, G4_scratch );
__ bind(NextArg);
-
}
__ bind(done);
__ ret();
- __ delayed()->
- restore(O0, 0, Lscratch); // caller's Lscratch gets the result handler
+ __ delayed()->restore(O0, 0, Lscratch); // caller's Lscratch gets the result handler
+
return entry;
}
@@ -177,7 +176,6 @@
// returns verified_entry_point or NULL
// we ignore it in any case
__ ba_short(Lcontinue);
-
}
@@ -196,7 +194,6 @@
// the call_VM checks for exception, so we should never return here.
__ should_not_reach_here();
return entry;
-
}
void TemplateInterpreterGenerator::save_native_result(void) {
@@ -474,7 +471,6 @@
__ delayed()->nop();
__ bind(done);
}
-
}
// Allocate monitor and lock method (asm interpreter)
@@ -590,7 +586,7 @@
// pop parameters from the callers stack by adjusting Lesp
// set O0 to Lesp
// compute X = (max_locals - num_parameters)
-// bump SP up by X to accomadate the extra locals
+// bump SP up by X to accommodate the extra locals
// compute X = max_expression_stack
// + vm_local_words
// + 16 words of register save area
@@ -688,7 +684,7 @@
// 1) Increase caller's SP by for the extra local space needed:
// (check for overflow)
// Efficient implementation of xload/xstore bytecodes requires
- // that arguments and non-argument locals are in a contigously
+ // that arguments and non-argument locals are in a contiguously
// addressable memory block => non-argument locals must be
// allocated in the caller's frame.
//
@@ -782,7 +778,7 @@
__ sub(Gframe_size, Glocals_size, Gframe_size);
//
- // bump SP to accomodate the extra locals
+ // bump SP to accommodate the extra locals
//
__ sub(SP, Glocals_size, SP);
}
@@ -810,9 +806,9 @@
Register mirror = LcpoolCache;
__ load_mirror(mirror, Lmethod);
__ st_ptr(mirror, FP, (frame::interpreter_frame_mirror_offset * wordSize) + STACK_BIAS);
- __ get_constant_pool_cache( LcpoolCache ); // set LcpoolCache
+ __ get_constant_pool_cache(LcpoolCache); // set LcpoolCache
__ sub(FP, rounded_vm_local_words * BytesPerWord, Lmonitors ); // set Lmonitors
- __ add( Lmonitors, STACK_BIAS, Lmonitors ); // Account for 64 bit stack bias
+ __ add(Lmonitors, STACK_BIAS, Lmonitors); // Account for 64 bit stack bias
__ sub(Lmonitors, BytesPerWord, Lesp); // set Lesp
// setup interpreter activation registers
@@ -984,7 +980,7 @@
__ ldx( Gargs, 16, buf);
__ lduw(Gargs, 24, crc);
__ add(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE), buf); // account for the header size
- __ add(buf ,offset, buf);
+ __ add(buf, offset, buf);
}
// Call the crc32 kernel
@@ -1057,8 +1053,58 @@
return NULL;
}
-// Not supported
-address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKind kind) {
+/* Math routines only partially supported.
+ *
+ * Providing support for fma (float/double) only.
+ */
+address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKind kind)
+{
+ if (!InlineIntrinsics) return NULL; // Generate a vanilla entry
+
+ address entry = __ pc();
+
+ switch (kind) {
+ case Interpreter::java_lang_math_fmaF:
+ if (UseFMA) {
+ // float .fma(float a, float b, float c)
+ const FloatRegister ra = F1;
+ const FloatRegister rb = F2;
+ const FloatRegister rc = F3;
+ const FloatRegister rd = F0; // Result.
+
+ __ ldf(FloatRegisterImpl::S, Gargs, 0, rc);
+ __ ldf(FloatRegisterImpl::S, Gargs, 8, rb);
+ __ ldf(FloatRegisterImpl::S, Gargs, 16, ra);
+
+ __ fmadd(FloatRegisterImpl::S, ra, rb, rc, rd);
+ __ retl(); // Result in F0 (rd).
+ __ delayed()->mov(O5_savedSP, SP);
+
+ return entry;
+ }
+ break;
+ case Interpreter::java_lang_math_fmaD:
+ if (UseFMA) {
+ // double .fma(double a, double b, double c)
+ const FloatRegister ra = F2; // D1
+ const FloatRegister rb = F4; // D2
+ const FloatRegister rc = F6; // D3
+ const FloatRegister rd = F0; // D0 Result.
+
+ __ ldf(FloatRegisterImpl::D, Gargs, 0, rc);
+ __ ldf(FloatRegisterImpl::D, Gargs, 16, rb);
+ __ ldf(FloatRegisterImpl::D, Gargs, 32, ra);
+
+ __ fmadd(FloatRegisterImpl::D, ra, rb, rc, rd);
+ __ retl(); // Result in D0 (rd).
+ __ delayed()->mov(O5_savedSP, SP);
+
+ return entry;
+ }
+ break;
+ default:
+ break;
+ }
return NULL;
}
@@ -1071,7 +1117,7 @@
// Doing the banging earlier fails if the caller frame is not an interpreter
// frame.
// (Also, the exception throwing code expects to unlock any synchronized
- // method receiever, so do the banging after locking the receiver.)
+ // method receiver, so do the banging after locking the receiver.)
// Bang each page in the shadow zone. We can't assume it's been done for
// an interpreter frame with greater than a page of locals, so each page
@@ -1112,8 +1158,7 @@
// rethink these assertions - they can be simplified and shared (gri 2/25/2000)
#ifdef ASSERT
__ ld(G5_method, Method::access_flags_offset(), Gtmp1);
- {
- Label L;
+ { Label L;
__ btst(JVM_ACC_NATIVE, Gtmp1);
__ br(Assembler::notZero, false, Assembler::pt, L);
__ delayed()->nop();
@@ -1362,7 +1407,7 @@
// didn't see any synchronization is progress, and escapes.
__ set(_thread_in_native_trans, G3_scratch);
__ st(G3_scratch, thread_state);
- if(os::is_MP()) {
+ if (os::is_MP()) {
if (UseMembar) {
// Force this write out before the read below
__ membar(Assembler::StoreLoad);
@@ -1425,8 +1470,7 @@
// If we have an oop result store it where it will be safe for any further gc
// until we return now that we've released the handle it might be protected by
- {
- Label no_oop, store_result;
+ { Label no_oop, store_result;
__ set((intptr_t)AbstractInterpreter::result_handler(T_OBJECT), G3_scratch);
__ cmp_and_brx_short(G3_scratch, Lscratch, Assembler::notEqual, Assembler::pt, no_oop);
@@ -1484,8 +1528,7 @@
// dispose of return address and remove activation
#ifdef ASSERT
- {
- Label ok;
+ { Label ok;
__ cmp_and_brx_short(I5_savedSP, FP, Assembler::greaterEqualUnsigned, Assembler::pt, ok);
__ stop("bad I5_savedSP value");
__ should_not_reach_here();
@@ -1495,15 +1538,12 @@
__ jmp(Lscratch, 0);
__ delayed()->nop();
-
if (inc_counter) {
// handle invocation counter overflow
__ bind(invocation_counter_overflow);
generate_counter_overflow(Lcontinue);
}
-
-
return entry;
}
@@ -1533,8 +1573,7 @@
// rethink these assertions - they can be simplified and shared (gri 2/25/2000)
#ifdef ASSERT
__ ld(G5_method, Method::access_flags_offset(), Gtmp1);
- {
- Label L;
+ { Label L;
__ btst(JVM_ACC_NATIVE, Gtmp1);
__ br(Assembler::zero, false, Assembler::pt, L);
__ delayed()->nop();
@@ -1666,7 +1705,6 @@
generate_counter_overflow(Lcontinue);
}
-
return entry;
}
@@ -1786,8 +1824,7 @@
}
#if INCLUDE_JVMTI
- {
- Label L_done;
+ { Label L_done;
__ ldub(Address(Lbcp, 0), G1_scratch); // Load current bytecode
__ cmp_and_br_short(G1_scratch, Bytecodes::_invokestatic, Assembler::notEqual, Assembler::pn, L_done);
@@ -1827,7 +1864,7 @@
__ get_vm_result(Oexception);
__ verify_oop(Oexception);
- const int return_reg_adjustment = frame::pc_return_offset;
+ const int return_reg_adjustment = frame::pc_return_offset;
Address issuing_pc_addr(I7, return_reg_adjustment);
// We are done with this activation frame; find out where to go next.
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Jun 27 15:46:16 2017 +0200
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Jun 27 15:50:09 2017 +0200
@@ -317,7 +317,11 @@
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
- if (UseFMA) {
+ if (has_fmaf()) {
+ if (FLAG_IS_DEFAULT(UseFMA)) {
+ UseFMA = true;
+ }
+ } else if (UseFMA) {
warning("FMA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseFMA, false);
}
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp Tue Jun 27 15:46:16 2017 +0200
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp Tue Jun 27 15:50:09 2017 +0200
@@ -248,7 +248,7 @@
static jint verify_oop_count() { return _verify_oop_count; }
static jint* verify_oop_count_addr() { return &_verify_oop_count; }
// a subroutine for debugging the GC
- static address verify_oop_subroutine_entry_address() { return (address)&_verify_oop_subroutine_entry; }
+ static address verify_oop_subroutine_entry_address() { return (address)&_verify_oop_subroutine_entry; }
static address catch_exception_entry() { return _catch_exception_entry; }
@@ -335,8 +335,8 @@
static address checkcast_arraycopy(bool dest_uninitialized = false) {
return dest_uninitialized ? _checkcast_arraycopy_uninit : _checkcast_arraycopy;
}
- static address unsafe_arraycopy() { return _unsafe_arraycopy; }
- static address generic_arraycopy() { return _generic_arraycopy; }
+ static address unsafe_arraycopy() { return _unsafe_arraycopy; }
+ static address generic_arraycopy() { return _generic_arraycopy; }
static address jbyte_fill() { return _jbyte_fill; }
static address jshort_fill() { return _jshort_fill; }
@@ -349,8 +349,8 @@
static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; }
static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; }
static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; }
- static address counterMode_AESCrypt() { return _counterMode_AESCrypt; }
- static address ghash_processBlocks() { return _ghash_processBlocks; }
+ static address counterMode_AESCrypt() { return _counterMode_AESCrypt; }
+ static address ghash_processBlocks() { return _ghash_processBlocks; }
static address sha1_implCompress() { return _sha1_implCompress; }
static address sha1_implCompressMB() { return _sha1_implCompressMB; }
@@ -366,9 +366,9 @@
static address updateBytesCRC32C() { return _updateBytesCRC32C; }
static address updateBytesAdler32() { return _updateBytesAdler32; }
- static address multiplyToLen() {return _multiplyToLen; }
- static address squareToLen() {return _squareToLen; }
- static address mulAdd() {return _mulAdd; }
+ static address multiplyToLen() { return _multiplyToLen; }
+ static address squareToLen() { return _squareToLen; }
+ static address mulAdd() { return _mulAdd; }
static address montgomeryMultiply() { return _montgomeryMultiply; }
static address montgomerySquare() { return _montgomerySquare; }
@@ -376,7 +376,7 @@
static address dexp() { return _dexp; }
static address dlog() { return _dlog; }
- static address dlog10() { return _dlog10; }
+ static address dlog10() { return _dlog10; }
static address dpow() { return _dpow; }
static address dsin() { return _dsin; }
static address dcos() { return _dcos; }
@@ -387,7 +387,7 @@
static address select_fill_function(BasicType t, bool aligned, const char* &name);
- static address zero_aligned_words() { return _zero_aligned_words; }
+ static address zero_aligned_words() { return _zero_aligned_words; }
static double intrinsic_log10(double d) {
assert(_intrinsic_log10 != NULL, "must be defined");