8171398: s390x: Make interpreter's math entries consistent with C1 and C2 and support FMA
author mdoerr
Tue, 20 Dec 2016 14:55:18 +0100
changeset 42897 57e7b1c75d17
parent 42896 c868a2f327c6
child 42898 af1346a135ce
8171398: s390x: Make interpreter's math entries consistent with C1 and C2 and support FMA
Reviewed-by: lucy, goetz
hotspot/src/cpu/s390/vm/assembler_s390.hpp
hotspot/src/cpu/s390/vm/assembler_s390.inline.hpp
hotspot/src/cpu/s390/vm/c1_LIRAssembler_s390.cpp
hotspot/src/cpu/s390/vm/c1_LIRGenerator_s390.cpp
hotspot/src/cpu/s390/vm/s390.ad
hotspot/src/cpu/s390/vm/stubGenerator_s390.cpp
hotspot/src/cpu/s390/vm/templateInterpreterGenerator_s390.cpp
hotspot/src/cpu/s390/vm/vm_version_s390.cpp
--- a/hotspot/src/cpu/s390/vm/assembler_s390.hpp	Tue Dec 20 13:02:16 2016 +0000
+++ b/hotspot/src/cpu/s390/vm/assembler_s390.hpp	Tue Dec 20 14:55:18 2016 +0100
@@ -790,6 +790,16 @@
 #define MDB_ZOPC    (unsigned long)(237L << 40 | 28)
 #define MXDB_ZOPC   (unsigned long)(237L << 40 | 7)
 
+// Multiply-Add / Multiply-Subtract: opcode templates; operand fields are OR-ed in by the z_m* emitters.
+#define MAEBR_ZOPC  (unsigned  int)(179 << 24 | 14 << 16) // 0xB30E: multiply-add,      float,  register form (32-bit instruction)
+#define MADBR_ZOPC  (unsigned  int)(179 << 24 | 30 << 16) // 0xB31E: multiply-add,      double, register form (32-bit instruction)
+#define MSEBR_ZOPC  (unsigned  int)(179 << 24 | 15 << 16) // 0xB30F: multiply-subtract, float,  register form (32-bit instruction)
+#define MSDBR_ZOPC  (unsigned  int)(179 << 24 | 31 << 16) // 0xB31F: multiply-subtract, double, register form (32-bit instruction)
+#define MAEB_ZOPC   (unsigned long)(237L << 40 | 14)      // 0xED..0E: multiply-add,      float,  memory form (48-bit instruction)
+#define MADB_ZOPC   (unsigned long)(237L << 40 | 30)      // 0xED..1E: multiply-add,      double, memory form (48-bit instruction)
+#define MSEB_ZOPC   (unsigned long)(237L << 40 | 15)      // 0xED..0F: multiply-subtract, float,  memory form (48-bit instruction)
+#define MSDB_ZOPC   (unsigned long)(237L << 40 | 31)      // 0xED..1F: multiply-subtract, double, memory form (48-bit instruction)
+
 // Divide
 // RR, signed
 #define DSGFR_ZOPC  (unsigned  int)(0xb91d << 16)
@@ -2205,6 +2215,20 @@
   inline void z_meeb( FloatRegister f1, const Address& a);
   inline void z_mdb(  FloatRegister f1, const Address& a);
 
+  // MUL-ADD / MUL-SUB: f1 is both the addend (or subtrahend) and the result register.
+  inline void z_maebr(FloatRegister f1, FloatRegister f3, FloatRegister f2);    // f1 = f3 * f2 + f1          ; float
+  inline void z_madbr(FloatRegister f1, FloatRegister f3, FloatRegister f2);    // f1 = f3 * f2 + f1          ; double
+  inline void z_msebr(FloatRegister f1, FloatRegister f3, FloatRegister f2);    // f1 = f3 * f2 - f1          ; float
+  inline void z_msdbr(FloatRegister f1, FloatRegister f3, FloatRegister f2);    // f1 = f3 * f2 - f1          ; double
+  inline void z_maeb(FloatRegister f1, FloatRegister f3, int64_t d2, Register x2, Register b2); // f1 = f3 * *(d2+x2+b2) + f1 ; float
+  inline void z_madb(FloatRegister f1, FloatRegister f3, int64_t d2, Register x2, Register b2); // f1 = f3 * *(d2+x2+b2) + f1 ; double
+  inline void z_mseb(FloatRegister f1, FloatRegister f3, int64_t d2, Register x2, Register b2); // f1 = f3 * *(d2+x2+b2) - f1 ; float
+  inline void z_msdb(FloatRegister f1, FloatRegister f3, int64_t d2, Register x2, Register b2); // f1 = f3 * *(d2+x2+b2) - f1 ; double
+  inline void z_maeb(FloatRegister f1, FloatRegister f3, const Address& a);                     // f1 = f3 * *(a) + f1        ; float
+  inline void z_madb(FloatRegister f1, FloatRegister f3, const Address& a);                     // f1 = f3 * *(a) + f1        ; double
+  inline void z_mseb(FloatRegister f1, FloatRegister f3, const Address& a);                     // f1 = f3 * *(a) - f1        ; float
+  inline void z_msdb(FloatRegister f1, FloatRegister f3, const Address& a);                     // f1 = f3 * *(a) - f1        ; double
+
   // DIV
   inline void z_debr( FloatRegister f1, FloatRegister f2);                      // f1 = f1 / f2               ; float
   inline void z_ddbr( FloatRegister f1, FloatRegister f2);                      // f1 = f1 / f2               ; double
--- a/hotspot/src/cpu/s390/vm/assembler_s390.inline.hpp	Tue Dec 20 13:02:16 2016 +0000
+++ b/hotspot/src/cpu/s390/vm/assembler_s390.inline.hpp	Tue Dec 20 14:55:18 2016 +0100
@@ -778,6 +778,23 @@
 
 
 //---------------
+// MUL-ADD / MUL-SUB
+//---------------
+inline void Assembler::z_maebr(FloatRegister f1, FloatRegister f3, FloatRegister f2) { emit_32( MAEBR_ZOPC | fregt(f1, 16, 32) | freg(f3, 24, 32) | freg(f2, 28, 32) );} // f1 = f3 * f2 + f1 ; float
+inline void Assembler::z_madbr(FloatRegister f1, FloatRegister f3, FloatRegister f2) { emit_32( MADBR_ZOPC | fregt(f1, 16, 32) | freg(f3, 24, 32) | freg(f2, 28, 32) );} // f1 = f3 * f2 + f1 ; double
+inline void Assembler::z_msebr(FloatRegister f1, FloatRegister f3, FloatRegister f2) { emit_32( MSEBR_ZOPC | fregt(f1, 16, 32) | freg(f3, 24, 32) | freg(f2, 28, 32) );} // f1 = f3 * f2 - f1 ; float
+inline void Assembler::z_msdbr(FloatRegister f1, FloatRegister f3, FloatRegister f2) { emit_32( MSDBR_ZOPC | fregt(f1, 16, 32) | freg(f3, 24, 32) | freg(f2, 28, 32) );} // f1 = f3 * f2 - f1 ; double
+inline void Assembler::z_maeb(FloatRegister f1, FloatRegister f3, int64_t d2, Register x2, Register b2) { emit_48( MAEB_ZOPC | fregt(f1, 32, 48) | freg(f3, 8, 48) | uimm12(d2, 20, 48) | reg(x2, 12, 48) | regz(b2, 16, 48) );} // f1 = f3 * *(d2+x2+b2) + f1 ; float
+inline void Assembler::z_madb(FloatRegister f1, FloatRegister f3, int64_t d2, Register x2, Register b2) { emit_48( MADB_ZOPC | fregt(f1, 32, 48) | freg(f3, 8, 48) | uimm12(d2, 20, 48) | reg(x2, 12, 48) | regz(b2, 16, 48) );} // f1 = f3 * *(d2+x2+b2) + f1 ; double
+inline void Assembler::z_mseb(FloatRegister f1, FloatRegister f3, int64_t d2, Register x2, Register b2) { emit_48( MSEB_ZOPC | fregt(f1, 32, 48) | freg(f3, 8, 48) | uimm12(d2, 20, 48) | reg(x2, 12, 48) | regz(b2, 16, 48) );} // f1 = f3 * *(d2+x2+b2) - f1 ; float
+inline void Assembler::z_msdb(FloatRegister f1, FloatRegister f3, int64_t d2, Register x2, Register b2) { emit_48( MSDB_ZOPC | fregt(f1, 32, 48) | freg(f3, 8, 48) | uimm12(d2, 20, 48) | reg(x2, 12, 48) | regz(b2, 16, 48) );} // f1 = f3 * *(d2+x2+b2) - f1 ; double
+inline void Assembler::z_maeb(FloatRegister f1, FloatRegister f3, const Address& a) { z_maeb(f1, f3, a.disp(), a.indexOrR0(), a.baseOrR0()); } // Address form: delegates with a's displacement, index and base.
+inline void Assembler::z_madb(FloatRegister f1, FloatRegister f3, const Address& a) { z_madb(f1, f3, a.disp(), a.indexOrR0(), a.baseOrR0()); } // Address form: delegates with a's displacement, index and base.
+inline void Assembler::z_mseb(FloatRegister f1, FloatRegister f3, const Address& a) { z_mseb(f1, f3, a.disp(), a.indexOrR0(), a.baseOrR0()); } // Address form: delegates with a's displacement, index and base.
+inline void Assembler::z_msdb(FloatRegister f1, FloatRegister f3, const Address& a) { z_msdb(f1, f3, a.disp(), a.indexOrR0(), a.baseOrR0()); } // Address form: delegates with a's displacement, index and base.
+
+
+//---------------
 // DIV
 //---------------
 inline void Assembler::z_debr( FloatRegister f1, FloatRegister f2)                      { emit_32( DEBR_ZOPC | fregt( f1, 24, 32) | freg( f2, 28, 32));}
--- a/hotspot/src/cpu/s390/vm/c1_LIRAssembler_s390.cpp	Tue Dec 20 13:02:16 2016 +0000
+++ b/hotspot/src/cpu/s390/vm/c1_LIRAssembler_s390.cpp	Tue Dec 20 14:55:18 2016 +0100
@@ -324,6 +324,22 @@
                       op->result_opr(),
                       op->info());
       break;
+    case lir_fmad: { // Double-precision multiply-add on register operands.
+      const FloatRegister opr1 = op->in_opr1()->as_double_reg(),
+                          opr2 = op->in_opr2()->as_double_reg(),
+                          opr3 = op->in_opr3()->as_double_reg(),
+                          res  = op->result_opr()->as_double_reg();
+      __ z_madbr(opr3, opr1, opr2); // opr3 = opr1 * opr2 + opr3: the sum is accumulated in place in opr3.
+      if (res != opr3) { __ z_ldr(res, opr3); } // Copy the sum over if the result lives in a different register.
+    } break;
+    case lir_fmaf: { // Single-precision multiply-add on register operands.
+      const FloatRegister opr1 = op->in_opr1()->as_float_reg(),
+                          opr2 = op->in_opr2()->as_float_reg(),
+                          opr3 = op->in_opr3()->as_float_reg(),
+                          res  = op->result_opr()->as_float_reg();
+      __ z_maebr(opr3, opr1, opr2); // opr3 = opr1 * opr2 + opr3: the sum is accumulated in place in opr3.
+      if (res != opr3) { __ z_ler(res, opr3); } // Copy the sum over if the result lives in a different register.
+    } break;
     default: ShouldNotReachHere(); break;
   }
 }
--- a/hotspot/src/cpu/s390/vm/c1_LIRGenerator_s390.cpp	Tue Dec 20 13:02:16 2016 +0000
+++ b/hotspot/src/cpu/s390/vm/c1_LIRGenerator_s390.cpp	Tue Dec 20 14:55:18 2016 +0100
@@ -1237,7 +1237,28 @@
 }
 
 void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
-  fatal("FMA intrinsic is not implemented on this platform");
+  assert(x->number_of_arguments() == 3, "wrong type"); // The intrinsic takes exactly three operands.
+  assert(UseFMA, "Needs FMA instructions support.");
+  LIRItem value(x->argument_at(0), this);  // Passed as first input of the LIR fma op.
+  LIRItem value1(x->argument_at(1), this); // Passed as second input.
+  LIRItem value2(x->argument_at(2), this); // Passed as third input.
+
+  value2.set_destroys_register(); // NOTE(review): presumably needed because the s390 multiply-add accumulates into its third input's register (see lir_fmad/lir_fmaf in c1_LIRAssembler_s390.cpp) - confirm.
+
+  value.load_item();
+  value1.load_item();
+  value2.load_item();
+
+  LIR_Opr calc_input = value.result();
+  LIR_Opr calc_input1 = value1.result();
+  LIR_Opr calc_input2 = value2.result();
+  LIR_Opr calc_result = rlock_result(x);
+
+  switch (x->id()) { // Pick the LIR op by intrinsic id; any other id is a caller error.
+  case vmIntrinsics::_fmaD:   __ fmad(calc_input, calc_input1, calc_input2, calc_result); break;
+  case vmIntrinsics::_fmaF:   __ fmaf(calc_input, calc_input1, calc_input2, calc_result); break;
+  default:                    ShouldNotReachHere();
+  }
 }
 
 void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
--- a/hotspot/src/cpu/s390/vm/s390.ad	Tue Dec 20 13:02:16 2016 +0000
+++ b/hotspot/src/cpu/s390/vm/s390.ad	Tue Dec 20 14:55:18 2016 +0100
@@ -7249,6 +7249,171 @@
   ins_pipe(pipe_class_dummy);
 %}
 
+// Multiply-Accumulate
+// dst = src1 * src2 + dst (float; both factors in registers)
+instruct maddF_reg_reg(regF dst, regF src1, regF src2) %{
+  match(Set dst (FmaF dst (Binary src1 src2)));
+  // MUL-ADD leaves the condition code (CC) unchanged.
+  ins_cost(ALU_REG_COST);
+  size(4);
+  format %{ "MAEBR    $dst, $src1, $src2" %}
+  ins_encode %{
+    __ z_maebr($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 + dst (double; both factors in registers)
+instruct maddD_reg_reg(regD dst, regD src1, regD src2) %{
+  match(Set dst (FmaD dst (Binary src1 src2)));
+  // MUL-ADD leaves the condition code (CC) unchanged.
+  ins_cost(ALU_REG_COST);
+  size(4);
+  format %{ "MADBR    $dst, $src1, $src2" %}
+  ins_encode %{
+    __ z_madbr($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 - dst (float; both factors in registers). Matched as FmaF with a negated addend (NegF dst).
+instruct msubF_reg_reg(regF dst, regF src1, regF src2) %{
+  match(Set dst (FmaF (NegF dst) (Binary src1 src2)));
+  // MUL-SUB leaves the condition code (CC) unchanged.
+  ins_cost(ALU_REG_COST);
+  size(4);
+  format %{ "MSEBR    $dst, $src1, $src2" %}
+  ins_encode %{
+    __ z_msebr($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 - dst (double; both factors in registers). Matched as FmaD with a negated addend (NegD dst).
+instruct msubD_reg_reg(regD dst, regD src1, regD src2) %{
+  match(Set dst (FmaD (NegD dst) (Binary src1 src2)));
+  // MUL-SUB leaves the condition code (CC) unchanged.
+  ins_cost(ALU_REG_COST);
+  size(4);
+  format %{ "MSDBR    $dst, $src1, $src2" %}
+  ins_encode %{
+    __ z_msdbr($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister);
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 + dst (float; second factor src2 is read directly from memory)
+instruct maddF_reg_mem(regF dst, regF src1, memoryRX src2) %{
+  match(Set dst (FmaF dst (Binary src1 (LoadF src2))));
+  // MUL-ADD leaves the condition code (CC) unchanged.
+  ins_cost(ALU_MEMORY_COST);
+  size(6);
+  format %{ "MAEB     $dst, $src1, $src2" %}
+  ins_encode %{
+    __ z_maeb($dst$$FloatRegister, $src1$$FloatRegister,
+              Address(reg_to_register_object($src2$$base), $src2$$index$$Register, $src2$$disp));
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 + dst (double; second factor src2 is read directly from memory)
+instruct maddD_reg_mem(regD dst, regD src1, memoryRX src2) %{
+  match(Set dst (FmaD dst (Binary src1 (LoadD src2))));
+  // MUL-ADD leaves the condition code (CC) unchanged.
+  ins_cost(ALU_MEMORY_COST);
+  size(6);
+  format %{ "MADB     $dst, $src1, $src2" %}
+  ins_encode %{
+    __ z_madb($dst$$FloatRegister, $src1$$FloatRegister,
+              Address(reg_to_register_object($src2$$base), $src2$$index$$Register, $src2$$disp));
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 - dst (float; second factor src2 is read directly from memory). Matched as FmaF with a negated addend.
+instruct msubF_reg_mem(regF dst, regF src1, memoryRX src2) %{
+  match(Set dst (FmaF (NegF dst) (Binary src1 (LoadF src2))));
+  // MUL-SUB leaves the condition code (CC) unchanged.
+  ins_cost(ALU_MEMORY_COST);
+  size(6);
+  format %{ "MSEB     $dst, $src1, $src2" %}
+  ins_encode %{
+    __ z_mseb($dst$$FloatRegister, $src1$$FloatRegister,
+              Address(reg_to_register_object($src2$$base), $src2$$index$$Register, $src2$$disp));
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 - dst (double; second factor src2 is read directly from memory). Matched as FmaD with a negated addend.
+instruct msubD_reg_mem(regD dst, regD src1, memoryRX src2) %{
+  match(Set dst (FmaD (NegD dst) (Binary src1 (LoadD src2))));
+  // MUL-SUB leaves the condition code (CC) unchanged.
+  ins_cost(ALU_MEMORY_COST);
+  size(6);
+  format %{ "MSDB    $dst, $src1, $src2" %}
+  ins_encode %{
+    __ z_msdb($dst$$FloatRegister, $src1$$FloatRegister,
+              Address(reg_to_register_object($src2$$base), $src2$$index$$Register, $src2$$disp));
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 + dst (float; first factor src1 is read directly from memory)
+instruct maddF_mem_reg(regF dst, memoryRX src1, regF src2) %{
+  match(Set dst (FmaF dst (Binary (LoadF src1) src2)));
+  // MUL-ADD leaves the condition code (CC) unchanged.
+  ins_cost(ALU_MEMORY_COST);
+  size(6);
+  format %{ "MAEB     $dst, $src1, $src2" %}
+  ins_encode %{
+    // The assembler takes the memory factor last, so the factors are passed swapped (multiplication commutes).
+    // NOTE(review): the format string above lists the operands in match order, not in encoding order.
+    __ z_maeb($dst$$FloatRegister, $src2$$FloatRegister,
+              Address(reg_to_register_object($src1$$base), $src1$$index$$Register, $src1$$disp));
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 + dst (double; first factor src1 is read directly from memory)
+instruct maddD_mem_reg(regD dst, memoryRX src1, regD src2) %{
+  match(Set dst (FmaD dst (Binary (LoadD src1) src2)));
+  // MUL-ADD leaves the condition code (CC) unchanged.
+  ins_cost(ALU_MEMORY_COST);
+  size(6);
+  format %{ "MADB     $dst, $src1, $src2" %}
+  ins_encode %{
+    // The assembler takes the memory factor last, so the factors are passed swapped (multiplication commutes).
+    // NOTE(review): the format string above lists the operands in match order, not in encoding order.
+    __ z_madb($dst$$FloatRegister, $src2$$FloatRegister,
+              Address(reg_to_register_object($src1$$base), $src1$$index$$Register, $src1$$disp));
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 - dst (float; first factor src1 is read directly from memory). Matched as FmaF with a negated addend.
+instruct msubF_mem_reg(regF dst, memoryRX src1, regF src2) %{
+  match(Set dst (FmaF (NegF dst) (Binary (LoadF src1) src2)));
+  // MUL-SUB leaves the condition code (CC) unchanged.
+  ins_cost(ALU_MEMORY_COST);
+  size(6);
+  format %{ "MSEB     $dst, $src1, $src2" %}
+  ins_encode %{
+    // The assembler takes the memory factor last, so the factors are passed swapped (multiplication commutes).
+    // NOTE(review): the format string above lists the operands in match order, not in encoding order.
+    __ z_mseb($dst$$FloatRegister, $src2$$FloatRegister,
+              Address(reg_to_register_object($src1$$base), $src1$$index$$Register, $src1$$disp));
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// dst = src1 * src2 - dst (double; first factor src1 is read directly from memory). Matched as FmaD with a negated addend.
+instruct msubD_mem_reg(regD dst, memoryRX src1, regD src2) %{
+  match(Set dst (FmaD (NegD dst) (Binary (LoadD src1) src2)));
+  // MUL-SUB leaves the condition code (CC) unchanged.
+  ins_cost(ALU_MEMORY_COST);
+  size(6);
+  format %{ "MSDB    $dst, $src1, $src2" %}
+  ins_encode %{
+    // The assembler takes the memory factor last, so the factors are passed swapped (multiplication commutes).
+    // NOTE(review): the format string above lists the operands in match order, not in encoding order.
+    __ z_msdb($dst$$FloatRegister, $src2$$FloatRegister,
+              Address(reg_to_register_object($src1$$base), $src1$$index$$Register, $src1$$disp));
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
 //  DIV
 
 //  Div float single precision
--- a/hotspot/src/cpu/s390/vm/stubGenerator_s390.cpp	Tue Dec 20 13:02:16 2016 +0000
+++ b/hotspot/src/cpu/s390/vm/stubGenerator_s390.cpp	Tue Dec 20 14:55:18 2016 +0100
@@ -2038,15 +2038,15 @@
     generate_push_parmBlk(keylen, fCode, parmBlk, key, cv, true);
 
     // Prepare other registers for instruction.
-    // __ z_lgr(src, from);  // Not needed, registers are the same.
+    // __ z_lgr(src, from);     // Not needed, registers are the same.
     __ z_lgr(dst, to);
-    __ z_lgr(srclen, msglen);
-
-    __ kmc(dst, src);          // Decipher the message.
+    __ z_llgfr(srclen, msglen); // msglen is passed as an int, not as the long required here: zero-extend it from 32 to 64 bits.
+
+    __ kmc(dst, src);           // Decipher the message.
 
     generate_pop_parmBlk(keylen, parmBlk, key, cv);
 
-    __ z_lgr(Z_RET, msglen);
+    __ z_llgfr(Z_RET, msglen);  // Return value: msglen is an int, so zero-extend it from 32 to 64 bits as well.
     __ z_br(Z_R14);
 
     return __ addr_at(start_off);
--- a/hotspot/src/cpu/s390/vm/templateInterpreterGenerator_s390.cpp	Tue Dec 20 13:02:16 2016 +0000
+++ b/hotspot/src/cpu/s390/vm/templateInterpreterGenerator_s390.cpp	Tue Dec 20 14:55:18 2016 +0100
@@ -1297,36 +1297,96 @@
 // Math function, frame manager must set up an interpreter state, etc.
 address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKind kind) {
 
-  if (!InlineIntrinsics) { return NULL; } // Generate a vanilla entry.
-
-  // Only support absolute value and square root.
-  if (kind != Interpreter::java_lang_math_abs && kind != Interpreter::java_lang_math_sqrt) {
-    return NULL;
+  // Decide how to implement this entry: use the same platform-specific instructions and runtime calls as the compilers (C1, C2).
+  bool use_instruction = false;    // true: implement the method with a machine instruction.
+  address runtime_entry = NULL;    // Otherwise: SharedRuntime function to call (NULL if there is none).
+  int num_args = 1;                // Number of floating-point arguments of the method.
+  bool double_precision = true;    // false: arguments are floats.
+
+  // s390 specific: kinds that are implemented with an instruction.
+  switch (kind) {
+    case Interpreter::java_lang_math_sqrt:
+    case Interpreter::java_lang_math_abs:  use_instruction = true; break;
+    case Interpreter::java_lang_math_fmaF:
+    case Interpreter::java_lang_math_fmaD: use_instruction = UseFMA; break;
+    default: break; // Fall back to runtime call (or to the normal entry if there is no runtime function).
+  }
+
+  switch (kind) {
+    case Interpreter::java_lang_math_sin  : runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dsin);   break;
+    case Interpreter::java_lang_math_cos  : runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dcos);   break;
+    case Interpreter::java_lang_math_tan  : runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dtan);   break;
+    case Interpreter::java_lang_math_abs  : /* no runtime function; always handled by an instruction (see above) */ break;
+    case Interpreter::java_lang_math_sqrt : /* runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dsqrt); not available */ break;
+    case Interpreter::java_lang_math_log  : runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog);   break;
+    case Interpreter::java_lang_math_log10: runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10); break;
+    case Interpreter::java_lang_math_pow  : runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dpow); num_args = 2; break;
+    case Interpreter::java_lang_math_exp  : runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dexp);   break;
+    case Interpreter::java_lang_math_fmaF : /* no runtime function: instruction if UseFMA, else normal entry */ num_args = 3; double_precision = false; break;
+    case Interpreter::java_lang_math_fmaD : /* no runtime function: instruction if UseFMA, else normal entry */ num_args = 3; break;
+    default: ShouldNotReachHere();
   }
 
-  BLOCK_COMMENT("math_entry {");
-
-  address math_entry = __ pc();
-
-  if (kind == Interpreter::java_lang_math_abs) {
-    // Load operand from stack.
-    __ mem2freg_opt(Z_FRET, Address(Z_esp, Interpreter::stackElementSize));
-    __ z_lpdbr(Z_FRET);
+  // Use normal entry (signalled by returning NULL) if neither instruction nor runtime call is used.
+  if (!use_instruction && runtime_entry == NULL) return NULL;
+
+  address entry = __ pc();
+
+  if (use_instruction) {
+    switch (kind) {
+      case Interpreter::java_lang_math_sqrt:
+        // Can use memory operand directly.
+        __ z_sqdb(Z_FRET, Interpreter::stackElementSize, Z_esp);
+        break;
+      case Interpreter::java_lang_math_abs:
+        // Load operand from stack.
+        __ mem2freg_opt(Z_FRET, Address(Z_esp, Interpreter::stackElementSize));
+        __ z_lpdbr(Z_FRET);
+        break;
+      case Interpreter::java_lang_math_fmaF:
+        __ mem2freg_opt(Z_FRET,  Address(Z_esp,     Interpreter::stackElementSize)); // result reg = arg3 (addend); float args are one stack element apart
+        __ mem2freg_opt(Z_FARG2, Address(Z_esp, 3 * Interpreter::stackElementSize)); // arg1 (first factor)
+        __ z_maeb(Z_FRET, Z_FARG2, Address(Z_esp, 2 * Interpreter::stackElementSize)); // Z_FRET = arg1 * arg2 + arg3; arg2 is used directly from the stack.
+        break;
+      case Interpreter::java_lang_math_fmaD:
+        __ mem2freg_opt(Z_FRET,  Address(Z_esp,     Interpreter::stackElementSize)); // result reg = arg3 (addend); double args are two stack elements apart
+        __ mem2freg_opt(Z_FARG2, Address(Z_esp, 5 * Interpreter::stackElementSize)); // arg1 (first factor)
+        __ z_madb(Z_FRET, Z_FARG2, Address(Z_esp, 3 * Interpreter::stackElementSize)); // Z_FRET = arg1 * arg2 + arg3; arg2 is used directly from the stack.
+        break;
+      default: ShouldNotReachHere();
+    }
   } else {
-    // sqrt
-    // Can use memory operand directly.
-    __ z_sqdb(Z_FRET, Interpreter::stackElementSize, Z_esp);
+    // Load arguments from the stack into consecutive argument registers; the first argument has the highest offset.
+    assert(num_args <= 4, "passed in registers");
+    if (double_precision) {
+      int offset = (2 * num_args - 1) * Interpreter::stackElementSize; // Offset of the first argument; doubles are two stack elements apart.
+      for (int i = 0; i < num_args; ++i) {
+        __ mem2freg_opt(as_FloatRegister(Z_FARG1->encoding() + 2 * i), Address(Z_esp, offset)); // Register encodings of successive arguments step by 2.
+        offset -= 2 * Interpreter::stackElementSize;
+      }
+    } else { // Floats are one stack element apart. NOTE(review): no float kind currently has a runtime_entry, so this path looks unused for now.
+      int offset = num_args * Interpreter::stackElementSize;
+      for (int i = 0; i < num_args; ++i) {
+        __ mem2freg_opt(as_FloatRegister(Z_FARG1->encoding() + 2 * i), Address(Z_esp, offset));
+        offset -= Interpreter::stackElementSize;
+      }
+    }
+    // Call runtime
+    __ save_return_pc();       // Save Z_R14.
+    __ push_frame_abi160(0);   // Without new frame the RT call could overwrite the saved Z_R14.
+
+    __ call_VM_leaf(runtime_entry);
+
+    __ pop_frame();
+    __ restore_return_pc();    // Restore Z_R14.
   }
 
-  // Restore caller sp for c2i case.
+  // Pop c2i arguments (if any) off when we return.
   __ resize_frame_absolute(Z_R10, Z_R0, true); // Cut the stack back to where the caller started.
 
-  // We are done, return.
   __ z_br(Z_R14);
 
-  BLOCK_COMMENT("} math_entry");
-
-  return math_entry;
+  return entry;
 }
 
 // Interpreter stub for calling a native method. (asm interpreter).
--- a/hotspot/src/cpu/s390/vm/vm_version_s390.cpp	Tue Dec 20 13:02:16 2016 +0000
+++ b/hotspot/src/cpu/s390/vm/vm_version_s390.cpp	Tue Dec 20 14:55:18 2016 +0100
@@ -155,9 +155,8 @@
     FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
   }
 
-  if (UseFMA) {
-    warning("FMA instructions are not available on this CPU");
-    FLAG_SET_DEFAULT(UseFMA, false);
+  if (FLAG_IS_DEFAULT(UseFMA)) { // Only touch the flag while it still has its default value.
+    FLAG_SET_DEFAULT(UseFMA, true); // FMA is now implemented on s390, so turn it on by default.
   }
 
   // On z/Architecture, we take UseSHA as the general switch to enable/disable the SHA intrinsics.