7133857: exp() and pow() should use the x87 ISA on x86
authorroland
Tue, 15 May 2012 10:10:23 +0200
changeset 12739 09f26b73ae66
parent 12623 09fcb0dc71ad
child 12740 fa38bd69a34b
7133857: exp() and pow() should use the x87 ISA on x86 Summary: use x87 instructions to implement exp() and pow() in interpreter/c1/c2. Reviewed-by: kvn, never, twisti
hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp
hotspot/src/cpu/sparc/vm/interpreter_sparc.cpp
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
hotspot/src/cpu/x86/vm/c1_LinearScan_x86.cpp
hotspot/src/cpu/x86/vm/interpreter_x86_32.cpp
hotspot/src/cpu/x86/vm/interpreter_x86_64.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/c1/c1_GraphBuilder.cpp
hotspot/src/share/vm/c1/c1_LIR.cpp
hotspot/src/share/vm/c1/c1_LIR.hpp
hotspot/src/share/vm/c1/c1_LIRAssembler.cpp
hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
hotspot/src/share/vm/c1/c1_LinearScan.cpp
hotspot/src/share/vm/interpreter/abstractInterpreter.hpp
hotspot/src/share/vm/interpreter/interpreter.cpp
hotspot/src/share/vm/interpreter/templateInterpreter.cpp
hotspot/src/share/vm/opto/library_call.cpp
hotspot/src/share/vm/opto/subnode.cpp
--- a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp	Tue May 15 10:10:23 2012 +0200
@@ -738,7 +738,8 @@
     case vmIntrinsics::_dlog: // fall through
     case vmIntrinsics::_dsin: // fall through
     case vmIntrinsics::_dtan: // fall through
-    case vmIntrinsics::_dcos: {
+    case vmIntrinsics::_dcos: // fall through
+    case vmIntrinsics::_dexp: {
       assert(x->number_of_arguments() == 1, "wrong type");
 
       address runtime_entry = NULL;
@@ -758,12 +759,23 @@
       case vmIntrinsics::_dlog10:
         runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10);
         break;
+      case vmIntrinsics::_dexp:
+        runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dexp);
+        break;
       default:
         ShouldNotReachHere();
       }
 
       LIR_Opr result = call_runtime(x->argument_at(0), runtime_entry, x->type(), NULL);
       set_result(x, result);
+      break;
+    }
+    case vmIntrinsics::_dpow: {
+      assert(x->number_of_arguments() == 2, "wrong type");
+      address runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dpow);
+      LIR_Opr result = call_runtime(x->argument_at(0), x->argument_at(1), runtime_entry, x->type(), NULL);
+      set_result(x, result);
+      break;
     }
   }
 }
--- a/hotspot/src/cpu/sparc/vm/interpreter_sparc.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/sparc/vm/interpreter_sparc.cpp	Tue May 15 10:10:23 2012 +0200
@@ -403,6 +403,8 @@
     case Interpreter::java_lang_math_abs     :                                                                             break;
     case Interpreter::java_lang_math_log     :                                                                             break;
     case Interpreter::java_lang_math_log10   :                                                                             break;
+    case Interpreter::java_lang_math_pow     :                                                                             break;
+    case Interpreter::java_lang_math_exp     :                                                                             break;
     case Interpreter::java_lang_ref_reference_get
                                              : entry_point = ((InterpreterGenerator*)this)->generate_Reference_get_entry(); break;
     default                                  : ShouldNotReachHere();                                                       break;
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Tue May 15 10:10:23 2012 +0200
@@ -3578,6 +3578,21 @@
   emit_byte(0xF1);
 }
 
+void Assembler::frndint() {
+  emit_byte(0xD9);
+  emit_byte(0xFC);
+}
+
+void Assembler::f2xm1() {
+  emit_byte(0xD9);
+  emit_byte(0xF0);
+}
+
+void Assembler::fldl2e() {
+  emit_byte(0xD9);
+  emit_byte(0xEA);
+}
+
 // SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
 static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
 // SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
@@ -6868,6 +6883,242 @@
   Assembler::fldcw(as_Address(src));
 }
 
+void MacroAssembler::pow_exp_core_encoding() {
+  // kills rax, rcx, rdx
+  subptr(rsp,sizeof(jdouble));
+  // computes 2^X. Stack: X ...
+  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
+  // keep it on the thread's stack to compute 2^int(X) later
+  // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
+  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
+  fld_s(0);                 // Stack: X X ...
+  frndint();                // Stack: int(X) X ...
+  fsuba(1);                 // Stack: int(X) X-int(X) ...
+  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
+  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
+  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
+  faddp(1);                 // Stack: 2^(X-int(X))
+  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
+  // shift int(X)+1023 to exponent position.
+  // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
+  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
+  // values so detect them and set result to NaN.
+  movl(rax,Address(rsp,0));
+  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
+  addl(rax, 1023);
+  movl(rdx,rax);
+  shll(rax,20);
+  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
+  addl(rdx,1);
+  // Check that 1 < int(X)+1023+1 < 2048
+  // in 3 steps:
+  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
+  // 2- (int(X)+1023+1)&-2048 != 0
+  // 3- (int(X)+1023+1)&-2048 != 1
+  // Do 2- first because addl just updated the flags.
+  cmov32(Assembler::equal,rax,rcx);
+  cmpl(rdx,1);
+  cmov32(Assembler::equal,rax,rcx);
+  testl(rdx,rcx);
+  cmov32(Assembler::notEqual,rax,rcx);
+  movl(Address(rsp,4),rax);
+  movl(Address(rsp,0),0);
+  fmul_d(Address(rsp,0));   // Stack: 2^X ...
+  addptr(rsp,sizeof(jdouble));
+}
+
+void MacroAssembler::fast_pow() {
+  // computes X^Y = 2^(Y * log2(X))
+  // if fast computation is not possible, result is NaN. Requires
+  // fallback from user of this macro.
+  fyl2x();                 // Stack: (Y*log2(X)) ...
+  pow_exp_core_encoding(); // Stack: exp(X) ...
+}
+
+void MacroAssembler::fast_exp() {
+  // computes exp(X) = 2^(X * log2(e))
+  // if fast computation is not possible, result is NaN. Requires
+  // fallback from user of this macro.
+  fldl2e();                // Stack: log2(e) X ...
+  fmulp(1);                // Stack: (X*log2(e)) ...
+  pow_exp_core_encoding(); // Stack: exp(X) ...
+}
+
+void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
+  // kills rax, rcx, rdx
+  // pow and exp needs 2 extra registers on the fpu stack.
+  Label slow_case, done;
+  Register tmp = noreg;
+  if (!VM_Version::supports_cmov()) {
+    // fcmp needs a temporary so preserve rdx,
+    tmp = rdx;
+  }
+  Register tmp2 = rax;
+  NOT_LP64(Register tmp3 = rcx;)
+
+  if (is_exp) {
+    // Stack: X
+    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
+    fast_exp();                 // Stack: exp(X) X
+    fcmp(tmp, 0, false, false); // Stack: exp(X) X
+    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
+    jcc(Assembler::parity, slow_case);
+    // get rid of duplicate argument. Stack: exp(X)
+    if (num_fpu_regs_in_use > 0) {
+      fxch();
+      fpop();
+    } else {
+      ffree(1);
+    }
+    jmp(done);
+  } else {
+    // Stack: X Y
+    Label x_negative, y_odd;
+
+    fldz();                     // Stack: 0 X Y
+    fcmp(tmp, 1, true, false);  // Stack: X Y
+    jcc(Assembler::above, x_negative);
+
+    // X >= 0
+
+    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
+    fld_s(1);                   // Stack: X Y X Y
+    fast_pow();                 // Stack: X^Y X Y
+    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
+    // X^Y not equal to itself: X^Y is NaN go to slow case.
+    jcc(Assembler::parity, slow_case);
+    // get rid of duplicate arguments. Stack: X^Y
+    if (num_fpu_regs_in_use > 0) {
+      fxch(); fpop();
+      fxch(); fpop();
+    } else {
+      ffree(2);
+      ffree(1);
+    }
+    jmp(done);
+
+    // X <= 0
+    bind(x_negative);
+
+    fld_s(1);                   // Stack: Y X Y
+    frndint();                  // Stack: int(Y) X Y
+    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
+    jcc(Assembler::notEqual, slow_case);
+
+    subptr(rsp, 8);
+
+    // For X^Y, when X < 0, Y has to be an integer and the final
+    // result depends on whether it's odd or even. We just checked
+    // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
+    // integer to test its parity. If int(Y) is huge and doesn't fit
+    // in the 64 bit integer range, the integer indefinite value will
+    // end up in the gp registers. Huge numbers are all even, the
+    // integer indefinite number is even so it's fine.
+
+#ifdef ASSERT
+    // Let's check we don't end up with an integer indefinite number
+    // when not expected. First test for huge numbers: check whether
+    // int(Y)+1 == int(Y) which is true for very large numbers and
+    // those are all even. A 64 bit integer is guaranteed to not
+    // overflow for numbers where y+1 != y (when precision is set to
+    // double precision).
+    Label y_not_huge;
+
+    fld1();                     // Stack: 1 int(Y) X Y
+    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
+
+#ifdef _LP64
+    // trip to memory to force the precision down from double extended
+    // precision
+    fstp_d(Address(rsp, 0));
+    fld_d(Address(rsp, 0));
+#endif
+
+    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
+#endif
+
+    // move int(Y) as 64 bit integer to thread's stack
+    fistp_d(Address(rsp,0));    // Stack: X Y
+
+#ifdef ASSERT
+    jcc(Assembler::notEqual, y_not_huge);
+
+    // Y is huge so we know it's even. It may not fit in a 64 bit
+    // integer and we don't want the debug code below to see the
+    // integer indefinite value so overwrite int(Y) on the thread's
+    // stack with 0.
+    movl(Address(rsp, 0), 0);
+    movl(Address(rsp, 4), 0);
+
+    bind(y_not_huge);
+#endif
+
+    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
+    fld_s(1);                   // Stack: X Y X Y
+    fabs();                     // Stack: abs(X) Y X Y
+    fast_pow();                 // Stack: abs(X)^Y X Y
+    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
+    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.
+
+    pop(tmp2);
+    NOT_LP64(pop(tmp3));
+    jcc(Assembler::parity, slow_case);
+
+#ifdef ASSERT
+    // Check that int(Y) is not integer indefinite value (int
+    // overflow). Shouldn't happen because for values that would
+    // overflow, 1+int(Y)==Y which was tested earlier.
+#ifndef _LP64
+    {
+      Label integer;
+      testl(tmp2, tmp2);
+      jcc(Assembler::notZero, integer);
+      cmpl(tmp3, 0x80000000);
+      jcc(Assembler::notZero, integer);
+      stop("integer indefinite value shouldn't be seen here");
+      bind(integer);
+    }
+#else
+    {
+      Label integer;
+      shlq(tmp2, 1);
+      jcc(Assembler::carryClear, integer);
+      jcc(Assembler::notZero, integer);
+      stop("integer indefinite value shouldn't be seen here");
+      bind(integer);
+    }
+#endif
+#endif
+
+    // get rid of duplicate arguments. Stack: X^Y
+    if (num_fpu_regs_in_use > 0) {
+      fxch(); fpop();
+      fxch(); fpop();
+    } else {
+      ffree(2);
+      ffree(1);
+    }
+
+    testl(tmp2, 1);
+    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
+    // X <= 0, Y even: X^Y = -abs(X)^Y
+
+    fchs();                     // Stack: -abs(X)^Y Y
+    jmp(done);
+  }
+
+  // slow case: runtime call
+  bind(slow_case);
+
+  fpop();                       // pop incorrect result or int(Y)
+
+  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
+                      is_exp ? 1 : 2, num_fpu_regs_in_use);
+
+  // Come here with result in F-TOS
+  bind(done);
+}
+
 void MacroAssembler::fpop() {
   ffree();
   fincstp();
@@ -8045,6 +8296,144 @@
 #endif
 }
 
+void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
+  pusha();
+
+  // if we are coming from c1, xmm registers may be live
+  if (UseSSE >= 1) {
+    subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+  }
+  int off = 0;
+  if (UseSSE == 1)  {
+    movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
+    movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
+    movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
+    movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
+    movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
+    movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
+    movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
+    movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
+  } else if (UseSSE >= 2)  {
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
+#ifdef _LP64
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
+    movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
+#endif
+  }
+
+  // Preserve registers across runtime call
+  int incoming_argument_and_return_value_offset = -1;
+  if (num_fpu_regs_in_use > 1) {
+    // Must preserve all other FPU regs (could alternatively convert
+    // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
+    // FPU state, but can not trust C compiler)
+    NEEDS_CLEANUP;
+    // NOTE that in this case we also push the incoming argument(s) to
+    // the stack and restore it later; we also use this stack slot to
+    // hold the return value from dsin, dcos etc.
+    for (int i = 0; i < num_fpu_regs_in_use; i++) {
+      subptr(rsp, sizeof(jdouble));
+      fstp_d(Address(rsp, 0));
+    }
+    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
+    for (int i = nb_args-1; i >= 0; i--) {
+      fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
+    }
+  }
+
+  subptr(rsp, nb_args*sizeof(jdouble));
+  for (int i = 0; i < nb_args; i++) {
+    fstp_d(Address(rsp, i*sizeof(jdouble)));
+  }
+
+#ifdef _LP64
+  if (nb_args > 0) {
+    movdbl(xmm0, Address(rsp, 0));
+  }
+  if (nb_args > 1) {
+    movdbl(xmm1, Address(rsp, sizeof(jdouble)));
+  }
+  assert(nb_args <= 2, "unsupported number of args");
+#endif // _LP64
+
+  // NOTE: we must not use call_VM_leaf here because that requires a
+  // complete interpreter frame in debug mode -- same bug as 4387334
+  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
+  // do proper 64bit abi
+
+  NEEDS_CLEANUP;
+  // Need to add stack banging before this runtime call if it needs to
+  // be taken; however, there is no generic stack banging routine at
+  // the MacroAssembler level
+
+  MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
+
+#ifdef _LP64
+  movsd(Address(rsp, 0), xmm0);
+  fld_d(Address(rsp, 0));
+#endif // _LP64
+  addptr(rsp, sizeof(jdouble) * nb_args);
+  if (num_fpu_regs_in_use > 1) {
+    // Must save return value to stack and then restore entire FPU
+    // stack except incoming arguments
+    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
+    for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
+      fld_d(Address(rsp, 0));
+      addptr(rsp, sizeof(jdouble));
+    }
+    fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
+    addptr(rsp, sizeof(jdouble) * nb_args);
+  }
+
+  off = 0;
+  if (UseSSE == 1)  {
+    movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
+    movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
+    movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
+    movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
+    movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
+    movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
+    movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
+    movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
+  } else if (UseSSE >= 2)  {
+    movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
+#ifdef _LP64
+    movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
+    movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
+#endif
+  }
+  if (UseSSE >= 1) {
+    addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+  }
+  popa();
+}
+
 static const double     pi_4 =  0.7853981633974483;
 
 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
@@ -8092,73 +8481,27 @@
 
   // slow case: runtime call
   bind(slow_case);
-  // Preserve registers across runtime call
-  pusha();
-  int incoming_argument_and_return_value_offset = -1;
-  if (num_fpu_regs_in_use > 1) {
-    // Must preserve all other FPU regs (could alternatively convert
-    // SharedRuntime::dsin and dcos into assembly routines known not to trash
-    // FPU state, but can not trust C compiler)
-    NEEDS_CLEANUP;
-    // NOTE that in this case we also push the incoming argument to
-    // the stack and restore it later; we also use this stack slot to
-    // hold the return value from dsin or dcos.
-    for (int i = 0; i < num_fpu_regs_in_use; i++) {
-      subptr(rsp, sizeof(jdouble));
-      fstp_d(Address(rsp, 0));
-    }
-    incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
-    fld_d(Address(rsp, incoming_argument_and_return_value_offset));
-  }
-  subptr(rsp, sizeof(jdouble));
-  fstp_d(Address(rsp, 0));
-#ifdef _LP64
-  movdbl(xmm0, Address(rsp, 0));
-#endif // _LP64
-
-  // NOTE: we must not use call_VM_leaf here because that requires a
-  // complete interpreter frame in debug mode -- same bug as 4387334
-  // MacroAssembler::call_VM_leaf_base is perfectly safe and will
-  // do proper 64bit abi
-
-  NEEDS_CLEANUP;
-  // Need to add stack banging before this runtime call if it needs to
-  // be taken; however, there is no generic stack banging routine at
-  // the MacroAssembler level
+
   switch(trig) {
   case 's':
     {
-      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0);
+      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
     }
     break;
   case 'c':
     {
-      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0);
+      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
     }
     break;
   case 't':
     {
-      MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0);
+      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
     }
     break;
   default:
     assert(false, "bad intrinsic");
     break;
   }
-#ifdef _LP64
-    movsd(Address(rsp, 0), xmm0);
-    fld_d(Address(rsp, 0));
-#endif // _LP64
-  addptr(rsp, sizeof(jdouble));
-  if (num_fpu_regs_in_use > 1) {
-    // Must save return value to stack and then restore entire FPU stack
-    fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
-    for (int i = 0; i < num_fpu_regs_in_use; i++) {
-      fld_d(Address(rsp, 0));
-      addptr(rsp, sizeof(jdouble));
-    }
-  }
-  popa();
 
   // Come here with result in F-TOS
   bind(done);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Tue May 15 10:10:23 2012 +0200
@@ -1148,6 +1148,9 @@
   void fxsave(Address dst);
 
   void fyl2x();
+  void frndint();
+  void f2xm1();
+  void fldl2e();
 
   void hlt();
 
@@ -2387,7 +2390,28 @@
   void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
   void ldmxcsr(AddressLiteral src);
 
+  // compute pow(x,y) and exp(x) with x86 instructions. Don't cover
+  // all corner cases and may result in NaN and require fallback to a
+  // runtime call.
+  void fast_pow();
+  void fast_exp();
+
+  // computes exp(x). Fallback to runtime call included.
+  void exp_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(true, num_fpu_regs_in_use); }
+  // computes pow(x,y). Fallback to runtime call included.
+  void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(false, num_fpu_regs_in_use); }
+
 private:
+
+  // call runtime as a fallback for trig functions and pow/exp.
+  void fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use);
+
+  // computes 2^(Ylog2X); Ylog2X in ST(0)
+  void pow_exp_core_encoding();
+
+  // computes pow(x,y) or exp(x). Fallback to runtime call included.
+  void pow_or_exp(bool is_exp, int num_fpu_regs_in_use);
+
   // these are private because users should be doing movflt/movdbl
 
   void movss(Address dst, XMMRegister src)     { Assembler::movss(dst, src); }
--- a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Tue May 15 10:10:23 2012 +0200
@@ -2446,6 +2446,12 @@
         // Should consider not saving rbx, if not necessary
         __ trigfunc('t', op->as_Op2()->fpu_stack_size());
         break;
+      case lir_exp :
+        __ exp_with_fallback(op->as_Op2()->fpu_stack_size());
+        break;
+      case lir_pow :
+        __ pow_with_fallback(op->as_Op2()->fpu_stack_size());
+        break;
       default      : ShouldNotReachHere();
     }
   } else {
--- a/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp	Tue May 15 10:10:23 2012 +0200
@@ -823,7 +823,7 @@
 
 
 void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
-  assert(x->number_of_arguments() == 1, "wrong type");
+  assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
   LIRItem value(x->argument_at(0), this);
 
   bool use_fpu = false;
@@ -834,6 +834,8 @@
       case vmIntrinsics::_dtan:
       case vmIntrinsics::_dlog:
       case vmIntrinsics::_dlog10:
+      case vmIntrinsics::_dexp:
+      case vmIntrinsics::_dpow:
         use_fpu = true;
     }
   } else {
@@ -843,20 +845,37 @@
   value.load_item();
 
   LIR_Opr calc_input = value.result();
+  LIR_Opr calc_input2 = NULL;
+  if (x->id() == vmIntrinsics::_dpow) {
+    LIRItem extra_arg(x->argument_at(1), this);
+    if (UseSSE < 2) {
+      extra_arg.set_destroys_register();
+    }
+    extra_arg.load_item();
+    calc_input2 = extra_arg.result();
+  }
   LIR_Opr calc_result = rlock_result(x);
 
-  // sin and cos need two free fpu stack slots, so register two temporary operands
+  // sin, cos, pow and exp need two free fpu stack slots, so register
+  // two temporary operands
   LIR_Opr tmp1 = FrameMap::caller_save_fpu_reg_at(0);
   LIR_Opr tmp2 = FrameMap::caller_save_fpu_reg_at(1);
 
   if (use_fpu) {
     LIR_Opr tmp = FrameMap::fpu0_double_opr;
+    int tmp_start = 1;
+    if (calc_input2 != NULL) {
+      __ move(calc_input2, tmp);
+      tmp_start = 2;
+      calc_input2 = tmp;
+    }
     __ move(calc_input, tmp);
 
     calc_input = tmp;
     calc_result = tmp;
-    tmp1 = FrameMap::caller_save_fpu_reg_at(1);
-    tmp2 = FrameMap::caller_save_fpu_reg_at(2);
+
+    tmp1 = FrameMap::caller_save_fpu_reg_at(tmp_start);
+    tmp2 = FrameMap::caller_save_fpu_reg_at(tmp_start + 1);
   }
 
   switch(x->id()) {
@@ -867,6 +886,8 @@
     case vmIntrinsics::_dtan:   __ tan  (calc_input, calc_result, tmp1, tmp2);              break;
     case vmIntrinsics::_dlog:   __ log  (calc_input, calc_result, tmp1);                    break;
     case vmIntrinsics::_dlog10: __ log10(calc_input, calc_result, tmp1);                    break;
+    case vmIntrinsics::_dexp:   __ exp  (calc_input, calc_result,              tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
+    case vmIntrinsics::_dpow:   __ pow  (calc_input, calc_input2, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
     default:                    ShouldNotReachHere();
   }
 
--- a/hotspot/src/cpu/x86/vm/c1_LinearScan_x86.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_LinearScan_x86.cpp	Tue May 15 10:10:23 2012 +0200
@@ -690,8 +690,8 @@
 
     case lir_mul_strictfp:
     case lir_div_strictfp: {
-      assert(op2->tmp_opr()->is_fpu_register(), "strict operations need temporary fpu stack slot");
-      insert_free_if_dead(op2->tmp_opr());
+      assert(op2->tmp1_opr()->is_fpu_register(), "strict operations need temporary fpu stack slot");
+      insert_free_if_dead(op2->tmp1_opr());
       assert(sim()->stack_size() <= 7, "at least one stack slot must be free");
       // fall-through: continue with the normal handling of lir_mul and lir_div
     }
@@ -787,16 +787,17 @@
 
     case lir_log:
     case lir_log10: {
-      // log and log10 needs one temporary fpu stack slot, so there is ontemporary
-      // registers stored in temp of the operation.
-      // the stack allocator must guarantee that the stack slots are really free,
-      // otherwise there might be a stack overflow.
+      // log and log10 need one temporary fpu stack slot, so
+      // there is one temporary registers stored in temp of the
+      // operation. the stack allocator must guarantee that the stack
+      // slots are really free, otherwise there might be a stack
+      // overflow.
       assert(right->is_illegal(), "must be");
       assert(left->is_fpu_register(), "must be");
       assert(res->is_fpu_register(), "must be");
-      assert(op2->tmp_opr()->is_fpu_register(), "must be");
+      assert(op2->tmp1_opr()->is_fpu_register(), "must be");
 
-      insert_free_if_dead(op2->tmp_opr());
+      insert_free_if_dead(op2->tmp1_opr());
       insert_free_if_dead(res, left);
       insert_exchange(left);
       do_rename(left, res);
@@ -812,8 +813,9 @@
 
     case lir_tan:
     case lir_sin:
-    case lir_cos: {
-      // sin and cos need two temporary fpu stack slots, so there are two temporary
+    case lir_cos:
+    case lir_exp: {
+      // sin, cos and exp need two temporary fpu stack slots, so there are two temporary
       // registers (stored in right and temp of the operation).
       // the stack allocator must guarantee that the stack slots are really free,
       // otherwise there might be a stack overflow.
@@ -821,11 +823,11 @@
       assert(res->is_fpu_register(), "must be");
       // assert(left->is_last_use(), "old value gets destroyed");
       assert(right->is_fpu_register(), "right is used as the first temporary register");
-      assert(op2->tmp_opr()->is_fpu_register(), "temp is used as the second temporary register");
-      assert(fpu_num(left) != fpu_num(right) && fpu_num(right) != fpu_num(op2->tmp_opr()) && fpu_num(op2->tmp_opr()) != fpu_num(res), "need distinct temp registers");
+      assert(op2->tmp1_opr()->is_fpu_register(), "temp is used as the second temporary register");
+      assert(fpu_num(left) != fpu_num(right) && fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers");
 
       insert_free_if_dead(right);
-      insert_free_if_dead(op2->tmp_opr());
+      insert_free_if_dead(op2->tmp1_opr());
 
       insert_free_if_dead(res, left);
       insert_exchange(left);
@@ -839,6 +841,53 @@
       break;
     }
 
+    case lir_pow: {
+      // pow needs two temporary fpu stack slots, so there are two temporary
+      // registers (stored in tmp1 and tmp2 of the operation).
+      // the stack allocator must guarantee that the stack slots are really free,
+      // otherwise there might be a stack overflow.
+      assert(left->is_fpu_register(), "must be");
+      assert(right->is_fpu_register(), "must be");
+      assert(res->is_fpu_register(), "must be");
+
+      assert(op2->tmp1_opr()->is_fpu_register(), "tmp1 is the first temporary register");
+      assert(op2->tmp2_opr()->is_fpu_register(), "tmp2 is the second temporary register");
+      assert(fpu_num(left) != fpu_num(right) && fpu_num(left) != fpu_num(op2->tmp1_opr()) && fpu_num(left) != fpu_num(op2->tmp2_opr()) && fpu_num(left) != fpu_num(res), "need distinct temp registers");
+      assert(fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(right) != fpu_num(op2->tmp2_opr()) && fpu_num(right) != fpu_num(res), "need distinct temp registers");
+      assert(fpu_num(op2->tmp1_opr()) != fpu_num(op2->tmp2_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers");
+      assert(fpu_num(op2->tmp2_opr()) != fpu_num(res), "need distinct temp registers");
+
+      insert_free_if_dead(op2->tmp1_opr());
+      insert_free_if_dead(op2->tmp2_opr());
+
+      // Must bring both operands to top of stack with following operand ordering:
+      // * fpu stack before pow: ... right left
+      // * fpu stack after pow:  ... left
+
+      insert_free_if_dead(res, right);
+
+      if (tos_offset(right) != 1) {
+        insert_exchange(right);
+        insert_exchange(1);
+      }
+      insert_exchange(left);
+      assert(tos_offset(right) == 1, "check");
+      assert(tos_offset(left) == 0, "check");
+
+      new_left = to_fpu_stack_top(left);
+      new_right = to_fpu_stack(right);
+
+      op2->set_fpu_stack_size(sim()->stack_size());
+      assert(sim()->stack_size() <= 6, "at least two stack slots must be free");
+
+      sim()->pop();
+
+      do_rename(right, res);
+
+      new_res = to_fpu_stack_top(res);
+      break;
+    }
+
     default: {
       assert(false, "missed a fpu-operation");
     }
--- a/hotspot/src/cpu/x86/vm/interpreter_x86_32.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/interpreter_x86_32.cpp	Tue May 15 10:10:23 2012 +0200
@@ -181,6 +181,19 @@
         __ push_fTOS();
         __ pop_fTOS();
         break;
+    case Interpreter::java_lang_math_pow:
+      __ fld_d(Address(rsp, 3*wordSize)); // second argument
+      __ pow_with_fallback(0);
+      // Store to stack to convert 80bit precision back to 64bits
+      __ push_fTOS();
+      __ pop_fTOS();
+      break;
+    case Interpreter::java_lang_math_exp:
+      __ exp_with_fallback(0);
+      // Store to stack to convert 80bit precision back to 64bits
+      __ push_fTOS();
+      __ pop_fTOS();
+      break;
     default                              :
         ShouldNotReachHere();
   }
--- a/hotspot/src/cpu/x86/vm/interpreter_x86_64.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/interpreter_x86_64.cpp	Tue May 15 10:10:23 2012 +0200
@@ -271,6 +271,14 @@
       case Interpreter::java_lang_math_log10:
           __ flog10();
           break;
+      case Interpreter::java_lang_math_pow:
+          __ fld_d(Address(rsp, 3*wordSize)); // second argument (one
+                                              // empty stack slot)
+          __ pow_with_fallback(0);
+          break;
+      case Interpreter::java_lang_math_exp:
+          __ exp_with_fallback(0);
+           break;
       default                              :
           ShouldNotReachHere();
     }
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Tue May 15 10:10:23 2012 +0200
@@ -2136,11 +2136,23 @@
       __ trigfunc('t');
       __ ret(0);
     }
+    {
+      StubCodeMark mark(this, "StubRoutines", "exp");
+      StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
 
-    // The intrinsic version of these seem to return the same value as
-    // the strict version.
-    StubRoutines::_intrinsic_exp = SharedRuntime::dexp;
-    StubRoutines::_intrinsic_pow = SharedRuntime::dpow;
+      __ fld_d(Address(rsp, 4));
+      __ exp_with_fallback(0);
+      __ ret(0);
+    }
+    {
+      StubCodeMark mark(this, "StubRoutines", "pow");
+      StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
+
+      __ fld_d(Address(rsp, 12));
+      __ fld_d(Address(rsp, 4));
+      __ pow_with_fallback(0);
+      __ ret(0);
+    }
   }
 
  public:
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Tue May 15 10:10:23 2012 +0200
@@ -2928,11 +2928,34 @@
       __ addq(rsp, 8);
       __ ret(0);
     }
-
-    // The intrinsic version of these seem to return the same value as
-    // the strict version.
-    StubRoutines::_intrinsic_exp = SharedRuntime::dexp;
-    StubRoutines::_intrinsic_pow = SharedRuntime::dpow;
+    {
+      StubCodeMark mark(this, "StubRoutines", "exp");
+      StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
+
+      __ subq(rsp, 8);
+      __ movdbl(Address(rsp, 0), xmm0);
+      __ fld_d(Address(rsp, 0));
+      __ exp_with_fallback(0);
+      __ fstp_d(Address(rsp, 0));
+      __ movdbl(xmm0, Address(rsp, 0));
+      __ addq(rsp, 8);
+      __ ret(0);
+    }
+    {
+      StubCodeMark mark(this, "StubRoutines", "pow");
+      StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
+
+      __ subq(rsp, 8);
+      __ movdbl(Address(rsp, 0), xmm1);
+      __ fld_d(Address(rsp, 0));
+      __ movdbl(Address(rsp, 0), xmm0);
+      __ fld_d(Address(rsp, 0));
+      __ pow_with_fallback(0);
+      __ fstp_d(Address(rsp, 0));
+      __ movdbl(xmm0, Address(rsp, 0));
+      __ addq(rsp, 8);
+      __ ret(0);
+    }
   }
 
 #undef __
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp	Tue May 15 10:10:23 2012 +0200
@@ -1518,7 +1518,9 @@
     case Interpreter::java_lang_math_abs     : // fall thru
     case Interpreter::java_lang_math_log     : // fall thru
     case Interpreter::java_lang_math_log10   : // fall thru
-    case Interpreter::java_lang_math_sqrt    : entry_point = ((InterpreterGenerator*)this)->generate_math_entry(kind);     break;
+    case Interpreter::java_lang_math_sqrt    : // fall thru
+    case Interpreter::java_lang_math_pow     : // fall thru
+    case Interpreter::java_lang_math_exp     : entry_point = ((InterpreterGenerator*)this)->generate_math_entry(kind);     break;
     case Interpreter::java_lang_ref_reference_get
                                              : entry_point = ((InterpreterGenerator*)this)->generate_Reference_get_entry(); break;
     default                                  : ShouldNotReachHere();                                                       break;
@@ -1540,7 +1542,9 @@
     case Interpreter::java_lang_math_abs     : // fall thru
     case Interpreter::java_lang_math_log     : // fall thru
     case Interpreter::java_lang_math_log10   : // fall thru
-    case Interpreter::java_lang_math_sqrt    :
+    case Interpreter::java_lang_math_sqrt    : // fall thru
+    case Interpreter::java_lang_math_pow     : // fall thru
+    case Interpreter::java_lang_math_exp     :
       return false;
     default:
       return true;
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp	Tue May 15 10:10:23 2012 +0200
@@ -1534,7 +1534,9 @@
   case Interpreter::java_lang_math_abs     : // fall thru
   case Interpreter::java_lang_math_log     : // fall thru
   case Interpreter::java_lang_math_log10   : // fall thru
-  case Interpreter::java_lang_math_sqrt    : entry_point = ((InterpreterGenerator*) this)->generate_math_entry(kind);    break;
+  case Interpreter::java_lang_math_sqrt    : // fall thru
+  case Interpreter::java_lang_math_pow     : // fall thru
+  case Interpreter::java_lang_math_exp     : entry_point = ((InterpreterGenerator*) this)->generate_math_entry(kind);    break;
   case Interpreter::java_lang_ref_reference_get
                                            : entry_point = ((InterpreterGenerator*)this)->generate_Reference_get_entry(); break;
   default                                  : ShouldNotReachHere();                                                       break;
@@ -1558,7 +1560,9 @@
     case Interpreter::java_lang_math_abs     : // fall thru
     case Interpreter::java_lang_math_log     : // fall thru
     case Interpreter::java_lang_math_log10   : // fall thru
-    case Interpreter::java_lang_math_sqrt    :
+    case Interpreter::java_lang_math_sqrt    : // fall thru
+    case Interpreter::java_lang_math_pow     : // fall thru
+    case Interpreter::java_lang_math_exp     :
       return false;
     default:
       return true;
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Tue May 15 10:10:23 2012 +0200
@@ -2536,45 +2536,6 @@
     __ fld_d(Address(rsp, 0));
   %}
 
-  // Compute X^Y using Intel's fast hardware instructions, if possible.
-  // Otherwise return a NaN.
-  enc_class pow_exp_core_encoding %{
-    // FPR1 holds Y*ln2(X).  Compute FPR1 = 2^(Y*ln2(X))
-    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xC0);  // fdup = fld st(0)          Q       Q
-    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xFC);  // frndint               int(Q)      Q
-    emit_opcode(cbuf,0xDC); emit_opcode(cbuf,0xE9);  // fsub st(1) -= st(0);  int(Q) frac(Q)
-    emit_opcode(cbuf,0xDB);                          // FISTP [ESP]           frac(Q)
-    emit_opcode(cbuf,0x1C);
-    emit_d8(cbuf,0x24);
-    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xF0);  // f2xm1                 2^frac(Q)-1
-    emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xE8);  // fld1                  1 2^frac(Q)-1
-    emit_opcode(cbuf,0xDE); emit_opcode(cbuf,0xC1);  // faddp                 2^frac(Q)
-    emit_opcode(cbuf,0x8B);                          // mov rax,[esp+0]=int(Q)
-    encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 0, false);
-    emit_opcode(cbuf,0xC7);                          // mov rcx,0xFFFFF800 - overflow mask
-    emit_rm(cbuf, 0x3, 0x0, ECX_enc);
-    emit_d32(cbuf,0xFFFFF800);
-    emit_opcode(cbuf,0x81);                          // add rax,1023 - the double exponent bias
-    emit_rm(cbuf, 0x3, 0x0, EAX_enc);
-    emit_d32(cbuf,1023);
-    emit_opcode(cbuf,0x8B);                          // mov rbx,eax
-    emit_rm(cbuf, 0x3, EBX_enc, EAX_enc);
-    emit_opcode(cbuf,0xC1);                          // shl rax,20 - Slide to exponent position
-    emit_rm(cbuf,0x3,0x4,EAX_enc);
-    emit_d8(cbuf,20);
-    emit_opcode(cbuf,0x85);                          // test rbx,ecx - check for overflow
-    emit_rm(cbuf, 0x3, EBX_enc, ECX_enc);
-    emit_opcode(cbuf,0x0F); emit_opcode(cbuf,0x45);  // CMOVne rax,ecx - overflow; stuff NAN into EAX
-    emit_rm(cbuf, 0x3, EAX_enc, ECX_enc);
-    emit_opcode(cbuf,0x89);                          // mov [esp+4],eax - Store as part of double word
-    encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 4, false);
-    emit_opcode(cbuf,0xC7);                          // mov [esp+0],0   - [ESP] = (double)(1<<int(Q)) = 2^int(Q)
-    encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
-    emit_d32(cbuf,0);
-    emit_opcode(cbuf,0xDC);                          // fmul dword st(0),[esp+0]; FPR1 = 2^int(Q)*2^frac(Q) = 2^Q
-    encode_RegMem(cbuf, 0x1, ESP_enc, 0x4, 0, 0, false);
-  %}
-
   enc_class Push_Result_Mod_DPR( regDPR src) %{
     if ($src$$reg != FPR1L_enc) {
       // fincstp
@@ -10100,161 +10061,67 @@
   ins_pipe( pipe_slow );
 %}
 
-instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
   predicate (UseSSE<=1);
   match(Set Y (PowD X Y));  // Raise X to the Yth power
-  effect(KILL rax, KILL rbx, KILL rcx);
-  format %{ "SUB    ESP,8\t\t# Fast-path POW encoding\n\t"
-            "FLD_D  $X\n\t"
-            "FYL2X  \t\t\t# Q=Y*ln2(X)\n\t"
-
-            "FDUP   \t\t\t# Q Q\n\t"
-            "FRNDINT\t\t\t# int(Q) Q\n\t"
-            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
-            "FISTP  dword [ESP]\n\t"
-            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
-            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
-            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
-            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
-            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
-            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
-            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
-            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
-            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
-            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
-            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
-            "MOV    [ESP+0],0\n\t"
-            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
-
-            "ADD    ESP,8"
-             %}
-  ins_encode( push_stack_temp_qword,
-              Push_Reg_DPR(X),
-              Opcode(0xD9), Opcode(0xF1),   // fyl2x
-              pow_exp_core_encoding,
-              pop_stack_temp_qword);
-  ins_pipe( pipe_slow );
-%}
-
-instruct powD_reg(regD dst, regD src0, regD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx ) %{
+  effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
+  format %{ "fast_pow $X $Y -> $Y  // KILL $rax, $rcx, $rdx" %}
+  ins_encode %{
+    __ subptr(rsp, 8);
+    __ fld_s($X$$reg - 1);
+    __ fast_pow();
+    __ addptr(rsp, 8);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct powD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst (PowD src0 src1));  // Raise src0 to the src1'th power
-  effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx );
-  format %{ "SUB    ESP,8\t\t# Fast-path POW encoding\n\t"
-            "MOVSD  [ESP],$src1\n\t"
-            "FLD    FPR1,$src1\n\t"
-            "MOVSD  [ESP],$src0\n\t"
-            "FLD    FPR1,$src0\n\t"
-            "FYL2X  \t\t\t# Q=Y*ln2(X)\n\t"
-
-            "FDUP   \t\t\t# Q Q\n\t"
-            "FRNDINT\t\t\t# int(Q) Q\n\t"
-            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
-            "FISTP  dword [ESP]\n\t"
-            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
-            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
-            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
-            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
-            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
-            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
-            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
-            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
-            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
-            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
-            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
-            "MOV    [ESP+0],0\n\t"
-            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
-
-            "FST_D  [ESP]\n\t"
-            "MOVSD  $dst,[ESP]\n\t"
-            "ADD    ESP,8"
-             %}
-  ins_encode( push_stack_temp_qword,
-              push_xmm_to_fpr1(src1),
-              push_xmm_to_fpr1(src0),
-              Opcode(0xD9), Opcode(0xF1),   // fyl2x
-              pow_exp_core_encoding,
-              Push_ResultD(dst) );
-  ins_pipe( pipe_slow );
-%}
-
-
-instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+  effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
+  format %{ "fast_pow $src0 $src1 -> $dst  // KILL $rax, $rcx, $rdx" %}
+  ins_encode %{
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src1$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ movdbl(Address(rsp, 0), $src0$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ fast_pow();
+    __ fstp_d(Address(rsp, 0));
+    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, 8);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+
+instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
   predicate (UseSSE<=1);
   match(Set dpr1 (ExpD dpr1));
-  effect(KILL rax, KILL rbx, KILL rcx);
-  format %{ "SUB    ESP,8\t\t# Fast-path EXP encoding"
-            "FLDL2E \t\t\t# Ld log2(e) X\n\t"
-            "FMULP  \t\t\t# Q=X*log2(e)\n\t"
-
-            "FDUP   \t\t\t# Q Q\n\t"
-            "FRNDINT\t\t\t# int(Q) Q\n\t"
-            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
-            "FISTP  dword [ESP]\n\t"
-            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
-            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
-            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
-            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
-            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
-            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
-            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
-            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
-            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
-            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
-            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
-            "MOV    [ESP+0],0\n\t"
-            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
-
-            "ADD    ESP,8"
-             %}
-  ins_encode( push_stack_temp_qword,
-              Opcode(0xD9), Opcode(0xEA),   // fldl2e
-              Opcode(0xDE), Opcode(0xC9),   // fmulp
-              pow_exp_core_encoding,
-              pop_stack_temp_qword);
-  ins_pipe( pipe_slow );
-%}
-
-instruct expD_reg(regD dst, regD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
+  format %{ "fast_exp $dpr1 -> $dpr1  // KILL $rax, $rcx, $rdx" %}
+  ins_encode %{
+    __ fast_exp();
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct expD_reg(regD dst, regD src, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
   predicate (UseSSE>=2);
   match(Set dst (ExpD src));
-  effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx);
-  format %{ "SUB    ESP,8\t\t# Fast-path EXP encoding\n\t"
-            "MOVSD  [ESP],$src\n\t"
-            "FLDL2E \t\t\t# Ld log2(e) X\n\t"
-            "FMULP  \t\t\t# Q=X*log2(e) X\n\t"
-
-            "FDUP   \t\t\t# Q Q\n\t"
-            "FRNDINT\t\t\t# int(Q) Q\n\t"
-            "FSUB   ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
-            "FISTP  dword [ESP]\n\t"
-            "F2XM1  \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
-            "FLD1   \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
-            "FADDP  \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
-            "MOV    EAX,[ESP]\t# Pick up int(Q)\n\t"
-            "MOV    ECX,0xFFFFF800\t# Overflow mask\n\t"
-            "ADD    EAX,1023\t\t# Double exponent bias\n\t"
-            "MOV    EBX,EAX\t\t# Preshifted biased expo\n\t"
-            "SHL    EAX,20\t\t# Shift exponent into place\n\t"
-            "TEST   EBX,ECX\t\t# Check for overflow\n\t"
-            "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
-            "MOV    [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
-            "MOV    [ESP+0],0\n\t"
-            "FMUL   ST(0),[ESP+0]\t# Scale\n\t"
-
-            "FST_D  [ESP]\n\t"
-            "MOVSD  $dst,[ESP]\n\t"
-            "ADD    ESP,8"
-             %}
-  ins_encode( Push_SrcD(src),
-              Opcode(0xD9), Opcode(0xEA),   // fldl2e
-              Opcode(0xDE), Opcode(0xC9),   // fmulp
-              pow_exp_core_encoding,
-              Push_ResultD(dst) );
-  ins_pipe( pipe_slow );
-%}
-
-
+  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
+  format %{ "fast_exp $dst -> $src  // KILL $rax, $rcx, $rdx" %}
+  ins_encode %{
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ fast_exp();
+    __ fstp_d(Address(rsp, 0));
+    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, 8);
+  %}
+  ins_pipe( pipe_slow );
+%}
 
 instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{
   predicate (UseSSE<=1);
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Tue May 15 10:10:23 2012 +0200
@@ -9823,7 +9823,39 @@
   ins_pipe( pipe_slow );
 %}
 
-
+instruct powD_reg(regD dst, regD src0, regD src1, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{
+  match(Set dst (PowD src0 src1));  // Raise src0 to the src1'th power
+  effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
+  format %{ "fast_pow $src0 $src1 -> $dst  // KILL $rax, $rcx, $rdx" %}
+  ins_encode %{
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src1$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ movdbl(Address(rsp, 0), $src0$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ fast_pow();
+    __ fstp_d(Address(rsp, 0));
+    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, 8);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct expD_reg(regD dst, regD src, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{
+  match(Set dst (ExpD src));
+  effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
+  format %{ "fast_exp $dst -> $src  // KILL $rax, $rcx, $rdx" %}
+  ins_encode %{
+    __ subptr(rsp, 8);
+    __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+    __ fld_d(Address(rsp, 0));
+    __ fast_exp();
+    __ fstp_d(Address(rsp, 0));
+    __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+    __ addptr(rsp, 8);
+  %}
+  ins_pipe( pipe_slow );
+%}
 
 //----------Arithmetic Conversion Instructions---------------------------------
 
--- a/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/c1/c1_GraphBuilder.cpp	Tue May 15 10:10:23 2012 +0200
@@ -2949,6 +2949,8 @@
   case vmIntrinsics::_dtan          : // fall through
   case vmIntrinsics::_dlog          : // fall through
   case vmIntrinsics::_dlog10        : // fall through
+  case vmIntrinsics::_dexp          : // fall through
+  case vmIntrinsics::_dpow          : // fall through
     {
       // Compiles where the root method is an intrinsic need a special
       // compilation environment because the bytecodes for the method
@@ -2969,6 +2971,9 @@
       _state = start_block->state()->copy_for_parsing();
       _last  = start_block;
       load_local(doubleType, 0);
+      if (scope->method()->intrinsic_id() == vmIntrinsics::_dpow) {
+        load_local(doubleType, 2);
+      }
 
       // Emit the intrinsic node.
       bool result = try_inline_intrinsics(scope->method());
@@ -3182,6 +3187,8 @@
     case vmIntrinsics::_dtan          : // fall through
     case vmIntrinsics::_dlog          : // fall through
     case vmIntrinsics::_dlog10        : // fall through
+    case vmIntrinsics::_dexp          : // fall through
+    case vmIntrinsics::_dpow          : // fall through
       if (!InlineMathNatives) return false;
       cantrap = false;
       preserves_state = true;
--- a/hotspot/src/share/vm/c1/c1_LIR.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/c1/c1_LIR.cpp	Tue May 15 10:10:23 2012 +0200
@@ -624,11 +624,13 @@
     {
       assert(op->as_Op2() != NULL, "must be");
       LIR_Op2* op2 = (LIR_Op2*)op;
+      assert(op2->_tmp2->is_illegal() && op2->_tmp3->is_illegal() &&
+             op2->_tmp4->is_illegal() && op2->_tmp5->is_illegal(), "not used");
 
       if (op2->_info)                     do_info(op2->_info);
       if (op2->_opr1->is_valid())         do_input(op2->_opr1);
       if (op2->_opr2->is_valid())         do_input(op2->_opr2);
-      if (op2->_tmp->is_valid())          do_temp(op2->_tmp);
+      if (op2->_tmp1->is_valid())         do_temp(op2->_tmp1);
       if (op2->_result->is_valid())       do_output(op2->_result);
 
       break;
@@ -641,7 +643,8 @@
       assert(op->as_Op2() != NULL, "must be");
       LIR_Op2* op2 = (LIR_Op2*)op;
 
-      assert(op2->_info == NULL && op2->_tmp->is_illegal(), "not used");
+      assert(op2->_info == NULL && op2->_tmp1->is_illegal() && op2->_tmp2->is_illegal() &&
+             op2->_tmp3->is_illegal() && op2->_tmp4->is_illegal() && op2->_tmp5->is_illegal(), "not used");
       assert(op2->_opr1->is_valid() && op2->_opr2->is_valid() && op2->_result->is_valid(), "used");
 
       do_input(op2->_opr1);
@@ -665,10 +668,12 @@
       assert(op2->_opr1->is_valid(), "used");
       assert(op2->_opr2->is_valid(), "used");
       assert(op2->_result->is_valid(), "used");
+      assert(op2->_tmp2->is_illegal() && op2->_tmp3->is_illegal() &&
+             op2->_tmp4->is_illegal() && op2->_tmp5->is_illegal(), "not used");
 
       do_input(op2->_opr1); do_temp(op2->_opr1);
       do_input(op2->_opr2); do_temp(op2->_opr2);
-      if (op2->_tmp->is_valid()) do_temp(op2->_tmp);
+      if (op2->_tmp1->is_valid()) do_temp(op2->_tmp1);
       do_output(op2->_result);
 
       break;
@@ -682,6 +687,8 @@
       if (op2->_opr1->is_valid())         do_temp(op2->_opr1);
       if (op2->_opr2->is_valid())         do_input(op2->_opr2); // exception object is input parameter
       assert(op2->_result->is_illegal(), "no result");
+      assert(op2->_tmp2->is_illegal() && op2->_tmp3->is_illegal() &&
+             op2->_tmp4->is_illegal() && op2->_tmp5->is_illegal(), "not used");
 
       break;
     }
@@ -702,7 +709,8 @@
     case lir_sin:
     case lir_cos:
     case lir_log:
-    case lir_log10: {
+    case lir_log10:
+    case lir_exp: {
       assert(op->as_Op2() != NULL, "must be");
       LIR_Op2* op2 = (LIR_Op2*)op;
 
@@ -711,16 +719,47 @@
       // Register input operand as temp to guarantee that it doesn't
       // overlap with the input.
       assert(op2->_info == NULL, "not used");
+      assert(op2->_tmp5->is_illegal(), "not used");
+      assert(op2->_tmp2->is_valid() == (op->code() == lir_exp), "not used");
+      assert(op2->_tmp3->is_valid() == (op->code() == lir_exp), "not used");
+      assert(op2->_tmp4->is_valid() == (op->code() == lir_exp), "not used");
       assert(op2->_opr1->is_valid(), "used");
       do_input(op2->_opr1); do_temp(op2->_opr1);
 
       if (op2->_opr2->is_valid())         do_temp(op2->_opr2);
-      if (op2->_tmp->is_valid())          do_temp(op2->_tmp);
+      if (op2->_tmp1->is_valid())         do_temp(op2->_tmp1);
+      if (op2->_tmp2->is_valid())         do_temp(op2->_tmp2);
+      if (op2->_tmp3->is_valid())         do_temp(op2->_tmp3);
+      if (op2->_tmp4->is_valid())         do_temp(op2->_tmp4);
       if (op2->_result->is_valid())       do_output(op2->_result);
 
       break;
     }
 
+    case lir_pow: {
+      assert(op->as_Op2() != NULL, "must be");
+      LIR_Op2* op2 = (LIR_Op2*)op;
+
+      // On x86 pow needs two temporary fpu stack slots: tmp1 and
+      // tmp2. Register input operands as temps to guarantee that it
+      // doesn't overlap with the temporary slots.
+      assert(op2->_info == NULL, "not used");
+      assert(op2->_opr1->is_valid() && op2->_opr2->is_valid(), "used");
+      assert(op2->_tmp1->is_valid() && op2->_tmp2->is_valid() && op2->_tmp3->is_valid()
+             && op2->_tmp4->is_valid() && op2->_tmp5->is_valid(), "used");
+      assert(op2->_result->is_valid(), "used");
+
+      do_input(op2->_opr1); do_temp(op2->_opr1);
+      do_input(op2->_opr2); do_temp(op2->_opr2);
+      do_temp(op2->_tmp1);
+      do_temp(op2->_tmp2);
+      do_temp(op2->_tmp3);
+      do_temp(op2->_tmp4);
+      do_temp(op2->_tmp5);
+      do_output(op2->_result);
+
+      break;
+    }
 
 // LIR_Op3
     case lir_idiv:
@@ -1670,6 +1709,8 @@
      case lir_tan:                   s = "tan";           break;
      case lir_log:                   s = "log";           break;
      case lir_log10:                 s = "log10";         break;
+     case lir_exp:                   s = "exp";           break;
+     case lir_pow:                   s = "pow";           break;
      case lir_logic_and:             s = "logic_and";     break;
      case lir_logic_or:              s = "logic_or";      break;
      case lir_logic_xor:             s = "logic_xor";     break;
@@ -1892,7 +1933,11 @@
   }
   in_opr1()->print(out);    out->print(" ");
   in_opr2()->print(out);    out->print(" ");
-  if (tmp_opr()->is_valid()) { tmp_opr()->print(out);    out->print(" "); }
+  if (tmp1_opr()->is_valid()) { tmp1_opr()->print(out);    out->print(" "); }
+  if (tmp2_opr()->is_valid()) { tmp2_opr()->print(out);    out->print(" "); }
+  if (tmp3_opr()->is_valid()) { tmp3_opr()->print(out);    out->print(" "); }
+  if (tmp4_opr()->is_valid()) { tmp4_opr()->print(out);    out->print(" "); }
+  if (tmp5_opr()->is_valid()) { tmp5_opr()->print(out);    out->print(" "); }
   result_opr()->print(out);
 }
 
--- a/hotspot/src/share/vm/c1/c1_LIR.hpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/c1/c1_LIR.hpp	Tue May 15 10:10:23 2012 +0200
@@ -916,6 +916,8 @@
       , lir_tan
       , lir_log
       , lir_log10
+      , lir_exp
+      , lir_pow
       , lir_logic_and
       , lir_logic_or
       , lir_logic_xor
@@ -1560,7 +1562,11 @@
   LIR_Opr   _opr1;
   LIR_Opr   _opr2;
   BasicType _type;
-  LIR_Opr   _tmp;
+  LIR_Opr   _tmp1;
+  LIR_Opr   _tmp2;
+  LIR_Opr   _tmp3;
+  LIR_Opr   _tmp4;
+  LIR_Opr   _tmp5;
   LIR_Condition _condition;
 
   void verify() const;
@@ -1573,7 +1579,11 @@
     , _type(T_ILLEGAL)
     , _condition(condition)
     , _fpu_stack_size(0)
-    , _tmp(LIR_OprFact::illegalOpr) {
+    , _tmp1(LIR_OprFact::illegalOpr)
+    , _tmp2(LIR_OprFact::illegalOpr)
+    , _tmp3(LIR_OprFact::illegalOpr)
+    , _tmp4(LIR_OprFact::illegalOpr)
+    , _tmp5(LIR_OprFact::illegalOpr) {
     assert(code == lir_cmp, "code check");
   }
 
@@ -1584,7 +1594,11 @@
     , _type(type)
     , _condition(condition)
     , _fpu_stack_size(0)
-    , _tmp(LIR_OprFact::illegalOpr) {
+    , _tmp1(LIR_OprFact::illegalOpr)
+    , _tmp2(LIR_OprFact::illegalOpr)
+    , _tmp3(LIR_OprFact::illegalOpr)
+    , _tmp4(LIR_OprFact::illegalOpr)
+    , _tmp5(LIR_OprFact::illegalOpr) {
     assert(code == lir_cmove, "code check");
     assert(type != T_ILLEGAL, "cmove should have type");
   }
@@ -1597,25 +1611,38 @@
     , _type(type)
     , _condition(lir_cond_unknown)
     , _fpu_stack_size(0)
-    , _tmp(LIR_OprFact::illegalOpr) {
+    , _tmp1(LIR_OprFact::illegalOpr)
+    , _tmp2(LIR_OprFact::illegalOpr)
+    , _tmp3(LIR_OprFact::illegalOpr)
+    , _tmp4(LIR_OprFact::illegalOpr)
+    , _tmp5(LIR_OprFact::illegalOpr) {
     assert(code != lir_cmp && is_in_range(code, begin_op2, end_op2), "code check");
   }
 
-  LIR_Op2(LIR_Code code, LIR_Opr opr1, LIR_Opr opr2, LIR_Opr result, LIR_Opr tmp)
+  LIR_Op2(LIR_Code code, LIR_Opr opr1, LIR_Opr opr2, LIR_Opr result, LIR_Opr tmp1, LIR_Opr tmp2 = LIR_OprFact::illegalOpr,
+          LIR_Opr tmp3 = LIR_OprFact::illegalOpr, LIR_Opr tmp4 = LIR_OprFact::illegalOpr, LIR_Opr tmp5 = LIR_OprFact::illegalOpr)
     : LIR_Op(code, result, NULL)
     , _opr1(opr1)
     , _opr2(opr2)
     , _type(T_ILLEGAL)
     , _condition(lir_cond_unknown)
     , _fpu_stack_size(0)
-    , _tmp(tmp) {
+    , _tmp1(tmp1)
+    , _tmp2(tmp2)
+    , _tmp3(tmp3)
+    , _tmp4(tmp4)
+    , _tmp5(tmp5) {
     assert(code != lir_cmp && is_in_range(code, begin_op2, end_op2), "code check");
   }
 
   LIR_Opr in_opr1() const                        { return _opr1; }
   LIR_Opr in_opr2() const                        { return _opr2; }
   BasicType type()  const                        { return _type; }
-  LIR_Opr tmp_opr() const                        { return _tmp; }
+  LIR_Opr tmp1_opr() const                       { return _tmp1; }
+  LIR_Opr tmp2_opr() const                       { return _tmp2; }
+  LIR_Opr tmp3_opr() const                       { return _tmp3; }
+  LIR_Opr tmp4_opr() const                       { return _tmp4; }
+  LIR_Opr tmp5_opr() const                       { return _tmp5; }
   LIR_Condition condition() const  {
     assert(code() == lir_cmp || code() == lir_cmove, "only valid for cmp and cmove"); return _condition;
   }
@@ -2025,6 +2052,8 @@
   void sin (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_sin , from, tmp1, to, tmp2)); }
   void cos (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_cos , from, tmp1, to, tmp2)); }
   void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); }
+  void exp (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5)                { append(new LIR_Op2(lir_exp , from, tmp1, to, tmp2, tmp3, tmp4, tmp5)); }
+  void pow (LIR_Opr arg1, LIR_Opr arg2, LIR_Opr res, LIR_Opr tmp1, LIR_Opr tmp2, LIR_Opr tmp3, LIR_Opr tmp4, LIR_Opr tmp5) { append(new LIR_Op2(lir_pow, arg1, arg2, res, tmp1, tmp2, tmp3, tmp4, tmp5)); }
 
   void add (LIR_Opr left, LIR_Opr right, LIR_Opr res)      { append(new LIR_Op2(lir_add, left, right, res)); }
   void sub (LIR_Opr left, LIR_Opr right, LIR_Opr res, CodeEmitInfo* info = NULL) { append(new LIR_Op2(lir_sub, left, right, res, info)); }
--- a/hotspot/src/share/vm/c1/c1_LIRAssembler.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/c1/c1_LIRAssembler.cpp	Tue May 15 10:10:23 2012 +0200
@@ -718,7 +718,7 @@
       if (op->in_opr2()->is_constant()) {
         shift_op(op->code(), op->in_opr1(), op->in_opr2()->as_constant_ptr()->as_jint(), op->result_opr());
       } else {
-        shift_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op->tmp_opr());
+        shift_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op->tmp1_opr());
       }
       break;
 
@@ -746,6 +746,8 @@
     case lir_cos:
     case lir_log:
     case lir_log10:
+    case lir_exp:
+    case lir_pow:
       intrinsic_op(op->code(), op->in_opr1(), op->in_opr2(), op->result_opr(), op);
       break;
 
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp	Tue May 15 10:10:23 2012 +0200
@@ -2960,7 +2960,9 @@
   case vmIntrinsics::_dsqrt:          // fall through
   case vmIntrinsics::_dtan:           // fall through
   case vmIntrinsics::_dsin :          // fall through
-  case vmIntrinsics::_dcos :          do_MathIntrinsic(x); break;
+  case vmIntrinsics::_dcos :          // fall through
+  case vmIntrinsics::_dexp :          // fall through
+  case vmIntrinsics::_dpow :          do_MathIntrinsic(x); break;
   case vmIntrinsics::_arraycopy:      do_ArrayCopy(x);     break;
 
   // java.nio.Buffer.checkIndex
--- a/hotspot/src/share/vm/c1/c1_LinearScan.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/c1/c1_LinearScan.cpp	Tue May 15 10:10:23 2012 +0200
@@ -6579,6 +6579,8 @@
         case lir_abs:
         case lir_log10:
         case lir_log:
+        case lir_pow:
+        case lir_exp:
         case lir_logic_and:
         case lir_logic_or:
         case lir_logic_xor:
--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp	Tue May 15 10:10:23 2012 +0200
@@ -107,6 +107,8 @@
     java_lang_math_sqrt,                                        // implementation of java.lang.Math.sqrt  (x)
     java_lang_math_log,                                         // implementation of java.lang.Math.log   (x)
     java_lang_math_log10,                                       // implementation of java.lang.Math.log10 (x)
+    java_lang_math_pow,                                         // implementation of java.lang.Math.pow   (x,y)
+    java_lang_math_exp,                                         // implementation of java.lang.Math.exp   (x)
     java_lang_ref_reference_get,                                // implementation of java.lang.ref.Reference.get()
     number_of_method_entries,
     invalid = -1
--- a/hotspot/src/share/vm/interpreter/interpreter.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/interpreter/interpreter.cpp	Tue May 15 10:10:23 2012 +0200
@@ -221,6 +221,8 @@
     case vmIntrinsics::_dsqrt : return java_lang_math_sqrt ;
     case vmIntrinsics::_dlog  : return java_lang_math_log  ;
     case vmIntrinsics::_dlog10: return java_lang_math_log10;
+    case vmIntrinsics::_dpow  : return java_lang_math_pow  ;
+    case vmIntrinsics::_dexp  : return java_lang_math_exp  ;
 
     case vmIntrinsics::_Reference_get:
                                 return java_lang_ref_reference_get;
--- a/hotspot/src/share/vm/interpreter/templateInterpreter.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/interpreter/templateInterpreter.cpp	Tue May 15 10:10:23 2012 +0200
@@ -370,6 +370,8 @@
   method_entry(java_lang_math_sqrt )
   method_entry(java_lang_math_log  )
   method_entry(java_lang_math_log10)
+  method_entry(java_lang_math_exp  )
+  method_entry(java_lang_math_pow  )
   method_entry(java_lang_ref_reference_get)
 
   // all native method kinds (must be one contiguous block)
--- a/hotspot/src/share/vm/opto/library_call.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/opto/library_call.cpp	Tue May 15 10:10:23 2012 +0200
@@ -1557,9 +1557,6 @@
   // every again.  NaN results requires StrictMath.exp handling.
   if (too_many_traps(Deoptimization::Reason_intrinsic))  return false;
 
-  // Do not intrinsify on older platforms which lack cmove.
-  if (ConditionalMoveLimit == 0)  return false;
-
   _sp += arg_size();        // restore stack pointer
   Node *x = pop_math_arg();
   Node *result = _gvn.transform(new (C, 2) ExpDNode(0,x));
@@ -1802,15 +1799,11 @@
   case vmIntrinsics::_dsqrt: return Matcher::has_match_rule(Op_SqrtD) ? inline_sqrt(id) : false;
   case vmIntrinsics::_dabs:  return Matcher::has_match_rule(Op_AbsD)  ? inline_abs(id)  : false;
 
-    // These intrinsics don't work on X86.  The ad implementation doesn't
-    // handle NaN's properly.  Instead of returning infinity, the ad
-    // implementation returns a NaN on overflow. See bug: 6304089
-    // Once the ad implementations are fixed, change the code below
-    // to match the intrinsics above
-
   case vmIntrinsics::_dexp:  return
+    Matcher::has_match_rule(Op_ExpD) ? inline_exp(id) :
     runtime_math(OptoRuntime::Math_D_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dexp), "EXP");
   case vmIntrinsics::_dpow:  return
+    Matcher::has_match_rule(Op_PowD) ? inline_pow(id) :
     runtime_math(OptoRuntime::Math_DD_D_Type(), CAST_FROM_FN_PTR(address, SharedRuntime::dpow), "POW");
 
    // These intrinsics are not yet correctly implemented
--- a/hotspot/src/share/vm/opto/subnode.cpp	Mon May 14 09:36:00 2012 -0700
+++ b/hotspot/src/share/vm/opto/subnode.cpp	Tue May 15 10:10:23 2012 +0200
@@ -1314,7 +1314,5 @@
   if( t2->base() != Type::DoubleCon ) return Type::DOUBLE;
   double d1 = t1->getd();
   double d2 = t2->getd();
-  if( d1 < 0.0 ) return Type::DOUBLE;
-  if( d2 < 0.0 ) return Type::DOUBLE;
   return TypeD::make( StubRoutines::intrinsic_pow( d1, d2 ) );
 }