hotspot/src/cpu/x86/vm/assembler_x86.cpp
changeset 12739 09f26b73ae66
parent 12268 f7897aacb9ce
child 12742 b055b648431a
equal deleted inserted replaced
12623:09fcb0dc71ad 12739:09f26b73ae66
  3576 void Assembler::fyl2x() {
  3576 void Assembler::fyl2x() {
  3577   emit_byte(0xD9);
  3577   emit_byte(0xD9);
  3578   emit_byte(0xF1);
  3578   emit_byte(0xF1);
  3579 }
  3579 }
  3580 
  3580 
       
  3581 void Assembler::frndint() {
       
  3582   emit_byte(0xD9);
       
  3583   emit_byte(0xFC);
       
  3584 }
       
  3585 
       
  3586 void Assembler::f2xm1() {
       
  3587   emit_byte(0xD9);
       
  3588   emit_byte(0xF0);
       
  3589 }
       
  3590 
       
  3591 void Assembler::fldl2e() {
       
  3592   emit_byte(0xD9);
       
  3593   emit_byte(0xEA);
       
  3594 }
       
  3595 
  3581 // SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
  3596 // SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
  3582 static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
  3597 static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
  3583 // SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
  3598 // SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
  3584 static int simd_opc[4] = { 0,    0, 0x38, 0x3A };
  3599 static int simd_opc[4] = { 0,    0, 0x38, 0x3A };
  3585 
  3600 
  6866 
  6881 
  6867 void MacroAssembler::fldcw(AddressLiteral src) {
  6882 void MacroAssembler::fldcw(AddressLiteral src) {
  6868   Assembler::fldcw(as_Address(src));
  6883   Assembler::fldcw(as_Address(src));
  6869 }
  6884 }
  6870 
  6885 
       
  6886 void MacroAssembler::pow_exp_core_encoding() {
       
  6887   // kills rax, rcx, rdx
       
  6888   subptr(rsp,sizeof(jdouble));
       
  6889   // computes 2^X. Stack: X ...
       
  6890   // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
       
  6891   // keep it on the thread's stack to compute 2^int(X) later
       
  6892   // then compute 2^(X-int(X)) as (2^(X-int(X))-1)+1
       
  6893   // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
       
  6894   fld_s(0);                 // Stack: X X ...
       
  6895   frndint();                // Stack: int(X) X ...
       
  6896   fsuba(1);                 // Stack: int(X) X-int(X) ...
       
  6897   fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
       
  6898   f2xm1();                  // Stack: 2^(X-int(X))-1 ...
       
  6899   fld1();                   // Stack: 1 2^(X-int(X))-1 ...
       
  6900   faddp(1);                 // Stack: 2^(X-int(X))
       
  6901   // computes 2^(int(X)): add exponent bias (1023) to int(X), then
       
  6902   // shift int(X)+1023 to exponent position.
       
  6903   // Exponent is limited to 11 bits: if int(X)+1023 does not fit in 11
       
  6904   // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
       
  6905   // values so detect them and set result to NaN.
       
  6906   movl(rax,Address(rsp,0));
       
  6907   movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
       
  6908   addl(rax, 1023);
       
  6909   movl(rdx,rax);
       
  6910   shll(rax,20);
       
  6911   // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
       
  6912   addl(rdx,1);
       
  6913   // Check that 1 < int(X)+1023+1 < 2048
       
  6914   // in 3 steps:
       
  6915   // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
       
  6916   // 2- (int(X)+1023+1) != 0
       
  6917   // 3- (int(X)+1023+1) != 1
       
  6918   // Do 2- first because addl just updated the flags.
       
  6919   cmov32(Assembler::equal,rax,rcx);
       
  6920   cmpl(rdx,1);
       
  6921   cmov32(Assembler::equal,rax,rcx);
       
  6922   testl(rdx,rcx);
       
  6923   cmov32(Assembler::notEqual,rax,rcx);
       
  6924   movl(Address(rsp,4),rax);
       
  6925   movl(Address(rsp,0),0);
       
  6926   fmul_d(Address(rsp,0));   // Stack: 2^X ...
       
  6927   addptr(rsp,sizeof(jdouble));
       
  6928 }
       
  6929 
       
  6930 void MacroAssembler::fast_pow() {
       
  6931   // computes X^Y = 2^(Y * log2(X))
       
  6932   // if fast computation is not possible, result is NaN. Requires
       
  6933   // fallback from user of this macro.
       
  6934   fyl2x();                 // Stack: (Y*log2(X)) ...
       
  6935   pow_exp_core_encoding(); // Stack: X^Y ...
       
  6936 }
       
  6937 
       
  6938 void MacroAssembler::fast_exp() {
       
  6939   // computes exp(X) = 2^(X * log2(e))
       
  6940   // if fast computation is not possible, result is NaN. Requires
       
  6941   // fallback from user of this macro.
       
  6942   fldl2e();                // Stack: log2(e) X ...
       
  6943   fmulp(1);                // Stack: (X*log2(e)) ...
       
  6944   pow_exp_core_encoding(); // Stack: exp(X) ...
       
  6945 }
       
  6946 
       
  6947 void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
       
  6948   // kills rax, rcx, rdx
       
  6949   // pow and exp need 2 extra registers on the fpu stack.
       
  6950   Label slow_case, done;
       
  6951   Register tmp = noreg;
       
  6952   if (!VM_Version::supports_cmov()) {
       
  6953     // fcmp needs a temporary: use rdx (this macro already kills rdx).
       
  6954     tmp = rdx;
       
  6955   }
       
  6956   Register tmp2 = rax;
       
  6957   NOT_LP64(Register tmp3 = rcx;)
       
  6958 
       
  6959   if (is_exp) {
       
  6960     // Stack: X
       
  6961     fld_s(0);                   // duplicate argument for runtime call. Stack: X X
       
  6962     fast_exp();                 // Stack: exp(X) X
       
  6963     fcmp(tmp, 0, false, false); // Stack: exp(X) X
       
  6964     // exp(X) not equal to itself: exp(X) is NaN go to slow case.
       
  6965     jcc(Assembler::parity, slow_case);
       
  6966     // get rid of duplicate argument. Stack: exp(X)
       
  6967     if (num_fpu_regs_in_use > 0) {
       
  6968       fxch();
       
  6969       fpop();
       
  6970     } else {
       
  6971       ffree(1);
       
  6972     }
       
  6973     jmp(done);
       
  6974   } else {
       
  6975     // Stack: X Y
       
  6976     Label x_negative, y_odd;
       
  6977 
       
  6978     fldz();                     // Stack: 0 X Y
       
  6979     fcmp(tmp, 1, true, false);  // Stack: X Y
       
  6980     jcc(Assembler::above, x_negative);
       
  6981 
       
  6982     // X >= 0
       
  6983 
       
  6984     fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
       
  6985     fld_s(1);                   // Stack: X Y X Y
       
  6986     fast_pow();                 // Stack: X^Y X Y
       
  6987     fcmp(tmp, 0, false, false); // Stack: X^Y X Y
       
  6988     // X^Y not equal to itself: X^Y is NaN go to slow case.
       
  6989     jcc(Assembler::parity, slow_case);
       
  6990     // get rid of duplicate arguments. Stack: X^Y
       
  6991     if (num_fpu_regs_in_use > 0) {
       
  6992       fxch(); fpop();
       
  6993       fxch(); fpop();
       
  6994     } else {
       
  6995       ffree(2);
       
  6996       ffree(1);
       
  6997     }
       
  6998     jmp(done);
       
  6999 
       
  7000     // X < 0
       
  7001     bind(x_negative);
       
  7002 
       
  7003     fld_s(1);                   // Stack: Y X Y
       
  7004     frndint();                  // Stack: int(Y) X Y
       
  7005     fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
       
  7006     jcc(Assembler::notEqual, slow_case);
       
  7007 
       
  7008     subptr(rsp, 8);
       
  7009 
       
  7010     // For X^Y, when X < 0, Y has to be an integer and the final
       
  7011     // result depends on whether it's odd or even. We just checked
       
  7012     // that int(Y) == Y.  We move int(Y) to gp registers as a 64 bit
       
  7013     // integer to test its parity. If int(Y) is huge and doesn't fit
       
  7014     // in the 64 bit integer range, the integer indefinite value will
       
  7015     // end up in the gp registers. Huge numbers are all even, the
       
  7016     // integer indefinite number is even so it's fine.
       
  7017 
       
  7018 #ifdef ASSERT
       
  7019     // Let's check we don't end up with an integer indefinite number
       
  7020     // when not expected. First test for huge numbers: check whether
       
  7021     // int(Y)+1 == int(Y) which is true for very large numbers and
       
  7022     // those are all even. A 64 bit integer is guaranteed to not
       
  7023     // overflow for numbers where y+1 != y (when precision is set to
       
  7024     // double precision).
       
  7025     Label y_not_huge;
       
  7026 
       
  7027     fld1();                     // Stack: 1 int(Y) X Y
       
  7028     fadd(1);                    // Stack: 1+int(Y) int(Y) X Y
       
  7029 
       
  7030 #ifdef _LP64
       
  7031     // trip to memory to force the precision down from double extended
       
  7032     // precision
       
  7033     fstp_d(Address(rsp, 0));
       
  7034     fld_d(Address(rsp, 0));
       
  7035 #endif
       
  7036 
       
  7037     fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
       
  7038 #endif
       
  7039 
       
  7040     // move int(Y) as 64 bit integer to thread's stack
       
  7041     fistp_d(Address(rsp,0));    // Stack: X Y
       
  7042 
       
  7043 #ifdef ASSERT
       
  7044     jcc(Assembler::notEqual, y_not_huge);
       
  7045 
       
  7046     // Y is huge so we know it's even. It may not fit in a 64 bit
       
  7047     // integer and we don't want the debug code below to see the
       
  7048     // integer indefinite value so overwrite int(Y) on the thread's
       
  7049     // stack with 0.
       
  7050     movl(Address(rsp, 0), 0);
       
  7051     movl(Address(rsp, 4), 0);
       
  7052 
       
  7053     bind(y_not_huge);
       
  7054 #endif
       
  7055 
       
  7056     fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
       
  7057     fld_s(1);                   // Stack: X Y X Y
       
  7058     fabs();                     // Stack: abs(X) Y X Y
       
  7059     fast_pow();                 // Stack: abs(X)^Y X Y
       
  7060     fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
       
  7061     // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.
       
  7062 
       
  7063     pop(tmp2);
       
  7064     NOT_LP64(pop(tmp3));
       
  7065     jcc(Assembler::parity, slow_case);
       
  7066 
       
  7067 #ifdef ASSERT
       
  7068     // Check that int(Y) is not integer indefinite value (int
       
  7069     // overflow). Shouldn't happen because for values that would
       
  7070     // overflow, 1+int(Y)==Y which was tested earlier.
       
  7071 #ifndef _LP64
       
  7072     {
       
  7073       Label integer;
       
  7074       testl(tmp2, tmp2);
       
  7075       jcc(Assembler::notZero, integer);
       
  7076       cmpl(tmp3, 0x80000000);
       
  7077       jcc(Assembler::notZero, integer);
       
  7078       stop("integer indefinite value shouldn't be seen here");
       
  7079       bind(integer);
       
  7080     }
       
  7081 #else
       
  7082     {
       
  7083       Label integer;
       
  7084       shlq(tmp2, 1);
       
  7085       jcc(Assembler::carryClear, integer);
       
  7086       jcc(Assembler::notZero, integer);
       
  7087       stop("integer indefinite value shouldn't be seen here");
       
  7088       bind(integer);
       
  7089     }
       
  7090 #endif
       
  7091 #endif
       
  7092 
       
  7093     // get rid of duplicate arguments. Stack: X^Y
       
  7094     if (num_fpu_regs_in_use > 0) {
       
  7095       fxch(); fpop();
       
  7096       fxch(); fpop();
       
  7097     } else {
       
  7098       ffree(2);
       
  7099       ffree(1);
       
  7100     }
       
  7101 
       
  7102     testl(tmp2, 1);
       
  7103     jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
       
  7104     // X <= 0, Y odd: X^Y = -abs(X)^Y
       
  7105 
       
  7106     fchs();                     // Stack: -abs(X)^Y
       
  7107     jmp(done);
       
  7108   }
       
  7109 
       
  7110   // slow case: runtime call
       
  7111   bind(slow_case);
       
  7112 
       
  7113   fpop();                       // pop incorrect result or int(Y)
       
  7114 
       
  7115   fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
       
  7116                       is_exp ? 1 : 2, num_fpu_regs_in_use);
       
  7117 
       
  7118   // Come here with result in F-TOS
       
  7119   bind(done);
       
  7120 }
       
  7121 
  6871 void MacroAssembler::fpop() {
  7122 void MacroAssembler::fpop() {
  6872   ffree();
  7123   ffree();
  6873   fincstp();
  7124   fincstp();
  6874 }
  7125 }
  6875 
  7126 
  8043   }
  8294   }
  8044   adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
  8295   adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0);
  8045 #endif
  8296 #endif
  8046 }
  8297 }
  8047 
  8298 
       
  8299 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
       
  8300   pusha();
       
  8301 
       
  8302   // if we are coming from c1, xmm registers may be live
       
  8303   if (UseSSE >= 1) {
       
  8304     subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
       
  8305   }
       
  8306   int off = 0;
       
  8307   if (UseSSE == 1)  {
       
  8308     movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
       
  8309     movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
       
  8310     movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
       
  8311     movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
       
  8312     movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
       
  8313     movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
       
  8314     movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
       
  8315     movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
       
  8316   } else if (UseSSE >= 2)  {
       
  8317     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
       
  8318     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
       
  8319     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
       
  8320     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
       
  8321     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
       
  8322     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
       
  8323     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
       
  8324     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
       
  8325 #ifdef _LP64
       
  8326     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
       
  8327     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
       
  8328     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
       
  8329     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
       
  8330     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
       
  8331     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
       
  8332     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
       
  8333     movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
       
  8334 #endif
       
  8335   }
       
  8336 
       
  8337   // Preserve registers across runtime call
       
  8338   int incoming_argument_and_return_value_offset = -1;
       
  8339   if (num_fpu_regs_in_use > 1) {
       
  8340     // Must preserve all other FPU regs (could alternatively convert
       
  8341     // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
       
  8342     // FPU state, but can not trust C compiler)
       
  8343     NEEDS_CLEANUP;
       
  8344     // NOTE that in this case we also push the incoming argument(s) to
       
  8345     // the stack and restore it later; we also use this stack slot to
       
  8346     // hold the return value from dsin, dcos etc.
       
  8347     for (int i = 0; i < num_fpu_regs_in_use; i++) {
       
  8348       subptr(rsp, sizeof(jdouble));
       
  8349       fstp_d(Address(rsp, 0));
       
  8350     }
       
  8351     incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
       
  8352     for (int i = nb_args-1; i >= 0; i--) {
       
  8353       fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
       
  8354     }
       
  8355   }
       
  8356 
       
  8357   subptr(rsp, nb_args*sizeof(jdouble));
       
  8358   for (int i = 0; i < nb_args; i++) {
       
  8359     fstp_d(Address(rsp, i*sizeof(jdouble)));
       
  8360   }
       
  8361 
       
  8362 #ifdef _LP64
       
  8363   if (nb_args > 0) {
       
  8364     movdbl(xmm0, Address(rsp, 0));
       
  8365   }
       
  8366   if (nb_args > 1) {
       
  8367     movdbl(xmm1, Address(rsp, sizeof(jdouble)));
       
  8368   }
       
  8369   assert(nb_args <= 2, "unsupported number of args");
       
  8370 #endif // _LP64
       
  8371 
       
  8372   // NOTE: we must not use call_VM_leaf here because that requires a
       
  8373   // complete interpreter frame in debug mode -- same bug as 4387334
       
  8374   // MacroAssembler::call_VM_leaf_base is perfectly safe and will
       
  8375   // do proper 64bit abi
       
  8376 
       
  8377   NEEDS_CLEANUP;
       
  8378   // Need to add stack banging before this runtime call if it needs to
       
  8379   // be taken; however, there is no generic stack banging routine at
       
  8380   // the MacroAssembler level
       
  8381 
       
  8382   MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
       
  8383 
       
  8384 #ifdef _LP64
       
  8385   movsd(Address(rsp, 0), xmm0);
       
  8386   fld_d(Address(rsp, 0));
       
  8387 #endif // _LP64
       
  8388   addptr(rsp, sizeof(jdouble) * nb_args);
       
  8389   if (num_fpu_regs_in_use > 1) {
       
  8390     // Must save return value to stack and then restore entire FPU
       
  8391     // stack except incoming arguments
       
  8392     fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
       
  8393     for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
       
  8394       fld_d(Address(rsp, 0));
       
  8395       addptr(rsp, sizeof(jdouble));
       
  8396     }
       
  8397     fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
       
  8398     addptr(rsp, sizeof(jdouble) * nb_args);
       
  8399   }
       
  8400 
       
  8401   off = 0;
       
  8402   if (UseSSE == 1)  {
       
  8403     movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
       
  8404     movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
       
  8405     movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
       
  8406     movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
       
  8407     movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
       
  8408     movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
       
  8409     movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
       
  8410     movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
       
  8411   } else if (UseSSE >= 2)  {
       
  8412     movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
       
  8413     movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
       
  8414     movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
       
  8415     movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
       
  8416     movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
       
  8417     movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
       
  8418     movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
       
  8419     movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
       
  8420 #ifdef _LP64
       
  8421     movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
       
  8422     movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
       
  8423     movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
       
  8424     movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
       
  8425     movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
       
  8426     movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
       
  8427     movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
       
  8428     movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
       
  8429 #endif
       
  8430   }
       
  8431   if (UseSSE >= 1) {
       
  8432     addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
       
  8433   }
       
  8434   popa();
       
  8435 }
       
  8436 
  8048 static const double     pi_4 =  0.7853981633974483;
  8437 static const double     pi_4 =  0.7853981633974483;
  8049 
  8438 
  8050 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  8439 void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  8051   // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  8440   // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  8052   // was attempted in this code; unfortunately it appears that the
  8441   // was attempted in this code; unfortunately it appears that the
  8090     jmp(done);
  8479     jmp(done);
  8091   }
  8480   }
  8092 
  8481 
  8093   // slow case: runtime call
  8482   // slow case: runtime call
  8094   bind(slow_case);
  8483   bind(slow_case);
  8095   // Preserve registers across runtime call
  8484 
  8096   pusha();
       
  8097   int incoming_argument_and_return_value_offset = -1;
       
  8098   if (num_fpu_regs_in_use > 1) {
       
  8099     // Must preserve all other FPU regs (could alternatively convert
       
  8100     // SharedRuntime::dsin and dcos into assembly routines known not to trash
       
  8101     // FPU state, but can not trust C compiler)
       
  8102     NEEDS_CLEANUP;
       
  8103     // NOTE that in this case we also push the incoming argument to
       
  8104     // the stack and restore it later; we also use this stack slot to
       
  8105     // hold the return value from dsin or dcos.
       
  8106     for (int i = 0; i < num_fpu_regs_in_use; i++) {
       
  8107       subptr(rsp, sizeof(jdouble));
       
  8108       fstp_d(Address(rsp, 0));
       
  8109     }
       
  8110     incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
       
  8111     fld_d(Address(rsp, incoming_argument_and_return_value_offset));
       
  8112   }
       
  8113   subptr(rsp, sizeof(jdouble));
       
  8114   fstp_d(Address(rsp, 0));
       
  8115 #ifdef _LP64
       
  8116   movdbl(xmm0, Address(rsp, 0));
       
  8117 #endif // _LP64
       
  8118 
       
  8119   // NOTE: we must not use call_VM_leaf here because that requires a
       
  8120   // complete interpreter frame in debug mode -- same bug as 4387334
       
  8121   // MacroAssembler::call_VM_leaf_base is perfectly safe and will
       
  8122   // do proper 64bit abi
       
  8123 
       
  8124   NEEDS_CLEANUP;
       
  8125   // Need to add stack banging before this runtime call if it needs to
       
  8126   // be taken; however, there is no generic stack banging routine at
       
  8127   // the MacroAssembler level
       
  8128   switch(trig) {
  8485   switch(trig) {
  8129   case 's':
  8486   case 's':
  8130     {
  8487     {
  8131       MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0);
  8488       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
  8132     }
  8489     }
  8133     break;
  8490     break;
  8134   case 'c':
  8491   case 'c':
  8135     {
  8492     {
  8136       MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0);
  8493       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
  8137     }
  8494     }
  8138     break;
  8495     break;
  8139   case 't':
  8496   case 't':
  8140     {
  8497     {
  8141       MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0);
  8498       fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
  8142     }
  8499     }
  8143     break;
  8500     break;
  8144   default:
  8501   default:
  8145     assert(false, "bad intrinsic");
  8502     assert(false, "bad intrinsic");
  8146     break;
  8503     break;
  8147   }
  8504   }
  8148 #ifdef _LP64
       
  8149     movsd(Address(rsp, 0), xmm0);
       
  8150     fld_d(Address(rsp, 0));
       
  8151 #endif // _LP64
       
  8152   addptr(rsp, sizeof(jdouble));
       
  8153   if (num_fpu_regs_in_use > 1) {
       
  8154     // Must save return value to stack and then restore entire FPU stack
       
  8155     fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
       
  8156     for (int i = 0; i < num_fpu_regs_in_use; i++) {
       
  8157       fld_d(Address(rsp, 0));
       
  8158       addptr(rsp, sizeof(jdouble));
       
  8159     }
       
  8160   }
       
  8161   popa();
       
  8162 
  8505 
  8163   // Come here with result in F-TOS
  8506   // Come here with result in F-TOS
  8164   bind(done);
  8507   bind(done);
  8165 
  8508 
  8166   if (tmp != noreg) {
  8509   if (tmp != noreg) {