8055494: Add C2 x86 intrinsic for BigInteger::multiplyToLen() method
Summary: Add new C2 intrinsic for BigInteger::multiplyToLen() on x86 in 64-bit VM.
Reviewed-by: roland
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Tue Sep 02 12:48:45 2014 -0700
@@ -4937,6 +4937,26 @@
emit_arith(0x03, 0xC0, dst, src);
}
+void Assembler::adcxq(Register dst, Register src) { // 64-bit register-register ADCX ("Add Unsigned Integers with Carry Flag" per assembler_x86.hpp)
+ //assert(VM_Version::supports_adx(), "adx instructions not supported"); NOTE(review): check is disabled here; the call site in multiply_128_x_128_bmi2_loop tests VM_Version::supports_adx() itself
+ emit_int8((unsigned char)0x66); // leading prefix byte; adoxq emits 0xF3 here and is otherwise identical
+ int encode = prefixq_and_encode(dst->encoding(), src->encoding()); // presumably emits the 64-bit operand-size prefix and returns the packed register bits -- helper not shown in this patch, confirm
+ emit_int8(0x0F);
+ emit_int8(0x38);
+ emit_int8((unsigned char)0xF6); // opcode bytes emitted: 0F 38 F6
+ emit_int8((unsigned char)(0xC0 | encode)); // final byte: 0xC0 combined with the register bits returned above
+}
+
+void Assembler::adoxq(Register dst, Register src) { // 64-bit register-register ADOX ("Add Unsigned Integers with Overflow Flag" per assembler_x86.hpp)
+ //assert(VM_Version::supports_adx(), "adx instructions not supported"); NOTE(review): check is disabled here; the call site in multiply_128_x_128_bmi2_loop tests VM_Version::supports_adx() itself
+ emit_int8((unsigned char)0xF3); // leading prefix byte; adcxq emits 0x66 here and is otherwise identical
+ int encode = prefixq_and_encode(dst->encoding(), src->encoding()); // presumably emits the 64-bit operand-size prefix and returns the packed register bits -- helper not shown in this patch, confirm
+ emit_int8(0x0F);
+ emit_int8(0x38);
+ emit_int8((unsigned char)0xF6); // opcode bytes emitted: 0F 38 F6
+ emit_int8((unsigned char)(0xC0 | encode)); // final byte: 0xC0 combined with the register bits returned above
+}
+
void Assembler::andq(Address dst, int32_t imm32) {
InstructionMark im(this);
prefixq(dst);
@@ -5444,6 +5464,26 @@
emit_int8((unsigned char)(0xC0 | encode));
}
+void Assembler::mulq(Address src) { // 64-bit unsigned multiply by a memory operand ("Unsigned multiply with RAX destination register" per assembler_x86.hpp)
+ InstructionMark im(this);
+ prefixq(src);
+ emit_int8((unsigned char)0xF7); // same opcode byte as the register form mulq(Register)
+ emit_operand(rsp, src); // rsp is passed only as the register field; presumably its encoding (4) selects the same /4 extension that 0xE0 encodes in mulq(Register) -- confirm
+}
+
+void Assembler::mulq(Register src) { // 64-bit unsigned multiply by a register; call sites in this patch annotate it as "rax * src -> rdx:rax"
+ int encode = prefixq_and_encode(src->encoding()); // presumably emits the 64-bit operand-size prefix and returns the register bits -- helper not shown in this patch, confirm
+ emit_int8((unsigned char)0xF7);
+ emit_int8((unsigned char)(0xE0 | encode)); // 0xE0 == 0xC0 | (4 << 3), combined with the source register bits
+}
+
+void Assembler::mulxq(Register dst1, Register dst2, Register src) { // BMI2 multiply; call sites annotate it as "src * rdx -> dst1:dst2" (dst1 = high half, dst2 = low half, rdx is the implicit multiplier)
+ assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
+ int encode = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, true, false); // VEX-encoded; meaning of the two trailing bool flags is not visible in this patch -- TODO confirm against vex_prefix_and_encode
+ emit_int8((unsigned char)0xF6); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // final byte: 0xC0 combined with the register bits returned above
+}
+
void Assembler::negq(Register dst) {
int encode = prefixq_and_encode(dst->encoding());
emit_int8((unsigned char)0xF7);
@@ -5572,6 +5612,28 @@
emit_int8(imm8);
}
}
+
+void Assembler::rorq(Register dst, int imm8) { // rotate the 64-bit register dst right by imm8 bits; used in this patch with imm8 == 32 to swap the two 32-bit halves
+ assert(isShiftCount(imm8 >> 1), "illegal shift count"); // same halved-count check as sarq(Register, int) below; presumably lets counts up to 63 pass a 32-bit range check -- confirm
+ int encode = prefixq_and_encode(dst->encoding());
+ if (imm8 == 1) { // rotate-by-one has its own two-byte form with no immediate byte
+ emit_int8((unsigned char)0xD1);
+ emit_int8((unsigned char)(0xC8 | encode));
+ } else { // general form: opcode, register byte, then the count as an immediate
+ emit_int8((unsigned char)0xC1);
+ emit_int8((unsigned char)(0xc8 | encode));
+ emit_int8(imm8);
+ }
+}
+
+void Assembler::rorxq(Register dst, Register src, int imm8) { // BMI2 rotate right with separate source and destination; used in this patch with imm8 == 32 to swap the two 32-bit halves
+ assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_3A, true, false); // 0 fills the middle register slot (no second source); meaning of the trailing bool flags is not visible in this patch -- TODO confirm
+ emit_int8((unsigned char)0xF0); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 combined with the register bits returned above
+ emit_int8(imm8); // rotate count as a trailing immediate byte
+}
+
void Assembler::sarq(Register dst, int imm8) {
assert(isShiftCount(imm8 >> 1), "illegal shift count");
int encode = prefixq_and_encode(dst->encoding());
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Tue Sep 02 12:48:45 2014 -0700
@@ -888,6 +888,14 @@
void addq(Register dst, Address src);
void addq(Register dst, Register src);
+#ifdef _LP64
+ //Add Unsigned Integers with Carry Flag
+ void adcxq(Register dst, Register src);
+
+ //Add Unsigned Integers with Overflow Flag
+ void adoxq(Register dst, Register src);
+#endif
+
void addr_nop_4();
void addr_nop_5();
void addr_nop_7();
@@ -1204,19 +1212,20 @@
void idivl(Register src);
void divl(Register src); // Unsigned division
+#ifdef _LP64
void idivq(Register src);
+#endif
void imull(Register dst, Register src);
void imull(Register dst, Register src, int value);
void imull(Register dst, Address src);
+#ifdef _LP64
void imulq(Register dst, Register src);
void imulq(Register dst, Register src, int value);
-#ifdef _LP64
void imulq(Register dst, Address src);
#endif
-
// jcc is the generic conditional branch generator to run-
// time routines, jcc is used for branches to labels. jcc
// takes a branch opcode (cc) and a label (L) and generates
@@ -1408,9 +1417,16 @@
void movzwq(Register dst, Register src);
#endif
+ // Unsigned multiply with RAX destination register
void mull(Address src);
void mull(Register src);
+#ifdef _LP64
+ void mulq(Address src);
+ void mulq(Register src);
+ void mulxq(Register dst1, Register dst2, Register src);
+#endif
+
// Multiply Scalar Double-Precision Floating-Point Values
void mulsd(XMMRegister dst, Address src);
void mulsd(XMMRegister dst, XMMRegister src);
@@ -1541,6 +1557,11 @@
void ret(int imm16);
+#ifdef _LP64
+ void rorq(Register dst, int imm8);
+ void rorxq(Register dst, Register src, int imm8);
+#endif
+
void sahf();
void sarl(Register dst, int imm8);
--- a/hotspot/src/cpu/x86/vm/globals_x86.hpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/globals_x86.hpp Tue Sep 02 12:48:45 2014 -0700
@@ -176,6 +176,8 @@
"Use count trailing zeros instruction") \
\
product(bool, UseBMI1Instructions, false, \
- "Use BMI instructions")
-
+ "Use BMI1 instructions") \
+ \
+ product(bool, UseBMI2Instructions, false, \
+ "Use BMI2 instructions")
#endif // CPU_X86_VM_GLOBALS_X86_HPP
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Tue Sep 02 12:48:45 2014 -0700
@@ -7293,6 +7293,467 @@
bind(L_done);
}
+#ifdef _LP64
+/**
+ * Helper for multiply_to_len(): dest_hi:dest_lo += src1 + src2, folding each carry out of dest_lo into dest_hi.
+ */
+void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
+ addq(dest_lo, src1); // low half += first addend
+ adcq(dest_hi, 0); // propagate that carry into the high half
+ addq(dest_lo, src2); // low half += second addend
+ adcq(dest_hi, 0); // propagate the second carry
+}
+
+/**
+ * Multiply 64 bit by 64 bit first loop: z = y * (least-significant 64-bit chunk of x), walking y two ints at a time.
+ */
+void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+ Register y, Register y_idx, Register z,
+ Register carry, Register product,
+ Register idx, Register kdx) {
+ //
+ // jlong carry, x[], y[], z[];
+ // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+ // huge_128 product = y[idx] * x[xstart] + carry;
+ // z[kdx] = (jlong)product;
+ // carry = (jlong)(product >>> 64);
+ // }
+ // z[xstart] = carry;
+ //
+
+ Label L_first_loop, L_first_loop_exit;
+ Label L_one_x, L_one_y, L_multiply;
+
+ decrementl(xstart); // multiply_to_len passes xstart already decremented once; step back one more so one 64-bit load covers x[xstart], x[xstart+1]
+ jcc(Assembler::negative, L_one_x); // x has a single int: take the 32-bit path
+
+ movq(x_xstart, Address(x, xstart, Address::times_4, 0));
+ rorq(x_xstart, 32); // convert big-endian to little-endian
+
+ bind(L_first_loop);
+ decrementl(idx);
+ jcc(Assembler::negative, L_first_loop_exit); // no ints of y left
+ decrementl(idx);
+ jcc(Assembler::negative, L_one_y); // exactly one int of y left
+ movq(y_idx, Address(y, idx, Address::times_4, 0)); // 64-bit load of y[idx], y[idx+1]
+ rorq(y_idx, 32); // convert big-endian to little-endian
+ bind(L_multiply);
+ movq(product, x_xstart);
+ mulq(y_idx); // product(rax) * y_idx -> rdx:rax
+ addq(product, carry);
+ adcq(rdx, 0); // rdx:product = x_xstart * y_idx + carry
+ subl(kdx, 2); // each step writes two ints of z
+ movl(Address(z, kdx, Address::times_4, 4), product); // low 32 bits go to the higher index
+ shrq(product, 32);
+ movl(Address(z, kdx, Address::times_4, 0), product); // next 32 bits go to the lower index
+ movq(carry, rdx); // high 64 bits become the carry for the next step
+ jmp(L_first_loop);
+
+ bind(L_one_y);
+ movl(y_idx, Address(y, 0)); // 32-bit load of the last remaining int, y[0]
+ jmp(L_multiply);
+
+ bind(L_one_x);
+ movl(x_xstart, Address(x, 0)); // 32-bit load of the only int, x[0]
+ jmp(L_first_loop);
+
+ bind(L_first_loop_exit);
+}
+
+/**
+ * Multiply 64 bit by 64 bit and add 128 bit. Leaves the high 64 bits of the sum in rdx; callers copy it into their carry register.
+ */
+void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
+ Register yz_idx, Register idx,
+ Register carry, Register product, int offset) {
+ // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
+ // z[kdx] = (jlong)product;
+
+ movq(yz_idx, Address(y, idx, Address::times_4, offset)); // 64-bit load of two adjacent ints of y; offset is in bytes
+ rorq(yz_idx, 32); // convert big-endian to little-endian
+ movq(product, x_xstart);
+ mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
+ movq(yz_idx, Address(z, idx, Address::times_4, offset)); // yz_idx is reused for the matching two ints of z
+ rorq(yz_idx, 32); // convert big-endian to little-endian
+
+ add2_with_carry(rdx, product, carry, yz_idx); // rdx:product += carry + z value
+
+ movl(Address(z, idx, Address::times_4, offset+4), product); // low 32 bits go to the higher index
+ shrq(product, 32);
+ movl(Address(z, idx, Address::times_4, offset), product); // next 32 bits go to the lower index
+
+}
+
+/**
+ * Multiply 128 bit by 128 bit. Unrolled inner loop. Consumes four ints of y per iteration, then handles the 0..3 leftover ints.
+ */
+void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
+ Register yz_idx, Register idx, Register jdx,
+ Register carry, Register product,
+ Register carry2) {
+ // jlong carry, x[], y[], z[];
+ // int kdx = ystart+1;
+ // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+ // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
+ // z[kdx+idx+1] = (jlong)product;
+ // jlong carry2 = (jlong)(product >>> 64);
+ // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
+ // z[kdx+idx] = (jlong)product;
+ // carry = (jlong)(product >>> 64);
+ // }
+ // idx += 2;
+ // if (idx > 0) {
+ // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
+ // z[kdx+idx] = (jlong)product;
+ // carry = (jlong)(product >>> 64);
+ // }
+ //
+
+ Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+ movl(jdx, idx);
+ andl(jdx, 0xFFFFFFFC);
+ shrl(jdx, 2); // jdx = idx / 4 = number of full four-int iterations
+
+ bind(L_third_loop);
+ subl(jdx, 1);
+ jcc(Assembler::negative, L_third_loop_exit);
+ subl(idx, 4); // step back over the four ints handled by this iteration
+
+ multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); // ints idx+2, idx+3 (byte offset 8)
+ movq(carry2, rdx);
+
+ multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); // ints idx, idx+1, chained through carry2
+ movq(carry, rdx);
+ jmp(L_third_loop);
+
+ bind (L_third_loop_exit);
+
+ andl (idx, 0x3); // idx = number of leftover ints (0..3); the result of this andl drives the branch below
+ jcc(Assembler::zero, L_post_third_loop_done);
+
+ Label L_check_1;
+ subl(idx, 2);
+ jcc(Assembler::negative, L_check_1); // fewer than two ints left: skip the 64-bit step
+
+ multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0); // one more two-int step
+ movq(carry, rdx);
+
+ bind (L_check_1);
+ addl (idx, 0x2);
+ andl (idx, 0x1);
+ subl(idx, 1); // idx becomes 0 when an odd int remains, negative otherwise
+ jcc(Assembler::negative, L_post_third_loop_done);
+
+ movl(yz_idx, Address(y, idx, Address::times_4, 0)); // 32-bit load of the last int of y
+ movq(product, x_xstart);
+ mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
+ movl(yz_idx, Address(z, idx, Address::times_4, 0)); // 32-bit load of the matching int of z
+
+ add2_with_carry(rdx, product, yz_idx, carry);
+
+ movl(Address(z, idx, Address::times_4, 0), product); // only the low 32 bits are stored in this step
+ shrq(product, 32);
+
+ shlq(rdx, 32);
+ orq(product, rdx);
+ movq(carry, product); // carry = (rdx << 32) | (product >> 32)
+
+ bind(L_post_third_loop_done);
+}
+
+/**
+ * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
+ * The multiplier is taken implicitly from rdx (see the mulxq comments below); multiply_to_len loads it before calling.
+ */
+void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
+ Register carry, Register carry2,
+ Register idx, Register jdx,
+ Register yz_idx1, Register yz_idx2,
+ Register tmp, Register tmp3, Register tmp4) {
+ assert(UseBMI2Instructions, "should be used only when BMI2 is available");
+
+ // jlong carry, x[], y[], z[];
+ // int kdx = ystart+1;
+ // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+ // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
+ // jlong carry2 = (jlong)(tmp3 >>> 64);
+ // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
+ // carry = (jlong)(tmp4 >>> 64);
+ // z[kdx+idx+1] = (jlong)tmp3;
+ // z[kdx+idx] = (jlong)tmp4;
+ // }
+ // idx += 2;
+ // if (idx > 0) {
+ // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
+ // z[kdx+idx] = (jlong)yz_idx1;
+ // carry = (jlong)(yz_idx1 >>> 64);
+ // }
+ //
+
+ Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+ movl(jdx, idx);
+ andl(jdx, 0xFFFFFFFC);
+ shrl(jdx, 2); // jdx = idx / 4 = number of full four-int iterations
+
+ bind(L_third_loop);
+ subl(jdx, 1);
+ jcc(Assembler::negative, L_third_loop_exit);
+ subl(idx, 4); // step back over the four ints handled by this iteration
+
+ movq(yz_idx1, Address(y, idx, Address::times_4, 8)); // y ints idx+2, idx+3
+ rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
+ movq(yz_idx2, Address(y, idx, Address::times_4, 0)); // y ints idx, idx+1
+ rorxq(yz_idx2, yz_idx2, 32);
+
+ mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
+ mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
+
+ movq(yz_idx1, Address(z, idx, Address::times_4, 8)); // registers are reused for the matching z ints
+ rorxq(yz_idx1, yz_idx1, 32);
+ movq(yz_idx2, Address(z, idx, Address::times_4, 0));
+ rorxq(yz_idx2, yz_idx2, 32);
+
+ if (VM_Version::supports_adx()) { // ADX path: adcxq adds with the carry flag, adoxq with the overflow flag (per assembler_x86.hpp)
+ adcxq(tmp3, carry);
+ adoxq(tmp3, yz_idx1);
+
+ adcxq(tmp4, tmp);
+ adoxq(tmp4, yz_idx2);
+
+ movl(carry, 0); // does not affect flags
+ adcxq(carry2, carry); // adding zero folds the pending carry-flag bit into carry2
+ adoxq(carry2, carry); // adding zero folds the pending overflow-flag bit into carry2
+ } else { // fallback: same sums built from addq/adcq pairs
+ add2_with_carry(tmp4, tmp3, carry, yz_idx1);
+ add2_with_carry(carry2, tmp4, tmp, yz_idx2);
+ }
+ movq(carry, carry2);
+
+ movl(Address(z, idx, Address::times_4, 12), tmp3); // tmp3 low 32 bits -> z int idx+3
+ shrq(tmp3, 32);
+ movl(Address(z, idx, Address::times_4, 8), tmp3); // tmp3 high 32 bits -> z int idx+2
+
+ movl(Address(z, idx, Address::times_4, 4), tmp4); // tmp4 low 32 bits -> z int idx+1
+ shrq(tmp4, 32);
+ movl(Address(z, idx, Address::times_4, 0), tmp4); // tmp4 high 32 bits -> z int idx
+
+ jmp(L_third_loop);
+
+ bind (L_third_loop_exit);
+
+ andl (idx, 0x3); // idx = number of leftover ints (0..3); the result of this andl drives the branch below
+ jcc(Assembler::zero, L_post_third_loop_done);
+
+ Label L_check_1;
+ subl(idx, 2);
+ jcc(Assembler::negative, L_check_1); // fewer than two ints left: skip the 64-bit step
+
+ movq(yz_idx1, Address(y, idx, Address::times_4, 0));
+ rorxq(yz_idx1, yz_idx1, 32);
+ mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
+ movq(yz_idx2, Address(z, idx, Address::times_4, 0));
+ rorxq(yz_idx2, yz_idx2, 32);
+
+ add2_with_carry(tmp4, tmp3, carry, yz_idx2);
+
+ movl(Address(z, idx, Address::times_4, 4), tmp3);
+ shrq(tmp3, 32);
+ movl(Address(z, idx, Address::times_4, 0), tmp3);
+ movq(carry, tmp4); // high 64 bits become the carry
+
+ bind (L_check_1);
+ addl (idx, 0x2);
+ andl (idx, 0x1);
+ subl(idx, 1); // idx becomes 0 when an odd int remains, negative otherwise
+ jcc(Assembler::negative, L_post_third_loop_done);
+ movl(tmp4, Address(y, idx, Address::times_4, 0)); // 32-bit load of the last int of y
+ mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
+ movl(tmp4, Address(z, idx, Address::times_4, 0)); // 32-bit load of the matching int of z
+
+ add2_with_carry(carry2, tmp3, tmp4, carry);
+
+ movl(Address(z, idx, Address::times_4, 0), tmp3); // only the low 32 bits are stored in this step
+ shrq(tmp3, 32);
+
+ shlq(carry2, 32);
+ orq(tmp3, carry2);
+ movq(carry, tmp3); // carry = (carry2 << 32) | (tmp3 >> 32)
+
+ bind(L_post_third_loop_done);
+}
+
+/**
+ * Code for BigInteger::multiplyToLen() intrinsic.
+ *
+ * rdi: x
+ * rax: xlen
+ * rsi: y
+ * rcx: ylen
+ * r8: z
+ * r11: zlen
+ * r12: tmp1
+ * r13: tmp2
+ * r14: tmp3
+ * r15: tmp4
+ * rbx: tmp5
+ *
+ */
+void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
+ Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
+ ShortBranchVerifier sbv(this);
+ assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
+
+ push(tmp1);
+ push(tmp2);
+ push(tmp3);
+ push(tmp4);
+ push(tmp5);
+
+ push(xlen);
+ push(zlen);
+
+ const Register idx = tmp1;
+ const Register kdx = tmp2;
+ const Register xstart = tmp3;
+
+ const Register y_idx = tmp4;
+ const Register carry = tmp5;
+ const Register product = xlen;
+ const Register x_xstart = zlen; // reuse register
+
+ // First Loop.
+ //
+ // final static long LONG_MASK = 0xffffffffL;
+ // int xstart = xlen - 1;
+ // int ystart = ylen - 1;
+ // long carry = 0;
+ // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+ // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
+ // z[kdx] = (int)product;
+ // carry = product >>> 32;
+ // }
+ // z[xstart] = (int)carry;
+ //
+
+ movl(idx, ylen); // idx = ylen;
+ movl(kdx, zlen); // kdx = xlen+ylen;
+ xorq(carry, carry); // carry = 0;
+
+ Label L_done;
+
+ movl(xstart, xlen);
+ decrementl(xstart);
+ jcc(Assembler::negative, L_done);
+
+ multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
+
+ Label L_second_loop;
+ testl(kdx, kdx);
+ jcc(Assembler::zero, L_second_loop);
+
+ Label L_carry;
+ subl(kdx, 1);
+ jcc(Assembler::zero, L_carry);
+
+ movl(Address(z, kdx, Address::times_4, 0), carry);
+ shrq(carry, 32);
+ subl(kdx, 1);
+
+ bind(L_carry);
+ movl(Address(z, kdx, Address::times_4, 0), carry);
+
+ // Second and third (nested) loops.
+ //
+ // for (int i = xstart-1; i >= 0; i--) { // Second loop
+ // carry = 0;
+ // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
+ // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
+ // (z[k] & LONG_MASK) + carry;
+ // z[k] = (int)product;
+ // carry = product >>> 32;
+ // }
+ // z[i] = (int)carry;
+ // }
+ //
+ // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
+
+ const Register jdx = tmp1;
+
+ bind(L_second_loop);
+ xorl(carry, carry); // carry = 0;
+ movl(jdx, ylen); // j = ystart+1
+
+ subl(xstart, 1); // i = xstart-1;
+ jcc(Assembler::negative, L_done);
+
+ push (z);
+
+ Label L_last_x;
+ lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
+ subl(xstart, 1); // i = xstart-1;
+ jcc(Assembler::negative, L_last_x);
+
+ if (UseBMI2Instructions) {
+ movq(rdx, Address(x, xstart, Address::times_4, 0));
+ rorxq(rdx, rdx, 32); // convert big-endian to little-endian
+ } else {
+ movq(x_xstart, Address(x, xstart, Address::times_4, 0));
+ rorq(x_xstart, 32); // convert big-endian to little-endian
+ }
+
+ Label L_third_loop_prologue;
+ bind(L_third_loop_prologue);
+
+ push (x);
+ push (xstart);
+ push (ylen);
+
+
+ if (UseBMI2Instructions) {
+ multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
+ } else { // !UseBMI2Instructions
+ multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
+ }
+
+ pop(ylen);
+ pop(xlen);
+ pop(x);
+ pop(z);
+
+ movl(tmp3, xlen);
+ addl(tmp3, 1);
+ movl(Address(z, tmp3, Address::times_4, 0), carry);
+ subl(tmp3, 1);
+ jccb(Assembler::negative, L_done);
+
+ shrq(carry, 32);
+ movl(Address(z, tmp3, Address::times_4, 0), carry);
+ jmp(L_second_loop);
+
+ // Next infrequent code is moved outside loops.
+ bind(L_last_x);
+ if (UseBMI2Instructions) {
+ movl(rdx, Address(x, 0));
+ } else {
+ movl(x_xstart, Address(x, 0));
+ }
+ jmp(L_third_loop_prologue);
+
+ bind(L_done);
+
+ pop(zlen);
+ pop(xlen);
+
+ pop(tmp5);
+ pop(tmp4);
+ pop(tmp3);
+ pop(tmp2);
+ pop(tmp1);
+}
+#endif
+
/**
* Emits code to update CRC-32 with a byte value according to constants in table
*
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Tue Sep 02 12:48:45 2014 -0700
@@ -1221,6 +1221,28 @@
XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
XMMRegister tmp4, Register tmp5, Register result);
+#ifdef _LP64
+ void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
+ void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+ Register y, Register y_idx, Register z,
+ Register carry, Register product,
+ Register idx, Register kdx);
+ void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
+ Register yz_idx, Register idx,
+ Register carry, Register product, int offset);
+ void multiply_128_x_128_bmi2_loop(Register y, Register z,
+ Register carry, Register carry2,
+ Register idx, Register jdx,
+ Register yz_idx1, Register yz_idx2,
+ Register tmp, Register tmp3, Register tmp4);
+ void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
+ Register yz_idx, Register idx, Register jdx,
+ Register carry, Register product,
+ Register carry2);
+ void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
+ Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
+#endif
+
// CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
void update_byte_crc32(Register crc, Register val, Register table);
void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Tue Sep 02 12:48:45 2014 -0700
@@ -3677,6 +3677,70 @@
return start;
}
+
+ /**
+ * Arguments:
+ *
+ * Input:
+ * c_rarg0 - x address
+ * c_rarg1 - x length
+ * c_rarg2 - y address
+ * c_rarg3 - y length
+ * not Win64
+ * c_rarg4 - z address
+ * c_rarg5 - z length
+ * Win64
+ * rsp+40 - z address
+ * rsp+48 - z length
+ */
+ address generate_multiplyToLen() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
+
+ address start = __ pc();
+ // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+ // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
+ const Register x = rdi;
+ const Register xlen = rax; // multiply_to_len also uses this register as "product", which its mulq comments mark as rax
+ const Register y = rsi;
+ const Register ylen = rcx;
+ const Register z = r8;
+ const Register zlen = r11;
+
+ // Next registers will be saved on stack in multiply_to_len().
+ const Register tmp1 = r12;
+ const Register tmp2 = r13;
+ const Register tmp3 = r14;
+ const Register tmp4 = r15;
+ const Register tmp5 = rbx;
+
+ BLOCK_COMMENT("Entry:");
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifndef _WIN64
+ __ movptr(zlen, r9); // Save r9 in r11 - zlen
+#endif
+ setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
+ // ylen => rcx, z => r8, zlen => r11
+ // r9 and r10 may be used to save non-volatile registers
+#ifdef _WIN64
+ // last 2 arguments (#4, #5) are on stack on Win64
+ __ movptr(z, Address(rsp, 6 * wordSize)); // NOTE(review): one word beyond the rsp+40 in the header comment, presumably because enter() pushed a frame pointer -- confirm
+ __ movptr(zlen, Address(rsp, 7 * wordSize));
+#endif
+
+ __ movptr(xlen, rsi); // copy xlen out of rsi first: the next instruction overwrites rsi with y
+ __ movptr(y, rdx); // y => rsi; rdx is no longer needed for an argument after this
+ __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
+
+ restore_arg_regs();
+
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ return start;
+ }
+
#undef __
#define __ masm->
@@ -3917,6 +3981,11 @@
generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
&StubRoutines::_safefetchN_fault_pc,
&StubRoutines::_safefetchN_continuation_pc);
+#ifdef COMPILER2
+ if (UseMultiplyToLenIntrinsic) {
+ StubRoutines::_multiplyToLen = generate_multiplyToLen();
+ }
+#endif
}
public:
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Tue Sep 02 12:48:45 2014 -0700
@@ -485,7 +485,7 @@
}
char buf[256];
- jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+ jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""),
@@ -514,7 +514,8 @@
(supports_tscinv_bit() ? ", tscinvbit": ""),
(supports_tscinv() ? ", tscinv": ""),
(supports_bmi1() ? ", bmi1" : ""),
- (supports_bmi2() ? ", bmi2" : ""));
+ (supports_bmi2() ? ", bmi2" : ""),
+ (supports_adx() ? ", adx" : ""));
_features_str = os::strdup(buf);
// UseSSE is set to the smaller of what hardware supports and what
@@ -566,7 +567,7 @@
}
} else if (UseCRC32Intrinsics) {
if (!FLAG_IS_DEFAULT(UseCRC32Intrinsics))
- warning("CRC32 Intrinsics requires AVX and CLMUL instructions (not available on this CPU)");
+ warning("CRC32 Intrinsics requires CLMUL instructions (not available on this CPU)");
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
}
@@ -689,7 +690,20 @@
}
#endif
}
+
+#ifdef _LP64
+ if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
+ UseMultiplyToLenIntrinsic = true;
+ }
+#else
+ if (UseMultiplyToLenIntrinsic) {
+ if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
+ warning("multiplyToLen intrinsic is not available in 32-bit VM");
+ }
+ FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
+ }
#endif
+#endif // COMPILER2
// On new cpus instructions which update whole XMM register should be used
// to prevent partial register stall due to dependencies on high half.
@@ -832,6 +846,9 @@
}
}
}
+ if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
+ AllocatePrefetchInstr = 3;
+ }
}
// Use count leading zeros count instruction if available.
@@ -844,23 +861,35 @@
FLAG_SET_DEFAULT(UseCountLeadingZerosInstruction, false);
}
+ // Use count trailing zeros instruction if available
if (supports_bmi1()) {
+ // tzcnt does not require VEX prefix
+ if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) {
+ UseCountTrailingZerosInstruction = true;
+ }
+ } else if (UseCountTrailingZerosInstruction) {
+ warning("tzcnt instruction is not available on this CPU");
+ FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false);
+ }
+
+ // BMI instructions use an encoding with VEX prefix.
+ // VEX prefix is generated only when AVX > 0.
+ if (supports_bmi1() && supports_avx()) {
if (FLAG_IS_DEFAULT(UseBMI1Instructions)) {
UseBMI1Instructions = true;
}
} else if (UseBMI1Instructions) {
- warning("BMI1 instructions are not available on this CPU");
+ warning("BMI1 instructions are not available on this CPU (AVX is also required)");
FLAG_SET_DEFAULT(UseBMI1Instructions, false);
}
- // Use count trailing zeros instruction if available
- if (supports_bmi1()) {
- if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) {
- UseCountTrailingZerosInstruction = UseBMI1Instructions;
+ if (supports_bmi2() && supports_avx()) {
+ if (FLAG_IS_DEFAULT(UseBMI2Instructions)) {
+ UseBMI2Instructions = true;
}
- } else if (UseCountTrailingZerosInstruction) {
- warning("tzcnt instruction is not available on this CPU");
- FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false);
+ } else if (UseBMI2Instructions) {
+ warning("BMI2 instructions are not available on this CPU (AVX is also required)");
+ FLAG_SET_DEFAULT(UseBMI2Instructions, false);
}
// Use population count instruction if available.
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Tue Sep 02 12:48:45 2014 -0700
@@ -209,7 +209,9 @@
erms : 1,
: 1,
rtm : 1,
- : 20;
+ : 7,
+ adx : 1,
+ : 12;
} bits;
};
@@ -260,7 +262,8 @@
CPU_CLMUL = (1 << 21), // carryless multiply for CRC
CPU_BMI1 = (1 << 22),
CPU_BMI2 = (1 << 23),
- CPU_RTM = (1 << 24) // Restricted Transactional Memory instructions
+ CPU_RTM = (1 << 24), // Restricted Transactional Memory instructions
+ CPU_ADX = (1 << 25)
} cpuFeatureFlags;
enum {
@@ -465,10 +468,16 @@
}
// Intel features.
if(is_intel()) {
+ if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
+ result |= CPU_ADX;
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
result |= CPU_BMI2;
if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
result |= CPU_LZCNT;
+ // for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
+ if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
+ result |= CPU_3DNOW_PREFETCH;
+ }
}
return result;
@@ -625,6 +634,7 @@
static bool supports_rtm() { return (_cpuFeatures & CPU_RTM) != 0; }
static bool supports_bmi1() { return (_cpuFeatures & CPU_BMI1) != 0; }
static bool supports_bmi2() { return (_cpuFeatures & CPU_BMI2) != 0; }
+ static bool supports_adx() { return (_cpuFeatures & CPU_ADX) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/hotspot/src/share/vm/asm/register.hpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/share/vm/asm/register.hpp Tue Sep 02 12:48:45 2014 -0700
@@ -275,4 +275,101 @@
);
}
+inline void assert_different_registers(
+ AbstractRegister a,
+ AbstractRegister b,
+ AbstractRegister c,
+ AbstractRegister d,
+ AbstractRegister e,
+ AbstractRegister f,
+ AbstractRegister g,
+ AbstractRegister h,
+ AbstractRegister i,
+ AbstractRegister j
+) {
+ assert(
+ a != b && a != c && a != d && a != e && a != f && a != g && a != h && a != i && a != j
+ && b != c && b != d && b != e && b != f && b != g && b != h && b != i && b != j
+ && c != d && c != e && c != f && c != g && c != h && c != i && c != j
+ && d != e && d != f && d != g && d != h && d != i && d != j
+ && e != f && e != g && e != h && e != i && e != j
+ && f != g && f != h && f != i && f != j
+ && g != h && g != i && g != j
+ && h != i && h != j
+ && i != j,
+ err_msg_res("registers must be different: a=" INTPTR_FORMAT ", b=" INTPTR_FORMAT
+ ", c=" INTPTR_FORMAT ", d=" INTPTR_FORMAT ", e=" INTPTR_FORMAT
+ ", f=" INTPTR_FORMAT ", g=" INTPTR_FORMAT ", h=" INTPTR_FORMAT
+ ", i=" INTPTR_FORMAT ", j=" INTPTR_FORMAT "",
+ p2i(a), p2i(b), p2i(c), p2i(d), p2i(e), p2i(f), p2i(g), p2i(h), p2i(i), p2i(j))
+ );
+}
+
+inline void assert_different_registers(
+ AbstractRegister a,
+ AbstractRegister b,
+ AbstractRegister c,
+ AbstractRegister d,
+ AbstractRegister e,
+ AbstractRegister f,
+ AbstractRegister g,
+ AbstractRegister h,
+ AbstractRegister i,
+ AbstractRegister j,
+ AbstractRegister k
+) {
+ assert(
+ a != b && a != c && a != d && a != e && a != f && a != g && a != h && a != i && a != j && a !=k
+ && b != c && b != d && b != e && b != f && b != g && b != h && b != i && b != j && b !=k
+ && c != d && c != e && c != f && c != g && c != h && c != i && c != j && c !=k
+ && d != e && d != f && d != g && d != h && d != i && d != j && d !=k
+ && e != f && e != g && e != h && e != i && e != j && e !=k
+ && f != g && f != h && f != i && f != j && f !=k
+ && g != h && g != i && g != j && g !=k
+ && h != i && h != j && h !=k
+ && i != j && i !=k
+ && j !=k,
+ err_msg_res("registers must be different: a=" INTPTR_FORMAT ", b=" INTPTR_FORMAT
+ ", c=" INTPTR_FORMAT ", d=" INTPTR_FORMAT ", e=" INTPTR_FORMAT
+ ", f=" INTPTR_FORMAT ", g=" INTPTR_FORMAT ", h=" INTPTR_FORMAT
+ ", i=" INTPTR_FORMAT ", j=" INTPTR_FORMAT ", k=" INTPTR_FORMAT "",
+ p2i(a), p2i(b), p2i(c), p2i(d), p2i(e), p2i(f), p2i(g), p2i(h), p2i(i), p2i(j), p2i(k))
+ );
+}
+
+inline void assert_different_registers(
+ AbstractRegister a,
+ AbstractRegister b,
+ AbstractRegister c,
+ AbstractRegister d,
+ AbstractRegister e,
+ AbstractRegister f,
+ AbstractRegister g,
+ AbstractRegister h,
+ AbstractRegister i,
+ AbstractRegister j,
+ AbstractRegister k,
+ AbstractRegister l
+) {
+ assert(
+ a != b && a != c && a != d && a != e && a != f && a != g && a != h && a != i && a != j && a !=k && a !=l
+ && b != c && b != d && b != e && b != f && b != g && b != h && b != i && b != j && b !=k && b !=l
+ && c != d && c != e && c != f && c != g && c != h && c != i && c != j && c !=k && c !=l
+ && d != e && d != f && d != g && d != h && d != i && d != j && d !=k && d !=l
+ && e != f && e != g && e != h && e != i && e != j && e !=k && e !=l
+ && f != g && f != h && f != i && f != j && f !=k && f !=l
+ && g != h && g != i && g != j && g !=k && g !=l
+ && h != i && h != j && h !=k && h !=l
+ && i != j && i !=k && i !=l
+ && j !=k && j !=l
+ && k !=l,
+ err_msg_res("registers must be different: a=" INTPTR_FORMAT ", b=" INTPTR_FORMAT
+ ", c=" INTPTR_FORMAT ", d=" INTPTR_FORMAT ", e=" INTPTR_FORMAT
+ ", f=" INTPTR_FORMAT ", g=" INTPTR_FORMAT ", h=" INTPTR_FORMAT
+ ", i=" INTPTR_FORMAT ", j=" INTPTR_FORMAT ", k=" INTPTR_FORMAT
+ ", l=" INTPTR_FORMAT "",
+ p2i(a), p2i(b), p2i(c), p2i(d), p2i(e), p2i(f), p2i(g), p2i(h), p2i(i), p2i(j), p2i(k), p2i(l))
+ );
+}
+
#endif // SHARE_VM_ASM_REGISTER_HPP
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp Tue Sep 02 12:48:45 2014 -0700
@@ -788,6 +788,11 @@
do_name( encodeISOArray_name, "encodeISOArray") \
do_signature(encodeISOArray_signature, "([CI[BII)I") \
\
+ do_class(java_math_BigInteger, "java/math/BigInteger") \
+ do_intrinsic(_multiplyToLen, java_math_BigInteger, multiplyToLen_name, multiplyToLen_signature, F_R) \
+ do_name( multiplyToLen_name, "multiplyToLen") \
+ do_signature(multiplyToLen_signature, "([II[II[I)[I") \
+ \
/* java/lang/ref/Reference */ \
do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \
\
--- a/hotspot/src/share/vm/opto/c2_globals.hpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp Tue Sep 02 12:48:45 2014 -0700
@@ -650,6 +650,9 @@
product(bool, UseMathExactIntrinsics, true, \
"Enables intrinsification of various java.lang.Math functions") \
\
+ product(bool, UseMultiplyToLenIntrinsic, false, \
+ "Enables intrinsification of BigInteger.multiplyToLen()") \
+ \
product(bool, UseTypeSpeculation, true, \
"Speculatively propagate types from profiles") \
\
--- a/hotspot/src/share/vm/opto/escape.cpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/share/vm/opto/escape.cpp Tue Sep 02 12:48:45 2014 -0700
@@ -945,7 +945,8 @@
strcmp(call->as_CallLeaf()->_name, "sha256_implCompress") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 ||
- strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0)
+ strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 ||
+ strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0)
))) {
call->dump();
fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
--- a/hotspot/src/share/vm/opto/library_call.cpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/share/vm/opto/library_call.cpp Tue Sep 02 12:48:45 2014 -0700
@@ -285,6 +285,7 @@
bool inline_updateCRC32();
bool inline_updateBytesCRC32();
bool inline_updateByteBufferCRC32();
+ bool inline_multiplyToLen();
};
@@ -293,8 +294,12 @@
vmIntrinsics::ID id = m->intrinsic_id();
assert(id != vmIntrinsics::_none, "must be a VM intrinsic");
- if (DisableIntrinsic[0] != '\0'
- && strstr(DisableIntrinsic, vmIntrinsics::name_at(id)) != NULL) {
+ ccstr disable_intr = NULL;
+
+ if ((DisableIntrinsic[0] != '\0'
+ && strstr(DisableIntrinsic, vmIntrinsics::name_at(id)) != NULL) ||
+ (method_has_option_value("DisableIntrinsic", disable_intr)
+ && strstr(disable_intr, vmIntrinsics::name_at(id)) != NULL)) {
// disabled by a user request on the command line:
// example: -XX:DisableIntrinsic=_hashCode,_getClass
return NULL;
@@ -477,6 +482,10 @@
if (!UseAESIntrinsics) return NULL;
break;
+ case vmIntrinsics::_multiplyToLen:
+ if (!UseMultiplyToLenIntrinsic) return NULL;
+ break;
+
case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
if (!UseAESIntrinsics) return NULL;
@@ -876,6 +885,9 @@
case vmIntrinsics::_digestBase_implCompressMB:
return inline_digestBase_implCompressMB(predicate);
+ case vmIntrinsics::_multiplyToLen:
+ return inline_multiplyToLen();
+
case vmIntrinsics::_encodeISOArray:
return inline_encodeISOArray();
@@ -4924,6 +4936,106 @@
return true;
}
+//-------------inline_multiplyToLen-----------------------------------
+// Intrinsic for BigInteger.multiplyToLen(): emits a leaf call to the multiplyToLen stub.
+// Returns false if the stub is missing or x/y are not known to be int arrays, else true.
+bool LibraryCallKit::inline_multiplyToLen() {
+  assert(UseMultiplyToLenIntrinsic, "not implemented on this platform");
+  address stubAddr = StubRoutines::multiplyToLen();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+  const char* stubName = "multiplyToLen";
+  assert(callee()->signature()->size() == 5, "multiplyToLen has 5 parameters");
+
+  Node* x    = argument(1);  // argument(0) is skipped; presumably the receiver
+  Node* xlen = argument(2);
+  Node* y    = argument(3);
+  Node* ylen = argument(4);
+  Node* z    = argument(5);
+
+  const Type* x_type = x->Value(&_gvn);
+  const Type* y_type = y->Value(&_gvn);
+  const TypeAryPtr* top_x = x_type->isa_aryptr();
+  const TypeAryPtr* top_y = y_type->isa_aryptr();
+  if (top_x == NULL || top_x->klass() == NULL ||
+      top_y == NULL || top_y->klass() == NULL) {
+    // failed array check
+    return false;
+  }
+
+  BasicType x_elem = top_x->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType y_elem = top_y->klass()->as_array_klass()->element_type()->basic_type();
+  if (x_elem != T_INT || y_elem != T_INT) {
+    return false;
+  }
+
+  // Set the original stack and the reexecute bit for the interpreter to reexecute
+  // the bytecode that invokes BigInteger.multiplyToLen() if deoptimization happens
+  // on the return from z array allocation in runtime.
+  { PreserveReexecuteState preexecs(this);
+    jvms()->set_should_reexecute(true);
+
+    Node* x_start = array_element_address(x, intcon(0), x_elem);
+    Node* y_start = array_element_address(y, intcon(0), y_elem);
+    // 'x_start' points to element 0 of the x array
+    // 'y_start' points to element 0 of the y array
+
+    // Length and klass of the result array: int[xlen + ylen]
+    Node* zlen = _gvn.transform(new AddINode(xlen, ylen));
+    Node* klass_node = makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_INT)));
+
+    IdealKit ideal(this);
+    // z_alloc ends up as z itself, or as a new array when z is null or shorter than zlen.
+#define __ ideal.
+    Node* one = __ ConI(1);
+    Node* zero = __ ConI(0);
+    IdealVariable need_alloc(ideal), z_alloc(ideal); __ declarations_done();
+    __ set(need_alloc, zero);
+    __ set(z_alloc, z);
+    __ if_then(z, BoolTest::eq, null()); {
+      __ increment (need_alloc, one);
+    } __ else_(); {
+      // Update graphKit memory and control from IdealKit.
+      sync_kit(ideal);
+      Node* zlen_arg = load_array_length(z);
+      // Update IdealKit memory and control from graphKit.
+      __ sync_kit(this);
+      __ if_then(zlen_arg, BoolTest::lt, zlen); {
+        __ increment (need_alloc, one);
+      } __ end_if();
+    } __ end_if();
+
+    __ if_then(__ value(need_alloc), BoolTest::ne, zero); {
+      // Update graphKit memory and control from IdealKit.
+      sync_kit(ideal);
+      Node * narr = new_array(klass_node, zlen, 1);
+      // Update IdealKit memory and control from graphKit.
+      __ sync_kit(this);
+      __ set(z_alloc, narr);
+    } __ end_if();
+
+    sync_kit(ideal);
+    z = __ value(z_alloc);
+    _gvn.set_type(z, TypeAryPtr::INTS);
+    // Final sync IdealKit and GraphKit.
+    final_sync(ideal);
+#undef __
+
+    Node* z_start = array_element_address(z, intcon(0), T_INT);
+
+    make_runtime_call(RC_LEAF|RC_NO_FP,
+                      OptoRuntime::multiplyToLen_Type(),
+                      stubAddr, stubName, TypePtr::BOTTOM,
+                      x_start, xlen, y_start, ylen, z_start, zlen);
+  } // original reexecute is set back here
+
+  C->set_has_split_ifs(true); // Has chance for split-if optimization
+  set_result(z);
+  return true;
+}
+
+
/**
* Calculate CRC32 for byte.
* int java.util.zip.CRC32.update(int crc, int b)
--- a/hotspot/src/share/vm/opto/runtime.cpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/share/vm/opto/runtime.cpp Tue Sep 02 12:48:45 2014 -0700
@@ -922,6 +922,30 @@
return TypeFunc::make(domain, range);
}
+const TypeFunc* OptoRuntime::multiplyToLen_Type() {
+  // Domain: a (pointer, int length) pair for each of x, y and z.
+  const int arg_count = 6;
+  const Type** domain_fields = TypeTuple::fields(arg_count);
+  int slot = TypeFunc::Parms;
+  domain_fields[slot++] = TypePtr::NOTNULL;    // x
+  domain_fields[slot++] = TypeInt::INT;        // xlen
+  domain_fields[slot++] = TypePtr::NOTNULL;    // y
+  domain_fields[slot++] = TypeInt::INT;        // ylen
+  domain_fields[slot++] = TypePtr::NOTNULL;    // z
+  domain_fields[slot++] = TypeInt::INT;        // zlen
+  assert(slot == TypeFunc::Parms + arg_count, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + arg_count, domain_fields);
+
+  // Range: no result type is needed, so the single range slot is set
+  // to NULL and the tuple is made with no entries past Parms.
+  const Type** range_fields = TypeTuple::fields(1);
+  range_fields[TypeFunc::Parms + 0] = NULL;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, range_fields);
+  return TypeFunc::make(domain, range);
+}
+
+
+
//------------- Interpreter state access for on stack replacement
const TypeFunc* OptoRuntime::osr_end_Type() {
// create input type (domain)
--- a/hotspot/src/share/vm/opto/runtime.hpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/share/vm/opto/runtime.hpp Tue Sep 02 12:48:45 2014 -0700
@@ -310,6 +310,8 @@
static const TypeFunc* sha_implCompress_Type();
static const TypeFunc* digestBase_implCompressMB_Type();
+ static const TypeFunc* multiplyToLen_Type();
+
static const TypeFunc* updateBytesCRC32_Type();
// leaf on stack replacement interpreter accessor types
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp Tue Sep 02 12:48:45 2014 -0700
@@ -135,6 +135,8 @@
address StubRoutines::_updateBytesCRC32 = NULL;
address StubRoutines::_crc_table_adr = NULL;
+address StubRoutines::_multiplyToLen = NULL;
+
double (* StubRoutines::_intrinsic_log )(double) = NULL;
double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
double (* StubRoutines::_intrinsic_exp )(double) = NULL;
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp Tue Sep 02 12:48:45 2014 -0700
@@ -202,6 +202,8 @@
static address _updateBytesCRC32;
static address _crc_table_adr;
+ static address _multiplyToLen;
+
// These are versions of the java.lang.Math methods which perform
// the same operations as the intrinsic version. They are used for
// constant folding in the compiler to ensure equivalence. If the
@@ -358,6 +360,8 @@
static address updateBytesCRC32() { return _updateBytesCRC32; }
static address crc_table_addr() { return _crc_table_adr; }
+ static address multiplyToLen() {return _multiplyToLen; }
+
static address select_fill_function(BasicType t, bool aligned, const char* &name);
static address zero_aligned_words() { return _zero_aligned_words; }
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp Tue Sep 02 10:26:48 2014 -0700
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp Tue Sep 02 12:48:45 2014 -0700
@@ -811,6 +811,7 @@
static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
+ static_field(StubRoutines, _multiplyToLen, address) \
\
/*****************/ \
/* SharedRuntime */ \
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java Tue Sep 02 12:48:45 2014 -0700
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8055494
+ * @summary Add C2 x86 intrinsic for BigInteger::multiplyToLen() method
+ *
+ * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
+ * -XX:CompileCommand=exclude,TestMultiplyToLen::main
+ * -XX:CompileCommand=option,TestMultiplyToLen::base_multiply,ccstr,DisableIntrinsic,_multiplyToLen
+ * -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_multiplyToLen
+ * -XX:CompileCommand=inline,java.math.BigInteger::multiply TestMultiplyToLen
+ */
+
+import java.util.Random;
+import java.math.*;
+
+public class TestMultiplyToLen {
+
+    // Reference path: the @run CompileCommand options disable the _multiplyToLen
+    // intrinsic for this method.
+    public static BigInteger base_multiply(BigInteger op1, BigInteger op2) {
+        return op1.multiply(op2);
+    }
+
+    // Generate multiplyToLen() intrinsic by inlining multiply().
+    public static BigInteger new_multiply(BigInteger op1, BigInteger op2) {
+        return op1.multiply(op2);
+    }
+
+    // Returns true when both values have identical toByteArray() encodings.
+    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
+        return java.util.Arrays.equals(b1.toByteArray(), b2.toByteArray());
+    }
+
+    // Renders the toByteArray() encoding as two-digit hex bytes, each followed by a space.
+    public static String stringify(BigInteger b) {
+        StringBuilder strout = new StringBuilder();
+        for (byte octet : b.toByteArray()) {
+            strout.append(String.format("%02x", octet)).append(' ');
+        }
+        return strout.toString();
+    }
+
+    // Multiplies one million random operand pairs on both paths and fails on any difference.
+    // Seeds default to System.nanoTime(); set -Dtest.seed / -Dtest.seed1 to replay a run.
+    public static void main(String args[]) throws Exception {
+
+        BigInteger oldsum = BigInteger.ZERO;
+        BigInteger newsum = BigInteger.ZERO;
+
+        BigInteger b1, b2, oldres, newres;
+
+        // Operand bits come from 'rand', operand bit lengths from 'rand1'.
+        long seed = Long.getLong("test.seed", System.nanoTime());
+        long seed1 = Long.getLong("test.seed1", System.nanoTime());
+        Random rand = new Random(seed);
+        Random rand1 = new Random(seed1);
+        System.out.println("seed=" + seed + " seed1=" + seed1);
+
+        for (int j = 0; j < 1000000; j++) {
+            // Requested operand sizes range from 32 to 3167 bits.
+            int rand_int = rand1.nextInt(3136)+32;
+            int rand_int1 = rand1.nextInt(3136)+32;
+            b1 = new BigInteger(rand_int, rand);
+            b2 = new BigInteger(rand_int1, rand);
+
+            // Same operands through the reference path and the intrinsic candidate.
+            oldres = base_multiply(b1,b2);
+            newres = new_multiply(b1,b2);
+
+            oldsum = oldsum.add(oldres);
+            newsum = newsum.add(newres);
+
+            if (!bytecompare(oldres,newres)) {
+                // Dump the operands and both results before failing.
+                System.out.print("mismatch for:b1:" + stringify(b1) + " :b2:" + stringify(b2) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
+                System.out.println(b1);
+                System.out.println(b2);
+                throw new Exception("Failed");
+            }
+        }
+        // Cross-check the accumulated sums of both paths as well.
+        if (!bytecompare(oldsum,newsum)) {
+            System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
+            throw new Exception("Failed");
+        } else {
+            System.out.println("Success");
+        }
+    }
+}