8081778: Use Intel x64 CPU instructions for RSA acceleration
Summary: Add intrinsics for BigInteger squareToLen and mulAdd methods.
Reviewed-by: kvn, jrose
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Wed Jun 03 15:02:10 2015 -0700
@@ -2813,6 +2813,13 @@
emit_arith(0x0B, 0xC0, dst, src);
}
+void Assembler::orl(Address dst, Register src) { // emit a 32-bit OR of register src into the memory operand dst
+ InstructionMark im(this);
+ prefix(dst, src); // prefix handling for this memory/register operand pair
+ emit_int8(0x09); // opcode 0x09: OR r/m32, r32 (the r/m operand is the destination)
+ emit_operand(src, dst); // encode src as the register operand and dst as the memory operand
+}
+
void Assembler::packuswb(XMMRegister dst, Address src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
@@ -6907,6 +6914,19 @@
}
}
+void Assembler::rcrq(Register dst, int imm8) { // emit RCR (rotate right through carry) of 64-bit register dst by imm8
+ assert(isShiftCount(imm8 >> 1), "illegal shift count"); // same check as rorq(); the >> 1 presumably admits 64-bit counts up to 63 -- TODO confirm
+ int encode = prefixq_and_encode(dst->encoding());
+ if (imm8 == 1) {
+ emit_int8((unsigned char)0xD1); // opcode D1 /3: rotate by one, no immediate byte follows
+ emit_int8((unsigned char)(0xD8 | encode)); // 0xD8 = register-direct ModRM with /3 (RCR) in the reg field
+ } else {
+ emit_int8((unsigned char)0xC1); // opcode C1 /3 ib: rotate by an 8-bit immediate
+ emit_int8((unsigned char)(0xD8 | encode)); // same ModRM byte as the rotate-by-one form
+ emit_int8(imm8); // the rotate count itself
+ }
+}
+
void Assembler::rorq(Register dst, int imm8) {
assert(isShiftCount(imm8 >> 1), "illegal shift count");
int encode = prefixq_and_encode(dst->encoding());
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Wed Jun 03 15:02:10 2015 -0700
@@ -1594,6 +1594,7 @@
void orl(Register dst, int32_t imm32);
void orl(Register dst, Address src);
void orl(Register dst, Register src);
+ void orl(Address dst, Register src);
void orq(Address dst, int32_t imm32);
void orq(Register dst, int32_t imm32);
@@ -1694,6 +1695,8 @@
void rclq(Register dst, int imm8);
+ void rcrq(Register dst, int imm8);
+
void rdtsc();
void ret(int imm16);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp Wed Jun 03 15:02:10 2015 -0700
@@ -7750,6 +7750,503 @@
pop(tmp2);
pop(tmp1);
}
+
+//Helper functions for square_to_len()
+
+/**
+ * Store the squares of x[], right shifted one bit (divided by 2) into z[]
+ * Preserves x, xlen and z and modifies rest of the registers (tmp3 is accepted but never used here).
+ */
+
+void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+ // Perform square and right shift by 1
+ // Handle odd xlen case first, then for even xlen do the following
+ // jlong carry = 0;
+ // for (int j=0, i=0; j < xlen; j+=2, i+=4) {
+ // huge_128 product = x[j:j+1] * x[j:j+1];
+ // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65);
+ // z[i+2:i+3] = (jlong)(product >>> 1);
+ // carry = (jlong)product & 1; // only the bit shifted out is carried to the next element
+ // }
+
+ xorq(tmp5, tmp5); // carry
+ xorq(rdxReg, rdxReg);
+ xorl(tmp1, tmp1); // index for x
+ xorl(tmp4, tmp4); // index for z
+
+ Label L_first_loop, L_first_loop_exit;
+
+ testl(xlen, 1);
+ jccb(Assembler::zero, L_first_loop); //jump if xlen is even
+
+ // Square and right shift by 1 the odd element using 32 bit multiply
+ movl(raxReg, Address(x, tmp1, Address::times_4, 0)); // load the single leading 32-bit word x[0]
+ imulq(raxReg, raxReg); // square it; the result fits in 64 bits
+ shrq(raxReg, 1); // halve; the bit shifted out is left in the carry flag
+ adcq(tmp5, 0); // record that bit in tmp5 (carry)
+ movq(Address(z, tmp4, Address::times_4, 0), raxReg); // z[0:1] = square >>> 1
+ incrementl(tmp1); // x index advances by one int
+ addl(tmp4, 2); // z index advances by two ints
+
+ // Square and right shift by 1 the rest using 64 bit multiply
+ bind(L_first_loop);
+ cmpptr(tmp1, xlen); // all of x consumed?
+ jccb(Assembler::equal, L_first_loop_exit);
+
+ // Square
+ movq(raxReg, Address(x, tmp1, Address::times_4, 0));
+ rorq(raxReg, 32); // convert big-endian to little-endian
+ mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax
+
+ // Right shift by 1 and save carry
+ shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1
+ rcrq(rdxReg, 1); // previous carry enters the top of rdx; rdx's low bit moves to the carry flag
+ rcrq(raxReg, 1); // that bit enters the top of rax; rax's low bit moves to the carry flag
+ adcq(tmp5, 0); // tmp5 was shifted down to 0 above, so it now holds just the bit rotated out of rax
+
+ // Store result in z
+ movq(Address(z, tmp4, Address::times_4, 0), rdxReg);
+ movq(Address(z, tmp4, Address::times_4, 8), raxReg);
+
+ // Update indices for x and z
+ addl(tmp1, 2);
+ addl(tmp4, 4);
+ jmp(L_first_loop);
+
+ bind(L_first_loop_exit);
+}
+
+
+/**
+ * Perform the following multiply add operation using BMI2 instructions
+ * carry:sum = sum + op1*op2 + carry
+ * op2 should be in rdx (mulx reads rdx implicitly; op2 is never named in the code emitted below)
+ * op2 is preserved, all other registers are modified (tmp2 is scratch for the high half of the product)
+ */
+void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) {
+ // op2 is required to be rdx, but nothing below actually asserts it
+ mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1
+ addq(sum, carry); // sum += incoming carry
+ adcq(tmp2, 0); // fold the carry-out into the high half
+ addq(sum, op1); // sum += low half of the product
+ adcq(tmp2, 0); // fold the carry-out into the high half again
+ movq(carry, tmp2); // outgoing carry = accumulated high half
+}
+
+/**
+ * Perform the following multiply add operation:
+ * carry:sum = sum + op1*op2 + carry
+ * Preserves op1, op2 and modifies rest of registers (raxReg and rdxReg receive the product and are clobbered)
+ */
+void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) {
+ // rdx:rax = op1 * op2
+ movq(raxReg, op2);
+ mulq(op1);
+
+ // rdx:sum = sum + carry + rdx:rax (the low 64 bits accumulate in sum, not in rax)
+ addq(sum, carry);
+ adcq(rdxReg, 0);
+ addq(sum, raxReg);
+ adcq(rdxReg, 0);
+
+ // carry:sum = rdx:sum
+ movq(carry, rdxReg);
+}
+
+/**
+ * Add 64 bit long carry into z[] with carry propagation.
+ * Preserves z and carry register values and modifies rest of registers.
+ * zlen is stepped down by two ints per element visited and tmp1 is loaded with the constant 1.
+ */
+void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
+ Label L_fourth_loop, L_fourth_loop_exit;
+
+ movl(tmp1, 1); // constant 1 used to ripple a carry into higher elements
+ subl(zlen, 2); // step to the next 64-bit element (two ints) towards the front of z
+ addq(Address(z, zlen, Address::times_4, 0), carry); // z[zlen:zlen+1] += carry
+
+ bind(L_fourth_loop);
+ jccb(Assembler::carryClear, L_fourth_loop_exit); // tests the flags of the preceding addq: stop once an add has no carry-out
+ subl(zlen, 2); // move one more 64-bit element towards the front
+ jccb(Assembler::negative, L_fourth_loop_exit); // index went below zero: nothing left to carry into
+ addq(Address(z, zlen, Address::times_4, 0), tmp1); // ripple the carry: z[zlen:zlen+1] += 1
+ jmp(L_fourth_loop);
+ bind(L_fourth_loop_exit);
+}
+
+/**
+ * Shift z[] left by 1 bit.
+ * Preserves x, len, z and zlen registers and modifies rest of the registers.
+ * (x and len are accepted but not referenced in this function.)
+ */
+void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {
+
+ Label L_fifth_loop, L_fifth_loop_exit;
+
+ // Fifth loop
+ // Perform primitiveLeftShift(z, zlen, 1)
+
+ const Register prev_carry = tmp1;
+ const Register new_carry = tmp4;
+ const Register value = tmp2;
+ const Register zidx = tmp3;
+
+ // int zidx, carry;
+ // long value;
+ // carry = 0;
+ // for (zidx = zlen-2; zidx >=0; zidx -= 2) {
+ // (carry:value) = (z[zidx:zidx+1] << 1) | carry ;
+ // z[zidx:zidx+1] = value;
+ // }
+
+ movl(zidx, zlen);
+ xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register
+
+ bind(L_fifth_loop);
+ decl(zidx); // Use decl to preserve carry flag
+ decl(zidx);
+ jccb(Assembler::negative, L_fifth_loop_exit);
+
+ if (UseBMI2Instructions) {
+ movq(value, Address(z, zidx, Address::times_4, 0));
+ rclq(value, 1); // rotate left through the carry flag: previous carry enters bit 0, the top bit becomes the new carry
+ rorxq(value, value, 32); // swap the 32-bit halves; rorx leaves the carry flag untouched for the next iteration
+ movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
+ }
+ else {
+ // clear new_carry
+ xorl(new_carry, new_carry);
+
+ // Shift z[i] by 1, or in previous carry and save new carry
+ movq(value, Address(z, zidx, Address::times_4, 0));
+ shlq(value, 1);
+ adcl(new_carry, 0); // capture the bit shifted out of the top
+
+ orq(value, prev_carry); // bit 0 is free after the shift, so OR-ing inserts the previous carry
+ rorq(value, 0x20); // swap the 32-bit halves (this path tracks the carry in a register, not in the flags)
+ movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form
+
+ // Set previous carry = new carry
+ movl(prev_carry, new_carry);
+ }
+ jmp(L_fifth_loop);
+
+ bind(L_fifth_loop_exit);
+}
+
+
+/**
+ * Code for BigInteger::squareToLen() intrinsic
+ *
+ * rdi: x
+ * rsi: len
+ * r8: z
+ * rcx: zlen
+ * r12: tmp1
+ * r13: tmp2
+ * r14: tmp3
+ * r15: tmp4
+ * rbx: tmp5
+ * tmp1..tmp5 are pushed on entry and popped on exit; len and zlen are restored before the final shift.
+ */
+void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+
+ Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply; // NOTE(review): fifth_loop and fifth_loop_exit are not used in this function
+ push(tmp1);
+ push(tmp2);
+ push(tmp3);
+ push(tmp4);
+ push(tmp5);
+
+ // First loop
+ // Store the squares, right shifted one bit (i.e., divided by 2).
+ square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);
+
+ // Add in off-diagonal sums.
+ //
+ // Second, third (nested) and fourth loops.
+ // zlen +=2;
+ // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
+ // carry = 0;
+ // long op2 = x[xidx:xidx+1];
+ // for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
+ // k -= 2;
+ // long op1 = x[j:j+1];
+ // long sum = z[k:k+1];
+ // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
+ // z[k:k+1] = sum;
+ // }
+ // add_one_64(z, k, carry, tmp_regs);
+ // }
+
+ const Register carry = tmp5;
+ const Register sum = tmp3;
+ const Register op1 = tmp4;
+ Register op2 = tmp2;
+
+ push(zlen); // save the caller's zlen; restored at L_second_loop_exit
+ push(len); // save the caller's len; restored at L_second_loop_exit
+ addl(zlen,2); // the "zlen +=2" of the pseudo code above
+ bind(L_second_loop);
+ xorq(carry, carry);
+ subl(zlen, 4); // zidx for this outer iteration
+ subl(len, 2); // xidx for this outer iteration
+ push(zlen); // per-iteration copies: the third loop and add_one_64 modify zlen and len
+ push(len);
+ cmpl(len, 0);
+ jccb(Assembler::lessEqual, L_second_loop_exit); // xidx <= 0: no off-diagonal terms left
+
+ // Multiply an array by one 64 bit long.
+ if (UseBMI2Instructions) {
+ op2 = rdxReg; // mulx takes its multiplicand from rdx, so keep op2 there
+ movq(op2, Address(x, len, Address::times_4, 0));
+ rorxq(op2, op2, 32); // swap the 32-bit halves
+ }
+ else {
+ movq(op2, Address(x, len, Address::times_4, 0));
+ rorq(op2, 32); // swap the 32-bit halves
+ }
+
+ bind(L_third_loop);
+ decrementl(len);
+ jccb(Assembler::negative, L_third_loop_exit); // no ints of x remain below xidx
+ decrementl(len);
+ jccb(Assembler::negative, L_last_x); // exactly one int remains: handled out of line
+
+ movq(op1, Address(x, len, Address::times_4, 0));
+ rorq(op1, 32);
+
+ bind(L_multiply);
+ subl(zlen, 2);
+ movq(sum, Address(z, zlen, Address::times_4, 0));
+
+ // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
+ if (UseBMI2Instructions) {
+ multiply_add_64_bmi2(sum, op1, op2, carry, tmp2); // tmp2 is free as scratch here because op2 lives in rdxReg
+ }
+ else {
+ multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+ }
+
+ movq(Address(z, zlen, Address::times_4, 0), sum);
+
+ jmp(L_third_loop);
+ bind(L_third_loop_exit);
+
+ // Fourth loop
+ // Add 64 bit long carry into z with carry propagation.
+ // Uses offsetted zlen.
+ add_one_64(z, zlen, carry, tmp1);
+
+ pop(len); // restore this iteration's xidx
+ pop(zlen); // restore this iteration's zidx
+ jmp(L_second_loop);
+
+ // Next infrequent code is moved outside loops.
+ bind(L_last_x);
+ movl(op1, Address(x, 0)); // the single leftover leading word x[0], loaded as 32 bits
+ jmp(L_multiply);
+
+ bind(L_second_loop_exit);
+ pop(len); // discard the per-iteration copies pushed just before the exit test
+ pop(zlen);
+ pop(len); // restore the caller's len
+ pop(zlen); // restore the caller's zlen
+
+ // Fifth loop
+ // Shift z left 1 bit.
+ lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);
+
+ // z[zlen-1] |= x[len-1] & 1;
+ movl(tmp3, Address(x, len, Address::times_4, -4));
+ andl(tmp3, 1);
+ orl(Address(z, zlen, Address::times_4, -4), tmp3);
+
+ pop(tmp5);
+ pop(tmp4);
+ pop(tmp3);
+ pop(tmp2);
+ pop(tmp1);
+}
+
+/**
+ * Helper function for mul_add()
+ * Multiply the in[] by int k and add to out[] starting at offset offs using
+ * 128 bit by 32 bit multiply and return the carry in tmp5.
+ * Only quad int aligned length of in[] is operated on in this function.
+ * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
+ * This function preserves out, in and k registers.
+ * len and offset point to the appropriate index in "in" & "out" correspondingly
+ * tmp5 has the carry.
+ * other registers are temporary and are modified (tmp1 counts the len/4 iterations).
+ * On return len and offset have each been reduced by 4 per iteration executed.
+ */
+void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in,
+ Register offset, Register len, Register tmp1, Register tmp2, Register tmp3,
+ Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+
+ Label L_first_loop, L_first_loop_exit;
+
+ movl(tmp1, len);
+ shrl(tmp1, 2); // tmp1 = number of whole 4-int groups
+
+ bind(L_first_loop);
+ subl(tmp1, 1);
+ jccb(Assembler::negative, L_first_loop_exit); // all 4-int groups done
+
+ subl(len, 4); // step both indices back by one 4-int group
+ subl(offset, 4);
+
+ Register op2 = tmp2;
+ const Register sum = tmp3;
+ const Register op1 = tmp4;
+ const Register carry = tmp5;
+
+ if (UseBMI2Instructions) {
+ op2 = rdxReg; // the BMI2 helper takes the multiplier from rdx
+ }
+
+ movq(op1, Address(in, len, Address::times_4, 8)); // ints [len+2, len+3] of the group are processed first
+ rorq(op1, 32);
+ movq(sum, Address(out, offset, Address::times_4, 8));
+ rorq(sum, 32);
+ if (UseBMI2Instructions) {
+ multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
+ }
+ else {
+ multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+ }
+ // Store back in big endian from little endian
+ rorq(sum, 0x20);
+ movq(Address(out, offset, Address::times_4, 8), sum);
+
+ movq(op1, Address(in, len, Address::times_4, 0)); // then ints [len, len+1], continuing with the carry from above
+ rorq(op1, 32);
+ movq(sum, Address(out, offset, Address::times_4, 0));
+ rorq(sum, 32);
+ if (UseBMI2Instructions) {
+ multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
+ }
+ else {
+ multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+ }
+ // Store back in big endian from little endian
+ rorq(sum, 0x20);
+ movq(Address(out, offset, Address::times_4, 0), sum);
+
+ jmp(L_first_loop);
+ bind(L_first_loop_exit);
+}
+
+/**
+ * Code for BigInteger::mulAdd() intrinsic
+ *
+ * rdi: out
+ * rsi: in
+ * r11: offs (out.length - offset)
+ * rcx: len
+ * r8: k
+ * r12: tmp1
+ * r13: tmp2
+ * r14: tmp3
+ * r15: tmp4
+ * rbx: tmp5
+ * Multiply the in[] by word k and add to out[], return the carry in rax (tmp1..tmp5 are saved and restored)
+ */
+void MacroAssembler::mul_add(Register out, Register in, Register offs,
+ Register len, Register k, Register tmp1, Register tmp2, Register tmp3,
+ Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
+
+ Label L_carry, L_last_in, L_done;
+
+// carry = 0;
+// for (int j=len-1; j >= 0; j--) {
+// long product = (in[j] & LONG_MASK) * kLong +
+// (out[offs] & LONG_MASK) + carry;
+// out[offs--] = (int)product;
+// carry = product >>> 32;
+// }
+//
+ push(tmp1);
+ push(tmp2);
+ push(tmp3);
+ push(tmp4);
+ push(tmp5);
+
+ Register op2 = tmp2;
+ const Register sum = tmp3;
+ const Register op1 = tmp4;
+ const Register carry = tmp5;
+
+ if (UseBMI2Instructions) {
+ op2 = rdxReg; // the BMI2 helper takes the multiplier from rdx
+ movl(op2, k);
+ }
+ else {
+ movl(op2, k); // same 32-bit copy of k, but into tmp2
+ }
+
+ xorq(carry, carry);
+
+ //First loop
+
+ //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply
+ //The carry is in tmp5
+ mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
+
+ //Multiply the trailing in[] entry using 64 bit by 32 bit, if any
+ decrementl(len);
+ jccb(Assembler::negative, L_carry); // no ints left over after the 4-int groups
+ decrementl(len);
+ jccb(Assembler::negative, L_last_in); // exactly one int left over
+
+ movq(op1, Address(in, len, Address::times_4, 0));
+ rorq(op1, 32);
+
+ subl(offs, 2);
+ movq(sum, Address(out, offs, Address::times_4, 0));
+ rorq(sum, 32);
+
+ if (UseBMI2Instructions) {
+ multiply_add_64_bmi2(sum, op1, op2, carry, raxReg);
+ }
+ else {
+ multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
+ }
+
+ // Store back in big endian from little endian
+ rorq(sum, 0x20);
+ movq(Address(out, offs, Address::times_4, 0), sum);
+
+ testl(len, len);
+ jccb(Assembler::zero, L_carry); // len == 0 here: no single leading int remains
+
+ //Multiply the last in[] entry, if any
+ bind(L_last_in);
+ movl(op1, Address(in, 0));
+ movl(sum, Address(out, offs, Address::times_4, -4));
+
+ movl(raxReg, k);
+ mull(op1); //tmp4 * eax -> edx:eax
+ addl(sum, carry); // sum += incoming carry (32-bit)
+ adcl(rdxReg, 0); // fold the carry-out into the high half
+ addl(sum, raxReg); // sum += low half of the product
+ adcl(rdxReg, 0); // fold the carry-out into the high half again
+ movl(carry, rdxReg); // outgoing carry = high half
+
+ movl(Address(out, offs, Address::times_4, -4), sum);
+
+ bind(L_carry);
+ //return tmp5/carry as carry in rax
+ movl(rax, carry); // NOTE(review): writes rax directly rather than the raxReg parameter
+
+ bind(L_done); // NOTE(review): no jump to L_done is visible in this function
+ pop(tmp5);
+ pop(tmp4);
+ pop(tmp3);
+ pop(tmp2);
+ pop(tmp1);
+}
#endif
/**
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Wed Jun 03 15:02:10 2015 -0700
@@ -1241,6 +1241,25 @@
Register carry2);
void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
+
+ void square_rshift(Register x, Register len, Register z, Register tmp1, Register tmp3,
+ Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
+ void multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry,
+ Register tmp2);
+ void multiply_add_64(Register sum, Register op1, Register op2, Register carry,
+ Register rdxReg, Register raxReg);
+ void add_one_64(Register z, Register zlen, Register carry, Register tmp1);
+ void lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
+ Register tmp3, Register tmp4);
+ void square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2,
+ Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg);
+
+ void mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1,
+ Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
+ Register raxReg);
+ void mul_add(Register out, Register in, Register offset, Register len, Register k, Register tmp1,
+ Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg,
+ Register raxReg);
#endif
// CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic.
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Jun 03 15:02:10 2015 -0700
@@ -3785,6 +3785,107 @@
return start;
}
+/**
+ * Generates the "squareToLen" stub. Arguments:
+ *
+ * Input:
+ * c_rarg0 - x address
+ * c_rarg1 - x length
+ * c_rarg2 - z address
+ * c_rarg3 - z length
+ *
+ */
+ address generate_squareToLen() {
+
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "squareToLen");
+
+ address start = __ pc();
+ // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+ // Unix: rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
+ const Register x = rdi;
+ const Register len = rsi;
+ const Register z = r8;
+ const Register zlen = rcx;
+
+ const Register tmp1 = r12;
+ const Register tmp2 = r13;
+ const Register tmp3 = r14;
+ const Register tmp4 = r15;
+ const Register tmp5 = rbx;
+
+ BLOCK_COMMENT("Entry:");
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+ setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
+ // zlen => rcx
+ // r9 and r10 may be used to save non-volatile registers
+ __ movptr(r8, rdx); // move z out of rdx, which is passed below as square_to_len's rdxReg
+ __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
+
+ restore_arg_regs();
+
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ return start;
+ }
+
+ /**
+ * Generates the "mulAdd" stub. Arguments:
+ *
+ * Input:
+ * c_rarg0 - out address
+ * c_rarg1 - in address
+ * c_rarg2 - offset
+ * c_rarg3 - len
+ * not Win64
+ * c_rarg4 - k
+ * Win64
+ * rsp+40 - k
+ */
+ address generate_mulAdd() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "mulAdd");
+
+ address start = __ pc();
+ // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+ // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
+ const Register out = rdi;
+ const Register in = rsi;
+ const Register offset = r11;
+ const Register len = rcx;
+ const Register k = r8;
+
+ // Next registers will be saved on stack in mul_add().
+ const Register tmp1 = r12;
+ const Register tmp2 = r13;
+ const Register tmp3 = r14;
+ const Register tmp4 = r15;
+ const Register tmp5 = rbx;
+
+ BLOCK_COMMENT("Entry:");
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+ setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
+ // len => rcx, k => r8
+ // r9 and r10 may be used to save non-volatile registers
+#ifdef _WIN64
+ // last argument is on stack on Win64
+ __ movl(k, Address(rsp, 6 * wordSize)); // presumably the header's rsp+40 plus the 8 bytes pushed by enter() -- TODO confirm
+#endif
+ __ movptr(r11, rdx); // move offset in rdx to offset(r11)
+ __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
+
+ restore_arg_regs();
+
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(0);
+
+ return start;
+ }
+
+
#undef __
#define __ masm->
@@ -4030,6 +4131,12 @@
if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
+ if (UseSquareToLenIntrinsic) {
+ StubRoutines::_squareToLen = generate_squareToLen();
+ }
+ if (UseMulAddIntrinsic) {
+ StubRoutines::_mulAdd = generate_mulAdd();
+ }
#endif
}
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp Wed Jun 03 15:02:10 2015 -0700
@@ -33,7 +33,7 @@
enum platform_dependent_constants {
code_size1 = 19000, // simply increase if too small (assembler will crash if too small)
- code_size2 = 22000 // simply increase if too small (assembler will crash if too small)
+ code_size2 = 23000 // simply increase if too small (assembler will crash if too small)
};
class x86 {
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Wed Jun 03 15:02:10 2015 -0700
@@ -790,6 +790,12 @@
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
UseMultiplyToLenIntrinsic = true;
}
+ if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
+ UseSquareToLenIntrinsic = true;
+ }
+ if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
+ UseMulAddIntrinsic = true;
+ }
#else
if (UseMultiplyToLenIntrinsic) {
if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
@@ -797,6 +803,18 @@
}
FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
}
+ if (UseSquareToLenIntrinsic) {
+ if (!FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
+ warning("squareToLen intrinsic is not available in 32-bit VM");
+ }
+ FLAG_SET_DEFAULT(UseSquareToLenIntrinsic, false);
+ }
+ if (UseMulAddIntrinsic) {
+ if (!FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
+ warning("mulAdd intrinsic is not available in 32-bit VM");
+ }
+ FLAG_SET_DEFAULT(UseMulAddIntrinsic, false);
+ }
#endif
#endif // COMPILER2
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp Wed Jun 03 15:02:10 2015 -0700
@@ -799,6 +799,14 @@
do_name( multiplyToLen_name, "multiplyToLen") \
do_signature(multiplyToLen_signature, "([II[II[I)[I") \
\
+ do_intrinsic(_squareToLen, java_math_BigInteger, squareToLen_name, squareToLen_signature, F_S) \
+ do_name( squareToLen_name, "implSquareToLen") \
+ do_signature(squareToLen_signature, "([II[II)[I") \
+ \
+ do_intrinsic(_mulAdd, java_math_BigInteger, mulAdd_name, mulAdd_signature, F_S) \
+ do_name( mulAdd_name, "implMulAdd") \
+ do_signature(mulAdd_signature, "([I[IIII)I") \
+ \
/* java/lang/ref/Reference */ \
do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \
\
--- a/hotspot/src/share/vm/opto/c2_globals.hpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp Wed Jun 03 15:02:10 2015 -0700
@@ -665,6 +665,12 @@
product(bool, UseMultiplyToLenIntrinsic, false, \
"Enables intrinsification of BigInteger.multiplyToLen()") \
\
+ product(bool, UseSquareToLenIntrinsic, false, \
+ "Enables intrinsification of BigInteger.squareToLen()") \
+ \
+ product(bool, UseMulAddIntrinsic, false, \
+ "Enables intrinsification of BigInteger.mulAdd()") \
+ \
product(bool, UseTypeSpeculation, true, \
"Speculatively propagate types from profiles") \
\
--- a/hotspot/src/share/vm/opto/escape.cpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/share/vm/opto/escape.cpp Wed Jun 03 15:02:10 2015 -0700
@@ -972,7 +972,9 @@
strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 ||
- strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0)
+ strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0 ||
+ strcmp(call->as_CallLeaf()->_name, "squareToLen") == 0 ||
+ strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0)
))) {
call->dump();
fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
--- a/hotspot/src/share/vm/opto/library_call.cpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/share/vm/opto/library_call.cpp Wed Jun 03 15:02:10 2015 -0700
@@ -291,6 +291,8 @@
bool inline_updateBytesCRC32();
bool inline_updateByteBufferCRC32();
bool inline_multiplyToLen();
+ bool inline_squareToLen();
+ bool inline_mulAdd();
bool inline_profileBoolean();
bool inline_isCompileConstant();
@@ -494,6 +496,14 @@
if (!UseMultiplyToLenIntrinsic) return NULL;
break;
+ case vmIntrinsics::_squareToLen:
+ if (!UseSquareToLenIntrinsic) return NULL;
+ break;
+
+ case vmIntrinsics::_mulAdd:
+ if (!UseMulAddIntrinsic) return NULL;
+ break;
+
case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
if (!UseAESIntrinsics) return NULL;
@@ -913,6 +923,12 @@
case vmIntrinsics::_multiplyToLen:
return inline_multiplyToLen();
+ case vmIntrinsics::_squareToLen:
+ return inline_squareToLen();
+
+ case vmIntrinsics::_mulAdd:
+ return inline_mulAdd();
+
case vmIntrinsics::_encodeISOArray:
return inline_encodeISOArray();
@@ -5306,6 +5322,100 @@
return true;
}
+//-------------inline_squareToLen------------------------------------
+bool LibraryCallKit::inline_squareToLen() { // replaces the call with a leaf call to the squareToLen stub; returns false to decline
+ assert(UseSquareToLenIntrinsic, "not implementated on this platform");
+
+ address stubAddr = StubRoutines::squareToLen();
+ if (stubAddr == NULL) {
+ return false; // Intrinsic's stub is not implemented on this platform
+ }
+ const char* stubName = "squareToLen";
+
+ assert(callee()->signature()->size() == 4, "implSquareToLen has 4 parameters");
+
+ Node* x = argument(0);
+ Node* len = argument(1);
+ Node* z = argument(2);
+ Node* zlen = argument(3);
+
+ const Type* x_type = x->Value(&_gvn);
+ const Type* z_type = z->Value(&_gvn);
+ const TypeAryPtr* top_x = x_type->isa_aryptr();
+ const TypeAryPtr* top_z = z_type->isa_aryptr();
+ if (top_x == NULL || top_x->klass() == NULL ||
+ top_z == NULL || top_z->klass() == NULL) {
+ // failed array check
+ return false;
+ }
+
+ BasicType x_elem = x_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); // same pointer as top_x, already checked non-NULL above
+ BasicType z_elem = z_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); // same pointer as top_z, already checked non-NULL above
+ if (x_elem != T_INT || z_elem != T_INT) {
+ return false; // the stub is only used for int[] operands
+ }
+
+
+ Node* x_start = array_element_address(x, intcon(0), x_elem); // address of x[0]
+ Node* z_start = array_element_address(z, intcon(0), z_elem); // address of z[0]
+
+ Node* call = make_runtime_call(RC_LEAF|RC_NO_FP, // the call node itself is not used afterwards; the stub writes into z
+ OptoRuntime::squareToLen_Type(),
+ stubAddr, stubName, TypePtr::BOTTOM,
+ x_start, len, z_start, zlen);
+
+ set_result(z); // the intrinsic's result is the z array that was passed in
+ return true;
+}
+
+//-------------inline_mulAdd------------------------------------------
+bool LibraryCallKit::inline_mulAdd() { // replaces the call with a leaf call to the mulAdd stub; returns false to decline
+ assert(UseMulAddIntrinsic, "not implementated on this platform");
+
+ address stubAddr = StubRoutines::mulAdd();
+ if (stubAddr == NULL) {
+ return false; // Intrinsic's stub is not implemented on this platform
+ }
+ const char* stubName = "mulAdd";
+
+ assert(callee()->signature()->size() == 5, "mulAdd has 5 parameters");
+
+ Node* out = argument(0);
+ Node* in = argument(1);
+ Node* offset = argument(2);
+ Node* len = argument(3);
+ Node* k = argument(4);
+
+ const Type* out_type = out->Value(&_gvn);
+ const Type* in_type = in->Value(&_gvn);
+ const TypeAryPtr* top_out = out_type->isa_aryptr();
+ const TypeAryPtr* top_in = in_type->isa_aryptr();
+ if (top_out == NULL || top_out->klass() == NULL ||
+ top_in == NULL || top_in->klass() == NULL) {
+ // failed array check
+ return false;
+ }
+
+ BasicType out_elem = out_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); // same pointer as top_out, already checked non-NULL above
+ BasicType in_elem = in_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type(); // same pointer as top_in, already checked non-NULL above
+ if (out_elem != T_INT || in_elem != T_INT) {
+ return false; // the stub is only used for int[] operands
+ }
+
+ Node* outlen = load_array_length(out);
+ Node* new_offset = _gvn.transform(new SubINode(outlen, offset)); // the stub is handed out.length - offset instead of offset
+ Node* out_start = array_element_address(out, intcon(0), out_elem); // address of out[0]
+ Node* in_start = array_element_address(in, intcon(0), in_elem); // address of in[0]
+
+ Node* call = make_runtime_call(RC_LEAF|RC_NO_FP,
+ OptoRuntime::mulAdd_Type(),
+ stubAddr, stubName, TypePtr::BOTTOM,
+ out_start,in_start, new_offset, len, k);
+ Node* result = _gvn.transform(new ProjNode(call, TypeFunc::Parms)); // projection of the call's first result slot (the int carry)
+ set_result(result);
+ return true;
+}
+
/**
* Calculate CRC32 for byte.
--- a/hotspot/src/share/vm/opto/runtime.cpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/share/vm/opto/runtime.cpp Wed Jun 03 15:02:10 2015 -0700
@@ -945,6 +945,48 @@
return TypeFunc::make(domain, range);
}
+const TypeFunc* OptoRuntime::squareToLen_Type() { // signature of the squareToLen stub call: 2 pointers and 2 ints, no result
+ // create input type (domain)
+ int num_args = 4;
+ int argcnt = num_args;
+ const Type** fields = TypeTuple::fields(argcnt);
+ int argp = TypeFunc::Parms;
+ fields[argp++] = TypePtr::NOTNULL; // x
+ fields[argp++] = TypeInt::INT; // len
+ fields[argp++] = TypePtr::NOTNULL; // z
+ fields[argp++] = TypeInt::INT; // zlen
+ assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+ const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+ // no result type needed
+ fields = TypeTuple::fields(1);
+ fields[TypeFunc::Parms+0] = NULL;
+ const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); // count is TypeFunc::Parms with no +1: the range has no result slot
+ return TypeFunc::make(domain, range);
+}
+
+// for mulAdd calls, 2 pointers and 3 ints, returning int
+const TypeFunc* OptoRuntime::mulAdd_Type() {
+ // create input type (domain)
+ int num_args = 5;
+ int argcnt = num_args;
+ const Type** fields = TypeTuple::fields(argcnt);
+ int argp = TypeFunc::Parms;
+ fields[argp++] = TypePtr::NOTNULL; // out
+ fields[argp++] = TypePtr::NOTNULL; // in
+ fields[argp++] = TypeInt::INT; // offset
+ fields[argp++] = TypeInt::INT; // len
+ fields[argp++] = TypeInt::INT; // k
+ assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
+ const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields);
+
+ // returning carry (int)
+ fields = TypeTuple::fields(1);
+ fields[TypeFunc::Parms+0] = TypeInt::INT;
+ const TypeTuple* range = TypeTuple::make(TypeFunc::Parms+1, fields); // Parms+1: exactly one result slot
+ return TypeFunc::make(domain, range);
+}
+
//------------- Interpreter state access for on stack replacement
--- a/hotspot/src/share/vm/opto/runtime.hpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/share/vm/opto/runtime.hpp Wed Jun 03 15:02:10 2015 -0700
@@ -312,6 +312,10 @@
static const TypeFunc* multiplyToLen_Type();
+ static const TypeFunc* squareToLen_Type();
+
+ static const TypeFunc* mulAdd_Type();
+
static const TypeFunc* updateBytesCRC32_Type();
// leaf on stack replacement interpreter accessor types
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp Wed Jun 03 15:02:10 2015 -0700
@@ -137,6 +137,8 @@
address StubRoutines::_crc_table_adr = NULL;
address StubRoutines::_multiplyToLen = NULL;
+address StubRoutines::_squareToLen = NULL;
+address StubRoutines::_mulAdd = NULL;
double (* StubRoutines::_intrinsic_log )(double) = NULL;
double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp Wed Jun 03 15:02:10 2015 -0700
@@ -197,6 +197,8 @@
static address _crc_table_adr;
static address _multiplyToLen;
+ static address _squareToLen;
+ static address _mulAdd;
// These are versions of the java.lang.Math methods which perform
// the same operations as the intrinsic version. They are used for
@@ -356,6 +358,8 @@
static address crc_table_addr() { return _crc_table_adr; }
static address multiplyToLen() {return _multiplyToLen; }
+ static address squareToLen() {return _squareToLen; }
+ static address mulAdd() {return _mulAdd; }
static address select_fill_function(BasicType t, bool aligned, const char* &name);
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp Fri May 29 17:56:50 2015 +0200
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp Wed Jun 03 15:02:10 2015 -0700
@@ -831,6 +831,8 @@
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
static_field(StubRoutines, _multiplyToLen, address) \
+ static_field(StubRoutines, _squareToLen, address) \
+ static_field(StubRoutines, _mulAdd, address) \
\
/*****************/ \
/* SharedRuntime */ \
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/intrinsics/muladd/TestMulAdd.java Wed Jun 03 15:02:10 2015 -0700
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8081778
+ * @summary Add C2 x86 intrinsic for BigInteger::mulAdd() method
+ *
+ * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
+ * -XX:+IgnoreUnrecognizedVMOptions -XX:-UseSquareToLenIntrinsic -XX:-UseMultiplyToLenIntrinsic
+ * -XX:CompileCommand=dontinline,TestMulAdd::main
+ * -XX:CompileCommand=option,TestMulAdd::base_multiply,ccstr,DisableIntrinsic,_mulAdd
+ * -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_mulAdd
+ * -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_mulAdd
+ * -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_mulAdd
+ * -XX:CompileCommand=option,java.math.BigInteger::mulAdd,ccstr,DisableIntrinsic,_mulAdd
+ * -XX:CompileCommand=inline,java.math.BigInteger::multiply
+ * -XX:CompileCommand=inline,java.math.BigInteger::square
+ * -XX:CompileCommand=inline,java.math.BigInteger::squareToLen
+ * -XX:CompileCommand=inline,java.math.BigInteger::mulAdd TestMulAdd
+ */
+
+import java.util.Random;
+import java.math.*;
+
+public class TestMulAdd {
+
+    // Baseline squaring: the @run line above applies DisableIntrinsic,_mulAdd to this
+    // method, so its result is the reference value.
+    public static BigInteger base_multiply(BigInteger op1) {
+        return op1.multiply(op1);
+    }
+
+    // Squaring under test: no DisableIntrinsic option targets this method, so the
+    // inlined BigInteger code may use the mulAdd() intrinsic.
+    public static BigInteger new_multiply(BigInteger op1) {
+        return op1.multiply(op1);
+    }
+
+    // True iff both values have identical toByteArray() contents.
+    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
+        return java.util.Arrays.equals(b1.toByteArray(), b2.toByteArray());
+    }
+
+    // Formats every byte of b as two hex digits followed by a space.
+    public static String stringify(BigInteger b) {
+        // StringBuilder avoids re-copying the whole string for every appended byte.
+        StringBuilder strout = new StringBuilder();
+        for (byte value : b.toByteArray()) {
+            strout.append(String.format("%02x", value)).append(' ');
+        }
+        return strout.toString();
+    }
+
+    /**
+     * Squares 100000 random operands through both paths and compares the results.
+     * Seeds default to System.nanoTime(), are printed up front, and can be pinned
+     * with -DTestMulAdd.seed=N -DTestMulAdd.seed1=N to replay a failing run.
+     * @throws Exception if any single result or the accumulated sums differ
+     */
+    public static void main(String args[]) throws Exception {
+
+        BigInteger oldsum = BigInteger.ZERO;
+        BigInteger newsum = BigInteger.ZERO;
+
+        // seed feeds the operand bits, seed1 feeds the operand bit lengths.
+        long seed = Long.getLong("TestMulAdd.seed", System.nanoTime());
+        long seed1 = Long.getLong("TestMulAdd.seed1", System.nanoTime());
+        System.out.println("seed=" + seed + " seed1=" + seed1);
+        Random rand = new Random(seed);
+        Random rand1 = new Random(seed1);
+
+        for (int j = 0; j < 100000; j++) {
+            int rand_int = rand1.nextInt(3136)+32;
+            BigInteger b1 = new BigInteger(rand_int, rand);
+
+            BigInteger oldres = base_multiply(b1);
+            BigInteger newres = new_multiply(b1);
+
+            oldsum = oldsum.add(oldres);
+            newsum = newsum.add(newres);
+
+            if (!bytecompare(oldres,newres)) {
+                System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
+                System.out.println(b1);
+                throw new Exception("Failed");
+            }
+        }
+        if (!bytecompare(oldsum,newsum)) {
+            System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
+            throw new Exception("Failed");
+        } else {
+            System.out.println("Success");
+        }
+    }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/intrinsics/squaretolen/TestSquareToLen.java Wed Jun 03 15:02:10 2015 -0700
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8081778
+ * @summary Add C2 x86 intrinsic for BigInteger::squareToLen() method
+ *
+ * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
+ * -XX:CompileCommand=exclude,TestSquareToLen::main
+ * -XX:CompileCommand=option,TestSquareToLen::base_multiply,ccstr,DisableIntrinsic,_squareToLen
+ * -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_squareToLen
+ * -XX:CompileCommand=option,java.math.BigInteger::square,ccstr,DisableIntrinsic,_squareToLen
+ * -XX:CompileCommand=option,java.math.BigInteger::squareToLen,ccstr,DisableIntrinsic,_squareToLen
+ * -XX:CompileCommand=inline,java.math.BigInteger::multiply
+ * -XX:CompileCommand=inline,java.math.BigInteger::square
+ * -XX:CompileCommand=inline,java.math.BigInteger::squareToLen TestSquareToLen
+ */
+
+import java.util.Random;
+import java.math.*;
+
+public class TestSquareToLen {
+
+    // Baseline squaring: the @run line above applies DisableIntrinsic,_squareToLen to
+    // this method, so its result is the reference value.
+    public static BigInteger base_multiply(BigInteger op1) {
+        return op1.multiply(op1);
+    }
+
+    // Squaring under test: no DisableIntrinsic option targets this method, so the
+    // inlined BigInteger code may use the squareToLen() intrinsic.
+    public static BigInteger new_multiply(BigInteger op1) {
+        return op1.multiply(op1);
+    }
+
+    // True iff both values have identical toByteArray() contents.
+    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
+        return java.util.Arrays.equals(b1.toByteArray(), b2.toByteArray());
+    }
+
+    // Formats every byte of b as two hex digits followed by a space.
+    public static String stringify(BigInteger b) {
+        // StringBuilder avoids re-copying the whole string for every appended byte.
+        StringBuilder strout = new StringBuilder();
+        for (byte value : b.toByteArray()) {
+            strout.append(String.format("%02x", value)).append(' ');
+        }
+        return strout.toString();
+    }
+
+    /**
+     * Squares 100000 random operands through both paths and compares the results.
+     * Seeds default to System.nanoTime(), are printed up front, and can be pinned
+     * with -DTestSquareToLen.seed=N -DTestSquareToLen.seed1=N to replay a failing run.
+     * @throws Exception if any single result or the accumulated sums differ
+     */
+    public static void main(String args[]) throws Exception {
+
+        BigInteger oldsum = BigInteger.ZERO;
+        BigInteger newsum = BigInteger.ZERO;
+
+        // seed feeds the operand bits, seed1 feeds the operand bit lengths.
+        long seed = Long.getLong("TestSquareToLen.seed", System.nanoTime());
+        long seed1 = Long.getLong("TestSquareToLen.seed1", System.nanoTime());
+        System.out.println("seed=" + seed + " seed1=" + seed1);
+        Random rand = new Random(seed);
+        Random rand1 = new Random(seed1);
+
+        for (int j = 0; j < 100000; j++) {
+            int rand_int = rand1.nextInt(3136)+32;
+            BigInteger b1 = new BigInteger(rand_int, rand);
+
+            BigInteger oldres = base_multiply(b1);
+            BigInteger newres = new_multiply(b1);
+
+            oldsum = oldsum.add(oldres);
+            newsum = newsum.add(newres);
+
+            if (!bytecompare(oldres,newres)) {
+                System.out.print("mismatch for:b1:" + stringify(b1) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
+                System.out.println(b1);
+                throw new Exception("Failed");
+            }
+        }
+        if (!bytecompare(oldsum,newsum)) {
+            System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
+            throw new Exception("Failed");
+        } else {
+            System.out.println("Success");
+        }
+    }
+}