--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp Sat Sep 30 01:38:57 2017 +0000
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp Mon Oct 02 17:20:14 2017 +0300
@@ -2840,6 +2840,44 @@
bind(L_done);
}
+// Emits the loop for the BigInteger::mulAdd intrinsic.
+// out = r0 (array address on entry; reused to hold the carry, which is the result)
+// in = r1
+// offset = r2 (already out.length-offset)
+// len = r3
+// k = r4 (NOTE(review): presumably expected zero-extended, cf. kLong below -- confirm)
+// The emitted code overwrites in, offset, len, rscratch1 and rscratch2.
+// Pseudo code from the Java implementation:
+// carry = 0;
+// offset = out.length-offset - 1;
+// for (int j=len-1; j >= 0; j--) {
+// product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
+// out[offset--] = (int)product;
+// carry = product >>> 32;
+// }
+// return (int)carry;
+void MacroAssembler::mul_add(Register out, Register in, Register offset,
+ Register len, Register k) {
+ Label LOOP, END;
+ // pre-loop
+ cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
+ csel(out, zr, out, Assembler::EQ); // len == 0: the returned carry is 0
+ br(Assembler::EQ, END); // len == 0: skip the loop entirely
+ add(in, in, len, LSL, 2); // in[j+1] address
+ add(offset, out, offset, LSL, 2); // out[offset + 1] address
+ mov(out, zr); // used to keep carry now
+ BIND(LOOP);
+ ldrw(rscratch1, Address(pre(in, -4))); // step in back by 4 bytes, then load in[j]
+ madd(rscratch1, rscratch1, k, out); // in[j] * k + carry
+ ldrw(rscratch2, Address(pre(offset, -4))); // step offset back by 4 bytes, then load out[offset]
+ add(rscratch1, rscratch1, rscratch2); // product = in[j] * k + carry + out[offset]
+ strw(rscratch1, Address(offset)); // out[offset] = (int)product
+ lsr(out, rscratch1, 32); // carry = product >>> 32
+ subs(len, len, 1); // one element done; sets the flags tested by the branch below
+ br(Assembler::NE, LOOP); // repeat until len reaches 0
+ BIND(END);
+}
+
/**
* Emits code to update CRC-32 with a byte value according to constants in table
*
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp Sat Sep 30 01:38:57 2017 +0000
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp Mon Oct 02 17:20:14 2017 +0300
@@ -1265,6 +1265,7 @@
void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
Register zlen, Register tmp1, Register tmp2, Register tmp3,
Register tmp4, Register tmp5, Register tmp6, Register tmp7);
+ void mul_add(Register out, Register in, Register offset, Register len, Register k); // emits the BigInteger::mulAdd loop; the carry is left in 'out'
// ISB may be needed because of a safepoint
void maybe_isb() { isb(); }
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp Sat Sep 30 01:38:57 2017 +0000
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp Mon Oct 02 17:20:14 2017 +0300
@@ -3607,6 +3607,63 @@
return start;
}
+ address generate_squareToLen() {
+ // Squares x by reusing multiply_to_len with y == x. The Java squareToLen
+ // algorithm for sizes 1..127 is faster than multiply_to_len on some CPUs and
+ // slower on others, but multiply_to_len shows slightly better overall results.
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "squareToLen");
+ address start = __ pc(); // start of the emitted stub; this is the value returned
+
+ const Register x = r0;
+ const Register xlen = r1;
+ const Register z = r2;
+ const Register zlen = r3;
+ const Register y = r4; // == x
+ const Register ylen = r5; // == xlen
+
+ const Register tmp1 = r10; // tmp1..tmp7 are handed to multiply_to_len as scratch registers
+ const Register tmp2 = r11;
+ const Register tmp3 = r12;
+ const Register tmp4 = r13;
+ const Register tmp5 = r14;
+ const Register tmp6 = r15;
+ const Register tmp7 = r16;
+
+ RegSet spilled_regs = RegSet::of(y, ylen); // r4/r5 are overwritten below, so they are saved and restored
+ BLOCK_COMMENT("Entry:");
+ __ enter(); // paired with leave() below
+ __ push(spilled_regs, sp);
+ __ mov(y, x); // second operand is the same array as the first
+ __ mov(ylen, xlen); // ... with the same length
+ __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+ __ pop(spilled_regs, sp); // restore the r4/r5 values saved above
+ __ leave();
+ __ ret(lr);
+ return start;
+ }
+
+ address generate_mulAdd() { // emits the "mulAdd" stub, a thin wrapper around MacroAssembler::mul_add
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "mulAdd");
+
+ address start = __ pc(); // start of the emitted stub; this is the value returned
+
+ const Register out = r0; // r0..r4: same assignment as listed in the comment on MacroAssembler::mul_add
+ const Register in = r1;
+ const Register offset = r2;
+ const Register len = r3;
+ const Register k = r4;
+
+ BLOCK_COMMENT("Entry:");
+ __ enter(); // paired with leave() below
+ __ mul_add(out, in, offset, len, k); // leaves the final carry in out (r0)
+ __ leave();
+ __ ret(lr);
+
+ return start;
+ }
+
void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
@@ -4913,6 +4970,14 @@
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
+ if (UseSquareToLenIntrinsic) {
+ StubRoutines::_squareToLen = generate_squareToLen();
+ }
+
+ if (UseMulAddIntrinsic) {
+ StubRoutines::_mulAdd = generate_mulAdd();
+ }
+
if (UseMontgomeryMultiplyIntrinsic) {
StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
--- a/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp Sat Sep 30 01:38:57 2017 +0000
+++ b/src/hotspot/cpu/aarch64/vm_version_aarch64.cpp Mon Oct 02 17:20:14 2017 +0300
@@ -340,6 +340,14 @@
UseMultiplyToLenIntrinsic = true;
}
+ if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
+ UseSquareToLenIntrinsic = true;
+ }
+
+ if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
+ UseMulAddIntrinsic = true;
+ }
+
if (FLAG_IS_DEFAULT(UseBarriersForVolatile)) {
UseBarriersForVolatile = (_features & CPU_DMB_ATOMICS) != 0;
}