--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad Fri Jul 24 00:54:05 2015 -0700
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad Mon Jul 27 09:42:23 2015 +0200
@@ -2167,8 +2167,12 @@
return 0; // Self copy, no move.
}
+ bool is64 = (src_lo & 1) == 0 && src_lo + 1 == src_hi &&
+ (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi;
+ int src_offset = ra_->reg2offset(src_lo);
+ int dst_offset = ra_->reg2offset(dst_lo);
+
if (bottom_type()->isa_vect() != NULL) {
- uint len = 4;
uint ireg = ideal_reg();
assert(ireg == Op_VecD || ireg == Op_VecX, "must be 64 bit or 128 bit vector");
if (cbuf) {
@@ -2176,334 +2180,115 @@
assert((src_lo_rc != rc_int && dst_lo_rc != rc_int), "sanity");
if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) {
// stack->stack
- int src_offset = ra_->reg2offset(src_lo);
- int dst_offset = ra_->reg2offset(dst_lo);
assert((src_offset & 7) && (dst_offset & 7), "unaligned stack offset");
- len = 8;
if (ireg == Op_VecD) {
- __ ldr(rscratch1, Address(sp, src_offset));
- __ str(rscratch1, Address(sp, dst_offset));
+ __ unspill(rscratch1, true, src_offset);
+ __ spill(rscratch1, true, dst_offset);
} else {
- if (src_offset < 512) {
- __ ldp(rscratch1, rscratch2, Address(sp, src_offset));
- } else {
- __ ldr(rscratch1, Address(sp, src_offset));
- __ ldr(rscratch2, Address(sp, src_offset+4));
- len += 4;
- }
- if (dst_offset < 512) {
- __ stp(rscratch1, rscratch2, Address(sp, dst_offset));
- } else {
- __ str(rscratch1, Address(sp, dst_offset));
- __ str(rscratch2, Address(sp, dst_offset+4));
- len += 4;
- }
+ __ spill_copy128(src_offset, dst_offset);
}
} else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) {
- __ orr(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+ __ mov(as_FloatRegister(Matcher::_regEncode[dst_lo]),
ireg == Op_VecD ? __ T8B : __ T16B,
- as_FloatRegister(Matcher::_regEncode[src_lo]),
as_FloatRegister(Matcher::_regEncode[src_lo]));
} else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) {
- __ str(as_FloatRegister(Matcher::_regEncode[src_lo]),
- ireg == Op_VecD ? __ D : __ Q,
- Address(sp, ra_->reg2offset(dst_lo)));
+ __ spill(as_FloatRegister(Matcher::_regEncode[src_lo]),
+ ireg == Op_VecD ? __ D : __ Q,
+ ra_->reg2offset(dst_lo));
} else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) {
- __ ldr(as_FloatRegister(Matcher::_regEncode[dst_lo]),
- ireg == Op_VecD ? __ D : __ Q,
- Address(sp, ra_->reg2offset(src_lo)));
+ __ unspill(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+ ireg == Op_VecD ? __ D : __ Q,
+ ra_->reg2offset(src_lo));
} else {
ShouldNotReachHere();
}
- } else if (st) {
- if (src_lo_rc == rc_stack && dst_lo_rc == rc_stack) {
- // stack->stack
- int src_offset = ra_->reg2offset(src_lo);
- int dst_offset = ra_->reg2offset(dst_lo);
- if (ireg == Op_VecD) {
- st->print("ldr rscratch1, [sp, #%d]", src_offset);
- st->print("str rscratch1, [sp, #%d]", dst_offset);
+ }
+ } else if (cbuf) {
+ MacroAssembler _masm(cbuf);
+ switch (src_lo_rc) {
+ case rc_int:
+ if (dst_lo_rc == rc_int) { // gpr --> gpr copy
+ if (is64) {
+ __ mov(as_Register(Matcher::_regEncode[dst_lo]),
+ as_Register(Matcher::_regEncode[src_lo]));
} else {
- if (src_offset < 512) {
- st->print("ldp rscratch1, rscratch2, [sp, #%d]", src_offset);
- } else {
- st->print("ldr rscratch1, [sp, #%d]", src_offset);
- st->print("\nldr rscratch2, [sp, #%d]", src_offset+4);
- }
- if (dst_offset < 512) {
- st->print("\nstp rscratch1, rscratch2, [sp, #%d]", dst_offset);
- } else {
- st->print("\nstr rscratch1, [sp, #%d]", dst_offset);
- st->print("\nstr rscratch2, [sp, #%d]", dst_offset+4);
- }
- }
- st->print("\t# vector spill, stack to stack");
- } else if (src_lo_rc == rc_float && dst_lo_rc == rc_float) {
- st->print("mov %s, %s\t# vector spill, reg to reg",
- Matcher::regName[dst_lo], Matcher::regName[src_lo]);
- } else if (src_lo_rc == rc_float && dst_lo_rc == rc_stack) {
- st->print("str %s, [sp, #%d]\t# vector spill, reg to stack",
- Matcher::regName[src_lo], ra_->reg2offset(dst_lo));
- } else if (src_lo_rc == rc_stack && dst_lo_rc == rc_float) {
- st->print("ldr %s, [sp, #%d]\t# vector spill, stack to reg",
- Matcher::regName[dst_lo], ra_->reg2offset(src_lo));
- }
- }
- return len;
- }
-
- switch (src_lo_rc) {
- case rc_int:
- if (dst_lo_rc == rc_int) { // gpr --> gpr copy
- if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
- (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
- // 64 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ mov(as_Register(Matcher::_regEncode[dst_lo]),
- as_Register(Matcher::_regEncode[src_lo]));
- } else if (st) {
- st->print("mov %s, %s\t# shuffle",
- Matcher::regName[dst_lo],
- Matcher::regName[src_lo]);
- }
- } else {
- // 32 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ movw(as_Register(Matcher::_regEncode[dst_lo]),
- as_Register(Matcher::_regEncode[src_lo]));
- } else if (st) {
- st->print("movw %s, %s\t# shuffle",
- Matcher::regName[dst_lo],
- Matcher::regName[src_lo]);
+ MacroAssembler _masm(cbuf);
+ __ movw(as_Register(Matcher::_regEncode[dst_lo]),
+ as_Register(Matcher::_regEncode[src_lo]));
}
- }
- } else if (dst_lo_rc == rc_float) { // gpr --> fpr copy
- if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
- (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
- // 64 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]),
- as_Register(Matcher::_regEncode[src_lo]));
- } else if (st) {
- st->print("fmovd %s, %s\t# shuffle",
- Matcher::regName[dst_lo],
- Matcher::regName[src_lo]);
- }
- } else {
- // 32 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]),
- as_Register(Matcher::_regEncode[src_lo]));
- } else if (st) {
- st->print("fmovs %s, %s\t# shuffle",
- Matcher::regName[dst_lo],
- Matcher::regName[src_lo]);
- }
- }
- } else { // gpr --> stack spill
- assert(dst_lo_rc == rc_stack, "spill to bad register class");
- int dst_offset = ra_->reg2offset(dst_lo);
- if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
- (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
- // 64 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ str(as_Register(Matcher::_regEncode[src_lo]),
- Address(sp, dst_offset));
- } else if (st) {
- st->print("str %s, [sp, #%d]\t# spill",
- Matcher::regName[src_lo],
- dst_offset);
- }
- } else {
- // 32 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ strw(as_Register(Matcher::_regEncode[src_lo]),
- Address(sp, dst_offset));
- } else if (st) {
- st->print("strw %s, [sp, #%d]\t# spill",
- Matcher::regName[src_lo],
- dst_offset);
- }
- }
- }
- return 4;
- case rc_float:
- if (dst_lo_rc == rc_int) { // fpr --> gpr copy
- if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
- (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
- // 64 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ fmovd(as_Register(Matcher::_regEncode[dst_lo]),
- as_FloatRegister(Matcher::_regEncode[src_lo]));
- } else if (st) {
- st->print("fmovd %s, %s\t# shuffle",
- Matcher::regName[dst_lo],
- Matcher::regName[src_lo]);
+ } else if (dst_lo_rc == rc_float) { // gpr --> fpr copy
+ if (is64) {
+ __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+ as_Register(Matcher::_regEncode[src_lo]));
+ } else {
+ __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+ as_Register(Matcher::_regEncode[src_lo]));
}
- } else {
- // 32 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ fmovs(as_Register(Matcher::_regEncode[dst_lo]),
- as_FloatRegister(Matcher::_regEncode[src_lo]));
- } else if (st) {
- st->print("fmovs %s, %s\t# shuffle",
- Matcher::regName[dst_lo],
- Matcher::regName[src_lo]);
- }
- }
- } else if (dst_lo_rc == rc_float) { // fpr --> fpr copy
- if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
- (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
- // 64 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]),
- as_FloatRegister(Matcher::_regEncode[src_lo]));
- } else if (st) {
- st->print("fmovd %s, %s\t# shuffle",
- Matcher::regName[dst_lo],
- Matcher::regName[src_lo]);
- }
- } else {
- // 32 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]),
- as_FloatRegister(Matcher::_regEncode[src_lo]));
- } else if (st) {
- st->print("fmovs %s, %s\t# shuffle",
- Matcher::regName[dst_lo],
- Matcher::regName[src_lo]);
- }
- }
- } else { // fpr --> stack spill
- assert(dst_lo_rc == rc_stack, "spill to bad register class");
- int dst_offset = ra_->reg2offset(dst_lo);
- if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
- (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
- // 64 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ strd(as_FloatRegister(Matcher::_regEncode[src_lo]),
- Address(sp, dst_offset));
- } else if (st) {
- st->print("strd %s, [sp, #%d]\t# spill",
- Matcher::regName[src_lo],
- dst_offset);
- }
- } else {
- // 32 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ strs(as_FloatRegister(Matcher::_regEncode[src_lo]),
- Address(sp, dst_offset));
- } else if (st) {
- st->print("strs %s, [sp, #%d]\t# spill",
- Matcher::regName[src_lo],
- dst_offset);
- }
+ } else { // gpr --> stack spill
+ assert(dst_lo_rc == rc_stack, "spill to bad register class");
+ __ spill(as_Register(Matcher::_regEncode[src_lo]), is64, dst_offset);
}
- }
- return 4;
- case rc_stack:
- int src_offset = ra_->reg2offset(src_lo);
- if (dst_lo_rc == rc_int) { // stack --> gpr load
- if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
- (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
- // 64 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ ldr(as_Register(Matcher::_regEncode[dst_lo]),
- Address(sp, src_offset));
- } else if (st) {
- st->print("ldr %s, [sp, %d]\t# restore",
- Matcher::regName[dst_lo],
- src_offset);
+ break;
+ case rc_float:
+ if (dst_lo_rc == rc_int) { // fpr --> gpr copy
+ if (is64) {
+ __ fmovd(as_Register(Matcher::_regEncode[dst_lo]),
+ as_FloatRegister(Matcher::_regEncode[src_lo]));
+ } else {
+ __ fmovs(as_Register(Matcher::_regEncode[dst_lo]),
+ as_FloatRegister(Matcher::_regEncode[src_lo]));
}
- } else {
- // 32 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ ldrw(as_Register(Matcher::_regEncode[dst_lo]),
- Address(sp, src_offset));
- } else if (st) {
- st->print("ldr %s, [sp, %d]\t# restore",
- Matcher::regName[dst_lo],
- src_offset);
- }
- }
- return 4;
- } else if (dst_lo_rc == rc_float) { // stack --> fpr load
- if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
- (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
- // 64 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ ldrd(as_FloatRegister(Matcher::_regEncode[dst_lo]),
- Address(sp, src_offset));
- } else if (st) {
- st->print("ldrd %s, [sp, %d]\t# restore",
- Matcher::regName[dst_lo],
- src_offset);
+ } else if (dst_lo_rc == rc_float) { // fpr --> fpr copy
+ if (cbuf) {
+ __ fmovd(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+ as_FloatRegister(Matcher::_regEncode[src_lo]));
+ } else {
+ __ fmovs(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+ as_FloatRegister(Matcher::_regEncode[src_lo]));
}
- } else {
- // 32 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ ldrs(as_FloatRegister(Matcher::_regEncode[dst_lo]),
- Address(sp, src_offset));
- } else if (st) {
- st->print("ldrs %s, [sp, %d]\t# restore",
- Matcher::regName[dst_lo],
- src_offset);
- }
+ } else { // fpr --> stack spill
+ assert(dst_lo_rc == rc_stack, "spill to bad register class");
+ __ spill(as_FloatRegister(Matcher::_regEncode[src_lo]),
+ is64 ? __ D : __ S, dst_offset);
}
- return 4;
- } else { // stack --> stack copy
- assert(dst_lo_rc == rc_stack, "spill to bad register class");
- int dst_offset = ra_->reg2offset(dst_lo);
- if (((src_lo & 1) == 0 && src_lo + 1 == src_hi) &&
- (dst_lo & 1) == 0 && dst_lo + 1 == dst_hi) {
- // 64 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ ldr(rscratch1, Address(sp, src_offset));
- __ str(rscratch1, Address(sp, dst_offset));
- } else if (st) {
- st->print("ldr rscratch1, [sp, %d]\t# mem-mem spill",
- src_offset);
- st->print("\n\t");
- st->print("str rscratch1, [sp, %d]",
- dst_offset);
- }
- } else {
- // 32 bit
- if (cbuf) {
- MacroAssembler _masm(cbuf);
- __ ldrw(rscratch1, Address(sp, src_offset));
- __ strw(rscratch1, Address(sp, dst_offset));
- } else if (st) {
- st->print("ldrw rscratch1, [sp, %d]\t# mem-mem spill",
- src_offset);
- st->print("\n\t");
- st->print("strw rscratch1, [sp, %d]",
- dst_offset);
- }
+ break;
+ case rc_stack:
+ if (dst_lo_rc == rc_int) { // stack --> gpr load
+ __ unspill(as_Register(Matcher::_regEncode[dst_lo]), is64, src_offset);
+ } else if (dst_lo_rc == rc_float) { // stack --> fpr load
+ __ unspill(as_FloatRegister(Matcher::_regEncode[dst_lo]),
+ is64 ? __ D : __ S, src_offset);
+ } else { // stack --> stack copy
+ assert(dst_lo_rc == rc_stack, "spill to bad register class");
+ __ unspill(rscratch1, is64, src_offset);
+ __ spill(rscratch1, is64, dst_offset);
}
- return 8;
+ break;
+ default:
+ assert(false, "bad rc_class for spill");
+ ShouldNotReachHere();
}
}
- assert(false," bad rc_class for spill ");
- Unimplemented();
+ if (st) {
+ st->print("spill ");
+ if (src_lo_rc == rc_stack) {
+ st->print("[sp, #%d] -> ", ra_->reg2offset(src_lo));
+ } else {
+ st->print("%s -> ", Matcher::regName[src_lo]);
+ }
+ if (dst_lo_rc == rc_stack) {
+ st->print("[sp, #%d]", ra_->reg2offset(dst_lo));
+ } else {
+ st->print("%s", Matcher::regName[dst_lo]);
+ }
+ if (bottom_type()->isa_vect() != NULL) {
+ st->print("\t# vector spill size = %d", ideal_reg()==Op_VecD ? 64:128);
+ } else {
+ st->print("\t# spill size = %d", is64 ? 64:32);
+ }
+ }
+
return 0;
}
@@ -2522,7 +2307,7 @@
}
uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const {
- return implementation(NULL, ra_, true, NULL);
+ return MachNode::size(ra_);
}
//=============================================================================
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Fri Jul 24 00:54:05 2015 -0700
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Mon Jul 27 09:42:23 2015 +0200
@@ -120,10 +120,8 @@
// we save r19-r28 which Java uses as scratch registers and C
// expects to be callee-save
//
- // we don't save any FP registers since only v8-v15 are callee-save
- // (strictly only the f and d components) and Java uses them as
- // callee-save. v0-v7 are arg registers and C treats v16-v31 as
- // volatile (as does Java?)
+ // we save the bottom 64 bits of each value stored in v8-v15; it is
+ // the responsibility of the caller to preserve larger values.
//
// so the stub frame looks like this when we enter Java code
//
@@ -131,14 +129,14 @@
// [ argument word n ]
// ...
// -27 [ argument word 1 ]
- // -26 [ saved d15 ] <--- sp_after_call
- // -25 [ saved d14 ]
- // -24 [ saved d13 ]
- // -23 [ saved d12 ]
- // -22 [ saved d11 ]
- // -21 [ saved d10 ]
- // -20 [ saved d9 ]
- // -19 [ saved d8 ]
+ // -26 [ saved v15 ] <--- sp_after_call
+ // -25 [ saved v14 ]
+ // -24 [ saved v13 ]
+ // -23 [ saved v12 ]
+ // -22 [ saved v11 ]
+ // -21 [ saved v10 ]
+ // -20 [ saved v9 ]
+ // -19 [ saved v8 ]
// -18 [ saved r28 ]
// -17 [ saved r27 ]
// -16 [ saved r26 ]
@@ -2544,6 +2542,828 @@
return stub->entry_point();
}
+ class MontgomeryMultiplyGenerator : public MacroAssembler {
+
+ Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
+ Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
+
+ RegSet _toSave;
+ bool _squaring;
+
+ public:
+ MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
+ : MacroAssembler(as->code()), _squaring(squaring) {
+
+ // Register allocation
+
+ Register reg = c_rarg0;
+ Pa_base = reg; // Argument registers
+ if (squaring)
+ Pb_base = Pa_base;
+ else
+ Pb_base = ++reg;
+ Pn_base = ++reg;
+ Rlen= ++reg;
+ inv = ++reg;
+ Pm_base = ++reg;
+
+ // Working registers:
+ Ra = ++reg; // The current digit of a, b, n, and m.
+ Rb = ++reg;
+ Rm = ++reg;
+ Rn = ++reg;
+
+ Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
+ Pb = ++reg;
+ Pm = ++reg;
+ Pn = ++reg;
+
+ t0 = ++reg; // Three registers which form a
+ t1 = ++reg; // triple-precision accumuator.
+ t2 = ++reg;
+
+ Ri = ++reg; // Inner and outer loop indexes.
+ Rj = ++reg;
+
+ Rhi_ab = ++reg; // Product registers: low and high parts
+ Rlo_ab = ++reg; // of a*b and m*n.
+ Rhi_mn = ++reg;
+ Rlo_mn = ++reg;
+
+ // r19 and up are callee-saved.
+ _toSave = RegSet::range(r19, reg) + Pm_base;
+ }
+
+ private:
+ void save_regs() {
+ push(_toSave, sp);
+ }
+
+ void restore_regs() {
+ pop(_toSave, sp);
+ }
+
+ template <typename T>
+ void unroll_2(Register count, T block) {
+ Label loop, end, odd;
+ tbnz(count, 0, odd);
+ cbz(count, end);
+ align(16);
+ bind(loop);
+ (this->*block)();
+ bind(odd);
+ (this->*block)();
+ subs(count, count, 2);
+ br(Assembler::GT, loop);
+ bind(end);
+ }
+
+ template <typename T>
+ void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
+ Label loop, end, odd;
+ tbnz(count, 0, odd);
+ cbz(count, end);
+ align(16);
+ bind(loop);
+ (this->*block)(d, s, tmp);
+ bind(odd);
+ (this->*block)(d, s, tmp);
+ subs(count, count, 2);
+ br(Assembler::GT, loop);
+ bind(end);
+ }
+
+ void pre1(RegisterOrConstant i) {
+ block_comment("pre1");
+ // Pa = Pa_base;
+ // Pb = Pb_base + i;
+ // Pm = Pm_base;
+ // Pn = Pn_base + i;
+ // Ra = *Pa;
+ // Rb = *Pb;
+ // Rm = *Pm;
+ // Rn = *Pn;
+ ldr(Ra, Address(Pa_base));
+ ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
+ ldr(Rm, Address(Pm_base));
+ ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
+ lea(Pa, Address(Pa_base));
+ lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
+ lea(Pm, Address(Pm_base));
+ lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
+
+ // Zero the m*n result.
+ mov(Rhi_mn, zr);
+ mov(Rlo_mn, zr);
+ }
+
+ // The core multiply-accumulate step of a Montgomery
+ // multiplication. The idea is to schedule operations as a
+ // pipeline so that instructions with long latencies (loads and
+ // multiplies) have time to complete before their results are
+ // used. This most benefits in-order implementations of the
+ // architecture but out-of-order ones also benefit.
+ void step() {
+ block_comment("step");
+ // MACC(Ra, Rb, t0, t1, t2);
+ // Ra = *++Pa;
+ // Rb = *--Pb;
+ umulh(Rhi_ab, Ra, Rb);
+ mul(Rlo_ab, Ra, Rb);
+ ldr(Ra, pre(Pa, wordSize));
+ ldr(Rb, pre(Pb, -wordSize));
+ acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
+ // previous iteration.
+ // MACC(Rm, Rn, t0, t1, t2);
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+ umulh(Rhi_mn, Rm, Rn);
+ mul(Rlo_mn, Rm, Rn);
+ ldr(Rm, pre(Pm, wordSize));
+ ldr(Rn, pre(Pn, -wordSize));
+ acc(Rhi_ab, Rlo_ab, t0, t1, t2);
+ }
+
+ void post1() {
+ block_comment("post1");
+
+ // MACC(Ra, Rb, t0, t1, t2);
+ // Ra = *++Pa;
+ // Rb = *--Pb;
+ umulh(Rhi_ab, Ra, Rb);
+ mul(Rlo_ab, Ra, Rb);
+ acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
+ acc(Rhi_ab, Rlo_ab, t0, t1, t2);
+
+ // *Pm = Rm = t0 * inv;
+ mul(Rm, t0, inv);
+ str(Rm, Address(Pm));
+
+ // MACC(Rm, Rn, t0, t1, t2);
+ // t0 = t1; t1 = t2; t2 = 0;
+ umulh(Rhi_mn, Rm, Rn);
+
+#ifndef PRODUCT
+ // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
+ {
+ mul(Rlo_mn, Rm, Rn);
+ add(Rlo_mn, t0, Rlo_mn);
+ Label ok;
+ cbz(Rlo_mn, ok); {
+ stop("broken Montgomery multiply");
+ } bind(ok);
+ }
+#endif
+ // We have very carefully set things up so that
+ // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
+ // the lower half of Rm * Rn because we know the result already:
+ // it must be -t0. t0 + (-t0) must generate a carry iff
+ // t0 != 0. So, rather than do a mul and an adds we just set
+ // the carry flag iff t0 is nonzero.
+ //
+ // mul(Rlo_mn, Rm, Rn);
+ // adds(zr, t0, Rlo_mn);
+ subs(zr, t0, 1); // Set carry iff t0 is nonzero
+ adcs(t0, t1, Rhi_mn);
+ adc(t1, t2, zr);
+ mov(t2, zr);
+ }
+
+ void pre2(RegisterOrConstant i, RegisterOrConstant len) {
+ block_comment("pre2");
+ // Pa = Pa_base + i-len;
+ // Pb = Pb_base + len;
+ // Pm = Pm_base + i-len;
+ // Pn = Pn_base + len;
+
+ if (i.is_register()) {
+ sub(Rj, i.as_register(), len);
+ } else {
+ mov(Rj, i.as_constant());
+ sub(Rj, Rj, len);
+ }
+ // Rj == i-len
+
+ lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
+ lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
+ lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
+ lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
+
+ // Ra = *++Pa;
+ // Rb = *--Pb;
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+ ldr(Ra, pre(Pa, wordSize));
+ ldr(Rb, pre(Pb, -wordSize));
+ ldr(Rm, pre(Pm, wordSize));
+ ldr(Rn, pre(Pn, -wordSize));
+
+ mov(Rhi_mn, zr);
+ mov(Rlo_mn, zr);
+ }
+
+ void post2(RegisterOrConstant i, RegisterOrConstant len) {
+ block_comment("post2");
+ if (i.is_constant()) {
+ mov(Rj, i.as_constant()-len.as_constant());
+ } else {
+ sub(Rj, i.as_register(), len);
+ }
+
+ adds(t0, t0, Rlo_mn); // The pending m*n, low part
+
+ // As soon as we know the least significant digit of our result,
+ // store it.
+ // Pm_base[i-len] = t0;
+ str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
+
+ // t0 = t1; t1 = t2; t2 = 0;
+ adcs(t0, t1, Rhi_mn); // The pending m*n, high part
+ adc(t1, t2, zr);
+ mov(t2, zr);
+ }
+
+ // A carry in t0 after Montgomery multiplication means that we
+ // should subtract multiples of n from our result in m. We'll
+ // keep doing that until there is no carry.
+ void normalize(RegisterOrConstant len) {
+ block_comment("normalize");
+ // while (t0)
+ // t0 = sub(Pm_base, Pn_base, t0, len);
+ Label loop, post, again;
+ Register cnt = t1, i = t2; // Re-use registers; we're done with them now
+ cbz(t0, post); {
+ bind(again); {
+ mov(i, zr);
+ mov(cnt, len);
+ ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
+ ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
+ subs(zr, zr, zr); // set carry flag, i.e. no borrow
+ align(16);
+ bind(loop); {
+ sbcs(Rm, Rm, Rn);
+ str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
+ add(i, i, 1);
+ ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
+ ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
+ sub(cnt, cnt, 1);
+ } cbnz(cnt, loop);
+ sbc(t0, t0, zr);
+ } cbnz(t0, again);
+ } bind(post);
+ }
+
+ // Move memory at s to d, reversing words.
+ // Increments d to end of copied memory
+ // Destroys tmp1, tmp2
+ // Preserves len
+ // Leaves s pointing to the address which was in d at start
+ void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
+ assert(tmp1 < r19 && tmp2 < r19, "register corruption");
+
+ lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
+ mov(tmp1, len);
+ unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
+ sub(s, d, len, ext::uxtw, LogBytesPerWord);
+ }
+ // where
+ void reverse1(Register d, Register s, Register tmp) {
+ ldr(tmp, pre(s, -wordSize));
+ ror(tmp, tmp, 32);
+ str(tmp, post(d, wordSize));
+ }
+
+ void step_squaring() {
+ // An extra ACC
+ step();
+ acc(Rhi_ab, Rlo_ab, t0, t1, t2);
+ }
+
+ void last_squaring(RegisterOrConstant i) {
+ Label dont;
+ // if ((i & 1) == 0) {
+ tbnz(i.as_register(), 0, dont); {
+ // MACC(Ra, Rb, t0, t1, t2);
+ // Ra = *++Pa;
+ // Rb = *--Pb;
+ umulh(Rhi_ab, Ra, Rb);
+ mul(Rlo_ab, Ra, Rb);
+ acc(Rhi_ab, Rlo_ab, t0, t1, t2);
+ } bind(dont);
+ }
+
+ void extra_step_squaring() {
+ acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
+
+ // MACC(Rm, Rn, t0, t1, t2);
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+ umulh(Rhi_mn, Rm, Rn);
+ mul(Rlo_mn, Rm, Rn);
+ ldr(Rm, pre(Pm, wordSize));
+ ldr(Rn, pre(Pn, -wordSize));
+ }
+
+ void post1_squaring() {
+ acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
+
+ // *Pm = Rm = t0 * inv;
+ mul(Rm, t0, inv);
+ str(Rm, Address(Pm));
+
+ // MACC(Rm, Rn, t0, t1, t2);
+ // t0 = t1; t1 = t2; t2 = 0;
+ umulh(Rhi_mn, Rm, Rn);
+
+#ifndef PRODUCT
+ // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
+ {
+ mul(Rlo_mn, Rm, Rn);
+ add(Rlo_mn, t0, Rlo_mn);
+ Label ok;
+ cbz(Rlo_mn, ok); {
+ stop("broken Montgomery multiply");
+ } bind(ok);
+ }
+#endif
+ // We have very carefully set things up so that
+ // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
+ // the lower half of Rm * Rn because we know the result already:
+ // it must be -t0. t0 + (-t0) must generate a carry iff
+ // t0 != 0. So, rather than do a mul and an adds we just set
+ // the carry flag iff t0 is nonzero.
+ //
+ // mul(Rlo_mn, Rm, Rn);
+ // adds(zr, t0, Rlo_mn);
+ subs(zr, t0, 1); // Set carry iff t0 is nonzero
+ adcs(t0, t1, Rhi_mn);
+ adc(t1, t2, zr);
+ mov(t2, zr);
+ }
+
+ void acc(Register Rhi, Register Rlo,
+ Register t0, Register t1, Register t2) {
+ adds(t0, t0, Rlo);
+ adcs(t1, t1, Rhi);
+ adc(t2, t2, zr);
+ }
+
+ public:
+ /**
+ * Fast Montgomery multiplication. The derivation of the
+ * algorithm is in A Cryptographic Library for the Motorola
+ * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
+ *
+ * Arguments:
+ *
+ * Inputs for multiplication:
+ * c_rarg0 - int array elements a
+ * c_rarg1 - int array elements b
+ * c_rarg2 - int array elements n (the modulus)
+ * c_rarg3 - int length
+ * c_rarg4 - int inv
+ * c_rarg5 - int array elements m (the result)
+ *
+ * Inputs for squaring:
+ * c_rarg0 - int array elements a
+ * c_rarg1 - int array elements n (the modulus)
+ * c_rarg2 - int length
+ * c_rarg3 - int inv
+ * c_rarg4 - int array elements m (the result)
+ *
+ */
+ address generate_multiply() {
+ Label argh, nothing;
+ bind(argh);
+ stop("MontgomeryMultiply total_allocation must be <= 8192");
+
+ align(CodeEntryAlignment);
+ address entry = pc();
+
+ cbzw(Rlen, nothing);
+
+ enter();
+
+ // Make room.
+ cmpw(Rlen, 512);
+ br(Assembler::HI, argh);
+ sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
+ andr(sp, Ra, -2 * wordSize);
+
+ lsrw(Rlen, Rlen, 1); // length in longwords = len/2
+
+ {
+ // Copy input args, reversing as we go. We use Ra as a
+ // temporary variable.
+ reverse(Ra, Pa_base, Rlen, t0, t1);
+ if (!_squaring)
+ reverse(Ra, Pb_base, Rlen, t0, t1);
+ reverse(Ra, Pn_base, Rlen, t0, t1);
+ }
+
+ // Push all call-saved registers and also Pm_base which we'll need
+ // at the end.
+ save_regs();
+
+#ifndef PRODUCT
+ // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+ {
+ ldr(Rn, Address(Pn_base, 0));
+ mul(Rlo_mn, Rn, inv);
+ cmp(Rlo_mn, -1);
+ Label ok;
+ br(EQ, ok); {
+ stop("broken inverse in Montgomery multiply");
+ } bind(ok);
+ }
+#endif
+
+ mov(Pm_base, Ra);
+
+ mov(t0, zr);
+ mov(t1, zr);
+ mov(t2, zr);
+
+ block_comment("for (int i = 0; i < len; i++) {");
+ mov(Ri, zr); {
+ Label loop, end;
+ cmpw(Ri, Rlen);
+ br(Assembler::GE, end);
+
+ bind(loop);
+ pre1(Ri);
+
+ block_comment(" for (j = i; j; j--) {"); {
+ movw(Rj, Ri);
+ unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
+ } block_comment(" } // j");
+
+ post1();
+ addw(Ri, Ri, 1);
+ cmpw(Ri, Rlen);
+ br(Assembler::LT, loop);
+ bind(end);
+ block_comment("} // i");
+ }
+
+ block_comment("for (int i = len; i < 2*len; i++) {");
+ mov(Ri, Rlen); {
+ Label loop, end;
+ cmpw(Ri, Rlen, Assembler::LSL, 1);
+ br(Assembler::GE, end);
+
+ bind(loop);
+ pre2(Ri, Rlen);
+
+ block_comment(" for (j = len*2-i-1; j; j--) {"); {
+ lslw(Rj, Rlen, 1);
+ subw(Rj, Rj, Ri);
+ subw(Rj, Rj, 1);
+ unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
+ } block_comment(" } // j");
+
+ post2(Ri, Rlen);
+ addw(Ri, Ri, 1);
+ cmpw(Ri, Rlen, Assembler::LSL, 1);
+ br(Assembler::LT, loop);
+ bind(end);
+ }
+ block_comment("} // i");
+
+ normalize(Rlen);
+
+ mov(Ra, Pm_base); // Save Pm_base in Ra
+ restore_regs(); // Restore caller's Pm_base
+
+ // Copy our result into caller's Pm_base
+ reverse(Pm_base, Ra, Rlen, t0, t1);
+
+ leave();
+ bind(nothing);
+ ret(lr);
+
+ return entry;
+ }
+ // In C, approximately:
+
+ // void
+ // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
+ // unsigned long Pn_base[], unsigned long Pm_base[],
+ // unsigned long inv, int len) {
+ // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+ // unsigned long *Pa, *Pb, *Pn, *Pm;
+ // unsigned long Ra, Rb, Rn, Rm;
+
+ // int i;
+
+ // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
+
+ // for (i = 0; i < len; i++) {
+ // int j;
+
+ // Pa = Pa_base;
+ // Pb = Pb_base + i;
+ // Pm = Pm_base;
+ // Pn = Pn_base + i;
+
+ // Ra = *Pa;
+ // Rb = *Pb;
+ // Rm = *Pm;
+ // Rn = *Pn;
+
+ // int iters = i;
+ // for (j = 0; iters--; j++) {
+ // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
+ // MACC(Ra, Rb, t0, t1, t2);
+ // Ra = *++Pa;
+ // Rb = *--Pb;
+ // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
+ // MACC(Rm, Rn, t0, t1, t2);
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+ // }
+
+ // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
+ // MACC(Ra, Rb, t0, t1, t2);
+ // *Pm = Rm = t0 * inv;
+ // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
+ // MACC(Rm, Rn, t0, t1, t2);
+
+ // assert(t0 == 0, "broken Montgomery multiply");
+
+ // t0 = t1; t1 = t2; t2 = 0;
+ // }
+
+ // for (i = len; i < 2*len; i++) {
+ // int j;
+
+ // Pa = Pa_base + i-len;
+ // Pb = Pb_base + len;
+ // Pm = Pm_base + i-len;
+ // Pn = Pn_base + len;
+
+ // Ra = *++Pa;
+ // Rb = *--Pb;
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+
+ // int iters = len*2-i-1;
+ // for (j = i-len+1; iters--; j++) {
+ // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
+ // MACC(Ra, Rb, t0, t1, t2);
+ // Ra = *++Pa;
+ // Rb = *--Pb;
+ // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
+ // MACC(Rm, Rn, t0, t1, t2);
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+ // }
+
+ // Pm_base[i-len] = t0;
+ // t0 = t1; t1 = t2; t2 = 0;
+ // }
+
+ // while (t0)
+ // t0 = sub(Pm_base, Pn_base, t0, len);
+ // }
+
+ /**
+ * Fast Montgomery squaring. This uses asymptotically 25% fewer
+ * multiplies than Montgomery multiplication so it should be up to
+ * 25% faster. However, its loop control is more complex and it
+ * may actually run slower on some machines.
+ *
+ * Arguments:
+ *
+ * Inputs:
+ * c_rarg0 - int array elements a
+ * c_rarg1 - int array elements n (the modulus)
+ * c_rarg2 - int length
+ * c_rarg3 - int inv
+ * c_rarg4 - int array elements m (the result)
+ *
+ */
+ address generate_square() {
+ Label argh;
+ bind(argh);
+ stop("MontgomeryMultiply total_allocation must be <= 8192");
+
+ align(CodeEntryAlignment);
+ address entry = pc();
+
+ enter();
+
+ // Make room.
+ cmpw(Rlen, 512);
+ br(Assembler::HI, argh);
+ sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
+ andr(sp, Ra, -2 * wordSize);
+
+ lsrw(Rlen, Rlen, 1); // length in longwords = len/2
+
+ {
+ // Copy input args, reversing as we go. We use Ra as a
+ // temporary variable.
+ reverse(Ra, Pa_base, Rlen, t0, t1);
+ reverse(Ra, Pn_base, Rlen, t0, t1);
+ }
+
+ // Push all call-saved registers and also Pm_base which we'll need
+ // at the end.
+ save_regs();
+
+ mov(Pm_base, Ra);
+
+ mov(t0, zr);
+ mov(t1, zr);
+ mov(t2, zr);
+
+ block_comment("for (int i = 0; i < len; i++) {");
+ mov(Ri, zr); {
+ Label loop, end;
+ bind(loop);
+ cmp(Ri, Rlen);
+ br(Assembler::GE, end);
+
+ pre1(Ri);
+
+ block_comment("for (j = (i+1)/2; j; j--) {"); {
+ add(Rj, Ri, 1);
+ lsr(Rj, Rj, 1);
+ unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
+ } block_comment(" } // j");
+
+ last_squaring(Ri);
+
+ block_comment(" for (j = i/2; j; j--) {"); {
+ lsr(Rj, Ri, 1);
+ unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
+ } block_comment(" } // j");
+
+ post1_squaring();
+ add(Ri, Ri, 1);
+ cmp(Ri, Rlen);
+ br(Assembler::LT, loop);
+
+ bind(end);
+ block_comment("} // i");
+ }
+
+ block_comment("for (int i = len; i < 2*len; i++) {");
+ mov(Ri, Rlen); {
+ Label loop, end;
+ bind(loop);
+ cmp(Ri, Rlen, Assembler::LSL, 1);
+ br(Assembler::GE, end);
+
+ pre2(Ri, Rlen);
+
+ block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
+ lsl(Rj, Rlen, 1);
+ sub(Rj, Rj, Ri);
+ sub(Rj, Rj, 1);
+ lsr(Rj, Rj, 1);
+ unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
+ } block_comment(" } // j");
+
+ last_squaring(Ri);
+
+ block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
+ lsl(Rj, Rlen, 1);
+ sub(Rj, Rj, Ri);
+ lsr(Rj, Rj, 1);
+ unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
+ } block_comment(" } // j");
+
+ post2(Ri, Rlen);
+ add(Ri, Ri, 1);
+ cmp(Ri, Rlen, Assembler::LSL, 1);
+
+ br(Assembler::LT, loop);
+ bind(end);
+ block_comment("} // i");
+ }
+
+ normalize(Rlen);
+
+ mov(Ra, Pm_base); // Save Pm_base in Ra
+ restore_regs(); // Restore caller's Pm_base
+
+ // Copy our result into caller's Pm_base
+ reverse(Pm_base, Ra, Rlen, t0, t1);
+
+ leave();
+ ret(lr);
+
+ return entry;
+ }
+ // In C, approximately:
+
+ // void
+ // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
+ // unsigned long Pm_base[], unsigned long inv, int len) {
+ // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+ // unsigned long *Pa, *Pb, *Pn, *Pm;
+ // unsigned long Ra, Rb, Rn, Rm;
+
+ // int i;
+
+ // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
+
+ // for (i = 0; i < len; i++) {
+ // int j;
+
+ // Pa = Pa_base;
+ // Pb = Pa_base + i;
+ // Pm = Pm_base;
+ // Pn = Pn_base + i;
+
+ // Ra = *Pa;
+ // Rb = *Pb;
+ // Rm = *Pm;
+ // Rn = *Pn;
+
+ // int iters = (i+1)/2;
+ // for (j = 0; iters--; j++) {
+ // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
+ // MACC2(Ra, Rb, t0, t1, t2);
+ // Ra = *++Pa;
+ // Rb = *--Pb;
+ // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
+ // MACC(Rm, Rn, t0, t1, t2);
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+ // }
+ // if ((i & 1) == 0) {
+ // assert(Ra == Pa_base[j], "must be");
+ // MACC(Ra, Ra, t0, t1, t2);
+ // }
+ // iters = i/2;
+ // assert(iters == i-j, "must be");
+ // for (; iters--; j++) {
+ // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
+ // MACC(Rm, Rn, t0, t1, t2);
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+ // }
+
+ // *Pm = Rm = t0 * inv;
+ // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
+ // MACC(Rm, Rn, t0, t1, t2);
+
+ // assert(t0 == 0, "broken Montgomery multiply");
+
+ // t0 = t1; t1 = t2; t2 = 0;
+ // }
+
+ // for (i = len; i < 2*len; i++) {
+ // int start = i-len+1;
+ // int end = start + (len - start)/2;
+ // int j;
+
+ // Pa = Pa_base + i-len;
+ // Pb = Pa_base + len;
+ // Pm = Pm_base + i-len;
+ // Pn = Pn_base + len;
+
+ // Ra = *++Pa;
+ // Rb = *--Pb;
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+
+ // int iters = (2*len-i-1)/2;
+ // assert(iters == end-start, "must be");
+ // for (j = start; iters--; j++) {
+ // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
+ // MACC2(Ra, Rb, t0, t1, t2);
+ // Ra = *++Pa;
+ // Rb = *--Pb;
+ // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
+ // MACC(Rm, Rn, t0, t1, t2);
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+ // }
+ // if ((i & 1) == 0) {
+ // assert(Ra == Pa_base[j], "must be");
+ // MACC(Ra, Ra, t0, t1, t2);
+ // }
+ // iters = (2*len-i)/2;
+ // assert(iters == len-j, "must be");
+ // for (; iters--; j++) {
+ // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
+ // MACC(Rm, Rn, t0, t1, t2);
+ // Rm = *++Pm;
+ // Rn = *--Pn;
+ // }
+ // Pm_base[i-len] = t0;
+ // t0 = t1; t1 = t2; t2 = 0;
+ // }
+
+ // while (t0)
+ // t0 = sub(Pm_base, Pn_base, t0, len);
+ // }
+ };
+
// Initialization
void generate_initial() {
// Generate initial stubs and initializes the entry points
@@ -2603,6 +3423,20 @@
StubRoutines::_multiplyToLen = generate_multiplyToLen();
}
+ if (UseMontgomeryMultiplyIntrinsic) {
+ StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
+ MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
+ StubRoutines::_montgomeryMultiply = g.generate_multiply();
+ }
+
+ if (UseMontgomerySquareIntrinsic) {
+ StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
+ MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
+ // We use generate_multiply() rather than generate_square()
+ // because it's faster for the sizes of modulus we care about.
+ StubRoutines::_montgomerySquare = g.generate_multiply();
+ }
+
#ifndef BUILTIN_SIM
if (UseAESIntrinsics) {
StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();