# HG changeset patch
# User goetz
# Date 1506325063 -7200
# Node ID 122833427b36ca843758ee93d439e2d33968c24e
# Parent  50790528dd2557b19e5ab779f02b883427322466
8185976: PPC64: Implement MulAdd and SquareToLen intrinsics
Summary: This implementation is based on the algorithm implemented in Java.
It yields a performance speedup of 23% on JDK8 and 5% on JDK9 and JDK10.
Reviewed-by: mdoerr, goetz
Contributed-by: gustavo.scalet@eldorado.org.br

diff -r 50790528dd25 -r 122833427b36 src/hotspot/cpu/ppc/assembler_ppc.hpp
--- a/src/hotspot/cpu/ppc/assembler_ppc.hpp	Mon Sep 25 08:43:43 2017 +0200
+++ b/src/hotspot/cpu/ppc/assembler_ppc.hpp	Mon Sep 25 09:37:43 2017 +0200
@@ -1308,6 +1308,7 @@
   inline void li(   Register d, int si16);
   inline void lis(  Register d, int si16);
   inline void addir(Register d, int si16, Register a);
+  inline void subi( Register d, Register a, int si16);
 
   static bool is_addi(int x) {
      return ADDI_OPCODE == (x & ADDI_OPCODE_MASK);
diff -r 50790528dd25 -r 122833427b36 src/hotspot/cpu/ppc/assembler_ppc.inline.hpp
--- a/src/hotspot/cpu/ppc/assembler_ppc.inline.hpp	Mon Sep 25 08:43:43 2017 +0200
+++ b/src/hotspot/cpu/ppc/assembler_ppc.inline.hpp	Mon Sep 25 09:37:43 2017 +0200
@@ -164,6 +164,7 @@
 inline void Assembler::li(   Register d, int si16)             { Assembler::addi_r0ok( d, R0, si16); }
 inline void Assembler::lis(  Register d, int si16)             { Assembler::addis_r0ok(d, R0, si16); }
 inline void Assembler::addir(Register d, int si16, Register a) { Assembler::addi(d, a, si16); }
+inline void Assembler::subi( Register d, Register a, int si16) { Assembler::addi(d, a, -si16); }
 
 // PPC 1, section 3.3.9, Fixed-Point Compare Instructions
 inline void Assembler::cmpi( ConditionRegister f, int l, Register a, int si16) { emit_int32( CMPI_OPCODE | bf(f) | l10(l) | ra(a) | simm(si16,16)); }
diff -r 50790528dd25 -r 122833427b36 src/hotspot/cpu/ppc/macroAssembler_ppc.cpp
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Mon Sep 25 08:43:43 2017 +0200
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.cpp	Mon Sep 25 09:37:43 2017 +0200
@@ -5234,6 +5234,40 @@
   bind(L_post_third_loop_done);
 }   // multiply_128_x_128_loop
 
+// out[..offset] += in[0..len-1] * k, on 32-bit words; returns the final
+// carry word in the carry register.
+void MacroAssembler::muladd(Register out, Register in,
+                            Register offset, Register len, Register k,
+                            Register tmp1, Register tmp2, Register carry) {
+
+  // Labels
+  Label LOOP, SKIP;
+
+  // Make sure length is positive.
+  cmpdi (CCR0, len, 0);
+
+  // Prepare variables
+  subi  (offset, offset, 4);
+  li    (carry, 0);
+  ble   (CCR0, SKIP);
+
+  mtctr (len);
+  subi  (len, len, 1);
+  sldi  (len, len, 2);          // byte offset of in[len-1]
+
+  // Main loop
+  bind(LOOP);
+  lwzx  (tmp1, len, in);        // tmp1 = in[i]
+  lwzx  (tmp2, offset, out);    // tmp2 = out[offset]
+  mulld (tmp1, tmp1, k);        // tmp1 = in[i] * k
+  add   (tmp2, carry, tmp2);    // add the carry from the previous round
+  add   (tmp2, tmp1, tmp2);     // tmp2 = in[i] * k + out[offset] + carry
+  stwx  (tmp2, offset, out);    // store the low word
+  srdi  (carry, tmp2, 32);      // carry = high word
+  subi  (offset, offset, 4);
+  subi  (len, len, 4);
+  bdnz  (LOOP);
+
+  bind(SKIP);
+}
+
 void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                      Register y, Register ylen,
                                      Register z, Register zlen,
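For reference, the scalar loop the new muladd macro mirrors is the one in java.math.BigInteger.mulAdd, which the summary names as the basis of this implementation. A lightly condensed Java sketch (treat details such as the offset flip as illustrative rather than a verbatim copy):

    // out[...] += in[0..len-1] * k on 32-bit words, least significant
    // word at the highest index; returns the final carry word.
    static int mulAdd(int[] out, int[] in, int offset, int len, int k) {
        final long LONG_MASK = 0xFFFFFFFFL;
        long kLong = k & LONG_MASK;
        long carry = 0;
        offset = out.length - offset - 1;       // flip to an index from the end
        for (int j = len - 1; j >= 0; j--) {
            long product = (in[j] & LONG_MASK) * kLong
                         + (out[offset] & LONG_MASK) + carry;
            out[offset--] = (int) product;      // low 32 bits back into out
            carry = product >>> 32;             // high 32 bits carry forward
        }
        return (int) carry;
    }

The assembly walks the same word-at-a-time recurrence: mulld forms the 64-bit product, the two adds fold in out[offset] and the previous carry (the sum cannot overflow 64 bits), and srdi extracts the next carry.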
diff -r 50790528dd25 -r 122833427b36 src/hotspot/cpu/ppc/macroAssembler_ppc.hpp
--- a/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp	Mon Sep 25 08:43:43 2017 +0200
+++ b/src/hotspot/cpu/ppc/macroAssembler_ppc.hpp	Mon Sep 25 09:37:43 2017 +0200
@@ -815,6 +815,8 @@
                  Register yz_idx, Register idx, Register carry,
                  Register product_high, Register product,
                  Register carry2, Register tmp);
+  void muladd(Register out, Register in, Register offset, Register len, Register k,
+              Register tmp1, Register tmp2, Register carry);
   void multiply_to_len(Register x, Register xlen,
                        Register y, Register ylen,
                        Register z, Register zlen,
diff -r 50790528dd25 -r 122833427b36 src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
--- a/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp	Mon Sep 25 08:43:43 2017 +0200
+++ b/src/hotspot/cpu/ppc/stubGenerator_ppc.cpp	Mon Sep 25 09:37:43 2017 +0200
@@ -3306,6 +3306,267 @@
     BLOCK_COMMENT("} Stub body");
   }
 
+  /**
+   * Arguments:
+   *
+   * Input:
+   *   R3_ARG1 - out address
+   *   R4_ARG2 - in address
+   *   R5_ARG3 - offset
+   *   R6_ARG4 - len
+   *   R7_ARG5 - k
+   * Output:
+   *   R3_RET  - carry
+   */
+  address generate_mulAdd() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "mulAdd");
+
+    address start = __ function_entry();
+
+    // C2 does not sign-extend signed parameters to full 64-bit registers:
+    __ rldic (R5_ARG3, R5_ARG3, 2, 32);  // always positive; also scales offset to bytes
+    __ clrldi(R6_ARG4, R6_ARG4, 32);     // force zero bits on higher word
+    __ clrldi(R7_ARG5, R7_ARG5, 32);     // force zero bits on higher word
+
+    __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
+
+    // Move the output carry into the return register.
+    __ mr    (R3_RET, R10);
+
+    __ blr();
+
+    return start;
+  }
+
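The squareToLen stub that follows keeps the three-phase scheme of the Java code it is based on: (1) store each 32x32-bit square of an input word into two words of the result, shifted right by one bit; (2) fold in the off-diagonal products, which is where the muladd macro and the CTR register are reused; (3) shift the whole result left one bit and set the low bit. A minimal Java sketch of phase 1, condensed from the Java implementation (the helper name is mine):

    // Phase 1: z[2j], z[2j+1] receive x[j]^2 >> 1; the bit shifted out of
    // one square becomes the top bit of the next pair (lastLow << 31).
    // The halving is undone by the final left shift in phase 3.
    static void storeSquares(int[] x, int len, int[] z) {
        int lastLow = 0;                        // low word of previous square
        for (int j = 0, i = 0; j < len; j++) {
            long piece = x[j] & 0xFFFFFFFFL;
            long product = piece * piece;       // full 64-bit square
            z[i++] = (lastLow << 31) | (int) (product >>> 33);
            z[i++] = (int) (product >>> 1);
            lastLow = (int) product;
        }
    }

In the stub both 32-bit result words are assembled in one 64-bit register and stored with a single stdu, which is why the little-endian build needs the rldicl word swap below.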
+  /**
+   * Arguments:
+   *
+   * Input:
+   *   R3_ARG1 - in address
+   *   R4_ARG2 - in length
+   *   R5_ARG3 - out address
+   *   R6_ARG4 - out length
+   */
+  address generate_squareToLen() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "squareToLen");
+
+    address start = __ function_entry();
+
+    // The int args arrive cast to long, so their higher words are
+    // (unsignedly) cleaned here.
+    const Register in      = R3_ARG1;
+    const Register in_len  = R4_ARG2;
+    __ clrldi(in_len, in_len, 32);
+    const Register out     = R5_ARG3;
+    const Register out_len = R6_ARG4;
+    __ clrldi(out_len, out_len, 32);
+
+    // output
+    const Register ret = R3_RET;
+
+    // temporaries
+    const Register lplw_s   = R7;
+    const Register in_aux   = R8;
+    const Register out_aux  = R9;
+    const Register piece    = R10;
+    const Register product  = R14;
+    const Register lplw     = R15;
+    const Register i_minus1 = R16;
+    const Register carry    = R17;
+    const Register offset   = R18;
+    const Register off_aux  = R19;
+    const Register t        = R20;
+    const Register mlen     = R21;
+    const Register len      = R22;
+    const Register a        = R23;
+    const Register b        = R24;
+    const Register i        = R25;
+    const Register c        = R26;
+    const Register cs       = R27;
+
+    // Labels
+    Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_MULADD, SKIP_LOOP_SQUARE;
+    Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_MULADD, LOOP_SQUARE;
+
+    // Save non-volatile regs (frameless).
+    int current_offs = -8;
+    __ std(R28, current_offs, R1_SP); current_offs -= 8;
+    __ std(R27, current_offs, R1_SP); current_offs -= 8;
+    __ std(R26, current_offs, R1_SP); current_offs -= 8;
+    __ std(R25, current_offs, R1_SP); current_offs -= 8;
+    __ std(R24, current_offs, R1_SP); current_offs -= 8;
+    __ std(R23, current_offs, R1_SP); current_offs -= 8;
+    __ std(R22, current_offs, R1_SP); current_offs -= 8;
+    __ std(R21, current_offs, R1_SP); current_offs -= 8;
+    __ std(R20, current_offs, R1_SP); current_offs -= 8;
+    __ std(R19, current_offs, R1_SP); current_offs -= 8;
+    __ std(R18, current_offs, R1_SP); current_offs -= 8;
+    __ std(R17, current_offs, R1_SP); current_offs -= 8;
+    __ std(R16, current_offs, R1_SP); current_offs -= 8;
+    __ std(R15, current_offs, R1_SP); current_offs -= 8;
+    __ std(R14, current_offs, R1_SP);
+
+    // Store the squares, right shifted one bit (i.e., divided by 2).
+    __ subi  (out_aux, out, 8);
+    __ subi  (in_aux, in, 4);
+    __ cmpwi (CCR0, in_len, 0);
+    // Initialize lplw outside of the loop.
+    __ xorr  (lplw, lplw, lplw);
+    __ ble   (CCR0, SKIP_LOOP_SQUARE);   // in_len <= 0
+    __ mtctr (in_len);
+
+    __ bind(LOOP_SQUARE);
+    __ lwzu  (piece, 4, in_aux);
+    __ mulld (product, piece, piece);
+    // Shift left 63 bits and keep only the MSB.
+    __ rldic (lplw_s, lplw, 63, 0);
+    __ mr    (lplw, product);
+    // Shift right 1 bit without sign extension.
+    __ srdi  (product, product, 1);
+    // Join them in the same register and store it.
+    __ orr   (product, lplw_s, product);
+#ifdef VM_LITTLE_ENDIAN
+    // Swap low and high words for little endian.
+    __ rldicl(product, product, 32, 0);
+#endif
+    __ stdu  (product, 8, out_aux);
+    __ bdnz  (LOOP_SQUARE);
+
+    __ bind(SKIP_LOOP_SQUARE);
+
+    // Add in off-diagonal sums.
+    __ cmpwi (CCR0, in_len, 0);
+    __ ble   (CCR0, SKIP_DIAGONAL_SUM);
+    // Avoid CTR usage here in order to use it at mulAdd.
+    __ subi  (i_minus1, in_len, 1);
+    __ li    (offset, 4);
+
+    __ bind(LOOP_DIAGONAL_SUM);
+
+    __ sldi  (off_aux, out_len, 2);
+    __ sub   (off_aux, off_aux, offset);
+
+    __ mr    (len, i_minus1);
+    __ sldi  (mlen, i_minus1, 2);
+    __ lwzx  (t, in, mlen);
+
+    __ muladd(out, in, off_aux, len, t, a, b, carry);
+
+    // begin
+    // off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
+    __ addi  (mlen, mlen, 4);
+    __ sldi  (a, out_len, 2);
+    __ subi  (a, a, 4);
+    __ sub   (a, a, mlen);
+    __ subi  (off_aux, offset, 4);
+    __ sub   (off_aux, a, off_aux);
+
+    __ lwzx  (b, off_aux, out);
+    __ add   (b, b, carry);
+    __ stwx  (b, off_aux, out);
+
+    // if (((uint64_t)s >> 32) != 0) {
+    __ srdi_ (a, b, 32);
+    __ beq   (CCR0, SKIP_ADDONE);
+
+    // while (--mlen >= 0) {
+    __ bind(LOOP_ADDONE);
+    __ subi  (mlen, mlen, 4);
+    __ cmpwi (CCR0, mlen, 0);
+    __ beq   (CCR0, SKIP_ADDONE);
+
+    // if (--offset_aux < 0) {  // Carry out of number
+    __ subi  (off_aux, off_aux, 4);
+    __ cmpwi (CCR0, off_aux, 0);
+    __ blt   (CCR0, SKIP_ADDONE);
+
+    // } else {
+    __ lwzx  (b, off_aux, out);
+    __ addi  (b, b, 1);
+    __ stwx  (b, off_aux, out);
+    __ cmpwi (CCR0, b, 0);
+    __ bne   (CCR0, SKIP_ADDONE);
+    __ b     (LOOP_ADDONE);
+
+    __ bind(SKIP_ADDONE);
+    // } } } end
+
+    __ addi  (offset, offset, 8);
+    __ subi  (i_minus1, i_minus1, 1);
+    __ cmpwi (CCR0, i_minus1, 0);
+    __ bge   (CCR0, LOOP_DIAGONAL_SUM);
+
+    __ bind(SKIP_DIAGONAL_SUM);
+
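The remainder of the stub is phase 3. In Java terms it is a one-bit left shift across the whole result followed by restoring the low bit, which survives squaring because an odd number has an odd square. A sketch under the same assumptions as above (the helper name is mine):

    // Phase 3: undo the phase-1 halving and set the low bit of the result.
    static void finishSquare(int[] x, int len, int[] z, int zlen) {
        for (int i = 0; i < zlen - 1; i++) {
            z[i] = (z[i] << 1) | (z[i + 1] >>> 31);  // pull in bit from next word
        }
        z[zlen - 1] = (z[zlen - 1] << 1) | (x[len - 1] & 1);
    }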
+    // Shift back up and set low bit.
+    // Shifts 1 bit left up to len positions; assumes no leading zeros.
+    // begin
+    __ cmpwi (CCR0, out_len, 0);
+    __ ble   (CCR0, SKIP_LSHIFT);
+    __ li    (i, 0);
+    __ lwz   (c, 0, out);
+    __ subi  (b, out_len, 1);
+    __ mtctr (b);
+
+    __ bind(LOOP_LSHIFT);
+    __ mr    (b, c);
+    __ addi  (cs, i, 4);
+    __ lwzx  (c, out, cs);
+
+    __ sldi  (b, b, 1);
+    __ srwi  (cs, c, 31);
+    __ orr   (b, b, cs);
+    __ stwx  (b, i, out);
+
+    __ addi  (i, i, 4);
+    __ bdnz  (LOOP_LSHIFT);
+
+    // Shift the last word.
+    __ sldi  (c, out_len, 2);
+    __ subi  (c, c, 4);
+    __ lwzx  (b, out, c);
+    __ sldi  (b, b, 1);
+    __ stwx  (b, out, c);
+
+    __ bind(SKIP_LSHIFT);
+    // end
+
+    // Set low bit: copy the low bit of the last input word.
+    __ sldi  (i, in_len, 2);
+    __ subi  (i, i, 4);
+    __ lwzx  (i, in, i);
+    __ sldi  (c, out_len, 2);
+    __ subi  (c, c, 4);
+    __ lwzx  (b, out, c);
+
+    __ andi  (i, i, 1);
+    __ orr   (i, b, i);
+
+    __ stwx  (i, out, c);
+
+    // Restore non-volatile regs.
+    current_offs = -8;
+    __ ld(R28, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R27, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R26, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R25, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R24, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R23, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R22, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R21, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R20, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R19, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R18, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R17, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R16, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R15, current_offs, R1_SP); current_offs -= 8;
+    __ ld(R14, current_offs, R1_SP);
+
+    __ mr(ret, out);
+    __ blr();
+
+    return start;
+  }
 
   /**
    * Arguments:
@@ -3500,6 +3761,12 @@
     }
 #endif
 
+    if (UseSquareToLenIntrinsic) {
+      StubRoutines::_squareToLen = generate_squareToLen();
+    }
+    if (UseMulAddIntrinsic) {
+      StubRoutines::_mulAdd = generate_mulAdd();
+    }
     if (UseMontgomeryMultiplyIntrinsic) {
       StubRoutines::_montgomeryMultiply
         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
diff -r 50790528dd25 -r 122833427b36 src/hotspot/cpu/ppc/vm_version_ppc.cpp
--- a/src/hotspot/cpu/ppc/vm_version_ppc.cpp	Mon Sep 25 08:43:43 2017 +0200
+++ b/src/hotspot/cpu/ppc/vm_version_ppc.cpp	Mon Sep 25 09:37:43 2017 +0200
@@ -258,6 +258,12 @@
     FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
   }
 
+  if (FLAG_IS_DEFAULT(UseSquareToLenIntrinsic)) {
+    UseSquareToLenIntrinsic = true;
+  }
+  if (FLAG_IS_DEFAULT(UseMulAddIntrinsic)) {
+    UseMulAddIntrinsic = true;
+  }
   if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
     UseMultiplyToLenIntrinsic = true;
   }
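Both flags default to true after this change, so any hot BigInteger squaring should reach the new stubs once C2 compiles the caller. A hypothetical smoke test (class name, operand size, and iteration count are illustrative); running it with -XX:-UseSquareToLenIntrinsic -XX:-UseMulAddIntrinsic gives a baseline for the speedups quoted in the summary:

    import java.math.BigInteger;

    public class SquareToLenSmokeTest {
        public static void main(String[] args) {
            // multiply(x, x) takes BigInteger's square path, which is what
            // the squareToLen/mulAdd intrinsics accelerate.
            BigInteger x = BigInteger.ONE.shiftLeft(4096).subtract(BigInteger.ONE);
            BigInteger acc = BigInteger.ZERO;
            long start = System.nanoTime();
            for (int i = 0; i < 100_000; i++) {
                acc = acc.xor(x.multiply(x));   // xor keeps acc from growing
            }
            System.out.println("bits=" + acc.bitLength() +
                               " ms=" + (System.nanoTime() - start) / 1_000_000);
        }
    }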