8077615: AARCH64: Add C2 intrinsic for BigInteger::multiplyToLen() method
author: aph
Tue, 14 Apr 2015 11:43:18 +0100
changeset 30225 e9722ea461d4
parent 30224 c41bbf9a0e32
child 30226 5f1a3a275862
8077615: AARCH64: Add C2 intrinsic for BigInteger::multiplyToLen() method
Summary: Add C2 intrinsic for BigInteger::multiplyToLen() on AArch64.
Reviewed-by: kvn
hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp
hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.hpp
hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
hotspot/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java
--- a/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp	Tue Apr 14 18:16:10 2015 +0300
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp	Tue Apr 14 11:43:18 2015 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -1469,7 +1469,7 @@
     f(op, 31, 29);
     f(0b11010000, 28, 21);
     f(0b000000, 15, 10);
-    rf(Rm, 16), rf(Rn, 5), rf(Rd, 0);
+    zrf(Rm, 16), zrf(Rn, 5), zrf(Rd, 0);
   }
 
   #define INSN(NAME, op)                                \
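
The rf -> zrf change above is in the encoder for the add/subtract-with-carry
group; the INSN macro that follows it defines adc, adcs, sbc and sbcs.  In
this instruction class register number 31 names the zero register rather than
sp, and zrf encodes that reading, so the multiply code added below can pass
zr as an operand, as in adc(dest_hi, dest_hi, zr).
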
--- a/hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.hpp	Tue Apr 14 18:16:10 2015 +0300
+++ b/hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.hpp	Tue Apr 14 11:43:18 2015 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -40,6 +40,8 @@
 
  protected:
   // Interpreter specific version of call_VM_base
+  using MacroAssembler::call_VM_leaf_base;
+
   virtual void call_VM_leaf_base(address entry_point,
                                  int number_of_arguments);
 
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Apr 14 18:16:10 2015 +0300
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Apr 14 11:43:18 2015 +0100
@@ -1,6 +1,6 @@
 /*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2246,6 +2246,341 @@
 }
 
 /**
+ * Helpers for multiply_to_len().
+ */
+void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
+                                     Register src1, Register src2) {
+  adds(dest_lo, dest_lo, src1);
+  adc(dest_hi, dest_hi, zr);
+  adds(dest_lo, dest_lo, src2);
+  adc(final_dest_hi, dest_hi, zr);
+}
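
add2_with_carry computes final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2,
with the carry flag propagating overflow out of the low word.  A plain C++
model of the same arithmetic (a sketch for illustration, not HotSpot code):

    #include <stdint.h>

    static inline void add2_with_carry_model(uint64_t& final_dest_hi, uint64_t dest_hi,
                                             uint64_t& dest_lo, uint64_t src1, uint64_t src2) {
      dest_lo += src1;
      dest_hi += (dest_lo < src1) ? 1 : 0;                   // adc(dest_hi, dest_hi, zr)
      dest_lo += src2;
      final_dest_hi = dest_hi + ((dest_lo < src2) ? 1 : 0);  // adc(final_dest_hi, dest_hi, zr)
    }
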
+
+// Generate an address from (r + r1 extend offset).  "size" is the
+// size of the operand.  The result may be in rscratch2.
+Address MacroAssembler::offsetted_address(Register r, Register r1,
+                                          Address::extend ext, int offset, int size) {
+  if (offset || (ext.shift() % size != 0)) {
+    lea(rscratch2, Address(r, r1, ext));
+    return Address(rscratch2, offset);
+  } else {
+    return Address(r, r1, ext);
+  }
+}
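
The scratch-register fallback is needed because an AArch64 register-offset
address can scale the index only by the size of the operand being transferred
and cannot carry an immediate offset as well; any other combination is first
materialized into rscratch2 with lea.
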
+
+/**
+ * Multiply 64 bit by 64 bit first loop.
+ */
+void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                                           Register y, Register y_idx, Register z,
+                                           Register carry, Register product,
+                                           Register idx, Register kdx) {
+  //
+  //  jlong carry, x[], y[], z[];
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+  //    huge_128 product = y[idx] * x[xstart] + carry;
+  //    z[kdx] = (jlong)product;
+  //    carry  = (jlong)(product >>> 64);
+  //  }
+  //  z[xstart] = carry;
+  //
+
+  Label L_first_loop, L_first_loop_exit;
+  Label L_one_x, L_one_y, L_multiply;
+
+  subsw(xstart, xstart, 1);
+  br(Assembler::MI, L_one_x);
+
+  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
+  ldr(x_xstart, Address(rscratch1));
+  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
+
+  bind(L_first_loop);
+  subsw(idx, idx, 1);
+  br(Assembler::MI, L_first_loop_exit);
+  subsw(idx, idx, 1);
+  br(Assembler::MI, L_one_y);
+  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+  ldr(y_idx, Address(rscratch1));
+  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
+  bind(L_multiply);
+
+  // AArch64 has a multiply-accumulate instruction that we can't use
+  // here because it has no way to process carries, so we have to use
+  // separate add and adc instructions.  Bah.
+  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
+  mul(product, x_xstart, y_idx);
+  adds(product, product, carry);
+  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
+
+  subw(kdx, kdx, 2);
+  ror(product, product, 32); // back to big-endian
+  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
+
+  b(L_first_loop);
+
+  bind(L_one_y);
+  ldrw(y_idx, Address(y,  0));
+  b(L_multiply);
+
+  bind(L_one_x);
+  ldrw(x_xstart, Address(x,  0));
+  b(L_first_loop);
+
+  bind(L_first_loop_exit);
+}
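
The ror(reg, reg, 32) rotations before and after the 64-bit memory accesses
exist because BigInteger stores its magnitude as big-endian jints: loading two
adjacent 32-bit limbs with one 64-bit load on little-endian AArch64 leaves
them in the wrong halves of the register, and a rotate by 32 swaps them back.
A model of the swap:

    #include <stdint.h>

    static inline uint64_t swap_halves(uint64_t v) {
      return (v >> 32) | (v << 32);   // what ror(v, v, 32) computes
    }
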
+
+/**
+ * Multiply 128 bit by 128 bit. Unrolled inner loop.
+ */
+void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
+                                             Register carry, Register carry2,
+                                             Register idx, Register jdx,
+                                             Register yz_idx1, Register yz_idx2,
+                                             Register tmp, Register tmp3, Register tmp4,
+                                             Register tmp6, Register product_hi) {
+
+  //   jlong carry, x[], y[], z[];
+  //   int kdx = ystart+1;
+  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
+  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
+  //     jlong carry2  = (jlong)(tmp3 >>> 64);
+  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
+  //     carry  = (jlong)(tmp4 >>> 64);
+  //     z[kdx+idx+1] = (jlong)tmp3;
+  //     z[kdx+idx] = (jlong)tmp4;
+  //   }
+  //   idx += 2;
+  //   if (idx > 0) {
+  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
+  //     z[kdx+idx] = (jlong)yz_idx1;
+  //     carry  = (jlong)(yz_idx1 >>> 64);
+  //   }
+  //
+
+  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
+
+  lsrw(jdx, idx, 2);
+
+  bind(L_third_loop);
+
+  subsw(jdx, jdx, 1);
+  br(Assembler::MI, L_third_loop_exit);
+  subw(idx, idx, 4);
+
+  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+
+  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
+
+  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+
+  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
+  ror(yz_idx2, yz_idx2, 32);
+
+  ldp(rscratch2, rscratch1, Address(tmp6, 0));
+
+  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
+  umulh(tmp4, product_hi, yz_idx1);
+
+  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
+  ror(rscratch2, rscratch2, 32);
+
+  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
+  umulh(carry2, product_hi, yz_idx2);
+
+  // propagate sum of both multiplications into carry:tmp4:tmp3
+  adds(tmp3, tmp3, carry);
+  adc(tmp4, tmp4, zr);
+  adds(tmp3, tmp3, rscratch1);
+  adcs(tmp4, tmp4, tmp);
+  adc(carry, carry2, zr);
+  adds(tmp4, tmp4, rscratch2);
+  adc(carry, carry, zr);
+
+  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
+  ror(tmp4, tmp4, 32);
+  stp(tmp4, tmp3, Address(tmp6, 0));
+
+  b(L_third_loop);
+  bind (L_third_loop_exit);
+
+  andw (idx, idx, 0x3);
+  cbz(idx, L_post_third_loop_done);
+
+  Label L_check_1;
+  subsw(idx, idx, 2);
+  br(Assembler::MI, L_check_1);
+
+  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+  ldr(yz_idx1, Address(rscratch1, 0));
+  ror(yz_idx1, yz_idx1, 32);
+  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
+  umulh(tmp4, product_hi, yz_idx1);
+  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+  ldr(yz_idx2, Address(rscratch1, 0));
+  ror(yz_idx2, yz_idx2, 32);
+
+  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
+
+  ror(tmp3, tmp3, 32);
+  str(tmp3, Address(rscratch1, 0));
+
+  bind (L_check_1);
+
+  andw (idx, idx, 0x1);
+  subsw(idx, idx, 1);
+  br(Assembler::MI, L_post_third_loop_done);
+  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
+  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
+  umulh(carry2, tmp4, product_hi);
+  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+
+  add2_with_carry(carry2, tmp3, tmp4, carry);
+
+  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
+  extr(carry, carry2, tmp3, 32);
+
+  bind(L_post_third_loop_done);
+}
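
The adds/adcs/adc chain above folds two 64x64->128-bit partial products, two
words of z and the incoming carry into carry:tmp4:tmp3.  One unrolled step,
modelled in C++ on a compiler that provides unsigned __int128 (names are
illustrative, following the pseudocode comment):

    #include <stdint.h>

    static inline void third_loop_step(uint64_t y1, uint64_t y0, uint64_t product_hi,
                                       uint64_t& z1, uint64_t& z0, uint64_t& carry) {
      unsigned __int128 t1 = (unsigned __int128)y1 * product_hi + z1 + carry;
      z1 = (uint64_t)t1;                           // z[kdx+idx+1]
      unsigned __int128 t0 = (unsigned __int128)y0 * product_hi + z0 + (uint64_t)(t1 >> 64);
      z0 = (uint64_t)t0;                           // z[kdx+idx]
      carry = (uint64_t)(t0 >> 64);
    }
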
+
+/**
+ * Code for BigInteger::multiplyToLen() intrinsic.
+ *
+ * r0: x
+ * r1: xlen
+ * r2: y
+ * r3: ylen
+ * r4: z
+ * r5: zlen
+ * r10: tmp1
+ * r11: tmp2
+ * r12: tmp3
+ * r13: tmp4
+ * r14: tmp5
+ * r15: tmp6
+ * r16: tmp7
+ *
+ */
+void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
+                                     Register z, Register zlen,
+                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
+                                     Register tmp5, Register tmp6, Register product_hi) {
+
+  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
+
+  const Register idx = tmp1;
+  const Register kdx = tmp2;
+  const Register xstart = tmp3;
+
+  const Register y_idx = tmp4;
+  const Register carry = tmp5;
+  const Register product  = xlen;
+  const Register x_xstart = zlen;  // reuse register
+
+  // First Loop.
+  //
+  //  final static long LONG_MASK = 0xffffffffL;
+  //  int xstart = xlen - 1;
+  //  int ystart = ylen - 1;
+  //  long carry = 0;
+  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
+  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
+  //    z[kdx] = (int)product;
+  //    carry = product >>> 32;
+  //  }
+  //  z[xstart] = (int)carry;
+  //
+
+  movw(idx, ylen);      // idx = ylen;
+  movw(kdx, zlen);      // kdx = xlen+ylen;
+  mov(carry, zr);       // carry = 0;
+
+  Label L_done;
+
+  movw(xstart, xlen);
+  subsw(xstart, xstart, 1);
+  br(Assembler::MI, L_done);
+
+  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
+
+  Label L_second_loop;
+  cbzw(kdx, L_second_loop);
+
+  Label L_carry;
+  subw(kdx, kdx, 1);
+  cbzw(kdx, L_carry);
+
+  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
+  lsr(carry, carry, 32);
+  subw(kdx, kdx, 1);
+
+  bind(L_carry);
+  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
+
+  // Second and third (nested) loops.
+  //
+  // for (int i = xstart-1; i >= 0; i--) { // Second loop
+  //   carry = 0;
+  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
+  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
+  //                    (z[k] & LONG_MASK) + carry;
+  //     z[k] = (int)product;
+  //     carry = product >>> 32;
+  //   }
+  //   z[i] = (int)carry;
+  // }
+  //
+  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
+
+  const Register jdx = tmp1;
+
+  bind(L_second_loop);
+  mov(carry, zr);                // carry = 0;
+  movw(jdx, ylen);               // j = ystart+1
+
+  subsw(xstart, xstart, 1);      // i = xstart-1;
+  br(Assembler::MI, L_done);
+
+  str(z, Address(pre(sp, -4 * wordSize)));
+
+  Label L_last_x;
+  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
+  subsw(xstart, xstart, 1);       // i = xstart-1;
+  br(Assembler::MI, L_last_x);
+
+  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
+  ldr(product_hi, Address(rscratch1));
+  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
+
+  Label L_third_loop_prologue;
+  bind(L_third_loop_prologue);
+
+  str(ylen, Address(sp, wordSize));
+  stp(x, xstart, Address(sp, 2 * wordSize));
+  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
+                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
+  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
+  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
+
+  addw(tmp3, xlen, 1);
+  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
+  subsw(tmp3, tmp3, 1);
+  br(Assembler::MI, L_done);
+
+  lsr(carry, carry, 32);
+  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
+  b(L_second_loop);
+
+  // Infrequently-executed code is moved outside of the loops.
+  bind(L_last_x);
+  ldrw(product_hi, Address(x,  0));
+  b(L_third_loop_prologue);
+
+  bind(L_done);
+}
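
Taken together, the three loops compute the same schoolbook product as
BigInteger.multiplyToLen, two 32-bit limbs at a time.  A standalone C++ model
of the algorithm in the pseudocode comments above (a sketch only: x, y and z
are big-endian arrays of 32-bit limbs, and z has xlen + ylen elements):

    #include <stdint.h>

    static void multiply_to_len_model(const uint32_t* x, int xlen,
                                      const uint32_t* y, int ylen,
                                      uint32_t* z) {
      int xstart = xlen - 1;
      int ystart = ylen - 1;
      uint64_t carry = 0;
      // First loop: z = y * x[xstart]
      for (int j = ystart, k = ystart + 1 + xstart; j >= 0; j--, k--) {
        uint64_t product = (uint64_t)y[j] * x[xstart] + carry;
        z[k] = (uint32_t)product;
        carry = product >> 32;
      }
      z[xstart] = (uint32_t)carry;
      // Second (outer) and third (inner) loops: accumulate the rest
      for (int i = xstart - 1; i >= 0; i--) {
        carry = 0;
        for (int j = ystart, k = ystart + 1 + i; j >= 0; j--, k--) {
          uint64_t product = (uint64_t)y[j] * x[i] + z[k] + carry;
          z[k] = (uint32_t)product;
          carry = product >> 32;
        }
        z[i] = (uint32_t)carry;
      }
    }
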
+
+/**
  * Emits code to update CRC-32 with a byte value according to constants in table
  *
  * @param [in,out]crc   Register containing the crc.
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Apr 14 18:16:10 2015 +0300
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Apr 14 11:43:18 2015 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -167,9 +167,8 @@
 
   // aliases defined in AARCH64 spec
 
-
   template<class T>
-  inline void  cmpw(Register Rd, T imm)  { subsw(zr, Rd, imm); }
+  inline void cmpw(Register Rd, T imm)  { subsw(zr, Rd, imm); }
   inline void cmp(Register Rd, unsigned imm)  { subs(zr, Rd, imm); }
 
   inline void cmnw(Register Rd, unsigned imm) { addsw(zr, Rd, imm); }
@@ -1121,9 +1120,34 @@
                       Register tmp1, Register tmp2,
                       Register tmp3, Register tmp4,
                       int int_cnt1, Register result);
-
+private:
+  void add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
+                       Register src1, Register src2);
+  void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
+    add2_with_carry(dest_hi, dest_hi, dest_lo, src1, src2);
+  }
+  void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
+                             Register y, Register y_idx, Register z,
+                             Register carry, Register product,
+                             Register idx, Register kdx);
+  void multiply_128_x_128_loop(Register y, Register z,
+                               Register carry, Register carry2,
+                               Register idx, Register jdx,
+                               Register yz_idx1, Register yz_idx2,
+                               Register tmp, Register tmp3, Register tmp4,
+                               Register tmp7, Register product_hi);
+public:
+  void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z,
+                       Register zlen, Register tmp1, Register tmp2, Register tmp3,
+                       Register tmp4, Register tmp5, Register tmp6, Register tmp7);
   // ISB may be needed because of a safepoint
   void maybe_isb() { isb(); }
+
+private:
+  // Return the effective address r + (r1 << ext) + offset.
+  // Uses rscratch2.
+  Address offsetted_address(Register r, Register r1, Address::extend ext,
+                            int offset, int size);
 };
 
 // Used by aarch64.ad to control code generation
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue Apr 14 18:16:10 2015 +0300
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue Apr 14 11:43:18 2015 +0100
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -2356,8 +2356,45 @@
     return start;
   }
 
-#undef __
-#define __ masm->
+  /**
+   *  Arguments:
+   *
+   *  Input:
+   *    c_rarg0   - x address
+   *    c_rarg1   - x length
+   *    c_rarg2   - y address
+   *    c_rarg3   - y length
+   *    c_rarg4   - z address
+   *    c_rarg5   - z length
+   */
+  address generate_multiplyToLen() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
+
+    address start = __ pc();
+    const Register x     = r0;
+    const Register xlen  = r1;
+    const Register y     = r2;
+    const Register ylen  = r3;
+    const Register z     = r4;
+    const Register zlen  = r5;
+
+    const Register tmp1  = r10;
+    const Register tmp2  = r11;
+    const Register tmp3  = r12;
+    const Register tmp4  = r13;
+    const Register tmp5  = r14;
+    const Register tmp6  = r15;
+    const Register tmp7  = r16;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(lr);
+
+    return start;
+  }
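
Note that the #undef __ / #define __ masm-> pair that used to sit here moves
below the new stub (next hunk): generate_multiplyToLen needs the _masm-based
definition of __ that the rest of the stub generator uses, while the
redefinition is only wanted by generate_throw_exception.
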
 
   // Continuation point for throwing of implicit exceptions that are
   // not handled in the current activation. Fabricates an exception
@@ -2375,6 +2412,9 @@
   // otherwise assume that stack unwinding will be initiated, so
   // caller saved registers were assumed volatile in the compiler.
 
+#undef __
+#define __ masm->
+
   address generate_throw_exception(const char* name,
                                    address runtime_entry,
                                    Register arg1 = noreg,
@@ -2518,6 +2558,10 @@
     // arraycopy stubs used by compilers
     generate_arraycopy_stubs();
 
+    if (UseMultiplyToLenIntrinsic) {
+      StubRoutines::_multiplyToLen = generate_multiplyToLen();
+    }
+
 #ifndef BUILTIN_SIM
     if (UseAESIntrinsics) {
       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Apr 14 18:16:10 2015 +0300
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Apr 14 11:43:18 2015 +0100
@@ -198,6 +198,10 @@
     FLAG_SET_DEFAULT(UseUnalignedAccesses, true);
   }
 
+  if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
+    UseMultiplyToLenIntrinsic = true;
+  }
+
 #ifdef COMPILER2
   if (FLAG_IS_DEFAULT(OptoScheduling)) {
     OptoScheduling = true;
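
With this default, the stub is generated on every AArch64 build unless it is
explicitly disabled; running with -XX:-UseMultiplyToLenIntrinsic restores the
pure-Java path, which is useful when deciding whether a multiply miscompare
comes from the intrinsic or from BigInteger itself.
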
--- a/hotspot/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java	Tue Apr 14 18:16:10 2015 +0300
+++ b/hotspot/test/compiler/intrinsics/multiplytolen/TestMultiplyToLen.java	Tue Apr 14 11:43:18 2015 +0100
@@ -34,6 +34,7 @@
  *      -XX:CompileCommand=inline,java.math.BigInteger::multiply TestMultiplyToLen
  */
 
+import java.util.Arrays;
 import java.util.Random;
 import java.math.*;
 
@@ -97,12 +98,36 @@
         newsum = newsum.add(newres);
 
         if (!bytecompare(oldres,newres)) {
+          System.out.println(b1);
+          System.out.println(b2);
+          System.out.print("mismatch for:b1:" + stringify(b1) + " :b2:" + stringify(b2) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
+          throw new Exception("Failed");
+        }
+      }
+
+      // Test carry propagation.  Multiple carries during bignum
+      // multiplication are rare (especially when using 64-bit
+      // arithmetic) so we have to provoke them deliberately.
+      for (int j = 4; j <= 396; j += 4) {
+        byte[] bytes = new byte[j];
+        Arrays.fill(bytes, (byte)255);
+        // Use the sign-magnitude constructor: the two's-complement
+        // BigInteger(byte[]) would read an all-ones array as -1.
+        b1 = new BigInteger(1, bytes);
+        b2 = new BigInteger(1, bytes);
+
+        oldres = base_multiply(b1,b2);
+        newres = new_multiply(b1,b2);
+
+        oldsum = oldsum.add(oldres);
+        newsum = newsum.add(newres);
+
+        if (!bytecompare(oldres,newres)) {
           System.out.print("mismatch for:b1:" + stringify(b1) + " :b2:" + stringify(b2) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
           System.out.println(b1);
           System.out.println(b2);
           throw new Exception("Failed");
         }
       }
+
       if (!bytecompare(oldsum,newsum))  {
         System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
         throw new Exception("Failed");
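
All-ones magnitudes are the worst case for carry propagation: each limb is
2^32 - 1, and since (2^n - 1)^2 = 2^(2n) - 2^(n+1) + 1 the accumulating
additions of shifted partial products are meant to carry across the whole
result.  In miniature, 0xFF * 0xFF = 0xFE01, and summing many such staggered
terms ripples carries through every position.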