6603011: RFE: Optimize long division
authorrasbold
Wed, 07 May 2008 08:06:46 -0700
changeset 392 0b3167e2f2de
parent 379 10767ca40189
child 393 31d6ce8d9edb
6603011: RFE: Optimize long division Summary: Transform long division by constant into multiply Reviewed-by: never, kvn
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/opto/classes.hpp
hotspot/src/share/vm/opto/divnode.cpp
hotspot/src/share/vm/opto/mulnode.cpp
hotspot/src/share/vm/opto/mulnode.hpp
hotspot/src/share/vm/opto/type.hpp
hotspot/src/share/vm/utilities/globalDefinitions.hpp
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Tue Apr 29 19:45:22 2008 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Wed May 07 08:06:46 2008 -0700
@@ -8075,6 +8075,18 @@
   ins_pipe(ialu_reg_mem_alu0);
 %}
 
+instruct mulHiL_rReg(rdx_RegL dst, no_rax_RegL src, rax_RegL rax, rFlagsReg cr)
+%{
+  match(Set dst (MulHiL src rax));
+  effect(USE_KILL rax, KILL cr);
+
+  ins_cost(300);
+  format %{ "imulq   RDX:RAX, RAX, $src\t# mulhi" %}
+  opcode(0xF7, 0x5); /* Opcode F7 /5 */
+  ins_encode(REX_reg_wide(src), OpcP, reg_opc(src));
+  ins_pipe(ialu_reg_reg_alu0);
+%}
+
 instruct divI_rReg(rax_RegI rax, rdx_RegI rdx, no_rax_rdx_RegI div,
                    rFlagsReg cr)
 %{
--- a/hotspot/src/share/vm/opto/classes.hpp	Tue Apr 29 19:45:22 2008 -0700
+++ b/hotspot/src/share/vm/opto/classes.hpp	Wed May 07 08:06:46 2008 -0700
@@ -164,6 +164,7 @@
 macro(MoveD2L)
 macro(MulD)
 macro(MulF)
+macro(MulHiL)
 macro(MulI)
 macro(MulL)
 macro(Multi)
--- a/hotspot/src/share/vm/opto/divnode.cpp	Tue Apr 29 19:45:22 2008 -0700
+++ b/hotspot/src/share/vm/opto/divnode.cpp	Wed May 07 08:06:46 2008 -0700
@@ -30,70 +30,86 @@
 #include "incls/_divnode.cpp.incl"
 #include <math.h>
 
-// Implement the integer constant divide -> long multiply transform found in
-//   "Division by Invariant Integers using Multiplication"
-//     by Granlund and Montgomery
-static Node *transform_int_divide_to_long_multiply( PhaseGVN *phase, Node *dividend, int divisor ) {
+//----------------------magic_int_divide_constants-----------------------------
+// Compute magic multiplier and shift constant for converting a 32 bit divide
+// by constant into a multiply/shift/add series. Return false if calculations
+// fail.
+//
+// Borrowed almost verbatum from Hacker's Delight by Henry S. Warren, Jr. with
+// minor type name and parameter changes.
+static bool magic_int_divide_constants(jint d, jint &M, jint &s) {
+  int32_t p;
+  uint32_t ad, anc, delta, q1, r1, q2, r2, t;
+  const uint32_t two31 = 0x80000000L;     // 2**31.
+
+  ad = ABS(d);
+  if (d == 0 || d == 1) return false;
+  t = two31 + ((uint32_t)d >> 31);
+  anc = t - 1 - t%ad;     // Absolute value of nc.
+  p = 31;                 // Init. p.
+  q1 = two31/anc;         // Init. q1 = 2**p/|nc|.
+  r1 = two31 - q1*anc;    // Init. r1 = rem(2**p, |nc|).
+  q2 = two31/ad;          // Init. q2 = 2**p/|d|.
+  r2 = two31 - q2*ad;     // Init. r2 = rem(2**p, |d|).
+  do {
+    p = p + 1;
+    q1 = 2*q1;            // Update q1 = 2**p/|nc|.
+    r1 = 2*r1;            // Update r1 = rem(2**p, |nc|).
+    if (r1 >= anc) {      // (Must be an unsigned
+      q1 = q1 + 1;        // comparison here).
+      r1 = r1 - anc;
+    }
+    q2 = 2*q2;            // Update q2 = 2**p/|d|.
+    r2 = 2*r2;            // Update r2 = rem(2**p, |d|).
+    if (r2 >= ad) {       // (Must be an unsigned
+      q2 = q2 + 1;        // comparison here).
+      r2 = r2 - ad;
+    }
+    delta = ad - r2;
+  } while (q1 < delta || (q1 == delta && r1 == 0));
+
+  M = q2 + 1;
+  if (d < 0) M = -M;      // Magic number and
+  s = p - 32;             // shift amount to return.
+
+  return true;
+}
+
+//--------------------------transform_int_divide-------------------------------
+// Convert a division by constant divisor into an alternate Ideal graph.
+// Return NULL if no transformation occurs.
+static Node *transform_int_divide( PhaseGVN *phase, Node *dividend, jint divisor ) {
 
   // Check for invalid divisors
-  assert( divisor != 0 && divisor != min_jint && divisor != 1,
-    "bad divisor for transforming to long multiply" );
-
-  // Compute l = ceiling(log2(d))
-  //   presumes d is more likely small
-  bool d_pos = divisor >= 0;
-  int d = d_pos ? divisor : -divisor;
-  unsigned ud = (unsigned)d;
-  const int N = 32;
-  int l = log2_intptr(d-1)+1;
-  int sh_post = l;
-
-  const uint64_t U1 = (uint64_t)1;
+  assert( divisor != 0 && divisor != min_jint,
+          "bad divisor for transforming to long multiply" );
 
-  // Cliff pointed out how to prevent overflow (from the paper)
-  uint64_t m_low  =  (((U1 << l) - ud) << N)                  / ud + (U1 << N);
-  uint64_t m_high = ((((U1 << l) - ud) << N) + (U1 << (l+1))) / ud + (U1 << N);
-
-  // Reduce to lowest terms
-  for ( ; sh_post > 0; sh_post-- ) {
-    uint64_t m_low_1  = m_low  >> 1;
-    uint64_t m_high_1 = m_high >> 1;
-    if ( m_low_1 >= m_high_1 )
-      break;
-    m_low  = m_low_1;
-    m_high = m_high_1;
-  }
+  bool d_pos = divisor >= 0;
+  jint d = d_pos ? divisor : -divisor;
+  const int N = 32;
 
   // Result
-  Node *q;
+  Node *q = NULL;
 
-  // division by +/- 1
   if (d == 1) {
-    // Filtered out as identity above
-    if (d_pos)
-      return NULL;
-
-    // Just negate the value
-    else {
+    // division by +/- 1
+    if (!d_pos) {
+      // Just negate the value
       q = new (phase->C, 3) SubINode(phase->intcon(0), dividend);
     }
-  }
-
-  // division by +/- a power of 2
-  else if ( is_power_of_2(d) ) {
+  } else if ( is_power_of_2(d) ) {
+    // division by +/- a power of 2
 
     // See if we can simply do a shift without rounding
     bool needs_rounding = true;
     const Type *dt = phase->type(dividend);
     const TypeInt *dti = dt->isa_int();
-
-    // we don't need to round a positive dividend
-    if (dti && dti->_lo >= 0)
+    if (dti && dti->_lo >= 0) {
+      // we don't need to round a positive dividend
       needs_rounding = false;
-
-    // An AND mask of sufficient size clears the low bits and
-    // I can avoid rounding.
-    else if( dividend->Opcode() == Op_AndI ) {
+    } else if( dividend->Opcode() == Op_AndI ) {
+      // An AND mask of sufficient size clears the low bits and
+      // I can avoid rounding.
       const TypeInt *andconi = phase->type( dividend->in(2) )->isa_int();
       if( andconi && andconi->is_con(-d) ) {
         dividend = dividend->in(1);
@@ -102,47 +118,271 @@
     }
 
     // Add rounding to the shift to handle the sign bit
-    if( needs_rounding ) {
-      Node *t1 = phase->transform(new (phase->C, 3) RShiftINode(dividend, phase->intcon(l - 1)));
-      Node *t2 = phase->transform(new (phase->C, 3) URShiftINode(t1, phase->intcon(N - l)));
-      dividend = phase->transform(new (phase->C, 3) AddINode(dividend, t2));
+    int l = log2_intptr(d-1)+1;
+    if (needs_rounding) {
+      // Divide-by-power-of-2 can be made into a shift, but you have to do
+      // more math for the rounding.  You need to add 0 for positive
+      // numbers, and "i-1" for negative numbers.  Example: i=4, so the
+      // shift is by 2.  You need to add 3 to negative dividends and 0 to
+      // positive ones.  So (-7+3)>>2 becomes -1, (-4+3)>>2 becomes -1,
+      // (-2+3)>>2 becomes 0, etc.
+
+      // Compute 0 or -1, based on sign bit
+      Node *sign = phase->transform(new (phase->C, 3) RShiftINode(dividend, phase->intcon(N - 1)));
+      // Mask sign bit to the low sign bits
+      Node *round = phase->transform(new (phase->C, 3) URShiftINode(sign, phase->intcon(N - l)));
+      // Round up before shifting
+      dividend = phase->transform(new (phase->C, 3) AddINode(dividend, round));
     }
 
+    // Shift for division
     q = new (phase->C, 3) RShiftINode(dividend, phase->intcon(l));
 
-    if (!d_pos)
+    if (!d_pos) {
       q = new (phase->C, 3) SubINode(phase->intcon(0), phase->transform(q));
+    }
+  } else {
+    // Attempt the jint constant divide -> multiply transform found in
+    //   "Division by Invariant Integers using Multiplication"
+    //     by Granlund and Montgomery
+    // See also "Hacker's Delight", chapter 10 by Warren.
+
+    jint magic_const;
+    jint shift_const;
+    if (magic_int_divide_constants(d, magic_const, shift_const)) {
+      Node *magic = phase->longcon(magic_const);
+      Node *dividend_long = phase->transform(new (phase->C, 2) ConvI2LNode(dividend));
+
+      // Compute the high half of the dividend x magic multiplication
+      Node *mul_hi = phase->transform(new (phase->C, 3) MulLNode(dividend_long, magic));
+
+      if (magic_const < 0) {
+        mul_hi = phase->transform(new (phase->C, 3) RShiftLNode(mul_hi, phase->intcon(N)));
+        mul_hi = phase->transform(new (phase->C, 2) ConvL2INode(mul_hi));
+
+        // The magic multiplier is too large for a 32 bit constant. We've adjusted
+        // it down by 2^32, but have to add 1 dividend back in after the multiplication.
+        // This handles the "overflow" case described by Granlund and Montgomery.
+        mul_hi = phase->transform(new (phase->C, 3) AddINode(dividend, mul_hi));
+
+        // Shift over the (adjusted) mulhi
+        if (shift_const != 0) {
+          mul_hi = phase->transform(new (phase->C, 3) RShiftINode(mul_hi, phase->intcon(shift_const)));
+        }
+      } else {
+        // No add is required, we can merge the shifts together.
+        mul_hi = phase->transform(new (phase->C, 3) RShiftLNode(mul_hi, phase->intcon(N + shift_const)));
+        mul_hi = phase->transform(new (phase->C, 2) ConvL2INode(mul_hi));
+      }
+
+      // Get a 0 or -1 from the sign of the dividend.
+      Node *addend0 = mul_hi;
+      Node *addend1 = phase->transform(new (phase->C, 3) RShiftINode(dividend, phase->intcon(N-1)));
+
+      // If the divisor is negative, swap the order of the input addends;
+      // this has the effect of negating the quotient.
+      if (!d_pos) {
+        Node *temp = addend0; addend0 = addend1; addend1 = temp;
+      }
+
+      // Adjust the final quotient by subtracting -1 (adding 1)
+      // from the mul_hi.
+      q = new (phase->C, 3) SubINode(addend0, addend1);
+    }
+  }
+
+  return q;
+}
+
+//---------------------magic_long_divide_constants-----------------------------
+// Compute magic multiplier and shift constant for converting a 64 bit divide
+// by constant into a multiply/shift/add series. Return false if calculations
+// fail.
+//
+// Borrowed almost verbatum from Hacker's Delight by Henry S. Warren, Jr. with
+// minor type name and parameter changes.  Adjusted to 64 bit word width.
+static bool magic_long_divide_constants(jlong d, jlong &M, jint &s) {
+  int64_t p;
+  uint64_t ad, anc, delta, q1, r1, q2, r2, t;
+  const uint64_t two63 = 0x8000000000000000LL;     // 2**63.
+
+  ad = ABS(d);
+  if (d == 0 || d == 1) return false;
+  t = two63 + ((uint64_t)d >> 63);
+  anc = t - 1 - t%ad;     // Absolute value of nc.
+  p = 63;                 // Init. p.
+  q1 = two63/anc;         // Init. q1 = 2**p/|nc|.
+  r1 = two63 - q1*anc;    // Init. r1 = rem(2**p, |nc|).
+  q2 = two63/ad;          // Init. q2 = 2**p/|d|.
+  r2 = two63 - q2*ad;     // Init. r2 = rem(2**p, |d|).
+  do {
+    p = p + 1;
+    q1 = 2*q1;            // Update q1 = 2**p/|nc|.
+    r1 = 2*r1;            // Update r1 = rem(2**p, |nc|).
+    if (r1 >= anc) {      // (Must be an unsigned
+      q1 = q1 + 1;        // comparison here).
+      r1 = r1 - anc;
+    }
+    q2 = 2*q2;            // Update q2 = 2**p/|d|.
+    r2 = 2*r2;            // Update r2 = rem(2**p, |d|).
+    if (r2 >= ad) {       // (Must be an unsigned
+      q2 = q2 + 1;        // comparison here).
+      r2 = r2 - ad;
+    }
+    delta = ad - r2;
+  } while (q1 < delta || (q1 == delta && r1 == 0));
+
+  M = q2 + 1;
+  if (d < 0) M = -M;      // Magic number and
+  s = p - 64;             // shift amount to return.
+
+  return true;
+}
+
+//---------------------long_by_long_mulhi--------------------------------------
+// Generate ideal node graph for upper half of a 64 bit x 64 bit multiplication
+static Node *long_by_long_mulhi( PhaseGVN *phase, Node *dividend, jlong magic_const) {
+  // If the architecture supports a 64x64 mulhi, there is
+  // no need to synthesize it in ideal nodes.
+  if (Matcher::has_match_rule(Op_MulHiL)) {
+    Node *v = phase->longcon(magic_const);
+    return new (phase->C, 3) MulHiLNode(dividend, v);
   }
 
-  // division by something else
-  else if (m_high < (U1 << (N-1))) {
-    Node *t1 = phase->transform(new (phase->C, 2) ConvI2LNode(dividend));
-    Node *t2 = phase->transform(new (phase->C, 3) MulLNode(t1, phase->longcon(m_high)));
-    Node *t3 = phase->transform(new (phase->C, 3) RShiftLNode(t2, phase->intcon(sh_post+N)));
-    Node *t4 = phase->transform(new (phase->C, 2) ConvL2INode(t3));
-    Node *t5 = phase->transform(new (phase->C, 3) RShiftINode(dividend, phase->intcon(N-1)));
+  const int N = 64;
+
+  Node *u_hi = phase->transform(new (phase->C, 3) RShiftLNode(dividend, phase->intcon(N / 2)));
+  Node *u_lo = phase->transform(new (phase->C, 3) AndLNode(dividend, phase->longcon(0xFFFFFFFF)));
+
+  Node *v_hi = phase->longcon(magic_const >> N/2);
+  Node *v_lo = phase->longcon(magic_const & 0XFFFFFFFF);
+
+  Node *hihi_product = phase->transform(new (phase->C, 3) MulLNode(u_hi, v_hi));
+  Node *hilo_product = phase->transform(new (phase->C, 3) MulLNode(u_hi, v_lo));
+  Node *lohi_product = phase->transform(new (phase->C, 3) MulLNode(u_lo, v_hi));
+  Node *lolo_product = phase->transform(new (phase->C, 3) MulLNode(u_lo, v_lo));
+
+  Node *t1 = phase->transform(new (phase->C, 3) URShiftLNode(lolo_product, phase->intcon(N / 2)));
+  Node *t2 = phase->transform(new (phase->C, 3) AddLNode(hilo_product, t1));
+  Node *t3 = phase->transform(new (phase->C, 3) RShiftLNode(t2, phase->intcon(N / 2)));
+  Node *t4 = phase->transform(new (phase->C, 3) AndLNode(t2, phase->longcon(0xFFFFFFFF)));
+  Node *t5 = phase->transform(new (phase->C, 3) AddLNode(t4, lohi_product));
+  Node *t6 = phase->transform(new (phase->C, 3) RShiftLNode(t5, phase->intcon(N / 2)));
+  Node *t7 = phase->transform(new (phase->C, 3) AddLNode(t3, hihi_product));
+
+  return new (phase->C, 3) AddLNode(t7, t6);
+}
+
+
+//--------------------------transform_long_divide------------------------------
+// Convert a division by constant divisor into an alternate Ideal graph.
+// Return NULL if no transformation occurs.
+static Node *transform_long_divide( PhaseGVN *phase, Node *dividend, jlong divisor ) {
+  // Check for invalid divisors
+  assert( divisor != 0L && divisor != min_jlong,
+          "bad divisor for transforming to long multiply" );
+
+  bool d_pos = divisor >= 0;
+  jlong d = d_pos ? divisor : -divisor;
+  const int N = 64;
+
+  // Result
+  Node *q = NULL;
+
+  if (d == 1) {
+    // division by +/- 1
+    if (!d_pos) {
+      // Just negate the value
+      q = new (phase->C, 3) SubLNode(phase->longcon(0), dividend);
+    }
+  } else if ( is_power_of_2_long(d) ) {
+
+    // division by +/- a power of 2
+
+    // See if we can simply do a shift without rounding
+    bool needs_rounding = true;
+    const Type *dt = phase->type(dividend);
+    const TypeLong *dtl = dt->isa_long();
 
-    q = new (phase->C, 3) SubINode(d_pos ? t4 : t5, d_pos ? t5 : t4);
+    if (dtl && dtl->_lo > 0) {
+      // we don't need to round a positive dividend
+      needs_rounding = false;
+    } else if( dividend->Opcode() == Op_AndL ) {
+      // An AND mask of sufficient size clears the low bits and
+      // I can avoid rounding.
+      const TypeLong *andconl = phase->type( dividend->in(2) )->isa_long();
+      if( andconl && andconl->is_con(-d)) {
+        dividend = dividend->in(1);
+        needs_rounding = false;
+      }
+    }
+
+    // Add rounding to the shift to handle the sign bit
+    int l = log2_long(d-1)+1;
+    if (needs_rounding) {
+      // Divide-by-power-of-2 can be made into a shift, but you have to do
+      // more math for the rounding.  You need to add 0 for positive
+      // numbers, and "i-1" for negative numbers.  Example: i=4, so the
+      // shift is by 2.  You need to add 3 to negative dividends and 0 to
+      // positive ones.  So (-7+3)>>2 becomes -1, (-4+3)>>2 becomes -1,
+      // (-2+3)>>2 becomes 0, etc.
+
+      // Compute 0 or -1, based on sign bit
+      Node *sign = phase->transform(new (phase->C, 3) RShiftLNode(dividend, phase->intcon(N - 1)));
+      // Mask sign bit to the low sign bits
+      Node *round = phase->transform(new (phase->C, 3) URShiftLNode(sign, phase->intcon(N - l)));
+      // Round up before shifting
+      dividend = phase->transform(new (phase->C, 3) AddLNode(dividend, round));
+    }
+
+    // Shift for division
+    q = new (phase->C, 3) RShiftLNode(dividend, phase->intcon(l));
+
+    if (!d_pos) {
+      q = new (phase->C, 3) SubLNode(phase->longcon(0), phase->transform(q));
+    }
+  } else {
+    // Attempt the jlong constant divide -> multiply transform found in
+    //   "Division by Invariant Integers using Multiplication"
+    //     by Granlund and Montgomery
+    // See also "Hacker's Delight", chapter 10 by Warren.
+
+    jlong magic_const;
+    jint shift_const;
+    if (magic_long_divide_constants(d, magic_const, shift_const)) {
+      // Compute the high half of the dividend x magic multiplication
+      Node *mul_hi = phase->transform(long_by_long_mulhi(phase, dividend, magic_const));
+
+      // The high half of the 128-bit multiply is computed.
+      if (magic_const < 0) {
+        // The magic multiplier is too large for a 64 bit constant. We've adjusted
+        // it down by 2^64, but have to add 1 dividend back in after the multiplication.
+        // This handles the "overflow" case described by Granlund and Montgomery.
+        mul_hi = phase->transform(new (phase->C, 3) AddLNode(dividend, mul_hi));
+      }
+
+      // Shift over the (adjusted) mulhi
+      if (shift_const != 0) {
+        mul_hi = phase->transform(new (phase->C, 3) RShiftLNode(mul_hi, phase->intcon(shift_const)));
+      }
+
+      // Get a 0 or -1 from the sign of the dividend.
+      Node *addend0 = mul_hi;
+      Node *addend1 = phase->transform(new (phase->C, 3) RShiftLNode(dividend, phase->intcon(N-1)));
+
+      // If the divisor is negative, swap the order of the input addends;
+      // this has the effect of negating the quotient.
+      if (!d_pos) {
+        Node *temp = addend0; addend0 = addend1; addend1 = temp;
+      }
+
+      // Adjust the final quotient by subtracting -1 (adding 1)
+      // from the mul_hi.
+      q = new (phase->C, 3) SubLNode(addend0, addend1);
+    }
   }
 
-  // This handles that case where m_high is >= 2**(N-1). In that case,
-  // we subtract out 2**N from the multiply and add it in later as
-  // "dividend" in the equation (t5). This case computes the same result
-  // as the immediately preceeding case, save that rounding and overflow
-  // are accounted for.
-  else {
-    Node *t1 = phase->transform(new (phase->C, 2) ConvI2LNode(dividend));
-    Node *t2 = phase->transform(new (phase->C, 3) MulLNode(t1, phase->longcon(m_high - (U1 << N))));
-    Node *t3 = phase->transform(new (phase->C, 3) RShiftLNode(t2, phase->intcon(N)));
-    Node *t4 = phase->transform(new (phase->C, 2) ConvL2INode(t3));
-    Node *t5 = phase->transform(new (phase->C, 3) AddINode(dividend, t4));
-    Node *t6 = phase->transform(new (phase->C, 3) RShiftINode(t5, phase->intcon(sh_post)));
-    Node *t7 = phase->transform(new (phase->C, 3) RShiftINode(dividend, phase->intcon(N-1)));
-
-    q = new (phase->C, 3) SubINode(d_pos ? t6 : t7, d_pos ? t7 : t6);
-  }
-
-  return (q);
+  return q;
 }
 
 //=============================================================================
@@ -164,7 +404,7 @@
   const TypeInt *ti = t->isa_int();
   if( !ti ) return NULL;
   if( !ti->is_con() ) return NULL;
-  int i = ti->get_con();        // Get divisor
+  jint i = ti->get_con();       // Get divisor
 
   if (i == 0) return NULL;      // Dividing by zero constant does not idealize
 
@@ -173,7 +413,7 @@
   // Dividing by MININT does not optimize as a power-of-2 shift.
   if( i == min_jint ) return NULL;
 
-  return transform_int_divide_to_long_multiply( phase, in(1), i );
+  return transform_int_divide( phase, in(1), i );
 }
 
 //------------------------------Value------------------------------------------
@@ -255,85 +495,22 @@
   if (in(0) && remove_dead_region(phase, can_reshape))  return this;
 
   const Type *t = phase->type( in(2) );
-  if( t == TypeLong::ONE )       // Identity?
+  if( t == TypeLong::ONE )      // Identity?
     return NULL;                // Skip it
 
-  const TypeLong *ti = t->isa_long();
-  if( !ti ) return NULL;
-  if( !ti->is_con() ) return NULL;
-  jlong i = ti->get_con();      // Get divisor
-  if( i ) set_req(0, NULL);     // Dividing by a not-zero constant; no faulting
+  const TypeLong *tl = t->isa_long();
+  if( !tl ) return NULL;
+  if( !tl->is_con() ) return NULL;
+  jlong l = tl->get_con();      // Get divisor
+
+  if (l == 0) return NULL;      // Dividing by zero constant does not idealize
+
+  set_req(0,NULL);              // Dividing by a not-zero constant; no faulting
 
   // Dividing by MININT does not optimize as a power-of-2 shift.
-  if( i == min_jlong ) return NULL;
-
-  // Check for negative power of 2 divisor, if so, negate it and set a flag
-  // to indicate result needs to be negated.  Note that negating the dividend
-  // here does not work when it has the value MININT
-  Node *dividend = in(1);
-  bool negate_res = false;
-  if (is_power_of_2_long(-i)) {
-    i = -i;                     // Flip divisor
-    negate_res = true;
-  }
-
-  // Check for power of 2
-  if (!is_power_of_2_long(i))   // Is divisor a power of 2?
-    return NULL;                // Not a power of 2
-
-  // Compute number of bits to shift
-  int log_i = log2_long(i);
-
-  // See if we can simply do a shift without rounding
-  bool needs_rounding = true;
-  const Type *dt = phase->type(dividend);
-  const TypeLong *dtl = dt->isa_long();
+  if( l == min_jlong ) return NULL;
 
-  if (dtl && dtl->_lo > 0) {
-    // we don't need to round a positive dividend
-    needs_rounding = false;
-  } else if( dividend->Opcode() == Op_AndL ) {
-    // An AND mask of sufficient size clears the low bits and
-    // I can avoid rounding.
-    const TypeLong *andconi = phase->type( dividend->in(2) )->isa_long();
-    if( andconi &&
-        andconi->is_con() &&
-        andconi->get_con() == -i ) {
-      dividend = dividend->in(1);
-      needs_rounding = false;
-    }
-  }
-
-  if (!needs_rounding) {
-    Node *result = new (phase->C, 3) RShiftLNode(dividend, phase->intcon(log_i));
-    if (negate_res) {
-      result = phase->transform(result);
-      result = new (phase->C, 3) SubLNode(phase->longcon(0), result);
-    }
-    return result;
-  }
-
-  // Divide-by-power-of-2 can be made into a shift, but you have to do
-  // more math for the rounding.  You need to add 0 for positive
-  // numbers, and "i-1" for negative numbers.  Example: i=4, so the
-  // shift is by 2.  You need to add 3 to negative dividends and 0 to
-  // positive ones.  So (-7+3)>>2 becomes -1, (-4+3)>>2 becomes -1,
-  // (-2+3)>>2 becomes 0, etc.
-
-  // Compute 0 or -1, based on sign bit
-  Node *sign = phase->transform(new (phase->C, 3) RShiftLNode(dividend,phase->intcon(63)));
-  // Mask sign bit to the low sign bits
-  Node *round = phase->transform(new (phase->C, 3) AndLNode(sign,phase->longcon(i-1)));
-  // Round up before shifting
-  Node *sum = phase->transform(new (phase->C, 3) AddLNode(dividend,round));
-  // Shift for division
-  Node *result = new (phase->C, 3) RShiftLNode(sum, phase->intcon(log_i));
-  if (negate_res) {
-    result = phase->transform(result);
-    result = new (phase->C, 3) SubLNode(phase->longcon(0), result);
-  }
-
-  return result;
+  return transform_long_divide( phase, in(1), l );
 }
 
 //------------------------------Value------------------------------------------
@@ -615,10 +792,10 @@
       hook->init_req(0, x);       // Add a use to x to prevent him from dying
       // Generate code to reduce X rapidly to nearly 2^k-1.
       for( int i = 0; i < trip_count; i++ ) {
-          Node *xl = phase->transform( new (phase->C, 3) AndINode(x,divisor) );
-          Node *xh = phase->transform( new (phase->C, 3) RShiftINode(x,phase->intcon(k)) ); // Must be signed
-          x = phase->transform( new (phase->C, 3) AddINode(xh,xl) );
-          hook->set_req(0, x);
+        Node *xl = phase->transform( new (phase->C, 3) AndINode(x,divisor) );
+        Node *xh = phase->transform( new (phase->C, 3) RShiftINode(x,phase->intcon(k)) ); // Must be signed
+        x = phase->transform( new (phase->C, 3) AddINode(xh,xl) );
+        hook->set_req(0, x);
       }
 
       // Generate sign-fixup code.  Was original value positive?
@@ -675,18 +852,21 @@
   hook->init_req(0, in(1));
 
   // Divide using the transform from DivI to MulL
-  Node *divide = phase->transform( transform_int_divide_to_long_multiply( phase, in(1), pos_con ) );
+  Node *result = transform_int_divide( phase, in(1), pos_con );
+  if (result != NULL) {
+    Node *divide = phase->transform(result);
 
-  // Re-multiply, using a shift if this is a power of two
-  Node *mult = NULL;
+    // Re-multiply, using a shift if this is a power of two
+    Node *mult = NULL;
 
-  if( log2_con >= 0 )
-    mult = phase->transform( new (phase->C, 3) LShiftINode( divide, phase->intcon( log2_con ) ) );
-  else
-    mult = phase->transform( new (phase->C, 3) MulINode( divide, phase->intcon( pos_con ) ) );
+    if( log2_con >= 0 )
+      mult = phase->transform( new (phase->C, 3) LShiftINode( divide, phase->intcon( log2_con ) ) );
+    else
+      mult = phase->transform( new (phase->C, 3) MulINode( divide, phase->intcon( pos_con ) ) );
 
-  // Finally, subtract the multiplied divided value from the original
-  Node *result = new (phase->C, 3) SubINode( in(1), mult );
+    // Finally, subtract the multiplied divided value from the original
+    result = new (phase->C, 3) SubINode( in(1), mult );
+  }
 
   // Now remove the bogus extra edges used to keep things alive
   if (can_reshape) {
@@ -748,73 +928,126 @@
   // Get the modulus
   const Type *t = phase->type( in(2) );
   if( t == Type::TOP ) return NULL;
-  const TypeLong *ti = t->is_long();
+  const TypeLong *tl = t->is_long();
 
   // Check for useless control input
   // Check for excluding mod-zero case
-  if( in(0) && (ti->_hi < 0 || ti->_lo > 0) ) {
+  if( in(0) && (tl->_hi < 0 || tl->_lo > 0) ) {
     set_req(0, NULL);        // Yank control input
     return this;
   }
 
   // See if we are MOD'ing by 2^k or 2^k-1.
-  if( !ti->is_con() ) return NULL;
-  jlong con = ti->get_con();
-  bool m1 = false;
-  if( !is_power_of_2_long(con) ) {      // Not 2^k
-    if( !is_power_of_2_long(con+1) ) // Not 2^k-1?
-      return NULL;              // No interesting mod hacks
-    m1 = true;                  // Found 2^k-1
-    con++;                      // Convert to 2^k form
-  }
-  uint k = log2_long(con);       // Extract k
+  if( !tl->is_con() ) return NULL;
+  jlong con = tl->get_con();
+
+  Node *hook = new (phase->C, 1) Node(1);
 
   // Expand mod
-  if( !m1 ) {                   // Case 2^k
-  } else {                      // Case 2^k-1
+  if( con >= 0 && con < max_jlong && is_power_of_2_long(con+1) ) {
+    uint k = log2_long(con);       // Extract k
+
     // Basic algorithm by David Detlefs.  See fastmod_long.java for gory details.
     // Used to help a popular random number generator which does a long-mod
     // of 2^31-1 and shows up in SpecJBB and SciMark.
     static int unroll_factor[] = { 999, 999, 61, 30, 20, 15, 12, 10, 8, 7, 6, 6, 5, 5, 4, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 /*past here we assume 1 forever*/};
     int trip_count = 1;
     if( k < ARRAY_SIZE(unroll_factor)) trip_count = unroll_factor[k];
-    if( trip_count > 4 ) return NULL; // Too much unrolling
-    if (ConditionalMoveLimit == 0) return NULL;  // cmov is required
 
-    Node *x = in(1);            // Value being mod'd
-    Node *divisor = in(2);      // Also is mask
+    // If the unroll factor is not too large, and if conditional moves are
+    // ok, then use this case
+    if( trip_count <= 5 && ConditionalMoveLimit != 0 ) {
+      Node *x = in(1);            // Value being mod'd
+      Node *divisor = in(2);      // Also is mask
 
-    Node *hook = new (phase->C, 1) Node(x);
-    // Generate code to reduce X rapidly to nearly 2^k-1.
-    for( int i = 0; i < trip_count; i++ ) {
+      hook->init_req(0, x);       // Add a use to x to prevent him from dying
+      // Generate code to reduce X rapidly to nearly 2^k-1.
+      for( int i = 0; i < trip_count; i++ ) {
         Node *xl = phase->transform( new (phase->C, 3) AndLNode(x,divisor) );
         Node *xh = phase->transform( new (phase->C, 3) RShiftLNode(x,phase->intcon(k)) ); // Must be signed
         x = phase->transform( new (phase->C, 3) AddLNode(xh,xl) );
         hook->set_req(0, x);    // Add a use to x to prevent him from dying
+      }
+
+      // Generate sign-fixup code.  Was original value positive?
+      // long hack_res = (i >= 0) ? divisor : CONST64(1);
+      Node *cmp1 = phase->transform( new (phase->C, 3) CmpLNode( in(1), phase->longcon(0) ) );
+      Node *bol1 = phase->transform( new (phase->C, 2) BoolNode( cmp1, BoolTest::ge ) );
+      Node *cmov1= phase->transform( new (phase->C, 4) CMoveLNode(bol1, phase->longcon(1), divisor, TypeLong::LONG) );
+      // if( x >= hack_res ) x -= divisor;
+      Node *sub  = phase->transform( new (phase->C, 3) SubLNode( x, divisor ) );
+      Node *cmp2 = phase->transform( new (phase->C, 3) CmpLNode( x, cmov1 ) );
+      Node *bol2 = phase->transform( new (phase->C, 2) BoolNode( cmp2, BoolTest::ge ) );
+      // Convention is to not transform the return value of an Ideal
+      // since Ideal is expected to return a modified 'this' or a new node.
+      Node *cmov2= new (phase->C, 4) CMoveLNode(bol2, x, sub, TypeLong::LONG);
+      // cmov2 is now the mod
+
+      // Now remove the bogus extra edges used to keep things alive
+      if (can_reshape) {
+        phase->is_IterGVN()->remove_dead_node(hook);
+      } else {
+        hook->set_req(0, NULL);   // Just yank bogus edge during Parse phase
+      }
+      return cmov2;
     }
-    // Generate sign-fixup code.  Was original value positive?
-    // long hack_res = (i >= 0) ? divisor : CONST64(1);
-    Node *cmp1 = phase->transform( new (phase->C, 3) CmpLNode( in(1), phase->longcon(0) ) );
-    Node *bol1 = phase->transform( new (phase->C, 2) BoolNode( cmp1, BoolTest::ge ) );
-    Node *cmov1= phase->transform( new (phase->C, 4) CMoveLNode(bol1, phase->longcon(1), divisor, TypeLong::LONG) );
-    // if( x >= hack_res ) x -= divisor;
-    Node *sub  = phase->transform( new (phase->C, 3) SubLNode( x, divisor ) );
-    Node *cmp2 = phase->transform( new (phase->C, 3) CmpLNode( x, cmov1 ) );
-    Node *bol2 = phase->transform( new (phase->C, 2) BoolNode( cmp2, BoolTest::ge ) );
-    // Convention is to not transform the return value of an Ideal
-    // since Ideal is expected to return a modified 'this' or a new node.
-    Node *cmov2= new (phase->C, 4) CMoveLNode(bol2, x, sub, TypeLong::LONG);
-    // cmov2 is now the mod
+  }
+
+  // Fell thru, the unroll case is not appropriate. Transform the modulo
+  // into a long multiply/int multiply/subtract case
+
+  // Cannot handle mod 0, and min_jint isn't handled by the transform
+  if( con == 0 || con == min_jlong ) return NULL;
+
+  // Get the absolute value of the constant; at this point, we can use this
+  jlong pos_con = (con >= 0) ? con : -con;
+
+  // integer Mod 1 is always 0
+  if( pos_con == 1 ) return new (phase->C, 1) ConLNode(TypeLong::ZERO);
+
+  int log2_con = -1;
+
+  // If this is a power of two, they maybe we can mask it
+  if( is_power_of_2_long(pos_con) ) {
+    log2_con = log2_long(pos_con);
+
+    const Type *dt = phase->type(in(1));
+    const TypeLong *dtl = dt->isa_long();
+
+    // See if this can be masked, if the dividend is non-negative
+    if( dtl && dtl->_lo >= 0 )
+      return ( new (phase->C, 3) AndLNode( in(1), phase->longcon( pos_con-1 ) ) );
+  }
 
-    // Now remove the bogus extra edges used to keep things alive
-    if (can_reshape) {
-      phase->is_IterGVN()->remove_dead_node(hook);
-    } else {
-      hook->set_req(0, NULL);   // Just yank bogus edge during Parse phase
-    }
-    return cmov2;
+  // Save in(1) so that it cannot be changed or deleted
+  hook->init_req(0, in(1));
+
+  // Divide using the transform from DivI to MulL
+  Node *result = transform_long_divide( phase, in(1), pos_con );
+  if (result != NULL) {
+    Node *divide = phase->transform(result);
+
+    // Re-multiply, using a shift if this is a power of two
+    Node *mult = NULL;
+
+    if( log2_con >= 0 )
+      mult = phase->transform( new (phase->C, 3) LShiftLNode( divide, phase->intcon( log2_con ) ) );
+    else
+      mult = phase->transform( new (phase->C, 3) MulLNode( divide, phase->longcon( pos_con ) ) );
+
+    // Finally, subtract the multiplied divided value from the original
+    result = new (phase->C, 3) SubLNode( in(1), mult );
   }
-  return NULL;
+
+  // Now remove the bogus extra edges used to keep things alive
+  if (can_reshape) {
+    phase->is_IterGVN()->remove_dead_node(hook);
+  } else {
+    hook->set_req(0, NULL);       // Just yank bogus edge during Parse phase
+  }
+
+  // return the value
+  return result;
 }
 
 //------------------------------Value------------------------------------------
--- a/hotspot/src/share/vm/opto/mulnode.cpp	Tue Apr 29 19:45:22 2008 -0700
+++ b/hotspot/src/share/vm/opto/mulnode.cpp	Wed May 07 08:06:46 2008 -0700
@@ -365,6 +365,25 @@
 }
 
 //=============================================================================
+//------------------------------Value------------------------------------------
+const Type *MulHiLNode::Value( PhaseTransform *phase ) const {
+  // Either input is TOP ==> the result is TOP
+  const Type *t1 = phase->type( in(1) );
+  const Type *t2 = phase->type( in(2) );
+  if( t1 == Type::TOP ) return Type::TOP;
+  if( t2 == Type::TOP ) return Type::TOP;
+
+  // Either input is BOTTOM ==> the result is the local BOTTOM
+  const Type *bot = bottom_type();
+  if( (t1 == bot) || (t2 == bot) ||
+      (t1 == Type::BOTTOM) || (t2 == Type::BOTTOM) )
+    return bot;
+
+  // It is not worth trying to constant fold this stuff!
+  return TypeLong::LONG;
+}
+
+//=============================================================================
 //------------------------------mul_ring---------------------------------------
 // Supplied function returns the product of the inputs IN THE CURRENT RING.
 // For the logical operations the ring's MUL is really a logical AND function.
--- a/hotspot/src/share/vm/opto/mulnode.hpp	Tue Apr 29 19:45:22 2008 -0700
+++ b/hotspot/src/share/vm/opto/mulnode.hpp	Wed May 07 08:06:46 2008 -0700
@@ -133,6 +133,16 @@
   virtual uint ideal_reg() const { return Op_RegD; }
 };
 
+//-------------------------------MulHiLNode------------------------------------
+// Upper 64 bits of a 64 bit by 64 bit multiply
+class MulHiLNode : public Node {
+public:
+  MulHiLNode( Node *in1, Node *in2 ) : Node(0,in1,in2) {}
+  virtual int Opcode() const;
+  virtual const Type *Value( PhaseTransform *phase ) const;
+  const Type *bottom_type() const { return TypeLong::LONG; }
+  virtual uint ideal_reg() const { return Op_RegL; }
+};
 
 //------------------------------AndINode---------------------------------------
 // Logically AND 2 integers.  Included with the MUL nodes because it inherits
--- a/hotspot/src/share/vm/opto/type.hpp	Tue Apr 29 19:45:22 2008 -0700
+++ b/hotspot/src/share/vm/opto/type.hpp	Wed May 07 08:06:46 2008 -0700
@@ -442,6 +442,7 @@
 
   // Check for single integer
   int is_con() const { return _lo==_hi; }
+  bool is_con(int i) const { return is_con() && _lo == i; }
   jlong get_con() const { assert( is_con(), "" ); return _lo; }
 
   virtual bool        is_finite() const;  // Has a finite value
--- a/hotspot/src/share/vm/utilities/globalDefinitions.hpp	Tue Apr 29 19:45:22 2008 -0700
+++ b/hotspot/src/share/vm/utilities/globalDefinitions.hpp	Wed May 07 08:06:46 2008 -0700
@@ -890,7 +890,7 @@
     i++; p *= 2;
   }
   // p = 2^(i+1) && x < p (i.e., 2^i <= x < 2^(i+1))
-  // (if p = 0 then overflow occured and i = 31)
+  // (if p = 0 then overflow occured and i = 63)
   return i;
 }