6987135: Performance regression on Intel platform with 32-bits edition between 6u13 and 6u14.
authorkvn
Tue, 02 Nov 2010 09:00:37 -0700
changeset 7115 32300e243300
parent 7114 65d21c4c6337
child 7116 e24b7743e3d4
6987135: Performance regression on Intel platform with 32-bits edition between 6u13 and 6u14. Summary: Use hardware DIV instruction for long division by constant when it is faster than code with multiply. Reviewed-by: never
hotspot/src/cpu/sparc/vm/sparc.ad
hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.hpp
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp
hotspot/src/share/vm/opto/divnode.cpp
hotspot/src/share/vm/opto/matcher.hpp
--- a/hotspot/src/cpu/sparc/vm/sparc.ad	Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad	Tue Nov 02 09:00:37 2010 -0700
@@ -1843,6 +1843,12 @@
   return can_be_java_arg(reg);
 }
 
+bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
+  // Use hardware SDIVX instruction when it is
+  // faster than a code which use multiply.
+  return VM_Version::has_fast_idiv();
+}
+
 // Register for DIVI projection of divmodI
 RegMask Matcher::divI_proj_mask() {
   ShouldNotReachHere();
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Tue Nov 02 09:00:37 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -80,7 +80,8 @@
       FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
     }
     if (is_niagara1_plus()) {
-      if (AllocatePrefetchStyle > 0 && FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
+      if (has_blk_init() && AllocatePrefetchStyle > 0 &&
+          FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
         // Use BIS instruction for allocation prefetch.
         FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3);
         if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
@@ -118,16 +119,18 @@
 #endif
 
   char buf[512];
-  jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                (has_v8() ? ", has_v8" : ""),
                (has_v9() ? ", has_v9" : ""),
                (has_hardware_popc() ? ", popc" : ""),
                (has_vis1() ? ", has_vis1" : ""),
                (has_vis2() ? ", has_vis2" : ""),
+               (has_blk_init() ? ", has_blk_init" : ""),
                (is_ultra3() ? ", is_ultra3" : ""),
                (is_sun4v() ? ", is_sun4v" : ""),
                (is_niagara1() ? ", is_niagara1" : ""),
                (is_niagara1_plus() ? ", is_niagara1_plus" : ""),
+               (is_sparc64() ? ", is_sparc64" : ""),
                (!has_hardware_mul32() ? ", no-mul32" : ""),
                (!has_hardware_div32() ? ", no-div32" : ""),
                (!has_hardware_fsmuld() ? ", no-fsmuld" : ""));
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp	Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp	Tue Nov 02 09:00:37 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -33,7 +33,9 @@
     v9_instructions    = 5,
     vis1_instructions  = 6,
     vis2_instructions  = 7,
-    sun4v_instructions = 8
+    sun4v_instructions = 8,
+    blk_init_instructions = 9,
+    fmaf_instructions  = 10
   };
 
   enum Feature_Flag_Set {
@@ -49,6 +51,8 @@
     vis1_instructions_m = 1 << vis1_instructions,
     vis2_instructions_m = 1 << vis2_instructions,
     sun4v_m             = 1 << sun4v_instructions,
+    blk_init_instructions_m = 1 << blk_init_instructions,
+    fmaf_instructions_m = 1 << fmaf_instructions,
 
     generic_v8_m        = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m,
     generic_v9_m        = generic_v8_m | v9_instructions_m,
@@ -67,6 +71,7 @@
   static int  platform_features(int features);
 
   static bool is_niagara1(int features) { return (features & sun4v_m) != 0; }
+  static bool  is_sparc64(int features) { return (features & fmaf_instructions_m) != 0; }
 
   static int maximum_niagara1_processor_count() { return 32; }
   // Returns true if the platform is in the niagara line and
@@ -86,6 +91,7 @@
   static bool has_hardware_popc()       { return (_features & hardware_popc_m) != 0; }
   static bool has_vis1()                { return (_features & vis1_instructions_m) != 0; }
   static bool has_vis2()                { return (_features & vis2_instructions_m) != 0; }
+  static bool has_blk_init()            { return (_features & blk_init_instructions_m) != 0; }
 
   static bool supports_compare_and_exchange()
                                         { return has_v9(); }
@@ -93,8 +99,10 @@
   static bool is_ultra3()               { return (_features & ultra3_m) == ultra3_m; }
   static bool is_sun4v()                { return (_features & sun4v_m) != 0; }
   static bool is_niagara1()             { return is_niagara1(_features); }
+  static bool is_sparc64()              { return is_sparc64(_features); }
 
   static bool has_fast_fxtof()          { return has_v9() && !is_ultra3(); }
+  static bool has_fast_idiv()           { return is_niagara1_plus() || is_sparc64(); }
 
   static const char* cpu_features()     { return _features_str; }
 
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Tue Nov 02 09:00:37 2010 -0700
@@ -1288,7 +1288,7 @@
   if (is8bit(value)) {
     emit_byte(0x6B);
     emit_byte(0xC0 | encode);
-    emit_byte(value);
+    emit_byte(value & 0xFF);
   } else {
     emit_byte(0x69);
     emit_byte(0xC0 | encode);
@@ -3903,7 +3903,7 @@
   if (is8bit(value)) {
     emit_byte(0x6B);
     emit_byte(0xC0 | encode);
-    emit_byte(value);
+    emit_byte(value & 0xFF);
   } else {
     emit_byte(0x69);
     emit_byte(0xC0 | encode);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Tue Nov 02 09:00:37 2010 -0700
@@ -446,6 +446,10 @@
   static bool supports_lzcnt()    { return (_cpuFeatures & CPU_LZCNT) != 0; }
   static bool supports_sse4a()    { return (_cpuFeatures & CPU_SSE4A) != 0; }
 
+  // Intel Core and newer cpus have fast IDIV instruction (excluding Atom).
+  static bool has_fast_idiv()     { return is_intel() && cpu_family() == 6 &&
+                                           supports_sse3() && _model != 0x1C; }
+
   static bool supports_compare_and_exchange() { return true; }
 
   static const char* cpu_features()           { return _features_str; }
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Tue Nov 02 09:00:37 2010 -0700
@@ -1508,6 +1508,16 @@
   return can_be_java_arg(reg);
 }
 
+bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
+  // Use hardware integer DIV instruction when
+  // it is faster than a code which use multiply.
+  // Only when constant divisor fits into 32 bit
+  // (min_jint is excluded to get only correct
+  // positive 32 bit values from negative).
+  return VM_Version::has_fast_idiv() &&
+         (divisor == (int)divisor && divisor != min_jint);
+}
+
 // Register for DIVI projection of divmodI
 RegMask Matcher::divI_proj_mask() {
   return EAX_REG_mask;
@@ -1546,6 +1556,9 @@
       return true;
     }
   }
+  if (opc == Op_ConL && (n->get_long() & 0xFFFFFFFF00000000LL) == 0LL) {
+    return true;
+  }
   return false;
 }
 
@@ -2309,9 +2322,11 @@
   enc_class move_long_big_shift_sign( eRegL dst, immI_32_63 cnt ) %{
     emit_opcode( cbuf, 0x8B ); // Move
     emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg));
-    emit_d8(cbuf,$primary);
-    emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
-    emit_d8(cbuf,$cnt$$constant-32);
+    if( $cnt$$constant > 32 ) { // Shift, if not by zero
+      emit_d8(cbuf,$primary);
+      emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
+      emit_d8(cbuf,$cnt$$constant-32);
+    }
     emit_d8(cbuf,$primary);
     emit_rm(cbuf, 0x3, $secondary, HIGH_FROM_LOW($dst$$reg));
     emit_d8(cbuf,31);
@@ -8842,6 +8857,103 @@
   ins_pipe( pipe_slow );
 %}
 
+// Divide Register Long (no special case since divisor != -1)
+instruct divL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
+  match(Set dst (DivL dst imm));
+  effect( TEMP tmp, TEMP tmp2, KILL cr );
+  ins_cost(1000);
+  format %{ "MOV    $tmp,abs($imm) # ldiv EDX:EAX,$imm\n\t"
+            "CMP    $tmp,EDX\n\t"
+            "JA,s   fast\n\t"
+            "MOV    $tmp2,EAX\n\t"
+            "MOV    EAX,EDX\n\t"
+            "SAR    EDX,31\n\t"
+            "IDIV   $tmp\n\t"
+            "XCHG   EAX,$tmp2 \n\t"
+            "IDIV   $tmp\n\t"
+            "CDQ\n\t"
+            "ADD    EDX,$tmp2\n\t"
+            "JMP,s  done\n"
+    "fast:\n\t"
+            "IDIV   $tmp\n\t"
+            "XOR    EDX,EDX\n"
+    "done:\n\t"
+            "NEG    EDX:EAX # if $imm < 0" %}
+  ins_encode %{
+    int con = (int)$imm$$constant;
+    assert(con != 0 && con != -1 && con != min_jint, "wrong divisor");
+    int pcon = (con > 0) ? con : -con;
+    Label Lfast, Ldone;
+
+    __ movl($tmp$$Register, pcon);
+    __ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register));
+    __ jccb(Assembler::above, Lfast);
+
+    __ movl($tmp2$$Register, $dst$$Register); // save
+    __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
+    __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // src sign
+    __ idivl($tmp$$Register);
+    __ xchgl($dst$$Register, $tmp2$$Register);
+    __ idivl($tmp$$Register);
+    __ cdql();
+    __ addl(HIGH_FROM_LOW($dst$$Register),$tmp2$$Register);
+    __ jmpb(Ldone);
+
+    __ bind(Lfast);
+    // fast path: src is positive and result fits into 32 bit
+    __ idivl($tmp$$Register);
+    __ xorl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
+
+    __ bind(Ldone);
+    if (con < 0) {
+      __ lneg(HIGH_FROM_LOW($dst$$Register), $dst$$Register);
+    }
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// Remainder Register Long (remainder fit into 32 bits)
+instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
+  match(Set dst (ModL dst imm));
+  effect( TEMP tmp, TEMP tmp2, KILL cr );
+  ins_cost(1000);
+  format %{ "MOV    $tmp,abs($imm) # lrem EDX:EAX,$imm\n\t"
+            "CMP    $tmp,EDX\n\t"
+            "JA,s   fast\n\t"
+            "MOV    $tmp2,EAX\n\t"
+            "MOV    EAX,EDX\n\t"
+            "SAR    EDX,31\n\t"
+            "IDIV   $tmp\n\t"
+            "MOV    EAX,$tmp2\n"
+    "fast:\n\t"
+            "IDIV   $tmp\n\t"
+            "MOV    EAX,EDX\n\t"
+            "SAR    EDX,31\n\t" %}
+  ins_encode %{
+    int con = (int)$imm$$constant;
+    assert(con != 0 && con != -1 && con != min_jint, "wrong divisor");
+    int pcon = (con > 0) ? con : -con;
+    Label  Lfast;
+
+    __ movl($tmp$$Register, pcon);
+    __ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register));
+    __ jccb(Assembler::above, Lfast); // src is positive and result fits into 32 bit
+
+    __ movl($tmp2$$Register, $dst$$Register); // save
+    __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
+    __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // src sign
+    __ idivl($tmp$$Register);
+    __ movl($dst$$Register, $tmp2$$Register);
+
+    __ bind(Lfast);
+    __ idivl($tmp$$Register);
+    __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
+    __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // result sign
+
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // Integer Shift Instructions
 // Shift Left by one
 instruct shlI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Tue Nov 02 09:00:37 2010 -0700
@@ -2065,6 +2065,13 @@
   return can_be_java_arg(reg);
 }
 
+bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
+  // In 64 bit mode a code which use multiply when
+  // devisor is constant is faster than hardware
+  // DIV instruction (it uses MulHiL).
+  return false;
+}
+
 // Register for DIVI projection of divmodI
 RegMask Matcher::divI_proj_mask() {
   return INT_RAX_REG_mask;
--- a/hotspot/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp	Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp	Tue Nov 02 09:00:37 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -65,10 +65,6 @@
   // getisax(2), SI_ARCHITECTURE_32, and SI_ARCHITECTURE_64 are
   // supported on Solaris 10 and later.
   if (os::Solaris::supports_getisax()) {
-#ifndef PRODUCT
-    if (PrintMiscellaneous && Verbose)
-      tty->print_cr("getisax(2) supported.");
-#endif
 
     // Check 32-bit architecture.
     do_sysinfo(SI_ARCHITECTURE_32, "sparc", &features, v8_instructions_m);
@@ -81,6 +77,11 @@
     uint_t avn = os::Solaris::getisax(&av, 1);
     assert(avn == 1, "should only return one av");
 
+#ifndef PRODUCT
+    if (PrintMiscellaneous && Verbose)
+      tty->print_cr("getisax(2) returned: " PTR32_FORMAT, av);
+#endif
+
     if (av & AV_SPARC_MUL32)  features |= hardware_mul32_m;
     if (av & AV_SPARC_DIV32)  features |= hardware_div32_m;
     if (av & AV_SPARC_FSMULD) features |= hardware_fsmuld_m;
@@ -88,11 +89,22 @@
     if (av & AV_SPARC_POPC)   features |= hardware_popc_m;
     if (av & AV_SPARC_VIS)    features |= vis1_instructions_m;
     if (av & AV_SPARC_VIS2)   features |= vis2_instructions_m;
+
+    // Next values are not defined before Solaris 10
+    // but Solaris 8 is used for jdk6 update builds.
+#ifndef AV_SPARC_ASI_BLK_INIT
+#define AV_SPARC_ASI_BLK_INIT 0x0080  /* ASI_BLK_INIT_xxx ASI */
+#endif
+#ifndef AV_SPARC_FMAF
+#define AV_SPARC_FMAF 0x0100  /* Sparc64 Fused Multiply-Add */
+#endif
+    if (av & AV_SPARC_ASI_BLK_INIT) features |= blk_init_instructions_m;
+    if (av & AV_SPARC_FMAF)         features |= fmaf_instructions_m;
   } else {
     // getisax(2) failed, use the old legacy code.
 #ifndef PRODUCT
     if (PrintMiscellaneous && Verbose)
-      tty->print_cr("getisax(2) not supported.");
+      tty->print_cr("getisax(2) is not supported.");
 #endif
 
     char   tmp;
--- a/hotspot/src/share/vm/opto/divnode.cpp	Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/share/vm/opto/divnode.cpp	Tue Nov 02 09:00:37 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -388,7 +388,8 @@
     if (!d_pos) {
       q = new (phase->C, 3) SubLNode(phase->longcon(0), phase->transform(q));
     }
-  } else {
+  } else if ( !Matcher::use_asm_for_ldiv_by_con(d) ) { // Use hardware DIV instruction when
+                                                       // it is faster than code generated below.
     // Attempt the jlong constant divide -> multiply transform found in
     //   "Division by Invariant Integers using Multiplication"
     //     by Granlund and Montgomery
@@ -558,7 +559,7 @@
 
   set_req(0,NULL);              // Dividing by a not-zero constant; no faulting
 
-  // Dividing by MININT does not optimize as a power-of-2 shift.
+  // Dividing by MINLONG does not optimize as a power-of-2 shift.
   if( l == min_jlong ) return NULL;
 
   return transform_long_divide( phase, in(1), l );
@@ -1062,7 +1063,7 @@
   // Fell thru, the unroll case is not appropriate. Transform the modulo
   // into a long multiply/int multiply/subtract case
 
-  // Cannot handle mod 0, and min_jint isn't handled by the transform
+  // Cannot handle mod 0, and min_jlong isn't handled by the transform
   if( con == 0 || con == min_jlong ) return NULL;
 
   // Get the absolute value of the constant; at this point, we can use this
@@ -1075,7 +1076,7 @@
 
   // If this is a power of two, then maybe we can mask it
   if( is_power_of_2_long(pos_con) ) {
-    log2_con = log2_long(pos_con);
+    log2_con = exact_log2_long(pos_con);
 
     const Type *dt = phase->type(in(1));
     const TypeLong *dtl = dt->isa_long();
@@ -1088,7 +1089,7 @@
   // Save in(1) so that it cannot be changed or deleted
   hook->init_req(0, in(1));
 
-  // Divide using the transform from DivI to MulL
+  // Divide using the transform from DivL to MulL
   Node *result = transform_long_divide( phase, in(1), pos_con );
   if (result != NULL) {
     Node *divide = phase->transform(result);
--- a/hotspot/src/share/vm/opto/matcher.hpp	Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/share/vm/opto/matcher.hpp	Tue Nov 02 09:00:37 2010 -0700
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -298,6 +298,10 @@
   // Register for MODL projection of divmodL
   static RegMask modL_proj_mask();
 
+  // Use hardware DIV instruction when it is faster than
+  // a code which use multiply for division by constant.
+  static bool use_asm_for_ldiv_by_con( jlong divisor );
+
   static const RegMask method_handle_invoke_SP_save_mask();
 
   // Java-Interpreter calling convention