6987135: Performance regression on Intel platform with 32-bits edition between 6u13 and 6u14.
Summary: Use hardware DIV instruction for long division by constant when it is faster than code with multiply.
Reviewed-by: never
--- a/hotspot/src/cpu/sparc/vm/sparc.ad Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad Tue Nov 02 09:00:37 2010 -0700
@@ -1843,6 +1843,12 @@
return can_be_java_arg(reg);
}
+bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
+ // Use the hardware SDIVX instruction when it is
+ // faster than code that uses multiply.
+ return VM_Version::has_fast_idiv();
+}
+
// Register for DIVI projection of divmodI
RegMask Matcher::divI_proj_mask() {
ShouldNotReachHere();
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Tue Nov 02 09:00:37 2010 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -80,7 +80,8 @@
FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
}
if (is_niagara1_plus()) {
- if (AllocatePrefetchStyle > 0 && FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
+ if (has_blk_init() && AllocatePrefetchStyle > 0 &&
+ FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
// Use BIS instruction for allocation prefetch.
FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3);
if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
@@ -118,16 +119,18 @@
#endif
char buf[512];
- jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s",
+ jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
(has_v8() ? ", has_v8" : ""),
(has_v9() ? ", has_v9" : ""),
(has_hardware_popc() ? ", popc" : ""),
(has_vis1() ? ", has_vis1" : ""),
(has_vis2() ? ", has_vis2" : ""),
+ (has_blk_init() ? ", has_blk_init" : ""),
(is_ultra3() ? ", is_ultra3" : ""),
(is_sun4v() ? ", is_sun4v" : ""),
(is_niagara1() ? ", is_niagara1" : ""),
(is_niagara1_plus() ? ", is_niagara1_plus" : ""),
+ (is_sparc64() ? ", is_sparc64" : ""),
(!has_hardware_mul32() ? ", no-mul32" : ""),
(!has_hardware_div32() ? ", no-div32" : ""),
(!has_hardware_fsmuld() ? ", no-fsmuld" : ""));
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.hpp Tue Nov 02 09:00:37 2010 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -33,7 +33,9 @@
v9_instructions = 5,
vis1_instructions = 6,
vis2_instructions = 7,
- sun4v_instructions = 8
+ sun4v_instructions = 8,
+ blk_init_instructions = 9,
+ fmaf_instructions = 10
};
enum Feature_Flag_Set {
@@ -49,6 +51,8 @@
vis1_instructions_m = 1 << vis1_instructions,
vis2_instructions_m = 1 << vis2_instructions,
sun4v_m = 1 << sun4v_instructions,
+ blk_init_instructions_m = 1 << blk_init_instructions,
+ fmaf_instructions_m = 1 << fmaf_instructions,
generic_v8_m = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m,
generic_v9_m = generic_v8_m | v9_instructions_m,
@@ -67,6 +71,7 @@
static int platform_features(int features);
static bool is_niagara1(int features) { return (features & sun4v_m) != 0; }
+ static bool is_sparc64(int features) { return (features & fmaf_instructions_m) != 0; }
static int maximum_niagara1_processor_count() { return 32; }
// Returns true if the platform is in the niagara line and
@@ -86,6 +91,7 @@
static bool has_hardware_popc() { return (_features & hardware_popc_m) != 0; }
static bool has_vis1() { return (_features & vis1_instructions_m) != 0; }
static bool has_vis2() { return (_features & vis2_instructions_m) != 0; }
+ static bool has_blk_init() { return (_features & blk_init_instructions_m) != 0; }
static bool supports_compare_and_exchange()
{ return has_v9(); }
@@ -93,8 +99,10 @@
static bool is_ultra3() { return (_features & ultra3_m) == ultra3_m; }
static bool is_sun4v() { return (_features & sun4v_m) != 0; }
static bool is_niagara1() { return is_niagara1(_features); }
+ static bool is_sparc64() { return is_sparc64(_features); }
static bool has_fast_fxtof() { return has_v9() && !is_ultra3(); }
+ static bool has_fast_idiv() { return is_niagara1_plus() || is_sparc64(); }
static const char* cpu_features() { return _features_str; }
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Tue Nov 02 09:00:37 2010 -0700
@@ -1288,7 +1288,7 @@
if (is8bit(value)) {
emit_byte(0x6B);
emit_byte(0xC0 | encode);
- emit_byte(value);
+ emit_byte(value & 0xFF);
} else {
emit_byte(0x69);
emit_byte(0xC0 | encode);
@@ -3903,7 +3903,7 @@
if (is8bit(value)) {
emit_byte(0x6B);
emit_byte(0xC0 | encode);
- emit_byte(value);
+ emit_byte(value & 0xFF);
} else {
emit_byte(0x69);
emit_byte(0xC0 | encode);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Tue Nov 02 09:00:37 2010 -0700
@@ -446,6 +446,10 @@
static bool supports_lzcnt() { return (_cpuFeatures & CPU_LZCNT) != 0; }
static bool supports_sse4a() { return (_cpuFeatures & CPU_SSE4A) != 0; }
+ // Intel Core and newer cpus have fast IDIV instruction (excluding Atom).
+ static bool has_fast_idiv() { return is_intel() && cpu_family() == 6 &&
+ supports_sse3() && _model != 0x1C; }
+
static bool supports_compare_and_exchange() { return true; }
static const char* cpu_features() { return _features_str; }
--- a/hotspot/src/cpu/x86/vm/x86_32.ad Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad Tue Nov 02 09:00:37 2010 -0700
@@ -1508,6 +1508,16 @@
return can_be_java_arg(reg);
}
+bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
+ // Use the hardware integer DIV instruction when
+ // it is faster than code that uses multiply,
+ // but only when the constant divisor fits into 32 bits
+ // (min_jint is excluded because its absolute value
+ // cannot be represented as a positive 32-bit int).
+ return VM_Version::has_fast_idiv() &&
+ (divisor == (int)divisor && divisor != min_jint);
+}
+
// Register for DIVI projection of divmodI
RegMask Matcher::divI_proj_mask() {
return EAX_REG_mask;
@@ -1546,6 +1556,9 @@
return true;
}
}
+ if (opc == Op_ConL && (n->get_long() & 0xFFFFFFFF00000000LL) == 0LL) {
+ return true;
+ }
return false;
}
@@ -2309,9 +2322,11 @@
enc_class move_long_big_shift_sign( eRegL dst, immI_32_63 cnt ) %{
emit_opcode( cbuf, 0x8B ); // Move
emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg));
- emit_d8(cbuf,$primary);
- emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
- emit_d8(cbuf,$cnt$$constant-32);
+ if( $cnt$$constant > 32 ) { // Shift, if not by zero
+ emit_d8(cbuf,$primary);
+ emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
+ emit_d8(cbuf,$cnt$$constant-32);
+ }
emit_d8(cbuf,$primary);
emit_rm(cbuf, 0x3, $secondary, HIGH_FROM_LOW($dst$$reg));
emit_d8(cbuf,31);
@@ -8842,6 +8857,103 @@
ins_pipe( pipe_slow );
%}
+// Divide Register Long (no special case since divisor != -1)
+instruct divL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
+ match(Set dst (DivL dst imm));
+ effect( TEMP tmp, TEMP tmp2, KILL cr );
+ ins_cost(1000);
+ format %{ "MOV $tmp,abs($imm) # ldiv EDX:EAX,$imm\n\t"
+ "CMP $tmp,EDX\n\t"
+ "JA,s fast\n\t"
+ "MOV $tmp2,EAX\n\t"
+ "MOV EAX,EDX\n\t"
+ "SAR EDX,31\n\t"
+ "IDIV $tmp\n\t"
+ "XCHG EAX,$tmp2 \n\t"
+ "IDIV $tmp\n\t"
+ "CDQ\n\t"
+ "ADD EDX,$tmp2\n\t"
+ "JMP,s done\n"
+ "fast:\n\t"
+ "IDIV $tmp\n\t"
+ "XOR EDX,EDX\n"
+ "done:\n\t"
+ "NEG EDX:EAX # if $imm < 0" %}
+ ins_encode %{
+ int con = (int)$imm$$constant;
+ assert(con != 0 && con != -1 && con != min_jint, "wrong divisor");
+ int pcon = (con > 0) ? con : -con;
+ Label Lfast, Ldone;
+
+ __ movl($tmp$$Register, pcon);
+ __ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register));
+ __ jccb(Assembler::above, Lfast);
+
+ __ movl($tmp2$$Register, $dst$$Register); // save
+ __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
+ __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // src sign
+ __ idivl($tmp$$Register);
+ __ xchgl($dst$$Register, $tmp2$$Register);
+ __ idivl($tmp$$Register);
+ __ cdql();
+ __ addl(HIGH_FROM_LOW($dst$$Register),$tmp2$$Register);
+ __ jmpb(Ldone);
+
+ __ bind(Lfast);
+ // fast path: src is positive and result fits into 32 bits
+ __ idivl($tmp$$Register);
+ __ xorl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
+
+ __ bind(Ldone);
+ if (con < 0) {
+ __ lneg(HIGH_FROM_LOW($dst$$Register), $dst$$Register);
+ }
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Remainder Register Long (remainder fits into 32 bits)
+instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
+ match(Set dst (ModL dst imm));
+ effect( TEMP tmp, TEMP tmp2, KILL cr );
+ ins_cost(1000);
+ format %{ "MOV $tmp,abs($imm) # lrem EDX:EAX,$imm\n\t"
+ "CMP $tmp,EDX\n\t"
+ "JA,s fast\n\t"
+ "MOV $tmp2,EAX\n\t"
+ "MOV EAX,EDX\n\t"
+ "SAR EDX,31\n\t"
+ "IDIV $tmp\n\t"
+ "MOV EAX,$tmp2\n"
+ "fast:\n\t"
+ "IDIV $tmp\n\t"
+ "MOV EAX,EDX\n\t"
+ "SAR EDX,31\n\t" %}
+ ins_encode %{
+ int con = (int)$imm$$constant;
+ assert(con != 0 && con != -1 && con != min_jint, "wrong divisor");
+ int pcon = (con > 0) ? con : -con;
+ Label Lfast;
+
+ __ movl($tmp$$Register, pcon);
+ __ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register));
+ __ jccb(Assembler::above, Lfast); // src is positive and result fits into 32 bit
+
+ __ movl($tmp2$$Register, $dst$$Register); // save
+ __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
+ __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // src sign
+ __ idivl($tmp$$Register);
+ __ movl($dst$$Register, $tmp2$$Register);
+
+ __ bind(Lfast);
+ __ idivl($tmp$$Register);
+ __ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
+ __ sarl(HIGH_FROM_LOW($dst$$Register), 31); // result sign
+
+ %}
+ ins_pipe( pipe_slow );
+%}
+
// Integer Shift Instructions
// Shift Left by one
instruct shlI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
--- a/hotspot/src/cpu/x86/vm/x86_64.ad Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad Tue Nov 02 09:00:37 2010 -0700
@@ -2065,6 +2065,13 @@
return can_be_java_arg(reg);
}
+bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
+ // In 64-bit mode, code that uses multiply when the
+ // divisor is constant is faster than the hardware
+ // DIV instruction (it uses MulHiL).
+ return false;
+}
+
// Register for DIVI projection of divmodI
RegMask Matcher::divI_proj_mask() {
return INT_RAX_REG_mask;
--- a/hotspot/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/os_cpu/solaris_sparc/vm/vm_version_solaris_sparc.cpp Tue Nov 02 09:00:37 2010 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -65,10 +65,6 @@
// getisax(2), SI_ARCHITECTURE_32, and SI_ARCHITECTURE_64 are
// supported on Solaris 10 and later.
if (os::Solaris::supports_getisax()) {
-#ifndef PRODUCT
- if (PrintMiscellaneous && Verbose)
- tty->print_cr("getisax(2) supported.");
-#endif
// Check 32-bit architecture.
do_sysinfo(SI_ARCHITECTURE_32, "sparc", &features, v8_instructions_m);
@@ -81,6 +77,11 @@
uint_t avn = os::Solaris::getisax(&av, 1);
assert(avn == 1, "should only return one av");
+#ifndef PRODUCT
+ if (PrintMiscellaneous && Verbose)
+ tty->print_cr("getisax(2) returned: " PTR32_FORMAT, av);
+#endif
+
if (av & AV_SPARC_MUL32) features |= hardware_mul32_m;
if (av & AV_SPARC_DIV32) features |= hardware_div32_m;
if (av & AV_SPARC_FSMULD) features |= hardware_fsmuld_m;
@@ -88,11 +89,22 @@
if (av & AV_SPARC_POPC) features |= hardware_popc_m;
if (av & AV_SPARC_VIS) features |= vis1_instructions_m;
if (av & AV_SPARC_VIS2) features |= vis2_instructions_m;
+
+ // The following values are not defined before Solaris 10,
+ // but Solaris 8 is used for jdk6 update builds.
+#ifndef AV_SPARC_ASI_BLK_INIT
+#define AV_SPARC_ASI_BLK_INIT 0x0080 /* ASI_BLK_INIT_xxx ASI */
+#endif
+#ifndef AV_SPARC_FMAF
+#define AV_SPARC_FMAF 0x0100 /* Sparc64 Fused Multiply-Add */
+#endif
+ if (av & AV_SPARC_ASI_BLK_INIT) features |= blk_init_instructions_m;
+ if (av & AV_SPARC_FMAF) features |= fmaf_instructions_m;
} else {
// getisax(2) failed, use the old legacy code.
#ifndef PRODUCT
if (PrintMiscellaneous && Verbose)
- tty->print_cr("getisax(2) not supported.");
+ tty->print_cr("getisax(2) is not supported.");
#endif
char tmp;
--- a/hotspot/src/share/vm/opto/divnode.cpp Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/share/vm/opto/divnode.cpp Tue Nov 02 09:00:37 2010 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -388,7 +388,8 @@
if (!d_pos) {
q = new (phase->C, 3) SubLNode(phase->longcon(0), phase->transform(q));
}
- } else {
+ } else if ( !Matcher::use_asm_for_ldiv_by_con(d) ) { // Use hardware DIV instruction when
+ // it is faster than code generated below.
// Attempt the jlong constant divide -> multiply transform found in
// "Division by Invariant Integers using Multiplication"
// by Granlund and Montgomery
@@ -558,7 +559,7 @@
set_req(0,NULL); // Dividing by a not-zero constant; no faulting
- // Dividing by MININT does not optimize as a power-of-2 shift.
+ // Dividing by MINLONG does not optimize as a power-of-2 shift.
if( l == min_jlong ) return NULL;
return transform_long_divide( phase, in(1), l );
@@ -1062,7 +1063,7 @@
// Fell thru, the unroll case is not appropriate. Transform the modulo
// into a long multiply/int multiply/subtract case
- // Cannot handle mod 0, and min_jint isn't handled by the transform
+ // Cannot handle mod 0, and min_jlong isn't handled by the transform
if( con == 0 || con == min_jlong ) return NULL;
// Get the absolute value of the constant; at this point, we can use this
@@ -1075,7 +1076,7 @@
// If this is a power of two, then maybe we can mask it
if( is_power_of_2_long(pos_con) ) {
- log2_con = log2_long(pos_con);
+ log2_con = exact_log2_long(pos_con);
const Type *dt = phase->type(in(1));
const TypeLong *dtl = dt->isa_long();
@@ -1088,7 +1089,7 @@
// Save in(1) so that it cannot be changed or deleted
hook->init_req(0, in(1));
- // Divide using the transform from DivI to MulL
+ // Divide using the transform from DivL to MulL
Node *result = transform_long_divide( phase, in(1), pos_con );
if (result != NULL) {
Node *divide = phase->transform(result);
--- a/hotspot/src/share/vm/opto/matcher.hpp Sat Oct 30 13:08:23 2010 -0700
+++ b/hotspot/src/share/vm/opto/matcher.hpp Tue Nov 02 09:00:37 2010 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -298,6 +298,10 @@
// Register for MODL projection of divmodL
static RegMask modL_proj_mask();
+ // Use the hardware DIV instruction when it is faster than
+ // code that uses multiply for division by a constant.
+ static bool use_asm_for_ldiv_by_con( jlong divisor );
+
static const RegMask method_handle_invoke_SP_save_mask();
// Java-Interpreter calling convention