7063629: use cbcond in C2 generated code on T4
author: kvn
Thu, 11 Aug 2011 12:08:11 -0700
changeset 10264 6879f93d268d
parent 10263 fa58671dde31
child 10265 4c869854aebd
Summary: Use new short branch instruction in C2 generated code.
Reviewed-by: never
hotspot/src/cpu/sparc/vm/assembler_sparc.hpp
hotspot/src/cpu/sparc/vm/sparc.ad
hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/os_cpu/linux_x86/vm/linux_x86_32.ad
hotspot/src/os_cpu/linux_x86/vm/linux_x86_64.ad
hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_32.ad
hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad
hotspot/src/share/vm/adlc/formssel.cpp
hotspot/src/share/vm/adlc/output_h.cpp
hotspot/src/share/vm/opto/block.cpp
hotspot/src/share/vm/opto/block.hpp
hotspot/src/share/vm/opto/compile.hpp
hotspot/src/share/vm/opto/machnode.hpp
hotspot/src/share/vm/opto/matcher.hpp
hotspot/src/share/vm/opto/node.hpp
hotspot/src/share/vm/opto/output.cpp
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp	Thu Aug 11 12:08:11 2011 -0700
@@ -1192,6 +1192,8 @@
    assert(offset() == 0 || !cbcond_before(), "cbcond should not follow another cbcond");
   }
 
+public:
+
   bool use_cbcond(Label& L) {
     if (!UseCBCond || cbcond_before()) return false;
     intptr_t x = intptr_t(target_distance(L)) - intptr_t(pc());
@@ -1199,7 +1201,6 @@
     return is_simm(x, 12);
   }
 
-public:
   // Tells assembler you know that next instruction is delayed
   Assembler* delayed() {
 #ifdef CHECK_DELAY
@@ -1248,6 +1249,10 @@
   inline void bpr(RCondition c, bool a, Predict p, Register s1, address d, relocInfo::relocType rt = relocInfo::none);
   inline void bpr(RCondition c, bool a, Predict p, Register s1, Label& L);
 
+  // compare and branch
+  inline void cbcond(Condition c, CC cc, Register s1, Register s2, Label& L);
+  inline void cbcond(Condition c, CC cc, Register s1, int simm5, Label& L);
+
  protected: // use MacroAssembler::br instead
 
   // pp 138
@@ -1275,10 +1280,6 @@
   inline void cb( Condition c, bool a, address d, relocInfo::relocType rt = relocInfo::none );
   inline void cb( Condition c, bool a, Label& L );
 
-  // compare and branch
-  inline void cbcond(Condition c, CC cc, Register s1, Register s2, Label& L);
-  inline void cbcond(Condition c, CC cc, Register s1, int simm5, Label& L);
-
   // pp 149
 
   inline void call( address d,  relocInfo::relocType rt = relocInfo::runtime_call_type );
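
A standalone sketch (hypothetical type and field names) of the policy that use_cbcond() above encodes: cbcond is only emitted when the previous instruction is not itself a cbcond (cbcond_before() in the real code, per the assert above) and the target lies within a signed 12-bit byte displacement; the UseCBCond flag check is elided here.

    #include <cassert>
    #include <cstdint>

    static bool is_simm(intptr_t x, int nbits) {
      return -(intptr_t(1) << (nbits - 1)) <= x &&
              x < (intptr_t(1) << (nbits - 1));
    }

    struct AsmSketch {
      intptr_t pc;               // current emit position
      bool     last_was_cbcond;  // models cbcond_before()

      bool use_cbcond(intptr_t target) const {
        if (last_was_cbcond) return false;  // avoid back-to-back cbcond
        return is_simm(target - pc, 12);    // reachable by the short form?
      }
    };

    int main() {
      AsmSketch a = { 0x1000, false };
      assert(a.use_cbcond(0x1000 + 2047));  // in range, previous op ordinary
      a.last_was_cbcond = true;
      assert(!a.use_cbcond(0x1000 + 4));    // blocked: would be back to back
      return 0;
    }
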
--- a/hotspot/src/cpu/sparc/vm/sparc.ad	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad	Thu Aug 11 12:08:11 2011 -0700
@@ -1834,8 +1834,10 @@
 //
 // NOTE: If the platform does not provide any short branch variants, then
 //       this method should return false for offset 0.
-bool Matcher::is_short_branch_offset(int rule, int offset) {
-  return false;
+bool Matcher::is_short_branch_offset(int rule, int br_size, int offset) {
+  // The passed offset is relative to the address of the branch.
+  // No adjustment of the offset is needed.
+  return UseCBCond && Assembler::is_simm(offset, 12);
 }
 
 const bool Matcher::isSimpleConstant64(jlong value) {
@@ -3315,6 +3317,7 @@
 //----------Instruction Attributes---------------------------------------------
 ins_attrib ins_cost(DEFAULT_COST); // Required cost attribute
 ins_attrib ins_size(32);           // Required size attribute (in bits)
+ins_attrib ins_avoid_back_to_back(0); // instruction should not be generated back to back
 ins_attrib ins_short_branch(0);    // Required flag: is this instruction a
                                   // non-matching short branch variant of some
                                   // long branch?
@@ -3402,6 +3405,15 @@
   interface(CONST_INTER);
 %}
 
+// Integer Immediate: 5-bit
+operand immI5() %{
+  predicate(Assembler::is_simm(n->get_int(), 5));
+  match(ConI);
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 // Integer Immediate: 0-bit
 operand immI0() %{
   predicate(n->get_int() == 0);
@@ -3625,6 +3637,15 @@
   interface(CONST_INTER);
 %}
 
+// Integer Immediate: 5-bit
+operand immL5() %{
+  predicate(n->get_long() == (int)n->get_long() && Assembler::is_simm((int)n->get_long(), 5));
+  match(ConL);
+  op_cost(0);
+  format %{ %}
+  interface(CONST_INTER);
+%}
+
 // Long Immediate: 13-bit
 operand immL13() %{
   predicate((-4096L < n->get_long()) && (n->get_long() <= 4095L));
@@ -5157,6 +5178,42 @@
     MS  : R;
 %}
 
+// Compare and branch
+pipe_class cmp_br_reg_reg(Universe br, cmpOp cmp, iRegI src1, iRegI src2, label labl, flagsReg cr) %{
+    instruction_count(2); has_delay_slot;
+    cr    : E(write);
+    src1  : R(read);
+    src2  : R(read);
+    IALU  : R;
+    BR    : R;
+%}
+
+// Compare and branch
+pipe_class cmp_br_reg_imm(Universe br, cmpOp cmp, iRegI src1, immI13 src2, label labl, flagsReg cr) %{
+    instruction_count(2); has_delay_slot;
+    cr    : E(write);
+    src1  : R(read);
+    IALU  : R;
+    BR    : R;
+%}
+
+// Compare and branch using cbcond
+pipe_class cbcond_reg_reg(Universe br, cmpOp cmp, iRegI src1, iRegI src2, label labl) %{
+    single_instruction;
+    src1  : E(read);
+    src2  : E(read);
+    IALU  : R;
+    BR    : R;
+%}
+
+// Compare and branch using cbcond
+pipe_class cbcond_reg_imm(Universe br, cmpOp cmp, iRegI src1, immI5 src2, label labl) %{
+    single_instruction;
+    src1  : E(read);
+    IALU  : R;
+    BR    : R;
+%}
+
 pipe_class br_fcc(Universe br, cmpOpF cc, flagsReg cr, label labl) %{
     single_instruction_with_delay_slot;
     cr    : E(read);
@@ -9198,6 +9255,25 @@
   ins_pipe(br);
 %}
 
+// Direct Branch, short with no delay slot
+instruct branch_short(label labl) %{
+  match(Goto);
+  predicate(UseCBCond);
+  effect(USE labl);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "BA     $labl\t! short branch" %}
+  ins_encode %{ 
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ ba_short(*L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_imm);
+%}
+
 // Conditional Direct Branch
 instruct branchCon(cmpOp cmp, flagsReg icc, label labl) %{
   match(If cmp icc);
@@ -9211,50 +9287,11 @@
   ins_pipe(br_cc);
 %}
 
-// Branch-on-register tests all 64 bits.  We assume that values
-// in 64-bit registers always remains zero or sign extended
-// unless our code munges the high bits.  Interrupts can chop
-// the high order bits to zero or sign at any time.
-instruct branchCon_regI(cmpOp_reg cmp, iRegI op1, immI0 zero, label labl) %{
-  match(If cmp (CmpI op1 zero));
-  predicate(can_branch_register(_kids[0]->_leaf, _kids[1]->_leaf));
-  effect(USE labl);
-
-  size(8);
-  ins_cost(BRANCH_COST);
-  format %{ "BR$cmp   $op1,$labl" %}
-  ins_encode( enc_bpr( labl, cmp, op1 ) );
-  ins_pipe(br_reg);
-%}
-
-instruct branchCon_regP(cmpOp_reg cmp, iRegP op1, immP0 null, label labl) %{
-  match(If cmp (CmpP op1 null));
-  predicate(can_branch_register(_kids[0]->_leaf, _kids[1]->_leaf));
-  effect(USE labl);
-
-  size(8);
-  ins_cost(BRANCH_COST);
-  format %{ "BR$cmp   $op1,$labl" %}
-  ins_encode( enc_bpr( labl, cmp, op1 ) );
-  ins_pipe(br_reg);
-%}
-
-instruct branchCon_regL(cmpOp_reg cmp, iRegL op1, immL0 zero, label labl) %{
-  match(If cmp (CmpL op1 zero));
-  predicate(can_branch_register(_kids[0]->_leaf, _kids[1]->_leaf));
-  effect(USE labl);
-
-  size(8);
-  ins_cost(BRANCH_COST);
-  format %{ "BR$cmp   $op1,$labl" %}
-  ins_encode( enc_bpr( labl, cmp, op1 ) );
-  ins_pipe(br_reg);
-%}
-
 instruct branchConU(cmpOpU cmp, flagsRegU icc, label labl) %{
   match(If cmp icc);
   effect(USE labl);
 
+  ins_cost(BRANCH_COST);
   format %{ "BP$cmp  $icc,$labl" %}
   // Prim = bits 24-22, Secnd = bits 31-30
   ins_encode( enc_bp( labl, cmp, icc ) );
@@ -9321,6 +9358,506 @@
   ins_pipe(br_cc);
 %}
 
+// Compare and branch instructions
+instruct cmpI_reg_branch(cmpOp cmp, iRegI op1, iRegI op2, label labl, flagsReg icc) %{
+  match(If cmp (CmpI op1 op2));
+  effect(USE labl, KILL icc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,$op2\t! int\n\t"
+            "BP$cmp   $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, $op2$$Register);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::icc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_reg);
+%}
+
+instruct cmpI_imm_branch(cmpOp cmp, iRegI op1, immI5 op2, label labl, flagsReg icc) %{
+  match(If cmp (CmpI op1 op2));
+  effect(USE labl, KILL icc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,$op2\t! int\n\t"
+            "BP$cmp   $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, $op2$$constant);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::icc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_imm);
+%}
+
+instruct cmpU_reg_branch(cmpOpU cmp, iRegI op1, iRegI op2, label labl, flagsRegU icc) %{
+  match(If cmp (CmpU op1 op2));
+  effect(USE labl, KILL icc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,$op2\t! unsigned\n\t"
+            "BP$cmp  $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, $op2$$Register);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::icc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_reg);
+%}
+
+instruct cmpU_imm_branch(cmpOpU cmp, iRegI op1, immI5 op2, label labl, flagsRegU icc) %{
+  match(If cmp (CmpU op1 op2));
+  effect(USE labl, KILL icc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,$op2\t! unsigned\n\t"
+            "BP$cmp  $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, $op2$$constant);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::icc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_imm);
+%}
+
+instruct cmpL_reg_branch(cmpOp cmp, iRegL op1, iRegL op2, label labl, flagsRegL xcc) %{
+  match(If cmp (CmpL op1 op2));
+  effect(USE labl, KILL xcc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,$op2\t! long\n\t"
+            "BP$cmp   $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, $op2$$Register);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::xcc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_reg);
+%}
+
+instruct cmpL_imm_branch(cmpOp cmp, iRegL op1, immL5 op2, label labl, flagsRegL xcc) %{
+  match(If cmp (CmpL op1 op2));
+  effect(USE labl, KILL xcc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,$op2\t! long\n\t"
+            "BP$cmp   $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, $op2$$constant);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::xcc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_imm);
+%}
+
+// Compare Pointers and branch
+instruct cmpP_reg_branch(cmpOpP cmp, iRegP op1, iRegP op2, label labl, flagsRegP pcc) %{
+  match(If cmp (CmpP op1 op2));
+  effect(USE labl, KILL pcc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,$op2\t! ptr\n\t"
+            "B$cmp   $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, $op2$$Register);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::ptr_cc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_reg);
+%}
+
+instruct cmpP_null_branch(cmpOpP cmp, iRegP op1, immP0 null, label labl, flagsRegP pcc) %{
+  match(If cmp (CmpP op1 null));
+  effect(USE labl, KILL pcc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,0\t! ptr\n\t"
+            "B$cmp   $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, G0);
+    // bpr() is not used here since its branch distance is too short.
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::ptr_cc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_reg);
+%}
+
+instruct cmpN_reg_branch(cmpOp cmp, iRegN op1, iRegN op2, label labl, flagsReg icc) %{
+  match(If cmp (CmpN op1 op2));
+  effect(USE labl, KILL icc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,$op2\t! compressed ptr\n\t"
+            "BP$cmp   $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, $op2$$Register);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::icc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_reg);
+%}
+
+instruct cmpN_null_branch(cmpOp cmp, iRegN op1, immN0 null, label labl, flagsReg icc) %{
+  match(If cmp (CmpN op1 null));
+  effect(USE labl, KILL icc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,0\t! compressed ptr\n\t"
+            "BP$cmp   $labl" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, G0);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::icc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_reg);
+%}
+
+// Loop back branch
+instruct cmpI_reg_branchLoopEnd(cmpOp cmp, iRegI op1, iRegI op2, label labl, flagsReg icc) %{
+  match(CountedLoopEnd cmp (CmpI op1 op2));
+  effect(USE labl, KILL icc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,$op2\t! int\n\t"
+            "BP$cmp   $labl\t! Loop end" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, $op2$$Register);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::icc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_reg);
+%}
+
+instruct cmpI_imm_branchLoopEnd(cmpOp cmp, iRegI op1, immI5 op2, label labl, flagsReg icc) %{
+  match(CountedLoopEnd cmp (CmpI op1 op2));
+  effect(USE labl, KILL icc);
+
+  size(12);
+  ins_cost(BRANCH_COST);
+  format %{ "CMP    $op1,$op2\t! int\n\t"
+            "BP$cmp   $labl\t! Loop end" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    Assembler::Predict predict_taken =
+      cbuf.is_backward_branch(*L) ? Assembler::pt : Assembler::pn;
+    __ cmp($op1$$Register, $op2$$constant);
+    __ bp((Assembler::Condition)($cmp$$cmpcode), false, Assembler::icc, predict_taken, *L);
+    __ delayed()->nop();
+  %}
+  ins_pipe(cmp_br_reg_imm);
+%}
+
+// Short compare and branch instructions
+instruct cmpI_reg_branch_short(cmpOp cmp, iRegI op1, iRegI op2, label labl, flagsReg icc) %{
+  match(If cmp (CmpI op1 op2));
+  predicate(UseCBCond);
+  effect(USE labl, KILL icc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "CWB$cmp  $op1,$op2,$labl\t! int" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::icc, $op1$$Register, $op2$$Register, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_reg);
+%}
+
+instruct cmpI_imm_branch_short(cmpOp cmp, iRegI op1, immI5 op2, label labl, flagsReg icc) %{
+  match(If cmp (CmpI op1 op2));
+  predicate(UseCBCond);
+  effect(USE labl, KILL icc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "CWB$cmp  $op1,$op2,$labl\t! int" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::icc, $op1$$Register, $op2$$constant, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_imm);
+%}
+
+instruct cmpU_reg_branch_short(cmpOpU cmp, iRegI op1, iRegI op2, label labl, flagsRegU icc) %{
+  match(If cmp (CmpU op1 op2));
+  predicate(UseCBCond);
+  effect(USE labl, KILL icc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "CWB$cmp $op1,$op2,$labl\t! unsigned" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::icc, $op1$$Register, $op2$$Register, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_reg);
+%}
+
+instruct cmpU_imm_branch_short(cmpOpU cmp, iRegI op1, immI5 op2, label labl, flagsRegU icc) %{
+  match(If cmp (CmpU op1 op2));
+  predicate(UseCBCond);
+  effect(USE labl, KILL icc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "CWB$cmp $op1,$op2,$labl\t! unsigned" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::icc, $op1$$Register, $op2$$constant, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_imm);
+%}
+
+instruct cmpL_reg_branch_short(cmpOp cmp, iRegL op1, iRegL op2, label labl, flagsRegL xcc) %{
+  match(If cmp (CmpL op1 op2));
+  predicate(UseCBCond);
+  effect(USE labl, KILL xcc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "CXB$cmp  $op1,$op2,$labl\t! long" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::xcc, $op1$$Register, $op2$$Register, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_reg);
+%}
+
+instruct cmpL_imm_branch_short(cmpOp cmp, iRegL op1, immL5 op2, label labl, flagsRegL xcc) %{
+  match(If cmp (CmpL op1 op2));
+  predicate(UseCBCond);
+  effect(USE labl, KILL xcc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "CXB$cmp  $op1,$op2,$labl\t! long" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::xcc, $op1$$Register, $op2$$constant, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_imm);
+%}
+
+// Compare Pointers and branch
+instruct cmpP_reg_branch_short(cmpOpP cmp, iRegP op1, iRegP op2, label labl, flagsRegP pcc) %{
+  match(If cmp (CmpP op1 op2));
+  predicate(UseCBCond);
+  effect(USE labl, KILL pcc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+#ifdef _LP64
+  format %{ "CXB$cmp $op1,$op2,$labl\t! ptr" %}
+#else
+  format %{ "CWB$cmp $op1,$op2,$labl\t! ptr" %}
+#endif
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::ptr_cc, $op1$$Register, $op2$$Register, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_reg);
+%}
+
+instruct cmpP_null_branch_short(cmpOpP cmp, iRegP op1, immP0 null, label labl, flagsRegP pcc) %{
+  match(If cmp (CmpP op1 null));
+  predicate(UseCBCond);
+  effect(USE labl, KILL pcc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+#ifdef _LP64
+  format %{ "CXB$cmp $op1,0,$labl\t! ptr" %}
+#else
+  format %{ "CWB$cmp $op1,0,$labl\t! ptr" %}
+#endif
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::ptr_cc, $op1$$Register, G0, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_reg);
+%}
+
+instruct cmpN_reg_branch_short(cmpOp cmp, iRegN op1, iRegN op2, label labl, flagsReg icc) %{
+  match(If cmp (CmpN op1 op2));
+  predicate(UseCBCond);
+  effect(USE labl, KILL icc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "CWB$cmp  $op1,op2,$labl\t! compressed ptr" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::icc, $op1$$Register, $op2$$Register, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_reg);
+%}
+
+instruct cmpN_null_branch_short(cmpOp cmp, iRegN op1, immN0 null, label labl, flagsReg icc) %{
+  match(If cmp (CmpN op1 null));
+  predicate(UseCBCond);
+  effect(USE labl, KILL icc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "CWB$cmp  $op1,0,$labl\t! compressed ptr" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::icc, $op1$$Register, G0, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_reg);
+%}
+
+// Loop back branch
+instruct cmpI_reg_branchLoopEnd_short(cmpOp cmp, iRegI op1, iRegI op2, label labl, flagsReg icc) %{
+  match(CountedLoopEnd cmp (CmpI op1 op2));
+  predicate(UseCBCond);
+  effect(USE labl, KILL icc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "CWB$cmp  $op1,$op2,$labl\t! Loop end" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::icc, $op1$$Register, $op2$$Register, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_reg);
+%}
+
+instruct cmpI_imm_branchLoopEnd_short(cmpOp cmp, iRegI op1, immI5 op2, label labl, flagsReg icc) %{
+  match(CountedLoopEnd cmp (CmpI op1 op2));
+  predicate(UseCBCond);
+  effect(USE labl, KILL icc);
+
+  size(4);
+  ins_cost(BRANCH_COST);
+  format %{ "CWB$cmp  $op1,$op2,$labl\t! Loop end" %}
+  ins_encode %{
+    Label* L = $labl$$label;
+    assert(__ use_cbcond(*L), "back to back cbcond");
+    __ cbcond((Assembler::Condition)($cmp$$cmpcode), Assembler::icc, $op1$$Register, $op2$$constant, *L);
+  %}
+  ins_short_branch(1);
+  ins_avoid_back_to_back(1);
+  ins_pipe(cbcond_reg_imm);
+%}
+
+// Branch-on-register tests all 64 bits.  We assume that values
+// in 64-bit registers always remain zero or sign extended
+// unless our code munges the high bits.  Interrupts can chop
+// the high order bits to zero or sign at any time.
+instruct branchCon_regI(cmpOp_reg cmp, iRegI op1, immI0 zero, label labl) %{
+  match(If cmp (CmpI op1 zero));
+  predicate(can_branch_register(_kids[0]->_leaf, _kids[1]->_leaf));
+  effect(USE labl);
+
+  size(8);
+  ins_cost(BRANCH_COST);
+  format %{ "BR$cmp   $op1,$labl" %}
+  ins_encode( enc_bpr( labl, cmp, op1 ) );
+  ins_pipe(br_reg);
+%}
+
+instruct branchCon_regP(cmpOp_reg cmp, iRegP op1, immP0 null, label labl) %{
+  match(If cmp (CmpP op1 null));
+  predicate(can_branch_register(_kids[0]->_leaf, _kids[1]->_leaf));
+  effect(USE labl);
+
+  size(8);
+  ins_cost(BRANCH_COST);
+  format %{ "BR$cmp   $op1,$labl" %}
+  ins_encode( enc_bpr( labl, cmp, op1 ) );
+  ins_pipe(br_reg);
+%}
+
+instruct branchCon_regL(cmpOp_reg cmp, iRegL op1, immL0 zero, label labl) %{
+  match(If cmp (CmpL op1 zero));
+  predicate(can_branch_register(_kids[0]->_leaf, _kids[1]->_leaf));
+  effect(USE labl);
+
+  size(8);
+  ins_cost(BRANCH_COST);
+  format %{ "BR$cmp   $op1,$labl" %}
+  ins_encode( enc_bpr( labl, cmp, op1 ) );
+  ins_pipe(br_reg);
+%}
+
+
 // ============================================================================
 // Long Compare
 //
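
The immI5/immL5 operands introduced above restrict cbcond's immediate form to constants that fit a signed 5-bit field, and the size(12) long patterns (cmp + bp + delayed nop) shrink to a single size(4) cbcond word when the short form is selected. A minimal standalone check of that 5-bit range, restating is_simm for illustration:

    #include <cassert>
    #include <cstdint>

    static bool is_simm(intptr_t x, int nbits) {
      return -(intptr_t(1) << (nbits - 1)) <= x &&
              x < (intptr_t(1) << (nbits - 1));
    }

    int main() {
      assert(is_simm(15, 5));   // largest immediate cbcond can encode
      assert(is_simm(-16, 5));  // smallest
      assert(!is_simm(16, 5));  // needs the reg-reg form (or cmp+bp)
      return 0;
    }
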
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Thu Aug 11 12:08:11 2011 -0700
@@ -144,8 +144,13 @@
 
   // Currently not supported anywhere.
   FLAG_SET_DEFAULT(UseFPUForSpilling, false);
+
+  assert((InteriorEntryAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size");
 #endif
 
+  assert((CodeEntryAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size");
+  assert((OptoLoopAlignment % relocInfo::addr_unit()) == 0, "alignment is not a multiple of NOP size");
+
   char buf[512];
   jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                (has_v9() ? ", v9" : (has_v8() ? ", v8" : "")),
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Thu Aug 11 12:08:11 2011 -0700
@@ -1339,9 +1339,8 @@
   emit_operand(rax, dst);
 }
 
-void Assembler::jcc(Condition cc, Label& L, relocInfo::relocType rtype) {
-  InstructionMark im(this);
-  relocate(rtype);
+void Assembler::jcc(Condition cc, Label& L, bool maybe_short) {
+  InstructionMark im(this);
   assert((0 <= cc) && (cc < 16), "illegal cc");
   if (L.is_bound()) {
     address dst = target(L);
@@ -1350,7 +1349,7 @@
     const int short_size = 2;
     const int long_size = 6;
     intptr_t offs = (intptr_t)dst - (intptr_t)_code_pos;
-    if (rtype == relocInfo::none && is8bit(offs - short_size)) {
+    if (maybe_short && is8bit(offs - short_size)) {
       // 0111 tttn #8-bit disp
       emit_byte(0x70 | cc);
       emit_byte((offs - short_size) & 0xFF);
@@ -1399,7 +1398,7 @@
   emit_operand(rsp, adr);
 }
 
-void Assembler::jmp(Label& L, relocInfo::relocType rtype) {
+void Assembler::jmp(Label& L, bool maybe_short) {
   if (L.is_bound()) {
     address entry = target(L);
     assert(entry != NULL, "jmp most probably wrong");
@@ -1407,7 +1406,7 @@
     const int short_size = 2;
     const int long_size = 5;
     intptr_t offs = entry - _code_pos;
-    if (rtype == relocInfo::none && is8bit(offs - short_size)) {
+    if (maybe_short && is8bit(offs - short_size)) {
       emit_byte(0xEB);
       emit_byte((offs - short_size) & 0xFF);
     } else {
@@ -1420,7 +1419,6 @@
     // the forward jump will not run beyond 256 bytes, use jmpb to
     // force an 8-bit displacement.
     InstructionMark im(this);
-    relocate(rtype);
     L.add_patch_at(code(), locator());
     emit_byte(0xE9);
     emit_long(0);
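
For a bound label the patched jcc() picks between the 2-byte 0x7x disp8 encoding and the 6-byte 0x0F 0x8x disp32 encoding; the maybe_short flag replaces the old relocType test as the opt-out. A standalone model of that size decision (constants mirror the short_size/long_size locals above):

    #include <cassert>
    #include <cstdint>

    static bool is8bit(intptr_t x) { return -0x80 <= x && x < 0x80; }

    static int jcc_size(intptr_t branch_pc, intptr_t target, bool maybe_short) {
      const int short_size = 2, long_size = 6;
      intptr_t offs = target - branch_pc;
      // the disp8 is measured from the end of the short instruction
      return (maybe_short && is8bit(offs - short_size)) ? short_size : long_size;
    }

    int main() {
      assert(jcc_size(0x100, 0x100 + 129, true)  == 2);  // disp8 == 127
      assert(jcc_size(0x100, 0x100 + 130, true)  == 6);  // disp8 would be 128
      assert(jcc_size(0x100, 0x100 + 10,  false) == 6);  // forced long jump
      return 0;
    }
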
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Thu Aug 11 12:08:11 2011 -0700
@@ -1065,8 +1065,7 @@
   // Note: The same Label can be used for forward and backward branches
   // but it may be bound only once.
 
-  void jcc(Condition cc, Label& L,
-           relocInfo::relocType rtype = relocInfo::none);
+  void jcc(Condition cc, Label& L, bool maybe_short = true);
 
   // Conditional jump to a 8-bit offset to L.
   // WARNING: be very careful using this for forward jumps.  If the label is
@@ -1077,7 +1076,7 @@
   void jmp(Address entry);    // pc <- entry
 
   // Label operations & relative jumps (PPUM Appendix D)
-  void jmp(Label& L, relocInfo::relocType rtype = relocInfo::none);   // unconditional jump to L
+  void jmp(Label& L, bool maybe_short = true);   // unconditional jump to L
 
   void jmp(Register entry); // pc <- entry
 
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Thu Aug 11 12:08:11 2011 -0700
@@ -1369,7 +1369,12 @@
 //
 // NOTE: If the platform does not provide any short branch variants, then
 //       this method should return false for offset 0.
-bool Matcher::is_short_branch_offset(int rule, int offset) {
+bool Matcher::is_short_branch_offset(int rule, int br_size, int offset) {
+  // The passed offset is relative to the address of the branch.
+  // On x86 a branch displacement is calculated relative to the address
+  // of the next instruction.
+  offset -= br_size;
+
   // the short version of jmpConUCF2 contains multiple branches,
   // making the reach slightly less
   if (rule == jmpConUCF2_rule)
@@ -1713,18 +1718,6 @@
     else                               emit_d32(cbuf,con);
   %}
 
-  enc_class Lbl (label labl) %{ // GOTO
-    Label *l = $labl$$label;
-    emit_d32(cbuf, (l->loc_pos() - (cbuf.insts_size()+4)));
-  %}
-
-  enc_class LblShort (label labl) %{ // GOTO
-    Label *l = $labl$$label;
-    int disp = l->loc_pos() - (cbuf.insts_size()+1);
-    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
-    emit_d8(cbuf, disp);
-  %}
-
   enc_class OpcSReg (eRegI dst) %{    // BSWAP
     emit_cc(cbuf, $secondary, $dst$$reg );
   %}
@@ -1747,21 +1740,6 @@
     emit_rm(cbuf, 0x3, $secondary, $div$$reg );
   %}
 
-  enc_class Jcc (cmpOp cop, label labl) %{    // JCC
-    Label *l = $labl$$label;
-    $$$emit8$primary;
-    emit_cc(cbuf, $secondary, $cop$$cmpcode);
-    emit_d32(cbuf, (l->loc_pos() - (cbuf.insts_size()+4)));
-  %}
-
-  enc_class JccShort (cmpOp cop, label labl) %{    // JCC
-    Label *l = $labl$$label;
-    emit_cc(cbuf, $primary, $cop$$cmpcode);
-    int disp = l->loc_pos() - (cbuf.insts_size()+1);
-    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
-    emit_d8(cbuf, disp);
-  %}
-
   enc_class enc_cmov(cmpOp cop ) %{ // CMOV
     $$$emit8$primary;
     emit_cc(cbuf, $secondary, $cop$$cmpcode);
@@ -13055,8 +13033,10 @@
   ins_cost(300);
   format %{ "JMP    $labl" %}
   size(5);
-  opcode(0xE9);
-  ins_encode( OpcP, Lbl( labl ) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jmp(*L, false); // Always long jump
+  %}
   ins_pipe( pipe_jmp );
 %}
 
@@ -13068,8 +13048,10 @@
   ins_cost(300);
   format %{ "J$cop    $labl" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode( Jcc( cop, labl) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe( pipe_jcc );
 %}
 
@@ -13081,8 +13063,10 @@
   ins_cost(300);
   format %{ "J$cop    $labl\t# Loop end" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode( Jcc( cop, labl) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe( pipe_jcc );
 %}
 
@@ -13094,8 +13078,10 @@
   ins_cost(300);
   format %{ "J$cop,u  $labl\t# Loop end" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode( Jcc( cop, labl) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe( pipe_jcc );
 %}
 
@@ -13106,8 +13092,10 @@
   ins_cost(200);
   format %{ "J$cop,u  $labl\t# Loop end" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode( Jcc( cop, labl) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe( pipe_jcc );
 %}
 
@@ -13119,8 +13107,10 @@
   ins_cost(300);
   format %{ "J$cop,u  $labl" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode(Jcc(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe(pipe_jcc);
 %}
 
@@ -13131,8 +13121,10 @@
   ins_cost(200);
   format %{ "J$cop,u  $labl" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode(Jcc(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe(pipe_jcc);
 %}
 
@@ -13151,28 +13143,19 @@
       $$emit$$"done:"
     }
   %}
-  size(12);
-  opcode(0x0F, 0x80);
   ins_encode %{
     Label* l = $labl$$label;
-    $$$emit8$primary;
-    emit_cc(cbuf, $secondary, Assembler::parity);
-    int parity_disp = -1;
-    bool ok = false;
     if ($cop$$cmpcode == Assembler::notEqual) {
-       // the two jumps 6 bytes apart so the jump distances are too
-       parity_disp = l->loc_pos() - (cbuf.insts_size() + 4);
+      __ jcc(Assembler::parity, *l, false);
+      __ jcc(Assembler::notEqual, *l, false);
     } else if ($cop$$cmpcode == Assembler::equal) {
-       parity_disp = 6;
-       ok = true;
+      Label done;
+      __ jccb(Assembler::parity, done);
+      __ jcc(Assembler::equal, *l, false);
+      __ bind(done);
     } else {
        ShouldNotReachHere();
     }
-    emit_d32(cbuf, parity_disp);
-    $$$emit8$primary;
-    emit_cc(cbuf, $secondary, $cop$$cmpcode);
-    int disp = l->loc_pos() - (cbuf.insts_size() + 4);
-    emit_d32(cbuf, disp);
   %}
   ins_pipe(pipe_jcc);
 %}
@@ -13239,8 +13222,10 @@
   ins_cost(300);
   format %{ "JMP,s  $labl" %}
   size(2);
-  opcode(0xEB);
-  ins_encode( OpcP, LblShort( labl ) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jmpb(*L);
+  %}
   ins_pipe( pipe_jmp );
   ins_short_branch(1);
 %}
@@ -13253,8 +13238,10 @@
   ins_cost(300);
   format %{ "J$cop,s  $labl" %}
   size(2);
-  opcode(0x70);
-  ins_encode( JccShort( cop, labl) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe( pipe_jcc );
   ins_short_branch(1);
 %}
@@ -13267,8 +13254,10 @@
   ins_cost(300);
   format %{ "J$cop,s  $labl\t# Loop end" %}
   size(2);
-  opcode(0x70);
-  ins_encode( JccShort( cop, labl) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe( pipe_jcc );
   ins_short_branch(1);
 %}
@@ -13281,8 +13270,10 @@
   ins_cost(300);
   format %{ "J$cop,us $labl\t# Loop end" %}
   size(2);
-  opcode(0x70);
-  ins_encode( JccShort( cop, labl) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe( pipe_jcc );
   ins_short_branch(1);
 %}
@@ -13294,8 +13285,10 @@
   ins_cost(300);
   format %{ "J$cop,us $labl\t# Loop end" %}
   size(2);
-  opcode(0x70);
-  ins_encode( JccShort( cop, labl) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe( pipe_jcc );
   ins_short_branch(1);
 %}
@@ -13308,8 +13301,10 @@
   ins_cost(300);
   format %{ "J$cop,us $labl" %}
   size(2);
-  opcode(0x70);
-  ins_encode( JccShort( cop, labl) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe( pipe_jcc );
   ins_short_branch(1);
 %}
@@ -13321,8 +13316,10 @@
   ins_cost(300);
   format %{ "J$cop,us $labl" %}
   size(2);
-  opcode(0x70);
-  ins_encode( JccShort( cop, labl) );
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe( pipe_jcc );
   ins_short_branch(1);
 %}
@@ -13343,24 +13340,19 @@
     }
   %}
   size(4);
-  opcode(0x70);
   ins_encode %{
     Label* l = $labl$$label;
-    emit_cc(cbuf, $primary, Assembler::parity);
-    int parity_disp = -1;
     if ($cop$$cmpcode == Assembler::notEqual) {
-      parity_disp = l->loc_pos() - (cbuf.insts_size() + 1);
+      __ jccb(Assembler::parity, *l);
+      __ jccb(Assembler::notEqual, *l);
     } else if ($cop$$cmpcode == Assembler::equal) {
-      parity_disp = 2;
+      Label done;
+      __ jccb(Assembler::parity, done);
+      __ jccb(Assembler::equal, *l);
+      __ bind(done);
     } else {
-      ShouldNotReachHere();
-    }
-    emit_d8(cbuf, parity_disp);
-    emit_cc(cbuf, $primary, $cop$$cmpcode);
-    int disp = l->loc_pos() - (cbuf.insts_size() + 1);
-    emit_d8(cbuf, disp);
-    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
-    assert(-128 <= parity_disp && parity_disp <= 127, "Displacement too large for short jmp");
+       ShouldNotReachHere();
+    }
   %}
   ins_pipe(pipe_jcc);
   ins_short_branch(1);
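
The new br_size parameter exists because the compiler hands is_short_branch_offset() an offset measured from the branch's own address, while x86 displacements are measured from the next instruction; subtracting br_size re-bases the offset before the reach test. A worked model, ignoring the jmpConUCF2 special case the code notes:

    #include <cassert>

    static bool is8bit(int x) { return -0x80 <= x && x < 0x80; }

    static bool x86_is_short_branch_offset(int br_size, int offset) {
      offset -= br_size;  // re-base from branch start to next instruction
      return is8bit(offset);
    }

    int main() {
      assert(x86_is_short_branch_offset(2, 129));   // jccb reaches +127
      assert(!x86_is_short_branch_offset(2, 130));  // one byte too far
      assert(x86_is_short_branch_offset(2, -126));  // and back to -128
      return 0;
    }
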
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Thu Aug 11 12:08:11 2011 -0700
@@ -1966,7 +1966,12 @@
 //
 // NOTE: If the platform does not provide any short branch variants, then
 //       this method should return false for offset 0.
-bool Matcher::is_short_branch_offset(int rule, int offset) {
+bool Matcher::is_short_branch_offset(int rule, int br_size, int offset) {
+  // The passed offset is relative to the address of the branch.
+  // On x86 a branch displacement is calculated relative to the address
+  // of the next instruction.
+  offset -= br_size;
+
   // the short version of jmpConUCF2 contains multiple branches,
   // making the reach slightly less
   if (rule == jmpConUCF2_rule)
@@ -2426,22 +2431,6 @@
     }
   %}
 
-  enc_class Lbl(label labl)
-  %{
-    // GOTO
-    Label* l = $labl$$label;
-    emit_d32(cbuf, (l->loc_pos() - (cbuf.insts_size() + 4)));
-  %}
-
-  enc_class LblShort(label labl)
-  %{
-    // GOTO
-    Label* l = $labl$$label;
-    int disp = l->loc_pos() - (cbuf.insts_size() + 1);
-    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
-    emit_d8(cbuf, disp);
-  %}
-
   enc_class opc2_reg(rRegI dst)
   %{
     // BSWAP
@@ -2460,25 +2449,6 @@
     emit_rm(cbuf, 0x3, $secondary, $div$$reg & 7);
   %}
 
-  enc_class Jcc(cmpOp cop, label labl)
-  %{
-    // JCC
-    Label* l = $labl$$label;
-    $$$emit8$primary;
-    emit_cc(cbuf, $secondary, $cop$$cmpcode);
-    emit_d32(cbuf, (l->loc_pos() - (cbuf.insts_size() + 4)));
-  %}
-
-  enc_class JccShort (cmpOp cop, label labl)
-  %{
-  // JCC
-    Label *l = $labl$$label;
-    emit_cc(cbuf, $primary, $cop$$cmpcode);
-    int disp = l->loc_pos() - (cbuf.insts_size() + 1);
-    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
-    emit_d8(cbuf, disp);
-  %}
-
   enc_class enc_cmov(cmpOp cop)
   %{
     // CMOV
@@ -12011,8 +11981,10 @@
   ins_cost(300);
   format %{ "jmp     $labl" %}
   size(5);
-  opcode(0xE9);
-  ins_encode(OpcP, Lbl(labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jmp(*L, false); // Always long jump
+  %}
   ins_pipe(pipe_jmp);
 %}
 
@@ -12025,8 +11997,10 @@
   ins_cost(300);
   format %{ "j$cop     $labl" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode(Jcc(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe(pipe_jcc);
 %}
 
@@ -12039,8 +12013,10 @@
   ins_cost(300);
   format %{ "j$cop     $labl\t# loop end" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode(Jcc(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe(pipe_jcc);
 %}
 
@@ -12052,8 +12028,10 @@
   ins_cost(300);
   format %{ "j$cop,u   $labl\t# loop end" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode(Jcc(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe(pipe_jcc);
 %}
 
@@ -12064,8 +12042,10 @@
   ins_cost(200);
   format %{ "j$cop,u   $labl\t# loop end" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode(Jcc(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe(pipe_jcc);
 %}
 
@@ -12077,8 +12057,10 @@
   ins_cost(300);
   format %{ "j$cop,u  $labl" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode(Jcc(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe(pipe_jcc);
 %}
 
@@ -12089,8 +12071,10 @@
   ins_cost(200);
   format %{ "j$cop,u  $labl" %}
   size(6);
-  opcode(0x0F, 0x80);
-  ins_encode(Jcc(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jcc((Assembler::Condition)($cop$$cmpcode), *L, false); // Always long jump
+  %}
   ins_pipe(pipe_jcc);
 %}
 
@@ -12109,26 +12093,19 @@
       $$emit$$"done:"
     }
   %}
-  size(12);
-  opcode(0x0F, 0x80);
   ins_encode %{
     Label* l = $labl$$label;
-    $$$emit8$primary;
-    emit_cc(cbuf, $secondary, Assembler::parity);
-    int parity_disp = -1;
     if ($cop$$cmpcode == Assembler::notEqual) {
-       // the two jumps 6 bytes apart so the jump distances are too
-       parity_disp = l->loc_pos() - (cbuf.insts_size() + 4);
+      __ jcc(Assembler::parity, *l, false);
+      __ jcc(Assembler::notEqual, *l, false);
     } else if ($cop$$cmpcode == Assembler::equal) {
-       parity_disp = 6;
+      Label done;
+      __ jccb(Assembler::parity, done);
+      __ jcc(Assembler::equal, *l, false);
+      __ bind(done);
     } else {
        ShouldNotReachHere();
     }
-    emit_d32(cbuf, parity_disp);
-    $$$emit8$primary;
-    emit_cc(cbuf, $secondary, $cop$$cmpcode);
-    int disp = l->loc_pos() - (cbuf.insts_size() + 4);
-    emit_d32(cbuf, disp);
   %}
   ins_pipe(pipe_jcc);
 %}
@@ -12204,8 +12181,10 @@
   ins_cost(300);
   format %{ "jmp,s   $labl" %}
   size(2);
-  opcode(0xEB);
-  ins_encode(OpcP, LblShort(labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jmpb(*L);
+  %}
   ins_pipe(pipe_jmp);
   ins_short_branch(1);
 %}
@@ -12218,8 +12197,10 @@
   ins_cost(300);
   format %{ "j$cop,s   $labl" %}
   size(2);
-  opcode(0x70);
-  ins_encode(JccShort(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe(pipe_jcc);
   ins_short_branch(1);
 %}
@@ -12232,8 +12213,10 @@
   ins_cost(300);
   format %{ "j$cop,s   $labl\t# loop end" %}
   size(2);
-  opcode(0x70);
-  ins_encode(JccShort(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe(pipe_jcc);
   ins_short_branch(1);
 %}
@@ -12246,8 +12229,10 @@
   ins_cost(300);
   format %{ "j$cop,us  $labl\t# loop end" %}
   size(2);
-  opcode(0x70);
-  ins_encode(JccShort(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe(pipe_jcc);
   ins_short_branch(1);
 %}
@@ -12259,8 +12244,10 @@
   ins_cost(300);
   format %{ "j$cop,us  $labl\t# loop end" %}
   size(2);
-  opcode(0x70);
-  ins_encode(JccShort(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe(pipe_jcc);
   ins_short_branch(1);
 %}
@@ -12273,8 +12260,10 @@
   ins_cost(300);
   format %{ "j$cop,us  $labl" %}
   size(2);
-  opcode(0x70);
-  ins_encode(JccShort(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe(pipe_jcc);
   ins_short_branch(1);
 %}
@@ -12286,8 +12275,10 @@
   ins_cost(300);
   format %{ "j$cop,us  $labl" %}
   size(2);
-  opcode(0x70);
-  ins_encode(JccShort(cop, labl));
+  ins_encode %{
+    Label* L = $labl$$label;
+    __ jccb((Assembler::Condition)($cop$$cmpcode), *L);
+  %}
   ins_pipe(pipe_jcc);
   ins_short_branch(1);
 %}
@@ -12308,24 +12299,19 @@
     }
   %}
   size(4);
-  opcode(0x70);
   ins_encode %{
     Label* l = $labl$$label;
-    emit_cc(cbuf, $primary, Assembler::parity);
-    int parity_disp = -1;
     if ($cop$$cmpcode == Assembler::notEqual) {
-      parity_disp = l->loc_pos() - (cbuf.insts_size() + 1);
+      __ jccb(Assembler::parity, *l);
+      __ jccb(Assembler::notEqual, *l);
     } else if ($cop$$cmpcode == Assembler::equal) {
-      parity_disp = 2;
+      Label done;
+      __ jccb(Assembler::parity, done);
+      __ jccb(Assembler::equal, *l);
+      __ bind(done);
     } else {
-      ShouldNotReachHere();
-    }
-    emit_d8(cbuf, parity_disp);
-    emit_cc(cbuf, $primary, $cop$$cmpcode);
-    int disp = l->loc_pos() - (cbuf.insts_size() + 1);
-    emit_d8(cbuf, disp);
-    assert(-128 <= disp && disp <= 127, "Displacement too large for short jmp");
-    assert(-128 <= parity_disp && parity_disp <= 127, "Displacement too large for short jmp");
+       ShouldNotReachHere();
+    }
   %}
   ins_pipe(pipe_jcc);
   ins_short_branch(1);
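
The rewritten jmpConUCF2 encodings (32- and 64-bit alike) handle the parity flag, which an x86 unordered floating-point compare sets for NaN operands: for != the branch must also be taken when unordered, for == it must not be. A control-flow model of what the emitted jump pairs compute (not the bytes themselves):

    #include <cassert>

    enum Cmp { kNotEqual, kEqual };

    // parity == true models an unordered (NaN) compare result.
    static bool takes_branch(Cmp cop, bool parity, bool cc_matches) {
      if (cop == kNotEqual) return parity || cc_matches;  // jcc P; jcc NE
      return !parity && cc_matches;          // jccb P, done; jcc E; done:
    }

    int main() {
      assert(takes_branch(kNotEqual, true,  false));  // NaN: != is true
      assert(!takes_branch(kEqual,   true,  true));   // NaN: == is false
      assert(takes_branch(kEqual,    false, true));   // ordered equal
      return 0;
    }
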
--- a/hotspot/src/os_cpu/linux_x86/vm/linux_x86_32.ad	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/os_cpu/linux_x86/vm/linux_x86_32.ad	Thu Aug 11 12:08:11 2011 -0700
@@ -154,7 +154,7 @@
 
 
 uint MachBreakpointNode::size(PhaseRegAlloc *ra_) const {
-  return 5;
+  return MachNode::size(ra_);
 }
 
 %}
--- a/hotspot/src/os_cpu/linux_x86/vm/linux_x86_64.ad	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/os_cpu/linux_x86/vm/linux_x86_64.ad	Thu Aug 11 12:08:11 2011 -0700
@@ -167,7 +167,8 @@
 }
 
 uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const {
-  return 5;
+  // the distance could be far, requiring a load and a call through a register
+  return MachNode::size(ra_);
 }
 
 %}
--- a/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_32.ad	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_32.ad	Thu Aug 11 12:08:11 2011 -0700
@@ -161,7 +161,7 @@
 
 
 uint MachBreakpointNode::size(PhaseRegAlloc *ra_) const {
-  return 5;
+  return MachNode::size(ra_);
 }
 
 %}
--- a/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_64.ad	Thu Aug 11 12:08:11 2011 -0700
@@ -180,7 +180,8 @@
 
 uint MachBreakpointNode::size(PhaseRegAlloc* ra_) const
 {
-  return 5;
+  // the distance could be far, requiring a load and a call through a register
+  return MachNode::size(ra_);
 }
 
 %}
--- a/hotspot/src/share/vm/adlc/formssel.cpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/share/vm/adlc/formssel.cpp	Thu Aug 11 12:08:11 2011 -0700
@@ -1181,6 +1181,34 @@
       strcmp(reduce_result(), short_branch->reduce_result()) == 0 &&
       _matrule->equivalent(AD.globalNames(), short_branch->_matrule)) {
     // The instructions are equivalent.
+
+    // Now verify that both instructions have the same parameters and
+    // the same effects. Both branch forms should have the same inputs
+    // and resulting projections to correctly replace a long branch node
+    // with the corresponding short branch node during code generation.
+
+    bool different = false;
+    if (short_branch->_components.count() != _components.count()) {
+       different = true;
+    } else if (_components.count() > 0) {
+      short_branch->_components.reset();
+      _components.reset();
+      Component *comp;
+      while ((comp = _components.iter()) != NULL) {
+        Component *short_comp = short_branch->_components.iter();
+        if (short_comp == NULL ||
+            short_comp->_type != comp->_type ||
+            short_comp->_usedef != comp->_usedef) {
+          different = true;
+          break;
+        }
+      }
+      if (short_branch->_components.iter() != NULL)
+        different = true;
+    }
+    if (different) {
+      globalAD->syntax_err(short_branch->_linenum, "Instruction %s and its short form %s have different parameters\n", _ident, short_branch->_ident);
+    }
     if (AD._short_branch_debug) {
       fprintf(stderr, "Instruction %s has short form %s\n", _ident, short_branch->_ident);
     }
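
The ADLC check above insists that a long branch and its short form declare identical component lists (same operands, same USE/DEF/KILL effects) so the scheduler can swap one node for the other without rewiring edges. A standalone model of that comparison, with simplified types:

    #include <cassert>
    #include <cstring>
    #include <vector>

    struct Component { const char* type; int usedef; };

    static bool same_components(const std::vector<Component>& a,
                                const std::vector<Component>& b) {
      if (a.size() != b.size()) return false;
      for (size_t i = 0; i < a.size(); i++) {
        if (strcmp(a[i].type, b[i].type) != 0 || a[i].usedef != b[i].usedef)
          return false;
      }
      return true;
    }

    int main() {
      std::vector<Component> long_br  = { {"cmpOp", 1}, {"label", 1},
                                          {"flagsReg", 2} };
      std::vector<Component> short_br = long_br;
      assert(same_components(long_br, short_br));
      short_br.pop_back();  // short form missing the KILL: must be rejected
      assert(!same_components(long_br, short_br));
      return 0;
    }
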
--- a/hotspot/src/share/vm/adlc/output_h.cpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/share/vm/adlc/output_h.cpp	Thu Aug 11 12:08:11 2011 -0700
@@ -1536,12 +1536,16 @@
     // Each instruction attribute results in a virtual call of same name.
     // The ins_cost is not handled here.
     Attribute *attr = instr->_attribs;
+    bool avoid_back_to_back = false;
     while (attr != NULL) {
       if (strcmp(attr->_ident,"ins_cost") &&
           strcmp(attr->_ident,"ins_short_branch")) {
         fprintf(fp,"  int             %s() const { return %s; }\n",
                 attr->_ident, attr->_val);
       }
+      // Check value for ins_avoid_back_to_back, and if it is true (1), set the flag
+      if (!strcmp(attr->_ident,"ins_avoid_back_to_back") && attr->int_val(*this) != 0)
+        avoid_back_to_back = true;
       attr = (Attribute *)attr->_next;
     }
 
@@ -1704,6 +1708,16 @@
       }
     }
 
+    // flag: if this instruction should not be generated back to back.
+    if ( avoid_back_to_back ) {
+      if ( node_flags_set ) {
+        fprintf(fp," | Flag_avoid_back_to_back");
+      } else {
+        fprintf(fp,"init_flags(Flag_avoid_back_to_back");
+        node_flags_set = true;
+      }
+    }
+
     // Check if machine instructions that USE memory, but do not DEF memory,
     // depend upon a node that defines memory in machine-independent graph.
     if ( instr->needs_anti_dependence_check(_globalNames) ) {
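
The generator walks each instruct's attribute list once; a non-zero ins_avoid_back_to_back later selects the Flag_avoid_back_to_back bit in the emitted init_flags() call. A standalone model of the scan, with Attribute reduced to what the loop reads:

    #include <cassert>
    #include <cstring>

    struct Attribute { const char* ident; int val; const Attribute* next; };

    static bool wants_avoid_back_to_back(const Attribute* attr) {
      for (; attr != NULL; attr = attr->next) {
        if (strcmp(attr->ident, "ins_avoid_back_to_back") == 0 && attr->val != 0)
          return true;  // flag the instruction for the code emitter
      }
      return false;
    }

    int main() {
      Attribute cost = { "ins_cost", 300, NULL };
      Attribute abb  = { "ins_avoid_back_to_back", 1, &cost };
      assert(wants_avoid_back_to_back(&abb));
      assert(!wants_avoid_back_to_back(&cost));
      return 0;
    }
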
--- a/hotspot/src/share/vm/opto/block.cpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/block.cpp	Thu Aug 11 12:08:11 2011 -0700
@@ -80,35 +80,37 @@
 
 uint Block::code_alignment() {
   // Check for Root block
-  if( _pre_order == 0 ) return CodeEntryAlignment;
+  if (_pre_order == 0) return CodeEntryAlignment;
   // Check for Start block
-  if( _pre_order == 1 ) return InteriorEntryAlignment;
+  if (_pre_order == 1) return InteriorEntryAlignment;
   // Check for loop alignment
-  if (has_loop_alignment())  return loop_alignment();
+  if (has_loop_alignment()) return loop_alignment();
 
-  return 1;                     // no particular alignment
+  return relocInfo::addr_unit(); // no particular alignment
 }
 
 uint Block::compute_loop_alignment() {
   Node *h = head();
-  if( h->is_Loop() && h->as_Loop()->is_inner_loop() )  {
+  int unit_sz = relocInfo::addr_unit();
+  if (h->is_Loop() && h->as_Loop()->is_inner_loop())  {
     // Pre- and post-loops have low trip count so do not bother with
     // NOPs for align loop head.  The constants are hidden from tuning
     // but only because my "divide by 4" heuristic surely gets nearly
     // all possible gain (a "do not align at all" heuristic has a
     // chance of getting a really tiny gain).
-    if( h->is_CountedLoop() && (h->as_CountedLoop()->is_pre_loop() ||
-                                h->as_CountedLoop()->is_post_loop()) )
-      return (OptoLoopAlignment > 4) ? (OptoLoopAlignment>>2) : 1;
+    if (h->is_CountedLoop() && (h->as_CountedLoop()->is_pre_loop() ||
+                                h->as_CountedLoop()->is_post_loop())) {
+      return (OptoLoopAlignment > 4*unit_sz) ? (OptoLoopAlignment>>2) : unit_sz;
+    }
     // Loops with low backedge frequency should not be aligned.
     Node *n = h->in(LoopNode::LoopBackControl)->in(0);
-    if( n->is_MachIf() && n->as_MachIf()->_prob < 0.01 ) {
-      return 1;             // Loop does not loop, more often than not!
+    if (n->is_MachIf() && n->as_MachIf()->_prob < 0.01) {
+      return unit_sz; // Loop does not loop, more often than not!
     }
     return OptoLoopAlignment; // Otherwise align loop head
   }
 
-  return 1;                     // no particular alignment
+  return unit_sz; // no particular alignment
 }
 
 //-----------------------------------------------------------------------------
@@ -271,55 +273,55 @@
 
 //------------------------------dump-------------------------------------------
 #ifndef PRODUCT
-void Block::dump_bidx(const Block* orig) const {
-  if (_pre_order) tty->print("B%d",_pre_order);
-  else tty->print("N%d", head()->_idx);
+void Block::dump_bidx(const Block* orig, outputStream* st) const {
+  if (_pre_order) st->print("B%d",_pre_order);
+  else st->print("N%d", head()->_idx);
 
   if (Verbose && orig != this) {
     // Dump the original block's idx
-    tty->print(" (");
-    orig->dump_bidx(orig);
-    tty->print(")");
+    st->print(" (");
+    orig->dump_bidx(orig, st);
+    st->print(")");
   }
 }
 
-void Block::dump_pred(const Block_Array *bbs, Block* orig) const {
+void Block::dump_pred(const Block_Array *bbs, Block* orig, outputStream* st) const {
   if (is_connector()) {
     for (uint i=1; i<num_preds(); i++) {
       Block *p = ((*bbs)[pred(i)->_idx]);
-      p->dump_pred(bbs, orig);
+      p->dump_pred(bbs, orig, st);
     }
   } else {
-    dump_bidx(orig);
-    tty->print(" ");
+    dump_bidx(orig, st);
+    st->print(" ");
   }
 }
 
-void Block::dump_head( const Block_Array *bbs ) const {
+void Block::dump_head( const Block_Array *bbs, outputStream* st ) const {
   // Print the basic block
-  dump_bidx(this);
-  tty->print(": #\t");
+  dump_bidx(this, st);
+  st->print(": #\t");
 
   // Print the incoming CFG edges and the outgoing CFG edges
   for( uint i=0; i<_num_succs; i++ ) {
-    non_connector_successor(i)->dump_bidx(_succs[i]);
-    tty->print(" ");
+    non_connector_successor(i)->dump_bidx(_succs[i], st);
+    st->print(" ");
   }
-  tty->print("<- ");
+  st->print("<- ");
   if( head()->is_block_start() ) {
     for (uint i=1; i<num_preds(); i++) {
       Node *s = pred(i);
       if (bbs) {
         Block *p = (*bbs)[s->_idx];
-        p->dump_pred(bbs, p);
+        p->dump_pred(bbs, p, st);
       } else {
         while (!s->is_block_start())
           s = s->in(0);
-        tty->print("N%d ", s->_idx );
+        st->print("N%d ", s->_idx );
       }
     }
   } else
-    tty->print("BLOCK HEAD IS JUNK  ");
+    st->print("BLOCK HEAD IS JUNK  ");
 
   // Print loop, if any
   const Block *bhead = this;    // Head of self-loop
@@ -330,24 +332,24 @@
     while (bx->is_connector()) {
       bx = (*bbs)[bx->pred(1)->_idx];
     }
-    tty->print("\tLoop: B%d-B%d ", bhead->_pre_order, bx->_pre_order);
+    st->print("\tLoop: B%d-B%d ", bhead->_pre_order, bx->_pre_order);
     // Dump any loop-specific bits, especially for CountedLoops.
-    loop->dump_spec(tty);
+    loop->dump_spec(st);
   } else if (has_loop_alignment()) {
-    tty->print(" top-of-loop");
+    st->print(" top-of-loop");
   }
-  tty->print(" Freq: %g",_freq);
+  st->print(" Freq: %g",_freq);
   if( Verbose || WizardMode ) {
-    tty->print(" IDom: %d/#%d", _idom ? _idom->_pre_order : 0, _dom_depth);
-    tty->print(" RegPressure: %d",_reg_pressure);
-    tty->print(" IHRP Index: %d",_ihrp_index);
-    tty->print(" FRegPressure: %d",_freg_pressure);
-    tty->print(" FHRP Index: %d",_fhrp_index);
+    st->print(" IDom: %d/#%d", _idom ? _idom->_pre_order : 0, _dom_depth);
+    st->print(" RegPressure: %d",_reg_pressure);
+    st->print(" IHRP Index: %d",_ihrp_index);
+    st->print(" FRegPressure: %d",_freg_pressure);
+    st->print(" FHRP Index: %d",_fhrp_index);
   }
-  tty->print_cr("");
+  st->print_cr("");
 }
 
-void Block::dump() const { dump(0); }
+void Block::dump() const { dump(NULL); }
 
 void Block::dump( const Block_Array *bbs ) const {
   dump_head(bbs);
@@ -441,9 +443,9 @@
       Block *bb = new (_bbs._arena) Block(_bbs._arena,p);
       _bbs.map(p->_idx,bb);
       _bbs.map(x->_idx,bb);
-      if( x != p )                  // Only for root is x == p
+      if( x != p ) {                // Only for root is x == p
         bb->_nodes.push((Node*)x);
-
+      }
       // Now handle predecessors
       ++sum;                        // Count 1 for self block
       uint cnt = bb->num_preds();
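
code_alignment() and compute_loop_alignment() now answer in units of relocInfo::addr_unit() (4 on SPARC, where every instruction is one word; 1 on x86), so "no particular alignment" means one instruction rather than one byte. A model of the pre-/post-loop heuristic with the new unit_sz floor:

    #include <cassert>

    static unsigned pre_post_loop_alignment(unsigned OptoLoopAlignment,
                                            unsigned unit_sz) {
      // quarter the alignment for low-trip-count loops, but never go
      // below one instruction unit
      return (OptoLoopAlignment > 4 * unit_sz) ? (OptoLoopAlignment >> 2)
                                               : unit_sz;
    }

    int main() {
      assert(pre_post_loop_alignment(16, 4) == 4);  // SPARC-like: one word
      assert(pre_post_loop_alignment(32, 4) == 8);
      assert(pre_post_loop_alignment(16, 1) == 4);  // byte-unit CPU
      return 0;
    }
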
--- a/hotspot/src/share/vm/opto/block.hpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/block.hpp	Thu Aug 11 12:08:11 2011 -0700
@@ -329,10 +329,10 @@
 
 #ifndef PRODUCT
   // Debugging print of basic block
-  void dump_bidx(const Block* orig) const;
-  void dump_pred(const Block_Array *bbs, Block* orig) const;
-  void dump_head( const Block_Array *bbs ) const;
-  void dump( ) const;
+  void dump_bidx(const Block* orig, outputStream* st = tty) const;
+  void dump_pred(const Block_Array *bbs, Block* orig, outputStream* st = tty) const;
+  void dump_head( const Block_Array *bbs, outputStream* st = tty ) const;
+  void dump() const;
   void dump( const Block_Array *bbs ) const;
 #endif
 };
--- a/hotspot/src/share/vm/opto/compile.hpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/compile.hpp	Thu Aug 11 12:08:11 2011 -0700
@@ -785,11 +785,17 @@
   // Process an OopMap Element while emitting nodes
   void Process_OopMap_Node(MachNode *mach, int code_offset);
 
+  // Initialize code buffer
+  CodeBuffer* init_buffer(uint* blk_starts);
+
   // Write out basic block data to code buffer
-  void Fill_buffer();
+  void fill_buffer(CodeBuffer* cb, uint* blk_starts);
 
   // Determine which variable sized branches can be shortened
-  void Shorten_branches(Label *labels, int& code_size, int& reloc_size, int& stub_size);
+  void shorten_branches(uint* blk_starts, int& code_size, int& reloc_size, int& stub_size);
+
+  // Insert nops where needed and do the final branch shortening.
+  void finalize_offsets_and_shorten(uint* blk_starts);
 
   // Compute the size of first NumberOfLoopInstrToAlign instructions
   // at the head of a loop.
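
Taken together, the new entry points split the old single-pass emission into sized phases. A schematic of the intended ordering, mirroring the Output() changes in output.cpp below (not literal code):

    uint* blk_starts = NEW_RESOURCE_ARRAY(uint, _cfg->_num_blocks + 1);
    blk_starts[0] = 0;
    CodeBuffer* cb = init_buffer(blk_starts);  // size estimate + shorten_branches()
    if (cb == NULL || failing()) return;
    ScheduleAndBundle();                       // may reorder nodes within blocks
    finalize_offsets_and_shorten(blk_starts);  // insert nops, final shortening
    BuildOopMaps();
    fill_buffer(cb, blk_starts);               // emit; sizes must not grow here
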
--- a/hotspot/src/share/vm/opto/machnode.hpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/machnode.hpp	Thu Aug 11 12:08:11 2011 -0700
@@ -188,6 +188,9 @@
   virtual MachNode *short_branch_version(Compile* C) { return NULL; }
   bool may_be_short_branch() const { return (flags() & Flag_may_be_short_branch) != 0; }
 
+  // Avoid placing some instructions back to back on some CPUs.
+  bool avoid_back_to_back() const { return (flags() & Flag_avoid_back_to_back) != 0; }
+
   // First index in _in[] corresponding to operand, or -1 if there is none
   int  operand_index(uint operand) const;
 
--- a/hotspot/src/share/vm/opto/matcher.hpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/matcher.hpp	Thu Aug 11 12:08:11 2011 -0700
@@ -351,7 +351,7 @@
   virtual int      regnum_to_fpu_offset(int regnum);
 
   // Is this branch offset small enough to be addressed by a short branch?
-  bool is_short_branch_offset(int rule, int offset);
+  bool is_short_branch_offset(int rule, int br_size, int offset);
 
   // Optional scaling for the parameter to the ClearArray/CopyArray node.
   static const bool init_array_count_is_in_bytes;
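
The extra br_size parameter lets ports whose hardware encodes the displacement relative to the end of the branch fold the instruction length into the range check. An illustrative implementation for such a port, as it might appear in that port's .ad file (hypothetical, not part of this changeset):

    bool Matcher::is_short_branch_offset(int rule, int br_size, int offset) {
      // The caller passes offset relative to the branch address; convert
      // to the end-of-instruction-relative displacement the CPU encodes.
      offset -= br_size;
      // Assume an 8-bit signed displacement (a rel8-style short jump).
      return (-128 <= offset) && (offset <= 127);
    }
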
--- a/hotspot/src/share/vm/opto/node.hpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/node.hpp	Thu Aug 11 12:08:11 2011 -0700
@@ -637,7 +637,8 @@
     Flag_is_Branch           = Flag_is_cisc_alternate << 1,
     Flag_is_dead_loop_safe   = Flag_is_Branch << 1,
     Flag_may_be_short_branch = Flag_is_dead_loop_safe << 1,
-    _max_flags = (Flag_may_be_short_branch << 1) - 1 // allow flags combination
+    Flag_avoid_back_to_back  = Flag_may_be_short_branch << 1,
+    _max_flags = (Flag_avoid_back_to_back << 1) - 1 // allow flags combination
   };
 
 private:
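
Each flag occupies the next bit to the left of its predecessor, so _max_flags stays a dense mask over every defined flag. A self-contained sketch of the scheme (abbreviated names, not HotSpot code):

    enum ToyNodeFlags {
      Flag_may_be_short_branch = 1 << 0,
      Flag_avoid_back_to_back  = Flag_may_be_short_branch << 1,
      _toy_max_flags = (Flag_avoid_back_to_back << 1) - 1  // all defined bits
    };
    // Accessors test single bits, matching the MachNode helpers above.
    inline bool toy_avoid_back_to_back(int flags) {
      return (flags & Flag_avoid_back_to_back) != 0;
    }
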
--- a/hotspot/src/share/vm/opto/output.cpp	Wed Aug 10 14:06:57 2011 -0700
+++ b/hotspot/src/share/vm/opto/output.cpp	Thu Aug 11 12:08:11 2011 -0700
@@ -128,6 +128,14 @@
   if ( ZapDeadCompiledLocals )  Insert_zap_nodes();
 # endif
 
+  uint* blk_starts = NEW_RESOURCE_ARRAY(uint,_cfg->_num_blocks+1);
+  blk_starts[0]    = 0;
+
+  // Initialize code buffer and process short branches.
+  CodeBuffer* cb = init_buffer(blk_starts);
+
+  if (cb == NULL || failing())  return;
+
   ScheduleAndBundle();
 
 #ifndef PRODUCT
@@ -148,11 +156,13 @@
 
   if (failing())  return;
 
+  finalize_offsets_and_shorten(blk_starts);
+
   BuildOopMaps();
 
   if (failing())  return;
 
-  Fill_buffer();
+  fill_buffer(cb, blk_starts);
 }
 
 bool Compile::need_stack_bang(int frame_size_in_bytes) const {
@@ -325,22 +335,22 @@
   } // if( MaxLoopPad < OptoLoopAlignment-1 )
 }
 
-//----------------------Shorten_branches---------------------------------------
+//----------------------shorten_branches---------------------------------------
 // The architecture description provides short branch variants for some long
 // branch instructions. Replace eligible long branches with short branches.
-void Compile::Shorten_branches(Label *labels, int& code_size, int& reloc_size, int& stub_size) {
-
-  // fill in the nop array for bundling computations
-  MachNode *_nop_list[Bundle::_nop_count];
-  Bundle::initialize_nops(_nop_list, this);
+void Compile::shorten_branches(uint* blk_starts, int& code_size, int& reloc_size, int& stub_size) {
 
   // ------------------
   // Compute size of each block, method size, and relocation information size
-  uint *jmp_end    = NEW_RESOURCE_ARRAY(uint,_cfg->_num_blocks);
-  uint *blk_starts = NEW_RESOURCE_ARRAY(uint,_cfg->_num_blocks+1);
-  DEBUG_ONLY( uint *jmp_target = NEW_RESOURCE_ARRAY(uint,_cfg->_num_blocks); )
-  DEBUG_ONLY( uint *jmp_rule = NEW_RESOURCE_ARRAY(uint,_cfg->_num_blocks); )
-  blk_starts[0]    = 0;
+  uint nblocks  = _cfg->_num_blocks;
+
+  uint*      jmp_offset = NEW_RESOURCE_ARRAY(uint,nblocks);
+  uint*      jmp_size   = NEW_RESOURCE_ARRAY(uint,nblocks);
+  int*       jmp_nidx   = NEW_RESOURCE_ARRAY(int ,nblocks);
+  DEBUG_ONLY( uint *jmp_target = NEW_RESOURCE_ARRAY(uint,nblocks); )
+  DEBUG_ONLY( uint *jmp_rule = NEW_RESOURCE_ARRAY(uint,nblocks); )
+
+  bool has_short_branch_candidate = false;
 
   // Initialize the sizes to 0
   code_size  = 0;          // Size in bytes of generated code
@@ -350,28 +360,35 @@
   reloc_size = 1;          // Number of relocation entries
 
   // Make three passes.  The first computes pessimistic blk_starts,
-  // relative jmp_end and reloc_size information.  The second performs
+  // relative jmp_offset and reloc_size information.  The second performs
   // short branch substitution using the pessimistic sizing.  The
   // third inserts nops where needed.
 
-  Node *nj; // tmp
-
   // Step one, perform a pessimistic sizing pass.
-  uint i;
-  uint min_offset_from_last_call = 1;  // init to a positive value
+  uint last_call_adr = max_uint;
+  uint last_avoid_back_to_back_adr = max_uint;
   uint nop_size = (new (this) MachNopNode())->size(_regalloc);
-  for( i=0; i<_cfg->_num_blocks; i++ ) { // For all blocks
+  for (uint i = 0; i < nblocks; i++) { // For all blocks
     Block *b = _cfg->_blocks[i];
 
+    // During short branch replacement, we store the relative (to blk_starts)
+    // offset of the jump in jmp_offset, rather than the jump's absolute
+    // offset. This way we do not need to recompute the sizes of all nodes
+    // when we compute correct blk_starts in our next sizing pass.
+    jmp_offset[i] = 0;
+    jmp_size[i]   = 0;
+    jmp_nidx[i]   = -1;
+    DEBUG_ONLY( jmp_target[i] = 0; )
+    DEBUG_ONLY( jmp_rule[i]   = 0; )
+
     // Sum all instruction sizes to compute block size
     uint last_inst = b->_nodes.size();
     uint blk_size = 0;
-    for( uint j = 0; j<last_inst; j++ ) {
-      nj = b->_nodes[j];
+    for (uint j = 0; j < last_inst; j++) {
+      Node* nj = b->_nodes[j];
       uint inst_size = nj->size(_regalloc);
-      blk_size += inst_size;
       // Handle machine instruction nodes
-      if( nj->is_Mach() ) {
+      if (nj->is_Mach()) {
         MachNode *mach = nj->as_Mach();
         blk_size += (mach->alignment_required() - 1) * relocInfo::addr_unit(); // assume worst case padding
         reloc_size += mach->reloc();
@@ -388,32 +405,52 @@
         } else if (mach->is_MachSafePoint()) {
           // If call/safepoint are adjacent, account for possible
           // nop to disambiguate the two safepoints.
-          if (min_offset_from_last_call == 0) {
+          // ScheduleAndBundle() can rearrange nodes in a block,
+          // so check against all offsets inside this block.
+          if (last_call_adr >= blk_starts[i]) {
+            blk_size += nop_size;
+          }
+        }
+        if (mach->avoid_back_to_back()) {
+          // A nop is inserted between "avoid back to back" instructions.
+          // ScheduleAndBundle() can rearrange nodes in a block,
+          // so check against all offsets inside this block.
+          if (last_avoid_back_to_back_adr >= blk_starts[i]) {
             blk_size += nop_size;
           }
         }
+        if (mach->may_be_short_branch()) {
+          if (!nj->is_Branch()) {
+#ifndef PRODUCT
+            nj->dump(3);
+#endif
+            Unimplemented();
+          }
+          assert(jmp_nidx[i] == -1, "block should have only one branch");
+          jmp_offset[i] = blk_size;
+          jmp_size[i]   = inst_size;
+          jmp_nidx[i]   = j;
+          has_short_branch_candidate = true;
+        }
       }
-      min_offset_from_last_call += inst_size;
+      blk_size += inst_size;
       // Remember end of call offset
       if (nj->is_MachCall() && !nj->is_MachCallLeaf()) {
-        min_offset_from_last_call = 0;
+        last_call_adr = blk_starts[i]+blk_size;
+      }
+      // Remember end of avoid_back_to_back offset
+      if (nj->is_Mach() && nj->as_Mach()->avoid_back_to_back()) {
+        last_avoid_back_to_back_adr = blk_starts[i]+blk_size;
       }
     }
 
-    // During short branch replacement, we store the relative (to blk_starts)
-    // end of jump in jmp_end, rather than the absolute end of jump.  This
-    // is so that we do not need to recompute sizes of all nodes when we compute
-    // correct blk_starts in our next sizing pass.
-    jmp_end[i] = blk_size;
-    DEBUG_ONLY( jmp_target[i] = 0; )
-
     // When the next block starts a loop, we may insert pad NOP
     // instructions.  Since we cannot know our future alignment,
     // assume the worst.
-    if( i<_cfg->_num_blocks-1 ) {
+    if (i< nblocks-1) {
       Block *nb = _cfg->_blocks[i+1];
       int max_loop_pad = nb->code_alignment()-relocInfo::addr_unit();
-      if( max_loop_pad > 0 ) {
+      if (max_loop_pad > 0) {
         assert(is_power_of_2(max_loop_pad+relocInfo::addr_unit()), "");
         blk_size += max_loop_pad;
       }
@@ -424,124 +461,100 @@
   }
 
   // Step two, replace eligible long jumps.
-
-  // Note: this will only get the long branches within short branch
-  //   range. Another pass might detect more branches that became
-  //   candidates because the shortening in the first pass exposed
-  //   more opportunities. Unfortunately, this would require
-  //   recomputing the starting and ending positions for the blocks
-  for( i=0; i<_cfg->_num_blocks; i++ ) {
-    Block *b = _cfg->_blocks[i];
-
-    int j;
-    // Find the branch; ignore trailing NOPs.
-    for( j = b->_nodes.size()-1; j>=0; j-- ) {
-      nj = b->_nodes[j];
-      if( !nj->is_Mach() || nj->as_Mach()->ideal_Opcode() != Op_Con )
-        break;
-    }
-
-    if (j >= 0) {
-      if( nj->is_Mach() && nj->as_Mach()->may_be_short_branch() ) {
-        MachNode *mach = nj->as_Mach();
+  bool progress = true;
+  uint last_may_be_short_branch_adr = max_uint;
+  while (has_short_branch_candidate && progress) {
+    progress = false;
+    has_short_branch_candidate = false;
+    int adjust_block_start = 0;
+    for (uint i = 0; i < nblocks; i++) {
+      Block *b = _cfg->_blocks[i];
+      int idx = jmp_nidx[i];
+      MachNode* mach = (idx == -1) ? NULL: b->_nodes[idx]->as_Mach();
+      if (mach != NULL && mach->may_be_short_branch()) {
+#ifdef ASSERT
+        assert(jmp_size[i] > 0 && mach->is_Branch(), "sanity");
+        int j;
+        // Find the branch; ignore trailing NOPs.
+        for (j = b->_nodes.size()-1; j>=0; j--) {
+          Node* n = b->_nodes[j];
+          if (!n->is_Mach() || n->as_Mach()->ideal_Opcode() != Op_Con)
+            break;
+        }
+        assert(j >= 0 && j == idx && b->_nodes[j] == (Node*)mach, "sanity");
+#endif
+        int br_size = jmp_size[i];
+        int br_offs = blk_starts[i] + jmp_offset[i];
+
         // This requires the TRUE branch target be in succs[0]
         uint bnum = b->non_connector_successor(0)->_pre_order;
-        uintptr_t target = blk_starts[bnum];
-        if( mach->is_Branch() ) {
-          int offset = target-(blk_starts[i] + jmp_end[i]);
-          if (_matcher->is_short_branch_offset(mach->rule(), offset)) {
-            // We've got a winner.  Replace this branch.
-            MachNode* replacement = mach->short_branch_version(this);
-            b->_nodes.map(j, replacement);
-            mach->subsume_by(replacement);
-
-            // Update the jmp_end size to save time in our
-            // next pass.
-            jmp_end[i] -= (mach->size(_regalloc) - replacement->size(_regalloc));
-            DEBUG_ONLY( jmp_target[i] = bnum; );
-            DEBUG_ONLY( jmp_rule[i] = mach->rule(); );
+        int offset = blk_starts[bnum] - br_offs;
+        if (bnum > i) { // adjust following block's offset
+          offset -= adjust_block_start;
+        }
+        // In the following code a nop could be inserted before
+        // the branch, which would increase the backward distance.
+        bool needs_padding = ((uint)br_offs == last_may_be_short_branch_adr);
+        if (needs_padding && offset <= 0)
+          offset -= nop_size;
+
+        if (_matcher->is_short_branch_offset(mach->rule(), br_size, offset)) {
+          // We've got a winner.  Replace this branch.
+          MachNode* replacement = mach->short_branch_version(this);
+
+          // Update the jmp_size.
+          int new_size = replacement->size(_regalloc);
+          int diff     = br_size - new_size;
+          assert(diff >= (int)nop_size, "short_branch size should be smaller");
+          // Conservatively take into account padding between
+          // avoid_back_to_back branches. The previous branch could be
+          // converted into an avoid_back_to_back branch during the
+          // next rounds.
+          if (needs_padding && replacement->avoid_back_to_back()) {
+            jmp_offset[i] += nop_size;
+            diff -= nop_size;
           }
+          adjust_block_start += diff;
+          b->_nodes.map(idx, replacement);
+          mach->subsume_by(replacement);
+          mach = replacement;
+          progress = true;
+
+          jmp_size[i] = new_size;
+          DEBUG_ONLY( jmp_target[i] = bnum; );
+          DEBUG_ONLY( jmp_rule[i] = mach->rule(); );
         } else {
-#ifndef PRODUCT
-          mach->dump(3);
-#endif
-          Unimplemented();
+          // The jump distance is not short; try again during the next iteration.
+          has_short_branch_candidate = true;
         }
+      } // (mach->may_be_short_branch())
+      if (mach != NULL && (mach->may_be_short_branch() ||
+                           mach->avoid_back_to_back())) {
+        last_may_be_short_branch_adr = blk_starts[i] + jmp_offset[i] + jmp_size[i];
       }
-    }
-  }
-
-  // Compute the size of first NumberOfLoopInstrToAlign instructions at head
-  // of a loop. It is used to determine the padding for loop alignment.
-  compute_loop_first_inst_sizes();
-
-  // Step 3, compute the offsets of all the labels
-  uint last_call_adr = max_uint;
-  for( i=0; i<_cfg->_num_blocks; i++ ) { // For all blocks
-    // copy the offset of the beginning to the corresponding label
-    assert(labels[i].is_unused(), "cannot patch at this point");
-    labels[i].bind_loc(blk_starts[i], CodeBuffer::SECT_INSTS);
-
-    // insert padding for any instructions that need it
-    Block *b = _cfg->_blocks[i];
-    uint last_inst = b->_nodes.size();
-    uint adr = blk_starts[i];
-    for( uint j = 0; j<last_inst; j++ ) {
-      nj = b->_nodes[j];
-      if( nj->is_Mach() ) {
-        int padding = nj->as_Mach()->compute_padding(adr);
-        // If call/safepoint are adjacent insert a nop (5010568)
-        if (padding == 0 && nj->is_MachSafePoint() && !nj->is_MachCall() &&
-            adr == last_call_adr ) {
-          padding = nop_size;
-        }
-        if(padding > 0) {
-          assert((padding % nop_size) == 0, "padding is not a multiple of NOP size");
-          int nops_cnt = padding / nop_size;
-          MachNode *nop = new (this) MachNopNode(nops_cnt);
-          b->_nodes.insert(j++, nop);
-          _cfg->_bbs.map( nop->_idx, b );
-          adr += padding;
-          last_inst++;
-        }
-      }
-      adr += nj->size(_regalloc);
-
-      // Remember end of call offset
-      if (nj->is_MachCall() && !nj->is_MachCallLeaf()) {
-        last_call_adr = adr;
-      }
-    }
-
-    if ( i != _cfg->_num_blocks-1) {
-      // Get the size of the block
-      uint blk_size = adr - blk_starts[i];
-
-      // When the next block is the top of a loop, we may insert pad NOP
-      // instructions.
-      Block *nb = _cfg->_blocks[i+1];
-      int current_offset = blk_starts[i] + blk_size;
-      current_offset += nb->alignment_padding(current_offset);
-      // Save block size; update total method size
-      blk_starts[i+1] = current_offset;
+      blk_starts[i+1] -= adjust_block_start;
     }
   }
 
 #ifdef ASSERT
-  for( i=0; i<_cfg->_num_blocks; i++ ) { // For all blocks
-    if( jmp_target[i] != 0 ) {
-      int offset = blk_starts[jmp_target[i]]-(blk_starts[i] + jmp_end[i]);
-      if (!_matcher->is_short_branch_offset(jmp_rule[i], offset)) {
-        tty->print_cr("target (%d) - jmp_end(%d) = offset (%d), jmp_block B%d, target_block B%d", blk_starts[jmp_target[i]], blk_starts[i] + jmp_end[i], offset, i, jmp_target[i]);
+  for (uint i = 0; i < nblocks; i++) { // For all blocks
+    if (jmp_target[i] != 0) {
+      int br_size = jmp_size[i];
+      int offset = blk_starts[jmp_target[i]]-(blk_starts[i] + jmp_offset[i]);
+      if (!_matcher->is_short_branch_offset(jmp_rule[i], br_size, offset)) {
+        tty->print_cr("target (%d) - jmp_offset(%d) = offset (%d), jump_size(%d), jmp_block B%d, target_block B%d", blk_starts[jmp_target[i]], blk_starts[i] + jmp_offset[i], offset, br_size, i, jmp_target[i]);
       }
-      assert(_matcher->is_short_branch_offset(jmp_rule[i], offset), "Displacement too large for short jmp");
+      assert(_matcher->is_short_branch_offset(jmp_rule[i], br_size, offset), "Displacement too large for short jmp");
     }
   }
 #endif
 
+  // Step 3, computing the offsets of all blocks, is done in finalize_offsets_and_shorten()
+  // after ScheduleAndBundle().
+
   // ------------------
   // Compute size for code buffer
-  code_size   = blk_starts[i-1] + jmp_end[i-1];
+  code_size = blk_starts[nblocks];
 
   // Relocation records
   reloc_size += 1;              // Relo entry for exception handler
@@ -550,7 +563,189 @@
   // Min is 2 bytes, max is probably 6 or 8, with a tax up to 25% for
   // a relocation index.
   // The CodeBuffer will expand the locs array if this estimate is too low.
-  reloc_size   *= 10 / sizeof(relocInfo);
+  reloc_size *= 10 / sizeof(relocInfo);
+}
+
+//----------------------finalize_offsets_and_shorten-------------------------
+void Compile::finalize_offsets_and_shorten(uint* blk_starts) {
+  // blk_starts[] contains offsets calculated during short branches processing,
+  // offsets should not be increased during following steps.
+
+  // Compute the size of first NumberOfLoopInstrToAlign instructions at head
+  // of a loop. It is used to determine the padding for loop alignment.
+  compute_loop_first_inst_sizes();
+
+  uint nblocks  = _cfg->_num_blocks;
+#ifdef ASSERT
+  uint*      jmp_target = NEW_RESOURCE_ARRAY(uint,nblocks);
+  uint*      jmp_offset = NEW_RESOURCE_ARRAY(uint,nblocks);
+  uint*      jmp_size   = NEW_RESOURCE_ARRAY(uint,nblocks);
+  uint*      jmp_rule   = NEW_RESOURCE_ARRAY(uint,nblocks);
+#endif
+
+  // Insert nops where needed and do the final short branch replacement.
+  uint nop_size = (new (this) MachNopNode())->size(_regalloc);
+  uint last_call_adr = max_uint;
+  uint last_avoid_back_to_back_adr = max_uint;
+
+  assert(blk_starts[0] == 0, "sanity");
+  uint current_offset = 0;
+  uint block_alignment_padding = 0;
+
+  for (uint i=0; i < nblocks; i++) { // For all blocks
+    Block *b = _cfg->_blocks[i];
+
+#ifdef ASSERT
+    jmp_target[i] = 0;
+    jmp_offset[i] = 0;
+    jmp_size[i]   = 0;
+    jmp_rule[i]   = 0;
+#endif
+
+    // Maximum alignment was added before the loop block during
+    // Step One; as a result, padding for nodes was not added.
+    // Take this into account for the block size change check
+    // and allow the block size to increase by the difference
+    // between the maximum and actual alignment paddings.
+    DEBUG_ONLY( uint orig_blk_size = blk_starts[i+1] - blk_starts[i] + block_alignment_padding; )
+    uint blk_offset = current_offset;
+
+    uint last_inst = b->_nodes.size();
+    for (uint j = 0; j<last_inst; j++) {
+      Node* nj = b->_nodes[j];
+
+      if (valid_bundle_info(nj) &&
+          node_bundling(nj)->used_in_unconditional_delay()) {
+        continue; // Skip instruction in delay slot
+      }
+
+      uint inst_size = nj->size(_regalloc);
+      if (nj->is_Mach()) {
+        MachNode *mach = nj->as_Mach();
+        int padding = mach->compute_padding(current_offset);
+
+        // If call/safepoint are adjacent insert a nop (5010568)
+        if (padding == 0 && nj->is_MachSafePoint() && !nj->is_MachCall() &&
+            current_offset == last_call_adr) {
+          padding = nop_size;
+        }
+
+        // Insert a nop between "avoid back to back" instructions.
+        if (padding == 0 && mach->avoid_back_to_back() &&
+            current_offset == last_avoid_back_to_back_adr) {
+          padding = nop_size;
+        }
+
+        if (padding > 0) {
+          assert((padding % nop_size) == 0, "padding is not a multiple of NOP size");
+          int nops_cnt = padding / nop_size;
+          MachNode *nop = new (this) MachNopNode(nops_cnt);
+          b->_nodes.insert(j++, nop);
+          _cfg->_bbs.map(nop->_idx, b);
+          last_inst++;
+          current_offset += padding;
+        }
+
+        // Try to replace a long branch if the delay slot is not used;
+        // this mostly applies to backward branches since a forward
+        // branch's distance is not updated yet.
+        bool delay_slot_is_used = valid_bundle_info(nj) &&
+                                  node_bundling(nj)->use_unconditional_delay();
+        if (!delay_slot_is_used && mach->may_be_short_branch()) {
+          int br_size = inst_size;
+
+          // This requires the TRUE branch target be in succs[0]
+          uint bnum = b->non_connector_successor(0)->_pre_order;
+          int offset = blk_starts[bnum] - current_offset;
+          if (bnum >= i) {
+            // Current and following block offsets are not
+            // finalized yet; adjust the distance.
+            offset -= (blk_starts[i] - blk_offset);
+          }
+          // In the following code a nop could be inserted before
+          // the branch, which would increase the backward distance.
+          bool needs_padding = (current_offset == last_avoid_back_to_back_adr);
+          if (needs_padding && offset <= 0)
+            offset -= nop_size;
+
+          if (_matcher->is_short_branch_offset(mach->rule(), br_size, offset)) {
+            // We've got a winner.  Replace this branch.
+            MachNode* replacement = mach->short_branch_version(this);
+
+            // Update the jmp_size.
+            int new_size = replacement->size(_regalloc);
+            assert((br_size - new_size) >= (int)nop_size, "short_branch size should be smaller");
+            // Conservatively take into account padding between
+            // avoid_back_to_back branches. The previous branch could be
+            // converted into an avoid_back_to_back branch during the
+            // next rounds.
+            if (needs_padding && replacement->avoid_back_to_back()) {
+              MachNode *nop = new (this) MachNopNode();
+              b->_nodes.insert(j++, nop);
+              _cfg->_bbs.map(nop->_idx, b);
+              last_inst++;
+              current_offset += nop_size;
+            }
+            inst_size = new_size;
+            b->_nodes.map(j, replacement);
+            mach->subsume_by(replacement);
+            nj = replacement;
+#ifdef ASSERT
+            jmp_target[i] = bnum;
+            jmp_offset[i] = current_offset - blk_offset;
+            jmp_size[i]   = new_size;
+            jmp_rule[i]   = mach->rule();
+#endif
+          }
+        }
+      }
+      current_offset += inst_size;
+
+      // Remember end of call offset
+      if (nj->is_MachCall() && !nj->is_MachCallLeaf()) {
+        last_call_adr = current_offset;
+      }
+      // Remember end of avoid_back_to_back offset
+      if (nj->is_Mach() && nj->as_Mach()->avoid_back_to_back()) {
+        last_avoid_back_to_back_adr = current_offset;
+      }
+    }
+    assert(blk_offset <= blk_starts[i], "shouldn't increase distance");
+    blk_starts[i] = blk_offset;
+
+    // When the next block is the top of a loop, we may insert pad NOP
+    // instructions.
+    if (i < nblocks-1) {
+      Block *nb = _cfg->_blocks[i+1];
+      int padding = nb->alignment_padding(current_offset);
+      if (padding > 0) {
+        assert((padding % nop_size) == 0, "padding is not a multiple of NOP size");
+        int nops_cnt = padding / nop_size;
+        MachNode *nop = new (this) MachNopNode(nops_cnt);
+        b->_nodes.insert(b->_nodes.size(), nop);
+        _cfg->_bbs.map(nop->_idx, b);
+        current_offset += padding;
+      }
+      int max_loop_pad = nb->code_alignment()-relocInfo::addr_unit();
+      assert(max_loop_pad >= padding, "sanity");
+      block_alignment_padding = max_loop_pad - padding;
+    }
+    assert(orig_blk_size >= (current_offset - blk_offset), "shouldn't increase block size");
+  }
+  blk_starts[nblocks] = current_offset;
+
+#ifdef ASSERT
+  for (uint i = 0; i < nblocks; i++) { // For all blocks
+    if (jmp_target[i] != 0) {
+      int br_size = jmp_size[i];
+      int offset = blk_starts[jmp_target[i]]-(blk_starts[i] + jmp_offset[i]);
+      if (!_matcher->is_short_branch_offset(jmp_rule[i], br_size, offset)) {
+        tty->print_cr("target (%d) - jmp_offset(%d) = offset (%d), jump_size(%d), jmp_block B%d, target_block B%d", blk_starts[jmp_target[i]], blk_starts[i] + jmp_offset[i], offset, br_size, i, jmp_target[i]);
+      }
+      assert(_matcher->is_short_branch_offset(jmp_rule[i], br_size, offset), "Displacement too large for short jmp");
+    }
+  }
+#endif
 }
 
 //------------------------------FillLocArray-----------------------------------
@@ -1026,7 +1221,7 @@
 
 
 
-// helper for Fill_buffer bailout logic
+// helper for fill_buffer bailout logic
 static void turn_off_compiler(Compile* C) {
   if (CodeCache::largest_free_block() >= CodeCacheMinimumFreeSpace*10) {
     // Do not turn off compilation if a single giant method has
@@ -1039,22 +1234,20 @@
 }
 
 
-//------------------------------Fill_buffer------------------------------------
-void Compile::Fill_buffer() {
+//------------------------------init_buffer------------------------------------
+CodeBuffer* Compile::init_buffer(uint* blk_starts) {
 
   // Set the initially allocated size
   int  code_req   = initial_code_capacity;
   int  locs_req   = initial_locs_capacity;
   int  stub_req   = TraceJumps ? initial_stub_capacity * 10 : initial_stub_capacity;
   int  const_req  = initial_const_capacity;
-  bool labels_not_set = true;
 
   int  pad_req    = NativeCall::instruction_size;
   // The extra spacing after the code is necessary on some platforms.
   // Sometimes we need to patch in a jump after the last instruction,
   // if the nmethod has been deoptimized.  (See 4932387, 4894843.)
 
-  uint i;
   // Compute the byte offset where we can store the deopt pc.
   if (fixed_slots() != 0) {
     _orig_pc_slot_offset_in_bytes = _regalloc->reg2offset(OptoReg::stack2reg(_orig_pc_slot));
@@ -1078,19 +1271,12 @@
     _frame_slots += 8*(16/BytesPerInt);
   }
 #endif
-  assert( _frame_slots >= 0 && _frame_slots < 1000000, "sanity check" );
-
-  // Create an array of unused labels, one for each basic block
-  Label *blk_labels = NEW_RESOURCE_ARRAY(Label, _cfg->_num_blocks+1);
-
-  for( i=0; i <= _cfg->_num_blocks; i++ ) {
-    blk_labels[i].init();
-  }
+  assert(_frame_slots >= 0 && _frame_slots < 1000000, "sanity check");
 
   if (has_mach_constant_base_node()) {
     // Fill the constant table.
-    // Note:  This must happen before Shorten_branches.
-    for (i = 0; i < _cfg->_num_blocks; i++) {
+    // Note:  This must happen before shorten_branches.
+    for (uint i = 0; i < _cfg->_num_blocks; i++) {
       Block* b = _cfg->_blocks[i];
 
       for (uint j = 0; j < b->_nodes.size(); j++) {
@@ -1114,14 +1300,11 @@
   // Initialize the space for the BufferBlob used to find and verify
   // instruction size in MachNode::emit_size()
   init_scratch_buffer_blob(const_req);
-  if (failing())  return; // Out of memory
-
-  // If this machine supports different size branch offsets, then pre-compute
-  // the length of the blocks
-  if( _matcher->is_short_branch_offset(-1, 0) ) {
-    Shorten_branches(blk_labels, code_req, locs_req, stub_req);
-    labels_not_set = false;
-  }
+  if (failing())  return NULL; // Out of memory
+
+  // Pre-compute the length of blocks and replace
+  // long branches with short ones if the machine supports it.
+  shorten_branches(blk_starts, code_req, locs_req, stub_req);
 
   // nmethod and CodeBuffer count stubs & constants as part of method's code.
   int exception_handler_req = size_exception_handler();
@@ -1151,7 +1334,7 @@
   // Have we run out of code space?
   if ((cb->blob() == NULL) || (!CompileBroker::should_compile_new_jobs())) {
     turn_off_compiler(this);
-    return;
+    return NULL;
   }
   // Configure the code buffer.
   cb->initialize_consts_size(const_req);
@@ -1162,6 +1345,12 @@
   MachNode *_nop_list[Bundle::_nop_count];
   Bundle::initialize_nops(_nop_list, this);
 
+  return cb;
+}
+
+//------------------------------fill_buffer------------------------------------
+void Compile::fill_buffer(CodeBuffer* cb, uint* blk_starts) {
+
   // Create oopmap set.
   _oop_map_set = new OopMapSet();
 
@@ -1180,15 +1369,16 @@
 
   int previous_offset = 0;
   int current_offset  = 0;
+#ifdef ASSERT
   int last_call_offset = -1;
-
+  int last_avoid_back_to_back_offset = -1;
+#endif
   // Create an array of unused labels, one for each basic block, if printing is enabled
 #ifndef PRODUCT
   int *node_offsets      = NULL;
-  uint  node_offset_limit = unique();
-
-
-  if ( print_assembly() )
+  uint node_offset_limit = unique();
+
+  if (print_assembly())
     node_offsets         = NEW_RESOURCE_ARRAY(int, node_offset_limit);
 #endif
 
@@ -1199,11 +1389,19 @@
     constant_table().emit(*cb);
   }
 
+  // Create an array of labels, one for each basic block
+  Label *blk_labels = NEW_RESOURCE_ARRAY(Label, _cfg->_num_blocks+1);
+  for (uint i=0; i <= _cfg->_num_blocks; i++) {
+    blk_labels[i].init();
+  }
+
   // ------------------
   // Now fill in the code buffer
   Node *delay_slot = NULL;
 
-  for( i=0; i < _cfg->_num_blocks; i++ ) {
+  for (uint i=0; i < _cfg->_num_blocks; i++) {
+    guarantee(blk_starts[i] == (uint)cb->insts_size(),"should not change size");
+
     Block *b = _cfg->_blocks[i];
 
     Node *head = b->head();
@@ -1211,23 +1409,25 @@
     // If this block needs to start aligned (i.e, can be reached other
     // than by falling-thru from the previous block), then force the
     // start of a new bundle.
-    if( Pipeline::requires_bundling() && starts_bundle(head) )
+    if (Pipeline::requires_bundling() && starts_bundle(head))
       cb->flush_bundle(true);
 
+#ifdef ASSERT
+    if (!b->is_connector()) {
+      stringStream st;
+      b->dump_head(&_cfg->_bbs, &st);
+      MacroAssembler(cb).block_comment(st.as_string());
+    }
+#endif
+
     // Define the label at the beginning of the basic block
-    if (labels_not_set) {
-      MacroAssembler(cb).bind(blk_labels[b->_pre_order]);
-    } else {
-      assert(blk_labels[b->_pre_order].loc_pos() == cb->insts_size(),
-             err_msg("label position does not match code offset: %d != %d",
-                     blk_labels[b->_pre_order].loc_pos(), cb->insts_size()));
-    }
+    MacroAssembler(cb).bind(blk_labels[b->_pre_order]);
 
     uint last_inst = b->_nodes.size();
 
     // Emit block normally, except for last instruction.
     // Emit means "dump code bits into code buffer".
-    for( uint j = 0; j<last_inst; j++ ) {
+    for (uint j = 0; j<last_inst; j++) {
 
       // Get the node
       Node* n = b->_nodes[j];
@@ -1244,7 +1444,7 @@
 
       // If this starts a new instruction group, then flush the current one
       // (but allow split bundles)
-      if( Pipeline::requires_bundling() && starts_bundle(n) )
+      if (Pipeline::requires_bundling() && starts_bundle(n))
         cb->flush_bundle(false);
 
       // The following logic is duplicated in the code ifdeffed for
@@ -1253,38 +1453,35 @@
 
       // Special handling for SafePoint/Call Nodes
       bool is_mcall = false;
-      if( n->is_Mach() ) {
+      if (n->is_Mach()) {
         MachNode *mach = n->as_Mach();
         is_mcall = n->is_MachCall();
         bool is_sfn = n->is_MachSafePoint();
 
         // If this requires all previous instructions be flushed, then do so
-        if( is_sfn || is_mcall || mach->alignment_required() != 1) {
+        if (is_sfn || is_mcall || mach->alignment_required() != 1) {
           cb->flush_bundle(true);
           current_offset = cb->insts_size();
         }
 
+#ifdef ASSERT
+        // Padding may be needed again since a previous instruction
+        // could have been moved to the delay slot.
+
         // align the instruction if necessary
         int padding = mach->compute_padding(current_offset);
         // Make sure safepoint node for polling is distinct from a call's
         // return by adding a nop if needed.
-        if (is_sfn && !is_mcall && padding == 0 && current_offset == last_call_offset ) {
+        if (is_sfn && !is_mcall && padding == 0 && current_offset == last_call_offset) {
           padding = nop_size;
         }
-        assert( labels_not_set || padding == 0, "instruction should already be aligned");
-
-        if(padding > 0) {
-          assert((padding % nop_size) == 0, "padding is not a multiple of NOP size");
-          int nops_cnt = padding / nop_size;
-          MachNode *nop = new (this) MachNopNode(nops_cnt);
-          b->_nodes.insert(j++, nop);
-          last_inst++;
-          _cfg->_bbs.map( nop->_idx, b );
-          nop->emit(*cb, _regalloc);
-          cb->flush_bundle(true);
-          current_offset = cb->insts_size();
+        if (padding == 0 && mach->avoid_back_to_back() &&
+            current_offset == last_avoid_back_to_back_offset) {
+            // Avoid placing some instructions back to back.
+          padding = nop_size;
         }
-
+        assert(padding == 0, "padding should be added already");
+#endif
         // Remember the start of the last call in a basic block
         if (is_mcall) {
           MachCallNode *mcall = mach->as_MachCall();
@@ -1302,13 +1499,13 @@
         }
 
         // sfn will be valid whenever mcall is valid now because of inheritance
-        if( is_sfn || is_mcall ) {
+        if (is_sfn || is_mcall) {
 
           // Handle special safepoint nodes for synchronization
-          if( !is_mcall ) {
+          if (!is_mcall) {
             MachSafePointNode *sfn = mach->as_MachSafePoint();
             // !!!!! Stubs only need an oopmap right now, so bail out
-            if( sfn->jvms()->method() == NULL) {
+            if (sfn->jvms()->method() == NULL) {
               // Write the oopmap directly to the code blob??!!
 #             ifdef ENABLE_ZAP_DEAD_LOCALS
               assert( !is_node_getting_a_safepoint(sfn),  "logic does not match; false positive");
@@ -1328,14 +1525,14 @@
         }
 
         // If this is a branch, then fill in the label with the target BB's label
-        else if ( mach->is_Branch() ) {
-
-          if ( mach->ideal_Opcode() == Op_Jump ) {
-            for (uint h = 0; h < b->_num_succs; h++ ) {
+        else if (mach->is_Branch()) {
+
+          if (mach->ideal_Opcode() == Op_Jump) {
+            for (uint h = 0; h < b->_num_succs; h++) {
               Block* succs_block = b->_succs[h];
               for (uint j = 1; j < succs_block->num_preds(); j++) {
                 Node* jpn = succs_block->pred(j);
-                if ( jpn->is_JumpProj() && jpn->in(0) == mach ) {
+                if (jpn->is_JumpProj() && jpn->in(0) == mach) {
                   uint block_num = succs_block->non_connector()->_pre_order;
                   Label *blkLabel = &blk_labels[block_num];
                   mach->add_case_label(jpn->as_JumpProj()->proj_no(), blkLabel);
@@ -1352,7 +1549,7 @@
 
 #ifdef ASSERT
         // Check that oop-store precedes the card-mark
-        else if( mach->ideal_Opcode() == Op_StoreCM ) {
+        else if (mach->ideal_Opcode() == Op_StoreCM) {
           uint storeCM_idx = j;
           int count = 0;
           for (uint prec = mach->req(); prec < mach->len(); prec++) {
@@ -1371,7 +1568,7 @@
         }
 #endif
 
-        else if( !n->is_Proj() ) {
+        else if (!n->is_Proj()) {
           // Remember the beginning of the previous instruction, in case
           // it's followed by a flag-kill and a null-check.  Happens on
           // Intel all the time, with add-to-memory kind of opcodes.
@@ -1388,15 +1585,24 @@
 
       // Save the offset for the listing
 #ifndef PRODUCT
-      if( node_offsets && n->_idx < node_offset_limit )
+      if (node_offsets && n->_idx < node_offset_limit)
         node_offsets[n->_idx] = cb->insts_size();
 #endif
 
       // "Normal" instruction case
+      DEBUG_ONLY( uint instr_offset = cb->insts_size(); )
       n->emit(*cb, _regalloc);
       current_offset  = cb->insts_size();
+
+#ifdef ASSERT
+      if (n->size(_regalloc) != (current_offset-instr_offset)) {
+        n->dump();
+        assert(n->size(_regalloc) == (current_offset-instr_offset), "wrong size of mach node");
+      }
+#endif
       non_safepoints.observe_instruction(n, current_offset);
 
+#ifdef ASSERT
       // mcall is last "call" that can be a safepoint
       // record it so we can see if a poll will directly follow it
       // in which case we'll need a pad to make the PcDesc sites unique
@@ -1408,8 +1614,14 @@
         last_call_offset = current_offset;
       }
 
+      if (n->is_Mach() && n->as_Mach()->avoid_back_to_back()) {
+        // Avoid placing some instructions back to back.
+        last_avoid_back_to_back_offset = current_offset;
+      }
+#endif
+
       // See if this instruction has a delay slot
-      if ( valid_bundle_info(n) && node_bundling(n)->use_unconditional_delay()) {
+      if (valid_bundle_info(n) && node_bundling(n)->use_unconditional_delay()) {
         assert(delay_slot != NULL, "expecting delay slot node");
 
         // Back up 1 instruction
@@ -1417,15 +1629,15 @@
 
         // Save the offset for the listing
 #ifndef PRODUCT
-        if( node_offsets && delay_slot->_idx < node_offset_limit )
+        if (node_offsets && delay_slot->_idx < node_offset_limit)
           node_offsets[delay_slot->_idx] = cb->insts_size();
 #endif
 
         // Support a SafePoint in the delay slot
-        if( delay_slot->is_MachSafePoint() ) {
+        if (delay_slot->is_MachSafePoint()) {
           MachNode *mach = delay_slot->as_Mach();
           // !!!!! Stubs only need an oopmap right now, so bail out
-          if( !mach->is_MachCall() && mach->as_MachSafePoint()->jvms()->method() == NULL ) {
+          if (!mach->is_MachCall() && mach->as_MachSafePoint()->jvms()->method() == NULL) {
             // Write the oopmap directly to the code blob??!!
 #           ifdef ENABLE_ZAP_DEAD_LOCALS
             assert( !is_node_getting_a_safepoint(mach),  "logic does not match; false positive");
@@ -1449,21 +1661,15 @@
       }
 
     } // End for all instructions in block
-
+#ifdef ASSERT
     // If the next block is the top of a loop, pad this block out to align
     // the loop top a little. Helps prevent pipe stalls at loop back branches.
-    if( i<_cfg->_num_blocks-1 ) {
+    if (i < _cfg->_num_blocks-1) {
       Block *nb = _cfg->_blocks[i+1];
       uint padding = nb->alignment_padding(current_offset);
-      if( padding > 0 ) {
-        MachNode *nop = new (this) MachNopNode(padding / nop_size);
-        b->_nodes.insert( b->_nodes.size(), nop );
-        _cfg->_bbs.map( nop->_idx, b );
-        nop->emit(*cb, _regalloc);
-        current_offset = cb->insts_size();
-      }
+      assert(padding == 0, "alignment should be added already");
     }
-
+#endif
   } // End of for all blocks
 
   non_safepoints.flush_at_end();
@@ -1743,11 +1949,6 @@
   // Create a data structure for all the scheduling information
   Scheduling scheduling(Thread::current()->resource_area(), *this);
 
-  // Initialize the space for the BufferBlob used to find and verify
-  // instruction size in MachNode::emit_size()
-  init_scratch_buffer_blob(MAX_const_size);
-  if (failing())  return;  // Out of memory
-
   // Walk backwards over each basic block, computing the needed alignment
   // Walk over all the basic blocks
   scheduling.DoScheduling();
@@ -2346,6 +2547,12 @@
     // have their delay slots filled in the template expansions, so we don't
     // bother scheduling them.
     Node *last = bb->_nodes[_bb_end];
+    // Ignore trailing NOPs.
+    while (_bb_end > 0 && last->is_Mach() &&
+           last->as_Mach()->ideal_Opcode() == Op_Con) {
+      last = bb->_nodes[--_bb_end];
+    }
+    assert(!last->is_Mach() || last->as_Mach()->ideal_Opcode() != Op_Con, "");
     if( last->is_Catch() ||
        // Exclude unreachable path case when Halt node is in a separate block.
        (_bb_end > 1 && last->is_Mach() && last->as_Mach()->ideal_Opcode() == Op_Halt) ) {
@@ -2680,6 +2887,23 @@
       anti_do_def( b, n, _regalloc->get_reg_second(n), is_def );
     }
 
+    // Kill projections on a branch should appear to occur on the
+    // branch, not afterwards, so grab the masks from the projections
+    // and process them.
+    if (n->is_Branch()) {
+      for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
+        Node* use = n->fast_out(i);
+        if (use->is_Proj()) {
+          RegMask rm = use->out_RegMask();// Make local copy
+          while( rm.is_NotEmpty() ) {
+            OptoReg::Name kill = rm.find_first_elem();
+            rm.Remove(kill);
+            anti_do_def( b, n, kill, false );
+          }
+        }
+      }
+    }
+
     // Check each register used by this instruction for a following DEF/KILL
     // that must occur afterward and requires an anti-dependence edge.
     for( uint j=0; j<n->req(); j++ ) {
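
For reference, the shortening loop in shorten_branches() converges because every replacement only shrinks a block (the assert checks that the short form is strictly smaller), which can only pull more branch targets into short range. A standalone toy model of that fixpoint, under simplifying assumptions stated in the comments (one branch per block, 5-byte long vs. 2-byte short branches, signed 8-bit short range, offsets relative to the branch address):

    #include <cstdio>
    #include <vector>

    // Toy model of the shorten_branches() fixpoint: shrinking one branch
    // can pull another branch's target into short range, so iterate until
    // no more replacements happen. Block starts are recomputed per round
    // instead of being adjusted incrementally as the real code does.
    int main() {
      const int LONG_BR = 5, SHORT_BR = 2;
      std::vector<int> body   = {120, 4, 4};  // non-branch bytes per block
      std::vector<int> target = {2, 0, 1};    // branch target block per block
      std::vector<int> brsz(body.size(), LONG_BR);

      bool progress = true;
      while (progress) {
        progress = false;
        // Recompute block starts from current sizes (blk_starts analogue).
        std::vector<int> start(body.size() + 1, 0);
        for (size_t i = 0; i < body.size(); i++)
          start[i + 1] = start[i] + body[i] + brsz[i];
        for (size_t i = 0; i < body.size(); i++) {
          if (brsz[i] == SHORT_BR) continue;            // already replaced
          int off = start[target[i]] - (start[i] + body[i]); // branch-relative
          if (-128 <= off && off <= 127) {              // "is_short_branch_offset"
            brsz[i] = SHORT_BR;                         // replace the branch
            progress = true;                            // sizes changed; retry
          }
        }
      }
      for (size_t i = 0; i < body.size(); i++)
        std::printf("B%zu branch size = %d\n", i, brsz[i]);
      return 0;
    }

Running the model, B0 and B2 shorten in the first round; that shrinkage brings B1's backward branch (initially 129 bytes out of range) within reach in the second round, illustrating why the real code loops while has_short_branch_candidate and progress both hold.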