7125136: SIGILL on linux amd64 in gc/ArrayJuggle/Juggle29
authorkvn
Wed, 15 Feb 2012 21:37:49 -0800
changeset 11791 3be8cae67887
parent 11790 9bd8cd33db39
child 11792 fd885d66cb86
7125136: SIGILL on linux amd64 in gc/ArrayJuggle/Juggle29 Summary: For C2 moved saving EBP after ESP adjustment. For C1 generated 5 byte nop instruction first if needed. Reviewed-by: never, twisti, azeemj
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/opto/output.cpp
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Wed Feb 15 16:29:40 2012 -0800
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Wed Feb 15 21:37:49 2012 -0800
@@ -236,6 +236,16 @@
   }
 }
 
+// Force generation of a 4 byte immediate value even if it fits into 8bit
+void Assembler::emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32) {
+  assert(isByte(op1) && isByte(op2), "wrong opcode");
+  assert((op1 & 0x01) == 1, "should be 32bit operation");
+  assert((op1 & 0x02) == 0, "sign-extension bit should not be set");
+  emit_byte(op1);
+  emit_byte(op2 | encode(dst));
+  emit_long(imm32);
+}
+
 // immediate-to-memory forms
 void Assembler::emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32) {
   assert((op1 & 0x01) == 1, "should be 32bit operation");
@@ -939,6 +949,7 @@
 }
 
 void Assembler::addr_nop_4() {
+  assert(UseAddressNop, "no CPU support");
   // 4 bytes: NOP DWORD PTR [EAX+0]
   emit_byte(0x0F);
   emit_byte(0x1F);
@@ -947,6 +958,7 @@
 }
 
 void Assembler::addr_nop_5() {
+  assert(UseAddressNop, "no CPU support");
   // 5 bytes: NOP DWORD PTR [EAX+EAX*0+0] 8-bits offset
   emit_byte(0x0F);
   emit_byte(0x1F);
@@ -956,6 +968,7 @@
 }
 
 void Assembler::addr_nop_7() {
+  assert(UseAddressNop, "no CPU support");
   // 7 bytes: NOP DWORD PTR [EAX+0] 32-bits offset
   emit_byte(0x0F);
   emit_byte(0x1F);
@@ -964,6 +977,7 @@
 }
 
 void Assembler::addr_nop_8() {
+  assert(UseAddressNop, "no CPU support");
   // 8 bytes: NOP DWORD PTR [EAX+EAX*0+0] 32-bits offset
   emit_byte(0x0F);
   emit_byte(0x1F);
@@ -2769,6 +2783,12 @@
   emit_arith(0x81, 0xE8, dst, imm32);
 }
 
+// Force generation of a 4 byte immediate value even if it fits into 8bit
+void Assembler::subl_imm32(Register dst, int32_t imm32) {
+  prefix(dst);
+  emit_arith_imm32(0x81, 0xE8, dst, imm32);
+}
+
 void Assembler::subl(Register dst, Address src) {
   InstructionMark im(this);
   prefix(src, dst);
@@ -4760,6 +4780,12 @@
   emit_arith(0x81, 0xE8, dst, imm32);
 }
 
+// Force generation of a 4 byte immediate value even if it fits into 8bit
+void Assembler::subq_imm32(Register dst, int32_t imm32) {
+  (void) prefixq_and_encode(dst->encoding());
+  emit_arith_imm32(0x81, 0xE8, dst, imm32);
+}
+
 void Assembler::subq(Register dst, Address src) {
   InstructionMark im(this);
   prefixq(src, dst);
@@ -5101,15 +5127,6 @@
   }
 }
 
-void MacroAssembler::fat_nop() {
-  // A 5 byte nop that is safe for patching (see patch_verified_entry)
-  emit_byte(0x26); // es:
-  emit_byte(0x2e); // cs:
-  emit_byte(0x64); // fs:
-  emit_byte(0x65); // gs:
-  emit_byte(0x90);
-}
-
 void MacroAssembler::jC2(Register tmp, Label& L) {
   // set parity bit if FPU flag C2 is set (via rax)
   save_rax(tmp);
@@ -5704,17 +5721,6 @@
   /* else */      { subq(dst, value)       ; return; }
 }
 
-void MacroAssembler::fat_nop() {
-  // A 5 byte nop that is safe for patching (see patch_verified_entry)
-  // Recommened sequence from 'Software Optimization Guide for the AMD
-  // Hammer Processor'
-  emit_byte(0x66);
-  emit_byte(0x66);
-  emit_byte(0x90);
-  emit_byte(0x66);
-  emit_byte(0x90);
-}
-
 void MacroAssembler::incrementq(Register reg, int value) {
   if (value == min_jint) { addq(reg, value); return; }
   if (value <  0) { decrementq(reg, -value); return; }
@@ -6766,6 +6772,19 @@
   mov(rbp, rsp);
 }
 
+// A 5 byte nop that is safe for patching (see patch_verified_entry)
+void MacroAssembler::fat_nop() {
+  if (UseAddressNop) {
+    addr_nop_5();
+  } else {
+    emit_byte(0x26); // es:
+    emit_byte(0x2e); // cs:
+    emit_byte(0x64); // fs:
+    emit_byte(0x65); // gs:
+    emit_byte(0x90);
+  }
+}
+
 void MacroAssembler::fcmp(Register tmp) {
   fcmp(tmp, 1, true, true);
 }
@@ -7825,6 +7844,11 @@
   LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
 }
 
+// Force generation of a 4 byte immediate value even if it fits into 8bit
+void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
+  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
+}
+
 void MacroAssembler::subptr(Register dst, Register src) {
   LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
 }
@@ -9292,6 +9316,80 @@
 }
 #endif // _LP64
 
+
+// C2 compiled method's prolog code.
+void MacroAssembler::verified_entry(int framesize, bool stack_bang, bool fp_mode_24b) {
+
+  // WARNING: Initial instruction MUST be 5 bytes or longer so that
+  // NativeJump::patch_verified_entry will be able to patch out the entry
+  // code safely. The push to verify stack depth is ok at 5 bytes,
+  // the frame allocation can be either 3 or 6 bytes. So if we don't do
+  // stack bang then we must use the 6 byte frame allocation even if
+  // we have no frame. :-(
+
+  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
+  // Remove word for return addr
+  framesize -= wordSize;
+
+  // Calls to C2R adapters often do not accept exceptional returns.
+  // We require that their callers must bang for them.  But be careful, because
+  // some VM calls (such as call site linkage) can use several kilobytes of
+  // stack.  But the stack safety zone should account for that.
+  // See bugs 4446381, 4468289, 4497237.
+  if (stack_bang) {
+    generate_stack_overflow_check(framesize);
+
+    // We always push rbp, so that on return to interpreter rbp, will be
+    // restored correctly and we can correct the stack.
+    push(rbp);
+    // Remove word for ebp
+    framesize -= wordSize;
+
+    // Create frame
+    if (framesize) {
+      subptr(rsp, framesize);
+    }
+  } else {
+    // Create frame (force generation of a 4 byte immediate value)
+    subptr_imm32(rsp, framesize);
+
+    // Save RBP register now.
+    framesize -= wordSize;
+    movptr(Address(rsp, framesize), rbp);
+  }
+
+  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
+    framesize -= wordSize;
+    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
+  }
+
+#ifndef _LP64
+  // If method sets FPU control word do it now
+  if (fp_mode_24b) {
+    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
+  }
+  if (UseSSE >= 2 && VerifyFPU) {
+    verify_FPU(0, "FPU stack must be clean on entry");
+  }
+#endif
+
+#ifdef ASSERT
+  if (VerifyStackAtCalls) {
+    Label L;
+    push(rax);
+    mov(rax, rsp);
+    andptr(rax, StackAlignmentInBytes-1);
+    cmpptr(rax, StackAlignmentInBytes-wordSize);
+    pop(rax);
+    jcc(Assembler::equal, L);
+    stop("Stack is not properly aligned!");
+    bind(L);
+  }
+#endif
+
+}
+
+
 // IndexOf for constant substrings with size >= 8 chars
 // which don't need to be loaded through stack.
 void MacroAssembler::string_indexofC8(Register str1, Register str2,
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Wed Feb 15 16:29:40 2012 -0800
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Wed Feb 15 21:37:49 2012 -0800
@@ -667,6 +667,8 @@
   void emit_arith_b(int op1, int op2, Register dst, int imm8);
 
   void emit_arith(int op1, int op2, Register dst, int32_t imm32);
+  // Force generation of a 4 byte immediate value even if it fits into 8bit
+  void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
   // only 32bit??
   void emit_arith(int op1, int op2, Register dst, jobject obj);
   void emit_arith(int op1, int op2, Register dst, Register src);
@@ -1526,6 +1528,9 @@
   void subq(Register dst, Address src);
   void subq(Register dst, Register src);
 
+  // Force generation of a 4 byte immediate value even if it fits into 8bit
+  void subl_imm32(Register dst, int32_t imm32);
+  void subq_imm32(Register dst, int32_t imm32);
 
   // Subtract Scalar Double-Precision Floating-Point Values
   void subsd(XMMRegister dst, Address src);
@@ -1763,8 +1768,8 @@
   // Alignment
   void align(int modulus);
 
-  // Misc
-  void fat_nop(); // 5 byte nop
+  // A 5 byte nop that is safe for patching (see patch_verified_entry)
+  void fat_nop();
 
   // Stack frame creation/removal
   void enter();
@@ -2275,6 +2280,8 @@
 
   void subptr(Register dst, Address src) { LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); }
   void subptr(Register dst, int32_t src);
+  // Force generation of a 4 byte immediate value even if it fits into 8bit
+  void subptr_imm32(Register dst, int32_t src);
   void subptr(Register dst, Register src);
   void subptr(Register dst, RegisterOrConstant src) {
     if (src.is_constant()) subptr(dst, (int) src.as_constant());
@@ -2566,6 +2573,9 @@
   void movl2ptr(Register dst, Address src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(movl(dst, src)); }
   void movl2ptr(Register dst, Register src) { LP64_ONLY(movslq(dst, src)) NOT_LP64(if (dst != src) movl(dst, src)); }
 
+  // C2 compiled method's prolog code.
+  void verified_entry(int framesize, bool stack_bang, bool fp_mode_24b);
+
   // IndexOf strings.
   // Small strings are loaded through stack if they cross page boundary.
   void string_indexof(Register str1, Register str2,
--- a/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp	Wed Feb 15 16:29:40 2012 -0800
+++ b/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp	Wed Feb 15 21:37:49 2012 -0800
@@ -381,6 +381,16 @@
 
 
 void C1_MacroAssembler::verified_entry() {
+  if (C1Breakpoint || VerifyFPU || !UseStackBanging) {
+    // Verified Entry first instruction should be 5 bytes long for correct
+    // patching by patch_verified_entry().
+    //
+    // C1Breakpoint and VerifyFPU have one byte first instruction.
+    // Also first instruction will be one byte "push(rbp)" if stack banging
+    // code is not generated (see build_frame() above).
+    // For all these cases generate long instruction first.
+    fat_nop();
+  }
   if (C1Breakpoint)int3();
   // build frame
   verify_FPU(0, "method_entry");
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Wed Feb 15 16:29:40 2012 -0800
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Wed Feb 15 21:37:49 2012 -0800
@@ -550,118 +550,66 @@
 
 //=============================================================================
 #ifndef PRODUCT
-void MachPrologNode::format( PhaseRegAlloc *ra_, outputStream* st ) const {
+void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const {
   Compile* C = ra_->C;
-  if( C->in_24_bit_fp_mode() ) {
-    st->print("FLDCW  24 bit fpu control word");
-    st->print_cr(""); st->print("\t");
-  }
 
   int framesize = C->frame_slots() << LogBytesPerInt;
   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
-  // Remove two words for return addr and rbp,
-  framesize -= 2*wordSize;
-
-  // Calls to C2R adapters often do not accept exceptional returns.
-  // We require that their callers must bang for them.  But be careful, because
-  // some VM calls (such as call site linkage) can use several kilobytes of
-  // stack.  But the stack safety zone should account for that.
-  // See bugs 4446381, 4468289, 4497237.
+  // Remove wordSize for return addr which is already pushed.
+  framesize -= wordSize;
+
   if (C->need_stack_bang(framesize)) {
-    st->print_cr("# stack bang"); st->print("\t");
-  }
-  st->print_cr("PUSHL  EBP"); st->print("\t");
-
-  if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth
-    st->print("PUSH   0xBADB100D\t# Majik cookie for stack depth check");
-    st->print_cr(""); st->print("\t");
     framesize -= wordSize;
-  }
-
-  if ((C->in_24_bit_fp_mode() || VerifyStackAtCalls ) && framesize < 128 ) {
+    st->print("# stack bang");
+    st->print("\n\t");
+    st->print("PUSH   EBP\t# Save EBP");
     if (framesize) {
-      st->print("SUB    ESP,%d\t# Create frame",framesize);
+      st->print("\n\t");
+      st->print("SUB    ESP, #%d\t# Create frame",framesize);
     }
   } else {
-    st->print("SUB    ESP,%d\t# Create frame",framesize);
+    st->print("SUB    ESP, #%d\t# Create frame",framesize);
+    st->print("\n\t");
+    framesize -= wordSize;
+    st->print("MOV    [ESP + #%d], EBP\t# Save EBP",framesize);
+  }
+
+  if (VerifyStackAtCalls) {
+    st->print("\n\t");
+    framesize -= wordSize;
+    st->print("MOV    [ESP + #%d], 0xBADB100D\t# Majik cookie for stack depth check",framesize);
   }
+
+  if( C->in_24_bit_fp_mode() ) {
+    st->print("\n\t");
+    st->print("FLDCW  \t# load 24 bit fpu control word");
+  }
+  if (UseSSE >= 2 && VerifyFPU) {
+    st->print("\n\t");
+    st->print("# verify FPU stack (must be clean on entry)");
+  }
+
+#ifdef ASSERT
+  if (VerifyStackAtCalls) {
+    st->print("\n\t");
+    st->print("# stack alignment check");
+  }
+#endif
+  st->cr();
 }
 #endif
 
 
 void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   Compile* C = ra_->C;
-
-  if (UseSSE >= 2 && VerifyFPU) {
-    MacroAssembler masm(&cbuf);
-    masm.verify_FPU(0, "FPU stack must be clean on entry");
-  }
-
-  // WARNING: Initial instruction MUST be 5 bytes or longer so that
-  // NativeJump::patch_verified_entry will be able to patch out the entry
-  // code safely. The fldcw is ok at 6 bytes, the push to verify stack
-  // depth is ok at 5 bytes, the frame allocation can be either 3 or
-  // 6 bytes. So if we don't do the fldcw or the push then we must
-  // use the 6 byte frame allocation even if we have no frame. :-(
-  // If method sets FPU control word do it now
-  if( C->in_24_bit_fp_mode() ) {
-    MacroAssembler masm(&cbuf);
-    masm.fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
-  }
+  MacroAssembler _masm(&cbuf);
 
   int framesize = C->frame_slots() << LogBytesPerInt;
-  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
-  // Remove two words for return addr and rbp,
-  framesize -= 2*wordSize;
-
-  // Calls to C2R adapters often do not accept exceptional returns.
-  // We require that their callers must bang for them.  But be careful, because
-  // some VM calls (such as call site linkage) can use several kilobytes of
-  // stack.  But the stack safety zone should account for that.
-  // See bugs 4446381, 4468289, 4497237.
-  if (C->need_stack_bang(framesize)) {
-    MacroAssembler masm(&cbuf);
-    masm.generate_stack_overflow_check(framesize);
-  }
-
-  // We always push rbp, so that on return to interpreter rbp, will be
-  // restored correctly and we can correct the stack.
-  emit_opcode(cbuf, 0x50 | EBP_enc);
-
-  if( VerifyStackAtCalls ) { // Majik cookie to verify stack depth
-    emit_opcode(cbuf, 0x68); // push 0xbadb100d
-    emit_d32(cbuf, 0xbadb100d);
-    framesize -= wordSize;
-  }
-
-  if ((C->in_24_bit_fp_mode() || VerifyStackAtCalls ) && framesize < 128 ) {
-    if (framesize) {
-      emit_opcode(cbuf, 0x83);   // sub  SP,#framesize
-      emit_rm(cbuf, 0x3, 0x05, ESP_enc);
-      emit_d8(cbuf, framesize);
-    }
-  } else {
-    emit_opcode(cbuf, 0x81);   // sub  SP,#framesize
-    emit_rm(cbuf, 0x3, 0x05, ESP_enc);
-    emit_d32(cbuf, framesize);
-  }
+
+  __ verified_entry(framesize, C->need_stack_bang(framesize), C->in_24_bit_fp_mode());
+
   C->set_frame_complete(cbuf.insts_size());
 
-#ifdef ASSERT
-  if (VerifyStackAtCalls) {
-    Label L;
-    MacroAssembler masm(&cbuf);
-    masm.push(rax);
-    masm.mov(rax, rsp);
-    masm.andptr(rax, StackAlignmentInBytes-1);
-    masm.cmpptr(rax, StackAlignmentInBytes-wordSize);
-    masm.pop(rax);
-    masm.jcc(Assembler::equal, L);
-    masm.stop("Stack is not properly aligned!");
-    masm.bind(L);
-  }
-#endif
-
   if (C->has_mach_constant_base_node()) {
     // NOTE: We set the table base offset here because users might be
     // emitted before MachConstantBaseNode.
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Wed Feb 15 16:29:40 2012 -0800
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Wed Feb 15 21:37:49 2012 -0800
@@ -853,121 +853,53 @@
 
 //=============================================================================
 #ifndef PRODUCT
-void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const
-{
+void MachPrologNode::format(PhaseRegAlloc* ra_, outputStream* st) const {
   Compile* C = ra_->C;
 
   int framesize = C->frame_slots() << LogBytesPerInt;
   assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
-  // Remove wordSize for return adr already pushed
-  // and another for the RBP we are going to save
-  framesize -= 2*wordSize;
-  bool need_nop = true;
-
-  // Calls to C2R adapters often do not accept exceptional returns.
-  // We require that their callers must bang for them.  But be
-  // careful, because some VM calls (such as call site linkage) can
-  // use several kilobytes of stack.  But the stack safety zone should
-  // account for that.  See bugs 4446381, 4468289, 4497237.
+  // Remove wordSize for return addr which is already pushed.
+  framesize -= wordSize;
+
   if (C->need_stack_bang(framesize)) {
-    st->print_cr("# stack bang"); st->print("\t");
-    need_nop = false;
+    framesize -= wordSize;
+    st->print("# stack bang");
+    st->print("\n\t");
+    st->print("pushq   rbp\t# Save rbp");
+    if (framesize) {
+      st->print("\n\t");
+      st->print("subq    rsp, #%d\t# Create frame",framesize);
+    }
+  } else {
+    st->print("subq    rsp, #%d\t# Create frame",framesize);
+    st->print("\n\t");
+    framesize -= wordSize;
+    st->print("movq    [rsp + #%d], rbp\t# Save rbp",framesize);
   }
-  st->print_cr("pushq   rbp"); st->print("\t");
 
   if (VerifyStackAtCalls) {
-    // Majik cookie to verify stack depth
-    st->print_cr("pushq   0xffffffffbadb100d"
-                  "\t# Majik cookie for stack depth check");
-    st->print("\t");
-    framesize -= wordSize; // Remove 2 for cookie
-    need_nop = false;
+    st->print("\n\t");
+    framesize -= wordSize;
+    st->print("movq    [rsp + #%d], 0xbadb100d\t# Majik cookie for stack depth check",framesize);
+#ifdef ASSERT
+    st->print("\n\t");
+    st->print("# stack alignment check");
+#endif
   }
-
-  if (framesize) {
-    st->print("subq    rsp, #%d\t# Create frame", framesize);
-    if (framesize < 0x80 && need_nop) {
-      st->print("\n\tnop\t# nop for patch_verified_entry");
-    }
-  }
+  st->cr();
 }
 #endif
 
-void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const
-{
+void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const {
   Compile* C = ra_->C;
-
-  // WARNING: Initial instruction MUST be 5 bytes or longer so that
-  // NativeJump::patch_verified_entry will be able to patch out the entry
-  // code safely. The fldcw is ok at 6 bytes, the push to verify stack
-  // depth is ok at 5 bytes, the frame allocation can be either 3 or
-  // 6 bytes. So if we don't do the fldcw or the push then we must
-  // use the 6 byte frame allocation even if we have no frame. :-(
-  // If method sets FPU control word do it now
+  MacroAssembler _masm(&cbuf);
 
   int framesize = C->frame_slots() << LogBytesPerInt;
-  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
-  // Remove wordSize for return adr already pushed
-  // and another for the RBP we are going to save
-  framesize -= 2*wordSize;
-  bool need_nop = true;
-
-  // Calls to C2R adapters often do not accept exceptional returns.
-  // We require that their callers must bang for them.  But be
-  // careful, because some VM calls (such as call site linkage) can
-  // use several kilobytes of stack.  But the stack safety zone should
-  // account for that.  See bugs 4446381, 4468289, 4497237.
-  if (C->need_stack_bang(framesize)) {
-    MacroAssembler masm(&cbuf);
-    masm.generate_stack_overflow_check(framesize);
-    need_nop = false;
-  }
-
-  // We always push rbp so that on return to interpreter rbp will be
-  // restored correctly and we can correct the stack.
-  emit_opcode(cbuf, 0x50 | RBP_enc);
-
-  if (VerifyStackAtCalls) {
-    // Majik cookie to verify stack depth
-    emit_opcode(cbuf, 0x68); // pushq (sign-extended) 0xbadb100d
-    emit_d32(cbuf, 0xbadb100d);
-    framesize -= wordSize; // Remove 2 for cookie
-    need_nop = false;
-  }
-
-  if (framesize) {
-    emit_opcode(cbuf, Assembler::REX_W);
-    if (framesize < 0x80) {
-      emit_opcode(cbuf, 0x83);   // sub  SP,#framesize
-      emit_rm(cbuf, 0x3, 0x05, RSP_enc);
-      emit_d8(cbuf, framesize);
-      if (need_nop) {
-        emit_opcode(cbuf, 0x90); // nop
-      }
-    } else {
-      emit_opcode(cbuf, 0x81);   // sub  SP,#framesize
-      emit_rm(cbuf, 0x3, 0x05, RSP_enc);
-      emit_d32(cbuf, framesize);
-    }
-  }
+
+  __ verified_entry(framesize, C->need_stack_bang(framesize), false);
 
   C->set_frame_complete(cbuf.insts_size());
 
-#ifdef ASSERT
-  if (VerifyStackAtCalls) {
-    Label L;
-    MacroAssembler masm(&cbuf);
-    masm.push(rax);
-    masm.mov(rax, rsp);
-    masm.andptr(rax, StackAlignmentInBytes-1);
-    masm.cmpptr(rax, StackAlignmentInBytes-wordSize);
-    masm.pop(rax);
-    masm.jcc(Assembler::equal, L);
-    masm.stop("Stack is not properly aligned!");
-    masm.bind(L);
-  }
-#endif
-
   if (C->has_mach_constant_base_node()) {
     // NOTE: We set the table base offset here because users might be
     // emitted before MachConstantBaseNode.
--- a/hotspot/src/share/vm/opto/output.cpp	Wed Feb 15 16:29:40 2012 -0800
+++ b/hotspot/src/share/vm/opto/output.cpp	Wed Feb 15 21:37:49 2012 -0800
@@ -167,7 +167,7 @@
   // Determine if we need to generate a stack overflow check.
   // Do it if the method is not a stub function and
   // has java calls or has frame size > vm_page_size/8.
-  return (stub_function() == NULL &&
+  return (UseStackBanging && stub_function() == NULL &&
           (has_java_calls() || frame_size_in_bytes > os::vm_page_size()>>3));
 }