6813212: factor duplicated assembly code for general subclass check (for 6655638)
author jrose
Fri, 13 Mar 2009 18:39:22 -0700
changeset 2256 82d4e10b7c6b
parent 2255 54abdf3e1055
child 2257 d8e6e11e7f32
6813212: factor duplicated assembly code for general subclass check (for 6655638)
Summary: Code in interp_masm, stubGenerator, c1_LIRAssembler, and AD files moved into MacroAssembler.
Reviewed-by: kvn
hotspot/src/cpu/sparc/vm/assembler_sparc.cpp
hotspot/src/cpu/sparc/vm/assembler_sparc.hpp
hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp
hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp
hotspot/src/cpu/sparc/vm/interp_masm_sparc.cpp
hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp
hotspot/src/cpu/x86/vm/interp_masm_x86_32.cpp
hotspot/src/cpu/x86/vm/interp_masm_x86_64.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/opto/graphKit.cpp
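For orientation before the per-file diffs: the open-coded display/secondary-supers checks removed below are replaced by calls into three new MacroAssembler routines (a combined check, a tri-state fast path, and a linear-scan slow path). A minimal sketch of the combined entry point follows; the register and label names are placeholders for illustration, not taken from this changeset, and the SPARC flavor takes an extra temp register.

    Label L_is_subtype;
    // x86 flavor shown: one temp register, branches to L_is_subtype on a hit
    __ check_klass_subtype(Rsub_klass, Rsuper_klass, Rtmp, L_is_subtype);
    // not a subtype: control falls through here
    __ jmp(L_failure_path);            // hypothetical miss handler
    __ bind(L_is_subtype);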
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -2767,6 +2767,268 @@
 }
 
 
+void MacroAssembler::check_klass_subtype(Register sub_klass,
+                                         Register super_klass,
+                                         Register temp_reg,
+                                         Register temp2_reg,
+                                         Label& L_success) {
+  Label L_failure, L_pop_to_failure;
+  check_klass_subtype_fast_path(sub_klass, super_klass,
+                                temp_reg, temp2_reg,
+                                &L_success, &L_failure, NULL);
+  Register sub_2 = sub_klass;
+  Register sup_2 = super_klass;
+  if (!sub_2->is_global())  sub_2 = L0;
+  if (!sup_2->is_global())  sup_2 = L1;
+
+  save_frame_and_mov(0, sub_klass, sub_2, super_klass, sup_2);
+  check_klass_subtype_slow_path(sub_2, sup_2,
+                                L2, L3, L4, L5,
+                                NULL, &L_pop_to_failure);
+
+  // on success:
+  restore();
+  ba(false, L_success);
+  delayed()->nop();
+
+  // on failure:
+  bind(L_pop_to_failure);
+  restore();
+  bind(L_failure);
+}
+
+
+void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
+                                                   Register super_klass,
+                                                   Register temp_reg,
+                                                   Register temp2_reg,
+                                                   Label* L_success,
+                                                   Label* L_failure,
+                                                   Label* L_slow_path,
+                                        RegisterConstant super_check_offset,
+                                        Register instanceof_hack) {
+  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
+                   Klass::secondary_super_cache_offset_in_bytes());
+  int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
+                    Klass::super_check_offset_offset_in_bytes());
+
+  bool must_load_sco  = (super_check_offset.constant_or_zero() == -1);
+  bool need_slow_path = (must_load_sco ||
+                         super_check_offset.constant_or_zero() == sco_offset);
+
+  assert_different_registers(sub_klass, super_klass, temp_reg);
+  if (super_check_offset.is_register()) {
+    assert_different_registers(sub_klass, super_klass,
+                               super_check_offset.as_register());
+  } else if (must_load_sco) {
+    assert(temp2_reg != noreg, "supply either a temp or a register offset");
+  }
+
+  Label L_fallthrough;
+  int label_nulls = 0;
+  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
+  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
+  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
+  assert(label_nulls <= 1 || instanceof_hack != noreg ||
+         (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
+         "at most one NULL in the batch, usually");
+
+  // Support for the instanceof hack, which uses delay slots to
+  // set a destination register to zero or one.
+  bool do_bool_sets = (instanceof_hack != noreg);
+#define BOOL_SET(bool_value)                            \
+  if (do_bool_sets && bool_value >= 0)                  \
+    set(bool_value, instanceof_hack)
+#define DELAYED_BOOL_SET(bool_value)                    \
+  if (do_bool_sets && bool_value >= 0)                  \
+    delayed()->set(bool_value, instanceof_hack);        \
+  else delayed()->nop()
+  // Hacked ba(), which may only be used just before L_fallthrough.
+#define FINAL_JUMP(label, bool_value)                   \
+  if (&(label) == &L_fallthrough) {                     \
+    BOOL_SET(bool_value);                               \
+  } else {                                              \
+    ba((do_bool_sets && bool_value >= 0), label);       \
+    DELAYED_BOOL_SET(bool_value);                       \
+  }
+
+  // If the pointers are equal, we are done (e.g., String[] elements).
+  // This self-check enables sharing of secondary supertype arrays among
+  // non-primary types such as array-of-interface.  Otherwise, each such
+  // type would need its own customized SSA.
+  // We move this check to the front of the fast path because many
+  // type checks are in fact trivially successful in this manner,
+  // so we get a nicely predicted branch right at the start of the check.
+  cmp(super_klass, sub_klass);
+  brx(Assembler::equal, do_bool_sets, Assembler::pn, *L_success);
+  DELAYED_BOOL_SET(1);
+
+  // Check the supertype display:
+  if (must_load_sco) {
+    // The super check offset is always positive...
+    lduw(super_klass, sco_offset, temp2_reg);
+    super_check_offset = RegisterConstant(temp2_reg);
+  }
+  ld_ptr(sub_klass, super_check_offset, temp_reg);
+  cmp(super_klass, temp_reg);
+
+  // This check has worked decisively for primary supers.
+  // Secondary supers are sought in the super_cache ('super_cache_addr').
+  // (Secondary supers are interfaces and very deeply nested subtypes.)
+  // This works in the same check above because of a tricky aliasing
+  // between the super_cache and the primary super display elements.
+  // (The 'super_check_addr' can address either, as the case requires.)
+  // Note that the cache is updated below if it does not help us find
+  // what we need immediately.
+  // So if it was a primary super, we can just fail immediately.
+  // Otherwise, it's the slow path for us (no success at this point).
+
+  if (super_check_offset.is_register()) {
+    brx(Assembler::equal, do_bool_sets, Assembler::pn, *L_success);
+    delayed(); if (do_bool_sets)  BOOL_SET(1);
+    // if !do_bool_sets, sneak the next cmp into the delay slot:
+    cmp(super_check_offset.as_register(), sc_offset);
+
+    if (L_failure == &L_fallthrough) {
+      brx(Assembler::equal, do_bool_sets, Assembler::pt, *L_slow_path);
+      delayed()->nop();
+      BOOL_SET(0);  // fallthrough on failure
+    } else {
+      brx(Assembler::notEqual, do_bool_sets, Assembler::pn, *L_failure);
+      DELAYED_BOOL_SET(0);
+      FINAL_JUMP(*L_slow_path, -1);  // -1 => vanilla delay slot
+    }
+  } else if (super_check_offset.as_constant() == sc_offset) {
+    // Need a slow path; fast failure is impossible.
+    if (L_slow_path == &L_fallthrough) {
+      brx(Assembler::equal, do_bool_sets, Assembler::pt, *L_success);
+      DELAYED_BOOL_SET(1);
+    } else {
+      brx(Assembler::notEqual, false, Assembler::pn, *L_slow_path);
+      delayed()->nop();
+      FINAL_JUMP(*L_success, 1);
+    }
+  } else {
+    // No slow path; it's a fast decision.
+    if (L_failure == &L_fallthrough) {
+      brx(Assembler::equal, do_bool_sets, Assembler::pt, *L_success);
+      DELAYED_BOOL_SET(1);
+      BOOL_SET(0);
+    } else {
+      brx(Assembler::notEqual, do_bool_sets, Assembler::pn, *L_failure);
+      DELAYED_BOOL_SET(0);
+      FINAL_JUMP(*L_success, 1);
+    }
+  }
+
+  bind(L_fallthrough);
+
+#undef BOOL_SET
+#undef DELAYED_BOOL_SET
+#undef FINAL_JUMP
+}
+
+
+void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
+                                                   Register super_klass,
+                                                   Register count_temp,
+                                                   Register scan_temp,
+                                                   Register scratch_reg,
+                                                   Register coop_reg,
+                                                   Label* L_success,
+                                                   Label* L_failure) {
+  assert_different_registers(sub_klass, super_klass,
+                             count_temp, scan_temp, scratch_reg, coop_reg);
+
+  Label L_fallthrough, L_loop;
+  int label_nulls = 0;
+  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
+  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
+  assert(label_nulls <= 1, "at most one NULL in the batch");
+
+  // a couple of useful fields in sub_klass:
+  int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
+                   Klass::secondary_supers_offset_in_bytes());
+  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
+                   Klass::secondary_super_cache_offset_in_bytes());
+
+  // Do a linear scan of the secondary super-klass chain.
+  // This code is rarely used, so simplicity is a virtue here.
+
+#ifndef PRODUCT
+  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
+  inc_counter((address) pst_counter, count_temp, scan_temp);
+#endif
+
+  // We will consult the secondary-super array.
+  ld_ptr(sub_klass, ss_offset, scan_temp);
+
+  // Compress superclass if necessary.
+  Register search_key = super_klass;
+  bool decode_super_klass = false;
+  if (UseCompressedOops) {
+    if (coop_reg != noreg) {
+      encode_heap_oop_not_null(super_klass, coop_reg);
+      search_key = coop_reg;
+    } else {
+      encode_heap_oop_not_null(super_klass);
+      decode_super_klass = true; // scarce temps!
+    }
+    // The superclass is never null; it would be a basic system error if a null
+    // pointer were to sneak in here.  Note that we have already loaded the
+    // Klass::super_check_offset from the super_klass in the fast path,
+    // so if there is a null in that register, we are already in the afterlife.
+  }
+
+  // Load the array length.  (A positive 32-bit load does the right thing on LP64.)
+  lduw(scan_temp, arrayOopDesc::length_offset_in_bytes(), count_temp);
+
+  // Check for empty secondary super list
+  tst(count_temp);
+
+  // Top of search loop
+  bind(L_loop);
+  br(Assembler::equal, false, Assembler::pn, *L_failure);
+  delayed()->add(scan_temp, heapOopSize, scan_temp);
+  assert(heapOopSize != 0, "heapOopSize should be initialized");
+
+  // Skip the array header in all array accesses.
+  int elem_offset = arrayOopDesc::base_offset_in_bytes(T_OBJECT);
+  elem_offset -= heapOopSize;   // the scan pointer was pre-incremented also
+
+  // Load next super to check
+  if (UseCompressedOops) {
+    // Don't use load_heap_oop; we don't want to decode the element.
+    lduw(   scan_temp, elem_offset, scratch_reg );
+  } else {
+    ld_ptr( scan_temp, elem_offset, scratch_reg );
+  }
+
+  // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
+  cmp(scratch_reg, search_key);
+
+  // A miss means we are NOT a subtype and need to keep looping
+  brx(Assembler::notEqual, false, Assembler::pn, L_loop);
+  delayed()->deccc(count_temp); // decrement trip counter in delay slot
+
+  // Falling out the bottom means we found a hit; we ARE a subtype
+  if (decode_super_klass) decode_heap_oop(super_klass);
+
+  // Success.  Cache the super we found and proceed in triumph.
+  st_ptr(super_klass, sub_klass, sc_offset);
+
+  if (L_success != &L_fallthrough) {
+    ba(false, *L_success);
+    delayed()->nop();
+  }
+
+  bind(L_fallthrough);
+}
+
+
+
+
 void MacroAssembler::biased_locking_enter(Register obj_reg, Register mark_reg,
                                           Register temp_reg,
                                           Label& done, Label* slow_case,
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp	Fri Mar 13 18:39:22 2009 -0700
@@ -2327,6 +2327,46 @@
                                Register temp_reg, Register temp2_reg,
                                Label& no_such_interface);
 
+  // Test sub_klass against super_klass, with fast and slow paths.
+
+  // The fast path produces a tri-state answer: yes / no / maybe-slow.
+  // One of the three labels can be NULL, meaning take the fall-through.
+  // If super_check_offset is -1, the value is loaded up from super_klass.
+  // No registers are killed, except temp_reg and temp2_reg.
+  // If super_check_offset is not -1, temp2_reg is not used and can be noreg.
+  void check_klass_subtype_fast_path(Register sub_klass,
+                                     Register super_klass,
+                                     Register temp_reg,
+                                     Register temp2_reg,
+                                     Label* L_success,
+                                     Label* L_failure,
+                                     Label* L_slow_path,
+                RegisterConstant super_check_offset = RegisterConstant(-1),
+                Register instanceof_hack = noreg);
+
+  // The rest of the type check; must be wired to a corresponding fast path.
+  // It does not repeat the fast path logic, so don't use it standalone.
+  // The temp_reg can be noreg, if no temps are available.
+  // It can also be sub_klass or super_klass, meaning it's OK to kill that one.
+  // Updates the sub's secondary super cache as necessary.
+  void check_klass_subtype_slow_path(Register sub_klass,
+                                     Register super_klass,
+                                     Register temp_reg,
+                                     Register temp2_reg,
+                                     Register temp3_reg,
+                                     Register temp4_reg,
+                                     Label* L_success,
+                                     Label* L_failure);
+
+  // Simplified, combined version, good for typical uses.
+  // Falls through on failure.
+  void check_klass_subtype(Register sub_klass,
+                           Register super_klass,
+                           Register temp_reg,
+                           Register temp2_reg,
+                           Label& L_success);
+
+
   // Stack overflow checking
 
   // Note: this clobbers G3_scratch
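For callers that need finer control, the header above documents a split protocol: the fast path produces yes / no / maybe, a NULL label means "fall through", and the slow path must be wired directly behind it. The interpreter change later in this patch (interp_masm_sparc.cpp) uses exactly this wiring; an illustrative sketch with placeholder register names:

    Label L_ok, L_fail;
    check_klass_subtype_fast_path(Rsub, Rsuper, Rt1, Rt2,
                                  &L_ok, &L_fail, /*L_slow_path=*/ NULL);
    // NULL slow-path label: the "maybe" case falls through into the scan.
    check_klass_subtype_slow_path(Rsub, Rsuper, Rt1, Rt2, Rt3, Rt4,
                                  &L_ok, /*L_failure=*/ NULL);
    bind(L_fail);   // reached on a definite or a scanned miss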
--- a/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -2393,23 +2393,11 @@
 
     // get instance klass
     load(k_RInfo, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc), k_RInfo, T_OBJECT, NULL);
-    // get super_check_offset
-    load(k_RInfo, sizeof(oopDesc) + Klass::super_check_offset_offset_in_bytes(), Rtmp1, T_INT, NULL);
-    // See if we get an immediate positive hit
-    __ ld_ptr(klass_RInfo, Rtmp1, FrameMap::O7_oop_opr->as_register());
-    __ cmp(k_RInfo, O7);
-    __ br(Assembler::equal, false, Assembler::pn, done);
-    __ delayed()->nop();
-    // check for immediate negative hit
-    __ cmp(Rtmp1, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes());
-    __ br(Assembler::notEqual, false, Assembler::pn, *stub->entry());
-    __ delayed()->nop();
-    // check for self
-    __ cmp(klass_RInfo, k_RInfo);
-    __ br(Assembler::equal, false, Assembler::pn, done);
-    __ delayed()->nop();
-
-    // assert(sub.is_same(FrameMap::G3_RInfo) && super.is_same(FrameMap::G1_RInfo), "incorrect call setup");
+    // perform the fast part of the checking logic
+    __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, O7, &done, stub->entry(), NULL);
+
+    // call out-of-line instance of __ check_klass_subtype_slow_path(...):
+    assert(klass_RInfo == G3 && k_RInfo == G1, "incorrect call setup");
     __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
     __ delayed()->nop();
     __ cmp(G3, 0);
@@ -2493,58 +2481,30 @@
       __ delayed()->nop();
       __ bind(done);
     } else {
+      bool need_slow_path = true;
       if (k->is_loaded()) {
-        load(klass_RInfo, k->super_check_offset(), Rtmp1, T_OBJECT, NULL);
-
-        if (sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() != k->super_check_offset()) {
-          // See if we get an immediate positive hit
-          __ cmp(Rtmp1, k_RInfo );
-          __ br(Assembler::notEqual, false, Assembler::pn, *stub->entry());
-          __ delayed()->nop();
-        } else {
-          // See if we get an immediate positive hit
-          assert_different_registers(Rtmp1, k_RInfo, klass_RInfo);
-          __ cmp(Rtmp1, k_RInfo );
-          __ br(Assembler::equal, false, Assembler::pn, done);
-          // check for self
-          __ delayed()->cmp(klass_RInfo, k_RInfo);
-          __ br(Assembler::equal, false, Assembler::pn, done);
-          __ delayed()->nop();
-
-          // assert(sub.is_same(FrameMap::G3_RInfo) && super.is_same(FrameMap::G1_RInfo), "incorrect call setup");
-          __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
-          __ delayed()->nop();
-          __ cmp(G3, 0);
-          __ br(Assembler::equal, false, Assembler::pn, *stub->entry());
-          __ delayed()->nop();
-        }
-        __ bind(done);
+        if (k->super_check_offset() != sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes())
+          need_slow_path = false;
+        // perform the fast part of the checking logic
+        __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, noreg,
+                                         (need_slow_path ? &done : NULL),
+                                         stub->entry(), NULL,
+                                         RegisterConstant(k->super_check_offset()));
       } else {
-        assert_different_registers(Rtmp1, klass_RInfo, k_RInfo);
-
-        load(k_RInfo, sizeof(oopDesc) + Klass::super_check_offset_offset_in_bytes(), Rtmp1, T_INT, NULL);
-        // See if we get an immediate positive hit
-        load(klass_RInfo, Rtmp1, FrameMap::O7_oop_opr, T_OBJECT);
-        __ cmp(k_RInfo, O7);
-        __ br(Assembler::equal, false, Assembler::pn, done);
-        __ delayed()->nop();
-        // check for immediate negative hit
-        __ cmp(Rtmp1, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes());
-        __ br(Assembler::notEqual, false, Assembler::pn, *stub->entry());
-        // check for self
-        __ delayed()->cmp(klass_RInfo, k_RInfo);
-        __ br(Assembler::equal, false, Assembler::pn, done);
-        __ delayed()->nop();
-
-        // assert(sub.is_same(FrameMap::G3_RInfo) && super.is_same(FrameMap::G1_RInfo), "incorrect call setup");
+        // perform the fast part of the checking logic
+        __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, O7,
+                                         &done, stub->entry(), NULL);
+      }
+      if (need_slow_path) {
+        // call out-of-line instance of __ check_klass_subtype_slow_path(...):
+        assert(klass_RInfo == G3 && k_RInfo == G1, "incorrect call setup");
         __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
         __ delayed()->nop();
         __ cmp(G3, 0);
         __ br(Assembler::equal, false, Assembler::pn, *stub->entry());
         __ delayed()->nop();
-        __ bind(done);
       }
-
+      __ bind(done);
     }
     __ mov(obj, dst);
   } else if (code == lir_instanceof) {
@@ -2582,58 +2542,32 @@
       __ set(0, dst);
       __ bind(done);
     } else {
+      bool need_slow_path = true;
       if (k->is_loaded()) {
-        assert_different_registers(Rtmp1, klass_RInfo, k_RInfo);
-        load(klass_RInfo, k->super_check_offset(), Rtmp1, T_OBJECT, NULL);
-
-        if (sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() != k->super_check_offset()) {
-          // See if we get an immediate positive hit
-          __ cmp(Rtmp1, k_RInfo );
-          __ br(Assembler::equal, true, Assembler::pt, done);
-          __ delayed()->set(1, dst);
-          __ set(0, dst);
-          __ bind(done);
-        } else {
-          // See if we get an immediate positive hit
-          assert_different_registers(Rtmp1, k_RInfo, klass_RInfo);
-          __ cmp(Rtmp1, k_RInfo );
-          __ br(Assembler::equal, true, Assembler::pt, done);
-          __ delayed()->set(1, dst);
-          // check for self
-          __ cmp(klass_RInfo, k_RInfo);
-          __ br(Assembler::equal, true, Assembler::pt, done);
-          __ delayed()->set(1, dst);
-
-          // assert(sub.is_same(FrameMap::G3_RInfo) && super.is_same(FrameMap::G1_RInfo), "incorrect call setup");
-          __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
-          __ delayed()->nop();
-          __ mov(G3, dst);
-          __ bind(done);
-        }
+        if (k->super_check_offset() != sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes())
+          need_slow_path = false;
+        // perform the fast part of the checking logic
+        __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, O7, noreg,
+                                         (need_slow_path ? &done : NULL),
+                                         (need_slow_path ? &done : NULL), NULL,
+                                         RegisterConstant(k->super_check_offset()),
+                                         dst);
       } else {
         assert(dst != klass_RInfo && dst != k_RInfo, "need 3 registers");
-
-        load(k_RInfo, sizeof(oopDesc) + Klass::super_check_offset_offset_in_bytes(), dst, T_INT, NULL);
-        // See if we get an immediate positive hit
-        load(klass_RInfo, dst, FrameMap::O7_oop_opr, T_OBJECT);
-        __ cmp(k_RInfo, O7);
-        __ br(Assembler::equal, true, Assembler::pt, done);
-        __ delayed()->set(1, dst);
-        // check for immediate negative hit
-        __ cmp(dst, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes());
-        __ br(Assembler::notEqual, true, Assembler::pt, done);
-        __ delayed()->set(0, dst);
-        // check for self
-        __ cmp(klass_RInfo, k_RInfo);
-        __ br(Assembler::equal, true, Assembler::pt, done);
-        __ delayed()->set(1, dst);
-
-        // assert(sub.is_same(FrameMap::G3_RInfo) && super.is_same(FrameMap::G1_RInfo), "incorrect call setup");
+        // perform the fast part of the checking logic
+        __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, O7, dst,
+                                         &done, &done, NULL,
+                                         RegisterConstant(-1),
+                                         dst);
+      }
+      if (need_slow_path) {
+        // call out-of-line instance of __ check_klass_subtype_slow_path(...):
+        assert(klass_RInfo == G3 && k_RInfo == G1, "incorrect call setup");
         __ call(Runtime1::entry_for(Runtime1::slow_subtype_check_id), relocInfo::runtime_call_type);
         __ delayed()->nop();
         __ mov(G3, dst);
-        __ bind(done);
       }
+      __ bind(done);
     }
   } else {
     ShouldNotReachHere();
--- a/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -714,38 +714,19 @@
         //      sub  : G3, argument, destroyed
         //      super: G1, argument, not changed
         //      raddr: O7, blown by call
-        Label loop, miss;
+        Label miss;
 
         __ save_frame(0);               // Blow no registers!
 
-        __ ld_ptr( G3, sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes(), L3 );
-        __ lduw(L3,arrayOopDesc::length_offset_in_bytes(),L0); // length in l0
-        __ add(L3,arrayOopDesc::base_offset_in_bytes(T_OBJECT),L1); // ptr into array
-        __ clr(L4);                     // Index
-        // Load a little early; will load 1 off the end of the array.
-        // Ok for now; revisit if we have other uses of this routine.
-        __ ld_ptr(L1,0,L2);             // Will load a little early
-
-        // The scan loop
-        __ bind(loop);
-        __ add(L1,wordSize,L1); // Bump by OOP size
-        __ cmp(L4,L0);
-        __ br(Assembler::equal,false,Assembler::pn,miss);
-        __ delayed()->inc(L4);  // Bump index
-        __ subcc(L2,G1,L3);             // Check for match; zero in L3 for a hit
-        __ brx( Assembler::notEqual, false, Assembler::pt, loop );
-        __ delayed()->ld_ptr(L1,0,L2); // Will load a little early
-
-        // Got a hit; report success; set cache
-        __ st_ptr( G1, G3, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() );
+        __ check_klass_subtype_slow_path(G3, G1, L0, L1, L2, L4, NULL, &miss);
 
         __ mov(1, G3);
-        __ ret();                       // Result in G5 is ok; flags set
+        __ ret();                       // Result in G5 is 'true'
         __ delayed()->restore();        // free copy or add can go here
 
         __ bind(miss);
         __ mov(0, G3);
-        __ ret();                       // Result in G5 is ok; flags set
+        __ ret();                       // Result in G5 is 'false'
         __ delayed()->restore();        // free copy or add can go here
       }
 
--- a/hotspot/src/cpu/sparc/vm/interp_masm_sparc.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/sparc/vm/interp_masm_sparc.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -866,65 +866,18 @@
                                                   Register Rtmp2,
                                                   Register Rtmp3,
                                                   Label &ok_is_subtype ) {
-  Label not_subtype, loop;
+  Label not_subtype;
 
   // Profile the not-null value's klass.
   profile_typecheck(Rsub_klass, Rtmp1);
 
-  // Load the super-klass's check offset into Rtmp1
-  ld( Rsuper_klass, sizeof(oopDesc) + Klass::super_check_offset_offset_in_bytes(), Rtmp1 );
-  // Load from the sub-klass's super-class display list, or a 1-word cache of
-  // the secondary superclass list, or a failing value with a sentinel offset
-  // if the super-klass is an interface or exceptionally deep in the Java
-  // hierarchy and we have to scan the secondary superclass list the hard way.
-  ld_ptr( Rsub_klass, Rtmp1, Rtmp2 );
-  // See if we get an immediate positive hit
-  cmp( Rtmp2, Rsuper_klass );
-  brx( Assembler::equal, false, Assembler::pt, ok_is_subtype );
-  // In the delay slot, check for immediate negative hit
-  delayed()->cmp( Rtmp1, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() );
-  br( Assembler::notEqual, false, Assembler::pt, not_subtype );
-  // In the delay slot, check for self
-  delayed()->cmp( Rsub_klass, Rsuper_klass );
-  brx( Assembler::equal, false, Assembler::pt, ok_is_subtype );
-
-  // Now do a linear scan of the secondary super-klass chain.
-  delayed()->ld_ptr( Rsub_klass, sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes(), Rtmp2 );
-
-  // compress superclass
-  if (UseCompressedOops) encode_heap_oop(Rsuper_klass);
-
-  // Rtmp2 holds the objArrayOop of secondary supers.
-  ld( Rtmp2, arrayOopDesc::length_offset_in_bytes(), Rtmp1 );// Load the array length
-  // Check for empty secondary super list
-  tst(Rtmp1);
-
-  // Top of search loop
-  bind( loop );
-  br( Assembler::equal, false, Assembler::pn, not_subtype );
-  delayed()->nop();
-
-  // load next super to check
-  if (UseCompressedOops) {
-    lduw( Rtmp2, arrayOopDesc::base_offset_in_bytes(T_OBJECT), Rtmp3);
-    // Bump array pointer forward one oop
-    add( Rtmp2, 4, Rtmp2 );
-  } else {
-    ld_ptr( Rtmp2, arrayOopDesc::base_offset_in_bytes(T_OBJECT), Rtmp3);
-    // Bump array pointer forward one oop
-    add( Rtmp2, wordSize, Rtmp2);
-  }
-  // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
-  cmp( Rtmp3, Rsuper_klass );
-  // A miss means we are NOT a subtype and need to keep looping
-  brx( Assembler::notEqual, false, Assembler::pt, loop );
-  delayed()->deccc( Rtmp1 );    // dec trip counter in delay slot
-  // Falling out the bottom means we found a hit; we ARE a subtype
-  if (UseCompressedOops) decode_heap_oop(Rsuper_klass);
-  br( Assembler::always, false, Assembler::pt, ok_is_subtype );
-  // Update the cache
-  delayed()->st_ptr( Rsuper_klass, Rsub_klass,
-                     sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() );
+  check_klass_subtype_fast_path(Rsub_klass, Rsuper_klass,
+                                Rtmp1, Rtmp2,
+                                &ok_is_subtype, &not_subtype, NULL);
+
+  check_klass_subtype_slow_path(Rsub_klass, Rsuper_klass,
+                                Rtmp1, Rtmp2, Rtmp3, /*hack:*/ noreg,
+                                &ok_is_subtype, NULL);
 
   bind(not_subtype);
   profile_typecheck_failed(Rtmp1);
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -900,19 +900,7 @@
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
     address start = __ pc();
-    Label loop, miss;
-
-    // Compare super with sub directly, since super is not in its own SSA.
-    // The compiler used to emit this test, but we fold it in here,
-    // to increase overall code density, with no real loss of speed.
-    { Label L;
-      __ cmp(O1, O2);
-      __ brx(Assembler::notEqual, false, Assembler::pt, L);
-      __ delayed()->nop();
-      __ retl();
-      __ delayed()->addcc(G0,0,O0); // set Z flags, zero result
-      __ bind(L);
-    }
+    Label miss;
 
 #if defined(COMPILER2) && !defined(_LP64)
     // Do not use a 'save' because it blows the 64-bit O registers.
@@ -936,56 +924,12 @@
     Register L2_super   = L2;
     Register L3_index   = L3;
 
-#ifdef _LP64
-    Register L4_ooptmp  = L4;
-
-    if (UseCompressedOops) {
-      // this must be under UseCompressedOops check, as we rely upon fact
-      // that L4 not clobbered in C2 on 32-bit platforms, where we do explicit save
-      // on stack, see several lines above
-      __ encode_heap_oop(Rsuper, L4_ooptmp);
-    }
-#endif
-
-    inc_counter_np(SharedRuntime::_partial_subtype_ctr, L0, L1);
-
-    __ ld_ptr( Rsub, sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes(), L3 );
-    __ lduw(L3,arrayOopDesc::length_offset_in_bytes(),L0_ary_len);
-    __ add(L3,arrayOopDesc::base_offset_in_bytes(T_OBJECT),L1_ary_ptr);
-    __ clr(L3_index);           // zero index
-    // Load a little early; will load 1 off the end of the array.
-    // Ok for now; revisit if we have other uses of this routine.
-    if (UseCompressedOops) {
-      __ lduw(L1_ary_ptr,0,L2_super);// Will load a little early
-    } else {
-      __ ld_ptr(L1_ary_ptr,0,L2_super);// Will load a little early
-    }
-
-    assert(heapOopSize != 0, "heapOopSize should be initialized");
-    // The scan loop
-    __ BIND(loop);
-    __ add(L1_ary_ptr, heapOopSize, L1_ary_ptr); // Bump by OOP size
-    __ cmp(L3_index,L0_ary_len);
-    __ br(Assembler::equal,false,Assembler::pn,miss);
-    __ delayed()->inc(L3_index); // Bump index
-
-    if (UseCompressedOops) {
-#ifdef  _LP64
-      __ subcc(L2_super,L4_ooptmp,Rret);   // Check for match; zero in Rret for a hit
-      __ br( Assembler::notEqual, false, Assembler::pt, loop );
-      __ delayed()->lduw(L1_ary_ptr,0,L2_super);// Will load a little early
-#else
-      ShouldNotReachHere();
-#endif
-    } else {
-      __ subcc(L2_super,Rsuper,Rret);   // Check for match; zero in Rret for a hit
-      __ brx( Assembler::notEqual, false, Assembler::pt, loop );
-      __ delayed()->ld_ptr(L1_ary_ptr,0,L2_super);// Will load a little early
-    }
-
-    // Got a hit; report success; set cache.  Cache load doesn't
-    // happen here; for speed it is directly emitted by the compiler.
-    __ st_ptr( Rsuper, Rsub, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() );
+    __ check_klass_subtype_slow_path(Rsub, Rsuper,
+                                     L0, L1, L2, L3,
+                                     NULL, &miss);
+
+    // Match falls through here.
+    __ addcc(G0,0,Rret);        // set Z flags, Z result
 
 #if defined(COMPILER2) && !defined(_LP64)
     __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
@@ -999,7 +943,6 @@
     __ delayed()->restore();
 #endif
 
-    // Hit or miss falls through here
     __ BIND(miss);
     __ addcc(G0,1,Rret);        // set NZ flags, NZ result
 
@@ -2330,51 +2273,31 @@
                            Register super_check_offset,
                            Register super_klass,
                            Register temp,
-                           Label& L_success,
-                           Register deccc_hack = noreg) {
+                           Label& L_success) {
     assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
 
     BLOCK_COMMENT("type_check:");
 
-    Label L_miss;
+    Label L_miss, L_pop_to_miss;
 
     assert_clean_int(super_check_offset, temp);
 
-    // maybe decrement caller's trip count:
-#define DELAY_SLOT delayed();   \
-    { if (deccc_hack == noreg) __ nop(); else __ deccc(deccc_hack); }
-
-    // if the pointers are equal, we are done (e.g., String[] elements)
-    __ cmp(sub_klass, super_klass);
-    __ brx(Assembler::equal, true, Assembler::pt, L_success);
-    __ DELAY_SLOT;
-
-    // check the supertype display:
-    __ ld_ptr(sub_klass, super_check_offset, temp); // query the super type
-    __ cmp(super_klass,                      temp); // test the super type
-    __ brx(Assembler::equal, true, Assembler::pt, L_success);
-    __ DELAY_SLOT;
-
-    int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                     Klass::secondary_super_cache_offset_in_bytes());
-    __ cmp(super_klass, sc_offset);
-    __ brx(Assembler::notEqual, true, Assembler::pt, L_miss);
-    __ delayed()->nop();
-
+    __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
+                                     &L_success, &L_miss, NULL,
+                                     super_check_offset);
+
+    BLOCK_COMMENT("type_check_slow_path:");
     __ save_frame(0);
-    __ mov(sub_klass->after_save(), O1);
-    // mov(super_klass->after_save(), O2); //fill delay slot
-    assert(StubRoutines::Sparc::_partial_subtype_check != NULL, "order of generation");
-    __ call(StubRoutines::Sparc::_partial_subtype_check);
-    __ delayed()->mov(super_klass->after_save(), O2);
+    __ check_klass_subtype_slow_path(sub_klass->after_save(),
+                                     super_klass->after_save(),
+                                     L0, L1, L2, L4,
+                                     NULL, &L_pop_to_miss);
+    __ ba(false, L_success);
+    __ delayed()->restore();
+
+    __ bind(L_pop_to_miss);
     __ restore();
 
-    // Upon return, the condition codes are already set.
-    __ brx(Assembler::equal, true, Assembler::pt, L_success);
-    __ DELAY_SLOT;
-
-#undef DELAY_SLOT
-
     // Fall through on failure!
     __ BIND(L_miss);
   }
@@ -2411,7 +2334,7 @@
     gen_write_ref_array_pre_barrier(O1, O2);
 
 #ifdef ASSERT
-    // We sometimes save a frame (see partial_subtype_check below).
+    // We sometimes save a frame (see generate_type_check below).
     // If this will cause trouble, let's fail now instead of later.
     __ save_frame(0);
     __ restore();
@@ -2455,41 +2378,39 @@
     //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
     __ align(16);
 
-    __ bind(store_element);
-    // deccc(G1_remain);                // decrement the count (hoisted)
+    __ BIND(store_element);
+    __ deccc(G1_remain);                // decrement the count
     __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
     __ inc(O5_offset, heapOopSize);     // step to next offset
     __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
     __ delayed()->set(0, O0);           // return -1 on success
 
     // ======== loop entry is here ========
-    __ bind(load_element);
+    __ BIND(load_element);
     __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
     __ br_null(G3_oop, true, Assembler::pt, store_element);
-    __ delayed()->deccc(G1_remain);     // decrement the count
+    __ delayed()->nop();
 
     __ load_klass(G3_oop, G4_klass); // query the object klass
 
     generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
                         // branch to this on success:
-                        store_element,
-                        // decrement this on success:
-                        G1_remain);
+                        store_element);
     // ======== end loop ========
 
     // It was a real error; we must depend on the caller to finish the job.
     // Register G1 has number of *remaining* oops, O2 number of *total* oops.
     // Emit GC store barriers for the oops we have copied (O2 minus G1),
     // and report their number to the caller.
-    __ bind(fail);
+    __ BIND(fail);
     __ subcc(O2_count, G1_remain, O2_count);
     __ brx(Assembler::zero, false, Assembler::pt, done);
     __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
 
-    __ bind(do_card_marks);
+    __ BIND(do_card_marks);
     gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
 
-    __ bind(done);
+    __ BIND(done);
     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
     __ retl();
     __ delayed()->nop();             // return value in O0
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -7286,6 +7286,225 @@
 }
 
 
+void MacroAssembler::check_klass_subtype(Register sub_klass,
+                           Register super_klass,
+                           Register temp_reg,
+                           Label& L_success) {
+  Label L_failure;
+  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
+  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
+  bind(L_failure);
+}
+
+
+void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
+                                                   Register super_klass,
+                                                   Register temp_reg,
+                                                   Label* L_success,
+                                                   Label* L_failure,
+                                                   Label* L_slow_path,
+                                        RegisterConstant super_check_offset) {
+  assert_different_registers(sub_klass, super_klass, temp_reg);
+  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
+  if (super_check_offset.is_register()) {
+    assert_different_registers(sub_klass, super_klass,
+                               super_check_offset.as_register());
+  } else if (must_load_sco) {
+    assert(temp_reg != noreg, "supply either a temp or a register offset");
+  }
+
+  Label L_fallthrough;
+  int label_nulls = 0;
+  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
+  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
+  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
+  assert(label_nulls <= 1, "at most one NULL in the batch");
+
+  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
+                   Klass::secondary_super_cache_offset_in_bytes());
+  int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
+                    Klass::super_check_offset_offset_in_bytes());
+  Address super_check_offset_addr(super_klass, sco_offset);
+
+  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
+  // range of a jccb.  If this routine grows larger, reconsider at
+  // least some of these.
+#define local_jcc(assembler_cond, label)                                \
+  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
+  else                             jcc( assembler_cond, label) /*omit semi*/
+
+  // Hacked jmp, which may only be used just before L_fallthrough.
+#define final_jmp(label)                                                \
+  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
+  else                            jmp(label)                /*omit semi*/
+
+  // If the pointers are equal, we are done (e.g., String[] elements).
+  // This self-check enables sharing of secondary supertype arrays among
+  // non-primary types such as array-of-interface.  Otherwise, each such
+  // type would need its own customized SSA.
+  // We move this check to the front of the fast path because many
+  // type checks are in fact trivially successful in this manner,
+  // so we get a nicely predicted branch right at the start of the check.
+  cmpptr(sub_klass, super_klass);
+  local_jcc(Assembler::equal, *L_success);
+
+  // Check the supertype display:
+  if (must_load_sco) {
+    // Positive movl does right thing on LP64.
+    movl(temp_reg, super_check_offset_addr);
+    super_check_offset = RegisterConstant(temp_reg);
+  }
+  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
+  cmpptr(super_klass, super_check_addr); // load displayed supertype
+
+  // This check has worked decisively for primary supers.
+  // Secondary supers are sought in the super_cache ('super_cache_addr').
+  // (Secondary supers are interfaces and very deeply nested subtypes.)
+  // This works in the same check above because of a tricky aliasing
+  // between the super_cache and the primary super display elements.
+  // (The 'super_check_addr' can address either, as the case requires.)
+  // Note that the cache is updated below if it does not help us find
+  // what we need immediately.
+  // So if it was a primary super, we can just fail immediately.
+  // Otherwise, it's the slow path for us (no success at this point).
+
+  if (super_check_offset.is_register()) {
+    local_jcc(Assembler::equal, *L_success);
+    cmpl(super_check_offset.as_register(), sc_offset);
+    if (L_failure == &L_fallthrough) {
+      local_jcc(Assembler::equal, *L_slow_path);
+    } else {
+      local_jcc(Assembler::notEqual, *L_failure);
+      final_jmp(*L_slow_path);
+    }
+  } else if (super_check_offset.as_constant() == sc_offset) {
+    // Need a slow path; fast failure is impossible.
+    if (L_slow_path == &L_fallthrough) {
+      local_jcc(Assembler::equal, *L_success);
+    } else {
+      local_jcc(Assembler::notEqual, *L_slow_path);
+      final_jmp(*L_success);
+    }
+  } else {
+    // No slow path; it's a fast decision.
+    if (L_failure == &L_fallthrough) {
+      local_jcc(Assembler::equal, *L_success);
+    } else {
+      local_jcc(Assembler::notEqual, *L_failure);
+      final_jmp(*L_success);
+    }
+  }
+
+  bind(L_fallthrough);
+
+#undef local_jcc
+#undef final_jmp
+}
+
+
+void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
+                                                   Register super_klass,
+                                                   Register temp_reg,
+                                                   Register temp2_reg,
+                                                   Label* L_success,
+                                                   Label* L_failure,
+                                                   bool set_cond_codes) {
+  assert_different_registers(sub_klass, super_klass, temp_reg);
+  if (temp2_reg != noreg)
+    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
+#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
+
+  Label L_fallthrough;
+  int label_nulls = 0;
+  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
+  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
+  assert(label_nulls <= 1, "at most one NULL in the batch");
+
+  // a couple of useful fields in sub_klass:
+  int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
+                   Klass::secondary_supers_offset_in_bytes());
+  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
+                   Klass::secondary_super_cache_offset_in_bytes());
+  Address secondary_supers_addr(sub_klass, ss_offset);
+  Address super_cache_addr(     sub_klass, sc_offset);
+
+  // Do a linear scan of the secondary super-klass chain.
+  // This code is rarely used, so simplicity is a virtue here.
+  // The repne_scan instruction uses fixed registers, which we must spill.
+  // Don't worry too much about pre-existing connections with the input regs.
+
+  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
+  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
+
+  // Get super_klass value into rax (even if it was in rdi or rcx).
+  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
+  if (super_klass != rax || UseCompressedOops) {
+    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
+    mov(rax, super_klass);
+  }
+  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
+  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
+
+#ifndef PRODUCT
+  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
+  ExternalAddress pst_counter_addr((address) pst_counter);
+  NOT_LP64(  incrementl(pst_counter_addr) );
+  LP64_ONLY( lea(rcx, pst_counter_addr) );
+  LP64_ONLY( incrementl(Address(rcx, 0)) );
+#endif //PRODUCT
+
+  // We will consult the secondary-super array.
+  movptr(rdi, secondary_supers_addr);
+  // Load the array length.  (Positive movl does right thing on LP64.)
+  movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
+  // Skip to start of data.
+  addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
+
+  // Scan RCX words at [RDI] for an occurrence of RAX.
+  // Set NZ/Z based on last compare.
+#ifdef _LP64
+  // This part is tricky, as values in supers array could be 32 or 64 bit wide
+  // and we store values in objArrays always encoded, thus we need to encode
+  // the value of rax before repne.  Note that rax is dead after the repne.
+  if (UseCompressedOops) {
+    encode_heap_oop_not_null(rax);
+    // The superclass is never null; it would be a basic system error if a null
+    // pointer were to sneak in here.  Note that we have already loaded the
+    // Klass::super_check_offset from the super_klass in the fast path,
+    // so if there is a null in that register, we are already in the afterlife.
+    repne_scanl();
+  } else
+#endif // _LP64
+    repne_scan();
+
+  // Unspill the temp. registers:
+  if (pushed_rdi)  pop(rdi);
+  if (pushed_rcx)  pop(rcx);
+  if (pushed_rax)  pop(rax);
+
+  if (set_cond_codes) {
+    // Special hack for the AD files:  rdi is guaranteed non-zero.
+    assert(!pushed_rdi, "rdi must be left non-NULL");
+    // Also, the condition codes are properly set Z/NZ on succeed/failure.
+  }
+
+  if (L_failure == &L_fallthrough)
+        jccb(Assembler::notEqual, *L_failure);
+  else  jcc(Assembler::notEqual, *L_failure);
+
+  // Success.  Cache the super we found and proceed in triumph.
+  movptr(super_cache_addr, super_klass);
+
+  if (L_success != &L_fallthrough) {
+    jmp(*L_success);
+  }
+
+#undef IS_A_TEMP
+
+  bind(L_fallthrough);
+}
+
+
 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
   ucomisd(dst, as_Address(src));
 }
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Fri Mar 13 18:39:22 2009 -0700
@@ -1807,6 +1807,40 @@
                                Register scan_temp,
                                Label& no_such_interface);
 
+  // Test sub_klass against super_klass, with fast and slow paths.
+
+  // The fast path produces a tri-state answer: yes / no / maybe-slow.
+  // One of the three labels can be NULL, meaning take the fall-through.
+  // If super_check_offset is -1, the value is loaded up from super_klass.
+  // No registers are killed, except temp_reg.
+  void check_klass_subtype_fast_path(Register sub_klass,
+                                     Register super_klass,
+                                     Register temp_reg,
+                                     Label* L_success,
+                                     Label* L_failure,
+                                     Label* L_slow_path,
+                RegisterConstant super_check_offset = RegisterConstant(-1));
+
+  // The rest of the type check; must be wired to a corresponding fast path.
+  // It does not repeat the fast path logic, so don't use it standalone.
+  // The temp_reg and temp2_reg can be noreg, if no temps are available.
+  // Updates the sub's secondary super cache as necessary.
+  // If set_cond_codes, condition codes will be Z on success, NZ on failure.
+  void check_klass_subtype_slow_path(Register sub_klass,
+                                     Register super_klass,
+                                     Register temp_reg,
+                                     Register temp2_reg,
+                                     Label* L_success,
+                                     Label* L_failure,
+                                     bool set_cond_codes = false);
+
+  // Simplified, combined version, good for typical uses.
+  // Falls through on failure.
+  void check_klass_subtype(Register sub_klass,
+                           Register super_klass,
+                           Register temp_reg,
+                           Label& L_success);
+
   //----
   void set_word_if_not_zero(Register reg); // sets reg to 1 if not zero, otherwise 0
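  The x86 flavor declared above differs from SPARC in two ways worth noting while reading the diffs that follow: the fast path needs only one temp, and the slow path can optionally leave Z/NZ condition codes behind (set_cond_codes) for the .ad instruct patterns that consume flags rather than labels. A minimal sketch of the label-based use, mirroring the combined check_klass_subtype added in assembler_x86.cpp above; register names are placeholders:

    Label L_ok, L_fail;
    check_klass_subtype_fast_path(Rsub, Rsuper, Rtmp, &L_ok, &L_fail, NULL);
    check_klass_subtype_slow_path(Rsub, Rsuper, Rtmp, noreg, &L_ok, NULL);
    bind(L_fail);   // not a subtype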
 
--- a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -1598,18 +1598,9 @@
 
     // get instance klass
     __ movptr(k_RInfo, Address(k_RInfo, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
-    // get super_check_offset
-    __ movl(Rtmp1, Address(k_RInfo, sizeof(oopDesc) + Klass::super_check_offset_offset_in_bytes()));
-    // See if we get an immediate positive hit
-    __ cmpptr(k_RInfo, Address(klass_RInfo, Rtmp1, Address::times_1));
-    __ jcc(Assembler::equal, done);
-    // check for immediate negative hit
-    __ cmpl(Rtmp1, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes());
-    __ jcc(Assembler::notEqual, *stub->entry());
-    // check for self
-    __ cmpptr(klass_RInfo, k_RInfo);
-    __ jcc(Assembler::equal, done);
-
+    // perform the fast part of the checking logic
+    __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, &done, stub->entry(), NULL);
+    // call out-of-line instance of __ check_klass_subtype_slow_path(...):
     __ push(klass_RInfo);
     __ push(k_RInfo);
     __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id)));
@@ -1735,17 +1726,9 @@
         }
         __ bind(done);
       } else {
-        __ movl(Rtmp1, Address(k_RInfo, sizeof(oopDesc) + Klass::super_check_offset_offset_in_bytes()));
-        // See if we get an immediate positive hit
-        __ cmpptr(k_RInfo, Address(klass_RInfo, Rtmp1, Address::times_1));
-        __ jcc(Assembler::equal, done);
-        // check for immediate negative hit
-        __ cmpl(Rtmp1, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes());
-        __ jcc(Assembler::notEqual, *stub->entry());
-        // check for self
-        __ cmpptr(klass_RInfo, k_RInfo);
-        __ jcc(Assembler::equal, done);
-
+        // perform the fast part of the checking logic
+        __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, &done, stub->entry(), NULL);
+        // call out-of-line instance of __ check_klass_subtype_slow_path(...):
         __ push(klass_RInfo);
         __ push(k_RInfo);
         __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id)));
@@ -1821,23 +1804,15 @@
           __ pop(dst);
           __ jmp(done);
         }
-      } else {
-#else
-      { // YUCK
+      }
+        else // next block is unconditional if LP64:
 #endif // LP64
+      {
         assert(dst != klass_RInfo && dst != k_RInfo, "need 3 registers");
 
-        __ movl(dst, Address(k_RInfo, sizeof(oopDesc) + Klass::super_check_offset_offset_in_bytes()));
-        // See if we get an immediate positive hit
-        __ cmpptr(k_RInfo, Address(klass_RInfo, dst, Address::times_1));
-        __ jcc(Assembler::equal, one);
-        // check for immediate negative hit
-        __ cmpl(dst, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes());
-        __ jcc(Assembler::notEqual, zero);
-        // check for self
-        __ cmpptr(klass_RInfo, k_RInfo);
-        __ jcc(Assembler::equal, one);
-
+        // perform the fast part of the checking logic
+        __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, dst, &one, &zero, NULL);
+        // call out-of-line instance of __ check_klass_subtype_slow_path(...):
         __ push(klass_RInfo);
         __ push(k_RInfo);
         __ call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id)));
--- a/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -1354,6 +1354,13 @@
 
     case slow_subtype_check_id:
       {
+        // Typical calling sequence:
+        // __ push(klass_RInfo);  // object klass or other subclass
+        // __ push(sup_k_RInfo);  // array element klass or other superclass
+        // __ call(slow_subtype_check);
+        // Note that the subclass is pushed first, and is therefore deepest.
+        // Previous versions of this code reversed the names 'sub' and 'super'.
+        // This was operationally harmless but made the code unreadable.
         enum layout {
           rax_off, SLOT2(raxH_off)
           rcx_off, SLOT2(rcxH_off)
@@ -1361,9 +1368,10 @@
           rdi_off, SLOT2(rdiH_off)
           // saved_rbp_off, SLOT2(saved_rbpH_off)
           return_off, SLOT2(returnH_off)
-          sub_off, SLOT2(subH_off)
-          super_off, SLOT2(superH_off)
-          framesize
+          sup_k_off, SLOT2(sup_kH_off)
+          klass_off, SLOT2(superH_off)
+          framesize,
+          result_off = klass_off  // deepest argument is also the return value
         };
 
         __ set_info("slow_subtype_check", dont_gc_arguments);
@@ -1373,19 +1381,14 @@
         __ push(rax);
 
         // This is called by pushing args and not with C abi
-        __ movptr(rsi, Address(rsp, (super_off) * VMRegImpl::stack_slot_size)); // super
-        __ movptr(rax, Address(rsp, (sub_off  ) * VMRegImpl::stack_slot_size)); // sub
-
-        __ movptr(rdi,Address(rsi,sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes()));
-        // since size is postive movl does right thing on 64bit
-        __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
-        __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
+        __ movptr(rsi, Address(rsp, (klass_off) * VMRegImpl::stack_slot_size)); // subclass
+        __ movptr(rax, Address(rsp, (sup_k_off) * VMRegImpl::stack_slot_size)); // superclass
 
         Label miss;
-        __ repne_scan();
-        __ jcc(Assembler::notEqual, miss);
-        __ movptr(Address(rsi,sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes()), rax);
-        __ movptr(Address(rsp, (super_off) * VMRegImpl::stack_slot_size), 1); // result
+        __ check_klass_subtype_slow_path(rsi, rax, rcx, rdi, NULL, &miss);
+
+        // fallthrough on success:
+        __ movptr(Address(rsp, (result_off) * VMRegImpl::stack_slot_size), 1); // result
         __ pop(rax);
         __ pop(rcx);
         __ pop(rsi);
@@ -1393,7 +1396,7 @@
         __ ret(0);
 
         __ bind(miss);
-        __ movptr(Address(rsp, (super_off) * VMRegImpl::stack_slot_size), NULL_WORD); // result
+        __ movptr(Address(rsp, (result_off) * VMRegImpl::stack_slot_size), NULL_WORD); // result
         __ pop(rax);
         __ pop(rcx);
         __ pop(rsi);
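To make the slot arithmetic easier to follow: the caller pushes the subclass first and the superclass second, so after the stub saves its own registers the frame reads, from the deepest slot down to the final rsp (a sketch; the SLOT2 entries are assumed to supply the extra half-slot each entry needs on 64-bit):

    //   klass_off   subclass  (pushed first, deepest)  -- doubles as result_off
    //   sup_k_off   superclass
    //   return_off  return address of the caller's call
    //   rdi_off, rsi_off, rcx_off, rax_off   registers saved by the stub
    //   <-- rsp
    // On a hit the stub stores 1 into result_off, on a miss NULL_WORD, then
    // restores the saved registers and returns; the caller can pick up the
    // answer from the slot that originally held the subclass.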
--- a/hotspot/src/cpu/x86/vm/interp_masm_x86_32.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/interp_masm_x86_32.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -219,47 +219,16 @@
   // Resets EDI to locals.  Register sub_klass cannot be any of the above.
 void InterpreterMacroAssembler::gen_subtype_check( Register Rsub_klass, Label &ok_is_subtype ) {
   assert( Rsub_klass != rax, "rax, holds superklass" );
-  assert( Rsub_klass != rcx, "rcx holds 2ndary super array length" );
-  assert( Rsub_klass != rdi, "rdi holds 2ndary super array scan ptr" );
-  Label not_subtype, loop;
+  assert( Rsub_klass != rcx, "used as a temp" );
+  assert( Rsub_klass != rdi, "used as a temp, restored from locals" );
 
   // Profile the not-null value's klass.
-  profile_typecheck(rcx, Rsub_klass, rdi); // blows rcx, rdi
-
-  // Load the super-klass's check offset into ECX
-  movl( rcx, Address(rax, sizeof(oopDesc) + Klass::super_check_offset_offset_in_bytes() ) );
-  // Load from the sub-klass's super-class display list, or a 1-word cache of
-  // the secondary superclass list, or a failing value with a sentinel offset
-  // if the super-klass is an interface or exceptionally deep in the Java
-  // hierarchy and we have to scan the secondary superclass list the hard way.
-  // See if we get an immediate positive hit
-  cmpptr( rax, Address(Rsub_klass,rcx,Address::times_1) );
-  jcc( Assembler::equal,ok_is_subtype );
+  profile_typecheck(rcx, Rsub_klass, rdi); // blows rcx, reloads rdi
 
-  // Check for immediate negative hit
-  cmpl( rcx, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() );
-  jcc( Assembler::notEqual, not_subtype );
-  // Check for self
-  cmpptr( Rsub_klass, rax );
-  jcc( Assembler::equal, ok_is_subtype );
+  // Do the check.
+  check_klass_subtype(Rsub_klass, rax, rcx, ok_is_subtype); // blows rcx
 
-  // Now do a linear scan of the secondary super-klass chain.
-  movptr( rdi, Address(Rsub_klass, sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes()) );
-  // EDI holds the objArrayOop of secondary supers.
-  movl( rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));// Load the array length
-  // Skip to start of data; also clear Z flag incase ECX is zero
-  addptr( rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT) );
-  // Scan ECX words at [EDI] for occurance of EAX
-  // Set NZ/Z based on last compare
-  repne_scan();
-  restore_locals();           // Restore EDI; Must not blow flags
-  // Not equal?
-  jcc( Assembler::notEqual, not_subtype );
-  // Must be equal but missed in cache.  Update cache.
-  movptr( Address(Rsub_klass, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes()), rax );
-  jmp( ok_is_subtype );
-
-  bind(not_subtype);
+  // Profile the failure of the check.
   profile_typecheck_failed(rcx); // blows rcx
 }
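The contract of the combined helper, as used here and in the 64-bit version below, is that it branches to the supplied label when the subtype test succeeds and falls through on failure, which is why no explicit "not a subtype" label is needed any more. A sketch with illustrative names (the call above uses rax as the superclass and rcx as the temp):

    Label L_ok;
    __ check_klass_subtype(Rsub_klass, Rsuper_klass, Rtemp, L_ok);
    // reached only when Rsub_klass is not a subtype of Rsuper_klass
    // (failure handling such as profile_typecheck_failed goes here)
    __ bind(L_ok);    // taken on success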
 
--- a/hotspot/src/cpu/x86/vm/interp_masm_x86_64.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/interp_masm_x86_64.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -232,65 +232,13 @@
   assert(Rsub_klass != rcx, "rcx holds 2ndary super array length");
   assert(Rsub_klass != rdi, "rdi holds 2ndary super array scan ptr");
 
-  Label not_subtype, not_subtype_pop, loop;
-
   // Profile the not-null value's klass.
-  profile_typecheck(rcx, Rsub_klass, rdi); // blows rcx, rdi
-
-  // Load the super-klass's check offset into rcx
-  movl(rcx, Address(rax, sizeof(oopDesc) +
-                    Klass::super_check_offset_offset_in_bytes()));
-  // Load from the sub-klass's super-class display list, or a 1-word
-  // cache of the secondary superclass list, or a failing value with a
-  // sentinel offset if the super-klass is an interface or
-  // exceptionally deep in the Java hierarchy and we have to scan the
-  // secondary superclass list the hard way.  See if we get an
-  // immediate positive hit
-  cmpptr(rax, Address(Rsub_klass, rcx, Address::times_1));
-  jcc(Assembler::equal,ok_is_subtype);
-
-  // Check for immediate negative hit
-  cmpl(rcx, sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes());
-  jcc( Assembler::notEqual, not_subtype );
-  // Check for self
-  cmpptr(Rsub_klass, rax);
-  jcc(Assembler::equal, ok_is_subtype);
+  profile_typecheck(rcx, Rsub_klass, rdi); // blows rcx, reloads rdi
 
-  // Now do a linear scan of the secondary super-klass chain.
-  movptr(rdi, Address(Rsub_klass, sizeof(oopDesc) +
-                      Klass::secondary_supers_offset_in_bytes()));
-  // rdi holds the objArrayOop of secondary supers.
-  // Load the array length
-  movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
-  // Skip to start of data; also clear Z flag incase rcx is zero
-  addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
-  // Scan rcx words at [rdi] for occurance of rax
-  // Set NZ/Z based on last compare
+  // Do the check.
+  check_klass_subtype(Rsub_klass, rax, rcx, ok_is_subtype); // blows rcx
 
-  // this part is kind tricky, as values in supers array could be 32 or 64 bit wide
-  // and we store values in objArrays always encoded, thus we need to encode value
-  // before repne
-  if (UseCompressedOops) {
-    push(rax);
-    encode_heap_oop(rax);
-    repne_scanl();
-    // Not equal?
-    jcc(Assembler::notEqual, not_subtype_pop);
-    // restore heap oop here for movq
-    pop(rax);
-  } else {
-    repne_scan();
-    jcc(Assembler::notEqual, not_subtype);
-  }
-  // Must be equal but missed in cache.  Update cache.
-  movptr(Address(Rsub_klass, sizeof(oopDesc) +
-               Klass::secondary_super_cache_offset_in_bytes()), rax);
-  jmp(ok_is_subtype);
-
-  bind(not_subtype_pop);
-  // restore heap oop here for miss
-  if (UseCompressedOops) pop(rax);
-  bind(not_subtype);
+  // Profile the failure of the check.
   profile_typecheck_failed(rcx); // blows rcx
 }
 
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -1310,81 +1310,51 @@
                            Address& super_check_offset_addr,
                            Address& super_klass_addr,
                            Register temp,
-                           Label* L_success_ptr, Label* L_failure_ptr) {
+                           Label* L_success, Label* L_failure) {
     BLOCK_COMMENT("type_check:");
 
     Label L_fallthrough;
-    bool fall_through_on_success = (L_success_ptr == NULL);
-    if (fall_through_on_success) {
-      L_success_ptr = &L_fallthrough;
-    } else {
-      L_failure_ptr = &L_fallthrough;
-    }
-    Label& L_success = *L_success_ptr;
-    Label& L_failure = *L_failure_ptr;
+#define LOCAL_JCC(assembler_con, label_ptr)                             \
+    if (label_ptr != NULL)  __ jcc(assembler_con, *(label_ptr));        \
+    else                    __ jcc(assembler_con, L_fallthrough) /*omit semi*/
 
+    // The following is a strange variation of the fast path which requires
+    // one less register, because needed values are on the argument stack.
+    // __ check_klass_subtype_fast_path(sub_klass, *super_klass*, temp,
+    //                                  L_success, L_failure, NULL);
     assert_different_registers(sub_klass, temp);
 
-    // a couple of useful fields in sub_klass:
-    int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
-                     Klass::secondary_supers_offset_in_bytes());
     int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
                      Klass::secondary_super_cache_offset_in_bytes());
-    Address secondary_supers_addr(sub_klass, ss_offset);
-    Address super_cache_addr(     sub_klass, sc_offset);
 
     // if the pointers are equal, we are done (e.g., String[] elements)
     __ cmpptr(sub_klass, super_klass_addr);
-    __ jcc(Assembler::equal, L_success);
+    LOCAL_JCC(Assembler::equal, L_success);
 
     // check the supertype display:
     __ movl2ptr(temp, super_check_offset_addr);
     Address super_check_addr(sub_klass, temp, Address::times_1, 0);
     __ movptr(temp, super_check_addr); // load displayed supertype
     __ cmpptr(temp, super_klass_addr); // test the super type
-    __ jcc(Assembler::equal, L_success);
+    LOCAL_JCC(Assembler::equal, L_success);
 
     // if it was a primary super, we can just fail immediately
     __ cmpl(super_check_offset_addr, sc_offset);
-    __ jcc(Assembler::notEqual, L_failure);
-
-    // Now do a linear scan of the secondary super-klass chain.
-    // This code is rarely used, so simplicity is a virtue here.
-    inc_counter_np(SharedRuntime::_partial_subtype_ctr);
-    {
-      // The repne_scan instruction uses fixed registers, which we must spill.
-      // (We need a couple more temps in any case.)
-      __ push(rax);
-      __ push(rcx);
-      __ push(rdi);
-      assert_different_registers(sub_klass, rax, rcx, rdi);
+    LOCAL_JCC(Assembler::notEqual, L_failure);
 
-      __ movptr(rdi, secondary_supers_addr);
-      // Load the array length.
-      __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
-      // Skip to start of data.
-      __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
-      // Scan rcx words at [edi] for occurance of rax,
-      // Set NZ/Z based on last compare
-      __ movptr(rax, super_klass_addr);
-      __ repne_scan();
+    // The repne_scan instruction uses fixed registers, which will get spilled.
+    // We happen to know this works best when super_klass is in rax.
+    Register super_klass = temp;
+    __ movptr(super_klass, super_klass_addr);
+    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg,
+                                     L_success, L_failure);
 
-      // Unspill the temp. registers:
-      __ pop(rdi);
-      __ pop(rcx);
-      __ pop(rax);
-    }
-    __ jcc(Assembler::notEqual, L_failure);
+    __ bind(L_fallthrough);
 
-    // Success.  Cache the super we found and proceed in triumph.
-    __ movptr(temp, super_klass_addr); // note: rax, is dead
-    __ movptr(super_cache_addr, temp);
+    if (L_success == NULL) { BLOCK_COMMENT("L_success:"); }
+    if (L_failure == NULL) { BLOCK_COMMENT("L_failure:"); }
 
-    if (!fall_through_on_success)
-      __ jmp(L_success);
-
-    // Fall through on failure!
-    __ bind(L_fallthrough);
+#undef LOCAL_JCC
   }
 
   //
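The LOCAL_JCC macro encodes the same null-label convention as the new MacroAssembler helpers. For example, when this routine is entered with L_failure == NULL, the line

    LOCAL_JCC(Assembler::notEqual, L_failure);

ends up emitting

    __ jcc(Assembler::notEqual, L_fallthrough);

so a missing label simply means "fall through", and the trailing BLOCK_COMMENT calls record which outcome the fall-through point represents.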
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -2091,66 +2091,9 @@
 
     Label L_miss;
 
-    // a couple of useful fields in sub_klass:
-    int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
-                     Klass::secondary_supers_offset_in_bytes());
-    int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                     Klass::secondary_super_cache_offset_in_bytes());
-    Address secondary_supers_addr(sub_klass, ss_offset);
-    Address super_cache_addr(     sub_klass, sc_offset);
-
-    // if the pointers are equal, we are done (e.g., String[] elements)
-    __ cmpptr(super_klass, sub_klass);
-    __ jcc(Assembler::equal, L_success);
-
-    // check the supertype display:
-    Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
-    __ cmpptr(super_klass, super_check_addr); // test the super type
-    __ jcc(Assembler::equal, L_success);
-
-    // if it was a primary super, we can just fail immediately
-    __ cmpl(super_check_offset, sc_offset);
-    __ jcc(Assembler::notEqual, L_miss);
-
-    // Now do a linear scan of the secondary super-klass chain.
-    // The repne_scan instruction uses fixed registers, which we must spill.
-    // (We need a couple more temps in any case.)
-    // This code is rarely used, so simplicity is a virtue here.
-    inc_counter_np(SharedRuntime::_partial_subtype_ctr);
-    {
-      __ push(rax);
-      __ push(rcx);
-      __ push(rdi);
-      assert_different_registers(sub_klass, super_klass, rax, rcx, rdi);
-
-      __ movptr(rdi, secondary_supers_addr);
-      // Load the array length.
-      __ movl(rcx, Address(rdi, arrayOopDesc::length_offset_in_bytes()));
-      // Skip to start of data.
-      __ addptr(rdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
-      // Scan rcx words at [rdi] for occurance of rax
-      // Set NZ/Z based on last compare
-      __ movptr(rax, super_klass);
-      if (UseCompressedOops) {
-        // Compare against compressed form.  Don't need to uncompress because
-        // looks like orig rax is restored in popq below.
-        __ encode_heap_oop(rax);
-        __ repne_scanl();
-      } else {
-        __ repne_scan();
-      }
-
-      // Unspill the temp. registers:
-      __ pop(rdi);
-      __ pop(rcx);
-      __ pop(rax);
-
-      __ jcc(Assembler::notEqual, L_miss);
-    }
-
-    // Success.  Cache the super we found and proceed in triumph.
-    __ movptr(super_cache_addr, super_klass); // note: rax is dead
-    __ jmp(L_success);
+    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
+                                     super_check_offset);
+    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
 
     // Fall through on failure!
     __ BIND(L_miss);
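Reading the two calls together: this caller already holds the super check offset in a register, so it passes it to the fast path (presumably saving the reload from the klass), and both helpers use the null-label convention to fall through into the code that handles the remaining case. A sketch of the resulting control flow, using the labels already in scope:

    //   fast path:  hit                        -> L_success
    //               definite miss              -> L_miss
    //               possibly a secondary super -> fall through (NULL slow-path label)
    //   slow path:  hit                        -> L_success
    //               miss                       -> fall through (NULL failure label) to L_miss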
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Fri Mar 13 18:39:22 2009 -0700
@@ -1692,26 +1692,15 @@
     Register Reax = as_Register(EAX_enc); // super class
     Register Recx = as_Register(ECX_enc); // killed
     Register Resi = as_Register(ESI_enc); // sub class
-    Label hit, miss;
+    Label miss;
 
     MacroAssembler _masm(&cbuf);
-    // Compare super with sub directly, since super is not in its own SSA.
-    // The compiler used to emit this test, but we fold it in here,
-    // to allow platform-specific tweaking on sparc.
-    __ cmpptr(Reax, Resi);
-    __ jcc(Assembler::equal, hit);
-#ifndef PRODUCT
-    __ incrementl(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
-#endif //PRODUCT
-    __ movptr(Redi,Address(Resi,sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes()));
-    __ movl(Recx,Address(Redi,arrayOopDesc::length_offset_in_bytes()));
-    __ addptr(Redi,arrayOopDesc::base_offset_in_bytes(T_OBJECT));
-    __ repne_scan();
-    __ jcc(Assembler::notEqual, miss);
-    __ movptr(Address(Resi,sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes()),Reax);
-    __ bind(hit);
-    if( $primary )
-      __ xorptr(Redi,Redi);
+    __ check_klass_subtype_slow_path(Resi, Reax, Recx, Redi,
+                                     NULL, &miss,
+                                     /*set_cond_codes:*/ true);
+    if ($primary) {
+      __ xorptr(Redi, Redi);
+    }
     __ bind(miss);
   %}
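A note on the last two arguments of that call: the NULL success label makes a hit fall through to the $primary-guarded XOR, and set_cond_codes asks the slow path to leave the NZ/Z outcome for the rule's consumers, matching the JNE-to-miss pattern in the formats below. Roughly (illustrative; the exact flag discipline lives in the shared helper, not shown here):

    __ check_klass_subtype_slow_path(Resi, Reax, Recx, Redi,
                                     /*L_success=*/ NULL, &miss,
                                     /*set_cond_codes:*/ true);
    // hit:  falls through; the $primary variant also zeroes Redi to produce
    //       a register result
    // miss: jumps to the miss label; a flags-only consumer can branch with
    //       jne miss    (Z clear => not a subtype)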
 
@@ -12566,15 +12555,12 @@
   effect( KILL rcx, KILL cr );
 
   ins_cost(1100);  // slightly larger than the next version
-  format %{ "CMPL   EAX,ESI\n\t"
-            "JEQ,s  hit\n\t"
-            "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
+  format %{ "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
             "MOV    ECX,[EDI+arrayKlass::length]\t# length to scan\n\t"
             "ADD    EDI,arrayKlass::base_offset\t# Skip to start of data; set NZ in case count is zero\n\t"
             "REPNE SCASD\t# Scan *EDI++ for a match with EAX while CX-- != 0\n\t"
             "JNE,s  miss\t\t# Missed: EDI not-zero\n\t"
             "MOV    [$sub+Klass::secondary_super_cache],$super\t# Hit: update cache\n\t"
-     "hit:\n\t"
             "XOR    $result,$result\t\t Hit: EDI zero\n\t"
      "miss:\t" %}
 
@@ -12588,9 +12574,7 @@
   effect( KILL rcx, KILL result );
 
   ins_cost(1000);
-  format %{ "CMPL   EAX,ESI\n\t"
-            "JEQ,s  miss\t# Actually a hit; we are done.\n\t"
-            "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
+  format %{ "MOV    EDI,[$sub+Klass::secondary_supers]\n\t"
             "MOV    ECX,[EDI+arrayKlass::length]\t# length to scan\n\t"
             "ADD    EDI,arrayKlass::base_offset\t# Skip to start of data; set NZ in case count is zero\n\t"
             "REPNE SCASD\t# Scan *EDI++ for a match with EAX while CX-- != 0\n\t"
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Fri Mar 13 18:39:22 2009 -0700
@@ -2575,43 +2575,13 @@
     Register Rrax = as_Register(RAX_enc); // super class
     Register Rrcx = as_Register(RCX_enc); // killed
     Register Rrsi = as_Register(RSI_enc); // sub class
-    Label hit, miss, cmiss;
+    Label miss;
+    const bool set_cond_codes = true;
 
     MacroAssembler _masm(&cbuf);
-    // Compare super with sub directly, since super is not in its own SSA.
-    // The compiler used to emit this test, but we fold it in here,
-    // to allow platform-specific tweaking on sparc.
-    __ cmpptr(Rrax, Rrsi);
-    __ jcc(Assembler::equal, hit);
-#ifndef PRODUCT
-    __ lea(Rrcx, ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
-    __ incrementl(Address(Rrcx, 0));
-#endif //PRODUCT
-    __ movptr(Rrdi, Address(Rrsi, 
-                          sizeof(oopDesc) + 
-                          Klass::secondary_supers_offset_in_bytes()));
-    __ movl(Rrcx, Address(Rrdi, arrayOopDesc::length_offset_in_bytes()));
-    __ addptr(Rrdi, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
-    if (UseCompressedOops) {
-      __ push(Rrax);
-      __ encode_heap_oop(Rrax);
-      __ repne_scanl();
-      __ pop(Rrax);
-      __ jccb(Assembler::notEqual, miss);
-      __ movptr(Address(Rrsi,
-                      sizeof(oopDesc) +
-                      Klass::secondary_super_cache_offset_in_bytes()),
-              Rrax);
-      __ jmp(hit);
-    } else {
-      __ repne_scan();
-      __ jccb(Assembler::notEqual, miss);
-      __ movptr(Address(Rrsi,
-                      sizeof(oopDesc) +
-                      Klass::secondary_super_cache_offset_in_bytes()),
-              Rrax);
-    }
-    __ bind(hit);
+    __ check_klass_subtype_slow_path(Rrsi, Rrax, Rrcx, Rrdi,
+                                     NULL, &miss,
+                                     /*set_cond_codes:*/ true);
     if ($primary) {
       __ xorptr(Rrdi, Rrdi);
     }
@@ -12179,15 +12149,12 @@
   effect(KILL rcx, KILL cr);
 
   ins_cost(1100);  // slightly larger than the next version
-  format %{ "cmpq    rax, rsi\n\t"
-            "jeq,s   hit\n\t"
-            "movq    rdi, [$sub + (sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes())]\n\t"
+  format %{ "movq    rdi, [$sub + (sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes())]\n\t"
             "movl    rcx, [rdi + arrayOopDesc::length_offset_in_bytes()]\t# length to scan\n\t"
             "addq    rdi, arrayOopDex::base_offset_in_bytes(T_OBJECT)\t# Skip to start of data; set NZ in case count is zero\n\t"
             "repne   scasq\t# Scan *rdi++ for a match with rax while rcx--\n\t"
             "jne,s   miss\t\t# Missed: rdi not-zero\n\t"
             "movq    [$sub + (sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes())], $super\t# Hit: update cache\n\t"
-    "hit:\n\t"
             "xorq    $result, $result\t\t Hit: rdi zero\n\t"
     "miss:\t" %}
 
@@ -12205,9 +12172,7 @@
   effect(KILL rcx, KILL result);
 
   ins_cost(1000);
-  format %{ "cmpq    rax, rsi\n\t"
-            "jeq,s   miss\t# Actually a hit; we are done.\n\t"
-            "movq    rdi, [$sub + (sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes())]\n\t"
+  format %{ "movq    rdi, [$sub + (sizeof(oopDesc) + Klass::secondary_supers_offset_in_bytes())]\n\t"
             "movl    rcx, [rdi + arrayOopDesc::length_offset_in_bytes()]\t# length to scan\n\t"
             "addq    rdi, arrayOopDex::base_offset_in_bytes(T_OBJECT)\t# Skip to start of data; set NZ in case count is zero\n\t"
             "repne   scasq\t# Scan *rdi++ for a match with rax while cx-- != 0\n\t"
--- a/hotspot/src/share/vm/opto/graphKit.cpp	Fri Mar 13 11:35:17 2009 -0700
+++ b/hotspot/src/share/vm/opto/graphKit.cpp	Fri Mar 13 18:39:22 2009 -0700
@@ -2277,7 +2277,7 @@
   r_not_subtype->init_req(1, _gvn.transform( new (C, 1) IfTrueNode (iff2) ) );
   set_control(                _gvn.transform( new (C, 1) IfFalseNode(iff2) ) );
 
-  // Check for self.  Very rare to get here, but its taken 1/3 the time.
+  // Check for self.  Very rare to get here, but it is taken 1/3 the time.
   // No performance impact (too rare) but allows sharing of secondary arrays
   // which has some footprint reduction.
   Node *cmp3 = _gvn.transform( new (C, 3) CmpPNode( subklass, superklass ) );
@@ -2286,11 +2286,27 @@
   r_ok_subtype->init_req(2, _gvn.transform( new (C, 1) IfTrueNode ( iff3 ) ) );
   set_control(               _gvn.transform( new (C, 1) IfFalseNode( iff3 ) ) );
 
+  // -- Roads not taken here: --
+  // We could also have chosen to perform the self-check at the beginning
+  // of this code sequence, as the assembler does.  This would not pay off
+  // the same way, since the optimizer, unlike the assembler, can perform
+  // static type analysis to fold away many successful self-checks.
+  // Non-foldable self checks work better here in second position, because
+  // the initial primary superclass check subsumes a self-check for most
+  // types.  An exception would be a secondary type like array-of-interface,
+  // which does not appear in its own primary supertype display.
+  // Finally, we could have chosen to move the self-check into the
+  // PartialSubtypeCheckNode, and from there out-of-line in a platform
+  // dependent manner.  But it is worthwhile to have the check here,
+  // where it can perhaps be optimized.  The cost in code space is
+  // small (register compare, branch).
+
   // Now do a linear scan of the secondary super-klass array.  Again, no real
   // performance impact (too rare) but it's gotta be done.
-  // (The stub also contains the self-check of subklass == superklass.
   // Since the code is rarely used, there is no penalty for moving it
-  // out of line, and it can only improve I-cache density.)
+  // out of line, and it can only improve I-cache density.
+  // The decision to inline or out-of-line this final check is platform
+  // dependent, and is found in the AD file definition of PartialSubtypeCheck.
   Node* psc = _gvn.transform(
     new (C, 3) PartialSubtypeCheckNode(control(), subklass, superklass) );