hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
changeset 30244 d4e471395ff5
parent 29325 0e86e64c66e5
child 30310 522ea430079f
comparing 30240:a7ba42fa1df6 to 30244:d4e471395ff5
@@ -1956,36 +1956,23 @@
 //
 // The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
 // Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
 // should not be unlocked by "normal" java-level locking and vice-versa.  The specification
 // doesn't specify what will occur if a program engages in such mixed-mode locking, however.
+// Arguably given that the spec legislates the JNI case as undefined our implementation
+// could reasonably *avoid* checking owner in Fast_Unlock().
+// In the interest of performance we elide m->Owner==Self check in unlock.
+// A perfectly viable alternative is to elide the owner check except when
+// Xcheck:jni is enabled.
 
 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
   assert(boxReg == rax, "");
   assert_different_registers(objReg, boxReg, tmpReg);
 
   if (EmitSync & 4) {
     // Disable - inhibit all inlining.  Force control through the slow-path
     cmpptr (rsp, 0);
-  } else
-  if (EmitSync & 8) {
-    Label DONE_LABEL;
-    if (UseBiasedLocking) {
-       biased_locking_exit(objReg, tmpReg, DONE_LABEL);
-    }
-    // Classic stack-locking code ...
-    // Check whether the displaced header is 0
-    //(=> recursive unlock)
-    movptr(tmpReg, Address(boxReg, 0));
-    testptr(tmpReg, tmpReg);
-    jccb(Assembler::zero, DONE_LABEL);
-    // If not recursive lock, reset the header to displaced header
-    if (os::is_MP()) {
-      lock();
-    }
-    cmpxchgptr(tmpReg, Address(objReg, 0));   // Uses RAX which is box
-    bind(DONE_LABEL);
   } else {
     Label DONE_LABEL, Stacked, CheckSucc;
 
     // Critically, the biased locking test must have precedence over
     // and appear before the (box->dhw == 0) recursive stack-lock test.
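The new comment block above documents the trade-off this change codifies: since the JNI spec leaves mixed-mode locking undefined, Fast_Unlock may skip verifying that the exiting thread actually owns the monitor. A minimal C++ sketch of that choice, using a hypothetical Monitor struct and fast_unlock function (not HotSpot's ObjectMonitor or its API):

#include <atomic>
#include <cstdint>

struct Monitor {                       // illustrative stand-in, not HotSpot's ObjectMonitor
  std::atomic<void*> owner{nullptr};   // owning thread, or null when free
  intptr_t recursions{0};
};

// Hypothetical fast unlock that elides the m->Owner==Self check, as the
// comments above describe; an -Xcheck:jni style build could re-enable it.
inline void fast_unlock(Monitor* m, void* self) {
  (void)self;  // only consulted when ownership checking is enabled
  // assert(m->owner.load(std::memory_order_relaxed) == self);
  if (m->recursions > 0) {
    m->recursions--;                                   // recursive exit
    return;
  }
  m->owner.store(nullptr, std::memory_order_release);  // real release
}

Eliding the check trades a misuse diagnostic for one fewer load and compare on every uncontended unlock, which is the performance argument the comment makes.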
@@ -2058,21 +2045,20 @@
 
     // Note that we could employ various encoding schemes to reduce
     // the number of loads below (currently 4) to just 2 or 3.
     // Refer to the comments in synchronizer.cpp.
     // In practice the chain of fetches doesn't seem to impact performance, however.
+    xorptr(boxReg, boxReg);
     if ((EmitSync & 65536) == 0 && (EmitSync & 256)) {
        // Attempt to reduce branch density - AMD's branch predictor.
-       xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
        jccb  (Assembler::notZero, DONE_LABEL);
        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
        jmpb  (DONE_LABEL);
     } else {
-       xorptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
        jccb  (Assembler::notZero, DONE_LABEL);
        movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
        orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
        jccb  (Assembler::notZero, CheckSucc);
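The EmitSync & 256 path above folds the recursions, EntryList, and cxq tests into one OR chain so a single conditional branch decides the fast path, which is what the "reduce branch density" comment means. A minimal C++ analogue of the trick, with a hypothetical Mon struct standing in for the monitor fields:

#include <cstdint>

struct Mon {              // illustrative stand-in for the monitor fields above
  intptr_t recursions;
  void*    EntryList;
  void*    cxq;
};

// OR the fields together and test once: three loads feed a single branch,
// the same branch-density trick the EmitSync & 256 path emits for AMD parts.
inline bool fast_exit_possible(const Mon* m) {
  uintptr_t any = (uintptr_t)m->recursions
                | (uintptr_t)m->EntryList
                | (uintptr_t)m->cxq;
  return any == 0;        // zero => non-recursive and no waiters: fast path
}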
@@ -2091,14 +2077,12 @@
        Label LSuccess, LGoSlowPath ;
 
        bind  (CheckSucc);
 
        // Optional pre-test ... it's safe to elide this
-       if ((EmitSync & 16) == 0) {
-          cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
-          jccb  (Assembler::zero, LGoSlowPath);
-       }
+       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
+       jccb(Assembler::zero, LGoSlowPath);
 
        // We have a classic Dekker-style idiom:
        //    ST m->_owner = 0 ; MEMBAR; LD m->_succ
        // There are a number of ways to implement the barrier:
        // (1) lock:andl &m->_owner, 0
@@ -2107,11 +2091,12 @@
        //     Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8
        // (2) If supported, an explicit MFENCE is appealing.
        //     In older IA32 processors MFENCE is slower than lock:add or xchg
        //     particularly if the write-buffer is full as might be the case
        //     if stores closely precede the fence or fence-equivalent instruction.
-       //     In more modern implementations MFENCE appears faster, however.
+       //     See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
+       //     as the situation has changed with Nehalem and Shanghai.
        // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack
        //     The $lines underlying the top-of-stack should be in M-state.
        //     The locked add instruction is serializing, of course.
        // (4) Use xchg, which is serializing
        //     mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works
@@ -2124,15 +2109,11 @@
        // We currently use (3), although it's likely that switching to (2)
        // is correct for the future.
 
        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
        if (os::is_MP()) {
-          if (VM_Version::supports_sse2() && 1 == FenceInstruction) {
-            mfence();
-          } else {
-            lock (); addptr(Address(rsp, 0), 0);
-          }
+         lock(); addptr(Address(rsp, 0), 0);
        }
        // Ratify _succ remains non-null
        cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0);
        jccb  (Assembler::notZero, LSuccess);
 
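The Dekker-style idiom discussed above is the heart of the exit path: after storing zero to _owner, the subsequent load of _succ must not be reordered ahead of that store, so a StoreLoad barrier is required between them. A small C++ sketch of the same idiom, assuming std::atomic fields rather than raw monitor words (the field names are illustrative, not HotSpot's layout):

#include <atomic>

std::atomic<void*> _owner{nullptr};   // hypothetical monitor fields, not
std::atomic<void*> _succ{nullptr};    // HotSpot's ObjectMonitor layout

// ST _owner = 0; MEMBAR; LD _succ -- the seq_cst pair supplies the
// StoreLoad barrier that the generated code gets from option (3),
// "lock; addl $0, (%rsp)".  Without it the store could pass the load
// and this thread might miss a successor that is about to park.
inline bool release_then_check_successor() {
  _owner.store(nullptr, std::memory_order_seq_cst);
  return _succ.load(std::memory_order_seq_cst) != nullptr;
}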
@@ -2177,38 +2158,75 @@
     if ((EmitSync & 65536) == 0) {
        bind (CheckSucc);
     }
 #else // _LP64
     // It's inflated
-    movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
-    xorptr(boxReg, r15_thread);
+    if (EmitSync & 1024) {
+      // Emit code to check that _owner == Self
+      // We could fold the _owner test into subsequent code more efficiently
+      // than using a stand-alone check, but since _owner checking is off by
+      // default we don't bother. We also might consider predicating the
+      // _owner==Self check on Xcheck:jni or running on a debug build.
+      movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
+      xorptr(boxReg, r15_thread);
+    } else {
+      xorptr(boxReg, boxReg);
+    }
     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
     jccb  (Assembler::notZero, DONE_LABEL);
     movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
     orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
     jccb  (Assembler::notZero, CheckSucc);
     movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
     jmpb  (DONE_LABEL);
 
     if ((EmitSync & 65536) == 0) {
+      // Try to avoid passing control into the slow_path ...
       Label LSuccess, LGoSlowPath ;
       bind  (CheckSucc);
+
+      // The following optional optimization can be elided if necessary
+      // Effectively: if (succ == null) goto SlowPath
+      // The code reduces the window for a race, however,
+      // and thus benefits performance.
       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
       jccb  (Assembler::zero, LGoSlowPath);
 
-      // I'd much rather use lock:andl m->_owner, 0 as it's faster than the
-      // the explicit ST;MEMBAR combination, but masm doesn't currently support
-      // "ANDQ M,IMM".  Don't use MFENCE here.  lock:add to TOS, xchg, etc
-      // are all faster when the write buffer is populated.
-      movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
-      if (os::is_MP()) {
-         lock (); addl (Address(rsp, 0), 0);
+      if ((EmitSync & 16) && os::is_MP()) {
+        orptr(boxReg, boxReg);
+        xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
+      } else {
+        movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
+        if (os::is_MP()) {
+          // Memory barrier/fence
+          // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
+          // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
+          // This is faster on Nehalem and AMD Shanghai/Barcelona.
+          // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
+          // We might also restructure (ST Owner=0;barrier;LD _Succ) to
+          // (mov box,0; xchgq box, &m->Owner; LD _succ) .
+          lock(); addl(Address(rsp, 0), 0);
+        }
       }
       cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
       jccb  (Assembler::notZero, LSuccess);
 
-      movptr (boxReg, (int32_t)NULL_WORD);                   // box is really EAX
+      // Rare inopportune interleaving - race.
+      // The successor vanished in the small window above.
+      // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
+      // We need to ensure progress and succession.
+      // Try to reacquire the lock.
+      // If that fails then the new owner is responsible for succession and this
+      // thread needs to take no further action and can exit via the fast path (success).
+      // If the re-acquire succeeds then pass control into the slow path.
+      // As implemented, this latter mode is horrible because we generate more
+      // coherence traffic on the lock *and* artificially extend the critical
+      // section length by virtue of passing control into the slow path.
+
+      // box is really RAX -- the following CMPXCHG depends on that binding
+      // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
+      movptr(boxReg, (int32_t)NULL_WORD);
       if (os::is_MP()) { lock(); }
       cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
       jccb  (Assembler::notEqual, LSuccess);
       // Intentional fall-through into slow-path
 
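Putting this hunk together, the fast-exit protocol is: release the lock, fence, ratify that a successor still exists, and if it vanished, CAS the lock back to decide who handles succession. A compressed, hedged C++ sketch of that protocol under stated assumptions (all names hypothetical, and the cxq/EntryList contention test elided for brevity):

#include <atomic>

struct InflatedMonitor {              // hypothetical names, not HotSpot APIs
  std::atomic<void*> _owner{nullptr};
  std::atomic<void*> _succ{nullptr};
};

// Compressed sketch of the exit protocol above: release, fence, ratify
// _succ, and on the rare race try to reacquire via CAS.  A failed CAS
// means another thread owns the lock and inherits responsibility for
// succession; a successful CAS forces this thread into the slow path.
void fast_exit(InflatedMonitor* m, void* self,
               void (*slow_path)(InflatedMonitor*)) {
  m->_owner.store(nullptr, std::memory_order_seq_cst);  // ST _owner = 0; MEMBAR
  if (m->_succ.load(std::memory_order_seq_cst) != nullptr) {
    return;                                   // successor exists: fast exit
  }
  void* expected = nullptr;                   // LD _succ saw null: race window
  if (!m->_owner.compare_exchange_strong(expected, self)) {
    return;                                   // new owner handles succession
  }
  slow_path(m);                               // reacquired: must wake someone
}

The asymmetry is deliberate: losing the CAS is the cheap outcome, since the winner inherits the wakeup duty, while winning it forces the expensive slow path that the comments above call out for extra coherence traffic and an artificially longer critical section.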
@@ -2229,14 +2247,10 @@
     if (EmitSync & 65536) {
        bind (CheckSucc);
     }
 #endif
     bind(DONE_LABEL);
-    // Avoid branch to branch on AMD processors
-    if (EmitSync & 32768) {
-       nop();
-    }
   }
 }
 #endif // COMPILER2
 
 void MacroAssembler::c2bool(Register x) {