8141635: Implement VarHandles/Unsafe intrinsics on POWER
authormdoerr
Fri, 13 May 2016 22:21:54 +0300
changeset 38685 e34308190947
parent 38684 be16c8bb0132
child 38687 ebf49a952c51
8141635: Implement VarHandles/Unsafe intrinsics on POWER Reviewed-by: shade, kvn, simonis
hotspot/src/cpu/ppc/vm/c1_LIRAssembler_ppc.cpp
hotspot/src/cpu/ppc/vm/c1_LIRGenerator_ppc.cpp
hotspot/src/cpu/ppc/vm/macroAssembler_ppc.cpp
hotspot/src/cpu/ppc/vm/macroAssembler_ppc.hpp
hotspot/src/cpu/ppc/vm/ppc.ad
hotspot/src/share/vm/opto/library_call.cpp
--- a/hotspot/src/cpu/ppc/vm/c1_LIRAssembler_ppc.cpp	Fri May 13 06:36:52 2016 +0000
+++ b/hotspot/src/cpu/ppc/vm/c1_LIRAssembler_ppc.cpp	Fri May 13 22:21:54 2016 +0300
@@ -2563,15 +2563,21 @@
 
   if (is_64bit) {
     __ cmpxchgd(BOOL_RESULT, /*current_value=*/R0, cmp_value, new_value, addr,
-                MacroAssembler::MemBarFenceAfter,
+                MacroAssembler::MemBarNone,
                 MacroAssembler::cmpxchgx_hint_atomic_update(),
                 noreg, NULL, /*check without ldarx first*/true);
   } else {
     __ cmpxchgw(BOOL_RESULT, /*current_value=*/R0, cmp_value, new_value, addr,
-                MacroAssembler::MemBarFenceAfter,
+                MacroAssembler::MemBarNone,
                 MacroAssembler::cmpxchgx_hint_atomic_update(),
                 noreg, /*check without ldarx first*/true);
   }
+
+  if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+    __ isync();
+  } else {
+    __ sync();
+  }
 }
 
 
--- a/hotspot/src/cpu/ppc/vm/c1_LIRGenerator_ppc.cpp	Fri May 13 06:36:52 2016 +0000
+++ b/hotspot/src/cpu/ppc/vm/c1_LIRGenerator_ppc.cpp	Fri May 13 22:21:54 2016 +0300
@@ -1353,7 +1353,11 @@
     }
   }
 
-  __ membar();
+  if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+    __ membar_acquire();
+  } else {
+    __ membar();
+  }
 }
 
 
--- a/hotspot/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Fri May 13 06:36:52 2016 +0000
+++ b/hotspot/src/cpu/ppc/vm/macroAssembler_ppc.cpp	Fri May 13 22:21:54 2016 +0300
@@ -1404,7 +1404,7 @@
 void MacroAssembler::cmpxchgw(ConditionRegister flag, Register dest_current_value,
                               Register compare_value, Register exchange_value,
                               Register addr_base, int semantics, bool cmpxchgx_hint,
-                              Register int_flag_success, bool contention_hint) {
+                              Register int_flag_success, bool contention_hint, bool weak) {
   Label retry;
   Label failed;
   Label done;
@@ -1414,6 +1414,7 @@
   bool use_result_reg    = (int_flag_success != noreg);
   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                             int_flag_success != exchange_value && int_flag_success != addr_base);
+  assert(!weak || flag == CCR0, "weak only supported with CCR0");
 
   if (use_result_reg && preset_result_reg) {
     li(int_flag_success, 0); // preset (assume cas failed)
@@ -1445,10 +1446,12 @@
   // fall through    => (flag == eq), (dest_current_value == compare_value)
 
   stwcx_(exchange_value, addr_base);
-  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
-    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
-  } else {
-    bne(                  CCR0, retry); // StXcx_ sets CCR0.
+  if (!weak || use_result_reg) {
+    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
+      bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
+    } else {
+      bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
+    }
   }
   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
 
@@ -1498,7 +1501,7 @@
 void MacroAssembler::cmpxchgd(ConditionRegister flag,
                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                               Register addr_base, int semantics, bool cmpxchgx_hint,
-                              Register int_flag_success, Label* failed_ext, bool contention_hint) {
+                              Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
   Label retry;
   Label failed_int;
   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
@@ -1508,6 +1511,7 @@
   bool use_result_reg    = (int_flag_success!=noreg);
   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
+  assert(!weak || flag == CCR0, "weak only supported with CCR0");
   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
 
   if (use_result_reg && preset_result_reg) {
@@ -1538,10 +1542,12 @@
   }
 
   stdcx_(exchange_value, addr_base);
-  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
-    bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
-  } else {
-    bne(                  CCR0, retry); // stXcx_ sets CCR0
+  if (!weak || use_result_reg || failed_ext) {
+    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
+      bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
+    } else {
+      bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
+    }
   }
 
   // result in register (must do this at the end because int_flag_success can be the same register as one above)
--- a/hotspot/src/cpu/ppc/vm/macroAssembler_ppc.hpp	Fri May 13 06:36:52 2016 +0000
+++ b/hotspot/src/cpu/ppc/vm/macroAssembler_ppc.hpp	Fri May 13 22:21:54 2016 +0300
@@ -430,11 +430,11 @@
   void cmpxchgw(ConditionRegister flag,
                 Register dest_current_value, Register compare_value, Register exchange_value, Register addr_base,
                 int semantics, bool cmpxchgx_hint = false,
-                Register int_flag_success = noreg, bool contention_hint = false);
+                Register int_flag_success = noreg, bool contention_hint = false, bool weak = false);
   void cmpxchgd(ConditionRegister flag,
                 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
                 Register addr_base, int semantics, bool cmpxchgx_hint = false,
-                Register int_flag_success = noreg, Label* failed = NULL, bool contention_hint = false);
+                Register int_flag_success = noreg, Label* failed = NULL, bool contention_hint = false, bool weak = false);
 
   // interface method calling
   void lookup_interface_method(Register recv_klass,
--- a/hotspot/src/cpu/ppc/vm/ppc.ad	Fri May 13 06:36:52 2016 +0000
+++ b/hotspot/src/cpu/ppc/vm/ppc.ad	Fri May 13 22:21:54 2016 +0300
@@ -3083,7 +3083,11 @@
       __ bne(                  CCR0, Lretry);
     }
     if (RegCollision) __ subf(Rres, Rsrc, Rtmp);
-    __ fence();
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      __ sync();
+    }
   %}
 
   enc_class enc_GetAndAddL(iRegLdst res, iRegPdst mem_ptr, iRegLsrc src) %{
@@ -3108,7 +3112,11 @@
       __ bne(                  CCR0, Lretry);
     }
     if (RegCollision) __ subf(Rres, Rsrc, Rtmp);
-    __ fence();
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      __ sync();
+    }
   %}
 
   enc_class enc_GetAndSetI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src) %{
@@ -3132,7 +3140,11 @@
       __ bne(                  CCR0, Lretry);
     }
     if (RegCollision) __ mr(Rres, Rtmp);
-    __ fence();
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      __ sync();
+    }
   %}
 
   enc_class enc_GetAndSetL(iRegLdst res, iRegPdst mem_ptr, iRegLsrc src) %{
@@ -3156,7 +3168,11 @@
       __ bne(                  CCR0, Lretry);
     }
     if (RegCollision) __ mr(Rres, Rtmp);
-    __ fence();
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      __ sync();
+    }
   %}
 
   // This enc_class is needed so that scheduler gets proper
@@ -7553,6 +7569,8 @@
 // (CompareAndSwap ...)" or "If (CmpI (CompareAndSwap ..))"  cannot be
 // matched.
 
+// Strong versions:
+
 instruct compareAndSwapI_regP_regI_regI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src1, iRegIsrc src2, flagsRegCR0 cr0) %{
   match(Set res (CompareAndSwapI mem_ptr (Binary src1 src2)));
   effect(TEMP cr0);
@@ -7562,8 +7580,13 @@
     // TODO: PPC port $archOpcode(ppc64Opcode_compound);
     // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
     __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
-                MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
                 $res$$Register, true);
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      __ sync();
+    }
   %}
   ins_pipe(pipe_class_default);
 %}
@@ -7577,8 +7600,13 @@
     // TODO: PPC port $archOpcode(ppc64Opcode_compound);
     // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
     __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
-                MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
                 $res$$Register, true);
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      __ sync();
+    }
   %}
   ins_pipe(pipe_class_default);
 %}
@@ -7592,8 +7620,13 @@
     // TODO: PPC port $archOpcode(ppc64Opcode_compound);
     // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
     __ cmpxchgd(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
-                MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
                 $res$$Register, NULL, true);
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      __ sync();
+    }
   %}
   ins_pipe(pipe_class_default);
 %}
@@ -7607,11 +7640,311 @@
     // TODO: PPC port $archOpcode(ppc64Opcode_compound);
     // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
     __ cmpxchgd(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
-                MacroAssembler::MemBarFenceAfter, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
                 $res$$Register, NULL, true);
-  %}
-  ins_pipe(pipe_class_default);
-%}
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      __ sync();
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// Weak versions:
+
+instruct weakCompareAndSwapI_regP_regI_regI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src1, iRegIsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (WeakCompareAndSwapI mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst);
+  effect(TEMP cr0);
+  format %{ "weak CMPXCHGW $res, $mem_ptr, $src1, $src2; as bool" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone,
+                MacroAssembler::cmpxchgx_hint_atomic_update(), $res$$Register, true, /*weak*/ true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct weakCompareAndSwapI_acq_regP_regI_regI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src1, iRegIsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (WeakCompareAndSwapI mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst);
+  effect(TEMP cr0);
+  format %{ "weak CMPXCHGW acq $res, $mem_ptr, $src1, $src2; as bool" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    // Acquire only needed in successful case. Weak node is allowed to report unsuccessful in additional rare cases and
+    // value is never passed to caller.
+    __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                support_IRIW_for_not_multiple_copy_atomic_cpu ? MacroAssembler::MemBarAcq : MacroAssembler::MemBarFenceAfter,
+                MacroAssembler::cmpxchgx_hint_atomic_update(), $res$$Register, true, /*weak*/ true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct weakCompareAndSwapN_regP_regN_regN(iRegIdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (WeakCompareAndSwapN mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst);
+  effect(TEMP cr0);
+  format %{ "weak CMPXCHGW $res, $mem_ptr, $src1, $src2; as bool" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone,
+                MacroAssembler::cmpxchgx_hint_atomic_update(), $res$$Register, true, /*weak*/ true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct weakCompareAndSwapN_acq_regP_regN_regN(iRegIdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (WeakCompareAndSwapN mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst);
+  effect(TEMP cr0);
+  format %{ "weak CMPXCHGW acq $res, $mem_ptr, $src1, $src2; as bool" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    // Acquire only needed in successful case. Weak node is allowed to report unsuccessful in additional rare cases and
+    // value is never passed to caller.
+    __ cmpxchgw(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                support_IRIW_for_not_multiple_copy_atomic_cpu ? MacroAssembler::MemBarAcq : MacroAssembler::MemBarFenceAfter,
+                MacroAssembler::cmpxchgx_hint_atomic_update(), $res$$Register, true, /*weak*/ true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct weakCompareAndSwapL_regP_regL_regL(iRegIdst res, iRegPdst mem_ptr, iRegLsrc src1, iRegLsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (WeakCompareAndSwapL mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst);
+  effect(TEMP cr0);
+  format %{ "weak CMPXCHGD $res, $mem_ptr, $src1, $src2; as bool" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    // value is never passed to caller.
+    __ cmpxchgd(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone,
+                MacroAssembler::cmpxchgx_hint_atomic_update(), $res$$Register, NULL, true, /*weak*/ true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct weakCompareAndSwapL_acq_regP_regL_regL(iRegIdst res, iRegPdst mem_ptr, iRegLsrc src1, iRegLsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (WeakCompareAndSwapL mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst);
+  effect(TEMP cr0);
+  format %{ "weak CMPXCHGD acq $res, $mem_ptr, $src1, $src2; as bool" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    // Acquire only needed in successful case. Weak node is allowed to report unsuccessful in additional rare cases and
+    // value is never passed to caller.
+    __ cmpxchgd(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                support_IRIW_for_not_multiple_copy_atomic_cpu ? MacroAssembler::MemBarAcq : MacroAssembler::MemBarFenceAfter,
+                MacroAssembler::cmpxchgx_hint_atomic_update(), $res$$Register, NULL, true, /*weak*/ true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct weakCompareAndSwapP_regP_regP_regP(iRegIdst res, iRegPdst mem_ptr, iRegPsrc src1, iRegPsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (WeakCompareAndSwapP mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst);
+  effect(TEMP cr0);
+  format %{ "weak CMPXCHGD $res, $mem_ptr, $src1, $src2; as bool; ptr" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgd(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone,
+                MacroAssembler::cmpxchgx_hint_atomic_update(), $res$$Register, NULL, true, /*weak*/ true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct weakCompareAndSwapP_acq_regP_regP_regP(iRegIdst res, iRegPdst mem_ptr, iRegPsrc src1, iRegPsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (WeakCompareAndSwapP mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst);
+  effect(TEMP cr0);
+  format %{ "weak CMPXCHGD acq $res, $mem_ptr, $src1, $src2; as bool; ptr" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    // Acquire only needed in successful case. Weak node is allowed to report unsuccessful in additional rare cases and
+    // value is never passed to caller.
+    __ cmpxchgd(CCR0, R0, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                support_IRIW_for_not_multiple_copy_atomic_cpu ? MacroAssembler::MemBarAcq : MacroAssembler::MemBarFenceAfter,
+                MacroAssembler::cmpxchgx_hint_atomic_update(), $res$$Register, NULL, true, /*weak*/ true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// CompareAndExchange
+
+instruct compareAndExchangeI_regP_regI_regI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src1, iRegIsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (CompareAndExchangeI mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst);
+  effect(TEMP_DEF res, TEMP cr0);
+  format %{ "CMPXCHGW $res, $mem_ptr, $src1, $src2; as int" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgw(CCR0, $res$$Register, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                noreg, true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct compareAndExchangeI_acq_regP_regI_regI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src1, iRegIsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (CompareAndExchangeI mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst);
+  effect(TEMP_DEF res, TEMP cr0);
+  format %{ "CMPXCHGW acq $res, $mem_ptr, $src1, $src2; as int" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgw(CCR0, $res$$Register, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                noreg, true);
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      // isync would be sufficient in case of CompareAndExchangeAcquire, but we currently don't optimize for that.
+      __ sync();
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct compareAndExchangeN_regP_regN_regN(iRegNdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (CompareAndExchangeN mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst);
+  effect(TEMP_DEF res, TEMP cr0);
+  format %{ "CMPXCHGW $res, $mem_ptr, $src1, $src2; as narrow oop" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgw(CCR0, $res$$Register, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                noreg, true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct compareAndExchangeN_acq_regP_regN_regN(iRegNdst res, iRegPdst mem_ptr, iRegNsrc src1, iRegNsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (CompareAndExchangeN mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst);
+  effect(TEMP_DEF res, TEMP cr0);
+  format %{ "CMPXCHGW acq $res, $mem_ptr, $src1, $src2; as narrow oop" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgw(CCR0, $res$$Register, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                noreg, true);
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      // isync would be sufficient in case of CompareAndExchangeAcquire, but we currently don't optimize for that.
+      __ sync();
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct compareAndExchangeL_regP_regL_regL(iRegLdst res, iRegPdst mem_ptr, iRegLsrc src1, iRegLsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (CompareAndExchangeL mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst);
+  effect(TEMP_DEF res, TEMP cr0);
+  format %{ "CMPXCHGD $res, $mem_ptr, $src1, $src2; as long" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgd(CCR0, $res$$Register, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                noreg, NULL, true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct compareAndExchangeL_acq_regP_regL_regL(iRegLdst res, iRegPdst mem_ptr, iRegLsrc src1, iRegLsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (CompareAndExchangeL mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst);
+  effect(TEMP_DEF res, TEMP cr0);
+  format %{ "CMPXCHGD acq $res, $mem_ptr, $src1, $src2; as long" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgd(CCR0, $res$$Register, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                noreg, NULL, true);
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      // isync would be sufficient in case of CompareAndExchangeAcquire, but we currently don't optimize for that.
+      __ sync();
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct compareAndExchangeP_regP_regP_regP(iRegPdst res, iRegPdst mem_ptr, iRegPsrc src1, iRegPsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (CompareAndExchangeP mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() != MemNode::acquire && ((CompareAndSwapNode*)n)->order() != MemNode::seqcst);
+  effect(TEMP_DEF res, TEMP cr0);
+  format %{ "CMPXCHGD $res, $mem_ptr, $src1, $src2; as ptr; ptr" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgd(CCR0, $res$$Register, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                noreg, NULL, true);
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+instruct compareAndExchangeP_acq_regP_regP_regP(iRegPdst res, iRegPdst mem_ptr, iRegPsrc src1, iRegPsrc src2, flagsRegCR0 cr0) %{
+  match(Set res (CompareAndExchangeP mem_ptr (Binary src1 src2)));
+  predicate(((CompareAndSwapNode*)n)->order() == MemNode::acquire || ((CompareAndSwapNode*)n)->order() == MemNode::seqcst);
+  effect(TEMP_DEF res, TEMP cr0);
+  format %{ "CMPXCHGD acq $res, $mem_ptr, $src1, $src2; as ptr; ptr" %}
+  // Variable size: instruction count smaller if regs are disjoint.
+  ins_encode %{
+    // TODO: PPC port $archOpcode(ppc64Opcode_compound);
+    // CmpxchgX sets CCR0 to cmpX(src1, src2) and Rres to 'true'/'false'.
+    __ cmpxchgd(CCR0, $res$$Register, $src1$$Register, $src2$$Register, $mem_ptr$$Register,
+                MacroAssembler::MemBarNone, MacroAssembler::cmpxchgx_hint_atomic_update(),
+                noreg, NULL, true);
+    if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+      __ isync();
+    } else {
+      // isync would be sufficient in case of CompareAndExchangeAcquire, but we currently don't optimize for that.
+      __ sync();
+    }
+  %}
+  ins_pipe(pipe_class_default);
+%}
+
+// Special RMW
 
 instruct getAndAddI(iRegIdst res, iRegPdst mem_ptr, iRegIsrc src, flagsRegCR0 cr0) %{
   match(Set res (GetAndAddI mem_ptr src));
--- a/hotspot/src/share/vm/opto/library_call.cpp	Fri May 13 06:36:52 2016 +0000
+++ b/hotspot/src/share/vm/opto/library_call.cpp	Fri May 13 22:21:54 2016 +0300
@@ -2697,6 +2697,7 @@
       assert(sig->type_at(0)->basic_type() == T_OBJECT, "get and set base is object");
       assert(sig->type_at(1)->basic_type() == T_LONG, "get and set offset is long");
       assert(sig->type_at(2)->basic_type() == type, "get and set must take expected type as new value/delta");
+      assert(access_kind == Volatile, "mo is not passed to intrinsic nodes in current implementation");
 #endif // ASSERT
         break;
       }
@@ -2822,8 +2823,14 @@
     case Acquire:
       break;
     case Release:
+      insert_mem_bar(Op_MemBarRelease);
+      break;
     case Volatile:
-      insert_mem_bar(Op_MemBarRelease);
+      if (support_IRIW_for_not_multiple_copy_atomic_cpu) {
+        insert_mem_bar(Op_MemBarVolatile);
+      } else {
+        insert_mem_bar(Op_MemBarRelease);
+      }
       break;
     default:
       ShouldNotReachHere();
@@ -3035,6 +3042,7 @@
     case Acquire:
     case Volatile:
       insert_mem_bar(Op_MemBarAcquire);
+      // !support_IRIW_for_not_multiple_copy_atomic_cpu handled in platform code
       break;
     default:
       ShouldNotReachHere();