8206895: aarch64: rework error-prone cmp instruction
author bulasevich
Fri, 10 Aug 2018 14:22:49 +0300
changeset 51374 7be0084191ed
parent 51373 514035618c1d
child 51375 b812a85b3aa4
8206895: aarch64: rework error-prone cmp instruction Reviewed-by: aph
src/hotspot/cpu/aarch64/aarch64.ad
src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp
src/hotspot/cpu/aarch64/c1_Runtime1_aarch64.cpp
src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp
src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
src/hotspot/cpu/aarch64/macroAssembler_aarch64_log.cpp
src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp
src/hotspot/cpu/aarch64/methodHandles_aarch64.cpp
src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp
src/hotspot/cpu/aarch64/templateTable_aarch64.cpp
--- a/src/hotspot/cpu/aarch64/aarch64.ad	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/aarch64.ad	Fri Aug 10 14:22:49 2018 +0300
@@ -14471,7 +14471,7 @@
   format %{ "cmp   $op1, $op2\t# overflow check long" %}
   ins_cost(INSN_COST);
   ins_encode %{
-    __ cmp($op1$$Register, $op2$$constant);
+    __ subs(zr, $op1$$Register, $op2$$constant);
   %}
 
   ins_pipe(icmp_reg_imm);
--- a/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/c1_LIRAssembler_aarch64.cpp	Fri Aug 10 14:22:49 2018 +0300
@@ -1922,7 +1922,7 @@
         if (is_32bit)
           __ cmpw(reg1, imm);
         else
-          __ cmp(reg1, imm);
+          __ subs(zr, reg1, imm);
         return;
       } else {
         __ mov(rscratch1, imm);
@@ -2705,7 +2705,7 @@
 
         if (TypeEntries::is_type_none(current_klass)) {
           __ cbz(rscratch2, none);
-          __ cmp(rscratch2, TypeEntries::null_seen);
+          __ cmp(rscratch2, (u1)TypeEntries::null_seen);
           __ br(Assembler::EQ, none);
           // There is a chance that the checks above (re-reading profiling
           // data from memory) fail if another thread has just set the
@@ -2750,7 +2750,7 @@
           Label ok;
           __ ldr(rscratch1, mdo_addr);
           __ cbz(rscratch1, ok);
-          __ cmp(rscratch1, TypeEntries::null_seen);
+          __ cmp(rscratch1, (u1)TypeEntries::null_seen);
           __ br(Assembler::EQ, ok);
           // may have been set by another thread
           __ dmb(Assembler::ISHLD);
--- a/src/hotspot/cpu/aarch64/c1_Runtime1_aarch64.cpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/c1_Runtime1_aarch64.cpp	Fri Aug 10 14:22:49 2018 +0300
@@ -712,7 +712,7 @@
           {
             Label ok, not_ok;
             __ ldrw(obj_size, Address(klass, Klass::layout_helper_offset()));
-            __ cmp(obj_size, 0u);
+            __ cmp(obj_size, (u1)0);
             __ br(Assembler::LE, not_ok);  // make sure it's an instance (LH > 0)
             __ tstw(obj_size, Klass::_lh_instance_slow_path_bit);
             __ br(Assembler::EQ, ok);
--- a/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/interp_masm_aarch64.cpp	Fri Aug 10 14:22:49 2018 +0300
@@ -1636,7 +1636,7 @@
 
   ldr(rscratch1, mdo_addr);
   cbz(rscratch1, none);
-  cmp(rscratch1, TypeEntries::null_seen);
+  cmp(rscratch1, (u1)TypeEntries::null_seen);
   br(Assembler::EQ, none);
   // There is a chance that the checks above (re-reading profiling
   // data from memory) fail if another thread has just set the
@@ -1670,7 +1670,7 @@
     int off_to_start = is_virtual ? in_bytes(VirtualCallData::virtual_call_data_size()) : in_bytes(CounterData::counter_data_size());
 
     ldrb(rscratch1, Address(mdp, in_bytes(DataLayout::tag_offset()) - off_to_start));
-    cmp(rscratch1, is_virtual ? DataLayout::virtual_call_type_data_tag : DataLayout::call_type_data_tag);
+    cmp(rscratch1, u1(is_virtual ? DataLayout::virtual_call_type_data_tag : DataLayout::call_type_data_tag));
     br(Assembler::NE, profile_continue);
 
     if (MethodData::profile_arguments()) {
@@ -1682,7 +1682,7 @@
           // If return value type is profiled we may have no argument to profile
           ldr(tmp, Address(mdp, in_bytes(TypeEntriesAtCall::cell_count_offset())));
           sub(tmp, tmp, i*TypeStackSlotEntries::per_arg_count());
-          cmp(tmp, TypeStackSlotEntries::per_arg_count());
+          cmp(tmp, (u1)TypeStackSlotEntries::per_arg_count());
           add(rscratch1, mdp, off_to_args);
           br(Assembler::LT, done);
         }
@@ -1752,13 +1752,13 @@
       // length
       Label do_profile;
       ldrb(rscratch1, Address(rbcp, 0));
-      cmp(rscratch1, Bytecodes::_invokedynamic);
+      cmp(rscratch1, (u1)Bytecodes::_invokedynamic);
       br(Assembler::EQ, do_profile);
-      cmp(rscratch1, Bytecodes::_invokehandle);
+      cmp(rscratch1, (u1)Bytecodes::_invokehandle);
       br(Assembler::EQ, do_profile);
       get_method(tmp);
       ldrh(rscratch1, Address(tmp, Method::intrinsic_id_offset_in_bytes()));
-      cmp(rscratch1, vmIntrinsics::_compiledLambdaForm);
+      subs(zr, rscratch1, vmIntrinsics::_compiledLambdaForm);
       br(Assembler::NE, profile_continue);
 
       bind(do_profile);
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Fri Aug 10 14:22:49 2018 +0300
@@ -494,7 +494,7 @@
     ldr(swap_reg, mark_addr);
   }
   andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
-  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
+  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
   br(Assembler::NE, cas_label);
   // The bias pattern is present in the object's header. Need to check
   // whether the bias owner and the epoch are both still current.
@@ -633,7 +633,7 @@
   // the bias bit would be clear.
   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
   andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
-  cmp(temp_reg, markOopDesc::biased_lock_pattern);
+  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
   br(Assembler::EQ, done);
 }
 
@@ -1137,7 +1137,7 @@
 
   if (super_check_offset.is_register()) {
     br(Assembler::EQ, *L_success);
-    cmp(super_check_offset.as_register(), sc_offset);
+    subs(zr, super_check_offset.as_register(), sc_offset);
     if (L_failure == &L_fallthrough) {
       br(Assembler::EQ, *L_slow_path);
     } else {
@@ -3312,7 +3312,7 @@
     add(table3, table0, 3*256*sizeof(juint));
 
   if (UseNeon) {
-      cmp(len, 64);
+      cmp(len, (u1)64);
       br(Assembler::LT, L_by16);
       eor(v16, T16B, v16, v16);
 
@@ -4371,10 +4371,10 @@
 
   if (icnt1 == -1) {
     sub(result_tmp, cnt2, cnt1);
-    cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
+    cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
     br(LT, LINEARSEARCH);
     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
-    cmp(cnt1, 256);
+    subs(zr, cnt1, 256);
     lsr(tmp1, cnt2, 2);
     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
     br(GE, LINEARSTUB);
@@ -4480,7 +4480,7 @@
     BIND(BCLOOP);
       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
       if (!str1_isL) {
-        cmp(ch1, ASIZE);
+        subs(zr, ch1, ASIZE);
         br(HS, BCSKIP);
       }
       strb(ch2, Address(sp, ch1));
@@ -4544,7 +4544,7 @@
         } else {
           mov(result_tmp, 1);
         }
-        cmp(skipch, ASIZE);
+        subs(zr, skipch, ASIZE);
         br(HS, BMADV);
       }
       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
@@ -4565,7 +4565,7 @@
       b(DONE);
 
     BIND(LINEARSTUB);
-    cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
+    cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
     br(LT, LINEAR_MEDIUM);
     mov(result, zr);
     RuntimeAddress stub = NULL;
@@ -4594,7 +4594,7 @@
     {
         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
 
-        cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
+        cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
         br(LT, DOSHORT);
       BIND(LINEAR_MEDIUM);
         (this->*str1_load_1chr)(first, Address(str1));
@@ -4629,7 +4629,7 @@
 
       BIND(DOSHORT);
       if (str1_isL == str2_isL) {
-        cmp(cnt1, 2);
+        cmp(cnt1, (u1)2);
         br(LT, DO1);
         br(GT, DO3);
       }
@@ -4704,7 +4704,7 @@
 
       BIND(DO1);
         (this->*str1_load_1chr)(ch1, str1);
-        cmp(cnt2, 8);
+        cmp(cnt2, (u1)8);
         br(LT, DO1_SHORT);
 
         sub(result_tmp, cnt2, 8/str2_chr_size);
@@ -4727,7 +4727,7 @@
         adds(cnt2_neg, cnt2_neg, 8);
         br(LT, CH1_LOOP);
 
-        cmp(cnt2_neg, 8);
+        cmp(cnt2_neg, (u1)8);
         mov(cnt2_neg, 0);
         br(LT, CH1_LOOP);
         b(NOMATCH);
@@ -4770,7 +4770,7 @@
   Register ch1 = rscratch1;
   Register result_tmp = rscratch2;
 
-  cmp(cnt1, 4);
+  cmp(cnt1, (u1)4);
   br(LT, DO1_SHORT);
 
   orr(ch, ch, ch, LSL, 16);
@@ -4793,7 +4793,7 @@
     adds(cnt1_neg, cnt1_neg, 8);
     br(LT, CH1_LOOP);
 
-    cmp(cnt1_neg, 8);
+    cmp(cnt1_neg, (u1)8);
     mov(cnt1_neg, 0);
     br(LT, CH1_LOOP);
     b(NOMATCH);
@@ -4830,7 +4830,7 @@
       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
       SHORT_LOOP_START, TAIL_CHECK;
 
-  const int STUB_THRESHOLD = 64 + 8;
+  const u1 STUB_THRESHOLD = 64 + 8;
   bool isLL = ae == StrIntrinsicNode::LL;
   bool isLU = ae == StrIntrinsicNode::LU;
   bool isUL = ae == StrIntrinsicNode::UL;
@@ -5225,10 +5225,10 @@
     ldrw(cnt2, Address(a2, length_offset));
     // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
     // faster to perform another branch before comparing a1 and a2
-    cmp(cnt1, elem_per_word);
+    cmp(cnt1, (u1)elem_per_word);
     br(LE, SHORT); // short or same
     ldr(tmp3, Address(pre(a1, base_offset)));
-    cmp(cnt1, stubBytesThreshold);
+    subs(zr, cnt1, stubBytesThreshold);
     br(GE, STUB);
     ldr(tmp4, Address(pre(a2, base_offset)));
     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
@@ -5245,7 +5245,7 @@
       cbnz(tmp4, DONE);
       ldr(tmp3, Address(pre(a1, wordSize)));
       ldr(tmp4, Address(pre(a2, wordSize)));
-      cmp(cnt1, elem_per_word);
+      cmp(cnt1, (u1)elem_per_word);
       br(LE, TAIL2);
       cmp(tmp1, tmp2);
     } br(EQ, NEXT_DWORD);
@@ -5418,7 +5418,7 @@
   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
 
   BLOCK_COMMENT("zero_words {");
-  cmp(cnt, zero_words_block_size);
+  cmp(cnt, (u1)zero_words_block_size);
   Label around, done, done16;
   br(LO, around);
   {
@@ -5599,15 +5599,15 @@
       mov(result, len); // Save initial len
 
 #ifndef BUILTIN_SIM
-      cmp(len, 8); // handle shortest strings first
+      cmp(len, (u1)8); // handle shortest strings first
       br(LT, LOOP_1);
-      cmp(len, 32);
+      cmp(len, (u1)32);
       br(LT, NEXT_8);
       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
       // to convert chars to bytes
       if (SoftwarePrefetchHintDistance >= 0) {
         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
-        cmp(len, SoftwarePrefetchHintDistance/2 + 16);
+        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
         br(LE, NEXT_32_START);
         b(NEXT_32_PRFM_START);
         BIND(NEXT_32_PRFM);
@@ -5627,9 +5627,9 @@
           sub(len, len, 32);
           add(dst, dst, 32);
           add(src, src, 64);
-          cmp(len, SoftwarePrefetchHintDistance/2 + 16);
+          subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
           br(GE, NEXT_32_PRFM);
-          cmp(len, 32);
+          cmp(len, (u1)32);
           br(LT, LOOP_8);
         BIND(NEXT_32);
           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
@@ -5652,12 +5652,12 @@
       sub(len, len, 32);
       add(dst, dst, 32);
       add(src, src, 64);
-      cmp(len, 32);
+      cmp(len, (u1)32);
       br(GE, NEXT_32);
       cbz(len, DONE);
 
     BIND(LOOP_8);
-      cmp(len, 8);
+      cmp(len, (u1)8);
       br(LT, LOOP_1);
     BIND(NEXT_8);
       ld1(Vtmp1, T8H, src);
@@ -5670,7 +5670,7 @@
       sub(len, len, 8);
       add(dst, dst, 8);
       add(src, src, 16);
-      cmp(len, 8);
+      cmp(len, (u1)8);
       br(GE, NEXT_8);
 
     BIND(LOOP_1);
@@ -5747,7 +5747,7 @@
       const int large_loop_threshold = (64 + 16)/8;
       ldrd(vtmp2, post(src, 8));
       andw(len, len, 7);
-      cmp(tmp4, large_loop_threshold);
+      cmp(tmp4, (u1)large_loop_threshold);
       br(GE, to_stub);
       b(loop_start);
 
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Fri Aug 10 14:22:49 2018 +0300
@@ -180,8 +180,9 @@
 
   template<class T>
   inline void cmpw(Register Rd, T imm)  { subsw(zr, Rd, imm); }
-  // imm is limited to 12 bits.
-  inline void cmp(Register Rd, unsigned imm)  { subs(zr, Rd, imm); }
+
+  inline void cmp(Register Rd, unsigned char imm8)  { subs(zr, Rd, imm8); }
+  inline void cmp(Register Rd, unsigned imm) __attribute__ ((deprecated));
 
   inline void cmnw(Register Rd, unsigned imm) { addsw(zr, Rd, imm); }
   inline void cmn(Register Rd, unsigned imm) { adds(zr, Rd, imm); }
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64_log.cpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_log.cpp	Fri Aug 10 14:22:49 2018 +0300
@@ -292,7 +292,7 @@
     // vtmp1 = AS_DOUBLE_BITS(0x77F0 << 48 | mantissa(X)) == mx
     fmovd(vtmp1, tmp4);
     subw(tmp2, tmp2, 16);
-    cmp(tmp2, 0x8000);
+    subs(zr, tmp2, 0x8000);
     br(GE, SMALL_VALUE);
   bind(MAIN);
     fmovs(tmp3, vtmp5);                        // int intB0 = AS_INT_BITS(B);
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp	Fri Aug 10 14:22:49 2018 +0300
@@ -297,7 +297,7 @@
       fmsubd(v3, v2, v6, v31); // v3 = r = t - fn * pio2_1
       fmuld(v26, v2, v7);      // v26 = w = fn * pio2_1t
       fsubd(v4, v3, v26);      // y[0] = r - w. Calculated before branch
-      cmp(n, 32);
+      cmp(n, (u1)32);
       br(GT, LARGE_ELSE);
       subw(tmp5, n, 1);        // tmp5 = n - 1
       ldrw(jv, Address(ih, tmp5, Address::lsl(2)));
@@ -312,7 +312,7 @@
           sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1);   // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
 
           block_comment("if(i>16)"); {
-            cmp(tmp3, 16);
+            cmp(tmp3, (u1)16);
             br(LE, X_IS_MEDIUM_BRANCH_DONE);
             // i > 16. 2nd iteration needed
             ldpd(v6, v7, Address(ih, -32));
@@ -328,7 +328,7 @@
             sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1); // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
 
             block_comment("if(i>49)"); {
-              cmp(tmp3, 49);
+              cmp(tmp3, (u1)49);
               br(LE, X_IS_MEDIUM_BRANCH_DONE);
               // 3rd iteration need, 151 bits acc
               ldpd(v6, v7, Address(ih, -16));
--- a/src/hotspot/cpu/aarch64/methodHandles_aarch64.cpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/methodHandles_aarch64.cpp	Fri Aug 10 14:22:49 2018 +0300
@@ -196,7 +196,7 @@
     Label L;
     BLOCK_COMMENT("verify_intrinsic_id {");
     __ ldrh(rscratch1, Address(rmethod, Method::intrinsic_id_offset_in_bytes()));
-    __ cmp(rscratch1, (int) iid);
+    __ subs(zr, rscratch1, (int) iid);
     __ br(Assembler::EQ, L);
     if (iid == vmIntrinsics::_linkToVirtual ||
         iid == vmIntrinsics::_linkToSpecial) {
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Fri Aug 10 14:22:49 2018 +0300
@@ -265,7 +265,7 @@
     {
       Label L;
       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
-      __ cmp(rscratch1, (unsigned)NULL_WORD);
+      __ cmp(rscratch1, (u1)NULL_WORD);
       __ br(Assembler::EQ, L);
       __ stop("StubRoutines::call_stub: entered with pending exception");
       __ BIND(L);
@@ -322,13 +322,13 @@
     __ ldr(j_rarg2, result);
     Label is_long, is_float, is_double, exit;
     __ ldr(j_rarg1, result_type);
-    __ cmp(j_rarg1, T_OBJECT);
+    __ cmp(j_rarg1, (u1)T_OBJECT);
     __ br(Assembler::EQ, is_long);
-    __ cmp(j_rarg1, T_LONG);
+    __ cmp(j_rarg1, (u1)T_LONG);
     __ br(Assembler::EQ, is_long);
-    __ cmp(j_rarg1, T_FLOAT);
+    __ cmp(j_rarg1, (u1)T_FLOAT);
     __ br(Assembler::EQ, is_float);
-    __ cmp(j_rarg1, T_DOUBLE);
+    __ cmp(j_rarg1, (u1)T_DOUBLE);
     __ br(Assembler::EQ, is_double);
 
     // handle T_INT case
@@ -743,7 +743,7 @@
     // Make sure we are never given < 8 words
     {
       Label L;
-      __ cmp(count, 8);
+      __ cmp(count, (u1)8);
       __ br(Assembler::GE, L);
       __ stop("genrate_copy_longs called with < 8 words");
       __ bind(L);
@@ -1103,19 +1103,19 @@
 
     if (PrefetchCopyIntervalInBytes > 0)
       __ prfm(Address(s, 0), PLDL1KEEP);
-    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
+    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
     __ br(Assembler::HI, copy_big);
 
     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 
-    __ cmp(count, 16/granularity);
+    __ cmp(count, u1(16/granularity));
     __ br(Assembler::LS, copy16);
 
-    __ cmp(count, 64/granularity);
+    __ cmp(count, u1(64/granularity));
     __ br(Assembler::HI, copy80);
 
-    __ cmp(count, 32/granularity);
+    __ cmp(count, u1(32/granularity));
     __ br(Assembler::LS, copy32);
 
     // 33..64 bytes
@@ -1170,7 +1170,7 @@
 
     // 0..16 bytes
     __ bind(copy16);
-    __ cmp(count, 8/granularity);
+    __ cmp(count, u1(8/granularity));
     __ br(Assembler::LO, copy8);
 
     // 8..16 bytes
@@ -3270,7 +3270,7 @@
 
     // The pipelined loop needs at least 16 elements for 1 iteration
     // It does check this, but it is more effective to skip to the cleanup loop
-    __ cmp(len, 16);
+    __ cmp(len, (u1)16);
     __ br(Assembler::HS, L_nmax);
     __ cbz(len, L_combine);
 
@@ -3654,7 +3654,7 @@
 
   address generate_has_negatives(address &has_negatives_long) {
     StubCodeMark mark(this, "StubRoutines", "has_negatives");
-    const int large_loop_size = 64;
+    const u1 large_loop_size = 64;
     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
     int dcache_line = VM_Version::dcache_line_size();
 
@@ -3668,7 +3668,7 @@
   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
 
-  __ cmp(len, 15);
+  __ cmp(len, (u1)15);
   __ br(Assembler::GT, LEN_OVER_15);
   // The only case when execution falls into this code is when pointer is near
   // the end of memory page and we have to avoid reading next page
@@ -3764,7 +3764,7 @@
     __ br(Assembler::GE, LARGE_LOOP);
 
   __ bind(CHECK_16); // small 16-byte load pre-loop
-    __ cmp(len, 16);
+    __ cmp(len, (u1)16);
     __ br(Assembler::LT, POST_LOOP16);
 
   __ bind(LOOP16); // small 16-byte load loop
@@ -3773,11 +3773,11 @@
     __ orr(tmp2, tmp2, tmp3);
     __ tst(tmp2, UPPER_BIT_MASK);
     __ br(Assembler::NE, RET_TRUE);
-    __ cmp(len, 16);
+    __ cmp(len, (u1)16);
     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
 
   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
-    __ cmp(len, 8);
+    __ cmp(len, (u1)8);
     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
     __ ldr(tmp3, Address(__ post(ary1, 8)));
     __ sub(len, len, 8);
@@ -3942,7 +3942,7 @@
         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
             /* prfm = */ true, NOT_EQUAL);
-        __ cmp(cnt1, nonPrefetchLoopThreshold);
+        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
         __ br(__ LT, TAIL);
       }
       __ bind(NO_PREFETCH_LARGE_LOOP);
@@ -3955,7 +3955,7 @@
         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
             /* prfm = */ true, NOT_EQUAL);
-        __ cmp(cnt1, nonPrefetchLoopThreshold);
+        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
         __ br(__ LT, TAIL);
       }
       __ bind(NO_PREFETCH_LARGE_LOOP);
@@ -4106,7 +4106,7 @@
     __ ldr(tmp3, Address(__ post(cnt1, 8)));
 
     if (SoftwarePrefetchHintDistance >= 0) {
-      __ cmp(cnt2, prefetchLoopExitCondition);
+      __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
       __ br(__ LT, SMALL_LOOP);
       __ bind(LARGE_LOOP_PREFETCH);
         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
@@ -4123,7 +4123,7 @@
           __ subs(tmp4, tmp4, 1);
           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
           __ sub(cnt2, cnt2, 64);
-          __ cmp(cnt2, prefetchLoopExitCondition);
+          __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
           __ br(__ GE, LARGE_LOOP_PREFETCH);
     }
     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
@@ -4137,7 +4137,7 @@
       __ br(__ GE, SMALL_LOOP);
       __ cbz(cnt2, LOAD_LAST);
     __ bind(TAIL); // 1..15 characters left
-      __ cmp(cnt2, -8);
+      __ subs(zr, cnt2, -8);
       __ br(__ GT, TAIL_LOAD_16);
       __ ldrd(vtmp, Address(tmp2));
       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
@@ -4240,7 +4240,7 @@
         compare_string_16_bytes_same(DIFF, DIFF2);
         __ sub(cnt2, cnt2, isLL ? 64 : 32);
         compare_string_16_bytes_same(DIFF, DIFF2);
-        __ cmp(cnt2, largeLoopExitCondition);
+        __ subs(rscratch2, cnt2, largeLoopExitCondition);
         compare_string_16_bytes_same(DIFF, DIFF2);
         __ br(__ GT, LARGE_LOOP_PREFETCH);
         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
@@ -4416,7 +4416,7 @@
       __ add(result, result, wordSize/str2_chr_size);
       __ br(__ GE, L_LOOP);
     __ BIND(L_POST_LOOP);
-      __ cmp(cnt2, -wordSize/str2_chr_size); // no extra characters to check
+      __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
       __ br(__ LE, NOMATCH);
       __ ldr(ch2, Address(str2));
       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
@@ -4446,7 +4446,7 @@
       __ br(__ EQ, NOMATCH);
     __ BIND(L_SMALL_HAS_ZERO_LOOP);
       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
-      __ cmp(cnt1, wordSize/str2_chr_size);
+      __ cmp(cnt1, u1(wordSize/str2_chr_size));
       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
       if (str2_isL) { // LL
         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
@@ -4659,7 +4659,7 @@
     __ zip1(v2, __ T16B, v2, v0);
     __ st1(v1, v2, __ T16B, __ post(dst, 32));
     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
-    __ cmp(octetCounter, large_loop_threshold);
+    __ subs(rscratch1, octetCounter, large_loop_threshold);
     __ br(__ LE, LOOP_START);
     __ b(LOOP_PRFM_START);
     __ bind(LOOP_PRFM);
@@ -4667,17 +4667,17 @@
     __ bind(LOOP_PRFM_START);
       __ prfm(Address(src, SoftwarePrefetchHintDistance));
       __ sub(octetCounter, octetCounter, 8);
-      __ cmp(octetCounter, large_loop_threshold);
+      __ subs(rscratch1, octetCounter, large_loop_threshold);
       inflate_and_store_2_fp_registers(true, v3, v4);
       inflate_and_store_2_fp_registers(true, v5, v6);
       __ br(__ GT, LOOP_PRFM);
-      __ cmp(octetCounter, 8);
+      __ cmp(octetCounter, (u1)8);
       __ br(__ LT, DONE);
     __ bind(LOOP);
       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
       __ bind(LOOP_START);
       __ sub(octetCounter, octetCounter, 8);
-      __ cmp(octetCounter, 8);
+      __ cmp(octetCounter, (u1)8);
       inflate_and_store_2_fp_registers(false, v3, v4);
       inflate_and_store_2_fp_registers(false, v5, v6);
       __ br(__ GE, LOOP);
@@ -5308,7 +5308,7 @@
       {
         ldr(Rn, Address(Pn_base, 0));
         mul(Rlo_mn, Rn, inv);
-        cmp(Rlo_mn, -1);
+        subs(zr, Rlo_mn, -1);
         Label ok;
         br(EQ, ok); {
           stop("broken inverse in Montgomery multiply");
--- a/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/templateInterpreterGenerator_aarch64.cpp	Fri Aug 10 14:22:49 2018 +0300
@@ -1360,7 +1360,7 @@
   {
     Label L;
     __ ldrw(t, Address(rthread, JavaThread::thread_state_offset()));
-    __ cmp(t, _thread_in_Java);
+    __ cmp(t, (u1)_thread_in_Java);
     __ br(Assembler::EQ, L);
     __ stop("Wrong thread state in native stub");
     __ bind(L);
@@ -1467,7 +1467,7 @@
     Label no_reguard;
     __ lea(rscratch1, Address(rthread, in_bytes(JavaThread::stack_guard_state_offset())));
     __ ldrw(rscratch1, Address(rscratch1));
-    __ cmp(rscratch1, JavaThread::stack_guard_yellow_reserved_disabled);
+    __ cmp(rscratch1, (u1)JavaThread::stack_guard_yellow_reserved_disabled);
     __ br(Assembler::NE, no_reguard);
 
     __ pusha(); // XXX only save smashed registers
--- a/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp	Fri Aug 10 17:07:44 2018 +0800
+++ b/src/hotspot/cpu/aarch64/templateTable_aarch64.cpp	Fri Aug 10 14:22:49 2018 +0300
@@ -331,16 +331,16 @@
   __ ldarb(r3, r3);
 
   // unresolved class - get the resolved class
-  __ cmp(r3, JVM_CONSTANT_UnresolvedClass);
+  __ cmp(r3, (u1)JVM_CONSTANT_UnresolvedClass);
   __ br(Assembler::EQ, call_ldc);
 
   // unresolved class in error state - call into runtime to throw the error
   // from the first resolution attempt
-  __ cmp(r3, JVM_CONSTANT_UnresolvedClassInError);
+  __ cmp(r3, (u1)JVM_CONSTANT_UnresolvedClassInError);
   __ br(Assembler::EQ, call_ldc);
 
   // resolved class - need to call vm to get java mirror of the class
-  __ cmp(r3, JVM_CONSTANT_Class);
+  __ cmp(r3, (u1)JVM_CONSTANT_Class);
   __ br(Assembler::NE, notClass);
 
   __ bind(call_ldc);
@@ -351,7 +351,7 @@
   __ b(Done);
 
   __ bind(notClass);
-  __ cmp(r3, JVM_CONSTANT_Float);
+  __ cmp(r3, (u1)JVM_CONSTANT_Float);
   __ br(Assembler::NE, notFloat);
   // ftos
   __ adds(r1, r2, r1, Assembler::LSL, 3);
@@ -361,7 +361,7 @@
 
   __ bind(notFloat);
 
-  __ cmp(r3, JVM_CONSTANT_Integer);
+  __ cmp(r3, (u1)JVM_CONSTANT_Integer);
   __ br(Assembler::NE, notInt);
 
   // itos
@@ -2333,7 +2333,7 @@
 
   assert(byte_no == f1_byte || byte_no == f2_byte, "byte_no out of range");
   __ get_cache_and_index_and_bytecode_at_bcp(Rcache, index, temp, byte_no, 1, index_size);
-  __ cmp(temp, (int) code);  // have we resolved this bytecode?
+  __ subs(zr, temp, (int) code);  // have we resolved this bytecode?
   __ br(Assembler::EQ, resolved);
 
   // resolve first time through
@@ -2515,7 +2515,7 @@
   __ b(Done);
 
   __ bind(notByte);
-  __ cmp(flags, ztos);
+  __ cmp(flags, (u1)ztos);
   __ br(Assembler::NE, notBool);
 
   // ztos (same code as btos)
@@ -2529,7 +2529,7 @@
   __ b(Done);
 
   __ bind(notBool);
-  __ cmp(flags, atos);
+  __ cmp(flags, (u1)atos);
   __ br(Assembler::NE, notObj);
   // atos
   do_oop_load(_masm, field, r0, IN_HEAP);
@@ -2540,7 +2540,7 @@
   __ b(Done);
 
   __ bind(notObj);
-  __ cmp(flags, itos);
+  __ cmp(flags, (u1)itos);
   __ br(Assembler::NE, notInt);
   // itos
   __ access_load_at(T_INT, IN_HEAP, r0, field, noreg, noreg);
@@ -2552,7 +2552,7 @@
   __ b(Done);
 
   __ bind(notInt);
-  __ cmp(flags, ctos);
+  __ cmp(flags, (u1)ctos);
   __ br(Assembler::NE, notChar);
   // ctos
   __ access_load_at(T_CHAR, IN_HEAP, r0, field, noreg, noreg);
@@ -2564,7 +2564,7 @@
   __ b(Done);
 
   __ bind(notChar);
-  __ cmp(flags, stos);
+  __ cmp(flags, (u1)stos);
   __ br(Assembler::NE, notShort);
   // stos
   __ access_load_at(T_SHORT, IN_HEAP, r0, field, noreg, noreg);
@@ -2576,7 +2576,7 @@
   __ b(Done);
 
   __ bind(notShort);
-  __ cmp(flags, ltos);
+  __ cmp(flags, (u1)ltos);
   __ br(Assembler::NE, notLong);
   // ltos
   __ access_load_at(T_LONG, IN_HEAP, r0, field, noreg, noreg);
@@ -2588,7 +2588,7 @@
   __ b(Done);
 
   __ bind(notLong);
-  __ cmp(flags, ftos);
+  __ cmp(flags, (u1)ftos);
   __ br(Assembler::NE, notFloat);
   // ftos
   __ access_load_at(T_FLOAT, IN_HEAP, noreg /* ftos */, field, noreg, noreg);
@@ -2601,7 +2601,7 @@
 
   __ bind(notFloat);
 #ifdef ASSERT
-  __ cmp(flags, dtos);
+  __ cmp(flags, (u1)dtos);
   __ br(Assembler::NE, notDouble);
 #endif
   // dtos
@@ -2751,7 +2751,7 @@
   }
 
   __ bind(notByte);
-  __ cmp(flags, ztos);
+  __ cmp(flags, (u1)ztos);
   __ br(Assembler::NE, notBool);
 
   // ztos
@@ -2766,7 +2766,7 @@
   }
 
   __ bind(notBool);
-  __ cmp(flags, atos);
+  __ cmp(flags, (u1)atos);
   __ br(Assembler::NE, notObj);
 
   // atos
@@ -2782,7 +2782,7 @@
   }
 
   __ bind(notObj);
-  __ cmp(flags, itos);
+  __ cmp(flags, (u1)itos);
   __ br(Assembler::NE, notInt);
 
   // itos
@@ -2797,7 +2797,7 @@
   }
 
   __ bind(notInt);
-  __ cmp(flags, ctos);
+  __ cmp(flags, (u1)ctos);
   __ br(Assembler::NE, notChar);
 
   // ctos
@@ -2812,7 +2812,7 @@
   }
 
   __ bind(notChar);
-  __ cmp(flags, stos);
+  __ cmp(flags, (u1)stos);
   __ br(Assembler::NE, notShort);
 
   // stos
@@ -2827,7 +2827,7 @@
   }
 
   __ bind(notShort);
-  __ cmp(flags, ltos);
+  __ cmp(flags, (u1)ltos);
   __ br(Assembler::NE, notLong);
 
   // ltos
@@ -2842,7 +2842,7 @@
   }
 
   __ bind(notLong);
-  __ cmp(flags, ftos);
+  __ cmp(flags, (u1)ftos);
   __ br(Assembler::NE, notFloat);
 
   // ftos
@@ -2858,7 +2858,7 @@
 
   __ bind(notFloat);
 #ifdef ASSERT
-  __ cmp(flags, dtos);
+  __ cmp(flags, (u1)dtos);
   __ br(Assembler::NE, notDouble);
 #endif
 
@@ -3534,7 +3534,7 @@
   __ lea(rscratch1, Address(r0, r3, Address::lsl(0)));
   __ lea(rscratch1, Address(rscratch1, tags_offset));
   __ ldarb(rscratch1, rscratch1);
-  __ cmp(rscratch1, JVM_CONSTANT_Class);
+  __ cmp(rscratch1, (u1)JVM_CONSTANT_Class);
   __ br(Assembler::NE, slow_case);
 
   // get InstanceKlass
@@ -3543,7 +3543,7 @@
   // make sure klass is initialized & doesn't have finalizer
   // make sure klass is fully initialized
   __ ldrb(rscratch1, Address(r4, InstanceKlass::init_state_offset()));
-  __ cmp(rscratch1, InstanceKlass::fully_initialized);
+  __ cmp(rscratch1, (u1)InstanceKlass::fully_initialized);
   __ br(Assembler::NE, slow_case);
 
   // get instance_size in InstanceKlass (scaled to a count of bytes)
@@ -3683,7 +3683,7 @@
   __ add(rscratch1, r3, Array<u1>::base_offset_in_bytes());
   __ lea(r1, Address(rscratch1, r19));
   __ ldarb(r1, r1);
-  __ cmp(r1, JVM_CONSTANT_Class);
+  __ cmp(r1, (u1)JVM_CONSTANT_Class);
   __ br(Assembler::EQ, quicked);
 
   __ push(atos); // save receiver for result, and for GC
@@ -3737,7 +3737,7 @@
   __ add(rscratch1, r3, Array<u1>::base_offset_in_bytes());
   __ lea(r1, Address(rscratch1, r19));
   __ ldarb(r1, r1);
-  __ cmp(r1, JVM_CONSTANT_Class);
+  __ cmp(r1, (u1)JVM_CONSTANT_Class);
   __ br(Assembler::EQ, quicked);
 
   __ push(atos); // save receiver for result, and for GC