src/hotspot/cpu/x86/macroAssembler_x86.cpp
changeset 51633 21154cb84d2a
parent 51464 d96e6839e83d
child 51663 a65d8a6fa424
--- a/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Tue Sep 04 18:32:28 2018 +0100
+++ b/src/hotspot/cpu/x86/macroAssembler_x86.cpp	Tue Sep 04 12:44:02 2018 -0700
@@ -4166,19 +4166,22 @@
   if ((dst_enc < 16) && (nds_enc < 16)) {
     vandps(dst, nds, negate_field, vector_len);
   } else if ((src_enc < 16) && (dst_enc < 16)) {
+    // Use src scratch register
     evmovdqul(src, nds, Assembler::AVX_512bit);
     vandps(dst, src, negate_field, vector_len);
+  } else if (dst_enc < 16) {
+    evmovdqul(dst, nds, Assembler::AVX_512bit);
+    vandps(dst, dst, negate_field, vector_len);
+  } else if (nds_enc < 16) {
+    vandps(nds, nds, negate_field, vector_len);
+    evmovdqul(dst, nds, Assembler::AVX_512bit);
   } else if (src_enc < 16) {
     evmovdqul(src, nds, Assembler::AVX_512bit);
     vandps(src, src, negate_field, vector_len);
     evmovdqul(dst, src, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    evmovdqul(src, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    vandps(dst, xmm0, negate_field, vector_len);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
   } else {
     if (src_enc != dst_enc) {
+      // Use src scratch register
       evmovdqul(src, xmm0, Assembler::AVX_512bit);
       evmovdqul(xmm0, nds, Assembler::AVX_512bit);
       vandps(xmm0, xmm0, negate_field, vector_len);
@@ -4201,17 +4204,19 @@
   if ((dst_enc < 16) && (nds_enc < 16)) {
     vandpd(dst, nds, negate_field, vector_len);
   } else if ((src_enc < 16) && (dst_enc < 16)) {
+    // Use src scratch register
     evmovdqul(src, nds, Assembler::AVX_512bit);
     vandpd(dst, src, negate_field, vector_len);
+  } else if (dst_enc < 16) {
+    evmovdqul(dst, nds, Assembler::AVX_512bit);
+    vandpd(dst, dst, negate_field, vector_len);
+  } else if (nds_enc < 16) {
+    vandpd(nds, nds, negate_field, vector_len);
+    evmovdqul(dst, nds, Assembler::AVX_512bit);
   } else if (src_enc < 16) {
     evmovdqul(src, nds, Assembler::AVX_512bit);
     vandpd(src, src, negate_field, vector_len);
     evmovdqul(dst, src, Assembler::AVX_512bit);
-  } else if (dst_enc < 16) {
-    evmovdqul(src, xmm0, Assembler::AVX_512bit);
-    evmovdqul(xmm0, nds, Assembler::AVX_512bit);
-    vandpd(dst, xmm0, negate_field, vector_len);
-    evmovdqul(xmm0, src, Assembler::AVX_512bit);
   } else {
     if (src_enc != dst_enc) {
       evmovdqul(src, xmm0, Assembler::AVX_512bit);
@@ -4282,6 +4287,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpaddb(xmm0, xmm0, src, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4330,7 +4336,7 @@
   } else if (dst_enc < 16) {
     Assembler::vpaddw(dst, dst, src, vector_len);
   } else if (nds_enc < 16) {
-    // implies dst_enc in upper bank with src as scratch
+    // implies dst_enc in upper bank with nds as scratch
     evmovdqul(nds, dst, Assembler::AVX_512bit);
     Assembler::vpaddw(nds, nds, src, vector_len);
     evmovdqul(dst, nds, Assembler::AVX_512bit);
@@ -4339,6 +4345,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpaddw(xmm0, xmm0, src, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4522,6 +4529,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpmullw(xmm0, xmm0, src, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4578,7 +4586,8 @@
     // worse case scenario, all regs in upper bank
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsubw(xmm0, xmm0, src, vector_len);
+    Assembler::vpsubb(xmm0, xmm0, src, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4636,6 +4645,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpsubw(xmm0, xmm0, src, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4649,7 +4659,7 @@
   } else if ((dst_enc < 16) && (shift_enc < 16)) {
     Assembler::vpsraw(dst, dst, shift, vector_len);
   } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds_enc as scratch with shift
+    // use nds as scratch with shift
     evmovdqul(nds, shift, Assembler::AVX_512bit);
     Assembler::vpsraw(dst, dst, nds, vector_len);
   } else if ((shift_enc < 16) && (nds_enc < 16)) {
@@ -4664,7 +4674,7 @@
     Assembler::vpsraw(dst, dst, xmm0, vector_len);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   } else if (nds_enc < 16) {
-    // use nds as dest as temps
+    // use nds and dst as temps
     evmovdqul(nds, dst, Assembler::AVX_512bit);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
@@ -4677,8 +4687,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
+    Assembler::vpsraw(xmm0, xmm0, xmm1, vector_len);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
     pop_zmm(xmm1);
@@ -4702,6 +4711,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpsraw(xmm0, xmm0, shift, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4715,7 +4725,7 @@
   } else if ((dst_enc < 16) && (shift_enc < 16)) {
     Assembler::vpsrlw(dst, dst, shift, vector_len);
   } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds_enc as scratch with shift
+    // use nds as scratch with shift
     evmovdqul(nds, shift, Assembler::AVX_512bit);
     Assembler::vpsrlw(dst, dst, nds, vector_len);
   } else if ((shift_enc < 16) && (nds_enc < 16)) {
@@ -4730,7 +4740,7 @@
     Assembler::vpsrlw(dst, dst, xmm0, vector_len);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   } else if (nds_enc < 16) {
-    // use nds as dest as temps
+    // use nds and dst as temps
     evmovdqul(nds, dst, Assembler::AVX_512bit);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
@@ -4743,8 +4753,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
-    Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
+    Assembler::vpsrlw(xmm0, xmm0, xmm1, vector_len);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
     pop_zmm(xmm1);
@@ -4768,6 +4777,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpsrlw(xmm0, xmm0, shift, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -4781,7 +4791,7 @@
   } else if ((dst_enc < 16) && (shift_enc < 16)) {
     Assembler::vpsllw(dst, dst, shift, vector_len);
   } else if ((dst_enc < 16) && (nds_enc < 16)) {
-    // use nds_enc as scratch with shift
+    // use nds as scratch with shift
     evmovdqul(nds, shift, Assembler::AVX_512bit);
     Assembler::vpsllw(dst, dst, nds, vector_len);
   } else if ((shift_enc < 16) && (nds_enc < 16)) {
@@ -4796,7 +4806,7 @@
     Assembler::vpsllw(dst, dst, xmm0, vector_len);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   } else if (nds_enc < 16) {
-    // use nds as dest as temps
+    // use nds and dst as temps
     evmovdqul(nds, dst, Assembler::AVX_512bit);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, shift, Assembler::AVX_512bit);
@@ -4810,7 +4820,6 @@
     evmovdqul(xmm1, shift, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpsllw(xmm0, xmm0, xmm1, vector_len);
-    evmovdqul(xmm1, dst, Assembler::AVX_512bit);
     evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
     pop_zmm(xmm1);
@@ -4834,6 +4843,7 @@
     evmovdqul(nds, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, dst, Assembler::AVX_512bit);
     Assembler::vpsllw(xmm0, xmm0, shift, vector_len);
+    evmovdqul(dst, xmm0, Assembler::AVX_512bit);
     evmovdqul(xmm0, nds, Assembler::AVX_512bit);
   }
 }
@@ -7130,7 +7140,7 @@
 
   bind(RET_NOT_FOUND);
   movl(result, -1);
-  jmpb(CLEANUP);
+  jmp(CLEANUP);
 
   bind(FOUND_SUBSTR);
   // Compute start addr of substr
@@ -7148,7 +7158,7 @@
     addl(tmp, cnt2);
     // Found result if we matched whole substring.
     cmpl(tmp, stride);
-    jccb(Assembler::lessEqual, RET_FOUND);
+    jcc(Assembler::lessEqual, RET_FOUND);
 
     // Repeat search for small substring (<= 8 chars)
     // from new point 'str1' without reloading substring.
@@ -7248,7 +7258,7 @@
     jcc(Assembler::carryClear, FOUND_CHAR);
     addptr(result, 32);
     subl(tmp, 2*stride);
-    jccb(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
+    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
     jmp(SCAN_TO_8_CHAR);
     bind(SCAN_TO_8_CHAR_INIT);
     movdl(vec1, ch);
@@ -7278,7 +7288,7 @@
   jcc(Assembler::carryClear, FOUND_CHAR);
   addptr(result, 16);
   subl(tmp, stride);
-  jccb(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
+  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
   bind(SCAN_TO_CHAR);
   testl(cnt1, cnt1);
   jcc(Assembler::zero, RET_NOT_FOUND);
@@ -7857,7 +7867,7 @@
       // Compare 16-byte vectors
       andl(result, 0x0000000f);  //   tail count (in bytes)
       andl(len, 0xfffffff0);   // vector count (in bytes)
-      jccb(Assembler::zero, COMPARE_TAIL);
+      jcc(Assembler::zero, COMPARE_TAIL);
 
       lea(ary1, Address(ary1, len, Address::times_1));
       negptr(len);
@@ -7869,12 +7879,12 @@
       bind(COMPARE_WIDE_VECTORS);
       movdqu(vec1, Address(ary1, len, Address::times_1));
       ptest(vec1, vec2);
-      jccb(Assembler::notZero, TRUE_LABEL);
+      jcc(Assembler::notZero, TRUE_LABEL);
       addptr(len, 16);
       jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
 
       testl(result, result);
-      jccb(Assembler::zero, FALSE_LABEL);
+      jcc(Assembler::zero, FALSE_LABEL);
 
       movdqu(vec1, Address(ary1, result, Address::times_1, -16));
       ptest(vec1, vec2);
@@ -9069,7 +9079,7 @@
     jcc(Assembler::notZero, VECTOR32_NOT_EQUAL);//mismatch found
     addq(result, 32);
     subq(length, 32);
-    jccb(Assembler::greaterEqual, VECTOR32_LOOP);
+    jcc(Assembler::greaterEqual, VECTOR32_LOOP);
     addq(length, 32);
     jcc(Assembler::equal, SAME_TILL_END);
     //falling through if less than 32 bytes left //close the branch here.
@@ -9140,24 +9150,24 @@
   load_unsigned_byte(tmp2, Address(objb, result));
   xorl(tmp1, tmp2);
   testl(tmp1, tmp1);
-  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
   decq(length);
-  jccb(Assembler::zero, SAME_TILL_END);
+  jcc(Assembler::zero, SAME_TILL_END);
   incq(result);
   load_unsigned_byte(tmp1, Address(obja, result));
   load_unsigned_byte(tmp2, Address(objb, result));
   xorl(tmp1, tmp2);
   testl(tmp1, tmp1);
-  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
   decq(length);
-  jccb(Assembler::zero, SAME_TILL_END);
+  jcc(Assembler::zero, SAME_TILL_END);
   incq(result);
   load_unsigned_byte(tmp1, Address(obja, result));
   load_unsigned_byte(tmp2, Address(objb, result));
   xorl(tmp1, tmp2);
   testl(tmp1, tmp1);
-  jccb(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
-  jmpb(SAME_TILL_END);
+  jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found
+  jmp(SAME_TILL_END);
 
   if (UseAVX >= 2) {
     bind(VECTOR32_NOT_EQUAL);
@@ -9168,7 +9178,7 @@
     bsfq(tmp1, tmp1);
     addq(result, tmp1);
     shrq(result);
-    jmpb(DONE);
+    jmp(DONE);
   }
 
   bind(VECTOR16_NOT_EQUAL);
@@ -10590,7 +10600,7 @@
     andl(len, 0xfffffff0);    // vector count (in chars)
     andl(result, 0x0000000f);    // tail count (in chars)
     testl(len, len);
-    jccb(Assembler::zero, copy_16);
+    jcc(Assembler::zero, copy_16);
 
     // compress 16 chars per iter
     movdl(tmp1Reg, tmp5);