8165287: fix compress intrinsics to produce correct results with avx512
authorvdeshpande
Tue, 22 Nov 2016 13:59:45 -0800
changeset 42587 80c66fce4b3b
parent 42586 e14fee6a1839
child 42588 d628ae4e12a2
8165287: fix compress intrinsics to produce correct results with avx512 Reviewed-by: kvn
hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Nov 22 11:34:11 2016 -0800
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp	Tue Nov 22 13:59:45 2016 -0800
@@ -10773,16 +10773,13 @@
   // save length for return
   push(len);
 
-  // 8165287: EVEX version disabled for now, needs to be refactored as
-  // it is returning incorrect results.
   if ((UseAVX > 2) && // AVX512
-    0 &&
     VM_Version::supports_avx512vlbw() &&
     VM_Version::supports_bmi2()) {
 
     set_vector_masking();  // opening of the stub context for programming mask registers
 
-    Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates;
+    Label copy_32_loop, copy_loop_tail, restore_k1_return_zero;
 
     // alignement
     Label post_alignement;
@@ -10797,16 +10794,16 @@
     movl(result, 0x00FF);
     evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
 
-    testl(len, -64);
-    jcc(Assembler::zero, post_alignement);
-
     // Save k1
     kmovql(k3, k1);
 
+    testl(len, -64);
+    jcc(Assembler::zero, post_alignement);
+
     movl(tmp5, dst);
-    andl(tmp5, (64 - 1));
+    andl(tmp5, (32 - 1));
     negl(tmp5);
-    andl(tmp5, (64 - 1));
+    andl(tmp5, (32 - 1));
 
     // bail out when there is nothing to be done
     testl(tmp5, 0xFFFFFFFF);
@@ -10816,13 +10813,12 @@
     movl(result, 0xFFFFFFFF);
     shlxl(result, result, tmp5);
     notl(result);
-
     kmovdl(k1, result);
 
     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
     ktestd(k2, k1);
-    jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+    jcc(Assembler::carryClear, restore_k1_return_zero);
 
     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
 
@@ -10835,7 +10831,7 @@
     // end of alignement
 
     movl(tmp5, len);
-    andl(tmp5, (32 - 1));   // tail count (in chars)
+    andl(tmp5, (32 - 1));    // tail count (in chars)
     andl(len, ~(32 - 1));    // vector count (in chars)
     jcc(Assembler::zero, copy_loop_tail);
 
@@ -10847,7 +10843,7 @@
     evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
     evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
     kortestdl(k2, k2);
-    jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+    jcc(Assembler::carryClear, restore_k1_return_zero);
 
     // All elements in current processed chunk are valid candidates for
     // compression. Write a truncated byte elements to the memory.
@@ -10858,11 +10854,10 @@
     bind(copy_loop_tail);
     // bail out when there is nothing to be done
     testl(tmp5, 0xFFFFFFFF);
+    // Restore k1
+    kmovql(k1, k3);
     jcc(Assembler::zero, return_length);
 
-    // Save k1
-    kmovql(k3, k1);
-
     movl(len, tmp5);
 
     // ~(~0 << len), where len is the # of remaining elements to process
@@ -10875,30 +10870,16 @@
     evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
     evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
     ktestd(k2, k1);
-    jcc(Assembler::carryClear, copy_just_portion_of_candidates);
+    jcc(Assembler::carryClear, restore_k1_return_zero);
 
     evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
     // Restore k1
     kmovql(k1, k3);
-
     jmp(return_length);
 
-    bind(copy_just_portion_of_candidates);
-    kmovdl(tmp5, k2);
-    tzcntl(tmp5, tmp5);
-
-    // ~(~0 << tmp5), where tmp5 is a number of elements in an array from the
-    // result to the first element larger than 0xFF
-    movl(result, 0xFFFFFFFF);
-    shlxl(result, result, tmp5);
-    notl(result);
-
-    kmovdl(k1, result);
-
-    evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
+    bind(restore_k1_return_zero);
     // Restore k1
     kmovql(k1, k3);
-
     jmp(return_zero);
 
     clear_vector_masking();   // closing of the stub context for programming mask registers