8199421: Add support for vector popcount
author rlupusoru
Tue, 13 Mar 2018 10:22:15 -0700 (2018-03-13)
changeset 49384 b242a1e3f9cf
parent 49383 bf2ff45e592f
child 49385 1517b9220e64
8199421: Add support for vector popcount
Reviewed-by: kvn
src/hotspot/cpu/x86/assembler_x86.cpp
src/hotspot/cpu/x86/assembler_x86.hpp
src/hotspot/cpu/x86/vm_version_x86.cpp
src/hotspot/cpu/x86/vm_version_x86.hpp
src/hotspot/cpu/x86/x86.ad
src/hotspot/share/adlc/formssel.cpp
src/hotspot/share/opto/classes.hpp
src/hotspot/share/opto/superword.cpp
src/hotspot/share/opto/vectornode.cpp
src/hotspot/share/opto/vectornode.hpp
src/hotspot/share/runtime/vmStructs.cpp
test/hotspot/jtreg/TEST.groups
test/hotspot/jtreg/compiler/vectorization/TestPopCountVector.java
--- a/src/hotspot/cpu/x86/assembler_x86.cpp	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp	Tue Mar 13 10:22:15 2018 -0700
@@ -8709,6 +8709,15 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::vpopcntd(XMMRegister dst, XMMRegister src, int vector_len) { // Emits the register-to-register form of VPOPCNTD: dst receives the bit count of each 32-bit element of src.
+  assert(VM_Version::supports_vpopcntdq(), "must support vpopcntdq feature"); // Callers are expected to have checked the CPU feature first.
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_is_evex_instruction(); // VPOPCNTD is defined only with an EVEX prefix; presumably this pins the EVEX form - confirm against InstructionAttr.
+  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); // 66 prefix, 0F 38 opcode map; the middle 0 is presumably the unused second-source register - confirm.
+  emit_int8(0x55); // Opcode byte of VPOPCNTD.
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte with mod = 11b, i.e. both operands are registers.
+}
+
 void Assembler::popq(Address dst) {
   InstructionMark im(this);
   prefixq(dst);
--- a/src/hotspot/cpu/x86/assembler_x86.hpp	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp	Tue Mar 13 10:22:15 2018 -0700
@@ -1638,6 +1638,8 @@
   void popcntq(Register dst, Register src);
 #endif
 
+  void vpopcntd(XMMRegister dst, XMMRegister src, int vector_len); // Vector popcount of packed 32-bit elements (register form); vector_len selects the vector width encoding.
+
   // Prefetches (SSE, SSE2, 3DNOW only)
 
   void prefetchnta(Address src);
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp	Tue Mar 13 10:22:15 2018 -0700
@@ -257,6 +257,8 @@
     __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
     __ movl(Address(rsi, 0), rax);
     __ movl(Address(rsi, 4), rbx);
+    __ movl(Address(rsi, 8), rcx);
+    __ movl(Address(rsi, 12), rdx);
 
     //
     // Extended cpuid(0x80000000)
@@ -662,6 +664,7 @@
     _features &= ~CPU_AVX512CD;
     _features &= ~CPU_AVX512BW;
     _features &= ~CPU_AVX512VL;
+    _features &= ~CPU_AVX512_VPOPCNTDQ;
   }
 
   if (UseAVX < 2)
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp	Tue Mar 13 10:22:15 2018 -0700
@@ -228,6 +228,38 @@
     } bits;
   };
 
+  union SefCpuid7Ecx { // Bit-field view of the ECX result saved for cpuid function 7 (structured extended features).
+    uint32_t value; // Raw 32-bit register contents.
+    struct {
+      uint32_t prefetchwt1 : 1,
+               avx512_vbmi : 1,
+                      umip : 1,
+                       pku : 1,
+                     ospke : 1,
+                           : 1, // unnamed: bit not decoded here
+              avx512_vbmi2 : 1,
+                           : 1, // unnamed: bit not decoded here
+                      gfni : 1,
+                      vaes : 1,
+                vpclmulqdq : 1,
+               avx512_vnni : 1,
+             avx512_bitalg : 1,
+                           : 1, // unnamed: bit not decoded here
+          avx512_vpopcntdq : 1, // read by feature detection to set CPU_AVX512_VPOPCNTDQ; bit 14 assuming fields are allocated from the least significant bit
+                           : 17; // remaining bits unnamed; the field widths total 32
+    } bits;
+  };
+
+  union SefCpuid7Edx { // Bit-field view of the EDX result saved for cpuid function 7 (structured extended features).
+    uint32_t value; // Raw 32-bit register contents.
+    struct {
+      uint32_t             : 2, // unnamed: bits not decoded here
+             avx512_4vnniw : 1,
+             avx512_4fmaps : 1,
+                           : 28; // remaining bits unnamed; the field widths total 32
+    } bits;
+  };
+
   union ExtCpuid1EEbx {
     uint32_t value;
     struct {
@@ -300,7 +332,8 @@
 #define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
 #define CPU_SHA ((uint64_t)UCONST64(0x400000000))      // SHA instructions
 #define CPU_FMA ((uint64_t)UCONST64(0x800000000))      // FMA instructions
-#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000))      // Vzeroupper instruction
+#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000))       // Vzeroupper instruction
+#define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
 
   enum Extended_Family {
     // AMD
@@ -353,8 +386,8 @@
     // cpuid function 7 (structured extended features)
     SefCpuid7Eax sef_cpuid7_eax;
     SefCpuid7Ebx sef_cpuid7_ebx;
-    uint32_t     sef_cpuid7_ecx; // unused currently
-    uint32_t     sef_cpuid7_edx; // unused currently
+    SefCpuid7Ecx sef_cpuid7_ecx;
+    SefCpuid7Edx sef_cpuid7_edx;
 
     // cpuid function 0xB (processor topology)
     // ecx = 0
@@ -507,6 +540,8 @@
           result |= CPU_AVX512BW;
         if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0)
           result |= CPU_AVX512VL;
+        if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vpopcntdq != 0)
+          result |= CPU_AVX512_VPOPCNTDQ;
       }
     }
     if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@@ -783,6 +818,7 @@
   static bool supports_sha()        { return (_features & CPU_SHA) != 0; }
   static bool supports_fma()        { return (_features & CPU_FMA) != 0 && supports_avx(); }
   static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
+  static bool supports_vpopcntdq()  { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; } // True when the CPU_AVX512_VPOPCNTDQ bit is set in _features.
 
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
--- a/src/hotspot/cpu/x86/x86.ad	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/cpu/x86/x86.ad	Tue Mar 13 10:22:15 2018 -0700
@@ -1223,6 +1223,10 @@
       if (!UsePopCountInstruction)
         ret_value = false;
       break;
+    case Op_PopCountVI:
+      if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
+        ret_value = false;
+      break;
     case Op_MulVI:
       if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
         ret_value = false;
@@ -10788,3 +10792,49 @@
   %}
   ins_pipe( pipe_slow );
 %}
+
+// --------------------------------- PopCount --------------------------------------
+
+instruct vpopcount2I(vecD dst, vecD src) %{ // Matches PopCountVI on a vector of 2 ints and emits vpopcntd.
+  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
+  match(Set dst (PopCountVI src));
+  format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
+  ins_encode %{
+    int vector_len = 0; // NOTE(review): presumably the 128-bit encoding (Assembler::AVX_128bit) - confirm. Per the Intel SDM the 128-bit form of VPOPCNTD also requires AVX512VL, which the predicate above does not check - confirm it is guaranteed elsewhere.
+    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vpopcount4I(vecX dst, vecX src) %{ // Matches PopCountVI on a vector of 4 ints and emits vpopcntd.
+  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
+  match(Set dst (PopCountVI src));
+  format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
+  ins_encode %{
+    int vector_len = 0; // NOTE(review): presumably the 128-bit encoding (Assembler::AVX_128bit) - confirm. Per the Intel SDM the 128-bit form of VPOPCNTD also requires AVX512VL, which the predicate above does not check - confirm it is guaranteed elsewhere.
+    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vpopcount8I(vecY dst, vecY src) %{ // Matches PopCountVI on a vector of 8 ints and emits vpopcntd.
+  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
+  match(Set dst (PopCountVI src));
+  format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
+  ins_encode %{
+    int vector_len = 1; // NOTE(review): presumably the 256-bit encoding (Assembler::AVX_256bit) - confirm. Per the Intel SDM the 256-bit form of VPOPCNTD also requires AVX512VL, which the predicate above does not check - confirm it is guaranteed elsewhere.
+    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vpopcount16I(vecZ dst, vecZ src) %{ // Matches PopCountVI on a vector of 16 ints and emits vpopcntd.
+  predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
+  match(Set dst (PopCountVI src));
+  format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
+  ins_encode %{
+    int vector_len = 2; // NOTE(review): presumably the 512-bit encoding (Assembler::AVX_512bit) - confirm.
+    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
--- a/src/hotspot/share/adlc/formssel.cpp	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/adlc/formssel.cpp	Tue Mar 13 10:22:15 2018 -0700
@@ -4180,7 +4180,7 @@
     "URShiftVB","URShiftVS","URShiftVI","URShiftVL",
     "ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
     "LoadVector","StoreVector",
-    "FmaVD", "FmaVF",
+    "FmaVD", "FmaVF","PopCountVI",
     // Next are not supported currently.
     "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
     "ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"
--- a/src/hotspot/share/opto/classes.hpp	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/opto/classes.hpp	Tue Mar 13 10:22:15 2018 -0700
@@ -241,6 +241,7 @@
 macro(Phi)
 macro(PopCountI)
 macro(PopCountL)
+macro(PopCountVI)
 macro(PrefetchAllocation)
 macro(Proj)
 macro(RShiftI)
--- a/src/hotspot/share/opto/superword.cpp	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/opto/superword.cpp	Tue Mar 13 10:22:15 2018 -0700
@@ -2325,8 +2325,11 @@
           vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
           vlen_in_bytes = vn->as_Vector()->length_in_bytes();
         }
-      } else if (opc == Op_SqrtF || opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) {
-        // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
+      } else if (opc == Op_SqrtF || opc == Op_SqrtD ||
+                 opc == Op_AbsF || opc == Op_AbsD ||
+                 opc == Op_NegF || opc == Op_NegD ||
+                 opc == Op_PopCountI) {
+        assert(n->req() == 2, "only one input expected");
         Node* in = vector_opd(p, 1);
         vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
         vlen_in_bytes = vn->as_Vector()->length_in_bytes();
--- a/src/hotspot/share/opto/vectornode.cpp	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/opto/vectornode.cpp	Tue Mar 13 10:22:15 2018 -0700
@@ -122,6 +122,13 @@
   case Op_SqrtD:
     assert(bt == T_DOUBLE, "must be");
     return Op_SqrtVD;
+  case Op_PopCountI:
+    if (bt == T_INT) {
+      return Op_PopCountVI;
+    }
+    // Unimplemented for subword types since bit count changes
+    // depending on size of lane (and sign bit).
+    return 0;
   case Op_LShiftI:
     switch (bt) {
     case T_BOOLEAN:
@@ -325,6 +332,8 @@
   case Op_SqrtVF: return new SqrtVFNode(n1, vt);
   case Op_SqrtVD: return new SqrtVDNode(n1, vt);
 
+  case Op_PopCountVI: return new PopCountVINode(n1, vt);
+
   case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt);
   case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt);
   case Op_LShiftVI: return new LShiftVINode(n1, n2, vt);
--- a/src/hotspot/share/opto/vectornode.hpp	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/opto/vectornode.hpp	Tue Mar 13 10:22:15 2018 -0700
@@ -381,6 +381,14 @@
   virtual int Opcode() const;
 };
 
+//------------------------------PopCountVINode---------------------------------
+// Vector population count: counts the set bits of each int element of the input vector
+class PopCountVINode : public VectorNode {
+ public:
+  PopCountVINode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {} // one input vector; vt is forwarded to VectorNode
+  virtual int Opcode() const;
+};
+
 //------------------------------SqrtVFNode--------------------------------------
 // Vector Sqrt float
 class SqrtVFNode : public VectorNode {
--- a/src/hotspot/share/runtime/vmStructs.cpp	Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/runtime/vmStructs.cpp	Tue Mar 13 10:22:15 2018 -0700
@@ -1996,6 +1996,7 @@
   declare_c2_type(MulReductionVDNode, ReductionNode)                      \
   declare_c2_type(DivVFNode, VectorNode)                                  \
   declare_c2_type(DivVDNode, VectorNode)                                  \
+  declare_c2_type(PopCountVINode, VectorNode)                             \
   declare_c2_type(LShiftVBNode, VectorNode)                               \
   declare_c2_type(LShiftVSNode, VectorNode)                               \
   declare_c2_type(LShiftVINode, VectorNode)                               \
--- a/test/hotspot/jtreg/TEST.groups	Tue Mar 13 17:13:35 2018 +0100
+++ b/test/hotspot/jtreg/TEST.groups	Tue Mar 13 10:22:15 2018 -0700
@@ -111,6 +111,7 @@
   compiler/types/ \
   compiler/uncommontrap/ \
   compiler/unsafe/ \
+  compiler/vectorization/ \
   -compiler/intrinsics/bmi \
   -compiler/intrinsics/mathexact \
   -compiler/intrinsics/sha \
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test/hotspot/jtreg/compiler/vectorization/TestPopCountVector.java	Tue Mar 13 10:22:15 2018 -0700
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8199421
+ * @summary Test vectorization of popcount
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction
+ *      compiler.vectorization.TestPopCountVector
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction
+ *      -XX:MaxVectorSize=8 compiler.vectorization.TestPopCountVector
+ */
+
+package compiler.vectorization;
+
+public class TestPopCountVector { // Runs an Integer.bitCount loop over an int array many times and verifies every result.
+    private int[] input; // source values, filled once by the constructor
+    private int[] output; // results written by vectorizeBitCount()
+    private static final int LEN = 1024; // length of both arrays
+
+    public static void main(String args[]) {
+        TestPopCountVector test = new TestPopCountVector();
+
+        for (int i = 0; i < 10_000; ++i) { // many invocations, presumably enough for the loop to get JIT-compiled
+          test.vectorizeBitCount();
+        }
+        System.out.println("Checking popcount result");
+        test.checkResult();
+
+        for (int i = 0; i < 10_000; ++i) { // second identical round, followed by a second check
+          test.vectorizeBitCount();
+        }
+        System.out.println("Checking popcount result");
+        test.checkResult();
+    }
+
+    public TestPopCountVector() {
+        input = new int[LEN];
+        output = new int[LEN];
+        for (int i = 0; i < LEN; ++i) {
+            input[i] = i % 2 == 0 ? i : -1 * i; // even indices hold i, odd indices hold -i, so both signs are covered
+        }
+    }
+
+    public void vectorizeBitCount() { // the loop under test: element-wise bit count from input into output
+        for (int i = 0; i < LEN; ++i) {
+            output[i] = Integer.bitCount(input[i]);
+        }
+    }
+
+    public void checkResult() { // recomputes each bit count and throws RuntimeException on the first mismatch
+        for (int i = 0; i < LEN; ++i) {
+            int expected = Integer.bitCount(input[i]);
+            if (output[i] != expected) {
+                throw new RuntimeException("Invalid result: output[" + i + "] = " + output[i] + " != " + expected);
+            }
+        }
+    }
+}
+