--- a/src/hotspot/cpu/x86/assembler_x86.cpp Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp Tue Mar 13 10:22:15 2018 -0700
@@ -8709,6 +8709,15 @@
emit_int8((unsigned char)(0xC0 | encode));
}
+void Assembler::vpopcntd(XMMRegister dst, XMMRegister src, int vector_len) { // Emits the register-to-register vpopcntd: 66-prefixed, 0F 38 opcode map, opcode byte 0x55.
+ assert(VM_Version::supports_vpopcntdq(), "must support vpopcntdq feature"); // callers must check the CPU feature first
+ InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+ attributes.set_is_evex_instruction(); // NOTE(review): presumably forces the EVEX encoding (no VEX form) - confirm against InstructionAttr
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes); // middle argument is 0: presumably the unused second-source (nds) slot
+ emit_int8(0x55); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // 0xC0 = ModRM mod 11b (register-direct) combined with the register bits returned above
+}
+
void Assembler::popq(Address dst) {
InstructionMark im(this);
prefixq(dst);
--- a/src/hotspot/cpu/x86/assembler_x86.hpp Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp Tue Mar 13 10:22:15 2018 -0700
@@ -1638,6 +1638,8 @@
void popcntq(Register dst, Register src);
#endif
+ void vpopcntd(XMMRegister dst, XMMRegister src, int vector_len);
+
// Prefetches (SSE, SSE2, 3DNOW only)
void prefetchnta(Address src);
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp Tue Mar 13 10:22:15 2018 -0700
@@ -257,6 +257,8 @@
__ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
__ movl(Address(rsi, 0), rax);
__ movl(Address(rsi, 4), rbx);
+ __ movl(Address(rsi, 8), rcx);
+ __ movl(Address(rsi, 12), rdx);
//
// Extended cpuid(0x80000000)
@@ -662,6 +664,7 @@
_features &= ~CPU_AVX512CD;
_features &= ~CPU_AVX512BW;
_features &= ~CPU_AVX512VL;
+ _features &= ~CPU_AVX512_VPOPCNTDQ;
}
if (UseAVX < 2)
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp Tue Mar 13 10:22:15 2018 -0700
@@ -228,6 +228,38 @@
} bits;
};
+ union SefCpuid7Ecx { // Bit-field view of the ECX value stored for cpuid function 7; bit numbers below assume fields are allocated from the least significant bit
+ uint32_t value; // raw 32-bit register value
+ struct {
+ uint32_t prefetchwt1 : 1, // bit 0
+ avx512_vbmi : 1, // bit 1
+ umip : 1, // bit 2
+ pku : 1, // bit 3
+ ospke : 1, // bit 4
+ : 1, // bit 5 (unnamed padding)
+ avx512_vbmi2 : 1, // bit 6
+ : 1, // bit 7 (unnamed padding)
+ gfni : 1, // bit 8
+ vaes : 1, // bit 9
+ vpclmulqdq : 1, // bit 10
+ avx512_vnni : 1, // bit 11
+ avx512_bitalg : 1, // bit 12
+ : 1, // bit 13 (unnamed padding)
+ avx512_vpopcntdq : 1, // bit 14
+ : 17; // bits 15-31 (unnamed padding, completes the 32 bits)
+ } bits;
+ };
+
+ union SefCpuid7Edx { // Bit-field view of the EDX value stored for cpuid function 7; bit numbers below assume fields are allocated from the least significant bit
+ uint32_t value; // raw 32-bit register value
+ struct {
+ uint32_t : 2, // bits 0-1 (unnamed padding)
+ avx512_4vnniw : 1, // bit 2
+ avx512_4fmaps : 1, // bit 3
+ : 28; // bits 4-31 (unnamed padding, completes the 32 bits)
+ } bits;
+ };
+
union ExtCpuid1EEbx {
uint32_t value;
struct {
@@ -300,7 +332,8 @@
#define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
#define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions
#define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions
-#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction
+#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction
+#define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
enum Extended_Family {
// AMD
@@ -353,8 +386,8 @@
// cpuid function 7 (structured extended features)
SefCpuid7Eax sef_cpuid7_eax;
SefCpuid7Ebx sef_cpuid7_ebx;
- uint32_t sef_cpuid7_ecx; // unused currently
- uint32_t sef_cpuid7_edx; // unused currently
+ SefCpuid7Ecx sef_cpuid7_ecx;
+ SefCpuid7Edx sef_cpuid7_edx;
// cpuid function 0xB (processor topology)
// ecx = 0
@@ -507,6 +540,8 @@
result |= CPU_AVX512BW;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0)
result |= CPU_AVX512VL;
+ if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vpopcntdq != 0)
+ result |= CPU_AVX512_VPOPCNTDQ;
}
}
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@@ -783,6 +818,7 @@
static bool supports_sha() { return (_features & CPU_SHA) != 0; }
static bool supports_fma() { return (_features & CPU_FMA) != 0 && supports_avx(); }
static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
+ static bool supports_vpopcntdq() { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; } // true while the AVX-512 vector popcount feature bit is set in _features
// Intel features
static bool is_intel_family_core() { return is_intel() &&
--- a/src/hotspot/cpu/x86/x86.ad Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/cpu/x86/x86.ad Tue Mar 13 10:22:15 2018 -0700
@@ -1223,6 +1223,10 @@
if (!UsePopCountInstruction)
ret_value = false;
break;
+ case Op_PopCountVI:
+ if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
+ ret_value = false;
+ break;
case Op_MulVI:
if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
ret_value = false;
@@ -10788,3 +10792,49 @@
%}
ins_pipe( pipe_slow );
%}
+
+// --------------------------------- PopCount --------------------------------------
+
+instruct vpopcount2I(vecD dst, vecD src) %{
+ predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
+ match(Set dst (PopCountVI src));
+ format %{ "vpopcntd $dst,$src\t! vector popcount packed2I" %}
+ ins_encode %{ // Per-element population count of a 2 x int vector.
+ int vector_len = VM_Version::supports_avx512vl() ? 0 : 2; // the 128-bit EVEX form also requires AVX512VL; without it use the 512-bit form (lanes above the vecD part of $dst are don't-care)
+ __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct vpopcount4I(vecX dst, vecX src) %{
+ predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
+ match(Set dst (PopCountVI src));
+ format %{ "vpopcntd $dst,$src\t! vector popcount packed4I" %}
+ ins_encode %{ // Per-element population count of a 4 x int vector.
+ int vector_len = VM_Version::supports_avx512vl() ? 0 : 2; // the 128-bit EVEX form also requires AVX512VL; without it use the 512-bit form (lanes above the vecX part of $dst are don't-care)
+ __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct vpopcount8I(vecY dst, vecY src) %{
+ predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
+ match(Set dst (PopCountVI src));
+ format %{ "vpopcntd $dst,$src\t! vector popcount packed8I" %}
+ ins_encode %{ // Per-element population count of an 8 x int vector.
+ int vector_len = VM_Version::supports_avx512vl() ? 1 : 2; // the 256-bit EVEX form also requires AVX512VL; without it use the 512-bit form (lanes above the vecY part of $dst are don't-care)
+ __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct vpopcount16I(vecZ dst, vecZ src) %{
+ predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
+ match(Set dst (PopCountVI src));
+ format %{ "vpopcntd $dst,$src\t! vector popcount packed16I" %}
+ ins_encode %{ // Per-element population count of a 16 x int vector.
+ int vector_len = 2; // vector-length code passed to the assembler; presumably selects the 512-bit encoding (16 x 32-bit ints)
+ __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
--- a/src/hotspot/share/adlc/formssel.cpp Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/adlc/formssel.cpp Tue Mar 13 10:22:15 2018 -0700
@@ -4180,7 +4180,7 @@
"URShiftVB","URShiftVS","URShiftVI","URShiftVL",
"ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
"LoadVector","StoreVector",
- "FmaVD", "FmaVF",
+ "FmaVD", "FmaVF","PopCountVI",
// Next are not supported currently.
"PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
"ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"
--- a/src/hotspot/share/opto/classes.hpp Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/opto/classes.hpp Tue Mar 13 10:22:15 2018 -0700
@@ -241,6 +241,7 @@
macro(Phi)
macro(PopCountI)
macro(PopCountL)
+macro(PopCountVI)
macro(PrefetchAllocation)
macro(Proj)
macro(RShiftI)
--- a/src/hotspot/share/opto/superword.cpp Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/opto/superword.cpp Tue Mar 13 10:22:15 2018 -0700
@@ -2325,8 +2325,11 @@
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
}
- } else if (opc == Op_SqrtF || opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) {
- // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
+ } else if (opc == Op_SqrtF || opc == Op_SqrtD ||
+ opc == Op_AbsF || opc == Op_AbsD ||
+ opc == Op_NegF || opc == Op_NegD ||
+ opc == Op_PopCountI) {
+ assert(n->req() == 2, "only one input expected");
Node* in = vector_opd(p, 1);
vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
--- a/src/hotspot/share/opto/vectornode.cpp Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/opto/vectornode.cpp Tue Mar 13 10:22:15 2018 -0700
@@ -122,6 +122,13 @@
case Op_SqrtD:
assert(bt == T_DOUBLE, "must be");
return Op_SqrtVD;
+ case Op_PopCountI:
+ if (bt == T_INT) {
+ return Op_PopCountVI;
+ }
+ // Unimplemented for subword types since bit count changes
+ // depending on size of lane (and sign bit).
+ return 0;
case Op_LShiftI:
switch (bt) {
case T_BOOLEAN:
@@ -325,6 +332,8 @@
case Op_SqrtVF: return new SqrtVFNode(n1, vt);
case Op_SqrtVD: return new SqrtVDNode(n1, vt);
+ case Op_PopCountVI: return new PopCountVINode(n1, vt);
+
case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt);
case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt);
case Op_LShiftVI: return new LShiftVINode(n1, n2, vt);
--- a/src/hotspot/share/opto/vectornode.hpp Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/opto/vectornode.hpp Tue Mar 13 10:22:15 2018 -0700
@@ -381,6 +381,14 @@
virtual int Opcode() const;
};
+//------------------------------PopCountVINode---------------------------------
+// Vector population count: number of set bits in each int element
+class PopCountVINode : public VectorNode {
+ public:
+ PopCountVINode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {} // takes a single vector input and the result vector type
+ virtual int Opcode() const;
+};
+
//------------------------------SqrtVFNode--------------------------------------
// Vector Sqrt float
class SqrtVFNode : public VectorNode {
--- a/src/hotspot/share/runtime/vmStructs.cpp Tue Mar 13 17:13:35 2018 +0100
+++ b/src/hotspot/share/runtime/vmStructs.cpp Tue Mar 13 10:22:15 2018 -0700
@@ -1996,6 +1996,7 @@
declare_c2_type(MulReductionVDNode, ReductionNode) \
declare_c2_type(DivVFNode, VectorNode) \
declare_c2_type(DivVDNode, VectorNode) \
+ declare_c2_type(PopCountVINode, VectorNode) \
declare_c2_type(LShiftVBNode, VectorNode) \
declare_c2_type(LShiftVSNode, VectorNode) \
declare_c2_type(LShiftVINode, VectorNode) \
--- a/test/hotspot/jtreg/TEST.groups Tue Mar 13 17:13:35 2018 +0100
+++ b/test/hotspot/jtreg/TEST.groups Tue Mar 13 10:22:15 2018 -0700
@@ -111,6 +111,7 @@
compiler/types/ \
compiler/uncommontrap/ \
compiler/unsafe/ \
+ compiler/vectorization/ \
-compiler/intrinsics/bmi \
-compiler/intrinsics/mathexact \
-compiler/intrinsics/sha \
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/hotspot/jtreg/compiler/vectorization/TestPopCountVector.java Tue Mar 13 10:22:15 2018 -0700
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8199421
+ * @summary Test vectorization of popcount
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction
+ * compiler.vectorization.TestPopCountVector
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction
+ * -XX:MaxVectorSize=8 compiler.vectorization.TestPopCountVector
+ */
+
+package compiler.vectorization;
+
+public class TestPopCountVector { // Fills an int array, runs Integer.bitCount over it in a simple loop, and verifies every result.
+ private int[] input; // values whose bits are counted
+ private int[] output; // bit counts written by vectorizeBitCount()
+ private static final int LEN = 1024; // length of both arrays
+
+ public static void main(String args[]) {
+ TestPopCountVector test = new TestPopCountVector();
+
+ for (int i = 0; i < 10_000; ++i) { // repeated calls; presumably enough to get the loop JIT-compiled
+ test.vectorizeBitCount();
+ }
+ System.out.println("Checking popcount result");
+ test.checkResult();
+
+ for (int i = 0; i < 10_000; ++i) { // second identical round, checked again below
+ test.vectorizeBitCount();
+ }
+ System.out.println("Checking popcount result");
+ test.checkResult();
+ }
+
+ public TestPopCountVector() {
+ input = new int[LEN];
+ output = new int[LEN];
+ for (int i = 0; i < LEN; ++i) {
+ input[i] = i % 2 == 0 ? i : -1 * i; // even indices hold i, odd indices hold -i, so both non-negative and negative values are covered
+ }
+ }
+
+ public void vectorizeBitCount() { // the loop under test: one Integer.bitCount per element
+ for (int i = 0; i < LEN; ++i) {
+ output[i] = Integer.bitCount(input[i]);
+ }
+ }
+
+ public void checkResult() { // recomputes each bit count and throws RuntimeException on the first mismatch
+ for (int i = 0; i < LEN; ++i) {
+ int expected = Integer.bitCount(input[i]);
+ if (output[i] != expected) {
+ throw new RuntimeException("Invalid result: output[" + i + "] = " + output[i] + " != " + expected);
+ }
+ }
+ }
+}
+