--- a/hotspot/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -1,6 +1,6 @@
/*
- * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -2120,6 +2120,7 @@
save_native_result(masm, ret_type, stack_slots);
}
+ __ mov(c_rarg2, rthread);
__ lea(c_rarg1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
__ mov(c_rarg0, obj_reg);
@@ -2128,7 +2129,7 @@
__ ldr(r19, Address(rthread, in_bytes(Thread::pending_exception_offset())));
__ str(zr, Address(rthread, in_bytes(Thread::pending_exception_offset())));
- rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 2, 0, 1);
+ rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 3, 0, 1);
#ifdef ASSERT
{
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
@@ -190,6 +190,11 @@
}
}
+ if (UseGHASHIntrinsics) {
+ warning("GHASH intrinsics are not available on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
UseCRC32Intrinsics = true;
}
--- a/hotspot/src/cpu/ppc/vm/sharedRuntime_ppc.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/ppc/vm/sharedRuntime_ppc.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -2475,7 +2475,8 @@
// Slow case of monitor enter.
// Inline a special case of call_VM that disallows any pending_exception.
- __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), r_oop, r_box);
+ // Arguments are (oop obj, BasicLock* lock, JavaThread* thread).
+ __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), r_oop, r_box, R16_thread);
__ asm_assert_mem8_is_zero(thread_(pending_exception),
"no pending exception allowed on exit from SharedRuntime::complete_monitor_unlocking_C", 0);
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -176,6 +176,11 @@
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
}
+ if (UseGHASHIntrinsics) {
+ warning("GHASH intrinsics are not available on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
if (UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -129,6 +129,7 @@
flog3_op3 = 0x36,
edge_op3 = 0x36,
fsrc_op3 = 0x36,
+ xmulx_op3 = 0x36,
impdep2_op3 = 0x37,
stpartialf_op3 = 0x37,
jmpl_op3 = 0x38,
@@ -220,6 +221,8 @@
mdtox_opf = 0x110,
mstouw_opf = 0x111,
mstosw_opf = 0x113,
+ xmulx_opf = 0x115,
+ xmulxhi_opf = 0x116,
mxtod_opf = 0x118,
mwtos_opf = 0x119,
@@ -1212,6 +1215,9 @@
void movwtos( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); }
void movxtod( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); }
+ void xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); } // VIS3 XMULX (XOR multiply): d = low 64 bits of the carry-less product s1 x s2
+ void xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); } // VIS3 XMULXHI (XOR multiply high): d = high 64 bits of the carry-less product s1 x s2
+
// Crypto SHA instructions
void sha1() { sha1_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); }
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -4786,6 +4786,130 @@
return start;
}
+ /* Single and multi-block ghash operations: folds 'len' 16-byte blocks of 'data' into the 16-byte 'state' using 'subkeyH'; returns the stub entry address */
+ address generate_ghash_processBlocks() {
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_aligned, L_main;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ Register state = I0; // 16 bytes, read once before the loop and written back once after it
+ Register subkeyH = I1; // 16 bytes, read-only
+ Register data = I2; // input pointer, advanced by 16 per block; may be unaligned
+ Register len = I3; // number of 16-byte blocks still to process
+
+ __ save_frame(0); // new register window: arguments are now in I0-I3, O/L registers are scratch
+
+ __ ldx(state, 0, O0); // O0:O1 = running state, kept in registers across iterations
+ __ ldx(state, 8, O1);
+
+ // Loop label for multiblock operations
+ __ BIND(L_ghash_loop);
+
+ // Check if 'data' is unaligned
+ __ andcc(data, 7, G1); // G1 = data & 7 (byte misalignment); also sets the condition codes
+ __ br(Assembler::zero, false, Assembler::pt, L_aligned);
+ __ delayed()->nop();
+
+ Register left_shift = L1;
+ Register right_shift = L2;
+ Register data_ptr = L3;
+
+ // Get left and right shift values in bits
+ __ sll(G1, LogBitsPerByte, left_shift); // left_shift = misalignment in bits
+ __ mov(64, right_shift);
+ __ sub(right_shift, left_shift, right_shift); // right_shift = 64 - left_shift
+
+ // Align to read 'data'
+ __ sub(data, G1, data_ptr); // data_ptr = data rounded down to an 8-byte boundary
+
+ // Load first 8 bytes of 'data'
+ __ ldx(data_ptr, 0, O4);
+ __ sllx(O4, left_shift, O4);
+ __ ldx(data_ptr, 8, O5);
+ __ srlx(O5, right_shift, G4);
+ __ bset(G4, O4); // O4 = (word0 << left_shift) | (word1 >> right_shift)
+
+ // Load second 8 bytes of 'data'
+ __ sllx(O5, left_shift, O5);
+ __ ldx(data_ptr, 16, G4); // third aligned word supplies the tail of the block
+ __ srlx(G4, right_shift, G4);
+ __ ba(L_main);
+ __ delayed()->bset(G4, O5); // delay slot: O5 = (word1 << left_shift) | (word2 >> right_shift)
+
+ // If 'data' is aligned, load normally
+ __ BIND(L_aligned);
+ __ ldx(data, 0, O4);
+ __ ldx(data, 8, O5);
+
+ __ BIND(L_main); // here O4:O5 hold the current 16-byte input block
+ __ ldx(subkeyH, 0, O2); // reloaded every iteration: O2/O3 are reused as temporaries below
+ __ ldx(subkeyH, 8, O3);
+
+ __ xor3(O0, O4, O0); // state ^= input block
+ __ xor3(O1, O5, O1);
+
+ __ xmulxhi(O0, O3, G3); // G3 = hi(O0 x O3); the eight xmulx/xmulxhi below form all four 64x64 XOR products
+ __ xmulx(O0, O2, O5); // O5 = lo(O0 x O2)
+ __ xmulxhi(O1, O2, G4); // G4 = hi(O1 x O2)
+ __ xmulxhi(O1, O3, G5); // G5 = hi(O1 x O3)
+ __ xmulx(O0, O3, G1); // G1 = lo(O0 x O3)
+ __ xmulx(O1, O3, G2); // G2 = lo(O1 x O3)
+ __ xmulx(O1, O2, O3); // O3 = lo(O1 x O2); subkey word in O3 is dead from here on
+ __ xmulxhi(O0, O2, O4); // O4 = hi(O0 x O2)
+
+ __ mov(0xE1, O0); // O0 = 0xE1 << 56; presumably the GHASH reduction constant - TODO confirm
+ __ sllx(O0, 56, O0);
+
+ __ xor3(O5, G3, O5); // NOTE(review): from here to the or3/xor3 below the partial products are combined and reduced
+ __ xor3(O5, G4, O5); // back to 128 bits; verify the exact sequence against the GHASH definition
+ __ xor3(G5, G1, G1);
+ __ xor3(G1, O3, G1);
+ __ srlx(G2, 63, O1);
+ __ srlx(G1, 63, G3);
+ __ sllx(G2, 63, O3);
+ __ sllx(G2, 58, O2);
+ __ xor3(O3, O2, O2);
+
+ __ sllx(G1, 1, G1);
+ __ or3(G1, O1, G1);
+
+ __ xor3(G1, O2, G1);
+
+ __ sllx(G2, 1, G2);
+
+ __ xmulxhi(G1, O0, O1); // XOR-multiply the low words by the 0xE1 << 56 constant held in O0
+ __ xmulx(G1, O0, O2);
+ __ xmulxhi(G2, O0, O3);
+ __ xmulx(G2, O0, G1);
+
+ __ xor3(O4, O1, O4);
+ __ xor3(O5, O2, O5);
+ __ xor3(O5, O3, O5);
+
+ __ sllx(O4, 1, O2);
+ __ srlx(O5, 63, O3);
+
+ __ or3(O2, O3, O0); // O0 = new state word at offset 0
+
+ __ sllx(O5, 1, O1);
+ __ srlx(G1, 63, O2);
+ __ or3(O1, O2, O1);
+ __ xor3(O1, G3, O1); // O1 = new state word at offset 8
+
+ __ deccc(len); // --len and set condition codes; NOTE(review): len == 0 on entry still processes a block - presumably callers pass len > 0
+ __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop); // annulled branch: the delay slot runs only when looping
+ __ delayed()->add(data, 16, data); // advance to the next 16-byte block
+
+ __ stx(O0, I0, 0); // write the final state back (I0 is 'state')
+ __ stx(O1, I0, 8);
+
+ __ ret();
+ __ delayed()->restore();
+
+ return start;
+ }
+
void generate_initial() {
// Generates all stubs and initializes the entry points
@@ -4859,6 +4983,10 @@
StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
}
+ // generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
// generate SHA1/SHA256/SHA512 intrinsics code
if (UseSHA1Intrinsics) {
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -300,6 +300,17 @@
}
}
+ // GHASH/GCM intrinsics: on by default only with VIS3 (and UseVIS > 2); otherwise forced off, warning only when the flag was set explicitly
+ if (has_vis3() && (UseVIS > 2)) {
+ if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+ UseGHASHIntrinsics = true;
+ }
+ } else if (UseGHASHIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+ warning("GHASH intrinsics require VIS3 instruction support. Intrinsics will be disabled");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
// SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
if (has_sha1() || has_sha256() || has_sha512()) {
if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -1347,7 +1347,7 @@
void Assembler::andnl(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode(dst, src1, src2, false);
+ int encode = vex_prefix_0F38_and_encode_legacy(dst, src1, src2, false);
emit_int8((unsigned char)0xF2);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -1355,7 +1355,7 @@
void Assembler::andnl(Register dst, Register src1, Address src2) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38(dst, src1, src2, false);
+ vex_prefix_0F38_legacy(dst, src1, src2, false);
emit_int8((unsigned char)0xF2);
emit_operand(dst, src2);
}
@@ -1382,7 +1382,7 @@
void Assembler::blsil(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode(rbx, dst, src, false);
+ int encode = vex_prefix_0F38_and_encode_legacy(rbx, dst, src, false);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -1390,14 +1390,14 @@
void Assembler::blsil(Register dst, Address src) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38(rbx, dst, src, false);
+ vex_prefix_0F38_legacy(rbx, dst, src, false);
emit_int8((unsigned char)0xF3);
emit_operand(rbx, src);
}
void Assembler::blsmskl(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode(rdx, dst, src, false);
+ int encode = vex_prefix_0F38_and_encode_legacy(rdx, dst, src, false);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -1412,7 +1412,7 @@
void Assembler::blsrl(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode(rcx, dst, src, false);
+ int encode = vex_prefix_0F38_and_encode_legacy(rcx, dst, src, false);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -1420,7 +1420,7 @@
void Assembler::blsrl(Register dst, Address src) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38(rcx, dst, src, false);
+ vex_prefix_0F38_legacy(rcx, dst, src, false);
emit_int8((unsigned char)0xF3);
emit_operand(rcx, src);
}
@@ -3095,8 +3095,16 @@
void Assembler::psrldq(XMMRegister dst, int shift) {
// Shift 128 bit value in xmm register by number of bytes.
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
- int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F,
- false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+ int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
+ emit_int8(0x73);
+ emit_int8((unsigned char)(0xC0 | encode));
+ emit_int8(shift);
+}
+
+void Assembler::pslldq(XMMRegister dst, int shift) {
+ // Shift left 128 bit value in xmm register by number of bytes.
+ NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+ int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false));
emit_int8(0x73);
emit_int8((unsigned char)(0xC0 | encode));
emit_int8(shift);
@@ -3106,15 +3114,16 @@
assert(VM_Version::supports_sse4_1(), "");
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
InstructionMark im(this);
- simd_prefix(dst, src, VEX_SIMD_66, false, VEX_OPCODE_0F_38);
+ simd_prefix(dst, xnoreg, src, VEX_SIMD_66, false,
+ VEX_OPCODE_0F_38, false, AVX_128bit, true);
emit_int8(0x17);
emit_operand(dst, src);
}
void Assembler::ptest(XMMRegister dst, XMMRegister src) {
assert(VM_Version::supports_sse4_1(), "");
- int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
- false, VEX_OPCODE_0F_38);
+ int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, false,
+ VEX_OPCODE_0F_38, false, AVX_128bit, true);
emit_int8(0x17);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -3126,7 +3135,7 @@
assert(dst != xnoreg, "sanity");
int dst_enc = dst->encoding();
// swap src<->dst for encoding
- vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+ vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len, true, false);
emit_int8(0x17);
emit_operand(dst, src);
}
@@ -3135,7 +3144,7 @@
assert(VM_Version::supports_avx(), "");
int vector_len = AVX_256bit;
int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
- vector_len, VEX_OPCODE_0F_38);
+ vector_len, VEX_OPCODE_0F_38, true, false);
emit_int8(0x17);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -3146,12 +3155,12 @@
if (VM_Version::supports_evex()) {
tuple_type = EVEX_FVM;
}
- emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
+ emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
}
void Assembler::punpcklbw(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
- emit_simd_arith(0x60, dst, src, VEX_SIMD_66);
+ emit_simd_arith(0x60, dst, src, VEX_SIMD_66, false, (VM_Version::supports_avx512vlbw() == false));
}
void Assembler::punpckldq(XMMRegister dst, Address src) {
@@ -4979,7 +4988,51 @@
emit_int8((unsigned char)(0xC0 | encode));
}
-// duplicate 4-bytes integer data from src into 8 locations in dest
+// duplicate 1-byte integer data from src into 16|32|64 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), ""); // NOTE(review): only EVEX is asserted; the BW/VL requirement stated above is not checked here
+ int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+ vector_len, VEX_OPCODE_0F_38, false); // xnoreg: no second source operand; trailing false is legacy_mode
+ emit_int8(0x78); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM with mod=11 (register-direct form)
+}
+
+void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) { // memory-source form: replicate the byte at src across dst
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S; // memory-operand attributes read by the prefix encoder (presumably for EVEX disp8 scaling - confirm)
+ input_size_in_bits = EVEX_8bit; // one 8-bit element is read from memory
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding; nds is 0 because broadcast has no second source operand (vvvv must stay unused), as in evpbroadcastss/sd
+ vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+ emit_int8(0x78); // opcode byte
+ emit_operand(dst, src); // ModRM/SIB/displacement for the memory operand
+}
+
+// duplicate 2-byte integer data from src into 8|16|32 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), ""); // NOTE(review): only EVEX is asserted; the BW/VL requirement stated above is not checked here
+ int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
+ vector_len, VEX_OPCODE_0F_38, false); // xnoreg: no second source operand; trailing false is legacy_mode
+ emit_int8(0x79); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM with mod=11 (register-direct form)
+}
+
+void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) { // memory-source form: replicate the 16-bit value at src across dst
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S; // memory-operand attributes read by the prefix encoder (presumably for EVEX disp8 scaling - confirm)
+ input_size_in_bits = EVEX_16bit; // one 16-bit element is read from memory
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding; nds is 0 because broadcast has no second source operand (vvvv must stay unused), as in evpbroadcastss/sd
+ vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+ emit_int8(0x79); // opcode byte
+ emit_operand(dst, src); // ModRM/SIB/displacement for the memory operand
+}
+
+// duplicate 4-byte integer data from src into 4|8|16 locations in dest : requires AVX512VL
void Assembler::evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len) {
assert(VM_Version::supports_evex(), "");
int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66,
@@ -4988,6 +5041,121 @@
emit_int8((unsigned char)(0xC0 | encode));
}
+void Assembler::evpbroadcastd(XMMRegister dst, Address src, int vector_len) { // memory-source form: replicate the 32-bit value at src across dst
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S; // memory-operand attributes read by the prefix encoder (presumably for EVEX disp8 scaling - confirm)
+ input_size_in_bits = EVEX_32bit; // one 32-bit element is read from memory
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding; nds is 0 because broadcast has no second source operand (vvvv must stay unused), as in evpbroadcastss/sd
+ vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+ emit_int8(0x58); // opcode byte
+ emit_operand(dst, src); // ModRM/SIB/displacement for the memory operand
+}
+
+// duplicate 8-byte integer data from src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, true, vector_len, false, false); // vex_w = true (64-bit element), legacy_mode = false, no_mask_reg = false
+ emit_int8(0x59); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM with mod=11 (register-direct form)
+}
+
+void Assembler::evpbroadcastq(XMMRegister dst, Address src, int vector_len) { // memory-source form: replicate the 64-bit value at src across dst
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S; // memory-operand attributes read by the prefix encoder (presumably for EVEX disp8 scaling - confirm)
+ input_size_in_bits = EVEX_64bit; // one 64-bit element is read from memory
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding; nds is 0 because broadcast has no second source operand (vvvv must stay unused), as in evpbroadcastss/sd
+ vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+ emit_int8(0x59); // opcode byte
+ emit_operand(dst, src); // ModRM/SIB/displacement for the memory operand
+}
+
+// duplicate single precision fp from src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, false, vector_len, false, false); // vex_w = false (32-bit element), legacy_mode = false, no_mask_reg = false
+ emit_int8(0x18); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM with mod=11 (register-direct form)
+}
+
+void Assembler::evpbroadcastss(XMMRegister dst, Address src, int vector_len) { // memory-source form: replicate the float at src across dst
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S; // memory-operand attributes read by the prefix encoder (presumably for EVEX disp8 scaling - confirm)
+ input_size_in_bits = EVEX_32bit; // one 32-bit element is read from memory
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding; nds is 0 because there is no second source operand
+ vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector_len);
+ emit_int8(0x18); // opcode byte
+ emit_operand(dst, src); // ModRM/SIB/displacement for the memory operand
+}
+
+// duplicate double precision fp from src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, true, vector_len, false, false); // vex_w = true (64-bit element), legacy_mode = false, no_mask_reg = false
+ emit_int8(0x19); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM with mod=11 (register-direct form)
+}
+
+void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) { // memory-source form: replicate the double at src across dst
+ assert(VM_Version::supports_evex(), "");
+ tuple_type = EVEX_T1S; // memory-operand attributes read by the prefix encoder (presumably for EVEX disp8 scaling - confirm)
+ input_size_in_bits = EVEX_64bit; // one 64-bit element is read from memory
+ InstructionMark im(this);
+ assert(dst != xnoreg, "sanity");
+ int dst_enc = dst->encoding();
+ // swap src<->dst for encoding; nds is 0 because there is no second source operand
+ vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, true, vector_len);
+ emit_int8(0x19); // opcode byte
+ emit_operand(dst, src); // ModRM/SIB/displacement for the memory operand
+}
+
+// duplicate 1-byte integer data from general-purpose register src into 16|32|64 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
+ assert(VM_Version::supports_evex(), ""); // NOTE(review): only EVEX is asserted; the BW/VL requirement stated above is not checked here
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, false, vector_len, false, false); // vex_w = false, legacy_mode = false, no_mask_reg = false
+ emit_int8(0x7A); // opcode byte (GPR-source form)
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM with mod=11 (register-direct form)
+}
+
+// duplicate 2-byte integer data from general-purpose register src into 8|16|32 locations in dest : requires AVX512BW and AVX512VL
+void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
+ assert(VM_Version::supports_evex(), ""); // NOTE(review): only EVEX is asserted; the BW/VL requirement stated above is not checked here
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, false, vector_len, false, false); // vex_w = false, legacy_mode = false, no_mask_reg = false
+ emit_int8(0x7B); // opcode byte (GPR-source form)
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM with mod=11 (register-direct form)
+}
+
+// duplicate 4-byte integer data from general-purpose register src into 4|8|16 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastd(XMMRegister dst, Register src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, false, vector_len, false, false); // vex_w = false selects the 32-bit source; legacy_mode = false, no_mask_reg = false
+ emit_int8(0x7C); // opcode byte (GPR-source form, shared with the 64-bit variant)
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM with mod=11 (register-direct form)
+}
+
+// duplicate 8-byte integer data from general-purpose register src into 2|4|8 locations in dest : requires AVX512VL
+void Assembler::evpbroadcastq(XMMRegister dst, Register src, int vector_len) {
+ assert(VM_Version::supports_evex(), "");
+ int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66,
+ VEX_OPCODE_0F_38, true, vector_len, false, false); // vex_w = true selects the 64-bit source; legacy_mode = false, no_mask_reg = false
+ emit_int8(0x7C); // same opcode byte as the 32-bit GPR form; only vex_w differs
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM with mod=11 (register-direct form)
+}
+
// Carry-Less Multiplication Quadword
void Assembler::pclmulqdq(XMMRegister dst, XMMRegister src, int mask) {
assert(VM_Version::supports_clmul(), "");
@@ -5598,7 +5766,7 @@
void Assembler::vex_prefix(Address adr, int nds_enc, int xreg_enc, VexSimdPrefix pre,
VexOpcode opc, bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg) {
- bool vex_r = (xreg_enc >= 8);
+ bool vex_r = ((xreg_enc & 8) == 8) ? 1 : 0;
bool vex_b = adr.base_needs_rex();
bool vex_x = adr.index_needs_rex();
avx_vector_len = vector_len;
@@ -5626,8 +5794,8 @@
int Assembler::vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc, VexSimdPrefix pre, VexOpcode opc,
bool vex_w, int vector_len, bool legacy_mode, bool no_mask_reg ) {
- bool vex_r = (dst_enc >= 8);
- bool vex_b = (src_enc >= 8);
+ bool vex_r = ((dst_enc & 8) == 8) ? 1 : 0;
+ bool vex_b = ((src_enc & 8) == 8) ? 1 : 0;
bool vex_x = false;
avx_vector_len = vector_len;
@@ -6272,19 +6440,15 @@
void Assembler::andnq(Register dst, Register src1, Register src2) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode_q(dst, src1, src2);
+ int encode = vex_prefix_0F38_and_encode_q_legacy(dst, src1, src2);
emit_int8((unsigned char)0xF2);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::andnq(Register dst, Register src1, Address src2) {
- if (VM_Version::supports_evex()) {
- tuple_type = EVEX_T1S;
- input_size_in_bits = EVEX_64bit;
- }
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38_q(dst, src1, src2);
+ vex_prefix_0F38_q_legacy(dst, src1, src2);
emit_int8((unsigned char)0xF2);
emit_operand(dst, src2);
}
@@ -6311,7 +6475,7 @@
void Assembler::blsiq(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode_q(rbx, dst, src);
+ int encode = vex_prefix_0F38_and_encode_q_legacy(rbx, dst, src);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -6319,14 +6483,14 @@
void Assembler::blsiq(Register dst, Address src) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38_q(rbx, dst, src);
+ vex_prefix_0F38_q_legacy(rbx, dst, src);
emit_int8((unsigned char)0xF3);
emit_operand(rbx, src);
}
void Assembler::blsmskq(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode_q(rdx, dst, src);
+ int encode = vex_prefix_0F38_and_encode_q_legacy(rdx, dst, src);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -6334,14 +6498,14 @@
void Assembler::blsmskq(Register dst, Address src) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38_q(rdx, dst, src);
+ vex_prefix_0F38_q_legacy(rdx, dst, src);
emit_int8((unsigned char)0xF3);
emit_operand(rdx, src);
}
void Assembler::blsrq(Register dst, Register src) {
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- int encode = vex_prefix_0F38_and_encode_q(rcx, dst, src);
+ int encode = vex_prefix_0F38_and_encode_q_legacy(rcx, dst, src);
emit_int8((unsigned char)0xF3);
emit_int8((unsigned char)(0xC0 | encode));
}
@@ -6349,7 +6513,7 @@
void Assembler::blsrq(Register dst, Address src) {
InstructionMark im(this);
assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
- vex_prefix_0F38_q(rcx, dst, src);
+ vex_prefix_0F38_q_legacy(rcx, dst, src);
emit_int8((unsigned char)0xF3);
emit_operand(rcx, src);
}
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -661,6 +661,14 @@
vector_len, no_mask_reg);
}
+ void vex_prefix_0F38_legacy(Register dst, Register nds, Address src, bool no_mask_reg = false) { // prefix for a 0F38-map GPR instruction with a memory operand, legacy_mode forced on
+ bool vex_w = false; // 32-bit operand size (the _q_ variant uses true)
+ int vector_len = AVX_128bit;
+ vex_prefix(src, nds->encoding(), dst->encoding(),
+ VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
+ vector_len, true, no_mask_reg); // 'true' is legacy_mode; presumably keeps these BMI instructions VEX-encoded rather than EVEX - confirm in vex_prefix
+ }
+
void vex_prefix_0F38_q(Register dst, Register nds, Address src, bool no_mask_reg = false) {
bool vex_w = true;
int vector_len = AVX_128bit;
@@ -668,6 +676,15 @@
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
vector_len, no_mask_reg);
}
+
+ void vex_prefix_0F38_q_legacy(Register dst, Register nds, Address src, bool no_mask_reg = false) { // prefix for a 64-bit 0F38-map GPR instruction with a memory operand, legacy_mode forced on
+ bool vex_w = true; // 64-bit operand size
+ int vector_len = AVX_128bit;
+ vex_prefix(src, nds->encoding(), dst->encoding(),
+ VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w,
+ vector_len, true, no_mask_reg); // 'true' is legacy_mode; presumably keeps these BMI instructions VEX-encoded rather than EVEX - confirm in vex_prefix
+ }
+
int vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
VexSimdPrefix pre, VexOpcode opc,
bool vex_w, int vector_len,
@@ -680,6 +697,15 @@
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
false, no_mask_reg);
}
+
+ int vex_prefix_0F38_and_encode_legacy(Register dst, Register nds, Register src, bool no_mask_reg = false) { // register-register 0F38-map prefix with legacy_mode forced on; returns the value callers OR into the ModRM byte
+ bool vex_w = false; // 32-bit operand size (the _q_ variant uses true)
+ int vector_len = AVX_128bit;
+ return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+ VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
+ true, no_mask_reg); // 'true' is legacy_mode (the non-legacy sibling passes false here)
+ }
+
int vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src, bool no_mask_reg = false) {
bool vex_w = true;
int vector_len = AVX_128bit;
@@ -687,6 +713,15 @@
VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
false, no_mask_reg);
}
+
+ int vex_prefix_0F38_and_encode_q_legacy(Register dst, Register nds, Register src, bool no_mask_reg = false) { // 64-bit register-register 0F38-map prefix with legacy_mode forced on; returns the value callers OR into the ModRM byte
+ bool vex_w = true; // 64-bit operand size
+ int vector_len = AVX_128bit;
+ return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+ VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector_len,
+ true, no_mask_reg); // 'true' is legacy_mode (the non-legacy sibling passes false here)
+ }
+
int vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
VexSimdPrefix pre, int vector_len = AVX_128bit,
VexOpcode opc = VEX_OPCODE_0F, bool legacy_mode = false,
@@ -1666,6 +1701,8 @@
// Shift Right by bytes Logical DoubleQuadword Immediate
void psrldq(XMMRegister dst, int shift);
+ // Shift Left by bytes Logical DoubleQuadword Immediate
+ void pslldq(XMMRegister dst, int shift);
// Logical Compare 128bit
void ptest(XMMRegister dst, XMMRegister src);
@@ -2024,8 +2061,25 @@
// duplicate 4-bytes integer data from src into 8 locations in dest
void vpbroadcastd(XMMRegister dst, XMMRegister src);
- // duplicate 4-bytes integer data from src into vector_len locations in dest
+ // duplicate n-byte integer data from src into every element of dest; vector_len selects the vector width
+ void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
+ void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
+ void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
+ void evpbroadcastw(XMMRegister dst, Address src, int vector_len);
void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
+ void evpbroadcastd(XMMRegister dst, Address src, int vector_len);
+ void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
+ void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
+
+ void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
+ void evpbroadcastss(XMMRegister dst, Address src, int vector_len);
+ void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
+ void evpbroadcastsd(XMMRegister dst, Address src, int vector_len);
+
+ void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
+ void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
+ void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
+ void evpbroadcastq(XMMRegister dst, Register src, int vector_len);
// Carry-Less Multiplication Quadword
void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
--- a/hotspot/src/cpu/x86/vm/c2_init_x86.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/c2_init_x86.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -58,4 +58,6 @@
OptoReg::invalidate(i);
}
}
+
+ SuperWordLoopUnrollAnalysis = true;
}
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -365,22 +365,22 @@
map->set_callee_saved(STACK_OFFSET(xmm14H_off), xmm14->as_VMReg()->next());
map->set_callee_saved(STACK_OFFSET(xmm15H_off), xmm15->as_VMReg()->next());
if (UseAVX > 2) {
- map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg());
- map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg());
+ map->set_callee_saved(STACK_OFFSET(xmm16H_off), xmm16->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm17H_off), xmm17->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm18H_off), xmm18->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm19H_off), xmm19->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm20H_off), xmm20->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm21H_off), xmm21->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm22H_off), xmm22->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm23H_off), xmm23->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm24H_off), xmm24->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm25H_off), xmm25->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm26H_off), xmm26->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm27H_off), xmm27->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm28H_off), xmm28->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm29H_off), xmm29->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm30H_off), xmm30->as_VMReg()->next());
+ map->set_callee_saved(STACK_OFFSET(xmm31H_off), xmm31->as_VMReg()->next());
}
}
@@ -466,7 +466,7 @@
__ vinsertf64x4h(xmm29, Address(rsp, 928));
__ vinsertf64x4h(xmm30, Address(rsp, 960));
__ vinsertf64x4h(xmm31, Address(rsp, 992));
- __ subptr(rsp, 1024);
+ __ addptr(rsp, 1024);
}
}
#else
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -2727,6 +2727,167 @@
return start;
}
+ // pshufb shuffle mask that exchanges the two 8-byte halves of a 16-byte value
+ address generate_ghash_long_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+ address start = __ pc();
+ __ emit_data(0x0b0a0908, relocInfo::none, 0); // mask bytes 0-3 select source bytes 8-11
+ __ emit_data(0x0f0e0d0c, relocInfo::none, 0); // mask bytes 4-7 select source bytes 12-15
+ __ emit_data(0x03020100, relocInfo::none, 0); // mask bytes 8-11 select source bytes 0-3
+ __ emit_data(0x07060504, relocInfo::none, 0); // mask bytes 12-15 select source bytes 4-7
+
+ return start; // address of the 16-byte mask emitted above
+ }
+
+ // pshufb shuffle mask that reverses the order of all 16 bytes of a value
+ address generate_ghash_byte_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+ address start = __ pc();
+ __ emit_data(0x0c0d0e0f, relocInfo::none, 0); // mask bytes 0-3 select source bytes 15-12
+ __ emit_data(0x08090a0b, relocInfo::none, 0); // mask bytes 4-7 select source bytes 11-8
+ __ emit_data(0x04050607, relocInfo::none, 0); // mask bytes 8-11 select source bytes 7-4
+ __ emit_data(0x00010203, relocInfo::none, 0); // mask bytes 12-15 select source bytes 3-0
+ return start; // address of the 16-byte mask emitted above
+ }
+
+ /* Single and multi-block ghash: per 16-byte block, state = (state ^ block) carry-less multiplied by subkeyH, then reduced */
+ address generate_ghash_processBlocks() {
+ assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support");
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_exit;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ const Register state = rdi;
+ const Register subkeyH = rsi;
+ const Register data = rdx;
+ const Register blocks = rcx;
+
+ const Address state_param(rbp, 8+0); // the four arguments are read from the stack, relative to rbp
+ const Address subkeyH_param(rbp, 8+4);
+ const Address data_param(rbp, 8+8);
+ const Address blocks_param(rbp, 8+12);
+
+ const XMMRegister xmm_temp0 = xmm0;
+ const XMMRegister xmm_temp1 = xmm1;
+ const XMMRegister xmm_temp2 = xmm2;
+ const XMMRegister xmm_temp3 = xmm3;
+ const XMMRegister xmm_temp4 = xmm4;
+ const XMMRegister xmm_temp5 = xmm5;
+ const XMMRegister xmm_temp6 = xmm6;
+ const XMMRegister xmm_temp7 = xmm7;
+
+ __ enter();
+
+ __ movptr(state, state_param);
+ __ movptr(subkeyH, subkeyH_param);
+ __ movptr(data, data_param);
+ __ movptr(blocks, blocks_param);
+
+ __ movdqu(xmm_temp0, Address(state, 0)); // xmm0 = current 16-byte state
+ __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+ __ movdqu(xmm_temp1, Address(subkeyH, 0)); // xmm1 = hash subkey, loaded once before the loop
+ __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+
+ __ BIND(L_ghash_loop);
+ __ movdqu(xmm_temp2, Address(data, 0)); // xmm2 = next 16-byte data block
+ __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+ __ pxor(xmm_temp0, xmm_temp2); // fold the data block into the running state
+
+ //
+ // Multiply with the hash key
+ //
+ __ movdqu(xmm_temp3, xmm_temp0);
+ __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
+ __ movdqu(xmm_temp4, xmm_temp0);
+ __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
+
+ __ movdqu(xmm_temp5, xmm_temp0);
+ __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
+ __ movdqu(xmm_temp6, xmm_temp0);
+ __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
+
+ __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
+
+ __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
+ __ psrldq(xmm_temp4, 8); // shift xmm4 right by 64 bits (8 bytes)
+ __ pslldq(xmm_temp5, 8); // shift xmm5 left by 64 bits (8 bytes)
+ __ pxor(xmm_temp3, xmm_temp5);
+ __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication by one bit position
+ // to the left to compensate for the fact that the bits are reversed.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp6);
+ __ pslld (xmm_temp3, 1);
+ __ pslld(xmm_temp6, 1);
+ __ psrld(xmm_temp7, 31);
+ __ psrld(xmm_temp4, 31);
+ __ movdqu(xmm_temp5, xmm_temp7);
+ __ pslldq(xmm_temp4, 4);
+ __ pslldq(xmm_temp7, 4);
+ __ psrldq(xmm_temp5, 12);
+ __ por(xmm_temp3, xmm_temp7);
+ __ por(xmm_temp6, xmm_temp4);
+ __ por(xmm_temp6, xmm_temp5);
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts
+ // independently.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ pslld(xmm_temp7, 31); // packed 32-bit left shift: << 31
+ __ pslld(xmm_temp4, 30); // packed 32-bit left shift: << 30
+ __ pslld(xmm_temp5, 25); // packed 32-bit left shift: << 25
+ __ pxor(xmm_temp7, xmm_temp4); // xor the shifted versions
+ __ pxor(xmm_temp7, xmm_temp5);
+ __ movdqu(xmm_temp4, xmm_temp7);
+ __ pslldq(xmm_temp7, 12);
+ __ psrldq(xmm_temp4, 4);
+ __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these
+ // shift operations.
+ __ movdqu(xmm_temp2, xmm_temp3);
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ psrld(xmm_temp2, 1); // packed 32-bit right shift: >> 1
+ __ psrld(xmm_temp7, 2); // packed 32-bit right shift: >> 2
+ __ psrld(xmm_temp5, 7); // packed 32-bit right shift: >> 7
+ __ pxor(xmm_temp2, xmm_temp7); // xor the shifted versions
+ __ pxor(xmm_temp2, xmm_temp5);
+ __ pxor(xmm_temp2, xmm_temp4);
+ __ pxor(xmm_temp3, xmm_temp2);
+ __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
+
+ __ decrement(blocks); // NOTE(review): count is tested only after a block is processed; blocks == 0 on entry would not exit here - confirm callers pass >= 1
+ __ jcc(Assembler::zero, L_exit);
+ __ movdqu(xmm_temp0, xmm_temp6); // the result becomes the running state for the next block
+ __ addptr(data, 16); // advance to the next 16-byte block
+ __ jmp(L_ghash_loop);
+
+ __ BIND(L_exit);
+ // Swap the 8-byte halves of the 16-byte result back before storing
+ __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
+ __ movdqu(Address(state, 0), xmm_temp6); // store the result
+
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
/**
* Arguments:
*
@@ -3026,6 +3187,13 @@
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
}
+ // Generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+ StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
+
// Safefetch stubs.
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
&StubRoutines::_safefetch32_fault_pc,
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -382,7 +382,12 @@
// restore regs belonging to calling function
#ifdef _WIN64
- for (int i = 15; i >= 6; i--) {
+ int xmm_ub = 15;
+ if (UseAVX > 2) {
+ xmm_ub = 31;
+ }
+ // emit the restores for xmm regs
+ for (int i = 6; i <= xmm_ub; i++) {
__ movdqu(as_XMMRegister(i), xmm_save(i));
}
#endif
@@ -3681,6 +3686,175 @@
return start;
}
+
+ // pshufb shuffle mask that exchanges the two 8-byte halves of a 16-byte value
+ address generate_ghash_long_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
+ address start = __ pc();
+ __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none ); // mask bytes 0-7 select source bytes 8-15
+ __ emit_data64(0x0706050403020100, relocInfo::none ); // mask bytes 8-15 select source bytes 0-7
+ return start; // address of the 16-byte mask emitted above
+ }
+
+ // pshufb shuffle mask that reverses the order of all 16 bytes of a value
+ address generate_ghash_byte_swap_mask() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
+ address start = __ pc();
+ __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none ); // mask bytes 0-7 select source bytes 15-8
+ __ emit_data64(0x0001020304050607, relocInfo::none ); // mask bytes 8-15 select source bytes 7-0
+ return start; // address of the 16-byte mask emitted above
+ }
+
+ /* Single and multi-block ghash: per 16-byte block, state = (state ^ block) carry-less multiplied by subkeyH, then reduced */
+ address generate_ghash_processBlocks() {
+ __ align(CodeEntryAlignment);
+ Label L_ghash_loop, L_exit;
+ StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
+ address start = __ pc();
+
+ const Register state = c_rarg0;
+ const Register subkeyH = c_rarg1;
+ const Register data = c_rarg2;
+ const Register blocks = c_rarg3;
+
+#ifdef _WIN64
+ const int XMM_REG_LAST = 10; // highest xmm register this stub saves and restores on Win64
+#endif
+
+ const XMMRegister xmm_temp0 = xmm0;
+ const XMMRegister xmm_temp1 = xmm1;
+ const XMMRegister xmm_temp2 = xmm2;
+ const XMMRegister xmm_temp3 = xmm3;
+ const XMMRegister xmm_temp4 = xmm4;
+ const XMMRegister xmm_temp5 = xmm5;
+ const XMMRegister xmm_temp6 = xmm6;
+ const XMMRegister xmm_temp7 = xmm7;
+ const XMMRegister xmm_temp8 = xmm8;
+ const XMMRegister xmm_temp9 = xmm9;
+ const XMMRegister xmm_temp10 = xmm10;
+
+ __ enter();
+
+#ifdef _WIN64
+ // save xmm6 through xmm10, which must be preserved for the caller
+ __ subptr(rsp, -rsp_after_call_off * wordSize);
+ for (int i = 6; i <= XMM_REG_LAST; i++) {
+ __ movdqu(xmm_save(i), as_XMMRegister(i));
+ }
+#endif
+
+ __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); // xmm10 keeps the long-swap mask for the whole stub
+
+ __ movdqu(xmm_temp0, Address(state, 0)); // xmm0 = current 16-byte state
+ __ pshufb(xmm_temp0, xmm_temp10);
+
+
+ __ BIND(L_ghash_loop);
+ __ movdqu(xmm_temp2, Address(data, 0)); // xmm2 = next 16-byte data block
+ __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
+
+ __ movdqu(xmm_temp1, Address(subkeyH, 0)); // NOTE(review): xmm1 and xmm10 are not modified in the loop, so this reload looks loop-invariant and hoistable
+ __ pshufb(xmm_temp1, xmm_temp10);
+
+ __ pxor(xmm_temp0, xmm_temp2); // fold the data block into the running state
+
+ //
+ // Multiply with the hash key
+ //
+ __ movdqu(xmm_temp3, xmm_temp0);
+ __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0
+ __ movdqu(xmm_temp4, xmm_temp0);
+ __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1
+
+ __ movdqu(xmm_temp5, xmm_temp0);
+ __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0
+ __ movdqu(xmm_temp6, xmm_temp0);
+ __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1
+
+ __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0
+
+ __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5
+ __ psrldq(xmm_temp4, 8); // shift xmm4 right by 64 bits (8 bytes)
+ __ pslldq(xmm_temp5, 8); // shift xmm5 left by 64 bits (8 bytes)
+ __ pxor(xmm_temp3, xmm_temp5);
+ __ pxor(xmm_temp6, xmm_temp4); // Register pair <xmm6:xmm3> holds the result
+ // of the carry-less multiplication of
+ // xmm0 by xmm1.
+
+ // We shift the result of the multiplication by one bit position
+ // to the left to compensate for the fact that the bits are reversed.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp8, xmm_temp6);
+ __ pslld(xmm_temp3, 1);
+ __ pslld(xmm_temp6, 1);
+ __ psrld(xmm_temp7, 31);
+ __ psrld(xmm_temp8, 31);
+ __ movdqu(xmm_temp9, xmm_temp7);
+ __ pslldq(xmm_temp8, 4);
+ __ pslldq(xmm_temp7, 4);
+ __ psrldq(xmm_temp9, 12);
+ __ por(xmm_temp3, xmm_temp7);
+ __ por(xmm_temp6, xmm_temp8);
+ __ por(xmm_temp6, xmm_temp9);
+
+ //
+ // First phase of the reduction
+ //
+ // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
+ // independently.
+ __ movdqu(xmm_temp7, xmm_temp3);
+ __ movdqu(xmm_temp8, xmm_temp3);
+ __ movdqu(xmm_temp9, xmm_temp3);
+ __ pslld(xmm_temp7, 31); // packed 32-bit left shift: << 31
+ __ pslld(xmm_temp8, 30); // packed 32-bit left shift: << 30
+ __ pslld(xmm_temp9, 25); // packed 32-bit left shift: << 25
+ __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions
+ __ pxor(xmm_temp7, xmm_temp9);
+ __ movdqu(xmm_temp8, xmm_temp7);
+ __ pslldq(xmm_temp7, 12);
+ __ psrldq(xmm_temp8, 4);
+ __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete
+
+ //
+ // Second phase of the reduction
+ //
+ // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
+ // shift operations.
+ __ movdqu(xmm_temp2, xmm_temp3);
+ __ movdqu(xmm_temp4, xmm_temp3);
+ __ movdqu(xmm_temp5, xmm_temp3);
+ __ psrld(xmm_temp2, 1); // packed 32-bit right shift: >> 1
+ __ psrld(xmm_temp4, 2); // packed 32-bit right shift: >> 2
+ __ psrld(xmm_temp5, 7); // packed 32-bit right shift: >> 7
+ __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions
+ __ pxor(xmm_temp2, xmm_temp5);
+ __ pxor(xmm_temp2, xmm_temp8);
+ __ pxor(xmm_temp3, xmm_temp2);
+ __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6
+
+ __ decrement(blocks); // NOTE(review): count is tested only after a block is processed; blocks == 0 on entry would not exit here - confirm callers pass >= 1
+ __ jcc(Assembler::zero, L_exit);
+ __ movdqu(xmm_temp0, xmm_temp6); // the result becomes the running state for the next block
+ __ addptr(data, 16); // advance to the next 16-byte block
+ __ jmp(L_ghash_loop);
+
+ __ BIND(L_exit);
+ __ pshufb(xmm_temp6, xmm_temp10); // Swap the 8-byte halves of the 16-byte result back
+ __ movdqu(Address(state, 0), xmm_temp6); // store the result
+
+#ifdef _WIN64
+ // restore xmm regs belonging to calling function
+ for (int i = 6; i <= XMM_REG_LAST; i++) {
+ __ movdqu(as_XMMRegister(i), xmm_save(i));
+ }
+#endif
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
/**
* Arguments:
*
@@ -4120,6 +4294,13 @@
StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
}
+ // Generate GHASH intrinsics code
+ if (UseGHASHIntrinsics) {
+ StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
+ StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
+ StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
+ }
+
// Safefetch stubs.
generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
&StubRoutines::_safefetch32_fault_pc,
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -33,6 +33,8 @@
address StubRoutines::x86::_verify_mxcsr_entry = NULL;
address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
+address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
+address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
uint64_t StubRoutines::x86::_crc_by128_masks[] =
{
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -36,10 +36,15 @@
// masks and table for CRC32
static uint64_t _crc_by128_masks[];
static juint _crc_table[];
+ // swap mask for ghash
+ static address _ghash_long_swap_mask_addr;
+ static address _ghash_byte_swap_mask_addr;
public:
static address verify_mxcsr_entry() { return _verify_mxcsr_entry; }
static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
+ static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
+ static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
#endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86_64.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -33,7 +33,7 @@
enum platform_dependent_constants {
code_size1 = 19000, // simply increase if too small (assembler will crash if too small)
- code_size2 = 23000 // simply increase if too small (assembler will crash if too small)
+ code_size2 = 24000 // simply increase if too small (assembler will crash if too small)
};
class x86 {
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -677,6 +677,17 @@
FLAG_SET_DEFAULT(UseAESIntrinsics, false);
}
+ // GHASH/GCM intrinsics
+ if (UseCLMUL && (UseSSE > 2)) {
+ if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) {
+ UseGHASHIntrinsics = true;
+ }
+ } else if (UseGHASHIntrinsics) {
+ if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics))
+ warning("GHASH intrinsic requires CLMUL and SSE2 instructions on this CPU");
+ FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
+ }
+
if (UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -702,6 +702,7 @@
static bool supports_avx512cd() { return (_cpuFeatures & CPU_AVX512CD) != 0; }
static bool supports_avx512bw() { return (_cpuFeatures & CPU_AVX512BW) != 0; }
static bool supports_avx512vl() { return (_cpuFeatures & CPU_AVX512VL) != 0; }
+ static bool supports_avx512vlbw() { return (supports_avx512bw() && supports_avx512vl()); }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/hotspot/src/cpu/x86/vm/x86.ad Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/cpu/x86/vm/x86.ad Thu Jun 25 09:48:50 2015 -0700
@@ -2894,6 +2894,457 @@
ins_pipe( pipe_slow );
%}
+// ====================LEGACY REPLICATE=======================================
+
+instruct Repl4B_mem(vecS dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "punpcklbw $dst,$mem\n\t"
+ "pshuflw $dst,$dst,0x00\t! replicate4B" %}
+ ins_encode %{
+ __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_mem(vecD dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "punpcklbw $dst,$mem\n\t"
+ "pshuflw $dst,$dst,0x00\t! replicate8B" %}
+ ins_encode %{
+ __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB src));
+ format %{ "movd $dst,$src\n\t"
+ "punpcklbw $dst,$dst\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\t! replicate16B" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "punpcklbw $dst,$mem\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\t! replicate16B" %}
+ ins_encode %{
+ __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB src));
+ format %{ "movd $dst,$src\n\t"
+ "punpcklbw $dst,$dst\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "punpcklbw $dst,$mem\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
+ ins_encode %{
+ __ punpcklbw($dst$$XMMRegister, $mem$$Address);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 32 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate32B($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1))); // load the 8-byte constant built by replicate8_imm
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister); // duplicate the low 64 bits into the high 64 bits
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister); // presumably copies the low 128 bits into the upper 128 bits - confirm against the assembler
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S(vecD dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "movd $dst,$src\n\t"
+ "pshuflw $dst,$dst,0x00\t! replicate4S" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S_mem(vecD dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "pshuflw $dst,$mem,0x00\t! replicate4S" %}
+ ins_encode %{
+ __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "movd $dst,$src\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\t! replicate8S" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 0 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "pshuflw $dst,$mem,0x00\n\t"
+ "punpcklqdq $dst,$dst\t! replicate8S" %}
+ ins_encode %{
+ __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_imm(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "movd $dst,$src\n\t"
+ "pshuflw $dst,$dst,0x00\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "pshuflw $dst,$mem,0x00\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
+ ins_encode %{
+ __ pshuflw($dst$$XMMRegister, $mem$$Address, 0x00);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 16 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI src));
+ format %{ "movd $dst,$src\n\t"
+ "pshufd $dst,$dst,0x00\t! replicate4I" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI (LoadI mem)));
+ format %{ "pshufd $dst,$mem,0x00\t! replicate4I" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI src));
+ format %{ "movd $dst,$src\n\t"
+ "pshufd $dst,$dst,0x00\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI (LoadI mem)));
+ format %{ "pshufd $dst,$mem,0x00\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_imm(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate4I($con)\n\t"
+ "punpcklqdq $dst,$dst" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// A long can be loaded into an xmm register directly from memory.
+instruct Repl2L_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 2 && !VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateL (LoadL mem)));
+ format %{ "movq $dst,$mem\n\t"
+ "punpcklqdq $dst,$dst\t! replicate2L" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $mem$$Address);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// Replicate a long (8-byte) scalar into a vector
+#ifdef _LP64
+instruct Repl4L(vecY dst, rRegL src) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL src));
+ format %{ "movdq $dst,$src\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+ ins_encode %{
+ __ movdq($dst$$XMMRegister, $src$$Register);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL src));
+ effect(TEMP dst, USE src, TEMP tmp);
+ format %{ "movdl $dst,$src.lo\n\t"
+ "movdl $tmp,$src.hi\n\t"
+ "punpckldq $dst,$tmp\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+ ins_encode %{
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+ __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+instruct Repl4L_imm(vecY dst, immL con) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress($con));
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL (LoadL mem)));
+ format %{ "movq $dst,$mem\n\t"
+ "punpcklqdq $dst,$dst\n\t"
+ "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $mem$$Address);
+ __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
+ __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl2F_mem(vecD dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF (LoadF mem)));
+ format %{ "pshufd $dst,$mem,0x00\t! replicate2F" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4F_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF (LoadF mem)));
+ format %{ "pshufd $dst,$mem,0x00\t! replicate4F" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F(vecY dst, regF src) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF src));
+ format %{ "pshufd $dst,$src,0x00\n\t"
+ "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
+ __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF (LoadF mem)));
+ format %{ "pshufd $dst,$mem,0x00\n\t"
+ "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x00);
+ __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl2D_mem(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 2 && UseAVX > 0 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateD (LoadD mem)));
+ format %{ "pshufd $dst,$mem,0x44\t! replicate2D" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D(vecY dst, regD src) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateD src));
+ format %{ "pshufd $dst,$src,0x44\n\t"
+ "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
+ __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D_mem(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && !VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateD (LoadD mem)));
+ format %{ "pshufd $dst,$mem,0x44\n\t"
+ "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
+ ins_encode %{
+ __ pshufd($dst$$XMMRegister, $mem$$Address, 0x44);
+ __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+// ====================GENERIC REPLICATE==========================================
+
// Replicate byte scalar to be vector
instruct Repl4B(vecS dst, rRegI src) %{
predicate(n->as_Vector()->length() == 4);
@@ -2923,60 +3374,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl16B(vecX dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateB src));
- format %{ "movd $dst,$src\n\t"
- "punpcklbw $dst,$dst\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\t! replicate16B" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl32B(vecY dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 32);
- match(Set dst (ReplicateB src));
- format %{ "movd $dst,$src\n\t"
- "punpcklbw $dst,$dst\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate32B" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl64B(vecZ dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 64);
- match(Set dst (ReplicateB src));
- format %{ "movd $dst,$src\n\t"
- "punpcklbw $dst,$dst\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate32B\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate632B" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ punpcklbw($dst$$XMMRegister, $dst$$XMMRegister);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate byte scalar immediate to be vector by loading from const table.
instruct Repl4B_imm(vecS dst, immI con) %{
predicate(n->as_Vector()->length() == 4);
@@ -2998,48 +3395,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl16B_imm(vecX dst, immI con) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateB con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\t! replicate16B($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl32B_imm(vecY dst, immI con) %{
- predicate(n->as_Vector()->length() == 32);
- match(Set dst (ReplicateB con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lreplicate32B($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl64B_imm(vecZ dst, immI con) %{
- predicate(n->as_Vector()->length() == 64);
- match(Set dst (ReplicateB con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate32B($con)\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate32B($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate byte scalar zero to be vector
instruct Repl4B_zero(vecS dst, immI0 zero) %{
predicate(n->as_Vector()->length() == 4);
@@ -3083,18 +3438,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl64B_zero(vecZ dst, immI0 zero) %{
- predicate(n->as_Vector()->length() == 64);
- match(Set dst (ReplicateB zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate64B zero" %}
- ins_encode %{
- // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
// Replicate char/short (2 byte) scalar to be vector
instruct Repl2S(vecS dst, rRegI src) %{
predicate(n->as_Vector()->length() == 2);
@@ -3108,66 +3451,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl4S(vecD dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateS src));
- format %{ "movd $dst,$src\n\t"
- "pshuflw $dst,$dst,0x00\t! replicate4S" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl8S(vecX dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateS src));
- format %{ "movd $dst,$src\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\t! replicate8S" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16S(vecY dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateS src));
- format %{ "movd $dst,$src\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate16S" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl32S(vecZ dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 32);
- match(Set dst (ReplicateS src));
- format %{ "movd $dst,$src\n\t"
- "pshuflw $dst,$dst,0x00\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate16S\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshuflw($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate char/short (2 byte) scalar immediate to be vector by loading from const table.
instruct Repl2S_imm(vecS dst, immI con) %{
predicate(n->as_Vector()->length() == 2);
@@ -3189,48 +3472,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl8S_imm(vecX dst, immI con) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateS con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\t! replicate8S($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16S_imm(vecY dst, immI con) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateS con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate16S($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl32S_imm(vecZ dst, immI con) %{
- predicate(n->as_Vector()->length() == 32);
- match(Set dst (ReplicateS con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate16S($con)\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate16S($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate char/short (2 byte) scalar zero to be vector
instruct Repl2S_zero(vecS dst, immI0 zero) %{
predicate(n->as_Vector()->length() == 2);
@@ -3274,18 +3515,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl32S_zero(vecZ dst, immI0 zero) %{
- predicate(n->as_Vector()->length() == 32);
- match(Set dst (ReplicateS zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate32S zero" %}
- ins_encode %{
- // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
// Replicate integer (4 byte) scalar to be vector
instruct Repl2I(vecD dst, rRegI src) %{
predicate(n->as_Vector()->length() == 2);
@@ -3299,101 +3528,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl4I(vecX dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateI src));
- format %{ "movd $dst,$src\n\t"
- "pshufd $dst,$dst,0x00\t! replicate4I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I(vecY dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateI src));
- format %{ "movd $dst,$src\n\t"
- "pshufd $dst,$dst,0x00\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I(vecZ dst, rRegI src) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateI src));
- format %{ "movd $dst,$src\n\t"
- "pshufd $dst,$dst,0x00\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
-instruct Repl2I_imm(vecD dst, immI con) %{
- predicate(n->as_Vector()->length() == 2);
- match(Set dst (ReplicateI con));
- format %{ "movq $dst,[$constantaddress]\t! replicate2I($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
-instruct Repl4I_imm(vecX dst, immI con) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateI con));
- format %{ "movq $dst,[$constantaddress]\t! replicate4I($con)\n\t"
- "punpcklqdq $dst,$dst" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I_imm(vecY dst, immI con) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateI con));
- format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I_imm(vecZ dst, immI con) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateI con));
- format %{ "movq $dst,[$constantaddress]\t! replicate16I($con)\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\n\t"
- "vinserti64x4h $dst k0,$dst,$dst" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Integer could be loaded into xmm register directly from memory.
instruct Repl2I_mem(vecD dst, memory mem) %{
predicate(n->as_Vector()->length() == 2);
@@ -3407,46 +3541,15 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl4I_mem(vecX dst, memory mem) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateI (LoadI mem)));
- format %{ "movd $dst,$mem\n\t"
- "pshufd $dst,$dst,0x00\t! replicate4I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $mem$$Address);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8I_mem(vecY dst, memory mem) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateI (LoadI mem)));
- format %{ "movd $dst,$mem\n\t"
- "pshufd $dst,$dst,0x00\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate8I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $mem$$Address);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16I_mem(vecZ dst, memory mem) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateI (LoadI mem)));
- format %{ "movd $dst,$mem\n\t"
- "pshufd $dst,$dst,0x00\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate8I\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate8I" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $mem$$Address);
- __ pshufd($dst$$XMMRegister, $dst$$XMMRegister, 0x00);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
+// Replicate integer (4 byte) scalar immediate to be vector by loading from const table.
+instruct Repl2I_imm(vecD dst, immI con) %{
+ predicate(n->as_Vector()->length() == 2);
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate2I($con)" %}
+ ins_encode %{
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4))); // one 8-byte load fills both int lanes; replicate8_imm(con, 4) presumably pre-replicates $con at 4-byte width - confirm against the helper
+ %}
+ ins_pipe( fpu_reg_reg );
%}
// Replicate integer (4 byte) scalar zero to be vector
@@ -3482,18 +3585,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl16I_zero(vecZ dst, immI0 zero) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateI zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate16I zero" %}
- ins_encode %{
- // Use vxorpd since AVX does not have vpxor for 512-bit (AVX2 will have it).
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
// Replicate long (8 byte) scalar to be vector
#ifdef _LP64
instruct Repl2L(vecX dst, rRegL src) %{
@@ -3507,36 +3598,6 @@
%}
ins_pipe( pipe_slow );
%}
-
-instruct Repl4L(vecY dst, rRegL src) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateL src));
- format %{ "movdq $dst,$src\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
- ins_encode %{
- __ movdq($dst$$XMMRegister, $src$$Register);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L(vecZ dst, rRegL src) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateL src));
- format %{ "movdq $dst,$src\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
- ins_encode %{
- __ movdq($dst$$XMMRegister, $src$$Register);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
#else // _LP64
instruct Repl2L(vecX dst, eRegL src, regD tmp) %{
predicate(n->as_Vector()->length() == 2);
@@ -3554,45 +3615,6 @@
%}
ins_pipe( pipe_slow );
%}
-
-instruct Repl4L(vecY dst, eRegL src, regD tmp) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateL src));
- effect(TEMP dst, USE src, TEMP tmp);
- format %{ "movdl $dst,$src.lo\n\t"
- "movdl $tmp,$src.hi\n\t"
- "punpckldq $dst,$tmp\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
- __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L(vecZ dst, eRegL src, regD tmp) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateL src));
- effect(TEMP dst, USE src, TEMP tmp);
- format %{ "movdl $dst,$src.lo\n\t"
- "movdl $tmp,$src.hi\n\t"
- "punpckldq $dst,$tmp\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
- ins_encode %{
- __ movdl($dst$$XMMRegister, $src$$Register);
- __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
- __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
#endif // _LP64
// Replicate long (8 byte) scalar immediate to be vector by loading from const table.
@@ -3608,79 +3630,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl4L_imm(vecY dst, immL con) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateL con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate4L($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress($con));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L_imm(vecZ dst, immL con) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateL con));
- format %{ "movq $dst,[$constantaddress]\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate4L($con)\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L($con)" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $constantaddress($con));
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-// Long could be loaded into xmm register directly from memory.
-instruct Repl2L_mem(vecX dst, memory mem) %{
- predicate(n->as_Vector()->length() == 2);
- match(Set dst (ReplicateL (LoadL mem)));
- format %{ "movq $dst,$mem\n\t"
- "punpcklqdq $dst,$dst\t! replicate2L" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $mem$$Address);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl4L_mem(vecY dst, memory mem) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateL (LoadL mem)));
- format %{ "movq $dst,$mem\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! replicate4L" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $mem$$Address);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8L_mem(vecZ dst, memory mem) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateL (LoadL mem)));
- format %{ "movq $dst,$mem\n\t"
- "punpcklqdq $dst,$dst\n\t"
- "vinserti128h $dst,$dst,$dst\t! lower replicate4L\n\t"
- "vinserti64x4h $dst k0,$dst,$dst\t! upper replicate4L" %}
- ins_encode %{
- __ movq($dst$$XMMRegister, $mem$$Address);
- __ punpcklqdq($dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinserti64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate long (8 byte) scalar zero to be vector
instruct Repl2L_zero(vecX dst, immL0 zero) %{
predicate(n->as_Vector()->length() == 2);
@@ -3704,18 +3653,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl8L_zero(vecZ dst, immL0 zero) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateL zero));
- format %{ "vpxor $dst k0,$dst,$dst\t! replicate8L zero" %}
- ins_encode %{
- // Use vxorpd since AVX does not have vpxor for 512-bit (EVEX will have it).
- int vector_len = 2;
- __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
// Replicate float (4 byte) scalar to be vector
instruct Repl2F(vecD dst, regF src) %{
predicate(n->as_Vector()->length() == 2);
@@ -3737,32 +3674,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl8F(vecY dst, regF src) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateF src));
- format %{ "pshufd $dst,$src,0x00\n\t"
- "vinsertf128h $dst,$dst,$dst\t! replicate8F" %}
- ins_encode %{
- __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
- __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl16F(vecZ dst, regF src) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateF src));
- format %{ "pshufd $dst,$src,0x00\n\t"
- "vinsertf128h $dst,$dst,$dst\t! lower replicate8F\n\t"
- "vinsertf64x4h $dst k0,$dst,$dst\t! lower replicate8F" %}
- ins_encode %{
- __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x00);
- __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate float (4 byte) scalar zero to be vector
instruct Repl2F_zero(vecD dst, immF0 zero) %{
predicate(n->as_Vector()->length() == 2);
@@ -3795,17 +3706,6 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl16F_zero(vecZ dst, immF0 zero) %{
- predicate(n->as_Vector()->length() == 16);
- match(Set dst (ReplicateF zero));
- format %{ "vxorps $dst k0,$dst,$dst\t! replicate16F zero" %}
- ins_encode %{
- int vector_len = 2;
- __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
- %}
- ins_pipe( fpu_reg_reg );
-%}
-
// Replicate double (8 bytes) scalar to be vector
instruct Repl2D(vecX dst, regD src) %{
predicate(n->as_Vector()->length() == 2);
@@ -3817,32 +3717,6 @@
ins_pipe( pipe_slow );
%}
-instruct Repl4D(vecY dst, regD src) %{
- predicate(n->as_Vector()->length() == 4);
- match(Set dst (ReplicateD src));
- format %{ "pshufd $dst,$src,0x44\n\t"
- "vinsertf128h $dst,$dst,$dst\t! replicate4D" %}
- ins_encode %{
- __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
- __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
-instruct Repl8D(vecZ dst, regD src) %{
- predicate(n->as_Vector()->length() == 8);
- match(Set dst (ReplicateD src));
- format %{ "pshufd $dst,$src,0x44\n\t"
- "vinsertf128h $dst,$dst,$dst\t! lower replicate4D\n\t"
- "vinsertf64x4h $dst k0,$dst,$dst\t! upper replicate4D" %}
- ins_encode %{
- __ pshufd($dst$$XMMRegister, $src$$XMMRegister, 0x44);
- __ vinsertf128h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- __ vinsertf64x4h($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister);
- %}
- ins_pipe( pipe_slow );
-%}
-
// Replicate double (8 byte) scalar zero to be vector
instruct Repl2D_zero(vecX dst, immD0 zero) %{
predicate(n->as_Vector()->length() == 2);
@@ -3865,8 +3739,636 @@
ins_pipe( fpu_reg_reg );
%}
-instruct Repl8D_zero(vecZ dst, immD0 zero) %{
- predicate(n->as_Vector()->length() == 8);
+// ====================EVEX REPLICATE=============================================
+
+instruct Repl4B_mem_evex(vecS dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "vpbroadcastb $dst,$mem\t! replicate4B" %}
+ ins_encode %{
+ int vector_len = 0; // narrowest encoding; the rules here use 0 for vecS/vecD/vecX, 1 for vecY, 2 for vecZ
+ __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len); // broadcast the byte at $mem to the byte lanes of $dst
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8B_mem_evex(vecD dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "vpbroadcastb $dst,$mem\t! replicate8B" %}
+ ins_encode %{
+ int vector_len = 0; // narrowest encoding; the rules here use 0 for vecS/vecD/vecX, 1 for vecY, 2 for vecZ
+ __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len); // broadcast the byte at $mem to the byte lanes of $dst
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_evex(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB src));
+ format %{ "vpbroadcastb $dst,$src\t! replicate16B" %}
+ ins_encode %{
+ int vector_len = 0; // 0 = 128-bit (vecX) encoding
+ __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len); // broadcast the low byte of general register $src to all 16 byte lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_mem_evex(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "vpbroadcastb $dst,$mem\t! replicate16B" %}
+ ins_encode %{
+ int vector_len = 0; // 0 = 128-bit (vecX) encoding
+ __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len); // broadcast the byte at $mem to all 16 byte lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_evex(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB src));
+ format %{ "vpbroadcastb $dst,$src\t! replicate32B" %}
+ ins_encode %{
+ int vector_len = 1; // 1 = 256-bit (vecY) encoding
+ __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len); // broadcast the low byte of general register $src to all 32 byte lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "vpbroadcastb $dst,$mem\t! replicate32B" %}
+ ins_encode %{
+ int vector_len = 1; // 1 = 256-bit (vecY) encoding
+ __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len); // broadcast the byte at $mem to all 32 byte lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_evex(vecZ dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+ match(Set dst (ReplicateB src));
+ format %{ "vpbroadcastb $dst,$src\t! replicate64B" %}
+ ins_encode %{
+ int vector_len = 2; // 2 = 512-bit (vecZ) encoding
+ __ evpbroadcastb($dst$$XMMRegister, $src$$Register, vector_len); // one broadcast fills all 64 byte lanes from the low byte of $src (no separate lower/upper step)
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 64 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB (LoadB mem)));
+ format %{ "vpbroadcastb $dst,$mem\t! replicate64B" %}
+ ins_encode %{
+ int vector_len = 2; // 2 = 512-bit (vecZ) encoding
+ __ evpbroadcastb($dst$$XMMRegister, $mem$$Address, vector_len); // broadcast the byte at $mem to all 64 byte lanes. NOTE(review): the register form of this rule is gated on UseAVX > 2 instead of supports_avx512vlbw() - confirm which predicate is intended
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16B_imm_evex(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastb $dst,$dst\t! replicate16B" %}
+ ins_encode %{
+ int vector_len = 0; // 0 = 128-bit (vecX) encoding
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1))); // load the 8-byte constant-table entry built from $con (1-byte element width)
+ __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len); // spread the low byte of $dst across all 16 byte lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32B_imm_evex(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 32 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateB con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastb $dst,$dst\t! replicate32B" %}
+ ins_encode %{
+ int vector_len = 1; // 1 = 256-bit (vecY) encoding
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1))); // load the 8-byte constant-table entry built from $con (1-byte element width)
+ __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len); // spread the low byte of $dst across all 32 byte lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_imm_evex(vecZ dst, immI con) %{
+ predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+ match(Set dst (ReplicateB con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastb $dst,$dst\t! replicate64B" %}
+ ins_encode %{
+ int vector_len = 2; // 2 = 512-bit (vecZ) encoding
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 1))); // load the 8-byte constant-table entry built from $con (1-byte element width)
+ __ evpbroadcastb($dst$$XMMRegister, $dst$$XMMRegister, vector_len); // one broadcast of the low byte fills all 64 byte lanes (no separate lower/upper step)
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl64B_zero_evex(vecZ dst, immI0 zero) %{
+ predicate(n->as_Vector()->length() == 64 && UseAVX > 2);
+ match(Set dst (ReplicateB zero));
+ format %{ "vpxor $dst k0,$dst,$dst\t! replicate64B zero" %}
+ ins_encode %{
+ // Zero all 64 bytes by xor-ing $dst with itself; this rule emits vpxor (not vxorpd) with vector_len 2, the vecZ form.
+ int vector_len = 2;
+ __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4S_evex(vecD dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "vpbroadcastw $dst,$src\t! replicate4S" %}
+ ins_encode %{
+ int vector_len = 0; // narrowest encoding; the rules here use 0 for vecD/vecX, 1 for vecY, 2 for vecZ
+ __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len); // broadcast the low 16 bits of general register $src to the short lanes of $dst
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4S_mem_evex(vecD dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "vpbroadcastw $dst,$mem\t! replicate4S" %}
+ ins_encode %{
+ int vector_len = 0; // narrowest encoding; the rules here use 0 for vecD/vecX, 1 for vecY, 2 for vecZ
+ __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len); // broadcast the 16-bit value at $mem to the short lanes of $dst
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_evex(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "vpbroadcastw $dst,$src\t! replicate8S" %}
+ ins_encode %{
+ int vector_len = 0; // 0 = 128-bit (vecX) encoding
+ __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len); // broadcast the low 16 bits of general register $src to all 8 short lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_mem_evex(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "vpbroadcastw $dst,$mem\t! replicate8S" %}
+ ins_encode %{
+ int vector_len = 0; // 0 = 128-bit (vecX) encoding
+ __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len); // broadcast the 16-bit value at $mem to all 8 short lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_evex(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS src));
+ format %{ "vpbroadcastw $dst,$src\t! replicate16S" %}
+ ins_encode %{
+ int vector_len = 1; // 1 = 256-bit (vecY) encoding
+ __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len); // broadcast the low 16 bits of general register $src to all 16 short lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "vpbroadcastw $dst,$mem\t! replicate16S" %}
+ ins_encode %{
+ int vector_len = 1; // 1 = 256-bit (vecY) encoding
+ __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len); // broadcast the 16-bit value at $mem to all 16 short lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_evex(vecZ dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+ match(Set dst (ReplicateS src));
+ format %{ "vpbroadcastw $dst,$src\t! replicate32S" %}
+ ins_encode %{
+ int vector_len = 2; // 2 = 512-bit (vecZ) encoding
+ __ evpbroadcastw($dst$$XMMRegister, $src$$Register, vector_len); // broadcast the low 16 bits of general register $src to all 32 short lanes
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+ match(Set dst (ReplicateS (LoadS mem)));
+ format %{ "vpbroadcastw $dst,$mem\t! replicate32S" %}
+ ins_encode %{
+ int vector_len = 2; // NOTE(review): 512-bit vpbroadcastw is an AVX512BW instruction; confirm UseAVX > 2 guarantees BW on all supported CPUs
+ __ evpbroadcastw($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8S_imm_evex(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastw $dst,$dst\t! replicate8S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+ __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16S_imm_evex(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 16 && VM_Version::supports_avx512vlbw());
+ match(Set dst (ReplicateS con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastw $dst,$dst\t! replicate16S" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2)));
+ __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_imm_evex(vecZ dst, immI con) %{
+ predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+ match(Set dst (ReplicateS con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastw $dst,$dst\t! replicate32S" %}
+ ins_encode %{
+ int vector_len = 2; // NOTE(review): 512-bit vpbroadcastw is an AVX512BW instruction; confirm UseAVX > 2 guarantees BW on all supported CPUs
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 2))); // load the replicated immediate from the constant table
+ __ evpbroadcastw($dst$$XMMRegister, $dst$$XMMRegister, vector_len); // then broadcast from $dst into itself
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl32S_zero_evex(vecZ dst, immI0 zero) %{
+ predicate(n->as_Vector()->length() == 32 && UseAVX > 2);
+ match(Set dst (ReplicateS zero));
+ format %{ "vpxor $dst k0,$dst,$dst\t! replicate32S zero" %}
+ ins_encode %{
+ // XOR $dst with itself to clear it; vpxor is emitted with vector_len = 2, the value the other vecZ rules in this file pass.
+ int vector_len = 2;
+ __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4I_evex(vecX dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI src));
+ format %{ "vpbroadcastd $dst,$src\t! replicate4I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_mem_evex(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI (LoadI mem)));
+ format %{ "vpbroadcastd $dst,$mem\t! replicate4I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_evex(vecY dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI src));
+ format %{ "vpbroadcastd $dst,$src\t! replicate8I" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI (LoadI mem)));
+ format %{ "vpbroadcastd $dst,$mem\t! replicate8I" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_evex(vecZ dst, rRegI src) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateI src));
+ format %{ "vpbroadcastd $dst,$src\t! replicate16I" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastd($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateI (LoadI mem)));
+ format %{ "vpbroadcastd $dst,$mem\t! replicate16I" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastd($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4I_imm_evex(vecX dst, immI con) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate4I($con)\n\t"
+ "vpbroadcastd $dst,$dst\t! replicate4I" %}
+ ins_encode %{
+ int vector_len = 0; // same vector_len the other vecX rules in this file pass
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4))); // load the replicated immediate from the constant table
+ __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len); // then broadcast from $dst into itself
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8I_imm_evex(vecY dst, immI con) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate8I($con)\n\t"
+ "vpbroadcastd $dst,$dst\t! replicate8I" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+ __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_imm_evex(vecZ dst, immI con) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateI con));
+ format %{ "movq $dst,[$constantaddress]\t! replicate16I($con)\n\t"
+ "vpbroadcastd $dst,$dst\t! replicate16I" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ movq($dst$$XMMRegister, $constantaddress(replicate8_imm($con$$constant, 4)));
+ __ evpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16I_zero_evex(vecZ dst, immI0 zero) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateI zero));
+ format %{ "vpxor $dst k0,$dst,$dst\t! replicate16I zero" %}
+ ins_encode %{
+ // XOR $dst with itself to clear it; vpxor is emitted with vector_len = 2, the value the other vecZ rules in this file pass.
+ int vector_len = 2;
+ __ vpxor($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( fpu_reg_reg );
+%}
+
+// Replicate long (8 byte) scalar to be vector
+#ifdef _LP64
+instruct Repl4L_evex(vecY dst, rRegL src) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL src));
+ format %{ "vpbroadcastq $dst,$src\t! replicate4L" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_evex(vecZ dst, rRegL src) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateL src));
+ format %{ "vpbroadcastq $dst,$src\t! replicate8L" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastq($dst$$XMMRegister, $src$$Register, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+#else // _LP64
+instruct Repl4L_evex(vecY dst, eRegL src, regD tmp) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL src));
+ effect(TEMP dst, USE src, TEMP tmp);
+ format %{ "movdl $dst,$src.lo\n\t"
+ "movdl $tmp,$src.hi\n\t"
+ "punpckldq $dst,$tmp\n\t"
+ "vpbroadcastq $dst,$dst\t! replicate4L" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+ __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+ __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_evex(vecZ dst, eRegL src, regD tmp) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateL src));
+ effect(TEMP dst, USE src, TEMP tmp);
+ format %{ "movdl $dst,$src.lo\n\t"
+ "movdl $tmp,$src.hi\n\t"
+ "punpckldq $dst,$tmp\n\t"
+ "vpbroadcastq $dst,$dst\t! replicate8L" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ movdl($dst$$XMMRegister, $src$$Register);
+ __ movdl($tmp$$XMMRegister, HIGH_FROM_LOW($src$$Register));
+ __ punpckldq($dst$$XMMRegister, $tmp$$XMMRegister);
+ __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+#endif // _LP64
+
+instruct Repl4L_imm_evex(vecY dst, immL con) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastq $dst,$dst\t! replicate4L" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ movq($dst$$XMMRegister, $constantaddress($con));
+ __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_imm_evex(vecZ dst, immL con) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateL con));
+ format %{ "movq $dst,[$constantaddress]\n\t"
+ "vpbroadcastq $dst,$dst\t! replicate8L" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ movq($dst$$XMMRegister, $constantaddress($con));
+ __ evpbroadcastq($dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl2L_mem_evex(vecX dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 2 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL (LoadL mem)));
+ format %{ "vpbroadcastq $dst,$mem\t! replicate2L" %}
+ ins_encode %{
+ int vector_len = 0; // same vector_len the other vecX rules in this file pass
+ __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len); // quadword broadcast straight from memory; the format text names the same mnemonic
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4L_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateL (LoadL mem)));
+ format %{ "vpbroadcastq $dst,$mem\t! replicate4L" %}
+ ins_encode %{
+ int vector_len = 1; // same vector_len the other vecY rules in this file pass
+ __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len); // quadword broadcast straight from memory; the format text names the same mnemonic
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateL (LoadL mem)));
+ format %{ "vpbroadcastq $dst,$mem\t! replicate8L" %}
+ ins_encode %{
+ int vector_len = 2; // same vector_len the other vecZ rules in this file pass
+ __ evpbroadcastq($dst$$XMMRegister, $mem$$Address, vector_len); // quadword broadcast straight from memory; the format text names the same mnemonic
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8L_zero_evex(vecZ dst, immL0 zero) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateL zero));
+ format %{ "vpxor $dst k0,$dst,$dst\t! replicate8L zero" %}
+ ins_encode %{
+ // XOR $dst with itself to clear it; vpxor is emitted with vector_len = 2, the value the other vecZ rules in this file pass.
+ int vector_len = 2;
+ __ vpxor($dst$$XMMRegister,$dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl8F_evex(vecY dst, regF src) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF src));
+ format %{ "vbroadcastss $dst,$src\t! replicate8F" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8F_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateF (LoadF mem)));
+ format %{ "vbroadcastss $dst,$mem\t! replicate8F" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_evex(vecZ dst, regF src) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateF src));
+ format %{ "vbroadcastss $dst,$src\t! replicate16F" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastss($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateF (LoadF mem)));
+ format %{ "vbroadcastss $dst,$mem\t! replicate16F" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastss($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl16F_zero_evex(vecZ dst, immF0 zero) %{
+ predicate(n->as_Vector()->length() == 16 && UseAVX > 2);
+ match(Set dst (ReplicateF zero));
+ format %{ "vxorps $dst k0,$dst,$dst\t! replicate16F zero" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ vxorps($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector_len);
+ %}
+ ins_pipe( fpu_reg_reg );
+%}
+
+instruct Repl4D_evex(vecY dst, regD src) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateD src));
+ format %{ "vbroadcastsd $dst,$src\t! replicate4D" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl4D_mem_evex(vecY dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 4 && VM_Version::supports_avx512vl());
+ match(Set dst (ReplicateD (LoadD mem)));
+ format %{ "vbroadcastsd $dst,$mem\t! replicate4D" %}
+ ins_encode %{
+ int vector_len = 1;
+ __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_evex(vecZ dst, regD src) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateD src));
+ format %{ "vbroadcastsd $dst,$src\t! replicate8D" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastsd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_mem_evex(vecZ dst, memory mem) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
+ match(Set dst (ReplicateD (LoadD mem)));
+ format %{ "vbroadcastsd $dst,$mem\t! replicate8D" %}
+ ins_encode %{
+ int vector_len = 2;
+ __ evpbroadcastsd($dst$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct Repl8D_zero_evex(vecZ dst, immD0 zero) %{
+ predicate(n->as_Vector()->length() == 8 && UseAVX > 2);
match(Set dst (ReplicateD zero));
format %{ "vxorpd $dst k0,$dst,$dst,vect512\t! replicate8D zero" %}
ins_encode %{
@@ -4972,6 +5474,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd4B_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+ match(Set dst (AddVB src (LoadVector mem)));
+ format %{ "vpaddb $dst,$src,$mem\t! add packed4B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd8B(vecD dst, vecD src) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (AddVB dst src));
@@ -4993,6 +5506,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd8B_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+ match(Set dst (AddVB src (LoadVector mem)));
+ format %{ "vpaddb $dst,$src,$mem\t! add packed8B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpaddb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd16B(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 16);
match(Set dst (AddVB dst src));
@@ -5091,6 +5615,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd2S_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (AddVS src (LoadVector mem)));
+ format %{ "vpaddw $dst,$src,$mem\t! add packed2S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd4S(vecD dst, vecD src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (AddVS dst src));
@@ -5112,6 +5647,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd4S_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+ match(Set dst (AddVS src (LoadVector mem)));
+ format %{ "vpaddw $dst,$src,$mem\t! add packed4S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpaddw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd8S(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (AddVS dst src));
@@ -5210,6 +5756,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd2I_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (AddVI src (LoadVector mem)));
+ format %{ "vpaddd $dst,$src,$mem\t! add packed2I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpaddd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd4I(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (AddVI dst src));
@@ -5385,6 +5942,17 @@
ins_pipe( pipe_slow );
%}
+instruct vadd2F_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (AddVF src (LoadVector mem)));
+ format %{ "vaddps $dst,$src,$mem\t! add packed2F" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vaddps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vadd4F(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (AddVF dst src));
@@ -5562,6 +6130,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub4B_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+ match(Set dst (SubVB src (LoadVector mem)));
+ format %{ "vpsubb $dst,$src,$mem\t! sub packed4B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub8B(vecD dst, vecD src) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (SubVB dst src));
@@ -5583,6 +6162,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub8B_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 8);
+ match(Set dst (SubVB src (LoadVector mem)));
+ format %{ "vpsubb $dst,$src,$mem\t! sub packed8B" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpsubb($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub16B(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 16);
match(Set dst (SubVB dst src));
@@ -5681,6 +6271,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub2S_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (SubVS src (LoadVector mem)));
+ format %{ "vpsubw $dst,$src,$mem\t! sub packed2S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub4S(vecD dst, vecD src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (SubVS dst src));
@@ -5702,6 +6303,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub4S_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+ match(Set dst (SubVS src (LoadVector mem)));
+ format %{ "vpsubw $dst,$src,$mem\t! sub packed4S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpsubw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub8S(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (SubVS dst src));
@@ -5800,6 +6412,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub2I_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (SubVI src (LoadVector mem)));
+ format %{ "vpsubd $dst,$src,$mem\t! sub packed2I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpsubd($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub4I(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (SubVI dst src));
@@ -5975,6 +6598,17 @@
ins_pipe( pipe_slow );
%}
+instruct vsub2F_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (SubVF src (LoadVector mem)));
+ format %{ "vsubps $dst,$src,$mem\t! sub packed2F" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vsubps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vsub4F(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (SubVF dst src));
@@ -6152,6 +6786,17 @@
ins_pipe( pipe_slow );
%}
+instruct vmul2S_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (MulVS src (LoadVector mem)));
+ format %{ "vpmullw $dst,$src,$mem\t! mul packed2S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vmul4S(vecD dst, vecD src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (MulVS dst src));
@@ -6173,6 +6818,17 @@
ins_pipe( pipe_slow );
%}
+instruct vmul4S_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+ match(Set dst (MulVS src (LoadVector mem)));
+ format %{ "vpmullw $dst,$src,$mem\t! mul packed4S" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpmullw($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vmul8S(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 8);
match(Set dst (MulVS dst src));
@@ -6271,13 +6927,13 @@
ins_pipe( pipe_slow );
%}
-instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
- predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
- match(Set dst (MulVL src1 src2));
- format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
- ins_encode %{
- int vector_len = 0;
- __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+instruct vmul2I_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (MulVI src (LoadVector mem)));
+ format %{ "vpmulld $dst,$src,$mem\t! mul packed2I" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpmulld($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
%}
ins_pipe( pipe_slow );
%}
@@ -6314,6 +6970,28 @@
ins_pipe( pipe_slow );
%}
+instruct vmul2L_reg(vecX dst, vecX src1, vecX src2) %{
+ predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
+ match(Set dst (MulVL src1 src2));
+ format %{ "vpmullq $dst,$src1,$src2\t! mul packed2L" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpmullq($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct vmul2L_mem(vecX dst, vecX src, memory mem) %{
+ predicate(UseAVX > 2 && n->as_Vector()->length() == 2 && VM_Version::supports_avx512dq());
+ match(Set dst (MulVL src (LoadVector mem)));
+ format %{ "vpmullq $dst,$src,$mem\t! mul packed2L" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vmul4L_reg(vecY dst, vecY src1, vecY src2) %{
predicate(UseAVX > 2 && n->as_Vector()->length() == 4 && VM_Version::supports_avx512dq());
match(Set dst (MulVL src1 src2));
@@ -6336,17 +7014,6 @@
ins_pipe( pipe_slow );
%}
-instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
- predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
- match(Set dst (MulVI src1 src2));
- format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
- ins_encode %{
- int vector_len = 1;
- __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
- %}
- ins_pipe( pipe_slow );
-%}
-
instruct vmul8L_reg(vecZ dst, vecZ src1, vecZ src2) %{
predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
match(Set dst (MulVL src1 src2));
@@ -6358,12 +7025,23 @@
ins_pipe( pipe_slow );
%}
-instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
- predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
- match(Set dst (MulVI src1 src2));
- format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
+instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
+ predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
+ match(Set dst (MulVL src (LoadVector mem)));
+ format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
ins_encode %{
int vector_len = 2;
+ __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct vmul8I_reg(vecY dst, vecY src1, vecY src2) %{
+ predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+ match(Set dst (MulVI src1 src2));
+ format %{ "vpmulld $dst,$src1,$src2\t! mul packed8I" %}
+ ins_encode %{
+ int vector_len = 1;
__ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
@@ -6380,13 +7058,13 @@
ins_pipe( pipe_slow );
%}
-instruct vmul8L_mem(vecZ dst, vecZ src, memory mem) %{
- predicate(UseAVX > 2 && n->as_Vector()->length() == 8 && VM_Version::supports_avx512dq());
- match(Set dst (MulVL src (LoadVector mem)));
- format %{ "vpmullq $dst,$src,$mem\t! mul packed8L" %}
+instruct vmul16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
+ predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+ match(Set dst (MulVI src1 src2));
+ format %{ "vpmulld $dst,$src1,$src2\t! mul packed16I" %}
ins_encode %{
int vector_len = 2;
- __ vpmullq($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ __ vpmulld($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
@@ -6424,6 +7102,17 @@
ins_pipe( pipe_slow );
%}
+instruct vmul2F_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (MulVF src (LoadVector mem)));
+ format %{ "vmulps $dst,$src,$mem\t! mul packed2F" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vmulps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vmul4F(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (MulVF dst src));
@@ -6601,6 +7290,17 @@
ins_pipe( pipe_slow );
%}
+instruct vdiv2F_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+ match(Set dst (DivVF src (LoadVector mem)));
+ format %{ "vdivps $dst,$src,$mem\t! div packed2F" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vdivps($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vdiv4F(vecX dst, vecX src) %{
predicate(n->as_Vector()->length() == 4);
match(Set dst (DivVF dst src));
@@ -7878,6 +8578,17 @@
ins_pipe( pipe_slow );
%}
+instruct vand4B_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+ match(Set dst (AndV src (LoadVector mem)));
+ format %{ "vpand $dst,$src,$mem\t! and vectors (4 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vand8B(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (AndV dst src));
@@ -7899,6 +8610,17 @@
ins_pipe( pipe_slow );
%}
+instruct vand8B_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+ match(Set dst (AndV src (LoadVector mem)));
+ format %{ "vpand $dst,$src,$mem\t! and vectors (8 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpand($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vand16B(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (AndV dst src));
@@ -7998,6 +8720,17 @@
ins_pipe( pipe_slow );
%}
+instruct vor4B_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+ match(Set dst (OrV src (LoadVector mem)));
+ format %{ "vpor $dst,$src,$mem\t! or vectors (4 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vor8B(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (OrV dst src));
@@ -8019,6 +8752,17 @@
ins_pipe( pipe_slow );
%}
+instruct vor8B_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+ match(Set dst (OrV src (LoadVector mem)));
+ format %{ "vpor $dst,$src,$mem\t! or vectors (8 bytes)" %}
+ ins_encode %{
+ int vector_len = 0; // rule is selected for 8-byte vectors (vecD operands), mirroring vand8B_mem and vxor8B_mem
+ __ vpor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len); // $dst = $src | [mem], non-destructive three-operand form
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vor16B(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (OrV dst src));
@@ -8118,6 +8862,17 @@
ins_pipe( pipe_slow );
%}
+instruct vxor4B_mem(vecS dst, vecS src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 4);
+ match(Set dst (XorV src (LoadVector mem)));
+ format %{ "vpxor $dst,$src,$mem\t! xor vectors (4 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vxor8B(vecD dst, vecD src) %{
predicate(n->as_Vector()->length_in_bytes() == 8);
match(Set dst (XorV dst src));
@@ -8139,6 +8894,17 @@
ins_pipe( pipe_slow );
%}
+instruct vxor8B_mem(vecD dst, vecD src, memory mem) %{
+ predicate(UseAVX > 0 && n->as_Vector()->length_in_bytes() == 8);
+ match(Set dst (XorV src (LoadVector mem)));
+ format %{ "vpxor $dst,$src,$mem\t! xor vectors (8 bytes)" %}
+ ins_encode %{
+ int vector_len = 0;
+ __ vpxor($dst$$XMMRegister, $src$$XMMRegister, $mem$$Address, vector_len);
+ %}
+ ins_pipe( pipe_slow );
+%}
+
instruct vxor16B(vecX dst, vecX src) %{
predicate(n->as_Vector()->length_in_bytes() == 16);
match(Set dst (XorV dst src));
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -846,6 +846,12 @@
do_name( implCompressMB_name, "implCompressMultiBlock") \
do_signature(implCompressMB_signature, "([BII)I") \
\
+ /* support for com.sun.crypto.provider.GHASH */ \
+ do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \
+ do_intrinsic(_ghash_processBlocks, com_sun_crypto_provider_ghash, processBlocks_name, ghash_processBlocks_signature, F_S) \
+ do_name(processBlocks_name, "processBlocks") \
+ do_signature(ghash_processBlocks_signature, "([BII[J[J)V") \
+ \
/* support for java.util.zip */ \
do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \
do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \
--- a/hotspot/src/share/vm/code/debugInfo.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/code/debugInfo.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -26,6 +26,7 @@
#include "code/debugInfo.hpp"
#include "code/debugInfoRec.hpp"
#include "code/nmethod.hpp"
+#include "oops/oop.inline.hpp"
#include "runtime/handles.inline.hpp"
PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC
@@ -47,6 +48,12 @@
write_int(recorder()->oop_recorder()->find_index(h));
}
+// Reads the next int from the stream, uses it as the index passed to
+// code()->oop_at(), and returns the resulting oop. The result may be NULL;
+// in debug builds it is checked to be either NULL or a well-formed oop.
+oop DebugInfoReadStream::read_oop() {
+  const int oop_index = read_int();
+  oop result = code()->oop_at(oop_index);
+  assert(result->is_oop_or_null(), "oop only");
+  return result;
+}
+
ScopeValue* DebugInfoReadStream::read_object_value() {
int id = read_int();
#ifdef ASSERT
--- a/hotspot/src/share/vm/code/debugInfo.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/code/debugInfo.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -266,11 +266,7 @@
} ;
- oop read_oop() {
- oop o = code()->oop_at(read_int());
- assert(o == NULL || o->is_oop(), "oop only");
- return o;
- }
+ oop read_oop();
Method* read_method() {
Method* o = (Method*)(code()->metadata_at(read_int()));
// is_metadata() is a faster check than is_metaspace_object()
--- a/hotspot/src/share/vm/compiler/compileLog.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/compiler/compileLog.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2002, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2002, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -58,13 +58,15 @@
CompileLog::~CompileLog() {
delete _out; // Close fd in fileStream::~fileStream()
_out = NULL;
+ // Remove the partial log file from disk; this happens after merging in CompileLog::finish_log_on_error
+ unlink(_file); // return value is ignored; must run before _file is freed below
FREE_C_HEAP_ARRAY(char, _identities);
FREE_C_HEAP_ARRAY(char, _file);
}
// see_tag, pop_tag: Override the default do-nothing methods on xmlStream.
-// These methods provide a hook for managing the the extra context markup.
+// These methods provide a hook for managing the extra context markup.
void CompileLog::see_tag(const char* tag, bool push) {
if (_context.size() > 0 && _out != NULL) {
_out->write(_context.base(), _context.size());
--- a/hotspot/src/share/vm/opto/c2_globals.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -193,6 +193,13 @@
product(intx, LoopMaxUnroll, 16, \
"Maximum number of unrolls for main loop") \
\
+ product(bool, SuperWordLoopUnrollAnalysis, false, \
+ "Map number of unrolls for main loop via " \
+ "Superword Level Parallelism analysis") \
+ \
+ notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false, \
+ "Trace what Superword Level Parallelism analysis applies") \
+ \
product(intx, LoopUnrollMin, 4, \
"Minimum number of unroll loop bodies before checking progress" \
"of rounds of unroll,optimize,..") \
--- a/hotspot/src/share/vm/opto/escape.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/opto/escape.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -966,6 +966,7 @@
strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 ||
strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 ||
strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 ||
+ strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 ||
strcmp(call->as_CallLeaf()->_name, "sha256_implCompress") == 0 ||
--- a/hotspot/src/share/vm/opto/library_call.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/opto/library_call.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -278,6 +278,7 @@
Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting);
Node* get_key_start_from_aescrypt_object(Node* aescrypt_object);
Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object);
+ bool inline_ghash_processBlocks();
bool inline_sha_implCompress(vmIntrinsics::ID id);
bool inline_digestBase_implCompressMB(int predicate);
bool inline_sha_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass_SHA,
@@ -528,6 +529,10 @@
predicates = 3;
break;
+ case vmIntrinsics::_ghash_processBlocks:
+ if (!UseGHASHIntrinsics) return NULL;
+ break;
+
case vmIntrinsics::_updateCRC32:
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32:
@@ -929,6 +934,9 @@
case vmIntrinsics::_mulAdd:
return inline_mulAdd();
+ case vmIntrinsics::_ghash_processBlocks:
+ return inline_ghash_processBlocks();
+
case vmIntrinsics::_encodeISOArray:
return inline_encodeISOArray();
@@ -5858,6 +5866,35 @@
return _gvn.transform(region);
}
+//------------------------------inline_ghash_processBlocks---------------------
+//
+// Expands the _ghash_processBlocks intrinsic into a leaf runtime call to the
+// ghash_processBlocks stub. The stub receives raw element addresses:
+// (state[0], subkeyH[0], data[offset], len). Always returns true.
+//
+bool LibraryCallKit::inline_ghash_processBlocks() {
+  assert(UseGHASHIntrinsics, "need GHASH intrinsics support");
+
+  address     stubAddr = StubRoutines::ghash_processBlocks();
+  const char* stubName = "ghash_processBlocks";
+
+  // Incoming arguments of the intrinsified method.
+  Node* data    = argument(0);
+  Node* offset  = argument(1);
+  Node* len     = argument(2);
+  Node* state   = argument(3);
+  Node* subkeyH = argument(4);
+
+  // Both long[] arrays are passed from element 0; the byte[] data is passed
+  // from the caller-supplied offset.
+  Node* state_start = array_element_address(state, intcon(0), T_LONG);
+  assert(state_start, "state is NULL");
+  Node* subkeyH_start = array_element_address(subkeyH, intcon(0), T_LONG);
+  assert(subkeyH_start, "subkeyH is NULL");
+  Node* data_start = array_element_address(data, offset, T_BYTE);
+  assert(data_start, "data is NULL");
+
+  // The call's result node is not needed; the stub has a void signature.
+  make_runtime_call(RC_LEAF|RC_NO_FP,
+                    OptoRuntime::ghash_processBlocks_Type(),
+                    stubAddr, stubName, TypePtr::BOTTOM,
+                    state_start, subkeyH_start, data_start, len);
+  return true;
+}
+
//------------------------------inline_sha_implCompress-----------------------
//
// Calculate SHA (i.e., SHA-1) for single-block byte[] array.
--- a/hotspot/src/share/vm/opto/loopTransform.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/opto/loopTransform.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -38,6 +38,7 @@
#include "opto/rootnode.hpp"
#include "opto/runtime.hpp"
#include "opto/subnode.hpp"
+#include "opto/superword.hpp"
#include "opto/vectornode.hpp"
//------------------------------is_loop_exit-----------------------------------
@@ -640,7 +641,7 @@
//------------------------------policy_unroll----------------------------------
// Return TRUE or FALSE if the loop should be unrolled or not. Unroll if
// the loop is a CountedLoop and the body is small enough.
-bool IdealLoopTree::policy_unroll( PhaseIdealLoop *phase ) const {
+bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
CountedLoopNode *cl = _head->as_CountedLoop();
assert(cl->is_normal_loop() || cl->is_main_loop(), "");
@@ -652,6 +653,8 @@
// After split at least one iteration will be executed in pre-loop.
if (cl->trip_count() <= (uint)(cl->is_normal_loop() ? 2 : 1)) return false;
+ _local_loop_unroll_limit = LoopUnrollLimit;
+ _local_loop_unroll_factor = 4;
int future_unroll_ct = cl->unrolled_count() * 2;
if (future_unroll_ct > LoopMaxUnroll) return false;
@@ -747,8 +750,24 @@
} // switch
}
+ if (UseSuperWord) {
+ if (!cl->is_reduction_loop()) {
+ phase->mark_reductions(this);
+ }
+
+ // Only attempt slp analysis when user controls do not prohibit it
+ if (LoopMaxUnroll > _local_loop_unroll_factor) {
+ // Once policy_slp_analysis succeeds, mark the loop with the
+ // maximal unroll factor so that we minimize analysis passes
+ if ((future_unroll_ct > _local_loop_unroll_factor) ||
+ (body_size > (uint)_local_loop_unroll_limit)) {
+ policy_unroll_slp_analysis(cl, phase, future_unroll_ct);
+ }
+ }
+ }
+
// Check for being too big
- if (body_size > (uint)LoopUnrollLimit) {
+ if (body_size > (uint)_local_loop_unroll_limit) {
if (xors_in_loop >= 4 && body_size < (uint)LoopUnrollLimit*4) return true;
// Normal case: loop too big
return false;
@@ -758,6 +777,36 @@
return true;
}
+//------------------------------policy_unroll_slp_analysis---------------------
+// If SuperWordLoopUnrollAnalysis is enabled, consult the SuperWord (SLP)
+// unrolling analysis for 'cl' and possibly raise _local_loop_unroll_limit.
+//
+//   cl               - counted loop head being considered for unrolling
+//   phase            - current PhaseIdealLoop, handed to the SuperWord instance
+//   future_unroll_ct - unroll count the loop would have after this unroll
+//
+// The analysis itself is skipped once the loop is marked has_passed_slp().
+// The limit is raised only when the recorded SLP unroll factor is greater
+// than 4 and not smaller than future_unroll_ct, and only when the new limit
+// (node_count_before_unroll() * factor) exceeds LoopUnrollLimit.
+void IdealLoopTree::policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_ct) {
+  // Enable this functionality target by target as needed
+  if (SuperWordLoopUnrollAnalysis) {
+    if (!cl->has_passed_slp()) {
+      SuperWord sw(phase);
+      // Analysis-only setup: 'false' keeps transform_loop from running SLP_extract
+      sw.transform_loop(this, false);
+
+      // If the loop is slp canonical analyze it
+      if (sw.early_return() == false) {
+        // May update _local_loop_unroll_factor (passed by reference)
+        sw.unrolling_analysis(cl, _local_loop_unroll_factor);
+      }
+    }
+
+    int slp_max_unroll_factor = cl->slp_max_unroll();
+    if ((slp_max_unroll_factor > 4) &&
+        (slp_max_unroll_factor >= future_unroll_ct)) {
+      int new_limit = cl->node_count_before_unroll() * slp_max_unroll_factor;
+      if (new_limit > LoopUnrollLimit) {
+#ifndef PRODUCT
+        if (TraceSuperWordLoopUnrollAnalysis) {
+          // print_cr terminates the line itself, so the format has no "\n"
+          tty->print_cr("slp analysis is applying unroll limit %d, the original limit was %d",
+                        new_limit, _local_loop_unroll_limit);
+        }
+#endif
+        _local_loop_unroll_limit = new_limit;
+      }
+    }
+  }
+}
+
//------------------------------policy_align-----------------------------------
// Return TRUE or FALSE if the loop should be cache-line aligned. Gather the
// expression that does the alignment. Note that only one array base can be
@@ -1611,6 +1660,7 @@
// iff the uses conform
if (ok) {
def_node->add_flag(Node::Flag_is_reduction);
+ loop_head->mark_has_reductions();
}
}
}
@@ -2517,7 +2567,6 @@
// and we'd rather unroll the post-RCE'd loop SO... do not unroll if
// peeling.
if (should_unroll && !should_peel) {
- phase->mark_reductions(this);
phase->do_unroll(this, old_new, true);
}
--- a/hotspot/src/share/vm/opto/loopnode.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/opto/loopnode.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -2408,7 +2408,7 @@
for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
IdealLoopTree* lpt = iter.current();
if (lpt->is_counted()) {
- sw.transform_loop(lpt);
+ sw.transform_loop(lpt, true);
}
}
}
--- a/hotspot/src/share/vm/opto/loopnode.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/opto/loopnode.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -62,7 +62,9 @@
HasExactTripCount=8,
InnerLoop=16,
PartialPeelLoop=32,
- PartialPeelFailed=64 };
+ PartialPeelFailed=64,
+ HasReductions=128,
+ PassedSlpAnalysis=256 };
char _unswitch_count;
enum { _unswitch_max=3 };
@@ -77,6 +79,8 @@
void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; }
int partial_peel_has_failed() const { return _loop_flags & PartialPeelFailed; }
void mark_partial_peel_failed() { _loop_flags |= PartialPeelFailed; }
+ void mark_has_reductions() { _loop_flags |= HasReductions; }
+ void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; }
int unswitch_max() { return _unswitch_max; }
int unswitch_count() { return _unswitch_count; }
@@ -155,11 +159,15 @@
// unroll,optimize,unroll,optimize,... is making progress
int _node_count_before_unroll;
+ // If slp analysis is performed we record the maximum
+ // vector mapped unroll factor here
+ int _slp_maximum_unroll_factor;
+
public:
CountedLoopNode( Node *entry, Node *backedge )
: LoopNode(entry, backedge), _main_idx(0), _trip_count(max_juint),
_profile_trip_cnt(COUNT_UNKNOWN), _unrolled_count_log2(0),
- _node_count_before_unroll(0) {
+ _node_count_before_unroll(0), _slp_maximum_unroll_factor(0) {
init_class_id(Class_CountedLoop);
// Initialize _trip_count to the largest possible value.
// Will be reset (lower) if the loop's trip count is known.
@@ -199,10 +207,12 @@
// A 'main' loop that is ONLY unrolled or peeled, never RCE'd or
// Aligned, may be missing it's pre-loop.
- int is_normal_loop() const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
- int is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; }
- int is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
- int is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
+ int is_normal_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Normal; }
+ int is_pre_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Pre; }
+ int is_main_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Main; }
+ int is_post_loop () const { return (_loop_flags&PreMainPostFlagsMask) == Post; }
+ int is_reduction_loop() const { return (_loop_flags&HasReductions) == HasReductions; }
+ int has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
@@ -232,8 +242,10 @@
void double_unrolled_count() { _unrolled_count_log2++; }
int unrolled_count() { return 1 << MIN2(_unrolled_count_log2, BitsPerInt-3); }
- void set_node_count_before_unroll(int ct) { _node_count_before_unroll = ct; }
- int node_count_before_unroll() { return _node_count_before_unroll; }
+ void set_node_count_before_unroll(int ct) { _node_count_before_unroll = ct; }
+ int node_count_before_unroll() { return _node_count_before_unroll; }
+ void set_slp_max_unroll(int unroll_factor) { _slp_maximum_unroll_factor = unroll_factor; }
+ int slp_max_unroll() const { return _slp_maximum_unroll_factor; }
#ifndef PRODUCT
virtual void dump_spec(outputStream *st) const;
@@ -336,6 +348,8 @@
Node *_tail; // Tail of loop
inline Node *tail(); // Handle lazy update of _tail field
PhaseIdealLoop* _phase;
+ int _local_loop_unroll_limit;
+ int _local_loop_unroll_factor;
Node_List _body; // Loop body for inner loops
@@ -356,7 +370,8 @@
_safepts(NULL),
_required_safept(NULL),
_allow_optimizations(true),
- _nest(0), _irreducible(0), _has_call(0), _has_sfpt(0), _rce_candidate(0)
+ _nest(0), _irreducible(0), _has_call(0), _has_sfpt(0), _rce_candidate(0),
+ _local_loop_unroll_limit(0), _local_loop_unroll_factor(0)
{ }
// Is 'l' a member of 'this'?
@@ -444,7 +459,10 @@
// Return TRUE or FALSE if the loop should be unrolled or not. Unroll if
// the loop is a CountedLoop and the body is small enough.
- bool policy_unroll( PhaseIdealLoop *phase ) const;
+ bool policy_unroll(PhaseIdealLoop *phase);
+
+ // Loop analyses to map to a maximal superword unrolling for vectorization.
+ void policy_unroll_slp_analysis(CountedLoopNode *cl, PhaseIdealLoop *phase, int future_unroll_ct);
// Return TRUE or FALSE if the loop should be range-check-eliminated.
// Gather a list of IF tests that are dominated by iteration splitting;
--- a/hotspot/src/share/vm/opto/runtime.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/opto/runtime.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -987,7 +987,25 @@
return TypeFunc::make(domain, range);
}
+// GHASH block processing: call signature (state, subkeyH, data, blocks) -> void
+const TypeFunc* OptoRuntime::ghash_processBlocks_Type() {
+  const int num_args = 4;
+
+  // Argument tuple: three non-null pointers followed by an int.
+  const Type** in_fields = TypeTuple::fields(num_args);
+  int slot = TypeFunc::Parms;
+  in_fields[slot++] = TypePtr::NOTNULL;    // state
+  in_fields[slot++] = TypePtr::NOTNULL;    // subkeyH
+  in_fields[slot++] = TypePtr::NOTNULL;    // data
+  in_fields[slot++] = TypeInt::INT;        // blocks
+  assert(slot == TypeFunc::Parms + num_args, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + num_args, in_fields);
+
+  // Result tuple: no value is returned, so it spans only the fixed Parms slots.
+  const Type** out_fields = TypeTuple::fields(1);
+  out_fields[TypeFunc::Parms + 0] = NULL;  // void
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, out_fields);
+
+  return TypeFunc::make(domain, range);
+}
//------------- Interpreter state access for on stack replacement
const TypeFunc* OptoRuntime::osr_end_Type() {
--- a/hotspot/src/share/vm/opto/runtime.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/opto/runtime.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -316,6 +316,8 @@
static const TypeFunc* mulAdd_Type();
+ static const TypeFunc* ghash_processBlocks_Type();
+
static const TypeFunc* updateBytesCRC32_Type();
// leaf on stack replacement interpreter accessor types
--- a/hotspot/src/share/vm/opto/superword.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/opto/superword.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -68,6 +68,7 @@
_bb(NULL), // basic block
_iv(NULL), // induction var
_race_possible(false), // cases where SDMU is true
+ _early_return(true), // analysis evaluations routine
_num_work_vecs(0), // amount of vector work we have
_num_reductions(0), // amount of reduction work we have
_do_vector_loop(phase->C->do_vector_loop()), // whether to do vectorization/simd style
@@ -78,7 +79,7 @@
{}
//------------------------------transform_loop---------------------------
-void SuperWord::transform_loop(IdealLoopTree* lpt) {
+void SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) {
assert(UseSuperWord, "should be");
// Do vectors exist on this architecture?
if (Matcher::vector_width_in_bytes(T_BYTE) < 2) return;
@@ -113,8 +114,156 @@
// For now, define one block which is the entire loop body
set_bb(cl);
- assert(_packset.length() == 0, "packset must be empty");
- SLP_extract();
+ if (do_optimization) {
+ assert(_packset.length() == 0, "packset must be empty");
+ SLP_extract();
+ }
+}
+
+//------------------------------early unrolling analysis------------------------------
+// Scans the body of the current loop (lpt()) to estimate an unroll factor
+// that matches the machine's vector sizes.
+//
+//   cl                       - counted loop head of the loop being analyzed
+//   local_loop_unroll_factor - in/out: the caller's current unroll factor; it is
+//                              overwritten with the chosen vector size only when
+//                              the whole analysis succeeds
+//
+// Pass 1 marks the nodes that must not influence the vector size (loop
+// control, address arithmetic, phis, reductions, ...) and gives up on loops
+// containing node kinds this analysis does not handle. Pass 2 takes the
+// smallest maximum vector size among the remaining nodes whose vector form
+// is implemented. If pass 1 succeeds, the loop is marked as having passed
+// slp analysis and the (possibly unchanged) factor is recorded on 'cl';
+// if pass 1 gives up, nothing is recorded.
+void SuperWord::unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor) {
+  bool is_slp = true;
+  ResourceMark rm;
+  size_t ignored_size = lpt()->_body.size();
+  // ignored_loop_nodes[i] is -1 while body node i still takes part in the
+  // vector size computation, otherwise it holds that node's _idx.
+  int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size);
+  Node_Stack nstack((int)ignored_size);
+  Node *cl_exit = cl->loopexit();
+
+  // First clear the entries
+  for (uint i = 0; i < lpt()->_body.size(); i++) {
+    ignored_loop_nodes[i] = -1;
+  }
+
+  // Starting point; only ever lowered in the second pass below
+  int max_vector = Matcher::max_vector_size(T_INT);
+
+  // Process the loop, some/all of the stack entries will not be in order, ergo
+  // need to preprocess the ignored initial state before we process the loop
+  for (uint i = 0; i < lpt()->_body.size(); i++) {
+    Node* n = lpt()->_body.at(i);
+    if (n == cl->incr() ||
+        n->is_reduction() ||
+        n->is_AddP() ||
+        n->is_Cmp() ||
+        n->is_IfTrue() ||
+        n->is_CountedLoop() ||
+        (n == cl_exit)) {
+      ignored_loop_nodes[i] = n->_idx;
+      continue;
+    }
+
+    // Loop-exit tests with known count and probability are ignored
+    if (n->is_If()) {
+      IfNode *iff = n->as_If();
+      if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) {
+        if (lpt()->is_loop_exit(iff)) {
+          ignored_loop_nodes[i] = n->_idx;
+          continue;
+        }
+      }
+    }
+
+    // A memory phi whose backedge input differs from its entry input must be
+    // fed by a memory operation, otherwise give up
+    if (n->is_Phi() && (n->bottom_type() == Type::MEMORY)) {
+      Node* n_tail = n->in(LoopNode::LoopBackControl);
+      if (n_tail != n->in(LoopNode::EntryControl)) {
+        if (!n_tail->is_Mem()) {
+          is_slp = false;
+          break;
+        }
+      }
+    }
+
+    // This must happen after check of phi/if
+    if (n->is_Phi() || n->is_If()) {
+      ignored_loop_nodes[i] = n->_idx;
+      continue;
+    }
+
+    // Node kinds this analysis does not handle: give up on the loop
+    if (n->is_LoadStore() || n->is_MergeMem() ||
+        (n->is_Proj() && !n->as_Proj()->is_CFG())) {
+      is_slp = false;
+      break;
+    }
+
+    if (n->is_Mem()) {
+      MemNode* current = n->as_Mem();
+      BasicType bt = current->memory_type();
+      if (is_java_primitive(bt) == false) {
+        ignored_loop_nodes[i] = n->_idx;
+        continue;
+      }
+      Node* adr = n->in(MemNode::Address);
+      Node* n_ctrl = _phase->get_ctrl(adr);
+
+      // save a queue of post process nodes
+      if (n_ctrl != NULL && lpt()->is_member(_phase->get_loop(n_ctrl))) {
+        // Process the memory expression
+        int stack_idx = 0;
+        bool have_side_effects = true;
+        if (adr->is_AddP() == false) {
+          nstack.push(adr, stack_idx++);
+        } else {
+          // Mark the components of the memory operation in nstack
+          SWPointer p1(current, this, &nstack, true);
+          have_side_effects = p1.node_stack()->is_nonempty();
+        }
+
+        // Process the pointer stack: every recorded node found in the loop
+        // body is excluded from the second pass
+        while (have_side_effects) {
+          Node* pointer_node = nstack.node();
+          for (uint j = 0; j < lpt()->_body.size(); j++) {
+            Node* cur_node = lpt()->_body.at(j);
+            if (cur_node == pointer_node) {
+              ignored_loop_nodes[j] = cur_node->_idx;
+              break;
+            }
+          }
+          nstack.pop();
+          have_side_effects = nstack.is_nonempty();
+        }
+      }
+    }
+  }
+
+  if (is_slp) {
+    // Now we try to find the maximum supported consistent vector which the machine
+    // description can use
+    for (uint i = 0; i < lpt()->_body.size(); i++) {
+      if (ignored_loop_nodes[i] != -1) continue;
+
+      BasicType bt;
+      Node* n = lpt()->_body.at(i);
+      // Stores are sized by the type they write, everything else by its result type
+      if (n->is_Store()) {
+        bt = n->as_Mem()->memory_type();
+      } else {
+        bt = n->bottom_type()->basic_type();
+      }
+
+      int cur_max_vector = Matcher::max_vector_size(bt);
+
+      // If a max vector exists which is not larger than local_loop_unroll_factor
+      // stop looking, we already have the max vector to map to.
+      if (cur_max_vector <= local_loop_unroll_factor) {
+        is_slp = false;
+#ifndef PRODUCT
+        if (TraceSuperWordLoopUnrollAnalysis) {
+          // print_cr terminates the line itself, so the message has no "\n"
+          tty->print_cr("slp analysis fails: unroll limit equals max vector");
+        }
+#endif
+        break;
+      }
+
+      // Map the maximal common vector
+      if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) {
+        if (cur_max_vector < max_vector) {
+          max_vector = cur_max_vector;
+        }
+      }
+    }
+    if (is_slp) {
+      local_loop_unroll_factor = max_vector;
+    }
+    // Recorded even when the second pass stopped early; the factor is then
+    // the caller's unchanged value
+    cl->mark_passed_slp();
+    cl->set_slp_max_unroll(local_loop_unroll_factor);
+  }
}
//------------------------------SLP_extract---------------------------
@@ -268,12 +417,12 @@
best_iv_adjustment = iv_adjustment;
}
- SWPointer align_to_ref_p(mem_ref, this);
+ SWPointer align_to_ref_p(mem_ref, this, NULL, false);
// Set alignment relative to "align_to_ref" for all related memory operations.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* s = memops.at(i)->as_Mem();
if (isomorphic(s, mem_ref)) {
- SWPointer p2(s, this);
+ SWPointer p2(s, this, NULL, false);
if (p2.comparable(align_to_ref_p)) {
int align = memory_alignment(s, iv_adjustment);
set_alignment(s, align);
@@ -294,7 +443,7 @@
// iterations in pre-loop will be not enough to align it.
create_pack = false;
} else {
- SWPointer p2(best_align_to_mem_ref, this);
+ SWPointer p2(best_align_to_mem_ref, this, NULL, false);
if (align_to_ref_p.invar() != p2.invar()) {
// Do not vectorize memory accesses with different invariants
// if unaligned memory accesses are not allowed.
@@ -411,7 +560,7 @@
// Count number of comparable memory ops
for (uint i = 0; i < memops.size(); i++) {
MemNode* s1 = memops.at(i)->as_Mem();
- SWPointer p1(s1, this);
+ SWPointer p1(s1, this, NULL, false);
// Discard if pre loop can't align this reference
if (!ref_is_alignable(p1)) {
*cmp_ct.adr_at(i) = 0;
@@ -420,7 +569,7 @@
for (uint j = i+1; j < memops.size(); j++) {
MemNode* s2 = memops.at(j)->as_Mem();
if (isomorphic(s1, s2)) {
- SWPointer p2(s2, this);
+ SWPointer p2(s2, this, NULL, false);
if (p1.comparable(p2)) {
(*cmp_ct.adr_at(i))++;
(*cmp_ct.adr_at(j))++;
@@ -441,7 +590,7 @@
if (s->is_Store()) {
int vw = vector_width_in_bytes(s);
assert(vw > 1, "sanity");
- SWPointer p(s, this);
+ SWPointer p(s, this, NULL, false);
if (cmp_ct.at(j) > max_ct ||
cmp_ct.at(j) == max_ct &&
(vw > max_vw ||
@@ -464,7 +613,7 @@
if (s->is_Load()) {
int vw = vector_width_in_bytes(s);
assert(vw > 1, "sanity");
- SWPointer p(s, this);
+ SWPointer p(s, this, NULL, false);
if (cmp_ct.at(j) > max_ct ||
cmp_ct.at(j) == max_ct &&
(vw > max_vw ||
@@ -575,7 +724,7 @@
//---------------------------get_iv_adjustment---------------------------
// Calculate loop's iv adjustment for this memory ops.
int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
- SWPointer align_to_ref_p(mem_ref, this);
+ SWPointer align_to_ref_p(mem_ref, this, NULL, false);
int offset = align_to_ref_p.offset_in_bytes();
int scale = align_to_ref_p.scale_in_bytes();
int elt_size = align_to_ref_p.memory_size();
@@ -649,13 +798,13 @@
if (_dg.dep(s1)->in_cnt() == 0) {
_dg.make_edge(slice, s1);
}
- SWPointer p1(s1->as_Mem(), this);
+ SWPointer p1(s1->as_Mem(), this, NULL, false);
bool sink_dependent = true;
for (int k = j - 1; k >= 0; k--) {
Node* s2 = _nlist.at(k);
if (s1->is_Load() && s2->is_Load())
continue;
- SWPointer p2(s2->as_Mem(), this);
+ SWPointer p2(s2->as_Mem(), this, NULL, false);
int cmp = p1.cmp(p2);
if (SuperWordRTDepCheck &&
@@ -795,8 +944,8 @@
if (_phase->C->get_alias_index(s1->as_Mem()->adr_type()) !=
_phase->C->get_alias_index(s2->as_Mem()->adr_type()))
return false;
- SWPointer p1(s1->as_Mem(), this);
- SWPointer p2(s2->as_Mem(), this);
+ SWPointer p1(s1->as_Mem(), this, NULL, false);
+ SWPointer p2(s2->as_Mem(), this, NULL, false);
if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
return diff == data_size(s1);
@@ -1615,13 +1764,13 @@
if (n->is_Load()) {
Node* ctl = n->in(MemNode::Control);
Node* mem = first->in(MemNode::Memory);
- SWPointer p1(n->as_Mem(), this);
+ SWPointer p1(n->as_Mem(), this, NULL, false);
// Identify the memory dependency for the new loadVector node by
// walking up through memory chain.
// This is done to give flexibility to the new loadVector node so that
// it can move above independent storeVector nodes.
while (mem->is_StoreVector()) {
- SWPointer p2(mem->as_Mem(), this);
+ SWPointer p2(mem->as_Mem(), this, NULL, false);
int cmp = p1.cmp(p2);
if (SWPointer::not_equal(cmp) || !SWPointer::comparable(cmp)) {
mem = mem->in(MemNode::Memory);
@@ -2138,7 +2287,7 @@
//------------------------------memory_alignment---------------------------
// Alignment within a vector memory reference
int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
- SWPointer p(s, this);
+ SWPointer p(s, this, NULL, false);
if (!p.valid()) {
return bottom_align;
}
@@ -2315,7 +2464,7 @@
Node *orig_limit = pre_opaq->original_loop_limit();
assert(orig_limit != NULL && _igvn.type(orig_limit) != Type::TOP, "");
- SWPointer align_to_ref_p(align_to_ref, this);
+ SWPointer align_to_ref_p(align_to_ref, this, NULL, false);
assert(align_to_ref_p.valid(), "sanity");
// Given:
@@ -2489,6 +2638,7 @@
_bb = NULL;
_iv = NULL;
_race_possible = 0;
+ _early_return = false;
_num_work_vecs = 0;
_num_reductions = 0;
}
@@ -2559,9 +2709,11 @@
//==============================SWPointer===========================
//----------------------------SWPointer------------------------
-SWPointer::SWPointer(MemNode* mem, SuperWord* slp) :
+SWPointer::SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only) :
_mem(mem), _slp(slp), _base(NULL), _adr(NULL),
- _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {
+ _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
+ _nstack(nstack), _analyze_only(analyze_only),
+ _stack_idx(0) {
Node* adr = mem->in(MemNode::Address);
if (!adr->is_AddP()) {
@@ -2599,7 +2751,9 @@
// the pattern match of an address expression.
SWPointer::SWPointer(SWPointer* p) :
_mem(p->_mem), _slp(p->_slp), _base(NULL), _adr(NULL),
- _scale(0), _offset(0), _invar(NULL), _negate_invar(false) {}
+ _scale(0), _offset(0), _invar(NULL), _negate_invar(false),
+ _nstack(p->_nstack), _analyze_only(p->_analyze_only),
+ _stack_idx(p->_stack_idx) {}
//------------------------scaled_iv_plus_offset--------------------
// Match: k*iv + offset
@@ -2642,6 +2796,9 @@
_scale = 1;
return true;
}
+ if (_analyze_only && (invariant(n) == false)) {
+ _nstack->push(n, _stack_idx++);
+ }
int opc = n->Opcode();
if (opc == Op_MulI) {
if (n->in(1) == iv() && n->in(2)->is_Con()) {
@@ -2699,6 +2856,9 @@
return false;
}
if (_invar != NULL) return false; // already have an invariant
+ if (_analyze_only && (invariant(n) == false)) {
+ _nstack->push(n, _stack_idx++);
+ }
if (opc == Op_AddI) {
if (n->in(2)->is_Con() && invariant(n->in(1))) {
_negate_invar = negate;
--- a/hotspot/src/share/vm/opto/superword.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/opto/superword.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -239,12 +239,15 @@
public:
SuperWord(PhaseIdealLoop* phase);
- void transform_loop(IdealLoopTree* lpt);
+ void transform_loop(IdealLoopTree* lpt, bool do_optimization);
+
+ void unrolling_analysis(CountedLoopNode *cl, int &local_loop_unroll_factor);
// Accessors for SWPointer
PhaseIdealLoop* phase() { return _phase; }
IdealLoopTree* lpt() { return _lpt; }
PhiNode* iv() { return _iv; }
+ bool early_return() { return _early_return; }
private:
IdealLoopTree* _lpt; // Current loop tree node
@@ -252,6 +255,7 @@
Node* _bb; // Current basic block
PhiNode* _iv; // Induction var
bool _race_possible; // In cases where SDMU is true
+ bool _early_return; // True if we do not initialize
bool _do_vector_loop; // whether to do vectorization/simd style
bool _vector_loop_debug; // provide more printing in debug mode
int _num_work_vecs; // Number of non memory vector operations
@@ -462,15 +466,18 @@
// Information about an address for dependence checking and vector alignment
class SWPointer VALUE_OBJ_CLASS_SPEC {
protected:
- MemNode* _mem; // My memory reference node
- SuperWord* _slp; // SuperWord class
+ MemNode* _mem; // My memory reference node
+ SuperWord* _slp; // SuperWord class
- Node* _base; // NULL if unsafe nonheap reference
- Node* _adr; // address pointer
- jint _scale; // multiplier for iv (in bytes), 0 if no loop iv
- jint _offset; // constant offset (in bytes)
- Node* _invar; // invariant offset (in bytes), NULL if none
- bool _negate_invar; // if true then use: (0 - _invar)
+ Node* _base; // NULL if unsafe nonheap reference
+ Node* _adr; // address pointer
+ jint _scale; // multiplier for iv (in bytes), 0 if no loop iv
+ jint _offset; // constant offset (in bytes)
+ Node* _invar; // invariant offset (in bytes), NULL if none
+ bool _negate_invar; // if true then use: (0 - _invar)
+ Node_Stack* _nstack; // stack used to record a swpointer trace of variants
+ bool _analyze_only; // Used in loop unrolling only for swpointer trace
+ uint _stack_idx; // Used in loop unrolling only for swpointer trace
PhaseIdealLoop* phase() { return _slp->phase(); }
IdealLoopTree* lpt() { return _slp->lpt(); }
@@ -497,7 +504,7 @@
NotComparable = (Less | Greater | Equal)
};
- SWPointer(MemNode* mem, SuperWord* slp);
+ SWPointer(MemNode* mem, SuperWord* slp, Node_Stack *nstack, bool analyze_only);
// Following is used to create a temporary object during
// the pattern match of an address expression.
SWPointer(SWPointer* p);
@@ -505,14 +512,15 @@
bool valid() { return _adr != NULL; }
bool has_iv() { return _scale != 0; }
- Node* base() { return _base; }
- Node* adr() { return _adr; }
- MemNode* mem() { return _mem; }
- int scale_in_bytes() { return _scale; }
- Node* invar() { return _invar; }
- bool negate_invar() { return _negate_invar; }
- int offset_in_bytes() { return _offset; }
- int memory_size() { return _mem->memory_size(); }
+ Node* base() { return _base; }
+ Node* adr() { return _adr; }
+ MemNode* mem() { return _mem; }
+ int scale_in_bytes() { return _scale; }
+ Node* invar() { return _invar; }
+ bool negate_invar() { return _negate_invar; }
+ int offset_in_bytes() { return _offset; }
+ int memory_size() { return _mem->memory_size(); }
+ Node_Stack* node_stack() { return _nstack; }
// Comparable?
int cmp(SWPointer& q) {
--- a/hotspot/src/share/vm/runtime/globals.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/runtime/globals.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -730,6 +730,9 @@
product(bool, UseSHA, false, \
"Control whether SHA instructions can be used on SPARC") \
\
+ product(bool, UseGHASHIntrinsics, false, \
+ "Use intrinsics for GHASH versions of crypto") \
+ \
product(size_t, LargePageSizeInBytes, 0, \
"Large page size (0 to let VM choose the page size)") \
\
--- a/hotspot/src/share/vm/runtime/stubRoutines.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/runtime/stubRoutines.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -125,6 +125,7 @@
address StubRoutines::_aescrypt_decryptBlock = NULL;
address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL;
address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL;
+address StubRoutines::_ghash_processBlocks = NULL;
address StubRoutines::_sha1_implCompress = NULL;
address StubRoutines::_sha1_implCompressMB = NULL;
@@ -173,6 +174,9 @@
}
CodeBuffer buffer(_code1);
StubGenerator_generate(&buffer, false);
+ // When new stubs added we need to make sure there is some space left
+ // to catch situation when we should increase size again.
+ assert(buffer.insts_remaining() > 200, "increase code_size1");
}
}
@@ -257,6 +261,9 @@
}
CodeBuffer buffer(_code2);
StubGenerator_generate(&buffer, true);
+ // When new stubs added we need to make sure there is some space left
+ // to catch situation when we should increase size again.
+ assert(buffer.insts_remaining() > 200, "increase code_size2");
}
#ifdef ASSERT
--- a/hotspot/src/share/vm/runtime/stubRoutines.hpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/runtime/stubRoutines.hpp Thu Jun 25 09:48:50 2015 -0700
@@ -185,6 +185,7 @@
static address _aescrypt_decryptBlock;
static address _cipherBlockChaining_encryptAESCrypt;
static address _cipherBlockChaining_decryptAESCrypt;
+ static address _ghash_processBlocks;
static address _sha1_implCompress;
static address _sha1_implCompressMB;
@@ -346,6 +347,7 @@
static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; }
static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; }
static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; }
+ static address ghash_processBlocks() { return _ghash_processBlocks; }
static address sha1_implCompress() { return _sha1_implCompress; }
static address sha1_implCompressMB() { return _sha1_implCompressMB; }
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp Thu Jun 25 09:48:50 2015 -0700
@@ -827,6 +827,7 @@
static_field(StubRoutines, _aescrypt_decryptBlock, address) \
static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \
static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \
+ static_field(StubRoutines, _ghash_processBlocks, address) \
static_field(StubRoutines, _updateBytesCRC32, address) \
static_field(StubRoutines, _crc_table_adr, address) \
static_field(StubRoutines, _multiplyToLen, address) \
--- a/hotspot/test/compiler/codegen/7184394/TestAESBase.java Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/test/compiler/codegen/7184394/TestAESBase.java Thu Jun 25 09:48:50 2015 -0700
@@ -31,6 +31,7 @@
import java.util.Random;
import javax.crypto.Cipher;
import javax.crypto.SecretKey;
+import javax.crypto.spec.GCMParameterSpec;
import javax.crypto.spec.IvParameterSpec;
import javax.crypto.spec.SecretKeySpec;
@@ -62,6 +63,10 @@
Cipher dCipher;
AlgorithmParameters algParams;
SecretKey key;
+ GCMParameterSpec gcm_spec;
+ byte[] aad;
+ int tlen = 12;
+ byte[] iv;
static int numThreads = 0;
int threadId;
@@ -100,6 +105,12 @@
int ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 8 : 0);
IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]);
cipher.init(Cipher.ENCRYPT_MODE, key, initVector);
+ } else if (mode.equals("GCM")) {
+ iv = new byte[64];
+ random.nextBytes(iv);
+ aad = new byte[5];
+ random.nextBytes(aad);
+ gcm_init();
} else {
algParams = cipher.getParameters();
cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
@@ -186,4 +197,12 @@
}
abstract void childShowCipher();
+
+ void gcm_init() throws Exception { // Replaces `cipher` with a new SunJCE instance initialized for encryption with a fresh GCMParameterSpec.
+ tlen = 12; // reset on every call; multiplied by 8 below, so presumably a tag length in bytes - TODO confirm
+ gcm_spec = new GCMParameterSpec(tlen * 8, iv); // spec is rebuilt from the current iv field
+ cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); // a new Cipher object is obtained rather than re-initializing the existing one
+ cipher.init(Cipher.ENCRYPT_MODE, key, gcm_spec);
+ cipher.update(aad); // NOTE(review): aad is passed through update(), not updateAAD() - confirm this is intended
+ }
}
--- a/hotspot/test/compiler/codegen/7184394/TestAESEncode.java Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/test/compiler/codegen/7184394/TestAESEncode.java Thu Jun 25 09:48:50 2015 -0700
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -32,7 +32,11 @@
@Override
public void run() {
try {
- if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+ if (mode.equals("GCM")) {
+ gcm_init();
+ } else if (!noReinit) {
+ cipher.init(Cipher.ENCRYPT_MODE, key, algParams);
+ }
encode = new byte[encodeLength];
if (testingMisalignment) {
int tempSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset);
--- a/hotspot/test/compiler/codegen/7184394/TestAESMain.java Wed Jun 24 13:38:01 2015 +0200
+++ b/hotspot/test/compiler/codegen/7184394/TestAESMain.java Thu Jun 25 09:48:50 2015 -0700
@@ -44,6 +44,13 @@
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
* @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain
+ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain
*
* @author Tom Deneau
*/