8150767: Enables SHA Extensions on x86
Summary: Add x86 intrinsics for SHA-1 and SHA-256.
Reviewed-by: kvn, twisti
Contributed-by: vivek.r.deshpande@intel.com, shravya.rukmannagari@intel.com
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp Thu Mar 03 22:02:13 2016 -0800
@@ -777,6 +777,7 @@
case 0x6E: // movd
case 0x7E: // movd
case 0xAE: // ldmxcsr, stmxcsr, fxrstor, fxsave, clflush
+ case 0xFE: // paddd
debug_only(has_disp32 = true);
break;
@@ -926,6 +927,7 @@
ip++; // skip P2, move to opcode
// To find the end of instruction (which == end_pc_operand).
switch (0xFF & *ip) {
+ case 0x22: // pinsrd r, r/a, #8
case 0x61: // pcmpestri r, r/a, #8
case 0x70: // pshufd r, r/a, #8
case 0x73: // psrldq r, #8
@@ -3953,6 +3955,83 @@
emit_int8((unsigned char)(0xC0 | encode));
}
+void Assembler::palignr(XMMRegister dst, XMMRegister src, int imm8) { // PALIGNR dst, src, imm8 (66 0F 3A 0F /r ib)
+ assert(VM_Version::supports_ssse3(), "");
+ InstructionAttr attr(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ false, /* uses_vl */ false);
+ const int reg_bits = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attr); // dst is passed as nds too: it is both a source and the destination
+ emit_int8((unsigned char)0x0F); // opcode byte
+ emit_int8((unsigned char)(0xC0 | reg_bits)); // ModRM byte, register-direct form
+ emit_int8(imm8); // trailing immediate byte
+}
+
+void Assembler::pblendw(XMMRegister dst, XMMRegister src, int imm8) { // PBLENDW dst, src, imm8 (66 0F 3A 0E /r ib)
+ assert(VM_Version::supports_sse4_1(), "");
+ InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ false, /* uses_vl */ false);
+ int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes); // dst is also the first blend source, so it is passed as nds (as palignr does); NOTE(review): with xnoreg a VEX-encoded form would presumably name register 0 instead of dst - confirm against simd_prefix_and_encode
+ emit_int8((unsigned char)0x0E); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte, register-direct form
+ emit_int8(imm8); // word-select mask
+}
+
+void Assembler::sha1rnds4(XMMRegister dst, XMMRegister src, int imm8) { // SHA1RNDS4 dst, src, imm8 (NP 0F 3A CC /r ib)
+ assert(VM_Version::supports_sha(), "");
+ // The SHA extension instructions are defined only with the legacy (non-VEX) encoding, so the prefix is built directly; NOTE(review): simd_prefix_and_encode() is presumed to switch to a VEX prefix when AVX is enabled - confirm.
+ int encode = rex_prefix_and_encode(dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_3A, /* rex_w */ false);
+ emit_int8((unsigned char)0xCC); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte, register-direct form
+ emit_int8((unsigned char)imm8); // trailing immediate byte
+}
+
+void Assembler::sha1nexte(XMMRegister dst, XMMRegister src) { // SHA1NEXTE dst, src (NP 0F 38 C8 /r)
+ assert(VM_Version::supports_sha(), "");
+ // The SHA extension instructions are defined only with the legacy (non-VEX) encoding, so the prefix is built directly; NOTE(review): simd_prefix_and_encode() is presumed to switch to a VEX prefix when AVX is enabled - confirm.
+ int encode = rex_prefix_and_encode(dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, /* rex_w */ false);
+ emit_int8((unsigned char)0xC8); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte, register-direct form
+}
+
+void Assembler::sha1msg1(XMMRegister dst, XMMRegister src) { // SHA1MSG1 dst, src (NP 0F 38 C9 /r)
+ assert(VM_Version::supports_sha(), "");
+ // The SHA extension instructions are defined only with the legacy (non-VEX) encoding, so the prefix is built directly; NOTE(review): simd_prefix_and_encode() is presumed to switch to a VEX prefix when AVX is enabled - confirm.
+ int encode = rex_prefix_and_encode(dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, /* rex_w */ false);
+ emit_int8((unsigned char)0xC9); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte, register-direct form
+}
+
+void Assembler::sha1msg2(XMMRegister dst, XMMRegister src) { // SHA1MSG2 dst, src (NP 0F 38 CA /r)
+ assert(VM_Version::supports_sha(), "");
+ // The SHA extension instructions are defined only with the legacy (non-VEX) encoding, so the prefix is built directly; NOTE(review): simd_prefix_and_encode() is presumed to switch to a VEX prefix when AVX is enabled - confirm.
+ int encode = rex_prefix_and_encode(dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, /* rex_w */ false);
+ emit_int8((unsigned char)0xCA); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte, register-direct form
+}
+
+// xmm0 is implicit additional source to this instruction.
+void Assembler::sha256rnds2(XMMRegister dst, XMMRegister src) { // SHA256RNDS2 dst, src, <xmm0> (NP 0F 38 CB /r)
+ assert(VM_Version::supports_sha(), "");
+ // The SHA extension instructions are defined only with the legacy (non-VEX) encoding, so the prefix is built directly; NOTE(review): simd_prefix_and_encode() is presumed to switch to a VEX prefix when AVX is enabled - confirm.
+ int encode = rex_prefix_and_encode(dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, /* rex_w */ false);
+ emit_int8((unsigned char)0xCB); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte, register-direct form
+}
+
+void Assembler::sha256msg1(XMMRegister dst, XMMRegister src) { // SHA256MSG1 dst, src (NP 0F 38 CC /r)
+ assert(VM_Version::supports_sha(), "");
+ // The SHA extension instructions are defined only with the legacy (non-VEX) encoding, so the prefix is built directly; NOTE(review): simd_prefix_and_encode() is presumed to switch to a VEX prefix when AVX is enabled - confirm.
+ int encode = rex_prefix_and_encode(dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, /* rex_w */ false);
+ emit_int8((unsigned char)0xCC); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte, register-direct form
+}
+
+void Assembler::sha256msg2(XMMRegister dst, XMMRegister src) { // SHA256MSG2 dst, src (NP 0F 38 CD /r)
+ assert(VM_Version::supports_sha(), "");
+ // The SHA extension instructions are defined only with the legacy (non-VEX) encoding, so the prefix is built directly; NOTE(review): simd_prefix_and_encode() is presumed to switch to a VEX prefix when AVX is enabled - confirm.
+ int encode = rex_prefix_and_encode(dst->encoding(), src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, /* rex_w */ false);
+ emit_int8((unsigned char)0xCD); // opcode byte
+ emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte, register-direct form
+}
+
+
void Assembler::shll(Register dst, int imm8) {
assert(isShiftCount(imm8), "illegal shift count");
int encode = prefix_and_encode(dst->encoding());
@@ -4931,6 +5010,15 @@
emit_int8((unsigned char)(0xC0 | encode));
}
+void Assembler::paddd(XMMRegister dst, Address src) { // PADDD dst, [mem] (66 0F FE /r): memory-operand form of the packed dword add
+ NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+ InstructionMark im(this);
+ InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+ simd_prefix(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes); // dst is also the first addend, so it is passed as nds; NOTE(review): with xnoreg a VEX-encoded form would presumably add register 0 instead of dst - confirm against simd_prefix
+ emit_int8((unsigned char)0xFE); // opcode byte
+ emit_operand(dst, src); // ModRM (and any SIB/displacement) for the memory operand
+}
+
void Assembler::paddq(XMMRegister dst, XMMRegister src) {
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp Thu Mar 03 22:02:13 2016 -0800
@@ -1672,6 +1672,18 @@
void setb(Condition cc, Register dst);
+ void palignr(XMMRegister dst, XMMRegister src, int imm8); // definition asserts VM_Version::supports_ssse3()
+ void pblendw(XMMRegister dst, XMMRegister src, int imm8); // definition asserts VM_Version::supports_sse4_1()
+ // SHA extension instructions; every definition below asserts VM_Version::supports_sha().
+ void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8);
+ void sha1nexte(XMMRegister dst, XMMRegister src);
+ void sha1msg1(XMMRegister dst, XMMRegister src);
+ void sha1msg2(XMMRegister dst, XMMRegister src);
+ // xmm0 is implicit additional source to the following instruction.
+ void sha256rnds2(XMMRegister dst, XMMRegister src);
+ void sha256msg1(XMMRegister dst, XMMRegister src);
+ void sha256msg2(XMMRegister dst, XMMRegister src);
+
void shldl(Register dst, Register src);
void shldl(Register dst, Register src, int8_t imm8);
@@ -1868,6 +1880,7 @@
void paddb(XMMRegister dst, XMMRegister src);
void paddw(XMMRegister dst, XMMRegister src);
void paddd(XMMRegister dst, XMMRegister src);
+ void paddd(XMMRegister dst, Address src);
void paddq(XMMRegister dst, XMMRegister src);
void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp Thu Mar 03 22:02:13 2016 -0800
@@ -48,7 +48,6 @@
// This is the base routine called by the different versions of call_VM_leaf. The interpreter
// may customize this version by overriding it for its purposes (e.g., to save/restore
// additional registers when doing a VM call).
-#define COMMA ,
virtual void call_VM_leaf_base(
address entry_point, // the entry point
@@ -903,35 +902,66 @@
void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
void ldmxcsr(AddressLiteral src);
+ void fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0, // SHA-1 block compression; defined in macroAssembler_x86_sha.cpp
+ XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
+ Register buf, Register state, Register ofs, Register limit, Register rsp,
+ bool multi_block);
+ // fast_sha256: the 64-bit declaration takes a trailing shuf_mask register, the 32-bit declaration does not.
+#ifdef _LP64
+ void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+ XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+ Register buf, Register state, Register ofs, Register limit, Register rsp,
+ bool multi_block, XMMRegister shuf_mask);
+#else
+ void fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+ XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+ Register buf, Register state, Register ofs, Register limit, Register rsp,
+ bool multi_block);
+#endif
+
void fast_exp(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
Register rax, Register rcx, Register rdx, Register tmp);
+#ifdef _LP64
void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
- Register rax, Register rcx, Register rdx, Register tmp1 LP64_ONLY(COMMA Register tmp2));
+ Register rax, Register rcx, Register rdx, Register tmp1, Register tmp2);
void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
- Register rdx NOT_LP64(COMMA Register tmp) LP64_ONLY(COMMA Register tmp1)
- LP64_ONLY(COMMA Register tmp2) LP64_ONLY(COMMA Register tmp3) LP64_ONLY(COMMA Register tmp4));
+ Register rdx, Register tmp1, Register tmp2, Register tmp3, Register tmp4);
void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
- Register rax, Register rbx LP64_ONLY(COMMA Register rcx), Register rdx
- LP64_ONLY(COMMA Register tmp1) LP64_ONLY(COMMA Register tmp2)
- LP64_ONLY(COMMA Register tmp3) LP64_ONLY(COMMA Register tmp4));
+ Register rax, Register rbx, Register rcx, Register rdx, Register tmp1, Register tmp2,
+ Register tmp3, Register tmp4);
void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
- Register rax, Register rcx, Register rdx NOT_LP64(COMMA Register tmp)
- LP64_ONLY(COMMA Register r8) LP64_ONLY(COMMA Register r9)
- LP64_ONLY(COMMA Register r10) LP64_ONLY(COMMA Register r11));
+ Register rax, Register rcx, Register rdx, Register tmp1,
+ Register tmp2, Register tmp3, Register tmp4);
+#else
+ void fast_log(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+ XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+ Register rax, Register rcx, Register rdx, Register tmp1);
-#ifndef _LP64
+ void fast_pow(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
+ XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7, Register rax, Register rcx,
+ Register rdx, Register tmp);
+
+ void fast_sin(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+ XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+ Register rax, Register rbx, Register rdx);
+
+ void fast_cos(XMMRegister xmm0, XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+ XMMRegister xmm4, XMMRegister xmm5, XMMRegister xmm6, XMMRegister xmm7,
+ Register rax, Register rcx, Register rdx, Register tmp);
+
void libm_sincos_huge(XMMRegister xmm0, XMMRegister xmm1, Register eax, Register ecx,
Register edx, Register ebx, Register esi, Register edi,
Register ebp, Register esp);
+
void libm_reduce_pi04l(Register eax, Register ecx, Register edx, Register ebx,
Register esi, Register edi, Register ebp, Register esp);
#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86_sha.cpp Thu Mar 03 22:02:13 2016 -0800
@@ -0,0 +1,495 @@
+/*
+* Copyright (c) 2016, Intel Corporation.
+*
+* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+*
+* This code is free software; you can redistribute it and/or modify it
+* under the terms of the GNU General Public License version 2 only, as
+* published by the Free Software Foundation.
+*
+* This code is distributed in the hope that it will be useful, but WITHOUT
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+* version 2 for more details (a copy is included in the LICENSE file that
+* accompanied this code).
+*
+* You should have received a copy of the GNU General Public License version
+* 2 along with this work; if not, write to the Free Software Foundation,
+* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+*
+* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+* or visit www.oracle.com if you need additional information or have any
+* questions.
+*
+*/
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "macroAssembler_x86.hpp"
+
+// Emits SHA-1 compression of 64-byte blocks read from buf into the hash words at state, using the SHA extension instructions. ofs and limit are used for multi-block byte array.
+// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+void MacroAssembler::fast_sha1(XMMRegister abcd, XMMRegister e0, XMMRegister e1, XMMRegister msg0,
+ XMMRegister msg1, XMMRegister msg2, XMMRegister msg3, XMMRegister shuf_mask,
+ Register buf, Register state, Register ofs, Register limit, Register rsp, bool multi_block) {
+ // rsp must address 32 writable bytes of scratch: e0 is saved at [rsp+0] and abcd at [rsp+16] for every block.
+ Label start, done_hash, loop0; // only loop0 is used as a branch target in this function
+
+ address upper_word_mask = StubRoutines::x86::upper_word_mask_addr();
+ address shuffle_byte_flip_mask = StubRoutines::x86::shuffle_byte_flip_mask_addr();
+
+ bind(start);
+ movdqu(abcd, Address(state, 0)); // first four hash dwords
+ pinsrd(e0, Address(state, 16), 3); // fifth hash dword goes into the top dword of e0
+ movdqu(shuf_mask, ExternalAddress(upper_word_mask)); // 0xFFFFFFFF000000000000000000000000
+ pand(e0, shuf_mask); // clear the other three dwords of e0
+ pshufd(abcd, abcd, 0x1B); // reverse the dword order
+ movdqu(shuf_mask, ExternalAddress(shuffle_byte_flip_mask)); //0x000102030405060708090a0b0c0d0e0f
+
+ bind(loop0);
+ // Save hash values for addition after rounds
+ movdqu(Address(rsp, 0), e0);
+ movdqu(Address(rsp, 16), abcd);
+
+ // The sha1rnds4 immediate steps through 0, 1, 2, 3 for rounds 0-19, 20-39, 40-59 and 60-79 respectively.
+ // Rounds 0 - 3
+ movdqu(msg0, Address(buf, 0));
+ pshufb(msg0, shuf_mask);
+ paddd(e0, msg0);
+ movdqa(e1, abcd);
+ sha1rnds4(abcd, e0, 0);
+
+ // Rounds 4 - 7
+ movdqu(msg1, Address(buf, 16));
+ pshufb(msg1, shuf_mask);
+ sha1nexte(e1, msg1);
+ movdqa(e0, abcd);
+ sha1rnds4(abcd, e1, 0);
+ sha1msg1(msg0, msg1);
+
+ // Rounds 8 - 11
+ movdqu(msg2, Address(buf, 32));
+ pshufb(msg2, shuf_mask);
+ sha1nexte(e0, msg2);
+ movdqa(e1, abcd);
+ sha1rnds4(abcd, e0, 0);
+ sha1msg1(msg1, msg2);
+ pxor(msg0, msg2);
+
+ // Rounds 12 - 15
+ movdqu(msg3, Address(buf, 48));
+ pshufb(msg3, shuf_mask);
+ sha1nexte(e1, msg3);
+ movdqa(e0, abcd);
+ sha1msg2(msg0, msg3);
+ sha1rnds4(abcd, e1, 0);
+ sha1msg1(msg2, msg3);
+ pxor(msg1, msg3);
+
+ // Rounds 16 - 19
+ sha1nexte(e0, msg0);
+ movdqa(e1, abcd);
+ sha1msg2(msg1, msg0);
+ sha1rnds4(abcd, e0, 0);
+ sha1msg1(msg3, msg0);
+ pxor(msg2, msg0);
+
+ // Rounds 20 - 23
+ sha1nexte(e1, msg1);
+ movdqa(e0, abcd);
+ sha1msg2(msg2, msg1);
+ sha1rnds4(abcd, e1, 1);
+ sha1msg1(msg0, msg1);
+ pxor(msg3, msg1);
+
+ // Rounds 24 - 27
+ sha1nexte(e0, msg2);
+ movdqa(e1, abcd);
+ sha1msg2(msg3, msg2);
+ sha1rnds4(abcd, e0, 1);
+ sha1msg1(msg1, msg2);
+ pxor(msg0, msg2);
+
+ // Rounds 28 - 31
+ sha1nexte(e1, msg3);
+ movdqa(e0, abcd);
+ sha1msg2(msg0, msg3);
+ sha1rnds4(abcd, e1, 1);
+ sha1msg1(msg2, msg3);
+ pxor(msg1, msg3);
+
+ // Rounds 32 - 35
+ sha1nexte(e0, msg0);
+ movdqa(e1, abcd);
+ sha1msg2(msg1, msg0);
+ sha1rnds4(abcd, e0, 1);
+ sha1msg1(msg3, msg0);
+ pxor(msg2, msg0);
+
+ // Rounds 36 - 39
+ sha1nexte(e1, msg1);
+ movdqa(e0, abcd);
+ sha1msg2(msg2, msg1);
+ sha1rnds4(abcd, e1, 1);
+ sha1msg1(msg0, msg1);
+ pxor(msg3, msg1);
+
+ // Rounds 40 - 43
+ sha1nexte(e0, msg2);
+ movdqa(e1, abcd);
+ sha1msg2(msg3, msg2);
+ sha1rnds4(abcd, e0, 2);
+ sha1msg1(msg1, msg2);
+ pxor(msg0, msg2);
+
+ // Rounds 44 - 47
+ sha1nexte(e1, msg3);
+ movdqa(e0, abcd);
+ sha1msg2(msg0, msg3);
+ sha1rnds4(abcd, e1, 2);
+ sha1msg1(msg2, msg3);
+ pxor(msg1, msg3);
+
+ // Rounds 48 - 51
+ sha1nexte(e0, msg0);
+ movdqa(e1, abcd);
+ sha1msg2(msg1, msg0);
+ sha1rnds4(abcd, e0, 2);
+ sha1msg1(msg3, msg0);
+ pxor(msg2, msg0);
+
+ // Rounds 52 - 55
+ sha1nexte(e1, msg1);
+ movdqa(e0, abcd);
+ sha1msg2(msg2, msg1);
+ sha1rnds4(abcd, e1, 2);
+ sha1msg1(msg0, msg1);
+ pxor(msg3, msg1);
+
+ // Rounds 56 - 59
+ sha1nexte(e0, msg2);
+ movdqa(e1, abcd);
+ sha1msg2(msg3, msg2);
+ sha1rnds4(abcd, e0, 2);
+ sha1msg1(msg1, msg2);
+ pxor(msg0, msg2);
+
+ // Rounds 60 - 63
+ sha1nexte(e1, msg3);
+ movdqa(e0, abcd);
+ sha1msg2(msg0, msg3);
+ sha1rnds4(abcd, e1, 3);
+ sha1msg1(msg2, msg3);
+ pxor(msg1, msg3);
+
+ // Rounds 64 - 67
+ sha1nexte(e0, msg0);
+ movdqa(e1, abcd);
+ sha1msg2(msg1, msg0);
+ sha1rnds4(abcd, e0, 3);
+ sha1msg1(msg3, msg0);
+ pxor(msg2, msg0);
+
+ // Rounds 68 - 71
+ sha1nexte(e1, msg1);
+ movdqa(e0, abcd);
+ sha1msg2(msg2, msg1);
+ sha1rnds4(abcd, e1, 3);
+ pxor(msg3, msg1);
+
+ // Rounds 72 - 75
+ sha1nexte(e0, msg2);
+ movdqa(e1, abcd);
+ sha1msg2(msg3, msg2);
+ sha1rnds4(abcd, e0, 3);
+
+ // Rounds 76 - 79
+ sha1nexte(e1, msg3);
+ movdqa(e0, abcd);
+ sha1rnds4(abcd, e1, 3);
+
+ // add current hash values with previously saved
+ movdqu(msg0, Address(rsp, 0)); // saved e0
+ sha1nexte(e0, msg0);
+ movdqu(msg0, Address(rsp, 16)); // saved abcd
+ paddd(abcd, msg0);
+
+ if (multi_block) {
+ // increment data pointer and loop if more to process
+ addptr(buf, 64);
+ addptr(ofs, 64);
+ cmpptr(ofs, limit);
+ jcc(Assembler::belowEqual, loop0); // unsigned compare: another block while ofs <= limit
+ movptr(rax, ofs); //return ofs
+ }
+ // write hash values back in the correct order
+ pshufd(abcd, abcd, 0x1b); // undo the dword reversal applied on entry
+ movdqu(Address(state, 0), abcd);
+ pextrd(Address(state, 16), e0, 3); // fifth hash dword comes from the top dword of e0
+
+ bind(done_hash);
+
+}
+
+// xmm0 (msg) is used as an implicit argument to sha256rnds2
+// and state0 and state1 can never use xmm0 register.
+// ofs and limit are used for multi-block byte array.
+// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+#ifdef _LP64
+void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+ XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+ Register buf, Register state, Register ofs, Register limit, Register rsp,
+ bool multi_block, XMMRegister shuf_mask) {
+#else
+void MacroAssembler::fast_sha256(XMMRegister msg, XMMRegister state0, XMMRegister state1, XMMRegister msgtmp0,
+ XMMRegister msgtmp1, XMMRegister msgtmp2, XMMRegister msgtmp3, XMMRegister msgtmp4,
+ Register buf, Register state, Register ofs, Register limit, Register rsp,
+ bool multi_block) {
+#endif
+ Label start, done_hash, loop0; // only loop0 is used as a branch target in this function
+ // rsp must address 32 writable bytes of scratch: state0 is saved at [rsp+0] and state1 at [rsp+16] for every block. rax is overwritten below.
+ address K256 = StubRoutines::x86::k256_addr();
+ address pshuffle_byte_flip_mask = StubRoutines::x86::pshuffle_byte_flip_mask_addr();
+
+ bind(start);
+ movdqu(state0, Address(state, 0));
+ movdqu(state1, Address(state, 16));
+
+ pshufd(state0, state0, 0xB1);
+ pshufd(state1, state1, 0x1B);
+ movdqa(msgtmp4, state0);
+ palignr(state0, state1, 8);
+ pblendw(state1, msgtmp4, 0xF0);
+
+#ifdef _LP64
+ movdqu(shuf_mask, ExternalAddress(pshuffle_byte_flip_mask)); // 64-bit keeps the byte-flip mask in a register; 32-bit uses it as a memory operand of pshufb
+#endif
+ lea(rax, ExternalAddress(K256)); // rax = base of the K256 table; each group of four rounds adds the 16 bytes at the next offset (0 .. 240)
+
+ bind(loop0);
+ movdqu(Address(rsp, 0), state0);
+ movdqu(Address(rsp, 16), state1);
+
+ // Rounds 0-3
+ movdqu(msg, Address(buf, 0));
+#ifdef _LP64
+ pshufb(msg, shuf_mask);
+#else
+ pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
+#endif
+ movdqa(msgtmp0, msg);
+ paddd(msg, Address(rax, 0));
+ sha256rnds2(state1, state0);
+ pshufd(msg, msg, 0x0E); // bring the upper two dwords of msg down for the second pair of rounds
+ sha256rnds2(state0, state1);
+
+ // Rounds 4-7
+ movdqu(msg, Address(buf, 16));
+#ifdef _LP64
+ pshufb(msg, shuf_mask);
+#else
+ pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
+#endif
+ movdqa(msgtmp1, msg);
+ paddd(msg, Address(rax, 16));
+ sha256rnds2(state1, state0);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp0, msgtmp1);
+
+ // Rounds 8-11
+ movdqu(msg, Address(buf, 32));
+#ifdef _LP64
+ pshufb(msg, shuf_mask);
+#else
+ pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
+#endif
+ movdqa(msgtmp2, msg);
+ paddd(msg, Address(rax, 32));
+ sha256rnds2(state1, state0);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp1, msgtmp2);
+
+ // Rounds 12-15
+ movdqu(msg, Address(buf, 48));
+#ifdef _LP64
+ pshufb(msg, shuf_mask);
+#else
+ pshufb(msg, ExternalAddress(pshuffle_byte_flip_mask));
+#endif
+ movdqa(msgtmp3, msg);
+ paddd(msg, Address(rax, 48));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp3);
+ palignr(msgtmp4, msgtmp2, 4);
+ paddd(msgtmp0, msgtmp4);
+ sha256msg2(msgtmp0, msgtmp3);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp2, msgtmp3);
+
+ // Rounds 16-19
+ movdqa(msg, msgtmp0);
+ paddd(msg, Address(rax, 64));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp0);
+ palignr(msgtmp4, msgtmp3, 4);
+ paddd(msgtmp1, msgtmp4);
+ sha256msg2(msgtmp1, msgtmp0);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp3, msgtmp0);
+
+ // Rounds 20-23
+ movdqa(msg, msgtmp1);
+ paddd(msg, Address(rax, 80));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp1);
+ palignr(msgtmp4, msgtmp0, 4);
+ paddd(msgtmp2, msgtmp4);
+ sha256msg2(msgtmp2, msgtmp1);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp0, msgtmp1);
+
+ // Rounds 24-27
+ movdqa(msg, msgtmp2);
+ paddd(msg, Address(rax, 96));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp2);
+ palignr(msgtmp4, msgtmp1, 4);
+ paddd(msgtmp3, msgtmp4);
+ sha256msg2(msgtmp3, msgtmp2);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp1, msgtmp2);
+
+ // Rounds 28-31
+ movdqa(msg, msgtmp3);
+ paddd(msg, Address(rax, 112));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp3);
+ palignr(msgtmp4, msgtmp2, 4);
+ paddd(msgtmp0, msgtmp4);
+ sha256msg2(msgtmp0, msgtmp3);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp2, msgtmp3);
+
+ // Rounds 32-35
+ movdqa(msg, msgtmp0);
+ paddd(msg, Address(rax, 128));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp0);
+ palignr(msgtmp4, msgtmp3, 4);
+ paddd(msgtmp1, msgtmp4);
+ sha256msg2(msgtmp1, msgtmp0);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp3, msgtmp0);
+
+ // Rounds 36-39
+ movdqa(msg, msgtmp1);
+ paddd(msg, Address(rax, 144));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp1);
+ palignr(msgtmp4, msgtmp0, 4);
+ paddd(msgtmp2, msgtmp4);
+ sha256msg2(msgtmp2, msgtmp1);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp0, msgtmp1);
+
+ // Rounds 40-43
+ movdqa(msg, msgtmp2);
+ paddd(msg, Address(rax, 160));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp2);
+ palignr(msgtmp4, msgtmp1, 4);
+ paddd(msgtmp3, msgtmp4);
+ sha256msg2(msgtmp3, msgtmp2);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp1, msgtmp2);
+
+ // Rounds 44-47
+ movdqa(msg, msgtmp3);
+ paddd(msg, Address(rax, 176));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp3);
+ palignr(msgtmp4, msgtmp2, 4);
+ paddd(msgtmp0, msgtmp4);
+ sha256msg2(msgtmp0, msgtmp3);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp2, msgtmp3);
+
+ // Rounds 48-51
+ movdqa(msg, msgtmp0);
+ paddd(msg, Address(rax, 192));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp0);
+ palignr(msgtmp4, msgtmp3, 4);
+ paddd(msgtmp1, msgtmp4);
+ sha256msg2(msgtmp1, msgtmp0);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ sha256msg1(msgtmp3, msgtmp0);
+
+ // Rounds 52-55
+ movdqa(msg, msgtmp1);
+ paddd(msg, Address(rax, 208));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp1);
+ palignr(msgtmp4, msgtmp0, 4);
+ paddd(msgtmp2, msgtmp4);
+ sha256msg2(msgtmp2, msgtmp1);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+
+ // Rounds 56-59
+ movdqa(msg, msgtmp2);
+ paddd(msg, Address(rax, 224));
+ sha256rnds2(state1, state0);
+ movdqa(msgtmp4, msgtmp2);
+ palignr(msgtmp4, msgtmp1, 4);
+ paddd(msgtmp3, msgtmp4);
+ sha256msg2(msgtmp3, msgtmp2);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+
+ // Rounds 60-63
+ movdqa(msg, msgtmp3);
+ paddd(msg, Address(rax, 240));
+ sha256rnds2(state1, state0);
+ pshufd(msg, msg, 0x0E);
+ sha256rnds2(state0, state1);
+ movdqu(msg, Address(rsp, 0)); // add the state saved at the top of this block
+ paddd(state0, msg);
+ movdqu(msg, Address(rsp, 16));
+ paddd(state1, msg);
+
+ if (multi_block) {
+ // increment data pointer and loop if more to process
+ addptr(buf, 64);
+ addptr(ofs, 64);
+ cmpptr(ofs, limit);
+ jcc(Assembler::belowEqual, loop0); // unsigned compare: another block while ofs <= limit
+ movptr(rax, ofs); //return ofs
+ }
+
+ pshufd(state0, state0, 0x1B);
+ pshufd(state1, state1, 0xB1);
+ movdqa(msgtmp4, state0);
+ pblendw(state0, state1, 0xF0);
+ palignr(state1, msgtmp4, 8);
+
+ movdqu(Address(state, 0), state0);
+ movdqu(Address(state, 16), state1);
+
+ bind(done_hash);
+
+}
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp Thu Mar 03 22:02:13 2016 -0800
@@ -3068,6 +3068,136 @@
return start;
}
+ address generate_upper_word_mask() { // emits a 64-byte-aligned 16-byte constant in which only the highest dword is set
+ __ align(64);
+ StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
+ const address mask_base = __ pc();
+ for (int zero_dwords = 3; zero_dwords > 0; zero_dwords--) { // the three lower dwords are zero
+ __ emit_data(0x00000000, relocInfo::none, 0);
+ }
+ __ emit_data(0xFFFFFFFF, relocInfo::none, 0); // highest dword: all ones
+ return mask_base;
+ }
+
+ address generate_shuffle_byte_flip_mask() { // emits the 64-byte-aligned 16-byte shuffle control that fast_sha1 loads for pshufb
+ __ align(64);
+ StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
+ const address mask_base = __ pc();
+ __ emit_data(0x0c0d0e0f, relocInfo::none, 0); // dword 0 (lowest address)
+ __ emit_data(0x08090a0b, relocInfo::none, 0); // dword 1
+ __ emit_data(0x04050607, relocInfo::none, 0); // dword 2
+ __ emit_data(0x00010203, relocInfo::none, 0); // dword 3 (highest address)
+ return mask_base;
+ }
+
+ // ofs and limit are used for multi-block byte array.
+ // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+ address generate_sha1_implCompress(bool multi_block, const char *name) {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", name);
+ address start = __ pc();
+ // The four arguments are read from the frame at rbp + 8 .. rbp + 20 once enter() has set up rbp.
+ Register buf = rax;
+ Register state = rdx;
+ Register ofs = rcx;
+ Register limit = rdi; // saved and restored around the body in the multi-block variant
+
+ const Address buf_param(rbp, 8 + 0);
+ const Address state_param(rbp, 8 + 4);
+ const Address ofs_param(rbp, 8 + 8);
+ const Address limit_param(rbp, 8 + 12);
+
+ const XMMRegister abcd = xmm0;
+ const XMMRegister e0 = xmm1;
+ const XMMRegister e1 = xmm2;
+ const XMMRegister msg0 = xmm3;
+
+ const XMMRegister msg1 = xmm4;
+ const XMMRegister msg2 = xmm5;
+ const XMMRegister msg3 = xmm6;
+ const XMMRegister shuf_mask = xmm7;
+
+ __ enter();
+ if (multi_block) {
+ __ push(limit); // must be pushed before the scratch area is reserved, so the saved value lies above it
+ }
+ __ subptr(rsp, 8 * wordSize); // scratch for fast_sha1, which stores 16 bytes at [rsp+0] and 16 bytes at [rsp+16]; pushing after this would place the saved register at [rsp] where it gets overwritten
+ __ movptr(buf, buf_param);
+ __ movptr(state, state_param);
+ if (multi_block) {
+ __ movptr(ofs, ofs_param);
+ __ movptr(limit, limit_param);
+ }
+
+ __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
+ buf, state, ofs, limit, rsp, multi_block);
+ // Release the scratch area first so that rsp points at the saved limit register again.
+ __ addptr(rsp, 8 * wordSize);
+ if (multi_block) {
+ __ pop(limit);
+ }
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
+ address generate_pshuffle_byte_flip_mask() { // emits the 64-byte-aligned 16-byte shuffle control that fast_sha256 uses with pshufb
+ __ align(64);
+ StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
+ const address mask_base = __ pc();
+ __ emit_data(0x00010203, relocInfo::none, 0); // dword 0 (lowest address)
+ __ emit_data(0x04050607, relocInfo::none, 0); // dword 1
+ __ emit_data(0x08090a0b, relocInfo::none, 0); // dword 2
+ __ emit_data(0x0c0d0e0f, relocInfo::none, 0); // dword 3 (highest address)
+ return mask_base;
+ }
+
+ // ofs and limit are used for multi-block byte array.
+ // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+ address generate_sha256_implCompress(bool multi_block, const char *name) {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", name);
+ address start = __ pc();
+ // The four arguments are read from the frame at rbp + 8 .. rbp + 20 once enter() has set up rbp.
+ Register buf = rbx; // rax is not available here: fast_sha256 loads the K256 table address into rax
+ Register state = rsi;
+ Register ofs = rdx;
+ Register limit = rcx;
+
+ const Address buf_param(rbp, 8 + 0);
+ const Address state_param(rbp, 8 + 4);
+ const Address ofs_param(rbp, 8 + 8);
+ const Address limit_param(rbp, 8 + 12);
+
+ const XMMRegister msg = xmm0; // must be xmm0: it is the implicit operand of sha256rnds2
+ const XMMRegister state0 = xmm1;
+ const XMMRegister state1 = xmm2;
+ const XMMRegister msgtmp0 = xmm3;
+
+ const XMMRegister msgtmp1 = xmm4;
+ const XMMRegister msgtmp2 = xmm5;
+ const XMMRegister msgtmp3 = xmm6;
+ const XMMRegister msgtmp4 = xmm7;
+
+ __ enter();
+ __ subptr(rsp, 8 * wordSize); // scratch for fast_sha256, which stores 16 bytes at [rsp+0] and 16 bytes at [rsp+16]
+ handleSOERegisters(true /*saving*/); // NOTE(review): handleSOERegisters is defined outside this chunk - confirm its save slots do not overlap the [rsp, rsp+32) scratch written by fast_sha256
+ __ movptr(buf, buf_param);
+ __ movptr(state, state_param);
+ if (multi_block) {
+ __ movptr(ofs, ofs_param);
+ __ movptr(limit, limit_param);
+ }
+
+ __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
+ buf, state, ofs, limit, rsp, multi_block);
+
+ handleSOERegisters(false);
+ __ addptr(rsp, 8 * wordSize);
+ __ leave();
+ __ ret(0);
+ return start;
+ }
// byte swap x86 long
address generate_ghash_long_swap_mask() {
@@ -3772,6 +3902,19 @@
StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
}
+ if (UseSHA1Intrinsics) {
+ StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
+ StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
+ StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
+ StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
+ }
+ if (UseSHA256Intrinsics) {
+ StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
+ StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
+ StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
+ StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
+ }
+
// Generate GHASH intrinsics code
if (UseGHASHIntrinsics) {
StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp Thu Mar 03 22:02:13 2016 -0800
@@ -3695,6 +3695,133 @@
return start;
}
+ address generate_upper_word_mask() { // emits the 16-byte "upper_word_mask" data constant and returns its address
+ __ align(64); // start the constant on a 64-byte boundary
+ StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
+ address start = __ pc(); // address of the first quadword emitted below
+ __ emit_data64(0x0000000000000000, relocInfo::none); // first quadword: all bits clear
+ __ emit_data64(0xFFFFFFFF00000000, relocInfo::none); // second quadword: only its upper 32 bits set
+ return start;
+ }
+
+ address generate_shuffle_byte_flip_mask() { // emits the 16-byte "shuffle_byte_flip_mask" data constant and returns its address
+ __ align(64); // start the constant on a 64-byte boundary
+ StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
+ address start = __ pc(); // address of the first quadword emitted below
+ __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none); // little-endian bytes in memory: 0f 0e 0d 0c 0b 0a 09 08
+ __ emit_data64(0x0001020304050607, relocInfo::none); // little-endian bytes in memory: 07 06 05 04 03 02 01 00
+ return start; // NOTE(review): byte i holds 15-i, presumably a pshufb control for a full 16-byte reversal - confirm in fast_sha1
+ }
+
+ // ofs and limit are used only for the multi-block byte array variant.
+ // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+ address generate_sha1_implCompress(bool multi_block, const char *name) { // name labels the StubCodeMark; multi_block is forwarded to fast_sha1
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", name);
+ address start = __ pc(); // entry point of the generated stub, returned to the caller
+
+ Register buf = c_rarg0; // arguments are taken from the first four C argument registers
+ Register state = c_rarg1;
+ Register ofs = c_rarg2;
+ Register limit = c_rarg3;
+
+ const XMMRegister abcd = xmm0; // xmm0-xmm7 are all handed to fast_sha1 below
+ const XMMRegister e0 = xmm1;
+ const XMMRegister e1 = xmm2;
+ const XMMRegister msg0 = xmm3;
+
+ const XMMRegister msg1 = xmm4;
+ const XMMRegister msg2 = xmm5;
+ const XMMRegister msg3 = xmm6;
+ const XMMRegister shuf_mask = xmm7;
+
+ __ enter();
+
+#ifdef _WIN64
+ // save xmm6 and xmm7 (used here as msg3 and shuf_mask), which must be preserved for the caller
+ __ subptr(rsp, 4 * wordSize); // two XMM save slots, 2 * wordSize apart
+ __ movdqu(Address(rsp, 0), xmm6);
+ __ movdqu(Address(rsp, 2 * wordSize), xmm7);
+#endif
+
+ __ subptr(rsp, 4 * wordSize); // reserve 4 more words; rsp is passed to fast_sha1 as an argument
+
+ __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
+ buf, state, ofs, limit, rsp, multi_block);
+
+ __ addptr(rsp, 4 * wordSize); // release the 4 words reserved before the fast_sha1 call
+#ifdef _WIN64
+ // restore the caller's xmm6 and xmm7 from the save slots and release them
+ __ movdqu(xmm6, Address(rsp, 0));
+ __ movdqu(xmm7, Address(rsp, 2 * wordSize));
+ __ addptr(rsp, 4 * wordSize);
+#endif
+
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
+ address generate_pshuffle_byte_flip_mask() { // emits the 16-byte "pshuffle_byte_flip_mask" data constant and returns its address
+ __ align(64); // start the constant on a 64-byte boundary
+ StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
+ address start = __ pc(); // address of the first quadword emitted below
+ __ emit_data64(0x0405060700010203, relocInfo::none); // little-endian bytes in memory: 03 02 01 00 07 06 05 04
+ __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none); // little-endian bytes in memory: 0b 0a 09 08 0f 0e 0d 0c
+ return start; // NOTE(review): pattern reverses the bytes inside each 32-bit lane, presumably as a pshufb control - confirm in fast_sha256
+ }
+
+// ofs and limit are used only for the multi-block byte array variant.
+// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
+ address generate_sha256_implCompress(bool multi_block, const char *name) { // name labels the StubCodeMark; multi_block is forwarded to fast_sha256
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", name);
+ address start = __ pc(); // entry point of the generated stub, returned to the caller
+
+ Register buf = c_rarg0; // arguments are taken from the first four C argument registers
+ Register state = c_rarg1;
+ Register ofs = c_rarg2;
+ Register limit = c_rarg3;
+
+ const XMMRegister msg = xmm0; // xmm0-xmm8 are all handed to fast_sha256 below
+ const XMMRegister state0 = xmm1;
+ const XMMRegister state1 = xmm2;
+ const XMMRegister msgtmp0 = xmm3;
+
+ const XMMRegister msgtmp1 = xmm4;
+ const XMMRegister msgtmp2 = xmm5;
+ const XMMRegister msgtmp3 = xmm6;
+ const XMMRegister msgtmp4 = xmm7;
+
+ const XMMRegister shuf_mask = xmm8;
+
+ __ enter();
+#ifdef _WIN64
+ // save xmm6, xmm7 and xmm8 (used here as msgtmp3, msgtmp4 and shuf_mask), which must be preserved for the caller
+ __ subptr(rsp, 6 * wordSize); // three XMM save slots, 2 * wordSize apart
+ __ movdqu(Address(rsp, 0), xmm6);
+ __ movdqu(Address(rsp, 2 * wordSize), xmm7);
+ __ movdqu(Address(rsp, 4 * wordSize), xmm8);
+#endif
+
+ __ subptr(rsp, 4 * wordSize); // reserve 4 more words; rsp is passed to fast_sha256 as an argument
+
+ __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
+ buf, state, ofs, limit, rsp, multi_block, shuf_mask);
+
+ __ addptr(rsp, 4 * wordSize); // release the 4 words reserved before the fast_sha256 call
+#ifdef _WIN64
+ // restore the caller's xmm6, xmm7 and xmm8 from the save slots and release them
+ __ movdqu(xmm6, Address(rsp, 0));
+ __ movdqu(xmm7, Address(rsp, 2 * wordSize));
+ __ movdqu(xmm8, Address(rsp, 4 * wordSize));
+ __ addptr(rsp, 6 * wordSize);
+#endif
+ __ leave();
+ __ ret(0);
+ return start;
+ }
+
// This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
// to hide instruction latency
//
@@ -4974,6 +5101,19 @@
StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
}
+ if (UseSHA1Intrinsics) {
+ StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
+ StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
+ StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
+ StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
+ }
+ if (UseSHA256Intrinsics) {
+ StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
+ StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
+ StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
+ StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
+ }
+
// Generate GHASH intrinsics code
if (UseGHASHIntrinsics) {
StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp Thu Mar 03 22:02:13 2016 -0800
@@ -29,6 +29,12 @@
#include "runtime/thread.inline.hpp"
#include "crc32c.h"
+#ifdef _MSC_VER
+#define ALIGNED_(x) __declspec(align(x)) // MSVC spelling: request x-byte alignment for the declaration that follows
+#else
+#define ALIGNED_(x) __attribute__ ((aligned(x))) // attribute spelling used when _MSC_VER is not defined
+#endif
+
// Implementation of the platform-specific part of StubRoutines - for
// a description of how to extend it, see the stubRoutines.hpp file.
@@ -37,6 +43,10 @@
address StubRoutines::x86::_counter_shuffle_mask_addr = NULL;
address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;
+address StubRoutines::x86::_upper_word_mask_addr = NULL;
+address StubRoutines::x86::_shuffle_byte_flip_mask_addr = NULL;
+address StubRoutines::x86::_k256_adr = NULL;
+address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = NULL;
uint64_t StubRoutines::x86::_crc_by128_masks[] =
{
@@ -236,3 +246,23 @@
_crc32c_table = (juint*)pclmulqdq_table;
}
}
+
+ALIGNED_(64) juint StubRoutines::x86::_k256[] = // k256 table for sha256: 64 entries, storage aligned to 64 bytes
+{
+ 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL,
+ 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL,
+ 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL,
+ 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL,
+ 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL,
+ 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL,
+ 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL,
+ 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL,
+ 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL,
+ 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL,
+ 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL,
+ 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL,
+ 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL,
+ 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL,
+ 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL,
+ 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL
+};
--- a/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/stubRoutines_x86.hpp Thu Mar 03 22:02:13 2016 -0800
@@ -46,6 +46,17 @@
static address _ghash_long_swap_mask_addr;
static address _ghash_byte_swap_mask_addr;
+ // upper word mask for sha1
+ static address _upper_word_mask_addr;
+ // byte flip mask for sha1
+ static address _shuffle_byte_flip_mask_addr;
+
+ // k256 table for sha256
+ static juint _k256[];
+ static address _k256_adr;
+ // byte flip mask for sha256
+ static address _pshuffle_byte_flip_mask_addr;
+
public:
static address verify_mxcsr_entry() { return _verify_mxcsr_entry; }
static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; }
@@ -53,5 +64,9 @@
static address crc_by128_masks_addr() { return (address)_crc_by128_masks; }
static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; }
static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; }
+ static address upper_word_mask_addr() { return _upper_word_mask_addr; }
+ static address shuffle_byte_flip_mask_addr() { return _shuffle_byte_flip_mask_addr; }
+ static address k256_addr() { return _k256_adr; }
+ static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
static void generate_CRC32C_table(bool is_pclmulqdq_supported);
#endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP
--- a/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp Thu Mar 03 22:02:13 2016 -0800
@@ -68,10 +68,11 @@
declare_constant(VM_Version::CPU_AVX512DQ) \
declare_constant(VM_Version::CPU_AVX512PF) \
declare_constant(VM_Version::CPU_AVX512ER) \
- declare_constant(VM_Version::CPU_AVX512CD) \
- declare_constant(VM_Version::CPU_AVX512BW)
+ declare_constant(VM_Version::CPU_AVX512CD)
#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
- declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL)
+ declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
+ declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
+ declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
#endif // CPU_X86_VM_VMSTRUCTS_X86_HPP
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp Thu Mar 03 22:02:13 2016 -0800
@@ -577,7 +577,7 @@
}
char buf[256];
- jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+ jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""),
@@ -608,7 +608,8 @@
(supports_bmi1() ? ", bmi1" : ""),
(supports_bmi2() ? ", bmi2" : ""),
(supports_adx() ? ", adx" : ""),
- (supports_evex() ? ", evex" : ""));
+ (supports_evex() ? ", evex" : ""),
+ (supports_sha() ? ", sha" : ""));
_features_string = os::strdup(buf);
// UseSSE is set to the smaller of what hardware supports and what
@@ -730,17 +731,29 @@
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
- if (UseSHA) {
+ if (supports_sha()) {
+ if (FLAG_IS_DEFAULT(UseSHA)) {
+ UseSHA = true;
+ }
+ } else if (UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);
}
- if (UseSHA1Intrinsics) {
+ if (UseSHA) {
+ if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) {
+ FLAG_SET_DEFAULT(UseSHA1Intrinsics, true);
+ }
+ } else if (UseSHA1Intrinsics) {
warning("Intrinsics for SHA-1 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
}
- if (UseSHA256Intrinsics) {
+ if (UseSHA) {
+ if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
+ FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
+ }
+ } else if (UseSHA256Intrinsics) {
warning("Intrinsics for SHA-224 and SHA-256 crypto hash functions not available on this CPU.");
FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
}
@@ -750,6 +763,10 @@
FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
}
+ if (!(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics)) {
+ FLAG_SET_DEFAULT(UseSHA, false);
+ }
+
if (UseAdler32Intrinsics) {
warning("Adler32Intrinsics not available on this CPU.");
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp Thu Mar 03 22:02:13 2016 -0800
@@ -221,7 +221,7 @@
avx512pf : 1,
avx512er : 1,
avx512cd : 1,
- : 1,
+ sha : 1,
avx512bw : 1,
avx512vl : 1;
} bits;
@@ -282,11 +282,13 @@
CPU_AVX512DQ = (1 << 27),
CPU_AVX512PF = (1 << 28),
CPU_AVX512ER = (1 << 29),
- CPU_AVX512CD = (1 << 30),
- CPU_AVX512BW = (1 << 31)
+ CPU_AVX512CD = (1 << 30)
+ // Keeping sign bit 31 unassigned.
};
-#define CPU_AVX512VL UCONST64(0x100000000) // EVEX instructions with smaller vector length : enums are limited to 32bit
+#define CPU_AVX512BW ((uint64_t)UCONST64(0x100000000)) // bit 32; defined as a macro because enums are limited to 31 bits
+#define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // bit 33; EVEX instructions with smaller vector length
+#define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // bit 34; SHA instructions
enum Extended_Family {
// AMD
@@ -516,6 +518,8 @@
result |= CPU_ADX;
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
result |= CPU_BMI2;
+ if (_cpuid_info.sef_cpuid7_ebx.bits.sha != 0)
+ result |= CPU_SHA;
if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
result |= CPU_LZCNT;
// for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
@@ -721,6 +725,7 @@
static bool supports_avx512nobw() { return (supports_evex() && !supports_avx512bw()); }
static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
static bool supports_avxonly() { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
+ static bool supports_sha() { return (_features & CPU_SHA) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java Thu Mar 03 22:02:13 2016 -0800
@@ -203,7 +203,8 @@
AVX512ER,
AVX512CD,
AVX512BW,
- AVX512VL
+ AVX512VL,
+ SHA
}
private final EnumSet<CPUFeature> features;
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java Thu Mar 03 22:02:13 2016 -0800
@@ -122,6 +122,9 @@
if ((config.vmVersionFeatures & config.amd64AVX512VL) != 0) {
features.add(AMD64.CPUFeature.AVX512VL);
}
+ if ((config.vmVersionFeatures & config.amd64SHA) != 0) {
+ features.add(AMD64.CPUFeature.SHA);
+ }
return features;
}
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotVMConfig.java Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot/src/jdk/vm/ci/hotspot/HotSpotVMConfig.java Thu Mar 03 22:02:13 2016 -0800
@@ -945,6 +945,7 @@
@HotSpotVMConstant(name = "VM_Version::CPU_AVX512CD", archs = {"amd64"}) @Stable public long amd64AVX512CD;
@HotSpotVMConstant(name = "VM_Version::CPU_AVX512BW", archs = {"amd64"}) @Stable public long amd64AVX512BW;
@HotSpotVMConstant(name = "VM_Version::CPU_AVX512VL", archs = {"amd64"}) @Stable public long amd64AVX512VL;
+ @HotSpotVMConstant(name = "VM_Version::CPU_SHA", archs = {"amd64"}) @Stable public long amd64SHA;
// SPARC specific values
@HotSpotVMConstant(name = "VM_Version::vis3_instructions_m", archs = {"sparc"}) @Stable public int sparcVis3Instructions;
--- a/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp Thu Mar 03 22:02:13 2016 -0800
@@ -639,11 +639,12 @@
declare_constant(VM_Version::CPU_AVX512DQ) \
declare_constant(VM_Version::CPU_AVX512PF) \
declare_constant(VM_Version::CPU_AVX512ER) \
- declare_constant(VM_Version::CPU_AVX512CD) \
- declare_constant(VM_Version::CPU_AVX512BW)
+ declare_constant(VM_Version::CPU_AVX512CD)
#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
- declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL)
+ declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
+ declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
+ declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
#endif // TARGET_ARCH_x86
--- a/hotspot/src/share/vm/runtime/globals.hpp Fri Mar 04 01:30:11 2016 +0300
+++ b/hotspot/src/share/vm/runtime/globals.hpp Thu Mar 03 22:02:13 2016 -0800
@@ -725,7 +725,7 @@
\
product(bool, UseSHA, false, \
"Control whether SHA instructions can be used " \
- "on SPARC and on ARM") \
+ "on SPARC, on ARM and on x86") \
\
product(bool, UseGHASHIntrinsics, false, \
"Use intrinsics for GHASH versions of crypto") \