--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp Tue Sep 12 19:03:39 2017 +0200
@@ -0,0 +1,2238 @@
+/*
+ * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_X86_VM_ASSEMBLER_X86_HPP
+#define CPU_X86_VM_ASSEMBLER_X86_HPP
+
+#include "asm/register.hpp"
+#include "vm_version_x86.hpp"
+
+class BiasedLockingCounters;
+
+// Contains all the definitions needed for x86 assembly code generation.
+
+// Calling convention: how many registers carry arguments in the C (_c) and Java (_j) conventions.
+class Argument VALUE_OBJ_CLASS_SPEC {
+ public:
+ enum { // anonymous: the constants are referenced as Argument::n_...
+#ifdef _LP64
+#ifdef _WIN64
+ n_int_register_parameters_c = 4, // rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
+ n_float_register_parameters_c = 4, // xmm0 - xmm3 (c_farg0, c_farg1, ... )
+#else
+ n_int_register_parameters_c = 6, // rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
+ n_float_register_parameters_c = 8, // xmm0 - xmm7 (c_farg0, c_farg1, ... )
+#endif // _WIN64
+ n_int_register_parameters_j = 6, // j_rarg0, j_rarg1, ... (same count on every 64-bit platform)
+ n_float_register_parameters_j = 8 // j_farg0, j_farg1, ... (same count on every 64-bit platform)
+#else
+ n_register_parameters = 0 // 0 registers used to pass arguments; the only constant defined without _LP64
+#endif // _LP64
+ };
+};
+
+
+#ifdef _LP64
+// Symbolically name the register arguments used by the c calling convention.
+// Windows is different from linux/solaris. So much for standards...
+
+#ifdef _WIN64
+
+REGISTER_DECLARATION(Register, c_rarg0, rcx);
+REGISTER_DECLARATION(Register, c_rarg1, rdx);
+REGISTER_DECLARATION(Register, c_rarg2, r8);
+REGISTER_DECLARATION(Register, c_rarg3, r9);
+
+REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
+REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
+REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
+REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);
+
+#else
+
+REGISTER_DECLARATION(Register, c_rarg0, rdi);
+REGISTER_DECLARATION(Register, c_rarg1, rsi);
+REGISTER_DECLARATION(Register, c_rarg2, rdx);
+REGISTER_DECLARATION(Register, c_rarg3, rcx);
+REGISTER_DECLARATION(Register, c_rarg4, r8);
+REGISTER_DECLARATION(Register, c_rarg5, r9);
+
+REGISTER_DECLARATION(XMMRegister, c_farg0, xmm0);
+REGISTER_DECLARATION(XMMRegister, c_farg1, xmm1);
+REGISTER_DECLARATION(XMMRegister, c_farg2, xmm2);
+REGISTER_DECLARATION(XMMRegister, c_farg3, xmm3);
+REGISTER_DECLARATION(XMMRegister, c_farg4, xmm4);
+REGISTER_DECLARATION(XMMRegister, c_farg5, xmm5);
+REGISTER_DECLARATION(XMMRegister, c_farg6, xmm6);
+REGISTER_DECLARATION(XMMRegister, c_farg7, xmm7);
+
+#endif // _WIN64
+
+// Symbolically name the register arguments used by the Java calling convention.
+// We have control over the convention for java so we can do what we please.
+// What pleases us is to offset the java calling convention so that when
+// we call a suitable jni method the arguments are lined up and we don't
+// have to do any shuffling. A suitable jni method is non-static and has a
+// small number of arguments (two fewer args on windows)
+//
+// |-------------------------------------------------------|
+// | c_rarg0 c_rarg1 c_rarg2 c_rarg3 c_rarg4 c_rarg5 |
+// |-------------------------------------------------------|
+// | rcx rdx r8 r9 rdi* rsi* | windows (* not a c_rarg)
+// | rdi rsi rdx rcx r8 r9 | solaris/linux
+// |-------------------------------------------------------|
+// | j_rarg5 j_rarg0 j_rarg1 j_rarg2 j_rarg3 j_rarg4 |
+// |-------------------------------------------------------|
+
+REGISTER_DECLARATION(Register, j_rarg0, c_rarg1); // rdx on Windows, rsi elsewhere
+REGISTER_DECLARATION(Register, j_rarg1, c_rarg2); // r8 on Windows, rdx elsewhere
+REGISTER_DECLARATION(Register, j_rarg2, c_rarg3); // r9 on Windows, rcx elsewhere
+// Windows has only four c_rargs, so j_rarg3/j_rarg4 use rdi/rsi there (which are not c_rargs on Windows)
+#ifdef _WIN64
+REGISTER_DECLARATION(Register, j_rarg3, rdi);
+REGISTER_DECLARATION(Register, j_rarg4, rsi);
+#else
+REGISTER_DECLARATION(Register, j_rarg3, c_rarg4); // r8
+REGISTER_DECLARATION(Register, j_rarg4, c_rarg5); // r9
+#endif /* _WIN64 */
+REGISTER_DECLARATION(Register, j_rarg5, c_rarg0); // rcx on Windows, rdi elsewhere
+
+REGISTER_DECLARATION(XMMRegister, j_farg0, xmm0);
+REGISTER_DECLARATION(XMMRegister, j_farg1, xmm1);
+REGISTER_DECLARATION(XMMRegister, j_farg2, xmm2);
+REGISTER_DECLARATION(XMMRegister, j_farg3, xmm3);
+REGISTER_DECLARATION(XMMRegister, j_farg4, xmm4);
+REGISTER_DECLARATION(XMMRegister, j_farg5, xmm5);
+REGISTER_DECLARATION(XMMRegister, j_farg6, xmm6);
+REGISTER_DECLARATION(XMMRegister, j_farg7, xmm7);
+
+REGISTER_DECLARATION(Register, rscratch1, r10); // volatile
+REGISTER_DECLARATION(Register, rscratch2, r11); // volatile
+
+REGISTER_DECLARATION(Register, r12_heapbase, r12); // callee-saved
+REGISTER_DECLARATION(Register, r15_thread, r15); // callee-saved
+
+#else
+// rscratch1 will appear in 32bit code that is dead but of course must compile.
+// Using noreg ensures that if the dead code is incorrectly live and executed it
+// will cause an assertion failure.
+#define rscratch1 noreg
+#define rscratch2 noreg
+
+#endif // _LP64
+
+// JSR 292
+// On x86, the SP does not have to be saved when invoking method handle intrinsics
+// or compiled lambda forms. We indicate that by setting rbp_mh_SP_save to noreg.
+REGISTER_DECLARATION(Register, rbp_mh_SP_save, noreg);
+
+// Address is an abstraction used to represent a memory location
+// using any of the amd64 addressing modes with one object.
+//
+// Note: A register location is represented via a Register, not
+// via an address for efficiency & simplicity reasons.
+
+class ArrayAddress;
+
+class Address VALUE_OBJ_CLASS_SPEC {
+ public:
+ enum ScaleFactor { // log2 of the index multiplier; no_scale marks an address without an index
+ no_scale = -1,
+ times_1 = 0,
+ times_2 = 1,
+ times_4 = 2,
+ times_8 = 3,
+ times_ptr = LP64_ONLY(times_8) NOT_LP64(times_4) // times_8 in LP64 builds, times_4 otherwise
+ };
+ static ScaleFactor times(int size) { // element size in bytes (1, 2, 4 or 8) -> ScaleFactor
+ assert(size >= 1 && size <= 8 && is_power_of_2(size), "bad scale size");
+ if (size == 8) return times_8;
+ if (size == 4) return times_4;
+ if (size == 2) return times_2;
+ return times_1;
+ }
+ static int scale_size(ScaleFactor scale) { // ScaleFactor -> multiplier (1 << scale); no_scale is rejected
+ assert(scale != no_scale, "");
+ assert(((1 << (int)times_1) == 1 &&
+ (1 << (int)times_2) == 2 &&
+ (1 << (int)times_4) == 4 &&
+ (1 << (int)times_8) == 8), "");
+ return (1 << (int)scale);
+ }
+
+ private:
+ Register _base; // base register, noreg if absent
+ Register _index; // index register, noreg if absent
+ ScaleFactor _scale; // multiplier applied to _index
+ int _disp; // constant displacement
+ RelocationHolder _rspec; // relocation for this address; not compared by is_same_address()
+
+ // Easily misused constructors; make them private
+ // %%% can we make these go away?
+ NOT_LP64(Address(address loc, RelocationHolder spec);)
+ Address(int disp, address loc, relocInfo::relocType rtype);
+ Address(int disp, address loc, RelocationHolder spec);
+
+ public:
+
+ int disp() { return _disp; } // non-const overload of the disp() accessor further down
+ // creation: the default Address has no base, no index and a zero displacement
+ Address()
+ : _base(noreg),
+ _index(noreg),
+ _scale(no_scale),
+ _disp(0) {
+ }
+
+ // No default displacement otherwise Register can be implicitly
+ // converted to 0(Register) which is quite a different animal.
+
+ Address(Register base, int disp)
+ : _base(base),
+ _index(noreg),
+ _scale(no_scale),
+ _disp(disp) {
+ }
+ // base + index * scale + disp; index and scale must be both present or both absent
+ Address(Register base, Register index, ScaleFactor scale, int disp = 0)
+ : _base (base),
+ _index(index),
+ _scale(scale),
+ _disp (disp) {
+ assert(!index->is_valid() == (scale == Address::no_scale),
+ "inconsistent address");
+ }
+ // As above, but a constant index is folded into the displacement (multiplied by scale_size(scale))
+ Address(Register base, RegisterOrConstant index, ScaleFactor scale = times_1, int disp = 0)
+ : _base (base),
+ _index(index.register_or_noreg()),
+ _scale(scale),
+ _disp (disp + (index.constant_or_zero() * scale_size(scale))) {
+ if (!index.is_register()) scale = Address::no_scale; // local copy only: _scale keeps the caller's value
+ assert(!_index->is_valid() == (scale == Address::no_scale),
+ "inconsistent address");
+ }
+ // Returns a copy of this address with disp added to its displacement
+ Address plus_disp(int disp) const {
+ Address a = (*this);
+ a._disp += disp;
+ return a;
+ }
+ Address plus_disp(RegisterOrConstant disp, ScaleFactor scale = times_1) const {
+ Address a = (*this);
+ a._disp += disp.constant_or_zero() * scale_size(scale);
+ if (disp.is_register()) { // a register "disp" becomes the index, so none may be set yet
+ assert(!a.index()->is_valid(), "competing indexes");
+ a._index = disp.as_register();
+ a._scale = scale;
+ }
+ return a;
+ }
+ bool is_same_address(Address a) const {
+ // disregard _rspec
+ return _base == a._base && _disp == a._disp && _index == a._index && _scale == a._scale;
+ }
+
+ // The following overloads are used in connection with the
+ // ByteSize type (see sizes.hpp). They simplify the use of
+ // ByteSize'd arguments in assembly code. Note that their equivalent
+ // for the optimized build are the member functions with int disp
+ // argument since ByteSize is mapped to an int type in that case.
+ //
+ // Note: DO NOT introduce similar overloaded functions for WordSize
+ // arguments as in the optimized mode, both ByteSize and WordSize
+ // are mapped to the same type and thus the compiler cannot make a
+ // distinction anymore (=> compiler errors).
+
+#ifdef ASSERT
+ Address(Register base, ByteSize disp)
+ : _base(base),
+ _index(noreg),
+ _scale(no_scale),
+ _disp(in_bytes(disp)) {
+ }
+
+ Address(Register base, Register index, ScaleFactor scale, ByteSize disp)
+ : _base(base),
+ _index(index),
+ _scale(scale),
+ _disp(in_bytes(disp)) {
+ assert(!index->is_valid() == (scale == Address::no_scale),
+ "inconsistent address");
+ }
+
+ Address(Register base, RegisterOrConstant index, ScaleFactor scale, ByteSize disp)
+ : _base (base),
+ _index(index.register_or_noreg()),
+ _scale(scale),
+ _disp (in_bytes(disp) + (index.constant_or_zero() * scale_size(scale))) {
+ if (!index.is_register()) scale = Address::no_scale; // local copy only: _scale keeps the caller's value
+ assert(!_index->is_valid() == (scale == Address::no_scale),
+ "inconsistent address");
+ }
+
+#endif // ASSERT
+
+ // accessors
+ bool uses(Register reg) const { return _base == reg || _index == reg; }
+ Register base() const { return _base; }
+ Register index() const { return _index; }
+ ScaleFactor scale() const { return _scale; }
+ int disp() const { return _disp; }
+
+ // Convert the raw encoding form into the form expected by the constructor for
+ // Address. An index of 4 (rsp) corresponds to having no index, so convert
+ // that to noreg for the Address constructor.
+ static Address make_raw(int base, int index, int scale, int disp, relocInfo::relocType disp_reloc);
+ // NOTE(review): defined out of line; presumably flattens an ArrayAddress into a plain Address - confirm
+ static Address make_array(ArrayAddress);
+
+ private:
+ bool base_needs_rex() const { // base is present and its encoding is 8 or above
+ return _base != noreg && _base->encoding() >= 8;
+ }
+
+ bool index_needs_rex() const { // index is present and its encoding is 8 or above
+ return _index != noreg &&_index->encoding() >= 8;
+ }
+
+ relocInfo::relocType reloc() const { return _rspec.type(); }
+
+ friend class Assembler;
+ friend class MacroAssembler;
+ friend class LIR_Assembler; // base/index/scale/disp
+};
+
+//
+// AddressLiteral has been split out from Address because operands of this type
+// need to be treated specially on 32bit vs. 64bit platforms. By splitting it out
+// the few instructions that need to deal with address literals are unique and the
+// MacroAssembler does not have to implement every instruction in the Assembler
+// in order to search for address literals that may need special handling depending
+// on the instruction and the platform. A small step on the way to merging i486/amd64
+// directories.
+//
+class AddressLiteral VALUE_OBJ_CLASS_SPEC {
+ friend class ArrayAddress;
+ RelocationHolder _rspec; // relocation attached to _target
+ // Typically when we use AddressLiterals we want to use their rval.
+ // However in some situations we want the lval (effective address) of the item.
+ // We provide a special factory, addr(), for making those lvals.
+ bool _is_lval;
+
+ // If the target is far we'll need to load the ea of this to
+ // a register to reach it. Otherwise if near we can do rip
+ // relative addressing.
+
+ address _target;
+
+ protected:
+ // creation: an empty literal (NULL target, rval); usable only by subclasses and friends
+ AddressLiteral()
+ : _is_lval(false),
+ _target(NULL)
+ {}
+
+ public:
+
+ // Defined out of line; presumably derives _rspec from rtype - see the .cpp.
+ AddressLiteral(address target, relocInfo::relocType rtype);
+
+ AddressLiteral(address target, RelocationHolder const& rspec)
+ : _rspec(rspec),
+ _is_lval(false),
+ _target(target)
+ {}
+
+ // Returns a copy of this literal marked as an lval; *this is not modified.
+ AddressLiteral addr() const {
+ AddressLiteral ret = *this;
+ ret._is_lval = true;
+ return ret;
+ }
+
+ private:
+ // Read-only accessors, reachable only through the friends listed below.
+ address target() const { return _target; }
+ bool is_lval() const { return _is_lval; }
+
+ relocInfo::relocType reloc() const { return _rspec.type(); }
+ const RelocationHolder& rspec() const { return _rspec; }
+
+ friend class Assembler;
+ friend class MacroAssembler;
+ friend class Address;
+ friend class LIR_Assembler;
+};
+
+// Convenience classes
+class RuntimeAddress: public AddressLiteral {
+ // An AddressLiteral whose relocation type is always relocInfo::runtime_call_type.
+ public:
+ // Not explicit: a bare 'address' converts implicitly to a RuntimeAddress.
+ RuntimeAddress(address target) : AddressLiteral(target, relocInfo::runtime_call_type) {}
+
+};
+
+class ExternalAddress: public AddressLiteral {
+ private:
+ // Relocation type for target: an ExternalAddress sometimes wraps a value that
+ // is not really an address (e.g. the card table base), and external_word_type
+ // cannot be used for values in the first page, so those get relocInfo::none.
+ static relocInfo::relocType reloc_for_target(address target) {
+ if (!external_word_Relocation::can_be_relocated(target)) {
+ return relocInfo::none;
+ }
+ return relocInfo::external_word_type;
+ }
+
+ public:
+ ExternalAddress(address target) : AddressLiteral(target, reloc_for_target(target)) {}
+};
+
+class InternalAddress: public AddressLiteral {
+ // An AddressLiteral whose relocation type is always relocInfo::internal_word_type.
+ public:
+ // Not explicit: a bare 'address' converts implicitly to an InternalAddress.
+ InternalAddress(address target) : AddressLiteral(target, relocInfo::internal_word_type) {}
+
+};
+
+// x86 can do array addressing as a single operation since disp can be an absolute
+// address; amd64 can't. We create a class that expresses the concept but does extra
+// magic on amd64 to get the final result.
+
+class ArrayAddress VALUE_OBJ_CLASS_SPEC {
+ private:
+ // Pairs a literal array base with an Address that supplies the index part.
+ AddressLiteral _base;
+ Address _index;
+
+ public:
+ // Default: empty base literal and a default Address (no base, no index, zero disp).
+ ArrayAddress() {}
+ ArrayAddress(AddressLiteral base, Address index): _base(base), _index(index) {}
+ AddressLiteral base() const { return _base; } // returned by value (copy)
+ Address index() const { return _index; } // returned by value (copy)
+
+};
+
+class InstructionAttr;
+
+// The 64-bit value reflects the fxsave size, which is 512 bytes, plus the new xsave area on EVEX, which is another 2176 bytes
+// See fxsave and xsave(EVEX enabled) documentation for layout
+const int FPUStateSizeInWords = NOT_LP64(27) LP64_ONLY(2688 / wordSize);
+
+// The Intel x86/Amd64 Assembler: Pure assembler doing NO optimizations on the instruction
+// level (e.g. mov rax, 0 is not translated into xor rax, rax!); i.e., what you write
+// is what you get. The Assembler is generating code into a CodeBuffer.
+
+class Assembler : public AbstractAssembler {
+ friend class AbstractAssembler; // for the non-virtual hack
+ friend class LIR_Assembler; // as_Address()
+ friend class StubGenerator;
+
+ public:
+ enum Condition { // The x86 condition codes used for conditional jumps/moves.
+ zero = 0x4,
+ notZero = 0x5,
+ equal = 0x4, // same encoding as zero
+ notEqual = 0x5, // same encoding as notZero
+ less = 0xc,
+ lessEqual = 0xe,
+ greater = 0xf,
+ greaterEqual = 0xd,
+ below = 0x2,
+ belowEqual = 0x6,
+ above = 0x7,
+ aboveEqual = 0x3,
+ overflow = 0x0,
+ noOverflow = 0x1,
+ carrySet = 0x2, // same encoding as below
+ carryClear = 0x3, // same encoding as aboveEqual
+ negative = 0x8,
+ positive = 0x9,
+ parity = 0xa,
+ noParity = 0xb
+ }; // in this table a condition and its negation differ only in bit 0
+
+ enum Prefix { // instruction prefix byte values
+ // segment overrides
+ CS_segment = 0x2e,
+ SS_segment = 0x36,
+ DS_segment = 0x3e,
+ ES_segment = 0x26,
+ FS_segment = 0x64,
+ GS_segment = 0x65,
+ // REX prefixes: 0x40 | W (0x08) | R (0x04) | X (0x02) | B (0x01), exactly as the names spell out
+ REX = 0x40,
+
+ REX_B = 0x41,
+ REX_X = 0x42,
+ REX_XB = 0x43,
+ REX_R = 0x44,
+ REX_RB = 0x45,
+ REX_RX = 0x46,
+ REX_RXB = 0x47,
+
+ REX_W = 0x48,
+
+ REX_WB = 0x49,
+ REX_WX = 0x4A,
+ REX_WXB = 0x4B,
+ REX_WR = 0x4C,
+ REX_WRB = 0x4D,
+ REX_WRX = 0x4E,
+ REX_WRXB = 0x4F,
+ // leading bytes of the VEX (3- and 2-byte) and EVEX (4-byte) prefix forms, per the names
+ VEX_3bytes = 0xC4,
+ VEX_2bytes = 0xC5,
+ EVEX_4bytes = 0x62,
+ Prefix_EMPTY = 0x0 // no prefix
+ };
+
+ enum VexPrefix { // VEX prefix bit masks (per the names)
+ VEX_B = 0x20,
+ VEX_X = 0x40,
+ VEX_R = 0x80,
+ VEX_W = 0x80 // NOTE(review): same value as VEX_R; presumably the two go into different prefix bytes - confirm in the encoder
+ };
+
+ enum ExexPrefix { // EVEX prefix bit masks. NOTE(review): the name looks like a typo for "EvexPrefix"; left as is because it is a public identifier
+ EVEX_F = 0x04,
+ EVEX_V = 0x08,
+ EVEX_Rb = 0x10,
+ EVEX_X = 0x40,
+ EVEX_Z = 0x80
+ };
+
+ enum VexSimdPrefix { // SIMD prefix selector (none/66/F3/F2, per the names); the 'pre' argument of the *_prefix helpers
+ VEX_SIMD_NONE = 0x0,
+ VEX_SIMD_66 = 0x1,
+ VEX_SIMD_F3 = 0x2,
+ VEX_SIMD_F2 = 0x3
+ };
+
+ enum VexOpcode { // opcode-map selector (0F, 0F 38, 0F 3A, per the names); the 'opc' argument of the *_prefix helpers
+ VEX_OPCODE_NONE = 0x0,
+ VEX_OPCODE_0F = 0x1,
+ VEX_OPCODE_0F_38 = 0x2,
+ VEX_OPCODE_0F_3A = 0x3,
+ VEX_OPCODE_MASK = 0x1F // a mask (low five bits), not a selectable map
+ };
+
+ enum AvxVectorLen { // vector length selector; the names give the width in bits
+ AVX_128bit = 0x0,
+ AVX_256bit = 0x1,
+ AVX_512bit = 0x2,
+ AVX_NoVec = 0x4 // no vector length (per the name); note that 0x3 is unused
+ };
+
+ enum EvexTupleType { // EVEX tuple kinds. NOTE(review): values are not consecutive - presumably row offsets into a table in the .cpp (cf. cur_tuple_type in query_compressed_disp_byte) - confirm
+ EVEX_FV = 0,
+ EVEX_HV = 4,
+ EVEX_FVM = 6,
+ EVEX_T1S = 7,
+ EVEX_T1F = 11,
+ EVEX_T2 = 13,
+ EVEX_T4 = 15,
+ EVEX_T8 = 17,
+ EVEX_HVM = 18,
+ EVEX_QVM = 19,
+ EVEX_OVM = 20,
+ EVEX_M128 = 21,
+ EVEX_DUP = 22,
+ EVEX_ETUP = 23 // NOTE(review): presumably an "empty tuple" marker - confirm
+ };
+
+ enum EvexInputSizeInBits { // input size selector; cf. the in_size_in_bits argument of query_compressed_disp_byte
+ EVEX_8bit = 0,
+ EVEX_16bit = 1,
+ EVEX_32bit = 2,
+ EVEX_64bit = 3,
+ EVEX_NObit = 4 // no input size (per the name)
+ };
+
+ enum WhichOperand {
+ // input to locate_operand, and format code for relocations
+ imm_operand = 0, // embedded 32-bit|64-bit immediate operand
+ disp32_operand = 1, // embedded 32-bit displacement or address
+ call32_operand = 2, // embedded 32-bit self-relative displacement
+#ifndef _LP64
+ _WhichOperand_limit = 3 // one past the last operand kind (builds without _LP64)
+#else
+ narrow_oop_operand = 3, // embedded 32-bit immediate narrow oop
+ _WhichOperand_limit = 4 // one past the last operand kind (_LP64 builds)
+#endif
+ };
+
+ enum ComparisonPredicate { // value + 4 names the negated predicate: eq/neq, lt/nlt, le/nle, _false/_true
+ eq = 0,
+ lt = 1,
+ le = 2,
+ _false = 3,
+ neq = 4,
+ nlt = 5,
+ nle = 6,
+ _true = 7
+ };
+
+
+ // NOTE: The general philosophy of the declarations here is that 64bit versions
+ // of instructions are freely declared without the need for wrapping them in an ifdef.
+ // (Some dangerous instructions are ifdef'd out of inappropriate jvms.)
+ // In the .cpp file the implementations are wrapped so that they are dropped out
+ // of the resulting jvm. This is done mostly to keep the footprint of MINIMAL
+ // to the size it was prior to merging up the 32bit and 64bit assemblers.
+ //
+ // This does mean you'll get a linker/runtime error if you use a 64bit only instruction
+ // in a 32bit vm. This is somewhat unfortunate but keeps the ifdef noise down.
+
+private:
+
+ bool _legacy_mode_bw; // true when VM_Version::supports_avx512bw() is false (set in init_attributes)
+ bool _legacy_mode_dq; // true when VM_Version::supports_avx512dq() is false
+ bool _legacy_mode_vl; // true when VM_Version::supports_avx512vl() is false
+ bool _legacy_mode_vlbw; // true when VM_Version::supports_avx512vlbw() is false
+ bool _is_managed; // toggled by set_managed()/clear_managed(); starts out false
+ bool _vector_masking; // For stub code use only
+
+ class InstructionAttr *_attributes; // set by set_attributes(), NULL after clear_attributes()/init_attributes()
+
+ // 64bit prefixes
+ int prefix_and_encode(int reg_enc, bool byteinst = false);
+ int prefixq_and_encode(int reg_enc);
+
+ int prefix_and_encode(int dst_enc, int src_enc) { // convenience form: dst_is_byte and src_is_byte both false
+ return prefix_and_encode(dst_enc, false, src_enc, false);
+ }
+ int prefix_and_encode(int dst_enc, bool dst_is_byte, int src_enc, bool src_is_byte);
+ int prefixq_and_encode(int dst_enc, int src_enc);
+
+ void prefix(Register reg);
+ void prefix(Register dst, Register src, Prefix p);
+ void prefix(Register dst, Address adr, Prefix p);
+ void prefix(Address adr);
+ void prefixq(Address adr);
+
+ void prefix(Address adr, Register reg, bool byteinst = false);
+ void prefix(Address adr, XMMRegister reg);
+ void prefixq(Address adr, Register reg);
+ void prefixq(Address adr, XMMRegister reg);
+
+ void prefetch_prefix(Address src);
+
+ void rex_prefix(Address adr, XMMRegister xreg,
+ VexSimdPrefix pre, VexOpcode opc, bool rex_w);
+ int rex_prefix_and_encode(int dst_enc, int src_enc,
+ VexSimdPrefix pre, VexOpcode opc, bool rex_w);
+
+ void vex_prefix(bool vex_r, bool vex_b, bool vex_x, int nds_enc, VexSimdPrefix pre, VexOpcode opc);
+
+ void evex_prefix(bool vex_r, bool vex_b, bool vex_x, bool evex_r, bool evex_v,
+ int nds_enc, VexSimdPrefix pre, VexOpcode opc);
+
+ void vex_prefix(Address adr, int nds_enc, int xreg_enc,
+ VexSimdPrefix pre, VexOpcode opc,
+ InstructionAttr *attributes);
+
+ int vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
+ VexSimdPrefix pre, VexOpcode opc,
+ InstructionAttr *attributes);
+
+ void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr, VexSimdPrefix pre,
+ VexOpcode opc, InstructionAttr *attributes);
+
+ int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src, VexSimdPrefix pre,
+ VexOpcode opc, InstructionAttr *attributes);
+
+ // Helper functions for groups of instructions
+ void emit_arith_b(int op1, int op2, Register dst, int imm8);
+
+ void emit_arith(int op1, int op2, Register dst, int32_t imm32);
+ // Force generation of a 4 byte immediate value even if it fits into 8bit
+ void emit_arith_imm32(int op1, int op2, Register dst, int32_t imm32);
+ void emit_arith(int op1, int op2, Register dst, Register src);
+
+ bool emit_compressed_disp_byte(int &disp);
+
+ void emit_operand(Register reg,
+ Register base, Register index, Address::ScaleFactor scale,
+ int disp,
+ RelocationHolder const& rspec,
+ int rip_relative_correction = 0);
+
+ void emit_operand(Register reg, Address adr, int rip_relative_correction = 0);
+
+ // operands that only take the original 32bit registers
+ void emit_operand32(Register reg, Address adr);
+
+ void emit_operand(XMMRegister reg,
+ Register base, Register index, Address::ScaleFactor scale,
+ int disp,
+ RelocationHolder const& rspec);
+
+ void emit_operand(XMMRegister reg, Address adr);
+
+ void emit_operand(MMXRegister reg, Address adr);
+
+ // workaround gcc (3.2.1-7) bug
+ void emit_operand(Address adr, MMXRegister reg);
+
+
+ // Immediate-to-memory forms
+ void emit_arith_operand(int op1, Register rm, Address adr, int32_t imm32);
+
+ void emit_farith(int b1, int b2, int i);
+
+
+ protected:
+ #ifdef ASSERT
+ void check_relocation(RelocationHolder const& rspec, int format);
+ #endif
+
+ void emit_data(jint data, relocInfo::relocType rtype, int format);
+ void emit_data(jint data, RelocationHolder const& rspec, int format);
+ void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
+ void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);
+
+ bool reachable(AddressLiteral adr) NOT_LP64({ return true;});
+
+ // These are all easily abused and hence protected
+
+ // 32BIT ONLY SECTION
+#ifndef _LP64
+ // Make these disappear in 64bit mode since they would never be correct
+ void cmp_literal32(Register src1, int32_t imm32, RelocationHolder const& rspec); // 32BIT ONLY
+ void cmp_literal32(Address src1, int32_t imm32, RelocationHolder const& rspec); // 32BIT ONLY
+
+ void mov_literal32(Register dst, int32_t imm32, RelocationHolder const& rspec); // 32BIT ONLY
+ void mov_literal32(Address dst, int32_t imm32, RelocationHolder const& rspec); // 32BIT ONLY
+
+ void push_literal32(int32_t imm32, RelocationHolder const& rspec); // 32BIT ONLY
+#else
+ // 64BIT ONLY SECTION
+ void mov_literal64(Register dst, intptr_t imm64, RelocationHolder const& rspec); // 64BIT ONLY
+
+ void cmp_narrow_oop(Register src1, int32_t imm32, RelocationHolder const& rspec);
+ void cmp_narrow_oop(Address src1, int32_t imm32, RelocationHolder const& rspec);
+
+ void mov_narrow_oop(Register dst, int32_t imm32, RelocationHolder const& rspec);
+ void mov_narrow_oop(Address dst, int32_t imm32, RelocationHolder const& rspec);
+#endif // _LP64
+
+ // These are unique in that we are ensured by the caller that the 32bit
+ // relative in these instructions will always be able to reach the potentially
+ // 64bit address described by entry. Since they can take a 64bit address they
+ // don't have the 32 suffix like the other instructions in this class.
+
+ void call_literal(address entry, RelocationHolder const& rspec);
+ void jmp_literal(address entry, RelocationHolder const& rspec);
+
+ // Avoid using directly section
+ // Instructions in this section are actually usable by anyone without danger
+ // of failure but have performance issues that are addressed by enhanced
+ // instructions which will do the proper thing based on the particular cpu.
+ // We protect them because we don't trust you...
+
+ // Don't use next inc() and dec() methods directly. INC & DEC instructions
+ // could cause a partial flag stall since they don't set CF flag.
+ // Use MacroAssembler::decrement() & MacroAssembler::increment() methods
+ // which call inc() & dec() or add() & sub() in accordance with
+ // the product flag UseIncDec value.
+
+ void decl(Register dst);
+ void decl(Address dst);
+ void decq(Register dst);
+ void decq(Address dst);
+
+ void incl(Register dst);
+ void incl(Address dst);
+ void incq(Register dst);
+ void incq(Address dst);
+
+ // New cpus require use of movsd and movss to avoid partial register stall
+ // when loading from memory. But for old Opteron use movlpd instead of movsd.
+ // The selection is done in MacroAssembler::movdbl() and movflt().
+
+ // Move Scalar Single-Precision Floating-Point Values
+ void movss(XMMRegister dst, Address src);
+ void movss(XMMRegister dst, XMMRegister src);
+ void movss(Address dst, XMMRegister src);
+
+ // Move Scalar Double-Precision Floating-Point Values
+ void movsd(XMMRegister dst, Address src);
+ void movsd(XMMRegister dst, XMMRegister src);
+ void movsd(Address dst, XMMRegister src);
+ void movlpd(XMMRegister dst, Address src);
+
+ // New cpus require use of movaps and movapd to avoid partial register stall
+ // when moving between registers.
+ void movaps(XMMRegister dst, XMMRegister src);
+ void movapd(XMMRegister dst, XMMRegister src);
+
+ // End avoid using directly
+
+
+ // Instruction prefixes
+ void prefix(Prefix p);
+
+ public:
+
+ // Creation: passes the CodeBuffer to AbstractAssembler, then resets this assembler's attribute state
+ Assembler(CodeBuffer* code) : AbstractAssembler(code) {
+ init_attributes();
+ }
+
+ // Decoding
+ static address locate_operand(address inst, WhichOperand which);
+ static address locate_next_instruction(address inst);
+
+ // Utilities
+ static bool is_polling_page_far() NOT_LP64({ return false;});
+ static bool query_compressed_disp_byte(int disp, bool is_evex_inst, int vector_len,
+ int cur_tuple_type, int in_size_in_bits, int cur_encoding);
+
+ // Generic instructions
+ // Does 32bit or 64bit as needed for the platform. In some sense these
+ // belong in macro assembler but there is no need for both varieties to exist
+
+ void init_attributes(void) { // (re)initializes every attribute field of this assembler
+ _attributes = NULL;
+ _is_managed = false;
+ _vector_masking = false;
+ _legacy_mode_bw = !VM_Version::supports_avx512bw(); // legacy mode means the AVX-512 feature is absent
+ _legacy_mode_dq = !VM_Version::supports_avx512dq();
+ _legacy_mode_vl = !VM_Version::supports_avx512vl();
+ _legacy_mode_vlbw = !VM_Version::supports_avx512vlbw();
+ }
+
+ void set_attributes(InstructionAttr *attributes) { _attributes = attributes; } // just remembers the pointer
+ void clear_attributes(void) { _attributes = NULL; } // forgets the pointer; the object itself is not touched
+
+ void set_managed(void) { _is_managed = true; }
+ void clear_managed(void) { _is_managed = false; }
+ bool is_managed(void) const { return _is_managed; } // const: pure query of the flag
+
+ // Following functions are for stub code use only
+ void set_vector_masking(void) { _vector_masking = true; }
+ void clear_vector_masking(void) { _vector_masking = false; }
+ bool is_vector_masking(void) const { return _vector_masking; } // const: pure query of the flag
+
+ void lea(Register dst, Address src);
+
+ void mov(Register dst, Register src);
+
+ void pusha();
+ void popa();
+
+ void pushf();
+ void popf();
+
+ void push(int32_t imm32);
+
+ void push(Register src);
+
+ void pop(Register dst);
+
+ // These are dummies to prevent surprise implicit conversions to Register
+ void push(void* v);
+ void pop(void* v);
+
+ // These do register sized moves/scans
+ void rep_mov();
+ void rep_stos();
+ void rep_stosb();
+ void repne_scan();
+#ifdef _LP64
+ void repne_scanl();
+#endif
+
+ // Vanilla instructions in lexical order
+
+ void adcl(Address dst, int32_t imm32);
+ void adcl(Address dst, Register src);
+ void adcl(Register dst, int32_t imm32);
+ void adcl(Register dst, Address src);
+ void adcl(Register dst, Register src);
+
+ void adcq(Register dst, int32_t imm32);
+ void adcq(Register dst, Address src);
+ void adcq(Register dst, Register src);
+
+ void addb(Address dst, int imm8);
+ void addw(Address dst, int imm16);
+
+ void addl(Address dst, int32_t imm32);
+ void addl(Address dst, Register src);
+ void addl(Register dst, int32_t imm32);
+ void addl(Register dst, Address src);
+ void addl(Register dst, Register src);
+
+ void addq(Address dst, int32_t imm32);
+ void addq(Address dst, Register src);
+ void addq(Register dst, int32_t imm32);
+ void addq(Register dst, Address src);
+ void addq(Register dst, Register src);
+
+#ifdef _LP64
+ //Add Unsigned Integers with Carry Flag
+ void adcxq(Register dst, Register src);
+
+ //Add Unsigned Integers with Overflow Flag
+ void adoxq(Register dst, Register src);
+#endif
+
+ void addr_nop_4();
+ void addr_nop_5();
+ void addr_nop_7();
+ void addr_nop_8();
+
+ // Add Scalar Double-Precision Floating-Point Values
+ void addsd(XMMRegister dst, Address src);
+ void addsd(XMMRegister dst, XMMRegister src);
+
+ // Add Scalar Single-Precision Floating-Point Values
+ void addss(XMMRegister dst, Address src);
+ void addss(XMMRegister dst, XMMRegister src);
+
+ // AES instructions
+ void aesdec(XMMRegister dst, Address src);
+ void aesdec(XMMRegister dst, XMMRegister src);
+ void aesdeclast(XMMRegister dst, Address src);
+ void aesdeclast(XMMRegister dst, XMMRegister src);
+ void aesenc(XMMRegister dst, Address src);
+ void aesenc(XMMRegister dst, XMMRegister src);
+ void aesenclast(XMMRegister dst, Address src);
+ void aesenclast(XMMRegister dst, XMMRegister src);
+
+
+ void andl(Address dst, int32_t imm32);
+ void andl(Register dst, int32_t imm32);
+ void andl(Register dst, Address src);
+ void andl(Register dst, Register src);
+
+ void andq(Address dst, int32_t imm32);
+ void andq(Register dst, int32_t imm32);
+ void andq(Register dst, Address src);
+ void andq(Register dst, Register src);
+
+ // BMI instructions
+ void andnl(Register dst, Register src1, Register src2);
+ void andnl(Register dst, Register src1, Address src2);
+ void andnq(Register dst, Register src1, Register src2);
+ void andnq(Register dst, Register src1, Address src2);
+
+ void blsil(Register dst, Register src);
+ void blsil(Register dst, Address src);
+ void blsiq(Register dst, Register src);
+ void blsiq(Register dst, Address src);
+
+ void blsmskl(Register dst, Register src);
+ void blsmskl(Register dst, Address src);
+ void blsmskq(Register dst, Register src);
+ void blsmskq(Register dst, Address src);
+
+ void blsrl(Register dst, Register src);
+ void blsrl(Register dst, Address src);
+ void blsrq(Register dst, Register src);
+ void blsrq(Register dst, Address src);
+
+ void bsfl(Register dst, Register src);
+ void bsrl(Register dst, Register src);
+
+#ifdef _LP64
+ void bsfq(Register dst, Register src);
+ void bsrq(Register dst, Register src);
+#endif
+
+ void bswapl(Register reg);
+
+ void bswapq(Register reg);
+
+ void call(Label& L, relocInfo::relocType rtype);
+ void call(Register reg); // push pc; pc <- reg
+ void call(Address adr); // push pc; pc <- adr
+
+ void cdql();
+
+ void cdqq();
+
+ void cld();
+
+ void clflush(Address adr);
+
+ void cmovl(Condition cc, Register dst, Register src);
+ void cmovl(Condition cc, Register dst, Address src);
+
+ void cmovq(Condition cc, Register dst, Register src);
+ void cmovq(Condition cc, Register dst, Address src);
+
+
+ void cmpb(Address dst, int imm8);
+
+ void cmpl(Address dst, int32_t imm32);
+
+ void cmpl(Register dst, int32_t imm32);
+ void cmpl(Register dst, Register src);
+ void cmpl(Register dst, Address src);
+
+ void cmpq(Address dst, int32_t imm32);
+ void cmpq(Address dst, Register src);
+
+ void cmpq(Register dst, int32_t imm32);
+ void cmpq(Register dst, Register src);
+ void cmpq(Register dst, Address src);
+
+ // these are dummies used to catch attempting to convert NULL to Register
+ void cmpl(Register dst, void* junk); // dummy
+ void cmpq(Register dst, void* junk); // dummy
+
+ void cmpw(Address dst, int imm16);
+
+ void cmpxchg8 (Address adr);
+
+ void cmpxchgb(Register reg, Address adr);
+ void cmpxchgl(Register reg, Address adr);
+
+ void cmpxchgq(Register reg, Address adr);
+
+ // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
+ void comisd(XMMRegister dst, Address src);
+ void comisd(XMMRegister dst, XMMRegister src);
+
+ // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
+ void comiss(XMMRegister dst, Address src);
+ void comiss(XMMRegister dst, XMMRegister src);
+
+ // Identify processor type and features
+ void cpuid();
+
+ // CRC32C
+ void crc32(Register crc, Register v, int8_t sizeInBytes);
+ void crc32(Register crc, Address adr, int8_t sizeInBytes);
+
+ // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
+ void cvtsd2ss(XMMRegister dst, XMMRegister src);
+ void cvtsd2ss(XMMRegister dst, Address src);
+
+ // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
+ void cvtsi2sdl(XMMRegister dst, Register src);
+ void cvtsi2sdl(XMMRegister dst, Address src);
+ void cvtsi2sdq(XMMRegister dst, Register src);
+ void cvtsi2sdq(XMMRegister dst, Address src);
+
+ // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
+ void cvtsi2ssl(XMMRegister dst, Register src);
+ void cvtsi2ssl(XMMRegister dst, Address src);
+ void cvtsi2ssq(XMMRegister dst, Register src);
+ void cvtsi2ssq(XMMRegister dst, Address src);
+
+ // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
+ void cvtdq2pd(XMMRegister dst, XMMRegister src);
+
+ // Convert Packed Signed Doubleword Integers to Packed Single-Precision Floating-Point Value
+ void cvtdq2ps(XMMRegister dst, XMMRegister src);
+
+ // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
+ void cvtss2sd(XMMRegister dst, XMMRegister src);
+ void cvtss2sd(XMMRegister dst, Address src);
+
+ // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
+ void cvttsd2sil(Register dst, Address src);
+ void cvttsd2sil(Register dst, XMMRegister src);
+ void cvttsd2siq(Register dst, XMMRegister src);
+
+ // Convert with Truncation Scalar Single-Precision Floating-Point Value to Doubleword Integer
+ void cvttss2sil(Register dst, XMMRegister src);
+ void cvttss2siq(Register dst, XMMRegister src);
+
+ void cvttpd2dq(XMMRegister dst, XMMRegister src);
+
+ // Divide Scalar Double-Precision Floating-Point Values
+ void divsd(XMMRegister dst, Address src);
+ void divsd(XMMRegister dst, XMMRegister src);
+
+ // Divide Scalar Single-Precision Floating-Point Values
+ void divss(XMMRegister dst, Address src);
+ void divss(XMMRegister dst, XMMRegister src);
+
+ void emms();
+
+ void fabs();
+
+ void fadd(int i);
+
+ void fadd_d(Address src);
+ void fadd_s(Address src);
+
+ // "Alternate" versions of x87 instructions place result down in FPU
+ // stack instead of on TOS
+
+ void fadda(int i); // "alternate" fadd
+ void faddp(int i = 1);
+
+ void fchs();
+
+ void fcom(int i);
+
+ void fcomp(int i = 1);
+ void fcomp_d(Address src);
+ void fcomp_s(Address src);
+
+ void fcompp();
+
+ void fcos();
+
+ void fdecstp();
+
+ // fdiv family
+ void fdiv(int i);
+ void fdiv_d(Address src);
+ void fdiv_s(Address src);
+ void fdiva(int i); // "alternate" fdiv
+ void fdivp(int i = 1);
+
+ // fdivr (reversed fdiv) family
+ void fdivr(int i);
+ void fdivr_d(Address src);
+ void fdivr_s(Address src);
+
+ void fdivra(int i); // "alternate" reversed fdiv
+
+ void fdivrp(int i = 1);
+
+ void ffree(int i = 0);
+
+ void fild_d(Address adr);
+ void fild_s(Address adr);
+
+ void fincstp();
+
+ void finit();
+
+ void fist_s (Address adr);
+ void fistp_d(Address adr);
+ void fistp_s(Address adr);
+
+ void fld1();
+
+ void fld_d(Address adr);
+ void fld_s(Address adr);
+ void fld_s(int index);
+ void fld_x(Address adr); // extended-precision (80-bit) format
+
+ void fldcw(Address src);
+
+ void fldenv(Address src);
+
+ void fldlg2();
+
+ void fldln2();
+
+ void fldz();
+
+ void flog();
+ void flog10();
+
+ void fmul(int i);
+
+ void fmul_d(Address src);
+ void fmul_s(Address src);
+
+ void fmula(int i); // "alternate" fmul
+
+ void fmulp(int i = 1);
+
+ void fnsave(Address dst);
+
+ void fnstcw(Address src);
+
+ void fnstsw_ax();
+
+ void fprem();
+ void fprem1();
+
+ void frstor(Address src);
+
+ void fsin();
+
+ void fsqrt();
+
+ void fst_d(Address adr);
+ void fst_s(Address adr);
+
+ void fstp_d(Address adr);
+ void fstp_d(int index);
+ void fstp_s(Address adr);
+ void fstp_x(Address adr); // extended-precision (80-bit) format
+
+ void fsub(int i);
+ void fsub_d(Address src);
+ void fsub_s(Address src);
+
+ void fsuba(int i); // "alternate" fsub
+
+ void fsubp(int i = 1);
+
+ void fsubr(int i);
+ void fsubr_d(Address src);
+ void fsubr_s(Address src);
+
+ void fsubra(int i); // "alternate" reversed fsub
+
+ void fsubrp(int i = 1);
+
+ void ftan();
+
+ void ftst();
+
+ void fucomi(int i = 1);
+ void fucomip(int i = 1);
+
+ void fwait();
+
+ void fxch(int i = 1);
+
+ void fxrstor(Address src);
+ void xrstor(Address src);
+
+ void fxsave(Address dst);
+ void xsave(Address dst);
+
+ void fyl2x();
+ void frndint();
+ void f2xm1();
+ void fldl2e();
+
+ void hlt();
+
+ void idivl(Register src);
+ void divl(Register src); // Unsigned division
+
+#ifdef _LP64
+ void idivq(Register src);
+#endif
+
+ void imull(Register src);
+ void imull(Register dst, Register src);
+ void imull(Register dst, Register src, int value);
+ void imull(Register dst, Address src);
+
+#ifdef _LP64
+ void imulq(Register dst, Register src);
+ void imulq(Register dst, Register src, int value);
+ void imulq(Register dst, Address src);
+#endif
+
+ // jcc is the generic conditional branch generator for branches to
+ // labels (it takes a Label, not the address of a run-time routine).
+ // jcc takes a condition (cc) and a label (L) and generates either
+ // a backward branch (L already bound) or a forward branch that is
+ // linked into the label fixup chain. Usage:
+ //
+ // Label L; // unbound label
+ // jcc(cc, L); // forward branch to unbound label
+ // bind(L); // bind label to the current pc
+ // jcc(cc, L); // backward branch to bound label
+ // bind(L); // illegal: a label may be bound only once
+ //
+ // Note: The same Label can be used for forward and backward branches
+ // but it may be bound only once.
+
+ void jcc(Condition cc, Label& L, bool maybe_short = true);
+
+ // Conditional jump to an 8-bit offset to L.
+ // WARNING: be very careful using this for forward jumps. If the label is
+ // not bound within an 8-bit offset of this instruction, a run-time error
+ // will occur.
+ void jccb(Condition cc, Label& L);
+
+ void jmp(Address entry); // pc <- entry
+
+ // Label operations & relative jumps (PPUM Appendix D)
+ void jmp(Label& L, bool maybe_short = true); // unconditional jump to L
+
+ void jmp(Register entry); // pc <- entry
+
+ // Unconditional 8-bit offset jump to L.
+ // WARNING: be very careful using this for forward jumps. If the label is
+ // not bound within an 8-bit offset of this instruction, a run-time error
+ // will occur.
+ void jmpb(Label& L);
+
+ void ldmxcsr( Address src );
+
+ void leal(Register dst, Address src);
+
+ void leaq(Register dst, Address src);
+
+ void lfence();
+
+ void lock();
+
+ void lzcntl(Register dst, Register src);
+
+#ifdef _LP64
+ void lzcntq(Register dst, Register src);
+#endif
+
+ // Bit flags describing the ordering constraint requested from membar().
+ // Each enumerator occupies its own bit (bits 0..3), so values can be
+ // tested with a bitwise AND.
+ enum Membar_mask_bits {
+   LoadLoad   = 1 << 0,
+   StoreLoad  = 1 << 1,
+   LoadStore  = 1 << 2,
+   StoreStore = 1 << 3
+ };
+
+ // Serializes memory and blows flags
+ void membar(Membar_mask_bits order_constraint) {
+   // Nothing is emitted unless os::is_MP() holds.
+   if (!os::is_MP()) {
+     return;
+   }
+
+   // We only have to handle StoreLoad; a request without that bit
+   // emits no code.
+   if ((order_constraint & StoreLoad) == 0) {
+     return;
+   }
+
+   // All usable chips support "locked" instructions, which suffice as
+   // barriers and are much faster than the alternative of using the
+   // cpuid instruction. We use a locked add [esp-C],0 here. It is
+   // otherwise a no-op, except that it blows flags and introduces a
+   // false dependency on the target memory location. We can't do
+   // anything about the flags, but we can avoid memory dependencies in
+   // the current method by locked-adding somewhere else on the stack:
+   // [esp+C] would collide with something on the stack of the current
+   // method, hence we go for [esp-C]. That location is almost always in
+   // the data cache for any small C.
+   //
+   // We need to step back from SP to avoid data dependencies with other
+   // things below SP (callee-saves, for example). Without a clear way
+   // to figure out the minimal safe distance from SP, it makes sense to
+   // step back a complete cache line, as this also avoids possible
+   // second-order effects with locked ops against the cache line. The
+   // choice of offset is bounded by x86 operand encoding: it should
+   // stay within [-128; +127] to get the 8-bit displacement encoding.
+   //
+   // Any change to this code may need to revisit other places in the
+   // code where this idiom is used, in particular the orderAccess code.
+   int disp = -VM_Version::L1_line_size();
+   disp = (disp < -128) ? -128 : disp;
+
+   lock();
+   addl(Address(rsp, disp), 0); // Assert the lock# signal here
+ }
+
+ void mfence();
+
+ // Moves
+
+ void mov64(Register dst, int64_t imm64);
+
+ void movb(Address dst, Register src);
+ void movb(Address dst, int imm8);
+ void movb(Register dst, Address src);
+
+ void movddup(XMMRegister dst, XMMRegister src);
+
+ void kmovbl(KRegister dst, Register src);
+ void kmovbl(Register dst, KRegister src);
+ void kmovwl(KRegister dst, Register src);
+ void kmovwl(KRegister dst, Address src);
+ void kmovwl(Register dst, KRegister src);
+ void kmovdl(KRegister dst, Register src);
+ void kmovdl(Register dst, KRegister src);
+ void kmovql(KRegister dst, KRegister src);
+ void kmovql(Address dst, KRegister src);
+ void kmovql(KRegister dst, Address src);
+ void kmovql(KRegister dst, Register src);
+ void kmovql(Register dst, KRegister src);
+
+ void knotwl(KRegister dst, KRegister src);
+
+ void kortestbl(KRegister dst, KRegister src);
+ void kortestwl(KRegister dst, KRegister src);
+ void kortestdl(KRegister dst, KRegister src);
+ void kortestql(KRegister dst, KRegister src);
+
+ void ktestq(KRegister src1, KRegister src2);
+ void ktestd(KRegister src1, KRegister src2);
+
+ void ktestql(KRegister dst, KRegister src);
+
+ void movdl(XMMRegister dst, Register src);
+ void movdl(Register dst, XMMRegister src);
+ void movdl(XMMRegister dst, Address src);
+ void movdl(Address dst, XMMRegister src);
+
+ // Move Double Quadword
+ void movdq(XMMRegister dst, Register src);
+ void movdq(Register dst, XMMRegister src);
+
+ // Move Aligned Double Quadword
+ void movdqa(XMMRegister dst, XMMRegister src);
+ void movdqa(XMMRegister dst, Address src);
+
+ // Move Unaligned Double Quadword
+ void movdqu(Address dst, XMMRegister src);
+ void movdqu(XMMRegister dst, Address src);
+ void movdqu(XMMRegister dst, XMMRegister src);
+
+ // Move Unaligned 256bit Vector
+ void vmovdqu(Address dst, XMMRegister src);
+ void vmovdqu(XMMRegister dst, Address src);
+ void vmovdqu(XMMRegister dst, XMMRegister src);
+
+ // Move Unaligned Vector, up to 512bit (width selected by vector_len)
+ void evmovdqub(Address dst, XMMRegister src, int vector_len);
+ void evmovdqub(XMMRegister dst, Address src, int vector_len);
+ void evmovdqub(XMMRegister dst, XMMRegister src, int vector_len);
+ void evmovdqub(XMMRegister dst, KRegister mask, Address src, int vector_len);
+ void evmovdquw(Address dst, XMMRegister src, int vector_len);
+ void evmovdquw(Address dst, KRegister mask, XMMRegister src, int vector_len);
+ void evmovdquw(XMMRegister dst, Address src, int vector_len);
+ void evmovdquw(XMMRegister dst, KRegister mask, Address src, int vector_len);
+ void evmovdqul(Address dst, XMMRegister src, int vector_len);
+ void evmovdqul(XMMRegister dst, Address src, int vector_len);
+ void evmovdqul(XMMRegister dst, XMMRegister src, int vector_len);
+ void evmovdquq(Address dst, XMMRegister src, int vector_len);
+ void evmovdquq(XMMRegister dst, Address src, int vector_len);
+ void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);
+
+ // Move lower 64bit to high 64bit in 128bit register
+ void movlhps(XMMRegister dst, XMMRegister src);
+
+ void movl(Register dst, int32_t imm32);
+ void movl(Address dst, int32_t imm32);
+ void movl(Register dst, Register src);
+ void movl(Register dst, Address src);
+ void movl(Address dst, Register src);
+
+ // These dummies prevent using movl from converting a zero (like NULL) into Register
+ // by giving the compiler two choices it can't resolve
+
+ void movl(Address dst, void* junk);
+ void movl(Register dst, void* junk);
+
+#ifdef _LP64
+ void movq(Register dst, Register src);
+ void movq(Register dst, Address src);
+ void movq(Address dst, Register src);
+#endif
+
+ void movq(Address dst, MMXRegister src );
+ void movq(MMXRegister dst, Address src );
+
+#ifdef _LP64
+ // These dummies prevent using movq from converting a zero (like NULL) into Register
+ // by giving the compiler two choices it can't resolve
+
+ void movq(Address dst, void* dummy);
+ void movq(Register dst, void* dummy);
+#endif
+
+ // Move Quadword
+ void movq(Address dst, XMMRegister src);
+ void movq(XMMRegister dst, Address src);
+
+ void movsbl(Register dst, Address src);
+ void movsbl(Register dst, Register src);
+
+#ifdef _LP64
+ void movsbq(Register dst, Address src);
+ void movsbq(Register dst, Register src);
+
+ // Move signed 32bit immediate to 64bit extending sign
+ void movslq(Address dst, int32_t imm64);
+ void movslq(Register dst, int32_t imm64);
+
+ void movslq(Register dst, Address src);
+ void movslq(Register dst, Register src);
+ void movslq(Register dst, void* src); // Dummy declaration to cause NULL to be ambiguous
+#endif
+
+ void movswl(Register dst, Address src);
+ void movswl(Register dst, Register src);
+
+#ifdef _LP64
+ void movswq(Register dst, Address src);
+ void movswq(Register dst, Register src);
+#endif
+
+ void movw(Address dst, int imm16);
+ void movw(Register dst, Address src);
+ void movw(Address dst, Register src);
+
+ void movzbl(Register dst, Address src);
+ void movzbl(Register dst, Register src);
+
+#ifdef _LP64
+ void movzbq(Register dst, Address src);
+ void movzbq(Register dst, Register src);
+#endif
+
+ void movzwl(Register dst, Address src);
+ void movzwl(Register dst, Register src);
+
+#ifdef _LP64
+ void movzwq(Register dst, Address src);
+ void movzwq(Register dst, Register src);
+#endif
+
+ // Unsigned multiply with RAX destination register
+ void mull(Address src);
+ void mull(Register src);
+
+#ifdef _LP64
+ void mulq(Address src);
+ void mulq(Register src);
+ void mulxq(Register dst1, Register dst2, Register src);
+#endif
+
+ // Multiply Scalar Double-Precision Floating-Point Values
+ void mulsd(XMMRegister dst, Address src);
+ void mulsd(XMMRegister dst, XMMRegister src);
+
+ // Multiply Scalar Single-Precision Floating-Point Values
+ void mulss(XMMRegister dst, Address src);
+ void mulss(XMMRegister dst, XMMRegister src);
+
+ void negl(Register dst);
+
+#ifdef _LP64
+ void negq(Register dst);
+#endif
+
+ void nop(int i = 1);
+
+ void notl(Register dst);
+
+#ifdef _LP64
+ void notq(Register dst);
+#endif
+
+ void orl(Address dst, int32_t imm32);
+ void orl(Register dst, int32_t imm32);
+ void orl(Register dst, Address src);
+ void orl(Register dst, Register src);
+ void orl(Address dst, Register src);
+
+ void orq(Address dst, int32_t imm32);
+ void orq(Register dst, int32_t imm32);
+ void orq(Register dst, Address src);
+ void orq(Register dst, Register src);
+
+ // Pack with unsigned saturation
+ void packuswb(XMMRegister dst, XMMRegister src);
+ void packuswb(XMMRegister dst, Address src);
+ void vpackuswb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+
+ // Permutation of 64bit words
+ void vpermq(XMMRegister dst, XMMRegister src, int imm8, int vector_len);
+ void vpermq(XMMRegister dst, XMMRegister src, int imm8);
+ void vperm2i128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
+ void vperm2f128(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8);
+
+ void pause();
+
+ // Undefined Instruction
+ void ud2();
+
+ // SSE4.2 string instructions
+ void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
+ void pcmpestri(XMMRegister xmm1, Address src, int imm8);
+
+ void pcmpeqb(XMMRegister dst, XMMRegister src);
+ void vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void evpcmpeqb(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+ void evpcmpeqb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
+ void evpcmpeqb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
+
+ void evpcmpgtb(KRegister kdst, XMMRegister nds, Address src, int vector_len);
+ void evpcmpgtb(KRegister kdst, KRegister mask, XMMRegister nds, Address src, int vector_len);
+
+ // evpcmpuw overloads. The ComparisonPredicate argument is named vcc on
+ // every overload; the second overload additionally takes a KRegister mask.
+ // NOTE(review): presumably the EVEX VPCMPUW instruction (unsigned word
+ // compare into kdst) -- confirm against the definitions.
+ void evpcmpuw(KRegister kdst, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
+ void evpcmpuw(KRegister kdst, KRegister mask, XMMRegister nds, XMMRegister src, ComparisonPredicate vcc, int vector_len);
+ void evpcmpuw(KRegister kdst, XMMRegister nds, Address src, ComparisonPredicate vcc, int vector_len);
+
+ void pcmpeqw(XMMRegister dst, XMMRegister src);
+ void vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void evpcmpeqw(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+ void evpcmpeqw(KRegister kdst, XMMRegister nds, Address src, int vector_len);
+
+ void pcmpeqd(XMMRegister dst, XMMRegister src);
+ void vpcmpeqd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void evpcmpeqd(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+ void evpcmpeqd(KRegister kdst, XMMRegister nds, Address src, int vector_len);
+
+ void pcmpeqq(XMMRegister dst, XMMRegister src);
+ void vpcmpeqq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void evpcmpeqq(KRegister kdst, XMMRegister nds, XMMRegister src, int vector_len);
+ void evpcmpeqq(KRegister kdst, XMMRegister nds, Address src, int vector_len);
+
+ void pmovmskb(Register dst, XMMRegister src);
+ void vpmovmskb(Register dst, XMMRegister src);
+
+ // SSE 4.1 extract
+ void pextrd(Register dst, XMMRegister src, int imm8);
+ void pextrq(Register dst, XMMRegister src, int imm8);
+ void pextrd(Address dst, XMMRegister src, int imm8);
+ void pextrq(Address dst, XMMRegister src, int imm8);
+ void pextrb(Address dst, XMMRegister src, int imm8);
+ // SSE 2 extract
+ void pextrw(Register dst, XMMRegister src, int imm8);
+ void pextrw(Address dst, XMMRegister src, int imm8);
+
+ // SSE 4.1 insert
+ void pinsrd(XMMRegister dst, Register src, int imm8);
+ void pinsrq(XMMRegister dst, Register src, int imm8);
+ void pinsrd(XMMRegister dst, Address src, int imm8);
+ void pinsrq(XMMRegister dst, Address src, int imm8);
+ void pinsrb(XMMRegister dst, Address src, int imm8);
+ // SSE 2 insert
+ void pinsrw(XMMRegister dst, Register src, int imm8);
+ void pinsrw(XMMRegister dst, Address src, int imm8);
+
+ // SSE4.1 packed move
+ void pmovzxbw(XMMRegister dst, XMMRegister src);
+ void pmovzxbw(XMMRegister dst, Address src);
+
+ void vpmovzxbw( XMMRegister dst, Address src, int vector_len);
+ void evpmovzxbw(XMMRegister dst, KRegister mask, Address src, int vector_len);
+
+ void evpmovwb(Address dst, XMMRegister src, int vector_len);
+ void evpmovwb(Address dst, KRegister mask, XMMRegister src, int vector_len);
+
+#ifndef _LP64 // no 32bit push/pop on amd64
+ void popl(Address dst);
+#endif
+
+#ifdef _LP64
+ void popq(Address dst);
+#endif
+
+ void popcntl(Register dst, Address src);
+ void popcntl(Register dst, Register src);
+
+#ifdef _LP64
+ void popcntq(Register dst, Address src);
+ void popcntq(Register dst, Register src);
+#endif
+
+ // Prefetches (SSE, SSE2, 3DNOW only)
+
+ void prefetchnta(Address src);
+ void prefetchr(Address src);
+ void prefetcht0(Address src);
+ void prefetcht1(Address src);
+ void prefetcht2(Address src);
+ void prefetchw(Address src);
+
+ // Shuffle Bytes
+ void pshufb(XMMRegister dst, XMMRegister src);
+ void pshufb(XMMRegister dst, Address src);
+ void vpshufb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+
+ // Shuffle Packed Doublewords
+ void pshufd(XMMRegister dst, XMMRegister src, int mode);
+ void pshufd(XMMRegister dst, Address src, int mode);
+ void vpshufd(XMMRegister dst, XMMRegister src, int mode, int vector_len);
+
+ // Shuffle Packed Low Words
+ void pshuflw(XMMRegister dst, XMMRegister src, int mode);
+ void pshuflw(XMMRegister dst, Address src, int mode);
+
+ // Shift Right by bytes Logical DoubleQuadword Immediate
+ void psrldq(XMMRegister dst, int shift);
+ // Shift Left by bytes Logical DoubleQuadword Immediate
+ void pslldq(XMMRegister dst, int shift);
+
+ // Logical Compare 128bit
+ void ptest(XMMRegister dst, XMMRegister src);
+ void ptest(XMMRegister dst, Address src);
+ // Logical Compare 256bit
+ void vptest(XMMRegister dst, XMMRegister src);
+ void vptest(XMMRegister dst, Address src);
+
+ // Interleave Low Bytes
+ void punpcklbw(XMMRegister dst, XMMRegister src);
+ void punpcklbw(XMMRegister dst, Address src);
+
+ // Interleave Low Doublewords
+ void punpckldq(XMMRegister dst, XMMRegister src);
+ void punpckldq(XMMRegister dst, Address src);
+
+ // Interleave Low Quadwords
+ void punpcklqdq(XMMRegister dst, XMMRegister src);
+
+#ifndef _LP64 // no 32bit push/pop on amd64
+ void pushl(Address src);
+#endif
+
+ void pushq(Address src);
+
+ void rcll(Register dst, int imm8);
+
+ void rclq(Register dst, int imm8);
+
+ void rcrq(Register dst, int imm8);
+
+ void rcpps(XMMRegister dst, XMMRegister src);
+
+ void rcpss(XMMRegister dst, XMMRegister src);
+
+ void rdtsc();
+
+ void ret(int imm16);
+
+#ifdef _LP64
+ void rorq(Register dst, int imm8);
+ void rorxq(Register dst, Register src, int imm8);
+ void rorxd(Register dst, Register src, int imm8);
+#endif
+
+ void sahf();
+
+ void sarl(Register dst, int imm8);
+ void sarl(Register dst);
+
+ void sarq(Register dst, int imm8);
+ void sarq(Register dst);
+
+ void sbbl(Address dst, int32_t imm32);
+ void sbbl(Register dst, int32_t imm32);
+ void sbbl(Register dst, Address src);
+ void sbbl(Register dst, Register src);
+
+ void sbbq(Address dst, int32_t imm32);
+ void sbbq(Register dst, int32_t imm32);
+ void sbbq(Register dst, Address src);
+ void sbbq(Register dst, Register src);
+
+ void setb(Condition cc, Register dst);
+
+ void palignr(XMMRegister dst, XMMRegister src, int imm8);
+ void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2, int imm8, int vector_len);
+
+ void pblendw(XMMRegister dst, XMMRegister src, int imm8);
+
+ void sha1rnds4(XMMRegister dst, XMMRegister src, int imm8);
+ void sha1nexte(XMMRegister dst, XMMRegister src);
+ void sha1msg1(XMMRegister dst, XMMRegister src);
+ void sha1msg2(XMMRegister dst, XMMRegister src);
+ // xmm0 is implicit additional source to the following instruction.
+ void sha256rnds2(XMMRegister dst, XMMRegister src);
+ void sha256msg1(XMMRegister dst, XMMRegister src);
+ void sha256msg2(XMMRegister dst, XMMRegister src);
+
+ void shldl(Register dst, Register src);
+ void shldl(Register dst, Register src, int8_t imm8);
+
+ void shll(Register dst, int imm8);
+ void shll(Register dst);
+
+ void shlq(Register dst, int imm8);
+ void shlq(Register dst);
+
+ void shrdl(Register dst, Register src);
+
+ void shrl(Register dst, int imm8);
+ void shrl(Register dst);
+
+ void shrq(Register dst, int imm8);
+ void shrq(Register dst);
+
+ void smovl(); // QQQ generic?
+
+ // Compute Square Root of Scalar Double-Precision Floating-Point Value
+ void sqrtsd(XMMRegister dst, Address src);
+ void sqrtsd(XMMRegister dst, XMMRegister src);
+
+ // Compute Square Root of Scalar Single-Precision Floating-Point Value
+ void sqrtss(XMMRegister dst, Address src);
+ void sqrtss(XMMRegister dst, XMMRegister src);
+
+ void std();
+
+ void stmxcsr( Address dst );
+
+ void subl(Address dst, int32_t imm32);
+ void subl(Address dst, Register src);
+ void subl(Register dst, int32_t imm32);
+ void subl(Register dst, Address src);
+ void subl(Register dst, Register src);
+
+ void subq(Address dst, int32_t imm32);
+ void subq(Address dst, Register src);
+ void subq(Register dst, int32_t imm32);
+ void subq(Register dst, Address src);
+ void subq(Register dst, Register src);
+
+ // Force generation of a 4 byte immediate value even if it fits into 8bit
+ void subl_imm32(Register dst, int32_t imm32);
+ void subq_imm32(Register dst, int32_t imm32);
+
+ // Subtract Scalar Double-Precision Floating-Point Values
+ void subsd(XMMRegister dst, Address src);
+ void subsd(XMMRegister dst, XMMRegister src);
+
+ // Subtract Scalar Single-Precision Floating-Point Values
+ void subss(XMMRegister dst, Address src);
+ void subss(XMMRegister dst, XMMRegister src);
+
+ void testb(Register dst, int imm8);
+ void testb(Address dst, int imm8);
+
+ void testl(Register dst, int32_t imm32);
+ void testl(Register dst, Register src);
+ void testl(Register dst, Address src);
+
+ void testq(Register dst, int32_t imm32);
+ void testq(Register dst, Register src);
+
+ // BMI - count trailing zeros
+ void tzcntl(Register dst, Register src);
+ void tzcntq(Register dst, Register src);
+
+ // Unordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
+ void ucomisd(XMMRegister dst, Address src);
+ void ucomisd(XMMRegister dst, XMMRegister src);
+
+ // Unordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
+ void ucomiss(XMMRegister dst, Address src);
+ void ucomiss(XMMRegister dst, XMMRegister src);
+
+ void xabort(int8_t imm8);
+
+ void xaddb(Address dst, Register src);
+ void xaddw(Address dst, Register src);
+ void xaddl(Address dst, Register src);
+ void xaddq(Address dst, Register src);
+
+ void xbegin(Label& abort, relocInfo::relocType rtype = relocInfo::none);
+
+ void xchgb(Register reg, Address adr);
+ void xchgw(Register reg, Address adr);
+ void xchgl(Register reg, Address adr);
+ void xchgl(Register dst, Register src);
+
+ void xchgq(Register reg, Address adr);
+ void xchgq(Register dst, Register src);
+
+ void xend();
+
+ // Get Value of Extended Control Register
+ void xgetbv();
+
+ void xorl(Register dst, int32_t imm32);
+ void xorl(Register dst, Address src);
+ void xorl(Register dst, Register src);
+
+ void xorb(Register dst, Address src);
+
+ void xorq(Register dst, Address src);
+ void xorq(Register dst, Register src);
+
+ void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
+
+ // AVX 3-operands scalar instructions (encoded with VEX prefix)
+
+ void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
+ void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vaddss(XMMRegister dst, XMMRegister nds, Address src);
+ void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
+ void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vdivss(XMMRegister dst, XMMRegister nds, Address src);
+ void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
+ void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vmulss(XMMRegister dst, XMMRegister nds, Address src);
+ void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
+ void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+ void vsubss(XMMRegister dst, XMMRegister nds, Address src);
+ void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+
+ void shlxl(Register dst, Register src1, Register src2);
+ void shlxq(Register dst, Register src1, Register src2);
+
+ //====================VECTOR ARITHMETIC=====================================
+
+ // Add Packed Floating-Point Values
+ void addpd(XMMRegister dst, XMMRegister src);
+ void addpd(XMMRegister dst, Address src);
+ void addps(XMMRegister dst, XMMRegister src);
+ void vaddpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vaddps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vaddpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vaddps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
+ // Subtract Packed Floating-Point Values
+ void subpd(XMMRegister dst, XMMRegister src);
+ void subps(XMMRegister dst, XMMRegister src);
+ void vsubpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vsubps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vsubpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vsubps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
+ // Multiply Packed Floating-Point Values
+ void mulpd(XMMRegister dst, XMMRegister src);
+ void mulpd(XMMRegister dst, Address src);
+ void mulps(XMMRegister dst, XMMRegister src);
+ void vmulpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vmulps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vmulpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vmulps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
+ void vfmadd231pd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vfmadd231ps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vfmadd231pd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vfmadd231ps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
+ // Divide Packed Floating-Point Values
+ void divpd(XMMRegister dst, XMMRegister src);
+ void divps(XMMRegister dst, XMMRegister src);
+ void vdivpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vdivps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vdivpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vdivps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
+ // Sqrt Packed Floating-Point Values - Double precision only
+ void vsqrtpd(XMMRegister dst, XMMRegister src, int vector_len);
+ void vsqrtpd(XMMRegister dst, Address src, int vector_len);
+
+ // Bitwise Logical AND of Packed Floating-Point Values
+ void andpd(XMMRegister dst, XMMRegister src);
+ void andps(XMMRegister dst, XMMRegister src);
+ void vandpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vandps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vandpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vandps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
+ void unpckhpd(XMMRegister dst, XMMRegister src);
+ void unpcklpd(XMMRegister dst, XMMRegister src);
+
+ // Bitwise Logical XOR of Packed Floating-Point Values
+ void xorpd(XMMRegister dst, XMMRegister src);
+ void xorps(XMMRegister dst, XMMRegister src);
+ void vxorpd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vxorps(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vxorpd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vxorps(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
+ // Add horizontal packed integers
+ void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void phaddw(XMMRegister dst, XMMRegister src);
+ void phaddd(XMMRegister dst, XMMRegister src);
+
+ // Add packed integers (b/w/d/q: byte/word/dword/qword elements)
+ void paddb(XMMRegister dst, XMMRegister src);
+ void paddw(XMMRegister dst, XMMRegister src);
+ void paddd(XMMRegister dst, XMMRegister src);
+ void paddd(XMMRegister dst, Address src); // the only two-operand add declared here with a memory source
+ void paddq(XMMRegister dst, XMMRegister src);
+ void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // three-operand forms, register source
+ void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vpaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vpaddq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // three-operand forms, memory source
+ void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vpaddd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vpaddq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
+ // Subtract packed integers (b/w/d/q: byte/word/dword/qword elements)
+ void psubb(XMMRegister dst, XMMRegister src);
+ void psubw(XMMRegister dst, XMMRegister src);
+ void psubd(XMMRegister dst, XMMRegister src);
+ void psubq(XMMRegister dst, XMMRegister src);
+ void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // three-operand forms, register source
+ void vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vpsubd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vpsubq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // three-operand forms, memory source
+ void vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vpsubd(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vpsubq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
+ // Multiply packed integers (shorts and ints; a qword form exists only as the three-operand vpmullq)
+ void pmullw(XMMRegister dst, XMMRegister src);
+ void pmulld(XMMRegister dst, XMMRegister src);
+ void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // three-operand forms, register source
+ void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vpmullq(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+ void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // three-operand forms, memory source
+ void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+ void vpmullq(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+
+ // Shift left packed integers; the shift count is passed either as an int or in an XMM register
+ void psllw(XMMRegister dst, int shift); // dst is shifted in place, int count
+ void pslld(XMMRegister dst, int shift);
+ void psllq(XMMRegister dst, int shift);
+ void psllw(XMMRegister dst, XMMRegister shift); // dst is shifted in place, count in an XMM register
+ void pslld(XMMRegister dst, XMMRegister shift);
+ void psllq(XMMRegister dst, XMMRegister shift);
+ void vpsllw(XMMRegister dst, XMMRegister src, int shift, int vector_len); // separate dst and src, int count
+ void vpslld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+ void vpsllq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+ void vpsllw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); // separate dst and src, count in an XMM register
+ void vpslld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+ void vpsllq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+
+ // Logical shift right packed integers; the shift count is passed either as an int or in an XMM register
+ void psrlw(XMMRegister dst, int shift); // dst is shifted in place, int count
+ void psrld(XMMRegister dst, int shift);
+ void psrlq(XMMRegister dst, int shift);
+ void psrlw(XMMRegister dst, XMMRegister shift); // dst is shifted in place, count in an XMM register
+ void psrld(XMMRegister dst, XMMRegister shift);
+ void psrlq(XMMRegister dst, XMMRegister shift);
+ void vpsrlw(XMMRegister dst, XMMRegister src, int shift, int vector_len); // separate dst and src, int count
+ void vpsrld(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+ void vpsrlq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+ void vpsrlw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); // separate dst and src, count in an XMM register
+ void vpsrld(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+ void vpsrlq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+
+ // Arithmetic shift right packed integers (only shorts and ints, no instructions for longs)
+ void psraw(XMMRegister dst, int shift); // dst is shifted in place, int count
+ void psrad(XMMRegister dst, int shift);
+ void psraw(XMMRegister dst, XMMRegister shift); // dst is shifted in place, count in an XMM register
+ void psrad(XMMRegister dst, XMMRegister shift);
+ void vpsraw(XMMRegister dst, XMMRegister src, int shift, int vector_len); // separate dst and src, int count
+ void vpsrad(XMMRegister dst, XMMRegister src, int shift, int vector_len);
+ void vpsraw(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len); // separate dst and src, count in an XMM register
+ void vpsrad(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+
+ // Bitwise AND of packed integers
+ void pand(XMMRegister dst, XMMRegister src);
+ void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // register source
+ void vpand(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // memory source
+
+ // Bitwise AND-NOT of packed integers (only the two-operand register form is declared)
+ void pandn(XMMRegister dst, XMMRegister src);
+
+ // Bitwise OR of packed integers
+ void por(XMMRegister dst, XMMRegister src);
+ void vpor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // register source
+ void vpor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // memory source
+
+ // Bitwise XOR of packed integers
+ void pxor(XMMRegister dst, XMMRegister src);
+ void vpxor(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len); // register source
+ void vpxor(XMMRegister dst, XMMRegister nds, Address src, int vector_len); // memory source
+
+ // vinserti forms (integer vector insert; imm8 presumably selects the destination lane - confirm against the SDM)
+ void vinserti128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+ void vinserti128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+ void vinserti32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+ void vinserti32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+ void vinserti64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8); // no Address-source overload is declared
+
+ // vinsertf forms (floating-point vector insert; imm8 presumably selects the destination lane - confirm against the SDM)
+ void vinsertf128(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+ void vinsertf128(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+ void vinsertf32x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+ void vinsertf32x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+ void vinsertf64x4(XMMRegister dst, XMMRegister nds, XMMRegister src, uint8_t imm8);
+ void vinsertf64x4(XMMRegister dst, XMMRegister nds, Address src, uint8_t imm8);
+
+ // vextracti forms (integer vector extract; imm8 presumably selects the source lane - confirm against the SDM)
+ void vextracti128(XMMRegister dst, XMMRegister src, uint8_t imm8);
+ void vextracti128(Address dst, XMMRegister src, uint8_t imm8); // memory destination
+ void vextracti32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+ void vextracti32x4(Address dst, XMMRegister src, uint8_t imm8); // memory destination
+ void vextracti64x2(XMMRegister dst, XMMRegister src, uint8_t imm8); // no Address-destination overload is declared
+ void vextracti64x4(XMMRegister dst, XMMRegister src, uint8_t imm8); // no Address-destination overload is declared
+
+ // vextractf forms (floating-point vector extract; imm8 presumably selects the source lane - confirm against the SDM)
+ void vextractf128(XMMRegister dst, XMMRegister src, uint8_t imm8);
+ void vextractf128(Address dst, XMMRegister src, uint8_t imm8); // memory destination
+ void vextractf32x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+ void vextractf32x4(Address dst, XMMRegister src, uint8_t imm8); // memory destination
+ void vextractf64x2(XMMRegister dst, XMMRegister src, uint8_t imm8); // no Address-destination overload is declared
+ void vextractf64x4(XMMRegister dst, XMMRegister src, uint8_t imm8);
+ void vextractf64x4(Address dst, XMMRegister src, uint8_t imm8); // memory destination
+
+ // legacy xmm sourced word/dword replicate (these forms take no vector_len argument)
+ void vpbroadcastw(XMMRegister dst, XMMRegister src);
+ void vpbroadcastd(XMMRegister dst, XMMRegister src);
+
+ // xmm/mem sourced byte/word/dword/qword replicate (one register-source and one memory-source overload per width)
+ void evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len);
+ void evpbroadcastb(XMMRegister dst, Address src, int vector_len);
+ void evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len);
+ void evpbroadcastw(XMMRegister dst, Address src, int vector_len);
+ void evpbroadcastd(XMMRegister dst, XMMRegister src, int vector_len);
+ void evpbroadcastd(XMMRegister dst, Address src, int vector_len);
+ void evpbroadcastq(XMMRegister dst, XMMRegister src, int vector_len);
+ void evpbroadcastq(XMMRegister dst, Address src, int vector_len);
+
+ // scalar single/double precision replicate (ss: single precision, sd: double precision)
+ void evpbroadcastss(XMMRegister dst, XMMRegister src, int vector_len); // register source
+ void evpbroadcastss(XMMRegister dst, Address src, int vector_len); // memory source
+ void evpbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len); // register source
+ void evpbroadcastsd(XMMRegister dst, Address src, int vector_len); // memory source
+
+ // gpr sourced byte/word/dword/qword replicate (src is a general purpose Register, not an XMMRegister)
+ void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
+ void evpbroadcastw(XMMRegister dst, Register src, int vector_len);
+ void evpbroadcastd(XMMRegister dst, Register src, int vector_len);
+ void evpbroadcastq(XMMRegister dst, Register src, int vector_len);
+
+ // Carry-Less Multiplication Quadword (mask presumably is the imm8 choosing which quadwords are multiplied - confirm)
+ void pclmulqdq(XMMRegister dst, XMMRegister src, int mask);
+ void vpclmulqdq(XMMRegister dst, XMMRegister nds, XMMRegister src, int mask); // three-operand form; takes no vector_len
+
+ // AVX instruction which is used to clear upper 128 bits of YMM registers and
+ // to avoid the transition penalty between AVX and SSE states. There is no
+ // penalty if legacy SSE instructions are encoded using VEX prefix because
+ // they always clear upper 128 bits. It should be used before calling
+ // runtime code and native libraries.
+ void vzeroupper();
+
+ // AVX support for vectorized conditional move (double). The next two instructions (cmppd, blendvpd) are only used as a pair.
+ void cmppd(XMMRegister dst, XMMRegister nds, XMMRegister src, int cop, int vector_len); // cop: presumably the comparison predicate - confirm
+ void blendvpd(XMMRegister dst, XMMRegister nds, XMMRegister src1, XMMRegister src2, int vector_len);
+ void vpblendd(XMMRegister dst, XMMRegister nds, XMMRegister src, int imm8, int vector_len); // NOTE(review): not one of the "two instructions" above
+
+ protected:
+ // The following instructions require a 16-byte aligned memory operand in SSE mode.
+ // They should be called only from the corresponding MacroAssembler instructions.
+ void andpd(XMMRegister dst, Address src);
+ void andps(XMMRegister dst, Address src);
+ void xorpd(XMMRegister dst, Address src);
+ void xorps(XMMRegister dst, Address src);
+
+};
+
+// Encoding attributes for one Intel x86/AMD64 instruction: every field enclosed here guides an encoding level decision.
+// Fields take the values supplied at construction or the defaults from the initializer list; the specific
+// set/reset functions below are for specialized use.
+class InstructionAttr {
+public:
+  InstructionAttr(
+    int vector_len,     // The length of vector to be applied in encoding - for both AVX and EVEX
+    bool rex_vex_w,     // Width of data: if 32-bits or less, false, else if 64-bit or specially defined, true
+    bool legacy_mode,   // true: encode as AVX or earlier, false: possibly EVEX (forced to true below when UseAVX < 3)
+    bool no_reg_mask,   // when true, k0 is used when EVEX encoding is chosen, else k1 is used under the same condition
+    bool uses_vl)       // This instruction may have legacy constraints based on vector length for EVEX
+    :
+      _avx_vector_len(vector_len),
+      _rex_vex_w(rex_vex_w),
+      _rex_vex_w_reverted(false),
+      _legacy_mode(legacy_mode),
+      _no_reg_mask(no_reg_mask),
+      _uses_vl(uses_vl),
+      _tuple_type(Assembler::EVEX_ETUP),
+      _input_size_in_bits(Assembler::EVEX_NObit),
+      _is_evex_instruction(false),
+      _evex_encoding(0),
+      _is_clear_context(true),
+      _is_extended_context(false),
+      _embedded_opmask_register_specifier(1), // defaults to k1; change with set_embedded_opmask_register_specifier()
+      _current_assembler(NULL) {              // initializers are listed in member declaration order
+    if (UseAVX < 3) _legacy_mode = true;      // below UseAVX 3 every instruction is forced into legacy mode
+  }
+
+  ~InstructionAttr() {                        // detaches from the assembler set via set_current_assembler(), if any
+    if (_current_assembler != NULL) {
+      _current_assembler->clear_attributes(); // NOTE(review): a copied InstructionAttr would repeat this call when destroyed
+    }
+    _current_assembler = NULL;
+  }
+
+private:
+  int  _avx_vector_len;
+  bool _rex_vex_w;
+  bool _rex_vex_w_reverted;
+  bool _legacy_mode;
+  bool _no_reg_mask;
+  bool _uses_vl;
+  int  _tuple_type;
+  int  _input_size_in_bits;
+  bool _is_evex_instruction;
+  int  _evex_encoding;
+  bool _is_clear_context;
+  bool _is_extended_context;
+  int  _embedded_opmask_register_specifier;
+
+  Assembler *_current_assembler;
+
+public:
+  // query functions for field accessors (all const: none of them modifies the attributes)
+  int  get_vector_len(void) const { return _avx_vector_len; }
+  bool is_rex_vex_w(void) const { return _rex_vex_w; }
+  bool is_rex_vex_w_reverted(void) const { return _rex_vex_w_reverted; }
+  bool is_legacy_mode(void) const { return _legacy_mode; }
+  bool is_no_reg_mask(void) const { return _no_reg_mask; }
+  bool uses_vl(void) const { return _uses_vl; }
+  int  get_tuple_type(void) const { return _tuple_type; }
+  int  get_input_size(void) const { return _input_size_in_bits; }
+  bool is_evex_instruction(void) const { return _is_evex_instruction; }
+  int  get_evex_encoding(void) const { return _evex_encoding; }
+  bool is_clear_context(void) const { return _is_clear_context; }
+  bool is_extended_context(void) const { return _is_extended_context; }
+  int  get_embedded_opmask_register_specifier(void) const { return _embedded_opmask_register_specifier; }
+
+  // Set the vector len manually
+  void set_vector_len(int vector_len) { _avx_vector_len = vector_len; }
+
+  // Record that rex_vex_w has been reverted for AVX encoding
+  void set_rex_vex_w_reverted(void) { _rex_vex_w_reverted = true; }
+
+  // Set rex_vex_w based on state
+  void set_rex_vex_w(bool state) { _rex_vex_w = state; }
+
+  // Force the instruction to be encoded in legacy (AVX or earlier) mode
+  void set_is_legacy_mode(void) { _legacy_mode = true; }
+
+  // Mark the current instruction as being encoded as an EVEX instruction
+  void set_is_evex_instruction(void) { _is_evex_instruction = true; }
+
+  // Internal encoding data used in compressed immediate offset programming
+  void set_evex_encoding(int value) { _evex_encoding = value; }
+
+  // Turn off the clear-context flag (on by default), which drives the EVEX.z clearing of non directed XMM/YMM/ZMM components
+  void reset_is_clear_context(void) { _is_clear_context = false; }
+
+  // Map back to the current assembler so that we can manage object level association (see the destructor)
+  void set_current_assembler(Assembler *current_assembler) { _current_assembler = current_assembler; }
+
+  // Address modifiers used for compressed displacement calculation; ignored unless EVEX is supported
+  void set_address_attributes(int tuple_type, int input_size_in_bits) {
+    if (VM_Version::supports_evex()) {
+      _tuple_type = tuple_type;
+      _input_size_in_bits = input_size_in_bits;
+    }
+  }
+
+  // Set embedded opmask register specifier; only the low three bits of the register encoding are kept.
+  void set_embedded_opmask_register_specifier(KRegister mask) {
+    _embedded_opmask_register_specifier = (*mask).encoding() & 0x7;
+  }
+
+};
+
+#endif // CPU_X86_VM_ASSEMBLER_X86_HPP