8031321: Support Intel bit manipulation instructions
author iveresov
Wed, 12 Mar 2014 11:24:26 -0700
changeset 23220 fc827339dc37
parent 23219 69e72eaf9f51
child 23221 b70675ece1ce
8031321: Support Intel bit manipulation instructions. Summary: Add support for BMI1 instructions. Reviewed-by: kvn, roland
hotspot/src/cpu/x86/vm/assembler_x86.cpp
hotspot/src/cpu/x86/vm/assembler_x86.hpp
hotspot/src/cpu/x86/vm/globals_x86.hpp
hotspot/src/cpu/x86/vm/vm_version_x86.cpp
hotspot/src/cpu/x86/vm/vm_version_x86.hpp
hotspot/src/cpu/x86/vm/x86_32.ad
hotspot/src/cpu/x86/vm/x86_64.ad
hotspot/src/share/vm/adlc/formssel.cpp
hotspot/src/share/vm/opto/matcher.cpp
hotspot/src/share/vm/opto/matcher.hpp
hotspot/test/compiler/codegen/BMI1.java
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Tue Mar 11 14:54:47 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp	Wed Mar 12 11:24:26 2014 -0700
@@ -1089,6 +1089,21 @@
   emit_arith(0x23, 0xC0, dst, src);
 }
 
+void Assembler::andnl(Register dst, Register src1, Register src2) { // 32-bit ANDN, register source operand
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode(dst, src1, src2); // VEX prefix, 0F38 opcode map, W=0; result is OR-ed into the ModRM byte below
+  emit_int8((unsigned char)0xF2); // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte with mod=11 (register-direct)
+}
+
+void Assembler::andnl(Register dst, Register src1, Address src2) { // 32-bit ANDN, memory source operand
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38(dst, src1, src2); // VEX prefix, 0F38 opcode map, W=0
+  emit_int8((unsigned char)0xF2); // opcode byte
+  emit_operand(dst, src2); // operand bytes for dst and the memory operand
+}
+
 void Assembler::bsfl(Register dst, Register src) {
   int encode = prefix_and_encode(dst->encoding(), src->encoding());
   emit_int8(0x0F);
@@ -1110,6 +1125,51 @@
   emit_int8((unsigned char)(0xC8 | encode));
 }
 
+void Assembler::blsil(Register dst, Register src) { // 32-bit BLSI, register source operand
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode(rbx, dst, src); // rbx fills the helper's dst slot, the real dst goes in the nds slot; NOTE(review): presumably rbx (encoding 3) supplies the /3 opcode extension - confirm against the Intel SDM
+  emit_int8((unsigned char)0xF3); // opcode byte (same byte is used by blsmskl and blsrl)
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte with mod=11 (register-direct)
+}
+
+void Assembler::blsil(Register dst, Address src) { // 32-bit BLSI, memory source operand
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38(rbx, dst, src); // rbx fills the helper's dst slot, the real dst goes in the nds slot
+  emit_int8((unsigned char)0xF3); // opcode byte
+  emit_operand(rbx, src); // NOTE(review): rbx presumably stands in for the /3 opcode extension - confirm against the Intel SDM
+}
+
+void Assembler::blsmskl(Register dst, Register src) { // 32-bit BLSMSK, register source operand
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode(rdx, dst, src); // rdx fills the helper's dst slot, the real dst goes in the nds slot; NOTE(review): presumably rdx (encoding 2) supplies the /2 opcode extension - confirm against the Intel SDM
+  emit_int8((unsigned char)0xF3); // opcode byte (same byte is used by blsil and blsrl)
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte with mod=11 (register-direct)
+}
+
+void Assembler::blsmskl(Register dst, Address src) { // 32-bit BLSMSK, memory source operand
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38(rdx, dst, src); // rdx fills the helper's dst slot, the real dst goes in the nds slot
+  emit_int8((unsigned char)0xF3); // opcode byte
+  emit_operand(rdx, src); // NOTE(review): rdx presumably stands in for the /2 opcode extension - confirm against the Intel SDM
+}
+
+void Assembler::blsrl(Register dst, Register src) { // 32-bit BLSR, register source operand
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode(rcx, dst, src); // rcx fills the helper's dst slot, the real dst goes in the nds slot; NOTE(review): presumably rcx (encoding 1) supplies the /1 opcode extension - confirm against the Intel SDM
+  emit_int8((unsigned char)0xF3); // opcode byte (same byte is used by blsil and blsmskl)
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte with mod=11 (register-direct)
+}
+
+void Assembler::blsrl(Register dst, Address src) { // 32-bit BLSR, memory source operand
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38(rcx, dst, src); // rcx fills the helper's dst slot, the real dst goes in the nds slot
+  emit_int8((unsigned char)0xF3); // opcode byte
+  emit_operand(rcx, src); // NOTE(review): rcx presumably stands in for the /1 opcode extension - confirm against the Intel SDM
+}
+
 void Assembler::call(Label& L, relocInfo::relocType rtype) {
   // suspect disp32 is always good
   int operand = LP64_ONLY(disp32_operand) NOT_LP64(imm_operand);
@@ -2878,6 +2938,24 @@
   emit_operand(dst, src);
 }
 
+void Assembler::tzcntl(Register dst, Register src) { // 32-bit TZCNT, register source operand
+  assert(VM_Version::supports_bmi1(), "tzcnt instruction not supported");
+  emit_int8((unsigned char)0xF3); // prefix byte, emitted before any prefix produced by prefix_and_encode
+  int encode = prefix_and_encode(dst->encoding(), src->encoding());
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xBC); // same 0F BC opcode bytes as tzcntq; only the prefix helper differs
+  emit_int8((unsigned char)(0xC0 | encode)); // cast the whole ModRM expression, matching tzcntq and the other emitters
+}
+
+void Assembler::tzcntq(Register dst, Register src) { // 64-bit TZCNT, register source operand
+  assert(VM_Version::supports_bmi1(), "tzcnt instruction not supported");
+  emit_int8((unsigned char)0xF3); // prefix byte, emitted before any prefix produced by prefixq_and_encode
+  int encode = prefixq_and_encode(dst->encoding(), src->encoding()); // 'q' variant of the prefix helper used by tzcntl
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xBC); // same 0F BC opcode bytes as tzcntl
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte with mod=11 (register-direct)
+}
+
 void Assembler::ucomisd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   emit_simd_arith_nonds(0x2E, dst, src, VEX_SIMD_66);
@@ -4837,6 +4915,21 @@
   emit_arith(0x23, 0xC0, dst, src);
 }
 
+void Assembler::andnq(Register dst, Register src1, Register src2) { // 64-bit ANDN, register source operand
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode_q(dst, src1, src2); // VEX prefix, 0F38 opcode map, W=1; result is OR-ed into the ModRM byte below
+  emit_int8((unsigned char)0xF2); // opcode byte (same as andnl)
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte with mod=11 (register-direct)
+}
+
+void Assembler::andnq(Register dst, Register src1, Address src2) { // 64-bit ANDN, memory source operand
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38_q(dst, src1, src2); // VEX prefix, 0F38 opcode map, W=1
+  emit_int8((unsigned char)0xF2); // opcode byte (same as andnl)
+  emit_operand(dst, src2); // operand bytes for dst and the memory operand
+}
+
 void Assembler::bsfq(Register dst, Register src) {
   int encode = prefixq_and_encode(dst->encoding(), src->encoding());
   emit_int8(0x0F);
@@ -4858,6 +4951,51 @@
   emit_int8((unsigned char)(0xC8 | encode));
 }
 
+void Assembler::blsiq(Register dst, Register src) { // 64-bit BLSI, register source operand
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode_q(rbx, dst, src); // W=1 variant; rbx fills the helper's dst slot (NOTE(review): presumably the /3 opcode extension - confirm), the real dst goes in the nds slot
+  emit_int8((unsigned char)0xF3); // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte with mod=11 (register-direct)
+}
+
+void Assembler::blsiq(Register dst, Address src) { // 64-bit BLSI, memory source operand
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38_q(rbx, dst, src); // W=1 variant; rbx fills the helper's dst slot, the real dst goes in the nds slot
+  emit_int8((unsigned char)0xF3); // opcode byte
+  emit_operand(rbx, src); // NOTE(review): rbx presumably stands in for the /3 opcode extension - confirm against the Intel SDM
+}
+
+void Assembler::blsmskq(Register dst, Register src) { // 64-bit BLSMSK, register source operand
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode_q(rdx, dst, src); // W=1 variant; rdx fills the helper's dst slot (NOTE(review): presumably the /2 opcode extension - confirm), the real dst goes in the nds slot
+  emit_int8((unsigned char)0xF3); // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte with mod=11 (register-direct)
+}
+
+void Assembler::blsmskq(Register dst, Address src) { // 64-bit BLSMSK, memory source operand
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38_q(rdx, dst, src); // W=1 variant; rdx fills the helper's dst slot, the real dst goes in the nds slot
+  emit_int8((unsigned char)0xF3); // opcode byte
+  emit_operand(rdx, src); // NOTE(review): rdx presumably stands in for the /2 opcode extension - confirm against the Intel SDM
+}
+
+void Assembler::blsrq(Register dst, Register src) { // 64-bit BLSR, register source operand
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  int encode = vex_prefix_0F38_and_encode_q(rcx, dst, src); // W=1 variant; rcx fills the helper's dst slot (NOTE(review): presumably the /1 opcode extension - confirm), the real dst goes in the nds slot
+  emit_int8((unsigned char)0xF3); // opcode byte
+  emit_int8((unsigned char)(0xC0 | encode)); // ModRM byte with mod=11 (register-direct)
+}
+
+void Assembler::blsrq(Register dst, Address src) { // 64-bit BLSR, memory source operand
+  InstructionMark im(this);
+  assert(VM_Version::supports_bmi1(), "bit manipulation instructions not supported");
+  vex_prefix_0F38_q(rcx, dst, src); // W=1 variant; rcx fills the helper's dst slot, the real dst goes in the nds slot
+  emit_int8((unsigned char)0xF3); // opcode byte
+  emit_operand(rcx, src); // NOTE(review): rcx presumably stands in for the /1 opcode extension - confirm against the Intel SDM
+}
+
 void Assembler::cdqq() {
   prefix(REX_W);
   emit_int8((unsigned char)0x99);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Tue Mar 11 14:54:47 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp	Wed Mar 12 11:24:26 2014 -0700
@@ -590,10 +590,35 @@
     vex_prefix(src, nds_enc, dst_enc, pre, VEX_OPCODE_0F, false, vector256);
   }
 
+  void vex_prefix_0F38(Register dst, Register nds, Address src) { // VEX prefix for a general-register instruction in the 0F38 map with a memory operand, W=0
+    bool vex_w = false; // the _q variant passes true here
+    bool vector256 = false;
+    vex_prefix(src, nds->encoding(), dst->encoding(), // note the argument order: address, nds encoding, dst encoding
+               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+  }
+
+  void vex_prefix_0F38_q(Register dst, Register nds, Address src) { // same as vex_prefix_0F38 but with W=1
+    bool vex_w = true; // only difference from vex_prefix_0F38
+    bool vector256 = false;
+    vex_prefix(src, nds->encoding(), dst->encoding(), // note the argument order: address, nds encoding, dst encoding
+               VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+  }
   int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
                              VexSimdPrefix pre, VexOpcode opc,
                              bool vex_w, bool vector256);
 
+  int  vex_prefix_0F38_and_encode(Register dst, Register nds, Register src) { // register-register form: 0F38 map, W=0; forwards the int returned by vex_prefix_and_encode
+    bool vex_w = false; // the _q variant passes true here
+    bool vector256 = false;
+    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+  }
+  int  vex_prefix_0F38_and_encode_q(Register dst, Register nds, Register src) { // same as vex_prefix_0F38_and_encode but with W=1
+    bool vex_w = true; // only difference from vex_prefix_0F38_and_encode
+    bool vector256 = false;
+    return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),
+                                 VEX_SIMD_NONE, VEX_OPCODE_0F_38, vex_w, vector256);
+  }
   int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
                              VexSimdPrefix pre, bool vector256 = false,
                              VexOpcode opc = VEX_OPCODE_0F) {
@@ -897,6 +922,27 @@
   void andq(Register dst, Address src);
   void andq(Register dst, Register src);
 
+  // BMI instructions ('l' suffix = 32-bit form, 'q' suffix = 64-bit form; each has a register and a memory source overload)
+  void andnl(Register dst, Register src1, Register src2);
+  void andnl(Register dst, Register src1, Address src2);
+  void andnq(Register dst, Register src1, Register src2);
+  void andnq(Register dst, Register src1, Address src2);
+
+  void blsil(Register dst, Register src);
+  void blsil(Register dst, Address src);
+  void blsiq(Register dst, Register src);
+  void blsiq(Register dst, Address src);
+
+  void blsmskl(Register dst, Register src);
+  void blsmskl(Register dst, Address src);
+  void blsmskq(Register dst, Register src);
+  void blsmskq(Register dst, Address src);
+
+  void blsrl(Register dst, Register src);
+  void blsrl(Register dst, Address src);
+  void blsrq(Register dst, Register src);
+  void blsrq(Register dst, Address src);
+
   void bsfl(Register dst, Register src);
   void bsrl(Register dst, Register src);
 
@@ -1574,6 +1620,9 @@
   void testq(Register dst, int32_t imm32);
   void testq(Register dst, Register src);
 
+  // BMI - count trailing zeros (both implementations assert VM_Version::supports_bmi1())
+  void tzcntl(Register dst, Register src); // 32-bit form, register operands only
+  void tzcntq(Register dst, Register src); // 64-bit form, register operands only
 
   // Unordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
   void ucomisd(XMMRegister dst, Address src);
--- a/hotspot/src/cpu/x86/vm/globals_x86.hpp	Tue Mar 11 14:54:47 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/globals_x86.hpp	Wed Mar 12 11:24:26 2014 -0700
@@ -135,5 +135,11 @@
                                                                             \
   product(bool, UseCountLeadingZerosInstruction, false,                     \
           "Use count leading zeros instruction")                            \
+                                                                            \
+  product(bool, UseCountTrailingZerosInstruction, false,                    \
+          "Use count trailing zeros instruction")                           \
+                                                                            \
+  product(bool, UseBMI1Instructions, false,                                 \
+          "Use BMI1 instructions")
 
 #endif // CPU_X86_VM_GLOBALS_X86_HPP
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Tue Mar 11 14:54:47 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp	Wed Mar 12 11:24:26 2014 -0700
@@ -429,7 +429,7 @@
   }
 
   char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
                cores_per_cpu(), threads_per_core(),
                cpu_family(), _model, _stepping,
                (supports_cmov() ? ", cmov" : ""),
@@ -455,7 +455,9 @@
                (supports_ht() ? ", ht": ""),
                (supports_tsc() ? ", tsc": ""),
                (supports_tscinv_bit() ? ", tscinvbit": ""),
-               (supports_tscinv() ? ", tscinv": ""));
+               (supports_tscinv() ? ", tscinv": ""),
+               (supports_bmi1() ? ", bmi1" : ""),
+               (supports_bmi2() ? ", bmi2" : ""));
   _features_str = strdup(buf);
 
   // UseSSE is set to the smaller of what hardware supports and what
@@ -600,13 +602,6 @@
       }
     }
 
-    // Use count leading zeros count instruction if available.
-    if (supports_lzcnt()) {
-      if (FLAG_IS_DEFAULT(UseCountLeadingZerosInstruction)) {
-        UseCountLeadingZerosInstruction = true;
-      }
-    }
-
     // some defaults for AMD family 15h
     if ( cpu_family() == 0x15 ) {
       // On family 15h processors default is no sw prefetch
@@ -692,6 +687,35 @@
     }
 #endif // COMPILER2
 
+  // Use count leading zeros instruction if available.
+  if (supports_lzcnt()) {
+    if (FLAG_IS_DEFAULT(UseCountLeadingZerosInstruction)) {
+      UseCountLeadingZerosInstruction = true; // enable unless the flag was set explicitly
+    }
+  } else if (UseCountLeadingZerosInstruction) {
+    warning("lzcnt instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UseCountLeadingZerosInstruction, false); // requested but unsupported: turn it back off
+  }
+
+  if (supports_bmi1()) { // CPU reports the BMI1 feature
+    if (FLAG_IS_DEFAULT(UseBMI1Instructions)) {
+      UseBMI1Instructions = true; // enable unless the flag was set explicitly
+    }
+  } else if (UseBMI1Instructions) {
+    warning("BMI1 instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseBMI1Instructions, false); // requested but unsupported: turn it back off
+  }
+
+  // Use count trailing zeros instruction if available
+  if (supports_bmi1()) { // tzcnt availability is keyed on the BMI1 feature bit
+    if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) {
+      UseCountTrailingZerosInstruction = UseBMI1Instructions; // default follows the BMI1 flag, so disabling BMI1 also disables tzcnt
+    }
+  } else if (UseCountTrailingZerosInstruction) {
+    warning("tzcnt instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false); // requested but unsupported: turn it back off
+  }
+
   // Use population count instruction if available.
   if (supports_popcnt()) {
     if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Tue Mar 11 14:54:47 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp	Wed Mar 12 11:24:26 2014 -0700
@@ -141,7 +141,8 @@
     struct {
       uint32_t LahfSahf     : 1,
                CmpLegacy    : 1,
-                            : 4,
+                            : 3,
+               lzcnt_intel  : 1,
                lzcnt        : 1,
                sse4a        : 1,
                misalignsse  : 1,
@@ -251,7 +252,9 @@
     CPU_AVX2   = (1 << 18),
     CPU_AES    = (1 << 19),
     CPU_ERMS   = (1 << 20), // enhanced 'rep movsb/stosb' instructions
-    CPU_CLMUL  = (1 << 21) // carryless multiply for CRC
+    CPU_CLMUL  = (1 << 21), // carryless multiply for CRC
+    CPU_BMI1   = (1 << 22),
+    CPU_BMI2   = (1 << 23)
   } cpuFeatureFlags;
 
   enum {
@@ -423,6 +426,8 @@
       if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
         result |= CPU_AVX2;
     }
+    if (_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0) // checked for every vendor, unlike BMI2 which is only checked under is_intel()
+      result |= CPU_BMI1;
     if (_cpuid_info.std_cpuid1_edx.bits.tsc != 0)
       result |= CPU_TSC;
     if (_cpuid_info.ext_cpuid7_edx.bits.tsc_invariance != 0)
@@ -444,6 +449,13 @@
       if (_cpuid_info.ext_cpuid1_ecx.bits.sse4a != 0)
         result |= CPU_SSE4A;
     }
+    // Intel features.
+    if (is_intel()) {
+      if (_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0) // NOTE(review): BMI2 is only reported on Intel here while BMI1 is reported for all vendors - confirm this is intended
+        result |= CPU_BMI2;
+      if (_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0) // Intel-specific lzcnt bit maps onto the shared CPU_LZCNT feature flag
+        result |= CPU_LZCNT;
+    }
 
     return result;
   }
@@ -560,7 +572,8 @@
   static bool supports_aes()      { return (_cpuFeatures & CPU_AES) != 0; }
   static bool supports_erms()     { return (_cpuFeatures & CPU_ERMS) != 0; }
   static bool supports_clmul()    { return (_cpuFeatures & CPU_CLMUL) != 0; }
-
+  static bool supports_bmi1()     { return (_cpuFeatures & CPU_BMI1) != 0; } // true when the CPU_BMI1 bit is set in _cpuFeatures
+  static bool supports_bmi2()     { return (_cpuFeatures & CPU_BMI2) != 0; } // true when the CPU_BMI2 bit is set in _cpuFeatures
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
                                        extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/hotspot/src/cpu/x86/vm/x86_32.ad	Tue Mar 11 14:54:47 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad	Wed Mar 12 11:24:26 2014 -0700
@@ -5163,6 +5163,19 @@
 %}
 
 instruct countTrailingZerosI(rRegI dst, rRegI src, eFlagsReg cr) %{
+  predicate(UseCountTrailingZerosInstruction); // tzcnt variant; the _bsf variant carries the negated predicate
+  match(Set dst (CountTrailingZerosI src));
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "TZCNT    $dst, $src\t# count trailing zeros (int)" %}
+  ins_encode %{
+    __ tzcntl($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct countTrailingZerosI_bsf(rRegI dst, rRegI src, eFlagsReg cr) %{
+  predicate(!UseCountTrailingZerosInstruction);
   match(Set dst (CountTrailingZerosI src));
   effect(KILL cr);
 
@@ -5182,6 +5195,30 @@
 %}
 
 instruct countTrailingZerosL(rRegI dst, eRegL src, eFlagsReg cr) %{
+  predicate(UseCountTrailingZerosInstruction); // tzcnt variant; the _bsf variant carries the negated predicate
+  match(Set dst (CountTrailingZerosL src));
+  effect(TEMP dst, KILL cr); // dst is written before the high half of src is read, so it is declared TEMP
+
+  format %{ "TZCNT  $dst, $src.lo\t# count trailing zeros (long) \n\t"
+            "JNC    done\n\t"
+            "TZCNT  $dst, $src.hi\n\t"
+            "ADD    $dst, 32\n"
+            "done:" %}
+  ins_encode %{
+    Register Rdst = $dst$$Register;
+    Register Rsrc = $src$$Register; // low half of the long; HIGH_FROM_LOW(Rsrc) names the high half
+    Label done;
+    __ tzcntl(Rdst, Rsrc);
+    __ jccb(Assembler::carryClear, done); // NOTE(review): relies on TZCNT setting CF only when its source is zero - per Intel SDM
+    __ tzcntl(Rdst, HIGH_FROM_LOW(Rsrc)); // low half was zero: count in the high half instead
+    __ addl(Rdst, BitsPerInt); // and add the width of the skipped low half
+    __ bind(done);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct countTrailingZerosL_bsf(rRegI dst, eRegL src, eFlagsReg cr) %{
+  predicate(!UseCountTrailingZerosInstruction);
   match(Set dst (CountTrailingZerosL src));
   effect(TEMP dst, KILL cr);
 
@@ -8027,6 +8064,123 @@
   ins_pipe( ialu_mem_imm );
 %}
 
+// BMI1 instructions
+instruct andnI_rReg_rReg_rReg(rRegI dst, rRegI src1, rRegI src2, immI_M1 minus_1, eFlagsReg cr) %{
+  match(Set dst (AndI (XorI src1 minus_1) src2)); // (src1 ^ minus_1) & src2; presumably immI_M1 is the constant -1, making this ~src1 & src2 - confirm operand definition
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "ANDNL  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnl($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct andnI_rReg_rReg_mem(rRegI dst, rRegI src1, memory src2, immI_M1 minus_1, eFlagsReg cr) %{
+  match(Set dst (AndI (XorI src1 minus_1) (LoadI src2) )); // same pattern as the register form, with the second operand loaded from memory
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  ins_cost(125); // explicit cost on the memory form; the register form sets none
+  format %{ "ANDNL  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnl($dst$$Register, $src1$$Register, $src2$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, eFlagsReg cr) %{
+  match(Set dst (AndI (SubI imm_zero src) src)); // (imm_zero - src) & src; presumably immI0 is the constant 0, making this -src & src - confirm operand definition
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "BLSIL  $dst, $src" %}
+
+  ins_encode %{
+    __ blsil($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiI_rReg_mem(rRegI dst, memory src, immI0 imm_zero, eFlagsReg cr) %{
+  match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) )); // same pattern as the register form; both uses are a load from the same memory operand
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  ins_cost(125); // explicit cost on the memory form; the register form sets none
+  format %{ "BLSIL  $dst, $src" %}
+
+  ins_encode %{
+    __ blsil($dst$$Register, $src$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (XorI (AddI src minus_1) src)); // (src + minus_1) ^ src; presumably immI_M1 is the constant -1, making this (src - 1) ^ src - confirm operand definition
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "BLSMSKL $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskl($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsmskI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (XorI (AddI (LoadI src) minus_1) (LoadI src) )); // same pattern as the register form; both uses are a load from the same memory operand
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  ins_cost(125); // explicit cost on the memory form; the register form sets none
+  format %{ "BLSMSKL $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskl($dst$$Register, $src$$Address);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsrI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (AndI (AddI src minus_1) src) ); // (src + minus_1) & src; presumably immI_M1 is the constant -1, making this (src - 1) & src - confirm operand definition
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "BLSRL  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrl($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (AndI (AddI (LoadI src) minus_1) (LoadI src) )); // same pattern as the register form; both uses are a load from the same memory operand
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  ins_cost(125); // explicit cost on the memory form; the register form sets none
+  format %{ "BLSRL  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrl($dst$$Register, $src$$Address);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
 // Or Instructions
 // Or Register with Register
 instruct orI_eReg(rRegI dst, rRegI src, eFlagsReg cr) %{
@@ -8649,6 +8803,210 @@
   ins_pipe( ialu_reg_long_mem );
 %}
 
+// BMI1 instructions
+instruct andnL_eReg_eReg_eReg(eRegL dst, eRegL src1, eRegL src2, immL_M1 minus_1, eFlagsReg cr) %{
+  match(Set dst (AndL (XorL src1 minus_1) src2)); // long version of the andn pattern, emitted as two 32-bit andnl instructions
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst); // dst.lo is written before the high halves of the sources are read
+
+  format %{ "ANDNL  $dst.lo, $src1.lo, $src2.lo\n\t"
+            "ANDNL  $dst.hi, $src1.hi, $src2.hi"
+         %}
+
+  ins_encode %{
+    Register Rdst = $dst$$Register;
+    Register Rsrc1 = $src1$$Register;
+    Register Rsrc2 = $src2$$Register;
+    __ andnl(Rdst, Rsrc1, Rsrc2); // low halves
+    __ andnl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc1), HIGH_FROM_LOW(Rsrc2)); // high halves
+  %}
+  ins_pipe(ialu_reg_reg_long);
+%}
+
+instruct andnL_eReg_eReg_mem(eRegL dst, eRegL src1, memory src2, immL_M1 minus_1, eFlagsReg cr) %{
+  match(Set dst (AndL (XorL src1 minus_1) (LoadL src2) )); // long andn pattern with the second operand loaded from memory
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst); // dst.lo is written before the second andnl reads its inputs
+
+  ins_cost(125);
+  format %{ "ANDNL  $dst.lo, $src1.lo, $src2\n\t"
+            "ANDNL  $dst.hi, $src1.hi, $src2+4"
+         %}
+
+  ins_encode %{
+    Register Rdst = $dst$$Register;
+    Register Rsrc1 = $src1$$Register;
+    Address src2_hi = Address::make_raw($src2$$base, $src2$$index, $src2$$scale, $src2$$disp + 4, relocInfo::none); // same address with the displacement advanced by 4: the high word
+
+    __ andnl(Rdst, Rsrc1, $src2$$Address); // low halves
+    __ andnl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc1), src2_hi); // high halves
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsiL_eReg_eReg(eRegL dst, eRegL src, immL0 imm_zero, eFlagsReg cr) %{
+  match(Set dst (AndL (SubL imm_zero src) src)); // long version of the blsi pattern, built from 32-bit blsil instructions
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst); // dst is written before src.hi is read
+
+  format %{ "MOVL   $dst.hi, 0\n\t"
+            "BLSIL  $dst.lo, $src.lo\n\t"
+            "JNZ    done\n\t"
+            "BLSIL  $dst.hi, $src.hi\n"
+            "done:"
+         %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Register Rsrc = $src$$Register;
+    __ movl(HIGH_FROM_LOW(Rdst), 0); // default high result, kept when the low half produces a non-zero result
+    __ blsil(Rdst, Rsrc);
+    __ jccb(Assembler::notZero, done); // NOTE(review): relies on BLSI setting ZF from its result - per Intel SDM
+    __ blsil(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc)); // low result was zero: compute the high half
+    __ bind(done);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiL_eReg_mem(eRegL dst, memory src, immL0 imm_zero, eFlagsReg cr) %{
+  match(Set dst (AndL (SubL imm_zero (LoadL src) ) (LoadL src) )); // long blsi pattern with the operand loaded from memory
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst); // dst is written before the high word is read
+
+  ins_cost(125);
+  format %{ "MOVL   $dst.hi, 0\n\t"
+            "BLSIL  $dst.lo, $src\n\t"
+            "JNZ    done\n\t"
+            "BLSIL  $dst.hi, $src+4\n"
+            "done:"
+         %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none); // same address with the displacement advanced by 4: the high word
+
+    __ movl(HIGH_FROM_LOW(Rdst), 0); // default high result, kept when the low half produces a non-zero result
+    __ blsil(Rdst, $src$$Address);
+    __ jccb(Assembler::notZero, done); // NOTE(review): relies on BLSI setting ZF from its result - per Intel SDM
+    __ blsil(HIGH_FROM_LOW(Rdst), src_hi); // low result was zero: compute the high half
+    __ bind(done);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskL_eReg_eReg(eRegL dst, eRegL src, immL_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (XorL (AddL src minus_1) src)); // long version of the blsmsk pattern, built from 32-bit blsmskl instructions
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst); // dst is written before src.hi is read
+
+  format %{ "MOVL    $dst.hi, 0\n\t"
+            "BLSMSKL $dst.lo, $src.lo\n\t"
+            "JNC     done\n\t"
+            "BLSMSKL $dst.hi, $src.hi\n"
+            "done:"
+         %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Register Rsrc = $src$$Register;
+    __ movl(HIGH_FROM_LOW(Rdst), 0); // default high result, kept when the branch below is taken
+    __ blsmskl(Rdst, Rsrc);
+    __ jccb(Assembler::carryClear, done); // NOTE(review): relies on BLSMSK setting CF only when its source is zero - per Intel SDM
+    __ blsmskl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc)); // low half was zero: the mask extends into the high half
+    __ bind(done);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsmskL_eReg_mem(eRegL dst, memory src, immL_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (XorL (AddL (LoadL src) minus_1) (LoadL src) )); // long blsmsk pattern with the operand loaded from memory
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst); // dst is written before the high word is read
+
+  ins_cost(125);
+  format %{ "MOVL    $dst.hi, 0\n\t"
+            "BLSMSKL $dst.lo, $src\n\t"
+            "JNC     done\n\t"
+            "BLSMSKL $dst.hi, $src+4\n"
+            "done:"
+         %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none); // same address with the displacement advanced by 4: the high word
+
+    __ movl(HIGH_FROM_LOW(Rdst), 0); // default high result, kept when the branch below is taken
+    __ blsmskl(Rdst, $src$$Address);
+    __ jccb(Assembler::carryClear, done); // NOTE(review): relies on BLSMSK setting CF only when its source is zero - per Intel SDM
+    __ blsmskl(HIGH_FROM_LOW(Rdst), src_hi); // low half was zero: the mask extends into the high half
+    __ bind(done);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsrL_eReg_eReg(eRegL dst, eRegL src, immL_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (AndL (AddL src minus_1) src) ); // long version of the blsr pattern, built from 32-bit blsrl instructions
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst); // dst is written before src.lo and src.hi are fully consumed
+
+  format %{ "MOVL   $dst.hi, $src.hi\n\t"
+            "BLSRL  $dst.lo, $src.lo\n\t"
+            "JNC    done\n\t"
+            "BLSRL  $dst.hi, $src.hi\n"
+            "done:"
+  %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Register Rsrc = $src$$Register;
+    __ movl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc)); // default high result is the unchanged high half of src
+    __ blsrl(Rdst, Rsrc);
+    __ jccb(Assembler::carryClear, done); // NOTE(review): relies on BLSR setting CF only when its source is zero - per Intel SDM
+    __ blsrl(HIGH_FROM_LOW(Rdst), HIGH_FROM_LOW(Rsrc)); // low half was zero: reset the lowest set bit in the high half instead
+    __ bind(done);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrL_eReg_mem(eRegL dst, memory src, immL_M1 minus_1, eFlagsReg cr)
+%{
+  match(Set dst (AndL (AddL (LoadL src) minus_1) (LoadL src) )); // long blsr pattern with the operand loaded from memory
+  predicate(UseBMI1Instructions);
+  effect(KILL cr, TEMP dst); // dst is written before the memory operand is read again
+
+  ins_cost(125);
+  format %{ "MOVL   $dst.hi, $src+4\n\t"
+            "BLSRL  $dst.lo, $src\n\t"
+            "JNC    done\n\t"
+            "BLSRL  $dst.hi, $src+4\n"
+            "done:"
+  %}
+
+  ins_encode %{
+    Label done;
+    Register Rdst = $dst$$Register;
+    Address src_hi = Address::make_raw($src$$base, $src$$index, $src$$scale, $src$$disp + 4, relocInfo::none); // same address with the displacement advanced by 4: the high word
+    __ movl(HIGH_FROM_LOW(Rdst), src_hi); // default high result is the unchanged high word
+    __ blsrl(Rdst, $src$$Address);
+    __ jccb(Assembler::carryClear, done); // NOTE(review): relies on BLSR setting CF only when its source is zero - per Intel SDM
+    __ blsrl(HIGH_FROM_LOW(Rdst), src_hi); // low word was zero: reset the lowest set bit in the high word instead
+    __ bind(done);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
 // Or Long Register with Register
 instruct orl_eReg(eRegL dst, eRegL src, eFlagsReg cr) %{
   match(Set dst (OrL dst src));
--- a/hotspot/src/cpu/x86/vm/x86_64.ad	Tue Mar 11 14:54:47 2014 -0700
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad	Wed Mar 12 11:24:26 2014 -0700
@@ -6022,6 +6022,19 @@
 %}
 
 instruct countTrailingZerosI(rRegI dst, rRegI src, rFlagsReg cr) %{
+  predicate(UseCountTrailingZerosInstruction); // tzcnt variant; the _bsf variant carries the negated predicate
+  match(Set dst (CountTrailingZerosI src));
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "tzcntl    $dst, $src\t# count trailing zeros (int)" %}
+  ins_encode %{
+    __ tzcntl($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct countTrailingZerosI_bsf(rRegI dst, rRegI src, rFlagsReg cr) %{
+  predicate(!UseCountTrailingZerosInstruction);
   match(Set dst (CountTrailingZerosI src));
   effect(KILL cr);
 
@@ -6041,6 +6054,19 @@
 %}
 
 instruct countTrailingZerosL(rRegI dst, rRegL src, rFlagsReg cr) %{
+  predicate(UseCountTrailingZerosInstruction); // tzcnt variant; the _bsf variant carries the negated predicate
+  match(Set dst (CountTrailingZerosL src));
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "tzcntq    $dst, $src\t# count trailing zeros (long)" %}
+  ins_encode %{
+    __ tzcntq($dst$$Register, $src$$Register); // a single 64-bit instruction handles the whole long
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct countTrailingZerosL_bsf(rRegI dst, rRegL src, rFlagsReg cr) %{
+  predicate(!UseCountTrailingZerosInstruction);
   match(Set dst (CountTrailingZerosL src));
   effect(KILL cr);
 
@@ -8622,6 +8648,122 @@
   ins_pipe(ialu_mem_imm);
 %}
 
+// BMI1 instructions
+instruct andnI_rReg_rReg_mem(rRegI dst, rRegI src1, memory src2, immI_M1 minus_1, rFlagsReg cr) %{
+  match(Set dst (AndI (XorI src1 minus_1) (LoadI src2))); // (src1 ^ minus_1) & load(src2); presumably immI_M1 is the constant -1 - confirm operand definition
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  ins_cost(125); // explicit cost on the memory form; the register form sets none
+  format %{ "andnl  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnl($dst$$Register, $src1$$Register, $src2$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct andnI_rReg_rReg_rReg(rRegI dst, rRegI src1, rRegI src2, immI_M1 minus_1, rFlagsReg cr) %{
+  match(Set dst (AndI (XorI src1 minus_1) src2)); // (src1 ^ minus_1) & src2; presumably immI_M1 is the constant -1, making this ~src1 & src2 - confirm operand definition
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "andnl  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnl($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiI_rReg_rReg(rRegI dst, rRegI src, immI0 imm_zero, rFlagsReg cr) %{
+  match(Set dst (AndI (SubI imm_zero src) src)); // (imm_zero - src) & src; presumably immI0 is the constant 0, making this -src & src - confirm operand definition
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "blsil  $dst, $src" %}
+
+  ins_encode %{
+    __ blsil($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiI_rReg_mem(rRegI dst, memory src, immI0 imm_zero, rFlagsReg cr) %{
+  match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) )); // same pattern as the register form; both uses are a load from the same memory operand
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  ins_cost(125); // explicit cost on the memory form; the register form sets none
+  format %{ "blsil  $dst, $src" %}
+
+  ins_encode %{
+    __ blsil($dst$$Register, $src$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (XorI (AddI (LoadI src) minus_1) (LoadI src) ) ); // (load + minus_1) ^ load, both loads from the same memory operand
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  ins_cost(125); // explicit cost on the memory form; the register form sets none
+  format %{ "blsmskl $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskl($dst$$Register, $src$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (XorI (AddI src minus_1) src)); // (src + minus_1) ^ src; presumably immI_M1 is the constant -1, making this (src - 1) ^ src - confirm operand definition
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "blsmskl $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskl($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrI_rReg_rReg(rRegI dst, rRegI src, immI_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (AndI (AddI src minus_1) src) ); // (src + minus_1) & src; presumably immI_M1 is the constant -1, making this (src - 1) & src - confirm operand definition
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  format %{ "blsrl  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrl($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrI_rReg_mem(rRegI dst, memory src, immI_M1 minus_1, rFlagsReg cr)
+%{
+  match(Set dst (AndI (AddI (LoadI src) minus_1) (LoadI src) ) ); // (load + minus_1) & load, both loads from the same memory operand
+  predicate(UseBMI1Instructions);
+  effect(KILL cr); // flags register is declared clobbered
+
+  ins_cost(125); // explicit cost on the memory form; the register form sets none
+  format %{ "blsrl  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrl($dst$$Register, $src$$Address);
+  %}
+
+  ins_pipe(ialu_reg_mem);
+%}
+
 // Or Instructions
 // Or Register with Register
 instruct orI_rReg(rRegI dst, rRegI src, rFlagsReg cr)
@@ -8853,6 +8995,122 @@
   ins_pipe(ialu_mem_imm);
 %}
 
+// BMI1 instructions
+instruct andnL_rReg_rReg_mem(rRegL dst, rRegL src1, memory src2, immL_M1 minus_1, rFlagsReg cr) %{  // 64-bit andnq, second source in memory
+  match(Set dst (AndL (XorL src1 minus_1) (LoadL src2)));  // dst = (src1 ^ minus_1) & [src2]
+  predicate(UseBMI1Instructions);  // rule is only eligible when this flag is set
+  effect(KILL cr);  // flags register is declared clobbered
+
+  ins_cost(125);  // explicit cost, as on the other memory-operand BMI1 rules
+  format %{ "andnq  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnq($dst$$Register, $src1$$Register, $src2$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct andnL_rReg_rReg_rReg(rRegL dst, rRegL src1, rRegL src2, immL_M1 minus_1, rFlagsReg cr) %{  // 64-bit andnq, all-register form
+  match(Set dst (AndL (XorL src1 minus_1) src2));  // dst = (src1 ^ minus_1) & src2
+  predicate(UseBMI1Instructions);  // rule is only eligible when this flag is set
+  effect(KILL cr);  // flags register is declared clobbered
+
+  format %{ "andnq  $dst, $src1, $src2" %}
+
+  ins_encode %{
+    __ andnq($dst$$Register, $src1$$Register, $src2$$Register);
+  %}
+  ins_pipe(ialu_reg);  // no memory operand here: use the register pipe class like the other register-only BMI1 rules
+%}
+
+instruct blsiL_rReg_rReg(rRegL dst, rRegL src, immL0 imm_zero, rFlagsReg cr) %{  // 64-bit blsiq, register source
+  match(Set dst (AndL (SubL imm_zero src) src));  // dst = (imm_zero - src) & src
+  predicate(UseBMI1Instructions);  // rule is only eligible when this flag is set
+  effect(KILL cr);  // flags register is declared clobbered
+
+  format %{ "blsiq  $dst, $src" %}
+
+  ins_encode %{
+    __ blsiq($dst$$Register, $src$$Register);
+  %}
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsiL_rReg_mem(rRegL dst, memory src, immL0 imm_zero, rFlagsReg cr) %{  // 64-bit blsiq, memory source
+  match(Set dst (AndL (SubL imm_zero (LoadL src) ) (LoadL src) ));  // dst = (imm_zero - [src]) & [src]; the same memory operand appears twice
+  predicate(UseBMI1Instructions);  // rule is only eligible when this flag is set
+  effect(KILL cr);  // flags register is declared clobbered
+
+  ins_cost(125);  // explicit cost, as on the other memory-operand BMI1 rules
+  format %{ "blsiq  $dst, $src" %}
+
+  ins_encode %{
+    __ blsiq($dst$$Register, $src$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskL_rReg_mem(rRegL dst, memory src, immL_M1 minus_1, rFlagsReg cr)  // 64-bit blsmskq, memory source
+%{
+  match(Set dst (XorL (AddL (LoadL src) minus_1) (LoadL src) ) );  // dst = ([src] + minus_1) ^ [src]; the same memory operand appears twice
+  predicate(UseBMI1Instructions);  // rule is only eligible when this flag is set
+  effect(KILL cr);  // flags register is declared clobbered
+
+  ins_cost(125);  // explicit cost, as on the other memory-operand BMI1 rules
+  format %{ "blsmskq $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskq($dst$$Register, $src$$Address);
+  %}
+  ins_pipe(ialu_reg_mem);
+%}
+
+instruct blsmskL_rReg_rReg(rRegL dst, rRegL src, immL_M1 minus_1, rFlagsReg cr)  // 64-bit blsmskq, register source
+%{
+  match(Set dst (XorL (AddL src minus_1) src));  // dst = (src + minus_1) ^ src
+  predicate(UseBMI1Instructions);  // rule is only eligible when this flag is set
+  effect(KILL cr);  // flags register is declared clobbered
+
+  format %{ "blsmskq $dst, $src" %}
+
+  ins_encode %{
+    __ blsmskq($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrL_rReg_rReg(rRegL dst, rRegL src, immL_M1 minus_1, rFlagsReg cr)  // 64-bit blsrq, register source
+%{
+  match(Set dst (AndL (AddL src minus_1) src) );  // dst = (src + minus_1) & src
+  predicate(UseBMI1Instructions);  // rule is only eligible when this flag is set
+  effect(KILL cr);  // flags register is declared clobbered
+
+  format %{ "blsrq  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrq($dst$$Register, $src$$Register);
+  %}
+
+  ins_pipe(ialu_reg);
+%}
+
+instruct blsrL_rReg_mem(rRegL dst, memory src, immL_M1 minus_1, rFlagsReg cr)  // 64-bit blsrq, memory source
+%{
+  match(Set dst (AndL (AddL (LoadL src) minus_1) (LoadL src)) );  // dst = ([src] + minus_1) & [src]; the same memory operand appears twice
+  predicate(UseBMI1Instructions);  // rule is only eligible when this flag is set
+  effect(KILL cr);  // flags register is declared clobbered
+
+  ins_cost(125);  // explicit cost, as on the other memory-operand BMI1 rules
+  format %{ "blsrq  $dst, $src" %}
+
+  ins_encode %{
+    __ blsrq($dst$$Register, $src$$Address);
+  %}
+
+  ins_pipe(ialu_reg_mem);  // source is a memory operand: use the same pipe class as the other reg-mem BMI1 rules
+%}
+
 // Or Instructions
 // Or Register with Register
 instruct orL_rReg(rRegL dst, rRegL src, rFlagsReg cr)
--- a/hotspot/src/share/vm/adlc/formssel.cpp	Tue Mar 11 14:54:47 2014 -0700
+++ b/hotspot/src/share/vm/adlc/formssel.cpp	Wed Mar 12 11:24:26 2014 -0700
@@ -660,6 +660,7 @@
   int USE_of_memory  = 0;
   int DEF_of_memory  = 0;
   const char*    last_memory_DEF = NULL; // to test DEF/USE pairing in asserts
+  const char*    last_memory_USE = NULL;
   Component     *unique          = NULL;
   Component     *comp            = NULL;
   ComponentList &components      = (ComponentList &)_components;
@@ -681,7 +682,16 @@
           assert(0 == strcmp(last_memory_DEF, comp->_name), "every memory DEF is followed by a USE of the same name");
           last_memory_DEF = NULL;
         }
-        USE_of_memory++;
+        // Handles same memory being used multiple times in the case of BMI1 instructions.
+        if (last_memory_USE != NULL) {
+          if (strcmp(comp->_name, last_memory_USE) != 0) {
+            USE_of_memory++;
+          }
+        } else {
+          USE_of_memory++;
+        }
+        last_memory_USE = comp->_name;
+
         if (DEF_of_memory == 0)  // defs take precedence
           unique = comp;
       } else {
--- a/hotspot/src/share/vm/opto/matcher.cpp	Tue Mar 11 14:54:47 2014 -0700
+++ b/hotspot/src/share/vm/opto/matcher.cpp	Wed Mar 12 11:24:26 2014 -0700
@@ -1922,6 +1922,105 @@
   return OptoReg::as_OptoReg(regs.first());
 }
 
+// This function identifies sub-graphs in which a 'load' node is
+// input to two different nodes, and such that it can be matched
+// with BMI instructions like blsi, blsr, etc.
+// Example: b = -a[i] & a[i] can be matched to blsi r32, m32.
+// The graph is (AndL (SubL Con0 LoadL*) LoadL*), where LoadL*
+// refers to the same node.
+#ifdef X86
+// Match the generic fused operations pattern (op1 (op2 Con{ConType} mop) mop)
+// This is a temporary solution until we make DAGs expressible in ADL.
+template<typename ConType>  // ConType must provide NativeType and as_self(); both are used by match()
+class FusedPatternMatcher {
+  Node* _op1_node;  // candidate root of the pattern (the op1 node)
+  Node* _mop_node;  // node that must be an input of both op1 and op2
+  int _con_op;      // opcode required of op2's constant input
+  // Returns the input index of n (1 or 2) whose opcode is next_op, or -1 if there is none or an input is NULL.
+  static int match_next(Node* n, int next_op, int next_op_idx) {
+    if (n->in(1) == NULL || n->in(2) == NULL) {
+      return -1;
+    }
+
+    if (next_op_idx == -1) { // n is commutative, try rotations
+      if (n->in(1)->Opcode() == next_op) {
+        return 1;
+      } else if (n->in(2)->Opcode() == next_op) {
+        return 2;
+      }
+    } else {
+      assert(next_op_idx > 0 && next_op_idx <= 2, "Bad argument index");
+      if (n->in(next_op_idx)->Opcode() == next_op) {  // fixed position: only that input is examined
+        return next_op_idx;
+      }
+    }
+    return -1;
+  }
+public:
+  FusedPatternMatcher(Node* op1_node, Node *mop_node, int con_op) :
+    _op1_node(op1_node), _mop_node(mop_node), _con_op(con_op) { }
+  // Returns true iff _op1_node is (op1 (op2 con mop) mop) with con equal to con_value and mop == _mop_node.
+  bool match(int op1, int op1_op2_idx,  // op1 and the index of the op1->op2 edge, -1 if op1 is commutative
+             int op2, int op2_con_idx,  // op2 and the index of the op2->con edge, -1 if op2 is commutative
+             typename ConType::NativeType con_value) {
+    if (_op1_node->Opcode() != op1) {
+      return false;
+    }
+    if (_mop_node->outcnt() > 2) {  // reject when mop has more than two users
+      return false;
+    }
+    op1_op2_idx = match_next(_op1_node, op2, op1_op2_idx);  // resolve the actual position of op2 under op1
+    if (op1_op2_idx == -1) {
+      return false;
+    }
+    // Memory operation must be the other edge
+    int op1_mop_idx = (op1_op2_idx & 1) + 1;  // maps index 1 -> 2 and 2 -> 1
+
+    // Check that the mop node is really what we want
+    if (_op1_node->in(op1_mop_idx) == _mop_node) {
+      Node *op2_node = _op1_node->in(op1_op2_idx);
+      if (op2_node->outcnt() > 1) {  // reject when op2 has any user besides op1
+        return false;
+      }
+      assert(op2_node->Opcode() == op2, "Should be");
+      op2_con_idx = match_next(op2_node, _con_op, op2_con_idx);  // resolve the actual position of the constant under op2
+      if (op2_con_idx == -1) {
+        return false;
+      }
+      // Memory operation must be the other edge
+      int op2_mop_idx = (op2_con_idx & 1) + 1;  // maps index 1 -> 2 and 2 -> 1
+      // Check that the memory operation is the same node
+      if (op2_node->in(op2_mop_idx) == _mop_node) {
+        // Now check the constant
+        const Type* con_type = op2_node->in(op2_con_idx)->bottom_type();
+        if (con_type != Type::TOP && ConType::as_self(con_type)->get_con() == con_value) {  // TOP is excluded before reading the constant
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+};
+
+
+bool Matcher::is_bmi_pattern(Node *n, Node *m) {  // true if n is the root and m the shared LoadI/LoadL of one of the patterns below
+  if (n != NULL && m != NULL) {
+    if (m->Opcode() == Op_LoadI) {
+      FusedPatternMatcher<TypeInt> bmii(n, m, Op_ConI);
+      return bmii.match(Op_AndI, -1, Op_SubI,  1,  0)  ||  // (0 - m) & m; SubI is not commutative, so the constant must be input 1
+             bmii.match(Op_AndI, -1, Op_AddI, -1, -1)  ||  // (m + -1) & m
+             bmii.match(Op_XorI, -1, Op_AddI, -1, -1);     // (m + -1) ^ m
+    } else if (m->Opcode() == Op_LoadL) {
+      FusedPatternMatcher<TypeLong> bmil(n, m, Op_ConL);
+      return bmil.match(Op_AndL, -1, Op_SubL,  1,  0) ||  // (0 - m) & m
+             bmil.match(Op_AndL, -1, Op_AddL, -1, -1) ||  // (m + -1) & m
+             bmil.match(Op_XorL, -1, Op_AddL, -1, -1);    // (m + -1) ^ m
+    }
+  }
+  return false;  // NULL argument, or m is neither LoadI nor LoadL
+}
+#endif // X86
+
 // A method-klass-holder may be passed in the inline_cache_reg
 // and then expanded into the inline_cache_reg and a method_oop register
 //   defined in ad_<arch>.cpp
@@ -2077,6 +2176,14 @@
           set_shared(m->in(AddPNode::Base)->in(1));
         }
 
+        // if 'n' and 'm' are part of a graph for BMI instruction, clone this node.
+#ifdef X86
+        if (UseBMI1Instructions && is_bmi_pattern(n, m)) {
+          mstack.push(m, Visit);
+          continue;
+        }
+#endif
+
         // Clone addressing expressions as they are "free" in memory access instructions
         if( mem_op && i == MemNode::Address && mop == Op_AddP ) {
           // Some inputs for address expression are not put on stack
--- a/hotspot/src/share/vm/opto/matcher.hpp	Tue Mar 11 14:54:47 2014 -0700
+++ b/hotspot/src/share/vm/opto/matcher.hpp	Wed Mar 12 11:24:26 2014 -0700
@@ -79,6 +79,9 @@
 
   // Find shared Nodes, or Nodes that otherwise are Matcher roots
   void find_shared( Node *n );
+#ifdef X86
+  bool is_bmi_pattern(Node *n, Node *m);
+#endif
 
   // Debug and profile information for nodes in old space:
   GrowableArray<Node_Notes*>* _old_node_note_array;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/codegen/BMI1.java	Wed Mar 12 11:24:26 2014 -0700
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/*
+ * @test
+ * @bug 8031321
+ * @summary Support BMI1 instructions on x86/x64
+ * @run main/othervm -Xbatch -XX:-TieredCompilation -XX:CompileCommand=compileonly,BMITests.* BMI1
+ *
+ */
+
+class MemI {  // Holder for one int field; BMITests reads x from it in the MemI overloads
+  public int x;
+  public MemI(int x) { this.x = x; }
+}
+
+class MemL {  // Holder for one long field; BMITests reads x from it in the MemL overloads
+  public long x;
+  public MemL(long x) { this.x = x; }
+}
+
+class BMITests {  // One small static method per expression shape under test; the @run line restricts compilation to BMITests.*
+  static int andnl(int src1, int src2) {  // ~src1 & src2, int operands
+    return ~src1 & src2;
+  }
+  static long andnq(long src1, long src2) {  // ~src1 & src2, long operands
+    return ~src1 & src2;
+  }
+  static int andnl(int src1, MemI src2) {  // as above, second operand read from a field
+    return ~src1 & src2.x;
+  }
+  static long andnq(long src1, MemL src2) {  // as above, second operand read from a field
+    return ~src1 & src2.x;
+  }
+  static int blsil(int src1) {  // x & -x
+    return src1 & -src1;
+  }
+  static long blsiq(long src1) {  // x & -x
+    return src1 & -src1;
+  }
+  static int blsil(MemI src1) {  // x & -x with x read from a field (the field is read twice in the source)
+    return src1.x & -src1.x;
+  }
+  static long blsiq(MemL src1) {  // x & -x with x read from a field
+    return src1.x & -src1.x;
+  }
+  static int blsmskl(int src1) {  // (x - 1) ^ x
+    return (src1 - 1) ^ src1;
+  }
+  static long blsmskq(long src1) {  // (x - 1) ^ x
+    return (src1 - 1) ^ src1;
+  }
+  static int blsmskl(MemI src1) {  // (x - 1) ^ x with x read from a field
+    return (src1.x - 1) ^ src1.x;
+  }
+  static long blsmskq(MemL src1) {  // (x - 1) ^ x with x read from a field
+    return (src1.x - 1) ^ src1.x;
+  }
+  static int blsrl(int src1) {  // (x - 1) & x
+    return (src1 - 1) & src1;
+  }
+  static long blsrq(long src1) {  // (x - 1) & x
+    return (src1 - 1) & src1;
+  }
+  static int blsrl(MemI src1) {  // (x - 1) & x with x read from a field
+    return (src1.x - 1) & src1.x;
+  }
+  static long blsrq(MemL src1) {  // (x - 1) & x with x read from a field
+    return (src1.x - 1) & src1.x;
+  }
+  static int lzcntl(int src1) {  // leading-zero count of an int
+    return Integer.numberOfLeadingZeros(src1);
+  }
+  static int lzcntq(long src1) {  // leading-zero count of a long
+    return Long.numberOfLeadingZeros(src1);
+  }
+  static int tzcntl(int src1) {  // trailing-zero count of an int
+    return Integer.numberOfTrailingZeros(src1);
+  }
+  static int tzcntq(long src1) {  // trailing-zero count of a long
+    return Long.numberOfTrailingZeros(src1);
+  }
+}
+
+public class BMI1 {  // Test driver: calls each BMITests method repeatedly and checks every result against the first call's result
+  private final static int ITERATIONS = 1000000;  // number of repeated calls per BMITests method
+
+  public static void main(String[] args) {  // throws Error with a per-case message on the first mismatching result
+    int ix = 0x01234567;
+    int iy = 0x89abcdef;
+    MemI imy = new MemI(iy);  // same value as iy, but reached through a field
+    long lx = 0x0123456701234567L;
+    long ly = 0x89abcdef89abcdefL;
+    MemL lmy = new MemL(ly);  // same value as ly, but reached through a field
+
+    { // match(Set dst (AndI (XorI src1 minus_1) src2))
+      int z = BMITests.andnl(ix, iy);  // reference value from the first call
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.andnl(ix, iy);
+        if (ii != z) {
+          throw new Error("andnl with register failed");
+        }
+      }
+    }
+    { // match(Set dst (AndL (XorL src1 minus_1) src2))
+      long z = BMITests.andnq(lx, ly);
+      for (int i = 0; i < ITERATIONS; i++) {
+        long ll = BMITests.andnq(lx, ly);
+        if (ll != z) {
+          throw new Error("andnq with register failed");
+        }
+      }
+    }
+    { // match(Set dst (AndI (XorI src1 minus_1) (LoadI src2)))
+      int z = BMITests.andnl(ix, imy);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.andnl(ix, imy);
+        if (ii != z) {
+          throw new Error("andnl with memory failed");
+        }
+      }
+    }
+    { // match(Set dst (AndL (XorL src1 minus_1) (LoadL src2)))
+      long z = BMITests.andnq(lx, lmy);
+      for (int i = 0; i < ITERATIONS; i++) {
+        long ll = BMITests.andnq(lx, lmy);
+        if (ll != z) {
+          throw new Error("andnq with memory failed");
+        }
+      }
+    }
+    { // match(Set dst (AndI (SubI imm_zero src) src))
+      int z = BMITests.blsil(ix);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.blsil(ix);
+        if (ii != z) {
+          throw new Error("blsil with register failed");
+        }
+      }
+    }
+    { // match(Set dst (AndL (SubL imm_zero src) src))
+      long z = BMITests.blsiq(lx);
+      for (int i = 0; i < ITERATIONS; i++) {
+        long ll = BMITests.blsiq(lx);
+        if (ll != z) {
+          throw new Error("blsiq with register failed");
+        }
+      }
+    }
+    { // match(Set dst (AndI (SubI imm_zero (LoadI src) ) (LoadI src) ))
+      int z = BMITests.blsil(imy);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.blsil(imy);
+        if (ii != z) {
+          throw new Error("blsil with memory failed");
+        }
+      }
+    }
+    { // match(Set dst (AndL (SubL imm_zero (LoadL src) ) (LoadL src) ))
+      long z = BMITests.blsiq(lmy);
+      for (int i = 0; i < ITERATIONS; i++) {
+        long ll = BMITests.blsiq(lmy);
+        if (ll != z) {
+          throw new Error("blsiq with memory failed");
+        }
+      }
+    }
+
+    { // match(Set dst (XorI (AddI src minus_1) src))
+      int z = BMITests.blsmskl(ix);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.blsmskl(ix);
+        if (ii != z) {
+          throw new Error("blsmskl with register failed");
+        }
+      }
+    }
+    { // match(Set dst (XorL (AddL src minus_1) src))
+      long z = BMITests.blsmskq(lx);
+      for (int i = 0; i < ITERATIONS; i++) {
+        long ll = BMITests.blsmskq(lx);
+        if (ll != z) {
+          throw new Error("blsmskq with register failed");
+        }
+      }
+    }
+    { // match(Set dst (XorI (AddI (LoadI src) minus_1) (LoadI src) ) )
+      int z = BMITests.blsmskl(imy);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.blsmskl(imy);
+        if (ii != z) {
+          throw new Error("blsmskl with memory failed");
+        }
+      }
+    }
+    {  // match(Set dst (XorL (AddL (LoadL src) minus_1) (LoadL src) ) )
+      long z = BMITests.blsmskq(lmy);
+      for (int i = 0; i < ITERATIONS; i++) {
+        long ll = BMITests.blsmskq(lmy);
+        if (ll != z) {
+          throw new Error("blsmskq with memory failed");
+        }
+      }
+    }
+
+    { //  match(Set dst (AndI (AddI src minus_1) src) )
+      int z = BMITests.blsrl(ix);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.blsrl(ix);
+        if (ii != z) {
+          throw new Error("blsrl with register failed");
+        }
+      }
+    }
+    { // match(Set dst (AndL (AddL src minus_1) src) )
+      long z = BMITests.blsrq(lx);
+      for (int i = 0; i < ITERATIONS; i++) {
+        long ll = BMITests.blsrq(lx);
+        if (ll != z) {
+          throw new Error("blsrq with register failed");
+        }
+      }
+    }
+    { // match(Set dst (AndI (AddI (LoadI src) minus_1) (LoadI src) ) )
+      int z = BMITests.blsrl(imy);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.blsrl(imy);
+        if (ii != z) {
+          throw new Error("blsrl with memory failed");
+        }
+      }
+    }
+    { // match(Set dst (AndL (AddL (LoadL src) minus_1) (LoadL src)) )
+      long z = BMITests.blsrq(lmy);
+      for (int i = 0; i < ITERATIONS; i++) {
+        long ll = BMITests.blsrq(lmy);
+        if (ll != z) {
+          throw new Error("blsrq with memory failed");
+        }
+      }
+    }
+
+    { // Integer.numberOfLeadingZeros on ix
+      int z = BMITests.lzcntl(ix);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.lzcntl(ix);
+        if (ii != z) {
+          throw new Error("lzcntl failed");
+        }
+      }
+    }
+    { // Long.numberOfLeadingZeros on lx
+      int z = BMITests.lzcntq(lx);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.lzcntq(lx);
+        if (ii != z) {
+          throw new Error("lzcntq failed");
+        }
+      }
+    }
+
+    { // Integer.numberOfTrailingZeros on ix
+      int z = BMITests.tzcntl(ix);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.tzcntl(ix);
+        if (ii != z) {
+          throw new Error("tzcntl failed");
+        }
+      }
+    }
+    { // Long.numberOfTrailingZeros on lx
+      int z = BMITests.tzcntq(lx);
+      for (int i = 0; i < ITERATIONS; i++) {
+        int ii = BMITests.tzcntq(lx);
+        if (ii != z) {
+          throw new Error("tzcntq failed");
+        }
+      }
+    }
+  }
+}