8196064: AArch64: Merging ld/st into ldp/stp in macro-assembler
author zyao
Sat, 03 Feb 2018 10:36:58 +0800
changeset 49161 8f1bc5a0d16d
parent 49066 4aa67aba6c85
child 49162 c200b4700aeb
8196064: AArch64: Merging ld/st into ldp/stp in macro-assembler
Reviewed-by: aph
src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp
src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp
src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp
src/hotspot/share/asm/codeBuffer.hpp
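
In short: the macro-assembler now remembers the address of the last
instruction it emitted, and when the next load/store has the same size,
the same base register, and the adjacent offset, it rewinds and re-emits
the two as one pair instruction. Illustratively (registers and offsets
invented for this example), "str x1, [sp, #16]" followed by
"str x2, [sp, #24]" becomes the single "stp x1, x2, [sp, #16]". The
exact legality conditions are in ldst_can_merge() below.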
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Fri Feb 23 13:55:49 2018 -0800
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp	Sat Feb 03 10:36:58 2018 +0800
@@ -1794,18 +1794,63 @@
 
 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
   address prev = pc() - NativeMembar::instruction_size;
-  if (prev == code()->last_membar()) {
+  address last = code()->last_insn();
+  if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
     NativeMembar *bar = NativeMembar_at(prev);
     // We are merging two memory barrier instructions.  On AArch64 we
     // can do this simply by ORing them together.
     bar->set_kind(bar->get_kind() | order_constraint);
     BLOCK_COMMENT("merged membar");
   } else {
-    code()->set_last_membar(pc());
+    code()->set_last_insn(pc());
     dmb(Assembler::barrier(order_constraint));
   }
 }
 
+bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
+  if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
+    merge_ldst(rt, adr, size_in_bytes, is_store);
+    code()->clear_last_insn();
+    return true;
+  } else {
+    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
+    const unsigned mask = size_in_bytes - 1;
+    if (adr.getMode() == Address::base_plus_offset &&
+        (adr.offset() & mask) == 0) { // only supports base_plus_offset.
+      code()->set_last_insn(pc());
+    }
+    return false;
+  }
+}
+
+void MacroAssembler::ldr(Register Rx, const Address &adr) {
+  // We always try to merge two adjacent loads into one ldp.
+  if (!try_merge_ldst(Rx, adr, 8, false)) {
+    Assembler::ldr(Rx, adr);
+  }
+}
+
+void MacroAssembler::ldrw(Register Rw, const Address &adr) {
+  // We always try to merge two adjacent loads into one ldp.
+  if (!try_merge_ldst(Rw, adr, 4, false)) {
+    Assembler::ldrw(Rw, adr);
+  }
+}
+
+void MacroAssembler::str(Register Rx, const Address &adr) {
+  // We always try to merge two adjacent stores into one stp.
+  if (!try_merge_ldst(Rx, adr, 8, true)) {
+    Assembler::str(Rx, adr);
+  }
+}
+
+void MacroAssembler::strw(Register Rw, const Address &adr) {
+  // We always try to merge two adjacent stores into one stp.
+  if (!try_merge_ldst(Rw, adr, 4, true)) {
+    Assembler::strw(Rw, adr);
+  }
+}
+
 // MacroAssembler routines found actually to be needed
 
 void MacroAssembler::push(Register src)
@@ -2576,6 +2621,143 @@
   return Address(base, offset);
 }
 
+// Checks whether the offsets are suitably aligned for a merged access.
+// Returns true if they are, else false.
+bool MacroAssembler::merge_alignment_check(Register base,
+                                           size_t size,
+                                           long cur_offset,
+                                           long prev_offset) const {
+  if (AvoidUnalignedAccesses) {
+    if (base == sp) {
+      // Checks whether the lower offset is aligned to the size of a register pair.
+      long pair_mask = size * 2 - 1;
+      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
+      return (offset & pair_mask) == 0;
+    } else { // If base is not sp, we can't guarantee the access is aligned.
+      return false;
+    }
+  } else {
+    long mask = size - 1;
+    // Load/store pair instructions only support element-size-aligned offsets.
+    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
+  }
+}
+
+// Checks whether the current and previous loads/stores can be merged.
+// Returns true if they can be merged, else false.
+bool MacroAssembler::ldst_can_merge(Register rt,
+                                    const Address &adr,
+                                    size_t cur_size_in_bytes,
+                                    bool is_store) const {
+  address prev = pc() - NativeInstruction::instruction_size;
+  address last = code()->last_insn();
+
+  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
+    return false;
+  }
+
+  if (adr.getMode() != Address::base_plus_offset || prev != last) {
+    return false;
+  }
+
+  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
+  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
+
+  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
+  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
+
+  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
+    return false;
+  }
+
+  long max_offset = 63 * prev_size_in_bytes;
+  long min_offset = -64 * prev_size_in_bytes;
+
+  assert(prev_ldst->is_not_pre_post_index(), "merging of pre-index or post-index accesses is not supported.");
+
+  // Only same base can be merged.
+  if (adr.base() != prev_ldst->base()) {
+    return false;
+  }
+
+  long cur_offset = adr.offset();
+  long prev_offset = prev_ldst->offset();
+  size_t diff = abs(cur_offset - prev_offset);
+  if (diff != prev_size_in_bytes) {
+    return false;
+  }
+
+  // The following cases cannot be merged:
+  // ldr x2, [x2, #8]
+  // ldr x3, [x2, #16]
+  // or:
+  // ldr x2, [x3, #8]
+  // ldr x2, [x3, #16]
+  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
+  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
+    return false;
+  }
+
+  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
+  // The merged (low) offset must be within the ldp/stp immediate range.
+  if (low_offset > max_offset || low_offset < min_offset) {
+    return false;
+  }
+
+  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
+    return true;
+  }
+
+  return false;
+}
+
+// Merge current load/store with previous load/store into ldp/stp.
+void MacroAssembler::merge_ldst(Register rt,
+                                const Address &adr,
+                                size_t cur_size_in_bytes,
+                                bool is_store) {
+
+  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
+
+  Register rt_low, rt_high;
+  address prev = pc() - NativeInstruction::instruction_size;
+  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
+
+  long offset;
+
+  if (adr.offset() < prev_ldst->offset()) {
+    offset = adr.offset();
+    rt_low = rt;
+    rt_high = prev_ldst->target();
+  } else {
+    offset = prev_ldst->offset();
+    rt_low = prev_ldst->target();
+    rt_high = rt;
+  }
+
+  Address adr_p = Address(prev_ldst->base(), offset);
+  // Overwrite the previously generated instruction.
+  code_section()->set_end(prev);
+
+  const int sz = prev_ldst->size_in_bytes();
+  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
+  if (!is_store) {
+    BLOCK_COMMENT("merged ldr pair");
+    if (sz == 8) {
+      ldp(rt_low, rt_high, adr_p);
+    } else {
+      ldpw(rt_low, rt_high, adr_p);
+    }
+  } else {
+    BLOCK_COMMENT("merged str pair");
+    if (sz == 8) {
+      stp(rt_low, rt_high, adr_p);
+    } else {
+      stpw(rt_low, rt_high, adr_p);
+    }
+  }
+}
+
 /**
  * Multiply 64 bit by 64 bit first loop.
  */
@@ -4152,7 +4334,7 @@
   bind(loop);
   sub(len, len, unroll);
   for (int i = -unroll; i < 0; i++)
-    str(zr, Address(t1, i * wordSize));
+    Assembler::str(zr, Address(t1, i * wordSize));
   bind(entry);
   add(t1, t1, unroll * wordSize);
   cbnz(len, loop);
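
As a sanity check on the arithmetic above, here is a minimal standalone
C++ sketch of the offset tests performed by ldst_can_merge() and
merge_alignment_check(). The function name and parameters are mine, not
HotSpot's, and the sketch deliberately ignores the register-hazard and
instruction-decoding checks.

    #include <cassert>
    #include <cstdlib>

    // Illustrative restatement of the merge arithmetic (not HotSpot code).
    // size is the access size in bytes (4 or 8); cur/prev are byte offsets
    // from the same base register; base_is_sp models the sp-only case used
    // when AvoidUnalignedAccesses is set.
    static bool offsets_can_merge(long cur, long prev, long size,
                                  bool avoid_unaligned, bool base_is_sp) {
      // The two accesses must be exactly adjacent.
      if (labs(cur - prev) != size) return false;
      // ldp/stp encode a signed 7-bit immediate scaled by the element size,
      // applied to the lower of the two offsets.
      long low = cur < prev ? cur : prev;
      if (low < -64 * size || low > 63 * size) return false;
      if (avoid_unaligned) {
        // Only sp-based accesses aligned to the full pair size are accepted.
        return base_is_sp && (low & (2 * size - 1)) == 0;
      }
      // Otherwise element-size alignment of both offsets is enough.
      return (cur & (size - 1)) == 0 && (prev & (size - 1)) == 0;
    }

    int main() {
      assert(offsets_can_merge(24, 16, 8, false, false));  // adjacent, aligned
      assert(!offsets_can_merge(32, 16, 8, false, false)); // gap of one slot
      assert(!offsets_can_merge(24, 16, 8, true, false));  // strict mode, base != sp
      assert(offsets_can_merge(24, 16, 8, true, true));    // sp base, pair-aligned
      return 0;
    }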
--- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Fri Feb 23 13:55:49 2018 -0800
+++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp	Sat Feb 03 10:36:58 2018 +0800
@@ -150,11 +150,19 @@
 
   void bind(Label& L) {
     Assembler::bind(L);
-    code()->clear_last_membar();
+    code()->clear_last_insn();
   }
 
   void membar(Membar_mask_bits order_constraint);
 
+  using Assembler::ldr;
+  using Assembler::str;
+
+  void ldr(Register Rx, const Address &adr);
+  void ldrw(Register Rw, const Address &adr);
+  void str(Register Rx, const Address &adr);
+  void strw(Register Rw, const Address &adr);
+
   // Frame creation and destruction shared between JITs.
   void build_frame(int framesize);
   void remove_frame(int framesize);
@@ -1290,6 +1298,17 @@
   // Uses rscratch2 if the address is not directly reachable
   Address spill_address(int size, int offset, Register tmp=rscratch2);
 
+  bool merge_alignment_check(Register base, size_t size, long cur_offset, long prev_offset) const;
+
+  // Check whether two loads/stores can be merged into ldp/stp.
+  bool ldst_can_merge(Register rt, const Address &adr, size_t cur_size_in_bytes, bool is_store) const;
+
+  // Merge current load/store with previous load/store into ldp/stp.
+  void merge_ldst(Register rt, const Address &adr, size_t cur_size_in_bytes, bool is_store);
+
+  // Try to merge two loads/stores into ldp/stp. Returns true on success, else false.
+  bool try_merge_ldst(Register rt, const Address &adr, size_t cur_size_in_bytes, bool is_store);
+
 public:
   void spill(Register Rx, bool is64, int offset) {
     if (is64) {
--- a/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp	Fri Feb 23 13:55:49 2018 -0800
+++ b/src/hotspot/cpu/aarch64/nativeInst_aarch64.hpp	Sat Feb 03 10:36:58 2018 +0800
@@ -131,6 +131,13 @@
     return Instruction_aarch64::extract(insn, 31, 12) == 0b11010101000000110011 &&
       Instruction_aarch64::extract(insn, 7, 0) == 0b10111111;
   }
+
+  bool is_Imm_LdSt() {
+    unsigned int insn = uint_at(0);
+    return Instruction_aarch64::extract(insn, 29, 27) == 0b111 &&
+      Instruction_aarch64::extract(insn, 23, 23) == 0b0 &&
+      Instruction_aarch64::extract(insn, 26, 25) == 0b00;
+  }
 };
 
 inline NativeInstruction* nativeInstruction_at(address address) {
@@ -532,4 +539,57 @@
   return (NativeMembar*)addr;
 }
 
+class NativeLdSt : public NativeInstruction {
+private:
+  int32_t size() { return Instruction_aarch64::extract(uint_at(0), 31, 30); }
+  // Check whether the instruction uses an unscaled offset (ldur/stur form).
+  bool is_ldst_ur() {
+    return (Instruction_aarch64::extract(uint_at(0), 29, 21) == 0b111000010 ||
+            Instruction_aarch64::extract(uint_at(0), 29, 21) == 0b111000000) &&
+      Instruction_aarch64::extract(uint_at(0), 11, 10) == 0b00;
+  }
+  bool is_ldst_unsigned_offset() {
+    return Instruction_aarch64::extract(uint_at(0), 29, 22) == 0b11100101 ||
+      Instruction_aarch64::extract(uint_at(0), 29, 22) == 0b11100100;
+  }
+public:
+  Register target() {
+    uint32_t r = Instruction_aarch64::extract(uint_at(0), 4, 0);
+    return r == 0x1f ? zr : as_Register(r);
+  }
+  Register base() {
+    uint32_t b = Instruction_aarch64::extract(uint_at(0), 9, 5);
+    return b == 0x1f ? sp : as_Register(b);
+  }
+  int64_t offset() {
+    if (is_ldst_ur()) {
+      return Instruction_aarch64::sextract(uint_at(0), 20, 12);
+    } else if (is_ldst_unsigned_offset()) {
+      return Instruction_aarch64::extract(uint_at(0), 21, 10) << size();
+    } else {
+      // Other modes (pre-index or post-index) are not supported here.
+      ShouldNotReachHere();
+      return 0;
+    }
+  }
+  size_t size_in_bytes() { return 1 << size(); }
+  bool is_not_pre_post_index() { return (is_ldst_ur() || is_ldst_unsigned_offset()); }
+  bool is_load() {
+    assert(Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b01 ||
+           Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b00, "must be ldr or str");
+
+    return Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b01;
+  }
+  bool is_store() {
+    assert(Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b01 ||
+           Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b00, "must be ldr or str");
+
+    return Instruction_aarch64::extract(uint_at(0), 23, 22) == 0b00;
+  }
+};
+
+inline NativeLdSt *NativeLdSt_at(address addr) {
+  assert(nativeInstruction_at(addr)->is_Imm_LdSt(), "no immediate load/store found");
+  return (NativeLdSt*)addr;
+}
 #endif // CPU_AARCH64_VM_NATIVEINST_AARCH64_HPP
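
To make the bit ranges in NativeLdSt concrete, here is a small
self-contained decoding sketch. bits() is my stand-in for
Instruction_aarch64::extract(), and the instruction word was encoded by
hand for "ldr x2, [sp, #24]" (unsigned scaled-offset form), so treat the
constant as illustrative.

    #include <cstdint>
    #include <cstdio>

    // Stand-in for Instruction_aarch64::extract(insn, hi, lo).
    static uint32_t bits(uint32_t insn, int hi, int lo) {
      return (insn >> lo) & ((1u << (hi - lo + 1)) - 1);
    }

    int main() {
      uint32_t insn = 0xf9400fe2;                 // ldr x2, [sp, #24] (hand-encoded)
      uint32_t size = bits(insn, 31, 30);         // 0b11: 8-byte access
      uint32_t rt   = bits(insn, 4, 0);           // target register: 2 (x2)
      uint32_t rn   = bits(insn, 9, 5);           // base register: 31, i.e. sp here
      uint32_t off  = bits(insn, 21, 10) << size; // imm12 scaled by element size
      bool is_load  = bits(insn, 23, 22) == 0b01; // opc distinguishes ldr from str
      printf("%s, %u bytes, rt=%u rn=%u offset=%u\n",
             is_load ? "load" : "store", 1u << size, rt, rn, off);
      return 0;
    }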
--- a/src/hotspot/share/asm/codeBuffer.hpp	Fri Feb 23 13:55:49 2018 -0800
+++ b/src/hotspot/share/asm/codeBuffer.hpp	Sat Feb 03 10:36:58 2018 +0800
@@ -380,7 +380,7 @@
   OopRecorder  _default_oop_recorder;  // override with initialize_oop_recorder
   Arena*       _overflow_arena;
 
-  address      _last_membar;     // used to merge consecutive memory barriers
+  address      _last_insn;      // used to merge consecutive memory barriers, loads or stores.
 
   address      _decode_begin;   // start address for decode
   address      decode_begin();
@@ -395,7 +395,7 @@
     _decode_begin    = NULL;
     _overflow_arena  = NULL;
     _code_strings    = CodeStrings();
-    _last_membar     = NULL;
+    _last_insn       = NULL;
   }
 
   void initialize(address code_start, csize_t code_size) {
@@ -587,9 +587,9 @@
   OopRecorder* oop_recorder() const   { return _oop_recorder; }
   CodeStrings& strings()              { return _code_strings; }
 
-  address last_membar() const { return _last_membar; }
-  void set_last_membar(address a) { _last_membar = a; }
-  void clear_last_membar() { set_last_membar(NULL); }
+  address last_insn() const { return _last_insn; }
+  void set_last_insn(address a) { _last_insn = a; }
+  void clear_last_insn() { set_last_insn(NULL); }
 
   void free_strings() {
     if (!_code_strings.is_null()) {
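
The codeBuffer.hpp change above is the enabling piece: the single
remembered address, previously used only for membar merging, now tracks
any mergeable instruction, and it must be cleared whenever the
one-instruction peephole window becomes invalid. A compressed sketch of
that pattern (class and member names are mine, not HotSpot's):

    // Illustrative one-deep peephole state (not the HotSpot classes).
    class PeepholeState {
      const unsigned char* _last_insn = nullptr;
    public:
      // Record the pc of an instruction a later emit may merge with.
      void set_last_insn(const unsigned char* pc) { _last_insn = pc; }
      // Cleared after a successful merge, and at every label bind: once
      // control can join here from elsewhere, the next emit must not assume
      // the remembered instruction precedes it on every path.
      void clear_last_insn() { _last_insn = nullptr; }
      // A merge is only considered when the remembered instruction is the
      // one immediately before the current pc.
      bool adjacent(const unsigned char* prev_pc) const {
        return _last_insn != nullptr && _last_insn == prev_pc;
      }
    };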