8078497: C2's superword optimization causes unaligned memory accesses
authorthartmann
Fri, 08 May 2015 12:19:17 +0200
changeset 30625 80a08f9b2d63
parent 30596 0322b394e7fd
child 30626 86ba6ca7ca4a
8078497: C2's superword optimization causes unaligned memory accesses Summary: Prevent vectorization of memory operations with different invariant offsets if unaligned memory accesses are not allowed. Reviewed-by: kvn
hotspot/src/share/vm/opto/superword.cpp
hotspot/src/share/vm/opto/superword.hpp
hotspot/test/compiler/loopopts/superword/TestVectorizationWithInvariant.java
--- a/hotspot/src/share/vm/opto/superword.cpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/superword.cpp	Fri May 08 12:19:17 2015 +0200
@@ -293,6 +293,13 @@
           // if unaligned memory access is not allowed because number of
           // iterations in pre-loop will be not enough to align it.
           create_pack = false;
+        } else {
+          SWPointer p2(best_align_to_mem_ref, this);
+          if (align_to_ref_p.invar() != p2.invar()) {
+            // Do not vectorize memory accesses with different invariants
+            // if unaligned memory accesses are not allowed.
+            create_pack = false;
+          }
         }
       }
     } else {
@@ -516,24 +523,50 @@
   if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
     return true;
   }
-  // If initial offset from start of object is computable,
-  // compute alignment within the vector.
+  // If the initial offset from start of the object is computable,
+  // check if the pre-loop can align the final offset accordingly.
+  //
+  // In other words: Can we find an i such that the offset
+  // after i pre-loop iterations is aligned to vw?
+  //   (init_offset + pre_loop) % vw == 0              (1)
+  // where
+  //   pre_loop = i * span
+  // is the number of bytes added to the offset by i pre-loop iterations.
+  //
+  // For this to hold we need pre_loop to increase init_offset by
+  //   pre_loop = vw - (init_offset % vw)
+  //
+  // This is only possible if pre_loop is divisible by span because each
+  // pre-loop iteration increases the initial offset by 'span' bytes:
+  //   (vw - (init_offset % vw)) % span == 0
+  //
   int vw = vector_width_in_bytes(p.mem());
   assert(vw > 1, "sanity");
-  if (vw % span == 0) {
-    Node* init_nd = pre_end->init_trip();
-    if (init_nd->is_Con() && p.invar() == NULL) {
-      int init = init_nd->bottom_type()->is_int()->get_con();
-
-      int init_offset = init * p.scale_in_bytes() + offset;
-      assert(init_offset >= 0, "positive offset from object start");
-
+  Node* init_nd = pre_end->init_trip();
+  if (init_nd->is_Con() && p.invar() == NULL) {
+    int init = init_nd->bottom_type()->is_int()->get_con();
+    int init_offset = init * p.scale_in_bytes() + offset;
+    assert(init_offset >= 0, "positive offset from object start");
+    if (vw % span == 0) {
+      // If vm is a multiple of span, we use formula (1).
       if (span > 0) {
         return (vw - (init_offset % vw)) % span == 0;
       } else {
         assert(span < 0, "nonzero stride * scale");
         return (init_offset % vw) % -span == 0;
       }
+    } else if (span % vw == 0) {
+      // If span is a multiple of vw, we can simplify formula (1) to:
+      //   (init_offset + i * span) % vw == 0
+      //     =>
+      //   (init_offset % vw) + ((i * span) % vw) == 0
+      //     =>
+      //   init_offset % vw == 0
+      //
+      // Because we add a multiple of vw to the initial offset, the final
+      // offset is a multiple of vw if and only if init_offset is a multiple.
+      //
+      return (init_offset % vw) == 0;
     }
   }
   return false;
@@ -545,17 +578,23 @@
   SWPointer align_to_ref_p(mem_ref, this);
   int offset = align_to_ref_p.offset_in_bytes();
   int scale  = align_to_ref_p.scale_in_bytes();
+  int elt_size = align_to_ref_p.memory_size();
   int vw       = vector_width_in_bytes(mem_ref);
   assert(vw > 1, "sanity");
-  int stride_sign   = (scale * iv_stride()) > 0 ? 1 : -1;
-  // At least one iteration is executed in pre-loop by default. As result
-  // several iterations are needed to align memory operations in main-loop even
-  // if offset is 0.
-  int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
-  int elt_size = align_to_ref_p.memory_size();
-  assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
-         err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
-  int iv_adjustment = iv_adjustment_in_bytes/elt_size;
+  int iv_adjustment;
+  if (scale != 0) {
+    int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
+    // At least one iteration is executed in pre-loop by default. As result
+    // several iterations are needed to align memory operations in main-loop even
+    // if offset is 0.
+    int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
+    assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
+           err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
+    iv_adjustment = iv_adjustment_in_bytes/elt_size;
+  } else {
+    // This memory op is not dependent on iv (scale == 0)
+    iv_adjustment = 0;
+  }
 
 #ifndef PRODUCT
   if (TraceSuperWord)
--- a/hotspot/src/share/vm/opto/superword.hpp	Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/superword.hpp	Fri May 08 12:19:17 2015 +0200
@@ -40,7 +40,7 @@
 // Exploiting SuperWord Level Parallelism with
 //   Multimedia Instruction Sets
 // by
-//   Samuel Larsen and Saman Amarasighe
+//   Samuel Larsen and Saman Amarasinghe
 //   MIT Laboratory for Computer Science
 // date
 //   May 2000
@@ -466,7 +466,7 @@
 
   Node* _base;         // NULL if unsafe nonheap reference
   Node* _adr;          // address pointer
-  jint  _scale;        // multipler for iv (in bytes), 0 if no loop iv
+  jint  _scale;        // multiplier for iv (in bytes), 0 if no loop iv
   jint  _offset;       // constant offset (in bytes)
   Node* _invar;        // invariant offset (in bytes), NULL if none
   bool  _negate_invar; // if true then use: (0 - _invar)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/loopopts/superword/TestVectorizationWithInvariant.java	Fri May 08 12:19:17 2015 +0200
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+import com.oracle.java.testlibrary.*;
+import sun.misc.Unsafe;
+
+/**
+ * @test
+ * @bug 8078497
+ * @summary Tests correct alignment of vectors with loop invariant offset.
+ * @library /testlibrary
+ * @run main TestVectorizationWithInvariant
+ */
+public class TestVectorizationWithInvariant {
+
+    private static Unsafe unsafe;
+    private static final long BYTE_ARRAY_OFFSET;
+    private static final long CHAR_ARRAY_OFFSET;
+
+    static {
+        unsafe = Utils.getUnsafe();
+        BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class);
+        CHAR_ARRAY_OFFSET = unsafe.arrayBaseOffset(char[].class);
+    }
+
+    public static void main(String[] args) throws Exception {
+        byte[] byte_array1 = new byte[1000];
+        byte[] byte_array2 = new byte[1000];
+        char[] char_array = new char[1000];
+
+        for (int i = 0; i < 20_000; ++i) {
+            copyByteToChar(byte_array1, byte_array2, char_array, 1);
+            copyCharToByte(char_array, byte_array1, 1);
+            copyCharToByteAligned(char_array, byte_array1);
+            copyCharToByteUnaligned(char_array, byte_array1);
+        }
+    }
+
+    /*
+     * Copy multiple consecutive chars from a byte array to a given offset in a char array
+     * to trigger C2's superword optimization. The offset in the byte array is independent
+     * of the loop induction variable and can be set to an arbitrary value. It may then not
+     * be possible to both align the LoadUS and the StoreC operations. Therefore, vectorization
+     * should only be done in this case if unaligned memory accesses are allowed.
+     */
+    public static void copyByteToChar(byte[] src1, byte[] src2, char[] dst, int off) {
+        off = (int) BYTE_ARRAY_OFFSET + (off << 1);
+        byte[] src = src1;
+        for (int i = (int) CHAR_ARRAY_OFFSET; i < 100; i = i + 8) {
+            // Copy 8 chars from src to dst
+            unsafe.putChar(dst, i + 0, unsafe.getChar(src, off + 0));
+            unsafe.putChar(dst, i + 2, unsafe.getChar(src, off + 2));
+            unsafe.putChar(dst, i + 4, unsafe.getChar(src, off + 4));
+            unsafe.putChar(dst, i + 6, unsafe.getChar(src, off + 6));
+            unsafe.putChar(dst, i + 8, unsafe.getChar(src, off + 8));
+            unsafe.putChar(dst, i + 10, unsafe.getChar(src, off + 10));
+            unsafe.putChar(dst, i + 12, unsafe.getChar(src, off + 12));
+            unsafe.putChar(dst, i + 14, unsafe.getChar(src, off + 14));
+
+            // Prevent loop invariant code motion of char read.
+            src = (src == src1) ? src2 : src1;
+        }
+    }
+
+    /*
+     * Copy multiple consecutive chars from a char array to a given offset in a byte array
+     * to trigger C2's superword optimization. Checks for similar problems as 'copyByteToChar'.
+     */
+    public static void copyCharToByte(char[] src, byte[] dst, int off) {
+        off = (int) BYTE_ARRAY_OFFSET + (off << 1);
+        for (int i = 0; i < 100; i = i + 8) {
+            // Copy 8 chars from src to dst
+            unsafe.putChar(dst, off + 0, src[i + 0]);
+            unsafe.putChar(dst, off + 2, src[i + 1]);
+            unsafe.putChar(dst, off + 4, src[i + 2]);
+            unsafe.putChar(dst, off + 6, src[i + 3]);
+            unsafe.putChar(dst, off + 8, src[i + 4]);
+            unsafe.putChar(dst, off + 10, src[i + 5]);
+            unsafe.putChar(dst, off + 12, src[i + 6]);
+            unsafe.putChar(dst, off + 14, src[i + 7]);
+        }
+    }
+
+    /*
+     * Variant of copyCharToByte with a constant destination array offset.
+     * The loop should always be vectorized because both the LoadUS and StoreC
+     * operations can be aligned.
+     */
+    public static void copyCharToByteAligned(char[] src, byte[] dst) {
+        final int off = (int) BYTE_ARRAY_OFFSET;
+        for (int i = 8; i < 100; i = i + 8) {
+            // Copy 8 chars from src to dst
+            unsafe.putChar(dst, off + 0, src[i + 0]);
+            unsafe.putChar(dst, off + 2, src[i + 1]);
+            unsafe.putChar(dst, off + 4, src[i + 2]);
+            unsafe.putChar(dst, off + 6, src[i + 3]);
+            unsafe.putChar(dst, off + 8, src[i + 4]);
+            unsafe.putChar(dst, off + 10, src[i + 5]);
+            unsafe.putChar(dst, off + 12, src[i + 6]);
+            unsafe.putChar(dst, off + 14, src[i + 7]);
+        }
+    }
+
+    /*
+     * Variant of copyCharToByte with a constant destination array offset. The
+     * loop should only be vectorized if unaligned memory operations are allowed
+     * because not both the LoadUS and the StoreC can be aligned.
+     */
+    public static void copyCharToByteUnaligned(char[] src, byte[] dst) {
+        final int off = (int) BYTE_ARRAY_OFFSET + 2;
+        for (int i = 0; i < 100; i = i + 8) {
+            // Copy 8 chars from src to dst
+            unsafe.putChar(dst, off + 0, src[i + 0]);
+            unsafe.putChar(dst, off + 2, src[i + 1]);
+            unsafe.putChar(dst, off + 4, src[i + 2]);
+            unsafe.putChar(dst, off + 6, src[i + 3]);
+            unsafe.putChar(dst, off + 8, src[i + 4]);
+            unsafe.putChar(dst, off + 10, src[i + 5]);
+            unsafe.putChar(dst, off + 12, src[i + 6]);
+            unsafe.putChar(dst, off + 14, src[i + 7]);
+        }
+    }
+}