--- a/hotspot/src/share/vm/opto/superword.cpp Thu May 07 15:34:45 2015 -0700
+++ b/hotspot/src/share/vm/opto/superword.cpp Fri May 08 12:19:17 2015 +0200
@@ -293,6 +293,13 @@
// if unaligned memory access is not allowed because number of
// iterations in pre-loop will be not enough to align it.
create_pack = false;
+ } else {
+ SWPointer p2(best_align_to_mem_ref, this);
+ if (align_to_ref_p.invar() != p2.invar()) {
+ // Do not vectorize memory accesses with different invariants
+ // if unaligned memory accesses are not allowed.
+ create_pack = false;
+ }
}
}
} else {
@@ -516,24 +523,50 @@
if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
return true;
}
- // If initial offset from start of object is computable,
- // compute alignment within the vector.
+ // If the initial offset from start of the object is computable,
+ // check if the pre-loop can align the final offset accordingly.
+ //
+ // In other words: Can we find an i such that the offset
+ // after i pre-loop iterations is aligned to vw?
+ // (init_offset + pre_loop) % vw == 0 (1)
+ // where
+ // pre_loop = i * span
+ // is the number of bytes added to the offset by i pre-loop iterations.
+ //
+ // For this to hold we need pre_loop to increase init_offset by
+ // pre_loop = vw - (init_offset % vw)
+ //
+ // This is only possible if pre_loop is divisible by span because each
+ // pre-loop iteration increases the initial offset by 'span' bytes:
+ // (vw - (init_offset % vw)) % span == 0
+ //
int vw = vector_width_in_bytes(p.mem());
assert(vw > 1, "sanity");
- if (vw % span == 0) {
- Node* init_nd = pre_end->init_trip();
- if (init_nd->is_Con() && p.invar() == NULL) {
- int init = init_nd->bottom_type()->is_int()->get_con();
-
- int init_offset = init * p.scale_in_bytes() + offset;
- assert(init_offset >= 0, "positive offset from object start");
-
+ Node* init_nd = pre_end->init_trip();
+ if (init_nd->is_Con() && p.invar() == NULL) {
+ int init = init_nd->bottom_type()->is_int()->get_con();
+ int init_offset = init * p.scale_in_bytes() + offset;
+ assert(init_offset >= 0, "positive offset from object start");
+ if (vw % span == 0) {
+ // If vm is a multiple of span, we use formula (1).
if (span > 0) {
return (vw - (init_offset % vw)) % span == 0;
} else {
assert(span < 0, "nonzero stride * scale");
return (init_offset % vw) % -span == 0;
}
+ } else if (span % vw == 0) {
+ // If span is a multiple of vw, we can simplify formula (1) to:
+ // (init_offset + i * span) % vw == 0
+ // =>
+ // (init_offset % vw) + ((i * span) % vw) == 0
+ // =>
+ // init_offset % vw == 0
+ //
+ // Because we add a multiple of vw to the initial offset, the final
+ // offset is a multiple of vw if and only if init_offset is a multiple.
+ //
+ return (init_offset % vw) == 0;
}
}
return false;
@@ -545,17 +578,23 @@
SWPointer align_to_ref_p(mem_ref, this);
int offset = align_to_ref_p.offset_in_bytes();
int scale = align_to_ref_p.scale_in_bytes();
+ int elt_size = align_to_ref_p.memory_size();
int vw = vector_width_in_bytes(mem_ref);
assert(vw > 1, "sanity");
- int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
- // At least one iteration is executed in pre-loop by default. As result
- // several iterations are needed to align memory operations in main-loop even
- // if offset is 0.
- int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
- int elt_size = align_to_ref_p.memory_size();
- assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
- err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
- int iv_adjustment = iv_adjustment_in_bytes/elt_size;
+ int iv_adjustment;
+ if (scale != 0) {
+ int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
+ // At least one iteration is executed in pre-loop by default. As result
+ // several iterations are needed to align memory operations in main-loop even
+ // if offset is 0.
+ int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
+ assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
+ err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
+ iv_adjustment = iv_adjustment_in_bytes/elt_size;
+ } else {
+ // This memory op is not dependent on iv (scale == 0)
+ iv_adjustment = 0;
+ }
#ifndef PRODUCT
if (TraceSuperWord)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/loopopts/superword/TestVectorizationWithInvariant.java Fri May 08 12:19:17 2015 +0200
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+import com.oracle.java.testlibrary.*;
+import sun.misc.Unsafe;
+
+/**
+ * @test
+ * @bug 8078497
+ * @summary Tests correct alignment of vectors with loop invariant offset.
+ * @library /testlibrary
+ * @run main TestVectorizationWithInvariant
+ */
+public class TestVectorizationWithInvariant {
+
+ private static Unsafe unsafe;
+ private static final long BYTE_ARRAY_OFFSET;
+ private static final long CHAR_ARRAY_OFFSET;
+
+ static {
+ unsafe = Utils.getUnsafe();
+ BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class);
+ CHAR_ARRAY_OFFSET = unsafe.arrayBaseOffset(char[].class);
+ }
+
+ public static void main(String[] args) throws Exception {
+ byte[] byte_array1 = new byte[1000];
+ byte[] byte_array2 = new byte[1000];
+ char[] char_array = new char[1000];
+
+ for (int i = 0; i < 20_000; ++i) {
+ copyByteToChar(byte_array1, byte_array2, char_array, 1);
+ copyCharToByte(char_array, byte_array1, 1);
+ copyCharToByteAligned(char_array, byte_array1);
+ copyCharToByteUnaligned(char_array, byte_array1);
+ }
+ }
+
+ /*
+ * Copy multiple consecutive chars from a byte array to a given offset in a char array
+ * to trigger C2's superword optimization. The offset in the byte array is independent
+ * of the loop induction variable and can be set to an arbitrary value. It may then not
+ * be possible to both align the LoadUS and the StoreC operations. Therefore, vectorization
+ * should only be done in this case if unaligned memory accesses are allowed.
+ */
+ public static void copyByteToChar(byte[] src1, byte[] src2, char[] dst, int off) {
+ off = (int) BYTE_ARRAY_OFFSET + (off << 1);
+ byte[] src = src1;
+ for (int i = (int) CHAR_ARRAY_OFFSET; i < 100; i = i + 8) {
+ // Copy 8 chars from src to dst
+ unsafe.putChar(dst, i + 0, unsafe.getChar(src, off + 0));
+ unsafe.putChar(dst, i + 2, unsafe.getChar(src, off + 2));
+ unsafe.putChar(dst, i + 4, unsafe.getChar(src, off + 4));
+ unsafe.putChar(dst, i + 6, unsafe.getChar(src, off + 6));
+ unsafe.putChar(dst, i + 8, unsafe.getChar(src, off + 8));
+ unsafe.putChar(dst, i + 10, unsafe.getChar(src, off + 10));
+ unsafe.putChar(dst, i + 12, unsafe.getChar(src, off + 12));
+ unsafe.putChar(dst, i + 14, unsafe.getChar(src, off + 14));
+
+ // Prevent loop invariant code motion of char read.
+ src = (src == src1) ? src2 : src1;
+ }
+ }
+
+ /*
+ * Copy multiple consecutive chars from a char array to a given offset in a byte array
+ * to trigger C2's superword optimization. Checks for similar problems as 'copyByteToChar'.
+ */
+ public static void copyCharToByte(char[] src, byte[] dst, int off) {
+ off = (int) BYTE_ARRAY_OFFSET + (off << 1);
+ for (int i = 0; i < 100; i = i + 8) {
+ // Copy 8 chars from src to dst
+ unsafe.putChar(dst, off + 0, src[i + 0]);
+ unsafe.putChar(dst, off + 2, src[i + 1]);
+ unsafe.putChar(dst, off + 4, src[i + 2]);
+ unsafe.putChar(dst, off + 6, src[i + 3]);
+ unsafe.putChar(dst, off + 8, src[i + 4]);
+ unsafe.putChar(dst, off + 10, src[i + 5]);
+ unsafe.putChar(dst, off + 12, src[i + 6]);
+ unsafe.putChar(dst, off + 14, src[i + 7]);
+ }
+ }
+
+ /*
+ * Variant of copyCharToByte with a constant destination array offset.
+ * The loop should always be vectorized because both the LoadUS and StoreC
+ * operations can be aligned.
+ */
+ public static void copyCharToByteAligned(char[] src, byte[] dst) {
+ final int off = (int) BYTE_ARRAY_OFFSET;
+ for (int i = 8; i < 100; i = i + 8) {
+ // Copy 8 chars from src to dst
+ unsafe.putChar(dst, off + 0, src[i + 0]);
+ unsafe.putChar(dst, off + 2, src[i + 1]);
+ unsafe.putChar(dst, off + 4, src[i + 2]);
+ unsafe.putChar(dst, off + 6, src[i + 3]);
+ unsafe.putChar(dst, off + 8, src[i + 4]);
+ unsafe.putChar(dst, off + 10, src[i + 5]);
+ unsafe.putChar(dst, off + 12, src[i + 6]);
+ unsafe.putChar(dst, off + 14, src[i + 7]);
+ }
+ }
+
+ /*
+ * Variant of copyCharToByte with a constant destination array offset. The
+ * loop should only be vectorized if unaligned memory operations are allowed
+ * because not both the LoadUS and the StoreC can be aligned.
+ */
+ public static void copyCharToByteUnaligned(char[] src, byte[] dst) {
+ final int off = (int) BYTE_ARRAY_OFFSET + 2;
+ for (int i = 0; i < 100; i = i + 8) {
+ // Copy 8 chars from src to dst
+ unsafe.putChar(dst, off + 0, src[i + 0]);
+ unsafe.putChar(dst, off + 2, src[i + 1]);
+ unsafe.putChar(dst, off + 4, src[i + 2]);
+ unsafe.putChar(dst, off + 6, src[i + 3]);
+ unsafe.putChar(dst, off + 8, src[i + 4]);
+ unsafe.putChar(dst, off + 10, src[i + 5]);
+ unsafe.putChar(dst, off + 12, src[i + 6]);
+ unsafe.putChar(dst, off + 14, src[i + 7]);
+ }
+ }
+}