8148490: RegisterSaver::restore_live_registers() fails to restore xmm registers on 32 bit
authorthartmann
Wed, 03 Feb 2016 09:09:56 +0100
changeset 35757 0eeda480b926
parent 35756 28e2a0fd6756
child 35758 6109e2e1e1de
child 35759 85d3873f87b4
8148490: RegisterSaver::restore_live_registers() fails to restore xmm registers on 32 bit Summary: Fixed stack offsets for xmm register saving/restoring code on 32 bit. Reviewed-by: kvn, mcberg
hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp
hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp
hotspot/test/compiler/runtime/safepoints/TestRegisterRestoring.java
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Tue Jan 19 14:52:33 2016 +0100
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_32.cpp	Wed Feb 03 09:09:56 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -114,15 +114,20 @@
 
 OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words,
                                            int* total_frame_words, bool verify_fpu, bool save_vectors) {
-  int vect_words = 0;
   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
+  int ymm_bytes = num_xmm_regs * 16;
+  int zmm_bytes = num_xmm_regs * 32;
 #ifdef COMPILER2
   if (save_vectors) {
-    assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
-    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
-    // Save upper half of ZMM/YMM registers :
-    vect_words = 8 * 16 / wordSize;
-    additional_frame_words += vect_words;
+    assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
+    assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
+    // Save upper half of YMM registers
+    int vect_bytes = ymm_bytes;
+    if (UseAVX > 2) {
+      // Save upper half of ZMM registers as well
+      vect_bytes += zmm_bytes;
+    }
+    additional_frame_words += vect_bytes / wordSize;
   }
 #else
   assert(!save_vectors, "vectors are generated only by C2");
@@ -185,13 +190,14 @@
 
   off = xmm0_off;
   delta = xmm1_off - off;
-  if(UseSSE == 1) {           // Save the XMM state
+  if(UseSSE == 1) {
+    // Save the XMM state
     for (int n = 0; n < num_xmm_regs; n++) {
       __ movflt(Address(rsp, off*wordSize), as_XMMRegister(n));
       off += delta;
     }
   } else if(UseSSE >= 2) {
-    // Save whole 128bit (16 bytes) XMM regiters
+    // Save whole 128bit (16 bytes) XMM registers
     for (int n = 0; n < num_xmm_regs; n++) {
       __ movdqu(Address(rsp, off*wordSize), as_XMMRegister(n));
       off += delta;
@@ -199,13 +205,14 @@
   }
 
   if (save_vectors) {
-    assert(vect_words*wordSize == 128, "");
-    __ subptr(rsp, 128); // Save upper half of YMM registes
+    __ subptr(rsp, ymm_bytes);
+    // Save upper half of YMM registers
     for (int n = 0; n < num_xmm_regs; n++) {
       __ vextractf128h(Address(rsp, n*16), as_XMMRegister(n));
     }
     if (UseAVX > 2) {
-      __ subptr(rsp, 256); // Save upper half of ZMM registes
+      __ subptr(rsp, zmm_bytes);
+      // Save upper half of ZMM registers
       for (int n = 0; n < num_xmm_regs; n++) {
         __ vextractf64x4h(Address(rsp, n*32), as_XMMRegister(n), 1);
       }
@@ -255,48 +262,57 @@
 
 void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_vectors) {
   int num_xmm_regs = XMMRegisterImpl::number_of_registers;
+  int ymm_bytes = num_xmm_regs * 16;
+  int zmm_bytes = num_xmm_regs * 32;
   // Recover XMM & FPU state
   int additional_frame_bytes = 0;
 #ifdef COMPILER2
   if (restore_vectors) {
-    assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
-    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
-    additional_frame_bytes = 128;
+    assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
+    assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
+    // Save upper half of YMM registers
+    additional_frame_bytes = ymm_bytes;
+    if (UseAVX > 2) {
+      // Save upper half of ZMM registers as well
+      additional_frame_bytes += zmm_bytes;
+    }
   }
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
 #endif
 
+  int off = xmm0_off;
+  int delta = xmm1_off - off;
+
+  if (UseSSE == 1) {
+    // Restore XMM registers
+    assert(additional_frame_bytes == 0, "");
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ movflt(as_XMMRegister(n), Address(rsp, off*wordSize));
+      off += delta;
+    }
+  } else if (UseSSE >= 2) {
+    // Restore whole 128bit (16 bytes) XMM registers. Do this before restoring YMM and
+    // ZMM because the movdqu instruction zeros the upper part of the XMM register.
+    for (int n = 0; n < num_xmm_regs; n++) {
+      __ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
+      off += delta;
+    }
+  }
+
   if (restore_vectors) {
-    assert(additional_frame_bytes == 128, "");
     if (UseAVX > 2) {
       // Restore upper half of ZMM registers.
       for (int n = 0; n < num_xmm_regs; n++) {
         __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, n*32), 1);
       }
-      __ addptr(rsp, additional_frame_bytes*2); // Save upper half of ZMM registes
+      __ addptr(rsp, zmm_bytes);
     }
-    // Restore upper half of YMM registes.
+    // Restore upper half of YMM registers.
     for (int n = 0; n < num_xmm_regs; n++) {
       __ vinsertf128h(as_XMMRegister(n), Address(rsp, n*16));
     }
-    __ addptr(rsp, additional_frame_bytes); // Save upper half of YMM registes
-  }
-
-  int off = xmm0_off;
-  int delta = xmm1_off - off;
-
-  if (UseSSE == 1) {
-    for (int n = 0; n < num_xmm_regs; n++) {
-      __ movflt(as_XMMRegister(n), Address(rsp, off*wordSize));
-      off += delta;
-    }
-  } else if (UseSSE >= 2) {
-    // additional_frame_bytes only populated for the restore_vector case, else it is 0
-    for (int n = 0; n < num_xmm_regs; n++) {
-      __ movdqu(as_XMMRegister(n), Address(rsp, off*wordSize+additional_frame_bytes));
-      off += delta;
-    }
+    __ addptr(rsp, ymm_bytes);
   }
 
   __ pop_FPU_state();
@@ -306,7 +322,6 @@
   __ popa();
   // Get the rbp, described implicitly by the frame sender code (no oopMap)
   __ pop(rbp);
-
 }
 
 void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
--- a/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Tue Jan 19 14:52:33 2016 +0100
+++ b/hotspot/src/cpu/x86/vm/sharedRuntime_x86_64.cpp	Wed Feb 03 09:09:56 2016 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -150,8 +150,8 @@
   }
 #if defined(COMPILER2) || INCLUDE_JVMCI
   if (save_vectors) {
-    assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
-    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
+    assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
+    assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
   }
 #else
   assert(!save_vectors, "vectors are generated only by C2 and JVMCI");
@@ -176,18 +176,18 @@
 
   // push cpu state handles this on EVEX enabled targets
   if (save_vectors) {
-    // Save upper half of YMM registes(0..15)
+    // Save upper half of YMM registers(0..15)
     int base_addr = XSAVE_AREA_YMM_BEGIN;
     for (int n = 0; n < 16; n++) {
       __ vextractf128h(Address(rsp, base_addr+n*16), as_XMMRegister(n));
     }
     if (VM_Version::supports_evex()) {
-      // Save upper half of ZMM registes(0..15)
+      // Save upper half of ZMM registers(0..15)
       base_addr = XSAVE_AREA_ZMM_BEGIN;
       for (int n = 0; n < 16; n++) {
         __ vextractf64x4h(Address(rsp, base_addr+n*32), as_XMMRegister(n), 1);
       }
-      // Save full ZMM registes(16..num_xmm_regs)
+      // Save full ZMM registers(16..num_xmm_regs)
       base_addr = XSAVE_AREA_UPPERBANK;
       off = 0;
       int vector_len = Assembler::AVX_512bit;
@@ -321,8 +321,8 @@
 
 #if defined(COMPILER2) || INCLUDE_JVMCI
   if (restore_vectors) {
-    assert(UseAVX > 0, "512bit vectors are supported only with EVEX");
-    assert(MaxVectorSize == 64, "only 512bit vectors are supported now");
+    assert(UseAVX > 0, "up to 512bit vectors are supported with EVEX");
+    assert(MaxVectorSize <= 64, "up to 512bit vectors are supported now");
   }
 #else
   assert(!restore_vectors, "vectors are generated only by C2");
@@ -330,18 +330,18 @@
 
   // On EVEX enabled targets everything is handled in pop fpu state
   if (restore_vectors) {
-    // Restore upper half of YMM registes (0..15)
+    // Restore upper half of YMM registers (0..15)
     int base_addr = XSAVE_AREA_YMM_BEGIN;
     for (int n = 0; n < 16; n++) {
       __ vinsertf128h(as_XMMRegister(n), Address(rsp,  base_addr+n*16));
     }
     if (VM_Version::supports_evex()) {
-      // Restore upper half of ZMM registes (0..15)
+      // Restore upper half of ZMM registers (0..15)
       base_addr = XSAVE_AREA_ZMM_BEGIN;
       for (int n = 0; n < 16; n++) {
         __ vinsertf64x4h(as_XMMRegister(n), Address(rsp, base_addr+n*32), 1);
       }
-      // Restore full ZMM registes(16..num_xmm_regs)
+      // Restore full ZMM registers(16..num_xmm_regs)
       base_addr = XSAVE_AREA_UPPERBANK;
       int vector_len = Assembler::AVX_512bit;
       int off = 0;
@@ -351,7 +351,7 @@
     }
   } else {
     if (VM_Version::supports_evex()) {
-      // Restore upper bank of ZMM registes(16..31) for double/float usage
+      // Restore upper bank of ZMM registers(16..31) for double/float usage
       int base_addr = XSAVE_AREA_UPPERBANK;
       int off = 0;
       for (int n = 16; n < num_xmm_regs; n++) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/runtime/safepoints/TestRegisterRestoring.java	Wed Feb 03 09:09:56 2016 +0100
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+
+/**
+ * @test
+ * @bug 8148490
+ * @summary Test correct saving and restoring of vector registers at safepoints.
+ * @run main/othervm -Xbatch -XX:-TieredCompilation -XX:CompileCommand=exclude,TestRegisterRestoring::main -XX:+SafepointALot TestRegisterRestoring
+ */
+public class TestRegisterRestoring {
+  public static void main(String args[]) throws Exception {
+    // Initialize
+    float[] array = new float[100];
+    for (int i = 0; i < array.length; ++i) {
+      array[i] = 0;
+    }
+    // Test
+    for (int j = 0; j < 20_000; ++j) {
+      increment(array);
+      // Check result
+      for (int i = 0; i < array.length; i++) {
+        if (array[i] != 10_000) {
+          throw new RuntimeException("Test failed: array[" + i + "] = " + array[i] + " but should be 10.000");
+        }
+        array[i] = 0;
+      }
+    }
+  }
+
+  static void increment(float[] array) {
+    // Loop with safepoint
+    for (long l = 0; l < 10_000; l++) {
+      // Vectorized loop
+      for (int i = 0; i < array.length; ++i) {
+        array[i] += 1;
+      }
+    }
+  }
+}
+