/*
 * Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
|
50858
|
23 |
|
|
24 |
|
43972
|
25 |
package org.graalvm.compiler.lir.amd64;
|
|
26 |
|
|
27 |
import static jdk.vm.ci.code.ValueUtil.asRegister;
|
|
28 |
import static jdk.vm.ci.code.ValueUtil.isRegister;
|
|
29 |
|
58299
|
30 |
import java.util.ArrayList;
|
43972
|
31 |
import java.util.BitSet;
|
|
32 |
|
|
33 |
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
|
|
34 |
import org.graalvm.compiler.lir.LIRInstructionClass;
|
59095
|
35 |
import org.graalvm.compiler.lir.amd64.AMD64Call.ForeignCallOp;
|
|
36 |
import org.graalvm.compiler.lir.amd64.vector.AMD64VectorInstruction;
|
43972
|
37 |
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
|
|
38 |
|
|
39 |
import jdk.vm.ci.amd64.AMD64;
|
|
40 |
import jdk.vm.ci.code.Register;
|
58299
|
41 |
import jdk.vm.ci.code.RegisterConfig;
|
43972
|
42 |
import jdk.vm.ci.code.RegisterValue;
|
|
43 |
import jdk.vm.ci.meta.Value;
|
|
44 |
|
/**
 * vzeroupper is essential to avoid performance penalties during SSE-AVX transitions. Specifically,
 * once we have executed instructions that modify the upper bits (i.e., 128+) of the YMM registers,
 * we need to perform vzeroupper to transition the state to 128 bits before executing any SSE
 * instructions. We don't need to place vzeroupper between VEX-encoded SSE instructions and legacy
 * SSE instructions, nor between AVX instructions and VEX-encoded SSE instructions.
 *
 * When running Graal on HotSpot, we emit a vzeroupper LIR operation (i.e. an instance of this
 * class) before a foreign call to a runtime function of which Graal has no knowledge. The
 * underlying reason is that HotSpot is SSE-compiled so as to support older CPUs. We also emit a
 * vzeroupper instruction (see {@code AMD64HotSpotReturnOp.emitCode}) upon returning, if the current
 * LIR graph contains LIR operations that touch the upper bits of the YMM registers, including but
 * not limited to {@link AMD64VectorInstruction}, {@link AMD64ArrayCompareToOp},
 * {@link AMD64ArrayEqualsOp}, {@link AMD64ArrayIndexOfOp}, and {@link ForeignCallOp} that invokes
 * Graal-compiled stubs. For the last case, since Graal-compiled stubs are under our control, we
 * don't emit vzeroupper upon returning from the stub, but rather do that upon returning from the
 * current method.
 *
 * On JDK8, C2 does not emit many vzeroupper instructions, potentially because YMM registers are not
 * heavily employed (C2 vectorization starts using YMM registers in 9, source
 * https://cr.openjdk.java.net/~vlivanov/talks/2017_Vectorization_in_HotSpot_JVM.pdf) and thus less
 * care has been taken to place these instructions. One example is that many intrinsics employ YMM
 * registers starting from https://bugs.openjdk.java.net/browse/JDK-8005419, but do not properly
 * place vzeroupper upon returning from the intrinsic stub or the caller of the stub.
 *
 * Most vzeroupper instructions were added in JDK 10
 * (https://bugs.openjdk.java.net/browse/JDK-8178811), and were later restricted on Haswell Xeon due
 * to a performance regression (https://bugs.openjdk.java.net/browse/JDK-8190934). The actual
 * condition for placing vzeroupper is at
 * http://hg.openjdk.java.net/jdk/jdk/file/c7d9df2e470c/src/hotspot/cpu/x86/x86_64.ad#l428. To
 * summarize, if an nmethod employs YMM registers (or intrinsics which use them, search for
 * clear_upper_avx() in opto/library_call.cpp) vzeroupper will be generated on the nmethod's exit
 * and before any calls in the nmethod, because even compiled nmethods can still use only SSE
 * instructions.
 *
 * This means, if a Java method performs a call to an intrinsic that employs YMM registers,
 * C2-compiled code will place a vzeroupper before the call, upon exit of the stub, and upon exit of
 * this method. Graal will only place the last, because it ensures that Graal-compiled Java methods
 * and stubs are consistent in using VEX encoding.
 *
 * In SubstrateVM, since the whole image is compiled consistently with or without VEX encoding (the
 * latter is the default behavior, see {@code NativeImageGenerator.createTarget}), there is no need
 * for vzeroupper. For dynamic compilation on a SubstrateVM image, if the image is SSE-compiled, we
 * then need vzeroupper when returning from the dynamically compiled code to the pre-built image
 * code.
 */
|
43972
|
87 |
public class AMD64VZeroUpper extends AMD64LIRInstruction {
|
|
88 |
|
|
89 |
public static final LIRInstructionClass<AMD64VZeroUpper> TYPE = LIRInstructionClass.create(AMD64VZeroUpper.class);
|
|
90 |
|
|
91 |
@Temp protected final RegisterValue[] xmmRegisters;
|
|
92 |
|
58299
|
93 |
public AMD64VZeroUpper(Value[] exclude, RegisterConfig registerConfig) {
|
43972
|
94 |
super(TYPE);
|
58299
|
95 |
xmmRegisters = initRegisterValues(exclude, registerConfig);
|
43972
|
96 |
}
|
|
97 |
|
58299
|
98 |
private static RegisterValue[] initRegisterValues(Value[] exclude, RegisterConfig registerConfig) {
|
43972
|
99 |
BitSet skippedRegs = new BitSet();
|
|
100 |
if (exclude != null) {
|
|
101 |
for (Value value : exclude) {
|
|
102 |
if (isRegister(value) && asRegister(value).getRegisterCategory().equals(AMD64.XMM)) {
|
|
103 |
skippedRegs.set(asRegister(value).number);
|
|
104 |
}
|
|
105 |
}
|
|
106 |
}
|
58299
|
107 |
ArrayList<RegisterValue> regs = new ArrayList<>();
|
|
108 |
for (Register r : registerConfig.getCallerSaveRegisters()) {
|
|
109 |
if (r.getRegisterCategory().equals(AMD64.XMM) && !skippedRegs.get(r.number)) {
|
|
110 |
regs.add(r.asValue());
|
43972
|
111 |
}
|
|
112 |
}
|
58299
|
113 |
return regs.toArray(new RegisterValue[regs.size()]);
|
43972
|
114 |
}
|
|
115 |
|
|
116 |
@Override
|
|
117 |
public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
|
|
118 |
asm.vzeroupper();
|
|
119 |
}
|
|
120 |
}
|