import java.util.ArrayList;
import java.util.BitSet;

import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.amd64.AMD64Call.ForeignCallOp;
import org.graalvm.compiler.lir.amd64.vector.AMD64VectorInstruction;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.RegisterConfig;
import jdk.vm.ci.code.RegisterValue;
import jdk.vm.ci.meta.Value;

|
/**
 * vzeroupper is essential to avoid a performance penalty during SSE-AVX transition. Specifically,
 * once we have executed instructions that modify the upper bits (i.e., 128+) of the YMM registers,
 * we need to perform vzeroupper to transition the state to 128 bits before executing any SSE
 * instructions. We don't need to place vzeroupper between VEX-encoded SSE instructions and legacy
 * SSE instructions, nor between AVX instructions and VEX-encoded SSE instructions.
 *
 * When running Graal on HotSpot, we emit a vzeroupper LIR operation (i.e. an instance of this
 * class) before a foreign call to a runtime function of which Graal has no knowledge. The
 * underlying reason is that HotSpot is SSE-compiled so as to support older CPUs. We also emit a
 * vzeroupper instruction (see {@code AMD64HotSpotReturnOp.emitCode}) upon returning, if the current
 * LIR graph contains LIR operations that touch the upper bits of the YMM registers, including but
 * not limited to {@link AMD64VectorInstruction}, {@link AMD64ArrayCompareToOp},
 * {@link AMD64ArrayEqualsOp}, {@link AMD64ArrayIndexOfOp}, and {@link ForeignCallOp} that invokes
 * Graal-compiled stubs. For the last case, since Graal-compiled stubs are under our control, we
 * don't emit vzeroupper upon returning from the stub, but rather do that upon returning from the
 * current method.
 *
 * On JDK8, C2 does not emit many vzeroupper instructions, potentially because YMM registers are
 * not heavily employed (C2 vectorization starts using YMM registers in 9, source
 * https://cr.openjdk.java.net/~vlivanov/talks/2017_Vectorization_in_HotSpot_JVM.pdf) and thus less
 * care has been taken to place these instructions. One example is that many intrinsics employ YMM
 * registers starting from https://bugs.openjdk.java.net/browse/JDK-8005419, but do not properly
 * place vzeroupper upon returning from the intrinsic stub or the caller of the stub.
 *
 * Most vzeroupper instructions were added in JDK 10
 * (https://bugs.openjdk.java.net/browse/JDK-8178811), and were later restricted on Haswell Xeon
 * due to a performance regression (https://bugs.openjdk.java.net/browse/JDK-8190934). The actual
 * condition for placing vzeroupper is at
 * http://hg.openjdk.java.net/jdk/jdk/file/c7d9df2e470c/src/hotspot/cpu/x86/x86_64.ad#l428. To
 * summarize, if an nmethod employs YMM registers (or intrinsics which use them, search for
 * clear_upper_avx() in opto/library_call.cpp), vzeroupper will be generated on the nmethod's exit
 * and before any calls in the nmethod, because even compiled nmethods can still use only SSE
 * instructions.
 *
 * This means, if a Java method performs a call to an intrinsic that employs YMM registers,
 * C2-compiled code will place a vzeroupper before the call, upon exit of the stub, and upon exit
 * of this method. Graal will only place the last, because it ensures that Graal-compiled Java
 * methods and stubs will be consistent in using VEX encoding.
 *
 * In SubstrateVM, since the whole image is compiled consistently with or without VEX encoding (the
 * latter is the default behavior, see {@code NativeImageGenerator.createTarget}), there is no need
 * for vzeroupper. For dynamic compilation on a SubstrateVM image, if the image is SSE-compiled, we
 * then need vzeroupper when returning from the dynamically compiled code to the pre-built image
 * code.
 */
public class AMD64VZeroUpper extends AMD64LIRInstruction {

    public static final LIRInstructionClass<AMD64VZeroUpper> TYPE = LIRInstructionClass.create(AMD64VZeroUpper.class);

    @Temp protected final RegisterValue[] xmmRegisters;