src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.amd64/src/org/graalvm/compiler/lir/amd64/AMD64VZeroUpper.java
changeset 59095 03fbcd06b4c0
parent 58299 6df94ce3ab2f
equal deleted inserted replaced
59094:5d4c3724e4c7 59095:03fbcd06b4c0
    30 import java.util.ArrayList;
    30 import java.util.ArrayList;
    31 import java.util.BitSet;
    31 import java.util.BitSet;
    32 
    32 
    33 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
    33 import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
    34 import org.graalvm.compiler.lir.LIRInstructionClass;
    34 import org.graalvm.compiler.lir.LIRInstructionClass;
       
    35 import org.graalvm.compiler.lir.amd64.AMD64Call.ForeignCallOp;
       
    36 import org.graalvm.compiler.lir.amd64.vector.AMD64VectorInstruction;
    35 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
    37 import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
    36 
    38 
    37 import jdk.vm.ci.amd64.AMD64;
    39 import jdk.vm.ci.amd64.AMD64;
    38 import jdk.vm.ci.code.Register;
    40 import jdk.vm.ci.code.Register;
    39 import jdk.vm.ci.code.RegisterConfig;
    41 import jdk.vm.ci.code.RegisterConfig;
    40 import jdk.vm.ci.code.RegisterValue;
    42 import jdk.vm.ci.code.RegisterValue;
    41 import jdk.vm.ci.meta.Value;
    43 import jdk.vm.ci.meta.Value;
    42 
    44 
       
    45 /**
       
    46  * vzeroupper is essential to avoid performance penalty during SSE-AVX transition. Specifically,
       
    47  * once we have executed instructions that modify the upper bits (i.e., 128+) of the YMM registers,
       
     48  * we need to perform vzeroupper to transition the state to 128 bits before executing any SSE
       
    49  * instructions. We don't need to place vzeroupper between VEX-encoded SSE instructions and legacy
       
    50  * SSE instructions, nor between AVX instructions and VEX-encoded SSE instructions.
       
    51  *
       
    52  * When running Graal on HotSpot, we emit a vzeroupper LIR operation (i.e. an instance of this
       
    53  * class) before a foreign call to the runtime function where Graal has no knowledge. The underlying
       
    54  * reason is that HotSpot is SSE-compiled so as to support older CPUs. We also emit a vzeroupper
       
    55  * instruction (see {@code AMD64HotSpotReturnOp.emitCode}) upon returning, if the current LIR graph
       
    56  * contains LIR operations that touch the upper bits of the YMM registers, including but not limited
       
    57  * to {@link AMD64VectorInstruction}, {@link AMD64ArrayCompareToOp}, {@link AMD64ArrayEqualsOp},
       
     58  * {@link AMD64ArrayIndexOfOp}, and {@link ForeignCallOp} that invokes Graal-compiled stubs. For
        
     59  * the last case, since Graal-compiled stubs are under our control, we don't emit vzeroupper upon
       
    60  * returning of the stub, but rather do that upon returning of the current method.
       
    61  *
       
     62  * On JDK8, C2 does not emit many vzeroupper instructions, potentially because YMM registers
       
    63  * are not heavily employed (C2 vectorization starts using YMM registers in 9, source
       
    64  * https://cr.openjdk.java.net/~vlivanov/talks/2017_Vectorization_in_HotSpot_JVM.pdf) and thus less
       
    65  * care has been taken to place these instructions. One example is that many intrinsics employ YMM
       
     66  * registers starting from https://bugs.openjdk.java.net/browse/JDK-8005419, but do not properly
       
    67  * place vzeroupper upon returning of the intrinsic stub or the caller of the stub.
       
    68  *
       
     69  * Most vzeroupper instructions were added in JDK 10 (https://bugs.openjdk.java.net/browse/JDK-8178811), and were
       
    70  * later restricted on Haswell Xeon due to performance regression
       
    71  * (https://bugs.openjdk.java.net/browse/JDK-8190934). The actual condition for placing vzeroupper
       
    72  * is at http://hg.openjdk.java.net/jdk/jdk/file/c7d9df2e470c/src/hotspot/cpu/x86/x86_64.ad#l428. To
       
    73  * summarize, if nmethod employs YMM registers (or intrinsics which use them, search for
       
    74  * clear_upper_avx() in opto/library_call.cpp) vzeroupper will be generated on nmethod's exit and
       
    75  * before any calls in nmethod, because even compiled nmethods can still use only SSE instructions.
       
    76  *
       
    77  * This means, if a Java method performs a call to an intrinsic that employs YMM registers,
       
    78  * C2-compiled code will place a vzeroupper before the call, upon exit of the stub and upon exit of
       
    79  * this method. Graal will only place the last, because it ensures that Graal-compiled Java method
       
    80  * and stubs will be consistent on using VEX-encoding.
       
    81  *
       
    82  * In SubstrateVM, since the whole image is compiled consistently with or without VEX encoding (the
       
     83  * latter is the default behavior, see {@code NativeImageGenerator.createTarget}), there is no need
       
    84  * for vzeroupper. For dynamic compilation on a SubstrateVM image, if the image is SSE-compiled, we
       
    85  * then need vzeroupper when returning from the dynamic compiled code to the pre-built image code.
       
    86  */
    43 public class AMD64VZeroUpper extends AMD64LIRInstruction {
    87 public class AMD64VZeroUpper extends AMD64LIRInstruction {
    44 
    88 
    45     public static final LIRInstructionClass<AMD64VZeroUpper> TYPE = LIRInstructionClass.create(AMD64VZeroUpper.class);
    89     public static final LIRInstructionClass<AMD64VZeroUpper> TYPE = LIRInstructionClass.create(AMD64VZeroUpper.class);
    46 
    90 
    47     @Temp protected final RegisterValue[] xmmRegisters;
    91     @Temp protected final RegisterValue[] xmmRegisters;