--- a/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.aarch64/src/org/graalvm/compiler/lir/aarch64/AArch64ZeroMemoryOp.java Wed Oct 09 19:38:11 2019 -0700
+++ b/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.aarch64/src/org/graalvm/compiler/lir/aarch64/AArch64ZeroMemoryOp.java Wed Oct 09 21:53:48 2019 -0700
@@ -32,6 +32,7 @@
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.aarch64.AArch64Address;
import org.graalvm.compiler.asm.aarch64.AArch64Assembler;
+import org.graalvm.compiler.asm.aarch64.AArch64Assembler.ConditionFlag;
import org.graalvm.compiler.asm.aarch64.AArch64MacroAssembler;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
@@ -39,7 +40,7 @@
import jdk.vm.ci.code.CodeUtil;
import jdk.vm.ci.code.Register;
-import jdk.vm.ci.meta.AllocatableValue;
+import jdk.vm.ci.meta.Value;
/**
* Zero a chunk of memory on AArch64.
@@ -48,136 +49,198 @@
public final class AArch64ZeroMemoryOp extends AArch64LIRInstruction {
public static final LIRInstructionClass<AArch64ZeroMemoryOp> TYPE = LIRInstructionClass.create(AArch64ZeroMemoryOp.class);
- @Use({REG}) protected AllocatableValue addressValue;
- @Use({REG}) protected AllocatableValue lengthValue;
+ @Use({REG}) protected Value addressValue;
+ @Use({REG}) protected Value lengthValue;
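+    // The emitted code modifies both registers in place (base is post-indexed and
+    // size is decremented), so the same values are also declared as temps below.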
+ @Temp({REG}) protected Value addressValueTemp;
+ @Temp({REG}) protected Value lengthValueTemp;
+
+ private final boolean isAligned;
private final boolean useDcZva;
private final int zvaLength;
/**
* Constructor of AArch64ZeroMemoryOp.
*
- * @param address allocatable 8-byte aligned base address of the memory chunk.
- * @param length allocatable length of the memory chunk, the value must be multiple of 8.
+ * @param address starting address of the memory chunk to be zeroed.
+ * @param length size of the memory chunk to be zeroed, in bytes.
+ * @param isAligned whether both address and size are aligned to 8 bytes.
-     * @param useDcZva is DC ZVA instruction is able to use.
-     * @param zvaLength the ZVA length info of current AArch64 CPU, negative value indicates length
-     *            is unknown at compile time.
+     * @param useDcZva whether the DC ZVA instruction can be used for bulk zeroing.
+     * @param zvaLength the ZVA length of the current AArch64 CPU; a negative value indicates the
+     *            length is unknown at compile time.
*/
- public AArch64ZeroMemoryOp(AllocatableValue address, AllocatableValue length, boolean useDcZva, int zvaLength) {
+ public AArch64ZeroMemoryOp(Value address, Value length, boolean isAligned, boolean useDcZva, int zvaLength) {
super(TYPE);
this.addressValue = address;
this.lengthValue = length;
+ this.addressValueTemp = address;
+ this.lengthValueTemp = length;
this.useDcZva = useDcZva;
this.zvaLength = zvaLength;
+ this.isAligned = isAligned;
}
@Override
protected void emitCode(CompilationResultBuilder crb, AArch64MacroAssembler masm) {
Register base = asRegister(addressValue);
Register size = asRegister(lengthValue);
- if (useDcZva && zvaLength > 0) {
- // From ARMv8-A architecture reference manual D12.2.35 Data Cache Zero ID register:
- // A valid ZVA length should be a power-of-2 value in [4, 2048]
- assert (CodeUtil.isPowerOf2(zvaLength) && 4 <= zvaLength && zvaLength <= 2048);
- emitZeroMemoryWithDc(masm, base, size, zvaLength);
- } else {
- // Use store pair instructions (STP) to zero memory as a fallback.
- emitZeroMemoryWithStp(masm, base, size);
- }
- }
+
+ try (AArch64MacroAssembler.ScratchRegister scratchRegister = masm.getScratchRegister()) {
+ Register alignmentBits = scratchRegister.getRegister();
+
+ Label tail = new Label();
+ Label done = new Label();
+
+ // Jump to DONE if size is zero.
+ masm.cbz(64, size, done);
+
+ if (!isAligned) {
+ Label baseAlignedTo2Bytes = new Label();
+ Label baseAlignedTo4Bytes = new Label();
+ Label baseAlignedTo8Bytes = new Label();
+
+                // Jump to the per-byte zeroing loop if size is less than 8.
+ masm.cmp(64, size, 8);
+ masm.branchConditionally(ConditionFlag.LT, tail);
+
+                // Make base 8-byte aligned.
+ masm.neg(64, alignmentBits, base);
+ masm.and(64, alignmentBits, alignmentBits, 7);
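+                // alignmentBits = (-base) & 7 is the number of bytes to store before base
+                // reaches the next 8-byte boundary; its bits are tested one by one below.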
+
+ masm.tbz(alignmentBits, 0, baseAlignedTo2Bytes);
+ masm.sub(64, size, size, 1);
+ masm.str(8, zr, AArch64Address.createPostIndexedImmediateAddress(base, 1));
+ masm.bind(baseAlignedTo2Bytes);
+
+ masm.tbz(alignmentBits, 1, baseAlignedTo4Bytes);
+ masm.sub(64, size, size, 2);
+ masm.str(16, zr, AArch64Address.createPostIndexedImmediateAddress(base, 2));
+ masm.bind(baseAlignedTo4Bytes);
- /**
- * Zero a chunk of memory with DC ZVA instructions.
- *
- * @param masm the AArch64 macro assembler.
- * @param base base an 8-byte aligned address of the memory chunk to be zeroed.
- * @param size size of the memory chunk to be zeroed, in bytes, must be multiple of 8.
- * @param zvaLength the ZVA length info of current AArch64 CPU.
- */
- private static void emitZeroMemoryWithDc(AArch64MacroAssembler masm, Register base, Register size, int zvaLength) {
- Label preLoop = new Label();
- Label zvaLoop = new Label();
- Label postLoop = new Label();
- Label tail = new Label();
- Label done = new Label();
+ masm.tbz(alignmentBits, 2, baseAlignedTo8Bytes);
+ masm.sub(64, size, size, 4);
+ masm.str(32, zr, AArch64Address.createPostIndexedImmediateAddress(base, 4));
+ masm.bind(baseAlignedTo8Bytes);
+ // At this point base is 8-byte aligned.
+ }
+
+ if (useDcZva && zvaLength > 0) {
+ // From ARMv8-A architecture reference manual D12.2.35 Data Cache Zero ID register:
+ // A valid ZVA length should be a power-of-2 value in [4, 2048]
+ assert (CodeUtil.isPowerOf2(zvaLength) && 4 <= zvaLength && zvaLength <= 2048);
- try (AArch64MacroAssembler.ScratchRegister sc1 = masm.getScratchRegister()) {
- Register rscratch1 = sc1.getRegister();
+ Label preCheck = new Label();
+ Label preLoop = new Label();
+ Label mainCheck = new Label();
+ Label mainLoop = new Label();
+ Label postCheck = new Label();
+ Label postLoop = new Label();
+
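+                // Count number of bytes to be pre-zeroed to align base address with ZVA length.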
+ masm.neg(64, alignmentBits, base);
+ masm.and(64, alignmentBits, alignmentBits, zvaLength - 1);
- // Count number of bytes to be pre-zeroed to align base address with ZVA length.
- masm.neg(64, rscratch1, base);
- masm.and(64, rscratch1, rscratch1, zvaLength - 1);
+                // Is size no larger than the number of bytes to be pre-zeroed? Jump to POST_CHECK if so.
+ masm.cmp(64, size, alignmentBits);
+ masm.branchConditionally(AArch64Assembler.ConditionFlag.LE, postCheck);
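+                // Deduct the bytes that the pre loop is about to zero.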
+ masm.sub(64, size, size, alignmentBits);
+
+                // Pre loop: zero 8 bytes per iteration until base is aligned to the ZVA length.
+ masm.jmp(preCheck);
+
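+                // Align the loop head to a 16-byte boundary (wordSize * 2).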
+ masm.align(crb.target.wordSize * 2);
+ masm.bind(preLoop);
+ masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
+ masm.bind(preCheck);
+ masm.subs(64, alignmentBits, alignmentBits, 8);
+ masm.branchConditionally(AArch64Assembler.ConditionFlag.GE, preLoop);
- // Is size less than number of bytes to be pre-zeroed? Jump to POST_LOOP if so.
- masm.cmp(64, size, rscratch1);
- masm.branchConditionally(AArch64Assembler.ConditionFlag.LE, postLoop);
- masm.sub(64, size, size, rscratch1);
+                // Main loop: zero zvaLength bytes per iteration with DC ZVA.
+ masm.jmp(mainCheck);
+
+ masm.align(crb.target.wordSize * 2);
+ masm.bind(mainLoop);
+ masm.dc(AArch64Assembler.DataCacheOperationType.ZVA, base);
+ masm.add(64, base, base, zvaLength);
+ masm.bind(mainCheck);
+ masm.subs(64, size, size, zvaLength);
+ masm.branchConditionally(AArch64Assembler.ConditionFlag.GE, mainLoop);
+
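+                // The main loop exits with size negative; add zvaLength back to get the remaining byte count.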
+ masm.add(64, size, size, zvaLength);
+
+                // Post loop: zero the remaining bytes, 8 at a time.
+ masm.jmp(postCheck);
- // Pre-ZVA loop.
- masm.bind(preLoop);
- masm.subs(64, rscratch1, rscratch1, 8);
- masm.branchConditionally(AArch64Assembler.ConditionFlag.LT, zvaLoop);
- masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
- masm.jmp(preLoop);
+ masm.align(crb.target.wordSize * 2);
+ masm.bind(postLoop);
+ masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
+ masm.bind(postCheck);
+ masm.subs(64, size, size, 8);
+ masm.branchConditionally(AArch64Assembler.ConditionFlag.GE, postLoop);
+
+ if (!isAligned) {
+                    // The post loop exits with size in [-8, -1]; add 8 back to get the tail byte count.
+ masm.add(64, size, size, 8);
+ }
+ } else {
+ Label mainCheck = new Label();
+ Label mainLoop = new Label();
+
+ if (!isAligned) {
+                    // Aligning base may have left fewer than 8 bytes to zero, so check again.
+ masm.cmp(64, size, 8);
+ masm.branchConditionally(ConditionFlag.LT, tail);
+ }
- // ZVA loop.
- masm.bind(zvaLoop);
- masm.subs(64, size, size, zvaLength);
- masm.branchConditionally(AArch64Assembler.ConditionFlag.LT, tail);
- masm.dc(AArch64Assembler.DataCacheOperationType.ZVA, base);
- masm.add(64, base, base, zvaLength);
- masm.jmp(zvaLoop);
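+                // Is base already 16-byte aligned? Jump to the STP loop if so; otherwise
+                // store 8 bytes first to make it so.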
+ masm.tbz(base, 3, mainCheck);
+ masm.sub(64, size, size, 8);
+ masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
+ masm.jmp(mainCheck);
- // Handle bytes after ZVA loop.
+ // The STP loop that zeros 16 bytes in each iteration.
+ masm.align(crb.target.wordSize * 2);
+ masm.bind(mainLoop);
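+                // The post-index immediate of a pair store is scaled by the register size
+                // (8 bytes here), so an immediate of 2 advances base by 16 bytes.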
+ masm.stp(64, zr, zr, AArch64Address.createPostIndexedImmediateAddress(base, 2));
+ masm.bind(mainCheck);
+ masm.subs(64, size, size, 16);
+ masm.branchConditionally(AArch64Assembler.ConditionFlag.GE, mainLoop);
+
+ // We may need to zero the tail 8 bytes of the memory chunk.
+ masm.add(64, size, size, 16);
+ masm.tbz(size, 3, tail);
+ masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
+
+ if (!isAligned) {
+                    // The 8 bytes just stored are still counted in size; remove them before the tail loop.
+ masm.sub(64, size, size, 8);
+ }
+ }
+
masm.bind(tail);
- masm.add(64, size, size, zvaLength);
+ if (!isAligned) {
+ Label perByteZeroingLoop = new Label();
- // Post-ZVA loop.
- masm.bind(postLoop);
- masm.subs(64, size, size, 8);
- masm.branchConditionally(AArch64Assembler.ConditionFlag.LT, done);
- masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
- masm.jmp(postLoop);
-
- // Done.
+                // The loop below is do-while style, so ensure size > 0 before entering it.
+                masm.cbz(64, size, done);
+ masm.align(crb.target.wordSize * 2);
+ masm.bind(perByteZeroingLoop);
+ masm.str(8, zr, AArch64Address.createPostIndexedImmediateAddress(base, 1));
+ masm.subs(64, size, size, 1);
+ masm.branchConditionally(AArch64Assembler.ConditionFlag.NE, perByteZeroingLoop);
+ }
masm.bind(done);
}
}
- /**
- * Zero a chunk of memory with STP instructions.
- *
- * @param masm the AArch64 macro assembler.
- * @param base base an 8-byte aligned address of the memory chunk to be zeroed.
- * @param size size of the memory chunk to be zeroed, in bytes, must be multiple of 8.
- */
- private static void emitZeroMemoryWithStp(AArch64MacroAssembler masm, Register base, Register size) {
- Label loop = new Label();
- Label tail = new Label();
- Label done = new Label();
-
- // Jump to DONE if size is zero.
- masm.cbz(64, size, done);
-
- // Is base address already 16-byte aligned? Jump to LDP loop if so.
- masm.tbz(base, 3, loop);
- masm.sub(64, size, size, 8);
- masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
-
- // The STP loop that zeros 16 bytes in each iteration.
- masm.bind(loop);
- masm.subs(64, size, size, 16);
- masm.branchConditionally(AArch64Assembler.ConditionFlag.LT, tail);
- masm.stp(64, zr, zr, AArch64Address.createPostIndexedImmediateAddress(base, 2));
- masm.jmp(loop);
-
- // We may need to zero the tail 8 bytes of the memory chunk.
- masm.bind(tail);
- masm.adds(64, size, size, 16);
- masm.branchConditionally(AArch64Assembler.ConditionFlag.EQ, done);
- masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
-
- // Done.
- masm.bind(done);
- }
}