src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.aarch64/src/org/graalvm/compiler/lir/aarch64/AArch64ZeroMemoryOp.java
changeset 58533 46b0b7fe255c
parent 58299 6df94ce3ab2f
child 58679 9c3209ff7550
--- a/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.aarch64/src/org/graalvm/compiler/lir/aarch64/AArch64ZeroMemoryOp.java	Wed Oct 09 19:38:11 2019 -0700
+++ b/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.aarch64/src/org/graalvm/compiler/lir/aarch64/AArch64ZeroMemoryOp.java	Wed Oct 09 21:53:48 2019 -0700
@@ -32,6 +32,7 @@
 import org.graalvm.compiler.asm.Label;
 import org.graalvm.compiler.asm.aarch64.AArch64Address;
 import org.graalvm.compiler.asm.aarch64.AArch64Assembler;
+import org.graalvm.compiler.asm.aarch64.AArch64Assembler.ConditionFlag;
 import org.graalvm.compiler.asm.aarch64.AArch64MacroAssembler;
 import org.graalvm.compiler.lir.LIRInstructionClass;
 import org.graalvm.compiler.lir.Opcode;
@@ -39,7 +40,7 @@
 
 import jdk.vm.ci.code.CodeUtil;
 import jdk.vm.ci.code.Register;
-import jdk.vm.ci.meta.AllocatableValue;
+import jdk.vm.ci.meta.Value;
 
 /**
  * Zero a chunk of memory on AArch64.
@@ -48,136 +49,186 @@
 public final class AArch64ZeroMemoryOp extends AArch64LIRInstruction {
     public static final LIRInstructionClass<AArch64ZeroMemoryOp> TYPE = LIRInstructionClass.create(AArch64ZeroMemoryOp.class);
 
-    @Use({REG}) protected AllocatableValue addressValue;
-    @Use({REG}) protected AllocatableValue lengthValue;
+    @Use({REG}) protected Value addressValue;
+    @Use({REG}) protected Value lengthValue;
 
+    // The emitted code overwrites both the address and the length register, so the same values are
+    // also registered as temps.
+    @Temp({REG}) protected Value addressValueTemp;
+    @Temp({REG}) protected Value lengthValueTemp;
+
+    private final boolean isAligned;
     private final boolean useDcZva;
     private final int zvaLength;
 
     /**
      * Constructor of AArch64ZeroMemoryOp.
      *
-     * @param address allocatable 8-byte aligned base address of the memory chunk.
-     * @param length allocatable length of the memory chunk, the value must be multiple of 8.
+     * @param address starting address of the memory chunk to be zeroed.
+     * @param length size of the memory chunk to be zeroed, in bytes.
+     * @param isAligned whether both address and size are aligned to 8 bytes.
      * @param useDcZva whether the DC ZVA instruction can be used.
      * @param zvaLength the ZVA length info of current AArch64 CPU, negative value indicates length
      *            is unknown at compile time.
      */
-    public AArch64ZeroMemoryOp(AllocatableValue address, AllocatableValue length, boolean useDcZva, int zvaLength) {
+    public AArch64ZeroMemoryOp(Value address, Value length, boolean isAligned, boolean useDcZva, int zvaLength) {
         super(TYPE);
         this.addressValue = address;
         this.lengthValue = length;
+        this.addressValueTemp = address;
+        this.lengthValueTemp = length;
         this.useDcZva = useDcZva;
         this.zvaLength = zvaLength;
+        this.isAligned = isAligned;
     }
 
     @Override
     protected void emitCode(CompilationResultBuilder crb, AArch64MacroAssembler masm) {
         Register base = asRegister(addressValue);
         Register size = asRegister(lengthValue);
-        if (useDcZva && zvaLength > 0) {
-            // From ARMv8-A architecture reference manual D12.2.35 Data Cache Zero ID register:
-            // A valid ZVA length should be a power-of-2 value in [4, 2048]
-            assert (CodeUtil.isPowerOf2(zvaLength) && 4 <= zvaLength && zvaLength <= 2048);
-            emitZeroMemoryWithDc(masm, base, size, zvaLength);
-        } else {
-            // Use store pair instructions (STP) to zero memory as a fallback.
-            emitZeroMemoryWithStp(masm, base, size);
-        }
-    }
+
+        try (AArch64MacroAssembler.ScratchRegister scratchRegister = masm.getScratchRegister()) {
+            Register alignmentBits = scratchRegister.getRegister();
+
+            Label tail = new Label();
+            Label done = new Label();
+
+            // Jump to DONE if size is zero.
+            masm.cbz(64, size, done);
+
+            if (!isAligned) {
+                Label baseAlignedTo2Bytes = new Label();
+                Label baseAlignedTo4Bytes = new Label();
+                Label baseAlignedTo8Bytes = new Label();
+
+                // Jump to the per-byte zeroing loop if the size is less than 8 bytes.
+                masm.cmp(64, size, 8);
+                masm.branchConditionally(ConditionFlag.LT, tail);
+
+                // Make base 8-byte aligned
+                masm.neg(64, alignmentBits, base);
+                masm.and(64, alignmentBits, alignmentBits, 7);
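+                // alignmentBits = (-base) & 7, i.e. the number of bytes to store before base becomes
+                // 8-byte aligned; bits 0, 1 and 2 select the 1-, 2- and 4-byte stores below.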
+
+                masm.tbz(alignmentBits, 0, baseAlignedTo2Bytes);
+                masm.sub(64, size, size, 1);
+                masm.str(8, zr, AArch64Address.createPostIndexedImmediateAddress(base, 1));
+                masm.bind(baseAlignedTo2Bytes);
+
+                masm.tbz(alignmentBits, 1, baseAlignedTo4Bytes);
+                masm.sub(64, size, size, 2);
+                masm.str(16, zr, AArch64Address.createPostIndexedImmediateAddress(base, 2));
+                masm.bind(baseAlignedTo4Bytes);
 
-    /**
-     * Zero a chunk of memory with DC ZVA instructions.
-     *
-     * @param masm the AArch64 macro assembler.
-     * @param base base an 8-byte aligned address of the memory chunk to be zeroed.
-     * @param size size of the memory chunk to be zeroed, in bytes, must be multiple of 8.
-     * @param zvaLength the ZVA length info of current AArch64 CPU.
-     */
-    private static void emitZeroMemoryWithDc(AArch64MacroAssembler masm, Register base, Register size, int zvaLength) {
-        Label preLoop = new Label();
-        Label zvaLoop = new Label();
-        Label postLoop = new Label();
-        Label tail = new Label();
-        Label done = new Label();
+                masm.tbz(alignmentBits, 2, baseAlignedTo8Bytes);
+                masm.sub(64, size, size, 4);
+                masm.str(32, zr, AArch64Address.createPostIndexedImmediateAddress(base, 4));
+                masm.bind(baseAlignedTo8Bytes);
+                // At this point base is 8-byte aligned.
+            }
+
+            if (useDcZva && zvaLength > 0) {
+                // From ARMv8-A architecture reference manual D12.2.35 Data Cache Zero ID register:
+                // A valid ZVA length should be a power-of-2 value in [4, 2048]
+                assert (CodeUtil.isPowerOf2(zvaLength) && 4 <= zvaLength && zvaLength <= 2048);
 
-        try (AArch64MacroAssembler.ScratchRegister sc1 = masm.getScratchRegister()) {
-            Register rscratch1 = sc1.getRegister();
+                Label preCheck = new Label();
+                Label preLoop = new Label();
+                Label mainCheck = new Label();
+                Label mainLoop = new Label();
+                Label postCheck = new Label();
+                Label postLoop = new Label();
+
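+                // Count the number of bytes to be pre-zeroed in order to align base with the ZVA length.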
+                masm.neg(64, alignmentBits, base);
+                masm.and(64, alignmentBits, alignmentBits, zvaLength - 1);
 
-            // Count number of bytes to be pre-zeroed to align base address with ZVA length.
-            masm.neg(64, rscratch1, base);
-            masm.and(64, rscratch1, rscratch1, zvaLength - 1);
+                // Is size less than the number of bytes to be pre-zeroed? Jump to the post-loop check if so.
+                masm.cmp(64, size, alignmentBits);
+                masm.branchConditionally(AArch64Assembler.ConditionFlag.LE, postCheck);
+                masm.sub(64, size, size, alignmentBits);
+
+                // Pre loop: align base according to the supported bulk zeroing stride.
+                masm.jmp(preCheck);
+
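+                // Align the loop entry to a 16-byte (wordSize * 2) boundary.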
+                masm.align(crb.target.wordSize * 2);
+                masm.bind(preLoop);
+                masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
+                masm.bind(preCheck);
+                masm.subs(64, alignmentBits, alignmentBits, 8);
+                masm.branchConditionally(AArch64Assembler.ConditionFlag.GE, preLoop);
 
-            // Is size less than number of bytes to be pre-zeroed? Jump to POST_LOOP if so.
-            masm.cmp(64, size, rscratch1);
-            masm.branchConditionally(AArch64Assembler.ConditionFlag.LE, postLoop);
-            masm.sub(64, size, size, rscratch1);
+                // Main loop: bulk zeroing
+                masm.jmp(mainCheck);
+
+                masm.align(crb.target.wordSize * 2);
+                masm.bind(mainLoop);
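+                // DC ZVA zeroes one zvaLength-byte block starting at base (base is ZVA-aligned here).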
+                masm.dc(AArch64Assembler.DataCacheOperationType.ZVA, base);
+                masm.add(64, base, base, zvaLength);
+                masm.bind(mainCheck);
+                masm.subs(64, size, size, zvaLength);
+                masm.branchConditionally(AArch64Assembler.ConditionFlag.GE, mainLoop);
+
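+                // The main loop exits with size over-subtracted by one ZVA stride; restore the remaining count.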
+                masm.add(64, size, size, zvaLength);
+
+                // Post loop: handle bytes after the main loop
+                masm.jmp(postCheck);
 
-            // Pre-ZVA loop.
-            masm.bind(preLoop);
-            masm.subs(64, rscratch1, rscratch1, 8);
-            masm.branchConditionally(AArch64Assembler.ConditionFlag.LT, zvaLoop);
-            masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
-            masm.jmp(preLoop);
+                masm.align(crb.target.wordSize * 2);
+                masm.bind(postLoop);
+                masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
+                masm.bind(postCheck);
+                masm.subs(64, size, size, 8);
+                masm.branchConditionally(AArch64Assembler.ConditionFlag.GE, postLoop);
+
+                if (!isAligned) {
+                    // Restore size for tail zeroing
+                    masm.add(64, size, size, 8);
+                }
+            } else {
+                Label mainCheck = new Label();
+                Label mainLoop = new Label();
+
+                if (!isAligned) {
+                    // Aligning base may have reduced size below 8 bytes, so check again.
+                    masm.cmp(64, size, 8);
+                    masm.branchConditionally(ConditionFlag.LT, tail);
+                }
 
-            // ZVA loop.
-            masm.bind(zvaLoop);
-            masm.subs(64, size, size, zvaLength);
-            masm.branchConditionally(AArch64Assembler.ConditionFlag.LT, tail);
-            masm.dc(AArch64Assembler.DataCacheOperationType.ZVA, base);
-            masm.add(64, base, base, zvaLength);
-            masm.jmp(zvaLoop);
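+                // Is base already 16-byte aligned? Jump to the STP loop check if so; otherwise emit a
+                // single 8-byte store first.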
+                masm.tbz(base, 3, mainCheck);
+                masm.sub(64, size, size, 8);
+                masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
+                masm.jmp(mainCheck);
 
-            // Handle bytes after ZVA loop.
+                // The STP loop that zeros 16 bytes in each iteration.
+                masm.align(crb.target.wordSize * 2);
+                masm.bind(mainLoop);
+                masm.stp(64, zr, zr, AArch64Address.createPostIndexedImmediateAddress(base, 2));
+                masm.bind(mainCheck);
+                masm.subs(64, size, size, 16);
+                masm.branchConditionally(AArch64Assembler.ConditionFlag.GE, mainLoop);
+
+                // We may need to zero the tail 8 bytes of the memory chunk.
+                masm.add(64, size, size, 16);
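+                // size now holds the remaining byte count in [0, 16); bit 3 tells whether one more
+                // 8-byte store is needed.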
+                masm.tbz(size, 3, tail);
+                masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
+
+                if (!isAligned) {
+                    // Adjust size for tail zeroing
+                    masm.sub(64, size, size, 8);
+                }
+            }
+
             masm.bind(tail);
-            masm.add(64, size, size, zvaLength);
+            if (!isAligned) {
+                Label perByteZeroingLoop = new Label();
 
-            // Post-ZVA loop.
-            masm.bind(postLoop);
-            masm.subs(64, size, size, 8);
-            masm.branchConditionally(AArch64Assembler.ConditionFlag.LT, done);
-            masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
-            masm.jmp(postLoop);
-
-            // Done.
+                masm.cbz(64, size, done);
+                // The cbz above guarantees size > 0 on entry to the following loop.
+                masm.align(crb.target.wordSize * 2);
+                masm.bind(perByteZeroingLoop);
+                masm.str(8, zr, AArch64Address.createPostIndexedImmediateAddress(base, 1));
+                masm.subs(64, size, size, 1);
+                masm.branchConditionally(AArch64Assembler.ConditionFlag.NE, perByteZeroingLoop);
+            }
             masm.bind(done);
         }
     }
 
-    /**
-     * Zero a chunk of memory with STP instructions.
-     *
-     * @param masm the AArch64 macro assembler.
-     * @param base base an 8-byte aligned address of the memory chunk to be zeroed.
-     * @param size size of the memory chunk to be zeroed, in bytes, must be multiple of 8.
-     */
-    private static void emitZeroMemoryWithStp(AArch64MacroAssembler masm, Register base, Register size) {
-        Label loop = new Label();
-        Label tail = new Label();
-        Label done = new Label();
-
-        // Jump to DONE if size is zero.
-        masm.cbz(64, size, done);
-
-        // Is base address already 16-byte aligned? Jump to LDP loop if so.
-        masm.tbz(base, 3, loop);
-        masm.sub(64, size, size, 8);
-        masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
-
-        // The STP loop that zeros 16 bytes in each iteration.
-        masm.bind(loop);
-        masm.subs(64, size, size, 16);
-        masm.branchConditionally(AArch64Assembler.ConditionFlag.LT, tail);
-        masm.stp(64, zr, zr, AArch64Address.createPostIndexedImmediateAddress(base, 2));
-        masm.jmp(loop);
-
-        // We may need to zero the tail 8 bytes of the memory chunk.
-        masm.bind(tail);
-        masm.adds(64, size, size, 16);
-        masm.branchConditionally(AArch64Assembler.ConditionFlag.EQ, done);
-        masm.str(64, zr, AArch64Address.createPostIndexedImmediateAddress(base, 8));
-
-        // Done.
-        masm.bind(done);
-    }
 }