src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.asm.amd64/src/org/graalvm/compiler/asm/amd64/AMD64MacroAssembler.java
author erikj
Tue, 12 Sep 2017 19:03:39 +0200
changeset 47216 71c04702a3d5
parent 46344 hotspot/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.asm.amd64/src/org/graalvm/compiler/asm/amd64/AMD64MacroAssembler.java@694c102fd8ed
child 47798 9fe9292f5931
permissions -rw-r--r--
8187443: Forest Consolidation: Move files to unified layout Reviewed-by: darcy, ihse

/*
 * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package org.graalvm.compiler.asm.amd64;

import static jdk.vm.ci.amd64.AMD64.rax;
import static jdk.vm.ci.amd64.AMD64.rcx;
import static jdk.vm.ci.amd64.AMD64.rdx;
import static jdk.vm.ci.amd64.AMD64.rsp;
import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseIncDec;
import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmLoadAndClearUpper;
import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmRegToRegMoveAll;

import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;

import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.code.TargetDescription;

/**
 * This class implements commonly used X86 code patterns.
 */
public class AMD64MacroAssembler extends AMD64Assembler {

    /**
     * Creates a macro assembler for the given target by delegating to the
     * {@link AMD64Assembler} constructor.
     *
     * @param target description of the target the code is assembled for
     */
    public AMD64MacroAssembler(TargetDescription target) {
        super(target);
    }

    /**
     * Emits code subtracting the constant {@code value} from the 64-bit register {@code reg}.
     * Nothing is emitted for {@code value == 0}; a negative {@code value} is turned into the
     * matching {@link #incrementq(Register, int)}.
     *
     * @param reg register to be decremented
     * @param value amount to subtract
     */
    public final void decrementq(Register reg, int value) {
        if (value == Integer.MIN_VALUE) {
            // -value is not representable, so this case cannot be flipped into an increment
            subq(reg, value);
        } else if (value < 0) {
            incrementq(reg, -value);
        } else if (value == 1 && UseIncDec) {
            decq(reg);
        } else if (value != 0) {
            subq(reg, value);
        }
    }

    /**
     * Emits code subtracting the constant {@code value} from the 64-bit memory operand
     * {@code dst}. Nothing is emitted for {@code value == 0}; a negative {@code value} is turned
     * into the matching {@link #incrementq(AMD64Address, int)}.
     *
     * @param dst memory operand to be decremented
     * @param value amount to subtract
     */
    public final void decrementq(AMD64Address dst, int value) {
        if (value == Integer.MIN_VALUE) {
            // -value is not representable, so this case cannot be flipped into an increment
            subq(dst, value);
        } else if (value < 0) {
            incrementq(dst, -value);
        } else if (value == 1 && UseIncDec) {
            decq(dst);
        } else if (value != 0) {
            subq(dst, value);
        }
    }

    /**
     * Emits code adding the constant {@code value} to the 64-bit register {@code reg}. Nothing
     * is emitted for {@code value == 0}; a negative {@code value} is turned into the matching
     * {@link #decrementq(Register, int)}.
     *
     * @param reg register to be incremented
     * @param value amount to add
     */
    public void incrementq(Register reg, int value) {
        if (value == Integer.MIN_VALUE) {
            // -value is not representable, so this case cannot be flipped into a decrement
            addq(reg, value);
        } else if (value < 0) {
            decrementq(reg, -value);
        } else if (value == 1 && UseIncDec) {
            incq(reg);
        } else if (value != 0) {
            addq(reg, value);
        }
    }

    /**
     * Emits code adding the constant {@code value} to the 64-bit memory operand {@code dst}.
     * Nothing is emitted for {@code value == 0}; a negative {@code value} is turned into the
     * matching {@link #decrementq(AMD64Address, int)}.
     *
     * @param dst memory operand to be incremented
     * @param value amount to add
     */
    public final void incrementq(AMD64Address dst, int value) {
        if (value == Integer.MIN_VALUE) {
            // -value is not representable, so this case cannot be flipped into a decrement
            addq(dst, value);
        } else if (value < 0) {
            decrementq(dst, -value);
        } else if (value == 1 && UseIncDec) {
            incq(dst);
        } else if (value != 0) {
            addq(dst, value);
        }
    }

    /**
     * Loads a pointer-sized value from memory into a register; implemented as {@code movq}.
     *
     * @param dst destination register
     * @param src memory operand to load from
     */
    public final void movptr(Register dst, AMD64Address src) {
        movq(dst, src);
    }

    /**
     * Stores a pointer-sized value from a register to memory; implemented as {@code movq}.
     *
     * @param dst memory operand to store to
     * @param src source register
     */
    public final void movptr(AMD64Address dst, Register src) {
        movq(dst, src);
    }

    /**
     * Stores an {@code int} constant into a pointer-sized memory slot; implemented as
     * {@code movslq}.
     *
     * @param dst memory operand to store to
     * @param src constant to store (presumably sign-extended to 64 bits by {@code movslq} --
     *            confirm against the assembler)
     */
    public final void movptr(AMD64Address dst, int src) {
        movslq(dst, src);
    }

    /**
     * Compares two pointer-sized registers; implemented as {@code cmpq}.
     *
     * @param src1 first operand
     * @param src2 second operand
     */
    public final void cmpptr(Register src1, Register src2) {
        cmpq(src1, src2);
    }

    /**
     * Compares a pointer-sized register with a memory operand; implemented as {@code cmpq}.
     *
     * @param src1 register operand
     * @param src2 memory operand
     */
    public final void cmpptr(Register src1, AMD64Address src2) {
        cmpq(src1, src2);
    }

    /**
     * Emits code decrementing the 32-bit register {@code reg} by one. Shorthand for
     * {@link #decrementl(Register, int) decrementl(reg, 1)}.
     *
     * @param reg register to be decremented
     */
    public final void decrementl(Register reg) {
        decrementl(reg, 1);
    }

    /**
     * Emits code subtracting the constant {@code value} from the 32-bit register {@code reg}.
     * Nothing is emitted for {@code value == 0}; a negative {@code value} is turned into the
     * matching {@link #incrementl(Register, int)}.
     *
     * @param reg register to be decremented
     * @param value amount to subtract
     */
    public final void decrementl(Register reg, int value) {
        if (value == Integer.MIN_VALUE) {
            // -value is not representable, so this case cannot be flipped into an increment
            subl(reg, value);
        } else if (value < 0) {
            incrementl(reg, -value);
        } else if (value == 1 && UseIncDec) {
            decl(reg);
        } else if (value != 0) {
            subl(reg, value);
        }
    }

    /**
     * Emits code subtracting the constant {@code value} from the 32-bit memory operand
     * {@code dst}. Nothing is emitted for {@code value == 0}; a negative {@code value} is turned
     * into the matching {@link #incrementl(AMD64Address, int)}.
     *
     * @param dst memory operand to be decremented
     * @param value amount to subtract
     */
    public final void decrementl(AMD64Address dst, int value) {
        if (value == Integer.MIN_VALUE) {
            // -value is not representable, so this case cannot be flipped into an increment
            subl(dst, value);
        } else if (value < 0) {
            incrementl(dst, -value);
        } else if (value == 1 && UseIncDec) {
            decl(dst);
        } else if (value != 0) {
            subl(dst, value);
        }
    }

    /**
     * Emits code adding the constant {@code value} to the 32-bit register {@code reg}. Nothing
     * is emitted for {@code value == 0}; a negative {@code value} is turned into the matching
     * {@link #decrementl(Register, int)}.
     *
     * @param reg register to be incremented
     * @param value amount to add
     */
    public final void incrementl(Register reg, int value) {
        if (value == Integer.MIN_VALUE) {
            // -value is not representable, so this case cannot be flipped into a decrement
            addl(reg, value);
        } else if (value < 0) {
            decrementl(reg, -value);
        } else if (value == 1 && UseIncDec) {
            incl(reg);
        } else if (value != 0) {
            addl(reg, value);
        }
    }

    /**
     * Emits code adding the constant {@code value} to the 32-bit memory operand {@code dst}.
     * Nothing is emitted for {@code value == 0}; a negative {@code value} is turned into the
     * matching {@link #decrementl(AMD64Address, int)}.
     *
     * @param dst memory operand to be incremented
     * @param value amount to add
     */
    public final void incrementl(AMD64Address dst, int value) {
        if (value == Integer.MIN_VALUE) {
            // -value is not representable, so this case cannot be flipped into a decrement
            addl(dst, value);
        } else if (value < 0) {
            decrementl(dst, -value);
        } else if (value == 1 && UseIncDec) {
            incl(dst);
        } else if (value != 0) {
            addl(dst, value);
        }
    }

    /**
     * Emits a single-precision float move between two XMM registers. The instruction is
     * {@code movaps} when {@code UseXmmRegToRegMoveAll} is set and {@code movss} otherwise.
     *
     * @param dst destination register, must be in the XMM category
     * @param src source register, must be in the XMM category
     */
    public void movflt(Register dst, Register src) {
        assert dst.getRegisterCategory().equals(AMD64.XMM);
        assert src.getRegisterCategory().equals(AMD64.XMM);
        if (!UseXmmRegToRegMoveAll) {
            movss(dst, src);
            return;
        }
        movaps(dst, src);
    }

    /**
     * Emits a single-precision float load from memory into an XMM register using
     * {@code movss}.
     *
     * @param dst destination register, must be in the XMM category
     * @param src memory operand to load from
     */
    public void movflt(Register dst, AMD64Address src) {
        assert dst.getRegisterCategory().equals(AMD64.XMM);
        movss(dst, src);
    }

    /**
     * Emits a single-precision float store from an XMM register to memory using
     * {@code movss}.
     *
     * @param dst memory operand to store to
     * @param src source register, must be in the XMM category
     */
    public void movflt(AMD64Address dst, Register src) {
        assert src.getRegisterCategory().equals(AMD64.XMM);
        movss(dst, src);
    }

    /**
     * Emits a double-precision float move between two XMM registers. The instruction is
     * {@code movapd} when {@code UseXmmRegToRegMoveAll} is set and {@code movsd} otherwise.
     *
     * @param dst destination register, must be in the XMM category
     * @param src source register, must be in the XMM category
     */
    public void movdbl(Register dst, Register src) {
        assert dst.getRegisterCategory().equals(AMD64.XMM);
        assert src.getRegisterCategory().equals(AMD64.XMM);
        if (!UseXmmRegToRegMoveAll) {
            movsd(dst, src);
            return;
        }
        movapd(dst, src);
    }

    /**
     * Emits a double-precision float load from memory into an XMM register. The instruction is
     * {@code movsd} when {@code UseXmmLoadAndClearUpper} is set and {@code movlpd} otherwise.
     *
     * @param dst destination register, must be in the XMM category
     * @param src memory operand to load from
     */
    public void movdbl(Register dst, AMD64Address src) {
        assert dst.getRegisterCategory().equals(AMD64.XMM);
        if (!UseXmmLoadAndClearUpper) {
            movlpd(dst, src);
            return;
        }
        movsd(dst, src);
    }

    /**
     * Emits a double-precision float store from an XMM register to memory using
     * {@code movsd}.
     *
     * @param dst memory operand to store to
     * @param src source register, must be in the XMM category
     */
    public void movdbl(AMD64Address dst, Register src) {
        assert src.getRegisterCategory().equals(AMD64.XMM);
        movsd(dst, src);
    }

    /**
     * Non-atomic write of a 64-bit constant to memory. Do not use if the address might be a
     * volatile field!
     * <p>
     * A constant that fits in an {@code int} is written with one quad-word {@code MOV} taking a
     * 32-bit immediate. Any other constant is written as two separate 32-bit stores: the low
     * word at {@code dst} and the high word 4 bytes above it.
     *
     * @param dst memory operand receiving the constant
     * @param src the 64-bit constant to store
     */
    public final void movlong(AMD64Address dst, long src) {
        if (!NumUtil.isInt(src)) {
            int lowWord = (int) src;
            int highWord = (int) (src >> 32);
            AMD64Address upperHalf = new AMD64Address(dst.getBase(), dst.getIndex(), dst.getScale(), dst.getDisplacement() + 4);
            movl(dst, lowWord);
            movl(upperHalf, highWord);
            return;
        }
        AMD64MIOp.MOV.emit(this, OperandSize.QWORD, dst, (int) src);
    }

    /**
     * Emits an x87 logarithm computation of the double in {@code value}, leaving the result in
     * {@code dest}. The scaling constant is pushed first ({@code fldlg2} for base 10,
     * {@code fldln2} otherwise), then the operand is loaded and combined with {@code fyl2x}.
     *
     * @param dest XMM register receiving the result
     * @param value XMM register holding the operand
     * @param base10 {@code true} for the base-10 logarithm, {@code false} for the natural one
     */
    public final void flog(Register dest, Register value, boolean base10) {
        if (!base10) {
            fldln2();
        } else {
            fldlg2();
        }
        AMD64Address stackSlot = trigPrologue(value);
        fyl2x();
        trigEpilogue(dest, stackSlot);
    }

    /**
     * Emits an x87 sine computation: moves {@code value} onto the x87 stack via a temporary
     * stack slot, applies {@code fsin}, and moves the result back into {@code dest}.
     *
     * @param dest XMM register receiving the result
     * @param value XMM register holding the operand
     */
    public final void fsin(Register dest, Register value) {
        AMD64Address stackSlot = trigPrologue(value);
        fsin();
        trigEpilogue(dest, stackSlot);
    }

    /**
     * Emits an x87 cosine computation: moves {@code value} onto the x87 stack via a temporary
     * stack slot, applies {@code fcos}, and moves the result back into {@code dest}.
     *
     * @param dest XMM register receiving the result
     * @param value XMM register holding the operand
     */
    public final void fcos(Register dest, Register value) {
        AMD64Address stackSlot = trigPrologue(value);
        fcos();
        trigEpilogue(dest, stackSlot);
    }

    /**
     * Emits an x87 tangent computation: moves {@code value} onto the x87 stack via a temporary
     * stack slot, applies {@code fptan}, and moves the result back into {@code dest}.
     *
     * @param dest XMM register receiving the result
     * @param value XMM register holding the operand
     */
    public final void ftan(Register dest, Register value) {
        AMD64Address stackSlot = trigPrologue(value);
        fptan();
        // fptan pushes 1.0 on top of the actual result; pop it before storing the result
        fstp(0);
        trigEpilogue(dest, stackSlot);
    }

    /**
     * Emits code discarding the top of the x87 register stack without storing it:
     * {@code ffree(0)} followed by {@code fincstp}.
     */
    public final void fpop() {
        ffree(0);
        fincstp();
    }

    /**
     * Common entry sequence for the x87 helpers: reserves a double-sized slot on the stack,
     * spills {@code value} into it and loads it onto the x87 stack with {@code fldd}.
     *
     * @param value XMM register holding the operand
     * @return the {@code [rsp]} address of the slot, to be handed to
     *         {@link #trigEpilogue(Register, AMD64Address)}
     */
    private AMD64Address trigPrologue(Register value) {
        assert value.getRegisterCategory().equals(AMD64.XMM);
        AMD64Address stackSlot = new AMD64Address(AMD64.rsp);
        int slotSize = AMD64Kind.DOUBLE.getSizeInBytes();
        subq(AMD64.rsp, slotSize);
        movdbl(stackSlot, value);
        fldd(stackSlot);
        return stackSlot;
    }

    /**
     * Common exit sequence for the x87 helpers: stores the x87 result into the stack slot with
     * {@code fstpd}, reloads it into {@code dest} and releases the slot again.
     *
     * @param dest XMM register receiving the result
     * @param tmp the stack slot address returned by {@link #trigPrologue(Register)}
     */
    private void trigEpilogue(Register dest, AMD64Address tmp) {
        assert dest.getRegisterCategory().equals(AMD64.XMM);
        fstpd(tmp);
        movdbl(dest, tmp);
        int slotSize = AMD64Kind.DOUBLE.getSizeInBytes();
        addq(AMD64.rsp, slotSize);
    }

    /**
     * Emits an indexOf search for constant substrings with size >= 8 chars, which don't need to
     * be loaded through the stack.
     * <p>
     * The search is built around {@code pcmpestri}, whose implicit operands dictate the register
     * assignment checked by the assertion below. On exit {@code result} holds the element index
     * of the match, or -1 when no match was found.
     *
     * @param str1 register holding the address of the scanned string
     * @param str2 register holding the address of the substring
     * @param cnt1 register holding the string length in elements; must be {@code rdx}
     * @param cnt2 register used for the substring length; must be {@code rax}. It is loaded from
     *            {@code intCnt2} by the emitted code.
     * @param intCnt2 the constant substring length in elements; must be at least 8
     * @param result register receiving the result
     * @param vec vector register that holds the substring chunk being compared
     * @param tmp must be {@code rcx}; receives the index reported by {@code pcmpestri}
     */
    public void stringIndexofC8(Register str1, Register str2,
                    Register cnt1, Register cnt2,
                    int intCnt2, Register result,
                    Register vec, Register tmp) {
        // assert(UseSSE42Intrinsics, "SSE4.2 is required");

        // This method uses pcmpestri instruction with bound registers
        // inputs:
        // xmm - substring
        // rax - substring length (elements count)
        // mem - scanned string
        // rdx - string length (elements count)
        // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
        // outputs:
        // rcx - matched index in string
        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";

        Label reloadSubstr = new Label();
        Label scanToSubstr = new Label();
        Label scanSubstr = new Label();
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label exit = new Label();
        Label foundSubstr = new Label();
        Label matchSubstrHead = new Label();
        Label reloadStr = new Label();
        Label foundCandidate = new Label();

        // Note, inline_string_indexOf() generates checks:
        // if (substr.count > string.count) return -1;
        // if (substr.count == 0) return 0;
        assert intCnt2 >= 8 : "this code isused only for cnt2 >= 8 chars";

        // Load substring.
        movdqu(vec, new AMD64Address(str2, 0));
        movl(cnt2, intCnt2);
        movq(result, str1); // string addr

        if (intCnt2 > 8) {
            jmpb(scanToSubstr);

            // Reload substr for rescan, this code
            // is executed only for large substrings (> 8 chars)
            bind(reloadSubstr);
            movdqu(vec, new AMD64Address(str2, 0));
            negq(cnt2); // Jumped here with negative cnt2, convert to positive

            bind(reloadStr);
            // We came here after the beginning of the substring was
            // matched but the rest of it was not so we need to search
            // again. Start from the next element after the previous match.

            // cnt2 is number of substring remaining elements and
            // cnt1 is number of string remaining elements when cmp failed.
            // Restored cnt1 = cnt1 - cnt2 + int_cnt2
            subl(cnt1, cnt2);
            addl(cnt1, intCnt2);
            movl(cnt2, intCnt2); // Now restore cnt2

            decrementl(cnt1, 1);     // Shift to next element
            cmpl(cnt1, cnt2);
            jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring

            addq(result, 2);

        } // (int_cnt2 > 8)

        // Scan string for start of substr in 16-byte vectors
        bind(scanToSubstr);
        pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
        jccb(ConditionFlag.Below, foundCandidate);   // CF == 1
        subl(cnt1, 8);
        jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
        cmpl(cnt1, cnt2);
        jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring
        addq(result, 16);
        jmpb(scanToSubstr);

        // Found a potential substr
        bind(foundCandidate);
        // Matched whole vector if first element matched (tmp(rcx) == 0).
        if (intCnt2 == 8) {
            jccb(ConditionFlag.Overflow, retFound);    // OF == 1
        } else { // int_cnt2 > 8
            jccb(ConditionFlag.Overflow, foundSubstr);
        }
        // After pcmpestri tmp(rcx) contains matched element index
        // Compute start addr of substr
        leaq(result, new AMD64Address(result, tmp, Scale.Times2, 0));

        // Make sure string is still long enough
        subl(cnt1, tmp);
        cmpl(cnt1, cnt2);
        if (intCnt2 == 8) {
            jccb(ConditionFlag.GreaterEqual, scanToSubstr);
        } else { // int_cnt2 > 8
            jccb(ConditionFlag.GreaterEqual, matchSubstrHead);
        }
        // Left less than substring.

        bind(retNotFound);
        movl(result, -1);
        jmpb(exit);

        if (intCnt2 > 8) {
            // This code is optimized for the case when whole substring
            // is matched if its head is matched.
            bind(matchSubstrHead);
            pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
            // Reload only string if does not match
            jccb(ConditionFlag.NoOverflow, reloadStr); // OF == 0

            Label contScanSubstr = new Label();
            // Compare the rest of substring (> 8 chars).
            bind(foundSubstr);
            // First 8 chars are already matched.
            // From here on cnt2 is a negative element offset (8 - int_cnt2) that is
            // advanced towards zero; the addresses below add int_cnt2 back in.
            negq(cnt2);
            addq(cnt2, 8);

            bind(scanSubstr);
            subl(cnt1, 8);
            cmpl(cnt2, -8); // Do not read beyond substring
            jccb(ConditionFlag.LessEqual, contScanSubstr);
            // Back-up strings to avoid reading beyond substring:
            // cnt1 = cnt1 - cnt2 + 8
            addl(cnt1, cnt2); // cnt2 is negative
            addl(cnt1, 8);
            movl(cnt2, 8);
            negq(cnt2);
            bind(contScanSubstr);
            if (intCnt2 < 1024 * 1024 * 1024) {
                movdqu(vec, new AMD64Address(str2, cnt2, Scale.Times2, intCnt2 * 2));
                pcmpestri(vec, new AMD64Address(result, cnt2, Scale.Times2, intCnt2 * 2), 0x0d);
            } else {
                // calculate index in register to avoid integer overflow (int_cnt2*2)
                movl(tmp, intCnt2);
                addq(tmp, cnt2);
                movdqu(vec, new AMD64Address(str2, tmp, Scale.Times2, 0));
                pcmpestri(vec, new AMD64Address(result, tmp, Scale.Times2, 0), 0x0d);
            }
            // Need to reload strings pointers if not matched whole vector
            jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0
            addq(cnt2, 8);
            jcc(ConditionFlag.Negative, scanSubstr);
            // Fall through if found full substring

        } // (int_cnt2 > 8)

        bind(retFound);
        // Found result if we matched full small substring.
        // Compute substr offset
        subq(result, str1);
        shrl(result, 1); // index (byte offset halved: elements are 2 bytes wide)
        bind(exit);

    } // string_indexofC8

    /**
     * Emits an indexOf search for a small constant substring (fewer than 8 chars) or for a
     * non-constant substring. Small strings are loaded through the stack if they cross a page
     * boundary.
     * <p>
     * The search is built around {@code pcmpestri}, whose implicit operands dictate the register
     * assignment checked by the assertion below. The emitted code saves the incoming stack
     * pointer, may copy short strings to the stack, and restores the stack pointer on exit. On
     * exit {@code result} holds the element index of the match, or -1 when no match was found.
     *
     * @param str1 register holding the address of the scanned string; may be overwritten by the
     *            emitted code
     * @param str2 register holding the address of the substring; may be overwritten by the
     *            emitted code
     * @param cnt1 register holding the string length in elements; must be {@code rdx}
     * @param cnt2 register holding the substring length in elements; must be {@code rax}
     * @param intCnt2 length of a small (&lt; 8 chars) constant substring, or -1 for a
     *            non-constant substring whose length is in {@code cnt2}
     * @param result register receiving the result; also used as scratch
     * @param vec vector register that holds the substring chunk being compared
     * @param tmp must be {@code rcx}; holds the saved stack pointer during setup and receives the
     *            index reported by {@code pcmpestri} afterwards
     * @param vmPageSize the VM page size in bytes, used for the page-crossing checks
     */
    public void stringIndexOf(Register str1, Register str2,
                    Register cnt1, Register cnt2,
                    int intCnt2, Register result,
                    Register vec, Register tmp, int vmPageSize) {
        //
        // int_cnt2 is length of small (< 8 chars) constant substring
        // or (-1) for non constant substring in which case its length
        // is in cnt2 register.
        //
        // Note, inline_string_indexOf() generates checks:
        // if (substr.count > string.count) return -1;
        // if (substr.count == 0) return 0;
        //
        assert intCnt2 == -1 || (0 < intCnt2 && intCnt2 < 8) : "should be != 0";

        // This method uses pcmpestri instruction with bound registers
        // inputs:
        // xmm - substring
        // rax - substring length (elements count)
        // mem - scanned string
        // rdx - string length (elements count)
        // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
        // outputs:
        // rcx - matched index in string
        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";

        Label reloadSubstr = new Label();
        Label scanToSubstr = new Label();
        Label scanSubstr = new Label();
        Label adjustStr = new Label();
        Label retFound = new Label();
        Label retNotFound = new Label();
        Label cleanup = new Label();
        Label foundSubstr = new Label();
        Label foundCandidate = new Label();

        int wordSize = 8;
        // We don't know where these strings are located
        // and we can't read beyond them. Load them through stack.
        Label bigStrings = new Label();
        Label checkStr = new Label();
        Label copySubstr = new Label();
        Label copyStr = new Label();

        movq(tmp, rsp); // save old SP

        if (intCnt2 > 0) {     // small (< 8 chars) constant substring
            if (intCnt2 == 1) {  // One char
                movzwl(result, new AMD64Address(str2, 0));
                movdl(vec, result); // move 32 bits
            } else if (intCnt2 == 2) { // Two chars
                movdl(vec, new AMD64Address(str2, 0)); // move 32 bits
            } else if (intCnt2 == 4) { // Four chars
                movq(vec, new AMD64Address(str2, 0));  // move 64 bits
            } else { // cnt2 = { 3, 5, 6, 7 }
                // Array header size is 12 bytes in 32-bit VM
                // + 6 bytes for 3 chars == 18 bytes,
                // enough space to load vec and shift.
                movdqu(vec, new AMD64Address(str2, (intCnt2 * 2) - 16));
                psrldq(vec, 16 - (intCnt2 * 2));
            }
        } else { // not constant substring
            cmpl(cnt2, 8);
            jccb(ConditionFlag.AboveEqual, bigStrings); // Both strings are big enough

            // We can read beyond string if str+16 does not cross page boundary
            // since heaps are aligned and mapped by pages.
            assert vmPageSize < 1024 * 1024 * 1024 : "default page should be small";
            movl(result, str2); // We need only low 32 bits
            andl(result, (vmPageSize - 1));
            cmpl(result, (vmPageSize - 16));
            jccb(ConditionFlag.BelowEqual, checkStr);

            // Move small strings to stack to allow load 16 bytes into vec.
            subq(rsp, 16);
            // The copy loop indexes from cnt2 down to 1 (hence -2); the pushed cnt2
            // sits below the 16-byte buffer (hence +wordSize).
            int stackOffset = wordSize - 2;
            push(cnt2);

            bind(copySubstr);
            movzwl(result, new AMD64Address(str2, cnt2, Scale.Times2, -2));
            movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
            decrementl(cnt2, 1);
            jccb(ConditionFlag.NotZero, copySubstr);

            pop(cnt2);
            movq(str2, rsp);  // New substring address
        } // non constant

        bind(checkStr);
        cmpl(cnt1, 8);
        jccb(ConditionFlag.AboveEqual, bigStrings);

        // Check cross page boundary.
        movl(result, str1); // We need only low 32 bits
        andl(result, (vmPageSize - 1));
        cmpl(result, (vmPageSize - 16));
        jccb(ConditionFlag.BelowEqual, bigStrings);

        subq(rsp, 16);
        int stackOffset = -2;
        if (intCnt2 < 0) { // not constant
            push(cnt2);
            stackOffset += wordSize;
        }
        // cnt2 serves as the loop counter for the copy below
        movl(cnt2, cnt1);

        bind(copyStr);
        movzwl(result, new AMD64Address(str1, cnt2, Scale.Times2, -2));
        movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
        decrementl(cnt2, 1);
        jccb(ConditionFlag.NotZero, copyStr);

        if (intCnt2 < 0) { // not constant
            pop(cnt2);
        }
        movq(str1, rsp);  // New string address

        bind(bigStrings);
        // Load substring.
        if (intCnt2 < 0) { // -1
            movdqu(vec, new AMD64Address(str2, 0));
            push(cnt2);       // substr count
            push(str2);       // substr addr
            push(str1);       // string addr
        } else {
            // Small (< 8 chars) constant substrings are loaded already.
            movl(cnt2, intCnt2);
        }
        push(tmp);  // original SP
        // Finished loading

        // ========================================================
        // Start search
        //

        movq(result, str1); // string addr

        if (intCnt2 < 0) {  // Only for non constant substring
            jmpb(scanToSubstr);

            // SP saved at sp+0
            // String saved at sp+1*wordSize
            // Substr saved at sp+2*wordSize
            // Substr count saved at sp+3*wordSize

            // Reload substr for rescan, this code
            // is executed only for large substrings (> 8 chars)
            bind(reloadSubstr);
            movq(str2, new AMD64Address(rsp, 2 * wordSize));
            movl(cnt2, new AMD64Address(rsp, 3 * wordSize));
            movdqu(vec, new AMD64Address(str2, 0));
            // We came here after the beginning of the substring was
            // matched but the rest of it was not so we need to search
            // again. Start from the next element after the previous match.
            subq(str1, result); // Restore counter
            shrl(str1, 1);
            addl(cnt1, str1);
            decrementl(cnt1);   // Shift to next element
            cmpl(cnt1, cnt2);
            jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring

            addq(result, 2);
        } // non constant

        // Scan string for start of substr in 16-byte vectors
        bind(scanToSubstr);
        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
        pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
        jccb(ConditionFlag.Below, foundCandidate);   // CF == 1
        subl(cnt1, 8);
        jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
        cmpl(cnt1, cnt2);
        jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring
        addq(result, 16);

        bind(adjustStr);
        cmpl(cnt1, 8); // Do not read beyond string
        jccb(ConditionFlag.GreaterEqual, scanToSubstr);
        // Back-up string to avoid reading beyond string.
        leaq(result, new AMD64Address(result, cnt1, Scale.Times2, -16));
        movl(cnt1, 8);
        jmpb(scanToSubstr);

        // Found a potential substr
        bind(foundCandidate);
        // After pcmpestri tmp(rcx) contains matched element index

        // Make sure string is still long enough
        subl(cnt1, tmp);
        cmpl(cnt1, cnt2);
        jccb(ConditionFlag.GreaterEqual, foundSubstr);
        // Left less than substring.

        bind(retNotFound);
        movl(result, -1);
        jmpb(cleanup);

        bind(foundSubstr);
        // Compute start addr of substr
        leaq(result, new AMD64Address(result, tmp, Scale.Times2));

        if (intCnt2 > 0) { // Constant substring
            // Repeat search for small substring (< 8 chars)
            // from new point without reloading substring.
            // Have to check that we don't read beyond string.
            cmpl(tmp, 8 - intCnt2);
            jccb(ConditionFlag.Greater, adjustStr);
            // Fall through if matched whole substring.
        } else { // non constant
            assert intCnt2 == -1 : "should be != 0";

            addl(tmp, cnt2);
            // Found result if we matched whole substring.
            cmpl(tmp, 8);
            jccb(ConditionFlag.LessEqual, retFound);

            // Repeat search for small substring (<= 8 chars)
            // from new point 'str1' without reloading substring.
            cmpl(cnt2, 8);
            // Have to check that we don't read beyond string.
            jccb(ConditionFlag.LessEqual, adjustStr);

            Label checkNext = new Label();
            Label contScanSubstr = new Label();
            Label retFoundLong = new Label();
            // Compare the rest of substring (> 8 chars).
            movq(str1, result);

            cmpl(tmp, cnt2);
            // First 8 chars are already matched.
            jccb(ConditionFlag.Equal, checkNext);

            bind(scanSubstr);
            pcmpestri(vec, new AMD64Address(str1, 0), 0x0d);
            // Need to reload strings pointers if not matched whole vector
            jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0

            bind(checkNext);
            subl(cnt2, 8);
            jccb(ConditionFlag.LessEqual, retFoundLong); // Found full substring
            addq(str1, 16);
            addq(str2, 16);
            subl(cnt1, 8);
            cmpl(cnt2, 8); // Do not read beyond substring
            jccb(ConditionFlag.GreaterEqual, contScanSubstr);
            // Back-up strings to avoid reading beyond substring.
            leaq(str2, new AMD64Address(str2, cnt2, Scale.Times2, -16));
            leaq(str1, new AMD64Address(str1, cnt2, Scale.Times2, -16));
            subl(cnt1, cnt2);
            movl(cnt2, 8);
            addl(cnt1, 8);
            bind(contScanSubstr);
            movdqu(vec, new AMD64Address(str2, 0));
            jmpb(scanSubstr);

            bind(retFoundLong);
            // Reload the string address saved on the stack (sp+1*wordSize)
            movq(str1, new AMD64Address(rsp, wordSize));
        } // non constant

        bind(retFound);
        // Compute substr offset
        subq(result, str1);
        shrl(result, 1); // index (byte offset halved: elements are 2 bytes wide)

        bind(cleanup);
        pop(rsp); // restore SP

    }

}