src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.asm.amd64/src/org/graalvm/compiler/asm/amd64/AMD64MacroAssembler.java
changeset 47216 71c04702a3d5
parent 46344 694c102fd8ed
child 47798 9fe9292f5931
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.asm.amd64/src/org/graalvm/compiler/asm/amd64/AMD64MacroAssembler.java	Tue Sep 12 19:03:39 2017 +0200
@@ -0,0 +1,764 @@
+/*
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.graalvm.compiler.asm.amd64;
+
+import static jdk.vm.ci.amd64.AMD64.rax;
+import static jdk.vm.ci.amd64.AMD64.rcx;
+import static jdk.vm.ci.amd64.AMD64.rdx;
+import static jdk.vm.ci.amd64.AMD64.rsp;
+import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseIncDec;
+import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmLoadAndClearUpper;
+import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmRegToRegMoveAll;
+
+import org.graalvm.compiler.asm.Label;
+import org.graalvm.compiler.core.common.NumUtil;
+import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
+
+import jdk.vm.ci.amd64.AMD64;
+import jdk.vm.ci.amd64.AMD64Kind;
+import jdk.vm.ci.code.Register;
+import jdk.vm.ci.code.TargetDescription;
+
+/**
+ * This class implements commonly used X86 code patterns.
+ */
+public class AMD64MacroAssembler extends AMD64Assembler {
+
+    /**
+     * Creates a macro assembler for the given target by forwarding {@code target} to the
+     * {@link AMD64Assembler} constructor.
+     *
+     * @param target description of the target passed on to the superclass
+     */
+    public AMD64MacroAssembler(TargetDescription target) {
+        super(target);
+    }
+
+    /**
+     * Emits code that decrements the quadword register {@code reg} by the constant {@code value}.
+     * A zero amount emits nothing and a negative amount is forwarded to
+     * {@link #incrementq(Register, int)} with the sign flipped.
+     *
+     * @param reg register to decrement
+     * @param value amount to subtract
+     */
+    public final void decrementq(Register reg, int value) {
+        if (value == 0) {
+            // Nothing to subtract, so no instruction is emitted.
+            return;
+        }
+        if (value < 0 && value != Integer.MIN_VALUE) {
+            // Integer.MIN_VALUE is excluded: -MIN_VALUE == MIN_VALUE, so forwarding it would
+            // bounce between increment and decrement forever.
+            incrementq(reg, -value);
+        } else if (UseIncDec && value == 1) {
+            decq(reg);
+        } else {
+            // General case; also taken for Integer.MIN_VALUE.
+            subq(reg, value);
+        }
+    }
+
+    /**
+     * Emits code that decrements the quadword at memory operand {@code dst} by the constant
+     * {@code value}. A zero amount emits nothing and a negative amount is forwarded to
+     * {@link #incrementq(AMD64Address, int)} with the sign flipped.
+     *
+     * @param dst memory operand to decrement
+     * @param value amount to subtract
+     */
+    public final void decrementq(AMD64Address dst, int value) {
+        if (value == 0) {
+            // Nothing to subtract, so no instruction is emitted.
+            return;
+        }
+        if (value < 0 && value != Integer.MIN_VALUE) {
+            // Integer.MIN_VALUE is excluded: -MIN_VALUE == MIN_VALUE, so forwarding it would
+            // bounce between increment and decrement forever.
+            incrementq(dst, -value);
+        } else if (UseIncDec && value == 1) {
+            decq(dst);
+        } else {
+            // General case; also taken for Integer.MIN_VALUE.
+            subq(dst, value);
+        }
+    }
+
+    /**
+     * Emits code that increments the quadword register {@code reg} by the constant {@code value}.
+     * A zero amount emits nothing and a negative amount is forwarded to
+     * {@link #decrementq(Register, int)} with the sign flipped.
+     *
+     * @param reg register to increment
+     * @param value amount to add
+     */
+    public void incrementq(Register reg, int value) {
+        if (value == 0) {
+            // Nothing to add, so no instruction is emitted.
+            return;
+        }
+        if (value < 0 && value != Integer.MIN_VALUE) {
+            // Integer.MIN_VALUE is excluded: -MIN_VALUE == MIN_VALUE, so forwarding it would
+            // bounce between decrement and increment forever.
+            decrementq(reg, -value);
+        } else if (UseIncDec && value == 1) {
+            incq(reg);
+        } else {
+            // General case; also taken for Integer.MIN_VALUE.
+            addq(reg, value);
+        }
+    }
+
+    /**
+     * Emits code that increments the quadword at memory operand {@code dst} by the constant
+     * {@code value}. A zero amount emits nothing and a negative amount is forwarded to
+     * {@link #decrementq(AMD64Address, int)} with the sign flipped.
+     *
+     * @param dst memory operand to increment
+     * @param value amount to add
+     */
+    public final void incrementq(AMD64Address dst, int value) {
+        if (value == 0) {
+            // Nothing to add, so no instruction is emitted.
+            return;
+        }
+        if (value < 0 && value != Integer.MIN_VALUE) {
+            // Integer.MIN_VALUE is excluded: -MIN_VALUE == MIN_VALUE, so forwarding it would
+            // bounce between decrement and increment forever.
+            decrementq(dst, -value);
+        } else if (UseIncDec && value == 1) {
+            incq(dst);
+        } else {
+            // General case; also taken for Integer.MIN_VALUE.
+            addq(dst, value);
+        }
+    }
+
+    /**
+     * Pointer-sized load from memory into a register; delegates to {@code movq}.
+     *
+     * @param dst destination register
+     * @param src memory operand to read
+     */
+    public final void movptr(Register dst, AMD64Address src) {
+        this.movq(dst, src);
+    }
+
+    /**
+     * Pointer-sized store of a register to memory; delegates to {@code movq}.
+     *
+     * @param dst memory operand to write
+     * @param src source register
+     */
+    public final void movptr(AMD64Address dst, Register src) {
+        this.movq(dst, src);
+    }
+
+    /**
+     * Pointer-sized store of an {@code int} constant to memory; delegates to {@code movslq}.
+     *
+     * @param dst memory operand to write
+     * @param src constant to store
+     */
+    public final void movptr(AMD64Address dst, int src) {
+        this.movslq(dst, src);
+    }
+
+    /**
+     * Pointer-sized comparison of two registers; delegates to {@code cmpq}.
+     *
+     * @param src1 first operand
+     * @param src2 second operand
+     */
+    public final void cmpptr(Register src1, Register src2) {
+        this.cmpq(src1, src2);
+    }
+
+    /**
+     * Pointer-sized comparison of a register with a memory operand; delegates to {@code cmpq}.
+     *
+     * @param src1 register operand
+     * @param src2 memory operand
+     */
+    public final void cmpptr(Register src1, AMD64Address src2) {
+        this.cmpq(src1, src2);
+    }
+
+    /**
+     * Emits code that decrements {@code reg} by one; shorthand for
+     * {@link #decrementl(Register, int) decrementl(reg, 1)}.
+     *
+     * @param reg register to decrement
+     */
+    public final void decrementl(Register reg) {
+        this.decrementl(reg, 1);
+    }
+
+    /**
+     * Emits code that decrements the doubleword register {@code reg} by the constant
+     * {@code value}. A zero amount emits nothing and a negative amount is forwarded to
+     * {@link #incrementl(Register, int)} with the sign flipped.
+     *
+     * @param reg register to decrement
+     * @param value amount to subtract
+     */
+    public final void decrementl(Register reg, int value) {
+        if (value == 0) {
+            // Nothing to subtract, so no instruction is emitted.
+            return;
+        }
+        if (value < 0 && value != Integer.MIN_VALUE) {
+            // Integer.MIN_VALUE is excluded: -MIN_VALUE == MIN_VALUE, so forwarding it would
+            // bounce between increment and decrement forever.
+            incrementl(reg, -value);
+        } else if (UseIncDec && value == 1) {
+            decl(reg);
+        } else {
+            // General case; also taken for Integer.MIN_VALUE.
+            subl(reg, value);
+        }
+    }
+
+    /**
+     * Emits code that decrements the doubleword at memory operand {@code dst} by the constant
+     * {@code value}. A zero amount emits nothing and a negative amount is forwarded to
+     * {@link #incrementl(AMD64Address, int)} with the sign flipped.
+     *
+     * @param dst memory operand to decrement
+     * @param value amount to subtract
+     */
+    public final void decrementl(AMD64Address dst, int value) {
+        if (value == 0) {
+            // Nothing to subtract, so no instruction is emitted.
+            return;
+        }
+        if (value < 0 && value != Integer.MIN_VALUE) {
+            // Integer.MIN_VALUE is excluded: -MIN_VALUE == MIN_VALUE, so forwarding it would
+            // bounce between increment and decrement forever.
+            incrementl(dst, -value);
+        } else if (UseIncDec && value == 1) {
+            decl(dst);
+        } else {
+            // General case; also taken for Integer.MIN_VALUE.
+            subl(dst, value);
+        }
+    }
+
+    /**
+     * Emits code that increments the doubleword register {@code reg} by the constant
+     * {@code value}. A zero amount emits nothing and a negative amount is forwarded to
+     * {@link #decrementl(Register, int)} with the sign flipped.
+     *
+     * @param reg register to increment
+     * @param value amount to add
+     */
+    public final void incrementl(Register reg, int value) {
+        if (value == 0) {
+            // Nothing to add, so no instruction is emitted.
+            return;
+        }
+        if (value < 0 && value != Integer.MIN_VALUE) {
+            // Integer.MIN_VALUE is excluded: -MIN_VALUE == MIN_VALUE, so forwarding it would
+            // bounce between decrement and increment forever.
+            decrementl(reg, -value);
+        } else if (UseIncDec && value == 1) {
+            incl(reg);
+        } else {
+            // General case; also taken for Integer.MIN_VALUE.
+            addl(reg, value);
+        }
+    }
+
+    /**
+     * Emits code that increments the doubleword at memory operand {@code dst} by the constant
+     * {@code value}. A zero amount emits nothing and a negative amount is forwarded to
+     * {@link #decrementl(AMD64Address, int)} with the sign flipped.
+     *
+     * @param dst memory operand to increment
+     * @param value amount to add
+     */
+    public final void incrementl(AMD64Address dst, int value) {
+        if (value == 0) {
+            // Nothing to add, so no instruction is emitted.
+            return;
+        }
+        if (value < 0 && value != Integer.MIN_VALUE) {
+            // Integer.MIN_VALUE is excluded: -MIN_VALUE == MIN_VALUE, so forwarding it would
+            // bounce between decrement and increment forever.
+            decrementl(dst, -value);
+        } else if (UseIncDec && value == 1) {
+            incl(dst);
+        } else {
+            // General case; also taken for Integer.MIN_VALUE.
+            addl(dst, value);
+        }
+    }
+
+    /**
+     * Emits a register-to-register move of a float between two XMM registers. Uses
+     * {@code movaps} when {@code UseXmmRegToRegMoveAll} is set and {@code movss} otherwise.
+     *
+     * @param dst destination register; asserted to be in the XMM category
+     * @param src source register; asserted to be in the XMM category
+     */
+    public void movflt(Register dst, Register src) {
+        assert dst.getRegisterCategory().equals(AMD64.XMM);
+        assert src.getRegisterCategory().equals(AMD64.XMM);
+        if (!UseXmmRegToRegMoveAll) {
+            movss(dst, src);
+            return;
+        }
+        movaps(dst, src);
+    }
+
+    /**
+     * Emits a float load from memory into an XMM register using {@code movss}.
+     *
+     * @param dst destination register; asserted to be in the XMM category
+     * @param src memory operand to read
+     */
+    public void movflt(Register dst, AMD64Address src) {
+        assert dst.getRegisterCategory().equals(AMD64.XMM);
+        this.movss(dst, src);
+    }
+
+    /**
+     * Emits a float store from an XMM register to memory using {@code movss}.
+     *
+     * @param dst memory operand to write
+     * @param src source register; asserted to be in the XMM category
+     */
+    public void movflt(AMD64Address dst, Register src) {
+        assert src.getRegisterCategory().equals(AMD64.XMM);
+        this.movss(dst, src);
+    }
+
+    /**
+     * Emits a register-to-register move of a double between two XMM registers. Uses
+     * {@code movapd} when {@code UseXmmRegToRegMoveAll} is set and {@code movsd} otherwise.
+     *
+     * @param dst destination register; asserted to be in the XMM category
+     * @param src source register; asserted to be in the XMM category
+     */
+    public void movdbl(Register dst, Register src) {
+        assert dst.getRegisterCategory().equals(AMD64.XMM);
+        assert src.getRegisterCategory().equals(AMD64.XMM);
+        if (!UseXmmRegToRegMoveAll) {
+            movsd(dst, src);
+            return;
+        }
+        movapd(dst, src);
+    }
+
+    /**
+     * Emits a double load from memory into an XMM register. Uses {@code movsd} when
+     * {@code UseXmmLoadAndClearUpper} is set and {@code movlpd} otherwise.
+     *
+     * @param dst destination register; asserted to be in the XMM category
+     * @param src memory operand to read
+     */
+    public void movdbl(Register dst, AMD64Address src) {
+        assert dst.getRegisterCategory().equals(AMD64.XMM);
+        if (!UseXmmLoadAndClearUpper) {
+            movlpd(dst, src);
+            return;
+        }
+        movsd(dst, src);
+    }
+
+    /**
+     * Emits a double store from an XMM register to memory using {@code movsd}.
+     *
+     * @param dst memory operand to write
+     * @param src source register; asserted to be in the XMM category
+     */
+    public void movdbl(AMD64Address dst, Register src) {
+        assert src.getRegisterCategory().equals(AMD64.XMM);
+        this.movsd(dst, src);
+    }
+
+    /**
+     * Non-atomic write of a 64-bit constant to memory. Do not use if the address might be a
+     * volatile field!
+     * <p>
+     * When {@code NumUtil.isInt(src)} holds, the constant is written with a single quadword MOV
+     * taking an {@code int} immediate. Otherwise it is split into two 32-bit stores: the low half
+     * at {@code dst} and the high half at {@code dst} with its displacement increased by 4.
+     *
+     * @param dst memory operand to write
+     * @param src 64-bit constant to store
+     */
+    public final void movlong(AMD64Address dst, long src) {
+        if (NumUtil.isInt(src)) {
+            AMD64MIOp.MOV.emit(this, OperandSize.QWORD, dst, (int) src);
+        } else {
+            AMD64Address high = new AMD64Address(dst.getBase(), dst.getIndex(), dst.getScale(), dst.getDisplacement() + 4);
+            // The mask must be a long literal: the int literal 0xFFFFFFFF is -1, which widens to
+            // an all-ones long and therefore masks nothing.
+            movl(dst, (int) (src & 0xFFFFFFFFL));
+            movl(high, (int) (src >> 32));
+        }
+    }
+
+    /**
+     * Emits an x87 logarithm sequence: loads the constant selected by {@code base10}
+     * ({@code fldlg2} for base 10, {@code fldln2} otherwise), pushes {@code value} via
+     * {@code trigPrologue}, applies {@code fyl2x} and stores the result to {@code dest}.
+     *
+     * @param dest XMM register receiving the result
+     * @param value XMM register holding the argument
+     * @param base10 {@code true} to load the constant with {@code fldlg2}, {@code false} for
+     *            {@code fldln2}
+     */
+    public final void flog(Register dest, Register value, boolean base10) {
+        // The constant is loaded before the operand is pushed by trigPrologue.
+        if (!base10) {
+            fldln2();
+        } else {
+            fldlg2();
+        }
+        AMD64Address scratch = trigPrologue(value);
+        fyl2x();
+        trigEpilogue(dest, scratch);
+    }
+
+    /**
+     * Emits an x87 {@code fsin} on {@code value} and stores the result to {@code dest}. The
+     * operand travels through a temporary stack slot set up by {@code trigPrologue}.
+     *
+     * @param dest XMM register receiving the result
+     * @param value XMM register holding the argument
+     */
+    public final void fsin(Register dest, Register value) {
+        AMD64Address scratch = trigPrologue(value);
+        fsin();
+        trigEpilogue(dest, scratch);
+    }
+
+    /**
+     * Emits an x87 {@code fcos} on {@code value} and stores the result to {@code dest}. The
+     * operand travels through a temporary stack slot set up by {@code trigPrologue}.
+     *
+     * @param dest XMM register receiving the result
+     * @param value XMM register holding the argument
+     */
+    public final void fcos(Register dest, Register value) {
+        AMD64Address scratch = trigPrologue(value);
+        fcos();
+        trigEpilogue(dest, scratch);
+    }
+
+    /**
+     * Emits an x87 {@code fptan} on {@code value} and stores the result to {@code dest}. The
+     * operand travels through a temporary stack slot set up by {@code trigPrologue}.
+     *
+     * @param dest XMM register receiving the result
+     * @param value XMM register holding the argument
+     */
+    public final void ftan(Register dest, Register value) {
+        AMD64Address scratch = trigPrologue(value);
+        fptan();
+        // fptan leaves an extra 1.0 above the actual result; pop it before storing.
+        fstp(0);
+        trigEpilogue(dest, scratch);
+    }
+
+    /**
+     * Emits {@code ffree(0)} followed by {@code fincstp}, discarding the top x87 stack entry
+     * without storing it.
+     */
+    public final void fpop() {
+        this.ffree(0);
+        this.fincstp();
+    }
+
+    /**
+     * Emits the common entry sequence of the x87 helpers: reserves one double-sized slot below
+     * {@code rsp}, spills {@code value} into it and loads it with {@code fldd}.
+     *
+     * @param value XMM register holding the operand; asserted to be in the XMM category
+     * @return the {@code [rsp]} address of the slot, to be handed to {@code trigEpilogue}
+     */
+    private AMD64Address trigPrologue(Register value) {
+        assert value.getRegisterCategory().equals(AMD64.XMM);
+        int slotSize = AMD64Kind.DOUBLE.getSizeInBytes();
+        subq(rsp, slotSize);
+        AMD64Address slot = new AMD64Address(rsp);
+        movdbl(slot, value);
+        fldd(slot);
+        return slot;
+    }
+
+    /**
+     * Emits the common exit sequence of the x87 helpers: stores the x87 result to the slot with
+     * {@code fstpd}, reloads it into {@code dest} and releases the slot again.
+     *
+     * @param dest XMM register receiving the result; asserted to be in the XMM category
+     * @param tmp slot address returned by {@code trigPrologue}
+     */
+    private void trigEpilogue(Register dest, AMD64Address tmp) {
+        assert dest.getRegisterCategory().equals(AMD64.XMM);
+        fstpd(tmp);
+        movdbl(dest, tmp);
+        int slotSize = AMD64Kind.DOUBLE.getSizeInBytes();
+        addq(rsp, slotSize);
+    }
+
+    /**
+     * Emits an indexOf search for a constant substring of at least 8 chars. Such substrings
+     * don't need to be loaded through the stack.
+     * <p>
+     * The emitted code is built around {@code pcmpestri}, so {@code cnt1}, {@code cnt2} and
+     * {@code tmp} must be {@code rdx}, {@code rax} and {@code rcx} respectively (asserted).
+     * Elements are addressed with a scale of 2 bytes.
+     *
+     * @param str1 register holding the address of the string to scan
+     * @param str2 register holding the address of the substring
+     * @param cnt1 string length (elements count); must be {@code rdx}
+     * @param cnt2 substring length (elements count); must be {@code rax}, loaded from
+     *            {@code intCnt2} by the emitted code
+     * @param intCnt2 constant substring length; asserted to be at least 8
+     * @param result register receiving the element index of the match, or -1 if not found
+     * @param vec vector register that the substring is loaded into
+     * @param tmp must be {@code rcx}; holds the matched element index after {@code pcmpestri}
+     */
+    public void stringIndexofC8(Register str1, Register str2,
+                    Register cnt1, Register cnt2,
+                    int intCnt2, Register result,
+                    Register vec, Register tmp) {
+        // assert(UseSSE42Intrinsics, "SSE4.2 is required");
+
+        // This method uses pcmpestri instruction with bound registers
+        // inputs:
+        // xmm - substring
+        // rax - substring length (elements count)
+        // mem - scanned string
+        // rdx - string length (elements count)
+        // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
+        // outputs:
+        // rcx - matched index in string
+        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
+
+        Label reloadSubstr = new Label();
+        Label scanToSubstr = new Label();
+        Label scanSubstr = new Label();
+        Label retFound = new Label();
+        Label retNotFound = new Label();
+        Label exit = new Label();
+        Label foundSubstr = new Label();
+        Label matchSubstrHead = new Label();
+        Label reloadStr = new Label();
+        Label foundCandidate = new Label();
+
+        // Note, inline_string_indexOf() generates checks:
+        // if (substr.count > string.count) return -1;
+        // if (substr.count == 0) return 0;
+        assert intCnt2 >= 8 : "this code isused only for cnt2 >= 8 chars";
+
+        // Load substring.
+        movdqu(vec, new AMD64Address(str2, 0));
+        movl(cnt2, intCnt2);
+        movq(result, str1); // string addr
+
+        if (intCnt2 > 8) {
+            // Skip over the reload code below on the first pass.
+            jmpb(scanToSubstr);
+
+            // Reload substr for rescan, this code
+            // is executed only for large substrings (> 8 chars)
+            bind(reloadSubstr);
+            movdqu(vec, new AMD64Address(str2, 0));
+            negq(cnt2); // Jumped here with negative cnt2, convert to positive
+
+            bind(reloadStr);
+            // We came here after the beginning of the substring was
+            // matched but the rest of it was not so we need to search
+            // again. Start from the next element after the previous match.
+
+            // cnt2 is number of substring remaining elements and
+            // cnt1 is number of string remaining elements when cmp failed.
+            // Restored cnt1 = cnt1 - cnt2 + int_cnt2
+            subl(cnt1, cnt2);
+            addl(cnt1, intCnt2);
+            movl(cnt2, intCnt2); // Now restore cnt2
+
+            decrementl(cnt1, 1);     // Shift to next element
+            cmpl(cnt1, cnt2);
+            jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring
+
+            addq(result, 2);
+
+        } // (int_cnt2 > 8)
+
+        // Scan string for start of substr in 16-byte vectors
+        bind(scanToSubstr);
+        pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
+        jccb(ConditionFlag.Below, foundCandidate);   // CF == 1
+        subl(cnt1, 8);
+        jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
+        cmpl(cnt1, cnt2);
+        jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring
+        addq(result, 16);
+        jmpb(scanToSubstr);
+
+        // Found a potential substr
+        bind(foundCandidate);
+        // Matched whole vector if first element matched (tmp(rcx) == 0).
+        if (intCnt2 == 8) {
+            jccb(ConditionFlag.Overflow, retFound);    // OF == 1
+        } else { // int_cnt2 > 8
+            jccb(ConditionFlag.Overflow, foundSubstr);
+        }
+        // After pcmpestri tmp(rcx) contains matched element index
+        // Compute start addr of substr
+        leaq(result, new AMD64Address(result, tmp, Scale.Times2, 0));
+
+        // Make sure string is still long enough
+        subl(cnt1, tmp);
+        cmpl(cnt1, cnt2);
+        if (intCnt2 == 8) {
+            jccb(ConditionFlag.GreaterEqual, scanToSubstr);
+        } else { // int_cnt2 > 8
+            jccb(ConditionFlag.GreaterEqual, matchSubstrHead);
+        }
+        // Left less than substring.
+
+        bind(retNotFound);
+        movl(result, -1);
+        jmpb(exit);
+
+        if (intCnt2 > 8) {
+            // This code is optimized for the case when whole substring
+            // is matched if its head is matched.
+            bind(matchSubstrHead);
+            pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
+            // Reload only string if does not match
+            jccb(ConditionFlag.NoOverflow, reloadStr); // OF == 0
+
+            Label contScanSubstr = new Label();
+            // Compare the rest of substring (> 8 chars).
+            bind(foundSubstr);
+            // First 8 chars are already matched.
+            // From here on cnt2 is kept negative (see reloadSubstr, which negates it back).
+            negq(cnt2);
+            addq(cnt2, 8);
+
+            bind(scanSubstr);
+            subl(cnt1, 8);
+            cmpl(cnt2, -8); // Do not read beyond substring
+            jccb(ConditionFlag.LessEqual, contScanSubstr);
+            // Back-up strings to avoid reading beyond substring:
+            // cnt1 = cnt1 - cnt2 + 8
+            addl(cnt1, cnt2); // cnt2 is negative
+            addl(cnt1, 8);
+            movl(cnt2, 8);
+            negq(cnt2);
+            bind(contScanSubstr);
+            if (intCnt2 < 1024 * 1024 * 1024) {
+                // intCnt2 * 2 fits in the int displacement here.
+                movdqu(vec, new AMD64Address(str2, cnt2, Scale.Times2, intCnt2 * 2));
+                pcmpestri(vec, new AMD64Address(result, cnt2, Scale.Times2, intCnt2 * 2), 0x0d);
+            } else {
+                // calculate index in register to avoid integer overflow (int_cnt2*2)
+                movl(tmp, intCnt2);
+                addq(tmp, cnt2);
+                movdqu(vec, new AMD64Address(str2, tmp, Scale.Times2, 0));
+                pcmpestri(vec, new AMD64Address(result, tmp, Scale.Times2, 0), 0x0d);
+            }
+            // Need to reload strings pointers if not matched whole vector
+            jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0
+            addq(cnt2, 8);
+            jcc(ConditionFlag.Negative, scanSubstr);
+            // Fall through if found full substring
+
+        } // (int_cnt2 > 8)
+
+        bind(retFound);
+        // Found result if we matched full small substring.
+        // Compute substr offset
+        subq(result, str1);
+        shrl(result, 1); // index
+        bind(exit);
+
+    } // string_indexofC8
+
+    /**
+     * Emits an indexOf search for a small constant substring (fewer than 8 chars) or for a
+     * substring whose length is not constant. Small strings are loaded through the stack if they
+     * cross a page boundary.
+     * <p>
+     * The emitted code is built around {@code pcmpestri}, so {@code cnt1}, {@code cnt2} and
+     * {@code tmp} must be {@code rdx}, {@code rax} and {@code rcx} respectively (asserted). It
+     * saves {@code rsp} in {@code tmp} on entry, pushes that value once setup is complete and
+     * restores {@code rsp} with a final {@code pop(rsp)}. Some paths overwrite {@code str1} and
+     * {@code str2}.
+     *
+     * @param str1 register holding the address of the string to scan
+     * @param str2 register holding the address of the substring
+     * @param cnt1 string length (elements count); must be {@code rdx}
+     * @param cnt2 substring length (elements count); must be {@code rax}
+     * @param intCnt2 constant substring length (asserted to be 1 to 7), or -1 for a non constant
+     *            substring whose length is in {@code cnt2}
+     * @param result register receiving the element index of the match, or -1 if not found
+     * @param vec vector register that the substring is loaded into
+     * @param tmp must be {@code rcx}; holds the matched element index after {@code pcmpestri}
+     * @param vmPageSize page size used for the cross-page checks; used as a mask via
+     *            {@code vmPageSize - 1}, so presumably a power of two
+     */
+    public void stringIndexOf(Register str1, Register str2,
+                    Register cnt1, Register cnt2,
+                    int intCnt2, Register result,
+                    Register vec, Register tmp, int vmPageSize) {
+        //
+        // int_cnt2 is length of small (< 8 chars) constant substring
+        // or (-1) for non constant substring in which case its length
+        // is in cnt2 register.
+        //
+        // Note, inline_string_indexOf() generates checks:
+        // if (substr.count > string.count) return -1;
+        // if (substr.count == 0) return 0;
+        //
+        assert intCnt2 == -1 || (0 < intCnt2 && intCnt2 < 8) : "should be != 0";
+
+        // This method uses pcmpestri instruction with bound registers
+        // inputs:
+        // xmm - substring
+        // rax - substring length (elements count)
+        // mem - scanned string
+        // rdx - string length (elements count)
+        // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
+        // outputs:
+        // rcx - matched index in string
+        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
+
+        Label reloadSubstr = new Label();
+        Label scanToSubstr = new Label();
+        Label scanSubstr = new Label();
+        Label adjustStr = new Label();
+        Label retFound = new Label();
+        Label retNotFound = new Label();
+        Label cleanup = new Label();
+        Label foundSubstr = new Label();
+        Label foundCandidate = new Label();
+
+        int wordSize = 8;
+        // We don't know where these strings are located
+        // and we can't read beyond them. Load them through stack.
+        Label bigStrings = new Label();
+        Label checkStr = new Label();
+        Label copySubstr = new Label();
+        Label copyStr = new Label();
+
+        movq(tmp, rsp); // save old SP
+
+        if (intCnt2 > 0) {     // small (< 8 chars) constant substring
+            if (intCnt2 == 1) {  // One char
+                movzwl(result, new AMD64Address(str2, 0));
+                movdl(vec, result); // move 32 bits
+            } else if (intCnt2 == 2) { // Two chars
+                movdl(vec, new AMD64Address(str2, 0)); // move 32 bits
+            } else if (intCnt2 == 4) { // Four chars
+                movq(vec, new AMD64Address(str2, 0));  // move 64 bits
+            } else { // cnt2 = { 3, 5, 6, 7 }
+                // Array header size is 12 bytes in 32-bit VM
+                // + 6 bytes for 3 chars == 18 bytes,
+                // enough space to load vec and shift.
+                // Load the 16 bytes ending at the substring's end, then shift the
+                // leading bytes that precede the substring out of the vector.
+                movdqu(vec, new AMD64Address(str2, (intCnt2 * 2) - 16));
+                psrldq(vec, 16 - (intCnt2 * 2));
+            }
+        } else { // not constant substring
+            cmpl(cnt2, 8);
+            jccb(ConditionFlag.AboveEqual, bigStrings); // Both strings are big enough
+
+            // We can read beyond string if str+16 does not cross page boundary
+            // since heaps are aligned and mapped by pages.
+            assert vmPageSize < 1024 * 1024 * 1024 : "default page should be small";
+            movl(result, str2); // We need only low 32 bits
+            andl(result, (vmPageSize - 1));
+            cmpl(result, (vmPageSize - 16));
+            jccb(ConditionFlag.BelowEqual, checkStr);
+
+            // Move small strings to stack to allow load 16 bytes into vec.
+            subq(rsp, 16);
+            // The pushed cnt2 occupies one word below the copy area; the -2 compensates
+            // for the loop index running from cnt2 down to 1.
+            int stackOffset = wordSize - 2;
+            push(cnt2);
+
+            bind(copySubstr);
+            movzwl(result, new AMD64Address(str2, cnt2, Scale.Times2, -2));
+            movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
+            decrementl(cnt2, 1);
+            jccb(ConditionFlag.NotZero, copySubstr);
+
+            pop(cnt2);
+            movq(str2, rsp);  // New substring address
+        } // non constant
+
+        bind(checkStr);
+        cmpl(cnt1, 8);
+        jccb(ConditionFlag.AboveEqual, bigStrings);
+
+        // Check cross page boundary.
+        movl(result, str1); // We need only low 32 bits
+        andl(result, (vmPageSize - 1));
+        cmpl(result, (vmPageSize - 16));
+        jccb(ConditionFlag.BelowEqual, bigStrings);
+
+        subq(rsp, 16);
+        int stackOffset = -2;
+        if (intCnt2 < 0) { // not constant
+            push(cnt2);
+            stackOffset += wordSize;
+        }
+        // cnt2 serves as the loop counter of the copy loop below.
+        movl(cnt2, cnt1);
+
+        bind(copyStr);
+        movzwl(result, new AMD64Address(str1, cnt2, Scale.Times2, -2));
+        movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
+        decrementl(cnt2, 1);
+        jccb(ConditionFlag.NotZero, copyStr);
+
+        if (intCnt2 < 0) { // not constant
+            pop(cnt2);
+        }
+        movq(str1, rsp);  // New string address
+
+        bind(bigStrings);
+        // Load substring.
+        if (intCnt2 < 0) { // -1
+            movdqu(vec, new AMD64Address(str2, 0));
+            push(cnt2);       // substr count
+            push(str2);       // substr addr
+            push(str1);       // string addr
+        } else {
+            // Small (< 8 chars) constant substrings are loaded already.
+            movl(cnt2, intCnt2);
+        }
+        push(tmp);  // original SP
+        // Finished loading
+
+        // ========================================================
+        // Start search
+        //
+
+        movq(result, str1); // string addr
+
+        if (intCnt2 < 0) {  // Only for non constant substring
+            // Skip over the reload code below on the first pass.
+            jmpb(scanToSubstr);
+
+            // SP saved at sp+0
+            // String saved at sp+1*wordSize
+            // Substr saved at sp+2*wordSize
+            // Substr count saved at sp+3*wordSize
+
+            // Reload substr for rescan, this code
+            // is executed only for large substrings (> 8 chars)
+            bind(reloadSubstr);
+            movq(str2, new AMD64Address(rsp, 2 * wordSize));
+            movl(cnt2, new AMD64Address(rsp, 3 * wordSize));
+            movdqu(vec, new AMD64Address(str2, 0));
+            // We came here after the beginning of the substring was
+            // matched but the rest of it was not so we need to search
+            // again. Start from the next element after the previous match.
+            subq(str1, result); // Restore counter
+            shrl(str1, 1);
+            addl(cnt1, str1);
+            decrementl(cnt1);   // Shift to next element
+            cmpl(cnt1, cnt2);
+            jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring
+
+            addq(result, 2);
+        } // non constant
+
+        // Scan string for start of substr in 16-byte vectors
+        bind(scanToSubstr);
+        assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
+        pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
+        jccb(ConditionFlag.Below, foundCandidate);   // CF == 1
+        subl(cnt1, 8);
+        jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
+        cmpl(cnt1, cnt2);
+        jccb(ConditionFlag.Negative, retNotFound);  // Left less than substring
+        addq(result, 16);
+
+        bind(adjustStr);
+        cmpl(cnt1, 8); // Do not read beyond string
+        jccb(ConditionFlag.GreaterEqual, scanToSubstr);
+        // Back-up string to avoid reading beyond string.
+        leaq(result, new AMD64Address(result, cnt1, Scale.Times2, -16));
+        movl(cnt1, 8);
+        jmpb(scanToSubstr);
+
+        // Found a potential substr
+        bind(foundCandidate);
+        // After pcmpestri tmp(rcx) contains matched element index
+
+        // Make sure string is still long enough
+        subl(cnt1, tmp);
+        cmpl(cnt1, cnt2);
+        jccb(ConditionFlag.GreaterEqual, foundSubstr);
+        // Left less than substring.
+
+        bind(retNotFound);
+        movl(result, -1);
+        jmpb(cleanup);
+
+        bind(foundSubstr);
+        // Compute start addr of substr
+        leaq(result, new AMD64Address(result, tmp, Scale.Times2));
+
+        if (intCnt2 > 0) { // Constant substring
+            // Repeat search for small substring (< 8 chars)
+            // from new point without reloading substring.
+            // Have to check that we don't read beyond string.
+            cmpl(tmp, 8 - intCnt2);
+            jccb(ConditionFlag.Greater, adjustStr);
+            // Fall through if matched whole substring.
+        } else { // non constant
+            assert intCnt2 == -1 : "should be != 0";
+
+            addl(tmp, cnt2);
+            // Found result if we matched whole substring.
+            cmpl(tmp, 8);
+            jccb(ConditionFlag.LessEqual, retFound);
+
+            // Repeat search for small substring (<= 8 chars)
+            // from new point 'str1' without reloading substring.
+            cmpl(cnt2, 8);
+            // Have to check that we don't read beyond string.
+            jccb(ConditionFlag.LessEqual, adjustStr);
+
+            Label checkNext = new Label();
+            Label contScanSubstr = new Label();
+            Label retFoundLong = new Label();
+            // Compare the rest of substring (> 8 chars).
+            movq(str1, result);
+
+            cmpl(tmp, cnt2);
+            // First 8 chars are already matched.
+            jccb(ConditionFlag.Equal, checkNext);
+
+            bind(scanSubstr);
+            pcmpestri(vec, new AMD64Address(str1, 0), 0x0d);
+            // Need to reload strings pointers if not matched whole vector
+            jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0
+
+            bind(checkNext);
+            subl(cnt2, 8);
+            jccb(ConditionFlag.LessEqual, retFoundLong); // Found full substring
+            addq(str1, 16);
+            addq(str2, 16);
+            subl(cnt1, 8);
+            cmpl(cnt2, 8); // Do not read beyond substring
+            jccb(ConditionFlag.GreaterEqual, contScanSubstr);
+            // Back-up strings to avoid reading beyond substring.
+            leaq(str2, new AMD64Address(str2, cnt2, Scale.Times2, -16));
+            leaq(str1, new AMD64Address(str1, cnt2, Scale.Times2, -16));
+            subl(cnt1, cnt2);
+            movl(cnt2, 8);
+            addl(cnt1, 8);
+            bind(contScanSubstr);
+            movdqu(vec, new AMD64Address(str2, 0));
+            jmpb(scanSubstr);
+
+            bind(retFoundLong);
+            // Reload the string address saved at sp+1*wordSize.
+            movq(str1, new AMD64Address(rsp, wordSize));
+        } // non constant
+
+        bind(retFound);
+        // Compute substr offset
+        subq(result, str1);
+        shrl(result, 1); // index
+
+        bind(cleanup);
+        pop(rsp); // restore SP
+
+    }
+
+}