--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.asm.amd64/src/org/graalvm/compiler/asm/amd64/AMD64MacroAssembler.java Tue Sep 12 19:03:39 2017 +0200
@@ -0,0 +1,764 @@
+/*
+ * Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.graalvm.compiler.asm.amd64;
+
+import static jdk.vm.ci.amd64.AMD64.rax;
+import static jdk.vm.ci.amd64.AMD64.rcx;
+import static jdk.vm.ci.amd64.AMD64.rdx;
+import static jdk.vm.ci.amd64.AMD64.rsp;
+import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseIncDec;
+import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmLoadAndClearUpper;
+import static org.graalvm.compiler.asm.amd64.AMD64AsmOptions.UseXmmRegToRegMoveAll;
+
+import org.graalvm.compiler.asm.Label;
+import org.graalvm.compiler.core.common.NumUtil;
+import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
+
+import jdk.vm.ci.amd64.AMD64;
+import jdk.vm.ci.amd64.AMD64Kind;
+import jdk.vm.ci.code.Register;
+import jdk.vm.ci.code.TargetDescription;
+
+/**
+ * This class implements commonly used X86 code patterns.
+ */
+public class AMD64MacroAssembler extends AMD64Assembler {
+
+ public AMD64MacroAssembler(TargetDescription target) {
+ super(target);
+ }
+
+ public final void decrementq(Register reg, int value) {
+ if (value == Integer.MIN_VALUE) {
+ subq(reg, value);
+ return;
+ }
+ if (value < 0) {
+ incrementq(reg, -value);
+ return;
+ }
+ if (value == 0) {
+ return;
+ }
+ if (value == 1 && UseIncDec) {
+ decq(reg);
+ } else {
+ subq(reg, value);
+ }
+ }
+
+ public final void decrementq(AMD64Address dst, int value) {
+ if (value == Integer.MIN_VALUE) {
+ subq(dst, value);
+ return;
+ }
+ if (value < 0) {
+ incrementq(dst, -value);
+ return;
+ }
+ if (value == 0) {
+ return;
+ }
+ if (value == 1 && UseIncDec) {
+ decq(dst);
+ } else {
+ subq(dst, value);
+ }
+ }
+
+ public void incrementq(Register reg, int value) {
+ if (value == Integer.MIN_VALUE) {
+ addq(reg, value);
+ return;
+ }
+ if (value < 0) {
+ decrementq(reg, -value);
+ return;
+ }
+ if (value == 0) {
+ return;
+ }
+ if (value == 1 && UseIncDec) {
+ incq(reg);
+ } else {
+ addq(reg, value);
+ }
+ }
+
+ public final void incrementq(AMD64Address dst, int value) {
+ if (value == Integer.MIN_VALUE) {
+ addq(dst, value);
+ return;
+ }
+ if (value < 0) {
+ decrementq(dst, -value);
+ return;
+ }
+ if (value == 0) {
+ return;
+ }
+ if (value == 1 && UseIncDec) {
+ incq(dst);
+ } else {
+ addq(dst, value);
+ }
+ }
+
+ public final void movptr(Register dst, AMD64Address src) {
+ movq(dst, src);
+ }
+
+ public final void movptr(AMD64Address dst, Register src) {
+ movq(dst, src);
+ }
+
+ public final void movptr(AMD64Address dst, int src) {
+ movslq(dst, src);
+ }
+
+ public final void cmpptr(Register src1, Register src2) {
+ cmpq(src1, src2);
+ }
+
+ public final void cmpptr(Register src1, AMD64Address src2) {
+ cmpq(src1, src2);
+ }
+
+ public final void decrementl(Register reg) {
+ decrementl(reg, 1);
+ }
+
+ public final void decrementl(Register reg, int value) {
+ if (value == Integer.MIN_VALUE) {
+ subl(reg, value);
+ return;
+ }
+ if (value < 0) {
+ incrementl(reg, -value);
+ return;
+ }
+ if (value == 0) {
+ return;
+ }
+ if (value == 1 && UseIncDec) {
+ decl(reg);
+ } else {
+ subl(reg, value);
+ }
+ }
+
+ public final void decrementl(AMD64Address dst, int value) {
+ if (value == Integer.MIN_VALUE) {
+ subl(dst, value);
+ return;
+ }
+ if (value < 0) {
+ incrementl(dst, -value);
+ return;
+ }
+ if (value == 0) {
+ return;
+ }
+ if (value == 1 && UseIncDec) {
+ decl(dst);
+ } else {
+ subl(dst, value);
+ }
+ }
+
+ public final void incrementl(Register reg, int value) {
+ if (value == Integer.MIN_VALUE) {
+ addl(reg, value);
+ return;
+ }
+ if (value < 0) {
+ decrementl(reg, -value);
+ return;
+ }
+ if (value == 0) {
+ return;
+ }
+ if (value == 1 && UseIncDec) {
+ incl(reg);
+ } else {
+ addl(reg, value);
+ }
+ }
+
+ public final void incrementl(AMD64Address dst, int value) {
+ if (value == Integer.MIN_VALUE) {
+ addl(dst, value);
+ return;
+ }
+ if (value < 0) {
+ decrementl(dst, -value);
+ return;
+ }
+ if (value == 0) {
+ return;
+ }
+ if (value == 1 && UseIncDec) {
+ incl(dst);
+ } else {
+ addl(dst, value);
+ }
+ }
+
+ public void movflt(Register dst, Register src) {
+ assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
+ if (UseXmmRegToRegMoveAll) {
+ movaps(dst, src);
+ } else {
+ movss(dst, src);
+ }
+ }
+
+ public void movflt(Register dst, AMD64Address src) {
+ assert dst.getRegisterCategory().equals(AMD64.XMM);
+ movss(dst, src);
+ }
+
+ public void movflt(AMD64Address dst, Register src) {
+ assert src.getRegisterCategory().equals(AMD64.XMM);
+ movss(dst, src);
+ }
+
+ public void movdbl(Register dst, Register src) {
+ assert dst.getRegisterCategory().equals(AMD64.XMM) && src.getRegisterCategory().equals(AMD64.XMM);
+ if (UseXmmRegToRegMoveAll) {
+ movapd(dst, src);
+ } else {
+ movsd(dst, src);
+ }
+ }
+
+ public void movdbl(Register dst, AMD64Address src) {
+ assert dst.getRegisterCategory().equals(AMD64.XMM);
+ if (UseXmmLoadAndClearUpper) {
+ movsd(dst, src);
+ } else {
+ movlpd(dst, src);
+ }
+ }
+
+ public void movdbl(AMD64Address dst, Register src) {
+ assert src.getRegisterCategory().equals(AMD64.XMM);
+ movsd(dst, src);
+ }
+
+ /**
+ * Non-atomic write of a 64-bit constant to memory. Do not use if the address might be a
+ * volatile field!
+ */
+ public final void movlong(AMD64Address dst, long src) {
+ if (NumUtil.isInt(src)) {
+ AMD64MIOp.MOV.emit(this, OperandSize.QWORD, dst, (int) src);
+ } else {
+ AMD64Address high = new AMD64Address(dst.getBase(), dst.getIndex(), dst.getScale(), dst.getDisplacement() + 4);
+ movl(dst, (int) (src & 0xFFFFFFFF));
+ movl(high, (int) (src >> 32));
+ }
+
+ }
+
+ public final void flog(Register dest, Register value, boolean base10) {
+ if (base10) {
+ fldlg2();
+ } else {
+ fldln2();
+ }
+ AMD64Address tmp = trigPrologue(value);
+ fyl2x();
+ trigEpilogue(dest, tmp);
+ }
+
+ public final void fsin(Register dest, Register value) {
+ AMD64Address tmp = trigPrologue(value);
+ fsin();
+ trigEpilogue(dest, tmp);
+ }
+
+ public final void fcos(Register dest, Register value) {
+ AMD64Address tmp = trigPrologue(value);
+ fcos();
+ trigEpilogue(dest, tmp);
+ }
+
+ public final void ftan(Register dest, Register value) {
+ AMD64Address tmp = trigPrologue(value);
+ fptan();
+ fstp(0); // ftan pushes 1.0 in addition to the actual result, pop
+ trigEpilogue(dest, tmp);
+ }
+
+ public final void fpop() {
+ ffree(0);
+ fincstp();
+ }
+
+ private AMD64Address trigPrologue(Register value) {
+ assert value.getRegisterCategory().equals(AMD64.XMM);
+ AMD64Address tmp = new AMD64Address(AMD64.rsp);
+ subq(AMD64.rsp, AMD64Kind.DOUBLE.getSizeInBytes());
+ movdbl(tmp, value);
+ fldd(tmp);
+ return tmp;
+ }
+
+ private void trigEpilogue(Register dest, AMD64Address tmp) {
+ assert dest.getRegisterCategory().equals(AMD64.XMM);
+ fstpd(tmp);
+ movdbl(dest, tmp);
+ addq(AMD64.rsp, AMD64Kind.DOUBLE.getSizeInBytes());
+ }
+
+ // IndexOf for constant substrings with size >= 8 chars
+ // which don't need to be loaded through stack.
+ public void stringIndexofC8(Register str1, Register str2,
+ Register cnt1, Register cnt2,
+ int intCnt2, Register result,
+ Register vec, Register tmp) {
+ // assert(UseSSE42Intrinsics, "SSE4.2 is required");
+
+ // This method uses pcmpestri inxtruction with bound registers
+ // inputs:
+ // xmm - substring
+ // rax - substring length (elements count)
+ // mem - scanned string
+ // rdx - string length (elements count)
+ // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
+ // outputs:
+ // rcx - matched index in string
+ assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
+
+ Label reloadSubstr = new Label();
+ Label scanToSubstr = new Label();
+ Label scanSubstr = new Label();
+ Label retFound = new Label();
+ Label retNotFound = new Label();
+ Label exit = new Label();
+ Label foundSubstr = new Label();
+ Label matchSubstrHead = new Label();
+ Label reloadStr = new Label();
+ Label foundCandidate = new Label();
+
+ // Note, inline_string_indexOf() generates checks:
+ // if (substr.count > string.count) return -1;
+ // if (substr.count == 0) return 0;
+ assert intCnt2 >= 8 : "this code isused only for cnt2 >= 8 chars";
+
+ // Load substring.
+ movdqu(vec, new AMD64Address(str2, 0));
+ movl(cnt2, intCnt2);
+ movq(result, str1); // string addr
+
+ if (intCnt2 > 8) {
+ jmpb(scanToSubstr);
+
+ // Reload substr for rescan, this code
+ // is executed only for large substrings (> 8 chars)
+ bind(reloadSubstr);
+ movdqu(vec, new AMD64Address(str2, 0));
+ negq(cnt2); // Jumped here with negative cnt2, convert to positive
+
+ bind(reloadStr);
+ // We came here after the beginning of the substring was
+ // matched but the rest of it was not so we need to search
+ // again. Start from the next element after the previous match.
+
+ // cnt2 is number of substring reminding elements and
+ // cnt1 is number of string reminding elements when cmp failed.
+ // Restored cnt1 = cnt1 - cnt2 + int_cnt2
+ subl(cnt1, cnt2);
+ addl(cnt1, intCnt2);
+ movl(cnt2, intCnt2); // Now restore cnt2
+
+ decrementl(cnt1, 1); // Shift to next element
+ cmpl(cnt1, cnt2);
+ jccb(ConditionFlag.Negative, retNotFound); // Left less then substring
+
+ addq(result, 2);
+
+ } // (int_cnt2 > 8)
+
+ // Scan string for start of substr in 16-byte vectors
+ bind(scanToSubstr);
+ pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
+ jccb(ConditionFlag.Below, foundCandidate); // CF == 1
+ subl(cnt1, 8);
+ jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
+ cmpl(cnt1, cnt2);
+ jccb(ConditionFlag.Negative, retNotFound); // Left less then substring
+ addq(result, 16);
+ jmpb(scanToSubstr);
+
+ // Found a potential substr
+ bind(foundCandidate);
+ // Matched whole vector if first element matched (tmp(rcx) == 0).
+ if (intCnt2 == 8) {
+ jccb(ConditionFlag.Overflow, retFound); // OF == 1
+ } else { // int_cnt2 > 8
+ jccb(ConditionFlag.Overflow, foundSubstr);
+ }
+ // After pcmpestri tmp(rcx) contains matched element index
+ // Compute start addr of substr
+ leaq(result, new AMD64Address(result, tmp, Scale.Times2, 0));
+
+ // Make sure string is still long enough
+ subl(cnt1, tmp);
+ cmpl(cnt1, cnt2);
+ if (intCnt2 == 8) {
+ jccb(ConditionFlag.GreaterEqual, scanToSubstr);
+ } else { // int_cnt2 > 8
+ jccb(ConditionFlag.GreaterEqual, matchSubstrHead);
+ }
+ // Left less then substring.
+
+ bind(retNotFound);
+ movl(result, -1);
+ jmpb(exit);
+
+ if (intCnt2 > 8) {
+ // This code is optimized for the case when whole substring
+ // is matched if its head is matched.
+ bind(matchSubstrHead);
+ pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
+ // Reload only string if does not match
+ jccb(ConditionFlag.NoOverflow, reloadStr); // OF == 0
+
+ Label contScanSubstr = new Label();
+ // Compare the rest of substring (> 8 chars).
+ bind(foundSubstr);
+ // First 8 chars are already matched.
+ negq(cnt2);
+ addq(cnt2, 8);
+
+ bind(scanSubstr);
+ subl(cnt1, 8);
+ cmpl(cnt2, -8); // Do not read beyond substring
+ jccb(ConditionFlag.LessEqual, contScanSubstr);
+ // Back-up strings to avoid reading beyond substring:
+ // cnt1 = cnt1 - cnt2 + 8
+ addl(cnt1, cnt2); // cnt2 is negative
+ addl(cnt1, 8);
+ movl(cnt2, 8);
+ negq(cnt2);
+ bind(contScanSubstr);
+ if (intCnt2 < 1024 * 1024 * 1024) {
+ movdqu(vec, new AMD64Address(str2, cnt2, Scale.Times2, intCnt2 * 2));
+ pcmpestri(vec, new AMD64Address(result, cnt2, Scale.Times2, intCnt2 * 2), 0x0d);
+ } else {
+ // calculate index in register to avoid integer overflow (int_cnt2*2)
+ movl(tmp, intCnt2);
+ addq(tmp, cnt2);
+ movdqu(vec, new AMD64Address(str2, tmp, Scale.Times2, 0));
+ pcmpestri(vec, new AMD64Address(result, tmp, Scale.Times2, 0), 0x0d);
+ }
+ // Need to reload strings pointers if not matched whole vector
+ jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0
+ addq(cnt2, 8);
+ jcc(ConditionFlag.Negative, scanSubstr);
+ // Fall through if found full substring
+
+ } // (int_cnt2 > 8)
+
+ bind(retFound);
+ // Found result if we matched full small substring.
+ // Compute substr offset
+ subq(result, str1);
+ shrl(result, 1); // index
+ bind(exit);
+
+ } // string_indexofC8
+
+ // Small strings are loaded through stack if they cross page boundary.
+ public void stringIndexOf(Register str1, Register str2,
+ Register cnt1, Register cnt2,
+ int intCnt2, Register result,
+ Register vec, Register tmp, int vmPageSize) {
+ //
+ // int_cnt2 is length of small (< 8 chars) constant substring
+ // or (-1) for non constant substring in which case its length
+ // is in cnt2 register.
+ //
+ // Note, inline_string_indexOf() generates checks:
+ // if (substr.count > string.count) return -1;
+ // if (substr.count == 0) return 0;
+ //
+ assert intCnt2 == -1 || (0 < intCnt2 && intCnt2 < 8) : "should be != 0";
+
+ // This method uses pcmpestri instruction with bound registers
+ // inputs:
+ // xmm - substring
+ // rax - substring length (elements count)
+ // mem - scanned string
+ // rdx - string length (elements count)
+ // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
+ // outputs:
+ // rcx - matched index in string
+ assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
+
+ Label reloadSubstr = new Label();
+ Label scanToSubstr = new Label();
+ Label scanSubstr = new Label();
+ Label adjustStr = new Label();
+ Label retFound = new Label();
+ Label retNotFound = new Label();
+ Label cleanup = new Label();
+ Label foundSubstr = new Label();
+ Label foundCandidate = new Label();
+
+ int wordSize = 8;
+ // We don't know where these strings are located
+ // and we can't read beyond them. Load them through stack.
+ Label bigStrings = new Label();
+ Label checkStr = new Label();
+ Label copySubstr = new Label();
+ Label copyStr = new Label();
+
+ movq(tmp, rsp); // save old SP
+
+ if (intCnt2 > 0) { // small (< 8 chars) constant substring
+ if (intCnt2 == 1) { // One char
+ movzwl(result, new AMD64Address(str2, 0));
+ movdl(vec, result); // move 32 bits
+ } else if (intCnt2 == 2) { // Two chars
+ movdl(vec, new AMD64Address(str2, 0)); // move 32 bits
+ } else if (intCnt2 == 4) { // Four chars
+ movq(vec, new AMD64Address(str2, 0)); // move 64 bits
+ } else { // cnt2 = { 3, 5, 6, 7 }
+ // Array header size is 12 bytes in 32-bit VM
+ // + 6 bytes for 3 chars == 18 bytes,
+ // enough space to load vec and shift.
+ movdqu(vec, new AMD64Address(str2, (intCnt2 * 2) - 16));
+ psrldq(vec, 16 - (intCnt2 * 2));
+ }
+ } else { // not constant substring
+ cmpl(cnt2, 8);
+ jccb(ConditionFlag.AboveEqual, bigStrings); // Both strings are big enough
+
+ // We can read beyond string if str+16 does not cross page boundary
+ // since heaps are aligned and mapped by pages.
+ assert vmPageSize < 1024 * 1024 * 1024 : "default page should be small";
+ movl(result, str2); // We need only low 32 bits
+ andl(result, (vmPageSize - 1));
+ cmpl(result, (vmPageSize - 16));
+ jccb(ConditionFlag.BelowEqual, checkStr);
+
+ // Move small strings to stack to allow load 16 bytes into vec.
+ subq(rsp, 16);
+ int stackOffset = wordSize - 2;
+ push(cnt2);
+
+ bind(copySubstr);
+ movzwl(result, new AMD64Address(str2, cnt2, Scale.Times2, -2));
+ movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
+ decrementl(cnt2, 1);
+ jccb(ConditionFlag.NotZero, copySubstr);
+
+ pop(cnt2);
+ movq(str2, rsp); // New substring address
+ } // non constant
+
+ bind(checkStr);
+ cmpl(cnt1, 8);
+ jccb(ConditionFlag.AboveEqual, bigStrings);
+
+ // Check cross page boundary.
+ movl(result, str1); // We need only low 32 bits
+ andl(result, (vmPageSize - 1));
+ cmpl(result, (vmPageSize - 16));
+ jccb(ConditionFlag.BelowEqual, bigStrings);
+
+ subq(rsp, 16);
+ int stackOffset = -2;
+ if (intCnt2 < 0) { // not constant
+ push(cnt2);
+ stackOffset += wordSize;
+ }
+ movl(cnt2, cnt1);
+
+ bind(copyStr);
+ movzwl(result, new AMD64Address(str1, cnt2, Scale.Times2, -2));
+ movw(new AMD64Address(rsp, cnt2, Scale.Times2, stackOffset), result);
+ decrementl(cnt2, 1);
+ jccb(ConditionFlag.NotZero, copyStr);
+
+ if (intCnt2 < 0) { // not constant
+ pop(cnt2);
+ }
+ movq(str1, rsp); // New string address
+
+ bind(bigStrings);
+ // Load substring.
+ if (intCnt2 < 0) { // -1
+ movdqu(vec, new AMD64Address(str2, 0));
+ push(cnt2); // substr count
+ push(str2); // substr addr
+ push(str1); // string addr
+ } else {
+ // Small (< 8 chars) constant substrings are loaded already.
+ movl(cnt2, intCnt2);
+ }
+ push(tmp); // original SP
+ // Finished loading
+
+ // ========================================================
+ // Start search
+ //
+
+ movq(result, str1); // string addr
+
+ if (intCnt2 < 0) { // Only for non constant substring
+ jmpb(scanToSubstr);
+
+ // SP saved at sp+0
+ // String saved at sp+1*wordSize
+ // Substr saved at sp+2*wordSize
+ // Substr count saved at sp+3*wordSize
+
+ // Reload substr for rescan, this code
+ // is executed only for large substrings (> 8 chars)
+ bind(reloadSubstr);
+ movq(str2, new AMD64Address(rsp, 2 * wordSize));
+ movl(cnt2, new AMD64Address(rsp, 3 * wordSize));
+ movdqu(vec, new AMD64Address(str2, 0));
+ // We came here after the beginning of the substring was
+ // matched but the rest of it was not so we need to search
+ // again. Start from the next element after the previous match.
+ subq(str1, result); // Restore counter
+ shrl(str1, 1);
+ addl(cnt1, str1);
+ decrementl(cnt1); // Shift to next element
+ cmpl(cnt1, cnt2);
+ jccb(ConditionFlag.Negative, retNotFound); // Left less then substring
+
+ addq(result, 2);
+ } // non constant
+
+ // Scan string for start of substr in 16-byte vectors
+ bind(scanToSubstr);
+ assert cnt1.equals(rdx) && cnt2.equals(rax) && tmp.equals(rcx) : "pcmpestri";
+ pcmpestri(vec, new AMD64Address(result, 0), 0x0d);
+ jccb(ConditionFlag.Below, foundCandidate); // CF == 1
+ subl(cnt1, 8);
+ jccb(ConditionFlag.LessEqual, retNotFound); // Scanned full string
+ cmpl(cnt1, cnt2);
+ jccb(ConditionFlag.Negative, retNotFound); // Left less then substring
+ addq(result, 16);
+
+ bind(adjustStr);
+ cmpl(cnt1, 8); // Do not read beyond string
+ jccb(ConditionFlag.GreaterEqual, scanToSubstr);
+ // Back-up string to avoid reading beyond string.
+ leaq(result, new AMD64Address(result, cnt1, Scale.Times2, -16));
+ movl(cnt1, 8);
+ jmpb(scanToSubstr);
+
+ // Found a potential substr
+ bind(foundCandidate);
+ // After pcmpestri tmp(rcx) contains matched element index
+
+ // Make sure string is still long enough
+ subl(cnt1, tmp);
+ cmpl(cnt1, cnt2);
+ jccb(ConditionFlag.GreaterEqual, foundSubstr);
+ // Left less then substring.
+
+ bind(retNotFound);
+ movl(result, -1);
+ jmpb(cleanup);
+
+ bind(foundSubstr);
+ // Compute start addr of substr
+ leaq(result, new AMD64Address(result, tmp, Scale.Times2));
+
+ if (intCnt2 > 0) { // Constant substring
+ // Repeat search for small substring (< 8 chars)
+ // from new point without reloading substring.
+ // Have to check that we don't read beyond string.
+ cmpl(tmp, 8 - intCnt2);
+ jccb(ConditionFlag.Greater, adjustStr);
+ // Fall through if matched whole substring.
+ } else { // non constant
+ assert intCnt2 == -1 : "should be != 0";
+
+ addl(tmp, cnt2);
+ // Found result if we matched whole substring.
+ cmpl(tmp, 8);
+ jccb(ConditionFlag.LessEqual, retFound);
+
+ // Repeat search for small substring (<= 8 chars)
+ // from new point 'str1' without reloading substring.
+ cmpl(cnt2, 8);
+ // Have to check that we don't read beyond string.
+ jccb(ConditionFlag.LessEqual, adjustStr);
+
+ Label checkNext = new Label();
+ Label contScanSubstr = new Label();
+ Label retFoundLong = new Label();
+ // Compare the rest of substring (> 8 chars).
+ movq(str1, result);
+
+ cmpl(tmp, cnt2);
+ // First 8 chars are already matched.
+ jccb(ConditionFlag.Equal, checkNext);
+
+ bind(scanSubstr);
+ pcmpestri(vec, new AMD64Address(str1, 0), 0x0d);
+ // Need to reload strings pointers if not matched whole vector
+ jcc(ConditionFlag.NoOverflow, reloadSubstr); // OF == 0
+
+ bind(checkNext);
+ subl(cnt2, 8);
+ jccb(ConditionFlag.LessEqual, retFoundLong); // Found full substring
+ addq(str1, 16);
+ addq(str2, 16);
+ subl(cnt1, 8);
+ cmpl(cnt2, 8); // Do not read beyond substring
+ jccb(ConditionFlag.GreaterEqual, contScanSubstr);
+ // Back-up strings to avoid reading beyond substring.
+ leaq(str2, new AMD64Address(str2, cnt2, Scale.Times2, -16));
+ leaq(str1, new AMD64Address(str1, cnt2, Scale.Times2, -16));
+ subl(cnt1, cnt2);
+ movl(cnt2, 8);
+ addl(cnt1, 8);
+ bind(contScanSubstr);
+ movdqu(vec, new AMD64Address(str2, 0));
+ jmpb(scanSubstr);
+
+ bind(retFoundLong);
+ movq(str1, new AMD64Address(rsp, wordSize));
+ } // non constant
+
+ bind(retFound);
+ // Compute substr offset
+ subq(result, str1);
+ shrl(result, 1); // index
+
+ bind(cleanup);
+ pop(rsp); // restore SP
+
+ }
+
+}