src/jdk.internal.vm.compiler/share/classes/org.graalvm.compiler.lir.amd64/src/org/graalvm/compiler/lir/amd64/AMD64ArrayIndexOfOp.java
/*
* Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.graalvm.compiler.lir.amd64;
import static jdk.vm.ci.code.ValueUtil.asRegister;
import static jdk.vm.ci.code.ValueUtil.isRegister;
import static jdk.vm.ci.code.ValueUtil.isStackSlot;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.CONST;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.ILLEGAL;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.REG;
import static org.graalvm.compiler.lir.LIRInstruction.OperandFlag.STACK;
import java.util.Objects;
import org.graalvm.compiler.asm.Label;
import org.graalvm.compiler.asm.amd64.AMD64Address;
import org.graalvm.compiler.asm.amd64.AMD64Address.Scale;
import org.graalvm.compiler.asm.amd64.AMD64Assembler;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.AMD64RMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexMoveOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMIOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRMOp;
import org.graalvm.compiler.asm.amd64.AMD64Assembler.VexRVMOp;
import org.graalvm.compiler.asm.amd64.AMD64BaseAssembler.OperandSize;
import org.graalvm.compiler.asm.amd64.AMD64MacroAssembler;
import org.graalvm.compiler.asm.amd64.AVXKind;
import org.graalvm.compiler.core.common.LIRKind;
import org.graalvm.compiler.core.common.NumUtil;
import org.graalvm.compiler.lir.ConstantValue;
import org.graalvm.compiler.lir.LIRInstructionClass;
import org.graalvm.compiler.lir.Opcode;
import org.graalvm.compiler.lir.asm.CompilationResultBuilder;
import org.graalvm.compiler.lir.gen.LIRGeneratorTool;
import jdk.vm.ci.amd64.AMD64;
import jdk.vm.ci.amd64.AMD64.CPUFeature;
import jdk.vm.ci.amd64.AMD64Kind;
import jdk.vm.ci.code.Register;
import jdk.vm.ci.meta.JavaConstant;
import jdk.vm.ci.meta.JavaKind;
import jdk.vm.ci.meta.Value;
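/**
 * LIR instruction that emits AMD64 code searching a {@code byte} or {@code char} array, starting
 * at a given index, for the first element equal to any of one to four search values. In
 * "find two consecutive" mode a single search value is compared against pairs of adjacent
 * elements using a comparison width twice that of one element. The result register receives the
 * index of the first match, or -1 if there is none. Ranges shorter than one vector are scanned
 * element by element; longer ranges are scanned with SSE/AVX vector compares.
 */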
@Opcode("AMD64_ARRAY_INDEX_OF")
public final class AMD64ArrayIndexOfOp extends AMD64LIRInstruction {
// Instruction class descriptor; handed to the superclass constructor.
public static final LIRInstructionClass<AMD64ArrayIndexOfOp> TYPE = LIRInstructionClass.create(AMD64ArrayIndexOfOp.class);
// Kind of the searched elements; the constructor asserts that it is Byte or Char.
private final JavaKind valueKind;
// Number of search values passed to the constructor (asserted to be between 1 and 4).
private final int nValues;
// When set, exactly one search value is allowed (asserted) and comparisons use a kind twice as
// wide as valueKind (see getComparisonKind()).
private final boolean findTwoConsecutive;
// Vector kind of the vector temporaries: 256-bit if AVX2 is available and the constructor's
// maxVectorSize is negative or at least 32, otherwise 128-bit; byte or word lanes per valueKind.
private final AMD64Kind vectorKind;
// Array base offset reported by MetaAccess for the constructor's arrayKind.
private final int arrayBaseOffset;
// Addressing scale derived from the array index scale MetaAccess reports for valueKind.
private final Scale arrayIndexScale;
// Result register; emitCode also uses it as the running index while searching.
@Def({REG}) protected Value resultValue;
// Register that emitCode uses as the base of every array address.
@Alive({REG}) protected Value arrayPtrValue;
// Register that emitCode compares the running index against as the upper bound.
@Alive({REG}) protected Value arrayLengthValue;
// Start index; emitCode reads it only in its very first emitted instruction.
@Use({REG}) protected Value fromIndexValue;
// Search values; slots beyond nValues are set to Value.ILLEGAL by the constructor.
@Alive({REG, STACK, CONST}) protected Value searchValue1;
@Alive({REG, STACK, CONST, ILLEGAL}) protected Value searchValue2;
@Alive({REG, STACK, CONST, ILLEGAL}) protected Value searchValue3;
@Alive({REG, STACK, CONST, ILLEGAL}) protected Value searchValue4;
// Word-sized scratch registers (compare masks, alignment arithmetic). The second one is only
// allocated when findTwoConsecutive is set.
@Temp({REG}) protected Value comparisonResult1;
@Temp({REG, ILLEGAL}) protected Value comparisonResult2;
// Vector registers holding the broadcast search values; one per search value in use.
@Temp({REG, ILLEGAL}) protected Value vectorCompareVal1;
@Temp({REG, ILLEGAL}) protected Value vectorCompareVal2;
@Temp({REG, ILLEGAL}) protected Value vectorCompareVal3;
@Temp({REG, ILLEGAL}) protected Value vectorCompareVal4;
// Vector registers receiving loaded array contents; the constructor always allocates all four.
@Temp({REG, ILLEGAL}) protected Value vectorArray1;
@Temp({REG, ILLEGAL}) protected Value vectorArray2;
@Temp({REG, ILLEGAL}) protected Value vectorArray3;
@Temp({REG, ILLEGAL}) protected Value vectorArray4;
public AMD64ArrayIndexOfOp(JavaKind arrayKind, JavaKind valueKind, boolean findTwoConsecutive, int maxVectorSize, LIRGeneratorTool tool,
Value result, Value arrayPtr, Value arrayLength, Value fromIndex, Value... searchValues) {
super(TYPE);
this.valueKind = valueKind;
this.arrayBaseOffset = tool.getProviders().getMetaAccess().getArrayBaseOffset(arrayKind);
this.arrayIndexScale = Objects.requireNonNull(Scale.fromInt(tool.getProviders().getMetaAccess().getArrayIndexScale(valueKind)));
this.findTwoConsecutive = findTwoConsecutive;
assert 0 < searchValues.length && searchValues.length <= 4;
assert byteMode(valueKind) || charMode(valueKind);
assert supports(tool, CPUFeature.SSE2) || supports(tool, CPUFeature.AVX) || supportsAVX2(tool);
nValues = searchValues.length;
assert !findTwoConsecutive || nValues == 1;
resultValue = result;
arrayPtrValue = arrayPtr;
arrayLengthValue = arrayLength;
fromIndexValue = fromIndex;
searchValue1 = searchValues[0];
searchValue2 = nValues > 1 ? searchValues[1] : Value.ILLEGAL;
searchValue3 = nValues > 2 ? searchValues[2] : Value.ILLEGAL;
searchValue4 = nValues > 3 ? searchValues[3] : Value.ILLEGAL;
comparisonResult1 = tool.newVariable(LIRKind.value(tool.target().arch.getWordKind()));
comparisonResult2 = findTwoConsecutive ? tool.newVariable(LIRKind.value(tool.target().arch.getWordKind())) : Value.ILLEGAL;
vectorKind = supportsAVX2(tool) && (maxVectorSize < 0 || maxVectorSize >= 32) ? byteMode(valueKind) ? AMD64Kind.V256_BYTE : AMD64Kind.V256_WORD
: byteMode(valueKind) ? AMD64Kind.V128_BYTE : AMD64Kind.V128_WORD;
vectorCompareVal1 = tool.newVariable(LIRKind.value(vectorKind));
vectorCompareVal2 = nValues > 1 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
vectorCompareVal3 = nValues > 2 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
vectorCompareVal4 = nValues > 3 ? tool.newVariable(LIRKind.value(vectorKind)) : Value.ILLEGAL;
vectorArray1 = tool.newVariable(LIRKind.value(vectorKind));
vectorArray2 = tool.newVariable(LIRKind.value(vectorKind));
vectorArray3 = tool.newVariable(LIRKind.value(vectorKind));
vectorArray4 = tool.newVariable(LIRKind.value(vectorKind));
}
/** Tells whether {@code kind} is {@link JavaKind#Byte}. */
private static boolean byteMode(JavaKind kind) {
    return JavaKind.Byte == kind;
}
/** Tells whether {@code kind} is {@link JavaKind#Char}. */
private static boolean charMode(JavaKind kind) {
    return JavaKind.Char == kind;
}
/**
 * Returns the kind whose width is used for a single comparison: the element kind itself in the
 * normal mode, or a kind twice as wide ({@code Char} for bytes, {@code Int} otherwise) when two
 * consecutive elements are matched at once.
 */
private JavaKind getComparisonKind() {
    if (!findTwoConsecutive) {
        return valueKind;
    }
    if (byteMode(valueKind)) {
        return JavaKind.Char;
    }
    return JavaKind.Int;
}
/** Returns the data size that {@link AVXKind#getDataSize} reports for {@code vectorKind}. */
private AVXKind.AVXSize getVectorSize() {
    final AVXKind.AVXSize registerSize = AVXKind.getDataSize(vectorKind);
    return registerSize;
}
/**
 * Emits the machine code of the search. The emitted code has three parts:
 * <ol>
 * <li>An element-wise loop, taken when fewer elements than one vector (plus one element in
 * two-consecutive mode) lie between {@code fromIndex} and the array length.</li>
 * <li>A vectorized search: one unaligned vector compare, then the index is rounded down to a
 * multiple of the vector size relative to the data address, followed by a bulk loop comparing
 * {@code nVectors} vectors per iteration and a tail that handles the remaining elements.</li>
 * <li>"Found" handlers that turn the compare bit mask of the matching vector into an element
 * index.</li>
 * </ol>
 * During the vectorized search the index register designates the (exclusive) end of the region
 * being compared; vector loads address backwards from it (see {@code getVectorOffset}). On exit
 * the result register holds the index of the first match, or -1.
 *
 * @param crb compilation result builder, used to resolve stack slot addresses and for alignment
 * @param asm assembler receiving the instructions
 */
@Override
public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
// number of vectors compared per bulk loop iteration
int nVectors = nValues == 1 ? 4 : nValues == 2 ? 2 : 1;
Register arrayPtr = asRegister(arrayPtrValue);
Register arrayLength = asRegister(arrayLengthValue);
Register fromIndex = asRegister(fromIndexValue);
// the result register doubles as the running index
Register index = asRegister(resultValue);
Value[] searchValue = {
nValues > 0 ? searchValue1 : null,
nValues > 1 ? searchValue2 : null,
nValues > 2 ? searchValue3 : null,
nValues > 3 ? searchValue4 : null,
};
Register[] vecCmp = {
nValues > 0 ? asRegister(vectorCompareVal1) : null,
nValues > 1 ? asRegister(vectorCompareVal2) : null,
nValues > 2 ? asRegister(vectorCompareVal3) : null,
nValues > 3 ? asRegister(vectorCompareVal4) : null,
};
Register[] vecArray = {
asRegister(vectorArray1),
asRegister(vectorArray2),
asRegister(vectorArray3),
asRegister(vectorArray4),
};
Register[] cmpResult = {
asRegister(comparisonResult1),
findTwoConsecutive ? asRegister(comparisonResult2) : null,
};
Label ret = new Label();
Label bulkVectorLoop = new Label();
Label singleVectorLoop = new Label();
// vectorFound[k] is the jump target used when the vector with result index delta k matched
Label[] vectorFound = {
new Label(),
new Label(),
new Label(),
new Label(),
};
Label runVectorized = new Label();
Label elementWiseLoop = new Label();
Label elementWiseFound = new Label();
Label elementWiseNotFound = new Label();
Label skipBulkVectorLoop = new Label();
// vectorSize and bulkSize are counted in array elements, not bytes
int vectorSize = getVectorSize().getBytes() / valueKind.getByteCount();
int bulkSize = vectorSize * nVectors;
JavaKind vectorCompareKind = valueKind;
if (findTwoConsecutive) {
bulkSize /= 2;
vectorCompareKind = byteMode(valueKind) ? JavaKind.Char : JavaKind.Int;
}
// index = fromIndex + vectorSize (+1 if findTwoConsecutive)
// important: this must be the first register manipulation, since fromIndex is
// annotated with @Use
asm.leaq(index, new AMD64Address(fromIndex, vectorSize + (findTwoConsecutive ? 1 : 0)));
// check if vector load is in bounds
asm.cmpq(index, arrayLength);
asm.jccb(AMD64Assembler.ConditionFlag.LessEqual, runVectorized);
// search range is smaller than vector size, do element-wise comparison
// index = fromIndex (+ 1 if findTwoConsecutive)
asm.subq(index, vectorSize);
// check if enough array slots remain
asm.cmpq(index, arrayLength);
asm.jccb(AMD64Assembler.ConditionFlag.GreaterEqual, elementWiseNotFound);
// compare one-by-one
asm.bind(elementWiseLoop);
// check for match
OperandSize cmpSize = getOpSize(getComparisonKind());
// address = findTwoConsecutive ? array[index - 1] : array[index]
AMD64Address arrayAddr = new AMD64Address(arrayPtr, index, arrayIndexScale, arrayBaseOffset - (findTwoConsecutive ? valueKind.getByteCount() : 0));
boolean valuesOnStack = searchValuesOnStack(searchValue);
if (valuesOnStack) {
// at least one search value is in a stack slot: load the array element into a register
// first and compare that register against each search value
(cmpSize == OperandSize.BYTE ? AMD64RMOp.MOVB : AMD64RMOp.MOV).emit(asm, cmpSize, cmpResult[0], arrayAddr);
for (int i = 0; i < nValues; i++) {
if (isConstant(searchValue[i])) {
int imm = asConstant(searchValue[i]).asInt();
AMD64Assembler.AMD64BinaryArithmetic.CMP.getMIOpcode(cmpSize, NumUtil.isByte(imm)).emit(asm, cmpSize, cmpResult[0], imm);
} else if (isStackSlot(searchValue[i])) {
AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(cmpSize).emit(asm, cmpSize, cmpResult[0], (AMD64Address) crb.asAddress(searchValue[i]));
} else {
AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(cmpSize).emit(asm, cmpSize, cmpResult[0], asRegister(searchValue[i]));
}
asm.jccb(AMD64Assembler.ConditionFlag.Equal, elementWiseFound);
}
} else {
// no search value is on the stack: compare each value directly against the array memory
for (int i = 0; i < nValues; i++) {
if (isConstant(searchValue[i])) {
int imm = asConstant(searchValue[i]).asInt();
AMD64Assembler.AMD64BinaryArithmetic.CMP.getMIOpcode(cmpSize, NumUtil.isByte(imm)).emit(asm, cmpSize, arrayAddr, imm);
} else {
AMD64Assembler.AMD64BinaryArithmetic.CMP.getRMOpcode(cmpSize).emit(asm, cmpSize, asRegister(searchValue[i]), arrayAddr);
}
asm.jccb(AMD64Assembler.ConditionFlag.Equal, elementWiseFound);
}
}
// adjust index
asm.incrementq(index, 1);
// continue loop
asm.cmpq(index, arrayLength);
asm.jccb(AMD64Assembler.ConditionFlag.Less, elementWiseLoop);
asm.bind(elementWiseNotFound);
asm.xorq(index, index);
// not found: index becomes 0 - 1 = -1 in both modes. In two-consecutive mode the found path
// shares the decrement, because there the loop index is one past the first element of the
// matching pair.
if (findTwoConsecutive) {
asm.bind(elementWiseFound);
asm.decrementq(index, 1);
} else {
asm.decrementq(index, 1);
asm.bind(elementWiseFound);
}
asm.jmp(ret);
// vectorized implementation
asm.bind(runVectorized);
// move search values to vectors
for (int i = 0; i < nValues; i++) {
// fill comparison vector with copies of the search value
broadcastSearchValue(crb, asm, vecCmp[i], searchValue[i], cmpResult[0], vecArray[0]);
}
// do one unaligned vector comparison pass and adjust alignment afterwards
emitVectorCompare(asm, vectorCompareKind, findTwoConsecutive ? 2 : 1, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, false, false);
// adjust index to vector size alignment
// cmpResult[0] = address of the first array element, converted to element units in char mode
asm.leaq(cmpResult[0], new AMD64Address(arrayPtr, arrayBaseOffset));
if (charMode(valueKind)) {
asm.shrq(cmpResult[0], 1);
}
asm.addq(index, cmpResult[0]);
// adjust to next lower multiple of vector size
asm.andq(index, ~(vectorSize - 1));
asm.subq(index, cmpResult[0]);
// add bulk size
asm.addq(index, bulkSize);
// check if there are enough array slots remaining for the bulk loop
asm.cmpq(index, arrayLength);
asm.jccb(AMD64Assembler.ConditionFlag.Greater, skipBulkVectorLoop);
emitAlign(crb, asm);
asm.bind(bulkVectorLoop);
// memory-aligned bulk comparison
emitVectorCompare(asm, vectorCompareKind, nVectors, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, false, !findTwoConsecutive);
// adjust index
asm.addq(index, bulkSize);
// check if there are enough array slots remaining for the bulk loop
asm.cmpq(index, arrayLength);
asm.jccb(AMD64Assembler.ConditionFlag.LessEqual, bulkVectorLoop);
asm.bind(skipBulkVectorLoop);
if ((findTwoConsecutive && nVectors == 2) || nVectors == 1) {
// the bulk loop already advanced by a single compare step, so one final compare anchored
// at the end of the array covers everything that remains
// do last load from end of array
asm.movq(index, arrayLength);
// compare
emitVectorCompare(asm, vectorCompareKind, findTwoConsecutive ? 2 : 1, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, true, false);
} else {
// remove bulk offset
asm.subq(index, bulkSize);
emitAlign(crb, asm);
// same loop as bulkVectorLoop, with only one vector
asm.bind(singleVectorLoop);
// add vector size
asm.addq(index, vectorSize);
// check if vector load is in bounds
asm.cmpq(index, arrayLength);
// if load would be over bounds, set the load to the end of the array
asm.cmovq(AMD64Assembler.ConditionFlag.Greater, index, arrayLength);
// compare
emitVectorCompare(asm, vectorCompareKind, findTwoConsecutive ? 2 : 1, arrayPtr, index, vecCmp, vecArray, cmpResult, vectorFound, true, false);
// check if there are enough array slots remaining for the loop
asm.cmpq(index, arrayLength);
asm.jccb(AMD64Assembler.ConditionFlag.Less, singleVectorLoop);
}
// no vector matched: result is -1
asm.movl(index, -1);
asm.jmpb(ret);
// "found" handlers: move index back to the start of the matching vector, then add the
// position of the first set bit of the compare mask
if (findTwoConsecutive) {
Label vectorFound2Done = new Label();
// vectorFound[0] and vectorFound[2] behave like the single-char case
asm.bind(vectorFound[2]);
// subtract static offset
asm.subq(index, getResultIndexDelta(2));
asm.jmpb(vectorFound2Done);
asm.bind(vectorFound[0]);
// subtract static offset
asm.subq(index, getResultIndexDelta(0));
asm.bind(vectorFound2Done);
// find offset
asm.bsfq(cmpResult[0], cmpResult[0]);
if (charMode(valueKind)) {
// convert byte offset to chars
asm.shrl(cmpResult[0], 1);
}
// add offset to index
asm.addq(index, cmpResult[0]);
asm.jmpb(ret);
Label minResult = new Label();
Label minResultDone = new Label();
// in vectorFound[1] and vectorFound[3], we have to check the results 0 and 2 as well
if (nVectors > 2) {
asm.bind(vectorFound[3]);
// subtract static offset
asm.subq(index, getResultIndexDelta(3));
asm.jmpb(minResult);
}
asm.bind(vectorFound[1]);
// subtract static offset
asm.subq(index, getResultIndexDelta(1));
asm.bind(minResult);
// find offset 0
asm.bsfq(cmpResult[1], cmpResult[1]);
// check if second result is also a match
asm.testq(cmpResult[0], cmpResult[0]);
asm.jccb(AMD64Assembler.ConditionFlag.Zero, minResultDone);
// find offset 1
asm.bsfq(cmpResult[0], cmpResult[0]);
// the second vector was loaded one element later, so shift its offset accordingly
asm.addq(cmpResult[0], valueKind.getByteCount());
// if first result is greater than second, replace it with the second result
asm.cmpq(cmpResult[1], cmpResult[0]);
asm.cmovq(AMD64Assembler.ConditionFlag.Greater, cmpResult[1], cmpResult[0]);
asm.bind(minResultDone);
if (charMode(valueKind)) {
// convert byte offset to chars
asm.shrl(cmpResult[1], 1);
}
// add offset to index
asm.addq(index, cmpResult[1]);
} else {
Label end = new Label();
for (int i = 0; i < nVectors; i++) {
asm.bind(vectorFound[i]);
// subtract static offset
asm.subq(index, getResultIndexDelta(i));
// the last handler falls through to the shared tail
if (i < nVectors - 1) {
asm.jmpb(end);
}
}
asm.bind(end);
// find offset
asm.bsfq(cmpResult[0], cmpResult[0]);
if (charMode(valueKind)) {
// convert byte offset to chars
asm.shrl(cmpResult[0], 1);
}
// add offset to index
asm.addq(index, cmpResult[0]);
}
asm.bind(ret);
}
/**
 * Checks whether any of the first {@code nValues} entries of {@code searchValue} is a stack
 * slot.
 *
 * @param searchValue search values; only indices below {@code nValues} are inspected
 * @return {@code true} as soon as one inspected entry is a stack slot
 */
private boolean searchValuesOnStack(Value[] searchValue) {
    boolean anyOnStack = false;
    int slot = 0;
    while (!anyOnStack && slot < nValues) {
        anyOnStack = isStackSlot(searchValue[slot]);
        slot++;
    }
    return anyOnStack;
}
/**
 * Returns the distance, in array elements, between the index register and the first element of
 * the vector identified by {@code i}.
 *
 * @param i vector number; in two-consecutive mode vectors {@code 2k} and {@code 2k + 1} cover
 *            the same stride, with the odd one lying one element further back
 */
private int getResultIndexDelta(int i) {
    final int elementsPerVector = getVectorSize().getBytes() / valueKind.getByteCount();
    if (!findTwoConsecutive) {
        return (i + 1) * elementsPerVector;
    }
    final int stride = i / 2 + 1;
    final int oddShift = i & 1;
    return stride * elementsPerVector + oddShift;
}
/**
 * Returns the byte displacement to use when loading vector {@code i} relative to
 * {@code arrayPtr + index * scale}: the array base offset minus the byte size of
 * {@link #getResultIndexDelta(int)} elements.
 */
private int getVectorOffset(int i) {
    final int elementsBack = getResultIndexDelta(i);
    final int bytesBack = elementsBack * valueKind.getByteCount();
    return arrayBaseOffset - bytesBack;
}
/**
 * Fills the vector register {@code dst} with copies of the search value {@code srcVal}.
 *
 * @param crb used to resolve the address of a search value held in a stack slot
 * @param asm assembler receiving the instructions
 * @param dst vector register to fill
 * @param srcVal search value (register, stack slot or constant)
 * @param tmpReg general purpose scratch register, used when {@code srcVal} is not in a register
 * @param tmpVector vector scratch register handed to the broadcast sequence
 */
private void broadcastSearchValue(CompilationResultBuilder crb, AMD64MacroAssembler asm, Register dst, Value srcVal, Register tmpReg, Register tmpVector) {
    final Register scalar = asRegOrTmpReg(crb, asm, srcVal, tmpReg);
    // move the scalar into the lowest lane of the vector register
    if (!asm.supports(CPUFeature.AVX)) {
        asm.movdl(dst, scalar);
    } else {
        VexMoveOp.VMOVD.emit(asm, AVXKind.AVXSize.DWORD, dst, scalar);
    }
    // replicate the lowest lane across the whole register
    emitBroadcast(asm, getComparisonKind(), dst, tmpVector, getVectorSize());
}
/**
 * Tells whether {@code val} is a {@link ConstantValue}. With assertions enabled, additionally
 * checks that such a value wraps a Java constant.
 */
private static boolean isConstant(Value val) {
    final boolean constant = val instanceof ConstantValue;
    assert !constant || ((ConstantValue) val).isJavaConstant();
    return constant;
}
/** Casts {@code val} to {@link ConstantValue} and returns the Java constant it wraps. */
private static JavaConstant asConstant(Value val) {
    final ConstantValue constant = (ConstantValue) val;
    return constant.getJavaConstant();
}
/**
 * Makes {@code val} available in a register. A value already in a register is returned as is
 * and no code is emitted; a stack slot or constant is first moved into {@code tmpReg} with a
 * 32-bit move.
 *
 * @return the register holding the value: either the value's own register or {@code tmpReg}
 */
private static Register asRegOrTmpReg(CompilationResultBuilder crb, AMD64MacroAssembler asm, Value val, Register tmpReg) {
    if (isRegister(val)) {
        return asRegister(val);
    }
    if (isStackSlot(val)) {
        asm.movl(tmpReg, (AMD64Address) crb.asAddress(val));
    } else {
        assert isConstant(val);
        asm.movl(tmpReg, asConstant(val).asInt());
    }
    return tmpReg;
}
/** Aligns the current code position to twice the target word size. */
private static void emitAlign(CompilationResultBuilder crb, AMD64MacroAssembler asm) {
    final int boundary = crb.target.wordSize * 2;
    asm.align(boundary);
}
/**
 * Replicates the lowest lane of {@code vecDst} across the whole register. The lane width is
 * given by {@code kind}: byte, word ({@code Short}/{@code Char}) or dword ({@code Int}).
 *
 * @param vecTmp vector scratch register; only used by the byte variants without AVX2
 * @throws UnsupportedOperationException for any other kind
 */
private static void emitBroadcast(AMD64MacroAssembler asm, JavaKind kind, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
    switch (kind) {
        case Byte:
            broadcastLowestByte(asm, vecDst, vecTmp, vectorSize);
            return;
        case Short:
        case Char:
            broadcastLowestWord(asm, vecDst, vectorSize);
            return;
        case Int:
            broadcastLowestDword(asm, vecDst, vectorSize);
            return;
        default:
            throw new UnsupportedOperationException();
    }
}

/** Byte broadcast, using the best instruction sequence the CPU offers. */
private static void broadcastLowestByte(AMD64MacroAssembler asm, Register vecDst, Register vecTmp, AVXKind.AVXSize vectorSize) {
    if (asm.supports(CPUFeature.AVX2)) {
        VexRMOp.VPBROADCASTB.emit(asm, vectorSize, vecDst, vecDst);
        return;
    }
    if (asm.supports(CPUFeature.AVX)) {
        // shuffle with an all-zero mask: every lane selects byte 0
        VexRVMOp.VPXOR.emit(asm, vectorSize, vecTmp, vecTmp, vecTmp);
        VexRVMOp.VPSHUFB.emit(asm, vectorSize, vecDst, vecDst, vecTmp);
        return;
    }
    if (asm.supports(CPUFeature.SSSE3)) {
        asm.pxor(vecTmp, vecTmp);
        asm.pshufb(vecDst, vecTmp);
        return;
    }
    // SSE2: widen byte -> word -> dword by self-interleaving, then broadcast the dword
    asm.punpcklbw(vecDst, vecDst);
    asm.punpcklbw(vecDst, vecDst);
    asm.pshufd(vecDst, vecDst, 0);
}

/** Word broadcast, using the best instruction sequence the CPU offers. */
private static void broadcastLowestWord(AMD64MacroAssembler asm, Register vecDst, AVXKind.AVXSize vectorSize) {
    if (asm.supports(CPUFeature.AVX2)) {
        VexRMOp.VPBROADCASTW.emit(asm, vectorSize, vecDst, vecDst);
        return;
    }
    if (asm.supports(CPUFeature.AVX)) {
        VexRMIOp.VPSHUFLW.emit(asm, vectorSize, vecDst, vecDst, 0);
        VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
        return;
    }
    // SSE
    asm.pshuflw(vecDst, vecDst, 0);
    asm.pshufd(vecDst, vecDst, 0);
}

/** Dword broadcast, using the best instruction sequence the CPU offers. */
private static void broadcastLowestDword(AMD64MacroAssembler asm, Register vecDst, AVXKind.AVXSize vectorSize) {
    if (asm.supports(CPUFeature.AVX2)) {
        VexRMOp.VPBROADCASTD.emit(asm, vectorSize, vecDst, vecDst);
        return;
    }
    if (asm.supports(CPUFeature.AVX)) {
        VexRMIOp.VPSHUFD.emit(asm, vectorSize, vecDst, vecDst, 0);
        return;
    }
    // SSE
    asm.pshufd(vecDst, vecDst, 0);
}
/**
 * Emits one compare step over {@code nVectors} vectors that end at {@code index}: loads the
 * array contents, compares them with the broadcast search values and jumps to the matching
 * {@code vectorFound} label when a compare mask is non-zero.
 *
 * @param kind lane kind used by the vector compare instruction
 * @param nVectors number of vectors covered by this step
 * @param vecCmp registers holding the broadcast search values
 * @param vecArray registers receiving the array contents; each vector is loaded once per search
 *            value because the compare overwrites the loaded data
 * @param cmpResult general purpose registers receiving the compare masks
 * @param vectorFound jump targets, indexed by the vector's result index delta
 * @param shortJmp whether the jumps to {@code vectorFound} are emitted as short jumps
 * @param alignedLoad whether aligned load instructions may be used
 */
private void emitVectorCompare(AMD64MacroAssembler asm,
                JavaKind kind,
                int nVectors,
                Register arrayPtr,
                Register index,
                Register[] vecCmp,
                Register[] vecArray,
                Register[] cmpResult,
                Label[] vectorFound,
                boolean shortJmp,
                boolean alignedLoad) {
    final AVXKind.AVXSize size = getVectorSize();

    // load array contents into vectors, one copy per search value
    for (int vec = 0; vec < nVectors; vec++) {
        final int displacement = getVectorOffset(nVectors - (vec + 1));
        final int firstCopy = vec * nValues;
        for (int val = 0; val < nValues; val++) {
            emitArrayLoad(asm, size, vecArray[firstCopy + val], arrayPtr, index, displacement, alignedLoad);
        }
    }

    // compare all loaded lanes to the search value(s):
    // matching lanes become all ones, non-matching lanes become zero
    if (findTwoConsecutive) {
        // vectors are processed in pairs; both masks are produced before either is tested
        for (int vec = 0; vec < nVectors; vec += 2) {
            emitVectorCompareInst(asm, kind, size, vecArray[vec], vecCmp[0]);
            emitVectorCompareInst(asm, kind, size, vecArray[vec + 1], vecCmp[0]);
            emitMOVMSK(asm, size, cmpResult[1], vecArray[vec]);
            emitMOVMSK(asm, size, cmpResult[0], vecArray[vec + 1]);
            emitJnz(asm, cmpResult[1], vectorFound[nVectors - (vec + 1)], shortJmp);
            emitJnz(asm, cmpResult[0], vectorFound[nVectors - (vec + 2)], shortJmp);
        }
        return;
    }

    for (int vec = 0; vec < nVectors; vec++) {
        final int firstCopy = vec * nValues;
        for (int val = 0; val < nValues; val++) {
            emitVectorCompareInst(asm, kind, size, vecArray[firstCopy + val], vecCmp[val]);
            if (val % 2 != 0) {
                // fold every odd result into its even predecessor
                emitPOR(asm, size, vecArray[firstCopy + val - 1], vecArray[firstCopy + val]);
            }
        }
        if (nValues > 2) {
            // fold the second pair into the first
            emitPOR(asm, size, vecArray[firstCopy], vecArray[firstCopy + 2]);
        }
        emitMOVMSK(asm, size, cmpResult[0], vecArray[firstCopy]);
        emitJnz(asm, cmpResult[0], vectorFound[nVectors - (vec + 1)], shortJmp);
    }
}
/**
 * Emits a 32-bit test of {@code cond} against itself followed by a jump to {@code tgt} that is
 * taken when the register is non-zero.
 *
 * @param shortJmp selects the short jump encoding ({@code jccb}) instead of {@code jcc}
 */
private static void emitJnz(AMD64MacroAssembler asm, Register cond, Label tgt, boolean shortJmp) {
    asm.testl(cond, cond);
    if (!shortJmp) {
        asm.jcc(AMD64Assembler.ConditionFlag.NotZero, tgt);
        return;
    }
    asm.jccb(AMD64Assembler.ConditionFlag.NotZero, tgt);
}
/**
 * Loads one vector from {@code arrayPtr + index * arrayIndexScale + offset} into
 * {@code vecDst}.
 *
 * @param alignedLoad requests the aligned load instruction; only honoured on the AVX path, the
 *            SSE path always emits an unaligned load
 */
private void emitArrayLoad(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register vecDst, Register arrayPtr, Register index, int offset, boolean alignedLoad) {
    final AMD64Address location = new AMD64Address(arrayPtr, index, arrayIndexScale, offset);
    if (!asm.supports(CPUFeature.AVX)) {
        // SSE
        asm.movdqu(vecDst, location);
        return;
    }
    VexMoveOp vexLoad;
    if (alignedLoad) {
        vexLoad = VexMoveOp.VMOVDQA32;
    } else {
        vexLoad = VexMoveOp.VMOVDQU32;
    }
    vexLoad.emit(asm, vectorSize, vecDst, location);
}
/**
 * Emits a packed equality compare of {@code vecArray} against {@code vecCmp}, writing the result
 * back into {@code vecArray}: lanes that are equal become all ones, all other lanes become zero.
 * The lane width follows {@code kind} (byte, word for {@code Short}/{@code Char}, dword for
 * {@code Int}).
 *
 * @throws UnsupportedOperationException for any other kind
 */
private static void emitVectorCompareInst(AMD64MacroAssembler asm, JavaKind kind, AVXKind.AVXSize vectorSize, Register vecArray, Register vecCmp) {
    // pick the VEX opcode first; this also rejects unsupported kinds up front
    final VexRVMOp vexCompare;
    switch (kind) {
        case Byte:
            vexCompare = VexRVMOp.VPCMPEQB;
            break;
        case Short:
        case Char:
            vexCompare = VexRVMOp.VPCMPEQW;
            break;
        case Int:
            vexCompare = VexRVMOp.VPCMPEQD;
            break;
        default:
            throw new UnsupportedOperationException();
    }
    if (asm.supports(CPUFeature.AVX)) {
        vexCompare.emit(asm, vectorSize, vecArray, vecCmp, vecArray);
        return;
    }
    // SSE
    switch (kind) {
        case Byte:
            asm.pcmpeqb(vecArray, vecCmp);
            break;
        case Int:
            asm.pcmpeqd(vecArray, vecCmp);
            break;
        default:
            // Short and Char
            asm.pcmpeqw(vecArray, vecCmp);
            break;
    }
}
/** Emits a bitwise OR of {@code vecSrc} into {@code dst} (VEX encoded when AVX is available). */
private static void emitPOR(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
    if (!asm.supports(CPUFeature.AVX)) {
        // SSE
        asm.por(dst, vecSrc);
        return;
    }
    VexRVMOp.VPOR.emit(asm, vectorSize, dst, dst, vecSrc);
}
/**
 * Emits a byte mask move from the vector register {@code vecSrc} into the general purpose
 * register {@code dst} (VEX encoded when AVX is available).
 */
private static void emitMOVMSK(AMD64MacroAssembler asm, AVXKind.AVXSize vectorSize, Register dst, Register vecSrc) {
    if (!asm.supports(CPUFeature.AVX)) {
        // SSE
        asm.pmovmskb(dst, vecSrc);
        return;
    }
    VexRMOp.VPMOVMSKB.emit(asm, vectorSize, dst, vecSrc);
}
/**
 * Maps a Java kind to the operand size of a scalar instruction: {@code BYTE} for bytes,
 * {@code WORD} for shorts and chars, {@code DWORD} for ints and {@code QWORD} for everything
 * else.
 */
private static OperandSize getOpSize(JavaKind kind) {
    final OperandSize size;
    switch (kind) {
        case Byte:
            size = OperandSize.BYTE;
            break;
        case Short:
        case Char:
            size = OperandSize.WORD;
            break;
        case Int:
            size = OperandSize.DWORD;
            break;
        default:
            size = OperandSize.QWORD;
            break;
    }
    return size;
}
/** Tells whether the target of {@code tool} lists AVX2 among its CPU features. */
private static boolean supportsAVX2(LIRGeneratorTool tool) {
    final CPUFeature feature = CPUFeature.AVX2;
    return supports(tool, feature);
}
/**
 * Tells whether the target architecture of {@code tool}, which is cast to {@link AMD64}, lists
 * {@code cpuFeature} among its features.
 */
private static boolean supports(LIRGeneratorTool tool, CPUFeature cpuFeature) {
    final AMD64 arch = (AMD64) tool.target().arch;
    return arch.getFeatures().contains(cpuFeature);
}
/**
 * Always returns {@code true} for this instruction.
 * NOTE(review): presumably this asks the backend to clear the upper halves of the vector
 * registers after this instruction because it may use 256-bit vectors; confirm against the
 * overridden declaration.
 */
@Override
public boolean needsClearUpperVectorRegisters() {
return true;
}
}