# HG changeset patch
# User jiangli
# Date 1502835558 14400
# Node ID a27c007d05bbd0f2128eab6cdeb6fe7fecc44f0a
# Parent  f64eb2bfb250e7a4510d488e10bd3ed904b7e6ad
# Parent  58689a7ca4e060531708736281bf64c9e51ae727
Merge

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/src/cpu/aarch64/vm/aarch64.ad
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad	Tue Aug 15 18:19:18 2017 -0400
@@ -16102,6 +16102,16 @@
   ins_pipe(pipe_class_memory);
 %}
 
+instruct has_negatives(iRegP_R1 ary1, iRegI_R2 len, iRegI_R0 result, rFlagsReg cr)
+%{
+  match(Set result (HasNegatives ary1 len));
+  effect(USE_KILL ary1, USE_KILL len, KILL cr);
+  format %{ "has negatives byte[] $ary1,$len -> $result" %}
+  ins_encode %{
+    __ has_negatives($ary1$$Register, $len$$Register, $result$$Register);
+  %}
+  ins_pipe( pipe_slow );
+%}
 
 // fast char[] to byte[] compression
 instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len,

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp
--- a/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp	Tue Aug 15 18:19:18 2017 -0400
@@ -154,8 +154,11 @@
   product(intx, BlockZeroingLowLimit, 256,                              \
           "Minimum size in bytes when block zeroing will be used")      \
           range(1, max_jint)                                            \
-  product(bool, TraceTraps, false, "Trace all traps the signal handler")
-
+  product(bool, TraceTraps, false, "Trace all traps the signal handler")\
+  product(int, SoftwarePrefetchHintDistance, -1,                        \
+          "Use prfm hint with specified distance in compiled code. "    \
+          "Value -1 means off.")                                        \
+          range(-1, 32760)
 
 #endif

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp	Tue Aug 15 18:19:18 2017 -0400
@@ -4829,6 +4829,62 @@
   BLOCK_COMMENT("} string_compare");
 }
 
+// Checks whether the provided byte array contains a byte with the highest
+// bit set.
+void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
+  // The simple and most common case, a small aligned array that is not at
+  // the end of a memory page, is handled inline here. All other cases are
+  // handled in the stub.
+  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
+  const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
+  assert_different_registers(ary1, len, result);
+
+  cmpw(len, 0);
+  br(LE, SET_RESULT);
+  cmpw(len, 4 * wordSize);
+  br(GE, STUB_LONG); // if size >= 32, go to the long stub
+
+  int shift = 64 - exact_log2(os::vm_page_size());
+  lsl(rscratch1, ary1, shift);
+  mov(rscratch2, (size_t)(4 * wordSize) << shift);
+  adds(rscratch2, rscratch1, rscratch2); // At end of page?
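+  // The lsl leaves only ary1's offset within its page, scaled into the top
+  // bits of rscratch1; adding the equally scaled 4 * wordSize then sets the
+  // carry flag exactly when offset + 32 reaches or passes the page end, i.e.
+  // when the inline loop's reads could touch the next page.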
+  br(CS, STUB); // if at the end of a page, go to the stub
+  subs(len, len, wordSize);
+  br(LT, END);
+
+  BIND(LOOP);
+  ldr(rscratch1, Address(post(ary1, wordSize)));
+  tst(rscratch1, UPPER_BIT_MASK);
+  br(NE, SET_RESULT);
+  subs(len, len, wordSize);
+  br(GE, LOOP);
+  cmpw(len, -wordSize);
+  br(EQ, SET_RESULT);
+
+  BIND(END);
+  ldr(result, Address(ary1));
+  sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
+  lslv(result, result, len);
+  tst(result, UPPER_BIT_MASK);
+  b(SET_RESULT);
+
+  BIND(STUB);
+  RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
+  assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
+  trampoline_call(has_neg);
+  b(DONE);
+
+  BIND(STUB_LONG);
+  RuntimeAddress has_neg_long = RuntimeAddress(
+      StubRoutines::aarch64::has_negatives_long());
+  assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
+  trampoline_call(has_neg_long);
+  b(DONE);
+
+  BIND(SET_RESULT);
+  cset(result, NE); // set true or false
+
+  BIND(DONE);
+}
+
 // Compare Strings or char/byte arrays.
 
 // is_string is true iff this is a string comparison.

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp
--- a/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp	Tue Aug 15 18:19:18 2017 -0400
@@ -1210,6 +1210,8 @@
                       Register tmp1,
                       FloatRegister vtmp, FloatRegister vtmpZ, int ae);
 
+  void has_negatives(Register ary1, Register len, Register result);
+
   void arrays_equals(Register a1, Register a2,
                      Register result, Register cnt1,
                      int elem_size, bool is_string);

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
--- a/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp	Tue Aug 15 18:19:18 2017 -0400
@@ -3670,6 +3670,167 @@
     __ eor(result, __ T16B, lo, t0);
   }
 
+  address generate_has_negatives(address &has_negatives_long) {
+    StubCodeMark mark(this, "StubRoutines", "has_negatives");
+    const int large_loop_size = 64;
+    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
+    int dcache_line = VM_Version::dcache_line_size();
+
+    Register ary1 = r1, len = r2, result = r0;
+
+    __ align(CodeEntryAlignment);
+    address entry = __ pc();
+
+    __ enter();
+
+    Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
+          LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
+
+    __ cmp(len, 15);
+    __ br(Assembler::GT, LEN_OVER_15);
+    // Execution falls into this code only when the pointer is near the end of
+    // a memory page and we have to avoid reading the next page.
+    __ add(ary1, ary1, len);
+    __ subs(len, len, 8);
+    __ br(Assembler::GT, LEN_OVER_8);
+    __ ldr(rscratch2, Address(ary1, -8));
+    __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
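+    // Here len = original length - 8 <= 0, so rscratch1 = -len * 8 is the bit
+    // width of the bytes that were loaded from before the array start; the
+    // shift right below discards exactly those low-order bytes.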
+    __ lsrv(rscratch2, rscratch2, rscratch1);
+    __ tst(rscratch2, UPPER_BIT_MASK);
+    __ cset(result, Assembler::NE);
+    __ leave();
+    __ ret(lr);
+    __ bind(LEN_OVER_8);
+    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
+    __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
+    __ tst(rscratch2, UPPER_BIT_MASK);
+    __ br(Assembler::NE, RET_TRUE_NO_POP);
+    __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
+    __ lsrv(rscratch1, rscratch1, rscratch2);
+    __ tst(rscratch1, UPPER_BIT_MASK);
+    __ cset(result, Assembler::NE);
+    __ leave();
+    __ ret(lr);
+
+    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
+    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
+
+    has_negatives_long = __ pc(); // 2nd entry point
+
+    __ enter();
+
+    __ bind(LEN_OVER_15);
+    __ push(spilled_regs, sp);
+    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
+    __ cbz(rscratch2, ALIGNED);
+    __ ldp(tmp6, tmp1, Address(ary1));
+    __ mov(tmp5, 16);
+    __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
+    __ add(ary1, ary1, rscratch1);
+    __ sub(len, len, rscratch1);
+    __ orr(tmp6, tmp6, tmp1);
+    __ tst(tmp6, UPPER_BIT_MASK);
+    __ br(Assembler::NE, RET_TRUE);
+
+    __ bind(ALIGNED);
+    __ cmp(len, large_loop_size);
+    __ br(Assembler::LT, CHECK_16);
+    // Perform a 16-byte load here as an early return, to handle the case
+    // where an initially aligned large array has negative values in its
+    // starting bytes: without it, LARGE_LOOP would do 4 reads instead of 1 in
+    // the worst case, which is slower. Cases with negative bytes further
+    // ahead are not affected much; in fact they get faster, thanks to the
+    // early loads and the fewer instructions and branches in LARGE_LOOP.
+    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
+    __ sub(len, len, 16);
+    __ orr(tmp6, tmp6, tmp1);
+    __ tst(tmp6, UPPER_BIT_MASK);
+    __ br(Assembler::NE, RET_TRUE);
+    __ cmp(len, large_loop_size);
+    __ br(Assembler::LT, CHECK_16);
+
+    if (SoftwarePrefetchHintDistance >= 0
+        && SoftwarePrefetchHintDistance >= dcache_line) {
+      // initial prefetch
+      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
+    }
+    __ bind(LARGE_LOOP);
+    if (SoftwarePrefetchHintDistance >= 0) {
+      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
+    }
+    // Issue the load instructions first, since that can save a few CPU/memory
+    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);"
+    // (one per ldp), it is better to generate 7 * orr(...) + 1 andr(...) +
+    // 1 cbnz(...), which saves 3 instructions per iteration and has fewer
+    // branches; the trade-off is that this disables the early return, so all
+    // 64 bytes are loaded and checked on every iteration.
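+    // Load all 64 bytes into eight registers, then OR-reduce them pairwise
+    // into a single register, so one tst against UPPER_BIT_MASK covers the
+    // whole 64-byte block.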
+    __ ldp(tmp2, tmp3, Address(ary1));
+    __ ldp(tmp4, tmp5, Address(ary1, 16));
+    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
+    __ ldp(tmp6, tmp1, Address(ary1, 48));
+    __ add(ary1, ary1, large_loop_size);
+    __ sub(len, len, large_loop_size);
+    __ orr(tmp2, tmp2, tmp3);
+    __ orr(tmp4, tmp4, tmp5);
+    __ orr(rscratch1, rscratch1, rscratch2);
+    __ orr(tmp6, tmp6, tmp1);
+    __ orr(tmp2, tmp2, tmp4);
+    __ orr(rscratch1, rscratch1, tmp6);
+    __ orr(tmp2, tmp2, rscratch1);
+    __ tst(tmp2, UPPER_BIT_MASK);
+    __ br(Assembler::NE, RET_TRUE);
+    __ cmp(len, large_loop_size);
+    __ br(Assembler::GE, LARGE_LOOP);
+
+    __ bind(CHECK_16); // small 16-byte load pre-loop
+    __ cmp(len, 16);
+    __ br(Assembler::LT, POST_LOOP16);
+
+    __ bind(LOOP16); // small 16-byte load loop
+    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
+    __ sub(len, len, 16);
+    __ orr(tmp2, tmp2, tmp3);
+    __ tst(tmp2, UPPER_BIT_MASK);
+    __ br(Assembler::NE, RET_TRUE);
+    __ cmp(len, 16);
+    __ br(Assembler::GE, LOOP16); // 16-byte load loop end
+
+    __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
+    __ cmp(len, 8);
+    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
+    __ ldr(tmp3, Address(__ post(ary1, 8)));
+    __ sub(len, len, 8);
+    __ tst(tmp3, UPPER_BIT_MASK);
+    __ br(Assembler::NE, RET_TRUE);
+
+    __ bind(POST_LOOP16_LOAD_TAIL);
+    __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
+    __ ldr(tmp1, Address(ary1));
+    __ mov(tmp2, 64);
+    __ sub(tmp4, tmp2, len, __ LSL, 3);
+    __ lslv(tmp1, tmp1, tmp4);
+    __ tst(tmp1, UPPER_BIT_MASK);
+    __ br(Assembler::NE, RET_TRUE);
+    // Fallthrough
+
+    __ bind(RET_FALSE);
+    __ pop(spilled_regs, sp);
+    __ leave();
+    __ mov(result, zr);
+    __ ret(lr);
+
+    __ bind(RET_TRUE);
+    __ pop(spilled_regs, sp);
+    __ bind(RET_TRUE_NO_POP);
+    __ leave();
+    __ mov(result, 1);
+    __ ret(lr);
+
+    __ bind(DONE);
+    __ pop(spilled_regs, sp);
+    __ leave();
+    __ ret(lr);
+    return entry;
+  }
+
   /**
    * Arguments:
    *
@@ -4686,6 +4847,7 @@
   //   }
   };
 
+
   // Initialization
   void generate_initial() {
     // Generate initial stubs and initializes the entry points
@@ -4744,6 +4906,9 @@
     // arraycopy stubs used by compilers
     generate_arraycopy_stubs();
 
+    // has negatives stub for large arrays.
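+    // generate_has_negatives() returns the entry point for short arrays and
+    // stores the second entry point, for long arrays, through its reference
+    // argument.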
+    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
+
     if (UseMultiplyToLenIntrinsic) {
       StubRoutines::_multiplyToLen = generate_multiplyToLen();
     }

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp	Tue Aug 15 18:19:18 2017 -0400
@@ -44,6 +44,8 @@
 address StubRoutines::aarch64::_double_sign_mask = NULL;
 address StubRoutines::aarch64::_double_sign_flip = NULL;
 address StubRoutines::aarch64::_zero_blocks = NULL;
+address StubRoutines::aarch64::_has_negatives = NULL;
+address StubRoutines::aarch64::_has_negatives_long = NULL;
 bool StubRoutines::aarch64::_completed = false;
 
 /**

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp
--- a/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp	Tue Aug 15 18:19:18 2017 -0400
@@ -62,6 +62,9 @@
   static address _double_sign_flip;
   static address _zero_blocks;
 
+
+  static address _has_negatives;
+  static address _has_negatives_long;
   static bool _completed;
 
  public:
@@ -120,6 +123,14 @@
     return _zero_blocks;
   }
 
+  static address has_negatives() {
+    return _has_negatives;
+  }
+
+  static address has_negatives_long() {
+    return _has_negatives_long;
+  }
+
   static bool complete() {
     return _completed;
   }

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp	Tue Aug 15 18:19:18 2017 -0400
@@ -137,6 +137,8 @@
     FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 3*dcache_line);
   if (FLAG_IS_DEFAULT(PrefetchCopyIntervalInBytes))
     FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 3*dcache_line);
+  if (FLAG_IS_DEFAULT(SoftwarePrefetchHintDistance))
+    FLAG_SET_DEFAULT(SoftwarePrefetchHintDistance, 3*dcache_line);
 
   if (PrefetchCopyIntervalInBytes != -1 &&
       ((PrefetchCopyIntervalInBytes & 7) || (PrefetchCopyIntervalInBytes >= 32768))) {
@@ -146,6 +148,12 @@
     PrefetchCopyIntervalInBytes = 32760;
   }
 
+  if (SoftwarePrefetchHintDistance != -1 &&
+      (SoftwarePrefetchHintDistance & 7)) {
+    warning("SoftwarePrefetchHintDistance must be -1, or a multiple of 8");
+    SoftwarePrefetchHintDistance &= ~7;
+  }
+
   unsigned long auxv = getauxval(AT_HWCAP);
 
   char buf[512];

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/src/os/linux/vm/os_linux.cpp
--- a/hotspot/src/os/linux/vm/os_linux.cpp	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/src/os/linux/vm/os_linux.cpp	Tue Aug 15 18:19:18 2017 -0400
@@ -2201,6 +2201,8 @@
 #if defined(AMD64) || defined(IA32) || defined(X32)
 const char* search_string = "model name";
+#elif defined(M68K)
+const char* search_string = "CPU";
 #elif defined(PPC64)
 const char* search_string = "cpu";
 #elif defined(S390)

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/src/share/vm/runtime/arguments.cpp
--- a/hotspot/src/share/vm/runtime/arguments.cpp	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/src/share/vm/runtime/arguments.cpp	Tue Aug 15 18:19:18 2017 -0400
@@ -4462,6 +4462,16 @@
 
   set_shared_spaces_flags();
 
+#if defined(SPARC)
+  // BIS instructions require a 'membar' instruction regardless of the number
+  // of CPUs, because in virtualized/container environments that might expose
+  // only 1 CPU, BIS instructions may otherwise produce incorrect results.
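+  // Defaulting AssumeMP to true therefore keeps the membars in generated
+  // code even when the VM observes a single processor.
+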
+  if (FLAG_IS_DEFAULT(AssumeMP)) {
+    FLAG_SET_DEFAULT(AssumeMP, true);
+  }
+#endif
+
   // Check the GC selections again.
   if (!check_gc_consistency()) {
     return JNI_EINVAL;

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/test/testlibrary/ctw/src/sun/hotspot/tools/ctw/ClassPathJimageEntry.java
--- a/hotspot/test/testlibrary/ctw/src/sun/hotspot/tools/ctw/ClassPathJimageEntry.java	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/test/testlibrary/ctw/src/sun/hotspot/tools/ctw/ClassPathJimageEntry.java	Tue Aug 15 18:19:18 2017 -0400
@@ -43,9 +43,16 @@
         return Arrays.stream(reader.getEntryNames())
                      .filter(name -> name.endsWith(".class"))
                      .filter(name -> !name.endsWith("module-info.class"))
+                     .map(ClassPathJimageEntry::toFileName)
                      .map(Utils::fileNameToClassName);
     }
 
+    private static String toFileName(String name) {
+        final char nameSeparator = '/';
+        assert name.charAt(0) == nameSeparator : name;
+        return name.substring(name.indexOf(nameSeparator, 1) + 1);
+    }
+
     @Override
     protected String description() {
         return "# jimage: " + root;

diff -r f64eb2bfb250 -r a27c007d05bb hotspot/test/testlibrary/ctw/src/sun/hotspot/tools/ctw/Utils.java
--- a/hotspot/test/testlibrary/ctw/src/sun/hotspot/tools/ctw/Utils.java	Tue Aug 15 18:13:20 2017 -0400
+++ b/hotspot/test/testlibrary/ctw/src/sun/hotspot/tools/ctw/Utils.java	Tue Aug 15 18:19:18 2017 -0400
@@ -203,18 +203,14 @@
      * Converts the filename to classname.
      *
      * @param filename filename to convert
-     * @return corresponding classname.
+     * @return corresponding classname
      * @throws AssertionError if filename isn't valid filename for class file
-     *                        {@link #isClassFile(String)}
      */
     public static String fileNameToClassName(String filename) {
         assert isClassFile(filename);
-        // workaround for the class naming in jimage :
         final char nameSeparator = '/';
-        int nameStart = filename.charAt(0) == nameSeparator
-                ? filename.indexOf(nameSeparator, 1) + 1
-                : 0;
-        return filename.substring(nameStart, filename.length() - CLASSFILE_EXT.length())
+        return filename.substring(0, filename.length() - CLASSFILE_EXT.length())
                 .replace(nameSeparator, '.');
     }
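
For reference, a rough sketch of the jimage entry-name mapping that the two
CTW changes above implement; the entry-name value and class name below are
illustrative, not taken from the patch:

    // jimage entry names carry a leading "/<module>/" prefix, which
    // ClassPathJimageEntry.toFileName() strips before
    // Utils.fileNameToClassName() drops the ".class" suffix and converts
    // '/' to '.'.
    public class JimageNameMappingExample {
        public static void main(String[] args) {
            String entry = "/java.base/java/lang/Object.class"; // hypothetical entry name
            // as in toFileName(): strip the "/<module>/" prefix
            String fileName = entry.substring(entry.indexOf('/', 1) + 1);
            // as in fileNameToClassName(): drop ".class", convert separators
            String className = fileName
                    .substring(0, fileName.length() - ".class".length())
                    .replace('/', '.');
            System.out.println(className); // prints "java.lang.Object"
        }
    }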