# HG changeset patch # User mdoerr # Date 1496413959 -7200 # Node ID 17e8acfe1db872dff1e83534d3330dba5fd052bd # Parent de5cb3eed39ba6b48cb018de8a4d365f4fb8d3c6 8179527: Implement intrinsic code for reverseBytes with load/store Reviewed-by: simonis, mdoerr Contributed-by: Michihiro Horie diff -r de5cb3eed39b -r 17e8acfe1db8 hotspot/src/cpu/ppc/vm/assembler_ppc.hpp --- a/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp Fri Jun 02 13:48:01 2017 +0200 +++ b/hotspot/src/cpu/ppc/vm/assembler_ppc.hpp Fri Jun 02 16:32:39 2017 +0200 @@ -376,10 +376,12 @@ STWX_OPCODE = (31u << OPCODE_SHIFT | 151u << 1), STWU_OPCODE = (37u << OPCODE_SHIFT), STWUX_OPCODE = (31u << OPCODE_SHIFT | 183u << 1), + STWBRX_OPCODE = (31u << OPCODE_SHIFT | 662u << 1), STH_OPCODE = (44u << OPCODE_SHIFT), STHX_OPCODE = (31u << OPCODE_SHIFT | 407u << 1), STHU_OPCODE = (45u << OPCODE_SHIFT), + STHBRX_OPCODE = (31u << OPCODE_SHIFT | 918u << 1), STB_OPCODE = (38u << OPCODE_SHIFT), STBX_OPCODE = (31u << OPCODE_SHIFT | 215u << 1), @@ -401,11 +403,13 @@ LD_OPCODE = (58u << OPCODE_SHIFT | 0u << XO_30_31_SHIFT), // DS-FORM LDU_OPCODE = (58u << OPCODE_SHIFT | 1u << XO_30_31_SHIFT), // DS-FORM LDX_OPCODE = (31u << OPCODE_SHIFT | 21u << XO_21_30_SHIFT), // X-FORM + LDBRX_OPCODE = (31u << OPCODE_SHIFT | 532u << 1), // X-FORM STD_OPCODE = (62u << OPCODE_SHIFT | 0u << XO_30_31_SHIFT), // DS-FORM STDU_OPCODE = (62u << OPCODE_SHIFT | 1u << XO_30_31_SHIFT), // DS-FORM - STDUX_OPCODE = (31u << OPCODE_SHIFT | 181u << 1), // X-FORM + STDUX_OPCODE = (31u << OPCODE_SHIFT | 181u << 1), // X-FORM STDX_OPCODE = (31u << OPCODE_SHIFT | 149u << XO_21_30_SHIFT), // X-FORM + STDBRX_OPCODE = (31u << OPCODE_SHIFT | 660u << 1), // X-FORM RLDICR_OPCODE = (30u << OPCODE_SHIFT | 1u << XO_27_29_SHIFT), // MD-FORM RLDICL_OPCODE = (30u << OPCODE_SHIFT | 0u << XO_27_29_SHIFT), // MD-FORM @@ -1552,6 +1556,9 @@ inline void ld( Register d, int si16, Register s1); inline void ldu( Register d, int si16, Register s1); + // 8 bytes reversed + inline void ldbrx( Register d, Register s1, Register s2); + // For convenience. Load pointer into d from b+s1. inline void ld_ptr(Register d, int b, Register s1); DEBUG_ONLY(inline void ld_ptr(Register d, ByteSize b, Register s1);) @@ -1560,10 +1567,12 @@ inline void stwx( Register d, Register s1, Register s2); inline void stw( Register d, int si16, Register s1); inline void stwu( Register d, int si16, Register s1); + inline void stwbrx( Register d, Register s1, Register s2); inline void sthx( Register d, Register s1, Register s2); inline void sth( Register d, int si16, Register s1); inline void sthu( Register d, int si16, Register s1); + inline void sthbrx( Register d, Register s1, Register s2); inline void stbx( Register d, Register s1, Register s2); inline void stb( Register d, int si16, Register s1); @@ -1573,6 +1582,7 @@ inline void std( Register d, int si16, Register s1); inline void stdu( Register d, int si16, Register s1); inline void stdux(Register s, Register a, Register b); + inline void stdbrx( Register d, Register s1, Register s2); inline void st_ptr(Register d, int si16, Register s1); DEBUG_ONLY(inline void st_ptr(Register d, ByteSize b, Register s1);) @@ -2182,14 +2192,18 @@ inline void lbz( Register d, int si16); inline void ldx( Register d, Register s2); inline void ld( Register d, int si16); + inline void ldbrx(Register d, Register s2); inline void stwx( Register d, Register s2); inline void stw( Register d, int si16); + inline void stwbrx( Register d, Register s2); inline void sthx( Register d, Register s2); inline void sth( Register d, int si16); + inline void sthbrx( Register d, Register s2); inline void stbx( Register d, Register s2); inline void stb( Register d, int si16); inline void stdx( Register d, Register s2); inline void std( Register d, int si16); + inline void stdbrx( Register d, Register s2); // PPC 2, section 3.2.1 Instruction Cache Instructions inline void icbi( Register s2); diff -r de5cb3eed39b -r 17e8acfe1db8 hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp --- a/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp Fri Jun 02 13:48:01 2017 +0200 +++ b/hotspot/src/cpu/ppc/vm/assembler_ppc.inline.hpp Fri Jun 02 16:32:39 2017 +0200 @@ -327,6 +327,7 @@ inline void Assembler::ld( Register d, int si16, Register s1) { emit_int32(LD_OPCODE | rt(d) | ds(si16) | ra0mem(s1));} inline void Assembler::ldx( Register d, Register s1, Register s2) { emit_int32(LDX_OPCODE | rt(d) | ra0mem(s1) | rb(s2));} inline void Assembler::ldu( Register d, int si16, Register s1) { assert(d != s1, "according to ibm manual"); emit_int32(LDU_OPCODE | rt(d) | ds(si16) | rta0mem(s1));} +inline void Assembler::ldbrx( Register d, Register s1, Register s2) { emit_int32(LDBRX_OPCODE | rt(d) | ra0mem(s1) | rb(s2));} inline void Assembler::ld_ptr(Register d, int b, Register s1) { ld(d, b, s1); } DEBUG_ONLY(inline void Assembler::ld_ptr(Register d, ByteSize b, Register s1) { ld(d, in_bytes(b), s1); }) @@ -335,10 +336,12 @@ inline void Assembler::stwx( Register d, Register s1, Register s2) { emit_int32(STWX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));} inline void Assembler::stw( Register d, int si16, Register s1) { emit_int32(STW_OPCODE | rs(d) | d1(si16) | ra0mem(s1));} inline void Assembler::stwu( Register d, int si16, Register s1) { emit_int32(STWU_OPCODE | rs(d) | d1(si16) | rta0mem(s1));} +inline void Assembler::stwbrx( Register d, Register s1, Register s2) { emit_int32(STWBRX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));} inline void Assembler::sthx( Register d, Register s1, Register s2) { emit_int32(STHX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));} inline void Assembler::sth( Register d, int si16, Register s1) { emit_int32(STH_OPCODE | rs(d) | d1(si16) | ra0mem(s1));} inline void Assembler::sthu( Register d, int si16, Register s1) { emit_int32(STHU_OPCODE | rs(d) | d1(si16) | rta0mem(s1));} +inline void Assembler::sthbrx( Register d, Register s1, Register s2) { emit_int32(STHBRX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));} inline void Assembler::stbx( Register d, Register s1, Register s2) { emit_int32(STBX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));} inline void Assembler::stb( Register d, int si16, Register s1) { emit_int32(STB_OPCODE | rs(d) | d1(si16) | ra0mem(s1));} @@ -348,6 +351,7 @@ inline void Assembler::stdx( Register d, Register s1, Register s2) { emit_int32(STDX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));} inline void Assembler::stdu( Register d, int si16, Register s1) { emit_int32(STDU_OPCODE | rs(d) | ds(si16) | rta0mem(s1));} inline void Assembler::stdux(Register s, Register a, Register b) { emit_int32(STDUX_OPCODE| rs(s) | rta0mem(a) | rb(b));} +inline void Assembler::stdbrx( Register d, Register s1, Register s2) { emit_int32(STDBRX_OPCODE | rs(d) | ra0mem(s1) | rb(s2));} inline void Assembler::st_ptr(Register d, int b, Register s1) { std(d, b, s1); } DEBUG_ONLY(inline void Assembler::st_ptr(Register d, ByteSize b, Register s1) { std(d, in_bytes(b), s1); }) @@ -944,14 +948,18 @@ inline void Assembler::lbz( Register d, int si16 ) { emit_int32( LBZ_OPCODE | rt(d) | d1(si16));} inline void Assembler::ld( Register d, int si16 ) { emit_int32( LD_OPCODE | rt(d) | ds(si16));} inline void Assembler::ldx( Register d, Register s2) { emit_int32( LDX_OPCODE | rt(d) | rb(s2));} +inline void Assembler::ldbrx(Register d, Register s2) { emit_int32( LDBRX_OPCODE| rt(d) | rb(s2));} inline void Assembler::stwx( Register d, Register s2) { emit_int32( STWX_OPCODE | rs(d) | rb(s2));} inline void Assembler::stw( Register d, int si16 ) { emit_int32( STW_OPCODE | rs(d) | d1(si16));} +inline void Assembler::stwbrx(Register d, Register s2){ emit_int32(STWBRX_OPCODE| rs(d) | rb(s2));} inline void Assembler::sthx( Register d, Register s2) { emit_int32( STHX_OPCODE | rs(d) | rb(s2));} inline void Assembler::sth( Register d, int si16 ) { emit_int32( STH_OPCODE | rs(d) | d1(si16));} +inline void Assembler::sthbrx(Register d, Register s2){ emit_int32(STHBRX_OPCODE| rs(d) | rb(s2));} inline void Assembler::stbx( Register d, Register s2) { emit_int32( STBX_OPCODE | rs(d) | rb(s2));} inline void Assembler::stb( Register d, int si16 ) { emit_int32( STB_OPCODE | rs(d) | d1(si16));} inline void Assembler::std( Register d, int si16 ) { emit_int32( STD_OPCODE | rs(d) | ds(si16));} inline void Assembler::stdx( Register d, Register s2) { emit_int32( STDX_OPCODE | rs(d) | rb(s2));} +inline void Assembler::stdbrx(Register d, Register s2){ emit_int32(STDBRX_OPCODE| rs(d) | rb(s2));} // ra0 version inline void Assembler::icbi( Register s2) { emit_int32( ICBI_OPCODE | rb(s2) ); } diff -r de5cb3eed39b -r 17e8acfe1db8 hotspot/src/cpu/ppc/vm/ppc.ad --- a/hotspot/src/cpu/ppc/vm/ppc.ad Fri Jun 02 13:48:01 2017 +0200 +++ b/hotspot/src/cpu/ppc/vm/ppc.ad Fri Jun 02 16:32:39 2017 +0200 @@ -5842,6 +5842,16 @@ ins_pipe(pipe_class_default); %} +instruct rldicl(iRegLdst dst, iRegLsrc src, immI16 shift, immI16 mask_begin) %{ + effect(DEF dst, USE src, USE shift, USE mask_begin); + + size(4); + ins_encode %{ + __ rldicl($dst$$Register, $src$$Register, $shift$$constant, $mask_begin$$constant); + %} + ins_pipe(pipe_class_default); +%} + // Needed to postalloc expand loadConN: ConN is loaded as ConI // leaving the upper 32 bits with sign-extension bits. // This clears these bits: dst = src & 0xFFFFFFFF. @@ -10519,6 +10529,16 @@ ins_pipe(pipe_class_default); %} +instruct extsh(iRegIdst dst, iRegIsrc src) %{ + effect(DEF dst, USE src); + + size(4); + ins_encode %{ + __ extsh($dst$$Register, $src$$Register); + %} + ins_pipe(pipe_class_default); +%} + // LShiftI 16 + RShiftI 16 converts short to int. instruct convS2I_reg(iRegIdst dst, iRegIsrc src, immI_16 amount) %{ match(Set dst (RShiftI (LShiftI src amount) amount)); @@ -12682,8 +12702,7 @@ // Just slightly faster than java implementation. instruct bytes_reverse_int_Ex(iRegIdst dst, iRegIsrc src) %{ match(Set dst (ReverseBytesI src)); - predicate(UseCountLeadingZerosInstructionsPPC64); - ins_cost(DEFAULT_COST); + ins_cost(7*DEFAULT_COST); expand %{ immI16 imm24 %{ (int) 24 %} @@ -12705,6 +12724,172 @@ %} %} +instruct bytes_reverse_long_Ex(iRegLdst dst, iRegLsrc src) %{ + match(Set dst (ReverseBytesL src)); + ins_cost(15*DEFAULT_COST); + + expand %{ + immI16 imm56 %{ (int) 56 %} + immI16 imm48 %{ (int) 48 %} + immI16 imm40 %{ (int) 40 %} + immI16 imm32 %{ (int) 32 %} + immI16 imm24 %{ (int) 24 %} + immI16 imm16 %{ (int) 16 %} + immI16 imm8 %{ (int) 8 %} + immI16 imm0 %{ (int) 0 %} + iRegLdst tmpL1; + iRegLdst tmpL2; + iRegLdst tmpL3; + iRegLdst tmpL4; + iRegLdst tmpL5; + iRegLdst tmpL6; + + // src : |a|b|c|d|e|f|g|h| + rldicl(tmpL1, src, imm8, imm24); // tmpL1 : | | | |e|f|g|h|a| + rldicl(tmpL2, tmpL1, imm32, imm24); // tmpL2 : | | | |a| | | |e| + rldicl(tmpL3, tmpL2, imm32, imm0); // tmpL3 : | | | |e| | | |a| + rldicl(tmpL1, src, imm16, imm24); // tmpL1 : | | | |f|g|h|a|b| + rldicl(tmpL2, tmpL1, imm32, imm24); // tmpL2 : | | | |b| | | |f| + rldicl(tmpL4, tmpL2, imm40, imm0); // tmpL4 : | | |f| | | |b| | + orL_reg_reg(tmpL5, tmpL3, tmpL4); // tmpL5 : | | |f|e| | |b|a| + rldicl(tmpL1, src, imm24, imm24); // tmpL1 : | | | |g|h|a|b|c| + rldicl(tmpL2, tmpL1, imm32, imm24); // tmpL2 : | | | |c| | | |g| + rldicl(tmpL3, tmpL2, imm48, imm0); // tmpL3 : | |g| | | |c| | | + rldicl(tmpL1, src, imm32, imm24); // tmpL1 : | | | |h|a|b|c|d| + rldicl(tmpL2, tmpL1, imm32, imm24); // tmpL2 : | | | |d| | | |h| + rldicl(tmpL4, tmpL2, imm56, imm0); // tmpL4 : |h| | | |d| | | | + orL_reg_reg(tmpL6, tmpL3, tmpL4); // tmpL6 : |h|g| | |d|c| | | + orL_reg_reg(dst, tmpL5, tmpL6); // dst : |h|g|f|e|d|c|b|a| + %} +%} + +instruct bytes_reverse_ushort_Ex(iRegIdst dst, iRegIsrc src) %{ + match(Set dst (ReverseBytesUS src)); + ins_cost(2*DEFAULT_COST); + + expand %{ + immI16 imm16 %{ (int) 16 %} + immI16 imm8 %{ (int) 8 %} + + urShiftI_reg_imm(dst, src, imm8); + insrwi(dst, src, imm16, imm8); + %} +%} + +instruct bytes_reverse_short_Ex(iRegIdst dst, iRegIsrc src) %{ + match(Set dst (ReverseBytesS src)); + ins_cost(3*DEFAULT_COST); + + expand %{ + immI16 imm16 %{ (int) 16 %} + immI16 imm8 %{ (int) 8 %} + iRegLdst tmpI1; + + urShiftI_reg_imm(tmpI1, src, imm8); + insrwi(tmpI1, src, imm16, imm8); + extsh(dst, tmpI1); + %} +%} + +// Load Integer reversed byte order +instruct loadI_reversed(iRegIdst dst, indirect mem) %{ + match(Set dst (ReverseBytesI (LoadI mem))); + ins_cost(MEMORY_REF_COST); + + size(4); + ins_encode %{ + __ lwbrx($dst$$Register, $mem$$Register); + %} + ins_pipe(pipe_class_default); +%} + +// Load Long - aligned and reversed +instruct loadL_reversed(iRegLdst dst, indirect mem) %{ + match(Set dst (ReverseBytesL (LoadL mem))); + predicate(VM_Version::has_ldbrx()); + ins_cost(MEMORY_REF_COST); + + size(4); + ins_encode %{ + __ ldbrx($dst$$Register, $mem$$Register); + %} + ins_pipe(pipe_class_default); +%} + +// Load unsigned short / char reversed byte order +instruct loadUS_reversed(iRegIdst dst, indirect mem) %{ + match(Set dst (ReverseBytesUS (LoadUS mem))); + ins_cost(MEMORY_REF_COST); + + size(4); + ins_encode %{ + __ lhbrx($dst$$Register, $mem$$Register); + %} + ins_pipe(pipe_class_default); +%} + +// Load short reversed byte order +instruct loadS_reversed(iRegIdst dst, indirect mem) %{ + match(Set dst (ReverseBytesS (LoadS mem))); + ins_cost(MEMORY_REF_COST + DEFAULT_COST); + + size(8); + ins_encode %{ + __ lhbrx($dst$$Register, $mem$$Register); + __ extsh($dst$$Register, $dst$$Register); + %} + ins_pipe(pipe_class_default); +%} + +// Store Integer reversed byte order +instruct storeI_reversed(iRegIsrc src, indirect mem) %{ + match(Set mem (StoreI mem (ReverseBytesI src))); + ins_cost(MEMORY_REF_COST); + + size(4); + ins_encode %{ + __ stwbrx($src$$Register, $mem$$Register); + %} + ins_pipe(pipe_class_default); +%} + +// Store Long reversed byte order +instruct storeL_reversed(iRegLsrc src, indirect mem) %{ + match(Set mem (StoreL mem (ReverseBytesL src))); + predicate(VM_Version::has_stdbrx()); + ins_cost(MEMORY_REF_COST); + + size(4); + ins_encode %{ + __ stdbrx($src$$Register, $mem$$Register); + %} + ins_pipe(pipe_class_default); +%} + +// Store unsigned short / char reversed byte order +instruct storeUS_reversed(iRegIsrc src, indirect mem) %{ + match(Set mem (StoreC mem (ReverseBytesUS src))); + ins_cost(MEMORY_REF_COST); + + size(4); + ins_encode %{ + __ sthbrx($src$$Register, $mem$$Register); + %} + ins_pipe(pipe_class_default); +%} + +// Store short reversed byte order +instruct storeS_reversed(iRegIsrc src, indirect mem) %{ + match(Set mem (StoreC mem (ReverseBytesS src))); + ins_cost(MEMORY_REF_COST); + + size(4); + ins_encode %{ + __ sthbrx($src$$Register, $mem$$Register); + %} + ins_pipe(pipe_class_default); +%} + //---------- Replicate Vector Instructions ------------------------------------ // Insrdi does replicate if src == dst. diff -r de5cb3eed39b -r 17e8acfe1db8 hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp --- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp Fri Jun 02 13:48:01 2017 +0200 +++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp Fri Jun 02 16:32:39 2017 +0200 @@ -111,7 +111,7 @@ // Create and print feature-string. char buf[(num_features+1) * 16]; // Max 16 chars per feature. jio_snprintf(buf, sizeof(buf), - "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + "ppc64%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", (has_fsqrt() ? " fsqrt" : ""), (has_isel() ? " isel" : ""), (has_lxarxeh() ? " lxarxeh" : ""), @@ -126,7 +126,9 @@ (has_vpmsumb() ? " vpmsumb" : ""), (has_tcheck() ? " tcheck" : ""), (has_mfdscr() ? " mfdscr" : ""), - (has_vsx() ? " vsx" : "") + (has_vsx() ? " vsx" : ""), + (has_ldbrx() ? " ldbrx" : ""), + (has_stdbrx() ? " stdbrx" : "") // Make sure number of %s matches num_features! ); _features_string = os::strdup(buf); @@ -663,6 +665,8 @@ a->tcheck(0); // code[12] -> tcheck a->mfdscr(R0); // code[13] -> mfdscr a->lxvd2x(VSR0, R3_ARG1); // code[14] -> vsx + a->ldbrx(R7, R3_ARG1, R4_ARG2); // code[15] -> ldbrx + a->stdbrx(R7, R3_ARG1, R4_ARG2); // code[16] -> stdbrx a->blr(); // Emit function to set one cache line to zero. Emit function descriptor and get pointer to it. @@ -712,6 +716,8 @@ if (code[feature_cntr++]) features |= tcheck_m; if (code[feature_cntr++]) features |= mfdscr_m; if (code[feature_cntr++]) features |= vsx_m; + if (code[feature_cntr++]) features |= ldbrx_m; + if (code[feature_cntr++]) features |= stdbrx_m; // Print the detection code. if (PrintAssembly) { diff -r de5cb3eed39b -r 17e8acfe1db8 hotspot/src/cpu/ppc/vm/vm_version_ppc.hpp --- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.hpp Fri Jun 02 13:48:01 2017 +0200 +++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.hpp Fri Jun 02 16:32:39 2017 +0200 @@ -47,6 +47,8 @@ tcheck, mfdscr, vsx, + ldbrx, + stdbrx, num_features // last entry to count features }; enum Feature_Flag_Set { @@ -66,6 +68,8 @@ tcheck_m = (1 << tcheck ), mfdscr_m = (1 << mfdscr ), vsx_m = (1 << vsx ), + ldbrx_m = (1 << ldbrx ), + stdbrx_m = (1 << stdbrx ), all_features_m = (unsigned long)-1 }; @@ -100,6 +104,8 @@ static bool has_tcheck() { return (_features & tcheck_m) != 0; } static bool has_mfdscr() { return (_features & mfdscr_m) != 0; } static bool has_vsx() { return (_features & vsx_m) != 0; } + static bool has_ldbrx() { return (_features & ldbrx_m) != 0; } + static bool has_stdbrx() { return (_features & stdbrx_m) != 0; } // Assembler testing static void allow_all();