8189793: [s390]: Improve String compress/inflate by exploiting vector instructions
authorlucy
Wed, 22 Nov 2017 17:10:38 +0100
changeset 48094 bca569f79fa1
parent 48093 2cb07c3778e1
child 48095 528144831ba3
8189793: [s390]: Improve String compress/inflate by exploiting vector instructions Reviewed-by: mdoerr, goetz
src/hotspot/cpu/s390/assembler_s390.hpp
src/hotspot/cpu/s390/assembler_s390.inline.hpp
src/hotspot/cpu/s390/macroAssembler_s390.cpp
src/hotspot/cpu/s390/macroAssembler_s390.hpp
src/hotspot/cpu/s390/s390.ad
--- a/src/hotspot/cpu/s390/assembler_s390.hpp	Wed Nov 22 16:57:34 2017 +0100
+++ b/src/hotspot/cpu/s390/assembler_s390.hpp	Wed Nov 22 17:10:38 2017 +0100
@@ -582,7 +582,11 @@
 #define LOC_ZOPC    (unsigned long)(0xebL << 40 | 0xf2L)        // z196
 #define LOCG_ZOPC   (unsigned long)(0xebL << 40 | 0xe2L)        // z196
 
-#define LMG_ZOPC    (unsigned long)(235L << 40 | 4L)
+
+// LOAD multiple registers at once
+#define LM_ZOPC     (unsigned  int)(0x98  << 24)
+#define LMY_ZOPC    (unsigned long)(0xebL << 40 | 0x98L)
+#define LMG_ZOPC    (unsigned long)(0xebL << 40 | 0x04L)
 
 #define LE_ZOPC     (unsigned  int)(0x78 << 24)
 #define LEY_ZOPC    (unsigned long)(237L << 40 | 100L)
@@ -613,7 +617,10 @@
 #define STOC_ZOPC   (unsigned long)(0xebL << 40 | 0xf3L)        // z196
 #define STOCG_ZOPC  (unsigned long)(0xebL << 40 | 0xe3L)        // z196
 
-#define STMG_ZOPC   (unsigned long)(235L << 40 | 36L)
+// STORE multiple registers at once
+#define STM_ZOPC    (unsigned  int)(0x90  << 24)
+#define STMY_ZOPC   (unsigned long)(0xebL << 40 | 0x90L)
+#define STMG_ZOPC   (unsigned long)(0xebL << 40 | 0x24L)
 
 #define STE_ZOPC    (unsigned  int)(0x70 << 24)
 #define STEY_ZOPC   (unsigned long)(237L << 40 | 102L)
@@ -874,15 +881,19 @@
 
 // Shift
 // arithmetic
-#define SLA_ZOPC    (unsigned  int)(139 << 24)
-#define SLAG_ZOPC   (unsigned long)(235L << 40 | 11L)
-#define SRA_ZOPC    (unsigned  int)(138 << 24)
-#define SRAG_ZOPC   (unsigned long)(235L << 40 | 10L)
+#define SLA_ZOPC    (unsigned  int)(0x8b  << 24)
+#define SLAK_ZOPC   (unsigned long)(0xebL << 40 | 0xddL)
+#define SLAG_ZOPC   (unsigned long)(0xebL << 40 | 0x0bL)
+#define SRA_ZOPC    (unsigned  int)(0x8a  << 24)
+#define SRAK_ZOPC   (unsigned long)(0xebL << 40 | 0xdcL)
+#define SRAG_ZOPC   (unsigned long)(0xebL << 40 | 0x0aL)
 // logical
-#define SLL_ZOPC    (unsigned  int)(137 << 24)
-#define SLLG_ZOPC   (unsigned long)(235L << 40 | 13L)
-#define SRL_ZOPC    (unsigned  int)(136 << 24)
-#define SRLG_ZOPC   (unsigned long)(235L << 40 | 12L)
+#define SLL_ZOPC    (unsigned  int)(0x89  << 24)
+#define SLLK_ZOPC   (unsigned long)(0xebL << 40 | 0xdfL)
+#define SLLG_ZOPC   (unsigned long)(0xebL << 40 | 0x0dL)
+#define SRL_ZOPC    (unsigned  int)(0x88  << 24)
+#define SRLK_ZOPC   (unsigned long)(0xebL << 40 | 0xdeL)
+#define SRLG_ZOPC   (unsigned long)(0xebL << 40 | 0x0cL)
 
 // Rotate, then AND/XOR/OR/insert
 // rotate
@@ -2262,12 +2273,16 @@
 
   // shift
   inline void z_sla( Register r1,              int64_t d2, Register b2=Z_R0); // shift left  r1 = r1 << ((d2+b2)&0x3f) ; int32, only 31 bits shifted, sign preserved!
+  inline void z_slak(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left  r1 = r3 << ((d2+b2)&0x3f) ; int32, only 31 bits shifted, sign preserved!
   inline void z_slag(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left  r1 = r3 << ((d2+b2)&0x3f) ; int64, only 63 bits shifted, sign preserved!
   inline void z_sra( Register r1,              int64_t d2, Register b2=Z_R0); // shift right r1 = r1 >> ((d2+b2)&0x3f) ; int32, sign extended
+  inline void z_srak(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int32, sign extended
   inline void z_srag(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int64, sign extended
   inline void z_sll( Register r1,              int64_t d2, Register b2=Z_R0); // shift left  r1 = r1 << ((d2+b2)&0x3f) ; int32, zeros added
+  inline void z_sllk(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left  r1 = r3 << ((d2+b2)&0x3f) ; int32, zeros added
   inline void z_sllg(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift left  r1 = r3 << ((d2+b2)&0x3f) ; int64, zeros added
   inline void z_srl( Register r1,              int64_t d2, Register b2=Z_R0); // shift right r1 = r1 >> ((d2+b2)&0x3f) ; int32, zero extended
+  inline void z_srlk(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int32, zero extended
   inline void z_srlg(Register r1, Register r3, int64_t d2, Register b2=Z_R0); // shift right r1 = r3 >> ((d2+b2)&0x3f) ; int64, zero extended
 
   // rotate
@@ -3035,7 +3050,11 @@
 
   inline void z_tam();
   inline void z_stckf(int64_t d2, Register b2);
+  inline void z_stm( Register r1, Register r3, int64_t d2, Register b2);
+  inline void z_stmy(Register r1, Register r3, int64_t d2, Register b2);
   inline void z_stmg(Register r1, Register r3, int64_t d2, Register b2);
+  inline void z_lm( Register r1, Register r3, int64_t d2, Register b2);
+  inline void z_lmy(Register r1, Register r3, int64_t d2, Register b2);
   inline void z_lmg(Register r1, Register r3, int64_t d2, Register b2);
 
   inline void z_cs( Register r1, Register r3, int64_t d2, Register b2);
--- a/src/hotspot/cpu/s390/assembler_s390.inline.hpp	Wed Nov 22 16:57:34 2017 +0100
+++ b/src/hotspot/cpu/s390/assembler_s390.inline.hpp	Wed Nov 22 17:10:38 2017 +0100
@@ -334,12 +334,16 @@
 // SHIFT/RORATE OPERATIONS
 //-----------------------------------
 inline void Assembler::z_sla( Register r1,              int64_t d2, Register b2) { emit_32( SLA_ZOPC  | regt(r1, 8, 32) | uimm12(d2, 20, 32) | reg(b2, 16, 32)); }
+inline void Assembler::z_slak(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SLAK_ZOPC | regt(r1, 8, 48) | simm20(d2)         | reg(b2, 16, 48) | reg(r3, 12, 48)); }
 inline void Assembler::z_slag(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SLAG_ZOPC | regt(r1, 8, 48) | simm20(d2)         | reg(b2, 16, 48) | reg(r3, 12, 48)); }
 inline void Assembler::z_sra( Register r1,              int64_t d2, Register b2) { emit_32( SRA_ZOPC  | regt(r1, 8, 32) | uimm12(d2, 20, 32) | reg(b2, 16, 32)); }
+inline void Assembler::z_srak(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SRAK_ZOPC | regt(r1, 8, 48) | simm20(d2)         | reg(b2, 16, 48) | reg(r3, 12, 48)); }
 inline void Assembler::z_srag(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SRAG_ZOPC | regt(r1, 8, 48) | simm20(d2)         | reg(b2, 16, 48) | reg(r3, 12, 48)); }
 inline void Assembler::z_sll( Register r1,              int64_t d2, Register b2) { emit_32( SLL_ZOPC  | regt(r1, 8, 32) | uimm12(d2, 20, 32) | reg(b2, 16, 32)); }
+inline void Assembler::z_sllk(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SLLK_ZOPC | regt(r1, 8, 48) | simm20(d2)         | reg(b2, 16, 48) | reg(r3, 12, 48)); }
 inline void Assembler::z_sllg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SLLG_ZOPC | regt(r1, 8, 48) | simm20(d2)         | reg(b2, 16, 48) | reg(r3, 12, 48)); }
 inline void Assembler::z_srl( Register r1,              int64_t d2, Register b2) { emit_32( SRL_ZOPC  | regt(r1, 8, 32) | uimm12(d2, 20, 32) | reg(b2, 16, 32)); }
+inline void Assembler::z_srlk(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SRLK_ZOPC | regt(r1, 8, 48) | simm20(d2)         | reg(b2, 16, 48) | reg(r3, 12, 48)); }
 inline void Assembler::z_srlg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( SRLG_ZOPC | regt(r1, 8, 48) | simm20(d2)         | reg(b2, 16, 48) | reg(r3, 12, 48)); }
 
 // rotate left
@@ -690,10 +694,14 @@
 
 inline void Assembler::z_tam() { emit_16( TAM_ZOPC); }
 inline void Assembler::z_stckf(int64_t d2, Register b2) { emit_32( STCKF_ZOPC | uimm12(d2, 20, 32) | regz(b2, 16, 32)); }
-inline void Assembler::z_stmg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( STMG_ZOPC | simm20(d2) | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) ); }
-inline void Assembler::z_lmg(Register r1, Register r3, int64_t d2, Register b2)  { emit_48( LMG_ZOPC  | simm20(d2) | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) ); }
+inline void Assembler::z_stm( Register r1, Register r3, int64_t d2, Register b2) { emit_32( STM_ZOPC  | reg(r1, 8, 32) | reg(r3,12,32)| reg(b2,16,32) | uimm12(d2, 20,32)); }
+inline void Assembler::z_stmy(Register r1, Register r3, int64_t d2, Register b2) { emit_48( STMY_ZOPC | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) | simm20(d2) ); }
+inline void Assembler::z_stmg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( STMG_ZOPC | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) | simm20(d2) ); }
+inline void Assembler::z_lm(  Register r1, Register r3, int64_t d2, Register b2) { emit_32( LM_ZOPC   | reg(r1, 8, 32) | reg(r3,12,32)| reg(b2,16,32) | uimm12(d2, 20,32)); }
+inline void Assembler::z_lmy( Register r1, Register r3, int64_t d2, Register b2) { emit_48( LMY_ZOPC  | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) | simm20(d2) ); }
+inline void Assembler::z_lmg( Register r1, Register r3, int64_t d2, Register b2) { emit_48( LMG_ZOPC  | reg(r1, 8, 48) | reg(r3,12,48)| reg(b2,16,48) | simm20(d2) ); }
 
-inline void Assembler::z_cs(Register r1, Register r3, int64_t d2, Register b2)  { emit_32( CS_ZOPC  | regt(r1, 8, 32) | reg(r3, 12, 32) | reg(b2, 16, 32) | uimm12(d2, 20, 32)); }
+inline void Assembler::z_cs( Register r1, Register r3, int64_t d2, Register b2) { emit_32( CS_ZOPC  | regt(r1, 8, 32) | reg(r3, 12, 32) | reg(b2, 16, 32) | uimm12(d2, 20, 32)); }
 inline void Assembler::z_csy(Register r1, Register r3, int64_t d2, Register b2) { emit_48( CSY_ZOPC | regt(r1, 8, 48) | reg(r3, 12, 48) | reg(b2, 16, 48) | simm20(d2)); }
 inline void Assembler::z_csg(Register r1, Register r3, int64_t d2, Register b2) { emit_48( CSG_ZOPC | regt(r1, 8, 48) | reg(r3, 12, 48) | reg(b2, 16, 48) | simm20(d2)); }
 inline void Assembler::z_cs( Register r1, Register r3, const Address& a) { assert(!a.has_index(), "Cannot encode index"); z_cs( r1, r3, a.disp(), a.baseOrR0()); }
--- a/src/hotspot/cpu/s390/macroAssembler_s390.cpp	Wed Nov 22 16:57:34 2017 +0100
+++ b/src/hotspot/cpu/s390/macroAssembler_s390.cpp	Wed Nov 22 17:10:38 2017 +0100
@@ -936,7 +936,7 @@
 
   // Some extra safety net.
   if (!RelAddr::is_in_range_of_RelAddr32(total_distance)) {
-    guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "too far away");
+    guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "load_long_pcrelative can't handle distance " INTPTR_FORMAT, total_distance);
   }
 
   (this)->relocate(rspec, relocInfo::pcrel_addr_format);
@@ -956,7 +956,7 @@
 
   // Some extra safety net.
   if (!RelAddr::is_in_range_of_RelAddr32(total_distance)) {
-    guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "too far away");
+    guarantee(RelAddr::is_in_range_of_RelAddr32(total_distance), "load_long_pcrelative can't handle distance " INTPTR_FORMAT, total_distance);
   }
 
   (this)->relocate(rspec, relocInfo::pcrel_addr_format);
@@ -1025,6 +1025,13 @@
   }
 }
 
+void MacroAssembler::prefetch_read(Address a) {
+  z_pfd(1, a.disp20(), a.indexOrR0(), a.base());
+}
+void MacroAssembler::prefetch_update(Address a) {
+  z_pfd(2, a.disp20(), a.indexOrR0(), a.base());
+}
+
 // Clear a register, i.e. load const zero into reg.
 // Return len (in bytes) of generated instruction(s).
 // whole_reg: Clear 64 bits if true, 32 bits otherwise.
@@ -4896,77 +4903,295 @@
 
 // Intrinsics for CompactStrings
 
-// Compress char[] to byte[]. odd_reg contains cnt. Kills dst. Early clobber: result
+// Compress char[] to byte[].
+//   Restores: src, dst
+//   Uses:     cnt
+//   Kills:    tmp, Z_R0, Z_R1.
+//   Early clobber: result.
+// Note:
+//   cnt is signed int. Do not rely on high word!
+//       counts # characters, not bytes.
 // The result is the number of characters copied before the first incompatible character was found.
-// If tmp2 is provided and the compression fails, the compression stops exactly at this point and the result is precise.
+// If precise is true, the processing stops exactly at this point. Otherwise, the result may be off
+// by a few bytes. The result always indicates the number of copied characters.
 //
 // Note: Does not behave exactly like package private StringUTF16 compress java implementation in case of failure:
-// - Different number of characters may have been written to dead array (if tmp2 not provided).
+// - Different number of characters may have been written to dead array (if precise is false).
 // - Returns a number <cnt instead of 0. (Result gets compared with cnt.)
-unsigned int MacroAssembler::string_compress(Register result, Register src, Register dst, Register odd_reg,
-                                             Register even_reg, Register tmp, Register tmp2) {
-  int block_start = offset();
-  Label Lloop1, Lloop2, Lslow, Ldone;
-  const Register addr2 = dst, ind1 = result, mask = tmp;
-  const bool precise = (tmp2 != noreg);
-
-  BLOCK_COMMENT("string_compress {");
-
-  z_sll(odd_reg, 1);       // Number of bytes to read. (Must be a positive simm32.)
-  clear_reg(ind1);         // Index to read.
-  z_llilf(mask, 0xFF00FF00);
-  z_ahi(odd_reg, -16);     // Last possible index for fast loop.
-  z_brl(Lslow);
-
-  // ind1: index, even_reg: index increment, odd_reg: index limit
-  z_iihf(mask, 0xFF00FF00);
-  z_lhi(even_reg, 16);
-
-  bind(Lloop1); // 8 Characters per iteration.
-  z_lg(Z_R0, Address(src, ind1));
-  z_lg(Z_R1, Address(src, ind1, 8));
+unsigned int MacroAssembler::string_compress(Register result, Register src, Register dst, Register cnt,
+                                             Register tmp,    bool precise) {
+  assert_different_registers(Z_R0, Z_R1, src, dst, cnt, tmp);
+
   if (precise) {
+    BLOCK_COMMENT("encode_iso_array {");
+  } else {
+    BLOCK_COMMENT("string_compress {");
+  }
+  int  block_start = offset();
+
+  Register       Rsrc  = src;
+  Register       Rdst  = dst;
+  Register       Rix   = tmp;
+  Register       Rcnt  = cnt;
+  Register       Rmask = result;  // holds incompatibility check mask until result value is stored.
+  Label          ScalarShortcut, AllDone;
+
+  z_iilf(Rmask, 0xFF00FF00);
+  z_iihf(Rmask, 0xFF00FF00);
+
+#if 0  // Sacrifice shortcuts for code compactness
+  {
+    //---<  shortcuts for short strings (very frequent)   >---
+    //   Strings with 4 and 8 characters were fond to occur very frequently.
+    //   Therefore, we handle them right away with minimal overhead.
+    Label     skipShortcut, skip4Shortcut, skip8Shortcut;
+    Register  Rout = Z_R0;
+    z_chi(Rcnt, 4);
+    z_brne(skip4Shortcut);                 // 4 characters are very frequent
+      z_lg(Z_R0, 0, Rsrc);                 // Treat exactly 4 characters specially.
+      if (VM_Version::has_DistinctOpnds()) {
+        Rout = Z_R0;
+        z_ngrk(Rix, Z_R0, Rmask);
+      } else {
+        Rout = Rix;
+        z_lgr(Rix, Z_R0);
+        z_ngr(Z_R0, Rmask);
+      }
+      z_brnz(skipShortcut);
+      z_stcmh(Rout, 5, 0, Rdst);
+      z_stcm(Rout,  5, 2, Rdst);
+      z_lgfr(result, Rcnt);
+      z_bru(AllDone);
+    bind(skip4Shortcut);
+
+    z_chi(Rcnt, 8);
+    z_brne(skip8Shortcut);                 // There's more to do...
+      z_lmg(Z_R0, Z_R1, 0, Rsrc);          // Treat exactly 8 characters specially.
+      if (VM_Version::has_DistinctOpnds()) {
+        Rout = Z_R0;
+        z_ogrk(Rix, Z_R0, Z_R1);
+        z_ngr(Rix, Rmask);
+      } else {
+        Rout = Rix;
+        z_lgr(Rix, Z_R0);
+        z_ogr(Z_R0, Z_R1);
+        z_ngr(Z_R0, Rmask);
+      }
+      z_brnz(skipShortcut);
+      z_stcmh(Rout, 5, 0, Rdst);
+      z_stcm(Rout,  5, 2, Rdst);
+      z_stcmh(Z_R1, 5, 4, Rdst);
+      z_stcm(Z_R1,  5, 6, Rdst);
+      z_lgfr(result, Rcnt);
+      z_bru(AllDone);
+
+    bind(skip8Shortcut);
+    clear_reg(Z_R0, true, false);          // #characters already processed (none). Precond for scalar loop.
+    z_brl(ScalarShortcut);                 // Just a few characters
+
+    bind(skipShortcut);
+  }
+#endif
+  clear_reg(Z_R0);                         // make sure register is properly initialized.
+
+  if (VM_Version::has_VectorFacility()) {
+    const int  min_vcnt     = 32;          // Minimum #characters required to use vector instructions.
+                                           // Otherwise just do nothing in vector mode.
+                                           // Must be multiple of 2*(vector register length in chars (8 HW = 128 bits)).
+    const int  log_min_vcnt = exact_log2(min_vcnt);
+    Label      VectorLoop, VectorDone, VectorBreak;
+
+    VectorRegister Vtmp1      = Z_V16;
+    VectorRegister Vtmp2      = Z_V17;
+    VectorRegister Vmask      = Z_V18;
+    VectorRegister Vzero      = Z_V19;
+    VectorRegister Vsrc_first = Z_V20;
+    VectorRegister Vsrc_last  = Z_V23;
+
+    assert((Vsrc_last->encoding() - Vsrc_first->encoding() + 1) == min_vcnt/8, "logic error");
+    assert(VM_Version::has_DistinctOpnds(), "Assumption when has_VectorFacility()");
+    z_srak(Rix, Rcnt, log_min_vcnt);       // # vector loop iterations
+    z_brz(VectorDone);                     // not enough data for vector loop
+
+    z_vzero(Vzero);                        // all zeroes
+    z_vgmh(Vmask, 0, 7);                   // generate 0xff00 mask for all 2-byte elements
+    z_sllg(Z_R0, Rix, log_min_vcnt);       // remember #chars that will be processed by vector loop
+
+    bind(VectorLoop);
+      z_vlm(Vsrc_first, Vsrc_last, 0, Rsrc);
+      add2reg(Rsrc, min_vcnt*2);
+
+      //---<  check for incompatible character  >---
+      z_vo(Vtmp1, Z_V20, Z_V21);
+      z_vo(Vtmp2, Z_V22, Z_V23);
+      z_vo(Vtmp1, Vtmp1, Vtmp2);
+      z_vn(Vtmp1, Vtmp1, Vmask);
+      z_vceqhs(Vtmp1, Vtmp1, Vzero);       // high half of all chars must be zero for successful compress.
+      z_brne(VectorBreak);                 // break vector loop, incompatible character found.
+                                           // re-process data from current iteration in break handler.
+
+      //---<  pack & store characters  >---
+      z_vpkh(Vtmp1, Z_V20, Z_V21);         // pack (src1, src2) -> tmp1
+      z_vpkh(Vtmp2, Z_V22, Z_V23);         // pack (src3, src4) -> tmp2
+      z_vstm(Vtmp1, Vtmp2, 0, Rdst);       // store packed string
+      add2reg(Rdst, min_vcnt);
+
+      z_brct(Rix, VectorLoop);
+
+    z_bru(VectorDone);
+
+    bind(VectorBreak);
+      z_sll(Rix, log_min_vcnt);            // # chars processed so far in VectorLoop, excl. current iteration.
+      z_sr(Z_R0, Rix);                     // correct # chars processed in total.
+
+    bind(VectorDone);
+  }
+
+  {
+    const int  min_cnt     =  8;           // Minimum #characters required to use unrolled loop.
+                                           // Otherwise just do nothing in unrolled loop.
+                                           // Must be multiple of 8.
+    const int  log_min_cnt = exact_log2(min_cnt);
+    Label      UnrolledLoop, UnrolledDone, UnrolledBreak;
+
     if (VM_Version::has_DistinctOpnds()) {
-      z_ogrk(tmp2, Z_R0, Z_R1);
+      z_srk(Rix, Rcnt, Z_R0);              // remaining # chars to compress in unrolled loop
     } else {
-      z_lgr(tmp2, Z_R0);
-      z_ogr(tmp2, Z_R1);
+      z_lr(Rix, Rcnt);
+      z_sr(Rix, Z_R0);
     }
-    z_ngr(tmp2, mask);
-    z_brne(Lslow);         // Failed fast case, retry slowly.
-  }
-  z_stcmh(Z_R0, 5, 0, addr2);
-  z_stcm(Z_R0, 5, 2, addr2);
-  if (!precise) { z_ogr(Z_R0, Z_R1); }
-  z_stcmh(Z_R1, 5, 4, addr2);
-  z_stcm(Z_R1, 5, 6, addr2);
-  if (!precise) {
-    z_ngr(Z_R0, mask);
-    z_brne(Ldone);         // Failed (more than needed was written).
-  }
-  z_aghi(addr2, 8);
-  z_brxle(ind1, even_reg, Lloop1);
-
-  bind(Lslow);
-  // Compute index limit and skip if negative.
-  z_ahi(odd_reg, 16-2);    // Last possible index for slow loop.
-  z_lhi(even_reg, 2);
-  z_cr(ind1, odd_reg);
-  z_brh(Ldone);
-
-  bind(Lloop2); // 1 Character per iteration.
-  z_llh(Z_R0, Address(src, ind1));
-  z_tmll(Z_R0, 0xFF00);
-  z_brnaz(Ldone);          // Failed slow case: Return number of written characters.
-  z_stc(Z_R0, Address(addr2));
-  z_aghi(addr2, 1);
-  z_brxle(ind1, even_reg, Lloop2);
-
-  bind(Ldone);             // result = ind1 = 2*cnt
-  z_srl(ind1, 1);
-
-  BLOCK_COMMENT("} string_compress");
-
+    z_sra(Rix, log_min_cnt);             // unrolled loop count
+    z_brz(UnrolledDone);
+
+    bind(UnrolledLoop);
+      z_lmg(Z_R0, Z_R1, 0, Rsrc);
+      if (precise) {
+        z_ogr(Z_R1, Z_R0);                 // check all 8 chars for incompatibility
+        z_ngr(Z_R1, Rmask);
+        z_brnz(UnrolledBreak);
+
+        z_lg(Z_R1, 8, Rsrc);               // reload destroyed register
+        z_stcmh(Z_R0, 5, 0, Rdst);
+        z_stcm(Z_R0,  5, 2, Rdst);
+      } else {
+        z_stcmh(Z_R0, 5, 0, Rdst);
+        z_stcm(Z_R0,  5, 2, Rdst);
+
+        z_ogr(Z_R0, Z_R1);
+        z_ngr(Z_R0, Rmask);
+        z_brnz(UnrolledBreak);
+      }
+      z_stcmh(Z_R1, 5, 4, Rdst);
+      z_stcm(Z_R1,  5, 6, Rdst);
+
+      add2reg(Rsrc, min_cnt*2);
+      add2reg(Rdst, min_cnt);
+      z_brct(Rix, UnrolledLoop);
+
+    z_lgfr(Z_R0, Rcnt);                    // # chars processed in total after unrolled loop.
+    z_nilf(Z_R0, ~(min_cnt-1));
+    z_tmll(Rcnt, min_cnt-1);
+    z_brnaz(ScalarShortcut);               // if all bits zero, there is nothing left to do for scalar loop.
+                                           // Rix == 0 in all cases.
+    z_lgfr(result, Rcnt);                  // all characters processed.
+    z_sgfr(Rdst, Rcnt);                    // restore ptr
+    z_sgfr(Rsrc, Rcnt);                    // restore ptr, double the element count for Rsrc restore
+    z_sgfr(Rsrc, Rcnt);
+    z_bru(AllDone);
+
+    bind(UnrolledBreak);
+    z_lgfr(Z_R0, Rcnt);                    // # chars processed in total after unrolled loop
+    z_nilf(Z_R0, ~(min_cnt-1));
+    z_sll(Rix, log_min_cnt);               // # chars processed so far in UnrolledLoop, excl. current iteration.
+    z_sr(Z_R0, Rix);                       // correct # chars processed in total.
+    if (!precise) {
+      z_lgfr(result, Z_R0);
+      z_aghi(result, min_cnt/2);           // min_cnt/2 characters have already been written
+                                           // but ptrs were not updated yet.
+      z_sgfr(Rdst, Z_R0);                  // restore ptr
+      z_sgfr(Rsrc, Z_R0);                  // restore ptr, double the element count for Rsrc restore
+      z_sgfr(Rsrc, Z_R0);
+      z_bru(AllDone);
+    }
+    bind(UnrolledDone);
+  }
+
+  {
+    Label     ScalarLoop, ScalarDone, ScalarBreak;
+
+    bind(ScalarShortcut);
+    z_ltgfr(result, Rcnt);
+    z_brz(AllDone);
+
+#if 0  // Sacrifice shortcuts for code compactness
+    {
+      //---<  Special treatment for very short strings (one or two characters)  >---
+      //   For these strings, we are sure that the above code was skipped.
+      //   Thus, no registers were modified, register restore is not required.
+      Label     ScalarDoit, Scalar2Char;
+      z_chi(Rcnt, 2);
+      z_brh(ScalarDoit);
+      z_llh(Z_R1,  0, Z_R0, Rsrc);
+      z_bre(Scalar2Char);
+      z_tmll(Z_R1, 0xff00);
+      z_lghi(result, 0);                   // cnt == 1, first char invalid, no chars successfully processed
+      z_brnaz(AllDone);
+      z_stc(Z_R1,  0, Z_R0, Rdst);
+      z_lghi(result, 1);
+      z_bru(AllDone);
+
+      bind(Scalar2Char);
+      z_llh(Z_R0,  2, Z_R0, Rsrc);
+      z_tmll(Z_R1, 0xff00);
+      z_lghi(result, 0);                   // cnt == 2, first char invalid, no chars successfully processed
+      z_brnaz(AllDone);
+      z_stc(Z_R1,  0, Z_R0, Rdst);
+      z_tmll(Z_R0, 0xff00);
+      z_lghi(result, 1);                   // cnt == 2, second char invalid, one char successfully processed
+      z_brnaz(AllDone);
+      z_stc(Z_R0,  1, Z_R0, Rdst);
+      z_lghi(result, 2);
+      z_bru(AllDone);
+
+      bind(ScalarDoit);
+    }
+#endif
+
+    if (VM_Version::has_DistinctOpnds()) {
+      z_srk(Rix, Rcnt, Z_R0);              // remaining # chars to compress in unrolled loop
+    } else {
+      z_lr(Rix, Rcnt);
+      z_sr(Rix, Z_R0);
+    }
+    z_lgfr(result, Rcnt);                  // # processed characters (if all runs ok).
+    z_brz(ScalarDone);
+
+    bind(ScalarLoop);
+      z_llh(Z_R1, 0, Z_R0, Rsrc);
+      z_tmll(Z_R1, 0xff00);
+      z_brnaz(ScalarBreak);
+      z_stc(Z_R1, 0, Z_R0, Rdst);
+      add2reg(Rsrc, 2);
+      add2reg(Rdst, 1);
+      z_brct(Rix, ScalarLoop);
+
+    z_bru(ScalarDone);
+
+    bind(ScalarBreak);
+    z_sr(result, Rix);
+
+    bind(ScalarDone);
+    z_sgfr(Rdst, result);                  // restore ptr
+    z_sgfr(Rsrc, result);                  // restore ptr, double the element count for Rsrc restore
+    z_sgfr(Rsrc, result);
+  }
+  bind(AllDone);
+
+  if (precise) {
+    BLOCK_COMMENT("} encode_iso_array");
+  } else {
+    BLOCK_COMMENT("} string_compress");
+  }
   return offset() - block_start;
 }
 
@@ -4997,53 +5222,432 @@
   return offset() - block_start;
 }
 
-// Inflate byte[] to char[]. odd_reg contains cnt. Kills src.
-unsigned int MacroAssembler::string_inflate(Register src, Register dst, Register odd_reg,
-                                            Register even_reg, Register tmp) {
-  int block_start = offset();
+// Inflate byte[] to char[].
+//   Restores: src, dst
+//   Uses:     cnt
+//   Kills:    tmp, Z_R0, Z_R1.
+// Note:
+//   cnt is signed int. Do not rely on high word!
+//       counts # characters, not bytes.
+unsigned int MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
+  assert_different_registers(Z_R0, Z_R1, src, dst, cnt, tmp);
 
   BLOCK_COMMENT("string_inflate {");
-
-  Label Lloop1, Lloop2, Lslow, Ldone;
-  const Register addr1 = src, ind2 = tmp;
-
-  z_sll(odd_reg, 1);       // Number of bytes to write. (Must be a positive simm32.)
-  clear_reg(ind2);         // Index to write.
-  z_ahi(odd_reg, -16);     // Last possible index for fast loop.
-  z_brl(Lslow);
-
-  // ind2: index, even_reg: index increment, odd_reg: index limit
-  clear_reg(Z_R0);
-  clear_reg(Z_R1);
-  z_lhi(even_reg, 16);
-
-  bind(Lloop1); // 8 Characters per iteration.
-  z_icmh(Z_R0, 5, 0, addr1);
-  z_icmh(Z_R1, 5, 4, addr1);
-  z_icm(Z_R0, 5, 2, addr1);
-  z_icm(Z_R1, 5, 6, addr1);
-  z_aghi(addr1, 8);
-  z_stg(Z_R0, Address(dst, ind2));
-  z_stg(Z_R1, Address(dst, ind2, 8));
-  z_brxle(ind2, even_reg, Lloop1);
-
-  bind(Lslow);
-  // Compute index limit and skip if negative.
-  z_ahi(odd_reg, 16-2);    // Last possible index for slow loop.
-  z_lhi(even_reg, 2);
-  z_cr(ind2, odd_reg);
-  z_brh(Ldone);
-
-  bind(Lloop2); // 1 Character per iteration.
-  z_llc(Z_R0, Address(addr1));
-  z_sth(Z_R0, Address(dst, ind2));
-  z_aghi(addr1, 1);
-  z_brxle(ind2, even_reg, Lloop2);
-
-  bind(Ldone);
+  int block_start = offset();
+
+  Register   Rcnt = cnt;   // # characters (src: bytes, dst: char (2-byte)), remaining after current loop.
+  Register   Rix  = tmp;   // loop index
+  Register   Rsrc = src;   // addr(src array)
+  Register   Rdst = dst;   // addr(dst array)
+  Label      ScalarShortcut, AllDone;
+
+#if 0  // Sacrifice shortcuts for code compactness
+  {
+    //---<  shortcuts for short strings (very frequent)   >---
+    Label   skipShortcut, skip4Shortcut;
+    z_ltr(Rcnt, Rcnt);                     // absolutely nothing to do for strings of len == 0.
+    z_brz(AllDone);
+    clear_reg(Z_R0);                       // make sure registers are properly initialized.
+    clear_reg(Z_R1);
+    z_chi(Rcnt, 4);
+    z_brne(skip4Shortcut);                 // 4 characters are very frequent
+      z_icm(Z_R0, 5,    0, Rsrc);          // Treat exactly 4 characters specially.
+      z_icm(Z_R1, 5,    2, Rsrc);
+      z_stm(Z_R0, Z_R1, 0, Rdst);
+      z_bru(AllDone);
+    bind(skip4Shortcut);
+
+    z_chi(Rcnt, 8);
+    z_brh(skipShortcut);                   // There's a lot to do...
+    z_lgfr(Z_R0, Rcnt);                    // remaining #characters (<= 8). Precond for scalar loop.
+                                           // This does not destroy the "register cleared" state of Z_R0.
+    z_brl(ScalarShortcut);                 // Just a few characters
+      z_icmh(Z_R0, 5, 0, Rsrc);            // Treat exactly 8 characters specially.
+      z_icmh(Z_R1, 5, 4, Rsrc);
+      z_icm(Z_R0,  5, 2, Rsrc);
+      z_icm(Z_R1,  5, 6, Rsrc);
+      z_stmg(Z_R0, Z_R1, 0, Rdst);
+      z_bru(AllDone);
+    bind(skipShortcut);
+  }
+#endif
+  clear_reg(Z_R0);                         // make sure register is properly initialized.
+
+  if (VM_Version::has_VectorFacility()) {
+    const int  min_vcnt     = 32;          // Minimum #characters required to use vector instructions.
+                                           // Otherwise just do nothing in vector mode.
+                                           // Must be multiple of vector register length (16 bytes = 128 bits).
+    const int  log_min_vcnt = exact_log2(min_vcnt);
+    Label      VectorLoop, VectorDone;
+
+    assert(VM_Version::has_DistinctOpnds(), "Assumption when has_VectorFacility()");
+    z_srak(Rix, Rcnt, log_min_vcnt);       // calculate # vector loop iterations
+    z_brz(VectorDone);                     // skip if none
+
+    z_sllg(Z_R0, Rix, log_min_vcnt);       // remember #chars that will be processed by vector loop
+
+    bind(VectorLoop);
+      z_vlm(Z_V20, Z_V21, 0, Rsrc);        // get next 32 characters (single-byte)
+      add2reg(Rsrc, min_vcnt);
+
+      z_vuplhb(Z_V22, Z_V20);              // V2 <- (expand) V0(high)
+      z_vupllb(Z_V23, Z_V20);              // V3 <- (expand) V0(low)
+      z_vuplhb(Z_V24, Z_V21);              // V4 <- (expand) V1(high)
+      z_vupllb(Z_V25, Z_V21);              // V5 <- (expand) V1(low)
+      z_vstm(Z_V22, Z_V25, 0, Rdst);       // store next 32 bytes
+      add2reg(Rdst, min_vcnt*2);
+
+      z_brct(Rix, VectorLoop);
+
+    bind(VectorDone);
+  }
+
+  const int  min_cnt     =  8;             // Minimum #characters required to use unrolled scalar loop.
+                                           // Otherwise just do nothing in unrolled scalar mode.
+                                           // Must be multiple of 8.
+  {
+    const int  log_min_cnt = exact_log2(min_cnt);
+    Label      UnrolledLoop, UnrolledDone;
+
+
+    if (VM_Version::has_DistinctOpnds()) {
+      z_srk(Rix, Rcnt, Z_R0);              // remaining # chars to process in unrolled loop
+    } else {
+      z_lr(Rix, Rcnt);
+      z_sr(Rix, Z_R0);
+    }
+    z_sra(Rix, log_min_cnt);               // unrolled loop count
+    z_brz(UnrolledDone);
+
+    clear_reg(Z_R0);
+    clear_reg(Z_R1);
+
+    bind(UnrolledLoop);
+      z_icmh(Z_R0, 5, 0, Rsrc);
+      z_icmh(Z_R1, 5, 4, Rsrc);
+      z_icm(Z_R0,  5, 2, Rsrc);
+      z_icm(Z_R1,  5, 6, Rsrc);
+      add2reg(Rsrc, min_cnt);
+
+      z_stmg(Z_R0, Z_R1, 0, Rdst);
+
+      add2reg(Rdst, min_cnt*2);
+      z_brct(Rix, UnrolledLoop);
+
+    bind(UnrolledDone);
+    z_lgfr(Z_R0, Rcnt);                    // # chars left over after unrolled loop.
+    z_nilf(Z_R0, min_cnt-1);
+    z_brnz(ScalarShortcut);                // if zero, there is nothing left to do for scalar loop.
+                                           // Rix == 0 in all cases.
+    z_sgfr(Z_R0, Rcnt);                    // negative # characters the ptrs have been advanced previously.
+    z_agr(Rdst, Z_R0);                     // restore ptr, double the element count for Rdst restore.
+    z_agr(Rdst, Z_R0);
+    z_agr(Rsrc, Z_R0);                     // restore ptr.
+    z_bru(AllDone);
+  }
+
+  {
+    bind(ScalarShortcut);
+    // Z_R0 must contain remaining # characters as 64-bit signed int here.
+    //      register contents is preserved over scalar processing (for register fixup).
+
+#if 0  // Sacrifice shortcuts for code compactness
+    {
+      Label      ScalarDefault;
+      z_chi(Rcnt, 2);
+      z_brh(ScalarDefault);
+      z_llc(Z_R0,  0, Z_R0, Rsrc);     // 6 bytes
+      z_sth(Z_R0,  0, Z_R0, Rdst);     // 4 bytes
+      z_brl(AllDone);
+      z_llc(Z_R0,  1, Z_R0, Rsrc);     // 6 bytes
+      z_sth(Z_R0,  2, Z_R0, Rdst);     // 4 bytes
+      z_bru(AllDone);
+      bind(ScalarDefault);
+    }
+#endif
+
+    Label   CodeTable;
+    // Some comments on Rix calculation:
+    //  - Rcnt is small, therefore no bits shifted out of low word (sll(g) instructions).
+    //  - high word of both Rix and Rcnt may contain garbage
+    //  - the final lngfr takes care of that garbage, extending the sign to high word
+    z_sllg(Rix, Z_R0, 2);                // calculate 10*Rix = (4*Rix + Rix)*2
+    z_ar(Rix, Z_R0);
+    z_larl(Z_R1, CodeTable);
+    z_sll(Rix, 1);
+    z_lngfr(Rix, Rix);      // ix range: [0..7], after inversion & mult: [-(7*12)..(0*12)].
+    z_bc(Assembler::bcondAlways, 0, Rix, Z_R1);
+
+    z_llc(Z_R1,  6, Z_R0, Rsrc);  // 6 bytes
+    z_sth(Z_R1, 12, Z_R0, Rdst);  // 4 bytes
+
+    z_llc(Z_R1,  5, Z_R0, Rsrc);
+    z_sth(Z_R1, 10, Z_R0, Rdst);
+
+    z_llc(Z_R1,  4, Z_R0, Rsrc);
+    z_sth(Z_R1,  8, Z_R0, Rdst);
+
+    z_llc(Z_R1,  3, Z_R0, Rsrc);
+    z_sth(Z_R1,  6, Z_R0, Rdst);
+
+    z_llc(Z_R1,  2, Z_R0, Rsrc);
+    z_sth(Z_R1,  4, Z_R0, Rdst);
+
+    z_llc(Z_R1,  1, Z_R0, Rsrc);
+    z_sth(Z_R1,  2, Z_R0, Rdst);
+
+    z_llc(Z_R1,  0, Z_R0, Rsrc);
+    z_sth(Z_R1,  0, Z_R0, Rdst);
+    bind(CodeTable);
+
+    z_chi(Rcnt, 8);                        // no fixup for small strings. Rdst, Rsrc were not modified.
+    z_brl(AllDone);
+
+    z_sgfr(Z_R0, Rcnt);                    // # characters the ptrs have been advanced previously.
+    z_agr(Rdst, Z_R0);                     // restore ptr, double the element count for Rdst restore.
+    z_agr(Rdst, Z_R0);
+    z_agr(Rsrc, Z_R0);                     // restore ptr.
+  }
+  bind(AllDone);
 
   BLOCK_COMMENT("} string_inflate");
-
+  return offset() - block_start;
+}
+
+// Inflate byte[] to char[], length known at compile time.
+//   Restores: src, dst
+//   Kills:    tmp, Z_R0, Z_R1.
+// Note:
+//   len is signed int. Counts # characters, not bytes.
+unsigned int MacroAssembler::string_inflate_const(Register src, Register dst, Register tmp, int len) {
+  assert_different_registers(Z_R0, Z_R1, src, dst, tmp);
+
+  BLOCK_COMMENT("string_inflate_const {");
+  int block_start = offset();
+
+  Register   Rix  = tmp;   // loop index
+  Register   Rsrc = src;   // addr(src array)
+  Register   Rdst = dst;   // addr(dst array)
+  Label      ScalarShortcut, AllDone;
+  int        nprocessed = 0;
+  int        src_off    = 0;  // compensate for saved (optimized away) ptr advancement.
+  int        dst_off    = 0;  // compensate for saved (optimized away) ptr advancement.
+  bool       restore_inputs = false;
+  bool       workreg_clear  = false;
+
+  if ((len >= 32) && VM_Version::has_VectorFacility()) {
+    const int  min_vcnt     = 32;          // Minimum #characters required to use vector instructions.
+                                           // Otherwise just do nothing in vector mode.
+                                           // Must be multiple of vector register length (16 bytes = 128 bits).
+    const int  log_min_vcnt = exact_log2(min_vcnt);
+    const int  iterations   = (len - nprocessed) >> log_min_vcnt;
+    nprocessed             += iterations << log_min_vcnt;
+    Label      VectorLoop;
+
+    if (iterations == 1) {
+      z_vlm(Z_V20, Z_V21, 0+src_off, Rsrc);  // get next 32 characters (single-byte)
+      z_vuplhb(Z_V22, Z_V20);                // V2 <- (expand) V0(high)
+      z_vupllb(Z_V23, Z_V20);                // V3 <- (expand) V0(low)
+      z_vuplhb(Z_V24, Z_V21);                // V4 <- (expand) V1(high)
+      z_vupllb(Z_V25, Z_V21);                // V5 <- (expand) V1(low)
+      z_vstm(Z_V22, Z_V25, 0+dst_off, Rdst); // store next 32 bytes
+
+      src_off += min_vcnt;
+      dst_off += min_vcnt*2;
+    } else {
+      restore_inputs = true;
+
+      z_lgfi(Rix, len>>log_min_vcnt);
+      bind(VectorLoop);
+        z_vlm(Z_V20, Z_V21, 0, Rsrc);        // get next 32 characters (single-byte)
+        add2reg(Rsrc, min_vcnt);
+
+        z_vuplhb(Z_V22, Z_V20);              // V2 <- (expand) V0(high)
+        z_vupllb(Z_V23, Z_V20);              // V3 <- (expand) V0(low)
+        z_vuplhb(Z_V24, Z_V21);              // V4 <- (expand) V1(high)
+        z_vupllb(Z_V25, Z_V21);              // V5 <- (expand) V1(low)
+        z_vstm(Z_V22, Z_V25, 0, Rdst);       // store next 32 bytes
+        add2reg(Rdst, min_vcnt*2);
+
+        z_brct(Rix, VectorLoop);
+    }
+  }
+
+  if (((len-nprocessed) >= 16) && VM_Version::has_VectorFacility()) {
+    const int  min_vcnt     = 16;          // Minimum #characters required to use vector instructions.
+                                           // Otherwise just do nothing in vector mode.
+                                           // Must be multiple of vector register length (16 bytes = 128 bits).
+    const int  log_min_vcnt = exact_log2(min_vcnt);
+    const int  iterations   = (len - nprocessed) >> log_min_vcnt;
+    nprocessed             += iterations << log_min_vcnt;
+    assert(iterations == 1, "must be!");
+
+    z_vl(Z_V20, 0+src_off, Z_R0, Rsrc);    // get next 16 characters (single-byte)
+    z_vuplhb(Z_V22, Z_V20);                // V2 <- (expand) V0(high)
+    z_vupllb(Z_V23, Z_V20);                // V3 <- (expand) V0(low)
+    z_vstm(Z_V22, Z_V23, 0+dst_off, Rdst); // store next 32 bytes
+
+    src_off += min_vcnt;
+    dst_off += min_vcnt*2;
+  }
+
+  if ((len-nprocessed) > 8) {
+    const int  min_cnt     =  8;           // Minimum #characters required to use unrolled scalar loop.
+                                           // Otherwise just do nothing in unrolled scalar mode.
+                                           // Must be multiple of 8.
+    const int  log_min_cnt = exact_log2(min_cnt);
+    const int  iterations  = (len - nprocessed) >> log_min_cnt;
+    nprocessed     += iterations << log_min_cnt;
+
+    //---<  avoid loop overhead/ptr increment for small # iterations  >---
+    if (iterations <= 2) {
+      clear_reg(Z_R0);
+      clear_reg(Z_R1);
+      workreg_clear = true;
+
+      z_icmh(Z_R0, 5, 0+src_off, Rsrc);
+      z_icmh(Z_R1, 5, 4+src_off, Rsrc);
+      z_icm(Z_R0,  5, 2+src_off, Rsrc);
+      z_icm(Z_R1,  5, 6+src_off, Rsrc);
+      z_stmg(Z_R0, Z_R1, 0+dst_off, Rdst);
+
+      src_off += min_cnt;
+      dst_off += min_cnt*2;
+    }
+
+    if (iterations == 2) {
+      z_icmh(Z_R0, 5, 0+src_off, Rsrc);
+      z_icmh(Z_R1, 5, 4+src_off, Rsrc);
+      z_icm(Z_R0,  5, 2+src_off, Rsrc);
+      z_icm(Z_R1,  5, 6+src_off, Rsrc);
+      z_stmg(Z_R0, Z_R1, 0+dst_off, Rdst);
+
+      src_off += min_cnt;
+      dst_off += min_cnt*2;
+    }
+
+    if (iterations > 2) {
+      Label      UnrolledLoop;
+      restore_inputs  = true;
+
+      clear_reg(Z_R0);
+      clear_reg(Z_R1);
+      workreg_clear = true;
+
+      z_lgfi(Rix, iterations);
+      bind(UnrolledLoop);
+        z_icmh(Z_R0, 5, 0, Rsrc);
+        z_icmh(Z_R1, 5, 4, Rsrc);
+        z_icm(Z_R0,  5, 2, Rsrc);
+        z_icm(Z_R1,  5, 6, Rsrc);
+        add2reg(Rsrc, min_cnt);
+
+        z_stmg(Z_R0, Z_R1, 0, Rdst);
+        add2reg(Rdst, min_cnt*2);
+
+        z_brct(Rix, UnrolledLoop);
+    }
+  }
+
+  if ((len-nprocessed) > 0) {
+    switch (len-nprocessed) {
+      case 8:
+        if (!workreg_clear) {
+          clear_reg(Z_R0);
+          clear_reg(Z_R1);
+        }
+        z_icmh(Z_R0, 5, 0+src_off, Rsrc);
+        z_icmh(Z_R1, 5, 4+src_off, Rsrc);
+        z_icm(Z_R0,  5, 2+src_off, Rsrc);
+        z_icm(Z_R1,  5, 6+src_off, Rsrc);
+        z_stmg(Z_R0, Z_R1, 0+dst_off, Rdst);
+        break;
+      case 7:
+        if (!workreg_clear) {
+          clear_reg(Z_R0);
+          clear_reg(Z_R1);
+        }
+        clear_reg(Rix);
+        z_icm(Z_R0,  5, 0+src_off, Rsrc);
+        z_icm(Z_R1,  5, 2+src_off, Rsrc);
+        z_icm(Rix,   5, 4+src_off, Rsrc);
+        z_stm(Z_R0,  Z_R1, 0+dst_off, Rdst);
+        z_llc(Z_R0,  6+src_off, Z_R0, Rsrc);
+        z_st(Rix,    8+dst_off, Z_R0, Rdst);
+        z_sth(Z_R0, 12+dst_off, Z_R0, Rdst);
+        break;
+      case 6:
+        if (!workreg_clear) {
+          clear_reg(Z_R0);
+          clear_reg(Z_R1);
+        }
+        clear_reg(Rix);
+        z_icm(Z_R0, 5, 0+src_off, Rsrc);
+        z_icm(Z_R1, 5, 2+src_off, Rsrc);
+        z_icm(Rix,  5, 4+src_off, Rsrc);
+        z_stm(Z_R0, Z_R1, 0+dst_off, Rdst);
+        z_st(Rix,   8+dst_off, Z_R0, Rdst);
+        break;
+      case 5:
+        if (!workreg_clear) {
+          clear_reg(Z_R0);
+          clear_reg(Z_R1);
+        }
+        z_icm(Z_R0, 5, 0+src_off, Rsrc);
+        z_icm(Z_R1, 5, 2+src_off, Rsrc);
+        z_llc(Rix,  4+src_off, Z_R0, Rsrc);
+        z_stm(Z_R0, Z_R1, 0+dst_off, Rdst);
+        z_sth(Rix,  8+dst_off, Z_R0, Rdst);
+        break;
+      case 4:
+        if (!workreg_clear) {
+          clear_reg(Z_R0);
+          clear_reg(Z_R1);
+        }
+        z_icm(Z_R0, 5, 0+src_off, Rsrc);
+        z_icm(Z_R1, 5, 2+src_off, Rsrc);
+        z_stm(Z_R0, Z_R1, 0+dst_off, Rdst);
+        break;
+      case 3:
+        if (!workreg_clear) {
+          clear_reg(Z_R0);
+        }
+        z_llc(Z_R1, 2+src_off, Z_R0, Rsrc);
+        z_icm(Z_R0, 5, 0+src_off, Rsrc);
+        z_sth(Z_R1, 4+dst_off, Z_R0, Rdst);
+        z_st(Z_R0,  0+dst_off, Rdst);
+        break;
+      case 2:
+        z_llc(Z_R0, 0+src_off, Z_R0, Rsrc);
+        z_llc(Z_R1, 1+src_off, Z_R0, Rsrc);
+        z_sth(Z_R0, 0+dst_off, Z_R0, Rdst);
+        z_sth(Z_R1, 2+dst_off, Z_R0, Rdst);
+        break;
+      case 1:
+        z_llc(Z_R0, 0+src_off, Z_R0, Rsrc);
+        z_sth(Z_R0, 0+dst_off, Z_R0, Rdst);
+        break;
+      default:
+        guarantee(false, "Impossible");
+        break;
+    }
+    src_off   +=  len-nprocessed;
+    dst_off   += (len-nprocessed)*2;
+    nprocessed = len;
+  }
+
+  //---< restore modified input registers  >---
+  if ((nprocessed > 0) && restore_inputs) {
+    z_agfi(Rsrc, -(nprocessed-src_off));
+    if (nprocessed < 1000000000) { // avoid int overflow
+      z_agfi(Rdst, -(nprocessed*2-dst_off));
+    } else {
+      z_agfi(Rdst, -(nprocessed-dst_off));
+      z_agfi(Rdst, -nprocessed);
+    }
+  }
+
+  BLOCK_COMMENT("} string_inflate_const");
   return offset() - block_start;
 }
 
--- a/src/hotspot/cpu/s390/macroAssembler_s390.hpp	Wed Nov 22 16:57:34 2017 +0100
+++ b/src/hotspot/cpu/s390/macroAssembler_s390.hpp	Wed Nov 22 17:10:38 2017 +0100
@@ -198,6 +198,9 @@
   // Test a bit in a register. Result is reflected in CC.
   void testbit(Register r, unsigned int bitPos);
 
+  void prefetch_read(Address a);
+  void prefetch_update(Address a);
+
   // Clear a register, i.e. load const zero into reg. Return len (in bytes) of
   // generated instruction(s).
   //   whole_reg: Clear 64 bits if true, 32 bits otherwise.
@@ -836,7 +839,7 @@
   void load_mirror(Register mirror, Register method);
 
   //--------------------------
-  //---  perations on arrays.
+  //---  Operations on arrays.
   //--------------------------
   unsigned int Clear_Array(Register cnt_arg, Register base_pointer_arg, Register src_addr, Register src_len);
   unsigned int Clear_Array_Const(long cnt, Register base);
@@ -849,20 +852,34 @@
   // Special String Intrinsics Implementation.
   //-------------------------------------------
   // Intrinsics for CompactStrings
-  // Compress char[] to byte[]. odd_reg contains cnt. tmp3 is only needed for precise behavior in failure case. Kills dst.
-  unsigned int string_compress(Register result, Register src, Register dst, Register odd_reg,
-                               Register even_reg, Register tmp, Register tmp2 = noreg);
+  //   Restores: src, dst
+  //   Uses:     cnt
+  //   Kills:    tmp, Z_R0, Z_R1.
+  //   Early clobber: result.
+  //   Boolean precise controls accuracy of result value.
+  unsigned int string_compress(Register result, Register src, Register dst, Register cnt,
+                               Register tmp,    bool precise);
+
+  // Inflate byte[] to char[].
+  unsigned int string_inflate_trot(Register src, Register dst, Register cnt, Register tmp);
+
+  // Inflate byte[] to char[].
+  //   Restores: src, dst
+  //   Uses:     cnt
+  //   Kills:    tmp, Z_R0, Z_R1.
+  unsigned int string_inflate(Register src, Register dst, Register cnt, Register tmp);
+
+  // Inflate byte[] to char[], length known at compile time.
+  //   Restores: src, dst
+  //   Kills:    tmp, Z_R0, Z_R1.
+  // Note:
+  //   len is signed int. Counts # characters, not bytes.
+  unsigned int string_inflate_const(Register src, Register dst, Register tmp, int len);
 
   // Kills src.
   unsigned int has_negatives(Register result, Register src, Register cnt,
                              Register odd_reg, Register even_reg, Register tmp);
 
-  // Inflate byte[] to char[].
-  unsigned int string_inflate_trot(Register src, Register dst, Register cnt, Register tmp);
-  // Odd_reg contains cnt. Kills src.
-  unsigned int string_inflate(Register src, Register dst, Register odd_reg,
-                              Register even_reg, Register tmp);
-
   unsigned int string_compare(Register str1, Register str2, Register cnt1, Register cnt2,
                               Register odd_reg, Register even_reg, Register result, int ae);
 
--- a/src/hotspot/cpu/s390/s390.ad	Wed Nov 22 16:57:34 2017 +0100
+++ b/src/hotspot/cpu/s390/s390.ad	Wed Nov 22 17:10:38 2017 +0100
@@ -10267,14 +10267,14 @@
 %}
 
 // char[] to byte[] compression
-instruct string_compress(iRegP src, rarg5RegP dst, iRegI result, roddRegI len, revenRegI evenReg, iRegI tmp, flagsReg cr) %{
+instruct string_compress(iRegP src, iRegP dst, iRegI result, iRegI len, iRegI tmp, flagsReg cr) %{
   match(Set result (StrCompressedCopy src (Binary dst len)));
-  effect(TEMP_DEF result, USE_KILL dst, USE_KILL len, TEMP evenReg, TEMP tmp, KILL cr); // R0, R1 are killed, too.
+  effect(TEMP_DEF result, TEMP tmp, KILL cr); // R0, R1 are killed, too.
   ins_cost(300);
   format %{ "String Compress $src->$dst($len) -> $result" %}
   ins_encode %{
     __ string_compress($result$$Register, $src$$Register, $dst$$Register, $len$$Register,
-                       $evenReg$$Register, $tmp$$Register);
+                       $tmp$$Register, false);
   %}
   ins_pipe(pipe_class_dummy);
 %}
@@ -10293,13 +10293,25 @@
 //%}
 
 // byte[] to char[] inflation
-instruct string_inflate(Universe dummy, rarg5RegP src, iRegP dst, roddRegI len, revenRegI evenReg, iRegI tmp, flagsReg cr) %{
+instruct string_inflate(Universe dummy, iRegP src, iRegP dst, iRegI len, iRegI tmp, flagsReg cr) %{
   match(Set dummy (StrInflatedCopy src (Binary dst len)));
-  effect(USE_KILL src, USE_KILL len, TEMP evenReg, TEMP tmp, KILL cr); // R0, R1 are killed, too.
+  effect(TEMP tmp, KILL cr); // R0, R1 are killed, too.
   ins_cost(300);
   format %{ "String Inflate $src->$dst($len)" %}
   ins_encode %{
-    __ string_inflate($src$$Register, $dst$$Register, $len$$Register, $evenReg$$Register, $tmp$$Register);
+    __ string_inflate($src$$Register, $dst$$Register, $len$$Register, $tmp$$Register);
+  %}
+  ins_pipe(pipe_class_dummy);
+%}
+
+// byte[] to char[] inflation
+instruct string_inflate_const(Universe dummy, iRegP src, iRegP dst, iRegI tmp, immI len, flagsReg cr) %{
+  match(Set dummy (StrInflatedCopy src (Binary dst len)));
+  effect(TEMP tmp, KILL cr); // R0, R1 are killed, too.
+  ins_cost(300);
+  format %{ "String Inflate (constLen) $src->$dst($len)" %}
+  ins_encode %{
+    __ string_inflate_const($src$$Register, $dst$$Register, $tmp$$Register, $len$$constant);
   %}
   ins_pipe(pipe_class_dummy);
 %}
@@ -10318,14 +10330,14 @@
 %}
 
 // encode char[] to byte[] in ISO_8859_1
-instruct encode_iso_array(rarg5RegP src, iRegP dst, iRegI result, roddRegI len, revenRegI evenReg, iRegI tmp, iRegI tmp2, flagsReg cr) %{
+instruct encode_iso_array(iRegP src, iRegP dst, iRegI result, iRegI len, iRegI tmp, flagsReg cr) %{
   match(Set result (EncodeISOArray src (Binary dst len)));
-  effect(TEMP_DEF result, USE_KILL src, USE_KILL len, TEMP evenReg, TEMP tmp, TEMP tmp2, KILL cr); // R0, R1 are killed, too.
+  effect(TEMP_DEF result, TEMP tmp, KILL cr); // R0, R1 are killed, too.
   ins_cost(300);
   format %{ "Encode array $src->$dst($len) -> $result" %}
   ins_encode %{
     __ string_compress($result$$Register, $src$$Register, $dst$$Register, $len$$Register,
-                       $evenReg$$Register, $tmp$$Register, $tmp2$$Register);
+                       $tmp$$Register, true);
   %}
   ins_pipe(pipe_class_dummy);
 %}