8216259: AArch64: Vectorize Adler32 intrinsics
author pli
Thu, 28 Feb 2019 16:37:28 +0800
changeset 53950 cf47efcf7771
parent 53949 4a99d3a6a86d
child 53951 56089cf6152c
child 57249 3e39753ed05b
8216259: AArch64: Vectorize Adler32 intrinsics
Reviewed-by: aph
src/hotspot/cpu/aarch64/assembler_aarch64.hpp
src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp
src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp
src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp
--- a/src/hotspot/cpu/aarch64/assembler_aarch64.hpp	Thu Feb 28 13:37:03 2019 +0800
+++ b/src/hotspot/cpu/aarch64/assembler_aarch64.hpp	Thu Feb 28 16:37:28 2019 +0800
@@ -2209,13 +2209,15 @@
     rf(Vn, 5), rf(Vd, 0);                                                               \
   }
 
-  INSN(addv, 0, 0b100001);
-  INSN(subv, 1, 0b100001);
-  INSN(mulv, 0, 0b100111);
-  INSN(mlav, 0, 0b100101);
-  INSN(mlsv, 1, 0b100101);
-  INSN(sshl, 0, 0b010001);
-  INSN(ushl, 1, 0b010001);
+  INSN(addv,   0, 0b100001);
+  INSN(subv,   1, 0b100001);
+  INSN(mulv,   0, 0b100111);
+  INSN(mlav,   0, 0b100101);
+  INSN(mlsv,   1, 0b100101);
+  INSN(sshl,   0, 0b010001);
+  INSN(ushl,   1, 0b010001);
+  INSN(umullv, 1, 0b110000);
+  INSN(umlalv, 1, 0b100000);
 
 #undef INSN
 
@@ -2227,13 +2229,14 @@
     rf(Vn, 5), rf(Vd, 0);                                                               \
   }
 
-  INSN(absr,  0, 0b100000101110);
-  INSN(negr,  1, 0b100000101110);
-  INSN(notr,  1, 0b100000010110);
-  INSN(addv,  0, 0b110001101110);
-  INSN(cls,   0, 0b100000010010);
-  INSN(clz,   1, 0b100000010010);
-  INSN(cnt,   0, 0b100000010110);
+  INSN(absr,   0, 0b100000101110);
+  INSN(negr,   1, 0b100000101110);
+  INSN(notr,   1, 0b100000010110);
+  INSN(addv,   0, 0b110001101110);
+  INSN(cls,    0, 0b100000010010);
+  INSN(clz,    1, 0b100000010010);
+  INSN(cnt,    0, 0b100000010110);
+  INSN(uaddlv, 1, 0b110000001110);
 
 #undef INSN
 
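The three instructions added above are widening SIMD operations. As a rough scalar model of their semantics (an illustrative sketch of the architectural behavior, not HotSpot code; the function names are ours), assuming the T16B forms of umullv/umlalv assemble the second-half (UMULL2/UMLAL2) variants:

    #include <cstdint>

    // umull Vd.8H, Vn.8B, Vm.8B: widening multiply of the low 8 byte lanes.
    void umull_8h(uint16_t d[8], const uint8_t n[16], const uint8_t m[16]) {
        for (int i = 0; i < 8; i++) d[i] = (uint16_t)n[i] * m[i];
    }

    // umlal2 Vd.8H, Vn.16B, Vm.16B: widening multiply of the high 8 byte
    // lanes, accumulated into the existing halfword lanes.
    void umlal2_8h(uint16_t d[8], const uint8_t n[16], const uint8_t m[16]) {
        for (int i = 0; i < 8; i++) d[i] += (uint16_t)n[i + 8] * m[i + 8];
    }

    // uaddlv Hd, Vn.16B: widening unsigned sum across all 16 byte lanes.
    uint16_t uaddlv_16b(const uint8_t n[16]) {
        uint16_t sum = 0;
        for (int i = 0; i < 16; i++) sum += n[i];
        return sum;
    }

    // uaddlv Sd, Vn.8H: widening unsigned sum across all 8 halfword lanes.
    uint32_t uaddlv_8h(const uint16_t n[8]) {
        uint32_t sum = 0;
        for (int i = 0; i < 8; i++) sum += n[i];
        return sum;
    }
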
--- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Thu Feb 28 13:37:03 2019 +0800
+++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp	Thu Feb 28 16:37:28 2019 +0800
@@ -3257,11 +3257,14 @@
     Register buff   = c_rarg1;
     Register len    = c_rarg2;
     Register nmax  = r4;
-    Register base = r5;
+    Register base  = r5;
     Register count = r6;
     Register temp0 = rscratch1;
     Register temp1 = rscratch2;
-    Register temp2 = r7;
+    FloatRegister vbytes = v0;
+    FloatRegister vs1acc = v1;
+    FloatRegister vs2acc = v2;
+    FloatRegister vtable = v3;
 
     // Max number of bytes we can process before having to take the mod
     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
@@ -3271,6 +3274,10 @@
     __ mov(base, BASE);
     __ mov(nmax, NMAX);
 
+    // Load accumulation coefficients for the upper 16 bits
+    __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
+    __ ld1(vtable, __ T16B, Address(temp0));
+
     // s1 is initialized to the lower 16 bits of adler
     // s2 is initialized to the upper 16 bits of adler
     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
@@ -3311,53 +3318,8 @@
 
     __ bind(L_nmax_loop);
 
-    __ ldp(temp0, temp1, Address(__ post(buff, 16)));
-
-    __ add(s1, s1, temp0, ext::uxtb);
-    __ ubfx(temp2, temp0, 8, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp0, 16, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp0, 24, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp0, 32, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp0, 40, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp0, 48, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp0, Assembler::LSR, 56);
-    __ add(s2, s2, s1);
-
-    __ add(s1, s1, temp1, ext::uxtb);
-    __ ubfx(temp2, temp1, 8, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp1, 16, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp1, 24, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp1, 32, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp1, 40, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp1, 48, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp1, Assembler::LSR, 56);
-    __ add(s2, s2, s1);
+    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
+                                      vbytes, vs1acc, vs2acc, vtable);
 
     __ subs(count, count, 16);
     __ br(Assembler::HS, L_nmax_loop);
@@ -3400,53 +3362,8 @@
 
     __ bind(L_by16_loop);
 
-    __ ldp(temp0, temp1, Address(__ post(buff, 16)));
-
-    __ add(s1, s1, temp0, ext::uxtb);
-    __ ubfx(temp2, temp0, 8, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp0, 16, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp0, 24, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp0, 32, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp0, 40, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp0, 48, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp0, Assembler::LSR, 56);
-    __ add(s2, s2, s1);
-
-    __ add(s1, s1, temp1, ext::uxtb);
-    __ ubfx(temp2, temp1, 8, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp1, 16, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp1, 24, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp1, 32, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp1, 40, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ ubfx(temp2, temp1, 48, 8);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp2);
-    __ add(s2, s2, s1);
-    __ add(s1, s1, temp1, Assembler::LSR, 56);
-    __ add(s2, s2, s1);
+    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
+                                      vbytes, vs1acc, vs2acc, vtable);
 
     __ subs(len, len, 16);
     __ br(Assembler::HS, L_by16_loop);
@@ -3500,6 +3417,43 @@
     return start;
   }
 
+  void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
+          Register temp0, Register temp1, FloatRegister vbytes,
+          FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
+    // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
+    // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
+    // In non-vectorized code, we update s1 and s2 as:
+    //   s1 <- s1 + b1
+    //   s2 <- s2 + s1
+    //   s1 <- s1 + b2
+    //   s2 <- s2 + s1
+    //   ...
+    //   s1 <- s1 + b16
+    //   s2 <- s2 + s1
+    // Putting the above assignments together, we have:
+    //   s1_new = s1 + b1 + b2 + ... + b16
+    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
+    //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
+    //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
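+    // Load the next 16 bytes (b1, ..., b16) into vbytes.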
+    __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
+
+    // s2 = s2 + s1 * 16
+    __ add(s2, s2, s1, Assembler::LSL, 4);
+
+    // vs1acc = b1 + b2 + b3 + ... + b16
+    // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
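+    // umullv multiplies the low 8 byte lanes of vtable and vbytes, widening to
+    // halfwords; umlalv in its T16B (second-half) form accumulates the products
+    // of the high 8 lanes; the two uaddlv then reduce the byte lanes and the
+    // halfword products to scalar sums.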
+    __ umullv(vs2acc, __ T8B, vtable, vbytes);
+    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
+    __ uaddlv(vs1acc, __ T16B, vbytes);
+    __ uaddlv(vs2acc, __ T8H, vs2acc);
+
+    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
+    __ fmovd(temp0, vs1acc);
+    __ fmovd(temp1, vs2acc);
+    __ add(s1, s1, temp0);
+    __ add(s2, s2, temp1);
+  }
+
   /**
    *  Arguments:
    *
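The derivation in generate_updateBytesAdler32_accum above reduces 16 scalar updates to two sums: s1 grows by the sum of the bytes, and s2 grows by 16*s1 plus a dot product with the coefficients (16, 15, ..., 1) held in _adler_table. A minimal standalone check of that identity (our own sketch, not part of this change):

    #include <cassert>
    #include <cstdint>

    int main() {
        uint8_t b[16];
        for (int i = 0; i < 16; i++) b[i] = (uint8_t)(37 * i + 11);  // arbitrary data

        uint32_t s1 = 1, s2 = 0;    // scalar reference accumulators
        uint32_t t1 = s1, t2 = s2;  // closed-form accumulators

        // Scalar Adler-32 inner loop over one 16-byte block.
        for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += s1; }

        // Closed form used by the intrinsic:
        //   s1_new = s1 + (b1 + b2 + ... + b16)
        //   s2_new = s2 + 16*s1 + (16*b1 + 15*b2 + ... + 1*b16)
        uint32_t sum = 0, dot = 0;
        for (int i = 0; i < 16; i++) {
            sum += b[i];
            dot += (uint32_t)(16 - i) * b[i];  // coefficients 16, 15, ..., 1
        }
        t2 += 16 * t1 + dot;
        t1 += sum;

        assert(t1 == s1 && t2 == s2);
        return 0;
    }
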
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Thu Feb 28 13:37:03 2019 +0800
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.cpp	Thu Feb 28 16:37:28 2019 +0800
@@ -287,6 +287,11 @@
     0xD502ED78UL, 0xAE7D62EDUL,         // byte swap of word swap
 };
 
+// Accumulation coefficients for the upper 16 bits (s2) of adler32
+jubyte StubRoutines::aarch64::_adler_table[] __attribute__ ((aligned(64))) = {
+    16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
+};
+
 juint StubRoutines::aarch64::_npio2_hw[] __attribute__ ((aligned(64))) = {
     // first, various coefficient values: 0.5, invpio2, pio2_1, pio2_1t, pio2_2,
     // pio2_2t, pio2_3, pio2_3t
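The NMAX comment in stubGenerator_aarch64.cpp above states that 5552 is the largest n for which 255*n*(n+1)/2 + (n+1)*(BASE-1) still fits in 32 bits, i.e. the most bytes that can be accumulated before s1 and s2 must be reduced mod BASE. A quick standalone check of that bound (our own sketch):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t BASE = 65521;  // largest prime below 2^16
        // Worst case for s2 after n bytes of 0xFF, starting from s1 = s2 = BASE-1.
        auto worst = [&](uint64_t n) {
            return 255 * n * (n + 1) / 2 + (n + 1) * (BASE - 1);
        };
        uint64_t n = 0;
        while (worst(n + 1) <= 0xFFFFFFFFULL) n++;
        printf("NMAX = %llu\n", (unsigned long long)n);  // prints NMAX = 5552
        return 0;
    }
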
--- a/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Thu Feb 28 13:37:03 2019 +0800
+++ b/src/hotspot/cpu/aarch64/stubRoutines_aarch64.hpp	Thu Feb 28 16:37:28 2019 +0800
@@ -186,6 +186,7 @@
 
 private:
   static juint    _crc_table[];
+  static jubyte   _adler_table[];
   // begin trigonometric tables block. See comments in .cpp file
   static juint    _npio2_hw[];
   static jdouble   _two_over_pi[];