8143012: CRC32 Intrinsics support on SPARC
author kvn
Fri, 20 Nov 2015 08:29:10 -0800
changeset 34205 9ec51d30a11e
parent 34204 5ad1ba3afecc
child 34206 7b7b1a9ef2e7
8143012: CRC32 Intrinsics support on SPARC Reviewed-by: kvn, roland Contributed-by: ahmed.khawaja@oracle.com
hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp
hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp
hotspot/src/cpu/sparc/vm/interpreterGenerator_sparc.hpp
hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp
hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
hotspot/src/cpu/sparc/vm/stubRoutines_sparc.cpp
hotspot/src/cpu/sparc/vm/stubRoutines_sparc.hpp
hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp
hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
hotspot/test/compiler/intrinsics/crc32/TestCRC32.java
--- a/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp	Fri Nov 20 08:29:10 2015 -0800
@@ -2812,7 +2812,23 @@
 }
 
 void LIR_Assembler::emit_updatecrc32(LIR_OpUpdateCRC32* op) {
-  fatal("CRC32 intrinsic is not implemented on this platform");
+  assert(op->crc()->is_single_cpu(),  "crc must be register");
+  assert(op->val()->is_single_cpu(),  "byte value must be register");
+  assert(op->result_opr()->is_single_cpu(), "result must be register");
+  Register crc = op->crc()->as_register();
+  Register val = op->val()->as_register();
+  Register table = op->result_opr()->as_register();
+  Register res   = op->result_opr()->as_register();
+
+  assert_different_registers(val, crc, table);
+
+  __ set(ExternalAddress(StubRoutines::crc_table_addr()), table);
+  __ not1(crc);
+  __ clruwu(crc);
+  __ update_byte_crc32(crc, val, table);
+  __ not1(crc);
+
+  __ mov(crc, res);
 }
 
 void LIR_Assembler::emit_lock(LIR_OpLock* op) {
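
For reference, the emitted sequence above performs one reflected CRC-32 byte step with the usual pre- and post-inversion. A minimal self-contained C++ sketch of that reference semantics (illustration only, not the SPARC code; the bit loop is equivalent to the single 256-entry table lookup done by update_byte_crc32):

  #include <cstdint>
  #include <cstdio>

  // One byte step of the reflected CRC-32 (polynomial 0xEDB88320); the loop
  // is functionally equivalent to one table lookup.
  static uint32_t crc32_byte(uint32_t crc, uint8_t val) {
    crc ^= val;
    for (int i = 0; i < 8; i++)
      crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
    return crc;
  }

  int main() {
    uint32_t crc = 0;              // java.util.zip.CRC32 starts at 0
    crc = ~crc;                    // matches not1(crc) before the update
    crc = crc32_byte(crc, 'a');
    crc = ~crc;                    // matches not1(crc) after the update
    printf("crc32(\"a\") = %08x\n", crc);  // expected 0xe8b7be43
    return 0;
  }
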
--- a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp	Fri Nov 20 08:29:10 2015 -0800
@@ -786,7 +786,86 @@
 }
 
 void LIRGenerator::do_update_CRC32(Intrinsic* x) {
-  fatal("CRC32 intrinsic is not implemented on this platform");
+  // Make all state_for calls early since they can emit code
+  LIR_Opr result = rlock_result(x);
+  int flags = 0;
+  switch (x->id()) {
+    case vmIntrinsics::_updateCRC32: {
+      LIRItem crc(x->argument_at(0), this);
+      LIRItem val(x->argument_at(1), this);
+      // val is destroyed by update_crc32
+      val.set_destroys_register();
+      crc.load_item();
+      val.load_item();
+      __ update_crc32(crc.result(), val.result(), result);
+      break;
+    }
+    case vmIntrinsics::_updateBytesCRC32:
+    case vmIntrinsics::_updateByteBufferCRC32: {
+
+      bool is_updateBytes = (x->id() == vmIntrinsics::_updateBytesCRC32);
+
+      LIRItem crc(x->argument_at(0), this);
+      LIRItem buf(x->argument_at(1), this);
+      LIRItem off(x->argument_at(2), this);
+      LIRItem len(x->argument_at(3), this);
+
+      buf.load_item();
+      off.load_nonconstant();
+
+      LIR_Opr index = off.result();
+      int offset = is_updateBytes ? arrayOopDesc::base_offset_in_bytes(T_BYTE) : 0;
+      if (off.result()->is_constant()) {
+        index = LIR_OprFact::illegalOpr;
+        offset += off.result()->as_jint();
+      }
+
+      LIR_Opr base_op = buf.result();
+
+      if (index->is_valid()) {
+        LIR_Opr tmp = new_register(T_LONG);
+        __ convert(Bytecodes::_i2l, index, tmp);
+        index = tmp;
+        if (index->is_constant()) {
+          offset += index->as_constant_ptr()->as_jint();
+          index = LIR_OprFact::illegalOpr;
+        } else if (index->is_register()) {
+          LIR_Opr tmp2 = new_register(T_LONG);
+          LIR_Opr tmp3 = new_register(T_LONG);
+          __ move(base_op, tmp2);
+          __ move(index, tmp3);
+          __ add(tmp2, tmp3, tmp2);
+          base_op = tmp2;
+        } else {
+          ShouldNotReachHere();
+        }
+      }
+
+      LIR_Address* a = new LIR_Address(base_op, offset, T_BYTE);
+
+      BasicTypeList signature(3);
+      signature.append(T_INT);
+      signature.append(T_ADDRESS);
+      signature.append(T_INT);
+      CallingConvention* cc = frame_map()->c_calling_convention(&signature);
+      const LIR_Opr result_reg = result_register_for(x->type());
+
+      LIR_Opr addr = new_pointer_register();
+      __ leal(LIR_OprFact::address(a), addr);
+
+      crc.load_item_force(cc->at(0));
+      __ move(addr, cc->at(1));
+      len.load_item_force(cc->at(2));
+
+      __ call_runtime_leaf(StubRoutines::updateBytesCRC32(), getThreadTemp(), result_reg, cc->args());
+      __ move(result_reg, result);
+
+      break;
+    }
+    default: {
+      ShouldNotReachHere();
+    }
+  }
 }
 
 // _i2l, _i2f, _i2d, _l2i, _l2f, _l2d, _f2i, _f2l, _f2d, _d2i, _d2l, _d2f
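
The three-entry signature built above (T_INT, T_ADDRESS, T_INT) means the leaf call treats the stub as a C-style function taking (crc, source address, length) and returning the new crc. A hedged sketch of that shape (the typedef name is illustrative, not a HotSpot declaration):

  #include <cstdint>

  // Assumed logical signature of StubRoutines::updateBytesCRC32() as invoked
  // by call_runtime_leaf above: crc in, buffer address, length, crc out.
  typedef uint32_t (*updateBytesCRC32_stub_t)(uint32_t crc,
                                              const uint8_t* buf,
                                              uint32_t len);

  int main() {
    updateBytesCRC32_stub_t stub = nullptr; // would be the generated stub address
    (void)stub;
    return 0;
  }
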
--- a/hotspot/src/cpu/sparc/vm/interpreterGenerator_sparc.hpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/interpreterGenerator_sparc.hpp	Fri Nov 20 08:29:10 2015 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -43,8 +43,9 @@
   void generate_counter_incr(Label* overflow, Label* profile_method, Label* profile_method_continue);
   void generate_counter_overflow(Label& Lcontinue);
 
+  address generate_CRC32_update_entry();
+  address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+
   // Not supported
-  address generate_CRC32_update_entry() { return NULL; }
-  address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
   address generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { return NULL; }
 #endif // CPU_SPARC_VM_INTERPRETERGENERATOR_SPARC_HPP
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp	Fri Nov 20 08:29:10 2015 -0800
@@ -4771,3 +4771,243 @@
   movdtox(src, tmp1);
   reverse_bytes_32(tmp1, dst, tmp2);
 }
+
+void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register buf, int offset) {
+  xmulx(xcrc_hi, xK_hi, xtmp_lo);
+  xmulxhi(xcrc_hi, xK_hi, xtmp_hi);
+  xmulxhi(xcrc_lo, xK_lo, xcrc_hi);
+  xmulx(xcrc_lo, xK_lo, xcrc_lo);
+  xor3(xcrc_lo, xtmp_lo, xcrc_lo);
+  xor3(xcrc_hi, xtmp_hi, xcrc_hi);
+  ldxl(buf, G0, xtmp_lo);
+  inc(buf, 8);
+  ldxl(buf, G0, xtmp_hi);
+  inc(buf, 8);
+  xor3(xcrc_lo, xtmp_lo, xcrc_lo);
+  xor3(xcrc_hi, xtmp_hi, xcrc_hi);
+}
+
+void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register xbuf_hi, Register xbuf_lo) {
+  mov(xcrc_lo, xtmp_lo);
+  mov(xcrc_hi, xtmp_hi);
+  xmulx(xtmp_hi, xK_hi, xtmp_lo);
+  xmulxhi(xtmp_hi, xK_hi, xtmp_hi);
+  xmulxhi(xcrc_lo, xK_lo, xcrc_hi);
+  xmulx(xcrc_lo, xK_lo, xcrc_lo);
+  xor3(xcrc_lo, xbuf_lo, xcrc_lo);
+  xor3(xcrc_hi, xbuf_hi, xcrc_hi);
+  xor3(xcrc_lo, xtmp_lo, xcrc_lo);
+  xor3(xcrc_hi, xtmp_hi, xcrc_hi);
+}
+
+void MacroAssembler::fold_8bit_crc32(Register xcrc, Register table, Register xtmp, Register tmp) {
+  and3(xcrc, 0xFF, tmp);
+  sllx(tmp, 2, tmp);
+  lduw(table, tmp, xtmp);
+  srlx(xcrc, 8, xcrc);
+  xor3(xtmp, xcrc, xcrc);
+}
+
+void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
+  and3(crc, 0xFF, tmp);
+  srlx(crc, 8, crc);
+  sllx(tmp, 2, tmp);
+  lduw(table, tmp, tmp);
+  xor3(tmp, crc, crc);
+}
+
+#define CRC32_TMP_REG_NUM 18
+
+#define CRC32_CONST_64  0x163cd6124
+#define CRC32_CONST_96  0x0ccaa009e
+#define CRC32_CONST_160 0x1751997d0
+#define CRC32_CONST_480 0x1c6e41596
+#define CRC32_CONST_544 0x154442bd4
+
+void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table) {
+
+  Label L_cleanup_loop, L_cleanup_check, L_align_loop, L_align_check;
+  Label L_main_loop_prologue;
+  Label L_fold_512b, L_fold_512b_loop, L_fold_128b;
+  Label L_fold_tail, L_fold_tail_loop;
+  Label L_8byte_fold_loop, L_8byte_fold_check;
+
+  const Register tmp[CRC32_TMP_REG_NUM] = {L0, L1, L2, L3, L4, L5, L6, G1, I0, I1, I2, I3, I4, I5, I7, O4, O5, G3};
+
+  Register const_64  = tmp[CRC32_TMP_REG_NUM-1];
+  Register const_96  = tmp[CRC32_TMP_REG_NUM-1];
+  Register const_160 = tmp[CRC32_TMP_REG_NUM-2];
+  Register const_480 = tmp[CRC32_TMP_REG_NUM-1];
+  Register const_544 = tmp[CRC32_TMP_REG_NUM-2];
+
+  set(ExternalAddress(StubRoutines::crc_table_addr()), table);
+
+  not1(crc); // ~c
+  clruwu(crc); // clear upper 32 bits of crc
+
+  // Check if below cutoff, proceed directly to cleanup code
+  mov(31, G4);
+  cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check);
+
+  // Align buffer to an 8-byte boundary
+  mov(8, O5);
+  and3(buf, 0x7, O4);
+  sub(O5, O4, O5);
+  and3(O5, 0x7, O5);
+  sub(len, O5, len);
+  ba(L_align_check);
+  delayed()->nop();
+
+  // Alignment loop, table look up method for up to 7 bytes
+  bind(L_align_loop);
+  ldub(buf, 0, O4);
+  inc(buf);
+  dec(O5);
+  xor3(O4, crc, O4);
+  and3(O4, 0xFF, O4);
+  sllx(O4, 2, O4);
+  lduw(table, O4, O4);
+  srlx(crc, 8, crc);
+  xor3(O4, crc, crc);
+  bind(L_align_check);
+  nop();
+  cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_align_loop);
+
+  // Aligned on a 64-bit (8-byte) boundary at this point
+  // Check if still above the cutoff (31 bytes)
+  mov(31, G4);
+  cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check);
+  // At least 32 bytes left to process
+
+  // Free up registers by storing them to FP registers
+  for (int i = 0; i < CRC32_TMP_REG_NUM; i++) {
+    movxtod(tmp[i], as_FloatRegister(2*i));
+  }
+
+  // Determine which loop to enter
+  // Shared prologue
+  ldxl(buf, G0, tmp[0]);
+  inc(buf, 8);
+  ldxl(buf, G0, tmp[1]);
+  inc(buf, 8);
+  xor3(tmp[0], crc, tmp[0]); // Fold CRC into first few bytes
+  and3(crc, 0, crc); // Clear out the crc register
+  // Main loop needs 128-bytes at least
+  mov(128, G4);
+  mov(64, tmp[2]);
+  cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_main_loop_prologue);
+  // Less than 64 bytes
+  nop();
+  cmp_and_br_short(len, tmp[2], Assembler::lessUnsigned, Assembler::pt, L_fold_tail);
+  // Between 64 and 127 bytes
+  set64(CRC32_CONST_96,  const_96,  tmp[8]);
+  set64(CRC32_CONST_160, const_160, tmp[9]);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[4], tmp[5], buf, 16);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[6], tmp[7], buf, 32);
+  dec(len, 48);
+  ba(L_fold_tail);
+  delayed()->nop();
+
+  bind(L_main_loop_prologue);
+  for (int i = 2; i < 8; i++) {
+    ldxl(buf, G0, tmp[i]);
+    inc(buf, 8);
+  }
+
+  // Fold total 512 bits of polynomial on each iteration,
+  // 128 bits per each of 4 parallel streams
+  set64(CRC32_CONST_480, const_480, tmp[8]);
+  set64(CRC32_CONST_544, const_544, tmp[9]);
+
+  mov(128, G4);
+  bind(L_fold_512b_loop);
+  fold_128bit_crc32(tmp[1], tmp[0], const_480, const_544, tmp[9],  tmp[8],  buf,  0);
+  fold_128bit_crc32(tmp[3], tmp[2], const_480, const_544, tmp[11], tmp[10], buf, 16);
+  fold_128bit_crc32(tmp[5], tmp[4], const_480, const_544, tmp[13], tmp[12], buf, 32);
+  fold_128bit_crc32(tmp[7], tmp[6], const_480, const_544, tmp[15], tmp[14], buf, 64);
+  dec(len, 64);
+  cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_512b_loop);
+
+  // Fold 512 bits to 128 bits
+  bind(L_fold_512b);
+  set64(CRC32_CONST_96,  const_96,  tmp[8]);
+  set64(CRC32_CONST_160, const_160, tmp[9]);
+
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[3], tmp[2]);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[5], tmp[4]);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[7], tmp[6]);
+  dec(len, 48);
+
+  // Fold the rest of 128 bits data chunks
+  bind(L_fold_tail);
+  mov(32, G4);
+  cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_fold_128b);
+
+  set64(CRC32_CONST_96,  const_96,  tmp[8]);
+  set64(CRC32_CONST_160, const_160, tmp[9]);
+
+  bind(L_fold_tail_loop);
+  fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0);
+  sub(len, 16, len);
+  cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_tail_loop);
+
+  // Fold the 128 bits in tmps 0 - 1 into tmp 1
+  bind(L_fold_128b);
+
+  set64(CRC32_CONST_64, const_64, tmp[4]);
+
+  xmulx(const_64, tmp[0], tmp[2]);
+  xmulxhi(const_64, tmp[0], tmp[3]);
+
+  srl(tmp[2], G0, tmp[4]);
+  xmulx(const_64, tmp[4], tmp[4]);
+
+  srlx(tmp[2], 32, tmp[2]);
+  sllx(tmp[3], 32, tmp[3]);
+  or3(tmp[2], tmp[3], tmp[2]);
+
+  xor3(tmp[4], tmp[1], tmp[4]);
+  xor3(tmp[4], tmp[2], tmp[1]);
+  dec(len, 8);
+
+  // Use table lookup for the 8 bytes left in tmp[1]
+  dec(len, 8);
+
+  // 8 8-bit folds to compute 32-bit CRC.
+  for (int j = 0; j < 4; j++) {
+    fold_8bit_crc32(tmp[1], table, tmp[2], tmp[3]);
+  }
+  srl(tmp[1], G0, crc); // move 32 bits to general register
+  for (int j = 0; j < 4; j++) {
+    fold_8bit_crc32(crc, table, tmp[3]);
+  }
+
+  bind(L_8byte_fold_check);
+
+  // Restore int registers saved in FP registers
+  for (int i = 0; i < CRC32_TMP_REG_NUM; i++) {
+    movdtox(as_FloatRegister(2*i), tmp[i]);
+  }
+
+  ba(L_cleanup_check);
+  delayed()->nop();
+
+  // Table look-up method for the remaining few bytes
+  bind(L_cleanup_loop);
+  ldub(buf, 0, O4);
+  inc(buf);
+  dec(len);
+  xor3(O4, crc, O4);
+  and3(O4, 0xFF, O4);
+  sllx(O4, 2, O4);
+  lduw(table, O4, O4);
+  srlx(crc, 8, crc);
+  xor3(O4, crc, crc);
+  bind(L_cleanup_check);
+  nop();
+  cmp_and_br_short(len, 0, Assembler::greaterUnsigned, Assembler::pt, L_cleanup_loop);
+
+  not1(crc);
+}
+
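
fold_128bit_crc32 above relies on the VIS3 xmulx/xmulxhi pair, i.e. the low and high halves of a 64x64 carry-less (XOR) multiply. A rough software model of that primitive, as a sketch for illustration only (not the HotSpot implementation):

  #include <cstdint>
  #include <cstdio>

  // Carry-less 64x64 -> 128-bit multiply: XOR together (a << i) for every set
  // bit i of b. xmulx yields the low 64 bits, xmulxhi the high 64 bits.
  static void xmul128(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
    uint64_t h = 0, l = 0;
    for (int i = 0; i < 64; i++) {
      if ((b >> i) & 1) {
        l ^= a << i;
        if (i != 0) h ^= a >> (64 - i);
      }
    }
    *hi = h;
    *lo = l;
  }

  int main() {
    uint64_t hi, lo;
    xmul128(0x163cd6124ULL, 3, &hi, &lo); // e.g. CRC32_CONST_64 times (x + 1)
    printf("hi=%016llx lo=%016llx\n",
           (unsigned long long)hi, (unsigned long long)lo);
    return 0;
  }
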
--- a/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/macroAssembler_sparc.hpp	Fri Nov 20 08:29:10 2015 -0800
@@ -904,7 +904,9 @@
   inline void ldf(FloatRegisterImpl::Width w, const Address& a, FloatRegister d, int offset = 0);
 
   // little-endian
-  inline void ldxl(Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); }
+  inline void lduwl(Register s1, Register s2, Register d) { lduwa(s1, s2, ASI_PRIMARY_LITTLE, d); }
+  inline void ldswl(Register s1, Register s2, Register d) { ldswa(s1, s2, ASI_PRIMARY_LITTLE, d);}
+  inline void ldxl( Register s1, Register s2, Register d) { ldxa(s1, s2, ASI_PRIMARY_LITTLE, d); }
   inline void ldfl(FloatRegisterImpl::Width w, Register s1, Register s2, FloatRegister d) { ldfa(w, s1, s2, ASI_PRIMARY_LITTLE, d); }
 
   // membar psuedo instruction.  takes into account target memory model.
@@ -1469,6 +1471,15 @@
   void movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2);
   void movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2);
 
+  // CRC32 code for the java.util.zip.CRC32::updateBytes0() intrinsic.
+  void kernel_crc32(Register crc, Register buf, Register len, Register table);
+  // Fold 128-bit data chunk
+  void fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register buf, int offset);
+  void fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register xbuf_hi, Register xbuf_lo);
+  // Fold 8-bit data
+  void fold_8bit_crc32(Register xcrc, Register table, Register xtmp, Register tmp);
+  void fold_8bit_crc32(Register crc, Register table, Register tmp);
+
 #undef VIRTUAL
 };
 
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp	Fri Nov 20 08:29:10 2015 -0800
@@ -5292,6 +5292,38 @@
     return start;
   }
 
+  /**
+   *  Arguments:
+   *
+   * Inputs:
+   *   O0   - int   crc
+   *   O1   - byte* buf
+   *   O2   - int   len
+   *   O3   - int*  table
+   *
+   * Output:
+   *   O0   - int crc result
+   */
+  address generate_updateBytesCRC32() {
+    assert(UseCRC32Intrinsics, "need VIS3 instructions");
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
+    address start = __ pc();
+
+    const Register crc   = O0; // crc
+    const Register buf   = O1; // source java byte array address
+    const Register len   = O2; // length
+    const Register table = O3; // crc_table address (reuse register)
+
+    __ kernel_crc32(crc, buf, len, table);
+
+    __ retl();
+    __ delayed()->nop();
+
+    return start;
+  }
+
   void generate_initial() {
     // Generates all stubs and initializes the entry points
 
@@ -5324,6 +5356,12 @@
 
     // Build this early so it's available for the interpreter.
     StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
+
+    if (UseCRC32Intrinsics) {
+      // Set the table address before generating stubs that use it
+      StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
+      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
+    }
   }
 
 
--- a/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.cpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.cpp	Fri Nov 20 08:29:10 2015 -0800
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -52,3 +52,98 @@
 address StubRoutines::Sparc::_flush_callers_register_windows_entry = CAST_FROM_FN_PTR(address, bootstrap_flush_windows);
 
 address StubRoutines::Sparc::_partial_subtype_check = NULL;
+
+uint64_t StubRoutines::Sparc::_crc_by128_masks[] =
+{
+  /* The fields in this structure are arranged so that they can be
+   * picked up two at a time with 128-bit loads.
+   *
+   * Because of the flipped bit order for this CRC polynomial,
+   * the constant for X**N is left-shifted by 1.  This is because
+   * a 64 x 64 polynomial multiply produces a 127-bit result
+   * but the highest term is always aligned to bit 0 in the container.
+   * Pre-shifting by one fixes this, at the cost of potentially making
+   * the 32-bit constant no longer fit in a 32-bit container (thus the
+   * use of uint64_t, though this is also the size used by the carry-
+   * less multiply instruction).
+   *
+   * In addition, the flipped bit order and highest-term-at-least-bit
+   * multiply changes the constants used.  The 96-bit result will be
+   * aligned to the high-term end of the target 128-bit container,
+   * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
+   * it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
+   *
+   * This causes additional problems in the 128-to-64-bit reduction; see the
+   * code for details.  By storing a mask in the otherwise unused half of
+   * a 128-bit constant, bits can be cleared before multiplication without
+   * storing and reloading.  Note that staying on a 128-bit datapath means
+   * that some data is uselessly stored and some unused data is intersected
+   * with an irrelevant constant.
+   */
+
+  ((uint64_t) 0xffffffffUL),     /* low  of K_M_64    */
+  ((uint64_t) 0xb1e6b092U << 1), /* high of K_M_64    */
+  ((uint64_t) 0xba8ccbe8U << 1), /* low  of K_160_96  */
+  ((uint64_t) 0x6655004fU << 1), /* high of K_160_96  */
+  ((uint64_t) 0xaa2215eaU << 1), /* low  of K_544_480 */
+  ((uint64_t) 0xe3720acbU << 1)  /* high of K_544_480 */
+};
+
+/**
+ *  crc_table[] from jdk/src/java.base/share/native/libzip/zlib-1.2.8/crc32.h
+ */
+juint StubRoutines::Sparc::_crc_table[] =
+{
+    0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
+    0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
+    0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
+    0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
+    0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
+    0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
+    0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
+    0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
+    0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
+    0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
+    0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
+    0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
+    0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
+    0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
+    0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
+    0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
+    0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
+    0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
+    0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
+    0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
+    0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
+    0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
+    0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
+    0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
+    0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
+    0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
+    0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
+    0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
+    0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
+    0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
+    0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
+    0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
+    0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
+    0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
+    0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
+    0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
+    0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
+    0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
+    0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
+    0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
+    0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
+    0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
+    0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
+    0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
+    0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
+    0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
+    0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
+    0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
+    0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
+    0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
+    0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
+    0x2d02ef8dUL
+};
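
The table above is the standard zlib CRC-32 lookup table. For context, a table with these values can be regenerated from the reflected polynomial 0xEDB88320 as in the short sketch below; HotSpot ships the precomputed values rather than generating them:

  #include <cstdint>
  #include <cstdio>

  // Rebuild the 256-entry reflected CRC-32 table and print the first entries;
  // they should match 0x00000000, 0x77073096, 0xee0e612c above.
  int main() {
    uint32_t table[256];
    for (uint32_t n = 0; n < 256; n++) {
      uint32_t c = n;
      for (int k = 0; k < 8; k++)
        c = (c & 1) ? (0xEDB88320u ^ (c >> 1)) : (c >> 1);
      table[n] = c;
    }
    printf("%08x %08x %08x\n", table[0], table[1], table[2]);
    return 0;
  }
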
--- a/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/stubRoutines_sparc.hpp	Fri Nov 20 08:29:10 2015 -0800
@@ -53,6 +53,9 @@
   static address _flush_callers_register_windows_entry;
 
   static address _partial_subtype_check;
+  // masks and table for CRC32
+  static uint64_t _crc_by128_masks[];
+  static juint    _crc_table[];
 
  public:
   // test assembler stop routine by setting registers
@@ -65,6 +68,8 @@
   static intptr_t* (*flush_callers_register_windows_func())() { return CAST_TO_FN_PTR(intptr_t* (*)(void), _flush_callers_register_windows_entry); }
 
   static address partial_subtype_check()                  { return _partial_subtype_check; }
+
+  static address crc_by128_masks_addr()  { return (address)_crc_by128_masks; }
 };
 
 #endif // CPU_SPARC_VM_STUBROUTINES_SPARC_HPP
--- a/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp	Fri Nov 20 08:29:10 2015 -0800
@@ -803,6 +803,106 @@
   return NULL;
 }
 
+/**
+ * Method entry for static native methods:
+ *   int java.util.zip.CRC32.update(int crc, int b)
+ */
+address InterpreterGenerator::generate_CRC32_update_entry() {
+
+  if (UseCRC32Intrinsics) {
+    address entry = __ pc();
+
+    Label L_slow_path;
+    // If we need a safepoint check, generate full interpreter entry.
+    ExternalAddress state(SafepointSynchronize::address_of_state());
+    __ set(ExternalAddress(SafepointSynchronize::address_of_state()), O2);
+    __ set(SafepointSynchronize::_not_synchronized, O3);
+    __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pt, L_slow_path);
+
+    // Load parameters
+    const Register crc   = O0; // initial crc
+    const Register val   = O1; // byte to update with
+    const Register table = O2; // address of 256-entry lookup table
+
+    __ ldub(Gargs, 3, val);
+    __ lduw(Gargs, 8, crc);
+
+    __ set(ExternalAddress(StubRoutines::crc_table_addr()), table);
+
+    __ not1(crc); // ~crc
+    __ clruwu(crc);
+    __ update_byte_crc32(crc, val, table);
+    __ not1(crc); // ~crc
+
+    // result in O0
+    __ retl();
+    __ delayed()->nop();
+
+    // generate a vanilla native entry as the slow path
+    __ bind(L_slow_path);
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
+    return entry;
+  }
+  return NULL;
+}
+
+/**
+ * Method entry for static native methods:
+ *   int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len)
+ *   int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
+ */
+address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
+
+  if (UseCRC32Intrinsics) {
+    address entry = __ pc();
+
+    Label L_slow_path;
+    // If we need a safepoint check, generate full interpreter entry.
+    ExternalAddress state(SafepointSynchronize::address_of_state());
+    __ set(ExternalAddress(SafepointSynchronize::address_of_state()), O2);
+    __ set(SafepointSynchronize::_not_synchronized, O3);
+    __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pt, L_slow_path);
+
+    // Load parameters from the stack
+    const Register crc    = O0; // initial crc
+    const Register buf    = O1; // source java byte array address
+    const Register len    = O2; // len
+    const Register offset = O3; // offset
+
+    // Arguments are reversed on java expression stack
+    // Calculate address of start element
+    if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) {
+      __ lduw(Gargs, 0,  len);
+      __ lduw(Gargs, 8,  offset);
+      __ ldx( Gargs, 16, buf);
+      __ lduw(Gargs, 32, crc);
+      __ add(buf, offset, buf);
+    } else {
+      __ lduw(Gargs, 0,  len);
+      __ lduw(Gargs, 8,  offset);
+      __ ldx( Gargs, 16, buf);
+      __ lduw(Gargs, 24, crc);
+      __ add(buf, arrayOopDesc::base_offset_in_bytes(T_BYTE), buf); // account for the header size
+      __ add(buf, offset, buf);
+    }
+
+    // Call the crc32 kernel
+    __ MacroAssembler::save_thread(L7_thread_cache);
+    __ kernel_crc32(crc, buf, len, O3);
+    __ MacroAssembler::restore_thread(L7_thread_cache);
+
+    // result in O0
+    __ retl();
+    __ delayed()->nop();
+
+    // generate a vanilla native entry as the slow path
+    __ bind(L_slow_path);
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native));
+    return entry;
+  }
+  return NULL;
+}
+
 //
 // Interpreter stub for calling a native method. (asm interpreter)
 // This sets up a somewhat different looking stack for calling the native method
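
The Gargs displacements used in the two entries above follow from the reversed argument order on the interpreter's expression stack, assuming one 8-byte slot per argument word (so the long buffer address in updateByteBuffer takes two slots and pushes crc out to offset 32). A small sketch that reproduces those offsets:

  #include <cstdio>

  int main() {
    const int slot = 8; // assumed expression-stack slot size on 64-bit SPARC
    // updateBytes(int crc, byte[] b, int off, int len): four one-slot arguments
    printf("updateBytes:      len@%d off@%d buf@%d crc@%d\n",
           0 * slot, 1 * slot, 2 * slot, 3 * slot);
    // updateByteBuffer(int crc, long addr, int off, int len): the long address
    // occupies two slots, so crc lands at offset 32
    printf("updateByteBuffer: len@%d off@%d buf@%d crc@%d\n",
           0 * slot, 1 * slot, 2 * slot, 4 * slot);
    return 0;
  }
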
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Fri Nov 20 10:09:42 2015 +0100
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp	Fri Nov 20 08:29:10 2015 -0800
@@ -347,6 +347,15 @@
     FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
   }
 
+  if (UseVIS > 2) {
+    if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
+      FLAG_SET_DEFAULT(UseCRC32Intrinsics, true);
+    }
+  } else if (UseCRC32Intrinsics) {
+    warning("SPARC CRC32 intrinsics require VIS3 instruction support. Intrinsics will be disabled");
+    FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
+  }
+
   if (FLAG_IS_DEFAULT(ContendedPaddingWidth) &&
     (cache_line_size > ContendedPaddingWidth))
     ContendedPaddingWidth = cache_line_size;
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/test/compiler/intrinsics/crc32/TestCRC32.java	Fri Nov 20 08:29:10 2015 -0800
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8143012
+ * @summary CRC32 Intrinsics support on SPARC
+ *
+ * @run main/othervm/timeout=720 -Xbatch TestCRC32 -m
+ */
+
+import java.nio.ByteBuffer;
+import java.util.zip.Checksum;
+import java.util.zip.CRC32;
+
+public class TestCRC32 {
+    public static void main(String[] args) {
+        int offset = Integer.getInteger("offset", 0);
+        int msgSize = Integer.getInteger("msgSize", 512);
+        boolean multi = false;
+        int iters = 20000;
+        int warmupIters = 20000;
+
+        if (args.length > 0) {
+            if (args[0].equals("-m")) {
+                multi = true;
+            } else {
+                iters = Integer.valueOf(args[0]);
+            }
+            if (args.length > 1) {
+                warmupIters = Integer.valueOf(args[1]);
+            }
+        }
+
+        if (multi) {
+            test_multi(warmupIters);
+            return;
+        }
+
+        System.out.println(" offset = " + offset);
+        System.out.println("msgSize = " + msgSize + " bytes");
+        System.out.println("  iters = " + iters);
+
+        byte[] b = initializedBytes(msgSize, offset);
+
+        CRC32 crc0 = new CRC32();
+        CRC32 crc1 = new CRC32();
+        CRC32 crc2 = new CRC32();
+
+        crc0.update(b, offset, msgSize);
+
+        System.out.println("-------------------------------------------------------");
+
+        /* warm up */
+        for (int i = 0; i < warmupIters; i++) {
+            crc1.reset();
+            crc1.update(b, offset, msgSize);
+        }
+
+        /* measure performance */
+        long start = System.nanoTime();
+        for (int i = 0; i < iters; i++) {
+            crc1.reset();
+            crc1.update(b, offset, msgSize);
+        }
+        long end = System.nanoTime();
+        double total = (double)(end - start)/1e9;         // in seconds
+        double thruput = (double)msgSize*iters/1e6/total; // in MB/s
+        System.out.println("CRC32.update(byte[]) runtime = " + total + " seconds");
+        System.out.println("CRC32.update(byte[]) throughput = " + thruput + " MB/s");
+
+        /* check correctness */
+        for (int i = 0; i < iters; i++) {
+            crc1.reset();
+            crc1.update(b, offset, msgSize);
+            if (!check(crc0, crc1)) break;
+        }
+        report("CRCs", crc0, crc1);
+
+        System.out.println("-------------------------------------------------------");
+
+        ByteBuffer buf = ByteBuffer.allocateDirect(msgSize);
+        buf.put(b, offset, msgSize);
+        buf.flip();
+
+        /* warm up */
+        for (int i = 0; i < warmupIters; i++) {
+            crc2.reset();
+            crc2.update(buf);
+            buf.rewind();
+        }
+
+        /* measure performance */
+        start = System.nanoTime();
+        for (int i = 0; i < iters; i++) {
+            crc2.reset();
+            crc2.update(buf);
+            buf.rewind();
+        }
+        end = System.nanoTime();
+        total = (double)(end - start)/1e9;         // in seconds
+        thruput = (double)msgSize*iters/1e6/total; // in MB/s
+        System.out.println("CRC32.update(ByteBuffer) runtime = " + total + " seconds");
+        System.out.println("CRC32.update(ByteBuffer) throughput = " + thruput + " MB/s");
+
+        /* check correctness */
+        for (int i = 0; i < iters; i++) {
+            crc2.reset();
+            crc2.update(buf);
+            buf.rewind();
+            if (!check(crc0, crc2)) break;
+        }
+        report("CRCs", crc0, crc2);
+
+        System.out.println("-------------------------------------------------------");
+    }
+
+    private static void report(String s, Checksum crc0, Checksum crc1) {
+        System.out.printf("%s: crc0 = %08x, crc1 = %08x\n",
+                          s, crc0.getValue(), crc1.getValue());
+    }
+
+    private static boolean check(Checksum crc0, Checksum crc1) {
+        if (crc0.getValue() != crc1.getValue()) {
+            System.err.printf("ERROR: crc0 = %08x, crc1 = %08x\n",
+                              crc0.getValue(), crc1.getValue());
+            return false;
+        }
+        return true;
+    }
+
+    private static byte[] initializedBytes(int M, int offset) {
+        byte[] bytes = new byte[M + offset];
+        for (int i = 0; i < offset; i++) {
+            bytes[i] = (byte) i;
+        }
+        for (int i = offset; i < bytes.length; i++) {
+            bytes[i] = (byte) (i - offset);
+        }
+        return bytes;
+    }
+
+    private static void test_multi(int iters) {
+        int len1 = 8;    // the  8B/iteration loop
+        int len2 = 32;   // the 32B/iteration loop
+        int len3 = 4096; // the 4KB/iteration loop
+
+        byte[] b = initializedBytes(len3*16, 0);
+        int[] offsets = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 128, 256, 512 };
+        int[] sizes = { 0, 1, 2, 3, 4, 5, 6, 7,
+                        len1, len1+1, len1+2, len1+3, len1+4, len1+5, len1+6, len1+7,
+                        len1*2, len1*2+1, len1*2+3, len1*2+5, len1*2+7,
+                        len2, len2+1, len2+3, len2+5, len2+7,
+                        len2*2, len2*4, len2*8, len2*16, len2*32, len2*64,
+                        len3, len3+1, len3+3, len3+5, len3+7,
+                        len3*2, len3*4, len3*8,
+                        len1+len2, len1+len2+1, len1+len2+3, len1+len2+5, len1+len2+7,
+                        len1+len3, len1+len3+1, len1+len3+3, len1+len3+5, len1+len3+7,
+                        len2+len3, len2+len3+1, len2+len3+3, len2+len3+5, len2+len3+7,
+                        len1+len2+len3, len1+len2+len3+1, len1+len2+len3+3,
+                        len1+len2+len3+5, len1+len2+len3+7,
+                        (len1+len2+len3)*2, (len1+len2+len3)*2+1, (len1+len2+len3)*2+3,
+                        (len1+len2+len3)*2+5, (len1+len2+len3)*2+7,
+                        (len1+len2+len3)*3, (len1+len2+len3)*3-1, (len1+len2+len3)*3-3,
+                        (len1+len2+len3)*3-5, (len1+len2+len3)*3-7 };
+        CRC32[] crc0 = new CRC32[offsets.length*sizes.length];
+        CRC32[] crc1 = new CRC32[offsets.length*sizes.length];
+        int i, j, k;
+
+        System.out.printf("testing %d cases ...\n", offsets.length*sizes.length);
+
+        /* set the result from interpreter as reference */
+        for (i = 0; i < offsets.length; i++) {
+            for (j = 0; j < sizes.length; j++) {
+                crc0[i*sizes.length + j] = new CRC32();
+                crc1[i*sizes.length + j] = new CRC32();
+                crc0[i*sizes.length + j].update(b, offsets[i], sizes[j]);
+            }
+        }
+
+        /* warm up the JIT compiler and get result */
+        for (k = 0; k < iters; k++) {
+            for (i = 0; i < offsets.length; i++) {
+                for (j = 0; j < sizes.length; j++) {
+                    crc1[i*sizes.length + j].reset();
+                    crc1[i*sizes.length + j].update(b, offsets[i], sizes[j]);
+                }
+            }
+        }
+
+        /* check correctness */
+        for (i = 0; i < offsets.length; i++) {
+            for (j = 0; j < sizes.length; j++) {
+                if (!check(crc0[i*sizes.length + j], crc1[i*sizes.length + j])) {
+                    System.out.printf("offsets[%d] = %d", i, offsets[i]);
+                    System.out.printf("\tsizes[%d] = %d\n", j, sizes[j]);
+                }
+            }
+        }
+    }
+}