hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
changeset 38237 d972e3a2df53
parent 38142 e16b23089599
child 38241 32eab2eb41fd
equal deleted inserted replaced
38236:510f77046e00 38237:d972e3a2df53
  4835   xor3(val, crc, crc);
  4835   xor3(val, crc, crc);
  4836 }
  4836 }
  4837 
  4837 
  4838 // Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros
  4838 // Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros
  4839 void MacroAssembler::reverse_bytes_32(Register src, Register dst, Register tmp) {
  4839 void MacroAssembler::reverse_bytes_32(Register src, Register dst, Register tmp) {
         // Emits code that byte-swaps the low 32 bits of src into dst, one byte at a
         // time. dst is written first and src is read again afterwards, and tmp is
         // overwritten between those reads, so src, dst and tmp must all be distinct
         // registers. src itself is left unmodified.
         //
         // byte 3 (bits 31..24) -> bits 7..0; this is the step that relies on the
         // upper 32 bits of src being zero
  4840   srlx(src, 24, dst);
  4840     srlx(src, 24, dst);
  4841 
  4841 
         // byte 2 (bits 23..16): shift up to the top byte of the 64-bit register,
         // back down to bits 7..0, then up into bits 15..8
  4842   sllx(src, 32+8, tmp);
  4842     sllx(src, 32+8, tmp);
  4843   srlx(tmp, 32+24, tmp);
  4843     srlx(tmp, 32+24, tmp);
  4844   sllx(tmp, 8, tmp);
  4844     sllx(tmp, 8, tmp);
  4845   or3(dst, tmp, dst);
  4845     or3(dst, tmp, dst);
  4846 
  4846 
         // byte 1 (bits 15..8): same isolation trick, placed into bits 23..16
  4847   sllx(src, 32+16, tmp);
  4847     sllx(src, 32+16, tmp);
  4848   srlx(tmp, 32+24, tmp);
  4848     srlx(tmp, 32+24, tmp);
  4849   sllx(tmp, 16, tmp);
  4849     sllx(tmp, 16, tmp);
  4850   or3(dst, tmp, dst);
  4850     or3(dst, tmp, dst);
  4851 
  4851 
         // byte 0 (bits 7..0): shift up to the top byte, then down by 32 so it
         // lands directly in bits 31..24
  4852   sllx(src, 32+24, tmp);
  4852     sllx(src, 32+24, tmp);
  4853   srlx(tmp, 32, tmp);
  4853     srlx(tmp, 32, tmp);
  4854   or3(dst, tmp, dst);
  4854     or3(dst, tmp, dst);
  4855 }
  4855 }
  4856 
  4856 
  4857 void MacroAssembler::movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2) {
  4857 void MacroAssembler::movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2) {
  4858   reverse_bytes_32(src, tmp1, tmp2);
  4858   reverse_bytes_32(src, tmp1, tmp2);
  4859   movxtod(tmp1, dst);
  4859   movxtod(tmp1, dst);
  5101   cmp_and_br_short(len, 0, Assembler::greaterUnsigned, Assembler::pt, L_cleanup_loop);
  5101   cmp_and_br_short(len, 0, Assembler::greaterUnsigned, Assembler::pt, L_cleanup_loop);
  5102 
  5102 
  5103   not1(crc);
  5103   not1(crc);
  5104 }
  5104 }
  5105 
  5105 
       
  5106 #define CHUNK_LEN   128          /* 128 x 8B = 1KB */
       
       // CHUNK_K1..K3 are the multipliers that kernel_crc32c applies (with xmulx,
       // after materializing them with set64) to the CRCs of the first, second and
       // third of its four parallel chunks before XORing the products together.
       // NOTE(review): P(x) below is presumably the CRC32C generator polynomial -- confirm.
  5107 #define CHUNK_K1    0x1307a0206  /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
       
  5108 #define CHUNK_K2    0x1a0f717c4  /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
       
  5109 #define CHUNK_K3    0x0170076fa  /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
       
  5110 
       
  5111 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, Register table) {
         // Emits the instruction sequence that updates the CRC32C value in 'crc' over
         // 'len' bytes starting at address 'buf'; the result is left in 'crc'.
         //   crc   - in/out: running CRC (its upper 32 bits are cleared below)
         //   buf   - in: data address; advanced as bytes are consumed
         //   len   - in: byte count; decremented as bytes are consumed
         //   table - overwritten with StubRoutines::crc32c_table_addr()
         // If len <= 0 only 'table' is modified.
         // Scratch registers used: G1, G3, G4, G5, O4, O5 and the double registers
         // F0, F2, ..., F14.
         // Structure: byte-wise head until buf is 8-byte aligned, rounds of four
         // CHUNK_LEN-word chunks processed in parallel, a 32-byte loop, an 8-byte
         // loop, and finally a byte-wise tail.
       
  5112 
       
  5113   Label L_crc32c_head, L_crc32c_aligned;
       
  5114   Label L_crc32c_parallel, L_crc32c_parallel_loop;
       
  5115   Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
       
  5116   Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
       
  5117 
       
  5118   set(ExternalAddress(StubRoutines::crc32c_table_addr()), table);
       
  5119 
       
  5120   cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);
       
  5121 
       
  5122   // clear upper 32 bits of crc
       
  5123   clruwu(crc);
       
  5124 
       
         // G4 = buf & 7; zero means buf is already 8-byte aligned
  5125   and3(buf, 7, G4);
       
  5126   cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);
       
  5127 
       
         // G4 = 8 - (buf & 7): number of head bytes needed to reach alignment
  5128   mov(8, G1);
       
  5129   sub(G1, G4, G4);
       
  5130 
       
  5131   // ------ process the misaligned head (7 bytes or less) ------
       
  5132   bind(L_crc32c_head);
       
  5133 
       
  5134   // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
       
  5135   ldub(buf, 0, G1);
       
  5136   update_byte_crc32(crc, G1, table);
       
  5137 
       
  5138   inc(buf);
       
  5139   dec(len);
       
         // input exhausted before alignment was reached: crc already holds the result
  5140   cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
       
  5141   dec(G4);
       
  5142   cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);
       
  5143 
       
  5144   // ------ process the 8-byte-aligned body ------
       
  5145   bind(L_crc32c_aligned);
       
         // NOTE(review): the nop()s placed right after bind() in this function
         // presumably satisfy a spacing restriction of the short compare-and-branch
         // form -- confirm before removing any of them.
  5146   nop();
       
         // fewer than 8 bytes left (but at least one): skip the FP path entirely
  5147   cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);
       
  5148 
       
  5149   // reverse the byte order of lower 32 bits to big endian, and move to FP side
       
  5150   movitof_revbytes(crc, F0, G1, G3);
       
  5151 
       
         // G4 = bytes consumed by one parallel round (four chunks of CHUNK_LEN 8-byte words)
  5152   set(CHUNK_LEN*8*4, G4);
       
  5153   cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);
       
  5154 
       
  5155   // ------ process four 1KB chunks in parallel ------
       
  5156   bind(L_crc32c_parallel);
       
  5157 
       
         // F0 carries the running CRC into chunk 0; chunks 1..3 start from zero in F2/F4/F6
  5158   fzero(FloatRegisterImpl::D, F2);
       
  5159   fzero(FloatRegisterImpl::D, F4);
       
  5160   fzero(FloatRegisterImpl::D, F6);
       
  5161 
       
         // the loop covers the first CHUNK_LEN-1 words of every chunk; the last word
         // of each chunk is handled separately after the loop
  5162   mov(CHUNK_LEN - 1, G4);
       
  5163   bind(L_crc32c_parallel_loop);
       
  5164   // schedule ldf's ahead of crc32c's to hide the load-use latency
       
  5165   ldf(FloatRegisterImpl::D, buf, 0,            F8);
       
  5166   ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
       
  5167   ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
       
  5168   ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
       
  5169   crc32c(F0, F8,  F0);
       
  5170   crc32c(F2, F10, F2);
       
  5171   crc32c(F4, F12, F4);
       
  5172   crc32c(F6, F14, F6);
       
  5173   inc(buf, 8);
       
  5174   dec(G4);
       
  5175   cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
       
  5176 
       
         // last word of chunks 0..2: goes through crc32c like the others
  5177   ldf(FloatRegisterImpl::D, buf, 0,            F8);
       
  5178   ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
       
  5179   ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
       
  5180   crc32c(F0, F8,  F0);
       
  5181   crc32c(F2, F10, F2);
       
  5182   crc32c(F4, F12, F4);
       
  5183 
       
         // last word of chunk 3: only loaded into F14 here; it is XORed with the
         // combined CRCs of chunks 0..2 below before being fed to crc32c
  5184   inc(buf, CHUNK_LEN*24);
       
  5185   ldfl(FloatRegisterImpl::D, buf, G0, F14);  // load in little endian
       
  5186   inc(buf, 8);
       
  5187 
       
         // buf is now past all four chunks:
         // (CHUNK_LEN-1)*8 + CHUNK_LEN*24 + 8 = CHUNK_LEN*32 bytes consumed.
         // Prefetch at the four chunk offsets relative to the new buf.
  5188   prefetch(buf, 0,            Assembler::severalReads);
       
  5189   prefetch(buf, CHUNK_LEN*8,  Assembler::severalReads);
       
  5190   prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
       
  5191   prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
       
  5192 
       
  5193   // move to INT side, and reverse the byte order of lower 32 bits to little endian
       
  5194   movftoi_revbytes(F0, O4, G1, G4);
       
  5195   movftoi_revbytes(F2, O5, G1, G4);
       
  5196   movftoi_revbytes(F4, G5, G1, G4);
       
  5197 
       
  5198   // combine the results of 4 chunks
       
         // O4, O5, G5 = CRCs of chunks 0, 1, 2 multiplied (xmulx) by K1, K2, K3
  5199   set64(CHUNK_K1, G3, G1);
       
  5200   xmulx(O4, G3, O4);
       
  5201   set64(CHUNK_K2, G3, G1);
       
  5202   xmulx(O5, G3, O5);
       
  5203   set64(CHUNK_K3, G3, G1);
       
  5204   xmulx(G5, G3, G5);
       
  5205 
       
         // O5 = the three products XORed with the last word of chunk 3 (from F14)
  5206   movdtox(F14, G4);
       
  5207   xor3(O4, O5, O5);
       
  5208   xor3(G5, O5, O5);
       
  5209   xor3(G4, O5, O5);
       
  5210 
       
  5211   // reverse the byte order to big endian, via stack, and move to FP side
       
  5212   // TODO: use new revb instruction
       
         // G1 = (SP - 8) with its low three bits cleared, used as an 8-byte scratch slot
         // NOTE(review): confirm this slot below SP is safe to overwrite here.
  5213   add(SP, -8, G1);
       
  5214   srlx(G1, 3, G1);
       
  5215   sllx(G1, 3, G1);
       
  5216   stx(O5, G1, G0);
       
  5217   ldfl(FloatRegisterImpl::D, G1, G0, F2);  // load in little endian
       
  5218 
       
         // finish chunk 3: feed the combined word through crc32c together with chunk
         // 3's running value (F6); F0 is once again the running CRC for the whole input
  5219   crc32c(F6, F2, F0);
       
  5220 
       
         // account for the round just consumed and repeat while another full round fits
  5221   set(CHUNK_LEN*8*4, G4);
       
  5222   sub(len, G4, len);
       
  5223   cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
       
  5224   nop();
       
         // nothing left after the parallel rounds: skip the serial loops
  5225   cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
       
  5226 
       
  5227   bind(L_crc32c_serial);
       
  5228 
       
         // G4 = 32: threshold for the unrolled 4 x 8-byte loop
  5229   mov(32, G4);
       
  5230   cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
       
  5231 
       
  5232   // ------ process 32B chunks ------
       
  5233   bind(L_crc32c_x32_loop);
       
  5234   ldf(FloatRegisterImpl::D, buf, 0, F2);
       
  5235   crc32c(F0, F2, F0);
       
  5236   ldf(FloatRegisterImpl::D, buf, 8, F2);
       
  5237   crc32c(F0, F2, F0);
       
  5238   ldf(FloatRegisterImpl::D, buf, 16, F2);
       
  5239   crc32c(F0, F2, F0);
       
  5240   ldf(FloatRegisterImpl::D, buf, 24, F2);
       
  5241   inc(buf, 32);
       
  5242   crc32c(F0, F2, F0);
       
  5243   dec(len, 32);
       
  5244   cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);
       
  5245 
       
  5246   bind(L_crc32c_x8);
       
  5247   nop();
       
         // fewer than 8 bytes left: leave the FP path
  5248   cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);
       
  5249 
       
  5250   // ------ process 8B chunks ------
       
  5251   bind(L_crc32c_x8_loop);
       
  5252   ldf(FloatRegisterImpl::D, buf, 0, F2);
       
  5253   inc(buf, 8);
       
  5254   crc32c(F0, F2, F0);
       
  5255   dec(len, 8);
       
  5256   cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);
       
  5257 
       
  5258   bind(L_crc32c_done);
       
  5259 
       
  5260   // move to INT side, and reverse the byte order of lower 32 bits to little endian
       
  5261   movftoi_revbytes(F0, crc, G1, G3);
       
  5262 
       
         // at this point fewer than 8 bytes remain; if none, crc is the final result
  5263   cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
       
  5264 
       
  5265   // ------ process the misaligned tail (7 bytes or less) ------
       
         // always entered with len > 0, so the loop body runs before the first test
  5266   bind(L_crc32c_tail);
       
  5267 
       
  5268   // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
       
  5269   ldub(buf, 0, G1);
       
  5270   update_byte_crc32(crc, G1, table);
       
  5271 
       
  5272   inc(buf);
       
  5273   dec(len);
       
  5274   cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);
       
  5275 
       
  5276   bind(L_crc32c_return);
       
  5277   nop();
       
  5278 }