hotspot/src/cpu/sparc/vm/macroAssembler_sparc.cpp
changeset 34205 9ec51d30a11e
parent 33628 09241459a8b8
child 34211 d25c2fc1e248
equal deleted inserted replaced
34204:5ad1ba3afecc 34205:9ec51d30a11e
  4769 
  4769 
  4770 void MacroAssembler::movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2) {
  4770 void MacroAssembler::movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2) {
         // Emits two steps: movdtox(src, tmp1) transfers the FP register src into the
         // integer register tmp1, then reverse_bytes_32(tmp1, dst, tmp2) produces dst
         // from tmp1 (byte-reversed 32-bit value, judging by the helper's name).
         // tmp1 is clobbered; tmp2 is handed to reverse_bytes_32, presumably as scratch.
  4771   movdtox(src, tmp1);
  4771   movdtox(src, tmp1);
  4772   reverse_bytes_32(tmp1, dst, tmp2);
  4772   reverse_bytes_32(tmp1, dst, tmp2);
  4773 }
  4773 }
       
  4774 
       
  4775 void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register buf, int offset) {
         // Emits one folding step on the 128-bit accumulator {xcrc_hi, xcrc_lo}:
         // xcrc_hi is multiplied by xK_hi and xcrc_lo by xK_lo (xmulx / xmulxhi), the
         // two products are xor-ed together, and then the next 16 bytes read from buf
         // are xor-ed in. The result replaces {xcrc_hi, xcrc_lo}.
         // Side effects: buf is advanced by 16; xtmp_hi and xtmp_lo are clobbered.
         // NOTE(review): 'offset' is never read in this body -- both loads address
         // buf + G0 and buf is incremented after each load instead.
       
  4776   xmulx(xcrc_hi, xK_hi, xtmp_lo); // xtmp_lo = xcrc_hi (x) xK_hi (presumably low 64 bits of the XOR-multiply)
       
  4777   xmulxhi(xcrc_hi, xK_hi, xtmp_hi); // xtmp_hi = presumably high 64 bits of the same product
       
  4778   xmulxhi(xcrc_lo, xK_lo, xcrc_hi); // xcrc_hi may be overwritten now: its old value was consumed above
       
  4779   xmulx(xcrc_lo, xK_lo, xcrc_lo); // must follow the xmulxhi above: this overwrites the input xcrc_lo
       
  4780   xor3(xcrc_lo, xtmp_lo, xcrc_lo); // combine the two products, low word
       
  4781   xor3(xcrc_hi, xtmp_hi, xcrc_hi); // combine the two products, high word
       
  4782   ldxl(buf, G0, xtmp_lo); // load 8 bytes at buf (ldxl: presumably a little-endian 64-bit load)
       
  4783   inc(buf, 8);
       
  4784   ldxl(buf, G0, xtmp_hi); // load the following 8 bytes
       
  4785   inc(buf, 8);
       
  4786   xor3(xcrc_lo, xtmp_lo, xcrc_lo); // mix the new data into the accumulator, low word
       
  4787   xor3(xcrc_hi, xtmp_hi, xcrc_hi); // mix the new data into the accumulator, high word
       
  4788 }
       
  4789 
       
  4790 void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register xbuf_hi, Register xbuf_lo) {
         // Register-operand variant of the 128-bit folding step: multiplies xcrc_hi by
         // xK_hi and xcrc_lo by xK_lo (xmulx / xmulxhi), then xors in {xbuf_hi, xbuf_lo}
         // and the two products. The result replaces {xcrc_hi, xcrc_lo}.
         // xtmp_hi and xtmp_lo are clobbered; xbuf_hi and xbuf_lo are only read.
       
  4791   mov(xcrc_lo, xtmp_lo); // NOTE(review): xtmp_lo is overwritten by the xmulx below before it is read
       
  4792   mov(xcrc_hi, xtmp_hi); // keep a copy of xcrc_hi; the original is overwritten further down
       
  4793   xmulx(xtmp_hi, xK_hi, xtmp_lo); // xtmp_lo = xcrc_hi (x) xK_hi (presumably low 64 bits)
       
  4794   xmulxhi(xtmp_hi, xK_hi, xtmp_hi); // xtmp_hi = presumably high 64 bits; overwrites the copy, so it comes second
       
  4795   xmulxhi(xcrc_lo, xK_lo, xcrc_hi); // xcrc_hi = xcrc_lo (x) xK_lo, high part
       
  4796   xmulx(xcrc_lo, xK_lo, xcrc_lo); // low part; overwrites the input xcrc_lo, so it comes last
       
  4797   xor3(xcrc_lo, xbuf_lo, xcrc_lo); // mix in the data held in xbuf_lo
       
  4798   xor3(xcrc_hi, xbuf_hi, xcrc_hi); // mix in the data held in xbuf_hi
       
  4799   xor3(xcrc_lo, xtmp_lo, xcrc_lo); // add the xcrc_hi product, low word
       
  4800   xor3(xcrc_hi, xtmp_hi, xcrc_hi); // add the xcrc_hi product, high word
       
  4801 }
       
  4802 
       
  4803 void MacroAssembler::fold_8bit_crc32(Register xcrc, Register table, Register xtmp, Register tmp) {
         // Emits one byte-wise table step on xcrc: the low byte of xcrc selects a
         // 32-bit entry of 'table', xcrc is shifted right by 8 bits (srlx) and the
         // entry is xor-ed in. xtmp and tmp are clobbered.
       
  4804   and3(xcrc, 0xFF, tmp); // tmp = low byte of xcrc (table index)
       
  4805   sllx(tmp, 2, tmp); // index * 4 = byte offset of the entry loaded by lduw
       
  4806   lduw(table, tmp, xtmp); // xtmp = table entry at that offset
       
  4807   srlx(xcrc, 8, xcrc); // drop the byte that was just consumed
       
  4808   xor3(xtmp, xcrc, xcrc); // fold the table entry into the remaining bits
       
  4809 }
       
  4810 
       
  4811 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) {
         // Single-scratch variant of the byte-wise table step: the low byte of crc
         // selects a 32-bit entry of 'table', crc is shifted right by 8 bits and the
         // entry is xor-ed in. tmp serves as both index and loaded entry and is clobbered.
       
  4812   and3(crc, 0xFF, tmp); // tmp = low byte of crc (table index)
       
  4813   srlx(crc, 8, crc); // drop the consumed byte; tmp already holds the index
       
  4814   sllx(tmp, 2, tmp); // index * 4 = byte offset of the entry loaded by lduw
       
  4815   lduw(table, tmp, tmp); // tmp = table entry (overwrites the offset)
       
  4816   xor3(tmp, crc, crc); // fold the table entry into the remaining bits
       
  4817 }
       
  4818 
       
  4819 #define CRC32_TMP_REG_NUM 18 // size of kernel_crc32's tmp[] register array and bound of its save/restore loops
       
  4820 
       // Multiplier constants that kernel_crc32 loads with set64 and feeds to
       // xmulx / xmulxhi. Presumably x^N mod P for the CRC-32 polynomial, with N
       // given by the name suffix -- TODO confirm the derivation.
  4821 #define CRC32_CONST_64  0x163cd6124
       
  4822 #define CRC32_CONST_96  0x0ccaa009e
       
  4823 #define CRC32_CONST_160 0x1751997d0
       
  4824 #define CRC32_CONST_480 0x1c6e41596
       
  4825 #define CRC32_CONST_544 0x154442bd4
       
  4826 
       
  4827 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table) {
         // Emits the code that updates a CRC-32 value over a byte buffer.
         //   crc   - in/out: running CRC; bit-inverted on entry and again on exit
         //   buf   - address of the next byte to process; advanced as bytes are consumed
         //   len   - number of bytes remaining; counted down
         //   table - overwritten with StubRoutines::crc_table_addr(), the byte-wise lookup table
         // G4, O4 and O5 are written as scratch. The 18 registers listed in tmp[] are
         // saved to FP registers around the folding code and restored before the final
         // clean-up loop. If 31 bytes or fewer remain (before or after alignment) only
         // the byte-wise table loop runs.
       
  4828 
       
  4829   Label L_cleanup_loop, L_cleanup_check, L_align_loop, L_align_check;
       
  4830   Label L_main_loop_prologue;
       
  4831   Label L_fold_512b, L_fold_512b_loop, L_fold_128b;
       
  4832   Label L_fold_tail, L_fold_tail_loop;
       
  4833   Label L_8byte_fold_loop, L_8byte_fold_check;
         // NOTE(review): L_8byte_fold_loop is neither bound nor branched to below;
         // L_fold_512b and L_8byte_fold_check are bound but never used as branch targets.
       
  4834 
       
  4835   const Register tmp[CRC32_TMP_REG_NUM] = {L0, L1, L2, L3, L4, L5, L6, G1, I0, I1, I2, I3, I4, I5, I7, O4, O5, G3};
       
  4836 
         // The folding constants share two registers: const_64, const_96 and const_480
         // all alias tmp[17] (G3); const_160 and const_544 alias tmp[16] (O5). Each
         // phase therefore reloads the constants it needs with set64 before using them.
       
  4837   Register const_64  = tmp[CRC32_TMP_REG_NUM-1];
       
  4838   Register const_96  = tmp[CRC32_TMP_REG_NUM-1];
       
  4839   Register const_160 = tmp[CRC32_TMP_REG_NUM-2];
       
  4840   Register const_480 = tmp[CRC32_TMP_REG_NUM-1];
       
  4841   Register const_544 = tmp[CRC32_TMP_REG_NUM-2];
       
  4842 
       
  4843   set(ExternalAddress(StubRoutines::crc_table_addr()), table); // table = address of the CRC lookup table
       
  4844 
       
  4845   not1(crc); // ~c
       
  4846   clruwu(crc); // clear upper 32 bits of crc
       
  4847 
       
  4848   // Check if below cutoff, proceed directly to cleanup code
       
  4849   mov(31, G4);
       
  4850   cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check);
       
  4851 
       
  4852   // Align buffer to 8 byte boundary: O5 = (8 - (buf & 7)) & 7 bytes to process first
       
  4853   mov(8, O5);
       
  4854   and3(buf, 0x7, O4);
       
  4855   sub(O5, O4, O5);
       
  4856   and3(O5, 0x7, O5); // maps 8 (already aligned) to 0
       
  4857   sub(len, O5, len); // len no longer counts the alignment bytes
       
  4858   ba(L_align_check);
       
  4859   delayed()->nop(); // fill the branch delay slot
       
  4860 
       
  4861   // Alignment loop, table look up method for up to 7 bytes
       
  4862   bind(L_align_loop);
       
  4863   ldub(buf, 0, O4); // O4 = next input byte
       
  4864   inc(buf);
       
  4865   dec(O5);
       
  4866   xor3(O4, crc, O4);
       
  4867   and3(O4, 0xFF, O4); // table index = (byte ^ crc) & 0xFF
       
  4868   sllx(O4, 2, O4); // index * 4 = byte offset of the 32-bit entry
       
  4869   lduw(table, O4, O4);
       
  4870   srlx(crc, 8, crc);
       
  4871   xor3(O4, crc, crc); // crc = table[index] ^ (crc >> 8)
       
  4872   bind(L_align_check);
       
  4873   nop(); // presumably padding required ahead of the compare-and-branch -- TODO confirm
       
  4874   cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_align_loop);
       
  4875 
       
  4876   // Aligned on 64-bit (8-byte) boundary at this point
       
  4877   // Check if still above cutoff (31-bytes)
       
  4878   mov(31, G4); // NOTE(review): G4 is not written between the first mov(31, G4) and here
       
  4879   cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check);
       
  4880   // At least 32 bytes left to process
       
  4881 
       
  4882   // Free up registers by storing them to FP registers
       
  4883   for (int i = 0; i < CRC32_TMP_REG_NUM; i++) {
       
  4884     movxtod(tmp[i], as_FloatRegister(2*i)); // tmp[i] is kept in FP register number 2*i
       
  4885   }
       
  4886 
       
  4887   // Determine which loop to enter
       
  4888   // Shared prologue: load the first 16 bytes into tmp[0] (low) and tmp[1] (high).
         // len is not decremented for these 16 bytes here; the two dec(len, 8) after
         // L_fold_128b presumably account for them.
       
  4889   ldxl(buf, G0, tmp[0]);
       
  4890   inc(buf, 8);
       
  4891   ldxl(buf, G0, tmp[1]);
       
  4892   inc(buf, 8);
       
  4893   xor3(tmp[0], crc, tmp[0]); // Fold CRC into first few bytes
       
  4894   and3(crc, 0, crc); // Clear out the crc register
       
  4895   // Main loop needs 128-bytes at least
       
  4896   mov(128, G4);
       
  4897   mov(64, tmp[2]);
       
  4898   cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_main_loop_prologue);
       
  4899   // Fewer than 128 bytes: if also fewer than 64, go straight to the tail fold
       
  4900   nop(); // presumably padding required between the two compare-and-branch instructions -- TODO confirm
       
  4901   cmp_and_br_short(len, tmp[2], Assembler::lessUnsigned, Assembler::pt, L_fold_tail);
       
  4902   // Between 64 and 127 bytes: fold the next 48 bytes (3 x 16) into tmp[1]:tmp[0]
       
  4903   set64(CRC32_CONST_96,  const_96,  tmp[8]);
       
  4904   set64(CRC32_CONST_160, const_160, tmp[9]);
         // The last argument (offset) is ignored by fold_128bit_crc32; buf is advanced by the callee.
       
  4905   fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0);
       
  4906   fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[4], tmp[5], buf, 16);
       
  4907   fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[6], tmp[7], buf, 32);
       
  4908   dec(len, 48);
       
  4909   ba(L_fold_tail);
       
  4910   delayed()->nop(); // fill the branch delay slot
       
  4911 
       
  4912   bind(L_main_loop_prologue);
         // Load 48 more bytes into tmp[2]..tmp[7]: tmp[0]..tmp[7] now hold 64 bytes
         // as four 128-bit streams (tmp[2k] low, tmp[2k+1] high).
       
  4913   for (int i = 2; i < 8; i++) {
       
  4914     ldxl(buf, G0, tmp[i]);
       
  4915     inc(buf, 8);
       
  4916   }
       
  4917 
       
  4918   // Fold total 512 bits of polynomial on each iteration,
       
  4919   // 128 bits per each of 4 parallel streams
       
  4920   set64(CRC32_CONST_480, const_480, tmp[8]);
       
  4921   set64(CRC32_CONST_544, const_544, tmp[9]);
       
  4922 
       
  4923   mov(128, G4);
       
  4924   bind(L_fold_512b_loop);
         // NOTE(review): the offset arguments (0, 16, 32, 64) are ignored by
         // fold_128bit_crc32, which advances buf itself; each call consumes 16 bytes.
       
  4925   fold_128bit_crc32(tmp[1], tmp[0], const_480, const_544, tmp[9],  tmp[8],  buf,  0);
       
  4926   fold_128bit_crc32(tmp[3], tmp[2], const_480, const_544, tmp[11], tmp[10], buf, 16);
       
  4927   fold_128bit_crc32(tmp[5], tmp[4], const_480, const_544, tmp[13], tmp[12], buf, 32);
       
  4928   fold_128bit_crc32(tmp[7], tmp[6], const_480, const_544, tmp[15], tmp[14], buf, 64);
       
  4929   dec(len, 64);
       
  4930   cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_512b_loop); // repeat while len >= 128
       
  4931 
       
  4932   // Fold 512 bits to 128 bits
       
  4933   bind(L_fold_512b);
       
  4934   set64(CRC32_CONST_96,  const_96,  tmp[8]); // reload: G3/O5 held const_480/const_544 in the loop
       
  4935   set64(CRC32_CONST_160, const_160, tmp[9]);
       
  4936 
         // Merge streams tmp[3]:tmp[2], tmp[5]:tmp[4] and tmp[7]:tmp[6] into tmp[1]:tmp[0]
       
  4937   fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[3], tmp[2]);
       
  4938   fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[5], tmp[4]);
       
  4939   fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[7], tmp[6]);
       
  4940   dec(len, 48);
       
  4941 
       
  4942   // Fold the rest of 128 bits data chunks
       
  4943   bind(L_fold_tail);
       
  4944   mov(32, G4);
       
  4945   cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_fold_128b);
       
  4946 
         // Constants are loaded again here because L_fold_tail is also reached
         // directly from the prologue, where they have not been set.
       
  4947   set64(CRC32_CONST_96,  const_96,  tmp[8]);
       
  4948   set64(CRC32_CONST_160, const_160, tmp[9]);
       
  4949 
       
  4950   bind(L_fold_tail_loop);
       
  4951   fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0);
       
  4952   sub(len, 16, len);
       
  4953   cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_tail_loop); // repeat while len >= 32
       
  4954 
       
  4955   // Fold the 128 bits in tmps 0 - 1 into tmp 1
       
  4956   bind(L_fold_128b);
       
  4957 
       
  4958   set64(CRC32_CONST_64, const_64, tmp[4]);
       
  4959 
       
  4960   xmulx(const_64, tmp[0], tmp[2]); // tmp[2] = const_64 (x) tmp[0] (presumably low 64 bits)
       
  4961   xmulxhi(const_64, tmp[0], tmp[3]); // tmp[3] = presumably high 64 bits of the same product
       
  4962 
       
  4963   srl(tmp[2], G0, tmp[4]); // tmp[4] = low 32 bits of tmp[2] (same srl idiom as "move 32 bits" below)
       
  4964   xmulx(const_64, tmp[4], tmp[4]);
       
  4965 
       
  4966   srlx(tmp[2], 32, tmp[2]);
       
  4967   sllx(tmp[3], 32, tmp[3]);
       
  4968   or3(tmp[2], tmp[3], tmp[2]); // tmp[2] = (tmp[3] << 32) | (tmp[2] >> 32)
       
  4969 
       
  4970   xor3(tmp[4], tmp[1], tmp[4]);
       
  4971   xor3(tmp[4], tmp[2], tmp[1]); // tmp[1] = the remaining 64 bits to reduce
       
  4972   dec(len, 8);
       
  4973 
       
  4974   // Use table lookup for the 8 bytes left in tmp[1]
       
  4975   dec(len, 8);
       
  4976 
       
  4977   // 8 8-bit folds to compute 32-bit CRC.
       
  4978   for (int j = 0; j < 4; j++) {
       
  4979     fold_8bit_crc32(tmp[1], table, tmp[2], tmp[3]);
       
  4980   }
       
  4981   srl(tmp[1], G0, crc); // move 32 bits to general register
       
  4982   for (int j = 0; j < 4; j++) {
       
  4983     fold_8bit_crc32(crc, table, tmp[3]);
       
  4984   }
       
  4985 
       
  4986   bind(L_8byte_fold_check);
       
  4987 
       
  4988   // Restore int registers saved in FP registers
       
  4989   for (int i = 0; i < CRC32_TMP_REG_NUM; i++) {
       
  4990     movdtox(as_FloatRegister(2*i), tmp[i]);
       
  4991   }
       
  4992 
       
  4993   ba(L_cleanup_check);
       
  4994   delayed()->nop(); // fill the branch delay slot
       
  4995 
       
  4996   // Table look-up method for the remaining few bytes
       
  4997   bind(L_cleanup_loop);
       
  4998   ldub(buf, 0, O4); // O4 = next input byte
       
  4999   inc(buf);
       
  5000   dec(len);
       
  5001   xor3(O4, crc, O4);
       
  5002   and3(O4, 0xFF, O4); // table index = (byte ^ crc) & 0xFF
       
  5003   sllx(O4, 2, O4); // index * 4 = byte offset of the 32-bit entry
       
  5004   lduw(table, O4, O4);
       
  5005   srlx(crc, 8, crc);
       
  5006   xor3(O4, crc, crc); // crc = table[index] ^ (crc >> 8)
       
  5007   bind(L_cleanup_check);
       
  5008   nop(); // presumably padding required ahead of the compare-and-branch -- TODO confirm
       
  5009   cmp_and_br_short(len, 0, Assembler::greaterUnsigned, Assembler::pt, L_cleanup_loop); // loop while len > 0
       
  5010 
       
  5011   not1(crc); // final inversion, matching the not1 on entry
       
  5012 }
       
  5013