4769 |
4769 |
4770 void MacroAssembler::movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2) { |
4770 void MacroAssembler::movftoi_revbytes(FloatRegister src, Register dst, Register tmp1, Register tmp2) { |
4771 movdtox(src, tmp1); |
4771 movdtox(src, tmp1); |
4772 reverse_bytes_32(tmp1, dst, tmp2); |
4772 reverse_bytes_32(tmp1, dst, tmp2); |
4773 } |
4773 } |
|
4774 |
|
4775 void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register buf, int offset) { |
|
4776 xmulx(xcrc_hi, xK_hi, xtmp_lo); |
|
4777 xmulxhi(xcrc_hi, xK_hi, xtmp_hi); |
|
4778 xmulxhi(xcrc_lo, xK_lo, xcrc_hi); |
|
4779 xmulx(xcrc_lo, xK_lo, xcrc_lo); |
|
4780 xor3(xcrc_lo, xtmp_lo, xcrc_lo); |
|
4781 xor3(xcrc_hi, xtmp_hi, xcrc_hi); |
|
4782 ldxl(buf, G0, xtmp_lo); |
|
4783 inc(buf, 8); |
|
4784 ldxl(buf, G0, xtmp_hi); |
|
4785 inc(buf, 8); |
|
4786 xor3(xcrc_lo, xtmp_lo, xcrc_lo); |
|
4787 xor3(xcrc_hi, xtmp_hi, xcrc_hi); |
|
4788 } |
|
4789 |
|
4790 void MacroAssembler::fold_128bit_crc32(Register xcrc_hi, Register xcrc_lo, Register xK_hi, Register xK_lo, Register xtmp_hi, Register xtmp_lo, Register xbuf_hi, Register xbuf_lo) { |
|
4791 mov(xcrc_lo, xtmp_lo); |
|
4792 mov(xcrc_hi, xtmp_hi); |
|
4793 xmulx(xtmp_hi, xK_hi, xtmp_lo); |
|
4794 xmulxhi(xtmp_hi, xK_hi, xtmp_hi); |
|
4795 xmulxhi(xcrc_lo, xK_lo, xcrc_hi); |
|
4796 xmulx(xcrc_lo, xK_lo, xcrc_lo); |
|
4797 xor3(xcrc_lo, xbuf_lo, xcrc_lo); |
|
4798 xor3(xcrc_hi, xbuf_hi, xcrc_hi); |
|
4799 xor3(xcrc_lo, xtmp_lo, xcrc_lo); |
|
4800 xor3(xcrc_hi, xtmp_hi, xcrc_hi); |
|
4801 } |
|
4802 |
|
4803 void MacroAssembler::fold_8bit_crc32(Register xcrc, Register table, Register xtmp, Register tmp) { |
|
4804 and3(xcrc, 0xFF, tmp); |
|
4805 sllx(tmp, 2, tmp); |
|
4806 lduw(table, tmp, xtmp); |
|
4807 srlx(xcrc, 8, xcrc); |
|
4808 xor3(xtmp, xcrc, xcrc); |
|
4809 } |
|
4810 |
|
4811 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { |
|
4812 and3(crc, 0xFF, tmp); |
|
4813 srlx(crc, 8, crc); |
|
4814 sllx(tmp, 2, tmp); |
|
4815 lduw(table, tmp, tmp); |
|
4816 xor3(tmp, crc, crc); |
|
4817 } |
|
4818 |
|
// Number of integer registers kernel_crc32() parks in FP registers and then
// uses as temporaries (the size of its tmp[] array).
#define CRC32_TMP_REG_NUM       18

// Multiplier constants loaded with set64() in kernel_crc32() and fed to the
// xmulx/xmulxhi folding steps.
// NOTE(review): the numeric suffix presumably names the bit distance each
// constant folds across -- confirm against the derivation of these values.
#define CRC32_CONST_64          0x163cd6124
#define CRC32_CONST_96          0x0ccaa009e
#define CRC32_CONST_160         0x1751997d0
#define CRC32_CONST_480         0x1c6e41596
#define CRC32_CONST_544         0x154442bd4
|
4826 |
|
4827 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table) { |
|
4828 |
|
4829 Label L_cleanup_loop, L_cleanup_check, L_align_loop, L_align_check; |
|
4830 Label L_main_loop_prologue; |
|
4831 Label L_fold_512b, L_fold_512b_loop, L_fold_128b; |
|
4832 Label L_fold_tail, L_fold_tail_loop; |
|
4833 Label L_8byte_fold_loop, L_8byte_fold_check; |
|
4834 |
|
4835 const Register tmp[CRC32_TMP_REG_NUM] = {L0, L1, L2, L3, L4, L5, L6, G1, I0, I1, I2, I3, I4, I5, I7, O4, O5, G3}; |
|
4836 |
|
4837 Register const_64 = tmp[CRC32_TMP_REG_NUM-1]; |
|
4838 Register const_96 = tmp[CRC32_TMP_REG_NUM-1]; |
|
4839 Register const_160 = tmp[CRC32_TMP_REG_NUM-2]; |
|
4840 Register const_480 = tmp[CRC32_TMP_REG_NUM-1]; |
|
4841 Register const_544 = tmp[CRC32_TMP_REG_NUM-2]; |
|
4842 |
|
4843 set(ExternalAddress(StubRoutines::crc_table_addr()), table); |
|
4844 |
|
4845 not1(crc); // ~c |
|
4846 clruwu(crc); // clear upper 32 bits of crc |
|
4847 |
|
4848 // Check if below cutoff, proceed directly to cleanup code |
|
4849 mov(31, G4); |
|
4850 cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check); |
|
4851 |
|
4852 // Align buffer to 8 byte boundry |
|
4853 mov(8, O5); |
|
4854 and3(buf, 0x7, O4); |
|
4855 sub(O5, O4, O5); |
|
4856 and3(O5, 0x7, O5); |
|
4857 sub(len, O5, len); |
|
4858 ba(L_align_check); |
|
4859 delayed()->nop(); |
|
4860 |
|
4861 // Alignment loop, table look up method for up to 7 bytes |
|
4862 bind(L_align_loop); |
|
4863 ldub(buf, 0, O4); |
|
4864 inc(buf); |
|
4865 dec(O5); |
|
4866 xor3(O4, crc, O4); |
|
4867 and3(O4, 0xFF, O4); |
|
4868 sllx(O4, 2, O4); |
|
4869 lduw(table, O4, O4); |
|
4870 srlx(crc, 8, crc); |
|
4871 xor3(O4, crc, crc); |
|
4872 bind(L_align_check); |
|
4873 nop(); |
|
4874 cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_align_loop); |
|
4875 |
|
4876 // Aligned on 64-bit (8-byte) boundry at this point |
|
4877 // Check if still above cutoff (31-bytes) |
|
4878 mov(31, G4); |
|
4879 cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_check); |
|
4880 // At least 32 bytes left to process |
|
4881 |
|
4882 // Free up registers by storing them to FP registers |
|
4883 for (int i = 0; i < CRC32_TMP_REG_NUM; i++) { |
|
4884 movxtod(tmp[i], as_FloatRegister(2*i)); |
|
4885 } |
|
4886 |
|
4887 // Determine which loop to enter |
|
4888 // Shared prologue |
|
4889 ldxl(buf, G0, tmp[0]); |
|
4890 inc(buf, 8); |
|
4891 ldxl(buf, G0, tmp[1]); |
|
4892 inc(buf, 8); |
|
4893 xor3(tmp[0], crc, tmp[0]); // Fold CRC into first few bytes |
|
4894 and3(crc, 0, crc); // Clear out the crc register |
|
4895 // Main loop needs 128-bytes at least |
|
4896 mov(128, G4); |
|
4897 mov(64, tmp[2]); |
|
4898 cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_main_loop_prologue); |
|
4899 // Less than 64 bytes |
|
4900 nop(); |
|
4901 cmp_and_br_short(len, tmp[2], Assembler::lessUnsigned, Assembler::pt, L_fold_tail); |
|
4902 // Between 64 and 127 bytes |
|
4903 set64(CRC32_CONST_96, const_96, tmp[8]); |
|
4904 set64(CRC32_CONST_160, const_160, tmp[9]); |
|
4905 fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0); |
|
4906 fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[4], tmp[5], buf, 16); |
|
4907 fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[6], tmp[7], buf, 32); |
|
4908 dec(len, 48); |
|
4909 ba(L_fold_tail); |
|
4910 delayed()->nop(); |
|
4911 |
|
4912 bind(L_main_loop_prologue); |
|
4913 for (int i = 2; i < 8; i++) { |
|
4914 ldxl(buf, G0, tmp[i]); |
|
4915 inc(buf, 8); |
|
4916 } |
|
4917 |
|
4918 // Fold total 512 bits of polynomial on each iteration, |
|
4919 // 128 bits per each of 4 parallel streams |
|
4920 set64(CRC32_CONST_480, const_480, tmp[8]); |
|
4921 set64(CRC32_CONST_544, const_544, tmp[9]); |
|
4922 |
|
4923 mov(128, G4); |
|
4924 bind(L_fold_512b_loop); |
|
4925 fold_128bit_crc32(tmp[1], tmp[0], const_480, const_544, tmp[9], tmp[8], buf, 0); |
|
4926 fold_128bit_crc32(tmp[3], tmp[2], const_480, const_544, tmp[11], tmp[10], buf, 16); |
|
4927 fold_128bit_crc32(tmp[5], tmp[4], const_480, const_544, tmp[13], tmp[12], buf, 32); |
|
4928 fold_128bit_crc32(tmp[7], tmp[6], const_480, const_544, tmp[15], tmp[14], buf, 64); |
|
4929 dec(len, 64); |
|
4930 cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_512b_loop); |
|
4931 |
|
4932 // Fold 512 bits to 128 bits |
|
4933 bind(L_fold_512b); |
|
4934 set64(CRC32_CONST_96, const_96, tmp[8]); |
|
4935 set64(CRC32_CONST_160, const_160, tmp[9]); |
|
4936 |
|
4937 fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[3], tmp[2]); |
|
4938 fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[5], tmp[4]); |
|
4939 fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[8], tmp[9], tmp[7], tmp[6]); |
|
4940 dec(len, 48); |
|
4941 |
|
4942 // Fold the rest of 128 bits data chunks |
|
4943 bind(L_fold_tail); |
|
4944 mov(32, G4); |
|
4945 cmp_and_br_short(len, G4, Assembler::lessEqualUnsigned, Assembler::pt, L_fold_128b); |
|
4946 |
|
4947 set64(CRC32_CONST_96, const_96, tmp[8]); |
|
4948 set64(CRC32_CONST_160, const_160, tmp[9]); |
|
4949 |
|
4950 bind(L_fold_tail_loop); |
|
4951 fold_128bit_crc32(tmp[1], tmp[0], const_96, const_160, tmp[2], tmp[3], buf, 0); |
|
4952 sub(len, 16, len); |
|
4953 cmp_and_br_short(len, G4, Assembler::greaterEqualUnsigned, Assembler::pt, L_fold_tail_loop); |
|
4954 |
|
4955 // Fold the 128 bits in tmps 0 - 1 into tmp 1 |
|
4956 bind(L_fold_128b); |
|
4957 |
|
4958 set64(CRC32_CONST_64, const_64, tmp[4]); |
|
4959 |
|
4960 xmulx(const_64, tmp[0], tmp[2]); |
|
4961 xmulxhi(const_64, tmp[0], tmp[3]); |
|
4962 |
|
4963 srl(tmp[2], G0, tmp[4]); |
|
4964 xmulx(const_64, tmp[4], tmp[4]); |
|
4965 |
|
4966 srlx(tmp[2], 32, tmp[2]); |
|
4967 sllx(tmp[3], 32, tmp[3]); |
|
4968 or3(tmp[2], tmp[3], tmp[2]); |
|
4969 |
|
4970 xor3(tmp[4], tmp[1], tmp[4]); |
|
4971 xor3(tmp[4], tmp[2], tmp[1]); |
|
4972 dec(len, 8); |
|
4973 |
|
4974 // Use table lookup for the 8 bytes left in tmp[1] |
|
4975 dec(len, 8); |
|
4976 |
|
4977 // 8 8-bit folds to compute 32-bit CRC. |
|
4978 for (int j = 0; j < 4; j++) { |
|
4979 fold_8bit_crc32(tmp[1], table, tmp[2], tmp[3]); |
|
4980 } |
|
4981 srl(tmp[1], G0, crc); // move 32 bits to general register |
|
4982 for (int j = 0; j < 4; j++) { |
|
4983 fold_8bit_crc32(crc, table, tmp[3]); |
|
4984 } |
|
4985 |
|
4986 bind(L_8byte_fold_check); |
|
4987 |
|
4988 // Restore int registers saved in FP registers |
|
4989 for (int i = 0; i < CRC32_TMP_REG_NUM; i++) { |
|
4990 movdtox(as_FloatRegister(2*i), tmp[i]); |
|
4991 } |
|
4992 |
|
4993 ba(L_cleanup_check); |
|
4994 delayed()->nop(); |
|
4995 |
|
4996 // Table look-up method for the remaining few bytes |
|
4997 bind(L_cleanup_loop); |
|
4998 ldub(buf, 0, O4); |
|
4999 inc(buf); |
|
5000 dec(len); |
|
5001 xor3(O4, crc, O4); |
|
5002 and3(O4, 0xFF, O4); |
|
5003 sllx(O4, 2, O4); |
|
5004 lduw(table, O4, O4); |
|
5005 srlx(crc, 8, crc); |
|
5006 xor3(O4, crc, crc); |
|
5007 bind(L_cleanup_check); |
|
5008 nop(); |
|
5009 cmp_and_br_short(len, 0, Assembler::greaterUnsigned, Assembler::pt, L_cleanup_loop); |
|
5010 |
|
5011 not1(crc); |
|
5012 } |
|
5013 |