4835 xor3(val, crc, crc); |
4835 xor3(val, crc, crc); |
4836 } |
4836 } |
4837 |
4837 |
4838 // Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros |
4838 // Reverse byte order of lower 32 bits, assuming upper 32 bits all zeros |
4839 void MacroAssembler::reverse_bytes_32(Register src, Register dst, Register tmp) { |
4839 void MacroAssembler::reverse_bytes_32(Register src, Register dst, Register tmp) { |
4840 srlx(src, 24, dst); |
4840 srlx(src, 24, dst); |
4841 |
4841 |
4842 sllx(src, 32+8, tmp); |
4842 sllx(src, 32+8, tmp); |
4843 srlx(tmp, 32+24, tmp); |
4843 srlx(tmp, 32+24, tmp); |
4844 sllx(tmp, 8, tmp); |
4844 sllx(tmp, 8, tmp); |
4845 or3(dst, tmp, dst); |
4845 or3(dst, tmp, dst); |
4846 |
4846 |
4847 sllx(src, 32+16, tmp); |
4847 sllx(src, 32+16, tmp); |
4848 srlx(tmp, 32+24, tmp); |
4848 srlx(tmp, 32+24, tmp); |
4849 sllx(tmp, 16, tmp); |
4849 sllx(tmp, 16, tmp); |
4850 or3(dst, tmp, dst); |
4850 or3(dst, tmp, dst); |
4851 |
4851 |
4852 sllx(src, 32+24, tmp); |
4852 sllx(src, 32+24, tmp); |
4853 srlx(tmp, 32, tmp); |
4853 srlx(tmp, 32, tmp); |
4854 or3(dst, tmp, dst); |
4854 or3(dst, tmp, dst); |
4855 } |
4855 } |
4856 |
4856 |
4857 void MacroAssembler::movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2) { |
4857 void MacroAssembler::movitof_revbytes(Register src, FloatRegister dst, Register tmp1, Register tmp2) { |
4858 reverse_bytes_32(src, tmp1, tmp2); |
4858 reverse_bytes_32(src, tmp1, tmp2); |
4859 movxtod(tmp1, dst); |
4859 movxtod(tmp1, dst); |
5101 cmp_and_br_short(len, 0, Assembler::greaterUnsigned, Assembler::pt, L_cleanup_loop); |
5101 cmp_and_br_short(len, 0, Assembler::greaterUnsigned, Assembler::pt, L_cleanup_loop); |
5102 |
5102 |
5103 not1(crc); |
5103 not1(crc); |
5104 } |
5104 } |
5105 |
5105 |
|
// Parameters of the chunked CRC32C kernel below.
// CHUNK_LEN is the number of 8-byte words in one chunk.
// CHUNK_K1..K3 are the constants the kernel multiplies the CRCs of the first
// three chunks by before combining them with the fourth chunk.
#define CHUNK_LEN   128          /* 128 x 8B = 1KB */
#define CHUNK_K1    0x1307a0206  /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
#define CHUNK_K2    0x1a0f717c4  /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
#define CHUNK_K3    0x0170076fa  /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
|
5110 |
|
5111 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len, Register table) { |
|
5112 |
|
5113 Label L_crc32c_head, L_crc32c_aligned; |
|
5114 Label L_crc32c_parallel, L_crc32c_parallel_loop; |
|
5115 Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop; |
|
5116 Label L_crc32c_done, L_crc32c_tail, L_crc32c_return; |
|
5117 |
|
5118 set(ExternalAddress(StubRoutines::crc32c_table_addr()), table); |
|
5119 |
|
5120 cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return); |
|
5121 |
|
5122 // clear upper 32 bits of crc |
|
5123 clruwu(crc); |
|
5124 |
|
5125 and3(buf, 7, G4); |
|
5126 cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned); |
|
5127 |
|
5128 mov(8, G1); |
|
5129 sub(G1, G4, G4); |
|
5130 |
|
5131 // ------ process the misaligned head (7 bytes or less) ------ |
|
5132 bind(L_crc32c_head); |
|
5133 |
|
5134 // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF]; |
|
5135 ldub(buf, 0, G1); |
|
5136 update_byte_crc32(crc, G1, table); |
|
5137 |
|
5138 inc(buf); |
|
5139 dec(len); |
|
5140 cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return); |
|
5141 dec(G4); |
|
5142 cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head); |
|
5143 |
|
5144 // ------ process the 8-byte-aligned body ------ |
|
5145 bind(L_crc32c_aligned); |
|
5146 nop(); |
|
5147 cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail); |
|
5148 |
|
5149 // reverse the byte order of lower 32 bits to big endian, and move to FP side |
|
5150 movitof_revbytes(crc, F0, G1, G3); |
|
5151 |
|
5152 set(CHUNK_LEN*8*4, G4); |
|
5153 cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial); |
|
5154 |
|
5155 // ------ process four 1KB chunks in parallel ------ |
|
5156 bind(L_crc32c_parallel); |
|
5157 |
|
5158 fzero(FloatRegisterImpl::D, F2); |
|
5159 fzero(FloatRegisterImpl::D, F4); |
|
5160 fzero(FloatRegisterImpl::D, F6); |
|
5161 |
|
5162 mov(CHUNK_LEN - 1, G4); |
|
5163 bind(L_crc32c_parallel_loop); |
|
5164 // schedule ldf's ahead of crc32c's to hide the load-use latency |
|
5165 ldf(FloatRegisterImpl::D, buf, 0, F8); |
|
5166 ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8, F10); |
|
5167 ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12); |
|
5168 ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14); |
|
5169 crc32c(F0, F8, F0); |
|
5170 crc32c(F2, F10, F2); |
|
5171 crc32c(F4, F12, F4); |
|
5172 crc32c(F6, F14, F6); |
|
5173 inc(buf, 8); |
|
5174 dec(G4); |
|
5175 cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop); |
|
5176 |
|
5177 ldf(FloatRegisterImpl::D, buf, 0, F8); |
|
5178 ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8, F10); |
|
5179 ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12); |
|
5180 crc32c(F0, F8, F0); |
|
5181 crc32c(F2, F10, F2); |
|
5182 crc32c(F4, F12, F4); |
|
5183 |
|
5184 inc(buf, CHUNK_LEN*24); |
|
5185 ldfl(FloatRegisterImpl::D, buf, G0, F14); // load in little endian |
|
5186 inc(buf, 8); |
|
5187 |
|
5188 prefetch(buf, 0, Assembler::severalReads); |
|
5189 prefetch(buf, CHUNK_LEN*8, Assembler::severalReads); |
|
5190 prefetch(buf, CHUNK_LEN*16, Assembler::severalReads); |
|
5191 prefetch(buf, CHUNK_LEN*24, Assembler::severalReads); |
|
5192 |
|
5193 // move to INT side, and reverse the byte order of lower 32 bits to little endian |
|
5194 movftoi_revbytes(F0, O4, G1, G4); |
|
5195 movftoi_revbytes(F2, O5, G1, G4); |
|
5196 movftoi_revbytes(F4, G5, G1, G4); |
|
5197 |
|
5198 // combine the results of 4 chunks |
|
5199 set64(CHUNK_K1, G3, G1); |
|
5200 xmulx(O4, G3, O4); |
|
5201 set64(CHUNK_K2, G3, G1); |
|
5202 xmulx(O5, G3, O5); |
|
5203 set64(CHUNK_K3, G3, G1); |
|
5204 xmulx(G5, G3, G5); |
|
5205 |
|
5206 movdtox(F14, G4); |
|
5207 xor3(O4, O5, O5); |
|
5208 xor3(G5, O5, O5); |
|
5209 xor3(G4, O5, O5); |
|
5210 |
|
5211 // reverse the byte order to big endian, via stack, and move to FP side |
|
5212 // TODO: use new revb instruction |
|
5213 add(SP, -8, G1); |
|
5214 srlx(G1, 3, G1); |
|
5215 sllx(G1, 3, G1); |
|
5216 stx(O5, G1, G0); |
|
5217 ldfl(FloatRegisterImpl::D, G1, G0, F2); // load in little endian |
|
5218 |
|
5219 crc32c(F6, F2, F0); |
|
5220 |
|
5221 set(CHUNK_LEN*8*4, G4); |
|
5222 sub(len, G4, len); |
|
5223 cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel); |
|
5224 nop(); |
|
5225 cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done); |
|
5226 |
|
5227 bind(L_crc32c_serial); |
|
5228 |
|
5229 mov(32, G4); |
|
5230 cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8); |
|
5231 |
|
5232 // ------ process 32B chunks ------ |
|
5233 bind(L_crc32c_x32_loop); |
|
5234 ldf(FloatRegisterImpl::D, buf, 0, F2); |
|
5235 crc32c(F0, F2, F0); |
|
5236 ldf(FloatRegisterImpl::D, buf, 8, F2); |
|
5237 crc32c(F0, F2, F0); |
|
5238 ldf(FloatRegisterImpl::D, buf, 16, F2); |
|
5239 crc32c(F0, F2, F0); |
|
5240 ldf(FloatRegisterImpl::D, buf, 24, F2); |
|
5241 inc(buf, 32); |
|
5242 crc32c(F0, F2, F0); |
|
5243 dec(len, 32); |
|
5244 cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop); |
|
5245 |
|
5246 bind(L_crc32c_x8); |
|
5247 nop(); |
|
5248 cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done); |
|
5249 |
|
5250 // ------ process 8B chunks ------ |
|
5251 bind(L_crc32c_x8_loop); |
|
5252 ldf(FloatRegisterImpl::D, buf, 0, F2); |
|
5253 inc(buf, 8); |
|
5254 crc32c(F0, F2, F0); |
|
5255 dec(len, 8); |
|
5256 cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop); |
|
5257 |
|
5258 bind(L_crc32c_done); |
|
5259 |
|
5260 // move to INT side, and reverse the byte order of lower 32 bits to little endian |
|
5261 movftoi_revbytes(F0, crc, G1, G3); |
|
5262 |
|
5263 cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return); |
|
5264 |
|
5265 // ------ process the misaligned tail (7 bytes or less) ------ |
|
5266 bind(L_crc32c_tail); |
|
5267 |
|
5268 // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF]; |
|
5269 ldub(buf, 0, G1); |
|
5270 update_byte_crc32(crc, G1, table); |
|
5271 |
|
5272 inc(buf); |
|
5273 dec(len); |
|
5274 cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail); |
|
5275 |
|
5276 bind(L_crc32c_return); |
|
5277 nop(); |
|
5278 } |