4936 const Register crc = O0; // crc |
4931 const Register crc = O0; // crc |
4937 const Register buf = O1; // source java byte array address |
4932 const Register buf = O1; // source java byte array address |
4938 const Register len = O2; // number of bytes |
4933 const Register len = O2; // number of bytes |
4939 const Register table = O3; // byteTable |
4934 const Register table = O3; // byteTable |
4940 |
4935 |
4941 Label L_crc32c_head, L_crc32c_aligned; |
4936 __ kernel_crc32c(crc, buf, len, table); |
4942 Label L_crc32c_parallel, L_crc32c_parallel_loop; |
4937 |
4943 Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop; |
|
4944 Label L_crc32c_done, L_crc32c_tail, L_crc32c_return; |
|
4945 |
|
4946 __ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return); |
|
4947 |
|
4948 // clear upper 32 bits of crc |
|
4949 __ clruwu(crc); |
|
4950 |
|
4951 __ and3(buf, 7, G4); |
|
4952 __ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned); |
|
4953 |
|
4954 __ mov(8, G1); |
|
4955 __ sub(G1, G4, G4); |
|
4956 |
|
4957 // ------ process the misaligned head (7 bytes or less) ------ |
|
4958 __ BIND(L_crc32c_head); |
|
4959 |
|
4960 // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF]; |
|
4961 __ ldub(buf, 0, G1); |
|
4962 __ update_byte_crc32(crc, G1, table); |
|
4963 |
|
4964 __ inc(buf); |
|
4965 __ dec(len); |
|
4966 __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return); |
|
4967 __ dec(G4); |
|
4968 __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head); |
|
4969 |
|
4970 // ------ process the 8-byte-aligned body ------ |
|
4971 __ BIND(L_crc32c_aligned); |
|
4972 __ nop(); |
|
4973 __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail); |
|
4974 |
|
4975 // reverse the byte order of lower 32 bits to big endian, and move to FP side |
|
4976 __ movitof_revbytes(crc, F0, G1, G3); |
|
4977 |
|
4978 __ set(CHUNK_LEN*8*4, G4); |
|
4979 __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial); |
|
4980 |
|
4981 // ------ process four 1KB chunks in parallel ------ |
|
4982 __ BIND(L_crc32c_parallel); |
|
4983 |
|
4984 __ fzero(FloatRegisterImpl::D, F2); |
|
4985 __ fzero(FloatRegisterImpl::D, F4); |
|
4986 __ fzero(FloatRegisterImpl::D, F6); |
|
4987 |
|
4988 __ mov(CHUNK_LEN - 1, G4); |
|
4989 __ BIND(L_crc32c_parallel_loop); |
|
4990 // schedule ldf's ahead of crc32c's to hide the load-use latency |
|
4991 __ ldf(FloatRegisterImpl::D, buf, 0, F8); |
|
4992 __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8, F10); |
|
4993 __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12); |
|
4994 __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14); |
|
4995 __ crc32c(F0, F8, F0); |
|
4996 __ crc32c(F2, F10, F2); |
|
4997 __ crc32c(F4, F12, F4); |
|
4998 __ crc32c(F6, F14, F6); |
|
4999 __ inc(buf, 8); |
|
5000 __ dec(G4); |
|
5001 __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop); |
|
5002 |
|
5003 __ ldf(FloatRegisterImpl::D, buf, 0, F8); |
|
5004 __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8, F10); |
|
5005 __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12); |
|
5006 __ crc32c(F0, F8, F0); |
|
5007 __ crc32c(F2, F10, F2); |
|
5008 __ crc32c(F4, F12, F4); |
|
5009 |
|
5010 __ inc(buf, CHUNK_LEN*24); |
|
5011 __ ldfl(FloatRegisterImpl::D, buf, G0, F14); // load in little endian |
|
5012 __ inc(buf, 8); |
|
5013 |
|
5014 __ prefetch(buf, 0, Assembler::severalReads); |
|
5015 __ prefetch(buf, CHUNK_LEN*8, Assembler::severalReads); |
|
5016 __ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads); |
|
5017 __ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads); |
|
5018 |
|
5019 // move to INT side, and reverse the byte order of lower 32 bits to little endian |
|
5020 __ movftoi_revbytes(F0, O4, G1, G4); |
|
5021 __ movftoi_revbytes(F2, O5, G1, G4); |
|
5022 __ movftoi_revbytes(F4, G5, G1, G4); |
|
5023 |
|
5024 // combine the results of 4 chunks |
|
5025 __ set64(CHUNK_K1, G3, G1); |
|
5026 __ xmulx(O4, G3, O4); |
|
5027 __ set64(CHUNK_K2, G3, G1); |
|
5028 __ xmulx(O5, G3, O5); |
|
5029 __ set64(CHUNK_K3, G3, G1); |
|
5030 __ xmulx(G5, G3, G5); |
|
5031 |
|
5032 __ movdtox(F14, G4); |
|
5033 __ xor3(O4, O5, O5); |
|
5034 __ xor3(G5, O5, O5); |
|
5035 __ xor3(G4, O5, O5); |
|
5036 |
|
5037 // reverse the byte order to big endian, via stack, and move to FP side |
|
5038 __ add(SP, -8, G1); |
|
5039 __ srlx(G1, 3, G1); |
|
5040 __ sllx(G1, 3, G1); |
|
5041 __ stx(O5, G1, G0); |
|
5042 __ ldfl(FloatRegisterImpl::D, G1, G0, F2); // load in little endian |
|
5043 |
|
5044 __ crc32c(F6, F2, F0); |
|
5045 |
|
5046 __ set(CHUNK_LEN*8*4, G4); |
|
5047 __ sub(len, G4, len); |
|
5048 __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel); |
|
5049 __ nop(); |
|
5050 __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done); |
|
5051 |
|
5052 __ BIND(L_crc32c_serial); |
|
5053 |
|
5054 __ mov(32, G4); |
|
5055 __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8); |
|
5056 |
|
5057 // ------ process 32B chunks ------ |
|
5058 __ BIND(L_crc32c_x32_loop); |
|
5059 __ ldf(FloatRegisterImpl::D, buf, 0, F2); |
|
5060 __ inc(buf, 8); |
|
5061 __ crc32c(F0, F2, F0); |
|
5062 __ ldf(FloatRegisterImpl::D, buf, 0, F2); |
|
5063 __ inc(buf, 8); |
|
5064 __ crc32c(F0, F2, F0); |
|
5065 __ ldf(FloatRegisterImpl::D, buf, 0, F2); |
|
5066 __ inc(buf, 8); |
|
5067 __ crc32c(F0, F2, F0); |
|
5068 __ ldf(FloatRegisterImpl::D, buf, 0, F2); |
|
5069 __ inc(buf, 8); |
|
5070 __ crc32c(F0, F2, F0); |
|
5071 __ dec(len, 32); |
|
5072 __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop); |
|
5073 |
|
5074 __ BIND(L_crc32c_x8); |
|
5075 __ nop(); |
|
5076 __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done); |
|
5077 |
|
5078 // ------ process 8B chunks ------ |
|
5079 __ BIND(L_crc32c_x8_loop); |
|
5080 __ ldf(FloatRegisterImpl::D, buf, 0, F2); |
|
5081 __ inc(buf, 8); |
|
5082 __ crc32c(F0, F2, F0); |
|
5083 __ dec(len, 8); |
|
5084 __ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop); |
|
5085 |
|
5086 __ BIND(L_crc32c_done); |
|
5087 |
|
5088 // move to INT side, and reverse the byte order of lower 32 bits to little endian |
|
5089 __ movftoi_revbytes(F0, crc, G1, G3); |
|
5090 |
|
5091 __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return); |
|
5092 |
|
5093 // ------ process the misaligned tail (7 bytes or less) ------ |
|
5094 __ BIND(L_crc32c_tail); |
|
5095 |
|
5096 // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF]; |
|
5097 __ ldub(buf, 0, G1); |
|
5098 __ update_byte_crc32(crc, G1, table); |
|
5099 |
|
5100 __ inc(buf); |
|
5101 __ dec(len); |
|
5102 __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail); |
|
5103 |
|
5104 __ BIND(L_crc32c_return); |
|
5105 __ nop(); |
|
5106 __ retl(); |
4938 __ retl(); |
5107 __ delayed()->nop(); |
4939 __ delayed()->nop(); |
5108 |
4940 |
5109 return start; |
4941 return start; |
5110 } |
4942 } |