hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
changeset 38237 d972e3a2df53
parent 37466 287c4ebd11b0
child 38241 32eab2eb41fd
equal deleted inserted replaced
38236:510f77046e00 38237:d972e3a2df53
     1 /*
     1 /*
     2  * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
     2  * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
     4  *
     4  *
     5  * This code is free software; you can redistribute it and/or modify it
     5  * This code is free software; you can redistribute it and/or modify it
     6  * under the terms of the GNU General Public License version 2 only, as
     6  * under the terms of the GNU General Public License version 2 only, as
     7  * published by the Free Software Foundation.
     7  * published by the Free Software Foundation.
  4907       __ delayed()->restore();
  4907       __ delayed()->restore();
  4908 
  4908 
  4909       return start;
  4909       return start;
  4910   }
  4910   }
  4911 
  4911 
  4912 #define CHUNK_LEN   128          /* 128 x 8B = 1KB */
       
  4913 #define CHUNK_K1    0x1307a0206  /* reverseBits(pow(x, CHUNK_LEN*8*8*3 - 32) mod P(x)) << 1 */
       
  4914 #define CHUNK_K2    0x1a0f717c4  /* reverseBits(pow(x, CHUNK_LEN*8*8*2 - 32) mod P(x)) << 1 */
       
  4915 #define CHUNK_K3    0x0170076fa  /* reverseBits(pow(x, CHUNK_LEN*8*8*1 - 32) mod P(x)) << 1 */
       
  4916 
       
  4917   /**
  4912   /**
  4918    *  Arguments:
  4913    *  Arguments:
  4919    *
  4914    *
  4920    * Inputs:
  4915    * Inputs:
  4921    *   O0   - int   crc
  4916    *   O0   - int   crc
  4936     const Register crc   = O0;  // crc
  4931     const Register crc   = O0;  // crc
  4937     const Register buf   = O1;  // source java byte array address
  4932     const Register buf   = O1;  // source java byte array address
  4938     const Register len   = O2;  // number of bytes
  4933     const Register len   = O2;  // number of bytes
  4939     const Register table = O3;  // byteTable
  4934     const Register table = O3;  // byteTable
  4940 
  4935 
  4941     Label L_crc32c_head, L_crc32c_aligned;
  4936     __ kernel_crc32c(crc, buf, len, table);
  4942     Label L_crc32c_parallel, L_crc32c_parallel_loop;
  4937 
  4943     Label L_crc32c_serial, L_crc32c_x32_loop, L_crc32c_x8, L_crc32c_x8_loop;
       
  4944     Label L_crc32c_done, L_crc32c_tail, L_crc32c_return;
       
  4945 
       
  4946     __ cmp_and_br_short(len, 0, Assembler::lessEqual, Assembler::pn, L_crc32c_return);
       
  4947 
       
  4948     // clear upper 32 bits of crc
       
  4949     __ clruwu(crc);
       
  4950 
       
  4951     __ and3(buf, 7, G4);
       
  4952     __ cmp_and_brx_short(G4, 0, Assembler::equal, Assembler::pt, L_crc32c_aligned);
       
  4953 
       
  4954     __ mov(8, G1);
       
  4955     __ sub(G1, G4, G4);
       
  4956 
       
  4957     // ------ process the misaligned head (7 bytes or less) ------
       
  4958     __ BIND(L_crc32c_head);
       
  4959 
       
  4960     // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
       
  4961     __ ldub(buf, 0, G1);
       
  4962     __ update_byte_crc32(crc, G1, table);
       
  4963 
       
  4964     __ inc(buf);
       
  4965     __ dec(len);
       
  4966     __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pn, L_crc32c_return);
       
  4967     __ dec(G4);
       
  4968     __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_head);
       
  4969 
       
  4970     // ------ process the 8-byte-aligned body ------
       
  4971     __ BIND(L_crc32c_aligned);
       
  4972     __ nop();
       
  4973     __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pn, L_crc32c_tail);
       
  4974 
       
  4975     // reverse the byte order of lower 32 bits to big endian, and move to FP side
       
  4976     __ movitof_revbytes(crc, F0, G1, G3);
       
  4977 
       
  4978     __ set(CHUNK_LEN*8*4, G4);
       
  4979     __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pt, L_crc32c_serial);
       
  4980 
       
  4981     // ------ process four 1KB chunks in parallel ------
       
  4982     __ BIND(L_crc32c_parallel);
       
  4983 
       
  4984     __ fzero(FloatRegisterImpl::D, F2);
       
  4985     __ fzero(FloatRegisterImpl::D, F4);
       
  4986     __ fzero(FloatRegisterImpl::D, F6);
       
  4987 
       
  4988     __ mov(CHUNK_LEN - 1, G4);
       
  4989     __ BIND(L_crc32c_parallel_loop);
       
  4990     // schedule ldf's ahead of crc32c's to hide the load-use latency
       
  4991     __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
       
  4992     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
       
  4993     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
       
  4994     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*24, F14);
       
  4995     __ crc32c(F0, F8,  F0);
       
  4996     __ crc32c(F2, F10, F2);
       
  4997     __ crc32c(F4, F12, F4);
       
  4998     __ crc32c(F6, F14, F6);
       
  4999     __ inc(buf, 8);
       
  5000     __ dec(G4);
       
  5001     __ cmp_and_br_short(G4, 0, Assembler::greater, Assembler::pt, L_crc32c_parallel_loop);
       
  5002 
       
  5003     __ ldf(FloatRegisterImpl::D, buf, 0,            F8);
       
  5004     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*8,  F10);
       
  5005     __ ldf(FloatRegisterImpl::D, buf, CHUNK_LEN*16, F12);
       
  5006     __ crc32c(F0, F8,  F0);
       
  5007     __ crc32c(F2, F10, F2);
       
  5008     __ crc32c(F4, F12, F4);
       
  5009 
       
  5010     __ inc(buf, CHUNK_LEN*24);
       
  5011     __ ldfl(FloatRegisterImpl::D, buf, G0, F14);  // load in little endian
       
  5012     __ inc(buf, 8);
       
  5013 
       
  5014     __ prefetch(buf, 0,            Assembler::severalReads);
       
  5015     __ prefetch(buf, CHUNK_LEN*8,  Assembler::severalReads);
       
  5016     __ prefetch(buf, CHUNK_LEN*16, Assembler::severalReads);
       
  5017     __ prefetch(buf, CHUNK_LEN*24, Assembler::severalReads);
       
  5018 
       
  5019     // move to INT side, and reverse the byte order of lower 32 bits to little endian
       
  5020     __ movftoi_revbytes(F0, O4, G1, G4);
       
  5021     __ movftoi_revbytes(F2, O5, G1, G4);
       
  5022     __ movftoi_revbytes(F4, G5, G1, G4);
       
  5023 
       
  5024     // combine the results of 4 chunks
       
  5025     __ set64(CHUNK_K1, G3, G1);
       
  5026     __ xmulx(O4, G3, O4);
       
  5027     __ set64(CHUNK_K2, G3, G1);
       
  5028     __ xmulx(O5, G3, O5);
       
  5029     __ set64(CHUNK_K3, G3, G1);
       
  5030     __ xmulx(G5, G3, G5);
       
  5031 
       
  5032     __ movdtox(F14, G4);
       
  5033     __ xor3(O4, O5, O5);
       
  5034     __ xor3(G5, O5, O5);
       
  5035     __ xor3(G4, O5, O5);
       
  5036 
       
  5037     // reverse the byte order to big endian, via stack, and move to FP side
       
  5038     __ add(SP, -8, G1);
       
  5039     __ srlx(G1, 3, G1);
       
  5040     __ sllx(G1, 3, G1);
       
  5041     __ stx(O5, G1, G0);
       
  5042     __ ldfl(FloatRegisterImpl::D, G1, G0, F2);  // load in little endian
       
  5043 
       
  5044     __ crc32c(F6, F2, F0);
       
  5045 
       
  5046     __ set(CHUNK_LEN*8*4, G4);
       
  5047     __ sub(len, G4, len);
       
  5048     __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_parallel);
       
  5049     __ nop();
       
  5050     __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_done);
       
  5051 
       
  5052     __ BIND(L_crc32c_serial);
       
  5053 
       
  5054     __ mov(32, G4);
       
  5055     __ cmp_and_br_short(len, G4, Assembler::less, Assembler::pn, L_crc32c_x8);
       
  5056 
       
  5057     // ------ process 32B chunks ------
       
  5058     __ BIND(L_crc32c_x32_loop);
       
  5059     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
       
  5060     __ inc(buf, 8);
       
  5061     __ crc32c(F0, F2, F0);
       
  5062     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
       
  5063     __ inc(buf, 8);
       
  5064     __ crc32c(F0, F2, F0);
       
  5065     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
       
  5066     __ inc(buf, 8);
       
  5067     __ crc32c(F0, F2, F0);
       
  5068     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
       
  5069     __ inc(buf, 8);
       
  5070     __ crc32c(F0, F2, F0);
       
  5071     __ dec(len, 32);
       
  5072     __ cmp_and_br_short(len, G4, Assembler::greaterEqual, Assembler::pt, L_crc32c_x32_loop);
       
  5073 
       
  5074     __ BIND(L_crc32c_x8);
       
  5075     __ nop();
       
  5076     __ cmp_and_br_short(len, 8, Assembler::less, Assembler::pt, L_crc32c_done);
       
  5077 
       
  5078     // ------ process 8B chunks ------
       
  5079     __ BIND(L_crc32c_x8_loop);
       
  5080     __ ldf(FloatRegisterImpl::D, buf, 0, F2);
       
  5081     __ inc(buf, 8);
       
  5082     __ crc32c(F0, F2, F0);
       
  5083     __ dec(len, 8);
       
  5084     __ cmp_and_br_short(len, 8, Assembler::greaterEqual, Assembler::pt, L_crc32c_x8_loop);
       
  5085 
       
  5086     __ BIND(L_crc32c_done);
       
  5087 
       
  5088     // move to INT side, and reverse the byte order of lower 32 bits to little endian
       
  5089     __ movftoi_revbytes(F0, crc, G1, G3);
       
  5090 
       
  5091     __ cmp_and_br_short(len, 0, Assembler::equal, Assembler::pt, L_crc32c_return);
       
  5092 
       
  5093     // ------ process the misaligned tail (7 bytes or less) ------
       
  5094     __ BIND(L_crc32c_tail);
       
  5095 
       
  5096     // crc = (crc >>> 8) ^ byteTable[(crc ^ b) & 0xFF];
       
  5097     __ ldub(buf, 0, G1);
       
  5098     __ update_byte_crc32(crc, G1, table);
       
  5099 
       
  5100     __ inc(buf);
       
  5101     __ dec(len);
       
  5102     __ cmp_and_br_short(len, 0, Assembler::greater, Assembler::pt, L_crc32c_tail);
       
  5103 
       
  5104     __ BIND(L_crc32c_return);
       
  5105     __ nop();
       
  5106     __ retl();
  4938     __ retl();
  5107     __ delayed()->nop();
  4939     __ delayed()->nop();
  5108 
  4940 
  5109     return start;
  4941     return start;
  5110   }
  4942   }
  5364     if (UseCRC32Intrinsics) {
  5196     if (UseCRC32Intrinsics) {
  5366       // set table address before generating the stub that uses it
  5198       // set table address before generating the stub that uses it
  5366       StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
  5198       StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
  5367       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
  5199       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
  5368     }
  5200     }
       
  5201 
       
  5202     if (UseCRC32CIntrinsics) {
       
  5203       // set table address before generating the stub that uses it
       
  5204       StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
       
  5205       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
       
  5206     }
  5369   }
  5207   }
  5370 
  5208 
  5371 
  5209 
  5372   void generate_all() {
  5210   void generate_all() {
  5373     // Generates all stubs and initializes the entry points
  5211     // Generates all stubs and initializes the entry points
  5423     }
  5261     }
  5424     if (UseSHA512Intrinsics) {
  5262     if (UseSHA512Intrinsics) {
  5425       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
  5263       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
  5426       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
  5264       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
  5427     }
  5265     }
  5428 
       
  5429     // generate CRC32C intrinsic code
       
  5430     if (UseCRC32CIntrinsics) {
       
  5431       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
       
  5432     }
       
  5433 
       
  5434     // generate Adler32 intrinsics code
  5266     // generate Adler32 intrinsics code
  5435     if (UseAdler32Intrinsics) {
  5267     if (UseAdler32Intrinsics) {
  5436       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
  5268       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
  5437     }
  5269     }
  5438   }
  5270   }