hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
changeset 36564 9442bb67de26
parent 36563 0b48c2c8ad13
child 36595 3322a76f3a00
@@ -739,10 +739,11 @@
   // s and d are adjusted to point to the remaining words to copy
   //
   void generate_copy_longs(Label &start, Register s, Register d, Register count,
                            copy_direction direction) {
     int unit = wordSize * direction;
+    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 
     int offset;
     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
     const Register stride = r13;
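Why the new bias value (illustration only, not from the changeset): in the copy loop below, the lowest offset the loop body ever uses is 4 * unit when UseSIMDForMemoryOps is set (ldpq/stpq pairs) and 2 * unit otherwise (ldp/stp pairs). Pre-biasing s and d down by that many words before a forward copy makes the first access land back on the original pointers: with SIMD, unit = wordSize = 8, bias = 32, and Address(s, 4 * unit) reads (s - 32) + 32 = s.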
@@ -758,12 +759,12 @@
       stub_name = "backward_copy_longs";
     StubCodeMark mark(this, "StubRoutines", stub_name);
     __ align(CodeEntryAlignment);
     __ bind(start);
     if (direction == copy_forwards) {
-      __ sub(s, s, 2 * wordSize);
-      __ sub(d, d, 2 * wordSize);
+      __ sub(s, s, bias);
+      __ sub(d, d, bias);
     }
 
 #ifdef ASSERT
     // Make sure we are never given < 8 words
     {
@@ -774,14 +775,19 @@
       __ bind(L);
     }
 #endif
 
     // Fill 8 registers
-    __ ldp(t0, t1, Address(s, 2 * unit));
-    __ ldp(t2, t3, Address(s, 4 * unit));
-    __ ldp(t4, t5, Address(s, 6 * unit));
-    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+    if (UseSIMDForMemoryOps) {
+      __ ldpq(v0, v1, Address(s, 4 * unit));
+      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
+    } else {
+      __ ldp(t0, t1, Address(s, 2 * unit));
+      __ ldp(t2, t3, Address(s, 4 * unit));
+      __ ldp(t4, t5, Address(s, 6 * unit));
+      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+    }
 
     __ subs(count, count, 16);
     __ br(Assembler::LO, drain);
 
     int prefetch = PrefetchCopyIntervalInBytes;
@@ -795,42 +801,59 @@
     __ bind(again);
 
     if (PrefetchCopyIntervalInBytes > 0)
       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 
-    __ stp(t0, t1, Address(d, 2 * unit));
-    __ ldp(t0, t1, Address(s, 2 * unit));
-    __ stp(t2, t3, Address(d, 4 * unit));
-    __ ldp(t2, t3, Address(s, 4 * unit));
-    __ stp(t4, t5, Address(d, 6 * unit));
-    __ ldp(t4, t5, Address(s, 6 * unit));
-    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
-    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+    if (UseSIMDForMemoryOps) {
+      __ stpq(v0, v1, Address(d, 4 * unit));
+      __ ldpq(v0, v1, Address(s, 4 * unit));
+      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
+      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
+    } else {
+      __ stp(t0, t1, Address(d, 2 * unit));
+      __ ldp(t0, t1, Address(s, 2 * unit));
+      __ stp(t2, t3, Address(d, 4 * unit));
+      __ ldp(t2, t3, Address(s, 4 * unit));
+      __ stp(t4, t5, Address(d, 6 * unit));
+      __ ldp(t4, t5, Address(s, 6 * unit));
+      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
+      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+    }
 
     __ subs(count, count, 8);
     __ br(Assembler::HS, again);
 
     // Drain
     __ bind(drain);
-    __ stp(t0, t1, Address(d, 2 * unit));
-    __ stp(t2, t3, Address(d, 4 * unit));
-    __ stp(t4, t5, Address(d, 6 * unit));
-    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
-
-    if (direction == copy_forwards) {
-      __ add(s, s, 2 * wordSize);
-      __ add(d, d, 2 * wordSize);
-    }
+    if (UseSIMDForMemoryOps) {
+      __ stpq(v0, v1, Address(d, 4 * unit));
+      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
+    } else {
+      __ stp(t0, t1, Address(d, 2 * unit));
+      __ stp(t2, t3, Address(d, 4 * unit));
+      __ stp(t4, t5, Address(d, 6 * unit));
+      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
+    }
 
     {
       Label L1, L2;
       __ tbz(count, exact_log2(4), L1);
-      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
-      __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
-      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
-      __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+      if (UseSIMDForMemoryOps) {
+        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
+        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
+      } else {
+        __ ldp(t0, t1, Address(s, 2 * unit));
+        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
+        __ stp(t0, t1, Address(d, 2 * unit));
+        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
+      }
       __ bind(L1);
+
+      if (direction == copy_forwards) {
+        __ add(s, s, 2 * wordSize);
+        __ add(d, d, 2 * wordSize);
+      }
 
       __ tbz(count, 1, L2);
       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
       __ bind(L2);
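For readers who do not speak MacroAssembler, here is a minimal stand-alone C++ sketch (not HotSpot code; the function name copy_longs_forward and the plain array standing in for the register file are invented) of the software-pipelined structure both variants of the loop above share: fill one 8-word block, then in the steady state store the block loaded on the previous iteration while loading the next, and finally drain the block still in flight. With UseSIMDForMemoryOps the same structure simply moves the 64 bytes as four 16-byte q-register halves (ldpq/stpq) instead of eight 8-byte x-register words (ldp/stp).

#include <cstddef>
#include <cstdint>

// Sketch of generate_copy_longs(copy_forwards); the caller guarantees count >= 8 words.
void copy_longs_forward(const uint64_t* s, uint64_t* d, ptrdiff_t count) {
  uint64_t t[8];                                  // stands in for t0..t7 / v0..v3
  for (int i = 0; i < 8; i++) t[i] = s[i];        // "Fill 8 registers"
  s += 8;
  count -= 8;
  while (count >= 8) {                            // the "again" loop
    for (int i = 0; i < 8; i++) d[i] = t[i];      // store block from the last iteration
    for (int i = 0; i < 8; i++) t[i] = s[i];      // load the next block
    s += 8;
    d += 8;
    count -= 8;
  }
  for (int i = 0; i < 8; i++) d[i] = t[i];        // "Drain" the block still in flight
  d += 8;
  for (ptrdiff_t i = 0; i < count; i++)           // 0..7 words remain; the stub handles
    d[i] = s[i];                                  // them by testing the bits of count
}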
@@ -912,12 +935,11 @@
     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
     const Register send = r17, dend = r18;
 
     if (PrefetchCopyIntervalInBytes > 0)
       __ prfm(Address(s, 0), PLDL1KEEP);
-
-    __ cmp(count, 80/granularity);
+    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
     __ br(Assembler::HI, copy_big);
 
     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
 
@@ -929,42 +951,59 @@
 
     __ cmp(count, 32/granularity);
     __ br(Assembler::LS, copy32);
 
     // 33..64 bytes
-    __ ldp(t0, t1, Address(s, 0));
-    __ ldp(t2, t3, Address(s, 16));
-    __ ldp(t4, t5, Address(send, -32));
-    __ ldp(t6, t7, Address(send, -16));
-
-    __ stp(t0, t1, Address(d, 0));
-    __ stp(t2, t3, Address(d, 16));
-    __ stp(t4, t5, Address(dend, -32));
-    __ stp(t6, t7, Address(dend, -16));
+    if (UseSIMDForMemoryOps) {
+      __ ldpq(v0, v1, Address(s, 0));
+      __ ldpq(v2, v3, Address(send, -32));
+      __ stpq(v0, v1, Address(d, 0));
+      __ stpq(v2, v3, Address(dend, -32));
+    } else {
+      __ ldp(t0, t1, Address(s, 0));
+      __ ldp(t2, t3, Address(s, 16));
+      __ ldp(t4, t5, Address(send, -32));
+      __ ldp(t6, t7, Address(send, -16));
+
+      __ stp(t0, t1, Address(d, 0));
+      __ stp(t2, t3, Address(d, 16));
+      __ stp(t4, t5, Address(dend, -32));
+      __ stp(t6, t7, Address(dend, -16));
+    }
     __ b(finish);
 
     // 17..32 bytes
     __ bind(copy32);
     __ ldp(t0, t1, Address(s, 0));
     __ ldp(t2, t3, Address(send, -16));
     __ stp(t0, t1, Address(d, 0));
     __ stp(t2, t3, Address(dend, -16));
     __ b(finish);
 
-    // 65..80 bytes
+    // 65..80/96 bytes
+    // (96 bytes if SIMD because we do 32 bytes per instruction)
     __ bind(copy80);
-    __ ldp(t0, t1, Address(s, 0));
-    __ ldp(t2, t3, Address(s, 16));
-    __ ldp(t4, t5, Address(s, 32));
-    __ ldp(t6, t7, Address(s, 48));
-    __ ldp(t8, t9, Address(send, -16));
-
-    __ stp(t0, t1, Address(d, 0));
-    __ stp(t2, t3, Address(d, 16));
-    __ stp(t4, t5, Address(d, 32));
-    __ stp(t6, t7, Address(d, 48));
-    __ stp(t8, t9, Address(dend, -16));
+    if (UseSIMDForMemoryOps) {
+      __ ldpq(v0, v1, Address(s, 0));
+      __ ldpq(v2, v3, Address(s, 32));
+      __ ldpq(v4, v5, Address(send, -32));
+      __ stpq(v0, v1, Address(d, 0));
+      __ stpq(v2, v3, Address(d, 32));
+      __ stpq(v4, v5, Address(dend, -32));
+    } else {
+      __ ldp(t0, t1, Address(s, 0));
+      __ ldp(t2, t3, Address(s, 16));
+      __ ldp(t4, t5, Address(s, 32));
+      __ ldp(t6, t7, Address(s, 48));
+      __ ldp(t8, t9, Address(send, -16));
+
+      __ stp(t0, t1, Address(d, 0));
+      __ stp(t2, t3, Address(d, 16));
+      __ stp(t4, t5, Address(d, 32));
+      __ stp(t6, t7, Address(d, 48));
+      __ stp(t8, t9, Address(dend, -16));
+    }
     __ b(finish);
 
     // 0..16 bytes
     __ bind(copy16);
     __ cmp(count, 8/granularity);
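The small-copy paths above (copy32, the 33..64-byte path, copy80) all rely on the same overlapping-block trick instead of a length-dependent loop: load one block from the start of the source and another from its end (send/dend point one past the end), then store both; for any length in the bucket the two blocks overlap and together cover the range exactly. Below is a stand-alone C++ sketch of the 17..32-byte case (illustration only, not HotSpot code; copy_17_to_32 is an invented name, and memcpy of 8-byte chunks stands in for ldp/stp).

#include <cstddef>
#include <cstdint>
#include <cstring>

// Mirrors the copy32 branch: ldp from [s] and [send - 16], stp to [d] and [dend - 16].
void copy_17_to_32(unsigned char* d, const unsigned char* s, size_t len) {
  // assumes 17 <= len <= 32
  uint64_t t0, t1, t2, t3;
  std::memcpy(&t0, s,            8);   // first 16 bytes of the source
  std::memcpy(&t1, s + 8,        8);
  std::memcpy(&t2, s + len - 16, 8);   // last 16 bytes of the source
  std::memcpy(&t3, s + len - 8,  8);
  std::memcpy(d,            &t0, 8);   // all loads happen before any store,
  std::memcpy(d + 8,        &t1, 8);   // just as in the stub
  std::memcpy(d + len - 16, &t2, 8);
  std::memcpy(d + len - 8,  &t3, 8);
}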