src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp
changeset 50754 ccb8aa083958
child 51374 7be0084191ed
equal deleted inserted replaced
50753:4449b45900f1 50754:ccb8aa083958
       
     1 /* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
       
     2  * Copyright (c) 2018, Cavium. All rights reserved. (By BELLSOFT)
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.
       
     8  *
       
     9  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    12  * version 2 for more details (a copy is included in the LICENSE file that
       
    13  * accompanied this code).
       
    14  *
       
    15  * You should have received a copy of the GNU General Public License version
       
    16  * 2 along with this work; if not, write to the Free Software Foundation,
       
    17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    18  *
       
    19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    20  * or visit www.oracle.com if you need additional information or have any
       
    21  * questions.
       
    22  *
       
    23  */
       
    24 
       
    25 #include "precompiled.hpp"
       
    26 #include "asm/assembler.hpp"
       
    27 #include "asm/assembler.inline.hpp"
       
    28 #include "runtime/stubRoutines.hpp"
       
    29 #include "macroAssembler_aarch64.hpp"
       
    30 
       
    31 // The following code is a optimized version of fdlibm sin/cos implementation
       
    32 // (C code is in share/runtime/sharedRuntimeTrig.cpp) adapted for AARCH64.
       
    33 
       
    34 // Please refer to sin/cos approximation via polynomial and
       
    35 // trigonometric argument reduction techniques to the following literature:
       
    36 //
       
    37 // [1] Muller, Jean-Michel, Nicolas Brisebarre, Florent De Dinechin,
       
    38 // Claude-Pierre Jeannerod, Vincent Lefevre, Guillaume Melquiond,
       
    39 // Nathalie Revol, Damien Stehlé, and Serge Torres:
       
    40 // Handbook of floating-point arithmetic.
       
    41 // Springer Science & Business Media, 2009.
       
    42 // [2] K. C. Ng
       
    43 // Argument Reduction for Huge Arguments: Good to the Last Bit
       
    44 // July 13, 1992, SunPro
       
    45 //
       
    46 // HOW TO READ THIS CODE:
       
    47 // This code consists of several functions. Each function has following header:
       
    48 // 1) Description
       
    49 // 2) C-pseudo code with differences from fdlibm marked by comments starting
       
    50 //        with "NOTE". Check unmodified fdlibm code in
       
    51 //        share/runtime/SharedRuntimeTrig.cpp
       
    52 // 3) Brief textual description of changes between fdlibm and current
       
    53 //        implementation along with optimization notes (if applicable)
       
    54 // 4) Assumptions, input and output
       
    55 // 5) (Optional) additional notes about intrinsic implementation
       
    56 // Each function is separated in blocks which follow the pseudo-code structure
       
    57 //
       
    58 // HIGH-LEVEL ALGORITHM DESCRIPTION:
       
    59 //    - entry point: generate_dsin_dcos(...);
       
    60 //    - check corner cases: NaN, INF, tiny argument.
       
    61 //    - check if |x| < Pi/4. Then approximate sin/cos via polynomial (kernel_sin/kernel_cos)
       
    62 //    -- else proceed to argument reduction routine (__ieee754_rem_pio2) and
       
    63 //           use reduced argument to get result via kernel_sin/kernel_cos
       
    64 //
       
    65 // HIGH-LEVEL CHANGES BETWEEN INTRINSICS AND FDLIBM:
       
    66 // 1) two_over_pi table fdlibm representation is int[], while intrinsic version
       
    67 // has these int values converted to double representation to load converted
       
    68 // double values directly (see stubRoutines_aarch4::_two_over_pi)
       
    69 // 2) Several loops are unrolled and vectorized: see comments in code after
       
    70 // labels: SKIP_F_LOAD, RECOMP_FOR1_CHECK, RECOMP_FOR2
       
    71 // 3) fdlibm npio2_hw table now has "prefix" with constants used in
       
    72 // calculation. These constants are loaded from npio2_hw table instead of
       
    73 // constructing it in code (see stubRoutines_aarch64.cpp)
       
    74 // 4) Polynomial coefficients for sin and cos are moved to table sin_coef
       
    75 // and cos_coef to use the same optimization as in 3). It allows to load most of
       
    76 // required constants via single instruction
       
    77 //
       
    78 //
       
    79 //
       
    80 ///* __ieee754_rem_pio2(x,y)
       
    81 // *
       
    82 // * returns the remainder of x rem pi/2 in y[0]+y[1] (i.e. like x div pi/2)
       
    83 // * x is input argument, y[] is hi and low parts of reduced argument (x)
       
    84 // * uses __kernel_rem_pio2()
       
    85 // */
       
    86 // // use tables(see stubRoutines_aarch64.cpp): two_over_pi and modified npio2_hw
       
    87 //
       
    88 // BEGIN __ieee754_rem_pio2 PSEUDO CODE
       
    89 //
       
    90 //static int __ieee754_rem_pio2(double x, double *y) {
       
    91 //  double z,w,t,r,fn;
       
    92 //  double tx[3];
       
    93 //  int e0,i,j,nx,n,ix,hx,i0;
       
    94 //
       
    95 //  i0 = ((*(int*)&two24A)>>30)^1;        /* high word index */
       
    96 //  hx = *(i0+(int*)&x);          /* high word of x */
       
    97 //  ix = hx&0x7fffffff;
       
    98 //  if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
       
    99 //    if(hx>0) {
       
   100 //      z = x - pio2_1;
       
   101 //      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
       
   102 //        y[0] = z - pio2_1t;
       
   103 //        y[1] = (z-y[0])-pio2_1t;
       
   104 //      } else {                /* near pi/2, use 33+33+53 bit pi */
       
   105 //        z -= pio2_2;
       
   106 //        y[0] = z - pio2_2t;
       
   107 //        y[1] = (z-y[0])-pio2_2t;
       
   108 //      }
       
   109 //      return 1;
       
   110 //    } else {    /* negative x */
       
   111 //      z = x + pio2_1;
       
   112 //      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
       
   113 //        y[0] = z + pio2_1t;
       
   114 //        y[1] = (z-y[0])+pio2_1t;
       
   115 //      } else {                /* near pi/2, use 33+33+53 bit pi */
       
   116 //        z += pio2_2;
       
   117 //        y[0] = z + pio2_2t;
       
   118 //        y[1] = (z-y[0])+pio2_2t;
       
   119 //      }
       
   120 //      return -1;
       
   121 //    }
       
   122 //  }
       
   123 //  if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
       
   124 //    t  = fabsd(x);
       
   125 //    n  = (int) (t*invpio2+half);
       
   126 //    fn = (double)n;
       
   127 //    r  = t-fn*pio2_1;
       
   128 //    w  = fn*pio2_1t;    /* 1st round good to 85 bit */
       
   129 //    // NOTE: y[0] = r-w; is moved from if/else below to be before "if"
       
   130 //    y[0] = r-w;
       
   131 //    if(n<32&&ix!=npio2_hw[n-1]) {
       
   132 //      // y[0] = r-w;       /* quick check no cancellation */ // NOTE: moved earlier
       
   133 //    } else {
       
   134 //      j  = ix>>20;
       
   135 //      // y[0] = r-w; // NOTE: moved earlier
       
   136 //      i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
       
   137 //      if(i>16) {  /* 2nd iteration needed, good to 118 */
       
   138 //        t  = r;
       
   139 //        w  = fn*pio2_2;
       
   140 //        r  = t-w;
       
   141 //        w  = fn*pio2_2t-((t-r)-w);
       
   142 //        y[0] = r-w;
       
   143 //        i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
       
   144 //        if(i>49)  {     /* 3rd iteration need, 151 bits acc */
       
   145 //          t  = r;       /* will cover all possible cases */
       
   146 //          w  = fn*pio2_3;
       
   147 //          r  = t-w;
       
   148 //          w  = fn*pio2_3t-((t-r)-w);
       
   149 //          y[0] = r-w;
       
   150 //        }
       
   151 //      }
       
   152 //    }
       
   153 //    y[1] = (r-y[0])-w;
       
   154 //    if(hx<0)    {y[0] = -y[0]; y[1] = -y[1]; return -n;}
       
   155 //    else         return n;
       
   156 //  }
       
   157 //  /*
       
   158 //   * all other (large) arguments
       
   159 //   */
       
   160 //  // NOTE: this check is removed, because it was checked in dsin/dcos
       
   161 //  // if(ix>=0x7ff00000) {          /* x is inf or NaN */
       
   162 //  //  y[0]=y[1]=x-x; return 0;
       
   163 //  // }
       
   164 //  /* set z = scalbn(|x|,ilogb(x)-23) */
       
   165 //  *(1-i0+(int*)&z) = *(1-i0+(int*)&x);
       
   166 //  e0    = (ix>>20)-1046;        /* e0 = ilogb(z)-23; */
       
   167 //  *(i0+(int*)&z) = ix - (e0<<20);
       
   168 //
       
   169 //  // NOTE: "for" loop below in unrolled. See comments in asm code
       
   170 //  for(i=0;i<2;i++) {
       
   171 //    tx[i] = (double)((int)(z));
       
   172 //    z     = (z-tx[i])*two24A;
       
   173 //  }
       
   174 //
       
   175 //  tx[2] = z;
       
   176 //  nx = 3;
       
   177 //
       
   178 //  // NOTE: while(tx[nx-1]==zeroA) nx--;  is unrolled. See comments in asm code
       
   179 //  while(tx[nx-1]==zeroA) nx--;  /* skip zero term */
       
   180 //
       
   181 //  n  =  __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
       
   182 //  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
       
   183 //  return n;
       
   184 //}
       
   185 //
       
   186 // END __ieee754_rem_pio2 PSEUDO CODE
       
   187 //
       
   188 // Changes between fdlibm and intrinsic for __ieee754_rem_pio2:
       
   189 //     1. INF/NaN check for huge argument is removed in comparison with fdlibm
       
   190 //     code, because this check is already done in dcos/dsin code
       
   191 //     2. Most constants are now loaded from table instead of direct initialization
       
   192 //     3. Two loops are unrolled
       
   193 // Assumptions:
       
   194 //     1. Assume |X| >= PI/4
       
   195 //     2. Assume rscratch1 = 0x3fe921fb00000000  (~ PI/4)
       
   196 //     3. Assume ix = r3
       
   197 // Input and output:
       
   198 //     1. Input: X = r0
       
   199 //     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
       
   200 // NOTE: general purpose register names match local variable names in C code
       
   201 // NOTE: fpu registers are actively reused. See comments in code about their usage
       
   202 void MacroAssembler::generate__ieee754_rem_pio2(address npio2_hw,
       
   203     address two_over_pi, address pio2) {
       
   204   const long PIO2_1t = 0x3DD0B4611A626331UL;
       
   205   const long PIO2_2  = 0x3DD0B4611A600000UL;
       
   206   const long PIO2_2t = 0x3BA3198A2E037073UL;
       
   207   Label X_IS_NEGATIVE, X_IS_MEDIUM_OR_LARGE, X_IS_POSITIVE_LONG_PI, LARGE_ELSE,
       
   208       REDUCTION_DONE, X_IS_MEDIUM_BRANCH_DONE, X_IS_LARGE, NX_SET,
       
   209       X_IS_NEGATIVE_LONG_PI;
       
   210   Register X = r0, n = r2, ix = r3, jv = r4, tmp5 = r5, jx = r6,
       
   211       tmp3 = r7, iqBase = r10, ih = r11, i = r17;
       
   212     // initializing constants first
       
   213     // rscratch1 = 0x3fe921fb00000000 (see assumptions)
       
   214     movk(rscratch1, 0x3ff9, 48); // was 0x3fe921fb0..0 now it's 0x3ff921fb0..0
       
   215     mov(rscratch2, 0x4002d97c); // 3*PI/4 high word
       
   216     movk(rscratch1, 0x5440, 16); // now rscratch1 == PIO2_1
       
   217     fmovd(v1, rscratch1); // v1 = PIO2_1
       
   218     cmp(rscratch2, ix);
       
   219     br(LE, X_IS_MEDIUM_OR_LARGE);
       
   220 
       
   221     block_comment("if(ix<0x4002d97c) {...  /* |x| ~< 3pi/4 */ "); {
       
   222       cmp(X, zr);
       
   223       br(LT, X_IS_NEGATIVE);
       
   224 
       
   225       block_comment("if(hx>0) {"); {
       
   226         fsubd(v2, v0, v1); // v2 = z = x - pio2_1
       
   227         cmp(ix, rscratch1, LSR, 32);
       
   228         mov(n, 1);
       
   229         br(EQ, X_IS_POSITIVE_LONG_PI);
       
   230 
       
   231         block_comment("case: hx > 0 &&  ix!=0x3ff921fb {"); { /* 33+53 bit pi is good enough */
       
   232           mov(rscratch2, PIO2_1t);
       
   233           fmovd(v27, rscratch2);
       
   234           fsubd(v4, v2, v27); // v4 = y[0] = z - pio2_1t;
       
   235           fsubd(v5, v2, v4);
       
   236           fsubd(v5, v5, v27); // v5 = y[1] = (z-y[0])-pio2_1t
       
   237           b(REDUCTION_DONE);
       
   238         }
       
   239 
       
   240         block_comment("case: hx > 0 &*& ix==0x3ff921fb {"); { /* near pi/2, use 33+33+53 bit pi */
       
   241           bind(X_IS_POSITIVE_LONG_PI);
       
   242             mov(rscratch1, PIO2_2);
       
   243             mov(rscratch2, PIO2_2t);
       
   244             fmovd(v27, rscratch1);
       
   245             fmovd(v6, rscratch2);
       
   246             fsubd(v2, v2, v27); // z-= pio2_2
       
   247             fsubd(v4, v2, v6);  // y[0] = z - pio2_2t
       
   248             fsubd(v5, v2, v4);
       
   249             fsubd(v5, v5, v6);  // v5 = (z - y[0]) - pio2_2t
       
   250             b(REDUCTION_DONE);
       
   251         }
       
   252       }
       
   253 
       
   254       block_comment("case: hx <= 0)"); {
       
   255         bind(X_IS_NEGATIVE);
       
   256           faddd(v2, v0, v1); // v2 = z = x + pio2_1
       
   257           cmp(ix, rscratch1, LSR, 32);
       
   258           mov(n, -1);
       
   259           br(EQ, X_IS_NEGATIVE_LONG_PI);
       
   260 
       
   261           block_comment("case: hx <= 0 && ix!=0x3ff921fb) {"); { /* 33+53 bit pi is good enough */
       
   262             mov(rscratch2, PIO2_1t);
       
   263             fmovd(v27, rscratch2);
       
   264             faddd(v4, v2, v27); // v4 = y[0] = z + pio2_1t;
       
   265             fsubd(v5, v2, v4);
       
   266             faddd(v5, v5, v27); // v5 = y[1] = (z-y[0]) + pio2_1t
       
   267             b(REDUCTION_DONE);
       
   268           }
       
   269 
       
   270           block_comment("case: hx <= 0 && ix==0x3ff921fb"); { /* near pi/2, use 33+33+53 bit pi */
       
   271             bind(X_IS_NEGATIVE_LONG_PI);
       
   272               mov(rscratch1, PIO2_2);
       
   273               mov(rscratch2, PIO2_2t);
       
   274               fmovd(v27, rscratch1);
       
   275               fmovd(v6, rscratch2);
       
   276               faddd(v2, v2, v27); // z += pio2_2
       
   277               faddd(v4, v2, v6);  // y[0] = z + pio2_2t
       
   278               fsubd(v5, v2, v4);
       
   279               faddd(v5, v5, v6);  // v5 = (z - y[0]) + pio2_2t
       
   280               b(REDUCTION_DONE);
       
   281           }
       
   282       }
       
   283   }
       
   284   bind(X_IS_MEDIUM_OR_LARGE);
       
   285     mov(rscratch1, 0x413921fb);
       
   286     cmp(ix, rscratch1); // ix < = 0x413921fb ?
       
   287     br(GT, X_IS_LARGE);
       
   288 
       
   289     block_comment("|x| ~<= 2^19*(pi/2), medium size"); {
       
   290       lea(ih, ExternalAddress(npio2_hw));
       
   291       ld1(v4, v5, v6, v7, T1D, ih);
       
   292       fabsd(v31, v0);          // v31 = t = |x|
       
   293       add(ih, ih, 64);
       
   294       fmaddd(v2, v31, v5, v4); // v2 = t * invpio2 + half (invpio2 = 53 bits of 2/pi, half = 0.5)
       
   295       fcvtzdw(n, v2);          // n = (int) v2
       
   296       frintzd(v2, v2);
       
   297       fmsubd(v3, v2, v6, v31); // v3 = r = t - fn * pio2_1
       
   298       fmuld(v26, v2, v7);      // v26 = w = fn * pio2_1t
       
   299       fsubd(v4, v3, v26);      // y[0] = r - w. Calculated before branch
       
   300       cmp(n, 32);
       
   301       br(GT, LARGE_ELSE);
       
   302       subw(tmp5, n, 1);        // tmp5 = n - 1
       
   303       ldrw(jv, Address(ih, tmp5, Address::lsl(2)));
       
   304       cmp(ix, jv);
       
   305       br(NE, X_IS_MEDIUM_BRANCH_DONE);
       
   306 
       
   307       block_comment("else block for if(n<32&&ix!=npio2_hw[n-1])"); {
       
   308         bind(LARGE_ELSE);
       
   309           fmovd(jx, v4);
       
   310           lsr(tmp5, ix, 20);                       // j = ix >> 20
       
   311           lsl(jx, jx, 1);
       
   312           sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1);   // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
       
   313 
       
   314           block_comment("if(i>16)"); {
       
   315             cmp(tmp3, 16);
       
   316             br(LE, X_IS_MEDIUM_BRANCH_DONE);
       
   317             // i > 16. 2nd iteration needed
       
   318             ldpd(v6, v7, Address(ih, -32));
       
   319             fmovd(v28, v3);                        // t = r
       
   320             fmuld(v29, v2, v6);                    // w = v29 = fn * pio2_2
       
   321             fsubd(v3, v28, v29);                   // r = t - w
       
   322             fsubd(v31, v28, v3);                   // v31 = (t - r)
       
   323             fsubd(v31, v29, v31);                  // v31 = w - (t - r) = - ((t - r) - w)
       
   324             fmaddd(v26, v2, v7, v31);              // v26 = w = fn*pio2_2t - ((t - r) - w)
       
   325             fsubd(v4, v3, v26);                    // y[0] = r - w
       
   326             fmovd(jx, v4);
       
   327             lsl(jx, jx, 1);
       
   328             sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1); // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
       
   329 
       
   330             block_comment("if(i>49)"); {
       
   331               cmp(tmp3, 49);
       
   332               br(LE, X_IS_MEDIUM_BRANCH_DONE);
       
   333               // 3rd iteration need, 151 bits acc
       
   334               ldpd(v6, v7, Address(ih, -16));
       
   335               fmovd(v28, v3);                      // save "r"
       
   336               fmuld(v29, v2, v6);                  // v29 = fn * pio2_3
       
   337               fsubd(v3, v28, v29);                 // r = r - w
       
   338               fsubd(v31, v28, v3);                 // v31 = (t - r)
       
   339               fsubd(v31, v29, v31);                // v31 = w - (t - r) = - ((t - r) - w)
       
   340               fmaddd(v26, v2, v7, v31);            // v26 = w = fn*pio2_3t - ((t - r) - w)
       
   341               fsubd(v4, v3, v26);                  // y[0] = r - w
       
   342             }
       
   343           }
       
   344       }
       
   345     block_comment("medium x tail"); {
       
   346       bind(X_IS_MEDIUM_BRANCH_DONE);
       
   347         fsubd(v5, v3, v4);                         // v5 = y[1] = (r - y[0])
       
   348         fsubd(v5, v5, v26);                        // v5 = y[1] = (r - y[0]) - w
       
   349         cmp(X, zr);
       
   350         br(GT, REDUCTION_DONE);
       
   351         fnegd(v4, v4);
       
   352         negw(n, n);
       
   353         fnegd(v5, v5);
       
   354         b(REDUCTION_DONE);
       
   355     }
       
   356   }
       
   357 
       
   358   block_comment("all other (large) arguments"); {
       
   359     bind(X_IS_LARGE);
       
   360       lsr(rscratch1, ix, 20);                      // ix >> 20
       
   361       movz(tmp5, 0x4170, 48);
       
   362       subw(rscratch1, rscratch1, 1046);            // e0
       
   363       fmovd(v10, tmp5);                            // init two24A value
       
   364       subw(jv, ix, rscratch1, LSL, 20);            // ix - (e0<<20)
       
   365       lsl(jv, jv, 32);
       
   366       subw(rscratch2, rscratch1, 3);
       
   367       bfm(jv, X, 0, 31);                           // jv = z
       
   368       movw(i, 24);
       
   369       fmovd(v26, jv);                              // v26 = z
       
   370 
       
   371       block_comment("unrolled for(i=0;i<2;i++) {tx[i] = (double)((int)(z));z = (z-tx[i])*two24A;}"); {
       
   372         // tx[0,1,2] = v6,v7,v26
       
   373         frintzd(v6, v26);                          // v6 = (double)((int)v26)
       
   374         sdivw(jv, rscratch2, i);                   // jv = (e0 - 3)/24
       
   375         fsubd(v26, v26, v6);
       
   376         sub(sp, sp, 560);
       
   377         fmuld(v26, v26, v10);
       
   378         frintzd(v7, v26);                          // v7 = (double)((int)v26)
       
   379         movw(jx, 2); // calculate jx as nx - 1, which is initially 2. Not a part of unrolled loop
       
   380         fsubd(v26, v26, v7);
       
   381       }
       
   382 
       
   383       block_comment("nx calculation with unrolled while(tx[nx-1]==zeroA) nx--;"); {
       
   384         fcmpd(v26, 0.0d);                          // if NE then jx == 2. else it's 1 or 0
       
   385         add(iqBase, sp, 480);                      // base of iq[]
       
   386         fmuld(v3, v26, v10);
       
   387         br(NE, NX_SET);
       
   388         fcmpd(v7, 0.0d);                           // v7 == 0 => jx = 0. Else jx = 1
       
   389         csetw(jx, NE);
       
   390       }
       
   391     bind(NX_SET);
       
   392       generate__kernel_rem_pio2(two_over_pi, pio2);
       
   393       // now we have y[0] = v4, y[1] = v5 and n = r2
       
   394       cmp(X, zr);
       
   395       br(GE, REDUCTION_DONE);
       
   396       fnegd(v4, v4);
       
   397       fnegd(v5, v5);
       
   398       negw(n, n);
       
   399   }
       
   400   bind(REDUCTION_DONE);
       
   401 }
       
   402 
       
   403 ///*
       
   404 // * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
       
   405 // * double x[],y[]; int e0,nx,prec; int ipio2[];
       
   406 // *
       
   407 // * __kernel_rem_pio2 return the last three digits of N with
       
   408 // *              y = x - N*pi/2
       
   409 // * so that |y| < pi/2.
       
   410 // *
       
   411 // * The method is to compute the integer (mod 8) and fraction parts of
       
   412 // * (2/pi)*x without doing the full multiplication. In general we
       
   413 // * skip the part of the product that are known to be a huge integer (
       
   414 // * more accurately, = 0 mod 8 ). Thus the number of operations are
       
   415 // * independent of the exponent of the input.
       
   416 // *
       
   417 // * NOTE: 2/pi int representation is converted to double
       
   418 // * // (2/pi) is represented by an array of 24-bit integers in ipio2[].
       
   419 // *
       
   420 // * Input parameters:
       
   421 // *      x[]     The input value (must be positive) is broken into nx
       
   422 // *              pieces of 24-bit integers in double precision format.
       
   423 // *              x[i] will be the i-th 24 bit of x. The scaled exponent
       
   424 // *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
       
   425 // *              match x's up to 24 bits.
       
   426 // *
       
   427 // *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
       
   428 // *                      e0 = ilogb(z)-23
       
   429 // *                      z  = scalbn(z,-e0)
       
   430 // *              for i = 0,1,2
       
   431 // *                      x[i] = floor(z)
       
   432 // *                      z    = (z-x[i])*2**24
       
   433 // *
       
   434 // *
       
   435 // *      y[]     ouput result in an array of double precision numbers.
       
   436 // *              The dimension of y[] is:
       
   437 // *                      24-bit  precision       1
       
   438 // *                      53-bit  precision       2
       
   439 // *                      64-bit  precision       2
       
   440 // *                      113-bit precision       3
       
   441 // *              The actual value is the sum of them. Thus for 113-bit
       
   442 // *              precsion, one may have to do something like:
       
   443 // *
       
   444 // *              long double t,w,r_head, r_tail;
       
   445 // *              t = (long double)y[2] + (long double)y[1];
       
   446 // *              w = (long double)y[0];
       
   447 // *              r_head = t+w;
       
   448 // *              r_tail = w - (r_head - t);
       
   449 // *
       
   450 // *      e0      The exponent of x[0]
       
   451 // *
       
   452 // *      nx      dimension of x[]
       
   453 // *
       
   454 // *      prec    an interger indicating the precision:
       
   455 // *                      0       24  bits (single)
       
   456 // *                      1       53  bits (double)
       
   457 // *                      2       64  bits (extended)
       
   458 // *                      3       113 bits (quad)
       
   459 // *
       
   460 // *      NOTE: ipio2[] array below is converted to double representation
       
   461 // *      //ipio2[]
       
   462 // *      //        integer array, contains the (24*i)-th to (24*i+23)-th
       
   463 // *      //        bit of 2/pi after binary point. The corresponding
       
   464 // *      //        floating value is
       
   465 // *
       
   466 // *                      ipio2[i] * 2^(-24(i+1)).
       
   467 // *
       
   468 // * Here is the description of some local variables:
       
   469 // *
       
   470 // *      jk      jk+1 is the initial number of terms of ipio2[] needed
       
   471 // *              in the computation. The recommended value is 2,3,4,
       
   472 // *              6 for single, double, extended,and quad.
       
   473 // *
       
   474 // *      jz      local integer variable indicating the number of
       
   475 // *              terms of ipio2[] used.
       
   476 // *
       
   477 // *      jx      nx - 1
       
   478 // *
       
   479 // *      jv      index for pointing to the suitable ipio2[] for the
       
   480 // *              computation. In general, we want
       
   481 // *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
       
   482 // *              is an integer. Thus
       
   483 // *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
       
   484 // *              Hence jv = max(0,(e0-3)/24).
       
   485 // *
       
   486 // *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
       
   487 // *
       
   488 // *      q[]     double array with integral value, representing the
       
   489 // *              24-bits chunk of the product of x and 2/pi.
       
   490 // *
       
   491 // *      q0      the corresponding exponent of q[0]. Note that the
       
   492 // *              exponent for q[i] would be q0-24*i.
       
   493 // *
       
   494 // *      PIo2[]  double precision array, obtained by cutting pi/2
       
   495 // *              into 24 bits chunks.
       
   496 // *
       
   497 // *      f[]     ipio2[] in floating point
       
   498 // *
       
   499 // *      iq[]    integer array by breaking up q[] in 24-bits chunk.
       
   500 // *
       
   501 // *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
       
   502 // *
       
   503 // *      ih      integer. If >0 it indicates q[] is >= 0.5, hence
       
   504 // *              it also indicates the *sign* of the result.
       
   505 // *
       
   506 // */
       
   507 //
       
   508 // Use PIo2 table(see stubRoutines_aarch64.cpp)
       
   509 //
       
   510 // BEGIN __kernel_rem_pio2 PSEUDO CODE
       
   511 //
       
   512 //static int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, /* NOTE: converted to double */ const double *ipio2 // const int *ipio2) {
       
   513 //  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
       
   514 //  double z,fw,f[20],fq[20],q[20];
       
   515 //
       
   516 //  /* initialize jk*/
       
   517 //  // jk = init_jk[prec]; // NOTE: prec==2 for double. jk is always 4.
       
   518 //  jp = jk; // NOTE: always 4
       
   519 //
       
   520 //  /* determine jx,jv,q0, note that 3>q0 */
       
   521 //  jx =  nx-1;
       
   522 //  jv = (e0-3)/24; if(jv<0) jv=0;
       
   523 //  q0 =  e0-24*(jv+1);
       
   524 //
       
   525 //  /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
       
   526 //  j = jv-jx; m = jx+jk;
       
   527 //
       
   528 //  // NOTE: split into two for-loops: one with zeroB and one with ipio2[j]. It
       
   529 //  //       allows the use of wider loads/stores
       
   530 //  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; //(double) ipio2[j];
       
   531 //
       
   532 //  // NOTE: unrolled and vectorized "for". See comments in asm code
       
   533 //  /* compute q[0],q[1],...q[jk] */
       
   534 //  for (i=0;i<=jk;i++) {
       
   535 //    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
       
   536 //  }
       
   537 //
       
   538 //  jz = jk;
       
   539 //recompute:
       
   540 //  /* distill q[] into iq[] reversingly */
       
   541 //  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
       
   542 //    fw    =  (double)((int)(twon24* z));
       
   543 //    iq[i] =  (int)(z-two24B*fw);
       
   544 //    z     =  q[j-1]+fw;
       
   545 //  }
       
   546 //
       
   547 //  /* compute n */
       
   548 //  z  = scalbnA(z,q0);           /* actual value of z */
       
   549 //  z -= 8.0*floor(z*0.125);              /* trim off integer >= 8 */
       
   550 //  n  = (int) z;
       
   551 //  z -= (double)n;
       
   552 //  ih = 0;
       
   553 //  if(q0>0) {    /* need iq[jz-1] to determine n */
       
   554 //    i  = (iq[jz-1]>>(24-q0)); n += i;
       
   555 //    iq[jz-1] -= i<<(24-q0);
       
   556 //    ih = iq[jz-1]>>(23-q0);
       
   557 //  }
       
   558 //  else if(q0==0) ih = iq[jz-1]>>23;
       
   559 //  else if(z>=0.5) ih=2;
       
   560 //
       
   561 //  if(ih>0) {    /* q > 0.5 */
       
   562 //    n += 1; carry = 0;
       
   563 //    for(i=0;i<jz ;i++) {        /* compute 1-q */
       
   564 //      j = iq[i];
       
   565 //      if(carry==0) {
       
   566 //        if(j!=0) {
       
   567 //          carry = 1; iq[i] = 0x1000000- j;
       
   568 //        }
       
   569 //      } else  iq[i] = 0xffffff - j;
       
   570 //    }
       
   571 //    if(q0>0) {          /* rare case: chance is 1 in 12 */
       
   572 //      switch(q0) {
       
   573 //      case 1:
       
   574 //        iq[jz-1] &= 0x7fffff; break;
       
   575 //      case 2:
       
   576 //        iq[jz-1] &= 0x3fffff; break;
       
   577 //      }
       
   578 //    }
       
   579 //    if(ih==2) {
       
   580 //      z = one - z;
       
   581 //      if(carry!=0) z -= scalbnA(one,q0);
       
   582 //    }
       
   583 //  }
       
   584 //
       
   585 //  /* check if recomputation is needed */
       
   586 //  if(z==zeroB) {
       
   587 //    j = 0;
       
   588 //    for (i=jz-1;i>=jk;i--) j |= iq[i];
       
   589 //    if(j==0) { /* need recomputation */
       
   590 //      for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
       
   591 //
       
   592 //      for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
       
   593 //        f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i];
       
   594 //        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
       
   595 //        q[i] = fw;
       
   596 //      }
       
   597 //      jz += k;
       
   598 //      goto recompute;
       
   599 //    }
       
   600 //  }
       
   601 //
       
   602 //  /* chop off zero terms */
       
   603 //  if(z==0.0) {
       
   604 //    jz -= 1; q0 -= 24;
       
   605 //    while(iq[jz]==0) { jz--; q0-=24;}
       
   606 //  } else { /* break z into 24-bit if necessary */
       
   607 //    z = scalbnA(z,-q0);
       
   608 //    if(z>=two24B) {
       
   609 //      fw = (double)((int)(twon24*z));
       
   610 //      iq[jz] = (int)(z-two24B*fw);
       
   611 //      jz += 1; q0 += 24;
       
   612 //      iq[jz] = (int) fw;
       
   613 //    } else iq[jz] = (int) z ;
       
   614 //  }
       
   615 //
       
   616 //  /* convert integer "bit" chunk to floating-point value */
       
   617 //  fw = scalbnA(one,q0);
       
   618 //  for(i=jz;i>=0;i--) {
       
   619 //    q[i] = fw*(double)iq[i]; fw*=twon24;
       
   620 //  }
       
   621 //
       
   622 //  /* compute PIo2[0,...,jp]*q[jz,...,0] */
       
   623 //  for(i=jz;i>=0;i--) {
       
   624 //    for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
       
   625 //    fq[jz-i] = fw;
       
   626 //  }
       
   627 //
       
   628 //  // NOTE: switch below is eliminated, because prec is always 2 for doubles
       
   629 //  /* compress fq[] into y[] */
       
   630 //  //switch(prec) {
       
   631 //  //case 0:
       
   632 //  //  fw = 0.0;
       
   633 //  //  for (i=jz;i>=0;i--) fw += fq[i];
       
   634 //  //  y[0] = (ih==0)? fw: -fw;
       
   635 //  //  break;
       
   636 //  //case 1:
       
   637 //  //case 2:
       
   638 //    fw = 0.0;
       
   639 //    for (i=jz;i>=0;i--) fw += fq[i];
       
   640 //    y[0] = (ih==0)? fw: -fw;
       
   641 //    fw = fq[0]-fw;
       
   642 //    for (i=1;i<=jz;i++) fw += fq[i];
       
   643 //    y[1] = (ih==0)? fw: -fw;
       
   644 //  //  break;
       
   645 //  //case 3:       /* painful */
       
   646 //  //  for (i=jz;i>0;i--) {
       
   647 //  //    fw      = fq[i-1]+fq[i];
       
   648 //  // fq[i]  += fq[i-1]-fw;
       
   649 //  //    fq[i-1] = fw;
       
   650 //  //  }
       
   651 //  //  for (i=jz;i>1;i--) {
       
   652 //  //    fw      = fq[i-1]+fq[i];
       
   653 //  //    fq[i]  += fq[i-1]-fw;
       
   654 //  //    fq[i-1] = fw;
       
   655 //  //  }
       
   656 //  //  for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
       
   657 //  //  if(ih==0) {
       
   658 //  //    y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
       
   659 //  //  } else {
       
   660 //  //    y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
       
   661 //  //  }
       
   662 //  //}
       
   663 //  return n&7;
       
   664 //}
       
   665 //
       
   666 // END __kernel_rem_pio2 PSEUDO CODE
       
   667 //
       
   668 // Changes between fdlibm and intrinsic:
       
   669 //     1. One loop is unrolled and vectorized (see comments in code)
       
   670 //     2. One loop is split into 2 loops (see comments in code)
       
   671 //     3. Non-double code is removed(last switch). Sevaral variables became
       
   672 //         constants because of that (see comments in code)
       
   673 //     4. Use of jx, which is nx-1 instead of nx
       
   674 // Assumptions:
       
   675 //     1. Assume |X| >= PI/4
       
   676 // Input and output:
       
   677 //     1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
       
   678 //     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
       
   679 // NOTE: general purpose register names match local variable names in C code
       
   680 // NOTE: fpu registers are actively reused. See comments in code about their usage
       
   681 void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
       
   682   Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
       
   683       RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
       
   684       INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
       
   685       Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
       
   686       RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
       
   687       CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
       
   688       IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
       
   689       RECOMP_FOR1_CHECK;
       
   690   Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
       
   691       tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
       
   692       jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18;
       
   693     // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
       
   694     // jx = nx - 1
       
   695     lea(twoOverPiBase, ExternalAddress(two_over_pi));
       
   696     cmpw(jv, zr);
       
   697     addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
       
   698     cselw(jv, jv, zr, GE);
       
   699     fmovd(v26, 0.0d);
       
   700     addw(tmp5, jv, 1);                    // jv+1
       
   701     subsw(j, jv, jx);
       
   702     add(qBase, sp, 320);                  // base of q[]
       
   703     msubw(rscratch1, i, tmp5, rscratch1); // q0 =  e0-24*(jv+1)
       
   704     // use double f[20], fq[20], q[20], iq[20] on stack, which is
       
   705     // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
       
   706     // will contain f[20], fq[20], q[20], iq[20]
       
   707     // now initialize f[20] indexes 0..m (inclusive)
       
   708     // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
       
   709     mov(tmp5, sp);
       
   710 
       
   711     block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
       
   712         eorw(i, i, i);
       
   713         br(GE, INIT_F_COPY);
       
   714       bind(INIT_F_ZERO);
       
   715         stpq(v26, v26, Address(post(tmp5, 32)));
       
   716         addw(i, i, 4);
       
   717         addsw(j, j, 4);
       
   718         br(LT, INIT_F_ZERO);
       
   719         subw(i, i, j);
       
   720         movw(j, zr);
       
   721       bind(INIT_F_COPY);
       
   722         add(tmp1, twoOverPiBase, j, LSL, 3); // ipio2[j] start address
       
   723         ld1(v18, v19, v20, v21, T16B, tmp1);
       
   724         add(tmp5, sp, i, ext::uxtx, 3);
       
   725         st1(v18, v19, v20, v21, T16B, tmp5);
       
   726     }
       
   727     // v18..v21 can actually contain f[0..7]
       
   728     cbz(i, SKIP_F_LOAD); // i == 0 => f[i] == f[0] => already loaded
       
   729     ld1(v18, v19, v20, v21, T2D, Address(sp)); // load f[0..7]
       
   730   bind(SKIP_F_LOAD);
       
   731     // calculate 2^q0 and 2^-q0, which we'll need further.
       
   732     // q0 is exponent. So, calculate biased exponent(q0+1023)
       
   733     negw(tmp4, rscratch1);
       
   734     addw(tmp5, rscratch1, 1023);
       
   735     addw(tmp4, tmp4, 1023);
       
   736     // Unroll following for(s) depending on jx in [0,1,2]
       
   737     // for (i=0;i<=jk;i++) {
       
   738     //   for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
       
   739     // }
       
   740     // Unrolling for jx == 0 case:
       
   741     //   q[0] = x[0] * f[0]
       
   742     //   q[1] = x[0] * f[1]
       
   743     //   q[2] = x[0] * f[2]
       
   744     //   q[3] = x[0] * f[3]
       
   745     //   q[4] = x[0] * f[4]
       
   746     //
       
   747     // Vectorization for unrolled jx == 0 case:
       
   748     //   {q[0], q[1]} = {f[0], f[1]} * x[0]
       
   749     //   {q[2], q[3]} = {f[2], f[3]} * x[0]
       
   750     //   q[4] = f[4] * x[0]
       
   751     //
       
   752     // Unrolling for jx == 1 case:
       
   753     //   q[0] = x[0] * f[1] + x[1] * f[0]
       
   754     //   q[1] = x[0] * f[2] + x[1] * f[1]
       
   755     //   q[2] = x[0] * f[3] + x[1] * f[2]
       
   756     //   q[3] = x[0] * f[4] + x[1] * f[3]
       
   757     //   q[4] = x[0] * f[5] + x[1] * f[4]
       
   758     //
       
   759     // Vectorization for unrolled jx == 1 case:
       
   760     //   {q[0], q[1]} = {f[0], f[1]} * x[1]
       
   761     //   {q[2], q[3]} = {f[2], f[3]} * x[1]
       
   762     //   q[4] = f[4] * x[1]
       
   763     //   {q[0], q[1]} += {f[1], f[2]} * x[0]
       
   764     //   {q[2], q[3]} += {f[3], f[4]} * x[0]
       
   765     //   q[4] += f[5] * x[0]
       
   766     //
       
   767     // Unrolling for jx == 2 case:
       
   768     //   q[0] = x[0] * f[2] + x[1] * f[1] + x[2] * f[0]
       
   769     //   q[1] = x[0] * f[3] + x[1] * f[2] + x[2] * f[1]
       
   770     //   q[2] = x[0] * f[4] + x[1] * f[3] + x[2] * f[2]
       
   771     //   q[3] = x[0] * f[5] + x[1] * f[4] + x[2] * f[3]
       
   772     //   q[4] = x[0] * f[6] + x[1] * f[5] + x[2] * f[4]
       
   773     //
       
   774     // Vectorization for unrolled jx == 2 case:
       
   775     //   {q[0], q[1]} = {f[0], f[1]} * x[2]
       
   776     //   {q[2], q[3]} = {f[2], f[3]} * x[2]
       
   777     //   q[4] = f[4] * x[2]
       
   778     //   {q[0], q[1]} += {f[1], f[2]} * x[1]
       
   779     //   {q[2], q[3]} += {f[3], f[4]} * x[1]
       
   780     //   q[4] += f[5] * x[1]
       
   781     //   {q[0], q[1]} += {f[2], f[3]} * x[0]
       
   782     //   {q[2], q[3]} += {f[4], f[5]} * x[0]
       
   783     //   q[4] += f[6] * x[0]
       
   784   block_comment("unrolled and vectorized computation of q[0]..q[jk]"); {
       
   785       cmpw(jx, 1);
       
   786       lsl(tmp5, tmp5, 52);                     // now it's 2^q0 double value
       
   787       lsl(tmp4, tmp4, 52);                     // now it's 2^-q0 double value
       
   788       br(LT, JX_IS_0);
       
   789       add(i, sp, 8);
       
   790       ldpq(v26, v27, i);                       // load f[1..4]
       
   791       br(GT, JX_IS_2);
       
   792       // jx == 1
       
   793       fmulxvs(v28, T2D, v18, v7);              // f[0,1] * x[1]
       
   794       fmulxvs(v29, T2D, v19, v7);              // f[2,3] * x[1]
       
   795       fmuld(v30, v20, v7);                     // f[4] * x[1]
       
   796       fmlavs(v28, T2D, v26, v6, 0);
       
   797       fmlavs(v29, T2D, v27, v6, 0);
       
   798       fmlavs(v30, T2D, v6, v20, 1);            // v30 += f[5] * x[0]
       
   799       b(Q_DONE);
       
   800     bind(JX_IS_2);
       
   801       fmulxvs(v28, T2D, v18, v3);              // f[0,1] * x[2]
       
   802       fmulxvs(v29, T2D, v19, v3);              // f[2,3] * x[2]
       
   803       fmuld(v30, v20, v3);                     // f[4] * x[2]
       
   804       fmlavs(v28, T2D, v26, v7, 0);
       
   805       fmlavs(v29, T2D, v27, v7, 0);
       
   806       fmlavs(v30, T2D, v7, v20, 1);            // v30 += f[5] * x[1]
       
   807       fmlavs(v28, T2D, v19, v6, 0);
       
   808       fmlavs(v29, T2D, v20, v6, 0);
       
   809       fmlavs(v30, T2D, v6, v21, 0);            // v30 += f[6] * x[0]
       
   810       b(Q_DONE);
       
   811     bind(JX_IS_0);
       
   812       fmulxvs(v28, T2D, v18, v6);              // f[0,1] * x[0]
       
   813       fmulxvs(v29, T2D, v19, v6);              // f[2,3] * x[0]
       
   814       fmuld(v30, v20, v6);                     // f[4] * x[0]
       
   815     bind(Q_DONE);
       
   816       st1(v28, v29, v30, T2D, Address(qBase)); // save calculated q[0]...q[jk]
       
   817   }
       
   818   movz(i, 0x3E70, 48);
       
   819   movw(jz, 4);
       
   820   fmovd(v17, i);                               // v17 = twon24
       
   821   fmovd(v30, tmp5);                            // 2^q0
       
   822   fmovd(v21, 0.125d);
       
   823   fmovd(v20, 8.0d);
       
   824   fmovd(v22, tmp4);                            // 2^-q0
       
   825 
       
   826   block_comment("recompute loop"); {
       
   827     bind(RECOMPUTE);
       
   828       //  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
       
   829       //    fw    =  (double)((int)(twon24* z));
       
   830       //    iq[i] =  (int)(z-two24A*fw);
       
   831       //    z     =  q[j-1]+fw;
       
   832       //  }
       
   833       block_comment("distill q[] into iq[] reversingly"); {
       
   834           eorw(i, i, i);
       
   835           movw(j, jz);
       
   836           add(tmp2, qBase, jz, LSL, 3);                    // q[jz] address
       
   837           ldrd(v18, post(tmp2, -8));                       // z = q[j] and moving address to q[j-1]
       
   838         bind(RECOMP_FIRST_FOR);
       
   839           ldrd(v27, post(tmp2, -8));
       
   840           fmuld(v29, v17, v18);                            // twon24*z
       
   841           frintzd(v29, v29);                               // (double)(int)
       
   842           fmsubd(v28, v10, v29, v18);                      // v28 = z-two24A*fw
       
   843           fcvtzdw(tmp1, v28);                              // (int)(z-two24A*fw)
       
   844           strw(tmp1, Address(iqBase, i, Address::lsl(2)));
       
   845           faddd(v18, v27, v29);
       
   846           add(i, i, 1);
       
   847           subs(j, j, 1);
       
   848           br(GT, RECOMP_FIRST_FOR);
       
   849       }
       
   850       // compute n
       
   851       fmuld(v18, v18, v30);
       
   852       fmuld(v2, v18, v21);
       
   853       frintmd(v2, v2);                                     // v2 = floor(v2) == rounding towards -inf
       
   854       fmsubd(v18, v2, v20, v18);                           // z -= 8.0*floor(z*0.125);
       
   855       movw(ih, 2);
       
   856       frintzd(v2, v18);                                    // v2 = (double)((int)z)
       
   857       fcvtzdw(n, v18);                                     // n  = (int) z;
       
   858       fsubd(v18, v18, v2);                                 // z -= (double)n;
       
   859 
       
   860       block_comment("q0-dependent initialization"); {
       
   861           cmpw(rscratch1, 0);                              // if (q0 > 0)
       
   862           br(LT, Q0_ZERO_CMP_LT);
       
   863           subw(j, jz, 1);                                  // j = jz - 1
       
   864           ldrw(tmp2, Address(iqBase, j, Address::lsl(2))); // tmp2 = iq[jz-1]
       
   865           br(EQ, Q0_ZERO_CMP_EQ);
       
   866           movw(tmp4, 24);
       
   867           subw(tmp4, tmp4, rscratch1);                     // == 24 - q0
       
   868           lsrvw(i, tmp2, tmp4);                            // i = iq[jz-1] >> (24-q0)
       
   869           lslvw(tmp5, i, tmp4);
       
   870           subw(tmp2, tmp2, tmp5);                          // iq[jz-1] -= i<<(24-q0);
       
   871           strw(tmp2, Address(iqBase, j, Address::lsl(2))); // store iq[jz-1]
       
   872           subw(rscratch2, tmp4, 1);                        // == 23 - q0
       
   873           addw(n, n, i);                                   // n+=i
       
   874           lsrvw(ih, tmp2, rscratch2);                      // ih = iq[jz-1] >> (23-q0)
       
   875           b(Q0_ZERO_CMP_DONE);
       
   876         bind(Q0_ZERO_CMP_EQ);
       
   877           lsr(ih, tmp2, 23);                               // ih = iq[z-1] >> 23
       
   878           b(Q0_ZERO_CMP_DONE);
       
   879         bind(Q0_ZERO_CMP_LT);
       
   880           fmovd(v4, 0.5d);
       
   881           fcmpd(v18, v4);
       
   882           cselw(ih, zr, ih, LT);                           // if (z<0.5) ih = 0
       
   883       }
       
   884     bind(Q0_ZERO_CMP_DONE);
       
   885       cmpw(ih, zr);
       
   886       br(LE, IH_HANDLED);
       
   887 
       
   888     block_comment("if(ih>) {"); {
       
   889       // use rscratch2 as carry
       
   890 
       
   891       block_comment("for(i=0;i<jz ;i++) {...}"); {
       
   892           addw(n, n, 1);
       
   893           eorw(i, i, i);
       
   894           eorw(rscratch2, rscratch2, rscratch2);
       
   895         bind(IH_FOR);
       
   896           ldrw(j, Address(iqBase, i, Address::lsl(2)));    // j = iq[i]
       
   897           movw(tmp3, 0x1000000);
       
   898           subw(tmp3, tmp3, rscratch2);
       
   899           cbnzw(rscratch2, IH_FOR_STORE);
       
   900           cbzw(j, IH_FOR_INCREMENT);
       
   901           movw(rscratch2, 1);
       
   902         bind(IH_FOR_STORE);
       
   903           subw(tmp3, tmp3, j);
       
   904           strw(tmp3, Address(iqBase, i, Address::lsl(2))); // iq[i] = 0xffffff - j
       
   905         bind(IH_FOR_INCREMENT);
       
   906           addw(i, i, 1);
       
   907           cmpw(i, jz);
       
   908           br(LT, IH_FOR);
       
   909       }
       
   910 
       
   911       block_comment("if(q0>0) {"); {
       
   912         cmpw(rscratch1, zr);
       
   913         br(LE, IH_AFTER_SWITCH);
       
   914         // tmp3 still has iq[jz-1] value. no need to reload
       
   915         // now, zero high tmp3 bits (rscratch1 number of bits)
       
   916         movw(j, -1);
       
   917         subw(i, jz, 1);                                    // set i to jz-1
       
   918         lsrv(j, j, rscratch1);
       
   919         andw(tmp3, tmp3, j, LSR, 8);                       // we have 24-bit-based constants
       
   920         strw(tmp3, Address(iqBase, i, Address::lsl(2)));   // save iq[jz-1]
       
   921       }
       
   922       bind(IH_AFTER_SWITCH);
       
   923         cmpw(ih, 2);
       
   924         br(NE, IH_HANDLED);
       
   925 
       
   926         block_comment("if(ih==2) {"); {
       
   927           fmovd(v25, 1.0d);
       
   928           fsubd(v18, v25, v18);                            // z = one - z;
       
   929           cbzw(rscratch2, IH_HANDLED);
       
   930           fsubd(v18, v18, v30);                            // z -= scalbnA(one,q0);
       
   931         }
       
   932     }
       
   933     bind(IH_HANDLED);
       
   934       // check if recomputation is needed
       
   935       fcmpd(v18, 0.0d);
       
   936       br(NE, RECOMP_CHECK_DONE_NOT_ZERO);
       
   937 
       
   938       block_comment("if(z==zeroB) {"); {
       
   939 
       
   940         block_comment("for (i=jz-1;i>=jk;i--) j |= iq[i];"); {
       
   941             subw(i, jz, 1);
       
   942             eorw(j, j, j);
       
   943             b(RECOMP_FOR1_CHECK);
       
   944           bind(RECOMP_FOR1);
       
   945             ldrw(tmp1, Address(iqBase, i, Address::lsl(2)));
       
   946             orrw(j, j, tmp1);
       
   947             subw(i, i, 1);
       
   948           bind(RECOMP_FOR1_CHECK);
       
   949             cmpw(i, 4);
       
   950             br(GE, RECOMP_FOR1);
       
   951         }
       
   952         cbnzw(j, RECOMP_CHECK_DONE);
       
   953 
       
   954         block_comment("if(j==0) {"); {
       
   955             // for(k=1;iq[jk-k]==0;k++); // let's unroll it. jk == 4. So, read
       
   956             // iq[3], iq[2], iq[1], iq[0] until non-zero value
       
   957             ldp(tmp1, tmp3, iqBase);               // iq[0..3]
       
   958             movw(j, 2);
       
   959             cmp(tmp3, zr);
       
   960             csel(tmp1, tmp1, tmp3, EQ);            // set register for further consideration
       
   961             cselw(j, j, zr, EQ);                   // set initial k. Use j as k
       
   962             cmp(zr, tmp1, LSR, 32);
       
   963             addw(i, jz, 1);
       
   964             csincw(j, j, j, NE);
       
   965 
       
   966           block_comment("for(i=jz+1;i<=jz+k;i++) {...}"); {
       
   967               addw(jz, i, j); // i = jz+1, j = k-1. j+i = jz+k (which is a new jz)
       
   968             bind(RECOMP_FOR2);
       
   969               addw(tmp1, jv, i);
       
   970               ldrd(v29, Address(twoOverPiBase, tmp1, Address::lsl(3)));
       
   971               addw(tmp2, jx, i);
       
   972               strd(v29, Address(sp, tmp2, Address::lsl(3)));
       
   973               // f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i];
       
   974               // since jx = 0, 1 or 2 we can unroll it:
       
   975               // for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
       
   976               // f[jx+i-j] == (for first iteration) f[jx+i], which is already v29
       
   977               add(tmp2, sp, tmp2, ext::uxtx, 3); // address of f[jx+i]
       
   978               ldpd(v4, v5, Address(tmp2, -16)); // load f[jx+i-2] and f[jx+i-1]
       
   979               fmuld(v26, v6, v29); // initial fw
       
   980               cbzw(jx, RECOMP_FW_UPDATED);
       
   981               fmaddd(v26, v7, v5, v26);
       
   982               cmpw(jx, 1);
       
   983               br(EQ, RECOMP_FW_UPDATED);
       
   984               fmaddd(v26, v3, v4, v26);
       
   985             bind(RECOMP_FW_UPDATED);
       
   986               strd(v26, Address(qBase, i, Address::lsl(3))); // q[i] = fw;
       
   987               addw(i, i, 1);
       
   988               cmpw(i, jz);                                   // jz here is "old jz" + k
       
   989               br(LE, RECOMP_FOR2);
       
   990           }
       
   991             b(RECOMPUTE);
       
   992         }
       
   993       }
       
   994     }
       
   995     bind(RECOMP_CHECK_DONE);
       
   996       // chop off zero terms
       
   997       fcmpd(v18, 0.0d);
       
   998       br(EQ, Z_IS_ZERO);
       
   999 
       
  1000       block_comment("else block of if(z==0.0) {"); {
       
  1001         bind(RECOMP_CHECK_DONE_NOT_ZERO);
       
  1002           fmuld(v18, v18, v22);
       
  1003           fcmpd(v18, v10);                                   // v10 is stil two24A
       
  1004           br(LT, Z_IS_LESS_THAN_TWO24B);
       
  1005           fmuld(v1, v18, v17);                               // twon24*z
       
  1006           frintzd(v1, v1);                                   // v1 = (double)(int)(v1)
       
  1007           fmaddd(v2, v10, v1, v18);
       
  1008           fcvtzdw(tmp3, v1);                                 // (int)fw
       
  1009           fcvtzdw(tmp2, v2);                                 // double to int
       
  1010           strw(tmp2, Address(iqBase, jz, Address::lsl(2)));
       
  1011           addw(rscratch1, rscratch1, 24);
       
  1012           addw(jz, jz, 1);
       
  1013           strw(tmp3, Address(iqBase, jz, Address::lsl(2)));  // iq[jz] = (int) fw
       
  1014           b(Z_ZERO_CHECK_DONE);
       
  1015         bind(Z_IS_LESS_THAN_TWO24B);
       
  1016           fcvtzdw(tmp3, v18);                                // (int)z
       
  1017           strw(tmp3, Address(iqBase, jz, Address::lsl(2)));  // iq[jz] = (int) z
       
  1018           b(Z_ZERO_CHECK_DONE);
       
  1019       }
       
  1020 
       
  1021       block_comment("if(z==0.0) {"); {
       
  1022         bind(Z_IS_ZERO);
       
  1023           subw(jz, jz, 1);
       
  1024           ldrw(tmp1, Address(iqBase, jz, Address::lsl(2)));
       
  1025           subw(rscratch1, rscratch1, 24);
       
  1026           cbz(tmp1, Z_IS_ZERO);
       
  1027       }
       
  1028       bind(Z_ZERO_CHECK_DONE);
       
  1029         // convert integer "bit" chunk to floating-point value
       
  1030         // v17 = twon24
       
  1031         // update v30, which was scalbnA(1.0, <old q0>);
       
  1032         addw(tmp2, rscratch1, 1023); // biased exponent
       
  1033         lsl(tmp2, tmp2, 52); // put at correct position
       
  1034         mov(i, jz);
       
  1035         fmovd(v30, tmp2);
       
  1036 
       
  1037         block_comment("for(i=jz;i>=0;i--) {q[i] = fw*(double)iq[i]; fw*=twon24;}"); {
       
  1038           bind(CONVERTION_FOR);
       
  1039             ldrw(tmp1, Address(iqBase, i, Address::lsl(2)));
       
  1040             scvtfwd(v31, tmp1);
       
  1041             fmuld(v31, v31, v30);
       
  1042             strd(v31, Address(qBase, i, Address::lsl(3)));
       
  1043             fmuld(v30, v30, v17);
       
  1044             subsw(i, i, 1);
       
  1045             br(GE, CONVERTION_FOR);
       
  1046         }
       
  1047         add(rscratch2, sp, 160); // base for fq
       
  1048         // reusing twoOverPiBase
       
  1049         lea(twoOverPiBase, ExternalAddress(pio2));
       
  1050 
       
  1051       block_comment("compute PIo2[0,...,jp]*q[jz,...,0]. for(i=jz;i>=0;i--) {...}"); {
       
  1052           movw(i, jz);
       
  1053           movw(tmp2, zr); // tmp2 will keep jz - i == 0 at start
       
  1054         bind(COMP_FOR);
       
  1055           // for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
       
  1056           fmovd(v30, 0.0d);
       
  1057           add(tmp5, qBase, i, LSL, 3); // address of q[i+k] for k==0
       
  1058           movw(tmp3, 4);
       
  1059           movw(tmp4, zr);              // used as k
       
  1060           cmpw(tmp2, 4);
       
  1061           add(tmp1, qBase, i, LSL, 3); // used as q[i] address
       
  1062           cselw(tmp3, tmp2, tmp3, LE); // min(jz - i, jp)
       
  1063 
       
  1064           block_comment("for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];"); {
       
  1065             bind(COMP_INNER_LOOP);
       
  1066               ldrd(v18, Address(tmp1, tmp4, Address::lsl(3)));          // q[i+k]
       
  1067               ldrd(v19, Address(twoOverPiBase, tmp4, Address::lsl(3))); // PIo2[k]
       
  1068               fmaddd(v30, v18, v19, v30);                               // fw += PIo2[k]*q[i+k];
       
  1069               addw(tmp4, tmp4, 1);                                      // k++
       
  1070               cmpw(tmp4, tmp3);
       
  1071               br(LE, COMP_INNER_LOOP);
       
  1072           }
       
  1073           strd(v30, Address(rscratch2, tmp2, Address::lsl(3)));         // fq[jz-i]
       
  1074           add(tmp2, tmp2, 1);
       
  1075           subsw(i, i, 1);
       
  1076           br(GE, COMP_FOR);
       
  1077       }
       
  1078 
       
  1079       block_comment("switch(prec) {...}. case 2:"); {
       
  1080         // compress fq into y[]
       
  1081         // remember prec == 2
       
  1082 
       
  1083         block_comment("for (i=jz;i>=0;i--) fw += fq[i];"); {
       
  1084             fmovd(v4, 0.0d);
       
  1085             mov(i, jz);
       
  1086           bind(FW_FOR1);
       
  1087             ldrd(v1, Address(rscratch2, i, Address::lsl(3)));
       
  1088             subsw(i, i, 1);
       
  1089             faddd(v4, v4, v1);
       
  1090             br(GE, FW_FOR1);
       
  1091         }
       
  1092         bind(FW_FOR1_DONE);
       
  1093           // v1 contains fq[0]. so, keep it so far
       
  1094           fsubd(v5, v1, v4); // fw = fq[0] - fw
       
  1095           cbzw(ih, FW_Y0_NO_NEGATION);
       
  1096           fnegd(v4, v4);
       
  1097         bind(FW_Y0_NO_NEGATION);
       
  1098 
       
  1099         block_comment("for (i=1;i<=jz;i++) fw += fq[i];"); {
       
  1100             movw(i, 1);
       
  1101               cmpw(jz, 1);
       
  1102             br(LT, FW_FOR2_DONE);
       
  1103           bind(FW_FOR2);
       
  1104             ldrd(v1, Address(rscratch2, i, Address::lsl(3)));
       
  1105             addw(i, i, 1);
       
  1106             cmp(i, jz);
       
  1107             faddd(v5, v5, v1);
       
  1108             br(LE, FW_FOR2);
       
  1109         }
       
  1110         bind(FW_FOR2_DONE);
       
  1111           cbz(ih, FW_Y1_NO_NEGATION);
       
  1112           fnegd(v5, v5);
       
  1113         bind(FW_Y1_NO_NEGATION);
       
  1114           add(sp, sp, 560);
       
  1115       }
       
  1116 }
       
  1117 
       
  1118 ///* __kernel_sin( x, y, iy)
       
  1119 // * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
       
  1120 // * Input x is assumed to be bounded by ~pi/4 in magnitude.
       
  1121 // * Input y is the tail of x.
       
  1122 // * Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
       
  1123 // *
       
  1124 // * Algorithm
       
  1125 // *      1. Since sin(-x) = -sin(x), we need only to consider positive x.
       
  1126 // *      2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
       
  1127 // *      3. sin(x) is approximated by a polynomial of degree 13 on
       
  1128 // *         [0,pi/4]
       
  1129 // *                               3            13
       
  1130 // *              sin(x) ~ x + S1*x + ... + S6*x
       
  1131 // *         where
       
  1132 // *
       
  1133 // *      |sin(x)         2     4     6     8     10     12  |     -58
       
  1134 // *      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
       
  1135 // *      |  x                                               |
       
  1136 // *
       
  1137 // *      4. sin(x+y) = sin(x) + sin'(x')*y
       
  1138 // *                  ~ sin(x) + (1-x*x/2)*y
       
  1139 // *         For better accuracy, let
       
  1140 // *                   3      2      2      2      2
       
  1141 // *              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
       
  1142 // *         then                   3    2
       
  1143 // *              sin(x) = x + (S1*x + (x *(r-y/2)+y))
       
  1144 // */
       
  1145 //static const double
       
  1146 //S1  = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
       
  1147 //S2  =  8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
       
  1148 //S3  = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
       
  1149 //S4  =  2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
       
  1150 //S5  = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
       
  1151 //S6  =  1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
       
  1152 //
       
  1153 // NOTE: S1..S6 were moved into a table: StubRoutines::aarch64::_dsin_coef
       
  1154 //
       
  1155 // BEGIN __kernel_sin PSEUDO CODE
       
  1156 //
       
  1157 //static double __kernel_sin(double x, double y, bool iy)
       
  1158 //{
       
  1159 //        double z,r,v;
       
  1160 //
       
  1161 //        // NOTE: not needed. moved to dsin/dcos
       
  1162 //        //int ix;
       
  1163 //        //ix = high(x)&0x7fffffff;                /* high word of x */
       
  1164 //
       
  1165 //        // NOTE: moved to dsin/dcos
       
  1166 //        //if(ix<0x3e400000)                       /* |x| < 2**-27 */
       
  1167 //        //   {if((int)x==0) return x;}            /* generate inexact */
       
  1168 //
       
  1169 //        z       =  x*x;
       
  1170 //        v       =  z*x;
       
  1171 //        r       =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
       
  1172 //        if(iy==0) return x+v*(S1+z*r);
       
  1173 //        else      return x-((z*(half*y-v*r)-y)-v*S1);
       
  1174 //}
       
  1175 //
       
  1176 // END __kernel_sin PSEUDO CODE
       
  1177 //
       
  1178 // Changes between fdlibm and intrinsic:
       
  1179 //     1. Removed |x| < 2**-27 check, because if was done earlier in dsin/dcos
       
  1180 //     2. Constants are now loaded from table dsin_coef
       
  1181 //     3. C code parameter "int iy" was modified to "bool iyIsOne", because
       
  1182 //         iy is always 0 or 1. Also, iyIsOne branch was moved into
       
  1183 //         generation phase instead of taking it during code execution
       
  1184 // Input ans output:
       
  1185 //     1. Input for generated function: X argument = x
       
  1186 //     2. Input for generator: x = register to read argument from, iyIsOne
       
  1187 //         = flag to use low argument low part or not, dsin_coef = coefficients
       
  1188 //         table address
       
  1189 //     3. Return sin(x) value in v0
       
  1190 void MacroAssembler::generate_kernel_sin(FloatRegister x, bool iyIsOne,
       
  1191     address dsin_coef) {
       
  1192   FloatRegister y = v5, z = v6, v = v7, r = v16, S1 = v17, S2 = v18,
       
  1193       S3 = v19, S4 = v20, S5 = v21, S6 = v22, half = v23;
       
  1194   lea(rscratch2, ExternalAddress(dsin_coef));
       
  1195   ldpd(S5, S6, Address(rscratch2, 32));
       
  1196   fmuld(z, x, x); // z =  x*x;
       
  1197   ld1(S1, S2, S3, S4, T1D, Address(rscratch2));
       
  1198   fmuld(v, z, x); // v =  z*x;
       
  1199 
       
  1200   block_comment("calculate r =  S2+z*(S3+z*(S4+z*(S5+z*S6)))"); {
       
  1201     fmaddd(r, z, S6, S5);
       
  1202     // initialize "half" in current block to utilize 2nd FPU. However, it's
       
  1203     // not a part of this block
       
  1204     fmovd(half, 0.5);
       
  1205     fmaddd(r, z, r, S4);
       
  1206     fmaddd(r, z, r, S3);
       
  1207     fmaddd(r, z, r, S2);
       
  1208   }
       
  1209 
       
  1210   if (!iyIsOne) {
       
  1211     // return x+v*(S1+z*r);
       
  1212     fmaddd(S1, z, r, S1);
       
  1213     fmaddd(v0, v, S1, x);
       
  1214   } else {
       
  1215     // return x-((z*(half*y-v*r)-y)-v*S1);
       
  1216     fmuld(S6, half, y);    // half*y
       
  1217     fmsubd(S6, v, r, S6);  // half*y-v*r
       
  1218     fmsubd(S6, z, S6, y);  // y - z*(half*y-v*r) = - (z*(half*y-v*r)-y)
       
  1219     fmaddd(S6, v, S1, S6); // - (z*(half*y-v*r)-y) + v*S1 == -((z*(half*y-v*r)-y)-v*S1)
       
  1220     faddd(v0, x, S6);
       
  1221   }
       
  1222 }
       
  1223 
       
  1224 ///*
       
  1225 // * __kernel_cos( x,  y )
       
  1226 // * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
       
  1227 // * Input x is assumed to be bounded by ~pi/4 in magnitude.
       
  1228 // * Input y is the tail of x.
       
  1229 // *
       
  1230 // * Algorithm
       
  1231 // *      1. Since cos(-x) = cos(x), we need only to consider positive x.
       
  1232 // *      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
       
  1233 // *      3. cos(x) is approximated by a polynomial of degree 14 on
       
  1234 // *         [0,pi/4]
       
  1235 // *                                       4            14
       
  1236 // *              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
       
  1237 // *         where the remez error is
       
  1238 // *
       
  1239 // *      |              2     4     6     8     10    12     14 |     -58
       
  1240 // *      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
       
  1241 // *      |                                                      |
       
  1242 // *
       
  1243 // *                     4     6     8     10    12     14
       
  1244 // *      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
       
  1245 // *             cos(x) = 1 - x*x/2 + r
       
  1246 // *         since cos(x+y) ~ cos(x) - sin(x)*y
       
  1247 // *                        ~ cos(x) - x*y,
       
  1248 // *         a correction term is necessary in cos(x) and hence
       
  1249 // *              cos(x+y) = 1 - (x*x/2 - (r - x*y))
       
  1250 // *         For better accuracy when x > 0.3, let qx = |x|/4 with
       
  1251 // *         the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125.
       
  1252 // *         Then
       
  1253 // *              cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
       
  1254 // *         Note that 1-qx and (x*x/2-qx) is EXACT here, and the
       
  1255 // *         magnitude of the latter is at least a quarter of x*x/2,
       
  1256 // *         thus, reducing the rounding error in the subtraction.
       
  1257 // */
       
  1258 //
       
  1259 //static const double
       
  1260 //C1  =  4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
       
  1261 //C2  = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
       
  1262 //C3  =  2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
       
  1263 //C4  = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
       
  1264 //C5  =  2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
       
  1265 //C6  = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
       
  1266 //
       
  1267 // NOTE: C1..C6 were moved into a table: StubRoutines::aarch64::_dcos_coef
       
  1268 //
       
  1269 // BEGIN __kernel_cos PSEUDO CODE
       
  1270 //
       
  1271 //static double __kernel_cos(double x, double y)
       
  1272 //{
       
  1273 //  double a,h,z,r,qx=0;
       
  1274 //
       
  1275 //  // NOTE: ix is already initialized in dsin/dcos. Reuse value from register
       
  1276 //  //int ix;
       
  1277 //  //ix = high(x)&0x7fffffff;              /* ix = |x|'s high word*/
       
  1278 //
       
  1279 //  // NOTE: moved to dsin/dcos
       
  1280 //  //if(ix<0x3e400000) {                   /* if x < 2**27 */
       
  1281 //  //  if(((int)x)==0) return one;         /* generate inexact */
       
  1282 //  //}
       
  1283 //
       
  1284 //  z  = x*x;
       
  1285 //  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
       
  1286 //  if(ix < 0x3FD33333)                   /* if |x| < 0.3 */
       
  1287 //    return one - (0.5*z - (z*r - x*y));
       
  1288 //  else {
       
  1289 //    if(ix > 0x3fe90000) {               /* x > 0.78125 */
       
  1290 //      qx = 0.28125;
       
  1291 //    } else {
       
  1292 //      set_high(&qx, ix-0x00200000); /* x/4 */
       
  1293 //      set_low(&qx, 0);
       
  1294 //    }
       
  1295 //    h = 0.5*z-qx;
       
  1296 //    a = one-qx;
       
  1297 //    return a - (h - (z*r-x*y));
       
  1298 //  }
       
  1299 //}
       
  1300 //
       
  1301 // END __kernel_cos PSEUDO CODE
       
  1302 //
       
  1303 // Changes between fdlibm and intrinsic:
       
  1304 //     1. Removed |x| < 2**-27 check, because if was done earlier in dsin/dcos
       
  1305 //     2. Constants are now loaded from table dcos_coef
       
  1306 // Input and output:
       
  1307 //     1. Input for generated function: X argument = x
       
  1308 //     2. Input for generator: x = register to read argument from, dcos_coef
       
  1309 //        = coefficients table address
       
  1310 //     2. Return cos(x) value in v0
       
  1311 void MacroAssembler::generate_kernel_cos(FloatRegister x, address dcos_coef) {
       
  1312   Register ix = r3;
       
  1313   FloatRegister qx = v1, h = v2, a = v3, y = v5, z = v6, r = v7, C1 = v18,
       
  1314       C2 = v19, C3 = v20, C4 = v21, C5 = v22, C6 = v23, one = v25, half = v26;
       
  1315   Label IX_IS_LARGE, SET_QX_CONST, DONE, QX_SET;
       
  1316     lea(rscratch2, ExternalAddress(dcos_coef));
       
  1317     ldpd(C5, C6, Address(rscratch2, 32));         // load C5, C6
       
  1318     fmuld(z, x, x);                               // z=x^2
       
  1319     ld1(C1, C2, C3, C4, T1D, Address(rscratch2)); // load C1..C3\4
       
  1320     block_comment("calculate r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))))"); {
       
  1321       fmaddd(r, z, C6, C5);
       
  1322       fmovd(half, 0.5d);
       
  1323       fmaddd(r, z, r, C4);
       
  1324       fmuld(y, x, y);
       
  1325       fmaddd(r, z, r, C3);
       
  1326       mov(rscratch1, 0x3FD33333);
       
  1327       fmaddd(r, z, r, C2);
       
  1328       fmuld(x, z, z);                             // x = z^2
       
  1329       fmaddd(r, z, r, C1);                        // r = C1+z(C2+z(C4+z(C5+z*C6)))
       
  1330     }
       
  1331     // need to multiply r by z to have "final" r value
       
  1332     fmovd(one, 1.0d);
       
  1333     cmp(ix, rscratch1);
       
  1334     br(GT, IX_IS_LARGE);
       
  1335     block_comment("if(ix < 0x3FD33333) return one - (0.5*z - (z*r - x*y))"); {
       
  1336       // return 1.0 - (0.5*z - (z*r - x*y)) = 1.0 - (0.5*z + (x*y - z*r))
       
  1337       fmsubd(v0, x, r, y);
       
  1338       fmaddd(v0, half, z, v0);
       
  1339       fsubd(v0, one, v0);
       
  1340       b(DONE);
       
  1341     }
       
  1342   block_comment("if(ix >= 0x3FD33333)"); {
       
  1343     bind(IX_IS_LARGE);
       
  1344       movz(rscratch2, 0x3FE9, 16);
       
  1345       cmp(ix, rscratch2);
       
  1346       br(GT, SET_QX_CONST);
       
  1347       block_comment("set_high(&qx, ix-0x00200000); set_low(&qx, 0);"); {
       
  1348         subw(rscratch2, ix, 0x00200000);
       
  1349         lsl(rscratch2, rscratch2, 32);
       
  1350         fmovd(qx, rscratch2);
       
  1351       }
       
  1352       b(QX_SET);
       
  1353     bind(SET_QX_CONST);
       
  1354       block_comment("if(ix > 0x3fe90000) qx = 0.28125;"); {
       
  1355         fmovd(qx, 0.28125d);
       
  1356       }
       
  1357     bind(QX_SET);
       
  1358       fnmsub(C6, x, r, y);    // z*r - xy
       
  1359       fnmsub(h, half, z, qx); // h = 0.5*z - qx
       
  1360       fsubd(a, one, qx);      // a = 1-qx
       
  1361       fsubd(C6, h, C6);       // = h - (z*r - x*y)
       
  1362       fsubd(v0, a, C6);
       
  1363   }
       
  1364   bind(DONE);
       
  1365 }
       
  1366 
       
  1367 // generate_dsin_dcos creates stub for dsin and dcos
       
  1368 // Generation is done via single call because dsin and dcos code is almost the
       
  1369 // same(see C code below). These functions work as follows:
       
  1370 // 1) handle corner cases: |x| ~< pi/4, x is NaN or INF, |x| < 2**-27
       
  1371 // 2) perform argument reduction if required
       
  1372 // 3) call kernel_sin or kernel_cos which approximate sin/cos via polynomial
       
  1373 //
       
  1374 // BEGIN dsin/dcos PSEUDO CODE
       
  1375 //
       
  1376 //dsin_dcos(jdouble x, bool isCos) {
       
  1377 //  double y[2],z=0.0;
       
  1378 //  int n, ix;
       
  1379 //
       
  1380 //  /* High word of x. */
       
  1381 //  ix = high(x);
       
  1382 //
       
  1383 //  /* |x| ~< pi/4 */
       
  1384 //  ix &= 0x7fffffff;
       
  1385 //  if(ix <= 0x3fe921fb) return isCos ? __kernel_cos : __kernel_sin(x,z,0);
       
  1386 //
       
  1387 //  /* sin/cos(Inf or NaN) is NaN */
       
  1388 //  else if (ix>=0x7ff00000) return x-x;
       
  1389 //  else if (ix<0x3e400000) {                   /* if ix < 2**27 */
       
  1390 //    if(((int)x)==0) return isCos ? one : x;         /* generate inexact */
       
  1391 //  }
       
  1392 //  /* argument reduction needed */
       
  1393 //  else {
       
  1394 //    n = __ieee754_rem_pio2(x,y);
       
  1395 //    switch(n&3) {
       
  1396 //    case 0: return isCos ?  __kernel_cos(y[0],y[1])      :  __kernel_sin(y[0],y[1], true);
       
  1397 //    case 1: return isCos ? -__kernel_sin(y[0],y[1],true) :  __kernel_cos(y[0],y[1]);
       
  1398 //    case 2: return isCos ? -__kernel_cos(y[0],y[1])      : -__kernel_sin(y[0],y[1], true);
       
  1399 //    default:
       
  1400 //      return isCos ? __kernel_sin(y[0],y[1],1) : -__kernel_cos(y[0],y[1]);
       
  1401 //    }
       
  1402 //  }
       
  1403 //}
       
  1404 // END dsin/dcos PSEUDO CODE
       
  1405 //
       
  1406 // Changes between fdlibm and intrinsic:
       
  1407 //     1. Moved ix < 2**27 from kernel_sin/kernel_cos into dsin/dcos
       
  1408 //     2. Final switch use equivalent bit checks(tbz/tbnz)
       
  1409 // Input ans output:
       
  1410 //     1. Input for generated function: X = r0
       
  1411 //     2. Input for generator: isCos = generate sin or cos, npio2_hw = address
       
  1412 //         of npio2_hw table, two_over_pi = address of two_over_pi table,
       
  1413 //         pio2 = address if pio2 table, dsin_coef = address if dsin_coef table,
       
  1414 //         dcos_coef = address of dcos_coef table
       
  1415 //     3. Return result in v0
       
  1416 // NOTE: general purpose register names match local variable names in C code
       
  1417 void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
       
  1418     address two_over_pi, address pio2, address dsin_coef, address dcos_coef) {
       
  1419   const int POSITIVE_INFINITY_OR_NAN_PREFIX = 0x7FF0;
       
  1420 
       
  1421   Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE;
       
  1422   Register X = r0, absX = r1, n = r2, ix = r3;
       
  1423   FloatRegister y0 = v4, y1 = v5;
       
  1424     block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); {
       
  1425       fmovd(X, v0);
       
  1426       mov(rscratch2, 0x3e400000);
       
  1427       mov(rscratch1, 0x3fe921fb00000000);            // pi/4. shifted to reuse later
       
  1428       ubfm(absX, X, 0, 62);                          // absX
       
  1429       movz(r10, POSITIVE_INFINITY_OR_NAN_PREFIX, 48);
       
  1430       cmp(rscratch2, absX, LSR, 32);
       
  1431       lsr(ix, absX, 32);                             // set ix
       
  1432       br(GT, TINY_X);                                // handle tiny x (|x| < 2^-27)
       
  1433       cmp(ix, rscratch1, LSR, 32);
       
  1434       br(LE, EARLY_CASE);                            // if(ix <= 0x3fe921fb) return
       
  1435       cmp(absX, r10);
       
  1436       br(LT, ARG_REDUCTION);
       
  1437       // X is NaN or INF(i.e. 0x7FF* or 0xFFF*). Return NaN (mantissa != 0).
       
  1438       // Set last bit unconditionally to make it NaN
       
  1439       orr(r10, r10, 1);
       
  1440       fmovd(v0, r10);
       
  1441       ret(lr);
       
  1442     }
       
  1443   block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); {
       
  1444     bind(TINY_X);
       
  1445       if (isCos) {
       
  1446         fmovd(v0, 1.0d);
       
  1447       }
       
  1448       ret(lr);
       
  1449   }
       
  1450   bind(ARG_REDUCTION); /* argument reduction needed */
       
  1451     block_comment("n = __ieee754_rem_pio2(x,y);"); {
       
  1452       generate__ieee754_rem_pio2(npio2_hw, two_over_pi, pio2);
       
  1453     }
       
  1454     block_comment("switch(n&3) {case ... }"); {
       
  1455       if (isCos) {
       
  1456         eorw(absX, n, n, LSR, 1);
       
  1457         tbnz(n, 0, RETURN_SIN);
       
  1458       } else {
       
  1459         tbz(n, 0, RETURN_SIN);
       
  1460       }
       
  1461       generate_kernel_cos(y0, dcos_coef);
       
  1462       if (isCos) {
       
  1463         tbz(absX, 0, DONE);
       
  1464       } else {
       
  1465         tbz(n, 1, DONE);
       
  1466       }
       
  1467       fnegd(v0, v0);
       
  1468       ret(lr);
       
  1469     bind(RETURN_SIN);
       
  1470       generate_kernel_sin(y0, true, dsin_coef);
       
  1471       if (isCos) {
       
  1472         tbz(absX, 0, DONE);
       
  1473       } else {
       
  1474         tbz(n, 1, DONE);
       
  1475       }
       
  1476       fnegd(v0, v0);
       
  1477       ret(lr);
       
  1478     }
       
  1479   bind(EARLY_CASE);
       
  1480     eor(y1, T8B, y1, y1);
       
  1481     if (isCos) {
       
  1482       generate_kernel_cos(v0, dcos_coef);
       
  1483     } else {
       
  1484       generate_kernel_sin(v0, false, dsin_coef);
       
  1485     }
       
  1486   bind(DONE);
       
  1487     ret(lr);
       
  1488 }