src/hotspot/cpu/aarch64/macroAssembler_aarch64_trig.cpp
author ngasson
Mon, 17 Jun 2019 15:31:49 +0800
changeset 55398 e53ec3b362f4
parent 51739 7bed934d439e
permissions -rw-r--r--
8224851: AArch64: fix warnings and errors with Clang and GCC 8.3 Reviewed-by: aph, kbarrett
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
     1
/* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
     2
 * Copyright (c) 2018, Cavium. All rights reserved. (By BELLSOFT)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
     3
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
     4
 *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
     5
 * This code is free software; you can redistribute it and/or modify it
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
     6
 * under the terms of the GNU General Public License version 2 only, as
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
     7
 * published by the Free Software Foundation.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
     8
 *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
     9
 * This code is distributed in the hope that it will be useful, but WITHOUT
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    10
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    11
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    12
 * version 2 for more details (a copy is included in the LICENSE file that
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    13
 * accompanied this code).
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    14
 *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    15
 * You should have received a copy of the GNU General Public License version
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    16
 * 2 along with this work; if not, write to the Free Software Foundation,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    17
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    18
 *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    19
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    20
 * or visit www.oracle.com if you need additional information or have any
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    21
 * questions.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    22
 *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    23
 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    24
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    25
#include "precompiled.hpp"
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    26
#include "asm/assembler.hpp"
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    27
#include "asm/assembler.inline.hpp"
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    28
#include "runtime/stubRoutines.hpp"
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    29
#include "macroAssembler_aarch64.hpp"
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    30
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    31
// The following code is an optimized version of the fdlibm sin/cos implementation
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    32
// (C code is in share/runtime/sharedRuntimeTrig.cpp) adapted for AARCH64.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    33
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    34
// Please refer to sin/cos approximation via polynomial and
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    35
// trigonometric argument reduction techniques to the following literature:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    36
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    37
// [1] Muller, Jean-Michel, Nicolas Brisebarre, Florent De Dinechin,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    38
// Claude-Pierre Jeannerod, Vincent Lefevre, Guillaume Melquiond,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    39
// Nathalie Revol, Damien Stehlé, and Serge Torres:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    40
// Handbook of floating-point arithmetic.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    41
// Springer Science & Business Media, 2009.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    42
// [2] K. C. Ng
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    43
// Argument Reduction for Huge Arguments: Good to the Last Bit
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    44
// July 13, 1992, SunPro
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    45
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    46
// HOW TO READ THIS CODE:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    47
// This code consists of several functions. Each function has the following header:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    48
// 1) Description
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    49
// 2) C-pseudo code with differences from fdlibm marked by comments starting
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    50
//        with "NOTE". Check unmodified fdlibm code in
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    51
//        share/runtime/sharedRuntimeTrig.cpp
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    52
// 3) Brief textual description of changes between fdlibm and current
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    53
//        implementation along with optimization notes (if applicable)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    54
// 4) Assumptions, input and output
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    55
// 5) (Optional) additional notes about intrinsic implementation
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    56
// Each function is separated into blocks which follow the pseudo-code structure
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    57
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    58
// HIGH-LEVEL ALGORITHM DESCRIPTION:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    59
//    - entry point: generate_dsin_dcos(...);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    60
//    - check corner cases: NaN, INF, tiny argument.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    61
//    - check if |x| < Pi/4. Then approximate sin/cos via polynomial (kernel_sin/kernel_cos)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    62
//    -- else proceed to argument reduction routine (__ieee754_rem_pio2) and
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    63
//           use reduced argument to get result via kernel_sin/kernel_cos
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    64
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    65
// HIGH-LEVEL CHANGES BETWEEN INTRINSICS AND FDLIBM:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    66
// 1) two_over_pi table fdlibm representation is int[], while intrinsic version
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    67
// has these int values converted to double representation to load converted
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    68
// double values directly (see stubRoutines_aarch64::_two_over_pi)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    69
// 2) Several loops are unrolled and vectorized: see comments in code after
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    70
// labels: SKIP_F_LOAD, RECOMP_FOR1_CHECK, RECOMP_FOR2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    71
// 3) fdlibm npio2_hw table now has "prefix" with constants used in
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    72
// calculation. These constants are loaded from npio2_hw table instead of
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    73
// constructing it in code (see stubRoutines_aarch64.cpp)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    74
// 4) Polynomial coefficients for sin and cos are moved to table sin_coef
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    75
// and cos_coef to use the same optimization as in 3). This allows loading most of
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    76
// required constants via single instruction
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    77
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    78
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    79
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    80
///* __ieee754_rem_pio2(x,y)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    81
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    82
// * returns the remainder of x rem pi/2 in y[0]+y[1] (i.e. like x div pi/2)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    83
// * x is input argument, y[] is hi and low parts of reduced argument (x)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    84
// * uses __kernel_rem_pio2()
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    85
// */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    86
// // use tables(see stubRoutines_aarch64.cpp): two_over_pi and modified npio2_hw
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    87
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    88
// BEGIN __ieee754_rem_pio2 PSEUDO CODE
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    89
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    90
//static int __ieee754_rem_pio2(double x, double *y) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    91
//  double z,w,t,r,fn;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    92
//  double tx[3];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    93
//  int e0,i,j,nx,n,ix,hx,i0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    94
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    95
//  i0 = ((*(int*)&two24A)>>30)^1;        /* high word index */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    96
//  hx = *(i0+(int*)&x);          /* high word of x */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    97
//  ix = hx&0x7fffffff;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    98
//  if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
    99
//    if(hx>0) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   100
//      z = x - pio2_1;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   101
//      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   102
//        y[0] = z - pio2_1t;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   103
//        y[1] = (z-y[0])-pio2_1t;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   104
//      } else {                /* near pi/2, use 33+33+53 bit pi */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   105
//        z -= pio2_2;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   106
//        y[0] = z - pio2_2t;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   107
//        y[1] = (z-y[0])-pio2_2t;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   108
//      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   109
//      return 1;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   110
//    } else {    /* negative x */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   111
//      z = x + pio2_1;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   112
//      if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   113
//        y[0] = z + pio2_1t;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   114
//        y[1] = (z-y[0])+pio2_1t;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   115
//      } else {                /* near pi/2, use 33+33+53 bit pi */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   116
//        z += pio2_2;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   117
//        y[0] = z + pio2_2t;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   118
//        y[1] = (z-y[0])+pio2_2t;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   119
//      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   120
//      return -1;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   121
//    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   122
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   123
//  if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   124
//    t  = fabsd(x);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   125
//    n  = (int) (t*invpio2+half);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   126
//    fn = (double)n;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   127
//    r  = t-fn*pio2_1;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   128
//    w  = fn*pio2_1t;    /* 1st round good to 85 bit */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   129
//    // NOTE: y[0] = r-w; is moved from if/else below to be before "if"
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   130
//    y[0] = r-w;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   131
//    if(n<32&&ix!=npio2_hw[n-1]) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   132
//      // y[0] = r-w;       /* quick check no cancellation */ // NOTE: moved earlier
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   133
//    } else {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   134
//      j  = ix>>20;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   135
//      // y[0] = r-w; // NOTE: moved earlier
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   136
//      i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   137
//      if(i>16) {  /* 2nd iteration needed, good to 118 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   138
//        t  = r;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   139
//        w  = fn*pio2_2;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   140
//        r  = t-w;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   141
//        w  = fn*pio2_2t-((t-r)-w);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   142
//        y[0] = r-w;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   143
//        i = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   144
//        if(i>49)  {     /* 3rd iteration need, 151 bits acc */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   145
//          t  = r;       /* will cover all possible cases */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   146
//          w  = fn*pio2_3;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   147
//          r  = t-w;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   148
//          w  = fn*pio2_3t-((t-r)-w);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   149
//          y[0] = r-w;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   150
//        }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   151
//      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   152
//    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   153
//    y[1] = (r-y[0])-w;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   154
//    if(hx<0)    {y[0] = -y[0]; y[1] = -y[1]; return -n;}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   155
//    else         return n;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   156
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   157
//  /*
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   158
//   * all other (large) arguments
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   159
//   */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   160
//  // NOTE: this check is removed, because it was checked in dsin/dcos
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   161
//  // if(ix>=0x7ff00000) {          /* x is inf or NaN */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   162
//  //  y[0]=y[1]=x-x; return 0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   163
//  // }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   164
//  /* set z = scalbn(|x|,ilogb(x)-23) */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   165
//  *(1-i0+(int*)&z) = *(1-i0+(int*)&x);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   166
//  e0    = (ix>>20)-1046;        /* e0 = ilogb(z)-23; */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   167
//  *(i0+(int*)&z) = ix - (e0<<20);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   168
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   169
//  // NOTE: "for" loop below is unrolled. See comments in asm code
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   170
//  for(i=0;i<2;i++) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   171
//    tx[i] = (double)((int)(z));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   172
//    z     = (z-tx[i])*two24A;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   173
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   174
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   175
//  tx[2] = z;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   176
//  nx = 3;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   177
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   178
//  // NOTE: while(tx[nx-1]==zeroA) nx--;  is unrolled. See comments in asm code
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   179
//  while(tx[nx-1]==zeroA) nx--;  /* skip zero term */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   180
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   181
//  n  =  __kernel_rem_pio2(tx,y,e0,nx,2,two_over_pi);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   182
//  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   183
//  return n;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   184
//}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   185
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   186
// END __ieee754_rem_pio2 PSEUDO CODE
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   187
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   188
// Changes between fdlibm and intrinsic for __ieee754_rem_pio2:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   189
//     1. INF/NaN check for huge argument is removed in comparison with fdlibm
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   190
//     code, because this check is already done in dcos/dsin code
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   191
//     2. Most constants are now loaded from table instead of direct initialization
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   192
//     3. Two loops are unrolled
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   193
// Assumptions:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   194
//     1. Assume |X| >= PI/4
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   195
//     2. Assume rscratch1 = 0x3fe921fb00000000  (~ PI/4)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   196
//     3. Assume ix = r3
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   197
// Input and output:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   198
//     1. Input: X = r0
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   199
//     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   200
// NOTE: general purpose register names match local variable names in C code
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   201
// NOTE: fpu registers are actively reused. See comments in code about their usage
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   202
// Emits the argument-reduction code (__ieee754_rem_pio2 from fdlibm) used by
// the sin/cos intrinsics: reduces x to y[0] + y[1] and a quadrant count n.
//
// Parameters:
//   npio2_hw    - address of a constant table. As read by the code below it
//                 starts with eight doubles {half, invpio2, pio2_1, pio2_1t,
//                 pio2_2, pio2_2t, pio2_3, pio2_3t}, followed at byte offset
//                 64 by the 32-bit npio2_hw[] entries.
//   two_over_pi - forwarded unchanged to generate__kernel_rem_pio2
//   pio2        - forwarded unchanged to generate__kernel_rem_pio2
//
// Register contract of the generated code:
//   in : r0 (X) = raw 64-bit pattern of x, v0 = x as double,
//        r3 (ix) = high word of |x|, rscratch1 = 0x3fe921fb00000000
//   out: r2 (n), v4 = y[0], v5 = y[1]
//
// Three ranges are handled: |x| below ~3*pi/4 (one subtraction/addition of
// pi/2), medium |x| up to ~2^19*(pi/2) (Cody-Waite style with up to three
// refinement steps), and everything else (delegated to
// generate__kernel_rem_pio2).
void MacroAssembler::generate__ieee754_rem_pio2(address npio2_hw,
    address two_over_pi, address pio2) {
  // Raw IEEE-754 bit patterns of the pi/2 tail constants. They must be exactly
  // 64 bits wide: a plain 'long' with a 'UL' literal would be truncated on
  // LLP64 targets, so fixed-width types and 'ULL' literals are used.
  const int64_t PIO2_1t = 0x3DD0B4611A626331ULL;
  const int64_t PIO2_2  = 0x3DD0B4611A600000ULL;
  const int64_t PIO2_2t = 0x3BA3198A2E037073ULL;
  Label X_IS_NEGATIVE, X_IS_MEDIUM_OR_LARGE, X_IS_POSITIVE_LONG_PI, LARGE_ELSE,
      REDUCTION_DONE, X_IS_MEDIUM_BRANCH_DONE, X_IS_LARGE, NX_SET,
      X_IS_NEGATIVE_LONG_PI;
  // General purpose register names follow the local variable names of the
  // fdlibm C code.
  Register X = r0, n = r2, ix = r3, jv = r4, tmp5 = r5, jx = r6,
      tmp3 = r7, iqBase = r10, ih = r11, i = r17;
    // initializing constants first
    // rscratch1 = 0x3fe921fb00000000 (see assumptions)
    movk(rscratch1, 0x3ff9, 48); // was 0x3fe921fb0..0 now it's 0x3ff921fb0..0
    mov(rscratch2, 0x4002d97c); // 3*PI/4 high word
    movk(rscratch1, 0x5440, 16); // now rscratch1 == PIO2_1
    fmovd(v1, rscratch1); // v1 = PIO2_1
    cmp(rscratch2, ix);
    br(LE, X_IS_MEDIUM_OR_LARGE); // taken when ix >= 0x4002d97c

    block_comment("if(ix<0x4002d97c) {...  /* |x| ~< 3pi/4 */ "); {
      // X holds the raw bits of x, so a signed integer compare against zero
      // tests the sign bit of the double.
      cmp(X, zr);
      br(LT, X_IS_NEGATIVE);

      block_comment("if(hx>0) {"); {
        fsubd(v2, v0, v1); // v2 = z = x - pio2_1
        cmp(ix, rscratch1, LSR, 32); // ix == high word of pio2_1 (0x3ff921fb)?
        mov(n, 1);
        br(EQ, X_IS_POSITIVE_LONG_PI);

        block_comment("case: hx > 0 &&  ix!=0x3ff921fb {"); { /* 33+53 bit pi is good enough */
          mov(rscratch2, PIO2_1t);
          fmovd(v27, rscratch2);
          fsubd(v4, v2, v27); // v4 = y[0] = z - pio2_1t;
          fsubd(v5, v2, v4);
          fsubd(v5, v5, v27); // v5 = y[1] = (z-y[0])-pio2_1t
          b(REDUCTION_DONE);
        }

        block_comment("case: hx > 0 &*& ix==0x3ff921fb {"); { /* near pi/2, use 33+33+53 bit pi */
          bind(X_IS_POSITIVE_LONG_PI);
            mov(rscratch1, PIO2_2);
            mov(rscratch2, PIO2_2t);
            fmovd(v27, rscratch1);
            fmovd(v6, rscratch2);
            fsubd(v2, v2, v27); // z-= pio2_2
            fsubd(v4, v2, v6);  // y[0] = z - pio2_2t
            fsubd(v5, v2, v4);
            fsubd(v5, v5, v6);  // v5 = (z - y[0]) - pio2_2t
            b(REDUCTION_DONE);
        }
      }

      block_comment("case: hx <= 0)"); {
        bind(X_IS_NEGATIVE);
          faddd(v2, v0, v1); // v2 = z = x + pio2_1
          cmp(ix, rscratch1, LSR, 32); // ix == high word of pio2_1 (0x3ff921fb)?
          mov(n, -1);
          br(EQ, X_IS_NEGATIVE_LONG_PI);

          block_comment("case: hx <= 0 && ix!=0x3ff921fb) {"); { /* 33+53 bit pi is good enough */
            mov(rscratch2, PIO2_1t);
            fmovd(v27, rscratch2);
            faddd(v4, v2, v27); // v4 = y[0] = z + pio2_1t;
            fsubd(v5, v2, v4);
            faddd(v5, v5, v27); // v5 = y[1] = (z-y[0]) + pio2_1t
            b(REDUCTION_DONE);
          }

          block_comment("case: hx <= 0 && ix==0x3ff921fb"); { /* near pi/2, use 33+33+53 bit pi */
            bind(X_IS_NEGATIVE_LONG_PI);
              mov(rscratch1, PIO2_2);
              mov(rscratch2, PIO2_2t);
              fmovd(v27, rscratch1);
              fmovd(v6, rscratch2);
              faddd(v2, v2, v27); // z += pio2_2
              faddd(v4, v2, v6);  // y[0] = z + pio2_2t
              fsubd(v5, v2, v4);
              faddd(v5, v5, v6);  // v5 = (z - y[0]) + pio2_2t
              b(REDUCTION_DONE);
          }
      }
  }
  bind(X_IS_MEDIUM_OR_LARGE);
    mov(rscratch1, 0x413921fb);
    cmp(ix, rscratch1); // ix < = 0x413921fb ?
    br(GT, X_IS_LARGE);

    block_comment("|x| ~<= 2^19*(pi/2), medium size"); {
      lea(ih, ExternalAddress(npio2_hw));
      // v4..v7 = first four doubles of the table: half, invpio2, pio2_1, pio2_1t
      ld1(v4, v5, v6, v7, T1D, ih);
      fabsd(v31, v0);          // v31 = t = |x|
      add(ih, ih, 64);         // ih now points just past the eight doubles
      fmaddd(v2, v31, v5, v4); // v2 = t * invpio2 + half (invpio2 = 53 bits of 2/pi, half = 0.5)
      fcvtzdw(n, v2);          // n = (int) v2
      frintzd(v2, v2);         // v2 = fn = (double) n
      fmsubd(v3, v2, v6, v31); // v3 = r = t - fn * pio2_1
      fmuld(v26, v2, v7);      // v26 = w = fn * pio2_1t
      fsubd(v4, v3, v26);      // y[0] = r - w. Calculated before branch
      cmp(n, (u1)32);
      br(GT, LARGE_ELSE);
      subw(tmp5, n, 1);        // tmp5 = n - 1
      ldrw(jv, Address(ih, tmp5, Address::lsl(2))); // jv = npio2_hw[n - 1]
      cmp(ix, jv);
      br(NE, X_IS_MEDIUM_BRANCH_DONE); // quick check passed: no cancellation

      block_comment("else block for if(n<32&&ix!=npio2_hw[n-1])"); {
        bind(LARGE_ELSE);
          fmovd(jx, v4);
          lsr(tmp5, ix, 20);                       // j = ix >> 20
          lsl(jx, jx, 1);                          // drop the sign bit of y[0]
          sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1);   // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);

          block_comment("if(i>16)"); {
            cmp(tmp3, (u1)16);
            br(LE, X_IS_MEDIUM_BRANCH_DONE);
            // i > 16. 2nd iteration needed
            ldpd(v6, v7, Address(ih, -32));        // v6 = pio2_2, v7 = pio2_2t
            fmovd(v28, v3);                        // t = r
            fmuld(v29, v2, v6);                    // w = v29 = fn * pio2_2
            fsubd(v3, v28, v29);                   // r = t - w
            fsubd(v31, v28, v3);                   // v31 = (t - r)
            fsubd(v31, v29, v31);                  // v31 = w - (t - r) = - ((t - r) - w)
            fmaddd(v26, v2, v7, v31);              // v26 = w = fn*pio2_2t - ((t - r) - w)
            fsubd(v4, v3, v26);                    // y[0] = r - w
            fmovd(jx, v4);
            lsl(jx, jx, 1);
            sub(tmp3, tmp5, jx, LSR, 32 + 20 + 1); // r7 = j-(((*(i0+(int*)&y[0]))>>20)&0x7ff);

            block_comment("if(i>49)"); {
              cmp(tmp3, (u1)49);
              br(LE, X_IS_MEDIUM_BRANCH_DONE);
              // 3rd iteration need, 151 bits acc
              ldpd(v6, v7, Address(ih, -16));      // v6 = pio2_3, v7 = pio2_3t
              fmovd(v28, v3);                      // save "r"
              fmuld(v29, v2, v6);                  // v29 = fn * pio2_3
              fsubd(v3, v28, v29);                 // r = r - w
              fsubd(v31, v28, v3);                 // v31 = (t - r)
              fsubd(v31, v29, v31);                // v31 = w - (t - r) = - ((t - r) - w)
              fmaddd(v26, v2, v7, v31);            // v26 = w = fn*pio2_3t - ((t - r) - w)
              fsubd(v4, v3, v26);                  // y[0] = r - w
            }
          }
      }
    block_comment("medium x tail"); {
      bind(X_IS_MEDIUM_BRANCH_DONE);
        fsubd(v5, v3, v4);                         // v5 = y[1] = (r - y[0])
        fsubd(v5, v5, v26);                        // v5 = y[1] = (r - y[0]) - w
        cmp(X, zr);
        br(GT, REDUCTION_DONE);
        // x was negative: the reduction was done on |x|, so flip the results
        fnegd(v4, v4);
        negw(n, n);
        fnegd(v5, v5);
        b(REDUCTION_DONE);
    }
  }

  block_comment("all other (large) arguments"); {
    bind(X_IS_LARGE);
      lsr(rscratch1, ix, 20);                      // ix >> 20
      movz(tmp5, 0x4170, 48);                      // bit pattern of 2^24
      subw(rscratch1, rscratch1, 1046);            // e0
      fmovd(v10, tmp5);                            // init two24A value
      subw(jv, ix, rscratch1, LSL, 20);            // ix - (e0<<20)
      lsl(jv, jv, 32);                             // move it into the high word
      subw(rscratch2, rscratch1, 3);               // rscratch2 = e0 - 3
      bfm(jv, X, 0, 31);                           // jv = z
      movw(i, 24);
      fmovd(v26, jv);                              // v26 = z

      block_comment("unrolled for(i=0;i<2;i++) {tx[i] = (double)((int)(z));z = (z-tx[i])*two24A;}"); {
        // tx[0,1,2] = v6,v7,v26
        frintzd(v6, v26);                          // v6 = (double)((int)v26)
        sdivw(jv, rscratch2, i);                   // jv = (e0 - 3)/24
        fsubd(v26, v26, v6);
        // Reserve 560 bytes of stack scratch space.
        // NOTE(review): nothing in this function releases it; presumably
        // generate__kernel_rem_pio2 restores sp - confirm.
        sub(sp, sp, 560);
        fmuld(v26, v26, v10);
        frintzd(v7, v26);                          // v7 = (double)((int)v26)
        movw(jx, 2); // calculate jx as nx - 1, which is initially 2. Not a part of unrolled loop
        fsubd(v26, v26, v7);
      }

      block_comment("nx calculation with unrolled while(tx[nx-1]==zeroA) nx--;"); {
        fcmpd(v26, 0.0);                           // if NE then jx == 2. else it's 1 or 0
        add(iqBase, sp, 480);                      // base of iq[]
        fmuld(v3, v26, v10);
        br(NE, NX_SET);
        fcmpd(v7, 0.0);                            // v7 == 0 => jx = 0. Else jx = 1
        csetw(jx, NE);
      }
    bind(NX_SET);
      generate__kernel_rem_pio2(two_over_pi, pio2);
      // now we have y[0] = v4, y[1] = v5 and n = r2
      cmp(X, zr);
      br(GE, REDUCTION_DONE);
      // x was negative: flip the results computed for |x|
      fnegd(v4, v4);
      fnegd(v5, v5);
      negw(n, n);
  }
  bind(REDUCTION_DONE);
}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   402
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   403
///*
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   404
// * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   405
// * double x[],y[]; int e0,nx,prec; int ipio2[];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   406
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   407
// * __kernel_rem_pio2 returns the last three digits of N with
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   408
// *              y = x - N*pi/2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   409
// * so that |y| < pi/2.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   410
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   411
// * The method is to compute the integer (mod 8) and fraction parts of
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   412
// * (2/pi)*x without doing the full multiplication. In general we
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   413
// * skip the parts of the product that are known to be a huge integer (
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   414
// * more accurately, = 0 mod 8 ). Thus the number of operations is
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   415
// * independent of the exponent of the input.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   416
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   417
// * NOTE: 2/pi int representation is converted to double
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   418
// * // (2/pi) is represented by an array of 24-bit integers in ipio2[].
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   419
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   420
// * Input parameters:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   421
// *      x[]     The input value (must be positive) is broken into nx
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   422
// *              pieces of 24-bit integers in double precision format.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   423
// *              x[i] will be the i-th 24 bit of x. The scaled exponent
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   424
// *              of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   425
// *              matches x up to 24 bits).
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   426
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   427
// *              Example of breaking a double positive z into x[0]+x[1]+x[2]:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   428
// *                      e0 = ilogb(z)-23
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   429
// *                      z  = scalbn(z,-e0)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   430
// *              for i = 0,1,2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   431
// *                      x[i] = floor(z)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   432
// *                      z    = (z-x[i])*2**24
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   433
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   434
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   435
// *      y[]     output result in an array of double precision numbers.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   436
// *              The dimension of y[] is:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   437
// *                      24-bit  precision       1
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   438
// *                      53-bit  precision       2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   439
// *                      64-bit  precision       2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   440
// *                      113-bit precision       3
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   441
// *              The actual value is the sum of them. Thus for 113-bit
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   442
// *              precision, one may have to do something like:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   443
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   444
// *              long double t,w,r_head, r_tail;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   445
// *              t = (long double)y[2] + (long double)y[1];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   446
// *              w = (long double)y[0];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   447
// *              r_head = t+w;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   448
// *              r_tail = w - (r_head - t);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   449
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   450
// *      e0      The exponent of x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   451
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   452
// *      nx      dimension of x[]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   453
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   454
// *      prec    an integer indicating the precision:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   455
// *                      0       24  bits (single)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   456
// *                      1       53  bits (double)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   457
// *                      2       64  bits (extended)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   458
// *                      3       113 bits (quad)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   459
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   460
// *      NOTE: ipio2[] array below is converted to double representation
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   461
// *      //ipio2[]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   462
// *      //        integer array, contains the (24*i)-th to (24*i+23)-th
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   463
// *      //        bit of 2/pi after binary point. The corresponding
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   464
// *      //        floating value is
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   465
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   466
// *                      ipio2[i] * 2^(-24(i+1)).
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   467
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   468
// * Here is the description of some local variables:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   469
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   470
// *      jk      jk+1 is the initial number of terms of ipio2[] needed
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   471
// *              in the computation. The recommended value is 2,3,4,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   472
// *              6 for single, double, extended, and quad.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   473
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   474
// *      jz      local integer variable indicating the number of
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   475
// *              terms of ipio2[] used.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   476
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   477
// *      jx      nx - 1
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   478
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   479
// *      jv      index for pointing to the suitable ipio2[] for the
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   480
// *              computation. In general, we want
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   481
// *                      ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   482
// *              is an integer. Thus
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   483
// *                      e0-3-24*jv >= 0 or (e0-3)/24 >= jv
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   484
// *              Hence jv = max(0,(e0-3)/24).
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   485
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   486
// *      jp      jp+1 is the number of terms in PIo2[] needed, jp = jk.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   487
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   488
// *      q[]     double array with integral value, representing the
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   489
// *              24-bits chunk of the product of x and 2/pi.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   490
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   491
// *      q0      the corresponding exponent of q[0]. Note that the
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   492
// *              exponent for q[i] would be q0-24*i.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   493
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   494
// *      PIo2[]  double precision array, obtained by cutting pi/2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   495
// *              into 24 bits chunks.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   496
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   497
// *      f[]     ipio2[] in floating point
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   498
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   499
// *      iq[]    integer array by breaking up q[] in 24-bits chunk.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   500
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   501
// *      fq[]    final product of x*(2/pi) in fq[0],..,fq[jk]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   502
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   503
// *      ih      integer. If >0 it indicates q[] is >= 0.5, hence
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   504
// *              it also indicates the *sign* of the result.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   505
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   506
// */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   507
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   508
// Use PIo2 table (see stubRoutines_aarch64.cpp)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   509
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   510
// BEGIN __kernel_rem_pio2 PSEUDO CODE
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   511
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   512
//static int __kernel_rem_pio2(double *x, double *y, int e0, int nx, int prec, /* NOTE: converted to double */ const double *ipio2 // const int *ipio2) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   513
//  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   514
//  double z,fw,f[20],fq[20],q[20];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   515
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   516
//  /* initialize jk*/
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   517
//  // jk = init_jk[prec]; // NOTE: prec==2 for double. jk is always 4.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   518
//  jp = jk; // NOTE: always 4
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   519
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   520
//  /* determine jx,jv,q0, note that 3>q0 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   521
//  jx =  nx-1;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   522
//  jv = (e0-3)/24; if(jv<0) jv=0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   523
//  q0 =  e0-24*(jv+1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   524
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   525
//  /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   526
//  j = jv-jx; m = jx+jk;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   527
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   528
//  // NOTE: split into two for-loops: one with zeroB and one with ipio2[j]. It
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   529
//  //       allows the use of wider loads/stores
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   530
//  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; //(double) ipio2[j];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   531
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   532
//  // NOTE: unrolled and vectorized "for". See comments in asm code
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   533
//  /* compute q[0],q[1],...q[jk] */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   534
//  for (i=0;i<=jk;i++) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   535
//    for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   536
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   537
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   538
//  jz = jk;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   539
//recompute:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   540
//  /* distill q[] into iq[] reversingly */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   541
//  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   542
//    fw    =  (double)((int)(twon24* z));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   543
//    iq[i] =  (int)(z-two24B*fw);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   544
//    z     =  q[j-1]+fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   545
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   546
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   547
//  /* compute n */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   548
//  z  = scalbnA(z,q0);           /* actual value of z */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   549
//  z -= 8.0*floor(z*0.125);              /* trim off integer >= 8 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   550
//  n  = (int) z;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   551
//  z -= (double)n;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   552
//  ih = 0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   553
//  if(q0>0) {    /* need iq[jz-1] to determine n */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   554
//    i  = (iq[jz-1]>>(24-q0)); n += i;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   555
//    iq[jz-1] -= i<<(24-q0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   556
//    ih = iq[jz-1]>>(23-q0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   557
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   558
//  else if(q0==0) ih = iq[jz-1]>>23;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   559
//  else if(z>=0.5) ih=2;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   560
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   561
//  if(ih>0) {    /* q > 0.5 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   562
//    n += 1; carry = 0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   563
//    for(i=0;i<jz ;i++) {        /* compute 1-q */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   564
//      j = iq[i];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   565
//      if(carry==0) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   566
//        if(j!=0) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   567
//          carry = 1; iq[i] = 0x1000000- j;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   568
//        }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   569
//      } else  iq[i] = 0xffffff - j;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   570
//    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   571
//    if(q0>0) {          /* rare case: chance is 1 in 12 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   572
//      switch(q0) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   573
//      case 1:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   574
//        iq[jz-1] &= 0x7fffff; break;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   575
//      case 2:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   576
//        iq[jz-1] &= 0x3fffff; break;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   577
//      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   578
//    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   579
//    if(ih==2) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   580
//      z = one - z;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   581
//      if(carry!=0) z -= scalbnA(one,q0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   582
//    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   583
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   584
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   585
//  /* check if recomputation is needed */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   586
//  if(z==zeroB) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   587
//    j = 0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   588
//    for (i=jz-1;i>=jk;i--) j |= iq[i];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   589
//    if(j==0) { /* need recomputation */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   590
//      for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   591
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   592
//      for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   593
//        f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   594
//        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   595
//        q[i] = fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   596
//      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   597
//      jz += k;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   598
//      goto recompute;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   599
//    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   600
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   601
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   602
//  /* chop off zero terms */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   603
//  if(z==0.0) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   604
//    jz -= 1; q0 -= 24;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   605
//    while(iq[jz]==0) { jz--; q0-=24;}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   606
//  } else { /* break z into 24-bit if necessary */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   607
//    z = scalbnA(z,-q0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   608
//    if(z>=two24B) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   609
//      fw = (double)((int)(twon24*z));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   610
//      iq[jz] = (int)(z-two24B*fw);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   611
//      jz += 1; q0 += 24;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   612
//      iq[jz] = (int) fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   613
//    } else iq[jz] = (int) z ;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   614
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   615
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   616
//  /* convert integer "bit" chunk to floating-point value */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   617
//  fw = scalbnA(one,q0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   618
//  for(i=jz;i>=0;i--) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   619
//    q[i] = fw*(double)iq[i]; fw*=twon24;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   620
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   621
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   622
//  /* compute PIo2[0,...,jp]*q[jz,...,0] */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   623
//  for(i=jz;i>=0;i--) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   624
//    for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   625
//    fq[jz-i] = fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   626
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   627
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   628
//  // NOTE: switch below is eliminated, because prec is always 2 for doubles
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   629
//  /* compress fq[] into y[] */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   630
//  //switch(prec) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   631
//  //case 0:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   632
//  //  fw = 0.0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   633
//  //  for (i=jz;i>=0;i--) fw += fq[i];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   634
//  //  y[0] = (ih==0)? fw: -fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   635
//  //  break;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   636
//  //case 1:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   637
//  //case 2:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   638
//    fw = 0.0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   639
//    for (i=jz;i>=0;i--) fw += fq[i];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   640
//    y[0] = (ih==0)? fw: -fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   641
//    fw = fq[0]-fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   642
//    for (i=1;i<=jz;i++) fw += fq[i];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   643
//    y[1] = (ih==0)? fw: -fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   644
//  //  break;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   645
//  //case 3:       /* painful */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   646
//  //  for (i=jz;i>0;i--) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   647
//  //    fw      = fq[i-1]+fq[i];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   648
//  //    fq[i]  += fq[i-1]-fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   649
//  //    fq[i-1] = fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   650
//  //  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   651
//  //  for (i=jz;i>1;i--) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   652
//  //    fw      = fq[i-1]+fq[i];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   653
//  //    fq[i]  += fq[i-1]-fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   654
//  //    fq[i-1] = fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   655
//  //  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   656
//  //  for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   657
//  //  if(ih==0) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   658
//  //    y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   659
//  //  } else {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   660
//  //    y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   661
//  //  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   662
//  //}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   663
//  return n&7;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   664
//}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   665
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   666
// END __kernel_rem_pio2 PSEUDO CODE
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   667
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   668
// Changes between fdlibm and intrinsic:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   669
//     1. One loop is unrolled and vectorized (see comments in code)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   670
//     2. One loop is split into 2 loops (see comments in code)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   671
//     3. Non-double code is removed (last switch). Several variables became
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   672
//         constants because of that (see comments in code)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   673
//     4. Use of jx, which is nx-1 instead of nx
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   674
// Assumptions:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   675
//     1. Assume |X| >= PI/4
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   676
// Input and output:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   677
//     1. Input: X = r0, jx == nx - 1 == r6, e0 == rscratch1
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   678
//     2. Return n in r2, y[0] == y0 == v4, y[1] == y1 == v5
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   679
// NOTE: general purpose register names match local variable names in C code
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   680
// NOTE: fpu registers are actively reused. See comments in code about their usage
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   681
void MacroAssembler::generate__kernel_rem_pio2(address two_over_pi, address pio2) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   682
  Label Q_DONE, JX_IS_0, JX_IS_2, COMP_INNER_LOOP, RECOMP_FOR2, Q0_ZERO_CMP_LT,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   683
      RECOMP_CHECK_DONE_NOT_ZERO, Q0_ZERO_CMP_DONE, COMP_FOR, Q0_ZERO_CMP_EQ,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   684
      INIT_F_ZERO, RECOMPUTE, IH_FOR_INCREMENT, IH_FOR_STORE, RECOMP_CHECK_DONE,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   685
      Z_IS_LESS_THAN_TWO24B, Z_IS_ZERO, FW_Y1_NO_NEGATION,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   686
      RECOMP_FW_UPDATED, Z_ZERO_CHECK_DONE, FW_FOR1, IH_AFTER_SWITCH, IH_HANDLED,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   687
      CONVERTION_FOR, FW_Y0_NO_NEGATION, FW_FOR1_DONE, FW_FOR2, FW_FOR2_DONE,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   688
      IH_FOR, SKIP_F_LOAD, RECOMP_FOR1, RECOMP_FIRST_FOR, INIT_F_COPY,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   689
      RECOMP_FOR1_CHECK;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   690
  Register tmp2 = r1, n = r2, jv = r4, tmp5 = r5, jx = r6,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   691
      tmp3 = r7, iqBase = r10, ih = r11, tmp4 = r12, tmp1 = r13,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   692
      jz = r14, j = r15, twoOverPiBase = r16, i = r17, qBase = r18;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   693
    // jp = jk == init_jk[prec] = init_jk[2] == {2,3,4,6}[2] == 4
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   694
    // jx = nx - 1
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   695
    lea(twoOverPiBase, ExternalAddress(two_over_pi));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   696
    cmpw(jv, zr);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   697
    addw(tmp4, jx, 4); // tmp4 = m = jx + jk = jx + 4. jx is in {0,1,2} so m is in [4,5,6]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   698
    cselw(jv, jv, zr, GE);
55398
e53ec3b362f4 8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents: 51739
diff changeset
   699
    fmovd(v26, 0.0);
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   700
    addw(tmp5, jv, 1);                    // jv+1
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   701
    subsw(j, jv, jx);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   702
    add(qBase, sp, 320);                  // base of q[]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   703
    msubw(rscratch1, i, tmp5, rscratch1); // q0 =  e0-24*(jv+1)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   704
    // use double f[20], fq[20], q[20], iq[20] on stack, which is
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   705
    // (20 + 20 + 20) x 8 + 20 x 4 = 560 bytes. From lower to upper addresses it
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   706
    // will contain f[20], fq[20], q[20], iq[20]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   707
    // now initialize f[20] indexes 0..m (inclusive)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   708
    // for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   709
    mov(tmp5, sp);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   710
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   711
    block_comment("for(i=0;i<=m;i++,j++) f[i] = (j<0)? zeroB : /* NOTE: converted to double */ ipio2[j]; // (double) ipio2[j];"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   712
        eorw(i, i, i);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   713
        br(GE, INIT_F_COPY);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   714
      bind(INIT_F_ZERO);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   715
        stpq(v26, v26, Address(post(tmp5, 32)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   716
        addw(i, i, 4);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   717
        addsw(j, j, 4);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   718
        br(LT, INIT_F_ZERO);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   719
        subw(i, i, j);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   720
        movw(j, zr);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   721
      bind(INIT_F_COPY);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   722
        add(tmp1, twoOverPiBase, j, LSL, 3); // ipio2[j] start address
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   723
        ld1(v18, v19, v20, v21, T16B, tmp1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   724
        add(tmp5, sp, i, ext::uxtx, 3);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   725
        st1(v18, v19, v20, v21, T16B, tmp5);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   726
    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   727
    // v18..v21 can actually contain f[0..7]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   728
    cbz(i, SKIP_F_LOAD); // i == 0 => f[i] == f[0] => already loaded
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   729
    ld1(v18, v19, v20, v21, T2D, Address(sp)); // load f[0..7]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   730
  bind(SKIP_F_LOAD);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   731
    // calculate 2^q0 and 2^-q0, which we'll need further.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   732
    // q0 is exponent. So, calculate biased exponent(q0+1023)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   733
    negw(tmp4, rscratch1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   734
    addw(tmp5, rscratch1, 1023);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   735
    addw(tmp4, tmp4, 1023);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   736
    // Unroll following for(s) depending on jx in [0,1,2]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   737
    // for (i=0;i<=jk;i++) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   738
    //   for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   739
    // }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   740
    // Unrolling for jx == 0 case:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   741
    //   q[0] = x[0] * f[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   742
    //   q[1] = x[0] * f[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   743
    //   q[2] = x[0] * f[2]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   744
    //   q[3] = x[0] * f[3]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   745
    //   q[4] = x[0] * f[4]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   746
    //
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   747
    // Vectorization for unrolled jx == 0 case:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   748
    //   {q[0], q[1]} = {f[0], f[1]} * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   749
    //   {q[2], q[3]} = {f[2], f[3]} * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   750
    //   q[4] = f[4] * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   751
    //
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   752
    // Unrolling for jx == 1 case:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   753
    //   q[0] = x[0] * f[1] + x[1] * f[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   754
    //   q[1] = x[0] * f[2] + x[1] * f[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   755
    //   q[2] = x[0] * f[3] + x[1] * f[2]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   756
    //   q[3] = x[0] * f[4] + x[1] * f[3]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   757
    //   q[4] = x[0] * f[5] + x[1] * f[4]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   758
    //
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   759
    // Vectorization for unrolled jx == 1 case:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   760
    //   {q[0], q[1]} = {f[0], f[1]} * x[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   761
    //   {q[2], q[3]} = {f[2], f[3]} * x[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   762
    //   q[4] = f[4] * x[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   763
    //   {q[0], q[1]} += {f[1], f[2]} * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   764
    //   {q[2], q[3]} += {f[3], f[4]} * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   765
    //   q[4] += f[5] * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   766
    //
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   767
    // Unrolling for jx == 2 case:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   768
    //   q[0] = x[0] * f[2] + x[1] * f[1] + x[2] * f[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   769
    //   q[1] = x[0] * f[3] + x[1] * f[2] + x[2] * f[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   770
    //   q[2] = x[0] * f[4] + x[1] * f[3] + x[2] * f[2]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   771
    //   q[3] = x[0] * f[5] + x[1] * f[4] + x[2] * f[3]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   772
    //   q[4] = x[0] * f[6] + x[1] * f[5] + x[2] * f[4]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   773
    //
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   774
    // Vectorization for unrolled jx == 2 case:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   775
    //   {q[0], q[1]} = {f[0], f[1]} * x[2]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   776
    //   {q[2], q[3]} = {f[2], f[3]} * x[2]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   777
    //   q[4] = f[4] * x[2]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   778
    //   {q[0], q[1]} += {f[1], f[2]} * x[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   779
    //   {q[2], q[3]} += {f[3], f[4]} * x[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   780
    //   q[4] += f[5] * x[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   781
    //   {q[0], q[1]} += {f[2], f[3]} * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   782
    //   {q[2], q[3]} += {f[4], f[5]} * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   783
    //   q[4] += f[6] * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   784
  block_comment("unrolled and vectorized computation of q[0]..q[jk]"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   785
      cmpw(jx, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   786
      lsl(tmp5, tmp5, 52);                     // now it's 2^q0 double value
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   787
      lsl(tmp4, tmp4, 52);                     // now it's 2^-q0 double value
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   788
      br(LT, JX_IS_0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   789
      add(i, sp, 8);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   790
      ldpq(v26, v27, i);                       // load f[1..4]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   791
      br(GT, JX_IS_2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   792
      // jx == 1
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   793
      fmulxvs(v28, T2D, v18, v7);              // f[0,1] * x[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   794
      fmulxvs(v29, T2D, v19, v7);              // f[2,3] * x[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   795
      fmuld(v30, v20, v7);                     // f[4] * x[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   796
      fmlavs(v28, T2D, v26, v6, 0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   797
      fmlavs(v29, T2D, v27, v6, 0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   798
      fmlavs(v30, T2D, v6, v20, 1);            // v30 += f[5] * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   799
      b(Q_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   800
    bind(JX_IS_2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   801
      fmulxvs(v28, T2D, v18, v3);              // f[0,1] * x[2]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   802
      fmulxvs(v29, T2D, v19, v3);              // f[2,3] * x[2]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   803
      fmuld(v30, v20, v3);                     // f[4] * x[2]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   804
      fmlavs(v28, T2D, v26, v7, 0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   805
      fmlavs(v29, T2D, v27, v7, 0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   806
      fmlavs(v30, T2D, v7, v20, 1);            // v30 += f[5] * x[1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   807
      fmlavs(v28, T2D, v19, v6, 0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   808
      fmlavs(v29, T2D, v20, v6, 0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   809
      fmlavs(v30, T2D, v6, v21, 0);            // v30 += f[6] * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   810
      b(Q_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   811
    bind(JX_IS_0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   812
      fmulxvs(v28, T2D, v18, v6);              // f[0,1] * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   813
      fmulxvs(v29, T2D, v19, v6);              // f[2,3] * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   814
      fmuld(v30, v20, v6);                     // f[4] * x[0]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   815
    bind(Q_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   816
      st1(v28, v29, v30, T2D, Address(qBase)); // save calculated q[0]...q[jk]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   817
  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   818
  movz(i, 0x3E70, 48);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   819
  movw(jz, 4);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   820
  fmovd(v17, i);                               // v17 = twon24
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   821
  fmovd(v30, tmp5);                            // 2^q0
55398
e53ec3b362f4 8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents: 51739
diff changeset
   822
  fmovd(v21, 0.125);
e53ec3b362f4 8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents: 51739
diff changeset
   823
  fmovd(v20, 8.0);
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   824
  fmovd(v22, tmp4);                            // 2^-q0
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   825
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   826
  block_comment("recompute loop"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   827
    bind(RECOMPUTE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   828
      //  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   829
      //    fw    =  (double)((int)(twon24* z));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   830
      //    iq[i] =  (int)(z-two24A*fw);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   831
      //    z     =  q[j-1]+fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   832
      //  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   833
      block_comment("distill q[] into iq[] reversingly"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   834
          eorw(i, i, i);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   835
          movw(j, jz);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   836
          add(tmp2, qBase, jz, LSL, 3);                    // q[jz] address
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   837
          ldrd(v18, post(tmp2, -8));                       // z = q[j] and moving address to q[j-1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   838
        bind(RECOMP_FIRST_FOR);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   839
          ldrd(v27, post(tmp2, -8));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   840
          fmuld(v29, v17, v18);                            // twon24*z
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   841
          frintzd(v29, v29);                               // (double)(int)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   842
          fmsubd(v28, v10, v29, v18);                      // v28 = z-two24A*fw
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   843
          fcvtzdw(tmp1, v28);                              // (int)(z-two24A*fw)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   844
          strw(tmp1, Address(iqBase, i, Address::lsl(2)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   845
          faddd(v18, v27, v29);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   846
          add(i, i, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   847
          subs(j, j, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   848
          br(GT, RECOMP_FIRST_FOR);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   849
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   850
      // compute n
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   851
      fmuld(v18, v18, v30);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   852
      fmuld(v2, v18, v21);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   853
      frintmd(v2, v2);                                     // v2 = floor(v2) == rounding towards -inf
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   854
      fmsubd(v18, v2, v20, v18);                           // z -= 8.0*floor(z*0.125);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   855
      movw(ih, 2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   856
      frintzd(v2, v18);                                    // v2 = (double)((int)z)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   857
      fcvtzdw(n, v18);                                     // n  = (int) z;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   858
      fsubd(v18, v18, v2);                                 // z -= (double)n;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   859
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   860
      block_comment("q0-dependent initialization"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   861
          cmpw(rscratch1, 0);                              // if (q0 > 0)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   862
          br(LT, Q0_ZERO_CMP_LT);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   863
          subw(j, jz, 1);                                  // j = jz - 1
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   864
          ldrw(tmp2, Address(iqBase, j, Address::lsl(2))); // tmp2 = iq[jz-1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   865
          br(EQ, Q0_ZERO_CMP_EQ);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   866
          movw(tmp4, 24);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   867
          subw(tmp4, tmp4, rscratch1);                     // == 24 - q0
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   868
          lsrvw(i, tmp2, tmp4);                            // i = iq[jz-1] >> (24-q0)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   869
          lslvw(tmp5, i, tmp4);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   870
          subw(tmp2, tmp2, tmp5);                          // iq[jz-1] -= i<<(24-q0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   871
          strw(tmp2, Address(iqBase, j, Address::lsl(2))); // store iq[jz-1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   872
          subw(rscratch2, tmp4, 1);                        // == 23 - q0
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   873
          addw(n, n, i);                                   // n+=i
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   874
          lsrvw(ih, tmp2, rscratch2);                      // ih = iq[jz-1] >> (23-q0)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   875
          b(Q0_ZERO_CMP_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   876
        bind(Q0_ZERO_CMP_EQ);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   877
          lsr(ih, tmp2, 23);                               // ih = iq[jz-1] >> 23
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   878
          b(Q0_ZERO_CMP_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   879
        bind(Q0_ZERO_CMP_LT);
55398
e53ec3b362f4 8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents: 51739
diff changeset
   880
          fmovd(v4, 0.5);
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   881
          fcmpd(v18, v4);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   882
          cselw(ih, zr, ih, LT);                           // if (z<0.5) ih = 0
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   883
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   884
    bind(Q0_ZERO_CMP_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   885
      cmpw(ih, zr);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   886
      br(LE, IH_HANDLED);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   887
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   888
    block_comment("if(ih>) {"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   889
      // use rscratch2 as carry
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   890
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   891
      block_comment("for(i=0;i<jz ;i++) {...}"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   892
          addw(n, n, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   893
          eorw(i, i, i);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   894
          eorw(rscratch2, rscratch2, rscratch2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   895
        bind(IH_FOR);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   896
          ldrw(j, Address(iqBase, i, Address::lsl(2)));    // j = iq[i]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   897
          movw(tmp3, 0x1000000);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   898
          subw(tmp3, tmp3, rscratch2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   899
          cbnzw(rscratch2, IH_FOR_STORE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   900
          cbzw(j, IH_FOR_INCREMENT);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   901
          movw(rscratch2, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   902
        bind(IH_FOR_STORE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   903
          subw(tmp3, tmp3, j);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   904
          strw(tmp3, Address(iqBase, i, Address::lsl(2))); // iq[i] = (carry ? 0xffffff : 0x1000000) - j
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   905
        bind(IH_FOR_INCREMENT);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   906
          addw(i, i, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   907
          cmpw(i, jz);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   908
          br(LT, IH_FOR);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   909
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   910
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   911
      block_comment("if(q0>0) {"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   912
        cmpw(rscratch1, zr);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   913
        br(LE, IH_AFTER_SWITCH);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   914
        // tmp3 still has iq[jz-1] value. no need to reload
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   915
        // now, zero high tmp3 bits (rscratch1 number of bits)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   916
        movw(j, -1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   917
        subw(i, jz, 1);                                    // set i to jz-1
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   918
        lsrv(j, j, rscratch1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   919
        andw(tmp3, tmp3, j, LSR, 8);                       // we have 24-bit-based constants
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   920
        strw(tmp3, Address(iqBase, i, Address::lsl(2)));   // save iq[jz-1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   921
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   922
      bind(IH_AFTER_SWITCH);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   923
        cmpw(ih, 2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   924
        br(NE, IH_HANDLED);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   925
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   926
        block_comment("if(ih==2) {"); {
55398
e53ec3b362f4 8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents: 51739
diff changeset
   927
          fmovd(v25, 1.0);
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   928
          fsubd(v18, v25, v18);                            // z = one - z;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   929
          cbzw(rscratch2, IH_HANDLED);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   930
          fsubd(v18, v18, v30);                            // z -= scalbnA(one,q0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   931
        }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   932
    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   933
    bind(IH_HANDLED);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   934
      // check if recomputation is needed
55398
e53ec3b362f4 8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents: 51739
diff changeset
   935
      fcmpd(v18, 0.0);
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   936
      br(NE, RECOMP_CHECK_DONE_NOT_ZERO);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   937
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   938
      block_comment("if(z==zeroB) {"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   939
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   940
        block_comment("for (i=jz-1;i>=jk;i--) j |= iq[i];"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   941
            subw(i, jz, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   942
            eorw(j, j, j);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   943
            b(RECOMP_FOR1_CHECK);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   944
          bind(RECOMP_FOR1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   945
            ldrw(tmp1, Address(iqBase, i, Address::lsl(2)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   946
            orrw(j, j, tmp1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   947
            subw(i, i, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   948
          bind(RECOMP_FOR1_CHECK);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   949
            cmpw(i, 4);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   950
            br(GE, RECOMP_FOR1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   951
        }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   952
        cbnzw(j, RECOMP_CHECK_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   953
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   954
        block_comment("if(j==0) {"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   955
            // for(k=1;iq[jk-k]==0;k++); // let's unroll it. jk == 4. So, read
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   956
            // iq[3], iq[2], iq[1], iq[0] until non-zero value
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   957
            ldp(tmp1, tmp3, iqBase);               // iq[0..3]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   958
            movw(j, 2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   959
            cmp(tmp3, zr);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   960
            csel(tmp1, tmp1, tmp3, EQ);            // set register for further consideration
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   961
            cselw(j, j, zr, EQ);                   // set initial k. Use j as k
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   962
            cmp(zr, tmp1, LSR, 32);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   963
            addw(i, jz, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   964
            csincw(j, j, j, NE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   965
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   966
          block_comment("for(i=jz+1;i<=jz+k;i++) {...}"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   967
              addw(jz, i, j); // i = jz+1, j = k-1. j+i = jz+k (which is a new jz)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   968
            bind(RECOMP_FOR2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   969
              addw(tmp1, jv, i);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   970
              ldrd(v29, Address(twoOverPiBase, tmp1, Address::lsl(3)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   971
              addw(tmp2, jx, i);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   972
              strd(v29, Address(sp, tmp2, Address::lsl(3)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   973
              // f[jx+i] = /* NOTE: converted to double */ ipio2[jv+i]; //(double) ipio2[jv+i];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   974
              // since jx = 0, 1 or 2 we can unroll it:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   975
              // for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   976
              // f[jx+i-j] == (for first iteration) f[jx+i], which is already v29
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   977
              add(tmp2, sp, tmp2, ext::uxtx, 3); // address of f[jx+i]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   978
              ldpd(v4, v5, Address(tmp2, -16)); // load f[jx+i-2] and f[jx+i-1]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   979
              fmuld(v26, v6, v29); // initial fw
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   980
              cbzw(jx, RECOMP_FW_UPDATED);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   981
              fmaddd(v26, v7, v5, v26);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   982
              cmpw(jx, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   983
              br(EQ, RECOMP_FW_UPDATED);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   984
              fmaddd(v26, v3, v4, v26);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   985
            bind(RECOMP_FW_UPDATED);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   986
              strd(v26, Address(qBase, i, Address::lsl(3))); // q[i] = fw;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   987
              addw(i, i, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   988
              cmpw(i, jz);                                   // jz here is "old jz" + k
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   989
              br(LE, RECOMP_FOR2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   990
          }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   991
            b(RECOMPUTE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   992
        }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   993
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   994
    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   995
    bind(RECOMP_CHECK_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   996
      // chop off zero terms
55398
e53ec3b362f4 8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents: 51739
diff changeset
   997
      fcmpd(v18, 0.0);
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   998
      br(EQ, Z_IS_ZERO);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
   999
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1000
      block_comment("else block of if(z==0.0) {"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1001
        bind(RECOMP_CHECK_DONE_NOT_ZERO);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1002
          fmuld(v18, v18, v22);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1003
          fcmpd(v18, v10);                                   // v10 is still two24A
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1004
          br(LT, Z_IS_LESS_THAN_TWO24B);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1005
          fmuld(v1, v18, v17);                               // twon24*z
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1006
          frintzd(v1, v1);                                   // v1 = (double)(int)(v1)
51739
7bed934d439e 8210461: AArch64: Math.cos intrinsic gives incorrect results
dpochepk
parents: 51374
diff changeset
  1007
          fmsubd(v2, v10, v1, v18);
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1008
          fcvtzdw(tmp3, v1);                                 // (int)fw
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1009
          fcvtzdw(tmp2, v2);                                 // double to int
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1010
          strw(tmp2, Address(iqBase, jz, Address::lsl(2)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1011
          addw(rscratch1, rscratch1, 24);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1012
          addw(jz, jz, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1013
          strw(tmp3, Address(iqBase, jz, Address::lsl(2)));  // iq[jz] = (int) fw
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1014
          b(Z_ZERO_CHECK_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1015
        bind(Z_IS_LESS_THAN_TWO24B);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1016
          fcvtzdw(tmp3, v18);                                // (int)z
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1017
          strw(tmp3, Address(iqBase, jz, Address::lsl(2)));  // iq[jz] = (int) z
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1018
          b(Z_ZERO_CHECK_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1019
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1020
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1021
      block_comment("if(z==0.0) {"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1022
        bind(Z_IS_ZERO);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1023
          subw(jz, jz, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1024
          ldrw(tmp1, Address(iqBase, jz, Address::lsl(2)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1025
          subw(rscratch1, rscratch1, 24);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1026
          cbz(tmp1, Z_IS_ZERO);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1027
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1028
      bind(Z_ZERO_CHECK_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1029
        // convert integer "bit" chunk to floating-point value
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1030
        // v17 = twon24
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1031
        // update v30, which was scalbnA(1.0, <old q0>);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1032
        addw(tmp2, rscratch1, 1023); // biased exponent
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1033
        lsl(tmp2, tmp2, 52); // put at correct position
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1034
        mov(i, jz);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1035
        fmovd(v30, tmp2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1036
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1037
        block_comment("for(i=jz;i>=0;i--) {q[i] = fw*(double)iq[i]; fw*=twon24;}"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1038
          bind(CONVERTION_FOR);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1039
            ldrw(tmp1, Address(iqBase, i, Address::lsl(2)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1040
            scvtfwd(v31, tmp1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1041
            fmuld(v31, v31, v30);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1042
            strd(v31, Address(qBase, i, Address::lsl(3)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1043
            fmuld(v30, v30, v17);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1044
            subsw(i, i, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1045
            br(GE, CONVERTION_FOR);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1046
        }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1047
        add(rscratch2, sp, 160); // base for fq
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1048
        // reusing twoOverPiBase
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1049
        lea(twoOverPiBase, ExternalAddress(pio2));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1050
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1051
      block_comment("compute PIo2[0,...,jp]*q[jz,...,0]. for(i=jz;i>=0;i--) {...}"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1052
          movw(i, jz);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1053
          movw(tmp2, zr); // tmp2 will keep jz - i == 0 at start
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1054
        bind(COMP_FOR);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1055
          // for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
55398
e53ec3b362f4 8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents: 51739
diff changeset
  1056
          fmovd(v30, 0.0);
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1057
          add(tmp5, qBase, i, LSL, 3); // address of q[i+k] for k==0
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1058
          movw(tmp3, 4);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1059
          movw(tmp4, zr);              // used as k
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1060
          cmpw(tmp2, 4);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1061
          add(tmp1, qBase, i, LSL, 3); // used as q[i] address
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1062
          cselw(tmp3, tmp2, tmp3, LE); // min(jz - i, jp)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1063
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1064
          block_comment("for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1065
            bind(COMP_INNER_LOOP);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1066
              ldrd(v18, Address(tmp1, tmp4, Address::lsl(3)));          // q[i+k]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1067
              ldrd(v19, Address(twoOverPiBase, tmp4, Address::lsl(3))); // PIo2[k]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1068
              fmaddd(v30, v18, v19, v30);                               // fw += PIo2[k]*q[i+k];
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1069
              addw(tmp4, tmp4, 1);                                      // k++
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1070
              cmpw(tmp4, tmp3);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1071
              br(LE, COMP_INNER_LOOP);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1072
          }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1073
          strd(v30, Address(rscratch2, tmp2, Address::lsl(3)));         // fq[jz-i]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1074
          add(tmp2, tmp2, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1075
          subsw(i, i, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1076
          br(GE, COMP_FOR);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1077
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1078
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1079
      block_comment("switch(prec) {...}. case 2:"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1080
        // compress fq into y[]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1081
        // remember prec == 2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1082
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1083
        block_comment("for (i=jz;i>=0;i--) fw += fq[i];"); {
55398
e53ec3b362f4 8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents: 51739
diff changeset
  1084
            fmovd(v4, 0.0);
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1085
            mov(i, jz);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1086
          bind(FW_FOR1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1087
            ldrd(v1, Address(rscratch2, i, Address::lsl(3)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1088
            subsw(i, i, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1089
            faddd(v4, v4, v1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1090
            br(GE, FW_FOR1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1091
        }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1092
        bind(FW_FOR1_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1093
          // v1 contains fq[0]. so, keep it so far
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1094
          fsubd(v5, v1, v4); // fw = fq[0] - fw
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1095
          cbzw(ih, FW_Y0_NO_NEGATION);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1096
          fnegd(v4, v4);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1097
        bind(FW_Y0_NO_NEGATION);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1098
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1099
        block_comment("for (i=1;i<=jz;i++) fw += fq[i];"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1100
            movw(i, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1101
              cmpw(jz, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1102
            br(LT, FW_FOR2_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1103
          bind(FW_FOR2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1104
            ldrd(v1, Address(rscratch2, i, Address::lsl(3)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1105
            addw(i, i, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1106
            cmp(i, jz);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1107
            faddd(v5, v5, v1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1108
            br(LE, FW_FOR2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1109
        }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1110
        bind(FW_FOR2_DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1111
          cbz(ih, FW_Y1_NO_NEGATION);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1112
          fnegd(v5, v5);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1113
        bind(FW_Y1_NO_NEGATION);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1114
          add(sp, sp, 560);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1115
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1116
}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1117
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1118
///* __kernel_sin( x, y, iy)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1119
// * kernel sin function on [-pi/4, pi/4], pi/4 ~ 0.7854
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1120
// * Input x is assumed to be bounded by ~pi/4 in magnitude.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1121
// * Input y is the tail of x.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1122
// * Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1123
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1124
// * Algorithm
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1125
// *      1. Since sin(-x) = -sin(x), we need only to consider positive x.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1126
// *      2. if x < 2^-27 (hx<0x3e400000 0), return x with inexact if x!=0.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1127
// *      3. sin(x) is approximated by a polynomial of degree 13 on
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1128
// *         [0,pi/4]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1129
// *                               3            13
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1130
// *              sin(x) ~ x + S1*x + ... + S6*x
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1131
// *         where
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1132
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1133
// *      |sin(x)         2     4     6     8     10     12  |     -58
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1134
// *      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1135
// *      |  x                                               |
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1136
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1137
// *      4. sin(x+y) = sin(x) + sin'(x')*y
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1138
// *                  ~ sin(x) + (1-x*x/2)*y
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1139
// *         For better accuracy, let
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1140
// *                   3      2      2      2      2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1141
// *              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1142
// *         then                   3    2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1143
// *              sin(x) = x + (S1*x + (x *(r-y/2)+y))
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1144
// */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1145
//static const double
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1146
//S1  = -1.66666666666666324348e-01, /* 0xBFC55555, 0x55555549 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1147
//S2  =  8.33333333332248946124e-03, /* 0x3F811111, 0x1110F8A6 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1148
//S3  = -1.98412698298579493134e-04, /* 0xBF2A01A0, 0x19C161D5 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1149
//S4  =  2.75573137070700676789e-06, /* 0x3EC71DE3, 0x57B1FE7D */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1150
//S5  = -2.50507602534068634195e-08, /* 0xBE5AE5E6, 0x8A2B9CEB */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1151
//S6  =  1.58969099521155010221e-10; /* 0x3DE5D93A, 0x5ACFD57C */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1152
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1153
// NOTE: S1..S6 were moved into a table: StubRoutines::aarch64::_dsin_coef
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1154
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1155
// BEGIN __kernel_sin PSEUDO CODE
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1156
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1157
//static double __kernel_sin(double x, double y, bool iy)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1158
//{
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1159
//        double z,r,v;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1160
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1161
//        // NOTE: not needed. moved to dsin/dcos
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1162
//        //int ix;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1163
//        //ix = high(x)&0x7fffffff;                /* high word of x */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1164
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1165
//        // NOTE: moved to dsin/dcos
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1166
//        //if(ix<0x3e400000)                       /* |x| < 2**-27 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1167
//        //   {if((int)x==0) return x;}            /* generate inexact */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1168
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1169
//        z       =  x*x;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1170
//        v       =  z*x;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1171
//        r       =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1172
//        if(iy==0) return x+v*(S1+z*r);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1173
//        else      return x-((z*(half*y-v*r)-y)-v*S1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1174
//}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1175
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1176
// END __kernel_sin PSEUDO CODE
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1177
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1178
// Changes between fdlibm and intrinsic:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1179
//     1. Removed |x| < 2**-27 check, because it was done earlier in dsin/dcos
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1180
//     2. Constants are now loaded from table dsin_coef
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1181
//     3. C code parameter "int iy" was modified to "bool iyIsOne", because
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1182
//         iy is always 0 or 1. Also, iyIsOne branch was moved into
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1183
//         generation phase instead of taking it during code execution
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1184
// Input and output:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1185
//     1. Input for generated function: X argument = x
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1186
//     2. Input for generator: x = register to read argument from, iyIsOne
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1187
//         = flag to use the low part of the argument or not, dsin_coef = coefficients
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1188
//         table address
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1189
//     3. Return sin(x) value in v0
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1190
void MacroAssembler::generate_kernel_sin(FloatRegister x, bool iyIsOne,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1191
    address dsin_coef) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1192
  FloatRegister y = v5, z = v6, v = v7, r = v16, S1 = v17, S2 = v18,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1193
      S3 = v19, S4 = v20, S5 = v21, S6 = v22, half = v23;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1194
  lea(rscratch2, ExternalAddress(dsin_coef));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1195
  ldpd(S5, S6, Address(rscratch2, 32));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1196
  fmuld(z, x, x); // z =  x*x;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1197
  ld1(S1, S2, S3, S4, T1D, Address(rscratch2));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1198
  fmuld(v, z, x); // v =  z*x;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1199
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1200
  block_comment("calculate r =  S2+z*(S3+z*(S4+z*(S5+z*S6)))"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1201
    fmaddd(r, z, S6, S5);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1202
    // initialize "half" in current block to utilize 2nd FPU. However, it's
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1203
    // not a part of this block
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1204
    fmovd(half, 0.5);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1205
    fmaddd(r, z, r, S4);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1206
    fmaddd(r, z, r, S3);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1207
    fmaddd(r, z, r, S2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1208
  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1209
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1210
  if (!iyIsOne) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1211
    // return x+v*(S1+z*r);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1212
    fmaddd(S1, z, r, S1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1213
    fmaddd(v0, v, S1, x);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1214
  } else {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1215
    // return x-((z*(half*y-v*r)-y)-v*S1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1216
    fmuld(S6, half, y);    // half*y
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1217
    fmsubd(S6, v, r, S6);  // half*y-v*r
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1218
    fmsubd(S6, z, S6, y);  // y - z*(half*y-v*r) = - (z*(half*y-v*r)-y)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1219
    fmaddd(S6, v, S1, S6); // - (z*(half*y-v*r)-y) + v*S1 == -((z*(half*y-v*r)-y)-v*S1)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1220
    faddd(v0, x, S6);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1221
  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1222
}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1223
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1224
///*
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1225
// * __kernel_cos( x,  y )
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1226
// * kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1227
// * Input x is assumed to be bounded by ~pi/4 in magnitude.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1228
// * Input y is the tail of x.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1229
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1230
// * Algorithm
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1231
// *      1. Since cos(-x) = cos(x), we need only to consider positive x.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1232
// *      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1233
// *      3. cos(x) is approximated by a polynomial of degree 14 on
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1234
// *         [0,pi/4]
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1235
// *                                       4            14
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1236
// *              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1237
// *         where the remez error is
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1238
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1239
// *      |              2     4     6     8     10    12     14 |     -58
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1240
// *      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1241
// *      |                                                      |
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1242
// *
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1243
// *                     4     6     8     10    12     14
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1244
// *      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1245
// *             cos(x) = 1 - x*x/2 + r
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1246
// *         since cos(x+y) ~ cos(x) - sin(x)*y
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1247
// *                        ~ cos(x) - x*y,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1248
// *         a correction term is necessary in cos(x) and hence
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1249
// *              cos(x+y) = 1 - (x*x/2 - (r - x*y))
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1250
// *         For better accuracy when x > 0.3, let qx = |x|/4 with
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1251
// *         the last 32 bits mask off, and if x > 0.78125, let qx = 0.28125.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1252
// *         Then
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1253
// *              cos(x+y) = (1-qx) - ((x*x/2-qx) - (r-x*y)).
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1254
// *         Note that 1-qx and (x*x/2-qx) is EXACT here, and the
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1255
// *         magnitude of the latter is at least a quarter of x*x/2,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1256
// *         thus, reducing the rounding error in the subtraction.
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1257
// */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1258
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1259
//static const double
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1260
//C1  =  4.16666666666666019037e-02, /* 0x3FA55555, 0x5555554C */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1261
//C2  = -1.38888888888741095749e-03, /* 0xBF56C16C, 0x16C15177 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1262
//C3  =  2.48015872894767294178e-05, /* 0x3EFA01A0, 0x19CB1590 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1263
//C4  = -2.75573143513906633035e-07, /* 0xBE927E4F, 0x809C52AD */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1264
//C5  =  2.08757232129817482790e-09, /* 0x3E21EE9E, 0xBDB4B1C4 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1265
//C6  = -1.13596475577881948265e-11; /* 0xBDA8FAE9, 0xBE8838D4 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1266
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1267
// NOTE: C1..C6 were moved into a table: StubRoutines::aarch64::_dcos_coef
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1268
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1269
// BEGIN __kernel_cos PSEUDO CODE
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1270
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1271
//static double __kernel_cos(double x, double y)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1272
//{
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1273
//  double a,h,z,r,qx=0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1274
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1275
//  // NOTE: ix is already initialized in dsin/dcos. Reuse value from register
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1276
//  //int ix;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1277
//  //ix = high(x)&0x7fffffff;              /* ix = |x|'s high word*/
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1278
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1279
//  // NOTE: moved to dsin/dcos
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1280
//  //if(ix<0x3e400000) {                   /* if x < 2**-27 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1281
//  //  if(((int)x)==0) return one;         /* generate inexact */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1282
//  //}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1283
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1284
//  z  = x*x;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1285
//  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1286
//  if(ix < 0x3FD33333)                   /* if |x| < 0.3 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1287
//    return one - (0.5*z - (z*r - x*y));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1288
//  else {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1289
//    if(ix > 0x3fe90000) {               /* x > 0.78125 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1290
//      qx = 0.28125;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1291
//    } else {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1292
//      set_high(&qx, ix-0x00200000); /* x/4 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1293
//      set_low(&qx, 0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1294
//    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1295
//    h = 0.5*z-qx;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1296
//    a = one-qx;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1297
//    return a - (h - (z*r-x*y));
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1298
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1299
//}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1300
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1301
// END __kernel_cos PSEUDO CODE
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1302
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1303
// Changes between fdlibm and intrinsic:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1304
//     1. Removed |x| < 2**-27 check, because it was done earlier in dsin/dcos
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1305
//     2. Constants are now loaded from table dcos_coef
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1306
// Input and output:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1307
//     1. Input for generated function: X argument = x
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1308
//     2. Input for generator: x = register to read argument from, dcos_coef
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1309
//        = coefficients table address
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1310
//     3. Return cos(x) value in v0
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1311
void MacroAssembler::generate_kernel_cos(FloatRegister x, address dcos_coef) {
  // Emits the polynomial evaluation of __kernel_cos (see pseudo code above).
  //
  // Generator inputs:
  //   x         - register holding the argument. NOTE: the generated code
  //               overwrites this register with z*z
  //   dcos_coef - address of a table of six doubles C1..C6: C1..C4 are loaded
  //               from byte offset 0, C5 and C6 from byte offset 32
  // Implicit inputs of the generated code:
  //   r3 (ix)   - compared against 0x3FD33333 and 0x3FE90000; per the pseudo
  //               code this is the high word of |x|, set up by the caller
  //   v5 (y)    - tail of x; overwritten with x*y
  // Output of the generated code:
  //   v0        - result
  // Registers written by the generated code: rscratch1, rscratch2, x, v1..v3,
  // v5..v7, v18..v23, v25, v26 and v0.
  //
  // The order of the emitted instructions is deliberate (independent
  // instructions are interleaved with the dependent fmaddd chain), so it must
  // be preserved.
  Register ix = r3;
  FloatRegister qx = v1, h = v2, a = v3, y = v5, z = v6, r = v7, C1 = v18,
      C2 = v19, C3 = v20, C4 = v21, C5 = v22, C6 = v23, one = v25, half = v26;
  Label IX_IS_LARGE, SET_QX_CONST, DONE, QX_SET;
  lea(rscratch2, ExternalAddress(dcos_coef));
  ldpd(C5, C6, Address(rscratch2, 32));         // load C5, C6
  fmuld(z, x, x);                               // z=x^2
  ld1(C1, C2, C3, C4, T1D, Address(rscratch2)); // load C1..C4
  block_comment("calculate r = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))))"); {
    fmaddd(r, z, C6, C5);                       // r = C5+z*C6
    fmovd(half, 0.5);
    fmaddd(r, z, r, C4);                        // r = C4+z*r
    fmuld(y, x, y);                             // y = x*y (original x)
    fmaddd(r, z, r, C3);                        // r = C3+z*r
    mov(rscratch1, 0x3FD33333);                 // threshold for the first ix test
    fmaddd(r, z, r, C2);                        // r = C2+z*r
    fmuld(x, z, z);                             // x = z^2 (argument is gone now)
    fmaddd(r, z, r, C1);                        // r = C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6))))
  }
  // r still lacks the outer factor z of the pseudo code's r. It is never
  // applied to r itself: every use below needs (pseudo) z*r == z^2 * r, which
  // is computed as x*r with x == z^2.
  fmovd(one, 1.0);
  cmp(ix, rscratch1);
  // GT: ix == 0x3FD33333 stays on the small-|x| path below, whereas the pseudo
  // code sends it to the else branch. NOTE(review): confirm this is intended.
  br(GT, IX_IS_LARGE);
  block_comment("if(ix < 0x3FD33333) return one - (0.5*z - (z*r - x*y))"); {
    // return 1.0 - (0.5*z - (z*r - x*y)) = 1.0 - (0.5*z + (x*y - z*r))
    fmsubd(v0, x, r, y);                        // x*y - z*r  (y - x*r in registers)
    fmaddd(v0, half, z, v0);                    // 0.5*z + (x*y - z*r)
    fsubd(v0, one, v0);                         // 1.0 - (0.5*z + (x*y - z*r))
    b(DONE);
  }
  block_comment("if(ix >= 0x3FD33333)"); {
    bind(IX_IS_LARGE);
    movz(rscratch2, 0x3FE9, 16);                // rscratch2 = 0x3FE90000
    cmp(ix, rscratch2);
    br(GT, SET_QX_CONST);
    block_comment("set_high(&qx, ix-0x00200000); set_low(&qx, 0);"); {
      subw(rscratch2, ix, 0x00200000);
      lsl(rscratch2, rscratch2, 32);            // high word in place, low word = 0
      fmovd(qx, rscratch2);
    }
    b(QX_SET);
    bind(SET_QX_CONST);
    block_comment("if(ix > 0x3fe90000) qx = 0.28125;"); {
      fmovd(qx, 0.28125);
    }
    bind(QX_SET);
    // C6 is no longer needed as a coefficient and is reused as a temporary.
    fnmsub(C6, x, r, y);    // z*r - xy
    fnmsub(h, half, z, qx); // h = 0.5*z - qx
    fsubd(a, one, qx);      // a = 1-qx
    fsubd(C6, h, C6);       // = h - (z*r - x*y)
    fsubd(v0, a, C6);       // return a - (h - (z*r - x*y))
  }
  bind(DONE);
}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1366
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1367
// generate_dsin_dcos creates stub for dsin and dcos
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1368
// Generation is done via single call because dsin and dcos code is almost the
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1369
// same(see C code below). These functions work as follows:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1370
// 1) handle corner cases: |x| ~< pi/4, x is NaN or INF, |x| < 2**-27
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1371
// 2) perform argument reduction if required
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1372
// 3) call kernel_sin or kernel_cos which approximate sin/cos via polynomial
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1373
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1374
// BEGIN dsin/dcos PSEUDO CODE
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1375
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1376
//dsin_dcos(jdouble x, bool isCos) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1377
//  double y[2],z=0.0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1378
//  int n, ix;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1379
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1380
//  /* High word of x. */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1381
//  ix = high(x);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1382
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1383
//  /* |x| ~< pi/4 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1384
//  ix &= 0x7fffffff;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1385
//  if(ix <= 0x3fe921fb) return isCos ? __kernel_cos(x,z) : __kernel_sin(x,z,0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1386
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1387
//  /* sin/cos(Inf or NaN) is NaN */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1388
//  else if (ix>=0x7ff00000) return x-x;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1389
//  else if (ix<0x3e400000) {                   /* if |x| < 2**-27 */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1390
//    if(((int)x)==0) return isCos ? one : x;         /* generate inexact */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1391
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1392
//  /* argument reduction needed */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1393
//  else {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1394
//    n = __ieee754_rem_pio2(x,y);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1395
//    switch(n&3) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1396
//    case 0: return isCos ?  __kernel_cos(y[0],y[1])      :  __kernel_sin(y[0],y[1], true);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1397
//    case 1: return isCos ? -__kernel_sin(y[0],y[1],true) :  __kernel_cos(y[0],y[1]);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1398
//    case 2: return isCos ? -__kernel_cos(y[0],y[1])      : -__kernel_sin(y[0],y[1], true);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1399
//    default:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1400
//      return isCos ? __kernel_sin(y[0],y[1],1) : -__kernel_cos(y[0],y[1]);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1401
//    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1402
//  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1403
//}
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1404
// END dsin/dcos PSEUDO CODE
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1405
//
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1406
// Changes between fdlibm and intrinsic:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1407
//     1. Moved the |x| < 2**-27 check from kernel_sin/kernel_cos into dsin/dcos
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1408
//     2. Final switch uses equivalent bit checks (tbz/tbnz)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1409
// Input and output:
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1410
//     1. Input for generated function: x in v0 (its bits are copied into X = r0 on entry)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1411
//     2. Input for generator: isCos = generate sin or cos, npio2_hw = address
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1412
//         of npio2_hw table, two_over_pi = address of two_over_pi table,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1413
//         pio2 = address of pio2 table, dsin_coef = address of dsin_coef table,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1414
//         dcos_coef = address of dcos_coef table
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1415
//     3. Return result in v0
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1416
// NOTE: general purpose register names match local variable names in C code
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1417
void MacroAssembler::generate_dsin_dcos(bool isCos, address npio2_hw,
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1418
    address two_over_pi, address pio2, address dsin_coef, address dcos_coef) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1419
  const int POSITIVE_INFINITY_OR_NAN_PREFIX = 0x7FF0;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1420
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1421
  Label DONE, ARG_REDUCTION, TINY_X, RETURN_SIN, EARLY_CASE;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1422
  Register X = r0, absX = r1, n = r2, ix = r3;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1423
  FloatRegister y0 = v4, y1 = v5;
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1424
    block_comment("check |x| ~< pi/4, NaN, Inf and |x| < 2**-27 cases"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1425
      fmovd(X, v0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1426
      mov(rscratch2, 0x3e400000);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1427
      mov(rscratch1, 0x3fe921fb00000000);            // pi/4. shifted to reuse later
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1428
      ubfm(absX, X, 0, 62);                          // absX
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1429
      movz(r10, POSITIVE_INFINITY_OR_NAN_PREFIX, 48);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1430
      cmp(rscratch2, absX, LSR, 32);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1431
      lsr(ix, absX, 32);                             // set ix
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1432
      br(GT, TINY_X);                                // handle tiny x (|x| < 2^-27)
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1433
      cmp(ix, rscratch1, LSR, 32);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1434
      br(LE, EARLY_CASE);                            // if(ix <= 0x3fe921fb) return
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1435
      cmp(absX, r10);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1436
      br(LT, ARG_REDUCTION);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1437
      // X is NaN or INF(i.e. 0x7FF* or 0xFFF*). Return NaN (mantissa != 0).
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1438
      // Set last bit unconditionally to make it NaN
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1439
      orr(r10, r10, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1440
      fmovd(v0, r10);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1441
      ret(lr);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1442
    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1443
  block_comment("kernel_sin/kernel_cos: if(ix<0x3e400000) {<fast return>}"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1444
    bind(TINY_X);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1445
      if (isCos) {
55398
e53ec3b362f4 8224851: AArch64: fix warnings and errors with Clang and GCC 8.3
ngasson
parents: 51739
diff changeset
  1446
        fmovd(v0, 1.0);
50754
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1447
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1448
      ret(lr);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1449
  }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1450
  bind(ARG_REDUCTION); /* argument reduction needed */
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1451
    block_comment("n = __ieee754_rem_pio2(x,y);"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1452
      generate__ieee754_rem_pio2(npio2_hw, two_over_pi, pio2);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1453
    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1454
    block_comment("switch(n&3) {case ... }"); {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1455
      if (isCos) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1456
        eorw(absX, n, n, LSR, 1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1457
        tbnz(n, 0, RETURN_SIN);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1458
      } else {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1459
        tbz(n, 0, RETURN_SIN);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1460
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1461
      generate_kernel_cos(y0, dcos_coef);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1462
      if (isCos) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1463
        tbz(absX, 0, DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1464
      } else {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1465
        tbz(n, 1, DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1466
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1467
      fnegd(v0, v0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1468
      ret(lr);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1469
    bind(RETURN_SIN);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1470
      generate_kernel_sin(y0, true, dsin_coef);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1471
      if (isCos) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1472
        tbz(absX, 0, DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1473
      } else {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1474
        tbz(n, 1, DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1475
      }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1476
      fnegd(v0, v0);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1477
      ret(lr);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1478
    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1479
  bind(EARLY_CASE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1480
    eor(y1, T8B, y1, y1);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1481
    if (isCos) {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1482
      generate_kernel_cos(v0, dcos_coef);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1483
    } else {
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1484
      generate_kernel_sin(v0, false, dsin_coef);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1485
    }
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1486
  bind(DONE);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1487
    ret(lr);
ccb8aa083958 8189105: AARCH64: create intrinsic for sin and cos
dpochepk
parents:
diff changeset
  1488
}