sysdeps/ia64/fpu/libm_tan.S

   1 .file "libm_tan.s"
   2
   3 // Copyright (C) 2000, 2001, Intel Corporation
   4 // All rights reserved.
   5 //
   6 // Contributed 2/2/2000 by John Harrison, Ted Kubaska, Bob Norin, Shane Story,
   7 // and Ping Tak Peter Tang of the Computational Software Lab, Intel Corporation.
   8 //
   9 // Redistribution and use in source and binary forms, with or without
  10 // modification, are permitted provided that the following conditions are
  11 // met:
  12 //
  13 // * Redistributions of source code must retain the above copyright
  14 // notice, this list of conditions and the following disclaimer.
  15 //
  16 // * Redistributions in binary form must reproduce the above copyright
  17 // notice, this list of conditions and the following disclaimer in the
  18 // documentation and/or other materials provided with the distribution.
  19 //
  20 // * The name of Intel Corporation may not be used to endorse or promote
  21 // products derived from this software without specific prior written
  22 // permission.
  23 //
  24 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  27 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
  28 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  29 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  30 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  31 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
  32 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
  33 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  34 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  35 //
  36 // Intel Corporation is the author of this code, and requests that all
  37 // problem reports or change requests be submitted to it directly at
  38 // http://developer.intel.com/opensource.
  39 //
  40 // *********************************************************************
  41 //
  42 // History:
  43 // 02/02/00 Initial Version
  44 // 4/04/00  Unwind support added
  45 // 12/28/00 Fixed false invalid flags
  46 //
  47 // *********************************************************************
  48 //
  49 // Function:   tan(x) = tangent(x), for double precision x values
  50 //
  51 // *********************************************************************
  52 //
  53 // Accuracy:       Very accurate for double-precision values
  54 //
  55 // *********************************************************************
  56 //
  57 // Resources Used:
  58 //
  59 //    Floating-Point Registers: f8 (Input and Return Value)
  60 //                              f9-f15
  61 //                              f32-f112
  62 //
  63 //    General Purpose Registers:
  64 //      r32-r48
  65 //      r49-r50 (Used to pass arguments to pi_by_2 reduce routine)
  66 //
  67 //    Predicate Registers:      p6-p15
  68 //
  69 // *********************************************************************
  70 //
  71 // IEEE Special Conditions:
  72 //
  73 //    Denormal  fault raised on denormal inputs
  74 //    Overflow exceptions do not occur
  75 //    Underflow exceptions raised when appropriate for tan
  76 //    (No specialized error handling for this routine)
  77 //    Inexact raised when appropriate by algorithm
  78 //
  79 //    tan(SNaN) = QNaN
  80 //    tan(QNaN) = QNaN
  81 //    tan(inf) = QNaN
  82 //    tan(+/-0) = +/-0
  83 //
  84 // *********************************************************************
  85 //
  86 // Mathematical Description
  87 //
  88 // We consider the computation of FPTAN of Arg. Now, given
  89 //
  90 //      Arg = N pi/2  + alpha,          |alpha| <= pi/4,
  91 //
  92 // basic mathematical relationship shows that
  93 //
  94 //      tan( Arg ) =  tan( alpha )     if N is even;
  95 //                 = -cot( alpha )      otherwise.
  96 //
  97 // The value of alpha is obtained by argument reduction and
  98 // represented by two working precision numbers r and c where
  99 //
 100 //      alpha =  r  +  c     accurately.
 101 //
 102 // The reduction method is described in a previous write up.
 103 // The argument reduction scheme identifies 4 cases. For Cases 2
 104 // and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
 105 // computed very easily by 2 or 3 terms of the Taylor series
 106 // expansion as follows:
 107 //
 108 // Case 2:
 109 // -------
 110 //
 111 //      tan(r + c) = r + c + r^3/3          ...accurately
 112 //        -cot(r + c) = -1/(r+c) + r/3          ...accurately
 113 //
 114 // Case 4:
 115 // -------
 116 //
 117 //      tan(r + c) = r + c + r^3/3 + 2r^5/15     ...accurately
 118 //        -cot(r + c) = -1/(r+c) + r/3 + r^3/45     ...accurately
 119 //
 120 //
 121 // The only cases left are Cases 1 and 3 of the argument reduction
 122 // procedure. These two cases will be merged since after the
 123 // argument is reduced in either cases, we have the reduced argument
 124 // represented as r + c and that the magnitude |r + c| is not small
 125 // enough to allow the usage of a very short approximation.
 126 //
 127 // The greatest challenge of this task is that the second terms of
 128 // the Taylor series for tan(r) and -cot(r)
 129 //
 130 //      r + r^3/3 + 2 r^5/15 + ...
 131 //
 132 // and
 133 //
 134 //      -1/r + r/3 + r^3/45 + ...
 135 //
 136 // are not very small when |r| is close to pi/4 and the rounding
 137 // errors will be a concern if simple polynomial accumulation is
 138 // used. When |r| < 2^(-2), however, the second terms will be small
 139 // enough (5 bits or so of right shift) that a normal Horner
 140 // recurrence suffices. Hence there are two cases that we consider
 141 // in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
 142 //
 143 // Case small_r: |r| < 2^(-2)
 144 // --------------------------
 145 //
 146 // Since Arg = N pi/4 + r + c accurately, we have
 147 //
 148 //      tan(Arg) =  tan(r+c)            for N even,
 149 //            = -cot(r+c)          otherwise.
 150 //
 151 // Here for this case, both tan(r) and -cot(r) can be approximated
 152 // by simple polynomials:
 153 //
 154 //      tan(r) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 155 //        -cot(r) = -1/r + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 156 //
 157 // accurately. Since |r| is relatively small, tan(r+c) and
 158 // -cot(r+c) can be accurately approximated by replacing r with
 159 // r+c only in the first two terms of the corresponding polynomials.
 160 //
 161 // Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
 162 // almost 64 sig. bits, thus
 163 //
 164 //      P1_1 (r+c)^3 =  P1_1 r^3 + c * r^2     accurately.
 165 //
 166 // Hence,
 167 //
 168 //      tan(r+c) =    r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 169 //                     + c*(1 + r^2)
 170 //
 171 //        -cot(r+c) = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 172 //               + Q1_1*c
 173 //
 174 //
 175 // Case normal_r: 2^(-2) <= |r| <= pi/4
 176 // ------------------------------------
 177 //
 178 // This case is more likely than the previous one if one considers
 179 // r to be uniformly distributed in [-pi/4 pi/4].
 180 //
 181 // The required calculation is either
 182 //
 183 //      tan(r + c)  =  tan(r)  +  correction,  or
 184 //        -cot(r + c)  = -cot(r)  +  correction.
 185 //
 186 // Specifically,
 187 //
 188 //      tan(r + c) =  tan(r) + c tan'(r)  + O(c^2)
 189 //              =  tan(r) + c sec^2(r) + O(c^2)
 190 //              =  tan(r) + c SEC_sq     ...accurately
 191 //                as long as SEC_sq approximates sec^2(r)
 192 //                to, say, 5 bits or so.
 193 //
 194 // Similarly,
 195 //
 196 //        -cot(r + c) = -cot(r) - c cot'(r)  + O(c^2)
 197 //              = -cot(r) + c csc^2(r) + O(c^2)
 198 //              = -cot(r) + c CSC_sq     ...accurately
 199 //                as long as CSC_sq approximates csc^2(r)
 200 //                to, say, 5 bits or so.
 201 //
 202 // We therefore concentrate on accurately calculating tan(r) and
 203 // cot(r) for a working-precision number r, |r| <= pi/4 to within
 204 // 0.1% or so.
 205 //
 206 // We will employ a table-driven approach. Let
 207 //
 208 //      r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
 209 //        = sgn_r * ( B + x )
 210 //
 211 // where
 212 //
 213 //      B = 2^k * 1.b_1 b_2 ... b_5 1
 214 //         x = |r| - B
 215 //
 216 // Now,
 217 //                   tan(B)  +   tan(x)
 218 //      tan( B + x ) =  ------------------------
 219 //                   1 -  tan(B)*tan(x)
 220 //
 221 //               /                         \
 222 //               |   tan(B)  +   tan(x)          |
 223
 224 //      = tan(B) +  | ------------------------ - tan(B) |
 225 //               |     1 -  tan(B)*tan(x)          |
 226 //               \                         /
 227 //
 228 //                 sec^2(B) * tan(x)
 229 //      = tan(B) + ------------------------
 230 //                 1 -  tan(B)*tan(x)
 231 //
 232 //                (1/[sin(B)*cos(B)]) * tan(x)
 233 //      = tan(B) + --------------------------------
 234 //                      cot(B)  -  tan(x)
 235 //
 236 //
 237 // Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 238 // calculated beforehand and stored in a table. Since
 239 //
 240 //      |x| <= 2^k * 2^(-6)  <= 2^(-7)  (because k = -1, -2)
 241 //
 242 // a very short polynomial will be sufficient to approximate tan(x)
 243 // accurately. The details involved in computing the last expression
 244 // will be given in the next section on algorithm description.
 245 //
 246 //
 247 // Now, we turn to the case where cot( B + x ) is needed.
 248 //
 249 //
 250 //                   1 - tan(B)*tan(x)
 251 //      cot( B + x ) =  ------------------------
 252 //                   tan(B)  +  tan(x)
 253 //
 254 //               /                           \
 255 //               |   1 - tan(B)*tan(x)              |
 256
 257 //      = cot(B) +  | ----------------------- - cot(B) |
 258 //               |     tan(B)  +  tan(x)            |
 259 //               \                           /
 260 //
 261 //               [tan(B) + cot(B)] * tan(x)
 262 //      = cot(B) - ----------------------------
 263 //                   tan(B)  +  tan(x)
 264 //
 265 //                (1/[sin(B)*cos(B)]) * tan(x)
 266 //      = cot(B) - --------------------------------
 267 //                      tan(B)  +  tan(x)
 268 //
 269 //
 270 // Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
 271 // are needed are the same set of values needed in the previous
 272 // case.
 273 //
 274 // Finally, we can put all the ingredients together as follows:
 275 //
 276 //      Arg = N * pi/2 +  r + c          ...accurately
 277 //
 278 //      tan(Arg) =  tan(r) + correction    if N is even;
 279 //            = -cot(r) + correction    otherwise.
 280 //
 281 // For Cases 2 and 4,
 282 //
 283 //     Case 2:
 284 //     tan(Arg) =  tan(r + c) = r + c + r^3/3           N even
 285 //              = -cot(r + c) = -1/(r+c) + r/3           N odd
 286 //     Case 4:
 287 //     tan(Arg) =  tan(r + c) = r + c + r^3/3 + 2r^5/15  N even
 288 //              = -cot(r + c) = -1/(r+c) + r/3 + r^3/45  N odd
 289 //
 290 //
 291 // For Cases 1 and 3,
 292 //
 293 //     Case small_r: |r| < 2^(-2)
 294 //
 295 //      tan(Arg) =  r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
 296 //                     + c*(1 + r^2)               N even
 297 //
 298 //                  = -1/(r+c) + Q1_1 r   + Q1_2 r^3 + ... + Q1_7 r^13
 299 //               + Q1_1*c                    N odd
 300 //
 301 //     Case normal_r: 2^(-2) <= |r| <= pi/4
 302 //
 303 //      tan(Arg) =  tan(r) + c * sec^2(r)     N even
 304 //               = -cot(r) + c * csc^2(r)     otherwise
 305 //
 306 //     For N even,
 307 //
 308 //      tan(Arg) = tan(r) + c*sec^2(r)
 309 //               = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
 310 //                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(|r|) )
 311 //                  = sgn_r * ( tan(B+x)  + sgn_r*c*sec^2(B) )
 312 //
 313 // since B approximates |r| to 2^(-6) in relative accuracy.
 314 //
 315 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 316 //    tan(Arg) = sgn_r * | tan(B) + --------------------------------
 317 //                 \                     cot(B)  -  tan(x)
 318 //                                        \
 319 //                       + CORR  |
 320
 321 //                                     /
 322 // where
 323 //
 324 //    CORR = sgn_r*c*tan(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
 325 //
 326 // For N odd,
 327 //
 328 //      tan(Arg) = -cot(r) + c*csc^2(r)
 329 //               = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
 330 //                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(|r|) )
 331 //                  = sgn_r * ( -cot(B+x)  + sgn_r*c*csc^2(B) )
 332 //
 333 // since B approximates |r| to 2^(-6) in relative accuracy.
 334 //
 335 //                 /            (1/[sin(B)*cos(B)]) * tan(x)
 336 //    tan(Arg) = sgn_r * | -cot(B) + --------------------------------
 337 //                 \                     tan(B)  +  tan(x)
 338 //                                        \
 339 //                       + CORR  |
 340
 341 //                                     /
 342 // where
 343 //
 344 //    CORR = sgn_r*c*cot(B)*SC_inv(B);  SC_inv(B) = 1/(sin(B)*cos(B)).
 345 //
 346 //
 347 // The actual algorithm prescribes how all the mathematical formulas
 348 // are calculated.
 349 //
 350 //
 351 // 2. Algorithmic Description
 352 // ==========================
 353 //
 354 // 2.1 Computation for Cases 2 and 4.
 355 // ----------------------------------
 356 //
 357 // For Case 2, we use two-term polynomials.
 358 //
 359 //    For N even,
 360 //
 361 //    rsq := r * r
 362 //    Result := c + r * rsq * P1_1
 363 //    Result := r + Result          ...in user-defined rounding
 364 //
 365 //    For N odd,
 366 //    S_hi  := -frcpa(r)               ...8 bits
 367 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 368 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 369 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 370 //    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 371 //    ...S_hi + S_lo is -1/(r+c) to extra precision
 372 //    S_lo  := S_lo + Q1_1*r
 373 //
 374 //    Result := S_hi + S_lo     ...in user-defined rounding
 375 //
 376 // For Case 4, we use three-term polynomials
 377 //
 378 //    For N even,
 379 //
 380 //    rsq := r * r
 381 //    Result := c + r * rsq * (P1_1 + rsq * P1_2)
 382 //    Result := r + Result          ...in user-defined rounding
 383 //
 384 //    For N odd,
 385 //    S_hi  := -frcpa(r)               ...8 bits
 386 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 387 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 388 //    S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 389 //    S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 390 //    ...S_hi + S_lo is -1/(r+c) to extra precision
 391 //    rsq   := r * r
 392 //    P      := Q1_1 + rsq*Q1_2
 393 //    S_lo  := S_lo + r*P
 394 //
 395 //    Result := S_hi + S_lo     ...in user-defined rounding
 396 //
 397 //
 398 // Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
 399 // the same as those used in the small_r case of Cases 1 and 3
 400 // below.
 401 //
 402 //
 403 // 2.2 Computation for Cases 1 and 3.
 404 // ----------------------------------
 405 // This is further divided into the case of small_r,
 406 // where |r| < 2^(-2), and the case of normal_r, where |r| lies between
 407 // 2^(-2) and pi/4.
 408 //
 409 // Algorithm for the case of small_r
 410 // ---------------------------------
 411 //
 412 // For N even,
 413 //      rsq   := r * r
 414 //      Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
 415 //      r_to_the_8    := rsq * rsq
 416 //      r_to_the_8    := r_to_the_8 * r_to_the_8
 417 //      Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
 418 //      CORR  := c * ( 1 + rsq )
 419 //      Poly  := Poly1 + r_to_the_8*Poly2
 420 //      Result := r*Poly + CORR
 421 //      Result := r + Result     ...in user-defined rounding
 422 //      ...note that Poly1 and r_to_the_8 can be computed in parallel
 423 //      ...with Poly2 (Poly1 is intentionally set to be much
 424 //      ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
 425 //
 426 // For N odd,
 427 //      S_hi  := -frcpa(r)               ...8 bits
 428 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...16 bits
 429 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...32 bits
 430 //      S_hi  := S_hi + S_hi*(1 + S_hi*r)     ...64 bits
 431 //      S_lo  := S_hi*( (1 + S_hi*r) + S_hi*c )
 432 //      ...S_hi + S_lo is -1/(r+c) to extra precision
 433 //      S_lo  := S_lo + Q1_1*c
 434 //
 435 //      ...S_hi and S_lo are computed in parallel with
 436 //      ...the following
 437 //      rsq := r*r
 438 //      P   := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
 439 //
 440 //      Result :=  r*P + S_lo
 441 //      Result :=  S_hi  +  Result      ...in user-defined rounding
 442 //
 443 //
 444 // Algorithm for the case of normal_r
 445 // ----------------------------------
 446 //
 447 // Here, we first consider the computation of tan( r + c ). As
 448 // presented in the previous section,
 449 //
 450 //      tan( r + c )  =  tan(r) + c * sec^2(r)
 451 //                 =  sgn_r * [ tan(B+x) + CORR ]
 452 //      CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
 453 //
 454 // because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits.
 455 //
 456 //      tan( r + c ) =
 457 //           /           (1/[sin(B)*cos(B)]) * tan(x)
 458 //      sgn_r * | tan(B) + --------------------------------  +
 459 //           \                     cot(B)  -  tan(x)
 460 //                                \
 461 //                          CORR  |
 462
 463 //                                /
 464 //
 465 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 466 // calculated beforehand and stored in a table. Specifically,
 467 // the table values are
 468 //
 469 //      tan(B)                as  T_hi  +  T_lo;
 470 //      cot(B)             as  C_hi  +  C_lo;
 471 //      1/[sin(B)*cos(B)]  as  SC_inv
 472 //
 473 // T_hi, C_hi are in  double-precision  memory format;
 474 // T_lo, C_lo are in  single-precision  memory format;
 475 // SC_inv     is  in extended-precision memory format.
 476 //
 477 // The value of tan(x) will be approximated by a short polynomial of
 478 // the form
 479 //
 480 //      tan(x)  as  x  +  x * P, where
 481 //           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
 482 //
 483 // Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
 484 // to a relative accuracy better than 2^(-20). Thus, a good
 485 // initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
 486 // division is:
 487 //
 488 //      1/(cot(B) - tan(x))      is approximately
 489 //      1/(cot(B) -   x)         is
 490 //      tan(B)/(1 - x*tan(B))    is approximately
 491 //      T_hi / ( 1 - T_hi * x )  is approximately
 492 //
 493 //      T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ]
 494 //
 495 // The calculation of tan(r+c) therefore proceed as follows:
 496 //
 497 //      Tx     := T_hi * x
 498 //      xsq     := x * x
 499 //
 500 //      V_hi     := T_hi*(1 + Tx*(1 + Tx))
 501 //      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
 502 //      ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
 503 //         ...good to about 20 bits of accuracy
 504 //
 505 //      tanx     := x + x*P
 506 //      D     := C_hi - tanx
 507 //      ...D is a double precision denominator: cot(B) - tan(x)
 508 //
 509 //      V_hi     := V_hi + V_hi*(1 - V_hi*D)
 510 //      ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
 511 //
 512 //      V_lo     := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
 513 //                           - V_hi*C_lo )   ...observe all order
 514 //         ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
 515 //      ...to extra accuracy
 516 //
 517 //      ...               SC_inv(B) * (x + x*P)
 518 //      ...   tan(B) +      ------------------------- + CORR
 519 //         ...                cot(B) - (x + x*P)
 520 //      ...
 521 //      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 522 //      ...
 523 //
 524 //      Sx     := SC_inv * x
 525 //      CORR     := sgn_r * c * SC_inv * T_hi
 526 //
 527 //      ...put the ingredients together to compute
 528 //      ...               SC_inv(B) * (x + x*P)
 529 //      ...   tan(B) +      ------------------------- + CORR
 530 //         ...                cot(B) - (x + x*P)
 531 //      ...
 532 //      ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 533 //      ...
 534 //      ... = T_hi + T_lo + CORR +
 535 //      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
 536 //
 537 //      CORR := CORR + T_lo
 538 //      tail := V_lo + P*(V_hi + V_lo)
 539 //         tail := Sx * tail  +  CORR
 540 //      tail := Sx * V_hi  +  tail
 541 //         T_hi := sgn_r * T_hi
 542 //
 543 //         ...T_hi + sgn_r*tail  now approximate
 544 //      ...sgn_r*(tan(B+x) + CORR) accurately
 545 //
 546 //      Result :=  T_hi + sgn_r*tail  ...in user-defined
 547 //                           ...rounding control
 548 //      ...It is crucial that independent paths be fully
 549 //      ...exploited for performance's sake.
 550 //
 551 //
 552 // Next, we consider the computation of -cot( r + c ). As
 553 // presented in the previous section,
 554 //
 555 //        -cot( r + c )  =  -cot(r) + c * csc^2(r)
 556 //                 =  sgn_r * [ -cot(B+x) + CORR ]
 557 //      CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
 558 //
 559 // because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits.
 560 //
 561 //        -cot( r + c ) =
 562 //           /             (1/[sin(B)*cos(B)]) * tan(x)
 563 //      sgn_r * | -cot(B) + --------------------------------  +
 564 //           \                     tan(B)  +  tan(x)
 565 //                                \
 566 //                          CORR  |
 567
 568 //                                /
 569 //
 570 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
 571 // calculated beforehand and stored in a table. Specifically,
 572 // the table values are
 573 //
 574 //      tan(B)                as  T_hi  +  T_lo;
 575 //      cot(B)             as  C_hi  +  C_lo;
 576 //      1/[sin(B)*cos(B)]  as  SC_inv
 577 //
 578 // T_hi, C_hi are in  double-precision  memory format;
 579 // T_lo, C_lo are in  single-precision  memory format;
 580 // SC_inv     is  in extended-precision memory format.
 581 //
 582 // The value of tan(x) will be approximated by a short polynomial of
 583 // the form
 584 //
 585 //      tan(x)  as  x  +  x * P, where
 586 //           P  =   x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
 587 //
 588 // Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
 589 // to a relative accuracy better than 2^(-18). Thus, a good
 590 // initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
 591 // division is:
 592 //
 593 //      1/(tan(B) + tan(x))      is approximately
 594 //      1/(tan(B) +   x)         is
 595 //      cot(B)/(1 + x*cot(B))    is approximately
 596 //      C_hi / ( 1 + C_hi * x )  is approximately
 597 //
 598 //      C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
 599 //
 600 // The calculation of -cot(r+c) therefore proceed as follows:
 601 //
 602 //      Cx     := C_hi * x
 603 //      xsq     := x * x
 604 //
 605 //      V_hi     := C_hi*(1 - Cx*(1 - Cx))
 606 //      P     := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
 607 //      ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
 608 //         ...good to about 18 bits of accuracy
 609 //
 610 //      tanx     := x + x*P
 611 //      D     := T_hi + tanx
 612 //      ...D is a double precision denominator: tan(B) + tan(x)
 613 //
 614 //      V_hi     := V_hi + V_hi*(1 - V_hi*D)
 615 //      ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
 616 //
 617 //      V_lo     := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
 618 //                           - V_hi*T_lo )   ...observe all order
 619 //         ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
 620 //      ...to extra accuracy
 621 //
 622 //      ...               SC_inv(B) * (x + x*P)
 623 //      ...  -cot(B) +      ------------------------- + CORR
 624 //         ...                tan(B) + (x + x*P)
 625 //      ...
 626 //      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 627 //      ...
 628 //
 629 //      Sx     := SC_inv * x
 630 //      CORR     := sgn_r * c * SC_inv * C_hi
 631 //
 632 //      ...put the ingredients together to compute
 633 //      ...               SC_inv(B) * (x + x*P)
 634 //      ...  -cot(B) +      ------------------------- + CORR
 635 //         ...                tan(B) + (x + x*P)
 636 //      ...
 637 //      ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
 638 //      ...
 639 //      ... =-C_hi - C_lo + CORR +
 640 //      ...    Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
 641 //
 642 //      CORR := CORR - C_lo
 643 //      tail := V_lo + P*(V_hi + V_lo)
 644 //         tail := Sx * tail  +  CORR
 645 //      tail := Sx * V_hi  +  tail
 646 //         C_hi := -sgn_r * C_hi
 647 //
 648 //         ...C_hi + sgn_r*tail now approximates
 649 //      ...sgn_r*(-cot(B+x) + CORR) accurately
 650 //
 651 //      Result :=  C_hi + sgn_r*tail   in user-defined rounding control
 652 //      ...It is crucial that independent paths be fully
 653 //      ...exploited for performance's sake.
 654 //
 655 // 3. Implementation Notes
 656 // =======================
 657 //
 658 //   Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
 659 //
 660 //   Recall that 2^(-2) <= |r| <= pi/4;
 661 //
 662 //      r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
 663 //
 664 //   and
 665 //
 666 //        B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
 667 //
 668 //   Thus, for k = -2, possible values of B are
 669 //
 670 //          B = 2^(-2) * ( 1 + index/32  +  1/64 ),
 671 //      index ranges from 0 to 31
 672 //
 673 //   For k = -1, however, since |r| <= pi/4 = 0.78...
 674 //   possible values of B are
 675 //
 676 //        B = 2^(-1) * ( 1 + index/32  +  1/64 )
 677 //      index ranges from 0 to 19.
 678 //
 679 //
 680
 681 #include "libm_support.h"
 682
 683 #ifdef _LIBC
 684 .rodata
 685 #else
 686 .data
 687 #endif
 688
 689 .align 128
 690
 691 TAN_BASE_CONSTANTS:
 692 ASM_TYPE_DIRECTIVE(TAN_BASE_CONSTANTS,@object)
 693 data4    0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24
 694                                                         // two**-14, -two**-14
 695 data4    0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi
 696 data4    0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0
 697 data4    0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1
 698 data4    0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2
 699 data4    0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3
 700 data4    0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63
 701 data4    0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0
 702 data4    0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1
 703 data4    0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2
 704 data4    0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4
 705 data4    0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4
 706 data4    0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2
 707 data4    0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33
 708 data4    0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1
 709 data4    0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2
 710 data4    0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3
 711 data4    0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4
 712 data4    0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5
 713 data4    0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6
 714 data4    0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7
 715 data4    0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8
 716 data4    0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9
 717 data4    0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1
 718 data4    0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2
 719 data4    0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3
 720 data4    0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4
 721 data4    0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5
 722 data4    0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6
 723 data4    0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7
 724 data4    0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1
 725 data4    0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2
 726 data4    0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3
 727 //
 728 //  Entries T_hi   double-precision memory format
 729 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 730 //  Entries T_lo  single-precision memory format
 731 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 732 //
 733 data4    0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000
 734 data4    0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000
 735 data4    0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000
 736 data4    0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000
 737 data4    0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000
 738 data4    0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000
 739 data4    0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000
 740 data4    0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000
 741 data4    0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000
 742 data4    0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000
 743 data4    0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000
 744 data4    0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000
 745 data4    0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000
 746 data4    0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000
 747 data4    0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000
 748 data4    0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000
 749 data4    0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000
 750 data4    0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000
 751 data4    0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000
 752 data4    0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000
 753 data4    0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000
 754 data4    0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000
 755 data4    0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000
 756 data4    0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000
 757 data4    0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000
 758 data4    0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000
 759 data4    0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000
 760 data4    0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000
 761 data4    0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000
 762 data4    0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000
 763 data4    0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000
 764 data4    0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000
 765 //
 766 //  Entries T_hi   double-precision memory format
 767 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 768 //  Entries T_lo  single-precision memory format
 769 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 770 //
 771 data4    0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000
 772 data4    0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000
 773 data4    0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000
 774 data4    0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000
 775 data4    0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000
 776 data4    0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000
 777 data4    0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000
 778 data4    0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000
 779 data4    0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000
 780 data4    0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000
 781 data4    0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000
 782 data4    0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000
 783 data4    0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000
 784 data4    0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000
 785 data4    0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000
 786 data4    0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000
 787 data4    0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000
 788 data4    0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000
 789 data4    0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000
 790 data4    0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000
 791 //
 792 //  Entries C_hi   double-precision memory format
 793 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 794 //  Entries C_lo  single-precision memory format
 795 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 796 //
 797 data4    0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000
 798 data4    0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000
 799 data4    0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000
 800 data4    0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000
 801 data4    0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000
 802 data4    0x5285B068, 0x400A855A, 0x25524F18, 0x00000000
 803 data4    0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000
 804 data4    0x646AF623, 0x4009188A, 0x254FD801, 0x00000000
 805 data4    0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000
 806 data4    0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000
 807 data4    0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000
 808 data4    0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000
 809 data4    0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000
 810 data4    0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000
 811 data4    0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000
 812 data4    0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000
 813 data4    0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000
 814 data4    0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000
 815 data4    0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000
 816 data4    0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000
 817 data4    0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000
 818 data4    0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000
 819 data4    0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000
 820 data4    0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000
 821 data4    0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000
 822 data4    0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000
 823 data4    0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000
 824 data4    0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000
 825 data4    0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000
 826 data4    0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000
 827 data4    0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000
 828 data4    0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000
 829 //
 830 //  Entries C_hi   double-precision memory format
 831 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 832 //  Entries C_lo  single-precision memory format
 833 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 834 //
 835 data4    0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000
 836 data4    0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000
 837 data4    0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000
 838 data4    0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000
 839 data4    0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000
 840 data4    0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000
 841 data4    0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000
 842 data4    0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000
 843 data4    0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000
 844 data4    0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000
 845 data4    0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000
 846 data4    0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000
 847 data4    0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000
 848 data4    0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000
 849 data4    0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000
 850 data4    0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000
 851 data4    0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000
 852 data4    0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000
 853 data4    0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000
 854 data4    0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000
 855 //
 856 //  Entries SC_inv in Swapped IEEE format (extended)
 857 //  Index = 0,1,...,31  B = 2^(-2)*(1+Index/32+1/64)
 858 //
 859 data4    0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000
 860 data4    0x554B0EB0, 0x80092804, 0x00004001, 0x00000000
 861 data4    0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000
 862 data4    0x77378677, 0xF3086BA0, 0x00004000, 0x00000000
 863 data4    0xCCD4723C, 0xED154515, 0x00004000, 0x00000000
 864 data4    0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000
 865 data4    0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000
 866 data4    0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000
 867 data4    0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000
 868 data4    0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000
 869 data4    0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000
 870 data4    0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000
 871 data4    0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000
 872 data4    0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000
 873 data4    0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000
 874 data4    0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000
 875 data4    0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000
 876 data4    0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000
 877 data4    0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000
 878 data4    0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000
 879 data4    0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000
 880 data4    0x7221235F, 0xAC862A23, 0x00004000, 0x00000000
 881 data4    0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000
 882 data4    0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000
 883 data4    0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000
 884 data4    0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000
 885 data4    0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000
 886 data4    0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000
 887 data4    0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000
 888 data4    0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000
 889 data4    0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000
 890 data4    0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000
 891 //
 892 //  Entries SC_inv in Swapped IEEE format (extended)
 893 //  Index = 0,1,...,19  B = 2^(-1)*(1+Index/32+1/64)
 894 //
 895 data4    0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000
 896 data4    0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000
 897 data4    0x61B798AF, 0x9147094F, 0x00004000, 0x00000000
 898 data4    0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000
 899 data4    0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000
 900 data4    0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000
 901 data4    0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000
 902 data4    0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000
 903 data4    0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000
 904 data4    0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000
 905 data4    0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000
 906 data4    0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000
 907 data4    0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000
 908 data4    0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000
 909 data4    0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000
 910 data4    0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000
 911 data4    0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000
 912 data4    0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000
 913 data4    0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
 914 data4    0xA07791FA, 0x80186650, 0x00004000, 0x00000000
 915
 916 Arg                 = f8
 917 Result              = f8
 918 fp_tmp              = f9
 919 U_2                 = f10
 920 rsq                =  f11
 921 C_hi                = f12
 922 C_lo                = f13
 923 T_hi                = f14
 924 T_lo                = f15
 925
 926 N_0                 = f32
 927 d_1                 = f33
 928 MPI_BY_4            = f34
 929 tail                = f35
 930 tanx                = f36
 931 Cx                  = f37
 932 Sx                  = f38
 933 sgn_r               = f39
 934 CORR                = f40
 935 P                   = f41
 936 D                   = f42
 937 ArgPrime            = f43
 938 P_0                 = f44
 939
 940 P2_1                = f45
 941 P2_2                = f46
 942 P2_3                = f47
 943
 944 P1_1                = f45
 945 P1_2                = f46
 946 P1_3                = f47
 947
 948 P1_4                = f48
 949 P1_5                = f49
 950 P1_6                = f50
 951 P1_7                = f51
 952 P1_8                = f52
 953 P1_9                = f53
 954
 955 TWO_TO_63           = f54
 956 NEGTWO_TO_63        = f55
 957 x                   = f56
 958 xsq                 = f57
 959 Tx                  = f58
 960 Tx1                 = f59
 961 Set                 = f60
 962 poly1               = f61
 963 poly2               = f62
 964 Poly                = f63
 965 Poly1               = f64
 966 Poly2               = f65
 967 r_to_the_8          = f66
 968 B                   = f67
 969 SC_inv              = f68
 970 Pos_r               = f69
 971 N_0_fix             = f70
 972 PI_BY_4             = f71
 973 NEGTWO_TO_NEG2      = f72
 974 TWO_TO_24           = f73
 975 TWO_TO_NEG14        = f74
 976 TWO_TO_NEG33        = f75
 977 NEGTWO_TO_24        = f76
 978 NEGTWO_TO_NEG14     = f76
 979 NEGTWO_TO_NEG33     = f77
 980 two_by_PI           = f78
 981 N                   = f79
 982 N_fix               = f80
 983 P_1                 = f81
 984 P_2                 = f82
 985 P_3                 = f83
 986 s_val               = f84
 987 w                   = f85
 988 c                   = f86
 989 r                   = f87
 990 Z                   = f88
 991 A                   = f89
 992 a                   = f90
 993 t                   = f91
 994 U_1                 = f92
 995 d_2                 = f93
 996 TWO_TO_NEG2         = f94
 997 Q1_1                = f95
 998 Q1_2                = f96
 999 Q1_3                = f97
1000 Q1_4                = f98
1001 Q1_5                = f99
1002 Q1_6                = f100
1003 Q1_7                = f101
1004 Q1_8                = f102
1005 S_hi                = f103
1006 S_lo                = f104
1007 V_hi                = f105
1008 V_lo                = f106
1009 U_hi                = f107
1010 U_lo                = f108
1011 U_hiabs             = f109
1012 V_hiabs             = f110
1013 V                   = f111
1014 Inv_P_0             = f112
1015
1016 GR_SAVE_B0     = r33
1017 GR_SAVE_GP     = r34
1018 GR_SAVE_PFS    = r35
1019
1020 delta1         = r36
1021 table_ptr1     = r37
1022 table_ptr2     = r38
1023 i_0            = r39
1024 i_1            = r40
1025 N_fix_gr       = r41
1026 N_inc          = r42
1027 exp_Arg        = r43
1028 exp_r          = r44
1029 sig_r          = r45
1030 lookup         = r46
1031 table_offset   = r47
1032 Create_B       = r48
1033 gr_tmp         = r49
1034
1035 GR_Parameter_X = r49
1036 GR_Parameter_r = r50
1037
1038
1039
1040 .global __libm_tan
1041 .section .text
1042 .proc __libm_tan
1043
1044
1045 __libm_tan:
1046
1047 { .mfi
1048 alloc r32 = ar.pfs, 0,17,2,0
1049 (p0)   fclass.m.unc  p6,p0 = Arg, 0x1E7
1050       addl gr_tmp = -1,r0
1051 }
1052 ;;
1053
1054 { .mfi
1055        nop.m 999
1056 (p0)   fclass.nm.unc  p7,p0 = Arg, 0x1FF
1057        nop.i 999
1058 }
1059 ;;
1060
1061 { .mfi
1062 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1063        nop.f 999
1064        nop.i 999
1065 }
1066 ;;
1067
1068 { .mmi
1069       ld8 table_ptr1 = [table_ptr1]
1070       setf.sig fp_tmp = gr_tmp   // Make a constant so fmpy produces inexact
1071       nop.i 999
1072 }
1073 ;;
1074
1075 //
1076 //     Check for NatVals, Infs , NaNs, and Zeros
1077 //     Check for everything - if false, then must be pseudo-zero
1078 //     or pseudo-nan.
1079 //     Local table pointer
1080 //
1081
1082 { .mbb
1083 (p0)   add table_ptr2 = 96, table_ptr1
1084 (p6)   br.cond.spnt __libm_TAN_SPECIAL
1085 (p7)   br.cond.spnt __libm_TAN_SPECIAL ;;
1086 }
1087 //
1088 //     Point to Inv_P_0
1089 //     Branch out to deal with unsupporteds and special values.
1090 //
1091
1092 { .mmf
1093 (p0)   ldfs TWO_TO_24 = [table_ptr1],4
1094 (p0)   ldfs TWO_TO_63 = [table_ptr2],4
1095 //
1096 //     Load -2**24, load -2**63.
1097 //
1098 (p0)   fcmp.eq.s0 p0, p6 = Arg, f1 ;;
1099 }
1100
1101 { .mfi
1102 (p0)   ldfs NEGTWO_TO_63 = [table_ptr2],12
1103 (p0)   fnorm.s1     Arg = Arg
1104         nop.i 999
1105 }
1106 //
1107 //     Load 2**24, Load 2**63.
1108 //
1109
1110 { .mmi
1111 (p0)   ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
1112 //
1113 //     Do fcmp to generate Denormal exception
1114 //     - can't do FNORM (will generate Underflow when U is unmasked!)
1115 //     Normalize input argument.
1116 //
1117 (p0)   ldfe two_by_PI = [table_ptr1],16
1118         nop.i 999
1119 }
1120
1121 { .mmi
1122 (p0)   ldfe Inv_P_0 = [table_ptr2],16 ;;
1123 (p0)   ldfe d_1 = [table_ptr2],16
1124         nop.i 999
1125 }
1126 //
1127 //     Decide about the paths to take:
1128 //     PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2
1129 //     OTHERWISE - CASE 3 OR 4
1130 //     Load inverse of P_0 .
1131 //     Set PR_6 if Arg <= -2**63
1132 //     Are there any Infs, NaNs, or zeros?
1133 //
1134
1135 { .mmi
1136 (p0)   ldfe P_0 = [table_ptr1],16 ;;
1137 (p0)   ldfe d_2 = [table_ptr2],16
1138         nop.i 999
1139 }
1140 //
1141 //     Set PR_8 if Arg <= -2**24
1142 //     Set PR_6 if Arg >=  2**63
1143 //
1144
1145 { .mmi
1146 (p0)   ldfe P_1 = [table_ptr1],16 ;;
1147 (p0)   ldfe PI_BY_4 = [table_ptr2],16
1148         nop.i 999
1149 }
1150 //
1151 //     Set PR_8 if Arg >= 2**24
1152 //
1153
1154 { .mmi
1155 (p0)   ldfe P_2 = [table_ptr1],16 ;;
1156 (p0)   ldfe   MPI_BY_4 = [table_ptr2],16
1157         nop.i 999
1158 }
1159 //
1160 //     Load  P_2 and PI_BY_4
1161 //
1162
1163 { .mfi
1164 (p0)   ldfe   P_3 = [table_ptr1],16
1165         nop.f 999
1166         nop.i 999 ;;
1167 }
1168
1169 { .mfi
1170         nop.m 999
1171 (p0)   fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63
1172         nop.i 999
1173 }
1174
1175 { .mfi
1176         nop.m 999
1177 (p0)   fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24
1178         nop.i 999 ;;
1179 }
1180
1181 { .mfi
1182         nop.m 999
1183 (p7)   fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63
1184         nop.i 999
1185 }
1186
1187 { .mfi
1188         nop.m 999
1189 (p9)   fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24
1190         nop.i 999 ;;
1191 }
1192
1193 { .mib
1194         nop.m 999
1195         nop.i 999
1196 //
1197 //     Load  P_3 and -PI_BY_4
1198 //
1199 (p6)   br.cond.spnt TAN_ARG_TOO_LARGE ;;
1200 }
1201
1202 { .mib
1203         nop.m 999
1204         nop.i 999
1205 //
1206 //     Load 2**(-2).
1207 //     Load -2**(-2).
1208 //     Branch out if we have a special argument.
1209 //     Branch out if the magnitude of the input argument is too large
1210 //     - do this branch before the next.
1211 //
1212 (p8)   br.cond.spnt TAN_LARGER_ARG ;;
1213 }
1214 //
1215 //     Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1216 //
1217
1218 { .mfi
1219 (p0)   ldfs TWO_TO_NEG2 = [table_ptr2],4
1220 //     ARGUMENT REDUCTION CODE - CASE 1 and 2
1221 //     Load 2**(-2).
1222 //     Load -2**(-2).
1223 (p0)   fmpy.s1 N = Arg,two_by_PI
1224         nop.i 999 ;;
1225 }
1226
1227 { .mfi
1228 (p0)   ldfs NEGTWO_TO_NEG2 = [table_ptr2],12
1229 //
1230 //     N = Arg * 2/pi
1231 //
1232 (p0)   fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4
1233         nop.i 999 ;;
1234 }
1235
1236 { .mfi
1237         nop.m 999
1238 //
1239 //     if Arg < pi/4,  set PR_8.
1240 //
1241 (p8)   fcmp.gt.s1 p8,p9= Arg,MPI_BY_4
1242         nop.i 999 ;;
1243 }
1244 //
1245 //     Case 1: Is |r| < 2**(-2).
1246 //     Arg is the same as r in this case.
1247 //     r = Arg
1248 //     c = 0
1249 //
1250
1251 { .mfi
1252 (p8)   mov N_fix_gr = r0
1253 //
1254 //     if Arg > -pi/4, reset PR_8.
1255 //     Select the case when |Arg| < pi/4 - set PR[8] = true.
1256 //     Else Select the case when |Arg| >= pi/4 - set PR[9] = true.
1257 //
1258 (p0)   fcvt.fx.s1 N_fix = N
1259         nop.i 999 ;;
1260 }
1261
1262 { .mfi
1263         nop.m 999
1264 //
1265 //     Grab the integer part of N .
1266 //
1267 (p8)   mov r = Arg
1268         nop.i 999
1269 }
1270
1271 { .mfi
1272         nop.m 999
1273 (p8)   mov c = f0
1274         nop.i 999 ;;
1275 }
1276
1277 { .mfi
1278         nop.m 999
1279 (p8)   fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2
1280         nop.i 999 ;;
1281 }
1282
1283 { .mfi
1284         nop.m 999
1285 (p10)  fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2
1286         nop.i 999 ;;
1287 }
1288
1289 { .mfi
1290         nop.m 999
1291 //
1292 //     Case 2: Place integer part of N in GP register.
1293 //
1294 (p9)   fcvt.xf N = N_fix
1295         nop.i 999 ;;
1296 }
1297
1298 { .mib
1299 (p9)   getf.sig N_fix_gr = N_fix
1300         nop.i 999
1301 //
1302 //     Case 2: Convert integer N_fix back to normalized floating-point value.
1303 //
1304 (p10)  br.cond.spnt TAN_SMALL_R ;;
1305 }
1306
1307 { .mib
1308         nop.m 999
1309         nop.i 999
1310 (p8)   br.cond.sptk TAN_NORMAL_R ;;
1311 }
1312 //
1313 //     Case 1: PR_3 is only affected  when PR_1 is set.
1314 //
1315
1316 { .mmi
1317 (p9)   ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;;
1318 //
1319 //     Case 2: Load 2**(-33).
1320 //
1321 (p9)   ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4
1322         nop.i 999 ;;
1323 }
1324
1325 { .mfi
1326         nop.m 999
1327 //
1328 //     Case 2: Load -2**(-33).
1329 //
1330 (p9)   fnma.s1 s_val = N, P_1, Arg
1331         nop.i 999
1332 }
1333
1334 { .mfi
1335         nop.m 999
1336 (p9)   fmpy.s1 w = N, P_2
1337         nop.i 999 ;;
1338 }
1339
1340 { .mfi
1341         nop.m 999
1342 //
1343 //     Case 2: w = N * P_2
1344 //     Case 2: s_val = -N * P_1  + Arg
1345 //
1346 (p0)   fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33
1347         nop.i 999 ;;
1348 }
1349
1350 { .mfi
1351         nop.m 999
1352 //
1353 //     Decide between case_1 and case_2 reduce:
1354 //
1355 (p9)   fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
1356         nop.i 999 ;;
1357 }
1358
1359 { .mfi
1360         nop.m 999
1361 //
1362 //     Case 1_reduce:  s <= -2**(-33) or s >= 2**(-33)
1363 //     Case 2_reduce: -2**(-33) < s < 2**(-33)
1364 //
1365 (p8)   fsub.s1 r = s_val, w
1366         nop.i 999
1367 }
1368
1369 { .mfi
1370         nop.m 999
1371 (p9)   fmpy.s1 w = N, P_3
1372         nop.i 999 ;;
1373 }
1374
1375 { .mfi
1376         nop.m 999
1377 (p9)   fma.s1  U_1 = N, P_2, w
1378         nop.i 999
1379 }
1380
1381 { .mfi
1382         nop.m 999
1383 //
1384 //     Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
1385 //     else set PR_11.
1386 //
1387 (p8)   fsub.s1 c = s_val, r
1388         nop.i 999 ;;
1389 }
1390
1391 { .mfi
1392         nop.m 999
1393 //
1394 //     Case 1_reduce: r = s + w (change sign)
1395 //     Case 2_reduce: w = N * P_3 (change sign)
1396 //
1397 (p8)   fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2
1398         nop.i 999 ;;
1399 }
1400
1401 { .mfi
1402         nop.m 999
1403 (p10)  fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2
1404         nop.i 999 ;;
1405 }
1406
1407 { .mfi
1408         nop.m 999
1409 (p9)   fsub.s1 r = s_val, U_1
1410         nop.i 999
1411 }
1412
1413 { .mfi
1414         nop.m 999
1415 //
1416 //     Case 1_reduce: c is complete here.
1417 //     c = c + w (w has not been negated.)
1418 //     Case 2_reduce: r is complete here - continue to calculate c .
1419 //     r = s - U_1
1420 //
1421 (p9)   fms.s1 U_2 = N, P_2, U_1
1422         nop.i 999 ;;
1423 }
1424
1425 { .mfi
1426         nop.m 999
1427 //
1428 //     Case 1_reduce: c = s - r
1429 //     Case 2_reduce: U_1 = N * P_2 + w
1430 //
1431 (p8)   fsub.s1 c = c, w
1432         nop.i 999 ;;
1433 }
1434
1435 { .mfi
1436         nop.m 999
1437 (p9)   fsub.s1 s_val = s_val, r
1438         nop.i 999
1439 }
1440
1441 { .mfb
1442         nop.m 999
1443 //
1444 //     Case 2_reduce:
1445 //     U_2 = N * P_2 - U_1
1446 //     Not needed until later.
1447 //
1448 (p9)   fadd.s1 U_2 = U_2, w
1449 //
1450 //     Case 2_reduce:
1451 //     s = s - r
1452 //     U_2 = U_2 + w
1453 //
1454 (p10)  br.cond.spnt TAN_SMALL_R ;;
1455 }
1456
1457 { .mib
1458         nop.m 999
1459         nop.i 999
1460 (p11)  br.cond.sptk TAN_NORMAL_R ;;
1461 }
1462
1463 { .mii
1464         nop.m 999
1465 //
1466 //     Case 2_reduce:
1467 //     c = c - U_2
1468 //     c is complete here
1469 //     Argument reduction ends here.
1470 //
1471 (p9)   extr.u i_1 = N_fix_gr, 0, 1 ;;
1472 (p9)   cmp.eq.unc p11, p12 = 0x0000,i_1 ;;
1473 }
1474
1475 { .mfi
1476         nop.m 999
1477 //
1478 //     Is i_1  even or odd?
1479 //     if i_1 == 0, set p11, else set p12.
1480 //
1481 (p11)  fmpy.s1 rsq = r, Z
1482         nop.i 999 ;;
1483 }
1484
1485 { .mfi
1486         nop.m 999
1487 (p12)  frcpa.s1 S_hi,p0 = f1, r
1488         nop.i 999
1489 }
1490
1491 //
1492 //     Case 1: Branch to SMALL_R or NORMAL_R.
1493 //     Case 1 is done now.
1494 //
1495
1496 { .mfi
1497 (p9)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1498 (p9)   fsub.s1 c = s_val, U_1
1499         nop.i 999 ;;
1500 }
1501 ;;
1502
1503 { .mmi
1504 (p9)  ld8 table_ptr1 = [table_ptr1]
1505       nop.m 999
1506       nop.i 999
1507 }
1508 ;;
1509
1510 { .mmi
1511 (p9)   add table_ptr1 = 224, table_ptr1 ;;
1512 (p9)   ldfe P1_1 = [table_ptr1],144
1513         nop.i 999 ;;
1514 }
1515 //
1516 //     Get [i_1] -  lsb of N_fix_gr .
1517 //     Load P1_1 and point to Q1_1 .
1518 //
1519
1520 { .mfi
1521 (p9)   ldfe Q1_1 = [table_ptr1] , 0
1522 //
1523 //     N even: rsq = r * Z
1524 //     N odd:  S_hi = frcpa(r)
1525 //
1526 (p12)  fmerge.ns S_hi = S_hi, S_hi
1527         nop.i 999
1528 }
1529
1530 { .mfi
1531         nop.m 999
1532 //
1533 //     Case 2_reduce:
1534 //     c = s - U_1
1535 //
1536 (p9)   fsub.s1 c = c, U_2
1537         nop.i 999 ;;
1538 }
1539
1540 { .mfi
1541         nop.m 999
1542 (p12)  fma.s1  poly1 = S_hi, r, f1
1543         nop.i 999 ;;
1544 }
1545
1546 { .mfi
1547         nop.m 999
1548 //
1549 //     N odd:  Change sign of S_hi
1550 //
1551 (p11)  fmpy.s1 rsq = rsq, P1_1
1552         nop.i 999 ;;
1553 }
1554
1555 { .mfi
1556         nop.m 999
1557 (p12)  fma.s1 S_hi = S_hi, poly1, S_hi
1558         nop.i 999 ;;
1559 }
1560
1561 { .mfi
1562         nop.m 999
1563 //
1564 //     N even: rsq = rsq * P1_1
1565 //     N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
1566 //
1567 (p11)  fma.s1 Result = r, rsq, c
1568         nop.i 999 ;;
1569 }
1570
1571 { .mfi
1572         nop.m 999
1573 //
1574 //     N even: Result = c  + r * rsq
1575 //     N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
1576 //
1577 (p12)  fma.s1 poly1 = S_hi, r, f1
1578         nop.i 999 ;;
1579 }
1580
1581 { .mfi
1582         nop.m 999
1583 //
1584 //     N even: Result = Result + r
1585 //     N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
1586 //
1587 (p11)  fadd.s0 Result = r, Result
1588         nop.i 999 ;;
1589 }
1590
1591 { .mfi
1592         nop.m 999
1593 (p12)  fma.s1  S_hi = S_hi, poly1, S_hi
1594         nop.i 999 ;;
1595 }
1596
1597 { .mfi
1598         nop.m 999
1599 //
1600 //     N even: Result1 = Result + r
1601 //     N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
1602 //
1603 (p12)  fma.s1 poly1 = S_hi, r, f1
1604         nop.i 999 ;;
1605 }
1606
1607 { .mfi
1608         nop.m 999
1609 //
1610 //     N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
1611 //
1612 (p12)  fma.s1 S_hi = S_hi, poly1, S_hi
1613         nop.i 999 ;;
1614 }
1615
1616 { .mfi
1617         nop.m 999
1618 //
1619 //     N odd:  poly1  =  S_hi * poly + 1.0    64 bits
1620 //
1621 (p12)  fma.s1 poly1 = S_hi, r, f1
1622         nop.i 999 ;;
1623 }
1624
1625 { .mfi
1626         nop.m 999
1627 //
1628 //     N odd:  poly1  =  S_hi * r + 1.0
1629 //
1630 (p12)  fma.s1 poly1 = S_hi, c, poly1
1631         nop.i 999 ;;
1632 }
1633
1634 { .mfi
1635         nop.m 999
1636 //
1637 //     N odd:  poly1  =  S_hi * c + poly1
1638 //
1639 (p12)  fmpy.s1 S_lo = S_hi, poly1
1640         nop.i 999 ;;
1641 }
1642
1643 { .mfi
1644         nop.m 999
1645 //
1646 //     N odd:  S_lo  =  S_hi *  poly1
1647 //
1648 (p12)  fma.s1 S_lo = Q1_1, r, S_lo
1649         nop.i 999
1650 }
1651
1652 { .mfi
1653         nop.m 999
1654 //
1655 //     N odd:  Result =  S_hi + S_lo
1656 //
1657 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
1658         nop.i 999 ;;
1659 }
1660
1661 { .mfb
1662         nop.m 999
1663 //
1664 //     N odd:  S_lo  =  S_lo + Q1_1 * r
1665 //
1666 (p12)  fadd.s0 Result = S_hi, S_lo
1667 (p0)   br.ret.sptk b0 ;;
1668 }
1669
1670
1671 TAN_LARGER_ARG:
1672
1673 { .mmf
1674 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
1675       nop.m 999
1676 (p0)  fmpy.s1 N_0 = Arg, Inv_P_0
1677 }
1678 ;;
1679
1680 //
1681 // ARGUMENT REDUCTION CODE - CASE 3 and 4
1682 //
1683 //
1684 //    Adjust table_ptr1 to beginning of table.
1685 //    N_0 = Arg * Inv_P_0
1686 //
1687
1688
1689 { .mmi
1690 (p0)  ld8 table_ptr1 = [table_ptr1]
1691       nop.m 999
1692       nop.i 999
1693 }
1694 ;;
1695
1696
1697 { .mmi
1698 (p0)  add table_ptr1 = 8, table_ptr1 ;;
1699 //
1700 //    Point to  2*-14
1701 //
1702 (p0)  ldfs TWO_TO_NEG14 = [table_ptr1], 4
1703         nop.i 999 ;;
1704 }
1705 //
1706 //    Load 2**(-14).
1707 //
1708
1709 { .mmi
1710 (p0)  ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;;
1711 //
1712 //    N_0_fix  = integer part of N_0 .
1713 //    Adjust table_ptr1 to beginning of table.
1714 //
1715 (p0)  ldfs TWO_TO_NEG2 = [table_ptr1], 4
1716         nop.i 999 ;;
1717 }
1718 //
1719 //    Make N_0 the integer part.
1720 //
1721
1722 { .mfi
1723 (p0)  ldfs NEGTWO_TO_NEG2 = [table_ptr1]
1724 //
1725 //    Load -2**(-14).
1726 //
1727 (p0)  fcvt.fx.s1 N_0_fix = N_0
1728         nop.i 999 ;;
1729 }
1730
1731 { .mfi
1732         nop.m 999
1733 (p0)  fcvt.xf N_0 = N_0_fix
1734         nop.i 999 ;;
1735 }
1736
1737 { .mfi
1738         nop.m 999
1739 (p0)  fnma.s1 ArgPrime = N_0, P_0, Arg
1740         nop.i 999
1741 }
1742
1743 { .mfi
1744         nop.m 999
1745 (p0)  fmpy.s1 w = N_0, d_1
1746         nop.i 999 ;;
1747 }
1748
1749 { .mfi
1750         nop.m 999
1751 //
1752 //    ArgPrime = -N_0 * P_0 + Arg
1753 //    w  = N_0 * d_1
1754 //
1755 (p0)  fmpy.s1 N = ArgPrime, two_by_PI
1756         nop.i 999 ;;
1757 }
1758
1759 { .mfi
1760         nop.m 999
1761 //
1762 //    N = ArgPrime * 2/pi
1763 //
1764 (p0)  fcvt.fx.s1 N_fix = N
1765         nop.i 999 ;;
1766 }
1767
1768 { .mfi
1769         nop.m 999
1770 //
1771 //    N_fix is the integer part.
1772 //
1773 (p0)  fcvt.xf N = N_fix
1774         nop.i 999 ;;
1775 }
1776
1777 { .mfi
1778 (p0)  getf.sig N_fix_gr = N_fix
1779         nop.f 999
1780         nop.i 999 ;;
1781 }
1782
1783 { .mfi
1784         nop.m 999
1785 //
1786 //    N is the integer part of the reduced-reduced argument.
1787 //    Put the integer in a GP register.
1788 //
1789 (p0)  fnma.s1 s_val = N, P_1, ArgPrime
1790         nop.i 999
1791 }
1792
1793 { .mfi
1794         nop.m 999
1795 (p0)  fnma.s1 w = N, P_2, w
1796         nop.i 999 ;;
1797 }
1798
1799 { .mfi
1800         nop.m 999
1801 //
1802 //    s_val = -N*P_1 + ArgPrime
1803 //    w = -N*P_2 + w
1804 //
1805 (p0)  fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14
1806         nop.i 999 ;;
1807 }
1808
1809 { .mfi
1810         nop.m 999
1811 (p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
1812         nop.i 999 ;;
1813 }
1814
1815 { .mfi
1816         nop.m 999
1817 //
1818 //    Case 3: r = s_val + w (Z complete)
1819 //    Case 4: U_hi = N_0 * d_1
1820 //
1821 (p10) fmpy.s1 V_hi = N, P_2
1822         nop.i 999
1823 }
1824
1825 { .mfi
1826         nop.m 999
1827 (p11) fmpy.s1 U_hi = N_0, d_1
1828         nop.i 999 ;;
1829 }
1830
1831 { .mfi
1832         nop.m 999
1833 //
1834 //    Case 3: r = s_val + w (Z complete)
1835 //    Case 4: U_hi = N_0 * d_1
1836 //
1837 (p11) fmpy.s1 V_hi = N, P_2
1838         nop.i 999
1839 }
1840
1841 { .mfi
1842         nop.m 999
1843 (p11) fmpy.s1 U_hi = N_0, d_1
1844         nop.i 999 ;;
1845 }
1846
1847 { .mfi
1848         nop.m 999
1849 //
1850 //    Decide between case 3 and 4:
1851 //    Case 3:  s <= -2**(-14) or s >= 2**(-14)
1852 //    Case 4: -2**(-14) < s < 2**(-14)
1853 //
1854 (p10) fadd.s1 r = s_val, w
1855         nop.i 999
1856 }
1857
1858 { .mfi
1859         nop.m 999
1860 (p11) fmpy.s1 w = N, P_3
1861         nop.i 999 ;;
1862 }
1863
1864 { .mfi
1865         nop.m 999
1866 //
1867 //    Case 4: We need abs of both U_hi and V_hi - dont
1868 //    worry about switched sign of V_hi .
1869 //
1870 (p11) fsub.s1 A = U_hi, V_hi
1871         nop.i 999
1872 }
1873
1874 { .mfi
1875         nop.m 999
1876 //
1877 //    Case 4: A =  U_hi + V_hi
1878 //    Note: Worry about switched sign of V_hi, so subtract instead of add.
1879 //
1880 (p11) fnma.s1 V_lo = N, P_2, V_hi
1881         nop.i 999 ;;
1882 }
1883
1884 { .mfi
1885         nop.m 999
1886 (p11) fms.s1 U_lo = N_0, d_1, U_hi
1887         nop.i 999 ;;
1888 }
1889
1890 { .mfi
1891         nop.m 999
1892 (p11) fabs V_hiabs = V_hi
1893         nop.i 999
1894 }
1895
1896 { .mfi
1897         nop.m 999
1898 //
1899 //    Case 4: V_hi = N * P_2
1900 //            w = N * P_3
1901 //    Note the product does not include the (-) as in the writeup
1902 //    so (-) missing for V_hi and w .
1903 (p10) fadd.s1 r = s_val, w
1904         nop.i 999 ;;
1905 }
1906
1907 { .mfi
1908         nop.m 999
1909 //
1910 //    Case 3: c = s_val - r
1911 //    Case 4: U_lo = N_0 * d_1 - U_hi
1912 //
1913 (p11) fabs U_hiabs = U_hi
1914         nop.i 999
1915 }
1916
1917 { .mfi
1918         nop.m 999
1919 (p11) fmpy.s1 w = N, P_3
1920         nop.i 999 ;;
1921 }
1922
1923 { .mfi
1924         nop.m 999
1925 //
1926 //    Case 4: Set P_12 if U_hiabs >= V_hiabs
1927 //
1928 (p11) fadd.s1 C_hi = s_val, A
1929         nop.i 999 ;;
1930 }
1931
1932 { .mfi
1933         nop.m 999
1934 //
1935 //    Case 4: C_hi = s_val + A
1936 //
1937 (p11) fadd.s1 t = U_lo, V_lo
1938         nop.i 999 ;;
1939 }
1940
1941 { .mfi
1942         nop.m 999
1943 //
1944 //    Case 3: Is |r| < 2**(-2), if so set PR_7
1945 //    else set PR_8.
1946 //    Case 3: If PR_7 is set, prepare to branch to Small_R.
1947 //    Case 3: If PR_8 is set, prepare to branch to Normal_R.
1948 //
1949 (p10) fsub.s1 c = s_val, r
1950         nop.i 999 ;;
1951 }
1952
1953 { .mfi
1954         nop.m 999
1955 //
1956 //    Case 3: c = (s - r) + w (c complete)
1957 //
1958 (p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
1959         nop.i 999
1960 }
1961
1962 { .mfi
1963         nop.m 999
1964 (p11) fms.s1 w = N_0, d_2, w
1965         nop.i 999 ;;
1966 }
1967
1968 { .mfi
1969         nop.m 999
1970 //
1971 //    Case 4: V_hi = N * P_2
1972 //            w = N * P_3
1973 //    Note the product does not include the (-) as in the writeup
1974 //    so (-) missing for V_hi and w .
1975 //
1976 (p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
1977         nop.i 999 ;;
1978 }
1979
1980 { .mfi
1981         nop.m 999
1982 (p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
1983         nop.i 999 ;;
1984 }
1985
1986 { .mfb
1987         nop.m 999
1988 //
1989 //    Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
1990 //    Note: the (-) is still missing for V_hi .
1991 //    Case 4: w = w + N_0 * d_2
1992 //    Note: the (-) is now incorporated in w .
1993 //
1994 (p10) fadd.s1 c = c, w
1995 //
1996 //    Case 4: t = U_lo + V_lo
1997 //    Note: remember V_lo should be (-), subtract instead of add. NO
1998 //
1999 (p14) br.cond.spnt TAN_SMALL_R ;;
2000 }
2001
2002 { .mib
2003         nop.m 999
2004         nop.i 999
2005 (p15) br.cond.spnt TAN_NORMAL_R ;;
2006 }
2007
2008 { .mfi
2009         nop.m 999
2010 //
2011 //    Case 3: Vector off when |r| < 2**(-2).  Recall that PR_3 will be true.
2012 //    The remaining stuff is for Case 4.
2013 //
2014 (p12) fsub.s1 a = U_hi, A
2015 (p11) extr.u i_1 = N_fix_gr, 0, 1 ;;
2016 }
2017
2018 { .mfi
2019         nop.m 999
2020 //
2021 //    Case 4: C_lo = s_val - C_hi
2022 //
2023 (p11) fadd.s1 t = t, w
2024         nop.i 999
2025 }
2026
2027 { .mfi
2028         nop.m 999
2029 (p13) fadd.s1 a = V_hi, A
2030         nop.i 999 ;;
2031 }
2032
2033 //
2034 //    Case 4: a = U_hi - A
2035 //            a = V_hi - A (do an add to account for missing (-) on V_hi
2036 //
2037
2038 { .mfi
2039 (p11)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2040 (p11) fsub.s1 C_lo = s_val, C_hi
2041         nop.i 999
2042 }
2043 ;;
2044
2045 { .mmi
2046 (p11) ld8 table_ptr1 = [table_ptr1]
2047       nop.m 999
2048       nop.i 999
2049 }
2050 ;;
2051
2052 //
2053 //    Case 4: a = (U_hi - A)  + V_hi
2054 //            a = (V_hi - A)  + U_hi
2055 //    In each case account for negative missing form V_hi .
2056 //
2057 //
2058 //    Case 4: C_lo = (s_val - C_hi) + A
2059 //
2060
2061 { .mmi
2062 (p11) add table_ptr1 = 224, table_ptr1 ;;
2063 (p11) ldfe P1_1 = [table_ptr1], 16
2064         nop.i 999 ;;
2065 }
2066
2067 { .mfi
2068 (p11) ldfe P1_2 = [table_ptr1], 128
2069 //
2070 //    Case 4: w = U_lo + V_lo  + w
2071 //
2072 (p12) fsub.s1 a = a, V_hi
2073         nop.i 999 ;;
2074 }
2075 //
2076 //    Case 4: r = C_hi + C_lo
2077 //
2078
2079 { .mfi
2080 (p11) ldfe Q1_1 = [table_ptr1], 16
2081 (p11) fadd.s1 C_lo = C_lo, A
2082         nop.i 999 ;;
2083 }
2084 //
2085 //    Case 4: c = C_hi - r
2086 //    Get [i_1] - lsb of N_fix_gr.
2087 //
2088
2089 { .mfi
2090 (p11) ldfe Q1_2 = [table_ptr1], 16
2091         nop.f 999
2092         nop.i 999 ;;
2093 }
2094
2095 { .mfi
2096         nop.m 999
2097 (p13) fsub.s1 a = U_hi, a
2098         nop.i 999 ;;
2099 }
2100
2101 { .mfi
2102         nop.m 999
2103 (p11) fadd.s1 t = t, a
2104         nop.i 999 ;;
2105 }
2106
2107 { .mfi
2108         nop.m 999
2109 //
2110 //    Case 4: t = t + a
2111 //
2112 (p11) fadd.s1 C_lo = C_lo, t
2113         nop.i 999 ;;
2114 }
2115
2116 { .mfi
2117         nop.m 999
2118 //
2119 //    Case 4: C_lo = C_lo + t
2120 //
2121 (p11) fadd.s1 r = C_hi, C_lo
2122         nop.i 999 ;;
2123 }
2124
2125 { .mfi
2126         nop.m 999
2127 (p11) fsub.s1 c = C_hi, r
2128         nop.i 999
2129 }
2130
2131 { .mfi
2132         nop.m 999
2133 //
2134 //    Case 4: c = c + C_lo  finished.
2135 //    Is i_1  even or odd?
2136 //    if i_1 == 0, set PR_4, else set PR_5.
2137 //
2138 // r and c have been computed.
2139 // We known whether this is the sine or cosine routine.
2140 // Make sure ftz mode is set - should be automatic when using wre
2141 (p0)  fmpy.s1 rsq = r, r
2142         nop.i 999 ;;
2143 }
2144
2145 { .mfi
2146         nop.m 999
2147 (p11) fadd.s1 c = c , C_lo
2148 (p11) cmp.eq.unc p11, p12 =  0x0000, i_1 ;;
2149 }
2150
2151 { .mfi
2152         nop.m 999
2153 (p12) frcpa.s1 S_hi, p0 = f1, r
2154         nop.i 999
2155 }
2156
2157 { .mfi
2158         nop.m 999
2159 //
2160 //    N odd: Change sign of S_hi
2161 //
2162 (p11) fma.s1 Result = rsq, P1_2, P1_1
2163         nop.i 999 ;;
2164 }
2165
2166 { .mfi
2167         nop.m 999
2168 (p12) fma.s1 P = rsq, Q1_2, Q1_1
2169         nop.i 999
2170 }
2171
2172 { .mfi
2173         nop.m 999
2174 //
2175 //    N odd:  Result  =  S_hi + S_lo      (User supplied rounding mode for C1)
2176 //
2177 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2178         nop.i 999 ;;
2179 }
2180
2181 { .mfi
2182         nop.m 999
2183 //
2184 //    N even: rsq = r * r
2185 //    N odd:  S_hi = frcpa(r)
2186 //
2187 (p12) fmerge.ns S_hi = S_hi, S_hi
2188         nop.i 999
2189 }
2190
2191 { .mfi
2192         nop.m 999
2193 //
2194 //    N even: rsq = rsq * P1_2 + P1_1
2195 //    N odd:  poly1 =  1.0 +  S_hi * r    16 bits partial  account for necessary
2196 //
2197 (p11) fmpy.s1 Result = rsq, Result
2198         nop.i 999 ;;
2199 }
2200
2201 { .mfi
2202         nop.m 999
2203 (p12) fma.s1 poly1 = S_hi, r,f1
2204         nop.i 999
2205 }
2206
2207 { .mfi
2208         nop.m 999
2209 //
2210 //    N even: Result =  Result * rsq
2211 //    N odd:  S_hi  = S_hi + S_hi*poly1  16 bits account for necessary
2212 //
2213 (p11) fma.s1 Result = r, Result, c
2214         nop.i 999 ;;
2215 }
2216
2217 { .mfi
2218         nop.m 999
2219 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2220         nop.i 999
2221 }
2222
2223 { .mfi
2224         nop.m 999
2225 //
2226 //    N odd:   S_hi  = S_hi * poly1 + S_hi   32 bits
2227 //
2228 (p11) fadd.s0 Result= r, Result
2229         nop.i 999 ;;
2230 }
2231
2232 { .mfi
2233         nop.m 999
2234 (p12) fma.s1 poly1 =  S_hi, r, f1
2235         nop.i 999 ;;
2236 }
2237
2238 { .mfi
2239         nop.m 999
2240 //
2241 //    N even: Result = Result * r + c
2242 //    N odd:  poly1  = 1.0 + S_hi * r        32 bits partial
2243 //
2244 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2245         nop.i 999 ;;
2246 }
2247
2248 { .mfi
2249         nop.m 999
2250 (p12) fma.s1 poly1 = S_hi, r, f1
2251         nop.i 999 ;;
2252 }
2253
2254 { .mfi
2255         nop.m 999
2256 //
2257 //    N even: Result1 = Result + r  (Rounding mode S0)
2258 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2259 //
2260 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2261         nop.i 999 ;;
2262 }
2263
2264 { .mfi
2265         nop.m 999
2266 //
2267 //    N odd:  poly1  =  S_hi * poly + S_hi    64 bits
2268 //
2269 (p12) fma.s1 poly1 = S_hi, r, f1
2270         nop.i 999 ;;
2271 }
2272
2273 { .mfi
2274         nop.m 999
2275 //
2276 //    N odd:  poly1  =  S_hi * r + 1.0
2277 //
2278 (p12) fma.s1 poly1 = S_hi, c, poly1
2279         nop.i 999 ;;
2280 }
2281
2282 { .mfi
2283         nop.m 999
2284 //
2285 //    N odd:  poly1  =  S_hi * c + poly1
2286 //
2287 (p12) fmpy.s1 S_lo = S_hi, poly1
2288         nop.i 999 ;;
2289 }
2290
2291 { .mfi
2292         nop.m 999
2293 //
2294 //    N odd:  S_lo  =  S_hi *  poly1
2295 //
2296 (p12) fma.s1 S_lo = P, r, S_lo
2297         nop.i 999 ;;
2298 }
2299
2300 { .mfb
2301         nop.m 999
2302 //
2303 //    N odd:  S_lo  =  S_lo + r * P
2304 //
2305 (p12) fadd.s0 Result = S_hi, S_lo
2306 (p0)   br.ret.sptk b0 ;;
2307 }
2308
2309
2310 TAN_SMALL_R:
2311
2312 { .mii
2313         nop.m 999
2314 (p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
2315 (p0)  cmp.eq.unc p11, p12 = 0x0000, i_1
2316 }
2317
2318 { .mfi
2319         nop.m 999
2320 (p0)  fmpy.s1 rsq = r, r
2321         nop.i 999 ;;
2322 }
2323
2324 { .mfi
2325         nop.m 999
2326 (p12) frcpa.s1 S_hi, p0 = f1, r
2327         nop.i 999
2328 }
2329
2330 { .mfi
2331 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2332         nop.f 999
2333         nop.i 999
2334 }
2335 ;;
2336
2337 { .mmi
2338 (p0)  ld8 table_ptr1 = [table_ptr1]
2339       nop.m 999
2340       nop.i 999
2341 }
2342 ;;
2343
2344 // *****************************************************************
2345 // *****************************************************************
2346 // *****************************************************************
2347
2348 { .mmi
2349 (p0)  add table_ptr1 = 224, table_ptr1 ;;
2350 (p0)  ldfe P1_1 = [table_ptr1], 16
2351         nop.i 999 ;;
2352 }
2353 //    r and c have been computed.
2354 //    We known whether this is the sine or cosine routine.
2355 //    Make sure ftz mode is set - should be automatic when using wre
2356 //    |r| < 2**(-2)
2357
2358 { .mfi
2359 (p0)  ldfe P1_2 = [table_ptr1], 16
2360 (p11) fmpy.s1 r_to_the_8 = rsq, rsq
2361         nop.i 999 ;;
2362 }
2363 //
2364 //    Set table_ptr1 to beginning of constant table.
2365 //    Get [i_1] - lsb of N_fix_gr.
2366 //
2367
2368 { .mfi
2369 (p0)  ldfe P1_3 = [table_ptr1], 96
2370 //
2371 //    N even: rsq = r * r
2372 //    N odd:  S_hi = frcpa(r)
2373 //
2374 (p12) fmerge.ns S_hi = S_hi, S_hi
2375         nop.i 999 ;;
2376 }
2377 //
2378 //    Is i_1  even or odd?
2379 //    if i_1 == 0, set PR_11.
2380 //    if i_1 != 0, set PR_12.
2381 //
2382
2383 { .mfi
2384 (p11) ldfe P1_9 = [table_ptr1], -16
2385 //
2386 //    N even: Poly2 = P1_7 + Poly2 * rsq
2387 //    N odd:  poly2 = Q1_5 + poly2 * rsq
2388 //
2389 (p11) fadd.s1 CORR = rsq, f1
2390         nop.i 999 ;;
2391 }
2392
2393 { .mmi
2394 (p11) ldfe P1_8 = [table_ptr1], -16 ;;
2395 //
2396 //    N even: Poly1 = P1_2 + P1_3 * rsq
2397 //    N odd:  poly1 =  1.0 +  S_hi * r
2398 //    16 bits partial  account for necessary (-1)
2399 //
2400 (p11) ldfe P1_7 = [table_ptr1], -16
2401         nop.i 999 ;;
2402 }
2403 //
2404 //    N even: Poly1 = P1_1 + Poly1 * rsq
2405 //    N odd:  S_hi  =  S_hi + S_hi * poly1)     16 bits account for necessary
2406 //
2407
2408 { .mfi
2409 (p11) ldfe P1_6 = [table_ptr1], -16
2410 //
2411 //    N even: Poly2 = P1_5 + Poly2 * rsq
2412 //    N odd:  poly2 = Q1_3 + poly2 * rsq
2413 //
2414 (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
2415         nop.i 999 ;;
2416 }
2417 //
2418 //    N even: Poly1 =  Poly1 * rsq
2419 //    N odd:  poly1  = 1.0 + S_hi * r         32 bits partial
2420 //
2421
2422 { .mfi
2423 (p11) ldfe P1_5 = [table_ptr1], -16
2424 (p12) fma.s1 poly1 =  S_hi, r, f1
2425         nop.i 999 ;;
2426 }
2427 //
2428 //    N even: CORR =  CORR * c
2429 //    N odd:  S_hi  =  S_hi * poly1 + S_hi    32 bits
2430 //
2431
2432 //
2433 //    N even: Poly2 = P1_6 + Poly2 * rsq
2434 //    N odd:  poly2 = Q1_4 + poly2 * rsq
2435 //
2436 { .mmf
2437 (p0)  addl           table_ptr2   = @ltoff(TAN_BASE_CONSTANTS), gp
2438 (p11) ldfe P1_4 = [table_ptr1], -16
2439 (p11) fmpy.s1 CORR =  CORR, c
2440 }
2441 ;;
2442
2443
2444 { .mmi
2445 (p0)  ld8 table_ptr2 = [table_ptr2]
2446       nop.m 999
2447       nop.i 999
2448 }
2449 ;;
2450
2451
2452 { .mii
2453 (p0)  add table_ptr2 = 464, table_ptr2
2454         nop.i 999 ;;
2455         nop.i 999
2456 }
2457
2458 { .mfi
2459         nop.m 999
2460 (p11) fma.s1 Poly1 = P1_3, rsq, P1_2
2461         nop.i 999 ;;
2462 }
2463
2464 { .mfi
2465 (p0)  ldfe Q1_7 = [table_ptr2], -16
2466 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2467         nop.i 999 ;;
2468 }
2469
2470 { .mfi
2471 (p0)  ldfe Q1_6 = [table_ptr2], -16
2472 (p11) fma.s1 Poly2 = P1_9, rsq, P1_8
2473         nop.i 999 ;;
2474 }
2475
2476 { .mmi
2477 (p0)  ldfe Q1_5 = [table_ptr2], -16 ;;
2478 (p12) ldfe Q1_4 = [table_ptr2], -16
2479         nop.i 999 ;;
2480 }
2481
2482 { .mfi
2483 (p12) ldfe Q1_3 = [table_ptr2], -16
2484 //
2485 //    N even: Poly2 = P1_8 + P1_9 * rsq
2486 //    N odd:  poly2 = Q1_6 + Q1_7 * rsq
2487 //
2488 (p11) fma.s1 Poly1 = Poly1, rsq, P1_1
2489         nop.i 999 ;;
2490 }
2491
2492 { .mfi
2493 (p12) ldfe Q1_2 = [table_ptr2], -16
2494 (p12) fma.s1 poly1 = S_hi, r, f1
2495         nop.i 999 ;;
2496 }
2497
2498 { .mfi
2499 (p12) ldfe Q1_1 = [table_ptr2], -16
2500 (p11) fma.s1 Poly2 = Poly2, rsq, P1_7
2501         nop.i 999 ;;
2502 }
2503
2504 { .mfi
2505         nop.m 999
2506 //
2507 //    N even: CORR =  rsq + 1
2508 //    N even: r_to_the_8 =  rsq * rsq
2509 //
2510 (p11) fmpy.s1 Poly1 = Poly1, rsq
2511         nop.i 999 ;;
2512 }
2513
2514 { .mfi
2515         nop.m 999
2516 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2517         nop.i 999
2518 }
2519
2520 { .mfi
2521         nop.m 999
2522 (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
2523         nop.i 999 ;;
2524 }
2525
2526 { .mfi
2527         nop.m 999
2528 (p11) fma.s1 Poly2 = Poly2, rsq, P1_6
2529         nop.i 999 ;;
2530 }
2531
2532 { .mfi
2533         nop.m 999
2534 (p12) fma.s1 poly1 = S_hi, r, f1
2535         nop.i 999
2536 }
2537
2538 { .mfi
2539         nop.m 999
2540 (p12) fma.s1 poly2 = poly2, rsq, Q1_5
2541         nop.i 999 ;;
2542 }
2543
2544 { .mfi
2545         nop.m 999
2546 (p11) fma.s1 Poly2= Poly2, rsq, P1_5
2547         nop.i 999 ;;
2548 }
2549
2550 { .mfi
2551         nop.m 999
2552 (p12) fma.s1 S_hi =  S_hi, poly1, S_hi
2553         nop.i 999
2554 }
2555
2556 { .mfi
2557         nop.m 999
2558 (p12) fma.s1 poly2 = poly2, rsq, Q1_4
2559         nop.i 999 ;;
2560 }
2561
2562 { .mfi
2563         nop.m 999
2564 //
2565 //    N even: r_to_the_8 = r_to_the_8 * r_to_the_8
2566 //    N odd:  poly1  =  S_hi * r + 1.0       64 bits partial
2567 //
2568 (p11) fma.s1 Poly2 = Poly2, rsq, P1_4
2569         nop.i 999 ;;
2570 }
2571
2572 { .mfi
2573         nop.m 999
2574 //
2575 //    N even: Result = CORR + Poly * r
2576 //    N odd:  P = Q1_1 + poly2 * rsq
2577 //
2578 (p12) fma.s1 poly1 = S_hi, r, f1
2579         nop.i 999
2580 }
2581
2582 { .mfi
2583         nop.m 999
2584 (p12) fma.s1 poly2 = poly2, rsq, Q1_3
2585         nop.i 999 ;;
2586 }
2587
2588 { .mfi
2589         nop.m 999
2590 //
2591 //    N even: Poly2 = P1_4 + Poly2 * rsq
2592 //    N odd:  poly2 = Q1_2 + poly2 * rsq
2593 //
2594 (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
2595         nop.i 999 ;;
2596 }
2597
2598 { .mfi
2599         nop.m 999
2600 (p12) fma.s1 poly1 = S_hi, c, poly1
2601         nop.i 999
2602 }
2603
2604 { .mfi
2605         nop.m 999
2606 (p12) fma.s1 poly2 = poly2, rsq, Q1_2
2607         nop.i 999 ;;
2608 }
2609
2610 { .mfi
2611         nop.m 999
2612 //
2613 //    N even: Poly = Poly1 + Poly2 * r_to_the_8
2614 //    N odd:  S_hi =  S_hi * poly1 + S_hi    64 bits
2615 //
2616 (p11) fma.s1 Result = Poly, r, CORR
2617         nop.i 999 ;;
2618 }
2619
2620 { .mfi
2621         nop.m 999
2622 //
2623 //    N even: Result =  r + Result  (User supplied rounding mode)
2624 //    N odd:  poly1  =  S_hi * c + poly1
2625 //
2626 (p12) fmpy.s1 S_lo = S_hi, poly1
2627         nop.i 999
2628 }
2629
2630 { .mfi
2631         nop.m 999
2632 (p12) fma.s1 P = poly2, rsq, Q1_1
2633         nop.i 999 ;;
2634 }
2635
2636 { .mfi
2637         nop.m 999
2638 //
2639 //    N odd:  poly1  =  S_hi * r + 1.0
2640 //
2641 (p11) fadd.s0 Result = Result, r
2642         nop.i 999 ;;
2643 }
2644
2645 { .mfi
2646         nop.m 999
2647 //
2648 //    N odd:  S_lo  =  S_hi *  poly1
2649 //
2650 (p12) fma.s1 S_lo = Q1_1, c, S_lo
2651         nop.i 999
2652 }
2653
2654 { .mfi
2655         nop.m 999
2656 //
2657 //    N odd:  Result = Result + S_hi  (user supplied rounding mode)
2658 //
2659 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2660         nop.i 999 ;;
2661 }
2662
2663 { .mfi
2664         nop.m 999
2665 //
2666 //    N odd:  S_lo  =  Q1_1 * c + S_lo
2667 //
2668 (p12) fma.s1 Result = P, r, S_lo
2669         nop.i 999 ;;
2670 }
2671
2672 { .mfb
2673         nop.m 999
2674 //
2675 //    N odd:  Result =  S_lo + r * P
2676 //
2677 (p12) fadd.s0 Result = Result, S_hi
2678 (p0)   br.ret.sptk b0 ;;
2679 }
2680
2681
2682 TAN_NORMAL_R:
2683
2684 { .mfi
2685 (p0)  getf.sig sig_r = r
2686 // *******************************************************************
2687 // *******************************************************************
2688 // *******************************************************************
2689 //
2690 //    r and c have been computed.
2691 //    Make sure ftz mode is set - should be automatic when using wre
2692 //
2693 //
2694 //    Get [i_1] -  lsb of N_fix_gr alone.
2695 //
2696 (p0)  fmerge.s  Pos_r = f1, r
2697 (p0)  extr.u i_1 = N_fix_gr, 0, 1 ;;
2698 }
2699
2700 { .mfi
2701         nop.m 999
2702 (p0)  fmerge.s  sgn_r =  r, f1
2703 (p0)  cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
2704 }
2705
2706 { .mfi
2707         nop.m 999
2708         nop.f 999
2709 (p0)  extr.u lookup = sig_r, 58, 5
2710 }
2711
2712 { .mlx
2713         nop.m 999
2714 (p0)  movl Create_B = 0x8200000000000000 ;;
2715 }
2716
2717 { .mfi
2718 (p0)  addl           table_ptr1   = @ltoff(TAN_BASE_CONSTANTS), gp
2719         nop.f 999
2720 (p0)  dep Create_B = lookup, Create_B, 58, 5
2721 }
2722 ;;
2723
2724 //
2725 //    Get [i_1] -  lsb of N_fix_gr alone.
2726 //    Pos_r = abs (r)
2727 //
2728
2729
2730 { .mmi
2731       ld8 table_ptr1 = [table_ptr1]
2732       nop.m 999
2733       nop.i 999
2734 }
2735 ;;
2736
2737
2738 { .mmi
2739         nop.m 999
2740 (p0)  setf.sig B = Create_B
2741 //
2742 //    Set table_ptr1 and table_ptr2 to base address of
2743 //    constant table.
2744 //
2745 (p0)  add table_ptr1 = 480, table_ptr1 ;;
2746 }
2747
2748 { .mmb
2749         nop.m 999
2750 //
2751 //    Is i_1 or i_0  == 0 ?
2752 //    Create the constant  1 00000 1000000000000000000000...
2753 //
2754 (p0)  ldfe P2_1 = [table_ptr1], 16
2755         nop.b 999
2756 }
2757
2758 { .mmi
2759         nop.m 999 ;;
2760 (p0)  getf.exp exp_r = Pos_r
2761         nop.i 999
2762 }
2763 //
2764 //    Get r's exponent
2765 //    Get r's significand
2766 //
2767
2768 { .mmi
2769 (p0)  ldfe P2_2 = [table_ptr1], 16 ;;
2770 //
2771 //    Get the 5 bits or r for the lookup.   1.xxxxx ....
2772 //    from sig_r.
2773 //    Grab  lsb of exp of B
2774 //
2775 (p0)  ldfe P2_3 = [table_ptr1], 16
2776         nop.i 999 ;;
2777 }
2778
2779 { .mii
2780         nop.m 999
2781 (p0)  andcm table_offset = 0x0001, exp_r ;;
2782 (p0)  shl table_offset = table_offset, 9 ;;
2783 }
2784
2785 { .mii
2786         nop.m 999
2787 //
2788 //    Deposit   0 00000 1000000000000000000000... on
2789 //              1 xxxxx yyyyyyyyyyyyyyyyyyyyyy...,
2790 //    getting rid of the ys.
2791 //    Is  B = 2** -2 or  B= 2** -1? If 2**-1, then
2792 //    we want an offset of 512 for table addressing.
2793 //
2794 (p0)  shladd table_offset = lookup, 4, table_offset ;;
2795 //
2796 //    B =  ........ 1xxxxx 1000000000000000000...
2797 //
2798 (p0)  add table_ptr1 = table_ptr1, table_offset ;;
2799 }
2800
2801 { .mmb
2802         nop.m 999
2803 //
2804 //   B =  ........ 1xxxxx 1000000000000000000...
2805 //   Convert B so it has the same exponent as Pos_r
2806 //
2807 (p0)  ldfd T_hi = [table_ptr1], 8
2808         nop.b 999 ;;
2809 }
2810
2811 //
2812 //    x = |r| - B
2813 //    Load T_hi.
2814 //    Load C_hi.
2815 //
2816
2817 { .mmf
2818 (p0)  addl           table_ptr2   = @ltoff(TAN_BASE_CONSTANTS), gp
2819 (p0)  ldfs T_lo = [table_ptr1]
2820 (p0)  fmerge.se B = Pos_r, B
2821 }
2822 ;;
2823
2824 { .mmi
2825       ld8 table_ptr2 = [table_ptr2]
2826       nop.m 999
2827       nop.i 999
2828 }
2829 ;;
2830
2831 { .mii
2832 (p0)  add table_ptr2 = 1360, table_ptr2
2833         nop.i 999 ;;
2834 (p0)  add table_ptr2 = table_ptr2, table_offset ;;
2835 }
2836
2837 { .mfi
2838 (p0)  ldfd C_hi = [table_ptr2], 8
2839 (p0)  fsub.s1 x = Pos_r, B
2840         nop.i 999 ;;
2841 }
2842
2843 { .mii
2844 (p0)  ldfs C_lo = [table_ptr2],255
2845         nop.i 999 ;;
2846 //
2847 //    xsq = x * x
2848 //    N even: Tx = T_hi * x
2849 //    Load T_lo.
2850 //    Load C_lo - increment pointer to get SC_inv
2851 //    - cant get all the way, do an add later.
2852 //
2853 (p0)  add table_ptr2 = 569, table_ptr2 ;;
2854 }
2855 //
2856 //    N even: Tx1 = Tx + 1
2857 //    N odd:  Cx1 = 1 - Cx
2858 //
2859
2860 { .mfi
2861 (p0)  ldfe SC_inv = [table_ptr2], 0
2862         nop.f 999
2863         nop.i 999 ;;
2864 }
2865
2866 { .mfi
2867         nop.m 999
2868 (p0)  fmpy.s1 xsq = x, x
2869         nop.i 999
2870 }
2871
2872 { .mfi
2873         nop.m 999
2874 (p11) fmpy.s1 Tx = T_hi, x
2875         nop.i 999 ;;
2876 }
2877
2878 { .mfi
2879         nop.m 999
2880 (p12) fmpy.s1 Cx = C_hi, x
2881         nop.i 999 ;;
2882 }
2883
2884 { .mfi
2885         nop.m 999
2886 //
2887 //    N odd: Cx = C_hi * x
2888 //
2889 (p0)  fma.s1 P = P2_3, xsq, P2_2
2890         nop.i 999
2891 }
2892
2893 { .mfi
2894         nop.m 999
2895 //
2896 //    N even and odd: P = P2_3 + P2_2 * xsq
2897 //
2898 (p11) fadd.s1 Tx1 = Tx, f1
2899         nop.i 999 ;;
2900 }
2901
2902 { .mfi
2903         nop.m 999
2904 //
2905 //    N even: D = C_hi - tanx
2906 //    N odd: D = T_hi + tanx
2907 //
2908 (p11) fmpy.s1 CORR = SC_inv, T_hi
2909         nop.i 999
2910 }
2911
2912 { .mfi
2913         nop.m 999
2914 (p0)  fmpy.s1 Sx = SC_inv, x
2915         nop.i 999 ;;
2916 }
2917
2918 { .mfi
2919         nop.m 999
2920 (p12) fmpy.s1 CORR = SC_inv, C_hi
2921         nop.i 999 ;;
2922 }
2923
2924 { .mfi
2925         nop.m 999
2926 (p12) fsub.s1 V_hi = f1, Cx
2927         nop.i 999 ;;
2928 }
2929
2930 { .mfi
2931         nop.m 999
2932 (p0)  fma.s1 P = P, xsq, P2_1
2933         nop.i 999
2934 }
2935
2936 { .mfi
2937         nop.m 999
2938 //
2939 //    N even and odd: P = P2_1 + P * xsq
2940 //
2941 (p11) fma.s1 V_hi = Tx, Tx1, f1
2942         nop.i 999 ;;
2943 }
2944
2945 { .mfi
2946         nop.m 999
2947 //
2948 //    N even: Result  = sgn_r * tail + T_hi (user rounding mode for C1)
2949 //    N odd:  Result  = sgn_r * tail + C_hi (user rounding mode for C1)
2950 //
2951 (p0)   fmpy.s0 fp_tmp = fp_tmp, fp_tmp  // Dummy mult to set inexact
2952         nop.i 999 ;;
2953 }
2954
2955 { .mfi
2956         nop.m 999
2957 (p0)  fmpy.s1 CORR = CORR, c
2958         nop.i 999 ;;
2959 }
2960
2961 { .mfi
2962         nop.m 999
2963 (p12) fnma.s1 V_hi = Cx,V_hi,f1
2964         nop.i 999 ;;
2965 }
2966
2967 { .mfi
2968         nop.m 999
2969 //
2970 //    N even: V_hi = Tx * Tx1 + 1
2971 //    N odd: Cx1 = 1 - Cx * Cx1
2972 //
2973 (p0)  fmpy.s1 P = P, xsq
2974         nop.i 999
2975 }
2976
2977 { .mfi
2978         nop.m 999
2979 //
2980 //    N even and odd: P = P * xsq
2981 //
2982 (p11) fmpy.s1 V_hi = V_hi, T_hi
2983         nop.i 999 ;;
2984 }
2985
2986 { .mfi
2987         nop.m 999
2988 //
2989 //    N even and odd: tail = P * tail + V_lo
2990 //
2991 (p11) fmpy.s1 T_hi = sgn_r, T_hi
2992         nop.i 999 ;;
2993 }
2994
2995 { .mfi
2996         nop.m 999
2997 (p0)  fmpy.s1 CORR = CORR, sgn_r
2998         nop.i 999 ;;
2999 }
3000
3001 { .mfi
3002         nop.m 999
3003 (p12) fmpy.s1 V_hi = V_hi,C_hi
3004         nop.i 999 ;;
3005 }
3006
3007 { .mfi
3008         nop.m 999
3009 //
3010 //    N even: V_hi = T_hi * V_hi
3011 //    N odd: V_hi  = C_hi * V_hi
3012 //
3013 (p0)  fma.s1 tanx = P, x, x
3014         nop.i 999
3015 }
3016
3017 { .mfi
3018         nop.m 999
3019 (p12) fnmpy.s1 C_hi = sgn_r, C_hi
3020         nop.i 999 ;;
3021 }
3022
3023 { .mfi
3024         nop.m 999
3025 //
3026 //    N even: V_lo = 1 - V_hi + C_hi
3027 //    N odd: V_lo = 1 - V_hi + T_hi
3028 //
3029 (p11) fadd.s1 CORR = CORR, T_lo
3030         nop.i 999
3031 }
3032
3033 { .mfi
3034         nop.m 999
3035 (p12) fsub.s1 CORR = CORR, C_lo
3036         nop.i 999 ;;
3037 }
3038
3039 { .mfi
3040         nop.m 999
3041 //
3042 //    N even and odd: tanx = x + x * P
3043 //    N even and odd: Sx = SC_inv * x
3044 //
3045 (p11) fsub.s1 D = C_hi, tanx
3046         nop.i 999
3047 }
3048
3049 { .mfi
3050         nop.m 999
3051 (p12) fadd.s1 D = T_hi, tanx
3052         nop.i 999 ;;
3053 }
3054
3055 { .mfi
3056         nop.m 999
3057 //
3058 //    N odd: CORR = SC_inv * C_hi
3059 //    N even: CORR = SC_inv * T_hi
3060 //
3061 (p0)  fnma.s1 D = V_hi, D, f1
3062         nop.i 999 ;;
3063 }
3064
3065 { .mfi
3066         nop.m 999
3067 //
3068 //    N even and odd: D = 1 - V_hi * D
3069 //    N even and odd: CORR = CORR * c
3070 //
3071 (p0)  fma.s1 V_hi = V_hi, D, V_hi
3072         nop.i 999 ;;
3073 }
3074
3075 { .mfi
3076         nop.m 999
3077 //
3078 //    N even and odd: V_hi = V_hi + V_hi * D
3079 //    N even and odd: CORR = sgn_r * CORR
3080 //
3081 (p11) fnma.s1 V_lo = V_hi, C_hi, f1
3082         nop.i 999
3083 }
3084
3085 { .mfi
3086         nop.m 999
3087 (p12) fnma.s1 V_lo = V_hi, T_hi, f1
3088         nop.i 999 ;;
3089 }
3090
3091 { .mfi
3092         nop.m 999
3093 //
3094 //    N even: CORR = COOR + T_lo
3095 //    N odd: CORR = CORR - C_lo
3096 //
3097 (p11) fma.s1 V_lo = tanx, V_hi, V_lo
3098         nop.i 999
3099 }
3100
3101 { .mfi
3102         nop.m 999
3103 (p12) fnma.s1 V_lo = tanx, V_hi, V_lo
3104         nop.i 999 ;;
3105 }
3106
3107 { .mfi
3108         nop.m 999
3109 //
3110 //    N even: V_lo = V_lo + V_hi * tanx
3111 //    N odd: V_lo = V_lo - V_hi * tanx
3112 //
3113 (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
3114         nop.i 999
3115 }
3116
3117 { .mfi
3118         nop.m 999
3119 (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
3120         nop.i 999 ;;
3121 }
3122
3123 { .mfi
3124         nop.m 999
3125 //
3126 //    N  even: V_lo = V_lo - V_hi * C_lo
3127 //    N  odd: V_lo = V_lo - V_hi * T_lo
3128 //
3129 (p0)  fmpy.s1 V_lo = V_hi, V_lo
3130         nop.i 999 ;;
3131 }
3132
3133 { .mfi
3134         nop.m 999
3135 //
3136 //    N even and odd: V_lo = V_lo * V_hi
3137 //
3138 (p0)  fadd.s1 tail = V_hi, V_lo
3139         nop.i 999 ;;
3140 }
3141
3142 { .mfi
3143         nop.m 999
3144 //
3145 //    N even and odd: tail = V_hi + V_lo
3146 //
3147 (p0)  fma.s1 tail = tail, P, V_lo
3148         nop.i 999 ;;
3149 }
3150
3151 { .mfi
3152         nop.m 999
3153 //
3154 //    N even: T_hi = sgn_r * T_hi
3155 //    N odd : C_hi = -sgn_r * C_hi
3156 //
3157 (p0)  fma.s1 tail = tail, Sx, CORR
3158         nop.i 999 ;;
3159 }
3160
3161 { .mfi
3162         nop.m 999
3163 //
3164 //    N even and odd: tail = Sx * tail + CORR
3165 //
3166 (p0)  fma.s1 tail = V_hi, Sx, tail
3167         nop.i 999 ;;
3168 }
3169
3170 { .mfi
3171         nop.m 999
3172 //
3173 //    N even an odd: tail = Sx * V_hi + tail
3174 //
3175 (p11) fma.s0 Result = sgn_r, tail, T_hi
3176         nop.i 999
3177 }
3178
3179 { .mfb
3180         nop.m 999
3181 (p12) fma.s0 Result = sgn_r, tail, C_hi
3182 (p0)   br.ret.sptk b0 ;;
3183 }
3184
3185 .endp __libm_tan
3186 ASM_SIZE_DIRECTIVE(__libm_tan)
3187
3188
3189
3190 // *******************************************************************
3191 // *******************************************************************
3192 // *******************************************************************
3193 //
3194 //     Special Code to handle very large argument case.
3195 //     Call int pi_by_2_reduce(&x,&r)
3196 //     for |arguments| >= 2**63
3197 //     (Arg or x) is in f8
3198 //     Address to save r and c as double
3199
3200 //                 (1)                    (2)                 (3) (call)         (4)
3201 //            sp -> +               psp -> +            psp -> +           sp ->  +
3202 //                  |                      |                   |                  |
3203 //                  |                r50 ->| <- r50      f0  ->|           r50 -> | -> c
3204 //                  |                      |                   |                  |
3205 //         sp-32 -> | <- r50          f0 ->|             f0  ->| <- r50    r49 -> | -> r
3206 //                  |                      |                   |                  |
3207 //                  |               r49  ->| <- r49     Arg  ->| <- r49           | -> x
3208 //                  |                      |                   |                  |
3209 //         sp -64 ->|             sp -64 ->|          sp -64 ->|                  |
3210 //
3211 //            save pfs           save b0                                     restore gp
3212 //            save gp                                                        restore b0
3213 //                                                                           restore pfs
3214
3215
3216
3217 .proc __libm_callout
3218 __libm_callout:
3219 TAN_ARG_TOO_LARGE:
3220 .prologue
3221 // (1)
3222 { .mfi
3223         add   GR_Parameter_r =-32,sp                        // Parameter: r address
3224         nop.f 0
3225 .save   ar.pfs,GR_SAVE_PFS
3226         mov  GR_SAVE_PFS=ar.pfs                 // Save ar.pfs
3227 }
3228 { .mfi
3229 .fframe 64
3230         add sp=-64,sp                           // Create new stack
3231         nop.f 0
3232         mov GR_SAVE_GP=gp                       // Save gp
3233 };;
3234
3235 // (2)
3236 { .mmi
3237         stfe [GR_Parameter_r ] = f0,16                      // Clear Parameter r on stack
3238         add  GR_Parameter_X = 16,sp                        // Parameter x address
3239 .save   b0, GR_SAVE_B0
3240         mov GR_SAVE_B0=b0                       // Save b0
3241 };;
3242
3243 // (3)
3244 .body
3245 { .mib
3246         stfe [GR_Parameter_r ] = f0,-16                     // Clear Parameter c on stack
3247         nop.i 0
3248         nop.b 0
3249 }
3250 { .mib
3251         stfe [GR_Parameter_X] = Arg                        // Store Parameter x on stack
3252         nop.i 0
3253 (p0)    br.call.sptk b0=__libm_pi_by_2_reduce#
3254 }
3255 ;;
3256
3257
3258 // (4)
3259 { .mmi
3260         mov   gp = GR_SAVE_GP                  // Restore gp
3261 (p0)    mov   N_fix_gr = r8
3262         nop.i 999
3263 }
3264 ;;
3265
3266 { .mmi
3267 (p0)    ldfe  Arg        =[GR_Parameter_X],16
3268 (p0)    ldfs  TWO_TO_NEG2 = [table_ptr2],4
3269         nop.i 999
3270 }
3271 ;;
3272
3273
3274 { .mmb
3275 (p0)    ldfe  r =[GR_Parameter_r ],16
3276 (p0)    ldfs  NEGTWO_TO_NEG2 = [table_ptr2],4
3277         nop.b 999 ;;
3278 }
3279
3280 { .mfi
3281 (p0)    ldfe  c =[GR_Parameter_r ]
3282         nop.f 999
3283         nop.i 999 ;;
3284 }
3285
3286 { .mfi
3287         nop.m 999
3288 //
3289 //     Is |r| < 2**(-2)
3290 //
3291 (p0)   fcmp.lt.unc.s1  p6, p0 = r, TWO_TO_NEG2
3292         mov   b0 = GR_SAVE_B0                  // Restore return address
3293 }
3294 ;;
3295
3296 { .mfi
3297        nop.m 999
3298 (p6)   fcmp.gt.unc.s1  p6, p0 = r, NEGTWO_TO_NEG2
3299        mov   ar.pfs = GR_SAVE_PFS             // Restore ar.pfs
3300 }
3301 ;;
3302
3303 { .mbb
3304 .restore sp
3305         add   sp = 64,sp                       // Restore stack pointer
3306 (p6)   br.cond.spnt TAN_SMALL_R
3307 (p0)   br.cond.sptk TAN_NORMAL_R
3308 }
3309 ;;
3310 .endp __libm_callout
3311 ASM_SIZE_DIRECTIVE(__libm_callout)
3312
3313
3314 .proc __libm_TAN_SPECIAL
3315 __libm_TAN_SPECIAL:
3316
3317 //
3318 //     Code for NaNs, Unsupporteds, Infs, or +/- zero ?
3319 //     Invalid raised for Infs and SNaNs.
3320 //
3321
3322 { .mfb
3323         nop.m 999
3324 (p0)   fmpy.s0 Arg = Arg, f0
3325 (p0)   br.ret.sptk b0
3326 }
3327 .endp __libm_TAN_SPECIAL
3328 ASM_SIZE_DIRECTIVE(__libm_TAN_SPECIAL)
3329
3330
3331 .type __libm_pi_by_2_reduce#,@function
3332 .global __libm_pi_by_2_reduce#