lib/std/math/__trig.zig


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273

// Ported from musl, which is licensed under the MIT license:
// https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT
//
// https://git.musl-libc.org/cgit/musl/tree/src/math/__cos.c
// https://git.musl-libc.org/cgit/musl/tree/src/math/__cosdf.c
// https://git.musl-libc.org/cgit/musl/tree/src/math/__sin.c
// https://git.musl-libc.org/cgit/musl/tree/src/math/__sindf.c
// https://git.musl-libc.org/cgit/musl/tree/src/math/__tand.c
// https://git.musl-libc.org/cgit/musl/tree/src/math/__tandf.c

// kernel cos function on [-pi/4, pi/4], pi/4 ~ 0.785398164
// Input x is assumed to be bounded by ~pi/4 in magnitude.
// Input y is the tail of x.
//
// Algorithm
//      1. Since cos(-x) = cos(x), we need only to consider positive x.
//      2. if x < 2^-27 (hx<0x3e400000 0), return 1 with inexact if x!=0.
//      3. cos(x) is approximated by a polynomial of degree 14 on
//         [0,pi/4]
//                                       4            14
//              cos(x) ~ 1 - x*x/2 + C1*x + ... + C6*x
//         where the remez error is
//
//      |              2     4     6     8     10    12     14 |     -58
//      |cos(x)-(1-.5*x +C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  )| <= 2
//      |                                                      |
//
//                     4     6     8     10    12     14
//      4. let r = C1*x +C2*x +C3*x +C4*x +C5*x  +C6*x  , then
//             cos(x) ~ 1 - x*x/2 + r
//         since cos(x+y) ~ cos(x) - sin(x)*y
//                        ~ cos(x) - x*y,
//         a correction term is necessary in cos(x) and hence
//              cos(x+y) = 1 - (x*x/2 - (r - x*y))
//         For better accuracy, rearrange to
//              cos(x+y) ~ w + (tmp + (r-x*y))
//         where w = 1 - x*x/2 and tmp is a tiny correction term
//         (1 - x*x/2 == w + tmp exactly in infinite precision).
//         The exactness of w + tmp in infinite precision depends on w
//         and tmp having the same precision as x.  If they have extra
//         precision due to compiler bugs, then the extra precision is
//         only good provided it is retained in all terms of the final
//         expression for cos().  Retention happens in all cases tested
//         under FreeBSD, so don't pessimize things by forcibly clipping
//         any extra precision in w.
pub fn __cos(x: f64, y: f64) f64 {
    const C1 = 4.16666666666666019037e-02; // 0x3FA55555, 0x5555554C
    const C2 = -1.38888888888741095749e-03; // 0xBF56C16C, 0x16C15177
    const C3 = 2.48015872894767294178e-05; // 0x3EFA01A0, 0x19CB1590
    const C4 = -2.75573143513906633035e-07; // 0xBE927E4F, 0x809C52AD
    const C5 = 2.08757232129817482790e-09; // 0x3E21EE9E, 0xBDB4B1C4
    const C6 = -1.13596475577881948265e-11; // 0xBDA8FAE9, 0xBE8838D4

    const z = x * x;
    const zs = z * z;
    const r = z * (C1 + z * (C2 + z * C3)) + zs * zs * (C4 + z * (C5 + z * C6));
    const hz = 0.5 * z;
    const w = 1.0 - hz;
    return w + (((1.0 - w) - hz) + (z * r - x * y));
}

pub fn __cosdf(x: f64) f32 {
    // |cos(x) - c(x)| < 2**-34.1 (~[-5.37e-11, 5.295e-11]).
    const C0 = -0x1ffffffd0c5e81.0p-54; // -0.499999997251031003120
    const C1 = 0x155553e1053a42.0p-57; //  0.0416666233237390631894
    const C2 = -0x16c087e80f1e27.0p-62; // -0.00138867637746099294692
    const C3 = 0x199342e0ee5069.0p-68; //  0.0000243904487962774090654

    // Try to optimize for parallel evaluation as in __tandf.c.
    const z = x * x;
    const w = z * z;
    const r = C2 + z * C3;
    return @floatCast(f32, ((1.0 + z * C0) + w * C1) + (w * z) * r);
}

// kernel sin function on ~[-pi/4, pi/4] (except on -0), pi/4 ~ 0.7854
// Input x is assumed to be bounded by ~pi/4 in magnitude.
// Input y is the tail of x.
// Input iy indicates whether y is 0. (if iy=0, y assume to be 0).
//
// Algorithm
//      1. Since sin(-x) = -sin(x), we need only to consider positive x.
//      2. Callers must return sin(-0) = -0 without calling here since our
//         odd polynomial is not evaluated in a way that preserves -0.
//         Callers may do the optimization sin(x) ~ x for tiny x.
//      3. sin(x) is approximated by a polynomial of degree 13 on
//         [0,pi/4]
//                               3            13
//              sin(x) ~ x + S1*x + ... + S6*x
//         where
//
//      |sin(x)         2     4     6     8     10     12  |     -58
//      |----- - (1+S1*x +S2*x +S3*x +S4*x +S5*x  +S6*x   )| <= 2
//      |  x                                               |
//
//      4. sin(x+y) = sin(x) + sin'(x')*y
//                  ~ sin(x) + (1-x*x/2)*y
//         For better accuracy, let
//                   3      2      2      2      2
//              r = x *(S2+x *(S3+x *(S4+x *(S5+x *S6))))
//         then                   3    2
//              sin(x) = x + (S1*x + (x *(r-y/2)+y))
pub fn __sin(x: f64, y: f64, iy: i32) f64 {
    const S1 = -1.66666666666666324348e-01; // 0xBFC55555, 0x55555549
    const S2 = 8.33333333332248946124e-03; // 0x3F811111, 0x1110F8A6
    const S3 = -1.98412698298579493134e-04; // 0xBF2A01A0, 0x19C161D5
    const S4 = 2.75573137070700676789e-06; // 0x3EC71DE3, 0x57B1FE7D
    const S5 = -2.50507602534068634195e-08; // 0xBE5AE5E6, 0x8A2B9CEB
    const S6 = 1.58969099521155010221e-10; // 0x3DE5D93A, 0x5ACFD57C

    const z = x * x;
    const w = z * z;
    const r = S2 + z * (S3 + z * S4) + z * w * (S5 + z * S6);
    const v = z * x;
    if (iy == 0) {
        return x + v * (S1 + z * r);
    } else {
        return x - ((z * (0.5 * y - v * r) - y) - v * S1);
    }
}

pub fn __sindf(x: f64) f32 {
    // |sin(x)/x - s(x)| < 2**-37.5 (~[-4.89e-12, 4.824e-12]).
    const S1 = -0x15555554cbac77.0p-55; // -0.166666666416265235595
    const S2 = 0x111110896efbb2.0p-59; //  0.0083333293858894631756
    const S3 = -0x1a00f9e2cae774.0p-65; // -0.000198393348360966317347
    const S4 = 0x16cd878c3b46a7.0p-71; //  0.0000027183114939898219064

    // Try to optimize for parallel evaluation as in __tandf.c.
    const z = x * x;
    const w = z * z;
    const r = S3 + z * S4;
    const s = z * x;
    return @floatCast(f32, (x + s * (S1 + z * S2)) + s * w * r);
}

// kernel tan function on ~[-pi/4, pi/4] (except on -0), pi/4 ~ 0.7854
// Input x is assumed to be bounded by ~pi/4 in magnitude.
// Input y is the tail of x.
// Input odd indicates whether tan (if odd = 0) or -1/tan (if odd = 1) is returned.
//
// Algorithm
//      1. Since tan(-x) = -tan(x), we need only to consider positive x.
//      2. Callers must return tan(-0) = -0 without calling here since our
//         odd polynomial is not evaluated in a way that preserves -0.
//         Callers may do the optimization tan(x) ~ x for tiny x.
//      3. tan(x) is approximated by a odd polynomial of degree 27 on
//         [0,0.67434]
//                               3             27
//              tan(x) ~ x + T1*x + ... + T13*x
//         where
//
//              |tan(x)         2     4            26   |     -59.2
//              |----- - (1+T1*x +T2*x +.... +T13*x    )| <= 2
//              |  x                                    |
//
//         Note: tan(x+y) = tan(x) + tan'(x)*y
//                        ~ tan(x) + (1+x*x)*y
//         Therefore, for better accuracy in computing tan(x+y), let
//                   3      2      2       2       2
//              r = x *(T2+x *(T3+x *(...+x *(T12+x *T13))))
//         then
//                                  3    2
//              tan(x+y) = x + (T1*x + (x *(r+y)+y))
//
//      4. For x in [0.67434,pi/4],  let y = pi/4 - x, then
//              tan(x) = tan(pi/4-y) = (1-tan(y))/(1+tan(y))
//                     = 1 - 2*(tan(y) - (tan(y)^2)/(1+tan(y)))
pub fn __tan(x_: f64, y_: f64, odd: bool) f64 {
    var x = x_;
    var y = y_;

    const T = [_]f64{
        3.33333333333334091986e-01, // 3FD55555, 55555563
        1.33333333333201242699e-01, // 3FC11111, 1110FE7A
        5.39682539762260521377e-02, // 3FABA1BA, 1BB341FE
        2.18694882948595424599e-02, // 3F9664F4, 8406D637
        8.86323982359930005737e-03, // 3F8226E3, E96E8493
        3.59207910759131235356e-03, // 3F6D6D22, C9560328
        1.45620945432529025516e-03, // 3F57DBC8, FEE08315
        5.88041240820264096874e-04, // 3F4344D8, F2F26501
        2.46463134818469906812e-04, // 3F3026F7, 1A8D1068
        7.81794442939557092300e-05, // 3F147E88, A03792A6
        7.14072491382608190305e-05, // 3F12B80F, 32F0A7E9
        -1.85586374855275456654e-05, // BEF375CB, DB605373
        2.59073051863633712884e-05, // 3EFB2A70, 74BF7AD4
    };
    const pio4 = 7.85398163397448278999e-01; // 3FE921FB, 54442D18
    const pio4lo = 3.06161699786838301793e-17; // 3C81A626, 33145C07

    var z: f64 = undefined;
    var r: f64 = undefined;
    var v: f64 = undefined;
    var w: f64 = undefined;
    var s: f64 = undefined;
    var a: f64 = undefined;
    var w0: f64 = undefined;
    var a0: f64 = undefined;
    var hx: u32 = undefined;
    var sign: bool = undefined;

    hx = @intCast(u32, @bitCast(u64, x) >> 32);
    const big = (hx & 0x7fffffff) >= 0x3FE59428; // |x| >= 0.6744
    if (big) {
        sign = hx >> 31 != 0;
        if (sign) {
            x = -x;
            y = -y;
        }
        x = (pio4 - x) + (pio4lo - y);
        y = 0.0;
    }
    z = x * x;
    w = z * z;

    // Break x^5*(T[1]+x^2*T[2]+...) into
    // x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
    // x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
    r = T[1] + w * (T[3] + w * (T[5] + w * (T[7] + w * (T[9] + w * T[11]))));
    v = z * (T[2] + w * (T[4] + w * (T[6] + w * (T[8] + w * (T[10] + w * T[12])))));
    s = z * x;
    r = y + z * (s * (r + v) + y) + s * T[0];
    w = x + r;
    if (big) {
        s = 1 - 2 * @intToFloat(f64, @boolToInt(odd));
        v = s - 2.0 * (x + (r - w * w / (w + s)));
        return if (sign) -v else v;
    }
    if (!odd) {
        return w;
    }
    // -1.0/(x+r) has up to 2ulp error, so compute it accurately
    w0 = w;
    w0 = @bitCast(f64, @bitCast(u64, w0) & 0xffffffff00000000);
    v = r - (w0 - x); // w0+v = r+x
    a = -1.0 / w;
    a0 = a;
    a0 = @bitCast(f64, @bitCast(u64, a0) & 0xffffffff00000000);
    return a0 + a * (1.0 + a0 * w0 + a0 * v);
}

pub fn __tandf(x: f64, odd: bool) f32 {
    // |tan(x)/x - t(x)| < 2**-25.5 (~[-2e-08, 2e-08]).
    const T = [_]f64{
        0x15554d3418c99f.0p-54, // 0.333331395030791399758
        0x1112fd38999f72.0p-55, // 0.133392002712976742718
        0x1b54c91d865afe.0p-57, // 0.0533812378445670393523
        0x191df3908c33ce.0p-58, // 0.0245283181166547278873
        0x185dadfcecf44e.0p-61, // 0.00297435743359967304927
        0x1362b9bf971bcd.0p-59, // 0.00946564784943673166728
    };

    const z = x * x;
    // Split up the polynomial into small independent terms to give
    // opportunities for parallel evaluation.  The chosen splitting is
    // micro-optimized for Athlons (XP, X64).  It costs 2 multiplications
    // relative to Horner's method on sequential machines.
    //
    // We add the small terms from lowest degree up for efficiency on
    // non-sequential machines (the lowest degree terms tend to be ready
    // earlier).  Apart from this, we don't care about order of
    // operations, and don't need to to care since we have precision to
    // spare.  However, the chosen splitting is good for accuracy too,
    // and would give results as accurate as Horner's method if the
    // small terms were added from highest degree down.
    const r = T[4] + z * T[5];
    const t = T[2] + z * T[3];
    const w = z * z;
    const s = z * x;
    const u = T[0] + z * T[1];
    const r0 = (x + s * u) + (s * w) * (t + w * r);
    return @floatCast(f32, if (odd) -1.0 / r0 else r0);
}