Merge pull request #11566 from tiehuis/master

add new float-parser based on eisel-lemire algorithm
author: Andrew Kelley <andrew@ziglang.org> 2022-05-03 14:56:05 -0400
committer: GitHub <noreply@github.com> 2022-05-03 14:56:05 -0400
commit: 6317da8da400e40a947f6333d1afc289ec349102 (patch)
tree: 30e450f3c2277605b85cf07074505e52cfd86ae2 /lib/std/fmt/parse_float/parse.zig
parent: d48789467d95ddf0582c855d01f32bef48726e29 (diff)
parent: 2947a2faab2e287926b20d5d8984ad69bba5bd2c (diff)
download: zig-6317da8da400e40a947f6333d1afc289ec349102.tar.gz
zig-6317da8da400e40a947f6333d1afc289ec349102.zip
1 files changed, 291 insertions, 0 deletions
diff --git a/lib/std/fmt/parse_float/parse.zig b/lib/std/fmt/parse_float/parse.zig
new file mode 100644
index 0000000000..3b757c7c41
--- /dev/null
+++ b/lib/std/fmt/parse_float/parse.zig
@@ -0,0 +1,291 @@
+const std = @import("std");
+const common = @import("common.zig");
+const FloatStream = @import("FloatStream.zig");
+const isEightDigits = common.isEightDigits;
+const Number = common.Number;
+
+/// Parse 8 digits, loaded as bytes in little-endian order.
+///
+/// This uses the trick where every digit is in [0x030, 0x39],
+/// and therefore can be parsed in 3 multiplications, much
+/// faster than the normal 8.
+///
+/// This is based off the algorithm described in "Fast numeric string to
+/// int", available here: <https://johnnylee-sde.github.io/Fast-numeric-string-to-int/>.
+fn parse8Digits(v_: u64) u64 {
+    var v = v_;
+    const mask = 0x0000_00ff_0000_00ff;
+    const mul1 = 0x000f_4240_0000_0064;
+    const mul2 = 0x0000_2710_0000_0001;
+    v -= 0x3030_3030_3030_3030;
+    v = (v * 10) + (v >> 8); // will not overflow, fits in 63 bits
+    const v1 = (v & mask) *% mul1;
+    const v2 = ((v >> 16) & mask) *% mul2;
+    return @as(u64, @truncate(u32, (v1 +% v2) >> 32));
+}
+
+/// Parse digits until a non-digit character is found.
+fn tryParseDigits(comptime T: type, stream: *FloatStream, x: *T, comptime base: u8) void {
+    // Try to parse 8 digits at a time, using an optimized algorithm.
+    // This only supports decimal digits.
+    if (base == 10) {
+        while (stream.hasLen(8)) {
+            const v = stream.readU64Unchecked();
+            if (!isEightDigits(v)) {
+                break;
+            }
+
+            x.* = x.* *% 1_0000_0000 +% parse8Digits(v);
+            stream.advance(8);
+        }
+    }
+
+    while (stream.scanDigit(base)) |digit| {
+        x.* *%= base;
+        x.* +%= digit;
+    }
+}
+
+fn min_n_digit_int(comptime T: type, digit_count: usize) T {
+    var n: T = 1;
+    var i: usize = 1;
+    while (i < digit_count) : (i += 1) n *= 10;
+    return n;
+}
+
+/// Parse up to N digits
+fn tryParseNDigits(comptime T: type, stream: *FloatStream, x: *T, comptime base: u8, comptime n: usize) void {
+    while (x.* < min_n_digit_int(T, n)) {
+        if (stream.scanDigit(base)) |digit| {
+            x.* *%= base;
+            x.* +%= digit;
+        } else {
+            break;
+        }
+    }
+}
+
+/// Parse the scientific notation component of a float.
+fn parseScientific(stream: *FloatStream) ?i64 {
+    var exponent: i64 = 0;
+    var negative = false;
+
+    if (stream.first()) |c| {
+        negative = c == '-';
+        if (c == '-' or c == '+') {
+            stream.advance(1);
+        }
+    }
+    if (stream.firstIsDigit(10)) {
+        while (stream.scanDigit(10)) |digit| {
+            // no overflows here, saturate well before overflow
+            if (exponent < 0x1000_0000) {
+                exponent = 10 * exponent + digit;
+            }
+        }
+
+        return if (negative) -exponent else exponent;
+    }
+
+    return null;
+}
+
+const ParseInfo = struct {
+    // 10 or 16
+    base: u8,
+    // 10^19 fits in u64, 16^16 fits in u64
+    max_mantissa_digits: usize,
+    // e.g. e or p (E and P also checked)
+    exp_char_lower: u8,
+};
+
+fn parsePartialNumberBase(comptime T: type, stream: *FloatStream, negative: bool, n: *usize, comptime info: ParseInfo) ?Number(T) {
+    const MantissaT = common.mantissaType(T);
+
+    // parse initial digits before dot
+    var mantissa: MantissaT = 0;
+    tryParseDigits(MantissaT, stream, &mantissa, info.base);
+    var int_end = stream.offsetTrue();
+    var n_digits = @intCast(isize, stream.offsetTrue());
+
+    // handle dot with the following digits
+    var exponent: i64 = 0;
+    if (stream.firstIs('.')) {
+        stream.advance(1);
+        const marker = stream.offsetTrue();
+        tryParseDigits(MantissaT, stream, &mantissa, info.base);
+        const n_after_dot = stream.offsetTrue() - marker;
+        exponent = -@intCast(i64, n_after_dot);
+        n_digits += @intCast(isize, n_after_dot);
+    }
+
+    // adjust required shift to offset mantissa for base-16 (2^4)
+    if (info.base == 16) {
+        exponent *= 4;
+    }
+
+    if (n_digits == 0) {
+        return null;
+    }
+
+    // handle scientific format
+    var exp_number: i64 = 0;
+    if (stream.firstIsLower(info.exp_char_lower)) {
+        stream.advance(1);
+        exp_number = parseScientific(stream) orelse return null;
+        exponent += exp_number;
+    }
+
+    const len = stream.offset; // length must be complete parsed length
+    n.* = len;
+
+    if (stream.underscore_count > 0 and !validUnderscores(stream.slice, info.base)) {
+        return null;
+    }
+
+    // common case with not many digits
+    if (n_digits <= info.max_mantissa_digits) {
+        return Number(T){
+            .exponent = exponent,
+            .mantissa = mantissa,
+            .negative = negative,
+            .many_digits = false,
+            .hex = info.base == 16,
+        };
+    }
+
+    n_digits -= info.max_mantissa_digits;
+    var many_digits = false;
+    stream.reset(); // re-parse from beginning
+    while (stream.firstIs3('0', '.', '_')) {
+        // '0' = '.' + 2
+        const next = stream.firstUnchecked();
+        if (next != '_') {
+            n_digits -= @intCast(isize, next -| ('0' - 1));
+        } else {
+            stream.underscore_count += 1;
+        }
+        stream.advance(1);
+    }
+    if (n_digits > 0) {
+        // at this point we have more than max_mantissa_digits significant digits, let's try again
+        many_digits = true;
+        mantissa = 0;
+        stream.reset();
+        tryParseNDigits(MantissaT, stream, &mantissa, info.base, info.max_mantissa_digits);
+
+        exponent = blk: {
+            if (mantissa >= min_n_digit_int(MantissaT, info.max_mantissa_digits)) {
+                // big int
+                break :blk @intCast(i64, int_end) - @intCast(i64, stream.offsetTrue());
+            } else {
+                // the next byte must be present and be '.'
+                // We know this is true because we had more than 19
+                // digits previously, so we overflowed a 64-bit integer,
+                // but parsing only the integral digits produced less
+                // than 19 digits. That means we must have a decimal
+                // point, and at least 1 fractional digit.
+                stream.advance(1);
+                var marker = stream.offsetTrue();
+                tryParseNDigits(MantissaT, stream, &mantissa, info.base, info.max_mantissa_digits);
+                break :blk @intCast(i64, marker) - @intCast(i64, stream.offsetTrue());
+            }
+        };
+        // add back the explicit part
+        exponent += exp_number;
+    }
+
+    return Number(T){
+        .exponent = exponent,
+        .mantissa = mantissa,
+        .negative = negative,
+        .many_digits = many_digits,
+        .hex = info.base == 16,
+    };
+}
+
+/// Parse a partial, non-special floating point number.
+///
+/// This creates a representation of the float as the
+/// significant digits and the decimal exponent.
+fn parsePartialNumber(comptime T: type, s: []const u8, negative: bool, n: *usize) ?Number(T) {
+    std.debug.assert(s.len != 0);
+    var stream = FloatStream.init(s);
+    const MantissaT = common.mantissaType(T);
+
+    if (stream.hasLen(2) and stream.atUnchecked(0) == '0' and std.ascii.toLower(stream.atUnchecked(1)) == 'x') {
+        stream.advance(2);
+        return parsePartialNumberBase(T, &stream, negative, n, .{
+            .base = 16,
+            .max_mantissa_digits = if (MantissaT == u64) 16 else 32,
+            .exp_char_lower = 'p',
+        });
+    } else {
+        return parsePartialNumberBase(T, &stream, negative, n, .{
+            .base = 10,
+            .max_mantissa_digits = if (MantissaT == u64) 19 else 38,
+            .exp_char_lower = 'e',
+        });
+    }
+}
+
+pub fn parseNumber(comptime T: type, s: []const u8, negative: bool) ?Number(T) {
+    var consumed: usize = 0;
+    if (parsePartialNumber(T, s, negative, &consumed)) |number| {
+        // must consume entire float (no trailing data)
+        if (s.len == consumed) {
+            return number;
+        }
+    }
+    return null;
+}
+
+fn parsePartialInfOrNan(comptime T: type, s: []const u8, negative: bool, n: *usize) ?T {
+    // inf/infinity; infxxx should only consume inf.
+    if (std.ascii.startsWithIgnoreCase(s, "inf")) {
+        n.* = 3;
+        if (std.ascii.startsWithIgnoreCase(s[3..], "inity")) {
+            n.* = 8;
+        }
+
+        return if (!negative) std.math.inf(T) else -std.math.inf(T);
+    }
+
+    if (std.ascii.startsWithIgnoreCase(s, "nan")) {
+        n.* = 3;
+        return std.math.nan(T);
+    }
+
+    return null;
+}
+
+pub fn parseInfOrNan(comptime T: type, s: []const u8, negative: bool) ?T {
+    var consumed: usize = 0;
+    if (parsePartialInfOrNan(T, s, negative, &consumed)) |special| {
+        if (s.len == consumed) {
+            return special;
+        }
+    }
+    return null;
+}
+
+pub fn validUnderscores(s: []const u8, comptime base: u8) bool {
+    var i: usize = 0;
+    while (i < s.len) : (i += 1) {
+        if (s[i] == '_') {
+            // underscore at start of end
+            if (i == 0 or i + 1 == s.len) {
+                return false;
+            }
+            // consecutive underscores
+            if (!common.isDigit(s[i - 1], base) or !common.isDigit(s[i + 1], base)) {
+                return false;
+            }
+
+            // next is guaranteed a digit, skip an extra
+            i += 1;
+        }
+    }
+
+    return true;
+}
author	Andrew Kelley <andrew@ziglang.org>	2022-05-03 14:56:05 -0400
committer	GitHub <noreply@github.com>	2022-05-03 14:56:05 -0400
commit	6317da8da400e40a947f6333d1afc289ec349102 (patch)
tree	30e450f3c2277605b85cf07074505e52cfd86ae2 /lib/std/fmt/parse_float/parse.zig
parent	d48789467d95ddf0582c855d01f32bef48726e29 (diff)
parent	2947a2faab2e287926b20d5d8984ad69bba5bd2c (diff)
download	zig-6317da8da400e40a947f6333d1afc289ec349102.tar.gz zig-6317da8da400e40a947f6333d1afc289ec349102.zip