diff options
| author | Andrew Kelley <andrew@ziglang.org> | 2022-05-03 14:56:05 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-05-03 14:56:05 -0400 |
| commit | 6317da8da400e40a947f6333d1afc289ec349102 (patch) | |
| tree | 30e450f3c2277605b85cf07074505e52cfd86ae2 /lib/std/fmt/parse_float/parse.zig | |
| parent | d48789467d95ddf0582c855d01f32bef48726e29 (diff) | |
| parent | 2947a2faab2e287926b20d5d8984ad69bba5bd2c (diff) | |
| download | zig-6317da8da400e40a947f6333d1afc289ec349102.tar.gz zig-6317da8da400e40a947f6333d1afc289ec349102.zip | |
Merge pull request #11566 from tiehuis/master
add new float-parser based on eisel-lemire algorithm
Diffstat (limited to 'lib/std/fmt/parse_float/parse.zig')
| -rw-r--r-- | lib/std/fmt/parse_float/parse.zig | 291 |
1 files changed, 291 insertions, 0 deletions
diff --git a/lib/std/fmt/parse_float/parse.zig b/lib/std/fmt/parse_float/parse.zig new file mode 100644 index 0000000000..3b757c7c41 --- /dev/null +++ b/lib/std/fmt/parse_float/parse.zig @@ -0,0 +1,291 @@ +const std = @import("std"); +const common = @import("common.zig"); +const FloatStream = @import("FloatStream.zig"); +const isEightDigits = common.isEightDigits; +const Number = common.Number; + +/// Parse 8 digits, loaded as bytes in little-endian order. +/// +/// This uses the trick where every digit is in [0x030, 0x39], +/// and therefore can be parsed in 3 multiplications, much +/// faster than the normal 8. +/// +/// This is based off the algorithm described in "Fast numeric string to +/// int", available here: <https://johnnylee-sde.github.io/Fast-numeric-string-to-int/>. +fn parse8Digits(v_: u64) u64 { + var v = v_; + const mask = 0x0000_00ff_0000_00ff; + const mul1 = 0x000f_4240_0000_0064; + const mul2 = 0x0000_2710_0000_0001; + v -= 0x3030_3030_3030_3030; + v = (v * 10) + (v >> 8); // will not overflow, fits in 63 bits + const v1 = (v & mask) *% mul1; + const v2 = ((v >> 16) & mask) *% mul2; + return @as(u64, @truncate(u32, (v1 +% v2) >> 32)); +} + +/// Parse digits until a non-digit character is found. +fn tryParseDigits(comptime T: type, stream: *FloatStream, x: *T, comptime base: u8) void { + // Try to parse 8 digits at a time, using an optimized algorithm. + // This only supports decimal digits. + if (base == 10) { + while (stream.hasLen(8)) { + const v = stream.readU64Unchecked(); + if (!isEightDigits(v)) { + break; + } + + x.* = x.* *% 1_0000_0000 +% parse8Digits(v); + stream.advance(8); + } + } + + while (stream.scanDigit(base)) |digit| { + x.* *%= base; + x.* +%= digit; + } +} + +fn min_n_digit_int(comptime T: type, digit_count: usize) T { + var n: T = 1; + var i: usize = 1; + while (i < digit_count) : (i += 1) n *= 10; + return n; +} + +/// Parse up to N digits +fn tryParseNDigits(comptime T: type, stream: *FloatStream, x: *T, comptime base: u8, comptime n: usize) void { + while (x.* < min_n_digit_int(T, n)) { + if (stream.scanDigit(base)) |digit| { + x.* *%= base; + x.* +%= digit; + } else { + break; + } + } +} + +/// Parse the scientific notation component of a float. +fn parseScientific(stream: *FloatStream) ?i64 { + var exponent: i64 = 0; + var negative = false; + + if (stream.first()) |c| { + negative = c == '-'; + if (c == '-' or c == '+') { + stream.advance(1); + } + } + if (stream.firstIsDigit(10)) { + while (stream.scanDigit(10)) |digit| { + // no overflows here, saturate well before overflow + if (exponent < 0x1000_0000) { + exponent = 10 * exponent + digit; + } + } + + return if (negative) -exponent else exponent; + } + + return null; +} + +const ParseInfo = struct { + // 10 or 16 + base: u8, + // 10^19 fits in u64, 16^16 fits in u64 + max_mantissa_digits: usize, + // e.g. e or p (E and P also checked) + exp_char_lower: u8, +}; + +fn parsePartialNumberBase(comptime T: type, stream: *FloatStream, negative: bool, n: *usize, comptime info: ParseInfo) ?Number(T) { + const MantissaT = common.mantissaType(T); + + // parse initial digits before dot + var mantissa: MantissaT = 0; + tryParseDigits(MantissaT, stream, &mantissa, info.base); + var int_end = stream.offsetTrue(); + var n_digits = @intCast(isize, stream.offsetTrue()); + + // handle dot with the following digits + var exponent: i64 = 0; + if (stream.firstIs('.')) { + stream.advance(1); + const marker = stream.offsetTrue(); + tryParseDigits(MantissaT, stream, &mantissa, info.base); + const n_after_dot = stream.offsetTrue() - marker; + exponent = -@intCast(i64, n_after_dot); + n_digits += @intCast(isize, n_after_dot); + } + + // adjust required shift to offset mantissa for base-16 (2^4) + if (info.base == 16) { + exponent *= 4; + } + + if (n_digits == 0) { + return null; + } + + // handle scientific format + var exp_number: i64 = 0; + if (stream.firstIsLower(info.exp_char_lower)) { + stream.advance(1); + exp_number = parseScientific(stream) orelse return null; + exponent += exp_number; + } + + const len = stream.offset; // length must be complete parsed length + n.* = len; + + if (stream.underscore_count > 0 and !validUnderscores(stream.slice, info.base)) { + return null; + } + + // common case with not many digits + if (n_digits <= info.max_mantissa_digits) { + return Number(T){ + .exponent = exponent, + .mantissa = mantissa, + .negative = negative, + .many_digits = false, + .hex = info.base == 16, + }; + } + + n_digits -= info.max_mantissa_digits; + var many_digits = false; + stream.reset(); // re-parse from beginning + while (stream.firstIs3('0', '.', '_')) { + // '0' = '.' + 2 + const next = stream.firstUnchecked(); + if (next != '_') { + n_digits -= @intCast(isize, next -| ('0' - 1)); + } else { + stream.underscore_count += 1; + } + stream.advance(1); + } + if (n_digits > 0) { + // at this point we have more than max_mantissa_digits significant digits, let's try again + many_digits = true; + mantissa = 0; + stream.reset(); + tryParseNDigits(MantissaT, stream, &mantissa, info.base, info.max_mantissa_digits); + + exponent = blk: { + if (mantissa >= min_n_digit_int(MantissaT, info.max_mantissa_digits)) { + // big int + break :blk @intCast(i64, int_end) - @intCast(i64, stream.offsetTrue()); + } else { + // the next byte must be present and be '.' + // We know this is true because we had more than 19 + // digits previously, so we overflowed a 64-bit integer, + // but parsing only the integral digits produced less + // than 19 digits. That means we must have a decimal + // point, and at least 1 fractional digit. + stream.advance(1); + var marker = stream.offsetTrue(); + tryParseNDigits(MantissaT, stream, &mantissa, info.base, info.max_mantissa_digits); + break :blk @intCast(i64, marker) - @intCast(i64, stream.offsetTrue()); + } + }; + // add back the explicit part + exponent += exp_number; + } + + return Number(T){ + .exponent = exponent, + .mantissa = mantissa, + .negative = negative, + .many_digits = many_digits, + .hex = info.base == 16, + }; +} + +/// Parse a partial, non-special floating point number. +/// +/// This creates a representation of the float as the +/// significant digits and the decimal exponent. +fn parsePartialNumber(comptime T: type, s: []const u8, negative: bool, n: *usize) ?Number(T) { + std.debug.assert(s.len != 0); + var stream = FloatStream.init(s); + const MantissaT = common.mantissaType(T); + + if (stream.hasLen(2) and stream.atUnchecked(0) == '0' and std.ascii.toLower(stream.atUnchecked(1)) == 'x') { + stream.advance(2); + return parsePartialNumberBase(T, &stream, negative, n, .{ + .base = 16, + .max_mantissa_digits = if (MantissaT == u64) 16 else 32, + .exp_char_lower = 'p', + }); + } else { + return parsePartialNumberBase(T, &stream, negative, n, .{ + .base = 10, + .max_mantissa_digits = if (MantissaT == u64) 19 else 38, + .exp_char_lower = 'e', + }); + } +} + +pub fn parseNumber(comptime T: type, s: []const u8, negative: bool) ?Number(T) { + var consumed: usize = 0; + if (parsePartialNumber(T, s, negative, &consumed)) |number| { + // must consume entire float (no trailing data) + if (s.len == consumed) { + return number; + } + } + return null; +} + +fn parsePartialInfOrNan(comptime T: type, s: []const u8, negative: bool, n: *usize) ?T { + // inf/infinity; infxxx should only consume inf. + if (std.ascii.startsWithIgnoreCase(s, "inf")) { + n.* = 3; + if (std.ascii.startsWithIgnoreCase(s[3..], "inity")) { + n.* = 8; + } + + return if (!negative) std.math.inf(T) else -std.math.inf(T); + } + + if (std.ascii.startsWithIgnoreCase(s, "nan")) { + n.* = 3; + return std.math.nan(T); + } + + return null; +} + +pub fn parseInfOrNan(comptime T: type, s: []const u8, negative: bool) ?T { + var consumed: usize = 0; + if (parsePartialInfOrNan(T, s, negative, &consumed)) |special| { + if (s.len == consumed) { + return special; + } + } + return null; +} + +pub fn validUnderscores(s: []const u8, comptime base: u8) bool { + var i: usize = 0; + while (i < s.len) : (i += 1) { + if (s[i] == '_') { + // underscore at start of end + if (i == 0 or i + 1 == s.len) { + return false; + } + // consecutive underscores + if (!common.isDigit(s[i - 1], base) or !common.isDigit(s[i + 1], base)) { + return false; + } + + // next is guaranteed a digit, skip an extra + i += 1; + } + } + + return true; +} |
