aboutsummaryrefslogtreecommitdiff
path: root/lib/std/fmt/parse_float/parse.zig
diff options
context:
space:
mode:
authorAndrew Kelley <andrew@ziglang.org>2022-05-03 14:56:05 -0400
committerGitHub <noreply@github.com>2022-05-03 14:56:05 -0400
commit6317da8da400e40a947f6333d1afc289ec349102 (patch)
tree30e450f3c2277605b85cf07074505e52cfd86ae2 /lib/std/fmt/parse_float/parse.zig
parentd48789467d95ddf0582c855d01f32bef48726e29 (diff)
parent2947a2faab2e287926b20d5d8984ad69bba5bd2c (diff)
downloadzig-6317da8da400e40a947f6333d1afc289ec349102.tar.gz
zig-6317da8da400e40a947f6333d1afc289ec349102.zip
Merge pull request #11566 from tiehuis/master
add new float-parser based on eisel-lemire algorithm
Diffstat (limited to 'lib/std/fmt/parse_float/parse.zig')
-rw-r--r--lib/std/fmt/parse_float/parse.zig291
1 files changed, 291 insertions, 0 deletions
diff --git a/lib/std/fmt/parse_float/parse.zig b/lib/std/fmt/parse_float/parse.zig
new file mode 100644
index 0000000000..3b757c7c41
--- /dev/null
+++ b/lib/std/fmt/parse_float/parse.zig
@@ -0,0 +1,291 @@
+const std = @import("std");
+const common = @import("common.zig");
+const FloatStream = @import("FloatStream.zig");
+const isEightDigits = common.isEightDigits;
+const Number = common.Number;
+
+/// Parse 8 digits, loaded as bytes in little-endian order.
+///
+/// This uses the trick where every digit is in [0x030, 0x39],
+/// and therefore can be parsed in 3 multiplications, much
+/// faster than the normal 8.
+///
+/// This is based off the algorithm described in "Fast numeric string to
+/// int", available here: <https://johnnylee-sde.github.io/Fast-numeric-string-to-int/>.
+fn parse8Digits(v_: u64) u64 {
+ var v = v_;
+ const mask = 0x0000_00ff_0000_00ff;
+ const mul1 = 0x000f_4240_0000_0064;
+ const mul2 = 0x0000_2710_0000_0001;
+ v -= 0x3030_3030_3030_3030;
+ v = (v * 10) + (v >> 8); // will not overflow, fits in 63 bits
+ const v1 = (v & mask) *% mul1;
+ const v2 = ((v >> 16) & mask) *% mul2;
+ return @as(u64, @truncate(u32, (v1 +% v2) >> 32));
+}
+
+/// Parse digits until a non-digit character is found.
+fn tryParseDigits(comptime T: type, stream: *FloatStream, x: *T, comptime base: u8) void {
+ // Try to parse 8 digits at a time, using an optimized algorithm.
+ // This only supports decimal digits.
+ if (base == 10) {
+ while (stream.hasLen(8)) {
+ const v = stream.readU64Unchecked();
+ if (!isEightDigits(v)) {
+ break;
+ }
+
+ x.* = x.* *% 1_0000_0000 +% parse8Digits(v);
+ stream.advance(8);
+ }
+ }
+
+ while (stream.scanDigit(base)) |digit| {
+ x.* *%= base;
+ x.* +%= digit;
+ }
+}
+
+fn min_n_digit_int(comptime T: type, digit_count: usize) T {
+ var n: T = 1;
+ var i: usize = 1;
+ while (i < digit_count) : (i += 1) n *= 10;
+ return n;
+}
+
+/// Parse up to N digits
+fn tryParseNDigits(comptime T: type, stream: *FloatStream, x: *T, comptime base: u8, comptime n: usize) void {
+ while (x.* < min_n_digit_int(T, n)) {
+ if (stream.scanDigit(base)) |digit| {
+ x.* *%= base;
+ x.* +%= digit;
+ } else {
+ break;
+ }
+ }
+}
+
+/// Parse the scientific notation component of a float.
+fn parseScientific(stream: *FloatStream) ?i64 {
+ var exponent: i64 = 0;
+ var negative = false;
+
+ if (stream.first()) |c| {
+ negative = c == '-';
+ if (c == '-' or c == '+') {
+ stream.advance(1);
+ }
+ }
+ if (stream.firstIsDigit(10)) {
+ while (stream.scanDigit(10)) |digit| {
+ // no overflows here, saturate well before overflow
+ if (exponent < 0x1000_0000) {
+ exponent = 10 * exponent + digit;
+ }
+ }
+
+ return if (negative) -exponent else exponent;
+ }
+
+ return null;
+}
+
+const ParseInfo = struct {
+ // 10 or 16
+ base: u8,
+ // 10^19 fits in u64, 16^16 fits in u64
+ max_mantissa_digits: usize,
+ // e.g. e or p (E and P also checked)
+ exp_char_lower: u8,
+};
+
+fn parsePartialNumberBase(comptime T: type, stream: *FloatStream, negative: bool, n: *usize, comptime info: ParseInfo) ?Number(T) {
+ const MantissaT = common.mantissaType(T);
+
+ // parse initial digits before dot
+ var mantissa: MantissaT = 0;
+ tryParseDigits(MantissaT, stream, &mantissa, info.base);
+ var int_end = stream.offsetTrue();
+ var n_digits = @intCast(isize, stream.offsetTrue());
+
+ // handle dot with the following digits
+ var exponent: i64 = 0;
+ if (stream.firstIs('.')) {
+ stream.advance(1);
+ const marker = stream.offsetTrue();
+ tryParseDigits(MantissaT, stream, &mantissa, info.base);
+ const n_after_dot = stream.offsetTrue() - marker;
+ exponent = -@intCast(i64, n_after_dot);
+ n_digits += @intCast(isize, n_after_dot);
+ }
+
+ // adjust required shift to offset mantissa for base-16 (2^4)
+ if (info.base == 16) {
+ exponent *= 4;
+ }
+
+ if (n_digits == 0) {
+ return null;
+ }
+
+ // handle scientific format
+ var exp_number: i64 = 0;
+ if (stream.firstIsLower(info.exp_char_lower)) {
+ stream.advance(1);
+ exp_number = parseScientific(stream) orelse return null;
+ exponent += exp_number;
+ }
+
+ const len = stream.offset; // length must be complete parsed length
+ n.* = len;
+
+ if (stream.underscore_count > 0 and !validUnderscores(stream.slice, info.base)) {
+ return null;
+ }
+
+ // common case with not many digits
+ if (n_digits <= info.max_mantissa_digits) {
+ return Number(T){
+ .exponent = exponent,
+ .mantissa = mantissa,
+ .negative = negative,
+ .many_digits = false,
+ .hex = info.base == 16,
+ };
+ }
+
+ n_digits -= info.max_mantissa_digits;
+ var many_digits = false;
+ stream.reset(); // re-parse from beginning
+ while (stream.firstIs3('0', '.', '_')) {
+ // '0' = '.' + 2
+ const next = stream.firstUnchecked();
+ if (next != '_') {
+ n_digits -= @intCast(isize, next -| ('0' - 1));
+ } else {
+ stream.underscore_count += 1;
+ }
+ stream.advance(1);
+ }
+ if (n_digits > 0) {
+ // at this point we have more than max_mantissa_digits significant digits, let's try again
+ many_digits = true;
+ mantissa = 0;
+ stream.reset();
+ tryParseNDigits(MantissaT, stream, &mantissa, info.base, info.max_mantissa_digits);
+
+ exponent = blk: {
+ if (mantissa >= min_n_digit_int(MantissaT, info.max_mantissa_digits)) {
+ // big int
+ break :blk @intCast(i64, int_end) - @intCast(i64, stream.offsetTrue());
+ } else {
+ // the next byte must be present and be '.'
+ // We know this is true because we had more than 19
+ // digits previously, so we overflowed a 64-bit integer,
+ // but parsing only the integral digits produced less
+ // than 19 digits. That means we must have a decimal
+ // point, and at least 1 fractional digit.
+ stream.advance(1);
+ var marker = stream.offsetTrue();
+ tryParseNDigits(MantissaT, stream, &mantissa, info.base, info.max_mantissa_digits);
+ break :blk @intCast(i64, marker) - @intCast(i64, stream.offsetTrue());
+ }
+ };
+ // add back the explicit part
+ exponent += exp_number;
+ }
+
+ return Number(T){
+ .exponent = exponent,
+ .mantissa = mantissa,
+ .negative = negative,
+ .many_digits = many_digits,
+ .hex = info.base == 16,
+ };
+}
+
+/// Parse a partial, non-special floating point number.
+///
+/// This creates a representation of the float as the
+/// significant digits and the decimal exponent.
+fn parsePartialNumber(comptime T: type, s: []const u8, negative: bool, n: *usize) ?Number(T) {
+ std.debug.assert(s.len != 0);
+ var stream = FloatStream.init(s);
+ const MantissaT = common.mantissaType(T);
+
+ if (stream.hasLen(2) and stream.atUnchecked(0) == '0' and std.ascii.toLower(stream.atUnchecked(1)) == 'x') {
+ stream.advance(2);
+ return parsePartialNumberBase(T, &stream, negative, n, .{
+ .base = 16,
+ .max_mantissa_digits = if (MantissaT == u64) 16 else 32,
+ .exp_char_lower = 'p',
+ });
+ } else {
+ return parsePartialNumberBase(T, &stream, negative, n, .{
+ .base = 10,
+ .max_mantissa_digits = if (MantissaT == u64) 19 else 38,
+ .exp_char_lower = 'e',
+ });
+ }
+}
+
+pub fn parseNumber(comptime T: type, s: []const u8, negative: bool) ?Number(T) {
+ var consumed: usize = 0;
+ if (parsePartialNumber(T, s, negative, &consumed)) |number| {
+ // must consume entire float (no trailing data)
+ if (s.len == consumed) {
+ return number;
+ }
+ }
+ return null;
+}
+
+fn parsePartialInfOrNan(comptime T: type, s: []const u8, negative: bool, n: *usize) ?T {
+ // inf/infinity; infxxx should only consume inf.
+ if (std.ascii.startsWithIgnoreCase(s, "inf")) {
+ n.* = 3;
+ if (std.ascii.startsWithIgnoreCase(s[3..], "inity")) {
+ n.* = 8;
+ }
+
+ return if (!negative) std.math.inf(T) else -std.math.inf(T);
+ }
+
+ if (std.ascii.startsWithIgnoreCase(s, "nan")) {
+ n.* = 3;
+ return std.math.nan(T);
+ }
+
+ return null;
+}
+
+pub fn parseInfOrNan(comptime T: type, s: []const u8, negative: bool) ?T {
+ var consumed: usize = 0;
+ if (parsePartialInfOrNan(T, s, negative, &consumed)) |special| {
+ if (s.len == consumed) {
+ return special;
+ }
+ }
+ return null;
+}
+
+pub fn validUnderscores(s: []const u8, comptime base: u8) bool {
+ var i: usize = 0;
+ while (i < s.len) : (i += 1) {
+ if (s[i] == '_') {
+ // underscore at start of end
+ if (i == 0 or i + 1 == s.len) {
+ return false;
+ }
+ // consecutive underscores
+ if (!common.isDigit(s[i - 1], base) or !common.isDigit(s[i + 1], base)) {
+ return false;
+ }
+
+ // next is guaranteed a digit, skip an extra
+ i += 1;
+ }
+ }
+
+ return true;
+}