diff options
| author | Andrew Kelley <andrew@ziglang.org> | 2020-10-17 21:38:50 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-10-17 21:38:50 -0400 |
| commit | 644400054c5769a397ded4f569e2ac0600d65305 (patch) | |
| tree | b4f340abf3922fd0ccbf0110ee5e3f2c8af77b70 | |
| parent | e55244c4c64830d2830dea81ab6011b61c188d25 (diff) | |
| parent | ff58f09b68de08a5fe33177f1874c677c762c1c0 (diff) | |
| download | zig-644400054c5769a397ded4f569e2ac0600d65305.tar.gz zig-644400054c5769a397ded4f569e2ac0600d65305.zip | |
Merge pull request #6259 from dec05eba/master
Use boyer-moore-horspool algorithm for indexOfPos and lastIndexOf unless the haystack or needle is very small
| -rw-r--r-- | lib/std/mem.zig | 93 |
1 files changed, 85 insertions, 8 deletions
diff --git a/lib/std/mem.zig b/lib/std/mem.zig index 6b50e51393..a3b5dd9228 100644 --- a/lib/std/mem.zig +++ b/lib/std/mem.zig @@ -853,10 +853,7 @@ pub fn indexOf(comptime T: type, haystack: []const T, needle: []const T) ?usize /// Find the index in a slice of a sub-slice, searching from the end backwards. /// To start looking at a different index, slice the haystack first. -/// TODO is there even a better algorithm for this? -pub fn lastIndexOf(comptime T: type, haystack: []const T, needle: []const T) ?usize { - if (needle.len > haystack.len) return null; - +fn lastIndexOfLinear(comptime T: type, haystack: []const T, needle: []const T) ?usize { var i: usize = haystack.len - needle.len; while (true) : (i -= 1) { if (mem.eql(T, haystack[i .. i + needle.len], needle)) return i; @@ -864,10 +861,7 @@ pub fn lastIndexOf(comptime T: type, haystack: []const T, needle: []const T) ?us } } -// TODO boyer-moore algorithm -pub fn indexOfPos(comptime T: type, haystack: []const T, start_index: usize, needle: []const T) ?usize { - if (needle.len > haystack.len) return null; - +fn indexOfPosLinear(comptime T: type, haystack: []const T, start_index: usize, needle: []const T) ?usize { var i: usize = start_index; const end = haystack.len - needle.len; while (i <= end) : (i += 1) { @@ -876,7 +870,90 @@ pub fn indexOfPos(comptime T: type, haystack: []const T, start_index: usize, nee return null; } +fn boyerMooreHorspoolPreprocessReverse(pattern: []const u8, table: *[256]usize) void { + for (table) |*c| { + c.* = pattern.len; + } + + var i: usize = pattern.len - 1; + // The first item is intentionally ignored and the skip size will be pattern.len. + // This is the standard way boyer-moore-horspool is implemented. + while (i > 0) : (i -= 1) { + table[pattern[i]] = i; + } +} + +fn boyerMooreHorspoolPreprocess(pattern: []const u8, table: *[256]usize) void { + for (table) |*c| { + c.* = pattern.len; + } + + var i: usize = 0; + // The last item is intentionally ignored and the skip size will be pattern.len. + // This is the standard way boyer-moore-horspool is implemented. + while (i < pattern.len - 1) : (i += 1) { + table[pattern[i]] = pattern.len - 1 - i; + } +} +/// Find the index in a slice of a sub-slice, searching from the end backwards. +/// To start looking at a different index, slice the haystack first. +// Reverse boyer-moore-horspool algorithm +pub fn lastIndexOf(comptime T: type, haystack: []const T, needle: []const T) ?usize { + if (needle.len > haystack.len) return null; + if (needle.len == 0) return haystack.len; + + if (!meta.trait.hasUniqueRepresentation(T) or haystack.len < 32 or needle.len <= 2) + return lastIndexOfLinear(T, haystack, needle); + + const haystack_bytes = sliceAsBytes(haystack); + const needle_bytes = sliceAsBytes(needle); + + var skip_table: [256]usize = undefined; + boyerMooreHorspoolPreprocessReverse(needle_bytes, skip_table[0..]); + + var i: usize = haystack_bytes.len - needle_bytes.len; + while (true) { + if (mem.eql(u8, haystack_bytes[i .. i + needle_bytes.len], needle_bytes)) return i; + const skip = skip_table[haystack_bytes[i]]; + if (skip > i) break; + i -= skip; + } + + return null; +} + +// Boyer-moore-horspool algorithm +pub fn indexOfPos(comptime T: type, haystack: []const T, start_index: usize, needle: []const T) ?usize { + if (needle.len > haystack.len) return null; + if (needle.len == 0) return 0; + + if (!meta.trait.hasUniqueRepresentation(T) or haystack.len < 52 or needle.len <= 4) + return indexOfPosLinear(T, haystack, start_index, needle); + + const haystack_bytes = sliceAsBytes(haystack); + const needle_bytes = sliceAsBytes(needle); + + var skip_table: [256]usize = undefined; + boyerMooreHorspoolPreprocess(needle_bytes, skip_table[0..]); + + var i: usize = start_index * @sizeOf(T); + while (i <= haystack_bytes.len - needle_bytes.len) { + if (mem.eql(u8, haystack_bytes[i .. i + needle_bytes.len], needle_bytes)) return i; + i += skip_table[haystack_bytes[i + needle_bytes.len - 1]]; + } + + return null; +} + test "mem.indexOf" { + testing.expect(indexOf(u8, "one two three four five six seven eight nine ten", "three four").? == 8); + testing.expect(lastIndexOf(u8, "one two three four five six seven eight nine ten", "three four").? == 8); + testing.expect(indexOf(u8, "one two three four five six seven eight nine ten", "two two") == null); + testing.expect(lastIndexOf(u8, "one two three four five six seven eight nine ten", "two two") == null); + + testing.expect(indexOf(u8, "one two three four five six seven eight nine ten", "").? == 0); + testing.expect(lastIndexOf(u8, "one two three four five six seven eight nine ten", "").? == 48); + testing.expect(indexOf(u8, "one two three four", "four").? == 14); testing.expect(lastIndexOf(u8, "one two three two four", "two").? == 14); testing.expect(indexOf(u8, "one two three four", "gour") == null); |
