diff options
| author | Andrew Kelley <andrew@ziglang.org> | 2022-05-24 15:32:12 -0400 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-05-24 15:32:12 -0400 |
| commit | 60f0acd9b9c2bced47ba1c214460f34b73738f95 (patch) | |
| tree | 7cc1ba371eab54fb84d5f56a67c3dab0d035bf7a /src | |
| parent | 8171972cbb477672ee1a99d953df4aaecb744a0c (diff) | |
| parent | 953e2778d4402cf85b99061ef9bdb89aa3e5f2db (diff) | |
| download | zig-60f0acd9b9c2bced47ba1c214460f34b73738f95.tar.gz zig-60f0acd9b9c2bced47ba1c214460f34b73738f95.zip | |
Merge pull request #11706 from ziglang/string-literal-interning
stage2: string literal interning
Diffstat (limited to 'src')
| -rw-r--r-- | src/Module.zig | 67 | ||||
| -rw-r--r-- | src/Sema.zig | 92 | ||||
| -rw-r--r-- | src/TypedValue.zig | 5 | ||||
| -rw-r--r-- | src/codegen.zig | 16 | ||||
| -rw-r--r-- | src/codegen/llvm.zig | 32 | ||||
| -rw-r--r-- | src/value.zig | 41 |
6 files changed, 231 insertions, 22 deletions
diff --git a/src/Module.zig b/src/Module.zig index 11549ccda6..24495d8591 100644 --- a/src/Module.zig +++ b/src/Module.zig @@ -70,6 +70,17 @@ import_table: std.StringArrayHashMapUnmanaged(*File) = .{}, /// Keys are fully resolved file paths. This table owns the keys and values. embed_table: std.StringHashMapUnmanaged(*EmbedFile) = .{}, +/// This is a temporary addition to stage2 in order to match stage1 behavior, +/// however the end-game once the lang spec is settled will be to use a global +/// InternPool for comptime memoized objects, making this behavior consistent across all types, +/// not only string literals. Or, we might decide to not guarantee string literals +/// to have equal comptime pointers, in which case this field can be deleted (perhaps +/// the commit that introduced it can simply be reverted). +/// This table uses an optional index so that when a Decl is destroyed, the string literal +/// is still reclaimable by a future Decl. +string_literal_table: std.HashMapUnmanaged(StringLiteralContext.Key, Decl.OptionalIndex, StringLiteralContext, std.hash_map.default_max_load_percentage) = .{}, +string_literal_bytes: std.ArrayListUnmanaged(u8) = .{}, + /// The set of all the generic function instantiations. This is used so that when a generic /// function is called twice with the same comptime parameter arguments, both calls dispatch /// to the same function. @@ -157,6 +168,39 @@ decls_free_list: std.ArrayListUnmanaged(Decl.Index) = .{}, global_assembly: std.AutoHashMapUnmanaged(Decl.Index, []u8) = .{}, +pub const StringLiteralContext = struct { + bytes: *std.ArrayListUnmanaged(u8), + + pub const Key = struct { + index: u32, + len: u32, + }; + + pub fn eql(self: @This(), a: Key, b: Key) bool { + _ = self; + return a.index == b.index and a.len == b.len; + } + + pub fn hash(self: @This(), x: Key) u64 { + const x_slice = self.bytes.items[x.index..][0..x.len]; + return std.hash_map.hashString(x_slice); + } +}; + +pub const StringLiteralAdapter = struct { + bytes: *std.ArrayListUnmanaged(u8), + + pub fn eql(self: @This(), a_slice: []const u8, b: StringLiteralContext.Key) bool { + const b_slice = self.bytes.items[b.index..][0..b.len]; + return mem.eql(u8, a_slice, b_slice); + } + + pub fn hash(self: @This(), adapted_key: []const u8) u64 { + _ = self; + return std.hash_map.hashString(adapted_key); + } +}; + const MonomorphedFuncsSet = std.HashMapUnmanaged( *Fn, void, @@ -507,7 +551,8 @@ pub const Decl = struct { decl.name = undefined; } - pub fn clearValues(decl: *Decl, gpa: Allocator) void { + pub fn clearValues(decl: *Decl, mod: *Module) void { + const gpa = mod.gpa; if (decl.getExternFn()) |extern_fn| { extern_fn.deinit(gpa); gpa.destroy(extern_fn); @@ -521,6 +566,13 @@ pub const Decl = struct { gpa.destroy(variable); } if (decl.value_arena) |arena_state| { + if (decl.owns_tv) { + if (decl.val.castTag(.str_lit)) |str_lit| { + mod.string_literal_table.getPtrContext(str_lit.data, .{ + .bytes = &mod.string_literal_bytes, + }).?.* = .none; + } + } arena_state.promote(gpa).deinit(); decl.value_arena = null; decl.has_tv = false; @@ -2839,6 +2891,9 @@ pub fn deinit(mod: *Module) void { mod.decls_free_list.deinit(gpa); mod.allocated_decls.deinit(gpa); mod.global_assembly.deinit(gpa); + + mod.string_literal_table.deinit(gpa); + mod.string_literal_bytes.deinit(gpa); } pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void { @@ -2857,7 +2912,7 @@ pub fn destroyDecl(mod: *Module, decl_index: Decl.Index) void { if (decl.getInnerNamespace()) |namespace| { namespace.destroyDecls(mod); } - decl.clearValues(gpa); + decl.clearValues(mod); } decl.dependants.deinit(gpa); decl.dependencies.deinit(gpa); @@ -4034,7 +4089,7 @@ fn semaDecl(mod: *Module, decl_index: Decl.Index) !bool { if (decl.getFunction()) |prev_func| { prev_is_inline = prev_func.state == .inline_only; } - decl.clearValues(gpa); + decl.clearValues(mod); } decl.ty = try decl_tv.ty.copy(decl_arena_allocator); @@ -4080,7 +4135,7 @@ fn semaDecl(mod: *Module, decl_index: Decl.Index) !bool { var type_changed = true; if (decl.has_tv) { type_changed = !decl.ty.eql(decl_tv.ty, mod); - decl.clearValues(gpa); + decl.clearValues(mod); } decl.owns_tv = false; @@ -4694,7 +4749,7 @@ pub fn clearDecl( if (decl.getInnerNamespace()) |namespace| { try namespace.deleteAllDecls(mod, outdated_decls); } - decl.clearValues(gpa); + decl.clearValues(mod); } if (decl.deletion_flag) { @@ -5623,7 +5678,7 @@ pub fn populateTestFunctions(mod: *Module) !void { // Since we are replacing the Decl's value we must perform cleanup on the // previous value. - decl.clearValues(gpa); + decl.clearValues(mod); decl.ty = new_ty; decl.val = new_val; decl.has_tv = true; diff --git a/src/Sema.zig b/src/Sema.zig index cb34fd158f..d3fca6d2b2 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -3842,20 +3842,44 @@ fn zirStr(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Ins fn addStrLit(sema: *Sema, block: *Block, zir_bytes: []const u8) CompileError!Air.Inst.Ref { // `zir_bytes` references memory inside the ZIR module, which can get deallocated // after semantic analysis is complete, for example in the case of the initialization - // expression of a variable declaration. We need the memory to be in the new - // anonymous Decl's arena. - var anon_decl = try block.startAnonDecl(LazySrcLoc.unneeded); - defer anon_decl.deinit(); + // expression of a variable declaration. + const mod = sema.mod; + const gpa = sema.gpa; + const string_bytes = &mod.string_literal_bytes; + const StringLiteralAdapter = Module.StringLiteralAdapter; + const StringLiteralContext = Module.StringLiteralContext; + try string_bytes.ensureUnusedCapacity(gpa, zir_bytes.len); + const gop = try mod.string_literal_table.getOrPutContextAdapted(gpa, zir_bytes, StringLiteralAdapter{ + .bytes = string_bytes, + }, StringLiteralContext{ + .bytes = string_bytes, + }); + if (!gop.found_existing) { + gop.key_ptr.* = .{ + .index = @intCast(u32, string_bytes.items.len), + .len = @intCast(u32, zir_bytes.len), + }; + string_bytes.appendSliceAssumeCapacity(zir_bytes); + gop.value_ptr.* = .none; + } + const decl_index = gop.value_ptr.unwrap() orelse di: { + var anon_decl = try block.startAnonDecl(LazySrcLoc.unneeded); + defer anon_decl.deinit(); - const bytes = try anon_decl.arena().dupeZ(u8, zir_bytes); + const decl_index = try anon_decl.finish( + try Type.Tag.array_u8_sentinel_0.create(anon_decl.arena(), gop.key_ptr.len), + try Value.Tag.str_lit.create(anon_decl.arena(), gop.key_ptr.*), + 0, // default alignment + ); - const new_decl = try anon_decl.finish( - try Type.Tag.array_u8_sentinel_0.create(anon_decl.arena(), bytes.len), - try Value.Tag.bytes.create(anon_decl.arena(), bytes[0 .. bytes.len + 1]), - 0, // default alignment - ); + // Needed so that `Decl.clearValues` will additionally set the corresponding + // string literal table value back to `Decl.OptionalIndex.none`. + mod.declPtr(decl_index).owns_tv = true; - return sema.analyzeDeclRef(new_decl); + gop.value_ptr.* = decl_index.toOptional(); + break :di decl_index; + }; + return sema.analyzeDeclRef(decl_index); } fn zirInt(sema: *Sema, block: *Block, inst: Zir.Inst.Index) CompileError!Air.Inst.Ref { @@ -19762,6 +19786,35 @@ fn beginComptimePtrMutation( .ty = elem_ty, }; }, + .str_lit => { + // An array is memory-optimized to store a slice of bytes, but we are about + // to modify an individual field and the representation has to change. + // If we wanted to avoid this, there would need to be special detection + // elsewhere to identify when writing a value to an array element that is stored + // using the `str_lit` tag, and handle it without making a call to this function. + const arena = parent.beginArena(sema.mod); + defer parent.finishArena(sema.mod); + + const str_lit = parent.val.castTag(.str_lit).?.data; + const dest_len = parent.ty.arrayLenIncludingSentinel(); + const bytes = sema.mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + const elems = try arena.alloc(Value, @intCast(usize, dest_len)); + for (bytes) |byte, i| { + elems[i] = try Value.Tag.int_u64.create(arena, byte); + } + if (parent.ty.sentinel()) |sent_val| { + assert(elems.len == bytes.len + 1); + elems[bytes.len] = sent_val; + } + + parent.val.* = try Value.Tag.aggregate.create(arena, elems); + + return ComptimePtrMutationKit{ + .decl_ref_mut = parent.decl_ref_mut, + .val = &elems[elem_ptr.index], + .ty = elem_ty, + }; + }, .repeated => { // An array is memory-optimized to store only a single element value, and // that value is understood to be the same for the entire length of the array. @@ -20097,10 +20150,23 @@ fn beginComptimePtrLoad( } } - deref.pointee = if (elem_ptr.index < check_len) TypedValue{ + if (elem_ptr.index >= check_len) { + deref.pointee = null; + break :blk deref; + } + if (elem_ptr.index == check_len - 1) { + if (array_tv.ty.sentinel()) |sent| { + deref.pointee = TypedValue{ + .ty = elem_ty, + .val = sent, + }; + break :blk deref; + } + } + deref.pointee = TypedValue{ .ty = elem_ty, .val = try array_tv.val.elemValue(sema.mod, sema.arena, elem_ptr.index), - } else null; + }; break :blk deref; }, diff --git a/src/TypedValue.zig b/src/TypedValue.zig index b6aee29a4b..9f69e4c8bd 100644 --- a/src/TypedValue.zig +++ b/src/TypedValue.zig @@ -295,6 +295,11 @@ pub fn print( return writer.print(".{s}", .{ty.enumFieldName(val.castTag(.enum_field_index).?.data)}); }, .bytes => return writer.print("\"{}\"", .{std.zig.fmtEscapes(val.castTag(.bytes).?.data)}), + .str_lit => { + const str_lit = val.castTag(.str_lit).?.data; + const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + return writer.print("\"{}\"", .{std.zig.fmtEscapes(bytes)}); + }, .repeated => { if (level == 0) { return writer.writeAll(".{ ... }"); diff --git a/src/codegen.zig b/src/codegen.zig index def69d952f..bd556baa5e 100644 --- a/src/codegen.zig +++ b/src/codegen.zig @@ -203,11 +203,23 @@ pub fn generateSymbol( }, .Array => switch (typed_value.val.tag()) { .bytes => { - const payload = typed_value.val.castTag(.bytes).?; + const bytes = typed_value.val.castTag(.bytes).?.data; const len = @intCast(usize, typed_value.ty.arrayLenIncludingSentinel()); // The bytes payload already includes the sentinel, if any try code.ensureUnusedCapacity(len); - code.appendSliceAssumeCapacity(payload.data[0..len]); + code.appendSliceAssumeCapacity(bytes[0..len]); + return Result{ .appended = {} }; + }, + .str_lit => { + const str_lit = typed_value.val.castTag(.str_lit).?.data; + const mod = bin_file.options.module.?; + const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + try code.ensureUnusedCapacity(bytes.len + 1); + code.appendSliceAssumeCapacity(bytes); + if (typed_value.ty.sentinel()) |sent_val| { + const byte = @intCast(u8, sent_val.toUnsignedInt(target)); + code.appendAssumeCapacity(byte); + } return Result{ .appended = {} }; }, .aggregate => { diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 6de001e5fd..ef33f39f55 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -2936,9 +2936,39 @@ pub const DeclGen = struct { return dg.context.constString( bytes.ptr, @intCast(c_uint, tv.ty.arrayLenIncludingSentinel()), - .True, // don't null terminate. bytes has the sentinel, if any. + .True, // Don't null terminate. Bytes has the sentinel, if any. ); }, + .str_lit => { + const str_lit = tv.val.castTag(.str_lit).?.data; + const bytes = dg.module.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + if (tv.ty.sentinel()) |sent_val| { + const byte = @intCast(u8, sent_val.toUnsignedInt(target)); + if (byte == 0 and bytes.len > 0) { + return dg.context.constString( + bytes.ptr, + @intCast(c_uint, bytes.len), + .False, // Yes, null terminate. + ); + } + var array = std.ArrayList(u8).init(dg.gpa); + defer array.deinit(); + try array.ensureUnusedCapacity(bytes.len + 1); + array.appendSliceAssumeCapacity(bytes); + array.appendAssumeCapacity(byte); + return dg.context.constString( + array.items.ptr, + @intCast(c_uint, array.items.len), + .True, // Don't null terminate. + ); + } else { + return dg.context.constString( + bytes.ptr, + @intCast(c_uint, bytes.len), + .True, // Don't null terminate. `bytes` has the sentinel, if any. + ); + } + }, .aggregate => { const elem_vals = tv.val.castTag(.aggregate).?.data; const elem_ty = tv.ty.elemType(); diff --git a/src/value.zig b/src/value.zig index 1280adf1e0..310384c3c3 100644 --- a/src/value.zig +++ b/src/value.zig @@ -126,6 +126,8 @@ pub const Value = extern union { field_ptr, /// A slice of u8 whose memory is managed externally. bytes, + /// Similar to bytes however it stores an index relative to `Module.string_literal_bytes`. + str_lit, /// This value is repeated some number of times. The amount of times to repeat /// is stored externally. repeated, @@ -285,6 +287,7 @@ pub const Value = extern union { .enum_literal, => Payload.Bytes, + .str_lit => Payload.StrLit, .slice => Payload.Slice, .enum_field_index => Payload.U32, @@ -538,6 +541,7 @@ pub const Value = extern union { }; return Value{ .ptr_otherwise = &new_payload.base }; }, + .str_lit => return self.copyPayloadShallow(arena, Payload.StrLit), .repeated, .eu_payload, .opt_payload, @@ -764,6 +768,12 @@ pub const Value = extern union { .enum_literal => return out_stream.print(".{}", .{std.zig.fmtId(val.castTag(.enum_literal).?.data)}), .enum_field_index => return out_stream.print("(enum field {d})", .{val.castTag(.enum_field_index).?.data}), .bytes => return out_stream.print("\"{}\"", .{std.zig.fmtEscapes(val.castTag(.bytes).?.data)}), + .str_lit => { + const str_lit = val.castTag(.str_lit).?.data; + return out_stream.print("(.str_lit index={d} len={d})", .{ + str_lit.index, str_lit.len, + }); + }, .repeated => { try out_stream.writeAll("(repeated) "); val = val.castTag(.repeated).?.data; @@ -824,6 +834,11 @@ pub const Value = extern union { const adjusted_bytes = bytes[0..adjusted_len]; return allocator.dupe(u8, adjusted_bytes); }, + .str_lit => { + const str_lit = val.castTag(.str_lit).?.data; + const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + return allocator.dupe(u8, bytes); + }, .enum_literal => return allocator.dupe(u8, val.castTag(.enum_literal).?.data), .repeated => { const byte = @intCast(u8, val.castTag(.repeated).?.data.toUnsignedInt(target)); @@ -2537,6 +2552,20 @@ pub const Value = extern union { return initPayload(&buffer.base); } }, + .str_lit => { + const str_lit = val.castTag(.str_lit).?.data; + const bytes = mod.string_literal_bytes.items[str_lit.index..][0..str_lit.len]; + const byte = bytes[index]; + if (arena) |a| { + return Tag.int_u64.create(a, byte); + } else { + buffer.* = .{ + .base = .{ .tag = .int_u64 }, + .data = byte, + }; + return initPayload(&buffer.base); + } + }, // No matter the index; all the elements are the same! .repeated => return val.castTag(.repeated).?.data, @@ -2570,6 +2599,13 @@ pub const Value = extern union { return switch (val.tag()) { .empty_array_sentinel => if (start == 0 and end == 1) val else Value.initTag(.empty_array), .bytes => Tag.bytes.create(arena, val.castTag(.bytes).?.data[start..end]), + .str_lit => { + const str_lit = val.castTag(.str_lit).?.data; + return Tag.str_lit.create(arena, .{ + .index = @intCast(u32, str_lit.index + start), + .len = @intCast(u32, end - start), + }); + }, .aggregate => Tag.aggregate.create(arena, val.castTag(.aggregate).?.data[start..end]), .slice => sliceArray(val.castTag(.slice).?.data.ptr, mod, arena, start, end), @@ -4721,6 +4757,11 @@ pub const Value = extern union { data: []const u8, }; + pub const StrLit = struct { + base: Payload, + data: Module.StringLiteralContext.Key, + }; + pub const Aggregate = struct { base: Payload, /// Field values. The types are according to the struct or array type. |
