lib/compiler/resinator/lang.zig


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878

const std = @import("std");

/// Parses a language ID the way the Win32 RC command line interprets
/// language IDs specified as integers.
/// - Always interpreted as hexadecimal, but explicit 0x prefix is also allowed
/// - Wraps on overflow of u16
/// - Stops parsing on any invalid hexadecimal digits
/// - Errors if a digit is not the first char
/// - `-` (negative) prefix is allowed
///
/// Returns `error.InvalidLanguageId` if `str` is empty or if the first
/// character after the optional `-` and `0x` prefixes is not a hex digit.
///
/// NOTE(review): a lone `-` has no first character to reject and therefore
/// parses as 0; whether rc.exe agrees has not been confirmed.
pub fn parseInt(str: []const u8) error{InvalidLanguageId}!u16 {
    // An empty string has no leading digit; reject it rather than index out of bounds.
    if (str.len == 0) return error.InvalidLanguageId;

    const radix: u8 = 16;
    var buf = str;

    const negative = buf[0] == '-';
    if (negative) buf = buf[1..];

    // The prefix is only skipped when at least one character follows it, so a
    // bare "0x" is parsed as the digit 0 followed by an invalid character.
    if (buf.len > 2 and buf[0] == '0' and buf[1] == 'x') {
        buf = buf[2..];
    }

    var result: u16 = 0;
    for (buf, 0..) |c, i| {
        const digit = std.fmt.charToDigit(c, radix) catch {
            // First digit must be valid
            if (i == 0) return error.InvalidLanguageId;
            // On any later invalid digit, just stop parsing but don't fail
            break;
        };
        // Wrapping arithmetic implements the documented u16 overflow behavior.
        result = result *% radix +% digit;
    }

    // Negation is two's complement within u16, e.g. `-1` becomes 0xffff.
    return if (negative) 0 -% result else result;
}

test parseInt {
    try std.testing.expectEqual(@as(u16, 0x16), try parseInt("16"));
    try std.testing.expectEqual(@as(u16, 0x1a), try parseInt("0x1A"));
    try std.testing.expectEqual(@as(u16, 0x1a), try parseInt("0x1Azzzz"));
    try std.testing.expectEqual(@as(u16, 0xffff), try parseInt("-1"));
    try std.testing.expectEqual(@as(u16, 0xffea), try parseInt("-0x16"));
    try std.testing.expectEqual(@as(u16, 0x0), try parseInt("0o100"));
    try std.testing.expectEqual(@as(u16, 0x1), try parseInt("10001"));
    try std.testing.expectError(error.InvalidLanguageId, parseInt("--1"));
    try std.testing.expectError(error.InvalidLanguageId, parseInt("0xha"));
    try std.testing.expectError(error.InvalidLanguageId, parseInt("¹"));
    try std.testing.expectError(error.InvalidLanguageId, parseInt("~1"));
    // Empty input must be rejected instead of reading past the end of the slice
    try std.testing.expectError(error.InvalidLanguageId, parseInt(""));
}

/// Converts a language tag to a numeric language ID following the rules of
/// the Win32 RC command line: an invalid tag is an error, while a tag that
/// is valid but has no specific assigned ID maps to LOCALE_CUSTOM_UNSPECIFIED.
pub fn tagToInt(tag: []const u8) error{InvalidLanguageTag}!u16 {
    const id = (try tagToId(tag)) orelse return LOCALE_CUSTOM_UNSPECIFIED;
    return @intFromEnum(id);
}

/// Looks up the `LanguageId` assigned to `tag`.
///
/// Returns `error.InvalidLanguageTag` if `tag` fails to parse, and `null` if
/// the tag is well-formed but has no assigned ID.
pub fn tagToId(tag: []const u8) error{InvalidLanguageTag}!?LanguageId {
    // Length of the longest tag name that has an assigned ID.
    const max_known_len: usize = comptime max_len: {
        var longest: usize = 0;
        for (@typeInfo(LanguageId).@"enum".fields) |field| {
            longest = @max(longest, field.name.len);
        }
        break :max_len longest;
    };

    const parsed = try parse(tag);

    // Two cases where the lookup cannot succeed and is skipped:
    // - no tag with an assigned ID currently has multiple suffixes
    // - the tag is longer than every tag that has an assigned ID
    if (parsed.multiple_suffixes or tag.len > max_known_len) return null;

    // A valid alternate sort order (e.g. `de-de_phoneb`) is looked up without
    // its suffix (as `de-de`); the `+ 1` drops the separator before the suffix.
    const lookup_tag = if (parsed.isSuffixValidSortOrder())
        tag[0 .. tag.len - (parsed.suffix.?.len + 1)]
    else
        tag;

    var storage: [max_known_len]u8 = undefined;
    const normalized = normalizeTag(lookup_tag, &storage);

    if (std.meta.stringToEnum(LanguageId, normalized)) |id| return id;

    // The one tag that shares its ID with another tag is handled by hand.
    if (std.mem.eql(u8, "ff_latn_ng", normalized)) return LanguageId.ff_ng;

    return null;
}

test tagToId {
    const Case = struct {
        tag: []const u8,
        expected: LanguageId,
    };
    const cases = [_]Case{
        .{ .tag = "ar-ae", .expected = LanguageId.ar_ae },
        .{ .tag = "AR_AE", .expected = LanguageId.ar_ae },
        .{ .tag = "ff-ng", .expected = LanguageId.ff_ng },
        // Special case: this tag maps to the same ID as `ff-ng`
        .{ .tag = "ff-Latn-NG", .expected = LanguageId.ff_ng },
    };
    for (cases) |case| {
        try std.testing.expectEqual(case.expected, (try tagToId(case.tag)).?);
    }
}

// Checks two properties of `tagToId`:
// 1. every `LanguageId` field name, used directly as a tag, maps back to that field
// 2. every entry of `valid_alternate_sorts`, written as `lang-country-suffix`,
//    maps to the ID of the `lang_country` field
test "exhaustive tagToId" {
    // Raise the comptime branch quota for the inline loops below.
    @setEvalBranchQuota(2000);
    inline for (@typeInfo(LanguageId).@"enum".fields) |field| {
        const id = tagToId(field.name) catch |err| {
            // Print the offending tag so the failure can be identified
            std.debug.print("tag: {s}\n", .{field.name});
            return err;
        };
        try std.testing.expectEqual(@field(LanguageId, field.name), id orelse {
            std.debug.print("tag: {s}, got null\n", .{field.name});
            return error.TestExpectedEqual;
        });
    }
    // Scratch space for the `lang-country-suffix` tag written for each alternate sort
    var buf: [32]u8 = undefined;
    inline for (valid_alternate_sorts) |parsed_sort| {
        var fbs: std.Io.Writer = .fixed(&buf);
        const writer = &fbs;
        writer.writeAll(parsed_sort.language_code) catch unreachable;
        writer.writeAll("-") catch unreachable;
        writer.writeAll(parsed_sort.country_code.?) catch unreachable;
        writer.writeAll("-") catch unreachable;
        writer.writeAll(parsed_sort.suffix.?) catch unreachable;
        // Build the expected enum field name `ll_cc` at comptime. The fixed
        // 5-byte layout relies on every alternate sort having a 2-character
        // language code and a 2-character country code.
        const expected_field_name = comptime field: {
            var name_buf: [5]u8 = undefined;
            @memcpy(name_buf[0..parsed_sort.language_code.len], parsed_sort.language_code);
            name_buf[2] = '_';
            @memcpy(name_buf[3..], parsed_sort.country_code.?);
            break :field name_buf;
        };
        const expected = @field(LanguageId, &expected_field_name);
        const id = tagToId(fbs.buffered()) catch |err| {
            std.debug.print("tag: {s}\n", .{fbs.buffered()});
            return err;
        };
        try std.testing.expectEqual(expected, id orelse {
            std.debug.print("tag: {s}, expected: {}, got null\n", .{ fbs.buffered(), expected });
            return error.TestExpectedEqual;
        });
    }
}

/// Writes a normalized copy of `tag` into `buf` and returns the written
/// portion of `buf`. Normalization replaces every `-` with `_` and
/// ASCII-lowercases every other byte.
///
/// Asserts that `buf` is at least as long as `tag`.
fn normalizeTag(tag: []const u8, buf: []u8) []u8 {
    std.debug.assert(tag.len <= buf.len);
    const out = buf[0..tag.len];
    for (out, tag) |*dst, src| {
        dst.* = switch (src) {
            '-' => '_',
            else => std.ascii.toLower(src),
        };
    }
    return out;
}

/// Language ID returned by `tagToInt` for tags that are valid but have no
/// specific assigned ID.
///
/// https://winprotocoldoc.blob.core.windows.net/productionwindowsarchives/MS-LCID/%5bMS-LCID%5d.pdf#%5B%7B%22num%22%3A72%2C%22gen%22%3A0%7D%2C%7B%22name%22%3A%22XYZ%22%7D%2C69%2C574%2C0%5D
/// "When an LCID is requested for a locale without a
/// permanent LCID assignment, nor a temporary
/// assignment as above, the protocol will respond
/// with LOCALE_CUSTOM_UNSPECIFIED for all such
/// locales. Because this single value is used for
/// numerous possible locale names, it is impossible to
/// round trip this locale, even temporarily.
/// Applications should discard this value as soon as
/// possible and never persist it. If the system is
/// forced to respond to a request for
/// LCID_CUSTOM_UNSPECIFIED, it will fall back to
/// the current user locale. This is often incorrect but
/// may prevent an application or component from
/// failing. As the meaning of this temporary LCID is
/// unstable, it should never be used for interchange
/// or persisted data. This is a 1-to-many relationship
/// that is very unstable."
pub const LOCALE_CUSTOM_UNSPECIFIED = 0x1000;

/// Primary language ID for English.
pub const LANG_ENGLISH = 0x09;
/// Sublanguage ID for English (United States).
pub const SUBLANG_ENGLISH_US = 0x01;

/// Combines a primary language ID and a sublanguage ID into a 16-bit
/// language ID, equivalent to the Win32 `MAKELANGID` macro.
///
/// The sublanguage occupies bits 15..10 and the primary language occupies
/// bits 9..0, e.g. `MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US)` is 0x0409.
///
/// https://learn.microsoft.com/en-us/windows/win32/intl/language-identifiers
pub fn MAKELANGID(primary: u10, sublang: u6) u16 {
    return (@as(u16, sublang) << 10) | primary;
}

/// Splits a language tag into its components and validates its shape.
///
/// The slices stored in the returned `Parsed` are parts of `lang_tag` itself
/// (nothing is copied), so they are only valid for as long as `lang_tag` is.
/// Parts may be separated by either `-` or `_`. Returns
/// `error.InvalidLanguageTag` if the tag does not match the format below.
///
/// Language tag format expressed as a regular expression (rough approximation):
///
/// [a-zA-Z]{1,3}([-_][a-zA-Z]{4})?([-_][a-zA-Z]{2})?([-_][a-zA-Z0-9]{1,8})?
///     lang    |     script      |      country    |       suffix
///
/// Notes:
/// - If lang code is 1 char, it seems to mean that everything afterwards uses suffix
///   parsing rules (e.g. `a-0` and `a-00000000` are allowed).
/// - There can also be any number of trailing suffix parts as long as they each
///   would be a valid suffix part, e.g. `en-us-blah-blah1-blah2-blah3` is allowed.
/// - When doing lookups, trailing suffix parts are taken into account, e.g.
///   `ca-es-valencia` is not considered equivalent to `ca-es-valencia-blah`.
/// - A suffix is only allowed if:
///   + Lang code is 1 char long, or
///   + A country code is present, or
///   + A script tag is not present and:
///      - the suffix is numeric-only and has a length of 3, or
///      - the lang is `qps` and the suffix is `ploca` or `plocm`
pub fn parse(lang_tag: []const u8) error{InvalidLanguageTag}!Parsed {
    var it = std.mem.splitAny(u8, lang_tag, "-_");
    // The first part is always the language code: 1 to 3 alphabetic characters.
    const lang_code = it.first();
    const is_valid_lang_code = lang_code.len >= 1 and lang_code.len <= 3 and isAllAlphabetic(lang_code);
    if (!is_valid_lang_code) return error.InvalidLanguageTag;
    var parsed = Parsed{
        .language_code = lang_code,
    };
    // The second part could be a script tag, a country code, or a suffix
    if (it.next()) |part_str| {
        // The lang code being length 1 behaves strangely, so fully special case it.
        if (lang_code.len == 1) {
            // This is almost certainly not the 'right' way to do this, but I don't have a method
            // to determine how exactly these language tags are parsed, and it seems like
            // suffix parsing rules apply generally (digits allowed, length of 1 to 8).
            //
            // However, because we want to be able to lookup `x-iv-mathan` normally without
            // `multiple_suffixes` being set to true, we need to make sure to treat two-length
            // alphabetic parts as a country code.
            if (part_str.len == 2 and isAllAlphabetic(part_str)) {
                parsed.country_code = part_str;
            }
            // Everything else, though, we can just throw into the suffix as long as the normal
            // rules apply.
            else if (part_str.len > 0 and part_str.len <= 8 and isAllAlphanumeric(part_str)) {
                parsed.suffix = part_str;
            } else {
                return error.InvalidLanguageTag;
            }
        } else if (part_str.len == 4 and isAllAlphabetic(part_str)) {
            // Exactly 4 alphabetic characters: script tag
            parsed.script_tag = part_str;
        } else if (part_str.len == 2 and isAllAlphabetic(part_str)) {
            // Exactly 2 alphabetic characters: country code
            parsed.country_code = part_str;
        }
        // Only a 3-len numeric suffix is allowed as the second part of a tag
        else if (part_str.len == 3 and isAllNumeric(part_str)) {
            parsed.suffix = part_str;
        }
        // Special case for qps-ploca and qps-plocm
        else if (std.ascii.eqlIgnoreCase(lang_code, "qps") and
            (std.ascii.eqlIgnoreCase(part_str, "ploca") or
                std.ascii.eqlIgnoreCase(part_str, "plocm")))
        {
            parsed.suffix = part_str;
        } else {
            return error.InvalidLanguageTag;
        }
    } else {
        // If there's no part besides a 1-len lang code, then it is malformed
        if (lang_code.len == 1) return error.InvalidLanguageTag;
        return parsed;
    }
    // A script tag may only be followed by a country code (or by nothing at all)
    if (parsed.script_tag != null) {
        if (it.next()) |part_str| {
            if (part_str.len == 2 and isAllAlphabetic(part_str)) {
                parsed.country_code = part_str;
            } else {
                // Suffix is not allowed when a country code is not present.
                return error.InvalidLanguageTag;
            }
        } else {
            return parsed;
        }
    }
    // We've now parsed any potential script tag/country codes, so anything remaining
    // is a suffix
    while (it.next()) |part_str| {
        // Each suffix part must be 1 to 8 alphanumeric characters
        if (part_str.len == 0 or part_str.len > 8 or !isAllAlphanumeric(part_str)) {
            return error.InvalidLanguageTag;
        }
        // Only the first suffix is stored; any further ones just set the flag
        if (parsed.suffix == null) {
            parsed.suffix = part_str;
        } else {
            // In theory we could return early here but we still want to validate
            // that each part is a valid suffix all the way to the end, e.g.
            // we should reject `en-us-suffix-a-b-c-!!!` because of the invalid `!!!`
            // suffix part.
            parsed.multiple_suffixes = true;
        }
    }
    return parsed;
}

/// The components of a language tag.
pub const Parsed = struct {
    /// The first part of the tag, e.g. `en` in `en-us`
    language_code: []const u8,
    /// e.g. `Latn` in `ff-Latn-NG`
    script_tag: ?[]const u8 = null,
    /// e.g. `us` in `en-us`
    country_code: ?[]const u8 = null,
    /// Can be a sort order (e.g. phoneb) or something like valencia, 001, etc
    suffix: ?[]const u8 = null,
    /// Any number of suffixes may be present, but their values never matter;
    /// all that matters is whether more than one exists, so that e.g.
    /// `ca-es-valencia-blah` can be told apart from `ca-es-valencia`. Tracking
    /// this with a bool means parsing needs neither (a) dynamic allocation nor
    /// (b) a limit on the number of suffixes.
    multiple_suffixes: bool = false,

    /// Returns true if this tag consists of exactly a language code, a country
    /// code, and a single suffix, and that combination matches an entry of
    /// `valid_alternate_sorts` (compared ASCII case-insensitively).
    pub fn isSuffixValidSortOrder(self: Parsed) bool {
        if (self.script_tag != null or self.multiple_suffixes) return false;
        const country = self.country_code orelse return false;
        const suffix = self.suffix orelse return false;

        for (valid_alternate_sorts) |candidate| {
            const matches = std.ascii.eqlIgnoreCase(candidate.language_code, self.language_code) and
                std.ascii.eqlIgnoreCase(candidate.country_code.?, country) and
                std.ascii.eqlIgnoreCase(candidate.suffix.?, suffix);
            if (matches) return true;
        }
        return false;
    }
};

/// The language/country/suffix combinations that `Parsed.isSuffixValidSortOrder`
/// accepts as alternate sort orders. Entries are matched ASCII case-insensitively,
/// and every entry must set both `country_code` and `suffix` since they are
/// unwrapped unconditionally during matching.
///
/// https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/70feba9f-294e-491e-b6eb-56532684c37f
/// See the table following this text: "Alternate sorts can be selected by using one of the identifiers from the following table."
const valid_alternate_sorts = [_]Parsed{
    // Note: x-IV-mathan is omitted due to how lookups are implemented.
    //       This table is used to make e.g. `de-de_phoneb` get looked up
    //       as `de-de` (the suffix is omitted for the lookup), but x-iv-mathan
    //       instead needs to be looked up with the suffix included because
    //       `x-iv` is not a tag with an assigned ID.
    // Note: the "exhaustive tagToId" test builds `ll_cc` names from these entries
    //       and relies on 2-character language and country codes.
    .{ .language_code = "de", .country_code = "de", .suffix = "phoneb" },
    .{ .language_code = "hu", .country_code = "hu", .suffix = "tchncl" },
    .{ .language_code = "ka", .country_code = "ge", .suffix = "modern" },
    .{ .language_code = "zh", .country_code = "cn", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "sg", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "mo", .suffix = "stroke" },
    .{ .language_code = "zh", .country_code = "tw", .suffix = "pronun" },
    .{ .language_code = "zh", .country_code = "tw", .suffix = "radstr" },
    .{ .language_code = "ja", .country_code = "jp", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "hk", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "mo", .suffix = "radstr" },
    .{ .language_code = "zh", .country_code = "cn", .suffix = "phoneb" },
    .{ .language_code = "zh", .country_code = "sg", .suffix = "phoneb" },
};

test "parse" {
    // language code only
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "en",
    }, try parse("en"));
    // language code + country code
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "en",
        .country_code = "us",
    }, try parse("en-us"));
    // 3-digit numeric suffix directly after the language code
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "en",
        .suffix = "123",
    }, try parse("en-123"));
    // only the first suffix is stored, any extra ones set `multiple_suffixes`
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "en",
        .suffix = "123",
        .multiple_suffixes = true,
    }, try parse("en-123-blah"));
    // `-` and `_` separators can be mixed
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "en",
        .country_code = "us",
        .suffix = "123",
        .multiple_suffixes = true,
    }, try parse("en-us_123-blah"));
    // language code + script tag
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "eng",
        .script_tag = "Latn",
    }, try parse("eng-Latn"));
    // same, but with `_` as the separator
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "eng",
        .script_tag = "Latn",
    }, try parse("eng_Latn"));
    // language code + script tag + country code
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "ff",
        .script_tag = "Latn",
        .country_code = "NG",
    }, try parse("ff-Latn-NG"));
    // qps-ploca / qps-plocm special case (case-insensitive)
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "qps",
        .suffix = "Plocm",
    }, try parse("qps-Plocm"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "qps",
        .suffix = "ploca",
    }, try parse("qps-ploca"));
    // 1-len lang code: a 2-len alphabetic part is still a country code
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "x",
        .country_code = "IV",
        .suffix = "mathan",
    }, try parse("x-IV-mathan"));
    // 1-len lang code: anything else follows suffix rules
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "a",
        .suffix = "a",
    }, try parse("a-a"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "a",
        .suffix = "000",
    }, try parse("a-000"));
    try std.testing.expectEqualDeep(Parsed{
        .language_code = "a",
        .suffix = "00000000",
    }, try parse("a-00000000"));
    // suffix not allowed if script tag is present without country code
    try std.testing.expectError(error.InvalidLanguageTag, parse("eng-Latn-suffix"));
    // suffix must be 3 numeric digits if neither script tag nor country code is present
    try std.testing.expectError(error.InvalidLanguageTag, parse("eng-suffix"));
    try std.testing.expectError(error.InvalidLanguageTag, parse("en-plocm"));
    // 1-len lang code is not allowed if it's the only part
    try std.testing.expectError(error.InvalidLanguageTag, parse("e"));
}

/// Returns true if every byte of `str` is an ASCII letter.
/// An empty `str` yields true.
fn isAllAlphabetic(str: []const u8) bool {
    return for (str) |byte| {
        if (!std.ascii.isAlphabetic(byte)) break false;
    } else true;
}

/// Returns true if every byte of `str` is an ASCII letter or digit.
/// An empty `str` yields true.
fn isAllAlphanumeric(str: []const u8) bool {
    return for (str) |byte| {
        if (!std.ascii.isAlphanumeric(byte)) break false;
    } else true;
}

/// Returns true if every byte of `str` is an ASCII decimal digit.
/// An empty `str` yields true.
fn isAllNumeric(str: []const u8) bool {
    return for (str) |byte| {
        if (!std.ascii.isDigit(byte)) break false;
    } else true;
}

/// Maps normalized language tags (the field names) to their 16-bit Language IDs
/// (the field values).
///
/// Derived from https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-lcid/70feba9f-294e-491e-b6eb-56532684c37f
/// - Protocol Revision: 15.0
/// - Language / Language ID / Language Tag table in Appendix A
/// - Removed all rows that have Language ID 0x1000 (LOCALE_CUSTOM_UNSPECIFIED)
/// - Normalized each language tag (lowercased, replaced all `-` with `_`)
/// - There is one special case where two tags are mapped to the same ID. Because an
///   enum cannot contain two fields with the same value, the following tag has been
///   omitted and must be special cased during lookup so that it maps to the same ID
///   as ff_ng (0x0467):
///     ff_latn_ng = 0x0467, // Fulah (Latin), Nigeria
/// - x_iv_mathan has been added; it is not in the table but does appear in the
///   Alternate sorts table as 0x007F (LANG_INVARIANT).
pub const LanguageId = enum(u16) {
    // Language tag = Language ID, // Language, Location (or type)
    af = 0x0036, // Afrikaans
    af_za = 0x0436, // Afrikaans, South Africa
    sq = 0x001C, // Albanian
    sq_al = 0x041C, // Albanian, Albania
    gsw = 0x0084, // Alsatian
    gsw_fr = 0x0484, // Alsatian, France
    am = 0x005E, // Amharic
    am_et = 0x045E, // Amharic, Ethiopia
    ar = 0x0001, // Arabic
    ar_dz = 0x1401, // Arabic, Algeria
    ar_bh = 0x3C01, // Arabic, Bahrain
    ar_eg = 0x0c01, // Arabic, Egypt
    ar_iq = 0x0801, // Arabic, Iraq
    ar_jo = 0x2C01, // Arabic, Jordan
    ar_kw = 0x3401, // Arabic, Kuwait
    ar_lb = 0x3001, // Arabic, Lebanon
    ar_ly = 0x1001, // Arabic, Libya
    ar_ma = 0x1801, // Arabic, Morocco
    ar_om = 0x2001, // Arabic, Oman
    ar_qa = 0x4001, // Arabic, Qatar
    ar_sa = 0x0401, // Arabic, Saudi Arabia
    ar_sy = 0x2801, // Arabic, Syria
    ar_tn = 0x1C01, // Arabic, Tunisia
    ar_ae = 0x3801, // Arabic, U.A.E.
    ar_ye = 0x2401, // Arabic, Yemen
    hy = 0x002B, // Armenian
    hy_am = 0x042B, // Armenian, Armenia
    as = 0x004D, // Assamese
    as_in = 0x044D, // Assamese, India
    az_cyrl = 0x742C, // Azerbaijani (Cyrillic)
    az_cyrl_az = 0x082C, // Azerbaijani (Cyrillic), Azerbaijan
    az = 0x002C, // Azerbaijani (Latin)
    az_latn = 0x782C, // Azerbaijani (Latin)
    az_latn_az = 0x042C, // Azerbaijani (Latin), Azerbaijan
    bn = 0x0045, // Bangla
    bn_bd = 0x0845, // Bangla, Bangladesh
    bn_in = 0x0445, // Bangla, India
    ba = 0x006D, // Bashkir
    ba_ru = 0x046D, // Bashkir, Russia
    eu = 0x002D, // Basque
    eu_es = 0x042D, // Basque, Spain
    be = 0x0023, // Belarusian
    be_by = 0x0423, // Belarusian, Belarus
    bs_cyrl = 0x641A, // Bosnian (Cyrillic)
    bs_cyrl_ba = 0x201A, // Bosnian (Cyrillic), Bosnia and Herzegovina
    bs_latn = 0x681A, // Bosnian (Latin)
    bs = 0x781A, // Bosnian (Latin)
    bs_latn_ba = 0x141A, // Bosnian (Latin), Bosnia and Herzegovina
    br = 0x007E, // Breton
    br_fr = 0x047E, // Breton, France
    bg = 0x0002, // Bulgarian
    bg_bg = 0x0402, // Bulgarian, Bulgaria
    my = 0x0055, // Burmese
    my_mm = 0x0455, // Burmese, Myanmar
    ca = 0x0003, // Catalan
    ca_es = 0x0403, // Catalan, Spain
    tzm_arab_ma = 0x045F, // Central Atlas Tamazight (Arabic), Morocco
    ku = 0x0092, // Central Kurdish
    ku_arab = 0x7c92, // Central Kurdish
    ku_arab_iq = 0x0492, // Central Kurdish, Iraq
    chr = 0x005C, // Cherokee
    chr_cher = 0x7c5C, // Cherokee
    chr_cher_us = 0x045C, // Cherokee, United States
    zh_hans = 0x0004, // Chinese (Simplified)
    zh = 0x7804, // Chinese (Simplified)
    zh_cn = 0x0804, // Chinese (Simplified), People's Republic of China
    zh_sg = 0x1004, // Chinese (Simplified), Singapore
    zh_hant = 0x7C04, // Chinese (Traditional)
    zh_hk = 0x0C04, // Chinese (Traditional), Hong Kong S.A.R.
    zh_mo = 0x1404, // Chinese (Traditional), Macao S.A.R.
    zh_tw = 0x0404, // Chinese (Traditional), Taiwan
    co = 0x0083, // Corsican
    co_fr = 0x0483, // Corsican, France
    hr = 0x001A, // Croatian
    hr_hr = 0x041A, // Croatian, Croatia
    hr_ba = 0x101A, // Croatian (Latin), Bosnia and Herzegovina
    cs = 0x0005, // Czech
    cs_cz = 0x0405, // Czech, Czech Republic
    da = 0x0006, // Danish
    da_dk = 0x0406, // Danish, Denmark
    prs = 0x008C, // Dari
    prs_af = 0x048C, // Dari, Afghanistan
    dv = 0x0065, // Divehi
    dv_mv = 0x0465, // Divehi, Maldives
    nl = 0x0013, // Dutch
    nl_be = 0x0813, // Dutch, Belgium
    nl_nl = 0x0413, // Dutch, Netherlands
    dz_bt = 0x0C51, // Dzongkha, Bhutan
    en = 0x0009, // English
    en_au = 0x0C09, // English, Australia
    en_bz = 0x2809, // English, Belize
    en_ca = 0x1009, // English, Canada
    en_029 = 0x2409, // English, Caribbean
    en_hk = 0x3C09, // English, Hong Kong
    en_in = 0x4009, // English, India
    en_ie = 0x1809, // English, Ireland
    en_jm = 0x2009, // English, Jamaica
    en_my = 0x4409, // English, Malaysia
    en_nz = 0x1409, // English, New Zealand
    en_ph = 0x3409, // English, Republic of the Philippines
    en_sg = 0x4809, // English, Singapore
    en_za = 0x1C09, // English, South Africa
    en_tt = 0x2c09, // English, Trinidad and Tobago
    en_ae = 0x4C09, // English, United Arab Emirates
    en_gb = 0x0809, // English, United Kingdom
    en_us = 0x0409, // English, United States
    en_zw = 0x3009, // English, Zimbabwe
    et = 0x0025, // Estonian
    et_ee = 0x0425, // Estonian, Estonia
    fo = 0x0038, // Faroese
    fo_fo = 0x0438, // Faroese, Faroe Islands
    fil = 0x0064, // Filipino
    fil_ph = 0x0464, // Filipino, Philippines
    fi = 0x000B, // Finnish
    fi_fi = 0x040B, // Finnish, Finland
    fr = 0x000C, // French
    fr_be = 0x080C, // French, Belgium
    fr_cm = 0x2c0C, // French, Cameroon
    fr_ca = 0x0c0C, // French, Canada
    fr_029 = 0x1C0C, // French, Caribbean
    fr_cd = 0x240C, // French, Congo, DRC
    fr_ci = 0x300C, // French, Côte d'Ivoire
    fr_fr = 0x040C, // French, France
    fr_ht = 0x3c0C, // French, Haiti
    fr_lu = 0x140C, // French, Luxembourg
    fr_ml = 0x340C, // French, Mali
    fr_ma = 0x380C, // French, Morocco
    fr_mc = 0x180C, // French, Principality of Monaco
    fr_re = 0x200C, // French, Reunion
    fr_sn = 0x280C, // French, Senegal
    fr_ch = 0x100C, // French, Switzerland
    fy = 0x0062, // Frisian
    fy_nl = 0x0462, // Frisian, Netherlands
    ff = 0x0067, // Fulah
    ff_latn = 0x7C67, // Fulah (Latin)
    ff_ng = 0x0467, // Fulah, Nigeria
    ff_latn_sn = 0x0867, // Fulah, Senegal
    gl = 0x0056, // Galician
    gl_es = 0x0456, // Galician, Spain
    ka = 0x0037, // Georgian
    ka_ge = 0x0437, // Georgian, Georgia
    de = 0x0007, // German
    de_at = 0x0C07, // German, Austria
    de_de = 0x0407, // German, Germany
    de_li = 0x1407, // German, Liechtenstein
    de_lu = 0x1007, // German, Luxembourg
    de_ch = 0x0807, // German, Switzerland
    el = 0x0008, // Greek
    el_gr = 0x0408, // Greek, Greece
    kl = 0x006F, // Greenlandic
    kl_gl = 0x046F, // Greenlandic, Greenland
    gn = 0x0074, // Guarani
    gn_py = 0x0474, // Guarani, Paraguay
    gu = 0x0047, // Gujarati
    gu_in = 0x0447, // Gujarati, India
    ha = 0x0068, // Hausa (Latin)
    ha_latn = 0x7C68, // Hausa (Latin)
    ha_latn_ng = 0x0468, // Hausa (Latin), Nigeria
    haw = 0x0075, // Hawaiian
    haw_us = 0x0475, // Hawaiian, United States
    he = 0x000D, // Hebrew
    he_il = 0x040D, // Hebrew, Israel
    hi = 0x0039, // Hindi
    hi_in = 0x0439, // Hindi, India
    hu = 0x000E, // Hungarian
    hu_hu = 0x040E, // Hungarian, Hungary
    is = 0x000F, // Icelandic
    is_is = 0x040F, // Icelandic, Iceland
    ig = 0x0070, // Igbo
    ig_ng = 0x0470, // Igbo, Nigeria
    id = 0x0021, // Indonesian
    id_id = 0x0421, // Indonesian, Indonesia
    iu = 0x005D, // Inuktitut (Latin)
    iu_latn = 0x7C5D, // Inuktitut (Latin)
    iu_latn_ca = 0x085D, // Inuktitut (Latin), Canada
    iu_cans = 0x785D, // Inuktitut (Syllabics)
    iu_cans_ca = 0x045d, // Inuktitut (Syllabics), Canada
    ga = 0x003C, // Irish
    ga_ie = 0x083C, // Irish, Ireland
    it = 0x0010, // Italian
    it_it = 0x0410, // Italian, Italy
    it_ch = 0x0810, // Italian, Switzerland
    ja = 0x0011, // Japanese
    ja_jp = 0x0411, // Japanese, Japan
    kn = 0x004B, // Kannada
    kn_in = 0x044B, // Kannada, India
    kr_latn_ng = 0x0471, // Kanuri (Latin), Nigeria
    ks = 0x0060, // Kashmiri
    ks_arab = 0x0460, // Kashmiri, Perso-Arabic
    ks_deva_in = 0x0860, // Kashmiri (Devanagari), India
    kk = 0x003F, // Kazakh
    kk_kz = 0x043F, // Kazakh, Kazakhstan
    km = 0x0053, // Khmer
    km_kh = 0x0453, // Khmer, Cambodia
    quc = 0x0086, // K'iche
    quc_latn_gt = 0x0486, // K'iche, Guatemala
    rw = 0x0087, // Kinyarwanda
    rw_rw = 0x0487, // Kinyarwanda, Rwanda
    sw = 0x0041, // Kiswahili
    sw_ke = 0x0441, // Kiswahili, Kenya
    kok = 0x0057, // Konkani
    kok_in = 0x0457, // Konkani, India
    ko = 0x0012, // Korean
    ko_kr = 0x0412, // Korean, Korea
    ky = 0x0040, // Kyrgyz
    ky_kg = 0x0440, // Kyrgyz, Kyrgyzstan
    lo = 0x0054, // Lao
    lo_la = 0x0454, // Lao, Lao P.D.R.
    la_va = 0x0476, // Latin, Vatican City
    lv = 0x0026, // Latvian
    lv_lv = 0x0426, // Latvian, Latvia
    lt = 0x0027, // Lithuanian
    lt_lt = 0x0427, // Lithuanian, Lithuania
    dsb = 0x7C2E, // Lower Sorbian
    dsb_de = 0x082E, // Lower Sorbian, Germany
    lb = 0x006E, // Luxembourgish
    lb_lu = 0x046E, // Luxembourgish, Luxembourg
    mk = 0x002F, // Macedonian
    mk_mk = 0x042F, // Macedonian, North Macedonia
    ms = 0x003E, // Malay
    ms_bn = 0x083E, // Malay, Brunei Darussalam
    ms_my = 0x043E, // Malay, Malaysia
    ml = 0x004C, // Malayalam
    ml_in = 0x044C, // Malayalam, India
    mt = 0x003A, // Maltese
    mt_mt = 0x043A, // Maltese, Malta
    mi = 0x0081, // Maori
    mi_nz = 0x0481, // Maori, New Zealand
    arn = 0x007A, // Mapudungun
    arn_cl = 0x047A, // Mapudungun, Chile
    mr = 0x004E, // Marathi
    mr_in = 0x044E, // Marathi, India
    moh = 0x007C, // Mohawk
    moh_ca = 0x047C, // Mohawk, Canada
    mn = 0x0050, // Mongolian (Cyrillic)
    mn_cyrl = 0x7850, // Mongolian (Cyrillic)
    mn_mn = 0x0450, // Mongolian (Cyrillic), Mongolia
    mn_mong = 0x7C50, // Mongolian (Traditional Mongolian)
    mn_mong_cn = 0x0850, // Mongolian (Traditional Mongolian), People's Republic of China
    mn_mong_mn = 0x0C50, // Mongolian (Traditional Mongolian), Mongolia
    ne = 0x0061, // Nepali
    ne_in = 0x0861, // Nepali, India
    ne_np = 0x0461, // Nepali, Nepal
    no = 0x0014, // Norwegian (Bokmal)
    nb = 0x7C14, // Norwegian (Bokmal)
    nb_no = 0x0414, // Norwegian (Bokmal), Norway
    nn = 0x7814, // Norwegian (Nynorsk)
    nn_no = 0x0814, // Norwegian (Nynorsk), Norway
    oc = 0x0082, // Occitan
    oc_fr = 0x0482, // Occitan, France
    @"or" = 0x0048, // Odia
    or_in = 0x0448, // Odia, India
    om = 0x0072, // Oromo
    om_et = 0x0472, // Oromo, Ethiopia
    ps = 0x0063, // Pashto
    ps_af = 0x0463, // Pashto, Afghanistan
    fa = 0x0029, // Persian
    fa_ir = 0x0429, // Persian, Iran
    pl = 0x0015, // Polish
    pl_pl = 0x0415, // Polish, Poland
    pt = 0x0016, // Portuguese
    pt_br = 0x0416, // Portuguese, Brazil
    pt_pt = 0x0816, // Portuguese, Portugal
    qps_ploca = 0x05FE, // Pseudo Language, Pseudo locale for east Asian/complex script localization testing
    qps_ploc = 0x0501, // Pseudo Language, Pseudo locale used for localization testing
    qps_plocm = 0x09FF, // Pseudo Language, Pseudo locale used for localization testing of mirrored locales
    pa = 0x0046, // Punjabi
    pa_arab = 0x7C46, // Punjabi
    pa_in = 0x0446, // Punjabi, India
    pa_arab_pk = 0x0846, // Punjabi, Islamic Republic of Pakistan
    quz = 0x006B, // Quechua
    quz_bo = 0x046B, // Quechua, Bolivia
    quz_ec = 0x086B, // Quechua, Ecuador
    quz_pe = 0x0C6B, // Quechua, Peru
    ro = 0x0018, // Romanian
    ro_md = 0x0818, // Romanian, Moldova
    ro_ro = 0x0418, // Romanian, Romania
    rm = 0x0017, // Romansh
    rm_ch = 0x0417, // Romansh, Switzerland
    ru = 0x0019, // Russian
    ru_md = 0x0819, // Russian, Moldova
    ru_ru = 0x0419, // Russian, Russia
    sah = 0x0085, // Sakha
    sah_ru = 0x0485, // Sakha, Russia
    smn = 0x703B, // Sami (Inari)
    smn_fi = 0x243B, // Sami (Inari), Finland
    smj = 0x7C3B, // Sami (Lule)
    smj_no = 0x103B, // Sami (Lule), Norway
    smj_se = 0x143B, // Sami (Lule), Sweden
    se = 0x003B, // Sami (Northern)
    se_fi = 0x0C3B, // Sami (Northern), Finland
    se_no = 0x043B, // Sami (Northern), Norway
    se_se = 0x083B, // Sami (Northern), Sweden
    sms = 0x743B, // Sami (Skolt)
    sms_fi = 0x203B, // Sami (Skolt), Finland
    sma = 0x783B, // Sami (Southern)
    sma_no = 0x183B, // Sami (Southern), Norway
    sma_se = 0x1C3B, // Sami (Southern), Sweden
    sa = 0x004F, // Sanskrit
    sa_in = 0x044F, // Sanskrit, India
    gd = 0x0091, // Scottish Gaelic
    gd_gb = 0x0491, // Scottish Gaelic, United Kingdom
    sr_cyrl = 0x6C1A, // Serbian (Cyrillic)
    sr_cyrl_ba = 0x1C1A, // Serbian (Cyrillic), Bosnia and Herzegovina
    sr_cyrl_me = 0x301A, // Serbian (Cyrillic), Montenegro
    sr_cyrl_rs = 0x281A, // Serbian (Cyrillic), Serbia
    sr_cyrl_cs = 0x0C1A, // Serbian (Cyrillic), Serbia and Montenegro (Former)
    sr_latn = 0x701A, // Serbian (Latin)
    sr = 0x7C1A, // Serbian (Latin)
    sr_latn_ba = 0x181A, // Serbian (Latin), Bosnia and Herzegovina
    sr_latn_me = 0x2c1A, // Serbian (Latin), Montenegro
    sr_latn_rs = 0x241A, // Serbian (Latin), Serbia
    sr_latn_cs = 0x081A, // Serbian (Latin), Serbia and Montenegro (Former)
    nso = 0x006C, // Sesotho sa Leboa
    nso_za = 0x046C, // Sesotho sa Leboa, South Africa
    tn = 0x0032, // Setswana
    tn_bw = 0x0832, // Setswana, Botswana
    tn_za = 0x0432, // Setswana, South Africa
    sd = 0x0059, // Sindhi
    sd_arab = 0x7C59, // Sindhi
    sd_arab_pk = 0x0859, // Sindhi, Islamic Republic of Pakistan
    si = 0x005B, // Sinhala
    si_lk = 0x045B, // Sinhala, Sri Lanka
    sk = 0x001B, // Slovak
    sk_sk = 0x041B, // Slovak, Slovakia
    sl = 0x0024, // Slovenian
    sl_si = 0x0424, // Slovenian, Slovenia
    so = 0x0077, // Somali
    so_so = 0x0477, // Somali, Somalia
    st = 0x0030, // Sotho
    st_za = 0x0430, // Sotho, South Africa
    es = 0x000A, // Spanish
    es_ar = 0x2C0A, // Spanish, Argentina
    es_ve = 0x200A, // Spanish, Bolivarian Republic of Venezuela
    es_bo = 0x400A, // Spanish, Bolivia
    es_cl = 0x340A, // Spanish, Chile
    es_co = 0x240A, // Spanish, Colombia
    es_cr = 0x140A, // Spanish, Costa Rica
    es_cu = 0x5c0A, // Spanish, Cuba
    es_do = 0x1c0A, // Spanish, Dominican Republic
    es_ec = 0x300A, // Spanish, Ecuador
    es_sv = 0x440A, // Spanish, El Salvador
    es_gt = 0x100A, // Spanish, Guatemala
    es_hn = 0x480A, // Spanish, Honduras
    es_419 = 0x580A, // Spanish, Latin America
    es_mx = 0x080A, // Spanish, Mexico
    es_ni = 0x4C0A, // Spanish, Nicaragua
    es_pa = 0x180A, // Spanish, Panama
    es_py = 0x3C0A, // Spanish, Paraguay
    es_pe = 0x280A, // Spanish, Peru
    es_pr = 0x500A, // Spanish, Puerto Rico
    es_es_tradnl = 0x040A, // Spanish, Spain
    es_es = 0x0c0A, // Spanish, Spain
    es_us = 0x540A, // Spanish, United States
    es_uy = 0x380A, // Spanish, Uruguay
    sv = 0x001D, // Swedish
    sv_fi = 0x081D, // Swedish, Finland
    sv_se = 0x041D, // Swedish, Sweden
    syr = 0x005A, // Syriac
    syr_sy = 0x045A, // Syriac, Syria
    tg = 0x0028, // Tajik (Cyrillic)
    tg_cyrl = 0x7C28, // Tajik (Cyrillic)
    tg_cyrl_tj = 0x0428, // Tajik (Cyrillic), Tajikistan
    tzm = 0x005F, // Tamazight (Latin)
    tzm_latn = 0x7C5F, // Tamazight (Latin)
    tzm_latn_dz = 0x085F, // Tamazight (Latin), Algeria
    ta = 0x0049, // Tamil
    ta_in = 0x0449, // Tamil, India
    ta_lk = 0x0849, // Tamil, Sri Lanka
    tt = 0x0044, // Tatar
    tt_ru = 0x0444, // Tatar, Russia
    te = 0x004A, // Telugu
    te_in = 0x044A, // Telugu, India
    th = 0x001E, // Thai
    th_th = 0x041E, // Thai, Thailand
    bo = 0x0051, // Tibetan
    bo_cn = 0x0451, // Tibetan, People's Republic of China
    ti = 0x0073, // Tigrinya
    ti_er = 0x0873, // Tigrinya, Eritrea
    ti_et = 0x0473, // Tigrinya, Ethiopia
    ts = 0x0031, // Tsonga
    ts_za = 0x0431, // Tsonga, South Africa
    tr = 0x001F, // Turkish
    tr_tr = 0x041F, // Turkish, Turkey
    tk = 0x0042, // Turkmen
    tk_tm = 0x0442, // Turkmen, Turkmenistan
    uk = 0x0022, // Ukrainian
    uk_ua = 0x0422, // Ukrainian, Ukraine
    hsb = 0x002E, // Upper Sorbian
    hsb_de = 0x042E, // Upper Sorbian, Germany
    ur = 0x0020, // Urdu
    ur_in = 0x0820, // Urdu, India
    ur_pk = 0x0420, // Urdu, Islamic Republic of Pakistan
    ug = 0x0080, // Uyghur
    ug_cn = 0x0480, // Uyghur, People's Republic of China
    uz_cyrl = 0x7843, // Uzbek (Cyrillic)
    uz_cyrl_uz = 0x0843, // Uzbek (Cyrillic), Uzbekistan
    uz = 0x0043, // Uzbek (Latin)
    uz_latn = 0x7C43, // Uzbek (Latin)
    uz_latn_uz = 0x0443, // Uzbek (Latin), Uzbekistan
    ca_es_valencia = 0x0803, // Valencian, Spain
    ve = 0x0033, // Venda
    ve_za = 0x0433, // Venda, South Africa
    vi = 0x002A, // Vietnamese
    vi_vn = 0x042A, // Vietnamese, Vietnam
    cy = 0x0052, // Welsh
    cy_gb = 0x0452, // Welsh, United Kingdom
    wo = 0x0088, // Wolof
    wo_sn = 0x0488, // Wolof, Senegal
    xh = 0x0034, // Xhosa
    xh_za = 0x0434, // Xhosa, South Africa
    ii = 0x0078, // Yi
    ii_cn = 0x0478, // Yi, People's Republic of China
    yi_001 = 0x043D, // Yiddish, World
    yo = 0x006A, // Yoruba
    yo_ng = 0x046A, // Yoruba, Nigeria
    zu = 0x0035, // Zulu
    zu_za = 0x0435, // Zulu, South Africa

    /// Special case: not a row of the Language ID table; added from the
    /// Alternate sorts table.
    x_iv_mathan = 0x007F, // LANG_INVARIANT, "math alphanumeric sorting"
};