Skip to content

Commit 04b168d

Browse files
committed
compiler: Support long UTF-8 encoded atoms
Support for atoms containing any Unicode code point was added in Erlang/OTP 20 (PR-1078). After that change, an atom can contain up to 255 Unicode code points. However, atoms used in Erlang source code is still limited to 255 bytes because the atom table in the BEAM file only has a byte for holding the length in bytes of the atom text. For instance, the `🟦` character has a four-byte encoding (`<<240,159,159,166>>`), meaning that Erlang source code containing a literal atom consisting of 64 or more such characters cannot be compiled. This commit changes the atom table in BEAM files to use a variable length encoding for the length of each atom. For atoms up to 15 bytes, the length is encoded in one byte. The header for the atom table is also changed to indicate that new encoding of lengths are used. Attempting to load a BEAM file compiled with Erlang/OTP 28 in Erlang/OTP 27 or earlier will result in the following error message: 1> l(t). =ERROR REPORT==== 8-Oct-2024::08:49:01.750424 === beam/beam_load.c(150): Error loading module t: corrupt atom table {error,badfile} `beam_lib` is updated to handle the new format. External tools that use `beam_lib:chunks(Beam, [atoms])` to read the atom table will continue to work. External tools that do their own parsing of the atom table will need to be updated.
1 parent fa80932 commit 04b168d

File tree

6 files changed

+90
-22
lines changed

6 files changed

+90
-22
lines changed

erts/emulator/beam/beam_file.c

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -241,13 +241,18 @@ static int parse_atom_chunk(BeamFile *beam,
241241
BeamReader reader;
242242
Sint32 count;
243243
int i;
244+
bool long_counts = false;
244245

245246
ASSERT(beam->atoms.entries == NULL);
246247
atoms = &beam->atoms;
247248

248249
beamreader_init(chunk->data, chunk->size, &reader);
249250

250251
LoadAssert(beamreader_read_i32(&reader, &count));
252+
if (count < 0) {
253+
long_counts = true;
254+
count = -count;
255+
}
251256
LoadAssert(CHECK_ITEM_COUNT(count, 1, sizeof(atoms->entries[0])));
252257

253258
/* Reserve a slot for the empty list, which is encoded as atom 0 as we
@@ -264,12 +269,21 @@ static int parse_atom_chunk(BeamFile *beam,
264269

265270
for (i = 1; i < count; i++) {
266271
const byte *string;
267-
byte length;
268272
Eterm atom;
273+
Uint length;
269274

270-
LoadAssert(beamreader_read_u8(&reader, &length));
271-
LoadAssert(beamreader_read_bytes(&reader, length, &string));
275+
if (long_counts) {
276+
TaggedNumber len;
277+
LoadAssert(beamreader_read_tagged(&reader, &len));
278+
LoadAssert(len.size == 0);
279+
length = (Uint) len.word_value;
280+
} else {
281+
byte len;
282+
LoadAssert(beamreader_read_u8(&reader, &len));
283+
length = len;
284+
}
272285

286+
LoadAssert(beamreader_read_bytes(&reader, length, &string));
273287
atom = erts_atom_put(string, length, ERTS_ATOM_ENC_UTF8, 1);
274288
LoadAssert(atom != THE_NON_VALUE);
275289

lib/compiler/src/beam_asm.erl

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,7 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, ExtraChunks0, CompileInfo, Com
137137
Code),
138138

139139
%% Create the atom table chunk.
140-
{NumAtoms, AtomTab} = beam_dict:atom_table(Dict),
141-
AtomChunk = chunk(<<"AtU8">>, <<NumAtoms:32>>, AtomTab),
140+
AtomChunk = build_atom_table(CompilerOpts, Dict),
142141

143142
%% Create the import table chunk.
144143

@@ -287,6 +286,19 @@ build_attributes(Attr, Compile, MD5) ->
287286
CompileBinary = term_to_binary([{version,?COMPILER_VSN}|Compile]),
288287
{AttrBinary,CompileBinary}.
289288

289+
build_atom_table(Options, Dict) ->
290+
{NumAtoms, AtomTab0} = beam_dict:atom_table(Dict),
291+
case member(no_long_atoms, Options) of
292+
false ->
293+
%% Build an atom table for Erlang/OTP 28 and later.
294+
AtomTab = [[encode(?tag_u, Len),Text] || [Len,Text] <- AtomTab0],
295+
chunk(<<"AtU8">>, <<-NumAtoms:32>>, AtomTab);
296+
true ->
297+
%% Build an atom table compatible with Erlang/OTP 27
298+
%% and earlier.
299+
chunk(<<"AtU8">>, <<NumAtoms:32>>, AtomTab0)
300+
end.
301+
290302
build_line_table(Dict, Options) ->
291303
{NumLineInstrs,NumFnames0,Fnames0,NumLines,Lines0,ExecLine} =
292304
beam_dict:line_table(Dict),

lib/compiler/src/compile.erl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1040,7 +1040,9 @@ expand_opt(r25, Os) ->
10401040
[no_ssa_opt_update_tuple, no_bs_match, no_min_max_bifs |
10411041
expand_opt(r26, Os)];
10421042
expand_opt(r26, Os) ->
1043-
[no_bsm_opt | Os];
1043+
[no_bsm_opt | expand_opt(r27, Os)];
1044+
expand_opt(r27, Os) ->
1045+
[no_long_atoms | Os];
10441046
expand_opt({debug_info_key,_}=O, Os) ->
10451047
[encrypt_debug_info,O|Os];
10461048
expand_opt(no_type_opt=O, Os) ->

lib/compiler/test/compile_SUITE.erl

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -943,19 +943,31 @@ test_sloppy() ->
943943
Turtle.
944944

945945
utf8_atoms(Config) when is_list(Config) ->
946+
do_utf8_atom(binary_to_atom(<<"こんにちは"/utf8>>, utf8)),
947+
948+
LongAtom = binary_to_atom(binary:copy(<<240,159,159,166>>, 255)),
949+
do_utf8_atom(LongAtom),
950+
951+
ok.
952+
953+
do_utf8_atom(Atom) ->
954+
Mod = ?FUNCTION_NAME,
946955
Anno = erl_anno:new(1),
947-
Atom = binary_to_atom(<<"こんにちは"/utf8>>, utf8),
948-
Forms = [{attribute,Anno,compile,[export_all]},
956+
Forms = [{attribute,Anno,module,Mod},
957+
{attribute,Anno,compile,[export_all]},
949958
{function,Anno,atom,0,[{clause,Anno,[],[],[{atom,Anno,Atom}]}]}],
950959

951-
Utf8AtomForms = [{attribute,Anno,module,utf8_atom}|Forms],
952-
{ok,utf8_atom,Utf8AtomBin} =
953-
compile:forms(Utf8AtomForms, [binary]),
954-
{ok,{utf8_atom,[{atoms,_}]}} =
955-
beam_lib:chunks(Utf8AtomBin, [atoms]),
956-
code:load_binary(utf8_atom, "compile_SUITE", Utf8AtomBin),
957-
Atom = utf8_atom:atom(),
960+
{ok,Mod,Utf8AtomBin} = compile:forms(Forms, [binary,report]),
961+
{ok,{Mod,[{atoms,_}]}} = beam_lib:chunks(Utf8AtomBin, [atoms]),
962+
963+
code:load_binary(Mod, "compile_SUITE", Utf8AtomBin),
964+
965+
Atom = Mod:atom(),
958966
true = is_atom(Atom),
967+
968+
true = code:delete(Mod),
969+
false = code:purge(Mod),
970+
959971
ok.
960972

961973
utf8_functions(Config) when is_list(Config) ->

lib/stdlib/src/beam_lib.erl

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -977,8 +977,7 @@ scan_beam(FD, Pos, What, Mod, Data) ->
977977
get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, Encoding) ->
978978
NewCs = del_chunk(Id, Cs),
979979
{NFD, Chunk} = get_chunk(Id, Pos, Size, FD),
980-
<<_Num:32, Chunk2/binary>> = Chunk,
981-
{Module, _} = extract_atom(Chunk2, Encoding),
980+
Module = extract_module(Chunk, Encoding),
982981
C = case Cs of
983982
info ->
984983
{Id, Pos, Size};
@@ -1202,6 +1201,15 @@ ensure_atoms({empty, AT}, Cs) ->
12021201
ensure_atoms(AT, _Cs) ->
12031202
AT.
12041203

1204+
extract_module(<<Num:32/signed-integer, B/binary>>, utf8) when Num < 0 ->
1205+
{Module, _} = extract_long_atom(B),
1206+
Module;
1207+
extract_module(<<_Num:32/signed-integer, B/binary>>, Encoding) ->
1208+
{Module, _} = extract_atom(B, Encoding),
1209+
Module.
1210+
1211+
extract_atoms(<<Num:32/signed-integer, B/binary>>, AT, utf8) when Num < 0 ->
1212+
extract_long_atoms(B, 1, AT);
12051213
extract_atoms(<<_Num:32, B/binary>>, AT, Encoding) ->
12061214
extract_atoms(B, 1, AT, Encoding).
12071215

@@ -1216,6 +1224,27 @@ extract_atom(<<Len, B/binary>>, Encoding) ->
12161224
<<SB:Len/binary, Tail/binary>> = B,
12171225
{binary_to_atom(SB, Encoding), Tail}.
12181226

1227+
extract_long_atoms(<<>>, _I, _AT) ->
1228+
true;
1229+
extract_long_atoms(B, I, AT) ->
1230+
{Atom, B1} = extract_long_atom(B),
1231+
true = ets:insert(AT, {I, Atom}),
1232+
extract_long_atoms(B1, I+1, AT).
1233+
1234+
extract_long_atom(B0) ->
1235+
{Len, B} = decode_arg_val(B0),
1236+
<<SB:Len/binary, Tail/binary>> = B,
1237+
{binary_to_atom(SB, utf8), Tail}.
1238+
1239+
%% Extract the value from variable-sized tagged argument. Only support
1240+
%% values from 0 through 2047, which is sufficient to handle the
1241+
%% length of atoms in the atom table.
1242+
decode_arg_val(<<N:4,0:1, _Tag:3, Code/binary>>) ->
1243+
{N, Code};
1244+
decode_arg_val(<<High:3,0:1,1:1, _Tag:3, Low, Code0/binary>>) ->
1245+
N = (High bsl 8) bor Low,
1246+
{N, Code0}.
1247+
12191248
%%% Utils.
12201249

12211250
-record(bb, {pos = 0 :: integer(),

lib/stdlib/test/beam_lib_SUITE.erl

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ normal(Conf) when is_list(Conf) ->
9494
P0 = pps(),
9595

9696
do_normal(Source, PrivDir, BeamFile, []),
97+
do_normal(Source, PrivDir, BeamFile, [r27]),
9798

9899
{ok,_} = compile:file(Source, [{outdir,PrivDir}, no_debug_info]),
99100
{ok, {simple, [{debug_info, {debug_info_v1, erl_abstract_code, {none, _}}}]}} =
@@ -118,7 +119,7 @@ do_normal(Source, PrivDir, BeamFile, Opts) ->
118119
do_normal(BeamFile, Opts),
119120
do_normal(Binary, Opts).
120121

121-
do_normal(BeamFile, Opts) ->
122+
do_normal(BeamFile, _Opts) ->
122123
Imports = {imports, [{erlang, get_module_info, 1},
123124
{erlang, get_module_info, 2},
124125
{lists, member, 2}]},
@@ -151,10 +152,8 @@ do_normal(BeamFile, Opts) ->
151152
%% Test reading optional chunks.
152153
All = ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT", "AtU8"],
153154
{ok,{simple,Chunks}} = beam_lib:chunks(BeamFile, All, [allow_missing_chunks]),
154-
case {verify_simple(Chunks),Opts} of
155-
{{missing_chunk, AtomBin}, []} when is_binary(AtomBin) -> ok;
156-
{{AtomBin, missing_chunk}, [no_utf8_atoms]} when is_binary(AtomBin) -> ok
157-
end,
155+
{missing_chunk, AtomBin} = verify_simple(Chunks),
156+
true = is_binary(AtomBin),
158157

159158
%% 'allow_missing_chunks' should work for named chunks too.
160159
{ok, {simple, StrippedBeam}} = beam_lib:strip(BeamFile),

0 commit comments

Comments
 (0)