1111#include < string>
1212#include < string_view>
1313#include < vector>
14+ #include " embedded_data.h"
1415#include " executable_wrapper.h"
1516#include " simdutf.h"
1617#include " uv.h"
@@ -396,11 +397,14 @@ const std::string& GetCode(uint16_t index) {
396397
397398#ifdef NODE_JS2C_USE_STRING_LITERALS
398399const char * string_literal_def_template = " static const %s *%s_raw = " ;
400+ constexpr std::string_view latin1_string_literal_start =
401+ " reinterpret_cast<const uint8_t*>(\" " ;
399402constexpr std::string_view ascii_string_literal_start =
400403 " reinterpret_cast<const uint8_t*>(R\" JS2C1b732aee(" ;
401404constexpr std::string_view utf16_string_literal_start =
402405 " reinterpret_cast<const uint16_t*>(uR\" JS2C1b732aee(" ;
403- constexpr std::string_view string_literal_end = " )JS2C1b732aee\" );" ;
406+ constexpr std::string_view latin1_string_literal_end = " \" );" ;
407+ constexpr std::string_view utf_string_literal_end = " )JS2C1b732aee\" );" ;
404408#else
405409const char * array_literal_def_template = " static const %s %s_raw[] = " ;
406410constexpr std::string_view array_literal_start = " {\n " ;
@@ -424,9 +428,15 @@ constexpr std::string_view array_literal_end = "\n};\n\n";
424428// If NODE_JS2C_USE_STRING_LITERALS is defined, the data is output as C++
425429// raw strings (i.e. R"JS2C1b732aee(...)JS2C1b732aee") rather than as an
426430// array. This speeds up compilation for gcc/clang.
// Describes how the characters of an embedded source are represented in the
// generated definition. The enumerators are listed from the narrowest
// representation to the widest.
enum class CodeType : uint8_t {
  kAscii,    // Code points are all within 0-127.
  kLatin1,   // Code points are all within 0-255.
  kTwoByte,  // At least one code point is above 255; emitted as UTF-16.
};
427436template <typename T>
428437Fragment GetDefinitionImpl (const std::vector<char >& code,
429- const std::string& var) {
438+ const std::string& var,
439+ CodeType type) {
430440 constexpr bool is_two_byte = std::is_same_v<T, uint16_t >;
431441 static_assert (is_two_byte || std::is_same_v<T, char >);
432442
@@ -440,11 +450,14 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
440450
441451#ifdef NODE_JS2C_USE_STRING_LITERALS
442452 const char * literal_def_template = string_literal_def_template;
443- size_t def_size = 512 + code.size ();
453+ // For code that contains Latin-1 characters, be conservative and assume
454+ // they all need escaping: one "\" and three digits.
455+ size_t unit = type == CodeType::kLatin1 ? 4 : 1 ;
456+ size_t def_size = 512 + code.size () * unit;
444457#else
445458 const char * literal_def_template = array_literal_def_template;
446459 constexpr size_t unit =
447- (is_two_byte ? 5 : 3 ) + 1 ; // 0-65536 or 0-127 and a ","
460+ (is_two_byte ? 5 : 3 ) + 1 ; // 0-65536 or 0-255 and a ","
448461 size_t def_size = 512 + count * unit;
449462#endif
450463
@@ -456,16 +469,56 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
456469 assert (cur != 0 );
457470
458471#ifdef NODE_JS2C_USE_STRING_LITERALS
459- constexpr std::string_view start_string_view =
460- is_two_byte ? utf16_string_literal_start : ascii_string_literal_start;
472+ std::string_view start_string_view;
473+ switch (type) {
474+ case CodeType::kAscii :
475+ start_string_view = ascii_string_literal_start;
476+ break ;
477+ case CodeType::kLatin1 :
478+ start_string_view = latin1_string_literal_start;
479+ break ;
480+ case CodeType::kTwoByte :
481+ start_string_view = utf16_string_literal_start;
482+ break ;
483+ }
461484
462485 memcpy (
463486 result.data () + cur, start_string_view.data (), start_string_view.size ());
464487 cur += start_string_view.size ();
465488
466- memcpy (result.data () + cur, code.data (), code.size ());
467- cur += code.size ();
489+ if (type != CodeType::kLatin1 ) {
490+ memcpy (result.data () + cur, code.data (), code.size ());
491+ cur += code.size ();
492+ } else {
493+ const uint8_t * ptr = reinterpret_cast <const uint8_t *>(code.data ());
494+ for (size_t i = 0 ; i < count; ++i) {
495+ // Avoid using snprintf on large chunks of data because it's much slower.
496+ // It's fine to use it on small amount of data though.
497+ uint8_t ch = ptr[i];
498+ if (ch > 127 ) {
499+ Debug (" In %s, found non-ASCII Latin-1 character at %zu: %d\n " ,
500+ var.c_str (),
501+ i,
502+ ch);
503+ }
504+ const std::string& str = GetOctalCode (ch);
505+ memcpy (result.data () + cur, str.c_str (), str.size ());
506+ cur += str.size ();
507+ }
508+ }
468509
510+ std::string_view string_literal_end;
511+ switch (type) {
512+ case CodeType::kAscii :
513+ string_literal_end = utf_string_literal_end;
514+ break ;
515+ case CodeType::kLatin1 :
516+ string_literal_end = latin1_string_literal_end;
517+ break ;
518+ case CodeType::kTwoByte :
519+ string_literal_end = utf_string_literal_end;
520+ break ;
521+ }
469522 memcpy (result.data () + cur,
470523 string_literal_end.data (),
471524 string_literal_end.size ());
@@ -476,10 +529,10 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
476529 array_literal_start.size ());
477530 cur += array_literal_start.size ();
478531
479- const std::vector<T>* codepoints;
480-
481- std::vector<uint16_t > utf16_codepoints;
532+ // Avoid using snprintf on large chunks of data because it's much slower.
533+ // It's fine to use it on small amount of data though.
482534 if constexpr (is_two_byte) {
535+ std::vector<uint16_t > utf16_codepoints;
483536 utf16_codepoints.resize (count);
484537 size_t utf16_count = simdutf::convert_utf8_to_utf16 (
485538 code.data (),
@@ -488,19 +541,25 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
488541 assert (utf16_count != 0 );
489542 utf16_codepoints.resize (utf16_count);
490543 Debug (" static size %zu\n " , utf16_count);
491- codepoints = &utf16_codepoints;
544+ for (size_t i = 0 ; i < utf16_count; ++i) {
545+ const std::string& str = GetCode (utf16_codepoints[i]);
546+ memcpy (result.data () + cur, str.c_str (), str.size ());
547+ cur += str.size ();
548+ }
492549 } else {
493- // The code is ASCII, so no need to translate.
494- codepoints = &code;
495- }
496-
497- for (size_t i = 0 ; i < codepoints->size (); ++i) {
498- // Avoid using snprintf on large chunks of data because it's much slower.
499- // It's fine to use it on small amount of data though.
500- const std::string& str = GetCode (static_cast <uint16_t >((*codepoints)[i]));
501-
502- memcpy (result.data () + cur, str.c_str (), str.size ());
503- cur += str.size ();
550+ const uint8_t * ptr = reinterpret_cast <const uint8_t *>(code.data ());
551+ for (size_t i = 0 ; i < count; ++i) {
552+ uint16_t ch = static_cast <uint16_t >(ptr[i]);
553+ if (ch > 127 ) {
554+ Debug (" In %s, found non-ASCII Latin-1 character at %zu: %d\n " ,
555+ var.c_str (),
556+ i,
557+ ch);
558+ }
559+ const std::string& str = GetCode (ch);
560+ memcpy (result.data () + cur, str.c_str (), str.size ());
561+ cur += str.size ();
562+ }
504563 }
505564
506565 memcpy (
@@ -520,17 +579,80 @@ Fragment GetDefinitionImpl(const std::vector<char>& code,
520579 return result;
521580}
522581
523- Fragment GetDefinition (const std::string& var, const std::vector<char >& code) {
524- Debug (" GetDefinition %s, code size %zu " , var.c_str (), code.size ());
525- bool is_one_byte = simdutf::validate_ascii (code.data (), code.size ());
526- Debug (" with %s\n " , is_one_byte ? " 1-byte chars" : " 2-byte chars" );
582+ bool Simplify (const std::vector<char >& code,
583+ const std::string& var,
584+ std::vector<char >* simplified) {
585+ // Allowlist files to avoid false positives.
586+ // TODO(joyeecheung): this could be removed if undici updates itself
587+ // to replace "’" with "'" though we could still keep this skeleton in
588+ // place for future hot fixes that are verified by humans.
589+ if (var != " internal_deps_undici_undici" ) {
590+ return false ;
591+ }
527592
528- if (is_one_byte) {
529- Debug (" static size %zu\n " , code.size ());
530- return GetDefinitionImpl<char >(code, var);
531- } else {
532- return GetDefinitionImpl<uint16_t >(code, var);
593+ size_t code_size = code.size ();
594+ simplified->reserve (code_size);
595+ const uint8_t * ptr = reinterpret_cast <const uint8_t *>(code.data ());
596+ size_t simplified_count = 0 ;
597+ for (size_t i = 0 ; i < code_size; ++i) {
598+ switch (ptr[i]) {
599+ case 226 : { // ’ [ 226, 128, 153 ] -> '
600+ if (i + 2 < code_size && ptr[i + 1 ] == 128 && ptr[i + 2 ] == 153 ) {
601+ simplified->push_back (' \' ' );
602+ i += 2 ;
603+ simplified_count++;
604+ break ;
605+ }
606+ }
607+ default : {
608+ simplified->push_back (code[i]);
609+ break ;
610+ }
611+ }
533612 }
613+
614+ if (simplified_count > 0 ) {
615+ Debug (" Simplified %d characters, " , simplified_count);
616+ Debug (" old size %d, new size %d\n " , code_size, simplified->size ());
617+ return true ;
618+ }
619+ return false ;
620+ }
621+
622+ Fragment GetDefinition (const std::string& var, const std::vector<char >& code) {
623+ Debug (" GetDefinition %s, code size %zu\n " , var.c_str (), code.size ());
624+ bool is_ascii = simdutf::validate_ascii (code.data (), code.size ());
625+
626+ if (is_ascii) {
627+ Debug (" ASCII-only, static size %zu\n " , code.size ());
628+ return GetDefinitionImpl<char >(code, var, CodeType::kAscii );
629+ }
630+
631+ std::vector<char > latin1 (code.size ());
632+ auto result = simdutf::convert_utf8_to_latin1_with_errors (
633+ code.data (), code.size (), latin1.data ());
634+ if (!result.error ) {
635+ latin1.resize (result.count );
636+ Debug (" Latin-1-only, old size %zu, new size %zu\n " ,
637+ code.size (),
638+ latin1.size ());
639+ return GetDefinitionImpl<char >(latin1, var, CodeType::kLatin1 );
640+ }
641+
642+ // Since V8 only supports Latin-1 and UTF16 as underlying representation
643+ // we have to encode all files containing two-byte characters as UTF16.
644+ // While some files do need two-byte characters, some just
645+ // unintentionally have them. Replace certain characters that are known
646+ // to have sane one-byte equivalent to save space.
647+ std::vector<char > simplified;
648+ if (Simplify (code, var, &simplified)) { // Changed.
649+ Debug (" %s is simplified, re-generate definition\n " , var.c_str ());
650+ return GetDefinition (var, simplified);
651+ }
652+
653+ // Simplification did not turn the code into 1-byte string. Just
654+ // use the original.
655+ return GetDefinitionImpl<uint16_t >(code, var, CodeType::kTwoByte );
534656}
535657
536658int AddModule (const std::string& filename,
0 commit comments