diff --git a/src/workerd/api/BUILD.bazel b/src/workerd/api/BUILD.bazel index 5d884dab8d7..eba268d382a 100644 --- a/src/workerd/api/BUILD.bazel +++ b/src/workerd/api/BUILD.bazel @@ -432,6 +432,7 @@ wd_cc_library( hdrs = ["encoding.h"], implementation_deps = [ "//src/workerd/util:strings", + "@simdutf", ], visibility = ["//visibility:public"], deps = [ diff --git a/src/workerd/api/encoding.c++ b/src/workerd/api/encoding.c++ index 329bcb7d20d..029d5514549 100644 --- a/src/workerd/api/encoding.c++ +++ b/src/workerd/api/encoding.c++ @@ -4,6 +4,7 @@ #include "encoding.h" +#include "simdutf.h" #include "util.h" #include @@ -11,6 +12,9 @@ #include #include +#include + +#include #include @@ -460,38 +464,584 @@ kj::Maybe TextDecoder::decodePtr( // ======================================================================================= // TextEncoder implementation +namespace { + +[[maybe_unused]] constexpr inline bool isLeadSurrogate(char16_t c) { + return (c & 0xFC00) == 0xD800; +} + +[[maybe_unused]] constexpr inline bool isTrailSurrogate(char16_t c) { + return (c & 0xFC00) == 0xDC00; +} + +// Calculate UTF-8 length from UTF-16 with potentially invalid surrogates. +// Invalid surrogates are counted as U+FFFD (3 bytes in UTF-8). +// Uses SIMD for valid portions and falls back to scalar for invalid surrogates. 
+size_t utf8LengthFromInvalidUtf16(kj::ArrayPtr input) { + size_t inputPos = 0; + size_t utf8Length = 0; + + while (inputPos < input.size()) { + // Find the next invalid surrogate using SIMD validation + auto result = + simdutf::validate_utf16_with_errors(input.begin() + inputPos, input.size() - inputPos); + + if (result.error == simdutf::error_code::SUCCESS) { + // Remaining input is valid - calculate length with SIMD + utf8Length += + simdutf::utf8_length_from_utf16(input.begin() + inputPos, input.size() - inputPos); + break; + } + + if (result.error == simdutf::error_code::SURROGATE) { + // Calculate length for the valid portion before the error with SIMD + if (result.count > 0) { + utf8Length += simdutf::utf8_length_from_utf16(input.begin() + inputPos, result.count); + inputPos += result.count; + } + + // Handle the invalid surrogate at inputPos + // SURROGATE error means unpaired surrogate, so valid pair should be impossible + [[maybe_unused]] char16_t c = input[inputPos]; + KJ_DASSERT(!(isLeadSurrogate(c) && inputPos + 1 < input.size() && + isTrailSurrogate(input[inputPos + 1])), + "Valid surrogate pair should not trigger SURROGATE error"); + + // Invalid surrogate = U+FFFD (3 bytes) + utf8Length += 3; + inputPos++; + } else { + KJ_FAIL_REQUIRE( + "Unexpected UTF-16 validation error from simdutf", static_cast(result.error)); + } + } + + return utf8Length; +} + +// Encode a single UTF-16 code unit to UTF-8 +inline size_t encodeUtf8CodeUnit(char16_t c, kj::ArrayPtr out) { + if (c < 0x80) { + KJ_DASSERT(out.size() >= 1); + out[0] = static_cast(c); + return 1; + } else if (c < 0x800) { + KJ_DASSERT(out.size() >= 2); + out[0] = static_cast(0xC0 | (c >> 6)); + out[1] = static_cast(0x80 | (c & 0x3F)); + return 2; + } else { + KJ_DASSERT(out.size() >= 3); + out[0] = static_cast(0xE0 | (c >> 12)); + out[1] = static_cast(0x80 | ((c >> 6) & 0x3F)); + out[2] = static_cast(0x80 | (c & 0x3F)); + return 3; + } +} + +// Convert UTF-16 with potentially invalid surrogates to 
UTF-8. +// Invalid surrogates are replaced with U+FFFD. +// Returns the number of UTF-8 bytes written. +// Uses SIMD for valid portions and falls back to scalar for invalid surrogates. +size_t convertInvalidUtf16ToUtf8(kj::ArrayPtr input, kj::ArrayPtr out) { + size_t inputPos = 0; + size_t outputPos = 0; + + while (inputPos < input.size()) { + // Find the next invalid surrogate using SIMD validation + auto result = + simdutf::validate_utf16_with_errors(input.begin() + inputPos, input.size() - inputPos); + + if (result.error == simdutf::error_code::SUCCESS) { + // Remaining input is valid - convert it all with SIMD + outputPos += simdutf::convert_utf16_to_utf8( + input.begin() + inputPos, input.size() - inputPos, out.begin() + outputPos); + KJ_DASSERT(outputPos <= out.size()); + break; + } + + if (result.error == simdutf::error_code::SURROGATE) { + // Convert the valid portion before the error with SIMD + if (result.count > 0) { + outputPos += simdutf::convert_valid_utf16_to_utf8( + input.begin() + inputPos, result.count, out.begin() + outputPos); + KJ_DASSERT(outputPos <= out.size()); + inputPos += result.count; + } + + // Handle the invalid surrogate at inputPos + // SURROGATE error means unpaired surrogate, so valid pair should be impossible + [[maybe_unused]] char16_t c = input[inputPos]; + KJ_DASSERT(!(isLeadSurrogate(c) && inputPos + 1 < input.size() && + isTrailSurrogate(input[inputPos + 1])), + "Valid surrogate pair should not trigger SURROGATE error"); + + // Invalid surrogate - replace with U+FFFD (3 bytes) + outputPos += encodeUtf8CodeUnit(0xFFFD, out.slice(outputPos, out.size())); + KJ_DASSERT(outputPos <= out.size()); + inputPos++; + } else { + KJ_FAIL_REQUIRE( + "Unexpected UTF-16 validation error from simdutf", static_cast(result.error)); + } + } + + return outputPos; +} + +} // namespace + jsg::Ref TextEncoder::constructor(jsg::Lock& js) { return js.alloc(); } +jsg::JsUint8Array TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { + 
jsg::JsString str = input.orDefault(js.str()); + std::shared_ptr backingStore; + size_t utf8_length = 0; + auto length = str.length(js); + + // Fast path: check if string is one-byte before creating ValueView + if (str.isOneByte(js)) { + // Use off-heap allocation for intermediate Latin-1 buffer to avoid wasting V8 heap space + // and potentially triggering GC. Stack allocation for small strings, heap for large. + kj::SmallArray latin1Buffer(length); + + [[maybe_unused]] auto writeResult = str.writeInto(js, latin1Buffer.asPtr()); + KJ_DASSERT( + writeResult.written == length, "writeInto must completely overwrite the backing buffer"); + + utf8_length = simdutf::utf8_length_from_latin1( + reinterpret_cast(latin1Buffer.begin()), length); + + if (utf8_length == length) { + // ASCII fast path: no conversion needed, Latin-1 is same as UTF-8 for ASCII + // Allocate final on-heap buffer and copy + backingStore = js.allocBackingStore(length, jsg::Lock::AllocOption::UNINITIALIZED); + memcpy(backingStore->Data(), latin1Buffer.begin(), length); + auto array = v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, length); + return jsg::JsUint8Array(array); + } + + KJ_DASSERT(utf8_length > length); + + // Need to convert Latin-1 to UTF-8 + backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + [[maybe_unused]] auto written = + simdutf::convert_latin1_to_utf8(reinterpret_cast(latin1Buffer.begin()), length, + reinterpret_cast(backingStore->Data())); + KJ_DASSERT(utf8_length == written); + auto array = + v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, utf8_length); + return jsg::JsUint8Array(array); + } + + // Two-byte string path + // Use off-heap allocation for intermediate UTF-16 buffer to avoid triggering GC. + // Stack allocation for small strings, heap for large. 
+ kj::SmallArray utf16Buffer(length); + + // Note: writeInto() doesn't flatten the string - it calls writeTo() which chains through + // Write2 -> WriteV2 -> WriteHelperV2 -> String::WriteToFlat (written by Erik in 2008). + // This means we may read from multiple string segments, but that's fine for our use case. + [[maybe_unused]] auto writeResult = str.writeInto(js, utf16Buffer.asPtr()); + KJ_DASSERT( + writeResult.written == length, "writeInto must completely overwrite the backing buffer"); + + auto data = reinterpret_cast(utf16Buffer.begin()); + utf8_length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)); + + if (!simdutf::validate_utf16(data, length)) { + simdutf::to_well_formed_utf16(data, length, data); + } + + backingStore = js.allocBackingStore(utf8_length, jsg::Lock::AllocOption::UNINITIALIZED); + [[maybe_unused]] auto written = simdutf::convert_valid_utf16_to_utf8( + data, length, reinterpret_cast(backingStore->Data())); + + auto array = + v8::Uint8Array::New(v8::ArrayBuffer::New(js.v8Isolate, backingStore), 0, utf8_length); + return jsg::JsUint8Array(array); +} + namespace { -TextEncoder::EncodeIntoResult encodeIntoImpl( - jsg::Lock& js, jsg::JsString input, jsg::BufferSource& buffer) { - auto result = input.writeInto( - js, buffer.asArrayPtr().asChars(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); - return TextEncoder::EncodeIntoResult{ - .read = static_cast(result.read), - .written = static_cast(result.written), - };
+
+// Find how many Latin-1 characters fit when converted to UTF-8
+// Uses chunked forward scan with SIMD, O(result) complexity
+// Template parameter ReturnLength controls whether to return just position or (position, utf8_length)
+//
+// data/length: the Latin-1 input. bufferSize: capacity of the UTF-8 output in bytes.
+// Returns the number of leading input characters whose UTF-8 encoding fits in bufferSize; with
+// ReturnLength = true, also returns the UTF-8 byte count of exactly that prefix.
+template <bool ReturnLength = false>
+std::conditional_t<ReturnLength, std::pair<size_t, size_t>, size_t> findBestFitLatin1(
+    const char* data, size_t length, size_t bufferSize) {
+  size_t pos = 0;
+  size_t utf8Accumulated = 0;
+  constexpr size_t CHUNK = 256;
+
+  while (pos < length) {
+    size_t remaining = length - pos;
+    size_t chunkSize = kj::min(remaining, CHUNK);
+    size_t chunkUtf8Len = simdutf::utf8_length_from_latin1(data + pos, chunkSize);
+
+    if (utf8Accumulated + chunkUtf8Len > bufferSize) {
+      // Chunk would overflow - bisect within the chunk.
+      // Invariant: the first `fits` characters of the chunk fit (0 always does) and the first
+      // `overflows` characters do not (the whole chunk just failed). The gap shrinks every
+      // round, so the loop terminates, and every length down to 1 gets tested - a search that
+      // stops as soon as its midpoint reaches 0 can skip length 1 and report that nothing fits.
+      size_t fits = 0;
+      size_t fitsUtf8Len = 0;
+      size_t overflows = chunkSize;
+
+      while (overflows - fits > 1) {
+        size_t mid = fits + (overflows - fits) / 2;
+        size_t midUtf8Length = simdutf::utf8_length_from_latin1(data + pos, mid);
+
+        if (utf8Accumulated + midUtf8Length <= bufferSize) {
+          fits = mid;
+          fitsUtf8Len = midUtf8Length;
+        } else {
+          overflows = mid;
+        }
+      }
+
+      if constexpr (ReturnLength) {
+        return {pos + fits, utf8Accumulated + fitsUtf8Len};
+      } else {
+        return pos + fits;
+      }
+    }
+
+    utf8Accumulated += chunkUtf8Len;
+    pos += chunkSize;
+  }
+
+  // Every chunk fit: the whole input can be written.
+  if constexpr (ReturnLength) {
+    return {pos, utf8Accumulated};
+  } else {
+    return pos;
+  }
 }
-} // namespace -jsg::BufferSource TextEncoder::encode(jsg::Lock& js, jsg::Optional input) { - auto str = input.orDefault(js.str()); - auto view = JSG_REQUIRE_NONNULL(jsg::BufferSource::tryAlloc(js, str.utf8Length(js)), RangeError, - "Cannot allocate space for TextEncoder.encode"); - [[maybe_unused]] auto result = encodeIntoImpl(js, str, view); - KJ_DASSERT(result.written == view.size()); - return kj::mv(view);
+// Find how many UTF-16 code units fit when converted to UTF-8
+// Uses chunked forward scan with SIMD, O(result) complexity. Never splits surrogate pairs.
+// Template parameter ReturnLength controls whether to return just position or (position, utf8_length)
+//
+// data/length: well-formed UTF-16 input. bufferSize: capacity of the UTF-8 output in bytes.
+// Returns the number of leading code units whose UTF-8 encoding fits in bufferSize; with
+// ReturnLength = true, also returns the UTF-8 byte count of exactly that prefix.
+template <bool ReturnLength = false>
+std::conditional_t<ReturnLength, std::pair<size_t, size_t>, size_t> findBestFitUtf16(
+    const char16_t* data, size_t length, size_t bufferSize) {
+  size_t pos = 0;
+  size_t utf8Accumulated = 0;
+  constexpr size_t CHUNK = 256;
+
+  while (pos < length) {
+    size_t remaining = length - pos;
+    // Trim so a chunk never ends on the lead half of a surrogate pair.
+    size_t chunkSize = simdutf::trim_partial_utf16(data + pos, kj::min(remaining, CHUNK));
+
+    if (chunkSize == 0) {
+      chunkSize = (remaining >= 2) ? 2 : remaining;
+    }
+
+    size_t chunkUtf8Len = simdutf::utf8_length_from_utf16(data + pos, chunkSize);
+
+    if (utf8Accumulated + chunkUtf8Len > bufferSize) {
+      // Chunk would overflow - bisect within the chunk.
+      // Invariant: the trimmed prefix of `fits` code units fits (0 always does) and the prefix
+      // of `overflows` code units does not (the whole chunk just failed).
+      //
+      // The bounds are narrowed with the UNTRIMMED midpoint. Trimming can map `mid` to
+      // `mid - 1`; narrowing with the trimmed value can then leave the bounds unchanged and
+      // loop forever (e.g. "a", one surrogate pair, "bc" into a 4-byte buffer).
+      size_t fits = 0;
+      size_t overflows = chunkSize;
+      size_t bestFit = 0;
+      size_t bestFitUtf8Len = 0;
+
+      while (overflows - fits > 1) {
+        size_t mid = fits + (overflows - fits) / 2;
+        // Candidate cut point: never leaves a lead surrogate dangling at the end.
+        size_t adjustedMid = simdutf::trim_partial_utf16(data + pos, mid);
+        size_t midUtf8Length = simdutf::utf8_length_from_utf16(data + pos, adjustedMid);
+
+        if (utf8Accumulated + midUtf8Length <= bufferSize) {
+          bestFit = adjustedMid;
+          bestFitUtf8Len = midUtf8Length;
+          fits = mid;
+        } else {
+          overflows = mid;
+        }
+      }
+
+      if constexpr (ReturnLength) {
+        return {pos + bestFit, utf8Accumulated + bestFitUtf8Len};
+      } else {
+        return pos + bestFit;
+      }
+    }
+
+    utf8Accumulated += chunkUtf8Len;
+    pos += chunkSize;
+  }
+
+  // Every chunk fit: the whole input can be written.
+  if constexpr (ReturnLength) {
+    return {pos, utf8Accumulated};
+  } else {
+    return pos;
+  }
+}
+
+// Find how many UTF-16 code units with invalid surrogates fit when converted to UTF-8
+// Uses chunked forward scan with SIMD, O(result) complexity. Never splits surrogate pairs.
+// Unpaired surrogates replaced with U+FFFD.
+size_t findBestFitInvalidUtf16(const char16_t* data, size_t length, size_t bufferSize) {
+  // data/length: UTF-16 input that may contain unpaired surrogates (each one is sized as
+  // U+FFFD by utf8LengthFromInvalidUtf16). bufferSize: UTF-8 output capacity in bytes.
+  // Returns the number of leading code units whose UTF-8 encoding fits in bufferSize.
+  size_t pos = 0;
+  size_t utf8Accumulated = 0;
+  constexpr size_t CHUNK = 256;
+
+  while (pos < length) {
+    size_t remaining = length - pos;
+    // Trim so a chunk never ends on a lead surrogate whose trail half may follow.
+    size_t chunkSize = simdutf::trim_partial_utf16(data + pos, kj::min(remaining, CHUNK));
+
+    if (chunkSize == 0) {
+      chunkSize = (remaining >= 2) ? 2 : remaining;
+    }
+
+    size_t chunkUtf8Len = utf8LengthFromInvalidUtf16(kj::arrayPtr(data + pos, chunkSize));
+
+    if (utf8Accumulated + chunkUtf8Len > bufferSize) {
+      // Chunk would overflow - bisect within the chunk.
+      // Invariant: the trimmed prefix of `fits` code units fits (0 always does) and the prefix
+      // of `overflows` code units does not (the whole chunk just failed).
+      //
+      // The bounds are narrowed with the UNTRIMMED midpoint. Trimming can map `mid` to
+      // `mid - 1`; narrowing with the trimmed value can then leave the bounds unchanged and
+      // loop forever.
+      size_t fits = 0;
+      size_t overflows = chunkSize;
+      size_t bestFit = 0;
+
+      while (overflows - fits > 1) {
+        size_t mid = fits + (overflows - fits) / 2;
+        size_t adjustedMid = simdutf::trim_partial_utf16(data + pos, mid);
+        size_t midUtf8Length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data + pos, adjustedMid));
+
+        if (utf8Accumulated + midUtf8Length <= bufferSize) {
+          bestFit = adjustedMid;
+          fits = mid;
+        } else {
+          overflows = mid;
+        }
+      }
+
+      return pos + bestFit;
+    }
+
+    utf8Accumulated += chunkUtf8Len;
+    pos += chunkSize;
+  }
+
+  return pos;
 }
+} // namespace + TextEncoder::EncodeIntoResult TextEncoder::encodeInto( jsg::Lock& js, jsg::JsString input, jsg::JsUint8Array buffer) { - auto result = input.writeInto( - js, buffer.asArrayPtr(), jsg::JsString::WriteFlags::REPLACE_INVALID_UTF8); + auto outputBuf = buffer.asArrayPtr(); + size_t bufferSize = outputBuf.size(); + + v8::String::ValueView view(js.v8Isolate, input); + uint32_t length = view.length(); + + if (view.is_one_byte()) { + // Latin-1 path: characters 0x00-0x7F encode as 1 UTF-8 byte, 0x80-0xFF as 2 bytes + auto data = reinterpret_cast(view.data8()); + + // Latin-1 encoding strategy: three zones based on input size vs buffer capacity + // + // For Latin-1: ASCII chars (0x00-0x7F) → 1 byte, extended chars (0x80-0xFF) → 2 bytes +
// Worst-case expansion: 2x, Best-case: 1x (pure ASCII), Typical mixed: ~1.2-1.5x + // + // Zone 1: "Definitely doesn't fit" (length > bufferSize * 2) + // Even if all ASCII (best case 1:1), string won't fit. Go straight to incremental mode. + // Uses forward scan without length calculation for maximum efficiency. + // Example: 1M chars, 400k buffer → can't possibly fit, scan to find cutoff point + // + // Zone 2: "Definitely fits" (length * 2 <= bufferSize) + // Even if all extended Latin-1 (worst case 1:2), string will fit. Convert directly. + // Example: 100k chars, 250k buffer → worst case 200k bytes, guaranteed to fit + // + // Zone 3: "Maybe fits" (bufferSize < length * 2 AND length <= bufferSize * 2) + // Might fit depending on ASCII/extended ratio. Use forward scan with length calculation. + // Avoids redundant work: scanning once gets us both position and UTF-8 length. + // Example: 600k chars, 700k buffer → fits if mostly ASCII, doesn't if mixed + // + // Threshold selection (bufferSize * 2): + // - Chosen based on worst-case Latin-1 expansion of 2x + // - Optimized for common case: small buffer relative to input (SSR, streaming) + // - Trade-off: Zone 3 still does forward scan, but with length calculation overhead + // - Performance cliff exists for borderline cases (e.g., 1M chars, 500k buffer falls + // into Zone 3), but forward scan with length is still reasonably efficient + // + // Future optimization: Could use sampling to estimate ASCII ratio and choose zone + // dynamically, but adds complexity for marginal benefit in typical workloads. 
+ + if (length > bufferSize * 2) { + // Zone 1: Incremental mode - forward scan to find what fits, then convert + size_t read = findBestFitLatin1(data, length, bufferSize); + size_t written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(read), + .written = static_cast(written), + }; + } + + if (length * 2 <= bufferSize) { + // Zone 2: Fast path - worst-case (2x) definitely fits, convert directly + size_t written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + // Zone 3: "Maybe fits" - use forward scan with length calculation to avoid double-scan + auto [read, utf8Length] = findBestFitLatin1(data, length, bufferSize); + + // Check if everything fit + if (read == length) { + // ASCII fast path: utf8Length == length means all chars are ASCII, no conversion needed + if (utf8Length == length) { + memcpy(outputBuf.begin(), data, length); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(length), + }; + } + + auto written = simdutf::convert_latin1_to_utf8(data, length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + // Partial fit: convert only what fits + size_t written = simdutf::convert_latin1_to_utf8(data, read, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(read), + .written = static_cast(written), + }; + } + + // UTF-16 path: validate to ensure spec compliance (replace invalid surrogates with U+FFFD) + auto data = reinterpret_cast(view.data16()); + + if (simdutf::validate_utf16(data, length)) { + // Valid UTF-16: use fast SIMD conversion + // + // UTF-16 to UTF-8 encoding: variable expansion based on code point ranges + // U+0000-U+007F (ASCII): 1 byte (rare in two-byte strings) + 
// U+0080-U+07FF: 2 bytes (most common) + // U+0800-U+FFFF (BMP): 3 bytes (common: CJK, etc.) + // U+10000-U+10FFFF (surrogate pairs): 4 bytes (less common: emoji, etc.) + // Worst-case: 3 bytes per code unit (BMP chars), Typical: ~2-3 bytes per code unit + // + // Zone 1: "Definitely doesn't fit" (length > bufferSize) + // Conservative threshold: even if all ASCII (impossible for two-byte strings), won't fit. + // This differs from Latin-1 (bufferSize * 2) due to different typical expansion patterns. + // Example: 1M code units, 900k buffer → can't fit, use incremental mode + // + // Zone 2: "Definitely fits" (length * 3 <= bufferSize) + // Even if all BMP characters (worst case 1:3), string will fit. Convert directly. + // Example: 200k code units, 700k buffer → worst case 600k bytes, guaranteed to fit + // + // Zone 3: "Maybe fits" (bufferSize < length * 3 AND length <= bufferSize) + // Might fit depending on character distribution. Use forward scan with length calculation. + // Example: 300k code units, 800k buffer → fits if mostly 2-byte chars, doesn't if BMP + // + // Threshold selection (bufferSize vs bufferSize * 3): + // - Zone 1 threshold (length > bufferSize) is conservative: even 1:1 ratio won't fit + // - More aggressive than Latin-1 because UTF-16 typical expansion is higher (~2-3x) + // - Zone 3 (maybe fits) is large: from bufferSize to bufferSize * 3 + // - Optimized for common case where UTF-16 strings are mostly 2-3 byte encodings + // - Performance cliff: Zone 3 still uses forward scan with length calculation overhead + + if (length > bufferSize) { + // Zone 1: Incremental mode - forward scan to find what fits, then convert + size_t read = findBestFitUtf16(data, length, bufferSize); + size_t written = simdutf::convert_utf16_to_utf8(data, read, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(read), + .written = static_cast(written), + }; + } + + if (length * 3 <= bufferSize) { + // Zone 2: Fast path - worst-case 
(3x) definitely fits, convert directly + size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + // Zone 3: "Maybe fits" - use forward scan with length calculation to avoid double-scan + auto [read, utf8Length] = findBestFitUtf16(data, length, bufferSize); + + if (read == length) { + // Everything fit: convert entire string with SIMD + size_t written = simdutf::convert_utf16_to_utf8(data, length, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + // Partial fit: convert only what fits + size_t written = simdutf::convert_utf16_to_utf8(data, read, outputBuf.begin()); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(read), + .written = static_cast(written), + }; + } + + // Invalid UTF-16: convert directly to UTF-8, replacing unpaired surrogates with U+FFFD + + // Incremental mode: buffer much smaller than input, skip "whole string fits" checks + if (length > bufferSize) { + size_t read = findBestFitInvalidUtf16(data, length, bufferSize); + size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, read), outputBuf); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(read), + .written = static_cast(written), + }; + } + + // Fast path: worst-case (3 bytes per UTF-16 code unit) fits + if (length * 3 <= bufferSize) { + size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, length), outputBuf); + return TextEncoder::EncodeIntoResult{ + .read = static_cast(length), + .written = static_cast(written), + }; + } + + // Slow path: calculate exact UTF-8 length + size_t utf8Length = utf8LengthFromInvalidUtf16(kj::arrayPtr(data, length)); + if (utf8Length <= bufferSize) { + size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, length), outputBuf); + return TextEncoder::EncodeIntoResult{ + .read = 
static_cast(length), + .written = static_cast(written), + }; + } + + // Doesn't fit: forward scan to find what does + size_t read = findBestFitInvalidUtf16(data, length, bufferSize); + size_t written = convertInvalidUtf16ToUtf8(kj::arrayPtr(data, read), outputBuf); return TextEncoder::EncodeIntoResult{ - .read = static_cast(result.read), - .written = static_cast(result.written), + .read = static_cast(read), + .written = static_cast(written), }; } diff --git a/src/workerd/api/encoding.h b/src/workerd/api/encoding.h index 81d6899e5ce..c2425c4c153 100644 --- a/src/workerd/api/encoding.h +++ b/src/workerd/api/encoding.h @@ -216,7 +216,7 @@ class TextEncoder final: public jsg::Object { static jsg::Ref constructor(jsg::Lock& js); - jsg::BufferSource encode(jsg::Lock& js, jsg::Optional input); + jsg::JsUint8Array encode(jsg::Lock& js, jsg::Optional input); EncodeIntoResult encodeInto(jsg::Lock& js, jsg::JsString input, jsg::JsUint8Array buffer); @@ -234,11 +234,7 @@ class TextEncoder final: public jsg::Object { JSG_READONLY_INSTANCE_PROPERTY(encoding, getEncoding); } - // `encode()` returns `jsg::BufferSource`, which may be an `ArrayBuffer` or `ArrayBufferView`, - // but the implementation uses `jsg::BufferSource::tryAlloc()` which always tries to allocate a - // `Uint8Array`. The spec defines that this function returns a `Uint8Array` too. JSG_TS_OVERRIDE({ - encode(input?: string): Uint8Array; encodeInto(input: string, buffer: Uint8Array): TextEncoderEncodeIntoResult; }); } diff --git a/src/workerd/jsg/buffersource.h b/src/workerd/jsg/buffersource.h index f2e8351de97..b15ebef25f3 100644 --- a/src/workerd/jsg/buffersource.h +++ b/src/workerd/jsg/buffersource.h @@ -102,9 +102,10 @@ class BackingStore { // Creates a new BackingStore of the given size. 
template - static BackingStore alloc(Lock& js, size_t size) { - return BackingStore(js.allocBackingStore(size), size, 0, getBufferSourceElementSize(), - construct, checkIsIntegerType()); + static BackingStore alloc( + Lock& js, size_t size, Lock::AllocOption init_mode = Lock::AllocOption::ZERO_INITIALIZED) { + return BackingStore(js.allocBackingStore(size, init_mode), size, 0, + getBufferSourceElementSize(), construct, checkIsIntegerType()); } using Disposer = void(void*, size_t, void*); diff --git a/src/workerd/jsg/jsg.h b/src/workerd/jsg/jsg.h index 4af60a50871..e6e23da08f2 100644 --- a/src/workerd/jsg/jsg.h +++ b/src/workerd/jsg/jsg.h @@ -2766,6 +2766,14 @@ class Lock { // Utility method to safely allocate a v8::BackingStore with allocation failure handling. // Throws a javascript error if allocation fails. + // + // IMPORTANT: This method can trigger garbage collection, which may move or invalidate V8 + // objects. Do NOT call this method while: + // - A v8::String::ValueView is alive (it holds internal V8 heap locks) + // - You have raw pointers to V8 heap data (e.g., from view.data8(), view.data16()) + // + // Safe pattern: Copy V8 string data to off-heap memory FIRST (e.g., via JsString::writeInto() + // into kj::SmallArray), THEN call allocBackingStore(). See TextEncoder::encode() for example. std::unique_ptr allocBackingStore( size_t size, AllocOption init_mode = AllocOption::ZERO_INITIALIZED) KJ_WARN_UNUSED_RESULT; diff --git a/src/workerd/jsg/jsvalue.h b/src/workerd/jsg/jsvalue.h index 23f94a7ac36..36c7a311f4c 100644 --- a/src/workerd/jsg/jsvalue.h +++ b/src/workerd/jsg/jsvalue.h @@ -280,6 +280,7 @@ class JsString final: public JsBase { int hashCode() const; bool isFlat() const; + bool isOneByte(Lock& js) const KJ_WARN_UNUSED_RESULT; bool containsOnlyOneByte() const; bool operator==(const JsString& other) const; @@ -307,6 +308,12 @@ class JsString final: public JsBase { // The number of elements (e.g. char, byte, uint16_t) written to the buffer. 
size_t written; }; + + // Copy string contents into a provided buffer (off-heap memory). + // + // IMPORTANT: This method does NOT flatten the V8 string or hold V8 heap locks. It safely + // copies data out of V8's heap into your buffer. This makes it safe to use before calling + // GC-triggering operations like Lock::allocBackingStore(). WriteIntoStatus writeInto( Lock& js, kj::ArrayPtr buffer, WriteFlags options = WriteFlags::NONE) const; WriteIntoStatus writeInto( @@ -988,6 +995,10 @@ inline int JsString::length(jsg::Lock& js) const { return inner->Length(); } +inline bool JsString::isOneByte(jsg::Lock& js) const { + return inner->IsOneByte(); +} + inline size_t JsString::utf8Length(jsg::Lock& js) const { return inner->Utf8LengthV2(js.v8Isolate); }