Skip to content

Add additional unicode escape support #4296

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
188 changes: 161 additions & 27 deletions pkg/decoders/escaped_unicode.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,33 @@ var (

// Common escape sequence used in programming languages.
escapePat = regexp.MustCompile(`(?i:\\{1,2}u)([a-fA-F0-9]{4})`)

// Additional Unicode escape formats from dencode.com

// \u{X} format - Rust, Swift, some JS, etc. (variable length hex in braces)
braceEscapePat = regexp.MustCompile(`\\u\{([a-fA-F0-9]{1,6})\}`)

// \U00XXXXXX format - Python, etc. (8-digit format for non-BMP characters)
longEscapePat = regexp.MustCompile(`\\U([a-fA-F0-9]{8})`)

// \x{X} format - Perl (variable length hex in braces)
perlEscapePat = regexp.MustCompile(`\\x\{([a-fA-F0-9]{1,6})\}`)

// \X format - CSS (hex without padding). Go's regexp (RE2) has no look-ahead, so we
// include the delimiter (whitespace, another backslash, or end-of-string) in the
// match using a non-capturing group. The delimiter is later re-inserted by the
// decoder when necessary.
cssEscapePat = regexp.MustCompile(`\\([a-fA-F0-9]{1,6})(?:\s|\\|$)`)
Comment on lines +35 to +42
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If these are specific to Perl and CSS I think we could probably omit them for better performance


// &#xX; format - HTML/XML (hex with semicolon)
htmlEscapePat = regexp.MustCompile(`&#x([a-fA-F0-9]{1,6});`)

// %uXXXX format - Percent-encoding (non-standard)
percentEscapePat = regexp.MustCompile(`%u([a-fA-F0-9]{4})`)

// // 0xX format - Hexadecimal notation with space separation
// Note: Commenting out for now due to high memory overhead. Review ways to handle this.
// hexEscapePat = regexp.MustCompile(`0x([a-fA-F0-9]{1,6})(?:\s|$)`)
)

func (d *EscapedUnicode) Type() detectorspb.DecoderType {
Expand All @@ -39,13 +66,38 @@ func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
chunkData = bytes.Clone(chunk.Data)
matched = false
)
if codePointPat.Match(chunkData) {

// Process patterns in priority order - more specific patterns first
// This prevents conflicts where multiple patterns match the same input

// Long escape format (8 hex digits) - highest priority
if longEscapePat.Match(chunkData) {
matched = true
chunkData = decodeCodePoint(chunkData)
}
if escapePat.Match(chunkData) {
chunkData = decodeLongEscape(chunkData)
} else if braceEscapePat.Match(chunkData) {
matched = true
chunkData = decodeBraceEscape(chunkData)
} else if perlEscapePat.Match(chunkData) {
matched = true
chunkData = decodePerlEscape(chunkData)
} else if htmlEscapePat.Match(chunkData) {
matched = true
chunkData = decodeHtmlEscape(chunkData)
} else if percentEscapePat.Match(chunkData) {
matched = true
chunkData = decodePercentEscape(chunkData)
} else if escapePat.Match(chunkData) {
matched = true
chunkData = decodeEscaped(chunkData)
} else if codePointPat.Match(chunkData) {
matched = true
chunkData = decodeCodePoint(chunkData)
} else if cssEscapePat.Match(chunkData) {
matched = true
chunkData = decodeCssEscape(chunkData)
// } else if hexEscapePat.Match(chunkData) {
// matched = true
// chunkData = decodeHexEscape(chunkData)
}

if matched {
Expand All @@ -71,6 +123,34 @@ func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
const maxBytesPerRune = 4
const spaceChar = byte(' ')

// decodeWithPattern replaces every escape sequence matched by re with the
// UTF-8 encoding of the code point it names.
//
// The first capturing group of re must contain the hexadecimal code-point
// digits; the whole match (group 0) is what gets replaced. A match is left in
// the output verbatim when its digits fail to parse, when the value exceeds
// utf8.MaxRune (0x10FFFF), or when the first group did not participate.
//
// The result is assembled in a single forward pass into a fresh buffer, so the
// cost is linear in len(input) and the caller's slice is never written to.
// When nothing matches, input itself is returned without copying.
//
// Note: surrogate halves (0xD800-0xDFFF) are not valid runes, so
// utf8.AppendRune encodes them as U+FFFD.
func decodeWithPattern(input []byte, re *regexp.Regexp) []byte {
	indices := re.FindAllSubmatchIndex(input, -1)
	if len(indices) == 0 {
		return input
	}

	out := make([]byte, 0, len(input))
	prev := 0 // end of the last region already copied into out
	for _, m := range indices {
		// Guard against patterns without a (participating) first group.
		if len(m) < 4 || m[2] < 0 {
			continue
		}
		start, end := m[0], m[1]

		cp, err := strconv.ParseUint(string(input[m[2]:m[3]]), 16, 32)
		if err != nil || cp > utf8.MaxRune {
			// Not a usable code point: the raw text stays in place and is
			// copied along with the next literal region.
			continue
		}

		out = append(out, input[prev:start]...)
		out = utf8.AppendRune(out, rune(cp))
		prev = end
	}
	return append(out, input[prev:]...)
}

func decodeCodePoint(input []byte) []byte {
// Find all Unicode escape sequences in the input byte slice
indices := codePointPat.FindAllSubmatchIndex(input, -1)
Expand Down Expand Up @@ -112,33 +192,87 @@ func decodeCodePoint(input []byte) []byte {
}

func decodeEscaped(input []byte) []byte {
// Find all Unicode escape sequences in the input byte slice
indices := escapePat.FindAllSubmatchIndex(input, -1)
return decodeWithPattern(input, escapePat)
}

// Iterate over found indices in reverse order to avoid modifying the slice length
utf8Bytes := make([]byte, maxBytesPerRune)
for i := len(indices) - 1; i >= 0; i-- {
matches := indices[i]
startIndex := matches[0]
hexStartIndex := matches[2]
endIndex := matches[3]
// decodeBraceEscape decodes `\u{X}` escapes, the brace-delimited form used by
// Rust, Swift, and some JavaScript. braceEscapePat accepts 1-6 hex digits
// inside the braces; each match is replaced with the UTF-8 encoding of that
// code point by decodeWithPattern, which leaves values above 0x10FFFF as-is.
func decodeBraceEscape(input []byte) []byte {
return decodeWithPattern(input, braceEscapePat)
}

// Extract the hexadecimal value from the escape sequence
hexValue := string(input[hexStartIndex:endIndex])
// decodeLongEscape decodes `\UXXXXXXXX` escapes (Python and similar), where
// longEscapePat requires an uppercase `U` followed by exactly eight hex
// digits. Each match is replaced with the UTF-8 encoding of that code point by
// decodeWithPattern, which leaves values above 0x10FFFF as-is.
func decodeLongEscape(input []byte) []byte {
return decodeWithPattern(input, longEscapePat)
}

// Parse the hexadecimal value to an integer
unicodeInt, err := strconv.ParseInt(hexValue, 16, 32)
if err != nil {
// If there's an error, continue to the next escape sequence
continue
}
// decodePerlEscape decodes Perl-style `\x{X}` escapes. perlEscapePat accepts
// 1-6 hex digits inside the braces; each match is replaced with the UTF-8
// encoding of that code point by decodeWithPattern, which leaves values above
// 0x10FFFF as-is.
func decodePerlEscape(input []byte) []byte {
return decodeWithPattern(input, perlEscapePat)
}

// Convert the Unicode code point to a UTF-8 representation
utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt))
// decodeCssEscape handles \X format - CSS (hex without padding, with space delimiter or end of string or next hex sequence)
func decodeCssEscape(input []byte) []byte {
return decodeWithPattern(input, cssEscapePat)
}

// Replace the escape sequence with the UTF-8 representation
input = append(input[:startIndex], append(utf8Bytes[:utf8Len], input[endIndex:]...)...)
}
// decodeHtmlEscape decodes hexadecimal HTML/XML character references of the
// form `&#xX;`. htmlEscapePat requires a lowercase `x`, 1-6 hex digits, and the
// closing semicolon, so decimal references such as `&#65;` are not matched.
// Each match is replaced with the UTF-8 encoding of that code point by
// decodeWithPattern, which leaves values above 0x10FFFF as-is.
func decodeHtmlEscape(input []byte) []byte {
return decodeWithPattern(input, htmlEscapePat)
}

return input
// decodePercentEscape decodes the non-standard `%uXXXX` percent-encoding.
// percentEscapePat requires a lowercase `u` followed by exactly four hex
// digits; each match is replaced with the UTF-8 encoding of that code point by
// decodeWithPattern.
func decodePercentEscape(input []byte) []byte {
return decodeWithPattern(input, percentEscapePat)
}

// decodeHexEscape handles 0xX format - Hexadecimal notation with space separation
// func decodeHexEscape(input []byte) []byte {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we intentionally commenting out this code?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I left this commented out, in case you wanted to try an implementation that doesn't bloat memory. I couldn't figure it out, but I think supporting this format would be really great given it's a pretty common format from what I've seen. But if that needs to be put on pause, that's fine, and we can remove the comments. What do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I don’t want to be a blocker, but personally I’m not a big fan of leaving large chunks of code commented out 😅

// // This format requires consecutive 0xNN sequences to be considered for decoding
// // We'll look for patterns of multiple consecutive hex values
// hexPattern := regexp.MustCompile(`(?:0x[a-fA-F0-9]{1,2}(?:\s+|$))+`)

// matches := hexPattern.FindAll(input, -1)
// if len(matches) == 0 {
// return input
// }

// result := input
// for _, match := range matches {
// // Extract individual hex values
// individualHex := regexp.MustCompile(`0x([a-fA-F0-9]{1,2})`)
// hexMatches := individualHex.FindAllSubmatch(match, -1)

// // Only decode if we have multiple consecutive hex values (likely to be a Unicode string)
// if len(hexMatches) < 3 {
// continue
// }

// var decoded []byte
// for _, hexMatch := range hexMatches {
// hexValue := string(hexMatch[1])
// if len(hexValue) == 1 {
// hexValue = "0" + hexValue // Pad single digit hex values
// }

// unicodeInt, err := strconv.ParseUint(hexValue, 16, 32)
// if err != nil || unicodeInt > 0x10FFFF {
// break
// }

// if unicodeInt <= 0x7F {
// // ASCII character
// decoded = append(decoded, byte(unicodeInt))
// } else {
// // Unicode character
// utf8Bytes := make([]byte, maxBytesPerRune)
// utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt))
// decoded = append(decoded, utf8Bytes[:utf8Len]...)
// }
// }

// // Replace the original sequence with decoded bytes
// result = bytes.Replace(result, match, decoded, 1)
// }

// return result
// }
Loading
Loading