-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Add additional unicode escape support #4296
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
bb1de3a
31183ec
d2717c6
e809c3a
e4b378b
58d5dae
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,33 @@ var ( | |
|
||
// Common escape sequence used in programming languages. | ||
escapePat = regexp.MustCompile(`(?i:\\{1,2}u)([a-fA-F0-9]{4})`) | ||
|
||
// Additional Unicode escape formats from dencode.com | ||
|
||
// \u{X} format - Rust, Swift, some JS, etc. (variable length hex in braces) | ||
braceEscapePat = regexp.MustCompile(`\\u\{([a-fA-F0-9]{1,6})\}`) | ||
|
||
// \U00XXXXXX format - Python, etc. (8-digit format for non-BMP characters) | ||
longEscapePat = regexp.MustCompile(`\\U([a-fA-F0-9]{8})`) | ||
|
||
// \x{X} format - Perl (variable length hex in braces) | ||
perlEscapePat = regexp.MustCompile(`\\x\{([a-fA-F0-9]{1,6})\}`) | ||
|
||
// \X format - CSS (hex without padding). Go's regexp (RE2) has no look-ahead, so we | ||
// include the delimiter (whitespace, another backslash, or end-of-string) in the | ||
// match using a non-capturing group. The delimiter is later re-inserted by the | ||
// decoder when necessary. | ||
cssEscapePat = regexp.MustCompile(`\\([a-fA-F0-9]{1,6})(?:\s|\\|$)`) | ||
|
||
// &#xX; format - HTML/XML (hex with semicolon) | ||
htmlEscapePat = regexp.MustCompile(`&#x([a-fA-F0-9]{1,6});`) | ||
|
||
// %uXXXX format - Percent-encoding (non-standard) | ||
percentEscapePat = regexp.MustCompile(`%u([a-fA-F0-9]{4})`) | ||
|
||
// // 0xX format - Hexadecimal notation with space separation | ||
// Note: Commenting out for now due to high memory overhead. Review ways to handle this. | ||
// hexEscapePat = regexp.MustCompile(`0x([a-fA-F0-9]{1,6})(?:\s|$)`) | ||
) | ||
|
||
func (d *EscapedUnicode) Type() detectorspb.DecoderType { | ||
|
@@ -39,13 +66,38 @@ func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk { | |
chunkData = bytes.Clone(chunk.Data) | ||
matched = false | ||
) | ||
if codePointPat.Match(chunkData) { | ||
|
||
// Process patterns in priority order - more specific patterns first | ||
// This prevents conflicts where multiple patterns match the same input | ||
|
||
// Long escape format (8 hex digits) - highest priority | ||
if longEscapePat.Match(chunkData) { | ||
matched = true | ||
chunkData = decodeCodePoint(chunkData) | ||
} | ||
if escapePat.Match(chunkData) { | ||
chunkData = decodeLongEscape(chunkData) | ||
} else if braceEscapePat.Match(chunkData) { | ||
matched = true | ||
chunkData = decodeBraceEscape(chunkData) | ||
} else if perlEscapePat.Match(chunkData) { | ||
matched = true | ||
chunkData = decodePerlEscape(chunkData) | ||
} else if htmlEscapePat.Match(chunkData) { | ||
matched = true | ||
chunkData = decodeHtmlEscape(chunkData) | ||
} else if percentEscapePat.Match(chunkData) { | ||
matched = true | ||
chunkData = decodePercentEscape(chunkData) | ||
} else if escapePat.Match(chunkData) { | ||
matched = true | ||
chunkData = decodeEscaped(chunkData) | ||
} else if codePointPat.Match(chunkData) { | ||
matched = true | ||
chunkData = decodeCodePoint(chunkData) | ||
} else if cssEscapePat.Match(chunkData) { | ||
matched = true | ||
chunkData = decodeCssEscape(chunkData) | ||
// } else if hexEscapePat.Match(chunkData) { | ||
// matched = true | ||
// chunkData = decodeHexEscape(chunkData) | ||
} | ||
|
||
if matched { | ||
|
@@ -71,6 +123,34 @@ func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk { | |
const maxBytesPerRune = 4 | ||
const spaceChar = byte(' ') | ||
|
||
// decodeWithPattern replaces every escape sequence matched by re with the
// UTF-8 encoding of the code point it names.
//
// The first capturing group of re must contain the hexadecimal code-point
// digits. A match is left in the output verbatim when its digits cannot be
// parsed, when the value is not a valid Unicode scalar value (greater than
// 0x10FFFF or a surrogate half in U+D800..U+DFFF), or when re has no first
// capturing group that participated in the match.
//
// If re does not match at all, input itself is returned. Otherwise a newly
// allocated slice is returned and input is not modified.
func decodeWithPattern(input []byte, re *regexp.Regexp) []byte {
	matches := re.FindAllSubmatchIndex(input, -1)
	if len(matches) == 0 {
		return input
	}

	// Build the result in one forward pass: copy the literal text that
	// precedes each decodable match, then append the decoded rune.
	out := make([]byte, 0, len(input))
	copied := 0 // input[:copied] has already been written to out
	for _, m := range matches {
		if len(m) < 4 || m[2] < 0 {
			// No usable capture group; keep the matched text as-is.
			continue
		}

		cp, err := strconv.ParseUint(string(input[m[2]:m[3]]), 16, 32)
		// Values above math.MaxInt32 wrap to a negative rune, which
		// utf8.ValidRune rejects along with surrogates and > 0x10FFFF.
		if err != nil || !utf8.ValidRune(rune(cp)) {
			// Not decodable: it is copied verbatim with the next literal run.
			continue
		}

		out = append(out, input[copied:m[0]]...)
		out = utf8.AppendRune(out, rune(cp))
		copied = m[1]
	}
	return append(out, input[copied:]...)
}
|
||
func decodeCodePoint(input []byte) []byte { | ||
// Find all Unicode escape sequences in the input byte slice | ||
indices := codePointPat.FindAllSubmatchIndex(input, -1) | ||
|
@@ -112,33 +192,87 @@ func decodeCodePoint(input []byte) []byte { | |
} | ||
|
||
func decodeEscaped(input []byte) []byte { | ||
// Find all Unicode escape sequences in the input byte slice | ||
indices := escapePat.FindAllSubmatchIndex(input, -1) | ||
return decodeWithPattern(input, escapePat) | ||
} | ||
|
||
// Iterate over found indices in reverse order to avoid modifying the slice length | ||
utf8Bytes := make([]byte, maxBytesPerRune) | ||
for i := len(indices) - 1; i >= 0; i-- { | ||
matches := indices[i] | ||
startIndex := matches[0] | ||
hexStartIndex := matches[2] | ||
endIndex := matches[3] | ||
// decodeBraceEscape handles \u{X} format - Rust, Swift, some JS, etc. | ||
func decodeBraceEscape(input []byte) []byte { | ||
return decodeWithPattern(input, braceEscapePat) | ||
} | ||
|
||
// Extract the hexadecimal value from the escape sequence | ||
hexValue := string(input[hexStartIndex:endIndex]) | ||
// decodeLongEscape handles \U00XXXXXX format - Python, etc. | ||
func decodeLongEscape(input []byte) []byte { | ||
return decodeWithPattern(input, longEscapePat) | ||
} | ||
|
||
// Parse the hexadecimal value to an integer | ||
unicodeInt, err := strconv.ParseInt(hexValue, 16, 32) | ||
if err != nil { | ||
// If there's an error, continue to the next escape sequence | ||
continue | ||
} | ||
// decodePerlEscape handles \x{X} format - Perl | ||
func decodePerlEscape(input []byte) []byte { | ||
return decodeWithPattern(input, perlEscapePat) | ||
} | ||
|
||
// Convert the Unicode code point to a UTF-8 representation | ||
utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt)) | ||
// decodeCssEscape handles \X format - CSS (hex without padding, with space delimiter or end of string or next hex sequence) | ||
func decodeCssEscape(input []byte) []byte { | ||
return decodeWithPattern(input, cssEscapePat) | ||
} | ||
|
||
// Replace the escape sequence with the UTF-8 representation | ||
input = append(input[:startIndex], append(utf8Bytes[:utf8Len], input[endIndex:]...)...) | ||
} | ||
// decodeHtmlEscape handles &#xX; format - HTML/XML | ||
func decodeHtmlEscape(input []byte) []byte { | ||
return decodeWithPattern(input, htmlEscapePat) | ||
} | ||
|
||
return input | ||
// decodePercentEscape handles %uXXXX format - Percent-encoding (non-standard) | ||
func decodePercentEscape(input []byte) []byte { | ||
return decodeWithPattern(input, percentEscapePat) | ||
} | ||
|
||
// decodeHexEscape handles 0xX format - Hexadecimal notation with space separation | ||
// func decodeHexEscape(input []byte) []byte { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Are we intentionally commenting out this code? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I left this commented out, in case you wanted to try an implementation that doesn't bloat memory. I couldn't figure it out, but I think supporting this format would be really great given it's a pretty common format from what I've seen. But if that needs to be put on pause, that's fine, and we can remove the comments. What do you think? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, I don’t want to be a blocker, but personally I’m not a big fan of leaving large chunks of code commented out 😅 |
||
// // This format requires consecutive 0xNN sequences to be considered for decoding | ||
// // We'll look for patterns of multiple consecutive hex values | ||
// hexPattern := regexp.MustCompile(`(?:0x[a-fA-F0-9]{1,2}(?:\s+|$))+`) | ||
|
||
// matches := hexPattern.FindAll(input, -1) | ||
// if len(matches) == 0 { | ||
// return input | ||
// } | ||
|
||
// result := input | ||
// for _, match := range matches { | ||
// // Extract individual hex values | ||
// individualHex := regexp.MustCompile(`0x([a-fA-F0-9]{1,2})`) | ||
// hexMatches := individualHex.FindAllSubmatch(match, -1) | ||
|
||
// // Only decode if we have multiple consecutive hex values (likely to be a Unicode string) | ||
// if len(hexMatches) < 3 { | ||
// continue | ||
// } | ||
|
||
// var decoded []byte | ||
// for _, hexMatch := range hexMatches { | ||
// hexValue := string(hexMatch[1]) | ||
// if len(hexValue) == 1 { | ||
// hexValue = "0" + hexValue // Pad single digit hex values | ||
// } | ||
|
||
// unicodeInt, err := strconv.ParseUint(hexValue, 16, 32) | ||
// if err != nil || unicodeInt > 0x10FFFF { | ||
// break | ||
// } | ||
|
||
// if unicodeInt <= 0x7F { | ||
// // ASCII character | ||
// decoded = append(decoded, byte(unicodeInt)) | ||
// } else { | ||
// // Unicode character | ||
// utf8Bytes := make([]byte, maxBytesPerRune) | ||
// utf8Len := utf8.EncodeRune(utf8Bytes, rune(unicodeInt)) | ||
// decoded = append(decoded, utf8Bytes[:utf8Len]...) | ||
// } | ||
// } | ||
|
||
// // Replace the original sequence with decoded bytes | ||
// result = bytes.Replace(result, match, decoded, 1) | ||
// } | ||
|
||
// return result | ||
// } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If these are specific to Perl and CSS I think we could probably omit them for better performance