Skip to content

Commit 355c6c3

Browse files
committed
Add utf8_hex token and expression to detect hex escape sequences for UTF-8 chars that were previously parsed as individual hex escapes.
1 parent 20d70a7 commit 355c6c3

File tree

12 files changed: 42 additions and 10 deletions.

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- a new token `:escape, :utf8_hex` and expression `EscapeSequence::UTF8Hex`
13+
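- used for UTF-8 hex escapes, e.g. `\xE2\x82\xAC` representing U+20AC "€"; in regexps with binary encoding (e.g. with the `n` flag), such sequences are still scanned as individual `:escape, :hex` tokens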
14+
1015
## [2.10.0] - 2024-12-25 - Janosch Müller
1116

1217
### Added

README.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -330,7 +330,7 @@ _Note that not all of these are available in all versions of Ruby_
330330
| ------------------------------------- | ------------------------------------------------------- |:--------:|
331331
| **Alternation** | `a\|b\|c` | ✓ |
332332
| **Anchors** | `\A`, `^`, `\b` | ✓ |
333-
| **Character Classes** | `[abc]`, `[^\\]`, `[a-d&&aeiou]`, `[a=e=b]` | ✓ |
333+
| **Character Classes** | `[abc]`, `[^\\]`, `[a-d&&aeiou]` | ✓ |
334334
| **Character Types** | `\d`, `\H`, `\s` | ✓ |
335335
| **Cluster Types** | `\R`, `\X` | ✓ |
336336
| **Conditional Exps.** | `(?(cond)yes-subexp)`, `(?(cond)yes-subexp\|no-subexp)` | ✓ |
@@ -365,7 +365,7 @@ _Note that not all of these are available in all versions of Ruby_
365365
|   _**Possessive**_ | `?+`, `*+`, `++` \[1\] | ✓ |
366366
| **String Escapes** | | ⋱ |
367367
|   _**Control** \[2\]_ | `\C-C`, `\cD` | ✓ |
368-
|   _**Hex**_ | `\x20`, `\x{701230}` | ✓ |
368+
|   _**Hex**_ | `\x20`, `\xE2\x82\xAC` | ✓ |
369369
|   _**Meta** \[2\]_ | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C` | ✓ |
370370
|   _**Octal**_ | `\0`, `\01`, `\012` | ✓ |
371371
|   _**Unicode**_ | `\uHHHH`, `\u{H+ H+}` | ✓ |

lib/regexp_parser/expression/classes/escape_sequence.rb

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@ module EscapeSequence
1818
Codepoint = Class.new(Base) # e.g. \u000A
1919

2020
CodepointList = Class.new(Base) # e.g. \u{A B}
21+
UTF8Hex = Class.new(Base) # e.g. \xE2\x82\xAC
2122

2223
AbstractMetaControlSequence = Class.new(Base)
2324
Control = Class.new(AbstractMetaControlSequence) # e.g. \cB

lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,12 @@ module Regexp::Expression::EscapeSequence
1515
Hex.class_eval { def codepoint; text[/\h+/].hex end }
1616
Codepoint.class_eval { def codepoint; text[/\h+/].hex end }
1717

18+
UTF8Hex.class_eval do
19+
def codepoint
20+
text.scan(/\h+/).map(&:hex).pack('C*').force_encoding('utf-8').ord
21+
end
22+
end
23+
1824
CodepointList.class_eval do
1925
# Maybe this should be a unique top-level expression class?
2026
def char

lib/regexp_parser/parser.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ def escape(token)
319319
when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
320320
when :hex; node << EscapeSequence::Hex.new(token, active_opts)
321321
when :octal; node << EscapeSequence::Octal.new(token, active_opts)
322+
when :utf8_hex; node << EscapeSequence::UTF8Hex.new(token, active_opts)
322323

323324
when :control
324325
if token.text =~ /\A(?:\\C-\\M|\\c\\M)/

lib/regexp_parser/scanner/scanner.rl

Lines changed: 17 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,8 @@
3737
octal_sequence = [0-7]{1,3};
3838

3939
hex_sequence = 'x' . xdigit{1,2};
40-
hex_sequence_err = 'x' . [^0-9a-fA-F{];
40+
hex_sequence_err = 'x' . [^0-9A-Fa-f];
41+
high_hex_sequence = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
4142

4243
codepoint_single = 'u' . xdigit{4};
4344
codepoint_list = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
@@ -210,7 +211,7 @@
210211
type = :nonposixclass
211212
end
212213

213-
unless self.class.posix_classes.include?(class_name)
214+
unless POSIX_CLASSES[class_name]
214215
raise ValidationError.for(:posix_class, text)
215216
end
216217

@@ -321,6 +322,16 @@
321322
fret;
322323
};
323324

325+
high_hex_sequence > (escaped_alpha, 5) {
326+
text = copy(data, ts-1, te)
327+
if regexp_encoding == Encoding::BINARY
328+
text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
329+
else
330+
emit(:escape, :utf8_hex, text)
331+
end
332+
fret;
333+
};
334+
324335
hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
325336
emit(:escape, :hex, copy(data, ts-1, te))
326337
fret;
@@ -662,6 +673,7 @@ class Regexp::Scanner
662673

663674
input = input_object.is_a?(Regexp) ? input_object.source : input_object
664675
self.free_spacing = free_spacing?(input_object, options)
676+
self.regexp_encoding = input_object.encoding if input_object.is_a?(::Regexp)
665677
self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
666678

667679
data = input.unpack("c*")
@@ -711,10 +723,9 @@ class Regexp::Scanner
711723
File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
712724
end
713725

714-
def self.posix_classes
726+
POSIX_CLASSES =
715727
%w[alnum alpha ascii blank cntrl digit graph
716-
lower print punct space upper word xdigit]
717-
end
728+
lower print punct space upper word xdigit].to_h { |c| [c, true] }.freeze
718729

719730
# Emits an array with the details of the scanned pattern
720731
def emit(type, token, text)
@@ -749,6 +760,7 @@ class Regexp::Scanner
749760
attr_accessor :block,
750761
:collect_tokens, :tokens, :prev_token,
751762
:free_spacing, :spacing_stack,
763+
:regexp_encoding,
752764
:group_depth, :set_depth, :conditional_stack,
753765
:char_pos
754766

lib/regexp_parser/syntax/token/escape.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ module Escape
1717
interval_open interval_close
1818
set_open set_close]
1919

20-
Hex = %i[hex]
20+
Hex = %i[hex utf8_hex]
2121

2222
Octal = %i[octal]
2323

spec/expression/options_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@
4343
it 'gives correct precedence when encountering multiple encoding flags' do
4444
# Any encoding flag overrides all previous encoding flags. If there are
4545
# multiple encoding flags in an options string, the last one wins.
46-
# E.g. /(?dau)\w/ matches UTF8 chars but /(?dua)\w/ only ASCII chars.
46+
# E.g. /(?dau)\w/ matches UTF-8 chars but /(?dua)\w/ only ASCII chars.
4747
regexp1 = /(?dau)\w/
4848
regexp2 = /(?dua)\w/
4949
expect(regexp1).to match 'ü'

spec/parser/escapes_spec.rb

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@
2929

3030
# hex escapes
3131
include_examples 'parse', /a\xFF/n, 1 => [:escape, :hex, es::Hex]
32+
include_examples 'parse', /a\xFF\xFF/n, 2 => [:escape, :hex, es::Hex]
3233

3334
# octal escapes
3435
include_examples 'parse', /a\177/n, 1 => [:escape, :octal, es::Octal]
@@ -39,6 +40,7 @@
3940
include_examples 'parse', /\101/, 0 => [char: 'A', codepoint: 65 ]
4041
include_examples 'parse', /\x42/, 0 => [char: 'B', codepoint: 66 ]
4142
include_examples 'parse', /\xA/, 0 => [char: "\n", codepoint: 10 ]
43+
include_examples 'parse', /\xE2\x82\xAC/, 0 => [char: "€", codepoint: 8364 ]
4244
include_examples 'parse', /\u0043/, 0 => [char: 'C', codepoint: 67 ]
4345
include_examples 'parse', /\u{44 45}/, 0 => [chars: %w[D E], codepoints: [68, 69]]
4446

spec/scanner/errors_spec.rb

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@
4040
include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-\C-'
4141
include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\xZ'
4242
include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\xZ0'
43+
include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\x{'
4344
include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\cü'
4445
include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\c\M-ü'
4546
include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\C-ü'

0 commit comments

Comments
 (0)