Add utf8_hex token and expression ...

jaynetics · jaynetics · commit 355c6c30508d · 2025-07-31T18:40:10.000+02:00
to detect hex escape sequences for UTF-8 chars that were previously parsed as individual hex escapes.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- a new token `:escape, :utf8_hex` and expression `EscapeSequence::UTF8Hex`
+  - used for UTF-8 hex escapes, e.g. `\xE2\x82\xAC` representing U+20AC "€"; regexps with binary encoding (e.g. `/\xE2\x82\xAC/n`) still yield one `:hex` token per escape
+
 ## [2.10.0] - 2024-12-25 - Janosch Müller
 
 ### Added
diff --git a/README.md b/README.md
@@ -330,7 +330,7 @@ _Note that not all of these are available in all versions of Ruby_
 | ------------------------------------- | ------------------------------------------------------- |:--------:|
 | **Alternation**                       | `a\|b\|c`                                               | &#x2713; |
 | **Anchors**                           | `\A`, `^`, `\b`                                         | &#x2713; |
-| **Character Classes**                 | `[abc]`, `[^\\]`, `[a-d&&aeiou]`, `[a=e=b]`             | &#x2713; |
+| **Character Classes**                 | `[abc]`, `[^\\]`, `[a-d&&aeiou]`                        | &#x2713; |
 | **Character Types**                   | `\d`, `\H`, `\s`                                        | &#x2713; |
 | **Cluster Types**                     | `\R`, `\X`                                              | &#x2713; |
 | **Conditional Exps.**                 | `(?(cond)yes-subexp)`, `(?(cond)yes-subexp\|no-subexp)` | &#x2713; |
@@ -365,7 +365,7 @@ _Note that not all of these are available in all versions of Ruby_
 | &emsp;&nbsp;_**Possessive**_          | `?+`, `*+`, `++` \[1\]                                  | &#x2713; |
 | **String Escapes**                    |                                                         | &#x22f1; |
 | &emsp;&nbsp;_**Control** \[2\]_       | `\C-C`, `\cD`                                           | &#x2713; |
-| &emsp;&nbsp;_**Hex**_                 | `\x20`, `\x{701230}`                                    | &#x2713; |
+| &emsp;&nbsp;_**Hex**_                 | `\x20`, `\xE2\x82\xAC`                                  | &#x2713; |
 | &emsp;&nbsp;_**Meta** \[2\]_          | `\M-c`, `\M-\C-C`, `\M-\cC`, `\C-\M-C`, `\c\M-C`        | &#x2713; |
 | &emsp;&nbsp;_**Octal**_               | `\0`, `\01`, `\012`                                     | &#x2713; |
 | &emsp;&nbsp;_**Unicode**_             | `\uHHHH`, `\u{H+ H+}`                                   | &#x2713; |
diff --git a/lib/regexp_parser/expression/classes/escape_sequence.rb b/lib/regexp_parser/expression/classes/escape_sequence.rb
@@ -18,6 +18,7 @@ module EscapeSequence
     Codepoint   = Class.new(Base) # e.g. \u000A
 
     CodepointList = Class.new(Base) # e.g. \u{A B}
+    UTF8Hex       = Class.new(Base) # e.g. \xE2\x82\xAC
 
     AbstractMetaControlSequence = Class.new(Base)
     Control                     = Class.new(AbstractMetaControlSequence) # e.g. \cB
diff --git a/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb b/lib/regexp_parser/expression/methods/escape_sequence_codepoint.rb
@@ -15,6 +15,12 @@ module Regexp::Expression::EscapeSequence
   Hex.class_eval         { def codepoint; text[/\h+/].hex end }
   Codepoint.class_eval   { def codepoint; text[/\h+/].hex end }
 
+  UTF8Hex.class_eval do
+    def codepoint # codepoint of the character spelled out by the \xHH escapes in `text`
+      text.scan(/\h+/).map(&:hex).pack('C*').force_encoding('utf-8').ord # hex pairs -> bytes -> UTF-8 string; NOTE(review): #ord reads only the first character and raises ArgumentError on invalid UTF-8 - confirm the scanner never emits multi-character or malformed runs
+    end
+  end
+
   CodepointList.class_eval do
     # Maybe this should be a unique top-level expression class?
     def char
diff --git a/lib/regexp_parser/parser.rb b/lib/regexp_parser/parser.rb
@@ -319,6 +319,7 @@ def escape(token)
     when :codepoint_list; node << EscapeSequence::CodepointList.new(token, active_opts)
     when :hex;            node << EscapeSequence::Hex.new(token, active_opts)
     when :octal;          node << EscapeSequence::Octal.new(token, active_opts)
+    when :utf8_hex;       node << EscapeSequence::UTF8Hex.new(token, active_opts)
 
     when :control
       if token.text =~ /\A(?:\\C-\\M|\\c\\M)/
diff --git a/lib/regexp_parser/scanner/scanner.rl b/lib/regexp_parser/scanner/scanner.rl
@@ -37,7 +37,8 @@
   octal_sequence        = [0-7]{1,3};
 
   hex_sequence          = 'x' . xdigit{1,2};
-  hex_sequence_err      = 'x' . [^0-9a-fA-F{];
+  hex_sequence_err      = 'x' . [^0-9A-Fa-f];
+  high_hex_sequence     = 'x' . [89A-Fa-f] . xdigit . ( '\\x' . [89A-Fa-f] . xdigit )*;
 
   codepoint_single      = 'u' . xdigit{4};
   codepoint_list        = 'u{' . xdigit{1,6} . (space . xdigit{1,6})* . '}';
@@ -210,7 +211,7 @@
         type = :nonposixclass
       end
 
-      unless self.class.posix_classes.include?(class_name)
+      unless POSIX_CLASSES[class_name]
         raise ValidationError.for(:posix_class, text)
       end
 
@@ -321,6 +322,16 @@
       fret;
     };
 
+    high_hex_sequence > (escaped_alpha, 5) {
+      text = copy(data, ts-1, te)
+      if regexp_encoding == Encoding::BINARY
+        text.split(/(?=\\)/).each { |part| emit(:escape, :hex, part) }
+      else
+        emit(:escape, :utf8_hex, text)
+      end
+      fret;
+    };
+
     hex_sequence > (escaped_alpha, 5) @eof(premature_end_error) {
       emit(:escape, :hex, copy(data, ts-1, te))
       fret;
@@ -662,6 +673,7 @@ class Regexp::Scanner
 
     input = input_object.is_a?(Regexp) ? input_object.source : input_object
     self.free_spacing = free_spacing?(input_object, options)
+    self.regexp_encoding = input_object.encoding if input_object.is_a?(::Regexp)
     self.spacing_stack = [{:free_spacing => free_spacing, :depth => 0}]
 
     data  = input.unpack("c*")
@@ -711,10 +723,9 @@ class Regexp::Scanner
     File.read("#{__dir__}/scanner/properties/#{name}.csv").scan(/(.+),(.+)/).to_h
   end
 
-  def self.posix_classes
+  POSIX_CLASSES =
     %w[alnum alpha ascii blank cntrl digit graph
-       lower print punct space upper word xdigit]
-  end
+       lower print punct space upper word xdigit].to_h { |c| [c, true] }.freeze
 
   # Emits an array with the details of the scanned pattern
   def emit(type, token, text)
@@ -749,6 +760,7 @@ class Regexp::Scanner
   attr_accessor :block,
                 :collect_tokens, :tokens, :prev_token,
                 :free_spacing, :spacing_stack,
+                :regexp_encoding,
                 :group_depth, :set_depth, :conditional_stack,
                 :char_pos
 
diff --git a/lib/regexp_parser/syntax/token/escape.rb b/lib/regexp_parser/syntax/token/escape.rb
@@ -17,7 +17,7 @@ module Escape
                  interval_open interval_close
                  set_open set_close]
 
-      Hex   = %i[hex]
+      Hex   = %i[hex utf8_hex]
 
       Octal = %i[octal]
 
diff --git a/spec/expression/options_spec.rb b/spec/expression/options_spec.rb
@@ -43,7 +43,7 @@
   it 'gives correct precedence when encountering multiple encoding flags' do
     # Any encoding flag overrides all previous encoding flags. If there are
     # multiple encoding flags in an options string, the last one wins.
-    # E.g. /(?dau)\w/ matches UTF8 chars but /(?dua)\w/ only ASCII chars.
+    # E.g. /(?dau)\w/ matches UTF-8 chars but /(?dua)\w/ only ASCII chars.
     regexp1 = /(?dau)\w/
     regexp2 = /(?dua)\w/
     expect(regexp1).to match 'ü'
diff --git a/spec/parser/escapes_spec.rb b/spec/parser/escapes_spec.rb
@@ -29,6 +29,7 @@
 
   # hex escapes
   include_examples 'parse', /a\xFF/n,        1 => [:escape, :hex,            es::Hex]
+  include_examples 'parse', /a\xFF\xFF/n,    2 => [:escape, :hex,            es::Hex]
 
   # octal escapes
   include_examples 'parse', /a\177/n,        1 => [:escape, :octal,          es::Octal]
@@ -39,6 +40,7 @@
   include_examples 'parse', /\101/,          0 => [char:  'A',     codepoint:  65      ]
   include_examples 'parse', /\x42/,          0 => [char:  'B',     codepoint:  66      ]
   include_examples 'parse', /\xA/,           0 => [char:  "\n",    codepoint:  10      ]
+  include_examples 'parse', /\xE2\x82\xAC/,  0 => [char:  "€",     codepoint:  8364    ]
   include_examples 'parse', /\u0043/,        0 => [char:  'C',     codepoint:  67      ]
   include_examples 'parse', /\u{44 45}/,     0 => [chars: %w[D E], codepoints: [68, 69]]
 
diff --git a/spec/scanner/errors_spec.rb b/spec/scanner/errors_spec.rb
@@ -40,6 +40,7 @@
   include_examples 'scan error', RS::PrematureEndError, 'eof in m-seq', '\M-\C-'
   include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\xZ'
   include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\xZ0'
+  include_examples 'scan error', RS::InvalidSequenceError, 'invalid hex', '\x{'
   include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\cü'
   include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\c\M-ü'
   include_examples 'scan error', RS::InvalidSequenceError, 'invalid c-seq', '\C-ü'
diff --git a/spec/scanner/escapes_spec.rb b/spec/scanner/escapes_spec.rb
@@ -34,6 +34,10 @@
   include_examples 'scan', 'a\xA',            1 => [:escape,  :hex,              '\xA',            1,  4]
   include_examples 'scan', 'a\x24c',          1 => [:escape,  :hex,              '\x24',           1,  5]
   include_examples 'scan', 'a\x0640c',        1 => [:escape,  :hex,              '\x06',           1,  5]
+  include_examples 'scan', 'a\xE2\x82\xAC',   1 => [:escape,  :utf8_hex,         '\xE2\x82\xAC',   1,  13]
+  include_examples 'scan', /a\xE2\x82\xAC/n,  1 => [:escape,  :hex,              '\xE2',           1,  5]
+  include_examples 'scan', /a\xE2\x82\xAC/n,  2 => [:escape,  :hex,              '\x82',           5,  9]
+  include_examples 'scan', /a\xE2\x82\xAC/n,  3 => [:escape,  :hex,              '\xAC',           9,  13]
 
   include_examples 'scan', 'a\u0640c',        1 => [:escape,  :codepoint,        '\u0640',         1,  7]
   include_examples 'scan', 'a\u{640 0641}c',  1 => [:escape,  :codepoint_list,   '\u{640 0641}',   1,  13]
diff --git a/spec/scanner/literals_spec.rb b/spec/scanner/literals_spec.rb
@@ -1,6 +1,6 @@
 require 'spec_helper'
 
-RSpec.describe('UTF8 scanning') do
+RSpec.describe('UTF-8 scanning') do
   # ascii, single byte characters
   include_examples 'scan', 'a',
     0 => [:literal,     :literal,       'a',        0, 1]