@@ -253,7 +253,7 @@ class Scanner {
253253 null => false ,
254254 LF || CR || BOM => false ,
255255 TAB || NEL => true ,
256- _ => _isStandardCharacter (char ),
256+ _ => _isStandardCharacterAt ( 0 ),
257257 };
258258 }
259259
@@ -267,7 +267,7 @@ class Scanner {
267267 null => false ,
268268 LF || CR || BOM || SP => false ,
269269 NEL => true ,
270- _ => _isStandardCharacter (char ),
270+ _ => _isStandardCharacterAt ( 0 ),
271271 };
272272 }
273273
@@ -614,9 +614,9 @@ class Scanner {
614614
615615 // Consume the indicator token.
616616 var start = _scanner.state;
617- _scanner.readChar ();
618- _scanner.readChar ();
619- _scanner.readChar ();
617+ _scanner.readCodePoint ();
618+ _scanner.readCodePoint ();
619+ _scanner.readCodePoint ();
620620
621621 _tokens.add (Token (type, _scanner.spanFrom (start)));
622622 }
@@ -732,7 +732,7 @@ class Scanner {
732732 /// The span of the new token is the current character.
// Emits a token of [type] whose span runs from the scanner state captured
// before the read to the scanner position after it.
733733 void _addCharToken (TokenType type) {
// Capture the state first so spanFrom can measure from the token's start.
734734 var start = _scanner.state;
// This change swaps readChar for readCodePoint.
// NOTE(review): presumably readCodePoint consumes a whole surrogate pair as
// one unit; confirm against the scanner's API.
735- _scanner.readChar ();
735+ _scanner.readCodePoint ();
736736 _tokens.add (Token (type, _scanner.spanFrom (start)));
737737 }
738738
@@ -836,7 +836,7 @@ class Scanner {
836836 // libyaml doesn't support unknown directives, but the spec says to ignore
837837 // them and warn: http://yaml.org/spec/1.2/spec.html#id2781147.
838838 while (! _isBreakOrEnd) {
839- _scanner.readChar ();
839+ _scanner.readCodePoint ();
840840 }
841841
842842 return null ;
@@ -866,7 +866,7 @@ class Scanner {
866866 // disagrees: http://yaml.org/spec/1.2/spec.html#ns-directive-name.
867867 var start = _scanner.position;
868868 while (_isNonSpace) {
869- _scanner.readChar ();
869+ _scanner.readCodePoint ();
870870 }
871871
872872 var name = _scanner.substring (start);
@@ -941,13 +941,13 @@ class Scanner {
941941 var start = _scanner.state;
942942
943943 // Eat the indicator character.
944- _scanner.readChar ();
944+ _scanner.readCodePoint ();
945945
946946 // libyaml only allows word characters in anchor names, but the spec
947947 // disagrees: http://yaml.org/spec/1.2/spec.html#ns-anchor-char.
948948 var startPosition = _scanner.position;
949949 while (_isAnchorChar) {
950- _scanner.readChar ();
950+ _scanner.readCodePoint ();
951951 }
952952 var name = _scanner.substring (startPosition);
953953
@@ -1032,7 +1032,7 @@ class Scanner {
10321032 buffer.write (_scanner.substring (start));
10331033
10341034 if (_scanner.peekChar () == EXCLAMATION ) {
1035- buffer.writeCharCode (_scanner.readChar ());
1035+ buffer.writeCharCode (_scanner.readCodePoint ());
10361036 } else {
10371037 // It's either the '!' tag or not really a tag handle. If it's a %TAG
10381038 // directive, it's an error. If it's a tag token, it must be part of a
@@ -1083,15 +1083,15 @@ class Scanner {
10831083 var start = _scanner.state;
10841084
10851085 // Eat the indicator '|' or '>'.
1086- _scanner.readChar ();
1086+ _scanner.readCodePoint ();
10871087
10881088 // Check for a chomping indicator.
10891089 var chomping = _Chomping .clip;
10901090 var increment = 0 ;
10911091 var char = _scanner.peekChar ();
10921092 if (char == PLUS || char == HYPHEN ) {
10931093 chomping = char == PLUS ? _Chomping .keep : _Chomping .strip;
1094- _scanner.readChar ();
1094+ _scanner.readCodePoint ();
10951095
10961096 // Check for an indentation indicator.
10971097 if (_isDigit) {
@@ -1101,7 +1101,7 @@ class Scanner {
11011101 _scanner.spanFrom (start));
11021102 }
11031103
1104- increment = _scanner.readChar () - NUMBER_0 ;
1104+ increment = _scanner.readCodePoint () - NUMBER_0 ;
11051105 }
11061106 } else if (_isDigit) {
11071107 // Do the same as above, but in the opposite order.
@@ -1110,12 +1110,12 @@ class Scanner {
11101110 _scanner.spanFrom (start));
11111111 }
11121112
1113- increment = _scanner.readChar () - NUMBER_0 ;
1113+ increment = _scanner.readCodePoint () - NUMBER_0 ;
11141114
11151115 char = _scanner.peekChar ();
11161116 if (char == PLUS || char == HYPHEN ) {
11171117 chomping = char == PLUS ? _Chomping .keep : _Chomping .strip;
1118- _scanner.readChar ();
1118+ _scanner.readCodePoint ();
11191119 }
11201120 }
11211121
@@ -1182,7 +1182,7 @@ class Scanner {
11821182
11831183 var startPosition = _scanner.position;
11841184 while (! _isBreakOrEnd) {
1185- _scanner.readChar ();
1185+ _scanner.readCodePoint ();
11861186 }
11871187 buffer.write (_scanner.substring (startPosition));
11881188 end = _scanner.state;
@@ -1373,7 +1373,7 @@ class Scanner {
13731373 buffer.writeCharCode (value);
13741374 }
13751375 } else {
1376- buffer.writeCharCode (_scanner.readChar ());
1376+ buffer.writeCharCode (_scanner.readCodePoint ());
13771377 }
13781378 }
13791379
@@ -1462,7 +1462,7 @@ class Scanner {
14621462 // 1.2's. We use [_isPlainChar] instead of libyaml's character here.
14631463 var startPosition = _scanner.position;
14641464 while (_isPlainChar) {
1465- _scanner.readChar ();
1465+ _scanner.readCodePoint ();
14661466 }
14671467 buffer.write (_scanner.substring (startPosition));
14681468 end = _scanner.state;
@@ -1587,15 +1587,28 @@ class Scanner {
15871587 _inBlockContext,
15881588 SP || TAB || LF || CR || BOM => false ,
15891589 NEL => true ,
1590- _ => _isStandardCharacter (char )
1590+ _ => _isStandardCharacterAt (offset )
15911591 };
15921592 }
15931593
/// Reports whether the value returned by `_scanner.peekChar(offset)` begins a
/// standard character.
///
/// Returns `false` when nothing can be peeked at [offset]. A high surrogate
/// is accepted only when the value peeked at `offset + 1` is a low surrogate;
/// every other value is checked by [_isStandardCharacter].
1594+ bool _isStandardCharacterAt (int offset) {
1595+ var first = _scanner.peekChar (offset);
1596+ if (first == null ) return false ;
1597+
1598+ if (isHighSurrogate (first)) {
1599+ var next = _scanner.peekChar (offset + 1 );
1600+ // A surrogate pair encodes code points from U+010000 to U+10FFFF, so it
1601+ // must be a standard character.
// An unpaired high surrogate (no following value, or a following value that
// is not a low surrogate) is rejected here.
1602+ return next != null && isLowSurrogate (next);
1603+ }
1604+
// Not a high surrogate: defer to the range check on the single value.
1605+ return _isStandardCharacter (first);
1606+ }
1607+
/// Range test on a single value [char]: true for 0x20-0x7E, 0xA0-0xD7FF and
/// 0xE000-0xFFFD.
///
/// The removed lines also accepted 0x10000-0x10FFFF. The added lines drop that
/// range; [_isStandardCharacterAt] accepts a high surrogate followed by a low
/// surrogate instead.
// NOTE(review): this presumably relies on peekChar yielding UTF-16 code units,
// so a value above 0xFFFF never reaches this function; confirm.
15941608 bool _isStandardCharacter (int char) =>
1595- (char >= 0x00020 && char <= 0x00007E ) ||
1596- (char >= 0x000A0 && char <= 0x00D7FF ) ||
1597- (char >= 0x0E000 && char <= 0x00FFFD ) ||
1598- (char >= 0x10000 && char <= 0x10FFFF );
// The gap between 0xD7FF and 0xE000 excludes the surrogate values.
1609+ (char >= 0x0020 && char <= 0x007E ) ||
1610+ (char >= 0x00A0 && char <= 0xD7FF ) ||
1611+ (char >= 0xE000 && char <= 0xFFFD );
15991612
16001613 /// Returns the hexadecimal value of [char] .
16011614 int _asHex (int char) {
0 commit comments