Skip to content

Commit 3853594

Browse files
committed
[C++] Implement standalone Unicode encoding and decoding handling
1 parent 66d6679 commit 3853594

File tree

9 files changed

+383
-151
lines changed

9 files changed

+383
-151
lines changed

runtime/Cpp/runtime/src/ANTLRInputStream.cpp

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#include "misc/Interval.h"
1010
#include "IntStream.h"
1111

12-
#include "support/StringUtils.h"
12+
#include "support/Utf8.h"
1313
#include "support/CPPUtils.h"
1414

1515
#include "ANTLRInputStream.h"
@@ -35,28 +35,37 @@ ANTLRInputStream::ANTLRInputStream(std::istream &stream): ANTLRInputStream() {
3535
load(stream);
3636
}
3737

38-
void ANTLRInputStream::load(const std::string &input) {
39-
load(input.data(), input.size());
38+
void ANTLRInputStream::load(const std::string &input, bool lenient) {
39+
load(input.data(), input.size(), lenient);
4040
}
4141

42-
void ANTLRInputStream::load(const char *data, size_t length) {
42+
void ANTLRInputStream::load(const char *data, size_t length, bool lenient) {
4343
// Remove the UTF-8 BOM if present.
4444
const char *bom = "\xef\xbb\xbf";
45-
if (length >= 3 && strncmp(data, bom, 3) == 0)
46-
_data = antlrcpp::utf8_to_utf32(data + 3, data + length);
47-
else
48-
_data = antlrcpp::utf8_to_utf32(data, data + length);
45+
if (length >= 3 && strncmp(data, bom, 3) == 0) {
46+
data += 3;
47+
length -= 3;
48+
}
49+
if (lenient) {
50+
_data = Utf8::lenientDecode(std::string_view(data, length));
51+
} else {
52+
auto maybe_utf32 = Utf8::strictDecode(std::string_view(data, length));
53+
if (!maybe_utf32.has_value()) {
54+
throw IllegalArgumentException("UTF-8 string contains an illegal byte sequence");
55+
}
56+
_data = std::move(maybe_utf32).value();
57+
}
4958
p = 0;
5059
}
5160

52-
void ANTLRInputStream::load(std::istream &stream) {
61+
void ANTLRInputStream::load(std::istream &stream, bool lenient) {
5362
if (!stream.good() || stream.eof()) // No fail, bad or EOF.
5463
return;
5564

5665
_data.clear();
5766

5867
std::string s((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
59-
load(s.data(), s.length());
68+
load(s.data(), s.length(), lenient);
6069
}
6170

6271
void ANTLRInputStream::reset() {
@@ -144,7 +153,11 @@ std::string ANTLRInputStream::getText(const Interval &interval) {
144153
return "";
145154
}
146155

147-
return antlrcpp::utf32_to_utf8(_data.substr(start, count));
156+
auto maybe_utf8 = Utf8::strictEncode(std::u32string_view(_data).substr(start, count));
157+
if (!maybe_utf8.has_value()) {
158+
throw IllegalArgumentException("Input stream contains invalid Unicode code points");
159+
}
160+
return std::move(maybe_utf8).value();
148161
}
149162

150163
std::string ANTLRInputStream::getSourceName() const {
@@ -155,7 +168,11 @@ std::string ANTLRInputStream::getSourceName() const {
155168
}
156169

157170
std::string ANTLRInputStream::toString() const {
158-
return antlrcpp::utf32_to_utf8(_data);
171+
auto maybe_utf8 = Utf8::strictEncode(_data);
172+
if (!maybe_utf8.has_value()) {
173+
throw IllegalArgumentException("Input stream contains invalid Unicode code points");
174+
}
175+
return std::move(maybe_utf8).value();
159176
}
160177

161178
void ANTLRInputStream::InitializeInstanceFields() {

runtime/Cpp/runtime/src/ANTLRInputStream.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ namespace antlr4 {
1818
protected:
1919
/// The data being scanned.
2020
// UTF-32
21-
UTF32String _data;
21+
std::u32string _data;
2222

2323
/// 0..n-1 index into string of next char.
2424
size_t p;
@@ -34,9 +34,13 @@ namespace antlr4 {
3434
ANTLRInputStream(const char *data, size_t length);
3535
ANTLRInputStream(std::istream &stream);
3636

37-
virtual void load(const std::string &input);
38-
virtual void load(const char *data, size_t length);
39-
virtual void load(std::istream &stream);
37+
virtual void load(const std::string &input, bool lenient);
38+
virtual void load(const char *data, size_t length, bool lenient);
39+
virtual void load(std::istream &stream, bool lenient);
40+
41+
virtual void load(const std::string &input) { load(input, false); }
42+
virtual void load(const char *data, size_t length) { load(data, length, false); }
43+
virtual void load(std::istream &stream) { load(stream, false); }
4044

4145
/// Reset the stream so that it's in the same state it was
4246
/// when the object was created *except* the data array is not

runtime/Cpp/runtime/src/UnbufferedCharStream.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
#include "misc/Interval.h"
77
#include "Exceptions.h"
8-
#include "support/StringUtils.h"
8+
#include "support/Utf8.h"
99

1010
#include "UnbufferedCharStream.h"
1111

@@ -195,7 +195,11 @@ std::string UnbufferedCharStream::getText(const misc::Interval &interval) {
195195
}
196196
// convert from absolute to local index
197197
size_t i = interval.a - bufferStartIndex;
198-
return utf32_to_utf8(_data.substr(i, interval.length()));
198+
auto maybe_utf8 = Utf8::strictEncode(std::u32string_view(_data).substr(i, interval.length()));
199+
if (!maybe_utf8.has_value()) {
200+
throw IllegalArgumentException("Unbuffered stream contains invalid Unicode code points");
201+
}
202+
return std::move(maybe_utf8).value();
199203
}
200204

201205
size_t UnbufferedCharStream::getBufferStartIndex() const {

runtime/Cpp/runtime/src/antlr4-common.h

Lines changed: 9 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,38 +6,33 @@
66
#pragma once
77

88
#include <algorithm>
9-
#include <assert.h>
109
#include <atomic>
10+
#include <bitset>
11+
#include <cassert>
1112
#include <chrono>
13+
#include <climits>
14+
#include <cstdint>
15+
#include <cstdlib>
16+
#include <exception>
1217
#include <fstream>
1318
#include <iostream>
1419
#include <iterator>
1520
#include <limits>
16-
#include <limits.h>
1721
#include <list>
1822
#include <map>
1923
#include <memory>
24+
#include <mutex>
2025
#include <set>
21-
#include <stdarg.h>
22-
#include <stdint.h>
23-
#include <stdlib.h>
2426
#include <sstream>
2527
#include <stack>
2628
#include <string>
27-
#include <typeinfo>
29+
#include <string_view>
2830
#include <type_traits>
31+
#include <typeinfo>
2932
#include <unordered_map>
3033
#include <unordered_set>
3134
#include <utility>
3235
#include <vector>
33-
#include <mutex>
34-
#include <exception>
35-
#include <bitset>
36-
#include <condition_variable>
37-
38-
#ifndef USE_UTF8_INSTEAD_OF_CODECVT
39-
#include <codecvt>
40-
#endif
4136

4237
// Defines for the Guid class and other platform dependent stuff.
4338
#ifdef _WIN32
@@ -59,17 +54,6 @@
5954
typedef __int32 ssize_t;
6055
#endif
6156

62-
#if _MSC_VER >= 1900 && _MSC_VER < 2000
63-
// VS 2015 has a known bug when using std::codecvt_utf8<char32_t>
64-
// so we have to temporarily use __int32 instead.
65-
// https://connect.microsoft.com/VisualStudio/feedback/details/1403302/unresolved-external-when-using-codecvt-utf8
66-
typedef std::basic_string<__int32> i32string;
67-
68-
typedef i32string UTF32String;
69-
#else
70-
typedef std::u32string UTF32String;
71-
#endif
72-
7357
#ifdef ANTLR4CPP_EXPORTS
7458
#define ANTLR4CPP_PUBLIC __declspec(dllexport)
7559
#else
@@ -80,25 +64,14 @@
8064
#endif
8165
#endif
8266

83-
#if defined(_MSC_VER) && !defined(__clang__)
84-
// clang-cl should escape this to prevent [ignored-attributes].
85-
namespace std {
86-
class ANTLR4CPP_PUBLIC exception; // Prevents warning C4275 from MSVC.
87-
} // namespace std
88-
#endif
89-
9067
#elif defined(__APPLE__)
91-
typedef std::u32string UTF32String;
92-
9368
#define GUID_CFUUID
9469
#if __GNUC__ >= 4
9570
#define ANTLR4CPP_PUBLIC __attribute__ ((visibility ("default")))
9671
#else
9772
#define ANTLR4CPP_PUBLIC
9873
#endif
9974
#else
100-
typedef std::u32string UTF32String;
101-
10275
#define GUID_LIBUUID
10376
#if __GNUC__ >= 6
10477
#define ANTLR4CPP_PUBLIC __attribute__ ((visibility ("default")))

runtime/Cpp/runtime/src/support/StringUtils.cpp

Lines changed: 9 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7,40 +7,15 @@
77

88
namespace antlrcpp {
99

10-
void replaceAll(std::string& str, std::string const& from, std::string const& to)
11-
{
12-
if (from.empty())
13-
return;
14-
15-
size_t start_pos = 0;
16-
while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
17-
str.replace(start_pos, from.length(), to);
18-
start_pos += to.length(); // In case 'to' contains 'from', like replacing 'x' with 'yx'.
10+
void replaceAll(std::string& str, std::string_view from, std::string_view to) {
11+
if (from.empty())
12+
return;
13+
14+
size_t start_pos = 0;
15+
while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
16+
str.replace(start_pos, from.length(), to);
17+
start_pos += to.length(); // In case 'to' contains 'from', like replacing 'x' with 'yx'.
18+
}
1919
}
20-
}
21-
22-
std::string ws2s(std::wstring const& wstr) {
23-
#ifndef USE_UTF8_INSTEAD_OF_CODECVT
24-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
25-
std::string narrow = converter.to_bytes(wstr);
26-
#else
27-
std::string narrow;
28-
utf8::utf32to8(wstr.begin(), wstr.end(), std::back_inserter(narrow));
29-
#endif
30-
31-
return narrow;
32-
}
33-
34-
std::wstring s2ws(const std::string &str) {
35-
#ifndef USE_UTF8_INSTEAD_OF_CODECVT
36-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
37-
std::wstring wide = converter.from_bytes(str);
38-
#else
39-
std::wstring wide;
40-
utf8::utf8to32(str.begin(), str.end(), std::back_inserter(wide));
41-
#endif
42-
43-
return wide;
44-
}
4520

4621
} // namespace antlrcpp

runtime/Cpp/runtime/src/support/StringUtils.h

Lines changed: 1 addition & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -7,70 +7,8 @@
77

88
#include "antlr4-common.h"
99

10-
#ifdef USE_UTF8_INSTEAD_OF_CODECVT
11-
#include "utf8.h"
12-
#endif
13-
1410
namespace antlrcpp {
1511

16-
// For all conversions utf8 <-> utf32.
17-
// I wouldn't prefer wstring_convert because: according to
18-
// https://en.cppreference.com/w/cpp/locale/wstring_convert,
19-
// wstring_convert is deprecated in C++17.
20-
// utfcpp (https://github.com/nemtrif/utfcpp) is a substitution.
21-
#ifndef USE_UTF8_INSTEAD_OF_CODECVT
22-
// VS 2015 and VS 2017 have different bugs in std::codecvt_utf8<char32_t> (VS 2013 works fine).
23-
#if defined(_MSC_VER) && _MSC_VER >= 1900 && _MSC_VER < 2000
24-
typedef std::wstring_convert<std::codecvt_utf8<__int32>, __int32> UTF32Converter;
25-
#else
26-
typedef std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> UTF32Converter;
27-
#endif
28-
#endif
29-
30-
// The conversion functions fails in VS2017, so we explicitly use a workaround.
31-
template<typename T>
32-
inline std::string utf32_to_utf8(T const& data)
33-
{
34-
#ifndef USE_UTF8_INSTEAD_OF_CODECVT
35-
// Don't make the converter static or we have to serialize access to it.
36-
thread_local UTF32Converter converter;
37-
38-
#if defined(_MSC_VER) && _MSC_VER >= 1900 && _MSC_VER < 2000
39-
const auto p = reinterpret_cast<const int32_t *>(data.data());
40-
return converter.to_bytes(p, p + data.size());
41-
#else
42-
return converter.to_bytes(data);
43-
#endif
44-
#else
45-
std::string narrow;
46-
utf8::utf32to8(data.begin(), data.end(), std::back_inserter(narrow));
47-
return narrow;
48-
#endif
49-
}
50-
51-
inline UTF32String utf8_to_utf32(const char* first, const char* last)
52-
{
53-
#ifndef USE_UTF8_INSTEAD_OF_CODECVT
54-
thread_local UTF32Converter converter;
55-
56-
#if defined(_MSC_VER) && _MSC_VER >= 1900 && _MSC_VER < 2000
57-
auto r = converter.from_bytes(first, last);
58-
i32string s = reinterpret_cast<const int32_t *>(r.data());
59-
return s;
60-
#else
61-
std::u32string s = converter.from_bytes(first, last);
62-
return s;
63-
#endif
64-
#else
65-
UTF32String wide;
66-
utf8::utf8to32(first, last, std::back_inserter(wide));
67-
return wide;
68-
#endif
69-
}
70-
71-
void replaceAll(std::string &str, std::string const& from, std::string const& to);
12+
void replaceAll(std::string &str, std::string_view from, std::string_view to);
7213

73-
// string <-> wstring conversion (UTF-16), e.g. for use with Window's wide APIs.
74-
ANTLR4CPP_PUBLIC std::string ws2s(std::wstring const& wstr);
75-
ANTLR4CPP_PUBLIC std::wstring s2ws(std::string const& str);
7614
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/* Copyright (c) 2021 The ANTLR Project. All rights reserved.
2+
* Use of this file is governed by the BSD 3-clause license that
3+
* can be found in the LICENSE.txt file in the project root.
4+
*/
5+
6+
#pragma once
7+
8+
namespace antlrcpp {
9+
10+
class Unicode final {
11+
public:
12+
static constexpr char32_t REPLACEMENT_CHARACTER = 0xfffd;
13+
14+
static constexpr bool isValid(char32_t codePoint) {
15+
return codePoint < 0xd800 || (codePoint > 0xdfff && codePoint <= 0x10ffff);
16+
}
17+
18+
private:
19+
Unicode() = delete;
20+
21+
Unicode(const Unicode&) = delete;
22+
23+
Unicode& operator=(const Unicode&) = delete;
24+
};
25+
26+
}

0 commit comments

Comments
 (0)