Skip to content

Commit aa5b233

Browse files
committed
[C++] Implement standalone Unicode encoding and decoding handling
1 parent 66d6679 commit aa5b233

File tree

11 files changed

+384
-219
lines changed

11 files changed

+384
-219
lines changed

runtime/Cpp/deploy-macos.sh

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,6 @@
44
rm -f -R antlr4-runtime build lib 2> /dev/null
55
rm antlr4-cpp-runtime-macos.zip 2> /dev/null
66

7-
# Get utf8 dependency.
8-
mkdir -p runtime/thirdparty 2> /dev/null
9-
pushd runtime/thirdparty
10-
if [ ! -d utfcpp ]
11-
then
12-
git clone https://github.com/nemtrif/utfcpp.git utfcpp
13-
pushd utfcpp
14-
git checkout tags/v3.1.1
15-
popd
16-
fi
17-
popd
18-
197
# Binaries
208
xcodebuild -project runtime/antlrcpp.xcodeproj \
219
-target antlr4 \
@@ -35,9 +23,6 @@ rm -f -R antlr4-runtime
3523
pushd runtime/src
3624
find . -name '*.h' | cpio -pdm ../../antlr4-runtime
3725
popd
38-
pushd runtime/thirdparty/utfcpp/source
39-
find . -name '*.h' | cpio -pdm ../../../../antlr4-runtime
40-
popd
4126

4227
# Zip up and clean up
4328
zip -r antlr4-cpp-runtime-macos.zip antlr4-runtime lib

runtime/Cpp/runtime/CMakeLists.txt

Lines changed: 0 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -35,46 +35,6 @@ add_custom_target(make_lib_output_dir ALL
3535
add_dependencies(antlr4_shared make_lib_output_dir)
3636
add_dependencies(antlr4_static make_lib_output_dir)
3737

38-
find_package(utf8cpp QUIET)
39-
40-
set(INSTALL_utf8cpp FALSE)
41-
42-
if (utf8cpp_FOUND)
43-
target_link_libraries(antlr4_shared utf8cpp)
44-
target_link_libraries(antlr4_static utf8cpp)
45-
else()
46-
47-
# older utf8cpp doesn't define the package above
48-
find_path(utf8cpp_HEADER utf8.h
49-
PATH_SUFFIXES utf8cpp
50-
)
51-
52-
if (utf8cpp_HEADER)
53-
include_directories(${utf8cpp_HEADER})
54-
else()
55-
include(${CMAKE_ROOT}/Modules/ExternalProject.cmake)
56-
set(THIRDPARTY_DIR ${CMAKE_BINARY_DIR}/runtime/thirdparty)
57-
set(UTFCPP_DIR ${THIRDPARTY_DIR}/utfcpp)
58-
ExternalProject_Add(
59-
utf8cpp
60-
GIT_REPOSITORY "https://github.com/nemtrif/utfcpp"
61-
GIT_TAG "v3.1.1"
62-
SOURCE_DIR ${UTFCPP_DIR}
63-
UPDATE_DISCONNECTED 1
64-
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${UTFCPP_DIR}/install -DUTF8_TESTS=off -DUTF8_SAMPLES=off
65-
STEP_TARGETS build)
66-
67-
include_directories(
68-
${UTFCPP_DIR}/install/include/utf8cpp
69-
${UTFCPP_DIR}/install/include/utf8cpp/utf8
70-
)
71-
72-
add_dependencies(antlr4_shared utf8cpp)
73-
add_dependencies(antlr4_static utf8cpp)
74-
set(INSTALL_utf8cpp TRUE)
75-
endif()
76-
endif()
77-
7838
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
7939
target_link_libraries(antlr4_shared ${UUID_LIBRARIES})
8040
target_link_libraries(antlr4_static ${UUID_LIBRARIES})
@@ -152,16 +112,3 @@ install(DIRECTORY "${PROJECT_SOURCE_DIR}/runtime/src/"
152112
COMPONENT dev
153113
FILES_MATCHING PATTERN "*.h"
154114
)
155-
156-
if (INSTALL_utf8cpp)
157-
install(FILES "${UTFCPP_DIR}/source/utf8.h"
158-
DESTINATION "include/antlr4-runtime")
159-
install(DIRECTORY "${UTFCPP_DIR}/source/utf8"
160-
DESTINATION "include/antlr4-runtime"
161-
COMPONENT dev
162-
FILES_MATCHING PATTERN "*.h"
163-
)
164-
endif()
165-
166-
167-

runtime/Cpp/runtime/src/ANTLRInputStream.cpp

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#include "misc/Interval.h"
1010
#include "IntStream.h"
1111

12-
#include "support/StringUtils.h"
12+
#include "support/Utf8.h"
1313
#include "support/CPPUtils.h"
1414

1515
#include "ANTLRInputStream.h"
@@ -35,28 +35,37 @@ ANTLRInputStream::ANTLRInputStream(std::istream &stream): ANTLRInputStream() {
3535
load(stream);
3636
}
3737

38-
void ANTLRInputStream::load(const std::string &input) {
39-
load(input.data(), input.size());
38+
void ANTLRInputStream::load(const std::string &input, bool lenient) {
39+
load(input.data(), input.size(), lenient);
4040
}
4141

42-
void ANTLRInputStream::load(const char *data, size_t length) {
42+
void ANTLRInputStream::load(const char *data, size_t length, bool lenient) {
4343
// Remove the UTF-8 BOM if present.
4444
const char *bom = "\xef\xbb\xbf";
45-
if (length >= 3 && strncmp(data, bom, 3) == 0)
46-
_data = antlrcpp::utf8_to_utf32(data + 3, data + length);
47-
else
48-
_data = antlrcpp::utf8_to_utf32(data, data + length);
45+
if (length >= 3 && strncmp(data, bom, 3) == 0) {
46+
data += 3;
47+
length -= 3;
48+
}
49+
if (lenient) {
50+
_data = Utf8::lenientDecode(std::string_view(data, length));
51+
} else {
52+
auto maybe_utf32 = Utf8::strictDecode(std::string_view(data, length));
53+
if (!maybe_utf32.has_value()) {
54+
throw IllegalArgumentException("UTF-8 string contains an illegal byte sequence");
55+
}
56+
_data = std::move(maybe_utf32).value();
57+
}
4958
p = 0;
5059
}
5160

52-
void ANTLRInputStream::load(std::istream &stream) {
61+
void ANTLRInputStream::load(std::istream &stream, bool lenient) {
5362
if (!stream.good() || stream.eof()) // No fail, bad or EOF.
5463
return;
5564

5665
_data.clear();
5766

5867
std::string s((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
59-
load(s.data(), s.length());
68+
load(s.data(), s.length(), lenient);
6069
}
6170

6271
void ANTLRInputStream::reset() {
@@ -144,7 +153,11 @@ std::string ANTLRInputStream::getText(const Interval &interval) {
144153
return "";
145154
}
146155

147-
return antlrcpp::utf32_to_utf8(_data.substr(start, count));
156+
auto maybe_utf8 = Utf8::strictEncode(std::u32string_view(_data).substr(start, count));
157+
if (!maybe_utf8.has_value()) {
158+
throw IllegalArgumentException("Input stream contains invalid Unicode code points");
159+
}
160+
return std::move(maybe_utf8).value();
148161
}
149162

150163
std::string ANTLRInputStream::getSourceName() const {
@@ -155,7 +168,11 @@ std::string ANTLRInputStream::getSourceName() const {
155168
}
156169

157170
std::string ANTLRInputStream::toString() const {
158-
return antlrcpp::utf32_to_utf8(_data);
171+
auto maybe_utf8 = Utf8::strictEncode(_data);
172+
if (!maybe_utf8.has_value()) {
173+
throw IllegalArgumentException("Input stream contains invalid Unicode code points");
174+
}
175+
return std::move(maybe_utf8).value();
159176
}
160177

161178
void ANTLRInputStream::InitializeInstanceFields() {

runtime/Cpp/runtime/src/ANTLRInputStream.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ namespace antlr4 {
1818
protected:
1919
/// The data being scanned.
2020
// UTF-32
21-
UTF32String _data;
21+
std::u32string _data;
2222

2323
/// 0..n-1 index into string of next char
2424
size_t p;
@@ -34,9 +34,13 @@ namespace antlr4 {
3434
ANTLRInputStream(const char *data, size_t length);
3535
ANTLRInputStream(std::istream &stream);
3636

37-
virtual void load(const std::string &input);
38-
virtual void load(const char *data, size_t length);
39-
virtual void load(std::istream &stream);
37+
virtual void load(const std::string &input, bool lenient);
38+
virtual void load(const char *data, size_t length, bool lenient);
39+
virtual void load(std::istream &stream, bool lenient);
40+
41+
virtual void load(const std::string &input) { load(input, false); }
42+
virtual void load(const char *data, size_t length) { load(data, length, false); }
43+
virtual void load(std::istream &stream) { load(stream, false); }
4044

4145
/// Reset the stream so that it's in the same state it was
4246
/// when the object was created *except* the data array is not

runtime/Cpp/runtime/src/UnbufferedCharStream.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
#include "misc/Interval.h"
77
#include "Exceptions.h"
8-
#include "support/StringUtils.h"
8+
#include "support/Utf8.h"
99

1010
#include "UnbufferedCharStream.h"
1111

@@ -195,7 +195,11 @@ std::string UnbufferedCharStream::getText(const misc::Interval &interval) {
195195
}
196196
// convert from absolute to local index
197197
size_t i = interval.a - bufferStartIndex;
198-
return utf32_to_utf8(_data.substr(i, interval.length()));
198+
auto maybe_utf8 = Utf8::strictEncode(std::u32string_view(_data).substr(i, interval.length()));
199+
if (!maybe_utf8.has_value()) {
200+
throw IllegalArgumentException("Unbuffered stream contains invalid Unicode code points");
201+
}
202+
return std::move(maybe_utf8).value();
199203
}
200204

201205
size_t UnbufferedCharStream::getBufferStartIndex() const {

runtime/Cpp/runtime/src/antlr4-common.h

Lines changed: 9 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,38 +6,33 @@
66
#pragma once
77

88
#include <algorithm>
9-
#include <assert.h>
109
#include <atomic>
10+
#include <bitset>
11+
#include <cassert>
1112
#include <chrono>
13+
#include <climits>
14+
#include <cstdint>
15+
#include <cstdlib>
16+
#include <exception>
1217
#include <fstream>
1318
#include <iostream>
1419
#include <iterator>
1520
#include <limits>
16-
#include <limits.h>
1721
#include <list>
1822
#include <map>
1923
#include <memory>
24+
#include <mutex>
2025
#include <set>
21-
#include <stdarg.h>
22-
#include <stdint.h>
23-
#include <stdlib.h>
2426
#include <sstream>
2527
#include <stack>
2628
#include <string>
27-
#include <typeinfo>
29+
#include <string_view>
2830
#include <type_traits>
31+
#include <typeinfo>
2932
#include <unordered_map>
3033
#include <unordered_set>
3134
#include <utility>
3235
#include <vector>
33-
#include <mutex>
34-
#include <exception>
35-
#include <bitset>
36-
#include <condition_variable>
37-
38-
#ifndef USE_UTF8_INSTEAD_OF_CODECVT
39-
#include <codecvt>
40-
#endif
4136

4237
// Defines for the Guid class and other platform dependent stuff.
4338
#ifdef _WIN32
@@ -59,17 +54,6 @@
5954
typedef __int32 ssize_t;
6055
#endif
6156

62-
#if _MSC_VER >= 1900 && _MSC_VER < 2000
63-
// VS 2015 has a known bug when using std::codecvt_utf8<char32_t>
64-
// so we have to temporarily use __int32 instead.
65-
// https://connect.microsoft.com/VisualStudio/feedback/details/1403302/unresolved-external-when-using-codecvt-utf8
66-
typedef std::basic_string<__int32> i32string;
67-
68-
typedef i32string UTF32String;
69-
#else
70-
typedef std::u32string UTF32String;
71-
#endif
72-
7357
#ifdef ANTLR4CPP_EXPORTS
7458
#define ANTLR4CPP_PUBLIC __declspec(dllexport)
7559
#else
@@ -80,25 +64,14 @@
8064
#endif
8165
#endif
8266

83-
#if defined(_MSC_VER) && !defined(__clang__)
84-
// clang-cl should escape this to prevent [ignored-attributes].
85-
namespace std {
86-
class ANTLR4CPP_PUBLIC exception; // Prevents warning C4275 from MSVC.
87-
} // namespace std
88-
#endif
89-
9067
#elif defined(__APPLE__)
91-
typedef std::u32string UTF32String;
92-
9368
#define GUID_CFUUID
9469
#if __GNUC__ >= 4
9570
#define ANTLR4CPP_PUBLIC __attribute__ ((visibility ("default")))
9671
#else
9772
#define ANTLR4CPP_PUBLIC
9873
#endif
9974
#else
100-
typedef std::u32string UTF32String;
101-
10275
#define GUID_LIBUUID
10376
#if __GNUC__ >= 6
10477
#define ANTLR4CPP_PUBLIC __attribute__ ((visibility ("default")))

runtime/Cpp/runtime/src/support/StringUtils.cpp

Lines changed: 9 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7,40 +7,15 @@
77

88
namespace antlrcpp {
99

10-
void replaceAll(std::string& str, std::string const& from, std::string const& to)
11-
{
12-
if (from.empty())
13-
return;
14-
15-
size_t start_pos = 0;
16-
while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
17-
str.replace(start_pos, from.length(), to);
18-
start_pos += to.length(); // In case 'to' contains 'from', like replacing 'x' with 'yx'.
10+
void replaceAll(std::string& str, std::string_view from, std::string_view to) {
11+
if (from.empty())
12+
return;
13+
14+
size_t start_pos = 0;
15+
while ((start_pos = str.find(from, start_pos)) != std::string::npos) {
16+
str.replace(start_pos, from.length(), to);
17+
start_pos += to.length(); // In case 'to' contains 'from', like replacing 'x' with 'yx'.
18+
}
1919
}
20-
}
21-
22-
std::string ws2s(std::wstring const& wstr) {
23-
#ifndef USE_UTF8_INSTEAD_OF_CODECVT
24-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
25-
std::string narrow = converter.to_bytes(wstr);
26-
#else
27-
std::string narrow;
28-
utf8::utf32to8(wstr.begin(), wstr.end(), std::back_inserter(narrow));
29-
#endif
30-
31-
return narrow;
32-
}
33-
34-
std::wstring s2ws(const std::string &str) {
35-
#ifndef USE_UTF8_INSTEAD_OF_CODECVT
36-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> converter;
37-
std::wstring wide = converter.from_bytes(str);
38-
#else
39-
std::wstring wide;
40-
utf8::utf8to32(str.begin(), str.end(), std::back_inserter(wide));
41-
#endif
42-
43-
return wide;
44-
}
4520

4621
} // namespace antlrcpp

0 commit comments

Comments
 (0)