Skip to content

Commit 2349fb2

Browse files
committed
[C++] Implement standalone Unicode encoding and decoding handling
1 parent 82c4417 commit 2349fb2

File tree

16 files changed

+540
-222
lines changed

16 files changed

+540
-222
lines changed

.circleci/scripts/run-tests-cpp.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22

33
set -euo pipefail
44

5+
pushd runtime/Cpp
6+
ctest
7+
popd
8+
59
pushd runtime-testsuite
610
echo "running maven tests..."
711
if [ $GROUP == "LEXER" ]; then

.github/scripts/run-tests-cpp.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,18 @@
22

33
set -euo pipefail
44

5-
cd runtime-testsuite/
5+
pushd runtime/Cpp
6+
ctest
7+
popd
68

9+
pushd runtime-testsuite
710
if [ $GROUP == "LEXER" ]; then
811
mvn -q -Dgroups="org.antlr.v4.test.runtime.category.LexerTests" -Dtest=cpp.** test
912
elif [ $GROUP == "PARSER" ]; then
1013
mvn -q -Dgroups="org.antlr.v4.test.runtime.category.ParserTests" -Dtest=cpp.** test
1114
elif [ $GROUP == "RECURSION" ]; then
1215
mvn -q -Dgroups="org.antlr.v4.test.runtime.category.LeftRecursionTests" -Dtest=cpp.** test
1316
else
14-
mvn -q -Dtest=cpp.* test
17+
mvn -q -Dtest=cpp.* test
1518
fi
19+
popd

.travis/run-tests-cpp.sh

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,18 @@
22

33
set -euo pipefail
44

5+
pushd runtime/Cpp
6+
ctest
7+
popd
8+
9+
pushd runtime-testsuite
510
if [ $GROUP == "LEXER" ]; then
6-
mvn -q -Dgroups="org.antlr.v4.test.runtime.category.LexerTests" -Dtest=cpp.* test
11+
mvn -q -Dgroups="org.antlr.v4.test.runtime.category.LexerTests" -Dtest=cpp.* test
712
elif [ $GROUP == "PARSER" ]; then
8-
mvn -q -Dgroups="org.antlr.v4.test.runtime.category.ParserTests" -Dtest=cpp.* test
13+
mvn -q -Dgroups="org.antlr.v4.test.runtime.category.ParserTests" -Dtest=cpp.* test
914
elif [ $GROUP == "RECURSION" ]; then
10-
mvn -q -Dgroups="org.antlr.v4.test.runtime.category.LeftRecursionTests" -Dtest=cpp.* test
15+
mvn -q -Dgroups="org.antlr.v4.test.runtime.category.LeftRecursionTests" -Dtest=cpp.* test
1116
else
12-
mvn -q -Dtest=cpp.* test
17+
mvn -q -Dtest=cpp.* test
1318
fi
14-
19+
popd

runtime/Cpp/CMakeLists.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# -*- mode:cmake -*-
2-
cmake_minimum_required (VERSION 2.8)
3-
# 2.8 needed because of ExternalProject
2+
cmake_minimum_required (VERSION 3.14)
3+
# 3.14 needed because of FetchContent
4+
5+
enable_testing()
46

57
# Detect build type, fall back to release and throw a warning if the user didn't specify any
68
if(NOT CMAKE_BUILD_TYPE)

runtime/Cpp/deploy-macos.sh

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,6 @@
44
rm -f -R antlr4-runtime build lib 2> /dev/null
55
rm antlr4-cpp-runtime-macos.zip 2> /dev/null
66

7-
# Get utf8 dependency.
8-
mkdir -p runtime/thirdparty 2> /dev/null
9-
pushd runtime/thirdparty
10-
if [ ! -d utfcpp ]
11-
then
12-
git clone https://github.com/nemtrif/utfcpp.git utfcpp
13-
pushd utfcpp
14-
git checkout tags/v3.1.1
15-
popd
16-
fi
17-
popd
18-
197
# Binaries
208
xcodebuild -project runtime/antlrcpp.xcodeproj \
219
-target antlr4 \
@@ -35,9 +23,6 @@ rm -f -R antlr4-runtime
3523
pushd runtime/src
3624
find . -name '*.h' | cpio -pdm ../../antlr4-runtime
3725
popd
38-
pushd runtime/thirdparty/utfcpp/source
39-
find . -name '*.h' | cpio -pdm ../../../../antlr4-runtime
40-
popd
4126

4227
# Zip up and clean up
4328
zip -r antlr4-cpp-runtime-macos.zip antlr4-runtime lib

runtime/Cpp/runtime/CMakeLists.txt

Lines changed: 25 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -35,45 +35,35 @@ add_custom_target(make_lib_output_dir ALL
3535
add_dependencies(antlr4_shared make_lib_output_dir)
3636
add_dependencies(antlr4_static make_lib_output_dir)
3737

38-
find_package(utf8cpp QUIET)
38+
include(FetchContent)
3939

40-
set(INSTALL_utf8cpp FALSE)
40+
FetchContent_Declare(
41+
googletest
42+
URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
43+
)
4144

42-
if (utf8cpp_FOUND)
43-
target_link_libraries(antlr4_shared utf8cpp)
44-
target_link_libraries(antlr4_static utf8cpp)
45-
else()
45+
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
4646

47-
# older utf8cpp doesn't define the package above
48-
find_path(utf8cpp_HEADER utf8.h
49-
PATH_SUFFIXES utf8cpp
50-
)
47+
FetchContent_MakeAvailable(googletest)
5148

52-
if (utf8cpp_HEADER)
53-
include_directories(${utf8cpp_HEADER})
54-
else()
55-
include(${CMAKE_ROOT}/Modules/ExternalProject.cmake)
56-
set(THIRDPARTY_DIR ${CMAKE_BINARY_DIR}/runtime/thirdparty)
57-
set(UTFCPP_DIR ${THIRDPARTY_DIR}/utfcpp)
58-
ExternalProject_Add(
59-
utf8cpp
60-
GIT_REPOSITORY "https://github.com/nemtrif/utfcpp"
61-
GIT_TAG "v3.1.1"
62-
SOURCE_DIR ${UTFCPP_DIR}
63-
UPDATE_DISCONNECTED 1
64-
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${UTFCPP_DIR}/install -DUTF8_TESTS=off -DUTF8_SAMPLES=off
65-
STEP_TARGETS build)
66-
67-
include_directories(
68-
${UTFCPP_DIR}/install/include/utf8cpp
69-
${UTFCPP_DIR}/install/include/utf8cpp/utf8
70-
)
71-
72-
add_dependencies(antlr4_shared utf8cpp)
73-
add_dependencies(antlr4_static utf8cpp)
74-
set(INSTALL_utf8cpp TRUE)
75-
endif()
76-
endif()
49+
file(GLOB libantlrcpp_TESTS
50+
"${PROJECT_SOURCE_DIR}/runtime/tests/*.cpp"
51+
)
52+
53+
add_executable(
54+
antlr4_tests
55+
${libantlrcpp_TESTS}
56+
)
57+
58+
target_link_libraries(
59+
antlr4_tests
60+
antlr4_static
61+
gtest_main
62+
)
63+
64+
include(GoogleTest)
65+
66+
gtest_discover_tests(antlr4_tests)
7767

7868
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
7969
target_link_libraries(antlr4_shared ${UUID_LIBRARIES})
@@ -152,16 +142,3 @@ install(DIRECTORY "${PROJECT_SOURCE_DIR}/runtime/src/"
152142
COMPONENT dev
153143
FILES_MATCHING PATTERN "*.h"
154144
)
155-
156-
if (INSTALL_utf8cpp)
157-
install(FILES "${UTFCPP_DIR}/source/utf8.h"
158-
DESTINATION "include/antlr4-runtime")
159-
install(DIRECTORY "${UTFCPP_DIR}/source/utf8"
160-
DESTINATION "include/antlr4-runtime"
161-
COMPONENT dev
162-
FILES_MATCHING PATTERN "*.h"
163-
)
164-
endif()
165-
166-
167-

runtime/Cpp/runtime/src/ANTLRInputStream.cpp

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#include "misc/Interval.h"
1010
#include "IntStream.h"
1111

12-
#include "support/StringUtils.h"
12+
#include "support/Utf8.h"
1313
#include "support/CPPUtils.h"
1414

1515
#include "ANTLRInputStream.h"
@@ -35,28 +35,37 @@ ANTLRInputStream::ANTLRInputStream(std::istream &stream): ANTLRInputStream() {
3535
load(stream);
3636
}
3737

38-
void ANTLRInputStream::load(const std::string &input) {
39-
load(input.data(), input.size());
38+
void ANTLRInputStream::load(const std::string &input, bool lenient) {
39+
load(input.data(), input.size(), lenient);
4040
}
4141

42-
void ANTLRInputStream::load(const char *data, size_t length) {
42+
void ANTLRInputStream::load(const char *data, size_t length, bool lenient) {
4343
// Remove the UTF-8 BOM if present.
4444
const char *bom = "\xef\xbb\xbf";
45-
if (length >= 3 && strncmp(data, bom, 3) == 0)
46-
_data = antlrcpp::utf8_to_utf32(data + 3, data + length);
47-
else
48-
_data = antlrcpp::utf8_to_utf32(data, data + length);
45+
if (length >= 3 && strncmp(data, bom, 3) == 0) {
46+
data += 3;
47+
length -= 3;
48+
}
49+
if (lenient) {
50+
_data = Utf8::lenientDecode(std::string_view(data, length));
51+
} else {
52+
auto maybe_utf32 = Utf8::strictDecode(std::string_view(data, length));
53+
if (!maybe_utf32.has_value()) {
54+
throw IllegalArgumentException("UTF-8 string contains an illegal byte sequence");
55+
}
56+
_data = std::move(maybe_utf32).value();
57+
}
4958
p = 0;
5059
}
5160

52-
void ANTLRInputStream::load(std::istream &stream) {
61+
void ANTLRInputStream::load(std::istream &stream, bool lenient) {
5362
if (!stream.good() || stream.eof()) // No fail, bad or EOF.
5463
return;
5564

5665
_data.clear();
5766

5867
std::string s((std::istreambuf_iterator<char>(stream)), std::istreambuf_iterator<char>());
59-
load(s.data(), s.length());
68+
load(s.data(), s.length(), lenient);
6069
}
6170

6271
void ANTLRInputStream::reset() {
@@ -144,7 +153,11 @@ std::string ANTLRInputStream::getText(const Interval &interval) {
144153
return "";
145154
}
146155

147-
return antlrcpp::utf32_to_utf8(_data.substr(start, count));
156+
auto maybeUtf8 = Utf8::strictEncode(std::u32string_view(_data).substr(start, count));
157+
if (!maybeUtf8.has_value()) {
158+
throw IllegalArgumentException("Input stream contains invalid Unicode code points");
159+
}
160+
return std::move(maybeUtf8).value();
148161
}
149162

150163
std::string ANTLRInputStream::getSourceName() const {
@@ -155,7 +168,11 @@ std::string ANTLRInputStream::getSourceName() const {
155168
}
156169

157170
std::string ANTLRInputStream::toString() const {
158-
return antlrcpp::utf32_to_utf8(_data);
171+
auto maybeUtf8 = Utf8::strictEncode(_data);
172+
if (!maybeUtf8.has_value()) {
173+
throw IllegalArgumentException("Input stream contains invalid Unicode code points");
174+
}
175+
return std::move(maybeUtf8).value();
159176
}
160177

161178
void ANTLRInputStream::InitializeInstanceFields() {

runtime/Cpp/runtime/src/ANTLRInputStream.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ namespace antlr4 {
1818
protected:
1919
/// The data being scanned.
2020
// UTF-32
21-
UTF32String _data;
21+
std::u32string _data;
2222

2323
/// 0..n-1 index into string of next char
2424
size_t p;
@@ -34,9 +34,13 @@ namespace antlr4 {
3434
ANTLRInputStream(const char *data, size_t length);
3535
ANTLRInputStream(std::istream &stream);
3636

37-
virtual void load(const std::string &input);
38-
virtual void load(const char *data, size_t length);
39-
virtual void load(std::istream &stream);
37+
virtual void load(const std::string &input, bool lenient);
38+
virtual void load(const char *data, size_t length, bool lenient);
39+
virtual void load(std::istream &stream, bool lenient);
40+
41+
virtual void load(const std::string &input) { load(input, false); }
42+
virtual void load(const char *data, size_t length) { load(data, length, false); }
43+
virtual void load(std::istream &stream) { load(stream, false); }
4044

4145
/// Reset the stream so that it's in the same state it was
4246
/// when the object was created *except* the data array is not

runtime/Cpp/runtime/src/UnbufferedCharStream.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
#include "misc/Interval.h"
77
#include "Exceptions.h"
8-
#include "support/StringUtils.h"
8+
#include "support/Utf8.h"
99

1010
#include "UnbufferedCharStream.h"
1111

@@ -195,7 +195,11 @@ std::string UnbufferedCharStream::getText(const misc::Interval &interval) {
195195
}
196196
// convert from absolute to local index
197197
size_t i = interval.a - bufferStartIndex;
198-
return utf32_to_utf8(_data.substr(i, interval.length()));
198+
auto maybeUtf8 = Utf8::strictEncode(std::u32string_view(_data).substr(i, interval.length()));
199+
if (!maybeUtf8.has_value()) {
200+
throw IllegalArgumentException("Unbuffered stream contains invalid Unicode code points");
201+
}
202+
return std::move(maybeUtf8).value();
199203
}
200204

201205
size_t UnbufferedCharStream::getBufferStartIndex() const {

0 commit comments

Comments
 (0)