Skip to content

Commit bff3015

Browse files
authored
Adding parquet transcoding example (#15420)
This PR adds a new example `parquet_io` to `libcudf/cpp/examples` instrumenting reading and writing parquet files with different column encodings (same for all columns for now) and compressions to close #15344. The example may be elaborated and/or evolved as needed. #15348 should be merged before this PR to get all CMake updates needed to successfully build and run this example. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Ray Douglass (https://github.com/raydouglass) URL: #15420
1 parent 425a5da commit bff3015

File tree

6 files changed

+358
-0
lines changed

6 files changed

+358
-0
lines changed

ci/run_cudf_examples.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,7 @@ compute-sanitizer --tool memcheck custom_optimized names.csv
2323
compute-sanitizer --tool memcheck custom_prealloc names.csv
2424
compute-sanitizer --tool memcheck custom_with_malloc names.csv
2525

26+
compute-sanitizer --tool memcheck parquet_io
27+
compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE
28+
2629
exit ${EXITCODE}

cpp/examples/build.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,4 @@ build_example() {
5959
build_example basic
6060
build_example strings
6161
build_example nested_types
62+
build_example parquet_io
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Copyright (c) 2024, NVIDIA CORPORATION.
2+
3+
cmake_minimum_required(VERSION 3.26.4)
4+
5+
include(../set_cuda_architecture.cmake)
6+
7+
# initialize cuda architecture
8+
rapids_cuda_init_architectures(parquet_io)
9+
rapids_cuda_set_architectures(RAPIDS)
10+
11+
project(
12+
parquet_io
13+
VERSION 0.0.1
14+
LANGUAGES CXX CUDA
15+
)
16+
17+
include(../fetch_dependencies.cmake)
18+
19+
# Configure your project here
20+
add_executable(parquet_io parquet_io.cpp)
21+
target_link_libraries(parquet_io PRIVATE cudf::cudf)
22+
target_compile_features(parquet_io PRIVATE cxx_std_17)
23+
24+
install(TARGETS parquet_io DESTINATION bin/examples/libcudf)
25+
install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf)
614 Bytes
Binary file not shown.
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
/*
2+
* Copyright (c) 2024, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "parquet_io.hpp"
18+
19+
/**
20+
* @file parquet_io.cpp
21+
* @brief Demonstrates usage of the libcudf APIs to read and write
22+
* parquet file format with different encodings and compression types
23+
*
24+
* The following encoding and compression types are demonstrated:
25+
* Encoding Types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,
26+
* DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY
27+
*
28+
* Compression Types: NONE, AUTO, SNAPPY, LZ4, ZSTD
29+
*
30+
*/
31+
32+
/**
33+
* @brief Read parquet input from file
34+
*
35+
* @param filepath path to input parquet file
36+
* @return cudf::io::table_with_metadata
37+
*/
38+
cudf::io::table_with_metadata read_parquet(std::string filepath)
39+
{
40+
auto source_info = cudf::io::source_info(filepath);
41+
auto builder = cudf::io::parquet_reader_options::builder(source_info);
42+
auto options = builder.build();
43+
return cudf::io::read_parquet(options);
44+
}
45+
46+
/**
47+
* @brief Write parquet output to file
48+
*
49+
* @param input table to write
50+
* @param metadata metadata of input table read by parquet reader
51+
* @param filepath path to output parquet file
52+
* @param stats_level optional page size stats level
53+
*/
54+
void write_parquet(cudf::table_view input,
55+
cudf::io::table_metadata metadata,
56+
std::string filepath,
57+
cudf::io::column_encoding encoding,
58+
cudf::io::compression_type compression,
59+
std::optional<cudf::io::statistics_freq> stats_level)
60+
{
61+
// write the data for inspection
62+
auto sink_info = cudf::io::sink_info(filepath);
63+
auto builder = cudf::io::parquet_writer_options::builder(sink_info, input);
64+
auto table_metadata = cudf::io::table_input_metadata{metadata};
65+
66+
std::for_each(table_metadata.column_metadata.begin(),
67+
table_metadata.column_metadata.end(),
68+
[=](auto& col_meta) { col_meta.set_encoding(encoding); });
69+
70+
builder.metadata(table_metadata);
71+
auto options = builder.build();
72+
options.set_compression(compression);
73+
// Either use the input stats level or don't write stats
74+
options.set_stats_level(stats_level.value_or(cudf::io::statistics_freq::STATISTICS_NONE));
75+
76+
// write parquet data
77+
cudf::io::write_parquet(options);
78+
}
79+
80+
/**
81+
* @brief Main for nested_types examples
82+
*
83+
* Command line parameters:
84+
* 1. parquet input file name/path (default: "example.parquet")
85+
* 2. parquet output file name/path (default: "output.parquet")
86+
* 3. encoding type for columns (default: "DELTA_BINARY_PACKED")
87+
* 4. compression type (default: "ZSTD")
88+
* 5. optional: use page size stats metadata (default: "NO")
89+
*
90+
* Example invocation from directory `cudf/cpp/examples/parquet_io`:
91+
* ./build/parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD
92+
*
93+
*/
94+
int main(int argc, char const** argv)
95+
{
96+
std::string input_filepath;
97+
std::string output_filepath;
98+
cudf::io::column_encoding encoding;
99+
cudf::io::compression_type compression;
100+
std::optional<cudf::io::statistics_freq> page_stats;
101+
102+
switch (argc) {
103+
case 1:
104+
input_filepath = "example.parquet";
105+
output_filepath = "output.parquet";
106+
encoding = get_encoding_type("DELTA_BINARY_PACKED");
107+
compression = get_compression_type("ZSTD");
108+
break;
109+
case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]];
110+
case 5:
111+
input_filepath = argv[1];
112+
output_filepath = argv[2];
113+
encoding = get_encoding_type(argv[3]);
114+
compression = get_compression_type(argv[4]);
115+
break;
116+
default:
117+
throw std::runtime_error(
118+
"Either provide all command-line arguments, or none to use defaults\n");
119+
}
120+
121+
// Create and use a memory pool
122+
bool is_pool_used = true;
123+
auto resource = create_memory_resource(is_pool_used);
124+
rmm::mr::set_current_device_resource(resource.get());
125+
126+
// Read input parquet file
127+
// We do not want to time the initial read time as it may include
128+
// time for nvcomp, cufile loading and RMM growth
129+
std::cout << std::endl << "Reading " << input_filepath << "..." << std::endl;
130+
std::cout << "Note: Not timing the initial parquet read as it may include\n"
131+
"times for nvcomp, cufile loading and RMM growth."
132+
<< std::endl
133+
<< std::endl;
134+
auto [input, metadata] = read_parquet(input_filepath);
135+
136+
// Status string to indicate if page stats are set to be written or not
137+
auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats";
138+
// Write parquet file with the specified encoding and compression
139+
std::cout << "Writing " << output_filepath << " with encoding, compression and "
140+
<< page_stat_string << ".." << std::endl;
141+
142+
// `timer` is automatically started here
143+
Timer timer;
144+
write_parquet(input->view(), metadata, output_filepath, encoding, compression, page_stats);
145+
timer.print_elapsed_millis();
146+
147+
// Read the parquet file written with encoding and compression
148+
std::cout << "Reading " << output_filepath << "..." << std::endl;
149+
150+
// Reset the timer
151+
timer.reset();
152+
auto [transcoded_input, transcoded_metadata] = read_parquet(output_filepath);
153+
timer.print_elapsed_millis();
154+
155+
// Check for validity
156+
try {
157+
// Left anti-join the original and transcoded tables
158+
// identical tables should not throw an exception and
159+
// return an empty indices vector
160+
auto const indices = cudf::left_anti_join(
161+
input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get());
162+
163+
// No exception thrown, check indices
164+
auto const valid = indices->size() == 0;
165+
std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl;
166+
} catch (std::exception& e) {
167+
std::cerr << e.what() << std::endl << std::endl;
168+
std::cout << "Transcoding valid: false" << std::endl;
169+
}
170+
171+
return 0;
172+
}
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
/*
2+
* Copyright (c) 2024, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#pragma once
18+
19+
#include <cudf/io/parquet.hpp>
#include <cudf/io/types.hpp>
#include <cudf/join.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_device.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <algorithm>
#include <cctype>
#include <chrono>
#include <iostream>
#include <memory>
#include <optional>
#include <stdexcept>
#include <string>
#include <string_view>
#include <unordered_map>
34+
35+
/**
36+
* @brief Create memory resource for libcudf functions
37+
*
38+
* @param pool Whether to use a pool memory resource.
39+
* @return Memory resource instance
40+
*/
41+
std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used)
42+
{
43+
auto cuda_mr = std::make_shared<rmm::mr::cuda_memory_resource>();
44+
if (is_pool_used) {
45+
return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
46+
cuda_mr, rmm::percent_of_free_device_memory(50));
47+
}
48+
return cuda_mr;
49+
}
50+
51+
/**
52+
* @brief Get encoding type from the keyword
53+
*
54+
* @param name encoding keyword name
55+
* @return corresponding column encoding type
56+
*/
57+
[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name)
58+
{
59+
using encoding_type = cudf::io::column_encoding;
60+
61+
static const std::unordered_map<std::string_view, cudf::io::column_encoding> map = {
62+
{"DEFAULT", encoding_type::USE_DEFAULT},
63+
{"DICTIONARY", encoding_type::DICTIONARY},
64+
{"PLAIN", encoding_type::PLAIN},
65+
{"DELTA_BINARY_PACKED", encoding_type::DELTA_BINARY_PACKED},
66+
{"DELTA_LENGTH_BYTE_ARRAY", encoding_type::DELTA_LENGTH_BYTE_ARRAY},
67+
{"DELTA_BYTE_ARRAY", encoding_type::DELTA_BYTE_ARRAY},
68+
};
69+
70+
std::transform(name.begin(), name.end(), name.begin(), ::toupper);
71+
if (map.find(name) != map.end()) { return map.at(name); }
72+
throw std::invalid_argument("FATAL: " + std::string(name) +
73+
" is not a valid encoding type.\n\n"
74+
"Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n"
75+
"DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n"
76+
"DELTA_BYTE_ARRAY\n"
77+
"\n"
78+
"Exiting...\n");
79+
}
80+
81+
/**
82+
* @brief Get compression type from the keyword
83+
*
84+
* @param name compression keyword name
85+
* @return corresponding compression type
86+
*/
87+
[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name)
88+
{
89+
using compression_type = cudf::io::compression_type;
90+
91+
static const std::unordered_map<std::string_view, cudf::io::compression_type> map = {
92+
{"NONE", compression_type::NONE},
93+
{"AUTO", compression_type::AUTO},
94+
{"SNAPPY", compression_type::SNAPPY},
95+
{"LZ4", compression_type::LZ4},
96+
{"ZSTD", compression_type::ZSTD}};
97+
98+
std::transform(name.begin(), name.end(), name.begin(), ::toupper);
99+
if (map.find(name) != map.end()) { return map.at(name); }
100+
throw std::invalid_argument("FATAL: " + std::string(name) +
101+
" is not a valid compression type.\n\n"
102+
"Available compression_type types: NONE, AUTO, SNAPPY,\n"
103+
"LZ4, ZSTD\n"
104+
"\n"
105+
"Exiting...\n");
106+
}
107+
108+
/**
109+
* @brief Get the optional page size stat frequency from they keyword
110+
*
111+
* @param use_stats keyword affirmation string such as: Y, T, YES, TRUE, ON
112+
* @return optional page statistics frequency set to full (STATISTICS_COLUMN)
113+
*/
114+
[[nodiscard]] std::optional<cudf::io::statistics_freq> get_page_size_stats(std::string use_stats)
115+
{
116+
std::transform(use_stats.begin(), use_stats.end(), use_stats.begin(), ::toupper);
117+
118+
// Check if the input string matches to any of the following
119+
if (not use_stats.compare("ON") or not use_stats.compare("TRUE") or
120+
not use_stats.compare("YES") or not use_stats.compare("Y") or not use_stats.compare("T")) {
121+
// Full column and offset indices - STATISTICS_COLUMN
122+
return std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN);
123+
}
124+
125+
return std::nullopt;
126+
}
127+
128+
/**
 * @brief Light-weight timer for parquet reader and writer instrumentation
 *
 * Timer object constructed from std::chrono. Elapsed durations can be
 * printed at milli and micro second scales. Timer starts at object
 * construction and can be restarted with `reset()`.
 *
 * `std::chrono::steady_clock` is used because it is monotonic, so an
 * elapsed interval cannot be distorted by adjustments to the system clock.
 */
class Timer {
 public:
  using micros = std::chrono::microseconds;
  using millis = std::chrono::milliseconds;

  /// Construct the timer and start it immediately
  Timer() { reset(); }

  /// Restart the timer from the current time
  void reset() { start_time = clock_type::now(); }

  /// @return duration elapsed since construction or the last `reset()`
  auto elapsed() const { return (clock_type::now() - start_time); }

  /// Print the elapsed time, truncated to whole microseconds, to stdout
  void print_elapsed_micros() const
  {
    std::cout << "Elapsed Time: " << std::chrono::duration_cast<micros>(elapsed()).count()
              << "us\n\n";
  }

  /// Print the elapsed time, truncated to whole milliseconds, to stdout
  void print_elapsed_millis() const
  {
    std::cout << "Elapsed Time: " << std::chrono::duration_cast<millis>(elapsed()).count()
              << "ms\n\n";
  }

 private:
  using clock_type   = std::chrono::steady_clock;
  using time_point_t = std::chrono::time_point<clock_type>;
  time_point_t start_time;
};

0 commit comments

Comments
 (0)