|
| 1 | +/* |
| 2 | + * Copyright (c) 2024, NVIDIA CORPORATION. |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + */ |
| 16 | + |
| 17 | +#include "parquet_io.hpp" |
| 18 | + |
| 19 | +/** |
| 20 | + * @file parquet_io.cpp |
| 21 | + * @brief Demonstrates usage of the libcudf APIs to read and write |
| 22 | + * parquet file format with different encodings and compression types |
| 23 | + * |
 * The following encoding and compression types are demonstrated:
| 25 | + * Encoding Types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED, |
| 26 | + * DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY |
| 27 | + * |
| 28 | + * Compression Types: NONE, AUTO, SNAPPY, LZ4, ZSTD |
| 29 | + * |
| 30 | + */ |
| 31 | + |
| 32 | +/** |
| 33 | + * @brief Read parquet input from file |
| 34 | + * |
| 35 | + * @param filepath path to input parquet file |
| 36 | + * @return cudf::io::table_with_metadata |
| 37 | + */ |
| 38 | +cudf::io::table_with_metadata read_parquet(std::string filepath) |
| 39 | +{ |
| 40 | + auto source_info = cudf::io::source_info(filepath); |
| 41 | + auto builder = cudf::io::parquet_reader_options::builder(source_info); |
| 42 | + auto options = builder.build(); |
| 43 | + return cudf::io::read_parquet(options); |
| 44 | +} |
| 45 | + |
| 46 | +/** |
| 47 | + * @brief Write parquet output to file |
| 48 | + * |
| 49 | + * @param input table to write |
| 50 | + * @param metadata metadata of input table read by parquet reader |
| 51 | + * @param filepath path to output parquet file |
| 52 | + * @param stats_level optional page size stats level |
| 53 | + */ |
| 54 | +void write_parquet(cudf::table_view input, |
| 55 | + cudf::io::table_metadata metadata, |
| 56 | + std::string filepath, |
| 57 | + cudf::io::column_encoding encoding, |
| 58 | + cudf::io::compression_type compression, |
| 59 | + std::optional<cudf::io::statistics_freq> stats_level) |
| 60 | +{ |
| 61 | + // write the data for inspection |
| 62 | + auto sink_info = cudf::io::sink_info(filepath); |
| 63 | + auto builder = cudf::io::parquet_writer_options::builder(sink_info, input); |
| 64 | + auto table_metadata = cudf::io::table_input_metadata{metadata}; |
| 65 | + |
| 66 | + std::for_each(table_metadata.column_metadata.begin(), |
| 67 | + table_metadata.column_metadata.end(), |
| 68 | + [=](auto& col_meta) { col_meta.set_encoding(encoding); }); |
| 69 | + |
| 70 | + builder.metadata(table_metadata); |
| 71 | + auto options = builder.build(); |
| 72 | + options.set_compression(compression); |
| 73 | + // Either use the input stats level or don't write stats |
| 74 | + options.set_stats_level(stats_level.value_or(cudf::io::statistics_freq::STATISTICS_NONE)); |
| 75 | + |
| 76 | + // write parquet data |
| 77 | + cudf::io::write_parquet(options); |
| 78 | +} |
| 79 | + |
| 80 | +/** |
| 81 | + * @brief Main for nested_types examples |
| 82 | + * |
| 83 | + * Command line parameters: |
| 84 | + * 1. parquet input file name/path (default: "example.parquet") |
| 85 | + * 2. parquet output file name/path (default: "output.parquet") |
| 86 | + * 3. encoding type for columns (default: "DELTA_BINARY_PACKED") |
| 87 | + * 4. compression type (default: "ZSTD") |
| 88 | + * 5. optional: use page size stats metadata (default: "NO") |
| 89 | + * |
| 90 | + * Example invocation from directory `cudf/cpp/examples/parquet_io`: |
| 91 | + * ./build/parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD |
| 92 | + * |
| 93 | + */ |
| 94 | +int main(int argc, char const** argv) |
| 95 | +{ |
| 96 | + std::string input_filepath; |
| 97 | + std::string output_filepath; |
| 98 | + cudf::io::column_encoding encoding; |
| 99 | + cudf::io::compression_type compression; |
| 100 | + std::optional<cudf::io::statistics_freq> page_stats; |
| 101 | + |
| 102 | + switch (argc) { |
| 103 | + case 1: |
| 104 | + input_filepath = "example.parquet"; |
| 105 | + output_filepath = "output.parquet"; |
| 106 | + encoding = get_encoding_type("DELTA_BINARY_PACKED"); |
| 107 | + compression = get_compression_type("ZSTD"); |
| 108 | + break; |
| 109 | + case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]]; |
| 110 | + case 5: |
| 111 | + input_filepath = argv[1]; |
| 112 | + output_filepath = argv[2]; |
| 113 | + encoding = get_encoding_type(argv[3]); |
| 114 | + compression = get_compression_type(argv[4]); |
| 115 | + break; |
| 116 | + default: |
| 117 | + throw std::runtime_error( |
| 118 | + "Either provide all command-line arguments, or none to use defaults\n"); |
| 119 | + } |
| 120 | + |
| 121 | + // Create and use a memory pool |
| 122 | + bool is_pool_used = true; |
| 123 | + auto resource = create_memory_resource(is_pool_used); |
| 124 | + rmm::mr::set_current_device_resource(resource.get()); |
| 125 | + |
| 126 | + // Read input parquet file |
| 127 | + // We do not want to time the initial read time as it may include |
| 128 | + // time for nvcomp, cufile loading and RMM growth |
| 129 | + std::cout << std::endl << "Reading " << input_filepath << "..." << std::endl; |
| 130 | + std::cout << "Note: Not timing the initial parquet read as it may include\n" |
| 131 | + "times for nvcomp, cufile loading and RMM growth." |
| 132 | + << std::endl |
| 133 | + << std::endl; |
| 134 | + auto [input, metadata] = read_parquet(input_filepath); |
| 135 | + |
| 136 | + // Status string to indicate if page stats are set to be written or not |
| 137 | + auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; |
| 138 | + // Write parquet file with the specified encoding and compression |
| 139 | + std::cout << "Writing " << output_filepath << " with encoding, compression and " |
| 140 | + << page_stat_string << ".." << std::endl; |
| 141 | + |
| 142 | + // `timer` is automatically started here |
| 143 | + Timer timer; |
| 144 | + write_parquet(input->view(), metadata, output_filepath, encoding, compression, page_stats); |
| 145 | + timer.print_elapsed_millis(); |
| 146 | + |
| 147 | + // Read the parquet file written with encoding and compression |
| 148 | + std::cout << "Reading " << output_filepath << "..." << std::endl; |
| 149 | + |
| 150 | + // Reset the timer |
| 151 | + timer.reset(); |
| 152 | + auto [transcoded_input, transcoded_metadata] = read_parquet(output_filepath); |
| 153 | + timer.print_elapsed_millis(); |
| 154 | + |
| 155 | + // Check for validity |
| 156 | + try { |
| 157 | + // Left anti-join the original and transcoded tables |
| 158 | + // identical tables should not throw an exception and |
| 159 | + // return an empty indices vector |
| 160 | + auto const indices = cudf::left_anti_join( |
| 161 | + input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get()); |
| 162 | + |
| 163 | + // No exception thrown, check indices |
| 164 | + auto const valid = indices->size() == 0; |
| 165 | + std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; |
| 166 | + } catch (std::exception& e) { |
| 167 | + std::cerr << e.what() << std::endl << std::endl; |
| 168 | + std::cout << "Transcoding valid: false" << std::endl; |
| 169 | + } |
| 170 | + |
| 171 | + return 0; |
| 172 | +} |
0 commit comments