Skip to content

Commit e4f96f0

Browse files
committed
Add tool to see size of column in parquet file
1 parent 1dc19bb commit e4f96f0

File tree

3 files changed

+56
-3
lines changed

3 files changed

+56
-3
lines changed

cmd/parquet-tool/main.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
package main
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"strings"
7+
8+
"github.com/dustin/go-humanize"
9+
"github.com/olekukonko/tablewriter"
10+
"github.com/segmentio/parquet-go"
11+
)
12+
13+
func main() {
14+
f, err := os.Open("/tmp/test.parquet")
15+
if err != nil {
16+
panic(err)
17+
}
18+
defer f.Close()
19+
stats, err := f.Stat()
20+
if err != nil {
21+
panic(err)
22+
}
23+
pf, err := parquet.OpenFile(f, stats.Size())
24+
if err != nil {
25+
panic(err)
26+
}
27+
fmt.Println("schema:", pf.Schema())
28+
meta := pf.Metadata()
29+
fmt.Println("Num Rows:", meta.NumRows)
30+
31+
for i, rg := range meta.RowGroups {
32+
fmt.Println("\t Row group:", i)
33+
fmt.Println("\t\t Row Count:", rg.NumRows)
34+
fmt.Println("\t\t Row size:", humanize.Bytes(uint64(rg.TotalByteSize)))
35+
fmt.Println("\t\t Columns:")
36+
table := tablewriter.NewWriter(os.Stdout)
37+
table.SetHeader([]string{"Col", "Type", "NumVal", "TotalCompressedSize", "TotalUncompressedSize", "Compression", "%"})
38+
for _, ds := range rg.Columns {
39+
table.Append(
40+
[]string{
41+
strings.Join(ds.MetaData.PathInSchema, "/"),
42+
ds.MetaData.Type.String(),
43+
fmt.Sprintf("%d", ds.MetaData.NumValues),
44+
humanize.Bytes(uint64(ds.MetaData.TotalCompressedSize)),
45+
humanize.Bytes(uint64(ds.MetaData.TotalUncompressedSize)),
46+
fmt.Sprintf("%.2f", float64(ds.MetaData.TotalUncompressedSize-ds.MetaData.TotalCompressedSize)/float64(ds.MetaData.TotalCompressedSize)*100),
47+
fmt.Sprintf("%.2f", float64(ds.MetaData.TotalCompressedSize)/float64(rg.TotalByteSize)*100),
48+
})
49+
}
50+
table.Render()
51+
}
52+
}

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ require (
2323
github.com/prometheus/common v0.35.0
2424
github.com/prometheus/prometheus v1.8.2-0.20220315145411-881111fec433
2525
github.com/pyroscope-io/pyroscope v0.18.0
26+
github.com/segmentio/parquet-go v0.0.0-20220623195409-8b4d4260d8cb
2627
github.com/stretchr/testify v1.7.5
2728
github.com/thanos-io/objstore v0.0.0-20220324141029-c4f11442aa33
2829
github.com/thanos-io/thanos v0.26.0
@@ -211,7 +212,6 @@ require (
211212
github.com/scaleway/scaleway-sdk-go v1.0.0-beta.9 // indirect
212213
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 // indirect
213214
github.com/segmentio/encoding v0.3.5 // indirect
214-
github.com/segmentio/parquet-go v0.0.0-20220623195409-8b4d4260d8cb // indirect
215215
github.com/sercand/kuberesolver v2.4.0+incompatible // indirect
216216
github.com/shirou/gopsutil v3.21.4+incompatible // indirect
217217
github.com/sirupsen/logrus v1.8.1 // indirect
@@ -262,6 +262,7 @@ require (
262262

263263
replace (
264264
github.com/polarsignals/frostdb => github.com/grafana/frostdb v0.0.0-20220623183226-4080ae2447c1
265+
github.com/segmentio/parquet-go => github.com/cyriltovena/parquet-go v0.0.0-20220706111652-c477af2f3d29
265266
google.golang.org/api => google.golang.org/api v0.70.0
266267
google.golang.org/grpc => google.golang.org/grpc v1.44.0
267268
)

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,8 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
414414
github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
415415
github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4=
416416
github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
417+
github.com/cyriltovena/parquet-go v0.0.0-20220706111652-c477af2f3d29 h1:oTjhKKMrex0rLrUxgyAagfuA6ANOQATrT0L3iqpwoTc=
418+
github.com/cyriltovena/parquet-go v0.0.0-20220706111652-c477af2f3d29/go.mod h1:BuMbRhCCg3gFchup9zucJaUjQ4m6RxX+iVci37CoMPQ=
417419
github.com/d2g/dhcp4 v0.0.0-20170904100407-a1d1b6c41b1c/go.mod h1:Ct2BUK8SB0YC1SMSibvLzxjeJLnrYEVLULFNiHY9YfQ=
418420
github.com/d2g/dhcp4client v1.0.0/go.mod h1:j0hNfjhrt2SxUOw55nL0ATM/z4Yt3t2Kd1mW34z5W5s=
419421
github.com/d2g/dhcp4server v0.0.0-20181031114812-7d4a0a7f59a5/go.mod h1:Eo87+Kg/IX2hfWJfwxMzLyuSZyxSoAug2nGa1G2QAi8=
@@ -1364,8 +1366,6 @@ github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod
13641366
github.com/segmentio/asm v1.1.3/go.mod h1:Ld3L4ZXGNcSLRg4JBsZ3//1+f/TjYl0Mzen/DQy1EJg=
13651367
github.com/segmentio/encoding v0.3.5 h1:UZEiaZ55nlXGDL92scoVuw00RmiRCazIEmvPSbSvt8Y=
13661368
github.com/segmentio/encoding v0.3.5/go.mod h1:n0JeuIqEQrQoPDGsjo8UNd1iA0U8d8+oHAA4E3G3OxM=
1367-
github.com/segmentio/parquet-go v0.0.0-20220623195409-8b4d4260d8cb h1:nGVNj4bt9Fn5rsEvLsuOoTuQGSdiSkPRH+BvKlIp5NY=
1368-
github.com/segmentio/parquet-go v0.0.0-20220623195409-8b4d4260d8cb/go.mod h1:BuMbRhCCg3gFchup9zucJaUjQ4m6RxX+iVci37CoMPQ=
13691369
github.com/sercand/kuberesolver v2.4.0+incompatible h1:WE2OlRf6wjLxHwNkkFLQGaZcVLEXjMjBPjjEU5vksH8=
13701370
github.com/sercand/kuberesolver v2.4.0+incompatible/go.mod h1:lWF3GL0xptCB/vCiJPl/ZshwPsX/n4Y7u0CW9E7aQIQ=
13711371
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=

0 commit comments

Comments
 (0)