diff --git a/README.md b/README.md index b39aa2ba..8d682bd2 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ## Install -You'll need to build [RocksDB](https://github.com/facebook/rocksdb) v5.16+ on your machine. +You'll need to build [RocksDB](https://github.com/facebook/rocksdb) v6.16+ on your machine. After that, you can install gorocksdb using the following command: @@ -14,5 +14,3 @@ After that, you can install gorocksdb using the following command: Please note that this package might upgrade the required RocksDB version at any moment. Vendoring is thus highly recommended if you require high stability. - -*The [embedded CockroachDB RocksDB](https://github.com/cockroachdb/c-rocksdb) is no longer supported in gorocksdb.* diff --git a/db.go b/db.go index 64735c61..ae3e298f 100755 --- a/db.go +++ b/db.go @@ -18,9 +18,10 @@ type Range struct { // DB is a reusable handle to a RocksDB database on disk, created by Open. type DB struct { - c *C.rocksdb_t - name string - opts *Options + c *C.rocksdb_t + name string + secondaryPath string + opts *Options } // OpenDb opens a database with the specified options. @@ -139,6 +140,76 @@ func OpenDbColumnFamilies( }, cfHandles, nil } +// OpenDbColumnFamiliesWithTTL opens a database with the specified column families. +func OpenDbColumnFamiliesWithTTL( + opts *Options, + name string, + cfNames []string, + cfOpts []*Options, + cfTtls []int, +) (*DB, []*ColumnFamilyHandle, error) { + numColumnFamilies := len(cfNames) + if numColumnFamilies != len(cfOpts) { + return nil, nil, errors.New("must provide the same number of column family names and options") + } + + if numColumnFamilies != len(cfTtls) { + return nil, nil, errors.New("must provide the same number of column family names and ttls") + } + + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + + cNames := make([]*C.char, numColumnFamilies) + for i, s := range cfNames { + cNames[i] = C.CString(s) + } + defer func() { + for _, s := range cNames { + C.free(unsafe.Pointer(s)) + } + }() + + cOpts := make([]*C.rocksdb_options_t, numColumnFamilies) + for i, o := range cfOpts { + cOpts[i] = o.c + } + + cHandles := make([]*C.rocksdb_column_family_handle_t, numColumnFamilies) + + cTtls := make([]C.int, numColumnFamilies) + for i, t := range cfTtls { + cTtls[i] = C.int(t) + } + + var cErr *C.char + db := C.rocksdb_open_column_families_with_ttl( + opts.c, + cName, + C.int(numColumnFamilies), + &cNames[0], + &cOpts[0], + &cHandles[0], + &cTtls[0], + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, nil, errors.New(C.GoString(cErr)) + } + + cfHandles := make([]*ColumnFamilyHandle, numColumnFamilies) + for i, c := range cHandles { + cfHandles[i] = NewNativeColumnFamilyHandle(c) + } + + return &DB{ + name: name, + c: db, + opts: opts, + }, cfHandles, nil +} + // OpenDbForReadOnlyColumnFamilies opens a database with the specified column // families in read only mode. func OpenDbForReadOnlyColumnFamilies( @@ -201,6 +272,93 @@ func OpenDbForReadOnlyColumnFamilies( }, cfHandles, nil } +// OpenDbAsSecondary opens a database with the specified options for secondary usage. 
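Reviewer note: a minimal usage sketch (not part of the patch) for the `OpenDbColumnFamiliesWithTTL` entry point added above, mirroring the new test. The import path and the on-disk path are assumptions, not taken from the diff.

```go
package main

import (
	"log"

	"github.com/tecbot/gorocksdb" // assumed import path; substitute your fork's module path
)

func main() {
	opts := gorocksdb.NewDefaultOptions()
	defer opts.Destroy()
	opts.SetCreateIfMissing(true)
	opts.SetCreateIfMissingColumnFamilies(true)

	// Each column family gets its own options and TTL (in seconds).
	db, cfs, err := gorocksdb.OpenDbColumnFamiliesWithTTL(
		opts,
		"/tmp/ttl-db", // placeholder path
		[]string{"default", "mycf"},
		[]*gorocksdb.Options{opts, opts},
		[]int{3600, 3600},
	)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	for _, cf := range cfs {
		defer cf.Destroy()
	}
}
```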
+func OpenDbAsSecondary(opts *Options, name string, secondaryPath string) (*DB, error) { + var ( + cErr *C.char + cName = C.CString(name) + cSecondaryPath = C.CString(secondaryPath) + ) + defer C.free(unsafe.Pointer(cName)) + defer C.free(unsafe.Pointer(cSecondaryPath)) + db := C.rocksdb_open_as_secondary(opts.c, cName, cSecondaryPath, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return &DB{ + name: name, + secondaryPath: secondaryPath, + c: db, + opts: opts, + }, nil +} + +// OpenDbAsSecondaryColumnFamilies opens a database with the specified column +// families in secondary mode. +func OpenDbAsSecondaryColumnFamilies( + opts *Options, + name string, + secondaryPath string, + cfNames []string, + cfOpts []*Options, +) (*DB, []*ColumnFamilyHandle, error) { + numColumnFamilies := len(cfNames) + if numColumnFamilies != len(cfOpts) { + return nil, nil, errors.New("must provide the same number of column family names and options") + } + + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + + cSecondaryPath := C.CString(secondaryPath) + defer C.free(unsafe.Pointer(cSecondaryPath)) + + cNames := make([]*C.char, numColumnFamilies) + for i, s := range cfNames { + cNames[i] = C.CString(s) + } + defer func() { + for _, s := range cNames { + C.free(unsafe.Pointer(s)) + } + }() + + cOpts := make([]*C.rocksdb_options_t, numColumnFamilies) + for i, o := range cfOpts { + cOpts[i] = o.c + } + + cHandles := make([]*C.rocksdb_column_family_handle_t, numColumnFamilies) + + var cErr *C.char + db := C.rocksdb_open_as_secondary_column_families( + opts.c, + cName, + cSecondaryPath, + C.int(numColumnFamilies), + &cNames[0], + &cOpts[0], + &cHandles[0], + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, nil, errors.New(C.GoString(cErr)) + } + + cfHandles := make([]*ColumnFamilyHandle, numColumnFamilies) + for i, c := range cHandles { + cfHandles[i] = NewNativeColumnFamilyHandle(c) + } + + return &DB{ + name: name, + c: db, + opts: opts, + }, cfHandles, nil +} + // ListColumnFamilies lists the names of the column families in the DB. func ListColumnFamilies(opts *Options, name string) ([]string, error) { var ( @@ -237,6 +395,11 @@ func (db *DB) Name() string { return db.name } +// SecondaryPath returns the secondary path of the database, if it is a secondary database instance. +func (db *DB) SecondaryPath() string { + return db.secondaryPath +} + // Get returns the data associated with the key from the database. func (db *DB) Get(opts *ReadOptions, key []byte) (*Slice, error) { var ( @@ -566,6 +729,22 @@ func (db *DB) CreateColumnFamily(opts *Options, name string) (*ColumnFamilyHandl return NewNativeColumnFamilyHandle(cHandle), nil } +// CreateColumnFamilyWithTTL creates a new column family with a TTL. +func (db *DB) CreateColumnFamilyWithTTL(opts *Options, name string, ttl int) (*ColumnFamilyHandle, error) { + var ( + cErr *C.char + cName = C.CString(name) + cTtl = C.int(ttl) + ) + defer C.free(unsafe.Pointer(cName)) + cHandle := C.rocksdb_create_column_family_with_ttl(db.c, opts.c, cName, cTtl, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewNativeColumnFamilyHandle(cHandle), nil +} + // DropColumnFamily drops a column family. 
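Reviewer note: a sketch of opening the same database as a secondary instance via the `OpenDbAsSecondary` function added in this hunk, assuming a primary already exists at the (placeholder) path.

```go
package main

import (
	"log"

	"github.com/tecbot/gorocksdb" // assumed import path
)

func main() {
	opts := gorocksdb.NewDefaultOptions()
	defer opts.Destroy()
	opts.SetMaxOpenFiles(-1) // as in the new tests: let the secondary keep all files open

	// "/tmp/primary-db" must already exist; "/tmp/secondary-db" is private to this instance.
	secondary, err := gorocksdb.OpenDbAsSecondary(opts, "/tmp/primary-db", "/tmp/secondary-db")
	if err != nil {
		log.Fatal(err)
	}
	defer secondary.Close()

	log.Println("secondary catch-up dir:", secondary.SecondaryPath())
}
```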
func (db *DB) DropColumnFamily(c *ColumnFamilyHandle) error { var cErr *C.char @@ -582,10 +761,10 @@ func (db *DB) DropColumnFamily(c *ColumnFamilyHandle) error { // // The keys counted will begin at Range.Start and end on the key before // Range.Limit. -func (db *DB) GetApproximateSizes(ranges []Range) []uint64 { +func (db *DB) GetApproximateSizes(ranges []Range) ([]uint64, error) { sizes := make([]uint64, len(ranges)) if len(ranges) == 0 { - return sizes + return sizes, nil } cStarts := make([]*C.char, len(ranges)) @@ -606,6 +785,7 @@ func (db *DB) GetApproximateSizes(ranges []Range) []uint64 { } }() + var cErr *C.char C.rocksdb_approximate_sizes( db.c, C.int(len(ranges)), @@ -613,9 +793,15 @@ func (db *DB) GetApproximateSizes(ranges []Range) []uint64 { &cStartLens[0], &cLimits[0], &cLimitLens[0], - (*C.uint64_t)(&sizes[0])) + (*C.uint64_t)(&sizes[0]), + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return sizes, errors.New(C.GoString(cErr)) + } - return sizes + return sizes, nil } // GetApproximateSizesCF returns the approximate number of bytes of file system @@ -623,10 +809,10 @@ func (db *DB) GetApproximateSizes(ranges []Range) []uint64 { // // The keys counted will begin at Range.Start and end on the key before // Range.Limit. -func (db *DB) GetApproximateSizesCF(cf *ColumnFamilyHandle, ranges []Range) []uint64 { +func (db *DB) GetApproximateSizesCF(cf *ColumnFamilyHandle, ranges []Range) ([]uint64, error) { sizes := make([]uint64, len(ranges)) if len(ranges) == 0 { - return sizes + return sizes, nil } cStarts := make([]*C.char, len(ranges)) @@ -647,6 +833,7 @@ func (db *DB) GetApproximateSizesCF(cf *ColumnFamilyHandle, ranges []Range) []ui } }() + var cErr *C.char C.rocksdb_approximate_sizes_cf( db.c, cf.c, @@ -655,9 +842,14 @@ func (db *DB) GetApproximateSizesCF(cf *ColumnFamilyHandle, ranges []Range) []ui &cStartLens[0], &cLimits[0], &cLimitLens[0], - (*C.uint64_t)(&sizes[0])) - - return sizes + (*C.uint64_t)(&sizes[0]), + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return sizes, errors.New(C.GoString(cErr)) + } + return sizes, nil } // SetOptions dynamically changes options through the SetOptions API. @@ -752,6 +944,17 @@ func (db *DB) Flush(opts *FlushOptions) error { return nil } +// FlushCF triggers a manual flush for the column family. +func (db *DB) FlushCF(cf *ColumnFamilyHandle, opts *FlushOptions) error { + var cErr *C.char + C.rocksdb_flush_cf(db.c, opts.c, cf.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + // DisableFileDeletions disables file deletions and should be used when backup the database. func (db *DB) DisableFileDeletions() error { var cErr *C.char @@ -907,6 +1110,20 @@ func (db *DB) Close() { C.rocksdb_close(db.c) } +// TryCatchUpWithPrimary will sync a secondary db with the state of the primary +func (db *DB) TryCatchUpWithPrimary() error { + var ( + cErr *C.char + ) + C.rocksdb_try_catch_up_with_primary(db.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + + return nil +} + // DestroyDb removes a database entirely, removing everything from the // filesystem. 
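Reviewer note: a sketch of how callers adapt to this hunk. `GetApproximateSizes` / `GetApproximateSizesCF` now return an error (a breaking signature change), and a secondary handle must call `TryCatchUpWithPrimary` before it can observe writes flushed by the primary. The key and the helper name are illustrative only.

```go
package main

import (
	"log"

	"github.com/tecbot/gorocksdb" // assumed import path
)

// refreshAndRead assumes db is a secondary handle opened as shown above.
func refreshAndRead(db *gorocksdb.DB, ro *gorocksdb.ReadOptions) error {
	if err := db.TryCatchUpWithPrimary(); err != nil {
		return err
	}

	// Breaking change: the sizes call now also returns an error.
	sizes, err := db.GetApproximateSizes([]gorocksdb.Range{
		{Start: []byte{0x00}, Limit: []byte{0xff}},
	})
	if err != nil {
		return err
	}
	log.Printf("approximate size of full key range: %d bytes", sizes[0])

	v, err := db.Get(ro, []byte("some-key")) // illustrative key
	if err != nil {
		return err
	}
	defer v.Free()
	log.Printf("value: %q", v.Data())
	return nil
}

func main() {
	// Wiring up the secondary *DB is shown in the earlier sketch.
}
```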
func DestroyDb(name string, opts *Options) error { diff --git a/db_test.go b/db_test.go index 4ccc7aa8..fc5814d9 100755 --- a/db_test.go +++ b/db_test.go @@ -2,6 +2,7 @@ package gorocksdb import ( "io/ioutil" + "os" "strconv" "testing" @@ -13,6 +14,45 @@ func TestOpenDb(t *testing.T) { defer db.Close() } +func TestOpenDbColumnFamiliesWithTTL(t *testing.T) { + dir, err := ioutil.TempDir("", "gorocksdb-TestOpenDbColumnFamiliesWithTtl") + ensure.Nil(t, err) + + opts := NewDefaultOptions() + defer opts.Destroy() + + opts.SetCreateIfMissing(true) + opts.SetCreateIfMissingColumnFamilies(true) + + db, _, err := OpenDbColumnFamiliesWithTTL(opts, dir, []string{"default", "mycf"}, []*Options{opts, opts}, []int{3600, 3600}) + defer db.Close() + + ensure.Nil(t, err) +} + +func TestCreateColumnFamilyWithTTL(t *testing.T) { + db := newTestDBWithTTL(t, "TestCreateColumnFamilyWithTTL", nil) + defer db.Close() + + var ( + givenKey = []byte("hello") + givenVal = []byte("world") + o = NewDefaultOptions() + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + ) + + cf, err := db.CreateColumnFamilyWithTTL(o, "cf", 3600) + ensure.Nil(t, err) + + ensure.Nil(t, db.PutCF(wo, cf, givenKey, givenVal)) + + v, err := db.GetCF(ro, cf, givenKey) + defer v.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v.Data(), givenVal) +} + func TestDBCRUD(t *testing.T) { db := newTestDB(t, "TestDBGet", nil) defer db.Close() @@ -141,6 +181,46 @@ func newTestDB(t *testing.T, name string, applyOpts func(opts *Options)) *DB { return db } +func newTestDBWithTTL(t *testing.T, name string, applyOpts func(opts *Options)) *DB { + dir, err := ioutil.TempDir("", "gorocksdb-"+name) + ensure.Nil(t, err) + + opts := NewDefaultOptions() + // test the ratelimiter + rateLimiter := NewRateLimiter(1024, 100*1000, 10) + opts.SetRateLimiter(rateLimiter) + opts.SetCreateIfMissing(true) + if applyOpts != nil { + applyOpts(opts) + } + db, err := OpenDbWithTTL(opts, dir, 3600) + ensure.Nil(t, err) + + return db +} + +func newSecondaryTestDB(t *testing.T, name string) *DB { + secondaryDir := name + "-secondary" + + opts := NewDefaultOptions() + opts.SetMaxOpenFiles(-1) + db, err := OpenDbAsSecondary(opts, name, secondaryDir) + ensure.Nil(t, err) + + return db +} + +func newSecondaryTestDBCF(t *testing.T, name string, cfNames []string, cfOpts []*Options) (*DB, []*ColumnFamilyHandle) { + secondaryDir := name + "-secondary" + + opts := NewDefaultOptions() + opts.SetMaxOpenFiles(-1) + db, handles, err := OpenDbAsSecondaryColumnFamilies(opts, name, secondaryDir, cfNames, cfOpts) + ensure.Nil(t, err) + + return db, handles +} + func newTestDBPathNames(t *testing.T, name string, names []string, target_sizes []uint64, applyOpts func(opts *Options)) *DB { ensure.DeepEqual(t, len(target_sizes), len(names)) ensure.NotDeepEqual(t, len(names), 0) @@ -210,16 +290,19 @@ func TestDBGetApproximateSizes(t *testing.T) { defer db.Close() // no ranges - sizes := db.GetApproximateSizes(nil) + sizes, err := db.GetApproximateSizes(nil) ensure.DeepEqual(t, len(sizes), 0) + ensure.Nil(t, err) // range will nil start and limit - sizes = db.GetApproximateSizes([]Range{{Start: nil, Limit: nil}}) + sizes, err = db.GetApproximateSizes([]Range{{Start: nil, Limit: nil}}) ensure.DeepEqual(t, sizes, []uint64{0}) + ensure.Nil(t, err) // valid range - sizes = db.GetApproximateSizes([]Range{{Start: []byte{0x00}, Limit: []byte{0xFF}}}) + sizes, err = db.GetApproximateSizes([]Range{{Start: []byte{0x00}, Limit: []byte{0xFF}}}) ensure.DeepEqual(t, sizes, []uint64{0}) + ensure.Nil(t, err) 
} func TestDBGetApproximateSizesCF(t *testing.T) { @@ -232,14 +315,143 @@ func TestDBGetApproximateSizesCF(t *testing.T) { ensure.Nil(t, err) // no ranges - sizes := db.GetApproximateSizesCF(cf, nil) + sizes, err := db.GetApproximateSizesCF(cf, nil) ensure.DeepEqual(t, len(sizes), 0) + ensure.Nil(t, err) // range will nil start and limit - sizes = db.GetApproximateSizesCF(cf, []Range{{Start: nil, Limit: nil}}) + sizes, err = db.GetApproximateSizesCF(cf, []Range{{Start: nil, Limit: nil}}) ensure.DeepEqual(t, sizes, []uint64{0}) + ensure.Nil(t, err) // valid range - sizes = db.GetApproximateSizesCF(cf, []Range{{Start: []byte{0x00}, Limit: []byte{0xFF}}}) + sizes, err = db.GetApproximateSizesCF(cf, []Range{{Start: []byte{0x00}, Limit: []byte{0xFF}}}) ensure.DeepEqual(t, sizes, []uint64{0}) + ensure.Nil(t, err) +} + +func TestDBFlushCF(t *testing.T) { + var ( + db = newTestDB(t, "TestDBFlushCF", nil) + o = NewDefaultOptions() + wo = NewDefaultWriteOptions() + fo = NewDefaultFlushOptions() + + key1 = []byte("hello1") + val1 = []byte("world1") + ) + defer func() { + fo.Destroy() + wo.Destroy() + db.Close() + }() + + cf, err := db.CreateColumnFamily(o, "other") + ensure.Nil(t, err) + + // update + ensure.Nil(t, db.PutCF(wo, cf, key1, val1)) + + // flush CF + ensure.Nil(t, db.FlushCF(cf, fo)) +} + +func TestSecondaryDB(t *testing.T) { + var ( + db = newTestDB(t, "TestSecondaryDB", nil) + secondaryDB = newSecondaryTestDB(t, db.Name()) + ro = NewDefaultReadOptions() + wo = NewDefaultWriteOptions() + fo = NewDefaultFlushOptions() + ) + defer func() { + fo.Destroy() + wo.Destroy() + ro.Destroy() + secondaryDB.Close() + db.Close() + + os.RemoveAll(secondaryDB.SecondaryPath()) + os.RemoveAll(db.Name()) + }() + + // Put a key into the primary database + ensure.Nil(t, db.Put(wo, []byte("hello"), []byte("world"))) + ensure.Nil(t, db.Flush(fo)) + + // Ensure the key is written correctly + s, err := db.Get(ro, []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + + // Get the key from the secondary database, and ensure that we cannot see the key yet + s, err = secondaryDB.Get(ro, []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + ensure.DeepEqual(t, s.Data(), []byte(nil)) + + // Catch up the secondary with the current state of the primary + err = secondaryDB.TryCatchUpWithPrimary() + ensure.Nil(t, err) + + // Ensure that now that it has caught up that the key is now present + s, err = secondaryDB.Get(ro, []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + ensure.DeepEqual(t, s.Data(), []byte("world")) +} + +func TestSecondaryDBColumnFamilies(t *testing.T) { + var ( + db = newTestDB(t, "TestSecondaryDB", nil) + o = NewDefaultOptions() + ro = NewDefaultReadOptions() + wo = NewDefaultWriteOptions() + fo = NewDefaultFlushOptions() + ) + defer func() { + fo.Destroy() + wo.Destroy() + ro.Destroy() + o.Destroy() + db.Close() + + os.RemoveAll(db.Name()) + }() + + // Create a column family + primaryCF, err := db.CreateColumnFamily(o, "mycf") + ensure.Nil(t, err) + + // Open a secondary database, opening the created column family + secondaryDB, handles := newSecondaryTestDBCF(t, db.Name(), []string{"default", "mycf"}, []*Options{o, o}) + defer func() { + secondaryDB.Close() + os.RemoveAll(secondaryDB.SecondaryPath()) + }() + + // Put a key into the primary database + ensure.Nil(t, db.PutCF(wo, primaryCF, []byte("hello"), []byte("world"))) + ensure.Nil(t, db.FlushCF(primaryCF, fo)) + + // Ensure the key is written correctly + s, err := db.GetCF(ro, primaryCF, []byte("hello")) + 
ensure.Nil(t, err) + ensure.NotNil(t, s) + + // Get the key from the secondary database, and ensure that we cannot see the key yet + s, err = secondaryDB.GetCF(ro, handles[1], []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + ensure.DeepEqual(t, s.Data(), []byte(nil)) + + // Catch up the secondary with the current state of the primary + err = secondaryDB.TryCatchUpWithPrimary() + ensure.Nil(t, err) + + // Ensure that now that it has caught up that the key is now present + s, err = secondaryDB.GetCF(ro, handles[1], []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + ensure.DeepEqual(t, s.Data(), []byte("world")) } diff --git a/dynflag.go b/dynflag.go index 18c18f40..91229639 100644 --- a/dynflag.go +++ b/dynflag.go @@ -1,6 +1,7 @@ -// +build !linux !static +//go:build !linux || !rocksdbstatic +// +build !linux !rocksdbstatic package gorocksdb -// #cgo LDFLAGS: -lrocksdb -lstdc++ -lm -ldl +// #cgo LDFLAGS: -lrocksdb -lstdc++ -lm -lz -lbz2 -lsnappy -llz4 -lzstd -ldl import "C" diff --git a/memory_usage.go b/memory_usage.go index 7b9a6ad6..a2fad4ef 100644 --- a/memory_usage.go +++ b/memory_usage.go @@ -20,16 +20,38 @@ type MemoryUsage struct { CacheTotal uint64 } +type NativeDB interface { + getNativeDB() *C.rocksdb_t +} + +func (db *DB) getNativeDB() *C.rocksdb_t { + return db.c +} + +func (db *TransactionDB) getNativeDB() *C.rocksdb_t { + return (*C.rocksdb_t)(db.c) +} + // GetApproximateMemoryUsageByType returns summary // memory usage stats for given databases and caches. func GetApproximateMemoryUsageByType(dbs []*DB, caches []*Cache) (*MemoryUsage, error) { + nativeDBs := make([]NativeDB, 0, len(dbs)) + for _, db := range dbs { + nativeDBs = append(nativeDBs, db) + } + return GetApproximateMemoryUsageByTypeNativeDB(nativeDBs, caches) +} + +// GetApproximateMemoryUsageByTypeNativeDB returns summary +// memory usage stats for given databases and caches. +func GetApproximateMemoryUsageByTypeNativeDB(dbs []NativeDB, caches []*Cache) (*MemoryUsage, error) { // register memory consumers consumers := C.rocksdb_memory_consumers_create() defer C.rocksdb_memory_consumers_destroy(consumers) for _, db := range dbs { if db != nil { - C.rocksdb_memory_consumers_add_db(consumers, db.c) + C.rocksdb_memory_consumers_add_db(consumers, (db.getNativeDB())) } } for _, cache := range caches { diff --git a/memory_usage_test.go b/memory_usage_test.go index 7fc6eaa3..fd541d6a 100644 --- a/memory_usage_test.go +++ b/memory_usage_test.go @@ -14,6 +14,7 @@ func TestMemoryUsage(t *testing.T) { cache := NewLRUCache(8 * 1024 * 1024) bbto := NewDefaultBlockBasedTableOptions() bbto.SetBlockCache(cache) + defer bbto.Destroy() defer cache.Destroy() applyOpts := func(opts *Options) { @@ -40,6 +41,11 @@ func TestMemoryUsage(t *testing.T) { err = db.Put(wo, key, value) ensure.Nil(t, err) + + // A single Put is not enough to increase approximate memtable usage. 
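Reviewer note: a sketch of the new `NativeDB` interface in memory_usage.go, which lets `GetApproximateMemoryUsageByTypeNativeDB` accept a `*TransactionDB` as well as a plain `*DB`. The import path and database path are placeholders.

```go
package main

import (
	"log"

	"github.com/tecbot/gorocksdb" // assumed import path
)

func main() {
	opts := gorocksdb.NewDefaultOptions()
	defer opts.Destroy()
	opts.SetCreateIfMissing(true)

	tdbOpts := gorocksdb.NewDefaultTransactionDBOptions()
	defer tdbOpts.Destroy()

	tdb, err := gorocksdb.OpenTransactionDb(opts, tdbOpts, "/tmp/txn-db") // placeholder path
	if err != nil {
		log.Fatal(err)
	}
	defer tdb.Close()

	// *TransactionDB satisfies the new NativeDB interface, so it can be passed
	// to the *NativeDB variant directly; *DB handles work the same way.
	usage, err := gorocksdb.GetApproximateMemoryUsageByTypeNativeDB(
		[]gorocksdb.NativeDB{tdb}, nil,
	)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("memtable total: %d bytes", usage.MemTableTotal)
}
```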
+ err = db.Put(wo, key, value) + ensure.Nil(t, err) + _, err = db.Get(ro, key) ensure.Nil(t, err) @@ -54,3 +60,55 @@ func TestMemoryUsage(t *testing.T) { assert.True(t, mu2.CacheTotal >= mu1.CacheTotal) assert.True(t, mu2.MemTableReadersTotal >= mu1.MemTableReadersTotal) } + +func TestMemoryUsageTransactionDB(t *testing.T) { + // create database with cache + cache := NewLRUCache(8 * 1024 * 1024) + bbto := NewDefaultBlockBasedTableOptions() + bbto.SetBlockCache(cache) + defer bbto.Destroy() + defer cache.Destroy() + + applyOpts := func(opts *Options, transactionDBOpts *TransactionDBOptions) { + opts.SetBlockBasedTableFactory(bbto) + } + + db := newTestTransactionDB(t, "TestMemoryUsage", applyOpts) + defer db.Close() + + // take first memory usage snapshot + mu1, err := GetApproximateMemoryUsageByTypeNativeDB([]NativeDB{db}, []*Cache{cache}) + ensure.Nil(t, err) + + // perforx`m IO operations that will affect in-memory tables (and maybe cache as well) + wo := NewDefaultWriteOptions() + defer wo.Destroy() + ro := NewDefaultReadOptions() + defer ro.Destroy() + + key := []byte("key") + value := make([]byte, 1024) + _, err = rand.Read(value) + ensure.Nil(t, err) + + err = db.Put(wo, key, value) + ensure.Nil(t, err) + + // A single Put is not enough to increase approximate memtable usage. + err = db.Put(wo, key, value) + ensure.Nil(t, err) + + _, err = db.Get(ro, key) + ensure.Nil(t, err) + + // take second memory usage snapshot + mu2, err := GetApproximateMemoryUsageByTypeNativeDB([]NativeDB{db}, []*Cache{cache}) + ensure.Nil(t, err) + + // the amount of memory used by memtables should increase after write/read; + // cache memory usage is not likely to be changed, perhaps because requested key is kept by memtable + assert.True(t, mu2.MemTableTotal > mu1.MemTableTotal) + assert.True(t, mu2.MemTableUnflushed > mu1.MemTableUnflushed) + assert.True(t, mu2.CacheTotal >= mu1.CacheTotal) + assert.True(t, mu2.MemTableReadersTotal >= mu1.MemTableReadersTotal) +} diff --git a/options.go b/options.go index 07000215..1ad122b8 100644 --- a/options.go +++ b/options.go @@ -234,7 +234,8 @@ func (opts *Options) SetParanoidChecks(value bool) { // // For example, you have a flash device with 10GB allocated for the DB, // as well as a hard drive of 2TB, you should config it to be: -// [{"/flash_path", 10GB}, {"/hard_drive", 2TB}] +// +// [{"/flash_path", 10GB}, {"/hard_drive", 2TB}] // // The system will try to guarantee data under each path is close to but // not larger than the target size. But current and future file sizes used @@ -405,6 +406,12 @@ func (opts *Options) SetCompression(value CompressionType) { C.rocksdb_options_set_compression(opts.c, C.int(value)) } +// SetBottommostCompression sets the compression algorithm for the bottommost level (level with most data). +// Default: NoCompression +func (opts *Options) SetBottommostCompression(value CompressionType) { + C.rocksdb_options_set_bottommost_compression(opts.c, C.int(value)) +} + // SetCompressionPerLevel sets different compression algorithm per level. // // Different levels can have different compression policies. There @@ -546,11 +553,12 @@ func (opts *Options) SetMaxBytesForLevelMultiplier(value float64) { // We will pick a base level b >= 1. L0 will be directly merged into level b, // instead of always into level 1. Level 1 to b-1 need to be empty. // We try to pick b and its target size so that -// 1. target size is in the range of -// (max_bytes_for_level_base / max_bytes_for_level_multiplier, -// max_bytes_for_level_base] -// 2. 
target size of the last level (level num_levels-1) equals to extra size -// of the level. +// 1. target size is in the range of +// (max_bytes_for_level_base / max_bytes_for_level_multiplier, +// max_bytes_for_level_base] +// 2. target size of the last level (level num_levels-1) equals to extra size +// of the level. +// // At the same time max_bytes_for_level_multiplier and // max_bytes_for_level_multiplier_additional are still satisfied. // @@ -821,17 +829,18 @@ func (opts *Options) SetWALRecoveryMode(mode WALRecoveryMode) { // SetWALTtlSeconds sets the WAL ttl in seconds. // // The following two options affect how archived logs will be deleted. -// 1. If both set to 0, logs will be deleted asap and will not get into -// the archive. -// 2. If wal_ttl_seconds is 0 and wal_size_limit_mb is not 0, -// WAL files will be checked every 10 min and if total size is greater -// then wal_size_limit_mb, they will be deleted starting with the -// earliest until size_limit is met. All empty files will be deleted. -// 3. If wal_ttl_seconds is not 0 and wall_size_limit_mb is 0, then -// WAL files will be checked every wal_ttl_seconds / 2 and those that -// are older than wal_ttl_seconds will be deleted. -// 4. If both are not 0, WAL files will be checked every 10 min and both -// checks will be performed with ttl being first. +// 1. If both set to 0, logs will be deleted asap and will not get into +// the archive. +// 2. If wal_ttl_seconds is 0 and wal_size_limit_mb is not 0, +// WAL files will be checked every 10 min and if total size is greater +// then wal_size_limit_mb, they will be deleted starting with the +// earliest until size_limit is met. All empty files will be deleted. +// 3. If wal_ttl_seconds is not 0 and wall_size_limit_mb is 0, then +// WAL files will be checked every wal_ttl_seconds / 2 and those that +// are older than wal_ttl_seconds will be deleted. +// 4. If both are not 0, WAL files will be checked every 10 min and both +// checks will be performed with ttl being first. +// // Default: 0 func (opts *Options) SetWALTtlSeconds(value uint64) { C.rocksdb_options_set_WAL_ttl_seconds(opts.c, C.uint64_t(value)) @@ -853,6 +862,13 @@ func (opts *Options) SetEnablePipelinedWrite(value bool) { C.rocksdb_options_set_enable_pipelined_write(opts.c, boolToChar(value)) } +// SetUnorderedWrite enables unordered writes +// +// Default: false +func (opts *Options) SetUnorderedWrite(value bool) { + C.rocksdb_options_set_unordered_write(opts.c, boolToChar(value)) +} + // SetManifestPreallocationSize sets the number of bytes // to preallocate (via fallocate) the manifest files. // @@ -1036,7 +1052,9 @@ func (opts *Options) SetInplaceUpdateNumLocks(value int) { // If <=0, it won't allocate from huge page but from malloc. // Users are responsible to reserve huge pages for it to be allocated. For // example: -// sysctl -w vm.nr_hugepages=20 +// +// sysctl -w vm.nr_hugepages=20 +// // See linux doc Documentation/vm/hugetlbpage.txt // If there isn't enough free huge page available, it will fall back to // malloc. @@ -1074,6 +1092,13 @@ func (opts *Options) SetMaxSuccessiveMerges(value int) { C.rocksdb_options_set_max_successive_merges(opts.c, C.size_t(value)) } +// SetDumpMallocStats will print malloc statistics to the LOG file for the +// database if set to true - jemalloc must be turned on for this to work. +// Default: false +func (opts *Options) SetDumpMallocStats(value bool) { + C.rocksdb_options_set_dump_malloc_stats(opts.c, boolToChar(value)) +} + // EnableStatistics enable statistics. 
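Reviewer note: a sketch wiring together the new option setters from options.go (`SetBottommostCompression`, `SetUnorderedWrite`, `SetDumpMallocStats`). The compression constants are the wrapper's existing `CompressionType` values; treat the specific choices here as illustrative.

```go
package main

import (
	"github.com/tecbot/gorocksdb" // assumed import path
)

func buildOptions() *gorocksdb.Options {
	opts := gorocksdb.NewDefaultOptions()
	opts.SetCreateIfMissing(true)

	// Cheap compression on hot levels, heavier compression only at the bottommost level.
	opts.SetCompression(gorocksdb.LZ4Compression)
	opts.SetBottommostCompression(gorocksdb.ZSTDCompression)

	// Trades write-ordering guarantees for throughput; read the RocksDB docs on
	// unordered_write before enabling this in production.
	opts.SetUnorderedWrite(true)

	// Only has an effect when RocksDB is built against jemalloc.
	opts.SetDumpMallocStats(true)

	return opts
}

func main() {
	opts := buildOptions()
	defer opts.Destroy()
}
```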
func (opts *Options) EnableStatistics() { C.rocksdb_options_enable_statistics(opts.c) @@ -1104,7 +1129,8 @@ func (opts *Options) SetMemtableVectorRep() { // bucketCount: number of fixed array buckets // skiplistHeight: the max height of the skiplist // skiplistBranchingFactor: probabilistic size ratio between adjacent -// link lists in the skiplist +// +// link lists in the skiplist func (opts *Options) SetHashSkipListRep(bucketCount int, skiplistHeight, skiplistBranchingFactor int32) { C.rocksdb_options_set_hash_skip_list_rep(opts.c, C.size_t(bucketCount), C.int32_t(skiplistHeight), C.int32_t(skiplistBranchingFactor)) } @@ -1127,14 +1153,21 @@ func (opts *Options) SetHashLinkListRep(bucketCount int) { // a linear search is used. // // keyLen: plain table has optimization for fix-sized keys, -// which can be specified via keyLen. +// +// which can be specified via keyLen. +// // bloomBitsPerKey: the number of bits used for bloom filer per prefix. You -// may disable it by passing a zero. +// +// may disable it by passing a zero. +// // hashTableRatio: the desired utilization of the hash table used for prefix -// hashing. hashTableRatio = number of prefixes / #buckets -// in the hash table +// +// hashing. hashTableRatio = number of prefixes / #buckets +// in the hash table +// // indexSparseness: inside each prefix, need to build one index record for how -// many keys for binary search inside each hash bucket. +// +// many keys for binary search inside each hash bucket. func (opts *Options) SetPlainTableFactory(keyLen uint32, bloomBitsPerKey int, hashTableRatio float64, indexSparseness int) { C.rocksdb_options_set_plain_table_factory(opts.c, C.uint32_t(keyLen), C.int(bloomBitsPerKey), C.double(hashTableRatio), C.size_t(indexSparseness)) } @@ -1196,6 +1229,32 @@ func (opts *Options) SetOptimizeFiltersForHits(value bool) { C.rocksdb_options_set_optimize_filters_for_hits(opts.c, C.int(btoi(value))) } +// SetAtomicFlush sets atomic_flush +// RocksDB supports atomic flush of multiple column families if the DB option +// atomic_flush is set to true. The execution result of flushing multiple +// column families is written to the MANIFEST with 'all-or-nothing' guarantee +// (logically). With atomic flush, either all or no memtables of the column +// families of interest are persisted to SST files and added to the database. +// +// This can be desirable if data in multiple column families must be consistent +// with each other. For example, imagine there is one metadata column family +// meta_cf, and a data column family data_cf. Every time we write a new record +// to data_cf, we also write its metadata to meta_cf. meta_cf and data_cf must +// be flushed atomically. Database becomes inconsistent if one of them is +// persisted but the other is not. Atomic flush provides a good guarantee. +// Suppose at a certain time, kv1 exists in the memtables of meta_cf and kv2 +// exists in the memtables of data_cf. After atomically flushing these two +// column families, both kv1 and kv2 are persistent if the flush succeeds. +// Otherwise neither of them exist in the database. +// +// Since atomic flush also goes through the write_thread, it is guaranteed that +// no flush can occur in the middle of write batch. +// +// Default: false +func (opts *Options) SetAtomicFlush(value bool) { + C.rocksdb_options_set_atomic_flush(opts.c, C.uchar(btoi(value))) +} + // Destroy deallocates the Options object. 
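Reviewer note: a sketch of the atomic-flush option described above, using the meta/data column-family split from the doc comment. With `SetAtomicFlush(true)`, flushes of the two families are committed to the MANIFEST together. Paths and keys are placeholders.

```go
package main

import (
	"log"

	"github.com/tecbot/gorocksdb" // assumed import path
)

func main() {
	opts := gorocksdb.NewDefaultOptions()
	defer opts.Destroy()
	opts.SetCreateIfMissing(true)
	opts.SetCreateIfMissingColumnFamilies(true)
	// Either both meta_cf and data_cf memtables are persisted, or neither is.
	opts.SetAtomicFlush(true)

	db, cfs, err := gorocksdb.OpenDbColumnFamilies(
		opts, "/tmp/atomic-flush-db", // placeholder path
		[]string{"default", "meta_cf", "data_cf"},
		[]*gorocksdb.Options{opts, opts, opts},
	)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	wo := gorocksdb.NewDefaultWriteOptions()
	defer wo.Destroy()
	if err := db.PutCF(wo, cfs[2], []byte("k1"), []byte("payload")); err != nil {
		log.Fatal(err)
	}
	if err := db.PutCF(wo, cfs[1], []byte("k1"), []byte("metadata")); err != nil {
		log.Fatal(err)
	}
}
```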
func (opts *Options) Destroy() { C.rocksdb_options_destroy(opts.c) diff --git a/options_read.go b/options_read.go index 6a37cc48..32c9c681 100644 --- a/options_read.go +++ b/options_read.go @@ -129,6 +129,27 @@ func (opts *ReadOptions) SetReadaheadSize(value uint64) { C.rocksdb_readoptions_set_readahead_size(opts.c, C.size_t(value)) } +// SetTotalOrderSeek specifies the value of "total_order_seek". +// Enable a total order seek regardless of index format (e.g. hash index) +// used in the table. Some table format (e.g. plain table) may not support +// this option. +// If true when calling Get(), we also skip prefix bloom when reading from +// block based table. It provides a way to read existing data after +// changing implementation of prefix extractor. +// Default: false +func (opts *ReadOptions) SetTotalOrderSeek(value bool) { + C.rocksdb_readoptions_set_total_order_seek(opts.c, boolToChar(value)) +} + +// SetIgnoreRangeDeletions specifies the value of "ignore_range_deletions". +// If true, keys deleted using the DeleteRange() API will be visible to +// readers until they are naturally deleted during compaction. This improves +// read performance in DBs with many range deletions. +// Default: false +func (opts *ReadOptions) SetIgnoreRangeDeletions(value bool) { + C.rocksdb_readoptions_set_ignore_range_deletions(opts.c, boolToChar(value)) +} + // Destroy deallocates the ReadOptions object. func (opts *ReadOptions) Destroy() { C.rocksdb_readoptions_destroy(opts.c) diff --git a/slice.go b/slice.go index 707a1f2e..01ecfd3c 100644 --- a/slice.go +++ b/slice.go @@ -56,6 +56,14 @@ func (s *Slice) Free() { } } +// Copy returns a new copy of the slice and frees the slice. +func (s *Slice) Copy() []byte { + r := make([]byte, s.size) + copy(r, s.Data()) + s.Free() + return r +} + // PinnableSliceHandle represents a handle to a PinnableSlice. type PinnableSliceHandle struct { c *C.rocksdb_pinnableslice_t diff --git a/staticflag_linux.go b/staticflag_linux.go index 3af044ef..a7a8f420 100644 --- a/staticflag_linux.go +++ b/staticflag_linux.go @@ -1,6 +1,7 @@ -// +build static +//go:build rocksdbstatic +// +build rocksdbstatic package gorocksdb -// #cgo LDFLAGS: -l:librocksdb.a -l:libstdc++.a -lm -ldl +// #cgo LDFLAGS: -l:librocksdb.a -l:libstdc++.a -l:libz.a -l:libbz2.a -l:libsnappy.a -l:liblz4.a -l:libzstd.a -lm -ldl import "C" diff --git a/transaction.go b/transaction.go index 67c9ef09..c6388899 100644 --- a/transaction.go +++ b/transaction.go @@ -63,6 +63,23 @@ func (transaction *Transaction) Get(opts *ReadOptions, key []byte) (*Slice, erro return NewSlice(cValue, cValLen), nil } +// GetCF returns the data associated with the key in a given column family from the database given this transaction. +func (transaction *Transaction) GetCF(opts *ReadOptions, cf_handle *ColumnFamilyHandle, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_transaction_get_cf( + transaction.c, opts.c, cf_handle.c, cKey, C.size_t(len(key)), &cValLen, &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil +} + // GetForUpdate queries the data associated with the key and puts an exclusive lock on the key from the database given this transaction. 
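Reviewer note: a sketch combining the new `ReadOptions` toggles (`SetTotalOrderSeek`, `SetIgnoreRangeDeletions`) with the new `Slice.Copy`, which returns a Go-owned `[]byte` and frees the underlying C buffer in one step. The helper name is illustrative.

```go
package main

import (
	"github.com/tecbot/gorocksdb" // assumed import path
)

func readCopy(db *gorocksdb.DB, key []byte) ([]byte, error) {
	ro := gorocksdb.NewDefaultReadOptions()
	defer ro.Destroy()
	ro.SetTotalOrderSeek(true)       // ignore any prefix bloom/hash index for this read
	ro.SetIgnoreRangeDeletions(true) // skip range-tombstone checks for faster reads

	s, err := db.Get(ro, key)
	if err != nil {
		return nil, err
	}
	return s.Copy(), nil // no s.Free() needed; Copy frees the slice itself
}

func main() {
	// Wiring up a *DB is shown in the earlier sketches; readCopy is the part
	// specific to this hunk.
}
```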
func (transaction *Transaction) GetForUpdate(opts *ReadOptions, key []byte) (*Slice, error) { var ( @@ -80,6 +97,24 @@ func (transaction *Transaction) GetForUpdate(opts *ReadOptions, key []byte) (*Sl return NewSlice(cValue, cValLen), nil } +// GetForUpdateCF queries the data associated with the key in a given column family +// and puts an exclusive lock on the key from the database given this transaction. +func (transaction *Transaction) GetForUpdateCF(opts *ReadOptions, cf_handle *ColumnFamilyHandle, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_transaction_get_for_update_cf( + transaction.c, opts.c, cf_handle.c, cKey, C.size_t(len(key)), &cValLen, C.uchar(byte(1)) /*exclusive*/, &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil +} + // Put writes data associated with a key to the transaction. func (transaction *Transaction) Put(key, value []byte) error { var ( @@ -97,6 +132,23 @@ func (transaction *Transaction) Put(key, value []byte) error { return nil } +// PutCF writes data associated with a key in a given family to the transaction. +func (transaction *Transaction) PutCF(cf_handle *ColumnFamilyHandle, key, value []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + cValue = byteToChar(value) + ) + C.rocksdb_transaction_put_cf( + transaction.c, cf_handle.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + // Delete removes the data associated with the key from the transaction. func (transaction *Transaction) Delete(key []byte) error { var ( @@ -111,6 +163,20 @@ func (transaction *Transaction) Delete(key []byte) error { return nil } +// DeleteCF removes the data in a given column family associated with the key from the transaction. +func (transaction *Transaction) DeleteCF(cf_handle *ColumnFamilyHandle, key []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + ) + C.rocksdb_transaction_delete_cf(transaction.c, cf_handle.c, cKey, C.size_t(len(key)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + // NewIterator returns an Iterator over the database that uses the // ReadOptions given. func (transaction *Transaction) NewIterator(opts *ReadOptions) *Iterator { diff --git a/transactiondb.go b/transactiondb.go index cfdeac9c..e8a73095 100644 --- a/transactiondb.go +++ b/transactiondb.go @@ -41,6 +41,82 @@ func OpenTransactionDb( }, nil } +// OpenDbColumnFamilies opens a database with the specified column families. 
+func OpenTransactionDbColumnFamilies( + opts *Options, + transactionDBOpts *TransactionDBOptions, + name string, + cfNames []string, + cfOpts []*Options, +) (*TransactionDB, []*ColumnFamilyHandle, error) { + numColumnFamilies := len(cfNames) + if numColumnFamilies != len(cfOpts) { + return nil, nil, errors.New("must provide the same number of column family names and options") + } + + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + + cNames := make([]*C.char, numColumnFamilies) + for i, s := range cfNames { + cNames[i] = C.CString(s) + } + defer func() { + for _, s := range cNames { + C.free(unsafe.Pointer(s)) + } + }() + + cOpts := make([]*C.rocksdb_options_t, numColumnFamilies) + for i, o := range cfOpts { + cOpts[i] = o.c + } + + cHandles := make([]*C.rocksdb_column_family_handle_t, numColumnFamilies) + + var cErr *C.char + db := C.rocksdb_transactiondb_open_column_families( + opts.c, + transactionDBOpts.c, + cName, + C.int(numColumnFamilies), + &cNames[0], + &cOpts[0], + &cHandles[0], + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, nil, errors.New(C.GoString(cErr)) + } + + cfHandles := make([]*ColumnFamilyHandle, numColumnFamilies) + for i, c := range cHandles { + cfHandles[i] = NewNativeColumnFamilyHandle(c) + } + + return &TransactionDB{ + name: name, + c: db, + opts: opts, + }, cfHandles, nil +} + +// CreateColumnFamily creates a new column family. +func (db *TransactionDB) CreateColumnFamily(opts *Options, name string) (*ColumnFamilyHandle, error) { + var ( + cErr *C.char + cName = C.CString(name) + ) + defer C.free(unsafe.Pointer(cName)) + cHandle := C.rocksdb_transactiondb_create_column_family(db.c, opts.c, cName, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewNativeColumnFamilyHandle(cHandle), nil +} + // NewSnapshot creates a new snapshot of the database. func (db *TransactionDB) NewSnapshot() *Snapshot { return NewNativeSnapshot(C.rocksdb_transactiondb_create_snapshot(db.c)) @@ -89,6 +165,22 @@ func (db *TransactionDB) Get(opts *ReadOptions, key []byte) (*Slice, error) { return NewSlice(cValue, cValLen), nil } +// GetCF returns the data associated with the key in a given column family from the database. +func (db *TransactionDB) GetCF(opts *ReadOptions, cf *ColumnFamilyHandle, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_transactiondb_get_cf(db.c, opts.c, cf.c, cKey, C.size_t(len(key)), &cValLen, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil + +} + // Put writes data associated with a key to the database. func (db *TransactionDB) Put(opts *WriteOptions, key, value []byte) error { var ( @@ -106,6 +198,32 @@ func (db *TransactionDB) Put(opts *WriteOptions, key, value []byte) error { return nil } +// PutCF writes data associated with a key to the database and column family. 
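Reviewer note: a sketch of the column-family support added for transactions in this patch: opening a `TransactionDB` with column families and using the new `PutCF`/`GetCF` methods inside a transaction. Import path and database path are assumptions.

```go
package main

import (
	"log"

	"github.com/tecbot/gorocksdb" // assumed import path
)

func main() {
	opts := gorocksdb.NewDefaultOptions()
	defer opts.Destroy()
	opts.SetCreateIfMissing(true)
	opts.SetCreateIfMissingColumnFamilies(true)

	tdbOpts := gorocksdb.NewDefaultTransactionDBOptions()
	defer tdbOpts.Destroy()

	db, cfs, err := gorocksdb.OpenTransactionDbColumnFamilies(
		opts, tdbOpts, "/tmp/txn-cf-db", // placeholder path
		[]string{"default", "cf1"},
		[]*gorocksdb.Options{opts, opts},
	)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	wo := gorocksdb.NewDefaultWriteOptions()
	ro := gorocksdb.NewDefaultReadOptions()
	to := gorocksdb.NewDefaultTransactionOptions()
	defer wo.Destroy()
	defer ro.Destroy()
	defer to.Destroy()

	// Column-family aware writes with read-your-own-writes inside a transaction.
	txn := db.TransactionBegin(wo, to, nil)
	defer txn.Destroy()
	if err := txn.PutCF(cfs[1], []byte("k"), []byte("v")); err != nil {
		log.Fatal(err)
	}
	v, err := txn.GetCF(ro, cfs[1], []byte("k"))
	if err != nil {
		log.Fatal(err)
	}
	defer v.Free()
	if err := txn.Commit(); err != nil {
		log.Fatal(err)
	}
}
```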
+func (db *TransactionDB) PutCF(opts *WriteOptions, cf *ColumnFamilyHandle, key, value []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + cValue = byteToChar(value) + ) + C.rocksdb_transactiondb_put_cf(db.c, opts.c, cf.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Write writes a WriteBatch to the database +func (db *TransactionDB) Write(opts *WriteOptions, batch *WriteBatch) error { + var cErr *C.char + C.rocksdb_transactiondb_write(db.c, opts.c, batch.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + // Delete removes the data associated with the key from the database. func (db *TransactionDB) Delete(opts *WriteOptions, key []byte) error { var ( @@ -120,6 +238,20 @@ func (db *TransactionDB) Delete(opts *WriteOptions, key []byte) error { return nil } +// DeleteCF removes the data associated with the key from the database and column family. +func (db *TransactionDB) DeleteCF(opts *WriteOptions, cf *ColumnFamilyHandle, key []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + ) + C.rocksdb_transactiondb_delete_cf(db.c, opts.c, cf.c, cKey, C.size_t(len(key)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + // NewCheckpoint creates a new Checkpoint for this db. func (db *TransactionDB) NewCheckpoint() (*Checkpoint, error) { var ( diff --git a/transactiondb_test.go b/transactiondb_test.go index 48fb382b..8fbc757e 100644 --- a/transactiondb_test.go +++ b/transactiondb_test.go @@ -1,6 +1,7 @@ package gorocksdb import ( + "fmt" "io/ioutil" "testing" @@ -12,6 +13,48 @@ func TestOpenTransactionDb(t *testing.T) { defer db.Close() } +func TestTransactionDbColumnFamilies(t *testing.T) { + test_cf_names := []string{"default", "cf1", "cf2"} + db, cf_handles := newTestTransactionDBColumnFamilies(t, "TestOpenTransactionDbColumnFamilies", test_cf_names) + ensure.True(t, 3 == len(cf_handles)) + defer db.Close() + + cf_names, err := ListColumnFamilies(NewDefaultOptions(), db.name) + ensure.Nil(t, err) + ensure.True(t, 3 == len(cf_names)) + ensure.DeepEqual(t, cf_names, test_cf_names) + + for idx, cf_name := range test_cf_names { + ensure.Nil(t, db.PutCF(NewDefaultWriteOptions(), cf_handles[idx], []byte(cf_name+"_key"), []byte(cf_name+"_value"))) + } + + for idx, cf_name := range test_cf_names { + val, err := db.GetCF(NewDefaultReadOptions(), cf_handles[idx], []byte(cf_name+"_key")) + ensure.Nil(t, err) + ensure.DeepEqual(t, val.Data(), []byte(cf_name+"_value")) + } + + // Delete all keys in all column families + for idx, cf_name := range test_cf_names { + ensure.Nil(t, db.DeleteCF(NewDefaultWriteOptions(), cf_handles[idx], []byte(cf_name+"_key"))) + } + + for idx, cf_name := range test_cf_names { + val, err := db.GetCF(NewDefaultReadOptions(), cf_handles[idx], []byte(cf_name+"_key")) + ensure.Nil(t, err) + ensure.True(t, val.Size() == 0) + } + + { + cf_handle, err := db.CreateColumnFamily(NewDefaultOptions(), "new_cf") + ensure.Nil(t, err) + ensure.NotNil(t, cf_handle) + cf_names, err := ListColumnFamilies(NewDefaultOptions(), db.name) + ensure.Nil(t, err) + ensure.True(t, 4 == len(cf_names)) + } +} + func TestTransactionDBCRUD(t *testing.T) { db := newTestTransactionDB(t, "TestTransactionDBGet", nil) defer db.Close() @@ -93,6 +136,126 @@ func TestTransactionDBCRUD(t 
*testing.T) { } +func TestTransactionDBWriteBatchColumnFamilies(t *testing.T) { + test_cf_names := []string{"default", "cf1", "cf2"} + db, cf_handles := newTestTransactionDBColumnFamilies(t, "TestOpenTransactionDbColumnFamilies", test_cf_names) + ensure.True(t, len(cf_handles) == 3) + defer db.Close() + + var ( + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + ) + + // WriteBatch PutCF + { + batch := NewWriteBatch() + for h_idx := 1; h_idx <= 2; h_idx++ { + for k_idx := 0; k_idx <= 2; k_idx++ { + batch.PutCF(cf_handles[h_idx], []byte(fmt.Sprintf("%s_key_%d", test_cf_names[h_idx], k_idx)), + []byte(fmt.Sprintf("%s_value_%d", test_cf_names[h_idx], k_idx))) + } + } + ensure.Nil(t, db.Write(wo, batch)) + batch.Destroy() + } + + // Read back + { + for h_idx := 1; h_idx <= 2; h_idx++ { + for k_idx := 0; k_idx <= 2; k_idx++ { + data, err := db.GetCF(ro, cf_handles[h_idx], []byte(fmt.Sprintf("%s_key_%d", test_cf_names[h_idx], k_idx))) + ensure.Nil(t, err) + ensure.DeepEqual(t, data.Data(), []byte(fmt.Sprintf("%s_value_%d", test_cf_names[h_idx], k_idx))) + } + } + } + + { // WriteBatch with DeleteRangeCF not implemented + batch := NewWriteBatch() + batch.DeleteRangeCF(cf_handles[1], []byte(test_cf_names[1]+"_key_0"), []byte(test_cf_names[1]+"_key_2")) + ensure.NotNil(t, db.Write(wo, batch)) + } + // WriteBatch DeleteCF + { + batch := NewWriteBatch() + batch.DeleteCF(cf_handles[1], []byte(test_cf_names[1]+"_key_0")) + batch.DeleteCF(cf_handles[1], []byte(test_cf_names[1]+"_key_1")) + ensure.Nil(t, db.Write(wo, batch)) + } + + // Read back the remaining keys + { + // All keys on "cf2" are still there. + // Only key2 on "cf1" still remains + for h_idx := 1; h_idx <= 2; h_idx++ { + for k_idx := 0; k_idx <= 2; k_idx++ { + data, err := db.GetCF(ro, cf_handles[h_idx], []byte(fmt.Sprintf("%s_key_%d", test_cf_names[h_idx], k_idx))) + ensure.Nil(t, err) + if h_idx == 2 || k_idx == 2 { + ensure.DeepEqual(t, data.Data(), []byte(fmt.Sprintf("%s_value_%d", test_cf_names[h_idx], k_idx))) + } else { + ensure.True(t, len(data.Data()) == 0) + } + } + } + } +} + +func TestTransactionDBCRUDColumnFamilies(t *testing.T) { + test_cf_names := []string{"default", "cf1", "cf2"} + db, cf_handles := newTestTransactionDBColumnFamilies(t, "TestOpenTransactionDbColumnFamilies", test_cf_names) + ensure.True(t, len(cf_handles) == 3) + defer db.Close() + + var ( + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + to = NewDefaultTransactionOptions() + ) + + { + txn := db.TransactionBegin(wo, to, nil) + defer txn.Destroy() + // RYW. + for idx, cf_handle := range cf_handles { + ensure.Nil(t, txn.PutCF(cf_handle, []byte(test_cf_names[idx]+"_key"), []byte(test_cf_names[idx]+"_value"))) + val, err := txn.GetCF(ro, cf_handle, []byte(test_cf_names[idx]+"_key")) + defer val.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, val.Data(), []byte(test_cf_names[idx]+"_value")) + } + txn.Commit() + } + + // Read after commit + for idx, cf_handle := range cf_handles { + val, err := db.GetCF(ro, cf_handle, []byte(test_cf_names[idx]+"_key")) + defer val.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, val.Data(), []byte(test_cf_names[idx]+"_value")) + } + + // Delete + { + txn := db.TransactionBegin(wo, to, nil) + defer txn.Destroy() + // RYW. 
+ for idx, cf_handle := range cf_handles { + ensure.Nil(t, txn.DeleteCF(cf_handle, []byte(test_cf_names[idx]+"_key"))) + } + txn.Commit() + } + + // Read after delete commit + for idx, cf_handle := range cf_handles { + val, err := db.GetCF(ro, cf_handle, []byte(test_cf_names[idx]+"_key")) + defer val.Free() + ensure.Nil(t, err) + ensure.True(t, val.Size() == 0) + } +} + func TestTransactionDBGetForUpdate(t *testing.T) { lockTimeoutMilliSec := int64(50) applyOpts := func(opts *Options, transactionDBOpts *TransactionDBOptions) { @@ -122,6 +285,35 @@ func TestTransactionDBGetForUpdate(t *testing.T) { } } +func TestTransactionDBGetForUpdateColumnFamilies(t *testing.T) { + test_cf_names := []string{"default", "cf1", "cf2"} + db, cf_handles := newTestTransactionDBColumnFamilies(t, "TestOpenTransactionDbColumnFamilies", test_cf_names) + ensure.True(t, 3 == len(cf_handles)) + defer db.Close() + + var ( + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + to = NewDefaultTransactionOptions() + ) + + { + txn := db.TransactionBegin(wo, to, nil) + defer txn.Destroy() + + val, err := txn.GetForUpdateCF(ro, cf_handles[1], []byte(test_cf_names[1]+"_key")) + defer val.Free() + ensure.Nil(t, err) + txn.PutCF(cf_handles[1], []byte(test_cf_names[1]+"_key"), []byte(test_cf_names[1]+"_value")) + ensure.Nil(t, txn.Commit()) + } + + // Read after update + val, err := db.GetCF(ro, cf_handles[1], []byte(test_cf_names[1]+"_key")) + ensure.Nil(t, err) + ensure.DeepEqual(t, val.Data(), []byte(test_cf_names[1]+"_value")) +} + func newTestTransactionDB(t *testing.T, name string, applyOpts func(opts *Options, transactionDBOpts *TransactionDBOptions)) *TransactionDB { dir, err := ioutil.TempDir("", "gorockstransactiondb-"+name) ensure.Nil(t, err) @@ -137,3 +329,19 @@ func newTestTransactionDB(t *testing.T, name string, applyOpts func(opts *Option return db } + +func newTestTransactionDBColumnFamilies(t *testing.T, name string, cfNames []string) (*TransactionDB, []*ColumnFamilyHandle) { + dir, err := ioutil.TempDir("", "gorockstransactiondb-"+name) + ensure.Nil(t, err) + + opts := NewDefaultOptions() + opts.SetCreateIfMissing(true) + opts.SetCreateIfMissingColumnFamilies(true) + transactionDBOpts := NewDefaultTransactionDBOptions() + cfOpts := []*Options{opts, opts, opts} + db, cfHandles, err := OpenTransactionDbColumnFamilies(opts, transactionDBOpts, dir, cfNames, cfOpts) + ensure.Nil(t, err) + ensure.True(t, 3 == len(cfHandles)) + + return db, cfHandles +} diff --git a/v8/array.go b/v8/array.go new file mode 100644 index 00000000..c5a12289 --- /dev/null +++ b/v8/array.go @@ -0,0 +1,54 @@ +package gorocksdb + +// #include "stdlib.h" +// #include "rocksdb/c.h" +import "C" +import ( + "reflect" + "unsafe" +) + +type charsSlice []*C.char +type sizeTSlice []C.size_t +type columnFamilySlice []*C.rocksdb_column_family_handle_t + +func (s charsSlice) c() **C.char { + sH := (*reflect.SliceHeader)(unsafe.Pointer(&s)) + return (**C.char)(unsafe.Pointer(sH.Data)) +} + +func (s sizeTSlice) c() *C.size_t { + sH := (*reflect.SliceHeader)(unsafe.Pointer(&s)) + return (*C.size_t)(unsafe.Pointer(sH.Data)) +} + +func (s columnFamilySlice) c() **C.rocksdb_column_family_handle_t { + sH := (*reflect.SliceHeader)(unsafe.Pointer(&s)) + return (**C.rocksdb_column_family_handle_t)(unsafe.Pointer(sH.Data)) +} + +// bytesSliceToCSlices converts a slice of byte slices to two slices with C +// datatypes. One containing pointers to copies of the byte slices and one +// containing their sizes. 
+// IMPORTANT: All the contents of the charsSlice array are malloced and +// should be freed using the Destroy method of charsSlice. +func byteSlicesToCSlices(vals [][]byte) (charsSlice, sizeTSlice) { + if len(vals) == 0 { + return nil, nil + } + + chars := make(charsSlice, len(vals)) + sizes := make(sizeTSlice, len(vals)) + for i, val := range vals { + chars[i] = (*C.char)(C.CBytes(val)) + sizes[i] = C.size_t(len(val)) + } + + return chars, sizes +} + +func (s charsSlice) Destroy() { + for _, chars := range s { + C.free(unsafe.Pointer(chars)) + } +} diff --git a/v8/backup.go b/v8/backup.go new file mode 100644 index 00000000..87621dd9 --- /dev/null +++ b/v8/backup.go @@ -0,0 +1,169 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" +import ( + "errors" + "unsafe" +) + +// BackupEngineInfo represents the information about the backups +// in a backup engine instance. Use this to get the state of the +// backup like number of backups and their ids and timestamps etc. +type BackupEngineInfo struct { + c *C.rocksdb_backup_engine_info_t +} + +// GetCount gets the number backsup available. +func (b *BackupEngineInfo) GetCount() int { + return int(C.rocksdb_backup_engine_info_count(b.c)) +} + +// GetTimestamp gets the timestamp at which the backup index was taken. +func (b *BackupEngineInfo) GetTimestamp(index int) int64 { + return int64(C.rocksdb_backup_engine_info_timestamp(b.c, C.int(index))) +} + +// GetBackupId gets an id that uniquely identifies a backup +// regardless of its position. +func (b *BackupEngineInfo) GetBackupId(index int) int64 { + return int64(C.rocksdb_backup_engine_info_backup_id(b.c, C.int(index))) +} + +// GetSize get the size of the backup in bytes. +func (b *BackupEngineInfo) GetSize(index int) int64 { + return int64(C.rocksdb_backup_engine_info_size(b.c, C.int(index))) +} + +// GetNumFiles gets the number of files in the backup index. +func (b *BackupEngineInfo) GetNumFiles(index int) int32 { + return int32(C.rocksdb_backup_engine_info_number_files(b.c, C.int(index))) +} + +// Destroy destroys the backup engine info instance. +func (b *BackupEngineInfo) Destroy() { + C.rocksdb_backup_engine_info_destroy(b.c) + b.c = nil +} + +// RestoreOptions captures the options to be used during +// restoration of a backup. +type RestoreOptions struct { + c *C.rocksdb_restore_options_t +} + +// NewRestoreOptions creates a RestoreOptions instance. +func NewRestoreOptions() *RestoreOptions { + return &RestoreOptions{ + c: C.rocksdb_restore_options_create(), + } +} + +// SetKeepLogFiles is used to set or unset the keep_log_files option +// If true, restore won't overwrite the existing log files in wal_dir. It will +// also move all log files from archive directory to wal_dir. +// By default, this is false. +func (ro *RestoreOptions) SetKeepLogFiles(v int) { + C.rocksdb_restore_options_set_keep_log_files(ro.c, C.int(v)) +} + +// Destroy destroys this RestoreOptions instance. +func (ro *RestoreOptions) Destroy() { + C.rocksdb_restore_options_destroy(ro.c) +} + +// BackupEngine is a reusable handle to a RocksDB Backup, created by +// OpenBackupEngine. +type BackupEngine struct { + c *C.rocksdb_backup_engine_t + path string + opts *Options +} + +// OpenBackupEngine opens a backup engine with specified options. 
+func OpenBackupEngine(opts *Options, path string) (*BackupEngine, error) { + var cErr *C.char + cpath := C.CString(path) + defer C.free(unsafe.Pointer(cpath)) + + be := C.rocksdb_backup_engine_open(opts.c, cpath, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return &BackupEngine{ + c: be, + path: path, + opts: opts, + }, nil +} + +// UnsafeGetBackupEngine returns the underlying c backup engine. +func (b *BackupEngine) UnsafeGetBackupEngine() unsafe.Pointer { + return unsafe.Pointer(b.c) +} + +// CreateNewBackupFlush takes a new backup from db. If flush is set to true, +// it flushes the WAL before taking the backup. +func (b *BackupEngine) CreateNewBackupFlush(db *DB, flush bool) error { + var cErr *C.char + + C.rocksdb_backup_engine_create_new_backup_flush(b.c, db.c, boolToChar(flush), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + + return nil +} + +// CreateNewBackup takes a new backup from db. +func (b *BackupEngine) CreateNewBackup(db *DB) error { + return b.CreateNewBackupFlush(db, false) +} + +// GetInfo gets an object that gives information about +// the backups that have already been taken +func (b *BackupEngine) GetInfo() *BackupEngineInfo { + return &BackupEngineInfo{ + c: C.rocksdb_backup_engine_get_backup_info(b.c), + } +} + +// RestoreDBFromLatestBackup restores the latest backup to dbDir. walDir +// is where the write ahead logs are restored to and usually the same as dbDir. +func (b *BackupEngine) RestoreDBFromLatestBackup(dbDir, walDir string, ro *RestoreOptions) error { + var cErr *C.char + cDbDir := C.CString(dbDir) + cWalDir := C.CString(walDir) + defer func() { + C.free(unsafe.Pointer(cDbDir)) + C.free(unsafe.Pointer(cWalDir)) + }() + + C.rocksdb_backup_engine_restore_db_from_latest_backup(b.c, cDbDir, cWalDir, ro.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// PurgeOldBackups deletes all backups older than the latest 'n' backups +func (b *BackupEngine) PurgeOldBackups(n uint32) error { + var cErr *C.char + C.rocksdb_backup_engine_purge_old_backups(b.c, C.uint32_t(n), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Close close the backup engine and cleans up state +// The backups already taken remain on storage. +func (b *BackupEngine) Close() { + C.rocksdb_backup_engine_close(b.c) + b.c = nil +} diff --git a/v8/cache.go b/v8/cache.go new file mode 100644 index 00000000..866326dc --- /dev/null +++ b/v8/cache.go @@ -0,0 +1,35 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// Cache is a cache used to store data read from data in memory. +type Cache struct { + c *C.rocksdb_cache_t +} + +// NewLRUCache creates a new LRU Cache object with the capacity given. +func NewLRUCache(capacity uint64) *Cache { + return NewNativeCache(C.rocksdb_cache_create_lru(C.size_t(capacity))) +} + +// NewNativeCache creates a Cache object. +func NewNativeCache(c *C.rocksdb_cache_t) *Cache { + return &Cache{c} +} + +// GetUsage returns the Cache memory usage. +func (c *Cache) GetUsage() uint64 { + return uint64(C.rocksdb_cache_get_usage(c.c)) +} + +// GetPinnedUsage returns the Cache pinned memory usage. +func (c *Cache) GetPinnedUsage() uint64 { + return uint64(C.rocksdb_cache_get_pinned_usage(c.c)) +} + +// Destroy deallocates the Cache object. 
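Reviewer note: a sketch of the backup-engine API introduced in v8/backup.go: taking a backup (with WAL flush), pruning old backups, and restoring the latest one. All paths are placeholders.

```go
package main

import (
	"log"

	"github.com/tecbot/gorocksdb" // assumed import path
)

func main() {
	opts := gorocksdb.NewDefaultOptions()
	defer opts.Destroy()
	opts.SetCreateIfMissing(true)

	db, err := gorocksdb.OpenDb(opts, "/tmp/source-db") // placeholder paths throughout
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	be, err := gorocksdb.OpenBackupEngine(opts, "/tmp/backups")
	if err != nil {
		log.Fatal(err)
	}
	defer be.Close()

	// Flush the WAL into the backup so it is self-contained.
	if err := be.CreateNewBackupFlush(db, true); err != nil {
		log.Fatal(err)
	}
	info := be.GetInfo()
	log.Printf("backups on disk: %d", info.GetCount())
	info.Destroy()

	// Keep only the two most recent backups.
	if err := be.PurgeOldBackups(2); err != nil {
		log.Fatal(err)
	}

	// Restore goes through RestoreOptions; the WAL dir is usually the db dir.
	restoreOpts := gorocksdb.NewRestoreOptions()
	defer restoreOpts.Destroy()
	if err := be.RestoreDBFromLatestBackup("/tmp/restored-db", "/tmp/restored-db", restoreOpts); err != nil {
		log.Fatal(err)
	}
}
```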
+func (c *Cache) Destroy() { + C.rocksdb_cache_destroy(c.c) + c.c = nil +} diff --git a/v8/cf_handle.go b/v8/cf_handle.go new file mode 100644 index 00000000..6ded4c59 --- /dev/null +++ b/v8/cf_handle.go @@ -0,0 +1,36 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" +import "unsafe" + +// ColumnFamilyHandle represents a handle to a ColumnFamily. +type ColumnFamilyHandle struct { + c *C.rocksdb_column_family_handle_t +} + +// NewNativeColumnFamilyHandle creates a ColumnFamilyHandle object. +func NewNativeColumnFamilyHandle(c *C.rocksdb_column_family_handle_t) *ColumnFamilyHandle { + return &ColumnFamilyHandle{c} +} + +// UnsafeGetCFHandler returns the underlying c column family handle. +func (h *ColumnFamilyHandle) UnsafeGetCFHandler() unsafe.Pointer { + return unsafe.Pointer(h.c) +} + +// Destroy calls the destructor of the underlying column family handle. +func (h *ColumnFamilyHandle) Destroy() { + C.rocksdb_column_family_handle_destroy(h.c) +} + +type ColumnFamilyHandles []*ColumnFamilyHandle + +func (cfs ColumnFamilyHandles) toCSlice() columnFamilySlice { + cCFs := make(columnFamilySlice, len(cfs)) + for i, cf := range cfs { + cCFs[i] = cf.c + } + return cCFs +} diff --git a/v8/cf_test.go b/v8/cf_test.go new file mode 100644 index 00000000..f6db7c1f --- /dev/null +++ b/v8/cf_test.go @@ -0,0 +1,229 @@ +package gorocksdb + +import ( + "io/ioutil" + "testing" + + "github.com/facebookgo/ensure" +) + +func TestColumnFamilyOpen(t *testing.T) { + dir, err := ioutil.TempDir("", "gorocksdb-TestColumnFamilyOpen") + ensure.Nil(t, err) + + givenNames := []string{"default", "guide"} + opts := NewDefaultOptions() + opts.SetCreateIfMissingColumnFamilies(true) + opts.SetCreateIfMissing(true) + db, cfh, err := OpenDbColumnFamilies(opts, dir, givenNames, []*Options{opts, opts}) + ensure.Nil(t, err) + defer db.Close() + ensure.DeepEqual(t, len(cfh), 2) + cfh[0].Destroy() + cfh[1].Destroy() + + actualNames, err := ListColumnFamilies(opts, dir) + ensure.Nil(t, err) + ensure.SameElements(t, actualNames, givenNames) +} + +func TestColumnFamilyCreateDrop(t *testing.T) { + dir, err := ioutil.TempDir("", "gorocksdb-TestColumnFamilyCreate") + ensure.Nil(t, err) + + opts := NewDefaultOptions() + opts.SetCreateIfMissingColumnFamilies(true) + opts.SetCreateIfMissing(true) + db, err := OpenDb(opts, dir) + ensure.Nil(t, err) + defer db.Close() + cf, err := db.CreateColumnFamily(opts, "guide") + ensure.Nil(t, err) + defer cf.Destroy() + + actualNames, err := ListColumnFamilies(opts, dir) + ensure.Nil(t, err) + ensure.SameElements(t, actualNames, []string{"default", "guide"}) + + ensure.Nil(t, db.DropColumnFamily(cf)) + + actualNames, err = ListColumnFamilies(opts, dir) + ensure.Nil(t, err) + ensure.SameElements(t, actualNames, []string{"default"}) +} + +func TestColumnFamilyBatchPutGet(t *testing.T) { + dir, err := ioutil.TempDir("", "gorocksdb-TestColumnFamilyPutGet") + ensure.Nil(t, err) + + givenNames := []string{"default", "guide"} + opts := NewDefaultOptions() + opts.SetCreateIfMissingColumnFamilies(true) + opts.SetCreateIfMissing(true) + db, cfh, err := OpenDbColumnFamilies(opts, dir, givenNames, []*Options{opts, opts}) + ensure.Nil(t, err) + defer db.Close() + ensure.DeepEqual(t, len(cfh), 2) + defer cfh[0].Destroy() + defer cfh[1].Destroy() + + wo := NewDefaultWriteOptions() + defer wo.Destroy() + ro := NewDefaultReadOptions() + defer ro.Destroy() + + givenKey0 := []byte("hello0") + givenVal0 := []byte("world0") + givenKey1 := []byte("hello1") + givenVal1 := []byte("world1") + + b0 := 
NewWriteBatch() + defer b0.Destroy() + b0.PutCF(cfh[0], givenKey0, givenVal0) + ensure.Nil(t, db.Write(wo, b0)) + actualVal0, err := db.GetCF(ro, cfh[0], givenKey0) + defer actualVal0.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, actualVal0.Data(), givenVal0) + + b1 := NewWriteBatch() + defer b1.Destroy() + b1.PutCF(cfh[1], givenKey1, givenVal1) + ensure.Nil(t, db.Write(wo, b1)) + actualVal1, err := db.GetCF(ro, cfh[1], givenKey1) + defer actualVal1.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, actualVal1.Data(), givenVal1) + + actualVal, err := db.GetCF(ro, cfh[0], givenKey1) + ensure.Nil(t, err) + ensure.DeepEqual(t, actualVal.Size(), 0) + actualVal, err = db.GetCF(ro, cfh[1], givenKey0) + ensure.Nil(t, err) + ensure.DeepEqual(t, actualVal.Size(), 0) +} + +func TestColumnFamilyPutGetDelete(t *testing.T) { + dir, err := ioutil.TempDir("", "gorocksdb-TestColumnFamilyPutGet") + ensure.Nil(t, err) + + givenNames := []string{"default", "guide"} + opts := NewDefaultOptions() + opts.SetCreateIfMissingColumnFamilies(true) + opts.SetCreateIfMissing(true) + db, cfh, err := OpenDbColumnFamilies(opts, dir, givenNames, []*Options{opts, opts}) + ensure.Nil(t, err) + defer db.Close() + ensure.DeepEqual(t, len(cfh), 2) + defer cfh[0].Destroy() + defer cfh[1].Destroy() + + wo := NewDefaultWriteOptions() + defer wo.Destroy() + ro := NewDefaultReadOptions() + defer ro.Destroy() + + givenKey0 := []byte("hello0") + givenVal0 := []byte("world0") + givenKey1 := []byte("hello1") + givenVal1 := []byte("world1") + + ensure.Nil(t, db.PutCF(wo, cfh[0], givenKey0, givenVal0)) + actualVal0, err := db.GetCF(ro, cfh[0], givenKey0) + defer actualVal0.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, actualVal0.Data(), givenVal0) + + ensure.Nil(t, db.PutCF(wo, cfh[1], givenKey1, givenVal1)) + actualVal1, err := db.GetCF(ro, cfh[1], givenKey1) + defer actualVal1.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, actualVal1.Data(), givenVal1) + + actualVal, err := db.GetCF(ro, cfh[0], givenKey1) + ensure.Nil(t, err) + ensure.DeepEqual(t, actualVal.Size(), 0) + actualVal, err = db.GetCF(ro, cfh[1], givenKey0) + ensure.Nil(t, err) + ensure.DeepEqual(t, actualVal.Size(), 0) + + ensure.Nil(t, db.DeleteCF(wo, cfh[0], givenKey0)) + actualVal, err = db.GetCF(ro, cfh[0], givenKey0) + ensure.Nil(t, err) + ensure.DeepEqual(t, actualVal.Size(), 0) +} + +func newTestDBCF(t *testing.T, name string) (db *DB, cfh []*ColumnFamilyHandle, cleanup func()) { + dir, err := ioutil.TempDir("", "gorocksdb-TestColumnFamilyPutGet") + ensure.Nil(t, err) + + givenNames := []string{"default", "guide"} + opts := NewDefaultOptions() + opts.SetCreateIfMissingColumnFamilies(true) + opts.SetCreateIfMissing(true) + db, cfh, err = OpenDbColumnFamilies(opts, dir, givenNames, []*Options{opts, opts}) + ensure.Nil(t, err) + cleanup = func() { + for _, cf := range cfh { + cf.Destroy() + } + db.Close() + } + return db, cfh, cleanup +} + +func TestColumnFamilyMultiGet(t *testing.T) { + db, cfh, cleanup := newTestDBCF(t, "TestDBMultiGet") + defer cleanup() + + var ( + givenKey1 = []byte("hello1") + givenKey2 = []byte("hello2") + givenKey3 = []byte("hello3") + givenVal1 = []byte("world1") + givenVal2 = []byte("world2") + givenVal3 = []byte("world3") + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + ) + + // create + ensure.Nil(t, db.PutCF(wo, cfh[0], givenKey1, givenVal1)) + ensure.Nil(t, db.PutCF(wo, cfh[1], givenKey2, givenVal2)) + ensure.Nil(t, db.PutCF(wo, cfh[1], givenKey3, givenVal3)) + + // column family 0 only has givenKey1 + values, err 
:= db.MultiGetCF(ro, cfh[0], []byte("noexist"), givenKey1, givenKey2, givenKey3) + defer values.Destroy() + ensure.Nil(t, err) + ensure.DeepEqual(t, len(values), 4) + + ensure.DeepEqual(t, values[0].Data(), []byte(nil)) + ensure.DeepEqual(t, values[1].Data(), givenVal1) + ensure.DeepEqual(t, values[2].Data(), []byte(nil)) + ensure.DeepEqual(t, values[3].Data(), []byte(nil)) + + // column family 1 only has givenKey2 and givenKey3 + values, err = db.MultiGetCF(ro, cfh[1], []byte("noexist"), givenKey1, givenKey2, givenKey3) + defer values.Destroy() + ensure.Nil(t, err) + ensure.DeepEqual(t, len(values), 4) + + ensure.DeepEqual(t, values[0].Data(), []byte(nil)) + ensure.DeepEqual(t, values[1].Data(), []byte(nil)) + ensure.DeepEqual(t, values[2].Data(), givenVal2) + ensure.DeepEqual(t, values[3].Data(), givenVal3) + + // getting them all from the right CF should return them all + values, err = db.MultiGetCFMultiCF(ro, + ColumnFamilyHandles{cfh[0], cfh[1], cfh[1]}, + [][]byte{givenKey1, givenKey2, givenKey3}, + ) + defer values.Destroy() + ensure.Nil(t, err) + ensure.DeepEqual(t, len(values), 3) + + ensure.DeepEqual(t, values[0].Data(), givenVal1) + ensure.DeepEqual(t, values[1].Data(), givenVal2) + ensure.DeepEqual(t, values[2].Data(), givenVal3) +} diff --git a/v8/checkpoint.go b/v8/checkpoint.go new file mode 100644 index 00000000..4a6436d2 --- /dev/null +++ b/v8/checkpoint.go @@ -0,0 +1,56 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" + +import ( + "errors" + "unsafe" +) + +// Checkpoint provides Checkpoint functionality. +// Checkpoints provide persistent snapshots of RocksDB databases. +type Checkpoint struct { + c *C.rocksdb_checkpoint_t +} + +// NewNativeCheckpoint creates a new checkpoint. +func NewNativeCheckpoint(c *C.rocksdb_checkpoint_t) *Checkpoint { + return &Checkpoint{c} +} + +// CreateCheckpoint builds an openable snapshot of RocksDB on the same disk, which +// accepts an output directory on the same disk, and under the directory +// (1) hard-linked SST files pointing to existing live SST files +// SST files will be copied if output directory is on a different filesystem +// (2) a copied manifest files and other files +// The directory should not already exist and will be created by this API. +// The directory will be an absolute path +// log_size_for_flush: if the total log file size is equal or larger than +// this value, then a flush is triggered for all the column families. The +// default value is 0, which means flush is always triggered. If you move +// away from the default, the checkpoint may not contain up-to-date data +// if WAL writing is not always enabled. +// Flush will always trigger if it is 2PC. +func (checkpoint *Checkpoint) CreateCheckpoint(checkpoint_dir string, log_size_for_flush uint64) error { + var ( + cErr *C.char + ) + + cDir := C.CString(checkpoint_dir) + defer C.free(unsafe.Pointer(cDir)) + + C.rocksdb_checkpoint_create(checkpoint.c, cDir, C.uint64_t(log_size_for_flush), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Destroy deallocates the Checkpoint object. 
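+//
+// A minimal usage sketch of the checkpoint flow (the db handle and the target
+// directory are assumptions for illustration; the directory must not exist yet):
+//
+//	checkpoint, err := db.NewCheckpoint()
+//	if err != nil {
+//		// handle error
+//	}
+//	defer checkpoint.Destroy()
+//	if err := checkpoint.CreateCheckpoint("/tmp/my-checkpoint", 0); err != nil {
+//		// handle error
+//	}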
+func (checkpoint *Checkpoint) Destroy() { + C.rocksdb_checkpoint_object_destroy(checkpoint.c) + checkpoint.c = nil +} diff --git a/v8/checkpoint_test.go b/v8/checkpoint_test.go new file mode 100644 index 00000000..1ea10fdb --- /dev/null +++ b/v8/checkpoint_test.go @@ -0,0 +1,57 @@ +package gorocksdb + +import ( + "io/ioutil" + "os" + "testing" + + "github.com/facebookgo/ensure" +) + +func TestCheckpoint(t *testing.T) { + + suffix := "checkpoint" + dir, err := ioutil.TempDir("", "gorocksdb-"+suffix) + ensure.Nil(t, err) + err = os.RemoveAll(dir) + ensure.Nil(t, err) + + db := newTestDB(t, "TestCheckpoint", nil) + defer db.Close() + + // insert keys + givenKeys := [][]byte{[]byte("key1"), []byte("key2"), []byte("key3")} + givenVal := []byte("val") + wo := NewDefaultWriteOptions() + for _, k := range givenKeys { + ensure.Nil(t, db.Put(wo, k, givenVal)) + } + + var dbCheck *DB + var checkpoint *Checkpoint + + checkpoint, err = db.NewCheckpoint() + defer checkpoint.Destroy() + ensure.NotNil(t, checkpoint) + ensure.Nil(t, err) + + err = checkpoint.CreateCheckpoint(dir, 0) + ensure.Nil(t, err) + + opts := NewDefaultOptions() + opts.SetCreateIfMissing(true) + dbCheck, err = OpenDb(opts, dir) + defer dbCheck.Close() + ensure.Nil(t, err) + + // test keys + var value *Slice + ro := NewDefaultReadOptions() + for _, k := range givenKeys { + value, err = dbCheck.Get(ro, k) + defer value.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, value.Data(), givenVal) + } + +} diff --git a/v8/compaction_filter.go b/v8/compaction_filter.go new file mode 100644 index 00000000..bda27e20 --- /dev/null +++ b/v8/compaction_filter.go @@ -0,0 +1,73 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// A CompactionFilter can be used to filter keys during compaction time. +type CompactionFilter interface { + // If the Filter function returns false, it indicates + // that the kv should be preserved, while a return value of true + // indicates that this key-value should be removed from the + // output of the compaction. The application can inspect + // the existing value of the key and make decision based on it. + // + // When the value is to be preserved, the application has the option + // to modify the existing value and pass it back through a new value. + // To retain the previous value, simply return nil + // + // If multithreaded compaction is being used *and* a single CompactionFilter + // instance was supplied via SetCompactionFilter, this the Filter function may be + // called from different threads concurrently. The application must ensure + // that the call is thread-safe. + Filter(level int, key, val []byte) (remove bool, newVal []byte) + + // The name of the compaction filter, for logging + Name() string +} + +// NewNativeCompactionFilter creates a CompactionFilter object. +func NewNativeCompactionFilter(c *C.rocksdb_compactionfilter_t) CompactionFilter { + return nativeCompactionFilter{c} +} + +type nativeCompactionFilter struct { + c *C.rocksdb_compactionfilter_t +} + +func (c nativeCompactionFilter) Filter(level int, key, val []byte) (remove bool, newVal []byte) { + return false, nil +} +func (c nativeCompactionFilter) Name() string { return "" } + +// Hold references to compaction filters. 
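+//
+// A Go-side filter implements the CompactionFilter interface above and is
+// installed through Options.SetCompactionFilter, as in the tests below; a
+// minimal sketch, where ttlFilter and its "expired:" prefix rule are
+// illustrative assumptions:
+//
+//	type ttlFilter struct{}
+//
+//	func (f ttlFilter) Name() string { return "ttl-filter" }
+//	func (f ttlFilter) Filter(level int, key, val []byte) (bool, []byte) {
+//		// drop every key that starts with "expired:", keep everything else
+//		return bytes.HasPrefix(key, []byte("expired:")), nil
+//	}
+//
+//	opts.SetCompactionFilter(ttlFilter{})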
+var compactionFilters = NewCOWList() + +type compactionFilterWrapper struct { + name *C.char + filter CompactionFilter +} + +func registerCompactionFilter(filter CompactionFilter) int { + return compactionFilters.Append(compactionFilterWrapper{C.CString(filter.Name()), filter}) +} + +//export gorocksdb_compactionfilter_filter +func gorocksdb_compactionfilter_filter(idx int, cLevel C.int, cKey *C.char, cKeyLen C.size_t, cVal *C.char, cValLen C.size_t, cNewVal **C.char, cNewValLen *C.size_t, cValChanged *C.uchar) C.int { + key := charToByte(cKey, cKeyLen) + val := charToByte(cVal, cValLen) + + remove, newVal := compactionFilters.Get(idx).(compactionFilterWrapper).filter.Filter(int(cLevel), key, val) + if remove { + return C.int(1) + } else if newVal != nil { + *cNewVal = byteToChar(newVal) + *cNewValLen = C.size_t(len(newVal)) + *cValChanged = C.uchar(1) + } + return C.int(0) +} + +//export gorocksdb_compactionfilter_name +func gorocksdb_compactionfilter_name(idx int) *C.char { + return compactionFilters.Get(idx).(compactionFilterWrapper).name +} diff --git a/v8/compaction_filter_test.go b/v8/compaction_filter_test.go new file mode 100644 index 00000000..1dfcd63e --- /dev/null +++ b/v8/compaction_filter_test.go @@ -0,0 +1,61 @@ +package gorocksdb + +import ( + "bytes" + "testing" + + "github.com/facebookgo/ensure" +) + +func TestCompactionFilter(t *testing.T) { + var ( + changeKey = []byte("change") + changeValOld = []byte("old") + changeValNew = []byte("new") + deleteKey = []byte("delete") + ) + db := newTestDB(t, "TestCompactionFilter", func(opts *Options) { + opts.SetCompactionFilter(&mockCompactionFilter{ + filter: func(level int, key, val []byte) (remove bool, newVal []byte) { + if bytes.Equal(key, changeKey) { + return false, changeValNew + } + if bytes.Equal(key, deleteKey) { + return true, val + } + t.Errorf("key %q not expected during compaction", key) + return false, nil + }, + }) + }) + defer db.Close() + + // insert the test keys + wo := NewDefaultWriteOptions() + ensure.Nil(t, db.Put(wo, changeKey, changeValOld)) + ensure.Nil(t, db.Put(wo, deleteKey, changeValNew)) + + // trigger a compaction + db.CompactRange(Range{nil, nil}) + + // ensure that the value is changed after compaction + ro := NewDefaultReadOptions() + v1, err := db.Get(ro, changeKey) + defer v1.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v1.Data(), changeValNew) + + // ensure that the key is deleted after compaction + v2, err := db.Get(ro, deleteKey) + ensure.Nil(t, err) + ensure.True(t, v2.Data() == nil) +} + +type mockCompactionFilter struct { + filter func(level int, key, val []byte) (remove bool, newVal []byte) +} + +func (m *mockCompactionFilter) Name() string { return "gorocksdb.test" } +func (m *mockCompactionFilter) Filter(level int, key, val []byte) (bool, []byte) { + return m.filter(level, key, val) +} diff --git a/v8/comparator.go b/v8/comparator.go new file mode 100644 index 00000000..242771e3 --- /dev/null +++ b/v8/comparator.go @@ -0,0 +1,53 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// A Comparator object provides a total order across slices that are +// used as keys in an sstable or a database. +type Comparator interface { + // Three-way comparison. Returns value: + // < 0 iff "a" < "b", + // == 0 iff "a" == "b", + // > 0 iff "a" > "b" + Compare(a, b []byte) int + + // The name of the comparator. + Name() string +} + +// NewNativeComparator creates a Comparator object. 
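+//
+// A pure-Go comparator implements the interface above and is installed with
+// Options.SetComparator; the reverse-order comparator in the tests below is a
+// complete example of this, e.g.:
+//
+//	opts.SetComparator(&bytesReverseComparator{})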
+func NewNativeComparator(c *C.rocksdb_comparator_t) Comparator { + return nativeComparator{c} +} + +type nativeComparator struct { + c *C.rocksdb_comparator_t +} + +func (c nativeComparator) Compare(a, b []byte) int { return 0 } +func (c nativeComparator) Name() string { return "" } + +// Hold references to comperators. +var comperators = NewCOWList() + +type comperatorWrapper struct { + name *C.char + comparator Comparator +} + +func registerComperator(cmp Comparator) int { + return comperators.Append(comperatorWrapper{C.CString(cmp.Name()), cmp}) +} + +//export gorocksdb_comparator_compare +func gorocksdb_comparator_compare(idx int, cKeyA *C.char, cKeyALen C.size_t, cKeyB *C.char, cKeyBLen C.size_t) C.int { + keyA := charToByte(cKeyA, cKeyALen) + keyB := charToByte(cKeyB, cKeyBLen) + return C.int(comperators.Get(idx).(comperatorWrapper).comparator.Compare(keyA, keyB)) +} + +//export gorocksdb_comparator_name +func gorocksdb_comparator_name(idx int) *C.char { + return comperators.Get(idx).(comperatorWrapper).name +} diff --git a/v8/comparator_test.go b/v8/comparator_test.go new file mode 100644 index 00000000..76f3e565 --- /dev/null +++ b/v8/comparator_test.go @@ -0,0 +1,47 @@ +package gorocksdb + +import ( + "bytes" + "testing" + + "github.com/facebookgo/ensure" +) + +func TestComparator(t *testing.T) { + db := newTestDB(t, "TestComparator", func(opts *Options) { + opts.SetComparator(&bytesReverseComparator{}) + }) + defer db.Close() + + // insert keys + givenKeys := [][]byte{[]byte("key1"), []byte("key2"), []byte("key3")} + wo := NewDefaultWriteOptions() + for _, k := range givenKeys { + ensure.Nil(t, db.Put(wo, k, []byte("val"))) + } + + // create a iterator to collect the keys + ro := NewDefaultReadOptions() + iter := db.NewIterator(ro) + defer iter.Close() + + // we seek to the last key and iterate in reverse order + // to match given keys + var actualKeys [][]byte + for iter.SeekToLast(); iter.Valid(); iter.Prev() { + key := make([]byte, 4) + copy(key, iter.Key().Data()) + actualKeys = append(actualKeys, key) + } + ensure.Nil(t, iter.Err()) + + // ensure that the order is correct + ensure.DeepEqual(t, actualKeys, givenKeys) +} + +type bytesReverseComparator struct{} + +func (cmp *bytesReverseComparator) Name() string { return "gorocksdb.bytes-reverse" } +func (cmp *bytesReverseComparator) Compare(a, b []byte) int { + return bytes.Compare(a, b) * -1 +} diff --git a/v8/cow.go b/v8/cow.go new file mode 100644 index 00000000..dfcee687 --- /dev/null +++ b/v8/cow.go @@ -0,0 +1,42 @@ +package gorocksdb + +import ( + "sync" + "sync/atomic" +) + +// COWList implements a copy-on-write list. It is intended to be used by go +// callback registry for CGO, which is read-heavy with occasional writes. +// Reads do not block; Writes do not block reads (or vice versa), but only +// one write can occur at once; +type COWList struct { + v *atomic.Value + mu *sync.Mutex +} + +// NewCOWList creates a new COWList. +func NewCOWList() *COWList { + var list []interface{} + v := &atomic.Value{} + v.Store(list) + return &COWList{v: v, mu: new(sync.Mutex)} +} + +// Append appends an item to the COWList and returns the index for that item. +func (c *COWList) Append(i interface{}) int { + c.mu.Lock() + defer c.mu.Unlock() + list := c.v.Load().([]interface{}) + newLen := len(list) + 1 + newList := make([]interface{}, newLen) + copy(newList, list) + newList[newLen-1] = i + c.v.Store(newList) + return newLen - 1 +} + +// Get gets the item at index. 
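+//
+// Typical callback-registry usage (myHandler is an illustrative assumption):
+//
+//	registry := NewCOWList()
+//	idx := registry.Append(myHandler) // idx is handed to C as an opaque id
+//	h := registry.Get(idx)            // later looked up from the C callback
+//	_ = h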
+func (c *COWList) Get(index int) interface{} { + list := c.v.Load().([]interface{}) + return list[index] +} diff --git a/v8/cow_test.go b/v8/cow_test.go new file mode 100644 index 00000000..2af9aa56 --- /dev/null +++ b/v8/cow_test.go @@ -0,0 +1,48 @@ +package gorocksdb + +import ( + "fmt" + "sync" + "testing" + + "github.com/facebookgo/ensure" +) + +func TestCOWList(t *testing.T) { + cl := NewCOWList() + cl.Append("hello") + cl.Append("world") + cl.Append("!") + ensure.DeepEqual(t, cl.Get(0), "hello") + ensure.DeepEqual(t, cl.Get(1), "world") + ensure.DeepEqual(t, cl.Get(2), "!") +} + +func TestCOWListMT(t *testing.T) { + cl := NewCOWList() + expectedRes := make([]int, 3) + var wg sync.WaitGroup + for i := 0; i < 3; i++ { + wg.Add(1) + go func(v int) { + defer wg.Done() + index := cl.Append(v) + expectedRes[index] = v + }(i) + } + wg.Wait() + for i, v := range expectedRes { + ensure.DeepEqual(t, cl.Get(i), v) + } +} + +func BenchmarkCOWList_Get(b *testing.B) { + cl := NewCOWList() + for i := 0; i < 10; i++ { + cl.Append(fmt.Sprintf("helloworld%d", i)) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + _ = cl.Get(i % 10).(string) + } +} diff --git a/v8/db.go b/v8/db.go new file mode 100755 index 00000000..dd8e80db --- /dev/null +++ b/v8/db.go @@ -0,0 +1,1196 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" +import ( + "errors" + "fmt" + "unsafe" +) + +// Range is a range of keys in the database. GetApproximateSizes calls with it +// begin at the key Start and end right before the key Limit. +type Range struct { + Start []byte + Limit []byte +} + +// DB is a reusable handle to a RocksDB database on disk, created by Open. +type DB struct { + c *C.rocksdb_t + name string + secondaryPath string + opts *Options +} + +// OpenDb opens a database with the specified options. +func OpenDb(opts *Options, name string) (*DB, error) { + var ( + cErr *C.char + cName = C.CString(name) + ) + defer C.free(unsafe.Pointer(cName)) + db := C.rocksdb_open(opts.c, cName, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return &DB{ + name: name, + c: db, + opts: opts, + }, nil +} + +// OpenDbWithTTL opens a database with TTL support with the specified options. +func OpenDbWithTTL(opts *Options, name string, ttl int) (*DB, error) { + var ( + cErr *C.char + cName = C.CString(name) + ) + defer C.free(unsafe.Pointer(cName)) + db := C.rocksdb_open_with_ttl(opts.c, cName, C.int(ttl), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return &DB{ + name: name, + c: db, + opts: opts, + }, nil +} + +// OpenDbForReadOnly opens a database with the specified options for readonly usage. +func OpenDbForReadOnly(opts *Options, name string, errorIfLogFileExist bool) (*DB, error) { + var ( + cErr *C.char + cName = C.CString(name) + ) + defer C.free(unsafe.Pointer(cName)) + db := C.rocksdb_open_for_read_only(opts.c, cName, boolToChar(errorIfLogFileExist), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return &DB{ + name: name, + c: db, + opts: opts, + }, nil +} + +// OpenDbColumnFamilies opens a database with the specified column families. 
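+//
+// A minimal sketch (the path and column-family names are assumptions; every
+// name needs a matching *Options entry):
+//
+//	opts := NewDefaultOptions()
+//	opts.SetCreateIfMissing(true)
+//	opts.SetCreateIfMissingColumnFamilies(true)
+//	db, cfs, err := OpenDbColumnFamilies(opts, "/tmp/db",
+//		[]string{"default", "guide"}, []*Options{opts, opts})
+//	if err != nil {
+//		// handle error
+//	}
+//	defer db.Close()
+//	defer cfs[0].Destroy()
+//	defer cfs[1].Destroy()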
+func OpenDbColumnFamilies( + opts *Options, + name string, + cfNames []string, + cfOpts []*Options, +) (*DB, []*ColumnFamilyHandle, error) { + numColumnFamilies := len(cfNames) + if numColumnFamilies != len(cfOpts) { + return nil, nil, errors.New("must provide the same number of column family names and options") + } + + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + + cNames := make([]*C.char, numColumnFamilies) + for i, s := range cfNames { + cNames[i] = C.CString(s) + } + defer func() { + for _, s := range cNames { + C.free(unsafe.Pointer(s)) + } + }() + + cOpts := make([]*C.rocksdb_options_t, numColumnFamilies) + for i, o := range cfOpts { + cOpts[i] = o.c + } + + cHandles := make([]*C.rocksdb_column_family_handle_t, numColumnFamilies) + + var cErr *C.char + db := C.rocksdb_open_column_families( + opts.c, + cName, + C.int(numColumnFamilies), + &cNames[0], + &cOpts[0], + &cHandles[0], + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, nil, errors.New(C.GoString(cErr)) + } + + cfHandles := make([]*ColumnFamilyHandle, numColumnFamilies) + for i, c := range cHandles { + cfHandles[i] = NewNativeColumnFamilyHandle(c) + } + + return &DB{ + name: name, + c: db, + opts: opts, + }, cfHandles, nil +} + +// OpenDbColumnFamiliesWithTTL opens a database with the specified column families. +func OpenDbColumnFamiliesWithTTL( + opts *Options, + name string, + cfNames []string, + cfOpts []*Options, + cfTtls []int, +) (*DB, []*ColumnFamilyHandle, error) { + numColumnFamilies := len(cfNames) + if numColumnFamilies != len(cfOpts) { + return nil, nil, errors.New("must provide the same number of column family names and options") + } + + if numColumnFamilies != len(cfTtls) { + return nil, nil, errors.New("must provide the same number of column family names and ttls") + } + + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + + cNames := make([]*C.char, numColumnFamilies) + for i, s := range cfNames { + cNames[i] = C.CString(s) + } + defer func() { + for _, s := range cNames { + C.free(unsafe.Pointer(s)) + } + }() + + cOpts := make([]*C.rocksdb_options_t, numColumnFamilies) + for i, o := range cfOpts { + cOpts[i] = o.c + } + + cHandles := make([]*C.rocksdb_column_family_handle_t, numColumnFamilies) + + cTtls := make([]C.int, numColumnFamilies) + for i, t := range cfTtls { + cTtls[i] = C.int(t) + } + + var cErr *C.char + db := C.rocksdb_open_column_families_with_ttl( + opts.c, + cName, + C.int(numColumnFamilies), + &cNames[0], + &cOpts[0], + &cHandles[0], + &cTtls[0], + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, nil, errors.New(C.GoString(cErr)) + } + + cfHandles := make([]*ColumnFamilyHandle, numColumnFamilies) + for i, c := range cHandles { + cfHandles[i] = NewNativeColumnFamilyHandle(c) + } + + return &DB{ + name: name, + c: db, + opts: opts, + }, cfHandles, nil +} + +// OpenDbForReadOnlyColumnFamilies opens a database with the specified column +// families in read only mode. 
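+//
+// Sketch of a read-only open against an existing database (the path and names
+// are assumptions, and opts is an *Options built as in the sketch above;
+// writes through this handle will fail):
+//
+//	rdb, cfs, err := OpenDbForReadOnlyColumnFamilies(opts, "/tmp/db",
+//		[]string{"default", "guide"}, []*Options{opts, opts}, false)
+//	if err != nil {
+//		// handle error
+//	}
+//	defer rdb.Close()
+//	_ = cfs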
+func OpenDbForReadOnlyColumnFamilies( + opts *Options, + name string, + cfNames []string, + cfOpts []*Options, + errorIfLogFileExist bool, +) (*DB, []*ColumnFamilyHandle, error) { + numColumnFamilies := len(cfNames) + if numColumnFamilies != len(cfOpts) { + return nil, nil, errors.New("must provide the same number of column family names and options") + } + + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + + cNames := make([]*C.char, numColumnFamilies) + for i, s := range cfNames { + cNames[i] = C.CString(s) + } + defer func() { + for _, s := range cNames { + C.free(unsafe.Pointer(s)) + } + }() + + cOpts := make([]*C.rocksdb_options_t, numColumnFamilies) + for i, o := range cfOpts { + cOpts[i] = o.c + } + + cHandles := make([]*C.rocksdb_column_family_handle_t, numColumnFamilies) + + var cErr *C.char + db := C.rocksdb_open_for_read_only_column_families( + opts.c, + cName, + C.int(numColumnFamilies), + &cNames[0], + &cOpts[0], + &cHandles[0], + boolToChar(errorIfLogFileExist), + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, nil, errors.New(C.GoString(cErr)) + } + + cfHandles := make([]*ColumnFamilyHandle, numColumnFamilies) + for i, c := range cHandles { + cfHandles[i] = NewNativeColumnFamilyHandle(c) + } + + return &DB{ + name: name, + c: db, + opts: opts, + }, cfHandles, nil +} + +// OpenDbAsSecondary opens a database with the specified options for secondary usage. +func OpenDbAsSecondary(opts *Options, name string, secondaryPath string) (*DB, error) { + var ( + cErr *C.char + cName = C.CString(name) + cSecondaryPath = C.CString(secondaryPath) + ) + defer C.free(unsafe.Pointer(cName)) + defer C.free(unsafe.Pointer(cSecondaryPath)) + db := C.rocksdb_open_as_secondary(opts.c, cName, cSecondaryPath, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return &DB{ + name: name, + secondaryPath: secondaryPath, + c: db, + opts: opts, + }, nil +} + +// OpenDbAsSecondaryColumnFamilies opens a database with the specified column +// families in secondary mode. 
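+//
+// Sketch (paths are assumptions; the secondary instance only observes primary
+// writes after TryCatchUpWithPrimary is called):
+//
+//	sec, cfs, err := OpenDbAsSecondaryColumnFamilies(opts, "/tmp/db",
+//		"/tmp/db-secondary", []string{"default", "guide"}, []*Options{opts, opts})
+//	if err != nil {
+//		// handle error
+//	}
+//	defer sec.Close()
+//	_ = cfs // secondary handles for "default" and "guide"
+//	_ = sec.TryCatchUpWithPrimary() // pick up writes made by the primary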
+func OpenDbAsSecondaryColumnFamilies( + opts *Options, + name string, + secondaryPath string, + cfNames []string, + cfOpts []*Options, +) (*DB, []*ColumnFamilyHandle, error) { + numColumnFamilies := len(cfNames) + if numColumnFamilies != len(cfOpts) { + return nil, nil, errors.New("must provide the same number of column family names and options") + } + + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + + cSecondaryPath := C.CString(secondaryPath) + defer C.free(unsafe.Pointer(cSecondaryPath)) + + cNames := make([]*C.char, numColumnFamilies) + for i, s := range cfNames { + cNames[i] = C.CString(s) + } + defer func() { + for _, s := range cNames { + C.free(unsafe.Pointer(s)) + } + }() + + cOpts := make([]*C.rocksdb_options_t, numColumnFamilies) + for i, o := range cfOpts { + cOpts[i] = o.c + } + + cHandles := make([]*C.rocksdb_column_family_handle_t, numColumnFamilies) + + var cErr *C.char + db := C.rocksdb_open_as_secondary_column_families( + opts.c, + cName, + cSecondaryPath, + C.int(numColumnFamilies), + &cNames[0], + &cOpts[0], + &cHandles[0], + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, nil, errors.New(C.GoString(cErr)) + } + + cfHandles := make([]*ColumnFamilyHandle, numColumnFamilies) + for i, c := range cHandles { + cfHandles[i] = NewNativeColumnFamilyHandle(c) + } + + return &DB{ + name: name, + c: db, + opts: opts, + }, cfHandles, nil +} + +// ListColumnFamilies lists the names of the column families in the DB. +func ListColumnFamilies(opts *Options, name string) ([]string, error) { + var ( + cErr *C.char + cLen C.size_t + cName = C.CString(name) + ) + defer C.free(unsafe.Pointer(cName)) + cNames := C.rocksdb_list_column_families(opts.c, cName, &cLen, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + namesLen := int(cLen) + names := make([]string, namesLen) + // The maximum capacity of the following two slices is limited to (2^29)-1 to remain compatible + // with 32-bit platforms. The size of a `*C.char` (a pointer) is 4 Byte on a 32-bit system + // and (2^29)*4 == math.MaxInt32 + 1. -- See issue golang/go#13656 + cNamesArr := (*[(1 << 29) - 1]*C.char)(unsafe.Pointer(cNames))[:namesLen:namesLen] + for i, n := range cNamesArr { + names[i] = C.GoString(n) + } + C.rocksdb_list_column_families_destroy(cNames, cLen) + return names, nil +} + +// UnsafeGetDB returns the underlying c rocksdb instance. +func (db *DB) UnsafeGetDB() unsafe.Pointer { + return unsafe.Pointer(db.c) +} + +// Name returns the name of the database. +func (db *DB) Name() string { + return db.name +} + +// SecondaryPath returns the secondary path of the database, if it is a secondary database instance. +func (db *DB) SecondaryPath() string { + return db.secondaryPath +} + +// Get returns the data associated with the key from the database. +func (db *DB) Get(opts *ReadOptions, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_get(db.c, opts.c, cKey, C.size_t(len(key)), &cValLen, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil +} + +// GetBytes is like Get but returns a copy of the data. 
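+//
+// Unlike Get, the returned []byte is owned by Go and needs no Free call; a
+// short sketch (the key is an assumption):
+//
+//	val, err := db.GetBytes(NewDefaultReadOptions(), []byte("hello"))
+//	if err == nil && val == nil {
+//		// key does not exist
+//	}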
+func (db *DB) GetBytes(opts *ReadOptions, key []byte) ([]byte, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_get(db.c, opts.c, cKey, C.size_t(len(key)), &cValLen, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + if cValue == nil { + return nil, nil + } + defer C.rocksdb_free(unsafe.Pointer(cValue)) + return C.GoBytes(unsafe.Pointer(cValue), C.int(cValLen)), nil +} + +// GetCF returns the data associated with the key from the database and column family. +func (db *DB) GetCF(opts *ReadOptions, cf *ColumnFamilyHandle, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_get_cf(db.c, opts.c, cf.c, cKey, C.size_t(len(key)), &cValLen, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil +} + +// GetPinned returns the data associated with the key from the database. +func (db *DB) GetPinned(opts *ReadOptions, key []byte) (*PinnableSliceHandle, error) { + var ( + cErr *C.char + cKey = byteToChar(key) + ) + cHandle := C.rocksdb_get_pinned(db.c, opts.c, cKey, C.size_t(len(key)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewNativePinnableSliceHandle(cHandle), nil +} + +// MultiGet returns the data associated with the passed keys from the database +func (db *DB) MultiGet(opts *ReadOptions, keys ...[]byte) (Slices, error) { + cKeys, cKeySizes := byteSlicesToCSlices(keys) + defer cKeys.Destroy() + vals := make(charsSlice, len(keys)) + valSizes := make(sizeTSlice, len(keys)) + rocksErrs := make(charsSlice, len(keys)) + + C.rocksdb_multi_get( + db.c, + opts.c, + C.size_t(len(keys)), + cKeys.c(), + cKeySizes.c(), + vals.c(), + valSizes.c(), + rocksErrs.c(), + ) + + var errs []error + + for i, rocksErr := range rocksErrs { + if rocksErr != nil { + defer C.rocksdb_free(unsafe.Pointer(rocksErr)) + err := fmt.Errorf("getting %q failed: %v", string(keys[i]), C.GoString(rocksErr)) + errs = append(errs, err) + } + } + + if len(errs) > 0 { + return nil, fmt.Errorf("failed to get %d keys, first error: %v", len(errs), errs[0]) + } + + slices := make(Slices, len(keys)) + for i, val := range vals { + slices[i] = NewSlice(val, valSizes[i]) + } + + return slices, nil +} + +// MultiGetCF returns the data associated with the passed keys from the column family +func (db *DB) MultiGetCF(opts *ReadOptions, cf *ColumnFamilyHandle, keys ...[]byte) (Slices, error) { + cfs := make(ColumnFamilyHandles, len(keys)) + for i := 0; i < len(keys); i++ { + cfs[i] = cf + } + return db.MultiGetCFMultiCF(opts, cfs, keys) +} + +// MultiGetCFMultiCF returns the data associated with the passed keys and +// column families. 
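+//
+// Sketch reading one key per column family (ro, the handles, and the keys are
+// assumptions; cfs[i] must correspond to keys[i], and the result should be
+// released with Destroy):
+//
+//	vals, err := db.MultiGetCFMultiCF(ro,
+//		ColumnFamilyHandles{cfDefault, cfGuide},
+//		[][]byte{[]byte("k0"), []byte("k1")},
+//	)
+//	if err == nil {
+//		defer vals.Destroy()
+//	}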
+func (db *DB) MultiGetCFMultiCF(opts *ReadOptions, cfs ColumnFamilyHandles, keys [][]byte) (Slices, error) { + cKeys, cKeySizes := byteSlicesToCSlices(keys) + defer cKeys.Destroy() + vals := make(charsSlice, len(keys)) + valSizes := make(sizeTSlice, len(keys)) + rocksErrs := make(charsSlice, len(keys)) + + C.rocksdb_multi_get_cf( + db.c, + opts.c, + cfs.toCSlice().c(), + C.size_t(len(keys)), + cKeys.c(), + cKeySizes.c(), + vals.c(), + valSizes.c(), + rocksErrs.c(), + ) + + var errs []error + + for i, rocksErr := range rocksErrs { + if rocksErr != nil { + defer C.rocksdb_free(unsafe.Pointer(rocksErr)) + err := fmt.Errorf("getting %q failed: %v", string(keys[i]), C.GoString(rocksErr)) + errs = append(errs, err) + } + } + + if len(errs) > 0 { + return nil, fmt.Errorf("failed to get %d keys, first error: %v", len(errs), errs[0]) + } + + slices := make(Slices, len(keys)) + for i, val := range vals { + slices[i] = NewSlice(val, valSizes[i]) + } + + return slices, nil +} + +// Put writes data associated with a key to the database. +func (db *DB) Put(opts *WriteOptions, key, value []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + cValue = byteToChar(value) + ) + C.rocksdb_put(db.c, opts.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// PutCF writes data associated with a key to the database and column family. +func (db *DB) PutCF(opts *WriteOptions, cf *ColumnFamilyHandle, key, value []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + cValue = byteToChar(value) + ) + C.rocksdb_put_cf(db.c, opts.c, cf.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Delete removes the data associated with the key from the database. +func (db *DB) Delete(opts *WriteOptions, key []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + ) + C.rocksdb_delete(db.c, opts.c, cKey, C.size_t(len(key)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// DeleteCF removes the data associated with the key from the database and column family. +func (db *DB) DeleteCF(opts *WriteOptions, cf *ColumnFamilyHandle, key []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + ) + C.rocksdb_delete_cf(db.c, opts.c, cf.c, cKey, C.size_t(len(key)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Merge merges the data associated with the key with the actual data in the database. +func (db *DB) Merge(opts *WriteOptions, key []byte, value []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + cValue = byteToChar(value) + ) + C.rocksdb_merge(db.c, opts.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// MergeCF merges the data associated with the key with the actual data in the +// database and column family. 
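+//
+// Merge is only meaningful when the column family's Options carry a merge
+// operator; assuming one is configured, a call looks like:
+//
+//	wo := NewDefaultWriteOptions()
+//	if err := db.MergeCF(wo, cf, []byte("counter"), []byte("1")); err != nil {
+//		// handle error
+//	}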
+func (db *DB) MergeCF(opts *WriteOptions, cf *ColumnFamilyHandle, key []byte, value []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + cValue = byteToChar(value) + ) + C.rocksdb_merge_cf(db.c, opts.c, cf.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Write writes a WriteBatch to the database +func (db *DB) Write(opts *WriteOptions, batch *WriteBatch) error { + var cErr *C.char + C.rocksdb_write(db.c, opts.c, batch.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// NewIterator returns an Iterator over the the database that uses the +// ReadOptions given. +func (db *DB) NewIterator(opts *ReadOptions) *Iterator { + cIter := C.rocksdb_create_iterator(db.c, opts.c) + return NewNativeIterator(unsafe.Pointer(cIter)) +} + +// NewIteratorCF returns an Iterator over the the database and column family +// that uses the ReadOptions given. +func (db *DB) NewIteratorCF(opts *ReadOptions, cf *ColumnFamilyHandle) *Iterator { + cIter := C.rocksdb_create_iterator_cf(db.c, opts.c, cf.c) + return NewNativeIterator(unsafe.Pointer(cIter)) +} + +func (db *DB) GetUpdatesSince(seqNumber uint64) (*WalIterator, error) { + var cErr *C.char + cIter := C.rocksdb_get_updates_since(db.c, C.uint64_t(seqNumber), nil, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewNativeWalIterator(unsafe.Pointer(cIter)), nil +} + +func (db *DB) GetLatestSequenceNumber() uint64 { + return uint64(C.rocksdb_get_latest_sequence_number(db.c)) +} + +// NewSnapshot creates a new snapshot of the database. +func (db *DB) NewSnapshot() *Snapshot { + cSnap := C.rocksdb_create_snapshot(db.c) + return NewNativeSnapshot(cSnap) +} + +// ReleaseSnapshot releases the snapshot and its resources. +func (db *DB) ReleaseSnapshot(snapshot *Snapshot) { + C.rocksdb_release_snapshot(db.c, snapshot.c) + snapshot.c = nil +} + +// GetProperty returns the value of a database property. +func (db *DB) GetProperty(propName string) string { + cprop := C.CString(propName) + defer C.free(unsafe.Pointer(cprop)) + cValue := C.rocksdb_property_value(db.c, cprop) + defer C.rocksdb_free(unsafe.Pointer(cValue)) + return C.GoString(cValue) +} + +// GetPropertyCF returns the value of a database property. +func (db *DB) GetPropertyCF(propName string, cf *ColumnFamilyHandle) string { + cProp := C.CString(propName) + defer C.free(unsafe.Pointer(cProp)) + cValue := C.rocksdb_property_value_cf(db.c, cf.c, cProp) + defer C.rocksdb_free(unsafe.Pointer(cValue)) + return C.GoString(cValue) +} + +// CreateColumnFamily create a new column family. +func (db *DB) CreateColumnFamily(opts *Options, name string) (*ColumnFamilyHandle, error) { + var ( + cErr *C.char + cName = C.CString(name) + ) + defer C.free(unsafe.Pointer(cName)) + cHandle := C.rocksdb_create_column_family(db.c, opts.c, cName, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewNativeColumnFamilyHandle(cHandle), nil +} + +// CreateColumnFamilyWithTTL creates a new column family with a TTL. 
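+//
+// This is meant for databases opened through the TTL API (OpenDbWithTTL or
+// OpenDbColumnFamiliesWithTTL, as in the tests); a sketch with an assumed
+// one-hour TTL:
+//
+//	cf, err := db.CreateColumnFamilyWithTTL(NewDefaultOptions(), "sessions", 3600)
+//	if err != nil {
+//		// handle error
+//	}
+//	defer cf.Destroy()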
+func (db *DB) CreateColumnFamilyWithTTL(opts *Options, name string, ttl int) (*ColumnFamilyHandle, error) { + var ( + cErr *C.char + cName = C.CString(name) + cTtl = C.int(ttl) + ) + defer C.free(unsafe.Pointer(cName)) + cHandle := C.rocksdb_create_column_family_with_ttl(db.c, opts.c, cName, cTtl, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewNativeColumnFamilyHandle(cHandle), nil +} + +// DropColumnFamily drops a column family. +func (db *DB) DropColumnFamily(c *ColumnFamilyHandle) error { + var cErr *C.char + C.rocksdb_drop_column_family(db.c, c.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// GetApproximateSizes returns the approximate number of bytes of file system +// space used by one or more key ranges. +// +// The keys counted will begin at Range.Start and end on the key before +// Range.Limit. +func (db *DB) GetApproximateSizes(ranges []Range) ([]uint64, error) { + sizes := make([]uint64, len(ranges)) + if len(ranges) == 0 { + return sizes, nil + } + + cStarts := make([]*C.char, len(ranges)) + cLimits := make([]*C.char, len(ranges)) + cStartLens := make([]C.size_t, len(ranges)) + cLimitLens := make([]C.size_t, len(ranges)) + for i, r := range ranges { + cStarts[i] = (*C.char)(C.CBytes(r.Start)) + cStartLens[i] = C.size_t(len(r.Start)) + cLimits[i] = (*C.char)(C.CBytes(r.Limit)) + cLimitLens[i] = C.size_t(len(r.Limit)) + } + + defer func() { + for i := range ranges { + C.free(unsafe.Pointer(cStarts[i])) + C.free(unsafe.Pointer(cLimits[i])) + } + }() + + var cErr *C.char + C.rocksdb_approximate_sizes( + db.c, + C.int(len(ranges)), + &cStarts[0], + &cStartLens[0], + &cLimits[0], + &cLimitLens[0], + (*C.uint64_t)(&sizes[0]), + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return sizes, errors.New(C.GoString(cErr)) + } + + return sizes, nil +} + +// GetApproximateSizesCF returns the approximate number of bytes of file system +// space used by one or more key ranges in the column family. +// +// The keys counted will begin at Range.Start and end on the key before +// Range.Limit. +func (db *DB) GetApproximateSizesCF(cf *ColumnFamilyHandle, ranges []Range) ([]uint64, error) { + sizes := make([]uint64, len(ranges)) + if len(ranges) == 0 { + return sizes, nil + } + + cStarts := make([]*C.char, len(ranges)) + cLimits := make([]*C.char, len(ranges)) + cStartLens := make([]C.size_t, len(ranges)) + cLimitLens := make([]C.size_t, len(ranges)) + for i, r := range ranges { + cStarts[i] = (*C.char)(C.CBytes(r.Start)) + cStartLens[i] = C.size_t(len(r.Start)) + cLimits[i] = (*C.char)(C.CBytes(r.Limit)) + cLimitLens[i] = C.size_t(len(r.Limit)) + } + + defer func() { + for i := range ranges { + C.free(unsafe.Pointer(cStarts[i])) + C.free(unsafe.Pointer(cLimits[i])) + } + }() + + var cErr *C.char + C.rocksdb_approximate_sizes_cf( + db.c, + cf.c, + C.int(len(ranges)), + &cStarts[0], + &cStartLens[0], + &cLimits[0], + &cLimitLens[0], + (*C.uint64_t)(&sizes[0]), + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return sizes, errors.New(C.GoString(cErr)) + } + return sizes, nil +} + +// SetOptions dynamically changes options through the SetOptions API. 
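+//
+// Keys and values are RocksDB option names and string-encoded values; a
+// sketch (the chosen option is an assumption):
+//
+//	err := db.SetOptions(
+//		[]string{"disable_auto_compactions"},
+//		[]string{"true"},
+//	)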
+func (db *DB) SetOptions(keys, values []string) error { + num_keys := len(keys) + + if num_keys == 0 { + return nil + } + + cKeys := make([]*C.char, num_keys) + cValues := make([]*C.char, num_keys) + for i := range keys { + cKeys[i] = C.CString(keys[i]) + cValues[i] = C.CString(values[i]) + } + + var cErr *C.char + + C.rocksdb_set_options( + db.c, + C.int(num_keys), + &cKeys[0], + &cValues[0], + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// SetOptionsCF dynamically changes options through the SetOptions API for specific Column Family. +func (db *DB) SetOptionsCF(cf *ColumnFamilyHandle, keys, values []string) (err error) { + numKeys := len(keys) + if numKeys == 0 { + return nil + } + + cKeys := make([]*C.char, numKeys) + cValues := make([]*C.char, numKeys) + for i := range keys { + cKeys[i] = C.CString(keys[i]) + cValues[i] = C.CString(values[i]) + } + + var cErr *C.char + + C.rocksdb_set_options_cf( + db.c, + cf.c, + C.int(numKeys), + &cKeys[0], + &cValues[0], + &cErr, + ) + if cErr != nil { + err = errors.New(C.GoString(cErr)) + C.rocksdb_free(unsafe.Pointer(cErr)) + } + + // free before return + for i := range cKeys { + C.free(unsafe.Pointer(cKeys[i])) + C.free(unsafe.Pointer(cValues[i])) + } + + return +} + +// LiveFileMetadata is a metadata which is associated with each SST file. +type LiveFileMetadata struct { + Name string + ColumnFamilyName string + Level int + Size int64 + SmallestKey []byte + LargestKey []byte +} + +// GetLiveFilesMetaData returns a list of all table files with their +// level, start key and end key. +func (db *DB) GetLiveFilesMetaData() []LiveFileMetadata { + lf := C.rocksdb_livefiles(db.c) + defer C.rocksdb_livefiles_destroy(lf) + + count := C.rocksdb_livefiles_count(lf) + liveFiles := make([]LiveFileMetadata, int(count)) + for i := C.int(0); i < count; i++ { + var liveFile LiveFileMetadata + liveFile.Name = C.GoString(C.rocksdb_livefiles_name(lf, i)) + liveFile.Level = int(C.rocksdb_livefiles_level(lf, i)) + liveFile.Size = int64(C.rocksdb_livefiles_size(lf, i)) + liveFile.ColumnFamilyName = C.GoString(C.rocksdb_livefiles_column_family_name(lf, i)) + + var cSize C.size_t + key := C.rocksdb_livefiles_smallestkey(lf, i, &cSize) + liveFile.SmallestKey = C.GoBytes(unsafe.Pointer(key), C.int(cSize)) + + key = C.rocksdb_livefiles_largestkey(lf, i, &cSize) + liveFile.LargestKey = C.GoBytes(unsafe.Pointer(key), C.int(cSize)) + liveFiles[int(i)] = liveFile + } + return liveFiles +} + +// CompactRange runs a manual compaction on the Range of keys given. This is +// not likely to be needed for typical usage. +func (db *DB) CompactRange(r Range) { + cStart := byteToChar(r.Start) + cLimit := byteToChar(r.Limit) + C.rocksdb_compact_range(db.c, cStart, C.size_t(len(r.Start)), cLimit, C.size_t(len(r.Limit))) +} + +// CompactRangeCF runs a manual compaction on the Range of keys given on the +// given column family. This is not likely to be needed for typical usage. +func (db *DB) CompactRangeCF(cf *ColumnFamilyHandle, r Range) { + cStart := byteToChar(r.Start) + cLimit := byteToChar(r.Limit) + C.rocksdb_compact_range_cf(db.c, cf.c, cStart, C.size_t(len(r.Start)), cLimit, C.size_t(len(r.Limit))) +} + +// Flush triggers a manuel flush for the database. 
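+//
+// A short sketch; flushing forces the current memtable contents into SST
+// files on disk:
+//
+//	fo := NewDefaultFlushOptions()
+//	defer fo.Destroy()
+//	if err := db.Flush(fo); err != nil {
+//		// handle error
+//	}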
+func (db *DB) Flush(opts *FlushOptions) error { + var cErr *C.char + C.rocksdb_flush(db.c, opts.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// FlushCF triggers a manual flush for the column family. +func (db *DB) FlushCF(cf *ColumnFamilyHandle, opts *FlushOptions) error { + var cErr *C.char + C.rocksdb_flush_cf(db.c, opts.c, cf.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// DisableFileDeletions disables file deletions and should be used when backup the database. +func (db *DB) DisableFileDeletions() error { + var cErr *C.char + C.rocksdb_disable_file_deletions(db.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// EnableFileDeletions enables file deletions for the database. +func (db *DB) EnableFileDeletions(force bool) error { + var cErr *C.char + C.rocksdb_enable_file_deletions(db.c, boolToChar(force), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// DeleteFile deletes the file name from the db directory and update the internal state to +// reflect that. Supports deletion of sst and log files only. 'name' must be +// path relative to the db directory. eg. 000001.sst, /archive/000003.log. +func (db *DB) DeleteFile(name string) { + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + C.rocksdb_delete_file(db.c, cName) +} + +// DeleteFileInRange deletes SST files that contain keys between the Range, [r.Start, r.Limit] +func (db *DB) DeleteFileInRange(r Range) error { + cStartKey := byteToChar(r.Start) + cLimitKey := byteToChar(r.Limit) + + var cErr *C.char + + C.rocksdb_delete_file_in_range( + db.c, + cStartKey, C.size_t(len(r.Start)), + cLimitKey, C.size_t(len(r.Limit)), + &cErr, + ) + + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// DeleteFileInRangeCF deletes SST files that contain keys between the Range, [r.Start, r.Limit], and +// belong to a given column family +func (db *DB) DeleteFileInRangeCF(cf *ColumnFamilyHandle, r Range) error { + cStartKey := byteToChar(r.Start) + cLimitKey := byteToChar(r.Limit) + + var cErr *C.char + + C.rocksdb_delete_file_in_range_cf( + db.c, + cf.c, + cStartKey, C.size_t(len(r.Start)), + cLimitKey, C.size_t(len(r.Limit)), + &cErr, + ) + + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// IngestExternalFile loads a list of external SST files. +func (db *DB) IngestExternalFile(filePaths []string, opts *IngestExternalFileOptions) error { + cFilePaths := make([]*C.char, len(filePaths)) + for i, s := range filePaths { + cFilePaths[i] = C.CString(s) + } + defer func() { + for _, s := range cFilePaths { + C.free(unsafe.Pointer(s)) + } + }() + + var cErr *C.char + + C.rocksdb_ingest_external_file( + db.c, + &cFilePaths[0], + C.size_t(len(filePaths)), + opts.c, + &cErr, + ) + + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// IngestExternalFileCF loads a list of external SST files for a column family. 
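+//
+// The files are typically produced with an SSTFileWriter first (see the
+// external file test); a sketch with an assumed path and handle:
+//
+//	ifo := NewDefaultIngestExternalFileOptions()
+//	if err := db.IngestExternalFileCF(cf, []string{"/tmp/bulk.sst"}, ifo); err != nil {
+//		// handle error
+//	}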
+func (db *DB) IngestExternalFileCF(handle *ColumnFamilyHandle, filePaths []string, opts *IngestExternalFileOptions) error { + cFilePaths := make([]*C.char, len(filePaths)) + for i, s := range filePaths { + cFilePaths[i] = C.CString(s) + } + defer func() { + for _, s := range cFilePaths { + C.free(unsafe.Pointer(s)) + } + }() + + var cErr *C.char + + C.rocksdb_ingest_external_file_cf( + db.c, + handle.c, + &cFilePaths[0], + C.size_t(len(filePaths)), + opts.c, + &cErr, + ) + + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// NewCheckpoint creates a new Checkpoint for this db. +func (db *DB) NewCheckpoint() (*Checkpoint, error) { + var ( + cErr *C.char + ) + cCheckpoint := C.rocksdb_checkpoint_object_create( + db.c, &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + + return NewNativeCheckpoint(cCheckpoint), nil +} + +// Close closes the database. +func (db *DB) Close() { + C.rocksdb_close(db.c) +} + +// TryCatchUpWithPrimary will sync a secondary db with the state of the primary +func (db *DB) TryCatchUpWithPrimary() error { + var ( + cErr *C.char + ) + C.rocksdb_try_catch_up_with_primary(db.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + + return nil +} + +// DestroyDb removes a database entirely, removing everything from the +// filesystem. +func DestroyDb(name string, opts *Options) error { + var ( + cErr *C.char + cName = C.CString(name) + ) + defer C.free(unsafe.Pointer(cName)) + C.rocksdb_destroy_db(opts.c, cName, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// RepairDb repairs a database. 
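+//
+// The database must not be open while it is repaired; a sketch with an
+// assumed path:
+//
+//	db.Close()
+//	if err := RepairDb("/tmp/db", NewDefaultOptions()); err != nil {
+//		// handle error
+//	}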
+func RepairDb(name string, opts *Options) error { + var ( + cErr *C.char + cName = C.CString(name) + ) + defer C.free(unsafe.Pointer(cName)) + C.rocksdb_repair_db(opts.c, cName, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} diff --git a/v8/db_external_file_test.go b/v8/db_external_file_test.go new file mode 100644 index 00000000..9eae5aab --- /dev/null +++ b/v8/db_external_file_test.go @@ -0,0 +1,57 @@ +package gorocksdb + +import ( + "io/ioutil" + "os" + "testing" + + "github.com/facebookgo/ensure" +) + +func TestExternalFile(t *testing.T) { + db := newTestDB(t, "TestDBExternalFile", nil) + defer db.Close() + + envOpts := NewDefaultEnvOptions() + opts := NewDefaultOptions() + w := NewSSTFileWriter(envOpts, opts) + defer w.Destroy() + + filePath, err := ioutil.TempFile("", "sst-file-test") + ensure.Nil(t, err) + defer os.Remove(filePath.Name()) + + err = w.Open(filePath.Name()) + ensure.Nil(t, err) + + err = w.Add([]byte("aaa"), []byte("aaaValue")) + ensure.Nil(t, err) + err = w.Add([]byte("bbb"), []byte("bbbValue")) + ensure.Nil(t, err) + err = w.Add([]byte("ccc"), []byte("cccValue")) + ensure.Nil(t, err) + err = w.Add([]byte("ddd"), []byte("dddValue")) + ensure.Nil(t, err) + + err = w.Finish() + ensure.Nil(t, err) + + ingestOpts := NewDefaultIngestExternalFileOptions() + err = db.IngestExternalFile([]string{filePath.Name()}, ingestOpts) + ensure.Nil(t, err) + + readOpts := NewDefaultReadOptions() + + v1, err := db.Get(readOpts, []byte("aaa")) + ensure.Nil(t, err) + ensure.DeepEqual(t, v1.Data(), []byte("aaaValue")) + v2, err := db.Get(readOpts, []byte("bbb")) + ensure.Nil(t, err) + ensure.DeepEqual(t, v2.Data(), []byte("bbbValue")) + v3, err := db.Get(readOpts, []byte("ccc")) + ensure.Nil(t, err) + ensure.DeepEqual(t, v3.Data(), []byte("cccValue")) + v4, err := db.Get(readOpts, []byte("ddd")) + ensure.Nil(t, err) + ensure.DeepEqual(t, v4.Data(), []byte("dddValue")) +} diff --git a/v8/db_test.go b/v8/db_test.go new file mode 100755 index 00000000..fc5814d9 --- /dev/null +++ b/v8/db_test.go @@ -0,0 +1,457 @@ +package gorocksdb + +import ( + "io/ioutil" + "os" + "strconv" + "testing" + + "github.com/facebookgo/ensure" +) + +func TestOpenDb(t *testing.T) { + db := newTestDB(t, "TestOpenDb", nil) + defer db.Close() +} + +func TestOpenDbColumnFamiliesWithTTL(t *testing.T) { + dir, err := ioutil.TempDir("", "gorocksdb-TestOpenDbColumnFamiliesWithTtl") + ensure.Nil(t, err) + + opts := NewDefaultOptions() + defer opts.Destroy() + + opts.SetCreateIfMissing(true) + opts.SetCreateIfMissingColumnFamilies(true) + + db, _, err := OpenDbColumnFamiliesWithTTL(opts, dir, []string{"default", "mycf"}, []*Options{opts, opts}, []int{3600, 3600}) + defer db.Close() + + ensure.Nil(t, err) +} + +func TestCreateColumnFamilyWithTTL(t *testing.T) { + db := newTestDBWithTTL(t, "TestCreateColumnFamilyWithTTL", nil) + defer db.Close() + + var ( + givenKey = []byte("hello") + givenVal = []byte("world") + o = NewDefaultOptions() + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + ) + + cf, err := db.CreateColumnFamilyWithTTL(o, "cf", 3600) + ensure.Nil(t, err) + + ensure.Nil(t, db.PutCF(wo, cf, givenKey, givenVal)) + + v, err := db.GetCF(ro, cf, givenKey) + defer v.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v.Data(), givenVal) +} + +func TestDBCRUD(t *testing.T) { + db := newTestDB(t, "TestDBGet", nil) + defer db.Close() + + var ( + givenKey = []byte("hello") + givenVal1 = []byte("") + givenVal2 = []byte("world1") + 
wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + ) + + // create + ensure.Nil(t, db.Put(wo, givenKey, givenVal1)) + + // retrieve + v1, err := db.Get(ro, givenKey) + defer v1.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v1.Data(), givenVal1) + + // update + ensure.Nil(t, db.Put(wo, givenKey, givenVal2)) + v2, err := db.Get(ro, givenKey) + defer v2.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v2.Data(), givenVal2) + + // retrieve pinned + v3, err := db.GetPinned(ro, givenKey) + defer v3.Destroy() + ensure.Nil(t, err) + ensure.DeepEqual(t, v3.Data(), givenVal2) + + // delete + ensure.Nil(t, db.Delete(wo, givenKey)) + v4, err := db.Get(ro, givenKey) + ensure.Nil(t, err) + ensure.True(t, v4.Data() == nil) + + // retrieve missing pinned + v5, err := db.GetPinned(ro, givenKey) + defer v5.Destroy() + ensure.Nil(t, err) + ensure.True(t, v5.Data() == nil) +} + +func TestDBCRUDDBPaths(t *testing.T) { + names := make([]string, 4) + target_sizes := make([]uint64, len(names)) + + for i := range names { + names[i] = "TestDBGet_" + strconv.FormatInt(int64(i), 10) + target_sizes[i] = uint64(1024 * 1024 * (i + 1)) + } + + db := newTestDBPathNames(t, "TestDBGet", names, target_sizes, nil) + defer db.Close() + + var ( + givenKey = []byte("hello") + givenVal1 = []byte("") + givenVal2 = []byte("world1") + givenVal3 = []byte("world2") + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + ) + + // retrieve before create + noexist, err := db.Get(ro, givenKey) + defer noexist.Free() + ensure.Nil(t, err) + ensure.False(t, noexist.Exists()) + ensure.DeepEqual(t, noexist.Data(), []byte(nil)) + + // create + ensure.Nil(t, db.Put(wo, givenKey, givenVal1)) + + // retrieve + v1, err := db.Get(ro, givenKey) + defer v1.Free() + ensure.Nil(t, err) + ensure.True(t, v1.Exists()) + ensure.DeepEqual(t, v1.Data(), givenVal1) + + // update + ensure.Nil(t, db.Put(wo, givenKey, givenVal2)) + v2, err := db.Get(ro, givenKey) + defer v2.Free() + ensure.Nil(t, err) + ensure.True(t, v2.Exists()) + ensure.DeepEqual(t, v2.Data(), givenVal2) + + // update + ensure.Nil(t, db.Put(wo, givenKey, givenVal3)) + v3, err := db.Get(ro, givenKey) + defer v3.Free() + ensure.Nil(t, err) + ensure.True(t, v3.Exists()) + ensure.DeepEqual(t, v3.Data(), givenVal3) + + // delete + ensure.Nil(t, db.Delete(wo, givenKey)) + v4, err := db.Get(ro, givenKey) + defer v4.Free() + ensure.Nil(t, err) + ensure.False(t, v4.Exists()) + ensure.DeepEqual(t, v4.Data(), []byte(nil)) +} + +func newTestDB(t *testing.T, name string, applyOpts func(opts *Options)) *DB { + dir, err := ioutil.TempDir("", "gorocksdb-"+name) + ensure.Nil(t, err) + + opts := NewDefaultOptions() + // test the ratelimiter + rateLimiter := NewRateLimiter(1024, 100*1000, 10) + opts.SetRateLimiter(rateLimiter) + opts.SetCreateIfMissing(true) + if applyOpts != nil { + applyOpts(opts) + } + db, err := OpenDb(opts, dir) + ensure.Nil(t, err) + + return db +} + +func newTestDBWithTTL(t *testing.T, name string, applyOpts func(opts *Options)) *DB { + dir, err := ioutil.TempDir("", "gorocksdb-"+name) + ensure.Nil(t, err) + + opts := NewDefaultOptions() + // test the ratelimiter + rateLimiter := NewRateLimiter(1024, 100*1000, 10) + opts.SetRateLimiter(rateLimiter) + opts.SetCreateIfMissing(true) + if applyOpts != nil { + applyOpts(opts) + } + db, err := OpenDbWithTTL(opts, dir, 3600) + ensure.Nil(t, err) + + return db +} + +func newSecondaryTestDB(t *testing.T, name string) *DB { + secondaryDir := name + "-secondary" + + opts := NewDefaultOptions() + opts.SetMaxOpenFiles(-1) + 
db, err := OpenDbAsSecondary(opts, name, secondaryDir) + ensure.Nil(t, err) + + return db +} + +func newSecondaryTestDBCF(t *testing.T, name string, cfNames []string, cfOpts []*Options) (*DB, []*ColumnFamilyHandle) { + secondaryDir := name + "-secondary" + + opts := NewDefaultOptions() + opts.SetMaxOpenFiles(-1) + db, handles, err := OpenDbAsSecondaryColumnFamilies(opts, name, secondaryDir, cfNames, cfOpts) + ensure.Nil(t, err) + + return db, handles +} + +func newTestDBPathNames(t *testing.T, name string, names []string, target_sizes []uint64, applyOpts func(opts *Options)) *DB { + ensure.DeepEqual(t, len(target_sizes), len(names)) + ensure.NotDeepEqual(t, len(names), 0) + + dir, err := ioutil.TempDir("", "gorocksdb-"+name) + ensure.Nil(t, err) + + paths := make([]string, len(names)) + for i, name := range names { + dir, err := ioutil.TempDir("", "gorocksdb-"+name) + ensure.Nil(t, err) + paths[i] = dir + } + + dbpaths := NewDBPathsFromData(paths, target_sizes) + defer DestroyDBPaths(dbpaths) + + opts := NewDefaultOptions() + opts.SetDBPaths(dbpaths) + // test the ratelimiter + rateLimiter := NewRateLimiter(1024, 100*1000, 10) + opts.SetRateLimiter(rateLimiter) + opts.SetCreateIfMissing(true) + if applyOpts != nil { + applyOpts(opts) + } + db, err := OpenDb(opts, dir) + ensure.Nil(t, err) + + return db +} + +func TestDBMultiGet(t *testing.T) { + db := newTestDB(t, "TestDBMultiGet", nil) + defer db.Close() + + var ( + givenKey1 = []byte("hello1") + givenKey2 = []byte("hello2") + givenKey3 = []byte("hello3") + givenVal1 = []byte("world1") + givenVal2 = []byte("world2") + givenVal3 = []byte("world3") + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + ) + + // create + ensure.Nil(t, db.Put(wo, givenKey1, givenVal1)) + ensure.Nil(t, db.Put(wo, givenKey2, givenVal2)) + ensure.Nil(t, db.Put(wo, givenKey3, givenVal3)) + + // retrieve + values, err := db.MultiGet(ro, []byte("noexist"), givenKey1, givenKey2, givenKey3) + defer values.Destroy() + ensure.Nil(t, err) + ensure.DeepEqual(t, len(values), 4) + + ensure.DeepEqual(t, values[0].Data(), []byte(nil)) + ensure.DeepEqual(t, values[1].Data(), givenVal1) + ensure.DeepEqual(t, values[2].Data(), givenVal2) + ensure.DeepEqual(t, values[3].Data(), givenVal3) +} + +func TestDBGetApproximateSizes(t *testing.T) { + db := newTestDB(t, "TestDBGetApproximateSizes", nil) + defer db.Close() + + // no ranges + sizes, err := db.GetApproximateSizes(nil) + ensure.DeepEqual(t, len(sizes), 0) + ensure.Nil(t, err) + + // range will nil start and limit + sizes, err = db.GetApproximateSizes([]Range{{Start: nil, Limit: nil}}) + ensure.DeepEqual(t, sizes, []uint64{0}) + ensure.Nil(t, err) + + // valid range + sizes, err = db.GetApproximateSizes([]Range{{Start: []byte{0x00}, Limit: []byte{0xFF}}}) + ensure.DeepEqual(t, sizes, []uint64{0}) + ensure.Nil(t, err) +} + +func TestDBGetApproximateSizesCF(t *testing.T) { + db := newTestDB(t, "TestDBGetApproximateSizesCF", nil) + defer db.Close() + + o := NewDefaultOptions() + + cf, err := db.CreateColumnFamily(o, "other") + ensure.Nil(t, err) + + // no ranges + sizes, err := db.GetApproximateSizesCF(cf, nil) + ensure.DeepEqual(t, len(sizes), 0) + ensure.Nil(t, err) + + // range will nil start and limit + sizes, err = db.GetApproximateSizesCF(cf, []Range{{Start: nil, Limit: nil}}) + ensure.DeepEqual(t, sizes, []uint64{0}) + ensure.Nil(t, err) + + // valid range + sizes, err = db.GetApproximateSizesCF(cf, []Range{{Start: []byte{0x00}, Limit: []byte{0xFF}}}) + ensure.DeepEqual(t, sizes, []uint64{0}) + ensure.Nil(t, 
err) +} + +func TestDBFlushCF(t *testing.T) { + var ( + db = newTestDB(t, "TestDBFlushCF", nil) + o = NewDefaultOptions() + wo = NewDefaultWriteOptions() + fo = NewDefaultFlushOptions() + + key1 = []byte("hello1") + val1 = []byte("world1") + ) + defer func() { + fo.Destroy() + wo.Destroy() + db.Close() + }() + + cf, err := db.CreateColumnFamily(o, "other") + ensure.Nil(t, err) + + // update + ensure.Nil(t, db.PutCF(wo, cf, key1, val1)) + + // flush CF + ensure.Nil(t, db.FlushCF(cf, fo)) +} + +func TestSecondaryDB(t *testing.T) { + var ( + db = newTestDB(t, "TestSecondaryDB", nil) + secondaryDB = newSecondaryTestDB(t, db.Name()) + ro = NewDefaultReadOptions() + wo = NewDefaultWriteOptions() + fo = NewDefaultFlushOptions() + ) + defer func() { + fo.Destroy() + wo.Destroy() + ro.Destroy() + secondaryDB.Close() + db.Close() + + os.RemoveAll(secondaryDB.SecondaryPath()) + os.RemoveAll(db.Name()) + }() + + // Put a key into the primary database + ensure.Nil(t, db.Put(wo, []byte("hello"), []byte("world"))) + ensure.Nil(t, db.Flush(fo)) + + // Ensure the key is written correctly + s, err := db.Get(ro, []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + + // Get the key from the secondary database, and ensure that we cannot see the key yet + s, err = secondaryDB.Get(ro, []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + ensure.DeepEqual(t, s.Data(), []byte(nil)) + + // Catch up the secondary with the current state of the primary + err = secondaryDB.TryCatchUpWithPrimary() + ensure.Nil(t, err) + + // Ensure that now that it has caught up that the key is now present + s, err = secondaryDB.Get(ro, []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + ensure.DeepEqual(t, s.Data(), []byte("world")) +} + +func TestSecondaryDBColumnFamilies(t *testing.T) { + var ( + db = newTestDB(t, "TestSecondaryDB", nil) + o = NewDefaultOptions() + ro = NewDefaultReadOptions() + wo = NewDefaultWriteOptions() + fo = NewDefaultFlushOptions() + ) + defer func() { + fo.Destroy() + wo.Destroy() + ro.Destroy() + o.Destroy() + db.Close() + + os.RemoveAll(db.Name()) + }() + + // Create a column family + primaryCF, err := db.CreateColumnFamily(o, "mycf") + ensure.Nil(t, err) + + // Open a secondary database, opening the created column family + secondaryDB, handles := newSecondaryTestDBCF(t, db.Name(), []string{"default", "mycf"}, []*Options{o, o}) + defer func() { + secondaryDB.Close() + os.RemoveAll(secondaryDB.SecondaryPath()) + }() + + // Put a key into the primary database + ensure.Nil(t, db.PutCF(wo, primaryCF, []byte("hello"), []byte("world"))) + ensure.Nil(t, db.FlushCF(primaryCF, fo)) + + // Ensure the key is written correctly + s, err := db.GetCF(ro, primaryCF, []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + + // Get the key from the secondary database, and ensure that we cannot see the key yet + s, err = secondaryDB.GetCF(ro, handles[1], []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + ensure.DeepEqual(t, s.Data(), []byte(nil)) + + // Catch up the secondary with the current state of the primary + err = secondaryDB.TryCatchUpWithPrimary() + ensure.Nil(t, err) + + // Ensure that now that it has caught up that the key is now present + s, err = secondaryDB.GetCF(ro, handles[1], []byte("hello")) + ensure.Nil(t, err) + ensure.NotNil(t, s) + ensure.DeepEqual(t, s.Data(), []byte("world")) +} diff --git a/v8/dbpath.go b/v8/dbpath.go new file mode 100644 index 00000000..bec984ad --- /dev/null +++ b/v8/dbpath.go @@ -0,0 +1,48 @@ +package gorocksdb + +// #include +// 
#include "rocksdb/c.h" +import "C" +import "unsafe" + +// DBPath represents options for a dbpath. +type DBPath struct { + c *C.rocksdb_dbpath_t +} + +// NewDBPath creates a DBPath object +// with the given path and target_size. +func NewDBPath(path string, target_size uint64) *DBPath { + cpath := C.CString(path) + defer C.free(unsafe.Pointer(cpath)) + return NewNativeDBPath(C.rocksdb_dbpath_create(cpath, C.uint64_t(target_size))) +} + +// NewNativeDBPath creates a DBPath object. +func NewNativeDBPath(c *C.rocksdb_dbpath_t) *DBPath { + return &DBPath{c} +} + +// Destroy deallocates the DBPath object. +func (dbpath *DBPath) Destroy() { + C.rocksdb_dbpath_destroy(dbpath.c) +} + +// NewDBPathsFromData creates a slice with allocated DBPath objects +// from paths and target_sizes. +func NewDBPathsFromData(paths []string, target_sizes []uint64) []*DBPath { + dbpaths := make([]*DBPath, len(paths)) + for i, path := range paths { + targetSize := target_sizes[i] + dbpaths[i] = NewDBPath(path, targetSize) + } + + return dbpaths +} + +// DestroyDBPaths deallocates all DBPath objects in dbpaths. +func DestroyDBPaths(dbpaths []*DBPath) { + for _, dbpath := range dbpaths { + dbpath.Destroy() + } +} diff --git a/v8/doc.go b/v8/doc.go new file mode 100644 index 00000000..f56a0926 --- /dev/null +++ b/v8/doc.go @@ -0,0 +1,74 @@ +/* +Package gorocksdb provides the ability to create and access RocksDB databases. + +gorocksdb.OpenDb opens and creates databases. + + bbto := gorocksdb.NewDefaultBlockBasedTableOptions() + bbto.SetBlockCache(gorocksdb.NewLRUCache(3 << 30)) + opts := gorocksdb.NewDefaultOptions() + opts.SetBlockBasedTableFactory(bbto) + opts.SetCreateIfMissing(true) + db, err := gorocksdb.OpenDb(opts, "/path/to/db") + +The DB struct returned by OpenDb provides DB.Get, DB.Put, DB.Merge and DB.Delete to modify +and query the database. + + ro := gorocksdb.NewDefaultReadOptions() + wo := gorocksdb.NewDefaultWriteOptions() + // if ro and wo are not used again, be sure to Close them. + err = db.Put(wo, []byte("foo"), []byte("bar")) + ... + value, err := db.Get(ro, []byte("foo")) + defer value.Free() + ... + err = db.Delete(wo, []byte("foo")) + +For bulk reads, use an Iterator. If you want to avoid disturbing your live +traffic while doing the bulk read, be sure to call SetFillCache(false) on the +ReadOptions you use when creating the Iterator. + + ro := gorocksdb.NewDefaultReadOptions() + ro.SetFillCache(false) + it := db.NewIterator(ro) + defer it.Close() + it.Seek([]byte("foo")) + for it = it; it.Valid(); it.Next() { + key := it.Key() + value := it.Value() + fmt.Printf("Key: %v Value: %v\n", key.Data(), value.Data()) + key.Free() + value.Free() + } + if err := it.Err(); err != nil { + ... + } + +Batched, atomic writes can be performed with a WriteBatch and +DB.Write. + + wb := gorocksdb.NewWriteBatch() + // defer wb.Close or use wb.Clear and reuse. + wb.Delete([]byte("foo")) + wb.Put([]byte("foo"), []byte("bar")) + wb.Put([]byte("bar"), []byte("foo")) + err := db.Write(wo, wb) + +If your working dataset does not fit in memory, you'll want to add a bloom +filter to your database. NewBloomFilter and +BlockBasedTableOptions.SetFilterPolicy is what you want. NewBloomFilter is +amount of bits in the filter to use per key in your database. 
+ + filter := gorocksdb.NewBloomFilter(10) + bbto := gorocksdb.NewDefaultBlockBasedTableOptions() + bbto.SetFilterPolicy(filter) + opts.SetBlockBasedTableFactory(bbto) + db, err := gorocksdb.OpenDb(opts, "/path/to/db") + +If you're using a custom comparator in your code, be aware you may have to +make your own filter policy object. + +This documentation is not a complete discussion of RocksDB. Please read the +RocksDB documentation for information on its +operation. You'll find lots of goodies there. +*/ +package gorocksdb diff --git a/v8/dynflag.go b/v8/dynflag.go new file mode 100644 index 00000000..91229639 --- /dev/null +++ b/v8/dynflag.go @@ -0,0 +1,7 @@ +//go:build !linux || !rocksdbstatic +// +build !linux !rocksdbstatic + +package gorocksdb + +// #cgo LDFLAGS: -lrocksdb -lstdc++ -lm -lz -lbz2 -lsnappy -llz4 -lzstd -ldl +import "C" diff --git a/v8/env.go b/v8/env.go new file mode 100644 index 00000000..11e84ef8 --- /dev/null +++ b/v8/env.go @@ -0,0 +1,45 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// Env is a system call environment used by a database. +type Env struct { + c *C.rocksdb_env_t +} + +// NewDefaultEnv creates a default environment. +func NewDefaultEnv() *Env { + return NewNativeEnv(C.rocksdb_create_default_env()) +} + +// NewMemEnv creates MemEnv for in-memory testing. +func NewMemEnv() *Env { + return NewNativeEnv(C.rocksdb_create_mem_env()) +} + +// NewNativeEnv creates a Environment object. +func NewNativeEnv(c *C.rocksdb_env_t) *Env { + return &Env{c} +} + +// SetBackgroundThreads sets the number of background worker threads +// of a specific thread pool for this environment. +// 'LOW' is the default pool. +// Default: 1 +func (env *Env) SetBackgroundThreads(n int) { + C.rocksdb_env_set_background_threads(env.c, C.int(n)) +} + +// SetHighPriorityBackgroundThreads sets the size of the high priority +// thread pool that can be used to prevent compactions from stalling +// memtable flushes. +func (env *Env) SetHighPriorityBackgroundThreads(n int) { + C.rocksdb_env_set_high_priority_background_threads(env.c, C.int(n)) +} + +// Destroy deallocates the Env object. +func (env *Env) Destroy() { + C.rocksdb_env_destroy(env.c) + env.c = nil +} diff --git a/v8/filter_policy.go b/v8/filter_policy.go new file mode 100644 index 00000000..807760ed --- /dev/null +++ b/v8/filter_policy.go @@ -0,0 +1,44 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +type FilterPolicy struct { + c *C.rocksdb_filterpolicy_t +} + +// NewFilterPolicy creates a FilterPolicy object. +func NewFilterPolicy(c *C.rocksdb_filterpolicy_t) FilterPolicy { + return FilterPolicy{c} +} + +// NewBloomFilter returns a new filter policy that uses a bloom filter with approximately +// the specified number of bits per key. A good value for bits_per_key +// is 10, which yields a filter with ~1% false positive rate. +// +// Note: if you are using a custom comparator that ignores some parts +// of the keys being compared, you must not use NewBloomFilterPolicy() +// and must provide your own FilterPolicy that also ignores the +// corresponding parts of the keys. For example, if the comparator +// ignores trailing spaces, it would be incorrect to use a +// FilterPolicy (like NewBloomFilterPolicy) that does not ignore +// trailing spaces in keys. 
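For reference, a minimal sketch of wiring one of these filter policies into a table factory, in the same style as the package example in doc.go; all setters used here appear elsewhere in this patch, and the choice of policy is only illustrative:

	// ~10 bits per key gives roughly a 1% false-positive rate.
	filter := gorocksdb.NewBloomFilterFull(10)
	// Alternatively, a ribbon/bloom hybrid that keeps bloom filters
	// above level 2: gorocksdb.NewRibbonHybridFilter(10, 2)

	bbto := gorocksdb.NewDefaultBlockBasedTableOptions()
	bbto.SetFilterPolicy(filter)

	opts := gorocksdb.NewDefaultOptions()
	opts.SetBlockBasedTableFactory(bbto)
	opts.SetCreateIfMissing(true)
	db, err := gorocksdb.OpenDb(opts, "/path/to/db")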
+func NewBloomFilter(bitsPerKey float64) FilterPolicy { + return NewFilterPolicy(C.rocksdb_filterpolicy_create_bloom(C.double(bitsPerKey))) +} + +// NewBloomFilterFull returns a new filter policy created with use_block_based_builder=false +// (use full or partitioned filter). +func NewBloomFilterFull(bitsPerKey float64) FilterPolicy { + return NewFilterPolicy(C.rocksdb_filterpolicy_create_bloom_full(C.double(bitsPerKey))) +} + +// NewRibbonFilter returns a new filter policy created with a ribbon filter. +func NewRibbonFilter(bitsPerKey float64) FilterPolicy { + return NewFilterPolicy(C.rocksdb_filterpolicy_create_ribbon(C.double(bitsPerKey))) +} + +// NewRibbonHybridFilter returns a new filter policy created with a ribbon hybrid filter. +func NewRibbonHybridFilter(bitsPerKey float64, bloomBeforeLevel int) FilterPolicy { + return NewFilterPolicy(C.rocksdb_filterpolicy_create_ribbon_hybrid(C.double(bitsPerKey), C.int(bloomBeforeLevel))) +} diff --git a/v8/go.mod b/v8/go.mod new file mode 100644 index 00000000..0157cc25 --- /dev/null +++ b/v8/go.mod @@ -0,0 +1,16 @@ +module github.com/DataDog/gorocksdb/v8 + +go 1.20 + +require ( + github.com/facebookgo/ensure v0.0.0-20200202191622-63f1cf65ac4c + github.com/stretchr/testify v1.9.0 +) + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/facebookgo/stack v0.0.0-20160209184415-751773369052 // indirect + github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/v8/go.sum b/v8/go.sum new file mode 100644 index 00000000..eff251f6 --- /dev/null +++ b/v8/go.sum @@ -0,0 +1,16 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/facebookgo/ensure v0.0.0-20200202191622-63f1cf65ac4c h1:8ISkoahWXwZR41ois5lSJBSVw4D0OV19Ht/JSTzvSv0= +github.com/facebookgo/ensure v0.0.0-20200202191622-63f1cf65ac4c/go.mod h1:Yg+htXGokKKdzcwhuNDwVvN+uBxDGXJ7G/VN1d8fa64= +github.com/facebookgo/stack v0.0.0-20160209184415-751773369052 h1:JWuenKqqX8nojtoVVWjGfOF9635RETekkoH6Cc9SX0A= +github.com/facebookgo/stack v0.0.0-20160209184415-751773369052/go.mod h1:UbMTZqLaRiH3MsBH8va0n7s1pQYcu3uTb8G4tygF4Zg= +github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4 h1:7HZCaLC5+BZpmbhCOZJ293Lz68O7PYrF2EzeiFMwCLk= +github.com/facebookgo/subset v0.0.0-20200203212716-c811ad88dec4/go.mod h1:5tD+neXqOorC30/tWg0LCSkrqj/AR6gu8yY8/fpw1q0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/v8/gorocksdb.c b/v8/gorocksdb.c new file mode 100644 index 00000000..3f76c6d1 --- /dev/null +++ b/v8/gorocksdb.c @@ -0,0 +1,55 @@ +#include "gorocksdb.h" +#include "_cgo_export.h" + +/* Base */ + +void gorocksdb_destruct_handler(void* state) { } + +/* Comparator */ + 
+rocksdb_comparator_t* gorocksdb_comparator_create(uintptr_t idx) { + return rocksdb_comparator_create( + (void*)idx, + gorocksdb_destruct_handler, + (int (*)(void*, const char*, size_t, const char*, size_t))(gorocksdb_comparator_compare), + (const char *(*)(void*))(gorocksdb_comparator_name)); +} + +/* CompactionFilter */ + +rocksdb_compactionfilter_t* gorocksdb_compactionfilter_create(uintptr_t idx) { + return rocksdb_compactionfilter_create( + (void*)idx, + gorocksdb_destruct_handler, + (unsigned char (*)(void*, int, const char*, size_t, const char*, size_t, char**, size_t*, unsigned char*))(gorocksdb_compactionfilter_filter), + (const char *(*)(void*))(gorocksdb_compactionfilter_name)); +} + + +/* Merge Operator */ + +rocksdb_mergeoperator_t* gorocksdb_mergeoperator_create(uintptr_t idx) { + return rocksdb_mergeoperator_create( + (void*)idx, + gorocksdb_destruct_handler, + (char* (*)(void*, const char*, size_t, const char*, size_t, const char* const*, const size_t*, int, unsigned char*, size_t*))(gorocksdb_mergeoperator_full_merge), + (char* (*)(void*, const char*, size_t, const char* const*, const size_t*, int, unsigned char*, size_t*))(gorocksdb_mergeoperator_partial_merge_multi), + gorocksdb_mergeoperator_delete_value, + (const char* (*)(void*))(gorocksdb_mergeoperator_name)); +} + +void gorocksdb_mergeoperator_delete_value(void* id, const char* v, size_t s) { + free((char*)v); +} + +/* Slice Transform */ + +rocksdb_slicetransform_t* gorocksdb_slicetransform_create(uintptr_t idx) { + return rocksdb_slicetransform_create( + (void*)idx, + gorocksdb_destruct_handler, + (char* (*)(void*, const char*, size_t, size_t*))(gorocksdb_slicetransform_transform), + (unsigned char (*)(void*, const char*, size_t))(gorocksdb_slicetransform_in_domain), + (unsigned char (*)(void*, const char*, size_t))(gorocksdb_slicetransform_in_range), + (const char* (*)(void*))(gorocksdb_slicetransform_name)); +} diff --git a/v8/gorocksdb.h b/v8/gorocksdb.h new file mode 100644 index 00000000..c435a116 --- /dev/null +++ b/v8/gorocksdb.h @@ -0,0 +1,25 @@ +#include +#include "rocksdb/c.h" + +// This API provides convenient C wrapper functions for rocksdb client. + +/* Base */ + +extern void gorocksdb_destruct_handler(void* state); + +/* CompactionFilter */ + +extern rocksdb_compactionfilter_t* gorocksdb_compactionfilter_create(uintptr_t idx); + +/* Comparator */ + +extern rocksdb_comparator_t* gorocksdb_comparator_create(uintptr_t idx); + +/* Merge Operator */ + +extern rocksdb_mergeoperator_t* gorocksdb_mergeoperator_create(uintptr_t idx); +extern void gorocksdb_mergeoperator_delete_value(void* state, const char* v, size_t s); + +/* Slice Transform */ + +extern rocksdb_slicetransform_t* gorocksdb_slicetransform_create(uintptr_t idx); diff --git a/v8/iterator.go b/v8/iterator.go new file mode 100644 index 00000000..fefb82f1 --- /dev/null +++ b/v8/iterator.go @@ -0,0 +1,126 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" +import ( + "bytes" + "errors" + "unsafe" +) + +// Iterator provides a way to seek to specific keys and iterate through +// the keyspace from that point, as well as access the values of those keys. 
+// +// For example: +// +// it := db.NewIterator(readOpts) +// defer it.Close() +// +// it.Seek([]byte("foo")) +// for ; it.Valid(); it.Next() { +// fmt.Printf("Key: %v Value: %v\n", it.Key().Data(), it.Value().Data()) +// } +// +// if err := it.Err(); err != nil { +// return err +// } +// +type Iterator struct { + c *C.rocksdb_iterator_t +} + +// NewNativeIterator creates a Iterator object. +func NewNativeIterator(c unsafe.Pointer) *Iterator { + return &Iterator{(*C.rocksdb_iterator_t)(c)} +} + +// Valid returns false only when an Iterator has iterated past either the +// first or the last key in the database. +func (iter *Iterator) Valid() bool { + return C.rocksdb_iter_valid(iter.c) != 0 +} + +// ValidForPrefix returns false only when an Iterator has iterated past the +// first or the last key in the database or the specified prefix. +func (iter *Iterator) ValidForPrefix(prefix []byte) bool { + if C.rocksdb_iter_valid(iter.c) == 0 { + return false + } + + key := iter.Key() + result := bytes.HasPrefix(key.Data(), prefix) + key.Free() + return result +} + +// Key returns the key the iterator currently holds. +func (iter *Iterator) Key() *Slice { + var cLen C.size_t + cKey := C.rocksdb_iter_key(iter.c, &cLen) + if cKey == nil { + return nil + } + return &Slice{cKey, cLen, true} +} + +// Value returns the value in the database the iterator currently holds. +func (iter *Iterator) Value() *Slice { + var cLen C.size_t + cVal := C.rocksdb_iter_value(iter.c, &cLen) + if cVal == nil { + return nil + } + return &Slice{cVal, cLen, true} +} + +// Next moves the iterator to the next sequential key in the database. +func (iter *Iterator) Next() { + C.rocksdb_iter_next(iter.c) +} + +// Prev moves the iterator to the previous sequential key in the database. +func (iter *Iterator) Prev() { + C.rocksdb_iter_prev(iter.c) +} + +// SeekToFirst moves the iterator to the first key in the database. +func (iter *Iterator) SeekToFirst() { + C.rocksdb_iter_seek_to_first(iter.c) +} + +// SeekToLast moves the iterator to the last key in the database. +func (iter *Iterator) SeekToLast() { + C.rocksdb_iter_seek_to_last(iter.c) +} + +// Seek moves the iterator to the position greater than or equal to the key. +func (iter *Iterator) Seek(key []byte) { + cKey := byteToChar(key) + C.rocksdb_iter_seek(iter.c, cKey, C.size_t(len(key))) +} + +// SeekForPrev moves the iterator to the last key that less than or equal +// to the target key, in contrast with Seek. +func (iter *Iterator) SeekForPrev(key []byte) { + cKey := byteToChar(key) + C.rocksdb_iter_seek_for_prev(iter.c, cKey, C.size_t(len(key))) +} + +// Err returns nil if no errors happened during iteration, or the actual +// error otherwise. +func (iter *Iterator) Err() error { + var cErr *C.char + C.rocksdb_iter_get_error(iter.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Close closes the iterator. 
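ValidForPrefix enables a common prefix-scan pattern. A rough sketch follows; it assumes db is an open *DB, the "user:" prefix is illustrative, and in practice a prefix extractor is usually configured on the Options as well:

	ro := gorocksdb.NewDefaultReadOptions()
	it := db.NewIterator(ro)
	defer it.Close()

	prefix := []byte("user:")
	for it.Seek(prefix); it.ValidForPrefix(prefix); it.Next() {
		key, value := it.Key(), it.Value()
		// ... use key.Data() and value.Data() ...
		key.Free()
		value.Free()
	}
	if err := it.Err(); err != nil {
		// handle iteration error
	}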
+func (iter *Iterator) Close() { + C.rocksdb_iter_destroy(iter.c) + iter.c = nil +} diff --git a/v8/iterator_test.go b/v8/iterator_test.go new file mode 100644 index 00000000..358400ba --- /dev/null +++ b/v8/iterator_test.go @@ -0,0 +1,31 @@ +package gorocksdb + +import ( + "testing" + + "github.com/facebookgo/ensure" +) + +func TestIterator(t *testing.T) { + db := newTestDB(t, "TestIterator", nil) + defer db.Close() + + // insert keys + givenKeys := [][]byte{[]byte("key1"), []byte("key2"), []byte("key3")} + wo := NewDefaultWriteOptions() + for _, k := range givenKeys { + ensure.Nil(t, db.Put(wo, k, []byte("val"))) + } + + ro := NewDefaultReadOptions() + iter := db.NewIterator(ro) + defer iter.Close() + var actualKeys [][]byte + for iter.SeekToFirst(); iter.Valid(); iter.Next() { + key := make([]byte, 4) + copy(key, iter.Key().Data()) + actualKeys = append(actualKeys, key) + } + ensure.Nil(t, iter.Err()) + ensure.DeepEqual(t, actualKeys, givenKeys) +} diff --git a/v8/memory_usage.go b/v8/memory_usage.go new file mode 100644 index 00000000..a2fad4ef --- /dev/null +++ b/v8/memory_usage.go @@ -0,0 +1,80 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" +import ( + "errors" + "unsafe" +) + +// MemoryUsage contains memory usage statistics provided by RocksDB +type MemoryUsage struct { + // MemTableTotal estimates memory usage of all mem-tables + MemTableTotal uint64 + // MemTableUnflushed estimates memory usage of unflushed mem-tables + MemTableUnflushed uint64 + // MemTableReadersTotal memory usage of table readers (indexes and bloom filters) + MemTableReadersTotal uint64 + // CacheTotal memory usage of cache + CacheTotal uint64 +} + +type NativeDB interface { + getNativeDB() *C.rocksdb_t +} + +func (db *DB) getNativeDB() *C.rocksdb_t { + return db.c +} + +func (db *TransactionDB) getNativeDB() *C.rocksdb_t { + return (*C.rocksdb_t)(db.c) +} + +// GetApproximateMemoryUsageByType returns summary +// memory usage stats for given databases and caches. +func GetApproximateMemoryUsageByType(dbs []*DB, caches []*Cache) (*MemoryUsage, error) { + nativeDBs := make([]NativeDB, 0, len(dbs)) + for _, db := range dbs { + nativeDBs = append(nativeDBs, db) + } + return GetApproximateMemoryUsageByTypeNativeDB(nativeDBs, caches) +} + +// GetApproximateMemoryUsageByTypeNativeDB returns summary +// memory usage stats for given databases and caches. 
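A possible caller-side sketch for the memory-usage helpers below, reading the aggregated statistics for one DB and its block cache; db and cache are assumed to be an open *DB and the *Cache backing its block-based table options:

	usage, err := gorocksdb.GetApproximateMemoryUsageByType(
		[]*gorocksdb.DB{db}, []*gorocksdb.Cache{cache})
	if err != nil {
		// handle error
	}
	fmt.Printf("memtables=%d (unflushed=%d) readers=%d cache=%d\n",
		usage.MemTableTotal, usage.MemTableUnflushed,
		usage.MemTableReadersTotal, usage.CacheTotal)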
+func GetApproximateMemoryUsageByTypeNativeDB(dbs []NativeDB, caches []*Cache) (*MemoryUsage, error) { + // register memory consumers + consumers := C.rocksdb_memory_consumers_create() + defer C.rocksdb_memory_consumers_destroy(consumers) + + for _, db := range dbs { + if db != nil { + C.rocksdb_memory_consumers_add_db(consumers, (db.getNativeDB())) + } + } + for _, cache := range caches { + if cache != nil { + C.rocksdb_memory_consumers_add_cache(consumers, cache.c) + } + } + + // obtain memory usage stats + var cErr *C.char + memoryUsage := C.rocksdb_approximate_memory_usage_create(consumers, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + + defer C.rocksdb_approximate_memory_usage_destroy(memoryUsage) + + result := &MemoryUsage{ + MemTableTotal: uint64(C.rocksdb_approximate_memory_usage_get_mem_table_total(memoryUsage)), + MemTableUnflushed: uint64(C.rocksdb_approximate_memory_usage_get_mem_table_unflushed(memoryUsage)), + MemTableReadersTotal: uint64(C.rocksdb_approximate_memory_usage_get_mem_table_readers_total(memoryUsage)), + CacheTotal: uint64(C.rocksdb_approximate_memory_usage_get_cache_total(memoryUsage)), + } + return result, nil +} diff --git a/v8/memory_usage_test.go b/v8/memory_usage_test.go new file mode 100644 index 00000000..fd541d6a --- /dev/null +++ b/v8/memory_usage_test.go @@ -0,0 +1,114 @@ +package gorocksdb + +import ( + "math/rand" + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/facebookgo/ensure" +) + +func TestMemoryUsage(t *testing.T) { + // create database with cache + cache := NewLRUCache(8 * 1024 * 1024) + bbto := NewDefaultBlockBasedTableOptions() + bbto.SetBlockCache(cache) + defer bbto.Destroy() + defer cache.Destroy() + + applyOpts := func(opts *Options) { + opts.SetBlockBasedTableFactory(bbto) + } + + db := newTestDB(t, "TestMemoryUsage", applyOpts) + defer db.Close() + + // take first memory usage snapshot + mu1, err := GetApproximateMemoryUsageByType([]*DB{db}, []*Cache{cache}) + ensure.Nil(t, err) + + // perform IO operations that will affect in-memory tables (and maybe cache as well) + wo := NewDefaultWriteOptions() + defer wo.Destroy() + ro := NewDefaultReadOptions() + defer ro.Destroy() + + key := []byte("key") + value := make([]byte, 1024) + _, err = rand.Read(value) + ensure.Nil(t, err) + + err = db.Put(wo, key, value) + ensure.Nil(t, err) + + // A single Put is not enough to increase approximate memtable usage. 
+ err = db.Put(wo, key, value) + ensure.Nil(t, err) + + _, err = db.Get(ro, key) + ensure.Nil(t, err) + + // take second memory usage snapshot + mu2, err := GetApproximateMemoryUsageByType([]*DB{db}, []*Cache{cache}) + ensure.Nil(t, err) + + // the amount of memory used by memtables should increase after write/read; + // cache memory usage is not likely to be changed, perhaps because requested key is kept by memtable + assert.True(t, mu2.MemTableTotal > mu1.MemTableTotal) + assert.True(t, mu2.MemTableUnflushed > mu1.MemTableUnflushed) + assert.True(t, mu2.CacheTotal >= mu1.CacheTotal) + assert.True(t, mu2.MemTableReadersTotal >= mu1.MemTableReadersTotal) +} + +func TestMemoryUsageTransactionDB(t *testing.T) { + // create database with cache + cache := NewLRUCache(8 * 1024 * 1024) + bbto := NewDefaultBlockBasedTableOptions() + bbto.SetBlockCache(cache) + defer bbto.Destroy() + defer cache.Destroy() + + applyOpts := func(opts *Options, transactionDBOpts *TransactionDBOptions) { + opts.SetBlockBasedTableFactory(bbto) + } + + db := newTestTransactionDB(t, "TestMemoryUsage", applyOpts) + defer db.Close() + + // take first memory usage snapshot + mu1, err := GetApproximateMemoryUsageByTypeNativeDB([]NativeDB{db}, []*Cache{cache}) + ensure.Nil(t, err) + + // perforx`m IO operations that will affect in-memory tables (and maybe cache as well) + wo := NewDefaultWriteOptions() + defer wo.Destroy() + ro := NewDefaultReadOptions() + defer ro.Destroy() + + key := []byte("key") + value := make([]byte, 1024) + _, err = rand.Read(value) + ensure.Nil(t, err) + + err = db.Put(wo, key, value) + ensure.Nil(t, err) + + // A single Put is not enough to increase approximate memtable usage. + err = db.Put(wo, key, value) + ensure.Nil(t, err) + + _, err = db.Get(ro, key) + ensure.Nil(t, err) + + // take second memory usage snapshot + mu2, err := GetApproximateMemoryUsageByTypeNativeDB([]NativeDB{db}, []*Cache{cache}) + ensure.Nil(t, err) + + // the amount of memory used by memtables should increase after write/read; + // cache memory usage is not likely to be changed, perhaps because requested key is kept by memtable + assert.True(t, mu2.MemTableTotal > mu1.MemTableTotal) + assert.True(t, mu2.MemTableUnflushed > mu1.MemTableUnflushed) + assert.True(t, mu2.CacheTotal >= mu1.CacheTotal) + assert.True(t, mu2.MemTableReadersTotal >= mu1.MemTableReadersTotal) +} diff --git a/v8/merge_operator.go b/v8/merge_operator.go new file mode 100644 index 00000000..2de7f9ab --- /dev/null +++ b/v8/merge_operator.go @@ -0,0 +1,168 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// A MergeOperator specifies the SEMANTICS of a merge, which only +// client knows. It could be numeric addition, list append, string +// concatenation, edit data structure, ... , anything. +// The library, on the other hand, is concerned with the exercise of this +// interface, at the right time (during get, iteration, compaction...) +// +// Please read the RocksDB documentation for +// more details and example implementations. +type MergeOperator interface { + // Gives the client a way to express the read -> modify -> write semantics + // key: The key that's associated with this merge operation. + // Client could multiplex the merge operator based on it + // if the key space is partitioned and different subspaces + // refer to different types of data which have different + // merge operation semantics. + // existingValue: null indicates that the key does not exist before this op. 
+ // operands: the sequence of merge operations to apply, front() first. + // + // Return true on success. + // + // All values passed in will be client-specific values. So if this method + // returns false, it is because client specified bad data or there was + // internal corruption. This will be treated as an error by the library. + FullMerge(key, existingValue []byte, operands [][]byte) ([]byte, bool) + + // The name of the MergeOperator. + Name() string +} + +// PartialMerger implements PartialMerge(key, leftOperand, rightOperand []byte) ([]byte, err) +// When a MergeOperator implements this interface, PartialMerge will be called in addition +// to FullMerge for compactions across levels +type PartialMerger interface { + // This function performs merge(left_op, right_op) + // when both the operands are themselves merge operation types + // that you would have passed to a db.Merge() call in the same order + // (i.e.: db.Merge(key,left_op), followed by db.Merge(key,right_op)). + // + // PartialMerge should combine them into a single merge operation. + // The return value should be constructed such that a call to + // db.Merge(key, new_value) would yield the same result as a call + // to db.Merge(key, left_op) followed by db.Merge(key, right_op). + // + // If it is impossible or infeasible to combine the two operations, return false. + // The library will internally keep track of the operations, and apply them in the + // correct order once a base-value (a Put/Delete/End-of-Database) is seen. + PartialMerge(key, leftOperand, rightOperand []byte) ([]byte, bool) +} + +// MultiMerger implements PartialMergeMulti(key []byte, operands [][]byte) ([]byte, err) +// When a MergeOperator implements this interface, PartialMergeMulti will be called in addition +// to FullMerge for compactions across levels +type MultiMerger interface { + // PartialMerge performs merge on multiple operands + // when all of the operands are themselves merge operation types + // that you would have passed to a db.Merge() call in the same order + // (i.e.: db.Merge(key,operand[0]), followed by db.Merge(key,operand[1]), + // ... db.Merge(key, operand[n])). + // + // PartialMerge should combine them into a single merge operation. + // The return value should be constructed such that a call to + // db.Merge(key, new_value) would yield the same result as a call + // to db.Merge(key,operand[0]), followed by db.Merge(key,operand[1]), + // ... db.Merge(key, operand[n])). + // + // If it is impossible or infeasible to combine the operations, return false. + // The library will internally keep track of the operations, and apply them in the + // correct order once a base-value (a Put/Delete/End-of-Database) is seen. + PartialMergeMulti(key []byte, operands [][]byte) ([]byte, bool) +} + +// NewNativeMergeOperator creates a MergeOperator object. +func NewNativeMergeOperator(c *C.rocksdb_mergeoperator_t) MergeOperator { + return nativeMergeOperator{c} +} + +type nativeMergeOperator struct { + c *C.rocksdb_mergeoperator_t +} + +func (mo nativeMergeOperator) FullMerge(key, existingValue []byte, operands [][]byte) ([]byte, bool) { + return nil, false +} +func (mo nativeMergeOperator) PartialMerge(key, leftOperand, rightOperand []byte) ([]byte, bool) { + return nil, false +} +func (mo nativeMergeOperator) Name() string { return "" } + +// Hold references to merge operators. 
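To make the interfaces above concrete, here is a minimal append-style operator; the type name and its semantics are purely illustrative and not part of this patch:

	// appendMerger concatenates every operand onto the existing value.
	type appendMerger struct{}

	func (appendMerger) Name() string { return "example.append" }

	func (appendMerger) FullMerge(key, existingValue []byte, operands [][]byte) ([]byte, bool) {
		result := append([]byte{}, existingValue...) // existingValue may be nil
		for _, op := range operands {
			result = append(result, op...)
		}
		return result, true
	}

	// Optional PartialMerger: combining two operands is also concatenation.
	func (appendMerger) PartialMerge(key, leftOperand, rightOperand []byte) ([]byte, bool) {
		return append(append([]byte{}, leftOperand...), rightOperand...), true
	}

Such an operator would be registered with opts.SetMergeOperator(appendMerger{}) and exercised through db.Merge, much like the mock operators used by the tests below.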
+var mergeOperators = NewCOWList() + +type mergeOperatorWrapper struct { + name *C.char + mergeOperator MergeOperator +} + +func registerMergeOperator(merger MergeOperator) int { + return mergeOperators.Append(mergeOperatorWrapper{C.CString(merger.Name()), merger}) +} + +//export gorocksdb_mergeoperator_full_merge +func gorocksdb_mergeoperator_full_merge(idx int, cKey *C.char, cKeyLen C.size_t, cExistingValue *C.char, cExistingValueLen C.size_t, cOperands **C.char, cOperandsLen *C.size_t, cNumOperands C.int, cSuccess *C.uchar, cNewValueLen *C.size_t) *C.char { + key := charToByte(cKey, cKeyLen) + rawOperands := charSlice(cOperands, cNumOperands) + operandsLen := sizeSlice(cOperandsLen, cNumOperands) + existingValue := charToByte(cExistingValue, cExistingValueLen) + operands := make([][]byte, int(cNumOperands)) + for i, len := range operandsLen { + operands[i] = charToByte(rawOperands[i], len) + } + + newValue, success := mergeOperators.Get(idx).(mergeOperatorWrapper).mergeOperator.FullMerge(key, existingValue, operands) + newValueLen := len(newValue) + + *cNewValueLen = C.size_t(newValueLen) + *cSuccess = boolToChar(success) + + return cByteSlice(newValue) +} + +//export gorocksdb_mergeoperator_partial_merge_multi +func gorocksdb_mergeoperator_partial_merge_multi(idx int, cKey *C.char, cKeyLen C.size_t, cOperands **C.char, cOperandsLen *C.size_t, cNumOperands C.int, cSuccess *C.uchar, cNewValueLen *C.size_t) *C.char { + key := charToByte(cKey, cKeyLen) + rawOperands := charSlice(cOperands, cNumOperands) + operandsLen := sizeSlice(cOperandsLen, cNumOperands) + operands := make([][]byte, int(cNumOperands)) + for i, len := range operandsLen { + operands[i] = charToByte(rawOperands[i], len) + } + + var newValue []byte + success := true + + merger := mergeOperators.Get(idx).(mergeOperatorWrapper).mergeOperator + + // check if this MergeOperator supports partial or multi merges + switch v := merger.(type) { + case MultiMerger: + newValue, success = v.PartialMergeMulti(key, operands) + case PartialMerger: + leftOperand := operands[0] + for i := 1; i < int(cNumOperands); i++ { + newValue, success = v.PartialMerge(key, leftOperand, operands[i]) + if !success { + break + } + leftOperand = newValue + } + default: + success = false + } + + newValueLen := len(newValue) + *cNewValueLen = C.size_t(newValueLen) + *cSuccess = boolToChar(success) + + return cByteSlice(newValue) +} + +//export gorocksdb_mergeoperator_name +func gorocksdb_mergeoperator_name(idx int) *C.char { + return mergeOperators.Get(idx).(mergeOperatorWrapper).name +} diff --git a/v8/merge_operator_test.go b/v8/merge_operator_test.go new file mode 100644 index 00000000..5534ccb2 --- /dev/null +++ b/v8/merge_operator_test.go @@ -0,0 +1,195 @@ +package gorocksdb + +import ( + "testing" + + "github.com/facebookgo/ensure" +) + +// fatalAsError is used as a wrapper to make it possible to use ensure +// also if C calls Go otherwise it will throw a internal lockOSThread error. +type fatalAsError struct { + t *testing.T +} + +func (f *fatalAsError) Fatal(a ...interface{}) { + f.t.Error(a...) 
+} + +func TestMergeOperator(t *testing.T) { + var ( + givenKey = []byte("hello") + givenVal1 = []byte("foo") + givenVal2 = []byte("bar") + givenMerged = []byte("foobar") + ) + merger := &mockMergeOperator{ + fullMerge: func(key, existingValue []byte, operands [][]byte) ([]byte, bool) { + ensure.DeepEqual(&fatalAsError{t}, key, givenKey) + ensure.DeepEqual(&fatalAsError{t}, existingValue, givenVal1) + ensure.DeepEqual(&fatalAsError{t}, operands, [][]byte{givenVal2}) + return givenMerged, true + }, + } + db := newTestDB(t, "TestMergeOperator", func(opts *Options) { + opts.SetMergeOperator(merger) + }) + defer db.Close() + + wo := NewDefaultWriteOptions() + ensure.Nil(t, db.Put(wo, givenKey, givenVal1)) + ensure.Nil(t, db.Merge(wo, givenKey, givenVal2)) + + // trigger a compaction to ensure that a merge is performed + db.CompactRange(Range{nil, nil}) + + ro := NewDefaultReadOptions() + v1, err := db.Get(ro, givenKey) + defer v1.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v1.Data(), givenMerged) +} + +func TestPartialMergeOperator(t *testing.T) { + var ( + givenKey = []byte("hello") + startingVal = []byte("foo") + mergeVal1 = []byte("bar") + mergeVal2 = []byte("baz") + fMergeResult = []byte("foobarbaz") + pMergeResult = []byte("barbaz") + ) + + merger := &mockMergePartialOperator{ + fullMerge: func(key, existingValue []byte, operands [][]byte) ([]byte, bool) { + ensure.DeepEqual(&fatalAsError{t}, key, givenKey) + ensure.DeepEqual(&fatalAsError{t}, existingValue, startingVal) + ensure.DeepEqual(&fatalAsError{t}, operands[0], pMergeResult) + return fMergeResult, true + }, + partialMerge: func(key, leftOperand, rightOperand []byte) ([]byte, bool) { + ensure.DeepEqual(&fatalAsError{t}, key, givenKey) + ensure.DeepEqual(&fatalAsError{t}, leftOperand, mergeVal1) + ensure.DeepEqual(&fatalAsError{t}, rightOperand, mergeVal2) + return pMergeResult, true + }, + } + db := newTestDB(t, "TestMergeOperator", func(opts *Options) { + opts.SetMergeOperator(merger) + }) + defer db.Close() + + wo := NewDefaultWriteOptions() + defer wo.Destroy() + + // insert a starting value and compact to trigger merges + ensure.Nil(t, db.Put(wo, givenKey, startingVal)) + + // trigger a compaction to ensure that a merge is performed + db.CompactRange(Range{nil, nil}) + + // we expect these two operands to be passed to merge partial + ensure.Nil(t, db.Merge(wo, givenKey, mergeVal1)) + ensure.Nil(t, db.Merge(wo, givenKey, mergeVal2)) + + // trigger a compaction to ensure that a + // partial and full merge are performed + db.CompactRange(Range{nil, nil}) + + ro := NewDefaultReadOptions() + v1, err := db.Get(ro, givenKey) + defer v1.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v1.Data(), fMergeResult) + +} + +func TestMergeMultiOperator(t *testing.T) { + var ( + givenKey = []byte("hello") + startingVal = []byte("foo") + mergeVal1 = []byte("bar") + mergeVal2 = []byte("baz") + fMergeResult = []byte("foobarbaz") + pMergeResult = []byte("barbaz") + ) + + merger := &mockMergeMultiOperator{ + fullMerge: func(key, existingValue []byte, operands [][]byte) ([]byte, bool) { + ensure.DeepEqual(&fatalAsError{t}, key, givenKey) + ensure.DeepEqual(&fatalAsError{t}, existingValue, startingVal) + ensure.DeepEqual(&fatalAsError{t}, operands[0], pMergeResult) + return fMergeResult, true + }, + partialMergeMulti: func(key []byte, operands [][]byte) ([]byte, bool) { + ensure.DeepEqual(&fatalAsError{t}, key, givenKey) + ensure.DeepEqual(&fatalAsError{t}, operands[0], mergeVal1) + ensure.DeepEqual(&fatalAsError{t}, operands[1], mergeVal2) + 
return pMergeResult, true + }, + } + db := newTestDB(t, "TestMergeOperator", func(opts *Options) { + opts.SetMergeOperator(merger) + }) + defer db.Close() + + wo := NewDefaultWriteOptions() + defer wo.Destroy() + + // insert a starting value and compact to trigger merges + ensure.Nil(t, db.Put(wo, givenKey, startingVal)) + + // trigger a compaction to ensure that a merge is performed + db.CompactRange(Range{nil, nil}) + + // we expect these two operands to be passed to merge multi + ensure.Nil(t, db.Merge(wo, givenKey, mergeVal1)) + ensure.Nil(t, db.Merge(wo, givenKey, mergeVal2)) + + // trigger a compaction to ensure that a + // partial and full merge are performed + db.CompactRange(Range{nil, nil}) + + ro := NewDefaultReadOptions() + v1, err := db.Get(ro, givenKey) + defer v1.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v1.Data(), fMergeResult) + +} + +// Mock Objects +type mockMergeOperator struct { + fullMerge func(key, existingValue []byte, operands [][]byte) ([]byte, bool) +} + +func (m *mockMergeOperator) Name() string { return "gorocksdb.test" } +func (m *mockMergeOperator) FullMerge(key, existingValue []byte, operands [][]byte) ([]byte, bool) { + return m.fullMerge(key, existingValue, operands) +} + +type mockMergeMultiOperator struct { + fullMerge func(key, existingValue []byte, operands [][]byte) ([]byte, bool) + partialMergeMulti func(key []byte, operands [][]byte) ([]byte, bool) +} + +func (m *mockMergeMultiOperator) Name() string { return "gorocksdb.multi" } +func (m *mockMergeMultiOperator) FullMerge(key, existingValue []byte, operands [][]byte) ([]byte, bool) { + return m.fullMerge(key, existingValue, operands) +} +func (m *mockMergeMultiOperator) PartialMergeMulti(key []byte, operands [][]byte) ([]byte, bool) { + return m.partialMergeMulti(key, operands) +} + +type mockMergePartialOperator struct { + fullMerge func(key, existingValue []byte, operands [][]byte) ([]byte, bool) + partialMerge func(key, leftOperand, rightOperand []byte) ([]byte, bool) +} + +func (m *mockMergePartialOperator) Name() string { return "gorocksdb.partial" } +func (m *mockMergePartialOperator) FullMerge(key, existingValue []byte, operands [][]byte) ([]byte, bool) { + return m.fullMerge(key, existingValue, operands) +} +func (m *mockMergePartialOperator) PartialMerge(key, leftOperand, rightOperand []byte) ([]byte, bool) { + return m.partialMerge(key, leftOperand, rightOperand) +} diff --git a/v8/options.go b/v8/options.go new file mode 100644 index 00000000..fca6b39e --- /dev/null +++ b/v8/options.go @@ -0,0 +1,1283 @@ +package gorocksdb + +// #include "rocksdb/c.h" +// #include "gorocksdb.h" +import "C" +import ( + "errors" + "unsafe" +) + +// CompressionType specifies the block compression. +// DB contents are stored in a set of blocks, each of which holds a +// sequence of key,value pairs. Each block may be compressed before +// being stored in a file. The following enum describes which +// compression method (if any) is used to compress a block. +type CompressionType uint + +// Compression types. 
+const ( + NoCompression = CompressionType(C.rocksdb_no_compression) + SnappyCompression = CompressionType(C.rocksdb_snappy_compression) + ZLibCompression = CompressionType(C.rocksdb_zlib_compression) + Bz2Compression = CompressionType(C.rocksdb_bz2_compression) + LZ4Compression = CompressionType(C.rocksdb_lz4_compression) + LZ4HCCompression = CompressionType(C.rocksdb_lz4hc_compression) + XpressCompression = CompressionType(C.rocksdb_xpress_compression) + ZSTDCompression = CompressionType(C.rocksdb_zstd_compression) +) + +// CompactionStyle specifies the compaction style. +type CompactionStyle uint + +// Compaction styles. +const ( + LevelCompactionStyle = CompactionStyle(C.rocksdb_level_compaction) + UniversalCompactionStyle = CompactionStyle(C.rocksdb_universal_compaction) + FIFOCompactionStyle = CompactionStyle(C.rocksdb_fifo_compaction) +) + +// CompactionAccessPattern specifies the access patern in compaction. +type CompactionAccessPattern uint + +// Access patterns for compaction. +const ( + NoneCompactionAccessPattern = CompactionAccessPattern(0) + NormalCompactionAccessPattern = CompactionAccessPattern(1) + SequentialCompactionAccessPattern = CompactionAccessPattern(2) + WillneedCompactionAccessPattern = CompactionAccessPattern(3) +) + +// InfoLogLevel describes the log level. +type InfoLogLevel uint + +// Log leves. +const ( + DebugInfoLogLevel = InfoLogLevel(0) + InfoInfoLogLevel = InfoLogLevel(1) + WarnInfoLogLevel = InfoLogLevel(2) + ErrorInfoLogLevel = InfoLogLevel(3) + FatalInfoLogLevel = InfoLogLevel(4) +) + +type WALRecoveryMode int + +const ( + TolerateCorruptedTailRecordsRecovery = WALRecoveryMode(0) + AbsoluteConsistencyRecovery = WALRecoveryMode(1) + PointInTimeRecovery = WALRecoveryMode(2) + SkipAnyCorruptedRecordsRecovery = WALRecoveryMode(3) +) + +// EncodingType The value will determine how to encode keys +// when writing to a new SST file. +type EncodingType int8 + +const ( + //Plain will always write full keys without any special encoding. + Plain = EncodingType(0) + // Prefix will find opportunity to write the same prefix once for multiple rows. + Prefix = EncodingType(1) +) + +// Options represent all of the available options when opening a database with Open. +type Options struct { + c *C.rocksdb_options_t + + // Hold references for GC. + env *Env + bbto *BlockBasedTableOptions + + // We keep these so we can free their memory in Destroy. + ccmp *C.rocksdb_comparator_t + cmo *C.rocksdb_mergeoperator_t + cst *C.rocksdb_slicetransform_t + ccf *C.rocksdb_compactionfilter_t +} + +// NewDefaultOptions creates the default Options. +func NewDefaultOptions() *Options { + return NewNativeOptions(C.rocksdb_options_create()) +} + +// NewNativeOptions creates a Options object. +func NewNativeOptions(c *C.rocksdb_options_t) *Options { + return &Options{c: c} +} + +// GetOptionsFromString creates a Options object from existing opt and string. +// If base is nil, a default opt create by NewDefaultOptions will be used as base opt. 
+func GetOptionsFromString(base *Options, optStr string) (*Options, error) { + if base == nil { + base = NewDefaultOptions() + defer base.Destroy() + } + + var ( + cErr *C.char + cOptStr = C.CString(optStr) + ) + defer C.free(unsafe.Pointer(cOptStr)) + + newOpt := NewDefaultOptions() + C.rocksdb_get_options_from_string(base.c, cOptStr, newOpt.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + + return newOpt, nil +} + +// ------------------- +// Parameters that affect behavior + +// SetCompactionFilter sets the specified compaction filter +// which will be applied on compactions. +// Default: nil +func (opts *Options) SetCompactionFilter(value CompactionFilter) { + if nc, ok := value.(nativeCompactionFilter); ok { + opts.ccf = nc.c + } else { + idx := registerCompactionFilter(value) + opts.ccf = C.gorocksdb_compactionfilter_create(C.uintptr_t(idx)) + } + C.rocksdb_options_set_compaction_filter(opts.c, opts.ccf) +} + +// SetComparator sets the comparator which define the order of keys in the table. +// Default: a comparator that uses lexicographic byte-wise ordering +func (opts *Options) SetComparator(value Comparator) { + if nc, ok := value.(nativeComparator); ok { + opts.ccmp = nc.c + } else { + idx := registerComperator(value) + opts.ccmp = C.gorocksdb_comparator_create(C.uintptr_t(idx)) + } + C.rocksdb_options_set_comparator(opts.c, opts.ccmp) +} + +// SetMergeOperator sets the merge operator which will be called +// if a merge operations are used. +// Default: nil +func (opts *Options) SetMergeOperator(value MergeOperator) { + if nmo, ok := value.(nativeMergeOperator); ok { + opts.cmo = nmo.c + } else { + idx := registerMergeOperator(value) + opts.cmo = C.gorocksdb_mergeoperator_create(C.uintptr_t(idx)) + } + C.rocksdb_options_set_merge_operator(opts.c, opts.cmo) +} + +// A single CompactionFilter instance to call into during compaction. +// Allows an application to modify/delete a key-value during background +// compaction. +// +// If the client requires a new compaction filter to be used for different +// compaction runs, it can specify compaction_filter_factory instead of this +// option. The client should specify only one of the two. +// compaction_filter takes precedence over compaction_filter_factory if +// client specifies both. +// +// If multithreaded compaction is being used, the supplied CompactionFilter +// instance may be used from different threads concurrently and so should be +// thread-safe. +// +// Default: nil +// TODO: implement in C +//func (opts *Options) SetCompactionFilter(value *CompactionFilter) { +// C.rocksdb_options_set_compaction_filter(opts.c, value.filter) +//} + +// This is a factory that provides compaction filter objects which allow +// an application to modify/delete a key-value during background compaction. +// +// A new filter will be created on each compaction run. If multithreaded +// compaction is being used, each created CompactionFilter will only be used +// from a single thread and so does not need to be thread-safe. +// +// Default: a factory that doesn't provide any object +// std::shared_ptr compaction_filter_factory; +// TODO: implement in C and Go + +// Version TWO of the compaction_filter_factory +// It supports rolling compaction +// +// Default: a factory that doesn't provide any object +// std::shared_ptr compaction_filter_factory_v2; +// TODO: implement in C and Go + +// SetCreateIfMissing specifies whether the database +// should be created if it is missing. 
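A small usage sketch for GetOptionsFromString; the option string assumes RocksDB's semicolon-separated key=value syntax, and the values are only examples:

	// A nil base falls back to NewDefaultOptions, as documented above.
	opts, err := gorocksdb.GetOptionsFromString(nil,
		"write_buffer_size=67108864;max_write_buffer_number=4")
	if err != nil {
		// handle the parse error
	}
	opts.SetCreateIfMissing(true)
	db, err := gorocksdb.OpenDb(opts, "/path/to/db")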
+// Default: false +func (opts *Options) SetCreateIfMissing(value bool) { + C.rocksdb_options_set_create_if_missing(opts.c, boolToChar(value)) +} + +// SetErrorIfExists specifies whether an error should be raised +// if the database already exists. +// Default: false +func (opts *Options) SetErrorIfExists(value bool) { + C.rocksdb_options_set_error_if_exists(opts.c, boolToChar(value)) +} + +// SetParanoidChecks enable/disable paranoid checks. +// +// If true, the implementation will do aggressive checking of the +// data it is processing and will stop early if it detects any +// errors. This may have unforeseen ramifications: for example, a +// corruption of one DB entry may cause a large number of entries to +// become unreadable or for the entire DB to become unopenable. +// If any of the writes to the database fails (Put, Delete, Merge, Write), +// the database will switch to read-only mode and fail all other +// Write operations. +// Default: false +func (opts *Options) SetParanoidChecks(value bool) { + C.rocksdb_options_set_paranoid_checks(opts.c, boolToChar(value)) +} + +// SetDBPaths sets the DBPaths of the options. +// +// A list of paths where SST files can be put into, with its target size. +// Newer data is placed into paths specified earlier in the vector while +// older data gradually moves to paths specified later in the vector. +// +// For example, you have a flash device with 10GB allocated for the DB, +// as well as a hard drive of 2TB, you should config it to be: +// +// [{"/flash_path", 10GB}, {"/hard_drive", 2TB}] +// +// The system will try to guarantee data under each path is close to but +// not larger than the target size. But current and future file sizes used +// by determining where to place a file are based on best-effort estimation, +// which means there is a chance that the actual size under the directory +// is slightly more than target size under some workloads. User should give +// some buffer room for those cases. +// +// If none of the paths has sufficient room to place a file, the file will +// be placed to the last path anyway, despite to the target size. +// +// Placing newer data to earlier paths is also best-efforts. User should +// expect user files to be placed in higher levels in some extreme cases. +// +// If left empty, only one path will be used, which is db_name passed when +// opening the DB. +// Default: empty +func (opts *Options) SetDBPaths(dbpaths []*DBPath) { + l := len(dbpaths) + cDbpaths := make([]*C.rocksdb_dbpath_t, l) + for i, v := range dbpaths { + cDbpaths[i] = v.c + } + + C.rocksdb_options_set_db_paths(opts.c, &cDbpaths[0], C.size_t(l)) +} + +// SetEnv sets the specified object to interact with the environment, +// e.g. to read/write files, schedule background work, etc. +// Default: DefaultEnv +func (opts *Options) SetEnv(value *Env) { + opts.env = value + + C.rocksdb_options_set_env(opts.c, value.c) +} + +// SetInfoLogLevel sets the info log level. +// Default: InfoInfoLogLevel +func (opts *Options) SetInfoLogLevel(value InfoLogLevel) { + C.rocksdb_options_set_info_log_level(opts.c, C.int(value)) +} + +// IncreaseParallelism sets the parallelism. +// +// By default, RocksDB uses only one background thread for flush and +// compaction. Calling this function will set it up such that total of +// `total_threads` is used. Good value for `total_threads` is the number of +// cores. You almost definitely want to call this function if your system is +// bottlenecked by RocksDB. 
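A sketch of the flash/hard-drive layout described in the SetDBPaths comment, built with the DBPath helpers added in dbpath.go; the paths and target sizes are illustrative:

	// 10GB of newer data on flash, 2TB of older data on the hard drive.
	dbpaths := gorocksdb.NewDBPathsFromData(
		[]string{"/flash_path", "/hard_drive"},
		[]uint64{10 << 30, 2 << 40},
	)
	defer gorocksdb.DestroyDBPaths(dbpaths)

	opts := gorocksdb.NewDefaultOptions()
	opts.SetDBPaths(dbpaths)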
+func (opts *Options) IncreaseParallelism(total_threads int) { + C.rocksdb_options_increase_parallelism(opts.c, C.int(total_threads)) +} + +// OptimizeForPointLookup optimize the DB for point lookups. +// +// Use this if you don't need to keep the data sorted, i.e. you'll never use +// an iterator, only Put() and Get() API calls +// +// If you use this with rocksdb >= 5.0.2, you must call `SetAllowConcurrentMemtableWrites(false)` +// to avoid an assertion error immediately on opening the db. +func (opts *Options) OptimizeForPointLookup(block_cache_size_mb uint64) { + C.rocksdb_options_optimize_for_point_lookup(opts.c, C.uint64_t(block_cache_size_mb)) +} + +// Set whether to allow concurrent memtable writes. Conccurent writes are +// not supported by all memtable factories (currently only SkipList memtables). +// As of rocksdb 5.0.2 you must call `SetAllowConcurrentMemtableWrites(false)` +// if you use `OptimizeForPointLookup`. +func (opts *Options) SetAllowConcurrentMemtableWrites(allow bool) { + C.rocksdb_options_set_allow_concurrent_memtable_write(opts.c, boolToChar(allow)) +} + +// OptimizeLevelStyleCompaction optimize the DB for leveld compaction. +// +// Default values for some parameters in ColumnFamilyOptions are not +// optimized for heavy workloads and big datasets, which means you might +// observe write stalls under some conditions. As a starting point for tuning +// RocksDB options, use the following two functions: +// * OptimizeLevelStyleCompaction -- optimizes level style compaction +// * OptimizeUniversalStyleCompaction -- optimizes universal style compaction +// Universal style compaction is focused on reducing Write Amplification +// Factor for big data sets, but increases Space Amplification. You can learn +// more about the different styles here: +// https://github.com/facebook/rocksdb/wiki/Rocksdb-Architecture-Guide +// Make sure to also call IncreaseParallelism(), which will provide the +// biggest performance gains. +// Note: we might use more memory than memtable_memory_budget during high +// write rate period +func (opts *Options) OptimizeLevelStyleCompaction(memtable_memory_budget uint64) { + C.rocksdb_options_optimize_level_style_compaction(opts.c, C.uint64_t(memtable_memory_budget)) +} + +// OptimizeUniversalStyleCompaction optimize the DB for universal compaction. +// See note on OptimizeLevelStyleCompaction. +func (opts *Options) OptimizeUniversalStyleCompaction(memtable_memory_budget uint64) { + C.rocksdb_options_optimize_universal_style_compaction(opts.c, C.uint64_t(memtable_memory_budget)) +} + +// SetWriteBufferSize sets the amount of data to build up in memory +// (backed by an unsorted log on disk) before converting to a sorted on-disk file. +// +// Larger values increase performance, especially during bulk loads. +// Up to max_write_buffer_number write buffers may be held in memory +// at the same time, +// so you may wish to adjust this parameter to control memory usage. +// Also, a larger write buffer will result in a longer recovery time +// the next time the database is opened. +// Default: 64MB +func (opts *Options) SetWriteBufferSize(value int) { + C.rocksdb_options_set_write_buffer_size(opts.c, C.size_t(value)) +} + +// SetMaxWriteBufferNumber sets the maximum number of write buffers +// that are built up in memory. +// +// The default is 2, so that when 1 write buffer is being flushed to +// storage, new writes can continue to the other write buffer. 
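The OptimizeForPointLookup note above translates to a pairing like the following sketch; the 64MB cache size is only an example:

	opts := gorocksdb.NewDefaultOptions()
	opts.SetCreateIfMissing(true)
	opts.OptimizeForPointLookup(64) // 64MB block cache for point lookups
	// Required alongside OptimizeForPointLookup, per the comment above.
	opts.SetAllowConcurrentMemtableWrites(false)
	db, err := gorocksdb.OpenDb(opts, "/path/to/db")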
+// Default: 2 +func (opts *Options) SetMaxWriteBufferNumber(value int) { + C.rocksdb_options_set_max_write_buffer_number(opts.c, C.int(value)) +} + +// SetMinWriteBufferNumberToMerge sets the minimum number of write buffers +// that will be merged together before writing to storage. +// +// If set to 1, then all write buffers are flushed to L0 as individual files +// and this increases read amplification because a get request has to check +// in all of these files. Also, an in-memory merge may result in writing lesser +// data to storage if there are duplicate records in each of these +// individual write buffers. +// Default: 1 +func (opts *Options) SetMinWriteBufferNumberToMerge(value int) { + C.rocksdb_options_set_min_write_buffer_number_to_merge(opts.c, C.int(value)) +} + +// SetMaxOpenFiles sets the number of open files that can be used by the DB. +// +// You may need to increase this if your database has a large working set +// (budget one open file per 2MB of working set). +// Default: 1000 +func (opts *Options) SetMaxOpenFiles(value int) { + C.rocksdb_options_set_max_open_files(opts.c, C.int(value)) +} + +// SetMaxFileOpeningThreads sets the maximum number of file opening threads. +// If max_open_files is -1, DB will open all files on DB::Open(). You can +// use this option to increase the number of threads used to open the files. +// Default: 16 +func (opts *Options) SetMaxFileOpeningThreads(value int) { + C.rocksdb_options_set_max_file_opening_threads(opts.c, C.int(value)) +} + +// SetMaxTotalWalSize sets the maximum total wal size in bytes. +// Once write-ahead logs exceed this size, we will start forcing the flush of +// column families whose memtables are backed by the oldest live WAL file +// (i.e. the ones that are causing all the space amplification). If set to 0 +// (default), we will dynamically choose the WAL size limit to be +// [sum of all write_buffer_size * max_write_buffer_number] * 4 +// Default: 0 +func (opts *Options) SetMaxTotalWalSize(value uint64) { + C.rocksdb_options_set_max_total_wal_size(opts.c, C.uint64_t(value)) +} + +// SetCompression sets the compression algorithm. +// Default: SnappyCompression, which gives lightweight but fast +// compression. +func (opts *Options) SetCompression(value CompressionType) { + C.rocksdb_options_set_compression(opts.c, C.int(value)) +} + +// SetBottommostCompression sets the compression algorithm for the nth level. +// Default: NoComprression +func (opts *Options) SetBottommostCompression(value CompressionType) { + C.rocksdb_options_set_bottommost_compression(opts.c, C.int(value)) +} + +// SetCompressionPerLevel sets different compression algorithm per level. +// +// Different levels can have different compression policies. There +// are cases where most lower levels would like to quick compression +// algorithm while the higher levels (which have more data) use +// compression algorithms that have better compression but could +// be slower. This array should have an entry for +// each level of the database. This array overrides the +// value specified in the previous field 'compression'. +func (opts *Options) SetCompressionPerLevel(value []CompressionType) { + cLevels := make([]C.int, len(value)) + for i, v := range value { + cLevels[i] = C.int(v) + } + + C.rocksdb_options_set_compression_per_level(opts.c, &cLevels[0], C.size_t(len(value))) +} + +// SetMinLevelToCompress sets the start level to use compression. 
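A hedged sketch of per-level compression using the setters above. The compression constants (NoCompression, SnappyCompression, ZSTDCompression) are assumed to be the CompressionType values defined elsewhere in this package, and the usual gorocksdb import is assumed.

```go
// Cheap compression near the top of the LSM tree, stronger compression at
// the bottom; the slice needs one entry per level (num_levels defaults to 7)
// and overrides the single value set via SetCompression.
func setTieredCompression(opts *gorocksdb.Options) {
	opts.SetCompressionPerLevel([]gorocksdb.CompressionType{
		gorocksdb.NoCompression, // L0: keep flushes cheap
		gorocksdb.NoCompression, // L1
		gorocksdb.SnappyCompression,
		gorocksdb.SnappyCompression,
		gorocksdb.SnappyCompression,
		gorocksdb.SnappyCompression,
		gorocksdb.ZSTDCompression, // bottommost level holds the most data
	})
}
```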
+func (opts *Options) SetMinLevelToCompress(value int) { + C.rocksdb_options_set_min_level_to_compress(opts.c, C.int(value)) +} + +// SetCompressionOptions sets different options for compression algorithms. +// Default: nil +func (opts *Options) SetCompressionOptions(value *CompressionOptions) { + C.rocksdb_options_set_compression_options(opts.c, C.int(value.WindowBits), C.int(value.Level), C.int(value.Strategy), C.int(value.MaxDictBytes)) +} + +// SetPrefixExtractor sets the prefic extractor. +// +// If set, use the specified function to determine the +// prefixes for keys. These prefixes will be placed in the filter. +// Depending on the workload, this can reduce the number of read-IOP +// cost for scans when a prefix is passed via ReadOptions to +// db.NewIterator(). +// Default: nil +func (opts *Options) SetPrefixExtractor(value SliceTransform) { + if nst, ok := value.(nativeSliceTransform); ok { + opts.cst = nst.c + } else { + idx := registerSliceTransform(value) + opts.cst = C.gorocksdb_slicetransform_create(C.uintptr_t(idx)) + } + C.rocksdb_options_set_prefix_extractor(opts.c, opts.cst) +} + +// SetNumLevels sets the number of levels for this database. +// Default: 7 +func (opts *Options) SetNumLevels(value int) { + C.rocksdb_options_set_num_levels(opts.c, C.int(value)) +} + +// SetLevel0FileNumCompactionTrigger sets the number of files +// to trigger level-0 compaction. +// +// A value <0 means that level-0 compaction will not be +// triggered by number of files at all. +// Default: 4 +func (opts *Options) SetLevel0FileNumCompactionTrigger(value int) { + C.rocksdb_options_set_level0_file_num_compaction_trigger(opts.c, C.int(value)) +} + +// SetLevel0SlowdownWritesTrigger sets the soft limit on number of level-0 files. +// +// We start slowing down writes at this point. +// A value <0 means that no writing slow down will be triggered by +// number of files in level-0. +// Default: 8 +func (opts *Options) SetLevel0SlowdownWritesTrigger(value int) { + C.rocksdb_options_set_level0_slowdown_writes_trigger(opts.c, C.int(value)) +} + +// SetLevel0StopWritesTrigger sets the maximum number of level-0 files. +// We stop writes at this point. +// Default: 12 +func (opts *Options) SetLevel0StopWritesTrigger(value int) { + C.rocksdb_options_set_level0_stop_writes_trigger(opts.c, C.int(value)) +} + +// SetTargetFileSizeBase sets the target file size for compaction. +// +// Target file size is per-file size for level-1. +// Target file size for level L can be calculated by +// target_file_size_base * (target_file_size_multiplier ^ (L-1)) +// +// For example, if target_file_size_base is 2MB and +// target_file_size_multiplier is 10, then each file on level-1 will +// be 2MB, and each file on level 2 will be 20MB, +// and each file on level-3 will be 200MB. +// Default: 2MB +func (opts *Options) SetTargetFileSizeBase(value uint64) { + C.rocksdb_options_set_target_file_size_base(opts.c, C.uint64_t(value)) +} + +// SetTargetFileSizeMultiplier sets the target file size multiplier for compaction. +// Default: 1 +func (opts *Options) SetTargetFileSizeMultiplier(value int) { + C.rocksdb_options_set_target_file_size_multiplier(opts.c, C.int(value)) +} + +// SetMaxBytesForLevelBase sets the maximum total data size for a level. +// +// It is the max total for level-1. 
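Since SetPrefixExtractor mainly pays off for prefix scans, here is a hedged usage sketch. The NewFixedPrefixTransform constructor and the iterator/read-options calls are assumed to match the rest of this package, and the usual gorocksdb import is assumed.

```go
// Scan all keys sharing an 8-byte prefix. The DB is assumed to have been
// opened with: opts.SetPrefixExtractor(gorocksdb.NewFixedPrefixTransform(8))
func scanPrefix(db *gorocksdb.DB, prefix []byte) error {
	ro := gorocksdb.NewDefaultReadOptions()
	defer ro.Destroy()
	ro.SetPrefixSameAsStart(true) // stop once the iterator leaves the prefix

	it := db.NewIterator(ro)
	defer it.Close()
	for it.Seek(prefix); it.Valid(); it.Next() {
		key := it.Key()
		// ... use key.Data() ...
		key.Free()
	}
	return it.Err()
}
```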
+// Maximum number of bytes for level L can be calculated as +// (max_bytes_for_level_base) * (max_bytes_for_level_multiplier ^ (L-1)) +// +// For example, if max_bytes_for_level_base is 20MB, and if +// max_bytes_for_level_multiplier is 10, total data size for level-1 +// will be 20MB, total file size for level-2 will be 200MB, +// and total file size for level-3 will be 2GB. +// Default: 10MB +func (opts *Options) SetMaxBytesForLevelBase(value uint64) { + C.rocksdb_options_set_max_bytes_for_level_base(opts.c, C.uint64_t(value)) +} + +// SetMaxBytesForLevelMultiplier sets the max Bytes for level multiplier. +// Default: 10 +func (opts *Options) SetMaxBytesForLevelMultiplier(value float64) { + C.rocksdb_options_set_max_bytes_for_level_multiplier(opts.c, C.double(value)) +} + +// SetLevelCompactiondynamiclevelbytes specifies whether to pick +// target size of each level dynamically. +// +// We will pick a base level b >= 1. L0 will be directly merged into level b, +// instead of always into level 1. Level 1 to b-1 need to be empty. +// We try to pick b and its target size so that +// 1. target size is in the range of +// (max_bytes_for_level_base / max_bytes_for_level_multiplier, +// max_bytes_for_level_base] +// 2. target size of the last level (level num_levels-1) equals to extra size +// of the level. +// +// At the same time max_bytes_for_level_multiplier and +// max_bytes_for_level_multiplier_additional are still satisfied. +// +// With this option on, from an empty DB, we make last level the base level, +// which means merging L0 data into the last level, until it exceeds +// max_bytes_for_level_base. And then we make the second last level to be +// base level, to start to merge L0 data to second last level, with its +// target size to be 1/max_bytes_for_level_multiplier of the last level's +// extra size. After the data accumulates more so that we need to move the +// base level to the third last one, and so on. +// +// For example, assume max_bytes_for_level_multiplier=10, num_levels=6, +// and max_bytes_for_level_base=10MB. +// Target sizes of level 1 to 5 starts with: +// [- - - - 10MB] +// with base level is level. Target sizes of level 1 to 4 are not applicable +// because they will not be used. +// Until the size of Level 5 grows to more than 10MB, say 11MB, we make +// base target to level 4 and now the targets looks like: +// [- - - 1.1MB 11MB] +// While data are accumulated, size targets are tuned based on actual data +// of level 5. When level 5 has 50MB of data, the target is like: +// [- - - 5MB 50MB] +// Until level 5's actual size is more than 100MB, say 101MB. Now if we keep +// level 4 to be the base level, its target size needs to be 10.1MB, which +// doesn't satisfy the target size range. So now we make level 3 the target +// size and the target sizes of the levels look like: +// [- - 1.01MB 10.1MB 101MB] +// In the same way, while level 5 further grows, all levels' targets grow, +// like +// [- - 5MB 50MB 500MB] +// Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the +// base level and make levels' target sizes like this: +// [- 1.001MB 10.01MB 100.1MB 1001MB] +// and go on... +// +// By doing it, we give max_bytes_for_level_multiplier a priority against +// max_bytes_for_level_base, for a more predictable LSM tree shape. It is +// useful to limit worse case space amplification. +// +// max_bytes_for_level_multiplier_additional is ignored with this flag on. 
+// +// Turning this feature on or off for an existing DB can cause unexpected +// LSM tree structure so it's not recommended. +// +// Default: false +func (opts *Options) SetLevelCompactionDynamicLevelBytes(value bool) { + C.rocksdb_options_set_level_compaction_dynamic_level_bytes(opts.c, boolToChar(value)) +} + +// SetMaxCompactionBytes sets the maximum number of bytes in all compacted files. +// We try to limit number of bytes in one compaction to be lower than this +// threshold. But it's not guaranteed. +// Value 0 will be sanitized. +// Default: result.target_file_size_base * 25 +func (opts *Options) SetMaxCompactionBytes(value uint64) { + C.rocksdb_options_set_max_compaction_bytes(opts.c, C.uint64_t(value)) +} + +// SetSoftPendingCompactionBytesLimit sets the threshold at which +// all writes will be slowed down to at least delayed_write_rate if estimated +// bytes needed to be compaction exceed this threshold. +// +// Default: 64GB +func (opts *Options) SetSoftPendingCompactionBytesLimit(value uint64) { + C.rocksdb_options_set_soft_pending_compaction_bytes_limit(opts.c, C.size_t(value)) +} + +// SetHardPendingCompactionBytesLimit sets the bytes threshold at which +// all writes are stopped if estimated bytes needed to be compaction exceed +// this threshold. +// +// Default: 256GB +func (opts *Options) SetHardPendingCompactionBytesLimit(value uint64) { + C.rocksdb_options_set_hard_pending_compaction_bytes_limit(opts.c, C.size_t(value)) +} + +// SetMaxBytesForLevelMultiplierAdditional sets different max-size multipliers +// for different levels. +// +// These are multiplied by max_bytes_for_level_multiplier to arrive +// at the max-size of each level. +// Default: 1 for each level +func (opts *Options) SetMaxBytesForLevelMultiplierAdditional(value []int) { + cLevels := make([]C.int, len(value)) + for i, v := range value { + cLevels[i] = C.int(v) + } + + C.rocksdb_options_set_max_bytes_for_level_multiplier_additional(opts.c, &cLevels[0], C.size_t(len(value))) +} + +// SetUseFsync enable/disable fsync. +// +// If true, then every store to stable storage will issue a fsync. +// If false, then every store to stable storage will issue a fdatasync. +// This parameter should be set to true while storing data to +// filesystem like ext3 that can lose files after a reboot. +// Default: false +func (opts *Options) SetUseFsync(value bool) { + C.rocksdb_options_set_use_fsync(opts.c, C.int(btoi(value))) +} + +// SetDbLogDir specifies the absolute info LOG dir. +// +// If it is empty, the log files will be in the same dir as data. +// If it is non empty, the log files will be in the specified dir, +// and the db data dir's absolute path will be used as the log file +// name's prefix. +// Default: empty +func (opts *Options) SetDbLogDir(value string) { + cvalue := C.CString(value) + defer C.free(unsafe.Pointer(cvalue)) + C.rocksdb_options_set_db_log_dir(opts.c, cvalue) +} + +// SetWalDir specifies the absolute dir path for write-ahead logs (WAL). +// +// If it is empty, the log files will be in the same dir as data. +// If it is non empty, the log files will be in the specified dir, +// When destroying the db, all log files and the dir itopts is deleted. +// Default: empty +func (opts *Options) SetWalDir(value string) { + cvalue := C.CString(value) + defer C.free(unsafe.Pointer(cvalue)) + C.rocksdb_options_set_wal_dir(opts.c, cvalue) +} + +// SetDeleteObsoleteFilesPeriodMicros sets the periodicity +// when obsolete files get deleted. 
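The two sizing formulas quoted above (target_file_size_base * multiplier^(L-1) and max_bytes_for_level_base * multiplier^(L-1)) are easy to sanity-check with a small helper. This is only a back-of-the-envelope sketch, not library code.

```go
// levelByteTargets returns the approximate per-level byte budgets implied by
// SetMaxBytesForLevelBase and SetMaxBytesForLevelMultiplier when dynamic
// level sizing is off. Level 0 is controlled by file count, not bytes.
func levelByteTargets(base uint64, multiplier float64, numLevels int) []uint64 {
	targets := make([]uint64, numLevels)
	factor := 1.0
	for level := 1; level < numLevels; level++ {
		targets[level] = uint64(float64(base) * factor)
		factor *= multiplier
	}
	return targets
}

// With the documented defaults (base = 10MB, multiplier = 10, 7 levels) this
// yields roughly 10MB for L1, 100MB for L2, and about 1TB for L6.
```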
+// +// The files that get out of scope by compaction +// process will still get automatically delete on every compaction, +// regardless of this setting. +// Default: 6 hours +func (opts *Options) SetDeleteObsoleteFilesPeriodMicros(value uint64) { + C.rocksdb_options_set_delete_obsolete_files_period_micros(opts.c, C.uint64_t(value)) +} + +// SetMaxBackgroundCompactions sets the maximum number of +// concurrent background jobs, submitted to +// the default LOW priority thread pool +// Default: 1 +// Deprecated: use SetMaxBackgroundJobs, as rocksdb decides this automatically based on the value of MaxBackgroundJobs +func (opts *Options) SetMaxBackgroundCompactions(value int) { + C.rocksdb_options_set_max_background_compactions(opts.c, C.int(value)) +} + +// SetMaxBackgroundFlushes sets the maximum number of +// concurrent background memtable flush jobs, submitted to +// the HIGH priority thread pool. +// +// By default, all background jobs (major compaction and memtable flush) go +// to the LOW priority pool. If this option is set to a positive number, +// memtable flush jobs will be submitted to the HIGH priority pool. +// It is important when the same Env is shared by multiple db instances. +// Without a separate pool, long running major compaction jobs could +// potentially block memtable flush jobs of other db instances, leading to +// unnecessary Put stalls. +// Default: 0 +// Deprecated: use SetMaxBackgroundJobs, as rocksdb decides this automatically based on the value of MaxBackgroundJobs +func (opts *Options) SetMaxBackgroundFlushes(value int) { + C.rocksdb_options_set_max_background_flushes(opts.c, C.int(value)) +} + +// SetMaxLogFileSize sets the maximal size of the info log file. +// +// If the log file is larger than `max_log_file_size`, a new info log +// file will be created. +// If max_log_file_size == 0, all logs will be written to one log file. +// Default: 0 +func (opts *Options) SetMaxLogFileSize(value int) { + C.rocksdb_options_set_max_log_file_size(opts.c, C.size_t(value)) +} + +// SetLogFileTimeToRoll sets the time for the info log file to roll (in seconds). +// +// If specified with non-zero value, log file will be rolled +// if it has been active longer than `log_file_time_to_roll`. +// Default: 0 (disabled) +func (opts *Options) SetLogFileTimeToRoll(value int) { + C.rocksdb_options_set_log_file_time_to_roll(opts.c, C.size_t(value)) +} + +// SetKeepLogFileNum sets the maximal info log files to be kept. +// Default: 1000 +func (opts *Options) SetKeepLogFileNum(value int) { + C.rocksdb_options_set_keep_log_file_num(opts.c, C.size_t(value)) +} + +// SetMaxManifestFileSize sets the maximal manifest file size until is rolled over. +// The older manifest file be deleted. +// Default: MAX_INT so that roll-over does not take place. +func (opts *Options) SetMaxManifestFileSize(value uint64) { + C.rocksdb_options_set_max_manifest_file_size(opts.c, C.size_t(value)) +} + +// SetTableCacheNumshardbits sets the number of shards used for table cache. +// Default: 4 +func (opts *Options) SetTableCacheNumshardbits(value int) { + C.rocksdb_options_set_table_cache_numshardbits(opts.c, C.int(value)) +} + +// SetArenaBlockSize sets the size of one block in arena memory allocation. +// +// If <= 0, a proper value is automatically calculated (usually 1/10 of +// writer_buffer_size). +// Default: 0 +func (opts *Options) SetArenaBlockSize(value int) { + C.rocksdb_options_set_arena_block_size(opts.c, C.size_t(value)) +} + +// SetDisableAutoCompactions enable/disable automatic compactions. 
+// +// Manual compactions can still be issued on this database. +// Default: false +func (opts *Options) SetDisableAutoCompactions(value bool) { + C.rocksdb_options_set_disable_auto_compactions(opts.c, C.int(btoi(value))) +} + +// SetWALRecoveryMode sets the recovery mode +// +// Recovery mode to control the consistency while replaying WAL +// Default: TolerateCorruptedTailRecordsRecovery +func (opts *Options) SetWALRecoveryMode(mode WALRecoveryMode) { + C.rocksdb_options_set_wal_recovery_mode(opts.c, C.int(mode)) +} + +// SetWALTtlSeconds sets the WAL ttl in seconds. +// +// The following two options affect how archived logs will be deleted. +// 1. If both set to 0, logs will be deleted asap and will not get into +// the archive. +// 2. If wal_ttl_seconds is 0 and wal_size_limit_mb is not 0, +// WAL files will be checked every 10 min and if total size is greater +// then wal_size_limit_mb, they will be deleted starting with the +// earliest until size_limit is met. All empty files will be deleted. +// 3. If wal_ttl_seconds is not 0 and wall_size_limit_mb is 0, then +// WAL files will be checked every wal_ttl_seconds / 2 and those that +// are older than wal_ttl_seconds will be deleted. +// 4. If both are not 0, WAL files will be checked every 10 min and both +// checks will be performed with ttl being first. +// +// Default: 0 +func (opts *Options) SetWALTtlSeconds(value uint64) { + C.rocksdb_options_set_WAL_ttl_seconds(opts.c, C.uint64_t(value)) +} + +// SetWalSizeLimitMb sets the WAL size limit in MB. +// +// If total size of WAL files is greater then wal_size_limit_mb, +// they will be deleted starting with the earliest until size_limit is met +// Default: 0 +func (opts *Options) SetWalSizeLimitMb(value uint64) { + C.rocksdb_options_set_WAL_size_limit_MB(opts.c, C.uint64_t(value)) +} + +// SetEnablePipelinedWrite enables pipelined write +// +// Default: false +func (opts *Options) SetEnablePipelinedWrite(value bool) { + C.rocksdb_options_set_enable_pipelined_write(opts.c, boolToChar(value)) +} + +// SetUnorderedWrite enables unordered writes +// +// Default: false +func (opts *Options) SetUnorderedWrite(value bool) { + C.rocksdb_options_set_unordered_write(opts.c, boolToChar(value)) +} + +// SetManifestPreallocationSize sets the number of bytes +// to preallocate (via fallocate) the manifest files. +// +// Default is 4mb, which is reasonable to reduce random IO +// as well as prevent overallocation for mounts that preallocate +// large amounts of data (such as xfs's allocsize option). +// Default: 4mb +func (opts *Options) SetManifestPreallocationSize(value int) { + C.rocksdb_options_set_manifest_preallocation_size(opts.c, C.size_t(value)) +} + +// SetAllowMmapReads enable/disable mmap reads for reading sst tables. +// Default: false +func (opts *Options) SetAllowMmapReads(value bool) { + C.rocksdb_options_set_allow_mmap_reads(opts.c, boolToChar(value)) +} + +// SetAllowMmapWrites enable/disable mmap writes for writing sst tables. 
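A hedged sketch of WAL placement and archival using the setters above. The directory path is purely illustrative, and the WALRecoveryMode constant name is assumed to match the values defined elsewhere in this package.

```go
// Keep archived WAL files bounded by both age and total size, following the
// deletion rules described above, and place live WALs on a separate device.
func walOptions(opts *gorocksdb.Options) {
	opts.SetWalDir("/mnt/wal/rocksdb") // hypothetical path
	opts.SetWALTtlSeconds(3600)        // prune archived WALs older than 1 hour
	opts.SetWalSizeLimitMb(1024)       // and keep the archive under ~1GB
	// Constant name assumed from this package's WALRecoveryMode values.
	opts.SetWALRecoveryMode(gorocksdb.PointInTimeRecovery)
}
```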
+// Default: false +func (opts *Options) SetAllowMmapWrites(value bool) { + C.rocksdb_options_set_allow_mmap_writes(opts.c, boolToChar(value)) +} + +// SetUseDirectReads enable/disable direct I/O mode (O_DIRECT) for reads +// Default: false +func (opts *Options) SetUseDirectReads(value bool) { + C.rocksdb_options_set_use_direct_reads(opts.c, boolToChar(value)) +} + +// SetUseDirectIOForFlushAndCompaction enable/disable direct I/O mode (O_DIRECT) for both reads and writes in background flush and compactions +// When true, new_table_reader_for_compaction_inputs is forced to true. +// Default: false +func (opts *Options) SetUseDirectIOForFlushAndCompaction(value bool) { + C.rocksdb_options_set_use_direct_io_for_flush_and_compaction(opts.c, boolToChar(value)) +} + +// SetIsFdCloseOnExec enable/dsiable child process inherit open files. +// Default: true +func (opts *Options) SetIsFdCloseOnExec(value bool) { + C.rocksdb_options_set_is_fd_close_on_exec(opts.c, boolToChar(value)) +} + +// SetStatsDumpPeriodSec sets the stats dump period in seconds. +// +// If not zero, dump stats to LOG every stats_dump_period_sec +// Default: 3600 (1 hour) +func (opts *Options) SetStatsDumpPeriodSec(value uint) { + C.rocksdb_options_set_stats_dump_period_sec(opts.c, C.uint(value)) +} + +// SetAdviseRandomOnOpen specifies whether we will hint the underlying +// file system that the file access pattern is random, when a sst file is opened. +// Default: true +func (opts *Options) SetAdviseRandomOnOpen(value bool) { + C.rocksdb_options_set_advise_random_on_open(opts.c, boolToChar(value)) +} + +// SetDbWriteBufferSize sets the amount of data to build up +// in memtables across all column families before writing to disk. +// +// This is distinct from write_buffer_size, which enforces a limit +// for a single memtable. +// +// This feature is disabled by default. Specify a non-zero value +// to enable it. +// +// Default: 0 (disabled) +func (opts *Options) SetDbWriteBufferSize(value int) { + C.rocksdb_options_set_db_write_buffer_size(opts.c, C.size_t(value)) +} + +// SetAccessHintOnCompactionStart specifies the file access pattern +// once a compaction is started. +// +// It will be applied to all input files of a compaction. +// Default: NormalCompactionAccessPattern +func (opts *Options) SetAccessHintOnCompactionStart(value CompactionAccessPattern) { + C.rocksdb_options_set_access_hint_on_compaction_start(opts.c, C.int(value)) +} + +// SetUseAdaptiveMutex enable/disable adaptive mutex, which spins +// in the user space before resorting to kernel. +// +// This could reduce context switch when the mutex is not +// heavily contended. However, if the mutex is hot, we could end up +// wasting spin time. +// Default: false +func (opts *Options) SetUseAdaptiveMutex(value bool) { + C.rocksdb_options_set_use_adaptive_mutex(opts.c, boolToChar(value)) +} + +// SetBytesPerSync sets the bytes per sync. +// +// Allows OS to incrementally sync files to disk while they are being +// written, asynchronously, in the background. +// Issue one request for every bytes_per_sync written. +// Default: 0 (disabled) +func (opts *Options) SetBytesPerSync(value uint64) { + C.rocksdb_options_set_bytes_per_sync(opts.c, C.uint64_t(value)) +} + +// SetCompactionStyle sets the compaction style. 
+// Default: LevelCompactionStyle +func (opts *Options) SetCompactionStyle(value CompactionStyle) { + C.rocksdb_options_set_compaction_style(opts.c, C.int(value)) +} + +// SetUniversalCompactionOptions sets the options needed +// to support Universal Style compactions. +// Default: nil +func (opts *Options) SetUniversalCompactionOptions(value *UniversalCompactionOptions) { + C.rocksdb_options_set_universal_compaction_options(opts.c, value.c) +} + +// SetFIFOCompactionOptions sets the options for FIFO compaction style. +// Default: nil +func (opts *Options) SetFIFOCompactionOptions(value *FIFOCompactionOptions) { + C.rocksdb_options_set_fifo_compaction_options(opts.c, value.c) +} + +// GetStatisticsString returns the statistics as a string. +func (opts *Options) GetStatisticsString() string { + sString := C.rocksdb_options_statistics_get_string(opts.c) + defer C.rocksdb_free(unsafe.Pointer(sString)) + return C.GoString(sString) +} + +// SetRateLimiter sets the rate limiter of the options. +// Use to control write rate of flush and compaction. Flush has higher +// priority than compaction. Rate limiting is disabled if nullptr. +// If rate limiter is enabled, bytes_per_sync is set to 1MB by default. +// Default: nullptr +func (opts *Options) SetRateLimiter(rateLimiter *RateLimiter) { + C.rocksdb_options_set_ratelimiter(opts.c, rateLimiter.c) +} + +// SetMaxSequentialSkipInIterations specifies whether an iteration->Next() +// sequentially skips over keys with the same user-key or not. +// +// This number specifies the number of keys (with the same userkey) +// that will be sequentially skipped before a reseek is issued. +// Default: 8 +func (opts *Options) SetMaxSequentialSkipInIterations(value uint64) { + C.rocksdb_options_set_max_sequential_skip_in_iterations(opts.c, C.uint64_t(value)) +} + +// SetInplaceUpdateSupport enable/disable thread-safe inplace updates. +// +// Requires updates if +// * key exists in current memtable +// * new sizeof(new_value) <= sizeof(old_value) +// * old_value for that key is a put i.e. kTypeValue +// Default: false. +func (opts *Options) SetInplaceUpdateSupport(value bool) { + C.rocksdb_options_set_inplace_update_support(opts.c, boolToChar(value)) +} + +// SetInplaceUpdateNumLocks sets the number of locks used for inplace update. +// Default: 10000, if inplace_update_support = true, else 0. +func (opts *Options) SetInplaceUpdateNumLocks(value int) { + C.rocksdb_options_set_inplace_update_num_locks(opts.c, C.size_t(value)) +} + +// SetMemtableHugePageSize sets the page size for huge page for +// arena used by the memtable. +// If <=0, it won't allocate from huge page but from malloc. +// Users are responsible to reserve huge pages for it to be allocated. For +// example: +// +// sysctl -w vm.nr_hugepages=20 +// +// See linux doc Documentation/vm/hugetlbpage.txt +// If there isn't enough free huge page available, it will fall back to +// malloc. +// +// Dynamically changeable through SetOptions() API +func (opts *Options) SetMemtableHugePageSize(value int) { + C.rocksdb_options_set_memtable_huge_page_size(opts.c, C.size_t(value)) +} + +// SetBloomLocality sets the bloom locality. +// +// Control locality of bloom filter probes to improve cache miss rate. +// This option only applies to memtable prefix bloom and plaintable +// prefix bloom. It essentially limits the max number of cache lines each +// bloom filter check can touch. +// This optimization is turned off when set to 0. The number should never +// be greater than number of probes. 
This option can boost performance +// for in-memory workload but should use with care since it can cause +// higher false positive rate. +// Default: 0 +func (opts *Options) SetBloomLocality(value uint32) { + C.rocksdb_options_set_bloom_locality(opts.c, C.uint32_t(value)) +} + +// SetMaxSuccessiveMerges sets the maximum number of +// successive merge operations on a key in the memtable. +// +// When a merge operation is added to the memtable and the maximum number of +// successive merges is reached, the value of the key will be calculated and +// inserted into the memtable instead of the merge operation. This will +// ensure that there are never more than max_successive_merges merge +// operations in the memtable. +// Default: 0 (disabled) +func (opts *Options) SetMaxSuccessiveMerges(value int) { + C.rocksdb_options_set_max_successive_merges(opts.c, C.size_t(value)) +} + +// SetDumpMallocStats will print malloc statistics to the LOG file for the +// database if set to true - jemalloc must be turned on for this to work. +// Default: false +func (opts *Options) SetDumpMallocStats(value bool) { + C.rocksdb_options_set_dump_malloc_stats(opts.c, boolToChar(value)) +} + +// SetMemtableWholeKeyFiltering enable whole key bloom filter in memtable. Note this will only take effect +// if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering +// can potentially reduce CPU usage for point-look-ups. +// +// Default: false (disable) +// +// Dynamically changeable through SetOptions() API +func (opts *Options) SetMemtableWholeKeyFiltering(value bool) { + C.rocksdb_options_set_memtable_whole_key_filtering(opts.c, boolToChar(value)) +} + +// EnableStatistics enable statistics. +func (opts *Options) EnableStatistics() { + C.rocksdb_options_enable_statistics(opts.c) +} + +// PrepareForBulkLoad prepare the DB for bulk loading. +// +// All data will be in level 0 without any automatic compaction. +// It's recommended to manually call CompactRange(NULL, NULL) before reading +// from the database, because otherwise the read can be very slow. +func (opts *Options) PrepareForBulkLoad() { + C.rocksdb_options_prepare_for_bulk_load(opts.c) +} + +// SetMemtableVectorRep sets a MemTableRep which is backed by a vector. +// +// On iteration, the vector is sorted. This is useful for workloads where +// iteration is very rare and writes are generally not issued after reads begin. +func (opts *Options) SetMemtableVectorRep() { + C.rocksdb_options_set_memtable_vector_rep(opts.c) +} + +// SetHashSkipListRep sets a hash skip list as MemTableRep. +// +// It contains a fixed array of buckets, each +// pointing to a skiplist (null if the bucket is empty). +// +// bucketCount: number of fixed array buckets +// skiplistHeight: the max height of the skiplist +// skiplistBranchingFactor: probabilistic size ratio between adjacent +// +// link lists in the skiplist +func (opts *Options) SetHashSkipListRep(bucketCount int, skiplistHeight, skiplistBranchingFactor int32) { + C.rocksdb_options_set_hash_skip_list_rep(opts.c, C.size_t(bucketCount), C.int32_t(skiplistHeight), C.int32_t(skiplistBranchingFactor)) +} + +// SetHashLinkListRep sets a hashed linked list as MemTableRep. +// +// It contains a fixed array of buckets, each pointing to a sorted single +// linked list (null if the bucket is empty). 
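Statistics are opt-in; a small hedged sketch of enabling them and reading them back with EnableStatistics, SetStatsDumpPeriodSec and GetStatisticsString from above (usual gorocksdb import assumed):

```go
// Enable RocksDB's internal tickers/histograms and dump them both to the
// info LOG and on demand.
func statsOptions() *gorocksdb.Options {
	opts := gorocksdb.NewDefaultOptions()
	opts.EnableStatistics()
	opts.SetStatsDumpPeriodSec(600) // also written to the info LOG every 10 minutes
	return opts
}

// Later, e.g. from a debug endpoint:
//
//	stats := opts.GetStatisticsString()
//	log.Println(stats)
```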
+// +// bucketCount: number of fixed array buckets +func (opts *Options) SetHashLinkListRep(bucketCount int) { + C.rocksdb_options_set_hash_link_list_rep(opts.c, C.size_t(bucketCount)) +} + +// SetPlainTableFactory sets a plain table factory with prefix-only seek. +// +// For this factory, you need to set prefix_extractor properly to make it +// work. Look-up will starts with prefix hash lookup for key prefix. Inside the +// hash bucket found, a binary search is executed for hash conflicts. Finally, +// a linear search is used. +// +// keyLen: plain table has optimization for fix-sized keys, +// +// which can be specified via keyLen. +// +// bloomBitsPerKey: the number of bits used for bloom filer per prefix. You +// +// may disable it by passing a zero. +// +// hashTableRatio: the desired utilization of the hash table used for prefix +// +// hashing. hashTableRatio = number of prefixes / #buckets +// in the hash table +// +// indexSparseness: inside each prefix, need to build one index record for how +// +// many keys for binary search inside each hash bucket. +// +// hugePageTlbSize: if <=0, allocate hash indexes and blooms from malloc. Otherwise from huge page TLB. +// +// encodingType: how to encode the keys. See enum EncodingType above for the choices. +// +// fullScanMode: mode for reading the whole file one record by one without using the index. +// +// storeIndexInFile: compute plain table index and bloom filter during +// +// file building and store it in file. When reading +// file, index will be mapped instead of recomputation. +func (opts *Options) SetPlainTableFactory(keyLen uint32, bloomBitsPerKey int, hashTableRatio float64, indexSparseness int, hugePageTlbSize int, encodingType EncodingType, fullScanMode bool, storeIndexInFile bool) { + C.rocksdb_options_set_plain_table_factory(opts.c, C.uint32_t(keyLen), C.int(bloomBitsPerKey), C.double(hashTableRatio), C.size_t(indexSparseness), C.size_t(hugePageTlbSize), C.char(encodingType), boolToChar(fullScanMode), boolToChar(storeIndexInFile)) +} + +// SetCreateIfMissingColumnFamilies specifies whether the column families +// should be created if they are missing. +func (opts *Options) SetCreateIfMissingColumnFamilies(value bool) { + C.rocksdb_options_set_create_missing_column_families(opts.c, boolToChar(value)) +} + +// SetBlockBasedTableFactory sets the block based table factory. +func (opts *Options) SetBlockBasedTableFactory(value *BlockBasedTableOptions) { + opts.bbto = value + C.rocksdb_options_set_block_based_table_factory(opts.c, value.c) +} + +// SetAllowIngestBehind sets allow_ingest_behind +// Set this option to true during creation of database if you want +// to be able to ingest behind (call IngestExternalFile() skipping keys +// that already exist, rather than overwriting matching keys). +// Setting this option to true will affect 2 things: +// 1) Disable some internal optimizations around SST file compression +// 2) Reserve bottom-most level for ingested files only. +// 3) Note that num_levels should be >= 3 if this option is turned on. +// +// DEFAULT: false +// Immutable. +func (opts *Options) SetAllowIngestBehind(value bool) { + C.rocksdb_options_set_allow_ingest_behind(opts.c, boolToChar(value)) +} + +// SetMemTablePrefixBloomSizeRatio sets memtable_prefix_bloom_size_ratio +// if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, +// create prefix bloom for memtable with the size of +// write_buffer_size * memtable_prefix_bloom_size_ratio. +// If it is larger than 0.25, it is sanitized to 0.25. 
+// +// Default: 0 (disable) +func (opts *Options) SetMemTablePrefixBloomSizeRatio(value float64) { + C.rocksdb_options_set_memtable_prefix_bloom_size_ratio(opts.c, C.double(value)) +} + +// SetOptimizeFiltersForHits sets optimize_filters_for_hits +// This flag specifies that the implementation should optimize the filters +// mainly for cases where keys are found rather than also optimize for keys +// missed. This would be used in cases where the application knows that +// there are very few misses or the performance in the case of misses is not +// important. +// +// For now, this flag allows us to not store filters for the last level i.e +// the largest level which contains data of the LSM store. For keys which +// are hits, the filters in this level are not useful because we will search +// for the data anyway. NOTE: the filters in other levels are still useful +// even for key hit because they tell us whether to look in that level or go +// to the higher level. +// +// Default: false +func (opts *Options) SetOptimizeFiltersForHits(value bool) { + C.rocksdb_options_set_optimize_filters_for_hits(opts.c, C.int(btoi(value))) +} + +// SetAtomicFlush sets atomic_flush +// RocksDB supports atomic flush of multiple column families if the DB option +// atomic_flush is set to true. The execution result of flushing multiple +// column families is written to the MANIFEST with 'all-or-nothing' guarantee +// (logically). With atomic flush, either all or no memtables of the column +// families of interest are persisted to SST files and added to the database. +// +// This can be desirable if data in multiple column families must be consistent +// with each other. For example, imagine there is one metadata column family +// meta_cf, and a data column family data_cf. Every time we write a new record +// to data_cf, we also write its metadata to meta_cf. meta_cf and data_cf must +// be flushed atomically. Database becomes inconsistent if one of them is +// persisted but the other is not. Atomic flush provides a good guarantee. +// Suppose at a certain time, kv1 exists in the memtables of meta_cf and kv2 +// exists in the memtables of data_cf. After atomically flushing these two +// column families, both kv1 and kv2 are persistent if the flush succeeds. +// Otherwise neither of them exist in the database. +// +// Since atomic flush also goes through the write_thread, it is guaranteed that +// no flush can occur in the middle of write batch. +// +// Default: false +func (opts *Options) SetAtomicFlush(value bool) { + C.rocksdb_options_set_atomic_flush(opts.c, C.uchar(btoi(value))) +} + +// AddCompactOnDeletionCollectorFactory marks a SST +// file as need-compaction when it observe at least "D" deletion +// entries in any "N" consecutive entries or the ratio of tombstone +// entries in the whole file >= the specified deletion ratio. +func (opts *Options) AddCompactOnDeletionCollectorFactory(windowSize, numDelsTrigger uint) { + C.rocksdb_options_add_compact_on_deletion_collector_factory(opts.c, C.size_t(windowSize), C.size_t(numDelsTrigger)) +} + +// AddCompactOnDeletionCollectorFactoryWithRatio similar to AddCompactOnDeletionCollectorFactory +// with specific deletion ratio. 
+func (opts *Options) AddCompactOnDeletionCollectorFactoryWithRatio(windowSize, numDelsTrigger uint, deletionRatio float64) { + C.rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(opts.c, C.size_t(windowSize), C.size_t(numDelsTrigger), C.double(deletionRatio)) +} + +// SetMaxSubcompactions represents the maximum number of threads that will +// concurrently perform a compaction job by breaking it into multiple, +// smaller ones that are run simultaneously. +// +// Default: 1 (i.e. no subcompactions) +func (opts *Options) SetMaxSubcompactions(value uint32) { + C.rocksdb_options_set_max_subcompactions(opts.c, C.uint32_t(value)) +} + +// GetMaxSubcompactions gets the maximum number of threads that will +// concurrently perform a compaction job by breaking it into multiple, +// smaller ones that are run simultaneously. +func (opts *Options) GetMaxSubcompactions() uint32 { + return uint32(C.rocksdb_options_get_max_subcompactions(opts.c)) +} + +// SetMaxBackgroundJobs maximum number of concurrent background jobs +// (compactions and flushes). +// +// Default: 2 +// +// Dynamically changeable through SetDBOptions() API. +func (opts *Options) SetMaxBackgroundJobs(value int) { + C.rocksdb_options_set_max_background_jobs(opts.c, C.int(value)) +} + +// GetMaxBackgroundJobs returns maximum number of concurrent background jobs setting. +func (opts *Options) GetMaxBackgroundJobs() int { + return int(C.rocksdb_options_get_max_background_jobs(opts.c)) +} + +// Destroy deallocates the Options object. +func (opts *Options) Destroy() { + C.rocksdb_options_destroy(opts.c) + if opts.ccmp != nil { + C.rocksdb_comparator_destroy(opts.ccmp) + } + // don't destroy the opts.cst here, it has already been + // associated with a PrefixExtractor and this will segfault + if opts.ccf != nil { + C.rocksdb_compactionfilter_destroy(opts.ccf) + } + opts.c = nil + opts.env = nil + opts.bbto = nil +} diff --git a/v8/options_block_based_table.go b/v8/options_block_based_table.go new file mode 100644 index 00000000..e5524d3a --- /dev/null +++ b/v8/options_block_based_table.go @@ -0,0 +1,247 @@ +package gorocksdb + +// #include "rocksdb/c.h" +// #include "gorocksdb.h" +import "C" + +// IndexType specifies the index type that will be used for this table. +type IndexType uint + +const ( + // A space efficient index block that is optimized for + // binary-search-based index. + KBinarySearchIndexType = 0 + // The hash index, if enabled, will do the hash lookup when + // `Options.prefix_extractor` is provided. + KHashSearchIndexType = 1 + // A two-level index implementation. Both levels are binary search indexes. + KTwoLevelIndexSearchIndexType = 2 +) + +// DataBlockIndexType specifies index type that will be used for the data block. +type DataBlockIndexType uint + +const ( + // KDataBlockIndexTypeBinarySearch is traditional block type + KDataBlockIndexTypeBinarySearch DataBlockIndexType = 0 + // KDataBlockIndexTypeBinarySearchAndHash additional hash index + KDataBlockIndexTypeBinarySearchAndHash DataBlockIndexType = 1 +) + +// BlockBasedTableOptions represents block-based table options. +type BlockBasedTableOptions struct { + c *C.rocksdb_block_based_table_options_t + + // Hold references for GC. + cache *Cache + compCache *Cache + + // We keep these so we can free their memory in Destroy. + cFp *C.rocksdb_filterpolicy_t +} + +// NewDefaultBlockBasedTableOptions creates a default BlockBasedTableOptions object. 
+func NewDefaultBlockBasedTableOptions() *BlockBasedTableOptions {
+	return NewNativeBlockBasedTableOptions(C.rocksdb_block_based_options_create())
+}
+
+// NewNativeBlockBasedTableOptions creates a BlockBasedTableOptions object.
+func NewNativeBlockBasedTableOptions(c *C.rocksdb_block_based_table_options_t) *BlockBasedTableOptions {
+	return &BlockBasedTableOptions{c: c}
+}
+
+// Destroy deallocates the BlockBasedTableOptions object.
+func (opts *BlockBasedTableOptions) Destroy() {
+	C.rocksdb_block_based_options_destroy(opts.c)
+	opts.c = nil
+	opts.cache = nil
+	opts.compCache = nil
+}
+
+// SetCacheIndexAndFilterBlocks indicates whether we'd put index/filter blocks into the block cache.
+// If not specified, each "table reader" object will pre-load index/filter
+// block during table initialization.
+// Default: false
+func (opts *BlockBasedTableOptions) SetCacheIndexAndFilterBlocks(value bool) {
+	C.rocksdb_block_based_options_set_cache_index_and_filter_blocks(opts.c, boolToChar(value))
+}
+
+// SetCacheIndexAndFilterBlocksWithHighPriority sets cache index and filter
+// blocks with high priority (if cache_index_and_filter_blocks is enabled).
+// If set to true, depending on implementation of block cache,
+// index and filter blocks may be less likely to be evicted than data blocks.
+func (opts *BlockBasedTableOptions) SetCacheIndexAndFilterBlocksWithHighPriority(value bool) {
+	C.rocksdb_block_based_options_set_cache_index_and_filter_blocks_with_high_priority(opts.c, boolToChar(value))
+}
+
+// SetPinL0FilterAndIndexBlocksInCache pins level-0 filter and index blocks in the block cache.
+// If this is true and cache_index_and_filter_blocks is enabled (and
+// hash_index_allow_collision is true), then filter and index blocks are
+// stored in the cache, but a reference is held in the "table reader" object
+// so the blocks are pinned and only evicted from cache when the table reader
+// is freed.
+func (opts *BlockBasedTableOptions) SetPinL0FilterAndIndexBlocksInCache(value bool) {
+	C.rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache(opts.c, boolToChar(value))
+}
+
+// SetPinTopLevelIndexAndFilter specifies that if cache_index_and_filter_blocks is true, then
+// the top-level index of partitioned filter and index blocks are stored in
+// the cache, but a reference is held in the "table reader" object so the
+// blocks are pinned and only evicted from cache when the table reader is
+// freed. This is not limited to l0 in LSM tree.
+func (opts *BlockBasedTableOptions) SetPinTopLevelIndexAndFilter(value bool) {
+	C.rocksdb_block_based_options_set_pin_top_level_index_and_filter(opts.c, boolToChar(value))
+}
+
+// SetBlockSize sets the approximate size of user data packed per block.
+// Note that the block size specified here corresponds to uncompressed data.
+// The actual size of the unit read from disk may be smaller if
+// compression is enabled. This parameter can be changed dynamically.
+// Default: 4K
+func (opts *BlockBasedTableOptions) SetBlockSize(blockSize int) {
+	C.rocksdb_block_based_options_set_block_size(opts.c, C.size_t(blockSize))
+}
+
+// SetBlockSizeDeviation sets the block size deviation.
+// This is used to close a block before it reaches the configured
+// 'block_size'. If the percentage of free space in the current block is less
+// than this specified number and adding a new record to the block will
+// exceed the configured block size, then this block will be closed and the
+// new record will be written to the next block.
+// Default: 10
+func (opts *BlockBasedTableOptions) SetBlockSizeDeviation(blockSizeDeviation int) {
+	C.rocksdb_block_based_options_set_block_size_deviation(opts.c, C.int(blockSizeDeviation))
+}
+
+// SetBlockRestartInterval sets the number of keys between
+// restart points for delta encoding of keys.
+// This parameter can be changed dynamically. Most clients should
+// leave this parameter alone.
+// Default: 16
+func (opts *BlockBasedTableOptions) SetBlockRestartInterval(blockRestartInterval int) {
+	C.rocksdb_block_based_options_set_block_restart_interval(opts.c, C.int(blockRestartInterval))
+}
+
+// SetIndexBlockRestartInterval is the same as SetBlockRestartInterval but used for the index block.
+// Default: 1
+func (opts *BlockBasedTableOptions) SetIndexBlockRestartInterval(indexBlockRestartInterval int) {
+	C.rocksdb_block_based_options_set_index_block_restart_interval(opts.c, C.int(indexBlockRestartInterval))
+}
+
+// SetMetadataBlockSize sets the block size for partitioned metadata.
+// Currently applied to indexes when
+// kTwoLevelIndexSearch is used and to filters when partition_filters is used.
+// Note: Since in the current implementation the filters and index partitions
+// are aligned, an index/filter block is created when either index or filter
+// block size reaches the specified limit.
+// Note: this limit is currently applied to only index blocks; a filter
+// partition is cut right after an index block is cut
+// Default: 4096
+func (opts *BlockBasedTableOptions) SetMetadataBlockSize(metadataBlockSize uint64) {
+	C.rocksdb_block_based_options_set_metadata_block_size(opts.c, C.uint64_t(metadataBlockSize))
+}
+
+// SetPartitionFilters sets using partitioned full filters for each SST file.
+// This option is incompatible with block-based filters.
+// Note: currently this option requires kTwoLevelIndexSearch to be set as well.
+// Default: false
+func (opts *BlockBasedTableOptions) SetPartitionFilters(value bool) {
+	C.rocksdb_block_based_options_set_partition_filters(opts.c, boolToChar(value))
+}
+
+// SetUseDeltaEncoding sets using delta encoding to compress keys in blocks.
+// ReadOptions::pin_data requires this option to be disabled.
+func (opts *BlockBasedTableOptions) SetUseDeltaEncoding(value bool) {
+	C.rocksdb_block_based_options_set_use_delta_encoding(opts.c, boolToChar(value))
+}
+
+// SetFilterPolicy sets the filter policy to reduce disk reads.
+// Many applications will benefit from passing the result of
+// NewBloomFilterPolicy() here.
+// Default: nil
+func (opts *BlockBasedTableOptions) SetFilterPolicy(fp FilterPolicy) {
+	opts.cFp = fp.c
+	C.rocksdb_block_based_options_set_filter_policy(opts.c, opts.cFp)
+}
+
+// SetNoBlockCache specifies whether block cache should be used or not.
+// Default: false
+func (opts *BlockBasedTableOptions) SetNoBlockCache(value bool) {
+	C.rocksdb_block_based_options_set_no_block_cache(opts.c, boolToChar(value))
+}
+
+// SetBlockCache sets the control over blocks (user data is stored in a set of blocks, and
+// a block is the unit of reading from disk).
+//
+// If set, use the specified cache for blocks.
+// If nil, rocksdb will automatically create and use an 8MB internal cache.
+// Default: nil
+func (opts *BlockBasedTableOptions) SetBlockCache(cache *Cache) {
+	opts.cache = cache
+	C.rocksdb_block_based_options_set_block_cache(opts.c, cache.c)
+}
+
+// SetWholeKeyFiltering specifies if whole keys in the filter (not just prefixes)
+// should be placed.
+// This must generally be true for gets to be efficient.
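Pulling the block-based table setters together, a hedged configuration sketch: NewLRUCache and NewBloomFilter are assumed to be the cache and filter-policy constructors provided elsewhere in this package, and their exact signatures may differ.

```go
// A read-oriented table configuration: bigger blocks, a shared block cache,
// bloom filters, and index/filter blocks kept in (and pinned to) the cache.
func blockBasedOptions() *gorocksdb.Options {
	bbto := gorocksdb.NewDefaultBlockBasedTableOptions()
	bbto.SetBlockSize(16 * 1024)
	bbto.SetBlockCache(gorocksdb.NewLRUCache(512 * 1024 * 1024)) // assumed constructor
	bbto.SetFilterPolicy(gorocksdb.NewBloomFilter(10))           // ~10 bits/key, assumed constructor
	bbto.SetCacheIndexAndFilterBlocks(true)
	bbto.SetPinL0FilterAndIndexBlocksInCache(true)

	opts := gorocksdb.NewDefaultOptions()
	opts.SetBlockBasedTableFactory(bbto)
	return opts
}
```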
+// Default: true +func (opts *BlockBasedTableOptions) SetWholeKeyFiltering(value bool) { + C.rocksdb_block_based_options_set_whole_key_filtering(opts.c, boolToChar(value)) +} + +// SetFormatVersion sets the format version. +// We currently have five versions: +// 0 -- This version is currently written out by all RocksDB's versions by +// default. Can be read by really old RocksDB's. Doesn't support changing +// checksum (default is CRC32). +// 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default +// checksum, like xxHash. It is written by RocksDB when +// BlockBasedTableOptions::checksum is something other than kCRC32c. (version +// 0 is silently upconverted) +// 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we +// encode compressed blocks with LZ4, BZip2 and Zlib compression. If you +// don't plan to run RocksDB before version 3.10, you should probably use +// this. +// 3 -- Can be read by RocksDB's versions since 5.15. Changes the way we +// encode the keys in index blocks. If you don't plan to run RocksDB before +// version 5.15, you should probably use this. +// This option only affects newly written tables. When reading existing +// tables, the information about version is read from the footer. +// 4 -- Can be read by RocksDB's versions since 5.16. Changes the way we +// encode the values in index blocks. If you don't plan to run RocksDB before +// version 5.16 and you are using index_block_restart_interval > 1, you should +// probably use this as it would reduce the index size. +// This option only affects newly written tables. When reading existing +// tables, the information about version is read from the footer. +// Default: 2 +func (opts *BlockBasedTableOptions) SetFormatVersion(version int) { + C.rocksdb_block_based_options_set_format_version(opts.c, C.int(version)) +} + +// SetIndexType sets the index type used for this table. +// kBinarySearch: +// A space efficient index block that is optimized for +// binary-search-based index. +// +// kHashSearch: +// The hash index, if enabled, will do the hash lookup when +// `Options.prefix_extractor` is provided. +// +// kTwoLevelIndexSearch: +// A two-level index implementation. Both levels are binary search indexes. +// Default: kBinarySearch +func (opts *BlockBasedTableOptions) SetIndexType(value IndexType) { + C.rocksdb_block_based_options_set_index_type(opts.c, C.int(value)) +} + +// SetDataBlockIndexType sets data block index type +func (opts *BlockBasedTableOptions) SetDataBlockIndexType(value DataBlockIndexType) { + C.rocksdb_block_based_options_set_data_block_index_type(opts.c, C.int(value)) +} + +// SetDataBlockHashRatio is valid only when data_block_hash_index_type is +// KDataBlockIndexTypeBinarySearchAndHash. +// +// Default value: 0.75 +func (opts *BlockBasedTableOptions) SetDataBlockHashRatio(value float64) { + C.rocksdb_block_based_options_set_data_block_hash_ratio(opts.c, C.double(value)) +} diff --git a/v8/options_compaction.go b/v8/options_compaction.go new file mode 100644 index 00000000..a7db2136 --- /dev/null +++ b/v8/options_compaction.go @@ -0,0 +1,130 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// UniversalCompactionStopStyle describes a algorithm used to make a +// compaction request stop picking new files into a single compaction run. +type UniversalCompactionStopStyle uint + +// Compaction stop style types. 
+const ( + CompactionStopStyleSimilarSize = UniversalCompactionStopStyle(C.rocksdb_similar_size_compaction_stop_style) + CompactionStopStyleTotalSize = UniversalCompactionStopStyle(C.rocksdb_total_size_compaction_stop_style) +) + +// FIFOCompactionOptions represent all of the available options for +// FIFO compaction. +type FIFOCompactionOptions struct { + c *C.rocksdb_fifo_compaction_options_t +} + +// NewDefaultFIFOCompactionOptions creates a default FIFOCompactionOptions object. +func NewDefaultFIFOCompactionOptions() *FIFOCompactionOptions { + return NewNativeFIFOCompactionOptions(C.rocksdb_fifo_compaction_options_create()) +} + +// NewNativeFIFOCompactionOptions creates a native FIFOCompactionOptions object. +func NewNativeFIFOCompactionOptions(c *C.rocksdb_fifo_compaction_options_t) *FIFOCompactionOptions { + return &FIFOCompactionOptions{c} +} + +// SetMaxTableFilesSize sets the max table file size. +// Once the total sum of table files reaches this, we will delete the oldest +// table file +// Default: 1GB +func (opts *FIFOCompactionOptions) SetMaxTableFilesSize(value uint64) { + C.rocksdb_fifo_compaction_options_set_max_table_files_size(opts.c, C.uint64_t(value)) +} + +// Destroy deallocates the FIFOCompactionOptions object. +func (opts *FIFOCompactionOptions) Destroy() { + C.rocksdb_fifo_compaction_options_destroy(opts.c) +} + +// UniversalCompactionOptions represent all of the available options for +// universal compaction. +type UniversalCompactionOptions struct { + c *C.rocksdb_universal_compaction_options_t +} + +// NewDefaultUniversalCompactionOptions creates a default UniversalCompactionOptions +// object. +func NewDefaultUniversalCompactionOptions() *UniversalCompactionOptions { + return NewNativeUniversalCompactionOptions(C.rocksdb_universal_compaction_options_create()) +} + +// NewNativeUniversalCompactionOptions creates a UniversalCompactionOptions +// object. +func NewNativeUniversalCompactionOptions(c *C.rocksdb_universal_compaction_options_t) *UniversalCompactionOptions { + return &UniversalCompactionOptions{c} +} + +// SetSizeRatio sets the percentage flexibilty while comparing file size. +// If the candidate file(s) size is 1% smaller than the next file's size, +// then include next file into this candidate set. +// Default: 1 +func (opts *UniversalCompactionOptions) SetSizeRatio(value uint) { + C.rocksdb_universal_compaction_options_set_size_ratio(opts.c, C.int(value)) +} + +// SetMinMergeWidth sets the minimum number of files in a single compaction run. +// Default: 2 +func (opts *UniversalCompactionOptions) SetMinMergeWidth(value uint) { + C.rocksdb_universal_compaction_options_set_min_merge_width(opts.c, C.int(value)) +} + +// SetMaxMergeWidth sets the maximum number of files in a single compaction run. +// Default: UINT_MAX +func (opts *UniversalCompactionOptions) SetMaxMergeWidth(value uint) { + C.rocksdb_universal_compaction_options_set_max_merge_width(opts.c, C.int(value)) +} + +// SetMaxSizeAmplificationPercent sets the size amplification. +// It is defined as the amount (in percentage) of +// additional storage needed to store a single byte of data in the database. +// For example, a size amplification of 2% means that a database that +// contains 100 bytes of user-data may occupy upto 102 bytes of +// physical storage. By this definition, a fully compacted database has +// a size amplification of 0%. 
Rocksdb uses the following heuristic +// to calculate size amplification: it assumes that all files excluding +// the earliest file contribute to the size amplification. +// Default: 200, which means that a 100 byte database could require upto +// 300 bytes of storage. +func (opts *UniversalCompactionOptions) SetMaxSizeAmplificationPercent(value uint) { + C.rocksdb_universal_compaction_options_set_max_size_amplification_percent(opts.c, C.int(value)) +} + +// SetCompressionSizePercent sets the percentage of compression size. +// +// If this option is set to be -1, all the output files +// will follow compression type specified. +// +// If this option is not negative, we will try to make sure compressed +// size is just above this value. In normal cases, at least this percentage +// of data will be compressed. +// When we are compacting to a new file, here is the criteria whether +// it needs to be compressed: assuming here are the list of files sorted +// by generation time: +// A1...An B1...Bm C1...Ct +// where A1 is the newest and Ct is the oldest, and we are going to compact +// B1...Bm, we calculate the total size of all the files as total_size, as +// well as the total size of C1...Ct as total_C, the compaction output file +// will be compressed iff +// total_C / total_size < this percentage +// Default: -1 +func (opts *UniversalCompactionOptions) SetCompressionSizePercent(value int) { + C.rocksdb_universal_compaction_options_set_compression_size_percent(opts.c, C.int(value)) +} + +// SetStopStyle sets the algorithm used to stop picking files into a single compaction run. +// Default: CompactionStopStyleTotalSize +func (opts *UniversalCompactionOptions) SetStopStyle(value UniversalCompactionStopStyle) { + C.rocksdb_universal_compaction_options_set_stop_style(opts.c, C.int(value)) +} + +// Destroy deallocates the UniversalCompactionOptions object. +func (opts *UniversalCompactionOptions) Destroy() { + C.rocksdb_universal_compaction_options_destroy(opts.c) + opts.c = nil +} diff --git a/v8/options_compression.go b/v8/options_compression.go new file mode 100644 index 00000000..7165ceda --- /dev/null +++ b/v8/options_compression.go @@ -0,0 +1,24 @@ +package gorocksdb + +// CompressionOptions represents options for different compression algorithms like Zlib. +type CompressionOptions struct { + WindowBits int + Level int + Strategy int + MaxDictBytes int +} + +// NewDefaultCompressionOptions creates a default CompressionOptions object. +func NewDefaultCompressionOptions() *CompressionOptions { + return NewCompressionOptions(-14, -1, 0, 0) +} + +// NewCompressionOptions creates a CompressionOptions object. +func NewCompressionOptions(windowBits, level, strategy, maxDictBytes int) *CompressionOptions { + return &CompressionOptions{ + WindowBits: windowBits, + Level: level, + Strategy: strategy, + MaxDictBytes: maxDictBytes, + } +} diff --git a/v8/options_env.go b/v8/options_env.go new file mode 100644 index 00000000..cdac98f2 --- /dev/null +++ b/v8/options_env.go @@ -0,0 +1,25 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// EnvOptions represents options for env. +type EnvOptions struct { + c *C.rocksdb_envoptions_t +} + +// NewDefaultEnvOptions creates a default EnvOptions object. +func NewDefaultEnvOptions() *EnvOptions { + return NewNativeEnvOptions(C.rocksdb_envoptions_create()) +} + +// NewNativeEnvOptions creates a EnvOptions object. 
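A hedged sketch of wiring UniversalCompactionOptions into the main Options; the UniversalCompactionStyle constant is assumed to be the CompactionStyle value defined elsewhere in this package.

```go
// Universal compaction with a tighter space-amplification bound than the
// 200% default described above.
func universalCompactionOptions() *gorocksdb.Options {
	opts := gorocksdb.NewDefaultOptions()
	opts.SetCompactionStyle(gorocksdb.UniversalCompactionStyle) // constant name assumed

	uco := gorocksdb.NewDefaultUniversalCompactionOptions()
	uco.SetSizeRatio(1)
	uco.SetMinMergeWidth(2)
	uco.SetMaxSizeAmplificationPercent(50) // allow roughly 50% extra space
	uco.SetStopStyle(gorocksdb.CompactionStopStyleTotalSize)
	opts.SetUniversalCompactionOptions(uco)
	return opts
}
```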
+func NewNativeEnvOptions(c *C.rocksdb_envoptions_t) *EnvOptions { + return &EnvOptions{c: c} +} + +// Destroy deallocates the EnvOptions object. +func (opts *EnvOptions) Destroy() { + C.rocksdb_envoptions_destroy(opts.c) + opts.c = nil +} diff --git a/v8/options_flush.go b/v8/options_flush.go new file mode 100644 index 00000000..518236a5 --- /dev/null +++ b/v8/options_flush.go @@ -0,0 +1,32 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// FlushOptions represent all of the available options when manual flushing the +// database. +type FlushOptions struct { + c *C.rocksdb_flushoptions_t +} + +// NewDefaultFlushOptions creates a default FlushOptions object. +func NewDefaultFlushOptions() *FlushOptions { + return NewNativeFlushOptions(C.rocksdb_flushoptions_create()) +} + +// NewNativeFlushOptions creates a FlushOptions object. +func NewNativeFlushOptions(c *C.rocksdb_flushoptions_t) *FlushOptions { + return &FlushOptions{c} +} + +// SetWait specify if the flush will wait until the flush is done. +// Default: true +func (opts *FlushOptions) SetWait(value bool) { + C.rocksdb_flushoptions_set_wait(opts.c, boolToChar(value)) +} + +// Destroy deallocates the FlushOptions object. +func (opts *FlushOptions) Destroy() { + C.rocksdb_flushoptions_destroy(opts.c) + opts.c = nil +} diff --git a/v8/options_ingest.go b/v8/options_ingest.go new file mode 100644 index 00000000..89efb547 --- /dev/null +++ b/v8/options_ingest.go @@ -0,0 +1,65 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// IngestExternalFileOptions represents available options when ingesting external files. +type IngestExternalFileOptions struct { + c *C.rocksdb_ingestexternalfileoptions_t +} + +// NewDefaultIngestExternalFileOptions creates a default IngestExternalFileOptions object. +func NewDefaultIngestExternalFileOptions() *IngestExternalFileOptions { + return NewNativeIngestExternalFileOptions(C.rocksdb_ingestexternalfileoptions_create()) +} + +// NewNativeIngestExternalFileOptions creates a IngestExternalFileOptions object. +func NewNativeIngestExternalFileOptions(c *C.rocksdb_ingestexternalfileoptions_t) *IngestExternalFileOptions { + return &IngestExternalFileOptions{c: c} +} + +// SetMoveFiles specifies if it should move the files instead of copying them. +// Default to false. +func (opts *IngestExternalFileOptions) SetMoveFiles(flag bool) { + C.rocksdb_ingestexternalfileoptions_set_move_files(opts.c, boolToChar(flag)) +} + +// SetSnapshotConsistency if specifies the consistency. +// If set to false, an ingested file key could appear in existing snapshots that were created before the +// file was ingested. +// Default to true. +func (opts *IngestExternalFileOptions) SetSnapshotConsistency(flag bool) { + C.rocksdb_ingestexternalfileoptions_set_snapshot_consistency(opts.c, boolToChar(flag)) +} + +// SetAllowGlobalSeqNo sets allow_global_seqno. If set to false,IngestExternalFile() will fail if the file key +// range overlaps with existing keys or tombstones in the DB. +// Default true. +func (opts *IngestExternalFileOptions) SetAllowGlobalSeqNo(flag bool) { + C.rocksdb_ingestexternalfileoptions_set_allow_global_seqno(opts.c, boolToChar(flag)) +} + +// SetAllowBlockingFlush sets allow_blocking_flush. If set to false and the file key range overlaps with +// the memtable key range (memtable flush required), IngestExternalFile will fail. +// Default to true. 
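// Illustrative usage sketch (not part of this patch): triggering a manual,
// blocking memtable flush with the FlushOptions above. It assumes db is an
// open *DB and that the package's existing (*DB).Flush(*FlushOptions) wrapper
// is available.
func flushAndWaitSketch(db *DB) error {
	fo := NewDefaultFlushOptions()
	defer fo.Destroy()
	fo.SetWait(true) // do not return until the flush has completed
	return db.Flush(fo)
}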
+func (opts *IngestExternalFileOptions) SetAllowBlockingFlush(flag bool) { + C.rocksdb_ingestexternalfileoptions_set_allow_blocking_flush(opts.c, boolToChar(flag)) +} + +// SetIngestionBehind sets ingest_behind +// Set to true if you would like duplicate keys in the file being ingested +// to be skipped rather than overwriting existing data under that key. +// Usecase: back-fill of some historical data in the database without +// over-writing existing newer version of data. +// This option could only be used if the DB has been running +// with allow_ingest_behind=true since the dawn of time. +// All files will be ingested at the bottommost level with seqno=0. +func (opts *IngestExternalFileOptions) SetIngestionBehind(flag bool) { + C.rocksdb_ingestexternalfileoptions_set_ingest_behind(opts.c, boolToChar(flag)) +} + +// Destroy deallocates the IngestExternalFileOptions object. +func (opts *IngestExternalFileOptions) Destroy() { + C.rocksdb_ingestexternalfileoptions_destroy(opts.c) + opts.c = nil +} diff --git a/v8/options_read.go b/v8/options_read.go new file mode 100644 index 00000000..32c9c681 --- /dev/null +++ b/v8/options_read.go @@ -0,0 +1,157 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" +import "unsafe" + +// ReadTier controls fetching of data during a read request. +// An application can issue a read request (via Get/Iterators) and specify +// if that read should process data that ALREADY resides on a specified cache +// level. For example, if an application specifies BlockCacheTier then the +// Get call will process data that is already processed in the memtable or +// the block cache. It will not page in data from the OS cache or data that +// resides in storage. +type ReadTier uint + +const ( + // ReadAllTier reads data in memtable, block cache, OS cache or storage. + ReadAllTier = ReadTier(0) + // BlockCacheTier reads data in memtable or block cache. + BlockCacheTier = ReadTier(1) +) + +// ReadOptions represent all of the available options when reading from a +// database. +type ReadOptions struct { + c *C.rocksdb_readoptions_t +} + +// NewDefaultReadOptions creates a default ReadOptions object. +func NewDefaultReadOptions() *ReadOptions { + return NewNativeReadOptions(C.rocksdb_readoptions_create()) +} + +// NewNativeReadOptions creates a ReadOptions object. +func NewNativeReadOptions(c *C.rocksdb_readoptions_t) *ReadOptions { + return &ReadOptions{c} +} + +// UnsafeGetReadOptions returns the underlying c read options object. +func (opts *ReadOptions) UnsafeGetReadOptions() unsafe.Pointer { + return unsafe.Pointer(opts.c) +} + +// SetVerifyChecksums speciy if all data read from underlying storage will be +// verified against corresponding checksums. +// Default: false +func (opts *ReadOptions) SetVerifyChecksums(value bool) { + C.rocksdb_readoptions_set_verify_checksums(opts.c, boolToChar(value)) +} + +// SetPrefixSameAsStart Enforce that the iterator only iterates over the same +// prefix as the seek. +// This option is effective only for prefix seeks, i.e. prefix_extractor is +// non-null for the column family and total_order_seek is false. Unlike +// iterate_upper_bound, prefix_same_as_start only works within a prefix +// but in both directions. +// Default: false +func (opts *ReadOptions) SetPrefixSameAsStart(value bool) { + C.rocksdb_readoptions_set_prefix_same_as_start(opts.c, boolToChar(value)) +} + +// SetFillCache specify whether the "data block"/"index block"/"filter block" +// read for this iteration should be cached in memory? 
+// Callers may wish to set this field to false for bulk scans. +// Default: true +func (opts *ReadOptions) SetFillCache(value bool) { + C.rocksdb_readoptions_set_fill_cache(opts.c, boolToChar(value)) +} + +// SetSnapshot sets the snapshot which should be used for the read. +// The snapshot must belong to the DB that is being read and must +// not have been released. +// Default: nil +func (opts *ReadOptions) SetSnapshot(snap *Snapshot) { + C.rocksdb_readoptions_set_snapshot(opts.c, snap.c) +} + +// SetReadTier specify if this read request should process data that ALREADY +// resides on a particular cache. If the required data is not +// found at the specified cache, then Status::Incomplete is returned. +// Default: ReadAllTier +func (opts *ReadOptions) SetReadTier(value ReadTier) { + C.rocksdb_readoptions_set_read_tier(opts.c, C.int(value)) +} + +// SetTailing specify if to create a tailing iterator. +// A special iterator that has a view of the complete database +// (i.e. it can also be used to read newly added data) and +// is optimized for sequential reads. It will return records +// that were inserted into the database after the creation of the iterator. +// Default: false +func (opts *ReadOptions) SetTailing(value bool) { + C.rocksdb_readoptions_set_tailing(opts.c, boolToChar(value)) +} + +// SetIterateUpperBound specifies "iterate_upper_bound", which defines +// the extent upto which the forward iterator can returns entries. +// Once the bound is reached, Valid() will be false. +// "iterate_upper_bound" is exclusive ie the bound value is +// not a valid entry. If iterator_extractor is not null, the Seek target +// and iterator_upper_bound need to have the same prefix. +// This is because ordering is not guaranteed outside of prefix domain. +// There is no lower bound on the iterator. If needed, that can be easily +// implemented. +// Default: nullptr +func (opts *ReadOptions) SetIterateUpperBound(key []byte) { + cKey := byteToChar(key) + cKeyLen := C.size_t(len(key)) + C.rocksdb_readoptions_set_iterate_upper_bound(opts.c, cKey, cKeyLen) +} + +// SetPinData specifies the value of "pin_data". If true, it keeps the blocks +// loaded by the iterator pinned in memory as long as the iterator is not deleted, +// If used when reading from tables created with +// BlockBasedTableOptions::use_delta_encoding = false, +// Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to +// return 1. +// Default: false +func (opts *ReadOptions) SetPinData(value bool) { + C.rocksdb_readoptions_set_pin_data(opts.c, boolToChar(value)) +} + +// SetReadaheadSize specifies the value of "readahead_size". +// If non-zero, NewIterator will create a new table reader which +// performs reads of the given size. Using a large size (> 2MB) can +// improve the performance of forward iteration on spinning disks. +// Default: 0 +func (opts *ReadOptions) SetReadaheadSize(value uint64) { + C.rocksdb_readoptions_set_readahead_size(opts.c, C.size_t(value)) +} + +// SetTotalOrderSeek specifies the value of "total_order_seek". +// Enable a total order seek regardless of index format (e.g. hash index) +// used in the table. Some table format (e.g. plain table) may not support +// this option. +// If true when calling Get(), we also skip prefix bloom when reading from +// block based table. It provides a way to read existing data after +// changing implementation of prefix extractor. 
+// Default: false +func (opts *ReadOptions) SetTotalOrderSeek(value bool) { + C.rocksdb_readoptions_set_total_order_seek(opts.c, boolToChar(value)) +} + +// SetIgnoreRangeDeletions specifies the value of "ignore_range_deletions". +// If true, keys deleted using the DeleteRange() API will be visible to +// readers until they are naturally deleted during compaction. This improves +// read performance in DBs with many range deletions. +// Default: false +func (opts *ReadOptions) SetIgnoreRangeDeletions(value bool) { + C.rocksdb_readoptions_set_ignore_range_deletions(opts.c, boolToChar(value)) +} + +// Destroy deallocates the ReadOptions object. +func (opts *ReadOptions) Destroy() { + C.rocksdb_readoptions_destroy(opts.c) + opts.c = nil +} diff --git a/v8/options_test.go b/v8/options_test.go new file mode 100644 index 00000000..114cd60f --- /dev/null +++ b/v8/options_test.go @@ -0,0 +1,22 @@ +package gorocksdb + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestOptions(t *testing.T) { + opts := NewDefaultOptions() + defer opts.Destroy() + + // Test setting max bg jobs + assert.Equal(t, 2, opts.GetMaxBackgroundJobs()) + opts.SetMaxBackgroundJobs(10) + assert.Equal(t, 10, opts.GetMaxBackgroundJobs()) + + // Test setting max bg compactions + assert.Equal(t, uint32(1), opts.GetMaxSubcompactions()) + opts.SetMaxSubcompactions(9) + assert.Equal(t, uint32(9), opts.GetMaxSubcompactions()) +} diff --git a/v8/options_transaction.go b/v8/options_transaction.go new file mode 100644 index 00000000..cb72bff8 --- /dev/null +++ b/v8/options_transaction.go @@ -0,0 +1,66 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// TransactionOptions represent all of the available options options for +// a transaction on the database. +type TransactionOptions struct { + c *C.rocksdb_transaction_options_t +} + +// NewDefaultTransactionOptions creates a default TransactionOptions object. +func NewDefaultTransactionOptions() *TransactionOptions { + return NewNativeTransactionOptions(C.rocksdb_transaction_options_create()) +} + +// NewNativeTransactionOptions creates a TransactionOptions object. +func NewNativeTransactionOptions(c *C.rocksdb_transaction_options_t) *TransactionOptions { + return &TransactionOptions{c} +} + +// SetSetSnapshot to true is the same as calling +// Transaction::SetSnapshot(). +func (opts *TransactionOptions) SetSetSnapshot(value bool) { + C.rocksdb_transaction_options_set_set_snapshot(opts.c, boolToChar(value)) +} + +// SetDeadlockDetect to true means that before acquiring locks, this transaction will +// check if doing so will cause a deadlock. If so, it will return with +// Status::Busy. The user should retry their transaction. +func (opts *TransactionOptions) SetDeadlockDetect(value bool) { + C.rocksdb_transaction_options_set_deadlock_detect(opts.c, boolToChar(value)) +} + +// SetLockTimeout positive, specifies the wait timeout in milliseconds when +// a transaction attempts to lock a key. +// If 0, no waiting is done if a lock cannot instantly be acquired. +// If negative, TransactionDBOptions::transaction_lock_timeout will be used +func (opts *TransactionOptions) SetLockTimeout(lock_timeout int64) { + C.rocksdb_transaction_options_set_lock_timeout(opts.c, C.int64_t(lock_timeout)) +} + +// SetExpiration sets the Expiration duration in milliseconds. +// If non-negative, transactions that last longer than this many milliseconds will fail to commit. 
+// If not set, a forgotten transaction that is never committed, rolled back, or deleted +// will never relinquish any locks it holds. This could prevent keys from +// being written by other writers. +func (opts *TransactionOptions) SetExpiration(expiration int64) { + C.rocksdb_transaction_options_set_expiration(opts.c, C.int64_t(expiration)) +} + +// SetDeadlockDetectDepth sets the number of traversals to make during deadlock detection. +func (opts *TransactionOptions) SetDeadlockDetectDepth(depth int64) { + C.rocksdb_transaction_options_set_deadlock_detect_depth(opts.c, C.int64_t(depth)) +} + +// SetMaxWriteBatchSize sets the maximum number of bytes used for the write batch. 0 means no limit. +func (opts *TransactionOptions) SetMaxWriteBatchSize(size uint64) { + C.rocksdb_transaction_options_set_max_write_batch_size(opts.c, C.size_t(size)) +} + +// Destroy deallocates the TransactionOptions object. +func (opts *TransactionOptions) Destroy() { + C.rocksdb_transaction_options_destroy(opts.c) + opts.c = nil +} diff --git a/v8/options_transactiondb.go b/v8/options_transactiondb.go new file mode 100644 index 00000000..6c0b1cd9 --- /dev/null +++ b/v8/options_transactiondb.go @@ -0,0 +1,72 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// TransactionDBOptions represent all of the available options when opening a transactional database +// with OpenTransactionDb. +type TransactionDBOptions struct { + c *C.rocksdb_transactiondb_options_t +} + +// NewDefaultTransactionDBOptions creates a default TransactionDBOptions object. +func NewDefaultTransactionDBOptions() *TransactionDBOptions { + return NewNativeTransactionDBOptions(C.rocksdb_transactiondb_options_create()) +} + +// NewDefaultTransactionDBOptions creates a TransactionDBOptions object. +func NewNativeTransactionDBOptions(c *C.rocksdb_transactiondb_options_t) *TransactionDBOptions { + return &TransactionDBOptions{c} +} + +// SetMaxNumLocks sets the maximum number of keys that can be locked at the same time +// per column family. +// If the number of locked keys is greater than max_num_locks, transaction +// writes (or GetForUpdate) will return an error. +// If this value is not positive, no limit will be enforced. +func (opts *TransactionDBOptions) SetMaxNumLocks(max_num_locks int64) { + C.rocksdb_transactiondb_options_set_max_num_locks(opts.c, C.int64_t(max_num_locks)) +} + +// SetNumStripes sets the concurrency level. +// Increasing this value will increase the concurrency by dividing the lock +// table (per column family) into more sub-tables, each with their own +// separate +// mutex. +func (opts *TransactionDBOptions) SetNumStripes(num_stripes uint64) { + C.rocksdb_transactiondb_options_set_num_stripes(opts.c, C.size_t(num_stripes)) +} + +// SetTransactionLockTimeout if positive, specifies the default wait timeout in milliseconds when +// a transaction attempts to lock a key if not specified by +// TransactionOptions::lock_timeout. +// +// If 0, no waiting is done if a lock cannot instantly be acquired. +// If negative, there is no timeout. Not using a timeout is not recommended +// as it can lead to deadlocks. Currently, there is no deadlock-detection to +// recover from a deadlock. 
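// Illustrative usage sketch (not part of this patch): a TransactionDBOptions
// setup that bounds lock usage and waits a finite time for contended keys.
func transactionDBOptionsSketch() *TransactionDBOptions {
	tdbOpts := NewDefaultTransactionDBOptions()
	tdbOpts.SetMaxNumLocks(1 << 20)         // cap the number of locked keys per column family
	tdbOpts.SetNumStripes(16)               // split each per-CF lock table into 16 sub-tables
	tdbOpts.SetTransactionLockTimeout(1000) // default per-key lock wait of 1s, in milliseconds
	return tdbOpts
}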
+func (opts *TransactionDBOptions) SetTransactionLockTimeout(txn_lock_timeout int64) {
+	C.rocksdb_transactiondb_options_set_transaction_lock_timeout(opts.c, C.int64_t(txn_lock_timeout))
+}
+
+// SetDefaultLockTimeout, if positive, specifies the wait timeout in milliseconds when writing a key
+// OUTSIDE of a transaction (i.e. by calling DB::Put(), Merge(), Delete() or Write()
+// directly).
+// If 0, no waiting is done if a lock cannot instantly be acquired.
+// If negative, there is no timeout and the write will block indefinitely when acquiring
+// a lock.
+//
+// Not using a timeout can lead to deadlocks. Currently, there
+// is no deadlock-detection to recover from a deadlock. While DB writes
+// cannot deadlock with other DB writes, they can deadlock with a transaction.
+// A negative timeout should only be used if all transactions have a small
+// expiration set.
+func (opts *TransactionDBOptions) SetDefaultLockTimeout(default_lock_timeout int64) {
+	C.rocksdb_transactiondb_options_set_default_lock_timeout(opts.c, C.int64_t(default_lock_timeout))
+}
+
+// Destroy deallocates the TransactionDBOptions object.
+func (opts *TransactionDBOptions) Destroy() {
+	C.rocksdb_transactiondb_options_destroy(opts.c)
+	opts.c = nil
+}
diff --git a/v8/options_write.go b/v8/options_write.go
new file mode 100644
index 00000000..e956ded7
--- /dev/null
+++ b/v8/options_write.go
@@ -0,0 +1,58 @@
+package gorocksdb
+
+// #include "rocksdb/c.h"
+import "C"
+
+// WriteOptions represent all of the available options when writing to a
+// database.
+type WriteOptions struct {
+	c *C.rocksdb_writeoptions_t
+}
+
+// NewDefaultWriteOptions creates a default WriteOptions object.
+func NewDefaultWriteOptions() *WriteOptions {
+	return NewNativeWriteOptions(C.rocksdb_writeoptions_create())
+}
+
+// NewNativeWriteOptions creates a WriteOptions object.
+func NewNativeWriteOptions(c *C.rocksdb_writeoptions_t) *WriteOptions {
+	return &WriteOptions{c}
+}
+
+// SetSync sets the sync mode. If true, the write will be flushed
+// from the operating system buffer cache before the write is considered complete.
+// If this flag is true, writes will be slower.
+// Default: false
+func (opts *WriteOptions) SetSync(value bool) {
+	C.rocksdb_writeoptions_set_sync(opts.c, boolToChar(value))
+}
+
+// DisableWAL sets whether the WAL should be active or not.
+// If true, writes will not first go to the write ahead log,
+// and the write may be lost after a crash.
+// Default: false
+func (opts *WriteOptions) DisableWAL(value bool) {
+	C.rocksdb_writeoptions_disable_WAL(opts.c, C.int(btoi(value)))
+}
+
+// SetLowPri, if true, marks this write request as lower priority if compaction is
+// behind. In that case, if no_slowdown is true, the request will be cancelled
+// immediately with Status::Incomplete() returned. Otherwise, it will be
+// slowed down. The slowdown value is determined by RocksDB to guarantee
+// it introduces minimum impact on high priority writes.
+//
+// Default: false
+func (opts *WriteOptions) SetLowPri(value bool) {
+	C.rocksdb_writeoptions_set_low_pri(opts.c, boolToChar(value))
+}
+
+// IsLowPri returns whether this write request is marked as lower priority when compaction is behind.
+func (opts *WriteOptions) IsLowPri() bool {
+	return charToBool(C.rocksdb_writeoptions_get_low_pri(opts.c))
+}
+
+// Destroy deallocates the WriteOptions object.
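// Illustrative usage sketch (not part of this patch): two common WriteOptions
// profiles. The durable profile syncs before acknowledging a write; the fast
// profile skips the WAL, so unflushed writes can be lost after a crash.
func writeOptionsProfilesSketch() (durable, fast *WriteOptions) {
	durable = NewDefaultWriteOptions()
	durable.SetSync(true)

	fast = NewDefaultWriteOptions()
	fast.DisableWAL(true)
	return durable, fast
}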
+func (opts *WriteOptions) Destroy() { + C.rocksdb_writeoptions_destroy(opts.c) + opts.c = nil +} diff --git a/v8/ratelimiter.go b/v8/ratelimiter.go new file mode 100644 index 00000000..72b53951 --- /dev/null +++ b/v8/ratelimiter.go @@ -0,0 +1,31 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" + +// RateLimiter, is used to control write rate of flush and +// compaction. +type RateLimiter struct { + c *C.rocksdb_ratelimiter_t +} + +// NewDefaultRateLimiter creates a default RateLimiter object. +func NewRateLimiter(rate_bytes_per_sec, refill_period_us int64, fairness int32) *RateLimiter { + return NewNativeRateLimiter(C.rocksdb_ratelimiter_create( + C.int64_t(rate_bytes_per_sec), + C.int64_t(refill_period_us), + C.int32_t(fairness), + )) +} + +// NewNativeRateLimiter creates a native RateLimiter object. +func NewNativeRateLimiter(c *C.rocksdb_ratelimiter_t) *RateLimiter { + return &RateLimiter{c} +} + +// Destroy deallocates the RateLimiter object. +func (self *RateLimiter) Destroy() { + C.rocksdb_ratelimiter_destroy(self.c) + self.c = nil +} diff --git a/v8/slice.go b/v8/slice.go new file mode 100644 index 00000000..01ecfd3c --- /dev/null +++ b/v8/slice.go @@ -0,0 +1,92 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" +import "unsafe" + +// Slice is used as a wrapper for non-copy values +type Slice struct { + data *C.char + size C.size_t + freed bool +} + +type Slices []*Slice + +func (slices Slices) Destroy() { + for _, s := range slices { + s.Free() + } +} + +// NewSlice returns a slice with the given data. +func NewSlice(data *C.char, size C.size_t) *Slice { + return &Slice{data, size, false} +} + +// StringToSlice is similar to NewSlice, but can be called with +// a Go string type. This exists to make testing integration +// with Gorocksdb easier. +func StringToSlice(data string) *Slice { + return NewSlice(C.CString(data), C.size_t(len(data))) +} + +// Data returns the data of the slice. If the key doesn't exist this will be a +// nil slice. +func (s *Slice) Data() []byte { + return charToByte(s.data, s.size) +} + +// Size returns the size of the data. +func (s *Slice) Size() int { + return int(s.size) +} + +// Exists returns if the key exists +func (s *Slice) Exists() bool { + return s.data != nil +} + +// Free frees the slice data. +func (s *Slice) Free() { + if !s.freed { + C.rocksdb_free(unsafe.Pointer(s.data)) + s.freed = true + } +} + +// Copy returns a new copy of the slice and frees the slice. +func (s *Slice) Copy() []byte { + r := make([]byte, s.size) + copy(r, s.Data()) + s.Free() + return r +} + +// PinnableSliceHandle represents a handle to a PinnableSlice. +type PinnableSliceHandle struct { + c *C.rocksdb_pinnableslice_t +} + +// NewNativePinnableSliceHandle creates a PinnableSliceHandle object. +func NewNativePinnableSliceHandle(c *C.rocksdb_pinnableslice_t) *PinnableSliceHandle { + return &PinnableSliceHandle{c} +} + +// Data returns the data of the slice. +func (h *PinnableSliceHandle) Data() []byte { + if h.c == nil { + return nil + } + + var cValLen C.size_t + cValue := C.rocksdb_pinnableslice_value(h.c, &cValLen) + + return charToByte(cValue, cValLen) +} + +// Destroy calls the destructor of the underlying pinnable slice handle. 
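// Illustrative usage sketch (not part of this patch): capping flush and
// compaction write I/O with a RateLimiter. It assumes the package's existing
// (*Options).SetRateLimiter setter is available.
func rateLimitedOptionsSketch() *Options {
	// 64 MiB/s budget, refilled every 100000 microseconds (100ms), fairness 10.
	rl := NewRateLimiter(64<<20, 100000, 10)
	defer rl.Destroy() // safe here: RocksDB holds its own shared reference once attached

	opts := NewDefaultOptions()
	opts.SetRateLimiter(rl)
	return opts
}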
+func (h *PinnableSliceHandle) Destroy() { + C.rocksdb_pinnableslice_destroy(h.c) +} diff --git a/v8/slice_transform.go b/v8/slice_transform.go new file mode 100644 index 00000000..8b9b2362 --- /dev/null +++ b/v8/slice_transform.go @@ -0,0 +1,82 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// A SliceTransform can be used as a prefix extractor. +type SliceTransform interface { + // Transform a src in domain to a dst in the range. + Transform(src []byte) []byte + + // Determine whether this is a valid src upon the function applies. + InDomain(src []byte) bool + + // Determine whether dst=Transform(src) for some src. + InRange(src []byte) bool + + // Return the name of this transformation. + Name() string +} + +// NewFixedPrefixTransform creates a new fixed prefix transform. +func NewFixedPrefixTransform(prefixLen int) SliceTransform { + return NewNativeSliceTransform(C.rocksdb_slicetransform_create_fixed_prefix(C.size_t(prefixLen))) +} + +// NewNoopPrefixTransform creates a new no-op prefix transform. +func NewNoopPrefixTransform() SliceTransform { + return NewNativeSliceTransform(C.rocksdb_slicetransform_create_noop()) +} + +// NewNativeSliceTransform creates a SliceTransform object. +func NewNativeSliceTransform(c *C.rocksdb_slicetransform_t) SliceTransform { + return nativeSliceTransform{c} +} + +type nativeSliceTransform struct { + c *C.rocksdb_slicetransform_t +} + +func (st nativeSliceTransform) Transform(src []byte) []byte { return nil } +func (st nativeSliceTransform) InDomain(src []byte) bool { return false } +func (st nativeSliceTransform) InRange(src []byte) bool { return false } +func (st nativeSliceTransform) Name() string { return "" } + +// Hold references to slice transforms. +var sliceTransforms = NewCOWList() + +type sliceTransformWrapper struct { + name *C.char + sliceTransform SliceTransform +} + +func registerSliceTransform(st SliceTransform) int { + return sliceTransforms.Append(sliceTransformWrapper{C.CString(st.Name()), st}) +} + +//export gorocksdb_slicetransform_transform +func gorocksdb_slicetransform_transform(idx int, cKey *C.char, cKeyLen C.size_t, cDstLen *C.size_t) *C.char { + key := charToByte(cKey, cKeyLen) + dst := sliceTransforms.Get(idx).(sliceTransformWrapper).sliceTransform.Transform(key) + *cDstLen = C.size_t(len(dst)) + return cByteSlice(dst) +} + +//export gorocksdb_slicetransform_in_domain +func gorocksdb_slicetransform_in_domain(idx int, cKey *C.char, cKeyLen C.size_t) C.uchar { + key := charToByte(cKey, cKeyLen) + inDomain := sliceTransforms.Get(idx).(sliceTransformWrapper).sliceTransform.InDomain(key) + return boolToChar(inDomain) +} + +//export gorocksdb_slicetransform_in_range +func gorocksdb_slicetransform_in_range(idx int, cKey *C.char, cKeyLen C.size_t) C.uchar { + key := charToByte(cKey, cKeyLen) + inRange := sliceTransforms.Get(idx).(sliceTransformWrapper).sliceTransform.InRange(key) + return boolToChar(inRange) +} + +//export gorocksdb_slicetransform_name +func gorocksdb_slicetransform_name(idx int) *C.char { + return sliceTransforms.Get(idx).(sliceTransformWrapper).name +} diff --git a/v8/slice_transform_test.go b/v8/slice_transform_test.go new file mode 100644 index 00000000..d60c7326 --- /dev/null +++ b/v8/slice_transform_test.go @@ -0,0 +1,52 @@ +package gorocksdb + +import ( + "testing" + + "github.com/facebookgo/ensure" +) + +func TestSliceTransform(t *testing.T) { + db := newTestDB(t, "TestSliceTransform", func(opts *Options) { + opts.SetPrefixExtractor(&testSliceTransform{}) + }) + defer db.Close() + + wo := 
NewDefaultWriteOptions() + ensure.Nil(t, db.Put(wo, []byte("foo1"), []byte("foo"))) + ensure.Nil(t, db.Put(wo, []byte("foo2"), []byte("foo"))) + ensure.Nil(t, db.Put(wo, []byte("bar1"), []byte("bar"))) + + iter := db.NewIterator(NewDefaultReadOptions()) + defer iter.Close() + prefix := []byte("foo") + numFound := 0 + for iter.Seek(prefix); iter.ValidForPrefix(prefix); iter.Next() { + numFound++ + } + ensure.Nil(t, iter.Err()) + ensure.DeepEqual(t, numFound, 2) +} + +func TestFixedPrefixTransformOpen(t *testing.T) { + db := newTestDB(t, "TestFixedPrefixTransformOpen", func(opts *Options) { + opts.SetPrefixExtractor(NewFixedPrefixTransform(3)) + }) + defer db.Close() +} + +func TestNewNoopPrefixTransform(t *testing.T) { + db := newTestDB(t, "TestNewNoopPrefixTransform", func(opts *Options) { + opts.SetPrefixExtractor(NewNoopPrefixTransform()) + }) + defer db.Close() +} + +type testSliceTransform struct { + initiated bool +} + +func (st *testSliceTransform) Name() string { return "gorocksdb.test" } +func (st *testSliceTransform) Transform(src []byte) []byte { return src[0:3] } +func (st *testSliceTransform) InDomain(src []byte) bool { return len(src) >= 3 } +func (st *testSliceTransform) InRange(src []byte) bool { return len(src) == 3 } diff --git a/v8/snapshot.go b/v8/snapshot.go new file mode 100644 index 00000000..2ea16909 --- /dev/null +++ b/v8/snapshot.go @@ -0,0 +1,14 @@ +package gorocksdb + +// #include "rocksdb/c.h" +import "C" + +// Snapshot provides a consistent view of read operations in a DB. +type Snapshot struct { + c *C.rocksdb_snapshot_t +} + +// NewNativeSnapshot creates a Snapshot object. +func NewNativeSnapshot(c *C.rocksdb_snapshot_t) *Snapshot { + return &Snapshot{c} +} diff --git a/v8/sst_file_writer.go b/v8/sst_file_writer.go new file mode 100644 index 00000000..54f2c139 --- /dev/null +++ b/v8/sst_file_writer.go @@ -0,0 +1,67 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" + +import ( + "errors" + "unsafe" +) + +// SSTFileWriter is used to create sst files that can be added to database later. +// All keys in files generated by SstFileWriter will have sequence number = 0. +type SSTFileWriter struct { + c *C.rocksdb_sstfilewriter_t +} + +// NewSSTFileWriter creates an SSTFileWriter object. +func NewSSTFileWriter(opts *EnvOptions, dbOpts *Options) *SSTFileWriter { + c := C.rocksdb_sstfilewriter_create(opts.c, dbOpts.c) + return &SSTFileWriter{c: c} +} + +// Open prepares SstFileWriter to write into file located at "path". +func (w *SSTFileWriter) Open(path string) error { + var ( + cErr *C.char + cPath = C.CString(path) + ) + defer C.free(unsafe.Pointer(cPath)) + C.rocksdb_sstfilewriter_open(w.c, cPath, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Add adds key, value to currently opened file. +// REQUIRES: key is after any previously added key according to comparator. +func (w *SSTFileWriter) Add(key, value []byte) error { + cKey := byteToChar(key) + cValue := byteToChar(value) + var cErr *C.char + C.rocksdb_sstfilewriter_add(w.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Finish finishes writing to sst file and close file. 
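// Illustrative usage sketch (not part of this patch): writing an SST file and
// ingesting it into an open database. It assumes db is an open *DB, that the
// package's existing (*DB).IngestExternalFile wrapper is available, and that
// "/tmp/bulk.sst" is only a placeholder path.
func bulkLoadSketch(db *DB, dbOpts *Options) error {
	envOpts := NewDefaultEnvOptions()
	defer envOpts.Destroy()

	w := NewSSTFileWriter(envOpts, dbOpts)
	defer w.Destroy()

	if err := w.Open("/tmp/bulk.sst"); err != nil {
		return err
	}
	// Keys must be added in the comparator's order.
	for _, kv := range [][2]string{{"a", "1"}, {"b", "2"}, {"c", "3"}} {
		if err := w.Add([]byte(kv[0]), []byte(kv[1])); err != nil {
			return err
		}
	}
	if err := w.Finish(); err != nil {
		return err
	}

	ingestOpts := NewDefaultIngestExternalFileOptions()
	defer ingestOpts.Destroy()
	return db.IngestExternalFile([]string{"/tmp/bulk.sst"}, ingestOpts)
}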
+func (w *SSTFileWriter) Finish() error { + var cErr *C.char + C.rocksdb_sstfilewriter_finish(w.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Destroy destroys the SSTFileWriter object. +func (w *SSTFileWriter) Destroy() { + C.rocksdb_sstfilewriter_destroy(w.c) +} diff --git a/v8/staticflag_linux.go b/v8/staticflag_linux.go new file mode 100644 index 00000000..a7a8f420 --- /dev/null +++ b/v8/staticflag_linux.go @@ -0,0 +1,7 @@ +//go:build rocksdbstatic +// +build rocksdbstatic + +package gorocksdb + +// #cgo LDFLAGS: -l:librocksdb.a -l:libstdc++.a -l:libz.a -l:libbz2.a -l:libsnappy.a -l:liblz4.a -l:libzstd.a -lm -ldl +import "C" diff --git a/v8/transaction.go b/v8/transaction.go new file mode 100644 index 00000000..4417095d --- /dev/null +++ b/v8/transaction.go @@ -0,0 +1,215 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" + +import ( + "errors" + "unsafe" +) + +// Transaction is used with TransactionDB for transaction support. +type Transaction struct { + c *C.rocksdb_transaction_t +} + +// NewNativeTransaction creates a Transaction object. +func NewNativeTransaction(c *C.rocksdb_transaction_t) *Transaction { + return &Transaction{c} +} + +// Commit commits the transaction to the database. +func (transaction *Transaction) Commit() error { + var ( + cErr *C.char + ) + C.rocksdb_transaction_commit(transaction.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Rollback performs a rollback on the transaction. +func (transaction *Transaction) Rollback() error { + var ( + cErr *C.char + ) + C.rocksdb_transaction_rollback(transaction.c, &cErr) + + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Get returns the data associated with the key from the database given this transaction. +func (transaction *Transaction) Get(opts *ReadOptions, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_transaction_get( + transaction.c, opts.c, cKey, C.size_t(len(key)), &cValLen, &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil +} + +// GetCF returns the data associated with the key in a given column family from the database given this transaction. +func (transaction *Transaction) GetCF(opts *ReadOptions, cf *ColumnFamilyHandle, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_transaction_get_cf( + transaction.c, opts.c, cf.c, cKey, C.size_t(len(key)), &cValLen, &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil +} + +// GetForUpdate queries the data associated with the key and puts an exclusive lock on the key from the database given this transaction. 
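// Illustrative usage sketch (not part of this patch): a read-modify-write under
// a pessimistic lock. GetForUpdate locks the key, so a concurrent writer
// conflicts (or times out) instead of silently interleaving with this update.
func lockedUpdateSketch(txn *Transaction, ro *ReadOptions, key, newValue []byte) error {
	old, err := txn.GetForUpdate(ro, key)
	if err != nil {
		return err
	}
	defer old.Free()

	// The previous value is available via old.Data() for validation or merging.
	return txn.Put(key, newValue)
}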
+func (transaction *Transaction) GetForUpdate(opts *ReadOptions, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_transaction_get_for_update( + transaction.c, opts.c, cKey, C.size_t(len(key)), &cValLen, C.uchar(byte(1)) /*exclusive*/, &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil +} + +// GetForUpdateCF queries the data associated with the key in a given column family +// and puts an exclusive lock on the key from the database given this transaction. +func (transaction *Transaction) GetForUpdateCF(opts *ReadOptions, cf *ColumnFamilyHandle, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_transaction_get_for_update_cf( + transaction.c, opts.c, cf.c, cKey, C.size_t(len(key)), &cValLen, C.uchar(byte(1)) /*exclusive*/, &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil +} + +// GetPinnedForUpdateCF queries the data associated with the key in a given column family +// and puts an exclusive lock on the key from the database given this transaction. +// It uses a pinnable slice to improve performance by avoiding a memcpy. +func (transaction *Transaction) GetPinnedForUpdateCF(opts *ReadOptions, cf *ColumnFamilyHandle, key []byte) (*PinnableSliceHandle, error) { + var ( + cErr *C.char + cKey = byteToChar(key) + ) + + cHandle := C.rocksdb_transaction_get_pinned_for_update_cf( + transaction.c, opts.c, cf.c, cKey, C.size_t(len(key)), C.uchar(byte(1)) /*exclusive*/, &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewNativePinnableSliceHandle(cHandle), nil +} + +// Put writes data associated with a key to the transaction. +func (transaction *Transaction) Put(key, value []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + cValue = byteToChar(value) + ) + C.rocksdb_transaction_put( + transaction.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// PutCF writes data associated with a key in a given family to the transaction. +func (transaction *Transaction) PutCF(cf *ColumnFamilyHandle, key, value []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + cValue = byteToChar(value) + ) + C.rocksdb_transaction_put_cf( + transaction.c, cf.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Delete removes the data associated with the key from the transaction. +func (transaction *Transaction) Delete(key []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + ) + C.rocksdb_transaction_delete(transaction.c, cKey, C.size_t(len(key)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// DeleteCF removes the data in a given column family associated with the key from the transaction. 
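// Illustrative usage sketch (not part of this patch): the usual
// commit-or-rollback shape around a Transaction.
func runInTransactionSketch(db *TransactionDB, wo *WriteOptions, to *TransactionOptions, fn func(*Transaction) error) error {
	txn := db.TransactionBegin(wo, to, nil)
	defer txn.Destroy()

	if err := fn(txn); err != nil {
		// Best effort: surface the original error even if the rollback fails.
		_ = txn.Rollback()
		return err
	}
	return txn.Commit()
}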
+func (transaction *Transaction) DeleteCF(cf *ColumnFamilyHandle, key []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + ) + C.rocksdb_transaction_delete_cf(transaction.c, cf.c, cKey, C.size_t(len(key)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// NewIterator returns an Iterator over the database that uses the +// ReadOptions given. +func (transaction *Transaction) NewIterator(opts *ReadOptions) *Iterator { + return NewNativeIterator(unsafe.Pointer(C.rocksdb_transaction_create_iterator(transaction.c, opts.c))) +} + +// NewIteratorCF returns an Iterator over the column family that uses the +// ReadOptions given. +func (transaction *Transaction) NewIteratorCF(opts *ReadOptions, cf *ColumnFamilyHandle) *Iterator { + return NewNativeIterator(unsafe.Pointer(C.rocksdb_transaction_create_iterator_cf(transaction.c, opts.c, cf.c))) +} + +// Destroy deallocates the transaction object. +func (transaction *Transaction) Destroy() { + C.rocksdb_transaction_destroy(transaction.c) + transaction.c = nil +} diff --git a/v8/transactiondb.go b/v8/transactiondb.go new file mode 100644 index 00000000..eb12f04e --- /dev/null +++ b/v8/transactiondb.go @@ -0,0 +1,305 @@ +package gorocksdb + +// #include +// #include "rocksdb/c.h" +import "C" +import ( + "errors" + "unsafe" +) + +// TransactionDB is a reusable handle to a RocksDB transactional database on disk, created by OpenTransactionDb. +type TransactionDB struct { + c *C.rocksdb_transactiondb_t + name string + opts *Options + transactionDBOpts *TransactionDBOptions +} + +// OpenTransactionDb opens a database with the specified options. +func OpenTransactionDb( + opts *Options, + transactionDBOpts *TransactionDBOptions, + name string, +) (*TransactionDB, error) { + var ( + cErr *C.char + cName = C.CString(name) + ) + defer C.free(unsafe.Pointer(cName)) + db := C.rocksdb_transactiondb_open( + opts.c, transactionDBOpts.c, cName, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return &TransactionDB{ + name: name, + c: db, + opts: opts, + transactionDBOpts: transactionDBOpts, + }, nil +} + +// OpenDbColumnFamilies opens a database with the specified column families. 
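// Illustrative usage sketch (not part of this patch): opening a TransactionDB.
// "./txndb" is only a placeholder path; SetCreateIfMissing comes from the
// package's existing Options API.
func openTransactionDBSketch() (*TransactionDB, error) {
	opts := NewDefaultOptions()
	opts.SetCreateIfMissing(true)

	tdbOpts := NewDefaultTransactionDBOptions()
	tdbOpts.SetTransactionLockTimeout(1000) // milliseconds

	return OpenTransactionDb(opts, tdbOpts, "./txndb")
}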
+func OpenTransactionDbColumnFamilies( + opts *Options, + transactionDBOpts *TransactionDBOptions, + name string, + cfNames []string, + cfOpts []*Options, +) (*TransactionDB, []*ColumnFamilyHandle, error) { + numColumnFamilies := len(cfNames) + if numColumnFamilies != len(cfOpts) { + return nil, nil, errors.New("must provide the same number of column family names and options") + } + + cName := C.CString(name) + defer C.free(unsafe.Pointer(cName)) + + cNames := make([]*C.char, numColumnFamilies) + for i, s := range cfNames { + cNames[i] = C.CString(s) + } + defer func() { + for _, s := range cNames { + C.free(unsafe.Pointer(s)) + } + }() + + cOpts := make([]*C.rocksdb_options_t, numColumnFamilies) + for i, o := range cfOpts { + cOpts[i] = o.c + } + + cHandles := make([]*C.rocksdb_column_family_handle_t, numColumnFamilies) + + var cErr *C.char + db := C.rocksdb_transactiondb_open_column_families( + opts.c, + transactionDBOpts.c, + cName, + C.int(numColumnFamilies), + &cNames[0], + &cOpts[0], + &cHandles[0], + &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, nil, errors.New(C.GoString(cErr)) + } + + cfHandles := make([]*ColumnFamilyHandle, numColumnFamilies) + for i, c := range cHandles { + cfHandles[i] = NewNativeColumnFamilyHandle(c) + } + + return &TransactionDB{ + name: name, + c: db, + opts: opts, + }, cfHandles, nil +} + +// CreateColumnFamily creates a new column family. +func (db *TransactionDB) CreateColumnFamily(opts *Options, name string) (*ColumnFamilyHandle, error) { + var ( + cErr *C.char + cName = C.CString(name) + ) + defer C.free(unsafe.Pointer(cName)) + cHandle := C.rocksdb_transactiondb_create_column_family(db.c, opts.c, cName, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewNativeColumnFamilyHandle(cHandle), nil +} + +// NewSnapshot creates a new snapshot of the database. +func (db *TransactionDB) NewSnapshot() *Snapshot { + return NewNativeSnapshot(C.rocksdb_transactiondb_create_snapshot(db.c)) +} + +// ReleaseSnapshot releases the snapshot and its resources. +func (db *TransactionDB) ReleaseSnapshot(snapshot *Snapshot) { + C.rocksdb_transactiondb_release_snapshot(db.c, snapshot.c) + snapshot.c = nil +} + +// GetBaseDB gets base db. +func (db *TransactionDB) GetBaseDB() *DB { + base := C.rocksdb_transactiondb_get_base_db(db.c) + return &DB{c: base} +} + +// CloseBaseDBOfTransactionDB closes base db of TransactionDB. +func CloseBaseDBOfTransactionDB(db *DB) { + if db != nil && db.c != nil { + C.rocksdb_transactiondb_close_base_db(db.c) + } +} + +// TransactionBegin begins a new transaction +// with the WriteOptions and TransactionOptions given. +func (db *TransactionDB) TransactionBegin( + opts *WriteOptions, + transactionOpts *TransactionOptions, + oldTransaction *Transaction, +) *Transaction { + if oldTransaction != nil { + return NewNativeTransaction(C.rocksdb_transaction_begin( + db.c, + opts.c, + transactionOpts.c, + oldTransaction.c, + )) + } + + return NewNativeTransaction(C.rocksdb_transaction_begin( + db.c, opts.c, transactionOpts.c, nil)) +} + +// Get returns the data associated with the key from the database. 
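// Illustrative usage sketch (not part of this patch): reading at a fixed
// snapshot of a TransactionDB so concurrent writers do not affect the result.
func readAtSnapshotSketch(db *TransactionDB, key []byte) ([]byte, error) {
	snap := db.NewSnapshot()
	defer db.ReleaseSnapshot(snap)

	ro := NewDefaultReadOptions()
	defer ro.Destroy()
	ro.SetSnapshot(snap)

	value, err := db.Get(ro, key)
	if err != nil {
		return nil, err
	}
	return value.Copy(), nil // Copy also frees the underlying C-allocated slice
}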
+func (db *TransactionDB) Get(opts *ReadOptions, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_transactiondb_get( + db.c, opts.c, cKey, C.size_t(len(key)), &cValLen, &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil +} + +// GetCF returns the data associated with the key in a given column family from the database. +func (db *TransactionDB) GetCF(opts *ReadOptions, cf *ColumnFamilyHandle, key []byte) (*Slice, error) { + var ( + cErr *C.char + cValLen C.size_t + cKey = byteToChar(key) + ) + cValue := C.rocksdb_transactiondb_get_cf(db.c, opts.c, cf.c, cKey, C.size_t(len(key)), &cValLen, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + return NewSlice(cValue, cValLen), nil + +} + +// Put writes data associated with a key to the database. +func (db *TransactionDB) Put(opts *WriteOptions, key, value []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + cValue = byteToChar(value) + ) + C.rocksdb_transactiondb_put( + db.c, opts.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// PutCF writes data associated with a key to the database and column family. +func (db *TransactionDB) PutCF(opts *WriteOptions, cf *ColumnFamilyHandle, key, value []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + cValue = byteToChar(value) + ) + C.rocksdb_transactiondb_put_cf(db.c, opts.c, cf.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Write writes a WriteBatch to the database +func (db *TransactionDB) Write(opts *WriteOptions, batch *WriteBatch) error { + var cErr *C.char + C.rocksdb_transactiondb_write(db.c, opts.c, batch.c, &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// Delete removes the data associated with the key from the database. +func (db *TransactionDB) Delete(opts *WriteOptions, key []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + ) + C.rocksdb_transactiondb_delete(db.c, opts.c, cKey, C.size_t(len(key)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// DeleteCF removes the data associated with the key from the database and column family. +func (db *TransactionDB) DeleteCF(opts *WriteOptions, cf *ColumnFamilyHandle, key []byte) error { + var ( + cErr *C.char + cKey = byteToChar(key) + ) + C.rocksdb_transactiondb_delete_cf(db.c, opts.c, cf.c, cKey, C.size_t(len(key)), &cErr) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return errors.New(C.GoString(cErr)) + } + return nil +} + +// NewCheckpoint creates a new Checkpoint for this db. 
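// Illustrative usage sketch (not part of this patch): applying several updates
// atomically through TransactionDB.Write using a WriteBatch.
func atomicBatchSketch(db *TransactionDB, wo *WriteOptions) error {
	batch := NewWriteBatch()
	defer batch.Destroy()

	batch.Put([]byte("k1"), []byte("v1"))
	batch.Put([]byte("k2"), []byte("v2"))
	batch.Delete([]byte("stale-key"))

	return db.Write(wo, batch)
}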
+func (db *TransactionDB) NewCheckpoint() (*Checkpoint, error) { + var ( + cErr *C.char + ) + cCheckpoint := C.rocksdb_transactiondb_checkpoint_object_create( + db.c, &cErr, + ) + if cErr != nil { + defer C.rocksdb_free(unsafe.Pointer(cErr)) + return nil, errors.New(C.GoString(cErr)) + } + + return NewNativeCheckpoint(cCheckpoint), nil +} + +// NewIterator returns an Iterator over the database that uses the +// ReadOptions given. +func (db *TransactionDB) NewIterator(opts *ReadOptions) *Iterator { + return NewNativeIterator(unsafe.Pointer(C.rocksdb_transactiondb_create_iterator(db.c, opts.c))) +} + +// NewIteratorCF returns an Iterator over the column family that uses the +// ReadOptions given. +func (db *TransactionDB) NewIteratorCF(opts *ReadOptions, cf *ColumnFamilyHandle) *Iterator { + return NewNativeIterator(unsafe.Pointer(C.rocksdb_transactiondb_create_iterator_cf(db.c, opts.c, cf.c))) +} + +// UnsafeGetDB returns the underlying c rocksdb instance. +func (db *TransactionDB) UnsafeGetDB() unsafe.Pointer { + return unsafe.Pointer(db.c) +} + +// Close closes the database. +func (db *TransactionDB) Close() { + C.rocksdb_transactiondb_close(db.c) + db.c = nil +} diff --git a/v8/transactiondb_test.go b/v8/transactiondb_test.go new file mode 100644 index 00000000..e2c471b1 --- /dev/null +++ b/v8/transactiondb_test.go @@ -0,0 +1,376 @@ +package gorocksdb + +import ( + "fmt" + "io/ioutil" + "testing" + + "github.com/facebookgo/ensure" +) + +func TestOpenTransactionDb(t *testing.T) { + db := newTestTransactionDB(t, "TestOpenTransactionDb", nil) + defer db.Close() +} + +func TestTransactionDbColumnFamilies(t *testing.T) { + test_cf_names := []string{"default", "cf1", "cf2"} + db, cf_handles := newTestTransactionDBColumnFamilies(t, "TestOpenTransactionDbColumnFamilies", test_cf_names) + ensure.True(t, 3 == len(cf_handles)) + defer db.Close() + + cf_names, err := ListColumnFamilies(NewDefaultOptions(), db.name) + ensure.Nil(t, err) + ensure.True(t, 3 == len(cf_names)) + ensure.DeepEqual(t, cf_names, test_cf_names) + + for idx, cf_name := range test_cf_names { + ensure.Nil(t, db.PutCF(NewDefaultWriteOptions(), cf_handles[idx], []byte(cf_name+"_key"), []byte(cf_name+"_value"))) + } + + for idx, cf_name := range test_cf_names { + val, err := db.GetCF(NewDefaultReadOptions(), cf_handles[idx], []byte(cf_name+"_key")) + ensure.Nil(t, err) + ensure.DeepEqual(t, val.Data(), []byte(cf_name+"_value")) + } + + // Delete all keys in all column families + for idx, cf_name := range test_cf_names { + ensure.Nil(t, db.DeleteCF(NewDefaultWriteOptions(), cf_handles[idx], []byte(cf_name+"_key"))) + } + + for idx, cf_name := range test_cf_names { + val, err := db.GetCF(NewDefaultReadOptions(), cf_handles[idx], []byte(cf_name+"_key")) + ensure.Nil(t, err) + ensure.True(t, val.Size() == 0) + } + + { + cf_handle, err := db.CreateColumnFamily(NewDefaultOptions(), "new_cf") + ensure.Nil(t, err) + ensure.NotNil(t, cf_handle) + cf_names, err := ListColumnFamilies(NewDefaultOptions(), db.name) + ensure.Nil(t, err) + ensure.True(t, 4 == len(cf_names)) + } +} + +func TestTransactionDBCRUD(t *testing.T) { + db := newTestTransactionDB(t, "TestTransactionDBGet", nil) + defer db.Close() + + var ( + givenKey = []byte("hello") + givenVal1 = []byte("world1") + givenVal2 = []byte("world2") + givenTxnKey = []byte("hello2") + givenTxnKey2 = []byte("hello3") + givenTxnVal1 = []byte("whatawonderful") + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + to = NewDefaultTransactionOptions() + ) + + // create + ensure.Nil(t, 
db.Put(wo, givenKey, givenVal1)) + + // retrieve + v1, err := db.Get(ro, givenKey) + defer v1.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v1.Data(), givenVal1) + + // update + ensure.Nil(t, db.Put(wo, givenKey, givenVal2)) + v2, err := db.Get(ro, givenKey) + defer v2.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v2.Data(), givenVal2) + + // delete + ensure.Nil(t, db.Delete(wo, givenKey)) + v3, err := db.Get(ro, givenKey) + defer v3.Free() + ensure.Nil(t, err) + ensure.True(t, v3.Data() == nil) + + // transaction + txn := db.TransactionBegin(wo, to, nil) + defer txn.Destroy() + // create + ensure.Nil(t, txn.Put(givenTxnKey, givenTxnVal1)) + v4, err := txn.Get(ro, givenTxnKey) + defer v4.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v4.Data(), givenTxnVal1) + + ensure.Nil(t, txn.Commit()) + v5, err := db.Get(ro, givenTxnKey) + defer v5.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v5.Data(), givenTxnVal1) + + // transaction + txn2 := db.TransactionBegin(wo, to, nil) + defer txn2.Destroy() + // create + ensure.Nil(t, txn2.Put(givenTxnKey2, givenTxnVal1)) + // rollback + ensure.Nil(t, txn2.Rollback()) + + v6, err := txn2.Get(ro, givenTxnKey2) + defer v6.Free() + ensure.Nil(t, err) + ensure.True(t, v6.Data() == nil) + // transaction + txn3 := db.TransactionBegin(wo, to, nil) + defer txn3.Destroy() + // delete + ensure.Nil(t, txn3.Delete(givenTxnKey)) + ensure.Nil(t, txn3.Commit()) + + v7, err := db.Get(ro, givenTxnKey) + defer v7.Free() + ensure.Nil(t, err) + ensure.True(t, v7.Data() == nil) + +} + +func TestTransactionDBWriteBatchColumnFamilies(t *testing.T) { + test_cf_names := []string{"default", "cf1", "cf2"} + db, cf_handles := newTestTransactionDBColumnFamilies(t, "TestOpenTransactionDbColumnFamilies", test_cf_names) + ensure.True(t, len(cf_handles) == 3) + defer db.Close() + + var ( + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + ) + + // WriteBatch PutCF + { + batch := NewWriteBatch() + for h_idx := 1; h_idx <= 2; h_idx++ { + for k_idx := 0; k_idx <= 2; k_idx++ { + batch.PutCF(cf_handles[h_idx], []byte(fmt.Sprintf("%s_key_%d", test_cf_names[h_idx], k_idx)), + []byte(fmt.Sprintf("%s_value_%d", test_cf_names[h_idx], k_idx))) + } + } + ensure.Nil(t, db.Write(wo, batch)) + batch.Destroy() + } + + // Read back + { + for h_idx := 1; h_idx <= 2; h_idx++ { + for k_idx := 0; k_idx <= 2; k_idx++ { + data, err := db.GetCF(ro, cf_handles[h_idx], []byte(fmt.Sprintf("%s_key_%d", test_cf_names[h_idx], k_idx))) + ensure.Nil(t, err) + ensure.DeepEqual(t, data.Data(), []byte(fmt.Sprintf("%s_value_%d", test_cf_names[h_idx], k_idx))) + } + } + } + + { // WriteBatch with DeleteRangeCF not implemented + batch := NewWriteBatch() + batch.DeleteRangeCF(cf_handles[1], []byte(test_cf_names[1]+"_key_0"), []byte(test_cf_names[1]+"_key_2")) + ensure.NotNil(t, db.Write(wo, batch)) + } + // WriteBatch DeleteCF + { + batch := NewWriteBatch() + batch.DeleteCF(cf_handles[1], []byte(test_cf_names[1]+"_key_0")) + batch.DeleteCF(cf_handles[1], []byte(test_cf_names[1]+"_key_1")) + ensure.Nil(t, db.Write(wo, batch)) + } + + // Read back the remaining keys + { + // All keys on "cf2" are still there. 
+ // Only key2 on "cf1" still remains + for h_idx := 1; h_idx <= 2; h_idx++ { + for k_idx := 0; k_idx <= 2; k_idx++ { + data, err := db.GetCF(ro, cf_handles[h_idx], []byte(fmt.Sprintf("%s_key_%d", test_cf_names[h_idx], k_idx))) + ensure.Nil(t, err) + if h_idx == 2 || k_idx == 2 { + ensure.DeepEqual(t, data.Data(), []byte(fmt.Sprintf("%s_value_%d", test_cf_names[h_idx], k_idx))) + } else { + ensure.True(t, len(data.Data()) == 0) + } + } + } + } +} + +func TestTransactionDBCRUDColumnFamilies(t *testing.T) { + test_cf_names := []string{"default", "cf1", "cf2"} + db, cf_handles := newTestTransactionDBColumnFamilies(t, "TestOpenTransactionDbColumnFamilies", test_cf_names) + ensure.True(t, len(cf_handles) == 3) + defer db.Close() + + var ( + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + to = NewDefaultTransactionOptions() + ) + + { + txn := db.TransactionBegin(wo, to, nil) + defer txn.Destroy() + // RYW. + for idx, cf_handle := range cf_handles { + ensure.Nil(t, txn.PutCF(cf_handle, []byte(test_cf_names[idx]+"_key"), []byte(test_cf_names[idx]+"_value"))) + val, err := txn.GetCF(ro, cf_handle, []byte(test_cf_names[idx]+"_key")) + defer val.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, val.Data(), []byte(test_cf_names[idx]+"_value")) + } + txn.Commit() + } + + // Read after commit + for idx, cf_handle := range cf_handles { + val, err := db.GetCF(ro, cf_handle, []byte(test_cf_names[idx]+"_key")) + defer val.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, val.Data(), []byte(test_cf_names[idx]+"_value")) + } + + // Delete + { + txn := db.TransactionBegin(wo, to, nil) + defer txn.Destroy() + // RYW. + for idx, cf_handle := range cf_handles { + ensure.Nil(t, txn.DeleteCF(cf_handle, []byte(test_cf_names[idx]+"_key"))) + } + txn.Commit() + } + + // Read after delete commit + for idx, cf_handle := range cf_handles { + val, err := db.GetCF(ro, cf_handle, []byte(test_cf_names[idx]+"_key")) + defer val.Free() + ensure.Nil(t, err) + ensure.True(t, val.Size() == 0) + } +} + +func TestTransactionDBGetForUpdate(t *testing.T) { + lockTimeoutMilliSec := int64(50) + applyOpts := func(opts *Options, transactionDBOpts *TransactionDBOptions) { + transactionDBOpts.SetTransactionLockTimeout(lockTimeoutMilliSec) + } + db := newTestTransactionDB(t, "TestOpenTransactionDb", applyOpts) + defer db.Close() + + var ( + givenKey = []byte("hello") + givenVal = []byte("world") + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + to = NewDefaultTransactionOptions() + ) + + txn := db.TransactionBegin(wo, to, nil) + defer txn.Destroy() + + v, err := txn.GetForUpdate(ro, givenKey) + defer v.Free() + ensure.Nil(t, err) + + // expect lock timeout error to be thrown + if err := db.Put(wo, givenKey, givenVal); err == nil { + t.Error("expect locktime out error, got nil error") + } +} + +func TestTransactionDBGetForUpdateColumnFamilies(t *testing.T) { + test_cf_names := []string{"default", "cf1", "cf2"} + db, cf_handles := newTestTransactionDBColumnFamilies(t, "TestOpenTransactionDbColumnFamilies", test_cf_names) + ensure.True(t, 3 == len(cf_handles)) + defer db.Close() + + var ( + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + to = NewDefaultTransactionOptions() + ) + + { + txn := db.TransactionBegin(wo, to, nil) + defer txn.Destroy() + + val, err := txn.GetForUpdateCF(ro, cf_handles[1], []byte(test_cf_names[1]+"_key")) + defer val.Free() + ensure.Nil(t, err) + txn.PutCF(cf_handles[1], []byte(test_cf_names[1]+"_key"), []byte(test_cf_names[1]+"_value")) + ensure.Nil(t, txn.Commit()) + } + 
+ // Read after update + val, err := db.GetCF(ro, cf_handles[1], []byte(test_cf_names[1]+"_key")) + ensure.Nil(t, err) + ensure.DeepEqual(t, val.Data(), []byte(test_cf_names[1]+"_value")) +} + +func TestTransactionDBGetPinnedForUpdateColumnFamilies(t *testing.T) { + testCFNames := []string{"default", "cf1", "cf2"} + db, cfHandles := newTestTransactionDBColumnFamilies(t, "TestOpenTransactionDbColumnFamilies", testCFNames) + ensure.True(t, 3 == len(cfHandles)) + defer db.Close() + + var ( + wo = NewDefaultWriteOptions() + ro = NewDefaultReadOptions() + to = NewDefaultTransactionOptions() + ) + + { + txn := db.TransactionBegin(wo, to, nil) + defer txn.Destroy() + + val, err := txn.GetPinnedForUpdateCF(ro, cfHandles[1], []byte(testCFNames[1]+"_key")) + defer val.Destroy() + ensure.Nil(t, err) + txn.PutCF(cfHandles[1], []byte(testCFNames[1]+"_key"), []byte(testCFNames[1]+"_value")) + ensure.Nil(t, txn.Commit()) + } + + // Read after update + val, err := db.GetCF(ro, cfHandles[1], []byte(testCFNames[1]+"_key")) + ensure.Nil(t, err) + ensure.DeepEqual(t, val.Data(), []byte(testCFNames[1]+"_value")) +} + +func newTestTransactionDB(t *testing.T, name string, applyOpts func(opts *Options, transactionDBOpts *TransactionDBOptions)) *TransactionDB { + dir, err := ioutil.TempDir("", "gorockstransactiondb-"+name) + ensure.Nil(t, err) + + opts := NewDefaultOptions() + opts.SetCreateIfMissing(true) + transactionDBOpts := NewDefaultTransactionDBOptions() + if applyOpts != nil { + applyOpts(opts, transactionDBOpts) + } + db, err := OpenTransactionDb(opts, transactionDBOpts, dir) + ensure.Nil(t, err) + + return db +} + +func newTestTransactionDBColumnFamilies(t *testing.T, name string, cfNames []string) (*TransactionDB, []*ColumnFamilyHandle) { + dir, err := ioutil.TempDir("", "gorockstransactiondb-"+name) + ensure.Nil(t, err) + + opts := NewDefaultOptions() + opts.SetCreateIfMissing(true) + opts.SetCreateIfMissingColumnFamilies(true) + transactionDBOpts := NewDefaultTransactionDBOptions() + cfOpts := []*Options{opts, opts, opts} + db, cfHandles, err := OpenTransactionDbColumnFamilies(opts, transactionDBOpts, dir, cfNames, cfOpts) + ensure.Nil(t, err) + ensure.True(t, 3 == len(cfHandles)) + + return db, cfHandles +} diff --git a/v8/util.go b/v8/util.go new file mode 100644 index 00000000..fc138cd2 --- /dev/null +++ b/v8/util.go @@ -0,0 +1,66 @@ +package gorocksdb + +// #include + +import "C" +import "unsafe" + +// btoi converts a bool value to int. +func btoi(b bool) int { + if b { + return 1 + } + return 0 +} + +// boolToChar converts a bool value to C.uchar. +func boolToChar(b bool) C.uchar { + if b { + return 1 + } + return 0 +} + +// charToBool converts C.uchar to bool value +func charToBool(c C.uchar) bool { + return c != 0 +} + +// charToByte converts a *C.char to a byte slice. +func charToByte(data *C.char, len C.size_t) []byte { + return unsafe.Slice((*byte)(unsafe.Pointer(data)), int(len)) +} + +// byteToChar returns *C.char from byte slice. +func byteToChar(b []byte) *C.char { + var c *C.char + if len(b) > 0 { + c = (*C.char)(unsafe.Pointer(&b[0])) + } + return c +} + +// Go []byte to C string +// The C string is allocated in the C heap using malloc. +func cByteSlice(b []byte) *C.char { + var c *C.char + if len(b) > 0 { + c = (*C.char)(C.CBytes(b)) + } + return c +} + +// stringToChar returns *C.char from string. +func stringToChar(s string) *C.char { + return (*C.char)(unsafe.Pointer(unsafe.StringData(s))) +} + +// charSlice converts a C array of *char to a []*C.char. 
+func charSlice(data **C.char, len C.int) []*C.char {
+	return unsafe.Slice(data, int(len))
+}
+
+// sizeSlice converts a C array of size_t to a []C.size_t.
+func sizeSlice(data *C.size_t, len C.int) []C.size_t {
+	return unsafe.Slice(data, int(len))
+}
diff --git a/v8/wal_iterator.go b/v8/wal_iterator.go
new file mode 100755
index 00000000..7805d7c9
--- /dev/null
+++ b/v8/wal_iterator.go
@@ -0,0 +1,49 @@
+package gorocksdb
+
+// #include <stdlib.h>
+// #include "rocksdb/c.h"
+import "C"
+import (
+	"errors"
+	"unsafe"
+)
+
+type WalIterator struct {
+	c *C.rocksdb_wal_iterator_t
+}
+
+func NewNativeWalIterator(c unsafe.Pointer) *WalIterator {
+	return &WalIterator{(*C.rocksdb_wal_iterator_t)(c)}
+}
+
+func (iter *WalIterator) Valid() bool {
+	return C.rocksdb_wal_iter_valid(iter.c) != 0
+}
+
+func (iter *WalIterator) Next() {
+	C.rocksdb_wal_iter_next(iter.c)
+}
+
+func (iter *WalIterator) Err() error {
+	var cErr *C.char
+	C.rocksdb_wal_iter_status(iter.c, &cErr)
+	if cErr != nil {
+		defer C.rocksdb_free(unsafe.Pointer(cErr))
+		return errors.New(C.GoString(cErr))
+	}
+	return nil
+}
+
+func (iter *WalIterator) Destroy() {
+	C.rocksdb_wal_iter_destroy(iter.c)
+	iter.c = nil
+}
+
+// GetBatch returns the write batch at the current position together with its sequence number.
+// Note that C.rocksdb_wal_iter_get_batch in the official rocksdb C wrapper has a memory leak,
+// see https://github.com/facebook/rocksdb/pull/5515 and https://github.com/facebook/rocksdb/issues/5536.
+func (iter *WalIterator) GetBatch() (*WriteBatch, uint64) {
+	var cSeq C.uint64_t
+	cB := C.rocksdb_wal_iter_get_batch(iter.c, &cSeq)
+	return NewNativeWriteBatch(cB), uint64(cSeq)
+}
diff --git a/v8/write_batch.go b/v8/write_batch.go
new file mode 100644
index 00000000..f894427b
--- /dev/null
+++ b/v8/write_batch.go
@@ -0,0 +1,283 @@
+package gorocksdb
+
+// #include "rocksdb/c.h"
+import "C"
+import (
+	"errors"
+	"io"
+)
+
+// WriteBatch is a batching of Puts, Merges and Deletes.
+type WriteBatch struct {
+	c *C.rocksdb_writebatch_t
+}
+
+// NewWriteBatch creates a WriteBatch object.
+func NewWriteBatch() *WriteBatch {
+	return NewNativeWriteBatch(C.rocksdb_writebatch_create())
+}
+
+// NewNativeWriteBatch creates a WriteBatch object from an existing native handle.
+func NewNativeWriteBatch(c *C.rocksdb_writebatch_t) *WriteBatch {
+	return &WriteBatch{c}
+}
+
+// WriteBatchFrom creates a write batch from a serialized WriteBatch.
+func WriteBatchFrom(data []byte) *WriteBatch {
+	return NewNativeWriteBatch(C.rocksdb_writebatch_create_from(byteToChar(data), C.size_t(len(data))))
+}
+
+// Put queues a key-value pair.
+func (wb *WriteBatch) Put(key, value []byte) {
+	cKey := byteToChar(key)
+	cValue := byteToChar(value)
+	C.rocksdb_writebatch_put(wb.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)))
+}
+
+// PutCF queues a key-value pair in a column family.
+func (wb *WriteBatch) PutCF(cf *ColumnFamilyHandle, key, value []byte) {
+	cKey := byteToChar(key)
+	cValue := byteToChar(value)
+	C.rocksdb_writebatch_put_cf(wb.c, cf.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)))
+}
+
+// PutLogData appends a blob of arbitrary size to the records in this batch.
+func (wb *WriteBatch) PutLogData(blob []byte) {
+	cBlob := byteToChar(blob)
+	C.rocksdb_writebatch_put_log_data(wb.c, cBlob, C.size_t(len(blob)))
+}
+
+// Merge queues a merge of "value" with the existing value of "key".
+func (wb *WriteBatch) Merge(key, value []byte) {
+	cKey := byteToChar(key)
+	cValue := byteToChar(value)
+	C.rocksdb_writebatch_merge(wb.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)))
+}
+
+// MergeCF queues a merge of "value" with the existing value of "key" in a
+// column family.
+func (wb *WriteBatch) MergeCF(cf *ColumnFamilyHandle, key, value []byte) {
+	cKey := byteToChar(key)
+	cValue := byteToChar(value)
+	C.rocksdb_writebatch_merge_cf(wb.c, cf.c, cKey, C.size_t(len(key)), cValue, C.size_t(len(value)))
+}
+
+// Delete queues a deletion of the data at key.
+func (wb *WriteBatch) Delete(key []byte) {
+	cKey := byteToChar(key)
+	C.rocksdb_writebatch_delete(wb.c, cKey, C.size_t(len(key)))
+}
+
+// DeleteCF queues a deletion of the data at key in a column family.
+func (wb *WriteBatch) DeleteCF(cf *ColumnFamilyHandle, key []byte) {
+	cKey := byteToChar(key)
+	C.rocksdb_writebatch_delete_cf(wb.c, cf.c, cKey, C.size_t(len(key)))
+}
+
+// DeleteRange deletes keys that are between [startKey, endKey).
+func (wb *WriteBatch) DeleteRange(startKey []byte, endKey []byte) {
+	cStartKey := byteToChar(startKey)
+	cEndKey := byteToChar(endKey)
+	C.rocksdb_writebatch_delete_range(wb.c, cStartKey, C.size_t(len(startKey)), cEndKey, C.size_t(len(endKey)))
+}
+
+// DeleteRangeCF deletes keys that are between [startKey, endKey) and
+// belong to a given column family.
+func (wb *WriteBatch) DeleteRangeCF(cf *ColumnFamilyHandle, startKey []byte, endKey []byte) {
+	cStartKey := byteToChar(startKey)
+	cEndKey := byteToChar(endKey)
+	C.rocksdb_writebatch_delete_range_cf(wb.c, cf.c, cStartKey, C.size_t(len(startKey)), cEndKey, C.size_t(len(endKey)))
+}
+
+// Data returns the serialized version of this batch.
+func (wb *WriteBatch) Data() []byte {
+	var cSize C.size_t
+	cValue := C.rocksdb_writebatch_data(wb.c, &cSize)
+	return charToByte(cValue, cSize)
+}
+
+// Count returns the number of updates in the batch.
+func (wb *WriteBatch) Count() int {
+	return int(C.rocksdb_writebatch_count(wb.c))
+}
+
+// NewIterator returns an iterator to iterate over the records in the batch (the 12-byte batch header of 8-byte sequence number and 4-byte count is skipped).
+func (wb *WriteBatch) NewIterator() *WriteBatchIterator {
+	data := wb.Data()
+	if len(data) < 8+4 {
+		return &WriteBatchIterator{}
+	}
+	return &WriteBatchIterator{data: data[12:]}
+}
+
+// Clear removes all the enqueued Puts and Deletes.
+func (wb *WriteBatch) Clear() {
+	C.rocksdb_writebatch_clear(wb.c)
+}
+
+// Destroy deallocates the WriteBatch object.
+func (wb *WriteBatch) Destroy() {
+	C.rocksdb_writebatch_destroy(wb.c)
+	wb.c = nil
+}
+
+// WriteBatchRecordType describes the type of a batch record.
+type WriteBatchRecordType byte
+
+// Types of batch records.
+const (
+	WriteBatchDeletionRecord                 WriteBatchRecordType = 0x0
+	WriteBatchValueRecord                    WriteBatchRecordType = 0x1
+	WriteBatchMergeRecord                    WriteBatchRecordType = 0x2
+	WriteBatchLogDataRecord                  WriteBatchRecordType = 0x3
+	WriteBatchCFDeletionRecord               WriteBatchRecordType = 0x4
+	WriteBatchCFValueRecord                  WriteBatchRecordType = 0x5
+	WriteBatchCFMergeRecord                  WriteBatchRecordType = 0x6
+	WriteBatchSingleDeletionRecord           WriteBatchRecordType = 0x7
+	WriteBatchCFSingleDeletionRecord         WriteBatchRecordType = 0x8
+	WriteBatchBeginPrepareXIDRecord          WriteBatchRecordType = 0x9
+	WriteBatchEndPrepareXIDRecord            WriteBatchRecordType = 0xA
+	WriteBatchCommitXIDRecord                WriteBatchRecordType = 0xB
+	WriteBatchRollbackXIDRecord              WriteBatchRecordType = 0xC
+	WriteBatchNoopRecord                     WriteBatchRecordType = 0xD
+	WriteBatchRangeDeletion                  WriteBatchRecordType = 0xF
+	WriteBatchCFRangeDeletion                WriteBatchRecordType = 0xE
+	WriteBatchCFBlobIndex                    WriteBatchRecordType = 0x10
+	WriteBatchBlobIndex                      WriteBatchRecordType = 0x11
+	WriteBatchBeginPersistedPrepareXIDRecord WriteBatchRecordType = 0x12
+	WriteBatchNotUsedRecord                  WriteBatchRecordType = 0x7F
+)
+
+// WriteBatchRecord represents a record inside a WriteBatch.
+type WriteBatchRecord struct {
+	CF    int
+	Key   []byte
+	Value []byte
+	Type  WriteBatchRecordType
+}
+
+// WriteBatchIterator represents an iterator to iterate over the records of a WriteBatch.
+type WriteBatchIterator struct {
+	data   []byte
+	record WriteBatchRecord
+	err    error
+}
+
+// Next advances the iterator to the next record.
+// It returns false if no further record exists or the iteration failed.
+func (iter *WriteBatchIterator) Next() bool {
+	if iter.err != nil || len(iter.data) == 0 {
+		return false
+	}
+	// reset the current record
+	iter.record.CF = 0
+	iter.record.Key = nil
+	iter.record.Value = nil
+
+	// parse the record type
+	iter.record.Type = iter.decodeRecType()
+
+	switch iter.record.Type {
+	case
+		WriteBatchDeletionRecord,
+		WriteBatchSingleDeletionRecord:
+		iter.record.Key = iter.decodeSlice()
+	case
+		WriteBatchCFDeletionRecord,
+		WriteBatchCFSingleDeletionRecord:
+		iter.record.CF = int(iter.decodeVarint())
+		if iter.err == nil {
+			iter.record.Key = iter.decodeSlice()
+		}
+	case
+		WriteBatchValueRecord,
+		WriteBatchMergeRecord,
+		WriteBatchRangeDeletion,
+		WriteBatchBlobIndex:
+		iter.record.Key = iter.decodeSlice()
+		if iter.err == nil {
+			iter.record.Value = iter.decodeSlice()
+		}
+	case
+		WriteBatchCFValueRecord,
+		WriteBatchCFRangeDeletion,
+		WriteBatchCFMergeRecord,
+		WriteBatchCFBlobIndex:
+		iter.record.CF = int(iter.decodeVarint())
+		if iter.err == nil {
+			iter.record.Key = iter.decodeSlice()
+		}
+		if iter.err == nil {
+			iter.record.Value = iter.decodeSlice()
+		}
+	case WriteBatchLogDataRecord:
+		iter.record.Value = iter.decodeSlice()
+	case
+		WriteBatchNoopRecord,
+		WriteBatchBeginPrepareXIDRecord,
+		WriteBatchBeginPersistedPrepareXIDRecord:
+	case
+		WriteBatchEndPrepareXIDRecord,
+		WriteBatchCommitXIDRecord,
+		WriteBatchRollbackXIDRecord:
+		iter.record.Value = iter.decodeSlice()
+	default:
+		iter.err = errors.New("unsupported wal record type")
+	}
+
+	return iter.err == nil
+
+}
+
+// Record returns the current record.
+func (iter *WriteBatchIterator) Record() *WriteBatchRecord {
+	return &iter.record
+}
+
+// Error returns the error, if any, encountered during iteration.
+func (iter *WriteBatchIterator) Error() error { + return iter.err +} + +func (iter *WriteBatchIterator) decodeSlice() []byte { + l := int(iter.decodeVarint()) + if l > len(iter.data) { + iter.err = io.ErrShortBuffer + } + if iter.err != nil { + return []byte{} + } + ret := iter.data[:l] + iter.data = iter.data[l:] + return ret +} + +func (iter *WriteBatchIterator) decodeRecType() WriteBatchRecordType { + if len(iter.data) == 0 { + iter.err = io.ErrShortBuffer + return WriteBatchNotUsedRecord + } + t := iter.data[0] + iter.data = iter.data[1:] + return WriteBatchRecordType(t) +} + +func (iter *WriteBatchIterator) decodeVarint() uint64 { + var n int + var x uint64 + for shift := uint(0); shift < 64 && n < len(iter.data); shift += 7 { + b := uint64(iter.data[n]) + n++ + x |= (b & 0x7F) << shift + if (b & 0x80) == 0 { + iter.data = iter.data[n:] + return x + } + } + if n == len(iter.data) { + iter.err = io.ErrShortBuffer + } else { + iter.err = errors.New("malformed varint") + } + return 0 +} diff --git a/v8/write_batch_test.go b/v8/write_batch_test.go new file mode 100644 index 00000000..72eeb36e --- /dev/null +++ b/v8/write_batch_test.go @@ -0,0 +1,87 @@ +package gorocksdb + +import ( + "testing" + + "github.com/facebookgo/ensure" +) + +func TestWriteBatch(t *testing.T) { + db := newTestDB(t, "TestWriteBatch", nil) + defer db.Close() + + var ( + givenKey1 = []byte("key1") + givenVal1 = []byte("val1") + givenKey2 = []byte("key2") + ) + wo := NewDefaultWriteOptions() + ensure.Nil(t, db.Put(wo, givenKey2, []byte("foo"))) + + // create and fill the write batch + wb := NewWriteBatch() + defer wb.Destroy() + wb.Put(givenKey1, givenVal1) + wb.Delete(givenKey2) + ensure.DeepEqual(t, wb.Count(), 2) + + // perform the batch + ensure.Nil(t, db.Write(wo, wb)) + + // check changes + ro := NewDefaultReadOptions() + v1, err := db.Get(ro, givenKey1) + defer v1.Free() + ensure.Nil(t, err) + ensure.DeepEqual(t, v1.Data(), givenVal1) + + v2, err := db.Get(ro, givenKey2) + defer v2.Free() + ensure.Nil(t, err) + ensure.True(t, v2.Data() == nil) + + // DeleteRange test + wb.Clear() + wb.DeleteRange(givenKey1, givenKey2) + + // perform the batch + ensure.Nil(t, db.Write(wo, wb)) + + v1, err = db.Get(ro, givenKey1) + defer v1.Free() + ensure.Nil(t, err) + ensure.True(t, v1.Data() == nil) +} + +func TestWriteBatchIterator(t *testing.T) { + db := newTestDB(t, "TestWriteBatchIterator", nil) + defer db.Close() + + var ( + givenKey1 = []byte("key1") + givenVal1 = []byte("val1") + givenKey2 = []byte("key2") + ) + // create and fill the write batch + wb := NewWriteBatch() + defer wb.Destroy() + wb.Put(givenKey1, givenVal1) + wb.Delete(givenKey2) + ensure.DeepEqual(t, wb.Count(), 2) + + // iterate over the batch + iter := wb.NewIterator() + ensure.True(t, iter.Next()) + record := iter.Record() + ensure.DeepEqual(t, record.Type, WriteBatchValueRecord) + ensure.DeepEqual(t, record.Key, givenKey1) + ensure.DeepEqual(t, record.Value, givenVal1) + + ensure.True(t, iter.Next()) + record = iter.Record() + ensure.DeepEqual(t, record.Type, WriteBatchDeletionRecord) + ensure.DeepEqual(t, record.Key, givenKey2) + + // there shouldn't be any left + ensure.False(t, iter.Next()) +}
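Usage note (illustrative, not part of the patch): the sketch below shows how the new WalIterator and WriteBatchIterator types are meant to compose when tailing the write-ahead log. It assumes a GetUpdatesSince method on *DB that returns a *WalIterator (defined elsewhere in the package, not in this hunk); walTailExample and fn are hypothetical names. The key point is ownership: the WAL iterator and every batch returned by GetBatch wrap C allocations and must be destroyed by the caller.

// walTailExample drains all WAL updates from sequence number seq onwards and
// hands every decoded record to fn. It assumes db.GetUpdatesSince (defined
// elsewhere in the package, not in this hunk) returns a *WalIterator.
func walTailExample(db *DB, seq uint64, fn func(batchSeq uint64, rec *WriteBatchRecord)) error {
	iter, err := db.GetUpdatesSince(seq)
	if err != nil {
		return err
	}
	// The iterator wraps a C allocation and must be destroyed by the caller.
	defer iter.Destroy()

	for ; iter.Valid(); iter.Next() {
		batch, batchSeq := iter.GetBatch()

		// Decode the individual Put/Merge/Delete records of this batch.
		// Record() returns a pointer that is reused on every Next(), so it is
		// only valid until the next iteration.
		wbIter := batch.NewIterator()
		for wbIter.Next() {
			fn(batchSeq, wbIter.Record())
		}
		if err := wbIter.Error(); err != nil {
			batch.Destroy()
			return err
		}

		// GetBatch transfers ownership of the batch to the caller, so it must
		// be destroyed explicitly to free the underlying C object.
		batch.Destroy()
	}
	return iter.Err()
}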