diff --git a/Cargo.lock b/Cargo.lock index af7c24abd132c..afcb4a35b56f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5225,9 +5225,9 @@ dependencies = [ [[package]] name = "stringdex" -version = "0.0.1-alpha4" +version = "0.0.1-alpha8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2841fd43df5b1ff1b042e167068a1fe9b163dc93041eae56ab2296859013a9a0" +checksum = "180d5e520e6b3f10adc18ed61aa464c2c54ad0d5403e7292d37558bc508eae83" dependencies = [ "stacker", ] diff --git a/src/librustdoc/Cargo.toml b/src/librustdoc/Cargo.toml index 5d36ffc2d3a52..e150725d52566 100644 --- a/src/librustdoc/Cargo.toml +++ b/src/librustdoc/Cargo.toml @@ -21,7 +21,7 @@ rustdoc-json-types = { path = "../rustdoc-json-types" } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" smallvec = "1.8.1" -stringdex = { version = "0.0.1-alpha4" } +stringdex = { version = "0.0.1-alpha8" } tempfile = "3" threadpool = "1.8.1" tracing = "0.1" diff --git a/src/librustdoc/html/static/js/search.js b/src/librustdoc/html/static/js/search.js index 3fb4db3a89c5c..2c9004bc8ec1f 100644 --- a/src/librustdoc/html/static/js/search.js +++ b/src/librustdoc/html/static/js/search.js @@ -1211,7 +1211,7 @@ class DocSearch { * will never fulfill. */ async buildIndex() { - const nn = this.database.getIndex("normalizedName"); + const nn = this.database.getData("normalizedName"); if (!nn) { return; } @@ -3706,7 +3706,7 @@ class DocSearch { * @returns {AsyncGenerator} */ async function*(currentCrate) { - const index = this.database.getIndex("normalizedName"); + const index = this.database.getData("normalizedName"); if (!index) { return; } @@ -3835,8 +3835,7 @@ class DocSearch { }; if (elem.normalizedPathLast === "") { // faster full-table scan for this specific case. - const nameData = this.database.getData("name"); - const l = nameData ? nameData.length : 0; + const l = index.length; for (let id = 0; id < l; ++id) { if (!idDuplicates.has(id)) { idDuplicates.add(id); @@ -3938,7 +3937,7 @@ class DocSearch { * @returns {AsyncGenerator} */ async function*(inputs, output, typeInfo, currentCrate) { - const index = this.database.getIndex("normalizedName"); + const index = this.database.getData("normalizedName"); if (!index) { return; } diff --git a/src/librustdoc/html/static/js/stringdex.d.ts b/src/librustdoc/html/static/js/stringdex.d.ts index cf9a8b6b5648d..2eb1fdf95d845 100644 --- a/src/librustdoc/html/static/js/stringdex.d.ts +++ b/src/librustdoc/html/static/js/stringdex.d.ts @@ -5,17 +5,8 @@ declare namespace stringdex { * The client interface to Stringdex. */ interface Database { - getIndex(colname: string): SearchTree|undefined; getData(colname: string): DataColumn|undefined; } - /** - * A search index file. - */ - interface SearchTree { - trie(): Trie; - search(name: Uint8Array|string): Promise; - searchLev(name: Uint8Array|string): AsyncGenerator; - } /** * A compressed node in the search tree. * @@ -29,9 +20,7 @@ declare namespace stringdex { matches(): RoaringBitmap; substringMatches(): AsyncGenerator; prefixMatches(): AsyncGenerator; - keys(): Uint8Array; keysExcludeSuffixOnly(): Uint8Array; - children(): [number, Promise][]; childrenExcludeSuffixOnly(): [number, Promise][]; child(id: number): Promise?; } @@ -41,6 +30,8 @@ declare namespace stringdex { interface DataColumn { isEmpty(id: number): boolean; at(id: number): Promise; + search(name: Uint8Array|string): Promise; + searchLev(name: Uint8Array|string): AsyncGenerator; length: number, } /** diff --git a/src/librustdoc/html/static/js/stringdex.js b/src/librustdoc/html/static/js/stringdex.js index cb956d926db97..b7f605a10351d 100644 --- a/src/librustdoc/html/static/js/stringdex.js +++ b/src/librustdoc/html/static/js/stringdex.js @@ -1,3 +1,4 @@ +// ignore-tidy-filelength /** * @import * as stringdex from "./stringdex.d.ts" */ @@ -485,6 +486,63 @@ class RoaringBitmap { const mid = this.getContainerId(key); return mid === -1 ? false : this.containers[mid].contains(value); } + /** + * @param {number} keyvalue + * @returns {RoaringBitmap} + */ + remove(keyvalue) { + const key = keyvalue >> 16; + const value = keyvalue & 0xFFFF; + const mid = this.getContainerId(key); + if (mid === -1) { + return this; + } + const container = this.containers[mid]; + if (!container.contains(value)) { + return this; + } + const newCardinality = (this.keysAndCardinalities[(mid * 4) + 2] | + (this.keysAndCardinalities[(mid * 4) + 3] << 8)); + const l = this.containers.length; + const m = l - (newCardinality === 0 ? 1 : 0); + const result = new RoaringBitmap(null, 0); + result.keysAndCardinalities = new Uint8Array(m * 4); + let j = 0; + for (let i = 0; i < l; i += 1) { + if (i === mid) { + if (newCardinality !== 0) { + result.keysAndCardinalities[(j * 4) + 0] = key; + result.keysAndCardinalities[(j * 4) + 1] = key >> 8; + const card = newCardinality - 1; + result.keysAndCardinalities[(j * 4) + 2] = card; + result.keysAndCardinalities[(j * 4) + 3] = card >> 8; + const newContainer = new RoaringBitmapArray( + newCardinality, + new Uint8Array(newCardinality * 2), + ); + let newContainerSlot = 0; + for (const containerValue of container.values()) { + if (containerValue !== value) { + newContainer.array[newContainerSlot] = value & 0xFF; + newContainerSlot += 1; + newContainer.array[newContainerSlot] = value >> 8; + newContainerSlot += 1; + } + } + result.containers.push(newContainer); + j += 1; + } + } else { + result.keysAndCardinalities[(j * 4) + 0] = this.keysAndCardinalities[(i * 4) + 0]; + result.keysAndCardinalities[(j * 4) + 1] = this.keysAndCardinalities[(i * 4) + 1]; + result.keysAndCardinalities[(j * 4) + 2] = this.keysAndCardinalities[(i * 4) + 2]; + result.keysAndCardinalities[(j * 4) + 3] = this.keysAndCardinalities[(i * 4) + 3]; + result.containers.push(this.containers[i]); + j += 1; + } + } + return result; + } /** * @param {number} key * @returns {number} @@ -877,6 +935,46 @@ function bitCount(n) { } /*eslint-enable */ +/** + * https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore%E2%80%93Horspool_algorithm + */ +class Uint8ArraySearchPattern { + /** @param {Uint8Array} needle */ + constructor(needle) { + this.needle = needle; + this.skipTable = []; + const m = needle.length; + for (let i = 0; i < 256; i += 1) { + this.skipTable.push(m); + } + for (let i = 0; i < m - 1; i += 1) { + this.skipTable[needle[i]] = m - 1 - i; + } + } + /** + * @param {Uint8Array} haystack + * @returns {boolean} + */ + matches(haystack) { + const needle = this.needle; + const skipTable = this.skipTable; + const m = needle.length; + const n = haystack.length; + + let skip = 0; + search: while (n - skip >= m) { + for (let i = m - 1; i >= 0; i -= 1) { + if (haystack[skip + i] !== needle[i]) { + skip += skipTable[haystack[skip + m - 1]]; + continue search; + } + } + return true; + } + return false; + } +} + /** * @param {stringdex.Hooks} hooks * @returns {Promise} @@ -887,12 +985,6 @@ function loadDatabase(hooks) { rr_: function(data) { const dataObj = JSON.parse(data); for (const colName of Object.keys(dataObj)) { - if (Object.hasOwn(dataObj[colName], "I")) { - registry.searchTreeRoots.set( - colName, - makeSearchTreeFromBase64(dataObj[colName].I)[1], - ); - } if (Object.hasOwn(dataObj[colName], "N")) { const counts = []; const countsstring = dataObj[colName]["N"]; @@ -915,6 +1007,9 @@ function loadDatabase(hooks) { makeUint8ArrayFromBase64(dataObj[colName]["H"]), new RoaringBitmap(makeUint8ArrayFromBase64(dataObj[colName]["E"]), 0), colName, + Object.hasOwn(dataObj[colName], "I") ? + makeSearchTreeFromBase64(dataObj[colName].I)[1] : + null, )); } } @@ -978,7 +1073,7 @@ function loadDatabase(hooks) { * searchTreePromises: HashTable>; * dataColumnLoadPromiseCallbacks: HashTable; * dataColumns: Map; - * dataColumnsBuckets: Map>>; + * dataColumnsBuckets: HashTable>; * searchTreeLoadByNodeID: function(Uint8Array): Promise; * searchTreeRootCallback?: function(any, Database?): any; * dataLoadByNameAndHash: function(string, Uint8Array): Promise; @@ -990,7 +1085,7 @@ function loadDatabase(hooks) { searchTreePromises: new HashTable(), dataColumnLoadPromiseCallbacks: new HashTable(), dataColumns: new Map(), - dataColumnsBuckets: new Map(), + dataColumnsBuckets: new HashTable(), searchTreeLoadByNodeID: function(nodeid) { const existingPromise = registry.searchTreePromises.get(nodeid); if (existingPromise) { @@ -1022,7 +1117,7 @@ function loadDatabase(hooks) { const data = (nodeid[0] & 0x20) !== 0 ? Uint8Array.of(((nodeid[0] & 0x0f) << 4) | (nodeid[1] >> 4)) : EMPTY_UINT8; - newPromise = Promise.resolve(new SearchTree( + newPromise = Promise.resolve(new PrefixSearchTree( EMPTY_SEARCH_TREE_BRANCHES, EMPTY_SEARCH_TREE_BRANCHES, data, @@ -1058,12 +1153,7 @@ function loadDatabase(hooks) { return newPromise; }, dataLoadByNameAndHash: function(name, hash) { - let dataColumnBuckets = registry.dataColumnsBuckets.get(name); - if (dataColumnBuckets === undefined) { - dataColumnBuckets = new HashTable(); - registry.dataColumnsBuckets.set(name, dataColumnBuckets); - } - const existingBucket = dataColumnBuckets.get(hash); + const existingBucket = registry.dataColumnsBuckets.get(hash); if (existingBucket) { return existingBucket; } @@ -1091,16 +1181,17 @@ function loadDatabase(hooks) { hooks.loadDataByNameAndHash(name, hashHex); } }); - dataColumnBuckets.set(hash, newBucket); + registry.dataColumnsBuckets.set(hash, newBucket); return newBucket; }, }; /** * The set of child subtrees. + * @template ST * @type {{ * nodeids: Uint8Array, - * subtrees: Array|null>, + * subtrees: Array|null>, * }} */ class SearchTreeBranches { @@ -1128,7 +1219,7 @@ function loadDatabase(hooks) { ); } // https://github.com/microsoft/TypeScript/issues/17227 - /** @returns {Generator<[number, Promise|null]>} */ + /** @returns {Generator<[number, Promise|null]>} */ entries() { throw new Error(); } @@ -1157,10 +1248,12 @@ function loadDatabase(hooks) { /** * A sorted array of search tree branches. * + * @template ST + * @extends SearchTreeBranches * @type {{ * keys: Uint8Array, * nodeids: Uint8Array, - * subtrees: Array|null>, + * subtrees: Array|null>, * }} */ class SearchTreeBranchesArray extends SearchTreeBranches { @@ -1179,7 +1272,7 @@ function loadDatabase(hooks) { i += 1; } } - /** @returns {Generator<[number, Promise|null]>} */ + /** @returns {Generator<[number, Promise|null]>} */ * entries() { let i = 0; const l = this.keys.length; @@ -1246,12 +1339,16 @@ function loadDatabase(hooks) { } /** + * @template ST * @param {number[]} alphabitmap_chars * @param {number} width - * @return {(typeof SearchTreeBranches)&{"ALPHABITMAP_CHARS": number[], "width": number}} + * @return {(typeof SearchTreeBranches)&{"ALPHABITMAP_CHARS": number[], "width": number}} */ function makeSearchTreeBranchesAlphaBitmapClass(alphabitmap_chars, width) { const bitwidth = width * 8; + /** + * @extends SearchTreeBranches + */ const cls = class SearchTreeBranchesAlphaBitmap extends SearchTreeBranches { /** * @param {number} bitmap @@ -1265,7 +1362,7 @@ function loadDatabase(hooks) { this.bitmap = bitmap; this.nodeids = nodeids; } - /** @returns {Generator<[number, Promise|null]>} */ + /** @returns {Generator<[number, Promise|null]>} */ * entries() { let i = 0; let j = 0; @@ -1295,15 +1392,7 @@ function loadDatabase(hooks) { * @returns {number} */ getKey(branch_index) { - //return this.getKeys()[branch_index]; - let alpha_index = 0; - while (branch_index >= 0) { - if (this.bitmap & (1 << alpha_index)) { - branch_index -= 1; - } - alpha_index += 1; - } - return alphabitmap_chars[alpha_index]; + return this.getKeys()[branch_index]; } /** * @returns {Uint8Array} @@ -1326,20 +1415,35 @@ function loadDatabase(hooks) { return cls; } + /** + * @template ST + * @type {(typeof SearchTreeBranches)&{"ALPHABITMAP_CHARS": number[], "width": number}} + */ const SearchTreeBranchesShortAlphaBitmap = makeSearchTreeBranchesAlphaBitmapClass(SHORT_ALPHABITMAP_CHARS, 3); + /** + * @template ST + * @type {(typeof SearchTreeBranches)&{"ALPHABITMAP_CHARS": number[], "width": number}} + */ const SearchTreeBranchesLongAlphaBitmap = makeSearchTreeBranchesAlphaBitmapClass(LONG_ALPHABITMAP_CHARS, 4); /** - * A [suffix tree], used for name-based search. + * @typedef {PrefixSearchTree|SuffixSearchTree} SearchTree + * @typedef {PrefixTrie|SuffixTrie} Trie + */ + + /** + * An interleaved [prefix] and [suffix tree], + * used for name-based search. * - * This data structure is used to drive substring matches, + * This data structure is used to drive prefix matches, * such as matching the query "link" to `LinkedList`, * and Lev-distance matches, such as matching the * query "hahsmap" to `HashMap`. * + * [prefix tree]: https://en.wikipedia.org/wiki/Prefix_tree * [suffix tree]: https://en.wikipedia.org/wiki/Suffix_tree * * branches @@ -1358,17 +1462,17 @@ function loadDatabase(hooks) { * will include these. * * @type {{ - * might_have_prefix_branches: SearchTreeBranches, - * branches: SearchTreeBranches, + * might_have_prefix_branches: SearchTreeBranches, + * branches: SearchTreeBranches, * data: Uint8Array, * leaves_suffix: RoaringBitmap, * leaves_whole: RoaringBitmap, * }} */ - class SearchTree { + class PrefixSearchTree { /** - * @param {SearchTreeBranches} branches - * @param {SearchTreeBranches} might_have_prefix_branches + * @param {SearchTreeBranches} branches + * @param {SearchTreeBranches} might_have_prefix_branches * @param {Uint8Array} data * @param {RoaringBitmap} leaves_whole * @param {RoaringBitmap} leaves_suffix @@ -1392,25 +1496,31 @@ function loadDatabase(hooks) { * A Trie pointer refers to a single node in a logical decompressed search tree * (the real search tree is compressed). * - * @return {Trie} + * @param {DataColumn} dataColumn + * @param {Uint8ArraySearchPattern} searchPattern + * @return {PrefixTrie} */ - trie() { - return new Trie(this, 0); + trie(dataColumn, searchPattern) { + return new PrefixTrie(this, 0, dataColumn, searchPattern); } /** * Return the trie representing `name` * @param {Uint8Array|string} name + * @param {DataColumn} dataColumn * @returns {Promise} */ - async search(name) { + async search(name, dataColumn) { if (typeof name === "string") { const utf8encoder = new TextEncoder(); name = utf8encoder.encode(name); } - let trie = this.trie(); + const searchPattern = new Uint8ArraySearchPattern(name); + /** @type {Trie} */ + let trie = this.trie(dataColumn, searchPattern); for (const datum of name) { // code point definitely exists + /** @type {Promise?} */ const newTrie = trie.child(datum); if (newTrie) { trie = await newTrie; @@ -1423,26 +1533,28 @@ function loadDatabase(hooks) { /** * @param {Uint8Array|string} name + * @param {DataColumn} dataColumn * @returns {AsyncGenerator} */ - async* searchLev(name) { + async* searchLev(name, dataColumn) { if (typeof name === "string") { const utf8encoder = new TextEncoder(); name = utf8encoder.encode(name); } const w = name.length; if (w < 3) { - const trie = await this.search(name); + const trie = await this.search(name, dataColumn); if (trie !== null) { yield trie; } return; } + const searchPattern = new Uint8ArraySearchPattern(name); const levParams = w >= 6 ? new Lev2TParametricDescription(w) : new Lev1TParametricDescription(w); /** @type {Array<[Promise, number]>} */ - const stack = [[Promise.resolve(this.trie()), 0]]; + const stack = [[Promise.resolve(this.trie(dataColumn, searchPattern)), 0]]; const n = levParams.n; while (stack.length !== 0) { // It's not empty @@ -1475,20 +1587,29 @@ function loadDatabase(hooks) { } } } + + /** @returns {RoaringBitmap} */ + getCurrentLeaves() { + return this.leaves_whole.union(this.leaves_suffix); + } } /** * A representation of a set of strings in the search index, * as a subset of the entire tree. */ - class Trie { + class PrefixTrie { /** - * @param {SearchTree} tree + * @param {PrefixSearchTree} tree * @param {number} offset + * @param {DataColumn} dataColumn + * @param {Uint8ArraySearchPattern} searchPattern */ - constructor(tree, offset) { + constructor(tree, offset, dataColumn, searchPattern) { this.tree = tree; this.offset = offset; + this.dataColumn = dataColumn; + this.searchPattern = searchPattern; } /** @@ -1514,7 +1635,28 @@ function loadDatabase(hooks) { const current_layer = layer; layer = []; for await (const tree of current_layer) { - yield tree.leaves_whole.union(tree.leaves_suffix); + /** @type {number[]?} */ + let rejected = null; + let leaves = tree.getCurrentLeaves(); + for (const leaf of leaves.entries()) { + const haystack = await this.dataColumn.at(leaf); + if (haystack === undefined || !this.searchPattern.matches(haystack)) { + if (!rejected) { + rejected = []; + } + rejected.push(leaf); + } + } + if (rejected) { + if (leaves.cardinality() !== rejected.length) { + for (const rej of rejected) { + leaves = leaves.remove(rej); + } + yield leaves; + } + } else { + yield leaves; + } } /** @type {HashTable<[number, SearchTree][]>} */ const subnodes = new HashTable(); @@ -1548,12 +1690,14 @@ function loadDatabase(hooks) { const res = registry.searchTreeLoadByNodeID(newnode); for (const [byte, node] of subnode_list) { const branches = node.branches; - const might_have_prefix_branches = node.might_have_prefix_branches; const i = branches.getIndex(byte); branches.subtrees[i] = res; - const mhpI = might_have_prefix_branches.getIndex(byte); - if (mhpI !== -1) { - might_have_prefix_branches.subtrees[mhpI] = res; + if (node instanceof PrefixSearchTree) { + const might_have_prefix_branches = node.might_have_prefix_branches; + const mhpI = might_have_prefix_branches.getIndex(byte); + if (mhpI !== -1) { + might_have_prefix_branches.subtrees[mhpI] = res; + } } } layer.push(res); @@ -1581,6 +1725,9 @@ function loadDatabase(hooks) { // if we want to do them in order) for (const {node, len} of current_layer) { const tree = await node; + if (!(tree instanceof PrefixSearchTree)) { + continue; + } const length = len + tree.data.length; if (minLength === null || length < minLength) { minLength = length; @@ -1637,10 +1784,13 @@ function loadDatabase(hooks) { } } // if we still have more subtrees to walk, then keep going - /** @type {HashTable<{byte: number, tree: SearchTree, len: number}[]>} */ + /** @type {HashTable<{byte: number, tree: PrefixSearchTree, len: number}[]>} */ const subnodes = new HashTable(); for await (const {node, len} of current_layer) { const tree = await node; + if (!(tree instanceof PrefixSearchTree)) { + continue; + } const length = len + tree.data.length; const mhp_branches = tree.might_have_prefix_branches; const l = mhp_branches.subtrees.length; @@ -1663,8 +1813,6 @@ function loadDatabase(hooks) { subnode_list.push({byte, tree, len}); } } - } else { - throw new Error(`malformed tree; index ${i} does not exist`); } } } @@ -1728,14 +1876,21 @@ function loadDatabase(hooks) { this.tree.might_have_prefix_branches.subtrees[mhpI] = node; } } - nodes.push([k, node.then(node => node.trie())]); + nodes.push([k, node.then(node => { + return node.trie(this.dataColumn, this.searchPattern); + })]); i += 1; } return nodes; } else { /** @type {number} */ const codePoint = data[this.offset]; - const trie = new Trie(this.tree, this.offset + 1); + const trie = new PrefixTrie( + this.tree, + this.offset + 1, + this.dataColumn, + this.searchPattern, + ); return [[codePoint, Promise.resolve(trie)]]; } } @@ -1777,14 +1932,21 @@ function loadDatabase(hooks) { this.tree.might_have_prefix_branches.subtrees[i] = node; this.tree.branches.subtrees[this.tree.branches.getIndex(k)] = node; } - nodes.push([k, node.then(node => node.trie())]); + nodes.push([k, node.then(node => { + return node.trie(this.dataColumn, this.searchPattern); + })]); i += 1; } return nodes; } else { /** @type {number} */ const codePoint = data[this.offset]; - const trie = new Trie(this.tree, this.offset + 1); + const trie = new PrefixTrie( + this.tree, + this.offset + 1, + this.dataColumn, + this.searchPattern, + ); return [[codePoint, Promise.resolve(trie)]]; } } @@ -1811,10 +1973,275 @@ function loadDatabase(hooks) { this.tree.might_have_prefix_branches.subtrees[mhpI] = branch; } } - return branch.then(branch => branch.trie()); + return branch.then(branch => branch.trie(this.dataColumn, this.searchPattern)); } } else if (this.tree.data[this.offset] === byte) { - return Promise.resolve(new Trie(this.tree, this.offset + 1)); + return Promise.resolve(new PrefixTrie( + this.tree, + this.offset + 1, + this.dataColumn, + this.searchPattern, + )); + } + return null; + } + } + /** + * A [suffix tree], used for name-based search. + * + * This data structure is used to drive substring matches, + * such as matching the query "inked" to `LinkedList`. + * + * [suffix tree]: https://en.wikipedia.org/wiki/Suffix_tree + * + * Suffix trees do not actually carry the intermediate data + * between branches, so, in order to validate the results, + * they must go through a filtering step at the end. + * Suffix trees also cannot match lev and exact matches, + * so those just return empty sets. + * + * branches + * : A sorted-array map of subtrees. + * + * dataLen + * : The length of the substring used by this node. + * + * leaves_suffix + * : The IDs of every entry that matches. Levenshtein matches + * won't include these. + * + * @type {{ + * branches: SearchTreeBranches, + * dataLen: number, + * leaves_suffix: RoaringBitmap, + * }} + */ + class SuffixSearchTree { + /** + * @param {SearchTreeBranches} branches + * @param {number} dataLen + * @param {RoaringBitmap} leaves_suffix + */ + constructor( + branches, + dataLen, + leaves_suffix, + ) { + this.branches = branches; + this.dataLen = dataLen; + this.leaves_suffix = leaves_suffix; + } + /** + * Returns the Trie for the root node. + * + * A Trie pointer refers to a single node in a logical decompressed search tree + * (the real search tree is compressed). + * + * @param {DataColumn} dataColumn + * @param {Uint8ArraySearchPattern} searchPattern + * @return {Trie} + */ + trie(dataColumn, searchPattern) { + return new SuffixTrie(this, 0, dataColumn, searchPattern); + } + + /** + * Return the trie representing `name` + * @param {Uint8Array|string} name + * @param {DataColumn} dataColumn + * @returns {Promise} + */ + async search(name, dataColumn) { + if (typeof name === "string") { + const utf8encoder = new TextEncoder(); + name = utf8encoder.encode(name); + } + const searchPattern = new Uint8ArraySearchPattern(name); + let trie = this.trie(dataColumn, searchPattern); + for (const datum of name) { + // code point definitely exists + const newTrie = trie.child(datum); + if (newTrie) { + trie = await newTrie; + } else { + return null; + } + } + return trie; + } + + /** + * @param {Uint8Array|string} _name + * @param {DataColumn} _dataColumn + * @returns {AsyncGenerator} + */ + async* searchLev(_name, _dataColumn) { + // this function only returns whole-string matches, + // which pure-suffix nodes don't have, so is + // intentionally blank + } + + /** @returns {RoaringBitmap} */ + getCurrentLeaves() { + return this.leaves_suffix; + } + } + + /** + * A representation of a set of strings in the search index, + * as a subset of the entire tree (suffix-only). + */ + class SuffixTrie { + /** + * @param {SuffixSearchTree} tree + * @param {number} offset + * @param {DataColumn} dataColumn + * @param {Uint8ArraySearchPattern} searchPattern + */ + constructor(tree, offset, dataColumn, searchPattern) { + this.tree = tree; + this.offset = offset; + this.dataColumn = dataColumn; + this.searchPattern = searchPattern; + } + + /** + * All exact matches for the string represented by this node. + * Since pure-suffix nodes have no exactly-matching children, + * this function returns the empty bitmap. + * @returns {RoaringBitmap} + */ + matches() { + return EMPTY_BITMAP; + } + + /** + * All matches for strings that contain the string represented by this node. + * @returns {AsyncGenerator} + */ + async* substringMatches() { + /** @type {Promise[]} */ + let layer = [Promise.resolve(this.tree)]; + while (layer.length) { + const current_layer = layer; + layer = []; + for await (const tree of current_layer) { + /** @type {number[]?} */ + let rejected = null; + let leaves = tree.getCurrentLeaves(); + for (const leaf of leaves.entries()) { + const haystack = await this.dataColumn.at(leaf); + if (haystack === undefined || !this.searchPattern.matches(haystack)) { + if (!rejected) { + rejected = []; + } + rejected.push(leaf); + } + } + if (rejected) { + if (leaves.cardinality() !== rejected.length) { + for (const rej of rejected) { + leaves = leaves.remove(rej); + } + yield leaves; + } + } else { + yield leaves; + } + } + /** @type {HashTable<[number, SearchTree][]>} */ + const subnodes = new HashTable(); + for await (const node of current_layer) { + const branches = node.branches; + const l = branches.subtrees.length; + for (let i = 0; i < l; ++i) { + const subtree = branches.subtrees[i]; + if (subtree) { + layer.push(subtree); + } else if (subtree === null) { + const newnode = branches.getNodeID(i); + if (!newnode) { + throw new Error(`malformed tree; no node for index ${i}`); + } else { + let subnode_list = subnodes.get(newnode); + if (!subnode_list) { + subnode_list = [[i, node]]; + subnodes.set(newnode, subnode_list); + } else { + subnode_list.push([i, node]); + } + } + } else { + throw new Error(`malformed tree; index ${i} does not exist`); + } + } + } + for (const [newnode, subnode_list] of subnodes.entries()) { + const res = registry.searchTreeLoadByNodeID(newnode); + for (const [i, node] of subnode_list) { + const branches = node.branches; + branches.subtrees[i] = res; + } + layer.push(res); + } + } + } + + /** + * All matches for strings that start with the string represented by this node. + * Since this is a pure-suffix node, there aren't any. + * @returns {AsyncGenerator} + */ + async* prefixMatches() { + // this function only returns prefix matches, + // which pure-suffix nodes don't have, so is + // intentionally blank + } + + /** + * Returns all keys that are children of this node. + * @returns {Uint8Array} + */ + keysExcludeSuffixOnly() { + return EMPTY_UINT8; + } + + /** + * Returns all nodes that are direct children of this node. + * @returns {[number, Promise][]} + */ + childrenExcludeSuffixOnly() { + return []; + } + + /** + * Returns a single node that is a direct child of this node. + * @param {number} byte + * @returns {Promise?} + */ + child(byte) { + if (this.offset === this.tree.dataLen) { + const i = this.tree.branches.getIndex(byte); + if (i !== -1) { + /** @type {Promise?} */ + let branch = this.tree.branches.subtrees[i]; + if (branch === null) { + const newnode = this.tree.branches.getNodeID(i); + if (!newnode) { + throw new Error(`malformed tree; no node for key ${byte}`); + } + branch = registry.searchTreeLoadByNodeID(newnode); + this.tree.branches.subtrees[i] = branch; + } + return branch.then(branch => branch.trie(this.dataColumn, this.searchPattern)); + } + } else { + return Promise.resolve(new SuffixTrie( + this.tree, + this.offset + 1, + this.dataColumn, + this.searchPattern, + )); } return null; } @@ -1827,8 +2254,10 @@ function loadDatabase(hooks) { * @param {Uint8Array} hashes * @param {RoaringBitmap} emptyset * @param {string} name + * @param {SearchTree?} searchTree */ - constructor(counts, hashes, emptyset, name) { + constructor(counts, hashes, emptyset, name, searchTree) { + this.searchTree = searchTree; this.hashes = hashes; this.emptyset = emptyset; this.name = name; @@ -1883,7 +2312,7 @@ function loadDatabase(hooks) { const {hash, end} = this.buckets[idx]; let data = this.buckets[idx].data; if (data === null) { - const dataSansEmptyset = await registry.dataLoadByNameAndHash( + const dataSansEmptysetOrig = await registry.dataLoadByNameAndHash( this.name, hash, ); @@ -1893,6 +2322,7 @@ function loadDatabase(hooks) { if (data !== null) { return (await data)[id - start]; } + const dataSansEmptyset = [...dataSansEmptysetOrig]; /** @type {(Uint8Array[])|null} */ let dataWithEmptyset = null; let pos = start; @@ -1924,6 +2354,24 @@ function loadDatabase(hooks) { } } } + /** + * Search by exact substring + * @param {Uint8Array|string} name + * @returns {Promise} + */ + async search(name) { + return await (this.searchTree ? this.searchTree.search(name, this) : null); + } + /** + * Search by whole, inexact match + * @param {Uint8Array|string} name + * @returns {AsyncGenerator} + */ + async *searchLev(name) { + if (this.searchTree) { + yield* this.searchTree.searchLev(name, this); + } + } } class Database { @@ -2015,7 +2463,7 @@ function loadDatabase(hooks) { const data_history = []; let canonical = EMPTY_UINT8; /** @type {SearchTree} */ - let tree = new SearchTree( + let tree = new PrefixSearchTree( EMPTY_SEARCH_TREE_BRANCHES, EMPTY_SEARCH_TREE_BRANCHES, EMPTY_UINT8, @@ -2029,8 +2477,8 @@ function loadDatabase(hooks) { * @returns {{ * "cpbranches": Uint8Array, * "csbranches": Uint8Array, - * "might_have_prefix_branches": SearchTreeBranches, - * "branches": SearchTreeBranches, + * "might_have_prefix_branches": SearchTreeBranches, + * "branches": SearchTreeBranches, * "cpnodes": Uint8Array, * "csnodes": Uint8Array, * "consumed_len_bytes": number, @@ -2051,6 +2499,12 @@ function loadDatabase(hooks) { const start_point = i; let cplen; let cslen; + /** + * @type {( + * typeof SearchTreeBranches & + * {"ALPHABITMAP_CHARS": number[], "width": number} + * )?} + */ let alphabitmap = null; if (is_pure_suffixes_only_node) { cplen = 0; @@ -2299,6 +2753,8 @@ function loadDatabase(hooks) { if (is_data_compressed) { data = data_history[data_history.length - dlen - 1]; dlen = data.length; + } else if (is_pure_suffixes_only_node) { + data = EMPTY_UINT8; } else { data = dlen === 0 ? EMPTY_UINT8 : @@ -2319,80 +2775,109 @@ function loadDatabase(hooks) { let whole; let suffix; if (is_pure_suffixes_only_node) { - whole = EMPTY_BITMAP; suffix = input[i] === 0 ? EMPTY_BITMAP1 : new RoaringBitmap(input, i); i += suffix.consumed_len_bytes; - } else if (input[i] === 0xff) { - whole = EMPTY_BITMAP; - suffix = EMPTY_BITMAP1; - i += 1; + tree = new SuffixSearchTree( + branches, + dlen, + suffix, + ); + const clen = ( + 3 + // lengths of children and data + csnodes.length + + csbranches.length + + suffix.consumed_len_bytes + ); + if (canonical.length < clen) { + canonical = new Uint8Array(clen); + } + let ci = 0; + canonical[ci] = 1; + ci += 1; + canonical[ci] = dlen; + ci += 1; + canonical[ci] = input[coffset]; // suffix child count + ci += 1; + canonical.set(csnodes, ci); + ci += csnodes.length; + canonical.set(csbranches, ci); + ci += csbranches.length; + const leavesOffset = i - suffix.consumed_len_bytes; + for (let j = leavesOffset; j < i; j += 1) { + canonical[ci + j - leavesOffset] = input[j]; + } + siphashOfBytes(canonical.subarray(0, clen), 0, 0, 0, 0, hash); } else { - whole = input[i] === 0 ? - EMPTY_BITMAP1 : - new RoaringBitmap(input, i); - i += whole.consumed_len_bytes; - suffix = input[i] === 0 ? - EMPTY_BITMAP1 : - new RoaringBitmap(input, i); - i += suffix.consumed_len_bytes; - } - tree = new SearchTree( - branches, - might_have_prefix_branches, - data, - whole, - suffix, - ); - const clen = ( - (is_pure_suffixes_only_node ? 3 : 4) + // lengths of children and data - dlen + - cpnodes.length + csnodes.length + - cpbranches.length + csbranches.length + - whole.consumed_len_bytes + - suffix.consumed_len_bytes - ); - if (canonical.length < clen) { - canonical = new Uint8Array(clen); - } - let ci = 0; - canonical[ci] = is_pure_suffixes_only_node ? 1 : 0; - ci += 1; - canonical[ci] = dlen; - ci += 1; - canonical.set(data, ci); - ci += dlen; - canonical[ci] = input[coffset]; - ci += 1; - if (!is_pure_suffixes_only_node) { - canonical[ci] = input[coffset + 1]; + if (input[i] === 0xff) { + whole = EMPTY_BITMAP; + suffix = EMPTY_BITMAP1; + i += 1; + } else { + whole = input[i] === 0 ? + EMPTY_BITMAP1 : + new RoaringBitmap(input, i); + i += whole.consumed_len_bytes; + suffix = input[i] === 0 ? + EMPTY_BITMAP1 : + new RoaringBitmap(input, i); + i += suffix.consumed_len_bytes; + } + tree = new PrefixSearchTree( + branches, + might_have_prefix_branches, + data, + whole, + suffix, + ); + const clen = ( + 4 + // lengths of children and data + dlen + + cpnodes.length + csnodes.length + + cpbranches.length + csbranches.length + + whole.consumed_len_bytes + + suffix.consumed_len_bytes + ); + if (canonical.length < clen) { + canonical = new Uint8Array(clen); + } + let ci = 0; + canonical[ci] = 0; ci += 1; + canonical[ci] = dlen; + ci += 1; + canonical.set(data, ci); + ci += data.length; + canonical[ci] = input[coffset]; // prefix child count + ci += 1; + canonical[ci] = input[coffset + 1]; // suffix child count + ci += 1; + canonical.set(cpnodes, ci); + ci += cpnodes.length; + canonical.set(csnodes, ci); + ci += csnodes.length; + canonical.set(cpbranches, ci); + ci += cpbranches.length; + canonical.set(csbranches, ci); + ci += csbranches.length; + const leavesOffset = i - whole.consumed_len_bytes - suffix.consumed_len_bytes; + for (let j = leavesOffset; j < i; j += 1) { + canonical[ci + j - leavesOffset] = input[j]; + } + siphashOfBytes(canonical.subarray(0, clen), 0, 0, 0, 0, hash); } - canonical.set(cpnodes, ci); - ci += cpnodes.length; - canonical.set(csnodes, ci); - ci += csnodes.length; - canonical.set(cpbranches, ci); - ci += cpbranches.length; - canonical.set(csbranches, ci); - ci += csbranches.length; - const leavesOffset = i - whole.consumed_len_bytes - suffix.consumed_len_bytes; - for (let j = leavesOffset; j < i; j += 1) { - canonical[ci + j - leavesOffset] = input[j]; - } - siphashOfBytes(canonical.subarray(0, clen), 0, 0, 0, 0, hash); hash[2] &= 0x7f; } else { // uncompressed node const dlen = input [i + 1]; i += 2; - if (dlen === 0) { + if (dlen === 0 || is_pure_suffixes_only_node) { data = EMPTY_UINT8; } else { data = new Uint8Array(input.buffer, i + input.byteOffset, dlen); + i += dlen; } - i += dlen; const { consumed_len_bytes: branches_consumed_len_bytes, branches, @@ -2427,13 +2912,19 @@ function loadDatabase(hooks) { i - start, ), 0, 0, 0, 0, hash); hash[2] &= 0x7f; - tree = new SearchTree( - branches, - might_have_prefix_branches, - data, - whole, - suffix, - ); + tree = is_pure_suffixes_only_node ? + new SuffixSearchTree( + branches, + dlen, + suffix, + ) : + new PrefixSearchTree( + branches, + might_have_prefix_branches, + data, + whole, + suffix, + ); } hash_history.push({hash: truncatedHash.slice(), used: false}); if (data.length !== 0) { @@ -2454,20 +2945,22 @@ function loadDatabase(hooks) { } j += 1; } - const tree_mhp_branch_nodeids = tree.might_have_prefix_branches.nodeids; - const tree_mhp_branch_subtrees = tree.might_have_prefix_branches.subtrees; - j = 0; - lb = tree.might_have_prefix_branches.subtrees.length; - while (j < lb) { - // node id with a 1 in its most significant bit is inlined, and, so - // it won't be in the stash - if ((tree_mhp_branch_nodeids[j * 6] & 0x80) === 0) { - const subtree = stash.getWithOffsetKey(tree_mhp_branch_nodeids, j * 6); - if (subtree !== undefined) { - tree_mhp_branch_subtrees[j] = Promise.resolve(subtree); + if (tree instanceof PrefixSearchTree) { + const tree_mhp_branch_nodeids = tree.might_have_prefix_branches.nodeids; + const tree_mhp_branch_subtrees = tree.might_have_prefix_branches.subtrees; + j = 0; + lb = tree.might_have_prefix_branches.subtrees.length; + while (j < lb) { + // node id with a 1 in its most significant bit is inlined, and, so + // it won't be in the stash + if ((tree_mhp_branch_nodeids[j * 6] & 0x80) === 0) { + const subtree = stash.getWithOffsetKey(tree_mhp_branch_nodeids, j * 6); + if (subtree !== undefined) { + tree_mhp_branch_subtrees[j] = Promise.resolve(subtree); + } } + j += 1; } - j += 1; } if (i !== l) { stash.set(truncatedHash, tree);