Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
060cda5
first failed test. just an api
mkhludnev Oct 29, 2024
79f2ea5
silly bug fxd
mkhludnev Oct 29, 2024
326fa95
add some pivot randomnes
mkhludnev Oct 30, 2024
847b5b0
horrible impl.
mkhludnev Nov 2, 2024
2a444ec
it works
mkhludnev Nov 3, 2024
a7181db
some cleanup
mkhludnev Nov 5, 2024
64fd288
some cleanup
mkhludnev Nov 6, 2024
26332ca
sweep
mkhludnev Nov 21, 2024
e5168d1
javadoc
mkhludnev Nov 21, 2024
447450b
in the middle of PR feedback
mkhludnev Dec 23, 2024
dee5961
review in progress
mkhludnev Dec 24, 2024
82458d3
- renamed Builder
mkhludnev Dec 26, 2024
957f507
added duel with PointsMultiRange
mkhludnev Dec 26, 2024
b075819
tidy
mkhludnev Dec 26, 2024
63f4edd
expose only builder with java function interface.
mkhludnev Dec 27, 2024
0ac0d09
remove ranges overlaps with sweepline alg.
mkhludnev Jan 1, 2025
7d0c33b
check dvSkipper via index sorter
mkhludnev Jan 1, 2025
e98e4ce
linter
mkhludnev Jan 1, 2025
65bfe20
introduce ordRande tree lookup algm
mkhludnev Jan 1, 2025
0d92caa
sweep
mkhludnev Jan 1, 2025
c97f268
import
mkhludnev Jan 1, 2025
8c7cab8
tudy
mkhludnev Jan 2, 2025
69f26db
bike shedding
mkhludnev Jan 2, 2025
56e2197
remove range tree impl
mkhludnev Jan 10, 2025
fabe8ac
remove sweep line range merge
mkhludnev Feb 1, 2025
67e4953
tidy
mkhludnev Feb 1, 2025
144a14c
Removed fixed width comparator
mkhludnev Feb 14, 2025
5642877
import
mkhludnev Feb 14, 2025
6d096eb
Some testing. Added CHANGES.txt
mkhludnev Feb 16, 2025
221e9bb
Some testing. Added CHANGES.txt
mkhludnev Feb 16, 2025
bb7302f
Some testing. Added CHANGES.txt
mkhludnev Feb 16, 2025
94e9a78
tidy
mkhludnev Feb 16, 2025
9344b8f
tidy
mkhludnev Feb 16, 2025
0ea57f6
move changes entry under 10.2
Feb 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ API Changes
* GITHUB#14134: Added Bits#applyMask API to help apply live docs as a mask on a
bit set of matches. (Adrien Grand)

* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and
* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and
concatenate(Automaton,Automaton) in favor of the methods taking List. (Robert Muir)

New Features
Expand All @@ -59,6 +59,7 @@ New Features
These queries allow for the vector search entry points to be initialized via a `seed` query. This follows
the research provided via https://arxiv.org/abs/2307.16779. (Sean MacAvaney, Ben Trent).

* GITHUB#13974: Introducing DocValuesMultiRangeQuery.SortedSetStabbingBuilder into sandbox. (Mikhail Khludnev)

Improvements
---------------------
Expand Down Expand Up @@ -137,7 +138,7 @@ Other

* GITHUB#14091: Cover all DataType. (Lu Xugang)

* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j
* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j
from 1.7.36 to 2.0.16. (Michael Froh)

* GITHUB#14223 : Fixed a flaky test TestKnnFloatVectorQuery.testFindFewer (Navneet Verma)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.sandbox.search;

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;

/**
* A few query builders for doc values multi range queries.
*
* @lucene.experimental
*/
public final class DocValuesMultiRangeQuery {

/** Not instantiable: this final class only hosts the nested range and builder types. */
private DocValuesMultiRangeQuery() {}

/**
 * Representation of a single clause in a MultiRangeQuery: a pair of lower and upper binary
 * bounds.
 *
 * <p>Both bounds are deep-copied on construction, so later modification of the {@link BytesRef}
 * instances passed in does not affect this range.
 */
public static class ByteRange {
  protected BytesRef lower;
  protected BytesRef upper;

  /**
   * Creates a range holding private copies of the given bounds.
   *
   * @param lowerValue lower bound, must not be null
   * @param upperValue upper bound, must not be null
   * @throws NullPointerException if either bound is null
   */
  public ByteRange(BytesRef lowerValue, BytesRef upperValue) {
    // Fail fast with a descriptive message instead of an anonymous NPE raised while copying.
    this.lower =
        BytesRef.deepCopyOf(Objects.requireNonNull(lowerValue, "lowerValue must not be null"));
    this.upper =
        BytesRef.deepCopyOf(Objects.requireNonNull(upperValue, "upperValue must not be null"));
  }

  /** Two ranges are equal when they have the same runtime class and equal bounds. */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null || getClass() != o.getClass()) {
      return false;
    }
    ByteRange that = (ByteRange) o;
    return lower.equals(that.lower) && upper.equals(that.upper);
  }

  /** Combines the hash codes of both bounds, consistently with {@link #equals(Object)}. */
  @Override
  public int hashCode() {
    int result = lower.hashCode();
    result = 31 * result + upper.hashCode();
    return result;
  }

  /** Renders the range as {@code lower..upper}. */
  @Override
  public String toString() {
    return lower + ".." + upper;
  }
}

/**
 * Builder for a multi-range query that stabs SortedSet or Sorted field values with several byte
 * ranges; for example, matching IPs stored in a doc values field against multiple IP ranges.
 * When exactly one range was added, the built query is {@link
 * SortedSetDocValuesField#newSlowRangeQuery(String, BytesRef, BytesRef, boolean, boolean)} with
 * both boolean arguments set to {@code true}.
 */
public static class SortedSetStabbingBuilder {
  protected final String fieldName;
  protected final List<ByteRange> clauses = new ArrayList<>();

  /**
   * Creates a builder for the given field.
   *
   * @param fieldName name of the doc values field; must not be null
   */
  public SortedSetStabbingBuilder(String fieldName) {
    this.fieldName = Objects.requireNonNull(fieldName);
  }

  // TODO support nulls as min,max boundaries ???
  /**
   * Appends one range; the bounds are copied by {@link ByteRange}.
   *
   * @return this builder, to allow chaining
   */
  public SortedSetStabbingBuilder add(BytesRef lowerValue, BytesRef upperValue) {
    ByteRange clause = new ByteRange(lowerValue, upperValue);
    clauses.add(clause);
    return this;
  }

  /**
   * Builds the query for the ranges added so far: a {@link MatchNoDocsQuery} for no ranges, a
   * plain slow range query for a single range, and the multi-range query otherwise.
   */
  public Query build() {
    return switch (clauses.size()) {
      case 0 -> new MatchNoDocsQuery();
      case 1 -> {
        ByteRange single = clauses.get(0);
        yield SortedSetDocValuesField.newSlowRangeQuery(
            fieldName, single.lower, single.upper, true, true);
      }
      default -> createSortedSetDocValuesMultiRangeQuery();
    };
  }

  /** Creates the multi-range query; subclasses may override to supply another implementation. */
  protected Query createSortedSetDocValuesMultiRangeQuery() {
    return new SortedSetDocValuesMultiRangeQuery(fieldName, clauses);
  }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.sandbox.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesSkipper;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.DocValuesRangeIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongBitSet;

/**
* A union of multiple ranges over a SortedSetDocValuesField.
*
* @lucene.experimental
*/
public class SortedSetDocValuesMultiRangeQuery extends Query {

/**
 * A range of term ordinals described by its two ends. The lower end is fixed at construction;
 * the upper end is mutable and is raised by {@code createOrdRanges} when a later clause is merged
 * into this range.
 */
protected static final class OrdRange {
final long lower;
long upper; // mutable, so equals/hashCode are deliberately not defined on this class

/** Creates a range with the given lower and initial upper ordinal. */
public OrdRange(long lower, long upper) {
this.lower = lower;
this.upper = upper;
}
}

protected final String fieldName;
protected final List<DocValuesMultiRangeQuery.ByteRange> rangeClauses;

/**
 * Creates the query over a private, sorted copy of the given clauses.
 *
 * @param fieldName name of the doc values field to match against
 * @param clauses ranges to match; the list is copied and the copy is ordered by lower bound,
 *     with ties broken by upper bound
 */
protected SortedSetDocValuesMultiRangeQuery(
    String fieldName, List<DocValuesMultiRangeQuery.ByteRange> clauses) {
  this.fieldName = fieldName;
  List<DocValuesMultiRangeQuery.ByteRange> ordered = new ArrayList<>(clauses);
  ordered.sort(
      (a, b) -> {
        int byLower = a.lower.compareTo(b.lower);
        return byLower != 0 ? byLower : a.upper.compareTo(b.upper);
      });
  this.rangeClauses = ordered;
}

/**
 * Renders the sorted range clauses, prefixed with {@code fieldName:} unless the query field equals
 * {@code fld}.
 */
@Override
public String toString(String fld) {
  StringBuilder text = new StringBuilder();
  if (!Objects.equals(fieldName, fld)) {
    text.append(fieldName).append(":");
  }
  return text.append(rangeClauses).toString();
}

/**
 * Creates a {@code MultiRangeWeight} carrying the given boost and score mode. The {@code searcher}
 * argument is not used by this implementation.
 */
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
return new MultiRangeWeight(boost, scoreMode);
}

/**
 * Translates {@linkplain #rangeClauses} into ordinal ranges of the given doc values, merging
 * ranges that overlap or are adjacent. Caveat: a range that was already added to {@code
 * ordRanges} may have its upper end raised afterwards, when a following clause is merged into it.
 *
 * @param values doc values to lookup ordinals
 * @param ordRanges destination collection for ord ranges
 * @throws IOException if the terms dictionary lookup fails
 */
protected void createOrdRanges(SortedSetDocValues values, Collection<OrdRange> ordRanges)
    throws IOException {
  TermsEnum terms = values.termsEnum();
  OrdRange last = null;
  for (DocValuesMultiRangeQuery.ByteRange clause : rangeClauses) {
    if (terms.seekCeil(clause.lower) == TermsEnum.SeekStatus.END) {
      // The constructor orders clauses by lower bound, so every remaining clause starts past
      // the last term as well.
      break;
    }
    long from = terms.ord();

    TermsEnum.SeekStatus upperStatus = terms.seekCeil(clause.upper);
    long to;
    if (upperStatus == TermsEnum.SeekStatus.END) {
      // Upper bound lies beyond the last term: clamp to the highest ordinal.
      to = values.getValueCount() - 1;
    } else if (upperStatus == TermsEnum.SeekStatus.FOUND) {
      to = terms.ord();
    } else { // NOT_FOUND: positioned on the first term greater than the upper bound
      long ceiling = terms.ord();
      if (ceiling == 0) {
        continue; // this range is before values.
      }
      to = ceiling - 1;
    }

    if (from > to) {
      continue; // no term falls inside this clause
    }
    if (last != null && last.upper >= from - 1) {
      // Adjacent or overlapping: extend the range that is already in the destination.
      last.upper = Math.max(to, last.upper);
    } else {
      last = new OrdRange(from, to);
      ordRanges.add(last);
    }
  }
}

/** Visits this query as a leaf, provided the visitor accepts the queried field. */
@Override
public void visit(QueryVisitor visitor) {
  if (!visitor.acceptField(fieldName)) {
    return;
  }
  visitor.visitLeaf(this);
}

/** Queries are equal when they share the runtime class, the field name and the sorted clauses. */
@Override
public boolean equals(Object o) {
  if (this == o) {
    return true;
  }
  if (o == null || getClass() != o.getClass()) {
    return false;
  }
  SortedSetDocValuesMultiRangeQuery other = (SortedSetDocValuesMultiRangeQuery) o;
  if (!Objects.equals(fieldName, other.fieldName)) {
    return false;
  }
  return Objects.equals(rangeClauses, other.rangeClauses);
}

/** Hashes the field name and the sorted clauses, consistently with {@link #equals(Object)}. */
@Override
public int hashCode() {
  // Same 31-based accumulation (seeded with 1) that Objects.hash performs over both values.
  int hash = 31 + Objects.hashCode(fieldName);
  return 31 * hash + Objects.hashCode(rangeClauses);
}

/**
 * Weight for {@linkplain SortedSetDocValuesMultiRangeQuery}. Per segment it resolves the range
 * clauses to term ordinals and matches documents that have at least one ordinal inside a
 * resolved range.
 */
protected class MultiRangeWeight extends ConstantScoreWeight {
  final ScoreMode scoreMode;

  /** Creates a weight for the enclosing query with the given boost and score mode. */
  public MultiRangeWeight(float boost, ScoreMode scoreMode) {
    super(SortedSetDocValuesMultiRangeQuery.this, boost);
    this.scoreMode = scoreMode;
  }

  /**
   * Returns a supplier over the segment's sorted-set doc values, or {@code null} when the segment
   * has no field info for the queried field.
   */
  @Override
  public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
    if (context.reader().getFieldInfos().fieldInfo(fieldName) == null) {
      return null;
    }
    SortedSetDocValues values = DocValues.getSortedSet(context.reader(), fieldName);
    return new MultiRangeScorerSupplier(values, context);
  }

  // TODO perhaps count() specification?

  /** Delegates the cacheability decision to {@link DocValues#isCacheable}. */
  @Override
  public boolean isCacheable(LeafReaderContext ctx) {
    return DocValues.isCacheable(ctx, fieldName);
  }

  /** Scorer supplier for {@linkplain SortedSetDocValuesMultiRangeQuery} */
  protected class MultiRangeScorerSupplier extends ScorerSupplier {
    final SortedSetDocValues values;
    protected final LeafReaderContext context;

    /** Creates a supplier over the given segment doc values and leaf context. */
    public MultiRangeScorerSupplier(SortedSetDocValues values, LeafReaderContext context) {
      this.values = values;
      this.context = context;
    }

    /**
     * Resolves the ordinal ranges for this segment and builds a constant-score scorer over them.
     * Returns an empty scorer when no clause resolves to an ordinal range, or when the doc values
     * skipper shows that the segment's ordinals lie entirely outside the overall span.
     */
    @Override
    public Scorer get(long leadCost) throws IOException {
      List<OrdRange> ordRanges = new ArrayList<>();
      createOrdRanges(values, ordRanges);
      if (ordRanges.isEmpty()) {
        return empty();
      }
      // createOrdRanges yields ascending, merged ranges, so the first and last one delimit the
      // overall span of matching ordinals.
      final long minOrd = ordRanges.getFirst().lower;
      final long maxOrd = ordRanges.getLast().upper;

      DocValuesSkipper skipper = context.reader().getDocValuesSkipper(fieldName);
      if (skipper != null && (minOrd > skipper.maxValue() || maxOrd < skipper.minValue())) {
        return empty();
      }

      // With a single range the span check alone decides a match; with several ranges a bit set
      // (shifted by minOrd) marks the ordinals that fall inside a range rather than in a gap.
      final LongBitSet matchingOrdsShifted;
      if (ordRanges.size() > 1) {
        matchingOrdsShifted = new LongBitSet(maxOrd + 1 - minOrd);
        for (OrdRange range : ordRanges) {
          matchingOrdsShifted.set(
              range.lower - minOrd, range.upper - minOrd + 1); // up is exclusive
        }
      } else {
        matchingOrdsShifted = null;
      }

      TwoPhaseIterator iterator =
          new TwoPhaseIterator(values) {
            // TODO unwrap singleton?
            @Override
            public boolean matches() throws IOException {
              for (int i = 0; i < values.docValueCount(); i++) {
                long ord = values.nextOrd();
                if (ord < minOrd) {
                  continue;
                }
                if (ord > maxOrd) {
                  // Ordinals of a document are returned in increasing order, so none of the
                  // remaining ones can fall inside the span either.
                  return false;
                }
                if (matchingOrdsShifted == null // singleton
                    || matchingOrdsShifted.get(ord - minOrd)) {
                  return true;
                }
              }
              return false;
            }

            @Override
            public float matchCost() {
              return 2; // 2 comparisons
            }
          };
      if (skipper != null) {
        // NOTE(review): the last argument presumably tells the iterator that the
        // [minOrd, maxOrd] span has gaps (several ranges) - confirm against
        // DocValuesRangeIterator.
        iterator =
            new DocValuesRangeIterator(
                iterator, skipper, minOrd, maxOrd, matchingOrdsShifted != null);
      }
      return new ConstantScoreScorer(score(), scoreMode, iterator);
    }

    /** Returns a scorer that matches no documents. */
    protected ConstantScoreScorer empty() {
      return new ConstantScoreScorer(score(), scoreMode, DocIdSetIterator.empty());
    }

    /** Returns the cost reported by the underlying doc values. */
    @Override
    public long cost() {
      return values.cost();
    }
  }
}
}
Loading