diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 44c045a0b6ec..828f46c35cd7 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -49,7 +49,7 @@ API Changes * GITHUB#14134: Added Bits#applyMask API to help apply live docs as a mask on a bit set of matches. (Adrien Grand) -* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and +* GITHUB#14209: Deprecate Operations.union(Automaton,Automaton) and concatenate(Automaton,Automaton) in favor of the methods taking List. (Robert Muir) New Features @@ -59,6 +59,7 @@ New Features These queries allow for the vector search entry points to be initialized via a `seed` query. This follows the research provided via https://arxiv.org/abs/2307.16779. (Sean MacAvaney, Ben Trent). +* GITHUB#13974: Introducing DocValuesMultiRangeQuery.SortedSetStabbingBuilder into sandbox. (Mikhail Khludnev) Improvements --------------------- @@ -137,7 +138,7 @@ Other * GITHUB#14091: Cover all DataType. (Lu Xugang) -* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j +* GITHUB#14130: Upgrade OpenNLP from 2.3.2 to 2.5.3, which transitively upgrades Slf4j from 1.7.36 to 2.0.16. (Michael Froh) * GITHUB#14223 : Fixed a flaky test TestKnnFloatVectorQuery.testFindFewer (Navneet Verma) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/DocValuesMultiRangeQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/DocValuesMultiRangeQuery.java new file mode 100644 index 000000000000..943a048054b9 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/DocValuesMultiRangeQuery.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.search; + +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.search.*; +import org.apache.lucene.util.BytesRef; + +/** + * A few query builders for doc values multi range queries. + * + * @lucene.experimental + */ +public final class DocValuesMultiRangeQuery { + + private DocValuesMultiRangeQuery() {} + + /** Representation of a single clause in a MultiRangeQuery */ + public static class ByteRange { + protected BytesRef lower; + protected BytesRef upper; + + /** copies ByteRefs passed */ + public ByteRange(BytesRef lowerValue, BytesRef upperValue) { + this.lower = BytesRef.deepCopyOf(lowerValue); + this.upper = BytesRef.deepCopyOf(upperValue); + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + ByteRange that = (ByteRange) o; + return lower.equals(that.lower) && upper.equals(that.upper); + } + + @Override + public int hashCode() { + int result = lower.hashCode(); + result = 31 * result + upper.hashCode(); + return result; + } + + @Override + public String toString() { + return lower + ".." + upper; + } + } + + /** + * Builder for creating a multi-range query for stabbing by SortedSet or Sorted field values. For + * example, it matches IPs in docvalues field by multiple IP ranges. For the single range it + * behaves like {@link SortedSetDocValuesField#newSlowRangeQuery(String, BytesRef, BytesRef, + * boolean, boolean)} with both true arguments + */ + public static class SortedSetStabbingBuilder { + protected final String fieldName; + protected final List clauses = new ArrayList<>(); + + public SortedSetStabbingBuilder(String fieldName) { + this.fieldName = Objects.requireNonNull(fieldName); + } + + // TODO support nulls as min,max boundaries ??? + public SortedSetStabbingBuilder add(BytesRef lowerValue, BytesRef upperValue) { + clauses.add(new ByteRange(lowerValue, upperValue)); + return this; + } + + public Query build() { + if (clauses.isEmpty()) { + return new MatchNoDocsQuery(); + } + if (clauses.size() == 1) { + ByteRange theOnlyOne = clauses.getFirst(); + return SortedSetDocValuesField.newSlowRangeQuery( + fieldName, theOnlyOne.lower, theOnlyOne.upper, true, true); + } + return createSortedSetDocValuesMultiRangeQuery(); + } + + protected Query createSortedSetDocValuesMultiRangeQuery() { + return new SortedSetDocValuesMultiRangeQuery(fieldName, clauses); + } + } +} diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/SortedSetDocValuesMultiRangeQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/SortedSetDocValuesMultiRangeQuery.java new file mode 100644 index 000000000000..34ae3dcee26b --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/search/SortedSetDocValuesMultiRangeQuery.java @@ -0,0 +1,257 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.List; +import java.util.Objects; +import org.apache.lucene.index.DocValues; +import org.apache.lucene.index.DocValuesSkipper; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.SortedSetDocValues; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.ConstantScoreScorer; +import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.DocValuesRangeIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryVisitor; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.TwoPhaseIterator; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LongBitSet; + +/** + * A union multiple ranges over SortedSetDocValuesField + * + * @lucene.experimental + */ +public class SortedSetDocValuesMultiRangeQuery extends Query { + + /** A range for ordinal ends. */ + protected static final class OrdRange { + final long lower; + long upper; // mutable field, can't afford equals hashcode here + + public OrdRange(long lower, long upper) { + this.lower = lower; + this.upper = upper; + } + } + + protected final String fieldName; + protected final List rangeClauses; + + protected SortedSetDocValuesMultiRangeQuery( + String fieldName, List clauses) { + this.fieldName = fieldName; + ArrayList sortedClauses = new ArrayList<>(clauses); + sortedClauses.sort( + Comparator.comparing(r -> r.lower) + .thenComparing(r -> r.upper)); + this.rangeClauses = sortedClauses; + } + + @Override + public String toString(String fld) { + return (Objects.equals(fieldName, fld) ? "" : fieldName + ":") + rangeClauses; + } + + @Override + public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) + throws IOException { + return new MultiRangeWeight(boost, scoreMode); + } + + /** + * Resolves ordinals for {@linkplain #rangeClauses}. Caveat: sometimes it updates ranges after + * inserting + * + * @param values doc values to lookup ordinals + * @param ordRanges destination collection for ord ranges + */ + protected void createOrdRanges(SortedSetDocValues values, Collection ordRanges) + throws IOException { + TermsEnum termsEnum = values.termsEnum(); + OrdRange previous = null; + clauses: + for (DocValuesMultiRangeQuery.ByteRange range : rangeClauses) { + TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(range.lower); + long lowerOrd = -1; + switch (seekStatus) { + case TermsEnum.SeekStatus.END: + break clauses; + case FOUND, NOT_FOUND: + lowerOrd = termsEnum.ord(); + } + seekStatus = termsEnum.seekCeil(range.upper); + long upperOrd = -1; + switch (seekStatus) { + case TermsEnum.SeekStatus.END: + upperOrd = values.getValueCount() - 1; + break; + case FOUND: + upperOrd = termsEnum.ord(); + break; + case NOT_FOUND: + if (termsEnum.ord() == 0) { + continue; // this range is before values. + } + upperOrd = termsEnum.ord() - 1; + } + if (lowerOrd <= upperOrd) { // otherwise ignore + if (previous == null || previous.upper < lowerOrd - 1) { // standing out of previous + ordRanges.add(previous = new OrdRange(lowerOrd, upperOrd)); + } else { // adjacent or overlap + previous.upper = + Math.max(upperOrd, previous.upper); // update one. which was yield. danger + } + } + } + } + + @Override + public void visit(QueryVisitor visitor) { + if (visitor.acceptField(fieldName)) { + visitor.visitLeaf(this); + } + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + SortedSetDocValuesMultiRangeQuery that = (SortedSetDocValuesMultiRangeQuery) o; + return Objects.equals(fieldName, that.fieldName) + && Objects.equals(rangeClauses, that.rangeClauses); + } + + @Override + public int hashCode() { + return Objects.hash(fieldName, rangeClauses); + } + + /** Weight for {@linkplain SortedSetDocValuesMultiRangeQuery} */ + protected class MultiRangeWeight extends ConstantScoreWeight { + final ScoreMode scoreMode; + + public MultiRangeWeight(float boost, ScoreMode scoreMode) { + super(SortedSetDocValuesMultiRangeQuery.this, boost); + this.scoreMode = scoreMode; + } + + @Override + public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { + if (context.reader().getFieldInfos().fieldInfo(fieldName) == null) { + return null; + } + SortedSetDocValues values = DocValues.getSortedSet(context.reader(), fieldName); + + return new MultiRangeScorerSupplier(values, context); + } + + // TODO perhaps count() specification? + + @Override + public boolean isCacheable(LeafReaderContext ctx) { + return DocValues.isCacheable(ctx, fieldName); + } + + /** Scorer supplier for {@linkplain SortedSetDocValuesMultiRangeQuery} */ + protected class MultiRangeScorerSupplier extends ScorerSupplier { + final SortedSetDocValues values; + protected final LeafReaderContext context; + + public MultiRangeScorerSupplier(SortedSetDocValues values, LeafReaderContext context) { + this.values = values; + this.context = context; + } + + @Override + public Scorer get(long leadCost) throws IOException { + List ordRanges = new ArrayList<>(); + createOrdRanges(values, ordRanges); + if (ordRanges.isEmpty()) { + return empty(); + } + LongBitSet matchingOrdsShifted = null; + long minOrd = ordRanges.getFirst().lower, maxOrd = ordRanges.getLast().upper; + + DocValuesSkipper skipper = context.reader().getDocValuesSkipper(fieldName); + + if (skipper != null && (minOrd > skipper.maxValue() || maxOrd < skipper.minValue())) { + return empty(); + } + + if (ordRanges.size() > 1) { + matchingOrdsShifted = new LongBitSet(maxOrd + 1 - minOrd); + for (OrdRange range : ordRanges) { + matchingOrdsShifted.set( + range.lower - minOrd, range.upper - minOrd + 1); // up is exclusive + } + } + TwoPhaseIterator iterator; + LongBitSet finalMatchingOrdsShifted = matchingOrdsShifted; + iterator = + new TwoPhaseIterator(values) { + // TODO unwrap singleton? + @Override + public boolean matches() throws IOException { + for (int i = 0; i < values.docValueCount(); i++) { + long ord = values.nextOrd(); + if (ord >= minOrd && ord <= maxOrd) { + if (finalMatchingOrdsShifted == null // singleton + || finalMatchingOrdsShifted.get(ord - minOrd)) { + return true; + } + } + } + return false; + } + + @Override + public float matchCost() { + return 2; // 2 comparisons + } + }; + // } + if (skipper != null) { + iterator = + new DocValuesRangeIterator( + iterator, skipper, minOrd, maxOrd, matchingOrdsShifted != null); + } + return new ConstantScoreScorer(score(), scoreMode, iterator); + } + + protected ConstantScoreScorer empty() { + return new ConstantScoreScorer(score(), scoreMode, DocIdSetIterator.empty()); + } + + @Override + public long cost() { + return values.cost(); + } + } + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestInetAddrSsDvMultiRangeQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestInetAddrSsDvMultiRangeQuery.java new file mode 100644 index 000000000000..7ab0a6eca735 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestInetAddrSsDvMultiRangeQuery.java @@ -0,0 +1,245 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.search; + +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; +import java.util.Set; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.InetAddressPoint; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +public class TestInetAddrSsDvMultiRangeQuery extends LuceneTestCase { + /** Add a single address and search for it */ + public void testBasics() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + // add a doc with an address + Document document = new Document(); + SortedSetDocValuesField field = getIpField("field", new byte[] {1, 2, 3, 4}); + document.add(field); + writer.addDocument(document); + + // search and verify we found our doc + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + + Query q = + rangeQuery( + "field", + InetAddress.getByAddress(new byte[] {1, 2, 3, 3}), + InetAddress.getByAddress(new byte[] {1, 2, 3, 5}), + InetAddress.getByAddress( + new byte[] {127, 2, 3, 3}), // bogus range to avoid optimization + InetAddress.getByAddress(new byte[] {127, 2, 3, 5})); + assertEquals(1, searcher.count(q)); + // assertEquals(1, searcher.count(InetAddressPoint.newPrefixQuery("field", address, + // 24))); + // assertEquals( + // 1, + // searcher.count( + // InetAddressPoint.newRangeQuery( + // "field", InetAddress.getByName("1.2.3.3"), + // InetAddress.getByName("1.2.3.5")))); + // assertEquals( + // 1, searcher.count(InetAddressPoint.newSetQuery("field", + // InetAddress.getByName("1.2.3.4")))); + // assertEquals( + // 1, + // searcher.count( + // InetAddressPoint.newSetQuery( + // "field", InetAddress.getByName("1.2.3.4"), + // InetAddress.getByName("1.2.3.5")))); + // assertEquals( + // 0, searcher.count(InetAddressPoint.newSetQuery("field", + // InetAddress.getByName("1.2.3.3")))); + // assertEquals(0, searcher.count(InetAddressPoint.newSetQuery("field"))); + + reader.close(); + writer.close(); + dir.close(); + } + + public void testRandom() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + + int docs = 0; + List pivotIps = new ArrayList<>(); + // add a doc with an address + for (int doc = 0; doc < atLeast(100); doc++) { + Document document = new Document(); + // System.out.print("doc #"+doc+" "); + for (int fld = 0; fld < atLeast(1); fld++) { + byte[] ip = getRandomIpBytes(); + SortedSetDocValuesField field = getIpField("field", ip); + document.add(field); + // System.out.print(field+", "); + // add nearby points + for (int delta : Arrays.asList(0, 1, 2, -1, -2)) { + byte[] inc = ip.clone(); + inc[3] = (byte) (delta + inc[3]); + pivotIps.add(inc); + } + } + // System.out.println(); + writer.addDocument(document); + docs++; + } + + // search and verify we found our doc + IndexReader reader = writer.getReader(); + IndexSearcher searcher = newSearcher(reader); + // List ranges = new ArrayList<>(); + + Supplier pivotIpsStream = + new Supplier<>() { + Iterator iter = pivotIps.iterator(); + + @Override + public byte[] get() { + if (!iter.hasNext()) { + iter = pivotIps.iterator(); + } + return iter.next(); + } + }; + for (int pass = 0; pass < atLeast(10); pass++) { + BooleanQuery.Builder bq = new BooleanQuery.Builder(); + ArrayUtil.ByteArrayComparator comparator = ArrayUtil.getUnsignedComparator(4); + DocValuesMultiRangeQuery.SortedSetStabbingBuilder qbuilder = + new DocValuesMultiRangeQuery.SortedSetStabbingBuilder("field"); + for (int q = 0; q < atLeast(10); q++) { + byte[] alfa = random().nextBoolean() ? getRandomIpBytes() : pivotIpsStream.get(); + byte[] beta = random().nextBoolean() ? getRandomIpBytes() : pivotIpsStream.get(); + if (comparator.compare(alfa, 0, beta, 0) > 0) { + byte[] swap = beta; + beta = alfa; + alfa = swap; + } + // ranges.add(InetAddress.getByAddress(alfa)); + // ranges.add(InetAddress.getByAddress(beta)); + qbuilder.add( + new BytesRef(InetAddressPoint.encode(InetAddress.getByAddress(alfa))), + new BytesRef(InetAddressPoint.encode(InetAddress.getByAddress(beta)))); + + bq.add( + SortedSetDocValuesField.newSlowRangeQuery( + "field", + new BytesRef(InetAddressPoint.encode(InetAddress.getByAddress(alfa))), + new BytesRef(InetAddressPoint.encode(InetAddress.getByAddress(beta))), + true, + true), + BooleanClause.Occur.SHOULD); + } + // InetAddress[] addr = ranges.toArray(new InetAddress[0]); + Query multiRange = qbuilder.build(); + long cnt; + BooleanQuery orRanges = bq.build(); + if (pass == 0) { + continue; + } + TopDocs boolRes; + // System.out.println(Arrays.toString(( + boolRes = searcher.search(orRanges, 1000); // ).scoreDocs)); + + Set boolDocs = + Stream.of(boolRes.scoreDocs).map((sd) -> sd.doc).collect(Collectors.toSet()); + TopDocs mulRes; + // System.out.println(Arrays.toString(( + mulRes = searcher.search(multiRange, 1000); // ).scoreDocs)); + Set mulDocs = + Stream.of(mulRes.scoreDocs).map((sd) -> sd.doc).collect(Collectors.toSet()); + Set falsePos = new HashSet<>(mulDocs); + falsePos.removeAll(boolDocs); + if (!falsePos.isEmpty()) { + System.out.println("false pos:" + falsePos); + } + Set falseNeg = new HashSet<>(boolDocs); + falseNeg.removeAll(mulDocs); + if (!falseNeg.isEmpty()) { + System.out.println("false neg:" + falseNeg); + } + assertEquals(cnt = boolRes.totalHits.value(), mulRes.totalHits.value()); + System.out.printf(Locale.ROOT, "found %d of %d\n", cnt, docs); + } + reader.close(); + writer.close(); + dir.close(); + } + + private static byte[] getRandomIpBytes() { + return new byte[] { + (byte) random().nextInt(256), + (byte) random().nextInt(256), + (byte) random().nextInt(256), + (byte) random().nextInt(256) + }; + } + + private static SortedSetDocValuesField getIpField(String field, byte[] ip) + throws UnknownHostException { + return new SortedSetDocValuesField( + field, new BytesRef(InetAddressPoint.encode(InetAddress.getByAddress(ip)))); + } + + private static Query rangeQuery(String field, InetAddress... addr) { + DocValuesMultiRangeQuery.SortedSetStabbingBuilder qbuilder = + new DocValuesMultiRangeQuery.SortedSetStabbingBuilder(field); + for (int i = 0; i < addr.length; i += 2) { + qbuilder.add( + new BytesRef(InetAddressPoint.encode(addr[i])), + new BytesRef(InetAddressPoint.encode(addr[i + 1]))); + } + return qbuilder.build(); + } + + public static byte[] concatenateByteArrays(byte[] array1, byte[] array2) { + // Step 1: Create a new byte array with the combined length of both input arrays + byte[] result = new byte[array1.length + array2.length]; + + // Step 2: Copy the first array into the result array + System.arraycopy(array1, 0, result, 0, array1.length); + + // Step 3: Copy the second array into the result array, starting from the end of the first array + System.arraycopy(array2, 0, result, array1.length, array2.length); + + // Step 4: Return the concatenated byte array + return result; + } +} diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestSsDvMultiRangeQuery.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestSsDvMultiRangeQuery.java new file mode 100644 index 000000000000..2bdecf033252 --- /dev/null +++ b/lucene/sandbox/src/test/org/apache/lucene/sandbox/search/TestSsDvMultiRangeQuery.java @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.search; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.IntPoint; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.sandbox.document.LongPointMultiRangeBuilder; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.Weight; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.index.RandomIndexWriter; +import org.apache.lucene.tests.search.QueryUtils; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; + +public class TestSsDvMultiRangeQuery extends LuceneTestCase { + private Codec getCodec() { + // small interval size to test with many intervals + return TestUtil.alwaysDocValuesFormat(new Lucene90DocValuesFormat(random().nextInt(4, 16))); + } + + public void testDuelWithStandardDisjunction() throws IOException { + int iterations = LuceneTestCase.TEST_NIGHTLY ? atLeast(100) : 10; + for (int iter = 0; iter < iterations; iter++) { + Directory dir = newDirectory(); + final RandomIndexWriter w; + + int dims = 1; + boolean singleton = random().nextBoolean(); + boolean sortedIndex = random().nextBoolean(); + if (!sortedIndex) { + w = new RandomIndexWriter(random(), dir); + } else { + IndexWriterConfig config = new IndexWriterConfig().setCodec(getCodec()); + config.setIndexSort( + new Sort(new SortField("docVal", SortField.Type.STRING, random().nextBoolean()))); + w = new RandomIndexWriter(random(), dir); + } + + long[] scratch = new long[dims]; + for (int i = 0; i < (LuceneTestCase.TEST_NIGHTLY ? atLeast(1000) : 100); i++) { + int numPoints = singleton ? 1 : RandomNumbers.randomIntBetween(random(), 1, 10); + Document doc = new Document(); + for (int j = 0; j < numPoints; j++) { + for (int v = 0; v < dims; v++) { + scratch[v] = RandomNumbers.randomLongBetween(random(), 0, atLeast(100)); + } + doc.add(new LongPoint("point", scratch)); + if (singleton) { + if (sortedIndex) { + doc.add(SortedDocValuesField.indexedField("docVal", LongPoint.pack(scratch))); + } else { + doc.add(new SortedDocValuesField("docVal", LongPoint.pack(scratch))); + } + } else { + if (sortedIndex) { + doc.add(SortedSetDocValuesField.indexedField("docVal", LongPoint.pack(scratch))); + } else { + doc.add(new SortedSetDocValuesField("docVal", LongPoint.pack(scratch))); + } + } + } + w.addDocument(doc); + if (rarely()) { + w.commit(); // segmenting to check index sorter. + } + } + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + + int numRanges = RandomNumbers.randomIntBetween(random(), 1, 20); + LongPointMultiRangeBuilder builder1 = new LongPointMultiRangeBuilder("point", dims); + BooleanQuery.Builder builder2 = new BooleanQuery.Builder(); + DocValuesMultiRangeQuery.SortedSetStabbingBuilder builder3 = + new DocValuesMultiRangeQuery.SortedSetStabbingBuilder("docVal"); + + for (int i = 0; i < numRanges; i++) { + long[] lower = new long[dims]; + long[] upper = new long[dims]; + for (int j = 0; j < dims; j++) { + lower[j] = RandomNumbers.randomLongBetween(random(), -100, 200); + upper[j] = lower[j] + RandomNumbers.randomLongBetween(random(), 0, 100); + } + builder1.add(lower, upper); + builder2.add(LongPoint.newRangeQuery("point", lower, upper), BooleanClause.Occur.SHOULD); + builder3.add(LongPoint.pack(lower), LongPoint.pack(upper)); + } + + Query query1 = builder1.build(); + Query query2 = builder2.build(); + Query query3 = builder3.build(); + TopDocs result1 = searcher.search(query1, 100, Sort.INDEXORDER); + TopDocs result2 = searcher.search(query2, 100, Sort.INDEXORDER); + TopDocs result3 = searcher.search(query3, 100, Sort.INDEXORDER); + assertEquals(result2.totalHits, result1.totalHits); + assertEquals(result2.totalHits, result3.totalHits); + assertEquals(result2.scoreDocs.length, result1.scoreDocs.length); + assertEquals(result2.scoreDocs.length, result3.scoreDocs.length); + for (int i = 0; i < result2.scoreDocs.length; i++) { + assertEquals(result2.scoreDocs[i].doc, result1.scoreDocs[i].doc); + assertEquals(result2.scoreDocs[i].doc, result3.scoreDocs[i].doc); + } + + IOUtils.close(reader, w, dir); + } + } + + public void testEquals() { + Query q1 = mrSsDvQ("foo", 3, 5, 7, 9); + QueryUtils.checkEqual(q1, mrSsDvQ("foo", 3, 5, 7, 9)); + QueryUtils.checkEqual(q1, mrSsDvQ("foo", 7, 9, 3, 5)); + QueryUtils.checkUnequal(q1, mrSsDvQ("foo", 7, 9, 5, 3)); + QueryUtils.checkUnequal(q1, mrSsDvQ("foo", 3, 5 + 1, 7, 9)); + QueryUtils.checkUnequal(q1, mrSsDvQ("foo", 3, 5, 7 + 1, 9)); + QueryUtils.checkUnequal(q1, mrSsDvQ("bar", 3, 5, 7, 9)); + } + + private Query mrSsDvQ(String field, int... ends) { + DocValuesMultiRangeQuery.SortedSetStabbingBuilder b = mrSsDvBuilder(field, ends); + return b.build(); + } + + private static DocValuesMultiRangeQuery.SortedSetStabbingBuilder mrSsDvBuilder( + String field, int... ends) { + DocValuesMultiRangeQuery.SortedSetStabbingBuilder b = + new DocValuesMultiRangeQuery.SortedSetStabbingBuilder(field); + for (int j = 0; j < ends.length; j += 2) { + b.add(IntPoint.pack(ends[j]), IntPoint.pack(ends[j + 1])); + } + return b; + } + + public void testToString() { + Query q1 = mrSsDvQ("foo", 3, 5, 7, 9); + assertEquals("foo:[[80 0 0 3]..[80 0 0 5], [80 0 0 7]..[80 0 0 9]]", q1.toString()); + assertEquals("[[80 0 0 3]..[80 0 0 5], [80 0 0 7]..[80 0 0 9]]", q1.toString("foo")); + assertEquals("foo:[[80 0 0 3]..[80 0 0 5], [80 0 0 7]..[80 0 0 9]]", q1.toString("bar")); + } + + public void testOverrideToString() { + DocValuesMultiRangeQuery.SortedSetStabbingBuilder b = + new DocValuesMultiRangeQuery.SortedSetStabbingBuilder("foo") { + @Override + protected Query createSortedSetDocValuesMultiRangeQuery() { + return new SortedSetDocValuesMultiRangeQuery(fieldName, clauses) { + @Override + public String toString(String fld) { + return fieldName + " " + rangeClauses.size(); + } + }; + } + }; + b.add(IntPoint.pack(1), IntPoint.pack(2)); + b.add(IntPoint.pack(3), IntPoint.pack(4)); + assertEquals("foo 2", b.build().toString()); + + DocValuesMultiRangeQuery.ByteRange myrange = + new DocValuesMultiRangeQuery.ByteRange(IntPoint.pack(1), IntPoint.pack(2)) { + @Override + public String toString() { + return IntPoint.decodeDimension(lower.bytes, 0) + + " " + + IntPoint.decodeDimension(upper.bytes, 0); + } + }; + assertEquals("1 2", myrange.toString()); + } + + public void testMissingField() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir); + iw.addDocument(new Document()); + IndexReader reader = iw.getReader(); + iw.close(); + IndexSearcher searcher = newSearcher(reader); + for (Query query : Collections.singletonList(mrSsDvQ("foo", 1, 2))) { + Weight w = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE, 1); + assertNull(w.scorer(searcher.getIndexReader().leaves().getFirst())); + } + reader.close(); + dir.close(); + } + + public void testEdgeCases() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir); + final Document doc1 = new Document(); + doc1.add(new SortedSetDocValuesField("foo", IntPoint.pack(1))); + iw.addDocument(doc1); + final Document doc2 = new Document(); + doc2.add(new SortedSetDocValuesField("foo", IntPoint.pack(10))); + iw.addDocument(doc2); + + IndexReader reader = iw.getReader(); + iw.close(); + IndexSearcher searcher = newSearcher(reader); + for (DocValuesMultiRangeQuery.SortedSetStabbingBuilder builder : + List.of( + mrSsDvBuilder("foo", 2, 3, 4, 5, -5, -2), mrSsDvBuilder("foo", 2, 3, 4, 5, 12, 15))) { + assertEquals("no match", 0, searcher.search(builder.build(), 1).totalHits.value()); + BytesRef lower; + BytesRef upper; + builder.add(lower = IntPoint.pack(100), upper = IntPoint.pack(200)); + assertEquals("no match", 0, searcher.search(builder.build(), 1).totalHits.value()); + lower.bytes = IntPoint.pack(1).bytes; + upper.bytes = IntPoint.pack(10).bytes; + assertEquals( + "updating bytes changes nothing", + 0, + searcher.search(builder.build(), 1).totalHits.value()); + builder.add(lower, upper); + assertEquals( + "sanity check for potential match", + 2, + searcher.search(builder.build(), 1).totalHits.value()); + } + // hit by value as a range upper==lower + TopDocs hit1 = searcher.search(mrSsDvQ("foo", 2, 3, 4, 5, -5, -2, 1, 1), 1); + TopDocs hit10 = searcher.search(mrSsDvQ("foo", 2, 3, 4, 5, -5, -2, 10, 10), 1); + assertEquals(1, hit1.totalHits.value()); + assertEquals(1, hit10.totalHits.value()); + assertNotEquals(hit1.scoreDocs[0].doc, hit10.scoreDocs[0].doc); + reader.close(); + dir.close(); + } +}