
Commit 5f5f54d

javadoc and code refactor

Signed-off-by: Rishabh Maurya <[email protected]>
1 parent 25fecf3 commit 5f5f54d

File tree: 4 files changed, +127 -62 lines changed

server/src/main/java/org/opensearch/index/mapper/DerivedFieldMapper.java

Lines changed: 16 additions & 6 deletions

@@ -83,12 +83,22 @@ public DerivedFieldMapper build(BuilderContext context) {
                 type.getValue(),
                 name
             );
-            DerivedFieldType ft = new DerivedFieldType(
-                new DerivedField(buildFullName(context), type.getValue(), script.getValue(), sourceIndexedField.getValue()),
-                fieldMapper,
-                fieldFunction,
-                indexAnalyzers
-            );
+            DerivedFieldType ft;
+            if (name.contains(".")) {
+                ft = new DerivedObjectFieldType(
+                    new DerivedField(buildFullName(context), type.getValue(), script.getValue(), sourceIndexedField.getValue()),
+                    fieldMapper,
+                    fieldFunction,
+                    indexAnalyzers
+                );
+            } else {
+                ft = new DerivedFieldType(
+                    new DerivedField(buildFullName(context), type.getValue(), script.getValue(), sourceIndexedField.getValue()),
+                    fieldMapper,
+                    fieldFunction,
+                    indexAnalyzers
+                );
+            }
             return new DerivedFieldMapper(name, ft, multiFieldsBuilder.build(this, context), copyTo.build(), this, indexAnalyzers);
         }
     }

server/src/main/java/org/opensearch/index/mapper/DerivedObjectFieldType.java

Lines changed: 1 addition & 1 deletion

@@ -24,7 +24,7 @@
 
 public class DerivedObjectFieldType extends DerivedFieldType {
 
-    public DerivedObjectFieldType(
+    DerivedObjectFieldType(
         DerivedField derivedField,
         FieldMapper typeFieldMapper,
         Function<Object, IndexableField> fieldFunction,

server/src/main/java/org/opensearch/index/mapper/FieldTypeInference.java

Lines changed: 110 additions & 54 deletions

@@ -9,7 +9,6 @@
 package org.opensearch.index.mapper;
 
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.opensearch.common.xcontent.XContentFactory;
 import org.opensearch.common.xcontent.json.JsonXContent;
@@ -20,23 +19,37 @@
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Random;
 import java.util.Set;
 
+/**
+ * This class infers the field type by examining the _source documents. For a given value, type inference is similar to the dynamic mapping type-guessing logic.
+ * Instead of guessing the type from the first document alone, it draws a random sample of documents to make a more accurate inference.
+ * This approach is particularly useful when dealing with missing fields, which is common for nested fields within derived fields of the object type.
+ *
+ * <p>The sample size should be selected carefully to ensure a high probability of selecting at least one document where the field is present.
+ * However, it is important to maintain a balance, since a large sample size can hurt performance: the _source field is loaded and examined for every sampled document.
+ *
+ * <p>The problem of determining the sample size (S) is akin to deciding how many balls to draw from a bin,
+ * ensuring a high probability (>= P) of drawing at least one green ball (a document containing the field) from a mixture of
+ * R red balls (documents without the field) and G green balls:
+ * <pre>
+ * P >= 1 - C(R, S) / C(R + G, S)
+ * </pre>
+ * where C() is the binomial coefficient.
+ * For high confidence, we want P >= 0.95.
+ */
+
 public class FieldTypeInference {
     private final IndexReader indexReader;
     private final String indexName;
     private final MapperService mapperService;
     // TODO expose using a index setting?
     private int sampleSize;
-
-    // this will lead to the probability of more than 0.95 to select on the document containing this field,
-    // when at least 5% of the overall documents contain the field
     private static final int DEFAULT_SAMPLE_SIZE = 60;
 
-    private final int MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES = 10000;
-
     public FieldTypeInference(String indexName, MapperService mapperService, IndexReader indexReader) {
         this.indexName = indexName;
         this.mapperService = mapperService;
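The deleted comment above documents the rationale for DEFAULT_SAMPLE_SIZE = 60: when at least 5% of documents contain the field, 60 samples give a better than 0.95 chance of hitting at least one of them. Below is a minimal standalone sketch that plugs numbers into the bound P >= 1 - C(R, S) / C(R + G, S) from the new javadoc; it is not part of the commit, and the 10,000-document index size and 5% field-presence ratio are assumptions chosen for illustration.

// Sketch: check DEFAULT_SAMPLE_SIZE = 60 against the bound P >= 1 - C(R, S) / C(R + G, S).
public class SampleSizeCheck {

    // Probability of sampling at least one of the G documents containing the field
    // when S documents are drawn without replacement from R + G documents.
    static double probabilityOfHit(int r, int g, int s) {
        double missAll = 1.0;
        // C(R, S) / C(R + G, S) telescopes into a product of (R - i) / (R + G - i) terms,
        // which avoids computing huge binomial coefficients directly.
        for (int i = 0; i < s; i++) {
            missAll *= (double) (r - i) / (r + g - i);
        }
        return 1.0 - missAll;
    }

    public static void main(String[] args) {
        int totalDocs = 10_000;            // assumed index size
        int green = totalDocs / 20;        // 5% of documents contain the field
        int red = totalDocs - green;
        int sampleSize = 60;               // DEFAULT_SAMPLE_SIZE from the commit
        // Prints roughly 0.954, i.e. above the 0.95 confidence target from the javadoc.
        System.out.printf("P(at least one hit) = %.4f%n", probabilityOfHit(red, green, sampleSize));
    }
}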
@@ -53,56 +66,17 @@ public int getSampleSize() {
     }
 
     public Mapper infer(ValueFetcher valueFetcher) throws IOException {
-        int iter = 0;
-        int totalDocs = indexReader.numDocs();
-        int sampleSize = Math.min(totalDocs, getSampleSize());
-        int[] docs = getSortedRandomNum(sampleSize, totalDocs, Math.max(getSampleSize(), MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES));
-        int offset = 0;
-        SourceLookup sourceLookup = new SourceLookup();
-        for (LeafReaderContext leafReaderContext : indexReader.leaves()) {
-            LeafReader leafReader = leafReaderContext.reader();
-            valueFetcher.setNextReader(leafReaderContext);
-            if (iter >= docs.length) {
-                break;
+        RandomSourceValuesGenerator valuesGenerator = new RandomSourceValuesGenerator(sampleSize, indexReader, valueFetcher);
+        Mapper inferredMapper = null;
+        while (inferredMapper == null && valuesGenerator.hasNext()) {
+            List<Object> values = valuesGenerator.next();
+            if (values == null) {
+                continue;
             }
-            int docID = docs[iter] - offset;
-            while (docID < leafReader.numDocs()) {
-                sourceLookup.setSegmentAndDocument(leafReaderContext, docID);
-                List<Object> objects = valueFetcher.fetchValues(sourceLookup);
-                Mapper inferredMapper = null;
-                if (objects != null && !objects.isEmpty()) {
-                    // always using first value in case of multi value field
-                    inferredMapper = inferTypeFromObject(objects.get(0));
-                }
-                if (inferredMapper != null) {
-                    return inferredMapper;
-                }
-                iter++;
-                if (iter >= docs.length) {
-                    break;
-                }
-                docID = docs[iter] - offset;
-            }
-            offset += leafReader.numDocs();
-        }
-        return null;
-    }
-
-    private static int[] getSortedRandomNum(int k, int n, int attempts) {
-        Set<Integer> generatedNumbers = new HashSet<>();
-        Random random = new Random();
-        int itr = 0;
-        while (generatedNumbers.size() < k && itr++ < attempts) {
-            int randomNumber = random.nextInt(n);
-            generatedNumbers.add(randomNumber);
+            // always use first value in case of multi value field to infer type
+            inferredMapper = inferTypeFromObject(values.get(0));
         }
-        int[] result = new int[generatedNumbers.size()];
-        int i = 0;
-        for (int number : generatedNumbers) {
-            result[i++] = number;
-        }
-        Arrays.sort(result);
-        return result;
+        return inferredMapper;
     }
 
     private Mapper inferTypeFromObject(Object o) throws IOException {
@@ -117,4 +91,86 @@ private Mapper inferTypeFromObject(Object o) throws IOException {
         Mapping mapping = parsedDocument.dynamicMappingsUpdate();
         return mapping.root.getMapper("field");
     }
+
+    private static class RandomSourceValuesGenerator implements Iterator<List<Object>> {
+        private final ValueFetcher valueFetcher;
+        private final IndexReader indexReader;
+        private final SourceLookup sourceLookup;
+        private final int numLeaves;
+        private final int[] docs;
+        private int iter;
+        private int offset;
+        private LeafReaderContext leafReaderContext;
+        private int leaf;
+        private final int MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES = 10000;
+
+        public RandomSourceValuesGenerator(int sampleSize, IndexReader indexReader, ValueFetcher valueFetcher) {
+            this.valueFetcher = valueFetcher;
+            this.indexReader = indexReader;
+            sampleSize = Math.min(sampleSize, indexReader.numDocs());
+            this.docs = getSortedRandomNum(
+                sampleSize,
+                indexReader.numDocs(),
+                Math.max(sampleSize, MAX_ATTEMPTS_TO_GENERATE_RANDOM_SAMPLES)
+            );
+            this.iter = 0;
+            this.offset = 0;
+            this.leaf = 0;
+            this.numLeaves = indexReader.leaves().size();
+            this.sourceLookup = new SourceLookup();
+            this.leafReaderContext = indexReader.leaves().get(leaf);
+            valueFetcher.setNextReader(leafReaderContext);
+        }
+
+        @Override
+        public boolean hasNext() {
+            return iter < docs.length && leaf < numLeaves;
+        }
+
+        /**
+         * Ensure hasNext() is called before calling next()
+         */
+        @Override
+        public List<Object> next() {
+            int docID = docs[iter] - offset;
+            if (docID >= leafReaderContext.reader().numDocs()) {
+                setNextLeaf();
+                return next();
+            }
+            // deleted docs are getting used to infer type, which should be okay?
+            sourceLookup.setSegmentAndDocument(leafReaderContext, docID);
+            try {
+                iter++;
+                return valueFetcher.fetchValues(sourceLookup);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        private void setNextLeaf() {
+            offset += leafReaderContext.reader().numDocs();
+            leaf++;
+            if (leaf < numLeaves) {
+                leafReaderContext = indexReader.leaves().get(leaf);
+                valueFetcher.setNextReader(leafReaderContext);
+            }
+        }
+
+        private static int[] getSortedRandomNum(int sampleSize, int upperBound, int attempts) {
+            Set<Integer> generatedNumbers = new HashSet<>();
+            Random random = new Random();
+            int itr = 0;
+            while (generatedNumbers.size() < sampleSize && itr++ < attempts) {
+                int randomNumber = random.nextInt(upperBound);
+                generatedNumbers.add(randomNumber);
+            }
+            int[] result = new int[generatedNumbers.size()];
+            int i = 0;
+            for (int number : generatedNumbers) {
+                result[i++] = number;
+            }
+            Arrays.sort(result);
+            return result;
+        }
+    }
 }
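The new RandomSourceValuesGenerator walks a sorted sample of global doc IDs and maps each one to a per-leaf doc ID with a running offset, advancing to the next leaf once the local ID overflows the current segment. The following minimal, Lucene-free sketch illustrates that traversal idea; it is not part of the commit, and the segment sizes and the sample size of 10 are made-up values for illustration.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;

public class SortedSampleWalk {
    public static void main(String[] args) {
        int[] leafSizes = { 40, 25, 35 };                  // assumed per-segment doc counts
        int totalDocs = Arrays.stream(leafSizes).sum();

        // Draw a sorted sample of distinct global doc IDs, as getSortedRandomNum() does.
        Set<Integer> picked = new HashSet<>();
        Random random = new Random();
        while (picked.size() < 10) {
            picked.add(random.nextInt(totalDocs));
        }
        int[] docs = picked.stream().mapToInt(Integer::intValue).sorted().toArray();

        // Walk the sample once; move to the next leaf whenever the local doc ID overflows it.
        int leaf = 0;
        int offset = 0;
        for (int globalDocId : docs) {
            while (globalDocId - offset >= leafSizes[leaf]) {
                offset += leafSizes[leaf];
                leaf++;
            }
            System.out.printf("global %d -> leaf %d, local doc %d%n", globalDocId, leaf, globalDocId - offset);
        }
    }
}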

server/src/test/java/org/opensearch/index/mapper/FieldTypeInferenceTests.java

Lines changed: 0 additions & 1 deletion

@@ -126,7 +126,6 @@ public void setNextReader(LeafReaderContext leafReaderContext) {
             }
         });
         assertNull(mapper);
-        // assertEquals(leaves, docsEvaluated.size());
         assertEquals(typeInference.getSampleSize(), totalDocsEvaluated[0]);
         for (List<Integer> docsPerLeaf : docsEvaluated) {
             for (int j = 0; j < docsPerLeaf.size() - 1; j++) {
