Skip to content

Commit 0980720

Browse files
committed
Allow configuring the prune strategy for sparse encoding of the semantic field.
Signed-off-by: Bo Zhang <[email protected]>
1 parent 796d08f commit 0980720

File tree

19 files changed

+534
-33
lines changed

19 files changed

+534
-33
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
99

1010
### Enhancements
1111
- [Semantic Field] Support configuring the auto-generated knn_vector field through the semantic field. ([#1420](https://github.com/opensearch-project/neural-search/pull/1420))
12+
- [Semantic Field] Allow configuring the prune strategy for sparse encoding of the semantic field. ([#1434](https://github.com/opensearch-project/neural-search/pull/1434))
1213

1314
### Bug Fixes
1415
- Fix for collapse bug with knn query not deduplicating results ([#1413](https://github.com/opensearch-project/neural-search/pull/1413))

src/main/java/org/opensearch/neuralsearch/constants/SemanticFieldConstants.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,4 +53,10 @@ public class SemanticFieldConstants {
5353
* knn_vector field.
5454
*/
5555
public static final String DENSE_EMBEDDING_CONFIG = "dense_embedding_config";
56+
57+
/**
58+
* Name of the field for sparse encoding config. The config will be used to control how to do sparse encoding.
59+
* {@link org.opensearch.neuralsearch.mapper.dto.SparseEncodingConfig}
60+
*/
61+
public static final String SPARSE_ENCODING_CONFIG = "sparse_encoding_config";
5662
}

src/main/java/org/opensearch/neuralsearch/mapper/SemanticFieldMapper.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.opensearch.index.mapper.WildcardFieldMapper;
2323
import org.opensearch.neuralsearch.constants.MappingConstants;
2424
import org.opensearch.neuralsearch.mapper.dto.SemanticParameters;
25+
import org.opensearch.neuralsearch.mapper.dto.SparseEncodingConfig;
2526

2627
import java.io.IOException;
2728
import java.util.HashMap;
@@ -39,6 +40,7 @@
3940
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEARCH_MODEL_ID;
4041
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEMANTIC_INFO_FIELD_NAME;
4142
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEMANTIC_FIELD_SEARCH_ANALYZER;
43+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SPARSE_ENCODING_CONFIG;
4244

4345
/**
4446
* FieldMapper for the semantic field. It will hold a delegate field mapper to delegate the data parsing and query work
@@ -171,6 +173,20 @@ public static class Builder extends ParametrizedFieldMapper.Builder {
171173
}
172174
}, (v) -> v == null ? null : v.toString());
173175

176+
protected final Parameter<SparseEncodingConfig> sparseEncodingConfig = new Parameter<>(
177+
SPARSE_ENCODING_CONFIG,
178+
false,
179+
() -> null,
180+
SparseEncodingConfig::parse,
181+
m -> ((SemanticFieldMapper) m).semanticParameters.getSparseEncodingConfig()
182+
).setSerializer((builder, name, value) -> {
183+
if (value == null) {
184+
builder.nullField(name);
185+
} else {
186+
value.toXContent(builder, name, value);
187+
}
188+
}, (value) -> value == null ? null : value.toString());
189+
174190
@Setter
175191
protected ParametrizedFieldMapper.Builder delegateBuilder;
176192

@@ -187,7 +203,8 @@ protected List<Parameter<?>> getParameters() {
187203
semanticInfoFieldName,
188204
chunkingEnabled,
189205
semanticFieldSearchAnalyzer,
190-
denseEmbeddingConfig
206+
denseEmbeddingConfig,
207+
sparseEncodingConfig
191208
);
192209
}
193210

@@ -217,6 +234,7 @@ public SemanticParameters getSemanticParameters() {
217234
.chunkingEnabled(chunkingEnabled.getValue())
218235
.semanticFieldSearchAnalyzer(semanticFieldSearchAnalyzer.getValue())
219236
.denseEmbeddingConfig(denseEmbeddingConfig.getValue())
237+
.sparseEncodingConfig(sparseEncodingConfig.getValue())
220238
.build();
221239
}
222240
}

src/main/java/org/opensearch/neuralsearch/mapper/dto/SemanticParameters.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,5 @@ public class SemanticParameters {
2222
private final Boolean chunkingEnabled;
2323
private final String semanticFieldSearchAnalyzer;
2424
private final Map<String, Object> denseEmbeddingConfig;
25+
private final SparseEncodingConfig sparseEncodingConfig;
2526
}
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
/*
2+
* Copyright OpenSearch Contributors
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
package org.opensearch.neuralsearch.mapper.dto;
6+
7+
import lombok.Builder;
8+
import lombok.Getter;
9+
import lombok.NonNull;
10+
import org.apache.commons.lang.builder.EqualsBuilder;
11+
import org.apache.commons.lang.builder.HashCodeBuilder;
12+
import org.opensearch.core.xcontent.XContentBuilder;
13+
import org.opensearch.index.mapper.Mapper;
14+
import org.opensearch.index.mapper.MapperParsingException;
15+
import org.opensearch.neuralsearch.util.prune.PruneType;
16+
import org.opensearch.neuralsearch.util.prune.PruneUtils;
17+
18+
import java.io.IOException;
19+
import java.util.HashMap;
20+
import java.util.Locale;
21+
import java.util.Map;
22+
23+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SPARSE_ENCODING_CONFIG;
24+
import static org.opensearch.neuralsearch.util.prune.PruneUtils.PRUNE_RATIO_FIELD;
25+
import static org.opensearch.neuralsearch.util.prune.PruneUtils.PRUNE_TYPE_FIELD;
26+
27+
@Builder
28+
@Getter
29+
public class SparseEncodingConfig {
30+
private PruneType pruneType;
31+
private Float pruneRatio;
32+
33+
/**
34+
* Parse the json input when we define the sparse encoding config in the index mappings
35+
* @param name parameter name
36+
* @param ctx parse context
37+
* @param value parameter value
38+
* @return parsed SparseEncodingConfig
39+
*/
40+
public static SparseEncodingConfig parse(@NonNull final String name, final Mapper.TypeParser.ParserContext ctx, final Object value) {
41+
if (value instanceof Map == false) {
42+
throw new MapperParsingException(String.format(Locale.ROOT, "[%s] must be a Map", name));
43+
}
44+
final Map<String, Object> config = new HashMap<>((Map<String, Object>) value);
45+
final PruneType pruneType = readPruneType(config, true);
46+
final Float pruneRatio = readPruneRatio(config, true);
47+
48+
// Check for any unrecognized parameters
49+
if (config.isEmpty() == false) {
50+
throw new MapperParsingException(
51+
String.format(Locale.ROOT, "Unsupported parameters %s in %s", String.join(",", config.keySet()), name)
52+
);
53+
}
54+
55+
// Case: pruneType is null and pruneRatio is null → nothing configured
56+
if (pruneType == null && pruneRatio == null) {
57+
return null;
58+
}
59+
60+
// Case: pruneRatio is set but pruneType is null or NONE → invalid
61+
if (pruneRatio != null && (pruneType == null | PruneType.NONE.equals(pruneType))) {
62+
throw new MapperParsingException(
63+
String.format(
64+
Locale.ROOT,
65+
"%s should not be defined when %s is %s or null",
66+
PRUNE_RATIO_FIELD,
67+
PRUNE_TYPE_FIELD,
68+
PruneType.NONE.getValue()
69+
)
70+
);
71+
}
72+
73+
// Case: pruneType is defined and not NONE and pruneRatio is null → missing pruneRatio
74+
if (pruneRatio == null && PruneType.NONE.equals(pruneType) == false) {
75+
throw new MapperParsingException(
76+
String.format(
77+
Locale.ROOT,
78+
"%s is required when %s is defined and not %s",
79+
PRUNE_RATIO_FIELD,
80+
PRUNE_TYPE_FIELD,
81+
PruneType.NONE.getValue()
82+
)
83+
);
84+
}
85+
86+
// Case: pruneType is NONE and pruneRatio is null
87+
if (pruneRatio == null) {
88+
return SparseEncodingConfig.builder().pruneType(pruneType).build();
89+
}
90+
91+
// Case: pruneType is not NONE or null and pruneRatio is not null
92+
if (PruneUtils.isValidPruneRatio(pruneType, pruneRatio) == false) {
93+
throw new MapperParsingException(
94+
String.format(
95+
Locale.ROOT,
96+
"Invalid %s and %s combo. Check %s for the valid combos.",
97+
PRUNE_RATIO_FIELD,
98+
PRUNE_TYPE_FIELD,
99+
"https://docs.opensearch.org/docs/latest/ingest-pipelines/processors/sparse-encoding/#pruning-sparse-vectors"
100+
)
101+
);
102+
}
103+
104+
return SparseEncodingConfig.builder().pruneType(pruneType).pruneRatio(pruneRatio).build();
105+
}
106+
107+
/**
108+
* Parse the config of the semantic field to build the SparseEncodingConfig if it is defined. Only should be used
109+
* to parse the valid semantic field config.
110+
* @param fieldConfig semantic field config
111+
* @return SparseEncodingConfig or null
112+
*/
113+
public static SparseEncodingConfig parse(@NonNull final Map<String, Object> fieldConfig) {
114+
if (fieldConfig.containsKey(SPARSE_ENCODING_CONFIG) == false) {
115+
return null;
116+
}
117+
final Map<String, Object> sparseEncodingConfig = (Map<String, Object>) fieldConfig.get(SPARSE_ENCODING_CONFIG);
118+
final PruneType pruneType = readPruneType(sparseEncodingConfig, false);
119+
final Float pruneRatio = readPruneRatio(sparseEncodingConfig, false);
120+
if (pruneType == null && pruneRatio == null) {
121+
return null;
122+
}
123+
return SparseEncodingConfig.builder().pruneType(pruneType).pruneRatio(pruneRatio).build();
124+
}
125+
126+
private static Float readPruneRatio(@NonNull final Map<String, Object> config, final boolean shouldRemoveIt) {
127+
if (config.containsKey(PRUNE_RATIO_FIELD)) {
128+
try {
129+
return Float.parseFloat(config.get(PRUNE_RATIO_FIELD).toString());
130+
} catch (Exception e) {
131+
throw new MapperParsingException(String.format(Locale.ROOT, "[%s] must be a Float", PRUNE_RATIO_FIELD));
132+
} finally {
133+
if (shouldRemoveIt) {
134+
config.remove(PRUNE_RATIO_FIELD);
135+
}
136+
}
137+
}
138+
return null;
139+
}
140+
141+
private static PruneType readPruneType(@NonNull final Map<String, Object> config, final boolean shouldRemoveIt) {
142+
if (config.containsKey(PRUNE_TYPE_FIELD)) {
143+
try {
144+
return PruneType.fromString((String) config.get(PRUNE_TYPE_FIELD));
145+
} catch (Exception e) {
146+
throw new MapperParsingException(
147+
String.format(Locale.ROOT, "Invalid [%s]. Valid values are [%s].", PRUNE_TYPE_FIELD, PruneType.getValidValues())
148+
);
149+
} finally {
150+
if (shouldRemoveIt) {
151+
config.remove(PRUNE_TYPE_FIELD);
152+
}
153+
}
154+
}
155+
return null;
156+
}
157+
158+
public void toXContent(@NonNull final XContentBuilder builder, String name, @NonNull final SparseEncodingConfig sparseEncodingConfig)
159+
throws IOException {
160+
builder.startObject(name);
161+
if (sparseEncodingConfig.pruneType != null) {
162+
builder.field(PRUNE_TYPE_FIELD, sparseEncodingConfig.pruneType.getValue());
163+
}
164+
if (sparseEncodingConfig.pruneRatio != null) {
165+
builder.field(PRUNE_RATIO_FIELD, sparseEncodingConfig.pruneRatio.floatValue());
166+
}
167+
builder.endObject();
168+
}
169+
170+
@Override
171+
public String toString() {
172+
final Map<String, Object> config = new HashMap<>();
173+
if (pruneType != null) {
174+
config.put(PRUNE_TYPE_FIELD, pruneType.getValue());
175+
}
176+
if (pruneRatio != null) {
177+
config.put(PRUNE_RATIO_FIELD, pruneRatio);
178+
}
179+
return config.toString();
180+
}
181+
182+
@Override
183+
public boolean equals(Object obj) {
184+
if (this == obj) {
185+
return true;
186+
} else if (obj != null && this.getClass() == obj.getClass()) {
187+
SparseEncodingConfig other = (SparseEncodingConfig) obj;
188+
EqualsBuilder equalsBuilder = new EqualsBuilder();
189+
equalsBuilder.append(this.pruneType, other.pruneType);
190+
equalsBuilder.append(this.pruneRatio, other.pruneRatio);
191+
return equalsBuilder.isEquals();
192+
} else {
193+
return false;
194+
}
195+
}
196+
197+
@Override
198+
public int hashCode() {
199+
return (new HashCodeBuilder()).append(this.pruneType).append(this.pruneRatio).toHashCode();
200+
}
201+
}

src/main/java/org/opensearch/neuralsearch/mappingtransformer/SemanticInfoConfigBuilder.java

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
import static org.opensearch.neuralsearch.constants.MappingConstants.PROPERTIES;
2525
import static org.opensearch.neuralsearch.constants.MappingConstants.TYPE;
2626
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.DENSE_EMBEDDING_CONFIG;
27+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SPARSE_ENCODING_CONFIG;
2728
import static org.opensearch.neuralsearch.constants.SemanticInfoFieldConstants.EMBEDDING_FIELD_NAME;
2829
import static org.opensearch.neuralsearch.constants.SemanticInfoFieldConstants.CHUNKS_FIELD_NAME;
2930
import static org.opensearch.neuralsearch.constants.SemanticInfoFieldConstants.CHUNKS_TEXT_FIELD_NAME;
@@ -51,6 +52,7 @@ public class SemanticInfoConfigBuilder {
5152
private Boolean chunkingEnabled;
5253
private String semanticFieldSearchAnalyzer;
5354
private Map<String, Object> denseEmbeddingConfig;
55+
private boolean sparseEncodingConfigDefined;
5456
private final static List<String> UNSUPPORTED_DENSE_EMBEDDING_CONFIG = List.of(
5557
KNN_VECTOR_DIMENSION_FIELD_NAME,
5658
KNN_VECTOR_DATA_TYPE_FIELD_NAME,
@@ -111,17 +113,30 @@ public Map<String, Object> build() {
111113
}
112114

113115
private void validate() {
114-
if (semanticFieldSearchAnalyzer != null && RankFeaturesFieldMapper.CONTENT_TYPE.equals(embeddingFieldType) == false) {
116+
if (KNNVectorFieldMapper.CONTENT_TYPE.equals(embeddingFieldType)) {
117+
validateSearchAnalyzerNotDefined();
118+
validateSparseEncodingConfigNotDefined();
119+
}
120+
121+
if (RankFeaturesFieldMapper.CONTENT_TYPE.equals(embeddingFieldType)) {
122+
validateDenseEmbeddingConfigNotDefined();
123+
}
124+
}
125+
126+
private void validateSparseEncodingConfigNotDefined() {
127+
if (sparseEncodingConfigDefined) {
115128
throw new IllegalArgumentException(
116129
String.format(
117130
Locale.ROOT,
118-
"Cannot build the semantic info config because the embedding field type %s cannot build with semantic field search analyzer %s",
131+
"Cannot build the semantic info config because the embedding field type %s cannot build with %s.",
119132
embeddingFieldType,
120-
semanticFieldSearchAnalyzer
133+
SPARSE_ENCODING_CONFIG
121134
)
122135
);
123136
}
137+
}
124138

139+
private void validateDenseEmbeddingConfigNotDefined() {
125140
if (denseEmbeddingConfig != null && RankFeaturesFieldMapper.CONTENT_TYPE.equals(embeddingFieldType)) {
126141
throw new IllegalArgumentException(
127142
String.format(
@@ -134,6 +149,19 @@ private void validate() {
134149
}
135150
}
136151

152+
private void validateSearchAnalyzerNotDefined() {
153+
if (semanticFieldSearchAnalyzer != null) {
154+
throw new IllegalArgumentException(
155+
String.format(
156+
Locale.ROOT,
157+
"Cannot build the semantic info config because the embedding field type %s cannot build with semantic field search analyzer %s",
158+
embeddingFieldType,
159+
semanticFieldSearchAnalyzer
160+
)
161+
);
162+
}
163+
}
164+
137165
private Map<String, Object> buildKnnFieldConfig() {
138166
final Map<String, Object> config = new HashMap<>();
139167
config.put(TYPE, KNNVectorFieldMapper.CONTENT_TYPE);
@@ -343,4 +371,9 @@ public SemanticInfoConfigBuilder denseEmbeddingConfig(final Map<String, Object>
343371
this.denseEmbeddingConfig = denseEmbeddingConfig;
344372
return this;
345373
}
374+
375+
public SemanticInfoConfigBuilder sparseEncodingConfigDefined(final boolean sparseEncodingConfigDefined) {
376+
this.sparseEncodingConfigDefined = sparseEncodingConfigDefined;
377+
return this;
378+
}
346379
}

src/main/java/org/opensearch/neuralsearch/mappingtransformer/SemanticMappingTransformer.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
import static org.opensearch.neuralsearch.constants.MappingConstants.PROPERTIES;
2727
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SEMANTIC_INFO_FIELD_NAME;
28+
import static org.opensearch.neuralsearch.constants.SemanticFieldConstants.SPARSE_ENCODING_CONFIG;
2829
import static org.opensearch.neuralsearch.util.SemanticMappingUtils.collectSemanticField;
2930
import static org.opensearch.neuralsearch.util.SemanticMappingUtils.extractModelIdToFieldPathMap;
3031
import static org.opensearch.neuralsearch.util.SemanticMappingUtils.getDenseEmbeddingConfig;
@@ -221,6 +222,7 @@ private Map<String, Object> createSemanticInfoField(
221222
builder.chunkingEnabled(isChunkingEnabled(fieldConfig, fieldPath));
222223
builder.semanticFieldSearchAnalyzer(getSemanticFieldSearchAnalyzer(fieldConfig, fieldPath));
223224
builder.denseEmbeddingConfig(getDenseEmbeddingConfig(fieldConfig, fieldPath));
225+
builder.sparseEncodingConfigDefined(fieldConfig.containsKey(SPARSE_ENCODING_CONFIG));
224226
return builder.build();
225227
}
226228

src/main/java/org/opensearch/neuralsearch/processor/dto/SemanticFieldInfo.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import lombok.Builder;
88
import lombok.Data;
9+
import org.opensearch.neuralsearch.mapper.dto.SparseEncodingConfig;
910
import org.opensearch.neuralsearch.processor.chunker.Chunker;
1011

1112
import java.util.List;
@@ -54,6 +55,8 @@ public class SemanticFieldInfo {
5455
*/
5556
private List<String> chunks;
5657

58+
private SparseEncodingConfig sparseEncodingConfig;
59+
5760
/**
5861
* @return full path to the chunks field of the semantic field in a doc
5962
*/

0 commit comments

Comments
 (0)