Skip to content

Commit 8a7d542

Browse files
committed
Adding Id, Ignored and Routing ParquetFields for Metadata
Signed-off-by: Sagar Darji <[email protected]>
1 parent 912aeb5 commit 8a7d542

File tree

8 files changed

+209
-5
lines changed

8 files changed

+209
-5
lines changed

modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/fields/core/metadata/DocCountParquetField.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,6 @@ public ArrowType getArrowType() {
5656

5757
@Override
5858
public FieldType getFieldType() {
59-
return FieldType.notNullable(getArrowType());
59+
return FieldType.nullable(getArrowType());
6060
}
6161
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package com.parquet.parquetdataformat.fields.core.metadata;
10+
11+
import com.parquet.parquetdataformat.fields.ParquetField;
12+
import com.parquet.parquetdataformat.vsr.ManagedVSR;
13+
import org.apache.arrow.vector.VarCharVector;
14+
import org.apache.arrow.vector.types.pojo.ArrowType;
15+
import org.apache.arrow.vector.types.pojo.FieldType;
16+
import org.apache.lucene.util.BytesRef;
17+
import org.opensearch.index.mapper.MappedFieldType;
18+
19+
/**
20+
* Parquet field implementation for handling document ID metadata in OpenSearch documents.
21+
*
22+
* <p>This class provides the conversion logic between OpenSearch document ID fields and Apache Arrow
23+
* UTF-8 string vectors for columnar storage in Parquet format. Document ID values are stored
24+
* using Apache Arrow's {@link VarCharVector}, which provides efficient variable-length string storage.</p>
25+
*
26+
* <p>This field type corresponds to OpenSearch's {@code _id} metadata field and
27+
* supports unique document identifiers. The ID values are processed from {@link BytesRef} objects
28+
* and stored directly in the Arrow vector with proper offset and length handling.</p>
29+
*
30+
* <p><strong>Usage Example:</strong></p>
31+
* <pre>{@code
32+
* IdParquetField idField = new IdParquetField();
33+
* ArrowType arrowType = idField.getArrowType(); // Returns UTF-8 string type
34+
* FieldType fieldType = idField.getFieldType(); // Returns nullable UTF-8 field type
35+
* }</pre>
36+
*
37+
* @see ParquetField
38+
* @see VarCharVector
39+
* @see ArrowType.Utf8
40+
* @since 1.0
41+
*/
42+
public class IdParquetField extends ParquetField {
43+
44+
@Override
45+
protected void addToGroup(MappedFieldType mappedFieldType, ManagedVSR managedVSR, Object parseValue) {
46+
VarCharVector idVector = (VarCharVector) managedVSR.getVector(mappedFieldType.name());
47+
int rowIndex = managedVSR.getRowCount();
48+
BytesRef bytesRef = (BytesRef) parseValue;
49+
idVector.setSafe(rowIndex, bytesRef.bytes, bytesRef.offset, bytesRef.length);
50+
}
51+
52+
@Override
53+
public ArrowType getArrowType() {
54+
return new ArrowType.Utf8();
55+
}
56+
57+
@Override
58+
public FieldType getFieldType() {
59+
return FieldType.nullable(getArrowType());
60+
}
61+
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package com.parquet.parquetdataformat.fields.core.metadata;
10+
11+
import com.parquet.parquetdataformat.fields.ParquetField;
12+
import com.parquet.parquetdataformat.vsr.ManagedVSR;
13+
import org.apache.arrow.vector.VarCharVector;
14+
import org.apache.arrow.vector.types.pojo.ArrowType;
15+
import org.apache.arrow.vector.types.pojo.FieldType;
16+
import org.opensearch.index.mapper.MappedFieldType;
17+
18+
import java.nio.charset.StandardCharsets;
19+
20+
/**
21+
* Parquet field implementation for handling ignored field data types in OpenSearch documents.
22+
*
23+
* <p>This class provides the conversion logic between OpenSearch ignored fields and Apache Arrow
24+
* UTF-8 string vectors for columnar storage in Parquet format. Ignored field values are stored
25+
* using Apache Arrow's {@link VarCharVector}, which provides efficient variable-length string storage.</p>
26+
*
27+
* <p>This field type corresponds to OpenSearch's {@code ignored} field mapping and
28+
* supports fields that are indexed but not stored in the document source. The field values
29+
* are converted to UTF-8 string representation before storage in the Arrow vector.</p>
30+
*
31+
* <p><strong>Usage Example:</strong></p>
32+
* <pre>{@code
33+
* IgnoredParquetField ignoredField = new IgnoredParquetField();
34+
* ArrowType arrowType = ignoredField.getArrowType(); // Returns UTF-8 string type
35+
* FieldType fieldType = ignoredField.getFieldType(); // Returns nullable UTF-8 field type
36+
* }</pre>
37+
*
38+
* @see ParquetField
39+
* @see VarCharVector
40+
* @see ArrowType.Utf8
41+
* @since 1.0
42+
*/
43+
public class IgnoredParquetField extends ParquetField {
44+
45+
@Override
46+
protected void addToGroup(MappedFieldType mappedFieldType, ManagedVSR managedVSR, Object parseValue) {
47+
VarCharVector varCharVector = (VarCharVector) managedVSR.getVector(mappedFieldType.name());
48+
int rowIndex = managedVSR.getRowCount();
49+
varCharVector.setSafe(rowIndex, parseValue.toString().getBytes(StandardCharsets.UTF_8));
50+
}
51+
52+
@Override
53+
public ArrowType getArrowType() {
54+
return new ArrowType.Utf8();
55+
}
56+
57+
@Override
58+
public FieldType getFieldType() {
59+
return FieldType.nullable(getArrowType());
60+
}
61+
}
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
/*
2+
* SPDX-License-Identifier: Apache-2.0
3+
*
4+
* The OpenSearch Contributors require contributions made to
5+
* this file be licensed under the Apache-2.0 license or a
6+
* compatible open source license.
7+
*/
8+
9+
package com.parquet.parquetdataformat.fields.core.metadata;
10+
11+
import com.parquet.parquetdataformat.fields.ParquetField;
12+
import com.parquet.parquetdataformat.vsr.ManagedVSR;
13+
import org.apache.arrow.vector.VarCharVector;
14+
import org.apache.arrow.vector.types.pojo.ArrowType;
15+
import org.apache.arrow.vector.types.pojo.FieldType;
16+
import org.opensearch.index.mapper.MappedFieldType;
17+
18+
import java.nio.charset.StandardCharsets;
19+
20+
/**
21+
* Parquet field implementation for handling routing metadata in OpenSearch documents.
22+
*
23+
* <p>This class provides the conversion logic between OpenSearch routing fields and Apache Arrow
24+
* UTF-8 string vectors for columnar storage in Parquet format. Routing values are stored
25+
* using Apache Arrow's {@link VarCharVector}, which provides efficient variable-length string storage.</p>
26+
*
27+
* <p>This field type corresponds to OpenSearch's {@code _routing} metadata field and
28+
* supports custom routing values that determine which shard a document is stored on. The routing
29+
* value is converted to UTF-8 bytes before storage in the Arrow vector.</p>
30+
*
31+
* <p><strong>Usage Example:</strong></p>
32+
* <pre>{@code
33+
* RoutingParquetField routingField = new RoutingParquetField();
34+
* ArrowType arrowType = routingField.getArrowType(); // Returns UTF-8 string type
35+
* FieldType fieldType = routingField.getFieldType(); // Returns nullable UTF-8 field type
36+
* }</pre>
37+
*
38+
* @see ParquetField
39+
* @see VarCharVector
40+
* @see ArrowType.Utf8
41+
* @since 1.0
42+
*/
43+
public class RoutingParquetField extends ParquetField {
44+
45+
@Override
46+
protected void addToGroup(MappedFieldType mappedFieldType, ManagedVSR managedVSR, Object parseValue) {
47+
VarCharVector routingVector = (VarCharVector) managedVSR.getVector(mappedFieldType.name());
48+
int rowIndex = managedVSR.getRowCount();
49+
routingVector.setSafe(rowIndex, parseValue.toString().getBytes(StandardCharsets.UTF_8));
50+
}
51+
52+
@Override
53+
public ArrowType getArrowType() {
54+
return new ArrowType.Utf8();
55+
}
56+
57+
@Override
58+
public FieldType getFieldType() {
59+
return FieldType.nullable(getArrowType());
60+
}
61+
}

modules/parquet-data-format/src/main/java/com/parquet/parquetdataformat/plugins/fields/MetadataFieldPlugin.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,14 @@
1010

1111
import com.parquet.parquetdataformat.fields.ParquetField;
1212
import com.parquet.parquetdataformat.fields.core.metadata.DocCountParquetField;
13+
import com.parquet.parquetdataformat.fields.core.metadata.IdParquetField;
14+
import com.parquet.parquetdataformat.fields.core.metadata.IgnoredParquetField;
15+
import com.parquet.parquetdataformat.fields.core.metadata.RoutingParquetField;
1316
import com.parquet.parquetdataformat.fields.core.metadata.SizeParquetField;
1417
import org.opensearch.index.mapper.DocCountFieldMapper;
18+
import org.opensearch.index.mapper.IdFieldMapper;
19+
import org.opensearch.index.mapper.IgnoredFieldMapper;
20+
import org.opensearch.index.mapper.RoutingFieldMapper;
1521

1622
import java.util.HashMap;
1723
import java.util.Map;
@@ -36,5 +42,8 @@ public Map<String, ParquetField> getParquetFields() {
3642
/**
 * Adds the Parquet field handlers for metadata fields to {@code fieldMap}.
 *
 * <p>Each entry maps a field-type key to a new handler instance. Existing entries with the
 * same key are replaced, as with any {@link Map#put}.</p>
 *
 * @param fieldMap map to populate; mutated in place
 */
private static void registerMetadataFields(final Map<String, ParquetField> fieldMap) {
3743
fieldMap.put(DocCountFieldMapper.CONTENT_TYPE, new DocCountParquetField());
3844
// Unlike the other entries, "_size" is keyed by a string literal rather than a mapper constant.
fieldMap.put("_size", new SizeParquetField());
45+
fieldMap.put(RoutingFieldMapper.CONTENT_TYPE, new RoutingParquetField());
46+
fieldMap.put(IgnoredFieldMapper.CONTENT_TYPE, new IgnoredParquetField());
47+
fieldMap.put(IdFieldMapper.CONTENT_TYPE, new IdParquetField());
3948
}
4049
}

server/src/main/java/org/opensearch/index/mapper/IdFieldMapper.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,11 @@ private IdFieldMapper(Supplier<Boolean> fieldDataEnabled) {
298298
@Override
299299
public void preParse(ParseContext context) {
300300
BytesRef id = Uid.encodeId(context.sourceToParse().id());
301-
context.doc().add(new Field(NAME, id, Defaults.FIELD_TYPE));
301+
if (isPluggableDataFormatFeatureEnabled()) {
302+
context.compositeDocumentInput().addField(fieldType(), id);
303+
} else {
304+
context.doc().add(new Field(NAME, id, Defaults.FIELD_TYPE));
305+
}
302306
}
303307

304308
@Override

server/src/main/java/org/opensearch/index/mapper/IgnoredFieldMapper.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,11 @@ private IgnoredFieldMapper() {
114114
@Override
115115
public void postParse(ParseContext context) {
116116
for (String field : context.getIgnoredFields()) {
117-
context.doc().add(new Field(NAME, field, Defaults.FIELD_TYPE));
117+
if (isPluggableDataFormatFeatureEnabled()) {
118+
context.compositeDocumentInput().addField(fieldType(), field);
119+
} else {
120+
context.doc().add(new Field(NAME, field, Defaults.FIELD_TYPE));
121+
}
118122
}
119123
}
120124

server/src/main/java/org/opensearch/index/mapper/RoutingFieldMapper.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,8 +148,12 @@ public boolean required() {
148148
public void preParse(ParseContext context) {
149149
String routing = context.sourceToParse().routing();
150150
if (routing != null) {
151-
context.doc().add(new Field(fieldType().name(), routing, Defaults.FIELD_TYPE));
152-
createFieldNamesField(context);
151+
if (isPluggableDataFormatFeatureEnabled()) {
152+
context.compositeDocumentInput().addField(fieldType(), routing);
153+
} else {
154+
context.doc().add(new Field(fieldType().name(), routing, Defaults.FIELD_TYPE));
155+
createFieldNamesField(context);
156+
}
153157
}
154158
}
155159

0 commit comments

Comments
 (0)