Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -380,8 +380,7 @@ public LogicalPlan visitRareTopN(RareTopN node, AnalysisContext context) {
fields.forEach(
field -> newEnv.define(new Symbol(Namespace.FIELD_NAME, field.toString()), field.type()));

List<Argument> options = node.getArguments();
Integer noOfResults = (Integer) options.get(0).getValue().getValue();
Integer noOfResults = node.getNoOfResults();

return new LogicalRareTopN(child, node.getCommandType(), noOfResults, fields, groupBys);
}
Expand Down
12 changes: 10 additions & 2 deletions core/src/main/java/org/opensearch/sql/ast/dsl/AstDSL.java
Original file line number Diff line number Diff line change
Expand Up @@ -540,8 +540,16 @@ public static RareTopN rareTopN(
List<Argument> noOfResults,
List<UnresolvedExpression> groupList,
Field... fields) {
return new RareTopN(input, commandType, noOfResults, Arrays.asList(fields), groupList)
.attach(input);
Integer N =
(Integer)
Argument.ArgumentMap.of(noOfResults)
.getOrDefault("noOfResults", new Literal(10, DataType.INTEGER))
.getValue();
List<Argument> removed =
noOfResults.stream()
.filter(argument -> !argument.getArgName().equals("noOfResults"))
.toList();
return new RareTopN(commandType, N, removed, Arrays.asList(fields), groupList).attach(input);
}

public static Limit limit(UnresolvedPlan input, Integer limit, Integer offset) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ public <R, C> R accept(AbstractNodeVisitor<R, C> nodeVisitor, C context) {
}

/** ArgumentMap is a helper class to get argument value by name. */
@EqualsAndHashCode
@ToString
public static class ArgumentMap {
private final Map<String, Literal> map;

Expand Down
10 changes: 7 additions & 3 deletions core/src/main/java/org/opensearch/sql/ast/tree/RareTopN.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import com.google.common.collect.ImmutableList;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
Expand All @@ -24,12 +23,11 @@
@ToString
@EqualsAndHashCode(callSuper = false)
@RequiredArgsConstructor
@AllArgsConstructor
public class RareTopN extends UnresolvedPlan {

private UnresolvedPlan child;
private final CommandType commandType;
// arguments: noOfResults: Integer, countField: String, showCount: Boolean
private final Integer noOfResults;
private final List<Argument> arguments;
private final List<Field> fields;
private final List<UnresolvedExpression> groupExprList;
Expand All @@ -54,4 +52,10 @@ public enum CommandType {
TOP,
RARE
}

public enum Option {
countField,
showCount,
useNull,
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
import static org.opensearch.sql.ast.tree.Sort.SortOrder.ASC;
import static org.opensearch.sql.ast.tree.Sort.SortOrder.DESC;
import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_FOR_DEDUP;
import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_NAME;
import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_NAME_MAIN;
import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_NAME_SUBSEARCH;
import static org.opensearch.sql.calcite.utils.PlanUtils.ROW_NUMBER_COLUMN_NAME_TOP_RARE;
import static org.opensearch.sql.calcite.utils.PlanUtils.getRelation;
import static org.opensearch.sql.calcite.utils.PlanUtils.getRexCall;
import static org.opensearch.sql.calcite.utils.PlanUtils.transformPlanToAttachChild;
Expand Down Expand Up @@ -1128,22 +1128,7 @@ public RelNode visitAggregation(Aggregation node, CalcitePlanContext context) {
Pair<List<RexNode>, List<AggCall>> aggregationAttributes =
aggregateWithTrimming(groupExprList, aggExprList, context);
if (toAddHintsOnAggregate) {
final RelHint statHits =
RelHint.builder("stats_args").hintOption(Argument.BUCKET_NULLABLE, "false").build();
assert context.relBuilder.peek() instanceof LogicalAggregate
: "Stats hits should be added to LogicalAggregate";
context.relBuilder.hints(statHits);
context
.relBuilder
.getCluster()
.setHintStrategies(
HintStrategyTable.builder()
.hintStrategy(
"stats_args",
(hint, rel) -> {
return rel instanceof LogicalAggregate;
})
.build());
addIgnoreNullBucketHintToAggregate(context);
}

// schema reordering
Expand Down Expand Up @@ -1862,9 +1847,8 @@ public RelNode visitKmeans(Kmeans node, CalcitePlanContext context) {
@Override
public RelNode visitRareTopN(RareTopN node, CalcitePlanContext context) {
visitChildren(node, context);

ArgumentMap arguments = ArgumentMap.of(node.getArguments());
String countFieldName = (String) arguments.get("countField").getValue();
ArgumentMap argumentMap = ArgumentMap.of(node.getArguments());
String countFieldName = (String) argumentMap.get(RareTopN.Option.countField.name()).getValue();
if (context.relBuilder.peek().getRowType().getFieldNames().contains(countFieldName)) {
throw new IllegalArgumentException(
"Field `"
Expand All @@ -1879,8 +1863,27 @@ public RelNode visitRareTopN(RareTopN node, CalcitePlanContext context) {
groupExprList.addAll(fieldList);
List<UnresolvedExpression> aggExprList =
List.of(AstDSL.alias(countFieldName, AstDSL.aggregate("count", null)));

// if usenull=false, add a isNotNull before Aggregate and the hint to this Aggregate
Boolean bucketNullable = (Boolean) argumentMap.get(RareTopN.Option.useNull.name()).getValue();
boolean toAddHintsOnAggregate = false;
if (!bucketNullable && !groupExprList.isEmpty()) {
toAddHintsOnAggregate = true;
// add isNotNull filter before aggregation to filter out null bucket
List<RexNode> groupByList =
groupExprList.stream().map(expr -> rexVisitor.analyze(expr, context)).toList();
context.relBuilder.filter(
PlanUtils.getSelectColumns(groupByList).stream()
.map(context.relBuilder::field)
.map(context.relBuilder::isNotNull)
.toList());
Comment on lines +1875 to +1879
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not apply isNotNull directly on groupByList, but on their underlying input ref? If some operation converts a null field to a non-null one, I think it should not be filtered out.

Copy link
Member Author

@LantaoJin LantaoJin Oct 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yuancu
Think about query:

| eval a = nullif(status, "200") | stats bucket_nullable = false count() by a

The groupByList contains RexCall "AS($9, 'a')". So we finally build a
context.relBuilder.filter(isNotNull($9)) which $9 is nullif(status, "200") instead of status

}
aggregateWithTrimming(groupExprList, aggExprList, context);

if (toAddHintsOnAggregate) {
addIgnoreNullBucketHintToAggregate(context);
}

// 2. add a window column
List<RexNode> partitionKeys = rexVisitor.analyze(node.getGroupExprList(), context);
RexNode countField;
Expand All @@ -1899,26 +1902,46 @@ public RelNode visitRareTopN(RareTopN node, CalcitePlanContext context) {
List.of(countField),
WindowFrame.toCurrentRow());
context.relBuilder.projectPlus(
context.relBuilder.alias(rowNumberWindowOver, ROW_NUMBER_COLUMN_NAME));
context.relBuilder.alias(rowNumberWindowOver, ROW_NUMBER_COLUMN_NAME_TOP_RARE));

// 3. filter row_number() <= k in each partition
Integer N = (Integer) arguments.get("noOfResults").getValue();
int k = node.getNoOfResults();
context.relBuilder.filter(
context.relBuilder.lessThanOrEqual(
context.relBuilder.field(ROW_NUMBER_COLUMN_NAME), context.relBuilder.literal(N)));
context.relBuilder.field(ROW_NUMBER_COLUMN_NAME_TOP_RARE),
context.relBuilder.literal(k)));

// 4. project final output. the default output is group by list + field list
Boolean showCount = (Boolean) arguments.get("showCount").getValue();
Boolean showCount = (Boolean) argumentMap.get(RareTopN.Option.showCount.name()).getValue();
if (showCount) {
context.relBuilder.projectExcept(context.relBuilder.field(ROW_NUMBER_COLUMN_NAME));
context.relBuilder.projectExcept(context.relBuilder.field(ROW_NUMBER_COLUMN_NAME_TOP_RARE));
} else {
context.relBuilder.projectExcept(
context.relBuilder.field(ROW_NUMBER_COLUMN_NAME),
context.relBuilder.field(ROW_NUMBER_COLUMN_NAME_TOP_RARE),
context.relBuilder.field(countFieldName));
}
return context.relBuilder.peek();
}

private static void addIgnoreNullBucketHintToAggregate(CalcitePlanContext context) {
final RelHint statHits =
RelHint.builder("stats_args").hintOption(Argument.BUCKET_NULLABLE, "false").build();
assert context.relBuilder.peek() instanceof LogicalAggregate
: "Stats hits should be added to LogicalAggregate";
context.relBuilder.hints(statHits);
context
.relBuilder
.getCluster()
.setHintStrategies(
HintStrategyTable.builder()
.hintStrategy(
"stats_args",
(hint, rel) -> {
return rel instanceof LogicalAggregate;
})
.build());
}

@Override
public RelNode visitTableFunction(TableFunction node, CalcitePlanContext context) {
throw new CalciteUnsupportedException("Table function is unsupported in Calcite");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public interface PlanUtils {
/** this is only for dedup command, do not reuse it in other command */
String ROW_NUMBER_COLUMN_FOR_DEDUP = "_row_number_dedup_";

String ROW_NUMBER_COLUMN_NAME = "_row_number_";
String ROW_NUMBER_COLUMN_NAME_TOP_RARE = "_row_number_top_rare_";
String ROW_NUMBER_COLUMN_NAME_MAIN = "_row_number_main_";
String ROW_NUMBER_COLUMN_NAME_SUBSEARCH = "_row_number_subsearch_";

Expand Down
1 change: 1 addition & 0 deletions docs/user/ppl/admin/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ The behaviours it controlled includes:

- The default value of argument ``bucket_nullable`` in ``stats`` command. Check `stats command <../cmd/stats.rst>`_ for details.
- The return value of ``divide`` and ``/`` operator. Check `expressions <../functions/expressions.rst>`_ for details.
- The default value of argument ``usenull`` in ``top`` and ``rare`` commands. Check `top command <../cmd/top.rst>`_ and `rare command <../cmd/rare.rst>`_ for details.

Example 1
-------
Expand Down
57 changes: 43 additions & 14 deletions docs/user/ppl/cmd/rare.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
=============
====
rare
=============
====

.. rubric:: Table of contents

Expand All @@ -10,13 +10,13 @@ rare


Description
============
===========
| Using ``rare`` command to find the least common tuple of values of all fields in the field list.

**Note**: A maximum of 10 results is returned for each distinct tuple of values of the group-by fields.

Syntax
============
======
rare <field-list> [by-clause]

rare [rare-options] <field-list> [by-clause] ``(available from 3.1.0+)``
Expand All @@ -26,10 +26,13 @@ rare [rare-options] <field-list> [by-clause] ``(available from 3.1.0+)``
* rare-options: optional. options for the rare command. Supported syntax is [countfield=<string>] [showcount=<bool>].
* showcount=<bool>: optional. whether to create a field in output that represent a count of the tuple of values. Default value is ``true``.
* countfield=<string>: optional. the name of the field that contains count. Default value is ``'count'``.
* usenull=<bool>: optional (since 3.4.0). whether to output the null value. The default value of ``usenull`` is determined by ``plugins.ppl.syntax.legacy.preferred``:

* When ``plugins.ppl.syntax.legacy.preferred=true``, ``usenull`` defaults to ``true``
* When ``plugins.ppl.syntax.legacy.preferred=false``, ``usenull`` defaults to ``false``
Comment on lines +31 to +32
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bullet points in blockquote, is it expected?

Copy link
Member Author

@LantaoJin LantaoJin Oct 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. Keep the format same with the current stats.rst. I tried to use the new format to align with Update PPL Command Documentation. But seems there are still many format problems in that PR. ref


Example 1: Find the least common values in a field
===========================================
==================================================

The example finds least common gender of all the accounts.

Expand All @@ -46,7 +49,7 @@ PPL query::


Example 2: Find the least common values organized by gender
====================================================
===========================================================

The example finds least common age of all the accounts group by gender.

Expand All @@ -66,12 +69,10 @@ PPL query::
Example 3: Rare command with Calcite enabled
============================================

The example finds least common gender of all the accounts when ``plugins.calcite.enabled`` is true.

PPL query::

PPL> source=accounts | rare gender;
fetched row
os> source=accounts | rare gender;
fetched rows / total rows = 2/2
+--------+-------+
| gender | count |
|--------+-------|
Expand All @@ -83,19 +84,47 @@ PPL query::
Example 4: Specify the count field option
=========================================

The example specifies the count field when ``plugins.calcite.enabled`` is true.

PPL query::

PPL> source=accounts | rare countfield='cnt' gender;
fetched row
os> source=accounts | rare countfield='cnt' gender;
fetched rows / total rows = 2/2
+--------+-----+
| gender | cnt |
|--------+-----|
| F | 1 |
| M | 3 |
+--------+-----+


Example 5: Specify the usenull field option
===========================================

PPL query::

os> source=accounts | rare usenull=false email;
fetched rows / total rows = 3/3
+-----------------------+-------+
| email | count |
|-----------------------+-------|
| [email protected] | 1 |
| [email protected] | 1 |
| [email protected] | 1 |
+-----------------------+-------+

PPL query::

os> source=accounts | rare usenull=true email;
fetched rows / total rows = 4/4
+-----------------------+-------+
| email | count |
|-----------------------+-------|
| null | 1 |
| [email protected] | 1 |
| [email protected] | 1 |
| [email protected] | 1 |
+-----------------------+-------+


Limitations
===========
The ``rare`` command is not rewritten to OpenSearch DSL, it is only executed on the coordination node.
Loading
Loading