Skip to content

Commit e64f25c

Browse files
committed
Change priority for scheduling reroute during timeout (opensearch-project#16445)
Signed-off-by: Rishab Nahata <[email protected]>
1 parent c332506 commit e64f25c

File tree

6 files changed

+248
-24
lines changed

6 files changed

+248
-24
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
88
- Improve performace of NumericTermAggregation by avoiding unnecessary sorting([#17252](https://github.com/opensearch-project/OpenSearch/pull/17252))
99
- Add execution_hint to cardinality aggregator request (#[17419](https://github.com/opensearch-project/OpenSearch/pull/17419))
1010
- [Rule Based Auto-tagging] Add in-memory attribute value store ([#17342](https://github.com/opensearch-project/OpenSearch/pull/17342))
11+
- Change priority for scheduling reroute during timeout([#16445](https://github.com/opensearch-project/OpenSearch/pull/16445))
1112

1213
### Dependencies
1314
- Bump `dnsjava:dnsjava` from 3.6.2 to 3.6.3 ([#17231](https://github.com/opensearch-project/OpenSearch/pull/17231))

server/src/main/java/org/opensearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,11 @@
6262
import java.util.HashMap;
6363
import java.util.HashSet;
6464
import java.util.Iterator;
65+
import java.util.Locale;
6566
import java.util.Map;
6667
import java.util.Set;
6768

69+
import static org.opensearch.cluster.action.shard.ShardStateAction.FOLLOW_UP_REROUTE_PRIORITY_SETTING;
6870
import static org.opensearch.cluster.routing.allocation.ConstraintTypes.CLUSTER_PRIMARY_SHARD_BALANCE_CONSTRAINT_ID;
6971
import static org.opensearch.cluster.routing.allocation.ConstraintTypes.CLUSTER_PRIMARY_SHARD_REBALANCE_CONSTRAINT_ID;
7072
import static org.opensearch.cluster.routing.allocation.ConstraintTypes.INDEX_PRIMARY_SHARD_BALANCE_CONSTRAINT_ID;
@@ -191,6 +193,32 @@ public class BalancedShardsAllocator implements ShardsAllocator {
191193
Setting.Property.Dynamic
192194
);
193195

196+
/**
197+
* Adjusts the priority of the followup reroute task when current round times out. NORMAL is right for reasonable clusters,
198+
* but for a cluster in a messed up state which is starving NORMAL priority tasks, it might be necessary to raise this higher
199+
* to allocate shards.
200+
*/
201+
public static final Setting<Priority> FOLLOW_UP_REROUTE_PRIORITY_SETTING = new Setting<>(
202+
"cluster.routing.allocation.balanced_shards_allocator.schedule_reroute.priority",
203+
Priority.NORMAL.toString(),
204+
BalancedShardsAllocator::parseReroutePriority,
205+
Setting.Property.NodeScope,
206+
Setting.Property.Dynamic
207+
);
208+
209+
private static Priority parseReroutePriority(String priorityString) {
210+
final Priority priority = Priority.valueOf(priorityString.toUpperCase(Locale.ROOT));
211+
switch (priority) {
212+
case NORMAL:
213+
case HIGH:
214+
case URGENT:
215+
return priority;
216+
}
217+
throw new IllegalArgumentException(
218+
"priority [" + priority + "] not supported for [" + FOLLOW_UP_REROUTE_PRIORITY_SETTING.getKey() + "]"
219+
);
220+
}
221+
194222
private volatile boolean movePrimaryFirst;
195223
private volatile ShardMovementStrategy shardMovementStrategy;
196224

@@ -204,6 +232,7 @@ public class BalancedShardsAllocator implements ShardsAllocator {
204232

205233
private volatile boolean ignoreThrottleInRestore;
206234
private volatile TimeValue allocatorTimeout;
235+
private volatile Priority followUpRerouteTaskPriority;
207236
private long startTime;
208237
private RerouteService rerouteService;
209238

@@ -223,6 +252,7 @@ public BalancedShardsAllocator(Settings settings, ClusterSettings clusterSetting
223252
setPreferPrimaryShardRebalance(PREFER_PRIMARY_SHARD_REBALANCE.get(settings));
224253
setShardMovementStrategy(SHARD_MOVEMENT_STRATEGY_SETTING.get(settings));
225254
setAllocatorTimeout(ALLOCATOR_TIMEOUT_SETTING.get(settings));
255+
setFollowUpRerouteTaskPriority(FOLLOW_UP_REROUTE_PRIORITY_SETTING.get(settings));
226256
clusterSettings.addSettingsUpdateConsumer(PREFER_PRIMARY_SHARD_BALANCE, this::setPreferPrimaryShardBalance);
227257
clusterSettings.addSettingsUpdateConsumer(SHARD_MOVE_PRIMARY_FIRST_SETTING, this::setMovePrimaryFirst);
228258
clusterSettings.addSettingsUpdateConsumer(SHARD_MOVEMENT_STRATEGY_SETTING, this::setShardMovementStrategy);
@@ -233,6 +263,7 @@ public BalancedShardsAllocator(Settings settings, ClusterSettings clusterSetting
233263
clusterSettings.addSettingsUpdateConsumer(THRESHOLD_SETTING, this::setThreshold);
234264
clusterSettings.addSettingsUpdateConsumer(IGNORE_THROTTLE_FOR_REMOTE_RESTORE, this::setIgnoreThrottleInRestore);
235265
clusterSettings.addSettingsUpdateConsumer(ALLOCATOR_TIMEOUT_SETTING, this::setAllocatorTimeout);
266+
clusterSettings.addSettingsUpdateConsumer(FOLLOW_UP_REROUTE_PRIORITY_SETTING, this::setFollowUpRerouteTaskPriority);
236267
}
237268

238269
@Override
@@ -321,6 +352,10 @@ private void setAllocatorTimeout(TimeValue allocatorTimeout) {
321352
this.allocatorTimeout = allocatorTimeout;
322353
}
323354

355+
private void setFollowUpRerouteTaskPriority(Priority followUpRerouteTaskPriority) {
356+
this.followUpRerouteTaskPriority = followUpRerouteTaskPriority;
357+
}
358+
324359
protected boolean allocatorTimedOut() {
325360
if (allocatorTimeout.equals(TimeValue.MINUS_ONE)) {
326361
if (logger.isTraceEnabled()) {
@@ -417,10 +452,13 @@ private void failAllocationOfNewPrimaries(RoutingAllocation allocation) {
417452

418453
private void scheduleRerouteIfAllocatorTimedOut() {
419454
if (allocatorTimedOut()) {
420-
assert rerouteService != null : "RerouteService not set to schedule reroute after allocator time out";
455+
if (rerouteService == null) {
456+
logger.info("RerouteService not set to schedule reroute after allocator time out");
457+
return;
458+
}
421459
rerouteService.reroute(
422460
"reroute after balanced shards allocator timed out",
423-
Priority.HIGH,
461+
followUpRerouteTaskPriority,
424462
ActionListener.wrap(
425463
r -> logger.trace("reroute after balanced shards allocator timed out completed"),
426464
e -> logger.debug("reroute after balanced shards allocator timed out failed", e)

server/src/main/java/org/opensearch/common/settings/ClusterSettings.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ public void apply(Settings value, Settings current, Settings previous) {
278278
BalancedShardsAllocator.THRESHOLD_SETTING,
279279
BalancedShardsAllocator.IGNORE_THROTTLE_FOR_REMOTE_RESTORE,
280280
BalancedShardsAllocator.ALLOCATOR_TIMEOUT_SETTING,
281+
BalancedShardsAllocator.FOLLOW_UP_REROUTE_PRIORITY_SETTING,
281282
BreakerSettings.CIRCUIT_BREAKER_LIMIT_SETTING,
282283
BreakerSettings.CIRCUIT_BREAKER_OVERHEAD_SETTING,
283284
BreakerSettings.CIRCUIT_BREAKER_TYPE,
@@ -355,6 +356,7 @@ public void apply(Settings value, Settings current, Settings previous) {
355356
ShardsBatchGatewayAllocator.GATEWAY_ALLOCATOR_BATCH_SIZE,
356357
ShardsBatchGatewayAllocator.PRIMARY_BATCH_ALLOCATOR_TIMEOUT_SETTING,
357358
ShardsBatchGatewayAllocator.REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING,
359+
ShardsBatchGatewayAllocator.FOLLOW_UP_REROUTE_PRIORITY_SETTING,
358360
PersistedClusterStateService.SLOW_WRITE_LOGGING_THRESHOLD,
359361
NetworkModule.HTTP_DEFAULT_TYPE_SETTING,
360362
NetworkModule.TRANSPORT_DEFAULT_TYPE_SETTING,

server/src/main/java/org/opensearch/gateway/ShardsBatchGatewayAllocator.java

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
import java.util.HashSet;
5454
import java.util.Iterator;
5555
import java.util.List;
56+
import java.util.Locale;
5657
import java.util.Map;
5758
import java.util.Objects;
5859
import java.util.Set;
@@ -82,6 +83,7 @@ public class ShardsBatchGatewayAllocator implements ExistingShardsAllocator {
8283

8384
private TimeValue primaryShardsBatchGatewayAllocatorTimeout;
8485
private TimeValue replicaShardsBatchGatewayAllocatorTimeout;
86+
private volatile Priority followUpRerouteTaskPriority;
8587
public static final TimeValue MIN_ALLOCATOR_TIMEOUT = TimeValue.timeValueSeconds(20);
8688
private final ClusterManagerMetrics clusterManagerMetrics;
8789

@@ -145,6 +147,32 @@ public void validate(TimeValue timeValue) {
145147
Setting.Property.Dynamic
146148
);
147149

150+
/**
151+
* Adjusts the priority of the followup reroute task when current round times out. NORMAL is right for reasonable clusters,
152+
* but for a cluster in a messed up state which is starving NORMAL priority tasks, it might be necessary to raise this higher
153+
* to allocate existing shards.
154+
*/
155+
public static final Setting<Priority> FOLLOW_UP_REROUTE_PRIORITY_SETTING = new Setting<>(
156+
"cluster.routing.allocation.shards_batch_gateway_allocator.schedule_reroute.priority",
157+
Priority.NORMAL.toString(),
158+
ShardsBatchGatewayAllocator::parseReroutePriority,
159+
Setting.Property.NodeScope,
160+
Setting.Property.Dynamic
161+
);
162+
163+
private static Priority parseReroutePriority(String priorityString) {
164+
final Priority priority = Priority.valueOf(priorityString.toUpperCase(Locale.ROOT));
165+
switch (priority) {
166+
case NORMAL:
167+
case HIGH:
168+
case URGENT:
169+
return priority;
170+
}
171+
throw new IllegalArgumentException(
172+
"priority [" + priority + "] not supported for [" + FOLLOW_UP_REROUTE_PRIORITY_SETTING.getKey() + "]"
173+
);
174+
}
175+
148176
private final RerouteService rerouteService;
149177
private final PrimaryShardBatchAllocator primaryShardBatchAllocator;
150178
private final ReplicaShardBatchAllocator replicaShardBatchAllocator;
@@ -179,6 +207,8 @@ public ShardsBatchGatewayAllocator(
179207
this.replicaShardsBatchGatewayAllocatorTimeout = REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING.get(settings);
180208
clusterSettings.addSettingsUpdateConsumer(REPLICA_BATCH_ALLOCATOR_TIMEOUT_SETTING, this::setReplicaBatchAllocatorTimeout);
181209
this.clusterManagerMetrics = clusterManagerMetrics;
210+
setFollowUpRerouteTaskPriority(FOLLOW_UP_REROUTE_PRIORITY_SETTING.get(settings));
211+
clusterSettings.addSettingsUpdateConsumer(FOLLOW_UP_REROUTE_PRIORITY_SETTING, this::setFollowUpRerouteTaskPriority);
182212
}
183213

184214
@Override
@@ -308,8 +338,8 @@ public void onComplete() {
308338
logger.trace("scheduling reroute after existing shards allocator timed out for primary shards");
309339
assert rerouteService != null;
310340
rerouteService.reroute(
311-
"reroute after existing shards allocator timed out",
312-
Priority.HIGH,
341+
"reroute after existing shards allocator [P] timed out",
342+
followUpRerouteTaskPriority,
313343
ActionListener.wrap(
314344
r -> logger.trace("reroute after existing shards allocator timed out completed"),
315345
e -> logger.debug("reroute after existing shards allocator timed out failed", e)
@@ -343,8 +373,8 @@ public void onComplete() {
343373
logger.trace("scheduling reroute after existing shards allocator timed out for replica shards");
344374
assert rerouteService != null;
345375
rerouteService.reroute(
346-
"reroute after existing shards allocator timed out",
347-
Priority.HIGH,
376+
"reroute after existing shards allocator [R] timed out",
377+
followUpRerouteTaskPriority,
348378
ActionListener.wrap(
349379
r -> logger.trace("reroute after existing shards allocator timed out completed"),
350380
e -> logger.debug("reroute after existing shards allocator timed out failed", e)
@@ -920,4 +950,8 @@ protected void setPrimaryBatchAllocatorTimeout(TimeValue primaryShardsBatchGatew
920950
protected void setReplicaBatchAllocatorTimeout(TimeValue replicaShardsBatchGatewayAllocatorTimeout) {
921951
this.replicaShardsBatchGatewayAllocatorTimeout = replicaShardsBatchGatewayAllocatorTimeout;
922952
}
953+
954+
protected void setFollowUpRerouteTaskPriority(Priority followUpRerouteTaskPriority) {
955+
this.followUpRerouteTaskPriority = followUpRerouteTaskPriority;
956+
}
923957
}

0 commit comments

Comments
 (0)