2525import org .elasticsearch .cluster .RestoreInProgress ;
2626import org .elasticsearch .cluster .health .ClusterHealthStatus ;
2727import org .elasticsearch .cluster .health .ClusterStateHealth ;
28+ import org .elasticsearch .cluster .metadata .AutoExpandReplicas ;
2829import org .elasticsearch .cluster .metadata .IndexMetaData ;
2930import org .elasticsearch .cluster .metadata .MetaData ;
3031import org .elasticsearch .cluster .routing .RoutingNode ;
4647import java .util .Comparator ;
4748import java .util .Iterator ;
4849import java .util .List ;
50+ import java .util .Map ;
4951import java .util .function .Function ;
5052import java .util .stream .Collectors ;
5153
@@ -206,11 +208,12 @@ public ClusterState applyFailedShards(final ClusterState clusterState, final Lis
206208 * unassigns any shards that are associated with nodes that are no longer part of the cluster, potentially promoting replicas
207209 * if needed.
208210 */
209- public ClusterState deassociateDeadNodes (final ClusterState clusterState , boolean reroute , String reason ) {
210- RoutingNodes routingNodes = getMutableRoutingNodes (clusterState );
211+ public ClusterState deassociateDeadNodes (ClusterState clusterState , boolean reroute , String reason ) {
212+ ClusterState fixedClusterState = adaptAutoExpandReplicas (clusterState );
213+ RoutingNodes routingNodes = getMutableRoutingNodes (fixedClusterState );
211214 // shuffle the unassigned shards, just so we won't have things like poison failed shards
212215 routingNodes .unassigned ().shuffle ();
213- RoutingAllocation allocation = new RoutingAllocation (allocationDeciders , routingNodes , clusterState ,
216+ RoutingAllocation allocation = new RoutingAllocation (allocationDeciders , routingNodes , fixedClusterState ,
214217 clusterInfoService .getClusterInfo (), currentNanoTime ());
215218
216219 // first, clear from the shards any node id they used to belong to that is now dead
@@ -220,12 +223,40 @@ public ClusterState deassociateDeadNodes(final ClusterState clusterState, boolea
220223 reroute (allocation );
221224 }
222225
223- if (allocation .routingNodesChanged () == false ) {
226+ if (fixedClusterState == clusterState && allocation .routingNodesChanged () == false ) {
224227 return clusterState ;
225228 }
226229 return buildResultAndLogHealthChange (clusterState , allocation , reason );
227230 }
228231
232+ /**
233+ * Checks if there are replicas with the auto-expand feature that need to be adapted.
234+ * Returns an updated cluster state if changes were necessary, or the identical cluster state instance if no changes were required.
235+ */
236+ private ClusterState adaptAutoExpandReplicas (ClusterState clusterState ) {
237+ final Map <Integer , List <String >> autoExpandReplicaChanges =
238+ AutoExpandReplicas .getAutoExpandReplicaChanges (clusterState .metaData (), clusterState .nodes ());
239+ if (autoExpandReplicaChanges .isEmpty ()) {
240+ return clusterState ;
241+ } else {
242+ final RoutingTable .Builder routingTableBuilder = RoutingTable .builder (clusterState .routingTable ());
243+ final MetaData .Builder metaDataBuilder = MetaData .builder (clusterState .metaData ());
244+ for (Map .Entry <Integer , List <String >> entry : autoExpandReplicaChanges .entrySet ()) {
245+ final int numberOfReplicas = entry .getKey ();
246+ final String [] indices = entry .getValue ().toArray (new String [entry .getValue ().size ()]);
247+ // we do *not* update the in sync allocation ids as they will be removed upon the first index
248+ // operation which makes these copies stale
249+ routingTableBuilder .updateNumberOfReplicas (numberOfReplicas , indices );
250+ metaDataBuilder .updateNumberOfReplicas (numberOfReplicas , indices );
251+ logger .info ("updating number_of_replicas to [{}] for indices {}" , numberOfReplicas , indices );
252+ }
253+ final ClusterState fixedState = ClusterState .builder (clusterState ).routingTable (routingTableBuilder .build ())
254+ .metaData (metaDataBuilder ).build ();
255+ assert AutoExpandReplicas .getAutoExpandReplicaChanges (fixedState .metaData (), fixedState .nodes ()).isEmpty ();
256+ return fixedState ;
257+ }
258+ }
259+
229260 /**
230261 * Removes delay markers from unassigned shards based on current time stamp.
231262 */
@@ -301,6 +332,7 @@ public CommandsResult reroute(final ClusterState clusterState, AllocationCommand
301332 if (retryFailed ) {
302333 resetFailedAllocationCounter (allocation );
303334 }
335+
304336 reroute (allocation );
305337 return new CommandsResult (explanations , buildResultAndLogHealthChange (clusterState , allocation , "reroute commands" ));
306338 }
@@ -320,15 +352,17 @@ public ClusterState reroute(ClusterState clusterState, String reason) {
320352 * <p>
321353 * If the same instance of ClusterState is returned, then no change has been made.
322354 */
323- protected ClusterState reroute (final ClusterState clusterState , String reason , boolean debug ) {
324- RoutingNodes routingNodes = getMutableRoutingNodes (clusterState );
355+ protected ClusterState reroute (ClusterState clusterState , String reason , boolean debug ) {
356+ ClusterState fixedClusterState = adaptAutoExpandReplicas (clusterState );
357+
358+ RoutingNodes routingNodes = getMutableRoutingNodes (fixedClusterState );
325359 // shuffle the unassigned shards, just so we won't have things like poison failed shards
326360 routingNodes .unassigned ().shuffle ();
327- RoutingAllocation allocation = new RoutingAllocation (allocationDeciders , routingNodes , clusterState ,
361+ RoutingAllocation allocation = new RoutingAllocation (allocationDeciders , routingNodes , fixedClusterState ,
328362 clusterInfoService .getClusterInfo (), currentNanoTime ());
329363 allocation .debugDecision (debug );
330364 reroute (allocation );
331- if (allocation .routingNodesChanged () == false ) {
365+ if (fixedClusterState == clusterState && allocation .routingNodesChanged () == false ) {
332366 return clusterState ;
333367 }
334368 return buildResultAndLogHealthChange (clusterState , allocation , reason );
@@ -353,6 +387,8 @@ private boolean hasDeadNodes(RoutingAllocation allocation) {
353387
354388 private void reroute (RoutingAllocation allocation ) {
355389 assert hasDeadNodes (allocation ) == false : "dead nodes should be explicitly cleaned up. See deassociateDeadNodes" ;
390+ assert AutoExpandReplicas .getAutoExpandReplicaChanges (allocation .metaData (), allocation .nodes ()).isEmpty () :
391+ "auto-expand replicas out of sync with number of nodes in the cluster" ;
356392
357393 // now allocate all the unassigned to available nodes
358394 if (allocation .routingNodes ().unassigned ().size () > 0 ) {
0 commit comments