Skip to content

Commit 82b251a

Browse files
authored
Auto-expand replicas when adding or removing nodes (#30423)
Auto-expands replicas in the same cluster state update (instead of a follow-up reroute) where nodes are added or removed. Closes #1873, fixing an issue where nodes drop their copy of auto-expanded data when coming up, only to sync it again later.
1 parent 44c6dcf commit 82b251a

File tree

8 files changed

+162
-119
lines changed

8 files changed

+162
-119
lines changed

docs/CHANGELOG.asciidoc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,12 @@ Rollup::
183183
* Validate timezone in range queries to ensure they match the selected job when
184184
searching ({pull}30338[#30338])
185185

186+
187+
Allocation::
188+
189+
Auto-expand replicas when adding or removing nodes to prevent shard copies from
190+
being dropped and resynced when a data node rejoins the cluster ({pull}30423[#30423])
191+
186192
//[float]
187193
//=== Regressions
188194

server/src/main/java/org/elasticsearch/cluster/metadata/AutoExpandReplicas.java

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,23 +18,36 @@
1818
*/
1919
package org.elasticsearch.cluster.metadata;
2020

21+
import org.elasticsearch.cluster.node.DiscoveryNodes;
2122
import org.elasticsearch.common.Booleans;
2223
import org.elasticsearch.common.settings.Setting;
2324
import org.elasticsearch.common.settings.Setting.Property;
2425

26+
import java.util.ArrayList;
27+
import java.util.HashMap;
28+
import java.util.List;
29+
import java.util.Map;
30+
import java.util.Optional;
31+
2532
/**
2633
* This class acts as a functional wrapper around the {@code index.auto_expand_replicas} setting.
2734
* This setting or rather it's value is expanded into a min and max value which requires special handling
2835
* based on the number of datanodes in the cluster. This class handles all the parsing and streamlines the access to these values.
2936
*/
30-
final class AutoExpandReplicas {
37+
public final class AutoExpandReplicas {
3138
// the value we recognize in the "max" position to mean all the nodes
3239
private static final String ALL_NODES_VALUE = "all";
33-
public static final Setting<AutoExpandReplicas> SETTING = new Setting<>(IndexMetaData.SETTING_AUTO_EXPAND_REPLICAS, "false", (value) -> {
40+
41+
private static final AutoExpandReplicas FALSE_INSTANCE = new AutoExpandReplicas(0, 0, false);
42+
43+
public static final Setting<AutoExpandReplicas> SETTING = new Setting<>(IndexMetaData.SETTING_AUTO_EXPAND_REPLICAS, "false",
44+
AutoExpandReplicas::parse, Property.Dynamic, Property.IndexScope);
45+
46+
private static AutoExpandReplicas parse(String value) {
3447
final int min;
3548
final int max;
3649
if (Booleans.isFalse(value)) {
37-
return new AutoExpandReplicas(0, 0, false);
50+
return FALSE_INSTANCE;
3851
}
3952
final int dash = value.indexOf('-');
4053
if (-1 == dash) {
@@ -57,7 +70,7 @@ final class AutoExpandReplicas {
5770
}
5871
}
5972
return new AutoExpandReplicas(min, max, true);
60-
}, Property.Dynamic, Property.IndexScope);
73+
}
6174

6275
private final int minReplicas;
6376
private final int maxReplicas;
@@ -80,6 +93,24 @@ int getMaxReplicas(int numDataNodes) {
8093
return Math.min(maxReplicas, numDataNodes-1);
8194
}
8295

96+
Optional<Integer> getDesiredNumberOfReplicas(int numDataNodes) {
97+
if (enabled) {
98+
final int min = getMinReplicas();
99+
final int max = getMaxReplicas(numDataNodes);
100+
int numberOfReplicas = numDataNodes - 1;
101+
if (numberOfReplicas < min) {
102+
numberOfReplicas = min;
103+
} else if (numberOfReplicas > max) {
104+
numberOfReplicas = max;
105+
}
106+
107+
if (numberOfReplicas >= min && numberOfReplicas <= max) {
108+
return Optional.of(numberOfReplicas);
109+
}
110+
}
111+
return Optional.empty();
112+
}
113+
83114
@Override
84115
public String toString() {
85116
return enabled ? minReplicas + "-" + maxReplicas : "false";
@@ -88,6 +119,31 @@ public String toString() {
88119
boolean isEnabled() {
89120
return enabled;
90121
}
122+
123+
/**
124+
* Checks if the are replicas with the auto-expand feature that need to be adapted.
125+
* Returns a map of updates, which maps the indices to be updated to the desired number of replicas.
126+
* The map has the desired number of replicas as key and the indices to update as value, as this allows the result
127+
* of this method to be directly applied to RoutingTable.Builder#updateNumberOfReplicas.
128+
*/
129+
public static Map<Integer, List<String>> getAutoExpandReplicaChanges(MetaData metaData, DiscoveryNodes discoveryNodes) {
130+
// used for translating "all" to a number
131+
final int dataNodeCount = discoveryNodes.getDataNodes().size();
132+
133+
Map<Integer, List<String>> nrReplicasChanged = new HashMap<>();
134+
135+
for (final IndexMetaData indexMetaData : metaData) {
136+
if (indexMetaData.getState() != IndexMetaData.State.CLOSE) {
137+
AutoExpandReplicas autoExpandReplicas = SETTING.get(indexMetaData.getSettings());
138+
autoExpandReplicas.getDesiredNumberOfReplicas(dataNodeCount).ifPresent(numberOfReplicas -> {
139+
if (numberOfReplicas != indexMetaData.getNumberOfReplicas()) {
140+
nrReplicasChanged.computeIfAbsent(numberOfReplicas, ArrayList::new).add(indexMetaData.getIndex().getName());
141+
}
142+
});
143+
}
144+
}
145+
return nrReplicasChanged;
146+
}
91147
}
92148

93149

server/src/main/java/org/elasticsearch/cluster/metadata/MetaDataUpdateSettingsService.java

Lines changed: 1 addition & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,7 @@
2525
import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsClusterStateUpdateRequest;
2626
import org.elasticsearch.action.admin.indices.upgrade.post.UpgradeSettingsClusterStateUpdateRequest;
2727
import org.elasticsearch.cluster.AckedClusterStateUpdateTask;
28-
import org.elasticsearch.cluster.ClusterChangedEvent;
2928
import org.elasticsearch.cluster.ClusterState;
30-
import org.elasticsearch.cluster.ClusterStateListener;
3129
import org.elasticsearch.cluster.ack.ClusterStateUpdateResponse;
3230
import org.elasticsearch.cluster.block.ClusterBlock;
3331
import org.elasticsearch.cluster.block.ClusterBlocks;
@@ -42,16 +40,12 @@
4240
import org.elasticsearch.common.settings.IndexScopedSettings;
4341
import org.elasticsearch.common.settings.Setting;
4442
import org.elasticsearch.common.settings.Settings;
45-
import org.elasticsearch.common.unit.TimeValue;
4643
import org.elasticsearch.index.Index;
4744
import org.elasticsearch.indices.IndicesService;
4845
import org.elasticsearch.threadpool.ThreadPool;
4946

5047
import java.io.IOException;
51-
import java.util.ArrayList;
52-
import java.util.HashMap;
5348
import java.util.HashSet;
54-
import java.util.List;
5549
import java.util.Locale;
5650
import java.util.Map;
5751
import java.util.Set;
@@ -61,7 +55,7 @@
6155
/**
6256
* Service responsible for submitting update index settings requests
6357
*/
64-
public class MetaDataUpdateSettingsService extends AbstractComponent implements ClusterStateListener {
58+
public class MetaDataUpdateSettingsService extends AbstractComponent {
6559

6660
private final ClusterService clusterService;
6761

@@ -77,87 +71,11 @@ public MetaDataUpdateSettingsService(Settings settings, ClusterService clusterSe
7771
super(settings);
7872
this.clusterService = clusterService;
7973
this.threadPool = threadPool;
80-
this.clusterService.addListener(this);
8174
this.allocationService = allocationService;
8275
this.indexScopedSettings = indexScopedSettings;
8376
this.indicesService = indicesService;
8477
}
8578

86-
@Override
87-
public void clusterChanged(ClusterChangedEvent event) {
88-
// update an index with number of replicas based on data nodes if possible
89-
if (!event.state().nodes().isLocalNodeElectedMaster()) {
90-
return;
91-
}
92-
// we will want to know this for translating "all" to a number
93-
final int dataNodeCount = event.state().nodes().getDataNodes().size();
94-
95-
Map<Integer, List<Index>> nrReplicasChanged = new HashMap<>();
96-
// we need to do this each time in case it was changed by update settings
97-
for (final IndexMetaData indexMetaData : event.state().metaData()) {
98-
AutoExpandReplicas autoExpandReplicas = IndexMetaData.INDEX_AUTO_EXPAND_REPLICAS_SETTING.get(indexMetaData.getSettings());
99-
if (autoExpandReplicas.isEnabled()) {
100-
/*
101-
* we have to expand the number of replicas for this index to at least min and at most max nodes here
102-
* so we are bumping it up if we have to or reduce it depending on min/max and the number of datanodes.
103-
* If we change the number of replicas we just let the shard allocator do it's thing once we updated it
104-
* since it goes through the index metadata to figure out if something needs to be done anyway. Do do that
105-
* we issue a cluster settings update command below and kicks off a reroute.
106-
*/
107-
final int min = autoExpandReplicas.getMinReplicas();
108-
final int max = autoExpandReplicas.getMaxReplicas(dataNodeCount);
109-
int numberOfReplicas = dataNodeCount - 1;
110-
if (numberOfReplicas < min) {
111-
numberOfReplicas = min;
112-
} else if (numberOfReplicas > max) {
113-
numberOfReplicas = max;
114-
}
115-
// same value, nothing to do there
116-
if (numberOfReplicas == indexMetaData.getNumberOfReplicas()) {
117-
continue;
118-
}
119-
120-
if (numberOfReplicas >= min && numberOfReplicas <= max) {
121-
122-
if (!nrReplicasChanged.containsKey(numberOfReplicas)) {
123-
nrReplicasChanged.put(numberOfReplicas, new ArrayList<>());
124-
}
125-
126-
nrReplicasChanged.get(numberOfReplicas).add(indexMetaData.getIndex());
127-
}
128-
}
129-
}
130-
131-
if (nrReplicasChanged.size() > 0) {
132-
// update settings and kick of a reroute (implicit) for them to take effect
133-
for (final Integer fNumberOfReplicas : nrReplicasChanged.keySet()) {
134-
Settings settings = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, fNumberOfReplicas).build();
135-
final List<Index> indices = nrReplicasChanged.get(fNumberOfReplicas);
136-
137-
UpdateSettingsClusterStateUpdateRequest updateRequest = new UpdateSettingsClusterStateUpdateRequest()
138-
.indices(indices.toArray(new Index[indices.size()])).settings(settings)
139-
.ackTimeout(TimeValue.timeValueMillis(0)) //no need to wait for ack here
140-
.masterNodeTimeout(TimeValue.timeValueMinutes(10));
141-
142-
updateSettings(updateRequest, new ActionListener<ClusterStateUpdateResponse>() {
143-
@Override
144-
public void onResponse(ClusterStateUpdateResponse response) {
145-
for (Index index : indices) {
146-
logger.info("{} auto expanded replicas to [{}]", index, fNumberOfReplicas);
147-
}
148-
}
149-
150-
@Override
151-
public void onFailure(Exception t) {
152-
for (Index index : indices) {
153-
logger.warn("{} fail to auto expand replicas to [{}]", index, fNumberOfReplicas);
154-
}
155-
}
156-
});
157-
}
158-
}
159-
}
160-
16179
public void updateSettings(final UpdateSettingsClusterStateUpdateRequest request, final ActionListener<ClusterStateUpdateResponse> listener) {
16280
final Settings normalizedSettings = Settings.builder().put(request.settings()).normalizePrefix(IndexMetaData.INDEX_SETTING_PREFIX).build();
16381
Settings.Builder settingsForClosedIndices = Settings.builder();

server/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.elasticsearch.cluster.RestoreInProgress;
2626
import org.elasticsearch.cluster.health.ClusterHealthStatus;
2727
import org.elasticsearch.cluster.health.ClusterStateHealth;
28+
import org.elasticsearch.cluster.metadata.AutoExpandReplicas;
2829
import org.elasticsearch.cluster.metadata.IndexMetaData;
2930
import org.elasticsearch.cluster.metadata.MetaData;
3031
import org.elasticsearch.cluster.routing.RoutingNode;
@@ -46,6 +47,7 @@
4647
import java.util.Comparator;
4748
import java.util.Iterator;
4849
import java.util.List;
50+
import java.util.Map;
4951
import java.util.function.Function;
5052
import java.util.stream.Collectors;
5153

@@ -206,11 +208,12 @@ public ClusterState applyFailedShards(final ClusterState clusterState, final Lis
206208
* unassigned an shards that are associated with nodes that are no longer part of the cluster, potentially promoting replicas
207209
* if needed.
208210
*/
209-
public ClusterState deassociateDeadNodes(final ClusterState clusterState, boolean reroute, String reason) {
210-
RoutingNodes routingNodes = getMutableRoutingNodes(clusterState);
211+
public ClusterState deassociateDeadNodes(ClusterState clusterState, boolean reroute, String reason) {
212+
ClusterState fixedClusterState = adaptAutoExpandReplicas(clusterState);
213+
RoutingNodes routingNodes = getMutableRoutingNodes(fixedClusterState);
211214
// shuffle the unassigned nodes, just so we won't have things like poison failed shards
212215
routingNodes.unassigned().shuffle();
213-
RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, clusterState,
216+
RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, fixedClusterState,
214217
clusterInfoService.getClusterInfo(), currentNanoTime());
215218

216219
// first, clear from the shards any node id they used to belong to that is now dead
@@ -220,12 +223,40 @@ public ClusterState deassociateDeadNodes(final ClusterState clusterState, boolea
220223
reroute(allocation);
221224
}
222225

223-
if (allocation.routingNodesChanged() == false) {
226+
if (fixedClusterState == clusterState && allocation.routingNodesChanged() == false) {
224227
return clusterState;
225228
}
226229
return buildResultAndLogHealthChange(clusterState, allocation, reason);
227230
}
228231

232+
/**
233+
* Checks if the are replicas with the auto-expand feature that need to be adapted.
234+
* Returns an updated cluster state if changes were necessary, or the identical cluster if no changes were required.
235+
*/
236+
private ClusterState adaptAutoExpandReplicas(ClusterState clusterState) {
237+
final Map<Integer, List<String>> autoExpandReplicaChanges =
238+
AutoExpandReplicas.getAutoExpandReplicaChanges(clusterState.metaData(), clusterState.nodes());
239+
if (autoExpandReplicaChanges.isEmpty()) {
240+
return clusterState;
241+
} else {
242+
final RoutingTable.Builder routingTableBuilder = RoutingTable.builder(clusterState.routingTable());
243+
final MetaData.Builder metaDataBuilder = MetaData.builder(clusterState.metaData());
244+
for (Map.Entry<Integer, List<String>> entry : autoExpandReplicaChanges.entrySet()) {
245+
final int numberOfReplicas = entry.getKey();
246+
final String[] indices = entry.getValue().toArray(new String[entry.getValue().size()]);
247+
// we do *not* update the in sync allocation ids as they will be removed upon the first index
248+
// operation which make these copies stale
249+
routingTableBuilder.updateNumberOfReplicas(numberOfReplicas, indices);
250+
metaDataBuilder.updateNumberOfReplicas(numberOfReplicas, indices);
251+
logger.info("updating number_of_replicas to [{}] for indices {}", numberOfReplicas, indices);
252+
}
253+
final ClusterState fixedState = ClusterState.builder(clusterState).routingTable(routingTableBuilder.build())
254+
.metaData(metaDataBuilder).build();
255+
assert AutoExpandReplicas.getAutoExpandReplicaChanges(fixedState.metaData(), fixedState.nodes()).isEmpty();
256+
return fixedState;
257+
}
258+
}
259+
229260
/**
230261
* Removes delay markers from unassigned shards based on current time stamp.
231262
*/
@@ -301,6 +332,7 @@ public CommandsResult reroute(final ClusterState clusterState, AllocationCommand
301332
if (retryFailed) {
302333
resetFailedAllocationCounter(allocation);
303334
}
335+
304336
reroute(allocation);
305337
return new CommandsResult(explanations, buildResultAndLogHealthChange(clusterState, allocation, "reroute commands"));
306338
}
@@ -320,15 +352,17 @@ public ClusterState reroute(ClusterState clusterState, String reason) {
320352
* <p>
321353
* If the same instance of ClusterState is returned, then no change has been made.
322354
*/
323-
protected ClusterState reroute(final ClusterState clusterState, String reason, boolean debug) {
324-
RoutingNodes routingNodes = getMutableRoutingNodes(clusterState);
355+
protected ClusterState reroute(ClusterState clusterState, String reason, boolean debug) {
356+
ClusterState fixedClusterState = adaptAutoExpandReplicas(clusterState);
357+
358+
RoutingNodes routingNodes = getMutableRoutingNodes(fixedClusterState);
325359
// shuffle the unassigned nodes, just so we won't have things like poison failed shards
326360
routingNodes.unassigned().shuffle();
327-
RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, clusterState,
361+
RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, fixedClusterState,
328362
clusterInfoService.getClusterInfo(), currentNanoTime());
329363
allocation.debugDecision(debug);
330364
reroute(allocation);
331-
if (allocation.routingNodesChanged() == false) {
365+
if (fixedClusterState == clusterState && allocation.routingNodesChanged() == false) {
332366
return clusterState;
333367
}
334368
return buildResultAndLogHealthChange(clusterState, allocation, reason);
@@ -353,6 +387,8 @@ private boolean hasDeadNodes(RoutingAllocation allocation) {
353387

354388
private void reroute(RoutingAllocation allocation) {
355389
assert hasDeadNodes(allocation) == false : "dead nodes should be explicitly cleaned up. See deassociateDeadNodes";
390+
assert AutoExpandReplicas.getAutoExpandReplicaChanges(allocation.metaData(), allocation.nodes()).isEmpty() :
391+
"auto-expand replicas out of sync with number of nodes in the cluster";
356392

357393
// now allocate all the unassigned to available nodes
358394
if (allocation.routingNodes().unassigned().size() > 0) {

0 commit comments

Comments
 (0)