Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
private long maxTermSeen;
private final Reconfigurator reconfigurator;
private final ClusterBootstrapService clusterBootstrapService;
private final LagDetector lagDetector;

private Mode mode;
private Optional<DiscoveryNode> lastKnownLeader;
Expand Down Expand Up @@ -153,6 +154,7 @@ public Coordinator(String nodeName, Settings settings, ClusterSettings clusterSe
masterService.setClusterStateSupplier(this::getStateForMasterService);
this.reconfigurator = new Reconfigurator(settings, clusterSettings);
this.clusterBootstrapService = new ClusterBootstrapService(settings, transportService);
this.lagDetector = new LagDetector(settings, transportService.getThreadPool(), n -> removeNode(n, "lagging"));
}

private Runnable getOnLeaderFailure() {
Expand Down Expand Up @@ -370,6 +372,7 @@ void becomeCandidate(String method) {

followersChecker.clearCurrentNodes();
followersChecker.updateFastResponseState(getCurrentTerm(), mode);
lagDetector.clearTrackedNodes();

if (applierState.nodes().getMasterNodeId() != null) {
applierState = clusterStateWithNoMasterBlock(applierState);
Expand Down Expand Up @@ -424,6 +427,7 @@ void becomeFollower(String method, DiscoveryNode leaderNode) {

followersChecker.clearCurrentNodes();
followersChecker.updateFastResponseState(getCurrentTerm(), mode);
lagDetector.clearTrackedNodes();
}

private PreVoteResponse getPreVoteResponse() {
Expand Down Expand Up @@ -508,6 +512,11 @@ public void invariant() {
assert (applierState.nodes().getMasterNodeId() == null) == applierState.blocks().hasGlobalBlock(NO_MASTER_BLOCK_WRITES.id());
assert preVoteCollector.getPreVoteResponse().equals(getPreVoteResponse())
: preVoteCollector + " vs " + getPreVoteResponse();
{
final Set<DiscoveryNode> lagDetectorTrackedNodes = new HashSet<>(lagDetector.getTrackedNodes());
assert lagDetectorTrackedNodes.isEmpty() || lagDetectorTrackedNodes.remove(getLocalNode());
assert followersChecker.getKnownFollowers().equals(lagDetectorTrackedNodes);
}
if (mode == Mode.LEADER) {
final boolean becomingMaster = getStateForMasterService().term() != getCurrentTerm();

Expand Down Expand Up @@ -822,8 +831,10 @@ public String toString() {
}
});

leaderChecker.setCurrentNodes(publishRequest.getAcceptedState().nodes());
followersChecker.setCurrentNodes(publishRequest.getAcceptedState().nodes());
final DiscoveryNodes publishNodes = publishRequest.getAcceptedState().nodes();
leaderChecker.setCurrentNodes(publishNodes);
followersChecker.setCurrentNodes(publishNodes);
lagDetector.setTrackedNodes(publishNodes);
publication.start(followersChecker.getFaultyNodes());
}
} catch (Exception e) {
Expand Down Expand Up @@ -979,7 +990,7 @@ public void onNodeAck(DiscoveryNode node, Exception e) {
}
}
},
transportService.getThreadPool()::relativeTimeInMillis);
transportService.getThreadPool()::relativeTimeInMillis, lagDetector::setAppliedVersion);
this.publishRequest = publishRequest;
this.publicationContext = publicationContext;
this.localNodeAckEvent = localNodeAckEvent;
Expand Down Expand Up @@ -1042,6 +1053,7 @@ public void onSuccess(String source) {
if (mode == Mode.LEADER) {
scheduleReconfigurationIfNeeded();
}
lagDetector.startLagDetector(publishRequest.getAcceptedState().version());
}
ackListener.onNodeAck(getLocalNode(), null);
publishListener.onResponse(null);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.coordination;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPool.Names;

import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;

import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;

/**
* A publication can succeed and complete before all nodes have applied the published state and acknowledged it; however we need every node
* eventually either to apply the published state (or a later state) or be removed from the cluster. This component achieves this by
* removing any lagging nodes from the cluster after a timeout.
*/
public class LagDetector {

private static final Logger logger = LogManager.getLogger(LagDetector.class);

// the timeout for each node to apply a value after the end of publication
public static final Setting<TimeValue> CLUSTER_STATE_APPLICATION_TIMEOUT_SETTING =
Setting.timeSetting("cluster.applier.timeout",
TimeValue.timeValueMillis(60000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);

private final TimeValue clusterStateApplicationTimeout;
private final Consumer<DiscoveryNode> onLagDetected;
private final ThreadPool threadPool;
private final Map<DiscoveryNode, NodeAppliedStateTracker> appliedStateTrackersByNode = newConcurrentMap();

public LagDetector(final Settings settings, final ThreadPool threadPool, final Consumer<DiscoveryNode> onLagDetected) {
this.threadPool = threadPool;
this.clusterStateApplicationTimeout = CLUSTER_STATE_APPLICATION_TIMEOUT_SETTING.get(settings);
this.onLagDetected = onLagDetected;
}

public void setTrackedNodes(final Iterable<DiscoveryNode> discoveryNodes) {
final Set<DiscoveryNode> discoveryNodeSet = new HashSet<>();
discoveryNodes.forEach(discoveryNodeSet::add);
appliedStateTrackersByNode.keySet().retainAll(discoveryNodeSet);
discoveryNodeSet.forEach(node -> appliedStateTrackersByNode.putIfAbsent(node, new NodeAppliedStateTracker(node)));
}

public void clearTrackedNodes() {
appliedStateTrackersByNode.clear();
}

public void setAppliedVersion(final DiscoveryNode discoveryNode, final long appliedVersion) {
final NodeAppliedStateTracker nodeAppliedStateTracker = appliedStateTrackersByNode.get(discoveryNode);
if (nodeAppliedStateTracker == null) {
logger.trace("node {} applied version {} but this node's version is not being tracked", discoveryNode, appliedVersion);
} else {
nodeAppliedStateTracker.increaseAppliedVersion(appliedVersion);
}
}

public void startLagDetector(final long version) {
appliedStateTrackersByNode.values().forEach(nodeAppliedStateTracker -> nodeAppliedStateTracker.startLagDetector(version));
}

@Override
public String toString() {
return "LagDetector{" +
"clusterStateApplicationTimeout=" + clusterStateApplicationTimeout +
", appliedStateTrackersByNode=" + appliedStateTrackersByNode.values() +
'}';
}

// for assertions
Set<DiscoveryNode> getTrackedNodes() {
return Collections.unmodifiableSet(appliedStateTrackersByNode.keySet());
}

private class NodeAppliedStateTracker {
private final DiscoveryNode discoveryNode;
private final AtomicLong appliedVersion = new AtomicLong();

NodeAppliedStateTracker(final DiscoveryNode discoveryNode) {
this.discoveryNode = discoveryNode;
}

void increaseAppliedVersion(long appliedVersion) {
long maxAppliedVersion = this.appliedVersion.updateAndGet(v -> Math.max(v, appliedVersion));
logger.trace("{} applied version {}, max now {}", this, appliedVersion, maxAppliedVersion);
}

@Override
public String toString() {
return "NodeAppliedStateTracker{" +
"discoveryNode=" + discoveryNode +
", appliedVersion=" + appliedVersion +
'}';
}

void startLagDetector(final long version) {
final long appliedVersionWhenStarted = appliedVersion.get();
if (version <= appliedVersionWhenStarted) {
logger.trace("lag detection for {} for version {} unnecessary, node has already applied version {}",
discoveryNode, version, appliedVersionWhenStarted);
return;
}

threadPool.schedule(clusterStateApplicationTimeout, Names.GENERIC, new Runnable() {
@Override
public void run() {
if (appliedStateTrackersByNode.get(discoveryNode) != NodeAppliedStateTracker.this) {
logger.trace("{}, no longer active", this);
return;
}

long appliedVersion = NodeAppliedStateTracker.this.appliedVersion.get();
if (version <= appliedVersion) {
logger.trace("{}, satisfied, node applied version {}", this, appliedVersion);
return;
}

logger.debug("{}, detected lag, node has only applied version {}", this, appliedVersion);
onLagDetected.accept(discoveryNode);
}

@Override
public String toString() {
return "lag detection for " + discoveryNode + " started at version " + appliedVersionWhenStarted
+ ", expected version " + version;
}
});
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
import java.util.Optional;
import java.util.Set;
import java.util.function.LongSupplier;
import java.util.function.ObjLongConsumer;

public abstract class Publication {

Expand All @@ -46,16 +47,19 @@ public abstract class Publication {
private final AckListener ackListener;
private final LongSupplier currentTimeSupplier;
private final long startTime;
private final ObjLongConsumer<DiscoveryNode> onNodeApplicationAck;

private Optional<ApplyCommitRequest> applyCommitRequest; // set when state is committed
private boolean isCompleted; // set when publication is completed
private boolean timedOut; // set when publication timed out

public Publication(PublishRequest publishRequest, AckListener ackListener, LongSupplier currentTimeSupplier) {
public Publication(PublishRequest publishRequest, AckListener ackListener, LongSupplier currentTimeSupplier,
ObjLongConsumer<DiscoveryNode> onNodeApplicationAck) {
this.publishRequest = publishRequest;
this.ackListener = ackListener;
this.currentTimeSupplier = currentTimeSupplier;
startTime = currentTimeSupplier.getAsLong();
this.onNodeApplicationAck = onNodeApplicationAck;
applyCommitRequest = Optional.empty();
publicationTargets = new ArrayList<>(publishRequest.getAcceptedState().getNodes().getNodes().size());
publishRequest.getAcceptedState().getNodes().iterator().forEachRemaining(n -> publicationTargets.add(new PublicationTarget(n)));
Expand Down Expand Up @@ -251,6 +255,7 @@ void sendApplyCommit() {
void setAppliedCommit() {
assert state == PublicationTargetState.SENT_APPLY_COMMIT : state + " -> " + PublicationTargetState.APPLIED_COMMIT;
state = PublicationTargetState.APPLIED_COMMIT;
onNodeApplicationAck.accept(discoveryNode, publishRequest.getAcceptedState().version());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we can avoid to hook this in here, and whether there's a way to do this in Coordinator. We call ackOnce(null) here, which in turn calls AckListener.onNodeAck(DiscoveryNode, @Nullable Exception). We already hook into that acklistener in Coordinator, so we could also get those events there. And we also know the version of the cluster state we're publishing in Coordinator.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And we also know the version of the cluster state we're publishing in Coordinator.

How do we know that? We start the lag detector after clearing currentPublication, and another publication could then start. It wouldn't be right to detect lag based on a newer publication.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My suggestion is something like the following (I had to revert the not-null assertion, because I think we don't have that guarantee, and CoordinatorTests were failing):

diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java
index f132998d6ba..64e9310402c 100644
--- a/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java
+++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Coordinator.java
@@ -985,6 +985,9 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
 
                     @Override
                     public void onNodeAck(DiscoveryNode node, Exception e) {
+                        if (e == null) {
+                            lagDetector.setAppliedVersion(node, publishRequest.getAcceptedState().version());
+                        }
                         // acking and cluster state application for local node is handled specially
                         if (node.equals(getLocalNode())) {
                             synchronized (mutex) {
@@ -999,7 +1002,7 @@ public class Coordinator extends AbstractLifecycleComponent implements Discovery
                         }
                     }
                 },
-                transportService.getThreadPool()::relativeTimeInMillis, lagDetector::setAppliedVersion);
+                transportService.getThreadPool()::relativeTimeInMillis);
             this.publishRequest = publishRequest;
             this.publicationContext = publicationContext;
             this.localNodeAckEvent = localNodeAckEvent;
diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/LagDetector.java b/server/src/main/java/org/elasticsearch/cluster/coordination/LagDetector.java
index 3180913a012..ea52ec95673 100644
--- a/server/src/main/java/org/elasticsearch/cluster/coordination/LagDetector.java
+++ b/server/src/main/java/org/elasticsearch/cluster/coordination/LagDetector.java
@@ -85,8 +85,11 @@ public class LagDetector {
         }
 
         final NodeAppliedStateTracker nodeAppliedStateTracker = appliedStateTrackersByNode.get(discoveryNode);
-        assert nodeAppliedStateTracker != null : "untracked node " + discoveryNode + " applied version " + appliedVersion;
-        nodeAppliedStateTracker.increaseAppliedVersion(appliedVersion);
+        if (nodeAppliedStateTracker == null) {
+            logger.trace("node {} applied version {} but this node's version is not being tracked", discoveryNode, appliedVersion);
+        } else {
+            nodeAppliedStateTracker.increaseAppliedVersion(appliedVersion);
+        }
     }
 
     public void startLagDetector(final long version) {
diff --git a/server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java b/server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java
index a602750bba8..9ec8d562b81 100644
--- a/server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java
+++ b/server/src/main/java/org/elasticsearch/cluster/coordination/Publication.java
@@ -36,7 +36,6 @@ import java.util.List;
 import java.util.Optional;
 import java.util.Set;
 import java.util.function.LongSupplier;
-import java.util.function.ObjLongConsumer;
 
 public abstract class Publication {
 
@@ -47,19 +46,16 @@ public abstract class Publication {
     private final AckListener ackListener;
     private final LongSupplier currentTimeSupplier;
     private final long startTime;
-    private final ObjLongConsumer<DiscoveryNode> onNodeApplicationAck;
 
     private Optional<ApplyCommitRequest> applyCommitRequest; // set when state is committed
     private boolean isCompleted; // set when publication is completed
     private boolean timedOut; // set when publication timed out
 
-    public Publication(PublishRequest publishRequest, AckListener ackListener, LongSupplier currentTimeSupplier,
-                       ObjLongConsumer<DiscoveryNode> onNodeApplicationAck) {
+    public Publication(PublishRequest publishRequest, AckListener ackListener, LongSupplier currentTimeSupplier) {
         this.publishRequest = publishRequest;
         this.ackListener = ackListener;
         this.currentTimeSupplier = currentTimeSupplier;
         startTime = currentTimeSupplier.getAsLong();
-        this.onNodeApplicationAck = onNodeApplicationAck;
         applyCommitRequest = Optional.empty();
         publicationTargets = new ArrayList<>(publishRequest.getAcceptedState().getNodes().getNodes().size());
         publishRequest.getAcceptedState().getNodes().iterator().forEachRemaining(n -> publicationTargets.add(new PublicationTarget(n)));
@@ -255,7 +251,6 @@ public abstract class Publication {
         void setAppliedCommit() {
             assert state == PublicationTargetState.SENT_APPLY_COMMIT : state + " -> " + PublicationTargetState.APPLIED_COMMIT;
             state = PublicationTargetState.APPLIED_COMMIT;
-            onNodeApplicationAck.accept(discoveryNode, publishRequest.getAcceptedState().version());
             ackOnce(null);
         }
 
diff --git a/server/src/test/java/org/elasticsearch/cluster/coordination/PublicationTests.java b/server/src/test/java/org/elasticsearch/cluster/coordination/PublicationTests.java
index e17c77ce6dc..914ee1e95f7 100644
--- a/server/src/test/java/org/elasticsearch/cluster/coordination/PublicationTests.java
+++ b/server/src/test/java/org/elasticsearch/cluster/coordination/PublicationTests.java
@@ -103,8 +103,7 @@ public class PublicationTests extends ESTestCase {
         Set<DiscoveryNode> missingJoins = new HashSet<>();
 
         MockPublication(PublishRequest publishRequest, Discovery.AckListener ackListener, LongSupplier currentTimeSupplier) {
-            super(publishRequest, ackListener, currentTimeSupplier, (n, l) -> {
-            });
+            super(publishRequest, ackListener, currentTimeSupplier);
             this.publishRequest = publishRequest;
         }

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, ok, I did that in 63b21ee.

ackOnce(null);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import org.elasticsearch.cluster.NodeConnectionsService;
import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
import org.elasticsearch.cluster.coordination.ClusterBootstrapService;
import org.elasticsearch.cluster.coordination.LagDetector;
import org.elasticsearch.cluster.coordination.Coordinator;
import org.elasticsearch.cluster.coordination.ElectionSchedulerFactory;
import org.elasticsearch.cluster.coordination.JoinHelper;
Expand Down Expand Up @@ -461,7 +462,8 @@ public void apply(Settings value, Settings current, Settings previous) {
JoinHelper.JOIN_TIMEOUT_SETTING,
Reconfigurator.CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION,
TransportAddVotingTombstonesAction.MAXIMUM_VOTING_TOMBSTONES_SETTING,
ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING
ClusterBootstrapService.INITIAL_MASTER_NODE_COUNT_SETTING,
LagDetector.CLUSTER_STATE_APPLICATION_TIMEOUT_SETTING
)));

public static List<SettingUpgrader<?>> BUILT_IN_SETTING_UPGRADERS = Collections.unmodifiableList(Arrays.asList(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -599,7 +599,7 @@ public void testAckListenerReceivesNoAckFromHangingFollower() {

assertTrue("expected immediate ack from " + follower1, ackCollector.hasAckedSuccessfully(follower1));
assertFalse("expected no ack from " + leader, ackCollector.hasAckedSuccessfully(leader));
cluster.stabilise();
cluster.stabilise(defaultMillis(PUBLISH_TIMEOUT_SETTING));
assertTrue("expected eventual ack from " + leader, ackCollector.hasAckedSuccessfully(leader));
assertFalse("expected no ack from " + follower0, ackCollector.hasAcked(follower0));
}
Expand Down Expand Up @@ -1097,7 +1097,6 @@ void stabilise(long stabilisationDurationMillis) {
}

runFor(stabilisationDurationMillis, "stabilising");
fixLag();

final ClusterNode leader = getAnyLeader();
final long leaderTerm = leader.coordinator.getCurrentTerm();
Expand Down Expand Up @@ -1154,35 +1153,6 @@ void stabilise(long stabilisationDurationMillis) {
leader.improveConfiguration(lastAcceptedState), sameInstance(lastAcceptedState));
}

// TODO remove this when lag detection is implemented
void fixLag() {
final ClusterNode leader = getAnyLeader();
final long leaderVersion = leader.getLastAppliedClusterState().version();
final long minVersion = clusterNodes.stream()
.filter(n -> isConnectedPair(n, leader))
.map(n -> n.getLastAppliedClusterState().version()).min(Long::compare).orElse(Long.MIN_VALUE);
assert minVersion >= 0;
if (minVersion < leaderVersion) {
logger.info("--> fixLag publishing a value to fix lag, leaderVersion={}, minVersion={}", leaderVersion, minVersion);
onNode(leader.getLocalNode(), () -> {
synchronized (leader.coordinator.mutex) {
leader.submitValue(randomLong());
}
}).run();

runFor(DEFAULT_CLUSTER_STATE_UPDATE_DELAY
// may need to bump terms too
+ DEFAULT_ELECTION_DELAY,
"re-stabilising after lag-fixing publication");

if (clusterNodes.stream().anyMatch(n -> n.getClusterStateApplyResponse().equals(ClusterStateApplyResponse.HANG))) {
runFor(defaultMillis(PUBLISH_TIMEOUT_SETTING), "allowing lag-fixing publication to time out");
}
} else {
logger.info("--> fixLag found no lag, leader={}, leaderVersion={}, minVersion={}", leader, leaderVersion, minVersion);
}
}

void runFor(long runDurationMillis, String description) {
final long endTime = deterministicTaskQueue.getCurrentTimeMillis() + runDurationMillis;
logger.info("--> runFor({}ms) running until [{}ms]: {}", runDurationMillis, endTime, description);
Expand Down
Loading