Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ protected Settings nodeSettings(int nodeOrdinal) {
)
.put("node.attr." + REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY, REMOTE_ROUTING_TABLE_REPO)
.put(REMOTE_PUBLICATION_EXPERIMENTAL, true)
.put(RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING.getKey(), true)
.put(
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.getKey(),
RemoteClusterStateService.RemoteClusterStateValidationMode.FAILURE
)
.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,10 @@ protected Settings nodeSettings(int nodeOrdinal) {
.put("node.attr." + REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY, routingTableRepoName)
.put(routingTableRepoTypeAttributeKey, ReloadableFsRepository.TYPE)
.put(routingTableRepoSettingsAttributeKeyPrefix + "location", segmentRepoPath)
.put(RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING.getKey(), true)
.put(
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.getKey(),
RemoteClusterStateService.RemoteClusterStateValidationMode.FAILURE
)
.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@ public void apply(Settings value, Settings current, Settings previous) {
IndicesService.CLUSTER_INDEX_RESTRICT_REPLICATION_TYPE_SETTING,
RemoteRoutingTableBlobStore.REMOTE_ROUTING_TABLE_PATH_TYPE_SETTING,
RemoteRoutingTableBlobStore.REMOTE_ROUTING_TABLE_PATH_HASH_ALGO_SETTING,
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING,
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING,

AdmissionControlSettings.ADMISSION_CONTROL_TRANSPORT_LAYER_MODE,
CpuBasedAdmissionControllerSettings.CPU_BASED_ADMISSION_CONTROLLER_TRANSPORT_LAYER_MODE,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
Expand Down Expand Up @@ -141,13 +142,49 @@ public class RemoteClusterStateService implements Closeable {
Setting.Property.NodeScope
);

public static final Setting<Boolean> REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING = Setting.boolSetting(
"cluster.remote_store.state.checksum_validation.enabled",
false,
Property.Dynamic,
Property.NodeScope
public static final Setting<RemoteClusterStateValidationMode> REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING = new Setting<>(
"cluster.remote_store.state.checksum_validation.mode",
RemoteClusterStateValidationMode.NONE.name(),
RemoteClusterStateValidationMode::parseString,
Setting.Property.Dynamic,
Setting.Property.NodeScope
);

/**
 * Validation mode for the cluster state checksum.
 *
 * <ul>
 *   <li>{@link #NONE}: validation is disabled.</li>
 *   <li>{@link #DEBUG}: validation is enabled but only matches the checksum and logs the failing entities.</li>
 *   <li>{@link #TRACE}: matches the checksum and downloads the full cluster state to find the diff in the
 *       failing entities. Only logs failures.</li>
 *   <li>{@link #FAILURE}: throws an exception when validation fails.</li>
 * </ul>
 */
public enum RemoteClusterStateValidationMode {
    /** Match the checksum and log the failing entities. */
    DEBUG("debug"),
    /** Match the checksum, download the full cluster state to diff the failing entities, and only log. */
    TRACE("trace"),
    /** Throw an exception when validation fails. */
    FAILURE("failure"),
    /** Validation disabled. */
    NONE("none");

    /** Lower-case textual form of this mode. */
    public final String mode;

    RemoteClusterStateValidationMode(String mode) {
        this.mode = mode;
    }

    /**
     * Parses a validation mode from its textual form, ignoring case.
     *
     * @param mode the mode name, for example {@code "debug"} or {@code "FAILURE"}
     * @return the matching validation mode
     * @throws IllegalArgumentException if {@code mode} is {@code null} or does not name a supported mode
     */
    public static RemoteClusterStateValidationMode parseString(String mode) {
        if (mode == null) {
            // Report null the same way as any other unsupported value instead of leaking a bare NPE.
            throw unsupportedMode(null, null);
        }
        try {
            // Locale.ROOT keeps the upper-casing independent of the JVM's default locale.
            return RemoteClusterStateValidationMode.valueOf(mode.toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException e) {
            throw unsupportedMode(mode, e);
        }
    }

    /** Builds the exception raised for an unsupported mode value, keeping the original cause when present. */
    private static IllegalArgumentException unsupportedMode(String mode, Throwable cause) {
        // Arrays.toString already encloses the values in brackets, so none are added around it.
        return new IllegalArgumentException(
            "[" + mode + "] mode is not supported. supported modes are " + Arrays.toString(RemoteClusterStateValidationMode.values()),
            cause
        );
    }
}

private TimeValue remoteStateReadTimeout;
private final String nodeId;
private final Supplier<RepositoriesService> repositoriesService;
Expand All @@ -159,7 +196,7 @@ public class RemoteClusterStateService implements Closeable {
private BlobStoreTransferService blobStoreTransferService;
private RemoteRoutingTableService remoteRoutingTableService;
private volatile TimeValue slowWriteLoggingThreshold;
private boolean checksumValidationEnabled;
private RemoteClusterStateValidationMode remoteClusterStateValidationMode;

private final RemotePersistenceStats remoteStateStats;
private RemoteClusterStateCleanupManager remoteClusterStateCleanupManager;
Expand Down Expand Up @@ -206,11 +243,8 @@ public RemoteClusterStateService(
clusterSettings.addSettingsUpdateConsumer(SLOW_WRITE_LOGGING_THRESHOLD, this::setSlowWriteLoggingThreshold);
this.remoteStateReadTimeout = clusterSettings.get(REMOTE_STATE_READ_TIMEOUT_SETTING);
clusterSettings.addSettingsUpdateConsumer(REMOTE_STATE_READ_TIMEOUT_SETTING, this::setRemoteStateReadTimeout);
this.checksumValidationEnabled = clusterSettings.get(REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING);
clusterSettings.addSettingsUpdateConsumer(
REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING,
this::setChecksumValidationEnabled
);
this.remoteClusterStateValidationMode = REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.get(settings);
clusterSettings.addSettingsUpdateConsumer(REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING, this::setChecksumValidationMode);

this.remoteStateStats = new RemotePersistenceStats();
this.namedWriteableRegistry = namedWriteableRegistry;
Expand Down Expand Up @@ -272,7 +306,7 @@ public RemoteClusterStateManifestInfo writeFullMetadata(ClusterState clusterStat
uploadedMetadataResults,
previousClusterUUID,
clusterStateDiffManifest,
checksumValidationEnabled ? new ClusterStateChecksum(clusterState) : null,
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
false,
codecVersion
);
Expand Down Expand Up @@ -472,7 +506,7 @@ public RemoteClusterStateManifestInfo writeIncrementalMetadata(
uploadedMetadataResults,
previousManifest.getPreviousClusterUUID(),
clusterStateDiffManifest,
checksumValidationEnabled ? new ClusterStateChecksum(clusterState) : null,
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
false,
previousManifest.getCodecVersion()
);
Expand Down Expand Up @@ -917,7 +951,7 @@ public RemoteClusterStateManifestInfo markLastStateAsCommitted(ClusterState clus
uploadedMetadataResults,
previousManifest.getPreviousClusterUUID(),
previousManifest.getDiffManifest(),
checksumValidationEnabled ? previousManifest.getClusterStateChecksum() : null,
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
true,
previousManifest.getCodecVersion()
);
Expand Down Expand Up @@ -1003,8 +1037,8 @@ private void setSlowWriteLoggingThreshold(TimeValue slowWriteLoggingThreshold) {
this.slowWriteLoggingThreshold = slowWriteLoggingThreshold;
}

private void setChecksumValidationEnabled(Boolean checksumValidationEnabled) {
this.checksumValidationEnabled = checksumValidationEnabled;
private void setChecksumValidationMode(RemoteClusterStateValidationMode remoteClusterStateValidationMode) {
this.remoteClusterStateValidationMode = remoteClusterStateValidationMode;
}

// Package private for unit test
Expand Down Expand Up @@ -1376,7 +1410,9 @@ public ClusterState getClusterStateForManifest(
includeEphemeral
);

if (includeEphemeral && checksumValidationEnabled && manifest.getClusterStateChecksum() != null) {
if (includeEphemeral
&& !remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE)
&& manifest.getClusterStateChecksum() != null) {
validateClusterStateFromChecksum(manifest, clusterState, clusterName, localNodeId, true);
}
} else {
Expand Down Expand Up @@ -1498,7 +1534,7 @@ public ClusterState getClusterStateUsingDiff(ClusterMetadataManifest manifest, C
.routingTable(new RoutingTable(manifest.getRoutingTableVersion(), indexRoutingTables))
.build();

if (checksumValidationEnabled && manifest.getClusterStateChecksum() != null) {
if (!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) && manifest.getClusterStateChecksum() != null) {
validateClusterStateFromChecksum(manifest, clusterState, previousState.getClusterName().value(), localNodeId, false);
}
final long durationMillis = TimeValue.nsecToMSec(relativeTimeNanosSupplier.getAsLong() - startTimeNanos);
Expand All @@ -1517,20 +1553,24 @@ void validateClusterStateFromChecksum(
) {
ClusterStateChecksum newClusterStateChecksum = new ClusterStateChecksum(clusterState);
List<String> failedValidation = newClusterStateChecksum.getMismatchEntities(manifest.getClusterStateChecksum());
if (!failedValidation.isEmpty()) {
logger.error(
() -> new ParameterizedMessage(
"Cluster state checksums do not match. Checksum from manifest {}, checksum from created cluster state {}. Entities failing validation {}",
manifest.getClusterStateChecksum(),
newClusterStateChecksum,
failedValidation
)
if (failedValidation.isEmpty()) {
return;
}
logger.error(
() -> new ParameterizedMessage(
"Cluster state checksums do not match. Checksum from manifest {}, checksum from created cluster state {}. Entities failing validation {}",
manifest.getClusterStateChecksum(),
newClusterStateChecksum,
failedValidation
)
);
if (isFullStateDownload && remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)) {
throw new IllegalStateException(
"Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
);
if (isFullStateDownload) {
throw new IllegalStateException(
"Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
);
}
}
if (remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)
|| remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.TRACE)) {
// download full cluster state and match against state created for the failing entities
ClusterState fullClusterState = readClusterStateInParallel(
ClusterState.builder(new ClusterName(clusterName)).build(),
Expand Down Expand Up @@ -1663,6 +1703,8 @@ void validateClusterStateFromChecksum(
break;
}
}
}
if (remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)) {
throw new IllegalStateException(
"Cluster state checksums do not match during diff read. Validation failed for " + failedValidation
);
Expand Down
Loading