Skip to content

Commit 157166b

Browse files
committed
introduce RepartitionError for differentiated handling
1 parent c555606 commit 157166b

File tree

2 files changed

+53
-28
lines changed

2 files changed

+53
-28
lines changed

pageserver/src/tenant/timeline.rs

Lines changed: 25 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,6 +1118,26 @@ enum ImageLayerCreationOutcome {
11181118
Skip,
11191119
}
11201120

1121+
enum RepartitionError {
1122+
Other(anyhow::Error),
1123+
CollectKeyspace(CollectKeySpaceError),
1124+
}
1125+
1126+
impl RepartitionError {
1127+
fn is_cancel(&self) -> bool {
1128+
match self {
1129+
RepartitionError::Other(_) => false,
1130+
RepartitionError::CollectKeyspace(e) => e.is_cancel(),
1131+
}
1132+
}
1133+
fn into_anyhow(self) -> anyhow::Error {
1134+
match self {
1135+
RepartitionError::Other(e) => e,
1136+
RepartitionError::CollectKeyspace(e) => e.into_anyhow(),
1137+
}
1138+
}
1139+
}
1140+
11211141
/// Public interface functions
11221142
impl Timeline {
11231143
/// Get the LSN where this branch was created
@@ -4992,7 +5012,7 @@ impl Timeline {
49925012
ctx,
49935013
)
49945014
.await
4995-
.map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?;
5015+
.map_err(|e| FlushLayerError::from_anyhow(self, e.into_anyhow()))?;
49965016

49975017
if self.cancel.is_cancelled() {
49985018
return Err(FlushLayerError::Cancelled);
@@ -5242,18 +5262,18 @@ impl Timeline {
52425262
partition_size: u64,
52435263
flags: EnumSet<CompactFlags>,
52445264
ctx: &RequestContext,
5245-
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
5265+
) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), RepartitionError> {
52465266
let Ok(mut guard) = self.partitioning.try_write_guard() else {
52475267
// NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
52485268
// The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
52495269
// and hence before the compaction task starts.
5250-
return Err(CompactionError::Other(anyhow!(
5270+
return Err(RepartitionError::Other(anyhow!(
52515271
"repartition() called concurrently"
52525272
)));
52535273
};
52545274
let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read();
52555275
if lsn < *partition_lsn {
5256-
return Err(CompactionError::Other(anyhow!(
5276+
return Err(RepartitionError::Other(anyhow!(
52575277
"repartition() called with LSN going backwards, this should not happen"
52585278
)));
52595279
}
@@ -5277,23 +5297,7 @@ impl Timeline {
52775297
let (dense_ks, sparse_ks) = self
52785298
.collect_keyspace(lsn, ctx)
52795299
.await
5280-
.inspect_err(|e| {
5281-
if matches!(
5282-
e,
5283-
CollectKeySpaceError::Decode(_)
5284-
| CollectKeySpaceError::PageRead(
5285-
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
5286-
)
5287-
) {
5288-
// Alert on critical errors that indicate data corruption.
5289-
critical_timeline!(
5290-
self.tenant_shard_id,
5291-
self.timeline_id,
5292-
"could not compact, repartitioning keyspace failed: {e:?}"
5293-
);
5294-
}
5295-
})
5296-
.map_err(CompactionError::from_collect_keyspace)?;
5300+
.map_err(RepartitionError::CollectKeyspace)?;
52975301
let dense_partitioning = dense_ks.partition(
52985302
&self.shard_identity,
52995303
partition_size,

pageserver/src/tenant/timeline/compaction.rs

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ use super::{
1616
Timeline,
1717
};
1818

19-
use crate::tenant::timeline::DeltaEntry;
19+
use crate::pgdatadir_mapping::CollectKeySpaceError;
20+
use crate::tenant::timeline::{DeltaEntry, RepartitionError};
2021
use crate::walredo::RedoAttemptType;
2122
use anyhow::{Context, anyhow};
2223
use bytes::Bytes;
@@ -64,7 +65,7 @@ use crate::tenant::timeline::{
6465
DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
6566
ResidentLayer, drop_layer_manager_rlock,
6667
};
67-
use crate::tenant::{DeltaLayer, MaybeOffloaded};
68+
use crate::tenant::{DeltaLayer, MaybeOffloaded, PageReconstructError};
6869
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
6970

7071
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
@@ -1417,13 +1418,33 @@ impl Timeline {
14171418
}
14181419

14191420
// Suppress errors when cancelled.
1420-
Err(_) if self.cancel.is_cancelled() => {}
1421-
Err(err) if err.is_cancel() => {}
1422-
1423-
// Log other errors. No partitioning? This is normal, if the timeline was just created
1421+
//
1422+
// Log other errors but continue. Failure to repartition is normal, if the timeline was just created
14241423
// as an empty timeline. Also in unit tests, when we use the timeline as a simple
14251424
// key-value store, ignoring the datadir layout. Log the error but continue.
1426-
Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"),
1425+
//
1426+
// TODO:
1427+
// 1. shouldn't we return early here if we observe cancellation
1428+
// 2. Experiment: can we stop checking self.cancel here?
1429+
Err(_) if self.cancel.is_cancelled() => {} // TODO: try how we fare removing this branch
1430+
Err(err) if err.is_cancel() => {}
1431+
Err(RepartitionError::CollectKeyspace(
1432+
e @ CollectKeySpaceError::Decode(_)
1433+
| e @ CollectKeySpaceError::PageRead(
1434+
PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
1435+
),
1436+
)) => {
1437+
// Alert on critical errors that indicate data corruption.
1438+
critical_timeline!(
1439+
self.tenant_shard_id,
1440+
self.timeline_id,
1441+
"could not compact, repartitioning keyspace failed: {e:?}"
1442+
);
1443+
}
1444+
Err(e) => error!(
1445+
"could not compact, repartitioning keyspace failed: {:?}",
1446+
e.into_anyhow()
1447+
),
14271448
};
14281449

14291450
let partition_count = self.partitioning.read().0.0.parts.len();

0 commit comments

Comments
 (0)