Skip to content

Commit e6b489a

Browse files
committed
Account for Crucible Agent reservation overhead
When creating a region's dataset, the Crucible Agent will include a reservation 25% larger than the region's size to account for on-disk overhead (storing encryption contexts and other metadata). Nexus does not take this overhead into account when computing `size_used` for crucible_dataset rows, or when allocating regions. This leads to the scenario where Nexus thinks there's enough room for a region but the Agent will fail to create the dataset due to not having enough space for the reservation to succeed. Fix this: add a reservation factor column to the Region model, and account for this when performing region allocation and when computing the `size_used` column for crucible datasets. This commit also adds an upgrader that will set all currently allocated Region's reservation factor to 1.25, and recompute all the `size_used` values for all non-deleted crucible datasets. This may lead to `size_used` being greater than the pool's total_size - a follow up commit will add an omdb command to identify these cases, and identify candidate regions to request replacement for in order to remedy this. The `regions_hard_delete` function now uses this upgrader's CTE to set `size_used` for all crucible datasets at once, instead of in a for loop during an interactive transaction.
1 parent ca6a1dc commit e6b489a

File tree

24 files changed

+480
-225
lines changed

24 files changed

+480
-225
lines changed

nexus/db-model/src/region.rs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,16 @@ pub struct Region {
5555
// Shared read-only regions require a "deleting" flag to avoid a
5656
// use-after-free scenario
5757
deleting: bool,
58+
59+
// The Agent will reserve space for Downstairs overhead when creating the
60+
// corresponding ZFS dataset. Nexus has to account for that: store that
61+
// reservation factor here as it may change in the future, and it can be
62+
// used during Crucible related accounting.
63+
reservation_factor: f64,
5864
}
5965

6066
impl Region {
67+
#[allow(clippy::too_many_arguments)]
6168
pub fn new(
6269
dataset_id: DatasetUuid,
6370
volume_id: VolumeUuid,
@@ -66,6 +73,7 @@ impl Region {
6673
extent_count: u64,
6774
port: u16,
6875
read_only: bool,
76+
reservation_factor: f64,
6977
) -> Self {
7078
Self {
7179
identity: RegionIdentity::new(Uuid::new_v4()),
@@ -77,6 +85,7 @@ impl Region {
7785
port: Some(port.into()),
7886
read_only,
7987
deleting: false,
88+
reservation_factor,
8089
}
8190
}
8291

@@ -112,4 +121,18 @@ impl Region {
112121
pub fn deleting(&self) -> bool {
113122
self.deleting
114123
}
124+
125+
/// The size of the Region without accounting for any overhead
126+
pub fn requested_size(&self) -> u64 {
127+
self.block_size().to_bytes()
128+
* self.blocks_per_extent()
129+
* self.extent_count()
130+
}
131+
132+
/// The size the Crucible agent would have reserved during ZFS creation,
133+
/// which is some factor higher than the requested region size to account
134+
/// for on-disk overhead.
135+
pub fn reserved_size(&self) -> u64 {
136+
(self.requested_size() as f64 * self.reservation_factor).round() as u64
137+
}
115138
}

nexus/db-model/src/schema.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1125,6 +1125,8 @@ table! {
11251125
read_only -> Bool,
11261126

11271127
deleting -> Bool,
1128+
1129+
reservation_factor -> Float8,
11281130
}
11291131
}
11301132

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
1616
///
1717
/// This must be updated when you change the database schema. Refer to
1818
/// schema/crdb/README.adoc in the root of this repository for details.
19-
pub const SCHEMA_VERSION: Version = Version::new(132, 0, 0);
19+
pub const SCHEMA_VERSION: Version = Version::new(133, 0, 0);
2020

2121
/// List of all past database schema versions, in *reverse* order
2222
///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
2828
// | leaving the first copy as an example for the next person.
2929
// v
3030
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
31+
KnownVersion::new(133, "crucible-agent-reservation-overhead"),
3132
KnownVersion::new(132, "bp-omicron-zone-filesystem-pool-not-null"),
3233
KnownVersion::new(131, "tuf-generation"),
3334
KnownVersion::new(130, "bp-sled-agent-generation"),

nexus/db-queries/src/db/datastore/region.rs

Lines changed: 25 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ use crate::db::pagination::paginated;
2323
use crate::db::queries::region_allocation::RegionParameters;
2424
use crate::db::update_and_check::UpdateAndCheck;
2525
use crate::db::update_and_check::UpdateStatus;
26-
use crate::transaction_retry::OptionalError;
2726
use async_bb8_diesel::AsyncRunQueryDsl;
27+
use diesel::dsl::sql_query;
2828
use diesel::prelude::*;
2929
use nexus_config::RegionAllocationStrategy;
3030
use nexus_types::external_api::params;
@@ -301,68 +301,26 @@ impl DataStore {
301301
return Ok(());
302302
}
303303

304-
#[derive(Debug, thiserror::Error)]
305-
enum RegionDeleteError {
306-
#[error("Numeric error: {0}")]
307-
NumericError(String),
308-
}
309-
let err = OptionalError::new();
310304
let conn = self.pool_connection_unauthorized().await?;
311305
self.transaction_retry_wrapper("regions_hard_delete")
312306
.transaction(&conn, |conn| {
313-
let err = err.clone();
314307
let region_ids = region_ids.clone();
315308
async move {
316-
use db::schema::crucible_dataset::dsl as dataset_dsl;
317-
use db::schema::region::dsl as region_dsl;
309+
use db::schema::region::dsl as dsl;
318310

319-
// Remove the regions, collecting datasets they're from.
320-
let datasets = diesel::delete(region_dsl::region)
321-
.filter(region_dsl::id.eq_any(region_ids))
322-
.returning(region_dsl::dataset_id)
323-
.get_results_async::<Uuid>(&conn).await?;
311+
// Remove the regions
312+
diesel::delete(dsl::region)
313+
.filter(dsl::id.eq_any(region_ids))
314+
.execute_async(&conn)
315+
.await?;
324316

325317
// Update datasets to which the regions belonged.
326-
for dataset in datasets {
327-
let dataset_total_occupied_size: Option<
328-
diesel::pg::data_types::PgNumeric,
329-
> = region_dsl::region
330-
.filter(region_dsl::dataset_id.eq(dataset))
331-
.select(diesel::dsl::sum(
332-
region_dsl::block_size
333-
* region_dsl::blocks_per_extent
334-
* region_dsl::extent_count,
335-
))
336-
.nullable()
337-
.get_result_async(&conn).await?;
338-
339-
let dataset_total_occupied_size: i64 = if let Some(
340-
dataset_total_occupied_size,
341-
) =
342-
dataset_total_occupied_size
343-
{
344-
let dataset_total_occupied_size: db::model::ByteCount =
345-
dataset_total_occupied_size.try_into().map_err(
346-
|e: anyhow::Error| {
347-
err.bail(RegionDeleteError::NumericError(
348-
e.to_string(),
349-
))
350-
},
351-
)?;
352-
353-
dataset_total_occupied_size.into()
354-
} else {
355-
0
356-
};
357-
358-
diesel::update(dataset_dsl::crucible_dataset)
359-
.filter(dataset_dsl::id.eq(dataset))
360-
.set(
361-
dataset_dsl::size_used
362-
.eq(dataset_total_occupied_size),
363-
)
364-
.execute_async(&conn).await?;
365-
}
318+
// XXX put this file somewhere else
319+
sql_query(include_str!(
320+
"../../../../../schema/crdb/crucible-agent-reservation-overhead/up03.sql"
321+
))
322+
.execute_async(&conn)
323+
.await?;
366324

367325
// Whenever a region is hard-deleted, validate invariants
368326
// for all volumes
@@ -373,52 +331,25 @@ impl DataStore {
373331
}
374332
})
375333
.await
376-
.map_err(|e| {
377-
if let Some(err) = err.take() {
378-
match err {
379-
RegionDeleteError::NumericError(err) => {
380-
return Error::internal_error(
381-
&format!("Transaction error: {}", err)
382-
);
383-
}
384-
}
385-
}
386-
public_error_from_diesel(e, ErrorHandler::Server)
387-
})
334+
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
388335
}
389336

390-
/// Return the total occupied size for a dataset
391-
pub async fn regions_total_occupied_size(
337+
/// Return the total reserved size for all the regions allocated to a
338+
/// dataset
339+
pub async fn regions_total_reserved_size(
392340
&self,
393341
dataset_id: DatasetUuid,
394342
) -> Result<u64, Error> {
395-
use db::schema::region::dsl as region_dsl;
396-
397-
let total_occupied_size: Option<diesel::pg::data_types::PgNumeric> =
398-
region_dsl::region
399-
.filter(region_dsl::dataset_id.eq(to_db_typed_uuid(dataset_id)))
400-
.select(diesel::dsl::sum(
401-
region_dsl::block_size
402-
* region_dsl::blocks_per_extent
403-
* region_dsl::extent_count,
404-
))
405-
.nullable()
406-
.get_result_async(&*self.pool_connection_unauthorized().await?)
407-
.await
408-
.map_err(|e| {
409-
public_error_from_diesel(e, ErrorHandler::Server)
410-
})?;
343+
use db::schema::region::dsl;
411344

412-
if let Some(total_occupied_size) = total_occupied_size {
413-
let total_occupied_size: db::model::ByteCount =
414-
total_occupied_size.try_into().map_err(
415-
|e: anyhow::Error| Error::internal_error(&e.to_string()),
416-
)?;
345+
let dataset_regions: Vec<Region> = dsl::region
346+
.filter(dsl::dataset_id.eq(to_db_typed_uuid(dataset_id)))
347+
.select(Region::as_select())
348+
.load_async(&*self.pool_connection_unauthorized().await?)
349+
.await
350+
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
417351

418-
Ok(total_occupied_size.to_bytes())
419-
} else {
420-
Ok(0)
421-
}
352+
Ok(dataset_regions.iter().map(|r| r.reserved_size()).sum())
422353
}
423354

424355
/// Find read/write regions on expunged disks

nexus/db-queries/src/db/datastore/volume.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4715,6 +4715,7 @@ mod tests {
47154715
region.extent_count(),
47164716
111,
47174717
false, // read-write
4718+
1.0,
47184719
);
47194720

47204721
use nexus_db_model::schema::region::dsl;
@@ -4952,6 +4953,7 @@ mod tests {
49524953
region.extent_count(),
49534954
111,
49544955
true, // read-only
4956+
1.0,
49554957
);
49564958

49574959
use nexus_db_model::schema::region::dsl;

nexus/db-queries/src/db/queries/region_allocation.rs

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,14 @@ pub fn allocation_query(
117117

118118
let seed = seed.to_le_bytes().to_vec();
119119

120-
let size_delta =
120+
// The Crucible Agent's current reservation factor
121+
const RESERVATION_FACTOR: f64 = 1.25;
122+
123+
let requested_size: u64 =
121124
params.block_size * params.blocks_per_extent * params.extent_count;
125+
let size_delta: u64 =
126+
(requested_size as f64 * RESERVATION_FACTOR).round() as u64;
127+
122128
let redundancy: i64 = i64::try_from(redundancy).unwrap();
123129

124130
let mut builder = QueryBuilder::new();
@@ -272,7 +278,8 @@ pub fn allocation_query(
272278
").param().sql(" AS extent_count,
273279
NULL AS port,
274280
").param().sql(" AS read_only,
275-
FALSE as deleting
281+
FALSE as deleting,
282+
").param().sql(" AS reservation_factor
276283
FROM shuffled_candidate_datasets")
277284
// Only select the *additional* number of candidate regions for the required
278285
// redundancy level
@@ -286,6 +293,7 @@ pub fn allocation_query(
286293
.bind::<sql_types::BigInt, _>(params.blocks_per_extent as i64)
287294
.bind::<sql_types::BigInt, _>(params.extent_count as i64)
288295
.bind::<sql_types::Bool, _>(params.read_only)
296+
.bind::<sql_types::Float8, _>(RESERVATION_FACTOR)
289297
.bind::<sql_types::BigInt, _>(redundancy)
290298

291299
// A subquery which summarizes the changes we intend to make, showing:
@@ -298,7 +306,17 @@ pub fn allocation_query(
298306
SELECT
299307
candidate_regions.dataset_id AS id,
300308
crucible_dataset.pool_id AS pool_id,
301-
((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta
309+
CAST(
310+
CAST(
311+
candidate_regions.block_size *
312+
candidate_regions.blocks_per_extent *
313+
candidate_regions.extent_count
314+
AS FLOAT
315+
)
316+
* candidate_regions.reservation_factor
317+
AS INT
318+
)
319+
AS size_used_delta
302320
FROM (candidate_regions INNER JOIN crucible_dataset ON (crucible_dataset.id = candidate_regions.dataset_id))
303321
),")
304322

@@ -385,7 +403,7 @@ pub fn allocation_query(
385403
.sql("
386404
inserted_regions AS (
387405
INSERT INTO region
388-
(id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting)
406+
(id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting, reservation_factor)
389407
SELECT ").sql(AllColumnsOfRegion::with_prefix("candidate_regions")).sql("
390408
FROM candidate_regions
391409
WHERE

0 commit comments

Comments
 (0)