Skip to content

Commit e6b3fea

Browse files
authored
Account for Crucible Agent reservation overhead (#7885)
When creating a region's dataset, the Crucible Agent will include a reservation 25% larger than the region's size to account for on-disk overhead (storing encryption contexts and other metadata). Nexus does not take this overhead into account when computing `size_used` for crucible_dataset rows, or when allocating regions. This leads to the scenario where Nexus thinks there's enough room for a region but the Agent will fail to create the dataset due to not having enough space for the reservation to succeed. Fix this: add a reservation factor column to the Region model, and account for this when performing region allocation and when computing the `size_used` column for crucible datasets. This commit also adds an upgrader that will set all currently allocated Region's reservation factor to 1.25, and recompute all the `size_used` values for all non-deleted crucible datasets. This may lead to `size_used` being greater than the pool's total_size - a follow up commit will add an omdb command to identify these cases, and identify candidate regions to request replacement for in order to remedy this. The `regions_hard_delete` function now uses this upgrader's CTE to set `size_used` for all crucible datasets at once, instead of in a for loop during an interactive transaction.
1 parent 77c4136 commit e6b3fea

File tree

21 files changed

+689
-242
lines changed

21 files changed

+689
-242
lines changed

nexus/db-model/src/region.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
use super::ByteCount;
66
use crate::SqlU16;
7+
use crate::impl_enum_type;
78
use crate::typed_uuid::DbTypedUuid;
89
use db_macros::Asset;
910
use nexus_db_schema::schema::region;
@@ -15,6 +16,16 @@ use omicron_uuid_kinds::VolumeUuid;
1516
use serde::{Deserialize, Serialize};
1617
use uuid::Uuid;
1718

19+
impl_enum_type!(
20+
RegionReservationPercentEnum:
21+
22+
#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)]
23+
pub enum RegionReservationPercent;
24+
25+
// Enum values
26+
TwentyFive => b"25"
27+
);
28+
1829
/// Database representation of a Region.
1930
///
2031
/// A region represents a portion of a Crucible Downstairs dataset
@@ -55,6 +66,13 @@ pub struct Region {
5566
// Shared read-only regions require a "deleting" flag to avoid a
5667
// use-after-free scenario
5768
deleting: bool,
69+
70+
// The Agent will reserve space for Downstairs overhead when creating the
71+
// corresponding ZFS dataset. Nexus has to account for that: store that
72+
// reservation percent here as it may change in the future, and it can be
73+
// used during Crucible related accounting. This is stored as an enum to
74+
// restrict the values to what the Crucible Agent uses.
75+
reservation_percent: RegionReservationPercent,
5876
}
5977

6078
impl Region {
@@ -77,6 +95,10 @@ impl Region {
7795
port: Some(port.into()),
7896
read_only,
7997
deleting: false,
98+
// When the Crucible agent's reservation percentage changes, this
99+
// function should accept that as argument. Until then, it can only
100+
// ever be 25%.
101+
reservation_percent: RegionReservationPercent::TwentyFive,
80102
}
81103
}
82104

@@ -112,4 +134,24 @@ impl Region {
112134
pub fn deleting(&self) -> bool {
113135
self.deleting
114136
}
137+
138+
/// The size of the Region without accounting for any overhead. The
139+
/// `allocation_query` function should have validated that this won't
140+
/// overflow.
141+
pub fn requested_size(&self) -> u64 {
142+
self.block_size().to_bytes()
143+
* self.blocks_per_extent()
144+
* self.extent_count()
145+
}
146+
147+
/// The size the Crucible agent would have reserved during ZFS creation,
148+
/// which is some factor higher than the requested region size to account
149+
/// for on-disk overhead.
150+
pub fn reserved_size(&self) -> u64 {
151+
let overhead = match &self.reservation_percent {
152+
RegionReservationPercent::TwentyFive => self.requested_size() / 4,
153+
};
154+
155+
self.requested_size() + overhead
156+
}
115157
}

nexus/db-model/src/schema_versions.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
1616
///
1717
/// This must be updated when you change the database schema. Refer to
1818
/// schema/crdb/README.adoc in the root of this repository for details.
19-
pub const SCHEMA_VERSION: Version = Version::new(133, 0, 0);
19+
pub const SCHEMA_VERSION: Version = Version::new(134, 0, 0);
2020

2121
/// List of all past database schema versions, in *reverse* order
2222
///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
2828
// | leaving the first copy as an example for the next person.
2929
// v
3030
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
31+
KnownVersion::new(134, "crucible-agent-reservation-overhead"),
3132
KnownVersion::new(133, "delete-defunct-reservations"),
3233
KnownVersion::new(132, "bp-omicron-zone-filesystem-pool-not-null"),
3334
KnownVersion::new(131, "tuf-generation"),

nexus/db-queries/src/db/datastore/region.rs

Lines changed: 48 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@ use crate::db::pagination::paginated;
2323
use crate::db::queries::region_allocation::RegionParameters;
2424
use crate::db::update_and_check::UpdateAndCheck;
2525
use crate::db::update_and_check::UpdateStatus;
26-
use crate::transaction_retry::OptionalError;
2726
use async_bb8_diesel::AsyncRunQueryDsl;
27+
use diesel::dsl::sql_query;
2828
use diesel::prelude::*;
2929
use nexus_config::RegionAllocationStrategy;
3030
use nexus_types::external_api::params;
@@ -269,7 +269,7 @@ impl DataStore {
269269
},
270270
allocation_strategy,
271271
num_regions_required,
272-
);
272+
)?;
273273

274274
let conn = self.pool_connection_authorized(&opctx).await?;
275275

@@ -301,68 +301,48 @@ impl DataStore {
301301
return Ok(());
302302
}
303303

304-
#[derive(Debug, thiserror::Error)]
305-
enum RegionDeleteError {
306-
#[error("Numeric error: {0}")]
307-
NumericError(String),
308-
}
309-
let err = OptionalError::new();
310304
let conn = self.pool_connection_unauthorized().await?;
311305
self.transaction_retry_wrapper("regions_hard_delete")
312306
.transaction(&conn, |conn| {
313-
let err = err.clone();
314307
let region_ids = region_ids.clone();
315308
async move {
316-
use nexus_db_schema::schema::crucible_dataset::dsl as dataset_dsl;
317-
use nexus_db_schema::schema::region::dsl as region_dsl;
309+
use nexus_db_schema::schema::region::dsl;
318310

319-
// Remove the regions, collecting datasets they're from.
320-
let datasets = diesel::delete(region_dsl::region)
321-
.filter(region_dsl::id.eq_any(region_ids))
322-
.returning(region_dsl::dataset_id)
323-
.get_results_async::<Uuid>(&conn).await?;
311+
// Remove the regions
312+
diesel::delete(dsl::region)
313+
.filter(dsl::id.eq_any(region_ids))
314+
.execute_async(&conn)
315+
.await?;
324316

325317
// Update datasets to which the regions belonged.
326-
for dataset in datasets {
327-
let dataset_total_occupied_size: Option<
328-
diesel::pg::data_types::PgNumeric,
329-
> = region_dsl::region
330-
.filter(region_dsl::dataset_id.eq(dataset))
331-
.select(diesel::dsl::sum(
332-
region_dsl::block_size
333-
* region_dsl::blocks_per_extent
334-
* region_dsl::extent_count,
335-
))
336-
.nullable()
337-
.get_result_async(&conn).await?;
338-
339-
let dataset_total_occupied_size: i64 = if let Some(
340-
dataset_total_occupied_size,
341-
) =
342-
dataset_total_occupied_size
343-
{
344-
let dataset_total_occupied_size: db::model::ByteCount =
345-
dataset_total_occupied_size.try_into().map_err(
346-
|e: anyhow::Error| {
347-
err.bail(RegionDeleteError::NumericError(
348-
e.to_string(),
349-
))
350-
},
351-
)?;
352-
353-
dataset_total_occupied_size.into()
354-
} else {
355-
0
356-
};
357-
358-
diesel::update(dataset_dsl::crucible_dataset)
359-
.filter(dataset_dsl::id.eq(dataset))
360-
.set(
361-
dataset_dsl::size_used
362-
.eq(dataset_total_occupied_size),
363-
)
364-
.execute_async(&conn).await?;
365-
}
318+
sql_query(
319+
r#"
320+
WITH size_used_with_reservation AS (
321+
SELECT
322+
crucible_dataset.id AS crucible_dataset_id,
323+
SUM(
324+
CASE
325+
WHEN block_size IS NULL THEN 0
326+
ELSE
327+
CASE
328+
WHEN reservation_percent = '25' THEN
329+
(block_size * blocks_per_extent * extent_count) / 4 +
330+
(block_size * blocks_per_extent * extent_count)
331+
END
332+
END
333+
) AS reserved_size
334+
FROM crucible_dataset
335+
LEFT JOIN region ON crucible_dataset.id = region.dataset_id
336+
WHERE crucible_dataset.time_deleted IS NULL
337+
GROUP BY crucible_dataset.id
338+
)
339+
UPDATE crucible_dataset
340+
SET size_used = size_used_with_reservation.reserved_size
341+
FROM size_used_with_reservation
342+
WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id"#,
343+
)
344+
.execute_async(&conn)
345+
.await?;
366346

367347
// Whenever a region is hard-deleted, validate invariants
368348
// for all volumes
@@ -373,52 +353,25 @@ impl DataStore {
373353
}
374354
})
375355
.await
376-
.map_err(|e| {
377-
if let Some(err) = err.take() {
378-
match err {
379-
RegionDeleteError::NumericError(err) => {
380-
return Error::internal_error(
381-
&format!("Transaction error: {}", err)
382-
);
383-
}
384-
}
385-
}
386-
public_error_from_diesel(e, ErrorHandler::Server)
387-
})
356+
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))
388357
}
389358

390-
/// Return the total occupied size for a dataset
391-
pub async fn regions_total_occupied_size(
359+
/// Return the total reserved size for all the regions allocated to a
360+
/// dataset
361+
pub async fn regions_total_reserved_size(
392362
&self,
393363
dataset_id: DatasetUuid,
394364
) -> Result<u64, Error> {
395-
use nexus_db_schema::schema::region::dsl as region_dsl;
396-
397-
let total_occupied_size: Option<diesel::pg::data_types::PgNumeric> =
398-
region_dsl::region
399-
.filter(region_dsl::dataset_id.eq(to_db_typed_uuid(dataset_id)))
400-
.select(diesel::dsl::sum(
401-
region_dsl::block_size
402-
* region_dsl::blocks_per_extent
403-
* region_dsl::extent_count,
404-
))
405-
.nullable()
406-
.get_result_async(&*self.pool_connection_unauthorized().await?)
407-
.await
408-
.map_err(|e| {
409-
public_error_from_diesel(e, ErrorHandler::Server)
410-
})?;
365+
use nexus_db_schema::schema::region::dsl;
411366

412-
if let Some(total_occupied_size) = total_occupied_size {
413-
let total_occupied_size: db::model::ByteCount =
414-
total_occupied_size.try_into().map_err(
415-
|e: anyhow::Error| Error::internal_error(&e.to_string()),
416-
)?;
367+
let dataset_regions: Vec<Region> = dsl::region
368+
.filter(dsl::dataset_id.eq(to_db_typed_uuid(dataset_id)))
369+
.select(Region::as_select())
370+
.load_async(&*self.pool_connection_unauthorized().await?)
371+
.await
372+
.map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?;
417373

418-
Ok(total_occupied_size.to_bytes())
419-
} else {
420-
Ok(0)
421-
}
374+
Ok(dataset_regions.iter().map(|r| r.reserved_size()).sum())
422375
}
423376

424377
/// Find read/write regions on expunged disks

0 commit comments

Comments
 (0)