diff --git a/nexus/db-model/src/region.rs b/nexus/db-model/src/region.rs index 8d8d34692bb..821b7182c18 100644 --- a/nexus/db-model/src/region.rs +++ b/nexus/db-model/src/region.rs @@ -4,6 +4,7 @@ use super::ByteCount; use crate::SqlU16; +use crate::impl_enum_type; use crate::typed_uuid::DbTypedUuid; use db_macros::Asset; use nexus_db_schema::schema::region; @@ -15,6 +16,16 @@ use omicron_uuid_kinds::VolumeUuid; use serde::{Deserialize, Serialize}; use uuid::Uuid; +impl_enum_type!( + RegionReservationPercentEnum: + + #[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, Serialize, Deserialize, PartialEq)] + pub enum RegionReservationPercent; + + // Enum values + TwentyFive => b"25" +); + /// Database representation of a Region. /// /// A region represents a portion of a Crucible Downstairs dataset @@ -55,6 +66,13 @@ pub struct Region { // Shared read-only regions require a "deleting" flag to avoid a // use-after-free scenario deleting: bool, + + // The Agent will reserve space for Downstairs overhead when creating the + // corresponding ZFS dataset. Nexus has to account for that: store that + // reservation percent here as it may change in the future, and it can be + // used during Crucible related accounting. This is stored as an enum to + // restrict the values to what the Crucible Agent uses. + reservation_percent: RegionReservationPercent, } impl Region { @@ -77,6 +95,10 @@ impl Region { port: Some(port.into()), read_only, deleting: false, + // When the Crucible agent's reservation percentage changes, this + // function should accept that as argument. Until then, it can only + // ever be 25%. + reservation_percent: RegionReservationPercent::TwentyFive, } } @@ -112,4 +134,24 @@ impl Region { pub fn deleting(&self) -> bool { self.deleting } + + /// The size of the Region without accounting for any overhead. The + /// `allocation_query` function should have validated that this won't + /// overflow. + pub fn requested_size(&self) -> u64 { + self.block_size().to_bytes() + * self.blocks_per_extent() + * self.extent_count() + } + + /// The size the Crucible agent would have reserved during ZFS creation, + /// which is some factor higher than the requested region size to account + /// for on-disk overhead. + pub fn reserved_size(&self) -> u64 { + let overhead = match &self.reservation_percent { + RegionReservationPercent::TwentyFive => self.requested_size() / 4, + }; + + self.requested_size() + overhead + } } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index b375a742841..ad09695d792 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(133, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(134, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. 
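A concrete illustration of the reservation arithmetic in `requested_size()` / `reserved_size()` above: the `TwentyFive` variant adds one quarter of the requested size, using integer division. This is a minimal standalone sketch with hypothetical region parameters (512-byte blocks, 131072 blocks per extent, 16 extents), not values taken from this change:

    // Hypothetical region: 512-byte blocks, 131072 blocks per extent
    // (64 MiB extents), 16 extents => 1 GiB requested.
    fn main() {
        let block_size: u64 = 512;
        let blocks_per_extent: u64 = 131072;
        let extent_count: u64 = 16;

        let requested = block_size * blocks_per_extent * extent_count;
        assert_eq!(requested, 1 << 30); // 1 GiB

        // RegionReservationPercent::TwentyFive: overhead is requested / 4.
        let overhead = requested / 4;
        let reserved = requested + overhead;

        assert_eq!(overhead, 256 * (1 << 20)); // 256 MiB
        assert_eq!(reserved, 1342177280); // 1.25 GiB
    }

The same 1.25x factor reappears below in the allocation query's size_delta, the hard-delete accounting SQL, and the schema migration.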
// v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(134, "crucible-agent-reservation-overhead"), KnownVersion::new(133, "delete-defunct-reservations"), KnownVersion::new(132, "bp-omicron-zone-filesystem-pool-not-null"), KnownVersion::new(131, "tuf-generation"), diff --git a/nexus/db-queries/src/db/datastore/region.rs b/nexus/db-queries/src/db/datastore/region.rs index d5541c8763c..1596588ff8c 100644 --- a/nexus/db-queries/src/db/datastore/region.rs +++ b/nexus/db-queries/src/db/datastore/region.rs @@ -23,8 +23,8 @@ use crate::db::pagination::paginated; use crate::db::queries::region_allocation::RegionParameters; use crate::db::update_and_check::UpdateAndCheck; use crate::db::update_and_check::UpdateStatus; -use crate::transaction_retry::OptionalError; use async_bb8_diesel::AsyncRunQueryDsl; +use diesel::dsl::sql_query; use diesel::prelude::*; use nexus_config::RegionAllocationStrategy; use nexus_types::external_api::params; @@ -269,7 +269,7 @@ impl DataStore { }, allocation_strategy, num_regions_required, - ); + )?; let conn = self.pool_connection_authorized(&opctx).await?; @@ -301,68 +301,48 @@ impl DataStore { return Ok(()); } - #[derive(Debug, thiserror::Error)] - enum RegionDeleteError { - #[error("Numeric error: {0}")] - NumericError(String), - } - let err = OptionalError::new(); let conn = self.pool_connection_unauthorized().await?; self.transaction_retry_wrapper("regions_hard_delete") .transaction(&conn, |conn| { - let err = err.clone(); let region_ids = region_ids.clone(); async move { - use nexus_db_schema::schema::crucible_dataset::dsl as dataset_dsl; - use nexus_db_schema::schema::region::dsl as region_dsl; + use nexus_db_schema::schema::region::dsl; - // Remove the regions, collecting datasets they're from. - let datasets = diesel::delete(region_dsl::region) - .filter(region_dsl::id.eq_any(region_ids)) - .returning(region_dsl::dataset_id) - .get_results_async::(&conn).await?; + // Remove the regions + diesel::delete(dsl::region) + .filter(dsl::id.eq_any(region_ids)) + .execute_async(&conn) + .await?; // Update datasets to which the regions belonged. 
- for dataset in datasets { - let dataset_total_occupied_size: Option< - diesel::pg::data_types::PgNumeric, - > = region_dsl::region - .filter(region_dsl::dataset_id.eq(dataset)) - .select(diesel::dsl::sum( - region_dsl::block_size - * region_dsl::blocks_per_extent - * region_dsl::extent_count, - )) - .nullable() - .get_result_async(&conn).await?; - - let dataset_total_occupied_size: i64 = if let Some( - dataset_total_occupied_size, - ) = - dataset_total_occupied_size - { - let dataset_total_occupied_size: db::model::ByteCount = - dataset_total_occupied_size.try_into().map_err( - |e: anyhow::Error| { - err.bail(RegionDeleteError::NumericError( - e.to_string(), - )) - }, - )?; - - dataset_total_occupied_size.into() - } else { - 0 - }; - - diesel::update(dataset_dsl::crucible_dataset) - .filter(dataset_dsl::id.eq(dataset)) - .set( - dataset_dsl::size_used - .eq(dataset_total_occupied_size), - ) - .execute_async(&conn).await?; - } + sql_query( + r#" +WITH size_used_with_reservation AS ( + SELECT + crucible_dataset.id AS crucible_dataset_id, + SUM( + CASE + WHEN block_size IS NULL THEN 0 + ELSE + CASE + WHEN reservation_percent = '25' THEN + (block_size * blocks_per_extent * extent_count) / 4 + + (block_size * blocks_per_extent * extent_count) + END + END + ) AS reserved_size + FROM crucible_dataset + LEFT JOIN region ON crucible_dataset.id = region.dataset_id + WHERE crucible_dataset.time_deleted IS NULL + GROUP BY crucible_dataset.id +) +UPDATE crucible_dataset +SET size_used = size_used_with_reservation.reserved_size +FROM size_used_with_reservation +WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id"#, + ) + .execute_async(&conn) + .await?; // Whenever a region is hard-deleted, validate invariants // for all volumes @@ -373,52 +353,25 @@ impl DataStore { } }) .await - .map_err(|e| { - if let Some(err) = err.take() { - match err { - RegionDeleteError::NumericError(err) => { - return Error::internal_error( - &format!("Transaction error: {}", err) - ); - } - } - } - public_error_from_diesel(e, ErrorHandler::Server) - }) + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } - /// Return the total occupied size for a dataset - pub async fn regions_total_occupied_size( + /// Return the total reserved size for all the regions allocated to a + /// dataset + pub async fn regions_total_reserved_size( &self, dataset_id: DatasetUuid, ) -> Result { - use nexus_db_schema::schema::region::dsl as region_dsl; - - let total_occupied_size: Option = - region_dsl::region - .filter(region_dsl::dataset_id.eq(to_db_typed_uuid(dataset_id))) - .select(diesel::dsl::sum( - region_dsl::block_size - * region_dsl::blocks_per_extent - * region_dsl::extent_count, - )) - .nullable() - .get_result_async(&*self.pool_connection_unauthorized().await?) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; + use nexus_db_schema::schema::region::dsl; - if let Some(total_occupied_size) = total_occupied_size { - let total_occupied_size: db::model::ByteCount = - total_occupied_size.try_into().map_err( - |e: anyhow::Error| Error::internal_error(&e.to_string()), - )?; + let dataset_regions: Vec = dsl::region + .filter(dsl::dataset_id.eq(to_db_typed_uuid(dataset_id))) + .select(Region::as_select()) + .load_async(&*self.pool_connection_unauthorized().await?) 
+ .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; - Ok(total_occupied_size.to_bytes()) - } else { - Ok(0) - } + Ok(dataset_regions.iter().map(|r| r.reserved_size()).sum()) } /// Find read/write regions on expunged disks diff --git a/nexus/db-queries/src/db/queries/region_allocation.rs b/nexus/db-queries/src/db/queries/region_allocation.rs index bc638ec4e4f..268af89b58e 100644 --- a/nexus/db-queries/src/db/queries/region_allocation.rs +++ b/nexus/db-queries/src/db/queries/region_allocation.rs @@ -5,7 +5,7 @@ //! Implementation of queries for provisioning regions. use crate::db::column_walker::AllColumnsOf; -use crate::db::model::{CrucibleDataset, Region}; +use crate::db::model::{CrucibleDataset, Region, RegionReservationPercent}; use crate::db::raw_query_builder::{QueryBuilder, TypedSqlQuery}; use crate::db::true_or_cast_error::matches_sentinel; use const_format::concatcp; @@ -13,6 +13,7 @@ use diesel::pg::Pg; use diesel::result::Error as DieselError; use diesel::sql_types; use nexus_config::RegionAllocationStrategy; +use nexus_db_schema::enums::RegionReservationPercentEnum; use nexus_db_schema::schema; use omicron_common::api::external; use omicron_uuid_kinds::GenericUuid; @@ -49,7 +50,11 @@ pub fn from_diesel(e: DieselError) -> external::Error { NOT_ENOUGH_ZPOOL_SPACE_SENTINEL => { return external::Error::insufficient_capacity( external_message, - "Not enough zpool space to allocate disks. There may not be enough disks with space for the requested region. You may also see this if your rack is in a degraded state, or you're running the default multi-rack topology configuration in a 1-sled development environment.", + "Not enough zpool space to allocate disks. There may not \ + be enough disks with space for the requested region. You \ + may also see this if your rack is in a degraded state, or \ + you're running the default multi-rack topology \ + configuration in a 1-sled development environment.", ); } NOT_ENOUGH_UNIQUE_ZPOOLS_SENTINEL => { @@ -82,6 +87,79 @@ pub struct RegionParameters { pub read_only: bool, } +type AllocationQuery = + TypedSqlQuery<(SelectableSql, SelectableSql)>; + +/// Currently the largest region that can be allocated matches the largest disk +/// that can be requested, but separate this constant so that when +/// MAX_DISK_SIZE_BYTES is increased the region allocation query will still use +/// this as a maximum size. 
+pub const MAX_REGION_SIZE_BYTES: u64 = 1098437885952; // 1023 * (1 << 30); + +#[derive(Debug)] +pub enum AllocationQueryError { + /// Region size multiplication overflowed u64 + RegionSizeOverflow, + + /// Requested region size larger than maximum + RequestedRegionOverMaxSize { request: u64, maximum: u64 }, + + /// Requested size not divisible by reservation factor + RequestedRegionNotDivisibleByFactor { request: i64, factor: i64 }, + + /// Adding the overhead to the requested size overflowed + RequestedRegionOverheadOverflow { request: i64, overhead: i64 }, + + /// Converting from u64 to i64 truncated + RequestedRegionSizeTruncated { request: u64, e: String }, +} + +impl From for external::Error { + fn from(e: AllocationQueryError) -> external::Error { + match e { + AllocationQueryError::RegionSizeOverflow => { + external::Error::invalid_value( + "region allocation", + "region size overflowed u64", + ) + } + + AllocationQueryError::RequestedRegionOverMaxSize { + request, + maximum, + } => external::Error::invalid_value( + "region allocation", + format!("region size {request} over maximum {maximum}"), + ), + + AllocationQueryError::RequestedRegionNotDivisibleByFactor { + request, + factor, + } => external::Error::invalid_value( + "region allocation", + format!("region size {request} not divisible by {factor}"), + ), + + AllocationQueryError::RequestedRegionOverheadOverflow { + request, + overhead, + } => external::Error::invalid_value( + "region allocation", + format!( + "adding {overhead} to region size {request} overflowed" + ), + ), + + AllocationQueryError::RequestedRegionSizeTruncated { + request, + e, + } => external::Error::internal_error(&format!( + "converting {request} to i64 failed! {e}" + )), + } + } +} + /// For a given volume, idempotently allocate enough regions (according to some /// allocation strategy) to meet some redundancy level. This should only be used /// for the region set that is in the top level of the Volume (not the deeper @@ -93,7 +171,7 @@ pub fn allocation_query( params: RegionParameters, allocation_strategy: &RegionAllocationStrategy, redundancy: usize, -) -> TypedSqlQuery<(SelectableSql, SelectableSql)> { +) -> Result { let (seed, distinct_sleds) = { let (input_seed, distinct_sleds) = match allocation_strategy { RegionAllocationStrategy::Random { seed } => (seed, false), @@ -117,8 +195,65 @@ pub fn allocation_query( let seed = seed.to_le_bytes().to_vec(); - let size_delta = - params.block_size * params.blocks_per_extent * params.extent_count; + // Ensure that the multiplication doesn't overflow. + let requested_size: u64 = params + .block_size + .checked_mul(params.blocks_per_extent) + .ok_or(AllocationQueryError::RegionSizeOverflow)? + .checked_mul(params.extent_count) + .ok_or(AllocationQueryError::RegionSizeOverflow)?; + + if requested_size > MAX_REGION_SIZE_BYTES { + return Err(AllocationQueryError::RequestedRegionOverMaxSize { + request: requested_size, + maximum: MAX_REGION_SIZE_BYTES, + }); + } + + // After the above check, cast from u64 to i64. The value is low enough + // (after the check above) that try_into should always return Ok. 
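As a side note on the bound and cast here: the constant works out to 1023 GiB, and even the worst-case reserved size (the maximum requested size plus 25%) stays far below `i64::MAX`, which is why the `try_into` below is expected to always succeed and the later overhead addition should never overflow in practice. A standalone sanity-check sketch, not part of the change:

    fn main() {
        const MAX_REGION_SIZE_BYTES: u64 = 1098437885952;
        assert_eq!(MAX_REGION_SIZE_BYTES, 1023 * (1u64 << 30)); // 1023 GiB

        // Worst-case size_delta: requested plus 25% overhead.
        let max_reserved = MAX_REGION_SIZE_BYTES + MAX_REGION_SIZE_BYTES / 4;
        assert_eq!(max_reserved, 1373047357440);

        // Comfortably within i64 range, so the u64 -> i64 cast and the
        // checked_add never fail for sizes that pass the maximum-size check.
        assert!(max_reserved < i64::MAX as u64);
    }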
+ let requested_size: i64 = match requested_size.try_into() { + Ok(v) => v, + Err(e) => { + return Err(AllocationQueryError::RequestedRegionSizeTruncated { + request: requested_size, + e: e.to_string(), + }); + } + }; + + let reservation_percent = RegionReservationPercent::TwentyFive; + + let size_delta: i64 = match reservation_percent { + RegionReservationPercent::TwentyFive => { + // Check first that the requested region size is divisible by this. + // This should basically never fail because all block sizes are + // divisible by 4. + if requested_size % 4 != 0 { + return Err( + AllocationQueryError::RequestedRegionNotDivisibleByFactor { + request: requested_size, + factor: 4, + }, + ); + } + + let overhead: i64 = requested_size.checked_div(4).ok_or( + AllocationQueryError::RequestedRegionNotDivisibleByFactor { + request: requested_size, + factor: 4, + }, + )?; + + requested_size.checked_add(overhead).ok_or( + AllocationQueryError::RequestedRegionOverheadOverflow { + request: requested_size, + overhead, + }, + )? + } + }; + let redundancy: i64 = i64::try_from(redundancy).unwrap(); let mut builder = QueryBuilder::new(); @@ -217,7 +352,7 @@ pub fn allocation_query( AND physical_disk.disk_state = 'active' AND NOT(zpool.id = ANY(SELECT existing_zpools.pool_id FROM existing_zpools)) " - ).bind::(size_delta as i64); + ).bind::(size_delta); if distinct_sleds { builder @@ -272,7 +407,8 @@ pub fn allocation_query( ").param().sql(" AS extent_count, NULL AS port, ").param().sql(" AS read_only, - FALSE as deleting + FALSE as deleting, + ").param().sql(" AS reservation_percent FROM shuffled_candidate_datasets") // Only select the *additional* number of candidate regions for the required // redundancy level @@ -286,6 +422,7 @@ pub fn allocation_query( .bind::(params.blocks_per_extent as i64) .bind::(params.extent_count as i64) .bind::(params.read_only) + .bind::(reservation_percent) .bind::(redundancy) // A subquery which summarizes the changes we intend to make, showing: @@ -298,9 +435,10 @@ pub fn allocation_query( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - ((candidate_regions.block_size * candidate_regions.blocks_per_extent) * candidate_regions.extent_count) AS size_used_delta + ").param().sql(" AS size_used_delta FROM (candidate_regions INNER JOIN crucible_dataset ON (crucible_dataset.id = candidate_regions.dataset_id)) ),") + .bind::(size_delta) // Confirms whether or not the insertion and updates should // occur. 
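Regarding the divisibility check in the `TwentyFive` arm above: `requested_size` is a block size multiplied by block counts, and every block size is a multiple of 4, so the `RequestedRegionNotDivisibleByFactor` arm should be unreachable in practice. A small sketch of that reasoning; the 512/2048/4096 block sizes and the extent numbers are assumptions for illustration only:

    fn main() {
        // Assumed block sizes; the argument only needs each one to be a
        // multiple of 4.
        for block_size in [512u64, 2048, 4096] {
            assert_eq!(block_size % 4, 0);

            // requested_size = block_size * blocks_per_extent * extent_count,
            // so it inherits that divisibility whatever the other factors are.
            let blocks_per_extent = 131072u64; // hypothetical
            let extent_count = 7u64; // hypothetical
            let requested = block_size * blocks_per_extent * extent_count;
            assert_eq!(requested % 4, 0);
        }
    }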
@@ -385,7 +523,7 @@ pub fn allocation_query( .sql(" inserted_regions AS ( INSERT INTO region - (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting) + (id, time_created, time_modified, dataset_id, volume_id, block_size, blocks_per_extent, extent_count, port, read_only, deleting, reservation_percent) SELECT ").sql(AllColumnsOfRegion::with_prefix("candidate_regions")).sql(" FROM candidate_regions WHERE @@ -418,7 +556,7 @@ UNION )" ); - builder.query() + Ok(builder.query()) } #[cfg(test)] @@ -458,7 +596,8 @@ mod test { seed: Some(1), }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); expectorate_query_contents( ®ion_allocate, @@ -474,7 +613,8 @@ mod test { params, &RegionAllocationStrategy::Random { seed: Some(1) }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); expectorate_query_contents( ®ion_allocate, "tests/output/region_allocate_random_sleds.sql", @@ -495,7 +635,8 @@ mod test { seed: Some(1), }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); expectorate_query_contents( ®ion_allocate, "tests/output/region_allocate_with_snapshot_distinct_sleds.sql", @@ -510,7 +651,8 @@ mod test { params, &RegionAllocationStrategy::Random { seed: Some(1) }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); expectorate_query_contents( ®ion_allocate, "tests/output/region_allocate_with_snapshot_random_sleds.sql", @@ -543,7 +685,8 @@ mod test { params, &RegionAllocationStrategy::RandomWithDistinctSleds { seed: None }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); let _ = region_allocate .explain_async(&conn) .await @@ -557,7 +700,8 @@ mod test { params, &RegionAllocationStrategy::Random { seed: None }, REGION_REDUNDANCY_THRESHOLD, - ); + ) + .unwrap(); let _ = region_allocate .explain_async(&conn) .await @@ -566,4 +710,64 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + #[test] + fn allocation_query_region_size_overflow() { + let volume_id = VolumeUuid::nil(); + let snapshot_id = None; + + let params = RegionParameters { + block_size: 512, + blocks_per_extent: 4294967296, + extent_count: 8388609, // should cause an overflow! 
+ read_only: false, + }; + + let Err(e) = allocation_query( + volume_id, + snapshot_id, + params, + &RegionAllocationStrategy::RandomWithDistinctSleds { + seed: Some(1), + }, + REGION_REDUNDANCY_THRESHOLD, + ) else { + panic!("expected error"); + }; + + assert!(matches!(e, AllocationQueryError::RegionSizeOverflow)); + } + + #[test] + fn allocation_query_region_size_too_large() { + let volume_id = VolumeUuid::nil(); + let snapshot_id = None; + + let params = RegionParameters { + block_size: 512, + blocks_per_extent: 8388608, // 2^32 / 512 + extent_count: 256, // 255 would be ok, 256 is too large + read_only: false, + }; + + let Err(e) = allocation_query( + volume_id, + snapshot_id, + params, + &RegionAllocationStrategy::RandomWithDistinctSleds { + seed: Some(1), + }, + REGION_REDUNDANCY_THRESHOLD, + ) else { + panic!("expected error!"); + }; + + assert!(matches!( + e, + AllocationQueryError::RequestedRegionOverMaxSize { + request: 1099511627776u64, + maximum: MAX_REGION_SIZE_BYTES, + } + )); + } } diff --git a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql index 81741bf8322..51f56f89ac6 100644 --- a/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_distinct_sleds.sql @@ -12,7 +12,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_percent FROM region WHERE @@ -113,21 +114,19 @@ WITH $10 AS extent_count, NULL AS port, $11 AS read_only, - false AS deleting + false AS deleting, + $12 AS reservation_percent FROM shuffled_candidate_datasets LIMIT - $12 - (SELECT count(*) FROM old_regions) + $13 - (SELECT count(*) FROM old_regions) ), proposed_dataset_changes AS ( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count - AS size_used_delta + $14 AS size_used_delta FROM candidate_regions INNER JOIN crucible_dataset ON crucible_dataset.id = candidate_regions.dataset_id @@ -137,7 +136,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $13 + (SELECT count(*) FROM old_regions LIMIT 1) < $15 AND CAST( IF( ( @@ -147,7 +146,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $14 + >= $16 ), 'TRUE', 'Not enough space' @@ -164,7 +163,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $15 + >= $17 ), 'TRUE', 'Not enough datasets' @@ -203,7 +202,7 @@ WITH 1 ) ) - >= $16 + >= $18 ), 'TRUE', 'Not enough unique zpools selected' @@ -228,7 +227,8 @@ WITH extent_count, port, read_only, - deleting + deleting, + reservation_percent ) SELECT candidate_regions.id, @@ -241,7 +241,8 @@ WITH candidate_regions.extent_count, candidate_regions.port, candidate_regions.read_only, - candidate_regions.deleting + candidate_regions.deleting, + candidate_regions.reservation_percent FROM candidate_regions WHERE @@ -257,7 +258,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_percent ), updated_datasets AS ( @@ -311,7 +313,8 @@ WITH old_regions.extent_count, old_regions.port, old_regions.read_only, - old_regions.deleting + old_regions.deleting, + old_regions.reservation_percent FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -337,7 +340,8 @@ UNION inserted_regions.extent_count, inserted_regions.port, 
inserted_regions.read_only, - inserted_regions.deleting + inserted_regions.deleting, + inserted_regions.reservation_percent FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql index dcfa9736a9b..7e1ac153a3f 100644 --- a/nexus/db-queries/tests/output/region_allocate_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_random_sleds.sql @@ -12,7 +12,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_percent FROM region WHERE @@ -101,21 +102,19 @@ WITH $9 AS extent_count, NULL AS port, $10 AS read_only, - false AS deleting + false AS deleting, + $11 AS reservation_percent FROM shuffled_candidate_datasets LIMIT - $11 - (SELECT count(*) FROM old_regions) + $12 - (SELECT count(*) FROM old_regions) ), proposed_dataset_changes AS ( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count - AS size_used_delta + $13 AS size_used_delta FROM candidate_regions INNER JOIN crucible_dataset ON crucible_dataset.id = candidate_regions.dataset_id @@ -125,7 +124,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $12 + (SELECT count(*) FROM old_regions LIMIT 1) < $14 AND CAST( IF( ( @@ -135,7 +134,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $13 + >= $15 ), 'TRUE', 'Not enough space' @@ -152,7 +151,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $14 + >= $16 ), 'TRUE', 'Not enough datasets' @@ -191,7 +190,7 @@ WITH 1 ) ) - >= $15 + >= $17 ), 'TRUE', 'Not enough unique zpools selected' @@ -216,7 +215,8 @@ WITH extent_count, port, read_only, - deleting + deleting, + reservation_percent ) SELECT candidate_regions.id, @@ -229,7 +229,8 @@ WITH candidate_regions.extent_count, candidate_regions.port, candidate_regions.read_only, - candidate_regions.deleting + candidate_regions.deleting, + candidate_regions.reservation_percent FROM candidate_regions WHERE @@ -245,7 +246,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_percent ), updated_datasets AS ( @@ -299,7 +301,8 @@ WITH old_regions.extent_count, old_regions.port, old_regions.read_only, - old_regions.deleting + old_regions.deleting, + old_regions.reservation_percent FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -325,7 +328,8 @@ UNION inserted_regions.extent_count, inserted_regions.port, inserted_regions.read_only, - inserted_regions.deleting + inserted_regions.deleting, + inserted_regions.reservation_percent FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql index 856e1ec3f41..03b2b6cb708 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_distinct_sleds.sql @@ -12,7 +12,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_percent FROM region WHERE @@ -125,21 +126,19 @@ WITH $11 AS extent_count, NULL 
AS port, $12 AS read_only, - false AS deleting + false AS deleting, + $13 AS reservation_percent FROM shuffled_candidate_datasets LIMIT - $13 - (SELECT count(*) FROM old_regions) + $14 - (SELECT count(*) FROM old_regions) ), proposed_dataset_changes AS ( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count - AS size_used_delta + $15 AS size_used_delta FROM candidate_regions INNER JOIN crucible_dataset ON crucible_dataset.id = candidate_regions.dataset_id @@ -149,7 +148,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $14 + (SELECT count(*) FROM old_regions LIMIT 1) < $16 AND CAST( IF( ( @@ -159,7 +158,7 @@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $15 + >= $17 ), 'TRUE', 'Not enough space' @@ -176,7 +175,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $16 + >= $18 ), 'TRUE', 'Not enough datasets' @@ -215,7 +214,7 @@ WITH 1 ) ) - >= $17 + >= $19 ), 'TRUE', 'Not enough unique zpools selected' @@ -240,7 +239,8 @@ WITH extent_count, port, read_only, - deleting + deleting, + reservation_percent ) SELECT candidate_regions.id, @@ -253,7 +253,8 @@ WITH candidate_regions.extent_count, candidate_regions.port, candidate_regions.read_only, - candidate_regions.deleting + candidate_regions.deleting, + candidate_regions.reservation_percent FROM candidate_regions WHERE @@ -269,7 +270,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_percent ), updated_datasets AS ( @@ -323,7 +325,8 @@ WITH old_regions.extent_count, old_regions.port, old_regions.read_only, - old_regions.deleting + old_regions.deleting, + old_regions.reservation_percent FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -349,7 +352,8 @@ UNION inserted_regions.extent_count, inserted_regions.port, inserted_regions.read_only, - inserted_regions.deleting + inserted_regions.deleting, + inserted_regions.reservation_percent FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql index 60637b53778..b71578e0509 100644 --- a/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql +++ b/nexus/db-queries/tests/output/region_allocate_with_snapshot_random_sleds.sql @@ -12,7 +12,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_percent FROM region WHERE @@ -113,21 +114,19 @@ WITH $10 AS extent_count, NULL AS port, $11 AS read_only, - false AS deleting + false AS deleting, + $12 AS reservation_percent FROM shuffled_candidate_datasets LIMIT - $12 - (SELECT count(*) FROM old_regions) + $13 - (SELECT count(*) FROM old_regions) ), proposed_dataset_changes AS ( SELECT candidate_regions.dataset_id AS id, crucible_dataset.pool_id AS pool_id, - candidate_regions.block_size - * candidate_regions.blocks_per_extent - * candidate_regions.extent_count - AS size_used_delta + $14 AS size_used_delta FROM candidate_regions INNER JOIN crucible_dataset ON crucible_dataset.id = candidate_regions.dataset_id @@ -137,7 +136,7 @@ WITH SELECT ( ( - (SELECT count(*) FROM old_regions LIMIT 1) < $13 + (SELECT count(*) FROM old_regions LIMIT 1) < $15 AND CAST( IF( ( @@ -147,7 +146,7 
@@ WITH + (SELECT count(*) FROM existing_zpools LIMIT 1) ) ) - >= $14 + >= $16 ), 'TRUE', 'Not enough space' @@ -164,7 +163,7 @@ WITH + (SELECT count(*) FROM old_regions LIMIT 1) ) ) - >= $15 + >= $17 ), 'TRUE', 'Not enough datasets' @@ -203,7 +202,7 @@ WITH 1 ) ) - >= $16 + >= $18 ), 'TRUE', 'Not enough unique zpools selected' @@ -228,7 +227,8 @@ WITH extent_count, port, read_only, - deleting + deleting, + reservation_percent ) SELECT candidate_regions.id, @@ -241,7 +241,8 @@ WITH candidate_regions.extent_count, candidate_regions.port, candidate_regions.read_only, - candidate_regions.deleting + candidate_regions.deleting, + candidate_regions.reservation_percent FROM candidate_regions WHERE @@ -257,7 +258,8 @@ WITH region.extent_count, region.port, region.read_only, - region.deleting + region.deleting, + region.reservation_percent ), updated_datasets AS ( @@ -311,7 +313,8 @@ WITH old_regions.extent_count, old_regions.port, old_regions.read_only, - old_regions.deleting + old_regions.deleting, + old_regions.reservation_percent FROM old_regions INNER JOIN crucible_dataset ON old_regions.dataset_id = crucible_dataset.id ) @@ -337,7 +340,8 @@ UNION inserted_regions.extent_count, inserted_regions.port, inserted_regions.read_only, - inserted_regions.deleting + inserted_regions.deleting, + inserted_regions.reservation_percent FROM inserted_regions INNER JOIN updated_datasets ON inserted_regions.dataset_id = updated_datasets.id diff --git a/nexus/db-schema/src/enums.rs b/nexus/db-schema/src/enums.rs index 31e70dfd2e2..c529ef824fb 100644 --- a/nexus/db-schema/src/enums.rs +++ b/nexus/db-schema/src/enums.rs @@ -66,6 +66,7 @@ define_enums! { ReadOnlyTargetReplacementTypeEnum => "read_only_target_replacement_type", RegionReplacementStateEnum => "region_replacement_state", RegionReplacementStepTypeEnum => "region_replacement_step_type", + RegionReservationPercentEnum => "region_reservation_percent", RegionSnapshotReplacementStateEnum => "region_snapshot_replacement_state", RegionSnapshotReplacementStepStateEnum => "region_snapshot_replacement_step_state", RotImageErrorEnum => "rot_image_error", diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 28abcff9c89..737b7d5665b 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1127,6 +1127,8 @@ table! 
{ read_only -> Bool, deleting -> Bool, + + reservation_percent -> crate::enums::RegionReservationPercentEnum, } } diff --git a/nexus/src/app/sagas/disk_create.rs b/nexus/src/app/sagas/disk_create.rs index e9df037916f..701fdae63b7 100644 --- a/nexus/src/app/sagas/disk_create.rs +++ b/nexus/src/app/sagas/disk_create.rs @@ -1014,7 +1014,7 @@ pub(crate) mod test { for zpool in test.zpools() { for dataset in &zpool.datasets { if datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap() != 0 diff --git a/nexus/src/app/sagas/region_replacement_start.rs b/nexus/src/app/sagas/region_replacement_start.rs index f38f2ad5f33..b1fe3c5a27b 100644 --- a/nexus/src/app/sagas/region_replacement_start.rs +++ b/nexus/src/app/sagas/region_replacement_start.rs @@ -1027,7 +1027,7 @@ pub(crate) mod test { for zpool in test.zpools() { for dataset in &zpool.datasets { if datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap() != 0 diff --git a/nexus/test-utils/src/resource_helpers.rs b/nexus/test-utils/src/resource_helpers.rs index 6f23bfafca0..8be043655dd 100644 --- a/nexus/test-utils/src/resource_helpers.rs +++ b/nexus/test-utils/src/resource_helpers.rs @@ -1185,7 +1185,7 @@ pub struct DiskTest<'a, N: NexusServer> { } impl<'a, N: NexusServer> DiskTest<'a, N> { - pub const DEFAULT_ZPOOL_SIZE_GIB: u32 = 10; + pub const DEFAULT_ZPOOL_SIZE_GIB: u32 = 16; pub const DEFAULT_ZPOOL_COUNT: u32 = 3; /// Creates a new [DiskTest] with default configuration. diff --git a/nexus/tests/integration_tests/disks.rs b/nexus/tests/integration_tests/disks.rs index 1951a17bf91..29e3d7cd865 100644 --- a/nexus/tests/integration_tests/disks.rs +++ b/nexus/tests/integration_tests/disks.rs @@ -998,13 +998,13 @@ async fn test_disk_backed_by_multiple_region_sets( ) { let client = &cptestctx.external_client; - // Create three zpools, all 10 gibibytes, each with one dataset + // Create three zpools, each with one dataset let mut test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); + // Assert default is still 16 GiB + assert_eq!(16, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create another three zpools, all 10 gibibytes, each with one dataset + // Create another three zpools, each with one dataset test.add_zpool_with_dataset(cptestctx.first_sled_id()).await; test.add_zpool_with_dataset(cptestctx.first_sled_id()).await; test.add_zpool_with_dataset(cptestctx.first_sled_id()).await; @@ -1044,10 +1044,10 @@ async fn test_disk_too_big(cptestctx: &ControlPlaneTestContext) { DiskTest::new(&cptestctx).await; create_project_and_pool(client).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); + // Assert default is still 16 GiB + assert_eq!(16, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Ask for a 300 gibibyte disk (but only 10 is available) + // Ask for a 300 gibibyte disk (but only 16 is available) let disk_size = ByteCount::from_gibibytes_u32(300); let disks_url = get_disks_url(); let new_disk = params::DiskCreate { @@ -1546,11 +1546,11 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { let nexus = &cptestctx.server.server_context().nexus; let datastore = nexus.datastore(); - // Create three 10 GiB zpools, each with one dataset. + // Create three zpools, each with one dataset. 
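On the `DEFAULT_ZPOOL_SIZE_GIB` bump from 10 to 16 above: with the 25% reservation now counted in `size_used`, each 10 GiB region reserves 12.5 GiB on its dataset, which no longer fits a 10 GiB test zpool. A back-of-the-envelope sketch (standalone, hypothetical sizes) that also matches the 8960 MiB and 12800 MiB assertions in the disk-accounting test below:

    fn main() {
        let gib: u64 = 1 << 30;
        let mib: u64 = 1 << 20;

        // Each 10 GiB region now reserves 12.5 GiB (12800 MiB) on its dataset:
        // too big for a 10 GiB zpool, fine in a 16 GiB one.
        let reserved_10 = 10 * gib + (10 * gib) / 4;
        assert_eq!(reserved_10, 12800 * mib);
        assert!(reserved_10 > 10 * gib && reserved_10 < 16 * gib);

        // A 7 GiB region reserves 8.75 GiB (8960 MiB).
        let reserved_7 = 7 * gib + (7 * gib) / 4;
        assert_eq!(reserved_7, 8960 * mib);
    }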
let test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); + // Assert default is still 16 GiB + assert_eq!(16, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); create_project_and_pool(client).await; @@ -1559,11 +1559,20 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { for dataset in &zpool.datasets { assert_eq!( datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(), 0 ); + + assert_eq!( + datastore + .crucible_dataset_get(dataset.id) + .await + .unwrap() + .size_used, + 0, + ); } } @@ -1594,21 +1603,22 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { // Total occupied size is 7 GiB * 3 (each Crucible disk requires three // regions to make a region set for an Upstairs, one region per dataset) + // plus reservation overhead for zpool in test.zpools() { for dataset in &zpool.datasets { assert_eq!( datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(), - ByteCount::from_gibibytes_u32(7).to_bytes(), + ByteCount::from_mebibytes_u32(8960).to_bytes(), ); } } - // Ask for a 4 gibibyte disk, this should fail because there isn't space + // Ask for a 6 gibibyte disk, this should fail because there isn't space // available. - let disk_size = ByteCount::from_gibibytes_u32(4); + let disk_size = ByteCount::from_gibibytes_u32(6); let disk_two = params::DiskCreate { identity: IdentityMetadataCreateParams { name: "disk-two".parse().unwrap(), @@ -1628,17 +1638,17 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { .authn_as(AuthnMode::PrivilegedUser) .execute() .await - .expect("unexpected success creating 4 GiB disk"); + .expect("unexpected success creating 6 GiB disk"); - // Total occupied size is still 7 GiB * 3 + // Total occupied size is still 7 GiB * 3 (plus overhead) for zpool in test.zpools() { for dataset in &zpool.datasets { assert_eq!( datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(), - ByteCount::from_gibibytes_u32(7).to_bytes(), + ByteCount::from_mebibytes_u32(8960).to_bytes(), ); } } @@ -1659,7 +1669,7 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { for dataset in &zpool.datasets { assert_eq!( datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(), 0, @@ -1690,15 +1700,15 @@ async fn test_disk_size_accounting(cptestctx: &ControlPlaneTestContext) { .await .expect("unexpected failure creating 10 GiB disk"); - // Total occupied size should be 10 GiB * 3 + // Total occupied size should be 10 GiB * 3 plus overhead for zpool in test.zpools() { for dataset in &zpool.datasets { assert_eq!( datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(), - ByteCount::from_gibibytes_u32(10).to_bytes(), + ByteCount::from_mebibytes_u32(12800).to_bytes(), ); } } @@ -1711,16 +1721,13 @@ async fn test_multiple_disks_multiple_zpools( ) { let client = &cptestctx.external_client; - // Create six 10 GB zpools, each with one dataset + // Create six zpools, each with one dataset let _test = DiskTestBuilder::new(&cptestctx) .on_specific_sled(cptestctx.first_sled_id()) .with_zpool_count(6) .build() .await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - create_project_and_pool(client).await; // Ask for a 10 gibibyte disk, this 
should succeed @@ -2086,12 +2093,9 @@ async fn test_single_region_allocate(cptestctx: &ControlPlaneTestContext) { let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create three 10 GiB zpools, each with one dataset. + // Create three zpools, each with one dataset. let disk_test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Allocate a single 1 GB region let volume_id = VolumeUuid::new_v4(); @@ -2133,11 +2137,11 @@ async fn test_single_region_allocate(cptestctx: &ControlPlaneTestContext) { for zpool in disk_test.zpools() { for dataset in &zpool.datasets { let total_size = datastore - .regions_total_occupied_size(dataset.id) + .regions_total_reserved_size(dataset.id) .await .unwrap(); - if total_size == 1073741824 { + if total_size == allocated_region.reserved_size() { number_of_matching_regions += 1; } else if total_size == 0 { // ok, unallocated @@ -2160,16 +2164,13 @@ async fn test_region_allocation_strategy_random_is_idempotent( let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create four 10 GiB zpools, each with one dataset. + // Create four zpools, each with one dataset. let _test = DiskTestBuilder::new(&cptestctx) .on_specific_sled(cptestctx.first_sled_id()) .with_zpool_count(4) .build() .await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; @@ -2230,16 +2231,13 @@ async fn test_region_allocation_strategy_random_is_idempotent_arbitrary( let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create four 10 GiB zpools, each with one dataset. + // Create four zpools, each with one dataset. let _test = DiskTestBuilder::new(&cptestctx) .on_specific_sled(cptestctx.first_sled_id()) .with_zpool_count(4) .build() .await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Call region allocation in isolation let volume_id = VolumeUuid::new_v4(); @@ -2293,7 +2291,7 @@ async fn test_single_region_allocate_for_replace( let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create four 10 GiB zpools, each with one dataset. + // Create four zpools, each with one dataset. // // We add one more then the "three" default to meet `region_allocate`'s // redundancy requirements. @@ -2303,9 +2301,6 @@ async fn test_single_region_allocate_for_replace( .build() .await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; @@ -2387,12 +2382,9 @@ async fn test_single_region_allocate_for_replace_not_enough_zpools( let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create three 10 GiB zpools, each with one dataset. + // Create three zpools, each with one dataset. 
let _disk_test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; @@ -2558,12 +2550,9 @@ async fn test_disk_expunge(cptestctx: &ControlPlaneTestContext) { let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create three 10 GiB zpools, each with one dataset. + // Create three zpools, each with one dataset. let _disk_test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; diff --git a/nexus/tests/integration_tests/schema.rs b/nexus/tests/integration_tests/schema.rs index 86288cf9883..c0de0fe47e0 100644 --- a/nexus/tests/integration_tests/schema.rs +++ b/nexus/tests/integration_tests/schema.rs @@ -1854,6 +1854,199 @@ fn after_133_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { }) } +fn before_134_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, ()> { + Box::pin(async { + // To test the size_used upgrader, create a few crucible datasets and + // regions + + // First, a crucible dataset with no regions + let dataset_id: Uuid = + "00000001-0000-0000-0000-000000000000".parse().unwrap(); + let pool_id = Uuid::new_v4(); + let size_used = 0; + + ctx.client + .batch_execute(&format!( + "INSERT INTO crucible_dataset VALUES ( + '{dataset_id}', + now(), + now(), + null, + 1, + '{pool_id}', + '::1', + 10000, + {size_used} + )" + )) + .await + .expect("inserted"); + + // Then, a crucible dataset with 1 region + let dataset_id: Uuid = + "00000002-0000-0000-0000-000000000000".parse().unwrap(); + + let region_id = Uuid::new_v4(); + let pool_id = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + let block_size = 512; + let blocks_per_extent = 64; + let extent_count = 1000; + + ctx.client + .batch_execute(&format!( + "INSERT INTO region VALUES ( + '{region_id}', + now(), + now(), + '{dataset_id}', + '{volume_id}', + {block_size}, + {blocks_per_extent}, + {extent_count}, + 5000, + false, + false + )" + )) + .await + .expect("inserted"); + + let size_used = block_size * blocks_per_extent * extent_count; + + ctx.client + .batch_execute(&format!( + "INSERT INTO crucible_dataset VALUES ( + '{dataset_id}', + now(), + now(), + null, + 1, + '{pool_id}', + '::1', + 10000, + {size_used} + )" + )) + .await + .expect("inserted"); + + // Finally, a crucible dataset with 3 regions + let dataset_id: Uuid = + "00000003-0000-0000-0000-000000000000".parse().unwrap(); + let pool_id = Uuid::new_v4(); + + let block_size = 512; + let blocks_per_extent = 64; + let extent_count = 7000; + + for _ in 0..3 { + let region_id = Uuid::new_v4(); + let volume_id = Uuid::new_v4(); + + ctx.client + .batch_execute(&format!( + "INSERT INTO region VALUES ( + '{region_id}', + now(), + now(), + '{dataset_id}', + '{volume_id}', + {block_size}, + {blocks_per_extent}, + {extent_count}, + 5000, + false, + false + )" + )) + .await + .expect("inserted"); + } + + let size_used = 3 * block_size * blocks_per_extent * extent_count; + + ctx.client + .batch_execute(&format!( + "INSERT INTO crucible_dataset VALUES ( + '{dataset_id}', + now(), + now(), + null, + 1, + '{pool_id}', + '::1', + 10000, + {size_used} + )" + )) + .await + .expect("inserted"); + }) +} + +fn after_134_0_0<'a>(ctx: &'a MigrationContext<'a>) -> BoxFuture<'a, 
()> { + Box::pin(async { + // The first crucible dataset still has size_used = 0 + let rows = ctx + .client + .query( + "SELECT size_used FROM crucible_dataset WHERE + id = '00000001-0000-0000-0000-000000000000'", + &[], + ) + .await + .expect("select"); + + let records = process_rows(&rows); + assert_eq!(records.len(), 1); + assert_eq!( + records[0].values, + vec![ColumnValue::new("size_used", 0i64)], + ); + + // Note: the default crucible reservation factor is 1.25 + + // The second crucible dataset has + // size_used = 1.25 * (512 * 64 * 1000) + let rows = ctx + .client + .query( + "SELECT size_used FROM crucible_dataset WHERE + id = '00000002-0000-0000-0000-000000000000'", + &[], + ) + .await + .expect("select"); + + let records = process_rows(&rows); + assert_eq!(records.len(), 1); + assert_eq!( + records[0].values, + vec![ColumnValue::new("size_used", 40960000i64)], + ); + + // The third crucible dataset has + // size_used = 1.25 * (3 * 512 * 64 * 7000) + let rows = ctx + .client + .query( + "SELECT size_used FROM crucible_dataset WHERE + id = '00000003-0000-0000-0000-000000000000'", + &[], + ) + .await + .expect("select"); + + let records = process_rows(&rows); + assert_eq!(records.len(), 1); + assert_eq!( + records[0].values, + vec![ColumnValue::new("size_used", 860160000i64)], + ); + }) +} + // Lazily initializes all migration checks. The combination of Rust function // pointers and async makes defining a static table fairly painful, so we're // using lazy initialization instead. @@ -1906,6 +2099,11 @@ fn get_migration_checks() -> BTreeMap { Version::new(133, 0, 0), DataMigrationFns::new().before(before_133_0_0).after(after_133_0_0), ); + map.insert( + Version::new(134, 0, 0), + DataMigrationFns::new().before(before_134_0_0).after(after_134_0_0), + ); + map } diff --git a/nexus/tests/integration_tests/snapshots.rs b/nexus/tests/integration_tests/snapshots.rs index e63c54e1589..e4ccb18aed6 100644 --- a/nexus/tests/integration_tests/snapshots.rs +++ b/nexus/tests/integration_tests/snapshots.rs @@ -1461,7 +1461,7 @@ async fn test_region_allocation_for_snapshot( let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create four 10 GiB zpools, each with one dataset. + // Create four zpools, each with one dataset. // // We add one more than the "three" default to avoid failing // with "not enough storage". @@ -1472,9 +1472,6 @@ async fn test_region_allocation_for_snapshot( .build() .await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; @@ -1656,12 +1653,9 @@ async fn test_snapshot_expunge(cptestctx: &ControlPlaneTestContext) { let opctx = OpContext::for_tests(cptestctx.logctx.log.new(o!()), datastore.clone()); - // Create three 10 GiB zpools, each with one dataset. + // Create three zpools, each with one dataset. 
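For the `after_134_0_0` expectations above: the values are the per-region 1.25x reservation applied to the rows inserted in `before_134_0_0`, while the dataset with no regions stays at 0 because the backfill's LEFT JOIN produces NULL block sizes that the CASE maps to 0. A standalone sketch of the arithmetic, not part of the change:

    fn main() {
        // Dataset 2: one region of 512-byte blocks, 64 blocks/extent,
        // 1000 extents.
        let region: i64 = 512 * 64 * 1000;
        assert_eq!(region + region / 4, 40960000);

        // Dataset 3: three regions of 512-byte blocks, 64 blocks/extent,
        // 7000 extents, each reserved individually and then summed.
        let region: i64 = 512 * 64 * 7000;
        assert_eq!(3 * (region + region / 4), 860160000);
    }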
let _disk_test = DiskTest::new(&cptestctx).await; - // Assert default is still 10 GiB - assert_eq!(10, DiskTest::DEFAULT_ZPOOL_SIZE_GIB); - // Create a disk, then a snapshot of that disk let client = &cptestctx.external_client; let _project_id = create_project_and_pool(client).await; diff --git a/schema/crdb/crucible-agent-reservation-overhead/up01.sql b/schema/crdb/crucible-agent-reservation-overhead/up01.sql new file mode 100644 index 00000000000..4bc50409f8e --- /dev/null +++ b/schema/crdb/crucible-agent-reservation-overhead/up01.sql @@ -0,0 +1,3 @@ +CREATE TYPE IF NOT EXISTS omicron.public.region_reservation_percent AS ENUM ( + '25' +); diff --git a/schema/crdb/crucible-agent-reservation-overhead/up02.sql b/schema/crdb/crucible-agent-reservation-overhead/up02.sql new file mode 100644 index 00000000000..d498edf0525 --- /dev/null +++ b/schema/crdb/crucible-agent-reservation-overhead/up02.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.region + ADD COLUMN IF NOT EXISTS reservation_percent omicron.public.region_reservation_percent NOT NULL DEFAULT '25'; diff --git a/schema/crdb/crucible-agent-reservation-overhead/up03.sql b/schema/crdb/crucible-agent-reservation-overhead/up03.sql new file mode 100644 index 00000000000..c7c13a6dbed --- /dev/null +++ b/schema/crdb/crucible-agent-reservation-overhead/up03.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.region + ALTER COLUMN reservation_percent DROP DEFAULT; diff --git a/schema/crdb/crucible-agent-reservation-overhead/up04.sql b/schema/crdb/crucible-agent-reservation-overhead/up04.sql new file mode 100644 index 00000000000..cfd0ff7d0d1 --- /dev/null +++ b/schema/crdb/crucible-agent-reservation-overhead/up04.sql @@ -0,0 +1,23 @@ +WITH size_used_with_reservation AS ( + SELECT + crucible_dataset.id AS crucible_dataset_id, + SUM( + CASE + WHEN block_size IS NULL THEN 0 + ELSE + CASE + WHEN reservation_percent = '25' THEN + (block_size * blocks_per_extent * extent_count) / 4 + + (block_size * blocks_per_extent * extent_count) + END + END + ) AS reserved_size + FROM crucible_dataset + LEFT JOIN region ON crucible_dataset.id = region.dataset_id + WHERE crucible_dataset.time_deleted IS NULL + GROUP BY crucible_dataset.id +) +UPDATE crucible_dataset +SET size_used = size_used_with_reservation.reserved_size +FROM size_used_with_reservation +WHERE crucible_dataset.id = size_used_with_reservation.crucible_dataset_id; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 9104db4c5c7..b71842b65ef 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -598,6 +598,11 @@ CREATE TABLE IF NOT EXISTS omicron.public.crucible_dataset ( * Reconfigurator rendezvous process, this field is set to 0. Reconfigurator * otherwise ignores this field. It's updated by Nexus as region allocations * and deletions are performed using this dataset. + * + * Note that the value in this column is _not_ the sum of requested region + * sizes, but sum of the size *reserved* by the Crucible agent for the + * dataset that contains the regions (which is larger than the the actual + * region size). */ size_used INT NOT NULL ); @@ -615,6 +620,10 @@ CREATE INDEX IF NOT EXISTS lookup_crucible_dataset_by_zpool ON CREATE INDEX IF NOT EXISTS lookup_crucible_dataset_by_ip ON omicron.public.crucible_dataset (ip); +CREATE TYPE IF NOT EXISTS omicron.public.region_reservation_percent AS ENUM ( + '25' +); + /* * A region of space allocated to Crucible Downstairs, within a dataset. 
*/ @@ -639,7 +648,15 @@ CREATE TABLE IF NOT EXISTS omicron.public.region ( read_only BOOL NOT NULL, - deleting BOOL NOT NULL + deleting BOOL NOT NULL, + + /* + * The Crucible Agent will reserve space for a region with overhead for + * on-disk metadata that the downstairs needs to store. Record here the + * overhead associated with a specific region as this may change or be + * configurable in the future. + */ + reservation_percent omicron.public.region_reservation_percent NOT NULL ); /* @@ -5001,7 +5018,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '133.0.0', NULL) + (TRUE, NOW(), NOW(), '134.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT;