22 changes: 17 additions & 5 deletions sled-agent/src/config.rs
@@ -57,11 +57,23 @@ pub struct Config {
pub sled_mode: SledMode,
// TODO: Remove once this can be auto-detected.
pub sidecar_revision: SidecarRevision,
/// Optional percentage of DRAM to reserve for guest memory
pub vmm_reservoir_percentage: Option<u8>,
/// Optional percentage of otherwise-unbudgeted DRAM to reserve for guest
/// memory, after accounting for expected host OS memory consumption and, if
/// set, `vmm_reservoir_size_mb`.
pub vmm_reservoir_percentage: Option<f32>,
/// Optional DRAM to reserve for guest memory in MiB (mutually exclusive
/// option with vmm_reservoir_percentage).
/// option with vmm_reservoir_percentage). This can be at most the amount of
/// otherwise-unbudgeted memory on the sled - a setting high enough to
/// oversubscribe physical memory results in a `sled-agent` error at
/// startup.
pub vmm_reservoir_size_mb: Option<u32>,
/// Amount of memory to set aside in anticipation of use for services that
/// will have roughly constant memory use. These are services that may have
/// zero or one instance on a given sled - internal DNS, MGS, Nexus,
/// ClickHouse, and so on. For a sled that happens to not run these kinds of
/// control plane services, this memory is "wasted", but ensures the sled
/// could run those services if reconfiguration desired it.
pub control_plane_memory_earmark_mb: Option<u32>,
/// Optional swap device size in GiB
pub swap_device_size_gb: Option<u32>,
/// Optional VLAN ID to be used for tagging guest VNICs.
@@ -181,8 +193,8 @@ mod test {
let entry = entry.unwrap();
if entry.file_name() == "config.toml" {
let path = entry.path();
Config::from_file(&path).unwrap_or_else(|_| {
panic!("Failed to parse config {path}")
Config::from_file(&path).unwrap_or_else(|e| {
panic!("Failed to parse config {path}: {e}")
});
configs_seen += 1;
}
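The two reservoir knobs above are mutually exclusive and feed `ReservoirMode::from_config` (changed later in this diff), with the percentage now an `f32`. A minimal sketch of the expected mapping, written as an illustrative in-crate test; the concrete return values are inferred from the doc comments, not taken from the elided function body:

#[cfg(test)]
mod reservoir_mode_config_sketch {
    use crate::vmm_reservoir::ReservoirMode;

    #[test]
    fn from_config_mapping() {
        // Percentage-only configuration.
        assert!(matches!(
            ReservoirMode::from_config(Some(80.0), None),
            Some(ReservoirMode::Percentage(_))
        ));
        // Size-only configuration, in MiB.
        assert!(matches!(
            ReservoirMode::from_config(None, Some(65_536)),
            Some(ReservoirMode::Size(65_536))
        ));
        // Neither option set: no reservoir is configured.
        assert!(ReservoirMode::from_config(None, None).is_none());
        // Setting both options is documented to panic ("Panic upon invalid
        // configuration"), so that case is not exercised here.
    }
}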
19 changes: 13 additions & 6 deletions sled-agent/src/sled_agent.rs
@@ -69,7 +69,7 @@ use sled_agent_types::zone_bundle::{
PriorityOrder, StorageLimit, ZoneBundleMetadata,
};
use sled_diagnostics::{SledDiagnosticsCmdError, SledDiagnosticsCmdOutput};
use sled_hardware::{HardwareManager, underlay};
use sled_hardware::{HardwareManager, MemoryReservations, underlay};
use sled_hardware_types::Baseboard;
use sled_hardware_types::underlay::BootstrapInterface;
use sled_storage::manager::StorageHandle;
@@ -495,18 +495,25 @@ impl SledAgent {
*sled_address.ip(),
);

// The VMM reservoir is configured with respect to what's left after
// accounting for relatively fixed and predictable uses.
// We expect certain amounts of memory to be set aside for kernel,
// buffer, or control plane uses.
let memory_sizes = MemoryReservations::new(
parent_log.new(o!("component" => "MemoryReservations")),
long_running_task_handles.hardware_manager.clone(),
config.control_plane_memory_earmark_mb,
);

// Configure the VMM reservoir as either a percentage of DRAM or as an
// exact size in MiB.
let reservoir_mode = ReservoirMode::from_config(
config.vmm_reservoir_percentage,
config.vmm_reservoir_size_mb,
);

let vmm_reservoir_manager = VmmReservoirManager::spawn(
&log,
long_running_task_handles.hardware_manager.clone(),
reservoir_mode,
);
let vmm_reservoir_manager =
VmmReservoirManager::spawn(&log, memory_sizes, reservoir_mode);

let instances = InstanceManager::new(
parent_log.clone(),
56 changes: 24 additions & 32 deletions sled-agent/src/vmm_reservoir.rs
@@ -12,7 +12,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
use std::thread;
use tokio::sync::{broadcast, oneshot};

use sled_hardware::HardwareManager;
use sled_hardware::MemoryReservations;

#[derive(thiserror::Error, Debug)]
pub enum Error {
@@ -35,7 +35,7 @@ pub enum Error {
#[derive(Debug, Clone, Copy)]
pub enum ReservoirMode {
Size(u32),
Percentage(u8),
Percentage(f32),
}

impl ReservoirMode {
@@ -44,7 +44,7 @@ impl ReservoirMode {
///
/// Panic upon invalid configuration
pub fn from_config(
percentage: Option<u8>,
percentage: Option<f32>,
size_mb: Option<u32>,
) -> Option<ReservoirMode> {
match (percentage, size_mb) {
@@ -135,6 +135,7 @@ impl VmmReservoirManagerHandle {

/// Manage the VMM reservoir in a background thread
pub struct VmmReservoirManager {
memory_reservations: MemoryReservations,
reservoir_size: Arc<AtomicU64>,
rx: flume::Receiver<ReservoirManagerMsg>,
size_updated_tx: broadcast::Sender<()>,
@@ -146,7 +147,7 @@ pub struct VmmReservoirManager {
impl VmmReservoirManager {
pub fn spawn(
log: &Logger,
hardware_manager: HardwareManager,
memory_reservations: sled_hardware::MemoryReservations,
reservoir_mode: Option<ReservoirMode>,
) -> VmmReservoirManagerHandle {
let log = log.new(o!("component" => "VmmReservoirManager"));
@@ -157,15 +158,15 @@ impl VmmReservoirManager {
let (tx, rx) = flume::bounded(0);
let reservoir_size = Arc::new(AtomicU64::new(0));
let manager = VmmReservoirManager {
memory_reservations,
reservoir_size: reservoir_size.clone(),
size_updated_tx: size_updated_tx.clone(),
_size_updated_rx,
rx,
log,
};
let _manager_handle = Arc::new(thread::spawn(move || {
manager.run(hardware_manager, reservoir_mode)
}));
let _manager_handle =
Arc::new(thread::spawn(move || manager.run(reservoir_mode)));
VmmReservoirManagerHandle {
reservoir_size,
tx,
@@ -174,31 +175,26 @@ impl VmmReservoirManager {
}
}

fn run(
self,
hardware_manager: HardwareManager,
reservoir_mode: Option<ReservoirMode>,
) {
fn run(self, reservoir_mode: Option<ReservoirMode>) {
match reservoir_mode {
None => warn!(self.log, "Not using VMM reservoir"),
Some(ReservoirMode::Size(0))
| Some(ReservoirMode::Percentage(0)) => {
| Some(ReservoirMode::Percentage(0.0)) => {
warn!(
self.log,
"Not using VMM reservoir (size 0 bytes requested)"
)
}
Some(mode) => {
if let Err(e) = self.set_reservoir_size(&hardware_manager, mode)
{
if let Err(e) = self.set_reservoir_size(mode) {
error!(self.log, "Failed to setup VMM reservoir: {e}");
}
}
}

while let Ok(msg) = self.rx.recv() {
let ReservoirManagerMsg::SetReservoirSize { mode, reply_tx } = msg;
match self.set_reservoir_size(&hardware_manager, mode) {
match self.set_reservoir_size(mode) {
Ok(()) => {
let _ = reply_tx.send(Ok(()));
}
@@ -213,33 +209,28 @@ impl VmmReservoirManager {
/// Sets the VMM reservoir to the requested percentage of usable physical
/// RAM or to a size in MiB. Either mode will round down to the nearest
/// aligned size required by the control plane.
fn set_reservoir_size(
&self,
hardware: &sled_hardware::HardwareManager,
mode: ReservoirMode,
) -> Result<(), Error> {
let hardware_physical_ram_bytes = hardware.usable_physical_ram_bytes();
fn set_reservoir_size(&self, mode: ReservoirMode) -> Result<(), Error> {
let vmm_eligible_memory = self.memory_reservations.vmm_eligible();
let req_bytes = match mode {
ReservoirMode::Size(mb) => {
let bytes = ByteCount::from_mebibytes_u32(mb).to_bytes();
if bytes > hardware_physical_ram_bytes {
if bytes > vmm_eligible_memory {
return Err(Error::ReservoirConfig(format!(
"cannot specify a reservoir of {bytes} bytes when \
physical memory is {hardware_physical_ram_bytes} bytes",
"cannot specify a reservoir of {bytes} bytes when the \
maximum reservoir size is {vmm_eligible_memory} bytes",
)));
}
bytes
}
ReservoirMode::Percentage(percent) => {
if !matches!(percent, 1..=99) {
if !matches!(percent, 0.1..100.0) {
return Err(Error::ReservoirConfig(format!(
"VMM reservoir percentage of {} must be between 0 and \
100",
percent
)));
};
(hardware_physical_ram_bytes as f64
* (f64::from(percent) / 100.0))
(vmm_eligible_memory as f64 * (f64::from(percent) / 100.0))
.floor() as u64
}
};
@@ -258,15 +249,16 @@ impl VmmReservoirManager {
}

// The max ByteCount value is i64::MAX, which is ~8 million TiB.
// As this value is either a percentage of DRAM or a size in MiB
// represented as a u32, constructing this should always work.
// As this value is either a percentage of otherwise-unbudgeted DRAM or
// a size in MiB represented as a u32, constructing this should always
// work.
let reservoir_size = ByteCount::try_from(req_bytes_aligned).unwrap();
if let ReservoirMode::Percentage(percent) = mode {
info!(
self.log,
"{}% of {} physical ram = {} bytes)",
"{}% of {} VMM eligible ram = {} bytes)",
percent,
hardware_physical_ram_bytes,
vmm_eligible_memory,
req_bytes,
);
}
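With this change the percentage arm of `set_reservoir_size` scales the VMM-eligible figure rather than total physical RAM, then floors the result. A standalone sketch of just that arithmetic with made-up numbers (the subsequent control-plane alignment step is elided in the hunk above and omitted here as well):

fn main() {
    // Made-up inputs for illustration only.
    let vmm_eligible_memory: u64 = 1_000_000_000_000; // 1 TB VMM-eligible
    let percent: f32 = 80.0;

    // Same expression as the percentage arm above.
    let req_bytes =
        (vmm_eligible_memory as f64 * (f64::from(percent) / 100.0)).floor() as u64;
    assert_eq!(req_bytes, 800_000_000_000);

    // The real code then rounds req_bytes down to the alignment required by
    // the control plane before resizing the reservoir.
}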
7 changes: 7 additions & 0 deletions sled-hardware/src/illumos/mod.rs
@@ -205,6 +205,7 @@ struct HardwareView {
disks: HashMap<DiskIdentity, UnparsedDisk>,
baseboard: Option<Baseboard>,
online_processor_count: u32,
usable_physical_pages: u64,
usable_physical_ram_bytes: u64,
}

@@ -220,6 +221,7 @@ impl HardwareView {
disks: HashMap::new(),
baseboard: None,
online_processor_count: sysconf::online_processor_count()?,
usable_physical_pages: sysconf::usable_physical_pages()?,
usable_physical_ram_bytes: sysconf::usable_physical_ram_bytes()?,
})
}
@@ -230,6 +232,7 @@ impl HardwareView {
disks: HashMap::new(),
baseboard: None,
online_processor_count: sysconf::online_processor_count()?,
usable_physical_pages: sysconf::usable_physical_pages()?,
usable_physical_ram_bytes: sysconf::usable_physical_ram_bytes()?,
})
}
@@ -798,6 +801,10 @@ impl HardwareManager {
self.inner.lock().unwrap().online_processor_count
}

pub fn usable_physical_pages(&self) -> u64 {
self.inner.lock().unwrap().usable_physical_pages
}

pub fn usable_physical_ram_bytes(&self) -> u64 {
self.inner.lock().unwrap().usable_physical_ram_bytes
}
18 changes: 14 additions & 4 deletions sled-hardware/src/illumos/sysconf.rs
@@ -25,14 +25,24 @@ pub fn online_processor_count() -> Result<u32, Error> {
Ok(u32::try_from(res)?)
}

/// Returns the amount of RAM on this sled, in bytes.
pub fn usable_physical_ram_bytes() -> Result<u64, Error> {
let phys_pages: u64 = illumos_utils::libc::sysconf(libc::_SC_PHYS_PAGES)
/// Returns the number of physical RAM pages on this sled.
pub fn usable_physical_pages() -> Result<u64, Error> {
let pages = illumos_utils::libc::sysconf(libc::_SC_PHYS_PAGES)
.map_err(|e| Error::Sysconf { arg: "physical pages", e })?
.try_into()?;
Ok(pages)
}

/// Returns the amount of RAM on this sled, in bytes.
pub fn usable_physical_ram_bytes() -> Result<u64, Error> {
let page_size: u64 = illumos_utils::libc::sysconf(libc::_SC_PAGESIZE)
.map_err(|e| Error::Sysconf { arg: "physical page size", e })?
.try_into()?;

Ok(phys_pages * page_size)
// Note that `_SC_PHYS_PAGES` counts, specifically, the number of
// `_SC_PAGESIZE` pages of physical memory. This means the multiplication
// below yields the total physical RAM bytes, even if in some sense there
// are fewer "actual" physical pages in page tables (such as if there were
// 2MiB pages mixed in on x86).
Ok(usable_physical_pages()? * page_size)
}
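As the new comment explains, the byte count is simply the page count times the base page size. A trivial standalone check with example values (illustrative, not measured from real hardware):

fn main() {
    // Example values: 2^28 base pages of 4 KiB each.
    let usable_physical_pages: u64 = 268_435_456;
    let page_size: u64 = 4_096; // a typical _SC_PAGESIZE on x86
    let usable_physical_ram_bytes = usable_physical_pages * page_size;
    assert_eq!(usable_physical_ram_bytes, 1u64 << 40); // exactly 1 TiB
}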
74 changes: 74 additions & 0 deletions sled-hardware/src/lib.rs
@@ -4,6 +4,7 @@

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use slog::{Logger, info};

cfg_if::cfg_if! {
if #[cfg(target_os = "illumos")] {
@@ -75,3 +76,76 @@ pub enum SledMode {
/// Force sled to run as a Scrimlet
Scrimlet { asic: DendriteAsic },
}

/// Accounting for high watermark memory usage for various system purposes
#[derive(Clone)]
pub struct MemoryReservations {
log: Logger,
hardware_manager: HardwareManager,
/// The amount of memory expected to be used if "control plane" services were
/// all running on this sled. "control plane" here refers to services that have
/// roughly fixed memory use given differing sled hardware configurations.
/// DNS (internal, external), Nexus, Cockroach, or ClickHouse are all
/// examples of "control plane" here.
///
/// This is a pessimistic overestimate; it is unlikely
/// (and one might say undesirable) that all such services are colocated on
/// a sled, and (as described in RFD 413) the budgeting for each service's
/// RAM must include headroom for those services potentially forking and
/// bursting required swap or resident pages.
//
// XXX: This is really something we should be told by Nexus, perhaps after
// starting with this conservative estimate to get the sled started.
control_plane_earmark_bytes: u64,
// XXX: Crucible involves some amount of memory in support of the volumes it
// manages. We should collect zpool size and estimate the memory that would
// be used if all available storage was dedicated to Crucible volumes. For
// now this is part of the control plane earmark.
}

impl MemoryReservations {
pub fn new(
log: Logger,
hardware_manager: HardwareManager,
control_plane_earmark_mib: Option<u32>,
) -> MemoryReservations {
const MIB: u64 = 1024 * 1024;
let control_plane_earmark_bytes =
u64::from(control_plane_earmark_mib.unwrap_or(0)) * MIB;

Self { log, hardware_manager, control_plane_earmark_bytes }
}

/// Compute the amount of physical memory that could be set aside for the
/// VMM reservoir.
///
/// The actual VMM reservoir will be smaller than this amount, and is either
/// a fixed amount of memory specified by `ReservoirMode::Size` or
/// a percentage of this amount specified by `ReservoirMode::Percentage`.
pub fn vmm_eligible(&self) -> u64 {
let hardware_physical_ram_bytes =
self.hardware_manager.usable_physical_ram_bytes();
// Don't like hardcoding a struct size from the host OS here like
// this, maybe we shuffle some bits around before merging.. On the
// other hand, the last time page_t changed was illumos-gate commit
// a5652762e5 from 2006.
const PAGE_T_SIZE: u64 = 120;
let max_page_t_bytes =
self.hardware_manager.usable_physical_pages() * PAGE_T_SIZE;

let vmm_eligible = hardware_physical_ram_bytes
- max_page_t_bytes
- self.control_plane_earmark_bytes;

info!(
self.log,
"Calculated {vmm_eligible} bytes for VMM \
reservoir: {hardware_physical_ram_bytes} physical RAM bytes \
- {max_page_t_bytes} bytes for maximum page_t structures \
- {} bytes held back for services on the sled",
self.control_plane_earmark_bytes
);

vmm_eligible
}
}
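`vmm_eligible()` thus budgets total RAM minus a worst-case page_t allowance minus the control plane earmark. A worked standalone example with assumed figures (a hypothetical 1 TiB sled with 4 KiB base pages and a 32 GiB earmark; none of these numbers come from the diff):

fn main() {
    const GIB: u64 = 1024 * 1024 * 1024;
    const PAGE_T_SIZE: u64 = 120; // per the comment in vmm_eligible()

    // Assumed hardware and config values, for illustration only.
    let usable_physical_ram_bytes: u64 = 1024 * GIB; // 1 TiB sled
    let usable_physical_pages: u64 = usable_physical_ram_bytes / 4096;
    let control_plane_earmark_bytes: u64 = 32 * GIB;

    // 268_435_456 pages * 120 bytes = 30 GiB of page_t structures here.
    let max_page_t_bytes = usable_physical_pages * PAGE_T_SIZE;

    let vmm_eligible = usable_physical_ram_bytes
        - max_page_t_bytes
        - control_plane_earmark_bytes;
    assert_eq!(vmm_eligible, 962 * GIB);

    // An 80% reservoir on top of this would then claim roughly 770 GiB.
}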
4 changes: 4 additions & 0 deletions sled-hardware/src/non_illumos/mod.rs
@@ -45,6 +45,10 @@ impl HardwareManager {
unimplemented!("Accessing hardware unsupported on non-illumos");
}

pub fn usable_physical_pages(&self) -> u64 {
unimplemented!("Accessing hardware unsupported on non-illumos");
}

pub fn usable_physical_ram_bytes(&self) -> u64 {
unimplemented!("Accessing hardware unsupported on non-illumos");
}