src/backends/mod.rs
audience: ai
The backend abstraction and fleet router. A small
trait (name, capabilities, can_satisfy,
provision, watch_until_exit, terminate) every
concrete backend implements. The Fleet holds every
configured backend and routes each grant to the first
one whose can_satisfy(grant) returns true.
Order in the fleet matters: operators who want a
specific preference order arrange it in the boot config
(backends are registered in a fixed order — AWS, GCP,
Azure, bare-metal — and the router walks them in that
order). A grant whose manifest.tdx = Required skips
backends whose tdx_capable = false during the
matching walk.
ProvisionedInstance is the uniform handle every
backend returns. It carries a backend name field that
the fleet uses to route post-provisioning calls
(watch, terminate) back to the originating backend.
Concrete implementations:
backends/aws.rs — EC2.
backends/gcp.rs — Compute Engine, with optional TDX via Confidential VMs.
backends/azure.rs — Azure VMs, with optional TDX via DCdsv3/ECdsv3 SKUs.
backends/baremetal.rs — operator-owned hosts reached via SSH root; workloads run as systemd-managed containers or, on bare-TDX hosts, as nested TDX guests.
//! Backend abstraction + fleet router.
//!
//! The bridge is provider-agnostic: it defines a
//! [`Backend`] trait and routes each incoming
//! [`ComputeGrant`] to the first configured backend that
//! reports `can_satisfy == true`. Backends ship in this
//! crate as separate modules:
//!
//! - [`aws`] — AWS EC2.
//! - [`gcp`] — Google Compute Engine.
//! - [`azure`] — Azure Virtual Machines.
//! - [`baremetal`] — operator-owned hosts reached via
//! SSH root; workloads run as systemd-managed
//! containers on the host. Bare-TDX hosts additionally
//! carry a measured MR_TD in their ProviderCard so
//! TDX-required grants can route to them.
pub mod aws;
pub mod azure;
pub mod baremetal;
pub mod gcp;
use std::sync::Arc;
use async_trait::async_trait;
use coalition_compute::{
AlmanacTick, ComputeGrant, ProviderId, UsageMetrics,
};
use crate::config::BackendsBootConfig;
use crate::zipnet_io::Envelope;
/// Region label chosen by the operator. Cloud backends use
/// the provider's native region name (`"us-east-1"`,
/// `"europe-west1"`, `"westus2"`); bare-metal backends use a
/// free-form operator label such as `"local-eu-fra-1"`.
pub type RegionLabel = String;
/// Advertised capability set for a single backend.
///
/// The fleet router matches grants against these flags, and
/// they are also aggregated for provider-card publication.
#[derive(Clone, Debug, Default)]
pub struct Capabilities {
    /// Region labels this backend can provision into.
    pub regions: Vec<RegionLabel>,
    /// Whether this backend can provision TDX-capable hosts.
    pub tdx_capable: bool,
    /// Largest CPU allocation this backend will accept.
    pub max_cpu_millicores: u32,
    /// Largest RAM allocation (MiB) this backend will accept.
    pub max_ram_mib: u32,
}
/// Handle for a workload a backend has brought up.
///
/// Callers treat this as opaque apart from `backend`, which
/// the fleet consults to route follow-up operations (watch,
/// terminate, usage reporting) back to the backend that
/// created the instance.
#[derive(Clone)]
pub struct ProvisionedInstance {
    /// `Backend::name()` of the originating backend; used by
    /// `Fleet::backend_for` to route post-provisioning calls.
    pub backend: &'static str,
    /// Backend-native identifier for the instance.
    pub instance_id: String,
    /// Region the instance was provisioned into.
    pub region: RegionLabel,
    /// Host the workload accepts SSH connections at.
    pub public_host: String,
    /// SSH port on `public_host`.
    pub ssh_port: u16,
    /// SSH login user.
    pub user: String,
    /// Private half of the SSH keypair for this instance
    /// (key format is backend-specific — confirm per backend).
    pub key_private: Vec<u8>,
    /// Public half of the SSH keypair for this instance.
    pub key_public: Vec<u8>,
    /// Expected SSH host key, so callers can authenticate
    /// the server side of the connection.
    pub host_key: Vec<u8>,
    /// Almanac tick recorded by the backend at start-up.
    pub started_at: AlmanacTick,
    /// Provider identity the instance runs under.
    pub provider_id: ProviderId,
}
impl ProvisionedInstance {
pub fn provider_id(&self) -> ProviderId {
self.provider_id.clone()
}
}
/// A provisioning backend.
///
/// Each backend is stateless at the crate level; state
/// (client handles, per-region credentials, connection
/// pools) is held inside the implementor.
///
/// `Send + Sync` because the fleet shares implementors
/// behind `Arc` and calls them from async tasks.
#[async_trait]
pub trait Backend: Send + Sync {
    /// Short stable identifier, used in logs and in
    /// `ProvisionedInstance::backend`. Must be unique within
    /// a fleet: `Fleet::backend_for` routes post-provisioning
    /// calls by comparing this name.
    fn name(&self) -> &'static str;
    /// Aggregated capabilities this backend will publish
    /// in the provider card.
    async fn capabilities(&self) -> anyhow::Result<Capabilities>;
    /// Quick filter — will this backend attempt to
    /// satisfy this grant? Used by the fleet router to
    /// pick. Implementations should check region, CPU,
    /// RAM, and TDX requirement.
    ///
    /// Deliberately synchronous: the router calls it for
    /// every registered backend in order, so it should be
    /// cheap and side-effect free.
    fn can_satisfy(&self, grant: &ComputeGrant<'_>) -> bool;
    /// Provision a workload for a grant. Returns a
    /// running-instance handle once the workload is
    /// accepting SSH connections at the returned host.
    async fn provision(
        &self,
        grant: &ComputeGrant<'_>,
        envelope: &Envelope,
    ) -> anyhow::Result<ProvisionedInstance>;
    /// Block until the instance exits or `valid_to`
    /// passes. Return usage metrics over the run.
    async fn watch_until_exit(
        &self,
        instance: &ProvisionedInstance,
        valid_to: AlmanacTick,
    ) -> anyhow::Result<UsageMetrics>;
    /// Terminate a running instance. Idempotent; safe to
    /// call if the instance has already exited.
    async fn terminate(&self, instance: &ProvisionedInstance) -> anyhow::Result<()>;
}
/// The router: owns every configured backend and picks one
/// per grant.
///
/// Matching is first-fit — the earliest backend whose
/// `can_satisfy(grant)` returns `true` wins. Backends are
/// registered by [`Fleet::from_boot_config`] in a fixed
/// sequence (aws, gcp, azure, baremetal), so the boot config
/// controls which backends participate, not their relative
/// preference.
#[derive(Clone)]
pub struct Fleet {
    /// Registered backends, in matching order. Shared via
    /// `Arc` so cloning a `Fleet` is a cheap handle copy.
    backends: Arc<Vec<Arc<dyn Backend>>>,
}
impl Fleet {
    /// Build the fleet from the boot config, instantiating one
    /// backend per enabled section. Registration (and therefore
    /// matching) order is fixed: aws, gcp, azure, baremetal.
    ///
    /// Fails if any enabled backend fails to initialise, or if
    /// no backend section is enabled at all.
    pub async fn from_boot_config(cfg: &BackendsBootConfig) -> anyhow::Result<Self> {
        let mut roster: Vec<Arc<dyn Backend>> = Vec::new();
        if let Some(c) = &cfg.aws {
            roster.push(Arc::new(aws::AwsBackend::new(c).await?));
        }
        if let Some(c) = &cfg.gcp {
            roster.push(Arc::new(gcp::GcpBackend::new(c).await?));
        }
        if let Some(c) = &cfg.azure {
            roster.push(Arc::new(azure::AzureBackend::new(c).await?));
        }
        if let Some(c) = &cfg.baremetal {
            roster.push(Arc::new(baremetal::BareMetalBackend::new(c).await?));
        }
        if roster.is_empty() {
            anyhow::bail!(
                "no backends configured — enable at least one of \
                 aws / gcp / azure / baremetal in the boot config"
            );
        }
        tracing::info!(
            backends = ?roster.iter().map(|b| b.name()).collect::<Vec<_>>(),
            "fleet initialised",
        );
        Ok(Self { backends: Arc::new(roster) })
    }
    /// Collect `(name, capabilities)` for every backend, for
    /// provider-card publication. Propagates the first
    /// backend error encountered.
    pub async fn capabilities(&self) -> anyhow::Result<Vec<(String, Capabilities)>> {
        let mut published = Vec::with_capacity(self.backends.len());
        for backend in self.backends.iter() {
            let caps = backend.capabilities().await?;
            published.push((backend.name().to_string(), caps));
        }
        Ok(published)
    }
    /// Route a grant to the first backend that claims it and
    /// provision there.
    ///
    /// Fails if no backend can satisfy the grant, or if the
    /// chosen backend's provisioning fails — there is no
    /// fallback to later backends once one is picked.
    pub async fn provision_for_grant(
        &self,
        grant: &ComputeGrant<'_>,
        envelope: &Envelope,
    ) -> anyhow::Result<ProvisionedInstance> {
        match self.backends.iter().find(|b| b.can_satisfy(grant)) {
            Some(backend) => {
                tracing::info!(
                    backend = backend.name(),
                    request_id = ?grant.request_id,
                    "routing grant to backend",
                );
                backend.provision(grant, envelope).await
            }
            None => anyhow::bail!(
                "no backend can satisfy grant {:?}",
                grant.request_id,
            ),
        }
    }
    /// Delegate watching to the backend that provisioned
    /// `instance`; resolves with usage metrics once the
    /// workload exits or `valid_to` passes.
    pub async fn watch_until_exit(
        &self,
        instance: &ProvisionedInstance,
        valid_to: AlmanacTick,
    ) -> anyhow::Result<UsageMetrics> {
        let backend = self.backend_for(instance)?;
        backend.watch_until_exit(instance, valid_to).await
    }
    /// Delegate termination to the originating backend.
    pub async fn terminate(
        &self,
        instance: &ProvisionedInstance,
    ) -> anyhow::Result<()> {
        let backend = self.backend_for(instance)?;
        backend.terminate(instance).await
    }
    /// Resolve `instance.backend` back to a registered
    /// backend handle.
    fn backend_for(
        &self,
        instance: &ProvisionedInstance,
    ) -> anyhow::Result<Arc<dyn Backend>> {
        for candidate in self.backends.iter() {
            if candidate.name() == instance.backend {
                return Ok(Arc::clone(candidate));
            }
        }
        Err(anyhow::anyhow!(
            "instance was provisioned by backend {:?} \
             which is not registered in this fleet",
            instance.backend,
        ))
    }
}
Up: compute-bridge.