Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

src/backends/mod.rs

audience: ai

The backend abstraction and fleet router. A small trait (name, capabilities, can_satisfy, provision, watch_until_exit, terminate) every concrete backend implements. The Fleet holds every configured backend and routes each grant to the first one whose can_satisfy(grant) returns true.

Order in the fleet matters: the router walks the backends in registration order and the first match wins. Registration order is fixed — AWS, GCP, Azure, bare-metal — so operators shape the preference through the boot config by choosing which of those backends to enable. A grant whose manifest.tdx = Required skips backends whose tdx_capable = false during the matching walk.

ProvisionedInstance is the uniform handle every backend returns. It carries a backend name field that the fleet uses to route post-provisioning calls (watch, terminate) back to the originating backend.

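Concrete implementations: aws (AWS EC2), gcp (Google Compute Engine), azure (Azure Virtual Machines), and baremetal (operator-owned hosts reached via SSH).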

//! Backend abstraction + fleet router.
//!
//! The bridge is provider-agnostic: it defines a
//! [`Backend`] trait and routes each incoming
//! [`ComputeGrant`] to the first configured backend that
//! reports `can_satisfy == true`. Backends ship in this
//! crate as separate modules:
//!
//! - [`aws`] — AWS EC2.
//! - [`gcp`] — Google Compute Engine.
//! - [`azure`] — Azure Virtual Machines.
//! - [`baremetal`] — operator-owned hosts reached via
//!   SSH root; workloads run as systemd-managed
//!   containers on the host. Bare-TDX hosts additionally
//!   carry a measured MR_TD in their ProviderCard so
//!   TDX-required grants can route to them.

pub mod aws;
pub mod azure;
pub mod baremetal;
pub mod gcp;

use std::sync::Arc;

use async_trait::async_trait;
use coalition_compute::{
    AlmanacTick, ComputeGrant, ProviderId, UsageMetrics,
};

use crate::config::BackendsBootConfig;
use crate::zipnet_io::Envelope;

/// Operator-chosen region label.
///
/// For clouds this is the native cloud region
/// (`"us-east-1"`, `"europe-west1"`, `"westus2"`); for
/// bare-metal it is an operator-supplied label (e.g.
/// `"local-eu-fra-1"`).
///
/// This is a plain alias for [`String`], so the type
/// system does not distinguish a region label from any
/// other string and performs no validation.
pub type RegionLabel = String;

/// Capability flags a backend publishes so the fleet
/// router can filter grants.
///
/// Returned by [`Backend::capabilities`] and collected
/// per backend by [`Fleet::capabilities`]. The derived
/// `Default` is an empty region list, `tdx_capable =
/// false`, and zero CPU / RAM ceilings.
#[derive(Clone, Debug, Default)]
pub struct Capabilities {
    /// Region labels the backend reports.
    pub regions:     Vec<RegionLabel>,
    /// Whether the backend reports TDX support.
    /// NOTE(review): presumably what each backend's
    /// `can_satisfy` consults for TDX-required grants —
    /// confirm in the concrete implementations.
    pub tdx_capable: bool,
    /// CPU ceiling; millicores going by the field name.
    /// NOTE(review): unclear from here whether this is
    /// per instance or aggregate — confirm.
    pub max_cpu_millicores: u32,
    /// RAM ceiling; MiB going by the field name.
    /// NOTE(review): unclear from here whether this is
    /// per instance or aggregate — confirm.
    pub max_ram_mib: u32,
}

/// Running-instance handle. Opaque beyond the `backend`
/// field, which the fleet uses to route post-
/// provisioning operations (watch, terminate, usage
/// reporting) back to the originating backend.
///
/// NOTE(review): the struct derives `Clone` but not
/// `Debug`. Since it carries `key_private`, that looks
/// deliberate (keeps key bytes out of `{:?}` logs) —
/// confirm before adding a `Debug` derive.
#[derive(Clone)]
pub struct ProvisionedInstance {
    /// Name of the backend that produced this handle.
    /// `Fleet` compares it against [`Backend::name`] to
    /// find the backend for follow-up calls.
    pub backend:       &'static str,
    /// Instance identifier; presumably backend-specific
    /// (TODO confirm format per backend).
    pub instance_id:   String,
    /// Region label the instance is associated with.
    pub region:        RegionLabel,
    /// Host of the instance; [`Backend::provision`]
    /// documents that the workload accepts SSH
    /// connections at the returned host.
    pub public_host:   String,
    /// SSH port on `public_host`.
    pub ssh_port:      u16,
    /// User name; presumably the SSH login user (TODO
    /// confirm).
    pub user:          String,
    /// Private key bytes. NOTE(review): encoding and
    /// key type are not visible here — confirm.
    pub key_private:   Vec<u8>,
    /// Public key bytes matching `key_private`
    /// (presumably; encoding not visible here).
    pub key_public:    Vec<u8>,
    /// Host key bytes; presumably the instance's SSH
    /// host key for verification (TODO confirm).
    pub host_key:      Vec<u8>,
    /// Tick recorded as the instance start.
    pub started_at:    AlmanacTick,
    /// Provider identifier attached to this instance.
    pub provider_id:   ProviderId,
}

impl ProvisionedInstance {
    /// Returns an owned copy of this instance's
    /// `provider_id` field.
    pub fn provider_id(&self) -> ProviderId {
        let Self { provider_id, .. } = self;
        provider_id.clone()
    }
}

/// A provisioning backend.
///
/// Each backend is stateless at the crate level; state
/// (client handles, per-region credentials, connection
/// pools) is held inside the implementor.
///
/// Implementors must be `Send + Sync`: the fleet stores
/// them as shared `Arc<dyn Backend>` trait objects.
#[async_trait]
pub trait Backend: Send + Sync {
    /// Short stable identifier, used in logs and in
    /// `ProvisionedInstance::backend`.
    ///
    /// The fleet looks backends up by this name and takes
    /// the first match, so it should be unique among the
    /// registered backends.
    fn name(&self) -> &'static str;

    /// Aggregated capabilities this backend will publish
    /// in the provider card.
    ///
    /// # Errors
    ///
    /// Implementation-defined; `Fleet::capabilities`
    /// propagates the first error it sees.
    async fn capabilities(&self) -> anyhow::Result<Capabilities>;

    /// Quick filter — will this backend attempt to
    /// satisfy this grant? Used by the fleet router to
    /// pick. Implementations should check region, CPU,
    /// RAM, and TDX requirement.
    ///
    /// Synchronous and infallible: the router calls it
    /// inline for each backend while walking the fleet
    /// and stops at the first `true`.
    fn can_satisfy(&self, grant: &ComputeGrant<'_>) -> bool;

    /// Provision a workload for a grant. Returns a
    /// running-instance handle once the workload is
    /// accepting SSH connections at the returned host.
    ///
    /// The returned handle's `backend` field must equal
    /// [`Backend::name`], otherwise the fleet cannot
    /// route later calls back to this backend.
    ///
    /// # Errors
    ///
    /// Implementation-defined. The fleet returns the
    /// error to its caller as-is and does not try the
    /// next backend.
    async fn provision(
        &self,
        grant: &ComputeGrant<'_>,
        envelope: &Envelope,
    ) -> anyhow::Result<ProvisionedInstance>;

    /// Block until the instance exits or `valid_to`
    /// passes. Return usage metrics over the run.
    ///
    /// # Errors
    ///
    /// Implementation-defined.
    async fn watch_until_exit(
        &self,
        instance: &ProvisionedInstance,
        valid_to: AlmanacTick,
    ) -> anyhow::Result<UsageMetrics>;

    /// Terminate a running instance. Idempotent; safe to
    /// call if the instance has already exited.
    ///
    /// # Errors
    ///
    /// Implementation-defined.
    async fn terminate(&self, instance: &ProvisionedInstance) -> anyhow::Result<()>;
}

/// Holds every configured backend and routes grants.
///
/// Order matters: the first backend whose
/// `can_satisfy(grant)` returns `true` wins. Operators
/// who want a specific preference order arrange it in
/// the boot config.
///
/// NOTE(review): `Fleet::from_boot_config` registers
/// backends in a fixed AWS, GCP, Azure, bare-metal
/// order, so the boot config appears to control which
/// backends are present rather than their relative
/// order — confirm whether reordering is meant to be
/// configurable.
///
/// The backend list sits behind an `Arc`, so clones of
/// a `Fleet` share the same backends.
#[derive(Clone)]
pub struct Fleet {
    /// Registered backends, in routing-priority order.
    backends: Arc<Vec<Arc<dyn Backend>>>,
}

impl Fleet {
    pub async fn from_boot_config(cfg: &BackendsBootConfig) -> anyhow::Result<Self> {
        let mut backends: Vec<Arc<dyn Backend>> = Vec::new();

        if let Some(aws) = &cfg.aws {
            backends.push(Arc::new(aws::AwsBackend::new(aws).await?));
        }
        if let Some(gcp) = &cfg.gcp {
            backends.push(Arc::new(gcp::GcpBackend::new(gcp).await?));
        }
        if let Some(azure) = &cfg.azure {
            backends.push(Arc::new(azure::AzureBackend::new(azure).await?));
        }
        if let Some(baremetal) = &cfg.baremetal {
            backends.push(Arc::new(baremetal::BareMetalBackend::new(baremetal).await?));
        }

        if backends.is_empty() {
            anyhow::bail!(
                "no backends configured — enable at least one of \
                 aws / gcp / azure / baremetal in the boot config"
            );
        }

        tracing::info!(
            backends = ?backends.iter().map(|b| b.name()).collect::<Vec<_>>(),
            "fleet initialised",
        );

        Ok(Self { backends: Arc::new(backends) })
    }

    /// Aggregate capabilities across every backend for
    /// provider-card publication.
    pub async fn capabilities(&self) -> anyhow::Result<Vec<(String, Capabilities)>> {
        let mut out = Vec::with_capacity(self.backends.len());
        for b in self.backends.iter() {
            let caps = b.capabilities().await?;
            out.push((b.name().to_string(), caps));
        }
        Ok(out)
    }

    pub async fn provision_for_grant(
        &self,
        grant: &ComputeGrant<'_>,
        envelope: &Envelope,
    ) -> anyhow::Result<ProvisionedInstance> {
        for b in self.backends.iter() {
            if b.can_satisfy(grant) {
                tracing::info!(
                    backend = b.name(),
                    request_id = ?grant.request_id,
                    "routing grant to backend",
                );
                return b.provision(grant, envelope).await;
            }
        }
        anyhow::bail!(
            "no backend can satisfy grant {:?}",
            grant.request_id,
        )
    }

    pub async fn watch_until_exit(
        &self,
        instance: &ProvisionedInstance,
        valid_to: AlmanacTick,
    ) -> anyhow::Result<UsageMetrics> {
        self.backend_for(instance)?
            .watch_until_exit(instance, valid_to)
            .await
    }

    pub async fn terminate(
        &self,
        instance: &ProvisionedInstance,
    ) -> anyhow::Result<()> {
        self.backend_for(instance)?.terminate(instance).await
    }

    fn backend_for(
        &self,
        instance: &ProvisionedInstance,
    ) -> anyhow::Result<Arc<dyn Backend>> {
        self.backends
            .iter()
            .find(|b| b.name() == instance.backend)
            .cloned()
            .ok_or_else(|| anyhow::anyhow!(
                "instance was provisioned by backend {:?} \
                 which is not registered in this fleet",
                instance.backend,
            ))
    }
}

Up: compute-bridge.