aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHazel Atkinson <yellowsink@riseup.net>2025-04-06 21:01:33 +0100
committerHazel Atkinson <yellowsink@riseup.net>2025-04-06 21:01:33 +0100
commita61ac43c3ae5baaede67449dde84ca86cbbc3873 (patch)
tree3ae14e652f45f82502711c9ead611a0b4cb877f0
parent6ccc65ab237a9926e6e0f43e3bc1ef357ff16240 (diff)
downloadcontainerspy-a61ac43c3ae5baaede67449dde84ca86cbbc3873.tar.gz
containerspy-a61ac43c3ae5baaede67449dde84ca86cbbc3873.tar.bz2
containerspy-a61ac43c3ae5baaede67449dde84ca86cbbc3873.zip
impl more stats
-rw-r--r--README.md46
-rw-r--r--src/main.rs84
-rw-r--r--src/stats_task.rs214
3 files changed, 261 insertions, 83 deletions
diff --git a/README.md b/README.md
index 5b38772..a3926d2 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ TODO: will write once it actually works
## How to configure
| `config.json` | env var | description | default |
-| --------------- | -------------------- | ----------------------------------------------------------------- | ---------- |
+|-----------------|----------------------|-------------------------------------------------------------------|------------|
| `docker_socket` | `CSPY_DOCKER_SOCKET` | The docker socket / named pipe to connect to | unset |
| `otlp_protocol` | `CSPY_OTLP_PROTO` | Whether to use httpbinary, httpjson, or grpc to send OTLP metrics | httpbinary |
@@ -58,5 +58,47 @@ If a docker socket path is not set, containerspy will try to connect to
This is intended to be a dropin replacement for cAdvisor, which lists its supported metrics
[here](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md).
+All generic labels attached to all metrics are implemented, and the status of labels applied only to specific metrics
+is listed below ("N/A" if there are none).
+
The list of ContainerSpy's currently supported items from this list is:
- - `container_cpu_usage_seconds_total`
+
+| Name | Has metric-specific labels | Notes |
+|---------------------------------------------|----------------------------|-------------------------|
+| `container_cpu_usage_seconds_total` | | |
+| `container_cpu_user_seconds_total` | | |
+| `container_cpu_system_seconds_total` | | |
+| `container_cpu_cfs_periods_total` | | |
+| `container_cpu_cfs_throttled_periods_total` | | |
+| `container_cpu_cfs_throttled_seconds_total` | | |
+| `container_fs_reads_bytes_total` | | Not reported on Windows |
+| `container_fs_writes_bytes_total` | | Not reported on Windows |
+| `container_last_seen` | | |
+
+The list of known omitted metrics are:
+
+| Name | Reason |
+|--------------------------------------------------|-----------------------------------|
+| `container_cpu_load_average_10s` | Not reported by Docker Engine API |
+| `container_cpu_schedstat_run_periods_total` | Not reported by Docker Engine API |
+| `container_cpu_schedstat_runqueue_seconds_total` | Not reported by Docker Engine API |
+| `container_cpu_schedstat_run_seconds_total` | Not reported by Docker Engine API |
+| `container_file_descriptors` | Not reported by Docker Engine API |
+| `container_fs_inodes_free` | Not reported by Docker Engine API |
+| `container_fs_inodes_total` | Not reported by Docker Engine API |
+| `container_fs_io_current` | Not reported by Docker Engine API |
+| `container_fs_io_time_seconds_total` | Only reported on cgroups v1 hosts |
+| `container_fs_io_time_weighted_seconds_total` | Not reported by Docker Engine API |
+| `container_fs_limit_bytes` | Not reported by Docker Engine API |
+| `container_fs_read_seconds_total` | Only reported on cgroups v1 hosts |
+| `container_fs_reads_merged_total` | Only reported on cgroups v1 hosts |
+| `container_fs_reads_total` | Not reported by Docker Engine API |
+| `container_fs_sector_reads_total` | Only reported on cgroups v1 hosts |
+| `container_fs_write_seconds_total` | Only reported on cgroups v1 hosts |
+| `container_fs_writes_merged_total` | Only reported on cgroups v1 hosts |
+| `container_fs_writes_total` | Not reported by Docker Engine API |
+| `container_fs_sector_writes_total` | Only reported on cgroups v1 hosts |
+| `container_fs_usage_bytes` | Requires SystemDataUsage API |
+| `container_hugetlb_failcnt` | Not reported by Docker Engine API |
+| `container_hugetlb_max_usage_bytes` | Not reported by Docker Engine API |
+| `container_hugetlb_usage_bytes` | Not reported by Docker Engine API |
diff --git a/src/main.rs b/src/main.rs
index 6a6d61f..2bbfe3f 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,18 +1,15 @@
use std::{collections::BTreeMap, sync::Arc, time::Duration};
use anyhow::Result;
-use bollard::{container::StatsOptions, Docker};
-use bollard::models::ContainerSummary;
-use opentelemetry::KeyValue;
+use bollard::Docker;
use config::CONFIG;
-use opentelemetry::metrics::MeterProvider;
use opentelemetry_otlp::{MetricExporter, Protocol, WithExportConfig};
use opentelemetry_sdk::metrics::SdkMeterProvider;
use tokio::task::JoinHandle;
-use tokio_stream::StreamExt;
use tokio_util::sync::CancellationToken;
mod config;
+mod stats_task;
fn setup_otlp() -> Result<SdkMeterProvider> {
let metric_exporter =
@@ -105,7 +102,7 @@ async fn main() -> Result<()> {
let id_string = cont.id.as_ref().unwrap();
if !tasks.contains_key(id_string) {
// all this string cloning hurts me
- tasks.insert(id_string.clone(), launch_stats_task(cont, docker.clone(), meter_provider.clone()));
+ tasks.insert(id_string.clone(), stats_task::launch_stats_task(cont, docker.clone(), meter_provider.clone()));
}
}
}
@@ -118,79 +115,4 @@ async fn main() -> Result<()> {
println!("clean shutdown.");
Ok(())
-}
-
-// I do not enjoy taking a bunch of Rcs but tokio needs ownership so fine.
-fn launch_stats_task(container: ContainerSummary, docker: Arc<Docker>, meter_provider: Arc<SdkMeterProvider>) -> JoinHandle<()> {
- tokio::spawn(async move {
- // extract some container info
- let container_id = container.id.unwrap();
- let container_name = container.names.iter().flatten().next().map(|n| n.trim_start_matches("/").to_owned());
-
- let mut stats_stream =
- docker.stats(container_id.as_str(), Some(StatsOptions {
- stream: true,
- one_shot: false
- }));
-
- // drop the first read
- loop {
- match stats_stream.next().await {
- None => return,
- Some(Ok(_)) => break,
- Some(Err(err)) => {
- // TODO: use json logging or syslog so loki can understand this lol
- println!("Failed to get stats for container {container_id}!: {err:?}");
- }
- }
- }
-
- // container labels shared for all metrics
- let mut shared_labels = vec![
- KeyValue::new("id", container_id.to_owned()),
- KeyValue::new("image", container.image.unwrap_or(container.image_id.unwrap()))
- ];
-
- if let Some(name) = container_name {
- shared_labels.push(KeyValue::new("name", name));
- }
-
- if let Some(docker_labels) = &container.labels {
- for (key, value) in docker_labels {
- shared_labels.push(KeyValue::new("container_label_".to_string() + key, value.clone()))
- }
- }
-
- // free space and make mutable
- shared_labels.shrink_to_fit();
- let shared_labels = &shared_labels[..];
-
- //println!("Starting reporting for container: {shared_labels:?}");
-
- // create meters
- let meter_container_cpu_usage_seconds_total = meter_provider.meter("test_meter").f64_counter("container_cpu_usage_seconds_total").with_unit("s").with_description("Cumulative cpu time consumed").build();
-
- while let Some(val) = stats_stream.next().await {
- if let Ok(stats) = val {
- meter_container_cpu_usage_seconds_total.add(
- cpu_delta_from_docker(stats.cpu_stats.cpu_usage.total_usage, stats.precpu_stats.cpu_usage.total_usage).as_secs_f64(),
- shared_labels);
- }
- else {
- // failed to get stats, log as such:
- // TODO: use json logging or syslog so loki can understand this lol
- println!("Failed to get stats for container {container_id}!: {:?}", val.unwrap_err());
- }
- }
- })
-}
-
-fn cpu_delta_from_docker(cpu_usage: u64, precpu_usage: u64) -> Duration {
- let delta = cpu_usage - precpu_usage;
-
- // https://docs.docker.com/reference/api/engine/version/v1.48/#tag/Container/operation/ContainerStats
- // see response schema > cpu_stats > cpu_usage > total_usage
- let delta_ns = if cfg!(windows) { delta * 100 } else { delta };
-
- Duration::from_nanos(delta_ns)
} \ No newline at end of file
diff --git a/src/stats_task.rs b/src/stats_task.rs
new file mode 100644
index 0000000..464793a
--- /dev/null
+++ b/src/stats_task.rs
@@ -0,0 +1,214 @@
+use bollard::container::StatsOptions;
+use bollard::models::ContainerSummary;
+use bollard::Docker;
+use opentelemetry::metrics::MeterProvider;
+use opentelemetry::KeyValue;
+use opentelemetry_sdk::metrics::SdkMeterProvider;
+use std::sync::Arc;
+use std::time::{Duration, SystemTime, UNIX_EPOCH};
+use tokio::task::JoinHandle;
+use tokio_stream::StreamExt;
+
+// I do not enjoy taking a bunch of Rcs but tokio needs ownership so fine.
+pub fn launch_stats_task(
+ container: ContainerSummary,
+ docker: Arc<Docker>,
+ meter_provider: Arc<SdkMeterProvider>,
+) -> JoinHandle<()> {
+ tokio::spawn(async move {
+ // extract some container info
+ let container_id = container.id.unwrap();
+ let container_name = container
+ .names
+ .iter()
+ .flatten()
+ .next()
+ .map(|n| n.trim_start_matches("/").to_owned());
+
+ let meter_name = "cspy_".to_string() + container_id.as_str();
+ // lol 'static moment
+ let meter_name = &*Box::leak(meter_name.into_boxed_str());
+
+ let mut stats_stream = docker.stats(
+ container_id.as_str(),
+ Some(StatsOptions {
+ stream: true,
+ one_shot: false,
+ }),
+ );
+
+ // drop the first read
+ loop {
+ match stats_stream.next().await {
+ None => return,
+ Some(Ok(_)) => break,
+ Some(Err(err)) => {
+ // TODO: use json logging or syslog so loki can understand this lol
+ println!("Failed to get stats for container {container_id}!: {err:?}");
+ }
+ }
+ }
+
+ // container labels shared for all metrics
+ let mut shared_labels = vec![
+ KeyValue::new("id", container_id.to_owned()),
+ KeyValue::new(
+ "image",
+ container.image.unwrap_or(container.image_id.unwrap()),
+ ),
+ ];
+
+ if let Some(name) = container_name {
+ shared_labels.push(KeyValue::new("name", name));
+ }
+
+ if let Some(docker_labels) = &container.labels {
+ for (key, value) in docker_labels {
+ shared_labels.push(KeyValue::new(
+ "container_label_".to_string() + key,
+ value.clone(),
+ ))
+ }
+ }
+
+ // free space and make mutable
+ shared_labels.shrink_to_fit();
+ let shared_labels = &shared_labels[..];
+
+ //println!("Starting reporting for container: {shared_labels:?}");
+
+ // create meters
+ let meter = meter_provider.meter(meter_name);
+
+ let meter_container_cpu_usage_seconds_total = meter
+ .f64_counter("container_cpu_usage_seconds_total")
+ .with_unit("s")
+ .with_description("Cumulative cpu time consumed")
+ .build();
+ let meter_container_cpu_user_seconds_total = meter
+ .f64_counter("container_cpu_user_seconds_total")
+ .with_unit("s")
+ .with_description("Cumulative userland cpu time consumed")
+ .build();
+ let meter_container_cpu_system_seconds_total = meter
+ .f64_counter("container_cpu_system_seconds_total")
+ .with_unit("s")
+ .with_description("Cumulative kernel cpu time consumed")
+ .build();
+
+ let meter_container_cpu_cfs_periods_total = meter
+ .u64_counter("container_cpu_cfs_periods_total")
+ .with_description("Number of elapsed enforcement period intervals")
+ .build();
+ let meter_container_cpu_cfs_throttled_periods_total = meter
+ .u64_counter("container_cpu_cfs_throttled_periods_total")
+ .with_description("Number of throttled period intervals")
+ .build();
+ let meter_container_cpu_cfs_throttled_seconds_total = meter
+ .f64_counter("container_cpu_cfs_throttled_seconds_total")
+ .with_unit("s")
+ .with_description("Total time duration the container has been throttled")
+ .build();
+
+ let meter_container_fs_reads_bytes_total = meter
+ .u64_counter("container_fs_reads_bytes_total")
+ .with_unit("B")
+ .with_description("Cumulative bytes read")
+ .build();
+ let meter_container_fs_writes_bytes_total = meter
+ .u64_counter("container_fs_writes_bytes_total")
+ .with_unit("B")
+ .with_description("Cumulative bytes written")
+ .build();
+
+ let meter_container_last_seen = meter
+ .u64_gauge("container_last_seen")
+ .with_unit("s")
+ .with_description("Last time this container was seen by ContainerSpy")
+ .build();
+
+ while let Some(val) = stats_stream.next().await {
+ if let Ok(stats) = val {
+ meter_container_cpu_usage_seconds_total.add(
+ cpu_delta_from_docker(
+ stats.cpu_stats.cpu_usage.total_usage,
+ stats.precpu_stats.cpu_usage.total_usage,
+ )
+ .as_secs_f64(),
+ shared_labels,
+ );
+
+ meter_container_cpu_user_seconds_total.add(
+ cpu_delta_from_docker(
+ stats.cpu_stats.cpu_usage.usage_in_usermode,
+ stats.precpu_stats.cpu_usage.usage_in_usermode,
+ )
+ .as_secs_f64(),
+ shared_labels,
+ );
+
+ meter_container_cpu_system_seconds_total.add(
+ cpu_delta_from_docker(
+ stats.cpu_stats.cpu_usage.usage_in_kernelmode,
+ stats.precpu_stats.cpu_usage.usage_in_kernelmode,
+ )
+ .as_secs_f64(),
+ shared_labels,
+ );
+
+ meter_container_cpu_cfs_periods_total.add(
+ stats.cpu_stats.throttling_data.periods - stats.precpu_stats.throttling_data.periods,
+ shared_labels,
+ );
+
+ meter_container_cpu_cfs_throttled_periods_total.add(
+ stats.cpu_stats.throttling_data.throttled_periods
+ - stats.precpu_stats.throttling_data.throttled_periods,
+ shared_labels,
+ );
+
+ meter_container_cpu_cfs_throttled_seconds_total.add(
+ cpu_delta_from_docker(stats.cpu_stats.throttling_data.throttled_time,
+ stats.precpu_stats.throttling_data.throttled_time).as_secs_f64(),
+ shared_labels,
+ );
+
+ // other blkio_stats values only exist on cgroups v1 so don't bother.
+ // io_service_bytes_recursive exists only on cgroups v1.
+ // storage_stats only exists on windows.
+ if let Some(service_bytes_rec) = stats.blkio_stats.io_service_bytes_recursive {
+ println!("{service_bytes_rec:?}"); // todo: remove
+
+ for entry in &service_bytes_rec {
+ match entry.op.as_str() {
+ "read" =>
+ meter_container_fs_reads_bytes_total.add(entry.value, shared_labels),
+ "write" =>
+ meter_container_fs_writes_bytes_total.add(entry.value, shared_labels),
+ _ => println!("Unknown service_bytes_recursive entry type {}", entry.op)
+ }
+ }
+ }
+
+ meter_container_last_seen.record(SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs(), shared_labels);
+ } else {
+ // failed to get stats, log as such:
+ // TODO: use json logging or syslog so loki can understand this lol
+ println!(
+ "Failed to get stats for container {container_id}!: {:?}",
+ val.unwrap_err()
+ );
+ }
+ }
+ })
+}
+
+fn cpu_delta_from_docker(cpu_usage: u64, precpu_usage: u64) -> Duration {
+ let delta = cpu_usage - precpu_usage;
+
+ // https://docs.docker.com/reference/api/engine/version/v1.48/#tag/Container/operation/ContainerStats
+ // see response schema > cpu_stats > cpu_usage > total_usage
+ let delta_ns = if cfg!(windows) { delta * 100 } else { delta };
+
+ Duration::from_nanos(delta_ns)
+}