| field | value | date |
|---|---|---|
| author | Hazel Atkinson <yellowsink@riseup.net> | 2025-04-06 21:01:33 +0100 |
| committer | Hazel Atkinson <yellowsink@riseup.net> | 2025-04-06 21:01:33 +0100 |
| commit | a61ac43c3ae5baaede67449dde84ca86cbbc3873 (patch) | |
| tree | 3ae14e652f45f82502711c9ead611a0b4cb877f0 | |
| parent | 6ccc65ab237a9926e6e0f43e3bc1ef357ff16240 (diff) | |
impl more stats
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | README.md | 46 |
| -rw-r--r-- | src/main.rs | 84 |
| -rw-r--r-- | src/stats_task.rs | 214 |
3 files changed, 261 insertions, 83 deletions
@@ -43,7 +43,7 @@ TODO: will write once it actually works ## How to configure | `config.json` | env var | description | default | -| --------------- | -------------------- | ----------------------------------------------------------------- | ---------- | +|-----------------|----------------------|-------------------------------------------------------------------|------------| | `docker_socket` | `CSPY_DOCKER_SOCKET` | The docker socket / named pipe to connect to | unset | | `otlp_protocol` | `CSPY_OTLP_PROTO` | Whether to use httpbinary, httpjson, or grpc to send OTLP metrics | httpbinary | @@ -58,5 +58,47 @@ If a docker socket path is not set, containerspy will try to connect to This is intended to be a dropin replacement for cAdvisor, which lists its supported metrics [here](https://github.com/google/cadvisor/blob/master/docs/storage/prometheus.md). +All generic labels attached to all metrics are implemented, and the status of labels applied only to specific metrics +is listed below ("N/A" if there are none). + The list of ContainerSpy's currently supported items from this list is: - - `container_cpu_usage_seconds_total` + +| Name | Has metric-specific labels | Notes | +|---------------------------------------------|----------------------------|-------------------------| +| `container_cpu_usage_seconds_total` | | | +| `container_cpu_user_seconds_total` | | | +| `container_cpu_system_seconds_total` | | | +| `container_cpu_cfs_periods_total` | | | +| `container_cpu_cfs_throttled_periods_total` | | | +| `container_cpu_cfs_throttled_seconds_total` | | | +| `container_fs_reads_bytes_total` | | Not reported on Windows | +| `container_fs_writes_bytes_total` | | Not reported on Windows | +| `container_last_seen` | | | + +The list of known omitted metrics are: + +| Name | Reason | +|--------------------------------------------------|-----------------------------------| +| `container_cpu_load_average_10s` | Not reported by Docker Engine API | +| `container_cpu_schedstat_run_periods_total` | Not reported by Docker Engine API | +| `container_cpu_schedstat_runqueue_seconds_total` | Not reported by Docker Engine API | +| `container_cpu_schedstat_run_seconds_total` | Not reported by Docker Engine API | +| `container_file_descriptors` | Not reported by Docker Engine API | +| `container_fs_inodes_free` | Not reported by Docker Engine API | +| `container_fs_inodes_total` | Not reported by Docker Engine API | +| `container_fs_io_current` | Not reported by Docker Engine API | +| `container_fs_io_time_seconds_total` | Only reported on cgroups v1 hosts | +| `container_fs_io_time_weighted_seconds_total` | Not reported by Docker Engine API | +| `container_fs_limit_bytes` | Not reported by Docker Engine API | +| `container_fs_read_seconds_total` | Only reported on cgroups v1 hosts | +| `container_fs_reads_merged_total` | Only reported on cgroups v1 hosts | +| `container_fs_reads_total` | Not reported by Docker Engine API | +| `container_fs_sector_reads_total` | Only reported on cgroups v1 hosts | +| `container_fs_write_seconds_total` | Only reported on cgroups v1 hosts | +| `container_fs_writes_merged_total` | Only reported on cgroups v1 hosts | +| `container_fs_writes_total` | Not reported by Docker Engine API | +| `container_fs_sector_writes_total` | Only reported on cgroups v1 hosts | +| `container_fs_usage_bytes` | Requires SystemDataUsage API | +| `container_hugetlb_failcnt` | Not reported by Docker Engine API | +| `container_hugetlb_max_usage_bytes` | Not reported by Docker Engine API | +| 
`container_hugetlb_usage_bytes` | Not reported by Docker Engine API | diff --git a/src/main.rs b/src/main.rs index 6a6d61f..2bbfe3f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,18 +1,15 @@ use std::{collections::BTreeMap, sync::Arc, time::Duration}; use anyhow::Result; -use bollard::{container::StatsOptions, Docker}; -use bollard::models::ContainerSummary; -use opentelemetry::KeyValue; +use bollard::Docker; use config::CONFIG; -use opentelemetry::metrics::MeterProvider; use opentelemetry_otlp::{MetricExporter, Protocol, WithExportConfig}; use opentelemetry_sdk::metrics::SdkMeterProvider; use tokio::task::JoinHandle; -use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; mod config; +mod stats_task; fn setup_otlp() -> Result<SdkMeterProvider> { let metric_exporter = @@ -105,7 +102,7 @@ async fn main() -> Result<()> { let id_string = cont.id.as_ref().unwrap(); if !tasks.contains_key(id_string) { // all this string cloning hurts me - tasks.insert(id_string.clone(), launch_stats_task(cont, docker.clone(), meter_provider.clone())); + tasks.insert(id_string.clone(), stats_task::launch_stats_task(cont, docker.clone(), meter_provider.clone())); } } } @@ -118,79 +115,4 @@ async fn main() -> Result<()> { println!("clean shutdown."); Ok(()) -} - -// I do not enjoy taking a bunch of Rcs but tokio needs ownership so fine. -fn launch_stats_task(container: ContainerSummary, docker: Arc<Docker>, meter_provider: Arc<SdkMeterProvider>) -> JoinHandle<()> { - tokio::spawn(async move { - // extract some container info - let container_id = container.id.unwrap(); - let container_name = container.names.iter().flatten().next().map(|n| n.trim_start_matches("/").to_owned()); - - let mut stats_stream = - docker.stats(container_id.as_str(), Some(StatsOptions { - stream: true, - one_shot: false - })); - - // drop the first read - loop { - match stats_stream.next().await { - None => return, - Some(Ok(_)) => break, - Some(Err(err)) => { - // TODO: use json logging or syslog so loki can understand this lol - println!("Failed to get stats for container {container_id}!: {err:?}"); - } - } - } - - // container labels shared for all metrics - let mut shared_labels = vec![ - KeyValue::new("id", container_id.to_owned()), - KeyValue::new("image", container.image.unwrap_or(container.image_id.unwrap())) - ]; - - if let Some(name) = container_name { - shared_labels.push(KeyValue::new("name", name)); - } - - if let Some(docker_labels) = &container.labels { - for (key, value) in docker_labels { - shared_labels.push(KeyValue::new("container_label_".to_string() + key, value.clone())) - } - } - - // free space and make mutable - shared_labels.shrink_to_fit(); - let shared_labels = &shared_labels[..]; - - //println!("Starting reporting for container: {shared_labels:?}"); - - // create meters - let meter_container_cpu_usage_seconds_total = meter_provider.meter("test_meter").f64_counter("container_cpu_usage_seconds_total").with_unit("s").with_description("Cumulative cpu time consumed").build(); - - while let Some(val) = stats_stream.next().await { - if let Ok(stats) = val { - meter_container_cpu_usage_seconds_total.add( - cpu_delta_from_docker(stats.cpu_stats.cpu_usage.total_usage, stats.precpu_stats.cpu_usage.total_usage).as_secs_f64(), - shared_labels); - } - else { - // failed to get stats, log as such: - // TODO: use json logging or syslog so loki can understand this lol - println!("Failed to get stats for container {container_id}!: {:?}", val.unwrap_err()); - } - } - }) -} - -fn cpu_delta_from_docker(cpu_usage: 
u64, precpu_usage: u64) -> Duration { - let delta = cpu_usage - precpu_usage; - - // https://docs.docker.com/reference/api/engine/version/v1.48/#tag/Container/operation/ContainerStats - // see response schema > cpu_stats > cpu_usage > total_usage - let delta_ns = if cfg!(windows) { delta * 100 } else { delta }; - - Duration::from_nanos(delta_ns) }
\ No newline at end of file diff --git a/src/stats_task.rs b/src/stats_task.rs new file mode 100644 index 0000000..464793a --- /dev/null +++ b/src/stats_task.rs @@ -0,0 +1,214 @@ +use bollard::container::StatsOptions; +use bollard::models::ContainerSummary; +use bollard::Docker; +use opentelemetry::metrics::MeterProvider; +use opentelemetry::KeyValue; +use opentelemetry_sdk::metrics::SdkMeterProvider; +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tokio::task::JoinHandle; +use tokio_stream::StreamExt; + +// I do not enjoy taking a bunch of Rcs but tokio needs ownership so fine. +pub fn launch_stats_task( + container: ContainerSummary, + docker: Arc<Docker>, + meter_provider: Arc<SdkMeterProvider>, +) -> JoinHandle<()> { + tokio::spawn(async move { + // extract some container info + let container_id = container.id.unwrap(); + let container_name = container + .names + .iter() + .flatten() + .next() + .map(|n| n.trim_start_matches("/").to_owned()); + + let meter_name = "cspy_".to_string() + container_id.as_str(); + // lol 'static moment + let meter_name = &*Box::leak(meter_name.into_boxed_str()); + + let mut stats_stream = docker.stats( + container_id.as_str(), + Some(StatsOptions { + stream: true, + one_shot: false, + }), + ); + + // drop the first read + loop { + match stats_stream.next().await { + None => return, + Some(Ok(_)) => break, + Some(Err(err)) => { + // TODO: use json logging or syslog so loki can understand this lol + println!("Failed to get stats for container {container_id}!: {err:?}"); + } + } + } + + // container labels shared for all metrics + let mut shared_labels = vec![ + KeyValue::new("id", container_id.to_owned()), + KeyValue::new( + "image", + container.image.unwrap_or(container.image_id.unwrap()), + ), + ]; + + if let Some(name) = container_name { + shared_labels.push(KeyValue::new("name", name)); + } + + if let Some(docker_labels) = &container.labels { + for (key, value) in docker_labels { + shared_labels.push(KeyValue::new( + "container_label_".to_string() + key, + value.clone(), + )) + } + } + + // free space and make mutable + shared_labels.shrink_to_fit(); + let shared_labels = &shared_labels[..]; + + //println!("Starting reporting for container: {shared_labels:?}"); + + // create meters + let meter = meter_provider.meter(meter_name); + + let meter_container_cpu_usage_seconds_total = meter + .f64_counter("container_cpu_usage_seconds_total") + .with_unit("s") + .with_description("Cumulative cpu time consumed") + .build(); + let meter_container_cpu_user_seconds_total = meter + .f64_counter("container_cpu_user_seconds_total") + .with_unit("s") + .with_description("Cumulative userland cpu time consumed") + .build(); + let meter_container_cpu_system_seconds_total = meter + .f64_counter("container_cpu_system_seconds_total") + .with_unit("s") + .with_description("Cumulative kernel cpu time consumed") + .build(); + + let meter_container_cpu_cfs_periods_total = meter + .u64_counter("container_cpu_cfs_periods_total") + .with_description("Number of elapsed enforcement period intervals") + .build(); + let meter_container_cpu_cfs_throttled_periods_total = meter + .u64_counter("container_cpu_cfs_throttled_periods_total") + .with_description("Number of throttled period intervals") + .build(); + let meter_container_cpu_cfs_throttled_seconds_total = meter + .f64_counter("container_cpu_cfs_throttled_seconds_total") + .with_unit("s") + .with_description("Total time duration the container has been throttled") + .build(); + + let 
meter_container_fs_reads_bytes_total = meter + .u64_counter("container_fs_reads_bytes_total") + .with_unit("B") + .with_description("Cumulative bytes read") + .build(); + let meter_container_fs_writes_bytes_total = meter + .u64_counter("container_fs_writes_bytes_total") + .with_unit("B") + .with_description("Cumulative bytes written") + .build(); + + let meter_container_last_seen = meter + .u64_gauge("container_last_seen") + .with_unit("s") + .with_description("Last time this container was seen by ContainerSpy") + .build(); + + while let Some(val) = stats_stream.next().await { + if let Ok(stats) = val { + meter_container_cpu_usage_seconds_total.add( + cpu_delta_from_docker( + stats.cpu_stats.cpu_usage.total_usage, + stats.precpu_stats.cpu_usage.total_usage, + ) + .as_secs_f64(), + shared_labels, + ); + + meter_container_cpu_user_seconds_total.add( + cpu_delta_from_docker( + stats.cpu_stats.cpu_usage.usage_in_usermode, + stats.precpu_stats.cpu_usage.usage_in_usermode, + ) + .as_secs_f64(), + shared_labels, + ); + + meter_container_cpu_system_seconds_total.add( + cpu_delta_from_docker( + stats.cpu_stats.cpu_usage.usage_in_kernelmode, + stats.precpu_stats.cpu_usage.usage_in_kernelmode, + ) + .as_secs_f64(), + shared_labels, + ); + + meter_container_cpu_cfs_periods_total.add( + stats.cpu_stats.throttling_data.periods - stats.precpu_stats.throttling_data.periods, + shared_labels, + ); + + meter_container_cpu_cfs_throttled_periods_total.add( + stats.cpu_stats.throttling_data.throttled_periods + - stats.precpu_stats.throttling_data.throttled_periods, + shared_labels, + ); + + meter_container_cpu_cfs_throttled_seconds_total.add( + cpu_delta_from_docker(stats.cpu_stats.throttling_data.throttled_time, + stats.precpu_stats.throttling_data.throttled_time).as_secs_f64(), + shared_labels, + ); + + // other blkio_stats values only exist on cgroups v1 so don't bother. + // io_service_bytes_recursive exists only on cgroups v1. + // storage_stats only exists on windows. + if let Some(service_bytes_rec) = stats.blkio_stats.io_service_bytes_recursive { + println!("{service_bytes_rec:?}"); // todo: remove + + for entry in &service_bytes_rec { + match entry.op.as_str() { + "read" => + meter_container_fs_reads_bytes_total.add(entry.value, shared_labels), + "write" => + meter_container_fs_writes_bytes_total.add(entry.value, shared_labels), + _ => println!("Unknown service_bytes_recursive entry type {}", entry.op) + } + } + } + + meter_container_last_seen.record(SystemTime::now().duration_since(UNIX_EPOCH).unwrap().as_secs(), shared_labels); + } else { + // failed to get stats, log as such: + // TODO: use json logging or syslog so loki can understand this lol + println!( + "Failed to get stats for container {container_id}!: {:?}", + val.unwrap_err() + ); + } + } + }) +} + +fn cpu_delta_from_docker(cpu_usage: u64, precpu_usage: u64) -> Duration { + let delta = cpu_usage - precpu_usage; + + // https://docs.docker.com/reference/api/engine/version/v1.48/#tag/Container/operation/ContainerStats + // see response schema > cpu_stats > cpu_usage > total_usage + let delta_ns = if cfg!(windows) { delta * 100 } else { delta }; + + Duration::from_nanos(delta_ns) +} |
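The `cpu_delta_from_docker` helper added in `src/stats_task.rs` subtracts the previous sample's cumulative counter from the current one; if Docker ever reports a smaller value than the previous sample (for example after a daemon restart), the plain `u64` subtraction would panic in debug builds or wrap in release. A minimal sketch of a clamping variant, assuming that clamping the delta to zero on a counter reset is acceptable behaviour — this is a suggestion alongside the commit, not code from it:

```rust
use std::time::Duration;

/// Convert a pair of cumulative CPU-usage samples from the Docker Engine API
/// into the time elapsed between them. On Linux the counters are nanoseconds;
/// on Windows they are 100 ns ticks (see the ContainerStats response schema,
/// cpu_stats > cpu_usage > total_usage).
/// `saturating_sub` clamps to zero instead of panicking or wrapping if the
/// counter ever goes backwards (assumption: clamping is the desired behaviour).
fn cpu_delta_from_docker_saturating(cpu_usage: u64, precpu_usage: u64) -> Duration {
    let delta = cpu_usage.saturating_sub(precpu_usage);
    let delta_ns = if cfg!(windows) { delta.saturating_mul(100) } else { delta };
    Duration::from_nanos(delta_ns)
}
```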

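For reference, the `otlp_protocol` / `CSPY_OTLP_PROTO` values listed in the README table (`httpbinary`, `httpjson`, `grpc`) correspond to variants of `opentelemetry_otlp::Protocol`, which `src/main.rs` imports for `setup_otlp`. A hedged sketch of how such a mapping could look — the function name and the exact string matching are assumptions, not code from this repository:

```rust
use opentelemetry_otlp::Protocol;

/// Hypothetical helper: translate the `otlp_protocol` / `CSPY_OTLP_PROTO`
/// setting into opentelemetry_otlp's Protocol enum. The accepted strings and
/// the default (`httpbinary`) follow the README table; how containerspy
/// actually parses them is not shown in this diff.
fn parse_otlp_protocol(value: &str) -> Protocol {
    match value {
        "grpc" => Protocol::Grpc,
        "httpjson" => Protocol::HttpJson,
        // default per the README: httpbinary
        _ => Protocol::HttpBinary,
    }
}
```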