diff options
| author | Hazel Atkinson <yellowsink@riseup.net> | 2025-04-08 14:26:32 +0100 |
|---|---|---|
| committer | Hazel Atkinson <yellowsink@riseup.net> | 2025-04-08 14:31:53 +0100 |
| commit | 707539b3a98db039f11234c36bc9aa09a1521bab (patch) | |
| tree | 9f5b1a084ca76d2e01969ac1f458b7ba4c9e2a08 | |
| parent | d351eb8d5a00e597cea3f93abeb2c91f077b3ceb (diff) | |
| download | containerspy-707539b3a98db039f11234c36bc9aa09a1521bab.tar.gz containerspy-707539b3a98db039f11234c36bc9aa09a1521bab.tar.bz2 containerspy-707539b3a98db039f11234c36bc9aa09a1521bab.zip | |
impl memory metrics
| -rw-r--r-- | README.md | 55 | ||||
| -rw-r--r-- | src/stats_task.rs | 102 |
2 files changed, 119 insertions, 38 deletions
@@ -80,26 +80,34 @@ is listed below ("N/A" if there are none). The list of ContainerSpy's currently supported items from this list is: -| Name | Has metric-specific labels | Notes | -|----------------------------------------------------|----------------------------|--------------------------------| -| `container_cpu_usage_seconds_total` | | | -| `container_cpu_user_seconds_total` | | | -| `container_cpu_system_seconds_total` | | | -| `container_cpu_cfs_periods_total` | | | -| `container_cpu_cfs_throttled_periods_total` | | | -| `container_cpu_cfs_throttled_seconds_total` | | | -| `container_fs_reads_bytes_total` | | Not reported on Windows (TODO) | -| `container_fs_writes_bytes_total` | | Not reported on Windows (TODO) | -| `container_last_seen` | | | -| `container_network_receive_bytes_total` | `interface` | | -| `container_network_receive_errors_total` | `interface` | Not reported on Windows | -| `container_network_receive_packets_dropped_total` | `interface` | | -| `container_network_receive_packets_total` | `interface` | | -| `container_network_transmit_bytes_total` | `interface` | | -| `container_network_transmit_errors_total` | `interface` | Not reported on Windows | -| `container_network_transmit_packets_dropped_total` | `interface` | | -| `container_network_transmit_packets_total` | `interface` | | -| `container_start_time_seconds` | | | +| Name | Metric-specific labels | Notes | +|----------------------------------------------------|-------------------------|--------------------------------| +| `container_cpu_usage_seconds_total` | TODO: `cpu` | | +| `container_cpu_user_seconds_total` | N/A | | +| `container_cpu_system_seconds_total` | N/A | | +| `container_cpu_cfs_periods_total` | | | +| `container_cpu_cfs_throttled_periods_total` | | | +| `container_cpu_cfs_throttled_seconds_total` | | | +| `container_fs_reads_bytes_total` | TODO: `device` | Not reported on Windows (TODO) | +| `container_fs_writes_bytes_total` | TODO: `device` | Not reported on Windows (TODO) | +| `container_last_seen` | N/A | | +| `container_memory_cache` | N/A | Not reported on Windows | +| `container_memory_failures_total` | `failure_type`, `scope` | Not reported on Windows | +| `container_memory_mapped_file` | N/A | Not reported on Windows | +| `container_memory_rss` | N/A | Not reported on Windows | +| `container_memory_usage_bytes` | N/A | Not reported on Windows | +| `container_memory_working_set_bytes` | N/A | Not reported on Windows | +| `container_network_receive_bytes_total` | `interface` | | +| `container_network_receive_errors_total` | `interface` | Not reported on Windows | +| `container_network_receive_packets_dropped_total` | `interface` | | +| `container_network_receive_packets_total` | `interface` | | +| `container_network_transmit_bytes_total` | `interface` | | +| `container_network_transmit_errors_total` | `interface` | Not reported on Windows | +| `container_network_transmit_packets_dropped_total` | `interface` | | +| `container_network_transmit_packets_total` | `interface` | | +| `container_start_time_seconds` | N/A | | + +Additional TODO: figure out which of these metrics are or are not reportable on Windows. The list of known omitted metrics are: @@ -131,9 +139,12 @@ The list of known omitted metrics are: | `container_llc_occupancy_bytes` | Not reported by Docker Engine API | | `container_memory_bandwidth_bytes` | Not reported by Docker Engine API | | `container_memory_bandwidth_local_bytes` | Not reported by Docker Engine API | -| ... | | +| `container_memory_failcnt` | Only reported on cgroups v1 hosts | +| `container_memory_kernel_usage` | Undocumented, cspy has it, but i'm unsure my math's right! | | `container_memory_max_usage_bytes` | Only reported on cgroups v1 hosts | -| ... | | +| `container_memory_migrate` | Not reported by Docker Engine API (or cA on my pc!) | +| `container_memory_numa_pages` | Difficult to collect, not reported by cA on my pc | +| `container_memory_swap` | Not reported by Docker Engine API | | `container_network_advance_tcp_stats_total` | Not reported by Docker Engine API | | `container_network_tcp6_usage_total` | Not reported by Docker Engine API | | `container_network_tcp_usage_total` | Not reported by Docker Engine API | diff --git a/src/stats_task.rs b/src/stats_task.rs index a687a37..ec73685 100644 --- a/src/stats_task.rs +++ b/src/stats_task.rs @@ -60,7 +60,7 @@ pub fn launch_stats_task( // I'm going to rust jail! let first_read = unsafe { first_read.assume_init() }; - let Stats { blkio_stats, networks: mut last_net_stats, .. } = first_read; + let Stats { blkio_stats, networks: mut last_net_stats, memory_stats: mut last_mem_stats, .. } = first_read; let mut last_io_stats = blkio_stats.io_service_bytes_recursive; @@ -86,10 +86,34 @@ pub fn launch_stats_task( } } - // free space and make mutable + // other label sets that are static per container + let mut labels_mem_container_min_c = shared_labels.clone(); + labels_mem_container_min_c.push(KeyValue::new("failure_type", "pgfault")); + + let mut labels_mem_container_maj_c = shared_labels.clone(); + labels_mem_container_maj_c.push(KeyValue::new("failure_type", "pgmajfault")); + + let mut labels_mem_container_min_h = labels_mem_container_min_c.clone(); + labels_mem_container_min_h.push(KeyValue::new("scope", "hierarchy")); + labels_mem_container_min_c.push(KeyValue::new("scope", "container")); + + let mut labels_mem_container_maj_h = labels_mem_container_maj_c.clone(); + labels_mem_container_maj_h.push(KeyValue::new("scope", "hierarchy")); + labels_mem_container_maj_c.push(KeyValue::new("scope", "container")); + + // free space and make immutable shared_labels.shrink_to_fit(); let shared_labels = &shared_labels[..]; + labels_mem_container_min_c.shrink_to_fit(); + labels_mem_container_min_h.shrink_to_fit(); + labels_mem_container_maj_c.shrink_to_fit(); + labels_mem_container_maj_h.shrink_to_fit(); + let labels_mem_container_min_c = &labels_mem_container_min_c[..]; + let labels_mem_container_min_h = &labels_mem_container_min_h[..]; + let labels_mem_container_maj_c = &labels_mem_container_maj_c[..]; + let labels_mem_container_maj_h = &labels_mem_container_maj_h[..]; + //println!("Starting reporting for container: {shared_labels:?}"); // create meters @@ -141,7 +165,36 @@ pub fn launch_stats_task( .with_description("Last time this container was seen by ContainerSpy") .build(); - // memory stats go here + // annoyingly a lot of the meter names cadvisor went with don't have units attached even though they have known units + let meter_container_memory_cache = meter + .u64_gauge("container_memory_cache") + //.with_unit("By") + .with_description("Total page cache memory") + .build(); + let meter_container_memory_failures_total = meter + .u64_counter("container_memory_failures_total") + .with_description("Cumulative count of memory allocation failures") + .build(); + let meter_container_memory_mapped_file = meter + .u64_gauge("container_memory_mapped_file") + //.with_unit("By") + .with_description("Size of memory mapped files") + .build(); + let meter_container_memory_rss = meter + .u64_gauge("container_memory_rss") + //.with_unit("By") + .with_description("Size of RSS") + .build(); + let meter_container_memory_usage_bytes = meter + .u64_gauge("container_memory_usage_bytes") + .with_unit("By") + .with_description("Current memory usage, including all memory regardless of when it was accessed") + .build(); + let meter_container_memory_working_set_bytes = meter + .u64_gauge("container_memory_working_set_bytes") + .with_unit("By") + .with_description("Current working set") + .build(); let meter_container_network_receive_bytes_total = meter .u64_counter("container_network_receive_bytes_total") @@ -288,38 +341,55 @@ pub fn launch_stats_task( // - https://github.com/google/cadvisor/blob/f6e31a3c/info/v1/container.go#L389 (yes, v1, roll w it) // - https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + // see https://stackoverflow.com/a/66778814 and also https://archive.is/qJWTp + // also see this comparison between cAdvisor output and {stats.memory_stats.usage:?} {v2stats:?} + // on my dev laptop: https://web.archive.org/web/20250408121954/https://pastebin.com/Kc4Ur0Hr + // and jackpot: https://github.com/google/cadvisor/blob/1f17a6c/container/libcontainer/handler.go#L808 + if let Some(all_usage) = stats.memory_stats.usage { if cfg!(windows) { // todo // i have no way to test cgroups v2 so only work on v1 - see readme for more info } else if let Some(MemoryStatsStats::V2(v2stats)) = stats.memory_stats.stats { - // container_memory_cache - // container_memory_failcnt only on cgroups v1 + // container_memory_cache + meter_container_memory_cache.record(v2stats.file, shared_labels); // container_memory_failures_total - v2stats.pgfault; // label failure_type=pgfault - v2stats.pgmajfault; // label failure_type=pgmajfault + // need last + if let Some(MemoryStatsStats::V2(last_v2)) = last_mem_stats.stats { + meter_container_memory_failures_total.add(v2stats.pgfault - last_v2.pgfault, labels_mem_container_min_c); + meter_container_memory_failures_total.add(v2stats.pgfault - last_v2.pgfault, labels_mem_container_min_h); - // container_memory_mapped_file - v2stats.file; // includes tmpfs - - // container_memory_max_usage_bytes only on cgroups v1 + meter_container_memory_failures_total.add(v2stats.pgmajfault - last_v2.pgmajfault, labels_mem_container_maj_c); + meter_container_memory_failures_total.add(v2stats.pgmajfault - last_v2.pgmajfault, labels_mem_container_maj_h); + } - // container_memory_migrate + // container_memory_kernel_usage + // actually not reported by cA but is reported by docker! + // not sure if slab contains kernel_stack or not though + // in my one sample, kernel_stack < slab + //v2stats.slab + v2stats.kernel_stack; - // container_memory_numa_pages omitted cause its hard :< + // container_memory_mapped_file + meter_container_memory_mapped_file.record(v2stats.file_mapped, shared_labels); // includes tmpfs - // container_memory_rss: may need recalcing + // container_memory_rss + meter_container_memory_rss.record(v2stats.anon, shared_labels); // container_memory_swap: can't get + // need accesss to memory.swap.*, but we only have memory.stat :( - // container_memory_usage_bytes: how? + // container_memory_usage_bytes + meter_container_memory_usage_bytes.record(all_usage, shared_labels); - // container_memory_working_set_bytes: not reported + // container_memory_working_set_bytes + meter_container_memory_working_set_bytes.record(all_usage - v2stats.inactive_file, shared_labels); } } + last_mem_stats = stats.memory_stats; + // networking // TODO: what is stats.network? is it populated on windows? if let Some(net) = &stats.networks { |
