diff --git a/health_functions.sql b/health_functions.sql index ca34632..68c23eb 100644 --- a/health_functions.sql +++ b/health_functions.sql @@ -38,28 +38,57 @@ $$ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION public.c77_dbh_get_mv_details(p_mv_name text) RETURNS JSON AS $$ DECLARE - v_io_load numeric; -- Estimated I/O rate (blocks/sec) - v_cpu_wait boolean; -- True if waiting on CPU - v_mem_pressure numeric; -- Buffer hit ratio (lower = more memory pressure) + v_io_load numeric; -- Blocks read per second + v_cpu_time numeric; -- CPU time per call (ms) + v_mem_hit_ratio numeric; -- Cache hit ratio + v_has_pg_stat_statements boolean; -- Check if extension is available + v_stats_reset timestamp; -- Time of last stats reset BEGIN - -- Calculate system stats - SELECT - (d.blks_read + d.blks_hit) / EXTRACT(EPOCH FROM (NOW() - d.stats_reset)) AS io_load, - EXISTS ( - SELECT 1 - FROM pg_stat_activity - WHERE query ILIKE '%REFRESH MATERIALIZED VIEW%' - AND wait_event_type = 'CPU' - AND state = 'active' - ) AS cpu_wait, - CASE - WHEN b.buffers_clean + b.buffers_backend = 0 THEN 1.0 - ELSE b.buffers_clean::numeric / (b.buffers_clean + b.buffers_backend) - END AS mem_pressure - INTO v_io_load, v_cpu_wait, v_mem_pressure - FROM pg_stat_database d - CROSS JOIN pg_stat_bgwriter b - WHERE d.datname = current_database(); + -- Check if pg_stat_statements is installed + SELECT EXISTS ( + SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements' + ) INTO v_has_pg_stat_statements; + + -- Get stats_reset separately for the current database + SELECT stats_reset + INTO v_stats_reset + FROM pg_stat_database + WHERE datname = current_database(); + + -- If pg_stat_statements is available, use it for refined stats + IF v_has_pg_stat_statements THEN + SELECT + COALESCE(SUM(s.shared_blks_read) / EXTRACT(EPOCH FROM (NOW() - v_stats_reset)), 0) AS io_load, + COALESCE(SUM(s.total_time) / GREATEST(SUM(s.calls), 1), 0) AS cpu_time, + CASE + WHEN SUM(s.shared_blks_hit + s.shared_blks_read) = 0 THEN 1.0 + ELSE SUM(s.shared_blks_hit)::numeric / (SUM(s.shared_blks_hit) + SUM(s.shared_blks_read)) + END AS mem_hit_ratio + INTO v_io_load, v_cpu_time, v_mem_hit_ratio + FROM pg_stat_statements s + JOIN pg_stat_database d ON s.dbid = d.datid + WHERE s.query ILIKE '%REFRESH MATERIALIZED VIEW ' || p_mv_name || '%' + AND d.datname = current_database(); + ELSE + -- Fallback to basic stats without pg_stat_statements + SELECT + (d.blks_read + d.blks_hit) / EXTRACT(EPOCH FROM (NOW() - d.stats_reset)) AS io_load, + EXISTS ( + SELECT 1 + FROM pg_stat_activity + WHERE query ILIKE '%REFRESH MATERIALIZED VIEW%' + AND wait_event_type = 'CPU' + AND state = 'active' + )::numeric AS cpu_time, -- 1 if waiting, 0 if not (crude proxy) + CASE + WHEN b.buffers_clean + b.buffers_backend = 0 THEN 1.0 + ELSE b.buffers_clean::numeric / (b.buffers_clean + b.buffers_backend) + END AS mem_hit_ratio + INTO v_io_load, v_cpu_time, v_mem_hit_ratio + FROM pg_stat_database d + CROSS JOIN pg_stat_bgwriter b + WHERE d.datname = current_database(); + END IF; RETURN ( SELECT json_build_object( @@ -76,15 +105,15 @@ BEGIN ELSE 1 END, 'insight', CASE - WHEN v_io_load > 1000 THEN 'Refreshes are slower due to high I/O load.' - WHEN v_cpu_wait THEN 'Refreshes are slower due to CPU contention.' - WHEN v_mem_pressure < 0.9 THEN 'Refreshes are slower due to memory pressure.' + WHEN v_io_load > 100 THEN 'Refreshes are slower due to high I/O load.' + WHEN v_cpu_time > (CASE WHEN v_has_pg_stat_statements THEN 1000 ELSE 0.5 END) THEN 'Refreshes are slower due to CPU usage.' + WHEN v_mem_hit_ratio < 0.9 THEN 'Refreshes are slower due to memory pressure.' ELSE 'Refreshes are performing normally.' END, 'action', CASE - WHEN v_io_load > 1000 THEN 'Optimize I/O or schedule during off-peak hours.' - WHEN v_cpu_wait THEN 'Increase CPU capacity or reduce concurrent load.' - WHEN v_mem_pressure < 0.9 THEN 'Adjust memory settings (e.g., shared_buffers).' + WHEN v_io_load > 100 THEN 'Optimize I/O or schedule during off-peak hours.' + WHEN v_cpu_time > (CASE WHEN v_has_pg_stat_statements THEN 1000 ELSE 0.5 END) THEN 'Increase CPU capacity or optimize queries.' + WHEN v_mem_hit_ratio < 0.9 THEN 'Adjust memory settings (e.g., work_mem, shared_buffers).' ELSE 'No action needed.' END, 'details', json_build_object( @@ -94,19 +123,25 @@ BEGIN 'max_refresh_time', refresh_mv_time_max::text, 'refresh_count', refresh_count, 'io_load', round(v_io_load)::text || ' blocks/sec', - 'cpu_waiting', v_cpu_wait::text, - 'buffer_hit_ratio', round(v_mem_pressure * 100)::text || '%', + 'cpu_time', (CASE + WHEN v_has_pg_stat_statements THEN round(v_cpu_time)::text || ' ms/call' + ELSE (v_cpu_time * 100)::text || '% waiting' + END), + 'buffer_hit_ratio', round(v_mem_hit_ratio * 100)::text || '%', 'bottleneck', CASE - WHEN v_io_load > 1000 THEN 'I/O' - WHEN v_cpu_wait THEN 'CPU' - WHEN v_mem_pressure < 0.9 THEN 'RAM' + WHEN v_io_load > 100 THEN 'I/O' + WHEN v_cpu_time > (CASE WHEN v_has_pg_stat_statements THEN 1000 ELSE 0.5 END) THEN 'CPU' + WHEN v_mem_hit_ratio < 0.9 THEN 'RAM' ELSE 'None' END, 'explanations', json_build_object( 'last_refresh_time', 'Last refresh duration compared to historical min/max.', - 'io_load', 'High I/O load (blocks/sec) indicates disk operations are limiting performance.', - 'cpu_waiting', 'True if the refresh is waiting on CPU resources.', - 'buffer_hit_ratio', 'Lower ratio means more disk access due to memory pressure.' + 'io_load', 'Blocks read per second; high values indicate disk I/O bottleneck.', + 'cpu_time', (CASE + WHEN v_has_pg_stat_statements THEN 'Average CPU time per refresh; high values suggest CPU bottleneck.' + ELSE 'Percentage of time waiting on CPU (crude estimate).' + END), + 'buffer_hit_ratio', 'Percentage of blocks from cache; <90% suggests memory pressure.' ) ) )