diff --git a/health_functions.sql b/health_functions.sql index 68c23eb..61448bb 100644 --- a/health_functions.sql +++ b/health_functions.sql @@ -1,40 +1,120 @@ CREATE OR REPLACE FUNCTION public.c77_dbh_get_mv_health() RETURNS JSON AS $$ +DECLARE + v_has_pg_stat_statements boolean; -- Check if extension is available + v_stats_reset timestamp; -- Time of last stats reset + v_base_io_load numeric; -- Fallback I/O load + v_base_mem_hit_ratio numeric; -- Fallback memory hit ratio BEGIN + -- Check if pg_stat_statements is installed + SELECT EXISTS ( + SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements' + ) INTO v_has_pg_stat_statements; + + -- Get stats_reset, base I/O load, and base memory hit ratio for the current database + SELECT + d.stats_reset, + (d.blks_read + d.blks_hit) / EXTRACT(EPOCH FROM (NOW() - d.stats_reset)), + CASE + WHEN b.buffers_clean + b.buffers_backend = 0 THEN 1.0 + ELSE b.buffers_clean::numeric / (b.buffers_clean + b.buffers_backend) + END + INTO v_stats_reset, v_base_io_load, v_base_mem_hit_ratio + FROM pg_stat_database d + CROSS JOIN pg_stat_bgwriter b + WHERE d.datname = current_database(); + RETURN ( SELECT json_agg( json_build_object( 'metric', 'Materialized View Refresh', - 'mv_name', mv_name, + 'mv_name', m.mv_name, 'status', CASE - WHEN refresh_mv_time_last > refresh_mv_time_max * 0.9 THEN 'Red' - WHEN refresh_mv_time_last > (refresh_mv_time_total / GREATEST(refresh_count, 1)) * 1.2 THEN 'Yellow' + WHEN m.refresh_mv_time_last > m.refresh_mv_time_max * 0.9 THEN 'Red' + WHEN m.refresh_mv_time_last > (m.refresh_mv_time_total / GREATEST(m.refresh_count, 1)) * 1.2 THEN 'Yellow' ELSE 'Green' END, 'severity', CASE - WHEN refresh_mv_time_last > refresh_mv_time_max * 0.9 THEN 5 - WHEN refresh_mv_time_last > (refresh_mv_time_total / GREATEST(refresh_count, 1)) * 1.2 THEN 3 + WHEN m.refresh_mv_time_last > m.refresh_mv_time_max * 0.9 THEN 5 + WHEN m.refresh_mv_time_last > (m.refresh_mv_time_total / GREATEST(m.refresh_count, 1)) * 1.2 THEN 3 ELSE 1 END, 'insight', CASE - WHEN refresh_mv_time_last > refresh_mv_time_max * 0.9 THEN 'Refreshes are critically slow compared to historical max.' - WHEN refresh_mv_time_last > (refresh_mv_time_total / GREATEST(refresh_count, 1)) * 1.2 THEN 'Refreshes are slower than average.' + WHEN v_io_load > 100 THEN 'Refreshes are slower due to high I/O load.' + WHEN v_cpu_time > (CASE WHEN v_has_pg_stat_statements THEN 1000 ELSE 0.5 END) THEN 'Refreshes are slower due to CPU usage.' + WHEN v_mem_hit_ratio < 0.9 THEN 'Refreshes are slower due to memory pressure.' ELSE 'Refreshes are performing normally.' END, 'action', CASE - WHEN refresh_mv_time_last > refresh_mv_time_max * 0.9 THEN 'Investigate immediately; consider optimizing queries or resources.' - WHEN refresh_mv_time_last > (refresh_mv_time_total / GREATEST(refresh_count, 1)) * 1.2 THEN 'Review refresh schedule or resource usage.' + WHEN v_io_load > 100 THEN 'Optimize I/O or schedule during off-peak hours.' + WHEN v_cpu_time > (CASE WHEN v_has_pg_stat_statements THEN 1000 ELSE 0.5 END) THEN 'Increase CPU capacity or optimize queries.' + WHEN v_mem_hit_ratio < 0.9 THEN 'Adjust memory settings (e.g., work_mem, shared_buffers).' ELSE 'No action needed.' END, - 'last_refresh_time', refresh_mv_time_last::text, - 'refresh_count', refresh_count + 'last_refresh_time', m.refresh_mv_time_last::text, + 'refresh_count', m.refresh_count, + 'io_load', round(v_io_load)::text || ' blocks/sec', + 'cpu_time', (CASE + WHEN v_has_pg_stat_statements THEN round(v_cpu_time)::text || ' ms/call' + ELSE (v_cpu_time * 100)::text || '% waiting' + END), + 'buffer_hit_ratio', round(v_mem_hit_ratio * 100)::text || '%', + 'bottleneck', CASE + WHEN v_io_load > 100 THEN 'I/O' + WHEN v_cpu_time > (CASE WHEN v_has_pg_stat_statements THEN 1000 ELSE 0.5 END) THEN 'CPU' + WHEN v_mem_hit_ratio < 0.9 THEN 'RAM' + ELSE 'None' + END ) ) - FROM public.c77_dbh_mv_stats + FROM public.c77_dbh_mv_stats m + CROSS JOIN LATERAL ( + -- Calculate bottleneck stats per MV + SELECT + CASE + WHEN v_has_pg_stat_statements THEN + COALESCE(SUM(s.shared_blks_read) / EXTRACT(EPOCH FROM (NOW() - v_stats_reset)), 0) + ELSE + v_base_io_load + END AS v_io_load, + CASE + WHEN v_has_pg_stat_statements THEN + COALESCE(SUM(s.total_time) / GREATEST(SUM(s.calls), 1), 0) + ELSE + CASE + WHEN EXISTS ( + SELECT 1 + FROM pg_stat_activity + WHERE query ILIKE '%REFRESH MATERIALIZED VIEW ' || m.mv_name || '%' + AND wait_event_type = 'CPU' + AND state = 'active' + ) THEN 1 + ELSE 0 + END + END AS v_cpu_time, + CASE + WHEN v_has_pg_stat_statements THEN + CASE + WHEN SUM(s.shared_blks_hit + s.shared_blks_read) = 0 THEN 1.0 + ELSE SUM(s.shared_blks_hit)::numeric / (SUM(s.shared_blks_hit) + SUM(s.shared_blks_read)) + END + ELSE + v_base_mem_hit_ratio + END AS v_mem_hit_ratio + FROM ( + -- Subquery to avoid duplicate logic + SELECT 1 AS dummy + ) AS dummy + LEFT JOIN pg_stat_statements s ON v_has_pg_stat_statements + AND s.query ILIKE '%REFRESH MATERIALIZED VIEW ' || m.mv_name || '%' + AND s.dbid = (SELECT datid FROM pg_stat_database WHERE datname = current_database()) + ) AS stats ); END; $$ LANGUAGE plpgsql; + + CREATE OR REPLACE FUNCTION public.c77_dbh_get_mv_details(p_mv_name text) RETURNS JSON AS $$ DECLARE