-- c77_mvc--1.0.sql -- Check if c77_dbh extension is installed DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'c77_dbh') THEN RAISE NOTICE 'The c77_dbh extension is not installed. This extension is required for c77_mvc to function properly.'; RAISE NOTICE 'Please install c77_dbh first by following the instructions at https://git.jctr3.com/trogers1884/c77_dbh'; RAISE NOTICE 'Example: CREATE EXTENSION c77_dbh; (after compiling and installing from source if necessary)'; RAISE EXCEPTION 'Installation aborted due to missing c77_dbh extension.'; END IF; END; $$; -- If we reach here, c77_dbh is installed, so proceed with the installation -- Create the table CREATE TABLE IF NOT EXISTS public.c77_mvc_table_fitness_audit ( run_id BIGSERIAL, run_timestamp timestamp without time zone DEFAULT CURRENT_TIMESTAMP, source_schema text COLLATE pg_catalog."default", source_table text COLLATE pg_catalog."default", analysis_result jsonb, notes text[] COLLATE pg_catalog."default", CONSTRAINT table_fitness_audit_pkey PRIMARY KEY (run_id) ) TABLESPACE pg_default; CREATE INDEX IF NOT EXISTS idx_table_fitness_audit_table ON public.c77_mvc_table_fitness_audit USING btree (source_schema COLLATE pg_catalog."default" ASC NULLS LAST, source_table COLLATE pg_catalog."default" ASC NULLS LAST) TABLESPACE pg_default; CREATE INDEX IF NOT EXISTS idx_table_fitness_audit_timestamp ON public.c77_mvc_table_fitness_audit USING btree (run_timestamp ASC NULLS LAST) TABLESPACE pg_default; -- Define the functions in dependency order CREATE OR REPLACE FUNCTION public.c77_mvc_analyze_column_combinations(temp_table_name text, column_stats jsonb, sample_size bigint, total_rows bigint, exclude_key_columns text[]) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE column_combinations jsonb := '{}'; combination_result record; distinct_count bigint; uniqueness_ratio numeric; duplicate_count bigint; synthetic_uniqueness numeric; adjusted_sample_size bigint; sampling_percentage numeric; BEGIN -- Adjust sample size to not exceed total rows adjusted_sample_size := LEAST(sample_size, total_rows); sampling_percentage := (adjusted_sample_size::float / total_rows * 100); -- Analyze column combinations FOR combination_result IN ( SELECT c1.key AS col1, c2.key AS col2, ((c1.value->>'fitness_score')::float + (c2.value->>'fitness_score')::float) / 2 AS avg_fitness FROM jsonb_each(column_stats) c1, jsonb_each(column_stats) c2 WHERE c1.key < c2.key AND (c1.value->>'fitness_score')::float >= 70 AND (c2.value->>'fitness_score')::float >= 70 AND NOT (c1.key = ANY(exclude_key_columns)) AND NOT (c2.key = ANY(exclude_key_columns)) ORDER BY avg_fitness DESC LIMIT 5 ) LOOP -- Test uniqueness of the combination EXECUTE format('SELECT COUNT(DISTINCT (%I, %I)) FROM (SELECT %I, %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t', combination_result.col1, combination_result.col2, combination_result.col1, combination_result.col2, temp_table_name, sampling_percentage::text, adjusted_sample_size) INTO distinct_count; uniqueness_ratio := distinct_count::float / adjusted_sample_size; -- Simulate synthetic key uniqueness EXECUTE format('SELECT COUNT(*) FROM ( SELECT ROW_NUMBER() OVER (PARTITION BY %I, %I ORDER BY random()) AS rn FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s ) t WHERE rn > 1', combination_result.col1, combination_result.col2, temp_table_name, sampling_percentage::text, adjusted_sample_size) INTO duplicate_count; synthetic_uniqueness := 1 - (duplicate_count::float / adjusted_sample_size); -- Store combination stats column_combinations := column_combinations || jsonb_build_object( format('%s,%s', combination_result.col1, combination_result.col2), jsonb_build_object( 'uniqueness_ratio', uniqueness_ratio, 'synthetic_uniqueness', synthetic_uniqueness, 'discrimination_power', uniqueness_ratio, 'avg_fitness_score', combination_result.avg_fitness ) ); END LOOP; RETURN column_combinations; EXCEPTION WHEN OTHERS THEN RETURN jsonb_build_object( 'error', format('Failed to analyze column combinations: %s', SQLERRM) ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_assemble_matv_result(target_schema text, target_mv_name text, partition_columns text[], order_by_columns text[], exclude_hash_columns text[], where_clause text, custom_sql text, notes text[]) RETURNS json LANGUAGE plpgsql AS $$ DECLARE vtw_name text := replace(target_mv_name, 'matc_', 'vtw_'); vm_name text := replace(target_mv_name, 'matc_', 'vm_'); BEGIN notes := array_append(notes, format('Process completed at %s', clock_timestamp())); RETURN json_build_object( 'message', format('Created view %I.%I, materialized view %I.%I, and view %I.%I for reading.', target_schema, vtw_name, target_schema, target_mv_name, target_schema, vm_name), 'view_name', format('%I.%I', target_schema, vtw_name), 'matview_name', format('%I.%I', target_schema, target_mv_name), 'vm_view_name', format('%I.%I', target_schema, vm_name), 'partition_columns', partition_columns, 'order_by_columns', order_by_columns, 'exclude_hash_columns', exclude_hash_columns, 'where_clause', where_clause, 'custom_sql', custom_sql, 'notes', notes ); EXCEPTION WHEN OTHERS THEN RETURN json_build_object( 'error', format('Failed to assemble result: %s', SQLERRM), 'notes', notes ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_assemble_result(source_schema text, source_table text, column_stats jsonb, column_combinations jsonb, order_by_candidates jsonb, data_quality_index numeric, notes text[], temp_table_name text) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE result_json jsonb; run_id integer; updated_notes text[] := notes; -- Create a local copy of notes BEGIN -- Build the result JSON updated_notes := array_append(updated_notes, format('Analysis completed at %s', clock_timestamp())); result_json := jsonb_build_object( 'message', format('Analysis of %I.%I completed', source_schema, source_table), 'column_stats', column_stats, 'recommended_partition_combinations', column_combinations, 'order_by_candidates', order_by_candidates, 'data_quality_index', ROUND(data_quality_index, 2), 'notes', updated_notes ); -- Store results in audit table INSERT INTO public.c77_mvc_table_fitness_audit ( source_schema, source_table, analysis_result, notes ) VALUES ( source_schema, source_table, result_json, updated_notes ) RETURNING table_fitness_audit.run_id INTO run_id; -- Add run_id to the result result_json := result_json || jsonb_build_object('run_id', run_id); -- Clean up temporary table EXECUTE format('DROP TABLE IF EXISTS %I', temp_table_name); updated_notes := array_append(updated_notes, format('Dropped temporary table %s', temp_table_name)); -- Update result_json with the final notes result_json := result_json || jsonb_build_object('notes', updated_notes); RETURN result_json; EXCEPTION WHEN OTHERS THEN RAISE NOTICE 'Error assembling result: %', SQLERRM; RETURN jsonb_build_object( 'error', format('Failed to assemble result: %s', SQLERRM), 'notes', updated_notes ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_calculate_dqi(column_stats jsonb) RETURNS numeric LANGUAGE plpgsql AS $$ DECLARE dqi_components jsonb := '{}'; col_name text; null_ratio numeric; encoding_issue_ratio numeric; uniqueness_ratio numeric; component_score numeric; BEGIN -- Calculate DQI components for each column FOR col_name IN SELECT key FROM jsonb_object_keys(column_stats) AS key LOOP null_ratio := (column_stats->col_name->>'null_ratio')::numeric; encoding_issue_ratio := (column_stats->col_name->>'encoding_issue_ratio')::numeric; uniqueness_ratio := (column_stats->col_name->>'uniqueness_ratio')::numeric; component_score := (1 - null_ratio) * 0.4 + (1 - encoding_issue_ratio) * 0.4 + uniqueness_ratio * 0.2; dqi_components := dqi_components || jsonb_build_object(col_name, component_score); END LOOP; -- Calculate average DQI across all columns (scaled to 0-100) RETURN (SELECT AVG(value::numeric) * 100 FROM jsonb_each_text(dqi_components)); EXCEPTION WHEN OTHERS THEN RAISE NOTICE 'Error calculating DQI: %', SQLERRM; RETURN 0; END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_calculate_matv_sample_size(full_matview_name text, params jsonb) RETURNS bigint LANGUAGE plpgsql AS $$ DECLARE total_matview_records bigint; sample_size bigint; BEGIN -- Get total records EXECUTE format('SELECT COUNT(*) FROM %s', full_matview_name) INTO total_matview_records; -- Calculate sample size using c77_mvc_calculate_sample_size sample_size := public.c77_mvc_calculate_sample_size(total_matview_records); RETURN sample_size; EXCEPTION WHEN OTHERS THEN RAISE NOTICE 'Error calculating sample size: %', SQLERRM; RETURN 100; END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_calculate_sample_size(total_rows bigint, confidence_level numeric DEFAULT 0.99, margin_of_error numeric DEFAULT 0.03) RETURNS bigint LANGUAGE plpgsql AS $$ DECLARE z_score numeric; n0 numeric; p numeric := 0.5; -- Conservative estimate for maximum variability sample_size bigint; BEGIN -- Map confidence level to Z-score z_score := CASE WHEN confidence_level = 0.90 THEN 1.645 WHEN confidence_level = 0.95 THEN 1.96 WHEN confidence_level = 0.99 THEN 2.576 ELSE 2.576 -- Default to 99% END; -- Initial sample size (infinite population) n0 := (z_score * z_score * p * (1 - p)) / (margin_of_error * margin_of_error); -- Adjust for finite population sample_size := CEIL(n0 * total_rows / (n0 + total_rows)); sample_size := GREATEST(sample_size, 1000); -- Minimum sample size for small tables sample_size := LEAST(sample_size, total_rows); -- Cap at total rows RETURN sample_size; END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_collect_matv_stats(full_matview_name text, full_vtw_name text) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE total_matview_records bigint; clean_records bigint; encoding_issues bigint; total_vtw_records bigint; last_matview_update timestamp with time zone; last_vtw_update timestamp with time zone; size_pretty_string text; size_mb numeric; BEGIN -- Collect stats EXECUTE format('SELECT COUNT(*), COUNT(*) FILTER (WHERE encoding_status = ''CLEAN''), COUNT(*) FILTER (WHERE encoding_status IS DISTINCT FROM ''CLEAN'') FROM %s', full_matview_name) INTO total_matview_records, clean_records, encoding_issues; EXECUTE format('SELECT COUNT(*) FROM %s', full_vtw_name) INTO total_vtw_records; EXECUTE format('SELECT MAX(rowlastupdated) FROM %s', full_matview_name) INTO last_matview_update; EXECUTE format('SELECT MAX(rowlastupdated) FROM %s', full_vtw_name) INTO last_vtw_update; EXECUTE format('SELECT pg_size_pretty(pg_total_relation_size(''%s'')::BIGINT)::TEXT', full_matview_name) INTO size_pretty_string; size_mb := regexp_replace(size_pretty_string, '[^0-9.]', '', 'g')::NUMERIC; RETURN jsonb_build_object( 'total_matview_records', total_matview_records, 'clean_records', clean_records, 'encoding_issues', encoding_issues, 'total_vtw_records', total_vtw_records, 'last_matview_update', last_matview_update, 'last_vtw_update', last_vtw_update, 'size_mb', size_mb ); EXCEPTION WHEN OTHERS THEN RAISE NOTICE 'Error fetching stats: %', SQLERRM; RETURN jsonb_build_object( 'total_matview_records', 0, 'clean_records', 0, 'encoding_issues', 0, 'total_vtw_records', 0, 'last_matview_update', NULL, 'last_vtw_update', NULL, 'size_mb', 0 ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_create_indexes(target_schema text, target_mv_name text, partition_columns text[]) RETURNS text[] LANGUAGE plpgsql AS $$ DECLARE notes text[] := '{}'; encoding_index_name text; content_hash_index_name text; unique_index_name text; vtw_name text := replace(target_mv_name, 'matc_', 'vtw_'); vm_name text := replace(target_mv_name, 'matc_', 'vm_'); BEGIN -- Index on encoding_status encoding_index_name := 'idx_' || target_mv_name || '_encoding_status'; EXECUTE format('CREATE INDEX %I ON %I.%I (encoding_status)', encoding_index_name, target_schema, target_mv_name); -- Index on content_hash content_hash_index_name := 'idx_' || target_mv_name || '_content_hash'; EXECUTE format('CREATE INDEX %I ON %I.%I (content_hash)', content_hash_index_name, target_schema, target_mv_name); notes := array_append(notes, 'Created index on content_hash'); -- Unique index on synthetic_key and partition columns SELECT string_agg(quote_ident(unnest), ', ') INTO unique_index_name FROM unnest(partition_columns); unique_index_name := format( 'CREATE UNIQUE INDEX %I ON %I.%I (synthetic_key, %s)', 'idx_' || target_mv_name || '_synthetic_key', target_schema, target_mv_name, COALESCE(unique_index_name, '1') ); BEGIN EXECUTE unique_index_name; notes := array_append(notes, 'Successfully created unique index on synthetic_key and partition columns'); EXCEPTION WHEN OTHERS THEN EXECUTE format('DROP MATERIALIZED VIEW %I.%I', target_schema, target_mv_name); EXECUTE format('DROP VIEW IF EXISTS %I.%I', target_schema, vtw_name); EXECUTE format('DROP VIEW IF EXISTS %I.%I', target_schema, vm_name); notes := array_append(notes, format('Failed to create unique index: %s', SQLERRM)); SELECT string_agg(quote_ident(unnest), ', ') INTO unique_index_name FROM unnest(partition_columns); unique_index_name := format( 'CREATE INDEX %I ON %I.%I (synthetic_key, %s)', 'idx_' || target_mv_name || '_synthetic_key_fallback', target_schema, target_mv_name, COALESCE(unique_index_name, '1') ); EXECUTE unique_index_name; notes := array_append(notes, 'Created non-unique fallback index due to unique index failure'); END; RETURN notes; EXCEPTION WHEN OTHERS THEN RETURN array_append(notes, format('Error creating indexes: %s', SQLERRM)); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_create_optimized_matv(source_schema text, source_table text, target_schema text, target_matview text, partition_columns text[], order_by_columns text[], exclude_columns_from_hash text[] DEFAULT ARRAY[]::text[], filter_latest_only boolean DEFAULT false) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE source_full_name text := quote_ident(source_schema) || '.' || quote_ident(source_table); vtw_name text := replace(target_matview, 'matc_', 'vtw_'); vm_name text := replace(target_matview, 'matc_', 'vm_'); vprob_name text := replace(target_matview, 'matc_', 'vprob_'); vtw_full_name text := quote_ident(target_schema) || '.' || quote_ident(vtw_name); vm_full_name text := quote_ident(target_schema) || '.' || quote_ident(vm_name); vprob_full_name text := quote_ident(target_schema) || '.' || quote_ident(vprob_name); matview_full_name text := quote_ident(target_schema) || '.' || quote_ident(target_matview); columns_list text; vm_columns_list text; hash_columns_list text; encoding_check_list text; partition_clause text := ''; order_by_clause text := ''; create_vtw_sql text; create_matview_sql text; create_vm_sql text; create_vprob_sql text; create_index_sql text; notes text[] := '{}'; column_record record; BEGIN -- Step 1: Get the list of columns with regexp_replace for non-partition character-based columns (for vtw_) columns_list := ''; vm_columns_list := ''; FOR column_record IN ( SELECT column_name, data_type FROM information_schema.columns WHERE table_schema = source_schema AND table_name = source_table AND column_name NOT IN ('content_hash', 'synthetic_key') ORDER BY column_name ) LOOP -- For vtw_: Apply regexp_replace to character-based non-partition columns IF column_record.column_name = ANY(partition_columns) THEN columns_list := columns_list || quote_ident(column_record.column_name) || ', '; ELSIF column_record.data_type IN ('character', 'character varying', 'varchar', 'char', 'text') THEN columns_list := columns_list || format('regexp_replace(t.%I, ''[^\x00-\x7F]''::text, ''PROBLEM''::text, ''g''::text) AS %I, ', column_record.column_name, column_record.column_name); ELSE columns_list := columns_list || quote_ident(column_record.column_name) || ', '; END IF; -- For vm_ and vprob_: Just the column names, no regexp_replace or t. prefix vm_columns_list := vm_columns_list || quote_ident(column_record.column_name) || ', '; END LOOP; columns_list := rtrim(columns_list, ', '); vm_columns_list := rtrim(vm_columns_list, ', '); -- Step 2: Validate partition_columns IF array_length(partition_columns, 1) IS NULL OR array_length(partition_columns, 1) = 0 THEN RAISE EXCEPTION 'partition_columns cannot be empty. At least one column is required for partitioning to ensure proper deduplication.'; END IF; -- Step 3: Validate order_by_columns IF array_length(order_by_columns, 1) IS NULL OR array_length(order_by_columns, 1) = 0 THEN RAISE EXCEPTION 'order_by_columns cannot be empty. At least one column is required to ensure deterministic ordering for synthetic_key generation.'; END IF; -- Step 4: Get the list of columns for the content_hash IF exclude_columns_from_hash IS NULL OR array_length(exclude_columns_from_hash, 1) IS NULL OR array_length(exclude_columns_from_hash, 1) = 0 THEN -- If exclude_columns_from_hash is empty, include all columns SELECT string_agg('t.' || quote_ident(column_name), ', ') INTO hash_columns_list FROM information_schema.columns WHERE table_schema = source_schema AND table_name = source_table; notes := array_append(notes, 'exclude_columns_from_hash is empty; including all columns from the source table in content_hash calculation'); ELSE -- Otherwise, exclude the specified columns SELECT string_agg('t.' || quote_ident(column_name), ', ') INTO hash_columns_list FROM information_schema.columns WHERE table_schema = source_schema AND table_name = source_table AND column_name NOT IN ( SELECT unnest(exclude_columns_from_hash) ); -- If excluding the specified columns results in no columns, include all columns as a fallback IF hash_columns_list IS NULL THEN SELECT string_agg('t.' || quote_ident(column_name), ', ') INTO hash_columns_list FROM information_schema.columns WHERE table_schema = source_schema AND table_name = source_table; notes := array_append(notes, 'exclude_columns_from_hash excluded all columns; including all columns from the source table in content_hash calculation as a fallback'); END IF; END IF; -- Step 5: Get the list of columns for encoding_status check SELECT string_agg(format('t.%I::text ~ ''[^\x00-\x7F]''::text', column_name), ' OR ') INTO encoding_check_list FROM information_schema.columns WHERE table_schema = source_schema AND table_name = source_table; -- Step 6: Build partition and order-by clauses for synthetic_key IF array_length(partition_columns, 1) > 0 THEN partition_clause := 'PARTITION BY ' || array_to_string(partition_columns, ', '); END IF; IF array_length(order_by_columns, 1) > 0 THEN order_by_clause := 'ORDER BY ' || array_to_string(order_by_columns, ', '); END IF; -- Step 7: Create the vtw_ view with content_hash, synthetic_key, and encoding_status IF filter_latest_only THEN create_vtw_sql := format(' CREATE OR REPLACE VIEW %s AS SELECT * FROM ( SELECT md5(CAST(ROW(%s) AS text)) AS content_hash, %s, (row_number() OVER (%s %s))::bigint AS synthetic_key, CASE WHEN %s THEN ''ENCODING_ISSUE''::text ELSE ''CLEAN''::text END AS encoding_status FROM %s t ) sub WHERE synthetic_key = 1', vtw_full_name, hash_columns_list, columns_list, partition_clause, order_by_clause, encoding_check_list, source_full_name ); ELSE create_vtw_sql := format(' CREATE OR REPLACE VIEW %s AS SELECT md5(CAST(ROW(%s) AS text)) AS content_hash, %s, (row_number() OVER (%s %s))::bigint AS synthetic_key, CASE WHEN %s THEN ''ENCODING_ISSUE''::text ELSE ''CLEAN''::text END AS encoding_status FROM %s t', vtw_full_name, hash_columns_list, columns_list, partition_clause, order_by_clause, encoding_check_list, source_full_name ); END IF; EXECUTE create_vtw_sql; notes := array_append(notes, format('Created view %s', vtw_full_name)); -- Step 8: Create the matc_ materialized view as a direct copy of vtw_ create_matview_sql := format(' CREATE MATERIALIZED VIEW IF NOT EXISTS %s AS SELECT * FROM %s', matview_full_name, vtw_full_name ); EXECUTE create_matview_sql; notes := array_append(notes, format('Created materialized view %s', matview_full_name)); -- Step 9: Add indexes on matc_ -- Index on encoding_status create_index_sql := format(' CREATE INDEX IF NOT EXISTS %I ON %s (encoding_status)', target_matview || '_encoding_status_idx', matview_full_name ); EXECUTE create_index_sql; notes := array_append(notes, format('Created index %s on encoding_status', target_matview || '_encoding_status_idx')); -- Index on content_hash create_index_sql := format(' CREATE INDEX IF NOT EXISTS %I ON %s (content_hash)', target_matview || '_content_hash_idx', matview_full_name ); EXECUTE create_index_sql; notes := array_append(notes, format('Created index %s on content_hash', target_matview || '_content_hash_idx')); -- Unique index on (synthetic_key, partition_columns) IF array_length(partition_columns, 1) > 0 THEN create_index_sql := format(' CREATE UNIQUE INDEX IF NOT EXISTS %I ON %s (synthetic_key, %s)', target_matview || '_unique_key_idx', matview_full_name, array_to_string(partition_columns, ', ') ); BEGIN EXECUTE create_index_sql; notes := array_append(notes, format('Created unique index %s on (synthetic_key, %s)', target_matview || '_unique_key_idx', array_to_string(partition_columns, ', '))); EXCEPTION WHEN unique_violation THEN notes := array_append(notes, format('Unexpected failure to create unique index %s on (synthetic_key, %s) due to duplicate values in %s. This should not happen due to synthetic_key generation. Check the synthetic_key logic in %s and look for duplicates using: SELECT synthetic_key, %s, count(*) FROM %s GROUP BY synthetic_key, %s HAVING count(*) > 1;', target_matview || '_unique_key_idx', array_to_string(partition_columns, ', '), matview_full_name, vtw_full_name, array_to_string(partition_columns, ', '), matview_full_name, array_to_string(partition_columns, ', '))); END; END IF; -- Step 10: Create the vm_ view, excluding content_hash, synthetic_key, and encoding_status, with WHERE encoding_status = 'CLEAN' create_vm_sql := format(' CREATE OR REPLACE VIEW %s AS SELECT %s FROM %s WHERE encoding_status = ''CLEAN''', vm_full_name, vm_columns_list, matview_full_name ); EXECUTE create_vm_sql; notes := array_append(notes, format('Created view %s', vm_full_name)); -- Step 11: Create the vprob_ view, excluding content_hash, synthetic_key, and encoding_status, with WHERE encoding_status != 'CLEAN' create_vprob_sql := format(' CREATE OR REPLACE VIEW %s AS SELECT %s FROM %s WHERE encoding_status != ''CLEAN''', vprob_full_name, vm_columns_list, matview_full_name ); EXECUTE create_vprob_sql; notes := array_append(notes, format('Created view %s', vprob_full_name)); RETURN jsonb_build_object('notes', notes); EXCEPTION WHEN OTHERS THEN RETURN jsonb_build_object('error', SQLERRM, 'notes', notes); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_create_temp_table(source_schema text, source_table text) RETURNS text LANGUAGE plpgsql AS $$ DECLARE temp_table_name text := 'temp_' || source_table || '_' || to_char(current_timestamp, 'YYYYMMDDHH24MISS'); column_defs text; BEGIN -- Step 1: Generate column definitions from source table SELECT string_agg( format('%I %s', column_name, data_type), ', ' ) INTO column_defs FROM information_schema.columns WHERE table_schema = source_schema AND table_name = source_table AND column_name IS NOT NULL AND TRIM(column_name) != ''; -- Step 2: Create temp table with column definitions EXECUTE format('CREATE TEMP TABLE %I (%s)', temp_table_name, column_defs); -- Step 3: Insert data from source table EXECUTE format('INSERT INTO %I SELECT * FROM %I.%I', temp_table_name, source_schema, source_table); RETURN temp_table_name; END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_generate_column_lists(source_schema text, source_table text) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE columns_list text; non_char_columns text; case_conditions text; BEGIN -- Character-type columns (cleansed) SELECT string_agg( 'regexp_replace(' || quote_ident(column_name) || ', ''[^\x00-\x7F]'', ''PROBLEM'', ''g'') as ' || quote_ident(column_name), ', ' ) INTO columns_list FROM information_schema.columns WHERE table_schema = source_schema AND table_name = source_table AND data_type IN ('character varying', 'character', 'text', 'varchar', 'char'); -- Non-character-type columns SELECT string_agg(quote_ident(column_name), ', ') INTO non_char_columns FROM information_schema.columns WHERE table_schema = source_schema AND table_name = source_table AND data_type NOT IN ('character varying', 'character', 'text', 'varchar', 'char'); -- CASE conditions for encoding status SELECT string_agg(quote_ident(column_name) || ' ~ ''[^\x00-\x7F]''', ' OR ') INTO case_conditions FROM information_schema.columns WHERE table_schema = source_schema AND table_name = source_table AND data_type IN ('character varying', 'character', 'text', 'varchar', 'char'); RETURN jsonb_build_object( 'columns_list', COALESCE(columns_list, ''), 'non_char_columns', COALESCE(non_char_columns, ''), 'case_conditions', COALESCE(case_conditions, 'FALSE') ); EXCEPTION WHEN OTHERS THEN RAISE NOTICE 'Error generating column lists: %', SQLERRM; RETURN jsonb_build_object( 'error', format('Failed to generate column lists: %s', SQLERRM) ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_generate_synthetic_key_and_hash(partition_columns text[], order_by_columns text[], exclude_hash_columns text[], all_columns text[]) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE partition_sql text; order_by_clause text; synthetic_key_sql text; content_hash_sql text; hashable_columns text[]; datetime_format CONSTANT text := 'YYYY-MM-DD HH24:MI:SS'; -- Hardcoded BEGIN -- Partition clause SELECT string_agg(quote_ident(unnest), ', ') INTO partition_sql FROM unnest(partition_columns); partition_sql := format('PARTITION BY %s', COALESCE(partition_sql, '1')); -- Order-by clause SELECT string_agg( format('TO_TIMESTAMP(SUBSTRING(NULLIF(%I, ''''), 1, 19), %L) DESC NULLS LAST', unnest, datetime_format), ', ' ) INTO order_by_clause FROM unnest(order_by_columns); synthetic_key_sql := format( 'ROW_NUMBER() OVER (%s ORDER BY %s) AS synthetic_key', partition_sql, COALESCE(order_by_clause, '1') ); -- Content hash hashable_columns := array( SELECT unnest(all_columns) EXCEPT SELECT unnest(exclude_hash_columns) ); SELECT string_agg(quote_ident(unnest), ', ') INTO content_hash_sql FROM unnest(hashable_columns); content_hash_sql := format('md5(CAST(row_to_json(ROW(%s)) AS text)) AS content_hash', COALESCE(content_hash_sql, '1')); RETURN jsonb_build_object( 'synthetic_key_sql', synthetic_key_sql, 'content_hash_sql', content_hash_sql ); EXCEPTION WHEN OTHERS THEN RETURN jsonb_build_object( 'error', format('Failed to generate synthetic key and hash: %s', SQLERRM) ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_identify_order_by_candidates(temp_table_name text, column_stats jsonb) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE order_by_candidates jsonb := '{}'; col_name text; column_type text; null_ratio numeric; notes text[] := '{}'; BEGIN -- Loop through columns to identify order-by candidates FOR col_name, column_type IN SELECT column_name, data_type FROM information_schema.columns WHERE table_schema LIKE 'pg_temp%' AND table_name = temp_table_name AND column_name IS NOT NULL AND TRIM(column_name) != '' LOOP -- Get null ratio from column_stats null_ratio := (column_stats->col_name->>'null_ratio')::numeric; -- Skip columns with high null ratio IF null_ratio > 0.5 THEN notes := array_append(notes, format('Skipped %I as order-by candidate due to high null ratio: %s', col_name, null_ratio)); CONTINUE; END IF; -- Check for timestamp or text columns IF column_type IN ('timestamp', 'timestamp with time zone', 'timestamp without time zone', 'text') THEN IF column_type = 'text' THEN BEGIN EXECUTE format('SELECT TO_TIMESTAMP(SUBSTRING(NULLIF(%I, ''''), 1, 19), %L) FROM %I LIMIT 1', col_name, 'YYYY-MM-DD HH24:MI:SS', temp_table_name); order_by_candidates := order_by_candidates || jsonb_build_object( col_name, jsonb_build_object( 'fitness_score', (1 - null_ratio) * 100, 'note', 'Text column parseable as timestamp' ) ); EXCEPTION WHEN OTHERS THEN notes := array_append(notes, format('%I is text but not parseable as timestamp: %s', col_name, SQLERRM)); END; ELSE order_by_candidates := order_by_candidates || jsonb_build_object( col_name, jsonb_build_object( 'fitness_score', (1 - null_ratio) * 100, 'note', 'Native timestamp column' ) ); END IF; END IF; END LOOP; RETURN order_by_candidates; EXCEPTION WHEN OTHERS THEN RETURN jsonb_build_object( 'error', format('Failed to identify order-by candidates: %s', SQLERRM), 'notes', notes ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_perform_matv_action(full_matview_name text, schema_name text, matview_name text, action text, mismatched_records bigint, total_matview_records bigint, time_diff interval, mismatch_threshold numeric, time_threshold interval, encoding_issues bigint) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE action_performed boolean := false; action_result text; has_unique_index boolean; index_rec record; constraint_rec record; BEGIN -- Check if the materialized view has a unique index SELECT EXISTS ( SELECT 1 FROM pg_index i JOIN pg_class c ON c.oid = i.indrelid JOIN pg_namespace n ON n.oid = c.relnamespace WHERE n.nspname = schema_name AND c.relname = matview_name AND i.indisunique = true ) INTO has_unique_index; IF action = 'refresh' AND ( (mismatched_records::NUMERIC / NULLIF(total_matview_records, 0)::NUMERIC) * 100 > mismatch_threshold OR time_diff >= time_threshold ) THEN IF has_unique_index THEN EXECUTE format('REFRESH MATERIALIZED VIEW CONCURRENTLY %s', full_matview_name); action_result := 'Refreshed successfully (concurrently)'; ELSE EXECUTE format('REFRESH MATERIALIZED VIEW %s', full_matview_name); action_result := 'Refreshed successfully (non-concurrently: no unique index found)'; RAISE NOTICE 'No unique index found for %, using non-concurrent refresh', full_matview_name; END IF; action_performed := true; ELSIF action = 'repair' AND encoding_issues > 0 THEN -- Drop existing indexes FOR index_rec IN ( SELECT indexname FROM pg_indexes WHERE schemaname = schema_name AND tablename = matview_name AND indexname NOT LIKE '%_pkey' ) LOOP EXECUTE format('DROP INDEX IF EXISTS %I.%I', schema_name, index_rec.indexname); END LOOP; -- Drop primary key or unique constraints FOR constraint_rec IN ( SELECT conname FROM pg_constraint WHERE conrelid = (SELECT oid FROM pg_class WHERE relname = matview_name AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = schema_name)) AND contype IN ('p', 'u') ) LOOP EXECUTE format('ALTER TABLE %I.%I DROP CONSTRAINT %I', schema_name, matview_name, constraint_rec.conname); END LOOP; -- Recreate standard indexes IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = schema_name AND table_name = matview_name AND column_name = 'content_hash') THEN EXECUTE format('CREATE INDEX IF NOT EXISTS %I ON %I.%I (content_hash)', 'idx_' || matview_name || '_content_hash', schema_name, matview_name); END IF; IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = schema_name AND table_name = matview_name AND column_name = 'synthetic_key') THEN EXECUTE format('CREATE UNIQUE INDEX IF NOT EXISTS %I ON %I.%I (synthetic_key)', 'idx_' || matview_name || '_synthetic_key', schema_name, matview_name); END IF; -- Analyze the table EXECUTE format('ANALYZE %I.%I', schema_name, matview_name); action_result := 'Repaired successfully: indexes and keys rebuilt'; action_performed := true; ELSIF action = 'reindex' THEN EXECUTE format('REINDEX TABLE %s', full_matview_name); action_result := 'Reindexed successfully'; action_performed := true; ELSE action_result := 'Action skipped: threshold not met or invalid action'; END IF; RETURN jsonb_build_object( 'action_performed', action_performed, 'action_result', action_result ); EXCEPTION WHEN OTHERS THEN action_result := format('Action failed: %s', SQLERRM); RAISE NOTICE 'Action exception: %', action_result; RETURN jsonb_build_object( 'action_performed', false, 'action_result', action_result ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_set_validation_params(validation_type text) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE params jsonb; mismatch_threshold numeric; time_threshold interval; BEGIN -- Set validation parameters params := CASE validation_type WHEN 'quick' THEN '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb WHEN 'daily' THEN '{"sample_percent": 1.0, "confidence": 0.99, "margin": 0.01}'::jsonb WHEN 'full' THEN '{"sample_percent": 100.0, "confidence": 0.99, "margin": 0.005}'::jsonb ELSE '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb END; -- Set dynamic thresholds mismatch_threshold := CASE validation_type WHEN 'quick' THEN 0.1 -- 0.1% mismatch for quick WHEN 'daily' THEN 0.05 -- 0.05% mismatch for daily WHEN 'full' THEN 0.01 -- 0.01% mismatch for full ELSE 0.1 END; time_threshold := CASE validation_type WHEN 'quick' THEN '3 days'::interval -- 3 days for quick WHEN 'daily' THEN '1 day'::interval -- 1 day for daily WHEN 'full' THEN '12 hours'::interval -- 12 hours for full ELSE '3 days'::interval END; RETURN jsonb_build_object( 'params', params, 'mismatch_threshold', mismatch_threshold, 'time_threshold', time_threshold ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_validate_matv_inputs(schema_name text, matview_name text, vtw_name text) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE full_matview_name text; full_vtw_name text; notes text[] := '{}'; vtw_table_name text; BEGIN -- Construct full names full_matview_name := quote_ident(schema_name) || '.' || quote_ident(matview_name); vtw_table_name := COALESCE(vtw_name, replace(matview_name, 'matc_', 'vtw_')); full_vtw_name := quote_ident(schema_name) || '.' || quote_ident(vtw_table_name); -- Validate materialized view existence IF NOT EXISTS ( SELECT 1 FROM pg_matviews WHERE schemaname = schema_name AND matviewname = matview_name ) THEN RETURN jsonb_build_object( 'error', format('Materialized view %I.%I does not exist', schema_name, matview_name), 'notes', notes ); END IF; -- Validate source view existence IF NOT EXISTS ( SELECT 1 FROM pg_tables WHERE schemaname = schema_name AND tablename = vtw_table_name ) AND NOT EXISTS ( SELECT 1 FROM pg_views WHERE schemaname = schema_name AND viewname = vtw_table_name ) THEN RETURN jsonb_build_object( 'error', format('Source view %I.%I does not exist', schema_name, vtw_table_name), 'notes', notes ); END IF; RETURN jsonb_build_object( 'full_matview_name', full_matview_name, 'full_vtw_name', full_vtw_name, 'notes', notes ); EXCEPTION WHEN OTHERS THEN RETURN jsonb_build_object( 'error', format('Error validating inputs: %s', SQLERRM), 'notes', notes ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_validate_order_by_columns(source_schema text, source_table text, order_by_columns text[]) RETURNS text[] LANGUAGE plpgsql AS $$ DECLARE notes text[] := '{}'; col_name text; datetime_format CONSTANT text := 'YYYY-MM-DD HH24:MI:SS'; -- Hardcoded BEGIN FOREACH col_name IN ARRAY order_by_columns LOOP IF NOT EXISTS ( SELECT 1 FROM information_schema.columns WHERE table_schema = source_schema AND table_name = source_table AND column_name = col_name ) THEN notes := array_append(notes, format('Warning: %I not found in %I.%I', col_name, source_schema, source_table)); ELSE BEGIN EXECUTE format('SELECT TO_TIMESTAMP(SUBSTRING(NULLIF(%I, ''''), 1, 19), %L) FROM %I.%I LIMIT 1', col_name, datetime_format, source_schema, source_table); EXCEPTION WHEN OTHERS THEN notes := array_append(notes, format('Warning: %I contains unparseable timestamp data: %s', col_name, SQLERRM)); END; END IF; END LOOP; RETURN notes; EXCEPTION WHEN OTHERS THEN RETURN array_append(notes, format('Error validating order-by columns: %s', SQLERRM)); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_analyze_column_stats(temp_table_name text, col_name text, column_type text, sample_size bigint, total_rows bigint, exclude_key_columns text[]) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE null_count bigint; distinct_count bigint; encoding_issue_count bigint; uniqueness_ratio numeric; null_ratio numeric; encoding_issue_ratio numeric; fitness_score numeric; adjusted_sample_size bigint; sampling_percentage numeric; BEGIN -- Adjust sample size to not exceed total rows adjusted_sample_size := LEAST(sample_size, total_rows); sampling_percentage := (adjusted_sample_size::float / total_rows * 100); -- Null count EXECUTE format('SELECT COUNT(*) FROM (SELECT %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t WHERE %I IS NULL', col_name, temp_table_name, sampling_percentage::text, adjusted_sample_size, col_name) INTO null_count; null_ratio := null_count::float / adjusted_sample_size; -- Distinct count (skip for excluded columns) IF NOT (col_name = ANY(exclude_key_columns)) THEN EXECUTE format('SELECT COUNT(DISTINCT %I) FROM (SELECT %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t', col_name, col_name, temp_table_name, sampling_percentage::text, adjusted_sample_size) INTO distinct_count; uniqueness_ratio := distinct_count::float / adjusted_sample_size; ELSE uniqueness_ratio := 0; END IF; -- Encoding issues (for text-like columns) IF column_type IN ('character varying', 'character', 'text', 'varchar', 'char') THEN EXECUTE format('SELECT COUNT(*) FROM (SELECT %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t WHERE %I ~ ''[^\x00-\x7F]''', col_name, temp_table_name, sampling_percentage::text, adjusted_sample_size, col_name) INTO encoding_issue_count; encoding_issue_ratio := encoding_issue_count::float / adjusted_sample_size; ELSE encoding_issue_ratio := 0; END IF; -- Fitness score for key fitness (if not excluded) IF NOT (col_name = ANY(exclude_key_columns)) THEN fitness_score := (uniqueness_ratio * 40) + ((1 - null_ratio) * 30) + ((1 - encoding_issue_ratio) * 20) + (CASE WHEN column_type IN ('character varying', 'character', 'text', 'varchar', 'char', 'integer', 'bigint') THEN 10 WHEN column_type IN ('timestamp', 'timestamp with time zone', 'timestamp without time zone') THEN 8 ELSE 5 END); ELSE fitness_score := 0; END IF; -- Return stats as JSONB RETURN jsonb_build_object( 'data_type', column_type, 'uniqueness_ratio', uniqueness_ratio, 'distinct_count', distinct_count, 'null_ratio', null_ratio, 'null_count', null_count, 'encoding_issue_ratio', encoding_issue_ratio, 'encoding_issue_count', encoding_issue_count, 'fitness_score', fitness_score, 'excluded_from_key_fitness', (col_name = ANY(exclude_key_columns)) ); EXCEPTION WHEN OTHERS THEN RETURN jsonb_build_object( 'error', format('Failed to analyze column %I: %s', col_name, SQLERRM), 'data_type', column_type ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_estimate_matv_refresh_time(full_matview_name text) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE v_refresh_count bigint; -- Renamed to avoid conflict v_refresh_total interval; -- Renamed for consistency estimated_refresh_time interval; BEGIN -- Estimate refresh time SELECT s.refresh_count, s.refresh_mv_time_total INTO v_refresh_count, v_refresh_total FROM public.c77_dbh_matv_stats s WHERE s.mv_name = full_matview_name LIMIT 1; IF COALESCE(v_refresh_count, 0) > 0 THEN estimated_refresh_time := v_refresh_total / v_refresh_count::numeric; ELSE estimated_refresh_time := '00:00:00'::interval; END IF; -- Return raw values for debugging RETURN jsonb_build_object( 'estimated_refresh_time', estimated_refresh_time, 'refresh_count', v_refresh_count, 'refresh_total', v_refresh_total ); EXCEPTION WHEN OTHERS THEN RETURN jsonb_build_object( 'estimated_refresh_time', '00:00:00'::interval, 'refresh_count', NULL, 'refresh_total', NULL, 'error', SQLERRM ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_check_matv_mismatches(target_schema text, matview_name text, validation_type text DEFAULT 'quick'::text) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE vtw_name text := 'vtw_' || substring(matview_name from 6); -- Replace 'matc_' with 'vtw_' matview_full_name text := quote_ident(target_schema) || '.' || quote_ident(matview_name); vtw_full_name text := quote_ident(target_schema) || '.' || quote_ident(vtw_name); mismatch_count bigint; mismatch_sql text; content_hash_exists boolean := true; total_matview_records bigint; params jsonb; sample_size bigint; notes text[] := '{}'; BEGIN -- Define validation parameters params := CASE validation_type WHEN 'quick' THEN '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb WHEN 'daily' THEN '{"sample_percent": 1.0, "confidence": 0.99, "margin": 0.01}'::jsonb WHEN 'full' THEN '{"sample_percent": 100.0, "confidence": 0.99, "margin": 0.005}'::jsonb ELSE '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb END; -- Calculate sample size EXECUTE format('SELECT COUNT(*) FROM %s', matview_full_name) INTO total_matview_records; sample_size := GREATEST(100, CEIL((jsonb_extract_path_text(params, 'sample_percent')::NUMERIC / 100) * total_matview_records)); notes := array_append(notes, format('Total matview records: %s, Sample size: %s', total_matview_records, sample_size)); -- Attempt to query content_hash to check if it exists in both relations BEGIN EXECUTE format('SELECT 1 FROM %s WHERE content_hash IS NOT NULL LIMIT 1', vtw_full_name); EXECUTE format('SELECT 1 FROM %s WHERE content_hash IS NOT NULL LIMIT 1', matview_full_name); EXCEPTION WHEN undefined_column THEN content_hash_exists := false; END; -- If content_hash is not found in either, return early IF NOT content_hash_exists THEN RAISE NOTICE 'content_hash column not found in either %.% or %.%, skipping mismatch check', target_schema, matview_name, target_schema, vtw_name; RETURN jsonb_build_object( 'mismatched_records', 0, 'mismatch_percent', 0.0, 'notes', notes ); END IF; -- Construct the mismatch check query IF jsonb_extract_path_text(params, 'sample_percent')::NUMERIC < 100.0 THEN -- Use sampling for quick and daily mismatch_sql := format(' WITH matview_sample AS ( SELECT content_hash FROM %s ORDER BY random() LIMIT %s ), vtw_sample AS ( SELECT content_hash FROM %s ORDER BY random() LIMIT %s ) SELECT COUNT(*) FROM ( SELECT content_hash FROM matview_sample EXCEPT SELECT content_hash FROM vtw_sample ) mismatches', matview_full_name, sample_size, vtw_full_name, sample_size ); ELSE -- Full comparison for 'full' validation mismatch_sql := format(' SELECT COUNT(*) FROM ( SELECT content_hash FROM %s EXCEPT SELECT content_hash FROM %s ) mismatches', vtw_full_name, matview_full_name ); END IF; EXECUTE mismatch_sql INTO mismatch_count; notes := array_append(notes, format('Mismatch count: %s', mismatch_count)); RETURN jsonb_build_object( 'mismatched_records', mismatch_count, 'mismatch_percent', (mismatch_count::float / GREATEST(sample_size, 1)) * 100, 'notes', notes ); EXCEPTION WHEN OTHERS THEN notes := array_append(notes, format('Error in public.c77_mvc_check_matv_mismatches: %s', SQLERRM)); RETURN jsonb_build_object( 'mismatched_records', -1, 'mismatch_percent', -1.0, 'notes', notes ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_analyze_table_fitness(source_schema text, source_table text, exclude_key_columns text[] DEFAULT ARRAY[]::text[]) RETURNS json LANGUAGE plpgsql AS $$ DECLARE total_rows bigint; sample_size bigint; temp_table_name text; notes text[] := '{}'; result_json jsonb; column_stats jsonb := '{}'; order_by_candidates jsonb := '{}'; column_combinations jsonb := '{}'; data_quality_index numeric; col_name text; column_type text; table_exists boolean; confidence_level CONSTANT numeric := 0.99; -- Hardcoded 99% confidence margin_of_error CONSTANT numeric := 0.03; -- Hardcoded 3% margin of error BEGIN notes := array_append(notes, format('Analysis started at %s', clock_timestamp())); -- Step 1: Validate schema and table existence SELECT EXISTS ( SELECT 1 FROM information_schema.tables WHERE table_schema = source_schema AND table_name = source_table ) INTO table_exists; IF NOT table_exists THEN result_json := jsonb_build_object( 'error', format('Table %I.%I does not exist', source_schema, source_table), 'notes', notes ); RETURN result_json::json; END IF; -- Step 2: Get total rows BEGIN EXECUTE format('SELECT COUNT(*) FROM %I.%I', source_schema, source_table) INTO total_rows; notes := array_append(notes, format('Total rows in %I.%I: %s', source_schema, source_table, total_rows)); EXCEPTION WHEN OTHERS THEN result_json := jsonb_build_object( 'error', format('Failed to count rows in %I.%I: %s', source_schema, source_table, SQLERRM), 'notes', notes ); RETURN result_json::json; END; -- Step 3: Calculate sample size with hardcoded confidence level and margin of error sample_size := public.c77_mvc_calculate_sample_size(total_rows, confidence_level, margin_of_error); notes := array_append(notes, format('Sample size calculated: %s for %s rows (Confidence: %s%%, Margin of Error: ±%s%%)', sample_size, total_rows, confidence_level * 100, margin_of_error * 100)); -- Step 4: Create temp table BEGIN temp_table_name := public.c77_mvc_create_temp_table(source_schema, source_table); notes := array_append(notes, format('Created temporary table %s for analysis', temp_table_name)); EXCEPTION WHEN OTHERS THEN result_json := jsonb_build_object( 'error', format('Failed to create temp table for %I.%I: %s', source_schema, source_table, SQLERRM), 'notes', notes ); RETURN result_json::json; END; -- Step 5: Analyze individual columns FOR col_name, column_type IN SELECT column_name, data_type FROM information_schema.columns WHERE table_schema LIKE 'pg_temp%' AND table_name = temp_table_name AND column_name IS NOT NULL AND TRIM(column_name) != '' LOOP column_stats := column_stats || jsonb_build_object( col_name, public.c77_mvc_analyze_column_stats(temp_table_name, col_name, column_type, sample_size, total_rows, exclude_key_columns) ); END LOOP; notes := array_append(notes, 'Completed analysis of individual columns'); -- Step 6: Identify order-by candidates order_by_candidates := public.c77_mvc_identify_order_by_candidates(temp_table_name, column_stats); notes := array_append(notes, 'Completed identification of order-by candidates'); -- Step 7: Analyze column combinations column_combinations := public.c77_mvc_analyze_column_combinations(temp_table_name, column_stats, sample_size, total_rows, exclude_key_columns); notes := array_append(notes, 'Completed analysis of column combinations'); -- Step 8: Calculate Data Quality Index (DQI) data_quality_index := public.c77_mvc_calculate_dqi(column_stats); notes := array_append(notes, format('Data Quality Index (DQI): %s', ROUND(data_quality_index, 2))); -- Step 9: Assemble final result and clean up result_json := public.c77_mvc_assemble_result( source_schema, source_table, column_stats, column_combinations, order_by_candidates, data_quality_index, notes, temp_table_name ); RETURN result_json::json; EXCEPTION WHEN OTHERS THEN result_json := jsonb_build_object( 'error', format('Unexpected error in public.c77_mvc_analyze_table_fitness: %s', SQLERRM), 'notes', notes ); RETURN result_json::json; END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_manage_matv_health(target_schema text, matview_name text, validation_type text DEFAULT 'quick'::text, action text DEFAULT NULL::text) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE vtw_name text := 'vtw_' || substring(matview_name from 6); -- Replace 'matc_' with 'vtw_' matview_full_name text := quote_ident(target_schema) || '.' || quote_ident(matview_name); vtw_full_name text := quote_ident(target_schema) || '.' || quote_ident(vtw_name); total_vtw_records bigint; total_matview_records bigint; encoding_issues bigint; mismatch_result jsonb; mismatch_count bigint; mismatch_percent float; status text; character_encoding_status text; last_refresh timestamp with time zone; staleness_threshold interval; mismatch_threshold float; is_stale_by_time boolean := false; notes text[] := '{}'; start_time timestamp with time zone := now(); end_time timestamp with time zone; elapsed_time interval; refresh_estimate jsonb; estimated_refresh_time interval; stats_found boolean := false; BEGIN -- Log the start time notes := array_append(notes, format('Process started at %s', start_time)); -- Step 1: Check if the materialized view exists BEGIN EXECUTE format('SELECT COUNT(*) FROM %s', matview_full_name) INTO total_matview_records; EXCEPTION WHEN undefined_table THEN status := 'Non Existent'; end_time := now(); elapsed_time := end_time - start_time; notes := array_append(notes, format('Elapsed time: %s', elapsed_time)); RETURN jsonb_build_object( 'total_vtw_records', 0, 'total_matview_records', 0, 'encoding_issues', 0, 'mismatched_records', 0, 'mismatch_percent', 0.0, 'status', status, 'character_encoding_status', 'CLEAN', 'estimated_refresh_time', interval '0 seconds', 'notes', notes ); END; -- Step 2: Check if the materialized view has data IF total_matview_records = 0 THEN status := 'Uninitialized'; end_time := now(); elapsed_time := end_time - start_time; notes := array_append(notes, format('Elapsed time: %s', elapsed_time)); RETURN jsonb_build_object( 'total_vtw_records', 0, 'total_matview_records', 0, 'encoding_issues', 0, 'mismatched_records', 0, 'mismatch_percent', 0.0, 'status', status, 'character_encoding_status', 'CLEAN', 'estimated_refresh_time', interval '0 seconds', 'notes', notes ); END IF; -- Step 3: Get the total records in the source view EXECUTE format('SELECT COUNT(*) FROM %s', vtw_full_name) INTO total_vtw_records; -- Step 4: Check for encoding issues BEGIN EXECUTE format('SELECT COUNT(*) FROM %s WHERE encoding_status = ''ENCODING_ISSUE''', matview_full_name) INTO encoding_issues; EXCEPTION WHEN undefined_column THEN encoding_issues := 0; -- If encoding_status column doesn't exist, assume no issues END; -- Set character_encoding_status IF encoding_issues > 0 THEN character_encoding_status := 'DEGRADED'; ELSE character_encoding_status := 'CLEAN'; END IF; -- Step 5: Check for time-based staleness SELECT refresh_mv_last INTO last_refresh FROM public.c77_dbh_matv_stats WHERE mv_name = target_schema || '.' || matview_name; IF FOUND THEN stats_found := true; ELSE notes := array_append(notes, format('Warning: No refresh stats found for materialized view %s in c77_dbh_matv_stats', matview_name)); END IF; staleness_threshold := CASE validation_type WHEN 'quick' THEN '3 days'::interval WHEN 'daily' THEN '1 day'::interval WHEN 'full' THEN '12 hours'::interval ELSE '3 days'::interval END; notes := array_append(notes, format('Last refresh: %s, Time since last refresh: %s, Staleness threshold: %s', last_refresh, now() - last_refresh, staleness_threshold)); IF last_refresh IS NULL OR (now() - last_refresh) > staleness_threshold THEN is_stale_by_time := true; END IF; notes := array_append(notes, format('Is stale by time: %s', is_stale_by_time)); -- Step 6: Set mismatch threshold based on validation_type mismatch_threshold := CASE validation_type WHEN 'quick' THEN 1.0 -- 1.0% WHEN 'daily' THEN 0.5 -- 0.5% WHEN 'full' THEN 0.2 -- 0.2% ELSE 1.0 END; -- Step 7: Check for mismatches mismatch_result := public.c77_mvc_check_matv_mismatches(target_schema, matview_name, validation_type); mismatch_count := (mismatch_result->>'mismatched_records')::bigint; mismatch_percent := (mismatch_result->>'mismatch_percent')::float; -- Append mismatch notes notes := array_cat(notes, ARRAY(SELECT jsonb_array_elements_text(mismatch_result->'notes'))); -- Log mismatch details notes := array_append(notes, format('Mismatch percent: %s, Mismatch threshold: %s', mismatch_percent, mismatch_threshold)); -- Step 8: Determine refresh status IF is_stale_by_time OR mismatch_percent > mismatch_threshold THEN status := 'Stale'; ELSE status := 'Healthy'; END IF; -- Step 9: Get estimated refresh time using the correct function SELECT public.c77_mvc_estimate_matv_refresh_time(matview_full_name) INTO refresh_estimate; estimated_refresh_time := (refresh_estimate->>'estimated_refresh_time')::interval; notes := array_append(notes, format('Refresh estimate details: %s', refresh_estimate)); -- Step 10: Perform action if specified and status is Stale IF action IS NOT NULL AND status = 'Stale' THEN IF action = 'refresh' THEN -- Refresh the materialized view (using WITH DATA for PostgreSQL 12 compatibility) EXECUTE format('REFRESH MATERIALIZED VIEW %s WITH DATA', matview_full_name); notes := array_append(notes, 'Performed REFRESH on materialized view'); -- No need to update refresh time; handled by system triggers ELSIF action = 'repair' THEN -- Drop and recreate indexes EXECUTE format('DROP INDEX IF EXISTS %s_encoding_status_idx', matview_name); EXECUTE format('DROP INDEX IF EXISTS %s_content_hash_idx', matview_name); EXECUTE format('DROP INDEX IF EXISTS %s_unique_key_idx', matview_name); EXECUTE format('CREATE INDEX %s_encoding_status_idx ON %s (encoding_status)', matview_name, matview_full_name); EXECUTE format('CREATE INDEX %s_content_hash_idx ON %s (content_hash)', matview_name, matview_full_name); EXECUTE format('CREATE UNIQUE INDEX %s_unique_key_idx ON %s (synthetic_key, companyid, orgname_id)', matview_name, matview_full_name); notes := array_append(notes, 'Performed REPAIR (dropped and recreated indexes) on materialized view'); ELSIF action = 'reindex' THEN -- Reindex the materialized view EXECUTE format('REINDEX TABLE %s', matview_full_name); notes := array_append(notes, 'Performed REINDEX on materialized view'); ELSE notes := array_append(notes, format('Invalid action: %s, no action performed', action)); END IF; -- Step 11: Re-evaluate after action EXECUTE format('SELECT COUNT(*) FROM %s', matview_full_name) INTO total_matview_records; EXECUTE format('SELECT COUNT(*) FROM %s WHERE encoding_status = ''ENCODING_ISSUE''', matview_full_name) INTO encoding_issues; mismatch_result := public.c77_mvc_check_matv_mismatches(target_schema, matview_name, validation_type); mismatch_count := (mismatch_result->>'mismatched_records')::bigint; mismatch_percent := (mismatch_result->>'mismatch_percent')::float; -- Append mismatch notes notes := array_cat(notes, ARRAY(SELECT jsonb_array_elements_text(mismatch_result->'notes'))); -- Update character_encoding_status IF encoding_issues > 0 THEN character_encoding_status := 'DEGRADED'; ELSE character_encoding_status := 'CLEAN'; END IF; -- Update status (time-based staleness should be resolved if action was 'refresh') SELECT refresh_mv_last INTO last_refresh FROM public.c77_dbh_matv_stats WHERE mv_name = target_schema || '.' || matview_name; IF NOT FOUND THEN notes := array_append(notes, format('Warning: No refresh stats found for materialized view %s in c77_dbh_matv_stats after action', matview_name)); END IF; notes := array_append(notes, format('After action - Last refresh: %s, Time since last refresh: %s, Staleness threshold: %s', last_refresh, now() - last_refresh, staleness_threshold)); IF last_refresh IS NULL OR (now() - last_refresh) > staleness_threshold THEN is_stale_by_time := true; ELSE is_stale_by_time := false; END IF; notes := array_append(notes, format('After action - Is stale by time: %s', is_stale_by_time)); -- Log mismatch details after action notes := array_append(notes, format('After action - Mismatch percent: %s, Mismatch threshold: %s', mismatch_percent, mismatch_threshold)); IF is_stale_by_time OR mismatch_percent > mismatch_threshold THEN status := 'Stale'; ELSE status := 'Healthy'; END IF; END IF; -- Step 12: Calculate elapsed time and return the results end_time := now(); elapsed_time := end_time - start_time; notes := array_append(notes, format('Elapsed time: %s', elapsed_time)); RETURN jsonb_build_object( 'total_vtw_records', total_vtw_records, 'total_matview_records', total_matview_records, 'encoding_issues', encoding_issues, 'mismatched_records', mismatch_count, 'mismatch_percent', mismatch_percent, 'status', status, 'character_encoding_status', character_encoding_status, 'estimated_refresh_time', estimated_refresh_time, 'notes', notes ); EXCEPTION WHEN OTHERS THEN end_time := now(); elapsed_time := end_time - start_time; notes := array_append(notes, format('Elapsed time: %s', elapsed_time)); notes := array_append(notes, format('Unexpected error in c77_mvc_manage_matv_health: %s', SQLERRM)); RETURN jsonb_build_object( 'error', SQLERRM, 'notes', notes ); END; $$; CREATE OR REPLACE FUNCTION public.c77_mvc_assemble_matv_health_result(full_matview_name text, full_vtw_name text, stats jsonb, mismatched_records bigint, validation_type text, sample_size bigint, mismatch_threshold numeric, action_result text, exec_time timestamp with time zone) RETURNS jsonb LANGUAGE plpgsql AS $$ DECLARE total_matview_records bigint := (stats->>'total_matview_records')::bigint; clean_records bigint := (stats->>'clean_records')::bigint; encoding_issues bigint := (stats->>'encoding_issues')::bigint; clean_percent text; estimated_refresh_time interval; BEGIN -- Calculate clean percent clean_percent := CASE WHEN total_matview_records > 0 THEN to_char(ROUND((clean_records::NUMERIC / total_matview_records::NUMERIC) * 100, 2), 'FM9999999999999999.99') || '%' ELSE 'N/A' END; -- Estimate refresh time estimated_refresh_time := public.c77_mvc_estimate_matv_refresh_time(full_matview_name); -- Assemble result RETURN jsonb_build_object( 'matview', full_matview_name, 'vtw_source', full_vtw_name, 'total_matview_records', total_matview_records::text, 'total_vtw_records', (stats->>'total_vtw_records')::text, 'mismatched_records', mismatched_records::text, 'mismatch_percent', CASE WHEN total_matview_records > 0 THEN to_char(ROUND((mismatched_records::NUMERIC / total_matview_records::NUMERIC) * 100, 2), 'FM9999999999999999.99') || '%' ELSE 'N/A' END, 'clean_records', clean_records::text, 'encoding_issues', encoding_issues::text, 'clean_record%', clean_percent, 'last_matview_update', COALESCE((stats->>'last_matview_update')::text, 'N/A'), 'last_vtw_update', COALESCE((stats->>'last_vtw_update')::text, 'N/A'), 'size_mb', (stats->>'size_mb')::text, 'estimated_refresh_time', to_char(estimated_refresh_time, 'HH24:MI:SS.MS'), 'validation_type', validation_type, 'sample_size', sample_size::text, 'status', CASE WHEN total_matview_records = 0 THEN 'Uninitialized' WHEN (mismatched_records::NUMERIC / NULLIF(total_matview_records, 0)::NUMERIC) * 100 > mismatch_threshold THEN 'Stale' WHEN encoding_issues > 0 THEN 'Degraded' ELSE 'Healthy' END, 'execution_time', to_char(clock_timestamp() - exec_time, 'HH24:MI:SS.MS') ) || COALESCE(jsonb_build_object('action_result', action_result), '{}'); END; $$;