-- c77_mvc extension SQL source (PL/pgSQL, ~1694 lines / 71 KiB)
-- c77_mvc--1.0.sql
|
|
|
|
-- Pre-flight check: c77_mvc depends on the c77_dbh extension.
-- Abort the install with guidance if it is missing.
DO $$
DECLARE
    dbh_installed boolean;
BEGIN
    SELECT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'c77_dbh')
      INTO dbh_installed;

    IF NOT dbh_installed THEN
        RAISE NOTICE 'The c77_dbh extension is not installed. This extension is required for c77_mvc to function properly.';
        RAISE NOTICE 'Please install c77_dbh first by following the instructions at https://git.jctr3.com/trogers1884/c77_dbh';
        RAISE NOTICE 'Example: CREATE EXTENSION c77_dbh; (after compiling and installing from source if necessary)';
        RAISE EXCEPTION 'Installation aborted due to missing c77_dbh extension.';
    END IF;
END;
$$;
|
|
|
|
-- If we reach here, c77_dbh is installed, so proceed with the installation
|
|
|
|
-- Audit log of table fitness analysis runs: one row per analysis, holding the
-- full jsonb result plus human-readable processing notes.
CREATE TABLE IF NOT EXISTS public.c77_mvc_table_fitness_audit (
    run_id          BIGSERIAL,                                              -- surrogate key for the analysis run
    run_timestamp   timestamp without time zone DEFAULT CURRENT_TIMESTAMP,  -- when the analysis ran
    source_schema   text COLLATE pg_catalog."default",                      -- schema of the analyzed table
    source_table    text COLLATE pg_catalog."default",                      -- name of the analyzed table
    analysis_result jsonb,                                                  -- full analysis payload
    notes           text[] COLLATE pg_catalog."default",                    -- processing notes accumulated during the run
    CONSTRAINT table_fitness_audit_pkey PRIMARY KEY (run_id)
) TABLESPACE pg_default;
|
|
|
|
-- Lookup index: find all audit rows for a given (schema, table) pair.
CREATE INDEX IF NOT EXISTS idx_table_fitness_audit_table
    ON public.c77_mvc_table_fitness_audit USING btree
       (source_schema COLLATE pg_catalog."default" ASC NULLS LAST,
        source_table  COLLATE pg_catalog."default" ASC NULLS LAST)
    TABLESPACE pg_default;
|
|
|
|
-- Time-range index: query audit history chronologically.
CREATE INDEX IF NOT EXISTS idx_table_fitness_audit_timestamp
    ON public.c77_mvc_table_fitness_audit USING btree
       (run_timestamp ASC NULLS LAST)
    TABLESPACE pg_default;
|
|
|
|
-- Define the functions in dependency order
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_analyze_column_combinations(temp_table_name text, column_stats jsonb, sample_size bigint, total_rows bigint, exclude_key_columns text[]) RETURNS jsonb
    LANGUAGE plpgsql
    AS $$
-- Evaluate the top 5 pairs of high-fitness columns (fitness_score >= 70 and
-- not listed in exclude_key_columns) as candidate partition keys.
-- For each pair, a TABLESAMPLE of temp_table_name is used to measure:
--   * uniqueness_ratio       - distinct (col1, col2) pairs / sample size
--   * synthetic_uniqueness   - share of rows that would be rn = 1 under
--                              ROW_NUMBER() PARTITION BY (col1, col2)
-- Returns jsonb keyed by 'col1,col2'; on any failure returns {'error': ...}.
DECLARE
    column_combinations jsonb := '{}';
    combination_result record;
    distinct_count bigint;
    uniqueness_ratio numeric;
    duplicate_count bigint;
    synthetic_uniqueness numeric;
    adjusted_sample_size bigint;
    sampling_percentage numeric;
BEGIN
    -- FIX: guard empty input. The ratio calculations below divide by
    -- total_rows / adjusted_sample_size; without this guard an empty table
    -- surfaced as an opaque division-by-zero wrapped in the generic error
    -- object. An empty table simply has no combinations to report.
    IF COALESCE(total_rows, 0) <= 0 OR COALESCE(sample_size, 0) <= 0 THEN
        RETURN column_combinations;
    END IF;

    -- Adjust sample size to not exceed total rows
    adjusted_sample_size := LEAST(sample_size, total_rows);
    sampling_percentage := (adjusted_sample_size::float / total_rows * 100);

    -- Analyze the 5 most promising column pairs (self-join on the stats,
    -- c1.key < c2.key avoids mirrored duplicates)
    FOR combination_result IN (
        SELECT c1.key AS col1, c2.key AS col2,
               ((c1.value->>'fitness_score')::float + (c2.value->>'fitness_score')::float) / 2 AS avg_fitness
        FROM jsonb_each(column_stats) c1,
             jsonb_each(column_stats) c2
        WHERE c1.key < c2.key
          AND (c1.value->>'fitness_score')::float >= 70
          AND (c2.value->>'fitness_score')::float >= 70
          AND NOT (c1.key = ANY(exclude_key_columns))
          AND NOT (c2.key = ANY(exclude_key_columns))
        ORDER BY avg_fitness DESC
        LIMIT 5
    )
    LOOP
        -- Test uniqueness of the combination on a sample
        EXECUTE format('SELECT COUNT(DISTINCT (%I, %I)) FROM (SELECT %I, %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t',
            combination_result.col1, combination_result.col2,
            combination_result.col1, combination_result.col2,
            temp_table_name,
            sampling_percentage::text, adjusted_sample_size)
        INTO distinct_count;
        uniqueness_ratio := distinct_count::float / adjusted_sample_size;

        -- Simulate synthetic key uniqueness: rows with rn > 1 are duplicates
        -- within a (col1, col2) partition.
        EXECUTE format('SELECT COUNT(*) FROM (
            SELECT ROW_NUMBER() OVER (PARTITION BY %I, %I ORDER BY random()) AS rn
            FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s
        ) t WHERE rn > 1',
            combination_result.col1, combination_result.col2,
            temp_table_name,
            sampling_percentage::text, adjusted_sample_size)
        INTO duplicate_count;
        synthetic_uniqueness := 1 - (duplicate_count::float / adjusted_sample_size);

        -- Store combination stats keyed by 'col1,col2'
        column_combinations := column_combinations || jsonb_build_object(
            format('%s,%s', combination_result.col1, combination_result.col2),
            jsonb_build_object(
                'uniqueness_ratio', uniqueness_ratio,
                'synthetic_uniqueness', synthetic_uniqueness,
                'discrimination_power', uniqueness_ratio,
                'avg_fitness_score', combination_result.avg_fitness
            )
        );
    END LOOP;

    RETURN column_combinations;
EXCEPTION WHEN OTHERS THEN
    RETURN jsonb_build_object(
        'error', format('Failed to analyze column combinations: %s', SQLERRM)
    );
END;
$$;
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_assemble_matv_result(target_schema text, target_mv_name text, partition_columns text[], order_by_columns text[], exclude_hash_columns text[], where_clause text, custom_sql text, notes text[]) RETURNS json
    LANGUAGE plpgsql
    AS $$
-- Assemble the final json summary returned after the matc_* object set has
-- been created. Companion view names are derived from the matview name by
-- prefix replacement (matc_ -> vtw_ / vm_).
DECLARE
    write_view_name text := replace(target_mv_name, 'matc_', 'vtw_');
    read_view_name  text := replace(target_mv_name, 'matc_', 'vm_');
BEGIN
    notes := array_append(notes, format('Process completed at %s', clock_timestamp()));

    RETURN json_build_object(
        'message', format('Created view %I.%I, materialized view %I.%I, and view %I.%I for reading.',
            target_schema, write_view_name, target_schema, target_mv_name, target_schema, read_view_name),
        'view_name', format('%I.%I', target_schema, write_view_name),
        'matview_name', format('%I.%I', target_schema, target_mv_name),
        'vm_view_name', format('%I.%I', target_schema, read_view_name),
        'partition_columns', partition_columns,
        'order_by_columns', order_by_columns,
        'exclude_hash_columns', exclude_hash_columns,
        'where_clause', where_clause,
        'custom_sql', custom_sql,
        'notes', notes
    );
EXCEPTION WHEN OTHERS THEN
    RETURN json_build_object(
        'error', format('Failed to assemble result: %s', SQLERRM),
        'notes', notes
    );
END;
$$;
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_assemble_result(source_schema text, source_table text, column_stats jsonb, column_combinations jsonb, order_by_candidates jsonb, data_quality_index numeric, notes text[], temp_table_name text) RETURNS jsonb
    LANGUAGE plpgsql
    AS $$
-- Build the final analysis json, persist it to the audit table, drop the
-- analysis temp table, and return the result (including the audit run_id).
-- On failure, returns {'error': ..., 'notes': ...}.
DECLARE
    result_json jsonb;
    -- FIX: declared bigint to match the BIGSERIAL run_id column (was integer,
    -- an overflow risk), and renamed so it no longer shadows the column name.
    audit_run_id bigint;
    updated_notes text[] := notes; -- local copy; the parameter stays untouched
BEGIN
    -- Build the result JSON
    updated_notes := array_append(updated_notes, format('Analysis completed at %s', clock_timestamp()));
    result_json := jsonb_build_object(
        'message', format('Analysis of %I.%I completed', source_schema, source_table),
        'column_stats', column_stats,
        'recommended_partition_combinations', column_combinations,
        'order_by_candidates', order_by_candidates,
        'data_quality_index', ROUND(data_quality_index, 2),
        'notes', updated_notes
    );

    -- Store results in audit table.
    -- BUG FIX: the original wrote "RETURNING table_fitness_audit.run_id",
    -- but the table is named c77_mvc_table_fitness_audit, so the INSERT
    -- always failed with a missing-relation reference. With the local
    -- variable renamed, the unqualified column reference is unambiguous.
    INSERT INTO public.c77_mvc_table_fitness_audit (
        source_schema,
        source_table,
        analysis_result,
        notes
    )
    VALUES (
        source_schema,
        source_table,
        result_json,
        updated_notes
    )
    RETURNING run_id INTO audit_run_id;

    -- Add run_id to the result
    result_json := result_json || jsonb_build_object('run_id', audit_run_id);

    -- Clean up temporary table
    EXECUTE format('DROP TABLE IF EXISTS %I', temp_table_name);
    updated_notes := array_append(updated_notes, format('Dropped temporary table %s', temp_table_name));

    -- Update result_json with the final notes
    result_json := result_json || jsonb_build_object('notes', updated_notes);

    RETURN result_json;
EXCEPTION WHEN OTHERS THEN
    RAISE NOTICE 'Error assembling result: %', SQLERRM;
    RETURN jsonb_build_object(
        'error', format('Failed to assemble result: %s', SQLERRM),
        'notes', updated_notes
    );
END;
$$;
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_calculate_dqi(column_stats jsonb) RETURNS numeric
    LANGUAGE plpgsql
    AS $$
-- Data Quality Index: per-column score weighted 40% non-null, 40% clean
-- encoding, 20% uniqueness, averaged over all columns and scaled to 0-100.
-- Columns whose stats are missing (NULL ratios) drop out of the average,
-- since AVG ignores NULL inputs. Returns 0 on error or empty input.
DECLARE
    dqi_components jsonb := '{}';
    col_name text;
    null_ratio numeric;
    encoding_issue_ratio numeric;
    uniqueness_ratio numeric;
    component_score numeric;
BEGIN
    -- Calculate DQI components for each column
    FOR col_name IN
        SELECT key
        FROM jsonb_object_keys(column_stats) AS key
    LOOP
        null_ratio := (column_stats->col_name->>'null_ratio')::numeric;
        encoding_issue_ratio := (column_stats->col_name->>'encoding_issue_ratio')::numeric;
        uniqueness_ratio := (column_stats->col_name->>'uniqueness_ratio')::numeric;

        component_score := (1 - null_ratio) * 0.4 + (1 - encoding_issue_ratio) * 0.4 + uniqueness_ratio * 0.2;
        dqi_components := dqi_components || jsonb_build_object(col_name, component_score);
    END LOOP;

    -- Average DQI across all columns (scaled to 0-100).
    -- FIX: COALESCE to 0 so empty column_stats returns 0 rather than NULL,
    -- consistent with the error path below.
    RETURN COALESCE(
        (SELECT AVG(value::numeric) * 100
         FROM jsonb_each_text(dqi_components)),
        0);
EXCEPTION WHEN OTHERS THEN
    RAISE NOTICE 'Error calculating DQI: %', SQLERRM;
    RETURN 0;
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_calculate_matv_sample_size(full_matview_name text, params jsonb) RETURNS bigint
    LANGUAGE plpgsql
    AS $$
-- Compute a statistically sound sample size for a materialized view by
-- counting its rows and delegating to c77_mvc_calculate_sample_size.
-- NOTE(review): the params argument is currently unused; it is kept for
-- interface compatibility with existing callers.
DECLARE
    row_count bigint;
BEGIN
    -- Count the matview's rows (name is dynamic, so EXECUTE is required)
    EXECUTE format('SELECT COUNT(*) FROM %s', full_matview_name)
        INTO row_count;

    -- Delegate to the shared Cochran-formula helper
    RETURN public.c77_mvc_calculate_sample_size(row_count);
EXCEPTION WHEN OTHERS THEN
    RAISE NOTICE 'Error calculating sample size: %', SQLERRM;
    RETURN 100;
END;
$$;
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_calculate_sample_size(total_rows bigint, confidence_level numeric DEFAULT 0.99, margin_of_error numeric DEFAULT 0.03) RETURNS bigint
    LANGUAGE plpgsql
    AS $$
-- Cochran's sample-size formula with finite-population correction.
-- Result is clamped to at least 1000 (small-table floor) and at most
-- total_rows.
DECLARE
    z numeric;                  -- Z-score for the requested confidence level
    infinite_pop_sample numeric;
    p CONSTANT numeric := 0.5;  -- maximum-variability (most conservative) proportion
    result_size bigint;
BEGIN
    -- Map confidence level to Z-score (unrecognized levels fall back to 99%)
    z := CASE confidence_level
             WHEN 0.90 THEN 1.645
             WHEN 0.95 THEN 1.96
             WHEN 0.99 THEN 2.576
             ELSE 2.576
         END;

    -- Sample size for an infinite population: n0 = z^2 * p(1-p) / e^2
    infinite_pop_sample := (z * z * p * (1 - p)) / (margin_of_error * margin_of_error);

    -- Finite-population correction, then clamp to [1000, total_rows]
    result_size := CEIL(infinite_pop_sample * total_rows / (infinite_pop_sample + total_rows));
    result_size := GREATEST(result_size, 1000);
    result_size := LEAST(result_size, total_rows);

    RETURN result_size;
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_collect_matv_stats(full_matview_name text, full_vtw_name text) RETURNS jsonb
    LANGUAGE plpgsql
    AS $$
-- Collect health statistics for a matc_* materialized view and its vtw_*
-- writer view: row counts, clean vs encoding-issue rows, last update
-- timestamps (MAX(rowlastupdated) -- both relations are expected to carry
-- that column), and on-disk size in MB. Returns zeroed stats on any failure.
DECLARE
    total_matview_records bigint;
    clean_records bigint;
    encoding_issues bigint;
    total_vtw_records bigint;
    last_matview_update timestamp with time zone;
    last_vtw_update timestamp with time zone;
    size_mb numeric;
BEGIN
    -- Row counts split by encoding status (single scan via FILTER)
    EXECUTE format('SELECT COUNT(*), COUNT(*) FILTER (WHERE encoding_status = ''CLEAN''), COUNT(*) FILTER (WHERE encoding_status IS DISTINCT FROM ''CLEAN'') FROM %s', full_matview_name)
        INTO total_matview_records, clean_records, encoding_issues;

    EXECUTE format('SELECT COUNT(*) FROM %s', full_vtw_name)
        INTO total_vtw_records;

    EXECUTE format('SELECT MAX(rowlastupdated) FROM %s', full_matview_name)
        INTO last_matview_update;

    EXECUTE format('SELECT MAX(rowlastupdated) FROM %s', full_vtw_name)
        INTO last_vtw_update;

    -- BUG FIX: the original stripped the digits out of pg_size_pretty()
    -- output, silently discarding the unit -- "3568 kB" was reported as
    -- 3568 "MB". Compute megabytes directly from the byte count instead.
    size_mb := round(pg_total_relation_size(full_matview_name::regclass)::numeric / (1024 * 1024), 2);

    RETURN jsonb_build_object(
        'total_matview_records', total_matview_records,
        'clean_records', clean_records,
        'encoding_issues', encoding_issues,
        'total_vtw_records', total_vtw_records,
        'last_matview_update', last_matview_update,
        'last_vtw_update', last_vtw_update,
        'size_mb', size_mb
    );
EXCEPTION WHEN OTHERS THEN
    RAISE NOTICE 'Error fetching stats: %', SQLERRM;
    RETURN jsonb_build_object(
        'total_matview_records', 0,
        'clean_records', 0,
        'encoding_issues', 0,
        'total_vtw_records', 0,
        'last_matview_update', NULL,
        'last_vtw_update', NULL,
        'size_mb', 0
    );
END;
$$;
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_create_indexes(target_schema text, target_mv_name text, partition_columns text[]) RETURNS text[]
    LANGUAGE plpgsql
    AS $$
-- Create the supporting indexes on a matc_* materialized view:
--   * btree on encoding_status
--   * btree on content_hash
--   * UNIQUE btree on (synthetic_key, <partition columns>), falling back to a
--     non-unique index if uniqueness cannot be established.
-- Returns the accumulated notes describing what was created.
DECLARE
    notes text[] := '{}';
    encoding_index_name text;
    content_hash_index_name text;
    partition_cols_sql text;  -- comma-separated, quoted partition column list
    index_sql text;           -- dynamic CREATE INDEX statement
BEGIN
    -- Index on encoding_status
    encoding_index_name := 'idx_' || target_mv_name || '_encoding_status';
    EXECUTE format('CREATE INDEX %I ON %I.%I (encoding_status)',
        encoding_index_name, target_schema, target_mv_name);
    -- FIX: record this index in notes like the others (was silently omitted)
    notes := array_append(notes, 'Created index on encoding_status');

    -- Index on content_hash
    content_hash_index_name := 'idx_' || target_mv_name || '_content_hash';
    EXECUTE format('CREATE INDEX %I ON %I.%I (content_hash)',
        content_hash_index_name, target_schema, target_mv_name);
    notes := array_append(notes, 'Created index on content_hash');

    -- Unique index on synthetic_key and partition columns.
    -- (The original reused one variable for both the column list and the full
    -- SQL text; split for clarity.)
    SELECT string_agg(quote_ident(unnest), ', ')
      INTO partition_cols_sql
      FROM unnest(partition_columns);

    index_sql := format(
        'CREATE UNIQUE INDEX %I ON %I.%I (synthetic_key, %s)',
        'idx_' || target_mv_name || '_synthetic_key', target_schema, target_mv_name,
        COALESCE(partition_cols_sql, '1')
    );

    BEGIN
        EXECUTE index_sql;
        notes := array_append(notes, 'Successfully created unique index on synthetic_key and partition columns');
    EXCEPTION WHEN OTHERS THEN
        -- BUG FIX: the original dropped the materialized view (and its
        -- companion views) here and then tried to create the fallback index
        -- on the just-dropped matview, which always failed -- the fallback
        -- path was dead code. Keep the objects and create the non-unique
        -- fallback index instead.
        notes := array_append(notes, format('Failed to create unique index: %s', SQLERRM));

        index_sql := format(
            'CREATE INDEX %I ON %I.%I (synthetic_key, %s)',
            'idx_' || target_mv_name || '_synthetic_key_fallback', target_schema, target_mv_name,
            COALESCE(partition_cols_sql, '1')
        );

        EXECUTE index_sql;
        notes := array_append(notes, 'Created non-unique fallback index due to unique index failure');
    END;

    RETURN notes;
EXCEPTION WHEN OTHERS THEN
    RETURN array_append(notes, format('Error creating indexes: %s', SQLERRM));
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_create_optimized_matv(source_schema text, source_table text, target_schema text, target_matview text, partition_columns text[], order_by_columns text[], exclude_columns_from_hash text[] DEFAULT ARRAY[]::text[], filter_latest_only boolean DEFAULT false) RETURNS jsonb
    LANGUAGE plpgsql
    AS $$
-- Create the full "optimized materialized view" object set for a source table:
--   vtw_*   writer view over the source, adding content_hash (md5 of selected
--           columns), synthetic_key (row_number per partition, ordered by
--           order_by_columns) and encoding_status ('CLEAN'/'ENCODING_ISSUE'
--           based on a non-ASCII regex over all columns);
--   matc_*  materialized view snapshotting vtw_*, plus supporting indexes;
--   vm_*    reader view over matc_* restricted to encoding_status = 'CLEAN';
--   vprob_* reader view over matc_* restricted to rows with encoding issues.
-- When filter_latest_only is true, vtw_* keeps only synthetic_key = 1 (the
-- first row per partition under the given ordering).
-- Returns jsonb {'notes': [...]} on success, {'error': ..., 'notes': ...} on
-- failure.
-- NOTE(review): target_matview is assumed to follow the 'matc_' naming
-- convention -- companion names are derived by prefix replacement; confirm
-- with callers.
DECLARE
    source_full_name text := quote_ident(source_schema) || '.' || quote_ident(source_table);
    vtw_name text := replace(target_matview, 'matc_', 'vtw_');
    vm_name text := replace(target_matview, 'matc_', 'vm_');
    vprob_name text := replace(target_matview, 'matc_', 'vprob_');
    vtw_full_name text := quote_ident(target_schema) || '.' || quote_ident(vtw_name);
    vm_full_name text := quote_ident(target_schema) || '.' || quote_ident(vm_name);
    vprob_full_name text := quote_ident(target_schema) || '.' || quote_ident(vprob_name);
    matview_full_name text := quote_ident(target_schema) || '.' || quote_ident(target_matview);
    columns_list text;        -- select list for vtw_ (character columns cleansed)
    vm_columns_list text;     -- plain select list reused by vm_ and vprob_
    hash_columns_list text;   -- columns feeding the content_hash md5
    encoding_check_list text; -- OR-ed non-ASCII checks driving encoding_status
    partition_clause text := '';
    order_by_clause text := '';
    create_vtw_sql text;
    create_matview_sql text;
    create_vm_sql text;
    create_vprob_sql text;
    create_index_sql text;
    notes text[] := '{}';
    column_record record;
BEGIN
    -- Step 1: Get the list of columns with regexp_replace for non-partition character-based columns (for vtw_)
    columns_list := '';
    vm_columns_list := '';
    FOR column_record IN (
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema = source_schema
        AND table_name = source_table
        AND column_name NOT IN ('content_hash', 'synthetic_key')
        ORDER BY column_name
    ) LOOP
        -- For vtw_: Apply regexp_replace to character-based non-partition columns
        -- (partition columns are left untouched so partitioning stays stable)
        IF column_record.column_name = ANY(partition_columns) THEN
            columns_list := columns_list || quote_ident(column_record.column_name) || ', ';
        ELSIF column_record.data_type IN ('character', 'character varying', 'varchar', 'char', 'text') THEN
            columns_list := columns_list || format('regexp_replace(t.%I, ''[^\x00-\x7F]''::text, ''PROBLEM''::text, ''g''::text) AS %I, ',
                column_record.column_name, column_record.column_name);
        ELSE
            columns_list := columns_list || quote_ident(column_record.column_name) || ', ';
        END IF;

        -- For vm_ and vprob_: Just the column names, no regexp_replace or t. prefix
        vm_columns_list := vm_columns_list || quote_ident(column_record.column_name) || ', ';
    END LOOP;
    -- Strip the trailing ', ' left by the loop
    columns_list := rtrim(columns_list, ', ');
    vm_columns_list := rtrim(vm_columns_list, ', ');

    -- Step 2: Validate partition_columns
    IF array_length(partition_columns, 1) IS NULL OR array_length(partition_columns, 1) = 0 THEN
        RAISE EXCEPTION 'partition_columns cannot be empty. At least one column is required for partitioning to ensure proper deduplication.';
    END IF;

    -- Step 3: Validate order_by_columns
    IF array_length(order_by_columns, 1) IS NULL OR array_length(order_by_columns, 1) = 0 THEN
        RAISE EXCEPTION 'order_by_columns cannot be empty. At least one column is required to ensure deterministic ordering for synthetic_key generation.';
    END IF;

    -- Step 4: Get the list of columns for the content_hash
    IF exclude_columns_from_hash IS NULL OR array_length(exclude_columns_from_hash, 1) IS NULL OR array_length(exclude_columns_from_hash, 1) = 0 THEN
        -- If exclude_columns_from_hash is empty, include all columns
        SELECT string_agg('t.' || quote_ident(column_name), ', ')
        INTO hash_columns_list
        FROM information_schema.columns
        WHERE table_schema = source_schema
        AND table_name = source_table;
        notes := array_append(notes, 'exclude_columns_from_hash is empty; including all columns from the source table in content_hash calculation');
    ELSE
        -- Otherwise, exclude the specified columns
        SELECT string_agg('t.' || quote_ident(column_name), ', ')
        INTO hash_columns_list
        FROM information_schema.columns
        WHERE table_schema = source_schema
        AND table_name = source_table
        AND column_name NOT IN (
            SELECT unnest(exclude_columns_from_hash)
        );

        -- If excluding the specified columns results in no columns, include all columns as a fallback
        IF hash_columns_list IS NULL THEN
            SELECT string_agg('t.' || quote_ident(column_name), ', ')
            INTO hash_columns_list
            FROM information_schema.columns
            WHERE table_schema = source_schema
            AND table_name = source_table;
            notes := array_append(notes, 'exclude_columns_from_hash excluded all columns; including all columns from the source table in content_hash calculation as a fallback');
        END IF;
    END IF;

    -- Step 5: Get the list of columns for encoding_status check
    -- (every column is cast to text and scanned for non-ASCII characters)
    SELECT string_agg(format('t.%I::text ~ ''[^\x00-\x7F]''::text', column_name), ' OR ')
    INTO encoding_check_list
    FROM information_schema.columns
    WHERE table_schema = source_schema
    AND table_name = source_table;

    -- Step 6: Build partition and order-by clauses for synthetic_key
    IF array_length(partition_columns, 1) > 0 THEN
        partition_clause := 'PARTITION BY ' || array_to_string(partition_columns, ', ');
    END IF;

    IF array_length(order_by_columns, 1) > 0 THEN
        order_by_clause := 'ORDER BY ' || array_to_string(order_by_columns, ', ');
    END IF;

    -- Step 7: Create the vtw_ view with content_hash, synthetic_key, and encoding_status
    IF filter_latest_only THEN
        -- Keep only the first row per partition (synthetic_key = 1)
        create_vtw_sql := format('
            CREATE OR REPLACE VIEW %s AS
            SELECT *
            FROM (
                SELECT md5(CAST(ROW(%s) AS text)) AS content_hash,
                       %s,
                       (row_number() OVER (%s %s))::bigint AS synthetic_key,
                       CASE
                           WHEN %s THEN ''ENCODING_ISSUE''::text
                           ELSE ''CLEAN''::text
                       END AS encoding_status
                FROM %s t
            ) sub
            WHERE synthetic_key = 1',
            vtw_full_name,
            hash_columns_list,
            columns_list,
            partition_clause,
            order_by_clause,
            encoding_check_list,
            source_full_name
        );
    ELSE
        create_vtw_sql := format('
            CREATE OR REPLACE VIEW %s AS
            SELECT md5(CAST(ROW(%s) AS text)) AS content_hash,
                   %s,
                   (row_number() OVER (%s %s))::bigint AS synthetic_key,
                   CASE
                       WHEN %s THEN ''ENCODING_ISSUE''::text
                       ELSE ''CLEAN''::text
                   END AS encoding_status
            FROM %s t',
            vtw_full_name,
            hash_columns_list,
            columns_list,
            partition_clause,
            order_by_clause,
            encoding_check_list,
            source_full_name
        );
    END IF;

    EXECUTE create_vtw_sql;
    notes := array_append(notes, format('Created view %s', vtw_full_name));

    -- Step 8: Create the matc_ materialized view as a direct copy of vtw_
    create_matview_sql := format('
        CREATE MATERIALIZED VIEW IF NOT EXISTS %s AS
        SELECT *
        FROM %s',
        matview_full_name,
        vtw_full_name
    );

    EXECUTE create_matview_sql;
    notes := array_append(notes, format('Created materialized view %s', matview_full_name));

    -- Step 9: Add indexes on matc_
    -- Index on encoding_status
    create_index_sql := format('
        CREATE INDEX IF NOT EXISTS %I ON %s (encoding_status)',
        target_matview || '_encoding_status_idx',
        matview_full_name
    );
    EXECUTE create_index_sql;
    notes := array_append(notes, format('Created index %s on encoding_status', target_matview || '_encoding_status_idx'));

    -- Index on content_hash
    create_index_sql := format('
        CREATE INDEX IF NOT EXISTS %I ON %s (content_hash)',
        target_matview || '_content_hash_idx',
        matview_full_name
    );
    EXECUTE create_index_sql;
    notes := array_append(notes, format('Created index %s on content_hash', target_matview || '_content_hash_idx'));

    -- Unique index on (synthetic_key, partition_columns)
    -- (failure is tolerated and recorded in notes rather than aborting)
    IF array_length(partition_columns, 1) > 0 THEN
        create_index_sql := format('
            CREATE UNIQUE INDEX IF NOT EXISTS %I ON %s (synthetic_key, %s)',
            target_matview || '_unique_key_idx',
            matview_full_name,
            array_to_string(partition_columns, ', ')
        );
        BEGIN
            EXECUTE create_index_sql;
            notes := array_append(notes, format('Created unique index %s on (synthetic_key, %s)', target_matview || '_unique_key_idx', array_to_string(partition_columns, ', ')));
        EXCEPTION WHEN unique_violation THEN
            notes := array_append(notes, format('Unexpected failure to create unique index %s on (synthetic_key, %s) due to duplicate values in %s. This should not happen due to synthetic_key generation. Check the synthetic_key logic in %s and look for duplicates using: SELECT synthetic_key, %s, count(*) FROM %s GROUP BY synthetic_key, %s HAVING count(*) > 1;',
                target_matview || '_unique_key_idx',
                array_to_string(partition_columns, ', '),
                matview_full_name,
                vtw_full_name,
                array_to_string(partition_columns, ', '),
                matview_full_name,
                array_to_string(partition_columns, ', ')));
        END;
    END IF;

    -- Step 10: Create the vm_ view, excluding content_hash, synthetic_key, and encoding_status, with WHERE encoding_status = 'CLEAN'
    create_vm_sql := format('
        CREATE OR REPLACE VIEW %s AS
        SELECT %s
        FROM %s
        WHERE encoding_status = ''CLEAN''',
        vm_full_name,
        vm_columns_list,
        matview_full_name
    );

    EXECUTE create_vm_sql;
    notes := array_append(notes, format('Created view %s', vm_full_name));

    -- Step 11: Create the vprob_ view, excluding content_hash, synthetic_key, and encoding_status, with WHERE encoding_status != 'CLEAN'
    create_vprob_sql := format('
        CREATE OR REPLACE VIEW %s AS
        SELECT %s
        FROM %s
        WHERE encoding_status != ''CLEAN''',
        vprob_full_name,
        vm_columns_list,
        matview_full_name
    );

    EXECUTE create_vprob_sql;
    notes := array_append(notes, format('Created view %s', vprob_full_name));

    RETURN jsonb_build_object('notes', notes);
EXCEPTION WHEN OTHERS THEN
    RETURN jsonb_build_object('error', SQLERRM, 'notes', notes);
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_create_temp_table(source_schema text, source_table text) RETURNS text
    LANGUAGE plpgsql
    AS $$
-- Snapshot source_schema.source_table into a session temp table named
-- temp_<table>_<timestamp> and return that name. Raises a descriptive
-- exception when the source table cannot be found.
DECLARE
    temp_table_name text := 'temp_' || source_table || '_' || to_char(current_timestamp, 'YYYYMMDDHH24MISS');
    column_defs text;
BEGIN
    -- Step 1: Generate column definitions from source table
    SELECT string_agg(
        format('%I %s', column_name, data_type),
        ', '
    ) INTO column_defs
    FROM information_schema.columns
    WHERE table_schema = source_schema
    AND table_name = source_table
    AND column_name IS NOT NULL
    AND TRIM(column_name) != '';

    -- FIX: without this guard a missing source table left column_defs NULL,
    -- format() then returned NULL, and EXECUTE NULL raised an opaque
    -- "query string argument ... is null" error.
    IF column_defs IS NULL THEN
        RAISE EXCEPTION 'Source table %.% does not exist or has no columns', source_schema, source_table;
    END IF;

    -- Step 2: Create temp table with column definitions
    EXECUTE format('CREATE TEMP TABLE %I (%s)', temp_table_name, column_defs);

    -- Step 3: Insert data from source table
    EXECUTE format('INSERT INTO %I SELECT * FROM %I.%I', temp_table_name, source_schema, source_table);

    RETURN temp_table_name;
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_generate_column_lists(source_schema text, source_table text) RETURNS jsonb
    LANGUAGE plpgsql
    AS $$
-- Build the SQL fragments needed to construct a cleansing view:
--   columns_list     - character columns wrapped in regexp_replace that
--                      rewrites non-ASCII characters to 'PROBLEM'
--   non_char_columns - all other columns, passed through untouched
--   case_conditions  - OR-ed non-ASCII regex tests driving encoding status
--                      ('FALSE' when the table has no character columns)
DECLARE
    columns_list text;
    non_char_columns text;
    case_conditions text;
BEGIN
    -- Single catalog pass: classify each column once, then aggregate the
    -- three fragments with FILTER (the original issued three separate scans).
    SELECT
        string_agg(
            'regexp_replace(' || quote_ident(column_name) || ', ''[^\x00-\x7F]'', ''PROBLEM'', ''g'') as ' || quote_ident(column_name),
            ', '
        ) FILTER (WHERE is_char_type),
        string_agg(quote_ident(column_name), ', ') FILTER (WHERE NOT is_char_type),
        string_agg(quote_ident(column_name) || ' ~ ''[^\x00-\x7F]''', ' OR ') FILTER (WHERE is_char_type)
    INTO columns_list, non_char_columns, case_conditions
    FROM (
        SELECT column_name,
               data_type IN ('character varying', 'character', 'text', 'varchar', 'char') AS is_char_type
        FROM information_schema.columns
        WHERE table_schema = source_schema
          AND table_name = source_table
    ) cols;

    RETURN jsonb_build_object(
        'columns_list', COALESCE(columns_list, ''),
        'non_char_columns', COALESCE(non_char_columns, ''),
        'case_conditions', COALESCE(case_conditions, 'FALSE')
    );
EXCEPTION WHEN OTHERS THEN
    RAISE NOTICE 'Error generating column lists: %', SQLERRM;
    RETURN jsonb_build_object(
        'error', format('Failed to generate column lists: %s', SQLERRM)
    );
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_generate_synthetic_key_and_hash(partition_columns text[], order_by_columns text[], exclude_hash_columns text[], all_columns text[]) RETURNS jsonb
    LANGUAGE plpgsql
    AS $$
-- Produce two SQL select-list fragments:
--   synthetic_key_sql - ROW_NUMBER() windowed by partition_columns, ordered
--                       by order_by_columns parsed as timestamps (the source
--                       stores them as text in 'YYYY-MM-DD HH24:MI:SS')
--   content_hash_sql  - md5 of row_to_json over all_columns minus
--                       exclude_hash_columns
-- On failure returns {'error': ...}.
DECLARE
    partition_sql text;
    order_by_clause text;
    synthetic_key_sql text;
    content_hash_sql text;
    hashable_columns text[];
    datetime_format CONSTANT text := 'YYYY-MM-DD HH24:MI:SS'; -- Hardcoded
BEGIN
    -- Partition clause (falls back to a constant when no columns are given)
    SELECT string_agg(quote_ident(col), ', ')
      INTO partition_sql
      FROM unnest(partition_columns) AS col;
    partition_sql := format('PARTITION BY %s', COALESCE(partition_sql, '1'));

    -- Order-by clause: newest first, NULLs and empty strings last
    SELECT string_agg(
               format('TO_TIMESTAMP(SUBSTRING(NULLIF(%I, ''''), 1, 19), %L) DESC NULLS LAST', col, datetime_format),
               ', '
           )
      INTO order_by_clause
      FROM unnest(order_by_columns) AS col;

    synthetic_key_sql := format(
        'ROW_NUMBER() OVER (%s ORDER BY %s) AS synthetic_key',
        partition_sql, COALESCE(order_by_clause, '1')
    );

    -- Content hash: set difference removes the excluded columns
    hashable_columns := ARRAY(
        SELECT unnest(all_columns)
        EXCEPT
        SELECT unnest(exclude_hash_columns)
    );

    SELECT string_agg(quote_ident(col), ', ')
      INTO content_hash_sql
      FROM unnest(hashable_columns) AS col;

    content_hash_sql := format('md5(CAST(row_to_json(ROW(%s)) AS text)) AS content_hash', COALESCE(content_hash_sql, '1'));

    RETURN jsonb_build_object(
        'synthetic_key_sql', synthetic_key_sql,
        'content_hash_sql', content_hash_sql
    );
EXCEPTION WHEN OTHERS THEN
    RETURN jsonb_build_object(
        'error', format('Failed to generate synthetic key and hash: %s', SQLERRM)
    );
END;
$$;
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_identify_order_by_candidates(temp_table_name text, column_stats jsonb) RETURNS jsonb
    LANGUAGE plpgsql
    AS $$
-- Scan the session temp table's columns for viable ORDER BY candidates:
-- native timestamp columns, plus text columns that parse as
-- 'YYYY-MM-DD HH24:MI:SS' timestamps. Columns with > 50% nulls are skipped.
-- Each candidate gets a fitness_score of (1 - null_ratio) * 100.
DECLARE
    order_by_candidates jsonb := '{}';
    col_name text;
    column_type text;
    null_ratio numeric;
    notes text[] := '{}';
BEGIN
    FOR col_name, column_type IN
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema LIKE 'pg_temp%'
          AND table_name = temp_table_name
          AND column_name IS NOT NULL
          AND TRIM(column_name) != ''
    LOOP
        null_ratio := (column_stats->col_name->>'null_ratio')::numeric;

        -- Guard clause: too many nulls make ordering unreliable
        IF null_ratio > 0.5 THEN
            notes := array_append(notes, format('Skipped %I as order-by candidate due to high null ratio: %s', col_name, null_ratio));
            CONTINUE;
        END IF;

        -- Guard clause: only timestamp-like and text columns qualify
        IF column_type NOT IN ('timestamp', 'timestamp with time zone', 'timestamp without time zone', 'text') THEN
            CONTINUE;
        END IF;

        IF column_type = 'text' THEN
            -- Probe one row: if TO_TIMESTAMP fails, the column is rejected
            BEGIN
                EXECUTE format('SELECT TO_TIMESTAMP(SUBSTRING(NULLIF(%I, ''''), 1, 19), %L) FROM %I LIMIT 1',
                    col_name, 'YYYY-MM-DD HH24:MI:SS', temp_table_name);
                order_by_candidates := order_by_candidates || jsonb_build_object(
                    col_name, jsonb_build_object(
                        'fitness_score', (1 - null_ratio) * 100,
                        'note', 'Text column parseable as timestamp'
                    )
                );
            EXCEPTION WHEN OTHERS THEN
                notes := array_append(notes, format('%I is text but not parseable as timestamp: %s', col_name, SQLERRM));
            END;
        ELSE
            order_by_candidates := order_by_candidates || jsonb_build_object(
                col_name, jsonb_build_object(
                    'fitness_score', (1 - null_ratio) * 100,
                    'note', 'Native timestamp column'
                )
            );
        END IF;
    END LOOP;

    RETURN order_by_candidates;
EXCEPTION WHEN OTHERS THEN
    RETURN jsonb_build_object(
        'error', format('Failed to identify order-by candidates: %s', SQLERRM),
        'notes', notes
    );
END;
$$;
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_perform_matv_action(full_matview_name text, schema_name text, matview_name text, action text, mismatched_records bigint, total_matview_records bigint, time_diff interval, mismatch_threshold numeric, time_threshold interval, encoding_issues bigint) RETURNS jsonb
    LANGUAGE plpgsql
AS $$
-- Performs a maintenance action on a materialized view and reports the outcome.
--
-- Actions:
--   'refresh' - refreshes the matview, but only when the mismatch percentage
--               exceeds mismatch_threshold OR time_diff >= time_threshold.
--               Uses CONCURRENTLY when a unique index exists (required by
--               PostgreSQL for concurrent refresh), plain refresh otherwise.
--   'repair'  - only when encoding_issues > 0: drops non-pkey indexes and
--               PK/UNIQUE constraints, recreates the standard content_hash /
--               synthetic_key indexes (when those columns exist), and ANALYZEs.
--   'reindex' - unconditionally reindexes the matview.
--
-- Returns jsonb: {action_performed boolean, action_result text}. Errors are
-- trapped and reported in action_result with action_performed = false.
--
-- full_matview_name is interpolated with %s and is therefore expected to be
-- pre-quoted by the caller (see c77_mvc_validate_matv_inputs).
DECLARE
    action_performed boolean := false;
    action_result text;
    has_unique_index boolean;
    index_rec record;
    constraint_rec record;
BEGIN
    -- Check if the materialized view has a unique index (needed to decide
    -- whether REFRESH ... CONCURRENTLY is possible).
    -- NOTE(review): CONCURRENTLY additionally requires the unique index to be
    -- non-partial and on column names only; this check does not verify that.
    SELECT EXISTS (
        SELECT 1
        FROM pg_index i
        JOIN pg_class c ON c.oid = i.indrelid
        JOIN pg_namespace n ON n.oid = c.relnamespace
        WHERE n.nspname = schema_name
          AND c.relname = matview_name
          AND i.indisunique = true
    ) INTO has_unique_index;

    IF action = 'refresh' AND (
        (mismatched_records::NUMERIC / NULLIF(total_matview_records, 0)::NUMERIC) * 100 > mismatch_threshold
        OR time_diff >= time_threshold
    ) THEN
        IF has_unique_index THEN
            EXECUTE format('REFRESH MATERIALIZED VIEW CONCURRENTLY %s', full_matview_name);
            action_result := 'Refreshed successfully (concurrently)';
        ELSE
            EXECUTE format('REFRESH MATERIALIZED VIEW %s', full_matview_name);
            action_result := 'Refreshed successfully (non-concurrently: no unique index found)';
            RAISE NOTICE 'No unique index found for %, using non-concurrent refresh', full_matview_name;
        END IF;
        action_performed := true;
    ELSIF action = 'repair' AND encoding_issues > 0 THEN
        -- Drop existing indexes, keeping anything named *_pkey.
        -- Fix: '_' is a LIKE wildcard, so the original pattern '%_pkey' also
        -- matched names like 'foo2pkey'; escape it to match a literal '_'.
        FOR index_rec IN (
            SELECT indexname
            FROM pg_indexes
            WHERE schemaname = schema_name AND tablename = matview_name
              AND indexname NOT LIKE '%\_pkey'
        ) LOOP
            EXECUTE format('DROP INDEX IF EXISTS %I.%I', schema_name, index_rec.indexname);
        END LOOP;

        -- Drop primary key or unique constraints.
        -- NOTE(review): materialized views cannot carry table constraints, so
        -- this loop is expected to find none; kept for safety on odd catalogs.
        FOR constraint_rec IN (
            SELECT conname
            FROM pg_constraint
            WHERE conrelid = (SELECT oid FROM pg_class WHERE relname = matview_name AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = schema_name))
              AND contype IN ('p', 'u')
        ) LOOP
            EXECUTE format('ALTER TABLE %I.%I DROP CONSTRAINT %I', schema_name, matview_name, constraint_rec.conname);
        END LOOP;

        -- Recreate standard indexes for the conventional helper columns.
        IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = schema_name AND table_name = matview_name AND column_name = 'content_hash') THEN
            EXECUTE format('CREATE INDEX IF NOT EXISTS %I ON %I.%I (content_hash)', 'idx_' || matview_name || '_content_hash', schema_name, matview_name);
        END IF;
        IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = schema_name AND table_name = matview_name AND column_name = 'synthetic_key') THEN
            EXECUTE format('CREATE UNIQUE INDEX IF NOT EXISTS %I ON %I.%I (synthetic_key)', 'idx_' || matview_name || '_synthetic_key', schema_name, matview_name);
        END IF;

        -- Refresh planner statistics after rebuilding indexes.
        EXECUTE format('ANALYZE %I.%I', schema_name, matview_name);

        action_result := 'Repaired successfully: indexes and keys rebuilt';
        action_performed := true;
    ELSIF action = 'reindex' THEN
        EXECUTE format('REINDEX TABLE %s', full_matview_name);
        action_result := 'Reindexed successfully';
        action_performed := true;
    ELSE
        action_result := 'Action skipped: threshold not met or invalid action';
    END IF;

    RETURN jsonb_build_object(
        'action_performed', action_performed,
        'action_result', action_result
    );
EXCEPTION WHEN OTHERS THEN
    -- Report failures in the result payload instead of raising.
    action_result := format('Action failed: %s', SQLERRM);
    RAISE NOTICE 'Action exception: %', action_result;
    RETURN jsonb_build_object(
        'action_performed', false,
        'action_result', action_result
    );
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_set_validation_params(validation_type text) RETURNS jsonb
    LANGUAGE plpgsql
AS $$
-- Maps a validation type onto its tuning knobs:
--   'quick' - 0.1% sample, 0.1% mismatch tolerance, 3-day staleness window
--   'daily' - 1% sample, 0.05% mismatch tolerance, 1-day staleness window
--   'full'  - 100% sample, 0.01% mismatch tolerance, 12-hour staleness window
-- Any other value falls back to the 'quick' settings.
--
-- Returns jsonb: {params jsonb, mismatch_threshold numeric, time_threshold interval}.
DECLARE
    -- Normalize unknown types to 'quick' once, so each setting below needs no
    -- separate ELSE branch.
    effective_type text := CASE
        WHEN validation_type IN ('quick', 'daily', 'full') THEN validation_type
        ELSE 'quick'
    END;
    sampling_params jsonb;
    allowed_mismatch_pct numeric;
    max_refresh_age interval;
BEGIN
    CASE effective_type
        WHEN 'quick' THEN
            sampling_params := '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb;
            allowed_mismatch_pct := 0.1;             -- 0.1% mismatch for quick
            max_refresh_age := interval '3 days';    -- 3 days for quick
        WHEN 'daily' THEN
            sampling_params := '{"sample_percent": 1.0, "confidence": 0.99, "margin": 0.01}'::jsonb;
            allowed_mismatch_pct := 0.05;            -- 0.05% mismatch for daily
            max_refresh_age := interval '1 day';     -- 1 day for daily
        WHEN 'full' THEN
            sampling_params := '{"sample_percent": 100.0, "confidence": 0.99, "margin": 0.005}'::jsonb;
            allowed_mismatch_pct := 0.01;            -- 0.01% mismatch for full
            max_refresh_age := interval '12 hours';  -- 12 hours for full
    END CASE;

    RETURN jsonb_build_object(
        'params', sampling_params,
        'mismatch_threshold', allowed_mismatch_pct,
        'time_threshold', max_refresh_age
    );
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_validate_matv_inputs(schema_name text, matview_name text, vtw_name text) RETURNS jsonb
    LANGUAGE plpgsql
AS $$
-- Validates that a materialized view and its source relation exist, and
-- builds their fully-quoted names.
--
-- When vtw_name is NULL, the source relation name is derived from the matview
-- name by replacing 'matc_' with 'vtw_'. The source may be either a table or
-- a view.
--
-- Returns jsonb: {full_matview_name, full_vtw_name, notes} on success, or
-- {error, notes} when either relation is missing or validation itself fails.
DECLARE
    qualified_matview text;
    qualified_source text;
    source_relation text;
    messages text[] := '{}';
BEGIN
    -- Derive the source relation name, then build pre-quoted qualified names.
    source_relation := COALESCE(vtw_name, replace(matview_name, 'matc_', 'vtw_'));
    qualified_matview := quote_ident(schema_name) || '.' || quote_ident(matview_name);
    qualified_source := quote_ident(schema_name) || '.' || quote_ident(source_relation);

    -- The materialized view must exist.
    IF NOT EXISTS (
        SELECT 1
        FROM pg_matviews mv
        WHERE mv.schemaname = schema_name
          AND mv.matviewname = matview_name
    ) THEN
        RETURN jsonb_build_object(
            'error', format('Materialized view %I.%I does not exist', schema_name, matview_name),
            'notes', messages
        );
    END IF;

    -- The source relation must exist as either a table or a view.
    IF NOT EXISTS (
        SELECT 1
        FROM pg_tables t
        WHERE t.schemaname = schema_name
          AND t.tablename = source_relation
        UNION ALL
        SELECT 1
        FROM pg_views v
        WHERE v.schemaname = schema_name
          AND v.viewname = source_relation
    ) THEN
        RETURN jsonb_build_object(
            'error', format('Source view %I.%I does not exist', schema_name, source_relation),
            'notes', messages
        );
    END IF;

    RETURN jsonb_build_object(
        'full_matview_name', qualified_matview,
        'full_vtw_name', qualified_source,
        'notes', messages
    );
EXCEPTION WHEN OTHERS THEN
    RETURN jsonb_build_object(
        'error', format('Error validating inputs: %s', SQLERRM),
        'notes', messages
    );
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_validate_order_by_columns(source_schema text, source_table text, order_by_columns text[]) RETURNS text[]
    LANGUAGE plpgsql
AS $$
-- Sanity-checks proposed ORDER BY columns against a source table.
--
-- For each column name: warns when the column does not exist, and otherwise
-- probes whether its text content can be parsed with the hardcoded
-- 'YYYY-MM-DD HH24:MI:SS' format. Returns the accumulated warnings (empty
-- array when every column passes).
DECLARE
    warnings text[] := '{}';
    candidate text;
    datetime_format CONSTANT text := 'YYYY-MM-DD HH24:MI:SS'; -- Hardcoded
    column_present boolean;
BEGIN
    FOREACH candidate IN ARRAY order_by_columns LOOP
        SELECT EXISTS (
            SELECT 1
            FROM information_schema.columns c
            WHERE c.table_schema = source_schema
              AND c.table_name = source_table
              AND c.column_name = candidate
        ) INTO column_present;

        IF NOT column_present THEN
            warnings := array_append(warnings, format('Warning: %I not found in %I.%I', candidate, source_schema, source_table));
            CONTINUE;
        END IF;

        -- Probe parseability; a failure becomes a warning rather than an error.
        -- NOTE(review): LIMIT 1 only samples one arbitrary row, so this does
        -- not guarantee the whole column parses.
        BEGIN
            EXECUTE format('SELECT TO_TIMESTAMP(SUBSTRING(NULLIF(%I, ''''), 1, 19), %L) FROM %I.%I LIMIT 1',
                candidate, datetime_format, source_schema, source_table);
        EXCEPTION WHEN OTHERS THEN
            warnings := array_append(warnings, format('Warning: %I contains unparseable timestamp data: %s', candidate, SQLERRM));
        END;
    END LOOP;

    RETURN warnings;
EXCEPTION WHEN OTHERS THEN
    RETURN array_append(warnings, format('Error validating order-by columns: %s', SQLERRM));
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_analyze_column_stats(temp_table_name text, col_name text, column_type text, sample_size bigint, total_rows bigint, exclude_key_columns text[]) RETURNS jsonb
    LANGUAGE plpgsql
AS $$
-- Samples one column of a (temp) table and returns quality metrics as jsonb:
-- null ratio, distinct count / uniqueness ratio, non-ASCII ("encoding issue")
-- ratio for text-like types, and a composite 0-100 fitness score used to rank
-- synthetic-key candidates. Columns named in exclude_key_columns get a
-- uniqueness ratio and fitness score of 0.
--
-- Returns jsonb with keys: data_type, uniqueness_ratio, distinct_count,
-- null_ratio, null_count, encoding_issue_ratio, encoding_issue_count,
-- fitness_score, excluded_from_key_fitness. On failure, {error, data_type}.
DECLARE
    null_count bigint;
    distinct_count bigint;
    encoding_issue_count bigint;
    uniqueness_ratio numeric;
    null_ratio numeric;
    encoding_issue_ratio numeric;
    fitness_score numeric;
    adjusted_sample_size bigint;
    sampling_percentage numeric;
BEGIN
    -- Adjust sample size to not exceed total rows
    adjusted_sample_size := LEAST(sample_size, total_rows);

    -- Fix: an empty table (or zero/NULL sample size) previously caused a
    -- division-by-zero that was swallowed into a generic 'error' result.
    -- Return neutral zeroed stats for that degenerate case instead.
    IF COALESCE(total_rows, 0) = 0 OR COALESCE(adjusted_sample_size, 0) = 0 THEN
        RETURN jsonb_build_object(
            'data_type', column_type,
            'uniqueness_ratio', 0,
            'distinct_count', 0,
            'null_ratio', 0,
            'null_count', 0,
            'encoding_issue_ratio', 0,
            'encoding_issue_count', 0,
            'fitness_score', 0,
            'excluded_from_key_fitness', (col_name = ANY(exclude_key_columns))
        );
    END IF;

    sampling_percentage := (adjusted_sample_size::float / total_rows * 100);

    -- Null count over a sampled subset.
    -- NOTE(review): TABLESAMPLE SYSTEM may return fewer rows than
    -- adjusted_sample_size, so ratios using it as the denominator are
    -- approximate — confirm acceptable for the fitness ranking.
    EXECUTE format('SELECT COUNT(*) FROM (SELECT %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t WHERE %I IS NULL',
        col_name, temp_table_name, sampling_percentage::text, adjusted_sample_size, col_name)
        INTO null_count;
    null_ratio := null_count::float / adjusted_sample_size;

    -- Distinct count (skip for excluded columns)
    IF NOT (col_name = ANY(exclude_key_columns)) THEN
        EXECUTE format('SELECT COUNT(DISTINCT %I) FROM (SELECT %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t',
            col_name, col_name, temp_table_name, sampling_percentage::text, adjusted_sample_size)
            INTO distinct_count;
        uniqueness_ratio := distinct_count::float / adjusted_sample_size;
    ELSE
        uniqueness_ratio := 0;
    END IF;

    -- Encoding issues: count sampled values containing any non-ASCII byte
    -- (only meaningful for text-like columns).
    IF column_type IN ('character varying', 'character', 'text', 'varchar', 'char') THEN
        EXECUTE format('SELECT COUNT(*) FROM (SELECT %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t WHERE %I ~ ''[^\x00-\x7F]''',
            col_name, temp_table_name, sampling_percentage::text, adjusted_sample_size, col_name)
            INTO encoding_issue_count;
        encoding_issue_ratio := encoding_issue_count::float / adjusted_sample_size;
    ELSE
        encoding_issue_ratio := 0;
    END IF;

    -- Composite fitness score (weights: uniqueness 40, non-null 30,
    -- ASCII-cleanliness 20, type suitability 10/8/5).
    IF NOT (col_name = ANY(exclude_key_columns)) THEN
        fitness_score := (uniqueness_ratio * 40) +
                         ((1 - null_ratio) * 30) +
                         ((1 - encoding_issue_ratio) * 20) +
                         (CASE
                             WHEN column_type IN ('character varying', 'character', 'text', 'varchar', 'char', 'integer', 'bigint') THEN 10
                             WHEN column_type IN ('timestamp', 'timestamp with time zone', 'timestamp without time zone') THEN 8
                             ELSE 5
                         END);
    ELSE
        fitness_score := 0;
    END IF;

    -- Return stats as JSONB
    RETURN jsonb_build_object(
        'data_type', column_type,
        'uniqueness_ratio', uniqueness_ratio,
        'distinct_count', distinct_count,
        'null_ratio', null_ratio,
        'null_count', null_count,
        'encoding_issue_ratio', encoding_issue_ratio,
        'encoding_issue_count', encoding_issue_count,
        'fitness_score', fitness_score,
        'excluded_from_key_fitness', (col_name = ANY(exclude_key_columns))
    );
EXCEPTION WHEN OTHERS THEN
    RETURN jsonb_build_object(
        'error', format('Failed to analyze column %I: %s', col_name, SQLERRM),
        'data_type', column_type
    );
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_estimate_matv_refresh_time(full_matview_name text) RETURNS jsonb
    LANGUAGE plpgsql
AS $$
-- Estimates how long a refresh of the given materialized view will take, as
-- the mean of past refreshes recorded in public.c77_dbh_matv_stats (from the
-- c77_dbh extension). Falls back to a zero interval when no history exists.
--
-- Returns jsonb: {estimated_refresh_time, refresh_count, refresh_total};
-- on failure the counts are NULL and an 'error' key carries SQLERRM.
DECLARE
    refresh_runs bigint;
    refresh_time_sum interval;
    avg_refresh interval;
BEGIN
    -- Pull historical refresh stats for this matview, if any.
    SELECT s.refresh_count, s.refresh_mv_time_total
    INTO refresh_runs, refresh_time_sum
    FROM public.c77_dbh_matv_stats s
    WHERE s.mv_name = full_matview_name
    LIMIT 1;

    -- Mean refresh duration, or zero when there is no recorded history.
    avg_refresh := CASE
        WHEN COALESCE(refresh_runs, 0) > 0 THEN refresh_time_sum / refresh_runs::numeric
        ELSE '00:00:00'::interval
    END;

    -- Include the raw inputs for debugging.
    RETURN jsonb_build_object(
        'estimated_refresh_time', avg_refresh,
        'refresh_count', refresh_runs,
        'refresh_total', refresh_time_sum
    );
EXCEPTION WHEN OTHERS THEN
    RETURN jsonb_build_object(
        'estimated_refresh_time', '00:00:00'::interval,
        'refresh_count', NULL,
        'refresh_total', NULL,
        'error', SQLERRM
    );
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_check_matv_mismatches(target_schema text, matview_name text, validation_type text DEFAULT 'quick'::text) RETURNS jsonb
    LANGUAGE plpgsql
AS $$
-- Estimates how far a materialized view has drifted from its 'vtw_' source
-- relation by comparing content_hash values.
--
-- validation_type ('quick' | 'daily' | 'full', default 'quick') selects the
-- sample percentage; 'full' compares every row, the others compare random
-- samples of both relations.
--
-- Returns jsonb: {mismatched_records, mismatch_percent, notes}. When either
-- relation lacks a content_hash column, returns zero mismatches. On any other
-- error, returns -1 / -1.0 with the error appended to notes.
DECLARE
    -- NOTE(review): assumes matview_name starts with the 5-char 'matc_'
    -- prefix; substring(... from 6) strips it blindly — confirm callers
    -- always pass 'matc_'-prefixed names.
    vtw_name text := 'vtw_' || substring(matview_name from 6); -- Replace 'matc_' with 'vtw_'
    matview_full_name text := quote_ident(target_schema) || '.' || quote_ident(matview_name);
    vtw_full_name text := quote_ident(target_schema) || '.' || quote_ident(vtw_name);
    mismatch_count bigint;
    mismatch_sql text;
    content_hash_exists boolean := true;
    total_matview_records bigint;
    params jsonb;
    sample_size bigint;
    notes text[] := '{}';
BEGIN
    -- Define validation parameters (duplicates the table in
    -- c77_mvc_set_validation_params; unknown types fall back to 'quick').
    params := CASE validation_type
        WHEN 'quick' THEN '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb
        WHEN 'daily' THEN '{"sample_percent": 1.0, "confidence": 0.99, "margin": 0.01}'::jsonb
        WHEN 'full' THEN '{"sample_percent": 100.0, "confidence": 0.99, "margin": 0.005}'::jsonb
        ELSE '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb
    END;

    -- Calculate sample size: sample_percent of the matview, floor of 100 rows.
    EXECUTE format('SELECT COUNT(*) FROM %s', matview_full_name) INTO total_matview_records;
    sample_size := GREATEST(100, CEIL((jsonb_extract_path_text(params, 'sample_percent')::NUMERIC / 100) * total_matview_records));
    notes := array_append(notes, format('Total matview records: %s, Sample size: %s', total_matview_records, sample_size));

    -- Attempt to query content_hash to check if it exists in both relations;
    -- undefined_column from either probe disables the mismatch check.
    BEGIN
        EXECUTE format('SELECT 1 FROM %s WHERE content_hash IS NOT NULL LIMIT 1', vtw_full_name);
        EXECUTE format('SELECT 1 FROM %s WHERE content_hash IS NOT NULL LIMIT 1', matview_full_name);
    EXCEPTION WHEN undefined_column THEN
        content_hash_exists := false;
    END;

    -- If content_hash is not found in either, return early with a clean result.
    IF NOT content_hash_exists THEN
        RAISE NOTICE 'content_hash column not found in either %.% or %.%, skipping mismatch check',
            target_schema, matview_name, target_schema, vtw_name;
        RETURN jsonb_build_object(
            'mismatched_records', 0,
            'mismatch_percent', 0.0,
            'notes', notes
        );
    END IF;

    -- Construct the mismatch check query
    IF jsonb_extract_path_text(params, 'sample_percent')::NUMERIC < 100.0 THEN
        -- Use sampling for quick and daily.
        -- NOTE(review): the two samples are drawn independently at random, so
        -- EXCEPT can report "mismatches" even when the relations are
        -- identical — treat sampled results as a drift heuristic, not truth.
        mismatch_sql := format('
            WITH matview_sample AS (
                SELECT content_hash
                FROM %s
                ORDER BY random()
                LIMIT %s
            ),
            vtw_sample AS (
                SELECT content_hash
                FROM %s
                ORDER BY random()
                LIMIT %s
            )
            SELECT COUNT(*)
            FROM (
                SELECT content_hash FROM matview_sample
                EXCEPT
                SELECT content_hash FROM vtw_sample
            ) mismatches',
            matview_full_name,
            sample_size,
            vtw_full_name,
            sample_size
        );
    ELSE
        -- Full comparison for 'full' validation.
        -- NOTE(review): direction is vtw EXCEPT matview here, but matview
        -- EXCEPT vtw in the sampled branch — confirm the asymmetry is
        -- intentional.
        mismatch_sql := format('
            SELECT COUNT(*)
            FROM (
                SELECT content_hash FROM %s
                EXCEPT
                SELECT content_hash FROM %s
            ) mismatches',
            vtw_full_name,
            matview_full_name
        );
    END IF;

    EXECUTE mismatch_sql INTO mismatch_count;
    notes := array_append(notes, format('Mismatch count: %s', mismatch_count));

    -- Percent is relative to sample_size (>= 100), not total rows.
    RETURN jsonb_build_object(
        'mismatched_records', mismatch_count,
        'mismatch_percent', (mismatch_count::float / GREATEST(sample_size, 1)) * 100,
        'notes', notes
    );
EXCEPTION WHEN OTHERS THEN
    -- Sentinel values signal "check failed" to callers.
    notes := array_append(notes, format('Error in public.c77_mvc_check_matv_mismatches: %s', SQLERRM));
    RETURN jsonb_build_object(
        'mismatched_records', -1,
        'mismatch_percent', -1.0,
        'notes', notes
    );
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_analyze_table_fitness(source_schema text, source_table text, exclude_key_columns text[] DEFAULT ARRAY[]::text[]) RETURNS json
    LANGUAGE plpgsql
AS $$
-- Orchestrates a full fitness analysis of a source table: per-column stats,
-- ORDER BY candidates, column-combination analysis, and a Data Quality Index,
-- assembled into one json result.
--
-- Columns in exclude_key_columns are skipped when scoring synthetic-key
-- fitness. Every step that can fail returns a json {error, notes} object
-- rather than raising, so callers always get a payload.
--
-- Work is done against a snapshot in a session temp table created by
-- c77_mvc_create_temp_table; cleanup is delegated to c77_mvc_assemble_result
-- (temp tables also drop automatically at session end).
DECLARE
    total_rows bigint;
    sample_size bigint;
    temp_table_name text;
    notes text[] := '{}';          -- running audit trail returned to the caller
    result_json jsonb;
    column_stats jsonb := '{}';
    order_by_candidates jsonb := '{}';
    column_combinations jsonb := '{}';
    data_quality_index numeric;
    col_name text;
    column_type text;
    table_exists boolean;
    confidence_level CONSTANT numeric := 0.99; -- Hardcoded 99% confidence
    margin_of_error CONSTANT numeric := 0.03;  -- Hardcoded 3% margin of error
BEGIN
    notes := array_append(notes, format('Analysis started at %s', clock_timestamp()));

    -- Step 1: Validate schema and table existence
    SELECT EXISTS (
        SELECT 1
        FROM information_schema.tables
        WHERE table_schema = source_schema
        AND table_name = source_table
    ) INTO table_exists;

    IF NOT table_exists THEN
        result_json := jsonb_build_object(
            'error', format('Table %I.%I does not exist', source_schema, source_table),
            'notes', notes
        );
        RETURN result_json::json;
    END IF;

    -- Step 2: Get total rows (exact COUNT; can be slow on very large tables)
    BEGIN
        EXECUTE format('SELECT COUNT(*) FROM %I.%I', source_schema, source_table) INTO total_rows;
        notes := array_append(notes, format('Total rows in %I.%I: %s', source_schema, source_table, total_rows));
    EXCEPTION WHEN OTHERS THEN
        result_json := jsonb_build_object(
            'error', format('Failed to count rows in %I.%I: %s', source_schema, source_table, SQLERRM),
            'notes', notes
        );
        RETURN result_json::json;
    END;

    -- Step 3: Calculate sample size with hardcoded confidence level and margin of error
    sample_size := public.c77_mvc_calculate_sample_size(total_rows, confidence_level, margin_of_error);
    notes := array_append(notes, format('Sample size calculated: %s for %s rows (Confidence: %s%%, Margin of Error: ±%s%%)',
        sample_size, total_rows, confidence_level * 100, margin_of_error * 100));

    -- Step 4: Create temp table (snapshot of the source for sampling)
    BEGIN
        temp_table_name := public.c77_mvc_create_temp_table(source_schema, source_table);
        notes := array_append(notes, format('Created temporary table %s for analysis', temp_table_name));
    EXCEPTION WHEN OTHERS THEN
        result_json := jsonb_build_object(
            'error', format('Failed to create temp table for %I.%I: %s', source_schema, source_table, SQLERRM),
            'notes', notes
        );
        RETURN result_json::json;
    END;

    -- Step 5: Analyze individual columns.
    -- NOTE(review): matching table_schema LIKE 'pg_temp%' could in principle
    -- hit a same-named temp table from another session's pg_temp_N schema —
    -- confirm temp_table_name is unique enough (e.g. includes a session tag).
    FOR col_name, column_type IN
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_schema LIKE 'pg_temp%'
        AND table_name = temp_table_name
        AND column_name IS NOT NULL
        AND TRIM(column_name) != ''
    LOOP
        column_stats := column_stats || jsonb_build_object(
            col_name, public.c77_mvc_analyze_column_stats(temp_table_name, col_name, column_type, sample_size, total_rows, exclude_key_columns)
        );
    END LOOP;
    notes := array_append(notes, 'Completed analysis of individual columns');

    -- Step 6: Identify order-by candidates
    order_by_candidates := public.c77_mvc_identify_order_by_candidates(temp_table_name, column_stats);
    notes := array_append(notes, 'Completed identification of order-by candidates');

    -- Step 7: Analyze column combinations
    column_combinations := public.c77_mvc_analyze_column_combinations(temp_table_name, column_stats, sample_size, total_rows, exclude_key_columns);
    notes := array_append(notes, 'Completed analysis of column combinations');

    -- Step 8: Calculate Data Quality Index (DQI)
    data_quality_index := public.c77_mvc_calculate_dqi(column_stats);
    notes := array_append(notes, format('Data Quality Index (DQI): %s', ROUND(data_quality_index, 2)));

    -- Step 9: Assemble final result and clean up (assemble_result owns
    -- dropping temp_table_name)
    result_json := public.c77_mvc_assemble_result(
        source_schema, source_table, column_stats, column_combinations,
        order_by_candidates, data_quality_index, notes, temp_table_name
    );

    RETURN result_json::json;
EXCEPTION WHEN OTHERS THEN
    -- Catch-all: surface the failure as a payload instead of raising.
    result_json := jsonb_build_object(
        'error', format('Unexpected error in public.c77_mvc_analyze_table_fitness: %s', SQLERRM),
        'notes', notes
    );
    RETURN result_json::json;
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_manage_matv_health(target_schema text, matview_name text, validation_type text DEFAULT 'quick'::text, action text DEFAULT NULL::text) RETURNS jsonb
    LANGUAGE plpgsql
AS $$
-- Health check, and optionally maintenance, of a 'matc_'-prefixed
-- materialized view against its 'vtw_' source relation.
--
-- Evaluates: record counts, encoding issues (rows with
-- encoding_status = 'ENCODING_ISSUE'), time-based staleness (last refresh
-- from public.c77_dbh_matv_stats vs a validation_type-dependent threshold),
-- and content_hash mismatches (via c77_mvc_check_matv_mismatches).
--
-- action (optional): 'refresh' | 'repair' | 'reindex', applied only when the
-- view is judged 'Stale'; the health metrics are then re-evaluated.
--
-- Returns jsonb with counts, 'status' ('Non Existent' | 'Uninitialized' |
-- 'Stale' | 'Healthy'), 'character_encoding_status' ('CLEAN' | 'DEGRADED'),
-- an estimated refresh time, and an audit-trail 'notes' array. Unexpected
-- errors are returned as {error, notes} rather than raised.
DECLARE
    -- NOTE(review): assumes matview_name carries the 5-char 'matc_' prefix;
    -- substring(... from 6) strips it blindly.
    vtw_name text := 'vtw_' || substring(matview_name from 6); -- Replace 'matc_' with 'vtw_'
    matview_full_name text := quote_ident(target_schema) || '.' || quote_ident(matview_name);
    vtw_full_name text := quote_ident(target_schema) || '.' || quote_ident(vtw_name);
    total_vtw_records bigint;
    total_matview_records bigint;
    encoding_issues bigint;
    mismatch_result jsonb;
    mismatch_count bigint;
    mismatch_percent float;
    status text;
    character_encoding_status text;
    last_refresh timestamp with time zone;
    staleness_threshold interval;
    mismatch_threshold float;
    is_stale_by_time boolean := false;
    notes text[] := '{}';                               -- audit trail returned to caller
    start_time timestamp with time zone := now();
    end_time timestamp with time zone;
    elapsed_time interval;
    refresh_estimate jsonb;
    estimated_refresh_time interval;
    stats_found boolean := false;
BEGIN
    -- Log the start time
    notes := array_append(notes, format('Process started at %s', start_time));

    -- Step 1: Check if the materialized view exists (probe with COUNT; an
    -- undefined_table error means it does not).
    BEGIN
        EXECUTE format('SELECT COUNT(*) FROM %s', matview_full_name) INTO total_matview_records;
    EXCEPTION WHEN undefined_table THEN
        status := 'Non Existent';
        end_time := now();
        elapsed_time := end_time - start_time;
        notes := array_append(notes, format('Elapsed time: %s', elapsed_time));
        RETURN jsonb_build_object(
            'total_vtw_records', 0,
            'total_matview_records', 0,
            'encoding_issues', 0,
            'mismatched_records', 0,
            'mismatch_percent', 0.0,
            'status', status,
            'character_encoding_status', 'CLEAN',
            'estimated_refresh_time', interval '0 seconds',
            'notes', notes
        );
    END;

    -- Step 2: Check if the materialized view has data
    IF total_matview_records = 0 THEN
        status := 'Uninitialized';
        end_time := now();
        elapsed_time := end_time - start_time;
        notes := array_append(notes, format('Elapsed time: %s', elapsed_time));
        RETURN jsonb_build_object(
            'total_vtw_records', 0,
            'total_matview_records', 0,
            'encoding_issues', 0,
            'mismatched_records', 0,
            'mismatch_percent', 0.0,
            'status', status,
            'character_encoding_status', 'CLEAN',
            'estimated_refresh_time', interval '0 seconds',
            'notes', notes
        );
    END IF;

    -- Step 3: Get the total records in the source view
    EXECUTE format('SELECT COUNT(*) FROM %s', vtw_full_name) INTO total_vtw_records;

    -- Step 4: Check for encoding issues
    BEGIN
        EXECUTE format('SELECT COUNT(*) FROM %s WHERE encoding_status = ''ENCODING_ISSUE''', matview_full_name)
            INTO encoding_issues;
    EXCEPTION WHEN undefined_column THEN
        encoding_issues := 0; -- If encoding_status column doesn't exist, assume no issues
    END;

    -- Set character_encoding_status
    IF encoding_issues > 0 THEN
        character_encoding_status := 'DEGRADED';
    ELSE
        character_encoding_status := 'CLEAN';
    END IF;

    -- Step 5: Check for time-based staleness (refresh stats are maintained by
    -- the c77_dbh extension)
    SELECT refresh_mv_last
    INTO last_refresh
    FROM public.c77_dbh_matv_stats
    WHERE mv_name = target_schema || '.' || matview_name;

    IF FOUND THEN
        stats_found := true;
    ELSE
        notes := array_append(notes, format('Warning: No refresh stats found for materialized view %s in c77_dbh_matv_stats', matview_name));
    END IF;

    staleness_threshold := CASE validation_type
        WHEN 'quick' THEN '3 days'::interval
        WHEN 'daily' THEN '1 day'::interval
        WHEN 'full' THEN '12 hours'::interval
        ELSE '3 days'::interval
    END;

    notes := array_append(notes, format('Last refresh: %s, Time since last refresh: %s, Staleness threshold: %s', last_refresh, now() - last_refresh, staleness_threshold));

    -- No recorded refresh at all also counts as stale.
    IF last_refresh IS NULL OR (now() - last_refresh) > staleness_threshold THEN
        is_stale_by_time := true;
    END IF;
    notes := array_append(notes, format('Is stale by time: %s', is_stale_by_time));

    -- Step 6: Set mismatch threshold based on validation_type.
    -- NOTE(review): these thresholds (1.0/0.5/0.2) differ from the ones in
    -- c77_mvc_set_validation_params (0.1/0.05/0.01) — confirm intentional.
    mismatch_threshold := CASE validation_type
        WHEN 'quick' THEN 1.0 -- 1.0%
        WHEN 'daily' THEN 0.5 -- 0.5%
        WHEN 'full' THEN 0.2  -- 0.2%
        ELSE 1.0
    END;

    -- Step 7: Check for mismatches
    mismatch_result := public.c77_mvc_check_matv_mismatches(target_schema, matview_name, validation_type);
    mismatch_count := (mismatch_result->>'mismatched_records')::bigint;
    mismatch_percent := (mismatch_result->>'mismatch_percent')::float;

    -- Append mismatch notes
    notes := array_cat(notes, ARRAY(SELECT jsonb_array_elements_text(mismatch_result->'notes')));

    -- Log mismatch details
    notes := array_append(notes, format('Mismatch percent: %s, Mismatch threshold: %s', mismatch_percent, mismatch_threshold));

    -- Step 8: Determine refresh status
    IF is_stale_by_time OR mismatch_percent > mismatch_threshold THEN
        status := 'Stale';
    ELSE
        status := 'Healthy';
    END IF;

    -- Step 9: Get estimated refresh time using the correct function
    SELECT public.c77_mvc_estimate_matv_refresh_time(matview_full_name)
    INTO refresh_estimate;
    estimated_refresh_time := (refresh_estimate->>'estimated_refresh_time')::interval;
    notes := array_append(notes, format('Refresh estimate details: %s', refresh_estimate));

    -- Step 10: Perform action if specified and status is Stale
    IF action IS NOT NULL AND status = 'Stale' THEN
        IF action = 'refresh' THEN
            -- Refresh the materialized view (using WITH DATA for PostgreSQL 12 compatibility).
            -- NOTE(review): this refreshes non-concurrently, unlike
            -- c77_mvc_perform_matv_action which prefers CONCURRENTLY.
            EXECUTE format('REFRESH MATERIALIZED VIEW %s WITH DATA', matview_full_name);
            notes := array_append(notes, 'Performed REFRESH on materialized view');
            -- No need to update refresh time; handled by system triggers
        ELSIF action = 'repair' THEN
            -- Drop and recreate indexes.
            -- NOTE(review): DROP INDEX here is not schema-qualified (relies on
            -- search_path), and the unique index hardcodes columns
            -- (synthetic_key, companyid, orgname_id) specific to one schema
            -- family — confirm these exist on every matview this runs against.
            EXECUTE format('DROP INDEX IF EXISTS %s_encoding_status_idx', matview_name);
            EXECUTE format('DROP INDEX IF EXISTS %s_content_hash_idx', matview_name);
            EXECUTE format('DROP INDEX IF EXISTS %s_unique_key_idx', matview_name);
            EXECUTE format('CREATE INDEX %s_encoding_status_idx ON %s (encoding_status)', matview_name, matview_full_name);
            EXECUTE format('CREATE INDEX %s_content_hash_idx ON %s (content_hash)', matview_name, matview_full_name);
            EXECUTE format('CREATE UNIQUE INDEX %s_unique_key_idx ON %s (synthetic_key, companyid, orgname_id)', matview_name, matview_full_name);
            notes := array_append(notes, 'Performed REPAIR (dropped and recreated indexes) on materialized view');
        ELSIF action = 'reindex' THEN
            -- Reindex the materialized view
            EXECUTE format('REINDEX TABLE %s', matview_full_name);
            notes := array_append(notes, 'Performed REINDEX on materialized view');
        ELSE
            notes := array_append(notes, format('Invalid action: %s, no action performed', action));
        END IF;

        -- Step 11: Re-evaluate after action (same checks as steps 1-8 so the
        -- returned metrics reflect post-action state)
        EXECUTE format('SELECT COUNT(*) FROM %s', matview_full_name) INTO total_matview_records;
        EXECUTE format('SELECT COUNT(*) FROM %s WHERE encoding_status = ''ENCODING_ISSUE''', matview_full_name)
            INTO encoding_issues;
        mismatch_result := public.c77_mvc_check_matv_mismatches(target_schema, matview_name, validation_type);
        mismatch_count := (mismatch_result->>'mismatched_records')::bigint;
        mismatch_percent := (mismatch_result->>'mismatch_percent')::float;

        -- Append mismatch notes
        notes := array_cat(notes, ARRAY(SELECT jsonb_array_elements_text(mismatch_result->'notes')));

        -- Update character_encoding_status
        IF encoding_issues > 0 THEN
            character_encoding_status := 'DEGRADED';
        ELSE
            character_encoding_status := 'CLEAN';
        END IF;

        -- Update status (time-based staleness should be resolved if action was 'refresh')
        SELECT refresh_mv_last
        INTO last_refresh
        FROM public.c77_dbh_matv_stats
        WHERE mv_name = target_schema || '.' || matview_name;

        IF NOT FOUND THEN
            notes := array_append(notes, format('Warning: No refresh stats found for materialized view %s in c77_dbh_matv_stats after action', matview_name));
        END IF;

        notes := array_append(notes, format('After action - Last refresh: %s, Time since last refresh: %s, Staleness threshold: %s', last_refresh, now() - last_refresh, staleness_threshold));

        IF last_refresh IS NULL OR (now() - last_refresh) > staleness_threshold THEN
            is_stale_by_time := true;
        ELSE
            is_stale_by_time := false;
        END IF;
        notes := array_append(notes, format('After action - Is stale by time: %s', is_stale_by_time));

        -- Log mismatch details after action
        notes := array_append(notes, format('After action - Mismatch percent: %s, Mismatch threshold: %s', mismatch_percent, mismatch_threshold));

        IF is_stale_by_time OR mismatch_percent > mismatch_threshold THEN
            status := 'Stale';
        ELSE
            status := 'Healthy';
        END IF;
    END IF;

    -- Step 12: Calculate elapsed time and return the results
    end_time := now();
    elapsed_time := end_time - start_time;
    notes := array_append(notes, format('Elapsed time: %s', elapsed_time));

    RETURN jsonb_build_object(
        'total_vtw_records', total_vtw_records,
        'total_matview_records', total_matview_records,
        'encoding_issues', encoding_issues,
        'mismatched_records', mismatch_count,
        'mismatch_percent', mismatch_percent,
        'status', status,
        'character_encoding_status', character_encoding_status,
        'estimated_refresh_time', estimated_refresh_time,
        'notes', notes
    );
EXCEPTION WHEN OTHERS THEN
    -- Catch-all: return the failure and the notes accumulated so far.
    end_time := now();
    elapsed_time := end_time - start_time;
    notes := array_append(notes, format('Elapsed time: %s', elapsed_time));
    notes := array_append(notes, format('Unexpected error in c77_mvc_manage_matv_health: %s', SQLERRM));
    RETURN jsonb_build_object(
        'error', SQLERRM,
        'notes', notes
    );
END;
$$;
|
|
|
|
|
|
CREATE OR REPLACE FUNCTION public.c77_mvc_assemble_matv_health_result(full_matview_name text, full_vtw_name text, stats jsonb, mismatched_records bigint, validation_type text, sample_size bigint, mismatch_threshold numeric, action_result text, exec_time timestamp with time zone) RETURNS jsonb
    LANGUAGE plpgsql
AS $$
-- Assembles the jsonb health report for a materialized view from previously
-- gathered stats: record counts, mismatch/clean percentages, an estimated
-- refresh time, a derived status ('Uninitialized' | 'Stale' | 'Degraded' |
-- 'Healthy'), and the total execution time measured from exec_time.
-- action_result, when non-NULL, is included under 'action_result'.
DECLARE
    total_matview_records bigint := (stats->>'total_matview_records')::bigint;
    clean_records bigint := (stats->>'clean_records')::bigint;
    encoding_issues bigint := (stats->>'encoding_issues')::bigint;
    clean_percent text;
    estimated_refresh_time interval;
BEGIN
    -- Calculate clean percent ('N/A' when the matview is empty)
    clean_percent := CASE WHEN total_matview_records > 0
        THEN to_char(ROUND((clean_records::NUMERIC / total_matview_records::NUMERIC) * 100, 2), 'FM9999999999999999.99') || '%'
        ELSE 'N/A'
    END;

    -- Estimate refresh time.
    -- Fix: c77_mvc_estimate_matv_refresh_time returns jsonb, which cannot be
    -- assigned to an interval variable; extract the 'estimated_refresh_time'
    -- key and cast it, defaulting to zero when absent.
    estimated_refresh_time := COALESCE(
        (public.c77_mvc_estimate_matv_refresh_time(full_matview_name)->>'estimated_refresh_time')::interval,
        interval '0 seconds'
    );

    -- Assemble result (counts serialized as text, matching existing consumers)
    RETURN jsonb_build_object(
        'matview', full_matview_name,
        'vtw_source', full_vtw_name,
        'total_matview_records', total_matview_records::text,
        'total_vtw_records', (stats->>'total_vtw_records')::text,
        'mismatched_records', mismatched_records::text,
        'mismatch_percent', CASE WHEN total_matview_records > 0
            THEN to_char(ROUND((mismatched_records::NUMERIC / total_matview_records::NUMERIC) * 100, 2), 'FM9999999999999999.99') || '%'
            ELSE 'N/A'
        END,
        'clean_records', clean_records::text,
        'encoding_issues', encoding_issues::text,
        'clean_record%', clean_percent,
        'last_matview_update', COALESCE((stats->>'last_matview_update')::text, 'N/A'),
        'last_vtw_update', COALESCE((stats->>'last_vtw_update')::text, 'N/A'),
        'size_mb', (stats->>'size_mb')::text,
        'estimated_refresh_time', to_char(estimated_refresh_time, 'HH24:MI:SS.MS'),
        'validation_type', validation_type,
        'sample_size', sample_size::text,
        'status', CASE
            WHEN total_matview_records = 0 THEN 'Uninitialized'
            WHEN (mismatched_records::NUMERIC / NULLIF(total_matview_records, 0)::NUMERIC) * 100 > mismatch_threshold THEN 'Stale'
            WHEN encoding_issues > 0 THEN 'Degraded'
            ELSE 'Healthy'
        END,
        'execution_time', to_char(clock_timestamp() - exec_time, 'HH24:MI:SS.MS')
    ) || COALESCE(jsonb_build_object('action_result', action_result), '{}');
END;
$$;
|