diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/c77_mvc.iml b/.idea/c77_mvc.iml
new file mode 100644
index 0000000..c956989
--- /dev/null
+++ b/.idea/c77_mvc.iml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="WEB_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
\ No newline at end of file
diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml
new file mode 100644
index 0000000..3745213
--- /dev/null
+++ b/.idea/dataSources.xml
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
+    <data-source source="LOCAL" name="dbh@localhost" uuid="...">
+      <driver-ref>postgresql</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>org.postgresql.Driver</jdbc-driver>
+      <jdbc-url>jdbc:postgresql://localhost:5432/dbh</jdbc-url>
+      <working-dir>$ProjectFileDir$</working-dir>
+    </data-source>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..50b3f41
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/c77_mvc.iml" filepath="$PROJECT_DIR$/.idea/c77_mvc.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
diff --git a/.idea/php.xml b/.idea/php.xml
new file mode 100644
index 0000000..f324872
--- /dev/null
+++ b/.idea/php.xml
@@ -0,0 +1,19 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
diff --git a/INSTALLATION.md b/INSTALLATION.md
new file mode 100644
index 0000000..e3b23ad
--- /dev/null
+++ b/INSTALLATION.md
@@ -0,0 +1,187 @@
+# Installation Guide for c77_mvc PostgreSQL Extension
+
+## Prerequisites
+
+Before installing the c77_mvc extension, ensure you have:
+
+1. PostgreSQL 11 or later installed
+2. Administrative access to your PostgreSQL instance
+3. The c77_dbh extension installed (required dependency)
+4. Git (if installing from source repository)
+
+## Standard Installation
+
+### Option 1: Using PostgreSQL Extensions Directory
+
+1. Copy the extension files to your PostgreSQL extensions directory:
+
+```bash
+# Get the extension directory location
+export PGEXTDIR=$(pg_config --sharedir)/extension
+
+# Copy files
+sudo cp c77_mvc.control $PGEXTDIR/
+sudo cp c77_mvc--1.0.sql $PGEXTDIR/
+```
+
+2. Connect to your PostgreSQL database and create the extension:
+
+```sql
+CREATE EXTENSION c77_dbh; -- Install dependency first if not already installed
+CREATE EXTENSION c77_mvc;
+```
+
+### Option 2: Installing from Git Repository
+
+1. Clone the repository:
+
+```bash
+git clone https://git.jctr3.com/trogers1884/c77_mvc.git
+cd c77_mvc
+```
+
+2. Copy files to your PostgreSQL extensions directory:
+
+```bash
+export PGEXTDIR=$(pg_config --sharedir)/extension
+sudo cp c77_mvc.control $PGEXTDIR/
+sudo cp c77_mvc--1.0.sql $PGEXTDIR/
+```
+
+3. Connect to your PostgreSQL database and create the extension:
+
+```sql
+CREATE EXTENSION c77_dbh; -- Install dependency first if not already installed
+CREATE EXTENSION c77_mvc;
+```
+
+## Manual Installation
+
+If you prefer to install the extension manually or if you need to customize the installation process, follow these steps:
+
+1. Ensure the c77_dbh extension is already installed:
+
+```sql
+SELECT * FROM pg_extension WHERE extname = 'c77_dbh';
+```
+
+If not installed, install it first:
+
+```sql
+CREATE EXTENSION c77_dbh;
+```
+
+2. Create the table and functions manually by executing the SQL commands:
+
+```sql
+-- Create the audit table
+CREATE TABLE IF NOT EXISTS public.c77_mvc_table_fitness_audit (
+ run_id BIGSERIAL,
+ run_timestamp timestamp without time zone DEFAULT CURRENT_TIMESTAMP,
+ source_schema text COLLATE pg_catalog."default",
+ source_table text COLLATE pg_catalog."default",
+ analysis_result jsonb,
+ notes text[] COLLATE pg_catalog."default",
+ CONSTRAINT table_fitness_audit_pkey PRIMARY KEY (run_id)
+) TABLESPACE pg_default;
+
+CREATE INDEX IF NOT EXISTS idx_table_fitness_audit_table
+ ON public.c77_mvc_table_fitness_audit USING btree
+ (source_schema COLLATE pg_catalog."default" ASC NULLS LAST, source_table COLLATE pg_catalog."default" ASC NULLS LAST)
+ TABLESPACE pg_default;
+
+CREATE INDEX IF NOT EXISTS idx_table_fitness_audit_timestamp
+ ON public.c77_mvc_table_fitness_audit USING btree
+ (run_timestamp ASC NULLS LAST)
+ TABLESPACE pg_default;
+
+-- Now execute all the function creation SQL commands from c77_mvc--1.0.sql
+-- (Copy and paste all CREATE OR REPLACE FUNCTION statements from the SQL file)
+```
+
+3. Verify the installation:
+
+```sql
+-- Check if the main table exists
+SELECT * FROM pg_tables WHERE tablename = 'c77_mvc_table_fitness_audit';
+
+-- Check if key functions exist
+SELECT proname, pronamespace::regnamespace as schema
+FROM pg_proc
+WHERE proname LIKE 'c77_mvc%'
+ORDER BY proname;
+```
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Dependency Error**: If you see an error about missing the c77_dbh extension, make sure it's installed properly:
+
+```sql
+CREATE EXTENSION c77_dbh;
+```
+
+2. **Permission Issues**: Ensure your PostgreSQL user has sufficient privileges:
+
+```sql
+-- For a specific user
+GRANT ALL ON SCHEMA public TO your_user;
+GRANT ALL ON ALL TABLES IN SCHEMA public TO your_user;
+GRANT ALL ON ALL SEQUENCES IN SCHEMA public TO your_user;
+GRANT ALL ON ALL FUNCTIONS IN SCHEMA public TO your_user;
+```
+
+3. **Schema Issues**: If you're installing to a non-public schema, adjust permissions accordingly:
+
+```sql
+-- Replace 'custom_schema' with your target schema
+GRANT ALL ON SCHEMA custom_schema TO your_user;
+GRANT ALL ON ALL TABLES IN SCHEMA custom_schema TO your_user;
+GRANT ALL ON ALL SEQUENCES IN SCHEMA custom_schema TO your_user;
+GRANT ALL ON ALL FUNCTIONS IN SCHEMA custom_schema TO your_user;
+```
+
+### Checking for Successful Installation
+
+To verify if the extension was installed correctly:
+
+```sql
+-- List installed extensions
+SELECT * FROM pg_extension WHERE extname = 'c77_mvc';
+
+-- Check if the main table exists
+SELECT * FROM information_schema.tables WHERE table_name = 'c77_mvc_table_fitness_audit';
+
+-- Test a simple function
+SELECT public.c77_mvc_calculate_sample_size(1000000);
+```
+
+## Upgrading
+
+To upgrade from a previous version of the extension:
+
+```sql
+ALTER EXTENSION c77_mvc UPDATE TO '1.0';
+```
+
+## Uninstallation
+
+If you need to uninstall the extension:
+
+```sql
+DROP EXTENSION c77_mvc;
+```
+
+Note: `DROP EXTENSION` removes the objects that belong to the extension (its functions and the `c77_mvc_table_fitness_audit` table). Views and materialized views created by running the extension's functions (`vtw_*`, `matc_*`, `vm_*`, `vprob_*`) are not extension members and must be dropped separately. If other objects depend on the extension's objects, add `CASCADE`:
+
+```sql
+DROP EXTENSION c77_mvc CASCADE;
+```
+
+## Getting Help
+
+For additional help or to report issues:
+
+- Visit the repository at: https://git.jctr3.com/trogers1884/c77_mvc
+- Contact the maintainer via issues on the repository
diff --git a/LICENSE.md b/LICENSE.md
new file mode 100644
index 0000000..29aecd2
--- /dev/null
+++ b/LICENSE.md
@@ -0,0 +1,19 @@
+PostgreSQL License
+
+Copyright (c) 2025 c77_mvc Contributors
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose, without fee, and without a written agreement
+is hereby granted, provided that the above copyright notice and this paragraph
+and the following two paragraphs appear in all copies.
+
+IN NO EVENT SHALL THE AUTHORS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
+SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS,
+ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
+THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+THE AUTHORS SPECIFICALLY DISCLAIM ANY WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
+AND THE AUTHORS HAVE NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT,
+UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
diff --git a/README.md b/README.md
index 69e87ba..310fb2e 100644
--- a/README.md
+++ b/README.md
@@ -1,173 +1,149 @@
-# PostgreSQL Data Management System
+# c77_mvc PostgreSQL Extension
+
+[![PostgreSQL](https://img.shields.io/badge/PostgreSQL-11%2B-blue.svg)](https://www.postgresql.org/)
+
+A PostgreSQL extension for materialized view management and table fitness analysis.
## Overview
-This collection of PostgreSQL functions forms a comprehensive data management system designed to analyze table structures, create optimized materialized views, and maintain their health over time. The system consists of two integrated subsystems that work together to improve database performance, data quality, and maintenance efficiency.
+c77_mvc (materialized view and table fitness utilities) provides database administrators and developers with tools to:
-## Core Subsystems
+1. **Create optimized materialized views** with synthetic keys and content hashing
+2. **Manage materialized view health** with automatic staleness detection and refresh
+3. **Analyze table fitness** for partitioning, indexing, and query optimization
+4. **Monitor data quality** with encoding issue detection and isolation
-### 1. Table Analysis Subsystem
+## Features
-This subsystem analyzes database tables to identify their characteristics, data quality, and optimal strategies for keys, partitioning, and ordering.
+### Materialized View Management
-**Key Features:**
-- Statistical sampling for efficient analysis of large tables
-- Column-level fitness evaluation for primary/foreign key suitability
-- Data quality assessment with encoding issue detection
-- Identification of optimal column combinations for partitioning
-- Detection of timestamp columns suitable for ordering
-- Overall Data Quality Index (DQI) calculation
+- Create materialized views with synthetic keys and proper partitioning
+- Track content hashes to detect data changes efficiently
+- Isolate records with encoding issues into separate views
+- Monitor materialized view health with configurable thresholds
+- Automatically refresh views based on staleness metrics
+- Estimate refresh times based on historical performance
-**Primary Functions:**
-- `grok_analyze_table_fitness`: Main entry point for table analysis
-- `grok_analyze_column_stats`: Analyzes individual column characteristics
-- `grok_analyze_column_combinations`: Evaluates column pairs for composite keys
-- `grok_calculate_dqi`: Calculates the overall Data Quality Index
+### Table Fitness Analysis
-### 2. Materialized View Management Subsystem
+- Analyze column characteristics for partitioning and indexing
+- Identify optimal column combinations for keys and partitioning
+- Evaluate data quality with comprehensive metrics
+- Calculate overall Data Quality Index (DQI)
+- Use statistical sampling for efficient analysis of large tables
-This subsystem creates, monitors, and maintains optimized materialized views based on insights from the table analysis.
+## Requirements
-**Key Features:**
-- Optimized materialized view creation with proper indexing
-- Automatic handling of character encoding issues
-- Synthetic key generation for uniqueness
-- Content hash generation for efficient change detection
-- Health monitoring with staleness detection
-- Automated maintenance and remediation actions
+- PostgreSQL 11 or later
+- c77_dbh extension (dependency)
-**Primary Functions:**
-- `grok_create_optimized_matv`: Creates a complete materialized view system
-- `grok_manage_matv_health`: Monitors and maintains materialized view health
-- `grok_check_matv_mismatches`: Detects inconsistencies between source and materialized views
-- `grok_perform_matv_action`: Executes maintenance actions on materialized views
+## Installation
-## Architecture & Design Patterns
+### Quick Install
-The system implements several important design patterns:
-
-1. **View Layering Pattern**: Creates multiple views serving different purposes:
- - `vtw_*`: View To Watch (source view with data quality enhancement)
- - `matc_*`: MATerialized Copy (physical storage with indexes)
- - `vm_*`: View of Materialized view (clean data for querying)
- - `vprob_*`: View of PROBlematic data (encoding issues for review)
-
-2. **Data Quality Management Pattern**: Automatically detects, flags, and segregates problematic data:
- - Non-ASCII character detection
- - Cleansed versions of problematic text
- - Separate views for clean vs. problematic data
-
-3. **Change Detection Pattern**: Implements efficient methods to detect data changes:
- - Content hash generation from relevant columns
- - Timestamp-based staleness detection
- - Sampling-based consistency validation
-
-4. **Maintenance Strategy Pattern**: Provides multiple strategies for maintaining materialized views:
- - Refresh: Updates with fresh data from the source
- - Repair: Rebuilds indexes and constraints
- - Reindex: Rebuilds indexes without dropping them
-
-## Usage Examples
-
-### Analyzing a Table
+If you have both extensions available in your PostgreSQL extensions directory:
```sql
--- Analyze a table to identify key characteristics and data quality
-SELECT config.grok_analyze_table_fitness(
- 'public', -- Source schema
- 'customer_data', -- Source table
- ARRAY['id', 'uid'] -- Columns to exclude from key fitness evaluation
+CREATE EXTENSION c77_dbh; -- Install dependency first
+CREATE EXTENSION c77_mvc;
+```
+
+### From Source
+
+```bash
+# Clone repository
+git clone https://git.jctr3.com/trogers1884/c77_mvc.git
+cd c77_mvc
+
+# Copy files to PostgreSQL extension directory
+export PGEXTDIR=$(pg_config --sharedir)/extension
+sudo cp c77_mvc.control $PGEXTDIR/
+sudo cp c77_mvc--1.0.sql $PGEXTDIR/
+
+# Create extension in your database
+psql -d your_database -c "CREATE EXTENSION c77_dbh;"
+psql -d your_database -c "CREATE EXTENSION c77_mvc;"
+```
+
+For detailed installation instructions, see [INSTALLATION.md](INSTALLATION.md).
+
+## Basic Usage
+
+### Table Fitness Analysis
+
+```sql
+-- Analyze a table for fitness metrics
+SELECT * FROM public.c77_mvc_analyze_table_fitness('schema_name', 'table_name');
+```
+
+### Creating Optimized Materialized Views
+
+```sql
+-- Create an optimized materialized view
+SELECT * FROM public.c77_mvc_create_optimized_matv(
+ 'source_schema', -- Source schema
+ 'source_table', -- Source table
+ 'target_schema', -- Target schema
+ 'matc_target_view_name', -- Target materialized view name (must start with matc_)
+ ARRAY['customer_id'], -- Partition columns
+ ARRAY['last_updated'], -- Order-by columns
+ ARRAY['notes'], -- Columns to exclude from hash calculation (optional)
+ false -- Filter for latest records only (optional)
);
```
-### Creating an Optimized Materialized View
+### Managing Materialized View Health
```sql
--- Create an optimized materialized view system based on analysis results
-SELECT config.grok_create_optimized_matv(
- 'public', -- Source schema
- 'customer_data', -- Source table
- 'analytics', -- Target schema
- 'matc_customer_summary', -- Target materialized view name
- ARRAY['region', 'customer_type'], -- Partition columns
- ARRAY['updated_at', 'customer_id'], -- Order-by columns
- ARRAY['created_by', 'modified_by'], -- Columns to exclude from hash
- true -- Filter to latest records only
+-- Check materialized view health
+SELECT * FROM public.c77_mvc_manage_matv_health(
+ 'schema_name', -- Schema name
+ 'matc_view_name', -- Materialized view name
+ 'quick', -- Validation type: 'quick', 'daily', 'full'
+ NULL -- Action: NULL, 'refresh', 'repair', 'reindex'
+);
+
+-- Check and refresh if needed
+SELECT * FROM public.c77_mvc_manage_matv_health(
+ 'schema_name',
+ 'matc_view_name',
+ 'daily',
+ 'refresh'
);
```
-### Monitoring Materialized View Health
+For comprehensive usage examples, see [USAGE.md](USAGE.md).
-```sql
--- Check health of a materialized view
-SELECT config.grok_manage_matv_health(
- 'analytics', -- Schema
- 'matc_customer_summary', -- Materialized view name
- 'daily', -- Validation type: 'quick', 'daily', or 'full'
- NULL -- Action (NULL for check only, 'refresh', 'repair', 'reindex')
-);
-```
+## View Structure
-### Maintaining Materialized View Health
+When you create an optimized materialized view, the extension creates multiple objects:
-```sql
--- Refresh a stale materialized view
-SELECT config.grok_manage_matv_health(
- 'analytics', -- Schema
- 'matc_customer_summary', -- Materialized view name
- 'daily', -- Validation type
- 'refresh' -- Action to perform
-);
-```
+| Object | Naming Pattern | Purpose |
+|--------|----------------|---------|
+| View | vtw_* | Source view with content hash, synthetic key, and encoding status |
+| Materialized View | matc_* | Materialized copy of the vtw_ view |
+| View | vm_* | Clean data view (excludes encoding issues) |
+| View | vprob_* | Problematic data view (only encoding issues) |
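+
+Continuing the creation example above (`target_schema` / `matc_target_view_name`), the companion views can be queried directly:
+
+```sql
+-- Clean rows only (encoding issues filtered out)
+SELECT * FROM target_schema.vm_target_view_name;
+
+-- Rows flagged with encoding issues, for review
+SELECT * FROM target_schema.vprob_target_view_name;
+```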
-## Performance Considerations
+## Documentation
-- **Sampling**: The system uses statistical sampling for efficient analysis of large tables
-- **Concurrent Refresh**: Uses concurrent refresh when possible (requires unique indexes)
-- **Validation Modes**: Offers different validation modes with performance/thoroughness tradeoffs:
- - `quick`: Fastest, uses 0.1% sampling, 3-day staleness threshold
- - `daily`: Medium, uses 1% sampling, 1-day staleness threshold
- - `full`: Most thorough, uses 100% sampling, 12-hour staleness threshold
+- [Installation Guide](INSTALLATION.md)
+- [Usage Guide](USAGE.md)
+- [Technical Assessment](TECHNICAL.md)
-## Dependencies
+## Contributing
-This system depends on the following database objects:
+1. Fork the repository
+2. Create your feature branch (`git checkout -b feature/amazing-feature`)
+3. Commit your changes (`git commit -m 'Add some amazing feature'`)
+4. Push to the branch (`git push origin feature/amazing-feature`)
+5. Open a Pull Request
-1. **Table Fitness Audit Table**:
- - `config.table_fitness_audit`: Stores table analysis results
+## License
-2. **Materialized View Statistics Table**:
- - `public.c77_dbh_matv_stats`: Stores materialized view refresh statistics
+This project is licensed under the PostgreSQL License - see the LICENSE file for details.
-## Best Practices
+## Acknowledgements
-1. **Initial Analysis**: Run table analysis before creating materialized views to identify optimal configuration
-2. **Regular Health Checks**: Schedule periodic health checks using `grok_manage_matv_health`
-3. **Validation Types**: Use `quick` for frequent checks, `daily` for daily maintenance, and `full` for critical views
-4. **Monitoring**: Track Data Quality Index (DQI) over time to detect data quality trends
-5. **Maintenance Windows**: Schedule refreshes during low-usage periods for large materialized views
-
-## Error Handling
-
-All functions include comprehensive error handling with:
-- Clear error messages indicating what went wrong
-- Processing notes to track execution steps
-- Safe failure modes that avoid leaving the database in an inconsistent state
-
-## Troubleshooting
-
-Common issues and solutions:
-
-1. **Stale Materialized Views**: Use `grok_manage_matv_health` with action='refresh'
-2. **Encoding Issues**: Use `grok_manage_matv_health` with action='repair'
-3. **Index Performance Issues**: Use `grok_manage_matv_health` with action='reindex'
-4. **Missing Statistics**: Ensure `public.c77_dbh_matv_stats` table is populated with refresh statistics
-
-## Extension Points
-
-The system is designed to be extended in several ways:
-
-1. Add custom data quality checks in the `vtw_` view creation
-2. Extend partition and order-by column validation logic
-3. Implement additional maintenance actions in `grok_perform_matv_action`
-4. Add custom health metrics to `grok_manage_matv_health`
+- Developed by [Your Name/Organization]
+- Special thanks to contributors and testers
diff --git a/TECHNICAL.md b/TECHNICAL.md
new file mode 100644
index 0000000..8bf47f9
--- /dev/null
+++ b/TECHNICAL.md
@@ -0,0 +1,148 @@
+# Technical Assessment: c77_mvc PostgreSQL Extension
+
+## Extension Overview
+- **Name:** c77_mvc
+- **Version:** 1.0
+- **Description:** Materialized view and table fitness utilities
+- **Repository:** https://git.jctr3.com/trogers1884/c77_mvc
+- **Dependencies:** c77_dbh
+- **Relocatable:** Yes
+
+## Purpose and Functionality
+The c77_mvc extension provides a comprehensive set of utilities for:
+
+1. **Materialized View Management**
+ - Creating optimized materialized views with synthetic keys and content hashing
+ - Monitoring materialized view health and staleness
+ - Refreshing materialized views based on configurable thresholds
+ - Handling character encoding issues in data
+
+2. **Table Fitness Analysis**
+ - Evaluating column characteristics for partitioning and ordering
+ - Identifying optimal column combinations for keys
+ - Calculating overall data quality metrics
+ - Providing recommendations for database optimization
+
+## Technical Architecture
+
+### Core Components
+
+#### Materialized View Management
+The extension implements a structured approach to materialized view creation and management using a naming convention pattern:
+- `vtw_*`: Source view with content hash, synthetic key, and encoding status
+- `matc_*`: Materialized view derived from the source view
+- `vm_*`: View for reading cleaned data (filtering out encoding issues)
+- `vprob_*`: View for displaying problematic data with encoding issues
+
+#### Table Fitness Analysis
+The extension provides analytical functions to:
+- Sample table data appropriately based on statistical methods
+- Assess individual column characteristics
+- Evaluate column combinations for uniqueness and discriminatory power
+- Calculate a data quality index (DQI) based on nulls, encoding, and uniqueness
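+
+As implemented in `c77_mvc_calculate_dqi`, each column contributes `0.4 * (1 - null_ratio) + 0.4 * (1 - encoding_issue_ratio) + 0.2 * uniqueness_ratio`, and the DQI is the average of these component scores scaled to 0-100.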
+
+### Database Schema
+The extension creates one table:
+- `c77_mvc_table_fitness_audit`: Stores the results of table fitness analyses
+
+### Key Functions
+
+#### Materialized View Management
+1. `c77_mvc_create_optimized_matv`: Creates a set of views/materialized views with content hashing and synthetic keys
+2. `c77_mvc_manage_matv_health`: Monitors materialized view health and performs maintenance actions
+3. `c77_mvc_check_matv_mismatches`: Compares materialized views with source views to detect staleness
+4. `c77_mvc_estimate_matv_refresh_time`: Estimates refresh time based on historical performance
+
+#### Table Fitness Analysis
+1. `c77_mvc_analyze_table_fitness`: Main entry point for analyzing table structure and data quality
+2. `c77_mvc_analyze_column_stats`: Evaluates individual column characteristics
+3. `c77_mvc_analyze_column_combinations`: Identifies effective column combinations for partitioning
+4. `c77_mvc_calculate_dqi`: Calculates a data quality index based on multiple metrics
+
+## Implementation Details
+
+### Statistical Sampling
+The extension employs statistical sampling methods to efficiently analyze large tables:
+- Uses confidence level and margin of error parameters to calculate appropriate sample sizes
+- Implements table sampling using PostgreSQL's TABLESAMPLE clause
+- Adjusts sample sizes dynamically based on validation type (quick, daily, full)
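+
+The sample-size calculation is Cochran's formula with a finite-population correction, so even very large tables need only small samples at the default 99% confidence level and 3% margin of error:
+
+```sql
+-- Defaults: confidence_level = 0.99, margin_of_error = 0.03
+SELECT public.c77_mvc_calculate_sample_size(5000000);  -- about 1843 rows
+```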
+
+### Synthetic Key Generation
+For materialized views, the extension:
+- Creates synthetic keys using ROW_NUMBER() with custom PARTITION BY and ORDER BY clauses
+- Handles timestamp parsing and ordering intelligently
+- Ensures deterministic ordering for consistent key generation
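+
+A minimal sketch of the pattern, with illustrative table and column names:
+
+```sql
+-- One numbered sequence per (customer_id, region) group, newest first;
+-- with filter_latest_only, the extension keeps rows where synthetic_key = 1
+SELECT (ROW_NUMBER() OVER (
+            PARTITION BY customer_id, region
+            ORDER BY last_updated DESC
+        ))::bigint AS synthetic_key,
+       t.*
+FROM sales.customers t;
+```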
+
+### Content Hashing
+The extension uses MD5 hashing of row data to:
+- Detect changes between source data and materialized views
+- Enable efficient comparison for staleness detection
+- Facilitate incremental refresh decisions
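+
+The hash is built as in the generated `vtw_` views (illustrative column names):
+
+```sql
+-- Collapse the selected columns into one row value, then digest it
+SELECT md5(CAST(ROW(t.customer_id, t.region, t.last_updated) AS text)) AS content_hash,
+       t.*
+FROM sales.customers t;
+```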
+
+### Character Encoding Handling
+The extension provides robust handling of character encoding issues:
+- Detects non-ASCII characters using regex pattern matching
+- Segregates problematic data into separate views
+- Provides clean views for standard operations
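+
+Detection relies on a byte-range regex, as used in the generated views:
+
+```sql
+SELECT 'café' ~ '[^\x00-\x7F]';  -- true: contains a non-ASCII character
+SELECT 'cafe' ~ '[^\x00-\x7F]';  -- false: pure ASCII
+```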
+
+## Security and Performance Considerations
+
+### Security
+- The extension uses proper quoting and identifier escaping throughout to prevent SQL injection
+- Error handling includes careful message construction to avoid exposing sensitive information
+- Temporary tables are used to isolate analysis operations
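+
+A generic illustration of the quoting pattern (inside PL/pgSQL):
+
+```sql
+-- %I applies identifier quoting, so user-supplied names cannot
+-- break out of the generated statement
+EXECUTE format('SELECT COUNT(*) FROM %I.%I', schema_name, table_name);
+```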
+
+### Performance
+- Statistical sampling is employed to analyze large tables efficiently
+- The extension uses table partitioning and appropriate indexing for materialized views
+- Validation types (quick, daily, full) allow for different performance/accuracy tradeoffs
+- Refresh operations consider existing performance statistics to make intelligent decisions
+
+### Dependencies
+- Relies on the `c77_dbh` extension for certain operations
+- Uses the `c77_dbh_matv_stats` table for historical performance tracking
+- Verifies dependency existence at installation time
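+
+The install-time check is essentially:
+
+```sql
+-- Mirrors the guard at the top of c77_mvc--1.0.sql
+SELECT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'c77_dbh');
+```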
+
+## Code Quality Assessment
+
+### Strengths
+1. **Robust Error Handling**: Comprehensive `BEGIN ... EXCEPTION` blocks throughout the codebase
+2. **Parameterization**: Extensive use of parameters allows for flexible configuration
+3. **Documentation**: Clear inline documentation of function purposes and parameters
+4. **Statistical Approach**: Uses sound statistical methods for sampling and analysis
+5. **Modular Design**: Functions are well-organized with clear responsibilities
+
+### Areas for Improvement
+1. **Configuration Management**: Some parameters are hardcoded and could be externalized
+2. **Testing Coverage**: No explicit test functions or frameworks are included
+3. **Schema Management**: Some functions assume specific table structures without validation
+4. **Code Duplication**: Some SQL generation patterns are repeated across functions
+5. **Performance Metrics**: Limited documentation of expected performance characteristics
+
+## Recommendations
+
+### Documentation Enhancements
+1. Add comprehensive function-level documentation explaining parameter use and return values
+2. Document the expected table structures and naming conventions
+3. Provide examples of common usage patterns for key functions
+4. Add performance guidance for large databases
+
+### Feature Enhancements
+1. **Configuration Management**: Create a configuration table for tunable parameters
+2. **Monitoring Dashboard**: Add functions to generate monitoring reports for DBA use
+3. **Batch Operations**: Add capabilities for managing multiple materialized views simultaneously
+4. **Custom Metrics**: Allow users to define custom fitness metrics for specific use cases
+
+### Technical Improvements
+1. **Parallelization**: Add support for parallel analysis of large tables
+2. **Versioning**: Improve version management for schema changes
+3. **Testing**: Add a comprehensive test suite
+4. **Logging**: Enhance logging capabilities for troubleshooting
+5. **Performance Optimization**: Optimize sampling methods for very large tables
+
+## Conclusion
+The c77_mvc extension provides a well-designed and comprehensive solution for managing materialized views and analyzing table fitness in PostgreSQL. Its approach to content hashing, synthetic key generation, and encoding issue handling is particularly noteworthy. The statistical sampling methods enable efficient analysis of large tables.
+
+The extension would benefit from improved configuration management, enhanced documentation, and a more structured approach to testing. Overall, it represents a valuable tool for database administrators working with complex PostgreSQL environments, particularly those dealing with data quality issues and materialized view management.
+
+Key strengths include the robust error handling, statistical approach to sampling, and comprehensive materialized view management capabilities. With the suggested improvements, this extension could become an essential part of a PostgreSQL database administrator's toolkit.
diff --git a/USAGE.md b/USAGE.md
new file mode 100644
index 0000000..ca826b2
--- /dev/null
+++ b/USAGE.md
@@ -0,0 +1,335 @@
+# c77_mvc PostgreSQL Extension Usage Guide
+
+This guide provides detailed instructions on how to use the c77_mvc extension for materialized view management and table fitness analysis in PostgreSQL.
+
+## Table of Contents
+
+1. [Overview](#overview)
+2. [Table Fitness Analysis](#table-fitness-analysis)
+3. [Materialized View Management](#materialized-view-management)
+4. [Materialized View Health Monitoring](#materialized-view-health-monitoring)
+5. [Advanced Use Cases](#advanced-use-cases)
+6. [Best Practices](#best-practices)
+7. [Function Reference](#function-reference)
+
+## Overview
+
+The c77_mvc extension provides two main sets of functionality:
+
+1. **Table Fitness Analysis**: Evaluate table structure for data quality, partitioning suitability, and optimization opportunities
+2. **Materialized View Management**: Create and maintain optimized materialized views with content hashing, synthetic keys, and encoding status tracking
+
+## Table Fitness Analysis
+
+Table fitness analysis helps you evaluate table structure and data quality to inform optimization decisions.
+
+### Basic Table Analysis
+
+To perform a basic analysis on a table:
+
+```sql
+SELECT * FROM public.c77_mvc_analyze_table_fitness('schema_name', 'table_name');
+```
+
+This returns a JSON object containing:
+- Column statistics
+- Recommended partition combinations
+- Order-by candidates
+- Data quality index
+- Analysis notes
+
+### Example with Excluding Key Columns
+
+If you have columns that should not be considered for key combinations:
+
+```sql
+SELECT * FROM public.c77_mvc_analyze_table_fitness(
+ 'schema_name',
+ 'table_name',
+ ARRAY['id', 'created_at']::text[]
+);
+```
+
+### Interpreting Analysis Results
+
+The analysis result includes:
+
+1. **Column Stats**: Individual column metrics including:
+ - Null ratio
+ - Uniqueness ratio
+ - Encoding issue ratio
+ - Fitness score
+
+2. **Recommended Partition Combinations**: Column pairs that work well together for partitioning:
+ - Uniqueness ratio
+ - Discrimination power
+ - Average fitness score
+
+3. **Order-by Candidates**: Columns suitable for ordering data:
+ - Timestamp columns
+ - Text columns parseable as timestamps
+
+4. **Data Quality Index (DQI)**: Overall score from 0-100 indicating data quality
+
+Example query to extract key information:
+
+```sql
+SELECT
+ run_id,
+ run_timestamp,
+ analysis_result->>'data_quality_index' as dqi,
+ analysis_result->'recommended_partition_combinations' as partition_recommendations
+FROM public.c77_mvc_table_fitness_audit
+WHERE source_schema = 'schema_name' AND source_table = 'table_name'
+ORDER BY run_id DESC
+LIMIT 1;
+```
+
+## Materialized View Management
+
+The extension provides tools to create and manage optimized materialized views.
+
+### Creating Optimized Materialized Views
+
+To create an optimized materialized view with synthetic keys and content hashing:
+
+```sql
+SELECT * FROM public.c77_mvc_create_optimized_matv(
+ 'source_schema', -- Source schema name
+ 'source_table', -- Source table name
+ 'target_schema', -- Target schema for materialized view
+ 'matc_target_mv_name', -- Target materialized view name (should start with matc_)
+ ARRAY['column1', 'column2'], -- Partition columns
+ ARRAY['timestamp_column'], -- Order-by columns
+ ARRAY['exclude_column1'], -- Columns to exclude from content hash calculation (optional)
+ false -- Filter to get only latest records (optional)
+);
+```
+
+This creates:
+1. `vtw_target_mv_name`: A view with regexp_replace for character columns
+2. `matc_target_mv_name`: A materialized view derived from the vtw_ view
+3. `vm_target_mv_name`: A view that filters out encoding issues
+4. `vprob_target_mv_name`: A view showing only records with encoding issues
+
+### View Structure and Purpose
+
+When you create an optimized materialized view, multiple objects are created:
+
+| Object Type | Naming Pattern | Purpose |
+|-------------|----------------|---------|
+| View | vtw_* | Source view with content hash, synthetic key, and encoding status |
+| Materialized View | matc_* | Materialized copy of the vtw_ view |
+| View | vm_* | Clean data view (excludes encoding issues) |
+| View | vprob_* | Problematic data view (only encoding issues) |
+
+### Example Use Case
+
+Scenario: Creating a materialized view of customer data:
+
+```sql
+SELECT * FROM public.c77_mvc_create_optimized_matv(
+ 'sales',
+ 'customers',
+ 'reporting',
+ 'matc_customer_summary',
+ ARRAY['customer_id', 'region'],
+ ARRAY['last_updated'],
+ ARRAY['notes', 'internal_comments'],
+ false
+);
+```
+
+To query clean data:
+```sql
+SELECT * FROM reporting.vm_customer_summary;
+```
+
+To check for encoding issues:
+```sql
+SELECT * FROM reporting.vprob_customer_summary;
+```
+
+## Materialized View Health Monitoring
+
+The extension provides tools to monitor and maintain the health of materialized views.
+
+### Checking Materialized View Health
+
+```sql
+SELECT * FROM public.c77_mvc_manage_matv_health(
+ 'schema_name', -- Schema containing the materialized view
+ 'matc_view_name', -- Materialized view name (should start with matc_)
+ 'quick', -- Validation type: 'quick', 'daily', or 'full'
+ NULL -- Action: NULL, 'refresh', 'repair', or 'reindex'
+);
+```
+
+Validation types:
+- `quick`: Fast check with 0.1% sample (3-day threshold)
+- `daily`: More thorough check with 1% sample (1-day threshold)
+- `full`: Complete check with 100% sample (12-hour threshold)
+
+### Automated Refresh
+
+To check health and refresh if needed:
+
+```sql
+SELECT * FROM public.c77_mvc_manage_matv_health(
+ 'schema_name',
+ 'matc_view_name',
+ 'daily',
+ 'refresh' -- Will refresh if stale based on thresholds
+);
+```
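+
+If the pg_cron extension happens to be available (it is not a c77_mvc dependency; this is just one scheduling option), such a check-and-refresh can be scheduled nightly:
+
+```sql
+-- Assumes pg_cron is installed; runs at 02:00 every day
+SELECT cron.schedule(
+    'matc-nightly-refresh',
+    '0 2 * * *',
+    $$SELECT public.c77_mvc_manage_matv_health('schema_name', 'matc_view_name', 'daily', 'refresh')$$
+);
+```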
+
+### Monitoring Multiple Views
+
+Example script to monitor all materialized views in a schema:
+
+```sql
+DO $$
+DECLARE
+ view_record RECORD;
+ result JSONB;
+BEGIN
+ FOR view_record IN
+ SELECT matviewname
+ FROM pg_matviews
+ WHERE schemaname = 'target_schema'
+ AND matviewname LIKE 'matc_%'
+ LOOP
+ RAISE NOTICE 'Checking view: %', view_record.matviewname;
+ SELECT * FROM public.c77_mvc_manage_matv_health('target_schema', view_record.matviewname, 'quick', NULL) INTO result;
+ RAISE NOTICE 'Status: %', result->>'status';
+ END LOOP;
+END;
+$$;
+```
+
+## Advanced Use Cases
+
+### Customizing Character Encoding Handling
+
+The extension detects non-ASCII characters using the regex pattern `[^\x00-\x7F]`. For custom handling:
+
+Create your own view layered on the `vtw_` view (note the new alias: the `vtw_` view already exposes an `encoding_status` column, so reusing that name would fail):
+```sql
+CREATE OR REPLACE VIEW custom_schema.my_custom_vtw AS
+SELECT *,
+       CASE
+           WHEN column1 ~ '[^\x00-\x7F]' OR column2 ~ '[^\x20-\x7E]' THEN 'CUSTOM_ENCODING_ISSUE'
+           ELSE 'CLEAN'
+           END AS custom_encoding_status
+FROM schema_name.vtw_original_view;
+```
+
+### Batch Refresh Strategy
+
+Example of a batch refresh strategy based on analysis:
+
+```sql
+DO $$
+DECLARE
+ view_record RECORD;
+ health_result JSONB;
+ estimated_time INTERVAL;
+ total_time INTERVAL := '0 seconds'::INTERVAL;
+ max_batch_time INTERVAL := '2 hours'::INTERVAL;
+    views_to_refresh TEXT[] := '{}';
+    view_name TEXT;  -- loop variable for FOREACH (record fields cannot be FOREACH targets)
+BEGIN
+ -- Gather health stats and estimated times
+ FOR view_record IN
+ SELECT matviewname
+ FROM pg_matviews
+ WHERE schemaname = 'target_schema'
+ AND matviewname LIKE 'matc_%'
+ LOOP
+ SELECT * FROM public.c77_mvc_manage_matv_health('target_schema', view_record.matviewname, 'quick', NULL)
+ INTO health_result;
+
+ IF health_result->>'status' = 'Stale' THEN
+ estimated_time := (health_result->>'estimated_refresh_time')::INTERVAL;
+
+ -- Add to batch if we don't exceed max time
+ IF (total_time + estimated_time) < max_batch_time THEN
+ views_to_refresh := array_append(views_to_refresh, view_record.matviewname);
+ total_time := total_time + estimated_time;
+ END IF;
+ END IF;
+ END LOOP;
+
+ -- Refresh the batch
+    FOREACH view_name IN ARRAY views_to_refresh
+    LOOP
+        RAISE NOTICE 'Refreshing %', view_name;
+        PERFORM public.c77_mvc_manage_matv_health('target_schema', view_name, 'quick', 'refresh');
+ END LOOP;
+END;
+$$;
+```
+
+## Best Practices
+
+### Table Fitness Analysis
+
+1. **Analyze Regularly**: Run table fitness analysis regularly to track data quality changes
+2. **Compare Over Time**: Store historical analysis results for trend tracking
+3. **Sample Size Consideration**: For very large tables, adjust confidence level and margin of error:
+ ```sql
+ SELECT public.c77_mvc_calculate_sample_size(10000000, 0.95, 0.05);
+ ```
+
+### Materialized View Management
+
+1. **Naming Convention**: Follow the expected naming pattern:
+ - Materialized views must start with `matc_`
+ - Source views will be created with `vtw_` prefix
+ - Read views will be created with `vm_` prefix
+ - Problem record views will have `vprob_` prefix
+
+2. **Partition Column Selection**: Choose partition columns based on:
+ - Table fitness analysis recommendations
+ - High uniqueness ratio
+ - Low null ratio
+ - Business requirements for data segmentation
+
+3. **Order-by Column Selection**: Choose columns that:
+ - Represent timestamps or dates
+ - Have a clear logical ordering in the data
+ - Are regularly populated (low null ratio)
+
+4. **Refresh Strategy**: Consider:
+ - Data change frequency
+ - Query load patterns
+ - Validation type based on criticality
+
+5. **Performance Monitoring**:
+ - Track refresh times
+ - Monitor the c77_dbh_matv_stats table for historical performance
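+
+For example (the exact column layout of `c77_dbh_matv_stats` comes from c77_dbh, so adjust to your version):
+
+```sql
+-- Sample the recorded refresh statistics
+SELECT * FROM public.c77_dbh_matv_stats LIMIT 10;
+```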
+
+## Function Reference
+
+### Table Fitness Analysis
+
+| Function | Description |
+|----------|-------------|
+| `c77_mvc_analyze_table_fitness(source_schema, source_table, exclude_key_columns)` | Main function for table fitness analysis |
+| `c77_mvc_calculate_sample_size(total_rows, confidence_level, margin_of_error)` | Calculate appropriate sample size for analysis |
+| `c77_mvc_analyze_column_stats(temp_table_name, col_name, column_type, sample_size, total_rows, exclude_key_columns)` | Analyze individual column statistics |
+| `c77_mvc_analyze_column_combinations(temp_table_name, column_stats, sample_size, total_rows, exclude_key_columns)` | Analyze column combinations for partitioning |
+| `c77_mvc_identify_order_by_candidates(temp_table_name, column_stats)` | Identify columns suitable for ordering |
+| `c77_mvc_calculate_dqi(column_stats)` | Calculate Data Quality Index |
+
+### Materialized View Management
+
+| Function | Description |
+|----------|-------------|
+| `c77_mvc_create_optimized_matv(source_schema, source_table, target_schema, target_matview, partition_columns, order_by_columns, exclude_columns_from_hash, filter_latest_only)` | Create an optimized materialized view |
+| `c77_mvc_manage_matv_health(target_schema, matview_name, validation_type, action)` | Check and manage materialized view health |
+| `c77_mvc_check_matv_mismatches(target_schema, matview_name, validation_type)` | Check for mismatches between source and materialized view |
+| `c77_mvc_create_indexes(target_schema, target_mv_name, partition_columns)` | Create indexes on a materialized view |
+| `c77_mvc_validate_matv_inputs(schema_name, matview_name, vtw_name)` | Validate materialized view inputs |
+| `c77_mvc_validate_order_by_columns(source_schema, source_table, order_by_columns)` | Validate order-by columns |
+| `c77_mvc_collect_matv_stats(full_matview_name, full_vtw_name)` | Collect materialized view statistics |
+| `c77_mvc_estimate_matv_refresh_time(full_matview_name)` | Estimate refresh time for a materialized view |
diff --git a/c77_mvc--1.0.sql b/c77_mvc--1.0.sql
new file mode 100644
index 0000000..8174738
--- /dev/null
+++ b/c77_mvc--1.0.sql
@@ -0,0 +1,1693 @@
+-- c77_mvc--1.0.sql
+
+-- Check if c77_dbh extension is installed
+DO $$
+ BEGIN
+ IF NOT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'c77_dbh') THEN
+ RAISE NOTICE 'The c77_dbh extension is not installed. This extension is required for c77_mvc to function properly.';
+ RAISE NOTICE 'Please install c77_dbh first by following the instructions at https://git.jctr3.com/trogers1884/c77_dbh';
+ RAISE NOTICE 'Example: CREATE EXTENSION c77_dbh; (after compiling and installing from source if necessary)';
+ RAISE EXCEPTION 'Installation aborted due to missing c77_dbh extension.';
+ END IF;
+ END;
+$$;
+
+-- If we reach here, c77_dbh is installed, so proceed with the installation
+
+-- Create the table
+CREATE TABLE IF NOT EXISTS public.c77_mvc_table_fitness_audit (
+ run_id BIGSERIAL,
+ run_timestamp timestamp without time zone DEFAULT CURRENT_TIMESTAMP,
+ source_schema text COLLATE pg_catalog."default",
+ source_table text COLLATE pg_catalog."default",
+ analysis_result jsonb,
+ notes text[] COLLATE pg_catalog."default",
+ CONSTRAINT table_fitness_audit_pkey PRIMARY KEY (run_id)
+) TABLESPACE pg_default;
+
+CREATE INDEX IF NOT EXISTS idx_table_fitness_audit_table
+ ON public.c77_mvc_table_fitness_audit USING btree
+ (source_schema COLLATE pg_catalog."default" ASC NULLS LAST, source_table COLLATE pg_catalog."default" ASC NULLS LAST)
+ TABLESPACE pg_default;
+
+CREATE INDEX IF NOT EXISTS idx_table_fitness_audit_timestamp
+ ON public.c77_mvc_table_fitness_audit USING btree
+ (run_timestamp ASC NULLS LAST)
+ TABLESPACE pg_default;
+
+-- Define the functions in dependency order
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_analyze_column_combinations(temp_table_name text, column_stats jsonb, sample_size bigint, total_rows bigint, exclude_key_columns text[]) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ column_combinations jsonb := '{}';
+ combination_result record;
+ distinct_count bigint;
+ uniqueness_ratio numeric;
+ duplicate_count bigint;
+ synthetic_uniqueness numeric;
+ adjusted_sample_size bigint;
+ sampling_percentage numeric;
+BEGIN
+ -- Adjust sample size to not exceed total rows
+ adjusted_sample_size := LEAST(sample_size, total_rows);
+ sampling_percentage := (adjusted_sample_size::float / total_rows * 100);
+
+ -- Analyze column combinations
+ FOR combination_result IN (
+ SELECT c1.key AS col1, c2.key AS col2,
+ ((c1.value->>'fitness_score')::float + (c2.value->>'fitness_score')::float) / 2 AS avg_fitness
+ FROM jsonb_each(column_stats) c1,
+ jsonb_each(column_stats) c2
+ WHERE c1.key < c2.key
+ AND (c1.value->>'fitness_score')::float >= 70
+ AND (c2.value->>'fitness_score')::float >= 70
+ AND NOT (c1.key = ANY(exclude_key_columns))
+ AND NOT (c2.key = ANY(exclude_key_columns))
+ ORDER BY avg_fitness DESC
+ LIMIT 5
+ )
+ LOOP
+ -- Test uniqueness of the combination
+ EXECUTE format('SELECT COUNT(DISTINCT (%I, %I)) FROM (SELECT %I, %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t',
+ combination_result.col1, combination_result.col2,
+ combination_result.col1, combination_result.col2,
+ temp_table_name,
+ sampling_percentage::text, adjusted_sample_size)
+ INTO distinct_count;
+ uniqueness_ratio := distinct_count::float / adjusted_sample_size;
+
+ -- Simulate synthetic key uniqueness
+ EXECUTE format('SELECT COUNT(*) FROM (
+ SELECT ROW_NUMBER() OVER (PARTITION BY %I, %I ORDER BY random()) AS rn
+ FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s
+ ) t WHERE rn > 1',
+ combination_result.col1, combination_result.col2,
+ temp_table_name,
+ sampling_percentage::text, adjusted_sample_size)
+ INTO duplicate_count;
+ synthetic_uniqueness := 1 - (duplicate_count::float / adjusted_sample_size);
+
+ -- Store combination stats
+ column_combinations := column_combinations || jsonb_build_object(
+ format('%s,%s', combination_result.col1, combination_result.col2),
+ jsonb_build_object(
+ 'uniqueness_ratio', uniqueness_ratio,
+ 'synthetic_uniqueness', synthetic_uniqueness,
+ 'discrimination_power', uniqueness_ratio,
+ 'avg_fitness_score', combination_result.avg_fitness
+ )
+ );
+ END LOOP;
+
+ RETURN column_combinations;
+EXCEPTION WHEN OTHERS THEN
+ RETURN jsonb_build_object(
+ 'error', format('Failed to analyze column combinations: %s', SQLERRM)
+ );
+END;
+$$;
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_assemble_matv_result(target_schema text, target_mv_name text, partition_columns text[], order_by_columns text[], exclude_hash_columns text[], where_clause text, custom_sql text, notes text[]) RETURNS json
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ vtw_name text := replace(target_mv_name, 'matc_', 'vtw_');
+ vm_name text := replace(target_mv_name, 'matc_', 'vm_');
+BEGIN
+ notes := array_append(notes, format('Process completed at %s', clock_timestamp()));
+ RETURN json_build_object(
+ 'message', format('Created view %I.%I, materialized view %I.%I, and view %I.%I for reading.',
+ target_schema, vtw_name, target_schema, target_mv_name, target_schema, vm_name),
+ 'view_name', format('%I.%I', target_schema, vtw_name),
+ 'matview_name', format('%I.%I', target_schema, target_mv_name),
+ 'vm_view_name', format('%I.%I', target_schema, vm_name),
+ 'partition_columns', partition_columns,
+ 'order_by_columns', order_by_columns,
+ 'exclude_hash_columns', exclude_hash_columns,
+ 'where_clause', where_clause,
+ 'custom_sql', custom_sql,
+ 'notes', notes
+ );
+EXCEPTION WHEN OTHERS THEN
+ RETURN json_build_object(
+ 'error', format('Failed to assemble result: %s', SQLERRM),
+ 'notes', notes
+ );
+END;
+$$;
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_assemble_result(source_schema text, source_table text, column_stats jsonb, column_combinations jsonb, order_by_candidates jsonb, data_quality_index numeric, notes text[], temp_table_name text) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ result_json jsonb;
+ run_id integer;
+ updated_notes text[] := notes; -- Create a local copy of notes
+BEGIN
+ -- Build the result JSON
+ updated_notes := array_append(updated_notes, format('Analysis completed at %s', clock_timestamp()));
+ result_json := jsonb_build_object(
+ 'message', format('Analysis of %I.%I completed', source_schema, source_table),
+ 'column_stats', column_stats,
+ 'recommended_partition_combinations', column_combinations,
+ 'order_by_candidates', order_by_candidates,
+ 'data_quality_index', ROUND(data_quality_index, 2),
+ 'notes', updated_notes
+ );
+
+ -- Store results in audit table
+ INSERT INTO public.c77_mvc_table_fitness_audit (
+ source_schema,
+ source_table,
+ analysis_result,
+ notes
+ )
+ VALUES (
+ source_schema,
+ source_table,
+ result_json,
+ updated_notes
+ )
+    RETURNING c77_mvc_table_fitness_audit.run_id INTO run_id;
+
+ -- Add run_id to the result
+ result_json := result_json || jsonb_build_object('run_id', run_id);
+
+ -- Clean up temporary table
+ EXECUTE format('DROP TABLE IF EXISTS %I', temp_table_name);
+ updated_notes := array_append(updated_notes, format('Dropped temporary table %s', temp_table_name));
+
+ -- Update result_json with the final notes
+ result_json := result_json || jsonb_build_object('notes', updated_notes);
+
+ RETURN result_json;
+EXCEPTION WHEN OTHERS THEN
+ RAISE NOTICE 'Error assembling result: %', SQLERRM;
+ RETURN jsonb_build_object(
+ 'error', format('Failed to assemble result: %s', SQLERRM),
+ 'notes', updated_notes
+ );
+END;
+$$;
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_calculate_dqi(column_stats jsonb) RETURNS numeric
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ dqi_components jsonb := '{}';
+ col_name text;
+ null_ratio numeric;
+ encoding_issue_ratio numeric;
+ uniqueness_ratio numeric;
+ component_score numeric;
+BEGIN
+ -- Calculate DQI components for each column
+ FOR col_name IN
+ SELECT key
+ FROM jsonb_object_keys(column_stats) AS key
+ LOOP
+ null_ratio := (column_stats->col_name->>'null_ratio')::numeric;
+ encoding_issue_ratio := (column_stats->col_name->>'encoding_issue_ratio')::numeric;
+ uniqueness_ratio := (column_stats->col_name->>'uniqueness_ratio')::numeric;
+
+ component_score := (1 - null_ratio) * 0.4 + (1 - encoding_issue_ratio) * 0.4 + uniqueness_ratio * 0.2;
+ dqi_components := dqi_components || jsonb_build_object(col_name, component_score);
+ END LOOP;
+
+ -- Calculate average DQI across all columns (scaled to 0-100)
+ RETURN (SELECT AVG(value::numeric) * 100
+ FROM jsonb_each_text(dqi_components));
+EXCEPTION WHEN OTHERS THEN
+ RAISE NOTICE 'Error calculating DQI: %', SQLERRM;
+ RETURN 0;
+END;
+$$;
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_calculate_matv_sample_size(full_matview_name text, params jsonb) RETURNS bigint
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ total_matview_records bigint;
+ sample_size bigint;
+BEGIN
+ -- Get total records
+ EXECUTE format('SELECT COUNT(*) FROM %s', full_matview_name)
+ INTO total_matview_records;
+
+ -- Calculate sample size using c77_mvc_calculate_sample_size
+ sample_size := public.c77_mvc_calculate_sample_size(total_matview_records);
+
+ RETURN sample_size;
+EXCEPTION WHEN OTHERS THEN
+ RAISE NOTICE 'Error calculating sample size: %', SQLERRM;
+ RETURN 100;
+END;
+$$;
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_calculate_sample_size(total_rows bigint, confidence_level numeric DEFAULT 0.99, margin_of_error numeric DEFAULT 0.03) RETURNS bigint
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ z_score numeric;
+ n0 numeric;
+ p numeric := 0.5; -- Conservative estimate for maximum variability
+ sample_size bigint;
+BEGIN
+ -- Map confidence level to Z-score
+ z_score := CASE
+ WHEN confidence_level = 0.90 THEN 1.645
+ WHEN confidence_level = 0.95 THEN 1.96
+ WHEN confidence_level = 0.99 THEN 2.576
+ ELSE 2.576 -- Default to 99%
+ END;
+
+ -- Initial sample size (infinite population)
+ n0 := (z_score * z_score * p * (1 - p)) / (margin_of_error * margin_of_error);
+
+ -- Adjust for finite population
+ sample_size := CEIL(n0 * total_rows / (n0 + total_rows));
+ sample_size := GREATEST(sample_size, 1000); -- Minimum sample size for small tables
+ sample_size := LEAST(sample_size, total_rows); -- Cap at total rows
+
+ RETURN sample_size;
+END;
+$$;
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_collect_matv_stats(full_matview_name text, full_vtw_name text) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ total_matview_records bigint;
+ clean_records bigint;
+ encoding_issues bigint;
+ total_vtw_records bigint;
+ last_matview_update timestamp with time zone;
+ last_vtw_update timestamp with time zone;
+ size_pretty_string text;
+ size_mb numeric;
+BEGIN
+ -- Collect stats
+ EXECUTE format('SELECT COUNT(*), COUNT(*) FILTER (WHERE encoding_status = ''CLEAN''), COUNT(*) FILTER (WHERE encoding_status IS DISTINCT FROM ''CLEAN'') FROM %s', full_matview_name)
+ INTO total_matview_records, clean_records, encoding_issues;
+
+ EXECUTE format('SELECT COUNT(*) FROM %s', full_vtw_name)
+ INTO total_vtw_records;
+
+ EXECUTE format('SELECT MAX(rowlastupdated) FROM %s', full_matview_name)
+ INTO last_matview_update;
+
+ EXECUTE format('SELECT MAX(rowlastupdated) FROM %s', full_vtw_name)
+ INTO last_vtw_update;
+
+ EXECUTE format('SELECT pg_size_pretty(pg_total_relation_size(''%s'')::BIGINT)::TEXT', full_matview_name)
+ INTO size_pretty_string;
+
+ size_mb := regexp_replace(size_pretty_string, '[^0-9.]', '', 'g')::NUMERIC;
+
+ RETURN jsonb_build_object(
+ 'total_matview_records', total_matview_records,
+ 'clean_records', clean_records,
+ 'encoding_issues', encoding_issues,
+ 'total_vtw_records', total_vtw_records,
+ 'last_matview_update', last_matview_update,
+ 'last_vtw_update', last_vtw_update,
+ 'size_mb', size_mb
+ );
+EXCEPTION WHEN OTHERS THEN
+ RAISE NOTICE 'Error fetching stats: %', SQLERRM;
+ RETURN jsonb_build_object(
+ 'total_matview_records', 0,
+ 'clean_records', 0,
+ 'encoding_issues', 0,
+ 'total_vtw_records', 0,
+ 'last_matview_update', NULL,
+ 'last_vtw_update', NULL,
+ 'size_mb', 0
+ );
+END;
+$$;
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_create_indexes(target_schema text, target_mv_name text, partition_columns text[]) RETURNS text[]
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ notes text[] := '{}';
+ encoding_index_name text;
+ content_hash_index_name text;
+ unique_index_name text;
+ vtw_name text := replace(target_mv_name, 'matc_', 'vtw_');
+ vm_name text := replace(target_mv_name, 'matc_', 'vm_');
+BEGIN
+ -- Index on encoding_status
+    encoding_index_name := 'idx_' || target_mv_name || '_encoding_status';
+    EXECUTE format('CREATE INDEX %I ON %I.%I (encoding_status)',
+                   encoding_index_name, target_schema, target_mv_name);
+    notes := array_append(notes, 'Created index on encoding_status');
+
+ -- Index on content_hash
+ content_hash_index_name := 'idx_' || target_mv_name || '_content_hash';
+ EXECUTE format('CREATE INDEX %I ON %I.%I (content_hash)',
+ content_hash_index_name, target_schema, target_mv_name);
+ notes := array_append(notes, 'Created index on content_hash');
+
+ -- Unique index on synthetic_key and partition columns
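+    -- NOTE: unique_index_name is reused below: first it holds the quoted
+    -- partition-column list, then the full CREATE INDEX statement.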
+ SELECT string_agg(quote_ident(unnest), ', ')
+ INTO unique_index_name
+ FROM unnest(partition_columns);
+
+ unique_index_name := format(
+ 'CREATE UNIQUE INDEX %I ON %I.%I (synthetic_key, %s)',
+ 'idx_' || target_mv_name || '_synthetic_key', target_schema, target_mv_name,
+ COALESCE(unique_index_name, '1')
+ );
+
+ BEGIN
+ EXECUTE unique_index_name;
+ notes := array_append(notes, 'Successfully created unique index on synthetic_key and partition columns');
+ EXCEPTION WHEN OTHERS THEN
+ EXECUTE format('DROP MATERIALIZED VIEW %I.%I', target_schema, target_mv_name);
+ EXECUTE format('DROP VIEW IF EXISTS %I.%I', target_schema, vtw_name);
+ EXECUTE format('DROP VIEW IF EXISTS %I.%I', target_schema, vm_name);
+ notes := array_append(notes, format('Failed to create unique index: %s', SQLERRM));
+
+ SELECT string_agg(quote_ident(unnest), ', ')
+ INTO unique_index_name
+ FROM unnest(partition_columns);
+
+ unique_index_name := format(
+ 'CREATE INDEX %I ON %I.%I (synthetic_key, %s)',
+ 'idx_' || target_mv_name || '_synthetic_key_fallback', target_schema, target_mv_name,
+ COALESCE(unique_index_name, '1')
+ );
+
+ EXECUTE unique_index_name;
+ notes := array_append(notes, 'Created non-unique fallback index due to unique index failure');
+ END;
+
+ RETURN notes;
+EXCEPTION WHEN OTHERS THEN
+ RETURN array_append(notes, format('Error creating indexes: %s', SQLERRM));
+END;
+$$;
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_create_optimized_matv(source_schema text, source_table text, target_schema text, target_matview text, partition_columns text[], order_by_columns text[], exclude_columns_from_hash text[] DEFAULT ARRAY[]::text[], filter_latest_only boolean DEFAULT false) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ source_full_name text := quote_ident(source_schema) || '.' || quote_ident(source_table);
+ vtw_name text := replace(target_matview, 'matc_', 'vtw_');
+ vm_name text := replace(target_matview, 'matc_', 'vm_');
+ vprob_name text := replace(target_matview, 'matc_', 'vprob_');
+ vtw_full_name text := quote_ident(target_schema) || '.' || quote_ident(vtw_name);
+ vm_full_name text := quote_ident(target_schema) || '.' || quote_ident(vm_name);
+ vprob_full_name text := quote_ident(target_schema) || '.' || quote_ident(vprob_name);
+ matview_full_name text := quote_ident(target_schema) || '.' || quote_ident(target_matview);
+ columns_list text;
+ vm_columns_list text;
+ hash_columns_list text;
+ encoding_check_list text;
+ partition_clause text := '';
+ order_by_clause text := '';
+ create_vtw_sql text;
+ create_matview_sql text;
+ create_vm_sql text;
+ create_vprob_sql text;
+ create_index_sql text;
+ notes text[] := '{}';
+ column_record record;
+BEGIN
+ -- Step 1: Get the list of columns with regexp_replace for non-partition character-based columns (for vtw_)
+ columns_list := '';
+ vm_columns_list := '';
+ FOR column_record IN (
+ SELECT column_name, data_type
+ FROM information_schema.columns
+ WHERE table_schema = source_schema
+ AND table_name = source_table
+ AND column_name NOT IN ('content_hash', 'synthetic_key')
+ ORDER BY column_name
+ ) LOOP
+ -- For vtw_: Apply regexp_replace to character-based non-partition columns
+ IF column_record.column_name = ANY(partition_columns) THEN
+ columns_list := columns_list || quote_ident(column_record.column_name) || ', ';
+ ELSIF column_record.data_type IN ('character', 'character varying', 'varchar', 'char', 'text') THEN
+ columns_list := columns_list || format('regexp_replace(t.%I, ''[^\x00-\x7F]''::text, ''PROBLEM''::text, ''g''::text) AS %I, ',
+ column_record.column_name, column_record.column_name);
+ ELSE
+ columns_list := columns_list || quote_ident(column_record.column_name) || ', ';
+ END IF;
+
+ -- For vm_ and vprob_: Just the column names, no regexp_replace or t. prefix
+ vm_columns_list := vm_columns_list || quote_ident(column_record.column_name) || ', ';
+ END LOOP;
+ columns_list := rtrim(columns_list, ', ');
+ vm_columns_list := rtrim(vm_columns_list, ', ');
+
+ -- Step 2: Validate partition_columns
+ IF array_length(partition_columns, 1) IS NULL OR array_length(partition_columns, 1) = 0 THEN
+ RAISE EXCEPTION 'partition_columns cannot be empty. At least one column is required for partitioning to ensure proper deduplication.';
+ END IF;
+
+ -- Step 3: Validate order_by_columns
+ IF array_length(order_by_columns, 1) IS NULL OR array_length(order_by_columns, 1) = 0 THEN
+ RAISE EXCEPTION 'order_by_columns cannot be empty. At least one column is required to ensure deterministic ordering for synthetic_key generation.';
+ END IF;
+
+ -- Step 4: Get the list of columns for the content_hash
+ IF exclude_columns_from_hash IS NULL OR array_length(exclude_columns_from_hash, 1) IS NULL OR array_length(exclude_columns_from_hash, 1) = 0 THEN
+ -- If exclude_columns_from_hash is empty, include all columns
+ SELECT string_agg('t.' || quote_ident(column_name), ', ')
+ INTO hash_columns_list
+ FROM information_schema.columns
+ WHERE table_schema = source_schema
+ AND table_name = source_table;
+ notes := array_append(notes, 'exclude_columns_from_hash is empty; including all columns from the source table in content_hash calculation');
+ ELSE
+ -- Otherwise, exclude the specified columns
+ SELECT string_agg('t.' || quote_ident(column_name), ', ')
+ INTO hash_columns_list
+ FROM information_schema.columns
+ WHERE table_schema = source_schema
+ AND table_name = source_table
+ AND column_name NOT IN (
+ SELECT unnest(exclude_columns_from_hash)
+ );
+
+ -- If excluding the specified columns results in no columns, include all columns as a fallback
+ IF hash_columns_list IS NULL THEN
+ SELECT string_agg('t.' || quote_ident(column_name), ', ')
+ INTO hash_columns_list
+ FROM information_schema.columns
+ WHERE table_schema = source_schema
+ AND table_name = source_table;
+ notes := array_append(notes, 'exclude_columns_from_hash excluded all columns; including all columns from the source table in content_hash calculation as a fallback');
+ END IF;
+ END IF;
+
+ -- Step 5: Get the list of columns for encoding_status check
+ SELECT string_agg(format('t.%I::text ~ ''[^\x00-\x7F]''::text', column_name), ' OR ')
+ INTO encoding_check_list
+ FROM information_schema.columns
+ WHERE table_schema = source_schema
+ AND table_name = source_table;
+
+ -- Step 6: Build partition and order-by clauses for synthetic_key
+ IF array_length(partition_columns, 1) > 0 THEN
+ partition_clause := 'PARTITION BY ' || array_to_string(partition_columns, ', ');
+ END IF;
+
+ IF array_length(order_by_columns, 1) > 0 THEN
+ order_by_clause := 'ORDER BY ' || array_to_string(order_by_columns, ', ');
+ END IF;
+
+ -- Step 7: Create the vtw_ view with content_hash, synthetic_key, and encoding_status
+ IF filter_latest_only THEN
+ create_vtw_sql := format('
+ CREATE OR REPLACE VIEW %s AS
+ SELECT *
+ FROM (
+ SELECT md5(CAST(ROW(%s) AS text)) AS content_hash,
+ %s,
+ (row_number() OVER (%s %s))::bigint AS synthetic_key,
+ CASE
+ WHEN %s THEN ''ENCODING_ISSUE''::text
+ ELSE ''CLEAN''::text
+ END AS encoding_status
+ FROM %s t
+ ) sub
+ WHERE synthetic_key = 1',
+ vtw_full_name,
+ hash_columns_list,
+ columns_list,
+ partition_clause,
+ order_by_clause,
+ encoding_check_list,
+ source_full_name
+ );
+ ELSE
+ create_vtw_sql := format('
+ CREATE OR REPLACE VIEW %s AS
+ SELECT md5(CAST(ROW(%s) AS text)) AS content_hash,
+ %s,
+ (row_number() OVER (%s %s))::bigint AS synthetic_key,
+ CASE
+ WHEN %s THEN ''ENCODING_ISSUE''::text
+ ELSE ''CLEAN''::text
+ END AS encoding_status
+ FROM %s t',
+ vtw_full_name,
+ hash_columns_list,
+ columns_list,
+ partition_clause,
+ order_by_clause,
+ encoding_check_list,
+ source_full_name
+ );
+ END IF;
+
+ EXECUTE create_vtw_sql;
+ notes := array_append(notes, format('Created view %s', vtw_full_name));
+
+ -- Step 8: Create the matc_ materialized view as a direct copy of vtw_
+ create_matview_sql := format('
+ CREATE MATERIALIZED VIEW IF NOT EXISTS %s AS
+ SELECT *
+ FROM %s',
+ matview_full_name,
+ vtw_full_name
+ );
+
+ EXECUTE create_matview_sql;
+ notes := array_append(notes, format('Created materialized view %s', matview_full_name));
+
+ -- Step 9: Add indexes on matc_
+ -- Index on encoding_status
+ create_index_sql := format('
+ CREATE INDEX IF NOT EXISTS %I ON %s (encoding_status)',
+ target_matview || '_encoding_status_idx',
+ matview_full_name
+ );
+ EXECUTE create_index_sql;
+ notes := array_append(notes, format('Created index %s on encoding_status', target_matview || '_encoding_status_idx'));
+
+ -- Index on content_hash
+ create_index_sql := format('
+ CREATE INDEX IF NOT EXISTS %I ON %s (content_hash)',
+ target_matview || '_content_hash_idx',
+ matview_full_name
+ );
+ EXECUTE create_index_sql;
+ notes := array_append(notes, format('Created index %s on content_hash', target_matview || '_content_hash_idx'));
+
+ -- Unique index on (synthetic_key, partition_columns)
+ IF array_length(partition_columns, 1) > 0 THEN
+ create_index_sql := format('
+ CREATE UNIQUE INDEX IF NOT EXISTS %I ON %s (synthetic_key, %s)',
+ target_matview || '_unique_key_idx',
+ matview_full_name,
+ array_to_string(partition_columns, ', ')
+ );
+ BEGIN
+ EXECUTE create_index_sql;
+ notes := array_append(notes, format('Created unique index %s on (synthetic_key, %s)', target_matview || '_unique_key_idx', array_to_string(partition_columns, ', ')));
+ EXCEPTION WHEN unique_violation THEN
+ notes := array_append(notes, format('Unexpected failure to create unique index %s on (synthetic_key, %s) due to duplicate values in %s. This should not happen due to synthetic_key generation. Check the synthetic_key logic in %s and look for duplicates using: SELECT synthetic_key, %s, count(*) FROM %s GROUP BY synthetic_key, %s HAVING count(*) > 1;',
+ target_matview || '_unique_key_idx',
+ array_to_string(partition_columns, ', '),
+ matview_full_name,
+ vtw_full_name,
+ array_to_string(partition_columns, ', '),
+ matview_full_name,
+ array_to_string(partition_columns, ', ')));
+ END;
+ END IF;
+
+ -- Step 10: Create the vm_ view, excluding content_hash, synthetic_key, and encoding_status, with WHERE encoding_status = 'CLEAN'
+ create_vm_sql := format('
+ CREATE OR REPLACE VIEW %s AS
+ SELECT %s
+ FROM %s
+ WHERE encoding_status = ''CLEAN''',
+ vm_full_name,
+ vm_columns_list,
+ matview_full_name
+ );
+
+ EXECUTE create_vm_sql;
+ notes := array_append(notes, format('Created view %s', vm_full_name));
+
+ -- Step 11: Create the vprob_ view, excluding content_hash, synthetic_key, and encoding_status, with WHERE encoding_status != 'CLEAN'
+ create_vprob_sql := format('
+ CREATE OR REPLACE VIEW %s AS
+ SELECT %s
+ FROM %s
+ WHERE encoding_status != ''CLEAN''',
+ vprob_full_name,
+ vm_columns_list,
+ matview_full_name
+ );
+
+ EXECUTE create_vprob_sql;
+ notes := array_append(notes, format('Created view %s', vprob_full_name));
+
+ RETURN jsonb_build_object('notes', notes);
+EXCEPTION WHEN OTHERS THEN
+ RETURN jsonb_build_object('error', SQLERRM, 'notes', notes);
+END;
+$$;
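+
+-- Usage sketch (names are hypothetical): build the vtw_/matc_/vm_/vprob_ set
+-- for a staging table, deduplicating on customer_id and ordering by updated_at:
+-- SELECT public.c77_mvc_create_optimized_matv(
+--     'staging', 'customers',            -- source schema and table
+--     'analytics', 'matc_customers',     -- target schema and matview (matc_ prefix expected)
+--     ARRAY['customer_id'],              -- partition_columns
+--     ARRAY['updated_at'],               -- order_by_columns
+--     ARRAY['load_batch_id'],            -- exclude_columns_from_hash
+--     true                               -- filter_latest_only: keep only synthetic_key = 1
+-- );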
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_create_temp_table(source_schema text, source_table text) RETURNS text
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ temp_table_name text := 'temp_' || source_table || '_' || to_char(current_timestamp, 'YYYYMMDDHH24MISS');
+ column_defs text;
+BEGIN
+ -- Step 1: Generate column definitions from source table
+ SELECT string_agg(
+ format('%I %s', column_name, data_type),
+ ', '
+ ) INTO column_defs
+ FROM information_schema.columns
+ WHERE table_schema = source_schema
+ AND table_name = source_table
+ AND column_name IS NOT NULL
+ AND TRIM(column_name) != '';
+
+ -- Step 2: Create temp table with column definitions
+ EXECUTE format('CREATE TEMP TABLE %I (%s)', temp_table_name, column_defs);
+
+ -- Step 3: Insert data from source table
+ EXECUTE format('INSERT INTO %I SELECT * FROM %I.%I', temp_table_name, source_schema, source_table);
+
+ RETURN temp_table_name;
+END;
+$$;
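+
+-- Usage sketch (hypothetical table): snapshot a source table into a
+-- session-local temp table and capture the generated name:
+-- SELECT public.c77_mvc_create_temp_table('staging', 'customers');
+-- -- returns a name such as 'temp_customers_20250101120000'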
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_generate_column_lists(source_schema text, source_table text) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ columns_list text;
+ non_char_columns text;
+ case_conditions text;
+BEGIN
+ -- Character-type columns (cleansed)
+ SELECT string_agg(
+ 'regexp_replace(' || quote_ident(column_name) || ', ''[^\x00-\x7F]'', ''PROBLEM'', ''g'') as ' || quote_ident(column_name),
+ ', '
+ )
+ INTO columns_list
+ FROM information_schema.columns
+ WHERE table_schema = source_schema
+ AND table_name = source_table
+ AND data_type IN ('character varying', 'character', 'text', 'varchar', 'char');
+
+ -- Non-character-type columns
+ SELECT string_agg(quote_ident(column_name), ', ')
+ INTO non_char_columns
+ FROM information_schema.columns
+ WHERE table_schema = source_schema
+ AND table_name = source_table
+ AND data_type NOT IN ('character varying', 'character', 'text', 'varchar', 'char');
+
+ -- CASE conditions for encoding status
+ SELECT string_agg(quote_ident(column_name) || ' ~ ''[^\x00-\x7F]''', ' OR ')
+ INTO case_conditions
+ FROM information_schema.columns
+ WHERE table_schema = source_schema
+ AND table_name = source_table
+ AND data_type IN ('character varying', 'character', 'text', 'varchar', 'char');
+
+ RETURN jsonb_build_object(
+ 'columns_list', COALESCE(columns_list, ''),
+ 'non_char_columns', COALESCE(non_char_columns, ''),
+ 'case_conditions', COALESCE(case_conditions, 'FALSE')
+ );
+EXCEPTION WHEN OTHERS THEN
+ RAISE NOTICE 'Error generating column lists: %', SQLERRM;
+ RETURN jsonb_build_object(
+ 'error', format('Failed to generate column lists: %s', SQLERRM)
+ );
+END;
+$$;
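+
+-- Usage sketch (hypothetical table): inspect the generated SQL fragments;
+-- the result carries columns_list, non_char_columns, and case_conditions:
+-- SELECT jsonb_pretty(public.c77_mvc_generate_column_lists('staging', 'customers'));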
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_generate_synthetic_key_and_hash(partition_columns text[], order_by_columns text[], exclude_hash_columns text[], all_columns text[]) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ partition_sql text;
+ order_by_clause text;
+ synthetic_key_sql text;
+ content_hash_sql text;
+ hashable_columns text[];
+    datetime_format CONSTANT text := 'YYYY-MM-DD HH24:MI:SS'; -- Hardcoded parse format for text timestamps
+BEGIN
+ -- Partition clause
+ SELECT string_agg(quote_ident(unnest), ', ')
+ INTO partition_sql
+ FROM unnest(partition_columns);
+
+ partition_sql := format('PARTITION BY %s', COALESCE(partition_sql, '1'));
+
+ -- Order-by clause
+ SELECT string_agg(
+ format('TO_TIMESTAMP(SUBSTRING(NULLIF(%I, ''''), 1, 19), %L) DESC NULLS LAST', unnest, datetime_format),
+ ', '
+ )
+ INTO order_by_clause
+ FROM unnest(order_by_columns);
+
+ synthetic_key_sql := format(
+ 'ROW_NUMBER() OVER (%s ORDER BY %s) AS synthetic_key',
+ partition_sql, COALESCE(order_by_clause, '1')
+ );
+
+ -- Content hash
+ hashable_columns := array(
+ SELECT unnest(all_columns)
+ EXCEPT
+ SELECT unnest(exclude_hash_columns)
+ );
+
+ SELECT string_agg(quote_ident(unnest), ', ')
+ INTO content_hash_sql
+ FROM unnest(hashable_columns);
+
+ content_hash_sql := format('md5(CAST(row_to_json(ROW(%s)) AS text)) AS content_hash', COALESCE(content_hash_sql, '1'));
+
+ RETURN jsonb_build_object(
+ 'synthetic_key_sql', synthetic_key_sql,
+ 'content_hash_sql', content_hash_sql
+ );
+EXCEPTION WHEN OTHERS THEN
+ RETURN jsonb_build_object(
+ 'error', format('Failed to generate synthetic key and hash: %s', SQLERRM)
+ );
+END;
+$$;
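+
+-- Usage sketch (hypothetical columns): generate the window-function and md5
+-- expressions for a view definition:
+-- SELECT public.c77_mvc_generate_synthetic_key_and_hash(
+--     ARRAY['customer_id'],                         -- partition_columns
+--     ARRAY['updated_at'],                          -- order_by_columns (parsed as text timestamps)
+--     ARRAY['load_batch_id'],                       -- exclude_hash_columns
+--     ARRAY['customer_id', 'updated_at', 'name']    -- all_columns
+-- );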
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_identify_order_by_candidates(temp_table_name text, column_stats jsonb) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ order_by_candidates jsonb := '{}';
+ col_name text;
+ column_type text;
+ null_ratio numeric;
+ notes text[] := '{}';
+BEGIN
+ -- Loop through columns to identify order-by candidates
+ FOR col_name, column_type IN
+ SELECT column_name, data_type
+ FROM information_schema.columns
+ WHERE table_schema LIKE 'pg_temp%'
+ AND table_name = temp_table_name
+ AND column_name IS NOT NULL
+ AND TRIM(column_name) != ''
+ LOOP
+ -- Get null ratio from column_stats
+ null_ratio := (column_stats->col_name->>'null_ratio')::numeric;
+
+ -- Skip columns with high null ratio
+ IF null_ratio > 0.5 THEN
+ notes := array_append(notes, format('Skipped %I as order-by candidate due to high null ratio: %s', col_name, null_ratio));
+ CONTINUE;
+ END IF;
+
+ -- Check for timestamp or text columns
+ IF column_type IN ('timestamp', 'timestamp with time zone', 'timestamp without time zone', 'text') THEN
+ IF column_type = 'text' THEN
+ BEGIN
+ EXECUTE format('SELECT TO_TIMESTAMP(SUBSTRING(NULLIF(%I, ''''), 1, 19), %L) FROM %I LIMIT 1',
+ col_name, 'YYYY-MM-DD HH24:MI:SS', temp_table_name);
+ order_by_candidates := order_by_candidates || jsonb_build_object(
+ col_name, jsonb_build_object(
+ 'fitness_score', (1 - null_ratio) * 100,
+ 'note', 'Text column parseable as timestamp'
+ )
+ );
+ EXCEPTION WHEN OTHERS THEN
+ notes := array_append(notes, format('%I is text but not parseable as timestamp: %s', col_name, SQLERRM));
+ END;
+ ELSE
+ order_by_candidates := order_by_candidates || jsonb_build_object(
+ col_name, jsonb_build_object(
+ 'fitness_score', (1 - null_ratio) * 100,
+ 'note', 'Native timestamp column'
+ )
+ );
+ END IF;
+ END IF;
+ END LOOP;
+
+ RETURN order_by_candidates;
+EXCEPTION WHEN OTHERS THEN
+ RETURN jsonb_build_object(
+ 'error', format('Failed to identify order-by candidates: %s', SQLERRM),
+ 'notes', notes
+ );
+END;
+$$;
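+
+-- Usage sketch (hypothetical names): score order-by candidates from the stats
+-- jsonb produced per column by c77_mvc_analyze_column_stats:
+-- SELECT public.c77_mvc_identify_order_by_candidates(
+--     'temp_customers_20250101120000',
+--     '{"updated_at": {"null_ratio": 0.02}}'::jsonb
+-- );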
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_perform_matv_action(full_matview_name text, schema_name text, matview_name text, action text, mismatched_records bigint, total_matview_records bigint, time_diff interval, mismatch_threshold numeric, time_threshold interval, encoding_issues bigint) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ action_performed boolean := false;
+ action_result text;
+ has_unique_index boolean;
+ index_rec record;
+ constraint_rec record;
+BEGIN
+ -- Check if the materialized view has a unique index
+ SELECT EXISTS (
+ SELECT 1
+ FROM pg_index i
+ JOIN pg_class c ON c.oid = i.indrelid
+ JOIN pg_namespace n ON n.oid = c.relnamespace
+ WHERE n.nspname = schema_name
+ AND c.relname = matview_name
+ AND i.indisunique = true
+ ) INTO has_unique_index;
+
+ IF action = 'refresh' AND (
+ (mismatched_records::NUMERIC / NULLIF(total_matview_records, 0)::NUMERIC) * 100 > mismatch_threshold
+ OR time_diff >= time_threshold
+ ) THEN
+ IF has_unique_index THEN
+ EXECUTE format('REFRESH MATERIALIZED VIEW CONCURRENTLY %s', full_matview_name);
+ action_result := 'Refreshed successfully (concurrently)';
+ ELSE
+ EXECUTE format('REFRESH MATERIALIZED VIEW %s', full_matview_name);
+ action_result := 'Refreshed successfully (non-concurrently: no unique index found)';
+ RAISE NOTICE 'No unique index found for %, using non-concurrent refresh', full_matview_name;
+ END IF;
+ action_performed := true;
+ ELSIF action = 'repair' AND encoding_issues > 0 THEN
+ -- Drop existing indexes
+ FOR index_rec IN (
+ SELECT indexname
+ FROM pg_indexes
+ WHERE schemaname = schema_name AND tablename = matview_name
+          AND indexname NOT LIKE '%\_pkey' -- escape '_' so only a literal '_pkey' suffix is excluded
+ ) LOOP
+ EXECUTE format('DROP INDEX IF EXISTS %I.%I', schema_name, index_rec.indexname);
+ END LOOP;
+
+ -- Drop primary key or unique constraints
+ FOR constraint_rec IN (
+ SELECT conname
+ FROM pg_constraint
+ WHERE conrelid = (SELECT oid FROM pg_class WHERE relname = matview_name AND relnamespace = (SELECT oid FROM pg_namespace WHERE nspname = schema_name))
+ AND contype IN ('p', 'u')
+ ) LOOP
+ EXECUTE format('ALTER TABLE %I.%I DROP CONSTRAINT %I', schema_name, matview_name, constraint_rec.conname);
+ END LOOP;
+
+ -- Recreate standard indexes
+ IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = schema_name AND table_name = matview_name AND column_name = 'content_hash') THEN
+ EXECUTE format('CREATE INDEX IF NOT EXISTS %I ON %I.%I (content_hash)', 'idx_' || matview_name || '_content_hash', schema_name, matview_name);
+ END IF;
+ IF EXISTS (SELECT 1 FROM information_schema.columns WHERE table_schema = schema_name AND table_name = matview_name AND column_name = 'synthetic_key') THEN
+ EXECUTE format('CREATE UNIQUE INDEX IF NOT EXISTS %I ON %I.%I (synthetic_key)', 'idx_' || matview_name || '_synthetic_key', schema_name, matview_name);
+ END IF;
+
+ -- Analyze the table
+ EXECUTE format('ANALYZE %I.%I', schema_name, matview_name);
+
+ action_result := 'Repaired successfully: indexes and keys rebuilt';
+ action_performed := true;
+ ELSIF action = 'reindex' THEN
+ EXECUTE format('REINDEX TABLE %s', full_matview_name);
+ action_result := 'Reindexed successfully';
+ action_performed := true;
+ ELSE
+ action_result := 'Action skipped: threshold not met or invalid action';
+ END IF;
+
+ RETURN jsonb_build_object(
+ 'action_performed', action_performed,
+ 'action_result', action_result
+ );
+EXCEPTION WHEN OTHERS THEN
+ action_result := format('Action failed: %s', SQLERRM);
+ RAISE NOTICE 'Action exception: %', action_result;
+ RETURN jsonb_build_object(
+ 'action_performed', false,
+ 'action_result', action_result
+ );
+END;
+$$;
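+
+-- Usage sketch (hypothetical matview): request a refresh when 155 of 12345
+-- records mismatch and the last refresh exceeds a 24-hour threshold:
+-- SELECT public.c77_mvc_perform_matv_action(
+--     'analytics.matc_daily_sales', 'analytics', 'matc_daily_sales',
+--     'refresh', 155, 12345,
+--     '25:30:00'::interval,   -- time since last refresh
+--     1.0,                    -- mismatch_threshold (%)
+--     '24:00:00'::interval,   -- time_threshold
+--     0                       -- encoding_issues
+-- );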
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_set_validation_params(validation_type text) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ params jsonb;
+ mismatch_threshold numeric;
+ time_threshold interval;
+BEGIN
+ -- Set validation parameters
+ params := CASE validation_type
+ WHEN 'quick' THEN '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb
+ WHEN 'daily' THEN '{"sample_percent": 1.0, "confidence": 0.99, "margin": 0.01}'::jsonb
+ WHEN 'full' THEN '{"sample_percent": 100.0, "confidence": 0.99, "margin": 0.005}'::jsonb
+ ELSE '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb
+ END;
+
+ -- Set dynamic thresholds
+ mismatch_threshold := CASE validation_type
+ WHEN 'quick' THEN 0.1 -- 0.1% mismatch for quick
+ WHEN 'daily' THEN 0.05 -- 0.05% mismatch for daily
+ WHEN 'full' THEN 0.01 -- 0.01% mismatch for full
+ ELSE 0.1
+ END;
+
+ time_threshold := CASE validation_type
+ WHEN 'quick' THEN '3 days'::interval -- 3 days for quick
+ WHEN 'daily' THEN '1 day'::interval -- 1 day for daily
+ WHEN 'full' THEN '12 hours'::interval -- 12 hours for full
+ ELSE '3 days'::interval
+ END;
+
+ RETURN jsonb_build_object(
+ 'params', params,
+ 'mismatch_threshold', mismatch_threshold,
+ 'time_threshold', time_threshold
+ );
+END;
+$$;
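+
+-- Usage sketch: fetch sampling parameters and thresholds for a daily check;
+-- any unrecognized validation_type falls back to the 'quick' profile:
+-- SELECT public.c77_mvc_set_validation_params('daily');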
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_validate_matv_inputs(schema_name text, matview_name text, vtw_name text) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ full_matview_name text;
+ full_vtw_name text;
+ notes text[] := '{}';
+ vtw_table_name text;
+BEGIN
+ -- Construct full names
+ full_matview_name := quote_ident(schema_name) || '.' || quote_ident(matview_name);
+ vtw_table_name := COALESCE(vtw_name, replace(matview_name, 'matc_', 'vtw_'));
+ full_vtw_name := quote_ident(schema_name) || '.' || quote_ident(vtw_table_name);
+
+ -- Validate materialized view existence
+ IF NOT EXISTS (
+ SELECT 1
+ FROM pg_matviews
+ WHERE schemaname = schema_name
+ AND matviewname = matview_name
+ ) THEN
+ RETURN jsonb_build_object(
+ 'error', format('Materialized view %I.%I does not exist', schema_name, matview_name),
+ 'notes', notes
+ );
+ END IF;
+
+ -- Validate source view existence
+ IF NOT EXISTS (
+ SELECT 1
+ FROM pg_tables
+ WHERE schemaname = schema_name
+ AND tablename = vtw_table_name
+ ) AND NOT EXISTS (
+ SELECT 1
+ FROM pg_views
+ WHERE schemaname = schema_name
+ AND viewname = vtw_table_name
+ ) THEN
+ RETURN jsonb_build_object(
+ 'error', format('Source view %I.%I does not exist', schema_name, vtw_table_name),
+ 'notes', notes
+ );
+ END IF;
+
+ RETURN jsonb_build_object(
+ 'full_matview_name', full_matview_name,
+ 'full_vtw_name', full_vtw_name,
+ 'notes', notes
+ );
+EXCEPTION WHEN OTHERS THEN
+ RETURN jsonb_build_object(
+ 'error', format('Error validating inputs: %s', SQLERRM),
+ 'notes', notes
+ );
+END;
+$$;
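+
+-- Usage sketch (hypothetical matview): pass NULL as vtw_name to derive the
+-- source view name from the matc_ prefix, or supply it explicitly:
+-- SELECT public.c77_mvc_validate_matv_inputs('analytics', 'matc_daily_sales', NULL);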
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_validate_order_by_columns(source_schema text, source_table text, order_by_columns text[]) RETURNS text[]
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ notes text[] := '{}';
+ col_name text;
+    datetime_format CONSTANT text := 'YYYY-MM-DD HH24:MI:SS'; -- Hardcoded parse format for text timestamps
+BEGIN
+ FOREACH col_name IN ARRAY order_by_columns LOOP
+ IF NOT EXISTS (
+ SELECT 1
+ FROM information_schema.columns
+ WHERE table_schema = source_schema
+ AND table_name = source_table
+ AND column_name = col_name
+ ) THEN
+ notes := array_append(notes, format('Warning: %I not found in %I.%I', col_name, source_schema, source_table));
+ ELSE
+ BEGIN
+ EXECUTE format('SELECT TO_TIMESTAMP(SUBSTRING(NULLIF(%I, ''''), 1, 19), %L) FROM %I.%I LIMIT 1',
+ col_name, datetime_format, source_schema, source_table);
+ EXCEPTION WHEN OTHERS THEN
+ notes := array_append(notes, format('Warning: %I contains unparseable timestamp data: %s', col_name, SQLERRM));
+ END;
+ END IF;
+ END LOOP;
+
+ RETURN notes;
+EXCEPTION WHEN OTHERS THEN
+ RETURN array_append(notes, format('Error validating order-by columns: %s', SQLERRM));
+END;
+$$;
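+
+-- Usage sketch (hypothetical table): returns warnings rather than raising,
+-- so callers can surface them as notes:
+-- SELECT public.c77_mvc_validate_order_by_columns('staging', 'customers', ARRAY['updated_at']);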
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_analyze_column_stats(temp_table_name text, col_name text, column_type text, sample_size bigint, total_rows bigint, exclude_key_columns text[]) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ null_count bigint;
+ distinct_count bigint;
+ encoding_issue_count bigint;
+ uniqueness_ratio numeric;
+ null_ratio numeric;
+ encoding_issue_ratio numeric;
+ fitness_score numeric;
+ adjusted_sample_size bigint;
+ sampling_percentage numeric;
+BEGIN
+ -- Adjust sample size to not exceed total rows
+ adjusted_sample_size := LEAST(sample_size, total_rows);
+ sampling_percentage := (adjusted_sample_size::float / total_rows * 100);
+
+ -- Null count
+ EXECUTE format('SELECT COUNT(*) FROM (SELECT %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t WHERE %I IS NULL',
+ col_name, temp_table_name, sampling_percentage::text, adjusted_sample_size, col_name)
+ INTO null_count;
+ null_ratio := null_count::float / adjusted_sample_size;
+
+ -- Distinct count (skip for excluded columns)
+ IF NOT (col_name = ANY(exclude_key_columns)) THEN
+ EXECUTE format('SELECT COUNT(DISTINCT %I) FROM (SELECT %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t',
+ col_name, col_name, temp_table_name, sampling_percentage::text, adjusted_sample_size)
+ INTO distinct_count;
+ uniqueness_ratio := distinct_count::float / adjusted_sample_size;
+ ELSE
+ uniqueness_ratio := 0;
+ END IF;
+
+ -- Encoding issues (for text-like columns)
+ IF column_type IN ('character varying', 'character', 'text', 'varchar', 'char') THEN
+ EXECUTE format('SELECT COUNT(*) FROM (SELECT %I FROM %I TABLESAMPLE SYSTEM (%s) LIMIT %s) t WHERE %I ~ ''[^\x00-\x7F]''',
+ col_name, temp_table_name, sampling_percentage::text, adjusted_sample_size, col_name)
+ INTO encoding_issue_count;
+ encoding_issue_ratio := encoding_issue_count::float / adjusted_sample_size;
+ ELSE
+ encoding_issue_ratio := 0;
+ END IF;
+
+ -- Fitness score for key fitness (if not excluded)
+ IF NOT (col_name = ANY(exclude_key_columns)) THEN
+ fitness_score := (uniqueness_ratio * 40) +
+ ((1 - null_ratio) * 30) +
+ ((1 - encoding_issue_ratio) * 20) +
+ (CASE
+ WHEN column_type IN ('character varying', 'character', 'text', 'varchar', 'char', 'integer', 'bigint') THEN 10
+ WHEN column_type IN ('timestamp', 'timestamp with time zone', 'timestamp without time zone') THEN 8
+ ELSE 5
+ END);
+ ELSE
+ fitness_score := 0;
+ END IF;
+
+ -- Return stats as JSONB
+ RETURN jsonb_build_object(
+ 'data_type', column_type,
+ 'uniqueness_ratio', uniqueness_ratio,
+ 'distinct_count', distinct_count,
+ 'null_ratio', null_ratio,
+ 'null_count', null_count,
+ 'encoding_issue_ratio', encoding_issue_ratio,
+ 'encoding_issue_count', encoding_issue_count,
+ 'fitness_score', fitness_score,
+ 'excluded_from_key_fitness', (col_name = ANY(exclude_key_columns))
+ );
+EXCEPTION WHEN OTHERS THEN
+ RETURN jsonb_build_object(
+ 'error', format('Failed to analyze column %I: %s', col_name, SQLERRM),
+ 'data_type', column_type
+ );
+END;
+$$;
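+
+-- Usage sketch (hypothetical temp table): score one column using a 1000-row
+-- sample of a 50000-row snapshot, with no key-fitness exclusions:
+-- SELECT public.c77_mvc_analyze_column_stats(
+--     'temp_customers_20250101120000', 'customer_id', 'bigint',
+--     1000, 50000, ARRAY[]::text[]
+-- );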
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_estimate_matv_refresh_time(full_matview_name text) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ v_refresh_count bigint; -- Renamed to avoid conflict
+ v_refresh_total interval; -- Renamed for consistency
+ estimated_refresh_time interval;
+BEGIN
+ -- Estimate refresh time
+ SELECT s.refresh_count, s.refresh_mv_time_total
+ INTO v_refresh_count, v_refresh_total
+ FROM public.c77_dbh_matv_stats s
+ WHERE s.mv_name = full_matview_name
+ LIMIT 1;
+
+ IF COALESCE(v_refresh_count, 0) > 0 THEN
+ estimated_refresh_time := v_refresh_total / v_refresh_count::numeric;
+ ELSE
+ estimated_refresh_time := '00:00:00'::interval;
+ END IF;
+
+ -- Return raw values for debugging
+ RETURN jsonb_build_object(
+ 'estimated_refresh_time', estimated_refresh_time,
+ 'refresh_count', v_refresh_count,
+ 'refresh_total', v_refresh_total
+ );
+EXCEPTION WHEN OTHERS THEN
+ RETURN jsonb_build_object(
+ 'estimated_refresh_time', '00:00:00'::interval,
+ 'refresh_count', NULL,
+ 'refresh_total', NULL,
+ 'error', SQLERRM
+ );
+END;
+$$;
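+
+-- Usage sketch (hypothetical matview): averages the refresh history recorded
+-- by c77_dbh in public.c77_dbh_matv_stats:
+-- SELECT public.c77_mvc_estimate_matv_refresh_time('analytics.matc_daily_sales');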
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_check_matv_mismatches(target_schema text, matview_name text, validation_type text DEFAULT 'quick'::text) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+    vtw_name text := 'vtw_' || substring(matview_name from 6); -- Strip the 5-char 'matc_' prefix and prepend 'vtw_'
+ matview_full_name text := quote_ident(target_schema) || '.' || quote_ident(matview_name);
+ vtw_full_name text := quote_ident(target_schema) || '.' || quote_ident(vtw_name);
+ mismatch_count bigint;
+ mismatch_sql text;
+ content_hash_exists boolean := true;
+ total_matview_records bigint;
+ params jsonb;
+ sample_size bigint;
+ notes text[] := '{}';
+BEGIN
+ -- Define validation parameters
+ params := CASE validation_type
+ WHEN 'quick' THEN '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb
+ WHEN 'daily' THEN '{"sample_percent": 1.0, "confidence": 0.99, "margin": 0.01}'::jsonb
+ WHEN 'full' THEN '{"sample_percent": 100.0, "confidence": 0.99, "margin": 0.005}'::jsonb
+ ELSE '{"sample_percent": 0.1, "confidence": 0.95, "margin": 0.03}'::jsonb
+ END;
+
+ -- Calculate sample size
+ EXECUTE format('SELECT COUNT(*) FROM %s', matview_full_name) INTO total_matview_records;
+ sample_size := GREATEST(100, CEIL((jsonb_extract_path_text(params, 'sample_percent')::NUMERIC / 100) * total_matview_records));
+ notes := array_append(notes, format('Total matview records: %s, Sample size: %s', total_matview_records, sample_size));
+
+ -- Attempt to query content_hash to check if it exists in both relations
+ BEGIN
+ EXECUTE format('SELECT 1 FROM %s WHERE content_hash IS NOT NULL LIMIT 1', vtw_full_name);
+ EXECUTE format('SELECT 1 FROM %s WHERE content_hash IS NOT NULL LIMIT 1', matview_full_name);
+ EXCEPTION WHEN undefined_column THEN
+ content_hash_exists := false;
+ END;
+
+ -- If content_hash is not found in either, return early
+ IF NOT content_hash_exists THEN
+ RAISE NOTICE 'content_hash column not found in either %.% or %.%, skipping mismatch check',
+ target_schema, matview_name, target_schema, vtw_name;
+ RETURN jsonb_build_object(
+ 'mismatched_records', 0,
+ 'mismatch_percent', 0.0,
+ 'notes', notes
+ );
+ END IF;
+
+ -- Construct the mismatch check query
+ IF jsonb_extract_path_text(params, 'sample_percent')::NUMERIC < 100.0 THEN
+ -- Use sampling for quick and daily
+ mismatch_sql := format('
+ WITH matview_sample AS (
+ SELECT content_hash
+ FROM %s
+ ORDER BY random()
+ LIMIT %s
+ ),
+ vtw_sample AS (
+ SELECT content_hash
+ FROM %s
+ ORDER BY random()
+ LIMIT %s
+ )
+ SELECT COUNT(*)
+ FROM (
+ SELECT content_hash FROM matview_sample
+ EXCEPT
+ SELECT content_hash FROM vtw_sample
+ ) mismatches',
+ matview_full_name,
+ sample_size,
+ vtw_full_name,
+ sample_size
+ );
+ ELSE
+ -- Full comparison for 'full' validation
+ mismatch_sql := format('
+ SELECT COUNT(*)
+ FROM (
+ SELECT content_hash FROM %s
+ EXCEPT
+ SELECT content_hash FROM %s
+ ) mismatches',
+ vtw_full_name,
+ matview_full_name
+ );
+ END IF;
+
+ EXECUTE mismatch_sql INTO mismatch_count;
+ notes := array_append(notes, format('Mismatch count: %s', mismatch_count));
+
+ RETURN jsonb_build_object(
+ 'mismatched_records', mismatch_count,
+ 'mismatch_percent', (mismatch_count::float / GREATEST(sample_size, 1)) * 100,
+ 'notes', notes
+ );
+EXCEPTION WHEN OTHERS THEN
+ notes := array_append(notes, format('Error in public.c77_mvc_check_matv_mismatches: %s', SQLERRM));
+ RETURN jsonb_build_object(
+ 'mismatched_records', -1,
+ 'mismatch_percent', -1.0,
+ 'notes', notes
+ );
+END;
+$$;
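+
+-- Usage sketch (hypothetical matview): sample-based content_hash comparison
+-- between a matc_ materialized view and its vtw_ source:
+-- SELECT public.c77_mvc_check_matv_mismatches('analytics', 'matc_daily_sales', 'quick');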
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_analyze_table_fitness(source_schema text, source_table text, exclude_key_columns text[] DEFAULT ARRAY[]::text[]) RETURNS json
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ total_rows bigint;
+ sample_size bigint;
+ temp_table_name text;
+ notes text[] := '{}';
+ result_json jsonb;
+ column_stats jsonb := '{}';
+ order_by_candidates jsonb := '{}';
+ column_combinations jsonb := '{}';
+ data_quality_index numeric;
+ col_name text;
+ column_type text;
+ table_exists boolean;
+ confidence_level CONSTANT numeric := 0.99; -- Hardcoded 99% confidence
+ margin_of_error CONSTANT numeric := 0.03; -- Hardcoded 3% margin of error
+BEGIN
+ notes := array_append(notes, format('Analysis started at %s', clock_timestamp()));
+
+ -- Step 1: Validate schema and table existence
+ SELECT EXISTS (
+ SELECT 1
+ FROM information_schema.tables
+ WHERE table_schema = source_schema
+ AND table_name = source_table
+ ) INTO table_exists;
+
+ IF NOT table_exists THEN
+ result_json := jsonb_build_object(
+ 'error', format('Table %I.%I does not exist', source_schema, source_table),
+ 'notes', notes
+ );
+ RETURN result_json::json;
+ END IF;
+
+ -- Step 2: Get total rows
+ BEGIN
+ EXECUTE format('SELECT COUNT(*) FROM %I.%I', source_schema, source_table) INTO total_rows;
+ notes := array_append(notes, format('Total rows in %I.%I: %s', source_schema, source_table, total_rows));
+ EXCEPTION WHEN OTHERS THEN
+ result_json := jsonb_build_object(
+ 'error', format('Failed to count rows in %I.%I: %s', source_schema, source_table, SQLERRM),
+ 'notes', notes
+ );
+ RETURN result_json::json;
+ END;
+
+ -- Step 3: Calculate sample size with hardcoded confidence level and margin of error
+ sample_size := public.c77_mvc_calculate_sample_size(total_rows, confidence_level, margin_of_error);
+ notes := array_append(notes, format('Sample size calculated: %s for %s rows (Confidence: %s%%, Margin of Error: ±%s%%)',
+ sample_size, total_rows, confidence_level * 100, margin_of_error * 100));
+
+ -- Step 4: Create temp table
+ BEGIN
+ temp_table_name := public.c77_mvc_create_temp_table(source_schema, source_table);
+ notes := array_append(notes, format('Created temporary table %s for analysis', temp_table_name));
+ EXCEPTION WHEN OTHERS THEN
+ result_json := jsonb_build_object(
+ 'error', format('Failed to create temp table for %I.%I: %s', source_schema, source_table, SQLERRM),
+ 'notes', notes
+ );
+ RETURN result_json::json;
+ END;
+
+ -- Step 5: Analyze individual columns
+ FOR col_name, column_type IN
+ SELECT column_name, data_type
+ FROM information_schema.columns
+ WHERE table_schema LIKE 'pg_temp%'
+ AND table_name = temp_table_name
+ AND column_name IS NOT NULL
+ AND TRIM(column_name) != ''
+ LOOP
+ column_stats := column_stats || jsonb_build_object(
+ col_name, public.c77_mvc_analyze_column_stats(temp_table_name, col_name, column_type, sample_size, total_rows, exclude_key_columns)
+ );
+ END LOOP;
+ notes := array_append(notes, 'Completed analysis of individual columns');
+
+ -- Step 6: Identify order-by candidates
+ order_by_candidates := public.c77_mvc_identify_order_by_candidates(temp_table_name, column_stats);
+ notes := array_append(notes, 'Completed identification of order-by candidates');
+
+ -- Step 7: Analyze column combinations
+ column_combinations := public.c77_mvc_analyze_column_combinations(temp_table_name, column_stats, sample_size, total_rows, exclude_key_columns);
+ notes := array_append(notes, 'Completed analysis of column combinations');
+
+ -- Step 8: Calculate Data Quality Index (DQI)
+ data_quality_index := public.c77_mvc_calculate_dqi(column_stats);
+ notes := array_append(notes, format('Data Quality Index (DQI): %s', ROUND(data_quality_index, 2)));
+
+ -- Step 9: Assemble final result and clean up
+ result_json := public.c77_mvc_assemble_result(
+ source_schema, source_table, column_stats, column_combinations,
+ order_by_candidates, data_quality_index, notes, temp_table_name
+ );
+
+ RETURN result_json::json;
+EXCEPTION WHEN OTHERS THEN
+ result_json := jsonb_build_object(
+ 'error', format('Unexpected error in public.c77_mvc_analyze_table_fitness: %s', SQLERRM),
+ 'notes', notes
+ );
+ RETURN result_json::json;
+END;
+$$;
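+
+-- Usage sketch (hypothetical table): run the full fitness analysis, excluding
+-- a known surrogate key from key-candidate scoring:
+-- SELECT public.c77_mvc_analyze_table_fitness('staging', 'customers', ARRAY['customer_id']);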
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_manage_matv_health(target_schema text, matview_name text, validation_type text DEFAULT 'quick'::text, action text DEFAULT NULL::text) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+    vtw_name text := 'vtw_' || substring(matview_name from 6); -- Strip the 5-char 'matc_' prefix and prepend 'vtw_'
+ matview_full_name text := quote_ident(target_schema) || '.' || quote_ident(matview_name);
+ vtw_full_name text := quote_ident(target_schema) || '.' || quote_ident(vtw_name);
+ total_vtw_records bigint;
+ total_matview_records bigint;
+ encoding_issues bigint;
+ mismatch_result jsonb;
+ mismatch_count bigint;
+ mismatch_percent float;
+ status text;
+ character_encoding_status text;
+ last_refresh timestamp with time zone;
+ staleness_threshold interval;
+ mismatch_threshold float;
+ is_stale_by_time boolean := false;
+ notes text[] := '{}';
+ start_time timestamp with time zone := now();
+ end_time timestamp with time zone;
+ elapsed_time interval;
+ refresh_estimate jsonb;
+ estimated_refresh_time interval;
+ stats_found boolean := false;
+BEGIN
+ -- Log the start time
+ notes := array_append(notes, format('Process started at %s', start_time));
+
+ -- Step 1: Check if the materialized view exists
+ BEGIN
+ EXECUTE format('SELECT COUNT(*) FROM %s', matview_full_name) INTO total_matview_records;
+ EXCEPTION WHEN undefined_table THEN
+ status := 'Non Existent';
+ end_time := now();
+ elapsed_time := end_time - start_time;
+ notes := array_append(notes, format('Elapsed time: %s', elapsed_time));
+ RETURN jsonb_build_object(
+ 'total_vtw_records', 0,
+ 'total_matview_records', 0,
+ 'encoding_issues', 0,
+ 'mismatched_records', 0,
+ 'mismatch_percent', 0.0,
+ 'status', status,
+ 'character_encoding_status', 'CLEAN',
+ 'estimated_refresh_time', interval '0 seconds',
+ 'notes', notes
+ );
+ END;
+
+ -- Step 2: Check if the materialized view has data
+ IF total_matview_records = 0 THEN
+ status := 'Uninitialized';
+ end_time := now();
+ elapsed_time := end_time - start_time;
+ notes := array_append(notes, format('Elapsed time: %s', elapsed_time));
+ RETURN jsonb_build_object(
+ 'total_vtw_records', 0,
+ 'total_matview_records', 0,
+ 'encoding_issues', 0,
+ 'mismatched_records', 0,
+ 'mismatch_percent', 0.0,
+ 'status', status,
+ 'character_encoding_status', 'CLEAN',
+ 'estimated_refresh_time', interval '0 seconds',
+ 'notes', notes
+ );
+ END IF;
+
+ -- Step 3: Get the total records in the source view
+ EXECUTE format('SELECT COUNT(*) FROM %s', vtw_full_name) INTO total_vtw_records;
+
+ -- Step 4: Check for encoding issues
+ BEGIN
+ EXECUTE format('SELECT COUNT(*) FROM %s WHERE encoding_status = ''ENCODING_ISSUE''', matview_full_name)
+ INTO encoding_issues;
+ EXCEPTION WHEN undefined_column THEN
+ encoding_issues := 0; -- If encoding_status column doesn't exist, assume no issues
+ END;
+
+ -- Set character_encoding_status
+ IF encoding_issues > 0 THEN
+ character_encoding_status := 'DEGRADED';
+ ELSE
+ character_encoding_status := 'CLEAN';
+ END IF;
+
+ -- Step 5: Check for time-based staleness
+ SELECT refresh_mv_last
+ INTO last_refresh
+ FROM public.c77_dbh_matv_stats
+ WHERE mv_name = target_schema || '.' || matview_name;
+
+ IF FOUND THEN
+ stats_found := true;
+ ELSE
+ notes := array_append(notes, format('Warning: No refresh stats found for materialized view %s in c77_dbh_matv_stats', matview_name));
+ END IF;
+
+ staleness_threshold := CASE validation_type
+ WHEN 'quick' THEN '3 days'::interval
+ WHEN 'daily' THEN '1 day'::interval
+ WHEN 'full' THEN '12 hours'::interval
+ ELSE '3 days'::interval
+ END;
+
+ notes := array_append(notes, format('Last refresh: %s, Time since last refresh: %s, Staleness threshold: %s', last_refresh, now() - last_refresh, staleness_threshold));
+
+ IF last_refresh IS NULL OR (now() - last_refresh) > staleness_threshold THEN
+ is_stale_by_time := true;
+ END IF;
+ notes := array_append(notes, format('Is stale by time: %s', is_stale_by_time));
+
+ -- Step 6: Set mismatch threshold based on validation_type
+ mismatch_threshold := CASE validation_type
+ WHEN 'quick' THEN 1.0 -- 1.0%
+ WHEN 'daily' THEN 0.5 -- 0.5%
+ WHEN 'full' THEN 0.2 -- 0.2%
+ ELSE 1.0
+ END;
+
+ -- Step 7: Check for mismatches
+ mismatch_result := public.c77_mvc_check_matv_mismatches(target_schema, matview_name, validation_type);
+ mismatch_count := (mismatch_result->>'mismatched_records')::bigint;
+ mismatch_percent := (mismatch_result->>'mismatch_percent')::float;
+
+ -- Append mismatch notes
+ notes := array_cat(notes, ARRAY(SELECT jsonb_array_elements_text(mismatch_result->'notes')));
+
+ -- Log mismatch details
+ notes := array_append(notes, format('Mismatch percent: %s, Mismatch threshold: %s', mismatch_percent, mismatch_threshold));
+
+ -- Step 8: Determine refresh status
+ IF is_stale_by_time OR mismatch_percent > mismatch_threshold THEN
+ status := 'Stale';
+ ELSE
+ status := 'Healthy';
+ END IF;
+
+ -- Step 9: Get estimated refresh time using the correct function
+ SELECT public.c77_mvc_estimate_matv_refresh_time(matview_full_name)
+ INTO refresh_estimate;
+ estimated_refresh_time := (refresh_estimate->>'estimated_refresh_time')::interval;
+ notes := array_append(notes, format('Refresh estimate details: %s', refresh_estimate));
+
+ -- Step 10: Perform action if specified and status is Stale
+ IF action IS NOT NULL AND status = 'Stale' THEN
+ IF action = 'refresh' THEN
+ -- Refresh the materialized view (using WITH DATA for PostgreSQL 12 compatibility)
+ EXECUTE format('REFRESH MATERIALIZED VIEW %s WITH DATA', matview_full_name);
+ notes := array_append(notes, 'Performed REFRESH on materialized view');
+ -- No need to update refresh time; handled by system triggers
+ ELSIF action = 'repair' THEN
+            -- Drop and recreate indexes (schema-qualified so the drops work regardless of search_path)
+            EXECUTE format('DROP INDEX IF EXISTS %I.%I', target_schema, matview_name || '_encoding_status_idx');
+            EXECUTE format('DROP INDEX IF EXISTS %I.%I', target_schema, matview_name || '_content_hash_idx');
+            EXECUTE format('DROP INDEX IF EXISTS %I.%I', target_schema, matview_name || '_unique_key_idx');
+ EXECUTE format('CREATE INDEX %s_encoding_status_idx ON %s (encoding_status)', matview_name, matview_full_name);
+ EXECUTE format('CREATE INDEX %s_content_hash_idx ON %s (content_hash)', matview_name, matview_full_name);
+            -- NOTE: the unique index below hardcodes the partition columns (companyid, orgname_id)
+            -- and only fits matviews built with that key; adjust for other partitioning schemes
+            EXECUTE format('CREATE UNIQUE INDEX %s_unique_key_idx ON %s (synthetic_key, companyid, orgname_id)', matview_name, matview_full_name);
+ notes := array_append(notes, 'Performed REPAIR (dropped and recreated indexes) on materialized view');
+ ELSIF action = 'reindex' THEN
+ -- Reindex the materialized view
+ EXECUTE format('REINDEX TABLE %s', matview_full_name);
+ notes := array_append(notes, 'Performed REINDEX on materialized view');
+ ELSE
+ notes := array_append(notes, format('Invalid action: %s, no action performed', action));
+ END IF;
+
+ -- Step 11: Re-evaluate after action
+ EXECUTE format('SELECT COUNT(*) FROM %s', matview_full_name) INTO total_matview_records;
+ EXECUTE format('SELECT COUNT(*) FROM %s WHERE encoding_status = ''ENCODING_ISSUE''', matview_full_name)
+ INTO encoding_issues;
+ mismatch_result := public.c77_mvc_check_matv_mismatches(target_schema, matview_name, validation_type);
+ mismatch_count := (mismatch_result->>'mismatched_records')::bigint;
+ mismatch_percent := (mismatch_result->>'mismatch_percent')::float;
+
+ -- Append mismatch notes
+ notes := array_cat(notes, ARRAY(SELECT jsonb_array_elements_text(mismatch_result->'notes')));
+
+ -- Update character_encoding_status
+ IF encoding_issues > 0 THEN
+ character_encoding_status := 'DEGRADED';
+ ELSE
+ character_encoding_status := 'CLEAN';
+ END IF;
+
+ -- Update status (time-based staleness should be resolved if action was 'refresh')
+ SELECT refresh_mv_last
+ INTO last_refresh
+ FROM public.c77_dbh_matv_stats
+ WHERE mv_name = target_schema || '.' || matview_name;
+
+ IF NOT FOUND THEN
+ notes := array_append(notes, format('Warning: No refresh stats found for materialized view %s in c77_dbh_matv_stats after action', matview_name));
+ END IF;
+
+ notes := array_append(notes, format('After action - Last refresh: %s, Time since last refresh: %s, Staleness threshold: %s', last_refresh, now() - last_refresh, staleness_threshold));
+
+ IF last_refresh IS NULL OR (now() - last_refresh) > staleness_threshold THEN
+ is_stale_by_time := true;
+ ELSE
+ is_stale_by_time := false;
+ END IF;
+ notes := array_append(notes, format('After action - Is stale by time: %s', is_stale_by_time));
+
+ -- Log mismatch details after action
+ notes := array_append(notes, format('After action - Mismatch percent: %s, Mismatch threshold: %s', mismatch_percent, mismatch_threshold));
+
+ IF is_stale_by_time OR mismatch_percent > mismatch_threshold THEN
+ status := 'Stale';
+ ELSE
+ status := 'Healthy';
+ END IF;
+ END IF;
+
+ -- Step 12: Calculate elapsed time and return the results
+ end_time := now();
+ elapsed_time := end_time - start_time;
+ notes := array_append(notes, format('Elapsed time: %s', elapsed_time));
+
+ RETURN jsonb_build_object(
+ 'total_vtw_records', total_vtw_records,
+ 'total_matview_records', total_matview_records,
+ 'encoding_issues', encoding_issues,
+ 'mismatched_records', mismatch_count,
+ 'mismatch_percent', mismatch_percent,
+ 'status', status,
+ 'character_encoding_status', character_encoding_status,
+ 'estimated_refresh_time', estimated_refresh_time,
+ 'notes', notes
+ );
+EXCEPTION WHEN OTHERS THEN
+ end_time := now();
+ elapsed_time := end_time - start_time;
+ notes := array_append(notes, format('Elapsed time: %s', elapsed_time));
+ notes := array_append(notes, format('Unexpected error in c77_mvc_manage_matv_health: %s', SQLERRM));
+ RETURN jsonb_build_object(
+ 'error', SQLERRM,
+ 'notes', notes
+ );
+END;
+$$;
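+
+-- Usage sketch (hypothetical matview): report health only, or pass an action
+-- ('refresh', 'repair', 'reindex') to remediate when the status is Stale:
+-- SELECT public.c77_mvc_manage_matv_health('analytics', 'matc_daily_sales', 'daily');
+-- SELECT public.c77_mvc_manage_matv_health('analytics', 'matc_daily_sales', 'daily', 'refresh');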
+
+
+CREATE OR REPLACE FUNCTION public.c77_mvc_assemble_matv_health_result(full_matview_name text, full_vtw_name text, stats jsonb, mismatched_records bigint, validation_type text, sample_size bigint, mismatch_threshold numeric, action_result text, exec_time timestamp with time zone) RETURNS jsonb
+ LANGUAGE plpgsql
+AS $$
+DECLARE
+ total_matview_records bigint := (stats->>'total_matview_records')::bigint;
+ clean_records bigint := (stats->>'clean_records')::bigint;
+ encoding_issues bigint := (stats->>'encoding_issues')::bigint;
+ clean_percent text;
+ estimated_refresh_time interval;
+BEGIN
+ -- Calculate clean percent
+ clean_percent := CASE WHEN total_matview_records > 0
+ THEN to_char(ROUND((clean_records::NUMERIC / total_matview_records::NUMERIC) * 100, 2), 'FM9999999999999999.99') || '%'
+ ELSE 'N/A'
+ END;
+
+    -- Estimate refresh time (the helper returns jsonb, so extract the interval field)
+    estimated_refresh_time := (public.c77_mvc_estimate_matv_refresh_time(full_matview_name)->>'estimated_refresh_time')::interval;
+
+ -- Assemble result
+ RETURN jsonb_build_object(
+ 'matview', full_matview_name,
+ 'vtw_source', full_vtw_name,
+ 'total_matview_records', total_matview_records::text,
+ 'total_vtw_records', (stats->>'total_vtw_records')::text,
+ 'mismatched_records', mismatched_records::text,
+ 'mismatch_percent', CASE WHEN total_matview_records > 0
+ THEN to_char(ROUND((mismatched_records::NUMERIC / total_matview_records::NUMERIC) * 100, 2), 'FM9999999999999999.99') || '%'
+ ELSE 'N/A'
+ END,
+ 'clean_records', clean_records::text,
+ 'encoding_issues', encoding_issues::text,
+ 'clean_record%', clean_percent,
+ 'last_matview_update', COALESCE((stats->>'last_matview_update')::text, 'N/A'),
+ 'last_vtw_update', COALESCE((stats->>'last_vtw_update')::text, 'N/A'),
+ 'size_mb', (stats->>'size_mb')::text,
+ 'estimated_refresh_time', to_char(estimated_refresh_time, 'HH24:MI:SS.MS'),
+ 'validation_type', validation_type,
+ 'sample_size', sample_size::text,
+ 'status', CASE
+ WHEN total_matview_records = 0 THEN 'Uninitialized'
+ WHEN (mismatched_records::NUMERIC / NULLIF(total_matview_records, 0)::NUMERIC) * 100 > mismatch_threshold THEN 'Stale'
+ WHEN encoding_issues > 0 THEN 'Degraded'
+ ELSE 'Healthy'
+ END,
+ 'execution_time', to_char(clock_timestamp() - exec_time, 'HH24:MI:SS.MS')
+    ) || CASE WHEN action_result IS NOT NULL
+              THEN jsonb_build_object('action_result', action_result)
+              ELSE '{}'::jsonb END;
+END;
+$$;
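+
+-- Internal helper used by the health-check pipeline; a direct call needs a
+-- stats object shaped like the following (all values hypothetical):
+-- SELECT public.c77_mvc_assemble_matv_health_result(
+--     'analytics.matc_daily_sales', 'analytics.vtw_daily_sales',
+--     '{"total_matview_records": 12345, "clean_records": 12300,
+--       "encoding_issues": 45, "total_vtw_records": 12345, "size_mb": 128}'::jsonb,
+--     155, 'quick', 1000, 1.0, NULL, clock_timestamp()
+-- );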
diff --git a/c77_mvc.control b/c77_mvc.control
new file mode 100644
index 0000000..097cb8e
--- /dev/null
+++ b/c77_mvc.control
@@ -0,0 +1,6 @@
+# c77_mvc.control
+comment = 'Materialized view and table fitness utilities'
+default_version = '1.0'
+module_pathname = ''
+requires = 'c77_dbh'
+relocatable = true
diff --git a/dependency-map.md b/dependency-map.md
deleted file mode 100644
index c00e2ef..0000000
--- a/dependency-map.md
+++ /dev/null
@@ -1,153 +0,0 @@
-# PostgreSQL Function Dependency Map
-
-## Overview
-
-This document maps the dependencies between the PostgreSQL functions in the `config` schema. The functions are organized into two main subsystems:
-
-1. **Table Analysis Subsystem**: Functions for analyzing tables to identify optimal keys, partitioning strategies, and data quality issues
-2. **Materialized View Management Subsystem**: Functions for creating, monitoring, and maintaining materialized views
-
-## Table Analysis Subsystem
-
-### Main Entry Point
-- `config.grok_analyze_table_fitness` - Orchestrates the complete table analysis process
-
-### Dependency Hierarchy
-
-```
-grok_analyze_table_fitness
-├── grok_calculate_sample_size
-├── grok_create_temp_table
-├── grok_analyze_column_stats
-├── grok_identify_order_by_candidates
-├── grok_analyze_column_combinations
-├── grok_calculate_dqi
-└── grok_assemble_result
-```
-
-### Function Relationships
-
-1. `grok_analyze_table_fitness`
- - Calls `grok_calculate_sample_size` to determine appropriate sample size
- - Calls `grok_create_temp_table` to create a temporary copy of the source table
- - Calls `grok_analyze_column_stats` for each column to analyze its characteristics
- - Calls `grok_identify_order_by_candidates` to find columns suitable for ordering
- - Calls `grok_analyze_column_combinations` to identify potential composite keys
- - Calls `grok_calculate_dqi` to calculate the Data Quality Index
- - Calls `grok_assemble_result` to prepare the final results and clean up
-
-2. `grok_analyze_column_stats`
- - No dependencies on other functions
- - Results are used by `grok_analyze_column_combinations`, `grok_identify_order_by_candidates`, and `grok_calculate_dqi`
-
-3. `grok_calculate_dqi`
- - Uses data from `grok_analyze_column_stats`
- - No direct function dependencies
-
-4. `grok_create_temp_table`
- - No dependencies on other functions
- - Creates temporary tables used by other analysis functions
-
-## Materialized View Management Subsystem
-
-### Main Entry Points
-- `grok_create_optimized_matv` - Creates an optimized materialized view system
-- `grok_manage_matv_health` - Monitors and maintains materialized view health
-
-### Dependency Hierarchy for Creation
-
-```
-grok_create_optimized_matv
-├── grok_generate_column_lists (not explicitly called but similar functionality)
-├── grok_generate_synthetic_key_and_hash (not explicitly called but similar functionality)
-└── grok_create_indexes (not explicitly called but similar functionality)
-```
-
-### Dependency Hierarchy for Health Management
-
-```
-grok_manage_matv_health
-├── grok_check_matv_mismatches
-├── grok_estimate_matv_refresh_time
-└── grok_perform_matv_action (indirectly)
-
-grok_perform_matv_action
-└── (No function dependencies)
-
-grok_assemble_matv_health_result
-└── grok_estimate_matv_refresh_time
-```
-
-### Function Relationships
-
-1. `grok_create_optimized_matv`
- - Has similar functionality to `grok_generate_column_lists` but doesn't call it directly
- - Has similar functionality to `grok_generate_synthetic_key_and_hash` but doesn't call it directly
- - Has similar functionality to `grok_create_indexes` but doesn't call it directly
- - Creates a complete materialized view system (source view, materialized view, and read views)
-
-2. `grok_manage_matv_health`
- - Calls `grok_check_matv_mismatches` to detect inconsistencies
- - Calls `grok_estimate_matv_refresh_time` to estimate refresh times
- - Contains embedded functionality similar to `grok_perform_matv_action`
-
-3. `grok_perform_matv_action`
- - No direct function dependencies
- - Performs maintenance actions on materialized views
-
-4. `grok_assemble_matv_health_result`
- - Calls `grok_estimate_matv_refresh_time` to get refresh time estimates
- - Formats health check results
-
-5. `grok_check_matv_mismatches`
- - No direct function dependencies
- - Performs content hash comparison between source and materialized views
-
-6. `grok_validate_matv_inputs`
- - No direct function dependencies
- - Validates materialized view and source view existence
-
-7. `grok_set_validation_params`
- - No direct function dependencies
- - Configures validation parameters for health checks
-
-## Utility Functions
-
-1. `grok_calculate_sample_size`
- - Called by `grok_analyze_table_fitness`
- - Called by `grok_calculate_matv_sample_size` (though the result is unused)
-
-2. `grok_calculate_matv_sample_size`
- - Calls `grok_calculate_sample_size` but doesn't use the result
- - Used for materialized view validation sampling
-
-3. `grok_estimate_matv_refresh_time`
- - Called by `grok_assemble_matv_health_result`
- - Called by `grok_manage_matv_health`
- - Estimates materialized view refresh times
-
-4. `grok_validate_order_by_columns`
- - No direct function dependencies
- - Validates timestamp-like columns for ordering
-
-## Integration Points
-
-The two subsystems integrate at these key points:
-
-1. **Table Analysis → Materialized View Creation**:
- - Analysis results from `grok_analyze_table_fitness` can inform parameters for `grok_create_optimized_matv`
- - Recommended partition columns and order-by columns can be used directly
-
-2. **Materialized View Management**:
- - Both `grok_create_indexes` and `grok_create_optimized_matv` create similar index structures
- - `grok_assemble_matv_result` and `grok_assemble_matv_health_result` format related outputs
-
-## External Dependencies
-
-These functions depend on external database objects:
-
-1. **Table Fitness Audit Table**:
- - `config.table_fitness_audit` - Stores table analysis results
-
-2. **Materialized View Statistics Tables**:
- - `public.c77_dbh_matv_stats` - Stores materialized view refresh statistics
diff --git a/grok_perform_matv_action-readme.md b/grok_perform_matv_action-readme.md
deleted file mode 100644
index f42841f..0000000
--- a/grok_perform_matv_action-readme.md
+++ /dev/null
@@ -1,82 +0,0 @@
-# Function: grok_perform_matv_action
-
-## Overview
-This function performs maintenance actions on a materialized view based on its current health status, applying the appropriate remediation strategy.
-
-## Schema
-`config.grok_perform_matv_action`
-
-## Parameters
-- `full_matview_name` (text): Full name of the materialized view (schema.name)
-- `schema_name` (text): Schema containing the materialized view
-- `matview_name` (text): Name of the materialized view
-- `action` (text): Action to perform: 'refresh', 'repair', or 'reindex'
-- `mismatched_records` (bigint): Number of records that don't match between materialized view and source
-- `total_matview_records` (bigint): Total number of records in the materialized view
-- `time_diff` (interval): Time since last refresh
-- `mismatch_threshold` (numeric): Threshold percentage that determines when a refresh is needed
-- `time_threshold` (interval): Time threshold that determines when a refresh is needed
-- `encoding_issues` (bigint): Number of records with encoding issues
-
-## Return Value
-Returns a JSONB object indicating the action result:
-```json
-{
- "action_performed": true,
- "action_result": "Refreshed successfully (concurrently)"
-}
-```
-
-Or in case no action was taken or an error occurred:
-```json
-{
- "action_performed": false,
- "action_result": "Action skipped: threshold not met or invalid action"
-}
-```
-
-## Description
-This function implements a conditional maintenance system for materialized views based on their current health. It supports three types of actions:
-
-1. **Refresh**: Updates the materialized view with current data from the source view
- - Uses concurrent refresh if a unique index exists
- - Falls back to non-concurrent refresh if no unique index is found
- - Only performed if mismatch ratio exceeds the threshold or time since last refresh exceeds the time threshold
-
-2. **Repair**: Rebuilds indexes and constraints to address encoding issues
- - Drops all existing indexes (except primary keys)
- - Drops primary key and unique constraints
- - Recreates standard indexes on content_hash and synthetic_key
- - Analyzes the table to update statistics
- - Only performed if encoding issues are detected
-
-3. **Reindex**: Rebuilds all indexes without dropping them
- - Can be used for routine maintenance
- - Always performed when requested (no threshold check)
-
-The function intelligently applies the most appropriate technique based on the materialized view's structure and current state.
-
-## Index Management
-For materialized views with unique indexes, the function uses PostgreSQL's REFRESH MATERIALIZED VIEW CONCURRENTLY command, which allows queries to continue running against the materialized view during the refresh. For views without unique indexes, it falls back to the standard non-concurrent refresh.
-
-## Error Handling
-If an error occurs during action execution, the function returns information about the failure without raising an exception, allowing the calling process to continue.
-
-## Dependencies
-This function doesn't directly call other functions but is likely called by `config.grok_manage_matv_health`.
-
-## Usage Example
-```sql
-SELECT config.grok_perform_matv_action(
- 'analytics.matc_daily_sales',
- 'analytics',
- 'matc_daily_sales',
- 'refresh',
- 155,
- 12345,
- '25:30:00'::interval,
- 1.0,
- '24:00:00'::interval,
- 0
-);
-```
diff --git a/grok_set_validation_params-readme.md b/grok_set_validation_params-readme.md
deleted file mode 100644
index 4a34809..0000000
--- a/grok_set_validation_params-readme.md
+++ /dev/null
@@ -1,69 +0,0 @@
-# Function: grok_set_validation_params
-
-## Overview
-This function sets validation parameters and thresholds based on the specified validation type for materialized view health checks.
-
-## Schema
-`config.grok_set_validation_params`
-
-## Parameters
-- `validation_type` (text): Type of validation to configure: 'quick', 'daily', or 'full'
-
-## Return Value
-Returns a JSONB object containing validation parameters and thresholds:
-```json
-{
- "params": {
- "sample_percent": 0.1,
- "confidence": 0.95,
- "margin": 0.03
- },
- "mismatch_threshold": 0.1,
- "time_threshold": "3 days"
-}
-```
-
-## Description
-This function configures appropriate validation parameters and thresholds based on the specified validation type. It supports three validation modes, each with its own balance between thoroughness and performance:
-
-1. **Quick** (default): Light validation for frequent checks
- - Sampling: 0.1% of records
- - Confidence level: 95%
- - Margin of error: 3%
- - Mismatch threshold: 0.1% (data mismatch tolerance)
- - Time threshold: 3 days (acceptable staleness)
-
-2. **Daily**: Medium validation for daily maintenance
- - Sampling: 1% of records
- - Confidence level: 99%
- - Margin of error: 1%
- - Mismatch threshold: 0.05% (data mismatch tolerance)
- - Time threshold: 1 day (acceptable staleness)
-
-3. **Full**: Thorough validation for critical checks
- - Sampling: 100% of records (full scan)
- - Confidence level: 99%
- - Margin of error: 0.5%
- - Mismatch threshold: 0.01% (data mismatch tolerance)
- - Time threshold: 12 hours (acceptable staleness)
-
-If an invalid validation type is provided, the function defaults to 'quick' mode parameters.
-
-## Parameter Explanations
-- `sample_percent`: Percentage of records to sample during validation
-- `confidence`: Statistical confidence level for sampling
-- `margin`: Acceptable margin of error for sampling
-- `mismatch_threshold`: Maximum acceptable percentage of mismatched records
-- `time_threshold`: Maximum acceptable time since last refresh
-
-## Dependencies
-This function is likely called by other materialized view health check functions to configure validation parameters.
-
-## Usage Example
-```sql
--- Get validation parameters for daily checks
-SELECT config.grok_set_validation_params('daily');
-
--- Get validation parameters for thorough health check
-SELECT config.grok_set_validation_params('full');
-```
diff --git a/grok_validate_matv_inputs-readme.md b/grok_validate_matv_inputs-readme.md
deleted file mode 100644
index 6bf0668..0000000
--- a/grok_validate_matv_inputs-readme.md
+++ /dev/null
@@ -1,70 +0,0 @@
-# Function: grok_validate_matv_inputs
-
-## Overview
-This function validates the existence of a materialized view and its source view before performing operations on them, ensuring inputs are valid.
-
-## Schema
-`config.grok_validate_matv_inputs`
-
-## Parameters
-- `schema_name` (text): Schema containing the materialized view and source view
-- `matview_name` (text): Name of the materialized view
-- `vtw_name` (text): Optional name of the source view (if not provided, derived from matview_name)
-
-## Return Value
-Returns a JSONB object with validation results:
-
-Success case:
-```json
-{
- "full_matview_name": "schema.matview_name",
- "full_vtw_name": "schema.vtw_name",
- "notes": []
-}
-```
-
-Error case:
-```json
-{
- "error": "Materialized view schema.matview_name does not exist",
- "notes": []
-}
-```
-
-## Description
-This function performs input validation before executing operations on materialized views by:
-
-1. Constructing the fully qualified names for the materialized view and source view
-2. Checking if the materialized view exists in pg_matviews
-3. Checking if the source view exists in either pg_views or pg_tables
-4. Returning appropriate error messages if either object is missing
-
-If `vtw_name` is not provided, the function derives it by replacing 'matc_' with 'vtw_' in the materialized view name, following the standard naming convention.
-
-## Validation Checks
-The function checks:
-- Materialized view existence using the pg_matviews system catalog
-- Source view existence using both pg_views and pg_tables system catalogs (handles both views and tables)
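-
-A minimal sketch of these catalog checks as standalone queries, using the schema and object names from the usage example below (not the function's actual body):
-
-```sql
--- Does the materialized view exist?
-SELECT EXISTS (
-    SELECT 1 FROM pg_matviews
-    WHERE schemaname = 'analytics' AND matviewname = 'matc_daily_sales'
-);
-
--- Does the source exist as either a view or a table?
-SELECT EXISTS (
-    SELECT 1 FROM pg_views  WHERE schemaname = 'analytics' AND viewname  = 'vtw_daily_sales'
-    UNION ALL
-    SELECT 1 FROM pg_tables WHERE schemaname = 'analytics' AND tablename = 'vtw_daily_sales'
-);
-
--- Deriving the source view name when vtw_name is NULL
-SELECT replace('matc_daily_sales', 'matc_', 'vtw_');  -- => vtw_daily_sales
-```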
-
-## Error Handling
-If validation fails, the function returns a descriptive error message indicating which object is missing. If an unexpected error occurs during validation, it returns a generic error message with the exception details.
-
-## Dependencies
-This function doesn't call other functions but is likely called by materialized view management functions before performing operations.
-
-## Usage Example
-```sql
--- Validate materialized view with automatic source view name derivation
-SELECT config.grok_validate_matv_inputs(
- 'analytics',
- 'matc_daily_sales',
- NULL
-);
-
--- Validate materialized view with explicit source view name
-SELECT config.grok_validate_matv_inputs(
- 'analytics',
- 'matc_daily_sales',
- 'custom_source_view'
-);
-```
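-
-Since the result is JSONB, a caller can detect a failed validation with the `?` key-existence operator:
-
-```sql
--- TRUE when validation failed (the result carries an "error" key)
-SELECT config.grok_validate_matv_inputs('analytics', 'matc_daily_sales', NULL) ? 'error';
-```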
diff --git a/grok_validate_order_by_columns-readme.md b/grok_validate_order_by_columns-readme.md
deleted file mode 100644
index 1b1bf95..0000000
--- a/grok_validate_order_by_columns-readme.md
+++ /dev/null
@@ -1,63 +0,0 @@
-# Function: grok_validate_order_by_columns
-
-## Overview
-This function validates that specified order-by columns exist in a source table and contain data that can be parsed as timestamps, ensuring they can be used for deterministic ordering.
-
-## Schema
-`config.grok_validate_order_by_columns`
-
-## Parameters
-- `source_schema` (text): Schema containing the source table
-- `source_table` (text): Name of the source table
-- `order_by_columns` (text[]): Array of column names to validate
-
-## Return Value
-Returns a text array containing warning messages for any issues found:
-```
-{
- "Warning: column_name not found in schema.table",
- "Warning: column_name contains unparseable timestamp data: error message"
-}
-```
-
-## Description
-This function validates columns intended for use in ORDER BY clauses, particularly for generating synthetic keys in materialized views. It performs two types of validation:
-
-1. **Existence Check**: Verifies each column exists in the specified table
-2. **Timestamp Parsing**: Tests if each column's data can be parsed as a timestamp
-
-For timestamp parsing, the function attempts to convert the column data using:
-```sql
-TO_TIMESTAMP(SUBSTRING(NULLIF(column, ''), 1, 19), 'YYYY-MM-DD HH24:MI:SS')
-```
-
-This validation approach ensures that:
-- The specified columns actually exist in the source table
-- Timestamp columns can be parsed consistently
-- The ORDER BY clause will produce deterministic results
-
-## Timestamp Parsing Details
-The timestamp parsing logic:
-- Uses NULLIF to treat empty strings as NULL, so they yield NULL rather than a parse error
-- Takes only the first 19 characters using SUBSTRING
-- Uses a fixed format of 'YYYY-MM-DD HH24:MI:SS'
-
-This standardized parsing yields consistent ordering behavior as long as each value begins with a 'YYYY-MM-DD HH24:MI:SS' timestamp prefix.
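-
-The per-column probe can be pictured as the parsing expression run against the whole column inside an exception handler. A hypothetical sketch, reusing the table and column from the usage example below:
-
-```sql
-DO $$
-BEGIN
-    -- Probe: raises if any non-empty value cannot be parsed with the fixed format
-    PERFORM TO_TIMESTAMP(SUBSTRING(NULLIF(created_at::text, ''), 1, 19),
-                         'YYYY-MM-DD HH24:MI:SS')
-    FROM public.customers;
-EXCEPTION WHEN OTHERS THEN
-    RAISE WARNING 'created_at contains unparseable timestamp data: %', SQLERRM;
-END $$;
-```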
-
-## Error Handling
-The function collects warnings without failing, allowing for a complete validation report:
-- Missing columns generate a warning
-- Unparseable timestamp data generates a warning with the specific error
-- If an unexpected error occurs, it returns a general error message
-
-## Dependencies
-This function is likely called by other functions that create materialized views to validate order-by columns before using them.
-
-## Usage Example
-```sql
-SELECT config.grok_validate_order_by_columns(
- 'public',
- 'customers',
- ARRAY['created_at', 'updated_at']
-);
-```
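-
-The returned warnings are a plain text array, so they can be expanded one per row with `unnest`:
-
-```sql
--- List any warnings individually
-SELECT unnest(config.grok_validate_order_by_columns(
-    'public', 'customers', ARRAY['created_at', 'updated_at']
-)) AS warning;
-```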