c77_mvc sql and documentation

This commit is contained in:
Tom Rogers 2025-03-27 07:00:55 -05:00
parent 1458c8e493
commit 24910e700d
18 changed files with 2563 additions and 575 deletions

8
.idea/.gitignore generated vendored Normal file
View File

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

8
.idea/c77_mvc.iml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="WEB_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

12
.idea/dataSources.xml generated Normal file
View File

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
<data-source source="LOCAL" name="dbh@localhost" uuid="c71b97f6-26f8-4e20-acce-d8677010d18e">
<driver-ref>postgresql</driver-ref>
<synchronize>true</synchronize>
<jdbc-driver>org.postgresql.Driver</jdbc-driver>
<jdbc-url>jdbc:postgresql://localhost:5432/dbh</jdbc-url>
<working-dir>$ProjectFileDir$</working-dir>
</data-source>
</component>
</project>

8
.idea/modules.xml generated Normal file
View File

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/c77_mvc.iml" filepath="$PROJECT_DIR$/.idea/c77_mvc.iml" />
</modules>
</component>
</project>

19
.idea/php.xml generated Normal file
View File

@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="MessDetectorOptionsConfiguration">
<option name="transferred" value="true" />
</component>
<component name="PHPCSFixerOptionsConfiguration">
<option name="transferred" value="true" />
</component>
<component name="PHPCodeSnifferOptionsConfiguration">
<option name="highlightLevel" value="WARNING" />
<option name="transferred" value="true" />
</component>
<component name="PhpStanOptionsConfiguration">
<option name="transferred" value="true" />
</component>
<component name="PsalmOptionsConfiguration">
<option name="transferred" value="true" />
</component>
</project>

6
.idea/vcs.xml generated Normal file
View File

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

187
INSTALLATION.md Normal file
View File

@ -0,0 +1,187 @@
# Installation Guide for c77_mvc PostgreSQL Extension
## Prerequisites
Before installing the c77_mvc extension, ensure you have:
1. PostgreSQL 11 or later installed
2. Administrative access to your PostgreSQL instance
3. The c77_dbh extension installed (required dependency)
4. Git (if installing from source repository)
## Standard Installation
### Option 1: Using PostgreSQL Extensions Directory
1. Copy the extension files to your PostgreSQL extensions directory:
```bash
# Get the extension directory location
export PGEXTDIR=$(pg_config --sharedir)/extension
# Copy files
sudo cp c77_mvc.control $PGEXTDIR/
sudo cp c77_mvc--1.0.sql $PGEXTDIR/
```
2. Connect to your PostgreSQL database and create the extension:
```sql
CREATE EXTENSION IF NOT EXISTS c77_dbh; -- Install dependency first if not already installed
CREATE EXTENSION c77_mvc;
```
### Option 2: Installing from Git Repository
1. Clone the repository:
```bash
git clone https://git.jctr3.com/trogers1884/c77_mvc.git
cd c77_mvc
```
2. Copy files to your PostgreSQL extensions directory:
```bash
export PGEXTDIR=$(pg_config --sharedir)/extension
sudo cp c77_mvc.control $PGEXTDIR/
sudo cp c77_mvc--1.0.sql $PGEXTDIR/
```
3. Connect to your PostgreSQL database and create the extension:
```sql
CREATE EXTENSION IF NOT EXISTS c77_dbh; -- Install dependency first if not already installed
CREATE EXTENSION c77_mvc;
```
## Manual Installation
If you prefer to install the extension manually or if you need to customize the installation process, follow these steps:
1. Ensure the c77_dbh extension is already installed:
```sql
SELECT * FROM pg_extension WHERE extname = 'c77_dbh';
```
If not installed, install it first:
```sql
CREATE EXTENSION c77_dbh;
```
2. Create the table and functions manually by executing the SQL commands:
```sql
-- Create the audit table
CREATE TABLE IF NOT EXISTS public.c77_mvc_table_fitness_audit (
run_id BIGSERIAL,
run_timestamp timestamp without time zone DEFAULT CURRENT_TIMESTAMP,
source_schema text COLLATE pg_catalog."default",
source_table text COLLATE pg_catalog."default",
analysis_result jsonb,
notes text[] COLLATE pg_catalog."default",
CONSTRAINT table_fitness_audit_pkey PRIMARY KEY (run_id)
) TABLESPACE pg_default;
CREATE INDEX IF NOT EXISTS idx_table_fitness_audit_table
ON public.c77_mvc_table_fitness_audit USING btree
(source_schema COLLATE pg_catalog."default" ASC NULLS LAST, source_table COLLATE pg_catalog."default" ASC NULLS LAST)
TABLESPACE pg_default;
CREATE INDEX IF NOT EXISTS idx_table_fitness_audit_timestamp
ON public.c77_mvc_table_fitness_audit USING btree
(run_timestamp ASC NULLS LAST)
TABLESPACE pg_default;
-- Now execute all the function creation SQL commands from c77_mvc--1.0.sql
-- (Copy and paste all CREATE OR REPLACE FUNCTION statements from the SQL file)
```
3. Verify the installation:
```sql
-- Check if the main table exists
SELECT * FROM pg_tables WHERE tablename = 'c77_mvc_table_fitness_audit';
-- Check if key functions exist
SELECT proname, pronamespace::regnamespace as schema
FROM pg_proc
WHERE proname LIKE 'c77_mvc%'
ORDER BY proname;
```
## Troubleshooting
### Common Issues
1. **Dependency Error**: If you see an error saying that the c77_dbh extension is missing, make sure it is installed properly:
```sql
CREATE EXTENSION c77_dbh;
```
2. **Permission Issues**: Ensure your PostgreSQL user has sufficient privileges:
```sql
-- For a specific user
GRANT ALL ON SCHEMA public TO your_user;
GRANT ALL ON ALL TABLES IN SCHEMA public TO your_user;
GRANT ALL ON ALL SEQUENCES IN SCHEMA public TO your_user;
GRANT ALL ON ALL FUNCTIONS IN SCHEMA public TO your_user;
```
3. **Schema Issues**: If you're installing to a non-public schema, adjust permissions accordingly:
```sql
-- Replace 'custom_schema' with your target schema
GRANT ALL ON SCHEMA custom_schema TO your_user;
GRANT ALL ON ALL TABLES IN SCHEMA custom_schema TO your_user;
GRANT ALL ON ALL SEQUENCES IN SCHEMA custom_schema TO your_user;
GRANT ALL ON ALL FUNCTIONS IN SCHEMA custom_schema TO your_user;
```
### Checking for Successful Installation
To verify that the extension was installed correctly:
```sql
-- List installed extensions
SELECT * FROM pg_extension WHERE extname = 'c77_mvc';
-- Check if the main table exists
SELECT * FROM information_schema.tables WHERE table_name = 'c77_mvc_table_fitness_audit';
-- Test a simple function
SELECT public.c77_mvc_calculate_sample_size(1000000);
```
## Upgrading
To upgrade from a previous version of the extension:
```sql
ALTER EXTENSION c77_mvc UPDATE TO '1.0';
```
## Uninstallation
If you need to uninstall the extension:
```sql
DROP EXTENSION c77_mvc;
```
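Note: `DROP EXTENSION` removes only the objects that are members of the extension, and it fails if other database objects depend on them. Views and materialized views created by calling the extension's functions are not extension members and may need to be dropped separately. To also drop any objects that depend on the extension: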
```sql
DROP EXTENSION c77_mvc CASCADE;
```
## Getting Help
For additional help or to report issues:
- Visit the repository at: https://git.jctr3.com/trogers1884/c77_mvc
- Contact the maintainer via issues on the repository

19
LICENSE.md Normal file
View File

@ -0,0 +1,19 @@
PostgreSQL License
Copyright (c) 2025 c77_mvc Contributors
Permission to use, copy, modify, and distribute this software and its
documentation for any purpose, without fee, and without a written agreement
is hereby granted, provided that the above copyright notice and this paragraph
and the following two paragraphs appear in all copies.
IN NO EVENT SHALL THE AUTHORS BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT,
SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS,
ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF
THE AUTHORS HAVE BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
THE AUTHORS SPECIFICALLY DISCLAIM ANY WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS,
AND THE AUTHORS HAVE NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT,
UPDATES, ENHANCEMENTS, OR MODIFICATIONS.

252
README.md
View File

@ -1,173 +1,149 @@
# PostgreSQL Data Management System # c77_mvc PostgreSQL Extension
[![PostgreSQL](https://img.shields.io/badge/PostgreSQL-11%2B-blue.svg)](https://www.postgresql.org/)
A PostgreSQL extension for materialized view management and table fitness analysis.
## Overview ## Overview
This collection of PostgreSQL functions forms a comprehensive data management system designed to analyze table structures, create optimized materialized views, and maintain their health over time. The system consists of two integrated subsystems that work together to improve database performance, data quality, and maintenance efficiency. c77_mvc (Materialized View and table fitness utilities) provides database administrators and developers with tools to:
## Core Subsystems 1. **Create optimized materialized views** with synthetic keys and content hashing
2. **Manage materialized view health** with automatic staleness detection and refresh
3. **Analyze table fitness** for partitioning, indexing, and query optimization
4. **Monitor data quality** with encoding issue detection and isolation
### 1. Table Analysis Subsystem ## Features
This subsystem analyzes database tables to identify their characteristics, data quality, and optimal strategies for keys, partitioning, and ordering. ### Materialized View Management
**Key Features:** - Create materialized views with synthetic keys and proper partitioning
- Statistical sampling for efficient analysis of large tables - Track content hashes to detect data changes efficiently
- Column-level fitness evaluation for primary/foreign key suitability - Isolate records with encoding issues into separate views
- Data quality assessment with encoding issue detection - Monitor materialized view health with configurable thresholds
- Identification of optimal column combinations for partitioning - Automatically refresh views based on staleness metrics
- Detection of timestamp columns suitable for ordering - Estimate refresh times based on historical performance
- Overall Data Quality Index (DQI) calculation
**Primary Functions:** ### Table Fitness Analysis
- `grok_analyze_table_fitness`: Main entry point for table analysis
- `grok_analyze_column_stats`: Analyzes individual column characteristics
- `grok_analyze_column_combinations`: Evaluates column pairs for composite keys
- `grok_calculate_dqi`: Calculates the overall Data Quality Index
### 2. Materialized View Management Subsystem - Analyze column characteristics for partitioning and indexing
- Identify optimal column combinations for keys and partitioning
- Evaluate data quality with comprehensive metrics
- Calculate overall Data Quality Index (DQI)
- Use statistical sampling for efficient analysis of large tables
This subsystem creates, monitors, and maintains optimized materialized views based on insights from the table analysis. ## Requirements
**Key Features:** - PostgreSQL 11 or later
- Optimized materialized view creation with proper indexing - c77_dbh extension (dependency)
- Automatic handling of character encoding issues
- Synthetic key generation for uniqueness
- Content hash generation for efficient change detection
- Health monitoring with staleness detection
- Automated maintenance and remediation actions
**Primary Functions:** ## Installation
- `grok_create_optimized_matv`: Creates a complete materialized view system
- `grok_manage_matv_health`: Monitors and maintains materialized view health
- `grok_check_matv_mismatches`: Detects inconsistencies between source and materialized views
- `grok_perform_matv_action`: Executes maintenance actions on materialized views
## Architecture & Design Patterns ### Quick Install
The system implements several important design patterns: If you have both extensions available in your PostgreSQL extensions directory:
1. **View Layering Pattern**: Creates multiple views serving different purposes:
- `vtw_*`: View To Watch (source view with data quality enhancement)
- `matc_*`: MATerialized Copy (physical storage with indexes)
- `vm_*`: View of Materialized view (clean data for querying)
- `vprob_*`: View of PROBlematic data (encoding issues for review)
2. **Data Quality Management Pattern**: Automatically detects, flags, and segregates problematic data:
- Non-ASCII character detection
- Cleansed versions of problematic text
- Separate views for clean vs. problematic data
3. **Change Detection Pattern**: Implements efficient methods to detect data changes:
- Content hash generation from relevant columns
- Timestamp-based staleness detection
- Sampling-based consistency validation
4. **Maintenance Strategy Pattern**: Provides multiple strategies for maintaining materialized views:
- Refresh: Updates with fresh data from the source
- Repair: Rebuilds indexes and constraints
- Reindex: Rebuilds indexes without dropping them
## Usage Examples
### Analyzing a Table
```sql ```sql
-- Analyze a table to identify key characteristics and data quality CREATE EXTENSION c77_dbh; -- Install dependency first
SELECT config.grok_analyze_table_fitness( CREATE EXTENSION c77_mvc;
'public', -- Source schema ```
'customer_data', -- Source table
ARRAY['id', 'uid'] -- Columns to exclude from key fitness evaluation ### From Source
```bash
# Clone repository
git clone https://git.jctr3.com/trogers1884/c77_mvc.git
cd c77_mvc
# Copy files to PostgreSQL extension directory
export PGEXTDIR=$(pg_config --sharedir)/extension
sudo cp c77_mvc.control $PGEXTDIR/
sudo cp c77_mvc--1.0.sql $PGEXTDIR/
# Create extension in your database
psql -d your_database -c "CREATE EXTENSION c77_dbh;"
psql -d your_database -c "CREATE EXTENSION c77_mvc;"
```
For detailed installation instructions, see [INSTALLATION.md](INSTALLATION.md).
## Basic Usage
### Table Fitness Analysis
```sql
-- Analyze a table for fitness metrics
SELECT * FROM public.c77_mvc_analyze_table_fitness('schema_name', 'table_name');
```
### Creating Optimized Materialized Views
```sql
-- Create an optimized materialized view
SELECT * FROM public.c77_mvc_create_optimized_matv(
'source_schema', -- Source schema
'source_table', -- Source table
'target_schema', -- Target schema
'matc_target_view_name', -- Target materialized view name (must start with matc_)
ARRAY['customer_id'], -- Partition columns
ARRAY['last_updated'], -- Order-by columns
ARRAY['notes'], -- Columns to exclude from hash calculation (optional)
false -- Filter for latest records only (optional)
); );
``` ```
### Creating an Optimized Materialized View ### Managing Materialized View Health
```sql ```sql
-- Create an optimized materialized view system based on analysis results -- Check materialized view health
SELECT config.grok_create_optimized_matv( SELECT * FROM public.c77_mvc_manage_matv_health(
'public', -- Source schema 'schema_name', -- Schema name
'customer_data', -- Source table 'matc_view_name', -- Materialized view name
'analytics', -- Target schema 'quick', -- Validation type: 'quick', 'daily', 'full'
'matc_customer_summary', -- Target materialized view name NULL -- Action: NULL, 'refresh', 'repair', 'reindex'
ARRAY['region', 'customer_type'], -- Partition columns );
ARRAY['updated_at', 'customer_id'], -- Order-by columns
ARRAY['created_by', 'modified_by'], -- Columns to exclude from hash -- Check and refresh if needed
true -- Filter to latest records only SELECT * FROM public.c77_mvc_manage_matv_health(
'schema_name',
'matc_view_name',
'daily',
'refresh'
); );
``` ```
### Monitoring Materialized View Health For comprehensive usage examples, see [USAGE.md](USAGE.md).
```sql ## View Structure
-- Check health of a materialized view
SELECT config.grok_manage_matv_health(
'analytics', -- Schema
'matc_customer_summary', -- Materialized view name
'daily', -- Validation type: 'quick', 'daily', or 'full'
NULL -- Action (NULL for check only, 'refresh', 'repair', 'reindex')
);
```
### Maintaining Materialized View Health When you create an optimized materialized view, the extension creates multiple objects:
```sql | Object | Naming Pattern | Purpose |
-- Refresh a stale materialized view |--------|----------------|---------|
SELECT config.grok_manage_matv_health( | View | vtw_* | Source view with content hash, synthetic key, and encoding status |
'analytics', -- Schema | Materialized View | matc_* | Materialized copy of the vtw_ view |
'matc_customer_summary', -- Materialized view name | View | vm_* | Clean data view (excludes encoding issues) |
'daily', -- Validation type | View | vprob_* | Problematic data view (only encoding issues) |
'refresh' -- Action to perform
);
```
## Performance Considerations ## Documentation
- **Sampling**: The system uses statistical sampling for efficient analysis of large tables - [Installation Guide](INSTALLATION.md)
- **Concurrent Refresh**: Uses concurrent refresh when possible (requires unique indexes) - [Usage Guide](USAGE.md)
- **Validation Modes**: Offers different validation modes with performance/thoroughness tradeoffs: - [Technical Assessment](TECHNICAL.md)
- `quick`: Fastest, uses 0.1% sampling, 3-day staleness threshold
- `daily`: Medium, uses 1% sampling, 1-day staleness threshold
- `full`: Most thorough, uses 100% sampling, 12-hour staleness threshold
## Dependencies ## Contributing
This system depends on the following database objects: 1. Fork the repository
2. Create your feature branch (`git checkout -b feature/amazing-feature`)
3. Commit your changes (`git commit -m 'Add some amazing feature'`)
4. Push to the branch (`git push origin feature/amazing-feature`)
5. Open a Pull Request
1. **Table Fitness Audit Table**: ## License
- `config.table_fitness_audit`: Stores table analysis results
2. **Materialized View Statistics Table**: This project is licensed under the PostgreSQL License - see [LICENSE.md](LICENSE.md) for details.
- `public.c77_dbh_matv_stats`: Stores materialized view refresh statistics
## Best Practices ## Acknowledgements
1. **Initial Analysis**: Run table analysis before creating materialized views to identify optimal configuration - Developed by [Your Name/Organization]
2. **Regular Health Checks**: Schedule periodic health checks using `grok_manage_matv_health` - Special thanks to contributors and testers
3. **Validation Types**: Use `quick` for frequent checks, `daily` for daily maintenance, and `full` for critical views
4. **Monitoring**: Track Data Quality Index (DQI) over time to detect data quality trends
5. **Maintenance Windows**: Schedule refreshes during low-usage periods for large materialized views
## Error Handling
All functions include comprehensive error handling with:
- Clear error messages indicating what went wrong
- Processing notes to track execution steps
- Safe failure modes that avoid leaving the database in an inconsistent state
## Troubleshooting
Common issues and solutions:
1. **Stale Materialized Views**: Use `grok_manage_matv_health` with action='refresh'
2. **Encoding Issues**: Use `grok_manage_matv_health` with action='repair'
3. **Index Performance Issues**: Use `grok_manage_matv_health` with action='reindex'
4. **Missing Statistics**: Ensure `public.c77_dbh_matv_stats` table is populated with refresh statistics
## Extension Points
The system is designed to be extended in several ways:
1. Add custom data quality checks in the `vtw_` view creation
2. Extend partition and order-by column validation logic
3. Implement additional maintenance actions in `grok_perform_matv_action`
4. Add custom health metrics to `grok_manage_matv_health`

148
TECHNICAL.md Normal file
View File

@ -0,0 +1,148 @@
# Technical Assessment: c77_mvc PostgreSQL Extension
## Extension Overview
**Name:** c77_mvc
**Version:** 1.0
**Description:** Materialized view and table fitness utilities
**Repository:** https://git.jctr3.com/trogers1884/c77_mvc
**Dependencies:** c77_dbh
**Relocatable:** Yes
## Purpose and Functionality
The c77_mvc extension provides a comprehensive set of utilities for:
1. **Materialized View Management**
- Creating optimized materialized views with synthetic keys and content hashing
- Monitoring materialized view health and staleness
- Refreshing materialized views based on configurable thresholds
- Handling character encoding issues in data
2. **Table Fitness Analysis**
- Evaluating column characteristics for partitioning and ordering
- Identifying optimal column combinations for keys
- Calculating overall data quality metrics
- Providing recommendations for database optimization
## Technical Architecture
### Core Components
#### Materialized View Management
The extension implements a structured approach to materialized view creation and management using a naming convention pattern:
- `vtw_*`: Source view with content hash, synthetic key, and encoding status
- `matc_*`: Materialized view derived from the source view
- `vm_*`: View for reading cleaned data (filtering out encoding issues)
- `vprob_*`: View for displaying problematic data with encoding issues
#### Table Fitness Analysis
The extension provides analytical functions to:
- Sample table data appropriately based on statistical methods
- Assess individual column characteristics
- Evaluate column combinations for uniqueness and discriminatory power
- Calculate a data quality index (DQI) based on nulls, encoding, and uniqueness
### Database Schema
The extension creates one table:
- `c77_mvc_table_fitness_audit`: Stores the results of table fitness analyses
### Key Functions
#### Materialized View Management
1. `c77_mvc_create_optimized_matv`: Creates a set of views/materialized views with content hashing and synthetic keys
2. `c77_mvc_manage_matv_health`: Monitors materialized view health and performs maintenance actions
3. `c77_mvc_check_matv_mismatches`: Compares materialized views with source views to detect staleness
4. `c77_mvc_estimate_matv_refresh_time`: Estimates refresh time based on historical performance
#### Table Fitness Analysis
1. `c77_mvc_analyze_table_fitness`: Main entry point for analyzing table structure and data quality
2. `c77_mvc_analyze_column_stats`: Evaluates individual column characteristics
3. `c77_mvc_analyze_column_combinations`: Identifies effective column combinations for partitioning
4. `c77_mvc_calculate_dqi`: Calculates a data quality index based on multiple metrics
## Implementation Details
### Statistical Sampling
The extension employs statistical sampling methods to efficiently analyze large tables:
- Uses confidence level and margin of error parameters to calculate appropriate sample sizes
- Implements table sampling using PostgreSQL's TABLESAMPLE clause
- Adjusts sample sizes dynamically based on validation type (quick, daily, full)
### Synthetic Key Generation
For materialized views, the extension:
- Creates synthetic keys using ROW_NUMBER() with custom PARTITION BY and ORDER BY clauses
- Handles timestamp parsing and ordering intelligently
- Ensures deterministic ordering for consistent key generation
### Content Hashing
The extension uses MD5 hashing of row data to:
- Detect changes between source data and materialized views
- Enable efficient comparison for staleness detection
- Facilitate incremental refresh decisions
### Character Encoding Handling
The extension provides robust handling of character encoding issues:
- Detects non-ASCII characters using regex pattern matching
- Segregates problematic data into separate views
- Provides clean views for standard operations
## Security and Performance Considerations
### Security
- The extension uses proper quoting and identifier escaping throughout to prevent SQL injection
- Error handling includes careful message construction to avoid exposing sensitive information
- Temporary tables are used to isolate analysis operations
### Performance
- Statistical sampling is employed to analyze large tables efficiently
- The extension uses table partitioning and appropriate indexing for materialized views
- Validation types (quick, daily, full) allow for different performance/accuracy tradeoffs
- Refresh operations consider existing performance statistics to make intelligent decisions
### Dependencies
- Relies on the `c77_dbh` extension for certain operations
- Uses the `c77_dbh_matv_stats` table for historical performance tracking
- Verifies dependency existence at installation time
## Code Quality Assessment
### Strengths
1. **Robust Error Handling**: Comprehensive exception-handling blocks throughout the codebase
2. **Parameterization**: Extensive use of parameters allows for flexible configuration
3. **Documentation**: Clear inline documentation of function purposes and parameters
4. **Statistical Approach**: Uses sound statistical methods for sampling and analysis
5. **Modular Design**: Functions are well-organized with clear responsibilities
### Areas for Improvement
1. **Configuration Management**: Some parameters are hardcoded and could be externalized
2. **Testing Coverage**: No explicit test functions or frameworks are included
3. **Schema Management**: Some functions assume specific table structures without validation
4. **Code Duplication**: Some SQL generation patterns are repeated across functions
5. **Performance Metrics**: Limited documentation of expected performance characteristics
## Recommendations
### Documentation Enhancements
1. Add comprehensive function-level documentation explaining parameter use and return values
2. Document the expected table structures and naming conventions
3. Provide examples of common usage patterns for key functions
4. Add performance guidance for large databases
### Feature Enhancements
1. **Configuration Management**: Create a configuration table for tunable parameters
2. **Monitoring Dashboard**: Add functions to generate monitoring reports for DBA use
3. **Batch Operations**: Add capabilities for managing multiple materialized views simultaneously
4. **Custom Metrics**: Allow users to define custom fitness metrics for specific use cases
### Technical Improvements
1. **Parallelization**: Add support for parallel analysis of large tables
2. **Versioning**: Improve version management for schema changes
3. **Testing**: Add a comprehensive test suite
4. **Logging**: Enhance logging capabilities for troubleshooting
5. **Performance Optimization**: Optimize sampling methods for very large tables
## Conclusion
The c77_mvc extension provides a well-designed and comprehensive solution for managing materialized views and analyzing table fitness in PostgreSQL. Its approach to content hashing, synthetic key generation, and encoding issue handling is particularly noteworthy. The statistical sampling methods enable efficient analysis of large tables.
The extension would benefit from improved configuration management, enhanced documentation, and a more structured approach to testing. Overall, it represents a valuable tool for database administrators working with complex PostgreSQL environments, particularly those dealing with data quality issues and materialized view management.
Key strengths include the robust error handling, statistical approach to sampling, and comprehensive materialized view management capabilities. With the suggested improvements, this extension could become an essential part of a PostgreSQL database administrator's toolkit.

335
USAGE.md Normal file
View File

@ -0,0 +1,335 @@
# c77_mvc PostgreSQL Extension Usage Guide
This guide provides detailed instructions on how to use the c77_mvc extension for materialized view management and table fitness analysis in PostgreSQL.
## Table of Contents
1. [Overview](#overview)
2. [Table Fitness Analysis](#table-fitness-analysis)
3. [Materialized View Management](#materialized-view-management)
4. [Materialized View Health Monitoring](#materialized-view-health-monitoring)
5. [Advanced Use Cases](#advanced-use-cases)
6. [Best Practices](#best-practices)
7. [Function Reference](#function-reference)
## Overview
The c77_mvc extension provides two main sets of functionality:
1. **Table Fitness Analysis**: Evaluate table structure for data quality, partitioning suitability, and optimization opportunities
2. **Materialized View Management**: Create and maintain optimized materialized views with content hashing, synthetic keys, and encoding status tracking
## Table Fitness Analysis
Table fitness analysis helps you evaluate table structure and data quality to inform optimization decisions.
### Basic Table Analysis
To perform a basic analysis on a table:
```sql
SELECT * FROM public.c77_mvc_analyze_table_fitness('schema_name', 'table_name');
```
This returns a JSON object containing:
- Column statistics
- Recommended partition combinations
- Order-by candidates
- Data quality index
- Analysis notes
### Example with Excluding Key Columns
If you have columns that should not be considered for key combinations:
```sql
SELECT * FROM public.c77_mvc_analyze_table_fitness(
'schema_name',
'table_name',
ARRAY['id', 'created_at']::text[]
);
```
### Interpreting Analysis Results
The analysis result includes:
1. **Column Stats**: Individual column metrics including:
- Null ratio
- Uniqueness ratio
- Encoding issue ratio
- Fitness score
2. **Recommended Partition Combinations**: Column pairs that work well together for partitioning:
- Uniqueness ratio
- Discrimination power
- Average fitness score
3. **Order-by Candidates**: Columns suitable for ordering data:
- Timestamp columns
- Text columns parseable as timestamps
4. **Data Quality Index (DQI)**: Overall score from 0-100 indicating data quality
Example query to extract key information:
```sql
SELECT
run_id,
run_timestamp,
analysis_result->>'data_quality_index' as dqi,
analysis_result->'recommended_partition_combinations' as partition_recommendations
FROM public.c77_mvc_table_fitness_audit
WHERE source_schema = 'schema_name' AND source_table = 'table_name'
ORDER BY run_id DESC
LIMIT 1;
```
## Materialized View Management
The extension provides tools to create and manage optimized materialized views.
### Creating Optimized Materialized Views
To create an optimized materialized view with synthetic keys and content hashing:
```sql
SELECT * FROM public.c77_mvc_create_optimized_matv(
'source_schema', -- Source schema name
'source_table', -- Source table name
'target_schema', -- Target schema for materialized view
'matc_target_mv_name', -- Target materialized view name (should start with matc_)
ARRAY['column1', 'column2'], -- Partition columns
ARRAY['timestamp_column'], -- Order-by columns
ARRAY['exclude_column1'], -- Columns to exclude from content hash calculation (optional)
false -- Filter to get only latest records (optional)
);
```
This creates:
1. `vtw_target_mv_name`: A source view that applies regexp_replace to character columns and adds the content hash, synthetic key, and encoding status
2. `matc_target_mv_name`: A materialized view derived from the vtw_ view
3. `vm_target_mv_name`: A view that filters out encoding issues
4. `vprob_target_mv_name`: A view showing only records with encoding issues
### View Structure and Purpose
When you create an optimized materialized view, multiple objects are created:
| Object Type | Naming Pattern | Purpose |
|-------------|----------------|---------|
| View | vtw_* | Source view with content hash, synthetic key, and encoding status |
| Materialized View | matc_* | Materialized copy of the vtw_ view |
| View | vm_* | Clean data view (excludes encoding issues) |
| View | vprob_* | Problematic data view (only encoding issues) |
### Example Use Case
Scenario: Creating a materialized view of customer data:
```sql
SELECT * FROM public.c77_mvc_create_optimized_matv(
'sales',
'customers',
'reporting',
'matc_customer_summary',
ARRAY['customer_id', 'region'],
ARRAY['last_updated'],
ARRAY['notes', 'internal_comments'],
false
);
```
To query clean data:
```sql
SELECT * FROM reporting.vm_customer_summary;
```
To check for encoding issues:
```sql
SELECT * FROM reporting.vprob_customer_summary;
```
## Materialized View Health Monitoring
The extension provides tools to monitor and maintain the health of materialized views.
### Checking Materialized View Health
```sql
SELECT * FROM public.c77_mvc_manage_matv_health(
'schema_name', -- Schema containing the materialized view
'matc_view_name', -- Materialized view name (should start with matc_)
'quick', -- Validation type: 'quick', 'daily', or 'full'
NULL -- Action: NULL, 'refresh', 'repair', or 'reindex'
);
```
Validation types:
- `quick`: Fast check with a 0.1% sample (staleness threshold: 3 days since last refresh)
- `daily`: More thorough check with a 1% sample (staleness threshold: 1 day since last refresh)
- `full`: Complete check with a 100% sample (staleness threshold: 12 hours since last refresh)
### Automated Refresh
To check health and refresh if needed:
```sql
SELECT * FROM public.c77_mvc_manage_matv_health(
'schema_name',
'matc_view_name',
'daily',
'refresh' -- Will refresh if stale based on thresholds
);
```
### Monitoring Multiple Views
Example script to monitor all materialized views in a schema:
```sql
-- Runs a 'quick' health check against every matc_* materialized view in
-- 'target_schema' and reports each view's status via NOTICE. No action is
-- requested (action = NULL), so nothing is refreshed or repaired here.
DO $$
DECLARE
    view_record RECORD;  -- current row from pg_matviews
    result      JSONB;   -- health report returned for the current view
BEGIN
    FOR view_record IN
        SELECT matviewname
        FROM pg_matviews
        WHERE schemaname = 'target_schema'
          -- "_" is a single-character wildcard in LIKE; escape it so only
          -- names that literally start with "matc_" are matched.
          AND matviewname LIKE 'matc\_%'
        ORDER BY matviewname  -- report views in a stable order
    LOOP
        RAISE NOTICE 'Checking view: %', view_record.matviewname;

        SELECT * FROM public.c77_mvc_manage_matv_health('target_schema', view_record.matviewname, 'quick', NULL) INTO result;

        RAISE NOTICE 'Status: %', result->>'status';
    END LOOP;
END;
$$;
```
## Advanced Use Cases
### Customizing Character Encoding Handling
The extension detects non-ASCII characters using the regex pattern `[^\x00-\x7F]`. For custom handling:
1. Create your own view that modifies the `vtw_` view:
```sql
-- Wrapper view that layers a custom encoding check on top of an existing
-- vtw_ view without altering it.
--   column1: flagged when it contains any non-ASCII character.
--   column2: stricter check - flagged when it contains anything outside the
--            printable ASCII range 0x20-0x7E (so control characters count too).
-- The result is exposed under its own column name so it cannot collide with
-- an encoding-status column already returned by src.* (a duplicate output
-- column name would make CREATE VIEW fail).
-- NOTE(review): the exact name of the vtw_ view's own status column is not
-- shown here - confirm it before relying on either column downstream.
CREATE OR REPLACE VIEW custom_schema.my_custom_vtw AS
SELECT
    src.*,
    CASE
        WHEN src.column1 ~ '[^\x00-\x7F]'
          OR src.column2 ~ '[^\x20-\x7E]' THEN 'CUSTOM_ENCODING_ISSUE'
        ELSE 'CLEAN'
    END AS custom_encoding_status
FROM schema_name.vtw_original_view AS src;
```
### Batch Refresh Strategy
Example of a batch refresh strategy based on analysis:
```sql
-- Batch refresh: health-check every matc_* materialized view in
-- 'target_schema', collect the stale ones whose combined estimated refresh
-- time stays under max_batch_time, then refresh that batch.
DO $$
DECLARE
    view_record      RECORD;                              -- current row from pg_matviews
    view_name        TEXT;                                -- scalar loop target for FOREACH
    health_result    JSONB;                               -- health report for the current view
    estimated_time   INTERVAL;                            -- estimated refresh time of the current view
    total_time       INTERVAL := '0 seconds'::INTERVAL;   -- running estimate for the whole batch
    max_batch_time   INTERVAL := '2 hours'::INTERVAL;     -- time budget for one batch
    views_to_refresh TEXT[]   := '{}';                    -- views selected for this batch
BEGIN
    -- Gather health stats and estimated times
    FOR view_record IN
        SELECT matviewname
        FROM pg_matviews
        WHERE schemaname = 'target_schema'
          -- "_" is a single-character wildcard in LIKE; escape it so only
          -- names that literally start with "matc_" are matched.
          AND matviewname LIKE 'matc\_%'
        ORDER BY matviewname  -- makes the batch selection deterministic
    LOOP
        SELECT * FROM public.c77_mvc_manage_matv_health('target_schema', view_record.matviewname, 'quick', NULL)
        INTO health_result;

        IF health_result->>'status' = 'Stale' THEN
            estimated_time := (health_result->>'estimated_refresh_time')::INTERVAL;

            IF estimated_time IS NULL THEN
                -- Without an estimate the view cannot be budgeted; skip it
                -- visibly instead of letting a NULL comparison drop it silently.
                RAISE NOTICE 'Skipping %: no refresh time estimate available', view_record.matviewname;
            ELSIF (total_time + estimated_time) < max_batch_time THEN
                -- Add to batch if we don't exceed max time.
                -- matviewname is of type "name"; cast so the element type
                -- matches the TEXT[] array.
                views_to_refresh := array_append(views_to_refresh, view_record.matviewname::TEXT);
                total_time := total_time + estimated_time;
            END IF;
        END IF;
    END LOOP;

    -- Refresh the batch. A dedicated scalar variable is used as the loop
    -- target rather than a field of view_record, which is never assigned
    -- when the query above returns no rows.
    FOREACH view_name IN ARRAY views_to_refresh
    LOOP
        RAISE NOTICE 'Refreshing %', view_name;
        PERFORM public.c77_mvc_manage_matv_health('target_schema', view_name, 'quick', 'refresh');
    END LOOP;
END;
$$;
```
## Best Practices
### Table Fitness Analysis
1. **Analyze Regularly**: Run table fitness analysis regularly to track data quality changes
2. **Compare Over Time**: Store historical analysis results for trend tracking
3. **Sample Size Consideration**: For very large tables, adjust confidence level and margin of error:
```sql
SELECT public.c77_mvc_calculate_sample_size(10000000, 0.95, 0.05);
```
### Materialized View Management
1. **Naming Convention**: Follow the expected naming pattern:
- Materialized views must start with `matc_`
- Source views will be created with `vtw_` prefix
- Read views will be created with `vm_` prefix
- Problem record views will have `vprob_` prefix
2. **Partition Column Selection**: Choose partition columns based on:
- Table fitness analysis recommendations
- High uniqueness ratio
- Low null ratio
- Business requirements for data segmentation
3. **Order-by Column Selection**: Choose columns that:
- Represent timestamps or dates
- Have a clear logical ordering in the data
- Are regularly populated (low null ratio)
4. **Refresh Strategy**: Consider:
- Data change frequency
- Query load patterns
- Validation type based on criticality
5. **Performance Monitoring**:
- Track refresh times
- Monitor the c77_dbh_matv_stats table for historical performance
## Function Reference
### Table Fitness Analysis
| Function | Description |
|----------|-------------|
| `c77_mvc_analyze_table_fitness(source_schema, source_table, exclude_key_columns)` | Main function for table fitness analysis |
| `c77_mvc_calculate_sample_size(total_rows, confidence_level, margin_of_error)` | Calculate appropriate sample size for analysis |
| `c77_mvc_analyze_column_stats(temp_table_name, col_name, column_type, sample_size, total_rows, exclude_key_columns)` | Analyze individual column statistics |
| `c77_mvc_analyze_column_combinations(temp_table_name, column_stats, sample_size, total_rows, exclude_key_columns)` | Analyze column combinations for partitioning |
| `c77_mvc_identify_order_by_candidates(temp_table_name, column_stats)` | Identify columns suitable for ordering |
| `c77_mvc_calculate_dqi(column_stats)` | Calculate Data Quality Index |
### Materialized View Management
| Function | Description |
|----------|-------------|
| `c77_mvc_create_optimized_matv(source_schema, source_table, target_schema, target_matview, partition_columns, order_by_columns, exclude_columns_from_hash, filter_latest_only)` | Create an optimized materialized view |
| `c77_mvc_manage_matv_health(target_schema, matview_name, validation_type, action)` | Check and manage materialized view health |
| `c77_mvc_check_matv_mismatches(target_schema, matview_name, validation_type)` | Check for mismatches between source and materialized view |
| `c77_mvc_create_indexes(target_schema, target_mv_name, partition_columns)` | Create indexes on a materialized view |
| `c77_mvc_validate_matv_inputs(schema_name, matview_name, vtw_name)` | Validate materialized view inputs |
| `c77_mvc_validate_order_by_columns(source_schema, source_table, order_by_columns)` | Validate order-by columns |
| `c77_mvc_collect_matv_stats(full_matview_name, full_vtw_name)` | Collect materialized view statistics |
| `c77_mvc_estimate_matv_refresh_time(full_matview_name)` | Estimate refresh time for a materialized view |

1693
c77_mvc--1.0.sql Normal file

File diff suppressed because it is too large Load Diff

6
c77_mvc.control Normal file
View File

@ -0,0 +1,6 @@
# c77_mvc.control
comment = 'Materialized view and table fitness utilities'
default_version = '1.0'
module_pathname = ''
requires = 'c77_dbh'
relocatable = true

View File

@ -1,153 +0,0 @@
# PostgreSQL Function Dependency Map
## Overview
This document maps the dependencies between the PostgreSQL functions in the `config` schema. The functions are organized into two main subsystems:
1. **Table Analysis Subsystem**: Functions for analyzing tables to identify optimal keys, partitioning strategies, and data quality issues
2. **Materialized View Management Subsystem**: Functions for creating, monitoring, and maintaining materialized views
## Table Analysis Subsystem
### Main Entry Point
- `config.grok_analyze_table_fitness` - Orchestrates the complete table analysis process
### Dependency Hierarchy
```
grok_analyze_table_fitness
├── grok_calculate_sample_size
├── grok_create_temp_table
├── grok_analyze_column_stats
├── grok_identify_order_by_candidates
├── grok_analyze_column_combinations
├── grok_calculate_dqi
└── grok_assemble_result
```
### Function Relationships
1. `grok_analyze_table_fitness`
- Calls `grok_calculate_sample_size` to determine appropriate sample size
- Calls `grok_create_temp_table` to create a temporary copy of the source table
- Calls `grok_analyze_column_stats` for each column to analyze its characteristics
- Calls `grok_identify_order_by_candidates` to find columns suitable for ordering
- Calls `grok_analyze_column_combinations` to identify potential composite keys
- Calls `grok_calculate_dqi` to calculate the Data Quality Index
- Calls `grok_assemble_result` to prepare the final results and clean up
2. `grok_analyze_column_stats`
- No dependencies on other functions
- Results are used by `grok_analyze_column_combinations`, `grok_identify_order_by_candidates`, and `grok_calculate_dqi`
3. `grok_calculate_dqi`
- Uses data from `grok_analyze_column_stats`
- No direct function dependencies
4. `grok_create_temp_table`
- No dependencies on other functions
- Creates temporary tables used by other analysis functions
## Materialized View Management Subsystem
### Main Entry Points
- `grok_create_optimized_matv` - Creates an optimized materialized view system
- `grok_manage_matv_health` - Monitors and maintains materialized view health
### Dependency Hierarchy for Creation
```
grok_create_optimized_matv
├── grok_generate_column_lists (not explicitly called but similar functionality)
├── grok_generate_synthetic_key_and_hash (not explicitly called but similar functionality)
└── grok_create_indexes (not explicitly called but similar functionality)
```
### Dependency Hierarchy for Health Management
```
grok_manage_matv_health
├── grok_check_matv_mismatches
├── grok_estimate_matv_refresh_time
└── grok_perform_matv_action (indirectly)
grok_perform_matv_action
└── (No function dependencies)
grok_assemble_matv_health_result
└── grok_estimate_matv_refresh_time
```
### Function Relationships
1. `grok_create_optimized_matv`
- Has similar functionality to `grok_generate_column_lists` but doesn't call it directly
- Has similar functionality to `grok_generate_synthetic_key_and_hash` but doesn't call it directly
- Has similar functionality to `grok_create_indexes` but doesn't call it directly
- Creates a complete materialized view system (source view, materialized view, and read views)
2. `grok_manage_matv_health`
- Calls `grok_check_matv_mismatches` to detect inconsistencies
- Calls `grok_estimate_matv_refresh_time` to estimate refresh times
- Contains embedded functionality similar to `grok_perform_matv_action`
3. `grok_perform_matv_action`
- No direct function dependencies
- Performs maintenance actions on materialized views
4. `grok_assemble_matv_health_result`
- Calls `grok_estimate_matv_refresh_time` to get refresh time estimates
- Formats health check results
5. `grok_check_matv_mismatches`
- No direct function dependencies
- Performs content hash comparison between source and materialized views
6. `grok_validate_matv_inputs`
- No direct function dependencies
- Validates materialized view and source view existence
7. `grok_set_validation_params`
- No direct function dependencies
- Configures validation parameters for health checks
## Utility Functions
1. `grok_calculate_sample_size`
- Called by `grok_analyze_table_fitness`
- Called by `grok_calculate_matv_sample_size` (though the result is unused)
2. `grok_calculate_matv_sample_size`
- Calls `grok_calculate_sample_size` but doesn't use the result
- Used for materialized view validation sampling
3. `grok_estimate_matv_refresh_time`
- Called by `grok_assemble_matv_health_result`
- Called by `grok_manage_matv_health`
- Estimates materialized view refresh times
4. `grok_validate_order_by_columns`
- No direct function dependencies
- Validates timestamp-like columns for ordering
## Integration Points
The two subsystems integrate at these key points:
1. **Table Analysis → Materialized View Creation**:
- Analysis results from `grok_analyze_table_fitness` can inform parameters for `grok_create_optimized_matv`
- Recommended partition columns and order-by columns can be used directly
2. **Materialized View Management**:
- Both `grok_create_indexes` and `grok_create_optimized_matv` create similar index structures
- `grok_assemble_matv_result` and `grok_assemble_matv_health_result` format related outputs
## External Dependencies
These functions depend on external database objects:
1. **Table Fitness Audit Table**:
- `config.table_fitness_audit` - Stores table analysis results
2. **Materialized View Statistics Tables**:
- `public.c77_dbh_matv_stats` - Stores materialized view refresh statistics

View File

@ -1,82 +0,0 @@
# Function: grok_perform_matv_action
## Overview
This function performs maintenance actions on a materialized view based on its current health status, applying the appropriate remediation strategy.
## Schema
`config.grok_perform_matv_action`
## Parameters
- `full_matview_name` (text): Full name of the materialized view (schema.name)
- `schema_name` (text): Schema containing the materialized view
- `matview_name` (text): Name of the materialized view
- `action` (text): Action to perform: 'refresh', 'repair', or 'reindex'
- `mismatched_records` (bigint): Number of records that don't match between materialized view and source
- `total_matview_records` (bigint): Total number of records in the materialized view
- `time_diff` (interval): Time since last refresh
- `mismatch_threshold` (numeric): Threshold percentage that determines when a refresh is needed
- `time_threshold` (interval): Time threshold that determines when a refresh is needed
- `encoding_issues` (bigint): Number of records with encoding issues
## Return Value
Returns a JSONB object indicating the action result:
```json
{
"action_performed": true,
"action_result": "Refreshed successfully (concurrently)"
}
```
Or in case no action was taken or an error occurred:
```json
{
"action_performed": false,
"action_result": "Action skipped: threshold not met or invalid action"
}
```
## Description
This function implements a conditional maintenance system for materialized views based on their current health. It supports three types of actions:
1. **Refresh**: Updates the materialized view with current data from the source view
- Uses concurrent refresh if a unique index exists
- Falls back to non-concurrent refresh if no unique index is found
- Only performed if mismatch ratio exceeds the threshold or time since last refresh exceeds the time threshold
2. **Repair**: Rebuilds indexes and constraints to address encoding issues
- Drops all existing indexes (except primary keys)
- Drops primary key and unique constraints
- Recreates standard indexes on content_hash and synthetic_key
- Analyzes the table to update statistics
- Only performed if encoding issues are detected
3. **Reindex**: Rebuilds all indexes without dropping them
- Can be used for routine maintenance
- Always performed when requested (no threshold check)
The function intelligently applies the most appropriate technique based on the materialized view's structure and current state.
## Index Management
For materialized views with unique indexes, the function uses PostgreSQL's REFRESH MATERIALIZED VIEW CONCURRENTLY command, which allows queries to continue running against the materialized view during the refresh. For views without unique indexes, it falls back to the standard non-concurrent refresh.
## Error Handling
If an error occurs during action execution, the function returns information about the failure without raising an exception, allowing the calling process to continue.
## Dependencies
This function doesn't directly call other functions but is likely called by `config.grok_manage_matv_health`.
## Usage Example
```sql
SELECT config.grok_perform_matv_action(
'analytics.matc_daily_sales',
'analytics',
'matc_daily_sales',
'refresh',
155,
12345,
'25:30:00'::interval,
1.0,
'24:00:00'::interval,
0
);
```

View File

@ -1,69 +0,0 @@
# Function: grok_set_validation_params
## Overview
This function sets validation parameters and thresholds based on the specified validation type for materialized view health checks.
## Schema
`config.grok_set_validation_params`
## Parameters
- `validation_type` (text): Type of validation to configure: 'quick', 'daily', or 'full'
## Return Value
Returns a JSONB object containing validation parameters and thresholds:
```json
{
"params": {
"sample_percent": 0.1,
"confidence": 0.95,
"margin": 0.03
},
"mismatch_threshold": 0.1,
"time_threshold": "3 days"
}
```
## Description
This function configures appropriate validation parameters and thresholds based on the specified validation type. It supports three validation modes, each with its own balance between thoroughness and performance:
1. **Quick** (default): Light validation for frequent checks
- Sampling: 0.1% of records
- Confidence level: 95%
- Margin of error: 3%
- Mismatch threshold: 0.1% (data mismatch tolerance)
- Time threshold: 3 days (acceptable staleness)
2. **Daily**: Medium validation for daily maintenance
- Sampling: 1% of records
- Confidence level: 99%
- Margin of error: 1%
- Mismatch threshold: 0.05% (data mismatch tolerance)
- Time threshold: 1 day (acceptable staleness)
3. **Full**: Thorough validation for critical checks
- Sampling: 100% of records (full scan)
- Confidence level: 99%
- Margin of error: 0.5%
- Mismatch threshold: 0.01% (data mismatch tolerance)
- Time threshold: 12 hours (acceptable staleness)
If an invalid validation type is provided, the function defaults to 'quick' mode parameters.
## Parameter Explanations
- `sample_percent`: Percentage of records to sample during validation
- `confidence`: Statistical confidence level for sampling
- `margin`: Acceptable margin of error for sampling
- `mismatch_threshold`: Maximum acceptable percentage of mismatched records
- `time_threshold`: Maximum acceptable time since last refresh
## Dependencies
This function is likely called by other materialized view health check functions to configure validation parameters.
## Usage Example
```sql
-- Get validation parameters for daily checks
SELECT config.grok_set_validation_params('daily');
-- Get validation parameters for thorough health check
SELECT config.grok_set_validation_params('full');
```

View File

@ -1,70 +0,0 @@
# Function: grok_validate_matv_inputs
## Overview
This function validates the existence of a materialized view and its source view before performing operations on them, ensuring inputs are valid.
## Schema
`config.grok_validate_matv_inputs`
## Parameters
- `schema_name` (text): Schema containing the materialized view and source view
- `matview_name` (text): Name of the materialized view
- `vtw_name` (text): Optional name of the source view (if not provided, derived from matview_name)
## Return Value
Returns a JSONB object with validation results:
Success case:
```json
{
"full_matview_name": "schema.matview_name",
"full_vtw_name": "schema.vtw_name",
"notes": []
}
```
Error case:
```json
{
"error": "Materialized view schema.matview_name does not exist",
"notes": []
}
```
## Description
This function performs input validation before executing operations on materialized views by:
1. Constructing the fully qualified names for the materialized view and source view
2. Checking if the materialized view exists in pg_matviews
3. Checking if the source view exists in either pg_views or pg_tables
4. Returning appropriate error messages if either object is missing
If `vtw_name` is not provided, the function derives it by replacing 'matc_' with 'vtw_' in the materialized view name, following the standard naming convention.
## Validation Checks
The function checks:
- Materialized view existence using the pg_matviews system catalog
- Source view existence using both pg_views and pg_tables system catalogs (handles both views and tables)
## Error Handling
If validation fails, the function returns a descriptive error message indicating which object is missing. If an unexpected error occurs during validation, it returns a generic error message with the exception details.
## Dependencies
This function doesn't call other functions but is likely called by materialized view management functions before performing operations.
## Usage Example
```sql
-- Validate materialized view with automatic source view name derivation
SELECT config.grok_validate_matv_inputs(
'analytics',
'matc_daily_sales',
NULL
);
-- Validate materialized view with explicit source view name
SELECT config.grok_validate_matv_inputs(
'analytics',
'matc_daily_sales',
'custom_source_view'
);
```

View File

@ -1,63 +0,0 @@
# Function: grok_validate_order_by_columns
## Overview
This function validates that specified order-by columns exist in a source table and contain data that can be parsed as timestamps, ensuring they can be used for deterministic ordering.
## Schema
`config.grok_validate_order_by_columns`
## Parameters
- `source_schema` (text): Schema containing the source table
- `source_table` (text): Name of the source table
- `order_by_columns` (text[]): Array of column names to validate
## Return Value
Returns a text array containing warning messages for any issues found:
```
{
"Warning: column_name not found in schema.table",
"Warning: column_name contains unparseable timestamp data: error message"
}
```
## Description
This function validates columns intended for use in ORDER BY clauses, particularly for generating synthetic keys in materialized views. It performs two types of validation:
1. **Existence Check**: Verifies each column exists in the specified table
2. **Timestamp Parsing**: Tests if each column's data can be parsed as a timestamp
For timestamp parsing, the function attempts to convert the column data using:
```sql
TO_TIMESTAMP(SUBSTRING(NULLIF(column, ''), 1, 19), 'YYYY-MM-DD HH24:MI:SS')
```
This validation approach ensures that:
- Columns are valid for the source table
- Timestamp columns can be parsed consistently
- The ORDER BY clause will produce deterministic results
## Timestamp Parsing Details
The timestamp parsing logic:
- Uses NULLIF(column, '') to convert empty strings to NULL, so blank values are skipped instead of failing to parse
- Takes only the first 19 characters using SUBSTRING
- Uses a fixed format of 'YYYY-MM-DD HH24:MI:SS'
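This standardized parsing gives consistent ordering behavior for any value whose first 19 characters follow the 'YYYY-MM-DD HH24:MI:SS' layout; values stored in other formats are reported as unparseable rather than silently reordered.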
## Error Handling
The function collects warnings without failing, allowing for a complete validation report:
- Missing columns generate a warning
- Unparseable timestamp data generates a warning with the specific error
- If an unexpected error occurs, it returns a general error message
## Dependencies
This function is likely called by other functions that create materialized views to validate order-by columns before using them.
## Usage Example
```sql
SELECT config.grok_validate_order_by_columns(
'public',
'customers',
ARRAY['created_at', 'updated_at']
);
```