"""
|
|
Final Production Verification - Code Evidence Based
|
|
===================================================
|
|
|
|
Comprehensive verification system that uses code evidence to verify:
|
|
1. All file paths are relative and self-contained
|
|
2. All dependencies are within the project
|
|
3. All required files exist
|
|
4. Data integrity at granular level
|
|
5. Schema accuracy
|
|
6. Production readiness
|
|
|
|
This script provides 100% confidence verification before production deployment.
|
|
"""
|
|
|
|
import ast
import io
import json
import os
import re
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Set

import pandas as pd

# Fix Windows console encoding so the emoji/check-mark output below does not
# raise UnicodeEncodeError on the default cp1252 console.
if sys.platform == 'win32':
    # reconfigure() (Python 3.7+) is safer than re-wrapping sys.stdout.buffer
    # in a new io.TextIOWrapper: it preserves line buffering and still works
    # when stdout has already been replaced/wrapped by the host environment.
    sys.stdout.reconfigure(encoding='utf-8')

# Project root: this script lives one directory below the root, so the root is
# two levels up from this file.
BASE_DIR = Path(__file__).resolve().parent.parent
class ProductionVerifier:
    """Comprehensive production verification with code evidence.

    Each ``check_*`` method prints a human-readable section to stdout and
    records its findings in one of three buckets:

    * ``issues``   -- blocking problems (project NOT production ready)
    * ``warnings`` -- non-blocking findings worth reviewing
    * ``verified`` -- checks that passed, with supporting evidence

    ``run_all_verifications`` drives every check, writes a JSON report into
    the project root, and returns True only when no issues were logged.
    """

    # Module-name prefixes accepted as stdlib / known third-party packages.
    # Shared by the `import X` and `from X import Y` branches of
    # check_imports_and_dependencies so both classify modules identically
    # (previously the two branches used different hardcoded lists).
    _KNOWN_MODULE_PREFIXES = (
        'pandas', 'numpy', 'pathlib', 'typing', 'json', 'sys', 'os',
        'subprocess', 'threading', 'concurrent', 'anthropic', 'openpyxl',
        'dotenv', 'datetime', 'time', 'uuid', 'random', 're', 'io', 'ast',
        'collections', 'itertools', 'functools',
    )

    def __init__(self):
        self.issues = []         # blocking problems
        self.warnings = []       # non-blocking findings
        self.verified = []       # passed checks
        self.code_evidence = []  # reserved for raw code snippets backing findings

    def log_issue(self, category: str, issue: str, evidence: str = ""):
        """Record a critical (blocking) issue."""
        self.issues.append({
            'category': category,
            'issue': issue,
            'evidence': evidence
        })

    def log_warning(self, category: str, warning: str, evidence: str = ""):
        """Record a non-blocking warning."""
        self.warnings.append({
            'category': category,
            'warning': warning,
            'evidence': evidence
        })

    def log_verified(self, category: str, message: str, evidence: str = ""):
        """Record a successful verification."""
        self.verified.append({
            'category': category,
            'message': message,
            'evidence': evidence
        })

    def _module_is_known(self, module: str) -> bool:
        """Return True when `module` is internal, stdlib, or a known package."""
        # Project-internal modules (and explicit relative dotted names).
        if module.startswith('services') or module.startswith('scripts') or module == 'config' or module.startswith('.'):
            return True
        # str.startswith accepts a tuple of prefixes.
        if module.startswith(self._KNOWN_MODULE_PREFIXES):
            return True
        # Last resort: try importing it; resolvable modules are acceptable.
        try:
            __import__(module)
            return True
        except ImportError:
            return False
        except Exception:
            # Import crashed for some other reason -- assume it resolves.
            return True

    def check_file_paths_in_code(self) -> Dict:
        """Verify all file paths referenced in project code are relative.

        Scans a fixed list of project scripts for hardcoded absolute paths
        (Windows drive letters, known external directory names) and logs a
        blocking issue for each hit.

        Returns:
            dict with 'external_paths', 'relative_paths' and a 'status' of
            'PASS'/'FAIL'.
        """
        print("=" * 80)
        print("VERIFICATION 1: FILE PATH ANALYSIS (Code Evidence)")
        print("=" * 80)
        print()

        # Scripts expected to exist and to use only relative paths.
        python_files = [
            BASE_DIR / "run_complete_pipeline.py",
            BASE_DIR / "main.py",
            BASE_DIR / "config.py",
            BASE_DIR / "scripts" / "prepare_data.py",
            BASE_DIR / "scripts" / "comprehensive_post_processor.py",
            BASE_DIR / "services" / "data_loader.py",
            BASE_DIR / "services" / "simulator.py",
            BASE_DIR / "services" / "cognition_simulator.py",
        ]

        external_paths_found = []
        relative_paths_found = []

        for py_file in python_files:
            if not py_file.exists():
                self.log_issue("File Paths", f"Python file not found: {py_file.name}", str(py_file))
                continue

            try:
                with open(py_file, 'r', encoding='utf-8') as f:
                    content = f.read()
                    lines = content.split('\n')

                # Heuristic patterns for hardcoded absolute paths.
                path_patterns = [
                    r'[C-Z]:\\[^"\']+[^\\n]',            # Windows absolute paths (exclude \n)
                    r'/c:/[^"\']+[^\\n]',                # Windows path in Unix format (exclude \n)
                    r'Path\(r?["\']C:\\[^"\']+["\']\)',  # Path() with Windows absolute
                    r'Path\(r?["\']/[^"\']+["\']\)',     # Path() with Unix absolute (if external)
                ]

                for line_num, line in enumerate(lines, 1):
                    # Comments cannot affect runtime path handling.
                    if line.strip().startswith('#'):
                        continue

                    # A quoted string containing "\n" is almost certainly a
                    # text literal, not a filesystem path.
                    if '\\n' in line and ('"' in line or "'" in line):
                        continue

                    for pattern in path_patterns:
                        matches = re.finditer(pattern, line, re.IGNORECASE)
                        for match in matches:
                            path_str = match.group(0)
                            # Known external directory names are always flagged.
                            if 'FW_Pseudo_Data_Documents' in path_str or 'CP_AUTOMATION' in path_str:
                                external_paths_found.append({
                                    'file': py_file.name,
                                    'line': line_num,
                                    'path': path_str,
                                    'code': line.strip()[:100]
                                })
                            # Any other Windows drive-letter path is flagged,
                            # unless the line clearly builds it relative to
                            # the project (BASE_DIR, __file__, data dirs...).
                            elif re.match(r'^[C-Z]:\\', path_str, re.IGNORECASE):
                                if BASE_DIR.name not in path_str and 'BASE_DIR' not in line:
                                    if not any(rel_indicator in line for rel_indicator in ['BASE_DIR', 'Path(__file__)', '.parent', 'data/', 'output/', 'support/']):
                                        external_paths_found.append({
                                            'file': py_file.name,
                                            'line': line_num,
                                            'path': path_str,
                                            'code': line.strip()[:100]
                                        })

                # Positive signal: the file anchors its paths to the project root.
                if 'BASE_DIR' in content or 'Path(__file__)' in content:
                    relative_paths_found.append(py_file.name)

            except Exception as e:
                self.log_issue("File Paths", f"Error reading {py_file.name}: {e}", str(e))

        if external_paths_found:
            print(f"❌ Found {len(external_paths_found)} external/hardcoded paths:")
            for ext_path in external_paths_found:
                print(f" File: {ext_path['file']}, Line {ext_path['line']}")
                print(f" Path: {ext_path['path']}")
                print(f" Code: {ext_path['code']}")
                print()
                self.log_issue("File Paths",
                               f"External path in {ext_path['file']}:{ext_path['line']}",
                               ext_path['code'])
        else:
            print("✅ No external hardcoded paths found")
            self.log_verified("File Paths", "All paths are relative or use BASE_DIR", f"{len(relative_paths_found)} files use relative paths")

        print()
        return {
            'external_paths': external_paths_found,
            'relative_paths': relative_paths_found,
            'status': 'PASS' if not external_paths_found else 'FAIL'
        }

    def check_required_files(self) -> Dict:
        """Verify every required project file exists under BASE_DIR.

        Returns:
            dict with 'missing', 'existing' lists (project-relative paths)
            and a 'status' of 'PASS'/'FAIL'.
        """
        print("=" * 80)
        print("VERIFICATION 2: REQUIRED FILES CHECK")
        print("=" * 80)
        print()

        # Manifest of files the pipeline cannot run without, by category.
        required_files = {
            'Core Scripts': [
                'run_complete_pipeline.py',
                'main.py',
                'config.py',
            ],
            'Data Files': [
                'data/AllQuestions.xlsx',
                'data/merged_personas.xlsx',
            ],
            'Support Files': [
                'support/3000-students.xlsx',
                'support/3000_students_output.xlsx',
                'support/fixed_3k_personas.xlsx',
            ],
            'Scripts': [
                'scripts/prepare_data.py',
                'scripts/comprehensive_post_processor.py',
            ],
            'Services': [
                'services/data_loader.py',
                'services/simulator.py',
                'services/cognition_simulator.py',
            ],
        }

        missing_files = []
        existing_files = []

        for category, files in required_files.items():
            print(f"📂 {category}:")
            for file_path in files:
                full_path = BASE_DIR / file_path
                if full_path.exists():
                    print(f" ✅ {file_path}")
                    existing_files.append(file_path)
                else:
                    print(f" ❌ {file_path} (MISSING)")
                    missing_files.append(file_path)
                    self.log_issue("Required Files", f"Missing: {file_path}", str(full_path))
            print()

        if missing_files:
            print(f"❌ {len(missing_files)} required files missing")
        else:
            print(f"✅ All {len(existing_files)} required files present")
            self.log_verified("Required Files", f"All {len(existing_files)} files present", "")

        return {
            'missing': missing_files,
            'existing': existing_files,
            'status': 'PASS' if not missing_files else 'FAIL'
        }

    def check_data_integrity(self) -> Dict:
        """Verify the granular integrity of the core data workbooks.

        Checks merged_personas.xlsx (row count, StudentCPID uniqueness,
        leftover merge columns) and AllQuestions.xlsx (question-code
        uniqueness).

        Returns:
            dict keyed by 'personas'/'questions' with per-file stats; a key
            is absent when its workbook is missing or unreadable.
        """
        print("=" * 80)
        print("VERIFICATION 3: DATA INTEGRITY CHECK (Granular Level)")
        print("=" * 80)
        print()

        results = {}

        # --- merged_personas.xlsx: exactly 3000 unique student personas ---
        personas_file = BASE_DIR / "data" / "merged_personas.xlsx"
        if personas_file.exists():
            try:
                df = pd.read_excel(personas_file, engine='openpyxl')

                # The pipeline is built around exactly 3000 personas.
                if len(df) != 3000:
                    self.log_issue("Data Integrity", f"merged_personas.xlsx: Expected 3000 rows, got {len(df)}", f"Row count: {len(df)}")
                else:
                    self.log_verified("Data Integrity", "merged_personas.xlsx: 3000 rows", f"Rows: {len(df)}")

                # StudentCPID is the persona primary key.
                if 'StudentCPID' in df.columns:
                    unique_cpids = df['StudentCPID'].nunique()
                    if unique_cpids != len(df):
                        self.log_issue("Data Integrity", f"Duplicate StudentCPIDs: {unique_cpids}/{len(df)}", "")
                    else:
                        self.log_verified("Data Integrity", "All StudentCPIDs unique", f"{unique_cpids} unique")

                # Merge-artifact columns containing "_DB" should have been dropped.
                db_cols = [c for c in df.columns if '_DB' in str(c)]
                if db_cols:
                    self.log_warning("Data Integrity", f"DB columns still present: {db_cols}", "")
                else:
                    self.log_verified("Data Integrity", "No redundant DB columns", "")

                results['personas'] = {
                    'rows': len(df),
                    'columns': len(df.columns),
                    'unique_cpids': df['StudentCPID'].nunique() if 'StudentCPID' in df.columns else 0,
                    'db_columns': len(db_cols)
                }

                print(f"✅ merged_personas.xlsx: {len(df)} rows, {len(df.columns)} columns")

            except Exception as e:
                self.log_issue("Data Integrity", f"Error reading merged_personas.xlsx: {e}", str(e))

        # --- AllQuestions.xlsx: question codes must be unique ---
        questions_file = BASE_DIR / "data" / "AllQuestions.xlsx"
        if questions_file.exists():
            try:
                df = pd.read_excel(questions_file, engine='openpyxl')

                if 'code' in df.columns:
                    unique_codes = df['code'].nunique()
                    if unique_codes != len(df):
                        self.log_issue("Data Integrity", f"Duplicate question codes: {unique_codes}/{len(df)}", "")
                    else:
                        self.log_verified("Data Integrity", f"All question codes unique: {unique_codes}", "")

                results['questions'] = {
                    'total': len(df),
                    'unique_codes': df['code'].nunique() if 'code' in df.columns else 0
                }

                print(f"✅ AllQuestions.xlsx: {len(df)} questions")

            except Exception as e:
                self.log_issue("Data Integrity", f"Error reading AllQuestions.xlsx: {e}", str(e))

        print()
        return results

    def check_output_files(self) -> Dict:
        """Verify the expected simulation output files exist.

        Missing outputs are only warnings, since the simulation may simply
        not have been run yet.

        Returns:
            dict with 'missing', 'existing' lists and a 'status' of
            'PASS'/'WARN'.
        """
        print("=" * 80)
        print("VERIFICATION 4: OUTPUT FILES STRUCTURE")
        print("=" * 80)
        print()

        output_dir = BASE_DIR / "output" / "full_run"

        # Five psychometric domains per age band.
        # NOTE(review): the 'adolescense' spelling matches the on-disk
        # directory name used by the pipeline -- do not "fix" it here alone.
        expected_files = {
            'adolescense/5_domain': [
                'Personality_14-17.xlsx',
                'Grit_14-17.xlsx',
                'Emotional_Intelligence_14-17.xlsx',
                'Vocational_Interest_14-17.xlsx',
                'Learning_Strategies_14-17.xlsx'
            ],
            'adults/5_domain': [
                'Personality_18-23.xlsx',
                'Grit_18-23.xlsx',
                'Emotional_Intelligence_18-23.xlsx',
                'Vocational_Interest_18-23.xlsx',
                'Learning_Strategies_18-23.xlsx'
            ]
        }

        missing_files = []
        existing_files = []

        for age_dir, files in expected_files.items():
            print(f"📂 {age_dir}:")
            for file_name in files:
                file_path = output_dir / age_dir / file_name
                if file_path.exists():
                    print(f" ✅ {file_name}")
                    existing_files.append(f"{age_dir}/{file_name}")
                else:
                    print(f" ⚠️ {file_name} (not found - may not be generated yet)")
                    missing_files.append(f"{age_dir}/{file_name}")
            print()

        if missing_files:
            print(f"⚠️ {len(missing_files)} output files not found (may be expected if simulation not run)")
            self.log_warning("Output Files", f"{len(missing_files)} files not found", "Simulation may not be complete")
        else:
            print(f"✅ All {len(existing_files)} expected domain files present")
            self.log_verified("Output Files", f"All {len(existing_files)} domain files present", "")

        return {
            'missing': missing_files,
            'existing': existing_files,
            'status': 'PASS' if not missing_files else 'WARN'
        }

    def check_imports_and_dependencies(self) -> Dict:
        """Verify imports in the entry-point scripts resolve internally.

        Parses each script's AST and classifies every import as internal
        (project module, stdlib, or known package) or potentially external.

        Fixes over the previous version:
        - the bare ``except:`` is now ``except Exception:``;
        - ``from . import x`` (node.module is None) was unreachable dead
          code and is now correctly classified as internal;
        - ``from X import Y`` now uses the same prefix list and
          ``__import__`` fallback as ``import X``, so stdlib from-imports
          (e.g. ``from collections import Counter``) are no longer
          false-flagged as external.

        Returns:
            dict with 'external', 'internal' lists of (file, module)
            tuples and a 'status' of 'PASS'/'WARN'.
        """
        print("=" * 80)
        print("VERIFICATION 5: IMPORTS AND DEPENDENCIES")
        print("=" * 80)
        print()

        python_files = [
            BASE_DIR / "run_complete_pipeline.py",
            BASE_DIR / "main.py",
            BASE_DIR / "config.py",
        ]

        external_imports = []
        internal_imports = []

        for py_file in python_files:
            if not py_file.exists():
                continue

            try:
                with open(py_file, 'r', encoding='utf-8') as f:
                    content = f.read()

                tree = ast.parse(content)
                for node in ast.walk(tree):
                    if isinstance(node, ast.Import):
                        for alias in node.names:
                            module = alias.name
                            if self._module_is_known(module):
                                internal_imports.append((py_file.name, module))
                            else:
                                external_imports.append((py_file.name, module))

                    elif isinstance(node, ast.ImportFrom):
                        module = node.module
                        if module is None:
                            # `from . import x` parses with module=None:
                            # relative, therefore internal.
                            internal_imports.append((py_file.name, 'relative'))
                        elif self._module_is_known(module):
                            internal_imports.append((py_file.name, module))
                        else:
                            external_imports.append((py_file.name, module))

            except Exception as e:
                self.log_warning("Imports", f"Error parsing {py_file.name}: {e}", str(e))

        if external_imports:
            print(f"⚠️ Found {len(external_imports)} potentially external imports:")
            for file, module in external_imports:
                print(f" {file}: {module}")
            print()
        else:
            print("✅ All imports are standard library or internal modules")
            self.log_verified("Imports", "All imports valid", f"{len(internal_imports)} internal imports")

        print()
        return {
            'external': external_imports,
            'internal': internal_imports,
            'status': 'PASS' if not external_imports else 'WARN'
        }

    def generate_report(self) -> Dict:
        """Assemble the full report dict and persist it as JSON.

        Writes production_verification_report.json into the project root.

        Returns:
            The report dict (timestamp, summary counts, and all findings).
        """
        report = {
            'timestamp': datetime.now().isoformat(),
            'project_dir': str(BASE_DIR),
            'summary': {
                'total_issues': len(self.issues),
                'total_warnings': len(self.warnings),
                'total_verified': len(self.verified),
                'status': 'PASS' if len(self.issues) == 0 else 'FAIL'
            },
            'issues': self.issues,
            'warnings': self.warnings,
            'verified': self.verified
        }

        # ensure_ascii=False keeps non-ASCII evidence human-readable on disk.
        report_path = BASE_DIR / "production_verification_report.json"
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)

        return report

    def run_all_verifications(self):
        """Run every verification check and print a final summary.

        Returns:
            bool: True when no blocking issues were found (production
            ready), False otherwise.
        """
        print("=" * 80)
        print("PRODUCTION VERIFICATION - CODE EVIDENCE BASED")
        print("=" * 80)
        print()
        print(f"Project Directory: {BASE_DIR}")
        print()

        # Run every check; the order matters for the printed sections.
        self.check_file_paths_in_code()
        self.check_required_files()
        self.check_data_integrity()
        self.check_output_files()
        self.check_imports_and_dependencies()

        # Persist the machine-readable report alongside the console output.
        self.generate_report()

        print("=" * 80)
        print("VERIFICATION SUMMARY")
        print("=" * 80)
        print()
        print(f"✅ Verified: {len(self.verified)}")
        print(f"⚠️ Warnings: {len(self.warnings)}")
        print(f"❌ Issues: {len(self.issues)}")
        print()

        if self.issues:
            print("CRITICAL ISSUES FOUND:")
            for issue in self.issues:
                print(f" [{issue['category']}] {issue['issue']}")
                if issue['evidence']:
                    print(f" Evidence: {issue['evidence'][:100]}")
            print()

        if self.warnings:
            print("WARNINGS:")
            for warning in self.warnings:
                print(f" [{warning['category']}] {warning['warning']}")
            print()

        print("📄 Detailed report saved: production_verification_report.json")
        print()

        if len(self.issues) == 0:
            print("=" * 80)
            print("✅ PRODUCTION READY - ALL CHECKS PASSED")
            print("=" * 80)
            return True
        else:
            print("=" * 80)
            print("❌ NOT PRODUCTION READY - ISSUES FOUND")
            print("=" * 80)
            return False
def main():
    """Run every verification and exit with a CI-friendly status code.

    Exits 0 when the project is production ready, 1 when blocking issues
    were found, so pipelines can gate on this script.
    """
    passed = ProductionVerifier().run_all_verifications()
    sys.exit(0 if passed else 1)


if __name__ == "__main__":
    main()
|