Add --max-age filter to skip old files during import

- Default: 90 days (matches retention policy)
- Use --max-age 0 to disable filtering
- Extracts date from CLM filename pattern
- Shows "Skipped (too old)" count in summary
- Prevents re-importing data that would be immediately purged

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
cproudlock
2025-12-16 10:08:25 -05:00
parent 5c707c3cd4
commit 149a223ce2

View File

@@ -22,6 +22,7 @@ import json
import os import os
import sys import sys
import glob import glob
import re
import argparse import argparse
import hashlib import hashlib
from datetime import datetime, timedelta from datetime import datetime, timedelta
@@ -98,7 +99,7 @@ def is_file_complete(filepath: str) -> bool:
class CLMParser: class CLMParser:
"""Parser for CLM_Data JSON files""" """Parser for CLM_Data JSON files"""
def __init__(self, db_config: dict): def __init__(self, db_config: dict, max_age_days: int = None):
self.db_config = db_config self.db_config = db_config
self.conn = None self.conn = None
self.cursor = None self.cursor = None
@@ -107,6 +108,31 @@ class CLMParser:
self.manual_requests_batch = [] self.manual_requests_batch = []
self.header_updates_batch = [] self.header_updates_batch = []
self.violations_batch = [] self.violations_batch = []
self.max_age_days = max_age_days # None = no limit
self.cutoff_date = None
if max_age_days:
self.cutoff_date = (datetime.now() - timedelta(days=max_age_days)).date()
def get_file_date(self, filename: str) -> Optional[datetime]:
    """Return the timestamp embedded at the end of a CLM filename.

    Expected name layout:
    {Part}_{Oper}_{Serial}_{YYYY-MM-DD}_{HH-MM-SS}.json

    Args:
        filename: File name (or path) whose trailing
            ``YYYY-MM-DD_HH-MM-SS.json`` portion is inspected.

    Returns:
        The parsed datetime, or None when the name does not end with the
        expected pattern or the digits do not form a valid date/time.
    """
    stamp = re.search(r'(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})\.json$', filename)
    if stamp is None:
        return None

    day_text, clock_text = stamp.groups()
    # The filename uses dashes between time fields; strptime below expects colons.
    clock_text = clock_text.replace('-', ':')
    try:
        return datetime.strptime(f"{day_text} {clock_text}", '%Y-%m-%d %H:%M:%S')
    except ValueError:
        # Digits matched the shape but are out of range (e.g. month 13).
        return None
def is_file_too_old(self, filename: str) -> bool:
    """Report whether the date in *filename* falls before the cutoff date.

    Args:
        filename: CLM file name, passed to ``get_file_date`` for parsing.

    Returns:
        True only when a cutoff date is set and the file's date is strictly
        earlier than it. False when no cutoff is set, or when no date can
        be extracted from the name (such files are not skipped).
    """
    if self.cutoff_date:
        parsed = self.get_file_date(filename)
        if parsed:
            # Compare calendar dates only; the time of day is ignored.
            return parsed.date() < self.cutoff_date
    return False
def connect(self): def connect(self):
"""Connect to MySQL database""" """Connect to MySQL database"""
@@ -558,6 +584,10 @@ class CLMParser:
"""Parse a single CLM JSON file""" """Parse a single CLM JSON file"""
filename = os.path.basename(filepath) filename = os.path.basename(filepath)
# Check if file is too old (based on filename date)
if self.is_file_too_old(filename):
return {'success': False, 'error': f'File too old (>{self.max_age_days} days): {filename}', 'skip_silent': True, 'too_old': True}
# Extract machine number from parent folder # Extract machine number from parent folder
parent_dir = os.path.basename(os.path.dirname(os.path.dirname(filepath))) parent_dir = os.path.basename(os.path.dirname(os.path.dirname(filepath)))
machine_number = parent_dir if parent_dir.isdigit() else None machine_number = parent_dir if parent_dir.isdigit() else None
@@ -791,6 +821,7 @@ class CLMParser:
'imported': 0, 'imported': 0,
'updated': 0, 'updated': 0,
'skipped': 0, 'skipped': 0,
'skipped_old': 0,
'in_progress': 0, 'in_progress': 0,
'tool_setup': 0, 'tool_setup': 0,
'errors': 0, 'errors': 0,
@@ -813,8 +844,10 @@ class CLMParser:
elif result.get('in_progress'): elif result.get('in_progress'):
results['in_progress'] += 1 results['in_progress'] += 1
elif result.get('skip_silent'): elif result.get('skip_silent'):
# Silent skips: unchanged files or tool setup files # Silent skips: unchanged files, tool setup files, or old files
if 'tool setup' in result.get('error', '').lower(): if result.get('too_old'):
results['skipped_old'] += 1
elif 'tool setup' in result.get('error', '').lower():
results['tool_setup'] += 1 results['tool_setup'] += 1
else: else:
results['skipped'] += 1 # Unchanged files results['skipped'] += 1 # Unchanged files
@@ -1081,6 +1114,7 @@ class CLMParser:
'total_files': 0, 'total_files': 0,
'imported': 0, 'imported': 0,
'skipped': 0, 'skipped': 0,
'skipped_old': 0,
'in_progress': 0, 'in_progress': 0,
'tool_setup': 0, 'tool_setup': 0,
'errors': 0, 'errors': 0,
@@ -1111,6 +1145,7 @@ class CLMParser:
results['total_files'] += machine_results.get('total_files', 0) results['total_files'] += machine_results.get('total_files', 0)
results['imported'] += machine_results.get('imported', 0) results['imported'] += machine_results.get('imported', 0)
results['skipped'] += machine_results.get('skipped', 0) results['skipped'] += machine_results.get('skipped', 0)
results['skipped_old'] += machine_results.get('skipped_old', 0)
results['in_progress'] += machine_results.get('in_progress', 0) results['in_progress'] += machine_results.get('in_progress', 0)
results['tool_setup'] += machine_results.get('tool_setup', 0) results['tool_setup'] += machine_results.get('tool_setup', 0)
results['errors'] += machine_results.get('errors', 0) results['errors'] += machine_results.get('errors', 0)
@@ -1127,6 +1162,7 @@ def main():
parser.add_argument('--file', help='Parse a specific JSON file') parser.add_argument('--file', help='Parse a specific JSON file')
parser.add_argument('--machine', help='Parse only a specific machine folder') parser.add_argument('--machine', help='Parse only a specific machine folder')
parser.add_argument('--active-only', action='store_true', help='Only update active sessions (skip imports)') parser.add_argument('--active-only', action='store_true', help='Only update active sessions (skip imports)')
parser.add_argument('--max-age', type=int, default=90, help='Max file age in days (default: 90, 0=no limit)')
parser.add_argument('--host', default=DB_CONFIG['host'], help='MySQL host') parser.add_argument('--host', default=DB_CONFIG['host'], help='MySQL host')
parser.add_argument('--port', type=int, default=DB_CONFIG['port'], help='MySQL port') parser.add_argument('--port', type=int, default=DB_CONFIG['port'], help='MySQL port')
parser.add_argument('--user', default=DB_CONFIG['user'], help='MySQL user') parser.add_argument('--user', default=DB_CONFIG['user'], help='MySQL user')
@@ -1144,8 +1180,11 @@ def main():
'database': args.database 'database': args.database
} }
# Max age (0 = no limit)
max_age = args.max_age if args.max_age > 0 else None
# Create parser and connect # Create parser and connect
clm_parser = CLMParser(db_config) clm_parser = CLMParser(db_config, max_age_days=max_age)
if not clm_parser.connect(): if not clm_parser.connect():
sys.exit(1) sys.exit(1)
@@ -1189,6 +1228,7 @@ def main():
print(f" Total files found: {results.get('total_files', 0)}") print(f" Total files found: {results.get('total_files', 0)}")
print(f" Files imported: {results.get('imported', 0)}") print(f" Files imported: {results.get('imported', 0)}")
print(f" Files skipped: {results.get('skipped', 0)}") print(f" Files skipped: {results.get('skipped', 0)}")
print(f" Skipped (too old): {results.get('skipped_old', 0)}")
print(f" Jobs in progress: {results.get('in_progress', 0)}") print(f" Jobs in progress: {results.get('in_progress', 0)}")
print(f" Files with errors: {results.get('errors', 0)}") print(f" Files with errors: {results.get('errors', 0)}")
print(f" Total measurements: {results.get('total_measurements', 0)}") print(f" Total measurements: {results.get('total_measurements', 0)}")
@@ -1196,6 +1236,8 @@ def main():
else: else:
# Parse all machines # Parse all machines
if max_age:
print(f"Max file age: {max_age} days")
results = clm_parser.parse_all_machines(args.dir) results = clm_parser.parse_all_machines(args.dir)
print(f"\n{'='*50}") print(f"\n{'='*50}")
print(f"CLM Data Import Summary:") print(f"CLM Data Import Summary:")
@@ -1203,6 +1245,7 @@ def main():
print(f" Total files found: {results['total_files']}") print(f" Total files found: {results['total_files']}")
print(f" Files imported: {results['imported']}") print(f" Files imported: {results['imported']}")
print(f" Files skipped: {results['skipped']}") print(f" Files skipped: {results['skipped']}")
print(f" Skipped (too old): {results['skipped_old']}")
print(f" Jobs in progress: {results['in_progress']}") print(f" Jobs in progress: {results['in_progress']}")
print(f" Files with errors: {results['errors']}") print(f" Files with errors: {results['errors']}")
print(f" Total measurements: {results['total_measurements']}") print(f" Total measurements: {results['total_measurements']}")