Add --max-age filter to skip old files during import
- Default: 90 days (matches retention policy)
- Use --max-age 0 to disable filtering
- Extracts date from CLM filename pattern
- Shows "Skipped (too old)" count in summary
- Prevents re-importing data that would be immediately purged

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
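For illustration, a minimal standalone sketch of the date extraction performed by the new get_file_date() helper; the regex and strptime format come from the diff below, while the sample filename is made up:

    import re
    from datetime import datetime

    # Filename pattern: {Part}_{Oper}_{Serial}_{YYYY-MM-DD}_{HH-MM-SS}.json
    filename = "PART123_OP10_SN0001_2025-01-15_08-30-00.json"  # hypothetical example
    match = re.search(r'(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})\.json$', filename)
    if match:
        timestamp = datetime.strptime(
            f"{match.group(1)} {match.group(2).replace('-', ':')}",
            '%Y-%m-%d %H:%M:%S',
        )
        print(timestamp)  # 2025-01-15 08:30:00 -> this is what gets compared against the cutoff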
@@ -22,6 +22,7 @@ import json
 import os
 import sys
 import glob
+import re
 import argparse
 import hashlib
 from datetime import datetime, timedelta
@@ -98,7 +99,7 @@ def is_file_complete(filepath: str) -> bool:
 class CLMParser:
     """Parser for CLM_Data JSON files"""
 
-    def __init__(self, db_config: dict):
+    def __init__(self, db_config: dict, max_age_days: int = None):
         self.db_config = db_config
         self.conn = None
         self.cursor = None
@@ -107,6 +108,31 @@ class CLMParser:
         self.manual_requests_batch = []
         self.header_updates_batch = []
         self.violations_batch = []
+        self.max_age_days = max_age_days  # None = no limit
+        self.cutoff_date = None
+        if max_age_days:
+            self.cutoff_date = (datetime.now() - timedelta(days=max_age_days)).date()
+
+    def get_file_date(self, filename: str) -> Optional[datetime]:
+        """Extract date from CLM filename: {Part}_{Oper}_{Serial}_{YYYY-MM-DD}_{HH-MM-SS}.json"""
+        match = re.search(r'(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})\.json$', filename)
+        if match:
+            try:
+                date_str = match.group(1)
+                time_str = match.group(2).replace('-', ':')
+                return datetime.strptime(f"{date_str} {time_str}", '%Y-%m-%d %H:%M:%S')
+            except ValueError:
+                return None
+        return None
+
+    def is_file_too_old(self, filename: str) -> bool:
+        """Check if file date is older than max_age_days"""
+        if not self.cutoff_date:
+            return False
+        file_date = self.get_file_date(filename)
+        if file_date:
+            return file_date.date() < self.cutoff_date
+        return False  # If can't parse date, don't skip
 
     def connect(self):
         """Connect to MySQL database"""
@@ -558,6 +584,10 @@ class CLMParser:
         """Parse a single CLM JSON file"""
         filename = os.path.basename(filepath)
 
+        # Check if file is too old (based on filename date)
+        if self.is_file_too_old(filename):
+            return {'success': False, 'error': f'File too old (>{self.max_age_days} days): {filename}', 'skip_silent': True, 'too_old': True}
+
         # Extract machine number from parent folder
         parent_dir = os.path.basename(os.path.dirname(os.path.dirname(filepath)))
         machine_number = parent_dir if parent_dir.isdigit() else None
@@ -791,6 +821,7 @@ class CLMParser:
             'imported': 0,
             'updated': 0,
             'skipped': 0,
+            'skipped_old': 0,
             'in_progress': 0,
             'tool_setup': 0,
             'errors': 0,
@@ -813,8 +844,10 @@ class CLMParser:
             elif result.get('in_progress'):
                 results['in_progress'] += 1
             elif result.get('skip_silent'):
-                # Silent skips: unchanged files or tool setup files
-                if 'tool setup' in result.get('error', '').lower():
+                # Silent skips: unchanged files, tool setup files, or old files
+                if result.get('too_old'):
+                    results['skipped_old'] += 1
+                elif 'tool setup' in result.get('error', '').lower():
                     results['tool_setup'] += 1
                 else:
                     results['skipped'] += 1  # Unchanged files
@@ -1081,6 +1114,7 @@ class CLMParser:
             'total_files': 0,
             'imported': 0,
             'skipped': 0,
+            'skipped_old': 0,
             'in_progress': 0,
             'tool_setup': 0,
             'errors': 0,
@@ -1111,6 +1145,7 @@ class CLMParser:
             results['total_files'] += machine_results.get('total_files', 0)
             results['imported'] += machine_results.get('imported', 0)
             results['skipped'] += machine_results.get('skipped', 0)
+            results['skipped_old'] += machine_results.get('skipped_old', 0)
             results['in_progress'] += machine_results.get('in_progress', 0)
             results['tool_setup'] += machine_results.get('tool_setup', 0)
             results['errors'] += machine_results.get('errors', 0)
@@ -1127,6 +1162,7 @@ def main():
     parser.add_argument('--file', help='Parse a specific JSON file')
     parser.add_argument('--machine', help='Parse only a specific machine folder')
    parser.add_argument('--active-only', action='store_true', help='Only update active sessions (skip imports)')
+    parser.add_argument('--max-age', type=int, default=90, help='Max file age in days (default: 90, 0=no limit)')
     parser.add_argument('--host', default=DB_CONFIG['host'], help='MySQL host')
     parser.add_argument('--port', type=int, default=DB_CONFIG['port'], help='MySQL port')
     parser.add_argument('--user', default=DB_CONFIG['user'], help='MySQL user')
@@ -1144,8 +1180,11 @@ def main():
         'database': args.database
     }
 
+    # Max age (0 = no limit)
+    max_age = args.max_age if args.max_age > 0 else None
+
     # Create parser and connect
-    clm_parser = CLMParser(db_config)
+    clm_parser = CLMParser(db_config, max_age_days=max_age)
     if not clm_parser.connect():
         sys.exit(1)
 
@@ -1189,6 +1228,7 @@ def main():
         print(f" Total files found: {results.get('total_files', 0)}")
         print(f" Files imported: {results.get('imported', 0)}")
         print(f" Files skipped: {results.get('skipped', 0)}")
+        print(f" Skipped (too old): {results.get('skipped_old', 0)}")
         print(f" Jobs in progress: {results.get('in_progress', 0)}")
         print(f" Files with errors: {results.get('errors', 0)}")
         print(f" Total measurements: {results.get('total_measurements', 0)}")
@@ -1196,6 +1236,8 @@ def main():
 
     else:
         # Parse all machines
+        if max_age:
+            print(f"Max file age: {max_age} days")
         results = clm_parser.parse_all_machines(args.dir)
         print(f"\n{'='*50}")
         print(f"CLM Data Import Summary:")
@@ -1203,6 +1245,7 @@ def main():
         print(f" Total files found: {results['total_files']}")
         print(f" Files imported: {results['imported']}")
         print(f" Files skipped: {results['skipped']}")
+        print(f" Skipped (too old): {results['skipped_old']}")
         print(f" Jobs in progress: {results['in_progress']}")
         print(f" Files with errors: {results['errors']}")
         print(f" Total measurements: {results['total_measurements']}")