From 149a223ce21a0405f9319e7849809e1adfc83288 Mon Sep 17 00:00:00 2001
From: cproudlock
Date: Tue, 16 Dec 2025 10:08:25 -0500
Subject: [PATCH] Add --max-age filter to skip old files during import
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Default: 90 days (matches retention policy)
- Use --max-age 0 to disable filtering
- Extracts date from CLM filename pattern
- Shows "Skipped (too old)" count in summary
- Prevents re-importing data that would be immediately purged

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
Review notes (this section is ignored by `git am`):
 * Line breaks and indentation of this patch were reconstructed from a
   whitespace-collapsed copy; confirm that the context lines match
   parser/clmparser.py before applying.
 * CLMParser.__init__ now treats None, 0 and negative max_age_days as
   "no limit". Previously a negative value produced a cutoff date in the
   future, so every dated file was skipped as too old.
 * Every changed "+" line replaces exactly one line, so the @@ hunk headers
   and the diffstat below are unchanged; the "index" blob hash predates
   these edits.

 parser/clmparser.py | 51 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/parser/clmparser.py b/parser/clmparser.py
index b9f10e9..b1b8ca5 100644
--- a/parser/clmparser.py
+++ b/parser/clmparser.py
@@ -22,6 +22,7 @@ import json
 import os
 import sys
 import glob
+import re
 import argparse
 import hashlib
 from datetime import datetime, timedelta
@@ -98,7 +99,7 @@ def is_file_complete(filepath: str) -> bool:
 class CLMParser:
     """Parser for CLM_Data JSON files"""
 
-    def __init__(self, db_config: dict):
+    def __init__(self, db_config: dict, max_age_days: Optional[int] = None):
         self.db_config = db_config
         self.conn = None
         self.cursor = None
@@ -107,6 +108,31 @@ class CLMParser:
         self.manual_requests_batch = []
         self.header_updates_batch = []
         self.violations_batch = []
+        self.max_age_days = max_age_days if max_age_days and max_age_days > 0 else None  # None = no limit
+        self.cutoff_date = None  # Oldest file date still imported; None disables the age filter
+        if self.max_age_days is not None:
+            self.cutoff_date = (datetime.now() - timedelta(days=self.max_age_days)).date()
+
+    def get_file_date(self, filename: str) -> Optional[datetime]:
+        """Extract datetime from CLM filename {Part}_{Oper}_{Serial}_{YYYY-MM-DD}_{HH-MM-SS}.json; None if not parseable"""
+        match = re.search(r'(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})\.json$', filename)
+        if match:
+            try:
+                date_str = match.group(1)
+                time_str = match.group(2).replace('-', ':')
+                return datetime.strptime(f"{date_str} {time_str}", '%Y-%m-%d %H:%M:%S')
+            except ValueError:  # Digits matched the pattern but are not a real date/time
+                return None
+        return None
+
+    def is_file_too_old(self, filename: str) -> bool:
+        """Return True if the filename's date is before the cutoff date; files without a parseable date are never skipped"""
+        if self.cutoff_date is None:
+            return False
+        file_date = self.get_file_date(filename)
+        if file_date is not None:
+            return file_date.date() < self.cutoff_date
+        return False  # If can't parse date, don't skip
 
     def connect(self):
         """Connect to MySQL database"""
@@ -558,6 +584,10 @@ class CLMParser:
         """Parse a single CLM JSON file"""
         filename = os.path.basename(filepath)
 
+        # Check if file is too old (based on filename date)
+        if self.is_file_too_old(filename):
+            return {'success': False, 'error': f'File too old (>{self.max_age_days} days): {filename}', 'skip_silent': True, 'too_old': True}
+
         # Extract machine number from parent folder
         parent_dir = os.path.basename(os.path.dirname(os.path.dirname(filepath)))
         machine_number = parent_dir if parent_dir.isdigit() else None
@@ -791,6 +821,7 @@ class CLMParser:
             'imported': 0,
             'updated': 0,
             'skipped': 0,
+            'skipped_old': 0,
             'in_progress': 0,
             'tool_setup': 0,
             'errors': 0,
@@ -813,8 +844,10 @@ class CLMParser:
             elif result.get('in_progress'):
                 results['in_progress'] += 1
             elif result.get('skip_silent'):
-                # Silent skips: unchanged files or tool setup files
-                if 'tool setup' in result.get('error', '').lower():
+                # Silent skips: unchanged files, tool setup files, or old files
+                if result.get('too_old'):
+                    results['skipped_old'] += 1
+                elif 'tool setup' in result.get('error', '').lower():
                     results['tool_setup'] += 1
                 else:
                     results['skipped'] += 1 # Unchanged files
@@ -1081,6 +1114,7 @@ class CLMParser:
             'total_files': 0,
             'imported': 0,
             'skipped': 0,
+            'skipped_old': 0,
             'in_progress': 0,
             'tool_setup': 0,
             'errors': 0,
@@ -1111,6 +1145,7 @@ class CLMParser:
             results['total_files'] += machine_results.get('total_files', 0)
             results['imported'] += machine_results.get('imported', 0)
             results['skipped'] += machine_results.get('skipped', 0)
+            results['skipped_old'] += machine_results.get('skipped_old', 0)
             results['in_progress'] += machine_results.get('in_progress', 0)
             results['tool_setup'] += machine_results.get('tool_setup', 0)
             results['errors'] += machine_results.get('errors', 0)
@@ -1127,6 +1162,7 @@ def main():
     parser.add_argument('--file', help='Parse a specific JSON file')
     parser.add_argument('--machine', help='Parse only a specific machine folder')
     parser.add_argument('--active-only', action='store_true', help='Only update active sessions (skip imports)')
+    parser.add_argument('--max-age', type=int, default=90, help='Max file age in days (default: 90, 0=no limit)')
     parser.add_argument('--host', default=DB_CONFIG['host'], help='MySQL host')
     parser.add_argument('--port', type=int, default=DB_CONFIG['port'], help='MySQL port')
     parser.add_argument('--user', default=DB_CONFIG['user'], help='MySQL user')
@@ -1144,8 +1180,11 @@ def main():
         'database': args.database
     }
 
+    # Max age (0 = no limit)
+    max_age = args.max_age if args.max_age > 0 else None
+
     # Create parser and connect
-    clm_parser = CLMParser(db_config)
+    clm_parser = CLMParser(db_config, max_age_days=max_age)
 
     if not clm_parser.connect():
         sys.exit(1)
@@ -1189,6 +1228,7 @@ def main():
         print(f" Total files found: {results.get('total_files', 0)}")
         print(f" Files imported: {results.get('imported', 0)}")
         print(f" Files skipped: {results.get('skipped', 0)}")
+        print(f" Skipped (too old): {results.get('skipped_old', 0)}")
         print(f" Jobs in progress: {results.get('in_progress', 0)}")
         print(f" Files with errors: {results.get('errors', 0)}")
         print(f" Total measurements: {results.get('total_measurements', 0)}")
@@ -1196,6 +1236,8 @@ def main():
 
     else:
         # Parse all machines
+        if max_age:
+            print(f"Max file age: {max_age} days")
         results = clm_parser.parse_all_machines(args.dir)
         print(f"\n{'='*50}")
         print(f"CLM Data Import Summary:")
@@ -1203,6 +1245,7 @@ def main():
         print(f" Total files found: {results['total_files']}")
         print(f" Files imported: {results['imported']}")
         print(f" Files skipped: {results['skipped']}")
+        print(f" Skipped (too old): {results['skipped_old']}")
         print(f" Jobs in progress: {results['in_progress']}")
         print(f" Files with errors: {results['errors']}")
         print(f" Total measurements: {results['total_measurements']}")