diff --git a/playbook/shopfloor-setup/Standard/Restore-UDCData.ps1 b/playbook/shopfloor-setup/Standard/Restore-UDCData.ps1 index 4ce4a5c..ace258e 100644 --- a/playbook/shopfloor-setup/Standard/Restore-UDCData.ps1 +++ b/playbook/shopfloor-setup/Standard/Restore-UDCData.ps1 @@ -6,7 +6,7 @@ # whether there's actually any work to do. # # CONTRACT: -# - 99% of cycles: no backup waiting -> exit 0 in ~1 second, no side effects +# - 99% of cycles: no backup waiting -> exit 0 in ~1 second, ~5 lines of log # - 1 cycle (the one after Backup-UDCData lands a backup for this PC's bay): # stop UDC, copy CurrentData.json + ArchivedData/ to C:\ProgramData\UDC, # move consumed backup to \migrated\\, write @@ -25,6 +25,12 @@ # the backup existed). Update-MachineNumber.ps1's branch still handles the # secondary case (tech used 9999 placeholder + sets number at bay) - both # triggers safely no-op if the other already consumed the backup. +# +# LOGGING: +# Single rotating log at C:\Logs\UDC\Restore-UDCData.log (1 MB cap, rotated +# to .old.log on overflow). Every cycle writes a header line so even the +# silent no-op path leaves a trace. Errors include full exception type, +# position, and stack trace. [CmdletBinding()] param( @@ -32,160 +38,274 @@ param( [string]$UdcDataDir = 'C:\ProgramData\UDC', [string]$UdcExePath = 'C:\Program Files\UDC\UDC.exe', [string]$UdcSettingsPath = 'C:\ProgramData\UDC\udc_settings.json', - [string]$Site = 'West Jefferson' + [string]$Site = 'West Jefferson', + # Share can take 20-60s to become reachable from SYSTEM context after a + # cold boot or fresh logon. Retry until then before deciding "no backup". + [int]$ShareTimeoutSec = 60, + [int]$SharePollSec = 3 ) -$ErrorActionPreference = 'Stop' +$ErrorActionPreference = 'Continue' + +# -- Logging setup -------------------------------------------------------- $logDir = 'C:\Logs\UDC' -if (-not (Test-Path $logDir)) { - try { New-Item -Path $logDir -ItemType Directory -Force | Out-Null } catch { $logDir = $env:TEMP } -} -$logFile = Join-Path $logDir 'Restore-UDCData.log' +try { + if (-not (Test-Path $logDir)) { New-Item -Path $logDir -ItemType Directory -Force | Out-Null } +} catch { $logDir = $env:TEMP } +$logFile = Join-Path $logDir 'Restore-UDCData.log' +$logFileMaxBytes = 1MB + +# Rotate log file if oversized (keeps one prior generation) +try { + if ((Test-Path $logFile) -and ((Get-Item $logFile).Length -gt $logFileMaxBytes)) { + $rotated = Join-Path $logDir 'Restore-UDCData.old.log' + if (Test-Path $rotated) { Remove-Item $rotated -Force -ErrorAction SilentlyContinue } + Rename-Item -Path $logFile -NewName 'Restore-UDCData.old.log' -Force -ErrorAction SilentlyContinue + } +} catch {} function Log { param([string]$Msg, [string]$Level = 'INFO') - $ts = Get-Date -Format 'yyyy-MM-dd HH:mm:ss' + $ts = Get-Date -Format 'yyyy-MM-dd HH:mm:ss.fff' $line = "[$ts][$Level] $Msg" - Add-Content -LiteralPath $logFile -Value $line + try { Add-Content -LiteralPath $logFile -Value $line -ErrorAction SilentlyContinue } catch {} Write-Host $line } -# --- Resolve local machine number --- +function LogErr { + param($Err) + if (-not $Err) { return } + $exType = if ($Err.Exception) { $Err.Exception.GetType().FullName } else { '' } + $exMsg = if ($Err.Exception) { $Err.Exception.Message } else { "$Err" } + Log " exception: $exType - $exMsg" 'ERROR' + if ($Err.InvocationInfo -and $Err.InvocationInfo.PositionMessage) { + $pos = ($Err.InvocationInfo.PositionMessage -replace "`r?`n", ' | ') + Log " at: $pos" 'ERROR' + } + if ($Err.ScriptStackTrace) { + $st = ($Err.ScriptStackTrace -replace "`r?`n", ' | ') + Log " stack: $st" 'ERROR' + } + if ($Err.Exception -and $Err.Exception.InnerException) { + Log " inner: $($Err.Exception.InnerException.Message)" 'ERROR' + } +} + +Log '===============================================' +Log "Restore-UDCData starting (PID $PID)" +Log "Hostname: $env:COMPUTERNAME" +try { + $whoami = [System.Security.Principal.WindowsIdentity]::GetCurrent().Name +} catch { $whoami = '' } +Log "User identity: $whoami" +Log "PowerShell version: $($PSVersionTable.PSVersion)" +Log "BackupShareRoot: $BackupShareRoot" +Log "UdcDataDir: $UdcDataDir" +Log "UdcSettingsPath: $UdcSettingsPath" +Log "ShareTimeoutSec: $ShareTimeoutSec SharePollSec: $SharePollSec" + +# -- Resolve local machine number ---------------------------------------- if (-not (Test-Path -LiteralPath $UdcSettingsPath)) { - # No UDC installed yet. Manifest engine will catch up later if/when it lands. + Log "udc_settings.json not present - UDC not installed yet, no work to do." + Log 'Exit 0.' exit 0 } try { - $json = Get-Content -LiteralPath $UdcSettingsPath -Raw | ConvertFrom-Json + $json = Get-Content -LiteralPath $UdcSettingsPath -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop $mn = $json.GeneralSettings.MachineNumber + Log "Resolved MachineNumber from udc_settings: $mn" } catch { - Log "Failed to parse $UdcSettingsPath : $_" 'ERROR' + Log "Failed to parse $UdcSettingsPath" 'ERROR' + LogErr $_ + Log 'Exit 0.' exit 0 } if (-not $mn -or $mn -eq '9999' -or $mn -notmatch '^\d+$') { - # Placeholder or invalid - the placeholder->real trigger in - # Update-MachineNumber.ps1 will catch it when the tech sets a real number. + Log "Machine number is placeholder/empty/non-numeric ('$mn'). Update-MachineNumber.ps1's branch will catch the placeholder->real transition. No work to do." + Log 'Exit 0.' exit 0 } -# --- Probe for a waiting backup --- +# -- Wait for the share to be reachable --------------------------------- +# When this script runs early in a logon (e.g. via GE-Enforce on autologon), +# the SFLD share via the SMB redirector can take 20-60 seconds to become +# reachable, especially in SYSTEM context where the credential is the +# computer account. Poll until reachable or timeout before deciding "no backup". +Log "Polling share root for reachability: $BackupShareRoot" +$shareReachable = $false +$sw = [Diagnostics.Stopwatch]::StartNew() +while ($sw.Elapsed.TotalSeconds -lt $ShareTimeoutSec) { + if (Test-Path -LiteralPath $BackupShareRoot) { + $shareReachable = $true + break + } + Start-Sleep -Seconds $SharePollSec +} +$sw.Stop() +if ($shareReachable) { + Log ("Share reachable after {0:N1} s" -f $sw.Elapsed.TotalSeconds) +} else { + Log "Share NOT reachable after $ShareTimeoutSec s. Cannot probe for backup. Exiting non-zero so the dispatcher logs a failure." 'ERROR' + Log 'Exit 1.' + exit 1 +} + +# -- Probe for a waiting backup ------------------------------------------ $bayDir = Join-Path $BackupShareRoot $mn $srcCur = Join-Path $bayDir 'CurrentData.json' $srcArc = Join-Path $bayDir 'ArchivedData' +Log "Probing backup paths for bay $mn" +Log " bayDir: $bayDir" +$bayDirExists = Test-Path -LiteralPath $bayDir +Log " bayDir exists: $bayDirExists" +$srcCurExists = Test-Path -LiteralPath $srcCur +Log " CurrentData.json src: $(if ($srcCurExists) { 'present' } else { 'absent' }) - $srcCur" +$srcArcExists = Test-Path -LiteralPath $srcArc +Log " ArchivedData/ src: $(if ($srcArcExists) { 'present' } else { 'absent' }) - $srcArc" -if (-not (Test-Path -LiteralPath $srcCur)) { - # Most-common path: no backup waiting. Exit silently to keep enforce-cycle - # logs clean. (Manifest engine still records that the entry ran.) +if (-not $srcCurExists) { + Log "No backup waiting for bay $mn - no work to do this cycle." + Log 'Exit 0.' exit 0 } -# We have a backup. From here on, log everything. -Log "===============================================" -Log "UDC data backup detected at $bayDir - restoring." -Log "Hostname: $env:COMPUTERNAME" -Log "Machine number: $mn" +# -- We have a backup. Restore. ------------------------------------------ +Log "Backup waiting at $bayDir - proceeding with restore" -# --- Stop UDC.exe so CurrentData.json isn't locked --- -Get-Process UDC -ErrorAction SilentlyContinue | ForEach-Object { +# Stop UDC.exe so CurrentData.json isn't locked +$udcProcs = @(Get-Process UDC -ErrorAction SilentlyContinue) +Log "UDC processes currently running: $($udcProcs.Count)" +foreach ($p in $udcProcs) { try { - $_.Kill() - $_.WaitForExit(5000) | Out-Null - Log "Stopped existing UDC.exe (PID $($_.Id))" + Log " stopping UDC.exe PID $($p.Id)" + $p.Kill() + $p.WaitForExit(5000) | Out-Null + Log " stopped" } catch { - Log "Could not stop UDC.exe (PID $($_.Id)): $_" 'WARN' + Log " could not stop UDC.exe PID $($p.Id)" 'WARN' + LogErr $_ } } Start-Sleep -Milliseconds 500 -# --- Ensure local UDC data dir exists --- +# Ensure local UDC data dir exists if (-not (Test-Path -LiteralPath $UdcDataDir)) { - New-Item -ItemType Directory -Path $UdcDataDir -Force | Out-Null + Log "Creating local UDC data dir: $UdcDataDir" + try { + New-Item -Path $UdcDataDir -ItemType Directory -Force | Out-Null + } catch { + Log "Failed to create $UdcDataDir - cannot continue" 'ERROR' + LogErr $_ + Log 'Exit 1.' + exit 1 + } } $localCur = Join-Path $UdcDataDir 'CurrentData.json' $localArc = Join-Path $UdcDataDir 'ArchivedData' -# --- Copy CurrentData.json --- +# Copy CurrentData.json $copiedCur = $false +Log "Copying CurrentData.json" +Log " src: $srcCur" +Log " dst: $localCur" try { Copy-Item -LiteralPath $srcCur -Destination $localCur -Force -ErrorAction Stop $copiedCur = $true - Log "Copied CurrentData.json ($((Get-Item $localCur).Length) bytes)" + $sz = (Get-Item -LiteralPath $localCur).Length + Log " OK ($sz bytes)" } catch { - Log "Copy CurrentData.json failed: $_" 'ERROR' + Log " FAILED" 'ERROR' + LogErr $_ } -# --- Copy ArchivedData/ --- +# Copy ArchivedData/ $copiedArc = $false $arcFiles = 0 $arcBytes = 0 -if (Test-Path -LiteralPath $srcArc) { +if ($srcArcExists) { + Log "Copying ArchivedData/" + Log " src: $srcArc" + Log " dst: $localArc" try { if (Test-Path -LiteralPath $localArc) { + Log " removing existing $localArc" Remove-Item -LiteralPath $localArc -Recurse -Force -ErrorAction SilentlyContinue } Copy-Item -LiteralPath $srcArc -Destination $localArc -Recurse -Force -ErrorAction Stop $arcItems = Get-ChildItem -LiteralPath $localArc -Recurse -File -ErrorAction SilentlyContinue - $arcFiles = $arcItems.Count + $arcFiles = @($arcItems).Count $arcBytes = ($arcItems | Measure-Object Length -Sum).Sum $copiedArc = $true - Log "Copied ArchivedData/ ($arcFiles files, $arcBytes bytes)" + Log " OK ($arcFiles files, $arcBytes bytes)" } catch { - Log "Copy ArchivedData/ failed: $_" 'ERROR' + Log " FAILED" 'ERROR' + LogErr $_ } +} else { + Log "ArchivedData/ not present in backup - skipping that copy step" } -# --- One-shot consumption: only move backup -> migrated/ if BOTH copies succeeded --- -# (If one failed, leave the live backup in place so the next cycle can retry.) -$consumeOk = ($copiedCur -and ($copiedArc -or -not (Test-Path -LiteralPath $srcArc))) +# One-shot consumption: only move backup -> migrated/ if everything required succeeded +$consumeOk = ($copiedCur -and ($copiedArc -or -not $srcArcExists)) +Log "consumeOk=$consumeOk (copiedCur=$copiedCur, copiedArc=$copiedArc, srcArcExists=$srcArcExists)" + if ($consumeOk) { try { $stamp = Get-Date -Format 'yyyy-MM-ddTHH-mm-ssZ' $migDir = Join-Path $bayDir 'migrated' $migStamp = Join-Path $migDir $stamp + Log "Moving consumed backup to $migStamp" if (-not (Test-Path -LiteralPath $migDir)) { New-Item -ItemType Directory -Path $migDir -Force | Out-Null } if (-not (Test-Path -LiteralPath $migStamp)) { New-Item -ItemType Directory -Path $migStamp -Force | Out-Null } Move-Item -LiteralPath $srcCur -Destination (Join-Path $migStamp 'CurrentData.json') -Force -ErrorAction Stop + Log " moved CurrentData.json" if (Test-Path -LiteralPath $srcArc) { Move-Item -LiteralPath $srcArc -Destination (Join-Path $migStamp 'ArchivedData') -Force -ErrorAction Stop + Log " moved ArchivedData/" } $bakManifest = Join-Path $bayDir 'backup.manifest.json' if (Test-Path -LiteralPath $bakManifest) { Move-Item -LiteralPath $bakManifest -Destination (Join-Path $migStamp 'backup.manifest.json') -Force -ErrorAction SilentlyContinue + Log " moved backup.manifest.json" } $restoreManifest = [ordered]@{ RestoredAt = (Get-Date -Format 'o') DestinationHostname = $env:COMPUTERNAME - DestinationUser = [System.Security.Principal.WindowsIdentity]::GetCurrent().Name + DestinationUser = $whoami MachineNumber = $mn - CurrentDataBytes = (Get-Item $localCur).Length - ArchivedDataFiles = $arcFiles - ArchivedDataBytes = $arcBytes + CurrentDataBytes = (Get-Item -LiteralPath $localCur).Length + ArchivedDataFiles = $arcFiles + ArchivedDataBytes = $arcBytes RestoredVia = 'Restore-UDCData.ps1 (manifest engine, on logon)' } $restoreManifest | ConvertTo-Json | Set-Content -Path (Join-Path $migStamp 'restore.manifest.json') -Encoding UTF8 + Log " wrote restore.manifest.json" Log "Backup consumed -> migrated\$stamp\" } catch { - Log "Move-to-migrated failed (data IS restored locally, but live backup remains - next cycle will retry consumption): $_" 'ERROR' + Log "Move-to-migrated FAILED (data IS restored locally; live backup remains, next cycle will retry consumption)" 'ERROR' + LogErr $_ } } else { Log "Restore incomplete - leaving live backup at $bayDir for retry next cycle." 'WARN' } -# --- Relaunch UDC with the current machine number args so it picks up the -# restored CurrentData.json. UDC's vendor autostart in HKLM\Run will also -# fire on the next user logon, so this is belt-and-suspenders for the -# same-session case (e.g. tech is at the keyboard during the restore). --- +# Relaunch UDC with the current machine number args. UDC's vendor autostart in +# HKLM\Run will also fire on the next user logon, so this is belt-and-suspenders +# for the same-session case (e.g. tech is at the keyboard during the restore). if ((Test-Path -LiteralPath $UdcExePath) -and $copiedCur) { + Log "Relaunching UDC.exe: `"$Site`" -$mn" try { Start-Process -FilePath $UdcExePath -ArgumentList @("`"$Site`"", "-$mn") - Log "Relaunched UDC.exe with `"$Site`" -$mn" + Log " relaunched" } catch { - Log "UDC relaunch failed: $_" 'WARN' + Log " UDC relaunch FAILED" 'WARN' + LogErr $_ } } -Log "===============================================" +Log 'Exit 0.' +Log '===============================================' exit 0