Restore-UDCData: verbose per-cycle logging + share-reachability retry
Two production-debuggability gaps closed. 1. Logging is now always-on. The previous version exited silently on the common no-op paths (no UDC installed, no backup waiting, share not reachable), leaving zero log evidence when techs reported "restore didn't happen". New behavior writes a header + identity + share-path + decision-point line to a single rotating log file every cycle. Errors include exception type, position, and full ScriptStackTrace. Log lives at C:\Logs\UDC\Restore-UDCData.log with a 1 MB cap and one-generation rotation to .old.log. 2. Share-reachability is now polled instead of probed once. The SFLD share over the SMB redirector takes 20-60 s to become reachable from SYSTEM context after a cold logon, especially on the first GE-Enforce cycle of the boot. The old single Test-Path returned false in that window and the script silently exited, missing the backup. New behavior polls Test-Path on the share root every 3 s for up to 60 s (both tunable via -ShareTimeoutSec / -SharePollSec) before deciding "no backup". If the share never comes up in that window the script exits 1 instead of 0 so the dispatcher logs a visible failure. Both behaviors propagated to the host staging copy at /home/camp/pxe-images/Restore-UDCData.ps1 and to the v2 share-staged copy at tsgwp00525-v2/.../standard-machine/scripts/Restore-UDCData.ps1. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -6,7 +6,7 @@
|
||||
# whether there's actually any work to do.
|
||||
#
|
||||
# CONTRACT:
|
||||
# - 99% of cycles: no backup waiting -> exit 0 in ~1 second, no side effects
|
||||
# - 99% of cycles: no backup waiting -> exit 0 in ~1 second, ~5 lines of log
|
||||
# - 1 cycle (the one after Backup-UDCData lands a backup for this PC's bay):
|
||||
# stop UDC, copy CurrentData.json + ArchivedData/ to C:\ProgramData\UDC,
|
||||
# move consumed backup to <bay>\migrated\<timestamp>\, write
|
||||
@@ -25,6 +25,12 @@
|
||||
# the backup existed). Update-MachineNumber.ps1's branch still handles the
|
||||
# secondary case (tech used 9999 placeholder + sets number at bay) - both
|
||||
# triggers safely no-op if the other already consumed the backup.
|
||||
#
|
||||
# LOGGING:
|
||||
# Single rotating log at C:\Logs\UDC\Restore-UDCData.log (1 MB cap, rotated
|
||||
# to .old.log on overflow). Every cycle writes a header line so even the
|
||||
# silent no-op path leaves a trace. Errors include full exception type,
|
||||
# position, and stack trace.
|
||||
|
||||
[CmdletBinding()]
|
||||
param(
|
||||
@@ -32,160 +38,274 @@ param(
|
||||
[string]$UdcDataDir = 'C:\ProgramData\UDC',
|
||||
[string]$UdcExePath = 'C:\Program Files\UDC\UDC.exe',
|
||||
[string]$UdcSettingsPath = 'C:\ProgramData\UDC\udc_settings.json',
|
||||
[string]$Site = 'West Jefferson'
|
||||
[string]$Site = 'West Jefferson',
|
||||
# Share can take 20-60s to become reachable from SYSTEM context after a
|
||||
# cold boot or fresh logon. Retry until then before deciding "no backup".
|
||||
[int]$ShareTimeoutSec = 60,
|
||||
[int]$SharePollSec = 3
|
||||
)
|
||||
|
||||
$ErrorActionPreference = 'Stop'
|
||||
$ErrorActionPreference = 'Continue'
|
||||
|
||||
# -- Logging setup --------------------------------------------------------
|
||||
$logDir = 'C:\Logs\UDC'
|
||||
if (-not (Test-Path $logDir)) {
|
||||
try { New-Item -Path $logDir -ItemType Directory -Force | Out-Null } catch { $logDir = $env:TEMP }
|
||||
}
|
||||
try {
|
||||
if (-not (Test-Path $logDir)) { New-Item -Path $logDir -ItemType Directory -Force | Out-Null }
|
||||
} catch { $logDir = $env:TEMP }
|
||||
$logFile = Join-Path $logDir 'Restore-UDCData.log'
|
||||
$logFileMaxBytes = 1MB
|
||||
|
||||
# Rotate log file if oversized (keeps one prior generation)
|
||||
try {
|
||||
if ((Test-Path $logFile) -and ((Get-Item $logFile).Length -gt $logFileMaxBytes)) {
|
||||
$rotated = Join-Path $logDir 'Restore-UDCData.old.log'
|
||||
if (Test-Path $rotated) { Remove-Item $rotated -Force -ErrorAction SilentlyContinue }
|
||||
Rename-Item -Path $logFile -NewName 'Restore-UDCData.old.log' -Force -ErrorAction SilentlyContinue
|
||||
}
|
||||
} catch {}
|
||||
|
||||
function Log {
|
||||
param([string]$Msg, [string]$Level = 'INFO')
|
||||
$ts = Get-Date -Format 'yyyy-MM-dd HH:mm:ss'
|
||||
$ts = Get-Date -Format 'yyyy-MM-dd HH:mm:ss.fff'
|
||||
$line = "[$ts][$Level] $Msg"
|
||||
Add-Content -LiteralPath $logFile -Value $line
|
||||
try { Add-Content -LiteralPath $logFile -Value $line -ErrorAction SilentlyContinue } catch {}
|
||||
Write-Host $line
|
||||
}
|
||||
|
||||
# --- Resolve local machine number ---
|
||||
function LogErr {
|
||||
param($Err)
|
||||
if (-not $Err) { return }
|
||||
$exType = if ($Err.Exception) { $Err.Exception.GetType().FullName } else { '<no exception>' }
|
||||
$exMsg = if ($Err.Exception) { $Err.Exception.Message } else { "$Err" }
|
||||
Log " exception: $exType - $exMsg" 'ERROR'
|
||||
if ($Err.InvocationInfo -and $Err.InvocationInfo.PositionMessage) {
|
||||
$pos = ($Err.InvocationInfo.PositionMessage -replace "`r?`n", ' | ')
|
||||
Log " at: $pos" 'ERROR'
|
||||
}
|
||||
if ($Err.ScriptStackTrace) {
|
||||
$st = ($Err.ScriptStackTrace -replace "`r?`n", ' | ')
|
||||
Log " stack: $st" 'ERROR'
|
||||
}
|
||||
if ($Err.Exception -and $Err.Exception.InnerException) {
|
||||
Log " inner: $($Err.Exception.InnerException.Message)" 'ERROR'
|
||||
}
|
||||
}
|
||||
|
||||
Log '==============================================='
|
||||
Log "Restore-UDCData starting (PID $PID)"
|
||||
Log "Hostname: $env:COMPUTERNAME"
|
||||
try {
|
||||
$whoami = [System.Security.Principal.WindowsIdentity]::GetCurrent().Name
|
||||
} catch { $whoami = '<unknown>' }
|
||||
Log "User identity: $whoami"
|
||||
Log "PowerShell version: $($PSVersionTable.PSVersion)"
|
||||
Log "BackupShareRoot: $BackupShareRoot"
|
||||
Log "UdcDataDir: $UdcDataDir"
|
||||
Log "UdcSettingsPath: $UdcSettingsPath"
|
||||
Log "ShareTimeoutSec: $ShareTimeoutSec SharePollSec: $SharePollSec"
|
||||
|
||||
# -- Resolve local machine number ----------------------------------------
|
||||
if (-not (Test-Path -LiteralPath $UdcSettingsPath)) {
|
||||
# No UDC installed yet. Manifest engine will catch up later if/when it lands.
|
||||
Log "udc_settings.json not present - UDC not installed yet, no work to do."
|
||||
Log 'Exit 0.'
|
||||
exit 0
|
||||
}
|
||||
try {
|
||||
$json = Get-Content -LiteralPath $UdcSettingsPath -Raw | ConvertFrom-Json
|
||||
$json = Get-Content -LiteralPath $UdcSettingsPath -Raw -ErrorAction Stop | ConvertFrom-Json -ErrorAction Stop
|
||||
$mn = $json.GeneralSettings.MachineNumber
|
||||
Log "Resolved MachineNumber from udc_settings: $mn"
|
||||
} catch {
|
||||
Log "Failed to parse $UdcSettingsPath : $_" 'ERROR'
|
||||
Log "Failed to parse $UdcSettingsPath" 'ERROR'
|
||||
LogErr $_
|
||||
Log 'Exit 0.'
|
||||
exit 0
|
||||
}
|
||||
if (-not $mn -or $mn -eq '9999' -or $mn -notmatch '^\d+$') {
|
||||
# Placeholder or invalid - the placeholder->real trigger in
|
||||
# Update-MachineNumber.ps1 will catch it when the tech sets a real number.
|
||||
Log "Machine number is placeholder/empty/non-numeric ('$mn'). Update-MachineNumber.ps1's branch will catch the placeholder->real transition. No work to do."
|
||||
Log 'Exit 0.'
|
||||
exit 0
|
||||
}
|
||||
|
||||
# --- Probe for a waiting backup ---
|
||||
# -- Wait for the share to be reachable ---------------------------------
|
||||
# When this script runs early in a logon (e.g. via GE-Enforce on autologon),
|
||||
# the SFLD share via the SMB redirector can take 20-60 seconds to become
|
||||
# reachable, especially in SYSTEM context where the credential is the
|
||||
# computer account. Poll until reachable or timeout before deciding "no backup".
|
||||
Log "Polling share root for reachability: $BackupShareRoot"
|
||||
$shareReachable = $false
|
||||
$sw = [Diagnostics.Stopwatch]::StartNew()
|
||||
while ($sw.Elapsed.TotalSeconds -lt $ShareTimeoutSec) {
|
||||
if (Test-Path -LiteralPath $BackupShareRoot) {
|
||||
$shareReachable = $true
|
||||
break
|
||||
}
|
||||
Start-Sleep -Seconds $SharePollSec
|
||||
}
|
||||
$sw.Stop()
|
||||
if ($shareReachable) {
|
||||
Log ("Share reachable after {0:N1} s" -f $sw.Elapsed.TotalSeconds)
|
||||
} else {
|
||||
Log "Share NOT reachable after $ShareTimeoutSec s. Cannot probe for backup. Exiting non-zero so the dispatcher logs a failure." 'ERROR'
|
||||
Log 'Exit 1.'
|
||||
exit 1
|
||||
}
|
||||
|
||||
# -- Probe for a waiting backup ------------------------------------------
|
||||
$bayDir = Join-Path $BackupShareRoot $mn
|
||||
$srcCur = Join-Path $bayDir 'CurrentData.json'
|
||||
$srcArc = Join-Path $bayDir 'ArchivedData'
|
||||
Log "Probing backup paths for bay $mn"
|
||||
Log " bayDir: $bayDir"
|
||||
$bayDirExists = Test-Path -LiteralPath $bayDir
|
||||
Log " bayDir exists: $bayDirExists"
|
||||
$srcCurExists = Test-Path -LiteralPath $srcCur
|
||||
Log " CurrentData.json src: $(if ($srcCurExists) { 'present' } else { 'absent' }) - $srcCur"
|
||||
$srcArcExists = Test-Path -LiteralPath $srcArc
|
||||
Log " ArchivedData/ src: $(if ($srcArcExists) { 'present' } else { 'absent' }) - $srcArc"
|
||||
|
||||
if (-not (Test-Path -LiteralPath $srcCur)) {
|
||||
# Most-common path: no backup waiting. Exit silently to keep enforce-cycle
|
||||
# logs clean. (Manifest engine still records that the entry ran.)
|
||||
if (-not $srcCurExists) {
|
||||
Log "No backup waiting for bay $mn - no work to do this cycle."
|
||||
Log 'Exit 0.'
|
||||
exit 0
|
||||
}
|
||||
|
||||
# We have a backup. From here on, log everything.
|
||||
Log "==============================================="
|
||||
Log "UDC data backup detected at $bayDir - restoring."
|
||||
Log "Hostname: $env:COMPUTERNAME"
|
||||
Log "Machine number: $mn"
|
||||
# -- We have a backup. Restore. ------------------------------------------
|
||||
Log "Backup waiting at $bayDir - proceeding with restore"
|
||||
|
||||
# --- Stop UDC.exe so CurrentData.json isn't locked ---
|
||||
Get-Process UDC -ErrorAction SilentlyContinue | ForEach-Object {
|
||||
# Stop UDC.exe so CurrentData.json isn't locked
|
||||
$udcProcs = @(Get-Process UDC -ErrorAction SilentlyContinue)
|
||||
Log "UDC processes currently running: $($udcProcs.Count)"
|
||||
foreach ($p in $udcProcs) {
|
||||
try {
|
||||
$_.Kill()
|
||||
$_.WaitForExit(5000) | Out-Null
|
||||
Log "Stopped existing UDC.exe (PID $($_.Id))"
|
||||
Log " stopping UDC.exe PID $($p.Id)"
|
||||
$p.Kill()
|
||||
$p.WaitForExit(5000) | Out-Null
|
||||
Log " stopped"
|
||||
} catch {
|
||||
Log "Could not stop UDC.exe (PID $($_.Id)): $_" 'WARN'
|
||||
Log " could not stop UDC.exe PID $($p.Id)" 'WARN'
|
||||
LogErr $_
|
||||
}
|
||||
}
|
||||
Start-Sleep -Milliseconds 500
|
||||
|
||||
# --- Ensure local UDC data dir exists ---
|
||||
# Ensure local UDC data dir exists
|
||||
if (-not (Test-Path -LiteralPath $UdcDataDir)) {
|
||||
New-Item -ItemType Directory -Path $UdcDataDir -Force | Out-Null
|
||||
Log "Creating local UDC data dir: $UdcDataDir"
|
||||
try {
|
||||
New-Item -Path $UdcDataDir -ItemType Directory -Force | Out-Null
|
||||
} catch {
|
||||
Log "Failed to create $UdcDataDir - cannot continue" 'ERROR'
|
||||
LogErr $_
|
||||
Log 'Exit 1.'
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
$localCur = Join-Path $UdcDataDir 'CurrentData.json'
|
||||
$localArc = Join-Path $UdcDataDir 'ArchivedData'
|
||||
|
||||
# --- Copy CurrentData.json ---
|
||||
# Copy CurrentData.json
|
||||
$copiedCur = $false
|
||||
Log "Copying CurrentData.json"
|
||||
Log " src: $srcCur"
|
||||
Log " dst: $localCur"
|
||||
try {
|
||||
Copy-Item -LiteralPath $srcCur -Destination $localCur -Force -ErrorAction Stop
|
||||
$copiedCur = $true
|
||||
Log "Copied CurrentData.json ($((Get-Item $localCur).Length) bytes)"
|
||||
$sz = (Get-Item -LiteralPath $localCur).Length
|
||||
Log " OK ($sz bytes)"
|
||||
} catch {
|
||||
Log "Copy CurrentData.json failed: $_" 'ERROR'
|
||||
Log " FAILED" 'ERROR'
|
||||
LogErr $_
|
||||
}
|
||||
|
||||
# --- Copy ArchivedData/ ---
|
||||
# Copy ArchivedData/
|
||||
$copiedArc = $false
|
||||
$arcFiles = 0
|
||||
$arcBytes = 0
|
||||
if (Test-Path -LiteralPath $srcArc) {
|
||||
if ($srcArcExists) {
|
||||
Log "Copying ArchivedData/"
|
||||
Log " src: $srcArc"
|
||||
Log " dst: $localArc"
|
||||
try {
|
||||
if (Test-Path -LiteralPath $localArc) {
|
||||
Log " removing existing $localArc"
|
||||
Remove-Item -LiteralPath $localArc -Recurse -Force -ErrorAction SilentlyContinue
|
||||
}
|
||||
Copy-Item -LiteralPath $srcArc -Destination $localArc -Recurse -Force -ErrorAction Stop
|
||||
$arcItems = Get-ChildItem -LiteralPath $localArc -Recurse -File -ErrorAction SilentlyContinue
|
||||
$arcFiles = $arcItems.Count
|
||||
$arcFiles = @($arcItems).Count
|
||||
$arcBytes = ($arcItems | Measure-Object Length -Sum).Sum
|
||||
$copiedArc = $true
|
||||
Log "Copied ArchivedData/ ($arcFiles files, $arcBytes bytes)"
|
||||
Log " OK ($arcFiles files, $arcBytes bytes)"
|
||||
} catch {
|
||||
Log "Copy ArchivedData/ failed: $_" 'ERROR'
|
||||
Log " FAILED" 'ERROR'
|
||||
LogErr $_
|
||||
}
|
||||
} else {
|
||||
Log "ArchivedData/ not present in backup - skipping that copy step"
|
||||
}
|
||||
|
||||
# --- One-shot consumption: only move backup -> migrated/ if BOTH copies succeeded ---
|
||||
# (If one failed, leave the live backup in place so the next cycle can retry.)
|
||||
$consumeOk = ($copiedCur -and ($copiedArc -or -not (Test-Path -LiteralPath $srcArc)))
|
||||
# One-shot consumption: only move backup -> migrated/ if everything required succeeded
|
||||
$consumeOk = ($copiedCur -and ($copiedArc -or -not $srcArcExists))
|
||||
Log "consumeOk=$consumeOk (copiedCur=$copiedCur, copiedArc=$copiedArc, srcArcExists=$srcArcExists)"
|
||||
|
||||
if ($consumeOk) {
|
||||
try {
|
||||
$stamp = Get-Date -Format 'yyyy-MM-ddTHH-mm-ssZ'
|
||||
$migDir = Join-Path $bayDir 'migrated'
|
||||
$migStamp = Join-Path $migDir $stamp
|
||||
Log "Moving consumed backup to $migStamp"
|
||||
if (-not (Test-Path -LiteralPath $migDir)) { New-Item -ItemType Directory -Path $migDir -Force | Out-Null }
|
||||
if (-not (Test-Path -LiteralPath $migStamp)) { New-Item -ItemType Directory -Path $migStamp -Force | Out-Null }
|
||||
|
||||
Move-Item -LiteralPath $srcCur -Destination (Join-Path $migStamp 'CurrentData.json') -Force -ErrorAction Stop
|
||||
Log " moved CurrentData.json"
|
||||
if (Test-Path -LiteralPath $srcArc) {
|
||||
Move-Item -LiteralPath $srcArc -Destination (Join-Path $migStamp 'ArchivedData') -Force -ErrorAction Stop
|
||||
Log " moved ArchivedData/"
|
||||
}
|
||||
$bakManifest = Join-Path $bayDir 'backup.manifest.json'
|
||||
if (Test-Path -LiteralPath $bakManifest) {
|
||||
Move-Item -LiteralPath $bakManifest -Destination (Join-Path $migStamp 'backup.manifest.json') -Force -ErrorAction SilentlyContinue
|
||||
Log " moved backup.manifest.json"
|
||||
}
|
||||
|
||||
$restoreManifest = [ordered]@{
|
||||
RestoredAt = (Get-Date -Format 'o')
|
||||
DestinationHostname = $env:COMPUTERNAME
|
||||
DestinationUser = [System.Security.Principal.WindowsIdentity]::GetCurrent().Name
|
||||
DestinationUser = $whoami
|
||||
MachineNumber = $mn
|
||||
CurrentDataBytes = (Get-Item $localCur).Length
|
||||
CurrentDataBytes = (Get-Item -LiteralPath $localCur).Length
|
||||
ArchivedDataFiles = $arcFiles
|
||||
ArchivedDataBytes = $arcBytes
|
||||
RestoredVia = 'Restore-UDCData.ps1 (manifest engine, on logon)'
|
||||
}
|
||||
$restoreManifest | ConvertTo-Json | Set-Content -Path (Join-Path $migStamp 'restore.manifest.json') -Encoding UTF8
|
||||
Log " wrote restore.manifest.json"
|
||||
|
||||
Log "Backup consumed -> migrated\$stamp\"
|
||||
} catch {
|
||||
Log "Move-to-migrated failed (data IS restored locally, but live backup remains - next cycle will retry consumption): $_" 'ERROR'
|
||||
Log "Move-to-migrated FAILED (data IS restored locally; live backup remains, next cycle will retry consumption)" 'ERROR'
|
||||
LogErr $_
|
||||
}
|
||||
} else {
|
||||
Log "Restore incomplete - leaving live backup at $bayDir for retry next cycle." 'WARN'
|
||||
}
|
||||
|
||||
# --- Relaunch UDC with the current machine number args so it picks up the
|
||||
# restored CurrentData.json. UDC's vendor autostart in HKLM\Run will also
|
||||
# fire on the next user logon, so this is belt-and-suspenders for the
|
||||
# same-session case (e.g. tech is at the keyboard during the restore). ---
|
||||
# Relaunch UDC with the current machine number args. UDC's vendor autostart in
|
||||
# HKLM\Run will also fire on the next user logon, so this is belt-and-suspenders
|
||||
# for the same-session case (e.g. tech is at the keyboard during the restore).
|
||||
if ((Test-Path -LiteralPath $UdcExePath) -and $copiedCur) {
|
||||
Log "Relaunching UDC.exe: `"$Site`" -$mn"
|
||||
try {
|
||||
Start-Process -FilePath $UdcExePath -ArgumentList @("`"$Site`"", "-$mn")
|
||||
Log "Relaunched UDC.exe with `"$Site`" -$mn"
|
||||
Log " relaunched"
|
||||
} catch {
|
||||
Log "UDC relaunch failed: $_" 'WARN'
|
||||
Log " UDC relaunch FAILED" 'WARN'
|
||||
LogErr $_
|
||||
}
|
||||
}
|
||||
|
||||
Log "==============================================="
|
||||
Log 'Exit 0.'
|
||||
Log '==============================================='
|
||||
exit 0
|
||||
|
||||
Reference in New Issue
Block a user