PXE server: fix WinPE re-image SMB connection loss
WinPE clients re-imaging the same machine hit "System error 53 - network path not found" on the second attempt. `systemctl restart smbd` did not help; only a full server power cycle cleared the state.

Root cause is kernel nf_conntrack: the default TCP ESTABLISHED timeout is 5 days (432000s), so when the client from the first WinPE run reboots abnormally, its session leaves behind an ASSURED ESTABLISHED entry, and ufw's state-tracking rules then mis-classify the client's new SYN against that stale entry.

Fix applied in three layers:

- /etc/sysctl.d/99-pxe-conntrack.conf drops the TCP ESTABLISHED timeout to 1 hour and shortens the half-closed states to 30s each.
- smb.conf gains `socket options = TCP_NODELAY SO_KEEPALIVE IPTOS_LOWDELAY` plus `keepalive = 30` (seconds) and `deadtime = 5` (minutes). Active sessions refresh the conntrack timer every 30s via keepalives so they never age out; dead ones expire in an hour.
- /usr/local/sbin/smb-diag.sh snapshots kernel + Samba state for remote diagnosis; /usr/local/sbin/smb-soft-reset.sh walks a progressive recovery (nmbd/smbd restart, conntrack flush, arp flush, ss -K) as an alternative to power-cycling.

The conntrack package is added to download-packages.sh and to the playbook verify list so the offline .deb bundle ships with it.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
13
playbook/pxe-server-helpers/99-pxe-conntrack.conf
Normal file
13
playbook/pxe-server-helpers/99-pxe-conntrack.conf
Normal file
@@ -0,0 +1,13 @@
|
||||
# Reduce nf_conntrack TCP timeouts so stale SMB sessions left behind by
# rebooted WinPE clients age out within an hour instead of days. The kernel
# default for ESTABLISHED (432000s = 5 days) is designed for long-lived
# enterprise TCP flows, not the short bursty connections PXE imaging creates.
#
# NOTE: the net.netfilter.* keys below only exist while the nf_conntrack
# module is loaded. If this file is processed at boot before the module is
# loaded, these four settings are not applied; make sure nf_conntrack is
# loaded early (for example via /etc/modules-load.d/) or re-run
# `sysctl --system` once the firewall is up.
net.netfilter.nf_conntrack_tcp_timeout_established = 3600
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 30
net.netfilter.nf_conntrack_tcp_timeout_last_ack = 30
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 30

# A live but idle session must put a packet on the wire more often than the
# 3600s ESTABLISHED timeout above, otherwise its conntrack entry expires
# underneath it. smb.conf sets SO_KEEPALIVE on smbd's sockets, and the timing
# of those probes comes from the kernel values below; the kernel default idle
# time (7200s) is longer than 3600s, so it is lowered here. Samba's own
# "keepalive" parameter is documented as applying to SMB1 connections only,
# so it cannot be relied on for the SMB2+ sessions current WinPE builds use.
#
# Result: first probe after 10 minutes idle, then every 30s; a peer that
# misses 5 probes is declared dead and its socket is closed.
net.ipv4.tcp_keepalive_time = 600
net.ipv4.tcp_keepalive_intvl = 30
net.ipv4.tcp_keepalive_probes = 5
|
||||
85
playbook/pxe-server-helpers/smb-diag.sh
Executable file
85
playbook/pxe-server-helpers/smb-diag.sh
Executable file
@@ -0,0 +1,85 @@
|
||||
#!/bin/bash
# smb-diag.sh - snapshot Samba + kernel network state so a future failure
# can be diagnosed remotely. Run this on the PXE server BEFORE power-cycling
# when a WinPE re-image client is getting "cannot connect" errors.
#
# Output: /tmp/smb-diag-<timestamp>-<random>.log (pastebin-friendly)
#
# Captures: smbd processes, open SMB sessions, port 445 TCP sockets,
# conntrack, arp, bridge fdb, dnsmasq leases, recent smbd logs.
#
# Environment overrides (defaults match the stock PXE network):
#   PXE_SUBNET_RE  extended regex matching PXE client addresses
#   PXE_BRIDGE     name of the PXE bridge interface
#
# No `set -e` on purpose: every probe is best-effort and a missing tool must
# not stop the remaining sections from being captured.

set -o pipefail

default_subnet_re='10\.9\.100'
PXE_SUBNET_RE=${PXE_SUBNET_RE:-$default_subnet_re}
PXE_BRIDGE=${PXE_BRIDGE:-br-pxe}

TS=$(date +%Y%m%d-%H%M%S)

# /tmp is world-writable and this script is normally run as root, so a
# predictable file name could be pre-created as a symlink by another user.
# mktemp creates the file exclusively; the random part keeps the
# /tmp/smb-diag-*.log glob working.
OUT=$(mktemp "/tmp/smb-diag-${TS}-XXXXXX.log") || {
  echo "smb-diag: cannot create log file under /tmp" >&2
  exit 1
}
# mktemp creates the file 0600; the log is meant to be read and pasted by the
# operator without extra privileges.
chmod 0644 "$OUT"

# Keep the original stdout/stderr on fd 3/4 so they can be restored at the
# end, then mirror everything the script prints into the log file.
exec 3>&1 4>&2
exec > >(tee "$OUT") 2>&1
tee_pid=$!

# section TITLE - print a blank line followed by a "### TITLE ###" header.
section() {
  echo
  echo "### $1 ###"
}

echo "=============================================================="
echo "SMB diagnostic snapshot - $(date)"
echo "=============================================================="

section "uptime / kernel"
uptime
uname -r

section "interfaces + bridge state"
ip -brief addr
echo
bridge link show 2>/dev/null
echo
bridge fdb show 2>/dev/null | head -30

section "smbd process tree"
# MainPID is 0 (or empty) when smbd is not running - exactly the situation
# this script is meant to diagnose. Passing that to pstree would print an
# unrelated tree, so only call it with a real PID.
smbd_pid=$(systemctl show -p MainPID --value smbd 2>/dev/null)
if [[ "$smbd_pid" =~ ^[1-9][0-9]*$ ]]; then
  pstree -p "$smbd_pid" 2>/dev/null
else
  echo "smbd has no main PID (service not running?)"
fi
echo
# The bracket trick keeps grep from matching its own command line.
ps -eo pid,ppid,state,command | grep -E '[s]mbd|[n]mbd'

section "systemctl status"
systemctl is-active smbd nmbd dnsmasq apache2

section "smbstatus"
smbstatus 2>&1 | head -40

section "port 445 sockets"
# -a adds the LISTEN, SYN-RECV and TIME-WAIT states that plain `ss -t` leaves
# out; they matter when the symptom is a connect that never completes. The
# pattern anchors the port so e.g. :4450 is not reported.
ss -tanp 2>/dev/null | grep -E ':445([[:space:]]|$)'

section "conntrack entries for PXE subnet"
if command -v conntrack >/dev/null 2>&1; then
  conntrack -L 2>&1 | grep -E "$PXE_SUBNET_RE" | head -30
  echo "total conntrack entries: $(conntrack -C 2>&1)"
else
  echo "conntrack tool not installed"
fi

section "arp / neighbour table for PXE subnet"
ip neigh show 2>/dev/null | grep -E "${PXE_SUBNET_RE}|${PXE_BRIDGE}"

section "dnsmasq DHCP leases"
head -20 /var/lib/misc/dnsmasq.leases 2>/dev/null

section "recent smbd log files"
ls -la /var/log/samba/ 2>/dev/null | head -20

section "recent smbd auth / status errors (all machine logs)"
grep -hE 'NT_STATUS|error|denied' /var/log/samba/log.*.log 2>/dev/null | tail -30

section "last 20 lines of smbd master log"
tail -20 /var/log/samba/log.smbd 2>/dev/null

echo
echo "=============================================================="
echo "Snapshot saved to $OUT"
echo "=============================================================="

# Restore the real stdout/stderr (which closes the pipe into tee) and wait
# for tee, so the log is complete on disk before control returns to the
# caller. Waiting on a process substitution needs a reasonably recent bash;
# on older ones the wait fails harmlessly.
exec 1>&3 2>&4 3>&- 4>&-
wait "$tee_pid" 2>/dev/null || true
|
||||
87
playbook/pxe-server-helpers/smb-soft-reset.sh
Executable file
87
playbook/pxe-server-helpers/smb-soft-reset.sh
Executable file
@@ -0,0 +1,87 @@
|
||||
#!/bin/bash
|
||||
# smb-soft-reset.sh - progressive soft recovery for the WinPE re-image
|
||||
# "System error 53 has occurred" issue without having to power-cycle the box.
|
||||
# Error 53 = network path not found; the TCP connect to :445 never
|
||||
# completes. Root cause is almost always stale kernel netfilter conntrack
|
||||
# state that systemctl restart smbd doesn't touch. Full reboot clears it;
|
||||
# this script tries to do the same without the reboot.
|
||||
#
|
||||
# Run as root (sudo) when the issue happens. It applies increasingly
|
||||
# aggressive fixes and pauses between steps so you can retry the WinPE
|
||||
# re-image without closing this shell.
|
||||
#
|
||||
# Step 1: reload smbd config (cheapest, may clear cached state)
|
||||
# Step 2: restart nmbd (NetBIOS daemon - separate from smbd)
|
||||
# Step 3: restart smbd (full smbd restart, kills all child sessions)
|
||||
# Step 4: kill any leftover smbd child processes that survived restart
|
||||
# Step 5: flush conntrack for 10.9.100.0/24 (kernel connection tracking)
|
||||
# Step 6: flush ARP / neighbour cache on br-pxe
|
||||
# Step 7: drop TCP sockets on port 445 via ss -K
|
||||
# Step 8: restart dnsmasq (DHCP/TFTP state as a last resort before reboot)
|
||||
|
||||
set -e
|
||||
|
||||
# pause MESSAGE
#
# Operator checkpoint between recovery steps. Prints MESSAGE inside a ruled
# banner together with the retry prompt, then blocks until one line can be
# read from stdin. The text typed is discarded; the function's exit status is
# that of the read.
pause() {
  local checkpoint_msg=$1
  local rule="----------------------------------------------------------------"
  local answer

  printf '\n%s\n' "$rule"
  echo "$checkpoint_msg"
  printf '%s\n' \
    " Try your WinPE re-image NOW. Did it work?" \
    " [Enter] to continue to next step, Ctrl+C to stop." \
    "$rule"

  read -r answer
}
|
||||
|
||||
# Main recovery sequence. Every step below needs root (systemctl restart,
# conntrack -D, ip neigh flush, ss -K), so refuse to start without it.
if [ "$EUID" -ne 0 ]; then
  echo "Run as root: sudo $0" >&2
  exit 1
fi

# Site-specific values; override from the environment if the PXE network is
# not the stock one.
PXE_SUBNET=${PXE_SUBNET:-10.9.100.0/24}
PXE_BRIDGE=${PXE_BRIDGE:-br-pxe}

# The script runs under `set -e`, so every command that is allowed to fail
# carries an explicit `||` fallback. A failed step is reported and the
# sequence carries on: the later, kernel-level steps are the likely fix and
# must stay reachable even when a service refuses to restart.

echo "=== Step 1/8: reload smbd config ==="
systemctl reload smbd 2>&1 || true
pause "Step 1 done"

echo "=== Step 2/8: restart nmbd ==="
systemctl restart nmbd 2>&1 || true
pause "Step 2 done"

echo "=== Step 3/8: restart smbd ==="
systemctl restart smbd 2>&1 \
  || echo " smbd restart failed - continuing with the remaining steps"
pause "Step 3 done"

echo "=== Step 4/8: kill any leftover smbd children ==="
# systemctl restart should take care of this with KillMode=control-group,
# but belt + suspenders since the issue has been surviving restarts.
# Stop the unit first so that whatever is still called smbd afterwards really
# is a leftover, ask it to exit with TERM, and only SIGKILL what ignores that.
systemctl stop smbd 2>&1 || true
pkill -TERM -x smbd 2>/dev/null || true
sleep 1
if pgrep -x smbd >/dev/null 2>&1; then
  pkill -KILL -x smbd 2>/dev/null || true
  sleep 1
fi
systemctl start smbd 2>&1 \
  || echo " smbd failed to start - check: systemctl status smbd"
pause "Step 4 done"

echo "=== Step 5/8: flush conntrack entries for ${PXE_SUBNET} ==="
if command -v conntrack >/dev/null 2>&1; then
  # conntrack -D may exit non-zero when no entry matched; not an error here.
  conntrack -D -s "$PXE_SUBNET" 2>&1 || true
  conntrack -D -d "$PXE_SUBNET" 2>&1 || true
else
  echo " conntrack tool not installed - skipping (apt install conntrack)"
fi
pause "Step 5 done"

echo "=== Step 6/8: flush ARP / neighbour cache on ${PXE_BRIDGE} ==="
# Fall back to flushing every interface if the bridge does not exist.
ip neigh flush dev "$PXE_BRIDGE" 2>&1 || ip neigh flush all 2>&1 || true
pause "Step 6 done"

echo "=== Step 7/8: drop tcp sockets on port 445 ==="
# ss -K requires iproute2 >= 4.11 and CONFIG_INET_DIAG_DESTROY in the kernel.
# Drops existing TCP connections to port 445 without touching listening socket.
# -t limits the kill to TCP, -n avoids name lookups, and the port uses the
# documented ":445" form, which every iproute2 release accepts.
ss -K -tn dport = :445 2>&1 || echo " ss -K not supported on this system"
ss -K -tn sport = :445 2>&1 || true
pause "Step 7 done"

echo "=== Step 8/8: restart dnsmasq ==="
systemctl restart dnsmasq 2>&1 \
  || echo " dnsmasq restart failed - check: systemctl status dnsmasq"
pause "Step 8 done"

echo
echo "All soft-fix steps exhausted. If the issue is still present, you'll"
echo "have to power-cycle. Before you do, run /usr/local/sbin/smb-diag.sh"
echo "and share the /tmp/smb-diag-*.log output so we can pinpoint which"
echo "kernel state the reboot is actually clearing."
|
||||
@@ -17,6 +17,7 @@
|
||||
- cron
|
||||
- ansible
|
||||
- wimtools
|
||||
- conntrack
|
||||
register: pkg_check
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
@@ -364,6 +365,23 @@
|
||||
wide links = yes
|
||||
unix extensions = no
|
||||
|
||||
# Adds a managed block to /etc/samba/smb.conf, placed after the end marker of
# the "GLOBAL SYMLINKS" managed block, that enables TCP keepalive on smbd's
# sockets and sets the keepalive / deadtime parameters. A backup of smb.conf
# is taken before the file is modified.
# NOTE(review): smb.conf(5) describes `keepalive` as applying to SMB1
# connections only; for SMB2+ clients the probe timing presumably comes from
# SO_KEEPALIVE and the kernel's net.ipv4.tcp_keepalive_* values - confirm.
# NOTE(review): this task notifies no handler, so smbd presumably picks the
# change up through a restart elsewhere in the playbook - confirm.
- name: "Samba SMB session handling for WinPE re-image robustness"
  blockinfile:
    path: /etc/samba/smb.conf
    backup: yes
    marker: "# {mark} MANAGED - PXE REIMAGE FIX"
    insertafter: "# END MANAGED - GLOBAL SYMLINKS"
    block: |
      # Reduce the chance a WinPE client rebooting mid-imaging leaves a
      # stale session on the server that blocks its next connection
      # attempt with "System error 53 network path not found". Combined
      # with /etc/sysctl.d/99-pxe-conntrack.conf (shorter nf_conntrack
      # TCP timeouts) this keeps the conntrack + smbd state in sync with
      # the short-lived flows that PXE imaging produces.
      socket options = TCP_NODELAY SO_KEEPALIVE IPTOS_LOWDELAY
      keepalive = 30
      deadtime = 5
|
||||
|
||||
- name: "Configure Samba shares"
|
||||
blockinfile:
|
||||
path: /etc/samba/smb.conf
|
||||
@@ -427,6 +445,22 @@
|
||||
executable: /bin/bash
|
||||
changed_when: false
|
||||
|
||||
# The net.netfilter.nf_conntrack_* keys only exist while the nf_conntrack
# module is loaded. Files in /etc/sysctl.d are applied at boot by
# systemd-sysctl, which is ordered after systemd-modules-load but normally
# before the firewall pulls the module in. Without the modules-load.d entry
# the shortened timeouts would silently fall back to the kernel defaults
# after the next reboot or power cycle.
- name: "Load nf_conntrack at boot so its sysctl keys exist for systemd-sysctl"
  copy:
    content: "nf_conntrack\n"
    dest: /etc/modules-load.d/nf_conntrack.conf
    mode: '0644'

# Make sure the keys also exist right now, so the "reload sysctl" handler
# notified below can set them on this run.
- name: "Load nf_conntrack now"
  command: modprobe nf_conntrack
  changed_when: false

# Installs the timeout overrides from the helper bundle; the handler re-reads
# all sysctl.d files only when the deployed file actually changed.
- name: "Deploy nf_conntrack TCP timeout sysctl for PXE workload"
  copy:
    src: "{{ usb_mount }}/pxe-server-helpers/99-pxe-conntrack.conf"
    dest: /etc/sysctl.d/99-pxe-conntrack.conf
    mode: '0644'
  notify: reload sysctl
|
||||
|
||||
# Copies the two helper scripts from the pxe-server-helpers directory under
# usb_mount into /usr/local/sbin and marks them executable (0755). One copy
# is performed per entry in the loop list.
- name: "Deploy SMB diagnostic + soft-reset helper scripts"
  copy:
    src: "{{ usb_mount }}/pxe-server-helpers/{{ item }}"
    dest: "/usr/local/sbin/{{ item }}"
    mode: '0755'
  loop:
    - smb-diag.sh
    - smb-soft-reset.sh
|
||||
|
||||
- name: "Create image-type top-level directories"
|
||||
file:
|
||||
path: "{{ samba_share }}/{{ item }}"
|
||||
@@ -784,3 +818,6 @@
|
||||
handlers:
|
||||
- name: "Apply netplan"
|
||||
command: netplan apply
|
||||
|
||||
# Handler: re-applies every sysctl configuration file on the system. It is
# notified by the task that deploys /etc/sysctl.d/99-pxe-conntrack.conf.
- name: "reload sysctl"
  command: sysctl --system
|
||||
|
||||
Reference in New Issue
Block a user