diff --git a/download-packages.sh b/download-packages.sh
index c9b47c1..d698934 100755
--- a/download-packages.sh
+++ b/download-packages.sh
@@ -29,6 +29,7 @@ PLAYBOOK_PACKAGES=(
     p7zip-full
     grub-efi-amd64-bin
     grub-common
+    conntrack
 )
 
 # Packages installed during autoinstall late-commands (NetworkManager, WiFi, etc.)
diff --git a/playbook/pxe-server-helpers/99-pxe-conntrack.conf b/playbook/pxe-server-helpers/99-pxe-conntrack.conf
new file mode 100644
index 0000000..e5285db
--- /dev/null
+++ b/playbook/pxe-server-helpers/99-pxe-conntrack.conf
@@ -0,0 +1,13 @@
+# Reduce nf_conntrack TCP timeouts so stale SMB sessions from rebooted WinPE
+# clients age out in minutes instead of days. Real active sessions stay
+# alive via keepalive traffic refreshing the conntrack timer. Defaults
+# (432000s = 5 days for ESTABLISHED) are designed for long-lived enterprise
+# TCP flows, not the short bursty connections PXE imaging creates.
+#
+# Paired with Samba's keepalive=30 + SO_KEEPALIVE (see smb.conf) the
+# combination is: dead sessions cleaned up in ~1 hour max, active sessions
+# never ageing out because every keepalive packet refreshes the timer.
+net.netfilter.nf_conntrack_tcp_timeout_established = 3600
+net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 30
+net.netfilter.nf_conntrack_tcp_timeout_last_ack = 30
+net.netfilter.nf_conntrack_tcp_timeout_close_wait = 30
diff --git a/playbook/pxe-server-helpers/smb-diag.sh b/playbook/pxe-server-helpers/smb-diag.sh
new file mode 100755
index 0000000..0d87e76
--- /dev/null
+++ b/playbook/pxe-server-helpers/smb-diag.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# smb-diag.sh - snapshot Samba + kernel network state so a future failure
+# can be diagnosed remotely. Run this on the PXE server BEFORE power-cycling
+# when a WinPE re-image client is getting "cannot connect" errors.
+#
+# Output: /tmp/smb-diag-YYYYmmdd-HHMMSS.log (pastebin-friendly)
+#
+# Captures: smbd processes, open SMB sessions, port 445 TCP sockets,
+# conntrack, arp, bridge fdb, dnsmasq leases, recent smbd logs.
+#
+# Every probe is best-effort: a missing tool or a stopped service must not
+# abort the snapshot, so there is deliberately no `set -e` here.
+
+set -o pipefail
+
+TS=$(date +%Y%m%d-%H%M%S)
+OUT="/tmp/smb-diag-${TS}.log"
+
+# Mirror everything this script prints (stdout + stderr) into $OUT.
+exec > >(tee "$OUT") 2>&1
+
+echo "=============================================================="
+echo "SMB diagnostic snapshot - $(date)"
+echo "=============================================================="
+
+echo
+echo "### uptime / kernel ###"
+uptime
+uname -r
+
+echo
+echo "### interfaces + bridge state ###"
+ip -brief addr
+echo
+bridge link show 2>/dev/null
+echo
+bridge fdb show 2>/dev/null | head -30
+
+echo
+echo "### smbd process tree ###"
+# MainPID is 0 (or empty) when smbd is not running; pstree would then print
+# an unrelated tree under this heading, so only call it with a real PID.
+SMBD_PID=$(systemctl show -p MainPID --value smbd 2>/dev/null)
+if [[ "$SMBD_PID" =~ ^[1-9][0-9]*$ ]]; then
+    pstree -p "$SMBD_PID" 2>/dev/null
+else
+    echo "smbd has no main PID (service not running?)"
+fi
+echo
+# [s]/[n] keep this grep from matching its own command line.
+ps -eo pid,ppid,state,command | grep -E '[s]mbd|[n]mbd'
+
+echo
+echo "### systemctl status ###"
+systemctl is-active smbd nmbd dnsmasq apache2
+
+echo
+echo "### smbstatus ###"
+smbstatus 2>&1 | head -40
+
+echo
+echo "### port 445 sockets ###"
+# Filter on the port itself (a text grep for ":445" also hits ports such as
+# 44512) and include LISTEN so a missing smbd listener is visible.
+ss -tanp '( sport = :445 or dport = :445 )' 2>/dev/null
+
+echo
+echo "### conntrack entries for PXE subnet ###"
+if command -v conntrack >/dev/null 2>&1; then
+    conntrack -L 2>&1 | grep -E '10\.9\.100' | head -30
+    echo "total conntrack entries: $(conntrack -C 2>&1)"
+else
+    echo "conntrack tool not installed"
+fi
+
+echo
+echo "### arp / neighbour table for PXE subnet ###"
+ip neigh show 2>/dev/null | grep -E '10\.9\.100|br-pxe'
+
+echo
+echo "### dnsmasq DHCP leases ###"
+head -20 /var/lib/misc/dnsmasq.leases 2>/dev/null
+
+echo
+echo "### recent smbd log files ###"
+ls -la /var/log/samba/ 2>/dev/null | head -20
+
+echo
+echo "### recent smbd auth / status errors (all machine logs) ###"
+grep -hE 'NT_STATUS|error|denied' /var/log/samba/log.*.log 2>/dev/null | tail -30
+
+echo
+echo "### last 20 lines of smbd master log ###"
+tail -20 /var/log/samba/log.smbd 2>/dev/null
+
+echo
+echo "=============================================================="
+echo "Snapshot saved to $OUT"
+echo "=============================================================="
diff --git a/playbook/pxe-server-helpers/smb-soft-reset.sh b/playbook/pxe-server-helpers/smb-soft-reset.sh
new file mode 100755
index 0000000..bc3b9d7
--- /dev/null
+++ b/playbook/pxe-server-helpers/smb-soft-reset.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+# smb-soft-reset.sh - progressive soft recovery for the WinPE re-image
+# "System error 53 has occurred" issue without having to power-cycle the box.
+# Error 53 = network path not found; the TCP connect to :445 never
+# completes. Root cause is almost always stale kernel netfilter conntrack
+# state that systemctl restart smbd doesn't touch. Full reboot clears it;
+# this script tries to do the same without the reboot.
+#
+# Run as root (sudo) when the issue happens. It applies increasingly
+# aggressive fixes and pauses between steps so you can retry the WinPE
+# re-image without closing this shell.
+#
+# Step 1: reload smbd config (cheapest, may clear cached state)
+# Step 2: restart nmbd (NetBIOS daemon - separate from smbd)
+# Step 3: restart smbd (full smbd restart, kills all child sessions)
+# Step 4: kill any leftover smbd child processes that survived restart
+# Step 5: flush conntrack for 10.9.100.0/24 (kernel connection tracking)
+# Step 6: flush ARP / neighbour cache on br-pxe
+# Step 7: drop TCP sockets on port 445 via ss -K
+# Step 8: restart dnsmasq (DHCP/TFTP state as a last resort before reboot)
+#
+# A failing step is reported and the script carries on: the later steps
+# are the ones meant for the case where the earlier ones did not help.
+
+set -e
+
+# Print a step banner and block until the operator presses Enter.
+#   $1 - status line shown at the top of the banner
+pause() {
+    echo
+    echo "----------------------------------------------------------------"
+    echo "$1"
+    echo " Try your WinPE re-image NOW. Did it work?"
+    echo " [Enter] to continue to next step, Ctrl+C to stop."
+    echo "----------------------------------------------------------------"
+    read -r _
+}
+
+# Run one recovery command with stderr folded into stdout. A failure is
+# reported but never aborts the script, so later steps stay reachable.
+#   $@ - command and its arguments
+try() {
+    "$@" 2>&1 || echo " WARNING: '$*' failed (exit $?) - continuing" >&2
+}
+
+if [ "$EUID" -ne 0 ]; then
+    echo "Run as root: sudo $0" >&2
+    exit 1
+fi
+
+echo "=== Step 1/8: reload smbd config ==="
+try systemctl reload smbd
+pause "Step 1 done"
+
+echo "=== Step 2/8: restart nmbd ==="
+try systemctl restart nmbd
+pause "Step 2 done"
+
+echo "=== Step 3/8: restart smbd ==="
+try systemctl restart smbd
+pause "Step 3 done"
+
+echo "=== Step 4/8: kill any leftover smbd children ==="
+# systemctl should take care of this with KillMode=control-group, but
+# belt + suspenders since the issue has been surviving restarts. Stop the
+# unit first so only genuine leftovers get signalled, then escalate
+# TERM -> KILL instead of SIGKILLing the freshly restarted daemon.
+try systemctl stop smbd
+if pgrep -x smbd >/dev/null 2>&1; then
+    pkill -TERM -x smbd 2>/dev/null || true
+    sleep 1
+    pkill -KILL -x smbd 2>/dev/null || true
+    sleep 1
+fi
+try systemctl start smbd
+pause "Step 4 done"
+
+echo "=== Step 5/8: flush conntrack entries for 10.9.100.0/24 ==="
+if command -v conntrack >/dev/null 2>&1; then
+    # conntrack -D can exit non-zero (e.g. nothing matched); not an error here.
+    conntrack -D -s 10.9.100.0/24 2>&1 || true
+    conntrack -D -d 10.9.100.0/24 2>&1 || true
+else
+    echo " conntrack tool not installed - skipping (apt install conntrack)"
+fi
+pause "Step 5 done"
+
+echo "=== Step 6/8: flush ARP / neighbour cache on br-pxe ==="
+ip neigh flush dev br-pxe 2>&1 || ip neigh flush all 2>&1 || true
+pause "Step 6 done"
+
+echo "=== Step 7/8: drop tcp sockets on port 445 ==="
+# ss -K requires iproute2 >= 4.11 and CONFIG_INET_DIAG_DESTROY in the kernel.
+# Drops existing TCP connections to port 445 without touching listening socket.
+ss -K dport = 445 2>&1 || echo " ss -K not supported on this system"
+ss -K sport = 445 2>&1 || true
+pause "Step 7 done"
+
+echo "=== Step 8/8: restart dnsmasq ==="
+try systemctl restart dnsmasq
+pause "Step 8 done"
+
+echo
+echo "All soft-fix steps exhausted. If the issue is still present, you'll"
+echo "have to power-cycle. Before you do, run /usr/local/sbin/smb-diag.sh"
+echo "and share the /tmp/smb-diag-*.log output so we can pinpoint which"
+echo "kernel state the reboot is actually clearing."
diff --git a/playbook/pxe_server_setup.yml b/playbook/pxe_server_setup.yml
index 17dd309..d2ccdd6 100644
--- a/playbook/pxe_server_setup.yml
+++ b/playbook/pxe_server_setup.yml
@@ -17,6 +17,7 @@
         - cron
         - ansible
         - wimtools
+        - conntrack
       register: pkg_check
       failed_when: false
       changed_when: false
@@ -364,6 +365,23 @@
           wide links = yes
           unix extensions = no
 
+    - name: "Samba SMB session handling for WinPE re-image robustness"
+      blockinfile:
+        path: /etc/samba/smb.conf
+        backup: yes
+        marker: "# {mark} MANAGED - PXE REIMAGE FIX"
+        insertafter: "# END MANAGED - GLOBAL SYMLINKS"
+        block: |
+          # Reduce the chance a WinPE client rebooting mid-imaging leaves a
+          # stale session on the server that blocks its next connection
+          # attempt with "System error 53 network path not found". Combined
+          # with /etc/sysctl.d/99-pxe-conntrack.conf (shorter nf_conntrack
+          # TCP timeouts) this keeps the conntrack + smbd state in sync with
+          # the short-lived flows that PXE imaging produces.
+          socket options = TCP_NODELAY SO_KEEPALIVE IPTOS_LOWDELAY
+          keepalive = 30
+          deadtime = 5
+
     - name: "Configure Samba shares"
       blockinfile:
         path: /etc/samba/smb.conf
@@ -427,6 +445,35 @@
         executable: /bin/bash
       changed_when: false
 
+    # nf_conntrack sysctls only exist while the module is loaded, and
+    # systemd-sysctl runs early at boot. Load the module statically so the
+    # timeouts below are applied on every boot, not just on this run.
+    - name: "Load nf_conntrack at boot so its sysctl.d settings apply"
+      copy:
+        content: "nf_conntrack\n"
+        dest: /etc/modules-load.d/pxe-conntrack.conf
+        mode: '0644'
+
+    - name: "Load nf_conntrack now (before the sysctl handler runs)"
+      command: modprobe nf_conntrack
+      changed_when: false
+
+    - name: "Deploy nf_conntrack TCP timeout sysctl for PXE workload"
+      copy:
+        src: "{{ usb_mount }}/pxe-server-helpers/99-pxe-conntrack.conf"
+        dest: /etc/sysctl.d/99-pxe-conntrack.conf
+        mode: '0644'
+      notify: reload sysctl
+
+    - name: "Deploy SMB diagnostic + soft-reset helper scripts"
+      copy:
+        src: "{{ usb_mount }}/pxe-server-helpers/{{ item }}"
+        dest: "/usr/local/sbin/{{ item }}"
+        mode: '0755'
+      loop:
+        - smb-diag.sh
+        - smb-soft-reset.sh
+
     - name: "Create image-type top-level directories"
       file:
         path: "{{ samba_share }}/{{ item }}"
@@ -784,3 +831,6 @@
   handlers:
     - name: "Apply netplan"
       command: netplan apply
+
+    - name: "reload sysctl"
+      command: sysctl --system