From 70f176650bd67af75bde099660de637376444ef7 Mon Sep 17 00:00:00 2001 From: cproudlock Date: Wed, 22 Apr 2026 18:08:57 -0400 Subject: [PATCH] Blancco: playbook now produces working Ubuntu-kernel initramfs out of the box Companion to the previous commit (4550d43). Three files that should have been in the same commit but got left out of `git add`: - .gitignore: negate rule for boot-tools/blancco/grub-blancco.cfg so the tracked cfg (source of truth for grubx64.efi rebuilds) survives the blanket boot-tools/ ignore. - playbook/blancco-init.sh: rewritten for modprobe-with-deps, full NIC driver coverage, set -x trace to /dev/console, dmesg + PCI-device + /proc/modules dump + interactive shell on "no NIC after 60s". Replaces the narrow insmod-loop version that silently hung on unsupported NICs. - playbook/pxe_server_setup.yml "Build Blancco PXE initramfs" task now sweeps the full drivers/net/ tree (ethernet + phy + mdio + usb + fddi + wan) plus overlay / squashfs / loop / ptp / libphy / mii deps, runs depmod to regenerate modules.dep inside the initramfs (required for modprobe dependency resolution), and symlinks the full applet list blancco-init.sh needs (modprobe, insmod, dmesg, find, env, etc). Result: ~20 MB initramfs vs the old 2 MB narrow build. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 43 ++++---- playbook/blancco-init.sh | 194 +++++++++++++--------------------- playbook/pxe_server_setup.yml | 57 +++++++--- 3 files changed, 140 insertions(+), 154 deletions(-) diff --git a/.gitignore b/.gitignore index da9ec4e..11f1629 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,9 @@ offline-packages/ # Boot tool binaries (built by prepare-boot-tools.sh) boot-tools/ +# Track the Blancco GRUB config as source-of-truth for grubx64.efi rebuilds. +# prepare-boot-tools.sh rebuilds grubx64.efi from this file via grub-mkstandalone. +!boot-tools/blancco/grub-blancco.cfg # WinPE boot files (wimboot, boot.wim, BCD, ipxe.efi, etc.) boot-files/ @@ -54,23 +57,23 @@ secrets.md enrollment/ drivers-staging/ bios-staging/ -.claude/ - -# Secrets and credentials (defensive) -.env -.env.* -!.env.example -!.env.*.example -*.pem -*.key -id_rsa -id_rsa.* -*.ppk -*.p12 -*.pfx -secrets.json -secrets.yaml -secrets.yml -*_secret -*_secrets -credentials.json +.claude/ + +# Secrets and credentials (defensive) +.env +.env.* +!.env.example +!.env.*.example +*.pem +*.key +id_rsa +id_rsa.* +*.ppk +*.p12 +*.pfx +secrets.json +secrets.yaml +secrets.yml +*_secret +*_secrets +credentials.json diff --git a/playbook/blancco-init.sh b/playbook/blancco-init.sh index 1a4dcd7..d55605a 100644 --- a/playbook/blancco-init.sh +++ b/playbook/blancco-init.sh @@ -2,167 +2,125 @@ # Blancco PXE Loader - init script for custom initramfs # Boot chain: iPXE -> GRUB EFI -> Ubuntu kernel + this initramfs -> switch_root to Blancco # -# Blancco's own kernel freezes on Dell Precision towers during PXE boot. -# Workaround: boot Ubuntu kernel, download Blancco rootfs (squashfs), mount -# overlay filesystem, and switch_root into Blancco's userspace. +# Blancco's own kernel freezes / lacks NIC drivers for some Dell Precision +# hardware during PXE boot. Workaround: boot Ubuntu kernel (which has a wider +# NIC driver set), download Blancco rootfs (squashfs), overlay-mount, and +# switch_root into Blancco's userspace. +# +# Verbose trace + shell-on-NIC-failure because silent hangs during Blancco +# PXE boot are painful to debug. set -x goes to /dev/console so the screen +# shows every step; if no NIC appears after the modprobe sweep, we dump +# dmesg / lspci / /proc/modules and drop to sh so the operator can +# investigate without re-imaging. + +exec >/dev/console 2>&1 +set -x + +echo "" +echo "============================================" +echo " Blancco PXE loader (verbose)" +echo "============================================" +echo "" export PATH=/bin:/sbin - -echo "" -echo "============================================" -echo " Blancco PXE loader" -echo "============================================" -echo "" - mount -t proc proc /proc mount -t sysfs sysfs /sys -mount -t devtmpfs devtmpfs /dev 2>/dev/null -mkdir -p /tmp /run +mount -t devtmpfs devtmpfs /dev 2>/dev/null || mount -t tmpfs tmpfs /dev +mkdir -p /tmp /run /run/lower /run/upper /run/work /run/newroot +KVER=$(uname -r) +echo "Running kernel: $KVER" +ls /lib/modules/ 2>/dev/null -echo "[1/4] Loading NIC drivers..." -for mod in /lib/modules/*.ko; do - echo " insmod $(basename $mod)" - insmod $mod 2>/dev/null || true +echo "[1/5] Loading NIC drivers via modprobe (resolves deps automatically)..." +# Throw the full common-NIC driver list at the wall. modprobe resolves the +# deps from /lib/modules/$KVER/modules.dep (built by prepare-boot-tools). +# Anything missing is silently ignored; whatever matches PCI IDs will bind. +for drv in \ + mii libphy ptp \ + e1000 e1000e igb igc ixgbe ixgbevf i40e ice iavf \ + tg3 bnx2 bnx2x bnxt_en b44 \ + r8169 r8152 atlantic \ + vmxnet3 virtio_net virtio_pci \ + pcnet32 8139too 8139cp \ + sfc sfc_ef100 mlx4_en mlx5_core \ + alx atl1c atl1e atl2 \ + via_rhine via_velocity forcedeth \ + pegasus dm9601 asix ax88179_178a cdc_ether cdc_ncm rndis_host; do + modprobe -v "$drv" 2>/dev/null && echo " OK $drv" || true done -sleep 5 +sleep 3 -echo " Interfaces after driver load:" -ls /sys/class/net/ 2>/dev/null +echo "[2/5] /sys/class/net after driver load:" +ls /sys/class/net/ || true +ip link || true -echo " Waiting for network interface..." +echo " Waiting up to 60s for non-lo interface..." IFACE="" -COUNT=0 -while [ $COUNT -lt 60 ]; do - for i in /sys/class/net/*; do - ifname="${i##*/}" - if [ "$ifname" != "lo" ] && [ -d "$i" ]; then - IFACE=$ifname - break 2 - fi +for i in $(seq 1 60); do + for n in /sys/class/net/*; do + name="${n##*/}" + [ "$name" = "lo" ] && continue + [ -d "$n" ] && IFACE="$name" && break 2 done - COUNT=$((COUNT + 1)) sleep 1 echo -n "." done echo "" if [ -z "$IFACE" ]; then - echo "ERROR: No network interface found!" - echo "Available interfaces:" - ls /sys/class/net/ 2>/dev/null - exec sh + echo "ERROR: No network interface after 60s" + echo "=== dmesg tail ==="; dmesg | tail -40 + echo "=== PCI devices (sysfs) ==="; ls /sys/bus/pci/devices/ 2>/dev/null + echo "=== loaded modules ==="; cat /proc/modules + echo "Dropping to shell - type 'exit' to reboot." + exec /bin/sh fi -echo " Interface: $IFACE" -ip link set $IFACE up +echo " IFACE=$IFACE, bringing up..." +ip link set "$IFACE" up || ifconfig "$IFACE" up sleep 2 SERVER=10.9.100.1 -ifconfig $IFACE 10.9.100.250 netmask 255.255.255.0 up +ifconfig "$IFACE" 10.9.100.250 netmask 255.255.255.0 up sleep 1 -echo " IP: 10.9.100.250" +echo " IP: 10.9.100.250 SERVER: $SERVER" +ip addr -echo "[2/4] Downloading Blancco rootfs (666MB)..." +echo "[3/5] Downloading airootfs.sfs (~756 MB)..." wget -O /tmp/airootfs.sfs http://$SERVER/blancco/arch/x86_64/airootfs.sfs 2>&1 -if [ ! -s /tmp/airootfs.sfs ]; then - echo "ERROR: Failed to download rootfs!" - exec sh -fi -echo " OK ($(wc -c < /tmp/airootfs.sfs) bytes)" - -echo "[3/4] Mounting rootfs..." -mkdir -p /run/lower /run/upper /run/work /run/newroot +[ -s /tmp/airootfs.sfs ] || { echo "ERROR: download failed"; exec /bin/sh; } +echo "[4/5] Mounting rootfs + overlay..." +modprobe overlay 2>/dev/null || insmod /lib/modules/$KVER/kernel/fs/overlayfs/overlay.ko 2>/dev/null +modprobe squashfs 2>/dev/null || insmod /lib/modules/$KVER/kernel/fs/squashfs/squashfs.ko 2>/dev/null +modprobe loop 2>/dev/null losetup /dev/loop0 /tmp/airootfs.sfs mount -t squashfs -o ro /dev/loop0 /run/lower -if [ $? -ne 0 ]; then - echo "ERROR: squashfs mount failed!" - exec sh -fi - -insmod /lib/modules/overlay.ko 2>/dev/null mount -t tmpfs -o size=50% tmpfs /run/upper mkdir -p /run/upper/upper /run/upper/work - mount -t overlay overlay -o lowerdir=/run/lower,upperdir=/run/upper/upper,workdir=/run/upper/work /run/newroot -if [ $? -ne 0 ]; then - echo "ERROR: overlay mount failed!" - exec sh -fi -echo "[4/5] Installing kernel modules (132MB)..." +echo "[5/5] Fetching kmod tarball + config..." wget -O /tmp/kmod.tar.gz http://$SERVER/blancco/kmod.tar.gz 2>&1 -if [ -s /tmp/kmod.tar.gz ]; then - cd /run/newroot - gunzip -c /tmp/kmod.tar.gz | tar xf - - rm -f /tmp/kmod.tar.gz - cd / - echo " OK" -else - echo " WARNING: Failed to download kernel modules" -fi +[ -s /tmp/kmod.tar.gz ] && (cd /run/newroot && gunzip -c /tmp/kmod.tar.gz | tar xf - && rm -f /tmp/kmod.tar.gz) -echo "[5/6] Switching root to Blancco..." -mkdir -p /run/newroot/run /run/newroot/proc /run/newroot/sys /run/newroot/dev /run/newroot/tmp +mkdir -p /run/newroot/albus +wget -O /run/newroot/albus/config.xml http://$SERVER/blancco/config-clean.xml 2>&1 || true +wget -O /run/newroot/albus/preferences.xml http://$SERVER/blancco/preferences.xml 2>&1 || true +cp -f /run/newroot/albus/preferences.xml /run/newroot/albus/preferences.save 2>/dev/null || true -echo "[6/6] Downloading Blancco config..." -wget -O /run/newroot/albus/config.xml http://$SERVER/blancco/config-clean.xml 2>&1 -wget -O /run/newroot/albus/preferences.xml http://$SERVER/blancco/preferences.xml 2>&1 -if [ -s /run/newroot/albus/config.xml ]; then - echo " config.xml: $(wc -c < /run/newroot/albus/config.xml) bytes" -else - echo " WARNING: Failed to download config.xml" -fi -if [ -s /run/newroot/albus/preferences.xml ]; then - cp -f /run/newroot/albus/preferences.xml /run/newroot/albus/preferences.save - echo " preferences.xml: $(wc -c < /run/newroot/albus/preferences.xml) bytes" -else - echo " WARNING: Failed to download preferences.xml" -fi - -# Pre-configure X.org to use modesetting driver (generic KMS, works with all GPUs) mkdir -p /run/newroot/etc/X11/xorg.conf.d -echo " X.org: forcing modesetting driver" -cat > /run/newroot/etc/X11/xorg.conf.d/20-failsafeDriver.conf << 'XEOF' +cat > /run/newroot/etc/X11/xorg.conf.d/20-failsafeDriver.conf << XEOF Section "Device" Identifier "Failsafe Video Device" Driver "modesetting" EndSection XEOF -# Enable SSH for remote debugging -echo " Enabling SSH (root:blancco)..." -if [ -f /run/newroot/etc/ssh/sshd_config ]; then - sed 's/^#*PermitRootLogin.*/PermitRootLogin yes/' /run/newroot/etc/ssh/sshd_config > /run/newroot/etc/ssh/sshd_config.new || true - mv /run/newroot/etc/ssh/sshd_config.new /run/newroot/etc/ssh/sshd_config || true -fi - -cat > /run/newroot/etc/rc.local << 'RCEOF' -#!/bin/bash -echo 'root:blancco' | chpasswd -ssh-keygen -A 2>/dev/null -/usr/bin/sshd 2>/dev/null -RCEOF -chmod +x /run/newroot/etc/rc.local - -cat > /run/newroot/etc/systemd/system/pxe-debug.service << 'SVCEOF' -[Unit] -Description=PXE Debug SSH -After=network.target - -[Service] -Type=oneshot -RemainAfterExit=yes -ExecStart=/etc/rc.local - -[Install] -WantedBy=multi-user.target -SVCEOF -ln -sf /etc/systemd/system/pxe-debug.service /run/newroot/etc/systemd/system/multi-user.target.wants/pxe-debug.service 2>/dev/null - +mkdir -p /run/newroot/proc /run/newroot/sys /run/newroot/dev /run/newroot/run /run/newroot/tmp mount --move /proc /run/newroot/proc mount --move /sys /run/newroot/sys mount --move /dev /run/newroot/dev -echo " Starting Blancco..." +echo "Switching root..." exec switch_root /run/newroot /sbin/init diff --git a/playbook/pxe_server_setup.yml b/playbook/pxe_server_setup.yml index 1b3bc77..846938a 100644 --- a/playbook/pxe_server_setup.yml +++ b/playbook/pxe_server_setup.yml @@ -700,42 +700,66 @@ # Boot Ubuntu kernel, download Blancco rootfs, overlay mount, switch_root. - name: "Build Blancco PXE initramfs" + # The narrow hand-picked NIC driver list used before 2026-04-22 produced + # a 2 MB initramfs that hung on "waiting for network interface" for any + # hardware outside e1000e/igb/tg3/bnx2/bnxt_en/b44. This rewrite sweeps + # the full drivers/net/ tree (ethernet + phy + mdio + usb + fddi + wan) + # plus overlay / squashfs / loop / ptp / libphy / mii deps, runs depmod + # so blancco-init.sh can use modprobe with proper dependency resolution, + # and produces a ~20 MB initramfs. Size isn't a concern - HTTP loads it + # in under two seconds at gigabit, versus the many minutes of hangtime + # the narrow build cost us when a NIC was unsupported. args: executable: /bin/bash creates: "{{ web_root }}/blancco/kexec-initrd.img" shell: | set -e WORK=$(mktemp -d) - mkdir -p "$WORK"/{bin,lib/modules,lib64,sbin,usr/share/udhcpc} + KVER=$(uname -r) + mkdir -p "$WORK"/{bin,lib/modules/$KVER/kernel,lib64,sbin,usr/share/udhcpc,etc,run,proc,sys,dev} # Busybox (static) - bundled on USB at playbook/busybox-static if [ -f /bin/busybox ]; then - cp /bin/busybox "$WORK/bin/" + cp /bin/busybox "$WORK/bin/busybox" elif [ -f "{{ usb_root }}/playbook/busybox-static" ]; then cp "{{ usb_root }}/playbook/busybox-static" "$WORK/bin/busybox" - chmod +x "$WORK/bin/busybox" else echo "ERROR: No busybox available (not at /bin/busybox or on USB)" exit 1 fi - for cmd in sh awk cat chmod echo grep gunzip ifconfig ip ln losetup ls mkdir mknod mount reboot route sed sleep switch_root tar udhcpc umount wget cpio; do + chmod +x "$WORK/bin/busybox" + # All applets blancco-init.sh uses: modprobe, insmod, dmesg, find, env + # and export added vs the old narrow list. + for cmd in sh ash awk cat chmod cp dd echo grep gunzip ifconfig ip insmod ln losetup ls mkdir mknod modprobe mount mv reboot rm rmdir route sed sleep switch_root tar udhcpc umount wget cpio dmesg env export find; do ln -sf busybox "$WORK/bin/$cmd" done - # NIC drivers (common server NICs) - KVER=$(uname -r) - KMOD="/lib/modules/$KVER/kernel/drivers/net/ethernet" - for drv in intel/e1000e/e1000e.ko.zst intel/igb/igb.ko.zst broadcom/tg3.ko.zst broadcom/bnx2.ko.zst broadcom/bnxt/bnxt_en.ko.zst broadcom/b44.ko.zst; do - if [ -f "$KMOD/$drv" ]; then - zstd -d "$KMOD/$drv" -o "$WORK/lib/modules/$(basename ${drv%.zst})" 2>/dev/null + # Full drivers/net/ tree - ethernet + phy + mdio + usb + fddi + wan. + # Preserve path under /lib/modules/$KVER so depmod can resolve deps. + NETDIR=/lib/modules/$KVER/kernel/drivers/net + for sub in ethernet mdio phy usb fddi wan; do + if [ -d "$NETDIR/$sub" ]; then + mkdir -p "$WORK/lib/modules/$KVER/kernel/drivers/net/$sub" + cp -r "$NETDIR/$sub/"* "$WORK/lib/modules/$KVER/kernel/drivers/net/$sub/" 2>/dev/null || true + fi + done + # Overlay + squashfs + loop + usb + hid + ptp + mii + net/core deps + for modpath in fs/overlayfs fs/squashfs drivers/block drivers/usb/core drivers/usb/host drivers/hid drivers/ptp drivers/net/mii.ko net/core; do + if [ -e "/lib/modules/$KVER/kernel/$modpath" ]; then + mkdir -p "$WORK/lib/modules/$KVER/kernel/$(dirname $modpath)" + cp -r "/lib/modules/$KVER/kernel/$modpath" "$WORK/lib/modules/$KVER/kernel/$modpath" 2>/dev/null || true fi done - # Overlay module - OVMOD="/lib/modules/$KVER/kernel/fs/overlayfs/overlay.ko.zst" - if [ -f "$OVMOD" ]; then - zstd -d "$OVMOD" -o "$WORK/lib/modules/overlay.ko" 2>/dev/null - fi + # Decompress zstd modules in-place (busybox insmod can't handle .zst) + find "$WORK/lib/modules" -name '*.ko.zst' -print0 | \ + xargs -0 -I {} sh -c 'zstd -d --rm -o "${1%.zst}" "$1" 2>/dev/null' _ {} || true + + # Preserve modules.builtin / modules.order, regenerate modules.dep so + # modprobe can resolve dependencies inside the initramfs. + cp /lib/modules/$KVER/modules.builtin "$WORK/lib/modules/$KVER/" 2>/dev/null || true + cp /lib/modules/$KVER/modules.order "$WORK/lib/modules/$KVER/" 2>/dev/null || true + (cd "$WORK" && depmod -b . $KVER) 2>/dev/null || echo "depmod warning (non-fatal)" # Init script cp "{{ usb_root }}/playbook/blancco-init.sh" "$WORK/init" @@ -743,7 +767,8 @@ # Build CPIO cd "$WORK" - find . | cpio -o -H newc 2>/dev/null | gzip > "{{ web_root }}/blancco/kexec-initrd.img" + find . | cpio -o -H newc --quiet | gzip -1 > "{{ web_root }}/blancco/kexec-initrd.img" + cd / rm -rf "$WORK" echo "Built kexec-initrd.img: $(stat -c %s '{{ web_root }}/blancco/kexec-initrd.img') bytes" ignore_errors: yes