Files
cmi/perf_monitor.py
2025-12-18 16:54:46 -05:00

374 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Real-time VM Performance Monitor
Run this during a backup to identify bottlenecks (CPU, disk, network).
"""
import argparse
import configparser
import ssl
import sys
import time
from datetime import datetime
# pyVmomi is the script's only third-party dependency. When it is missing,
# print an install hint and exit with status 1 rather than show a traceback.
try:
    from pyVim.connect import SmartConnect, Disconnect
    from pyVmomi import vim
except ImportError:
    print("Error: pyvmomi is required. Install with: pip install pyvmomi")
    sys.exit(1)
def connect_vcenter(server, username, password, port=443):
    """Open a vCenter session and return the service instance.

    The SSL context handed to SmartConnect has hostname checking and
    certificate verification switched off. If SmartConnect raises, the error
    is printed and the process exits with status 1.
    """
    tls = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # check_hostname has to be cleared first; the ssl module rejects
    # CERT_NONE while hostname checking is still enabled.
    tls.check_hostname = False
    tls.verify_mode = ssl.CERT_NONE

    try:
        return SmartConnect(
            host=server,
            user=username,
            pwd=password,
            port=port,
            sslContext=tls,
        )
    except Exception as exc:
        print(f"Error connecting: {exc}")
        sys.exit(1)
def get_counter_ids(perf_manager):
    """Map the counters this script uses to their numeric counter keys.

    Every entry of ``perf_manager.perfCounter`` is turned into a
    ``group.name.rollup`` string; when that string is one of the wanted
    counters its ``key`` is recorded. Counters that are never seen keep the
    value ``None``.
    """
    wanted = (
        'cpu.usage.average',
        'cpu.ready.summation',
        'disk.read.average',
        'disk.write.average',
        'disk.totalReadLatency.average',
        'disk.totalWriteLatency.average',
        'disk.maxTotalLatency.latest',
        'net.received.average',
        'net.transmitted.average',
        'mem.usage.average',
    )
    resolved = dict.fromkeys(wanted)

    for info in perf_manager.perfCounter:
        dotted = "{}.{}.{}".format(
            info.groupInfo.key, info.nameInfo.key, info.rollupType
        )
        if dotted not in resolved:
            continue
        resolved[dotted] = info.key

    return resolved
def get_vm_perf(si, vm_name, metric_ids):
    """Get performance stats for a specific VM.

    Args:
        si: Connected service instance (the object returned by SmartConnect).
        vm_name: Name of the VM to query; compared case-insensitively.
        metric_ids: Mapping of counter full name (``group.name.rollup``) to
            its numeric counter key, or ``None`` for counters that could not
            be resolved.

    Returns:
        A dict with the keys ``cpu_pct``, ``cpu_ready_ms``, ``mem_pct``,
        ``disk_read_kbps``, ``disk_write_kbps``, ``disk_read_lat_ms``,
        ``disk_write_lat_ms``, ``disk_max_lat_ms``, ``net_rx_kbps`` and
        ``net_tx_kbps`` (each 0 when no sample came back), or ``None`` when
        the VM is not found or is not powered on (a message is printed).
    """
    content = si.RetrieveContent()
    perf_manager = content.perfManager

    container = content.viewManager.CreateContainerView(
        content.rootFolder, [vim.VirtualMachine], True
    )
    try:
        wanted_name = vm_name.lower()
        target_vm = next(
            (vm for vm in container.view if vm.name.lower() == wanted_name),
            None,
        )
    finally:
        # Release the server-side view even when the lookup itself raises.
        container.Destroy()

    if target_vm is None:
        print(f"VM '{vm_name}' not found")
        return None
    if target_vm.runtime.powerState != vim.VirtualMachinePowerState.poweredOn:
        print(f"VM '{vm_name}' is not powered on")
        return None

    def _pct(raw):
        # Percentage counters are reported in hundredths of a percent.
        return round(raw / 100, 1)

    def _ready(raw):
        # Summation over the 20 s sample, scaled by the sample length.
        # NOTE(review): this yields ms of ready time per second of the
        # interval; confirm that is the intended unit.
        return round(raw / 20, 1)

    # Counter full name -> (result key, converter or None for "use as is").
    fields = {
        'cpu.usage.average': ('cpu_pct', _pct),
        'cpu.ready.summation': ('cpu_ready_ms', _ready),
        'mem.usage.average': ('mem_pct', _pct),
        'disk.read.average': ('disk_read_kbps', None),
        'disk.write.average': ('disk_write_kbps', None),
        'disk.totalReadLatency.average': ('disk_read_lat_ms', None),
        'disk.totalWriteLatency.average': ('disk_write_lat_ms', None),
        'disk.maxTotalLatency.latest': ('disk_max_lat_ms', None),
        'net.received.average': ('net_rx_kbps', None),
        'net.transmitted.average': ('net_tx_kbps', None),
    }

    # Build the query and a reverse index (counter key -> name) in one pass.
    # Unresolved counters are None; compare against None so that a key of 0
    # would not be dropped by a truthiness test.
    metric_id_objs = []
    name_by_counter = {}
    for name, counter_id in metric_ids.items():
        if counter_id is None:
            continue
        name_by_counter.setdefault(counter_id, name)
        metric_id_objs.append(vim.PerformanceManager.MetricId(
            counterId=counter_id,
            instance=""  # empty instance asks for the aggregated value
        ))

    query_spec = vim.PerformanceManager.QuerySpec(
        entity=target_vm,
        metricId=metric_id_objs,
        intervalId=20,  # 20-second real-time sampling interval
        maxSample=1     # only the most recent sample
    )
    results = perf_manager.QueryPerf(querySpec=[query_spec])

    perf_data = {
        'cpu_pct': 0,
        'cpu_ready_ms': 0,
        'mem_pct': 0,
        'disk_read_kbps': 0,
        'disk_write_kbps': 0,
        'disk_read_lat_ms': 0,
        'disk_write_lat_ms': 0,
        'disk_max_lat_ms': 0,
        'net_rx_kbps': 0,
        'net_tx_kbps': 0,
    }

    for result in results or []:
        for series in result.value:
            entry = fields.get(name_by_counter.get(series.id.counterId))
            if entry is None:
                continue
            key, convert = entry
            sample = series.value[0] if series.value else 0
            perf_data[key] = convert(sample) if convert else sample

    return perf_data
def get_all_vms_perf(si, metric_ids):
    """Get performance stats for all powered-on VMs.

    Args:
        si: Connected service instance (the object returned by SmartConnect).
        metric_ids: Mapping of counter full name (``group.name.rollup``) to
            its numeric counter key, or ``None`` for unresolved counters.

    Returns:
        A list with one dict per powered-on VM (keys ``name``, ``cpu_pct``,
        ``mem_pct``, ``disk_read_mbps``, ``disk_write_mbps``, ``disk_lat_ms``,
        ``net_mbps``), sorted by ``disk_write_mbps`` in descending order.
        Collection is best effort: a VM whose stats cannot be read is
        reported on stderr and left out of the result.
    """
    content = si.RetrieveContent()
    perf_manager = content.perfManager

    def _pct(raw):
        # Percentage counters are reported in hundredths of a percent.
        return round(raw / 100, 1)

    def _mbps(kbps):
        return round(kbps / 1024, 1)

    # Counter full name -> (result key, converter or None for "use as is").
    fields = {
        'cpu.usage.average': ('cpu_pct', _pct),
        'mem.usage.average': ('mem_pct', _pct),
        'disk.read.average': ('disk_read_mbps', _mbps),
        'disk.write.average': ('disk_write_mbps', _mbps),
        'disk.maxTotalLatency.latest': ('disk_lat_ms', None),
    }
    net_counters = ('net.received.average', 'net.transmitted.average')

    # The metric list and reverse index are the same for every VM, so build
    # them once. Unresolved counters are None; compare against None so that
    # a key of 0 would not be dropped by a truthiness test.
    metric_id_objs = []
    name_by_counter = {}
    for name, counter_id in metric_ids.items():
        if counter_id is None:
            continue
        name_by_counter.setdefault(counter_id, name)
        metric_id_objs.append(vim.PerformanceManager.MetricId(
            counterId=counter_id,
            instance=""  # empty instance asks for the aggregated value
        ))

    container = content.viewManager.CreateContainerView(
        content.rootFolder, [vim.VirtualMachine], True
    )
    all_perf = []
    try:
        for vm in container.view:
            vm_label = "<unknown>"
            try:
                if vm.runtime.powerState != vim.VirtualMachinePowerState.poweredOn:
                    continue
                vm_label = vm.name

                query_spec = vim.PerformanceManager.QuerySpec(
                    entity=vm,
                    metricId=metric_id_objs,
                    intervalId=20,  # 20-second real-time sampling interval
                    maxSample=1     # only the most recent sample
                )
                results = perf_manager.QueryPerf(querySpec=[query_spec])

                perf_data = {
                    'name': vm_label,
                    'cpu_pct': 0,
                    'mem_pct': 0,
                    'disk_read_mbps': 0,
                    'disk_write_mbps': 0,
                    'disk_lat_ms': 0,
                    'net_mbps': 0,
                }
                net_kbps = 0
                for result in results or []:
                    for series in result.value:
                        name = name_by_counter.get(series.id.counterId)
                        sample = series.value[0] if series.value else 0
                        if name in net_counters:
                            # Sum RX and TX before rounding so low traffic
                            # in each direction does not round to zero.
                            net_kbps += sample
                            continue
                        entry = fields.get(name)
                        if entry is None:
                            continue
                        key, convert = entry
                        perf_data[key] = convert(sample) if convert else sample
                perf_data['net_mbps'] = _mbps(net_kbps)

                all_perf.append(perf_data)
            except Exception as exc:
                # Best effort: one unreadable VM must not abort the summary,
                # but the failure should be visible.
                print(
                    f"Warning: could not collect stats for VM {vm_label}: {exc}",
                    file=sys.stderr,
                )
    finally:
        # Release the server-side view even when iteration fails.
        container.Destroy()

    return sorted(all_perf, key=lambda x: x['disk_write_mbps'], reverse=True)
def format_bar(value, max_val, width=20):
    """Create ASCII progress bar.

    Args:
        value: Current value.
        max_val: Value that corresponds to a completely filled bar. When it
            is zero or negative the bar is drawn empty.
        width: Total bar length in characters.

    Returns:
        A string of ``width`` characters: ``#`` for the filled part followed
        by ``-`` for the remainder. The filled part is clamped to
        ``0..width`` so out-of-range values never change the bar length.
    """
    filled = int((value / max_val) * width) if max_val > 0 else 0
    # Clamp on both sides: values above max_val would overflow the bar and
    # negative values would otherwise make the empty part longer than width.
    filled = max(0, min(filled, width))
    # The glyphs were empty strings in the source, so the bar was always ''.
    # Plain ASCII is used here to match the docstring.
    return '#' * filled + '-' * (width - filled)
def monitor_vm(si, vm_name, interval=5):
    """Poll one VM and print a status line for each sample.

    Counter ids are resolved once up front. The loop then fetches the VM's
    stats, prints them on one line, and sleeps ``interval`` seconds. It ends
    when get_vm_perf returns a falsy result or on Ctrl+C.
    """
    counters = get_counter_ids(si.RetrieveContent().perfManager)

    print(f"\nMonitoring VM: {vm_name}")
    print("Press Ctrl+C to stop\n")
    print("-" * 100)

    try:
        while True:
            sample = get_vm_perf(si, vm_name, counters)
            if not sample:
                break

            clock = datetime.now().strftime('%H:%M:%S')

            # Flags shown next to values that suggest a bottleneck.
            cpu_flag = ""
            if sample['cpu_pct'] > 80:
                cpu_flag = "⚠️ "
            lat_flag = ""
            if sample['disk_max_lat_ms'] > 20:
                lat_flag = "⚠️ "

            segments = [
                f"\r{clock}",
                f"CPU: {cpu_flag}{sample['cpu_pct']:5.1f}%",
                f"Mem: {sample['mem_pct']:5.1f}%",
                f"Disk R: {sample['disk_read_kbps']:6} KB/s",
                f"Disk W: {sample['disk_write_kbps']:6} KB/s",
                f"Lat: {lat_flag}{sample['disk_max_lat_ms']:3}ms",
                f"Net RX: {sample['net_rx_kbps']:6} KB/s",
                f"Net TX: {sample['net_tx_kbps']:6} KB/s",
            ]
            print(" | ".join(segments), end='', flush=True)

            time.sleep(interval)
            print()  # finish the line so the next sample starts on a new one
    except KeyboardInterrupt:
        print("\n\nMonitoring stopped.")
def show_all_vms(si):
    """Print a one-shot table of performance figures for every VM returned
    by get_all_vms_perf, marking rows with CPU above 80% or latency above
    20 ms with an asterisk.
    """
    counters = get_counter_ids(si.RetrieveContent().perfManager)

    print("\nCollecting VM performance data...")
    rows = get_all_vms_perf(si, counters)

    rule = "=" * 100
    header_fmt = "{:<35} {:>6} {:>6} {:>8} {:>8} {:>6} {:>8}"
    row_fmt = (
        "{name:<35} {cpu:>5.1f}{cpu_mark} {mem:>6.1f} "
        "{disk_r:>8.1f} {disk_w:>8.1f} "
        "{lat:>5}{lat_mark} {net:>8.1f}"
    )

    print("\n" + rule)
    print(header_fmt.format('VM Name', 'CPU%', 'Mem%', 'DiskR', 'DiskW', 'Lat', 'Net'))
    print(header_fmt.format('', '', '', '(MB/s)', '(MB/s)', '(ms)', '(MB/s)'))
    print(rule)

    for row in rows:
        # An asterisk flags values above the bottleneck thresholds.
        print(row_fmt.format(
            name=row['name'],
            cpu=row['cpu_pct'],
            cpu_mark="*" if row['cpu_pct'] > 80 else " ",
            mem=row['mem_pct'],
            disk_r=row['disk_read_mbps'],
            disk_w=row['disk_write_mbps'],
            lat=row['disk_lat_ms'],
            lat_mark="*" if row['disk_lat_ms'] > 20 else " ",
            net=row['net_mbps'],
        ))

    print(rule)
    print("* = potential bottleneck (CPU > 80% or Latency > 20ms)")
def _print_vm_report(vm_name, perf):
    """Print the one-shot detail report and bottleneck analysis for one VM."""
    print(f"\nPerformance for {vm_name}:")
    print(f" CPU Usage: {perf['cpu_pct']}%")
    print(f" CPU Ready: {perf['cpu_ready_ms']} ms")
    print(f" Memory Usage: {perf['mem_pct']}%")
    print(f" Disk Read: {perf['disk_read_kbps']} KB/s ({perf['disk_read_kbps']/1024:.1f} MB/s)")
    print(f" Disk Write: {perf['disk_write_kbps']} KB/s ({perf['disk_write_kbps']/1024:.1f} MB/s)")
    print(f" Disk Read Lat: {perf['disk_read_lat_ms']} ms")
    print(f" Disk Write Lat: {perf['disk_write_lat_ms']} ms")
    print(f" Disk Max Lat: {perf['disk_max_lat_ms']} ms")
    print(f" Network RX: {perf['net_rx_kbps']} KB/s ({perf['net_rx_kbps']/1024:.1f} MB/s)")
    print(f" Network TX: {perf['net_tx_kbps']} KB/s ({perf['net_tx_kbps']/1024:.1f} MB/s)")

    # Analysis
    print("\n Analysis:")
    if perf['cpu_pct'] > 80:
        print(" ⚠️ HIGH CPU - VM may be CPU bottlenecked")
    if perf['disk_max_lat_ms'] > 20:
        print(" ⚠️ HIGH DISK LATENCY - Storage may be bottleneck")
    if perf['disk_max_lat_ms'] <= 20 and perf['cpu_pct'] <= 80:
        print(" ✓ No obvious VMware-side bottlenecks detected")


def main():
    """Command-line entry point.

    Connection settings come from the command line and, for any value not
    given there, from the ``[vcenter]`` section of the optional config file.
    With ``--vm`` a single VM is reported (continuously with ``--watch``);
    without it a summary table of all VMs is shown (refreshed with
    ``--watch``). The vCenter session is always disconnected on exit.
    """
    parser = argparse.ArgumentParser(description='Real-time VM performance monitor')
    parser.add_argument('--config', '-c', help='Config file path')
    parser.add_argument('--server', '-s', help='vCenter server')
    parser.add_argument('--username', '-u', help='Username')
    parser.add_argument('--password', '-p', help='Password')
    parser.add_argument('--vm', '-v', help='VM name to monitor (omit for all VMs summary)')
    parser.add_argument('--interval', '-i', type=int, default=5, help='Polling interval in seconds (default: 5)')
    parser.add_argument('--watch', '-w', action='store_true', help='Continuous monitoring mode')
    args = parser.parse_args()

    # A zero interval would poll vCenter in a tight loop, and a negative one
    # makes time.sleep raise; reject both up front with a usage error.
    if args.interval <= 0:
        parser.error("--interval must be a positive number of seconds")

    server = args.server
    username = args.username
    password = args.password

    if args.config:
        config = configparser.ConfigParser()
        try:
            loaded = config.read(args.config)
        except configparser.Error as exc:
            print(f"Error: could not parse config file '{args.config}': {exc}", file=sys.stderr)
            sys.exit(1)
        if not loaded:
            # ConfigParser.read() skips unreadable files silently; say so,
            # otherwise the later "required" error is misleading.
            print(f"Warning: config file '{args.config}' could not be read", file=sys.stderr)
        if 'vcenter' in config:
            # Command-line values take precedence over the config file.
            server = server or config.get('vcenter', 'server', fallback=None)
            username = username or config.get('vcenter', 'username', fallback=None)
            password = password or config.get('vcenter', 'password', fallback=None)

    if not all([server, username, password]):
        print("Error: server, username, and password required")
        sys.exit(1)

    print(f"Connecting to {server}...")
    si = connect_vcenter(server, username, password)
    try:
        if args.vm:
            if args.watch:
                monitor_vm(si, args.vm, args.interval)
            else:
                content = si.RetrieveContent()
                metric_ids = get_counter_ids(content.perfManager)
                perf = get_vm_perf(si, args.vm, metric_ids)
                if perf:
                    _print_vm_report(args.vm, perf)
        else:
            if args.watch:
                try:
                    while True:
                        print("\033[2J\033[H")  # Clear screen
                        show_all_vms(si)
                        print(f"\nRefreshing every {args.interval} seconds... (Ctrl+C to stop)")
                        time.sleep(args.interval)
                except KeyboardInterrupt:
                    print("\nStopped.")
            else:
                show_all_vms(si)
    finally:
        Disconnect(si)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()