374 lines
14 KiB
Python
374 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Real-time VM Performance Monitor
|
|
Run this during a backup to identify bottlenecks (CPU, disk, network).
|
|
"""
|
|
|
|
import argparse
|
|
import configparser
|
|
import ssl
|
|
import sys
|
|
import time
|
|
from datetime import datetime
|
|
|
|
try:
|
|
from pyVim.connect import SmartConnect, Disconnect
|
|
from pyVmomi import vim
|
|
except ImportError:
|
|
print("Error: pyvmomi is required. Install with: pip install pyvmomi")
|
|
sys.exit(1)
|
|
|
|
|
|
def connect_vcenter(server, username, password, port=443):
    """Connect to vCenter and return the service instance.

    Args:
        server: Host passed to ``SmartConnect`` as ``host``.
        username: Login user name.
        password: Login password.
        port: TCP port of the API endpoint (default 443).

    Returns:
        Whatever ``SmartConnect`` returns for the established session.

    If the connection attempt raises, the error is reported on stderr and
    the process exits with status 1.
    """
    # NOTE(security): host-name checking and certificate verification are
    # both switched off, presumably so that self-signed vCenter certificates
    # are accepted. The session is therefore not protected against
    # man-in-the-middle attacks.
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # check_hostname has to be disabled before verify_mode may be CERT_NONE.
    context.check_hostname = False
    context.verify_mode = ssl.CERT_NONE

    try:
        return SmartConnect(host=server, user=username, pwd=password, port=port, sslContext=context)
    except Exception as e:
        # Top-level boundary of a CLI tool: report on stderr (not stdout, so
        # piped output stays clean) and stop.
        print(f"Error connecting: {e}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
def get_counter_ids(perf_manager, metric_names=None):
    """Get performance counter IDs.

    Each counter exposed by ``perf_manager.perfCounter`` is named
    ``"<groupInfo.key>.<nameInfo.key>.<rollupType>"``; counters whose name
    is in the requested set have their ``key`` recorded.

    Args:
        perf_manager: Object with a ``perfCounter`` iterable of counter
            descriptions.
        metric_names: Optional iterable of counter names to resolve. When
            omitted, the default CPU / disk / network / memory set used by
            this tool is resolved.

    Returns:
        dict mapping every requested name to its counter key, or ``None``
        for names that no counter matched.
    """
    if metric_names is None:
        metric_names = (
            'cpu.usage.average',
            'cpu.ready.summation',
            'disk.read.average',
            'disk.write.average',
            'disk.totalReadLatency.average',
            'disk.totalWriteLatency.average',
            'disk.maxTotalLatency.latest',
            'net.received.average',
            'net.transmitted.average',
            'mem.usage.average',
        )

    # Start with every requested name unresolved.
    metric_ids = dict.fromkeys(metric_names)

    # `or ()` tolerates a missing/empty counter list instead of raising.
    for counter in perf_manager.perfCounter or ():
        full_name = f"{counter.groupInfo.key}.{counter.nameInfo.key}.{counter.rollupType}"
        if full_name in metric_ids:
            metric_ids[full_name] = counter.key

    return metric_ids
|
|
|
|
|
|
# Counter name -> (key in the dict returned by get_vm_perf, divisor).
# A divisor of None stores the raw sample unchanged; otherwise the sample is
# divided and rounded to one decimal.
#   * 100: the usage counters are presumably reported in hundredths of a
#     percent -- confirm against the vSphere counter reference.
#   * 20: NOTE(review) the original comment said "convert to ms per
#     interval"; dividing the summation by 20 (the same number used as
#     intervalId below) looks like ready-milliseconds per second -- confirm
#     the intended unit.
_VM_PERF_FIELDS = {
    'cpu.usage.average': ('cpu_pct', 100),
    'cpu.ready.summation': ('cpu_ready_ms', 20),
    'mem.usage.average': ('mem_pct', 100),
    'disk.read.average': ('disk_read_kbps', None),
    'disk.write.average': ('disk_write_kbps', None),
    'disk.totalReadLatency.average': ('disk_read_lat_ms', None),
    'disk.totalWriteLatency.average': ('disk_write_lat_ms', None),
    'disk.maxTotalLatency.latest': ('disk_max_lat_ms', None),
    'net.received.average': ('net_rx_kbps', None),
    'net.transmitted.average': ('net_tx_kbps', None),
}


def _find_vm_by_name(content, vm_name):
    """Return the first VM whose name equals *vm_name* ignoring case, else None."""
    container = content.viewManager.CreateContainerView(
        content.rootFolder, [vim.VirtualMachine], True
    )
    try:
        wanted = vm_name.lower()
        for vm in container.view:
            if vm.name.lower() == wanted:
                return vm
        return None
    finally:
        # Release the view even when iterating it raises.
        container.Destroy()


def get_vm_perf(si, vm_name, metric_ids):
    """Get performance stats for a specific VM.

    Args:
        si: Connected service instance; ``RetrieveContent()`` is called on it.
        vm_name: Name of the VM, matched case-insensitively.
        metric_ids: Mapping of counter name to counter ID (``None`` for
            unresolved counters), e.g. the result of ``get_counter_ids``.

    Returns:
        dict with the keys ``cpu_pct``, ``cpu_ready_ms``, ``mem_pct``,
        ``disk_read_kbps``, ``disk_write_kbps``, ``disk_read_lat_ms``,
        ``disk_write_lat_ms``, ``disk_max_lat_ms``, ``net_rx_kbps`` and
        ``net_tx_kbps`` (0 where no sample was returned), or ``None`` when
        the VM is not found or not powered on (a message is printed).
    """
    content = si.RetrieveContent()
    perf_manager = content.perfManager

    target_vm = _find_vm_by_name(content, vm_name)
    if not target_vm:
        print(f"VM '{vm_name}' not found")
        return None

    if target_vm.runtime.powerState != vim.VirtualMachinePowerState.poweredOn:
        print(f"VM '{vm_name}' is not powered on")
        return None

    # Reverse lookup counter ID -> name. Unresolved counters are skipped with
    # an explicit None test so that a counter whose ID is 0 is still queried.
    names_by_id = {}
    for name, counter_id in metric_ids.items():
        if counter_id is not None:
            names_by_id.setdefault(counter_id, name)

    # instance="" is passed for every counter; one sample is requested with
    # intervalId=20.
    query_spec = vim.PerformanceManager.QuerySpec(
        entity=target_vm,
        metricId=[
            vim.PerformanceManager.MetricId(counterId=counter_id, instance="")
            for counter_id in names_by_id
        ],
        intervalId=20,
        maxSample=1
    )

    results = perf_manager.QueryPerf(querySpec=[query_spec])

    perf_data = {
        'cpu_pct': 0,
        'cpu_ready_ms': 0,
        'mem_pct': 0,
        'disk_read_kbps': 0,
        'disk_write_kbps': 0,
        'disk_read_lat_ms': 0,
        'disk_write_lat_ms': 0,
        'disk_max_lat_ms': 0,
        'net_rx_kbps': 0,
        'net_tx_kbps': 0,
    }

    for result in results or ():
        for series in result.value:
            field = _VM_PERF_FIELDS.get(names_by_id.get(series.id.counterId))
            if field is None:
                # Counter we did not ask for, or one this report ignores.
                continue
            key, divisor = field
            value = series.value[0] if series.value else 0
            perf_data[key] = value if divisor is None else round(value / divisor, 1)

    return perf_data
|
|
|
|
|
|
def _query_vm_summary_row(perf_manager, vm, metric_id_objs, names_by_id):
    """Query one VM and build its row for the all-VMs summary."""
    query_spec = vim.PerformanceManager.QuerySpec(
        entity=vm,
        metricId=metric_id_objs,
        intervalId=20,
        maxSample=1
    )
    results = perf_manager.QueryPerf(querySpec=[query_spec])

    row = {
        'name': vm.name,
        'cpu_pct': 0,
        'mem_pct': 0,
        'disk_read_mbps': 0,
        'disk_write_mbps': 0,
        'disk_lat_ms': 0,
        'net_mbps': 0,
    }
    # Received + transmitted, kept unrounded until both are known so the
    # total is not distorted by rounding each direction separately.
    net_total = 0

    for result in results or ():
        for series in result.value:
            name = names_by_id.get(series.id.counterId)
            value = series.value[0] if series.value else 0

            if name == 'cpu.usage.average':
                row['cpu_pct'] = round(value / 100, 1)
            elif name == 'mem.usage.average':
                row['mem_pct'] = round(value / 100, 1)
            elif name == 'disk.read.average':
                row['disk_read_mbps'] = round(value / 1024, 1)
            elif name == 'disk.write.average':
                row['disk_write_mbps'] = round(value / 1024, 1)
            elif name == 'disk.maxTotalLatency.latest':
                row['disk_lat_ms'] = value
            elif name in ('net.received.average', 'net.transmitted.average'):
                net_total += value

    row['net_mbps'] = round(net_total / 1024, 1)
    return row


def get_all_vms_perf(si, metric_ids):
    """Get performance stats for all powered-on VMs.

    Args:
        si: Connected service instance; ``RetrieveContent()`` is called on it.
        metric_ids: Mapping of counter name to counter ID (``None`` for
            unresolved counters), e.g. the result of ``get_counter_ids``.

    Returns:
        List of dicts with the keys ``name``, ``cpu_pct``, ``mem_pct``,
        ``disk_read_mbps``, ``disk_write_mbps``, ``disk_lat_ms`` and
        ``net_mbps``, sorted by ``disk_write_mbps`` in descending order.
        VMs that are not powered on are omitted; a VM whose query fails is
        omitted too, with a warning on stderr.
    """
    content = si.RetrieveContent()
    perf_manager = content.perfManager

    # The metric list does not depend on the VM, so build it once. Unresolved
    # counters are skipped with an explicit None test so that a counter whose
    # ID is 0 is still queried.
    names_by_id = {}
    for name, counter_id in metric_ids.items():
        if counter_id is not None:
            names_by_id.setdefault(counter_id, name)
    metric_id_objs = [
        vim.PerformanceManager.MetricId(counterId=counter_id, instance="")
        for counter_id in names_by_id
    ]

    container = content.viewManager.CreateContainerView(
        content.rootFolder, [vim.VirtualMachine], True
    )

    all_perf = []
    try:
        for vm in container.view:
            if vm.runtime.powerState != vim.VirtualMachinePowerState.poweredOn:
                continue

            try:
                all_perf.append(
                    _query_vm_summary_row(perf_manager, vm, metric_id_objs, names_by_id)
                )
            except Exception as e:
                # Best effort: one failing VM must not abort the summary, but
                # the failure should be visible rather than silently dropped.
                print(f"Warning: skipping VM {vm}: {e}", file=sys.stderr)
    finally:
        # Release the view even when the loop raises.
        container.Destroy()

    return sorted(all_perf, key=lambda x: x['disk_write_mbps'], reverse=True)
|
|
|
|
|
|
def format_bar(value, max_val, width=20):
    """Create an ASCII progress bar of exactly *width* characters.

    Args:
        value: Current value.
        max_val: Value that corresponds to a full bar. A non-positive
            ``max_val`` yields an empty bar.
        width: Total number of characters in the bar (default 20).

    Returns:
        A string of filled blocks followed by light-shade blocks. The filled
        part is proportional to ``value / max_val`` and clamped to
        ``0..width``, so negative or over-range values never change the
        overall length.
    """
    filled = int((value / max_val) * width) if max_val > 0 else 0
    # Clamp on both sides: without the lower bound a negative value made the
    # empty part longer than `width`.
    filled = max(0, min(filled, width))
    return '█' * filled + '░' * (width - filled)
|
|
|
|
|
|
def monitor_vm(si, vm_name, interval=5):
    """Monitor a specific VM in real-time.

    Resolves the counter IDs once, then repeatedly samples the VM with
    ``get_vm_perf`` and prints one status line per sample, sleeping
    ``interval`` seconds between samples. The loop ends when
    ``get_vm_perf`` returns a falsy result or on Ctrl+C.

    Args:
        si: Connected service instance.
        vm_name: Name of the VM to watch.
        interval: Seconds to sleep between samples (default 5).
    """
    counter_ids = get_counter_ids(si.RetrieveContent().perfManager)

    print(f"\nMonitoring VM: {vm_name}")
    print("Press Ctrl+C to stop\n")
    print("-" * 100)

    try:
        while True:
            sample = get_vm_perf(si, vm_name, counter_ids)
            if not sample:
                # No data for this VM; stop polling.
                break

            now = datetime.now().strftime('%H:%M:%S')

            # Warning markers for the two bottleneck indicators.
            cpu_flag = "⚠️ " if sample['cpu_pct'] > 80 else ""
            lat_flag = "⚠️ " if sample['disk_max_lat_ms'] > 20 else ""

            columns = [
                f"\r{now}",
                f"CPU: {cpu_flag}{sample['cpu_pct']:5.1f}%",
                f"Mem: {sample['mem_pct']:5.1f}%",
                f"Disk R: {sample['disk_read_kbps']:6} KB/s",
                f"Disk W: {sample['disk_write_kbps']:6} KB/s",
                f"Lat: {lat_flag}{sample['disk_max_lat_ms']:3}ms",
                f"Net RX: {sample['net_rx_kbps']:6} KB/s",
                f"Net TX: {sample['net_tx_kbps']:6} KB/s",
            ]
            print(" | ".join(columns), end='', flush=True)

            time.sleep(interval)
            # Terminate the status line so the next sample starts a new one.
            print()

    except KeyboardInterrupt:
        print("\n\nMonitoring stopped.")
|
|
|
|
|
|
def show_all_vms(si):
    """Show performance summary for all VMs.

    Resolves the counter IDs, collects one row per VM through
    ``get_all_vms_perf`` and prints them as a fixed-width table. Rows whose
    CPU usage exceeds 80 or whose disk latency exceeds 20 are flagged
    with ``*``.

    Args:
        si: Connected service instance.
    """
    counter_ids = get_counter_ids(si.RetrieveContent().perfManager)

    print("\nCollecting VM performance data...")
    rows = get_all_vms_perf(si, counter_ids)

    rule = "=" * 100

    print("\n" + rule)
    print(f"{'VM Name':<35} {'CPU%':>6} {'Mem%':>6} {'DiskR':>8} {'DiskW':>8} {'Lat':>6} {'Net':>8}")
    print(f"{'':<35} {'':>6} {'':>6} {'(MB/s)':>8} {'(MB/s)':>8} {'(ms)':>6} {'(MB/s)':>8}")
    print(rule)

    for row in rows:
        # A trailing "*" marks a value above its bottleneck threshold.
        cpu_mark = " "
        if row['cpu_pct'] > 80:
            cpu_mark = "*"
        lat_mark = " "
        if row['disk_lat_ms'] > 20:
            lat_mark = "*"

        print(f"{row['name']:<35} {row['cpu_pct']:>5.1f}{cpu_mark} {row['mem_pct']:>6.1f} "
              f"{row['disk_read_mbps']:>8.1f} {row['disk_write_mbps']:>8.1f} "
              f"{row['disk_lat_ms']:>5}{lat_mark} {row['net_mbps']:>8.1f}")

    print(rule)
    print("* = potential bottleneck (CPU > 80% or Latency > 20ms)")
|
|
|
|
|
|
def _print_vm_report(vm_name, perf):
    """Print the one-shot metric report and bottleneck analysis for one VM."""
    print(f"\nPerformance for {vm_name}:")
    print(f" CPU Usage: {perf['cpu_pct']}%")
    print(f" CPU Ready: {perf['cpu_ready_ms']} ms")
    print(f" Memory Usage: {perf['mem_pct']}%")
    print(f" Disk Read: {perf['disk_read_kbps']} KB/s ({perf['disk_read_kbps']/1024:.1f} MB/s)")
    print(f" Disk Write: {perf['disk_write_kbps']} KB/s ({perf['disk_write_kbps']/1024:.1f} MB/s)")
    print(f" Disk Read Lat: {perf['disk_read_lat_ms']} ms")
    print(f" Disk Write Lat: {perf['disk_write_lat_ms']} ms")
    print(f" Disk Max Lat: {perf['disk_max_lat_ms']} ms")
    print(f" Network RX: {perf['net_rx_kbps']} KB/s ({perf['net_rx_kbps']/1024:.1f} MB/s)")
    print(f" Network TX: {perf['net_tx_kbps']} KB/s ({perf['net_tx_kbps']/1024:.1f} MB/s)")

    # Analysis
    print("\n Analysis:")
    if perf['cpu_pct'] > 80:
        print(" ⚠️ HIGH CPU - VM may be CPU bottlenecked")
    if perf['disk_max_lat_ms'] > 20:
        print(" ⚠️ HIGH DISK LATENCY - Storage may be bottleneck")
    if perf['disk_max_lat_ms'] <= 20 and perf['cpu_pct'] <= 80:
        print(" ✓ No obvious VMware-side bottlenecks detected")


def main():
    """Command-line entry point.

    Parses the arguments, resolves the vCenter credentials (command-line
    values win over the ``[vcenter]`` section of ``--config``), connects,
    and then runs one of four modes:

    * ``--vm NAME --watch``: continuous single-VM monitor.
    * ``--vm NAME``: one-shot report for that VM.
    * ``--watch``: all-VMs summary, refreshed every ``--interval`` seconds.
    * neither: all-VMs summary, printed once.

    Exits with status 2 for an invalid ``--interval`` and with status 1
    when server, username or password is missing. The session is always
    disconnected on the way out.
    """
    parser = argparse.ArgumentParser(description='Real-time VM performance monitor')
    parser.add_argument('--config', '-c', help='Config file path')
    parser.add_argument('--server', '-s', help='vCenter server')
    parser.add_argument('--username', '-u', help='Username')
    parser.add_argument('--password', '-p', help='Password')
    parser.add_argument('--vm', '-v', help='VM name to monitor (omit for all VMs summary)')
    parser.add_argument('--interval', '-i', type=int, default=5, help='Polling interval in seconds (default: 5)')
    parser.add_argument('--watch', '-w', action='store_true', help='Continuous monitoring mode')

    args = parser.parse_args()

    # 0 would poll vCenter in a tight loop and a negative value makes
    # time.sleep() raise, so reject both before connecting.
    if args.interval <= 0:
        parser.error("--interval must be a positive number of seconds")

    server = args.server
    username = args.username
    password = args.password

    if args.config:
        config = configparser.ConfigParser()
        # read() returns the list of files it could parse; an empty list
        # means the path was missing or unreadable, which would otherwise
        # surface only as a confusing "credentials required" error.
        if not config.read(args.config):
            print(f"Warning: could not read config file '{args.config}'", file=sys.stderr)
        if 'vcenter' in config:
            server = server or config.get('vcenter', 'server', fallback=None)
            username = username or config.get('vcenter', 'username', fallback=None)
            password = password or config.get('vcenter', 'password', fallback=None)

    if not all([server, username, password]):
        print("Error: server, username, and password required", file=sys.stderr)
        sys.exit(1)

    print(f"Connecting to {server}...")
    si = connect_vcenter(server, username, password)

    try:
        if args.vm:
            if args.watch:
                monitor_vm(si, args.vm, args.interval)
            else:
                content = si.RetrieveContent()
                metric_ids = get_counter_ids(content.perfManager)
                perf = get_vm_perf(si, args.vm, metric_ids)
                if perf:
                    _print_vm_report(args.vm, perf)
        elif args.watch:
            try:
                while True:
                    print("\033[2J\033[H")  # Clear screen
                    show_all_vms(si)
                    print(f"\nRefreshing every {args.interval} seconds... (Ctrl+C to stop)")
                    time.sleep(args.interval)
            except KeyboardInterrupt:
                print("\nStopped.")
        else:
            show_all_vms(si)

    finally:
        Disconnect(si)
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|