Files
cmi/perf_history.py
2025-12-18 16:54:46 -05:00

347 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Historical VM Performance Report
Pull performance stats from vCenter for the past month to identify patterns.
"""
import argparse
import configparser
import csv
import ssl
import sys
from datetime import datetime, timedelta
# Third-party dependency guard: pyvmomi provides the vSphere API bindings.
# Fail fast with an actionable install hint instead of a bare traceback.
try:
    from pyVim.connect import SmartConnect, Disconnect
    from pyVmomi import vim
except ImportError:
    print("Error: pyvmomi is required. Install with: pip install pyvmomi")
    sys.exit(1)
def connect_vcenter(server, username, password, port=443):
    """Open a vCenter session, exiting the process on failure.

    Certificate verification is disabled on purpose: internal vCenter
    appliances commonly run self-signed certificates.
    """
    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE
    try:
        return SmartConnect(
            host=server,
            user=username,
            pwd=password,
            port=port,
            sslContext=ctx,
        )
    except Exception as e:
        print(f"Error connecting: {e}")
        sys.exit(1)
def get_historical_intervals(perf_manager):
    """Return a mapping of sampling period (seconds) -> interval metadata."""
    return {
        iv.samplingPeriod: {
            'name': iv.name,
            'length': iv.length,
            'level': iv.level,
        }
        for iv in perf_manager.historicalInterval
    }
def get_counter_ids(perf_manager, metrics_needed):
    """Resolve dotted metric names ('group.name.rollup') to counter keys.

    Names that do not match any registered counter stay mapped to None.
    """
    resolved = dict.fromkeys(metrics_needed, None)
    for counter in perf_manager.perfCounter:
        dotted = f"{counter.groupInfo.key}.{counter.nameInfo.key}.{counter.rollupType}"
        if dotted in resolved:
            resolved[dotted] = counter.key
    return resolved
def get_vm_by_name(content, vm_name):
    """Case-insensitively locate a VM by name; return None if absent."""
    view = content.viewManager.CreateContainerView(
        content.rootFolder, [vim.VirtualMachine], True
    )
    wanted = vm_name.lower()
    found = None
    for candidate in view.view:
        if candidate.name.lower() == wanted:
            found = candidate
            break
    # Container views hold server-side resources; release explicitly.
    view.Destroy()
    return found
def get_historical_perf(si, entity, metric_ids, days=30):
    """Query vCenter for historical performance samples of one entity.

    Parameters
    ----------
    si : service instance returned by SmartConnect
    entity : managed entity (e.g. a VirtualMachine) to query
    metric_ids : dict of dotted metric name -> counter key (or None)
    days : look-back window in days; also selects the sampling interval

    Returns
    -------
    list of dicts, one per sample timestamp, each carrying 'timestamp',
    'interval', and one entry per resolved metric present in the sample.
    Returns [] when no metrics resolved or the query fails.
    """
    content = si.RetrieveContent()
    perf_manager = content.perfManager
    # Pick the coarsest interval that still gives useful resolution.
    # vCenter historical intervals: 300 (5min), 1800 (30min),
    # 7200 (2hr), 86400 (daily).
    if days <= 1:
        interval_id = 300   # 5-minute samples for last day
    elif days <= 7:
        interval_id = 1800  # 30-minute samples for last week
    else:
        interval_id = 7200  # 2-hour samples for longer periods
    # NOTE(review): naive local datetimes are sent as-is; if the vCenter
    # server clock/timezone differs, the window may be shifted — confirm.
    end_time = datetime.now()
    start_time = end_time - timedelta(days=days)
    # Reverse map (counter key -> metric name) built once, so each
    # returned value is labelled in O(1) instead of rescanning
    # metric_ids for every value of every sample.
    name_by_counter = {cid: name for name, cid in metric_ids.items() if cid}
    metric_id_objs = [
        vim.PerformanceManager.MetricId(counterId=cid, instance="")
        for cid in name_by_counter
    ]
    if not metric_id_objs:
        print("No valid metrics found")
        return []
    query_spec = vim.PerformanceManager.QuerySpec(
        entity=entity,
        metricId=metric_id_objs,
        intervalId=interval_id,
        startTime=start_time,
        endTime=end_time,
    )
    try:
        results = perf_manager.QueryPerf(querySpec=[query_spec])
    except Exception as e:
        print(f"Error querying performance: {e}")
        return []
    # Flatten the per-counter value arrays into one dict per timestamp.
    data = []
    for result in results or []:
        for i, sample_info in enumerate(result.sampleInfo):
            sample = {
                'timestamp': sample_info.timestamp,
                'interval': sample_info.interval,
            }
            for val in result.value:
                name = name_by_counter.get(val.id.counterId)
                # Guard: a counter's value array can be shorter than
                # the sampleInfo list.
                if name is not None and i < len(val.value):
                    sample[name] = val.value[i]
            data.append(sample)
    return data
def analyze_vm_history(si, vm_name, days=30):
    """Fetch, summarize, and print historical performance for one VM.

    Parameters
    ----------
    si : connected service instance
    vm_name : VM name to analyze (matched case-insensitively)
    days : look-back window in days

    Returns
    -------
    (data, stats) : raw sample list and per-metric min/max/avg summary.
        Both are empty when the VM is missing or no samples exist, so
        callers can always unpack the result safely.
    """
    content = si.RetrieveContent()
    perf_manager = content.perfManager
    vm = get_vm_by_name(content, vm_name)
    if not vm:
        print(f"VM '{vm_name}' not found")
        # BUG FIX: previously returned None, which raised TypeError in
        # callers that unpack the result into (data, stats).
        return [], {}
    print(f"\nAnalyzing historical performance for: {vm_name}")
    print(f"Period: Last {days} days")
    print("-" * 60)
    # cpu.ready.summation is collected (useful in the CSV export) even
    # though it is not summarized below.
    metrics = [
        'cpu.usage.average',
        'cpu.ready.summation',
        'mem.usage.average',
        'disk.read.average',
        'disk.write.average',
        'disk.totalReadLatency.average',
        'disk.totalWriteLatency.average',
        'disk.maxTotalLatency.latest',
        'net.received.average',
        'net.transmitted.average',
    ]
    metric_ids = get_counter_ids(perf_manager, metrics)
    data = get_historical_perf(si, vm, metric_ids, days)
    if not data:
        print("No historical data available")
        # Same fix as above: keep the return shape unpackable.
        return [], {}
    print(f"Retrieved {len(data)} samples")
    # Per-metric summary over every sample that actually carries the metric.
    stats = {}
    for metric in metrics:
        values = [d[metric] for d in data if metric in d]
        if values:
            stats[metric] = {
                'min': min(values),
                'max': max(values),
                'avg': sum(values) / len(values),
                'samples': len(values),
            }
    # Display results
    print("\n" + "=" * 60)
    print("PERFORMANCE STATISTICS")
    print("=" * 60)
    # NOTE(review): the /100 scaling assumes usage counters are reported
    # in hundredths of a percent — confirm against the counter's unitInfo.
    if 'cpu.usage.average' in stats:
        s = stats['cpu.usage.average']
        print(f"\nCPU Usage:")
        print(f" Average: {s['avg']/100:.1f}%")
        print(f" Maximum: {s['max']/100:.1f}%")
        if s['max']/100 > 80:
            print(f" ⚠️ CPU reached {s['max']/100:.1f}% - potential bottleneck")
    if 'mem.usage.average' in stats:
        s = stats['mem.usage.average']
        print(f"\nMemory Usage:")
        print(f" Average: {s['avg']/100:.1f}%")
        print(f" Maximum: {s['max']/100:.1f}%")
    if 'disk.read.average' in stats and 'disk.write.average' in stats:
        r = stats['disk.read.average']
        w = stats['disk.write.average']
        print(f"\nDisk I/O (KB/s):")
        print(f" Read - Avg: {r['avg']:.0f}, Max: {r['max']:.0f} ({r['max']/1024:.1f} MB/s)")
        print(f" Write - Avg: {w['avg']:.0f}, Max: {w['max']:.0f} ({w['max']/1024:.1f} MB/s)")
    if 'disk.totalReadLatency.average' in stats and 'disk.totalWriteLatency.average' in stats:
        rl = stats['disk.totalReadLatency.average']
        wl = stats['disk.totalWriteLatency.average']
        print(f"\nDisk Latency (ms):")
        print(f" Read - Avg: {rl['avg']:.1f}, Max: {rl['max']:.0f}")
        print(f" Write - Avg: {wl['avg']:.1f}, Max: {wl['max']:.0f}")
        if rl['max'] > 20 or wl['max'] > 20:
            print(f" ⚠️ High disk latency detected - storage may be bottleneck")
    if 'disk.maxTotalLatency.latest' in stats:
        s = stats['disk.maxTotalLatency.latest']
        print(f"\nPeak Disk Latency:")
        print(f" Average Peak: {s['avg']:.1f} ms")
        print(f" Maximum Peak: {s['max']:.0f} ms")
        if s['max'] > 50:
            print(f" ⚠️ SEVERE: Peak latency reached {s['max']} ms!")
    if 'net.received.average' in stats and 'net.transmitted.average' in stats:
        rx = stats['net.received.average']
        tx = stats['net.transmitted.average']
        print(f"\nNetwork I/O (KB/s):")
        print(f" RX - Avg: {rx['avg']:.0f}, Max: {rx['max']:.0f} ({rx['max']/1024:.1f} MB/s)")
        print(f" TX - Avg: {tx['avg']:.0f}, Max: {tx['max']:.0f} ({tx['max']/1024:.1f} MB/s)")
    # Summary: simple threshold heuristics over the collected maxima.
    print("\n" + "=" * 60)
    print("BOTTLENECK ANALYSIS")
    print("=" * 60)
    issues = []
    if 'cpu.usage.average' in stats and stats['cpu.usage.average']['max']/100 > 80:
        issues.append(f"CPU spiked to {stats['cpu.usage.average']['max']/100:.0f}%")
    if 'disk.maxTotalLatency.latest' in stats:
        # (removed unused avg_lat local from the original)
        max_lat = stats['disk.maxTotalLatency.latest']['max']
        if max_lat > 50:
            issues.append(f"Disk latency peaked at {max_lat:.0f}ms (severe)")
        elif max_lat > 20:
            issues.append(f"Disk latency peaked at {max_lat:.0f}ms (moderate)")
    if issues:
        print("\nPotential issues detected:")
        for issue in issues:
            print(f" ⚠️ {issue}")
    else:
        print("\n✓ No major VMware-side bottlenecks detected in historical data")
        print(" If backups are still slow, the issue is likely:")
        print(" - DATTO agent/MercuryFTP performance")
        print(" - DATTO appliance storage/CPU")
        print(" - Network between guest and DATTO (not VMware layer)")
    return data, stats
def export_to_csv(data, filename, vm_name):
    """Write the raw sample dicts to *filename* as CSV.

    Columns are the sorted union of keys across all samples, prefixed by
    a vm_name column; metrics missing from a sample are left blank.
    Does nothing when *data* is empty.
    """
    if not data:
        return
    # Samples may carry different metric sets, so take the key union.
    columns = set()
    for sample in data:
        columns.update(sample.keys())
    columns = sorted(columns)
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['vm_name'] + columns)
        for sample in data:
            writer.writerow([vm_name] + [sample.get(col, '') for col in columns])
    # BUG FIX: the confirmation message previously omitted the
    # destination path.
    print(f"\nData exported to: {filename}")
def main():
    """CLI entry point: parse args, connect, analyze, optionally export."""
    parser = argparse.ArgumentParser(description='Historical VM performance analysis')
    parser.add_argument('--config', '-c', help='Config file path')
    parser.add_argument('--server', '-s', help='vCenter server')
    parser.add_argument('--username', '-u', help='Username')
    parser.add_argument('--password', '-p', help='Password')
    parser.add_argument('--vm', '-v', required=True, help='VM name to analyze')
    parser.add_argument('--days', '-d', type=int, default=30, help='Number of days to analyze (default: 30)')
    parser.add_argument('--export', '-e', help='Export data to CSV file')
    args = parser.parse_args()
    # CLI flags take precedence; the config file fills whatever is missing.
    server = args.server
    username = args.username
    password = args.password
    if args.config:
        config = configparser.ConfigParser()
        config.read(args.config)
        if 'vcenter' in config:
            server = server or config.get('vcenter', 'server', fallback=None)
            username = username or config.get('vcenter', 'username', fallback=None)
            password = password or config.get('vcenter', 'password', fallback=None)
    if not all([server, username, password]):
        print("Error: server, username, and password required")
        sys.exit(1)
    print(f"Connecting to {server}...")
    si = connect_vcenter(server, username, password)
    try:
        # BUG FIX: analyze_vm_history can return None (VM missing / no
        # data); unconditionally unpacking it raised TypeError.
        result = analyze_vm_history(si, args.vm, args.days)
        if result:
            data, _stats = result
            if args.export and data:
                export_to_csv(data, args.export, args.vm)
    finally:
        # Always release the vCenter session, even on error.
        Disconnect(si)
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()