File: //usr/local/bin/zabbix/ceph-osd-stats.py
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import commands
import json
import math
import os
import subprocess
import sys
# This script returns a few useful metrics from `ceph osd df`, including
# a list of OSDs that are outside an acceptable range of utilization.
# The result is printed to stdout as a single JSON object.

# Command used to query the cluster; `timeout` bounds how long we wait.
CEPH_OSD_DF_COMMAND = [
    "timeout", "10", "ceph", "osd", "df", "-f", "json-pretty",
]

# OSDs whose "var" is at or below this value are ignored: they hold only a
# little bit of data (probably marked out).
MIN_VAR_CONSIDERED = .01


def fetch_osd_df():
    """Run `ceph osd df` and return its parsed JSON output.

    The command's stderr is discarded; only stdout is parsed.

    Raises:
        OSError: if the command could not be started.
        ValueError: if the output is not valid JSON (for example because
            the command failed or timed out and printed nothing).
    """
    with open(os.devnull, "w") as devnull:
        proc = subprocess.Popen(
            CEPH_OSD_DF_COMMAND, stdout=subprocess.PIPE, stderr=devnull
        )
        raw_output, _ = proc.communicate()
    try:
        # Decode explicitly so the same code parses bytes and str output.
        return json.loads(raw_output.decode("utf-8"))
    except ValueError as err:
        # Include the exit status: an empty or partial document is usually
        # the symptom of the command itself failing.
        raise ValueError(
            "could not parse `ceph osd df` output (exit status %s): %s"
            % (proc.returncode, err)
        )


def tolerated_spread(average_utilization):
    """Return the tolerated divergence from average for a utilization.

    Args:
        average_utilization: the "average_utilization" value from the
            summary; it is divided by 100 before use, so presumably a
            percentage.

    Returns:
        The largest |1 - var| an OSD may have before it counts as skewed.
    """
    # This carefully selected linear function returns "good enough" values
    # that care more about OSD utilization disparity as the cluster fills
    # up. Dividing by 100.0 keeps the division in floating point even if
    # the utilization arrives as an integer.
    # NOTE(review): because of the absolute value, the result rises again
    # once utilization passes roughly 90% (0.35 / 0.39) - confirm intended.
    return math.fabs(-0.39 * (average_utilization / 100.0) + 0.35)


def find_skewed_osds(nodes, spread_tolerated):
    """Return the OSDs whose utilization diverges too far from average.

    Args:
        nodes: iterable of OSD dicts carrying "id", "var" and "reweight".
        spread_tolerated: maximum accepted value of |1 - var|.

    Returns:
        A list of {"id", "var", "reweight"} dicts, in input order, for
        every OSD outside the tolerated spread that also holds more than
        just a little bit of data.
    """
    return [
        {"id": osd["id"], "var": osd["var"], "reweight": osd["reweight"]}
        for osd in nodes
        # Absolute divergence from the average utilization.
        if math.fabs(1 - osd["var"]) > spread_tolerated
        and osd["var"] > MIN_VAR_CONSIDERED
    ]


def build_report(osd_df):
    """Build the dict that this script prints as JSON.

    Args:
        osd_df: parsed `ceph osd df` output; must provide
            osd_df["summary"]["average_utilization"] and osd_df["nodes"].

    Returns:
        A dict with the keys "skewed_osds", "average_osd_utilization" and
        "osd_spread_tolerated".

    Raises:
        KeyError: if an expected key is missing from osd_df.
    """
    average_utilization = osd_df["summary"]["average_utilization"]
    spread_tolerated = tolerated_spread(average_utilization)
    return {
        "skewed_osds": find_skewed_osds(osd_df["nodes"], spread_tolerated),
        "average_osd_utilization": average_utilization,
        "osd_spread_tolerated": spread_tolerated,
    }


def main():
    """Print the OSD report as JSON; return the process exit status."""
    try:
        osd_df = fetch_osd_df()
    except (OSError, ValueError) as err:
        # Keep stdout empty on failure so no partial data is reported.
        sys.stderr.write("ceph-osd-stats: %s\n" % err)
        return 1
    print(json.dumps(build_report(osd_df)))
    return 0


if __name__ == "__main__":
    sys.exit(main())