首先,编写 Python 的 exporter 需要知道 Prometheus 提供 4 种类型的 metrics,
分别是:Counter、Gauge、Summary 和 Histogram。
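我们需要的pip模块:prometheus_client 和 requests(`pip install prometheus_client requests`);subprocess、json、time 属于 Python 标准库,无需安装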
代码思路实例
def push_yarn():
    """Run one collection round for every YARN-related metric.

    The collectors run in a fixed order: ``yarn_zkrmapproot`` (the zk_rm
    check) first, then ``yarn_appsinfo`` (YARN application information).
    """
    collectors = (
        yarn_zkrmapproot,  # monitor zk_rm
        yarn_appsinfo,     # monitor YARN application information
    )
    for collect in collectors:
        collect()
def run():
    """Start the metrics HTTP endpoint and keep refreshing the metrics.

    Serves the metrics on port 8006, then calls ``push_yarn()`` in an
    endless loop, sleeping 10 seconds between rounds.  The loop only ends
    when the process is stopped or a collector raises.
    """
    start_http_server(8006)  # expose the metrics on port 8006
    # ``True`` must be capitalised: a lowercase ``true`` is an undefined name
    # in Python and raises NameError before the first collection round.
    while True:
        push_yarn()
        time.sleep(10)
# Script entry point: start the exporter only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run()
push_yarn() 负责采集需要监控的数据;
run() 中每隔 10 秒循环调用一次 push_yarn(),不断拿取最新数据并更新指标
我们使用 Gauge 实例
注意⚠️:Gauge 与 Counter 类似,唯一不同的是 Gauge 的数值既可以增加也可以减少,常被用于温度、利用率等指标。
新增 Gauge 实例
# The metric class exported by prometheus_client is ``Gauge`` (capital G);
# a lowercase ``gauge`` is an undefined name.
from prometheus_client import Gauge

# Value reported by yarn_zkrmapproot(), one series per 'instance' label.
yarn_zkrmapproot_code = Gauge('yarn_zkrmapproot', 'yarn_zkrmapproot_num', ['instance'])

# Per-application values filled in by yarn_appsinfo(); each gauge carries
# one series per 'application' label.
started_time_gauge = Gauge('yarn_started_time', 'started_time', ['application'])
launch_time_gauge = Gauge('yarn_launch_time', 'launch_time', ['application'])
finished_time_gauge = Gauge('yarn_finished_time', 'finished_time', ['application'])
memory_seconds_gauge = Gauge('yarn_memory_seconds', 'memory_seconds', ['application'])
vcore_seconds_gauge = Gauge('yarn_vcore_seconds', 'vcore_seconds', ['application'])
实现一下我们要监控的指标
# --------yarn-------- #####
def yarn_zkrmapproot():
    """Export the number of entries listed under the RM app root in ZooKeeper.

    Pipes ``ls /rmstore/ZKRMStateRoot/RMAppRoot`` into ``zkCli.sh``, keeps the
    output line containing ``application_`` and lets awk print its number of
    comma-separated fields (``NF``).  That number is stored in
    ``yarn_zkrmapproot_code`` under the ``'yarn_' + hostname`` instance
    label.  If the command exits non-zero, or prints something that is not
    an integer, a message is printed and the gauge is left untouched.
    """
    # awk needs the upper-case ``-F`` (field separator) and ``NF`` (field
    # count): ``-f ,`` would try to load an awk program from a file named
    # ``,`` and ``nf`` is just an unset variable.  The script name
    # ``zkCli.sh`` and the znode path are case-sensitive as well.
    list_children = (
        "echo 'ls /rmstore/ZKRMStateRoot/RMAppRoot' | "
        "/opt/dtstack/dtbase/zookeeper/bin/zkCli.sh | "
        "grep application_ | awk -F , '{print NF}'"
    )

    # NOTE(review): the JAAS/krb5 flags are only exported when
    # kerberos_switch is falsy, which looks inverted -- confirm what
    # kerberos_switch means before changing the branch order.
    if kerberos_switch:
        command = list_children
    else:
        # zkCli.sh reads extra JVM options from CLIENT_JVMFLAGS, and JVM
        # system properties are passed with an upper-case ``-D``.
        # NOTE(review): Kerberos realms are case-sensitive and usually
        # upper-case; confirm that ``dtstack.com`` is the intended realm.
        jvm_flags = (
            "-Djava.security.auth.login.config=/opt/dtstack/dtbase/zookeeper/conf/jaas.conf "
            "-Djava.security.krb5.conf=/opt/dtstack/kerberos/kerberos_pkg/conf/krb5.conf "
            f"-Dzookeeper.server.principal=zookeeper/{hostname}@dtstack.com"
        )
        command = (
            f'export CLIENT_JVMFLAGS="$CLIENT_JVMFLAGS {jvm_flags}"\n'
            f'{list_children}'
        )

    # Run the pipeline through the shell; the result looks like (0, '455').
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        print(f"failed to execute command: {command}")
        return

    # The exit status is that of the last pipeline stage (awk), so a failing
    # zkCli.sh can still end in status 0 with empty or non-numeric output.
    try:
        znode_count = int(output.strip())
    except ValueError:
        print(f"unexpected output {output!r} from command: {command}")
        return

    yarn_zkrmapproot_code.labels('yarn_' + hostname).set(znode_count)
def yarn_appsinfo(min_memory_seconds=102400):
    """Export timing and resource gauges for memory-heavy YARN applications.

    Asks ``yarn rmadmin`` for the HA state of ``rm1`` and queries
    ``yarn_rm1`` when it reports ``active``, otherwise ``yarn_rm2``.  The
    ``/ws/v1/cluster/apps`` response is filtered to applications whose
    ``memorySeconds`` exceeds ``min_memory_seconds``, and the five
    per-application gauges are set for each of them.

    Args:
        min_memory_seconds: Lower bound (exclusive) on ``memorySeconds``.
            NOTE(review): the original comment calls the default "tasks
            larger than 10g", while the YARN REST API documents
            ``memorySeconds`` as megabyte-seconds -- confirm the intended
            threshold.

    Failures (rmadmin exits non-zero, the HTTP request fails, the body is
    not JSON) are printed and the gauges are left untouched.
    """
    # Hadoop CLI options are case-sensitive: the flag is -getServiceState.
    command = "yarn rmadmin -getServiceState rm1"
    apps_url = "http://{}/ws/v1/cluster/apps"

    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        # Previously this path fell through with rm_host unbound.
        print(f"failed to execute command: {command}")
        return

    # getstatusoutput() merges stderr into the output, so warnings may
    # precede the state; the state itself is read from the last line.
    state_lines = output.strip().splitlines()
    rm1_state = state_lines[-1].strip() if state_lines else ''
    rm_host = yarn_rm1 if rm1_state == 'active' else yarn_rm2

    url = apps_url.format(rm_host)
    try:
        # The timeout keeps a stalled ResourceManager from blocking the
        # collection loop forever.
        response = requests.get(url=url, timeout=10)
        data = json.loads(response.text)
    except (requests.RequestException, ValueError) as err:
        print(f"failed to fetch {url}: {err}")
        return

    # NOTE: label values are never removed here, so every application id
    # that was exported once stays in the gauges for the process lifetime.
    for (application, started_time, launch_time, finished_time,
         memory_seconds, vcore_seconds) in _large_yarn_apps(data, min_memory_seconds):
        started_time_gauge.labels(application=application).set(started_time)
        launch_time_gauge.labels(application=application).set(launch_time)
        finished_time_gauge.labels(application=application).set(finished_time)
        memory_seconds_gauge.labels(application=application).set(memory_seconds)
        vcore_seconds_gauge.labels(application=application).set(vcore_seconds)


def _large_yarn_apps(payload, min_memory_seconds):
    """Return rows for the apps in *payload* above the memory threshold.

    Each row is ``(id, startedTime, launchTime, finishedTime, memorySeconds,
    vcoreSeconds)``; rows are sorted by ``(memorySeconds, vcoreSeconds)``.
    A payload without applications (``{"apps": null}``) yields ``[]``.
    """
    # Field names follow the camelCase spelling of the YARN REST API.
    apps = ((payload or {}).get('apps') or {}).get('app') or []
    rows = [
        (
            app['id'],
            app['startedTime'],
            app['launchTime'],
            app['finishedTime'],
            app['memorySeconds'],
            app['vcoreSeconds'],
        )
        for app in apps
        if app['memorySeconds'] > min_memory_seconds
    ]
    rows.sort(key=lambda row: (row[4], row[5]))
    return rows
其中 yarn_zkrmapproot 用于检测 ZooKeeper 中 RMAppRoot 下 znode 的数量;
yarn_appsinfo 用于采集占用内存较大的任务(代码中的条件是 memorySeconds 大于 102400,单位为 MB·秒)
把脚本传到服务器上并启动这个 exporter(监听 8006 端口),
然后把它加入 Prometheus 的抓取配置中,就可以采集到这些指标了
发表评论