Python实现数据库表的监控警告的项目实践_Python

简介

使用python 实现对数据库表的监控告警功能, 并将告警信息通过钉钉机器人发送到钉钉群

实现dataworks中数据质量的基本功能, 当然 dw的数据质量的规则类型很多, 用起来比较方便, 这里只简单实现了其中两个规则类型的功能, 仅供参考;

初次使用python, 请多指教

使用工具: maxcompute

1. 创建表

1. tmp_monitor_tbl_info

create table if not exists puture_bigdata.tmp_monitor_tbl_info (
      `id`					string comment '表编号id'
	, `tbl_name`			string comment '表名'
	, `pt_format`			string comment '分区格式: yyyy-mm-dd,yyyymmdd 等'
	, `val_type`			string comment '值类型: 表行数,周期值等'
    , `monitor_flag` 		int comment '监控标识: 0:不监控, 1:监控;'
    , `rule_code` 			int comment '规则编码: 1:表行数，上周期差值, 2:表行数，固定值 等'
    , `rule_type`			string comment '规则类型: 表行数，上周期差值; 表行数，固定值; 与固定值比较 等'
    , `expect_val` 			int comment '期望值'
    , `tbl_sort_code`       int comment '表类型编码: 0:其它(维表类), 1:亚马逊, 2:中小平台, 3:市场数据 等'
    , `tbl_sort_name`       string comment '表类型名字: 0:其它(维表类), 1:亚马逊, 2:中小平台, 3:市场数据 等'
    , `pt_num`				int comment '分区日期差值'
) comment '数据监控表信息' 
tblproperties ("transactional"="true") 
;

-- 插入数据
insert into table puture_bigdata_dev.tmp_monitor_tbl_info
select * from (
  values  (1 , 'ods_amazon_amz_customer_returns_df',              'yyyymmdd', '表行数', 1, 1, '表行数，上周期差值', 0,      1, '亚马逊' , -1)     
        , (2 , 'ods_amazon_amz_flat_file_all_orders_df',          'yyyymmdd', '表行数', 1, 1, '表行数，上周期差值', 0,      1, '亚马逊' , -1)         
        , (3 , 'dim_sys_salesman_info_df',                        'yyyymmdd', '表行数', 1, 1, '表行数，上周期差值', 0,      0, '其它' , -1)  
) as table_name(id, tbl_name, pt_format, val_type, monitor_flag, rule_code, rule_type, expect_val, tbl_sort_code, tbl_sort_name, pt_num) ;

2. tmp_monitor_tbl_info_log_di

create table if not exists puture_bigdata_dev.tmp_monitor_tbl_info_log_di (
	  `id`					string comment '监控id编码:md5(表名_分区)_小时'
	, `tbl_name`			string comment '表名'
	, `stat_time`			string comment '统计时间'
	, `pt_format`			string comment '分区格式: yyyy-mm-dd,yyyymmdd 等'
	, `stat_pt`				string comment '统计分区'
	, `val_type`			string comment '值类型: 表行数,周期值等'
    , `val` 				int comment '统计值'
    , `rule_code` 			int comment '规则编码: 1:表行数，上周期差值, 2:表行数，固定值 等'
    , `rule_type`			string comment '规则类型: 表行数，上周期差值; 表行数，固定值; 与固定值比较 等'
    , `expect_val` 			int comment '期望值'
    , `is_exc` 				int comment '是否异常: 0:否,1:是,默认值0'
    , `tbl_sort_code`       int comment '表类型编码: 0:其它(维表类), 1:亚马逊, 2:中小平台, 3:市场数据 等'
    , `tbl_sort_name`       string comment '表类型名字: 0:其它(维表类), 1:亚马逊, 2:中小平台, 3:市场数据 等'
) comment '数据监控信息记录表'
partitioned by (pt string comment '数据日期, yyyy-mm-dd') ;

2. 程序开发

1. 数据检查程序

'''pyodps 3
请确保不要使用从 maxcompute下载数据来处理。下载数据操作常包括table/instance的open_reader以及 dataframe的to_pandas方法。 
推荐使用 pyodps dataframe（从 maxcompute 表创建）和maxcompute sql来处理数据。
更详细的内容可以参考：https://help.aliyun.com/document_detail/90481.html
'''

import os
from odps import odps, dataframe
from datetime import datetime, timedelta
from dateutil import parser
options.tunnel.use_instance_tunnel = true

# 获取当前时间
now_time = datetime.now().strftime('%y-%m-%d %h:%m:%s')
print(now_time)
pt = args['date']
print(pt)
date = datetime.strptime(pt, "%y-%m-%d") 

# 监控表列表 tbl_sort_code -> 0:其它(维表类), 1:亚马逊, 2:中小平台, 3:市场数据
sql_tbl_info = """
select * from puture_bigdata.tmp_monitor_tbl_info
where monitor_flag = 1 and tbl_sort_code = 3
"""

# 结果表
res_tbl_name = "puture_bigdata.tmp_monitor_tbl_info_log_di"

# 统计sql代码 -- 表行数,上周期差值
def sql_upper_period_diff():
    sql = f"""
    set odps.sql.hive.compatible=true ;

    insert into table {res_tbl_name} partition (pt='{pt}')
    select 
          a.id
        , a.tbl_name
        , a.stat_time
        , a.pt_format
        , a.stat_pt
        , a.val_type
        , a.val
        , a.rule_code
        , a.rule_type
        , a.expect_val
        , if (a.val = 0, 1, (if ((a.val - nvl(b.val,0)) >= {expect_val}, 0, 1 ))) as is_exc
        , a.tbl_sort_code
        , a.tbl_sort_name 
    from (
        select 
              concat( md5(concat('{tbl_name}', '_', date_format('{date_str}' ,'{pt_format}')) ), '_', {rule_code}, '_', hour('{now_time}') ) as id
            , '{tbl_name}' as tbl_name
            , '{now_time}' as stat_time
            , '{pt_format}' as pt_format
            , date_format('{date_str}' ,'{pt_format}') as stat_pt
            , '{val_type}' as val_type
            , count(1) as val 
            , '{rule_code}' as rule_code
            , '{rule_type}' as rule_type
            , {expect_val} as expect_val
            , {tbl_sort_code} as tbl_sort_code
            , '{tbl_sort_name}' as tbl_sort_name
        from puture_bigdata.{tbl_name}
        where pt = date_format('{date_str}' ,'{pt_format}')
    ) a 
    left join 
    (
        select tbl_name, val from (
            select tbl_name, val
                , row_number() over(partition by tbl_name order by stat_time desc ) as rn 
            from {res_tbl_name}
            where pt = date_add('{date_str}', -1)
        ) where rn = 1
    ) b
    on a.tbl_name = b.tbl_name
    ;
    """
    return sql

# 表行数, 固定值
def sql_line_fixed_val():
    sql = f"""
    set odps.sql.hive.compatible=true ;

    insert into table {res_tbl_name} partition (pt='{pt}')
    select 
          concat( md5(concat('{tbl_name}', '_', date_format('{date_str}' ,'{pt_format}')) ), '_', {rule_code}, '_', hour('{now_time}') ) as id
        , '{tbl_name}' as tbl_name
        , '{now_time}' as stat_time
        , '{pt_format}' as pt_format
        , date_format('{date_str}' ,'{pt_format}') as stat_pt
        , '{val_type}' as val_type
        , count(1) as val 
        , '{rule_code}' as rule_code
        , '{rule_type}' as rule_type
        , {expect_val} as expect_val
        , if (count(1) >= {expect_val}, 0, 1 ) as is_exc
        , {tbl_sort_code} as tbl_sort_code
        , '{tbl_sort_name}' as tbl_sort_name
    from puture_bigdata.{tbl_name}
    where pt = date_format('{date_str}' ,'{pt_format}') ;
    """
    return sql

# 执行监控统计代码
def ex_monitor(sql: str):
    try :
        # print (sql)
        o.execute_sql(sql, hints={'odps.sql.hive.compatible': true , "odps.sql.submit.mode":"script"})
        print("{}: 运行成功".format(tbl_name) )
    except exception as e:
        print('{}: 运行异常 ======> '.format(tbl_name) + str(e))


if __name__ == '__main__':
    try :
        with o.execute_sql(sql_tbl_info, hints={'odps.sql.hive.compatible': true}).open_reader() as reader:

            for row_record in reader:
                # print(row_record) # 打印一条数据值
                tbl_name = row_record.tbl_name
                pt_format = row_record.pt_format
                val_type = row_record.val_type
                monitor_flag = row_record.monitor_flag
                rule_code = row_record.rule_code
                rule_type = row_record.rule_type
                expect_val = row_record.expect_val
                tbl_sort_code = row_record.tbl_sort_code
                tbl_sort_name = row_record.tbl_sort_name
                pt_num = row_record.pt_num
                date_str = (date + timedelta(days=pt_num)).strftime('%y-%m-%d')
                
                if rule_code == 1 :
                    ex_monitor(sql_upper_period_diff())
                elif rule_code == 2 :
                    ex_monitor(sql_line_fixed_val())
                else :
                    print("未知规则!!!")
                           
    except exception as e:
        print('异常 ======> ' + str(e))

2. 告警信息推送程序

'''pyodps 3
请确保不要使用从 maxcompute下载数据来处理。下载数据操作常包括table/instance的open_reader以及 dataframe的to_pandas方法。 
推荐使用 pyodps dataframe（从 maxcompute 表创建）和maxcompute sql来处理数据。
更详细的内容可以参考：https://help.aliyun.com/document_detail/90481.html
'''

import json
import requests 
from datetime import datetime
import os
from odps import odps, dataframe

date_str = args['date']

# 接口地址和token信息
url = 'https://oapi.dingtalk.com/robot/send?access_token=***********************'

now_time = datetime.now().strftime('%y-%m-%d %h:%m:%s')
print (now_time)

sql_query = f"""
select tbl_name, stat_time, stat_pt, val_type, val, rule_type, expect_val, is_exc
from (
    select tbl_name, stat_time, stat_pt, val_type, val, rule_type, expect_val, is_exc
        , row_number() over(partition by tbl_name order by stat_time desc) as rn 
    from puture_bigdata_dev.tmp_monitor_tbl_info_log_di 
    where pt = '{date_str}' 
         and tbl_sort_code = 1 -- 表种类
) a
where rn = 1 and is_exc = 1 
"""

# 钉钉机器人,发送消息
def dd_robot(url:str, content: str):
  headers = {"content-type": "application/json;charset=utf-8"}
  #content里面要设置关键字
  data_info = {
    "msgtype": "text",
    "text": {
    "content": content
    },
    "isatall": false
    #这是配置需要@的人
     # ,"at": {"atmobiles": ["15xxxxxx06",'18xxxxxx1']}
  }
  value = json.dumps(data_info)
  response = requests.post(url,data=value,headers=headers)
  if response.json()['errmsg']!='ok':
    print(response.text)

# 主函数
if __name__ == '__main__': # py3可以省略
    try :
        with o.execute_sql(sql_query, hints={'odps.sql.hive.compatible': true}).open_reader() as reader:
            result_rows = list(reader) # 读取所有的结果行
            result_count = len(result_rows) # 获取结果条数
            #print("结果条数:", result_count) # 打印结果条数

            if result_count > 0 :
                for row in result_rows:
                    tbl_name = row.tbl_name
                    stat_time = row.stat_time
                    stat_pt = row.stat_pt
                    val_type = row.val_type
                    val = row.val
                    rule_type = row.rule_type
                    expect_val = row.expect_val
                    #print (tbl_name)
                    content = "数据质量(dqc)校验告警 \n  "
                    content = content + "【对象名称】：" + tbl_name + " \n  "
                    content = content + "【实际分区】：pt=" + stat_pt + " \n  "
                    content = content + "【触发规则】: " + rule_type + " | 当前样本值: " + val + " | 阈值: " + expect_val + " \n  "
                    content = content + now_time  + " \n  "
                    dd_robot(url, content)
            else :
                print ("无异常情况;")
    except exception as e:
        print ('异常 ========>' + str(e) )