# 用于钉钉监控T+1任务是否需要重跑 import sys import re import os import requests import json abspath = os.path.abspath(__file__) root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath) sys.path.append(root_path) from dw_base.spark.spark_sql import SparkSQL from dw_base.utils.log_utils import pretty_print from configparser import ConfigParser from datetime import time, datetime from pymongo import MongoClient from dw_base import * from dw_base.scheduler.polling_scheduler import get_mongo_client from dw_base.utils.config_utils import parse_args from dw_base.scheduler.mg2es.conf_reader import ConfReader from dw_base.scheduler.mg2es.es_operator import ESOperator from elasticsearch.exceptions import NotFoundError call_count = 0 def check_call_count(): global call_count if call_count == 0: pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} ' f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => 1 ' f'{NORM_MGT} call_count =>{call_count}') print('${setValue(is_run=%s)}' % '1') else: pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} ' f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => 0 ' f'{NORM_MGT} call_count =>{call_count}') print('${setValue(is_run=%s)}' % '0') def send_dingtalk_notification(msg): global call_count call_count += 1 headers = {"Content-Type": "application/json"} data = { "msgtype": "text", "text": {"content": msg} } json_data = json.dumps(data) # 下面的url用于测试 url = 'https://oapi.dingtalk.com/robot/send?access_token=d4955560edf9d78fbf5273fe3ea4022ecf5955570a68ff710f7fe81926dff71e' response = requests.post(url=url, data=json_data, headers=headers) response.raise_for_status() def send_dingtalk_notification_es(msg): headers = {"Content-Type": "application/json"} data = { "msgtype": "text", "text": {"content": msg} } json_data = json.dumps(data) # 下面的url用于测试 url = 'http://m1.node.cdh/dingtalk/api/robot/send?access_token=a4a48ed82627149f3317ee86e249fd7d973f5bed40fcac55cc2e7ca8d9ae0c61' response = requests.post(url=url, data=json_data, headers=headers) response.raise_for_status() def get_mongo_client(conf_path): config_parser = ConfigParser() config_parser.read(root_path + conf_path) url = config_parser.get('base', 'address') return MongoClient(url) def get_count(client, mgdb, mgtbl): db = client[mgdb] collection = db[mgtbl] return collection.count() def get_count_null(client, mgdb, mgtbl): db = client[mgdb] collection = db[mgtbl] # 计数`date`字段不为null的文档 # return collection.count_documents({'date': {'$ne': None}}) # 计数`date` 为null的文档 return collection.count_documents({'date': None}) def get_old_count(mgdb, mgtbl): client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini') result = get_count(client, mgdb, mgtbl) pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} ' f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb}.{mgtbl} ' f'{NORM_MGT} old data count: {NORM_GRN}{result}') return result def get_clu_count_null(mgdb, mgtbl): client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini') result = get_count_null(client, mgdb, mgtbl) pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} ' f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb}.{mgtbl} ' f'{NORM_MGT} old data count: {NORM_GRN}{result}') return result def get_dev_count_null(mgdb, mgtbl): client = get_mongo_client('/../datasource/mongo/mongo-cts-dev-rw-200-test.ini') result = get_count_null(client, mgdb, mgtbl) pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} ' f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb}.{mgtbl} ' f'{NORM_MGT} old data count: {NORM_GRN}{result}') return result def get_clu_count(mgdb, mgtbl): client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini') result = get_count(client, mgdb, mgtbl) pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} ' f'{NORM_MGT} 大数据集群mongo sink mongo: {NORM_GRN}{mgdb}.{mgtbl} ' f'{NORM_MGT} 大数据集群mongo data count: {NORM_GRN}{result}') return result def get_bigdata_count(mgdb, mgtbl, dt, spark,cdt): sql = (f"select count(1) cnt " f"from dwd.cts_{mgdb}_{mgtbl} " f" where dt in ('19700101', {dt},{cdt}) ") res = spark.query(sql)[0].collect() pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} ' f'{NORM_MGT} 大数据dwd表名: {NORM_GRN}dwd.cts_{mgdb}_{mgtbl} ' f'{NORM_MGT} 大数据dwd 1970+昨日分区+当日分区 count: {NORM_GRN}{res[0].cnt}') return res[0].cnt def get_bigdata_global_bol_count(catalog, dt, spark): sql = (f""" select sum(cnt) cnt from (select count(1) cnt from dwd.`cts_north_america_bol_{catalog}` where dt in ('19700101', {dt}) union all select count(1) from dwd.`cts_central_america_bol_{catalog}` where dt in ('19700101', {dt}) union all select count(1) from dwd.`cts_south_america_bol_{catalog}` where dt in ('19700101', {dt}) union all select count(1) from dwd.`cts_asia_bol_{catalog}` where dt in ('19700101', {dt}) union all select count(1) from dwd.`cts_middle_east_bol_{catalog}` where dt in ('19700101', {dt}) union all select count(1) from dwd.`cts_europe_bol_{catalog}` where dt in ('19700101', {dt}) union all select count(1) from dwd.`cts_africa_bol_{catalog}` where dt in ('19700101', {dt}) union all select count(1) from dwd.`cts_oceania_bol_{catalog}` where dt in ('19700101', {dt}) ) a""") res = spark.query(sql)[0].collect() pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} ' f'{NORM_MGT} 大数据dwd表名: global_bol 1拆8 ' f'{NORM_MGT} 大数据dwd 1970+昨日分区count: {NORM_GRN}{res[0].cnt}') return res[0].cnt def get_year_count(mgdb, catalog, dt, spark): if mgdb != "global_bol": sql = (f"select from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy') as year,count(1) hive_cnt " f"from dwd.cts_{mgdb}_{catalog} " f" where dt in ('19700101', {dt}) " f" group by from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy')" f" order by from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy')") res = spark.query(sql)[0].collect() hive_year_cnt_dict = {} es_year_cnt_dict = {} host, port = ConfReader().get_es_conf() es_operator = ESOperator(host, port) for record in res: year = record['year'] hive_cnt = record['hive_cnt'] hive_year_cnt_dict[year] = hive_cnt # index_name = 'customs_' + str(catalogs[catalog]) + '_' + mgdb + '-' + year index_name = str(catalog) + '_' + mgdb + '-' + year try: ES_year_cnt = es_operator.get_index_document_count(index_name) except NotFoundError: # 因为钉钉关键词所以没有发钉钉 msg7 = (f"ES Index {index_name} not found.\n" f" 请检查原因\n" ) # print(msg7) send_dingtalk_notification_es(msg7) ES_year_cnt = 0 if ES_year_cnt is None: ES_year_cnt = 0 es_year_cnt_dict[year] = ES_year_cnt es_diff = ES_year_cnt - hive_cnt if es_diff != 0: msg5 = ( f"-----------------------------\n" f"\n" f"{mgdb}_{catalog} - 数据一致性警告:ES{year}与大数据DWD的{year}数量不一致。\n\n" f"详细差异报告:\n" f"-----------------------------------------------------------------------\n" f"年份:{year}\n" f"ES{year} 计数:{ES_year_cnt}\n" f"大数据{year} 计数:{hive_cnt}\n" f"差异值:{es_diff}\n" f"-----------------------------------------------------------------------\n" f"\n" f"请检查原因 \n" f"\n" f"-----------------------------\n" ) # print(msg5) send_dingtalk_notification_es(msg5) pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} ' f'{NORM_MGT} 大数据dwd表名: {NORM_GRN}dwd.cts_{mgdb}_{catalog} ' f'{NORM_MGT} 大数据hive_year_cnt_dict {NORM_GRN}{hive_year_cnt_dict}' f'{NORM_MGT} es_year_cnt_dict {NORM_GRN}{es_year_cnt_dict}' ) def get_count_range_date(mgdb, mgtbl, target_date): """ 统计 date 字段值小于目标日期的文档总数 Args: client: MongoDB客户端实例 mgdb: 数据库名称 mgtbl: 集合名称 target_date_str: 目标日期字符串 (格式: "YYYYMMDD") Returns: int: 符合条件的文档数量 """ client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini') db = client[mgdb] collection = db[mgtbl] # 将输入的字符串转换为 datetime 对象 target_date = datetime.strptime(target_date, "%Y%m%d").replace( tzinfo=None # 如果数据库时间不带时区,可以移除此行 ) count = collection.count_documents({'date': {'$lt': target_date}}) return count def main(): CONFIG, _ = parse_args(sys.argv[1:]) dt = CONFIG.get('dt') cdt = CONFIG.get('cdt') spark = SparkSQL() spark._final_spark_config = {'hive.exec.dynamic.partition': 'true', 'hive.exec.dynamic.partition.mode': 'nonstrict', 'spark.yarn.queue': 'cts', 'spark.sql.crossJoin.enabled': 'true', 'spark.executor.memory': '6g', 'spark.executor.memoryOverhead': '2048', 'spark.driver.memory': '4g', 'spark.executor.instances': "15", 'spark.executor.cores': '2' } sql = (f"select mgdb, catalog from task.mg_count_monitor " f"where is_deleted = '0'") res = spark.query(sql)[0].collect() mgdbs_prod = { 'dwd表名': '大数据mongo库名', 'un_global_trade_tatistics': 'united_nations_stat', "global_bol": "global_bol" } mgdbs_old = { 'dwd表名': 'old_mongo库名', 'un_global_trade_tatistics': 'united_nations_stat', "global_bol": "global_sea" } catalogs = { 'im': 'shipments_imports', 'ex': 'shipments_exports', } # 添加需要排除的读 old_mongo 的数据库名称 excluded_dbs = ["un_global_trade_tatistics", "north_america_bol", "central_america_bol", "south_america_bol", "asia_bol", "middle_east_bol", "europe_bol", "africa_bol", "oceania_bol"] # 以下用于测试 # res = [{"mgdb": "global_bol", "catalog": "im"}] # res = [{"mgdb": "ethiopia", "catalog": "ex"}] mirror_dbs = ["fiji"] mirror_dbs_date = {"fiji_im": "20211101", "fiji_ex": "20211101"} for record in res: mgdb = record['mgdb'] catalog = record['catalog'] prod_mgdb = mgdbs_prod.get(record['mgdb'], mgdb) old_mgdb = mgdbs_old.get(record['mgdb'], mgdb) if mgdb == "global_bol": old_cnt = get_old_count(old_mgdb, catalogs[catalog]) # oldmongo和dwd拆分表 clu_cnt = get_bigdata_global_bol_count(catalog, dt, spark) bigdata_count = get_bigdata_global_bol_count(catalog, dt, spark) date_null_cnt=get_clu_count_null(mgdb, catalogs[catalog]) else: old_cnt = get_old_count(prod_mgdb, catalogs[catalog]) clu_cnt = get_clu_count(prod_mgdb, catalogs[catalog]) bigdata_count = get_bigdata_count(mgdb, catalog, dt, spark,cdt) date_null_cnt = get_clu_count_null(mgdb, catalogs[catalog]) # get_year_count(mgdb, catalog, dt, spark) if mgdb in mirror_dbs: clu_cnt = get_count_range_date(mgdb, catalogs[catalog], target_date=mirror_dbs_date[f"{mgdb}_{catalog}"]) print(f"{mgdb}{catalogs[catalog]} clu_cnt: {clu_cnt}") # 两个mongo数据量对比 cnt_diff = old_cnt - clu_cnt # oldmongo和dwd 对比 bd_diff = old_cnt - bigdata_count if bd_diff != 0 or cnt_diff != 0 or date_null_cnt != 0: msg3 = ( f"\n" f"--------------------------------\n" f"数据一致性警告\n" f"--------------------------------\n" f"在 {mgdb}_{catalog} 详细差异报告:\n\n" f"\n" f"--------------------------------\n" f"计数对比:\n" f" old_mongo 计数: {old_cnt}\n" f" 大数据_mongo 计数: {clu_cnt}\n" f" 大数据平台 DWD 计数: {bigdata_count}\n" f" 大数据_mongo `date`字段为空 计数: {date_null_cnt}\n" f"\n" f"请检查原因 \n" f"\n" f"--------------------------------\n" ) if mgdb not in excluded_dbs: send_dingtalk_notification(msg3) # 添加最终各个国家的统计数据量 statistical_time=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) sql_insert_cnt=f""" insert into table task.cts_country_count select '{mgdb}','{catalog}',{clu_cnt},'{statistical_time}','{dt}' """ spark.query(sql_insert_cnt)[0].collect() sql_overwrite_cnt = f""" INSERT overwrite TABLE task.cts_country_count SELECT country, catalog, cnt, creat_time, dt FROM ( SELECT *, row_number() over (partition BY country,catalog ORDER BY `creat_time` DESC) AS rk FROM task.cts_country_count WHERE dt ={dt} ) tmp where rk =1 """ spark.query(sql_overwrite_cnt)[0].collect() check_call_count() if __name__ == '__main__': main() # CREATE TABLE task.cts_country_count # ( # `country` string COMMENT 'mgdb', # `catalog` string COMMENT '进出口类型', # `cnt` bigint comment '数据量', # `creat_time` STRING COMMENT '统计时间' # ) # PARTITIONED BY ( `dt` string ) # TBLPROPERTIES ( 'COMMENT' = '同步到大数据平台的数据量统计');