# DingTalk monitor for the T+1 mirror-sync task: decides whether it must re-run.
import sys
import re
import os
import json
import time  # FIX: was `from datetime import time` -- datetime.time has no
             # module-level strftime()/localtime(); every timestamp below
             # needs the stdlib `time` module.

import requests

abspath = os.path.abspath(__file__)
root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
sys.path.append(root_path)

from configparser import ConfigParser
from pymongo import MongoClient

from dw_base import *
from dw_base.spark.spark_sql import SparkSQL
from dw_base.utils.log_utils import pretty_print
from dw_base.utils.config_utils import parse_args
# NOTE(review): this imported get_mongo_client is shadowed by the local
# definition below -- confirm the local ini-based variant is the intended one.
from dw_base.scheduler.polling_scheduler import get_mongo_client
from dw_base.scheduler.mg2es.conf_reader import ConfReader
from dw_base.scheduler.mg2es.es_operator import ESOperator
from elasticsearch.exceptions import NotFoundError

# NOTE(review): hard-coded webhook access token; consider moving to config.
# (URL comment in the original said it is used for testing.)
_DINGTALK_URL = ('http://m1.node.cdh/dingtalk/api/robot/send?access_token='
                 'a4a48ed82627149f3317ee86e249fd7d973f5bed40fcac55cc2e7ca8d9ae0c61')

# Number of alert notifications sent during this run; check_call_count()
# turns it into the scheduler variable `is_run`.
call_count = 0


def _now_str():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def check_call_count():
    """Emit the downstream scheduler variable `is_run` on stdout.

    is_run => 1 when no alert was sent (call_count == 0), meaning the
    task must (re-)run; is_run => 0 otherwise. The scheduler parses the
    ``${setValue(...)}`` line from stdout.
    """
    is_run = '1' if call_count == 0 else '0'
    pretty_print(f'{NORM_CYN}{_now_str()} '
                 f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => {is_run} '
                 f'{NORM_MGT} call_count =>{call_count}')
    print('${setValue(is_run=%s)}' % is_run)


def _post_dingtalk(msg):
    """POST a plain-text message to the DingTalk robot.

    Raises requests.HTTPError on a non-2xx response.
    """
    headers = {"Content-Type": "application/json"}
    json_data = json.dumps({"msgtype": "text", "text": {"content": msg}})
    response = requests.post(url=_DINGTALK_URL, data=json_data, headers=headers)
    response.raise_for_status()


def send_dingtalk_notification(msg):
    """Send an alert and increment call_count (marks run as 'no re-run needed')."""
    global call_count
    call_count += 1
    _post_dingtalk(msg)


def send_dingtalk_notification_es(msg):
    """Send an alert WITHOUT touching call_count (ES-side notifications)."""
    _post_dingtalk(msg)


def get_mongo_client(conf_path):
    """Build a MongoClient from the `[base] address` entry of an ini file.

    conf_path is relative to root_path (e.g. '/../datasource/mongo/...ini').
    """
    config_parser = ConfigParser()
    config_parser.read(root_path + conf_path)
    url = config_parser.get('base', 'address')
    return MongoClient(url)


def get_count(client, mgdb, mgtbl):
    """Total number of documents in mgdb.mgtbl."""
    # count_documents({}) replaces Collection.count(), which was removed
    # in pymongo 4.x (count_documents exists since pymongo 3.7).
    return client[mgdb][mgtbl].count_documents({})


def get_count_null(client, mgdb, mgtbl):
    """Number of documents in mgdb.mgtbl whose `date` field is null.

    Note: the filter {'date': None} also matches documents where the
    field is absent entirely.
    """
    return client[mgdb][mgtbl].count_documents({'date': None})


def get_old_count(mgdb, mgtbl):
    """Log and return the document count from the legacy production mongo."""
    client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
    result = get_count(client, mgdb, mgtbl)
    pretty_print(f'{NORM_CYN}{_now_str()} '
                 f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb}.{mgtbl} '
                 f'{NORM_MGT} old data count: {NORM_GRN}{result}')
    return result


def get_clu_count_null(mgdb, mgtbl):
    """Log and return the null-`date` document count from the cluster mongo."""
    client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
    result = get_count_null(client, mgdb, mgtbl)
    pretty_print(f'{NORM_CYN}{_now_str()} '
                 f'{NORM_MGT} 集群 mongo: {NORM_GRN}{mgdb}.{mgtbl} '
                 f'{NORM_MGT} 集群date字段为空 count: {NORM_GRN}{result}')
    return result


def get_dev_count_null(mgdb, mgtbl):
    """Log and return the null-`date` document count from the dev mongo."""
    client = get_mongo_client('/../datasource/mongo/mongo-cts-dev-rw-200-test.ini')
    result = get_count_null(client, mgdb, mgtbl)
    pretty_print(f'{NORM_CYN}{_now_str()} '
                 f'{NORM_MGT} dev source mongo: {NORM_GRN}{mgdb}.{mgtbl} '
                 f'{NORM_MGT} dev data count: {NORM_GRN}{result}')
    return result


def get_clu_count(mgdb, mgtbl):
    """Log and return the total document count from the cluster (sink) mongo."""
    client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
    result = get_count(client, mgdb, mgtbl)
    pretty_print(f'{NORM_CYN}{_now_str()} '
                 f'{NORM_MGT} 大数据集群mongo sink mongo: {NORM_GRN}{mgdb}.{mgtbl} '
                 f'{NORM_MGT} 大数据集群mongo data count: {NORM_GRN}{result}')
    return result


def get_diff_logic(spark, record, dt):
    """Check one country/table: compare mirror-mongo counts against DWD.

    Prints a consistency warning when the mirror contains documents with a
    null `date` field, then inserts the statistics into
    task.cts_mirror_count partition dt.

    record: a Spark Row with fields 'mgdb', 'catalog', 'cnt' (DWD count).
    """
    mgdb = record['mgdb']
    catalog = record['catalog']
    bigdata_count = record['cnt']
    clu_cnt = get_clu_count(mgdb, catalog)
    date_null_cnt = get_clu_count_null(mgdb, catalog)
    # Difference between the two mongo counts; alerting currently keys on
    # null-date documents only (the cnt_diff condition was disabled).
    cnt_diff = clu_cnt - bigdata_count
    # if cnt_diff != 0 or date_null_cnt != 0:
    if date_null_cnt != 0:
        msg3 = (
            f"\n"
            f"--------------------------------\n"
            f"镜像_mir 数据一致性警告\n"
            f"--------------------------------\n"
            f"在 {mgdb}_{catalog} 详细差异报告:\n\n"
            f"\n"
            f"--------------------------------\n"
            f"计数对比:\n"
            f" 大数据_镜像mongo 计数: {clu_cnt}\n"
            f" 大数据平台 DWD 计数: {bigdata_count}\n"
            f" 大数据_镜像mongo `date`字段为空 计数: {date_null_cnt}\n"
            f"\n"
            f"请检查原因 \n"
            f"\n"
            f"--------------------------------\n"
        )
        print(msg3)
        # send_dingtalk_notification(msg3)  # disabled: currently print-only
    # Persist the final per-country statistics.
    statistical_time = _now_str()
    sql_insert_cnt = f"""
        insert into table task.cts_mirror_count
        select '{mgdb}','{catalog}',{bigdata_count},{clu_cnt},'{statistical_time}','{dt}'
    """
    spark.query(sql_insert_cnt)[0].collect()


def main():
    """Entry point.

    Reads dt/ydt from CLI args, computes per-country DWD counts for imports
    and exports, diffs each against the mirror mongo, dedupes
    task.cts_mirror_count (latest creat_time wins per country/catalog),
    and finally emits the `is_run` scheduler variable.
    """
    CONFIG, _ = parse_args(sys.argv[1:])
    dt = CONFIG.get('dt')
    ydt = CONFIG.get('ydt')
    spark = SparkSQL()
    spark._final_spark_config = {'hive.exec.dynamic.partition': 'true',
                                 'hive.exec.dynamic.partition.mode': 'nonstrict',
                                 'spark.yarn.queue': 'cts',
                                 'spark.sql.crossJoin.enabled': 'true',
                                 'spark.executor.memory': '8g',
                                 'spark.executor.memoryOverhead': '2048',
                                 'spark.driver.memory': '4g',
                                 'spark.executor.instances': "12",
                                 'spark.executor.cores': '4',
                                 "spark.sql.hive.filesourcePartitionFileCacheSize": "536870912"
                                 }
    im_sql = (
        f"select i.code3 as code3,code.english_name as country_name,concat(code.english_name,'_mir') as mgdb,cnt,'shipments_imports' as catalog"
        f" from"
        f"( select country_code as code3 ,count(1) as cnt from (select country_code from dwd.cts_mirror_country_im where dt ='{ydt}') im "
        f"group by country_code) i left join dim.cts_mirror_monitor code"
        f" on i.code3 = code.code3 where code.english_name is not null")
    ex_sql = (
        f"select i.code3 as code3,code.english_name as country_name,concat(code.english_name,'_mir') as mgdb,cnt,'shipments_exports' as catalog "
        f" from"
        f"( select country_code as code3 ,count(1) as cnt from (select country_code from dwd.cts_mirror_country_ex where dt ='{ydt}') ex "
        f"group by country_code) i left join dim.cts_mirror_monitor code"
        f" on i.code3 = code.code3 where code.english_name is not null")
    res_im = spark.query(im_sql)[0].collect()
    res_ex = spark.query(ex_sql)[0].collect()
    for record in res_im:
        get_diff_logic(spark, record, dt)
    for record in res_ex:
        get_diff_logic(spark, record, dt)
    # Keep only the most recent row per (country, catalog) in today's
    # partition. FIX: dt is a string partition column, so the value must
    # be quoted ('{dt}') -- matching the dt ='{ydt}' filters above.
    sql_overwrite_cnt = f"""
        INSERT overwrite TABLE task.cts_mirror_count
        SELECT country, catalog, dwd_cnt, mongo_cnt, creat_time, dt
        FROM (
            SELECT *, row_number() over (partition BY country,catalog ORDER BY `creat_time` DESC) AS rk
            FROM task.cts_mirror_count
            WHERE dt ='{dt}'
        ) tmp where rk =1
    """
    spark.query(sql_overwrite_cnt)[0].collect()
    check_call_count()


if __name__ == '__main__':
    main()

# Reference DDL. NOTE(review): appears outdated -- the INSERTs above write
# six columns (country, catalog, dwd_cnt, mongo_cnt, creat_time, dt);
# confirm against the live table schema.
# CREATE TABLE task.cts_mirror_count
# (
#     `country` string COMMENT 'mgdb',
#     `catalog` string COMMENT '进出口类型',
#     `cnt` bigint comment '数据量',
#     `creat_time` STRING COMMENT '统计时间'
# )
# PARTITIONED BY ( `dt` string )
# TBLPROPERTIES ( 'COMMENT' = '同步到大数据平台的数据量统计');