- # DingTalk monitoring: check whether the T+1 task needs to be rerun
- import sys
- import re
- import os
- import requests
- import json
- abspath = os.path.abspath(__file__)
- root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
- sys.path.append(root_path)
- from dw_base.spark.spark_sql import SparkSQL
- from dw_base.utils.log_utils import pretty_print
- from configparser import ConfigParser
- import time
- from datetime import datetime
- from pymongo import MongoClient
- from dw_base import *
- from dw_base.scheduler.polling_scheduler import get_mongo_client
- from dw_base.utils.config_utils import parse_args
- from dw_base.scheduler.mg2es.conf_reader import ConfReader
- from dw_base.scheduler.mg2es.es_operator import ESOperator
- from elasticsearch.exceptions import NotFoundError
- call_count = 0
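- # call_count tracks how many DingTalk alerts were sent during this run.
- # check_call_count prints a ${setValue(...)} marker that the scheduler picks up to pass
- # is_run downstream: 1 when no alert was sent, 0 when at least one discrepancy was reported.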
- def check_call_count():
- global call_count
- if call_count == 0:
- pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
- f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => 1 '
- f'{NORM_MGT} call_count =>{call_count}')
- print('${setValue(is_run=%s)}' % '1')
- else:
- pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
- f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => 0 '
- f'{NORM_MGT} call_count =>{call_count}')
- print('${setValue(is_run=%s)}' % '0')
- def send_dingtalk_notification(msg):
- global call_count
- call_count += 1
- headers = {"Content-Type": "application/json"}
- data = {
- "msgtype": "text",
- "text": {"content": msg}
- }
- json_data = json.dumps(data)
- # The URL below is for testing
- url = 'https://oapi.dingtalk.com/robot/send?access_token=d4955560edf9d78fbf5273fe3ea4022ecf5955570a68ff710f7fe81926dff71e'
- response = requests.post(url=url, data=json_data, headers=headers)
- response.raise_for_status()
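- # Second notifier: posts through the internal DingTalk proxy on m1.node.cdh and is used for the
- # ES-vs-dwd per-year consistency alerts. Unlike the notifier above it does not increment
- # call_count, so ES-only differences do not affect the is_run flag passed downstream.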
- def send_dingtalk_notification_es(msg):
- headers = {"Content-Type": "application/json"}
- data = {
- "msgtype": "text",
- "text": {"content": msg}
- }
- json_data = json.dumps(data)
- # The URL below is for testing
- url = 'http://m1.node.cdh/dingtalk/api/robot/send?access_token=a4a48ed82627149f3317ee86e249fd7d973f5bed40fcac55cc2e7ca8d9ae0c61'
- response = requests.post(url=url, data=json_data, headers=headers)
- response.raise_for_status()
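- # Note: this local get_mongo_client shadows the one imported from dw_base.scheduler.polling_scheduler;
- # it builds a MongoClient from the address in the given .ini file under the datasource directory.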
- def get_mongo_client(conf_path):
- config_parser = ConfigParser()
- config_parser.read(root_path + conf_path)
- url = config_parser.get('base', 'address')
- return MongoClient(url)
- def get_count(client, mgdb, mgtbl):
- db = client[mgdb]
- collection = db[mgtbl]
- # Collection.count() was removed in PyMongo 4.x; estimated_document_count() is the supported unfiltered count
- return collection.estimated_document_count()
- def get_count_null(client, mgdb, mgtbl):
- db = client[mgdb]
- collection = db[mgtbl]
- # Count documents whose `date` field is not null
- # return collection.count_documents({'date': {'$ne': None}})
- # Count documents whose `date` field is null
- return collection.count_documents({'date': None})
- def get_old_count(mgdb, mgtbl):
- client = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
- result = get_count(client, mgdb, mgtbl)
- pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
- f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb}.{mgtbl} '
- f'{NORM_MGT} old data count: {NORM_GRN}{result}')
- return result
- def get_clu_count_null(mgdb, mgtbl):
- client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
- result = get_count_null(client, mgdb, mgtbl)
- pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
- f'{NORM_MGT} 大数据集群mongo: {NORM_GRN}{mgdb}.{mgtbl} '
- f'{NORM_MGT} `date` 为null count: {NORM_GRN}{result}')
- return result
- def get_dev_count_null(mgdb, mgtbl):
- client = get_mongo_client('/../datasource/mongo/mongo-cts-dev-rw-200-test.ini')
- result = get_count_null(client, mgdb, mgtbl)
- pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
- f'{NORM_MGT} dev mongo: {NORM_GRN}{mgdb}.{mgtbl} '
- f'{NORM_MGT} `date` 为null count: {NORM_GRN}{result}')
- return result
- def get_clu_count(mgdb, mgtbl):
- client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
- result = get_count(client, mgdb, mgtbl)
- pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
- f'{NORM_MGT} 大数据集群mongo sink mongo: {NORM_GRN}{mgdb}.{mgtbl} '
- f'{NORM_MGT} 大数据集群mongo data count: {NORM_GRN}{result}')
- return result
- def get_bigdata_count(mgdb, mgtbl, dt, spark, cdt):
- sql = (f"select count(1) cnt "
- f"from dwd.cts_{mgdb}_{mgtbl} "
- f" where dt in ('19700101', '{dt}', '{cdt}') ")
- res = spark.query(sql)[0].collect()
- pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
- f'{NORM_MGT} 大数据dwd表名: {NORM_GRN}dwd.cts_{mgdb}_{mgtbl} '
- f'{NORM_MGT} 大数据dwd 1970+昨日分区+当日分区 count: {NORM_GRN}{res[0].cnt}')
- return res[0].cnt
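- # In dwd, global_bol is split into eight regional bol tables ("1拆8"), so its count is the sum of
- # the 1970 and yesterday partitions across all eight tables.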
- def get_bigdata_global_bol_count(catalog, dt, spark):
- sql = (f"""
- select sum(cnt) cnt from (select count(1) cnt from dwd.`cts_north_america_bol_{catalog}` where dt in ('19700101', '{dt}')
- union all select count(1) from dwd.`cts_central_america_bol_{catalog}` where dt in ('19700101', '{dt}')
- union all select count(1) from dwd.`cts_south_america_bol_{catalog}` where dt in ('19700101', '{dt}')
- union all select count(1) from dwd.`cts_asia_bol_{catalog}` where dt in ('19700101', '{dt}')
- union all select count(1) from dwd.`cts_middle_east_bol_{catalog}` where dt in ('19700101', '{dt}')
- union all select count(1) from dwd.`cts_europe_bol_{catalog}` where dt in ('19700101', '{dt}')
- union all select count(1) from dwd.`cts_africa_bol_{catalog}` where dt in ('19700101', '{dt}')
- union all select count(1) from dwd.`cts_oceania_bol_{catalog}` where dt in ('19700101', '{dt}') ) a""")
- res = spark.query(sql)[0].collect()
- pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
- f'{NORM_MGT} 大数据dwd表名: global_bol 1拆8 '
- f'{NORM_MGT} 大数据dwd 1970+昨日分区count: {NORM_GRN}{res[0].cnt}')
- return res[0].cnt
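- # Per-year consistency check between the dwd table and ES. The ES index name is assumed to follow
- # the f"{catalog}_{mgdb}-{year}" pattern used below; mismatching years trigger an alert via the ES webhook.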
- def get_year_count(mgdb, catalog, dt, spark):
- if mgdb != "global_bol":
- sql = (f"select from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy') as year,count(1) hive_cnt "
- f"from dwd.cts_{mgdb}_{catalog} "
- f" where dt in ('19700101', {dt}) "
- f" group by from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy')"
- f" order by from_unixtime(cast(`date`/1000 as int)- 8 * 60 * 60, 'yyyy')")
- res = spark.query(sql)[0].collect()
- hive_year_cnt_dict = {}
- es_year_cnt_dict = {}
- host, port = ConfReader().get_es_conf()
- es_operator = ESOperator(host, port)
- for record in res:
- year = record['year']
- hive_cnt = record['hive_cnt']
- hive_year_cnt_dict[year] = hive_cnt
- # index_name = 'customs_' + str(catalogs[catalog]) + '_' + mgdb + '-' + year
- index_name = str(catalog) + '_' + mgdb + '-' + year
- try:
- ES_year_cnt = es_operator.get_index_document_count(index_name)
- except NotFoundError:
- # Not sent via the regular robot because of the DingTalk keyword filter
- msg7 = (f"ES Index {index_name} not found.\n"
- f" 请检查原因\n"
- )
- # print(msg7)
- send_dingtalk_notification_es(msg7)
- ES_year_cnt = 0
- if ES_year_cnt is None:
- ES_year_cnt = 0
- es_year_cnt_dict[year] = ES_year_cnt
- es_diff = ES_year_cnt - hive_cnt
- if es_diff != 0:
- msg5 = (
- f"-----------------------------\n"
- f"\n"
- f"{mgdb}_{catalog} - 数据一致性警告:ES{year}与大数据DWD的{year}数量不一致。\n\n"
- f"详细差异报告:\n"
- f"-----------------------------------------------------------------------\n"
- f"年份:{year}\n"
- f"ES{year} 计数:{ES_year_cnt}\n"
- f"大数据{year} 计数:{hive_cnt}\n"
- f"差异值:{es_diff}\n"
- f"-----------------------------------------------------------------------\n"
- f"\n"
- f"请检查原因 \n"
- f"\n"
- f"-----------------------------\n"
- )
- # print(msg5)
- send_dingtalk_notification_es(msg5)
- pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
- f'{NORM_MGT} 大数据dwd表名: {NORM_GRN}dwd.cts_{mgdb}_{catalog} '
- f'{NORM_MGT} 大数据hive_year_cnt_dict {NORM_GRN}{hive_year_cnt_dict}'
- f'{NORM_MGT} es_year_cnt_dict {NORM_GRN}{es_year_cnt_dict}'
- )
- def get_count_range_date(mgdb, mgtbl, target_date):
- """
- 统计 date 字段值小于目标日期的文档总数
- Args:
- client: MongoDB客户端实例
- mgdb: 数据库名称
- mgtbl: 集合名称
- target_date_str: 目标日期字符串 (格式: "YYYYMMDD")
- Returns:
- int: 符合条件的文档数量
- """
- client = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
- db = client[mgdb]
- collection = db[mgtbl]
- # Convert the input string into a datetime object
- target_date = datetime.strptime(target_date, "%Y%m%d").replace(
- tzinfo=None # This line can be removed if the stored dates carry no timezone
- )
- count = collection.count_documents({'date': {'$lt': target_date}})
- return count
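- # Hypothetical usage sketch (mirrors the fiji call in main below): count cluster-mongo documents
- # whose `date` is earlier than the 2021-11-01 cutoff.
- # cnt = get_count_range_date('fiji', 'shipments_imports', target_date='20211101')
- # print(f"fiji shipments_imports before 20211101: {cnt}")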
- def main():
- CONFIG, _ = parse_args(sys.argv[1:])
- dt = CONFIG.get('dt')
- cdt = CONFIG.get('cdt')
- spark = SparkSQL()
- spark._final_spark_config = {'hive.exec.dynamic.partition': 'true',
- 'hive.exec.dynamic.partition.mode': 'nonstrict',
- 'spark.yarn.queue': 'cts',
- 'spark.sql.crossJoin.enabled': 'true',
- 'spark.executor.memory': '6g',
- 'spark.executor.memoryOverhead': '2048',
- 'spark.driver.memory': '4g',
- 'spark.executor.instances': "15",
- 'spark.executor.cores': '2'
- }
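- # Resource and dynamic-partition settings used by the count queries and the task.cts_country_count writes below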
- sql = (f"select mgdb, catalog from task.mg_count_monitor "
- f"where is_deleted = '0'")
- res = spark.query(sql)[0].collect()
- # Mapping: dwd table name -> bigdata (production) mongo database name
- mgdbs_prod = {
- 'un_global_trade_tatistics': 'united_nations_stat',
- "global_bol": "global_bol"
- }
- # Mapping: dwd table name -> old mongo database name
- mgdbs_old = {
- 'un_global_trade_tatistics': 'united_nations_stat',
- "global_bol": "global_sea"
- }
- catalogs = {
- 'im': 'shipments_imports',
- 'ex': 'shipments_exports',
- }
- # Names of databases excluded from the old_mongo comparison alert
- excluded_dbs = ["un_global_trade_tatistics",
- "north_america_bol",
- "central_america_bol",
- "south_america_bol",
- "asia_bol",
- "middle_east_bol",
- "europe_bol",
- "africa_bol",
- "oceania_bol"]
- # The lines below are for testing
- # res = [{"mgdb": "global_bol", "catalog": "im"}]
- # res = [{"mgdb": "ethiopia", "catalog": "ex"}]
- mirror_dbs = ["fiji"]
- mirror_dbs_date = {"fiji_im": "20211101", "fiji_ex": "20211101"}
- for record in res:
- mgdb = record['mgdb']
- catalog = record['catalog']
- prod_mgdb = mgdbs_prod.get(record['mgdb'], mgdb)
- old_mgdb = mgdbs_old.get(record['mgdb'], mgdb)
- if mgdb == "global_bol":
- old_cnt = get_old_count(old_mgdb, catalogs[catalog])
- # global_bol is split across eight regional dwd tables; compare old mongo against their combined count
- clu_cnt = get_bigdata_global_bol_count(catalog, dt, spark)
- # Reuse the same dwd total for both comparisons instead of running the union query twice
- bigdata_count = clu_cnt
- date_null_cnt = get_clu_count_null(mgdb, catalogs[catalog])
- else:
- old_cnt = get_old_count(prod_mgdb, catalogs[catalog])
- clu_cnt = get_clu_count(prod_mgdb, catalogs[catalog])
- bigdata_count = get_bigdata_count(mgdb, catalog, dt, spark, cdt)
- date_null_cnt = get_clu_count_null(mgdb, catalogs[catalog])
- # get_year_count(mgdb, catalog, dt, spark)
- if mgdb in mirror_dbs:
- clu_cnt = get_count_range_date(mgdb, catalogs[catalog], target_date=mirror_dbs_date[f"{mgdb}_{catalog}"])
- print(f"{mgdb}{catalogs[catalog]} clu_cnt: {clu_cnt}")
- # Compare the counts of the two mongos
- cnt_diff = old_cnt - clu_cnt
- # Compare old mongo with the dwd count
- bd_diff = old_cnt - bigdata_count
- if bd_diff != 0 or cnt_diff != 0 or date_null_cnt != 0:
- msg3 = (
- f"\n"
- f"--------------------------------\n"
- f"数据一致性警告\n"
- f"--------------------------------\n"
- f"在 {mgdb}_{catalog} 详细差异报告:\n\n"
- f"\n"
- f"--------------------------------\n"
- f"计数对比:\n"
- f" old_mongo 计数: {old_cnt}\n"
- f" 大数据_mongo 计数: {clu_cnt}\n"
- f" 大数据平台 DWD 计数: {bigdata_count}\n"
- f" 大数据_mongo `date`字段为空 计数: {date_null_cnt}\n"
- f"\n"
- f"请检查原因 \n"
- f"\n"
- f"--------------------------------\n"
- )
- if mgdb not in excluded_dbs:
- send_dingtalk_notification(msg3)
- # Record the final per-country statistics count
- statistical_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
- sql_insert_cnt = f"""
-
- insert into table task.cts_country_count
- select '{mgdb}','{catalog}',{clu_cnt},'{statistical_time}','{dt}'
-
- """
- spark.query(sql_insert_cnt)[0].collect()
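- # Deduplicate today's partition: keep only the latest record per (country, catalog) by creat_time,
- # then overwrite the dt partition with that result.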
- sql_overwrite_cnt = f"""
- INSERT overwrite TABLE task.cts_country_count
- SELECT country,
- catalog,
- cnt,
- creat_time,
- dt
- FROM
- ( SELECT *,
- row_number() over (partition BY country,catalog
- ORDER BY `creat_time` DESC) AS rk
- FROM task.cts_country_count
- WHERE dt = '{dt}' ) tmp
- where rk = 1
- """
- spark.query(sql_overwrite_cnt)[0].collect()
- check_call_count()
- if __name__ == '__main__':
- main()
- # CREATE TABLE task.cts_country_count
- # (
- # `country` string COMMENT 'mgdb',
- # `catalog` string COMMENT '进出口类型',
- # `cnt` bigint comment '数据量',
- # `creat_time` STRING COMMENT '统计时间'
- # )
- # PARTITIONED BY ( `dt` string )
- # TBLPROPERTIES ( 'COMMENT' = '同步到大数据平台的数据量统计');