| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240 |
# DingTalk monitor: decides whether the T+1 task needs a rerun (用于钉钉监控T+1任务是否需要重跑)
import os
import re
import sys
import json

import requests

# Make the project root importable before loading dw_base modules.
abspath = os.path.abspath(__file__)
root_path = re.sub(r"tendata-warehouse.*", "tendata-warehouse", abspath)
sys.path.append(root_path)

from configparser import ConfigParser
from datetime import time
from pymongo import MongoClient
from elasticsearch.exceptions import NotFoundError

from dw_base import *
from dw_base.spark.spark_sql import SparkSQL
from dw_base.utils.log_utils import pretty_print
from dw_base.scheduler.polling_scheduler import get_mongo_client
from dw_base.utils.config_utils import parse_args
from dw_base.scheduler.mg2es.conf_reader import ConfReader
from dw_base.scheduler.mg2es.es_operator import ESOperator

# NOTE(review): this file calls time.strftime(..., time.localtime()), which
# only exist on the `time` MODULE, not on datetime.time imported above.
# Import the module LAST so it deliberately shadows datetime.time (and any
# `time` name the dw_base wildcard import might export) and those calls work.
import time
# Module-level alert counter: incremented once per DingTalk alert sent, so the
# scheduler can tell at the end of the run whether anything needs a rerun.
call_count = 0


def check_call_count():
    """Emit the scheduler pass-through flag based on how many alerts fired.

    Prints ``${setValue(is_run=1)}`` when no alert was sent
    (``call_count == 0``), otherwise ``${setValue(is_run=0)}``; the
    surrounding scheduler parses this stdout marker to decide whether the
    T+1 job must be rerun.
    """
    global call_count
    # The two original branches differed only in the flag value; collapse them.
    is_run = '1' if call_count == 0 else '0'
    pretty_print(f'{NORM_CYN}{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())} '
                 f'{NORM_MGT}向后传递参数: {NORM_GRN}is_run => {is_run} '
                 f'{NORM_MGT} call_count =>{call_count}')
    print('${setValue(is_run=%s)}' % is_run)
def send_dingtalk_notification(msg):
    """POST *msg* as a text message to the DingTalk robot webhook.

    Also increments the module-level ``call_count`` so ``check_call_count``
    can later report that at least one alert fired.

    Args:
        msg: plain-text message body to send.

    Raises:
        requests.HTTPError: if the webhook answers with a non-2xx status.
    """
    global call_count
    call_count += 1
    payload = {
        "msgtype": "text",
        "text": {"content": msg}
    }
    # SECURITY(review): the access_token is hard-coded; move it to config.
    # The URL below is used for testing (下面的url用于测试).
    url = 'http://m1.node.cdh/dingtalk/api/robot/send?access_token=a4a48ed82627149f3317ee86e249fd7d973f5bed40fcac55cc2e7ca8d9ae0c61'
    # json= makes requests serialize and set Content-Type for us; the timeout
    # keeps the monitoring job from hanging forever on a dead webhook.
    response = requests.post(url=url, json=payload, timeout=10)
    response.raise_for_status()
def send_dingtalk_notification_es(msg):
    """POST *msg* to the DingTalk robot webhook (ES variant).

    Unlike ``send_dingtalk_notification`` this does NOT touch ``call_count``,
    so it never triggers the rerun flag.

    Args:
        msg: plain-text message body to send.

    Raises:
        requests.HTTPError: if the webhook answers with a non-2xx status.
    """
    payload = {
        "msgtype": "text",
        "text": {"content": msg}
    }
    # SECURITY(review): the access_token is hard-coded; move it to config.
    # The URL below is used for testing (下面的url用于测试).
    url = 'http://m1.node.cdh/dingtalk/api/robot/send?access_token=a4a48ed82627149f3317ee86e249fd7d973f5bed40fcac55cc2e7ca8d9ae0c61'
    # json= handles serialization/Content-Type; timeout avoids hanging forever.
    response = requests.post(url=url, json=payload, timeout=10)
    response.raise_for_status()
def get_mongo_client(conf_path):
    """Build a MongoClient from an ini file located under the project root.

    Args:
        conf_path: path relative to ``root_path`` (leading ``/..`` included)
            of an ini file with a ``[base]`` section whose ``address`` key
            holds the Mongo connection URL.

    Returns:
        pymongo.MongoClient connected to the configured address.

    Raises:
        FileNotFoundError: if the ini file does not exist.  ConfigParser.read
            silently ignores missing files, which would otherwise surface
            later as a confusing NoSectionError.
    """
    # NOTE(review): this local definition shadows the get_mongo_client
    # imported from dw_base.scheduler.polling_scheduler at the top of the file.
    full_path = root_path + conf_path
    config_parser = ConfigParser()
    if not config_parser.read(full_path):
        raise FileNotFoundError(f'mongo config not found: {full_path}')
    url = config_parser.get('base', 'address')
    return MongoClient(url)
def get_count(client, mgdb, mgtbl):
    """Return the total document count of ``mgdb.mgtbl`` on *client*.

    Args:
        client: MongoClient (or any mapping-of-mappings exposing collections).
        mgdb: database name.
        mgtbl: collection name.

    Returns:
        int: number of documents in the collection.
    """
    db = client[mgdb]
    collection = db[mgtbl]
    # Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0;
    # count_documents({}) is the supported equivalent for an exact count.
    return collection.count_documents({})
def get_count_null(client, mgdb, mgtbl):
    """Count documents in ``mgdb.mgtbl`` whose ``date`` field is null.

    Note: in MongoDB the filter ``{'date': None}`` also matches documents
    that lack the ``date`` field entirely.  (To count non-null dates instead,
    use ``{'date': {'$ne': None}}``.)

    Args:
        client: MongoClient instance.
        mgdb: database name.
        mgtbl: collection name.

    Returns:
        int: number of matching documents.
    """
    coll = client[mgdb][mgtbl]
    null_date_filter = {'date': None}
    return coll.count_documents(null_date_filter)
def get_old_count(mgdb, mgtbl):
    """Return the total document count of ``mgdb.mgtbl`` on the old prod Mongo."""
    mongo = get_mongo_client('/../datasource/mongo/mongo-cts-prod-old.ini')
    cnt = get_count(mongo, mgdb, mgtbl)
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    pretty_print(f'{NORM_CYN}{stamp} '
                 f'{NORM_MGT} old source mongo: {NORM_GRN}{mgdb}.{mgtbl} '
                 f'{NORM_MGT} old data count: {NORM_GRN}{cnt}')
    return cnt
def get_clu_count_null(mgdb, mgtbl):
    """Count null-``date`` documents of ``mgdb.mgtbl`` on the cluster Mongo."""
    mongo = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
    null_cnt = get_count_null(mongo, mgdb, mgtbl)
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    pretty_print(f'{NORM_CYN}{stamp} '
                 f'{NORM_MGT} 集群 mongo: {NORM_GRN}{mgdb}.{mgtbl} '
                 f'{NORM_MGT} 集群date字段为空 count: {NORM_GRN}{null_cnt}')
    return null_cnt
def get_dev_count_null(mgdb, mgtbl):
    """Count null-``date`` documents of ``mgdb.mgtbl`` on the dev Mongo."""
    mongo = get_mongo_client('/../datasource/mongo/mongo-cts-dev-rw-200-test.ini')
    null_cnt = get_count_null(mongo, mgdb, mgtbl)
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    pretty_print(f'{NORM_CYN}{stamp} '
                 f'{NORM_MGT} dev source mongo: {NORM_GRN}{mgdb}.{mgtbl} '
                 f'{NORM_MGT} dev data count: {NORM_GRN}{null_cnt}')
    return null_cnt
def get_clu_count(mgdb, mgtbl):
    """Return the total document count of ``mgdb.mgtbl`` on the cluster Mongo."""
    mongo = get_mongo_client('/../datasource/mongo/mongo-cluster-cts-prod.ini')
    cnt = get_count(mongo, mgdb, mgtbl)
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    pretty_print(f'{NORM_CYN}{stamp} '
                 f'{NORM_MGT} 大数据集群mongo sink mongo: {NORM_GRN}{mgdb}.{mgtbl} '
                 f'{NORM_MGT} 大数据集群mongo data count: {NORM_GRN}{cnt}')
    return cnt
def get_diff_logic(spark, record, dt):
    """Compare one table's mirror-Mongo counts against the DWD count and record them.

    Args:
        spark: SparkSQL wrapper used to run the bookkeeping INSERT.
        record: row-like object with fields ``mgdb`` (mirror database name),
            ``catalog`` (collection name) and ``cnt`` (DWD-side row count).
        dt: partition date string written into ``task.cts_mirror_count``.

    Side effects:
        Prints a consistency-warning report when the mirror Mongo contains
        documents whose ``date`` field is null, and always inserts one
        statistics row into ``task.cts_mirror_count``.
    """
    mgdb = record['mgdb']
    catalog = record['catalog']
    bigdata_count = record['cnt']
    clu_cnt = get_clu_count(mgdb, catalog)
    date_null_cnt = get_clu_count_null(mgdb, catalog)
    # Count difference between the two sides (Mongo mirror vs DWD).
    # NOTE(review): currently unused by the alert condition below — kept only
    # for the alternative (commented-out) condition that also checked it.
    cnt_diff = clu_cnt - bigdata_count
    # if cnt_diff != 0 or date_null_cnt != 0:
    if date_null_cnt != 0:
        msg3 = (
            f"\n"
            f"--------------------------------\n"
            f"镜像_mir 数据一致性警告\n"
            f"--------------------------------\n"
            f"在 {mgdb}_{catalog} 详细差异报告:\n\n"
            f"\n"
            f"--------------------------------\n"
            f"计数对比:\n"
            f" 大数据_镜像mongo 计数: {clu_cnt}\n"
            f" 大数据平台 DWD 计数: {bigdata_count}\n"
            f" 大数据_镜像mongo `date`字段为空 计数: {date_null_cnt}\n"
            f"\n"
            f"请检查原因 \n"
            f"\n"
            f"--------------------------------\n"
        )
        print(msg3)
        # send_dingtalk_notification(msg3)
    # Record the final per-country statistics row for this run.
    statistical_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    sql_insert_cnt = f"""
    insert into table task.cts_mirror_count
    select '{mgdb}','{catalog}',{bigdata_count},{clu_cnt},'{statistical_time}','{dt}'
    """
    spark.query(sql_insert_cnt)[0].collect()
def main():
    """Driver: compare DWD row counts with mirror-Mongo counts and log results.

    Expects scheduler arguments ``dt`` (today's partition, written to the
    stats table) and ``ydt`` (yesterday's partition, the DWD partition that
    is counted).  Finishes by deduplicating today's stats partition and
    emitting the rerun flag via ``check_call_count``.
    """
    CONFIG, _ = parse_args(sys.argv[1:])
    dt = CONFIG.get('dt')
    ydt = CONFIG.get('ydt')
    spark = SparkSQL()
    # NOTE(review): writes a private attribute of SparkSQL; presumably the
    # intended way to inject session config here — confirm against dw_base.
    spark._final_spark_config = {'hive.exec.dynamic.partition': 'true',
                                 'hive.exec.dynamic.partition.mode': 'nonstrict',
                                 'spark.yarn.queue': 'cts',
                                 'spark.sql.crossJoin.enabled': 'true',
                                 'spark.executor.memory': '8g',
                                 'spark.executor.memoryOverhead': '2048',
                                 'spark.driver.memory': '4g',
                                 'spark.executor.instances': "12",
                                 'spark.executor.cores': '4',
                                 "spark.sql.hive.filesourcePartitionFileCacheSize": "536870912"
                                 }
    # Per-country import counts from yesterday's DWD partition, joined to the
    # monitor dim table to resolve the mirror database name.
    im_sql = (
        f"select i.code3 as code3,code.english_name as country_name,concat(code.english_name,'_mir') as mgdb,cnt,'shipments_imports' as catalog"
        f" from"
        f"( select country_code as code3 ,count(1) as cnt from (select country_code from dwd.cts_mirror_country_im where dt ='{ydt}') im "
        f"group by country_code) i left join dim.cts_mirror_monitor code"
        f" on i.code3 = code.code3 where code.english_name is not null")
    # Same query for exports.
    ex_sql = (
        f"select i.code3 as code3,code.english_name as country_name,concat(code.english_name,'_mir') as mgdb,cnt,'shipments_exports' as catalog "
        f" from"
        f"( select country_code as code3 ,count(1) as cnt from (select country_code from dwd.cts_mirror_country_ex where dt ='{ydt}') ex "
        f"group by country_code) i left join dim.cts_mirror_monitor code"
        f" on i.code3 = code.code3 where code.english_name is not null")
    res_im = spark.query(im_sql)[0].collect()
    res_ex = spark.query(ex_sql)[0].collect()
    for record in res_im:
        get_diff_logic(spark, record, dt)
    for record in res_ex:
        get_diff_logic(spark, record, dt)
    # Deduplicate today's partition, keeping only the newest row per
    # (country, catalog).  FIX: dt is quoted here — the partition column is a
    # string and is quoted everywhere else in this file; an unquoted value
    # such as 2024-01-05 would be parsed as arithmetic and match nothing.
    sql_overwrite_cnt = f"""
    INSERT overwrite TABLE task.cts_mirror_count
    SELECT country,
           catalog,
           dwd_cnt,
           mongo_cnt,
           creat_time,
           dt
    FROM
      ( SELECT *,
               row_number() over (partition BY country,catalog
                                  ORDER BY `creat_time` DESC) AS rk
       FROM task.cts_mirror_count
       WHERE dt ='{dt}' ) tmp
    where rk =1
    """
    spark.query(sql_overwrite_cnt)[0].collect()
    check_call_count()
# Script entry point: run the T+1 mirror-count consistency check.
if __name__ == '__main__':
    main()
# Reference DDL for the stats table.
# NOTE(review): this DDL appears stale — the INSERT statements above write
# six columns (country, catalog, dwd_cnt, mongo_cnt, creat_time, plus the dt
# partition); verify against the live table schema.
# CREATE TABLE task.cts_mirror_count
# (
#     `country` string COMMENT 'mgdb',
#     `catalog` string COMMENT '进出口类型 (import/export type)',
#     `cnt` bigint comment '数据量 (row count)',
#     `creat_time` STRING COMMENT '统计时间 (statistics time)'
# )
# PARTITIONED BY ( `dt` string )
# TBLPROPERTIES ( 'COMMENT' = '同步到大数据平台的数据量统计');
|